Utility.cpp
上传用户:yxl0916
上传日期:2007-05-25
资源大小:2245k
文件大小:18k
- //////////////////////////////////////////////////////////////////////
- //ICTCLAS简介:计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
- // 功能有:中文分词;词性标注;未登录词识别。
- // 分词正确率高达97.58%(973专家评测结果),
- // 未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
- // 处理速度为31.5Kbytes/s。
- //著作权: Copyright © 2002-2005 中科院计算所 职务著作权人:张华平 刘群
- //遵循协议:自然语言处理开放资源许可证1.0
- //Email: zhanghp@software.ict.ac.cn
- //Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
- /****************************************************************************
- *
- * Copyright (c) 2000, 2001
- * Machine Group
- * Software Research Lab.
- * Institute of Computing Tech.
- * Chinese Academy of Sciences
- * All rights reserved.
- *
- * This file is the confidential and proprietary property of
- * Institute of Computing Tech. and the possession or use of this file requires
- * a written license from the author.
- * Filename: Utility.c
- * Abstract:
- * Utility functions for Chinese Language Processing
- * Author: Kevin Zhang
- * (zhanghp@software.ict.ac.cn)
- * Date: 2002-1-8
- *
- * Notes:
- *
- ****************************************************************************/
- #include "stdafx.h"
- #include "Utility.h"
- #include <stdio.h>
- #include <string.h>
/*********************************************************************
 *
 *  Func Name  : GB2312_Generate
 *
 *  Description: Generate the GB2312 list file. Every pair of bytes
 *               (lead, trail) with both values in 161..254 is written
 *               on its own line as
 *                   <lead byte><trail byte>,<lead value>,<trail value>
 *
 *  Parameters : sFileName: the file name for the output GB2312 list;
 *               it is opened with mode "wt", so an existing file is
 *               overwritten
 *
 *  Returns    : true when the whole list was written and the file was
 *               closed without error; false when sFileName is NULL,
 *               the file cannot be opened, or a write/close fails
 *  Author     : Kevin Zhang
 *  History    :
 *               1.create 2002-1-8
 *********************************************************************/
bool GB2312_Generate(char *sFileName)
{
    FILE *fp;
    int lead, trail; /* int matches the %c and %d conversions exactly */
    bool ok = true;

    if (sFileName == NULL) {
        return false;
    }
    fp = fopen(sFileName, "wt");
    if (fp == NULL) {
        return false; /* fail while opening the file */
    }

    for (lead = 161; ok && lead < 255; lead++) {
        for (trail = 161; trail < 255; trail++) {
            /* One record per line; stop at the first failed write. */
            if (fprintf(fp, "%c%c,%d,%d\n", lead, trail, lead, trail) < 0) {
                ok = false;
                break;
            }
        }
    }

    /* fclose flushes buffered output, so its result is part of success. */
    if (fclose(fp) != 0) {
        ok = false;
    }
    return ok;
}
/*********************************************************************
 *
 *  Func Name  : CC_Generate
 *
 *  Description: Generate the Chinese char list file. Every pair of
 *               bytes (lead, trail) with lead in 176..254 and trail in
 *               161..254 is written on its own line as
 *                   <lead byte><trail byte>,<lead value>,<trail value>
 *
 *  Parameters : sFileName: the file name for the output CC list; it is
 *               opened with mode "wt", so an existing file is
 *               overwritten
 *
 *  Returns    : true when the whole list was written and the file was
 *               closed without error; false when sFileName is NULL,
 *               the file cannot be opened, or a write/close fails
 *  Author     : Kevin Zhang
 *  History    :
 *               1.create 2002-1-8
 *********************************************************************/
bool CC_Generate(char *sFileName)
{
    FILE *fp;
    int lead, trail; /* int matches the %c and %d conversions exactly */
    bool ok = true;

    if (sFileName == NULL) {
        return false;
    }
    fp = fopen(sFileName, "wt");
    if (fp == NULL) {
        return false; /* fail while opening the file */
    }

    for (lead = 176; ok && lead < 255; lead++) {
        for (trail = 161; trail < 255; trail++) {
            /* One record per line; stop at the first failed write. */
            if (fprintf(fp, "%c%c,%d,%d\n", lead, trail, lead, trail) < 0) {
                ok = false;
                break;
            }
        }
    }

    /* fclose flushes buffered output, so its result is part of success. */
    if (fclose(fp) != 0) {
        ok = false;
    }
    return ok;
}
/*********************************************************************
 *
 *  Func Name  : CC_Find
 *
 *  Description: Find a Chinese sub-string in the Chinese string.
 *               Only matches that begin at an even byte offset from
 *               the start of `string` are accepted; a match at an odd
 *               offset is skipped and the search continues behind it.
 *
 *  Parameters : string:     Null-terminated string to search
 *
 *               strCharSet: Null-terminated string to search for
 *
 *  Returns    : pointer to the first accepted match inside `string`,
 *               or NULL when there is none or an argument is NULL
 *  Author     : Kevin Zhang
 *  History    :
 *               1.create 2002-1-8
 *********************************************************************/
char *CC_Find(const char *string, const char *strCharSet)
{
    const char *scan;
    const char *hit;

    if (string == NULL || strCharSet == NULL) {
        return NULL;
    }

    scan = string;
    while ((hit = strstr(scan, strCharSet)) != NULL) {
        if ((hit - string) % 2 == 0) {
            /* Cast drops const to keep the historical return type. */
            return (char *)hit;
        }
        /* Odd offset: skip this hit and look for a later one. An odd
           hit implies a non-empty needle, so hit + 1 stays in bounds. */
        scan = hit + 1;
    }
    return NULL;
}
- /*********************************************************************
- *
- * Func Name : charType
- *
- * Description: Judge the type of sChar or (sChar,sChar+1)
- *
- *
- * Parameters : sChar: pointer to the character to judge (sChar alone, or the pair sChar and sChar+1)
- *
- * Returns : int : the type of char
- * Author : Kevin Zhang
- * History :
- * 1.create 2002-1-8
- *********************************************************************/
- int charType(unsigned char *sChar)
- {
- if(*sChar<128)
- {
- if(strchr("