Utility.h
上传用户:jxhy0791
上传日期:2007-05-24
资源大小:6173k
文件大小:6k
源码类别:

多国语言处理

开发平台:

Visual C++

  1. /****************************************************************************
  2.  *
  3.  * Copyright (c) 2000, 2001 
  4.  *     Machine Group
  5.  *     Software Research Lab.
  6.  *     Institute of Computing Tech.
  7.  *     Chinese Academy of Sciences
  8.  *     All rights reserved.
  9.  *
  10.  * This file is the confidential and proprietary property of 
  11.  * Institute of Computing Tech. and the possession or use of this file requires 
  12.  * a written license from the author.
  13.  * Filename: Utility.h
  14.  * Abstract:
  15.  *           Utility functions for Chinese Language Processing
  16.  * Author:   Kevin Zhang 
  17.  *          (zhanghp@software.ict.ac.cn)
  18.  * Date:     2002-1-8
  19.  *
  20.  * Notes:
  21.  *                
  22.  * 
  23.  ****************************************************************************/
  24. #if !defined(AFX_CHINESE_UTILITY_H__B6D7EA03_7BCD_46AD_B38C_D8033ACD5813__INCLUDED_)
  25. #define AFX_CHINESE_UTILITY_H__B6D7EA03_7BCD_46AD_B38C_D8033ACD5813__INCLUDED_
  26. #if _MSC_VER > 1000
  27. #pragma once
  28. #endif // _MSC_VER > 1000
  // Character-type codes.
  // NOTE(review): presumably the values produced by charType(); confirm
  // against the implementation.
  // Every derived value is parenthesized so that it expands safely inside a
  // larger expression (e.g. 2 * CT_DELIMITER is 8, not 3+1*2).
  #define  CT_SENTENCE_BEGIN 0                // Sentence begin
  #define  CT_SENTENCE_END   1                // Sentence ending
  #define  CT_SINGLE         3                // Single-byte character
  #define  CT_DELIMITER      (CT_SINGLE+1)    // Delimiter
  #define  CT_CHINESE        (CT_SINGLE+2)    // Chinese character
  #define  CT_LETTER         (CT_SINGLE+3)    // Letter (original note: HanYu Pinyin)
  #define  CT_NUM            (CT_SINGLE+4)    // Numeric character
  #define  CT_INDEX          (CT_SINGLE+5)    // Index-number character
  #define  CT_OTHER          (CT_SINGLE+12)   // Other
  // Postfix tables.
  // NOTE(review): the entries look like place-name suffixes and are presumably
  // consumed by PostfixSplit(); confirm against the implementation.
  //
  // POSTFIX_SINGLE: single-character postfixes packed into one string.
  #define POSTFIX_SINGLE "坝邦堡杯城池村单岛道堤店洞渡队法峰府冈港阁宫沟国海号河湖环集江奖礁角街井郡坑口矿里岭楼路门盟庙弄牌派坡铺旗桥区渠泉人山省市水寺塔台滩坛堂厅亭屯湾文屋溪峡县线乡巷型洋窑营屿语园苑院闸寨站镇州庄族陂庵町"
  // POSTFIX_MUTIPLE: brace-enclosed initializer list of multi-character
  // postfixes; the final empty string "" marks the end of the list.
  #define POSTFIX_MUTIPLE {"半岛","草原","城市","大堤","大公国","大桥","地区","帝国","渡槽","港口","高速公路","高原","公路","公园","共和国","谷地","广场","国道","海峡","胡同","机场","集镇","教区","街道","口岸","码头","煤矿","牧场","农场","盆地","平原","丘陵","群岛","沙漠","沙洲","山脉","山丘","水库","隧道","特区","铁路","新村","雪峰","盐场","盐湖","渔场","直辖市","自治区","自治县","自治州",""}
  // Character sets for foreign-name transliterations, one string per source
  // language (see the TT_* codes).
  // NOTE(review): presumably these are the Chinese characters that commonly
  // appear in transliterated English / Russian / Japanese names and are used
  // by the IsForeign()/GetForeignType() family; confirm against the
  // implementation.
  #define  TRANS_ENGLISH   "·—阿埃艾爱安昂敖奥澳笆芭巴白拜班邦保堡鲍北贝本比毕彼别波玻博勃伯泊卜布才采仓查差柴彻川茨慈次达大戴代丹旦但当道德得的登迪狄蒂帝丁东杜敦多额俄厄鄂恩尔伐法范菲芬费佛夫福弗甫噶盖干冈哥戈革葛格各根古瓜哈海罕翰汗汉豪合河赫亨侯呼胡华霍基吉及加贾坚简杰金京久居君喀卡凯坎康考柯科可克肯库奎拉喇莱来兰郎朗劳勒雷累楞黎理李里莉丽历利立力连廉良列烈林隆卢虏鲁路伦仑罗洛玛马买麦迈曼茅茂梅门蒙盟米蜜密敏明摩莫墨默姆木穆那娜纳乃奈南内尼年涅宁纽努诺欧帕潘畔庞培佩彭皮平泼普其契恰强乔切钦沁泉让热荣肉儒瑞若萨塞赛桑瑟森莎沙山善绍舍圣施诗石什史士守斯司丝苏素索塔泰坦汤唐陶特提汀图土吐托陀瓦万王旺威韦维魏温文翁沃乌吾武伍西锡希喜夏相香歇谢辛新牙雅亚彦尧叶依伊衣宜义因音英雍尤于约宰泽增詹珍治中仲朱诸卓孜祖佐伽娅尕腓滕济嘉津赖莲琳律略慕妮聂裴浦奇齐琴茹珊卫欣逊札哲智兹芙汶迦珀琪梵斐胥黛"
  #define  TRANS_RUSSIAN   "·阿安奥巴比彼波布察茨大德得丁杜尔法夫伏甫盖格哈基加坚捷金卡科可克库拉莱兰勒雷里历利连列卢鲁罗洛马梅蒙米姆娜涅宁诺帕泼普奇齐乔切日萨色山申什斯索塔坦特托娃维文乌西希谢亚耶叶依伊以扎佐柴达登蒂戈果海赫华霍吉季津柯理琳玛曼穆纳尼契钦丘桑沙舍泰图瓦万雅卓兹"
  #define  TRANS_JAPANESE  "安奥八白百邦保北倍本比滨博步部彩菜仓昌长朝池赤川船淳次村大代岛稻道德地典渡尔繁饭风福冈高工宫古谷关广桂贵好浩和合河黑横恒宏后户荒绘吉纪佳加见健江介金今进井静敬靖久酒菊俊康可克口梨理里礼栗丽利立凉良林玲铃柳隆鹿麻玛美萌弥敏木纳南男内鸟宁朋片平崎齐千前浅桥琴青清庆秋丘曲泉仁忍日荣若三森纱杉山善上伸神圣石实矢世市室水顺司松泰桃藤天田土万望尾未文武五舞西细夏宪相小孝新星行雄秀雅亚岩杨洋阳遥野也叶一伊衣逸义益樱永由有佑宇羽郁渊元垣原远月悦早造则泽增扎宅章昭沼真政枝知之植智治中忠仲竹助椎子佐阪坂堀荻菅薰浜濑鸠筱"
  // Translation (transliteration) type codes; each names one of the TRANS_*
  // character sets.
  // NOTE(review): presumably the values returned by GetForeignType(); confirm.
  #define  TT_ENGLISH   0
  #define  TT_RUSSIAN   1
  #define  TT_JAPANESE  2
  // Separator character sets (the macro names keep the historical spelling
  // "SEPERATOR" so existing users still compile).
  //   *_C_* : punctuation used in Chinese text
  //   *_E_* : punctuation used in English (single-byte) text
  // NOTE(review): the ASCII '!', '?', ':', ';', ',', '(' and ')' inside the two
  // *_C_* strings may originally have been their full-width forms; they are
  // kept exactly as found here. Verify against a pristine copy of the file.
  #define  SEPERATOR_C_SENTENCE "。!?:;…"
  #define  SEPERATOR_C_SUB_SENTENCE "、,()“”‘’"
  #define  SEPERATOR_E_SENTENCE "!?:;"
  // \042 is the octal escape for the double-quote character. Without the
  // backslash the set would wrongly contain the digits '4' and '2'.
  #define  SEPERATOR_E_SUB_SENTENCE ",()\042'"
  // Line-linking whitespace: newline, carriage return and spaces. Without the
  // backslashes the set would wrongly contain the letters 'n' and 'r'.
  #define  SEPERATOR_LINK "\n\r  "
  // Marker strings standing for the beginning and the end of a sentence.
  #define SENTENCE_BEGIN "始##始"
  #define SENTENCE_END   "末##末"
  // Generate the GB2312 list file.
  // sFileName: file name to use (presumably the output path; confirm).
  // The meaning of the bool result is not visible in this header.
  bool GB2312_Generate(char *sFileName);

  // Generate the Chinese character list file.
  // sFileName: file name to use (presumably the output path; confirm).
  // The meaning of the bool result is not visible in this header.
  bool CC_Generate(char *sFileName);
  // Find a Chinese sub-string (strCharSet) in the Chinese string (string).
  // NOTE(review): presumably returns a pointer into `string`, or NULL when
  // nothing is found, like strstr(); confirm against the implementation.
  char *CC_Find(const char *string, const char *strCharSet);

  // Judge the type of the character at sChar, or of the two-byte character
  // (sChar, sChar+1).
  // NOTE(review): presumably returns one of the CT_* codes; confirm.
  int charType(unsigned char *sChar);

  // Get the longest prefix of sSentence that is made up of Chinese characters.
  // NOTE(review): the unit of the returned value (bytes or characters) is not
  // visible in this header.
  unsigned int GetCCPrefix(unsigned char *sSentence);
  // Whole-string predicates: each judges whether sString consists entirely of
  // one category of character.

  // Judge whether the string is made up entirely of Chinese characters.
  bool IsAllChinese(unsigned char *sString);

  // Judge whether the string is made up entirely of non-Chinese characters.
  bool IsAllNonChinese(unsigned char *sString);

  // Judge whether the string is made up entirely of single-byte characters.
  bool IsAllSingleByte(unsigned char *sString);

  // Judge whether the string is made up entirely of numeric characters.
  bool IsAllNum(unsigned char *sString);

  // Judge whether the string is made up entirely of index-number characters.
  bool IsAllIndex(unsigned char *sString);

  // Judge whether the string is made up entirely of letter characters.
  bool IsAllLetter(unsigned char *sString);

  // Judge whether the string is made up entirely of delimiters.
  bool IsAllDelimiter(unsigned char *sString);
  // Binary-search the value nVal in the table nTable, which holds nTableLen
  // entries.
  // NOTE(review): the table presumably has to be sorted, and the meaning of the
  // result (index, or a not-found marker) is not visible in this header.
  int BinarySearch(int nVal, int *nTable,int nTableLen);
  // Judge whether sWord may be a foreign translation (transliterated word).
  bool IsForeign(char *sWord);

  // Decide whether the word is a Chinese numeral word.
  bool IsAllChineseNum(char *sWord);

  // Decide whether the word consists entirely of foreign-translation
  // characters.
  bool IsAllForeign(char *sWord);

  // NOTE(review): the original comment here ("Decide whether the word is all
  // non-foreign translation") does not fit an int-returning "count" function;
  // presumably this returns the number of foreign-translation characters in
  // sWord. Confirm against the implementation.
  int GetForeignCharCount(char *sWord);

  // Get the count of characters that occur both in sWord and in sCharSet.
  int GetCharCount(char *sCharSet,char *sWord);

  // Return the foreign type of sWord.
  // NOTE(review): presumably one of the TT_* codes; confirm.
  int GetForeignType(char *sWord);
  // Get the postfix of sWord.
  // NOTE(review): sWordRet and sPostfix are presumably caller-supplied output
  // buffers receiving the word without its postfix and the postfix itself;
  // required buffer sizes and the meaning of the bool result are not visible in
  // this header.
  bool PostfixSplit(char *sWord, char *sWordRet, char *sPostfix);

  // Disabled declaration, kept for reference (original note: "Judge whether
  // it's a num"):
  //bool IsSingleByteDelimiter(char cByteChar);
  99. #endif // !defined(AFX_CHINESE_UTILITY_H__B6D7EA03_7BCD_46AD_B38C_D8033ACD5813__INCLUDED_)