UnknowWord.cpp
上传用户:sunyong76
上传日期:2021-10-03
资源大小:2236k
文件大小:5k
源码类别:

多国语言处理

开发平台:

Java

  1. //////////////////////////////////////////////////////////////////////
  2. //ICTCLAS简介:计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
  3. //             功能有:中文分词;词性标注;未登录词识别。
  4. //             分词正确率高达97.58%(973专家评测结果),
  5. //             未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
  6. //             处理速度为31.5Kbytes/s。
  7. //著作权:  Copyright © 2002-2005中科院计算所 职务著作权人:张华平 刘群
  8. //遵循协议:自然语言处理开放资源许可证1.0
  9. //Email: zhanghp@software.ict.ac.cn
  10. //Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
  11. /****************************************************************************
  12.  *
  13.  * Copyright (c) 2000, 2001 
  14.  *     Machine Group
  15.  *     Software Research Lab.
  16.  *     Institute of Computing Tech.
  17.  *     Chinese Academy of Sciences
  18.  *     All rights reserved.
  19.  *
  20.  * This file is the confidential and proprietary property of 
  21.  * Institute of Computing Tech. and the possession or use of this file requires 
  22.  * a written license from the author.
  23.  * Filename: UnknowWord.cpp
  24.  * Abstract:
  25.  *           implementation of the CUnknowWord class.
  26.  * Author:   Kevin Zhang 
  27.  *          (zhanghp@software.ict.ac.cn)
  28.  * Date:     2002-4-23
  29.  *
  30.  * Notes:   Unknown words recognition based on Role-tagging
  31.  *                
  32.  ****************************************************************************/
  33. #include "stdafx.h"
  34. #include "UnknowWord.h"
  35. #include <string.h>
  36. #include <math.h>
  37. //////////////////////////////////////////////////////////////////////
  38. // Construction/Destruction
  39. //////////////////////////////////////////////////////////////////////
  40. CUnknowWord::CUnknowWord()
  41. {
  42. m_sUnknownFlags[0]=0;
  43. }
  44. CUnknowWord::~CUnknowWord()
  45. {
  46. }
  47. //Unknown word recognition
  48. //pWordSegResult:word Segmentation result;graphOptimum: The optimized segmentation graph
  49. //graphSeg: The original segmentation graph
  50. bool CUnknowWord::Recognition(PWORD_RESULT pWordSegResult, CDynamicArray &graphOptimum,CSegGraph &graphSeg,CDictionary &dictCore)
  51. {
  52. int nStartPos=0,j=0,nAtomStart,nAtomEnd,nPOSOriginal;
  53. ELEMENT_TYPE dValue;
  54. m_roleTag.POSTagging(pWordSegResult,dictCore,m_dict);
  55. //Tag the segmentation with unknown recognition roles according the core dictionary and unknown recognition dictionary
  56. for(int i=0;i<m_roleTag.m_nUnknownIndex;i++)
  57. {
  58. while((unsigned int)j<graphSeg.m_nAtomCount&&nStartPos<m_roleTag.m_nUnknownWords[i][0])
  59. {
  60. nStartPos+=graphSeg.m_nAtomLength[j++];
  61. }
  62. nAtomStart=j;
  63. while((unsigned int)j<graphSeg.m_nAtomCount&&nStartPos<m_roleTag.m_nUnknownWords[i][1])
  64. {
  65. nStartPos+=graphSeg.m_nAtomLength[j++];
  66. }
  67. nAtomEnd=j;
  68. if(nAtomStart<nAtomEnd)
  69. {
  70. graphOptimum.GetElement(nAtomStart,nAtomEnd,&dValue,&nPOSOriginal);
  71. if(dValue>m_roleTag.m_dWordsPossibility[i])//Set the element with less frequency
  72. graphOptimum.SetElement(nAtomStart,nAtomEnd,m_roleTag.m_dWordsPossibility[i],m_nPOS,m_sUnknownFlags);
  73. }
  74. }
  75. return true;
  76. }
  77. //Load unknown recognition dictionary
  78. //Load context
  79. //type: Unknown words type (including person,place,transliterion and so on)
  80. bool CUnknowWord::Configure(char *sConfigFile,enum TAG_TYPE type)
  81. {
  82. char sFilename[100];
  83. //Load the unknown recognition dictionary
  84. strcpy(sFilename,sConfigFile);
  85. strcat(sFilename,".dct");
  86. m_dict.Load(sFilename);
  87. //Load the unknown recognition context
  88. strcpy(sFilename,sConfigFile);
  89. strcat(sFilename,".ctx");
  90. m_roleTag.LoadContext(sFilename);
  91. //Set the tagging type
  92. m_roleTag.SetTagType(type);
  93. switch(type)
  94. {
  95. case TT_PERSON:
  96. case TT_TRANS_PERSON://Set the special flag for transliterations
  97. m_nPOS=-28274;//-'n'*256-'r';
  98. strcpy(m_sUnknownFlags,"未##人");
  99. break;
  100. case TT_PLACE:
  101. m_nPOS=-28275;//-'n'*256-'s';
  102. strcpy(m_sUnknownFlags,"未##地");
  103. break;
  104. default :
  105. m_nPOS=0;
  106. break;
  107. }
  108. return true;
  109. }
  110. //Judge whether the name is a given name
  111. bool CUnknowWord::IsGivenName(char *sName)
  112. {
  113. char sFirstChar[3],sSecondChar[3];
  114. ELEMENT_TYPE dGivenNamePossibility=0,dSingleNamePossibility=0;
  115. if(strlen(sName)!=4)
  116. return false;
  117. strncpy(sFirstChar,sName,2);
  118. sFirstChar[2]=0;
  119. strncpy(sSecondChar,sName+2,2);
  120. sSecondChar[2]=0;
  121. //The possibility of P(Wi|Ti)
  122. dGivenNamePossibility+=log((double)m_dict.GetFrequency(sFirstChar,2)+1.0)-log(m_roleTag.m_context.GetFrequency(0,2)+1.0);
  123. dGivenNamePossibility+=log((double)m_dict.GetFrequency(sSecondChar,3)+1.0)-log(m_roleTag.m_context.GetFrequency(0,3)+1.0);
  124. //The possibility of conversion from 2 to 3
  125. dGivenNamePossibility+=log(m_roleTag.m_context.GetContextPossibility(0,2,3)+1.0)-log(m_roleTag.m_context.GetFrequency(0,2)+1.0);
  126. //The possibility of P(Wi|Ti)
  127. dSingleNamePossibility+=log((double)m_dict.GetFrequency(sFirstChar,1)+1.0)-log(m_roleTag.m_context.GetFrequency(0,1)+1.0);
  128. dSingleNamePossibility+=log((double)m_dict.GetFrequency(sSecondChar,4)+1.0)-log(m_roleTag.m_context.GetFrequency(0,4)+1.0);
  129. //The possibility of conversion from 1 to 4
  130. dSingleNamePossibility+=log(m_roleTag.m_context.GetContextPossibility(0,1,4)+1.0)-log(m_roleTag.m_context.GetFrequency(0,1)+1.0);
  131. if(dSingleNamePossibility>=dGivenNamePossibility)//张震||m_dict.GetFrequency(sFirstChar,1)/m_dict.GetFrequency(sFirstChar,2)>=10
  132. //The possibility being a single given name is more than being a 2-char given name
  133. return false;
  134. return true;
  135. }