UnknowWord.cpp
上传用户:chen_dj
上传日期:2013-04-22
资源大小:111k
文件大小:5k
源码类别:

多国语言处理

开发平台:

C/C++

  1. /****************************************************************************
  2.  *
  3.  * Copyright (c) 2000, 2001 
  4.  *     Machine Group
  5.  *     Software Research Lab.
  6.  *     Institute of Computing Tech.
  7.  *     Chinese Academy of Sciences
  8.  *     All rights reserved.
  9.  *
  10.  * This file is the confidential and proprietary property of 
  11.  * Institute of Computing Tech. and the possession or use of this file requires 
  12.  * a written license from the author.
  13.  * Filename: UnknowWord.cpp
  14.  * Abstract:
  15.  *           implementation of the CUnknowWord class.
  16.  * Author:   Kevin Zhang 
  17.  *          (zhanghp@software.ict.ac.cn)
  18.  * Date:     2002-4-23
  19.  *
  20.  * Notes:   Unknown words recognition based on Role-tagging
  21.  *                
  22.  ****************************************************************************/
  23. #include "stdafx.h"
  24. #include "UnknowWord.h"
  25. #include <string.h>
  26. #include <math.h>
  27. //////////////////////////////////////////////////////////////////////
  28. // Construction/Destruction
  29. //////////////////////////////////////////////////////////////////////
  30. CUnknowWord::CUnknowWord()
  31. {
  32. }
  33. CUnknowWord::~CUnknowWord()
  34. {
  35. }
  36. //Unknown word recognition
  37. //pWordSegResult:word Segmentation result;graphOptimum: The optimized segmentation graph
  38. //graphSeg: The original segmentation graph
  39. bool CUnknowWord::Recognition(PWORD_RESULT pWordSegResult, CDynamicArray &graphOptimum,CSegGraph &graphSeg,CDictionary &dictCore)
  40. {
  41. int nStartPos=0,j=0,nAtomStart,nAtomEnd,nPOSOriginal;
  42. ELEMENT_TYPE dValue;
  43. m_roleTag.POSTagging(pWordSegResult,dictCore,m_dict);
  44. //Tag the segmentation with unknown recognition roles according the core dictionary and unknown recognition dictionary
  45. for(int i=0;i<m_roleTag.m_nUnknownIndex;i++)
  46. {
  47. while((unsigned int)j<graphSeg.m_nAtomCount&&nStartPos<m_roleTag.m_nUnknownWords[i][0])
  48. {
  49. nStartPos+=graphSeg.m_nAtomLength[j++];
  50. }
  51. nAtomStart=j;
  52. while((unsigned int)j<graphSeg.m_nAtomCount&&nStartPos<m_roleTag.m_nUnknownWords[i][1])
  53. {
  54. nStartPos+=graphSeg.m_nAtomLength[j++];
  55. }
  56. nAtomEnd=j;
  57. if(nAtomStart<nAtomEnd)
  58. {
  59. graphOptimum.GetElement(nAtomStart,nAtomEnd,&dValue,&nPOSOriginal);
  60. if(dValue>m_roleTag.m_dWordsPossibility[i])//Set the element with less frequency
  61. graphOptimum.SetElement(nAtomStart,nAtomEnd,m_roleTag.m_dWordsPossibility[i],m_nPOS);
  62. }
  63. }
  64. return true;
  65. }
  66. //Load unknown recognition dictionary
  67. //Load context
  68. //type: Unknown words type (including person,place,transliterion and so on)
  69. bool CUnknowWord::Configure(char *sConfigFile,enum TAG_TYPE type)
  70. {
  71. char sFilename[100];
  72. //Load the unknown recognition dictionary
  73. strcpy(sFilename,sConfigFile);
  74. strcat(sFilename,".dct");
  75. m_dict.Load(sFilename);
  76. //Load the unknown recognition context
  77. strcpy(sFilename,sConfigFile);
  78. strcat(sFilename,".ctx");
  79. m_roleTag.LoadContext(sFilename);
  80. //Set the tagging type
  81. m_roleTag.SetTagType(type);
  82. switch(type)
  83. {
  84. case TT_PERSON:
  85. m_nPOS=-'n'*256-'r';
  86. break;
  87. case TT_TRANS://Set the special flag for transliterations
  88. m_nPOS=-'t'*256-'t';
  89. break;
  90. case TT_PLACE:
  91. m_nPOS=-'n'*256-'s';
  92. break;
  93. default :
  94. m_nPOS=0;
  95. break;
  96. }
  97. return true;
  98. }
  99. //Judge whether the name is a given name
  100. bool CUnknowWord::IsGivenName(char *sName)
  101. {
  102. char sFirstChar[3],sSecondChar[3];
  103. ELEMENT_TYPE dGivenNamePossibility=0,dSingleNamePossibility=0;
  104. if(strlen(sName)!=4)
  105. return false;
  106. strncpy(sFirstChar,sName,2);
  107. sFirstChar[2]=0;
  108. strncpy(sSecondChar,sName+2,2);
  109. sSecondChar[2]=0;
  110. //The possibility of P(Wi|Ti)
  111. dGivenNamePossibility+=log((double)m_dict.GetFrequency(sFirstChar,2)+1.0)-log(m_roleTag.m_context.GetFrequency(0,2)+1.0);
  112. dGivenNamePossibility+=log((double)m_dict.GetFrequency(sSecondChar,3)+1.0)-log(m_roleTag.m_context.GetFrequency(0,3)+1.0);
  113. //The possibility of conversion from 2 to 3
  114. dGivenNamePossibility+=log(m_roleTag.m_context.GetContextPossibility(0,2,3)+1.0)-log(m_roleTag.m_context.GetFrequency(0,2)+1.0);
  115. //The possibility of P(Wi|Ti)
  116. dSingleNamePossibility+=log((double)m_dict.GetFrequency(sFirstChar,1)+1.0)-log(m_roleTag.m_context.GetFrequency(0,1)+1.0);
  117. dSingleNamePossibility+=log((double)m_dict.GetFrequency(sSecondChar,4)+1.0)-log(m_roleTag.m_context.GetFrequency(0,4)+1.0);
  118. //The possibility of conversion from 1 to 4
  119. dSingleNamePossibility+=log(m_roleTag.m_context.GetContextPossibility(0,1,4)+1.0)-log(m_roleTag.m_context.GetFrequency(0,1)+1.0);
  120. if(dSingleNamePossibility>=dGivenNamePossibility)//张震||m_dict.GetFrequency(sFirstChar,1)/m_dict.GetFrequency(sFirstChar,2)>=10
  121. //The possibility being a single given name is more than being a 2-char given name
  122. return false;
  123. return true;
  124. }