ContextStat.cpp
上传用户:chen_dj
上传日期:2013-04-22
资源大小:111k
文件大小:8k
源码类别:

多国语言处理

开发平台:

C/C++

  1. /****************************************************************************
  2.  *
  3.  * Copyright (c) 2000, 2001 
  4.  *     Machine Group
  5.  *     Software Research Lab.
  6.  *     Institute of Computing Tech.
  7.  *     Chinese Academy of Sciences
  8.  *     All rights reserved.
  9.  *
  10.  * This file is the confidential and proprietary property of 
  11.  * Institute of Computing Tech. and the possession or use of this file requires 
  12.  * a written license from the author.
  13.  * Filename: ContextStat.cpp
  14.  * Abstract:
  15.  *           implementation of the CContextStat class.
  16.  * Author:   Kevin Zhang 
  17.  *          (zhanghp@software.ict.ac.cn)
  18.  * Date:     2002-1-24
  19.  *
  20.  * Notes:
  21.  *                
  22.  ****************************************************************************/
  23. #include "stdafx.h"
  24. #include "ContextStat.h"
  25. #include "Utility.h"
  26. #include <memory.h>
  27. #include <stdio.h>
  28. #include <string.h>
  29. //////////////////////////////////////////////////////////////////////
  30. // Construction/Destruction
  31. //////////////////////////////////////////////////////////////////////
  32. CContextStat::CContextStat()
  33. {
  34. m_pSymbolTable=0;//new buffer for symbol
  35. m_pContext=0;//init with empty
  36. }
  37. CContextStat::~CContextStat()
  38. {
  39. delete [] m_pSymbolTable;
  40. PMYCONTEXT pCur=m_pContext,pTemp;
  41. while(pCur!=NULL)
  42. {//delete the context array
  43. pTemp=pCur->next;
  44. for(int i=0;i<m_nTableLen;i++)
  45.    delete [] pCur->aContextArray[i];
  46. delete [] pCur->aContextArray;
  47. delete [] pCur->aTagFreq;
  48. delete pCur;
  49. pCur=pTemp;
  50. }
  51. }
  52. bool CContextStat::SetSymbol(int *nSymbol)
  53. {
  54. memcpy(m_pSymbolTable,nSymbol,sizeof(int)*m_nTableLen);
  55. return true;
  56. }
  57. bool CContextStat::Add(int nKey, int nPrevSymbol, int nCurSymbol, int nFrequency)
  58. {//Add the context symbol to the array
  59. PMYCONTEXT pRetItem,pNew;
  60. int nPrevIndex,nCurIndex;
  61.     if(!GetItem(nKey,&pRetItem))//Not get it
  62. {
  63. pNew=new MYCONTEXT;
  64. pNew->nKey=nKey;
  65. pNew->nTotalFreq=0;
  66. pNew->next=NULL;
  67. pNew->aContextArray=new int*[m_nTableLen];
  68. pNew->aTagFreq=new int[m_nTableLen];
  69. memset(pNew->aTagFreq,0,sizeof(int)*m_nTableLen);
  70. for(int i=0;i<m_nTableLen;i++)
  71. {//new buffer for every dimension
  72. pNew->aContextArray[i]=new int[m_nTableLen];
  73. memset(pNew->aContextArray[i],0,sizeof(int)*m_nTableLen);//Init the frequency
  74. }
  75. if(pRetItem==NULL)//Empty, the new item is head
  76. m_pContext=pNew;
  77. else//Link the new item between pRetItem and its next item
  78. {
  79. pNew->next=pRetItem->next;
  80. pRetItem->next=pNew;
  81. }
  82. pRetItem=pNew;
  83. }
  84. nPrevIndex=BinarySearch(nPrevSymbol,m_pSymbolTable,m_nTableLen);
  85. if(nPrevSymbol>256&&nPrevIndex==-1)//Not find, just for 'nx' and other uncommon POS
  86. nPrevIndex=BinarySearch(nPrevSymbol-nPrevSymbol%256,m_pSymbolTable,m_nTableLen);
  87. nCurIndex=BinarySearch(nCurSymbol,m_pSymbolTable,m_nTableLen);
  88. if(nCurSymbol>256&&nCurIndex==-1)//Not find, just for 'nx' and other uncommon POS
  89. nCurIndex=BinarySearch(nCurSymbol-nCurSymbol%256,m_pSymbolTable,m_nTableLen);
  90.     if(nPrevIndex==-1||nCurIndex==-1)//error finding the symbol
  91. return false;
  92. pRetItem->aContextArray[nPrevIndex][nCurIndex]+=nFrequency;//Add the frequency
  93. pRetItem->aTagFreq[nPrevIndex]+=nFrequency;
  94. pRetItem->nTotalFreq+=nFrequency;
  95. return true;
  96. }
  97. bool CContextStat::Save(char *sFilename)
  98. {
  99. FILE *fp,*fp1;
  100. PMYCONTEXT pCur;
  101. char sFileShow[100];
  102. int i;
  103. if((fp=fopen(sFilename,"wb"))==NULL)
  104. {
  105. return false;
  106. }
  107. strcpy(sFileShow,sFilename);
  108. strcat(sFileShow,".shw");
  109. if((fp1=fopen(sFileShow,"wb"))==NULL)
  110. {
  111. return false;
  112. }
  113.     fwrite(&m_nTableLen,sizeof(m_nTableLen),1,fp);//write the table length
  114.     fwrite(m_pSymbolTable,sizeof(int),m_nTableLen,fp);//write the symbol table
  115.     fprintf(fp1,"Table Len=%dnSymbol:n",m_nTableLen);
  116.     for(i=0;i<m_nTableLen;i++)
  117.  fprintf(fp1,"%d ",m_pSymbolTable[i]);
  118. fprintf(fp1,"n");
  119. pCur=m_pContext;
  120. while(pCur!=NULL)
  121. {
  122. fwrite(&pCur->nKey,sizeof(int),1,fp);
  123. fwrite(&pCur->nTotalFreq,sizeof(int),1,fp);
  124. fprintf(fp1,"nKey=%d,Total frequency=%d:n",pCur->nKey,pCur->nTotalFreq);
  125. fwrite(pCur->aTagFreq,sizeof(int),m_nTableLen,fp);//the every POS frequency
  126.         for(i=0;i<m_nTableLen;i++)
  127. {
  128. fwrite(pCur->aContextArray[i],sizeof(int),m_nTableLen,fp);
  129. fprintf(fp1,"No.%2d=%3d: ",i,m_pSymbolTable[i]);
  130.             for(int j=0;j<m_nTableLen;j++)
  131. fprintf(fp1,"%5d ",pCur->aContextArray[i][j]);
  132.             fprintf(fp1,"total=%d:n",pCur->aTagFreq[i]);
  133. }
  134. pCur=pCur->next;
  135. }
  136. fclose(fp);
  137. fclose(fp1);
  138. return true;
  139. }
  140. bool CContextStat::Load(char *sFilename)
  141. {
  142. FILE *fp;
  143. if((fp=fopen(sFilename,"rb"))==NULL)
  144. {
  145. return false;
  146. }
  147. PMYCONTEXT pCur=m_pContext,pTemp,pPre=NULL;
  148. while(pCur!=NULL)
  149. {//delete the context array
  150. pTemp=pCur->next;
  151. for(int i=0;i<m_nTableLen;i++)
  152.    delete [] pCur->aContextArray[i];
  153. delete [] pCur->aContextArray;
  154. delete [] pCur->aTagFreq;
  155. delete pCur;
  156. pCur=pTemp;
  157. }
  158. fread(&m_nTableLen,sizeof(m_nTableLen),1,fp);//write the table length
  159. if(m_pSymbolTable!=NULL)
  160. delete [] m_pSymbolTable;
  161. m_pSymbolTable=new int[m_nTableLen];//new buffer for symbol
  162. fread(m_pSymbolTable,sizeof(int),m_nTableLen,fp);//write the symbol table
  163.     //free exists items
  164.     while(!feof(fp))
  165. {//Read the context 
  166. pCur=new MYCONTEXT;
  167. pCur->next=NULL;
  168. if(fread(&pCur->nKey,sizeof(int),1,fp)<1)//Read error
  169. {
  170. delete pCur;
  171. break;
  172. }
  173.         fread(&pCur->nTotalFreq,sizeof(int),1,fp);
  174. pCur->aTagFreq=new int [m_nTableLen];
  175. fread(pCur->aTagFreq,sizeof(int),m_nTableLen,fp);//the every POS frequency
  176. pCur->aContextArray=new int *[m_nTableLen];
  177.         for(int i=0;i<m_nTableLen;i++)
  178. {
  179.     pCur->aContextArray[i]=new int[m_nTableLen];
  180. fread(pCur->aContextArray[i],sizeof(int),m_nTableLen,fp);
  181. }
  182. if(pPre==NULL)
  183. m_pContext=pCur;
  184. else
  185. pPre->next=pCur;
  186. pPre=pCur;
  187. }
  188. fclose(fp);
  189. return true;
  190. }
  191. double CContextStat::GetContextPossibility(int nKey, int nPrev, int nCur)
  192. {
  193. PMYCONTEXT pCur;
  194. int nCurIndex=BinarySearch(nCur,m_pSymbolTable,m_nTableLen);
  195. int nPrevIndex=BinarySearch(nPrev,m_pSymbolTable,m_nTableLen);
  196. if(!GetItem(nKey,&pCur)||nCurIndex==-1||nPrevIndex==-1||pCur->aTagFreq[nPrevIndex]==0||pCur->aContextArray[nPrevIndex][nCurIndex]==0)
  197. return 0.000001;//return a lower value, not 0 to prevent data sparse
  198. int nPrevCurConFreq=pCur->aContextArray[nPrevIndex][nCurIndex];
  199. int nPrevFreq=pCur->aTagFreq[nPrevIndex];
  200. return 0.9*(double)nPrevCurConFreq/(double)nPrevFreq+0.1*(double)nPrevFreq/(double)pCur->nTotalFreq;
  201. //0.9 and 0.1 is a value based experience
  202. }
  203. int CContextStat::GetFrequency(int nKey, int nSymbol)
  204. {//Get the frequency which nSymbol appears
  205. PMYCONTEXT pFound;
  206. int nIndex,nFrequency=0;
  207. if(!GetItem(nKey,&pFound))//Not found such a item
  208. return 0;
  209. nIndex=BinarySearch(nSymbol,m_pSymbolTable,m_nTableLen);
  210.     if(nIndex==-1)//error finding the symbol
  211. return 0;
  212. nFrequency=pFound->aTagFreq[nIndex];//Add the frequency
  213. return nFrequency;
  214. }
  215. bool CContextStat::GetItem(int nKey,PMYCONTEXT *pItemRet)
  216. {//Get the item according the nKey
  217. PMYCONTEXT pCur=m_pContext,pPrev=NULL;
  218. if(nKey==0)
  219. {
  220. *pItemRet=m_pContext;
  221. return true;
  222. }
  223. while(pCur!=NULL&&pCur->nKey<nKey)
  224. {//delete the context array
  225. pPrev=pCur;
  226. pCur=pCur->next;
  227. }
  228.     if(pCur!=NULL&&pCur->nKey==nKey)
  229. {//find it and return the current item
  230. *pItemRet=pCur;
  231. return true;
  232. }
  233. *pItemRet=pPrev;
  234. return false;
  235. }
  236. bool CContextStat::SetTableLen(int nTableLen)
  237. {
  238. m_nTableLen=nTableLen;//Set the table len
  239. m_pSymbolTable=new int[nTableLen];//new buffer for symbol
  240. m_pContext=0;//init with empty
  241. return true;
  242. }