WORDLIST.CPP
上传用户:sanxfzhen
上传日期:2014-12-28
资源大小:2324k
文件大小:11k
源码类别:

多国语言处理

开发平台:

Visual C++

  1. // WordList.cpp: implementation of the CWordList class.
  2. //
  3. //////////////////////////////////////////////////////////////////////
  4. #include "stdafx.h"
  5. #include "WordList.h"
  6. #include "cataloglist.h"
  7. #include "fstream.h"
  8. #include "float.h"
  9. #include <math.h>
  10. #ifdef _DEBUG
  11. #undef THIS_FILE
  12. static char THIS_FILE[]=__FILE__;
  13. #define new DEBUG_NEW
  14. #endif
  15. AFX_INLINE UINT AFXAPI HashKey(CString key)
  16. {
  17. return HashKey((LPCTSTR)key);
  18. }
  19. void AFXAPI SerializeElements(CArchive& ar,CWordNode* pElements,int nCount)
  20. {
  21. ASSERT(nCount==0||
  22. AfxIsValidAddress(pElements,nCount*sizeof(CWordNode)));
  23. pElements->Serialize(ar);
  24. }
  25. //////////////////////////////////////////////////////////////////////
  26. // Construction/Destruction
  27. //////////////////////////////////////////////////////////////////////
  28. CWordList::CWordList()
  29. {
  30. m_lstWordList.InitHashTable(2000);
  31. }
  32. CWordList::~CWordList()
  33. {
  34. }
  35. CWordList& CWordList::operator =(CWordList &x)
  36. {
  37. if(this==&x) return *this;
  38. CString str;
  39. POSITION pos=x.GetFirstPosition();
  40. while(pos!=NULL)
  41. {
  42. CWordNode& wordnodesrc=x.GetNext(pos,str);
  43. CWordNode& wordnodedst=Add(str);
  44. wordnodedst=wordnodesrc;
  45. }
  46. return *this;
  47. }
  48. void CWordList::InitWordList() // initialize the word list
  49. {
  50. m_lstWordList.RemoveAll();
  51. }
  52. CWordNode& CWordList::Add(const CString str)
  53. {
  54. CWordNode& wordnode=m_lstWordList[str];
  55. return wordnode;
  56. }
  57. CWordNode& CWordList::Add(const CString str, long docID)
  58. {
  59.   CWordNode& wordnode=m_lstWordList[str];
  60. if(wordnode.m_lDocID!=docID) 
  61. {
  62. wordnode.m_lDocID=docID;
  63. wordnode.m_lDocFreq++;
  64. }
  65. wordnode.m_lWordFreq++;
  66. return wordnode;
  67. }
  68. CWordNode& CWordList::Add(const CString str, short cataID, long docID, int cataNum)
  69. {
  70.   CWordNode& wordnode=m_lstWordList[str];
  71. wordnode.InitBuffer(cataNum);
  72. if(wordnode.m_lDocID!=docID) 
  73. {
  74. wordnode.m_lDocID=docID;
  75. wordnode.m_pCataDocFreq[cataID]++;
  76. wordnode.m_lDocFreq++;
  77. }
  78. wordnode.m_pCataWordFreq[cataID]++;
  79. wordnode.m_lWordFreq++;
  80. return wordnode;
  81. }
  82. bool CWordList::GetFromFile(CString strFileName) // get from mid information files WordList.mid
  83. {
  84. InitWordList();
  85. CFile fIn;
  86. if(!fIn.Open(strFileName,CFile::modeRead) )
  87. {
  88. AfxMessageBox("无法打开文件"+strFileName+"!");
  89. return false;
  90. }
  91. CArchive ar(&fIn,CArchive::load);
  92. Serialize(ar);
  93. ar.Close();
  94. fIn.Close();
  95. return true;
  96. }
  97. //特征列表文件中每一行包含特征词和特征词的权重
  98. //格式为feature weight,如果没有weight那么认为weight为1
  99. bool CWordList::GetListFromFile(CString strFileName)
  100. {
  101. InitWordList();
  102. FILE *fp;
  103. if((fp=fopen(strFileName,"r"))==NULL)
  104. {
  105. AfxMessageBox("无法打开文件"+strFileName+"!");
  106. return false;
  107. }
  108. char no[10],feature[MAX_PATH],line[MAX_PATH];
  109. float weight=1.0;
  110. int num=0;
  111. while(!feof(fp)&&fgets(line,MAX_PATH,fp))
  112. {
  113. if(sscanf(line,"%s %s %f",no,feature,&weight)>0)
  114. {
  115. if(weight<=0) weight=1.0;
  116. CWordNode &node=Add(feature);
  117. node.m_nWordID=num;
  118. node.m_dWeight=weight;
  119. }
  120. else
  121. {
  122. CString str;
  123. str.Format("文件的第%d行格式错误!",num+1);
  124. AfxMessageBox(str);
  125. fclose(fp);
  126. return false;
  127. }
  128. num++;
  129. }
  130. fclose(fp);
  131. return true;
  132. }
  133. void CWordList::DumpWordList(CString strFileName)
  134. {
  135. FILE *stream;
  136. if( (stream  = fopen( strFileName, "w+" )) == NULL )
  137. {
  138. AfxMessageBox("无法创建文件"+strFileName+"!");
  139. return;
  140. }
  141. POSITION pos;
  142. CString     str;
  143. CWordNode   wordnode; 
  144. pos   = GetFirstPosition();
  145. while(pos!=NULL)
  146. {
  147. wordnode=GetNext(pos, str);
  148. fprintf(stream,"%d %s %fn",wordnode.m_nWordID,str,wordnode.m_dWeight);
  149. }
  150. fclose(stream);
  151. }
  152. void CWordList::DumpWordProList(CString strFileName,int CataNum)
  153. {
  154. FILE *stream;
  155. if( (stream  = fopen( strFileName, "w+" )) == NULL )
  156. {
  157. AfxMessageBox("无法创建文件"+strFileName+"!");
  158. return;
  159. }
  160. POSITION pos;
  161. CString     str;
  162. CWordNode   wordnode; 
  163. pos   = GetFirstPosition();
  164. fprintf(stream,"%d %dn",m_lstWordList.GetCount(),CataNum);
  165. while(pos!=NULL)
  166. {
  167. wordnode=GetNext(pos, str);
  168. fprintf(stream,"%d",wordnode.m_nWordID);
  169. for(int i=0;i<CataNum;i++)
  170. {
  171. fprintf(stream," %f",wordnode.m_pCataWeightPro[i]);
  172. }
  173. fprintf(stream,"n");
  174. }
  175. fclose(stream);
  176. }
  177. //建立词索引 
  178. void CWordList::IndexWord()
  179. {
  180. unsigned int i=0;
  181. CString str;
  182. POSITION pos=GetFirstPosition();
  183. while(pos!=NULL)
  184. {
  185. CWordNode& wordnode=GetNext(pos,str);
  186. wordnode.m_nWordID=i;
  187. i++;
  188. }
  189. }
  190. //计算每个特征的权重, 参数sum代表文档集中的文档总数
  191. //参数bMult含义与类CWordNode中函数ComputeWeight相同
  192. void CWordList::ComputeWeight(long sum, bool bMult)
  193. {
  194. if(sum<=0) return;
  195. CString str;
  196. POSITION pos=GetFirstPosition();
  197. while(pos!=NULL)
  198. {
  199. CWordNode& wordNode=GetNext(pos,str);
  200. wordNode.ComputeWeight(sum,bMult);
  201. }
  202. }
  203. void CWordList::DumpToFile(CString strFileName)
  204. {
  205. CFile fBinOut;
  206. if(!fBinOut.Open(strFileName,CFile::modeWrite | CFile::modeCreate) )
  207. {
  208. AfxMessageBox("无法创建文件"+strFileName+"!");
  209. return;
  210. }
  211. CArchive ar(&fBinOut,CArchive::store);
  212. Serialize(ar);
  213. ar.Close();
  214. fBinOut.Close();
  215. }
  216. POSITION CWordList::GetFirstPosition()
  217. {
  218. return m_lstWordList.GetStartPosition();
  219. }
  220. CWordNode& CWordList::GetNext(POSITION& pos, CString &str)
  221. {
  222. CWordNode node;
  223. m_lstWordList.GetNextAssoc(pos,str,node);
  224. return m_lstWordList[str];
  225. }
  226. int CWordList::GetCount()
  227. {
  228. return m_lstWordList.GetCount();
  229. }
  230. long CWordList::GetWordNum()
  231. {
  232. long n=0;
  233. POSITION pos=GetFirstPosition();
  234. CString strWord;
  235. while(pos!=NULL)
  236. {
  237. CWordNode& wordnode=GetNext(pos,strWord);
  238. n+=wordnode.GetWordNum();
  239. }
  240. return n;
  241. }
  242. void CWordList::Serialize(CArchive &ar)
  243. {
  244. m_lstWordList.Serialize(ar);
  245. }
  246. BOOL CWordList::Lookup(CString str, CWordNode &wordNode)
  247. {
  248. return m_lstWordList.Lookup(str,wordNode);
  249. }
  250. CString CWordList::GetWordByID(long wordID)
  251. {
  252. CString str;
  253. POSITION pos=GetFirstPosition();
  254. bool bFound=false;
  255. while(pos!=NULL)
  256. {
  257. CWordNode& wordnode=GetNext(pos,str);
  258. if(wordnode.m_nWordID==wordID)
  259. {
  260. bFound=true;
  261. break;
  262. }
  263. }
  264. if(bFound) return str;
  265. else return "";
  266. }
  267. void CWordList::SetAt(CString str,CWordNode& node)
  268. {
  269. m_lstWordList.SetAt(str,node);
  270. }
  271. void CWordNode::Serialize(CArchive &ar)
  272. {
  273. if(ar.IsStoring())
  274. {
  275. ar<<m_nWordID;
  276. ar<<m_dWeight;
  277. ar<<m_lDocFreq;
  278. ar<<m_lWordFreq;
  279. }
  280. else
  281. {
  282. ar>>m_nWordID;
  283. ar>>m_dWeight;
  284. ar>>m_lDocFreq;
  285. ar>>m_lWordFreq;
  286. }
  287. }
  288. CWordNode::CWordNode()
  289. m_dWeight=0.0;
  290. m_nAllocLen=0;
  291. m_pCataWeight=NULL;
  292. m_pCataWeightPro=NULL;
  293. m_pCataDocFreq=NULL;
  294. m_pCataWordFreq=NULL;
  295. m_lDocFreq=0;
  296. m_lWordFreq=0;
  297. m_nWordID=-1;
  298. m_lDocID=-1;
  299. }
  300. CWordNode::~CWordNode()
  301. {
  302. DeallocBuffer();
  303. }
  304. CWordNode& CWordNode::operator = (const CWordNode& x)
  305. {
  306. if(this==&x) return *this;
  307. m_nWordID=x.m_nWordID;
  308. m_dWeight=x.m_dWeight;
  309. m_lDocID=x.m_lDocID;
  310. m_nAllocLen=x.m_nAllocLen;
  311. m_lDocFreq=x.m_lDocFreq;
  312. m_lWordFreq=x.m_lWordFreq;
  313. AllocBuffer(x.m_nAllocLen);
  314. if(x.m_pCataWeight!=NULL)
  315. memcpy(m_pCataWeight,x.m_pCataWeight,m_nAllocLen*sizeof(double));
  316. if(x.m_pCataWeightPro!=NULL)
  317. memcpy(m_pCataWeightPro,x.m_pCataWeightPro,m_nAllocLen*sizeof(double));
  318. if(x.m_pCataDocFreq!=NULL)
  319. memcpy(m_pCataDocFreq,x.m_pCataDocFreq,m_nAllocLen*sizeof(long));
  320. if(x.m_pCataWordFreq!=NULL)
  321. memcpy(m_pCataWordFreq,x.m_pCataWordFreq,m_nAllocLen*sizeof(long));
  322. return *this;
  323. }
  324. void CWordNode::DeallocBuffer()
  325. {
  326. if(m_pCataWeight!=NULL)
  327. {
  328. delete []m_pCataWeight;
  329. m_pCataWeight=NULL;
  330. }
  331. if(m_pCataWeightPro!=NULL)
  332. {
  333. delete []m_pCataWeightPro;
  334. m_pCataWeightPro=NULL;
  335. }
  336. if(m_pCataDocFreq!=NULL)
  337. {
  338. delete []m_pCataDocFreq;
  339. m_pCataDocFreq=NULL;
  340. }
  341. if(m_pCataWordFreq!=NULL)
  342. {
  343. delete []m_pCataWordFreq;
  344. m_pCataWordFreq=NULL;
  345. }
  346. m_nAllocLen=0;
  347. }
  348. void CWordNode::InitBuffer(int nLen)
  349. {
  350. if(nLen<=0) return;
  351. if(m_nAllocLen<=0&&m_pCataWeight==NULL&&m_pCataWeightPro==NULL&&
  352. m_pCataDocFreq==NULL&&m_pCataWordFreq==NULL)
  353. {
  354. m_nAllocLen=nLen;
  355. m_pCataWeight=new double[m_nAllocLen];
  356. memset(m_pCataWeight,0,sizeof(double)*m_nAllocLen);
  357. m_pCataWeightPro=new double[m_nAllocLen];
  358. memset(m_pCataWeightPro,0,sizeof(double)*m_nAllocLen);
  359. m_pCataDocFreq=new long[m_nAllocLen];
  360. memset(m_pCataDocFreq,0,sizeof(long)*m_nAllocLen);
  361. m_pCataWordFreq=new long[m_nAllocLen];
  362. memset(m_pCataWordFreq,0,sizeof(long)*m_nAllocLen);
  363. }
  364. }
  365. void CWordNode::AllocBuffer(int nLen)
  366. {
  367. if(nLen<=0) return;
  368. DeallocBuffer();
  369. m_nAllocLen=nLen;
  370. m_pCataWeight=new double[m_nAllocLen];
  371. m_pCataWeightPro=new double[m_nAllocLen];
  372. m_pCataDocFreq=new long[m_nAllocLen];
  373. m_pCataWordFreq=new long[m_nAllocLen];
  374. }
  375. //用于计算特征的权重,参数sum代表文档集中的文档总数
  376. //如果bMult=true且m_dWeight大于0, 则将特征的反比文档频率乘上m_dWeight原来的值, 再保存到成员变量m_dWeight中
  377. //否则, 将特征的反比文档频率值保存到成员变量m_dWeight中
  378. void CWordNode::ComputeWeight(long sum, bool bMult)
  379. {
  380. long docFreq=GetDocNum();
  381. if(docFreq<=0&&sum<=0)
  382. {
  383. m_dWeight=0.0;
  384. return;
  385. }
  386. double weight=log((double)sum/(double)docFreq);
  387. if(bMult&&m_dWeight>dZero)
  388. m_dWeight*=weight;
  389. else
  390. m_dWeight=weight;
  391. }
  392. long CWordNode::GetCataDocNum(int cataID)
  393. {
  394. return m_pCataDocFreq[cataID];
  395. }
  396. long CWordNode::GetCataWordNum(int cataID)
  397. {
  398. return m_pCataWordFreq[cataID];
  399. }
  400. long CWordNode::GetDocNum()
  401. {
  402. long sum=0;
  403. if(m_nAllocLen>0)
  404. {
  405. for(int i=0;i<m_nAllocLen;i++)
  406. sum+=m_pCataDocFreq[i];
  407. }
  408. else sum=m_lDocFreq;
  409. return sum;
  410. }
  411. long CWordNode::GetWordNum()
  412. {
  413. long sum=0;
  414. if(m_nAllocLen>0)
  415. {
  416. for(int i=0;i<m_nAllocLen;i++)
  417. sum+=m_pCataWordFreq[i];
  418. }
  419. else sum=m_lWordFreq;
  420. return sum;
  421. }
  422. int CWordNode::MaxWeightIndex()
  423. {
  424. int idx=-1;
  425. double nMax=-DBL_MAX;
  426. for(int i=0;i<m_nAllocLen;i++)
  427. {
  428. if(m_pCataWeight[i]>nMax)
  429. {
  430. nMax=m_pCataWeight[i];
  431. idx=i;
  432. }
  433. }
  434. return idx;
  435. }
  436. //此函数暂且只在层次分类中用到,函数名称和其实现的功能看起来有点不同
  437. void CWordNode::Copy(CWordNode &wordNode)
  438. {
  439. m_dWeight=wordNode.m_dWeight;
  440. m_nAllocLen=0;
  441. m_nWordID=wordNode.m_nWordID;
  442. m_lDocFreq=wordNode.m_lDocFreq;
  443. m_pCataWeight=NULL;
  444. m_pCataWeightPro=NULL;
  445. m_pCataDocFreq=NULL;
  446. m_pCataWordFreq=NULL;
  447. m_lDocID=0;
  448. }
  449. //DEL CWordNode& CWordList::GetWordNodeByID(long wordID)
  450. //DEL {
  451. //DEL 
  452. //DEL }
  453. double CWordList::GetWordProByID(POSITION &pos,long wordID,int classnum)
  454. {
  455. CString str;
  456. double pro;
  457. // POSITION pos=GetFirstPosition();
  458. bool bFound=false;
  459. while(pos!=NULL)
  460. {
  461. CWordNode& wordnode=GetNext(pos,str);
  462. if(wordnode.m_nWordID==wordID)
  463. {
  464. bFound=true;
  465. pro = wordnode.m_pCataWeightPro[classnum];
  466. break;
  467. }
  468. }
  469. if(bFound) return pro;
  470. else return 0.0;
  471. }
  472. CWordNode& CWordList::GetWordProByID(POSITION &pos, int j)
  473. {
  474. CString str;
  475. bool bFound=false;
  476. while(pos!=NULL)
  477. {
  478. CWordNode& wordnode=GetNext(pos,str);
  479. if(wordnode.m_nWordID==j)
  480. {
  481. bFound=true;
  482. break;
  483. }
  484. }
  485. if(bFound) return m_lstWordList[str];
  486. }
  487. bool CWordList::GetProFromFile(CString strFileName)
  488. {
  489. FILE *stream;
  490. if( (stream  = fopen( strFileName, "r" )) == NULL )
  491. {
  492. AfxMessageBox("无法打开文件"+strFileName+"!");
  493. return false;
  494. }
  495. int m,n,i;
  496. fscanf(stream,"%d %dn",&m,&n);
  497. POSITION pos;
  498. CString     str;
  499. pos   = GetFirstPosition();
  500. while(pos!=NULL)
  501. {
  502. CWordNode& wordnode=GetNext(pos, str);
  503. wordnode.InitBuffer(n);
  504. fscanf(stream,"%d",&wordnode.m_nWordID);
  505. for(i=0;i<n;i++)
  506. {
  507. fscanf(stream,"%lf",&wordnode.m_pCataWeightPro[i]);
  508. }
  509. }
  510. fclose(stream);
  511. return true;
  512. }