segChnWord.py
上传用户:l195572280
上传日期:2022-05-24
资源大小:3k
文件大小:14k
源码类别:

多国语言处理

开发平台:

Python

  1. '''
  2. goal: segment Chinese words
  3. author: jiangg_211@126.com
  4. '''
  5. import os
  6. import codecs
  7. import string
  8. import math
  9. from datetime import datetime   
  10. from datetime import date 
  11. import pickle, StringIO # for pickle
  12. class Node:
  13.      def __init__(self):
  14.          self.map={}  
  15.      def contain(self,key):
  16.          return self.map.__contains__(key)
  17.      def __getitem__(self,key):
  18.          return self.map[key]
  19.      def __setitem__(self,key,value):
  20.          self.map[key]=value
  21.          
  22. class Item:
  23.      def __init__(self,key):
  24.          self.key=key
  25.          self.freq=1
  26.          self.prob=-1 #two grammar probability
  27.          self.whetherleaf=0  #1 means from root to this node is a word
  28.          self.insubNum=0
  29.          self.outsubNum=0
  30.          self.insubNode=Node()
  31.          self.outsubNode=Node()
  32.      def addfreq (self):
  33.          self.freq=self.freq+1
  34.      def getfreq(self):
  35.          return self.freq
  36.      def setprobability (self,probability):
  37.          self.prob=probability
  38.      def getprobability (self):
  39.          return self.prob
  40.      def addinsubNode(self,key,item): 
  41.          self.insubNode[key]=item
  42.          self.insubNum=self.insubNum+1
  43.      def addoutsubNode(self,key,item):  
  44.          self.outsubNode[key]=item
  45.          self.outsubNum=self.outsubNum+1
  46. class WordGraph:
  47.      def __init__(self,key):
  48.          self.key=key         
  49.          self.rightsibliNum=0
  50.          self.rightsibliNode=Node()           
  51.          self.rightsibliProb=Node()
  52.          
  53.          self.leftsibliNode=Node()  #the left siblinode        
  54.      def addrightsibliNode(self,key,item):  
  55.          self.rightsibliNode[key]=item         
  56.          self.rightsibliNum+=1
  57.      def addleftsibliNode(self,key,item):  
  58.          self.leftsibliNode[key]=item              
  59.      def addrightsibliProb (self,key,probability):
  60.          self.rightsibliProb[key]=probability
  61. #Create all the two gram probability
  62. def computeProbTrietree(tree):
  63.     if (not tree) or (tree.insubNum==0) :
  64.         return 
  65.     for insubkey,insubNode in tree.insubNode.map.items():
  66.     #smooth by add 1
  67.         if insubNode.outsubNum>0:
  68.             current_freq=insubNode.getfreq()   
  69.             for libkey,libnode in insubNode.outsubNode.map.items():                    
  70.                 try:
  71.                     prob=math.log(1.0*(libnode.getfreq()+smoothingPara)/(current_freq+1))
  72. #                    print "p=",
  73. #                    print prob
  74.                     if prob>0:
  75.                         print "warning prob>o when ",
  76.                         print libkey
  77.                 except:
  78.                     print "Error when",
  79.                     print libkey                    
  80.                 libnode.setprobability(prob)                          
  81. #if "25220 11730 21142" exist in the tree
  82. def IsWord (tree,word):
  83.     if (not tree) or (tree.insubNum==0):
  84.         return ""        
  85.     if tree.insubNode.contain(word):
  86.         current=tree.insubNode[word]
  87.         return current
  88.     return ""
  89. #Compute each relative probability 
  90. def Find2gramprob (tree,leftword,rightword):
  91.     current=IsWord(tree,leftword)
  92.     leftfreq=1
  93.     rightfreq=smoothingPara
  94.     if current:
  95.         if current.outsubNode.contain(rightword):            
  96.             result=current.outsubNode[rightword].getprobability()            
  97.         else:
  98.             #for outkey,outnode in current.outsubNode.map.items():
  99.             #    if outkey.startswith(rightword):
  100.             #        rightfreq=rightfreq+outnode.getfreq()
  101.             result= math.log(1.0*rightfreq/(current.getfreq()+1))
  102.     else:
  103. #        for inkey,innode in tree.insubNode.map.items():
  104. #            if inkey.endswith(leftword):
  105. #                leftfreq=leftfreq+innode.getfreq()
  106. #                for outkey,outnode in innode.outsubNode.map.items():
  107. #                    if outkey.startswith(rightword):
  108. #                        rightfreq=rightfreq+outnode.getfreq()
  109.         result=math.log(1.0*rightfreq/leftfreq)
  110.         #result=0   ###float('-inf')
  111. #        print "left= ",
  112. #        print leftword,
  113. #        print "right= ",
  114. #        print rightword,
  115. #        print "p= ",
  116. #        print result
  117.     return result                       
  118.            
  119. def record_dic_tree(tree):
  120.     if (not tree) or (tree.insubNum==0):
  121.         return
  122.     for key,node in tree.insubNode.map.items(): 
  123.         logFilePrint.write(node.key)
  124.         logFilePrint.write("n")
  125. def create_trie_tree(treeFile):
  126.     trainfile=codecs.open(treeFile,"r","utf-8")
  127.     #trainfile=codecs.open("corpus-training-digit.utf-8.txt","r","utf-8")    
  128.     processed_linenum=0
  129.     #initialize tree
  130.     tree=Item("")  
  131.     print "Building trie tree..., time is: ",   
  132.     print datetime.now()
  133.     for trainline in trainfile:        
  134.         processed_linenum+=1
  135.         #if processed_linenum >2000:
  136.         #    break;
  137.         if processed_linenum %1000==0:
  138.             print processed_linenum,
  139.             print " lines ok!"
  140.         # add sentence begin and end label
  141.         trainline="<s> | "+trainline+" </s>"; 
  142.         #traintokenlist represent a sentence
  143.         traintokenlist=trainline.strip().split("|")        
  144.         prenode=""  #note the last left adjacent word
  145.         for trainword in traintokenlist:            
  146.             trainword=trainword.strip()     #represent total word       
  147.             if tree.insubNode.contain(trainword):
  148.                 tree.insubNode[trainword].addfreq()   
  149.             else:
  150.                 item=Item(trainword)
  151.                 tree.addinsubNode(trainword,item)
  152.             current=tree.insubNode[trainword]
  153.             if prenode:             
  154.                 if prenode.outsubNode.contain(trainword):
  155.                     prenode.outsubNode[trainword].addfreq()
  156.                 else:
  157.                     item=Item(trainword)
  158.                     prenode.addoutsubNode(trainword,item)            
  159.             prenode=current 
  160.     computeProbTrietree(tree)    
  161.     print "Building success! Time is: ",
  162.     return tree
  163. def save_trie_tree(tree, treeFile):
  164.     print datetime.now()
  165.     print "Saving tried tree to file..."
  166.     #pickle the mode
  167.     picklefile=open("segChnTrieTree.txt","wb")
  168.     pickle.dump(tree, picklefile, 0)    #pickle 
  169.     picklefile.close()
  170.     print "Save ok!"
  171.     print "Printing tree..."
  172.     print_trie_tree(tree)
  173.     trainfile.close()
  174. def load_trie_tree(treeFile):
  175.     print "Loading trietree..."
  176.     tree=Item("")
  177.     unpickfile=open(treeFile,"rb")
  178.     unpickfile.seek(0)
  179.     tree = pickle.load(unpickfile)   #reverse pickle 
  180.     print "Loading trietree ok!"
  181.     return tree
  182. def printWordGraph(graph):
  183.     if not graph:      
  184.         return
  185.     else:
  186.         print graph.key
  187.         for key,node in graph.rightsibliNode.map.items():
  188.             printWordGraph(node)
  189.     
  190. def creatWordGraph(sentence):  # prepare for segment, create word graph
  191.     #print "Creat word graph..."
  192.     graph=WordGraph("<s>")
  193.     preNode=graph
  194.     sentlist=sentence.strip().split() #contain all the words in a sentence
  195.     #create 1-gram word graph
  196.     for i in range(1,len(sentlist)): 
  197.         chnCharacter=sentlist[i]
  198.         graphitem=WordGraph(chnCharacter)
  199.         probability=Find2gramprob(GloabalTree,sentlist[i-1],chnCharacter)
  200.         preNode.addrightsibliNode(chnCharacter,graphitem)
  201.         preNode.addrightsibliProb(chnCharacter,probability)
  202.         
  203.         graphitem.addleftsibliNode(preNode.key, preNode)
  204.         preNode=graphitem
  205.     ###printWordGraph(graph)
  206.     #create 2-gram word graph
  207.     preNode=graph   #preNode represent character i in the graph
  208.     for i in range(1,len(sentlist)):
  209.         preNode=preNode.rightsibliNode[sentlist[i]]
  210.         postNode=preNode        
  211.         characters=sentlist[i]
  212.         for j in range(i+1,len(sentlist)-1):
  213.             postNode=postNode.rightsibliNode[sentlist[j]] #preNode represent right character j in the graph
  214.             characters=characters+" "+sentlist[j]
  215.             if IsWord(GloabalTree,characters):  #from i to j is a Chinese word
  216.                 graphitem=WordGraph(characters)                     
  217.                 #refresh left edge           
  218.                 if preNode:
  219.                     for key,node in preNode.leftsibliNode.map.items():  
  220.                         #if not node.rightsibliNode.contain(characters):
  221.                             node.addrightsibliNode(characters, graphitem)
  222.                             probability=Find2gramprob(GloabalTree,key,characters)
  223.                             node.addrightsibliProb(characters,probability)
  224.                         
  225.                             graphitem.addleftsibliNode(key,node)
  226.                     #refresh right edge
  227.                 if postNode:
  228.                     for key,node in postNode.rightsibliNode.map.items():
  229.                         #if not node.leftsibliNode.contain(characters):
  230.                             graphitem.addrightsibliNode(key, node)
  231.                             probability=Find2gramprob(GloabalTree,characters,key)
  232.                             graphitem.addrightsibliProb(key,probability)
  233.                             
  234.                             node.addleftsibliNode(characters,graphitem)      
  235.     #print "Creat word graph ok!"
  236.     #print "Word graph is: "
  237.     ###printWordGraph(graph)
  238.     return (sentlist,graph)                              
  239.             
  240. def vertibifromGraph(graph, sentlist):  # search optimal path
  241.     #print "Searching path using vertibi..."
  242.     if len(sentlist)==0:
  243.         return ""   
  244.     PathDic={}
  245.     ScoreDic={}
  246.     ResultScoreDic={}
  247.     
  248.     PathDic["<s>"]=graph
  249.     ScoreDic["<s>"]=0
  250.     NodeList=[graph]
  251.     
  252.     for i in range(1,len(sentlist)):  
  253.         #find maximum edge for a node
  254.         for node in NodeList:            
  255.             nodeDic=[]
  256.             for pathkey, pathnode in PathDic.items():   
  257.                 if pathnode==node:
  258.                     nodeDic.append(pathkey)
  259.             if len(nodeDic)>1:
  260.                 maxpath=max([(ScoreDic[x],x) for x in nodeDic])[1]
  261.                 for pathkey in nodeDic:
  262.                     if not pathkey==maxpath:
  263.                         ScoreDic.pop(pathkey)
  264.                         PathDic.pop(pathkey)
  265.         #Expand
  266.         newAddNodelist=[]
  267.         delNodelist=[]                           
  268.         for node in NodeList:             
  269.             canexpand=False        
  270.             for rightnodekey,rightsiblinode in node.rightsibliNode.map.items():
  271.                 list_key=rightnodekey.split()
  272.                 if list_key[len(list_key)-1]==sentlist[i]: 
  273.                     canexpand=True
  274.                     break
  275.             if canexpand:  #expand a node in nodelist
  276.                 delNodelist.append(node)                
  277.                 for rightnodekey,rightsiblinode in node.rightsibliNode.map.items():
  278.                     if rightsiblinode not in newAddNodelist: 
  279.                         newAddNodelist.append(rightsiblinode)
  280.                 #record the path                    
  281.                 for pathkey, pathnode in PathDic.items():                    
  282.                     if pathnode==node:                                                         
  283.                         for rightnodekey,rightsiblinode in node.rightsibliNode.map.items():
  284.                             currentpath=pathkey+ " | "+ rightnodekey
  285.                             ScoreDic[currentpath]=ScoreDic[pathkey]+node.rightsibliProb[rightnodekey]
  286.                             PathDic[currentpath]=rightsiblinode
  287.                             if  rightnodekey=="</s>":                          
  288.                                 ResultScoreDic[currentpath]=ScoreDic[currentpath]                                                                                                                                                                                                            
  289.                         ScoreDic.pop(pathkey)
  290.                         PathDic.pop(pathkey) 
  291.         if newAddNodelist:
  292.             for newnode in newAddNodelist:
  293.                 NodeList.append(newnode)  
  294.         if delNodelist:
  295.             for delnode in delNodelist:
  296.                 NodeList.remove(delnode)   
  297.     
  298.     if len(NodeList)>1:
  299.         print "Warning, len of nodelist=",
  300.         print len(NodeList)
  301.     #print "Searching path using vertibit ok!"    
  302.     segResult=max([ (ResultScoreDic[x],x) for x in ResultScoreDic])[1]
  303.     #del <s> and </s> from segResult
  304.     segResult=segResult.replace("<s> | ","")
  305.     segResult=segResult.replace("</s>","")
  306.     return segResult
  307.     
  308. def segChnText(ChnTextFile,resultFile):
  309.    testFile = codecs.open(ChnTextFile,"r","utf-8")
  310.    resultFile = codecs.open(resultFile,"w","utf-8")
  311.    processed_linenum=0
  312.    for line in testFile:
  313.        processed_linenum=processed_linenum+1
  314.        if processed_linenum %30==0:
  315.            print processed_linenum,
  316.            print " lines ok!"
  317.        wordgraph=""
  318.        #print "sentence: ",
  319.        #print line
  320.        line="<s>  "+line+" </s>"; 
  321.        (sentlist,wordgraph)=creatWordGraph(line) #create word graph
  322.        segResult = vertibifromGraph(wordgraph, sentlist)
  323.        resultFile.write(segResult)
  324.        resultFile.write("n")
  325.        resultFile.flush()
  326.        
  327. if __name__=="__main__":   
  328.     global smoothingPara  
  329.     global GloabalTree    
  330.     global logFilePrint
  331.         
  332.     smoothingPara=1.0/200
  333.     logFilePrint = codecs.open("dic_log.txt","w","utf-8")    
  334.     # Training
  335.     GloabalTree=create_trie_tree("corpus-training-digit.utf-8.txt")  
  336.     # GloabalTree=create_trie_tree("mytrain.txt")
  337.     record_dic_tree(GloabalTree)  
  338.     print "Segment begin!"
  339.     
  340.  #   print GloabalTree.insubNode["57002 2999"].outsubNode["63405 60327"].getprobability()
  341.     
  342.     
  343.     #Test input file and result file
  344.     segChnText("corpus-test-digit.utf-8.txt","corpus-test-result-digit.utf-8.txt")
  345.     print "Segment end!"
  346.     
  347.     
  348.         
  349.