segChnWord.py
上传用户:l195572280
上传日期:2022-05-24
资源大小:3k
文件大小:14k
- '''
- goal: segment Chinese words
- author: jiangg_211@126.com
- '''
- import os
- import codecs
- import string
- import math
- from datetime import datetime
- from datetime import date
- import pickle, StringIO # for pickle
class Node:
    """Thin wrapper around a dict, used as a keyed collection of children.

    The underlying dict is exposed as ``map`` because callers iterate it
    directly (``node.map.items()``).
    """

    def __init__(self):
        self.map = {}

    def contain(self, key):
        """Return True if *key* is stored in this node."""
        return key in self.map

    def __contains__(self, key):
        # Without this, ``key in node`` falls back to the legacy
        # __getitem__(0), __getitem__(1), ... protocol and dies with
        # KeyError instead of answering the membership question.
        return key in self.map

    def __getitem__(self, key):
        """Return the value stored under *key*; raises KeyError if absent."""
        return self.map[key]

    def __setitem__(self, key, value):
        """Store *value* under *key*, replacing any previous value."""
        self.map[key] = value
-
class Item:
    """One counted word entry.

    An entry keeps its own occurrence count plus two keyed child
    collections (``insubNode`` / ``outsubNode``) and a running count of
    how many times a child was added to each.
    """

    def __init__(self, key):
        self.key = key
        self.freq = 1          # a new entry counts its first occurrence
        self.prob = -1         # two-gram probability; -1 until setprobability() is called
        self.whetherleaf = 0   # 1 means from root to this node is a word
        self.insubNode = Node()
        self.insubNum = 0
        self.outsubNode = Node()
        self.outsubNum = 0

    def addfreq(self):
        """Count one more occurrence of this entry."""
        self.freq += 1

    def getfreq(self):
        """Return the occurrence count."""
        return self.freq

    def setprobability(self, probability):
        """Store the probability value for this entry."""
        self.prob = probability

    def getprobability(self):
        """Return the stored probability value."""
        return self.prob

    def addinsubNode(self, key, item):
        """Register *item* under *key* in ``insubNode`` and bump ``insubNum``."""
        self.insubNode[key] = item
        self.insubNum += 1

    def addoutsubNode(self, key, item):
        """Register *item* under *key* in ``outsubNode`` and bump ``outsubNum``."""
        self.outsubNode[key] = item
        self.outsubNum += 1
class WordGraph:
    """A vertex of the candidate-word graph built for one sentence.

    Each vertex knows its right neighbours (with one probability per
    outgoing edge) and its left neighbours, all keyed by the neighbour's
    key string.
    """

    def __init__(self, key):
        self.key = key
        self.leftsibliNode = Node()    # the left sibling nodes
        self.rightsibliNode = Node()   # the right sibling nodes
        self.rightsibliProb = Node()   # probability of each right edge
        self.rightsibliNum = 0         # number of addrightsibliNode() calls

    def addrightsibliNode(self, key, item):
        """Attach *item* as a right neighbour under *key*."""
        self.rightsibliNode[key] = item
        self.rightsibliNum = self.rightsibliNum + 1

    def addleftsibliNode(self, key, item):
        """Attach *item* as a left neighbour under *key*."""
        self.leftsibliNode[key] = item

    def addrightsibliProb(self, key, probability):
        """Record the probability of the edge to right neighbour *key*."""
        self.rightsibliProb[key] = probability
- #Create all the two gram probability
def computeProbTrietree(tree):
    """Store a smoothed log-probability on every recorded word pair.

    For each entry ``w`` in ``tree.insubNode`` and each successor ``s`` in
    ``w.outsubNode`` this sets::

        s.prob = log((freq(s) + smoothingPara) / (freq(w) + 1))

    ``smoothingPara`` is read from module globals. Does nothing when
    *tree* is falsy or has no first-level entries.
    """
    if (not tree) or (tree.insubNum == 0):
        return
    for leftnode in tree.insubNode.map.values():
        if leftnode.outsubNum <= 0:
            continue
        denominator = leftnode.getfreq() + 1
        for rightkey, rightnode in leftnode.outsubNode.map.items():
            try:
                prob = math.log(
                    1.0 * (rightnode.getfreq() + smoothingPara) / denominator)
            except (ValueError, ZeroDivisionError):
                # Only arithmetic failures are expected here. Report the
                # entry and leave its probability untouched rather than
                # storing a stale or undefined value.
                print("Error when %s" % (rightkey,))
                continue
            if prob > 0:
                # A log-probability above 0 means the ratio exceeded 1.
                print("warning prob>o when  %s" % (rightkey,))
            rightnode.setprobability(prob)
- #if "25220 11730 21142" exist in the tree
def IsWord(tree, word):
    """Look *word* up among the first-level entries of *tree*.

    Returns the stored entry when *word* is present, otherwise the empty
    string (also for a falsy tree or one with ``insubNum == 0``), so the
    result can be used directly in a truth test.
    """
    if tree and tree.insubNum != 0 and tree.insubNode.contain(word):
        return tree.insubNode[word]
    return ""
- #Compute each relative probability
def Find2gramprob(tree, leftword, rightword):
    """Return the log-probability of *rightword* directly following *leftword*.

    Three cases:
      * the pair is recorded under *leftword*: its stored probability;
      * *leftword* is known but the pair is not:
        ``log(smoothingPara / (freq(leftword) + 1))``;
      * *leftword* is unknown: ``log(smoothingPara / 1)``.

    ``smoothingPara`` is read from module globals.
    """
    left_entry = IsWord(tree, leftword)
    unseen_freq = smoothingPara  # pseudo-count used for an unseen right word
    if not left_entry:
        unknown_left_freq = 1
        return math.log(1.0 * unseen_freq / unknown_left_freq)
    successors = left_entry.outsubNode
    if successors.contain(rightword):
        return successors[rightword].getprobability()
    return math.log(1.0 * unseen_freq / (left_entry.getfreq() + 1))
-
def record_dic_tree(tree):
    """Write the key of every first-level entry of *tree*, one per line.

    Output goes to the module-level ``logFilePrint`` file object. Does
    nothing when *tree* is falsy or has no first-level entries.
    """
    if (not tree) or (tree.insubNum == 0):
        return
    for node in tree.insubNode.map.values():
        logFilePrint.write(node.key)
        # Terminate each entry with a real newline; the previous literal
        # "n" glued all keys together on one line.
        logFilePrint.write("\n")
def create_trie_tree(treeFile):
    """Build the word / word-pair frequency table from a training file.

    *treeFile* is read as UTF-8. Every line is wrapped as
    ``"<s> | " + line + " </s>"`` and split on ``"|"``; each stripped
    token is counted as a first-level entry of the returned root, and
    each token is also counted under the ``outsubNode`` of the token
    preceding it on the same line. After reading,
    ``computeProbTrietree`` fills in the pair probabilities.

    Returns the root ``Item``.
    """
    # The context manager closes the training file even if a line fails.
    with codecs.open(treeFile, "r", "utf-8") as trainfile:
        tree = Item("")
        print("Building trie tree..., time is:  %s" % datetime.now())
        for processed_linenum, trainline in enumerate(trainfile, 1):
            if processed_linenum % 1000 == 0:
                print("%s  lines ok!" % processed_linenum)
            # add sentence begin and end label
            trainline = "<s> | " + trainline + " </s>"
            prenode = ""  # entry of the word to the left; empty at line start
            for trainword in trainline.strip().split("|"):
                trainword = trainword.strip()
                if tree.insubNode.contain(trainword):
                    tree.insubNode[trainword].addfreq()
                else:
                    tree.addinsubNode(trainword, Item(trainword))
                current = tree.insubNode[trainword]
                if prenode:
                    # Count trainword as a successor of the previous word.
                    if prenode.outsubNode.contain(trainword):
                        prenode.outsubNode[trainword].addfreq()
                    else:
                        prenode.addoutsubNode(trainword, Item(trainword))
                prenode = current
    computeProbTrietree(tree)
    # The message announces a time, so print it on the same line.
    print("Building success! Time is:  %s" % datetime.now())
    return tree
def save_trie_tree(tree, treeFile):
    """Pickle *tree* (protocol 0) to *treeFile* and then print the tree.

    An empty *treeFile* falls back to the historical default name
    ``segChnTrieTree.txt``.
    """
    print(datetime.now())
    print("Saving tried tree to file...")
    # Honour the caller's path; it used to be ignored in favour of a
    # hard-coded file name.
    with open(treeFile or "segChnTrieTree.txt", "wb") as picklefile:
        pickle.dump(tree, picklefile, 0)
    print("Save ok!")
    print("Printing tree...")
    # NOTE(review): print_trie_tree is not defined in the visible part of
    # this file -- confirm it exists, otherwise this call raises NameError.
    print_trie_tree(tree)
    # The former trailing ``trainfile.close()`` was removed: no such name
    # exists in this scope, so it could only raise NameError.
def load_trie_tree(treeFile):
    """Load and return the object pickled in *treeFile*.

    SECURITY: unpickling can execute arbitrary code, so only load files
    this program wrote itself.
    """
    print("Loading trietree...")
    # The context manager closes the handle even if unpickling fails.
    with open(treeFile, "rb") as unpickfile:
        tree = pickle.load(unpickfile)
    print("Loading trietree ok!")
    return tree
def printWordGraph(graph):
    """Print the key of *graph*, then recurse into each right neighbour.

    A vertex reachable along several paths is printed once per path.
    A falsy *graph* prints nothing.
    """
    if graph:
        print(graph.key)
        for successor in graph.rightsibliNode.map.values():
            printWordGraph(successor)
-
def creatWordGraph(sentence): # prepare for segment, create word graph
    """Build the candidate-word graph for one whitespace-separated sentence.

    The graph root is a WordGraph keyed "<s>". Token 0 of the split
    sentence is never turned into a vertex of its own, so the caller is
    expected to pass a sentence that starts with "<s>" (segChnText does).
    Edge probabilities come from Find2gramprob against the module-level
    GloabalTree.

    Returns the tuple (sentlist, graph): the token list and the root vertex.
    """
    graph=WordGraph("<s>")
    preNode=graph
    sentlist=sentence.strip().split() #contain all the words in a sentence
    # Pass 1: chain the single tokens 1..end together, left to right.
    for i in range(1,len(sentlist)):
        chnCharacter=sentlist[i]
        graphitem=WordGraph(chnCharacter)
        probability=Find2gramprob(GloabalTree,sentlist[i-1],chnCharacter)
        preNode.addrightsibliNode(chnCharacter,graphitem)
        preNode.addrightsibliProb(chnCharacter,probability)

        graphitem.addleftsibliNode(preNode.key, preNode)
        preNode=graphitem
    # Pass 2: for every start position i, try every longer span i..j and add
    # a vertex for each span that IsWord finds in GloabalTree.
    preNode=graph #preNode represent character i in the graph
    for i in range(1,len(sentlist)):
        # Walk the single-token chain to the vertex of token i.
        preNode=preNode.rightsibliNode[sentlist[i]]
        postNode=preNode
        characters=sentlist[i]
        # The upper bound len-1 keeps the final token (the "</s>" marker in
        # segChnText's input) out of any multi-token span.
        for j in range(i+1,len(sentlist)-1):
            postNode=postNode.rightsibliNode[sentlist[j]] #postNode represent right character j in the graph
            characters=characters+" "+sentlist[j]
            if IsWord(GloabalTree,characters): #from i to j is a Chinese word
                graphitem=WordGraph(characters)
                # Left edges: everything that leads into token i also leads
                # into the new span vertex. This includes span vertices added
                # earlier, because the right-edge step below registers them
                # as left neighbours of the following token.
                if preNode:
                    for key,node in preNode.leftsibliNode.map.items():
                        node.addrightsibliNode(characters, graphitem)
                        probability=Find2gramprob(GloabalTree,key,characters)
                        node.addrightsibliProb(characters,probability)

                        graphitem.addleftsibliNode(key,node)
                # Right edges: the span vertex continues to whatever currently
                # follows token j.
                if postNode:
                    for key,node in postNode.rightsibliNode.map.items():
                        graphitem.addrightsibliNode(key, node)
                        probability=Find2gramprob(GloabalTree,characters,key)
                        graphitem.addrightsibliProb(key,probability)

                        node.addleftsibliNode(characters,graphitem)
    return (sentlist,graph)
-
def vertibifromGraph(graph, sentlist): # search optimal path
    """Return the best-scoring segmentation found in the word graph.

    *graph* is the "<s>" root vertex and *sentlist* the token list, both
    as returned by creatWordGraph. Paths are kept as strings of vertex
    keys joined by " | ", and a path's score is the sum of the edge
    probabilities along it.

    Returns the highest-scoring path that reached "</s>", with the
    leading "<s> | " and the text "</s>" removed (the separator in front
    of "</s>" stays). Returns "" when *sentlist* is empty or when no path
    reached "</s>".
    """
    if len(sentlist) == 0:
        return ""
    PathDic = {"<s>": graph}    # path string -> vertex the path ends at
    ScoreDic = {"<s>": 0}       # path string -> accumulated score
    ResultScoreDic = {}         # scores of paths that reached "</s>"
    NodeList = [graph]          # vertices waiting to be expanded

    for i in range(1, len(sentlist)):
        # Prune: of all paths ending at the same vertex keep only the best.
        for node in NodeList:
            nodeDic = [pathkey for pathkey, pathnode in PathDic.items()
                       if pathnode == node]
            if len(nodeDic) > 1:
                maxpath = max([(ScoreDic[x], x) for x in nodeDic])[1]
                for pathkey in nodeDic:
                    if not pathkey == maxpath:
                        ScoreDic.pop(pathkey)
                        PathDic.pop(pathkey)
        # Expand every waiting vertex that has a right neighbour whose last
        # token is the current token.
        newAddNodelist = []
        delNodelist = []
        for node in NodeList:
            canexpand = False
            for rightnodekey in node.rightsibliNode.map:
                list_key = rightnodekey.split()
                if list_key[len(list_key) - 1] == sentlist[i]:
                    canexpand = True
                    break
            if not canexpand:
                continue
            delNodelist.append(node)
            for rightsiblinode in node.rightsibliNode.map.values():
                if rightsiblinode not in newAddNodelist:
                    newAddNodelist.append(rightsiblinode)
            # Extend each path ending at this vertex by every outgoing edge.
            # Iterate over a snapshot: PathDic is modified inside the loop.
            for pathkey, pathnode in list(PathDic.items()):
                if pathnode == node:
                    for rightnodekey, rightsiblinode in node.rightsibliNode.map.items():
                        currentpath = pathkey + " | " + rightnodekey
                        ScoreDic[currentpath] = ScoreDic[pathkey] + node.rightsibliProb[rightnodekey]
                        PathDic[currentpath] = rightsiblinode
                        if rightnodekey == "</s>":
                            ResultScoreDic[currentpath] = ScoreDic[currentpath]
                    ScoreDic.pop(pathkey)
                    PathDic.pop(pathkey)
        for newnode in newAddNodelist:
            NodeList.append(newnode)
        for delnode in delNodelist:
            NodeList.remove(delnode)

    if len(NodeList) > 1:
        print("Warning, len of nodelist= %s" % len(NodeList))
    if not ResultScoreDic:
        # No path reached "</s>"; max() on an empty list would raise.
        return ""
    segResult = max([(ResultScoreDic[x], x) for x in ResultScoreDic])[1]
    #del <s> and </s> from segResult
    segResult = segResult.replace("<s> | ", "")
    segResult = segResult.replace("</s>", "")
    return segResult
-
def segChnText(ChnTextFile, resultFile):
    """Segment every line of *ChnTextFile* and write the results.

    Both files are UTF-8. Each input line is wrapped in "<s>" / "</s>"
    markers, turned into a word graph by creatWordGraph and decoded by
    vertibifromGraph; the decoded string is written to *resultFile*
    followed by a newline. Progress is printed every 30 lines.
    """
    # Context managers close both files even if segmentation fails midway.
    with codecs.open(ChnTextFile, "r", "utf-8") as testFile:
        with codecs.open(resultFile, "w", "utf-8") as outFile:
            for processed_linenum, line in enumerate(testFile, 1):
                if processed_linenum % 30 == 0:
                    print("%s  lines ok!" % processed_linenum)
                line = "<s> " + line + " </s>"
                (sentlist, wordgraph) = creatWordGraph(line)  # create word graph
                segResult = vertibifromGraph(wordgraph, sentlist)
                outFile.write(segResult)
                # One result per line; the previous literal "n" ran all
                # results together.
                outFile.write("\n")
                # Flush per line so partial results are visible during a run.
                outFile.flush()
-
if __name__ == "__main__":
    # Script entry point: train on the corpus, dump the dictionary, then
    # segment the test file. smoothingPara, GloabalTree and logFilePrint
    # are module-level names read by the functions above; ``global``
    # statements are unnecessary at module level.
    smoothingPara = 1.0 / 200
    logFilePrint = codecs.open("dic_log.txt", "w", "utf-8")
    try:
        # Training
        GloabalTree = create_trie_tree("corpus-training-digit.utf-8.txt")
        record_dic_tree(GloabalTree)
    finally:
        # The dictionary log is complete here; close it so it reaches disk.
        logFilePrint.close()
    print("Segment begin!")

    # Test input file and result file
    segChnText("corpus-test-digit.utf-8.txt", "corpus-test-result-digit.utf-8.txt")
    print("Segment end!")
-
-
-
-