danwenben.m
资源名称:danwenben.rar [点击查看]
上传用户:dongbao
上传日期:2022-08-09
资源大小:1k
文件大小:2k
源码类别:
matlab例程
开发平台:
Matlab
% Read the input words and lowercase them ....................................
% textread with the '%s' format returns a cell array of whitespace-delimited tokens.
text = textread('lpin.txt', '%s');          % tokens of the input text
stopword = textread('stopword.txt', '%s');  % tokens of the stop-word list
% a is a struct array with two fields: word holds words, count holds their
% occurrence counts. a(1).word is the lowercased input; a(1).count stays empty.
a = struct('word', {lower(text)}, 'count', []);
% Remove stop words ...........................................................
% Keep every entry of a(1).word that has no exact match in stopword.
% ismember on cell arrays of character vectors is an exact, case-sensitive
% comparison, i.e. the same test as strcmp(word, stopword{j}) for every j, but
% without the nested loop and without growing the output one element at a time.
% Note: a(1).word is lowercased while stopword is used as read from the file.
isStop = ismember(a(1).word, stopword);
% result is a 1-by-K row cell array, preserving the original word order.
result = reshape(a(1).word(~isStop), 1, []);
% Stem the remaining words ....................................................
% s{n} receives porterStemmer(result{n}); porterStemmer is defined outside this
% script. s starts as [] and becomes a row cell array on the first assignment.
s = [];
nWords = length(result);
for idx = 1:nWords
    stem = porterStemmer(result{idx});
    s{idx} = stem;
end
% Write the preprocessed words to lpout.txt, one word per line .................
% The file is opened once, in 'w' mode: lpout.txt is read back afterwards to
% count the words of this run, so leftovers from a previous run must not be
% appended to.
[fid, errMsg] = fopen('lpout.txt', 'w');
if fid == -1
    error('Cannot open lpout.txt for writing: %s', errMsg);
end
for i = 1:length(s)
    % '\n' terminates each word; the former ' n' wrote a literal "n" that was
    % glued onto the next word when the file was tokenized again.
    fprintf(fid, '%s\n', s{i});
end
fclose(fid);
% Count the words and extract the distinct ones ................................
% a(2).word  : every word read back from lpout.txt
% a(2).count : total occurrences stored at the first occurrence of each word,
%              0 at every later duplicate
% a(3).word  : distinct words in order of first appearance (row cell array)
% a(3).count : occurrence count of each distinct word (row vector)
m = textread('lpout.txt', '%s');
a(2).word = m;
a(2).count = zeros(1, numel(m));
% Always create a(3) so that later code can index it even when no words exist.
a(3).word = {};
a(3).count = [];
if ~isempty(m)
    % 'stable' keeps first-appearance order; firstIdx is the index of the first
    % occurrence of each distinct word, groupIdx maps every word to its group.
    [uniqueWords, firstIdx, groupIdx] = unique(m, 'stable');
    totals = zeros(1, numel(uniqueWords));
    for n = 1:numel(groupIdx)   % single pass instead of comparing all pairs
        totals(groupIdx(n)) = totals(groupIdx(n)) + 1;
    end
    a(2).count(firstIdx) = totals;
    a(3).word = reshape(uniqueWords, 1, []);
    a(3).count = totals;
end
% Store the distinct words in word.txt and their counts in count.txt ............
% Line i of count.txt is the count of the word on line i of word.txt.
% a(3) only exists when at least one word was counted; fall back to empty
% outputs instead of failing on an out-of-range index.
if numel(a) >= 3
    outWords = a(3).word;
    outCounts = a(3).count;
else
    outWords = {};
    outCounts = [];
end

% NOTE(review): append mode ('a') is kept from the original, so re-running the
% script adds to existing files - confirm this is intended.
[fid, errMsg] = fopen('word.txt', 'a');
if fid == -1
    error('Cannot open word.txt for writing: %s', errMsg);
end
for i = 1:length(outWords)
    % '\n' ends the line; the former ' n' emitted a literal "n" and no newline.
    fprintf(fid, '%s\n', outWords{i});
end
fclose(fid);

[fid, errMsg] = fopen('count.txt', 'a');
if fid == -1
    error('Cannot open count.txt for writing: %s', errMsg);
end
for j = 1:length(outCounts)
    fprintf(fid, '%d\n', outCounts(j));
end
fclose(fid);