ClassXunLongChinese.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:13k
- using System;
- using System.Collections.Generic;
- using System.Text;
- using System.IO;
- using System.Collections;
- /*
- ' 迅龙中文分类搜索引擎 v0.6
- '
- ' LGPL 许可发行
- '
- ' 宁夏大学 张冬 康彩 zd4004@163.com
- '
- ' 官网 http://blog.163.com/zd4004/
- */
- namespace Lucene.Net.Analysis.XunLongX
- {
/// <summary>
/// Storage structure for one segmentation result: a word, its
/// part-of-speech tag, and its position.
/// </summary>
/// <remarks>Author: Zhang, Dong.</remarks>
public struct XunLongCNST
{
    /// <summary>The word text.</summary>
    public string cWord;

    /// <summary>The part-of-speech tag of the word.</summary>
    public string cType;

    /// <summary>Start position of the word, counted from 0.</summary>
    public int cStart;

    /// <summary>Length of the word.</summary>
    public int cLength;
}
-
- /// <summary>
- /// 迅龙分词接口
- /// </summary>
- public static class ClassXunLongChinese
- {
-
/// <summary>
/// The segmentation client used by this class.
/// </summary>
private static XunLong.xWordNewClient.ClassXwordClientNewIt mXW = new XunLong.xWordNewClient.ClassXwordClientNewIt();

/// <summary>
/// Stop words. <c>ChineseFilterIt</c> filters out any word contained in this list.
/// </summary>
public static ArrayList CnStopWord = new ArrayList();
/// <summary>
/// Optional speed-up: loads the configuration, points the segmentation client
/// at the configured cache server, starts the client, and initialises its
/// local cache.
/// </summary>
/// <param name="okPath">Path passed on to the client's <c>initOKxWord</c> call.</param>
/// <param name="k_c_path">Path of the configuration file to load.</param>
public static void initOKxWord(string okPath, string k_c_path)
{
    // Announce which configuration file is about to be loaded.
    string banner = "--》加载配置文件: " + k_c_path;
    Console.WriteLine(banner);

    // The configuration has to be loaded before the server settings are read.
    XunLong.CongifData.Config.InitConfigData(k_c_path);

    // Server address first, then the port, exactly as configured.
    mXW.hostName = XunLong.CongifData.Config.xWordCacheServer;
    mXW.nowPort = XunLong.CongifData.Config.xWordCacheServerPort;

    // Start the client, then set up its local cache.
    mXW.Init_start();
    mXW.initOKxWord(okPath);
}
/// <summary>
/// Chinese word segmentation: reads all text from the reader, has the
/// segmentation client tag it, and maps every tagged word back to its
/// position in the lower-cased input.
/// </summary>
/// <remarks>
/// The client's output is treated as space-separated <c>word/tag</c> items.
/// A missing or empty tag becomes <c>"n"</c>.
/// </remarks>
/// <param name="_incn">Reader supplying the text; it is read to the end.</param>
/// <returns>
/// <c>null</c> when the text is empty after trimming. A one-element array
/// covering the whole lower-cased input when the tagged text contains no
/// space. Otherwise an array with one slot per space-separated item, filled
/// from the front; slots that were not filled keep their default value
/// (<c>cWord == null</c>).
/// </returns>
public static XunLongCNST[] ChineseIntface(TextReader _incn)
{
    // Largest piece of text handed to the segmentation client in one call.
    const int chunkSize = 160;

    // All offsets reported below refer to this lower-cased copy of the input.
    string aOLD = _incn.ReadToEnd().ToLower();

    string a = aOLD.Trim();
    a = a.Replace("---", " ");
    a = a.Replace("===", " ");
    // NOTE(review): as written these two calls replace a space with a space.
    // Presumably the first argument was once a different whitespace character
    // or a double space - confirm against the upstream file.
    a = a.Replace(" ", " ");
    a = a.Replace(" ", " ");

    if (a.Length == 0)
    {
        return null;
    }

    // Text that is already dense with spaces (at least one piece per five
    // characters) is not sent to the client; its pieces are glued together.
    // NOTE(review): the glued text has no spaces, so it always ends up in the
    // single-token branch below - confirm that this is intended.
    string[] spacePieces = a.IndexOf(' ') > 0 ? a.Split(' ') : null;
    string dat;
    if (spacePieces != null && spacePieces.Length >= a.Length / 5)
    {
        dat = string.Concat(spacePieces);
    }
    else
    {
        dat = SegmentInChunks(a, chunkSize);
    }
    dat = dat.Trim();

    // No space in the tagged text: report the whole input as a single token.
    if (dat.IndexOf(' ') == -1)
    {
        string wholeType = "n";
        if (dat.IndexOf('/') != -1)
        {
            string[] wholeParts = dat.Split('/');
            if (wholeParts[1].Length > 0)
            {
                wholeType = wholeParts[1];
            }
        }

        XunLongCNST[] single = new XunLongCNST[1];
        single[0].cStart = 0;
        single[0].cLength = aOLD.Length;
        single[0].cType = wholeType;
        single[0].cWord = aOLD;
        return single;
    }

    string[] items = dat.Split(' ');
    XunLongCNST[] x = new XunLongCNST[items.Length];
    int filled = 0;      // next free slot in x
    int searchFrom = 0;  // position in aOLD where the next lookup starts

    foreach (string item in items)
    {
        if (item.Length == 0 || searchFrom > aOLD.Length)
        {
            continue;
        }

        string word = item;
        string type = "n";
        if (item.IndexOf('/') != -1)
        {
            string[] parts = item.Split('/');
            word = parts[0];
            // Only a clean "word/tag" pair contributes a tag.
            if (parts.Length == 2 && parts[1].Length > 0)
            {
                type = parts[1];
            }
        }

        // Ordinal search: offsets must not depend on the current culture's
        // linguistic matching rules.
        int found = aOLD.IndexOf(word, searchFrom, StringComparison.Ordinal);
        if (found < 0)
        {
            // Word not present from here on: drop it.
            continue;
        }

        // NOTE(review): the next lookup starts one character after this
        // match, not after the whole word - kept as in the original; confirm.
        searchFrom = found + 1;

        x[filled].cWord = word;
        x[filled].cType = type;
        x[filled].cStart = found;
        x[filled].cLength = word.Length;
        filled = filled + 1;
    }

    return x;
}

/// <summary>
/// Sends the text to the segmentation client in pieces of at most
/// <paramref name="chunkSize"/> characters and joins the results with spaces.
/// </summary>
/// <remarks>
/// Every character is covered, including the last piece when the text length
/// is an exact multiple of the chunk size.
/// </remarks>
private static string SegmentInChunks(string text, int chunkSize)
{
    StringBuilder tagged = new StringBuilder();
    for (int offset = 0; offset < text.Length; offset += chunkSize)
    {
        int length = Math.Min(chunkSize, text.Length - offset);
        string piece = mXW.GetOneXword(text.Substring(offset, length));
        tagged.Append(piece);

        // Separate pieces with a space, but add none after the last one.
        if (offset + length < text.Length)
        {
            tagged.Append(' ');
        }
    }
    return tagged.ToString();
}
-
/// <summary>
/// Older Chinese word segmentation routine: splits the lower-cased text on
/// punctuation and whitespace, has the segmentation client tag each part
/// longer than two characters, and locates every resulting word in the text.
/// </summary>
/// <param name="_incn">Reader supplying the text; it is read to the end.</param>
/// <returns>
/// An array with at least 12048 slots. Slot <c>i</c> belongs to the i-th
/// word; slots whose word could not be located, and all slots after the last
/// word, keep their default value (<c>cWord == null</c>).
/// </returns>
public static XunLongCNST[] ChineseIntfaceOLD(TextReader _incn)
{
    // Result size the original code always returned; kept as a minimum so
    // callers that rely on that length still work.
    const int minimumSlots = 12048;

    string a = _incn.ReadToEnd();
    a = a.ToLower();

    // Separator characters.
    // NOTE(review): the source listed '!', '(', ')', ':', ';', ',' and '?'
    // twice; the second occurrences were presumably full-width forms - confirm
    // against the upstream file. The quote, newline, carriage-return and tab
    // entries are written here as proper escape sequences.
    char[] separators =
    {
        '!', '(', ')', '{', '}', ':', ';', '\'', '"', ',', '.', '?',
        '‘', '“', '。', ' ', '\n', '\r', '\t'
    };

    // Step 1: build the flat word list.
    List<string> words = new List<string>();
    foreach (string part in a.Split(separators))
    {
        string current = part;

        // Parts of one or two characters are not sent to the client.
        if (current.Length > 2)
        {
            string tagged = mXW.GetOneXword(current);
            // A longer answer is taken to mean the client tagged the part.
            if (tagged.Length > current.Length)
            {
                current = tagged;
            }
        }

        if (current.IndexOf(' ') > -1)
        {
            foreach (string piece in current.Split(' '))
            {
                if (piece.Length > 0)
                {
                    words.Add(piece);
                }
            }
        }
        else
        {
            words.Add(current);
        }
    }

    // Step 2: locate every word in the not-yet-consumed rest of the text.
    XunLongCNST[] x = new XunLongCNST[Math.Max(minimumSlots, words.Count)];
    int consumed = 0; // number of characters already cut off the front of a

    for (int i = 0; i < words.Count; i++)
    {
        string word = words[i];
        string type = "n";

        // "word/tag": the tag is whatever follows the last slash.
        if (word.IndexOf('/') > 0)
        {
            int lastSlash = word.LastIndexOf('/');
            type = word.Substring(lastSlash + 1);
            word = word.Substring(0, lastSlash);
        }

        int found = a.IndexOf(word, StringComparison.Ordinal);
        if (found > -1)
        {
            x[i].cWord = word;
            x[i].cType = type;
            x[i].cStart = consumed + found;
            x[i].cLength = word.Length;

            // Drop everything up to the end of this word and remember how
            // much was dropped, so later offsets stay absolute.
            int cut = found + word.Length;
            a = a.Substring(cut);
            consumed = consumed + cut;
        }
    }

    return x;
}
-
/// <summary>
/// Decides whether a segmentation result should be filtered out.
/// </summary>
/// <param name="a">The segmentation result to examine.</param>
/// <returns>
/// <c>true</c> (filter out) when the word is <c>null</c>, when it is contained
/// in <c>CnStopWord</c>, or when its tag contains none of the letters
/// n, v, i, j, l, s. <c>false</c> (keep) when the tag is <c>null</c> or
/// contains at least one of those letters.
/// </returns>
public static bool ChineseFilterIt(XunLongCNST a)
{
    // Unfilled result slots have no word at all.
    if (a.cWord == null)
    {
        return true;
    }

    // Filter out stop words.
    if (CnStopWord.Contains(a.cWord))
    {
        return true;
    }

    // A word without any tag is kept.
    string tag = a.cType;
    if (tag == null)
    {
        return false;
    }

    // Filter by part of speech: keep only tags containing one of these letters.
    bool keep = tag.IndexOf('n') > -1
             || tag.IndexOf('v') > -1
             || tag.IndexOf('i') > -1
             || tag.IndexOf('j') > -1
             || tag.IndexOf('l') > -1
             || tag.IndexOf('s') > -1;
    return !keep;
}
- }
- }
- //按照词性过滤
- /*
- Ag 形语素
- a 形容词
- ad 副形词
- an 名形词
- Bg 区别语素
- b 区别词
- c 连词
- Dg 副语素
- d 副词
- e 叹词
- f 方位词
- g 语素
- h 前接成分
- i 成语
- j 简略语
- k 后接成分
- l 习用语
- Mg 数语素
- m 数词
- Ng 名语素
- n 名词
- nr 人名
- ns 地名
- nt 机构团体
- nx 外文字符
- nz 其它专名
- o 拟声词
- p 介词
- Qg 量语素
- q 量词
- Rg 代语素
- r 代词
- s 处所词
- Tg 时间语素
- t 时间词
- Ug 助语素
- u 助词
- Vg 动语素
- v 动词
- vd 副动词
- vn 名动词
- w 标点符号
- x 非语素字
- Yg 语气语素
- y 语气词
- z 状态词
- *
- *
- *
- *
- */