ClassXunLongChinese.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:13k
源码类别:

搜索引擎

开发平台:

C#

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Text;
  4. using System.IO;
  5. using System.Collections;
  6. /*
  7.       '       迅龙中文分类搜索引擎  v0.6
  8.       '
  9.       '        LGPL  许可发行
  10.       '
  11.       '       宁夏大学  张冬 康彩  zd4004@163.com
  12.       ' 
  13.       '        官网 http://blog.163.com/zd4004/
  14.  */
  15. namespace Lucene.Net.Analysis.XunLongX
  16. {
  17.     /// <summary>
  18.     /// <p>
  19.     /// 
  20.     /// @author Zhang, Dong
  21.     ///
  22.     /// </summary>
  23.     /// <summary>
  24.     /// 分词结果存储结构
  25.     /// </summary>
  26.     public  struct XunLongCNST
  27.     {
  28.         /// <summary>
  29.         /// 词
  30.         /// </summary>
  31.         public string cWord;
  32.         /// <summary>
  33.         /// 词性
  34.         /// </summary>
  35.         public string cType;
  36.         
  37.         /// <summary>
  38.         /// 开始位置  0
  39.         /// </summary>
  40.         public int cStart;
  41.         /// <summary>
  42.         /// 长度
  43.         /// </summary>
  44.         public int cLength;
  45.     
  46.     }
  47.     
  48.     /// <summary>
  49.     /// 迅龙分词接口
  50.     /// </summary>
  51.      public static class ClassXunLongChinese
  52.     {
  53.     
  54.          /// <summary>
  55.          /// 设定分词类
  56.          /// </summary>
  57.       //  private static XwordClassLibrary.ClassXWord mXW = new XwordClassLibrary.ClassXWord();
  58.          private static XunLong.xWordNewClient.ClassXwordClientNewIt mXW = new XunLong.xWordNewClient.ClassXwordClientNewIt();
  59.          /// <summary>
  60.          /// 存储停止词
  61.          /// </summary>
  62.          public static ArrayList CnStopWord = new ArrayList();
  63.          /// <summary>
  64.          /// 加速设置  初始化本地缓存  可以不使用
  65.          /// </summary>
  66.          /// <param name="okPath"></param>
  67.          public static void initOKxWord(string okPath, string k_c_path)
  68.          {
  69.              //读取配置
  70.              Console.WriteLine("--》加载配置文件: " + k_c_path);
  71.              XunLong.CongifData.Config.InitConfigData(k_c_path);
  72.              mXW.hostName = XunLong.CongifData.Config.xWordCacheServer;
  73.              mXW.nowPort = XunLong.CongifData.Config.xWordCacheServerPort;
  74.              mXW.Init_start();
  75.              mXW.initOKxWord(okPath);
  76.          }
  77.          /// <summary>
  78.          /// 中文分词
  79.          /// </summary>
  80.          /// <param name="_incn"></param>
  81.          /// <returns></returns>
  82.          public static XunLongCNST[] ChineseIntface(TextReader _incn)
  83.          {
  84.              string aOLD = _incn.ReadToEnd().ToLower();
  85.              string  a = aOLD.Trim();
  86.              a =a.Replace("---"," ");
  87.              a =a.Replace("==="," ");
  88.              a =a.Replace("   "," ");
  89.              a =a.Replace("  "," ");
  90.              if (a == null | a.Length == 0)
  91.              {
  92.                  return null;
  93.              
  94.              }
  95.              string dat = "";
  96.              if (a.IndexOf(' ') > 0)
  97.              {
  98.                  string[] NewAS = a.Split(' ');
  99.                  int nal =a.Length / 5;
  100.                  if (NewAS.Length >= nal)
  101.                  {
  102.                      dat = "";
  103.                      for (int c = 0; c < NewAS.Length; c++)
  104.                      {
  105.                          dat = dat + NewAS[c];
  106.                      }
  107.                      goto X2;
  108.                  }
  109.              }
  110.              int xWN_400 = 160;
  111.              if (a.Length < xWN_400)
  112.              {
  113.                  dat = mXW.GetOneXword(a);
  114.              }
  115.              else
  116.              {
  117.                  //数据以000 为单位切割开 
  118.                  for (int i = xWN_400; i < a.Length; i++)
  119.                  {
  120.                      if (i % xWN_400 == 0)
  121.                      {
  122.                          //取得500个字符
  123.                          string one = a.Substring(i - xWN_400, xWN_400);
  124.                          dat = dat + mXW.GetOneXword(one) + " ";
  125.                          int u00 = 0;
  126.                      }
  127.                      int onen = a.Length - (a.Length % xWN_400);
  128.                      //取得500个字符
  129.                      if (i == onen)
  130.                      {
  131.                          string one = a.Substring(i, (a.Length % xWN_400));
  132.                          dat = dat + mXW.GetOneXword(one);
  133.                          break;
  134.                      }
  135.                  }
  136.              }
  137.              X2:
  138.              dat = dat.Trim();
  139.         
  140.              XunLongCNST[] x = new XunLongCNST[1];
  141.              
  142.              if (dat.IndexOf(' ') == -1)
  143.              {
  144.                  if (dat.IndexOf('/') == -1)
  145.                  {
  146.                      x = new XunLongCNST[1];
  147.                      x[0].cStart = 0;
  148.                      x[0].cLength = aOLD.Length;
  149.                      x[0].cType = "n";
  150.                      x[0].cWord = aOLD;
  151.                      return x;
  152.                  }
  153.                  else
  154.                  {
  155.                      x = new XunLongCNST[1];
  156.                      string[] newtmp = dat.Split('/');
  157.                      x[0].cStart = 0;
  158.                      x[0].cLength = aOLD.Length;
  159.                      if (newtmp[1].Length == 0)
  160.                      {
  161.                          x[0].cType = "n";
  162.                      }
  163.                      else
  164.                      {
  165.                          x[0].cType = newtmp[1];
  166.                      }
  167.                      x[0].cWord = aOLD;
  168.                      return x;
  169.                  }
  170.              }
  171.         
  172.              string[] tmps = dat.Split(' ');
  173.              x = new XunLongCNST[tmps.Length];
  174.              int pX = 0;
  175.              int pNow = 0; //当前的位置
  176.              for (int i = 0; i < tmps.Length; i++)
  177.              {
  178.                  string TOne = tmps[i];
  179.                  if (TOne.Length > 0  & pNow <= aOLD.Length  )
  180.                  {
  181.                      if (TOne.IndexOf('/') == -1)
  182.                      {
  183.                          int nn = aOLD.IndexOf(TOne, pNow);
  184.                          if (nn > -1)
  185.                          {
  186.                              pNow = nn + 1;
  187.                              x[pX].cWord = TOne;
  188.                              x[pX].cType = "n";
  189.                              x[pX].cStart = nn;
  190.                              x[pX].cLength = TOne.Length;
  191.                              pX = pX + 1;
  192.                          }
  193.                          else
  194.                          { }
  195.                      }
  196.                      else
  197.                      {
  198.                          string[] onet = TOne.Split('/');
  199.                          string onet0 = onet[0];
  200.                          int nn = aOLD.IndexOf(onet0, pNow);
  201.                          if (nn > -1)
  202.                          {
  203.                              pNow = nn + 1;
  204.                              x[pX].cWord = onet0;
  205.                              if (onet.Length == 2)
  206.                              {
  207.                                  if (onet[1].Length == 0)
  208.                                  {
  209.                                      x[pX].cType = "n";
  210.                                  }
  211.                                  else
  212.                                  {
  213.                                      x[pX].cType = onet[1];
  214.                                  }
  215.                              }
  216.                              else
  217.                              {
  218.                                  x[pX].cType = "n";
  219.                              }
  220.                              x[pX].cStart = nn;
  221.                              x[pX].cLength = onet0.Length;
  222.                              pX = pX + 1;
  223.                          }
  224.                          else
  225.                          { }
  226.                      }
  227.                  }
  228.              }
  229.                    
  230.              return x;
  231.          }
  232.          
  233.          /// <summary>
  234.          /// 中文分词
  235.          /// </summary>
  236.          /// <param name="_incn"></param>
  237.          /// <returns></returns>
  238.          public static XunLongCNST[] ChineseIntfaceOLD(TextReader _incn)
  239.          {
  240.              string a = _incn.ReadToEnd();
  241.              a = a.ToLower();
  242.              //隔开
  243.              char[] xx = { '!', '(', ')', '{', '}', ':', ';', ''', '"', ',', '.', '?', '!', '(', ')', ':', ';', '‘', '“', ',', '。', '?', ' ', 'n', 'r', 't' };
  244.              string[] xa = a.Split(xx);
  245.              //  1 xa变为 xxaa  的数组
  246.              string[] xxaa = new string[12048];
  247.              int xxaaLen = 0;
  248.              for (int i = 0; i < xa.Length; i++)
  249.              {
  250.                  if (xa[i].Length > 2)  //长度小于2的 字符 不进行分词
  251.                  {
  252.                      string cca = mXW.GetOneXword(xa[i]);
  253.                      if (cca.Length > xa[i].Length)   //正确分词
  254.                      {
  255.                          xa[i] = cca;                 //分词结果替换原来数据
  256.                      }
  257.                  }
  258.                  if (xa[i].IndexOf(' ') > -1)
  259.                  {
  260.                      string[] tmp = xa[i].Split(' ');
  261.                      for (int j = 0; j < tmp.Length; j++)
  262.                      {
  263.                          if (tmp[j] != null & tmp[j].Length > 0)
  264.                          {
  265.                              xxaa[xxaaLen] = tmp[j];
  266.                              xxaaLen = xxaaLen + 1;
  267.                          }
  268.                      }
  269.                  }
  270.                  else
  271.                  {
  272.                      xxaa[xxaaLen] = xa[i];
  273.                      xxaaLen = xxaaLen + 1;
  274.                  }
  275.              }
  276.              int pNow = 0; //当前的位置
  277.              XunLongCNST[] x = new XunLongCNST[12048];
  278.              int pX = 0;
  279.              //   a为原始数据  xxaa 为数据模板  进行匹配
  280.              for (int i = 0; i < xxaa.Length; i++)
  281.              {
  282.                  if (xxaa[i] != null)
  283.                  {
  284.                      //当前数据
  285.                      string tmpOne = xxaa[i];
  286.                      //类型
  287.                      string tmpType = "n";
  288.                      //文本串中位置
  289.                      int tmpStart = 0;
  290.                      //长度
  291.                      int tmpLength = 0;
  292.                      if (tmpOne.IndexOf('/') > 0)
  293.                      {
  294.                          //包含分词 说明  // 分离出类型
  295.                          string[] tmpS = tmpOne.Split('/');
  296.                          int new_tmps = tmpOne.LastIndexOf('/');
  297.                          string new_1 = tmpOne.Substring(0, new_tmps);
  298.                          string new_2 = tmpOne.Substring(new_tmps + 1, tmpOne.Length - new_tmps - 1);
  299.                          tmpOne = new_1;
  300.                          tmpType = new_2;
  301.                          //得到文本
  302.                          //tmpOne = tmpS[0];
  303.                          //得到长度
  304.                          tmpLength = tmpOne.Length;
  305.                          /*
  306.                          if (tmpS.Length > 1)
  307.                          {
  308.                              if (tmpS[1].Length > 0)
  309.                              {
  310.                                  tmpType = tmpS[1];
  311.                              }
  312.                          }
  313.                          */
  314.                      }
  315.                      //检测出文本在原数据中的位置
  316.                      int tmpxx = a.IndexOf(tmpOne);
  317.                      if (tmpxx > -1)
  318.                      {
  319.                          tmpStart = tmpxx + pNow;
  320.                          a = a.Substring(tmpxx + tmpLength, a.Length - tmpLength - tmpxx);
  321.                          pNow = pNow + tmpxx + tmpLength - 1;
  322.                          x[pX].cWord = tmpOne;
  323.                          x[pX].cType = tmpType;
  324.                          x[pX].cStart = tmpStart;
  325.                          x[pX].cLength = tmpLength;
  326.                      }
  327.                      pX = pX + 1;
  328.                  }
  329.              }
  330.              return x;
  331.          }
  332.          
  333.          /// <summary>
  334.          /// 是否应该过滤掉
  335.          /// </summary>
  336.          /// <param name="a"></param>
  337.          /// <returns></returns>
  338.          public static bool ChineseFilterIt(XunLongCNST a)
  339.          {
  340.              if (a.cWord == null)
  341.              {
  342.                  return true;
  343.              }
  344.              if (a.cWord == null & a.cType == null)
  345.              {
  346.                  return true;
  347.              }
  348.              //过滤掉停止词
  349.              if (CnStopWord.Contains(a.cWord) == true)
  350.              {
  351.                  return true;
  352.              }
  353.              if (a.cWord != null & a.cType == null)
  354.              {
  355.                  return false;
  356.              }
  357.              string x = a.cType;
  358.              if (x.IndexOf('n') > -1 | x.IndexOf('v') > -1 | x.IndexOf('i') > -1 | x.IndexOf('j') > -1 | x.IndexOf('l') > -1 | x.IndexOf('s') > -1)
  359.              {
  360.                  return false;
  361.              }
  362.              return true ;
  363.          }
  364.     }
  365. }
  366. //按照词性过滤
  367. /*
  368. Ag 形语素
  369. a 形容词
  370. ad 副形词
  371. an 名形词
  372. Bg 区别语素
  373. b 区别词
  374. c 连词
  375. Dg 副语素
  376. d 副词
  377. e 叹词
  378. f 方位词
  379. g 语素
  380. h 前接成分
  381. i 成语
  382. j 简略语
  383. k 后接成分
  384. l 习用语
  385. Mg 数语素
  386. m 数词
  387. Ng 名语素
  388. n 名词
  389. nr 人名
  390. ns 地名
  391. nt 机构团体
  392. nx 外文字符
  393. nz 其它专名
  394. o 拟声词
  395. p 介词
  396. Qg 量语素
  397. q 量词
  398. Rg 代语素
  399. r 代词
  400. s 处所词
  401. Tg 时间语素
  402. t 时间词
  403. Ug 助语素
  404. u 助词
  405. Vg 动语素
  406. v 动词
  407. vd 副动词
  408. vn 名动词
  409. w 标点符号
  410. x 非语素字
  411. Yg 语气语素
  412. y 语气词
  413. z 状态词
  414.  *
  415.  * 
  416.  * 
  417.  * 
  418. */