ClassSearchIT.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:18k
源码类别:

搜索引擎

开发平台:

C#

  1. using System;
  2. using System.Data;
  3. using System.Globalization;
  4. using System.Threading;
  5. using Lucene.Net.Analysis.Standard;
  6. using Lucene.Net.Documents;
  7. using Lucene.Net.QueryParsers;
  8. using Lucene.Net.Search;
  9. using Lucene.Net.Store;
  10. using Lucene.Net.Index;
  11. using System.Collections;
  12. using System.Xml;
  13. /*
  14.       '       迅龙中文分类搜索引擎  v0.6
  15.       '
  16.       '        LGPL  许可发行
  17.       '
  18.       '       宁夏大学  张冬 康彩  zd4004@163.com
  19.       ' 
  20.       '        官网 http://blog.163.com/zd4004/
  21.  */
  22. namespace XunLong.SearchClassLibrary
  23. {
  24.     public  class ClassSearchIT
  25.     {
  26.         //<XL>关键词</XL>
  27.         //<XL主类别>HTML</XL主类别>
  28.         //<属性1>属性值1</属性1><属性2>属性值2</属性2>
  29.             //读取配置
  30.         /// <summary>
  31.         /// 记录临时的分词缓存
  32.         /// </summary>
  33.         Hashtable clearXwordHT = new Hashtable();
  34.         /// <summary>
  35.         /// 分词系统
  36.         /// </summary>
  37.         private static XunLong.xWordNewClient.ClassXwordClientNewIt mySDW = new XunLong.xWordNewClient.ClassXwordClientNewIt();
  38.         /// <summary>
  39.         /// Lucene 分词系统
  40.         /// </summary>
  41.         //Lucene.Net.Analysis.XunLongX.XunLongAnalyzer itStandardAnalyzer = new Lucene.Net.Analysis.XunLongX.XunLongAnalyzer();
  42.           //  StandardAnalyzer()
  43.        // Lucene.Net.Analysis.Standard.StandardAnalyzer itStandardAnalyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer();
  44.         Lucene.Net.Analysis.WhitespaceAnalyzer itStandardAnalyzer = new Lucene.Net.Analysis.WhitespaceAnalyzer();
  45.         /// <summary>
  46.         /// 数据格式化
  47.         /// </summary>
  48.         private XunLong.HtmlClassLibrary.ClassTXT2IDAT dat2dat = new XunLong.HtmlClassLibrary.ClassTXT2IDAT();
  49.         /// <summary>
  50.         /// 搜索集总体
  51.         /// </summary>
  52.          //  private MultiSearcher searcher;
  53.      private IndexSearcher searcher;
  54.         /// <summary>
  55.         /// 索引集合 各个
  56.         /// </summary>
  57.        /// IndexSearcher[] searchers;
  58.         /// <summary>
  59.         ///  索引的细节  key = 主类别   val = 索引的序号  //这样在遇到1个有主类别的时候  可以单独启动某个索引
  60.         /// </summary>
  61.         Hashtable searchers_name;
  62.         /// <summary>
  63.         /// Web 系统配置文件
  64.         /// </summary>
  65.         public string  k_c_path ;
  /// <summary>
  /// Second-stage initializer: loads the configuration file named by
  /// <c>k_c_path</c>, opens the index searcher and starts the word-segmentation
  /// client. <c>k_c_path</c> must be assigned before this method is called.
  /// </summary>
  public void ClassSearchIT_1()
  {
      // Load the configuration first; every value read below comes from it.
      XunLong.CongifData.Config.InitConfigData(this.k_c_path);

      // Open a searcher over the index location given by the configuration.
      this.searcher = new IndexSearcher(XunLong.CongifData.Config.IndexData);

      // Point the shared segmentation client at the configured host/port and start it.
      ClassSearchIT.mySDW.hostName = XunLong.CongifData.Config.xWordCacheServer;
      ClassSearchIT.mySDW.nowPort = XunLong.CongifData.Config.xWordCacheServerPort;
      ClassSearchIT.mySDW.Init_start();
  }
  /// <summary>
  /// Searches the index for the keyword carried in <paramref name="dat"/> and
  /// aggregates the cluster attributes stored in field "b" of the hits
  /// (at most 10000 hits are scanned).
  /// </summary>
  /// <param name="dat">
  /// Request string of the form &lt;XL&gt;keyword&lt;/XL&gt;...; only the
  /// &lt;XL&gt; element is used by this method. The text is lower-cased first.
  /// </param>
  /// <param name="mainType">Main category. Currently not used by this method.</param>
  /// <returns>
  /// Hashtable keyed by attribute name. Each value is an XLSX record
  /// (n = number of occurrences, vs = Hashtable of value -> occurrence count).
  /// Only attributes whose occurrence count is at least the number of scanned
  /// hits are returned.
  /// </returns>
  public Hashtable search_GetRESLIST(string dat, string mainType)
  {
      Console.WriteLine(" 扫描得到属性列表_开始==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());

      dat = dat.ToLower();

      // Pull the keyword out of <xl>keyword</xl>; an absent or malformed tag yields "".
      string keyword = "";
      int openAt = dat.IndexOf("<xl>");
      int closeAt = dat.IndexOf("</xl>");
      if (closeAt > openAt && openAt > -1)
      {
          keyword = dat2dat.stringcode(dat.Substring(openAt + 4, closeAt - openAt - 4));
      }

      keyword = GetXwordData(keyword);
      Query query = QueryParser.Parse(keyword, "a", itStandardAnalyzer);
      Hits hits = searcher.Search(query);

      // The scan is capped at 10000 hits.
      int hitCount = hits.Length();
      if (hitCount > 10000)
      {
          hitCount = 10000;
      }

      // key = attribute name, value = XLSX statistics for that attribute
      Hashtable attributeStats = new Hashtable();
      for (int t = 0; t < hitCount; t++)
      {
          Document doc = hits.Doc(t);

          // Document.Get returns null when the field is missing; such a hit simply
          // contributes no attributes (previously this raised NullReferenceException).
          string clusterTag = doc.Get("b");
          if (clusterTag == null || clusterTag.Length == 0)
          {
              continue;
          }

          AccumulateClusterAttributes(attributeStats, clusterTag.ToLower());
      }

      // Keep only the attributes that occurred at least once per scanned hit.
      // NOTE(review): n is incremented per occurrence, so an attribute repeated inside
      // one hit can reach the threshold without being present in every hit - confirm intent.
      Hashtable commonAttributes = new Hashtable();
      foreach (DictionaryEntry entry in attributeStats)
      {
          XunLong.PublicClassLibrary.XLSX stat = (XunLong.PublicClassLibrary.XLSX)entry.Value;
          if (stat.n >= hitCount)
          {
              commonAttributes.Add(entry.Key, stat);
          }
      }

      Console.WriteLine(" 扫描得到属性列表_结束==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());
      return commonAttributes;
  }

  /// <summary>
  /// Parses the cluster XML fragment of one hit and adds its attributes to
  /// <paramref name="stats"/> (attribute -> XLSX with occurrence counts).
  /// </summary>
  private void AccumulateClusterAttributes(Hashtable stats, string clusterXml)
  {
      XmlDocument xml = new XmlDocument();
      try
      {
          // The fragment has no single root, so wrap it before parsing.
          xml.LoadXml("<kC>" + clusterXml + "</kC>");
      }
      catch (XmlException)
      {
          // Best effort: a hit whose cluster data is not well-formed is skipped.
          return;
      }

      foreach (XmlNode child in xml.DocumentElement.ChildNodes)
      {
          // Stray text or comments between the tags are not attributes.
          if (child.NodeType != XmlNodeType.Element)
          {
              continue;
          }

          string attribute = child.LocalName.Trim();
          string attributeValue = child.InnerText.Trim();

          if (!stats.Contains(attribute))
          {
              XunLong.PublicClassLibrary.XLSX created = new XunLong.PublicClassLibrary.XLSX();
              created.n = 1;
              created.vs = new Hashtable();
              created.vs.Add(attributeValue, 1);
              stats.Add(attribute, created);
              continue;
          }

          XunLong.PublicClassLibrary.XLSX existing = (XunLong.PublicClassLibrary.XLSX)stats[attribute];
          existing.n = existing.n + 1;
          if (existing.vs.Contains(attributeValue))
          {
              existing.vs[attributeValue] = (int)existing.vs[attributeValue] + 1;
          }
          else
          {
              existing.vs.Add(attributeValue, 1);
          }

          // Written back explicitly so the update is kept even if XLSX is a value type.
          stats[attribute] = existing;
      }
  }
  /// <summary>
  /// Runs a keyword search, keeps the hits whose cluster field "b" contains every
  /// requested attribute filter, and returns one page (10 items) of the matches.
  /// </summary>
  /// <param name="dat">
  /// Request string: &lt;XL&gt;keyword&lt;/XL&gt; followed by zero or more
  /// &lt;attribute&gt;value&lt;/attribute&gt; filters. The text is lower-cased first.
  /// </param>
  /// <param name="n">1-based page number; each page holds 10 results.</param>
  /// <returns>
  /// Hashtable with the keys "SYS_ALLNUM" (number of matching hits among the scanned
  /// ones), "SYS_N" (the page number), "SYS_DAT" (the lower-cased request) and
  /// "SYS_LIST" (ArrayList of ShowLISTONE items for the requested page).
  /// </returns>
  public Hashtable search_GetData(string dat, int n)
  {
      Console.WriteLine(" 得到数据_开始==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());

      dat = dat.ToLower();
      string olddat = dat;

      // Pull the keyword out of <xl>keyword</xl>; an absent or malformed tag yields "".
      string keyword = "";
      int openAt = dat.IndexOf("<xl>");
      int closeAt = dat.IndexOf("</xl>");
      if (closeAt > openAt && openAt > -1)
      {
          string rawKeyword = dat.Substring(openAt + 4, closeAt - openAt - 4);
          keyword = dat2dat.stringcode(rawKeyword);

          // Strip the tag using the raw text that is actually present in dat. The
          // previous code searched for the encoded keyword, so whenever the encoder
          // changed the text the tag stayed behind and was treated as a filter.
          dat = dat.Replace("<xl>" + rawKeyword + "</xl>", "");
      }

      // Whatever is left after removing the keyword is the attribute filter list.
      ArrayList filters = ParseAttributeFilters(dat);

      keyword = GetXwordData(keyword);
      Query query = QueryParser.Parse(keyword, "a", itStandardAnalyzer);
      Hits hits = searcher.Search(query);

      // The scan is capped at 10000 hits.
      int hitCount = hits.Length();
      if (hitCount > 10000)
      {
          hitCount = 10000;
      }

      // 1-based rank of the first match that belongs to page n.
      int firstRank = (n - 1) * 10 + 1;
      int matchCount = 0;
      ArrayList pageItems = new ArrayList();

      for (int t = 0; t < hitCount; t++)
      {
          Document doc = hits.Doc(t);

          // Document.Get returns null when the field is missing; treat that as an
          // empty cluster string instead of failing with NullReferenceException.
          string clusterTag = doc.Get("b");
          clusterTag = (clusterTag == null) ? "" : clusterTag.ToLower();

          if (!MatchesAllFilters(clusterTag, filters))
          {
              continue;
          }

          matchCount = matchCount + 1;
          if (matchCount >= firstRank && matchCount < firstRank + 10)
          {
              XunLong.PublicClassLibrary.ShowLISTONE item = new XunLong.PublicClassLibrary.ShowLISTONE();
              item.url = doc.Get("url");
              item.title = doc.Get("t");
              item.data = doc.Get("s");
              pageItems.Add(item);
          }
      }

      Hashtable result = new Hashtable();
      result.Add("SYS_ALLNUM", matchCount);
      result.Add("SYS_N", n);
      result.Add("SYS_DAT", olddat);
      result.Add("SYS_LIST", pageItems);

      Console.WriteLine(" 得到数据_结束==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());
      return result;
  }

  /// <summary>
  /// Turns the attribute part of a request into a de-duplicated list of
  /// "&lt;name&gt;value&lt;/name&gt;" strings to look for in a hit's cluster field.
  /// </summary>
  private ArrayList ParseAttributeFilters(string filterXml)
  {
      ArrayList filters = new ArrayList();
      if (filterXml.Length == 0)
      {
          return filters;
      }

      XmlDocument xml = new XmlDocument();
      try
      {
          // The fragment has no single root, so wrap it before parsing.
          xml.LoadXml("<kC>" + filterXml + "</kC>");
      }
      catch (XmlException)
      {
          // Best effort: a malformed filter list means the search runs unfiltered.
          return filters;
      }

      foreach (XmlNode child in xml.DocumentElement.ChildNodes)
      {
          // Stray text or comments between the tags are not filters.
          if (child.NodeType != XmlNodeType.Element)
          {
              continue;
          }

          string attribute = child.LocalName.Trim();
          string attributeValue = child.InnerText.Trim();

          // Values containing "---" are skipped, as are empty names and values.
          if (attribute.Length == 0 || attributeValue.Length == 0 || attributeValue.IndexOf("---") >= 0)
          {
              continue;
          }

          string filter = "<" + attribute + ">" + attributeValue + "</" + attribute + ">";
          if (!filters.Contains(filter))
          {
              filters.Add(filter);
          }
      }

      return filters;
  }

  /// <summary>
  /// Returns true when <paramref name="clusterTag"/> contains every filter string.
  /// </summary>
  private bool MatchesAllFilters(string clusterTag, ArrayList filters)
  {
      foreach (string filter in filters)
      {
          if (clusterTag.IndexOf(filter) == -1)
          {
              return false;
          }
      }
      return true;
  }
  /// <summary>
  /// Returns the word-segmented form of a query string, using the per-instance
  /// cache when the same string has been segmented before.
  /// </summary>
  /// <param name="dat">Query text to segment.</param>
  /// <returns>
  /// <paramref name="dat"/> itself when it is three characters or shorter, or when
  /// segmentation fails; otherwise the cleaned segmentation result.
  /// </returns>
  private string GetXwordData(string dat)
  {
      // Inputs of three characters or fewer are not segmented.
      if (dat.Length <= 3)
      {
          return dat;
      }

      // Previously segmented input: answer from the cache.
      if (clearXwordHT.Contains(dat))
      {
          return clearXwordHT[dat].ToString();
      }

      string segmented;
      try
      {
          segmented = clearXword(mySDW.GetOneXword(dat));

          try
          {
              // Only results that contain a space are cached.
              if (segmented.IndexOf(' ') > 0)
              {
                  clearXwordHT.Add(dat, segmented);
              }
          }
          catch
          {
              // A failure while caching must not affect the returned result.
          }
      }
      catch
      {
          // Segmentation failed: fall back to the unsegmented text.
          segmented = dat;
      }

      return segmented;
  }
  /// <summary>
  /// Cleans raw segmenter output of the form "word/tag word/tag ..." down to a
  /// space-separated list of the words whose tag is on the keep list.
  /// </summary>
  /// <param name="data">Raw segmenter output.</param>
  /// <returns>
  /// The trimmed input when it contains no '/'; the text before the first '/' when
  /// it contains no space; otherwise the kept words joined by single spaces.
  /// </returns>
  private string clearXword(string data)
  {
      // Shrink runs of three, then two, spaces to a single space.
      data = data.Replace("   ", " ").Replace("  ", " ");

      // No tag separator at all: nothing to strip.
      if (data.IndexOf('/') < 0)
      {
          return data.Trim();
      }

      // A single "word/tag" token: the word is returned whatever its tag is.
      if (data.IndexOf(' ') < 0)
      {
          return data.Split('/')[0].Trim();
      }

      // Kept tags: n noun, nr person name, ns place name, nt organisation,
      // nx foreign characters, nz other proper noun, v verb, j abbreviation,
      // i idiom, m numeral, q measure word, plus l and vn.
      System.Text.StringBuilder kept = new System.Text.StringBuilder();
      foreach (string token in data.Split(' '))
      {
          if (token.IndexOf('/') < 0)
          {
              continue;
          }

          string[] parts = token.Split('/');
          string word = parts[0].Trim();
          if (word.Length == 0)
          {
              continue;
          }

          switch (parts[1])
          {
              case "n":
              case "nr":
              case "ns":
              case "nt":
              case "nx":
              case "nz":
              case "v":
              case "j":
              case "i":
              case "m":
              case "q":
              case "l":
              case "vn":
                  kept.Append(' ').Append(word);
                  break;
          }
      }

      return kept.ToString().Trim();
  }
  424.     }
  425. }
  426. /*
  427.                      doc.Add(new Field("t", x.t, true, true, true));        //标题
  428.                     doc.Add(new Field("a", x_a, true, true, true));        //数据
  429.                    // doc.Add(new Field("b", x.b, true, false, false));    //类聚模板得到
  430.                     doc.Add(new Field("b", x.b, true, true, true ));    //类聚模板得到
  431.                     doc.Add(new Field("c", x.c, true, false, false));  //相关模板得到
  432.                     doc.Add(new Field("h", x.h, true, false, false));  //HTML块数据得到
  433.                     doc.Add(new Field("url", x.url, true, false, false));             //URL
  434.  */