ClassSearchIT.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:18k
- using System;
- using System.Data;
- using System.Globalization;
- using System.Threading;
- using Lucene.Net.Analysis.Standard;
- using Lucene.Net.Documents;
- using Lucene.Net.QueryParsers;
- using Lucene.Net.Search;
- using Lucene.Net.Store;
- using Lucene.Net.Index;
- using System.Collections;
- using System.Xml;
- /*
- ' 迅龙中文分类搜索引擎 v0.6
- '
- ' LGPL 许可发行
- '
- ' 宁夏大学 张冬 康彩 zd4004@163.com
- '
- ' 官网 http://blog.163.com/zd4004/
- */
namespace XunLong.SearchClassLibrary
{
    /// <summary>
    /// Keyword search over a Lucene index, with attribute ("cluster") statistics
    /// and attribute-filtered, paged result lists.
    /// </summary>
    /// <remarks>
    /// Requests are passed as a small markup string, which is lower-cased before use:
    /// <c>&lt;XL&gt;keyword&lt;/XL&gt;</c> carries the search keyword, and every other
    /// element (for example <c>&lt;XL主类别&gt;HTML&lt;/XL主类别&gt;</c> or
    /// <c>&lt;attr&gt;value&lt;/attr&gt;</c>) is treated as an attribute filter.
    /// Index fields read by this class: "a" (searched text), "b" (attribute markup),
    /// "url", "t" (title) and "s" (text shown in the result list).
    /// Call <see cref="ClassSearchIT_1"/> after setting <see cref="k_c_path"/> and
    /// before calling any search method.
    /// </remarks>
    public class ClassSearchIT
    {
        /// <summary>Upper bound on the number of hits examined per query.</summary>
        private const int MaxHits = 10000;

        /// <summary>Number of results on one page.</summary>
        private const int PageSize = 10;

        /// <summary>Opening tag of the keyword element (request is already lower-cased).</summary>
        private const string KeywordOpenTag = "<xl>";

        /// <summary>Closing tag of the keyword element.</summary>
        private const string KeywordCloseTag = "</xl>";

        /// <summary>
        /// Cache of word-segmentation results: key = raw keyword, value = cleaned segmentation.
        /// </summary>
        Hashtable clearXwordHT = new Hashtable();

        /// <summary>
        /// Word-segmentation client (shared by all instances).
        /// </summary>
        private static XunLong.xWordNewClient.ClassXwordClientNewIt mySDW = new XunLong.xWordNewClient.ClassXwordClientNewIt();

        /// <summary>
        /// Analyzer handed to the Lucene query parser.
        /// </summary>
        Lucene.Net.Analysis.WhitespaceAnalyzer itStandardAnalyzer = new Lucene.Net.Analysis.WhitespaceAnalyzer();

        /// <summary>
        /// Text encoder applied to the keyword extracted from the request.
        /// </summary>
        private XunLong.HtmlClassLibrary.ClassTXT2IDAT dat2dat = new XunLong.HtmlClassLibrary.ClassTXT2IDAT();

        /// <summary>
        /// Searcher over the configured index; created by <see cref="ClassSearchIT_1"/>.
        /// </summary>
        private IndexSearcher searcher;

        /// <summary>
        /// Intended map of main category to index number (not used by the code in this class).
        /// </summary>
        Hashtable searchers_name;

        /// <summary>
        /// Path of the web system configuration file; must be set before <see cref="ClassSearchIT_1"/>.
        /// </summary>
        public string k_c_path ;

        /// <summary>
        /// Loads the configuration from <see cref="k_c_path"/>, opens the index searcher and
        /// starts the word-segmentation client.
        /// </summary>
        public void ClassSearchIT_1()
        {
            XunLong.CongifData.Config.InitConfigData(k_c_path);
            searcher = new IndexSearcher(XunLong.CongifData.Config.IndexData);

            mySDW.hostName = XunLong.CongifData.Config.xWordCacheServer;
            mySDW.nowPort = XunLong.CongifData.Config.xWordCacheServerPort;
            mySDW.Init_start();
        }

        /// <summary>
        /// Runs the keyword query and collects the attributes whose occurrence count is at
        /// least the number of hits examined.
        /// </summary>
        /// <param name="dat">Request markup containing the <c>&lt;XL&gt;keyword&lt;/XL&gt;</c> element.</param>
        /// <param name="mainType">Main category; accepted for interface compatibility but not used.</param>
        /// <returns>
        /// Hashtable: key = attribute name, value = <c>XLSX</c> whose <c>n</c> is the occurrence
        /// count and whose <c>vs</c> maps each attribute value to its count. Empty when the
        /// request has no usable keyword.
        /// </returns>
        public Hashtable search_GetRESLIST(string dat, string mainType)
        {
            LogTimestamp(" 扫描得到属性列表_开始==> ");

            Hashtable common = new Hashtable();

            string remainder;
            string keyword = ExtractKeyword(NormalizeRequest(dat), out remainder);
            string queryText = GetXwordData(keyword);

            // An empty query string cannot be parsed; report "no common attributes" instead.
            if (queryText.Trim().Length > 0)
            {
                Query query = QueryParser.Parse(queryText, "a", itStandardAnalyzer);
                Hits hits = searcher.Search(query);
                int limit = Math.Min(hits.Length(), MaxHits);

                // key = attribute name, value = XLSX statistics for that attribute
                Hashtable stats = new Hashtable();
                int scanned = 0;

                for (int t = 0; t < limit; t++)
                {
                    Document doc = hits.Doc(t);
                    scanned = scanned + 1; // every examined hit counts, with or without attributes

                    string tags = doc.Get("b");
                    if (tags != null && tags.Length > 0)
                    {
                        AccumulateAttributes(tags.ToLower(CultureInfo.InvariantCulture), stats);
                    }
                }

                foreach (DictionaryEntry entry in stats)
                {
                    XunLong.PublicClassLibrary.XLSX stat = (XunLong.PublicClassLibrary.XLSX)entry.Value;
                    if (stat.n >= scanned)
                    {
                        common.Add(entry.Key, stat);
                    }
                }
            }

            LogTimestamp(" 扫描得到属性列表_结束==> ");
            return common;
        }

        /// <summary>
        /// Runs the keyword query, keeps the hits whose "b" field contains every attribute
        /// filter from the request, and returns one page of them.
        /// </summary>
        /// <param name="dat">
        /// Request markup: <c>&lt;XL&gt;keyword&lt;/XL&gt;</c> followed by zero or more
        /// <c>&lt;attr&gt;value&lt;/attr&gt;</c> filters.
        /// </param>
        /// <param name="n">1-based page number.</param>
        /// <returns>
        /// Hashtable with "SYS_ALLNUM" (number of matching hits), "SYS_N" (the page number),
        /// "SYS_DAT" (the lower-cased request) and "SYS_LIST" (ArrayList of <c>ShowLISTONE</c>
        /// for the requested page).
        /// </returns>
        public Hashtable search_GetData(string dat,int n)
        {
            LogTimestamp(" 得到数据_开始==> ");

            string request = NormalizeRequest(dat);

            string attributeMarkup;
            string keyword = ExtractKeyword(request, out attributeMarkup);
            ArrayList filters = BuildAttributeFilters(attributeMarkup);

            ArrayList page = new ArrayList();
            int total = 0;

            string queryText = GetXwordData(keyword);

            // An empty query string cannot be parsed; return an empty page instead.
            if (queryText.Trim().Length > 0)
            {
                Query query = QueryParser.Parse(queryText, "a", itStandardAnalyzer);
                Hits hits = searcher.Search(query);
                int limit = Math.Min(hits.Length(), MaxHits);

                // 1-based position of the first match that belongs on page n.
                int first = (n - 1) * PageSize + 1;

                for (int t = 0; t < limit; t++)
                {
                    Document doc = hits.Doc(t);

                    string tags = doc.Get("b");
                    tags = (tags == null) ? "" : tags.ToLower(CultureInfo.InvariantCulture);

                    if (!MatchesAllFilters(tags, filters))
                    {
                        continue;
                    }

                    total = total + 1; // all matches are counted, not only those on this page

                    if (total >= first && total < first + PageSize)
                    {
                        XunLong.PublicClassLibrary.ShowLISTONE item = new XunLong.PublicClassLibrary.ShowLISTONE();
                        item.url = doc.Get("url");
                        item.title = doc.Get("t");
                        item.data = doc.Get("s");
                        page.Add(item);
                    }
                }
            }

            Hashtable result = new Hashtable();
            result.Add("SYS_ALLNUM", total);
            result.Add("SYS_N", n);
            result.Add("SYS_DAT", request);
            result.Add("SYS_LIST", page);

            LogTimestamp(" 得到数据_结束==> ");
            return result;
        }

        /// <summary>
        /// Writes a label followed by the current local time and its millisecond part to the console.
        /// </summary>
        private static void LogTimestamp(string label)
        {
            DateTime now = DateTime.Now;
            Console.WriteLine(label + now.ToString() + " " + now.Millisecond.ToString());
        }

        /// <summary>
        /// Lower-cases the request with the invariant culture; a null request becomes "".
        /// </summary>
        private static string NormalizeRequest(string dat)
        {
            return (dat == null) ? "" : dat.ToLower(CultureInfo.InvariantCulture);
        }

        /// <summary>
        /// Pulls the keyword out of the first <c>&lt;xl&gt;…&lt;/xl&gt;</c> element.
        /// </summary>
        /// <param name="dat">Lower-cased request markup.</param>
        /// <param name="remainder">The request with the keyword element removed.</param>
        /// <returns>The encoded keyword, or "" when there is no keyword element.</returns>
        private string ExtractKeyword(string dat, out string remainder)
        {
            remainder = dat;

            int open = dat.IndexOf(KeywordOpenTag);
            int close = dat.IndexOf(KeywordCloseTag);
            if (open < 0 || close <= open)
            {
                return "";
            }

            int contentStart = open + KeywordOpenTag.Length;
            string raw = dat.Substring(contentStart, close - contentStart);
            string keyword = dat2dat.stringcode(raw);
            if (keyword == null)
            {
                keyword = "";
            }

            // Remove the element by position: rebuilding it from the encoded keyword would
            // miss whenever encoding changes the text, leaving <xl> behind as a filter.
            remainder = dat.Remove(open, close + KeywordCloseTag.Length - open);

            // Also drop repeated copies of the element in its encoded form.
            remainder = remainder.Replace(KeywordOpenTag + keyword + KeywordCloseTag, "");

            return keyword;
        }

        /// <summary>
        /// Parses an attribute markup fragment by wrapping it in a <c>kC</c> root element.
        /// </summary>
        /// <returns>The root element, or null when the fragment is not well-formed XML.</returns>
        private static XmlNode ParseFragment(string fragment)
        {
            XmlDocument document = new XmlDocument();
            try
            {
                document.LoadXml("<kC>" + fragment + "</kC>");
            }
            catch (XmlException)
            {
                // Best effort: malformed attribute markup contributes nothing.
                return null;
            }
            return document.DocumentElement;
        }

        /// <summary>
        /// Adds the attributes found in one document's markup to the running statistics.
        /// </summary>
        /// <param name="tags">Lower-cased content of the document's "b" field.</param>
        /// <param name="stats">key = attribute name, value = <c>XLSX</c> statistics; updated in place.</param>
        private static void AccumulateAttributes(string tags, Hashtable stats)
        {
            XmlNode root = ParseFragment(tags);
            if (root == null)
            {
                return;
            }

            foreach (XmlNode node in root.ChildNodes)
            {
                // Stray text between elements is not an attribute.
                if (node.NodeType != XmlNodeType.Element)
                {
                    continue;
                }

                string name = node.LocalName.Trim();
                string value = node.InnerText.Trim();

                if (!stats.Contains(name))
                {
                    XunLong.PublicClassLibrary.XLSX fresh = new XunLong.PublicClassLibrary.XLSX();
                    fresh.n = 1;
                    fresh.vs = new Hashtable();
                    fresh.vs.Add(value, 1);
                    stats.Add(name, fresh);
                    continue;
                }

                XunLong.PublicClassLibrary.XLSX stat = (XunLong.PublicClassLibrary.XLSX)stats[name];
                stat.n = stat.n + 1;

                object seen = stat.vs[value];
                stat.vs[value] = (seen == null) ? 1 : (int)seen + 1;

                // Stored back explicitly so the update is kept even if XLSX is a value type.
                stats[name] = stat;
            }
        }

        /// <summary>
        /// Turns the non-keyword part of the request into a list of distinct
        /// <c>&lt;name&gt;value&lt;/name&gt;</c> strings that a hit's "b" field must contain.
        /// </summary>
        /// <param name="markup">Request markup with the keyword element already removed.</param>
        /// <returns>ArrayList of filter strings; empty when there are none or the markup is malformed.</returns>
        private static ArrayList BuildAttributeFilters(string markup)
        {
            ArrayList filters = new ArrayList();
            if (markup.Length == 0)
            {
                return filters;
            }

            XmlNode root = ParseFragment(markup);
            if (root == null)
            {
                return filters;
            }

            foreach (XmlNode node in root.ChildNodes)
            {
                // Stray text would otherwise become a "<#text>" filter that never matches.
                if (node.NodeType != XmlNodeType.Element)
                {
                    continue;
                }

                string name = node.LocalName.Trim();
                string value = node.InnerText.Trim();

                // Values that are empty or contain "---" are not used as filters.
                if (name.Length == 0 || value.Length == 0 || value.IndexOf("---") >= 0)
                {
                    continue;
                }

                string filter = "<" + name + ">" + value + "</" + name + ">";
                if (!filters.Contains(filter))
                {
                    filters.Add(filter);
                }
            }

            return filters;
        }

        /// <summary>
        /// Returns true when <paramref name="tags"/> contains every filter string.
        /// </summary>
        private static bool MatchesAllFilters(string tags, ArrayList filters)
        {
            foreach (string filter in filters)
            {
                if (tags.IndexOf(filter) == -1)
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Returns the word-segmented form of a keyword, using the cache where possible.
        /// </summary>
        /// <param name="dat">Keyword to segment.</param>
        /// <returns>
        /// The keyword unchanged when it has three characters or fewer or when segmentation
        /// fails; otherwise the cleaned segmentation result.
        /// </returns>
        private string GetXwordData(string dat)
        {
            if (dat == null)
            {
                return "";
            }

            // Keywords of up to three characters are not segmented.
            if (dat.Length <= 3)
            {
                return dat;
            }

            object cached = clearXwordHT[dat];
            if (cached != null)
            {
                return cached.ToString();
            }

            string cleaned;
            try
            {
                string segmented = mySDW.GetOneXword(dat);
                if (segmented == null)
                {
                    return dat;
                }
                cleaned = clearXword(segmented);
            }
            catch (Exception)
            {
                // Best effort: if the segmenter fails, search for the raw keyword.
                return dat;
            }

            // Only multi-word results are cached; the indexer never throws on an existing key.
            if (cleaned.IndexOf(' ') > 0)
            {
                clearXwordHT[dat] = cleaned;
            }
            return cleaned;
        }

        /// <summary>
        /// Strips part-of-speech tags from segmenter output of the form "word/tag word/tag".
        /// </summary>
        /// <param name="data">Raw segmenter output.</param>
        /// <returns>
        /// The trimmed input when it has no '/'; the word before the first '/' when there is
        /// a single token; otherwise the space-joined words whose tag is one of the kept tags.
        /// </returns>
        private string clearXword(string data)
        {
            if (data.IndexOf('/') == -1)
            {
                return data.Trim();
            }

            // A single token is kept whatever its tag.
            if (data.IndexOf(' ') == -1)
            {
                return data.Split('/')[0].Trim();
            }

            // Empty tokens from repeated spaces have no '/' and are skipped below,
            // so the input needs no space collapsing first.
            string kept = "";
            foreach (string token in data.Split(' '))
            {
                if (token.IndexOf('/') == -1)
                {
                    continue;
                }

                string[] parts = token.Split('/');
                string word = parts[0].Trim();
                if (word.Length > 0 && IsKeptPartOfSpeech(parts[1]))
                {
                    kept = kept + " " + word;
                }
            }
            return kept.Trim();
        }

        /// <summary>
        /// Returns true for the part-of-speech tags whose words are kept in the query:
        /// n noun, nr person name, ns place name, nt organisation, nx foreign characters,
        /// nz other proper noun, v verb, j abbreviation, i idiom, m numeral, q quantifier,
        /// plus l and vn.
        /// </summary>
        private static bool IsKeptPartOfSpeech(string tag)
        {
            switch (tag)
            {
                case "n":
                case "nr":
                case "ns":
                case "nt":
                case "nx":
                case "nz":
                case "v":
                case "j":
                case "i":
                case "m":
                case "q":
                case "l":
                case "vn":
                    return true;
                default:
                    return false;
            }
        }
    }
}
- /*
- doc.Add(new Field("t", x.t, true, true, true)); //标题
- doc.Add(new Field("a", x_a, true, true, true)); //数据
- // doc.Add(new Field("b", x.b, true, false, false)); //类聚模板得到
- doc.Add(new Field("b", x.b, true, true, true )); //类聚模板得到
- doc.Add(new Field("c", x.c, true, false, false)); //相关模板得到
- doc.Add(new Field("h", x.h, true, false, false)); //HTML块数据得到
- doc.Add(new Field("url", x.url, true, false, false)); //URL
- */