ClassIndexBuilder.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:16k
源码类别:

搜索引擎

开发平台:

C#

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Collections;
  4. using System.Text;
  5. using System.IO;
  6. using System.Text.RegularExpressions;
  7. using Lucene.Net.Analysis.Standard;
  8. using Lucene.Net.Documents;
  9. using Lucene.Net.Index;
  10. using Lucene.Net.Search;
  11. using Lucene.Net.QueryParsers;
  12. using Lucene.Net.Store;
  13. /*
  14.       '       迅龙中文分类搜索引擎  v0.6
  15.       '
  16.       '        LGPL  许可发行
  17.       '
  18.       '       宁夏大学  张冬 康彩  zd4004@163.com
  19.       ' 
  20.       '        官网 http://blog.163.com/zd4004/
  21.  */
  22. namespace XunLong.IndexBuilder
  23. {
  24.     /// <summary>
  25.     /// 建立索引
  26.     /// </summary>
  27.     class ClassIndexBuilder
  28.     {
  29.         /// <summary>
  30.         /// 索引存储的地方
  31.         /// </summary>
  32.         public string IndexPath = "";
  33.         /// <summary>
  34.         /// NFS源文件存储的地方
  35.         /// </summary>
  36.         public string SourcePath = "";
  37.         /// <summary>
  38.         /// 停止词表路径
  39.         /// </summary>
  40.         public string StopPath = "";
  41.         /// <summary>
  42.         /// 模板存放地址
  43.         /// </summary>
  44.         public string ModelPath = "";
  45.         /// <summary>
  46.         /// 是否停止
  47.         /// </summary>
  48.         public bool StopIt = false;
  49.         /// <summary>
  50.         /// 是否最后写入索引完成
  51.         /// </summary>
  52.         public bool isEndOK = false;
  53.         /// <summary>
  54.         /// 使用本地缓存  直接读取缓存服务器数据
  55.         /// </summary>
  56.         public string OKxWordPath = "";
  57.         /// <summary>
  58.         /// 匹配成功的模版数目
  59.         /// </summary>
  60.         private int OKNUM =0;
  61.         /// <summary>
  62.         /// 模板匹配类
  63.         /// </summary>
  64.         private XunLong.ModelUserClassLibrary.ClassUserModel mxWeb = new XunLong.ModelUserClassLibrary.ClassUserModel();
  65.         /// <summary>
  66.         /// 页面数据清理 5000
  67.         /// </summary>
  68.         private static XunLong.HtmlClassLibrary.ClassTXT2IDAT TXT2IDAT = new XunLong.HtmlClassLibrary.ClassTXT2IDAT();
  69.         XunLong.HtmlClassLibrary.ClassHTML myHTML2CLEAR = new XunLong.HtmlClassLibrary.ClassHTML();
  70.         //索引写入器
  71.         private IndexWriter writer;
  72.         /// <summary>
  73.         ///
  74.         /// </summary>
  75.         public ClassIndexBuilder()
  76.         {
  77.          
  78.         
  79.         }
  80.         /// <summary>
  81.         /// 初始化文件系统
  82.         /// </summary>
  83.         /// <param name="NFSBOOT"></param>
  84.         public void initNFS(string k_c_path)
  85.         {
  86.             // .SetClassNHT
  87.             ClassFileSystemIt.SetClassNHT(SourcePath, 3145727, k_c_path);
  88.            // Lucene.Net.Store.FSLockConfig.LockDirectory = IndexPath;// SourcePath;
  89.            // Lucene.Net.Store.
  90.               //  writer = new IndexWriter(IndexPath, new StandardAnalyzer(), true);
  91.             System.Threading.Thread.Sleep(2000);
  92.             Lucene.Net.Analysis.XunLongX.ClassXunLongChinese.initOKxWord(XunLong.CongifData.Config.xWordCacheData, k_c_path);
  93.        
  94.              Console.WriteLine("加载本地缓存完成  ");
  95.                    
  96.             
  97.            
  98.             // 添加停止词
  99.             Lucene.Net.Analysis.XunLongX.ClassXunLongChinese.CnStopWord = InitCnStopWord(StopPath);
  100.              Console.WriteLine("加载停止词 完成  ");
  101.            
  102.                 // 使用本地缓存  直接读取缓存服务器数据   
  103.             //加载模板数据
  104.             mxWeb.init(ModelPath);
  105.             Console.WriteLine("加载模版完成  ");
  106.             //已经存在的话为增加
  107.             if (System.IO.File.Exists(IndexPath + "\segments") == true)
  108.             {
  109.                 writer = new IndexWriter(IndexPath, new Lucene.Net.Analysis.XunLongX.XunLongAnalyzer(), false);
  110.             }
  111.             else
  112.             {
  113.                 writer = new IndexWriter(IndexPath, new Lucene.Net.Analysis.XunLongX.XunLongAnalyzer(), true);
  114.             }
  115.             writer.SetUseCompoundFile(true);
  116.             Console.WriteLine("加载分词模块  ");
  117.         }
  118.         /// <summary>
  119.         /// 初始化停止词
  120.         /// </summary>
  121.         /// <param name="dPath">停止词路径</param>
  122.         public ArrayList InitCnStopWord(string dPath)
  123.         {
  124.             ArrayList CnStopWord = new ArrayList();
  125.             CnStopWord.Clear();
  126.             int tmp = 0;
  127.             System.Threading.Thread.Sleep(2000);
  128.             Console.WriteLine("加载停止词  ");
  129.           //  StreamWriter wr = new StreamWriter(dPath+"bak",false, System.Text.Encoding.GetEncoding("gb2312"));
  130.             //
  131.             StreamReader reader = null;
  132.             string data = string.Empty;
  133.             try
  134.             {
  135.                 reader = new StreamReader(dPath, System.Text.Encoding.GetEncoding("gb2312"));
  136.                 for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
  137.                 {
  138.                    // string xStr = line.Replace(""", "");
  139.                     try
  140.                     {
  141.                         CnStopWord.Add(line);
  142.                         tmp = tmp + 1;
  143.                     }
  144.                     catch
  145.                     { 
  146.                     
  147.                     }
  148.                     /*
  149.                     if (CnStopWord.Contains(line ) == false)
  150.                     {
  151.                         //wr.WriteLine(xStr);
  152.                         if (line.Length  < 4)
  153.                         {
  154.                             CnStopWord.Add(line);
  155.                         }
  156.                             tmp = tmp + 1;
  157.                         if (tmp % 500 == 1)
  158.                         {
  159.                             Console.Write(">");
  160.                         }
  161.                     
  162.                     }
  163.                     */
  164.                 }
  165.                 reader.Close();
  166.             }
  167.             catch (IOException e)
  168.             {
  169.                 Console.WriteLine(e.Message);
  170.             }
  171.             finally
  172.             {
  173.                 if (reader != null)
  174.                     reader.Close();
  175.             }
  176.           //  wr.Close();
  177.             Console.WriteLine("共加载停止词  "+ tmp.ToString()+" 条");
  178.             return CnStopWord;
  179.         }
  180.         /// <summary>
  181.         /// 开始索引
  182.         /// </summary>
  183.         public void Run()
  184.         {
  185.             OKNUM = 0;
  186.             //得到文件列表
  187.             ArrayList n = ClassFileSystemIt.SearchOneList("http://");
  188.           //  Lucene.Net.Store.FSDirectory dir = Lucene.Net.Store.FSDirectory.GetDirectory(IndexPath, false);
  189.           //  Lucene.Net.Index.IndexWriter writer = new Lucene.Net.Index.IndexWriter(dir, new Lucene.Net.Analysis.Standard.StandardAnalyzer(), true); 
  190.             if (System.IO.File.Exists(IndexPath + "\segments") == true)
  191.             {
  192.                 try
  193.                 {
  194.                     //得到系统内已经索引的数据地址  从 n 中去除
  195.                     IndexSearcher searcher = new IndexSearcher(IndexPath);
  196.                     int xxx = searcher.MaxDoc();
  197.                     Console.WriteLine("  去掉已经索引过的数据 ");
  198.                     if (xxx > 1)
  199.                     {
  200.                         for (int i = 0; i < xxx; i++)
  201.                         {
  202.                             string doc = searcher.Doc(i).Get("url").ToString();
  203.                             //去掉已经索引过的
  204.                             if (n.Contains(doc) == true)
  205.                             {
  206.                                 n.Remove(doc);
  207.                             }
  208.                             if (i % 100 == 0)
  209.                             {
  210.                                 Console.WriteLine(" -> " + i);
  211.                             }
  212.                         }
  213.                     }
  214.                     searcher.Close();
  215.                 }
  216.                 catch
  217.                 {
  218.                 }
  219.             }
  220.             /*
  221.             Query query = QueryParser.Parse("", "url", new Lucene.Net.Analysis.XunLong.XunLongAnalyzer());
  222.             Hits hits = searcher.Search(query);
  223.             for (int i = 0; i < hits.Length(); i++)
  224.             {
  225.                 string doc = hits.Doc(i).Get("url").ToString();
  226.                 //去掉已经索引过的
  227.                 if (n.Contains(doc) == true)
  228.                 {
  229.                     n.Remove(doc);
  230.                 }
  231.             }
  232.             searcher.Close();
  233.             */
  234.             int m = 0;
  235.             //遍历数据 进行索引
  236.             foreach (string a in n)
  237.             {
  238.                 if (StopIt == true)
  239.                 {
  240.                     goto XDSTOP;
  241.                 }
  242.                 try
  243.                 {
  244.                     Document doc = new Document();
  245.                     Console.WriteLine("--> " + a);
  246.                     Console.WriteLine(" 解析开始时间==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());
  247.                     XunLong.PublicClassLibrary.kcSearch x = xData(a);
  248.                     if (x.isOK == false)
  249.                     {
  250.                         Console.WriteLine("跳过不能匹配模版的数据>>> ");
  251.                         m = m + 1;
  252.                         Console.WriteLine("索引完成 " + m + "条数据");
  253.                         goto STP;  //  不能匹配模版的数据不要 
  254.                     }
  255.                     Console.WriteLine(" 解析完成时间==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());
  256.                     if (x.a == null)
  257.                     {
  258.                         x.a = "";
  259.                     }
  260.                     if (x.b == null)
  261.                     {
  262.                         x.b = "";
  263.                     }
  264.                     if (x.t == null)
  265.                     {
  266.                         x.t = "";
  267.                     }
  268.                     if (x.h == null)
  269.                     {
  270.                         x.h = "";
  271.                     }
  272.                     if (x.s == null)
  273.                     {
  274.                         x.s = "";
  275.                     }
  276.                     if (x.url == null)
  277.                     {
  278.                         x.url = "";
  279.                     }
  280.                     string x_a = TXT2IDAT.GetOneGoodData(x.a);
  281.                     x_a = x_a.Replace("