IndexOneClass.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:16k
源码类别:

搜索引擎

开发平台:

C#

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Collections;
  4. using System.Text;
  5. using System.IO;
  6. using System.Text.RegularExpressions;
  7. using Lucene.Net.Analysis.Standard;
  8. using Lucene.Net.Documents;
  9. using Lucene.Net.Index;
  10. using Lucene.Net.Search;
  11. using Lucene.Net.QueryParsers;
  12. using Lucene.Net.Store;
  13. /*
  14.       '       迅龙中文分类搜索引擎  v0.6
  15.       '
  16.       '        LGPL  许可发行
  17.       '
  18.       '       宁夏大学  张冬 康彩  zd4004@163.com
  19.       ' 
  20.       '        官网 http://blog.163.com/zd4004/
  21.  */
  22. //不进行存储 直接索引程序 
  23. namespace XunLong.clsDirectIndex
  24. {
  25.     /// <summary>
  26.     /// 直接索引类 
  27.     /// </summary>
  28.    public static class DirectIndex
  29.     {
  30.                /// <summary>
  31.         /// Location where the index is stored; assigned in OpenIndex from Config.IndexData2.
  32.         /// </summary>
  33.        private static string IndexPath = "";
  34.         /// <summary>
  35.         /// Location where the NFS source files are stored; assigned in OpenIndex from Config.SpiderData.
  36.         /// </summary>
  37.        private static string SourcePath = "";
  38.         /// <summary>
  39.         /// Path of the stop-word list; assigned in OpenIndex from Config.StopWordData.
  40.         /// </summary>
  41.        private static string StopPath = "";
  42.         /// <summary>
  43.         /// Location where the templates are kept; assigned in OpenIndex from Config.ModelData.
  44.         /// </summary>
  45.        private static string ModelPath = "";
  46.         /// <summary>
  47.         /// Stop flag. In the visible code it is only ever reset to false (in CloseIndex).
  48.         /// </summary>
  49.        private static bool StopIt = false;
  50.         /// <summary>
  51.         /// Whether the final index write finished; CloseIndex sets it to true after Optimize and Close succeed.
  52.         /// </summary>
  53.        private static bool isEndOK = false;
  54.         /// <summary>
  55.         /// Word-segmentation cache path (original note: use the local cache and read the cache-server data directly); assigned in OpenIndex from Config.xWordCacheData.
  56.         /// </summary>
  57.        private static string OKxWordPath = "";
  58.         /// <summary>
  59.         /// Number of successfully matched templates.
  60.         /// </summary>
  61.        private static int OKNUM = 0;
  62.        /// <summary>
  63.        /// Count of indexed records.
  64.        /// </summary>
  65.        private static int m = 0;
  66.         /// <summary>
  67.         /// Template matcher; initialised with ModelPath in OpenIndex.
  68.         /// </summary>
  69.        private static XunLong.ModelUserClassLibrary.ClassUserModel mxWeb = new XunLong.ModelUserClassLibrary.ClassUserModel();
  70.         /// <summary>
  71.         /// Page data clean-up. The original note also carries the number 5000; its meaning is not visible in this file.
  72.         /// </summary>
  73.         private static XunLong.HtmlClassLibrary.ClassTXT2IDAT TXT2IDAT = new XunLong.HtmlClassLibrary.ClassTXT2IDAT();
  74.        private static XunLong.HtmlClassLibrary.ClassHTML myHTML2CLEAR = new XunLong.HtmlClassLibrary.ClassHTML();
  75.         //Index writer; created in OpenIndex and closed in CloseIndex.
  76.        private static IndexWriter writer;
  77.        /// <summary>
  78.        /// URLs that have already been indexed.
  79.        /// </summary>
  80.        private static ArrayList oldIndexData = new ArrayList();
  /// <summary>
  /// Loads the stop-word list from a text file, one entry per line.
  /// </summary>
  /// <remarks>
  /// The file is decoded as gb2312. An <see cref="IOException"/> raised while opening or
  /// reading the file is written to the console and the entries read so far are returned;
  /// any other exception type propagates to the caller.
  /// </remarks>
  /// <param name="dPath">Path of the stop-word file.</param>
  /// <returns>The lines of the file in file order; empty when the file could not be opened.</returns>
  public static ArrayList InitCnStopWord(string dPath)
  {
      ArrayList stopWords = new ArrayList();

      // NOTE(review): the reason for this fixed two-second pause is not visible in this
      // file; it is kept so that start-up timing does not change.
      System.Threading.Thread.Sleep(2000);
      Console.WriteLine("加载停止词  ");

      try
      {
          // "using" disposes the reader on every exit path; it replaces the manual
          // Close() calls, which ran twice whenever the read succeeded.
          using (StreamReader reader = new StreamReader(dPath, System.Text.Encoding.GetEncoding("gb2312")))
          {
              string line;
              while ((line = reader.ReadLine()) != null)
              {
                  stopWords.Add(line);
              }
          }
      }
      catch (IOException e)
      {
          Console.WriteLine(e.Message);
      }

      // The list length is the number of entries loaded, so no separate counter is needed.
      Console.WriteLine("共加载停止词  " + stopWords.Count.ToString() + " 条");
      return stopWords;
  }
  /// <summary>
  /// Cleans a page's HTML and runs template matching on the cleaned text.
  /// </summary>
  /// <remarks>
  /// Progress timestamps are written to the console at each stage. The first two
  /// timestamps bracket no work: nothing is read between them.
  /// </remarks>
  /// <param name="url">URL of the page; passed to the HTML cleaner and stored in the result.</param>
  /// <param name="data">Raw page content to clean and match.</param>
  /// <returns>
  /// A kcSearch carrying the t, a, b, h, c and isOK values produced by the template matcher
  /// plus <paramref name="url"/>; a freshly constructed kcSearch when the cleaned text is empty.
  /// </returns>
  private static XunLong.PublicClassLibrary.kcSearch xData(string url, string data)
  {
      XunLong.PublicClassLibrary.kcSearch result = new XunLong.PublicClassLibrary.kcSearch();

      Console.WriteLine(string.Concat(" 数据读取开始时间==> ", DateTime.Now.ToString(), " ", DateTime.Now.Millisecond.ToString()));
      Console.WriteLine(string.Concat(" 数据读取结束时间==> ", DateTime.Now.ToString(), " ", DateTime.Now.Millisecond.ToString()));

      // Strip the HTML that is irrelevant to matching.
      string cleaned = myHTML2CLEAR.HTML2CLEAR(data, url);
      Console.WriteLine(string.Concat(" 数据清理结束时间==> ", DateTime.Now.ToString(), " ", DateTime.Now.Millisecond.ToString()));

      // Nothing left after cleaning: hand back the untouched result.
      if (cleaned.Length <= 0)
      {
          return result;
      }

      // Match the cleaned text against the loaded templates.
      XunLong.PublicClassLibrary.kcSearch matched = mxWeb.getTagAndData(cleaned);
      Console.WriteLine(string.Concat(" 模板匹配完成时间==> ", DateTime.Now.ToString(), " ", DateTime.Now.Millisecond.ToString()));

      result.t = matched.t;
      result.a = matched.a;
      result.b = matched.b;
      result.h = matched.h;
      result.url = url;
      result.c = matched.c;
      result.isOK = matched.isOK;

      if (result.isOK)
      {
          OKNUM++;
          Console.WriteLine(" 成功匹配模版  -->  " + OKNUM.ToString() + "  条");
      }
      Console.WriteLine(" 共计匹配模版  -->  " + OKNUM.ToString() + "  条");

      return result;
  }
  /// <summary>
  /// Computes the MD5 digest of a URL and returns it as upper-case hexadecimal text.
  /// </summary>
  /// <remarks>
  /// Replaces the obsolete FormsAuthentication.HashPasswordForStoringInConfigFile(url, "md5")
  /// call, which required System.Web. The URL is encoded as UTF-8 and the digest is rendered
  /// as 32 upper-case hex digits, so the returned names are intended to be unchanged.
  /// </remarks>
  /// <param name="url">URL to hash.</param>
  /// <returns>32-character upper-case hexadecimal MD5 digest of <paramref name="url"/>.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
  private static string getMD5name(string url)
  {
      if (url == null)
      {
          throw new ArgumentNullException("url");
      }

      byte[] digest;
      using (System.Security.Cryptography.MD5 md5 = System.Security.Cryptography.MD5.Create())
      {
          digest = md5.ComputeHash(Encoding.UTF8.GetBytes(url));
      }

      StringBuilder hex = new StringBuilder(digest.Length * 2);
      foreach (byte b in digest)
      {
          // "X2" gives two upper-case hex digits per byte.
          hex.Append(b.ToString("X2"));
      }
      return hex.ToString();
  }
  /// <summary>
  /// Reads the configuration, loads the analyzer resources and opens the index for writing.
  /// </summary>
  /// <remarks>
  /// Steps, in order: reset the match counter; load the configuration; verify that the
  /// configured directories and files exist (on failure print an error, wait for console
  /// input and return); load the word-segmentation cache, the stop words and the templates;
  /// open the IndexWriter, appending when a "segments" file already exists in the index
  /// directory and creating a new index otherwise; finally record the URL of every document
  /// already in the index in oldIndexData.
  /// </remarks>
  /// <param name="path">Path of the configuration file.</param>
  public static void OpenIndex(string path)
  {
      OKNUM = 0;

      // Read the configuration.
      XunLong.CongifData.Config.InitConfigData(path);
      string indexDir = XunLong.CongifData.Config.IndexData2;        // index directory
      string sourceDir = XunLong.CongifData.Config.SpiderData;       // data directory
      string stopWordFile = XunLong.CongifData.Config.StopWordData;  // stop-word file
      string modelDir = XunLong.CongifData.Config.ModelData;         // template directory
      string wordCacheFile = XunLong.CongifData.Config.xWordCacheData; // word-segmentation cache file

      bool configOk = System.IO.Directory.Exists(indexDir)
          && System.IO.Directory.Exists(sourceDir)
          && System.IO.File.Exists(stopWordFile)
          && System.IO.Directory.Exists(modelDir)
          && System.IO.File.Exists(wordCacheFile);
      if (!configOk)
      {
          Console.WriteLine("参数配置错误  指定的目录不存在! ");
          Console.WriteLine("任意键退出  ");
          Console.Read();
          return;
      }

      Console.WriteLine("索引生成器开始工作  ");

      Console.WriteLine(" 宁夏大学 张冬 康彩 zd4004@163.com    2006.8.17  ");
      Console.WriteLine(" ");

      IndexPath = indexDir;
      SourcePath = sourceDir;
      StopPath = stopWordFile;
      ModelPath = modelDir;
      OKxWordPath = wordCacheFile;

      Console.WriteLine("索引文件路径: " + indexDir);
      Console.WriteLine("文件系统路径: " + sourceDir);
      Console.WriteLine("停止词路径 : " + stopWordFile);
      Console.WriteLine("模板存放路径: " + modelDir);
      Console.WriteLine("分词缓存路径: " + wordCacheFile);

      Lucene.Net.Analysis.XunLongX.ClassXunLongChinese.initOKxWord(XunLong.CongifData.Config.xWordCacheData, path);
      Console.WriteLine("加载本地缓存完成  ");

      // Load the stop words.
      Lucene.Net.Analysis.XunLongX.ClassXunLongChinese.CnStopWord = InitCnStopWord(StopPath);
      Console.WriteLine("加载停止词 完成  ");

      // Load the template data.
      mxWeb.init(ModelPath);
      Console.WriteLine("加载模版完成  ");

      // Path.Combine replaces the literal "\segments", whose "\s" is not a valid
      // C# escape sequence.
      string segmentsFile = System.IO.Path.Combine(IndexPath, "segments");

      // Append to an existing index; create a new one otherwise.
      bool createNew = !System.IO.File.Exists(segmentsFile);
      writer = new IndexWriter(IndexPath, new Lucene.Net.Analysis.XunLongX.XunLongAnalyzer(), createNew);
      writer.SetUseCompoundFile(true);
      Console.WriteLine("加载分词模块  ");

      OKNUM = 0;
      oldIndexData.Clear();

      // Checked again on purpose: the writer has been opened in between, as in the
      // original sequence.
      if (System.IO.File.Exists(segmentsFile))
      {
          loadIndexedUrls();
      }
  }

  /// <summary>
  /// Adds the "url" field of every document in the existing index to oldIndexData.
  /// </summary>
  /// <remarks>
  /// Failures are written to the console instead of being silently discarded, and the
  /// searcher is closed on every path.
  /// </remarks>
  private static void loadIndexedUrls()
  {
      IndexSearcher searcher = null;
      try
      {
          searcher = new IndexSearcher(IndexPath);
          int docCount = searcher.MaxDoc();
          Console.WriteLine("  去掉已经索引过的数据 ");

          // Every document is visited; the former "count > 1" guard skipped an index
          // that held exactly one document.
          for (int i = 0; i < docCount; i++)
          {
              string indexedUrl = searcher.Doc(i).Get("url");
              if (indexedUrl != null)
              {
                  oldIndexData.Add(indexedUrl);
              }
              if (i % 100 == 0)
              {
                  Console.WriteLine(" -> " + i);
              }
          }
      }
      catch (Exception ex)
      {
          Console.WriteLine(ex.Message);
      }
      finally
      {
          if (searcher != null)
          {
              try
              {
                  searcher.Close();
              }
              catch (Exception ex)
              {
                  Console.WriteLine(ex.Message);
              }
          }
      }
  }
  /// <summary>
  /// Optimizes and closes the index writer.
  /// </summary>
  /// <remarks>
  /// isEndOK is set to true only when both Optimize and Close succeed. Close is attempted
  /// even when Optimize throws, so the writer is not left open after a failed optimize.
  /// Any failure is written to the console and is not rethrown; the completion messages
  /// are printed in either case, as before.
  /// </remarks>
  public static void CloseIndex()
  {
      Console.WriteLine("关闭索引写入");
      StopIt = false;
      isEndOK = false;

      try
      {
          try
          {
              writer.Optimize();
          }
          finally
          {
              writer.Close();
          }
          isEndOK = true;
      }
      catch (Exception ex)
      {
          Console.WriteLine("关闭索引写入出错");
          // Report the cause as well; it was previously discarded.
          Console.WriteLine(ex.Message);
      }

      Console.WriteLine("索引完成");
      Console.WriteLine("您可以关闭索引器了 谢谢使用!");
      StopIt = false;
  }
  308.        /// <summary>
  309.        /// 索引一个数据
  310.        /// </summary>
  311.        /// <param name="url">url</param>
  312.        /// <param name="data">数据</param>
  313.        public static int IndexOneData(string url, string data)
  314.        {
  315.            if (url == null)
  316.            {
  317.                return OKNUM;
  318.            }
  319.            if (oldIndexData.Contains(url) == true)
  320.            {
  321.                return OKNUM;
  322.            }
  323.            else
  324.            {
  325.                oldIndexData.Add(url);
  326.            }
  327.            //遍历数据 进行索引
  328.            try
  329.            {
  330.                Document doc = new Document();
  331.             //   Console.WriteLine("--> " + data);
  332.                Console.WriteLine(" 解析开始时间==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());
  333.                XunLong.PublicClassLibrary.kcSearch x = xData(url,data);
  334.                if (x.isOK == false)
  335.                {
  336.                    Console.WriteLine("跳过不能匹配模版的数据>>> ");
  337.                    m = m + 1;
  338.                    Console.WriteLine("索引完成 " + m + "条数据");
  339.                    //  不能匹配模版的数据不要 
  340.                    //  不能匹配的url 写入 文件  下次启动 加载到 oldIndexData
  341.                    goto STP;  //  不能匹配模版的数据不要 
  342.                }
  343.                OKNUM = OKNUM + 1;
  344.                Console.WriteLine(" 解析完成时间==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());
  345.                if (x.a == null)
  346.                {
  347.                    x.a = "";
  348.                }
  349.                if (x.b == null)
  350.                {
  351.                    x.b = "";
  352.                }
  353.                if (x.t == null)
  354.                {
  355.                    x.t = "";
  356.                }
  357.                if (x.h == null)
  358.                {
  359.                    x.h = "";
  360.                }
  361.                if (x.s == null)
  362.                {
  363.                    x.s = "";
  364.                }
  365.                if (x.url == null)
  366.                {
  367.                    x.url = "";
  368.                }
  369.                string x_a = TXT2IDAT.GetOneGoodData(x.a);
  370.                x_a = x_a.Replace("