IndexOneClass.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:16k
- using System;
- using System.Collections.Generic;
- using System.Collections;
- using System.Text;
- using System.IO;
- using System.Text.RegularExpressions;
- using Lucene.Net.Analysis.Standard;
- using Lucene.Net.Documents;
- using Lucene.Net.Index;
- using Lucene.Net.Search;
- using Lucene.Net.QueryParsers;
- using Lucene.Net.Store;
- /*
- ' 迅龙中文分类搜索引擎 v0.6
- '
- ' LGPL 许可发行
- '
- ' 宁夏大学 张冬 康彩 zd4004@163.com
- '
- ' 官网 http://blog.163.com/zd4004/
- */
- //不进行存储 直接索引程序
- namespace XunLong.clsDirectIndex
- {
- /// <summary>
- /// 直接索引类
- /// </summary>
- public static class DirectIndex
- {
/// <summary>
/// Directory where the index is stored.
/// </summary>
private static string IndexPath = string.Empty;

/// <summary>
/// Location of the NFS source files.
/// </summary>
private static string SourcePath = string.Empty;

/// <summary>
/// Path of the stop-word list.
/// </summary>
private static string StopPath = string.Empty;

/// <summary>
/// Location of the templates.
/// </summary>
private static string ModelPath = string.Empty;

/// <summary>
/// Stop flag.
/// </summary>
private static bool StopIt;

/// <summary>
/// Whether the final index write has completed.
/// </summary>
private static bool isEndOK;

/// <summary>
/// Local cache path; the original note reads "use the local cache, read the cache-server data directly".
/// </summary>
private static string OKxWordPath = string.Empty;

/// <summary>
/// Number of templates matched successfully.
/// </summary>
private static int OKNUM;

/// <summary>
/// Number of records indexed.
/// </summary>
private static int m;

/// <summary>
/// Template matcher.
/// </summary>
private static XunLong.ModelUserClassLibrary.ClassUserModel mxWeb =
    new XunLong.ModelUserClassLibrary.ClassUserModel();

/// <summary>
/// Page-data cleanup helper (the original note also carries the number 5000; its meaning is not shown here).
/// </summary>
private static XunLong.HtmlClassLibrary.ClassTXT2IDAT TXT2IDAT =
    new XunLong.HtmlClassLibrary.ClassTXT2IDAT();

/// <summary>
/// HTML cleaner.
/// </summary>
private static XunLong.HtmlClassLibrary.ClassHTML myHTML2CLEAR =
    new XunLong.HtmlClassLibrary.ClassHTML();

/// <summary>
/// Index writer.
/// </summary>
private static IndexWriter writer;

/// <summary>
/// URLs that have already been indexed.
/// </summary>
private static ArrayList oldIndexData = new ArrayList();
/// <summary>
/// Loads the stop-word list from a text file, one entry per line.
/// </summary>
/// <param name="dPath">Path of the stop-word file; it is decoded as gb2312.</param>
/// <returns>
/// The lines read, in file order. When an <see cref="IOException"/> occurs its
/// message is written to the console and the entries read so far are returned.
/// </returns>
public static ArrayList InitCnStopWord(string dPath)
{
    ArrayList CnStopWord = new ArrayList();
    Console.WriteLine("加载停止词 ");
    try
    {
        // The using statement closes the reader exactly once, whether or not reading fails.
        using (StreamReader reader = new StreamReader(dPath, System.Text.Encoding.GetEncoding("gb2312")))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                CnStopWord.Add(line);
            }
        }
    }
    catch (IOException e)
    {
        // Best effort: report the problem and keep whatever was loaded.
        Console.WriteLine(e.Message);
    }
    Console.WriteLine("共加载停止词 " + CnStopWord.Count.ToString() + " 条");
    return CnStopWord;
}
/// <summary>
/// Cleans a fetched page and runs template matching over the cleaned text.
/// </summary>
/// <param name="url">Address of the page; passed to the cleaner and stored on the result.</param>
/// <param name="data">Raw page content handed to the cleaner.</param>
/// <returns>
/// A new record. When the cleaned text is empty it is returned as constructed;
/// otherwise it carries the t, a, b, h, c and isOK values produced by the
/// template matcher together with <paramref name="url"/>.
/// </returns>
private static XunLong.PublicClassLibrary.kcSearch xData(string url,string data)
{
    XunLong.PublicClassLibrary.kcSearch result = new XunLong.PublicClassLibrary.kcSearch();

    // The two "read" timestamps bracket a step that is no longer performed here.
    Console.WriteLine(" 数据读取开始时间==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());
    Console.WriteLine(" 数据读取结束时间==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());

    // Strip markup that is irrelevant to matching.
    string cleaned = myHTML2CLEAR.HTML2CLEAR(data, url);
    Console.WriteLine(" 数据清理结束时间==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());

    // Nothing left after cleaning: hand back the untouched record.
    if (cleaned.Length <= 0)
    {
        return result;
    }

    // Match the cleaned text against the loaded templates.
    XunLong.PublicClassLibrary.kcSearch matched = mxWeb.getTagAndData(cleaned);
    Console.WriteLine(" 模板匹配完成时间==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());

    result.t = matched.t;
    result.a = matched.a;
    result.b = matched.b;
    result.h = matched.h;
    result.url = url;
    result.c = matched.c;
    result.isOK = matched.isOK;

    if (result.isOK == true)
    {
        OKNUM += 1;
        Console.WriteLine(" 成功匹配模版 --> " + OKNUM.ToString() + " 条");
    }
    Console.WriteLine(" 共计匹配模版 --> " + OKNUM.ToString() + " 条");

    return result;
}
/// <summary>
/// Returns the MD5 digest of a URL as an upper-case hexadecimal string.
/// </summary>
/// <param name="url">Text to hash; it is encoded as UTF-8 before hashing.</param>
/// <returns>32 upper-case hexadecimal characters.</returns>
/// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
private static string getMD5name(string url)
{
    // Computed directly with the cryptography classes instead of the obsolete
    // FormsAuthentication helper, so the method no longer needs System.Web.
    byte[] digest;
    using (System.Security.Cryptography.MD5 md5 = System.Security.Cryptography.MD5.Create())
    {
        digest = md5.ComputeHash(Encoding.UTF8.GetBytes(url));
    }

    StringBuilder hex = new StringBuilder(digest.Length * 2);
    foreach (byte part in digest)
    {
        hex.Append(part.ToString("X2", System.Globalization.CultureInfo.InvariantCulture));
    }
    return hex.ToString();
}
/// <summary>
/// Reads the configuration, loads the word cache, stop words and templates,
/// opens the index writer, and records the URLs already present in the index.
/// </summary>
/// <param name="path">Path of the configuration file.</param>
public static void OpenIndex(string path)
{
    OKNUM = 0;

    // Read the configuration.
    XunLong.CongifData.Config.InitConfigData(path);
    string a = XunLong.CongifData.Config.IndexData2;      // index directory
    string b = XunLong.CongifData.Config.SpiderData;      // data directory
    string c = XunLong.CongifData.Config.StopWordData;    // stop-word file
    string d = XunLong.CongifData.Config.ModelData;       // template directory
    string e = XunLong.CongifData.Config.xWordCacheData;  // word-segmentation cache file

    // System.IO is spelled out because Lucene.Net.Store also declares a Directory type.
    bool configOk = System.IO.Directory.Exists(a)
        && System.IO.Directory.Exists(b)
        && System.IO.File.Exists(c)
        && System.IO.Directory.Exists(d)
        && System.IO.File.Exists(e);
    if (!configOk)
    {
        Console.WriteLine("参数配置错误 指定的目录不存在! ");
        Console.WriteLine("任意键退出 ");
        Console.Read();
        return;
    }

    Console.WriteLine("索引生成器开始工作 ");
    Console.WriteLine(" 宁夏大学 张冬 康彩 zd4004@163.com 2006.8.17 ");
    Console.WriteLine(" ");

    IndexPath = a;
    SourcePath = b;
    StopPath = c;
    ModelPath = d;
    OKxWordPath = e;

    Console.WriteLine("索引文件路径: " + a);
    Console.WriteLine("文件系统路径: " + b);
    Console.WriteLine("停止词路径 : " + c);
    Console.WriteLine("模板存放路径: " + d);
    Console.WriteLine("分词缓存路径: " + e);

    Lucene.Net.Analysis.XunLongX.ClassXunLongChinese.initOKxWord(XunLong.CongifData.Config.xWordCacheData, path);
    Console.WriteLine("加载本地缓存完成 ");

    // Load the stop words.
    Lucene.Net.Analysis.XunLongX.ClassXunLongChinese.CnStopWord = InitCnStopWord(StopPath);
    Console.WriteLine("加载停止词 完成 ");

    // Load the templates.
    mxWeb.init(ModelPath);
    Console.WriteLine("加载模版完成 ");

    // An index that already exists is appended to; otherwise a new one is created.
    // Path.Combine replaces the "\segments" literal, which is not a valid escape sequence.
    string segmentsFile = System.IO.Path.Combine(IndexPath, "segments");
    bool createNew = !System.IO.File.Exists(segmentsFile);
    writer = new IndexWriter(IndexPath, new Lucene.Net.Analysis.XunLongX.XunLongAnalyzer(), createNew);
    writer.SetUseCompoundFile(true);
    Console.WriteLine("加载分词模块 ");

    // Rebuild the list of URLs that are already indexed.
    oldIndexData.Clear();
    if (System.IO.File.Exists(segmentsFile))
    {
        try
        {
            IndexSearcher searcher = new IndexSearcher(IndexPath);
            try
            {
                int docCount = searcher.MaxDoc();
                Console.WriteLine(" 去掉已经索引过的数据 ");

                // Every document is visited, including the case of an index holding exactly one.
                for (int i = 0; i < docCount; i++)
                {
                    string indexedUrl = searcher.Doc(i).Get("url");

                    // A document without a "url" field is skipped instead of aborting the scan.
                    if (indexedUrl != null)
                    {
                        oldIndexData.Add(indexedUrl);
                    }
                    if (i % 100 == 0)
                    {
                        Console.WriteLine(" -> " + i);
                    }
                }
            }
            finally
            {
                // Release the searcher even when reading a document fails.
                searcher.Close();
            }
        }
        catch (Exception ex)
        {
            // Best effort: indexing continues with the URLs collected so far.
            Console.WriteLine("读取已索引数据出错: " + ex.Message);
        }
    }
}
/// <summary>
/// Optimizes and closes the index writer, then reports completion on the console.
/// </summary>
/// <remarks>
/// isEndOK is set to true only when both the optimize and the close succeed.
/// Failures are reported on the console and are not rethrown.
/// </remarks>
public static void CloseIndex()
{
    Console.WriteLine("关闭索引写入");
    StopIt = false;
    isEndOK = false;

    if (writer == null)
    {
        // The writer was never opened, so there is nothing to close.
        Console.WriteLine("关闭索引写入出错");
    }
    else
    {
        try
        {
            try
            {
                writer.Optimize();
            }
            finally
            {
                // Close even when Optimize fails so the writer is not left open.
                writer.Close();
            }
            isEndOK = true;
        }
        catch (Exception ex)
        {
            Console.WriteLine("关闭索引写入出错: " + ex.Message);
        }
    }

    Console.WriteLine("索引完成");
    Console.WriteLine("您可以关闭索引器了 谢谢使用!");
    StopIt = false;
}
- /// <summary>
- /// 索引一个数据
- /// </summary>
- /// <param name="url">url</param>
- /// <param name="data">数据</param>
- public static int IndexOneData(string url, string data)
- {
- if (url == null)
- {
- return OKNUM;
- }
- if (oldIndexData.Contains(url) == true)
- {
- return OKNUM;
- }
- else
- {
- oldIndexData.Add(url);
- }
- //遍历数据 进行索引
- try
- {
- Document doc = new Document();
- // Console.WriteLine("--> " + data);
- Console.WriteLine(" 解析开始时间==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());
- XunLong.PublicClassLibrary.kcSearch x = xData(url,data);
- if (x.isOK == false)
- {
- Console.WriteLine("跳过不能匹配模版的数据>>> ");
- m = m + 1;
- Console.WriteLine("索引完成 " + m + "条数据");
- // 不能匹配模版的数据不要
- // 不能匹配的url 写入 文件 下次启动 加载到 oldIndexData
- goto STP; // 不能匹配模版的数据不要
- }
- OKNUM = OKNUM + 1;
- Console.WriteLine(" 解析完成时间==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());
- if (x.a == null)
- {
- x.a = "";
- }
- if (x.b == null)
- {
- x.b = "";
- }
- if (x.t == null)
- {
- x.t = "";
- }
- if (x.h == null)
- {
- x.h = "";
- }
- if (x.s == null)
- {
- x.s = "";
- }
- if (x.url == null)
- {
- x.url = "";
- }
- string x_a = TXT2IDAT.GetOneGoodData(x.a);
- x_a = x_a.Replace("