ClassIndexBuilder.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:16k
- using System;
- using System.Collections.Generic;
- using System.Collections;
- using System.Text;
- using System.IO;
- using System.Text.RegularExpressions;
- using Lucene.Net.Analysis.Standard;
- using Lucene.Net.Documents;
- using Lucene.Net.Index;
- using Lucene.Net.Search;
- using Lucene.Net.QueryParsers;
- using Lucene.Net.Store;
- /*
- ' 迅龙中文分类搜索引擎 v0.6
- '
- ' LGPL 许可发行
- '
- ' 宁夏大学 张冬 康彩 zd4004@163.com
- '
- ' 官网 http://blog.163.com/zd4004/
- */
- namespace XunLong.IndexBuilder
- {
- /// <summary>
- /// 建立索引
- /// </summary>
- class ClassIndexBuilder
- {
- /// <summary>
- /// Where the index is stored
- /// </summary>
- public string IndexPath = "";
- /// <summary>
- /// Where the NFS source files are stored
- /// </summary>
- public string SourcePath = "";
- /// <summary>
- /// Path of the stop-word list
- /// </summary>
- public string StopPath = "";
- /// <summary>
- /// Location where the templates are stored
- /// </summary>
- public string ModelPath = "";
- /// <summary>
- /// Whether to stop
- /// </summary>
- public bool StopIt = false;
- /// <summary>
- /// Whether the final write of the index has completed
- /// </summary>
- public bool isEndOK = false;
- /// <summary>
- /// Use the local cache; read the cache-server data directly
- /// </summary>
- public string OKxWordPath = "";
- /// <summary>
- /// Number of templates matched successfully
- /// </summary>
- private int OKNUM =0;
- /// <summary>
- /// Template matching class
- /// </summary>
- private XunLong.ModelUserClassLibrary.ClassUserModel mxWeb = new XunLong.ModelUserClassLibrary.ClassUserModel();
- /// <summary>
- /// Page data cleanup; the original note also reads "5000" (meaning not evident from this file)
- /// </summary>
- private static XunLong.HtmlClassLibrary.ClassTXT2IDAT TXT2IDAT = new XunLong.HtmlClassLibrary.ClassTXT2IDAT();
- XunLong.HtmlClassLibrary.ClassHTML myHTML2CLEAR = new XunLong.HtmlClassLibrary.ClassHTML();
- // Index writer
- private IndexWriter writer;
- /// <summary>
- /// Creates an index builder. The constructor itself does no work; the public
- /// path fields are assigned by the caller afterwards.
- /// </summary>
- public ClassIndexBuilder()
- {
- }
- /// <summary>
- /// Initializes the source file system, the local word cache, the stop-word list
- /// and the template matcher, then opens the Lucene index writer on
- /// <see cref="IndexPath"/>. An existing index is opened for appending; otherwise a
- /// new index is created.
- /// </summary>
- /// <param name="k_c_path">
- /// Path forwarded unchanged to <c>ClassFileSystemIt.SetClassNHT</c> and
- /// <c>ClassXunLongChinese.initOKxWord</c>.
- /// </param>
- public void initNFS(string k_c_path)
- {
-     // NOTE(review): the meaning of 3145727 is not evident from this file - confirm against SetClassNHT.
-     ClassFileSystemIt.SetClassNHT(SourcePath, 3145727, k_c_path);
-     // NOTE(review): two-second pause kept from the original; presumably it lets the file system settle - confirm.
-     System.Threading.Thread.Sleep(2000);
-     Lucene.Net.Analysis.XunLongX.ClassXunLongChinese.initOKxWord(XunLong.CongifData.Config.xWordCacheData, k_c_path);
-     Console.WriteLine("加载本地缓存完成 ");
-     // Install the stop-word list used by the Chinese analyzer.
-     Lucene.Net.Analysis.XunLongX.ClassXunLongChinese.CnStopWord = InitCnStopWord(StopPath);
-     Console.WriteLine("加载停止词 完成 ");
-     // Load the template data.
-     mxWeb.init(ModelPath);
-     Console.WriteLine("加载模版完成 ");
-     // An index already exists when its "segments" file is present; in that case append to it.
-     // Path.Combine replaces the original literal "\segments", whose "\s" is not a valid C# escape sequence.
-     bool indexExists = System.IO.File.Exists(System.IO.Path.Combine(IndexPath, "segments"));
-     writer = new IndexWriter(IndexPath, new Lucene.Net.Analysis.XunLongX.XunLongAnalyzer(), !indexExists);
-     writer.SetUseCompoundFile(true);
-     Console.WriteLine("加载分词模块 ");
- }
- /// <summary>
- /// Loads the stop-word list from a text file decoded as GB2312, one entry per line.
- /// </summary>
- /// <param name="dPath">Path of the stop-word file.</param>
- /// <returns>
- /// Every line of the file in file order; duplicates and empty lines are kept.
- /// If an <see cref="IOException"/> occurs, its message is written to the console
- /// and the lines read so far (possibly none) are returned.
- /// </returns>
- public ArrayList InitCnStopWord(string dPath)
- {
-     ArrayList stopWords = new ArrayList();
-     // NOTE(review): two-second pause kept from the original; its purpose is not evident here - confirm before removing.
-     System.Threading.Thread.Sleep(2000);
-     Console.WriteLine("加载停止词 ");
-     try
-     {
-         // The using block closes the reader exactly once, on success or failure.
-         using (StreamReader reader = new StreamReader(dPath, System.Text.Encoding.GetEncoding("gb2312")))
-         {
-             string line;
-             while ((line = reader.ReadLine()) != null)
-             {
-                 stopWords.Add(line);
-             }
-         }
-     }
-     catch (IOException e)
-     {
-         // Best effort: report the problem and continue with whatever was loaded.
-         Console.WriteLine(e.Message);
-     }
-     Console.WriteLine("共加载停止词 "+ stopWords.Count.ToString()+" 条");
-     return stopWords;
- }
- /// <summary>
- /// 开始索引
- /// </summary>
- public void Run()
- {
- OKNUM = 0;
- //得到文件列表
- ArrayList n = ClassFileSystemIt.SearchOneList("http://");
- // Lucene.Net.Store.FSDirectory dir = Lucene.Net.Store.FSDirectory.GetDirectory(IndexPath, false);
- // Lucene.Net.Index.IndexWriter writer = new Lucene.Net.Index.IndexWriter(dir, new Lucene.Net.Analysis.Standard.StandardAnalyzer(), true);
- if (System.IO.File.Exists(IndexPath + "\segments") == true)
- {
- try
- {
- //得到系统内已经索引的数据地址 从 n 中去除
- IndexSearcher searcher = new IndexSearcher(IndexPath);
- int xxx = searcher.MaxDoc();
- Console.WriteLine(" 去掉已经索引过的数据 ");
- if (xxx > 1)
- {
- for (int i = 0; i < xxx; i++)
- {
- string doc = searcher.Doc(i).Get("url").ToString();
- //去掉已经索引过的
- if (n.Contains(doc) == true)
- {
- n.Remove(doc);
- }
- if (i % 100 == 0)
- {
- Console.WriteLine(" -> " + i);
- }
- }
- }
- searcher.Close();
- }
- catch
- {
- }
- }
- /*
- Query query = QueryParser.Parse("", "url", new Lucene.Net.Analysis.XunLong.XunLongAnalyzer());
- Hits hits = searcher.Search(query);
- for (int i = 0; i < hits.Length(); i++)
- {
- string doc = hits.Doc(i).Get("url").ToString();
- //去掉已经索引过的
- if (n.Contains(doc) == true)
- {
- n.Remove(doc);
- }
- }
- searcher.Close();
- */
- int m = 0;
- //遍历数据 进行索引
- foreach (string a in n)
- {
- if (StopIt == true)
- {
- goto XDSTOP;
- }
- try
- {
- Document doc = new Document();
- Console.WriteLine("--> " + a);
- Console.WriteLine(" 解析开始时间==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());
- XunLong.PublicClassLibrary.kcSearch x = xData(a);
- if (x.isOK == false)
- {
- Console.WriteLine("跳过不能匹配模版的数据>>> ");
- m = m + 1;
- Console.WriteLine("索引完成 " + m + "条数据");
- goto STP; // 不能匹配模版的数据不要
- }
- Console.WriteLine(" 解析完成时间==> " + DateTime.Now.ToString() + " " + DateTime.Now.Millisecond.ToString());
- if (x.a == null)
- {
- x.a = "";
- }
- if (x.b == null)
- {
- x.b = "";
- }
- if (x.t == null)
- {
- x.t = "";
- }
- if (x.h == null)
- {
- x.h = "";
- }
- if (x.s == null)
- {
- x.s = "";
- }
- if (x.url == null)
- {
- x.url = "";
- }
- string x_a = TXT2IDAT.GetOneGoodData(x.a);
- x_a = x_a.Replace("