ClassRUN.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:4k
- using System;
- using System.Collections.Generic;
- using System.Text;
- using System.IO;
- using System.Text.RegularExpressions;
- using Lucene.Net.Analysis.Standard;
- using Lucene.Net.Documents;
- using Lucene.Net.Index;
- /*
- ' 迅龙中文分类搜索引擎 v0.6
- '
- ' LGPL 许可发行
- '
- ' 宁夏大学 张冬 康彩 zd4004@163.com
- '
- ' 官网 http://blog.163.com/zd4004/
- */
- namespace XunLong.IndexBuilder
- {
- public class ClassRUN
- {
- // Index writer that receives every document built by this class.
- private IndexWriter writer;
- // Full path of the root directory passed to AddDirection; used to derive each file's relative path.
- private string docRootDirectory;
- // Search pattern handed to DirectoryInfo.GetFiles when scanning directories.
- private string pattern;
/// <summary>
/// Creates the index writer used by this builder.
/// The writer is constructed with a <c>StandardAnalyzer</c> and with the
/// <c>create</c> flag set to <c>true</c> (per the Lucene API this is expected to
/// create a new index, replacing any index already present at that location),
/// and the compound-file option is then switched on.
/// </summary>
/// <param name="directory">Path, as a string, of the directory that will hold the index.</param>
public ClassRUN(string directory)
{
    StandardAnalyzer analyzer = new StandardAnalyzer();
    this.writer = new IndexWriter(directory, analyzer, true);
    this.writer.SetUseCompoundFile(true);
}
/// <summary>
/// Adds every file matching <paramref name="pattern"/> found in
/// <paramref name="directory"/> and all of its sub-directories to the index.
/// The directory's full path is remembered as the root against which each
/// file's relative path is computed.
/// </summary>
/// <param name="directory">Root directory to scan.</param>
/// <param name="pattern">Search pattern passed to <c>DirectoryInfo.GetFiles</c>, e.g. <c>*.htm</c>.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="directory"/> or <paramref name="pattern"/> is <c>null</c>.
/// </exception>
public void AddDirection(DirectoryInfo directory, string pattern)
{
    // Validate before touching any field so a bad call leaves the previous
    // pattern/root untouched instead of failing half-way through.
    if (directory == null)
    {
        throw new ArgumentNullException("directory");
    }
    if (pattern == null)
    {
        throw new ArgumentNullException("pattern");
    }

    this.pattern = pattern;
    this.docRootDirectory = directory.FullName;
    AddSubDirectory(directory);
}
/// <summary>
/// Indexes the matching files directly inside <paramref name="directory"/>,
/// then recurses into each of its sub-directories.
/// </summary>
/// <param name="directory">Directory to process.</param>
private void AddSubDirectory(DirectoryInfo directory)
{
    // Files of this directory first: each one becomes a document in the index.
    FileInfo[] matchingFiles = directory.GetFiles(pattern);
    for (int fileIndex = 0; fileIndex < matchingFiles.Length; fileIndex++)
    {
        AddHtmlToDocument(matchingFiles[fileIndex].FullName);
    }

    // Then descend into every child directory until the whole tree is covered.
    DirectoryInfo[] children = directory.GetDirectories();
    for (int childIndex = 0; childIndex < children.Length; childIndex++)
    {
        AddSubDirectory(children[childIndex]);
    }
}
/// <summary>
/// Reads the HTML file at <paramref name="path"/>, builds a document with the
/// fields <c>text</c> (tag-stripped body), <c>path</c> (path relative to the
/// root directory) and <c>title</c>, and adds it to the index writer.
/// </summary>
/// <param name="path">Full path of the file to index; expected to lie under the root directory set by AddDirection.</param>
private void AddHtmlToDocument(string path)
{
    string html;
    // Encoding.Default is platform-dependent; it is kept so files are decoded as before.
    using (StreamReader sr = new StreamReader(path, System.Text.Encoding.Default))
    {
        html = sr.ReadToEnd();
    }

    // The root may or may not end with a separator (a drive root such as "C:\" does);
    // skip exactly one separator between the root and the relative part.
    string root = this.docRootDirectory;
    bool rootEndsWithSeparator = root.Length > 0
        && (root[root.Length - 1] == Path.DirectorySeparatorChar
            || root[root.Length - 1] == Path.AltDirectorySeparatorChar);
    int relativePathStartsAt = rootEndsWithSeparator ? root.Length : root.Length + 1;
    string relativePath = path.Substring(relativePathStartsAt);

    Document doc = new Document();
    doc.Add(Field.UnStored("text", ParseHtml(html)));
    doc.Add(Field.Keyword("path", relativePath));
    doc.Add(Field.Text("title", GetTitle(html)));
    writer.AddDocument(doc);
}
/// <summary>
/// Strips all HTML tags from the given text and replaces non-breaking spaces
/// (the <c>&amp;nbsp;</c> entity as well as the literal U+00A0 character)
/// with ordinary spaces.
/// </summary>
/// <param name="html">Raw HTML text.</param>
/// <returns>The text with tags removed and non-breaking spaces normalised.</returns>
private string ParseHtml(string html)
{
    // Anything between '<' and the next '>' is treated as a tag and dropped.
    string withoutTags = Regex.Replace(html, "<[^>]*>", "");
    return withoutTags.Replace("&nbsp;", " ").Replace('\u00A0', ' ');
}
/// <summary>
/// Extracts the contents of the first <c>&lt;title&gt;</c> element of an HTML document.
/// </summary>
/// <param name="html">Raw HTML text.</param>
/// <returns>
/// The trimmed title text, or a fixed placeholder string when the document
/// has no title element or the title is blank.
/// </returns>
private string GetTitle(string html)
{
    // IgnoreCase: tags may be written <TITLE>; Singleline: a title may span lines;
    // lazy (.*?) stops at the first closing tag instead of the last one.
    Match m = Regex.Match(
        html,
        "<title\\b[^>]*>(.*?)</title>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Groups.Count is fixed by the pattern and is 2 even for a failed match,
    // so Success is the only reliable indicator that a title was found.
    if (m.Success)
    {
        string title = m.Groups[1].Value.Trim();
        if (title.Length > 0)
        {
            return title;
        }
    }
    return "此文挡标题未知";
}
/// <summary>
/// Optimizes the index and closes the underlying index writer.
/// The writer is closed even if optimization throws, so it is not left open.
/// </summary>
public void Close()
{
    try
    {
        writer.Optimize();
    }
    finally
    {
        writer.Close();
    }
}
- }
- }