ClassMagModelBuilder.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:14k
- using System;
- using System.IO;
- using System.Collections.Generic;
- using System.Text;
- using System.Collections;
- using System.Threading;
- using System.Net;
- /*
- ' 迅龙中文分类搜索引擎 v0.6
- '
- ' LGPL 许可发行
- '
- ' 宁夏大学 张冬 康彩 zd4004@163.com
- '
- ' 官网 http://blog.163.com/zd4004/
- */
- namespace XunLong.ModelBuilder
- {
- /// <summary>
- /// 控制 模板生成器
- /// </summary>
- public class ClassMagModelBuilder
- {
- /// <summary>
- /// HTML 处理类
- /// </summary>
- // XunLong.HtmlClassLibrary.ClassHTML myHtmlClear = new XunLong.HtmlClassLibrary.ClassHTML();
- // XunLong.HtmlClassLibrary.ClassHTML myHTML2CLEAR = new XunLong.HtmlClassLibrary.ClassHTML();
- /*
- /// <summary>
- /// 得到1个模板
- /// </summary>
- /// <param name="mmu">采样的URL队列</param>
- /// <param name="dirPath">保存的目录</param>
- /// <param name="name">保存名称</param>
- /// <returns></returns>
- public void creatModel(Hashtable mmu, string dirPath, string name)
- {
- //1 根据列表 得到数据队列
- Hashtable dataList = new Hashtable();
-
- //2 得到模板数据
- string ModelData = "";
- //存放数据
- Hashtable mmHTM = new Hashtable();
- mmHTM.Clear();
- for (int i = 0; i < mmu.Count; i++)
- foreach(System.Collections.DictionaryEntry strIt in mmu)
- {
- string tmpUrl = strIt.Key.ToString(); //url
- string tmp =strIt.Value.ToString().ToLower(); //dat
- if (tmp.Length > 0)
- {
- string mhtm = "";
- if (tmp.IndexOf("file://") > -1)
- {
- string fpath = tmp.PadRight(tmp.Length - 7);
- if (System.IO.File.Exists(fpath) == true)
- {
- mhtm = getFileData(fpath);
- }
- }
- else //http://
- {
- mhtm = GetOneHTML(tmp, "gb2312");
- }
-
- // mhtm = myHTML2CLEAR.HTML2CLEAR(mhtm,tmpUrl);
- if ((mmHTM.Contains(tmp) == false) & (mhtm.Length > 0))
- {
- mmHTM.Add(tmp, mhtm);
- Console.WriteLine("得到 Web 数据 " + tmp);
- }
- }
- Console.WriteLine("得到数据 : " +mmu.Count.ToString()+ " 在总共 " + i.ToString() + " 中");
- }
- //声明模板构建类
- ClassModelBuilder myBU = new ClassModelBuilder();
- Console.WriteLine("开始建立模板");
- //建立模板
- ModelData = myBU.BuilderModel(mmHTM);
- int nn = myBU.inStrNum(ModelData, "*");
- string aData = "";
- for (int i = 0; i < nn; i++)
- {
- aData = aData + "<TAGDATA INDEX=" + i.ToString() + "/>"+"\r\n";
- }
- //来源数据 只保存其中的HTTP
- string ja = "";
- foreach (string j in mmu)
- {
- if (j.IndexOf("http://") > -1)
- {
- ja = ja + j + "\r\n";
- }
- }
- //3 保存模板数据 .a 数据模板 .b 类聚模板 .c 相关模板 .d 模板数据 .e 来源数据 将采样中的http 路径 去掉最后一个位置 以Tab 隔开 保存
- putFileData(dirPath + "\\" + name + ".a", aData);
- Console.WriteLine("保存数据模板 " + dirPath + "\\" + name + ".a");
- putFileData(dirPath + "\\" + name + ".b", "<xl主类别>娱乐</xl主类别><其他属性><TAGDATA INDEX=1/></其他属性>");
- Console.WriteLine("保存类聚模板 " + dirPath + "\\" + name + ".b");
- putFileData(dirPath + "\\" + name + ".c", "<相关项>娱乐</相关项><相关项><TAGDATA INDEX=1/></相关项>");
- Console.WriteLine("保存相关模板 " + dirPath + "\\" + name + ".c");
- putFileData(dirPath + "\\" + name + ".d", ModelData);
- Console.WriteLine("保存模板数据 " + dirPath + "\\" + name + ".d");
- putFileData(dirPath + "\\" + name + ".e", ja);
- Console.WriteLine("保存来源数据 " + dirPath + "\\" + name + ".e");
- putFileData(dirPath + "\\" + name + ".t", "<TAGDATA INDEX=1/>");
- Console.WriteLine("保存标题模板 " + dirPath + "\\" + name + ".t");
- Console.WriteLine("模板建立成功!~ ");
-
- }
- */
- /*
- /// <summary>
- /// 得到一个网页数据
- /// </summary>
- /// <param name="murl"></param>
- /// <returns></returns>
- public string GetOneHTML(string murl, string codeType)
- {
- HttpWebRequest request = (HttpWebRequest)WebRequest.Create(murl);
- request.Timeout = 20000;
- try
- {
- //下面来看看如何处理HTML页面。首先要做的当然是下载HTML页面,这可以通过C#提供的HttpWebRequest类实现:
- // request = (HttpWebRequest)WebRequest.Create(murl);
- WebResponse response = request.GetResponse();
- Stream stream = response.GetResponseStream();
- string buffer = "", line;
- //接下来我们就从request创建一个stream流。在执行其他处理之前,我们要先确定该文件是二进制文件还是文本文件,不同的文件类型处理方式也不同。下面的代码确定该文件是否为二进制文件。
- //。如果是文本文件,首先从stream创建一个StreamReader,然后将文本文件的内容一行一行加入缓冲区。
- // response.ContentType.
- // Encoding gbx = System.Text.Encoding.GetEncoding("gb2312");
- //存放当前的应用的字符集
- string NowCodeSet = "";
- if (response.ContentType.ToLower().StartsWith("text/"))
- {
- //自动检测 UTF8
- if ((response.ContentType.ToLower().IndexOf("utf-8") > -1) | (response.ContentType.ToLower().IndexOf("UTF-8") > -1))
- {
- StreamReader reader = new StreamReader(stream, System.Text.Encoding.UTF8);
- NowCodeSet = "utf-8";
- buffer = "";
- while ((line = reader.ReadLine()) != null)
- {
- buffer += line + "\r\n";
- }
- reader.Close();
- stream.Close();
- response.Close();
- // buffer = myHTML2CLEAR.Str2Str(buffer);
- }
- else
- {
- //自动检测GB2312
- if ((response.ContentType.ToLower().IndexOf("gb2312") > -1) | (response.ContentType.ToLower().IndexOf("GB2312") > -1))
- {
- StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("GB2312"));
- NowCodeSet = "gb2312";
- buffer = "";
- while ((line = reader.ReadLine()) != null)
- {
- buffer += line + "\r\n";
- }
- reader.Close();
- stream.Close();
- response.Close();
- }
- else
- {
- //自动检测 不到时按照默认设置进行
- if (codeType == "GB2312")
- {
- StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("GB2312"));
- NowCodeSet = "gb2312";
- buffer = "";
- while ((line = reader.ReadLine()) != null)
- {
- buffer += line + "\r\n";
- }
- reader.Close();
- stream.Close();
- response.Close();
- }
- else
- {
- StreamReader reader = new StreamReader(stream, System.Text.Encoding.UTF8);
- NowCodeSet = "utf-8";
- buffer = "";
- while ((line = reader.ReadLine()) != null)
- {
- buffer += line + "\r\n";
- }
- reader.Close();
- stream.Close();
- response.Close();
- // buffer = myHTML2CLEAR.Str2Str(buffer);
- }
- }
- }
- }
- ///字符集为gb2312 而刚应用为utf-8
- if ((buffer.ToLower().IndexOf("gb2312") > -1) & (NowCodeSet == "utf-8"))
- {
- HttpWebRequest requestX = (HttpWebRequest)WebRequest.Create(murl);
- WebResponse responseX = requestX.GetResponse();
- Stream streamX = responseX.GetResponseStream();
- StreamReader readerX = new StreamReader(streamX, System.Text.Encoding.GetEncoding("GB2312"));
- buffer = "";
- while ((line = readerX.ReadLine()) != null)
- {
- buffer += line + "\r\n";
- }
- readerX.Close();
- streamX.Close();
- responseX.Close();
- }
- ///字符集为utf-8 而刚应用为 gb2312
- if ((buffer.ToLower().IndexOf("utf-8") > -1) & (NowCodeSet == "gb2312"))
- {
- HttpWebRequest requestY = (HttpWebRequest)WebRequest.Create(murl);
- WebResponse responseY = requestY.GetResponse();
- Stream streamY = responseY.GetResponseStream();
- StreamReader readerY = new StreamReader(streamY, System.Text.Encoding.UTF8);
- buffer = "";
- while ((line = readerY.ReadLine()) != null)
- {
- buffer += line + "\r\n";
- }
- readerY.Close();
- streamY.Close();
- responseY.Close();
- // buffer = myHTML2CLEAR.Str2Str(buffer);
- }
- // string tmm =clearHTMLDB(buffer);
- string tmm = buffer;
- if (tmm.Length > 0)
- {
- Console.WriteLine("GUrlData : --> " + murl);
- }
- return tmm; //返回经过过滤得数据
- }
- catch
- {
- request.Abort();
- Console.WriteLine("Err : --> " + murl);
- return "";
- }
- }
- */
/// <summary>
/// Reads the entire contents of a text file, decoding it as GB2312.
/// </summary>
/// <param name="filename">Path of the file to read.</param>
/// <returns>
/// The file's text, or an empty string when an <see cref="IOException"/>
/// occurs while opening or reading the file.
/// </returns>
private string getFileData(string filename)
{
    try
    {
        // 'using' is nested inside the try so the reader is closed exactly once
        // and an IOException raised during cleanup is handled by the catch below.
        using (StreamReader reader = new StreamReader(filename, System.Text.Encoding.GetEncoding("gb2312")))
        {
            return reader.ReadToEnd();
        }
    }
    catch (IOException e)
    {
        // Best-effort read: report the problem and fall through to the empty result.
        Console.WriteLine(e.Message);
    }

    return "";
}
/// <summary>
/// Writes text to a file using the GB2312 encoding, replacing any existing content.
/// </summary>
/// <param name="filename">Path of the file to create or overwrite.</param>
/// <param name="data">Text to write.</param>
/// <remarks>
/// An <see cref="IOException"/> raised while opening, writing or closing the file
/// is reported on the console and not rethrown.
/// </remarks>
private void putFileData(string filename, string data)
{
    try
    {
        // 'using' is nested inside the try: if the flush performed on dispose fails,
        // that IOException is caught here instead of escaping from a finally block.
        using (StreamWriter writer = new StreamWriter(filename, false, System.Text.Encoding.GetEncoding("gb2312")))
        {
            writer.Write(data);
        }
    }
    catch (IOException e)
    {
        // Best-effort write: report the failure and continue.
        Console.WriteLine(e.Message);
    }
}
- }
- }