ClassMagModelBuilder.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:14k
源码类别:

搜索引擎

开发平台:

C#

  1. using System;
  2. using System.IO;
  3. using System.Collections.Generic;
  4. using System.Text;
  5. using System.Collections;
  6. using System.Threading;
  7. using System.Net;
  8. /*
  9.       '       迅龙中文分类搜索引擎  v0.6
  10.       '
  11.       '        LGPL  许可发行
  12.       '
  13.       '       宁夏大学  张冬 康彩  zd4004@163.com
  14.       ' 
  15.       '        官网 http://blog.163.com/zd4004/
  16.  */
  17. namespace XunLong.ModelBuilder
  18. {
  19.     /// <summary>
  20.     /// 控制 模板生成器
  21.     /// </summary>
  22.     public  class ClassMagModelBuilder
  23.     {
  24.         /// <summary>
  25.         /// HTML 处理类
  26.         /// </summary>
  27.       //  XunLong.HtmlClassLibrary.ClassHTML myHtmlClear = new XunLong.HtmlClassLibrary.ClassHTML();
  28.       //  XunLong.HtmlClassLibrary.ClassHTML myHTML2CLEAR = new XunLong.HtmlClassLibrary.ClassHTML();
  29.         /*
  30.         /// <summary>
  31.         /// 得到1个模板
  32.         /// </summary>
  33.         /// <param name="mmu">采样的URL队列</param>
  34.         /// <param name="dirPath">保存的目录</param>
  35.         /// <param name="name">保存名称</param>
  36.         /// <returns></returns>
  37.         public void creatModel(Hashtable mmu, string dirPath, string name)
  38.         { 
  39.             //1 根据列表 得到数据队列
  40.             Hashtable dataList = new Hashtable();
  41.         
  42.             //2 得到模板数据
  43.             string ModelData = "";
  44.             //存放数据
  45.             Hashtable mmHTM = new Hashtable();
  46.             mmHTM.Clear();
  47.             for (int i = 0; i < mmu.Count; i++)
  48.             foreach(System.Collections.DictionaryEntry strIt in mmu)
  49.             {
  50.                 string tmpUrl = strIt.Key.ToString();      //url
  51.                 string tmp =strIt.Value.ToString().ToLower(); //dat
  52.                 if (tmp.Length > 0)
  53.                 {            
  54.                     string mhtm = "";
  55.                     if (tmp.IndexOf("file://") > -1)
  56.                     {
  57.                         string fpath = tmp.PadRight(tmp.Length - 7);
  58.                         if (System.IO.File.Exists(fpath) == true)
  59.                         {
  60.                             mhtm = getFileData(fpath);
  61.                         }
  62.                     }
  63.                     else  //http://
  64.                     {
  65.                         mhtm = GetOneHTML(tmp, "gb2312");
  66.                     }
  67.                
  68.               //      mhtm =  myHTML2CLEAR.HTML2CLEAR(mhtm,tmpUrl);
  69.                     if ((mmHTM.Contains(tmp) == false) & (mhtm.Length > 0))
  70.                     {
  71.                         mmHTM.Add(tmp, mhtm);
  72.                         Console.WriteLine("得到 Web 数据 " + tmp);
  73.                     }
  74.                 }
  75.                 Console.WriteLine("得到数据 : " +mmu.Count.ToString()+ " 在总共 " + i.ToString() + " 中");
  76.             }
  77.             //声明模板构建类
  78.             ClassModelBuilder myBU = new ClassModelBuilder();
  79.             Console.WriteLine("开始建立模板");
  80.             //建立模板
  81.             ModelData = myBU.BuilderModel(mmHTM);
  82.             int nn = myBU.inStrNum(ModelData, "*");
  83.             string aData = "";
  84.             for (int i = 0; i < nn; i++)
  85.             {
  86.                 aData = aData + "<TAGDATA INDEX=" + i.ToString() + "/>"+"rn";
  87.             }
  88.             //来源数据  只保存其中的HTTP
  89.             string ja = "";
  90.             foreach (string j in mmu)
  91.             {
  92.                 if (j.IndexOf("http://") > -1)
  93.                 {
  94.                     ja = ja + j + "rn";
  95.                 }
  96.             }
  97.             //3 保存模板数据    .a 数据模板 .b 类聚模板 .c 相关模板 .d 模板数据 .e 来源数据 将采样中的http  路径 去掉最后一个位置 以Tab 隔开 保存
  98.             putFileData(dirPath + "\" + name + ".a", aData);
  99.             Console.WriteLine("保存数据模板 " + dirPath + "\" + name + ".a");
  100.             putFileData(dirPath + "\" + name + ".b", "<xl主类别>娱乐</xl主类别><其他属性><TAGDATA INDEX=1/></其他属性>");
  101.             Console.WriteLine("保存类聚模板 " + dirPath + "\" + name + ".b");
  102.             putFileData(dirPath + "\" + name + ".c", "<相关项>娱乐</相关项><相关项><TAGDATA INDEX=1/></相关项>");
  103.             Console.WriteLine("保存相关模板 " + dirPath + "\" + name + ".c");
  104.             putFileData(dirPath + "\" + name + ".d", ModelData);
  105.             Console.WriteLine("保存模板数据 " + dirPath + "\" + name + ".d");
  106.             putFileData(dirPath + "\" + name + ".e", ja);
  107.             Console.WriteLine("保存来源数据 " + dirPath + "\" + name + ".e");
  108.             putFileData(dirPath + "\" + name + ".t", "<TAGDATA INDEX=1/>");
  109.             Console.WriteLine("保存标题模板 " + dirPath + "\" + name + ".t");
  110.             Console.WriteLine("模板建立成功!~ ");
  111.             
  112.         }
  113.         */
  114.         /*
  115.         /// <summary>
  116.         /// 得到一个网页数据
  117.         /// </summary>
  118.         /// <param name="murl"></param>
  119.         /// <returns></returns>
  120.         public string GetOneHTML(string murl, string codeType)
  121.         {
  122.             HttpWebRequest request = (HttpWebRequest)WebRequest.Create(murl);
  123.             request.Timeout = 20000;
  124.             try
  125.             {
  126.                 //下面来看看如何处理HTML页面。首先要做的当然是下载HTML页面,这可以通过C#提供的HttpWebRequest类实现: 
  127.                 // request = (HttpWebRequest)WebRequest.Create(murl);
  128.                 WebResponse response = request.GetResponse();
  129.                 Stream stream = response.GetResponseStream();
  130.                 string buffer = "", line;
  131.                 //接下来我们就从request创建一个stream流。在执行其他处理之前,我们要先确定该文件是二进制文件还是文本文件,不同的文件类型处理方式也不同。下面的代码确定该文件是否为二进制文件。 
  132.                 //。如果是文本文件,首先从stream创建一个StreamReader,然后将文本文件的内容一行一行加入缓冲区。 
  133.                 //  response.ContentType.
  134.                 // Encoding gbx = System.Text.Encoding.GetEncoding("gb2312");
  135.                 //存放当前的应用的字符集
  136.                 string NowCodeSet = "";
  137.                 if (response.ContentType.ToLower().StartsWith("text/"))
  138.                 {
  139.                     //自动检测 UTF8
  140.                     if ((response.ContentType.ToLower().IndexOf("utf-8") > -1) | (response.ContentType.ToLower().IndexOf("UTF-8") > -1))
  141.                     {
  142.                         StreamReader reader = new StreamReader(stream, System.Text.Encoding.UTF8);
  143.                         NowCodeSet = "utf-8";
  144.                         buffer = "";
  145.                         while ((line = reader.ReadLine()) != null)
  146.                         {
  147.                             buffer += line + "rn";
  148.                         }
  149.                         reader.Close();
  150.                         stream.Close();
  151.                         response.Close();
  152.               //          buffer = myHTML2CLEAR.Str2Str(buffer);
  153.                     }
  154.                     else
  155.                     {
  156.                         //自动检测GB2312
  157.                         if ((response.ContentType.ToLower().IndexOf("gb2312") > -1) | (response.ContentType.ToLower().IndexOf("GB2312") > -1))
  158.                         {
  159.                             StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("GB2312"));
  160.                             NowCodeSet = "gb2312";
  161.                             buffer = "";
  162.                             while ((line = reader.ReadLine()) != null)
  163.                             {
  164.                                 buffer += line + "rn";
  165.                             }
  166.                             reader.Close();
  167.                             stream.Close();
  168.                             response.Close();
  169.                         }
  170.                         else
  171.                         {
  172.                             //自动检测 不到时按照默认设置进行
  173.                             if (codeType == "GB2312")
  174.                             {
  175.                                 StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("GB2312"));
  176.                                 NowCodeSet = "gb2312";
  177.                                 buffer = "";
  178.                                 while ((line = reader.ReadLine()) != null)
  179.                                 {
  180.                                     buffer += line + "rn";
  181.                                 }
  182.                                 reader.Close();
  183.                                 stream.Close();
  184.                                 response.Close();
  185.                             }
  186.                             else
  187.                             {
  188.                                 StreamReader reader = new StreamReader(stream, System.Text.Encoding.UTF8);
  189.                                 NowCodeSet = "utf-8";
  190.                                 buffer = "";
  191.                                 while ((line = reader.ReadLine()) != null)
  192.                                 {
  193.                                     buffer += line + "rn";
  194.                                 }
  195.                                 reader.Close();
  196.                                 stream.Close();
  197.                                 response.Close();
  198.                //                 buffer = myHTML2CLEAR.Str2Str(buffer);
  199.                             }
  200.                         }
  201.                     }
  202.                 }
  203.                 ///字符集为gb2312  而刚应用为utf-8 
  204.                 if ((buffer.ToLower().IndexOf("gb2312") > -1) & (NowCodeSet == "utf-8"))
  205.                 {
  206.                     HttpWebRequest requestX = (HttpWebRequest)WebRequest.Create(murl);
  207.                     WebResponse responseX = requestX.GetResponse();
  208.                     Stream streamX = responseX.GetResponseStream();
  209.                     StreamReader readerX = new StreamReader(streamX, System.Text.Encoding.GetEncoding("GB2312"));
  210.                     buffer = "";
  211.                     while ((line = readerX.ReadLine()) != null)
  212.                     {
  213.                         buffer += line + "rn";
  214.                     }
  215.                     readerX.Close();
  216.                     streamX.Close();
  217.                     responseX.Close();
  218.                 }
  219.                 ///字符集为utf-8 而刚应用为 gb2312 
  220.                 if ((buffer.ToLower().IndexOf("utf-8") > -1) & (NowCodeSet == "gb2312"))
  221.                 {
  222.                     HttpWebRequest requestY = (HttpWebRequest)WebRequest.Create(murl);
  223.                     WebResponse responseY = requestY.GetResponse();
  224.                     Stream streamY = responseY.GetResponseStream();
  225.                     StreamReader readerY = new StreamReader(streamY, System.Text.Encoding.UTF8);
  226.                     buffer = "";
  227.                     while ((line = readerY.ReadLine()) != null)
  228.                     {
  229.                         buffer += line + "rn";
  230.                     }
  231.                     readerY.Close();
  232.                     streamY.Close();
  233.                     responseY.Close();
  234.              //       buffer = myHTML2CLEAR.Str2Str(buffer);
  235.                 }
  236.                 //   string tmm =clearHTMLDB(buffer);
  237.                 string tmm = buffer;
  238.                 if (tmm.Length > 0)
  239.                 {
  240.                     Console.WriteLine("GUrlData : --> " + murl);
  241.                 }
  242.                 return tmm;  //返回经过过滤得数据
  243.             }
  244.             catch
  245.             {
  246.                 request.Abort();
  247.                 Console.WriteLine("Err : --> " + murl);
  248.                 return "";
  249.             }
  250.         }
  251.         */
  252.         /// <summary>
  253.         /// 读文件
  254.         /// </summary>
  255.         /// <param name="filename"></param>
  256.         /// <returns></returns>
  257.         private string getFileData(string filename)
  258.         {
  259.             StreamReader reader = null;
  260.             string data = string.Empty;
  261.             try
  262.             {
  263.                 reader = new StreamReader(filename, System.Text.Encoding.GetEncoding("gb2312"));
  264.                 data = reader.ReadToEnd();
  265.                 reader.Close();
  266.                 return data;
  267.             }
  268.             catch (IOException e)
  269.             {
  270.                 Console.WriteLine(e.Message);
  271.             }
  272.             finally
  273.             {
  274.                 if (reader != null)
  275.                     reader.Close();
  276.             }
  277.             return "";
  278.             /*
  279.             StreamReader reader = null;
  280.             string data = string.Empty;
  281.             try
  282.             {
  283.                 reader = new StreamReader(filename, System.Text.Encoding.GetEncoding("gb2312"));
  284.                 for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
  285.                 {
  286.                     if (data == "")
  287.                     {
  288.                         data = line;
  289.                     }
  290.                     else
  291.                     {
  292.                         data = data + "n" + line;
  293.                     }
  294.                 }
  295.                 reader.Close();
  296.                 return data;
  297.             }
  298.             catch (IOException e)
  299.             {
  300.                 Console.WriteLine(e.Message);
  301.             }
  302.             finally
  303.             {
  304.                 if (reader != null)
  305.                     reader.Close();
  306.             }
  307.             return "";
  308.             */
  309.         }
  310.         /// <summary>
  311.         /// 写文件
  312.         /// </summary>
  313.         /// <param name="filename"></param>
  314.         /// <param name="data"></param>
  315.         private void putFileData(string filename, string data)
  316.         {
  317.             StreamWriter writer = null;
  318.             try
  319.             {
  320.                 writer = new StreamWriter(filename, false, System.Text.Encoding.GetEncoding("gb2312"));
  321.                 writer.Write(data);
  322.                 writer.Close();
  323.             }
  324.             catch (IOException e)
  325.             {
  326.                 Console.WriteLine(e.Message);
  327.             }
  328.             finally
  329.             {
  330.                 if (writer != null)
  331.                     writer.Close();
  332.             }
  333.         }
  334.     }
  335. }