ClassSpiderMain.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:41k
源码类别:

搜索引擎

开发平台:

C#

  1. using System;
  2. using System.IO;
  3. using System.Collections.Generic;
  4. using System.Text;
  5. using System.Net;
  6. using System.Net.Sockets;
  7. using System.Threading;
  8. using System.Collections;
  9. /*
  10.       '       迅龙中文分类搜索引擎  v0.6
  11.       '
  12.       '        LGPL  许可发行
  13.       '
  14.       '       宁夏大学  张冬 康彩  zd4004@163.com
  15.       ' 
  16.       '        官网 http://blog.163.com/zd4004/
  17.  */
  18. namespace XunLong.ConsoleSpiderOne
  19. {
  20.     class ClassSpiderMain
  21.     {
  /// <summary>
  /// True when the index has changed. ZhiZhuRun checks this flag on each
  /// periodic refresh and clears it after the index data has been copied.
  /// </summary>
  public bool IndexChang = false;

  /// <summary>
  /// True while the index copy is in progress (set and cleared in ZhiZhuRun).
  /// Per the original note, the spider should be delayed during that time.
  /// </summary>
  private bool IsRunIndexWrite = false;

  /// <summary>
  /// Total number of pages downloaded.
  /// </summary>
  public int Num = 0;

  /// <summary>
  /// Number of downloaded pages that were actually stored ("valid" pages).
  /// </summary>
  public int Num_1 = 0;

  /// <summary>
  /// Per the original note: used to detect whether the index count changed.
  /// </summary>
  private int N_B_OLD = 0;

  /// <summary>
  /// Tick count of the previous speed sample (used for speed measurement).
  /// </summary>
  private int OneTime_X = 0;

  /// <summary>
  /// Sampling queue.
  /// </summary>
  private ArrayList neeDurl = new ArrayList();

  private string OneDir = "";

  // Storage directory; assigned from Config.SpiderData in StartMain.
  private string FileDir = "";

  // Last speed sample; ZhiZhuRun divides it by 10 before printing.
  private int YYY = 0;

  /// <summary>
  /// Location where the index is stored.
  /// </summary>
  private string indexPath = "";

  /// <summary>
  /// Source URL list. Per the original note, a URL may be downloaded as long
  /// as it is on the same level as any address in this list.
  /// </summary>
  private ArrayList sourceUrls = new ArrayList();

  /// <summary>
  /// Encoder.
  /// </summary>
  private NewNxuEncoding.CNewNxuEncoding codeIt = new NewNxuEncoding.CNewNxuEncoding();

  /// <summary>
  /// URL database (pending / visited URLs).
  /// </summary>
  private XunLong.UrlDBClassLibrary.ClassUrlDB newUrlDBClass = new XunLong.UrlDBClassLibrary.ClassUrlDB();

  /// <summary>
  /// Page storage.
  /// </summary>
  private NetHashTableAPI.ClassNHT myDB = new NetHashTableAPI.ClassNHT();

  /// <summary>
  /// HTML processing helper.
  /// </summary>
  private XunLong.HtmlClassLibrary.ClassHTML myClassHTML = new XunLong.HtmlClassLibrary.ClassHTML();

  private System.Random myRandom = new Random();

  /// <summary>
  /// Tick count of the last periodic refresh. ZhiZhuRun compares it against
  /// Config.IndexDataTIME (in minutes) to decide when to refresh again.
  /// </summary>
  private int iTime = Environment.TickCount;

  /// <summary>
  /// File extensions that do not need to be processed.
  /// </summary>
  private string[] FILEEX = new string[]
  {
      "css", "js", "zip", "avi", "rar", "exe", "dat", "png", "jpg", "gif",
      "mp3", "rm", "rmvb", "doc", "xsl", "pdf", "asf", "wav", "wmv", "mpeg",
      "mp4", "txt", "gz", "tar", "torrent", "swf", "ppt", "mdb", "iso", "bin",
      "dll", "obj", "svg", "xml", "mov", "pps", "ico", "iuc", "bak", "pps",
      "gz"
  };

  /// <summary>
  /// Path of the configuration file (the value passed to StartMain).
  /// </summary>
  private string IndexData_Config_Path = "";

  /// <summary>
  /// Creates a spider; all setup is deferred to StartMain.
  /// </summary>
  public ClassSpiderMain()
  {
  }

  /// <summary>
  /// Whether the spider should keep running. Set by StartMain, cleared by StopSpider.
  /// </summary>
  public bool IsRun = false;
  /// <summary>
  /// Prepares the spider for a run: resets the counters, points the URL
  /// database at its cache/source files, initializes page storage, seeds the
  /// URL database with the URLs already stored and, unless the
  /// Config.xlDirectIndex setting contains the character '0', opens the
  /// direct index.
  /// </summary>
  /// <param name="k_c_path">
  /// Configuration path; remembered in IndexData_Config_Path and passed to the
  /// storage class and to DirectIndex.OpenIndex.
  /// </param>
  public void StartMain(string k_c_path)
  {
      IndexData_Config_Path = k_c_path;
      IsRun = true;
      Num = 0;
      Num_1 = 0;
      FileDir = XunLong.CongifData.Config.SpiderData;

      string urlsCache = XunLong.CongifData.Config.UrlCahceData + "c";
      string urlsSource = XunLong.CongifData.Config.UrlSourceData;

      Console.WriteLine(" 源地址  " + urlsSource);
      // Fixed: report the cache file that is actually used (previously this
      // printed urlsSource + "c", which is not the path handed to the URL DB).
      Console.WriteLine(" 地址缓存  " + urlsCache);
      // NOTE(review): this line prints the source-URL path under the
      // "index location" label; the intended value is not visible here, so
      // the output is left unchanged -- confirm and correct.
      Console.WriteLine("索引存放  " + urlsSource);

      newUrlDBClass.urlsCacheFile = urlsCache;
      newUrlDBClass.urlsSourceFile = urlsSource;

      // Load the URL service and take over its list of source URLs.
      newUrlDBClass.StartFirstUrls();
      sourceUrls = newUrlDBClass.SourceURLs;

      // Configure the storage class (3145727 is just under 3 MB).
      myDB.SetClassNHT(FileDir, 3145727, k_c_path);

      // Register every URL already present in storage as "old".
      ArrayList aOLDUrl = myDB.SearchOneList("http://");
      newUrlDBClass.AddOldurlsIOLD(aOLDUrl);

      // Reset the monitored start URLs and restart the refresh timer.
      newUrlDBClass.ReSetSource();
      iTime = Environment.TickCount;

      // Direct indexing is set up only when the setting contains no '0'.
      if (XunLong.CongifData.Config.xlDirectIndex.IndexOf('0') < 0)
      {
          newUrlDBClass.Load_Had_Url(XunLong.CongifData.Config.Had_Url_Data);
          XunLong.clsDirectIndex.DirectIndex.OpenIndex(k_c_path);
          XunLong.clsDirectIndex.DirectIndex.InitCnStopWord(XunLong.CongifData.Config.StopWordData);
      }
  }
  /// <summary>
  /// Main crawl loop. Runs until IsRun becomes false. Each iteration performs
  /// the periodic refresh when it is due, takes one URL from the URL database,
  /// downloads it, stores it when IsSaveData says so, updates the statistics
  /// and extracts further links. Any exception escaping an iteration is
  /// reported as "->E" on the console and the loop continues.
  /// </summary>
  public void ZhiZhuRun()
  {
      while (IsRun)
      {
          try
          {
              RunOneCrawlStep();
          }
          catch (Exception)
          {
              Console.Write("->E");
          }
      }
  }

  /// <summary>
  /// One iteration of the crawl loop (see ZhiZhuRun).
  /// </summary>
  private void RunOneCrawlStep()
  {
      RefreshIndexAndSourcesIfDue();

      // The refresh can take a while; honour a stop request issued meanwhile.
      if (IsRun == false)
      {
          return;
      }

      string newUrl = "";
      try
      {
          newUrl = newUrlDBClass.getOneUrl();
      }
      catch
      {
          Console.Write("->E");
      }

      // No pending URL (a null result is treated like an empty one so the
      // loop idles instead of spinning on a NullReferenceException).
      if (string.IsNullOrEmpty(newUrl))
      {
          System.Threading.Thread.Sleep(1000);
          return;
      }

      Console.WriteLine("开始请求 >> " + newUrl);
      string getDB = GetOneHTML(newUrl, "gb2312");

      // "XL_NULL" marks a non-text resource: nothing to store or parse.
      if (getDB == "XL_NULL")
      {
          return;
      }

      // Empty result: the request failed, push the URL back for a later retry.
      if (getDB.Length == 0)
      {
          if (newUrl.Length > 7) // longer than "http://"
          {
              try
              {
                  newUrlDBClass.putOneUrl2(newUrl);
                  Console.WriteLine(" 请求失败 重新压回 URL数据库 >> " + newUrl);
              }
              catch
              {
                  Console.Write("->E");
              }
          }
          return;
      }

      if (IsSaveData(newUrl, getDB) == true)
      {
          if (!TryStoreWithRetry(newUrl, getDB))
          {
              // The URL was pushed back; skip statistics and link extraction
              // for this page, exactly as the original restart did.
              return;
          }
          Num_1 = Num_1 + 1;
      }
      else
      {
          Console.WriteLine("通过索引 取消存储!");
      }

      Num = Num + 1;
      if (Num % 10 == 0)
      {
          // YYY holds ten times the pages-per-second rate of the last 10 pages.
          int elapsed = Environment.TickCount - OneTime_X;
          if (elapsed > 0)
          {
              // Guarded: a zero tick difference used to raise DivideByZeroException.
              YYY = 100000 / elapsed;
          }
          OneTime_X = Environment.TickCount;
      }
      double yyy_d = (double)YYY / 10;

      Console.WriteLine("总共下载了 " + Num.ToString() + " 条数据 其中过滤得到 "+Num_1.ToString()+" 条 有效数据");
      Console.WriteLine("===================>>>>>  当前速度:    " + yyy_d.ToString() + "    条/秒");

      // Extract the links of this page and queue them.
      try
      {
          GetAddUrl(getDB, newUrl);
      }
      catch
      {
          Console.Write("->E");
      }
  }

  /// <summary>
  /// When more than Config.IndexDataTIME minutes have passed since the last
  /// refresh: publishes a changed index (close, copy, reopen), resets the
  /// monitored source URLs and prints a banner.
  /// </summary>
  private void RefreshIndexAndSourcesIfDue()
  {
      int iTime_2 = Environment.TickCount;
      if (iTime_2 - iTime > XunLong.CongifData.Config.IndexDataTIME*60*1000)
      {
          iTime = Environment.TickCount;

          if (IndexChang == true)
          {
              IsRunIndexWrite = true;
              try
              {
                  XunLong.clsDirectIndex.DirectIndex.CloseIndex();
                  // Copy the fresh index data into the search directory.
                  CopyIndexData(XunLong.CongifData.Config.IndexData, XunLong.CongifData.Config.IndexData2);
                  XunLong.clsDirectIndex.DirectIndex.OpenIndex(IndexData_Config_Path);
              }
              finally
              {
                  // Never leave the "copy in progress" flag stuck after a failure.
                  IsRunIndexWrite = false;
              }
              IndexChang = false;
          }

          newUrlDBClass.ReSetSource();

          Console.WriteLine("*******************************");
          Console.WriteLine("*                             *");
          Console.WriteLine("*        监控 重新得到数据    *");
          Console.WriteLine("*                             *");
          Console.WriteLine("*******************************");
      }
  }

  /// <summary>
  /// Stores one page through myDB, waiting for the storage flag myDB.lockIt
  /// to be free. After more than 200 unsuccessful attempts the URL is pushed
  /// back into the URL database.
  /// </summary>
  /// <param name="url">URL of the page.</param>
  /// <param name="html">Downloaded page text.</param>
  /// <returns>True when the page was stored, false when the URL was pushed back.</returns>
  private bool TryStoreWithRetry(string url, string html)
  {
      // NOTE(review): lockIt is checked and then set without any visible
      // synchronization, so two threads could both acquire it -- confirm
      // whether the storage class is shared between spider threads.
      int attempts = 0;
      while (true)
      {
          try
          {
              if (myDB.lockIt == false)
              {
                  myDB.lockIt = true;
                  try
                  {
                      myDB.add(url, html, true);
                  }
                  finally
                  {
                      // Release the flag even when add() throws; otherwise
                      // storage would stay locked for every later page.
                      myDB.lockIt = false;
                  }
                  return true;
              }

              // Storage busy: wait 100-1099 ms before trying again.
              System.Threading.Thread.Sleep(DateTime.Now.Millisecond + 100);
          }
          catch
          {
              Console.Write("->E");
          }

          attempts = attempts + 1;
          if (attempts > 200)
          {
              Console.Write("->E-R");
              newUrlDBClass.putOneUrl2(url);
              Console.WriteLine(" 写入失败 重新压回 URL数据库 >> " + url);
              return false;
          }
      }
  }
  /// <summary>
  /// Stops the spider: clears IsRun, saves the remaining URLs to the cache
  /// file and closes the direct index unless the Config.xlDirectIndex setting
  /// contains the character '0'.
  /// </summary>
  public void StopSpider()
  {
      IsRun = false;
      newUrlDBClass.SaveUrlsCache();

      bool settingHasZero = XunLong.CongifData.Config.xlDirectIndex.IndexOf('0') > -1;
      if (!settingHasZero)
      {
          XunLong.clsDirectIndex.DirectIndex.CloseIndex();
      }
  }
  /// <summary>
  /// Downloads one page and returns its text with every line terminated by CR LF.
  /// The character set is taken from the Content-Type header ("utf-8"/"unicode"
  /// are read as UTF-8, "gb2312"/"gbk" as GB2312); when the header names none,
  /// <paramref name="codeType"/> decides. If the decoded text mentions the other
  /// character set, the page is downloaded once more with that encoding.
  /// UTF-8 text is passed through Str2Str (defined elsewhere in this class).
  /// </summary>
  /// <param name="murl">Absolute URL to request.</param>
  /// <param name="codeType">Fallback encoding name; any value containing "utf" selects UTF-8, everything else GB2312.</param>
  /// <returns>
  /// The page text; "XL_NULL" when the content type is not text/*; a fixed
  /// "Too_Long" placeholder page when the body is 128 KB or larger; an empty
  /// string when the request fails.
  /// </returns>
  private string GetOneHTML(string murl, string codeType)
  {
      try
      {
          HttpWebRequest request = (HttpWebRequest)WebRequest.Create(murl);
          request.Timeout = 10000;
          try
          {
              string buffer;
              bool readAsUtf8;

              // 'using' guarantees the response is released on every path,
              // including the early returns below.
              using (WebResponse response = request.GetResponse())
              {
                  if (response.ContentLength >= 1024 * 128)
                  {
                      return "<html><title>Too_Long</title><body>Too_Long</body></html>";
                  }

                  string contentType = response.ContentType;
                  if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase))
                  {
                      // Not a text resource.
                      return "XL_NULL";
                  }

                  if (contentType.IndexOf("utf-8", StringComparison.OrdinalIgnoreCase) > -1
                      || contentType.IndexOf("unicode", StringComparison.OrdinalIgnoreCase) > -1)
                  {
                      readAsUtf8 = true;
                  }
                  else if (contentType.IndexOf("gb2312", StringComparison.OrdinalIgnoreCase) > -1
                      || contentType.IndexOf("gbk", StringComparison.OrdinalIgnoreCase) > -1)
                  {
                      readAsUtf8 = false;
                  }
                  else
                  {
                      // Header names no known charset: use the caller's default.
                      readAsUtf8 = codeType.IndexOf("utf", StringComparison.OrdinalIgnoreCase) > -1;
                  }

                  using (Stream stream = response.GetResponseStream())
                  {
                      buffer = ReadResponseLines(stream, readAsUtf8 ? Encoding.UTF8 : Encoding.GetEncoding("GB2312"));
                  }
              }

              if (readAsUtf8)
              {
                  buffer = Str2Str(buffer);

                  // Read as UTF-8 but the page mentions gb2312: fetch again as GB2312.
                  if (buffer.IndexOf("gb2312", StringComparison.OrdinalIgnoreCase) > -1)
                  {
                      buffer = DownloadWithEncoding(murl, Encoding.GetEncoding("GB2312"));
                  }
              }
              else if (buffer.IndexOf("utf-8", StringComparison.OrdinalIgnoreCase) > -1)
              {
                  // Read as GB2312 but the page mentions utf-8: fetch again as UTF-8.
                  buffer = Str2Str(DownloadWithEncoding(murl, Encoding.UTF8));
              }

              if (buffer.Length > 0)
              {
                  Console.WriteLine("GUrlData : --> " + murl);
              }

              // Content-Length may be missing, so the size is checked again
              // on the decoded text.
              if (buffer.Length > 1024 * 128)
              {
                  return "<html><title>Too_Long2</title><body>Too_Long</body></html>";
              }
              return buffer;
          }
          catch
          {
              request.Abort();
              Console.WriteLine("Err : --> " + murl);
              return "";
          }
      }
      catch
      {
          // The request could not even be created (e.g. malformed URL).
          Console.Write("->E");
          return "";
      }
  }

  /// <summary>
  /// Reads the whole stream line by line with the given encoding and returns
  /// the text with each line terminated by CR LF.
  /// </summary>
  private static string ReadResponseLines(Stream stream, Encoding encoding)
  {
      StringBuilder text = new StringBuilder();
      using (StreamReader reader = new StreamReader(stream, encoding))
      {
          string line;
          while ((line = reader.ReadLine()) != null)
          {
              // Fixed: the terminator literal had lost its backslashes ("rn"),
              // which appended the letters r and n to every line.
              text.Append(line).Append("\r\n");
          }
      }
      return text.ToString();
  }

  /// <summary>
  /// Requests the URL again and decodes the body with the given encoding.
  /// Exceptions propagate to the caller.
  /// </summary>
  private static string DownloadWithEncoding(string url, Encoding encoding)
  {
      HttpWebRequest retry = (HttpWebRequest)WebRequest.Create(url);
      // Same timeout as the first request; the retry previously had none set.
      retry.Timeout = 10000;
      using (WebResponse response = retry.GetResponse())
      using (Stream stream = response.GetResponseStream())
      {
          return ReadResponseLines(stream, encoding);
      }
  }
  /// <summary>
  /// Reads a whole file as GB2312 text.
  /// </summary>
  /// <param name="filename">Path of the file to read.</param>
  /// <returns>
  /// The file content, or an empty string when an IOException occurs
  /// (its message is written to the console).
  /// </returns>
  public string getFileData(string filename)
  {
      try
      {
          // The using block closes the reader on success and on failure alike.
          using (StreamReader input = new StreamReader(filename, System.Text.Encoding.GetEncoding("gb2312")))
          {
              return input.ReadToEnd();
          }
      }
      catch (IOException ioError)
      {
          Console.WriteLine(ioError.Message);
          return "";
      }
  }
  /// <summary>
  /// Writes text to a file as GB2312, replacing any existing content.
  /// An IOException is reported on the console and otherwise ignored.
  /// </summary>
  /// <param name="filename">Path of the file to write.</param>
  /// <param name="data">Text to write.</param>
  public void putFileData(string filename, string data)
  {
      try
      {
          // append: false -> the file is overwritten; the using block closes
          // the writer whether or not the write succeeds.
          using (StreamWriter output = new StreamWriter(filename, false, System.Text.Encoding.GetEncoding("gb2312")))
          {
              output.Write(data);
          }
      }
      catch (IOException ioError)
      {
          Console.WriteLine(ioError.Message);
      }
  }
  585.         /*
  586.         /// <summary> 
  587.         /// 把URL拆成路径 写入数据    
  588.         /// </summary>
  589.         /// <param name="url"></param>
  590.         /// <param name="data"></param>
  591.         private void WriteWeb2Disk(string url, string data)
  592.         {
  593.             url=url.Trim().ToLower();
  594.             //
  595.             if ((url.IndexOf('/') == -1) | (url.ToLower().IndexOf("http://") == -1))
  596.             {
  597.                 return;  //URL错误
  598.             }
  599.             // 去掉URL 结尾的 /
  600.             if (url.Substring(url.Length - 1, 1) == "/")
  601.             { 
  602.               
  603.                 url = url.Substring(0,url.Length-1);
  604.             
  605.             }
  606.             //找到第一个等号  最后的链接中加上
  607.             string urlAdd = "";
  608.             int ddhao = url.IndexOf('?');
  609.             if ( ddhao>-1)
  610.             {
  611.                 string urlTmp = url;
  612.                 int urlLen = url.Length;
  613.                // url = urlTmp.PadLeft(ddhao - 1);
  614.                // urlAdd = urlTmp.PadRight(urlLen -ddhao);
  615.                 url = urlTmp.Substring(0,ddhao - 1);
  616.                 urlAdd = urlTmp.Substring(ddhao+1, urlLen - ddhao-1);
  617.             }
  618.           
  619.             if (url.IndexOf('?') > -1)
  620.             {
  621.                 int ic = 0;
  622.             }
  623.             string[] U = url.Split('/');
  624.             string tmpDir = FileDir;
  625.             //起始的地址  http://www.qq.com 
  626.             if (U.Length == 3)
  627.             {
  628.                 //1  建立文件夹
  629.                 //string a = tmpDir + "\" + codeIt.DirCN2CODE(U[U.Length - 1]);  //对路径进行编码  使之符合规范
  630.                 string a = tmpDir + "\" +U[U.Length - 1];  //对路径进行编码  使之符合规范
  631.                 if (System.IO.Directory.Exists(tmpDir) == false)
  632.                 {
  633.                     System.IO.Directory.CreateDirectory(tmpDir);
  634.                 }
  635.                 if (System.IO.Directory.Exists(a) == false)
  636.                 {
  637.                     System.IO.Directory.CreateDirectory(a);
  638.                 }
  639.                 //2 保存文件  为WEBMAINPAGE.HTM
  640.                 //得到文件路径
  641.                 string filePathOne = a + "\WEBMAINPAGE.HTM" ;
  642.                 //保存数据
  643.                 putFileData(filePathOne, data);
  644.             }
  645.             else
  646.             {
  647.                 // http://www.nbd.com.cn/newShow.asp?D_ID=44860
  648.                 for (int i = 2; i < U.Length - 1; i++)
  649.                 {
  650.                     //string a = codeIt.DirCN2CODE(U[i]);  //对路径进行编码  使之符合规范
  651.                     string a = U[i];  //对路径进行编码  使之符合规范
  652.                     tmpDir = tmpDir + "\" + a;
  653.                     if (System.IO.Directory.Exists(tmpDir) == false)
  654.                     {
  655.                         System.IO.Directory.CreateDirectory(tmpDir);
  656.                     }
  657.                 }
  658.                 // 只对最后的文件名进行编码
  659.                 //得到文件路径               最后加上附加的地址部分 刚才因为 = 号 拆开的
  660.                 string filePathOne = tmpDir + "\" + codeIt.DirCN2CODE(U[U.Length - 1] +"?"+ urlAdd);
  661.                 if (U[U.Length - 1].Length == 0)
  662.                 {
  663.                     filePathOne = tmpDir + "\WEBMAINPAGE.HTM";
  664.                 }
  665.                 //保存数据
  666.                 putFileData(filePathOne, data);
  667.             }
  668.         }
  669.         */
  /// <summary>
  /// Scans an HTML document for HREF attributes and pushes every link that
  /// survives the extension, length and same-site filters into the URL database.
  /// </summary>
  /// <param name="HData">HTML text to scan.</param>
  /// <param name="SourceUrl">
  /// URL of the page being scanned; it is handed to <c>myClassHTML.Data2Url</c>
  /// together with each HREF value (presumably to resolve relative links - confirm
  /// against that helper).
  /// </param>
  private void GetAddUrl(string HData, string SourceUrl)  // data, current URL
  {
      // A candidate URL is kept only when MinUrlLength < length < MaxUrlLength.
      // "http://" alone is 7 characters, so the lower bound rejects bare prefixes.
      const int MinUrlLength = 7;
      const int MaxUrlLength = 160;

      HTMParse.ParseHTML parse = new HTMParse.ParseHTML();
      parse.Source = HData;

      while (!parse.Eof())
      {
          // Only a zero result is followed by a tag lookup
          // (presumably 0 means "a tag was parsed" - confirm in HTMParse).
          char ch = parse.Parse();
          if (ch != 0)
          {
              continue;
          }

          HTMParse.Attribute hrefAttribute = parse.GetTag()["HREF"];
          if (hrefAttribute == null)
          {
              continue;
          }

          // Guard against an HREF attribute that carries no value.
          object rawValue = hrefAttribute.Value;
          if (rawValue == null)
          {
              continue;
          }

          // NOTE(review): lower-casing the whole link also lower-cases its path and
          // query; kept because previously stored URLs were produced the same way.
          string href = rawValue.ToString().Trim().ToLower();

          // Skip empty links and links whose extension is on the exclusion list.
          if (href.Length == 0 || !isOKFile(href))
          {
              continue;
          }

          string candidate = myClassHTML.Data2Url(SourceUrl, href);
          if (candidate == null)
          {
              continue;
          }
          candidate = candidate.Trim();

          if (candidate.Length <= MinUrlLength || candidate.Length >= MaxUrlLength)
          {
              continue;
          }

          // Restrict the crawl to sites that appear in the source URL list.
          if (!jisuansourceUrls(candidate))
          {
              continue;
          }

          try
          {
              newUrlDBClass.putOneUrl(candidate);
          }
          catch (Exception)
          {
              // Best effort: a failed insert must not abort the scan of the page.
              Console.Write("->E");
          }
      }
  }
  /// <summary>
  /// Decides whether a URL should be fetched, judging by its file extension.
  /// URLs ending in one of the extensions listed in <c>FILEEX</c> are rejected.
  /// </summary>
  /// <param name="url">URL (or HREF value) to examine.</param>
  /// <returns>
  /// <c>true</c> when the URL contains a '?' after its first character, is shorter
  /// than four characters, or does not end with "." plus an excluded extension;
  /// <c>false</c> when it ends with an excluded extension.
  /// </returns>
  private bool isOKFile(string url)
  {
      // A URL with a query string is always accepted.
      if (url.IndexOf('?') > 0)
      {
          return true;
      }

      // Too short to be judged by extension; accepted as before.
      if (url.Length < 4)
      {
          return true;
      }

      foreach (string extension in FILEEX)
      {
          // Compare against the whole tail of the URL so that extensions longer
          // than three letters are matched as well; ordinal and case-insensitive
          // so the result does not depend on the current culture.
          if (url.EndsWith("." + extension, StringComparison.OrdinalIgnoreCase))
          {
              return false;
          }
      }

      return true;
  }
  /// <summary>
  /// Passes a string through the GB2312 code page: the text is encoded as UTF-8,
  /// those bytes are converted to GB2312, and the GB2312 bytes are decoded back
  /// into a string.
  /// </summary>
  /// <param name="data">Text to convert.</param>
  /// <returns>The string obtained by decoding the GB2312 bytes.</returns>
  private string Str2Str(string data)
  {
      Encoding sourceEncoding = Encoding.UTF8;
      Encoding targetEncoding = Encoding.GetEncoding("gb2312");

      // UTF-8 bytes of the input, re-encoded as GB2312 bytes.
      byte[] utf8Bytes = sourceEncoding.GetBytes(data);
      byte[] gb2312Bytes = Encoding.Convert(sourceEncoding, targetEncoding, utf8Bytes);

      // Decode the GB2312 bytes back into text.
      return targetEncoding.GetString(gb2312Bytes);
  }
  /// <summary>
  /// Computes the MD5 name of a URL: the MD5 digest of the URL's UTF-8 bytes,
  /// written as 32 upper-case hexadecimal characters.
  /// </summary>
  /// <param name="url">URL to hash.</param>
  /// <returns>Upper-case hexadecimal MD5 digest of <paramref name="url"/>.</returns>
  private string getMD5name(string url)
  {
      // Computed directly with the cryptography classes instead of the obsolete
      // FormsAuthentication.HashPasswordForStoringInConfigFile(url, "md5"); the
      // output format (UTF-8 input, upper-case hex) is intended to match it.
      byte[] urlBytes = System.Text.Encoding.UTF8.GetBytes(url);

      using (System.Security.Cryptography.MD5 md5 = System.Security.Cryptography.MD5.Create())
      {
          byte[] digest = md5.ComputeHash(urlBytes);

          // BitConverter yields "D4-1D-8C-..."; drop the separators.
          return BitConverter.ToString(digest).Replace("-", "");
      }
  }
  /// <summary>
  /// Checks whether a URL belongs to the same site as one of the entries in
  /// the source URL list.
  /// </summary>
  /// <param name="nowurl">Absolute URL to test, e.g. http://ee/</param>
  /// <returns>
  /// <c>true</c> when some source URL starts with "http://" + host + "/" of
  /// <paramref name="nowurl"/> (ignoring case); <c>false</c> otherwise, including
  /// when <paramref name="nowurl"/> is null, empty or does not start with "http://".
  /// </returns>
  private bool jisuansourceUrls(string nowurl)
  {
      const string HttpPrefix = "http://";

      if (string.IsNullOrEmpty(nowurl) || !nowurl.StartsWith(HttpPrefix, StringComparison.Ordinal))
      {
          return false;
      }

      // "http://host/..." splits into "http:", "", "host", ... so index 2 is the
      // host; the prefix check above guarantees at least three segments.
      string[] segments = nowurl.Split('/');
      string siteRoot = HttpPrefix + segments[2] + "/";

      foreach (string sourceUrl in sourceUrls)
      {
          // Ordinal, case-insensitive prefix test: no per-iteration lower-casing
          // and no dependence on the current culture.
          if (sourceUrl != null && sourceUrl.StartsWith(siteRoot, StringComparison.OrdinalIgnoreCase))
          {
              return true;
          }
      }

      return false;
  }
  /// <summary>
  /// Decides whether the data fetched for a URL should be saved. When filtering
  /// is switched off the answer is always true; otherwise the page is recorded
  /// as fetched, handed to the direct index, and false is returned.
  /// </summary>
  /// <param name="url">URL the data was downloaded from.</param>
  /// <param name="htmldat">Downloaded HTML of that URL.</param>
  /// <returns>
  /// <c>true</c> only when <c>Config.xlDirectIndex</c> contains the character '0'
  /// (no filtering); <c>false</c> in every other case.
  /// </returns>
  private bool IsSaveData(string url,string htmldat)
  {
      // No filtering configured: always save.
      bool filteringDisabled = XunLong.CongifData.Config.xlDirectIndex.IndexOf('0') >= 0;
      if (filteringDisabled)
      {
          return true;
      }

      // Both lengths are read before deciding, as in the non-short-circuit original.
      bool urlIsEmpty = url.Length == 0;
      bool htmlIsEmpty = htmldat.Length == 0;
      if (urlIsEmpty || htmlIsEmpty)
      {
          return false;
      }

      // Hold this spider back while the index copy is in progress.
      while (IsRunIndexWrite)
      {
          System.Threading.Thread.Sleep(5000);
      }

      // Record the URL in the list of already fetched URLs.
      newUrlDBClass.add_Had_Url(XunLong.CongifData.Config.Had_Url_Data, url);

      // Match against the templates and push the result into the index.
      int matchCount = XunLong.clsDirectIndex.DirectIndex.IndexOneData(url, htmldat);

      // A larger count than last time means the index has changed.
      if (matchCount > N_B_OLD)
      {
          IndexChang = true;
      }
      N_B_OLD = matchCount;

      Console.WriteLine(" ->匹配成功: " + matchCount.ToString());
      return false;
  }
  848.         /*
  849.         /// <summary>
  850.         /// 把采样队列压入系统
  851.         /// </summary>
  852.         /// <param name="pathcy"></param>
  853.         public void initNEEDURL(string okPath)
  854.         {
  855.             //初始化分词缓存
  856.             neeDurl.Clear();
  857.             StreamReader reader = null;
  858.             try
  859.             {
  860.                 reader = new StreamReader(okPath, System.Text.Encoding.GetEncoding("gb2312"));
  861.                 for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
  862.                 {
  863.                     if (line != null)
  864.                     {
  865.                         if (line.Length == 0)
  866.                         { }
  867.                         else
  868.                         {
  869.                             if (neeDurl.Contains(line) == false)
  870.                             {
  871.                                 neeDurl.Add(line);
  872.                             }
  873.                             else
  874.                             {
  875.                                 int u_w = 0;
  876.                             }
  877.                         }
  878.                     }
  879.                 }
  880.                 reader.Close();
  881.             }
  882.             catch (IOException e)
  883.             {
  884.                 Console.WriteLine(e.Message);
  885.             }
  886.             finally
  887.             {
  888.                 if (reader != null)
  889.                     reader.Close();
  890.             }
  891.             ArrayList tmp_needurl = new ArrayList();
  892.             tmp_needurl.Clear();
  893.             foreach (string a in neeDurl)
  894.             {
  895.                 if (a.IndexOf('?') > 0)
  896.                 {
  897.                     tmp_needurl.Add(a);
  898.                 }
  899.             }
  900.             foreach (string a in neeDurl)
  901.             {
  902.                 if (a.IndexOf('?') == -1)
  903.                 {
  904.                     tmp_needurl.Add(a);
  905.                 }
  906.             }
  907.             neeDurl = tmp_needurl;
  908.             Console.WriteLine(" 共有 " + neeDurl.Count.ToString() +" 条采样数据 ");
  909.         }
  910.         /// <summary>
  911.         /// 写入一个数据
  912.         /// </summary>
  913.         /// <param name="filename">文件名</param>
  914.         /// <param name="data">数据</param>
  915.         /// <param name="isApp">是否追加模式</param>
  916.         public void w1_w(string okPath, string data)
  917.         {
  918.             StreamWriter writer = null;
  919.             try
  920.             {
  921.                 writer = new StreamWriter(okPath, true, System.Text.Encoding.GetEncoding("gb2312"));
  922.                 //  writer.Write(data);
  923.                 writer.WriteLine(data);
  924.                 writer.Close();
  925.             }
  926.             catch (IOException e)
  927.             {
  928.                 Console.WriteLine(e.Message);
  929.             }
  930.             finally
  931.             {
  932.                 if (writer != null)
  933.                     writer.Close();
  934.             }
  935.         }
  936.         /// <summary>
  937.         /// 读取1个文件 压入系统
  938.         /// </summary>
  939.         /// <param name="okPath"></param>
  940.         public void w2_w(string okPath)
  941.         {
  942.             StreamReader reader = null;
  943.             try
  944.             {
  945.                 reader = new StreamReader(okPath, System.Text.Encoding.GetEncoding("gb2312"));
  946.                 for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
  947.                 {
  948.                     if (line != null)
  949.                     {
  950.                         if (line.Length == 0)
  951.                         { }
  952.                         else
  953.                         {
  954.                            // if (olgurl.Contains(line) == false)
  955.                            // {
  956.                            //     olgurl.Add(line);
  957.                           //  }
  958.                           //  else
  959.                           //  {
  960.                           //      int u_w = 0;
  961.                           //  }
  962.                         }
  963.                     }
  964.                 }
  965.                 reader.Close();
  966.             }
  967.             catch (IOException e)
  968.             {
  969.                 Console.WriteLine(e.Message);
  970.             }
  971.             finally
  972.             {
  973.                 if (reader != null)
  974.                     reader.Close();
  975.             }
  976.         }
  977.         */
  /// <summary>
  /// Replaces the files of one index directory with the files of another:
  /// every file directly inside <paramref name="IndexData"/> is deleted, then
  /// every file directly inside <paramref name="IndexData2"/> is copied into it.
  /// Sub-directories are not touched.
  /// </summary>
  /// <param name="IndexData">Destination directory (emptied, then filled).</param>
  /// <param name="IndexData2">Source directory (copied from).</param>
  /// <exception cref="DirectoryNotFoundException">
  /// The source directory does not exist (raised before anything is deleted),
  /// or the destination directory does not exist.
  /// </exception>
  private void  CopyIndexData(string  IndexData,string  IndexData2)
  {
      DirectoryInfo sourceDir = new DirectoryInfo(IndexData2);

      // Validate the source first so a wrong path cannot leave the destination
      // wiped with nothing to refill it.
      if (!sourceDir.Exists)
      {
          throw new DirectoryNotFoundException(IndexData2);
      }

      DirectoryInfo targetDir = new DirectoryInfo(IndexData);
      foreach (FileInfo staleFile in targetDir.GetFiles("*"))
      {
          System.IO.File.Delete(staleFile.FullName);
      }

      foreach (FileInfo sourceFile in sourceDir.GetFiles("*"))
      {
          // Path.Combine inserts the directory separator itself, so no separator
          // literal has to be spelled out here.
          System.IO.File.Copy(sourceFile.FullName, Path.Combine(IndexData, sourceFile.Name), true);
      }
  }
  997.         
  998.     }
  999. }