ClassSpiderMain.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:41k
- using System;
- using System.IO;
- using System.Collections.Generic;
- using System.Text;
- using System.Net;
- using System.Net.Sockets;
- using System.Threading;
- using System.Collections;
- /*
- ' 迅龙中文分类搜索引擎 v0.6
- '
- ' LGPL 许可发行
- '
- ' 宁夏大学 张冬 康彩 zd4004@163.com
- '
- ' 官网 http://blog.163.com/zd4004/
- */
- namespace XunLong.ConsoleSpiderOne
- {
- class ClassSpiderMain
- {
- /// <summary>
- /// Set when the index has changed (IsSaveData sets it; ZhiZhuRun checks and clears it).
- /// </summary>
- public bool IndexChang = false;
- /// <summary>
- /// True while the index copy is in progress; the spider has to wait during that time.
- /// </summary>
- private bool IsRunIndexWrite = false;
- /// <summary>
- /// Total number of pages downloaded.
- /// </summary>
- public int Num = 0;
- /// <summary>
- /// Number of downloaded pages that were actually stored ("valid" pages).
- /// </summary>
- public int Num_1 = 0;
- /// <summary>
- /// Index entry count seen on the previous IsSaveData call; compared to detect index growth.
- /// </summary>
- private int N_B_OLD = 0;
- /// <summary>
- /// Tick count at the last speed sample; used only for the download-speed display.
- /// </summary>
- private int OneTime_X = 0;
- /// <summary>
- /// Sampling queue (only referenced from commented-out code in this file).
- /// </summary>
- private ArrayList neeDurl = new ArrayList();
- private string OneDir = ""; // only referenced from commented-out code in this file
- string FileDir = ""; // set from Config.SpiderData in StartMain and handed to the storage object
- int YYY = 0; // last speed sample, in tenths of pages per second (100000 / ms per 10 pages)
- /// <summary>
- /// Index storage path (never assigned in the visible code).
- /// </summary>
- string indexPath = "";
- /// <summary>
- /// Source URL list: a URL may be downloaded if it is on the same site as any entry here.
- /// </summary>
- private ArrayList sourceUrls = new ArrayList();
- /// <summary>
- /// Encoder (only referenced from commented-out code in this file).
- /// </summary>
- private NewNxuEncoding.CNewNxuEncoding codeIt = new NewNxuEncoding.CNewNxuEncoding();
- // URL database object: supplies URLs to crawl and accepts newly found ones.
- private XunLong.UrlDBClassLibrary.ClassUrlDB newUrlDBClass = new XunLong.UrlDBClassLibrary.ClassUrlDB();
- /// <summary>
- /// Storage object that downloaded pages are written to.
- /// </summary>
- private NetHashTableAPI.ClassNHT myDB = new NetHashTableAPI.ClassNHT();
- /// <summary>
- /// HTML helper; used in GetAddUrl to turn an HREF value into a full URL.
- /// </summary>
- private XunLong.HtmlClassLibrary.ClassHTML myClassHTML = new XunLong.HtmlClassLibrary.ClassHTML();
- System.Random myRandom = new Random(); // only referenced from commented-out code in this file
- /// <summary>
- /// Tick count of the last periodic re-scan; the interval used is Config.IndexDataTIME minutes.
- /// </summary>
- int iTime = Environment.TickCount;
- /// <summary>
- /// File extensions that are not crawled.
- /// </summary>
- private string[] FILEEX ={"css", "js", "zip", "avi", "rar", "exe", "dat", "png", "jpg", "gif", "mp3","rm","rmvb","doc",
- "xsl","pdf","asf","wav" ,"wmv","mpeg","mp4","txt","gz","tar","torrent","swf","ppt","mdb",
- "iso","bin","dll","obj","svg","xml","mov","pps","ico","iuc","bak","pps","gz"};
- /// <summary>
- /// Configuration file path (set from the StartMain argument).
- /// </summary>
- private string IndexData_Config_Path = "";
-
/// <summary>
/// Creates a spider instance. The constructor does no work of its own;
/// the fields keep their declared initial values.
/// </summary>
public ClassSpiderMain() { }

/// <summary>
/// Whether the spider should keep running.
/// </summary>
public bool IsRun = false;
/// <summary>
/// Prepares the spider for a run: resets the counters, points the URL database at
/// its source and cache files, configures the page store, and, unless the
/// <c>xlDirectIndex</c> setting contains '0', opens the direct index.
/// </summary>
/// <param name="k_c_path">
/// Configuration path. It is remembered in <c>IndexData_Config_Path</c> and passed on
/// to the page store and to <c>DirectIndex.OpenIndex</c>.
/// </param>
public void StartMain(string k_c_path)
{
    IndexData_Config_Path = k_c_path;
    IsRun = true;
    Num = 0;
    Num_1 = 0;

    // Start the speed-sampling window now so the first sample in ZhiZhuRun is not
    // measured from tick 0.
    OneTime_X = Environment.TickCount;

    FileDir = XunLong.CongifData.Config.SpiderData;
    string urlsCache = XunLong.CongifData.Config.UrlCahceData + "c";
    string urlsSource = XunLong.CongifData.Config.UrlSourceData;

    Console.WriteLine(" 源地址 " + urlsSource);
    // Fixed: this used to print urlsSource + "c", which is not the cache file in use.
    Console.WriteLine(" 地址缓存 " + urlsCache);
    // NOTE(review): the label says "index location" but the value is the source-URL
    // file; kept as-is because the intended value cannot be determined from this file.
    Console.WriteLine("索引存放 " + urlsSource);

    newUrlDBClass.urlsCacheFile = urlsCache;
    newUrlDBClass.urlsSourceFile = urlsSource;

    // Load the URL service and take over its list of source URLs.
    newUrlDBClass.StartFirstUrls();
    sourceUrls = newUrlDBClass.SourceURLs;

    // Configure the page store; 3145727 is 3 MiB - 1.
    myDB.SetClassNHT(FileDir, 3145727, k_c_path);

    // Presumably the URLs already held by the store, so they are not fetched again - TODO confirm.
    ArrayList aOLDUrl = myDB.SearchOneList("http://");
    newUrlDBClass.AddOldurlsIOLD(aOLDUrl);

    // Reset the monitored start URLs and restart the periodic-rescan timer.
    newUrlDBClass.ReSetSource();
    iTime = Environment.TickCount;

    // The direct index is only used when the setting does not contain '0'.
    if (XunLong.CongifData.Config.xlDirectIndex.IndexOf('0') < 0)
    {
        newUrlDBClass.Load_Had_Url(XunLong.CongifData.Config.Had_Url_Data);
        XunLong.clsDirectIndex.DirectIndex.OpenIndex(k_c_path);
        XunLong.clsDirectIndex.DirectIndex.InitCnStopWord(XunLong.CongifData.Config.StopWordData);
    }
}
/// <summary>
/// Main crawl loop. Processes one URL per iteration until <c>IsRun</c> becomes false.
/// An exception inside an iteration is reported on the console and the loop goes on
/// with the next iteration.
/// </summary>
public void ZhiZhuRun()
{
    while (IsRun)
    {
        try
        {
            CrawlNextUrl();
        }
        catch (Exception e)
        {
            // Keep the spider alive, but say what went wrong instead of hiding it.
            Console.Write("->E");
            Console.WriteLine(" " + e.Message);
        }
    }
}

/// <summary>
/// One iteration of the crawl loop: periodic maintenance, fetch one URL, store or
/// index the page, update the statistics and queue the links found on the page.
/// </summary>
private void CrawlNextUrl()
{
    RefreshIndexIfDue();

    // Stop may have been requested while the maintenance step was running.
    if (!IsRun)
    {
        return;
    }

    string newUrl = "";
    try
    {
        newUrl = newUrlDBClass.getOneUrl();
    }
    catch
    {
        Console.Write("->E");
    }

    if (string.IsNullOrEmpty(newUrl))
    {
        // Nothing queued right now; wait before asking again.
        System.Threading.Thread.Sleep(1000);
        return;
    }

    Console.WriteLine("开始请求 >> " + newUrl);
    string getDB = GetOneHTML(newUrl, "gb2312");

    // "XL_NULL" is GetOneHTML's marker for "not a text page": nothing to do.
    if (getDB == "XL_NULL")
    {
        return;
    }

    if (getDB.Length == 0)
    {
        RequeueFailedUrl(newUrl);
        return;
    }

    // IsSaveData also feeds the page to the direct index when that is enabled.
    if (IsSaveData(newUrl, getDB) == true)
    {
        if (!TryStorePage(newUrl, getDB))
        {
            Console.Write("->E-R");
            newUrlDBClass.putOneUrl2(newUrl);
            Console.WriteLine(" 写入失败 重新压回 URL数据库 >> " + newUrl);
            return;
        }
        Num_1 = Num_1 + 1;
    }
    else
    {
        Console.WriteLine("通过索引 取消存储!");
    }

    Num = Num + 1;
    if (Num % 10 == 0)
    {
        // 10 pages * 1000 ms * 10 => YYY is in tenths of pages per second.
        int elapsed = Environment.TickCount - OneTime_X;
        if (elapsed > 0)
        {
            // Guard added: elapsed can be 0 (or negative after a tick-count wrap).
            YYY = 100000 / elapsed;
        }
        OneTime_X = Environment.TickCount;
    }
    double pagesPerSecond = (double)YYY / 10;

    Console.WriteLine("总共下载了 " + Num.ToString() + " 条数据 其中过滤得到 " + Num_1.ToString() + " 条 有效数据");
    Console.WriteLine("===================>>>>> 当前速度: " + pagesPerSecond.ToString() + " 条/秒");

    // Extract the links on this page and queue the acceptable ones.
    try
    {
        GetAddUrl(getDB, newUrl);
    }
    catch
    {
        Console.Write("->E");
    }
}

/// <summary>
/// Runs the periodic maintenance once more than Config.IndexDataTIME minutes have
/// passed since the last run: publishes a changed index and resets the source URLs.
/// </summary>
private void RefreshIndexIfDue()
{
    int now = Environment.TickCount;
    if (now - iTime > XunLong.CongifData.Config.IndexDataTIME * 60 * 1000)
    {
        iTime = Environment.TickCount;

        if (IndexChang == true)
        {
            IsRunIndexWrite = true;
            try
            {
                // Close the index, copy the new index data to the search directory, reopen.
                XunLong.clsDirectIndex.DirectIndex.CloseIndex();
                CopyIndexData(XunLong.CongifData.Config.IndexData, XunLong.CongifData.Config.IndexData2);
                XunLong.clsDirectIndex.DirectIndex.OpenIndex(IndexData_Config_Path);
                IndexChang = false;
            }
            finally
            {
                // Always release the flag: IsSaveData waits on it, and leaving it set
                // after a failed copy would block indexing forever.
                IsRunIndexWrite = false;
            }
        }

        // Reset the monitored start URLs.
        newUrlDBClass.ReSetSource();

        Console.WriteLine("*******************************");
        Console.WriteLine("* *");
        Console.WriteLine("* 监控 重新得到数据 *");
        Console.WriteLine("* *");
        Console.WriteLine("*******************************");
    }
}

/// <summary>
/// Writes one page to the store, retrying while the store's <c>lockIt</c> flag is held.
/// </summary>
/// <returns>True once the page was written; false after 201 unsuccessful attempts.</returns>
private bool TryStorePage(string url, string html)
{
    const int maxRetries = 200;

    for (int attempt = 0; attempt <= maxRetries; attempt++)
    {
        try
        {
            if (myDB.lockIt == false)
            {
                myDB.lockIt = true;
                try
                {
                    myDB.add(url, html, true);
                }
                finally
                {
                    // Release even when add() throws; otherwise the flag stays set
                    // and no later write can ever succeed.
                    myDB.lockIt = false;
                }
                return true;
            }
        }
        catch
        {
            Console.Write("->E");
        }

        // Back off for 100-1099 ms before the next attempt.
        System.Threading.Thread.Sleep(DateTime.Now.Millisecond + 100);
    }

    return false;
}

/// <summary>
/// Puts a URL whose download returned no data back into the URL database.
/// </summary>
private void RequeueFailedUrl(string url)
{
    // Only strings longer than "http://" can be real URLs.
    if (url.Length > 7)
    {
        try
        {
            newUrlDBClass.putOneUrl2(url);
            Console.WriteLine(" 请求失败 重新压回 URL数据库 >> " + url);
        }
        catch
        {
            Console.Write("->E");
        }
    }
}
/// <summary>
/// Stops the spider: clears <c>IsRun</c>, saves the URLs still queued to the cache
/// file and, unless the <c>xlDirectIndex</c> setting contains '0', closes the direct index.
/// </summary>
public void StopSpider()
{
    IsRun = false;
    newUrlDBClass.SaveUrlsCache();

    bool settingHasZero = XunLong.CongifData.Config.xlDirectIndex.IndexOf('0') > -1;
    if (!settingHasZero)
    {
        XunLong.clsDirectIndex.DirectIndex.CloseIndex();
    }
}
/// <summary>
/// Downloads one page and returns its text.
/// </summary>
/// <param name="murl">Address of the page.</param>
/// <param name="codeType">
/// Fallback charset used when the Content-Type header names none: a value containing
/// "utf" selects UTF-8, anything else GB2312.
/// </param>
/// <returns>
/// The page text with every line terminated by "\r\n";
/// "XL_NULL" when the content type is not text/*;
/// a small "Too_Long" placeholder page when the declared length is 128 KiB or more,
/// or a "Too_Long2" placeholder when the decoded text exceeds 128K characters;
/// an empty string when the request fails.
/// </returns>
private string GetOneHTML(string murl, string codeType)
{
    const int maxPageChars = 1024 * 128;
    // A body larger than this cannot decode to maxPageChars characters or fewer
    // (no supported encoding needs more than 4 bytes per character), so reading
    // can stop early with the same result as decoding everything.
    const int maxPageBytes = maxPageChars * 4;
    const string tooLongDeclared = "<html><title>Too_Long</title><body>Too_Long</body></html>";
    const string tooLongDecoded = "<html><title>Too_Long2</title><body>Too_Long</body></html>";

    HttpWebRequest request;
    try
    {
        request = (HttpWebRequest)WebRequest.Create(murl);
        request.Timeout = 10000;
    }
    catch (Exception)
    {
        // Malformed address or a non-HTTP scheme.
        Console.Write("->E");
        return "";
    }

    try
    {
        string contentType;
        byte[] rawPage;

        // 'using' closes the response on every path, including the early returns;
        // leaving it open ties up one of the few connections allowed per host.
        using (WebResponse response = request.GetResponse())
        {
            if (response.ContentLength >= maxPageChars)
            {
                return tooLongDeclared;
            }

            contentType = (response.ContentType ?? "").ToLowerInvariant();
            if (!contentType.StartsWith("text/", StringComparison.Ordinal))
            {
                // Not a text page.
                return "XL_NULL";
            }

            using (Stream stream = response.GetResponseStream())
            {
                rawPage = ReadResponseBytes(stream, maxPageBytes);
            }
        }

        if (rawPage == null)
        {
            // Body exceeded the byte limit: report it the same way as an over-long text.
            Console.WriteLine("GUrlData : --> " + murl);
            return tooLongDecoded;
        }

        // Pick the charset: header first, then the caller's fallback.
        bool useUtf8;
        if (contentType.IndexOf("utf-8") > -1 || contentType.IndexOf("unicode") > -1)
        {
            useUtf8 = true;
        }
        else if (contentType.IndexOf("gb2312") > -1 || contentType.IndexOf("gbk") > -1)
        {
            useUtf8 = false;
        }
        else
        {
            useUtf8 = codeType.ToLower().IndexOf("utf") > -1;
        }

        string buffer = DecodePage(rawPage, useUtf8);

        // If the page text names the other charset, decode the same bytes again with
        // that charset (the bytes are kept, so no second download is needed).
        string lowered = buffer.ToLower();
        if (useUtf8 && lowered.IndexOf("gb2312") > -1)
        {
            buffer = DecodePage(rawPage, false);
        }
        else if (!useUtf8 && lowered.IndexOf("utf-8") > -1)
        {
            buffer = DecodePage(rawPage, true);
        }

        if (buffer.Length > 0)
        {
            Console.WriteLine("GUrlData : --> " + murl);
        }

        if (buffer.Length > maxPageChars)
        {
            return tooLongDecoded;
        }

        return buffer;
    }
    catch (Exception)
    {
        request.Abort();
        Console.WriteLine("Err : --> " + murl);
        return "";
    }
}

/// <summary>
/// Reads a stream to its end. Returns null as soon as more than
/// <paramref name="maxBytes"/> bytes have been read.
/// </summary>
private static byte[] ReadResponseBytes(Stream stream, int maxBytes)
{
    using (MemoryStream collected = new MemoryStream())
    {
        byte[] chunk = new byte[8192];
        int read;
        while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
        {
            collected.Write(chunk, 0, read);
            if (collected.Length > maxBytes)
            {
                return null;
            }
        }
        return collected.ToArray();
    }
}

/// <summary>
/// Decodes page bytes as UTF-8 or GB2312, line by line, terminating every line with
/// "\r\n". UTF-8 text is additionally passed through Str2Str.
/// </summary>
private string DecodePage(byte[] rawPage, bool asUtf8)
{
    Encoding encoding = asUtf8 ? Encoding.UTF8 : Encoding.GetEncoding("GB2312");
    StringBuilder text = new StringBuilder();

    using (StreamReader reader = new StreamReader(new MemoryStream(rawPage), encoding))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            // The listing shows "rn" here; the backslashes of "\r\n" were lost.
            text.Append(line).Append("\r\n");
        }
    }

    return asUtf8 ? Str2Str(text.ToString()) : text.ToString();
}
/// <summary>
/// Reads a whole file as GB2312 text.
/// </summary>
/// <param name="filename">Path of the file to read.</param>
/// <returns>
/// The file content, or an empty string when an <see cref="IOException"/> occurs
/// (its message is written to the console).
/// </returns>
public string getFileData(string filename)
{
    try
    {
        using (StreamReader input = new StreamReader(filename, System.Text.Encoding.GetEncoding("gb2312")))
        {
            return input.ReadToEnd();
        }
    }
    catch (IOException readError)
    {
        Console.WriteLine(readError.Message);
        return "";
    }
}
/// <summary>
/// Writes text to a file as GB2312, replacing any existing content.
/// An <see cref="IOException"/> is reported on the console and not rethrown.
/// </summary>
/// <param name="filename">Path of the file to write.</param>
/// <param name="data">Text to write.</param>
public void putFileData(string filename, string data)
{
    try
    {
        // 'using' closes the writer exactly once, inside the try, so an I/O error
        // raised while flushing on close is reported like any other write error.
        using (StreamWriter writer = new StreamWriter(filename, false, System.Text.Encoding.GetEncoding("gb2312")))
        {
            writer.Write(data);
        }
    }
    catch (IOException e)
    {
        Console.WriteLine(e.Message);
    }
}
- /*
- /// <summary>
- /// 把URL拆成路径 写入数据
- /// </summary>
- /// <param name="url"></param>
- /// <param name="data"></param>
- private void WriteWeb2Disk(string url, string data)
- {
- url=url.Trim().ToLower();
- //
- if ((url.IndexOf('/') == -1) | (url.ToLower().IndexOf("http://") == -1))
- {
- return; //URL错误
- }
- // 去掉URL 结尾的 /
- if (url.Substring(url.Length - 1, 1) == "/")
- {
-
- url = url.Substring(0,url.Length-1);
-
- }
- //找到第一个等号 最后的链接中加上
- string urlAdd = "";
- int ddhao = url.IndexOf('?');
- if ( ddhao>-1)
- {
- string urlTmp = url;
- int urlLen = url.Length;
- // url = urlTmp.PadLeft(ddhao - 1);
- // urlAdd = urlTmp.PadRight(urlLen -ddhao);
- url = urlTmp.Substring(0,ddhao - 1);
- urlAdd = urlTmp.Substring(ddhao+1, urlLen - ddhao-1);
- }
-
- if (url.IndexOf('?') > -1)
- {
- int ic = 0;
- }
- string[] U = url.Split('/');
- string tmpDir = FileDir;
- //起始的地址 http://www.qq.com
- if (U.Length == 3)
- {
- //1 建立文件夹
- //string a = tmpDir + "\" + codeIt.DirCN2CODE(U[U.Length - 1]); //对路径进行编码 使之符合规范
- string a = tmpDir + "\" +U[U.Length - 1]; //对路径进行编码 使之符合规范
- if (System.IO.Directory.Exists(tmpDir) == false)
- {
- System.IO.Directory.CreateDirectory(tmpDir);
- }
- if (System.IO.Directory.Exists(a) == false)
- {
- System.IO.Directory.CreateDirectory(a);
- }
- //2 保存文件 为WEBMAINPAGE.HTM
- //得到文件路径
- string filePathOne = a + "\WEBMAINPAGE.HTM" ;
- //保存数据
- putFileData(filePathOne, data);
- }
- else
- {
- // http://www.nbd.com.cn/newShow.asp?D_ID=44860
- for (int i = 2; i < U.Length - 1; i++)
- {
- //string a = codeIt.DirCN2CODE(U[i]); //对路径进行编码 使之符合规范
- string a = U[i]; //对路径进行编码 使之符合规范
- tmpDir = tmpDir + "\" + a;
- if (System.IO.Directory.Exists(tmpDir) == false)
- {
- System.IO.Directory.CreateDirectory(tmpDir);
- }
- }
- // 只对最后的文件名进行编码
- //得到文件路径 最后加上附加的地址部分 刚才因为 = 号 拆开的
- string filePathOne = tmpDir + "\" + codeIt.DirCN2CODE(U[U.Length - 1] +"?"+ urlAdd);
- if (U[U.Length - 1].Length == 0)
- {
- filePathOne = tmpDir + "\WEBMAINPAGE.HTM";
- }
- //保存数据
- putFileData(filePathOne, data);
- }
- }
- */
/// <summary>
/// Scans a page for HREF attributes and queues every link that passes the filters:
/// allowed extension, resolved length between 8 and 159 characters, and same site
/// as one of the source URLs.
/// </summary>
/// <param name="HData">Text of the page.</param>
/// <param name="SourceUrl">Address the page was downloaded from; used to resolve links.</param>
private void GetAddUrl(string HData, string SourceUrl)
{
    HTMParse.ParseHTML parse = new HTMParse.ParseHTML();
    parse.Source = HData;

    while (!parse.Eof())
    {
        char ch = parse.Parse();
        if (ch != 0)
        {
            // Only a 0 result is followed by a tag lookup - presumably 0 means "a tag was read".
            continue;
        }

        HTMParse.Attribute a = parse.GetTag()["HREF"];
        if (a == null)
        {
            continue;
        }

        // NOTE(review): lower-casing the whole link also lower-cases its path, which
        // can break links on case-sensitive servers; kept because the URL database
        // may rely on it for de-duplication.
        string href = a.Value.ToString().Trim().ToLower();
        if (href.Length == 0 || isOKFile(href) == false)
        {
            continue;
        }

        string absolute = myClassHTML.Data2Url(SourceUrl, href).Trim();

        // Must be longer than "http://" and shorter than 160 characters.
        if (absolute.Length <= 7 || absolute.Length >= 160)
        {
            continue;
        }

        // Stay on the sites listed as sources.
        if (jisuansourceUrls(absolute) != true)
        {
            continue;
        }

        try
        {
            newUrlDBClass.putOneUrl(absolute);
        }
        catch
        {
            Console.Write("->E");
        }
    }
}
/// <summary>
/// Decides from the file extension whether a link is worth fetching. Links ending in
/// one of the <c>FILEEX</c> extensions (css, js, zip, avi, ...) are rejected.
/// </summary>
/// <param name="url">Link to check.</param>
/// <returns>
/// True when the link may be fetched: it has a '?' after its first character, it is
/// shorter than 4 characters, or it does not end in a listed extension.
/// </returns>
private bool isOKFile(string url)
{
    // Links with a query string are always accepted.
    if (url.IndexOf('?') > 0)
    {
        return true;
    }

    if (url.Length < 4)
    {
        return true;
    }

    // Compare against the whole tail of the link. Looking only at the last four
    // characters, as before, could never match extensions of four or more letters
    // such as "rmvb", "mpeg" or "torrent".
    foreach (string extension in FILEEX)
    {
        if (url.EndsWith("." + extension, StringComparison.OrdinalIgnoreCase))
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Re-encodes text: the string is turned into UTF-8 bytes, those bytes are converted
/// to GB2312, and the GB2312 bytes are decoded back into a string.
/// </summary>
/// <param name="data">Text to convert.</param>
/// <returns>The text after the round trip through GB2312.</returns>
private string Str2Str(string data)
{
    Encoding source = Encoding.UTF8;
    Encoding target = Encoding.GetEncoding("gb2312");

    byte[] targetBytes = Encoding.Convert(source, target, source.GetBytes(data));
    return target.GetString(targetBytes);
}
/// <summary>
/// Returns the MD5 hash of a URL as 32 upper-case hexadecimal characters.
/// </summary>
/// <remarks>
/// Computed over the UTF-8 bytes of the URL with System.Security.Cryptography, which
/// gives the same text as the obsolete
/// FormsAuthentication.HashPasswordForStoringInConfigFile(url, "md5") without
/// requiring System.Web.
/// </remarks>
/// <param name="url">URL to hash.</param>
/// <returns>Upper-case hexadecimal MD5 digest.</returns>
private string getMD5name(string url)
{
    byte[] digest;
    using (System.Security.Cryptography.MD5 md5 = System.Security.Cryptography.MD5.Create())
    {
        digest = md5.ComputeHash(Encoding.UTF8.GetBytes(url));
    }

    StringBuilder hex = new StringBuilder(digest.Length * 2);
    foreach (byte b in digest)
    {
        hex.Append(b.ToString("X2"));
    }
    return hex.ToString();
}
/// <summary>
/// Checks whether a URL is on the same site as one of the source URLs.
/// </summary>
/// <param name="nowurl">Absolute URL, e.g. http://ee/</param>
/// <returns>
/// True when some entry of <c>sourceUrls</c> starts with "http://host/" of
/// <paramref name="nowurl"/> (ignoring case); false otherwise, and always false when
/// the URL does not start with "http://".
/// </returns>
private bool jisuansourceUrls(string nowurl)
{
    const string scheme = "http://";

    if (!nowurl.StartsWith(scheme, StringComparison.Ordinal))
    {
        return false;
    }

    if (sourceUrls == null)
    {
        return false;
    }

    // Cut out the host and build "http://host/" once, outside the loop.
    int hostEnd = nowurl.IndexOf('/', scheme.Length);
    string host = hostEnd < 0
        ? nowurl.Substring(scheme.Length)
        : nowurl.Substring(scheme.Length, hostEnd - scheme.Length);
    string siteRoot = scheme + host + "/";

    foreach (object entry in sourceUrls)
    {
        string source = entry as string;
        if (source == null)
        {
            continue;
        }

        if (source.StartsWith(siteRoot, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }

        // A source written without the trailing slash ("http://host") is the same site.
        if (source.Length == siteRoot.Length - 1
            && siteRoot.StartsWith(source, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Decides whether a downloaded page has to be written to the page store.
/// When the <c>xlDirectIndex</c> setting contains '0' the answer is always true.
/// Otherwise the page is recorded as fetched, handed to the direct index, and the
/// answer is false.
/// </summary>
/// <param name="url">Address of the page.</param>
/// <param name="htmldat">Text of the page.</param>
/// <returns>True when the caller should store the page; false otherwise.</returns>
private bool IsSaveData(string url, string htmldat)
{
    bool settingHasZero = XunLong.CongifData.Config.xlDirectIndex.IndexOf('0') > -1;
    if (settingHasZero)
    {
        // No filtering requested: always store.
        return true;
    }

    bool urlMissing = url.Length == 0;
    bool pageMissing = htmldat.Length == 0;
    if (urlMissing || pageMissing)
    {
        return false;
    }

    // Wait while the index files are being copied.
    while (IsRunIndexWrite)
    {
        System.Threading.Thread.Sleep(5000);
    }

    // Remember this URL in the list of URLs already fetched.
    newUrlDBClass.add_Had_Url(XunLong.CongifData.Config.Had_Url_Data, url);

    // Match the page against the templates and add it to the index.
    int indexedCount = XunLong.clsDirectIndex.DirectIndex.IndexOneData(url, htmldat);
    if (indexedCount > N_B_OLD)
    {
        IndexChang = true;
    }
    N_B_OLD = indexedCount;

    Console.WriteLine(" ->匹配成功: " + indexedCount.ToString());
    return false;
}
- /*
- /// <summary>
- /// 把采样队列压入系统
- /// </summary>
- /// <param name="pathcy"></param>
- public void initNEEDURL(string okPath)
- {
- //初始化分词缓存
- neeDurl.Clear();
- StreamReader reader = null;
- try
- {
- reader = new StreamReader(okPath, System.Text.Encoding.GetEncoding("gb2312"));
- for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
- {
- if (line != null)
- {
- if (line.Length == 0)
- { }
- else
- {
- if (neeDurl.Contains(line) == false)
- {
- neeDurl.Add(line);
- }
- else
- {
- int u_w = 0;
- }
- }
- }
- }
- reader.Close();
- }
- catch (IOException e)
- {
- Console.WriteLine(e.Message);
- }
- finally
- {
- if (reader != null)
- reader.Close();
- }
- ArrayList tmp_needurl = new ArrayList();
- tmp_needurl.Clear();
- foreach (string a in neeDurl)
- {
- if (a.IndexOf('?') > 0)
- {
- tmp_needurl.Add(a);
- }
- }
- foreach (string a in neeDurl)
- {
- if (a.IndexOf('?') == -1)
- {
- tmp_needurl.Add(a);
- }
- }
- neeDurl = tmp_needurl;
- Console.WriteLine(" 共有 " + neeDurl.Count.ToString() +" 条采样数据 ");
- }
- /// <summary>
- /// 写入一个数据
- /// </summary>
- /// <param name="filename">文件名</param>
- /// <param name="data">数据</param>
- /// <param name="isApp">是否追加模式</param>
- public void w1_w(string okPath, string data)
- {
- StreamWriter writer = null;
- try
- {
- writer = new StreamWriter(okPath, true, System.Text.Encoding.GetEncoding("gb2312"));
- // writer.Write(data);
- writer.WriteLine(data);
- writer.Close();
- }
- catch (IOException e)
- {
- Console.WriteLine(e.Message);
- }
- finally
- {
- if (writer != null)
- writer.Close();
- }
- }
- /// <summary>
- /// 读取1个文件 压入系统
- /// </summary>
- /// <param name="okPath"></param>
- public void w2_w(string okPath)
- {
- StreamReader reader = null;
- try
- {
- reader = new StreamReader(okPath, System.Text.Encoding.GetEncoding("gb2312"));
- for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
- {
- if (line != null)
- {
- if (line.Length == 0)
- { }
- else
- {
- // if (olgurl.Contains(line) == false)
- // {
- // olgurl.Add(line);
- // }
- // else
- // {
- // int u_w = 0;
- // }
- }
- }
- }
- reader.Close();
- }
- catch (IOException e)
- {
- Console.WriteLine(e.Message);
- }
- finally
- {
- if (reader != null)
- reader.Close();
- }
- }
- */
/// <summary>
/// Replaces the files in one index directory with the files of another: every file
/// directly inside <paramref name="IndexData"/> is deleted, then every file directly
/// inside <paramref name="IndexData2"/> is copied into it. Sub-directories are not touched.
/// </summary>
/// <param name="IndexData">Destination directory (emptied first).</param>
/// <param name="IndexData2">Directory the files are copied from.</param>
/// <exception cref="DirectoryNotFoundException">The directory to copy from does not exist.</exception>
private void CopyIndexData(string IndexData, string IndexData2)
{
    // Check the source before deleting anything, so a missing source cannot leave
    // the destination empty.
    DirectoryInfo source = new DirectoryInfo(IndexData2);
    if (!source.Exists)
    {
        throw new DirectoryNotFoundException("Index source directory not found: " + IndexData2);
    }

    DirectoryInfo destination = new DirectoryInfo(IndexData);
    if (!destination.Exists)
    {
        destination.Create();
    }

    foreach (FileInfo stale in destination.GetFiles("*"))
    {
        File.Delete(stale.FullName);
    }

    foreach (FileInfo file in source.GetFiles("*"))
    {
        // Path.Combine replaces the hand-built separator, whose literal is broken in this listing.
        File.Copy(file.FullName, Path.Combine(IndexData, file.Name));
    }
}
-
- }
- }