ClassUrlDB.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:26k
- using System;
- using System.IO;
- using System.Collections.Generic;
- using System.Text;
- using System.Collections;
- /*
- ' 迅龙中文分类搜索引擎 v0.6
- '
- ' LGPL 许可发行
- '
- ' 宁夏大学 张冬 康彩 zd4004@163.com
- '
- ' 官网 http://blog.163.com/zd4004/
- */
namespace XunLong.UrlDBClassLibrary
{
    /// <summary>
    /// URL database for the crawler: keeps the queue of URLs waiting to be fetched,
    /// remembers (as MD5 digests) which URLs were already handed out, and persists
    /// the queue to / restores it from plain-text files (one URL per line).
    /// </summary>
    /// <remarks>
    /// Every public method takes a private lock, so one instance may be shared by
    /// several worker threads. The public fields are not protected by that lock.
    /// </remarks>
    public class ClassUrlDB
    {
        /// <summary>URLs longer than this (after normalization) are rejected.</summary>
        private const int MaxUrlLength = 180;

        /// <summary>URLs shorter than this are rejected ("http://" alone is 7 characters).</summary>
        private const int MinUrlLength = 7;

        /// <summary>Number of random probes <see cref="getOneUrl2"/> makes before falling back to a linear scan.</summary>
        private const int MaxPickAttempts = 30;

        /// <summary>How many times a failed URL may be pushed back through <see cref="putOneUrl2"/>.</summary>
        private const int MaxRequeueCount = 3;

        /// <summary>
        /// A URL containing any of these fragments (anywhere in the string) is never queued:
        /// HTML tag residue, print views, anchors, non-HTTP pseudo schemes, embedded blanks
        /// and a list of non-HTML file extensions.
        /// </summary>
        private static readonly string[] RejectedFragments = new string[]
        {
            "</", "/>",
            "/print", "#", "javascript:", " ", "mailto:",
            // NOTE(review): ".xsl" is what the original list contained; ".xls" may have been intended.
            ".css", ".zip", ".rar", ".doc", ".pdf", ".ppt", ".xsl",
            ".jpg", ".png", ".gif", ".rmvb", ".rm", ".dat"
        };

        /// <summary>Guards all mutable state of this instance.</summary>
        private readonly object syncRoot = new object();

        /// <summary>
        /// Ring buffer with the most recently issued URLs; <see cref="getOneUrl2"/> avoids
        /// issuing another URL from a host that is still in this buffer.
        /// </summary>
        private readonly string[] oldUrlIt = new string[24];

        /// <summary>Next write position inside <see cref="oldUrlIt"/>.</summary>
        private int oldUrlNum = 0;

        private readonly Random myRandom = new Random();

        /// <summary>Seed (source) URLs read from <see cref="urlsSourceFile"/>.</summary>
        public ArrayList SourceURLs = new ArrayList();

        /// <summary>Path of the file the pending queue is saved to and restored from.</summary>
        public string urlsCacheFile = "";

        /// <summary>Path of the file that lists the seed URLs.</summary>
        public string urlsSourceFile = "";

        /// <summary>
        /// URLs that were crawled but are not needed. This class never reads or assigns
        /// the field; it stays null until a caller sets it.
        /// </summary>
        public ArrayList NoNeedDataURL;

        /// <summary>Pending queue of URLs waiting to be issued.</summary>
        private readonly List<string> urls = new List<string>();

        /// <summary>MD5 digests of the (normalized) URLs that were already issued or are known to be fetched.</summary>
        private readonly HashSet<string> urlsIOLD = new HashSet<string>();

        /// <summary>Normalized URL -> number of times it was pushed back after a failure.</summary>
        private readonly Dictionary<string, int> errUrl = new Dictionary<string, int>();

        /// <summary>Creates an empty URL database; set the file path fields before loading or saving.</summary>
        public ClassUrlDB()
        {
        }

        /// <summary>
        /// Truncates the URL cache file if it exists. The in-memory queue is not touched.
        /// </summary>
        public void ClearUrls()
        {
            lock (syncRoot)
            {
                if (File.Exists(urlsCacheFile))
                {
                    // CreateText truncates an existing file; dispose the writer right away
                    // so the handle is released and the file is not left locked.
                    using (StreamWriter emptied = File.CreateText(urlsCacheFile))
                    {
                    }
                }
                Console.WriteLine("Url缓存清除成功 >> !");
            }
        }

        /// <summary>
        /// Appends the URLs stored in <see cref="urlsCacheFile"/> to the pending queue.
        /// Empty lines and lines starting with "//" are skipped. I/O errors are written
        /// to the console and otherwise ignored.
        /// </summary>
        public void StartUrls()
        {
            lock (syncRoot)
            {
                int loaded = 0;
                Console.WriteLine("-> 0");
                try
                {
                    using (StreamReader reader = new StreamReader(urlsCacheFile))
                    {
                        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                        {
                            if (line.Length == 0 || line.StartsWith("//", StringComparison.Ordinal))
                            {
                                continue;
                            }

                            urls.Add(line);
                            loaded = loaded + 1;
                            if (loaded % 2000 == 1)
                            {
                                // progress indicator for large cache files
                                Console.Write(" ->" + loaded.ToString());
                            }
                        }
                    }
                }
                catch (IOException e)
                {
                    Console.WriteLine(e.Message);
                }
                Console.WriteLine(" 读取缓存完成 URL条目数量 >> " + urls.Count.ToString());
            }
        }

        /// <summary>
        /// Resets the queue, loads the seed URLs from <see cref="urlsSourceFile"/> into both
        /// the queue and <see cref="SourceURLs"/>, then appends the cached queue via
        /// <see cref="StartUrls"/>. Only lines that contain "http://" and do not start
        /// with "//" are accepted as seeds.
        /// </summary>
        public void StartFirstUrls()
        {
            lock (syncRoot)
            {
                urls.Clear();
                SourceURLs.Clear();
                try
                {
                    using (StreamReader reader = new StreamReader(urlsSourceFile))
                    {
                        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                        {
                            bool usable = line.Length > 0
                                && !line.StartsWith("//", StringComparison.Ordinal)
                                && line.IndexOf("http://", StringComparison.Ordinal) > -1
                                && !urls.Contains(line);
                            if (usable)
                            {
                                urls.Add(line);
                                SourceURLs.Add(line);
                            }
                        }
                    }
                }
                catch (IOException e)
                {
                    Console.WriteLine(e.Message);
                }

                // then add whatever was left in the cache from the previous run
                StartUrls();
                Console.WriteLine(" 读取缓存完成 >> 共有 " + urls.Count.ToString() + " 条URL");
            }
        }

        /// <summary>
        /// Writes the pending queue to <see cref="urlsCacheFile"/>, one URL per line,
        /// replacing the previous content. I/O errors are written to the console.
        /// </summary>
        public void SaveUrlsCache()
        {
            lock (syncRoot)
            {
                try
                {
                    using (StreamWriter writer = new StreamWriter(urlsCacheFile))
                    {
                        foreach (string queued in urls)
                        {
                            writer.WriteLine(queued);
                        }
                    }
                    Console.WriteLine("写入成功 >> " + urls.Count.ToString());
                }
                catch (IOException e)
                {
                    Console.WriteLine(e.Message);
                }
            }
        }

        /// <summary>
        /// Queues one URL if it passes the filters. The URL is normalized first (tab, CR
        /// and LF removed, trimmed, lower-cased); it is dropped when it is too long or too
        /// short, lacks "http://", contains a rejected fragment, is already queued, or was
        /// already issued.
        /// </summary>
        /// <param name="oneUrl">The address to queue; null is ignored.</param>
        public void putOneUrl(string oneUrl)
        {
            if (oneUrl == null)
            {
                return;
            }

            lock (syncRoot)
            {
                string url = NormalizeUrl(oneUrl);

                // too long or too short
                if (url.Length > MaxUrlLength || url.Length < MinUrlLength)
                {
                    return;
                }
                if (url.IndexOf("http://", StringComparison.Ordinal) == -1)
                {
                    return;
                }
                foreach (string fragment in RejectedFragments)
                {
                    if (url.IndexOf(fragment, StringComparison.Ordinal) > -1)
                    {
                        return;
                    }
                }
                if (urls.Contains(url) || IsProcessed(url))
                {
                    return;
                }

                urls.Add(url);
            }
        }

        /// <summary>
        /// Pushes a URL whose download failed back into the queue. Its "already issued"
        /// mark is cleared so that <see cref="putOneUrl"/> accepts it again. After
        /// <c>MaxRequeueCount</c> re-queues of the same URL further calls are ignored.
        /// </summary>
        /// <param name="oneUrl">The address to retry; null is ignored.</param>
        public void putOneUrl2(string oneUrl)
        {
            if (oneUrl == null)
            {
                return;
            }

            lock (syncRoot)
            {
                string url = NormalizeUrl(oneUrl);

                int requeued;
                if (!errUrl.TryGetValue(url, out requeued))
                {
                    requeued = 0;
                }
                if (requeued >= MaxRequeueCount)
                {
                    // retried often enough - give up on this URL
                    return;
                }
                errUrl[url] = requeued + 1;

                urlsIOLD.Remove(getMD5name(url));
                putOneUrl(url);
            }
        }

        /// <summary>
        /// Takes one URL out of the queue, preferring a host that differs from the hosts
        /// of the most recently issued URLs. Up to <c>MaxPickAttempts</c> random probes
        /// are made; if none qualifies, the first queued URL that was not issued before
        /// is taken regardless of its host. The returned URL is marked as issued.
        /// </summary>
        /// <returns>The URL, or an empty string when nothing can be issued.</returns>
        public string getOneUrl2()
        {
            lock (syncRoot)
            {
                if (urls.Count == 0)
                {
                    Console.WriteLine("URL库为空!");
                    return "";
                }

                int index = -1;
                for (int attempt = 0; attempt < MaxPickAttempts; attempt++)
                {
                    int probe = myRandom.Next(urls.Count);
                    string candidate = urls[probe];
                    if (candidate.Length > 0 && !itComAll(candidate) && !IsProcessed(candidate))
                    {
                        index = probe;
                        break;
                    }
                }

                if (index < 0)
                {
                    index = FindFirstPendingIndex();
                }
                if (index < 0)
                {
                    return "";
                }

                string chosen = urls[index];
                urls.RemoveAt(index);
                urlsIOLD.Add(ProcessedKey(chosen));
                Now2oldUrl(chosen);
                return chosen;
            }
        }

        /// <summary>
        /// Takes one randomly chosen URL out of the queue and marks it as issued.
        /// No host-spreading is applied (see <see cref="getOneUrl2"/> for that).
        /// </summary>
        /// <returns>The URL, or an empty string when the queue is empty.</returns>
        public string getOneUrl()
        {
            lock (syncRoot)
            {
                if (urls.Count == 0)
                {
                    Console.WriteLine("URL库为空!");
                    return "";
                }

                int index = myRandom.Next(urls.Count);
                string picked = urls[index];
                urls.RemoveAt(index);
                if (picked.Length == 0)
                {
                    return "";
                }

                urlsIOLD.Add(ProcessedKey(picked));
                return picked;
            }
        }

        /// <summary>
        /// Drops queue entries from the front that are empty or already issued and
        /// returns the index of the first entry that may still be issued.
        /// </summary>
        /// <returns>0 when a pending entry exists, -1 when the queue ran empty.</returns>
        private int FindFirstPendingIndex()
        {
            while (urls.Count > 0)
            {
                string candidate = urls[0];
                if (candidate.Length > 0 && !IsProcessed(candidate))
                {
                    return 0;
                }
                // stale entry: it would never be issued, so remove it
                urls.RemoveAt(0);
            }
            return -1;
        }

        /// <summary>
        /// Tells whether two URLs share the same third '/'-separated segment, which is
        /// the host for URLs that start with "http://".
        /// </summary>
        /// <param name="nowUrl">Candidate URL.</param>
        /// <param name="oldTmp">Previously issued URL; may be null (unused ring-buffer slot).</param>
        /// <returns>True when both contain "http://" and the segments are equal.</returns>
        private bool itCom(string nowUrl, string oldTmp)
        {
            if (nowUrl == null || oldTmp == null)
            {
                return false;
            }

            string current = nowUrl.ToLowerInvariant();
            string previous = oldTmp.ToLowerInvariant();
            if (current.IndexOf("http://", StringComparison.Ordinal) == -1
                || previous.IndexOf("http://", StringComparison.Ordinal) == -1)
            {
                return false;
            }

            // "http://" contains two slashes, so Split always yields at least three parts here
            return current.Split('/')[2] == previous.Split('/')[2];
        }

        /// <summary>
        /// Tells whether the URL shares its host with any of the recently issued URLs.
        /// </summary>
        /// <param name="nowUrl">Candidate URL.</param>
        /// <returns>True when a recently issued URL has the same host.</returns>
        private bool itComAll(string nowUrl)
        {
            foreach (string recent in oldUrlIt)
            {
                if (itCom(nowUrl, recent))
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Records a URL in the ring buffer of recently issued URLs, overwriting the
        /// oldest slot once the buffer is full.
        /// </summary>
        /// <param name="nowData">The URL that was just issued.</param>
        private void Now2oldUrl(string nowData)
        {
            if (oldUrlNum >= oldUrlIt.Length)
            {
                oldUrlNum = 0;
            }
            oldUrlIt[oldUrlNum] = nowData;
            oldUrlNum = oldUrlNum + 1;
        }

        /// <summary>
        /// Marks every URL in the list as already fetched so it is not issued again.
        /// </summary>
        /// <param name="a">List of URL strings; null entries are skipped.</param>
        public void AddOldurlsIOLD(ArrayList a)
        {
            lock (syncRoot)
            {
                foreach (string fetched in a)
                {
                    if (fetched != null)
                    {
                        urlsIOLD.Add(ProcessedKey(fetched));
                    }
                }
                Console.WriteLine("压入已取得数据的 " + a.Count.ToString() + " 条URL");
            }
        }

        /// <summary>
        /// Removes tab, carriage-return and line-feed characters, trims surrounding
        /// white space and lower-cases the URL. All queue and history look-ups use
        /// this form.
        /// </summary>
        /// <param name="url">Raw URL; must not be null.</param>
        /// <returns>The normalized URL.</returns>
        private static string NormalizeUrl(string url)
        {
            return url.Replace("\t", "")
                      .Replace("\r", "")
                      .Replace("\n", "")
                      .Trim()
                      .ToLowerInvariant();
        }

        /// <summary>Returns the history key (MD5 of the normalized form) of a URL.</summary>
        private string ProcessedKey(string url)
        {
            return getMD5name(NormalizeUrl(url));
        }

        /// <summary>Tells whether the URL was already issued or registered as fetched.</summary>
        private bool IsProcessed(string url)
        {
            return urlsIOLD.Contains(ProcessedKey(url));
        }

        /// <summary>
        /// Computes the MD5 digest of the UTF-8 encoded text as an upper-case hex string
        /// (32 characters, no separators).
        /// </summary>
        /// <param name="url">Text to hash; must not be null.</param>
        /// <returns>Upper-case hexadecimal MD5 digest.</returns>
        private string getMD5name(string url)
        {
            using (System.Security.Cryptography.MD5 md5 = System.Security.Cryptography.MD5.Create())
            {
                byte[] digest = md5.ComputeHash(Encoding.UTF8.GetBytes(url));
                return BitConverter.ToString(digest).Replace("-", "");
            }
        }

        /// <summary>
        /// Loads a gb2312-encoded file of already fetched URLs (one per line) and marks
        /// each of them as processed. Intended for the direct-indexing mode. I/O errors
        /// are written to the console.
        /// </summary>
        /// <param name="had_url_path">Path of the file to read.</param>
        public void Load_Had_Url(string had_url_path)
        {
            lock (syncRoot)
            {
                try
                {
                    using (StreamReader reader = new StreamReader(had_url_path, Encoding.GetEncoding("gb2312")))
                    {
                        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                        {
                            if (line.Length > 0)
                            {
                                urlsIOLD.Add(ProcessedKey(line));
                            }
                        }
                    }
                }
                catch (IOException e)
                {
                    Console.WriteLine(e.Message);
                }
                Console.WriteLine("已取得数据的 " + urlsIOLD.Count.ToString() + " 条");
            }
        }

        /// <summary>
        /// Marks one URL as fetched and appends it to the gb2312-encoded file of fetched
        /// URLs. Intended for the direct-indexing mode. A failure to write the file is
        /// reported on the console and does not undo the in-memory mark.
        /// </summary>
        /// <param name="path_o">Path of the file to append to.</param>
        /// <param name="had_url">The fetched URL.</param>
        public void add_Had_Url(string path_o, string had_url)
        {
            lock (syncRoot)
            {
                urlsIOLD.Add(ProcessedKey(had_url));
                try
                {
                    using (StreamWriter writer = new StreamWriter(path_o, true, Encoding.GetEncoding("gb2312")))
                    {
                        writer.WriteLine(had_url);
                    }
                }
                catch (Exception e)
                {
                    // Best effort: persisting the record must not stop the crawl,
                    // but the reason is reported instead of being swallowed.
                    Console.WriteLine(e.Message);
                }
            }
        }

        /// <summary>
        /// Makes every seed URL eligible again: clears its "already issued" mark and puts
        /// it back into the queue if it is not there, so key pages are re-crawled.
        /// </summary>
        public void ReSetSource()
        {
            lock (syncRoot)
            {
                foreach (string sourceUrl in SourceURLs)
                {
                    if (sourceUrl == null)
                    {
                        continue;
                    }

                    urlsIOLD.Remove(ProcessedKey(sourceUrl));
                    if (!urls.Contains(sourceUrl))
                    {
                        urls.Add(sourceUrl);
                    }
                }
                Console.WriteLine("源地址重新加载——监控关键性页面");
            }
        }
    }
}