搜索引擎

开发平台：
C#

ClassSpiderMain.cs：源码内容
							using System;
using System.IO;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.Net.Sockets;
using System.Threading;
using System.Collections;
/*
      '       迅龙中文分类搜索引擎  v0.6
      '
      '        LGPL  许可发行
      '
      '       宁夏大学  张冬 康彩  zd4004@163.com
      ' 
      '        官网 http://blog.163.com/zd4004/
 */
namespace XunLong.ConsoleSpiderOne
{
    class ClassSpiderMain
    {
        /// <summary>
        /// True when the index has changed since it was last copied; set by
        /// IsSaveData and cleared by ZhiZhuRun after the index data is copied.
        /// </summary>
        public bool IndexChang = false;
        /// <summary>
        /// True while ZhiZhuRun is closing, copying and reopening the index.
        /// IsSaveData sleeps for as long as this flag is set.
        /// </summary>
        private bool IsRunIndexWrite = false;
        /// <summary>
        /// Total number of pages downloaded (incremented in ZhiZhuRun for every
        /// non-empty text page).
        /// </summary>
        public  int Num = 0;
        /// <summary>
        /// Number of downloaded pages that were actually written to the page store.
        /// </summary>
        public int Num_1 = 0;
        /// <summary>
        /// Previous index entry count; IsSaveData compares the value returned by
        /// DirectIndex.IndexOneData against it to detect index growth.
        /// </summary>
        private int N_B_OLD = 0;
        /// <summary>
        /// Environment.TickCount at the last speed sample (used for the
        /// pages-per-second display).
        /// </summary>
        private int OneTime_X = 0;
        /// <summary>
        /// Sampling queue (not referenced by the code visible in this part of the file).
        /// </summary>
        private ArrayList neeDurl = new ArrayList();
        // Application directory; only referenced from commented-out code in the visible part of the file.
        private  string OneDir = "";
        // Page-store directory; StartMain sets it from Config.SpiderData and hands it to myDB.SetClassNHT.
        string FileDir = "";
        // Ten times the current download rate in pages per second; refreshed by ZhiZhuRun every 10 pages.
        int YYY = 0;
        /// <summary>
        /// Index storage location (never assigned by the code visible here).
        /// </summary>
        string indexPath = "";
        /// <summary>
        /// Source (seed) URL list, taken from newUrlDBClass.SourceURLs in StartMain.
        /// A URL is considered downloadable when it belongs to the same site as
        /// any entry of this list (see jisuansourceUrls).
        /// </summary>
        private ArrayList sourceUrls = new ArrayList();
        /// <summary>
        /// Encoder helper (only referenced from commented-out code in the visible part of the file).
        /// </summary>
        private NewNxuEncoding.CNewNxuEncoding codeIt = new NewNxuEncoding.CNewNxuEncoding();
        // URL database: hands out URLs to crawl and accepts newly discovered or failed ones.
        private XunLong.UrlDBClassLibrary.ClassUrlDB newUrlDBClass = new XunLong.UrlDBClassLibrary.ClassUrlDB();
        /// <summary>
        /// Page store that downloaded pages are written to.
        /// </summary>
        private NetHashTableAPI.ClassNHT myDB = new NetHashTableAPI.ClassNHT();
        /// <summary>
        /// HTML helper; GetAddUrl uses its Data2Url method to resolve links.
        /// </summary>
        private XunLong.HtmlClassLibrary.ClassHTML myClassHTML = new XunLong.HtmlClassLibrary.ClassHTML();
        // Random source; only referenced from commented-out throttling code in the visible part of the file.
        System.Random myRandom = new Random();
        /// <summary>
        /// Environment.TickCount of the last periodic refresh. ZhiZhuRun rescans
        /// the source sites once more than Config.IndexDataTIME minutes have
        /// elapsed since this value.
        /// </summary>
        int iTime = Environment.TickCount;
        /// <summary>
        /// File extensions that must not be crawled (see isOKFile).
        /// NOTE(review): "pps" and "gz" are listed twice.
        /// </summary>
        private string[] FILEEX ={"css",  "js",  "zip",  "avi", "rar",  "exe", "dat",  "png", "jpg", "gif", "mp3","rm","rmvb","doc",
                                   "xsl","pdf","asf","wav" ,"wmv","mpeg","mp4","txt","gz","tar","torrent","swf","ppt","mdb",
                                   "iso","bin","dll","obj","svg","xml","mov","pps","ico","iuc","bak","pps","gz"};
        /// <summary>
        /// Configuration path; set by StartMain and passed to DirectIndex.OpenIndex
        /// when the index is reopened.
        /// </summary>
        private string IndexData_Config_Path = "";
        
        public ClassSpiderMain()
        {
            //print  javas
        
        }
        /// <summary>
        /// Run flag: StartMain sets it to true, StopSpider sets it to false, and
        /// ZhiZhuRun keeps crawling only while it is true.
        /// </summary>
       public  bool IsRun = false;
        /// <summary>
        /// 开始 蜘蛛
        /// </summary>
        public void StartMain( string k_c_path)
        {
            //当前目录
           // OneDir = AppDomain.CurrentDomain.BaseDirectory;
            IndexData_Config_Path = k_c_path;
            IsRun = true;
            Num = 0;
            Num_1 = 0;
         // FileDir = OneDir + "WEB";
            FileDir = XunLong.CongifData.Config.SpiderData;
           // string urlsCache = OneDir + "urlsCache.dat";
          //  string urlsCache = ;
            string urlsCache = XunLong.CongifData.Config.UrlCahceData + "c";
          //  string urlsSource = OneDir + "urlsSource.dat";
            //string urlsSource =;
            string urlsSource = XunLong.CongifData.Config.UrlSourceData;
            Console.WriteLine(" 源地址  " + urlsSource);
            Console.WriteLine(" 地址缓存  " + urlsSource + "c");
          //  indexPath = indexPathX;
            Console.WriteLine("索引存放  " + urlsSource);
            newUrlDBClass.urlsCacheFile = urlsCache;
            newUrlDBClass.urlsSourceFile = urlsSource;
            //加载服务
             newUrlDBClass.StartFirstUrls();  // .StartUrls();
             sourceUrls = newUrlDBClass.SourceURLs;
            // 设定存储类的路径   3M
             myDB.SetClassNHT(FileDir, 3145727, k_c_path);
             ArrayList aOLDUrl = new ArrayList();
             aOLDUrl = myDB.SearchOneList("http://");
             newUrlDBClass.AddOldurlsIOLD(aOLDUrl);
             //清理 开始监控的  url
            newUrlDBClass.ReSetSource();
            iTime = Environment.TickCount;
             if (XunLong.CongifData.Config.xlDirectIndex.IndexOf('0') > -1)
             {
             }
             else
             {
                 newUrlDBClass.Load_Had_Url(XunLong.CongifData.Config.Had_Url_Data);
                 XunLong.clsDirectIndex.DirectIndex.OpenIndex(k_c_path);
                 XunLong.clsDirectIndex.DirectIndex.InitCnStopWord(XunLong.CongifData.Config.StopWordData);
             }
        }
        /// <summary>
        /// 实际运行程序
        /// </summary>
        public void ZhiZhuRun()
        {
        XXXXX:
          
            try
            {
                while (IsRun)
                {
                    int iTime_2 = Environment.TickCount;
                    //定时监控
                    if (iTime_2 - iTime > XunLong.CongifData.Config.IndexDataTIME*60*1000)
                    {
                        iTime = Environment.TickCount;
                        // 1 iTime
                        //判断索引是否改变
                        if (IndexChang == true)
                        {
                            IsRunIndexWrite =true;
                            //关闭索引 
                            XunLong.clsDirectIndex.DirectIndex.CloseIndex();
                            //拷贝新的索引数据到检索目录
                            CopyIndexData(XunLong.CongifData.Config.IndexData, XunLong.CongifData.Config.IndexData2);
                            //打开索引
                            XunLong.clsDirectIndex.DirectIndex.OpenIndex(IndexData_Config_Path);
                            IsRunIndexWrite = false;
                            IndexChang = false;
                        }
                 
                        //清理 开始监控的  url
                        newUrlDBClass.ReSetSource();
          
                        Console.WriteLine("*******************************");
                        Console.WriteLine("*                             *");
                        Console.WriteLine("*        监控 重新得到数据    *");
                        Console.WriteLine("*                             *");
                        Console.WriteLine("*******************************");
                    }
                    
                    
                 //   double xxx = myRandom.NextDouble() * 500;
                //    int x = (int)xxx;
                //    System.Threading.Thread.Sleep(x);
                    //判断是否停止 
                    if (IsRun == false)
                    {
                        return;
                        //停止
                    }
                    //得到1个新的URL
                    string newUrl = "";
                    try
                    {
                        newUrl = newUrlDBClass.getOneUrl();
                    }
                    catch
                    {
                        Console.Write("->E");
                    }
                    if (newUrl.Length == 0)
                    {
                        //URL 任务缓存为 0 
                        System.Threading.Thread.Sleep(1000);
                    }
                    else
                    {
                        Console.WriteLine("开始请求 >> " + newUrl);
                        //取得1个数据
                        string getDB = GetOneHTML(newUrl, "gb2312");
                        //非网页格式 则返回
                        //return "XL_NULL";
                        if (getDB != "XL_NULL")
                        {
                            if (getDB.Length > 0)
                            {   //保存数据
                                // WriteWeb2Disk(newUrl, getDB);
                                //调用存储类   保存数据                         
                                int N_ii = 0;
                                //判断是否直接索引   //判断是否需要存储
                                // 如果需要索引 在此处进行 然后把不能索引的记载下来
                                if (IsSaveData(newUrl,getDB) == true)
                                {
                                    while (true)
                                    {
                                        try
                                        {
                                           
                                            if (myDB.lockIt == false)
                                            {
                                                myDB.lockIt = true;
                                                myDB.add(newUrl, getDB, true);
                                                myDB.lockIt = false;
                                                break;
                                            }
                                            // 延时 在试
                                            System.Threading.Thread.Sleep(DateTime.Now.Millisecond + 100);
                                        }
                                        catch
                                        {
                                            Console.Write("->E");
                                        }
                                        N_ii = N_ii + 1;
                                        if (N_ii > 200)
                                        {
                                            Console.Write("->E-R");
                                            newUrlDBClass.putOneUrl2(newUrl); // nUrlDB.putOneUrl(a);
                                            Console.WriteLine(" 写入失败 重新压回 URL数据库 >> " + newUrl);
                                            goto XXXXX;
                                        }
                                    }
                                    Num_1 = Num_1 + 1;
                                }
                                else
                                {
                                    Console.WriteLine("通过索引　取消存储！");
                                    //********************************************************
                                    //
                                    //把取消存储的  压入一个已经请求的url  列表 
                                    //********************************************************
                                }
                                Num = Num + 1;
                                if (Num % 10 == 0)
                                {
                                    int TwoTime_X = Environment.TickCount - OneTime_X;
                                     YYY = 100000 / TwoTime_X;
 
                                   // Console.WriteLine("===================>>>>>  当前速度：    " + YYY.ToString() + "    条/秒");
 
                                    OneTime_X = Environment.TickCount;
                                }
                                double yyy_d = (double)YYY / 10;
                               
                                Console.WriteLine("总共下载了 " + Num.ToString() + " 条数据 其中过滤得到　"+Num_1.ToString()+" 条 有效数据");
                                Console.WriteLine("===================>>>>>  当前速度：    " + yyy_d.ToString() + "    条/秒");
                                // 根据当前页和页面URL 把数据中的链接提取出来 
                                try
                                {
                                    GetAddUrl(getDB, newUrl);
                                }
                                catch
                                {
                                    Console.Write("->E");
                                }
                            }
                            else
                            {
                                //得不到数据时重新压入
                                if (newUrl.Length > 7) //== "http://")
                                {
                                    try
                                    {
                                        newUrlDBClass.putOneUrl2(newUrl); // nUrlDB.putOneUrl(a);
                                        Console.WriteLine(" 请求失败 重新压回 URL数据库 >> " + newUrl);
                                    }
                                    catch
                                    {
                                        Console.Write("->E");
                                    }
                                }
                            }
                            //
                        }
                        else
                        {
                            int iCCCC = 0;
                        }
                    }
                }
            }
            catch(Exception e)
            {
                Console.Write("->E");
                goto XXXXX;
            }
        
        }
        /// <summary>
        /// 停止蜘蛛 把剩余的URL压入缓存文件
        /// </summary>
        public void StopSpider()
        {
            IsRun = false;
            newUrlDBClass.SaveUrlsCache();
            if (XunLong.CongifData.Config.xlDirectIndex.IndexOf('0') > -1)
            {
            }
            else
            {
                XunLong.clsDirectIndex.DirectIndex.CloseIndex();
            }
        }
        /// <summary>
        /// 得到一个网页数据
        /// </summary>
        /// <param name="murl"></param>
        /// <returns></returns>
        private string GetOneHTML(string murl, string codeType)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(murl);
                request.Timeout = 10000;
                try
                {
                    //下面来看看如何处理HTML页面。首先要做的当然是下载HTML页面，这可以通过C#提供的HttpWebRequest类实现： 
                    // request = (HttpWebRequest)WebRequest.Create(murl);
                    WebResponse response = request.GetResponse();
                    Stream stream = response.GetResponseStream();
                    string buffer = "", line;
                    //接下来我们就从request创建一个stream流。在执行其他处理之前，我们要先确定该文件是二进制文件还是文本文件，不同的文件类型处理方式也不同。下面的代码确定该文件是否为二进制文件。 
                    //。如果是文本文件，首先从stream创建一个StreamReader，然后将文本文件的内容一行一行加入缓冲区。 
                    //  response.ContentType.
                    // Encoding gbx = System.Text.Encoding.GetEncoding("gb2312");
                    //存放当前的应用的字符集
                    string NowCodeSet = "";
                    if (response.ContentLength < 1024 * 128)
                    {
                        //判断是否是网页格式
                        if (response.ContentType.ToLower().StartsWith("text/"))
                        {
                            //自动检测 UTF8
                            if ((response.ContentType.ToLower().IndexOf("utf-8") > -1) | (response.ContentType.ToLower().IndexOf("unicode") > -1))
                            {
                                StreamReader reader = new StreamReader(stream, System.Text.Encoding.UTF8);
                                NowCodeSet = "utf-8";
                                buffer = "";
                                while ((line = reader.ReadLine()) != null)
                                {
                                    buffer += line + "rn";
                                }
                                reader.Close();
                                stream.Close();
                                response.Close();
                                buffer = Str2Str(buffer);
                            }
                            else
                            {
                                //自动检测GB2312
                                if ((response.ContentType.ToLower().IndexOf("gb2312") > -1) | (response.ContentType.ToLower().IndexOf("gbk") > -1))
                                {
                                    StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("GB2312"));
                                    NowCodeSet = "gb2312";
                                    buffer = "";
                                    while ((line = reader.ReadLine()) != null)
                                    {
                                        buffer += line + "rn";
                                    }
                                    reader.Close();
                                    stream.Close();
                                    response.Close();
                                }
                                else
                                {
                                    //自动检测 不到时按照默认设置进行
                                    if (codeType.ToLower().IndexOf("utf") > -1)
                                    {
                                        StreamReader reader = new StreamReader(stream, System.Text.Encoding.UTF8);
                                        NowCodeSet = "utf-8";
                                        buffer = "";
                                        while ((line = reader.ReadLine()) != null)
                                        {
                                            buffer += line + "rn";
                                        }
                                        reader.Close();
                                        stream.Close();
                                        response.Close();
                                        buffer = Str2Str(buffer);
                                    }
                                    else
                                    {
                                        StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("GB2312"));
                                        NowCodeSet = "gb2312";
                                        buffer = "";
                                        while ((line = reader.ReadLine()) != null)
                                        {
                                            buffer += line + "rn";
                                        }
                                        reader.Close();
                                        stream.Close();
                                        response.Close();
                                    }
                                }
                            }
                        }
                        else
                        {
                            //非网页格式 则返回
                            return "XL_NULL";
                        }
                        ///字符集为gb2312  而刚应用为utf-8 
                        if ((buffer.ToLower().IndexOf("gb2312") > -1) & (NowCodeSet == "utf-8"))
                        {
                            HttpWebRequest requestX = (HttpWebRequest)WebRequest.Create(murl);
                            WebResponse responseX = requestX.GetResponse();
                            Stream streamX = responseX.GetResponseStream();
                            StreamReader readerX = new StreamReader(streamX, System.Text.Encoding.GetEncoding("GB2312"));
                            buffer = "";
                            while ((line = readerX.ReadLine()) != null)
                            {
                                buffer += line + "rn";
                            }
                            readerX.Close();
                            streamX.Close();
                            responseX.Close();
                        }
                        ///字符集为utf-8 而刚应用为 gb2312 
                        if ((buffer.ToLower().IndexOf("utf-8") > -1) & (NowCodeSet == "gb2312"))
                        {
                            HttpWebRequest requestY = (HttpWebRequest)WebRequest.Create(murl);
                            WebResponse responseY = requestY.GetResponse();
                            Stream streamY = responseY.GetResponseStream();
                            StreamReader readerY = new StreamReader(streamY, System.Text.Encoding.UTF8);
                            buffer = "";
                            while ((line = readerY.ReadLine()) != null)
                            {
                                buffer += line + "rn";
                            }
                            readerY.Close();
                            streamY.Close();
                            responseY.Close();
                            buffer = Str2Str(buffer);
                        }
                        //   string tmm =clearHTMLDB(buffer);
                        string tmm = buffer;
                        if (tmm.Length > 0)
                        {
                            Console.WriteLine("GUrlData : --> " + murl);
                        }
                        if (tmm.Length > 1024 * 128)
                        {
                            return "<html><title>Too_Long2</title><body>Too_Long</body></html>";
                        }
                        else
                        {
                            return tmm;  //返回经过过滤得数据
                        }
                    }
                    else
                    {
                        return "<html><title>Too_Long</title><body>Too_Long</body></html>";
                    }
                }
                catch
                {
                    request.Abort();
                    Console.WriteLine("Err : --> " + murl);
                    return "";
                }
            }
            catch
            {
                Console.Write("->E");
                return "";
            }
        }
        /// <summary>
        /// 读文件
        /// </summary>
        /// <param name="filename"></param>
        /// <returns></returns>
        public  string getFileData(string filename)
        {
            StreamReader reader = null;
            string data = string.Empty;
            try
            {
                reader = new StreamReader(filename, System.Text.Encoding.GetEncoding("gb2312"));
                data = reader.ReadToEnd();
                reader.Close();
                return data;
            }
            catch (IOException e)
            {
                Console.WriteLine(e.Message);
            }
            finally
            {
                if (reader != null)
                    reader.Close();
            }
            return "";
            /*
            StreamReader reader = null;
            string data = string.Empty;
            try
            {
                reader = new StreamReader(filename, System.Text.Encoding.GetEncoding("gb2312"));
                for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                {
                    if (data == "")
                    {
                        data = line;
                    }
                    else
                    {
                        data = data + "n" + line;
                    }
                }
                reader.Close();
                return data;
            }
            catch (IOException e)
            {
                Console.WriteLine(e.Message);
            }
            finally
            {
                if (reader != null)
                    reader.Close();
            }
            return "";
            */
        }
        /// <summary>
        /// 写文件
        /// </summary>
        /// <param name="filename"></param>
        /// <param name="data"></param>
        public void putFileData(string filename, string data)
        {
        StreamWriter writer = null;
            try
            {
                writer = new StreamWriter(filename, false, System.Text.Encoding.GetEncoding("gb2312"));
                writer.Write(data);
                writer.Close();
            }
            catch (IOException e)
            {
                Console.WriteLine(e.Message);
            }
            finally
            {
                if (writer != null)
                    writer.Close();
            }
        }
        /*
        /// <summary> 
        /// 把URL拆成路径 写入数据    
        /// </summary>
        /// <param name="url"></param>
        /// <param name="data"></param>
        private void WriteWeb2Disk(string url, string data)
        {
            url=url.Trim().ToLower();
            //
            if ((url.IndexOf('/') == -1) | (url.ToLower().IndexOf("http://") == -1))
            {
                return;  //URL错误
            }
            // 去掉URL 结尾的 /
            if (url.Substring(url.Length - 1, 1) == "/")
            { 
              
                url = url.Substring(0,url.Length-1);
            
            }
            //找到第一个等号  最后的链接中加上
            string urlAdd = "";
            int ddhao = url.IndexOf('?');
            if ( ddhao>-1)
            {
                string urlTmp = url;
                int urlLen = url.Length;
               // url = urlTmp.PadLeft(ddhao - 1);
               // urlAdd = urlTmp.PadRight(urlLen -ddhao);
                url = urlTmp.Substring(0,ddhao - 1);
                urlAdd = urlTmp.Substring(ddhao+1, urlLen - ddhao-1);
            }
          
            if (url.IndexOf('?') > -1)
            {
                int ic = 0;
            }
            string[] U = url.Split('/');
            string tmpDir = FileDir;
            //起始的地址  http://www.qq.com 
            if (U.Length == 3)
            {
                //1  建立文件夹
                //string a = tmpDir + "\" + codeIt.DirCN2CODE(U[U.Length - 1]);  //对路径进行编码  使之符合规范
                string a = tmpDir + "\" +U[U.Length - 1];  //对路径进行编码  使之符合规范
                if (System.IO.Directory.Exists(tmpDir) == false)
                {
                    System.IO.Directory.CreateDirectory(tmpDir);
                }
                if (System.IO.Directory.Exists(a) == false)
                {
                    System.IO.Directory.CreateDirectory(a);
                }
                //2 保存文件  为WEBMAINPAGE.HTM
                //得到文件路径
                string filePathOne = a + "\WEBMAINPAGE.HTM" ;
                //保存数据
                putFileData(filePathOne, data);
            }
            else
            {
                // http://www.nbd.com.cn/newShow.asp?D_ID=44860
                for (int i = 2; i < U.Length - 1; i++)
                {
                    //string a = codeIt.DirCN2CODE(U[i]);  //对路径进行编码  使之符合规范
                    string a = U[i];  //对路径进行编码  使之符合规范
                    tmpDir = tmpDir + "\" + a;
                    if (System.IO.Directory.Exists(tmpDir) == false)
                    {
                        System.IO.Directory.CreateDirectory(tmpDir);
                    }
                }
                // 只对最后的文件名进行编码
                //得到文件路径               最后加上附加的地址部分 刚才因为 = 号 拆开的
                string filePathOne = tmpDir + "\" + codeIt.DirCN2CODE(U[U.Length - 1] +"?"+ urlAdd);
                if (U[U.Length - 1].Length == 0)
                {
                    filePathOne = tmpDir + "\WEBMAINPAGE.HTM";
                }
                //保存数据
                putFileData(filePathOne, data);
            }
        }
        */
        /// <summary>
        /// 得到数据中符合条件的URL
        /// </summary>
        /// <param name="data"></param>
        private void GetAddUrl(string HData, string SourceUrl)  //数据　　当前URL
        {
            //  变为同一层的标志
            int xgl = SourceUrl.LastIndexOf('/');
            string TMPurl = SourceUrl.Substring(0, xgl + 1);
            string[] FastrD = SourceUrl.Split('/');
            string fastUrl = "http://" + FastrD[2];
            HTMParse.ParseHTML parse = new HTMParse.ParseHTML();
            parse.Source = HData;
            while (!parse.Eof())
            {
                char ch = parse.Parse();
                if (ch == 0)
                {
                    HTMParse.Attribute a = parse.GetTag()["HREF"];
                    if (a != null)
                    {
                        HTMParse.Attribute c = parse.GetTag()["HREF"];
                        string xa1Val = a.Value.ToString().Trim().ToLower();    //得到URL      判断为该站点内部的URL
                        if (isOKFile(xa1Val) == false | xa1Val.Length == 0)
                          //  if ( xa1Val.Length == 0)
                        {
                            int PP_PP = 0;
                            PP_PP = PP_PP + 1;
                        }
                        else
                        {
                            string xa1 = myClassHTML.Data2Url(SourceUrl, xa1Val);
                            xa1 = xa1.Trim().ToString();
                            if ((xa1.Length > 7)&(xa1.Length < 160 )) //== "http://")
                            {
                                //限定在同一起始位置
                             //   if (xa1.ToLower().IndexOf(fastUrl.ToLower()) > -1)
                                if (jisuansourceUrls(xa1) == true)
                                {
                                    try
                                    {
                                        //Environment.TickCount;
                                        // string New_CANSHU = System.Web.HttpUtility.UrlDecode(xa1, System.Text.Encoding.GetEncoding("GB2312"));
                                        string New_CANSHU = xa1;
                                        newUrlDBClass.putOneUrl(New_CANSHU); // nUrlDB.putOneUrl(a);
                                    }
                                    catch
                                    {
                                        Console.Write("->E");
                                    }
                                    // Console.WriteLine(" 压入 URL数据库 >> " + xa1);
                                }
                                else
                                {
                                }
                            }
                        }
                    }
                }
            }
        }
        /// <summary>
        /// 调查文件扩展名是否属于需要的　ccs  js  zip  avi rar  exe dat  png jpg gif mp3 等不抓取
        /// </summary>
        /// <param name="url"></param>
        /// <returns></returns>
        private bool isOKFile(string url)
        {
            if (url.IndexOf('?') > 0)
            {
                return true;
            }
            if(url.Length<4)
            {
                return true;
            }
            //取出最后４位　如果含有扩展名　则返回　F
            string tmp_one = url.Substring(url.Length-4, 4).ToLower();
            int ll = tmp_one.Length;
            foreach (string a_o in FILEEX)
            {
                int intU =tmp_one.LastIndexOf("."+a_o) ;
                if (intU>-1 &  intU== ll-a_o.Length -1)
                {
                    return false;
                }
            
            }
            return true;
        }
        /// <summary>
        /// 编码转换
        /// </summary>
        /// <param name="data"></param>
        /// <returns></returns>
        private string Str2Str(string data)
        {
            string gb2312info = string.Empty;
            Encoding utf8 = Encoding.UTF8;
            Encoding gb2312 = Encoding.GetEncoding("gb2312");
            // Convert the string into a byte[].
            byte[] unicodeBytes = utf8.GetBytes(data);
            // Perform the conversion from one encoding to the other.
            byte[] asciiBytes = Encoding.Convert(utf8, gb2312, unicodeBytes);
            // Convert the new byte[] into a char[] and then into a string.
            // This is a slightly different approach to converting to illustrate
            // the use of GetCharCount/GetChars.
            char[] asciiChars = new char[gb2312.GetCharCount(asciiBytes, 0, asciiBytes.Length)];
            gb2312.GetChars(asciiBytes, 0, asciiBytes.Length, asciiChars, 0);
            gb2312info = new string(asciiChars);
            return gb2312info;
        }
        /// <summary>
        /// 得到URL的MD5名
        /// </summary>
        /// <param name="url"></param>
        /// <returns></returns>
        private string getMD5name(string url)
        {
            string strMd5 = System.Web.Security.FormsAuthentication.HashPasswordForStoringInConfigFile(url, "md5");
            return strMd5;
        }
        /// <summary>
        /// 计算该 url  是否和源URL 队列中的某个在同一站点
        /// </summary>
        /// <param name="nowurl">http://ee/</param>
        /// <returns></returns>
        private bool jisuansourceUrls(string nowurl)
        {
            if(nowurl.IndexOf("http://")!=0)
            {
               return false;
            }
            string[] ax  = nowurl.Split('/');
            string axs ="http://" + ax[2]+"/"; 
            foreach (string a in sourceUrls)
            {
                if (a.ToLower().IndexOf(axs.ToLower()) ==0)
                {
                    return true;      
                }           
            }
            return false; 
        }
        /// <summary>
        /// 是否保存该url对应的数据  如果不需要过滤永远返回 True
        /// </summary>
        /// <param name="url">在过滤状态下　不能作为采样结果的数据不被保存</param>
        /// <returns></returns>
        private bool IsSaveData(string url,string htmldat)
        {
            if (XunLong.CongifData.Config.xlDirectIndex.IndexOf('0') > -1)
            {
               return true;  //如果不需要过滤永远返回 True
            }
            if (url.Length == 0 | htmldat.Length == 0)
            {
                return false;
            }
            while (IsRunIndexWrite == true)
            {
                System.Threading.Thread.Sleep(5000);
            
            }
            //把得到的数据压入 已经得到的url列表
            newUrlDBClass.add_Had_Url(XunLong.CongifData.Config.Had_Url_Data, url);
            //匹配模版 压入索引
           int N_B =  XunLong.clsDirectIndex.DirectIndex.IndexOneData(url, htmldat);
           if (N_B > N_B_OLD)
           {
               IndexChang = true;
           }
            //判断索引是否变化
           N_B_OLD = N_B;
           Console.WriteLine(" ->匹配成功: " + N_B.ToString());
            return false;
        
        }
        /*
        /// <summary>
        /// 把采样队列压入系统
        /// </summary>
        /// <param name="pathcy"></param>
        public void initNEEDURL(string okPath)
        {
            //初始化分词缓存
            neeDurl.Clear();
            StreamReader reader = null;
            try
            {
                reader = new StreamReader(okPath, System.Text.Encoding.GetEncoding("gb2312"));
                for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                {
                    if (line != null)
                    {
                        if (line.Length == 0)
                        { }
                        else
                        {
                            if (neeDurl.Contains(line) == false)
                            {
                                neeDurl.Add(line);
                            }
                            else
                            {
                                int u_w = 0;
                            }
                        }
                    }
                }
                reader.Close();
            }
            catch (IOException e)
            {
                Console.WriteLine(e.Message);
            }
            finally
            {
                if (reader != null)
                    reader.Close();
            }
            ArrayList tmp_needurl = new ArrayList();
            tmp_needurl.Clear();
            foreach (string a in neeDurl)
            {
                if (a.IndexOf('?') > 0)
                {
                    tmp_needurl.Add(a);
                }
            }
            foreach (string a in neeDurl)
            {
                if (a.IndexOf('?') == -1)
                {
                    tmp_needurl.Add(a);
                }
            }
            neeDurl = tmp_needurl;
            Console.WriteLine("　共有　" + neeDurl.Count.ToString() +" 条采样数据　");
        }
        /// <summary>
        /// 写入一个数据
        /// </summary>
        /// <param name="filename">文件名</param>
        /// <param name="data">数据</param>
        /// <param name="isApp">是否追加模式</param>
        public void w1_w(string okPath, string data)
        {
            StreamWriter writer = null;
            try
            {
                writer = new StreamWriter(okPath, true, System.Text.Encoding.GetEncoding("gb2312"));
                //  writer.Write(data);
                writer.WriteLine(data);
                writer.Close();
            }
            catch (IOException e)
            {
                Console.WriteLine(e.Message);
            }
            finally
            {
                if (writer != null)
                    writer.Close();
            }
        }
        /// <summary>
        /// 读取1个文件 压入系统
        /// </summary>
        /// <param name="okPath"></param>
        public void w2_w(string okPath)
        {
            StreamReader reader = null;
            try
            {
                reader = new StreamReader(okPath, System.Text.Encoding.GetEncoding("gb2312"));
                for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                {
                    if (line != null)
                    {
                        if (line.Length == 0)
                        { }
                        else
                        {
                           // if (olgurl.Contains(line) == false)
                           // {
                           //     olgurl.Add(line);
                          //  }
                          //  else
                          //  {
                          //      int u_w = 0;
                          //  }
                        }
                    }
                }
                reader.Close();
            }
            catch (IOException e)
            {
                Console.WriteLine(e.Message);
            }
            finally
            {
                if (reader != null)
                    reader.Close();
            }
        }
        */
        /// <summary>
        /// 拷贝文件　从2到1
        /// </summary>
        /// <param name="IndexData"></param>
        /// <param name="IndexData2"></param>
        private void  CopyIndexData(string  IndexData,string  IndexData2)
        {
            DirectoryInfo dir1 = new DirectoryInfo(IndexData);
            foreach (FileInfo f in dir1.GetFiles("*"))   //遍历获得文件   
            {
                System.IO.File.Delete(f.FullName);
            }
            DirectoryInfo dir = new DirectoryInfo(IndexData2);
              foreach (FileInfo f in dir.GetFiles("*"))   //遍历获得文件   
              {
                  System.IO.File.Copy(f.FullName, IndexData + "\"+ f.Name);
              }
        
        }
        
    }
}