AutoBuildModelClass.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:36k
源码类别:

搜索引擎

开发平台:

C#

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Text;
  4. using System.Collections;
  5. using System.Threading;
  6. using System.IO;
  7. namespace XunLong.AutoModelBuilder
  8. {
  9.     /*
  10.           '       迅龙中文分类搜索引擎  v0.6
  11.           '
  12.           '        LGPL  许可发行
  13.           '
  14.           '       宁夏大学  张冬 康彩  zd4004@163.com
  15.           ' 
  16.           '        官网 http://blog.163.com/zd4004/
  17.      */
  18.     /// <summary>
  19.     /// 自动建立模板类
  20.     /// </summary>
  21.     class AutoBuildModelClass
  22.     {
  23.         /// <summary>
  24.         /// 记录采样的数据
  25.         /// </summary>
  26.         struct CaiYangShuJu
  27.         {
  28.             /// <summary>
  29.             /// 采样数据列表
  30.             /// </summary>
  31.             public ArrayList ListOne;
  32.             /// <summary>
  33.             /// 记录本次采样所使用的优先级
  34.             /// </summary>
  35.             public int TNUM;
  36.         
  37.         }
  38.         /// <summary>
  39.         /// 存储数据长短变化
  40.         /// </summary>
  41.         struct HTM2SHORT
  42.         {
  43.             /// <summary>
  44.             /// 词典 标签
  45.             /// </summary>
  46.             public ArrayList Dict1;
  47.             /// <summary>
  48.             /// 词典 数据
  49.             /// </summary>
  50.             public ArrayList Dict2;
  51.             /// <summary>
  52.             /// 变换的数据
  53.             /// </summary>
  54.             public Hashtable HASH;
  55.         }
  56.         /// <summary>
  57.         /// 模版的数量
  58.         /// </summary>
  59.         int xnum = 0;
  60.         //D:XunLongXunLong.BINSpiderWEB  
  61.         /// <summary>
  62.         /// 文件系统路径
  63.         /// </summary>
  64.         string aPath = "";
  65.         
  66.         /// <summary>
  67.         /// 要生成的模版文件路径
  68.         /// </summary>
  69.         string cPath = "";
  70.         /// <summary>
  71.         /// 是否允许建模版 
  72.         /// </summary>
  73.         bool isSURE = true;
  74.         /// <summary>
  75.         /// 线程需要的模板建立参数
  76.         /// </summary>
  77.         Hashtable TMMS = new Hashtable();
  78.         /// <summary>
  79.         /// 文件系统对象
  80.         /// </summary>
  81.         NetHashTableAPI.ClassNHT db = new NetHashTableAPI.ClassNHT();
  82.         XunLong.HtmlClassLibrary.ClassTXT2IDAT CodeIy = new XunLong.HtmlClassLibrary.ClassTXT2IDAT();
  83.         XunLong.HtmlClassLibrary.ClassHTML myHTML2CLEAR = new XunLong.HtmlClassLibrary.ClassHTML();
  84.         /// <summary>
  85.         /// 已经生成过模版的url
  86.         /// </summary>
  87.         ArrayList olgurl = new ArrayList();
  88.         /// <summary>
  89.         /// 得到的URL列表  需要计算 去掉其中 已经建立过模板的部分
  90.         /// </summary>
  91.         ArrayList LIST = new ArrayList();
  92.         /// <summary>
  93.         /// 需要建立模版的url
  94.         /// </summary>
  95.         ArrayList neeDurl = new ArrayList();
  96.         // 1  读取 URL列表  提取数据特征  a 以不同的动态页面区别 b以结构相似的静态页面区别 c以参数结构相似的页区别
  97.         //    只要该集合内数据大于160 则建立模板  采样按80来进行  模板的命名采用 模板数据的MD5 数据 有一个好处就是可以去掉完全相同的模板 
  98.         // 2  模板分析 检测出模板*前方 不属于HTML标示的第一个元素 <S>XX<4545><xxxx>*<xxsss>  自动匹配80源数据得到 每个元素的数据平均长度
  99.         // 平均长度小于 20的 放在类聚模版中   然后 把所有的  生成 放在搜索模板中  平均长度是0的 取消此项
  100.         /// <summary>
  101.         /// 全自动系统运行 Random ra = new Random(Environment.TickCount);
  102.         /// </summary>
  103.         public void AutoRun()
  104.         {
  105.                    
  106.         }
  107.         /// <summary>
  108.         /// 半自动系统运行  根据设置好的相似页面 
  109.         /// </summary>
  110.         public void Run()
  111.         {
  112.             isSURE = true;
  113.             foreach (string a_a in neeDurl)
  114.             {
  115.                 //建立模板
  116.                 CaiYangShuJu CaiYang = GetADCUrlList(a_a);
  117.                 ArrayList urlList = CaiYang.ListOne;
  118.                //把已经建过模的数据写入olgurl
  119.                 putmOLDMODELSOURCEFileData(XunLong.CongifData.Config.modelSourceOLD, a_a);
  120.                 if (urlList == null)
  121.                 {
  122.                     goto NEXTIT;
  123.                 }
  124.                 Hashtable htms = new Hashtable();
  125.                 htms.Clear();
  126.                 foreach (string akcc in urlList)
  127.                 {
  128.                     string htmData = db.Value(akcc);
  129.                     htmData = myHTML2CLEAR.HTML2CLEAR(htmData,akcc);
  130.                     if (htmData.Length < 48)
  131.                     {
  132.                         goto NEXTIT;
  133.                     }
  134.                     htms.Add(akcc, htmData);
  135.                 }
  136.                 Console.WriteLine("-开始建立模板-> " + a_a);
  137.                 TMMS = htms;
  138.                           
  139.                 Thread T1 = new Thread(new ThreadStart(TMM));
  140.                 T1.Start();
  141.                 isSURE = false;
  142.                 int  LooPNUm =0;
  143.                 // == 1 表示正在工作 
  144.                 while (isSURE == false)
  145.                 {
  146.                     LooPNUm = LooPNUm + 1;
  147.                     Console.Write(">");
  148.                    if(LooPNUm > 300)
  149.                    {
  150.                       isSURE =true; // LoopNUM 大于200 长时间没有响应
  151.                       Console.WriteLine("=-> TimeOUT Start New T1");
  152.                    }
  153.                     Thread.Sleep(1000);
  154.                 }
  155.             NEXTIT: ; //进入下一轮
  156.             }
  157.             Console.WriteLine("建模工作全部完成");
  158.         }
  159.         /// <summary>
  160.         /// 建立模板线程
  161.         /// </summary>
  162.         private void TMM()
  163.         {
  164.           //   是否允许建模版       
  165.             isSURE = false;
  166.          //   Console.WriteLine("==--> " + isSURE.ToString());
  167.             try
  168.             {
  169.                 Hashtable TMMSOne = TMMS;
  170.                 HTM2SHORT new_vc_tmm = Long2Short(TMMSOne);
  171.                 Hashtable TMMSOne2 = new_vc_tmm.HASH;
  172.                 ArrayList Short_DICT1 = new_vc_tmm.Dict1;
  173.                 ArrayList Short_DICT2 = new_vc_tmm.Dict2;
  174.                 string dataE = "";
  175.                 foreach (System.Collections.DictionaryEntry de2 in TMMSOne)
  176.                 {
  177.                     dataE = dataE + de2.Key.ToString() + "rn";
  178.                 }
  179.                 //声明模板构建类
  180.                 XunLong.ModelBuilder.ClassModelBuilder myBU = new XunLong.ModelBuilder.ClassModelBuilder();
  181.                 string ModelData ="";
  182.                 try
  183.                 {
  184.                     ModelData = myBU.BuilderModel(TMMSOne2);
  185.                     ModelData = Short2Long(ModelData, Short_DICT1,Short_DICT2);  //还原压缩的数据
  186.                 }
  187.                 catch
  188.                 {
  189.                     Console.WriteLine("--> ERROR 排除错误模版");
  190.                     //允许建立模版
  191.                     isSURE = true;
  192.                     return;
  193.                 }
  194.                 int nn = myBU.inStrNum(ModelData, "*");
  195.                 //模版建立错误 表明该数据不能完成模版创建工作 则不予考虑
  196.                 if (ModelData.Length < 150 | nn < 4)
  197.                 {
  198.                     Console.WriteLine("--> 排除错误模版");
  199.                     //允许建立模版
  200.                     isSURE = true;
  201.                     return;
  202.                 }
  203.                 ArrayList txtDat = EditOneModelTag(ModelData);
  204.                 ArrayList neWW = txtDat;
  205.                 //*************************************************************************************
  206.                 //  匹配模版  
  207.                 //*************************************************************************************
  208.                 // 匹配模版  数据全部匹配模版      
  209.                 string[] pipeiEndData = new string[neWW.Count];  //匹配得到的匹配项列表数据
  210.                 for (int i = 0; i < neWW.Count; i++)
  211.                 {
  212.                     pipeiEndData[i] = "";
  213.                 }
  214.                 foreach (System.Collections.DictionaryEntry de2 in TMMSOne)
  215.                 {
  216.                     //建立滤波类 
  217.                     XunLong.ModelUserClassLibrary.ClassUserModel m = new XunLong.ModelUserClassLibrary.ClassUserModel();
  218.                     //压入测试模板
  219.                     m.TestModeL("", "", "", ModelData, "", "", "","");
  220.                     XunLong.PublicClassLibrary.kcSearch k = m.getTagAndData(de2.Value.ToString());
  221.                     Hashtable p = m.modelOneList;
  222.                     foreach (System.Collections.DictionaryEntry de in p)
  223.                     {
  224.                         int pi = (int)de.Key;
  225.                         pipeiEndData[pi] = pipeiEndData[pi] + de.Value.ToString().Trim();
  226.                     }
  227.                 }
  228.             
  229.                 string dataA = "";
  230.                 string dataB = "";
  231.                 string TagTmp = "属性"; //记录上1个有标示的 数据 
  232.                 for (int i = 0; i < neWW.Count; i++)
  233.                 {
  234.                     if (pipeiEndData[i].Length > 0)
  235.                     {
  236.                         dataA = dataA + neWW[i].ToString() + ":" + "<TAGDATA INDEX=" + i.ToString() + "/>" + "rn";
  237.                         if (neWW[i].ToString().Length > 1)
  238.                         {
  239.                             if (neWW[i].ToString().IndexOf(' ') == -1 & neWW[i].ToString().Length < 13)
  240.                             {
  241.                                 dataB = dataB + "<" + neWW[i].ToString() + ">" + "<TAGDATA INDEX=" + i.ToString() + "/>" + "</" + neWW[i].ToString() + ">" + "rn";
  242.                                 TagTmp = neWW[i].ToString();
  243.                             }
  244.                             else
  245.                             {
  246.                                 string[] xrr = neWW[i].ToString().Split(' ');
  247.                                 string TagII = xrr[xrr.Length - 1];
  248.                                 dataB = dataB + "<" + TagII + ">" + "<TAGDATA INDEX=" + i.ToString() + "/>" + "</" + TagII + ">" + "rn";
  249.                                 TagTmp = TagII;
  250.                             }
  251.                         }
  252.                         else
  253.                         {
  254.                             //表示本属性前方 无可以使用的属性标示 则把此 取出的数据合并到前方  <XX><TAGDATA INDEX=1></XX> <XX><TAGDATA INDEX=1><TAGDATA INDEX=2></XX>
  255.                             //1 取出包含的 <TAGDATA INDEX=1>  
  256.                             if (TagTmp.Length > 0)
  257.                             {
  258.                                 dataB = dataB + "<" + TagTmp + ">" + "<TAGDATA INDEX=" + i.ToString() + "/>" + "</" + TagTmp + ">" + "rn";
  259.                             }
  260.                         }
  261.                     }
  262.                 }
  263.                 dataB = "<xl主类别>HTM</xl主类别>" + "rn" +dataB ;
  264.                 string dataT = "<TAGDATA INDEX=1/>";
  265.                 string x = cPath + "\" + getMD5name(ModelData);
  266.                 XunLong.PublicClassLibrary.kcSearch v = new XunLong.PublicClassLibrary.kcSearch();
  267.                 v.a = "";
  268.                 putFileData(x + ".a", dataA);
  269.                 putFileData(x + ".b", dataB);
  270.                 putFileData(x + ".c", "");
  271.                 putFileData(x + ".d", ModelData);
  272.                 putFileData(x + ".e", dataE);
  273.                 putFileData(x + ".t", dataT);
  274.                 putFileData(x + ".h", dataA);
  275.                 putFileData(x + ".s", dataB);
  276.                  /// <summary>
  277.         /// 显示的摘要信息
  278.         /// </summary>
  279.        // public string s;
  280.                 Console.WriteLine("-模版建立成功->" + x);
  281.                 xnum = xnum+1;
  282.                 Console.WriteLine("-全部模版数量-> " + xnum.ToString());
  283.             }
  284.             catch
  285.             {
  286.                 Console.WriteLine("-模版建立F->" );
  287.            }
  288.             //允许建立模版
  289.             isSURE = true;
  290.         }
  291.         /// <summary>
  292.         /// 是否全为数字组成
  293.         /// </summary>
  294.         /// <param name="data"></param>
  295.         /// <returns></returns>
  296.         private bool isNum(string data)
  297.         {
  298.             if (data==null |  data.Length == 0)
  299.             {
  300.                 return false;
  301.              }
  302.               foreach(char a in data)
  303.               {
  304.                   if (a < '0' | a > '9')
  305.                   {
  306.                       return false;
  307.                   }             
  308.               }
  309.               return true;         
  310.         }
  311.         /// <summary>
  312.         /// 标注模板  
  313.         /// </summary>
  314.         /// <param name="c">一个模版数据</param>
  315.         /// <returns>标注的模板各个值</returns>
  316.         private ArrayList EditOneModelTag(string c)
  317.         {
  318.             ArrayList x = new ArrayList();  //对每个*进行标注
  319.             x.Clear();
  320.             string[] xxs = c.Split('*');
  321.             int Len = 0;
  322.             foreach (string a in xxs)
  323.             {
  324.                 Len = Len + a.Length;            
  325.             }
  326.             //得到平均长度
  327.             int OneLen = (int)Len / xxs.Length;
  328.             for (int i = 0; i < xxs.Length; i++)
  329.             {
  330.                 if (xxs[i].Length > 0 & OneLen < xxs[i].Length)
  331.                 {
  332.                     string aaa = CCxmlTag( GetClearTag(xxs[i]));
  333.                     aaa = CodeIy.stringcode(aaa);
  334.                     if (aaa.Length < 16 & isNum(aaa)==false )
  335.                     {
  336.                         x.Add(aaa);
  337.                     }
  338.                     else
  339.                     {
  340.                         x.Add("");
  341.                     }
  342.                 }
  343.                 else
  344.                 {
  345.                     x.Add("");
  346.                 }
  347.             
  348.             }
  349.             return x;
  350.             // 1 把模板数据按照*顺序 分割  每个* 对应其前部的一个部分 n(i)
  351.             // 2 去掉n(i) 标签数据 作为属性   去掉 : 等  如果太长〉8 则放弃
  352.         }
  353.         /// <summary>
  354.         /// 清除数据中的HTML标签
  355.         /// </summary>
  356.         /// <param name="dat"></param>
  357.         /// <returns></returns>
  358.         private string GetClearTag(string dat)
  359.         {
  360.             //</td></tr><tr><td class="main_text_left">身份验证:</td><td class="main_text">
  361.             int Loop = dat.Length / 4;
  362.             for (int i = 0; i < Loop; i++)
  363.             {
  364.                 int a1 = dat.IndexOf('<');
  365.                 if (a1 == -1 | a1 == (dat.Length-1) )
  366.                 { }
  367.                 else
  368.                 {     
  369.                     int a2 = dat.IndexOf('>',a1 + 1);
  370.                     if (a2 == -1)
  371.                     { }
  372.                     else
  373.                     {
  374.                         int aa = dat.Length;
  375.                         // *<>*<>*
  376.                         string dat1 = "";
  377.                         if (a1 == 0)
  378.                         {
  379.                             
  380.                         }
  381.                         else
  382.                         { 
  383.                             dat1 = dat.Substring(0, a1 );
  384.                         }
  385.                         string dat2 = dat.Substring(a2 + 1, aa - a2-1);
  386.                         dat = dat1 + dat2;
  387.                     }
  388.                 }
  389.             }
  390.             // 1 去掉标签 
  391.             // 2 去掉: :
  392.             return dat;
  393.         }
  394.         /// <summary>
  395.         /// 得到一个36个url的采样队列
  396.         /// </summary>
  397.         /// <param name="a"></param>
  398.         /// <returns></returns>
  399.         private CaiYangShuJu GetADCUrlList(string aurl)
  400.         {
  401.             ArrayList x = new ArrayList();
  402.             x.Clear();
  403.             //记录得到不同的 相似度的数量
  404.             int[] ValNum = new int[51];
  405.      
  406.             foreach (string b in LIST)
  407.             {
  408.                 //采样
  409.                 int n1 = XunLong.UrlStringLib.ClassUrlString.Url2Url(aurl, b);
  410.                 if (n1 >= 12)
  411.                 {
  412.                     x.Add(b);
  413.                     if (x.Count > 24)
  414.                     {
  415.                         break;
  416.                     }
  417.                 }
  418.             }
  419.          
  420.             CaiYangShuJu one = new CaiYangShuJu();
  421.             one.ListOne = x;
  422.             one.TNUM = x.Count;
  423.             return one;
  424.         }
  425.         /// <summary>
  426.         /// 得到一个36个url的采样队列
  427.         /// </summary>
  428.         /// <param name="a"></param>
  429.         /// <returns></returns>
  430.         private CaiYangShuJu GetADCUrlList2(string aurl)
  431.         {
  432.             ArrayList x = new ArrayList();
  433.             x.Clear();
  434.             //记录得到不同的 相似度的数量
  435.             int[] ValNum = new int[51];
  436.             foreach (string b in LIST)
  437.             {
  438.                 // 和已经建立过模板的url数据比较其相似度
  439.                 int n1 = XunLong.UrlStringLib.ClassUrlString.Url2Url(aurl, b);
  440.                 ValNum[n1] = ValNum[n1] + 1;
  441.             }
  442.             //被选取得相似度
  443.             int XXX = 0;
  444.             //得到 最相似的  总体大于 120 的 
  445.             for (int i = 50; i > 10; i--)
  446.             {
  447.                 if (ValNum[i] > 64)
  448.                 {
  449.                     XXX = i;
  450.                     break;
  451.                 }
  452.             }
  453.             if (XXX < 12)
  454.             {
  455.                 CaiYangShuJu oneBad = new CaiYangShuJu();
  456.                 oneBad.ListOne = null;
  457.                 oneBad.TNUM = 0;
  458.                 return oneBad;
  459.             }
  460.             //记录  取得36个即可
  461.             foreach (string b in LIST)
  462.             {
  463.                 // 和已经建立过模板的url数据比较其相似度
  464.                 int n1 = XunLong.UrlStringLib.ClassUrlString.Url2Url(aurl, b);
  465.                 if (n1 == XXX)
  466.                 {
  467.                     x.Add(b);
  468.                     if (x.Count >= 24)
  469.                     {
  470.                         break;
  471.                     }
  472.                 }
  473.             }
  474.             CaiYangShuJu one = new CaiYangShuJu();
  475.             one.ListOne = x;
  476.             one.TNUM = XXX;
  477.             return one;
  478.         }
  479.         /// <summary>
  480.         /// 读取各个模版  记录和存储 第一个采样的url 和 模板名称(由原始模板数据MD5生成)
  481.         /// </summary>
  482.         public void Init(string filePath,string b0,string c0,string  k_c_path )
  483.         {
  484.             XunLong.CongifData.Config.InitConfigData(k_c_path);
  485.             aPath = filePath;         
  486.             cPath = c0;
  487.             db.SetClassNHT(filePath, 3145727, k_c_path);
  488.             LIST = db.SearchOneList("http");
  489.              olgurl.Clear();
  490.              neeDurl.Clear();
  491.     
  492.             initolgurl(XunLong.CongifData.Config.modelSourceOLD);
  493.             initNEEDURL(XunLong.CongifData.Config.modelSource);
  494.             Console.WriteLine("设置列表中共有 "+ neeDurl.Count.ToString()+" 条需要建模的地址");
  495.             Console.WriteLine("已经建模的有 " + olgurl.Count.ToString() + " 条");
  496.             foreach (string newItOne in olgurl)
  497.             {
  498.                 if (neeDurl.Contains(newItOne) == true)
  499.                 {
  500.                     neeDurl.Remove(newItOne);
  501.                 }
  502.             }
  503.             Console.WriteLine("本次任务中共有 " + neeDurl.Count.ToString() + " 条数据需要建模");
  504.         
  505.         }
  506.         /// <summary>
  507.         /// 得到数据的MD5名
  508.         /// </summary>
  509.         /// <param name="data"></param>
  510.         /// <returns></returns>
  511.         private string getMD5name(string data)
  512.         {
  513.             string strMd5 = System.Web.Security.FormsAuthentication.HashPasswordForStoringInConfigFile(data, "md5");
  514.             return strMd5;
  515.         }
  516.         /// <summary>
  517.         /// 读文件
  518.         /// </summary>
  519.         /// <param name="filename"></param>
  520.         /// <returns></returns>
  521.         private static string getFileData(string filename)
  522.         {
  523.             StreamReader reader = null;
  524.             string data = string.Empty;
  525.             try
  526.             {
  527.                 reader = new StreamReader(filename, System.Text.Encoding.GetEncoding("gb2312"));
  528.                 data = reader.ReadToEnd();
  529.                 reader.Close();
  530.                 return data;
  531.             }
  532.             catch (IOException e)
  533.             {
  534.                 Console.WriteLine(e.Message);
  535.             }
  536.             finally
  537.             {
  538.                 if (reader != null)
  539.                     reader.Close();
  540.             }
  541.             return "";
  542.         }
  543.         /// <summary>
  544.         /// 写文件
  545.         /// </summary>
  546.         /// <param name="filename"></param>
  547.         /// <param name="data"></param>
  548.         private void putFileData(string filename, string data)
  549.         {
  550.             StreamWriter writer = null;
  551.             try
  552.             {
  553.                 writer = new StreamWriter(filename, false, System.Text.Encoding.GetEncoding("gb2312"));
  554.                 writer.Write(data);
  555.                 writer.Close();
  556.             }
  557.             catch (IOException e)
  558.             {
  559.                 Console.WriteLine(e.Message);
  560.             }
  561.             finally
  562.             {
  563.                 if (writer != null)
  564.                     writer.Close();
  565.             }
  566.         }
  567.         /// <summary>
  568.         /// 写入已经使用过的一条数据   xxx-->> 相对于设置列表 olgurl
  569.         /// </summary>
  570.         /// <param name="filename">文件名</param>
  571.         /// <param name="data">数据</param>
  572.         /// <param name="isApp">是否追加模式</param>
  573.         public void putmOLDMODELSOURCEFileData(string okPath, string data)
  574.         {
  575.             StreamWriter writer = null;
  576.             try
  577.             {
  578.                 writer = new StreamWriter(okPath, true, System.Text.Encoding.GetEncoding("gb2312"));
  579.                 //  writer.Write(data);
  580.                 writer.WriteLine(data);
  581.                 writer.Close();
  582.             }
  583.             catch (IOException e)
  584.             {
  585.                 Console.WriteLine(e.Message);
  586.             }
  587.             finally
  588.             {
  589.                 if (writer != null)
  590.                     writer.Close();
  591.             }
  592.            
  593.         }
  594.         /// 初始化已经建模的结果  xxx-->> 相对于设置列表 olgurl
  595.         /// </summary>
  596.         /// <param name="okPath">使用缓存服务器的缓存数据</param>
  597.         public void initolgurl(string okPath)
  598.         {
  599.             //初始化分词缓存
  600.            
  601.             StreamReader reader = null;
  602.             try
  603.             {
  604.                 reader = new StreamReader(okPath, System.Text.Encoding.GetEncoding("gb2312"));
  605.                 for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
  606.                 {
  607.                     if (line != null)
  608.                     {
  609.                         if (line.Length == 0)
  610.                         { }
  611.                         else
  612.                         {
  613.                             if (olgurl.Contains(line) == false)
  614.                             {
  615.                                 olgurl.Add(line);
  616.                             }
  617.                             else
  618.                             {
  619.                                 int u_w = 0;
  620.                             }
  621.                         }
  622.                     }
  623.                 }
  624.                 reader.Close();
  625.             }
  626.             catch (IOException e)
  627.             {
  628.                 Console.WriteLine(e.Message);
  629.             }
  630.             finally
  631.             {
  632.                 if (reader != null)
  633.                     reader.Close();
  634.             }
  635.            
  636.         }
  637.         /// 初始化需要建模的列表  xxx-->> 相对于设置列表 needurl
  638.         /// </summary>
  639.         /// <param name="okPath">使用缓存服务器的缓存数据</param>
  640.         public void initNEEDURL(string okPath)
  641.         {
  642.             //初始化分词缓存
  643.             neeDurl.Clear();
  644.             StreamReader reader = null;
  645.             try
  646.             {
  647.                 reader = new StreamReader(okPath, System.Text.Encoding.GetEncoding("gb2312"));
  648.                 for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
  649.                 {
  650.                     if (line != null)
  651.                     {
  652.                         if (line.Length == 0)
  653.                         { }
  654.                         else
  655.                         {
  656.                             if (neeDurl.Contains(line) == false)
  657.                             {
  658.                                 neeDurl.Add(line);
  659.                             }
  660.                             else
  661.                             {
  662.                                 int u_w = 0;
  663.                             }
  664.                         }
  665.                     }
  666.                 }
  667.                 reader.Close();
  668.             }
  669.             catch (IOException e)
  670.             {
  671.                 Console.WriteLine(e.Message);
  672.             }
  673.             finally
  674.             {
  675.                 if (reader != null)
  676.                     reader.Close();
  677.             }
  678.             ArrayList tmp_needurl = new ArrayList();
  679.             tmp_needurl.Clear();
  680.             foreach (string a in neeDurl)
  681.             {
  682.                 if (a.IndexOf('?') > 0)
  683.                 {
  684.                     tmp_needurl.Add(a);
  685.                 }
  686.             }
  687.             foreach (string a in neeDurl)
  688.             {
  689.                 if (a.IndexOf('?') ==-1)
  690.                 {
  691.                     tmp_needurl.Add(a);
  692.                 }
  693.             }
  694.             neeDurl = tmp_needurl;
  695.         }
  696.         /// <summary>
  697.         /// 去掉标示中的其他部分
  698.         /// </summary>
  699.         /// <param name="data"></param>
  700.         /// <returns></returns>
  701.         private string CCxmlTag(string data)
  702.         {
  703.             data = data.Replace("【", "");
  704.             data = data.Replace("】", "");
  705.             data = data.Replace("&nbsp", "");
  706.             data = data.Replace("/", "");
  707.             data = data.Replace("<", "");
  708.             data = data.Replace(">", "");
  709.             data = data.Replace(";", "");
  710.             data = data.Replace("‘", "");
  711.             data = data.Replace("“", "");
  712.             data = data.Replace("”", "");
  713.             data = data.Replace(".", "");
  714.             data = data.Replace("-", "");
  715.             data = data.Replace("。", "");
  716.             data = data.Replace("|", "");
  717.             data = data.Replace("+", "");
  718.             data = data.Replace("*", "");
  719.             data = data.Replace("@", "");
  720.             data = data.Replace(":", "");
  721.             data = data.Replace(":", "");
  722.             // data = data.Replace("", "");
  723.             //  data = data.Replace("", "");
  724.             data = data.Replace("?", "");
  725.             data = data.Replace("λ", "");
  726.             data = data.Replace("[", "");
  727.             data = data.Replace("]", "");
  728.             data = data.Replace(".", "");
  729.             data = data.Replace(".", "");
  730.             data = data.Replace("↓", "");
  731.             data = data.Replace("?", "");
  732.             data = data.Replace("~", "");
  733.             data = data.Replace("`", "");
  734.             data = data.Replace("!", "");
  735.             data = data.Replace("@", "");
  736.             data = data.Replace("#", "");
  737.             data = data.Replace("$", "");
  738.             data = data.Replace("%", "");
  739.             data = data.Replace("^", "");
  740.             data = data.Replace("&", "");
  741.             data = data.Replace("*", "");
  742.             data = data.Replace("(", "");
  743.             data = data.Replace(")", "");
  744.             data = data.Replace("-", "");
  745.             data = data.Replace("_", "");
  746.             data = data.Replace("=", "");
  747.             data = data.Replace("+", "");
  748.             data = data.Replace("|", "");
  749.             data = data.Replace("\", "");
  750.             data = data.Replace("{", "");
  751.             data = data.Replace("}", "");
  752.             data = data.Replace(":", "");
  753.             data = data.Replace(":", "");
  754.             data = data.Replace(""", "");
  755.             data = data.Replace("'", "");
  756.             data = data.Replace("<", "");
  757.             data = data.Replace(">", "");
  758.             data = data.Replace(",", "");
  759.             data = data.Replace(".", "");
  760.             data = data.Replace("/", "");
  761.           //  data = data.Replace("", "");
  762.             data = data.Replace("~", "");
  763.            // data = data.Replace("", "");
  764.             data = data.Replace("`", "");
  765.             data = data.Replace("!", "");
  766.             data = data.Replace("@", "");
  767.             data = data.Replace("#", "");
  768.             data = data.Replace("$", "");
  769.             data = data.Replace("%", "");
  770.             data = data.Replace("︿", "");
  771.             data = data.Replace("&", "");
  772.             data = data.Replace("*", "");
  773.             data = data.Replace("(", "");
  774.             data = data.Replace(")", "");
  775.             data = data.Replace("_", "");
  776.             data = data.Replace("-", "");
  777.             data = data.Replace("+", "");
  778.             data = data.Replace("=", "");
  779.             data = data.Replace("|", "");
  780.             data = data.Replace("\", "");
  781.             data = data.Replace("[", "");
  782.             data = data.Replace("]", "");
  783.             data = data.Replace("{", "");
  784.             data = data.Replace("}", "");
  785.             data = data.Replace(":", "");
  786.             data = data.Replace(";", "");
  787.             data = data.Replace("'", "");
  788.             data = data.Replace(""", "");
  789.             data = data.Replace("`", "");
  790.             data = data.Replace("〃", "");
  791.             data = data.Replace("<", "");
  792.             data = data.Replace(">", "");
  793.             data = data.Replace(",", "");
  794.             data = data.Replace(".", "");
  795.             data = data.Replace("/", "");
  796.             data = data.Replace("?", "");
  797.             //data = data.Replace("", "");
  798.             //data = data.Replace("", "");
  799.             data = data.Trim();
  800.             return data;
  801.         }
  802.         /// <summary>
  803.         /// 把长的数据变为短的 词典 把数据变为《KC+ZD INX=0》  
  804.         /// </summary>
  805.         /// <param name="HM">HTML数据列表</param>
  806.         /// <returns></returns>
  807.         private HTM2SHORT Long2Short(Hashtable HM)
  808.         {
  809.             HTM2SHORT cv = new HTM2SHORT();
  810.             ArrayList i_i_1 = new ArrayList();
  811.             i_i_1.Clear();
  812.             Hashtable i_i_2 = new Hashtable();
  813.             i_i_2.Clear();
  814.             cv.Dict1 = i_i_1 ;
  815.             cv.Dict2 = i_i_1;
  816.             cv.HASH = i_i_2;
  817.           
  818.  
  819.             ArrayList shortDict1 = new ArrayList();
  820.             shortDict1.Clear();
  821.             ArrayList shortDict2 = new ArrayList();
  822.             shortDict2.Clear();
  823.             //取标签
  824.             foreach (System.Collections.DictionaryEntry de2 in HM)
  825.             {
  826.                 string ShortHMONE = de2.Value.ToString();
  827.                 for (int i = 0; i < ShortHMONE.Length - 2; i++)
  828.                 {
  829.                     int ae_1 = ShortHMONE.IndexOf('<', i);
  830.                     int ae_2 = ShortHMONE.IndexOf('>', ae_1 + 1);
  831.                     int ae_3 = ShortHMONE.IndexOf('<', ae_1 + 1);
  832.                     if (ae_1 == -1 | ae_2 == -1)
  833.                     {
  834.                         break;
  835.                     }
  836.                     else
  837.                     {
  838.                         if (ae_3 < ae_2)
  839.                         {
  840.                             i = ae_1 + 1;
  841.                         }
  842.                         else
  843.                         {
  844.                             string ae_str = ShortHMONE.Substring(ae_1 , ae_2 - ae_1+1 );
  845.                             if (ae_str.Length < 10)
  846.                             {
  847.                             }
  848.                             else
  849.                             {
  850.                                 if (shortDict1.Contains(ae_str) == false)
  851.                                 {
  852.                                     shortDict1.Add(ae_str);  //得到一个公共的词典
  853.                                 }
  854.                             }
  855.                             i = ae_2;
  856.                         }
  857.                     }
  858.                 }
  859.             }
  860.             // 取标签外 
  861.             foreach (System.Collections.DictionaryEntry de2 in HM)
  862.             {
  863.                 string ShortHMONE = de2.Value.ToString();
  864.                 for (int i = 0; i < ShortHMONE.Length-2; i++)
  865.                 {
  866.                     int ae_1 = ShortHMONE.IndexOf('>',i);
  867.                     int ae_2 = ShortHMONE.IndexOf('<', ae_1 + 1);
  868.                     int ae_3 = ShortHMONE.IndexOf('>', ae_1 + 1);
  869.                     if (ae_1 == -1 | ae_2 == -1  )
  870.                     {
  871.                         break;
  872.                     }
  873.                     else
  874.                     {
  875.                         if (ae_3 < ae_2)
  876.                         {
  877.                             i = ae_1+1;
  878.                         }
  879.                         else
  880.                         {
  881.                             string ae_str = ShortHMONE.Substring(ae_1 + 1, ae_2 - ae_1 - 1);
  882.                             if (ae_str.Length < 10)
  883.                             {
  884.                             }
  885.                             else
  886.                             {
  887.                                 if (shortDict2.Contains(ae_str) == false)
  888.                                 {
  889.                                     shortDict2.Add(ae_str);  //得到一个公共的词典
  890.                                 }
  891.                             }
  892.                             i = ae_2;
  893.                         }
  894.                         
  895.                     }
  896.                 }
  897.             }
  898.             int short_int1 = shortDict1.Count;
  899.            //对词典按照长度进行排序
  900.             string[] ae_ss1 = new string[short_int1];
  901.             for (int i = 0; i < short_int1; i++)
  902.             {
  903.                 ae_ss1[i] = shortDict1[i].ToString();
  904.             }
  905.             for (int i = 0; i < short_int1; i++)
  906.             {
  907.                 for (int j = i; j < short_int1; j++)
  908.                 {
  909.                     if (ae_ss1[i].Length < ae_ss1[j].Length)
  910.                     {
  911.                         string tmp_one = ae_ss1[j];
  912.                         ae_ss1[j] = ae_ss1[i];
  913.                         ae_ss1[i]  = tmp_one;                
  914.                     }
  915.                 }          
  916.             }
  917.             shortDict1.Clear();
  918.             int short_int2 = shortDict2.Count;
  919.             //对词典按照长度进行排序
  920.             string[] ae_ss2 = new string[short_int2];
  921.             for (int i = 0; i < short_int2; i++)
  922.             {
  923.                 ae_ss2[i] = shortDict2[i].ToString();
  924.             }
  925.             for (int i = 0; i < short_int2; i++)
  926.             {
  927.                 for (int j = i; j < short_int2; j++)
  928.                 {
  929.                     if (ae_ss2[i].Length < ae_ss2[j].Length)
  930.                     {
  931.                         string tmp_one = ae_ss2[j];
  932.                         ae_ss2[j] = ae_ss2[i];
  933.                         ae_ss2[i] = tmp_one;
  934.                     }
  935.                 }
  936.             }
  937.             shortDict2.Clear();
  938.             for (int i = 0; i < short_int1; i++)
  939.             {
  940.                 shortDict1.Add(ae_ss1[i]);
  941.             }
  942.             for (int i = 0; i < short_int2; i++)
  943.             {
  944.                 shortDict2.Add(ae_ss2[i]);
  945.             }
  946.             Hashtable new_HM = new Hashtable();
  947.             new_HM.Clear();
  948.             foreach (System.Collections.DictionaryEntry de2 in HM)
  949.             {
  950.                 string a_onre = de2.Value.ToString();
  951.                 for (int u = 0; u < short_int1; u++)
  952.                 {
  953.                     a_onre = a_onre.Replace(shortDict1[u].ToString(), "<["+ u.ToString() +")>");
  954.                 }
  955.                 for (int u = 0; u < short_int2; u++)
  956.                 {
  957.                     a_onre = a_onre.Replace(shortDict2[u].ToString(), "《[" + u.ToString() + ")》");
  958.                 }
  959.                 new_HM.Add(de2.Key, a_onre);
  960.             }
  961.             cv.Dict1 = shortDict1;
  962.             cv.Dict2 = shortDict2; 
  963.             cv.HASH = new_HM;
  964.             return cv;
  965.         }
  966.         /// <summary>
  967.         /// 还原压缩过的单个数据
  968.         /// </summary>
  969.         /// <param name="dat"></param>
  970.         /// <param name="dict"></param>
  971.         /// <returns></returns>
  972.         private string Short2Long(string dat, ArrayList dict1,ArrayList dict2)
  973.         {
  974.             for (int u = 0; u < dict1.Count; u++)
  975.             {
  976.                 dat = dat.Replace( "<[" + u.ToString() + ")>" ,dict1[u].ToString() );
  977.             }
  978.             for (int u = 0; u < dict2.Count; u++)
  979.             {
  980.                 dat = dat.Replace("《[" + u.ToString() + ")》", dict2[u].ToString());
  981.             }
  982.             return dat;
  983.         }
  984.     }
  985. }