AutoBuildModelClass.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:36k
- using System;
- using System.Collections.Generic;
- using System.Text;
- using System.Collections;
- using System.Threading;
- using System.IO;
- namespace XunLong.AutoModelBuilder
- {
- /*
- ' 迅龙中文分类搜索引擎 v0.6
- '
- ' LGPL 许可发行
- '
- ' 宁夏大学 张冬 康彩 zd4004@163.com
- '
- ' 官网 http://blog.163.com/zd4004/
- */
- /// <summary>
- /// 自动建立模板类
- /// </summary>
- class AutoBuildModelClass
- {
/// <summary>
/// Result of one URL-sampling pass: the sampled URLs plus a number describing the pass.
/// </summary>
struct CaiYangShuJu
{
    /// <summary>
    /// The sampled URLs. GetADCUrlList2 sets this to null when no usable sample exists.
    /// </summary>
    public ArrayList ListOne;

    /// <summary>
    /// Described by the original author as "the priority used for this sampling".
    /// GetADCUrlList stores the sample count here; GetADCUrlList2 stores the chosen
    /// similarity level.
    /// </summary>
    public int TNUM;
}
/// <summary>
/// Output of Long2Short: the shortened pages together with the two dictionaries
/// needed by Short2Long to expand them again.
/// </summary>
struct HTM2SHORT
{
    /// <summary>
    /// Dictionary of long HTML tags; entry n is referenced by the placeholder "&lt;[n)&gt;".
    /// </summary>
    public ArrayList Dict1;

    /// <summary>
    /// Dictionary of long text runs found between tags; entry n is referenced by
    /// the placeholder "《[n)》".
    /// </summary>
    public ArrayList Dict2;

    /// <summary>
    /// The transformed pages: same keys as the input table, values with dictionary
    /// entries replaced by placeholders.
    /// </summary>
    public Hashtable HASH;
}
/// <summary>
/// Number of templates built so far; TMM increments it after each successful build.
/// </summary>
private int xnum = 0;

// Example location left by the original author. The path separators appear to have
// been lost in transit; presumably D:\XunLong\XunLong.BIN\SpiderWEB.
/// <summary>
/// Path of the page store; assigned from Init's filePath argument.
/// </summary>
private string aPath = "";

/// <summary>
/// Directory that receives the generated template files; assigned from Init's c0 argument.
/// </summary>
private string cPath = "";

/// <summary>
/// Whether a new template build may start. TMM sets it to false on entry and back
/// to true when it finishes; Run waits on it.
/// </summary>
private bool isSURE = true;

/// <summary>
/// Hand-off slot for the worker thread: Run stores the sampled pages (URL -> cleaned
/// HTML) here before starting TMM.
/// </summary>
private Hashtable TMMS = new Hashtable();

/// <summary>
/// Page store client, used to list URLs and fetch page content.
/// </summary>
private NetHashTableAPI.ClassNHT db = new NetHashTableAPI.ClassNHT();

/// <summary>
/// Text encoder applied to candidate labels in EditOneModelTag.
/// </summary>
private XunLong.HtmlClassLibrary.ClassTXT2IDAT CodeIy = new XunLong.HtmlClassLibrary.ClassTXT2IDAT();

/// <summary>
/// HTML cleaner applied to every sampled page in Run.
/// </summary>
private XunLong.HtmlClassLibrary.ClassHTML myHTML2CLEAR = new XunLong.HtmlClassLibrary.ClassHTML();

/// <summary>
/// URLs for which a template has already been generated.
/// </summary>
private ArrayList olgurl = new ArrayList();

/// <summary>
/// All URLs known to the page store; samples are drawn from this list.
/// </summary>
private ArrayList LIST = new ArrayList();

/// <summary>
/// URLs that still need a template.
/// </summary>
private ArrayList neeDurl = new ArrayList();
// Design notes from the original author:
// 1. Read the URL list and extract features, distinguishing (a) different dynamic
//    pages, (b) structurally similar static pages, (c) pages with a similar
//    parameter structure.
//    Whenever a set holds more than 160 entries a template is built, sampling 80 of
//    them. A template is named after the MD5 of its data, which has the benefit of
//    eliminating exactly identical templates.
// 2. Template analysis: for each '*' in the template, find the first element in front
//    of it that is not an HTML tag, e.g. <S>XX<4545><xxxx>*<xxsss>. Match the 80
//    source samples automatically to obtain the average data length of each element.
//    Elements with an average length below 20 go into the clustering template, then
//    all of them go into the search template; elements with an average length of 0
//    are dropped.
/// <summary>
/// Fully automatic mode. Not implemented: the method body is empty.
/// The original note suggests seeding with <c>new Random(Environment.TickCount)</c>.
/// </summary>
public void AutoRun()
{
}
/// <summary>
/// Semi-automatic mode: builds one template for every URL in <c>neeDurl</c>.
/// </summary>
/// <remarks>
/// For each seed URL the method samples similar URLs, records the seed as processed,
/// loads and cleans the sampled pages, then runs <see cref="TMM"/> on a worker thread
/// and waits for it (at most about 300 seconds). A worker that times out is not
/// aborted; it keeps running in the background while the next seed is processed.
/// </remarks>
public void Run()
{
    isSURE = true;
    foreach (string a_a in neeDurl)
    {
        CaiYangShuJu CaiYang = GetADCUrlList(a_a);
        ArrayList urlList = CaiYang.ListOne;

        // Record the seed as processed before building, so it is not retried on the
        // next start even when the build below fails.
        putmOLDMODELSOURCEFileData(XunLong.CongifData.Config.modelSourceOLD, a_a);
        if (urlList == null)
        {
            continue;
        }

        // Load and clean every sampled page; a single unusable page discards the seed.
        Hashtable htms = new Hashtable();
        bool samplesUsable = true;
        foreach (string akcc in urlList)
        {
            string htmData = db.Value(akcc);
            htmData = myHTML2CLEAR.HTML2CLEAR(htmData, akcc);
            if (htmData == null || htmData.Length < 48)
            {
                samplesUsable = false;
                break;
            }
            // Indexer instead of Add: a URL listed twice must not abort the whole run.
            htms[akcc] = htmData;
        }
        if (!samplesUsable)
        {
            continue;
        }

        Console.WriteLine("-开始建立模板-> " + a_a);
        TMMS = htms;

        // Mark "busy" BEFORE starting the worker. Setting it afterwards could overwrite
        // the worker's own completion signal and stall this loop until the timeout.
        isSURE = false;
        Thread T1 = new Thread(new ThreadStart(TMM));
        T1.Start();

        // Wait on the thread itself rather than on the shared flag, so that a stale
        // worker from an earlier timed-out iteration cannot end this wait early.
        int LooPNUm = 0;
        while (!T1.Join(1000))
        {
            LooPNUm = LooPNUm + 1;
            Console.Write(">");
            if (LooPNUm > 300)
            {
                // No response for more than 300 seconds: give up on this worker.
                Console.WriteLine("=-> TimeOUT Start New T1");
                break;
            }
        }
        isSURE = true;
    }
    Console.WriteLine("建模工作全部完成");
}
/// <summary>
/// Worker-thread entry point: builds one template from the pages in <c>TMMS</c> and
/// writes the resulting template files into <c>cPath</c>.
/// </summary>
/// <remarks>
/// Steps: shorten the pages (Long2Short), let the model builder derive a template,
/// expand it again (Short2Long), label every '*' slot (EditOneModelTag), match all
/// sampled pages against the template to find out which slots ever capture data, and
/// finally write the files "&lt;md5&gt;.a/.b/.c/.d/.e/.t/.h/.s".
/// <c>isSURE</c> is false while the method runs and is always set back to true on
/// exit, including on failure.
/// </remarks>
private void TMM()
{
    // A build is in progress.
    isSURE = false;
    try
    {
        // Snapshot the hand-off slot; Run may replace TMMS for the next seed.
        Hashtable TMMSOne = TMMS;
        HTM2SHORT packed = Long2Short(TMMSOne);

        // Content of the ".e" file: the sampled URLs, one per line.
        System.Text.StringBuilder dataE = new System.Text.StringBuilder();
        foreach (System.Collections.DictionaryEntry sample in TMMSOne)
        {
            dataE.Append(sample.Key.ToString()).Append("\r\n");
        }

        XunLong.ModelBuilder.ClassModelBuilder myBU = new XunLong.ModelBuilder.ClassModelBuilder();
        string ModelData;
        try
        {
            ModelData = myBU.BuilderModel(packed.HASH);
            // Expand the dictionary placeholders back into the original text.
            ModelData = Short2Long(ModelData, packed.Dict1, packed.Dict2);
        }
        catch (Exception ex)
        {
            Console.WriteLine("--> ERROR 排除错误模版");
            Console.WriteLine(ex.Message);
            return;
        }

        // A template that is too short or has fewer than four '*' slots is rejected.
        if (ModelData == null || ModelData.Length < 150 || myBU.inStrNum(ModelData, "*") < 4)
        {
            Console.WriteLine("--> 排除错误模版");
            return;
        }

        // One label (possibly empty) per '*'-separated template piece.
        ArrayList neWW = EditOneModelTag(ModelData);

        // Concatenated captures per slot across all samples; an empty entry means the
        // slot never captured anything.
        string[] pipeiEndData = new string[neWW.Count];
        for (int i = 0; i < neWW.Count; i++)
        {
            pipeiEndData[i] = "";
        }
        foreach (System.Collections.DictionaryEntry sample in TMMSOne)
        {
            XunLong.ModelUserClassLibrary.ClassUserModel m = new XunLong.ModelUserClassLibrary.ClassUserModel();
            m.TestModeL("", "", "", ModelData, "", "", "", "");
            // The return value is not needed; the call is kept because modelOneList
            // is read right after it (presumably populated by this call).
            m.getTagAndData(sample.Value.ToString());
            Hashtable p = m.modelOneList;
            foreach (System.Collections.DictionaryEntry de in p)
            {
                int pi = (int)de.Key;
                pipeiEndData[pi] = pipeiEndData[pi] + de.Value.ToString().Trim();
            }
        }

        System.Text.StringBuilder dataA = new System.Text.StringBuilder();
        System.Text.StringBuilder dataB = new System.Text.StringBuilder();
        string TagTmp = "属性"; // most recent usable tag name, reused for unlabeled slots
        for (int i = 0; i < neWW.Count; i++)
        {
            if (pipeiEndData[i].Length == 0)
            {
                continue;
            }
            string label = neWW[i].ToString();
            string slot = "<TAGDATA INDEX=" + i.ToString() + "/>";
            dataA.Append(label).Append(":").Append(slot).Append("\r\n");

            string tag;
            if (label.Length > 1)
            {
                if (label.IndexOf(' ') == -1 && label.Length < 13)
                {
                    tag = label;
                }
                else
                {
                    // Long or multi-word label: use its last word as the tag name.
                    string[] words = label.Split(' ');
                    tag = words[words.Length - 1];
                }
                TagTmp = tag;
            }
            else
            {
                // No usable label in front of this slot: file it under the previous tag.
                if (TagTmp.Length == 0)
                {
                    continue;
                }
                tag = TagTmp;
            }
            dataB.Append("<").Append(tag).Append(">").Append(slot)
                 .Append("</").Append(tag).Append(">").Append("\r\n");
        }

        string dataAText = dataA.ToString();
        string dataBText = "<xl主类别>HTM</xl主类别>" + "\r\n" + dataB.ToString();
        string dataT = "<TAGDATA INDEX=1/>";

        // Files are named after the MD5 of the template, so identical templates
        // overwrite each other instead of piling up.
        string x = cPath + "\\" + getMD5name(ModelData);
        putFileData(x + ".a", dataAText);
        putFileData(x + ".b", dataBText);
        putFileData(x + ".c", "");
        putFileData(x + ".d", ModelData);
        putFileData(x + ".e", dataE.ToString());
        putFileData(x + ".t", dataT);
        putFileData(x + ".h", dataAText);
        putFileData(x + ".s", dataBText);

        Console.WriteLine("-模版建立成功->" + x);
        xnum = xnum + 1;
        Console.WriteLine("-全部模版数量-> " + xnum.ToString());
    }
    catch (Exception ex)
    {
        // Best effort: one failed template must not take the process down, but the
        // reason is reported instead of being silently dropped.
        Console.WriteLine("-模版建立F->" + ex.Message);
    }
    finally
    {
        // Allow the next build on every exit path.
        isSURE = true;
    }
}
/// <summary>
/// Tells whether a string consists solely of the ASCII digits '0'..'9'.
/// </summary>
/// <param name="data">Text to test; may be null.</param>
/// <returns>
/// true when <paramref name="data"/> is non-empty and every character is an ASCII
/// digit; false for null, for the empty string and for anything else.
/// </returns>
private bool isNum(string data)
{
    // IsNullOrEmpty short-circuits; the former "data==null | data.Length == 0"
    // evaluated both operands and threw on null.
    if (string.IsNullOrEmpty(data))
    {
        return false;
    }
    foreach (char a in data)
    {
        if (a < '0' || a > '9')
        {
            return false;
        }
    }
    return true;
}
/// <summary>
/// Derives a label for every '*'-separated piece of a template.
/// </summary>
/// <remarks>
/// The template is split at '*'. A piece is considered only when it is longer than
/// the average piece length; its HTML tags and punctuation are stripped, the result is
/// passed through <c>CodeIy.stringcode</c>, and it is kept as a label when it is
/// shorter than 16 characters and not purely numeric. Every other piece gets "".
/// </remarks>
/// <param name="c">Template text containing '*' placeholders.</param>
/// <returns>One string per piece, in template order (an empty string = no label).</returns>
private ArrayList EditOneModelTag(string c)
{
    string[] pieces = c.Split('*');

    // Integer average of the piece lengths; Split always yields at least one piece.
    int total = 0;
    for (int k = 0; k < pieces.Length; k++)
    {
        total += pieces[k].Length;
    }
    int average = total / pieces.Length;

    ArrayList labels = new ArrayList();
    foreach (string piece in pieces)
    {
        string label = "";
        if (piece.Length > 0 && piece.Length > average)
        {
            string candidate = CodeIy.stringcode(CCxmlTag(GetClearTag(piece)));
            if (candidate.Length < 16 && !isNum(candidate))
            {
                label = candidate;
            }
        }
        labels.Add(label);
    }
    return labels;
}
/// <summary>
/// Removes HTML tags from a fragment, e.g.
/// <c>&lt;/td&gt;&lt;td class="x"&gt;Name:&lt;/td&gt;</c> becomes <c>Name:</c>.
/// </summary>
/// <remarks>
/// Repeatedly deletes the span from the first '&lt;' to the next '&gt;'. Stripping stops
/// when the first remaining '&lt;' is the last character or has no '&gt;' after it; the
/// rest of the text is then returned unchanged.
/// </remarks>
/// <param name="dat">HTML fragment; may be null or empty.</param>
/// <returns>The fragment without tags; an empty string for null input.</returns>
private string GetClearTag(string dat)
{
    if (string.IsNullOrEmpty(dat))
    {
        return string.Empty;
    }

    // Loop until nothing is left to strip. The former fixed budget of Length / 4
    // passes left tags behind when tags were short: "<p><b><i>" came back as "<i>".
    while (true)
    {
        int open = dat.IndexOf('<');
        if (open == -1 || open == dat.Length - 1)
        {
            break;
        }
        int close = dat.IndexOf('>', open + 1);
        if (close == -1)
        {
            break;
        }
        dat = dat.Substring(0, open) + dat.Substring(close + 1);
    }
    return dat;
}
/// <summary>
/// Samples URLs from <c>LIST</c> that are similar to a seed URL.
/// </summary>
/// <remarks>
/// A URL qualifies when its <c>Url2Url</c> score against the seed is at least 12.
/// Scanning stops once 25 URLs have been collected.
/// </remarks>
/// <param name="aurl">Seed URL.</param>
/// <returns>
/// The sample (never null, possibly empty) with <c>TNUM</c> set to the number of
/// collected URLs.
/// </returns>
private CaiYangShuJu GetADCUrlList(string aurl)
{
    CaiYangShuJu sample = new CaiYangShuJu();
    sample.ListOne = new ArrayList();

    foreach (string candidate in LIST)
    {
        int score = XunLong.UrlStringLib.ClassUrlString.Url2Url(aurl, candidate);
        if (score < 12)
        {
            continue;
        }
        sample.ListOne.Add(candidate);
        if (sample.ListOne.Count > 24)
        {
            break;
        }
    }

    sample.TNUM = sample.ListOne.Count;
    return sample;
}
/// <summary>
/// Alternative sampler: picks the highest similarity level shared by enough URLs and
/// samples URLs at exactly that level.
/// </summary>
/// <remarks>
/// First a histogram of <c>Url2Url</c> scores (0..50) against the seed is built over
/// <c>LIST</c>. The highest level from 50 down to 11 that more than 64 URLs share is
/// selected; if none is found, or the level is below 12, the result carries a null
/// list. Otherwise up to 24 URLs scoring exactly that level are returned.
/// </remarks>
/// <param name="aurl">Seed URL.</param>
/// <returns>
/// The sample with <c>TNUM</c> set to the selected level, or <c>ListOne == null</c>
/// and <c>TNUM == 0</c> when no level qualifies.
/// </returns>
private CaiYangShuJu GetADCUrlList2(string aurl)
{
    // Histogram of similarity scores.
    int[] ValNum = new int[51];
    foreach (string b in LIST)
    {
        int n1 = XunLong.UrlStringLib.ClassUrlString.Url2Url(aurl, b);
        // The score comes from an external helper; ignore values outside the
        // histogram instead of failing with IndexOutOfRangeException.
        if (n1 >= 0 && n1 < ValNum.Length)
        {
            ValNum[n1] = ValNum[n1] + 1;
        }
    }

    // Highest level that is populated well enough.
    int XXX = 0;
    for (int i = 50; i > 10; i--)
    {
        if (ValNum[i] > 64)
        {
            XXX = i;
            break;
        }
    }
    if (XXX < 12)
    {
        CaiYangShuJu oneBad = new CaiYangShuJu();
        oneBad.ListOne = null;
        oneBad.TNUM = 0;
        return oneBad;
    }

    // Collect up to 24 URLs at exactly the selected level.
    ArrayList x = new ArrayList();
    foreach (string b in LIST)
    {
        int n1 = XunLong.UrlStringLib.ClassUrlString.Url2Url(aurl, b);
        if (n1 == XXX)
        {
            x.Add(b);
            if (x.Count >= 24)
            {
                break;
            }
        }
    }

    CaiYangShuJu one = new CaiYangShuJu();
    one.ListOne = x;
    one.TNUM = XXX;
    return one;
}
/// <summary>
/// Prepares a run: loads the configuration, opens the page store, reads the list of
/// already processed URLs and the list of URLs to process, and removes the former
/// from the latter.
/// </summary>
/// <param name="filePath">Path of the page store.</param>
/// <param name="b0">Not used by this method.</param>
/// <param name="c0">Directory that receives the generated template files.</param>
/// <param name="k_c_path">Path passed to the configuration loader and to the page store.</param>
public void Init(string filePath,string b0,string c0,string k_c_path )
{
    XunLong.CongifData.Config.InitConfigData(k_c_path);

    aPath = filePath;
    cPath = c0;

    db.SetClassNHT(filePath, 3145727, k_c_path);
    LIST = db.SearchOneList("http");

    olgurl.Clear();
    neeDurl.Clear();
    initolgurl(XunLong.CongifData.Config.modelSourceOLD);
    initNEEDURL(XunLong.CongifData.Config.modelSource);

    int configured = neeDurl.Count;
    int alreadyDone = olgurl.Count;
    Console.WriteLine("设置列表中共有 " + configured.ToString() + " 条需要建模的地址");
    Console.WriteLine("已经建模的有 " + alreadyDone.ToString() + " 条");

    // Drop every URL that already has a template; Remove is a no-op for absent items.
    foreach (string done in olgurl)
    {
        neeDurl.Remove(done);
    }

    int remaining = neeDurl.Count;
    Console.WriteLine("本次任务中共有 " + remaining.ToString() + " 条数据需要建模");
}
/// <summary>
/// Returns the MD5 digest of a string as 32 uppercase hexadecimal characters; used as
/// the base file name of a template.
/// </summary>
/// <remarks>
/// Produces the same value as the obsolete
/// <c>FormsAuthentication.HashPasswordForStoringInConfigFile(data, "md5")</c>
/// (UTF-8 bytes, uppercase hex) without depending on System.Web, so existing template
/// file names stay valid. MD5 serves only as a content fingerprint here, not as a
/// security measure.
/// </remarks>
/// <param name="data">Text to hash; must not be null.</param>
/// <returns>The uppercase hexadecimal MD5 digest.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="data"/> is null.</exception>
private string getMD5name(string data)
{
    byte[] digest;
    using (System.Security.Cryptography.MD5 md5 = System.Security.Cryptography.MD5.Create())
    {
        digest = md5.ComputeHash(System.Text.Encoding.UTF8.GetBytes(data));
    }

    System.Text.StringBuilder hex = new System.Text.StringBuilder(digest.Length * 2);
    foreach (byte b in digest)
    {
        hex.Append(b.ToString("X2"));
    }
    return hex.ToString();
}
/// <summary>
/// Reads a whole text file using the gb2312 encoding.
/// </summary>
/// <param name="filename">Path of the file to read.</param>
/// <returns>
/// The file content, or an empty string when an I/O error occurs (the error message
/// is written to the console).
/// </returns>
private static string getFileData(string filename)
{
    try
    {
        // The using block closes the reader on every path.
        using (StreamReader reader = new StreamReader(filename, System.Text.Encoding.GetEncoding("gb2312")))
        {
            return reader.ReadToEnd();
        }
    }
    catch (IOException e)
    {
        Console.WriteLine(e.Message);
        return "";
    }
}
/// <summary>
/// Writes text to a file using the gb2312 encoding, replacing any existing content.
/// </summary>
/// <param name="filename">Path of the file to write.</param>
/// <param name="data">Text to write.</param>
/// <remarks>I/O errors are reported on the console and otherwise ignored.</remarks>
private void putFileData(string filename, string data)
{
    try
    {
        // append = false: overwrite. The using block closes the writer on every path.
        using (StreamWriter writer = new StreamWriter(filename, false, System.Text.Encoding.GetEncoding("gb2312")))
        {
            writer.Write(data);
        }
    }
    catch (IOException e)
    {
        Console.WriteLine(e.Message);
    }
}
/// <summary>
/// Appends one entry (one line) to the list file of already processed URLs.
/// </summary>
/// <param name="okPath">Path of the list file; it is opened in append mode.</param>
/// <param name="data">Entry to append; a line terminator is added.</param>
/// <remarks>I/O errors are reported on the console and otherwise ignored.</remarks>
public void putmOLDMODELSOURCEFileData(string okPath, string data)
{
    try
    {
        // append = true. The using block closes the writer on every path.
        using (StreamWriter writer = new StreamWriter(okPath, true, System.Text.Encoding.GetEncoding("gb2312")))
        {
            writer.WriteLine(data);
        }
    }
    catch (IOException e)
    {
        Console.WriteLine(e.Message);
    }
}
/// <summary>
/// Loads the URLs that already have a template into <c>olgurl</c>.
/// </summary>
/// <param name="okPath">Path of a gb2312 text file with one URL per line.</param>
/// <remarks>
/// Empty lines and duplicates are skipped. Entries already in <c>olgurl</c> are kept;
/// the list is not cleared here. I/O errors are reported on the console and
/// otherwise ignored.
/// </remarks>
public void initolgurl(string okPath)
{
    try
    {
        using (StreamReader reader = new StreamReader(okPath, System.Text.Encoding.GetEncoding("gb2312")))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (line.Length > 0 && !olgurl.Contains(line))
                {
                    olgurl.Add(line);
                }
            }
        }
    }
    catch (IOException e)
    {
        Console.WriteLine(e.Message);
    }
}
/// <summary>
/// Loads the URLs that need a template into <c>neeDurl</c>, replacing its content.
/// </summary>
/// <param name="okPath">Path of a gb2312 text file with one URL per line.</param>
/// <remarks>
/// Empty lines and duplicates are skipped. Afterwards the list is reordered: URLs
/// containing '?' come first, then URLs without '?', each group in file order. A line
/// that starts with '?' belongs to neither group and is dropped. I/O errors are
/// reported on the console; whatever was read up to that point is still used.
/// </remarks>
public void initNEEDURL(string okPath)
{
    neeDurl.Clear();
    try
    {
        using (StreamReader reader = new StreamReader(okPath, System.Text.Encoding.GetEncoding("gb2312")))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (line.Length > 0 && !neeDurl.Contains(line))
                {
                    neeDurl.Add(line);
                }
            }
        }
    }
    catch (IOException e)
    {
        Console.WriteLine(e.Message);
    }

    // Partition in a single pass: URLs with a query string first, the rest after.
    ArrayList withQuery = new ArrayList();
    ArrayList withoutQuery = new ArrayList();
    foreach (string url in neeDurl)
    {
        int mark = url.IndexOf('?');
        if (mark > 0)
        {
            withQuery.Add(url);
        }
        else if (mark == -1)
        {
            withoutQuery.Add(url);
        }
    }
    withQuery.AddRange(withoutQuery);
    neeDurl = withQuery;
}
/// <summary>
/// Strips punctuation and spaces from a candidate label so that it can be used as a
/// tag name.
/// </summary>
/// <remarks>
/// Every occurrence of the characters listed in the body is removed, then leading and
/// trailing whitespace is trimmed. The list is the union of the characters the former
/// chain of Replace calls removed. NOTE(review): that chain contained many exact
/// duplicates (for example ":" several times), which suggests that full-width variants
/// were lost when the file was transcoded; confirm against the original source before
/// adding any.
/// </remarks>
/// <param name="data">Candidate label; may be null.</param>
/// <returns>The cleaned label; an empty string for null or empty input.</returns>
private string CCxmlTag(string data)
{
    if (string.IsNullOrEmpty(data))
    {
        return "";
    }

    // All removals are single characters, so one filtering pass is equivalent to the
    // former sequence of Replace calls and independent of their order.
    const string removed = "【】 /<>;‘“”.-。|+*@:?λ[]↓~`!#$%^&()_=\\{}\"',︿〃";

    System.Text.StringBuilder kept = new System.Text.StringBuilder(data.Length);
    foreach (char ch in data)
    {
        if (removed.IndexOf(ch) == -1)
        {
            kept.Append(ch);
        }
    }
    return kept.ToString().Trim();
}
/// <summary>
/// Shortens a set of HTML pages by replacing long tags and long text runs with short
/// dictionary placeholders.
/// </summary>
/// <remarks>
/// Two dictionaries are collected over all pages: tags ("&lt;...&gt;") and text runs
/// between tags, in both cases only entries of at least 10 characters. Each dictionary
/// is ordered longest first, so that an entry is replaced before any shorter entry it
/// contains. In every page tag n becomes "&lt;[n)&gt;" and text run n becomes "《[n)》".
/// Short2Long performs the reverse mapping.
/// </remarks>
/// <param name="HM">Pages to shorten: key = URL, value = HTML text.</param>
/// <returns>The two dictionaries and a new table with the shortened pages.</returns>
private HTM2SHORT Long2Short(Hashtable HM)
{
    ArrayList shortDict1 = new ArrayList(); // tags
    ArrayList shortDict2 = new ArrayList(); // text between tags

    foreach (System.Collections.DictionaryEntry page in HM)
    {
        string html = page.Value.ToString();
        CollectSegments(html, '<', '>', true, shortDict1);
        CollectSegments(html, '>', '<', false, shortDict2);
    }

    SortByLengthDescending(shortDict1);
    SortByLengthDescending(shortDict2);

    Hashtable new_HM = new Hashtable();
    foreach (System.Collections.DictionaryEntry page in HM)
    {
        string text = page.Value.ToString();
        for (int u = 0; u < shortDict1.Count; u++)
        {
            text = text.Replace(shortDict1[u].ToString(), "<[" + u.ToString() + ")>");
        }
        for (int u = 0; u < shortDict2.Count; u++)
        {
            text = text.Replace(shortDict2[u].ToString(), "《[" + u.ToString() + ")》");
        }
        new_HM.Add(page.Key, text);
    }

    HTM2SHORT cv = new HTM2SHORT();
    cv.Dict1 = shortDict1;
    cv.Dict2 = shortDict2;
    cv.HASH = new_HM;
    return cv;
}

/// <summary>
/// Adds to <paramref name="dict"/> every not yet listed segment of at least 10
/// characters that runs from a <paramref name="first"/> character to the next
/// <paramref name="second"/> character.
/// </summary>
/// <param name="text">Text to scan.</param>
/// <param name="first">Character that opens a segment.</param>
/// <param name="second">Character that closes a segment.</param>
/// <param name="inclusive">true to keep both delimiters in the segment, false to drop them.</param>
/// <param name="dict">Dictionary that receives new segments.</param>
private static void CollectSegments(string text, char first, char second, bool inclusive, ArrayList dict)
{
    for (int i = 0; i < text.Length - 2; i++)
    {
        int start = text.IndexOf(first, i);
        if (start == -1)
        {
            break;
        }
        int end = text.IndexOf(second, start + 1);
        if (end == -1)
        {
            break;
        }

        // Another opener before the closer means this opener is malformed: skip it.
        // "No further opener" (-1) must not count as "before"; that comparison used
        // to drop the last segment of every page.
        int nextStart = text.IndexOf(first, start + 1);
        if (nextStart != -1 && nextStart < end)
        {
            i = start + 1;
            continue;
        }

        string segment = inclusive
            ? text.Substring(start, end - start + 1)
            : text.Substring(start + 1, end - start - 1);
        if (segment.Length >= 10 && !dict.Contains(segment))
        {
            dict.Add(segment);
        }
        i = end;
    }
}

/// <summary>
/// Orders dictionary entries in place, longest first. The exchange scheme of the
/// original code is kept so that placeholder numbering does not change.
/// </summary>
/// <param name="entries">Entries to order.</param>
private static void SortByLengthDescending(ArrayList entries)
{
    for (int i = 0; i < entries.Count; i++)
    {
        for (int j = i + 1; j < entries.Count; j++)
        {
            if (entries[i].ToString().Length < entries[j].ToString().Length)
            {
                object longer = entries[j];
                entries[j] = entries[i];
                entries[i] = longer;
            }
        }
    }
}
/// <summary>
/// Expands the placeholders produced by Long2Short back into their dictionary entries.
/// </summary>
/// <param name="dat">Text containing "&lt;[n)&gt;" and "《[n)》" placeholders.</param>
/// <param name="dict1">Tag dictionary; entry n replaces "&lt;[n)&gt;".</param>
/// <param name="dict2">Text dictionary; entry n replaces "《[n)》".</param>
/// <returns>The expanded text.</returns>
private string Short2Long(string dat, ArrayList dict1,ArrayList dict2)
{
    // Placeholder shapes, paired with the dictionary that resolves each of them.
    string[] prefixes = new string[] { "<[", "《[" };
    string[] suffixes = new string[] { ")>", ")》" };
    ArrayList[] dictionaries = new ArrayList[] { dict1, dict2 };

    for (int kind = 0; kind < dictionaries.Length; kind++)
    {
        ArrayList entries = dictionaries[kind];
        for (int index = 0; index < entries.Count; index++)
        {
            string placeholder = prefixes[kind] + index.ToString() + suffixes[kind];
            dat = dat.Replace(placeholder, entries[index].ToString());
        }
    }
    return dat;
}
- }
- }