FormHTMLMODEL.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:20k
- using System;
- using System.Collections.Generic;
- using System.ComponentModel;
- using System.Data;
- using System.Drawing;
- using System.Text;
- using System.Windows.Forms;
- using System.Collections;
- using System.IO;
- using System.Text;
- using System.Collections;
- using System.Threading;
- using System.Net;
- /*
- ' 迅龙中文分类搜索引擎 v0.6
- '
- ' LGPL 许可发行
- '
- ' 宁夏大学 张冬 康彩 zd4004@163.com
- '
- ' 官网 http://blog.163.com/zd4004/
- */
- namespace XunLong.HTMLMODEL.TEST
- {
- public partial class FormHTMLMODEL : Form
- {
/// <summary>
/// File-system storage object used by the handlers of this form.
/// </summary>
private NetHashTableAPI.ClassNHT db = new NetHashTableAPI.ClassNHT();

/// <summary>
/// HTML helper; its HTML2CLEAR and Str2Str methods are called by this form.
/// </summary>
private XunLong.HtmlClassLibrary.ClassHTML myHTML2CLEAR = new XunLong.HtmlClassLibrary.ClassHTML();

/// <summary>
/// Public path string. It starts out empty and is not assigned anywhere in the visible code.
/// </summary>
public string PathIT = "";

/// <summary>
/// Creates the form, displays <paramref name="DPATH"/> in textBox5 and hands the same
/// value to <c>Config.InitConfigData</c>.
/// </summary>
/// <param name="DPATH">
/// Value forwarded to <c>InitConfigData</c> (presumably the configuration file path - confirm with callers).
/// </param>
public FormHTMLMODEL(string DPATH)
{
    InitializeComponent();

    string configSource = DPATH;
    textBox5.Text = configSource;
    XunLong.CongifData.Config.InitConfigData(configSource);
}
/// <summary>
/// Downloads the page whose address is typed in textBox4, runs it through
/// <c>HTML2CLEAR</c> and shows the result in textBox9.
/// Does nothing when the address box is blank.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button2_Click(object sender, EventArgs e)
{
    bool hasAddress = textBox4.Text.Trim().Length != 0;
    if (hasAddress)
    {
        // The download uses "gb2312" as the fallback character set.
        string rawPage = GetOneHTML(textBox4.Text, "gb2312");
        string cleaned = myHTML2CLEAR.HTML2CLEAR(rawPage, textBox4.Text);

        textBox9.Text = cleaned;
        MessageBox.Show("数据读取完成!");
    }
}
/// <summary>
/// Downloads one web page and returns its text with every line terminated by CR LF.
/// </summary>
/// <remarks>
/// Only responses whose content type starts with "text/" are read. The character set is
/// taken from the Content-Type header when it mentions utf-8 or gb2312; otherwise
/// <paramref name="codeType"/> decides (gb2312 when it equals "gb2312", utf-8 in every
/// other case). If the decoded text then mentions the other character set, the page is
/// downloaded a second time and decoded with that one. Text decoded as UTF-8 is passed
/// through <c>myHTML2CLEAR.Str2Str</c>; text decoded as GB2312 is returned as read.
/// </remarks>
/// <param name="murl">Address of the page to download.</param>
/// <param name="codeType">Fallback character set name; compared case-insensitively with "gb2312".</param>
/// <returns>The page text, or an empty string for non-text content or on any failure.</returns>
public string GetOneHTML(string murl, string codeType)
{
    const int timeoutMs = 20000;
    HttpWebRequest request = null;

    try
    {
        // Created inside the try so that a malformed address also yields "" instead of an exception.
        request = (HttpWebRequest)WebRequest.Create(murl);
        request.Timeout = timeoutMs;

        string buffer = "";
        string nowCodeSet = ""; // character set used for the first decode; "" when nothing was read

        using (WebResponse response = request.GetResponse())
        {
            string contentType = (response.ContentType ?? "").ToLowerInvariant();
            if (contentType.StartsWith("text/", StringComparison.Ordinal))
            {
                if (contentType.IndexOf("utf-8", StringComparison.Ordinal) > -1)
                {
                    nowCodeSet = "utf-8";
                }
                else if (contentType.IndexOf("gb2312", StringComparison.Ordinal) > -1)
                {
                    nowCodeSet = "gb2312";
                }
                else if (string.Equals(codeType, "gb2312", StringComparison.OrdinalIgnoreCase))
                {
                    // Header gave no hint: fall back to the caller's preference.
                    nowCodeSet = "gb2312";
                }
                else
                {
                    nowCodeSet = "utf-8";
                }

                buffer = ReadResponseText(response, nowCodeSet);
            }
        }

        // The page itself names the other character set: fetch again and decode with it.
        if (nowCodeSet == "utf-8" && buffer.IndexOf("gb2312", StringComparison.OrdinalIgnoreCase) > -1)
        {
            buffer = DownloadText(murl, "gb2312", timeoutMs);
        }
        else if (nowCodeSet == "gb2312" && buffer.IndexOf("utf-8", StringComparison.OrdinalIgnoreCase) > -1)
        {
            buffer = DownloadText(murl, "utf-8", timeoutMs);
        }

        if (buffer.Length > 0)
        {
            Console.WriteLine("GUrlData : --> " + murl);
        }
        return buffer;
    }
    catch (Exception ex)
    {
        if (request != null)
        {
            request.Abort();
        }
        Console.WriteLine("Err : --> " + murl);
        Console.WriteLine(ex.Message);
        return "";
    }
}

/// <summary>
/// Requests <paramref name="url"/> again and decodes the body with the given character set.
/// </summary>
private string DownloadText(string url, string codeSet, int timeoutMs)
{
    HttpWebRequest retry = (HttpWebRequest)WebRequest.Create(url);
    retry.Timeout = timeoutMs;
    using (WebResponse response = retry.GetResponse())
    {
        return ReadResponseText(response, codeSet);
    }
}

/// <summary>
/// Reads the whole response body line by line, joining the lines with CR LF.
/// "utf-8" selects UTF-8 plus the Str2Str post-processing; any other value selects GB2312.
/// </summary>
private string ReadResponseText(WebResponse response, string codeSet)
{
    bool isUtf8 = codeSet == "utf-8";
    System.Text.Encoding encoding = isUtf8
        ? System.Text.Encoding.UTF8
        : System.Text.Encoding.GetEncoding("GB2312");

    System.Text.StringBuilder text = new System.Text.StringBuilder();
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream, encoding))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            text.Append(line).Append("\r\n");
        }
    }

    return isUtf8 ? myHTML2CLEAR.Str2Str(text.ToString()) : text.ToString();
}
/// <summary>
/// Reads a whole file as GB2312 text.
/// </summary>
/// <param name="filename">Path of the file to read.</param>
/// <returns>
/// The file content, or an empty string when an <see cref="IOException"/> occurs
/// (the exception message is written to the console).
/// </returns>
private string getFileData(string filename)
{
    string content = "";
    try
    {
        using (StreamReader input = new StreamReader(filename, System.Text.Encoding.GetEncoding("gb2312")))
        {
            content = input.ReadToEnd();
        }
    }
    catch (IOException ioError)
    {
        Console.WriteLine(ioError.Message);
        content = "";
    }
    return content;
}
/// <summary>
/// Writes text to a file as GB2312, replacing any existing content.
/// An <see cref="IOException"/> is not propagated; its message is written to the console.
/// </summary>
/// <param name="filename">Path of the file to write.</param>
/// <param name="data">Text to store.</param>
private void putFileData(string filename, string data)
{
    try
    {
        // append: false -> the file is overwritten.
        using (StreamWriter output = new StreamWriter(filename, false, System.Text.Encoding.GetEncoding("gb2312")))
        {
            output.Write(data);
        }
    }
    catch (IOException ioError)
    {
        Console.WriteLine(ioError.Message);
    }
}
/// <summary>
/// Lets the user pick a template file and loads the eight sibling files
/// (.a .b .c .d .e .t .h .s) that share its base name into the matching text boxes.
/// The base name is stored in the form caption, where the save and delete handlers read it.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    openFileDialog1.Title = "选择一个模板文件 ";
    openFileDialog1.Filter = "(*.a)|*.a";

    // A cancelled dialog keeps the previous FileName, so the result must be checked.
    if (openFileDialog1.ShowDialog() != DialogResult.OK)
    {
        return;
    }

    string selected = openFileDialog1.FileName;
    if (!System.IO.File.Exists(selected))
    {
        return;
    }

    // Validate the real extension rather than searching for ".a" anywhere in the path.
    string extension = System.IO.Path.GetExtension(selected);
    string[] accepted = { ".a", ".b", ".c", ".d", ".e", ".t" };
    bool isTemplate = false;
    foreach (string candidate in accepted)
    {
        if (string.Equals(extension, candidate, StringComparison.OrdinalIgnoreCase))
        {
            isTemplate = true;
            break;
        }
    }
    if (!isTemplate)
    {
        MessageBox.Show("文件格式错误!");
        return;
    }

    // Full path without the extension; each template part is basePath + suffix.
    string basePath = selected.Substring(0, selected.Length - extension.Length);
    this.Text = basePath;

    textBox_a.Text = getFileData(basePath + ".a");
    textBox_b.Text = getFileData(basePath + ".b");
    textBox_c.Text = getFileData(basePath + ".c");
    textBox_d.Text = getFileData(basePath + ".d");
    textBox_e.Text = getFileData(basePath + ".e");
    textBox_t.Text = getFileData(basePath + ".t");
    textBox_h.Text = getFileData(basePath + ".h");
    textBox_s.Text = getFileData(basePath + ".s");
    button4.Enabled = true;
}
/// <summary>
/// Click handler with no behavior; it is intentionally left empty.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button3_Click(object sender, EventArgs e)
{
    // Nothing to do.
}
/// <summary>
/// Saves the template: writes the eight template text boxes to files named
/// after the form caption plus the suffixes .a .b .c .d .e .t .h .s, then
/// reports the base path.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button4_Click(object sender, EventArgs e)
{
    // Legacy note kept from the original: t = title, a = data,
    // b = obtained from the clustering template, c = obtained from the related template.
    string basePath = this.Text;

    string[] suffixes = { ".a", ".b", ".c", ".d", ".e", ".t", ".h", ".s" };
    string[] contents =
    {
        textBox_a.Text,
        textBox_b.Text,
        textBox_c.Text,
        textBox_d.Text,
        textBox_e.Text,
        textBox_t.Text,
        textBox_h.Text,
        textBox_s.Text
    };

    for (int i = 0; i < suffixes.Length; i++)
    {
        putFileData(basePath + suffixes[i], contents[i]);
    }

    MessageBox.Show("模板保存成功! 地址: " + basePath);
}
/// <summary>
/// Mirrors the entry currently selected in listBox2 into textBox13.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void listBox2_SelectedIndexChanged(object sender, EventArgs e)
{
    string selectedEntry = listBox2.Text;
    textBox13.Text = selectedEntry;
}
/// <summary>
/// Mirrors the entry currently selected in listBox1 into textBox13.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void listBox1_SelectedIndexChanged(object sender, EventArgs e)
{
    string selectedEntry = listBox1.Text;
    textBox13.Text = selectedEntry;
}
/// <summary>
/// Configures the storage object from the values in textBox14 and textBox5,
/// then disables the button and reports completion.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button5_Click(object sender, EventArgs e)
{
    string firstArgument = textBox14.Text;
    string thirdArgument = textBox5.Text;

    // NOTE(review): the meaning of 3145727 is not visible in this file - confirm against ClassNHT.
    db.SetClassNHT(firstArgument, 3145727, thirdArgument);

    button5.Enabled = false;
    MessageBox.Show("完成!");
}
- /*
- private void listBox3_SelectedIndexChanged(object sender, EventArgs e)
- {
- if (listBox3.Text == null)
- {
- return;
- }
- textBox4.Text = listBox3.Text;
- textBox9.Text = XunLong.HtmlClassLibrary.ClassHTML.HTML2CLEAR( db.Value(listBox3.Text));
- }
- */
/// <summary>
/// Fills textBox14 with <c>Config.SpiderData</c> and textBox1 with the content
/// of the file named by <c>Config.main_s_type</c> when the form loads.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void FormHTMLMODEL_Load(object sender, EventArgs e)
{
    textBox14.Text = XunLong.CongifData.Config.SpiderData;

    string typeFileContent = getFileData(XunLong.CongifData.Config.main_s_type);
    textBox1.Text = typeFileContent;
}
/// <summary>
/// Loads one template directly into the editor text boxes, enables the save button and
/// sets the form caption to <paramref name="path"/> + "\" + MD5 name of <paramref name="d"/>.
/// The caption is what the save and delete handlers use as the template base path.
/// </summary>
/// <param name="a">Text for textBox_a.</param>
/// <param name="b">Text for textBox_b.</param>
/// <param name="c">Text for textBox_c.</param>
/// <param name="d">Text for textBox_d; also hashed to build the base file name.</param>
/// <param name="e">Text for textBox_e.</param>
/// <param name="t">Text for textBox_t.</param>
/// <param name="h">Text for textBox_h.</param>
/// <param name="s">Text for textBox_s.</param>
/// <param name="path">Prefix placed in front of the hashed name.</param>
public void InitOne(string a, string b, string c, string d, string e, string t, string h, string s, string path)
{
    textBox_a.Text = a;
    textBox_b.Text = b;
    textBox_c.Text = c;
    textBox_d.Text = d;
    textBox_e.Text = e;
    textBox_t.Text = t;
    textBox_h.Text = h;
    textBox_s.Text = s;
    button4.Enabled = true;

    // The separator must be an escaped backslash; a bare "\" leaves the literal unterminated.
    this.Text = path + "\\" + getMD5name(d);
}
/// <summary>
/// Returns the MD5 digest of <paramref name="data"/> (encoded as UTF-8) as an
/// upper-case hexadecimal string. The digest is used as a file name, not for security.
/// </summary>
/// <remarks>
/// Uses <c>System.Security.Cryptography.MD5</c> directly so the form does not need the
/// obsolete <c>FormsAuthentication.HashPasswordForStoringInConfigFile</c> helper or a
/// System.Web reference. The output format (upper-case hex of the UTF-8 bytes) matches
/// that helper, so existing template file names stay valid.
/// </remarks>
/// <param name="data">Text to hash; must not be null.</param>
/// <returns>32 upper-case hexadecimal characters.</returns>
private string getMD5name(string data)
{
    byte[] input = System.Text.Encoding.UTF8.GetBytes(data);
    using (System.Security.Cryptography.MD5 md5 = System.Security.Cryptography.MD5.Create())
    {
        byte[] digest = md5.ComputeHash(input);
        // BitConverter yields "AA-BB-..."; drop the dashes to get plain hex.
        return BitConverter.ToString(digest).Replace("-", "");
    }
}
/// <summary>
/// Selection handler for comboBox1. It currently performs no action.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void comboBox1_SelectedIndexChanged(object sender, EventArgs e)
{
    // Disabled legacy behavior: loading the ".xlt" file matching the selected
    // entry into textBox1. Nothing is executed here now.
}
/// <summary>
/// Click handler for tabPage4; intentionally does nothing.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void tabPage4_Click(object sender, EventArgs e)
{
    // Nothing to do.
}
/// <summary>
/// Opens the first entry of the sample list (textBox_e, one address per line)
/// in the embedded web browser.
/// </summary>
/// <remarks>
/// The list counts as present only when it is longer than 20 characters and contains a
/// line feed after its first character. The first non-empty line is used, so a list that
/// starts with blank lines no longer depends on a fixed array index.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button6_Click(object sender, EventArgs e)
{
    string samples = textBox_e.Text;
    string firstAddress = null;

    if (samples.IndexOf('\n') > 0 && samples.Length > 20)
    {
        // Split on both line-break characters and drop the empty pieces between CR and LF.
        char[] lineBreaks = { '\n', '\r' };
        string[] lines = samples.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);
        if (lines.Length > 0)
        {
            firstAddress = lines[0];
        }
    }

    if (firstAddress == null)
    {
        MessageBox.Show("采样列表为空!");
        return;
    }

    webBrowser1.Navigate(firstAddress);
}
/// <summary>
/// Runs the template currently in the editor text boxes against the cleaned page
/// text in textBox9 and displays the outcome.
/// </summary>
/// <remarks>
/// listBox2 receives one "key TAB value" line per entry of the model's
/// <c>modelOneList</c>; listBox1 is only cleared. The fields t, a, b, c, h and s of the
/// returned result are shown in textBox12, textBox8, textBox7, textBox6, textBox16 and textBox3.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button3_Click_1(object sender, EventArgs e)
{
    // Build the model and load the template under test.
    XunLong.ModelUserClassLibrary.ClassUserModel m = new XunLong.ModelUserClassLibrary.ClassUserModel();
    m.TestModeL(textBox_a.Text, textBox_b.Text, textBox_c.Text, textBox_d.Text, textBox_e.Text, textBox_t.Text, textBox_h.Text, textBox_s.Text);

    XunLong.PublicClassLibrary.kcSearch k = m.getTagAndData(textBox9.Text);
    Hashtable p = m.modelOneList;

    listBox1.Items.Clear();
    listBox2.Items.Clear();

    if (p != null)
    {
        foreach (System.Collections.DictionaryEntry de in p)
        {
            // Key and value are separated by a tab; Convert.ToString tolerates a null value.
            listBox2.Items.Add(de.Key.ToString() + '\t' + Convert.ToString(de.Value));
        }
    }

    textBox12.Text = k.t;
    textBox8.Text = k.a;
    textBox7.Text = k.b;
    textBox6.Text = k.c;
    textBox16.Text = k.h;
    textBox3.Text = k.s;
}
/// <summary>
/// Looks up the key typed in textBox2 in the storage object, runs the stored value
/// through <c>HTML2CLEAR</c> and shows the result in textBox9.
/// Does nothing when the key box is blank.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button7_Click(object sender, EventArgs e)
{
    bool hasKey = textBox2.Text.Trim().Length != 0;
    if (hasKey)
    {
        string stored = db.Value(textBox2.Text);
        string cleaned = myHTML2CLEAR.HTML2CLEAR(stored, textBox2.Text);

        textBox9.Text = cleaned;
        MessageBox.Show("数据读取完成!");
    }
}
/// <summary>
/// Copies the first entry of the sample list (textBox_e, one address per line)
/// into both address boxes, textBox4 and textBox2.
/// </summary>
/// <remarks>
/// The list counts as present only when it is longer than 20 characters and contains a
/// line feed after its first character. The first non-empty line is used, so a list that
/// starts with blank lines no longer depends on a fixed array index.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button8_Click(object sender, EventArgs e)
{
    string samples = textBox_e.Text;
    string firstAddress = null;

    if (samples.IndexOf('\n') > 0 && samples.Length > 20)
    {
        // Split on both line-break characters and drop the empty pieces between CR and LF.
        char[] lineBreaks = { '\n', '\r' };
        string[] lines = samples.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);
        if (lines.Length > 0)
        {
            firstAddress = lines[0];
        }
    }

    if (firstAddress == null)
    {
        MessageBox.Show("采样列表为空!");
        return;
    }

    textBox4.Text = firstAddress;
    textBox2.Text = firstAddress;
}
/// <summary>
/// Deletes the eight template files named after the form caption
/// (suffixes .a .b .c .d .e .t .h .s), reports the base path and closes the form.
/// Any failure is reported with a generic error message instead.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button9_Click(object sender, EventArgs e)
{
    try
    {
        string basePath = this.Text;
        string[] suffixes = { ".a", ".b", ".c", ".d", ".e", ".t", ".h", ".s" };

        foreach (string suffix in suffixes)
        {
            System.IO.File.Delete(basePath + suffix);
        }

        MessageBox.Show("删除成功! 地址: " + basePath);
        this.FindForm().Close();
    }
    catch
    {
        MessageBox.Show("删除出错!");
    }
}
/// <summary>
/// Writes the content of textBox1 to the file named by <c>Config.main_s_type</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button10_Click(object sender, EventArgs e)
{
    string targetFile = XunLong.CongifData.Config.main_s_type;
    putFileData(targetFile, textBox1.Text);
}
- }
- }