CollectionSystem.cs
上传用户:lanchensha
上传日期:2022-02-27
资源大小:7530k
文件大小:11k
- //------------------------------------------------------------------------------
- // 版权声明
- //DotNetTextBox免费版源码版权由小宝.NET及Aspxcn中华网工作室所有!
- //非盈利性个人网站可免费使用本控件,商业及盈利性网站请购买功能更强大的商业版本授
- //权或开发版,如发现任何个人或机构违反本声明,本站将对其追究法律责任!
- //商业授权及开发版购买地址:http://www.aspxcn.com.cn/dotnettextbox/default.htm
- //联系email:webmaster@aspxcn.com.cn
- //------------------------------------------------------------------------------
- using System;
- using System.Net;
- using System.Text;
- using System.Text.RegularExpressions;
- using System.Web;
- using System.Web.UI;
- using System.Web.UI.WebControls;
- namespace DotNetTextBox
- {
- #region 自定义控件的页面采集功能
- public partial class PageCollection : System.Web.UI.Page
- {
- protected System.Web.UI.WebControls.Button btnReturn;
- protected System.Web.UI.WebControls.Button canceloading;
- protected System.Web.UI.WebControls.TextBox txtUrl;
- protected System.Web.UI.WebControls.DropDownList seltype;
- protected System.Web.UI.WebControls.HiddenField tempcontent;
/// <summary>
/// Initializes the page on the first (non-postback) request: fills the
/// localized captions of the buttons, populates the collection-mode drop-down
/// and selects the resource language.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    if (IsPostBack)
    {
        return;
    }

    // NOTE(review): the captions below are fetched before SiteLanguageKey is
    // assigned further down, so they use whatever language key is current at
    // this point. The original order is kept; confirm whether the language
    // should be selected first.
    btnReturn.Text = ResourceManager.GetString("insertpage");
    seltype.Items.Add(new ListItem(ResourceManager.GetString("getallcontent"), "1"));
    seltype.Items.Add(new ListItem(ResourceManager.GetString("getnoscriptcontent"), "2"));
    seltype.Items.Add(new ListItem(ResourceManager.GetString("getalltext"), "3"));
    seltype.Items.Add(new ListItem(ResourceManager.GetString("getallimg"), "4"));
    seltype.Items.Add(new ListItem(ResourceManager.GetString("getalllink"), "5"));
    canceloading.Text = ResourceManager.GetString("canceloading");

    // An explicit "languages" cookie wins over the browser preference.
    HttpCookie languageCookie = Request.Cookies["languages"];
    if (languageCookie != null)
    {
        ResourceManager.SiteLanguageKey = languageCookie.Value;
        return;
    }

    // The Accept-Language header is optional; without this guard a request
    // that omits it crashed with a NullReferenceException.
    string acceptLanguage = HttpContext.Current.Request.ServerVariables["HTTP_ACCEPT_LANGUAGE"];
    if (acceptLanguage != null)
    {
        // Use the first (most preferred) language tag, lower-cased.
        ResourceManager.SiteLanguageKey = acceptLanguage.ToLower().Split(',')[0];
    }
}
/// <summary>
/// Downloads the page whose address was typed into <c>txtUrl</c>, transforms
/// it according to the mode selected in <c>seltype</c>, stores the result in
/// the <c>tempcontent</c> hidden field and registers the client script
/// <c>addeditor();</c>. On any failure an alert with the "getpageerror"
/// resource text is registered instead.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
public void btnReturn_Click(object sender, System.EventArgs e)
{
    string url = txtUrl.Text.Trim();

    // Only absolute http/https addresses are fetched. WebClient would
    // otherwise also accept local paths and file:// URLs, which lets a
    // visitor read files from the server.
    // NOTE(review): addresses on the internal network are still reachable
    // through this page; restrict hosts if that matters for the deployment.
    Uri target;
    bool isWebAddress = Uri.TryCreate(url, UriKind.Absolute, out target)
        && (target.Scheme == Uri.UriSchemeHttp || target.Scheme == Uri.UriSchemeHttps);
    if (!isWebAddress)
    {
        registerCollectionError();
        return;
    }

    try
    {
        string result;
        // WebClient is IDisposable; release it as soon as the download ends.
        using (WebClient wb = new WebClient())
        {
            byte[] pagedata = wb.DownloadData(target);
            // Bytes are decoded with the server's default ANSI code page,
            // exactly as before; the page's declared charset is not consulted.
            result = Encoding.Default.GetString(pagedata);
        }

        string returnvalue = "";
        switch (seltype.SelectedValue)
        {
            case "1": // whole page as-is
                returnvalue = result;
                break;
            case "2": // page with scripts/frames/event handlers removed
                returnvalue = wipeScript(result);
                break;
            case "3": // plain text only
                returnvalue = NoHTML(result);
                break;
            case "4": // all <img> tags
                returnvalue = getImages(result);
                break;
            case "5": // all <a> links
                returnvalue = getLink(result);
                break;
            default:
                break;
        }

        tempcontent.Value = returnvalue;
        ClientScript.RegisterStartupScript(typeof(Page), "Key", "addeditor();", true);
    }
    catch (Exception)
    {
        // Best effort: any download or processing failure is reported to the
        // user as a generic error alert.
        registerCollectionError();
    }
}

/// <summary>
/// Registers the startup script that shows the "getpageerror" alert.
/// </summary>
private void registerCollectionError()
{
    ClientScript.RegisterStartupScript(typeof(Page), "Key", "alert('" + ResourceManager.GetString("getpageerror") + "')", true);
}
/// <summary>
/// Reduces an HTML document to plain text: removes script and style blocks,
/// strips the remaining tags and comment markers, decodes a handful of common
/// character entities, drops any leftover angle brackets and line breaks, and
/// finally HTML-encodes and trims the result.
/// </summary>
/// <param name="Htmlstring">HTML markup to convert; null or empty yields an empty string.</param>
/// <returns>HTML-encoded plain text extracted from <paramref name="Htmlstring"/>.</returns>
public static string NoHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    const RegexOptions ignoreCase = RegexOptions.IgnoreCase;
    // Singleline lets '.' match line breaks so multi-line blocks are removed.
    const RegexOptions ignoreCaseSingleline = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    // Remove scripts and style sheets (lazy, so each block is removed on its
    // own instead of everything between the first opener and the last closer).
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", ignoreCaseSingleline);
    Htmlstring = Regex.Replace(Htmlstring, @"<style[\s\S]*?</style *>", "", ignoreCase);

    // Remove the remaining tags, line breaks followed by whitespace, and
    // stray comment delimiters.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", ignoreCase);

    // Decode the common named/numeric entities.
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00A1", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00A2", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00A3", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00A9", ignoreCase);
    // Any other numeric entity is dropped.
    Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", ignoreCase);

    // Strings are immutable: the results must be assigned back, otherwise
    // these three clean-up steps do nothing.
    Htmlstring = Htmlstring.Replace("<", "");
    Htmlstring = Htmlstring.Replace(">", "");
    Htmlstring = Htmlstring.Replace("\r\n", "");

    // The static HttpUtility encoder does not require a current HttpContext.
    return HttpUtility.HtmlEncode(Htmlstring).Trim();
}
/// <summary>
/// Removes the most common active content from an HTML fragment:
/// &lt;script&gt;, &lt;iframe&gt; and &lt;frameset&gt; elements, <c>href</c>
/// attributes that use a <c>...script:</c> scheme, and inline <c>on...</c>
/// event-handler attributes (renamed to <c>_disibledevent</c> so they no
/// longer fire).
/// </summary>
/// <remarks>
/// This is a pattern-based filter, not a full HTML sanitizer; obfuscated or
/// entity-encoded markup is not recognised.
/// </remarks>
/// <param name="html">Markup to filter; null or empty is returned unchanged.</param>
/// <returns>The filtered markup.</returns>
public static string wipeScript(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    const RegexOptions ignoreCase = RegexOptions.IgnoreCase;

    // Lazy quantifiers: each element is removed on its own rather than
    // everything between the first opening and the last closing tag.
    const string scriptElement = @"<script[\s\S]+?</script *>";
    const string iframeElement = @"<iframe[\s\S]+?</iframe *>";
    const string framesetElement = @"<frameset[\s\S]+?</frameset *>";

    // A whole href attribute (double-quoted, single-quoted or bare) whose
    // value starts with a "...script:" scheme such as javascript: or vbscript:.
    const string scriptHref = @" href *= *(?:""\s*[a-z]*script *:[^""]*""|'\s*[a-z]*script *:[^']*'|[a-z]*script *:[^\s>]*)";

    // An inline event-handler attribute name such as " onclick=".
    const string eventAttribute = @" on\w+ *=";

    html = Regex.Replace(html, scriptElement, "", ignoreCase);                  // drop <script>...</script>
    html = Regex.Replace(html, scriptHref, "", ignoreCase);                     // drop href="javascript:..."
    html = Regex.Replace(html, eventAttribute, " _disibledevent=", ignoreCase); // neutralise on... handlers
    html = Regex.Replace(html, iframeElement, "", ignoreCase);                  // drop <iframe>...</iframe>
    html = Regex.Replace(html, framesetElement, "", ignoreCase);                // drop <frameset>...</frameset>
    return html;
}
/// <summary>
/// Extracts every &lt;img&gt; tag from <paramref name="html"/> and returns
/// them concatenated, each followed by a line-break tag. A <c>src</c> that
/// carries no URI scheme is rewritten to an absolute address, resolved
/// against the page address entered in <c>txtUrl</c>.
/// </summary>
/// <param name="html">Markup of the downloaded page; null or empty yields an empty string.</param>
/// <returns>The collected image tags, or an empty string when none are found.</returns>
public string getImages(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Base address for resolving relative image paths. Standard URI
    // resolution keeps the page's own scheme (http or https) and handles
    // root-relative, protocol-relative and "../" paths.
    Uri pageUri;
    bool hasPageUri = Uri.TryCreate(txtUrl.Text.Trim(), UriKind.Absolute, out pageUri);

    Regex imagePattern = new Regex(@"<IMG[^>]+src=\s*(?:'(?<src>[^']+)'|""(?<src>[^""]+)""|(?<src>[^>\s]+))\s*[^>]*>", RegexOptions.IgnoreCase);
    Regex schemePattern = new Regex(@"^[a-z][a-z0-9+.\-]*:", RegexOptions.IgnoreCase);

    StringBuilder collected = new StringBuilder();
    foreach (Match m in imagePattern.Matches(html))
    {
        Group srcGroup = m.Groups["src"];

        // Already absolute (http:, https:, data:, ...): keep the tag as-is.
        if (schemePattern.IsMatch(srcGroup.Value))
        {
            collected.Append(m.Value).Append("<br />");
            continue;
        }

        string tag = m.Value;
        Uri resolved;
        if (hasPageUri && Uri.TryCreate(pageUri, srcGroup.Value, out resolved))
        {
            // Splice at the captured position so that only the src value is
            // replaced, not other occurrences of the same text in the tag.
            int start = srcGroup.Index - m.Index;
            tag = tag.Substring(0, start) + resolved.AbsoluteUri + tag.Substring(start + srcGroup.Length);
        }

        collected.Append(tag).Append("<br/>");
    }

    return collected.ToString();
}
/// <summary>
/// Extracts every &lt;a href=...&gt;...&lt;/a&gt; element from
/// <paramref name="html"/> and returns them concatenated, each followed by
/// <c>&lt;br/&gt;</c>. An <c>href</c> that carries no URI scheme is rewritten
/// to an absolute address, resolved against the page address entered in
/// <c>txtUrl</c>; links that already have a scheme (http:, https:, mailto:,
/// ...) are left untouched.
/// </summary>
/// <param name="html">Markup of the downloaded page; null or empty yields an empty string.</param>
/// <returns>The collected anchor elements, or an empty string when none are found.</returns>
public string getLink(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Base address for resolving relative links. Standard URI resolution
    // keeps the page's own scheme and handles root-relative, fragment-only
    // and "../" references.
    Uri pageUri;
    bool hasPageUri = Uri.TryCreate(txtUrl.Text.Trim(), UriKind.Absolute, out pageUri);

    // Singleline lets the link text span several lines.
    Regex anchorPattern = new Regex(@"<a[^>]+href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>(?<text>.*?)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
    Regex schemePattern = new Regex(@"^[a-z][a-z0-9+.\-]*:", RegexOptions.IgnoreCase);

    StringBuilder collected = new StringBuilder();
    foreach (Match m in anchorPattern.Matches(html))
    {
        Group hrefGroup = m.Groups["href"];
        string element = m.Value;

        Uri resolved;
        if (!schemePattern.IsMatch(hrefGroup.Value)
            && hasPageUri
            && Uri.TryCreate(pageUri, hrefGroup.Value, out resolved))
        {
            // Splice at the captured position so that only the href value is
            // replaced, not the same text appearing in the link body.
            int start = hrefGroup.Index - m.Index;
            element = element.Substring(0, start) + resolved.AbsoluteUri + element.Substring(start + hrefGroup.Length);
        }

        collected.Append(element).Append("<br/>");
    }

    return collected.ToString();
}
- }
- #endregion
- }