CollectionSystem.cs
上传用户:lanchensha
上传日期:2022-02-27
资源大小:7530k
文件大小:11k
源码类别:

编辑器/阅读器

开发平台:

C#

  1. //------------------------------------------------------------------------------
  2. //                                  版权声明
  3. //DotNetTextBox免费版源码版权由小宝.NET及Aspxcn中华网工作室所有!
  4. //非盈利性个人网站可免费使用本控件,商业及盈利性网站请购买功能更强大的商业版本授
  5. //权或开发版,如发现任何个人或机构违反本声明,本站将对其追究法律责任!
  6. //商业授权及开发版购买地址:http://www.aspxcn.com.cn/dotnettextbox/default.htm
  7. //联系email:webmaster@aspxcn.com.cn
  8. //------------------------------------------------------------------------------
  9. using System;
  10. using System.Net;
  11. using System.Text;
  12. using System.Text.RegularExpressions;
  13. using System.Web;
  14. using System.Web.UI;
  15. using System.Web.UI.WebControls;
  16. namespace DotNetTextBox
  17. {
  18.     #region 自定义控件的页面采集功能
  19.     public partial class PageCollection : System.Web.UI.Page
  20.     {
  21.         protected System.Web.UI.WebControls.Button btnReturn;
  22.         protected System.Web.UI.WebControls.Button canceloading;
  23.         protected System.Web.UI.WebControls.TextBox txtUrl;
  24.         protected System.Web.UI.WebControls.DropDownList seltype;
  25.         protected System.Web.UI.WebControls.HiddenField tempcontent;
  26.         protected void Page_Load(object sender, EventArgs e)
  27.         {
  28.             if (!IsPostBack)
  29.             {
  30.                 btnReturn.Text = ResourceManager.GetString("insertpage");
  31.                 seltype.Items.Add(new ListItem(ResourceManager.GetString("getallcontent"), "1"));
  32.                 seltype.Items.Add(new ListItem(ResourceManager.GetString("getnoscriptcontent"), "2"));
  33.                 seltype.Items.Add(new ListItem(ResourceManager.GetString("getalltext"), "3"));
  34.                 seltype.Items.Add(new ListItem(ResourceManager.GetString("getallimg"), "4"));
  35.                 seltype.Items.Add(new ListItem(ResourceManager.GetString("getalllink"), "5"));
  36.                 canceloading.Text = ResourceManager.GetString("canceloading");
  37.                 if (Request.Cookies["languages"] != null)
  38.                 {
  39.                     ResourceManager.SiteLanguageKey = Request.Cookies["languages"].Value;
  40.                 }
  41.                 else
  42.                 {
  43.                     ResourceManager.SiteLanguageKey = HttpContext.Current.Request.ServerVariables["HTTP_ACCEPT_LANGUAGE"].ToLower().Split(',')[0];
  44.                 }
  45.             }
  46.         }
  47.         public void btnReturn_Click(object sender, System.EventArgs e)
  48.         {
  49.             string url = txtUrl.Text.Trim();
  50.             WebClient wb = new WebClient();
  51.             try
  52.             {
  53.                 byte[] pagedata = wb.DownloadData(@url);
  54.                 string result = Encoding.Default.GetString(pagedata);
  55.                 string returnvalue = "";
  56.                 switch (seltype.SelectedValue)
  57.                 {
  58.                     case "1":
  59.                         returnvalue = result;
  60.                         break;
  61.                     case "2":
  62.                         returnvalue = wipeScript(result);
  63.                         break;
  64.                     case "3":
  65.                         returnvalue = NoHTML(result);
  66.                         break;
  67.                     case "4":
  68.                         returnvalue = getImages(result);
  69.                         break;
  70.                     case "5":
  71.                         returnvalue = getLink(result);
  72.                         break;
  73.                     default:
  74.                         break;
  75.                 }
  76.                 tempcontent.Value = returnvalue;
  77.                 ClientScript.RegisterStartupScript(typeof(Page), "Key", "addeditor();", true);
  78.             }
  79.             catch
  80.             {
  81.                 ClientScript.RegisterStartupScript(typeof(Page), "Key", "alert('" + ResourceManager.GetString("getpageerror") + "')", true);
  82.                 
  83.             }
  84.         }
  85.         public static string NoHTML(string Htmlstring)
  86.         {
  87.             //删除脚本
  88.             Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>","",RegexOptions.IgnoreCase);
  89.             //Htmlstring = Regex.Replace(Htmlstring, @"<script[sS]+</script *>", "", RegexOptions.IgnoreCase);
  90.             Htmlstring = Regex.Replace(Htmlstring, @"<style[sS]+</style *>","",RegexOptions.IgnoreCase);
  91.             //删除HTML
  92.             Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
  93.             Htmlstring = Regex.Replace(Htmlstring, @"([rn])[s]+", "", RegexOptions.IgnoreCase);
  94.             Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
  95.             Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
  96.             Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", """, RegexOptions.IgnoreCase);
  97.             Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
  98.             Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
  99.             Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
  100.             Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
  101.             Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "xa1", RegexOptions.IgnoreCase);
  102.             Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "xa2", RegexOptions.IgnoreCase);
  103.             Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "xa3", RegexOptions.IgnoreCase);
  104.             Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "xa9", RegexOptions.IgnoreCase);
  105.             Htmlstring = Regex.Replace(Htmlstring, @"&#(d+);", "", RegexOptions.IgnoreCase);
  106.             Htmlstring.Replace("<", "");
  107.             Htmlstring.Replace(">", "");
  108.             Htmlstring.Replace("rn", "");
  109.             Htmlstring = HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();
  110.             return Htmlstring;
  111.         }
  112.         public static string wipeScript(string html)
  113.         {
  114.             System.Text.RegularExpressions.Regex regex1 = new System.Text.RegularExpressions.Regex(@"<script[sS]+</script *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
  115.             System.Text.RegularExpressions.Regex regex2 = new System.Text.RegularExpressions.Regex(@" href *= *[sS]*script *:", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
  116.             System.Text.RegularExpressions.Regex regex3 = new System.Text.RegularExpressions.Regex(@" on[sS]*=", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
  117.             System.Text.RegularExpressions.Regex regex4 = new System.Text.RegularExpressions.Regex(@"<iframe[sS]+</iframe *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
  118.             System.Text.RegularExpressions.Regex regex5 = new System.Text.RegularExpressions.Regex(@"<frameset[sS]+</frameset *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
  119.             html = regex1.Replace(html, ""); //过滤<script></script>标记
  120.             html = regex2.Replace(html, ""); //过滤href=javascript: (<A>) 属性
  121.             html = regex3.Replace(html, " _disibledevent="); //过滤其它控件的on...事件
  122.             html = regex4.Replace(html, ""); //过滤iframe
  123.             html = regex5.Replace(html, ""); //过滤frameset
  124.             return html;
  125.         }
  126.         public string getImages(string html)
  127.         {
  128.             string resultStr = "";
  129.             string temp = "";
  130.             string url = "";
  131.             string[] url2;
  132.             Match m;
  133.             Regex r = new Regex(@"<IMG[^>]+src=s*(?:'(?<src>[^']+)'|""(?<src>[^""]+)""|(?<src>[^>s]+))s*[^>]*>", RegexOptions.IgnoreCase);
  134.             for (m = r.Match(html); m.Success; m = m.NextMatch())
  135.             {
  136.                 temp=m.Groups["src"].Value.ToLower();
  137.                 if (temp.IndexOf("http")==0)
  138.                 {
  139.                     resultStr += m.Value + "<br />";
  140.                 }
  141.                 else
  142.                 {
  143.                     url2 = txtUrl.Text.Trim().Split('/');
  144.                     
  145.                     try
  146.                     {
  147.                         if (url2.Length > 3)
  148.                         {
  149.                             url = txtUrl.Text.Trim().Replace(url2[url2.Length - 1], "");
  150.                         }
  151.                         else
  152.                         {
  153.                             url = txtUrl.Text.Trim();
  154.                         }
  155.                     }
  156.                     catch
  157.                     {
  158.                         url = txtUrl.Text.Trim();
  159.                     }
  160.                     if (temp.IndexOf("/") == 0)
  161.                     {
  162.                         resultStr += m.Value.Replace(m.Groups["src"].Value,"http://"+url2[2] + m.Groups["src"].Value) + "<br/>";
  163.                     }
  164.                     else
  165.                     {
  166.                         resultStr += m.Value.Replace(m.Groups["src"].Value,url + m.Groups["src"].Value) + "<br/>";
  167.                     } 
  168.                 }
  169.             }
  170.             return resultStr;
  171.         }
  172.         public string getLink(string html)
  173.         {
  174.             string resultStr = "";
  175.             string temp = "";
  176.             string url = "";
  177.             string[] url2;
  178.             Regex re = new Regex(@"<a[^>]+href=s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>s]+))s*[^>]*>(?<text>.*?)</a>", RegexOptions.IgnoreCase);
  179.             MatchCollection mc = re.Matches(html);
  180.             foreach (Match m in mc)
  181.             {
  182.                 temp=m.Groups["href"].Value.ToLower();
  183.                 if (temp.IndexOf("http")==0)
  184.                 {
  185.                     resultStr += m.Value + "<br/>";
  186.                 }
  187.                 else
  188.                 {
  189.                     url2 = txtUrl.Text.Trim().Split('/');
  190.                     try
  191.                     {
  192.                         if (url2.Length > 1)
  193.                         {
  194.                             url = txtUrl.Text.Trim().Replace(url2[url2.Length - 1], "");
  195.                         }
  196.                         else
  197.                         {
  198.                             url = txtUrl.Text.Trim();
  199.                         }
  200.                     }
  201.                     catch
  202.                     {
  203.                         url = txtUrl.Text.Trim();
  204.                     }
  205.                     if (temp.IndexOf("/")==0)
  206.                     {
  207.                         resultStr += m.Value.Replace(m.Groups["href"].Value,"http://" + url2[2] + m.Groups["href"].Value) + "<br/>";
  208.                     }
  209.                     else if (temp.IndexOf("mailto") == 0)
  210.                     {
  211.                         resultStr += m.Value + "<br/>";
  212.                     }
  213.                     else
  214.                     {
  215.                         resultStr += m.Value.Replace(m.Groups["href"].Value, url + m.Groups["href"].Value) + "<br/>";
  216.                     }
  217.                 }
  218.             }
  219.             
  220.             return resultStr;
  221.         }
  222.     }
  223.     #endregion
  224. }