HtmlDocument.cs
上传用户:huiyue
上传日期:2022-04-08
资源大小:1429k
文件大小:22k
源码类别:

搜索引擎

开发平台:

ASP/ASPX

  1. using System;
  2. using System.IO;
  3. using System.Xml.Serialization;
  4. using System.Collections.Specialized;
  5. using System.Text;
  6. using System.Text.RegularExpressions;
  7. using System.Collections;
  8. using cd.net;
  9. using Searcharoo.Indexer;
  namespace Searcharoo.Common
  {
      /// <summary>
      /// Storage for parsed HTML data: the properties needed to index an Html page
      /// (Title, Description, Keywords, robot directives and the links it contains).
      /// </summary>
      /// <remarks>
      /// Arbitrary class to encapsulate just the properties we need
      /// to index Html pages (Title, Meta tags, Keywords, etc).
      /// A 'generic' search engine would probably have a 'generic'
      /// document class, so maybe a future version of Searcharoo
      /// will too...
      /// </remarks>
      public class HtmlDocument : Document
      {
          #region Regular expressions and constants
          /// <summary>Encoding name used when neither the response nor the page supplies a usable one</summary>
          private const string DefaultEncodingName = "utf-8";

          /// <summary>Options shared by every tag/attribute pattern used in <see cref="Parse"/></summary>
          private const RegexOptions TagOptions = RegexOptions.IgnoreCase | RegexOptions.ExplicitCapture;

          /// <summary>Options used by the block-stripping patterns in <see cref="StripHtml"/></summary>
          private const RegexOptions StripOptions = RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture;

          /// <summary>
          /// Contents of the &lt;title&gt; element. [v6] fix by Erick Brown for CRLFs in the title tag:
          /// works across line breaks, tolerates badly formatted tags and does not catch
          /// tags that merely begin with "title" such as &lt;titlepage&gt;.
          /// </summary>
          private const string TitlePattern =
              @"(?<=<\s*title(?:\s[^>]*)?>)[\s\S]*?(?=</\s*title(?:\s[^>]*)?>)";

          /// <summary>A complete &lt;meta ...&gt; tag with any number of attributes</summary>
          private const string MetaTagPattern =
              @"<meta\s*(?:(?:\b(\w|-)+\b\s*(?:=\s*(?:""[^""]*""|'[^']*'|[^""'<> ]+)\s*)?)*)/?\s*>";

          /// <summary>One name=value pair inside a meta tag (names may contain '-', eg. http-equiv)</summary>
          private const string MetaAttributePattern =
              @"(?<name>\b(\w|-)+\b)\s*=\s*(""(?<value>[^""]*)""|'(?<value>[^']*)'|(?<value>[^""'<> ]+)\s*)+";

          /// <summary>A complete a/area/frame/iframe/img tag with any number of attributes</summary>
          private const string LinkTagPattern =
              @"(?<anchor><\s*(a|area|frame|iframe|img)\s*(?:(?:\b\w+\b\s*(?:=\s*(?:""[^""]*""|'[^']*'|[^""'<> ]+)\s*)?)*)?\s*>)";

          /// <summary>One name=value pair inside a link tag; the value may be quoted with ", ' or unquoted</summary>
          private const string LinkAttributePattern =
              @"(?<name>\b\w+\b)\s*=\s*(""(?<value>[^""]*)""|'(?<value>[^']*)'|(?<value>[^""'<> \s]+)\s*)+";

          /// <summary>&lt;script&gt; elements including their content</summary>
          private static readonly Regex ScriptBlocks = new Regex(@"<script[^>.]*>[\s\S]*?</script>", StripOptions);

          /// <summary>&lt;style&gt; elements including their content</summary>
          private static readonly Regex StyleBlocks = new Regex(@"<style[^>.]*>[\s\S]*?</style>", StripOptions);

          /// <summary>Html comments and other &lt;! ... &gt; declarations</summary>
          private static readonly Regex CommentBlocks = new Regex(@"<!(?:--[\s\S]*?--\s*)?>", StripOptions);

          /// <summary>Any remaining tag (may span lines)</summary>
          private static readonly Regex AnyTag = new Regex("<(.|\n)+?>", RegexOptions.IgnoreCase);
          #endregion

          #region Private fields: _Uri, _ContentType, _RobotIndexOK, _RobotFollowOK
          private string _All = String.Empty;
          private Uri _Uri;
          private String _ContentType;
          private bool _RobotIndexOK = true;
          private bool _RobotFollowOK = true;
          private string _WordsOnly = string.Empty;
          /// <summary>MimeType so we know whether to try and parse the contents, eg. "text/html", "text/plain", etc</summary>
          private string _MimeType = String.Empty;
          /// <summary>Html &lt;title&gt; tag</summary>
          private String _Title = String.Empty;
          /// <summary>Html &lt;meta http-equiv='description'&gt; tag</summary>
          private string _Description = String.Empty;
          // Length (as reported by the server in the Http headers) is not a field of this class;
          // GetResponse assigns the inherited Length member instead.
          #endregion

          #region Constructor requires Uri
          /// <summary>
          /// Creates a document for the given location and marks its Extension as "html".
          /// </summary>
          /// <param name="location">Address of the page; also passed to the base constructor</param>
          public HtmlDocument(Uri location) : base(location)
          {
              _Uri = location;
              Extension = "html";
          }
          #endregion

          #region Public Properties: Uri, RobotIndexOK
          /// <summary>
          /// Address of this page.
          /// http://www.ietf.org/rfc/rfc2396.txt
          /// </summary>
          public override Uri Uri
          {
              get { return _Uri; }
              set { _Uri = value; }
          }

          /// <summary>
          /// Whether a robot should index the text
          /// found on this page, or just ignore it
          /// </summary>
          /// <remarks>
          /// Set when page META tags are parsed - no 'set' property
          /// More info:
          /// http://www.robotstxt.org/
          /// </remarks>
          public override bool RobotIndexOK
          {
              get { return _RobotIndexOK; }
          }

          /// <summary>
          /// Whether a robot should follow any links
          /// found on this page, or just ignore them
          /// </summary>
          /// <remarks>
          /// Set when page META tags are parsed - no 'set' property
          /// More info:
          /// http://www.robotstxt.org/
          /// </remarks>
          public override bool RobotFollowOK
          {
              get { return _RobotFollowOK; }
          }

          /// <summary>
          /// Raw Content-Type value, eg. "text/html; charset=utf-8".
          /// </summary>
          /// <remarks>
          /// Setting it also fills in the mime type and <see cref="Encoding"/>, but only
          /// when they are still blank, so values determined earlier are never overwritten.
          /// A null value is stored as an empty string.
          /// </remarks>
          public override string ContentType
          {
              get { return _ContentType; }
              set
              {
                  _ContentType = value ?? String.Empty;
                  string[] parts = _ContentType.Split(';');

                  // Set MimeType if it's blank (Split always returns at least one element)
                  if (string.IsNullOrEmpty(_MimeType))
                  {
                      _MimeType = parts[0].Trim();
                  }

                  // Set Encoding if it's blank: look at every parameter after the mime type,
                  // so "charset" is found wherever it appears and whatever its casing
                  for (int i = 1; i < parts.Length && string.IsNullOrEmpty(Encoding); i++)
                  {
                      string parameter = parts[i].Trim();
                      if (!parameter.StartsWith("charset", StringComparison.OrdinalIgnoreCase))
                      {
                          continue;
                      }
                      int equalsPos = parameter.IndexOf('=');
                      if (equalsPos < 0)
                      {
                          continue; // "charset" without a value
                      }
                      // the value may legally be quoted: charset="utf-8"
                      string charset = parameter.Substring(equalsPos + 1).Trim().Trim('"', '\'');
                      if (charset.Length > 0)
                      {
                          Encoding = charset;
                      }
                  }
              }
          }
          #endregion

          #region Public fields: Encoding, All
          /// <summary>Encoding eg. "utf-8", "Shift_JIS", "iso-8859-1", "gb2312", etc</summary>
          public string Encoding = String.Empty;

          /// <summary>
          /// Raw content of page, as downloaded from the server.
          /// Setting it also strips the Html to make up the 'wordsonly' text.
          /// </summary>
          /// <remarks>A null value is stored as an empty string.</remarks>
          public override string All
          {
              get { return _All; }
              set
              {
                  _All = value ?? String.Empty;
                  _WordsOnly = StripHtml(_All);
              }
          }

          /// <summary>
          /// Text to index: the keyword string, the description and the Html-stripped page text, concatenated.
          /// </summary>
          public override string WordsOnly
          {
              get { return this.KeywordString + this._Description + this._WordsOnly; }
          }

          /// <summary>
          /// Page description with runs of whitespace collapsed to single spaces.
          /// </summary>
          /// <remarks>
          /// If no META description was set, the first read builds one from the start of
          /// the page text (at most Preferences.SummaryCharacters characters) and keeps it.
          /// http://authors.aspalliance.com/stevesmith/articles/removewhitespace.asp
          /// </remarks>
          public override string Description
          {
              get
              {
                  // ### If no META DESC, grab start of file text ###
                  if (string.IsNullOrEmpty(this._Description))
                  {
                      if (_WordsOnly.Length > Preferences.SummaryCharacters)
                      {
                          _Description = _WordsOnly.Substring(0, Preferences.SummaryCharacters);
                      }
                      else
                      {
                          // NOTE(review): this uses the public WordsOnly (which is prefixed with
                          // KeywordString) whereas the truncating branch uses only the stripped
                          // page text - confirm that including keywords here is intended.
                          _Description = WordsOnly;
                      }
                      _Description = CollapseWhitespace(_Description);
                  }
                  return _Description;
              }
              set
              {
                  _Description = CollapseWhitespace(value);
              }
          }
          #endregion

          #region Public Methods: SetRobotDirective, ToString()
          /// <summary>
          /// Pass in a ROBOTS meta tag found while parsing,
          /// and set HtmlDocument property/ies appropriately
          /// </summary>
          /// <param name="robotMetaContent">
          /// Content of the robots meta tag, eg. "noindex,nofollow"; null or empty leaves the flags untouched
          /// </param>
          /// <remarks>
          /// More info:
          /// * Robots Exclusion Protocol *
          /// - for META tags
          /// http://www.robotstxt.org/wc/meta-user.html
          /// - for ROBOTS.TXT in the siteroot
          /// http://www.robotstxt.org/wc/norobots.html
          /// </remarks>
          public void SetRobotDirective(string robotMetaContent)
          {
              if (string.IsNullOrEmpty(robotMetaContent))
              {
                  return;
              }
              string directive = robotMetaContent.ToLowerInvariant();
              if (directive.IndexOf("none", StringComparison.Ordinal) >= 0)
              {
                  // 'none' means you can't Index or Follow!
                  _RobotIndexOK = false;
                  _RobotFollowOK = false;
                  return;
              }
              if (directive.IndexOf("noindex", StringComparison.Ordinal) >= 0)
              {
                  _RobotIndexOK = false;
              }
              if (directive.IndexOf("nofollow", StringComparison.Ordinal) >= 0)
              {
                  _RobotFollowOK = false;
              }
          }

          /// <summary>
          /// For debugging - output the title, description, all local links found in the page
          /// and the raw content, separated by CRLF-delimited rules.
          /// </summary>
          public override string ToString()
          {
              StringBuilder text = new StringBuilder();
              text.Append(Title).Append("\r\n");
              text.Append(Description).Append("\r\n----------------\r\n");
              if (LocalLinks != null)
              {   // may not have been assigned yet if Parse() has not run
                  foreach (object link in LocalLinks)
                  {
                      text.Append(Convert.ToString(link)).Append("\r\n");
                  }
              }
              text.Append("\r\n----------------\r\n");
              text.Append(All).Append("\r\n======================\r\n");
              return text.ToString();
          }
          #endregion

          /// <summary>
          /// Parses the raw page content (<see cref="All"/>): fills in the Title if it is still
          /// blank, applies the META tags (description, keywords, robots, GPS coordinates) and
          /// collects the links into LocalLinks and ExternalLinks.
          /// </summary>
          /// <remarks>
          /// "Original" link search Regex used by the code was from here
          /// http://www.dotnetjunkies.com/Tutorial/1B219C93-7702-4ADF-9106-DFFDF90914CF.dcik
          /// but it was not sophisticated enough to match all tag permutations
          ///
          /// whereas the Regex on this blog will parse ALL attributes from within tags...
          /// IMPORTANT when they're out of order, spaced out or over multiple lines
          /// http://blogs.worldnomads.com.au/matthewb/archive/2003/10/24/158.aspx
          /// http://blogs.worldnomads.com.au/matthewb/archive/2004/04/06/215.aspx
          ///
          /// http://www.experts-exchange.com/Programming/Programming_Languages/C_Sharp/Q_20848043.html
          ///
          /// Parse GPS coordinates (latitude, longitude) [v6]
          /// http://en.wikipedia.org/wiki/Geotagging
          /// </remarks>
          public override void Parse()
          {
              string htmlData = this.All; // htmlData will be munged

              // xenomouse http://www.codeproject.com/aspnet/Spideroo.asp?msg=1271902#xx1271902xx
              if (string.IsNullOrEmpty(this.Title))
              {   // title may have been set previously... non-HTML file type (this will be refactored out, later)
                  string title = Regex.Match(htmlData, TitlePattern, TagOptions).Value;
                  this.Title = title.Trim();
              }

              ParseMetaTags(htmlData);

              // Remove all non 'ignore' comments
              // [v7] fix by brad1213@yahoo.com
              // NOTE(review): the tag name is placed inside a negated character class, so this
              // only tests the single character before "-->" against the tag's letters - confirm.
              htmlData = Regex.Replace(htmlData
                  , @"<!--.*?[^" + Preferences.IgnoreRegionTagNoIndex + "]-->"
                  , ""
                  , RegexOptions.IgnoreCase | RegexOptions.Singleline);

              ParseLinks(htmlData);
          } // Parse

          /// <summary>
          /// Downloads the response body into <see cref="All"/> and records the final Uri and Length.
          /// </summary>
          /// <param name="webresponse">Response to read; its stream is consumed and closed</param>
          /// <returns>true (success); failures surface as exceptions</returns>
          /// <exception cref="ArgumentNullException"><paramref name="webresponse"/> is null</exception>
          /// <remarks>
          /// The encoding name is taken, in order of preference, from the response's
          /// ContentEncoding, the already-known <see cref="Encoding"/>, then "utf-8"; the first
          /// name that is a valid text encoding wins and is stored back in <see cref="Encoding"/>.
          /// NOTE(review): HttpWebResponse.ContentEncoding is the Content-Encoding header
          /// (eg. "gzip"), not the charset - CharacterSet may be what was intended; confirm.
          /// http://www.c-sharpcorner.com/Code/2003/Dec/ReadingWebPageSources.asp
          /// </remarks>
          public override bool GetResponse(System.Net.HttpWebResponse webresponse)
          {
              if (webresponse == null)
              {
                  throw new ArgumentNullException("webresponse");
              }

              // Use the Http header in preference to the one set in META
              // TODO: if still no encoding determined, try to readline the stream until we find either
              // * META Content-Type or * </head> (ie. stop looking for META)
              string[] candidateNames = { webresponse.ContentEncoding, this.Encoding, DefaultEncodingName };
              System.Text.Encoding textEncoding = null;
              foreach (string candidateName in candidateNames)
              {
                  textEncoding = FindEncoding(candidateName);
                  if (textEncoding != null)
                  {
                      this.Encoding = candidateName;
                      break;
                  }
              }
              if (textEncoding == null)
              {   // not expected (the default name is always valid) but keeps the reader safe
                  textEncoding = System.Text.Encoding.UTF8;
                  this.Encoding = DefaultEncodingName;
              }

              this.Uri = webresponse.ResponseUri; // we *may* have been redirected... and we want the *final* URL
              this.Length = webresponse.ContentLength;

              // 'using' closes the reader (and the response stream) even if reading fails
              using (StreamReader reader = new StreamReader(webresponse.GetResponseStream(), textEncoding))
              {
                  this.All = reader.ReadToEnd();
              }
              return true; //success
          }

          /// <summary>
          /// Stripping HTML: removes script and style blocks, ignore-regions (when enabled in
          /// Preferences), comments and all remaining tags, replacing each with a space, then
          /// encodes any leftover &lt; and &gt; characters as entities.
          /// http://www.4guysfromrolla.com/webtech/042501-1.shtml
          /// </summary>
          /// <param name="Html">Raw markup; null or empty yields an empty string</param>
          /// <returns>The text content of the markup</returns>
          /// <remarks>
          /// Using regex to find tags without a trailing slash
          /// http://concepts.waetech.com/unclosed_tags/index.cfm
          ///
          /// http://msdn.microsoft.com/library/en-us/script56/html/js56jsgrpregexpsyntax.asp
          ///
          /// Replace html comment tags
          /// http://www.faqts.com/knowledge_base/view.phtml/aid/21761/fid/53
          /// </remarks>
          protected string StripHtml(string Html)
          {
              if (string.IsNullOrEmpty(Html))
              {
                  return String.Empty;
              }

              // Strips the <script> tags, then the <style> tags, from the Html
              string text = ScriptBlocks.Replace(Html, " ");
              text = StyleBlocks.Replace(text, " ");

              // Strips the <NOSEARCH> regions from the Html (where NOSEARCH is set in the web.config/Preferences class)
              // TODO: NOTE: this only applies to INDEXING the text - links are parsed before now, so they aren't "excluded" by the region!! (yet)
              if (Preferences.IgnoreRegions)
              {
                  // the tag comes from configuration, so escape it before embedding it in a pattern
                  string tag = Regex.Escape(Preferences.IgnoreRegionTagNoIndex);
                  string ignorePattern = "<!--" + tag + "-->" + @"[\s\S]*?" + "<!--/" + tag + "-->";
                  text = Regex.Replace(text, ignorePattern, " ", StripOptions);
              }

              // Strips the <!--comment--> tags, then every remaining tag, from the Html
              text = CommentBlocks.Replace(text, " ");
              text = AnyTag.Replace(text, " ");

              // Replace all _remaining_ < and > with &lt; and &gt;
              text = text.Replace("<", "&lt;");
              text = text.Replace(">", "&gt;");
              return text;
          }

          #region Private helpers
          /// <summary>Collapses every run of whitespace to one space and trims the ends; null becomes empty.</summary>
          private static string CollapseWhitespace(string text)
          {
              if (text == null)
              {
                  return String.Empty;
              }
              return Regex.Replace(text, @"\s+", " ").Trim();
          }

          /// <summary>Returns the text encoding with the given name, or null if the name is blank or not recognised.</summary>
          private static System.Text.Encoding FindEncoding(string name)
          {
              if (string.IsNullOrEmpty(name))
              {
                  return null;
              }
              try
              {
                  return System.Text.Encoding.GetEncoding(name);
              }
              catch (ArgumentException)
              {
                  return null; // not a valid code page name
              }
              catch (NotSupportedException)
              {
                  return null; // valid name, but not supported on this platform
              }
          }

          /// <summary>Finds every META tag in the markup and applies the ones this class understands.</summary>
          private void ParseMetaTags(string htmlData)
          {
              foreach (Match metaTag in Regex.Matches(htmlData, MetaTagPattern, TagOptions))
              {
                  string metaKey = String.Empty;
                  string metaValue = String.Empty;

                  // Loop through the attribute/value pairs inside the tag
                  foreach (Match attribute in Regex.Matches(metaTag.Value, MetaAttributePattern, TagOptions))
                  {
                      string name = attribute.Groups["name"].Value;
                      string value = attribute.Groups["value"].Value;

                      if (string.Equals(name, "http-equiv", StringComparison.OrdinalIgnoreCase))
                      {
                          metaKey = value;
                      }
                      else if (string.Equals(name, "name", StringComparison.OrdinalIgnoreCase))
                      {
                          if (metaKey == String.Empty)
                          {   // if it's already set, HTTP-EQUIV takes precedence
                              metaKey = value;
                          }
                      }
                      else if (string.Equals(name, "content", StringComparison.OrdinalIgnoreCase))
                      {
                          metaValue = value;
                      }
                  }

                  switch (metaKey.ToLowerInvariant())
                  {
                      case "description":
                          this.Description = metaValue;
                          break;
                      case "keywords":
                      case "keyword":
                          base.SetKeywords(metaValue);
                          break;
                      case "robots":
                      case "robot":
                          this.SetRobotDirective(metaValue);
                          break;
                      case "icbm":            // <meta name="ICBM" content="50.167958, -97.133185">
                      case "geo.position":    // <meta name="geo.position" content="50.167958;-97.133185">
                          this.SetGpsCoordinates(metaValue);
                          break;
                  }
              }
          }

          /// <summary>
          /// Collects the targets of every link-bearing tag and stores them in LocalLinks / ExternalLinks.
          /// </summary>
          /// <remarks>
          /// Looks at:
          /// &lt;A&gt; anchor tags, &lt;AREA&gt; imagemap links, &lt;FRAME&gt; frameset links,
          /// &lt;IFRAME&gt; floating frames and &lt;IMG&gt; images (new in [v6]).
          /// The original Regex just found &lt;a href=""&gt; links and was "broken" by spaces, out-of-order attributes, etc:
          /// (?&lt;=&lt;a\s+href=").*?(?="\s*/?&gt;)
          /// http://msdn.microsoft.com/library/en-us/script56/html/js56jsgrpregexpsyntax.asp
          /// </remarks>
          private void ParseLinks(string htmlData)
          {
              ArrayList linkLocal = new ArrayList();
              ArrayList linkExternal = new ArrayList();

              foreach (Match tag in Regex.Matches(htmlData, LinkTagPattern, TagOptions))
              {
                  // [v6] fix by "mike-j-g": compare and store links in lower case
                  string link = ExtractLinkTarget(tag.Value).ToLowerInvariant();

                  // strip off internal links, so we don't index same page over again
                  int hashPos = link.IndexOf('#');
                  if (hashPos > -1)
                  {   // hash links are intra-page links (eg href="index.html#bottom" )
                      link = link.Substring(0, hashPos);
                  }

                  // skip javascript, mailto and empty (which includes a bare "#")
                  if (link.Length == 0
                      || link.IndexOf("javascript:", StringComparison.Ordinal) != -1
                      || link.IndexOf("mailto:", StringComparison.Ordinal) != -1)
                  {
                      continue;
                  }

                  if (IsExternalLink(link))
                  {
                      linkExternal.Add(link);
                  }
                  else if (link.StartsWith("?", StringComparison.Ordinal))
                  {
                      // it's possible to have /?query which sends the querystring to the
                      // 'default' page in a directory
                      linkLocal.Add(this.Uri.AbsolutePath + link);
                  }
                  else
                  {
                      linkLocal.Add(link);
                  }
              }

              this.LocalLinks = linkLocal;
              this.ExternalLinks = linkExternal;
          }

          /// <summary>
          /// Whether a (lower-cased) link carries its own scheme or host: http, https, file,
          /// protocol-relative "//" or a UNC "\\" path.
          /// </summary>
          private static bool IsExternalLink(string link)
          {
              if (link.Length <= 8)
              {
                  return false;
              }
              return link.StartsWith("http://", StringComparison.Ordinal)
                  || link.StartsWith("https://", StringComparison.Ordinal)
                  || link.StartsWith("file://", StringComparison.Ordinal)
                  || link.StartsWith("//", StringComparison.Ordinal)
                  || link.StartsWith(@"\\", StringComparison.Ordinal);
          }

          /// <summary>
          /// Picks the link target out of one tag by parsing ALL its attributes... IMPORTANT when
          /// they're out of order! In addition to 'href' there might also be 'alt', 'class',
          /// 'style', etc, there may be spaces between them and values may be ", ' or unquoted.
          /// </summary>
          /// <returns>The href, onclick-derived or src target, or an empty string if none was found</returns>
          private static string ExtractLinkTarget(string tagHtml)
          {
              string link = String.Empty;
              foreach (Match attribute in Regex.Matches(tagHtml, LinkAttributePattern, TagOptions))
              {
                  string name = attribute.Groups["name"].Value;
                  string value = attribute.Groups["value"].Value;

                  if (string.Equals(name, "href", StringComparison.OrdinalIgnoreCase))
                  {
                      link = value;
                      if (link != "#")
                      {   // stop unless this is just a placeholder href="#", which implies maybe an onclick attribute exists
                          break;
                      }
                  }
                  else if (string.Equals(name, "onclick", StringComparison.OrdinalIgnoreCase))
                  {
                      // extract a filename/link to follow from the first '...' quoted text,
                      // say it was onclick="window.location='top.htm'"
                      int firstApos = value.IndexOf('\'');
                      int secondApos = value.IndexOf('\'', firstApos + 1);
                      if (secondApos > firstApos)
                      {
                          link = value.Substring(firstApos + 1, secondApos - firstApos - 1);
                          break;  // found something: ignore any later href="" which may exist _after_ the onclick
                      }
                  }
                  else if (string.Equals(name, "src", StringComparison.OrdinalIgnoreCase))
                  {   // [v6] indexes images <img src="???">
                      link = value;
                      break;
                  }
              }
              return link;
          }
          #endregion
      }
  }