RobotsTxt.cs

using System;
using System.Collections;
using System.Text;
using Searcharoo.Common; // Preferences (for Proxy)

namespace Searcharoo.Indexer
{
    /// <summary>
    /// Represents the robots.txt rules of a specific host as they apply to a specific user-agent
    /// (i.e. it aggregates all the rules that match the UserAgent, plus the special * rules)
    /// 
    /// http://www.robotstxt.org/
    /// </summary>
    public class RobotsTxt
    {
        #region Private Fields: _FileContents, _UserAgent, _Server, _DenyUrls, _LogString
        private string _FileContents;
        private string _UserAgent;
        private string _Server;
        /// <summary>lowercase list of url fragments that are 'denied' to the UserAgent for this RobotsTxt instance</summary>
        private ArrayList _DenyUrls = new ArrayList();
        private string _LogString = string.Empty;
        #endregion

        #region Constructors: require starting Url and UserAgent to create an object
        private RobotsTxt()
        { }
        public RobotsTxt(Uri startPageUri, string userAgent)
        {
            _UserAgent = userAgent;
            _Server = startPageUri.Host;
            try
            {
                System.Net.WebProxy proxyObject = null;
                if (Preferences.UseProxy)
                {   // [v6] stephenlane80 suggested proxy code
                    proxyObject = new System.Net.WebProxy(Preferences.ProxyUrl, true);
                    proxyObject.Credentials = System.Net.CredentialCache.DefaultCredentials;
                }
                System.Net.HttpWebRequest req = (System.Net.HttpWebRequest)System.Net.WebRequest.Create("http://" + startPageUri.Authority + "/robots.txt");
                if (Preferences.UseProxy) req.Proxy = proxyObject; // [v6] stephenlane80
                System.Net.HttpWebResponse webresponse = (System.Net.HttpWebResponse)req.GetResponse();
                if (webresponse.StatusCode != System.Net.HttpStatusCode.OK)
                {
                    Console.WriteLine("ROBOTS.TXT request returned HttpStatus " + webresponse.StatusCode.ToString());
                    _FileContents = String.Empty;
                    return;
                }
                using (System.IO.StreamReader stream = new System.IO.StreamReader(webresponse.GetResponseStream(), Encoding.ASCII))
                {
                    _FileContents = stream.ReadToEnd();
                } // stream.Close();
                //ProgressEvent(this, new ProgressEventArgs(1, "robots.txt file loaded from " + server + "robots.txt"));
                // [v6] fix by maaguirr (Matt) to read Unix-based ROBOTS.TXT files
                string[] fileLines = _FileContents.Split(Environment.NewLine.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
                bool rulesApply = false;
                foreach (string line in fileLines)
                {
                    RobotInstruction ri = new RobotInstruction(line);
                    if (string.IsNullOrEmpty(ri.Instruction)) continue;   // whitespace-only line - nothing to parse
                    switch (ri.Instruction[0])
                    {
                        case '#':   // comment - ignore
                            break;
                        case 'u':   // User-Agent
                            if ((ri.UrlOrAgent.IndexOf("*") >= 0)
                              || (ri.UrlOrAgent.ToLower().IndexOf(_UserAgent.ToLower()) >= 0))
                            {   // these rules apply (user-agent names are matched case-insensitively)
                                rulesApply = true;
                                Console.WriteLine(ri.UrlOrAgent + " " + rulesApply);
                            }
                            else
                            {
                                rulesApply = false;
                            }
                            break;
                        case 'd':   // Disallow
                            if (rulesApply)
                            {
                                _DenyUrls.Add(ri.UrlOrAgent.ToLower());
                                Console.WriteLine("D " + ri.UrlOrAgent);
                            }
                            else
                            {
                                Console.WriteLine("D " + ri.UrlOrAgent + " is for another user-agent");
                            }
                            break;
                        case 'a':   // Allow (not acted on by this parser)
                            Console.WriteLine("A " + ri.UrlOrAgent);
                            break;
                        default:
                            // empty/unknown/error
                            Console.WriteLine("# Unrecognised robots.txt entry [" + line + "]");
                            break;
                    }
                }
            }
            catch (System.Net.WebException)
            {
                _FileContents = String.Empty;
                //ProgressEvent(this, new ProgressEventArgs(1, "No robots.txt file found at " + server));
            }
            catch (System.Security.SecurityException)
            {
                _FileContents = String.Empty;
                //ProgressEvent(this, new ProgressEventArgs(1, "Could not load ROBOTS.TXT file from " + server));
            }
        }
        #endregion
        #region Methods: Allowed
        /// <summary>
        /// Does the parsed robots.txt file allow this Uri to be spidered by this user-agent?
        /// </summary>
        /// <remarks>
        /// This method does all its matching in lowercase - it expects the _DenyUrls 
        /// elements to be ToLower() and it calls ToLower on the path of the passed-in Uri.
        /// </remarks>
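        /// <example>
        /// Illustrative only (the host name is hypothetical): with a rule "Disallow: /cgi-bin/" loaded,
        /// Allowed(new Uri("http://example.com/cgi-bin/search.aspx")) returns false because the
        /// AbsolutePath starts with the denied fragment, while
        /// Allowed(new Uri("http://example.com/default.aspx")) returns true.
        /// </example>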
        public bool Allowed(Uri uri)
        {
            if (_DenyUrls.Count == 0) return true;
            string url = uri.AbsolutePath.ToLower();
            foreach (string denyUrlFragment in _DenyUrls)
            {
                if (url.Length >= denyUrlFragment.Length)
                {
                    if (url.Substring(0, denyUrlFragment.Length) == denyUrlFragment)
                    {
                        return false;
                    } // else not a match
                } // else url is shorter than fragment, therefore cannot be a 'match'
            }
            if (url == "/robots.txt") return false;
            // no disallows were found, so allow
            return true;
        }
        #endregion
        #region Private class: RobotInstruction
        /// <summary>
        /// Use this class to read/parse the robots.txt file
        /// </summary>
        /// <remarks>
        /// Types of data coming into this class:
        /// User-agent: *       ==> _Instruction='User-agent', _Url='*'
        /// Disallow: /cgi-bin/ ==> _Instruction='Disallow',   _Url='/cgi-bin/'
        /// Disallow: /tmp/     ==> _Instruction='Disallow',   _Url='/tmp/'
        /// Disallow: /~joe/    ==> _Instruction='Disallow',   _Url='/~joe/'
        /// </remarks>
        private class RobotInstruction
        {
            private string _Instruction = string.Empty;
            private string _Url = string.Empty;
            /// <summary>
            /// Constructor requires a line, hopefully in the format [instruction]:[url]
            /// </summary>
            public RobotInstruction(string line)
            {
                string instructionLine = line;
                int commentPosition = instructionLine.IndexOf('#');
                if (commentPosition == 0)
                {
                    _Instruction = "#";
                }
                if (commentPosition >= 0)
                {   // comment somewhere on the line, trim it off
                    instructionLine = instructionLine.Substring(0, commentPosition);
                }
                if (instructionLine.Length > 0)
                {   // wasn't just a comment line (which should have been filtered out before this anyway)
                    // split on the first colon only, so values that themselves contain ':' stay intact
                    string[] lineArray = instructionLine.Split(new char[] { ':' }, 2);
                    _Instruction = lineArray[0].Trim().ToLower();
                    if (lineArray.Length > 1)
                    {
                        _Url = lineArray[1].Trim();
                    }
                }
            }
            /// <summary>
            /// Lower-case part of robots.txt line, before the colon (:)
            /// </summary>
            public string Instruction
            {
                get { return _Instruction; }
            }
            /// <summary>
            /// Part of the robots.txt line after the colon (:), trimmed but not lower-cased
            /// (Disallow values are lower-cased by the caller before being stored in _DenyUrls)
            /// </summary>
            public string UrlOrAgent
            {
                get { return _Url; }
            }
        }
        #endregion
    }
}
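
For reference, a minimal usage sketch of the class above. It is only an illustration, not part of the original file: it assumes the Searcharoo.Indexer namespace is referenced from a small console project, and the start URL, user-agent string and candidate link are hypothetical values.

using System;
using Searcharoo.Indexer;

class RobotsTxtDemo
{
    static void Main()
    {
        // Illustrative values - any crawl start page and user-agent string will do
        Uri startPage = new Uri("http://example.com/default.aspx");
        RobotsTxt robots = new RobotsTxt(startPage, "Searcharoo");

        // Check a candidate link against the parsed Disallow rules before spidering it
        Uri candidate = new Uri("http://example.com/cgi-bin/search.aspx");
        Console.WriteLine("Allowed: " + robots.Allowed(candidate));
    }
}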