Spider.cs
上传用户:huiyue
上传日期:2022-04-08
资源大小:1429k
文件大小:27k
源码类别:

搜索引擎

开发平台:

ASP/ASPX

  1. using System;
  2. using System.Xml.Serialization;
  3. using System.Collections.Specialized;
  4. using System.Text;
  5. using System.Text.RegularExpressions;
  6. using System.Collections;
  7. using Searcharoo.Common;
  8. namespace Searcharoo.Indexer
  9. {
  10.     /// <summary>
  11.     /// The Spider that crawls your website, link by link.
  12.     /// </summary>
  13.     /// <remarks>
  14.     /// In the Searcharoo (v2), this code was 'embedded' in an ASPX page. 
  15.     /// This was for ease of reporting 'progress' via Response.Write
  16.     /// statements. The code now uses an EventHandler to trigger progress reporting
  17.     /// by the calling code - so now it could be Reponse.Write, or saved to a file 
  18.     /// or any other mechanism. (v4) takes advantage of this by wrapping the Spider in a console
  19.     /// application so you can run it outside of a website.
  20.     /// 
  21.     /// Some of the references used when researching this code:
  22.     ///
  23.     /// C# and the Web: Writing a Web Client Application with Managed Code in the Microsoft .NET Framework - not helpful...
  24.     /// http://msdn.microsoft.com/msdnmag/issues/01/09/cweb/default.aspx
  25.     ///
  26.     /// Retrieving a List of Links & Images from a Web Page
  27.     /// http://www.dotnetjunkies.com/Tutorial/1B219C93-7702-4ADF-9106-DFFDF90914CF.dcik
  28.     /// 
  29.     /// FUTURE: In case connecting via a Proxy is required for the spidering
  30.     /// http://www.experts-exchange.com/Programming/Programming_Languages/Dot_Net/Q_20974147.html
  31.     /// http://msdn.microsoft.com/library/en-us/cpref/html/frlrfsystemnetglobalproxyselectionclasstopic.asp
  32.     /// </remarks>
  33.     public class Spider
  34.     {
        #region Private fields: visited, count, catalog, 
        /// <summary>Start Uri of the crawl currently in progress; used to decide whether a fully-qualified link is 'local' (see ProcessUri).</summary>
        private Uri _CurrentStartUri = null;
        /// <summary>Lower-cased AbsoluteUri of _CurrentStartUri, kept for string comparisons.</summary>
        private string _CurrentStartUriString = String.Empty;
        /// <summary>Urls (lower-cased strings) already spidered, so pages are not fetched twice.</summary>
        private ArrayList _Visited = new ArrayList();
        /// <summary>NOTE(review): never read or written in this file — _Visited is the collection actually used; confirm before removing.</summary>
        private Hashtable _VisitedHashtable = new Hashtable();
        //private int _Count = 0;
        /// <summary>Word/document index built by the current crawl and returned by BuildCatalog.</summary>
        private Catalog _Catalog;
        //private Cache _Cache;
        
        /// <summary>Stemmer to use (chosen from Preferences.StemmingMode in SetPreferences)</summary>
        private IStemming _Stemmer;
        /// <summary>Stop-word filter to use (chosen from Preferences.StoppingMode in SetPreferences)</summary>
        private IStopper _Stopper;
        /// <summary>Go word parser to use (go-words bypass stemming/stopping in AddToCatalog)</summary>
        private IGoWord _GoChecker;
        /// <summary>Loads and acts as 'authorisation' for robot-excluded Urls; reloaded per start page</summary>
        private RobotsTxt _Robot;
        /// <summary>Session cookies carried across requests during the crawl (SIMONJONES)</summary>
        System.Net.CookieContainer _CookieContainer = new System.Net.CookieContainer();
        #endregion
  59.         #region Public events/handlers: SpiderProgressEvent
  60.         /// <summary>
  61.         /// Event Handler to communicate progress and errors back to the calling code
  62.         /// </summary>
  63.         /// <remarks>
  64.         /// Learn about Events from a few different places
  65.         /// http://www.codeproject.com/csharp/csevents01.asp
  66.         /// http://www.csharphelp.com/archives/archive253.html
  67.         /// http://www.devhood.com/Tutorials/tutorial_details.aspx?tutorial_id=380
  68.         /// </remarks>
  69.         public event SpiderProgressEventHandler SpiderProgressEvent;
  70.         /// <summary>
  71.         /// Only trigger the event if a Handler has been attached.
  72.         /// </summary>
  73.         private void ProgressEvent(object sender, ProgressEventArgs pea)
  74.         {
  75.             if (this.SpiderProgressEvent != null)
  76.             {
  77.                 SpiderProgressEvent(sender, pea);
  78.             }
  79.         }
  80.         #endregion
  81.         /// <summary>
  82.         /// Takes a single Uri (Url) and returns the catalog that is generated
  83.         /// by following all the links from that point.
  84.         /// </summary>
  85.         /// <remarks>
  86.         ///This is the MAIN method of the indexing system.
  87.         /// </remarks>
  88.         public Catalog BuildCatalog (Uri startPageUri)
  89.         {
  90.             return BuildCatalog(new Uri[]{startPageUri});
  91.             /*
  92.             _Catalog = new Catalog();
  93.             
  94.             _CurrentStartUri = startPageUri;    // to compare against fully qualified links
  95.             _CurrentStartUriString = _CurrentStartUri.AbsoluteUri.ToString().ToLower();
  96.             ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog (single Uri) " + startPageUri.AbsoluteUri));
  97.             // Setup Stop, Go, Stemming
  98.             SetPreferences();
  99.             _Robot = new RobotsTxt(startPageUri, Preferences.RobotUserAgent);
  100.             // GETS THE FIRST DOCUMENT, AND STARTS THE SPIDER! -- create the 'root' document to start the search
  101.             // HtmlDocument htmldoc = new HtmlDocument(startPageUri);
  102.             // RECURSIVE CALL TO 'Process()' STARTS HERE
  103.             ProcessUri(startPageUri, 0);
  104.             // Now we've FINISHED Spidering
  105.             ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog() complete."));
  106.             ProgressEvent(this, new ProgressEventArgs(2, "Serializing to disk location " + Preferences.CatalogFileName));
  107.             // Serialization of the Catalog, so we can load it again if the server Application is restarted
  108.             _Catalog.Save();
  109.            
  110.             ProgressEvent(this, new ProgressEventArgs(3, "Save to disk " + Preferences.CatalogFileName + " successful"));
  111.             return _Catalog;// finished, return to the calling code to 'use'
  112.              */
  113.         }
  114.         /// <summary>
  115.         /// [v6]
  116.         /// </summary>
  117.         /// <param name="startPageUri">array of start pages</param>
  118.         /// <returns>Catalog of words/documents</returns>
  119.         public Catalog BuildCatalog(Uri[] startPageUris)
  120.         {
  121.             _Catalog = new Catalog(); //_Cache = new Cache(); // [v7]
  122.             ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog (Uri Array) count: " + startPageUris.Length.ToString()));
  123.             // Setup Stop, Go, Stemming
  124.             SetPreferences();
  125.             foreach (Uri startPageUri in startPageUris)
  126.             {
  127.                 _CurrentStartUri = startPageUri;    // to compare against fully qualified links
  128.                 _CurrentStartUriString = _CurrentStartUri.AbsoluteUri.ToString().ToLower();
  129.                 ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog (start Uri) " + startPageUri.AbsoluteUri));
  130.                 _Robot = new RobotsTxt(startPageUri, Preferences.RobotUserAgent);
  131.                 // GETS THE FIRST DOCUMENT, AND STARTS THE SPIDER! -- create the 'root' document to start the search
  132.                 // HtmlDocument htmldoc = new HtmlDocument(startPageUri);
  133.                 // RECURSIVE CALL TO 'Process()' STARTS HERE
  134.                 ProcessUri(startPageUri, 0);
  135.                 ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog (end Uri) " + startPageUri.AbsoluteUri));
  136.             }
  137.             // Now we've FINISHED Spidering
  138.             ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog() complete."));
  139.             ProgressEvent(this, new ProgressEventArgs(2, "Serializing to disk location " + Preferences.CatalogFileName));
  140.             // Serialization of the Catalog, so we can load it again if the server Application is restarted
  141.             _Catalog.Save();
  142.             //_Cache.Save(); //[v7]
  143.             ProgressEvent(this, new ProgressEventArgs(3, "Save to disk " + Preferences.CatalogFileName + " successful"));
  144.             return _Catalog;// finished, return to the calling code to 'use'
  145.         }
  146.         /// <summary>
  147.         /// Setup Stop, Go, Stemming
  148.         /// </summary>
  149.         private void SetPreferences()
  150.         {
  151.             switch (Preferences.StemmingMode)
  152.             {
  153.                 case 1:
  154.                     ProgressEvent(this, new ProgressEventArgs(1, "Stemming enabled."));
  155.                     _Stemmer = new PorterStemmer(); //Stemmer = new SnowStemming();
  156.                     break;
  157.                 case 2:
  158.                     ProgressEvent(this, new ProgressEventArgs(1, "Stemming enabled."));
  159.                     _Stemmer = new PorterStemmer();
  160.                     break;
  161.                 default:
  162.                     ProgressEvent(this, new ProgressEventArgs(1, "Stemming DISabled."));
  163.                     _Stemmer = new NoStemming();
  164.                     break;
  165.             }
  166.             switch (Preferences.StoppingMode)
  167.             {
  168.                 case 1:
  169.                     ProgressEvent(this, new ProgressEventArgs(1, "Stop words shorter than 3 chars."));
  170.                     _Stopper = new ShortStopper();
  171.                     break;
  172.                 case 2:
  173.                     ProgressEvent(this, new ProgressEventArgs(1, "Stop words from list."));
  174.                     _Stopper = new ListStopper();
  175.                     break;
  176.                 default:
  177.                     ProgressEvent(this, new ProgressEventArgs(1, "Stopping DISabled."));
  178.                     _Stopper = new NoStopping();
  179.                     break;
  180.             }
  181.             switch (Preferences.GoWordMode)
  182.             {
  183.                 case 1:
  184.                     ProgressEvent(this, new ProgressEventArgs(1, "Go Words enabled."));
  185.                     _GoChecker = new ListGoWord();
  186.                     break;
  187.                 default:
  188.                     ProgressEvent(this, new ProgressEventArgs(1, "Go Words DISabled."));
  189.                     _GoChecker = new NoGoWord();
  190.                     break;
  191.             }
  192.         }
  193.         /// <summary>
  194.         /// Recursive 'process' method: takes the uri input, downloads it (following redirects if required)
  195.         /// receiving a Document subclass, then calling the Parse() method to get the words which
  196.         /// are then added to the Catalog.
  197.         /// </summary>
  198.         protected int ProcessUri(Uri uri, int level)
  199.         {
  200.             // [j105 Rob] recursion fix 
  201.             // http://www.codeproject.com/aspnet/Spideroo.asp?df=100&forumid=71481&select=1862807#xx1862807xx
  202.             if (level > Preferences.RecursionLimit) return Preferences.RecursionLimit;
  203.             int wordcount = 0;
  204.             string url = uri.AbsoluteUri.ToLower(); // [v6]
  205.             if (!_Robot.Allowed(uri))
  206.             {
  207.                 ProgressEvent(this, new ProgressEventArgs(2, "RobotsTxt exclusion prevented indexing of " + url + ""));
  208.             }
  209.             else
  210.             {
  211.                 bool alreadyVisited = _Visited.Contains(url);
  212.                 if (!alreadyVisited && Preferences.UseDefaultDocument)
  213.                 {   // [v7] First-attempt at treating 'folder' Urls (eg mysite.com/Photos) and default documents (eg mysite.com/Photos/Default.aspx)
  214.                     // as the SAME PAGE to prevent duplicates in the search results. To do this, when we find a Url that looks like a 'folder'
  215.                     // (eg. no file extension OR ends with a / slash) we add all three 'variations' of that Url to the _Visited list so the other
  216.                     // variations aren't even retrieved/indexed.
  217.                     string defaultDoc = Preferences.DefaultDocument;
  218.                     int defaultDocLength = defaultDoc.Length;
  219.                     int defaultDocLengthPlusSlash = defaultDoc.Length;
  220.                     if (url.LastIndexOf("/") == (url.Length - 1))
  221.                     {   // Variation #1: ends in slash /
  222.                         alreadyVisited = _Visited.Contains(url + defaultDoc) || _Visited.Contains(url.Trim('/'));
  223.                         _Visited.Add(url + defaultDoc);
  224.                         _Visited.Add(url.Trim('/'));
  225.                     }
  226.                     else if (System.IO.Path.GetExtension(url) == "")
  227.                     {   // Variation #2: no file extension
  228.                         alreadyVisited = _Visited.Contains(url + "/" + defaultDoc) || _Visited.Contains(url + "/");
  229.                         _Visited.Add(url + "/" + defaultDoc);
  230.                         _Visited.Add(url + "/");
  231.                     }
  232.                     else if (url.LastIndexOf(defaultDoc) == (url.Length - defaultDocLength))
  233.                     {   // Variation #3: ends in /default.aspx (or whatever the specified default document is: index.html, default.htm, etc)
  234.                         alreadyVisited = _Visited.Contains(url.Substring(0, (url.Length - defaultDocLengthPlusSlash))) 
  235.                                       || _Visited.Contains(url.Substring(0, (url.Length - defaultDocLength)));
  236.                         _Visited.Add(url.Substring(0, (url.Length - defaultDocLengthPlusSlash)));
  237.                         _Visited.Add(url.Substring(0, (url.Length - defaultDocLength)));
  238.                     }
  239.                 }
  240.                 if (alreadyVisited)
  241.                 {
  242.                     ProgressEvent(this, new ProgressEventArgs(2, url + " already spidered"));
  243.                 }
  244.                 else
  245.                 {
  246.                     _Visited.Add(url); 
  247.                     ProgressEvent(this, new ProgressEventArgs(2, url + " being downloaded"));
  248.                     // ### IMPORTANT ### 
  249.                     // Uri is actually retrieved here!
  250.                     Document downloadDocument = Download(uri);
  251.                     if (null == downloadDocument)
  252.                     {
  253.                         ProgressEvent(this, new ProgressEventArgs(1, "Download() failed on " + url + ""));
  254.                     }
  255.                     else
  256.                     {
  257.                         // ### IMPORTANT ### 
  258.                         // Uri downloaded content is actually parsed here!
  259.                         downloadDocument.Parse();
  260.                         if (downloadDocument.RobotIndexOK)
  261.                         {
  262.                             wordcount = AddToCatalog (downloadDocument);
  263.                         }
  264.                         else
  265.                         {
  266.                             ProgressEvent(this, new ProgressEventArgs(2, "RobotMeta exclusion prevented indexing of " + url + ""));
  267.                         }
  268.                     }
  269.                     if (wordcount > 0)
  270.                     {
  271.                         ProgressEvent(this, new ProgressEventArgs(1, downloadDocument.Title + " parsed " + wordcount + " words!"));
  272.                         ProgressEvent(this, new ProgressEventArgs(4, downloadDocument.Title + " " + downloadDocument.Uri.AbsoluteUri + System.Environment.NewLine
  273.                                                                     + (downloadDocument.RobotIndexOK ? "Indexed" : "RobotMeta Excluded Index")
  274.                                                                     + downloadDocument.Description));
  275.                     }
  276.                     else
  277.                     {
  278.                         ProgressEvent(this, new ProgressEventArgs(2, url + " parsed but zero words found."));
  279.                     }
  280.                     // [v7] bugfix
  281.                     if (null == downloadDocument)
  282.                     { 
  283.                         // why is it null here?
  284.                         System.Diagnostics.Debug.WriteLine(url + " resulted in a null downloadDocument");
  285.                     }
  286.                     else
  287.                     {
  288.                         // Move some 'External' to 'Local' links
  289.                         ArrayList elinks = (ArrayList)downloadDocument.ExternalLinks.Clone();
  290.                         for (int l = 0; l < elinks.Count; l++)
  291.                         {
  292.                             string link = elinks[l].ToString();
  293.                             Uri linkUri = new Uri(link);
  294.                                                     //if (link.ToLower().StartsWith(this._CurrentStartUriString))
  295.                             if (_CurrentStartUri.IsBaseOf(linkUri))
  296.                             {   // if this link is actually 'under' the starting one, treat it as internal (even 
  297.                                 // though it started with http:
  298.                                 downloadDocument.ExternalLinks.Remove(link);
  299.                                 downloadDocument.LocalLinks.Add(link);
  300.                             }
  301.                         }
  302.                         // ### Loop through the 'local' links in the document ###
  303.                         // ### and parse each of them recursively ###
  304.                         if (null != downloadDocument && null != downloadDocument.LocalLinks && downloadDocument.RobotFollowOK)
  305.                         { // only if the Robot meta says it's OK
  306.                             foreach (object link in downloadDocument.LocalLinks)
  307.                             {
  308.                                 try
  309.                                 {
  310.                                     Uri urlToFollow = new Uri(downloadDocument.Uri, link.ToString());
  311.                                     ProcessUri(urlToFollow, level + 1); // calls THIS method, recursively
  312.                                 }
  313.                                 catch (Exception ex)
  314.                                 {
  315.                                     ProgressEvent(this, new ProgressEventArgs(2, "new Uri(" + downloadDocument.Uri + ", " + link.ToString() + ") invalid : " + ex.Message + ""));
  316.                                 }
  317.                             }
  318.                         } // process local links
  319.                     } // document was not null
  320.                 } // not visited
  321.             } // robot allowed
  322.             return level;
  323.         }
  324.         /// <summary>
  325.         /// Attempts to download the Uri and (based on it's MimeType) use the DocumentFactory
  326.         /// to get a Document subclass object that is able to parse the downloaded data.
  327.         /// </summary>
  328.         /// <remarks>
  329.         /// http://www.123aspx.com/redir.aspx?res=28320
  330.         /// </remarks>
  331.         protected Document Download (Uri uri)
  332.         {
  333.             bool success = false;
  334.             // Open the requested URL
  335.             System.Net.WebProxy proxyObject = null;
  336.             if (Preferences.UseProxy)
  337.             {   // [v6] stephenlane80 suggested proxy code
  338.                 proxyObject = new System.Net.WebProxy(Preferences.ProxyUrl, true);
  339.                 proxyObject.Credentials = System.Net.CredentialCache.DefaultCredentials;
  340.             }
  341.             // [v6] Erick Brown [work] suggested fix for & in querystring
  342.             string unescapedUri = Regex.Replace(uri.AbsoluteUri, @"&amp;amp;", @"&", RegexOptions.IgnoreCase);
  343.             System.Net.HttpWebRequest req = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(unescapedUri); 
  344.             req.AllowAutoRedirect = true;
  345.             req.MaximumAutomaticRedirections = 3;
  346.             req.UserAgent = Preferences.UserAgent; //"Mozilla/6.0 (MSIE 6.0; Windows NT 5.1; Searcharoo.NET; robot)";
  347.             req.KeepAlive = true;
  348.             req.Timeout = Preferences.RequestTimeout * 1000; //prefRequestTimeout 
  349.             if (Preferences.UseProxy) req.Proxy = proxyObject; // [v6] stephenlane80
  350.             // SIMONJONES http://codeproject.com/aspnet/spideroo.asp?msg=1421158#xx1421158xx
  351.             req.CookieContainer = new System.Net.CookieContainer();
  352.             req.CookieContainer.Add(_CookieContainer.GetCookies(uri));
  353.             // Get the stream from the returned web response
  354.             System.Net.HttpWebResponse webresponse = null;
  355.             try
  356.             {
  357.                 webresponse = (System.Net.HttpWebResponse)req.GetResponse();
  358.             }
  359.             catch (System.Net.WebException we)
  360.             {   //remote url not found, 404; remote url forbidden, 403
  361.                 ProgressEvent(this, new ProgressEventArgs(2, "skipped  " + uri.AbsoluteUri + " response exception:" + we.ToString() + ""));
  362.             }
  363.             Document currentUriDocument = null;
  364.             if (webresponse != null)
  365.             {
  366.                 /* SIMONJONES */
  367.                 /* **************** this doesn't necessarily work yet...
  368.                 if (webresponse.ResponseUri != htmldoc.Uri)
  369.                 { // we've been redirected, 
  370.                     if (visited.Contains(webresponse.ResponseUri.ToString().ToLower()))
  371.                     {
  372.                         return true;
  373.                     }
  374.                     else
  375.                     {
  376.                         visited.Add(webresponse.ResponseUri.ToString().ToLower());
  377.                     }
  378.                 }*/
  379.                 try
  380.                 {
  381.                     webresponse.Cookies = req.CookieContainer.GetCookies(req.RequestUri);
  382.                     // handle cookies (need to do this in case we have any session cookies)
  383.                     foreach (System.Net.Cookie retCookie in webresponse.Cookies)
  384.                     {
  385.                         bool cookieFound = false;
  386.                         foreach (System.Net.Cookie oldCookie in _CookieContainer.GetCookies(uri))
  387.                         {
  388.                             if (retCookie.Name.Equals(oldCookie.Name))
  389.                             {
  390.                                 oldCookie.Value = retCookie.Value;
  391.                                 cookieFound = true;
  392.                             }
  393.                         }
  394.                         if (!cookieFound)
  395.                         {
  396.                             _CookieContainer.Add(retCookie);
  397.                         }
  398.                     }
  399.                 }
  400.                 catch (Exception ex)
  401.                 {
  402.                     ProgressEvent(this, new ProgressEventArgs(3, "Cookie processing error : " + ex.Message + ""));
  403.                 }
  404.                 /* end SIMONJONES */
  405.                 
  406.                 currentUriDocument = DocumentFactory.New(uri, webresponse);
  407.                 success = currentUriDocument.GetResponse(webresponse);
  408.                 webresponse.Close();
  409.                 ProgressEvent(this, new ProgressEventArgs(2, "Trying index mime type: " + currentUriDocument.MimeType + " for " + currentUriDocument.Uri + ""));
  410.                 
  411.                 _Visited.Add(currentUriDocument.Uri);   // [v7] brad1213@yahoo.com capture redirected Urls
  412.                                                         // relies on Document 'capturing' the final Uri
  413.                                                         // this.Uri = webresponse.ResponseUri;
  414.             }
  415.             else
  416.             {
  417.                 ProgressEvent(this, new ProgressEventArgs(2, "No WebResponse for " + uri + ""));
  418.                 success = false;
  419.             }
  420.             return currentUriDocument;
  421.         }
  422.         /// <summary>
  423.         /// Add the Document subclass to the catalog, BY FIRST 'copying' the main
  424.         /// properties into a File class. The distinction is a bit arbitrary: Documents
  425.         /// are downloaded and indexed, but their content is modelled in as a File
  426.         /// class in the Catalog (and represented as a ResultFile object in the search ASPX page)
  427.         /// </summary>
  428.         /// <return>Number of words catalogued in the Document</return>
  429.         protected int AddToCatalog(Document downloadDocument)
  430.         {
  431.             File infile = new File(downloadDocument.Uri.AbsoluteUri
  432.                 , downloadDocument.Title
  433.                 , downloadDocument.Description
  434.                 , DateTime.Now
  435.                 , downloadDocument.Length
  436.                 , downloadDocument.GpsLocation
  437.                 , downloadDocument.Extension
  438.                 , downloadDocument.KeywordString); // [v6] Gps, Extension, keywords
  439.             // ### Loop through words in the file ###
  440.             int i = 0, j = 0;   // count of words, count of words _indexed
  441.             string key = "";    // temp variables
  442.             
  443.             foreach (string word in downloadDocument.WordsArray)
  444.             {
  445.                 key = word.ToLower();
  446.                 if (!_GoChecker.IsGoWord(key))
  447.                 { // not a special case, parse like any other word
  448.                     RemovePunctuation(ref key);
  449.                     if (!IsNumber(ref key))
  450.                     { // not a number, so get rid of numeric seperators and catalog as a word
  451.                         // TODO: remove inline punctuation, split hyphenated words?
  452.                         // http://blogs.msdn.com/ericgu/archive/2006/01/16/513645.aspx
  453.                         key = System.Text.RegularExpressions.Regex.Replace(key, "[,.]", "", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
  454.                         // Apply Stemmer (set by preferences)
  455.                         key = _Stemmer.StemWord(key);
  456.                         // Apply Stopper (set by preferences)
  457.                         key = _Stopper.StopWord(key);
  458.                     }
  459.                 }
  460.                 else
  461.                 {
  462.                     ProgressEvent(this, new ProgressEventArgs(4, "Found GoWord " + key + " in " + downloadDocument.Title));
  463.                 }
  464.                 if (key != String.Empty)
  465.                 {
  466.                     _Catalog.Add(key, infile, i);
  467.                     j++;
  468.                 }
  469.                 i++;
  470.             }
  471.             _Catalog.FileCache.Add(downloadDocument.WordsArray, infile);
  472.             return i;
  473.         }
  474.         /// <summary>
  475.         /// Each word is identified purely by the whitespace around it. It could still include punctuation
  476.         /// attached to either end of the word, or "in" the word (ie a dash, which we will remove for
  477.         /// indexing purposes)
  478.         /// </summary>
  479.         /// <remarks>
  480.         /// Andrey Shchekin suggests 'unicode' regex [w] - equivalent to [p{Ll}p{Lu}p{Lt}p{Lo}p{Nd}p{Pc}]
  481.         /// http://www.codeproject.com/cs/internet/Searcharoo_4.asp?df=100&forumid=397394&select=1992575#xx1992575xx
  482.         /// so [^w0-9,.] as a replacement for [^a-z0-9,.]
  483.         /// which might remove the need for 'AssumeAllWordsAreEnglish'. TO BE TESTED.
  484.         /// </remarks>
  485.         private void RemovePunctuation(ref string word)
  486.         {   // this stuff is a bit 'English-language-centric'
  487.             if (Preferences.AssumeAllWordsAreEnglish)
  488.             {   // if all words are english, this strict parse to remove all punctuation ensures
  489.                 // words are reduced to their least unique form before indexing
  490.                 //word = System.Text.RegularExpressions.Regex.Replace(word, @"[^a-z0-9,.]", "", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
  491.                 
  492.                 // [v6] testing better i18n
  493.                 word = System.Text.RegularExpressions.Regex.Replace(word, @"[^w0-9,.]", "", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
  494.             }
  495.             else 
  496.             {   // by stripping out this specific list of punctuation only, there is potential to leave lots 
  497.                 // of cruft in the word before indexing BUT this will allow any language to be indexed
  498.                 word = word.Trim(' ','?','"',',',''',';',':','.','(',')','[',']','%','*','$','-'); 
  499.             }
  500.         }
  501.         /// <summary>
  502.         /// TODO: parse numbers here 
  503.         /// ie remove thousands seperator, currency, etc
  504.         /// and also trim decimal part, so number searches are only on the integer value
  505.         /// </summary>
  506.         private bool IsNumber(ref string word)
  507.         {
  508.             try
  509.             {
  510.                 long number = Convert.ToInt64(word); //;int.Parse(word);
  511.                 word = number.ToString();
  512.                 return (word != String.Empty);
  513.             }
  514.             catch
  515.             {
  516.                 return false;
  517.             }
  518.         }
  519.     }
  520. }