Search.cs
上传用户:huiyue
上传日期:2022-04-08
资源大小:1429k
文件大小:18k
源码类别:

搜索引擎

开发平台:

ASP/ASPX

using System;
using System.Collections;
using System.Collections.Generic;
using System.Collections.Specialized;
using System.Diagnostics;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Serialization;
using Searcharoo.Common;
  8. namespace Searcharoo.Engine
  9. {
  10.     public class Search
  11.     {
  12.         #region Private Fields: _Stemmer, _Stopper, _GoChecker, _DisplayTime, _Matches
  13.         /// <summary>Stemmer to use</summary>
  14.         private IStemming _Stemmer;
  15.         /// <summary>Stopper to use</summary>
  16.         private IStopper _Stopper;
  17.         /// <summary>Go word parser to use</summary>
  18.         private IGoWord _GoChecker;
  19.         /// <summary>Display string: time the search too</summary>
  20.         private string _DisplayTime;
  21.         /// <summary>Display string: matches (links and number of)</summary>
  22.         private string _Matches = "";
  23.         /// <summary>Number of results with geolocation v6</summary>
  24.         private int _GeolocCount = 0;
  25.         #endregion
  26.         #region Public Properties: SearchQueryMatchHtml, DisplayTime
  27.         public string SearchQueryMatchHtml
  28.         {
  29.             get { return _Matches; }
  30.             set { _Matches = value; }
  31.         }
  32.         
  33.         public string DisplayTime
  34.         {
  35.             get { return _DisplayTime; }
  36.             set { _DisplayTime = value; }
  37.         }
  38.         public int GeocodedMatches
  39.         {
  40.             get { return _GeolocCount; }
  41.         }
  42.         #endregion
  43.         /// <summary>
  44.         /// Legacy method signature (pre v6)
  45.         /// </summary>
  46.         public SortedList GetResults(string searchterm, Catalog catalog)
  47.         {
  48.             return GetResults(searchterm, catalog, false);  // all results, including but not limited to geoloc
  49.         }
  50.         //public SortedList GetResults(string searchterm, Catalog catalog, bool geolocOnly)
  51.         //{
  52.         //    return GetResults(searchterm, catalog, false, new Cache());
  53.         //}
  54.         /// <summary>
  55.         /// v6
  56.         /// </summary>
  57.         /// <param name="searchterm">search query</param>
  58.         /// <param name="catalog">catalog to search</param>
  59.         /// <param name="geolocOnly">If true, ONLY return results with a lat/long</param>
  60.         /// <param name="cache">Cache of page 'content'</param>
  61.         /// <returns>ResultFile SortedList for display</returns>
  62.         public SortedList GetResults(string searchterm, Catalog catalog, bool geolocOnly)
  63.         {
  64.             SortedList output = new SortedList();
  65.             // ----------------------- DOING A SEARCH ----------------------- 
  66.             if ((null != searchterm) && (null != catalog))
  67.             {
  68.                 SetPreferences();
  69.                 string[] searchTermArray = null, searchTermDisplay = null;
  70.                 /****** Too *********/
  71.                 Regex r = new Regex(@"s+");            //remove all whitespace
  72.                 searchterm = r.Replace(searchterm, " ");// to a single space
  73.                 searchTermArray = searchterm.Split(' '); // then split
  74.                 searchTermDisplay = (string[])searchTermArray.Clone();
  75.                 for (int i = 0; i < searchTermArray.Length; i++)
  76.                 {
  77.                     if (_GoChecker.IsGoWord(searchTermArray[i]))
  78.                     { // was a Go word, just Lower it
  79.                         searchTermArray[i] = searchTermArray[i].ToLower();
  80.                     }
  81.                     else
  82.                     { // Not a Go word, apply stemming
  83.                         searchTermArray[i] = searchTermArray[i].Trim(' ', '?', '"', ',', ''', ';', ':', '.', '(', ')').ToLower();
  84.                         searchTermArray[i] = _Stemmer.StemWord(searchTermArray[i].ToString());
  85.                     }
  86.                 }
  87.                 if (searchterm == String.Empty)
  88.                 {
  89.                     // After trimming the search term, it was found to be empty!
  90.                     return output;
  91.                 }
  92.                 else
  93.                 { // we have a search term!
  94.                     DateTime start = DateTime.Now;  // to show 'time taken' to perform search
  95.                     // Array of arrays of results that match ONE of the search criteria
  96.                     Dictionary<File, List<int>>[] searchResultsArrayArray = new Dictionary<File, List<int>>[searchTermArray.Length];
  97.                     // finalResultsArray is populated with pages that *match* ALL the search criteria
  98.                     HybridDictionary finalResultsArray = new HybridDictionary();
  99.                     bool botherToFindMatches = true;
  100.                     int indexOfShortestResultSet = -1, lengthOfShortestResultSet = -1;
  101.                     for (int i = 0; i < searchTermArray.Length; i++)
  102.                     { // ##### THE SEARCH #####
  103.                         searchResultsArrayArray[i] = catalog.Search(searchTermArray[i].ToString());
  104.                         if (null == searchResultsArrayArray[i])
  105.                         {
  106.                             _Matches += searchTermDisplay[i] + " <font color=gray style='font-size:xx-small'>(not found)</font> ";
  107.                             botherToFindMatches = false; // if *any one* of the terms isn't found, there won't be a 'set' of Matches
  108.                         }
  109.                         else
  110.                         {
  111.                             int resultsInThisSet = searchResultsArrayArray[i].Count;
  112.                             _Matches += "<a href="?" + Preferences.QuerystringParameterName + "=" + searchTermDisplay[i] + "" title="" + searchTermArray[i] + "">"
  113.                                     + searchTermDisplay[i]
  114.                                     + "</a> <font color=gray style='font-size:xx-small'>(" + resultsInThisSet + ")</font> ";
  115.                             if ((lengthOfShortestResultSet == -1) || (lengthOfShortestResultSet > resultsInThisSet))
  116.                             {
  117.                                 indexOfShortestResultSet = i;
  118.                                 lengthOfShortestResultSet = resultsInThisSet;
  119.                             }
  120.                         }
  121.                     }
  122.                     // Find the common files from the array of arrays of documents
  123.                     // matching ONE of the criteria
  124.                     if (botherToFindMatches)                                            // all words have *some* matches
  125.                     { // for each result set [NOT required, but maybe later if we do AND/OR searches)
  126.                         int c = indexOfShortestResultSet;                               // loop through the *shortest* resultset
  127.                         Dictionary<File, List<int>> searchResultsArray = searchResultsArrayArray[c];
  128.                         foreach (File foundInFile in searchResultsArray.Keys)             // for each file in the *shortest* result set
  129.                         {
  130.                             //DictionaryEntry fo = (DictionaryEntry)foundInFile;          // find matching files in the other resultsets
  131.                             
  132.                             int matchcount = 0, totalcount = 0, weight = 0;
  133.                             List<int> occurences = new List<int>();
  134.                             for (int cx = 0; cx < searchResultsArrayArray.Length; cx++)
  135.                             {
  136.                                 totalcount += (cx + 1);                                // keep track, so we can compare at the end (if term is in ALL resultsets)
  137.                                 if (cx == c)                                      // current resultset
  138.                                 {
  139.                                     matchcount += (cx + 1);                          // implicitly matches in the current resultset
  140.                                     //weight += (int)fo.Value;                       // sum the weighting
  141.                                     weight += searchResultsArray[foundInFile].Count;              // sum the weighting
  142.                                     occurences.AddRange(searchResultsArray[foundInFile]);
  143.                                 }
  144.                                 else
  145.                                 {
  146.                                     Dictionary<File, List<int>> searchResultsArrayx = searchResultsArrayArray[cx];
  147.                                     if (null != searchResultsArrayx)
  148.                                     {
  149.                                         foreach (File foundInFilex in searchResultsArrayx.Keys)
  150.                                         {   // for each file in the result set
  151.                                             //DictionaryEntry fox = (DictionaryEntry)foundInFilex;
  152.                                             //if (fo.Key == fox.Key)
  153.                                             if (foundInFile == foundInFilex)
  154.                                             {
  155.                                                 matchcount += (cx + 1);               // and if it matches, track the matchcount
  156.                                                 //weight += (int)fox.Value;           // and weighting; then break out of loop, since
  157.                                                 weight += searchResultsArrayx[foundInFilex].Count;
  158.                                                 occurences.AddRange(searchResultsArrayx[foundInFilex]);
  159.                                                 break;                              // no need to keep looking through this resultset
  160.                                             }
  161.                                         } // foreach
  162.                                     } // if
  163.                                 } // else
  164.                             } // for
  165.                             if ((matchcount > 0) && (matchcount == totalcount)) // was matched in each Array
  166.                             {   // we build the finalResults here, to pass to the formatting code below
  167.                                 // - we could do the formatting here, but it would mix up the 'result generation'
  168.                                 // and display code too much
  169.                                 //fo.Value = weight; // set the 'weight' in the combined results to the sum of individual document matches
  170.                                 
  171.                                 //if (!finalResultsArray.Contains(fo.Key)) finalResultsArray.Add(fo.Key, fo);
  172.                                 if (!finalResultsArray.Contains(foundInFile)) finalResultsArray.Add(foundInFile, occurences); //.Count
  173.                             } // if
  174.                         } // foreach
  175.                     }
  176.                     // Time taken calculation
  177.                     Int64 ticks = DateTime.Now.Ticks - start.Ticks;
  178.                     TimeSpan taken = new TimeSpan(ticks);
  179.                     if (taken.Seconds > 0)
  180.                     {
  181.                         _DisplayTime = taken.Seconds + " seconds";
  182.                     }
  183.                     else if (taken.TotalMilliseconds > 0)
  184.                     {
  185.                         _DisplayTime = Convert.ToInt32(taken.TotalMilliseconds) + " milliseconds";
  186.                     }
  187.                     else
  188.                     {
  189.                         _DisplayTime = "less than 1 millisecond";
  190.                     }
  191.                     // The preceding 80 lines (or so) replaces this single line from Version 1
  192.                     //       Hashtable searchResultsArray = m_catalog.Search (searchterm);
  193.                     // when only single-word-searches were supported. Look closely and you'll see this line
  194.                     // labelled #THE SEARCH# still in the code above...
  195.                     // Format the results
  196.                     if (finalResultsArray.Count > 0)
  197.                     { // intermediate data-structure for 'ranked' result HTML
  198.                         //SortedList 
  199.                         output = new SortedList(finalResultsArray.Count); // empty sorted list
  200.                         //                DictionaryEntry fo;
  201.                         ResultFile infile;
  202.                         //                string result="";
  203.                         int sortrank = 0;
  204.                         // build each result row
  205.                         foreach (object foundInFile in finalResultsArray.Keys)
  206.                         {
  207.                             // Create a ResultFile with it's own Rank
  208.                             infile = new ResultFile((File)foundInFile);
  209.                             
  210.                             // [v7] if we have a cache of the page's content, we'll display the relevant 
  211.                             // text excerpt in the search results
  212.                             if (catalog.FileCache.Contains(infile.Url))
  213.                             {
  214.                                 string desc = "";
  215.                                 string[] words = catalog.FileCache.GetDocumentCache(infile.Url);
  216.                                 
  217.                                 int position = (words.Length / 2);      // # find the position of a searched-for word here !!!!!!!
  218.                                 
  219.                                 if (words.Length < 10)
  220.                                 {
  221.                                     for (int i = 0; i < words.Length; i++)
  222.                                     {
  223.                                         desc += words[i] + " ";
  224.                                     }
  225.                                 }
  226.                                 else
  227.                                 {
  228.                                     List<int> pos = (List<int>)finalResultsArray[foundInFile];
  229.                                     pos.Sort();
  230.                                     int q = 0;
  231.                                     position = pos[q];
  232.                                     List<int> useablePos = new List<int>();
  233.                                     foreach (int p in pos)
  234.                                     {
  235.                                         if (p < (position + 50)) useablePos.Add(p);
  236.                                         if (p > (position + 50)) break;
  237.                                     }
  238.                                     int lowerBound = (position < 24) ? position : 24;
  239.                                     int upperBound = (position < 24) ? 48 - position : 24;
  240.                                     lowerBound = position - lowerBound;
  241.                                     upperBound = position + upperBound;
  242.                                     if (upperBound > words.Length) upperBound = words.Length - 1;
  243.                                     for (int i = lowerBound; i < upperBound; i++)
  244.                                     {
  245.                                         if (i == position) desc += "<b>";
  246.                                         desc += words[i] + " ";
  247.                                         if (i == position)
  248.                                         {
  249.                                             desc += "</b>";
  250.                                             q++;
  251.                                             if (q < pos.Count) {   position = pos[q]; }
  252.                                         }
  253.                                     }
  254.                                 }
  255.                                 infile.Description = desc;
  256.                             }
  257.                             if (geolocOnly && (infile.GpsLocation == null || infile.GpsLocation == new Location() ))
  258.                             {
  259.                                 // don't add this ResultFile to output [v6]
  260.                             }
  261.                             else
  262.                             {
  263.                                 // Jim Harkins [sort for paging] ported from VB to C#
  264.                                 // http://www.codeproject.com/aspnet/spideroo.asp#xx927327xx
  265.                                 //infile.Rank = (int)((DictionaryEntry)finalResultsArray[foundInFile]).Value;
  266.                                 infile.Rank = (int)((List<int>)finalResultsArray[foundInFile]).Count;
  267.                                 sortrank = infile.Rank * -1000; // Assume not 'thousands' of results
  268.                                 if (output.Contains(sortrank))
  269.                                 { // rank exists - drop key index one number until it fits
  270.                                     for (int i = 1; i < 999; i++)
  271.                                     {
  272.                                         sortrank++;
  273.                                         if (!output.Contains(sortrank))
  274.                                         {
  275.                                             output.Add(sortrank, infile);
  276.                                             if (infile.GpsLocation != null) _GeolocCount += 1;
  277.                                             break;
  278.                                         }
  279.                                     }
  280.                                 }
  281.                                 else
  282.                                 {
  283.                                     output.Add(sortrank, infile);
  284.                                     if (infile.GpsLocation != null) _GeolocCount += 1;
  285.                                 }
  286.                             }
  287.                             sortrank = 0; // reset for next pass
  288.                         }
  289.                         // Jim Harkins [paged results]
  290.                         // http://aspnet.4guysfromrolla.com/articles/081804-1.aspx
  291.                     } // else Count == 0, so output SortedList will be empty
  292.                 } 
  293.             }
  294.             return output;
  295.         }
  296.         private void SetPreferences()
  297.         {
  298.             // Set-up Stemming (if required)
  299.             switch (Preferences.StemmingMode)
  300.             {
  301.                 case 1:
  302.                     _Stemmer = new PorterStemmer(); //Stemmer = new SnowStemming();
  303.                     break;
  304.                 case 2:
  305.                     _Stemmer = new PorterStemmer();
  306.                     break;
  307.                 default:
  308.                     _Stemmer = new NoStemming();
  309.                     break;
  310.             }
  311.             switch (Preferences.GoWordMode)
  312.             {
  313.                 case 1:
  314.                     _GoChecker = new ListGoWord();
  315.                     break;
  316.                 default:
  317.                     _GoChecker = new NoGoWord();
  318.                     break;
  319.             }
  320.         }
  321.     }
  322. }