Catalog.cs
上传用户:huiyue
上传日期:2022-04-08
资源大小:1429k
文件大小:17k
源码类别:

搜索引擎

开发平台:

ASP/ASPX

  1. using System;
  2. using System.IO;
  3. using System.Xml.Serialization;
  4. using System.Text;
  5. using System.Text.RegularExpressions;
  6. using System.Collections;
  7. using System.Collections.Generic;
  8. using System.Collections.Specialized;
  9. using cd.net;
  10. namespace Searcharoo.Common
  11. {
  12.     /// <summary>
  13.     /// Catalog of Words (and Files)
  14.     /// </summary>
  15.     /// <remarks>
  16.     /// XmlInclude
  17.     /// http://pluralsight.com/blogs/craig/archive/2004/07/08/1580.aspx
  18.     /// 
  19.     /// added v6 : fallback loading of catalog from the WebAppCatalogResource assembly
  20.     /// to get around issues where the Trust level (eg. in shared hosting) is restricted
  21.     /// to NOT ALLOW File.IO or WebClient requests. This requires the catalog to be built
  22.     /// using the Indexer.EXE then compiled into a DLL then uploaded to /bin/
  23.     /// </remarks>
  24.     [Serializable]
  25.     [System.Xml.Serialization.XmlInclude(typeof(Searcharoo.Common.Word))]
  26.     [System.Xml.Serialization.XmlInclude(typeof(Searcharoo.Common.Location))]
  27.     [System.Xml.Serialization.XmlInclude(typeof(Searcharoo.Common.CatalogWordFile))]
  28.     public class Catalog
  29.     {
  30.         /// <summary>
  31.         /// Internal datastore of Words referencing Files
  32.         /// </summary>
  33.         /// <remarks>
  34.         /// Hashtable
  35.         /// key    = STRING representation of the word, 
  36.         /// value  = Word OBJECT (with File collection, etc)
  37.         /// </remarks>
  38.         private System.Collections.Hashtable _Index; //TODO: implement collection with faster searching
  39.         private System.Collections.Generic.Dictionary<File,int> _FileIndex; //TODO: implement collection with faster searching
  40.         private Cache _Cache;
  41.         /// <summary>
  42.         /// Words in the Catalog
  43.         /// </summary>
  44.         /// <remarks>
  45.         /// Added property to allow Serialization to disk
  46.         /// NOTE: the XmlInclude attribute on the Catalog class, which is what
  47.         /// enables this array of 'non-standard' objects to be serialized correctly
  48.         /// </remarks>
  49.         [XmlElement("o")]
  50.         [XmlIgnore()]
  51.         [Obsolete("Use WordFiles and Files properties")]
  52.         public Word[] Words
  53.         {
  54.             get
  55.             {
  56.                 Word[] wordArray = new Word[_Index.Count];
  57.                 _Index.Values.CopyTo(wordArray, 0);
  58.                 return wordArray;
  59.             }
  60.             set
  61.             {
  62.                 Word[] wordArray = value;
  63.                 _Index = new Hashtable();   //HACK: index doesn't get populated with wordArray
  64.             }
  65.         }
  66.         [XmlElement("w")]
  67.         public CatalogWordFile[] WordFiles
  68.         {
  69.             get 
  70.             {
  71.                 PrepareForSerialization();
  72.                 return _WordfileArray;
  73.             }
  74.             set 
  75.             {
  76.                 _WordfileArray = value;
  77.                 PostDeserialization();
  78.             }
  79.         }
  80.         [XmlElement("f")]
  81.         public File[] Files
  82.         {
  83.             get
  84.             {
  85.                 PrepareForSerialization();
  86.                 return _FileList.ToArray();
  87.             }
  88.             set
  89.             {
  90.                 File[] fa = value;
  91.                 _FileList = new List<File>(fa);
  92.                 PostDeserialization();
  93.             }
  94.         }
  95.         #region Private fields and methods to manage XmlSerialization
  96.         /// <summary>
  97.         /// List of File objects that were referenced in the Catalog
  98.         /// </summary>
  99.         private System.Collections.Generic.List<File> _FileList;
  100.         /// <summary>
  101.         /// Array of CatalogWordFile objects, with 'ids' for each File 
  102.         /// rather than a reference to a File object
  103.         /// </summary>
  104.         private CatalogWordFile[] _WordfileArray;
  105.         private bool _SerializePreparationDone = false;
  106.         /// <summary>
  107.         /// Property helper for Files &amp; CatalogWordFiles, ensures the data retrieved
  108.         /// from those two properties is 'in sync'
  109.         /// </summary>
  110.         private void PrepareForSerialization()
  111.         {
  112.             if (_SerializePreparationDone) return;
  113.             // Create the list of 'Files' (ie. pages indexed) [v7]
  114.             _FileList = new List<File>();
  115.             foreach (File f in _FileIndex.Keys)
  116.             {
  117.                 _FileList.Add(f);
  118.                 _Cache.SetIndexId(f.Url, f.IndexId);
  119.             }
  120.             _WordfileArray = new CatalogWordFile[_Index.Count];
  121.             Word[] wordArray = new Word[_Index.Count];
  122.             _Index.Values.CopyTo(wordArray, 0);
  123.             
  124.             // go through all the words
  125.             for (int i = 0; i < wordArray.Length; i++)
  126.             {
  127.                 CatalogWordFile wf = new CatalogWordFile();
  128.                 wf.Text = wordArray[i].Text;
  129.                 foreach (File f in wordArray[i].Files)
  130.                 {   // for each File, append the File.IndexId AND the List<int> of positions
  131.                     wf.FileIdsWithPosition.Add(f.IndexId, wordArray[i].InFilesWithPosition()[f]);
  132.                 }
  133.                 _WordfileArray[i] = wf;
  134.             }
  135.             _SerializePreparationDone = true;
  136.         }
  137.         /// <summary>
  138.         /// Property helper for Files &amp; WordFiles, ensures when
  139.         /// they are both 'set', the internal Catalog datastructure is
  140.         /// setup correctly
  141.         /// </summary>
  142.         private void PostDeserialization()
  143.         {
  144.             if ((_WordfileArray != null) && (_FileList != null))
  145.             { 
  146.                 foreach (CatalogWordFile wf in _WordfileArray)
  147.                 {
  148.                     //foreach (int i in wf.FileIds)
  149.                     //{
  150.                     //    this.Add(wf.Text, _FileList[i],-1);
  151.                     //}
  152.                     foreach (int i in wf.FileIdsWithPosition.Keys)
  153.                     { 
  154.                         // get the file object
  155.                         File f = null;
  156.                         foreach (File h in _FileList)
  157.                         {
  158.                             if (h.IndexId == i)
  159.                             { f = h; break; }
  160.                         }
  161.                         if (f == null)
  162.                         {
  163.                          //   throw new NullReferenceException("There should be a file object matching index " + i);
  164.                             System.Diagnostics.Debug.WriteLine("There should be a file object matching index " + i);
  165.                         }
  166.                        else
  167.                         foreach (int j in wf.FileIdsWithPosition[i])
  168.                         {
  169.                             this.Add(wf.Text, f, j);
  170.                         }
  171.                     }
  172.                 }
  173.             }
  174.         }
  175.         #endregion
  176.         
  177.         /// <summary>
  178.         /// String array representing the list of words. 
  179.         /// Used mainly for debugging - ie in the Save() method - so you can 
  180.         /// see what the Spider found.
  181.         /// </summary>
  182.         /// <remarks>
  183.         /// Because there is no 'set' accessor, this property is not XmlSerialized;
  184.         /// XmlIgnore attribute added anyway.
  185.         /// </remarks>
  186.         [XmlIgnore()]
  187.         public string[] Dictionary
  188.         {
  189.             get
  190.             {
  191.                 string[] wordArray = new string[_Index.Count];
  192.                 _Index.Keys.CopyTo(wordArray, 0);
  193.                 return wordArray;
  194.             }
  195.         }
  196.         /// <summary>
  197.         /// Number of Words in the Catalog
  198.         /// </summary>
  199.         /// <remarks>
  200.         /// Because there is no 'set' accessor, this property is not XmlSerialized
  201.         /// </remarks>
  202.         public int Length
  203.         {
  204.             get { return _Index.Count; }
  205.         }
  206.         public Cache FileCache
  207.         {
  208.             get { return _Cache; }
  209.             set { _Cache = value; }
  210.         }
  211.         /// <summary>
  212.         /// Constructor - creates the Hashtable for internal data storage.
  213.         /// </summary>
  214.         public Catalog()
  215.         {
  216.             _Index = new System.Collections.Hashtable();
  217.             _FileIndex = new System.Collections.Generic.Dictionary<File, int>();
  218.             _Cache = new Cache();
  219.         }
  220.         /// <summary>
  221.         /// Add a new Word/File pair to the Catalog
  222.         /// </summary>
  223.         public bool Add(string word, File infile, int position)
  224.         {
  225.             // ### Keep a list of all the files we catalog [v7]
  226.             if (!_FileIndex.ContainsKey(infile))
  227.             {
  228.                 _FileIndex.Add(infile, _FileIndex.Count);
  229.                 if (infile.IndexId < 0)
  230.                     infile.IndexId = _FileIndex.Count - 1;  // zero based
  231.             }
  232.             // ### Make sure the Word object is in the index ONCE only
  233.             if (_Index.ContainsKey(word))
  234.             {
  235.                 Word theword = (Word)_Index[word]; // get Word from Index, then add this file reference to the Word
  236.                 theword.Add(infile, position);
  237.             }
  238.             else
  239.             {
  240.                 Word theword = new Word(word, infile, position); // create a new Word object and add to Index
  241.                 _Index.Add(word, theword);
  242.             }
  243.             _SerializePreparationDone = false;  // adding to the catalog invalidates 'serialization preparation'
  244.             return true;
  245.         }
  246.         /// <summary>
  247.         /// Returns all the Files which contain the searchWord
  248.         /// </summary>
  249.         /// <returns>
  250.         /// Hashtable prior to [v7]
  251.         /// </returns>
  252.         public Dictionary<File, List<int>> Search(string searchWord)
  253.         {
  254.             // apply the same 'trim' as when we're building the catalog
  255.             //searchWord = searchWord.Trim(' ','?','"',',',''',';',':','.','(', ')','[',']','%','*','$','-').ToLower();
  256.             Dictionary<File, List<int>> retval = null;
  257.             if (_Index.ContainsKey(searchWord))
  258.             { // does all the work !!!
  259.                 Word thematch = (Word)_Index[searchWord];
  260.                 //retval = thematch.InFiles(); // return the collection of File objects
  261.                 retval = thematch.InFilesWithPosition();
  262.             }
  263.             return retval;
  264.         }
  265.         /// <summary>
  266.         /// Debug string
  267.         /// </summary>
  268.         public override string ToString()
  269.         {
  270.             string wordlist = "";
  271.             //foreach (object w in index.Keys) temp += ((Word)w).ToString(); // output ALL words, will take a long time
  272.             return "CATALOG :: " + _Index.Values.Count.ToString() + " words.n" + wordlist;
  273.         }
  274.         /// <summary>
  275.         /// Save the catalog to disk by BINARY serializing the object graph as a *.DAT file.
  276.         /// </summary>
  277.         /// <remarks>
  278.         /// For 'reference', the method also saves XmlSerialized copies of the Catalog (which
  279.         /// can get quite large) and just the list of Words that were found. In production, you
  280.         /// would probably comment out/remove the Debugging code.
  281.         /// 
  282.         /// You may also wish to use a difficult-to-guess filename for the serialized data, 
  283.         /// or else change the .DAT file extension to something that will be not be served by
  284.         /// IIS (so that other people can't download your catalog).
  285.         /// </remarks>
  286.         public void Save()
  287.         {
  288.             // XML
  289.             if (Preferences.InMediumTrust)
  290.             {
  291.                 // TODO: Maybe use to save as ZIP - save space on disk? http://www.123aspx.com/redir.aspx?res=31602
  292.                 string xmlFileName = Path.GetDirectoryName(Preferences.CatalogFileName) + Path.DirectorySeparatorChar + Path.GetFileNameWithoutExtension(Preferences.CatalogFileName) + ".xml";
  293.                 Kelvin<Catalog>.ToXmlFile(this, xmlFileName);
  294.                 //XmlSerializer serializerXmlCatalog = new XmlSerializer(typeof(Catalog));
  295.                 //System.IO.TextWriter writer = new System.IO.StreamWriter(xmlFileName);
  296.                 //serializerXmlCatalog.Serialize(writer, this);
  297.                 //writer.Close();
  298.                 
  299.                 _Cache.Save();
  300.                 return;
  301.             }
  302.             // BINARY http://www.dotnetspider.com/technology/kbpages/454.aspx
  303.             System.IO.Stream stream = new System.IO.FileStream(Preferences.CatalogFileName, System.IO.FileMode.Create);
  304.             System.Runtime.Serialization.IFormatter formatter = new System.Runtime.Serialization.Formatters.Binary.BinaryFormatter();
  305.             formatter.Serialize(stream, this);
  306.             stream.Close();
  307.             
  308.             _Cache.Save();
  309.             
  310.             #region Debugging Serialization - these are only really useful for looking at; they're not re-loaded
  311.             if (Preferences.DebugSerializeXml)
  312.             {
  313.                 //Kelvin<Catalog>.ToBinaryFile(this, Path.GetFileNameWithoutExtension(Preferences.CatalogFileName) + "_Kelvin" + Path.GetExtension(Preferences.CatalogFileName));
  314.                 //Kelvin<Catalog>.ToXmlFile(this, Path.GetFileNameWithoutExtension(Preferences.CatalogFileName) + "_Kelvin.xml");
  315.                 Kelvin<string[]>.ToXmlFile(this.Dictionary, Path.GetFileNameWithoutExtension(Preferences.CatalogFileName) + "_debugwords.xml");
  316.                 // XML http://www.devhood.com/Tutorials/tutorial_details.aspx?tutorial_id=236
  317.                 //XmlSerializer serializerXmlWords = new XmlSerializer(typeof(string[]));
  318.                 //System.IO.TextWriter writerW = new System.IO.StreamWriter(Path.GetFileNameWithoutExtension(Preferences.CatalogFileName) + "_words.xml");
  319.                 //serializerXmlWords.Serialize(writerW, this.Dictionary);
  320.                 //writerW.Close();
  321.             }
  322.             #endregion
  323.         }
  324.         /// <summary>
  325.         /// Use Kelvin too
  326.         /// </summary>
  327.         /// <returns>the catalog deserialized from disk, or NULL</returns>
  328.         public static Catalog Load()
  329.         {
  330.             if (Preferences.InMediumTrust)
  331.             {
  332.                 try
  333.                 {
  334.                     throw new Exception();
  335.                     // [v5]
  336.                     string xmlFileName = Path.GetDirectoryName(Preferences.CatalogFileName) + Path.DirectorySeparatorChar + Path.GetFileNameWithoutExtension(Preferences.CatalogFileName) + ".xml";
  337.                     if (System.IO.File.Exists(xmlFileName))
  338.                     {
  339.                         Catalog c1 = Kelvin<Catalog>.FromXmlFile(xmlFileName);
  340.                         return c1;
  341.                     }
  342.                 }
  343.                 catch (Exception)
  344.                 {   // [v6] : if cannot load from .DAT or .XML, try to load from compiled resource
  345.                     try
  346.                     {   // http://www.devhood.com/tutorials/tutorial_details.aspx?tutorial_id=75
  347.                         System.Reflection.Assembly a = System.Reflection.Assembly.Load("WebAppCatalogResource");
  348.                         string[] resNames = a.GetManifestResourceNames();
  349.                         Catalog c2 = Kelvin<Catalog>.FromResource(a, resNames[0]);
  350.                         return c2;
  351.                     }
  352.                     catch (Exception e1)
  353.                     {
  354.                         throw new Exception("Searcharoo Catalog.Load() ", e1);
  355.                     }
  356.                 }
  357.                 return null;
  358.             }
  359.             else
  360.             {   // hopefully in Full trust
  361.                 // using Binary serialization requires the Binder because of the embedded 'full name'
  362.                 // of the serializing assembly - all the above methods using Xml do not have this requirement
  363.                 if (System.IO.File.Exists(Preferences.CatalogFileName))
  364.                 {
  365.                     object deserializedCatalogObject;
  366.                     using (System.IO.Stream stream = new System.IO.FileStream(Preferences.CatalogFileName, System.IO.FileMode.Open))
  367.                     {
  368.                         System.Runtime.Serialization.IFormatter formatter = new System.Runtime.Serialization.Formatters.Binary.BinaryFormatter();
  369.                         //object m = formatter.Deserialize (stream); // This doesn't work, SerializationException "Cannot find the assembly <random name>"
  370.                         formatter.Binder = new CatalogBinder(); // This custom Binder is REQUIRED to find the classes in our current 'Temporary ASP.NET Files' assembly
  371.                         deserializedCatalogObject = formatter.Deserialize(stream);
  372.                     } //stream.Close();
  373.                     Catalog catalog = deserializedCatalogObject as Catalog;
  374.                     return catalog;
  375.                 }
  376.                 else
  377.                 {
  378.                     return null;
  379.                 }
  380.             }
  381.         }
  382.     }
  383. }