Catalog.cs
上传用户:huiyue
上传日期:2022-04-08
资源大小:1429k
文件大小:17k
- using System;
- using System.IO;
- using System.Xml.Serialization;
- using System.Text;
- using System.Text.RegularExpressions;
- using System.Collections;
- using System.Collections.Generic;
- using System.Collections.Specialized;
- using cd.net;
- namespace Searcharoo.Common
- {
- /// <summary>
- /// Catalog of Words (and Files)
- /// <summary>
- /// <remarks>
- /// XmlInclude
- /// http://pluralsight.com/blogs/craig/archive/2004/07/08/1580.aspx
- ///
- /// added v6 : fallback loading of catalog from the WebAppCatalogResource assembly
- /// to get around issues where the Trust level (eg. in shared hosting) is restricted
- /// to NOT ALLOW File.IO or WebClient requests. This requires the catalog to be built
- /// using the Indexer.EXE then compiled into a DLL then uploaded to /bin/
- /// </remarks>
- [Serializable]
- [System.Xml.Serialization.XmlInclude(typeof(Searcharoo.Common.Word))]
- [System.Xml.Serialization.XmlInclude(typeof(Searcharoo.Common.Location))]
- [System.Xml.Serialization.XmlInclude(typeof(Searcharoo.Common.CatalogWordFile))]
- public class Catalog
- {
- /// <summary>
- /// Internal datastore of Words referencing Files
- /// </summary>
- /// <remarks>
- /// Hashtable
- /// key = STRING representation of the word,
- /// value = Word OBJECT (with File collection, etc)
- /// </remarks>
- private System.Collections.Hashtable _Index; //TODO: implement collection with faster searching
- private System.Collections.Generic.Dictionary<File,int> _FileIndex; //TODO: implement collection with faster searching
- private Cache _Cache;
- /// <summary>
- /// Words in the Catalog
- /// </summary>
- /// <remarks>
- /// Added property to allow Serialization to disk
- /// NOTE: the XmlInclude attribute on the Catalog class, which is what
- /// enables this array of 'non-standard' objects to be serialized correctly
- /// </remarks>
- [XmlElement("o")]
- [XmlIgnore()]
- [Obsolete("Use WordFiles and Files properties")]
- public Word[] Words
- {
- get
- {
- Word[] wordArray = new Word[_Index.Count];
- _Index.Values.CopyTo(wordArray, 0);
- return wordArray;
- }
- set
- {
- Word[] wordArray = value;
- _Index = new Hashtable(); //HACK: index doesn't get populated with wordArray
- }
- }
- [XmlElement("w")]
- public CatalogWordFile[] WordFiles
- {
- get
- {
- PrepareForSerialization();
- return _WordfileArray;
- }
- set
- {
- _WordfileArray = value;
- PostDeserialization();
- }
- }
- [XmlElement("f")]
- public File[] Files
- {
- get
- {
- PrepareForSerialization();
- return _FileList.ToArray();
- }
- set
- {
- File[] fa = value;
- _FileList = new List<File>(fa);
- PostDeserialization();
- }
- }
- #region Private fields and methods to manage XmlSerialization
- /// <summary>
- /// List of File objects that were referenced in the Catalog
- /// </summary>
- private System.Collections.Generic.List<File> _FileList;
- /// <summary>
- /// Array of CatalogWordFile objects, with 'ids' for each File
- /// rather than a reference to a File object
- /// </summary>
- private CatalogWordFile[] _WordfileArray;
- private bool _SerializePreparationDone = false;
- /// <summary>
- /// Property helper for Files & CatalogWordFiles, ensures the data retrieved
- /// from those two properties is 'in sync'
- /// </summary>
- private void PrepareForSerialization()
- {
- if (_SerializePreparationDone) return;
- // Create the list of 'Files' (ie. pages indexed) [v7]
- _FileList = new List<File>();
- foreach (File f in _FileIndex.Keys)
- {
- _FileList.Add(f);
- _Cache.SetIndexId(f.Url, f.IndexId);
- }
- _WordfileArray = new CatalogWordFile[_Index.Count];
- Word[] wordArray = new Word[_Index.Count];
- _Index.Values.CopyTo(wordArray, 0);
-
- // go through all the words
- for (int i = 0; i < wordArray.Length; i++)
- {
- CatalogWordFile wf = new CatalogWordFile();
- wf.Text = wordArray[i].Text;
- foreach (File f in wordArray[i].Files)
- { // for each File, append the File.IndexId AND the List<int> of positions
- wf.FileIdsWithPosition.Add(f.IndexId, wordArray[i].InFilesWithPosition()[f]);
- }
- _WordfileArray[i] = wf;
- }
- _SerializePreparationDone = true;
- }
- /// <summary>
- /// Property helper for Files & WordFiles, ensures when
- /// they are both 'set', the internal Catalog datastructure is
- /// setup correctly
- /// </summary>
- private void PostDeserialization()
- {
- if ((_WordfileArray != null) && (_FileList != null))
- {
- foreach (CatalogWordFile wf in _WordfileArray)
- {
- //foreach (int i in wf.FileIds)
- //{
- // this.Add(wf.Text, _FileList[i],-1);
- //}
- foreach (int i in wf.FileIdsWithPosition.Keys)
- {
- // get the file object
- File f = null;
- foreach (File h in _FileList)
- {
- if (h.IndexId == i)
- { f = h; break; }
- }
- if (f == null)
- {
- // throw new NullReferenceException("There should be a file object matching index " + i);
- System.Diagnostics.Debug.WriteLine("There should be a file object matching index " + i);
- }
- else
- foreach (int j in wf.FileIdsWithPosition[i])
- {
- this.Add(wf.Text, f, j);
- }
- }
- }
- }
- }
- #endregion
-
- /// <summary>
- /// String array representing the list of words.
- /// Used mainly for debugging - ie in the Save() method - so you can
- /// see what the Spider found.
- /// </summary>
- /// <remarks>
- /// Because there is no 'set' accessor, this property is not XmlSerialized;
- /// XmlIgnore attribute added anyway.
- /// </remarks>
- [XmlIgnore()]
- public string[] Dictionary
- {
- get
- {
- string[] wordArray = new string[_Index.Count];
- _Index.Keys.CopyTo(wordArray, 0);
- return wordArray;
- }
- }
- /// <summary>
- /// Number of Words in the Catalog
- /// </summary>
- /// <remarks>
- /// Because there is no 'set' accessor, this property is not XmlSerialized
- /// </remarks>
- public int Length
- {
- get { return _Index.Count; }
- }
- public Cache FileCache
- {
- get { return _Cache; }
- set { _Cache = value; }
- }
- /// <summary>
- /// Constructor - creates the Hashtable for internal data storage.
- /// </summary>
- public Catalog()
- {
- _Index = new System.Collections.Hashtable();
- _FileIndex = new System.Collections.Generic.Dictionary<File, int>();
- _Cache = new Cache();
- }
- /// <summary>
- /// Add a new Word/File pair to the Catalog
- /// </summary>
- public bool Add(string word, File infile, int position)
- {
- // ### Keep a list of all the files we catalog [v7]
- if (!_FileIndex.ContainsKey(infile))
- {
- _FileIndex.Add(infile, _FileIndex.Count);
- if (infile.IndexId < 0)
- infile.IndexId = _FileIndex.Count - 1; // zero based
- }
- // ### Make sure the Word object is in the index ONCE only
- if (_Index.ContainsKey(word))
- {
- Word theword = (Word)_Index[word]; // get Word from Index, then add this file reference to the Word
- theword.Add(infile, position);
- }
- else
- {
- Word theword = new Word(word, infile, position); // create a new Word object and add to Index
- _Index.Add(word, theword);
- }
- _SerializePreparationDone = false; // adding to the catalog invalidates 'serialization preparation'
- return true;
- }
- /// <summary>
- /// Returns all the Files which contain the searchWord
- /// </summary>
- /// <returns>
- /// Hashtable prior to [v7]
- /// </returns>
- public Dictionary<File, List<int>> Search(string searchWord)
- {
- // apply the same 'trim' as when we're building the catalog
- //searchWord = searchWord.Trim(' ','?','"',',',''',';',':','.','(', ')','[',']','%','*','$','-').ToLower();
- Dictionary<File, List<int>> retval = null;
- if (_Index.ContainsKey(searchWord))
- { // does all the work !!!
- Word thematch = (Word)_Index[searchWord];
- //retval = thematch.InFiles(); // return the collection of File objects
- retval = thematch.InFilesWithPosition();
- }
- return retval;
- }
- /// <summary>
- /// Debug string
- /// </summary>
- public override string ToString()
- {
- string wordlist = "";
- //foreach (object w in index.Keys) temp += ((Word)w).ToString(); // output ALL words, will take a long time
- return "CATALOG :: " + _Index.Values.Count.ToString() + " words.n" + wordlist;
- }
- /// <summary>
- /// Save the catalog to disk by BINARY serializing the object graph as a *.DAT file.
- /// </summary>
- /// <remarks>
- /// For 'reference', the method also saves XmlSerialized copies of the Catalog (which
- /// can get quite large) and just the list of Words that were found. In production, you
- /// would probably comment out/remove the Debugging code.
- ///
- /// You may also wish to use a difficult-to-guess filename for the serialized data,
- /// or else change the .DAT file extension to something that will be not be served by
- /// IIS (so that other people can't download your catalog).
- /// </remarks>
- public void Save()
- {
- // XML
- if (Preferences.InMediumTrust)
- {
- // TODO: Maybe use to save as ZIP - save space on disk? http://www.123aspx.com/redir.aspx?res=31602
- string xmlFileName = Path.GetDirectoryName(Preferences.CatalogFileName) + Path.DirectorySeparatorChar + Path.GetFileNameWithoutExtension(Preferences.CatalogFileName) + ".xml";
- Kelvin<Catalog>.ToXmlFile(this, xmlFileName);
- //XmlSerializer serializerXmlCatalog = new XmlSerializer(typeof(Catalog));
- //System.IO.TextWriter writer = new System.IO.StreamWriter(xmlFileName);
- //serializerXmlCatalog.Serialize(writer, this);
- //writer.Close();
-
- _Cache.Save();
- return;
- }
- // BINARY http://www.dotnetspider.com/technology/kbpages/454.aspx
- System.IO.Stream stream = new System.IO.FileStream(Preferences.CatalogFileName, System.IO.FileMode.Create);
- System.Runtime.Serialization.IFormatter formatter = new System.Runtime.Serialization.Formatters.Binary.BinaryFormatter();
- formatter.Serialize(stream, this);
- stream.Close();
-
- _Cache.Save();
-
- #region Debugging Serialization - these are only really useful for looking at; they're not re-loaded
- if (Preferences.DebugSerializeXml)
- {
- //Kelvin<Catalog>.ToBinaryFile(this, Path.GetFileNameWithoutExtension(Preferences.CatalogFileName) + "_Kelvin" + Path.GetExtension(Preferences.CatalogFileName));
- //Kelvin<Catalog>.ToXmlFile(this, Path.GetFileNameWithoutExtension(Preferences.CatalogFileName) + "_Kelvin.xml");
- Kelvin<string[]>.ToXmlFile(this.Dictionary, Path.GetFileNameWithoutExtension(Preferences.CatalogFileName) + "_debugwords.xml");
- // XML http://www.devhood.com/Tutorials/tutorial_details.aspx?tutorial_id=236
- //XmlSerializer serializerXmlWords = new XmlSerializer(typeof(string[]));
- //System.IO.TextWriter writerW = new System.IO.StreamWriter(Path.GetFileNameWithoutExtension(Preferences.CatalogFileName) + "_words.xml");
- //serializerXmlWords.Serialize(writerW, this.Dictionary);
- //writerW.Close();
- }
- #endregion
- }
- /// <summary>
- /// Use Kelvin too
- /// </summary>
- /// <returns>the catalog deserialized from disk, or NULL</returns>
- public static Catalog Load()
- {
- if (Preferences.InMediumTrust)
- {
- try
- {
- throw new Exception();
- // [v5]
- string xmlFileName = Path.GetDirectoryName(Preferences.CatalogFileName) + Path.DirectorySeparatorChar + Path.GetFileNameWithoutExtension(Preferences.CatalogFileName) + ".xml";
- if (System.IO.File.Exists(xmlFileName))
- {
- Catalog c1 = Kelvin<Catalog>.FromXmlFile(xmlFileName);
- return c1;
- }
- }
- catch (Exception)
- { // [v6] : if cannot load from .DAT or .XML, try to load from compiled resource
- try
- { // http://www.devhood.com/tutorials/tutorial_details.aspx?tutorial_id=75
- System.Reflection.Assembly a = System.Reflection.Assembly.Load("WebAppCatalogResource");
- string[] resNames = a.GetManifestResourceNames();
- Catalog c2 = Kelvin<Catalog>.FromResource(a, resNames[0]);
- return c2;
- }
- catch (Exception e1)
- {
- throw new Exception("Searcharoo Catalog.Load() ", e1);
- }
- }
- return null;
- }
- else
- { // hopefully in Full trust
- // using Binary serialization requires the Binder because of the embedded 'full name'
- // of the serializing assembly - all the above methods using Xml do not have this requirement
- if (System.IO.File.Exists(Preferences.CatalogFileName))
- {
- object deserializedCatalogObject;
- using (System.IO.Stream stream = new System.IO.FileStream(Preferences.CatalogFileName, System.IO.FileMode.Open))
- {
- System.Runtime.Serialization.IFormatter formatter = new System.Runtime.Serialization.Formatters.Binary.BinaryFormatter();
- //object m = formatter.Deserialize (stream); // This doesn't work, SerializationException "Cannot find the assembly <random name>"
- formatter.Binder = new CatalogBinder(); // This custom Binder is REQUIRED to find the classes in our current 'Temporary ASP.NET Files' assembly
- deserializedCatalogObject = formatter.Deserialize(stream);
- } //stream.Close();
- Catalog catalog = deserializedCatalogObject as Catalog;
- return catalog;
- }
- else
- {
- return null;
- }
- }
- }
- }
- }