DocxDocument.cs
上传用户:huiyue
上传日期:2022-04-08
资源大小:1429k
文件大小:5k
源码类别:

搜索引擎

开发平台:

ASP/ASPX

  1. using System;
  2. using System.IO;
  3. using System.Xml;
  4. using ionic.utils.zip;
  5. namespace Searcharoo.Common
  6. {
  7.     /// <summary>
  8.     /// Load a Microsoft Word 2007 Xml file format
  9.     /// </summary>
  10.     /// <remarks>
  11.     /// SharpZipLib
  12.     /// http://www.icsharpcode.net/OpenSource/SharpZipLib/
  13.     /// 
  14.     /// unzip -p some.docx word/document.xml | perl -pe 's/<[^>]+>|[^[:print:]]+//g'
  15.     /// 
  16.     /// Building WordProcessingML Document...
  17.     /// http://blogs.msdn.com/dmahugh/archive/2006/06/27/649007.aspx
  18.     /// http://openxmldeveloper.org/articles/DocxClassFormattedText.aspx
  19.     /// 
  20.     /// .NET 2.0
  21.     /// http://blogs.msdn.com/dotnetinterop/archive/2006/04/05/.NET-System.IO.Compression-and-zip-files.aspx
  22.     /// 
  23.     /// .NET 3.0
  24.     /// http://msdn2.microsoft.com/en-us/library/system.io.packaging.zippackage.aspx
  25.     /// OpenXml file formats
  26.     /// http://blogs.msdn.com/erikaehrli/archive/2006/06/23/getstartedwithOpenXMLFileFormats.aspx
  27.     /// 
  28.     /// TODO: Extract Document Properties (Title, Keywords)
  29.     /// http://msdn.microsoft.com/en-us/library/aa338205.aspx [Document Profiling]
  30.     /// http://msdn.microsoft.com/en-us/library/bb243281.aspx
  31.     /// </remarks>
  32.     public class DocxDocument : DownloadDocument
  33.     {
  34.         /*
  35. <?xml version="1.0" encoding="UTF-8" standalone="yes"?> 
  36. <CoreProperties xmlns="http://schemas.microsoft.com/package/2005/06/md/core-properties"> 
  37.    <Title>Word Document Sample</Title> 
  38.    <Subject>Microsoft Office Word 2007</Subject> 
  39.    <Creator>2007 Microsoft Office System User</Creator> 
  40.    <Keywords/> 
  41.    <Description>2007 Microsoft Office system .docx file</Description> 
  42.    <LastModifiedBy>2007 Microsoft Office System User</LastModifiedBy> 
  43.    <Revision>2</Revision> 
  44.    <DateCreated>2005-05-05T20:01:00Z</DateCreated> 
  45.    <DateModified>2005-05-05T20:02:00Z</DateModified> 
  46. </CoreProperties> 
  47.          */
  48.         private string _WordsOnly;
  49.         public DocxDocument(Uri location)
  50.             : base(location)
  51.         {
  52.             Extension = "docx";
  53.         }
  54.         
  55.         public override void Parse()
  56.         {
  57.             // no parsing (for now). perhaps in future we can regex look for urls (www.xxx.com) and try to link to them...
  58.         }
  59.         public override string WordsOnly
  60.         {
  61.             get { return _WordsOnly; }
  62.         }
  63.         /// <remarks>
  64.         /// .NET System.IO.Compression and zip files
  65.         /// http://blogs.msdn.com/dotnetinterop/archive/2006/04/05/.NET-System.IO.Compression-and-zip-files.aspx
  66.         /// </remarks>
  67.         public override bool GetResponse(System.Net.HttpWebResponse webresponse)
  68.         {
  69.             string filename = System.IO.Path.Combine(
  70.                           Preferences.DownloadedTempFilePath
  71.                         , (System.IO.Path.GetFileName(this.Uri.LocalPath)) );
  72.             this.Title = System.IO.Path.GetFileNameWithoutExtension(filename);
  73.             SaveDownloadedFile(webresponse, filename);
  74.             try
  75.             {
  76.                 string entryToExtract = @"word/document.xml";
  77.                 try
  78.                 {
  79.                     using (ZipFile zip = ZipFile.Read(filename))
  80.                     {
  81.                         MemoryStream stream = new MemoryStream();
  82.                         zip.Extract(entryToExtract, stream);
  83.                         stream.Seek(0, SeekOrigin.Begin);
  84.                         XmlDocument xmldoc = new XmlDocument();
  85.                         xmldoc.Load(stream);
  86.                         _WordsOnly = xmldoc.DocumentElement.InnerText; // TODO: may require looping to add spaces between elements
  87.                         this.All = _WordsOnly;
  88.                         #region DEPRECATED: Dodgy byte array solution
  89.                         //byte[] byteArray;
  90.                         //stream.Seek(0, SeekOrigin.Begin);
  91.                         //// Read the first 20 bytes from the stream.
  92.                         //byteArray = new byte[stream.Length];
  93.                         //int count = stream.Read(byteArray, 0, 20);
  94.                         //while (count < stream.Length)
  95.                         //{
  96.                         //    byteArray[count++] = Convert.ToByte(stream.ReadByte());
  97.                         //}
  98.                         //_WordsOnly = System.Text.Encoding.UTF8.GetString(byteArray);
  99.                         //System.Text.RegularExpressions.Regex regex = new System.Text.RegularExpressions.Regex("<(.|n)+?>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
  100.                         //_WordsOnly = regex.Replace(_WordsOnly, " ");
  101.                         //this.All = _WordsOnly;
  102.                         #endregion
  103.                     
  104.                     }
  105.                 }
  106.                 catch (Exception ex)
  107.                 {
  108.                     Console.WriteLine(ex.Message);
  109.                 }
  110.                 System.IO.File.Delete(filename);    // clean up
  111.             }
  112.             catch (Exception)
  113.             {
  114.                 //                ProgressEvent(this, new ProgressEventArgs(2, "IFilter failed on " + this.Uri + " " + e.Message + ""));
  115.             }
  116.             if (this.All != string.Empty)
  117.             {
  118.                 this.Description = base.GetDescriptionFromWordsOnly(WordsOnly);
  119.                 return true;
  120.             }
  121.             else
  122.             {
  123.                 return false;
  124.             }
  125.         }
  126.     }
  127. }