PdfDocument.cs
上传用户:huiyue
上传日期:2022-04-08
资源大小:1429k
文件大小:4k
源码类别:

搜索引擎

开发平台:

ASP/ASPX

  1. using System;
  2. using System.IO;
  3. using System.Xml;
  4. using ionic.utils.zip;
  namespace Searcharoo.Common
  {
      /// <summary>
      /// Special handling for PDF IFilter documents.
      /// </summary>
      /// <remarks>
      /// Extends the IFilter handling inherited from <see cref="FilterDocument"/> with iTextSharp:
      /// 1) extract metadata (Title)
      /// 2) fallback indexing if the IFilter produced no text.
      /// </remarks>
      public class PdfDocument : FilterDocument
      {
          /// <summary>
          /// Creates a PDF document for the given location and sets the extension to "pdf".
          /// </summary>
          /// <param name="location">Location of the PDF; passed through to the base constructor.</param>
          public PdfDocument(Uri location) : base(location)
          {
              Extension = "pdf";
          }

          /// <summary>
          /// Uses GetResponseCore and GetResponseCoreFinalize to 'inherit' the IFilter behaviour,
          /// and extends it with iTextSharp to obtain a better title and, when the IFilter
          /// left <c>All</c> empty, a fallback text extraction.
          /// </summary>
          /// <remarks>
          /// [v7] iTextSharp title/fallback fix by brad1213@yahoo.com
          /// </remarks>
          /// <param name="webresponse">Response handed to the base class' GetResponseCore.</param>
          /// <returns>
          /// <c>true</c> when <c>All</c> is non-empty after processing (Description is then
          /// populated from WordsOnly); otherwise <c>false</c>.
          /// </returns>
          public override bool GetResponse(System.Net.HttpWebResponse webresponse)
          {
              // Only the file-name part of the URI is used, placed under the configured temp folder.
              string filename = System.IO.Path.Combine(
                  Preferences.DownloadedTempFilePath,
                  System.IO.Path.GetFileName(this.Uri.LocalPath));
              base.GetResponseCore(webresponse, filename);

              // [v7] fix by brad1213@yahoo.com
              iTextSharp.text.pdf.PdfReader pdfReader = new iTextSharp.text.pdf.PdfReader(filename);
              try
              {
                  // Read the Info entry once; a missing Title yields null here.
                  object embeddedTitle = pdfReader.Info["Title"];
                  if (null != embeddedTitle)
                  {   // overwrite the 'filename' with the embedded title
                      string pdfTitle = Convert.ToString(embeddedTitle).Trim();
                      if (!String.IsNullOrEmpty(pdfTitle))
                      {
                          this.Title = pdfTitle;
                      }
                  }

                  // Now, since we've loaded the iTextSharp library, and the EPocalipse IFilter sometimes
                  // fails (old Acrobat, installation problem, etc); let's try 'indexing' the PDF with iTextSharp
                  // [v7]
                  if (String.IsNullOrEmpty(this.All))
                  {
                      this.All = ExtractPageText(pdfReader);
                  }
              }
              finally
              {
                  // Close the reader even if title lookup or text extraction throws, and always
                  // before GetResponseCoreFinalize is given the same file name.
                  pdfReader.Close();
              }

              base.GetResponseCoreFinalize(filename);

              if (String.IsNullOrEmpty(this.All))
              {
                  return false;
              }

              this.Description = base.GetDescriptionFromWordsOnly(WordsOnly);
              return true;
          }

          /// <summary>
          /// Concatenates the string tokens found in every page's content stream.
          /// </summary>
          /// <remarks>
          /// Tokenising approach taken from http://www.vbforums.com/showthread.php?t=475759
          /// </remarks>
          /// <param name="pdfReader">An open reader; it is not closed by this method.</param>
          /// <returns>The extracted text; empty when no string tokens were found.</returns>
          private static string ExtractPageText(iTextSharp.text.pdf.PdfReader pdfReader)
          {
              System.Text.StringBuilder sb = new System.Text.StringBuilder();

              // PDF page numbers are 1-based.
              for (int p = 1; p <= pdfReader.NumberOfPages; p++)
              {
                  byte[] pageBytes = pdfReader.GetPageContent(p);
                  if (null == pageBytes)
                  {
                      continue;
                  }

                  iTextSharp.text.pdf.PRTokeniser token = new iTextSharp.text.pdf.PRTokeniser(pageBytes);
                  while (token.NextToken())
                  {
                      int tknType = token.TokenType;
                      string tknValue = token.StringValue;

                      if (tknType == iTextSharp.text.pdf.PRTokeniser.TK_STRING)
                      {
                          sb.Append(tknValue);
                      }
                      else if ((tknType == iTextSharp.text.pdf.PRTokeniser.TK_NUMBER && tknValue == "-600")
                            || (tknType == iTextSharp.text.pdf.PRTokeniser.TK_OTHER && tknValue == "TJ"))
                      {
                          // Heuristic from the forum thread above: a "-600" number and the "TJ"
                          // operator are treated as word breaks so adjacent strings do not run together.
                          sb.Append(' ');
                      }
                  }
              }

              // NOTE(review): the published listing had an empty character literal here, which does
              // not compile; the lost character is assumed to be NUL ('\0') - TODO confirm against
              // the original Searcharoo source.
              return sb.ToString().Replace('\0', ' ');
          }
      }
  }