IFilter.cs
上传用户:huiyue
上传日期:2022-04-08
资源大小:1429k
文件大小:16k
- using System;
- using System.Text;
- using System.Runtime.InteropServices;
- //Contains IFilter interface translation
- //Most translations are from PInvoke.net
- //http://pinvoke.net/default.aspx/Interfaces.IFilter
- namespace EPocalipse.IFilter
- {
/// <summary>
/// Managed translation of the native FULLPROPSPEC structure: a property-set
/// GUID paired with the specification of one property inside that set.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct FULLPROPSPEC
{
    /// <summary>
    /// GUID of the property set.
    /// </summary>
    public Guid guidPropSet;

    /// <summary>
    /// Property specification (by id or by name) within the property set.
    /// </summary>
    public PROPSPEC psProperty;
}
/// <summary>
/// Managed translation of the native FILTERREGION structure, used as the
/// position argument of IFilter.BindRegion.
/// </summary>
/// <remarks>
/// NOTE(review): field meanings below are read off the native names
/// ("cwc" = count of wide characters) - confirm against the FILTERREGION
/// documentation.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
internal struct FILTERREGION
{
    /// <summary>
    /// Identifier of the chunk the region lies in.
    /// </summary>
    public int idChunk;

    /// <summary>
    /// Presumably the start of the region, in characters.
    /// </summary>
    public int cwcStart;

    /// <summary>
    /// Presumably the extent of the region, in characters.
    /// </summary>
    public int cwcExtent;
}
/// <summary>
/// Managed translation of the native PROPSPEC structure: a discriminator
/// followed by a union that holds either a numeric property id or a pointer
/// to a wide-character property name.
/// </summary>
/// <remarks>
/// The native union begins at the first pointer-aligned offset after
/// <see cref="ulKind"/>: offset 4 in a 32-bit process, offset 8 in a 64-bit
/// process. A hard-coded FieldOffset(4) therefore reads the wrong bytes on
/// x64. The union is modelled here as one pointer-sized field under
/// sequential layout, so the marshaller picks the correct offset for the
/// running process, and <see cref="propid"/> is a view over the low 32 bits
/// of that storage.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public struct PROPSPEC
{
    /// <summary>
    /// Discriminator: 0 - string used (<see cref="lpwstr"/>);
    /// 1 - PROPID (<see cref="propid"/>).
    /// </summary>
    public int ulKind;

    /// <summary>
    /// Union storage. When <see cref="ulKind"/> is 0 this is the pointer to
    /// the property name; otherwise only its low 32 bits are meaningful.
    /// </summary>
    public IntPtr lpwstr;

    /// <summary>
    /// Numeric property id, valid when <see cref="ulKind"/> is 1.
    /// </summary>
    /// <remarks>
    /// Reads and writes the low 32 bits of the union storage (the first four
    /// bytes on little-endian platforms). Any upper bits a native caller left
    /// in the union are discarded on read.
    /// </remarks>
    public int propid
    {
        get { return unchecked((int)lpwstr.ToInt64()); }
        set { lpwstr = new IntPtr(value); }
    }
}
/// <summary>
/// Flags a filter hands back through the pdwFlags argument of IFilter.Init.
/// </summary>
[Flags]
internal enum IFILTER_FLAGS
{
    /// <summary>
    /// Additional properties should be located by the caller through the
    /// IPropertySetStorage and IPropertyStorage interfaces. While this flag
    /// is set, IFilter should not return properties that are available
    /// through COM enumerators.
    /// </summary>
    IFILTER_FLAGS_OLE_PROPERTIES = 0x1
}
/// <summary>
/// Option flags passed to IFilter.Init that control how the filter
/// instance operates. Members are listed in bit order.
/// </summary>
[Flags]
internal enum IFILTER_INIT
{
    /// <summary>
    /// No options.
    /// </summary>
    NONE = 0,

    /// <summary>
    /// Mark paragraph breaks with the Unicode PARAGRAPH SEPARATOR (0x2029).
    /// </summary>
    CANON_PARAGRAPHS = 1 << 0,

    /// <summary>
    /// Replace soft returns (for example the newline character in Microsoft
    /// Word) with hard returns - LINE SEPARATOR (0x2028). Existing hard
    /// returns can be doubled. A carriage return (0x000D), a line feed
    /// (0x000A), or the two in combination count as a hard return. The
    /// intent is to let pattern-expression matches work against the line
    /// breaks a reader actually sees.
    /// </summary>
    HARD_LINE_BREAKS = 1 << 1,

    /// <summary>
    /// Word processors have hyphen forms that do not exist in the host
    /// character set, such as optional hyphens (shown only at the end of a
    /// line) and nonbreaking hyphens. With this flag, optional hyphens are
    /// converted to nulls and nonbreaking hyphens to normal hyphens (0x2010)
    /// or HYPHEN-MINUSES (0x002D).
    /// </summary>
    CANON_HYPHENS = 1 << 2,

    /// <summary>
    /// The space counterpart of CANON_HYPHENS: every special space
    /// character, such as a nonbreaking space, is converted to the standard
    /// space character (0x0020).
    /// </summary>
    CANON_SPACES = 1 << 3,

    /// <summary>
    /// The client wants text split into chunks representing internal
    /// value-type properties.
    /// </summary>
    APPLY_INDEX_ATTRIBUTES = 1 << 4,

    /// <summary>
    /// Emit any properties that are covered by neither
    /// APPLY_INDEX_ATTRIBUTES nor APPLY_CRAWL_ATTRIBUTES.
    /// </summary>
    APPLY_OTHER_ATTRIBUTES = 1 << 5,

    /// <summary>
    /// Optimizes IFilter for indexing: the client calls IFilter::Init only
    /// once and never calls IFilter::BindRegion, so a chunk is never
    /// accessed both before and after another chunk.
    /// </summary>
    INDEXING_ONLY = 1 << 6,

    /// <summary>
    /// Text extraction must recurse into every linked object in the
    /// document. When a link is unavailable, the IFilter::GetChunk call that
    /// would have produced the first chunk of that link should return
    /// FILTER_E_LINK_UNAVAILABLE.
    /// </summary>
    SEARCH_LINKS = 1 << 7,

    /// <summary>
    /// The client wants text split into chunks representing properties
    /// determined during the indexing process.
    /// </summary>
    APPLY_CRAWL_ATTRIBUTES = 1 << 8,

    /// <summary>
    /// The content indexing process can return property values set by the
    /// filter.
    /// </summary>
    FILTER_OWNED_VALUE_OK = 1 << 9
}
/// <summary>
/// Description of the current chunk, filled in by IFilter.GetChunk.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct STAT_CHUNK
{
    /// <summary>
    /// Chunk identifier. It must be unique within the current IFilter
    /// instance and identifiers must ascend, numbered in the order the
    /// chunks appear in the source document. Some search engines exploit the
    /// proximity of chunks of different properties, in which case the order
    /// in which such chunks are emitted matters to the engine.
    /// </summary>
    public int idChunk;

    /// <summary>
    /// Kind of break separating the previous chunk from this one; a value of
    /// the CHUNK_BREAKTYPE enumeration.
    /// </summary>
    [MarshalAs(UnmanagedType.U4)]
    public CHUNK_BREAKTYPE breakType;

    /// <summary>
    /// CHUNKSTATE flags telling whether the chunk holds a text-type or a
    /// value-type property. With CHUNK_TEXT set, read the chunk as a series
    /// of words through IFilter::GetText. With CHUNK_VALUE set, read it
    /// through IFilter::GetValue and treat it as a single property value.
    /// Content that must be treated as both text and value is emitted twice,
    /// in two different chunks, each with one flag set.
    /// </summary>
    [MarshalAs(UnmanagedType.U4)]
    public CHUNKSTATE flags;

    /// <summary>
    /// Language and sublanguage of the chunk text, used by document indexers
    /// for word breaking. Ignored unless the chunk is text-type or a
    /// value-type with data type VT_LPWSTR, VT_LPSTR or VT_BSTR.
    /// </summary>
    public int locale;

    /// <summary>
    /// Property applied to the chunk. A filter that needs the same text
    /// under several properties emits it once per property, in separate
    /// chunks.
    /// </summary>
    public FULLPROPSPEC attribute;

    /// <summary>
    /// Identifier of the chunk this one was sourced from:
    /// for a text-type property it must equal idChunk;
    /// for an internal value-type property derived from textual content it
    /// is the id of the text-type chunk it was derived from;
    /// when the filter attributes request only internal value-type
    /// properties there is no content chunk to derive from, and the value
    /// must be zero, which is an invalid chunk id.
    /// </summary>
    public int idChunkSource;

    /// <summary>
    /// Offset inside the source chunk at which the source text of a derived
    /// chunk starts.
    /// </summary>
    public int cwcStartSource;

    /// <summary>
    /// Length, in characters, of the source text this chunk was derived
    /// from. Zero signifies character-by-character correspondence between
    /// source and derived text; a nonzero value means no such direct
    /// correspondence exists.
    /// </summary>
    public int cwcLenSource;
}
/// <summary>
/// Kinds of break that can separate one chunk of text produced by the
/// filter from the chunk before it.
/// </summary>
public enum CHUNK_BREAKTYPE : int
{
    /// <summary>
    /// No break: the current chunk is glued directly to the previous chunk.
    /// </summary>
    CHUNK_NO_BREAK = 0,

    /// <summary>
    /// Word break between this chunk and the previous chunk that had the
    /// same attribute. Keep its use to a minimum: word breaking is
    /// language-dependent and is best left to the search engine.
    /// </summary>
    CHUNK_EOW = 1,

    /// <summary>
    /// Sentence break between this chunk and the previous chunk that had the
    /// same attribute.
    /// </summary>
    CHUNK_EOS = 2,

    /// <summary>
    /// Paragraph break between this chunk and the previous chunk that had
    /// the same attribute.
    /// </summary>
    CHUNK_EOP = 3,

    /// <summary>
    /// Chapter break between this chunk and the previous chunk that had the
    /// same attribute.
    /// </summary>
    CHUNK_EOC = 4
}
/// <summary>
/// Flags describing what kind of property a chunk carries. The values are
/// independent bits (see STAT_CHUNK.flags), so the type is marked
/// <see cref="FlagsAttribute"/>; combined values then format as a list of
/// names instead of a bare number.
/// </summary>
[Flags]
public enum CHUNKSTATE
{
    /// <summary>
    /// The current chunk is a text-type property.
    /// </summary>
    CHUNK_TEXT = 0x1,

    /// <summary>
    /// The current chunk is a value-type property.
    /// </summary>
    CHUNK_VALUE = 0x2,

    /// <summary>
    /// Reserved
    /// </summary>
    CHUNK_FILTER_OWNED_VALUE = 0x4
}
/// <summary>
/// HRESULT values returned by the IFilter methods declared with
/// [PreserveSig] in this file.
/// </summary>
internal enum IFilterReturnCode : uint
{
    /// <summary>
    /// Success
    /// </summary>
    S_OK = 0,

    /// <summary>
    /// The function was denied access to the filter file.
    /// </summary>
    E_ACCESSDENIED = 0x80070005,

    /// <summary>
    /// The function encountered an invalid handle,
    /// probably due to a low-memory situation.
    /// </summary>
    E_HANDLE = 0x80070006,

    /// <summary>
    /// The function received an invalid parameter.
    /// </summary>
    E_INVALIDARG = 0x80070057,

    /// <summary>
    /// Out of memory
    /// </summary>
    E_OUTOFMEMORY = 0x8007000E,

    /// <summary>
    /// Not implemented
    /// </summary>
    E_NOTIMPL = 0x80004001,

    /// <summary>
    /// Unspecified failure. The Win32 value is 0x80004005; 0x80000008 is the
    /// 16-bit Windows definition and is never returned by Win32 COM servers.
    /// </summary>
    E_FAIL = 0x80004005,

    /// <summary>
    /// File not filtered due to password protection
    /// </summary>
    FILTER_E_PASSWORD = 0x8004170B,

    /// <summary>
    /// The document format is not recognised by the filter
    /// </summary>
    FILTER_E_UNKNOWNFORMAT = 0x8004170C,

    /// <summary>
    /// No text in current chunk
    /// </summary>
    FILTER_E_NO_TEXT = 0x80041705,

    /// <summary>
    /// No more chunks of text available in object
    /// </summary>
    FILTER_E_END_OF_CHUNKS = 0x80041700,

    /// <summary>
    /// No more text available in chunk
    /// </summary>
    FILTER_E_NO_MORE_TEXT = 0x80041701,

    /// <summary>
    /// No more property values available in chunk
    /// </summary>
    FILTER_E_NO_MORE_VALUES = 0x80041702,

    /// <summary>
    /// Unable to access object
    /// </summary>
    FILTER_E_ACCESS = 0x80041703,

    /// <summary>
    /// Moniker doesn't cover entire region
    /// </summary>
    FILTER_W_MONIKER_CLIPPED = 0x00041704,

    /// <summary>
    /// Unable to bind IFilter for embedded object
    /// </summary>
    FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,

    /// <summary>
    /// Unable to bind IFilter for linked object
    /// </summary>
    FILTER_E_LINK_UNAVAILABLE = 0x80041708,

    /// <summary>
    /// This is the last text in the current chunk
    /// </summary>
    FILTER_S_LAST_TEXT = 0x00041709,

    /// <summary>
    /// This is the last value in the current chunk
    /// </summary>
    FILTER_S_LAST_VALUES = 0x0004170A
}
/// <summary>
/// COM import of the IFilter interface, which exposes a document as a
/// sequence of text and property-value chunks.
/// </summary>
/// <remarks>
/// This is an IUnknown-based interface, so the declaration order of the
/// members defines the vtable slots. Do not reorder, insert or remove
/// members.
/// </remarks>
[ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
internal interface IFilter
{
    /// <summary>
    /// The IFilter::Init method initializes a filtering session.
    /// </summary>
    /// <param name="grfFlags">
    /// [in] Flag settings from the IFILTER_INIT enumeration for controlling
    /// text standardization, property output, embedding scope, and IFilter
    /// access patterns.
    /// </param>
    /// <param name="cAttributes">
    /// [in] The size of the attributes array. When nonzero, cAttributes
    /// takes precedence over attributes specified in grfFlags. If no
    /// attribute flags are specified and cAttributes is zero, the default is
    /// given by the PSGUID_STORAGE storage property set (date and time of
    /// the last write to the file, size, and so on) and by the
    /// PID_STG_CONTENTS 'contents' property, which maps to the main contents
    /// of the file.
    /// </param>
    /// <param name="aAttributes">
    /// [in] Pointer to an array of FULLPROPSPEC structures for the requested
    /// properties. When cAttributes is nonzero, only the properties in
    /// aAttributes are returned.
    /// </param>
    /// <param name="pdwFlags">
    /// [out] Information about additional properties available to the
    /// caller; from the IFILTER_FLAGS enumeration.
    /// </param>
    /// <returns>The HRESULT of the call.</returns>
    [PreserveSig]
    IFilterReturnCode Init(
        IFILTER_INIT grfFlags,
        int cAttributes,
        IntPtr aAttributes,
        out IFILTER_FLAGS pdwFlags);

    /// <summary>
    /// The IFilter::GetChunk method positions the filter at the beginning of
    /// the next chunk, or at the first chunk if this is the first call to
    /// GetChunk, and returns a description of the current chunk.
    /// </summary>
    /// <param name="pStat">[out] Description of the current chunk.</param>
    /// <returns>The HRESULT of the call.</returns>
    [PreserveSig]
    IFilterReturnCode GetChunk(out STAT_CHUNK pStat);

    /// <summary>
    /// The IFilter::GetText method retrieves text (text-type properties)
    /// from the current chunk, which must have a CHUNKSTATE enumeration
    /// value of CHUNK_TEXT.
    /// </summary>
    /// <param name="pcwcBuffer">
    /// [in/out] On entry, the size of the awcBuffer array in wide/Unicode
    /// characters. On exit, the number of Unicode characters written to
    /// awcBuffer. This is a character count, not a byte count.
    /// </param>
    /// <param name="awcBuffer">
    /// [out] Text retrieved from the current chunk. The buffer is not
    /// terminated with a character, so use pcwcBuffer to find the end of the
    /// valid text.
    /// </param>
    /// <returns>The HRESULT of the call.</returns>
    [PreserveSig]
    IFilterReturnCode GetText(
        ref uint pcwcBuffer,
        [Out(), MarshalAs(UnmanagedType.LPArray)]
        char[] awcBuffer);

    /// <summary>
    /// The IFilter::GetValue method retrieves a value (internal value-type
    /// property) from a chunk, which must have a CHUNKSTATE enumeration
    /// value of CHUNK_VALUE.
    /// </summary>
    /// <param name="PropVal">
    /// Receives a pointer to a PROPVARIANT structure allocated with
    /// CoTaskMemAlloc. Some PROPVARIANT structures contain pointers, which
    /// can be freed by calling the PropVariantClear function; calling
    /// PropVariantClear is the responsibility of the caller of GetValue.
    /// </param>
    /// <returns>The raw HRESULT of the call.</returns>
    /// <remarks>
    /// IFilter::GetValue
    /// http://msdn2.microsoft.com/en-us/library/ms690927.aspx
    /// Filtering File Properties
    /// http://msdn2.microsoft.com/en-us/library/ms692552.aspx
    /// </remarks>
    [PreserveSig]
    int GetValue(
        ref IntPtr PropVal);

    /// <summary>
    /// The IFilter::BindRegion method retrieves an interface representing
    /// the specified portion of the object.
    /// Currently reserved for future use.
    /// </summary>
    /// <param name="origPos">
    /// Region of the object to bind to.
    /// NOTE(review): the native declaration appears to take FILTERREGION by
    /// value; it is kept as ref here so existing callers still compile -
    /// confirm the calling convention before relying on this method.
    /// </param>
    /// <param name="riid">Identifier of the requested interface.</param>
    /// <param name="ppunk">
    /// Receives the requested interface. Marshalled as an IUnknown pointer;
    /// without the attribute a ref object would be passed as a VARIANT
    /// pointer, which does not match the native void** parameter.
    /// </param>
    /// <returns>The raw HRESULT of the call.</returns>
    [PreserveSig]
    int BindRegion(ref FILTERREGION origPos,
        ref Guid riid, [MarshalAs(UnmanagedType.IUnknown)] ref object ppunk);
}
- }