DocumentWriter.cs
/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using Document = Lucene.Net.Documents.Document;
using Field = Lucene.Net.Documents.Field;
using Similarity = Lucene.Net.Search.Similarity;
using Directory = Lucene.Net.Store.Directory;
using IndexOutput = Lucene.Net.Store.IndexOutput;
namespace Lucene.Net.Index
{
    public sealed class DocumentWriter
    {
        private void InitBlock()
        {
            termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
        }

        private Analyzer analyzer;
        private Directory directory;
        private Similarity similarity;
        private FieldInfos fieldInfos;
        private int maxFieldLength;
        private int termIndexInterval;
        private System.IO.TextWriter infoStream;

        /// <summary>This constructor is used by test code only.</summary>
        /// <param name="directory">The directory to write the document information to</param>
        /// <param name="analyzer">The analyzer to use for the document</param>
        /// <param name="similarity">The Similarity function</param>
        /// <param name="maxFieldLength">The maximum number of tokens a field may have</param>
        internal DocumentWriter(Directory directory, Analyzer analyzer, Similarity similarity, int maxFieldLength)
        {
            InitBlock();
            this.directory = directory;
            this.analyzer = analyzer;
            this.similarity = similarity;
            this.maxFieldLength = maxFieldLength;
        }

        internal DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer)
        {
            InitBlock();
            this.directory = directory;
            this.analyzer = analyzer;
            this.similarity = writer.GetSimilarity();
            this.maxFieldLength = writer.GetMaxFieldLength();
            this.termIndexInterval = writer.GetTermIndexInterval();
        }

        public /*internal*/ void AddDocument(System.String segment, Document doc)
        {
            // write field names
            fieldInfos = new FieldInfos();
            fieldInfos.Add(doc);
            fieldInfos.Write(directory, segment + ".fnm");

            // write field values
            FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
            try
            {
                fieldsWriter.AddDocument(doc);
            }
            finally
            {
                fieldsWriter.Close();
            }

            // invert doc into postingTable
            postingTable.Clear();                       // clear postingTable
            fieldLengths = new int[fieldInfos.Size()];  // init fieldLengths
            fieldPositions = new int[fieldInfos.Size()]; // init fieldPositions
            fieldOffsets = new int[fieldInfos.Size()];  // init fieldOffsets

            fieldBoosts = new float[fieldInfos.Size()]; // init fieldBoosts
            float boost = doc.GetBoost();
            for (int i = 0; i < fieldBoosts.Length; i++)
            {
                fieldBoosts[i] = boost;
            }
            InvertDocument(doc);

            // sort postingTable into an array
            Posting[] postings = SortPostingTable();

            /*
            for (int i = 0; i < postings.length; i++) {
              Posting posting = postings[i];
              System.out.print(posting.term);
              System.out.print(" freq=" + posting.freq);
              System.out.print(" pos=");
              System.out.print(posting.positions[0]);
              for (int j = 1; j < posting.freq; j++)
                System.out.print("," + posting.positions[j]);
              System.out.println("");
            }
            */

            // write postings
            WritePostings(postings, segment);

            // write norms of indexed fields
            WriteNorms(segment);
        }
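
        // Illustrative usage (a sketch, not part of the original file; the
        // RAMDirectory and StandardAnalyzer choices and the 10000-token limit
        // are assumptions, only DocumentWriter's own API comes from this file):
        //
        //   Directory dir = new Lucene.Net.Store.RAMDirectory();
        //   Analyzer analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer();
        //   DocumentWriter dw = new DocumentWriter(dir, analyzer, Similarity.GetDefault(), 10000);
        //   Document doc = new Document();
        //   doc.Add(new Field("body", "hello index", Field.Store.YES, Field.Index.TOKENIZED));
        //   dw.AddDocument("_0", doc);
        //
        // For segment "_0" this writes _0.fnm (field names), the stored fields
        // (via FieldsWriter), the term dictionary (via TermInfosWriter),
        // _0.frq/_0.prx (postings), and one _0.fN norm file per indexed field.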

        // Keys are Terms, values are Postings.
        // Used to buffer a document before it is written to the index.
        private System.Collections.Hashtable postingTable = System.Collections.Hashtable.Synchronized(new System.Collections.Hashtable());
        private int[] fieldLengths;
        private int[] fieldPositions;
        private int[] fieldOffsets;
        private float[] fieldBoosts;

        // Tokenizes the fields of a document into Postings.
        private void InvertDocument(Document doc)
        {
            System.Collections.IEnumerator fields = doc.Fields();
            while (fields.MoveNext())
            {
                Field field = (Field) fields.Current;
                System.String fieldName = field.Name();
                int fieldNumber = fieldInfos.FieldNumber(fieldName);

                int length = fieldLengths[fieldNumber];     // length of field
                int position = fieldPositions[fieldNumber]; // position in field
                if (length > 0)
                    position += analyzer.GetPositionIncrementGap(fieldName);
                int offset = fieldOffsets[fieldNumber];     // offset in field

                if (field.IsIndexed())
                {
                    if (!field.IsTokenized())
                    {
                        // un-tokenized field
                        System.String stringValue = field.StringValue();
                        if (field.IsStoreOffsetWithTermVector())
                            AddPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.Length));
                        else
                            AddPosition(fieldName, stringValue, position++, null);
                        offset += stringValue.Length;
                        length++;
                    }
                    else
                    {
                        System.IO.TextReader reader; // find or make Reader
                        if (field.ReaderValue() != null)
                            reader = field.ReaderValue();
                        else if (field.StringValue() != null)
                            reader = new System.IO.StringReader(field.StringValue());
                        else
                            throw new System.ArgumentException("field must have either String or Reader value");

                        // Tokenize field and add to postingTable
                        TokenStream stream = analyzer.TokenStream(fieldName, reader);
                        try
                        {
                            Token lastToken = null;
                            for (Token t = stream.Next(); t != null; t = stream.Next())
                            {
                                position += (t.GetPositionIncrement() - 1);

                                if (field.IsStoreOffsetWithTermVector())
                                    AddPosition(fieldName, t.TermText(), position++, new TermVectorOffsetInfo(offset + t.StartOffset(), offset + t.EndOffset()));
                                else
                                    AddPosition(fieldName, t.TermText(), position++, null);

                                lastToken = t;
                                if (++length > maxFieldLength)
                                {
                                    if (infoStream != null)
                                        infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
                                    break;
                                }
                            }

                            if (lastToken != null)
                                offset += lastToken.EndOffset() + 1;
                        }
                        finally
                        {
                            stream.Close();
                        }
                    }

                    fieldLengths[fieldNumber] = length;     // save field length
                    fieldPositions[fieldNumber] = position; // save field position
                    fieldBoosts[fieldNumber] *= field.GetBoost();
                    fieldOffsets[fieldNumber] = offset;
                }
            }
        }
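
        // Worked example (illustrative, not in the original file; assumes an
        // analyzer that keeps every token): inverting a tokenized field "body"
        // with the value "a rose is a rose" accumulates one Posting per
        // distinct term:
        //
        //   ("body","a")    freq=2  positions=[0,3]
        //   ("body","rose") freq=2  positions=[1,4]
        //   ("body","is")   freq=1  positions=[2]
        //
        // length ends at 5, position at 5, and offset just past the last
        // token's end offset, so a second value of the same field continues
        // where this one left off (plus the analyzer's position increment gap).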

        private Term termBuffer = new Term("", ""); // avoid consing

        private void AddPosition(System.String field, System.String text, int position, TermVectorOffsetInfo offset)
        {
            termBuffer.Set(field, text);
            //System.out.println("Offset: " + offset);
            Posting ti = (Posting) postingTable[termBuffer];
            if (ti != null)
            {
                // word seen before
                int freq = ti.freq;
                if (ti.positions.Length == freq)
                {
                    // positions array is full; double its size
                    int[] newPositions = new int[freq * 2];
                    int[] positions = ti.positions;
                    for (int i = 0; i < freq; i++) // copy old positions to new
                        newPositions[i] = positions[i];
                    ti.positions = newPositions;
                }
                ti.positions[freq] = position; // add new position

                if (offset != null)
                {
                    if (ti.offsets.Length == freq)
                    {
                        TermVectorOffsetInfo[] newOffsets = new TermVectorOffsetInfo[freq * 2];
                        TermVectorOffsetInfo[] offsets = ti.offsets;
                        for (int i = 0; i < freq; i++)
                        {
                            newOffsets[i] = offsets[i];
                        }
                        ti.offsets = newOffsets;
                    }
                    ti.offsets[freq] = offset;
                }
                ti.freq = freq + 1; // update frequency
            }
            else
            {
                // word not seen before
                Term term = new Term(field, text, false);
                postingTable[term] = new Posting(term, position, offset);
            }
        }
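
        // Growth sketch (illustrative, not in the original file): positions
        // starts at length 1 (see the Posting constructor below) and doubles
        // whenever it fills, so a term occurring f times in one document costs
        // at most ~log2(f) reallocations. For f = 5 the array grows
        // 1 -> 2 -> 4 -> 8, leaving at most half the final capacity unused.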

        private Posting[] SortPostingTable()
        {
            // copy postingTable into an array
            Posting[] array = new Posting[postingTable.Count];
            System.Collections.IEnumerator postings = postingTable.Values.GetEnumerator();
            for (int i = 0; postings.MoveNext(); i++)
            {
                array[i] = (Posting) postings.Current;
            }

            // sort the array
            QuickSort(array, 0, array.Length - 1);

            return array;
        }

        private static void QuickSort(Posting[] postings, int lo, int hi)
        {
            if (lo >= hi)
                return;

            int mid = (lo + hi) / 2;

            if (postings[lo].term.CompareTo(postings[mid].term) > 0)
            {
                Posting tmp = postings[lo];
                postings[lo] = postings[mid];
                postings[mid] = tmp;
            }

            if (postings[mid].term.CompareTo(postings[hi].term) > 0)
            {
                Posting tmp = postings[mid];
                postings[mid] = postings[hi];
                postings[hi] = tmp;

                if (postings[lo].term.CompareTo(postings[mid].term) > 0)
                {
                    Posting tmp2 = postings[lo];
                    postings[lo] = postings[mid];
                    postings[mid] = tmp2;
                }
            }

            int left = lo + 1;
            int right = hi - 1;

            if (left >= right)
                return;

            Term partition = postings[mid].term;

            for (; ; )
            {
                while (postings[right].term.CompareTo(partition) > 0)
                    --right;

                while (left < right && postings[left].term.CompareTo(partition) <= 0)
                    ++left;

                if (left < right)
                {
                    Posting tmp = postings[left];
                    postings[left] = postings[right];
                    postings[right] = tmp;
                    --right;
                }
                else
                {
                    break;
                }
            }

            QuickSort(postings, lo, left);
            QuickSort(postings, left + 1, hi);
        }
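
        // Pivot example (illustrative, not in the original file): QuickSort
        // samples a median-of-three pivot. For terms [d, a, c, e, b] the three
        // pre-swaps leave postings[lo]=b, postings[mid]=c, postings[hi]=d, so
        // the pivot c is the median of the sampled terms, which guards against
        // the quadratic worst case on already-sorted input.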

        private void WritePostings(Posting[] postings, System.String segment)
        {
            IndexOutput freq = null, prox = null;
            TermInfosWriter tis = null;
            TermVectorsWriter termVectorWriter = null;
            try
            {
                // open files for inverse index storage
                freq = directory.CreateOutput(segment + ".frq");
                prox = directory.CreateOutput(segment + ".prx");
                tis = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
                TermInfo ti = new TermInfo();
                System.String currentField = null;

                for (int i = 0; i < postings.Length; i++)
                {
                    Posting posting = postings[i];

                    // add an entry to the dictionary with pointers to prox and freq files
                    ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), -1);
                    tis.Add(posting.term, ti);

                    // add an entry to the freq file
                    int postingFreq = posting.freq;
                    if (postingFreq == 1)            // optimize freq=1
                        freq.WriteVInt(1);           // set low bit of doc num
                    else
                    {
                        freq.WriteVInt(0);           // the document number
                        freq.WriteVInt(postingFreq); // frequency in doc
                    }

                    int lastPosition = 0;            // write positions
                    int[] positions = posting.positions;
                    for (int j = 0; j < postingFreq; j++)
                    {
                        // use delta-encoding
                        int position = positions[j];
                        prox.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }

                    // check to see if we switched to a new field
                    System.String termField = posting.term.Field();
                    if (currentField != termField)
                    {
                        // changing field - see if there is something to save
                        currentField = termField;
                        FieldInfo fi = fieldInfos.FieldInfo(currentField);
                        if (fi.storeTermVector)
                        {
                            if (termVectorWriter == null)
                            {
                                termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
                                termVectorWriter.OpenDocument();
                            }
                            termVectorWriter.OpenField(currentField);
                        }
                        else if (termVectorWriter != null)
                        {
                            termVectorWriter.CloseField();
                        }
                    }
                    if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
                    {
                        termVectorWriter.AddTerm(posting.term.Text(), postingFreq, posting.positions, posting.offsets);
                    }
                }
                if (termVectorWriter != null)
                    termVectorWriter.CloseDocument();
            }
            finally
            {
                // make an effort to close all streams we can but remember and re-throw
                // the first exception encountered in this process
                System.IO.IOException keep = null;
                if (freq != null)
                    try
                    {
                        freq.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                            keep = e;
                    }
                if (prox != null)
                    try
                    {
                        prox.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                            keep = e;
                    }
                if (tis != null)
                    try
                    {
                        tis.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                            keep = e;
                    }
                if (termVectorWriter != null)
                    try
                    {
                        termVectorWriter.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                            keep = e;
                    }
                if (keep != null)
                {
                    // re-throw the first close failure itself rather than
                    // wrapping its stack-trace string in a new exception
                    throw keep;
                }
            }
        }
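
        // Encoding sketch (illustrative, not in the original file): for a term
        // with positions [3, 7, 15], the prox file receives the deltas
        // 3, 4, 8, each written as a VInt (seven payload bits per byte, high
        // bit set while more bytes follow), so small gaps cost one byte. The
        // freq file stores 1 for freq==1 terms, else 0 followed by the
        // frequency; since this segment holds exactly one document, the doc
        // number itself is always 0.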

        private void WriteNorms(System.String segment)
        {
            for (int n = 0; n < fieldInfos.Size(); n++)
            {
                FieldInfo fi = fieldInfos.FieldInfo(n);
                if (fi.isIndexed && !fi.omitNorms)
                {
                    float norm = fieldBoosts[n] * similarity.LengthNorm(fi.name, fieldLengths[n]);
                    IndexOutput norms = directory.CreateOutput(segment + ".f" + n);
                    try
                    {
                        norms.WriteByte(Similarity.EncodeNorm(norm));
                    }
                    finally
                    {
                        norms.Close();
                    }
                }
            }
        }
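
        // Norm sketch (illustrative, not in the original file; assumes the
        // default Similarity): LengthNorm(field, numTokens) is
        // 1 / sqrt(numTokens), so a 4-token field with boost 1.0 gets
        // norm = 1.0 * 0.5 = 0.5, and EncodeNorm packs that float into the
        // single lossy byte written per document to the segment's .fN file.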

        /// <summary>If non-null, a message will be printed to this if maxFieldLength is reached.</summary>
        internal void SetInfoStream(System.IO.TextWriter infoStream)
        {
            this.infoStream = infoStream;
        }
    }

    sealed class Posting // info about a Term in a doc
    {
        internal Term term;                        // the Term
        internal int freq;                         // its frequency in doc
        internal int[] positions;                  // positions it occurs at
        internal TermVectorOffsetInfo[] offsets;

        internal Posting(Term t, int position, TermVectorOffsetInfo offset)
        {
            term = t;
            freq = 1;
            positions = new int[1];
            positions[0] = position;
            if (offset != null)
            {
                offsets = new TermVectorOffsetInfo[1];
                offsets[0] = offset;
            }
            else
                offsets = null;
        }
    }
}