PDFBoxLuceneIndex.java
上传用户:cctqzzy
上传日期:2022-03-14
资源大小:12198k
文件大小:3k
- package chapter9;
- import java.io.FileInputStream;
- import java.io.IOException;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.SimpleAnalyzer;
- import org.apache.lucene.analysis.standard.*;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.search.Hits;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.TermQuery;
- import org.pdfbox.searchengine.lucene.LucenePDFDocument;
- public class PDFBoxLuceneIndex {
- private static String Dest_Index_Path = "D:\workshop\index";
-
- /*================================================================
- * 名 称:PDFQueryIndex
- * 功 能:构造PDF文档检索查询器,对指定的索引进行查询。
- ===============================================================*/
- public static void PDFQueryIndex(){
-
- try {
- IndexSearcher searcher = new IndexSearcher(Dest_Index_Path); // 生成检索器对象
- Term term = new Term("contents","pdf"); // 检索关键字
- Query query = new TermQuery(term); // 生成检索对象
- System.out.println("----------检索内容:"+query.toString()+"----------");
- Hits hits = searcher.search(query); // 提交检索
-
- System.out.println("----------检索结果: 共检索到 "+hits.length()+" 条 ----------");
-
- for(int i=0; i < hits.length(); i++) // 获得结果
- {
- System.out.println(hits.doc(i));
- System.out.println(hits.doc(i).getField("id"));
- }
-
- }catch (IOException e) {
- e.printStackTrace();
- }
- System.out.println("----------索引检索:PDF索引查询成功----------");
-
- }
-
- /*================================================================
- * 名 称:PDFIndexBuilder
- * 功 能:构造PDF磁盘索引,添加内容到指定目录,为后续检索查询做好准备。
- ===============================================================*/
- public static void PDFIndexBuilder(){
-
- try {
-
- //Analyzer TextAnalyzer = new SimpleAnalyzer(); // 生成分析器
- Analyzer TextAnalyzer = new StandardAnalyzer(); // 生成分析器
- IndexWriter TextIndex = new IndexWriter(Dest_Index_Path,TextAnalyzer,true); // 生成索引器
- TextIndex.setUseCompoundFile(true);
- FileInputStream instream = new FileInputStream("D:\workshop\docs\index.pdf"); // 根据指定文件创建输入流
-
- Document document = LucenePDFDocument.getDocument( instream ) ; // 由PDF文件生成文档对象
- System.out.println("----------创建索引:PDF 文件内容 ----------");
- System.out.println(document);
- TextIndex.addDocument(document); // 添加文档到索引
- TextIndex.optimize();
- TextIndex.close(); // 索引完毕
-
- }catch (IOException e) {
- e.printStackTrace();
- }
- System.out.println("----------创建索引:PDF 文件成功. ----------");
- }
- /*================================================================
- * 名 称:main
- * 功 能:测试Lucene中PDF文件的索引建立和检索查询功能。
- ===============================================================*/
- public static void main(String[] args) {
-
- PDFIndexBuilder(); // 创建索引
- PDFQueryIndex(); // 检索关键字
-
- System.out.println("----------PDF Lucene 检索测试 ----------");
- }
- }