POIOfficeExtractor.java
上传用户:cctqzzy
上传日期:2022-03-14
资源大小:12198k
文件大小:7k
- package chapter9;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.search.Hits;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.TermQuery;
- import org.apache.poi.hssf.usermodel.HSSFWorkbook;
- import org.apache.poi.hssf.usermodel.HSSFSheet;
- import org.apache.poi.hssf.usermodel.HSSFRow;
- import org.apache.poi.hssf.usermodel.HSSFCell;
- import org.apache.poi.hwpf.extractor.*;
- import org.apache.poi.hwpf.HWPFDocument;
- import org.apache.poi.hwpf.model.*;
- import org.apache.poi.hwpf.usermodel.*;
- import org.pdfbox.searchengine.lucene.LucenePDFDocument;
- //import LuceneBook.ChineseAnalyzer;
- import java.io.FileInputStream;
- import java.io.File;
- import java.io.IOException;
- import java.io.InputStream;
- public class POIOfficeExtractor {
- public static String xlsfileToBeRead="D:\workshop\docs\books.xls";
- public static String docfileToBeRead="D:\workshop\docs\softwarerequest.doc";
-
- private static String Dest_Index_Path = "D:\workshop\index";
-
- /*================================================================
- * 名 称:DocQueryIndex
- * 功 能:构造Doc文档检索查询器,对指定的索引进行查询。
- ===============================================================*/
- public static void DocQueryIndex(){
-
- try {
- IndexSearcher searcher = new IndexSearcher(Dest_Index_Path); // 生成检索器对象
-
- Term term = new Term("名称","doc"); // 检索关键字
- Query query = new TermQuery(term); // 生成检索对象
- System.out.println("----------检索内容:"+query.toString()+"----------");
- Hits hits = searcher.search(query); // 提交检索
-
- System.out.println("----------检索结果: 共检索到 "+hits.length()+" 条 ----------");
-
- for(int i=0; i < hits.length(); i++) // 获得结果
- {
- System.out.println(hits.doc(i));
- System.out.println(hits.doc(i).getField("id"));
- }
-
- }catch (IOException e) {
- e.printStackTrace();
- }
- System.out.println("----------索引检索:PDF索引查询成功----------");
-
- }
- /*================================================================
- * 名 称:DocIndexBuilder
- * 功 能:构造Doc磁盘索引,添加内容到指定目录,为后续检索查询做好准备。
- ===============================================================*/
- public static void DocIndexBuilder(){
-
- try {
-
- //Analyzer TextAnalyzer = new StandardAnalyzer(); // 生成分析器
- Analyzer TextAnalyzer = new ChineseAnalyzer();
-
- IndexWriter TextIndex = new IndexWriter(Dest_Index_Path,TextAnalyzer,true); // 生成索引器
- TextIndex.setUseCompoundFile(true);
-
- Document document = new Document() ; // 由Office文件生成文档对象
-
- FileInputStream in = new FileInputStream(new File(docfileToBeRead));
-
- HWPFDocument doc = new HWPFDocument(in);
- Range range = doc.getRange();
- String text = range.text();
-
- Field field_doc = new Field("doc", text,
- Field.Store.YES,Field.Index.TOKENIZED);
- document.add(field_doc);
- System.out.println("----------创建索引:Office 文件内容 ----------");
- //System.out.println(document);
- TextIndex.addDocument(document); // 添加文档到索引
- TextIndex.optimize();
- TextIndex.close(); // 索引完毕
-
- }catch (IOException e) {
- e.printStackTrace();
- }
- System.out.println("----------创建索引:Office 文件成功. ----------");
- }
-
-
- public static void GetWordDetail(String filename) throws Exception
- {
- FileInputStream in = new FileInputStream(new File(filename));
-
- HWPFDocument doc = new HWPFDocument(in);
- Range range = doc.getRange();
- String text = range.text();
- for(int i=0;i < range.numParagraphs();i++ ){
- Paragraph p = range.getParagraph(i); //取得每个段落
- //组合文字并添加换行
- text = p.text();
- text.trim();
- //text = " <br> " ;
- System.out.println( text );
- }
- }
-
- // 获取doc内纯文本信息
- public static void GetWordContent(String filename) throws Exception
- {
- FileInputStream in = new FileInputStream(new File(filename));
-
- WordExtractor extractor = new WordExtractor(in); // 创建WordExtractor
- String text = extractor.getText(); // 对DOC文件进行提取
- System.out.println( text );
- }
- public static void GetWordHWPFDocument(String filename) throws Exception
- {
- InputStream in = new FileInputStream(new File( "c:\test.doc ")); //流入doc文档
- HWPFDocument wordDocument = new HWPFDocument(in); //通过流得到文档类型
- Range range = wordDocument.getRange(); //取得文档篇幅
- int total = range.numParagraphs(); //文档内的总段落数
- String content = " ";//文章内容
- for(int i=0;i <total;i++ ){
- Paragraph p = range.getParagraph(i); //取得每个段落
- //组合文字并添加换行
- content = p.text();
- content = " <br> " ;
- }
- }
- // 获取Excel内纯文本信息
- public static void GetExcelContent(String filename) throws Exception
- {
- // 创建对指定Excel工作文件的引用
- HSSFWorkbook workbook = new HSSFWorkbook(new FileInputStream(filename));
- HSSFSheet sheet = workbook.getSheetAt(0); // 创建对工作表的引用。
- for( int i =0 ; i < workbook.getNumberOfSheets() ; i++ ) // 循环取表单对象
- {
- System.out.print("########## sheet:--" + i + " --########## " );
- sheet = workbook.getSheetAt(i); // 查阅文档的Sheet属性
- if( sheet != null )
- {
- for(int m = 0; m < sheet.getLastRowNum(); m++ ) // 按行循环取行对象
- {
- HSSFRow row = sheet.getRow(m);
- if( row == null){ break;}
- System.out.println("");
- if(row.getLastCellNum() <= 0) break;
- System.out.println( "-----line:--" + m + " ---- ,col num:"
- + row.getLastCellNum());
- for(int n = 0; n < row.getLastCellNum(); n++) // 按列循环取单元格对象
- {
- HSSFCell cell = row.getCell((short)n);
- if( cell == null){ break; }
- int type = cell.getCellType();
- switch(type)
- { case 0:
- System.out.print( cell.getNumericCellValue() + " , ");
- break;
- case 1:
- System.out.print(cell.getStringCellValue() + " , ");
- break;
- case 2:
- break;
- case 3:
- System.out.print( " , ");
- break;
- default:
- System.out.print("未知的单元类型" + type+" , ");
- }
- }
- }
- }
- System.out.println();
- }
- }
-
- public static void main(String argv[]){
- try{
- //GetExcelContent(xlsfileToBeRead);
- //GetWordContent(docfileToBeRead);
- GetWordDetail(docfileToBeRead);
-
- //DocIndexBuilder();
- //DocQueryIndex();
- }catch(Exception e) {
- System.out.println("运行错误 : " + e );
- }
- }
-
-
- }