TestIndexUpdater.java
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.contrib.index.mapred;

import java.io.File;
import java.io.IOException;
import java.text.NumberFormat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.contrib.index.lucene.FileSystemDirectory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;

import junit.framework.TestCase;
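/**
 * End-to-end test for the contrib index updater: brings up a mini DFS
 * cluster and a mini MapReduce cluster, runs the index update job several
 * times over the same input with a growing number of shards, and verifies
 * the resulting Lucene indexes.
 */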
public class TestIndexUpdater extends TestCase {

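  // Formats shard indices as zero-padded five-digit directory names,
  // e.g. "00000", "00001".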
  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }
  // A MiniMRCluster is used rather than the local job runner because the
  // update job needs one reducer per shard, whereas "we only allow 0 or 1
  // reducer in local mode" (from LocalJobRunner).
  private Configuration conf;
  private Path localInputPath =
      new Path(System.getProperty("build.test") + "/sample/data.txt");
  private Path inputPath = new Path("/myexample/data.txt");
  private Path outputPath = new Path("/myoutput");
  private Path indexPath = new Path("/myindex");

  private int initNumShards = 3;
  private int numMapTasks = 5;

  private int numDataNodes = 3;
  private int numTaskTrackers = 3;

  private int numRuns = 3;
  private int numDocsPerRun = 10; // num of docs in local input path

  private FileSystem fs;
  private MiniDFSCluster dfsCluster;
  private MiniMRCluster mrCluster;
  public TestIndexUpdater() throws IOException {
    super();
    if (System.getProperty("hadoop.log.dir") == null) {
      String base = new File(".").getPath(); // getAbsolutePath();
      System.setProperty("hadoop.log.dir", new Path(base).toString() + "/logs");
    }
    conf = new Configuration();
  }
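  /**
   * Starts a mini DFS cluster and a mini MapReduce cluster, and copies the
   * sample input file into HDFS. If startup fails, everything already
   * started is shut down before the exception is rethrown.
   */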
  protected void setUp() throws Exception {
    super.setUp();
    try {
      dfsCluster =
          new MiniDFSCluster(conf, numDataNodes, true, (String[]) null);

      fs = dfsCluster.getFileSystem();
      if (fs.exists(inputPath)) {
        fs.delete(inputPath);
      }
      fs.copyFromLocalFile(localInputPath, inputPath);

      if (fs.exists(outputPath)) {
        // do not create, mapred will create
        fs.delete(outputPath);
      }
      if (fs.exists(indexPath)) {
        fs.delete(indexPath);
      }

      mrCluster =
          new MiniMRCluster(numTaskTrackers, fs.getUri().toString(), 1);
    } catch (IOException e) {
      if (dfsCluster != null) {
        dfsCluster.shutdown();
        dfsCluster = null;
      }
      if (fs != null) {
        fs.close();
        fs = null;
      }
      if (mrCluster != null) {
        mrCluster.shutdown();
        mrCluster = null;
      }
      throw e;
    }
  }
  protected void tearDown() throws Exception {
    if (dfsCluster != null) {
      dfsCluster.shutdown();
      dfsCluster = null;
    }
    if (fs != null) {
      fs.close();
      fs = null;
    }
    if (mrCluster != null) {
      mrCluster.shutdown();
      mrCluster = null;
    }
    super.tearDown();
  }
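  /**
   * Runs the index updater numRuns times over the same input. Each run adds
   * one more shard than the previous one, so both creating new shards and
   * updating existing ones are exercised.
   */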
  public void testIndexUpdater() throws IOException {
    IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
    // max field length, compound file and number of segments will be checked
    // later
    iconf.setIndexMaxFieldLength(2);
    iconf.setIndexUseCompoundFile(true);
    iconf.setIndexMaxNumSegments(1);
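    // A version number and generation of -1 mark shards that have no
    // committed index yet.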
    long versionNumber = -1;
    long generation = -1;

    for (int i = 0; i < numRuns; i++) {
      if (fs.exists(outputPath)) {
        fs.delete(outputPath);
      }

      Shard[] shards = new Shard[initNumShards + i];
      for (int j = 0; j < shards.length; j++) {
        shards[j] = new Shard(versionNumber,
            new Path(indexPath, NUMBER_FORMAT.format(j)).toString(),
            generation);
      }
      run(i + 1, shards);
    }
  }
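  /**
   * Runs one index update job against the given shards, then verifies that
   * each shard wrote a "done" file, that the combined index contains the
   * expected documents, and that each shard collapses to a single compound
   * segment file.
   */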
  private void run(int numRuns, Shard[] shards) throws IOException {
    IIndexUpdater updater = new IndexUpdater();
    updater.run(conf, new Path[] { inputPath }, outputPath, numMapTasks,
        shards);

    // verify the done files
    Path[] doneFileNames = new Path[shards.length];
    int count = 0;
    FileStatus[] fileStatus = fs.listStatus(outputPath);
    for (int i = 0; i < fileStatus.length; i++) {
      FileStatus[] doneFiles = fs.listStatus(fileStatus[i].getPath());
      for (int j = 0; j < doneFiles.length; j++) {
        doneFileNames[count++] = doneFiles[j].getPath();
      }
    }
    assertEquals(shards.length, count);
    for (int i = 0; i < count; i++) {
      assertTrue(doneFileNames[i].getName().startsWith(
          IndexUpdateReducer.DONE.toString()));
    }

    // verify the index
    IndexReader[] readers = new IndexReader[shards.length];
    for (int i = 0; i < shards.length; i++) {
      Directory dir =
          new FileSystemDirectory(fs, new Path(shards[i].getDirectory()),
              false, conf);
      readers[i] = IndexReader.open(dir);
    }

    IndexReader reader = new MultiReader(readers);
    IndexSearcher searcher = new IndexSearcher(reader);
    Hits hits = searcher.search(new TermQuery(new Term("content", "apache")));
    assertEquals(numRuns * numDocsPerRun, hits.length());
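    // Every run indexes the same documents with ids 0..numDocsPerRun-1,
    // so each id should appear exactly once per run.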
    int[] counts = new int[numDocsPerRun];
    for (int i = 0; i < hits.length(); i++) {
      Document doc = hits.doc(i);
      counts[Integer.parseInt(doc.get("id"))]++;
    }
    for (int i = 0; i < numDocsPerRun; i++) {
      assertEquals(numRuns, counts[i]);
    }

    // max field length is 2, so "dot" is also indexed but not "org"
    hits = searcher.search(new TermQuery(new Term("content", "dot")));
    assertEquals(numRuns, hits.length());
    hits = searcher.search(new TermQuery(new Term("content", "org")));
    assertEquals(0, hits.length());

    searcher.close();
    reader.close();

    // open and close an index writer with KeepOnlyLastCommitDeletionPolicy
    // to remove earlier checkpoints
    for (int i = 0; i < shards.length; i++) {
      Directory dir =
          new FileSystemDirectory(fs, new Path(shards[i].getDirectory()),
              false, conf);
      IndexWriter writer =
          new IndexWriter(dir, false, null,
              new KeepOnlyLastCommitDeletionPolicy());
      writer.close();
    }
    // verify the number of segments; this must be done after a writer with
    // KeepOnlyLastCommitDeletionPolicy has removed the earlier checkpoints
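    // With compound files enabled and maxNumSegments set to 1, each shard
    // directory should contain exactly one .cfs file.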
    for (int i = 0; i < shards.length; i++) {
      PathFilter cfsFilter = new PathFilter() {
        public boolean accept(Path path) {
          return path.getName().endsWith(".cfs");
        }
      };
      FileStatus[] cfsFiles =
          fs.listStatus(new Path(shards[i].getDirectory()), cfsFilter);
      assertEquals(1, cfsFiles.length);
    }
  }

}