File: LuceneIndexer.java

package info (click to toggle)
liblucy-perl 0.3.3-4
links: PTS, VCS
area: main
in suites: jessie, jessie-kfreebsd
size: 9,328 kB
ctags: 8,492
sloc: ansic: 80,468; perl: 7,080; yacc: 681; java: 174; lex: 96; makefile: 20
file content (240 lines) | stat: -rwxr-xr-x 8,587 bytes
parent folder | download | duplicates (2)
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Vector;

/**
 * LuceneIndexer - benchmarking app
 * usage: java LuceneIndexer [-docs MAX_TO_INDEX] [-reps NUM_REPETITIONS]
 *                           [-increment DOCS_PER_WRITER_SESSION] [-store 0|1]
 *
 * <ul>
 *   <li>-docs: number of documents to index per repetition (default: one
 *       pass over every article file found).</li>
 *   <li>-reps: how many times to rebuild the index (default: 1).</li>
 *   <li>-increment: close and reopen the IndexWriter after this many
 *       documents (default: never reopen during a run).</li>
 *   <li>-store: when non-zero, store the body text and its term vectors
 *       instead of indexing it straight from the file reader.</li>
 * </ul>
 *
 * Must be run from the directory containing 'extracted_corpus'.
 *
 * Recommended JVM options: -server -Xmx500M -XX:CompileThreshold=100
 */

public class LuceneIndexer {
  static File corpusDir = new File("extracted_corpus");
  static File indexDir  = new File("lucene_index");
  static String[] fileList;

  /**
   * Parses the command line, builds the index the requested number of
   * times, and prints per-run and aggregate timing reports.
   *
   * @param args command line flags; each flag must be followed by an
   *             integer value
   * @throws Exception if the corpus directory is missing, an argument is
   *                   unknown or lacks a value, or indexing fails
   */
  public static void main (String[] args) throws Exception {
    // verify that we're running from the right directory
    if (!corpusDir.exists()) {
      throw new Exception("Can't find 'extracted_corpus' directory");
    }

    // assemble the sorted list of article files
    fileList = buildFileList();

    // parse command line args
    int maxToIndex = fileList.length; // default: index all docs
    int numReps    = 1;               // default: run once
    int increment  = 0;               // default: never reopen the writer
    boolean store  = false;
    int i = 0;
    while (i < args.length && args[i].startsWith("-")) {
      String arg = args[i++];
      boolean known = arg.equals("-docs") || arg.equals("-reps")
          || arg.equals("-increment") || arg.equals("-store");
      if (!known) {
        throw new Exception("Unknown argument: " + arg);
      }
      // Every flag takes a value; report a dangling flag instead of
      // silently running the benchmark with the default setting.
      if (i >= args.length) {
        throw new Exception("Missing value for argument: " + arg);
      }
      int value = Integer.parseInt(args[i++]);
      if (arg.equals("-docs")) {
        maxToIndex = value;
      }
      else if (arg.equals("-reps")) {
        numReps = value;
      }
      else if (arg.equals("-increment")) {
        increment = value;
      }
      else {
        store = value != 0;
      }
    }
    if (numReps < 1) {
      throw new Exception("-reps must be at least 1, got " + numReps);
    }
    // An increment larger than the doc count is never reached, so the
    // writer stays open for the whole run.
    increment = increment == 0 ? maxToIndex + 1 : increment;

    // start the output
    System.out.println("---------------------------------------------------");

    // build the index numReps times, then print a final report
    float[] times = new float[numReps];
    for (int rep = 1; rep <= numReps; rep++) {
      // start the clock and build the index; nanoTime is monotonic, so
      // the measurement is immune to wall-clock adjustments
      long start = System.nanoTime();
      int numIndexed = buildIndex(fileList, maxToIndex, increment, store);

      // stop the clock and print a report
      long end = System.nanoTime();
      float secs = (float) ((end - start) / 1.0e9);
      times[rep - 1] = secs;
      printInterimReport(rep, secs, numIndexed);
    }
    printFinalReport(times);
  }

  /**
   * Returns a lexically sorted list of all article files from all
   * immediate subdirectories of the corpus directory. Only paths that
   * contain the text "article" are kept.
   *
   * @return sorted article file paths (possibly empty)
   * @throws Exception if the corpus directory cannot be listed
   */
  static String[] buildFileList () throws Exception {
    File[] articleDirs = corpusDir.listFiles();
    if (articleDirs == null) {
      throw new Exception("Can't list directory: " + corpusDir.getPath());
    }
    List<String> filePaths = new ArrayList<String>();
    for (File articleDir : articleDirs) {
      File[] articles = articleDir.listFiles();
      // listFiles() yields null for anything that is not a readable
      // directory (e.g. a stray regular file in the corpus dir); skip it.
      if (articles == null) {
        continue;
      }
      for (File article : articles) {
        String path = article.getPath();
        if (path.contains("article")) {
          filePaths.add(path);
        }
      }
    }
    Collections.sort(filePaths);
    return filePaths.toArray(new String[filePaths.size()]);
  }

  /**
   * Opens an IndexWriter on the index directory.
   *
   * @param count number of docs already indexed in this run; the index
   *              is created from scratch when this is not positive and
   *              opened for appending otherwise
   * @return a writer using a WhitespaceAnalyzer with compound files
   *         disabled
   * @throws Exception if the writer cannot be opened
   */
  static IndexWriter initWriter (int count) throws Exception {
    boolean create = count <= 0;
    FSDirectory directory = FSDirectory.getDirectory(indexDir);
    IndexWriter writer = new IndexWriter(directory, true,
        new WhitespaceAnalyzer(), create);
    // writer.setMaxBufferedDocs(1000);
    writer.setUseCompoundFile(false);

    return writer;
  }

  /**
   * Builds an index of exactly maxToIndex documents. The file list is
   * cycled through repeatedly when maxToIndex exceeds its length; nothing
   * is added when maxToIndex is not positive.
   *
   * @param fileList   paths of the article files to index
   * @param maxToIndex number of documents to add
   * @param increment  close and reopen the writer after every this many
   *                   documents; 0 means never reopen
   * @param store      whether to store the body text and term vectors
   * @return the document count reported by the writer before optimizing
   * @throws Exception if there is nothing to index although documents
   *                   were requested, or if reading or indexing fails
   */
  static int buildIndex (String[] fileList, int maxToIndex, int increment,
                         boolean store) throws Exception {
    // With no files the loop below could never make progress.
    if (maxToIndex > 0 && fileList.length == 0) {
      throw new Exception("No article files found to index");
    }

    IndexWriter writer = initWriter(0);
    try {
      int docsSoFar = 0;

      while (docsSoFar < maxToIndex) {
        for (int i = 0; i < fileList.length; i++) {
          indexFile(writer, fileList[i], store);

          docsSoFar++;
          if (docsSoFar >= maxToIndex) {
            break;
          }
          if (increment != 0 && docsSoFar % increment == 0) {
            // Drop our reference before closing so that a failure here
            // does not lead to a second close() in the finally block.
            IndexWriter finished = writer;
            writer = null;
            finished.close();
            writer = initWriter(docsSoFar);
          }
        }
      }

      // finish index
      int numIndexed = writer.docCount();
      writer.optimize();
      return numIndexed;
    } finally {
      // Always release the writer, even when a document fails, so the
      // index is not left open for the next repetition or run.
      if (writer != null) {
        writer.close();
      }
    }
  }

  // Read one article file and add it to the index as a single document.
  private static void indexFile (IndexWriter writer, String path,
                                 boolean store) throws Exception {
    Document doc = new Document();
    BufferedReader br = new BufferedReader(new FileReader(new File(path)));

    try {
      // the title is the first line
      String title = br.readLine();
      if (title == null) {
        throw new Exception("Failed to read title");
      }
      doc.add(new Field("title", title, Field.Store.YES,
          Field.Index.TOKENIZED, Field.TermVector.NO));

      // the body is the rest
      if (store) {
        StringBuilder buf = new StringBuilder();
        String line;
        while ( (line = br.readLine()) != null ) {
          // readLine() strips the line terminator; put one back so the
          // last word of a line is not fused with the first word of the
          // next, which would change the tokens being indexed.
          buf.append(line).append('\n');
        }
        doc.add(new Field("body", buf.toString(), Field.Store.YES,
            Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
      } else {
        // The reader is consumed by addDocument() below, so it must stay
        // open until then.
        doc.add(new Field("body", br));
      }

      writer.addDocument(doc);
    } finally {
      br.close();
    }
  }

  // Format a duration in seconds with two decimals and digit grouping.
  private static String formatSecs(float secs) {
    return new DecimalFormat("#,##0.00").format(secs);
  }

  // Print out stats for one run.
  private static void printInterimReport(int rep, float secs,
                                         int numIndexed) {
    System.out.println(rep + "   Secs: " + formatSecs(secs) +
                       "  Docs: " + numIndexed);
  }

  // Print out aggregate stats: the mean and the truncated mean of the run
  // times, plus Lucene, JVM and OS identification.
  private static void printFinalReport(float[] times) {
    // sort a copy so the caller's array keeps its run order
    float[] sorted = times.clone();
    Arrays.sort(sorted);

    // produce mean and truncated mean
    float meanTime = 0.0f;
    float truncatedMeanTime = 0.0f;
    int numToChop = sorted.length >> 2;
    int numKept = 0;
    for (int i = 0; i < sorted.length; i++) {
      meanTime += sorted[i];
      // discard fastest 25% and slowest 25% of reps
      if (i < numToChop || i >= (sorted.length - numToChop)) {
        continue;
      }
      truncatedMeanTime += sorted[i];
      numKept++;
    }
    meanTime /= sorted.length;
    truncatedMeanTime /= numKept;
    int numDiscarded = sorted.length - numKept;
    String meanString = formatSecs(meanTime);
    String truncatedMeanString = formatSecs(truncatedMeanTime);

    // get the Lucene version; both the package object and its
    // specification version may be unavailable (e.g. no jar manifest)
    Package lucenePackage = org.apache.lucene.LucenePackage.get();
    String luceneVersion = lucenePackage == null
        ? null : lucenePackage.getSpecificationVersion();
    if (luceneVersion == null) {
      luceneVersion = "unknown";
    }

    System.out.println("---------------------------------------------------");
    System.out.println("Lucene " +  luceneVersion);
    System.out.println("JVM " + System.getProperty("java.version") +
                       " (" + System.getProperty("java.vendor") + ")");
    System.out.println(System.getProperty("os.name") + " " +
                       System.getProperty("os.version") + " " +
                       System.getProperty("os.arch"));
    System.out.println("Mean: " + meanString + " secs");
    System.out.println("Truncated mean (" +
                        numKept + " kept, " +
                        numDiscarded + " discarded): " +
                        truncatedMeanString + " secs");
    System.out.println("---------------------------------------------------");
  }
}