A concise Tika/Lucene content parsing and indexing example

Apache Tika is a wonderfully simple toolkit (at the top level) for detecting and extracting metadata and structured text content from various documents using existing parser libraries.  It does have a lot of dependencies.

The Tika parser is very simple, requiring only a single method with three parameters:



void parse(InputStream stream, ContentHandler handler, Metadata metadata)
    throws IOException, SAXException, TikaException;

This example will extract 5 fields:  the file name, the full canonical path, the title and author if present in the particular document format’s metadata, and the full-text content.


/*
TikaIndexer: The simple indexer example for Lucene 3.0 extended with Tika to detect and parse
			 multiple document types.
Author: John Reece

First, lucene-core-*.jar and tika-app-*.jar need to be in your CLASSPATH

To index a single text file:
	java TikaIndexer <index_dir> <filename>
To index a directory of files (the program indexes one file per invocation, so
use find for full paths and -n 1 to invoke it once per file):
	find <document_dir> -type f | xargs -n 1 java TikaIndexer <index_dir>
If <index_dir> doesn't exist it will be created, otherwise it will be updated.

Yup, there are a lot of dependencies...
*/
import java.io.*;
import java.util.*;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

/**
 * Indexes a single document into a Lucene (3.0) index, using Tika's
 * AutoDetectParser to identify the file type and extract plain-text content
 * and format metadata (title/author where the format provides them).
 *
 * Usage: java TikaIndexer &lt;index_dir&gt; &lt;filename&gt;
 */
public class TikaIndexer {

    public static void main (String [] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: java TikaIndexer <index_dir> <filename>");
        System.exit(1);
    }
    Directory dir = FSDirectory.open(new File(args[0]));
    // The Version.LUCENE_XX is a required constructor argument in Version 3.
    Analyzer analysis = new StandardAnalyzer(Version.LUCENE_30);
    // IndexWriter will intelligently open an index for appending if the
    // index directory exists, else it will create a new index directory.
    IndexWriter idx = new IndexWriter (dir,analysis,IndexWriter.MaxFieldLength.UNLIMITED);
    try {
        // **** Tika specific-stuff.  Otherwise this is like the basic Lucene Indexer example.
        File f = new File(args[1]);

        ContentHandler contenthandler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        // Hint the detector with the file name (helps type detection by extension).
        metadata.set(Metadata.RESOURCE_NAME_KEY, f.getName());
        Parser parser = new AutoDetectParser();
        FileInputStream is = new FileInputStream(f);
        try {
            parser.parse(is, contenthandler, metadata);
        } finally {
            is.close();  // FIX: the input stream was never closed (fd leak).
        }
        // **** End Tika-specific

        Document doc = new Document();
        // Fields you want to display in toto in search results need to be stored
        // using the Field.Store.YES. The NOT_ANALYZED and ANALYZED
        // constant has replaced UN_TOKENIZED and TOKENIZED from previous versions.
        doc.add(new Field("name",f.getName(),Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("path",f.getCanonicalPath(),Field.Store.YES, Field.Index.NOT_ANALYZED));
        // FIX: Metadata.get() returns null when the document format carries no
        // title/author; Field's constructor rejects null values, so fall back to "".
        String title = metadata.get(Metadata.TITLE);
        String author = metadata.get(Metadata.AUTHOR);
        doc.add(new Field("title",title == null ? "" : title,Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("author",author == null ? "" : author,Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("contents",contenthandler.toString(),Field.Store.NO,Field.Index.ANALYZED));
        idx.addDocument(doc);
    } finally {
        // FIX: close the writer even if parsing/indexing throws, so the
        // index lock is released and buffered documents are flushed.
        idx.close();
    }
    } // main

} // TikaIndexer