Sunday, March 18, 2012

Wrap Lucene classes to do search easily.

This post is about building some wrappers around the Lucene classes
to make searching easier.

The sample is the same as the first one at
http://ben-bai.blogspot.com/2012/03/basic-command-line-lucene-test.html

but uses wrappers instead of calling the Lucene classes directly.

The Test and Wrappers

LuceneTestTwo.java

package test.lucene.testtwo;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.ScoreDoc;

import test.lucene.wrapper.*;

/**
 * Command-line sample: indexes four small documents through the wrapper
 * classes, searches the "content" field and prints every hit.
 */
public class LuceneTestTwo {
    public static void main(String[] args)
    throws IOException, ParseException {
        // The wrapped writer keeps the analyzer and directory that the
        // query and the searcher are later built from.
        WrappedIndexWriter writer = new WrappedIndexWriter();
        writer.createIndexWriter();

        // Index four documents, each with a "title" and a "content" field.
        // The two trailing flags of every field are (store, analyzed).
        writer.addDocument(
            new WrappedDocument("title", "Test Title One", true, true)
                .addField("content", "Test Content One", true, true)
                .getDocument());
        writer.addDocument(
            new WrappedDocument("title", "Test Title Two", true, true)
                .addField("content", "Test Content Two", true, false)
                .getDocument());
        writer.addDocument(
            new WrappedDocument("title", "Test Title Three", true, true)
                .addField("content", "Test Content Three", true, true)
                .getDocument());
        writer.addDocument(
            new WrappedDocument("title", "Test Title Four", true, true)
                .addField("content", "Test Content Four", false, true)
                .getDocument());
        writer.close();

        // Open a searcher on the writer's directory, then parse and run
        // the query, asking for at most 10 hits.
        WrappedSearcher searcher = new WrappedSearcher();
        searcher.initSearcher(writer);
        WrappedQuery query = new WrappedQuery()
            .createQuery(writer, "content", "Test Content Three");
        ScoreDoc[] hits = searcher.doSearch(query, 10);

        // Print the hit count followed by one numbered entry per hit.
        System.out.println(hits.length + " results.");
        int rank = 0;
        for (ScoreDoc hit : hits) {
            rank++;
            Document doc = searcher.getIndexSearcher().doc(hit.doc);
            System.out.println(rank + "\ttitle: " + doc.get("title")
                                + "\n\tcontent: " + doc.get("content"));
        }
    }
}

WrappedIndexWriter.java

package test.lucene.wrapper;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.Version;

/**
 * Bundles a Lucene IndexWriter with the Analyzer and Directory it was
 * created from, so that later steps (query parsing, opening a reader)
 * can fetch the matching instances from one place.
 *
 * createIndexWriter() must be called before documents are added or
 * updated; until then the getters return null.
 */
public class WrappedIndexWriter {
    // The analyzer for tokenizing text, indexing and searching
    private Analyzer _analyzer;
    // The directory that holds the index
    private Directory _dir;
    // The configuration the index writer was built with
    private IndexWriterConfig _config;
    // The wrapped index writer; null until createIndexWriter() has run
    private IndexWriter _iw;

    /**
     * Create the IndexWriter together with its StandardAnalyzer and
     * RAMDirectory. Calling this again replaces all of them with fresh
     * instances; a previously created writer is not closed by this method.
     * @throws CorruptIndexException
     * @throws LockObtainFailedException
     * @throws IOException
     */
    public void createIndexWriter ()
        throws CorruptIndexException, LockObtainFailedException, IOException {
        _analyzer = new StandardAnalyzer(Version.LUCENE_35);
        _dir = new RAMDirectory();
        _config = new IndexWriterConfig(Version.LUCENE_35, _analyzer);
        _iw = new IndexWriter(_dir, _config);
    }
    /**
     * Get the Analyzer of this IndexWriter
     * @return Analyzer, or null if createIndexWriter() has not been called
     */
    public Analyzer getAnalyzer () {
        return _analyzer;
    }
    /**
     * Get the Directory of this IndexWriter
     * @return Directory, or null if createIndexWriter() has not been called
     */
    public Directory getDirectory () {
        return _dir;
    }
    /**
     * Get the wrapped IndexWriter
     * @return IndexWriter The wrapped IndexWriter, or null if
     *         createIndexWriter() has not been called
     */
    public IndexWriter getIndexWriter() {
        return _iw;
    }
    /**
     * Add document into IndexWriter
     * @param doc The document to add
     * @return WrappedIndexWriter Self instance
     * @throws IllegalStateException if createIndexWriter() has not been called
     * @throws CorruptIndexException
     * @throws IOException
     */
    public WrappedIndexWriter addDocument (Document doc)
        throws CorruptIndexException, IOException {
        requireWriter().addDocument(doc);
        return this;
    }
    /**
     * Update a Document in the IndexWriter
     * @param term The term that indicate the Document to be updated
     * @param doc The new Document
     * @return WrappedIndexWriter Self instance
     * @throws IllegalStateException if createIndexWriter() has not been called
     * @throws CorruptIndexException
     * @throws IOException
     */
    public WrappedIndexWriter updateDocument (Term term, Document doc)
        throws CorruptIndexException, IOException {
        requireWriter().updateDocument(term, doc);
        return this;
    }
    /**
     * Close the IndexWriter. Does nothing when no writer has been
     * created yet, so cleanup code may call it unconditionally.
     * @throws IOException
     * @throws CorruptIndexException
     */
    public void close() throws CorruptIndexException, IOException {
        if (_iw == null) {
            // Nothing was ever opened, so there is nothing to release.
            return;
        }
        _iw.close();
    }

    // Returns the writer, or fails with a descriptive error instead of a
    // bare NullPointerException when createIndexWriter() was skipped.
    private IndexWriter requireWriter() {
        if (_iw == null) {
            throw new IllegalStateException(
                "IndexWriter not created; call createIndexWriter() first");
        }
        return _iw;
    }
}

The Analyzer and Directory are stored together with the IndexWriter in this
wrapper, so we can be sure to get the matching instances from it.

WrappedDocument.java

package test.lucene.wrapper;

import java.io.IOException;

import org.apache.lucene.document.*;

/**
 * Wrap the Document so a document can be built in a more convenient way:
 *
 * new WrappedDocument(...).addField(...)
 * .addField(...).getDocument()
 *
 * or, starting from the no-argument constructor,
 *
 * new WrappedDocument().createDoc(...).addField(...).getDocument()
 */
public class WrappedDocument {
    // The wrapped Document; null until the first field has been added
    private Document _doc;

    /**
     * Create an empty wrapper. The Document itself is created by
     * createDoc(...) or by the first call to addField(...).
     */
    public WrappedDocument () {

    }
    /**
     * Create the wrapper and the Document, and add the first field.
     * @param name Field name
     * @param value Field value
     * @param store Store value or not
     * @param analyzed Analyze value or not
     * @throws IllegalStateException
     * @throws IOException
     */
    public WrappedDocument (String name, String value,
            boolean store, boolean analyzed)
            throws IllegalStateException, IOException {
        createDoc(name, value, store, analyzed);
    }
    /**
     * Create document and add field.
     * @param name Field name
     * @param value Field value
     * @param store Store value or not
     * @param analyzed Analyze value or not
     * @return WrappedDocument Self instance
     * @throws IOException
     * @throws IllegalStateException if this wrapper already holds a Document
     */
    public WrappedDocument createDoc(String name, String value,
        boolean store, boolean analyzed)
            throws IOException, IllegalStateException {
        if (_doc != null) {
            throw new IllegalStateException ("Document already created!");
        }
        _doc = new Document();
        return addField(name, value, store, analyzed);
    }
    /**
     * Add field. If no Document exists yet (the wrapper was built with the
     * no-argument constructor and createDoc was not called), one is created
     * first, so this method never fails on a missing Document.
     * @param name Field name
     * @param value Field value
     * @param store Store value or not
     * @param analyzed Analyze value or not
     * @return WrappedDocument Self instance
     * @throws IOException
     */
    public WrappedDocument addField(String name, String value,
            boolean store, boolean analyzed) throws IOException {
        if (_doc == null) {
            // Reached only via the no-argument constructor; create the
            // Document on demand rather than dereferencing null below.
            _doc = new Document();
        }
        Field.Store storeMode = store ? Field.Store.YES : Field.Store.NO;
        Field.Index indexMode =
            analyzed ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED;
        _doc.add(new Field(name, value, storeMode, indexMode));
        return this;
    }
    /**
     * Get the document
     * @return The document wrapped by this wrapper, or null if no field
     *         has been added yet.
     */
    public Document getDocument () {
        return _doc;
    }
}

This is almost the same as the Document wrapper in the first test.

WrappedQuery.java

package test.lucene.wrapper;

import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.*;
import org.apache.lucene.search.*;
import org.apache.lucene.util.Version;
/**
 * Holds a single Lucene Query and offers chainable factory methods for
 * the supported query types, e.g.
 * createBooleanQuery().addQuery(...).addQuery(...)
 *
 * Every create* method replaces the query held so far.
 */
public class WrappedQuery {
    // the wrapped Query; null until one of the create* methods is called
    private Query _query;

    /**
     * Create a Query by parsing a query string with a QueryParser.
     * @param wiw The wrapped index writer that supplies the Analyzer
     * @param field The default field for the parser
     * @param content The query string to parse
     * @return WrappedQuery Self instance
     * @throws ParseException If the query string cannot be parsed
     */
    public WrappedQuery createQuery (WrappedIndexWriter wiw, String field, String content)
        throws ParseException {
        QueryParser parser =
            new QueryParser(Version.LUCENE_35, field, wiw.getAnalyzer());
        return hold(parser.parse(content));
    }

    /**
     * Create an empty BooleanQuery; clauses are added with addQuery.
     * @return WrappedQuery Self instance
     */
    public WrappedQuery createBooleanQuery () {
        return hold(new BooleanQuery());
    }

    /**
     * Create a TermQuery
     * @param term The term to query
     * @return WrappedQuery Self instance
     */
    public WrappedQuery createTermQuery (Term term) {
        return hold(new TermQuery(term));
    }

    /**
     * Create a TermRangeQuery
     * @param field The field to query
     * @param lower The lower bound
     * @param upper The upper bound
     * @param includeLower Passed to TermRangeQuery: include the lower bound or not
     * @param includeUpper Passed to TermRangeQuery: include the upper bound or not
     * @return WrappedQuery Self instance
     */
    public WrappedQuery createTermRangeQuery (String field, String lower,
            String upper, boolean includeLower, boolean includeUpper) {
        return hold(new TermRangeQuery(field, lower, upper,
                                       includeLower, includeUpper));
    }

    /**
     * Create a WildcardQuery
     * @param term The term to query
     * @return WrappedQuery Self instance
     */
    public WrappedQuery createWildcardQuery (Term term) {
        return hold(new WildcardQuery(term));
    }

    /**
     * Add a TermQuery clause to the held BooleanQuery.
     * @param term The term to add
     * @param occur MUST, MUST_NOT or SHOULD
     * @return WrappedQuery Self instance
     * @throws UnsupportedOperationException If the held query is not a
     *         BooleanQuery (including when no query has been created)
     */
    public WrappedQuery addQuery (Term term, BooleanClause.Occur occur) {
        if (!(_query instanceof BooleanQuery)) {
            throw new UnsupportedOperationException("addQuery only works with BooleanQuery");
        }
        BooleanQuery booleanQuery = (BooleanQuery) _query;
        booleanQuery.add(new TermQuery(term), occur);
        return this;
    }

    /**
     * Get the wrapped Query
     * @return Query The wrapped query
     */
    public Query getQuery () {
        return _query;
    }

    // Stores the given query as the wrapped one and returns this wrapper
    // so the create* methods can be chained.
    private WrappedQuery hold (Query query) {
        _query = query;
        return this;
    }
}

Wrap the Query so we can create various types of Query as needed.

WrappedSearcher.java

package test.lucene.wrapper;

import java.io.IOException;

import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.*;
import org.apache.lucene.search.*;
/**
 * Wraps an IndexSearcher and hides the steps needed to open it and to
 * run a search with it.
 *
 * The searcher is kept in a static field, so it is shared by all
 * WrappedSearcher instances.
 */
public class WrappedSearcher {
    /**
     * The single searcher shared by every instance; null until
     * initSearcher has been called.
     */
    private static IndexSearcher _searcher;

    /**
     * Open a reader on the writer's Directory and install a new searcher
     * on top of it. A searcher installed earlier is simply replaced; it
     * and its reader are not closed here.
     * @param wiw The wrapped index writer whose Directory is searched
     * @return WrappedSearcher Self instance
     * @throws CorruptIndexException
     * @throws IOException
     */
    public WrappedSearcher initSearcher (WrappedIndexWriter wiw)
        throws CorruptIndexException, IOException {
        _searcher = new IndexSearcher(IndexReader.open(wiw.getDirectory()));
        return this;
    }

    /**
     * Get the searcher instance
     * @return IndexSearcher The shared searcher, or null before initSearcher
     */
    public IndexSearcher getIndexSearcher () {
        return _searcher;
    }

    /**
     * Run the wrapped query and return the collected hits.
     * @param wq WrappedQuery holding the query to run
     * @param resultsPerPage Number of hits passed to the collector
     * @return ScoreDoc[] The results
     * @throws IOException
     * @throws ParseException
     */
    public ScoreDoc[] doSearch (WrappedQuery wq, int resultsPerPage)
        throws IOException, ParseException{
        // The second argument is the collector's docsScoredInOrder flag.
        TopScoreDocCollector topHits =
            TopScoreDocCollector.create(resultsPerPage, true);
        _searcher.search(wq.getQuery(), topHits);
        return topHits.topDocs().scoreDocs;
    }
}

Wrap the Searcher so we do not need to worry about
the initialization and search process.

Download:

The test project is on GitHub:
https://github.com/benbai123/JSP_Servlet_Practice/tree/master/Practice/JAVA/Search/LuceneTest

Reference:
Official Javadoc
http://lucene.apache.org/core/old_versioned_docs/versions/3_5_0/api/all/
Using Apache Lucene to search text (IBM)
http://www.ibm.com/developerworks/java/library/os-apache-lucenesearch/index.html

No comments:

Post a Comment