Saturday, March 17, 2012

Basic Command Line Lucene Test

This post walks through a basic command-line Lucene sample. It covers
creating a document, adding fields, analyzing, storing, indexing and searching.

Getting Started:

Download Lucene here:
http://www.apache.org/dyn/closer.cgi/lucene/java/3.5.0

LuceneTest.java

package test.lucene;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.Version;

import java.io.IOException;

/**
 * Minimal command-line Lucene 3.5 sample: builds a small in-memory index
 * of four documents, then runs one query against the "content" field and
 * prints the hits to standard output.
 */
public class LuceneTest {
    // The analyzer for tokenizing text, indexing and searching
    public static StandardAnalyzer analyzer =
        new StandardAnalyzer(Version.LUCENE_35);

    /**
     * Builds the index, then searches it.
     * @param args Command line arguments (not used)
     * @throws IOException if the index cannot be written or read
     * @throws ParseException if the query string cannot be parsed
     */
    public static void main(String[] args)
        throws IOException, ParseException {
        // create the index (kept in memory)
        Directory index = new RAMDirectory();
        // create the index writer config
        IndexWriterConfig config =
            new IndexWriterConfig(Version.LUCENE_35, analyzer);
        // create index writer by index and config
        IndexWriter iw = new IndexWriter(index, config);

        // add documents into index writer; always close the writer,
        // even if adding a document fails
        try {
            addDocuments(iw);
        } finally {
            iw.close();
        }

        // Search the 'Test Content Three' in documents content
        doSearch(index, "content", "Test Content Three");

    }

    /**
     * Adds the four sample documents to the given writer.
     * Each document has a "title" and a "content" field; the content
     * field varies in whether it is stored and/or analyzed.
     * @param iw The index writer to add the documents to
     * @throws IOException if a document cannot be added
     */
    private static void addDocuments(IndexWriter iw)
        throws IOException {

        iw.addDocument(new DocumentWrapper()
                .createDoc("title", "Test Title One", true, true)
                .addField("content", "Test Content One", true, true)
                .getDocument());
        // content of this one is not analyzed: the value is indexed as
        // one un-tokenized term, so the analyzed query terms do not match it
        iw.addDocument(new DocumentWrapper()
                .createDoc("title", "Test Title Two", true, true)
                .addField("content", "Test Content Two", true, false)
                .getDocument());
        iw.addDocument(new DocumentWrapper()
                .createDoc("title", "Test Title Three", true, true)
                .addField("content", "Test Content Three", true, true)
                .getDocument());
        // content of this one is not stored:
        // it can be searched, but the retrieved content is null
        iw.addDocument(new DocumentWrapper()
                .createDoc("title", "Test Title Four", true, true)
                .addField("content", "Test Content Four", false, true)
                .getDocument());
    }

    /**
     * Parses the query string against the given default field, runs the
     * search and prints up to ten hits (title and content) to standard output.
     * @param index The directory holding the index to search
     * @param field The default field used by the query parser
     * @param content The query string
     * @throws IOException if the index cannot be read
     * @throws ParseException if the query string cannot be parsed
     */
    private static void doSearch (Directory index, String field, String content)
        throws IOException, ParseException{
        // query string
        String querystr = content;

        // query, with default field; parsed before any resource is opened
        Query q = new QueryParser(Version.LUCENE_35, field, analyzer)
                        .parse(querystr);

        // search
        int hitsPerPage = 10;
        IndexReader reader = IndexReader.open(index);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            try {
                TopScoreDocCollector collector =
                    TopScoreDocCollector.create(hitsPerPage, true);
                searcher.search(q, collector);
                ScoreDoc[] hits = collector.topDocs().scoreDocs;

                // display results
                System.out.println(hits.length + " results.");
                for(int i=0; i < hits.length; i++) {
                    int docId = hits[i].doc;
                    Document doc = searcher.doc(docId);
                    System.out.println((i + 1) + "\ttitle: " + doc.get("title")
                                        + "\n\tcontent: " + doc.get("content"));
                }
            } finally {
                // close searcher
                searcher.close();
            }
        } finally {
            // a searcher built from an existing reader does not close
            // that reader, so it has to be closed explicitly
            reader.close();
        }
    }
}

This class creates the analyzer, the index and the index writer, adds some
documents through the index writer, then searches for 'Test Content Three'
in the content field.

DocumentWrapper.java

package test.lucene;

import java.io.IOException;

import org.apache.lucene.document.*;

/**
 * Wrap the Document so we can do get document in more convenient way:
 * 
 * DocumentWrapper.createDoc(...).addField(...)
 * .addField.......getDocument()
 *
 */
/**
 * Wraps a Document so it can be built with chained calls:
 *
 * new DocumentWrapper().createDoc(...).addField(...)
 * .addField(...).getDocument()
 *
 * createDoc must be called first, and only once per wrapper.
 */
public class DocumentWrapper {
    // The wrapped Document; null until createDoc is called
    private Document _doc;

    /**
     * Create the document and add its first field.
     * @param name Field name
     * @param value Field value
     * @param store Store value or not
     * @param analyzed Analyze value or not
     * @return This DocumentWrapper instance, for chaining
     * @throws IOException declared for callers; not thrown by this code itself
     * @throws IllegalStateException if the document was already created
     */
    public DocumentWrapper createDoc(String name, String value,
        boolean store, boolean analyzed)
            throws IOException, IllegalStateException {
        if (_doc != null) {
            throw new IllegalStateException ("Document already created!");
        }
        _doc = new Document();
        return addField(name, value, store, analyzed);
    }
    /**
     * Add a field to the wrapped document.
     * @param name Field name
     * @param value Field value
     * @param store Store value or not
     * @param analyzed Analyze value or not
     * @return This DocumentWrapper instance, for chaining
     * @throws IOException declared for callers; not thrown by this code itself
     * @throws IllegalStateException if createDoc has not been called yet
     */
    public DocumentWrapper addField(String name, String value,
            boolean store, boolean analyzed) throws IOException {
        // fail with a clear message instead of a NullPointerException
        if (_doc == null) {
            throw new IllegalStateException (
                "Document not created yet, call createDoc first!");
        }
        _doc.add(new Field(name,
                value,
                store? Field.Store.YES : Field.Store.NO,
                analyzed? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED));
        return this;
    }
    /**
     * Get the document
     * @return The document wrapped by this wrapper,
     *         or null if createDoc has not been called.
     */
    public Document getDocument () {
        return _doc;
    }
}

This class wraps a document and has two methods, 'createDoc' and 'addField',
that can be chained.

Execute result:



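Test Content Three is the first result, as expected.
Test Content Two is not in the result list because its content is not analyzed:
it is indexed as a single exact term, which the analyzed query terms do not match.
The content of Test Content Four is displayed as 'null' because it is not stored.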

Download:

The full project is on GitHub:
https://github.com/benbai123/JSP_Servlet_Practice/tree/master/Practice/JAVA/Search/LuceneTest

Reference:
http://www.lucenetutorial.com/lucene-in-5-minutes.html

No comments:

Post a Comment