
package uk.co.flax.luwak;
/*
* Copyright (c) 2015 Lemur Consulting Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Closeable;
import java.io.IOException;
import java.util.*;
import org.apache.lucene.index.*;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.IOUtils;
/**
 * A collection of InputDocuments to be matched.
 *
 * A batch containing a single InputDocument uses a Lucene MemoryIndex for indexing,
 * otherwise a RAMDirectory is used to hold the documents.
 *
 * To build a batch, either use one of the static factory methods, or a Builder object:
 * <pre>
 *     DocumentBatch batch1 = DocumentBatch.of(doc1, doc2);
 *     DocumentBatch batch2 = new DocumentBatch.Builder()
 *                                  .setSimilarity(new MySimilarity())
 *                                  .add(doc1)
 *                                  .addAll(listOfDocs)
 *                                  .build();
 * </pre>
 */
public abstract class DocumentBatch implements Closeable, Iterable<InputDocument> {

    /** The {@link Similarity} to be used for scoring (if scoring is required) */
    protected final Similarity similarity;

    /** A list of {@link InputDocument} objects to match */
    protected final List<InputDocument> documents = new ArrayList<>();
    /**
     * Create a DocumentBatch containing a single InputDocument
     * @param doc the document to add
     * @return the batch containing the input document
     */
    public static DocumentBatch of(InputDocument doc) {
        return new DocumentBatch.Builder().add(doc).build();
    }
    /**
     * Create a DocumentBatch containing a set of InputDocuments
     * @param docs Collection of documents to add
     * @return the batch containing the input documents
     */
    public static DocumentBatch of(Collection<InputDocument> docs) {
        return new DocumentBatch.Builder().addAll(docs).build();
    }
    /**
     * Create a DocumentBatch containing a set of InputDocuments
     * @param docs list of documents to add
     * @return the batch containing the input documents
     */
    public static DocumentBatch of(InputDocument... docs) {
        return of(Arrays.asList(docs));
    }
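    /*
     * A minimal usage sketch (not part of the API): the static factories accept a single
     * document, a varargs list, or a Collection. The variables doc1, doc2 and docs below
     * are placeholders for InputDocument instances built elsewhere.
     *
     *     DocumentBatch single = DocumentBatch.of(doc1);
     *     DocumentBatch pair   = DocumentBatch.of(doc1, doc2);
     *     DocumentBatch many   = DocumentBatch.of(docs);   // docs is a Collection<InputDocument>
     */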
    /**
     * Builder class for DocumentBatch
     */
    public static class Builder {

        private Similarity similarity = new BM25Similarity();
        private List<InputDocument> documents = new ArrayList<>();
        /** Add an InputDocument
         * @param doc Single document to add
         * @return the current builder object
         */
        public Builder add(InputDocument doc) {
            documents.add(doc);
            return this;
        }

        /** Add a collection of InputDocuments
         * @param docs Collection of documents to add
         * @return the current builder object
         */
        public Builder addAll(Collection<InputDocument> docs) {
            documents.addAll(docs);
            return this;
        }
        /** Set the {@link Similarity} to be used for scoring this batch
         * @param similarity the {@link Similarity} to be used for scoring this batch
         * @return the current builder object
         */
        public Builder setSimilarity(Similarity similarity) {
            this.similarity = similarity;
            return this;
        }

        /** Create the DocumentBatch
         * @return the newly created DocumentBatch
         */
        public DocumentBatch build() {
            if (documents.isEmpty())
                throw new IllegalStateException("Cannot build DocumentBatch with zero documents");
            if (documents.size() == 1)
                return new SingletonDocumentBatch(documents, similarity);
            return new MultiDocumentBatch(documents, similarity);
        }
    }
    /**
     * Create a new DocumentBatch
     * @param documents the documents to match
     * @param similarity the {@link Similarity} to use for scoring
     */
    protected DocumentBatch(Collection<InputDocument> documents, Similarity similarity) {
        this.similarity = similarity;
        this.documents.addAll(documents);
    }
    /**
     * @return a {@link LeafReader} over the documents in this batch
     * @throws IOException on error
     */
    public abstract LeafReader getIndexReader() throws IOException;

    /**
     * Convert the Lucene docid for a document in the batch to the luwak docid
     * @param docId the Lucene docid
     * @return the luwak docid
     */
    public abstract String resolveDocId(int docId);
    /**
     * @return an {@link IndexSearcher} over the documents in this batch
     * @throws IOException on error
     */
    public IndexSearcher getSearcher() throws IOException {
        IndexSearcher searcher = new IndexSearcher(getIndexReader());
        searcher.setSimilarity(similarity);
        return searcher;
    }
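    /*
     * Sketch of how a caller might run a query against the batch and map Lucene hits back
     * to luwak document ids. The "text" field and the "luwak" term are illustrative
     * assumptions, as is the 'docs' collection; the batch is closed via try-with-resources
     * because DocumentBatch implements Closeable.
     *
     *     try (DocumentBatch batch = DocumentBatch.of(docs)) {
     *         IndexSearcher searcher = batch.getSearcher();
     *         TopDocs hits = searcher.search(new TermQuery(new Term("text", "luwak")), 10);
     *         for (ScoreDoc hit : hits.scoreDocs) {
     *             String luwakId = batch.resolveDocId(hit.doc);  // luwak id of the matching document
     *         }
     *     }
     */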
    @Override
    public Iterator<InputDocument> iterator() {
        return documents.iterator();
    }

    /**
     * @return the number of documents in the batch
     */
    public int getBatchSize() {
        return documents.size();
    }
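    /*
     * Because DocumentBatch is Iterable<InputDocument>, the input documents can be walked
     * directly (illustrative only; 'batch' is assumed to have been built via of() or the Builder):
     *
     *     for (InputDocument doc : batch) {
     *         // inspect each input document
     *     }
     *     int size = batch.getBatchSize();
     */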
    // Implementation of DocumentBatch for collections of documents
    private static class MultiDocumentBatch extends DocumentBatch {

        private final Directory directory = new RAMDirectory();
        private LeafReader reader = null;
        private String[] docIds = null;

        MultiDocumentBatch(List<InputDocument> docs, Similarity similarity) {
            super(docs, similarity);
            assert docs.size() > 1;
            IndexWriterConfig iwc = new IndexWriterConfig(docs.get(0).getAnalyzers()).setSimilarity(similarity);
            try (IndexWriter writer = new IndexWriter(directory, iwc)) {
                this.reader = build(writer);
            }
            catch (IOException e) {
                throw new RuntimeException(e);  // This is a RAMDirectory, so should never happen...
            }
        }
        @Override
        public LeafReader getIndexReader() throws IOException {
            return reader;
        }

        private LeafReader build(IndexWriter writer) throws IOException {
            for (InputDocument doc : documents) {
                writer.addDocument(doc.getDocument());
            }
            writer.commit();
            writer.forceMerge(1);   // merge down to one segment so the index has a single leaf
            LeafReader reader = DirectoryReader.open(directory).leaves().get(0).reader();
            assert reader != null;
            docIds = new String[reader.maxDoc()];
            for (int i = 0; i < docIds.length; i++) {
                docIds[i] = reader.document(i).get(InputDocument.ID_FIELD); // TODO can this be more efficient?
            }
            return reader;
        }
        @Override
        public String resolveDocId(int docId) {
            return docIds[docId];
        }

        @Override
        public void close() throws IOException {
            IOUtils.close(reader, directory);
        }
    }
    // Specialized class for batches containing a single object - MemoryIndex benchmarks as
    // better performing than RAMDirectory for this case
    private static class SingletonDocumentBatch extends DocumentBatch {

        private final MemoryIndex memoryindex = new MemoryIndex(true, true);
        private final LeafReader reader;

        private SingletonDocumentBatch(Collection<InputDocument> documents, Similarity similarity) {
            super(documents, similarity);
            assert documents.size() == 1;
            memoryindex.setSimilarity(similarity);
            for (InputDocument doc : documents) {
                for (IndexableField field : doc.getDocument()) {
                    memoryindex.addField(field, doc.getAnalyzers());
                }
            }
            memoryindex.freeze();
            reader = (LeafReader) memoryindex.createSearcher().getIndexReader();
        }
        @Override
        public LeafReader getIndexReader() throws IOException {
            return reader;
        }

        @Override
        public String resolveDocId(int docId) {
            assert docId == 0;
            return documents.get(0).getId();
        }

        @Override
        public void close() throws IOException {
            reader.close();
        }
    }
}