All Downloads are FREE. Search and download functionalities are using the official Maven repository.

uk.co.flax.luwak.DocumentBatch Maven / Gradle / Ivy

The newest version!
package uk.co.flax.luwak;

/*
 *   Copyright (c) 2015 Lemur Consulting Ltd.
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */

import java.io.Closeable;
import java.io.IOException;
import java.util.*;

import org.apache.lucene.index.*;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.IOUtils;

/**
 * A collection of InputDocuments to be matched.
 *
 * A batch containing a single InputDocument uses a lucene MemoryIndex for indexing,
 * otherwise a RAMDirectory is used to hold the documents.
 *
 * To build a batch, either use one of the static factory methods, or a Builder object:
 * 
 *     DocumentBatch batch1 = DocumentBatch.of(doc1, doc2)
 *     DocumentBatch batch2 = new DocumentBatch.Builder()
 *                                  .setSimilarity(new MySimilarity())
 *                                  .add(doc1)
 *                                  .addAll(listOfDocs)
 *                                  .build()
 * 
*/ public abstract class DocumentBatch implements Closeable, Iterable { /** The {@link Similarity} to be used for scoring (if scoring is required) */ protected final Similarity similarity; /** A list of {@link InputDocument} objects to match */ protected final List documents = new ArrayList<>(); /** * Create a DocumentBatch containing a single InputDocument * @param doc the document to add * @return the batch containing the input document */ public static DocumentBatch of(InputDocument doc) { return new DocumentBatch.Builder().add(doc).build(); } /** * Create a DocumentBatch containing a set of InputDocuments * @param docs Collection of documents to add * @return the batch containing the input documents */ public static DocumentBatch of(Collection docs) { return new DocumentBatch.Builder().addAll(docs).build(); } /** * Create a DocumentBatch containing a set of InputDocuments * @param docs list of documents to add * @return the batch containing the input documents */ public static DocumentBatch of(InputDocument... docs) { return of(Arrays.asList(docs)); } /** * Builder class for DocumentBatch */ public static class Builder { private Similarity similarity = new BM25Similarity(); private List documents = new ArrayList<>(); /** Add an InputDocument * @param doc Single document to add * @return the current builder object */ public Builder add(InputDocument doc) { documents.add(doc); return this; } /** Add a collection of InputDocuments * @param docs Collection of documents to add * @return the current builder object */ public Builder addAll(Collection docs) { documents.addAll(docs); return this; } /** Set the {@link Similarity} to be used for scoring this batch * @param similarity the {@link Similarity} to be used for scoring this batch * @return the current builder object */ public Builder setSimilarity(Similarity similarity) { this.similarity = similarity; return this; } /** Create the DocumentBatch * @return the newly created DocumentBatch */ public DocumentBatch build() { if (documents.size() == 0) throw new IllegalStateException("Cannot build DocumentBatch with zero documents"); if (documents.size() == 1) return new SingletonDocumentBatch(documents, similarity); return new MultiDocumentBatch(documents, similarity); } } /** * Create a new DocumentBatch * @param documents the documents to match * @param similarity the {@link Similarity} to use for scoring */ protected DocumentBatch(Collection documents, Similarity similarity) { this.similarity = similarity; this.documents.addAll(documents); } /** * @return a {@link LeafReader} over the documents in this batch * @throws IOException on error */ public abstract LeafReader getIndexReader() throws IOException; /** * Convert the lucene docid for a document in the batch to the luwak docid * @param docId the lucene docid * @return the luwak docid */ public abstract String resolveDocId(int docId); /** * @return an {@link IndexSearcher} over the documents in this batch * @throws IOException on error */ public IndexSearcher getSearcher() throws IOException { IndexSearcher searcher = new IndexSearcher(getIndexReader()); searcher.setSimilarity(similarity); return searcher; } @Override public Iterator iterator() { return documents.iterator(); } /** * @return the number of documents in the batch */ public int getBatchSize() { return documents.size(); } // Implementation of DocumentBatch for collections of documents private static class MultiDocumentBatch extends DocumentBatch { private final Directory directory = new RAMDirectory(); private LeafReader reader = null; private String[] docIds = null; MultiDocumentBatch(List docs, Similarity similarity) { super(docs, similarity); assert docs.size() > 1; IndexWriterConfig iwc = new IndexWriterConfig(docs.get(0).getAnalyzers()).setSimilarity(similarity); try (IndexWriter writer = new IndexWriter(directory, iwc)) { this.reader = build(writer); } catch (IOException e) { throw new RuntimeException(e); // This is a RAMDirectory, so should never happen... } } @Override public LeafReader getIndexReader() throws IOException { return reader; } private LeafReader build(IndexWriter writer) throws IOException { for (InputDocument doc : documents) { writer.addDocument(doc.getDocument()); } writer.commit(); writer.forceMerge(1); LeafReader reader = DirectoryReader.open(directory).leaves().get(0).reader(); assert reader != null; docIds = new String[reader.maxDoc()]; for (int i = 0; i < docIds.length; i++) { docIds[i] = reader.document(i).get(InputDocument.ID_FIELD); // TODO can this be more efficient? } return reader; } @Override public String resolveDocId(int docId) { return docIds[docId]; } @Override public void close() throws IOException { IOUtils.close(reader, directory); } } // Specialized class for batches containing a single object - MemoryIndex benchmarks as // better performing than RAMDirectory for this case private static class SingletonDocumentBatch extends DocumentBatch { private final MemoryIndex memoryindex = new MemoryIndex(true, true); private final LeafReader reader; private SingletonDocumentBatch(Collection documents, Similarity similarity) { super(documents, similarity); assert documents.size() == 1; memoryindex.setSimilarity(similarity); for (InputDocument doc : documents) { for (IndexableField field : doc.getDocument()) { memoryindex.addField(field, doc.getAnalyzers()); } } memoryindex.freeze(); reader = (LeafReader) memoryindex.createSearcher().getIndexReader(); } @Override public LeafReader getIndexReader() throws IOException { return reader; } @Override public String resolveDocId(int docId) { assert docId == 0; return documents.get(0).getId(); } @Override public void close() throws IOException { reader.close(); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy