// org.metaeffekt.artifact.resolver.generic.index.lucene.SimpleLuceneIndex Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of ae-artifact-resolver Show documentation
// The newest version!
package org.metaeffekt.artifact.resolver.generic.index.lucene;

import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;

// FIXME: split in read and write index
// FIXME: may be prone to changes; prefer a dedicated index implementation to control index changes.

/**
 * A simple index made with lucene.
 * 

 * Persists to the given path. Note that this means you manually need to delete or clear the index to fully rebuild it.
 * 

 * Due to its abstract interface, it may be useful for a variety of scenarios, akin to an "external memory hashmap".
 */
@Slf4j
public class SimpleLuceneIndex extends AbstractLuceneIndex {

    private static final String DEDUPLICATE = "DEDUPLICATE";
    private static final int DEFAULT_ANALYZER_MAX_WORD_LENGTH = 1024;
    private static final int WRITER_MAX_RAM_BUFFER_SIZE_MB = 128;

    private final IndexWriter writer;

    public SimpleLuceneIndex(@NonNull File indexDir, @NonNull Analyzer analyzer) throws IOException {
        super(indexDir, analyzer);

        final IndexWriterConfig conf = new IndexWriterConfig(analyzer);
        conf.setCommitOnClose(true);
        conf.setRAMBufferSizeMB(WRITER_MAX_RAM_BUFFER_SIZE_MB);
        conf.setUseCompoundFile(false);

        this.writer = new IndexWriter(getDirectory(), conf);
    }

    public SimpleLuceneIndex(@NonNull File indexDir) throws IOException {
        this(indexDir, new WhitespaceAnalyzer(DEFAULT_ANALYZER_MAX_WORD_LENGTH));
    }

    public void addEntry(@NonNull Map> keyValueMap) throws IOException {
        final Document document = new Document();

        if (keyValueMap.containsKey(DEDUPLICATE)) {
            log.error("Can't support key named [DEDUPLICATE]: key reserved by this class.");
            throw new IllegalArgumentException("Can't support key named [DEDUPLICATE]: key reserved by this class.");
        }

        for (Map.Entry> entry : keyValueMap.entrySet()) {
            for (String value : entry.getValue()) {
                document.add(new TextField(entry.getKey(), value, Field.Store.YES));
            }
        }

        this.writer.addDocument(document);
    }

    // TODO: reevaluate the feasibility and requorement of a deduplicating interface given lucene's lack of version
    //  interoparability: might be better to use another data store for long-term storage.
    // TODO: whip up interface with support for deduplication via writer.updateDocument and user-constructed "Term"s.
    //  trying to do deduplication inside this class isn't viable; it would limit its use case as accounting for all
    //  deduplication strategies is impossible. User-constructed and passed-in Term objects allow users to steer
    //  dedup behaviour via construction rules for said Term objects while also keeping this class truly "Simple".
    //  This would then allow for persistent indices that can be updated whenever.

    public void clear() throws IOException {
        this.writer.deleteAll();
        this.writer.forceMergeDeletes(false);
        this.writer.commit();
    }

    /**
     * Simple passthrough method to commit changes to storage.
     * @throws IOException throws on write failure
     */
    public void commit() throws IOException {
        this.writer.commit();
    }

    /**
     * Lookup value (being whatever substring shall be included) in key.
     * @param key field name to search
     * @param value value to search for
     * @param n maximum number of documents to output
     * @return found documents
     * @throws IOException on failure to open / search index
     */
    @NonNull
    public List lookupContains(@NonNull String key, @NonNull String value, int n) throws IOException {
        final Query query = new TermQuery(new Term(key, value));
        try (final IndexReader reader = DirectoryReader.open(this.writer)) {
            final IndexSearcher searcher = new IndexSearcher(reader);
            return runQuery(query, n, searcher);
        }
    }

    public long size() {
        try (final IndexReader reader = DirectoryReader.open(this.writer)) {
            return reader.numDocs();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Closes underlying index interfaces.
     */
    @Override
    public void close() throws Exception {
        writer.close();
        super.close();
    }
}