/*
* Copyright © 2021-present Arcade Data Ltd ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-FileCopyrightText: 2021-present Arcade Data Ltd ([email protected])
* SPDX-License-Identifier: Apache-2.0
*/
package com.arcadedb.index.lsm;
import com.arcadedb.database.DatabaseInternal;
import com.arcadedb.database.Identifiable;
import com.arcadedb.database.RID;
import com.arcadedb.engine.ComponentFactory;
import com.arcadedb.engine.ComponentFile;
import com.arcadedb.engine.PaginatedComponent;
import com.arcadedb.index.Index;
import com.arcadedb.index.IndexCursor;
import com.arcadedb.index.IndexCursorEntry;
import com.arcadedb.index.IndexException;
import com.arcadedb.index.IndexInternal;
import com.arcadedb.index.TempIndexCursor;
import com.arcadedb.index.TypeIndex;
import com.arcadedb.schema.IndexBuilder;
import com.arcadedb.schema.Schema;
import com.arcadedb.schema.Type;
import com.arcadedb.serializer.json.JSONObject;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.*;
import java.util.*;
import java.util.concurrent.atomic.*;
/**
 * Full-text index implementation based on the LSM-Tree index.
 * In order to support a full-text index, we leverage the Lucene ecosystem in terms of Analyzer, Tokenizers and stemmers, while keeping the current efficient
 * LSM-Tree implementation with its management of ACIDity, background compaction, WAL, replication, HA, etc.
 * <br>
 * The idea to index a text is:
 * <ul>
 * <li>parse the text with the configured analyzer. The analyzer uses a tokenizer that splits the text into words, then the stemmer extracts the stem of each
 * word and, in the end, the stop words are removed. The output of this phase is an array of strings to be indexed</li>
 * <li>put all the strings from the resulting array in the underlying LSM index with the RID as value (as with the default LSM-Tree index implementation)</li>
 * </ul>
 * For the search, the process is similar, with the computation of the score:
 * <ul>
 * <li>parse the text with the configured analyzer and extract the array of strings (see above)</li>
 * <li>search for all the strings in the array, storing the multiple results in a {@literal Map<String, Set<RID>>} (as {@literal Map<keyword, Set<RID>>})</li>
 * <li>browse all the results in the maps, adding all of them to a final score map ({@literal Map<RID, AtomicInteger>}) where the key is the record id and the
 * value is a counter holding the score. The score starts at 1 and, every time a RID is already present in the score map, the counter is incremented. In this
 * way, the records that match a higher number of keywords get a higher score (from 1 up to Integer.MAX_VALUE)</li>
 * <li>the query result is the set of entries ordered by descending score, so if the query has a limit, only the first X items are returned</li>
 * </ul>
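 * <p>
 * Example usage (a sketch; the type "Doc" and the property "text" are illustrative, not part of this class):
 * <pre>{@code
 * TypeIndex index = database.getSchema()
 *     .createTypeIndex(Schema.INDEX_TYPE.FULL_TEXT, false, "Doc", "text");
 * // Results are scored by how many analyzed keywords each record matches.
 * IndexCursor cursor = index.get(new Object[] { "quick brown fox" });
 * while (cursor.hasNext())
 *   System.out.println(cursor.next().getIdentity());
 * }</pre>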
*/
public class LSMTreeFullTextIndex implements Index, IndexInternal {
private final LSMTreeIndex underlyingIndex;
private final Analyzer analyzer;
private TypeIndex typeIndex;
public static class IndexFactoryHandler implements com.arcadedb.index.IndexFactoryHandler {
@Override
public IndexInternal create(final IndexBuilder builder) {
if (builder.isUnique())
throw new IllegalArgumentException("Full text index cannot be unique");
      if (builder.getKeyTypes().length != 1)
        throw new IllegalArgumentException("Full text index can only be defined on a single string property");
      if (builder.getKeyTypes()[0] != Type.STRING)
        throw new IllegalArgumentException(
            "Full text index can only be defined on a string property, not '" + builder.getKeyTypes()[0] + "'");
return new LSMTreeFullTextIndex(builder.getDatabase(), builder.getIndexName(), builder.getFilePath(),
ComponentFile.MODE.READ_WRITE, builder.getPageSize(), builder.getNullStrategy());
}
}
/**
 * Called at load time. The Full Text index is just a wrapper around an LSM-Tree index.
*/
public LSMTreeFullTextIndex(final LSMTreeIndex index) {
analyzer = new StandardAnalyzer();
underlyingIndex = index;
}
/**
 * Called at creation time.
*/
public LSMTreeFullTextIndex(final DatabaseInternal database, final String name, final String filePath,
final ComponentFile.MODE mode, final int pageSize, final LSMTreeIndexAbstract.NULL_STRATEGY nullStrategy) {
analyzer = new StandardAnalyzer();
underlyingIndex = new LSMTreeIndex(database, name, false, filePath, mode, new Type[] { Type.STRING }, pageSize, nullStrategy);
}
/**
 * Called at load time, when an existing index file is opened.
*/
public LSMTreeFullTextIndex(final DatabaseInternal database, final String name, final String filePath, final int fileId,
final ComponentFile.MODE mode, final int pageSize, final int version) {
try {
underlyingIndex = new LSMTreeIndex(database, name, false, filePath, fileId, mode, pageSize, version);
} catch (final IOException e) {
throw new IndexException("Cannot create search engine (error=" + e + ")", e);
}
analyzer = new StandardAnalyzer();
}
@Override
public IndexCursor get(final Object[] keys) {
return get(keys, -1);
}
@Override
  public IndexCursor get(final Object[] keys, final int limit) {
    // Analyze the query text with the same analyzer used at indexing time.
    final List<String> keywords = analyzeText(analyzer, keys);
    // Score each matching record by how many of the analyzed keywords it matches.
    final HashMap<RID, AtomicInteger> scoreMap = new HashMap<>();
    for (final String k : keywords) {
      final IndexCursor rids = underlyingIndex.get(new String[] { k });
      while (rids.hasNext()) {
        final RID rid = rids.next().getIdentity();
        final AtomicInteger score = scoreMap.get(rid);
        if (score == null)
          scoreMap.put(rid, new AtomicInteger(1));
        else
          score.incrementAndGet();
      }
    }
    final int maxElements = limit > -1 ? limit : scoreMap.size();
    final ArrayList<IndexCursorEntry> list = new ArrayList<>(maxElements);
    for (final Map.Entry<RID, AtomicInteger> entry : scoreMap.entrySet())
      list.add(new IndexCursorEntry(keys, entry.getKey(), entry.getValue().get()));
    // Sort by descending score so the records matching the most keywords come first (see class javadoc).
    if (list.size() > 1)
      list.sort((o1, o2) -> {
        if (o1.score == o2.score)
          return 0;
        return o1.score > o2.score ? -1 : 1;
      });
    // Honor the limit, if any, returning only the top-scored entries.
    if (limit > -1 && list.size() > limit)
      list.subList(limit, list.size()).clear();
    return new TempIndexCursor(list);
  }
@Override
public void put(final Object[] keys, final RID[] rids) {
    final List<String> keywords = analyzeText(analyzer, keys);
for (final String k : keywords)
underlyingIndex.put(new String[] { k }, rids);
}
@Override
public void remove(final Object[] keys) {
    final List<String> keywords = analyzeText(analyzer, keys);
for (final String k : keywords)
underlyingIndex.remove(new String[] { k });
}
@Override
public void remove(final Object[] keys, final Identifiable rid) {
    final List<String> keywords = analyzeText(analyzer, keys);
for (final String k : keywords)
underlyingIndex.remove(new String[] { k }, rid);
}
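  // Illustrative serialized form (values are a sketch; actual bucket and property names depend on the schema):
  //   {"type":"FULL_TEXT","bucket":"Doc_0","properties":["text"],"nullStrategy":"ERROR"}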
@Override
public JSONObject toJSON() {
final JSONObject json = new JSONObject();
json.put("type", getType());
json.put("bucket", underlyingIndex.mutable.getDatabase().getSchema().getBucketById(getAssociatedBucketId()).getName());
json.put("properties", getPropertyNames());
json.put("nullStrategy", getNullStrategy());
return json;
}
@Override
public IndexInternal getAssociatedIndex() {
return null;
}
@Override
public long countEntries() {
return underlyingIndex.countEntries();
}
@Override
public boolean compact() throws IOException, InterruptedException {
return underlyingIndex.compact();
}
@Override
public boolean isCompacting() {
return underlyingIndex.isCompacting();
}
@Override
public boolean scheduleCompaction() {
return underlyingIndex.scheduleCompaction();
}
@Override
public String getMostRecentFileName() {
return underlyingIndex.getMostRecentFileName();
}
@Override
public void setMetadata(final String name, final String[] propertyNames, final int associatedBucketId) {
underlyingIndex.setMetadata(name, propertyNames, associatedBucketId);
}
@Override
public boolean setStatus(final INDEX_STATUS[] expectedStatuses, final INDEX_STATUS newStatus) {
return underlyingIndex.setStatus(expectedStatuses, newStatus);
}
@Override
public String getTypeName() {
return underlyingIndex.getTypeName();
}
@Override
  public List<String> getPropertyNames() {
return underlyingIndex.getPropertyNames();
}
@Override
public void close() {
underlyingIndex.close();
}
@Override
public void drop() {
underlyingIndex.drop();
}
@Override
public String getName() {
return underlyingIndex.getName();
}
@Override
  public Map<String, Long> getStats() {
return underlyingIndex.getStats();
}
@Override
public LSMTreeIndexAbstract.NULL_STRATEGY getNullStrategy() {
return LSMTreeIndexAbstract.NULL_STRATEGY.ERROR;
}
@Override
public void setNullStrategy(final LSMTreeIndexAbstract.NULL_STRATEGY nullStrategy) {
if (nullStrategy != LSMTreeIndexAbstract.NULL_STRATEGY.ERROR)
throw new IllegalArgumentException("Unsupported null strategy '" + nullStrategy + "'");
}
@Override
public int getFileId() {
return underlyingIndex.getFileId();
}
@Override
public boolean isUnique() {
return false;
}
@Override
public PaginatedComponent getComponent() {
return underlyingIndex.getComponent();
}
@Override
public Type[] getKeyTypes() {
return underlyingIndex.getKeyTypes();
}
@Override
public byte[] getBinaryKeyTypes() {
return underlyingIndex.getBinaryKeyTypes();
}
@Override
public int getAssociatedBucketId() {
return underlyingIndex.getAssociatedBucketId();
}
@Override
public boolean supportsOrderedIterations() {
return false;
}
@Override
public boolean isAutomatic() {
return underlyingIndex.propertyNames != null;
}
@Override
public int getPageSize() {
return underlyingIndex.getPageSize();
}
@Override
  public List<Integer> getFileIds() {
return underlyingIndex.getFileIds();
}
@Override
public void setTypeIndex(final TypeIndex typeIndex) {
this.typeIndex = typeIndex;
}
@Override
public TypeIndex getTypeIndex() {
return typeIndex;
}
@Override
public long build(final int buildIndexBatchSize, final BuildIndexCallback callback) {
return underlyingIndex.build(buildIndexBatchSize, callback);
}
@Override
public Schema.INDEX_TYPE getType() {
return Schema.INDEX_TYPE.FULL_TEXT;
}
public Analyzer getAnalyzer() {
return analyzer;
}
@Override
public boolean isValid() {
return underlyingIndex.isValid();
}
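  /**
   * Tokenizes the given values through the provided Lucene analyzer. For example, with the default
   * {@link StandardAnalyzer} the text "The Quick Brown Fox" is lower-cased and split into word tokens
   * such as "quick", "brown" and "fox"; whether stop words like "the" are also dropped depends on the
   * analyzer's configured stop-word set.
   */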
  public List<String> analyzeText(final Analyzer analyzer, final Object[] text) {
    final List<String> tokens = new ArrayList<>();
    for (final Object t : text) {
      if (t == null)
        tokens.add(null);
      else {
        // Run the value through the analyzer's pipeline (tokenizer, filters, stemmer).
        try (TokenStream tokenizer = analyzer.tokenStream("contents", t.toString())) {
          tokenizer.reset();
          final CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
          while (tokenizer.incrementToken())
            tokens.add(termAttribute.toString());
        } catch (final IOException e) {
          throw new IndexException("Error on analyzing text", e);
        }
      }
    }
    return tokens;
  }
}