/*
 * Copyright © 2021-present Arcade Data Ltd ([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: 2021-present Arcade Data Ltd ([email protected])
 * SPDX-License-Identifier: Apache-2.0
 */
package com.arcadedb.index.lsm;

import com.arcadedb.database.DatabaseInternal;
import com.arcadedb.database.Identifiable;
import com.arcadedb.database.RID;
import com.arcadedb.engine.ComponentFactory;
import com.arcadedb.engine.ComponentFile;
import com.arcadedb.engine.PaginatedComponent;
import com.arcadedb.index.Index;
import com.arcadedb.index.IndexCursor;
import com.arcadedb.index.IndexCursorEntry;
import com.arcadedb.index.IndexException;
import com.arcadedb.index.IndexInternal;
import com.arcadedb.index.TempIndexCursor;
import com.arcadedb.index.TypeIndex;
import com.arcadedb.schema.IndexBuilder;
import com.arcadedb.schema.Schema;
import com.arcadedb.schema.Type;
import com.arcadedb.serializer.json.JSONObject;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.*;
import java.util.*;
import java.util.concurrent.atomic.*;

/**
 * Full-text index implementation based on the LSM-Tree index.
 * In order to support a full-text index, we leverage the Lucene ecosystem in terms of Analyzer, Tokenizers and stemmers, while keeping the current
 * efficient LSM-Tree implementation with its management of ACIDity, background compaction, WAL, replication, HA, etc.
 * <p>
 * The idea to index a text is:
 * <ul>
 * <li>parse the text with the configured analyzer. The analyzer uses a tokenizer that splits the text into words, then the stemmer extracts the stem of
 * each word. In the end, the stop words are removed. The output of this phase is an array of strings to be indexed;</li>
 * <li>put all the strings from the resulting array in the underlying LSM index with the RID as value (as with the default LSM-Tree index implementation).</li>
 * </ul>
 * For the search, the process is similar, with the computation of the score:
 * <ul>
 * <li>parse the text with the configured analyzer and extract the array of strings (see above);</li>
 * <li>search for all the strings in the array, storing the multiple results in a {@literal Map<String, List<RID>>} (as {@literal Map<keyword, rids>});</li>
 * <li>browse all the results in the maps, adding all of them to a final {@literal TreeMap<RID, AtomicInteger>} that represents the score, where the key is
 * the record id and the value is a counter that stores the score. At the beginning the score is 1. Every time a RID is already present in the score map,
 * the value is incremented. In this way, the records that match a higher number of keywords will have a higher score. The score ranges from 1 to
 * Integer.MAX_VALUE;</li>
 * <li>the query result will be the map ordered by score, so if the query has a limit, only the first X items are returned, ordered by score descending.</li>
 * </ul>
 */
public class LSMTreeFullTextIndex implements Index, IndexInternal {
  private final LSMTreeIndex underlyingIndex;
  private final Analyzer     analyzer;
  private       TypeIndex    typeIndex;

  public static class IndexFactoryHandler implements com.arcadedb.index.IndexFactoryHandler {
    @Override
    public IndexInternal create(final IndexBuilder builder) {
      if (builder.isUnique())
        throw new IllegalArgumentException("Full text index cannot be unique");
      if (builder.getKeyTypes().length != 1)
        throw new IllegalArgumentException("Full text index can only be defined on a single string property");
      if (builder.getKeyTypes()[0] != Type.STRING)
        throw new IllegalArgumentException(
            "Full text index cannot be defined on a '" + builder.getKeyTypes()[0] + "' property, only string");
      return new LSMTreeFullTextIndex(builder.getDatabase(), builder.getIndexName(), builder.getFilePath(), ComponentFile.MODE.READ_WRITE,
          builder.getPageSize(), builder.getNullStrategy());
    }
  }

  /**
   * Called at load time. The full-text index is just a wrapper of an LSM-Tree index.
   */
  public LSMTreeFullTextIndex(final LSMTreeIndex index) {
    analyzer = new StandardAnalyzer();
    underlyingIndex = index;
  }

  /**
   * Called at creation time.
   */
  public LSMTreeFullTextIndex(final DatabaseInternal database, final String name, final String filePath, final ComponentFile.MODE mode,
      final int pageSize, final LSMTreeIndexAbstract.NULL_STRATEGY nullStrategy) {
    analyzer = new StandardAnalyzer();
    underlyingIndex = new LSMTreeIndex(database, name, false, filePath, mode, new Type[] { Type.STRING }, pageSize, nullStrategy);
  }

  /**
   * Called at loading time.
   */
  public LSMTreeFullTextIndex(final DatabaseInternal database, final String name, final String filePath, final int fileId,
      final ComponentFile.MODE mode, final int pageSize, final int version) {
    try {
      underlyingIndex = new LSMTreeIndex(database, name, false, filePath, fileId, mode, pageSize, version);
    } catch (final IOException e) {
      throw new IndexException("Cannot create search engine (error=" + e + ")", e);
    }
    analyzer = new StandardAnalyzer();
  }

  @Override
  public IndexCursor get(final Object[] keys) {
    return get(keys, -1);
  }

  @Override
  public IndexCursor get(final Object[] keys, final int limit) {
    final List<String> keywords = analyzeText(analyzer, keys);

    // Score each record by the number of keywords it matches: 1 on the first hit, +1 for every further keyword.
    final HashMap<RID, AtomicInteger> scoreMap = new HashMap<>();

    for (final String k : keywords) {
      final IndexCursor rids = underlyingIndex.get(new String[] { k });
      while (rids.hasNext()) {
        final RID rid = rids.next().getIdentity();
        final AtomicInteger score = scoreMap.get(rid);
        if (score == null)
          scoreMap.put(rid, new AtomicInteger(1));
        else
          score.incrementAndGet();
      }
    }

    final int maxElements = limit > -1 ? limit : scoreMap.size();

    final ArrayList<IndexCursorEntry> list = new ArrayList<>(maxElements);
    for (final Map.Entry<RID, AtomicInteger> entry : scoreMap.entrySet())
      list.add(new IndexCursorEntry(keys, entry.getKey(), entry.getValue().get()));

    // Order by score descending, so the records matching more keywords come first (see class Javadoc).
    if (list.size() > 1)
      list.sort((o1, o2) -> {
        if (o1.score == o2.score)
          return 0;
        return o1.score < o2.score ? 1 : -1;
      });

    return new TempIndexCursor(list);
  }

  @Override
  public void put(final Object[] keys, final RID[] rids) {
    final List<String> keywords = analyzeText(analyzer, keys);
    for (final String k : keywords)
      underlyingIndex.put(new String[] { k }, rids);
  }

  @Override
  public void remove(final Object[] keys) {
    final List<String> keywords = analyzeText(analyzer, keys);
    for (final String k : keywords)
      underlyingIndex.remove(new String[] { k });
  }

  @Override
  public void remove(final Object[] keys, final Identifiable rid) {
    final List<String> keywords = analyzeText(analyzer, keys);
    for (final String k : keywords)
      underlyingIndex.remove(new String[] { k }, rid);
  }

  @Override
  public JSONObject toJSON() {
    final JSONObject json = new JSONObject();
    json.put("type", getType());
    json.put("bucket", underlyingIndex.mutable.getDatabase().getSchema().getBucketById(getAssociatedBucketId()).getName());
    json.put("properties", getPropertyNames());
    json.put("nullStrategy", getNullStrategy());
    return json;
  }

  @Override
  public IndexInternal getAssociatedIndex() {
    return null;
  }

  @Override
  public long countEntries() {
    return underlyingIndex.countEntries();
  }

  @Override
  public boolean compact() throws IOException, InterruptedException {
    return underlyingIndex.compact();
  }

  @Override
  public boolean isCompacting() {
    return underlyingIndex.isCompacting();
  }

  @Override
  public boolean scheduleCompaction() {
    return underlyingIndex.scheduleCompaction();
  }

  @Override
  public String getMostRecentFileName() {
    return underlyingIndex.getMostRecentFileName();
  }

  @Override
  public void setMetadata(final String name, final String[] propertyNames, final int associatedBucketId) {
    underlyingIndex.setMetadata(name, propertyNames, associatedBucketId);
  }

  @Override
  public boolean setStatus(final INDEX_STATUS[] expectedStatuses, final INDEX_STATUS newStatus) {
    return underlyingIndex.setStatus(expectedStatuses, newStatus);
  }

  @Override
  public String getTypeName() {
    return underlyingIndex.getTypeName();
  }

  @Override
  public List<String> getPropertyNames() {
    return underlyingIndex.getPropertyNames();
  }

  @Override
  public void close() {
    underlyingIndex.close();
  }

  @Override
  public void drop() {
    underlyingIndex.drop();
  }

  @Override
  public String getName() {
    return underlyingIndex.getName();
  }

  @Override
  public Map<String, Long> getStats() {
    return underlyingIndex.getStats();
  }

  @Override
  public LSMTreeIndexAbstract.NULL_STRATEGY getNullStrategy() {
    return LSMTreeIndexAbstract.NULL_STRATEGY.ERROR;
  }

  @Override
  public void setNullStrategy(final LSMTreeIndexAbstract.NULL_STRATEGY nullStrategy) {
    if (nullStrategy != LSMTreeIndexAbstract.NULL_STRATEGY.ERROR)
      throw new IllegalArgumentException("Unsupported null strategy '" + nullStrategy + "'");
  }

  @Override
  public int getFileId() {
    return underlyingIndex.getFileId();
  }

  @Override
  public boolean isUnique() {
    return false;
  }

  @Override
  public PaginatedComponent getComponent() {
    return underlyingIndex.getComponent();
  }

  @Override
  public Type[] getKeyTypes() {
    return underlyingIndex.getKeyTypes();
  }

  @Override
  public byte[] getBinaryKeyTypes() {
    return underlyingIndex.getBinaryKeyTypes();
  }

  @Override
  public int getAssociatedBucketId() {
    return underlyingIndex.getAssociatedBucketId();
  }

  @Override
  public boolean supportsOrderedIterations() {
    return false;
  }

  @Override
  public boolean isAutomatic() {
    return underlyingIndex.propertyNames != null;
  }

  @Override
  public int getPageSize() {
    return underlyingIndex.getPageSize();
  }

  @Override
  public List<Integer> getFileIds() {
    return underlyingIndex.getFileIds();
  }

  @Override
  public void setTypeIndex(final TypeIndex typeIndex) {
    this.typeIndex = typeIndex;
  }

  @Override
  public TypeIndex getTypeIndex() {
    return typeIndex;
  }

  @Override
  public long build(final int buildIndexBatchSize, final BuildIndexCallback callback) {
    return underlyingIndex.build(buildIndexBatchSize, callback);
  }

  @Override
  public Schema.INDEX_TYPE getType() {
    return Schema.INDEX_TYPE.FULL_TEXT;
  }

  public Analyzer getAnalyzer() {
    return analyzer;
  }

  @Override
  public boolean isValid() {
    return underlyingIndex.isValid();
  }

  /**
   * Tokenizes the given values with the configured analyzer, returning the list of keywords to index or search.
   */
  public List<String> analyzeText(final Analyzer analyzer, final Object[] text) {
    final List<String> tokens = new ArrayList<>();

    for (final Object t : text) {
      if (t == null)
        tokens.add(null);
      else {
        final TokenStream tokenizer = analyzer.tokenStream("contents", t.toString());
        try {
          tokenizer.reset();
          final CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
          try {
            while (tokenizer.incrementToken()) {
              final String token = termAttribute.toString();
              tokens.add(token);
            }
          } catch (final IOException e) {
            throw new IndexException("Error on analyzing text", e);
          }
        } catch (final IOException e) {
          throw new IndexException("Error on tokenizer", e);
        } finally {
          try {
            tokenizer.close();
          } catch (final IOException e) {
            // IGNORE IT
          }
        }
      }
    }
    return tokens;
  }
}
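To see what the analysis phase produces, here is a minimal standalone sketch of the same Lucene pipeline used by analyzeText() above (StandardAnalyzer, CharTermAttribute, and the "contents" field label from the source). The sample text and class name are illustrative only, and the exact token output depends on the Lucene version's analyzer configuration.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class AnalyzeTextDemo {
  public static void main(final String[] args) throws IOException {
    final List<String> tokens = new ArrayList<>();
    try (final Analyzer analyzer = new StandardAnalyzer();
        final TokenStream stream = analyzer.tokenStream("contents", "The Quick Brown Foxes!")) {
      final CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken())
        tokens.add(term.toString());
      stream.end();
    }
    // Tokens are lower-cased and punctuation is dropped, e.g. [the, quick, brown, foxes];
    // whether stop words like "the" are removed depends on the configured stop-word set.
    System.out.println(tokens);
  }
}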

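The scoring loop in get() can also be illustrated in isolation. A minimal sketch, assuming two hypothetical posting lists (keyword to record ids, with plain strings standing in for RIDs): a record earns one point per keyword it matches, so "#1:0" below ends with score 2 and "#1:1" with score 1.

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

public class ScoringDemo {
  public static void main(final String[] args) {
    // Hypothetical posting lists, as the underlying LSM index would return them.
    final Map<String, List<String>> postings = Map.of(
        "brown", List.of("#1:0", "#1:1"),
        "fox", List.of("#1:0"));

    // Same counting scheme as get(): 1 on the first match, +1 for every further keyword hit.
    final Map<String, AtomicInteger> scores = new HashMap<>();
    for (final List<String> rids : postings.values())
      for (final String rid : rids)
        scores.computeIfAbsent(rid, k -> new AtomicInteger(0)).incrementAndGet();

    System.out.println(scores); // {#1:0=2, #1:1=1} (map iteration order may vary)
  }
}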

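Finally, a hedged sketch of how such an index is typically created and queried from the embedded ArcadeDB API. The database path, type name, property name, and the "Article[text]" index-name convention are assumptions for illustration; the get() call and cursor iteration follow the Index and IndexCursor API shown in the source above.

import com.arcadedb.database.Database;
import com.arcadedb.database.DatabaseFactory;
import com.arcadedb.index.Index;
import com.arcadedb.index.IndexCursor;
import com.arcadedb.schema.Schema;
import com.arcadedb.schema.Type;

public class FullTextUsageDemo {
  public static void main(final String[] args) {
    try (final Database db = new DatabaseFactory("/tmp/fulltext-demo").create()) {
      db.transaction(() -> {
        // Hypothetical schema: one document type with a single string property.
        db.getSchema().createDocumentType("Article").createProperty("text", Type.STRING);
        db.getSchema().createTypeIndex(Schema.INDEX_TYPE.FULL_TEXT, false, "Article", "text");
        db.newDocument("Article").set("text", "The quick brown fox jumps").save();
        db.newDocument("Article").set("text", "A lazy brown dog sleeps").save();
      });

      // The search text is analyzed into keywords; records matching more keywords score higher.
      final Index index = db.getSchema().getIndexByName("Article[text]"); // name convention assumed
      final IndexCursor cursor = index.get(new Object[] { "brown fox" });
      while (cursor.hasNext())
        System.out.println(cursor.next().getIdentity());
    }
  }
}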

