/*
 * Copyright © 2021-present Arcade Data Ltd ([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: 2021-present Arcade Data Ltd ([email protected])
 * SPDX-License-Identifier: Apache-2.0
 */
package com.arcadedb.index.lsm;

import com.arcadedb.database.DatabaseInternal;
import com.arcadedb.database.Identifiable;
import com.arcadedb.database.RID;
import com.arcadedb.engine.ComponentFactory;
import com.arcadedb.engine.ComponentFile;
import com.arcadedb.engine.PaginatedComponent;
import com.arcadedb.index.Index;
import com.arcadedb.index.IndexCursor;
import com.arcadedb.index.IndexCursorEntry;
import com.arcadedb.index.IndexException;
import com.arcadedb.index.IndexInternal;
import com.arcadedb.index.TempIndexCursor;
import com.arcadedb.index.TypeIndex;
import com.arcadedb.schema.IndexBuilder;
import com.arcadedb.schema.Schema;
import com.arcadedb.schema.Type;
import com.arcadedb.serializer.json.JSONObject;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.*;
import java.util.*;
import java.util.concurrent.atomic.*;

/**
 * Full-text index implementation based on the LSM-Tree index.
 * In order to support a full-text index, we leverage the Lucene ecosystem in terms of Analyzer, Tokenizers and stemmers, while keeping the current
 * efficient LSM-Tree implementation with its management of ACIDity, background compaction, WAL, replication, HA, etc.
 * <p>
 * The idea to index a text is:
 * <ul>
 * <li>parse the text with the configured analyzer. The analyzer uses a tokenizer that splits the text into words, then the stemmer extracts the stem of
 * each word. In the end, the stop words are removed. The output of this phase is an array of strings to be indexed;</li>
 * <li>put all the strings from the resulting array in the underlying LSM index with the RID as value (as with the default LSM-Tree index implementation).</li>
 * </ul>
 * For the search, the process is similar, with the computation of the score:
 * <ul>
 * <li>parse the text with the configured analyzer and extract the array of strings (see above);</li>
 * <li>search for all the strings in the array, storing the multiple results in a {@literal Map<String, List<RID>>} (as {@literal Map<keyword, rids>});</li>
 * <li>browse all the results in the maps, adding all of them to a final {@literal TreeMap<RID, AtomicInteger>} that represents the score, where the key is
 * the record id and the value is a counter that stores the score. At the beginning the score is 1. Every time a RID is already present in the score map,
 * the value is incremented. In this way, the records that match a higher number of keywords will have a higher score. The score ranges from 1 to
 * Integer.MAX_VALUE;</li>
 * <li>the query result will be the map ordered by score, so if the query has a limit, only the first X items are returned, ordered by score descending.</li>
 * </ul>
 */
public class LSMTreeFullTextIndex implements Index, IndexInternal {
  private final LSMTreeIndex underlyingIndex;
  private final Analyzer     analyzer;
  private       TypeIndex    typeIndex;

  public static class IndexFactoryHandler implements com.arcadedb.index.IndexFactoryHandler {
    @Override
    public IndexInternal create(final IndexBuilder builder) {
      if (builder.isUnique())
        throw new IllegalArgumentException("Full text index cannot be unique");
      if (builder.getKeyTypes().length != 1)
        throw new IllegalArgumentException("Full text index can only be defined on a single string property");
      if (builder.getKeyTypes()[0] != Type.STRING)
        throw new IllegalArgumentException(
            "Full text index cannot be defined on a '" + builder.getKeyTypes()[0] + "' property, only string");
      return new LSMTreeFullTextIndex(builder.getDatabase(), builder.getIndexName(), builder.getFilePath(), ComponentFile.MODE.READ_WRITE,
          builder.getPageSize(), builder.getNullStrategy());
    }
  }

  /**
   * Called at load time. The full-text index is just a wrapper of an LSM-Tree index.
   */
  public LSMTreeFullTextIndex(final LSMTreeIndex index) {
    analyzer = new StandardAnalyzer();
    underlyingIndex = index;
  }

  /**
   * Called at creation time.
   */
  public LSMTreeFullTextIndex(final DatabaseInternal database, final String name, final String filePath, final ComponentFile.MODE mode,
      final int pageSize, final LSMTreeIndexAbstract.NULL_STRATEGY nullStrategy) {
    analyzer = new StandardAnalyzer();
    underlyingIndex = new LSMTreeIndex(database, name, false, filePath, mode, new Type[] { Type.STRING }, pageSize, nullStrategy);
  }

  /**
   * Called at loading time.
   */
  public LSMTreeFullTextIndex(final DatabaseInternal database, final String name, final String filePath, final int fileId,
      final ComponentFile.MODE mode, final int pageSize, final int version) {
    try {
      underlyingIndex = new LSMTreeIndex(database, name, false, filePath, fileId, mode, pageSize, version);
    } catch (final IOException e) {
      throw new IndexException("Cannot create search engine (error=" + e + ")", e);
    }
    analyzer = new StandardAnalyzer();
  }

  @Override
  public IndexCursor get(final Object[] keys) {
    return get(keys, -1);
  }

  @Override
  public IndexCursor get(final Object[] keys, final int limit) {
    final List<String> keywords = analyzeText(analyzer, keys);

    // Score each record by the number of keywords it matches: 1 on the first hit, +1 for every further keyword.
    final HashMap<RID, AtomicInteger> scoreMap = new HashMap<>();

    for (final String k : keywords) {
      final IndexCursor rids = underlyingIndex.get(new String[] { k });
      while (rids.hasNext()) {
        final RID rid = rids.next().getIdentity();
        final AtomicInteger score = scoreMap.get(rid);
        if (score == null)
          scoreMap.put(rid, new AtomicInteger(1));
        else
          score.incrementAndGet();
      }
    }

    final int maxElements = limit > -1 ? limit : scoreMap.size();

    final ArrayList<IndexCursorEntry> list = new ArrayList<>(maxElements);
    for (final Map.Entry<RID, AtomicInteger> entry : scoreMap.entrySet())
      list.add(new IndexCursorEntry(keys, entry.getKey(), entry.getValue().get()));

    // Order by score descending, so the records matching more keywords come first (see class Javadoc).
    if (list.size() > 1)
      list.sort((o1, o2) -> {
        if (o1.score == o2.score)
          return 0;
        return o1.score < o2.score ? 1 : -1;
      });

    return new TempIndexCursor(list);
  }

  @Override
  public void put(final Object[] keys, final RID[] rids) {
    final List<String> keywords = analyzeText(analyzer, keys);
    for (final String k : keywords)
      underlyingIndex.put(new String[] { k }, rids);
  }

  @Override
  public void remove(final Object[] keys) {
    final List<String> keywords = analyzeText(analyzer, keys);
    for (final String k : keywords)
      underlyingIndex.remove(new String[] { k });
  }

  @Override
  public void remove(final Object[] keys, final Identifiable rid) {
    final List<String> keywords = analyzeText(analyzer, keys);
    for (final String k : keywords)
      underlyingIndex.remove(new String[] { k }, rid);
  }

  @Override
  public JSONObject toJSON() {
    final JSONObject json = new JSONObject();
    json.put("type", getType());
    json.put("bucket", underlyingIndex.mutable.getDatabase().getSchema().getBucketById(getAssociatedBucketId()).getName());
    json.put("properties", getPropertyNames());
    json.put("nullStrategy", getNullStrategy());
    return json;
  }

  @Override
  public IndexInternal getAssociatedIndex() {
    return null;
  }

  @Override
  public long countEntries() {
    return underlyingIndex.countEntries();
  }

  @Override
  public boolean compact() throws IOException, InterruptedException {
    return underlyingIndex.compact();
  }

  @Override
  public boolean isCompacting() {
    return underlyingIndex.isCompacting();
  }

  @Override
  public boolean scheduleCompaction() {
    return underlyingIndex.scheduleCompaction();
  }

  @Override
  public String getMostRecentFileName() {
    return underlyingIndex.getMostRecentFileName();
  }

  @Override
  public void setMetadata(final String name, final String[] propertyNames, final int associatedBucketId) {
    underlyingIndex.setMetadata(name, propertyNames, associatedBucketId);
  }

  @Override
  public boolean setStatus(final INDEX_STATUS[] expectedStatuses, final INDEX_STATUS newStatus) {
    return underlyingIndex.setStatus(expectedStatuses, newStatus);
  }

  @Override
  public String getTypeName() {
    return underlyingIndex.getTypeName();
  }

  @Override
  public List<String> getPropertyNames() {
    return underlyingIndex.getPropertyNames();
  }

  @Override
  public void close() {
    underlyingIndex.close();
  }

  @Override
  public void drop() {
    underlyingIndex.drop();
  }

  @Override
  public String getName() {
    return underlyingIndex.getName();
  }

  @Override
  public Map<String, Long> getStats() {
    return underlyingIndex.getStats();
  }

  @Override
  public LSMTreeIndexAbstract.NULL_STRATEGY getNullStrategy() {
    return LSMTreeIndexAbstract.NULL_STRATEGY.ERROR;
  }

  @Override
  public void setNullStrategy(final LSMTreeIndexAbstract.NULL_STRATEGY nullStrategy) {
    if (nullStrategy != LSMTreeIndexAbstract.NULL_STRATEGY.ERROR)
      throw new IllegalArgumentException("Unsupported null strategy '" + nullStrategy + "'");
  }

  @Override
  public int getFileId() {
    return underlyingIndex.getFileId();
  }

  @Override
  public boolean isUnique() {
    return false;
  }

  @Override
  public PaginatedComponent getComponent() {
    return underlyingIndex.getComponent();
  }

  @Override
  public Type[] getKeyTypes() {
    return underlyingIndex.getKeyTypes();
  }

  @Override
  public byte[] getBinaryKeyTypes() {
    return underlyingIndex.getBinaryKeyTypes();
  }

  @Override
  public int getAssociatedBucketId() {
    return underlyingIndex.getAssociatedBucketId();
  }

  @Override
  public boolean supportsOrderedIterations() {
    return false;
  }

  @Override
  public boolean isAutomatic() {
    return underlyingIndex.propertyNames != null;
  }

  @Override
  public int getPageSize() {
    return underlyingIndex.getPageSize();
  }

  @Override
  public List<Integer> getFileIds() {
    return underlyingIndex.getFileIds();
  }

  @Override
  public void setTypeIndex(final TypeIndex typeIndex) {
    this.typeIndex = typeIndex;
  }

  @Override
  public TypeIndex getTypeIndex() {
    return typeIndex;
  }

  @Override
  public long build(final int buildIndexBatchSize, final BuildIndexCallback callback) {
    return underlyingIndex.build(buildIndexBatchSize, callback);
  }

  @Override
  public Schema.INDEX_TYPE getType() {
    return Schema.INDEX_TYPE.FULL_TEXT;
  }

  public Analyzer getAnalyzer() {
    return analyzer;
  }

  @Override
  public boolean isValid() {
    return underlyingIndex.isValid();
  }

  /**
   * Tokenizes the given values with the configured analyzer, returning the list of keywords to index or search.
   */
  public List<String> analyzeText(final Analyzer analyzer, final Object[] text) {
    final List<String> tokens = new ArrayList<>();

    for (final Object t : text) {
      if (t == null)
        tokens.add(null);
      else {
        final TokenStream tokenizer = analyzer.tokenStream("contents", t.toString());
        try {
          tokenizer.reset();
          final CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
          try {
            while (tokenizer.incrementToken()) {
              final String token = termAttribute.toString();
              tokens.add(token);
            }
          } catch (final IOException e) {
            throw new IndexException("Error on analyzing text", e);
          }
        } catch (final IOException e) {
          throw new IndexException("Error on tokenizer", e);
        } finally {
          try {
            tokenizer.close();
          } catch (final IOException e) {
            // IGNORE IT
          }
        }
      }
    }
    return tokens;
  }
}
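To see what the analysis phase produces, here is a minimal standalone sketch of the same Lucene pipeline used by analyzeText() above (StandardAnalyzer, CharTermAttribute, and the "contents" field label from the source). The sample text and class name are illustrative only, and the exact token output depends on the Lucene version's analyzer configuration.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class AnalyzeTextDemo {
  public static void main(final String[] args) throws IOException {
    final List<String> tokens = new ArrayList<>();
    try (final Analyzer analyzer = new StandardAnalyzer();
        final TokenStream stream = analyzer.tokenStream("contents", "The Quick Brown Foxes!")) {
      final CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken())
        tokens.add(term.toString());
      stream.end();
    }
    // Tokens are lower-cased and punctuation is dropped, e.g. [the, quick, brown, foxes];
    // whether stop words like "the" are removed depends on the configured stop-word set.
    System.out.println(tokens);
  }
}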

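The scoring loop in get() can also be illustrated in isolation. A minimal sketch, assuming two hypothetical posting lists (keyword to record ids, with plain strings standing in for RIDs): a record earns one point per keyword it matches, so "#1:0" below ends with score 2 and "#1:1" with score 1.

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

public class ScoringDemo {
  public static void main(final String[] args) {
    // Hypothetical posting lists, as the underlying LSM index would return them.
    final Map<String, List<String>> postings = Map.of(
        "brown", List.of("#1:0", "#1:1"),
        "fox", List.of("#1:0"));

    // Same counting scheme as get(): 1 on the first match, +1 for every further keyword hit.
    final Map<String, AtomicInteger> scores = new HashMap<>();
    for (final List<String> rids : postings.values())
      for (final String rid : rids)
        scores.computeIfAbsent(rid, k -> new AtomicInteger(0)).incrementAndGet();

    System.out.println(scores); // {#1:0=2, #1:1=1} (map iteration order may vary)
  }
}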

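Finally, a hedged sketch of how such an index is typically created and queried from the embedded ArcadeDB API. The database path, type name, property name, and the "Article[text]" index-name convention are assumptions for illustration; the get() call and cursor iteration follow the Index and IndexCursor API shown in the source above.

import com.arcadedb.database.Database;
import com.arcadedb.database.DatabaseFactory;
import com.arcadedb.index.Index;
import com.arcadedb.index.IndexCursor;
import com.arcadedb.schema.Schema;
import com.arcadedb.schema.Type;

public class FullTextUsageDemo {
  public static void main(final String[] args) {
    try (final Database db = new DatabaseFactory("/tmp/fulltext-demo").create()) {
      db.transaction(() -> {
        // Hypothetical schema: one document type with a single string property.
        db.getSchema().createDocumentType("Article").createProperty("text", Type.STRING);
        db.getSchema().createTypeIndex(Schema.INDEX_TYPE.FULL_TEXT, false, "Article", "text");
        db.newDocument("Article").set("text", "The quick brown fox jumps").save();
        db.newDocument("Article").set("text", "A lazy brown dog sleeps").save();
      });

      // The search text is analyzed into keywords; records matching more keywords score higher.
      final Index index = db.getSchema().getIndexByName("Article[text]"); // name convention assumed
      final IndexCursor cursor = index.get(new Object[] { "brown fox" });
      while (cursor.hasNext())
        System.out.println(cursor.next().getIdentity());
    }
  }
}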

