All downloads are free. Search and download functionality uses the official Maven repository.

com.simiacryptus.text.CharTrieIndex Maven / Gradle / Ivy

/*
 * Copyright (c) 2018 by Andrew Charneski.
 *
 * The author licenses this file to you under the
 * Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance
 * with the License.  You may obtain a copy
 * of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.simiacryptus.text;

import com.simiacryptus.util.data.SerialArrayList;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * A {@link CharTrie} that also keeps the documents it was fed and a list of cursors
 * ({@code CursorData(documentIndex, position)} records). Documents are registered first
 * (see {@link #addDocument(CharSequence)} and {@link #addDictionary(CharSequence)}); the
 * tree is then built by {@link #index(int, int)}, which repeatedly splits nodes level by level.
 */
public class CharTrieIndex extends CharTrie {

  /**
   * Cursor records created when documents are registered; each one pairs a document index
   * with a character position inside that document.
   */
  protected final SerialArrayList<CursorData> cursors;
  /**
   * The registered documents, in insertion order. A document's position in this list is the
   * index returned by {@link #addDocument(CharSequence)} / {@link #addDictionary(CharSequence)}.
   */
  protected final ArrayList<CharSequence> documents;

  /**
   * Wires an index around already-constructed storage. The arguments are stored as-is (no copy).
   *
   * @param nodes     the node storage handed to the {@link CharTrie} superclass
   * @param cursors   the cursor storage
   * @param documents the document list
   */
  private CharTrieIndex(SerialArrayList<NodeData> nodes, SerialArrayList<CursorData> cursors,
                        ArrayList<CharSequence> documents) {
    super(nodes);
    this.cursors = cursors;
    this.documents = documents;
  }

  /**
   * Copy constructor: copies the node and cursor storage via their {@code copy()} methods and
   * creates a new list holding the same document references.
   *
   * @param copyFrom the index to copy
   */
  public CharTrieIndex(CharTrieIndex copyFrom) {
    this(copyFrom.nodes.copy(), copyFrom.cursors.copy(), new ArrayList<>(copyFrom.documents));
  }

  /**
   * Creates an empty index whose node storage contains a single initial node and which has no
   * cursors and no documents.
   */
  public CharTrieIndex() {
    this(new SerialArrayList<>(NodeType.INSTANCE, new NodeData(NodewalkerCodec.END_OF_STRING, (short) -1, -1, -1, 0)), new SerialArrayList<>(CursorType.INSTANCE), new ArrayList<>());
  }

  /**
   * Builds a trie from the given documents, registering each one with
   * {@link #addDictionary(CharSequence)} (a single cursor at position 0 per document).
   *
   * @param documents the documents
   * @param maxLevels maximum depth of the tree to build
   * @param minWeight minimum cursor count threshold passed to {@link #index(int, int)}
   * @return the merged char trie
   */
  public static CharTrie indexWords(Collection<? extends CharSequence> documents, int maxLevels, int minWeight) {
    return create(documents, maxLevels, minWeight, true);
  }

  /**
   * Builds a trie from the given documents, registering each one with
   * {@link #addDocument(CharSequence)} (one cursor per position, {@code 0..length} inclusive).
   *
   * @param documents the documents
   * @param maxLevels maximum depth of the tree to build
   * @param minWeight minimum cursor count threshold passed to {@link #index(int, int)}
   * @return the merged char trie
   */
  public static CharTrie indexFulltext(Collection<? extends CharSequence> documents, int maxLevels, int minWeight) {
    return create(documents, maxLevels, minWeight, false);
  }

  /**
   * Partitions the documents into blocks, indexes every block in its own {@link CharTrieIndex}
   * on a parallel stream, and merges the per-block tries with {@code CharTrie.add}.
   *
   * @param documents the documents
   * @param maxLevels maximum depth of the tree to build
   * @param minWeight minimum cursor count threshold passed to {@link #index(int, int)}
   * @param words     {@code true} to register documents with {@link #addDictionary(CharSequence)},
   *                  {@code false} to use {@link #addDocument(CharSequence)}
   * @return the merged char trie
   */
  private static CharTrie create(Collection<? extends CharSequence> documents, int maxLevels, int minWeight, boolean words) {
    final int blockSize = 1024 * 1024;
    List<List<CharSequence>> blocks = new ArrayList<>();
    List<CharSequence> current = new ArrayList<>();
    // Running character count of the current block; avoids re-summing the block per document.
    long currentChars = 0;
    for (CharSequence s : documents) {
      current.add(s);
      currentChars += s.length();
      // A block is closed once it has grown past the size limit.
      if (currentChars > blockSize) {
        blocks.add(current);
        current = new ArrayList<>();
        currentChars = 0;
      }
    }
    // The trailing block is always added, so there is at least one block to reduce over.
    blocks.add(current);
    return blocks.parallelStream().map(list -> {
      CharTrieIndex trie = new CharTrieIndex();
      for (CharSequence s : list) {
        if (words) {
          trie.addDictionary(s);
        } else {
          trie.addDocument(s);
        }
      }
      trie.index(maxLevels, minWeight);
      return (CharTrie) trie;
    }).reduce((l, r) -> l.add(r)).get();
  }

  /**
   * @return the sum of the memory sizes reported by the cursor storage and the node storage
   */
  @Override
  public int getMemorySize() {
    return cursors.getMemorySize() + nodes.getMemorySize();
  }

  /**
   * @return the total length of all registered documents, or the superclass value when no
   * documents are registered
   */
  @Override
  public long getIndexedSize() {
    // Summed as long: the combined length of many documents can exceed Integer.MAX_VALUE.
    return documents.isEmpty() ? super.getIndexedSize() : documents.stream().mapToLong(CharSequence::length).sum();
  }

  /**
   * Creates a plain {@link CharTrie} from this index via the {@code CharTrie} copy constructor.
   * The intent is to drop the cursor data and keep only the tree of tokens and counts.
   *
   * @return a new char trie; this index itself is not returned
   */
  public CharTrie truncate() {
    return new CharTrie(this);
  }

  /**
   * Creates the index tree using the accumulated documents, with no depth limit.
   *
   * @return this char trie index
   */
  public CharTrieIndex index() {
    return index(Integer.MAX_VALUE);
  }

  /**
   * Creates the index tree using the accumulated documents.
   *
   * @param maxLevels maximum depth of the tree to build
   * @return this char trie index
   */
  public CharTrieIndex index(int maxLevels) {
    return index(maxLevels, 0);
  }

  /**
   * Creates the index tree using the accumulated documents. The tree is grown one level per
   * pass; passes continue until a pass splits no node.
   *
   * @param maxLevels maximum depth of the tree to build; nodes at this depth or deeper are not split
   * @param minWeight exclusive lower bound on the cursor count of a node's godparent for the node
   *                  to be split (nodes without a godparent are not subject to this bound)
   * @return this char trie index
   */
  public CharTrieIndex index(int maxLevels, int minWeight) {
    // Counts the nodes split during the current pass; atomic so the lambda below can update it.
    AtomicInteger numberSplit = new AtomicInteger(0);
    int depth = -1;
    do {
      numberSplit.set(0);
      if (0 == ++depth) {
        // First pass: the root is always split.
        numberSplit.incrementAndGet();
        root().split();
      } else {
        root().streamDecendents(depth).forEach(node -> {
          TrieNode godparent = node.godparent();
          if (node.getDepth() < maxLevels) {
            if (null == godparent || godparent.getCursorCount() > minWeight) {
              // End-of-string nodes are not split (except at depth 0).
              if (node.getChar() != NodewalkerCodec.END_OF_STRING || node.getDepth() == 0) {
                ((IndexNode) node).split();
                numberSplit.incrementAndGet();
              }
            }
          }
        });
      }
    } while (numberSplit.get() > 0);
    return this;
  }

  /**
   * Adds a document to be indexed as a single entry: only one cursor, at position 0, is created
   * for it. This can only be performed before splitting.
   *
   * @param document the document
   * @return the index assigned to the document
   * @throws IllegalStateException if the root node already reports a non-negative child count
   */
  public int addDictionary(CharSequence document) {
    return appendDocumentCursors(document, 1);
  }

  /**
   * Adds a document to be indexed: one cursor is created for every position from 0 to
   * {@code document.length()} inclusive. This can only be performed before splitting.
   *
   * @param document the document
   * @return the index assigned to the document
   * @throws IllegalStateException if the root node already reports a non-negative child count
   */
  public int addDocument(CharSequence document) {
    return appendDocumentCursors(document, document.length() + 1);
  }

  /**
   * Registers a document, appends cursors for positions {@code 0..cursorCount-1} of it, and
   * refreshes the cursor count stored in node 0.
   */
  private int appendDocumentCursors(CharSequence document, int cursorCount) {
    if (root().getNumberOfChildren() >= 0) {
      throw new IllegalStateException("Tree sorting has begun");
    }
    final int index;
    // Index allocation and list append happen together under the instance lock.
    synchronized (this) {
      index = documents.size();
      documents.add(document);
    }
    cursors.addAll(
        IntStream.range(0, cursorCount).mapToObj(i -> new CursorData(index, i)).collect(Collectors.toList()));
    nodes.update(0, node -> node.setCursorCount(cursors.length()));
    return index;
  }

  /**
   * Adds every character of the given sequence as its own one-character document
   * (via {@link #addDocument(CharSequence)}).
   *
   * @param document the characters to add
   * @return this char trie
   */
  public CharTrie addAlphabet(CharSequence document) {
    document.chars().mapToObj(i -> new String(Character.toChars(i))).forEach(s -> addDocument(s));
    return this;
  }

  /**
   * Delegates to the superclass implementation and narrows the return type.
   */
  @Override
  CharTrieIndex recomputeCursorDetails() {
    return (CharTrieIndex) super.recomputeCursorDetails();
  }

  /**
   * @return a copy of this index created with the copy constructor
   */
  public CharTrieIndex copy() {
    return new CharTrieIndex(this);
  }

  /**
   * @return a new {@link IndexNode} view of the root node (node 0) of this index
   */
  @Override
  public IndexNode root() {
    return new IndexNode(this, (short) 0, 0, null);
  }

  /**
   * Traverses the tree from the root following the given string.
   *
   * @param search the string to follow
   * @return the node returned by the root's {@code traverse}
   */
  @Override
  public IndexNode traverse(String search) {
    return root().traverse(search);
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy