All downloads are free. The search and download functionality uses the official Maven repository.

com.simiacryptus.text.CharTrie Maven / Gradle / Ivy

/*
 * Copyright (c) 2019 by Andrew Charneski.
 *
 * The author licenses this file to you under the
 * Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance
 * with the License.  You may obtain a copy
 * of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.simiacryptus.text;

import com.google.common.collect.Iterators;
import com.simiacryptus.util.data.SerialArrayList;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.*;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import static com.simiacryptus.text.NodewalkerCodec.*;

public class CharTrie {
  protected final SerialArrayList nodes;
  @Nullable
  protected int[] parentIndex = null;
  @Nullable
  protected int[] godparentIndex = null;

  public CharTrie(SerialArrayList nodes) {
    super();
    this.nodes = nodes;
  }

  public CharTrie() {
    this(new SerialArrayList<>(NodeType.INSTANCE, new NodeData(END_OF_STRING, (short) -1, -1, -1, 0)));
  }

  public CharTrie(@Nonnull CharTrie charTrie) {
    this(charTrie.nodes.copy());
    this.parentIndex = null == charTrie.parentIndex ? null
        : Arrays.copyOf(charTrie.parentIndex, charTrie.parentIndex.length);
    this.godparentIndex = null == charTrie.godparentIndex ? null
        : Arrays.copyOf(charTrie.godparentIndex, charTrie.godparentIndex.length);
  }

  @Nonnull
  public TextAnalysis getAnalyzer() {
    return new TextAnalysis(this.truncate().copy());
  }

  @Nonnull
  public NodewalkerCodec getCodec() {
    return new NodewalkerCodec(this);
  }

  @Nonnull
  public TextGenerator getGenerator() {
    return new TextGenerator(this.truncate().copy());
  }

  public long getIndexedSize() {
    return this.nodes.get(0).cursorCount;
  }

  public int getMemorySize() {
    return this.nodes.getMemorySize();
  }

  public int getNodeCount() {
    return nodes.length();
  }

  @Nonnull
  public static BiFunction reducer(
      @Nonnull BiFunction> fn) {
    return (left, right) -> left.reduce(right, fn);
  }

  @Nullable
  public TrieNode root() {
    return new TrieNode(this, 0, null);
  }

  @Nonnull
  public CharTrie reverse() {
    CharTrie result = new CharTrieIndex();
    TreeMap childrenMap = root().getChildrenMap();
    reverseSubtree(childrenMap, result.root());
    return result.recomputeCursorDetails();
  }

  @Nonnull
  public CharTrie rewrite(@Nonnull BiFunction, TreeMap> fn) {
    CharTrie result = new CharTrieIndex();
    rewriteSubtree(root(), result.root(), fn);
    return result.recomputeCursorDetails();
  }

  @Nonnull
  public CharTrie add(@Nonnull CharTrie z) {
    return reduceSimple(z, (left, right) -> (null == left ? 0 : left) + (null == right ? 0 : right));
  }

  @Nonnull
  public CharTrie product(@Nonnull CharTrie z) {
    return reduceSimple(z, (left, right) -> (null == left ? 0 : left) * (null == right ? 0 : right));
  }

  @Nonnull
  public CharTrie divide(@Nonnull CharTrie z, int factor) {
    return reduceSimple(z, (left, right) -> null == right ? 0 : (null == left ? 0 : left) * factor / right);
  }

  @Nonnull
  public CharTrie reduceSimple(@Nonnull CharTrie z, @Nonnull BiFunction fn) {
    return reduce(z, (left, right) -> {
      TreeMap leftChildren = null == left ? new TreeMap<>() : left.getChildrenMap();
      TreeMap rightChildren = null == right ? new TreeMap<>() : right.getChildrenMap();
      Map map = Stream.of(rightChildren.keySet(), leftChildren.keySet()).flatMap(x -> x.stream())
          .distinct().collect(Collectors.toMap(c -> c, (Character c) -> {
            assert null != c;
            TrieNode leftChild = leftChildren.get(c);
            Long l = null == leftChild ? null : leftChild.getCursorCount();
            TrieNode rightChild = rightChildren.get(c);
            Long r = null == rightChild ? null : rightChild.getCursorCount();
            return fn.apply(l, r);
          }));
      return new TreeMap<>(map);
    });
  }

  @Nonnull
  public CharTrie reduce(@Nonnull CharTrie right, @Nonnull BiFunction> fn) {
    CharTrie result = new CharTrieIndex();
    reduceSubtree(root(), right.root(), result.root(), fn);
    return result.recomputeCursorDetails();
  }

  public TrieNode traverse(@Nonnull String search) {
    return root().traverse(search);
  }

  @Nullable
  public TrieNode matchEnd(@Nonnull String search) {
    if (search.isEmpty())
      return root();
    int min = 0;
    int max = search.length();
    int i = Math.min(max, 12);
    int winner = -1;
    while (max > min) {
      String attempt = search.substring(search.length() - i);
      TrieNode cursor = traverse(attempt);
      if (cursor.getString().equals(attempt)) {
        min = Math.max(min, i + 1);
        winner = Math.max(winner, i);
      } else {
        max = Math.min(max, i - 1);
      }
      i = (3 * max + min) / 4;
    }
    if (winner < 0)
      return root();
    String matched = search.substring(search.length() - winner);
    return traverse(matched);
  }

  @Nullable
  public TrieNode matchPredictor(@Nonnull String search) {
    TrieNode cursor = matchEnd(search);
    assert cursor != null;
    if (cursor.getNumberOfChildren() > 0) {
      return cursor;
    }
    String string = cursor.getString();
    if (string.isEmpty())
      return null;
    return matchPredictor(string.substring(1));
  }

  @Nonnull
  public CharTrie copy() {
    return new CharTrie(this);
  }

  @Override
  public boolean equals(@Nullable Object o) {
    if (this == o)
      return true;
    if (o == null || getClass() != o.getClass())
      return false;

    CharTrie charTrie = (CharTrie) o;

    return nodes.equals(charTrie.nodes);
  }

  @Override
  public int hashCode() {
    return nodes.hashCode();
  }

  public Set tokens() {
    return root().getChildrenMap().keySet().stream().filter(c -> c != END_OF_STRING && c != FALLBACK && c != ESCAPE)
        .collect(Collectors.toSet());
  }

  public boolean contains(@Nonnull String text) {
    return traverse(text).getString().endsWith(text);
  }

  public > Stream max(@Nonnull Function fn, int maxResults) {
    return max(fn, maxResults, root());
  }

  synchronized void ensureParentIndexCapacity(int start, int length, int parentId) {
    int end = start + length;
    if (null == parentIndex) {
      parentIndex = new int[end];
      Arrays.fill(parentIndex, parentId);
    } else {
      int newLength = parentIndex.length;
      while (newLength < end)
        newLength *= 2;
      if (newLength > parentIndex.length) {
        parentIndex = Arrays.copyOfRange(parentIndex, 0, newLength);
        Arrays.fill(parentIndex, end, newLength, -1);
      }
      Arrays.fill(parentIndex, start, end, parentId);
    }
    if (null == godparentIndex) {
      godparentIndex = new int[end];
      Arrays.fill(godparentIndex, -1);
    } else {
      int newLength = godparentIndex.length;
      while (newLength < end)
        newLength *= 2;
      if (newLength > godparentIndex.length) {
        int prevLength = godparentIndex.length;
        godparentIndex = Arrays.copyOfRange(godparentIndex, 0, newLength);
        Arrays.fill(godparentIndex, prevLength, newLength, -1);
      }
    }
  }

  @Nonnull
  CharTrie recomputeCursorDetails() {
    godparentIndex = new int[getNodeCount()];
    parentIndex = new int[getNodeCount()];
    Arrays.fill(godparentIndex, 0, godparentIndex.length, -1);
    Arrays.fill(parentIndex, 0, parentIndex.length, -1);
    System.gc();
    recomputeCursorTotals(root());
    System.gc();
    recomputeCursorPositions(root(), 0);
    System.gc();
    return this;
  }

  @Nonnull
  protected CharTrie truncate() {
    return this;
  }

  private void reverseSubtree(@Nonnull TreeMap childrenMap, @Nonnull TrieNode destination) {
    String suffix = new StringBuilder(destination.getRawString()).reverse().toString();
    TreeMap children = new TreeMap<>();
    childrenMap.forEach((token, node) -> {
      TrieNode analog = node.traverse(suffix);
      boolean found = (token + suffix).equals(analog.getRawString());
      if (found) {
        children.put(token, analog.getCursorCount());
      }
    });
    destination.writeChildren(children);
    destination.getChildren().forEach(child -> reverseSubtree(childrenMap, child));
  }

  private void rewriteSubtree(@Nonnull TrieNode sourceNode, @Nonnull TrieNode destNode,
                              @Nonnull BiFunction, TreeMap> fn) {
    CharTrie result = destNode.getTrie();
    TreeMap sourceChildren = sourceNode.getChildrenMap();
    TreeMap newCounts = fn.apply(sourceNode, (Map) sourceChildren);
    destNode.writeChildren(newCounts);
    TreeMap newChildren = destNode.getChildrenMap();
    newCounts.keySet().forEach(key -> {
      if (sourceChildren.containsKey(key)) {
        rewriteSubtree(sourceChildren.get(key), newChildren.get(key), fn);
      }
    });
  }

  @javax.annotation.Nullable
  private NodeData recomputeCursorTotals(@Nonnull TrieNode node) {
    assert parentIndex != null;
    parentIndex[node.index] = null == node.getParent() ? -1 : node.getParent().index;
    List newChildren = node.getChildren().map(child -> recomputeCursorTotals(child))
        .collect(Collectors.toList());
    if (newChildren.isEmpty())
      return node.getData();
    long cursorCount = newChildren.stream().mapToLong(n -> n.cursorCount).sum();
    assert 0 < cursorCount;
    return node.update(d -> d.setCursorCount(cursorCount));
  }

  private void recomputeCursorPositions(@Nonnull TrieNode node, final int position) {
    node.update(n -> n.setFirstCursorIndex(position));
    int childPosition = position;
    Stream stream = node.getChildren().map(x -> x);
    for (TrieNode child : stream.collect(Collectors.toList())) {
      recomputeCursorPositions(child, childPosition);
      childPosition += child.getCursorCount();
    }
  }

  private void reduceSubtree(@Nullable TrieNode sourceNodeA, @Nullable TrieNode sourceNodeB, @Nonnull TrieNode destNode,
                             @Nonnull BiFunction> fn) {
    destNode.writeChildren(fn.apply(sourceNodeA, sourceNodeB));
    TreeMap sourceChildrenA = null == sourceNodeA ? null : sourceNodeA.getChildrenMap();
    TreeMap sourceChildrenB = null == sourceNodeB ? null : sourceNodeB.getChildrenMap();
    destNode.getChildrenMap().forEach((key, newChild) -> {
      boolean containsA = null != sourceChildrenA && sourceChildrenA.containsKey(key);
      boolean containsB = null != sourceChildrenB && sourceChildrenB.containsKey(key);
      if (containsA && containsB) {
        reduceSubtree(sourceChildrenA.get(key), sourceChildrenB.get(key), newChild, fn);
      } else if (containsA) {
        reduceSubtree(sourceChildrenA.get(key), null, newChild, fn);
      } else if (containsB) {
        reduceSubtree(null, sourceChildrenB.get(key), newChild, fn);
      }
    });
  }

  private > Stream max(@Nonnull Function fn, int maxResults, @Nonnull TrieNode node) {
    return StreamSupport.stream(Spliterators.spliteratorUnknownSize(
        Iterators
            .mergeSorted(Stream.concat(Stream.of(Stream.of(node)), node.getChildren().map(x -> max(fn, maxResults, x)))
                .map(x -> x.iterator()).collect(Collectors.toList()), Comparator.comparing(fn).reversed()),
        Spliterator.ORDERED), false).limit(maxResults).collect(Collectors.toList()).stream();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy