All downloads are free. Search and download functionality uses the official Maven repository.

morfologik.fsa.builders.FSABuilder Maven / Gradle / Ivy

There is a newer version: 2.1.9
Show newest version
package morfologik.fsa.builders;

import java.util.*;

import morfologik.fsa.FSA;
import static morfologik.fsa.builders.ConstantArcSizeFSA.*;

/**
 * Fast, memory-conservative finite state automaton builder, returning an
 * in-memory {@link FSA} that is a tradeoff between construction speed and
 * memory consumption. Use serializers to compress the returned automaton into
 * more compact form.
 *
 * <p>Input sequences must be added in non-decreasing unsigned-byte order (see
 * {@link #LEXICAL_ORDERING}). States that can no longer change are "frozen":
 * looked up in an internal registry of already-serialized states and either
 * reused or appended to the serialized buffer.
 *
 * <p>A builder instance is single-use: after {@link #complete()} the internal
 * buffers are released and no more sequences can be added.
 *
 * @see FSASerializer
 */
public final class FSABuilder {
  /**
   * Debug and information constants; keys of the map returned by
   * {@link FSABuilder#getInfo()}.
   *
   * @see FSABuilder#getInfo()
   */
  public enum InfoEntry {
    SERIALIZATION_BUFFER_SIZE("Serialization buffer size"),
    SERIALIZATION_BUFFER_REALLOCATIONS("Serialization buffer reallocs"),
    CONSTANT_ARC_AUTOMATON_SIZE("Constant arc FSA size"),
    MAX_ACTIVE_PATH_LENGTH("Max active path"),
    STATE_REGISTRY_TABLE_SLOTS("Registry hash slots"),
    STATE_REGISTRY_SIZE("Registry hash entries"),
    ESTIMATED_MEMORY_CONSUMPTION_MB("Estimated mem consumption (MB)");

    /** Human-readable label returned from {@link #toString()}. */
    private final String stringified;

    InfoEntry(String stringified) {
      this.stringified = stringified;
    }

    @Override
    public String toString() {
      return stringified;
    }
  }

  /** A megabyte. */
  private static final int MB = 1024 * 1024;

  /**
   * Default number of bytes by which the serialized FSA buffer grows on each
   * reallocation.
   */
  private static final int BUFFER_GROWTH_SIZE = 5 * MB;

  /**
   * Maximum number of labels from a single state.
   */
  private static final int MAX_LABELS = 256;

  /**
   * A comparator comparing full byte arrays. Unsigned byte comparisons ('C'-locale).
   */
  public static final Comparator<byte[]> LEXICAL_ORDERING = new Comparator<byte[]>() {
    @Override
    public int compare(byte[] o1, byte[] o2) {
      return FSABuilder.compare(o1, 0, o1.length, o2, 0, o2.length);
    }
  };

  /**
   * Number of bytes by which {@link #serialized} grows on each reallocation.
   * Never smaller than the space required by a state with
   * {@link #MAX_LABELS} arcs (enforced in the constructor).
   */
  private final int bufferGrowthSize;

  /**
   * Holds serialized and mutable states. Each state is a sequential list of
   * arcs, the last arc is marked with {@link ConstantArcSizeFSA#BIT_ARC_LAST}.
   * Set to <code>null</code> once the automaton has been built.
   */
  private byte[] serialized = new byte[0];

  /**
   * Number of bytes already taken in {@link #serialized}. The epsilon state
   * is allocated first, at offset 0, so offset 0 is never the address of a
   * state stored in {@link #hashSet} and can mark an unoccupied slot there.
   */
  private int size;

  /**
   * States on the "active path" (still mutable). Values are addresses of each
   * state's first arc.
   */
  private int[] activePath = new int[0];

  /**
   * Current length of the active path.
   */
  private int activePathLen;

  /**
   * The next offset at which an arc will be added to the given state on
   * {@link #activePath}.
   */
  private int[] nextArcOffset = new int[0];

  /**
   * Root state. Initially the first state on the active path; replaced by the
   * address of the frozen root in {@link #complete()} for non-empty automata.
   */
  private int root;

  /**
   * An epsilon state. The first and only arc of this state points either to the
   * root or to the terminal state, indicating an empty automaton.
   */
  private int epsilon;

  /**
   * Hash set of state addresses in {@link #serialized}, hashed by
   * {@link #hash(int, int)}. Zero reserved for an unoccupied slot. The length
   * starts at 2 and is only ever doubled, so <code>length - 1</code> is usable
   * as a bit mask.
   */
  private int[] hashSet = new int[2];

  /**
   * Number of entries currently stored in {@link #hashSet}.
   */
  private int hashSize = 0;

  /**
   * Previous sequence added to the automaton in {@link #add(byte[], int, int)}.
   * Used in assertions only.
   */
  private byte[] previous;

  /**
   * {@link #previous} sequence's length, used in assertions only.
   */
  private int previousLength;

  /**
   * Information about the automaton and its compilation. Remains
   * <code>null</code> until {@link #complete()} has been called.
   */
  private TreeMap<InfoEntry, Object> info;

  /** Number of serialization buffer reallocations. */
  private int serializationBufferReallocations;

  /**
   * Creates a builder using the default buffer growth size.
   */
  public FSABuilder() {
    this(BUFFER_GROWTH_SIZE);
  }

  /**
   * @param bufferGrowthSize Buffer growth size (in bytes) when constructing the automaton.
   *        Values smaller than the space needed for a single state with the
   *        maximum number of arcs are silently raised to that minimum.
   */
  public FSABuilder(int bufferGrowthSize) {
    // A single growth step must always be enough to fit one maximum-size state,
    // because expandBuffers() grows the buffer at most once per call.
    this.bufferGrowthSize = Math.max(bufferGrowthSize, ARC_SIZE * MAX_LABELS);

    // Allocate epsilon state (a single arc, which is also the state's last arc).
    epsilon = allocateState(1);
    serialized[epsilon + FLAGS_OFFSET] |= BIT_ARC_LAST;

    // Allocate root, with an initial empty set of output arcs.
    expandActivePath(1);
    root = activePath[0];
  }

  /**
   * Add a single sequence of bytes to the FSA. The input must not be
   * lexicographically smaller (in unsigned byte order) than any previously
   * added sequence; this is verified only when assertions are enabled.
   *
   * @param sequence The array holding input sequence of bytes.
   * @param start Starting offset (inclusive)
   * @param len Length of the input sequence. Callers are expected to pass at
   *        least 1 byte; the builder itself passes 0 from {@link #complete()}
   *        to freeze all states below the root.
   */
  public void add(byte[] sequence, int start, int len) {
    assert serialized != null : "Automaton already built.";
    assert previous == null || len == 0 || compare(previous, 0, previousLength, sequence, start, len) <= 0 : "Input must be sorted: "
        + Arrays.toString(Arrays.copyOf(previous, previousLength))
        + " >= "
        // copyOfRange takes an exclusive END index, not a length.
        + Arrays.toString(Arrays.copyOfRange(sequence, start, start + len));
    assert setPrevious(sequence, start, len);

    // Determine common prefix length.
    final int commonPrefix = commonPrefix(sequence, start, len);

    // Make room for extra states on active path, if needed.
    expandActivePath(len);

    // Freeze all the states after the common prefix, deepest first, and point
    // the parent's most recently added arc at the frozen state.
    for (int i = activePathLen - 1; i > commonPrefix; i--) {
      final int frozenState = freezeState(i);
      setArcTarget(nextArcOffset[i - 1] - ARC_SIZE, frozenState);
      // Reset the mutable slot so it can be reused for a new state.
      nextArcOffset[i] = activePath[i];
    }

    // Create arcs to new suffix states.
    for (int i = commonPrefix + 1, j = start + commonPrefix; i <= len; i++) {
      final int p = nextArcOffset[i - 1];

      // The arc consuming the last byte is final and leads to the terminal state.
      serialized[p + FLAGS_OFFSET] = (byte) (i == len ? BIT_ARC_FINAL : 0);
      serialized[p + LABEL_OFFSET] = sequence[j++];
      setArcTarget(p, i == len ? TERMINAL_STATE : activePath[i]);

      nextArcOffset[i - 1] = p + ARC_SIZE;
    }

    // Save last sequence's length so that we don't need to calculate it again.
    this.activePathLen = len;
  }

  /**
   * Finalizes the construction of the automaton and returns it. After this
   * call the builder's internal buffers are released, so no further sequences
   * can be added and this method must not be called again.
   *
   * @return The automaton encoding all sequences added so far.
   */
  public FSA complete() {
    // An empty sequence shares no prefix with anything, so this freezes every
    // state on the active path except the root.
    add(new byte[0], 0, 0);

    if (nextArcOffset[0] - activePath[0] == 0) {
      // An empty FSA.
      setArcTarget(epsilon, TERMINAL_STATE);
    } else {
      // An automaton with at least a single arc from root.
      root = freezeState(0);
      setArcTarget(epsilon, root);
    }

    info = new TreeMap<InfoEntry, Object>();
    info.put(InfoEntry.SERIALIZATION_BUFFER_SIZE, serialized.length);
    info.put(InfoEntry.SERIALIZATION_BUFFER_REALLOCATIONS, serializationBufferReallocations);
    info.put(InfoEntry.CONSTANT_ARC_AUTOMATON_SIZE, size);
    info.put(InfoEntry.MAX_ACTIVE_PATH_LENGTH, activePath.length);
    info.put(InfoEntry.STATE_REGISTRY_TABLE_SLOTS, hashSet.length);
    info.put(InfoEntry.STATE_REGISTRY_SIZE, hashSize);
    // 4 bytes per int slot; computed in long arithmetic to avoid int overflow.
    info.put(InfoEntry.ESTIMATED_MEMORY_CONSUMPTION_MB,
        (this.serialized.length + this.hashSet.length * 4L) / (double) MB);

    // Hand out a trimmed copy and drop the working buffers.
    final FSA fsa = new ConstantArcSizeFSA(Arrays.copyOf(this.serialized, this.size), epsilon);
    this.serialized = null;
    this.hashSet = null;
    return fsa;
  }

  /**
   * Build a minimal, deterministic automaton from a sorted list of byte
   * sequences.
   *
   * @param input Input sequences to build automaton from.
   * @return Returns the automaton encoding all input sequences.
   */
  public static FSA build(byte[][] input) {
    return build(Arrays.asList(input));
  }

  /**
   * Build a minimal, deterministic automaton from an iterable list of byte
   * sequences. The sequences must be supplied in the order required by
   * {@link #add(byte[], int, int)}.
   *
   * @param input Input sequences to build automaton from.
   * @return Returns the automaton encoding all input sequences.
   */
  public static FSA build(Iterable<byte[]> input) {
    final FSABuilder builder = new FSABuilder();

    for (byte[] chs : input) {
      builder.add(chs, 0, chs.length);
    }

    return builder.complete();
  }

  /**
   * @return Returns various statistics concerning the FSA and its compilation,
   *         or <code>null</code> if {@link #complete()} has not been called yet.
   * @see InfoEntry
   */
  public Map<InfoEntry, Object> getInfo() {
    return info;
  }

  /** Is this arc the state's last? */
  private boolean isArcLast(int arc) {
    return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0;
  }

  /** Is this arc final? */
  private boolean isArcFinal(int arc) {
    return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0;
  }

  /** Get the arc's label. */
  private byte getArcLabel(int arc) {
    return serialized[arc + LABEL_OFFSET];
  }

  /**
   * Fills the target state address of an arc. The address is written
   * most-significant byte first, using TARGET_ADDRESS_SIZE bytes.
   */
  private void setArcTarget(int arc, int state) {
    // Start just past the address field and write backwards, lowest byte first.
    arc += ADDRESS_OFFSET + TARGET_ADDRESS_SIZE;
    for (int i = 0; i < TARGET_ADDRESS_SIZE; i++) {
      serialized[--arc] = (byte) state;
      state >>>= 8;
    }
  }

  /**
   * Returns the target state address of an arc.
   */
  private int getArcTarget(int arc) {
    // NOTE(review): reads exactly four bytes, i.e. assumes
    // TARGET_ADDRESS_SIZE == 4 -- confirm against ConstantArcSizeFSA.
    arc += ADDRESS_OFFSET;
    return (serialized[arc]           ) << 24 |
           (serialized[arc + 1] & 0xff) << 16 |
           (serialized[arc + 2] & 0xff) << 8  |
           (serialized[arc + 3] & 0xff);
  }

  /**
   * @return The number of common prefix characters with the previous sequence.
   */
  private int commonPrefix(byte[] sequence, int start, int len) {
    // When the active path is empty (nothing added yet) max is 0 and the
    // loop below does not execute.
    final int max = Math.min(len, activePathLen);
    int i;
    for (i = 0; i < max; i++) {
      // The most recently added arc of state i carries the previous
      // sequence's i-th byte.
      final int lastArc = nextArcOffset[i] - ARC_SIZE;
      if (sequence[start++] != getArcLabel(lastArc)) {
        break;
      }
    }

    return i;
  }

  /**
   * Freeze a state: try to find an equivalent state in the interned states
   * dictionary first, if found, return it, otherwise, serialize the mutable
   * state at activePathIndex and return it.
   */
  private int freezeState(final int activePathIndex) {
    final int start = activePath[activePathIndex];
    final int end = nextArcOffset[activePathIndex];
    final int len = end - start;

    // Set the last arc flag on the current active path's state.
    serialized[end - ARC_SIZE + FLAGS_OFFSET] |= BIT_ARC_LAST;

    // Try to locate a state with an identical content in the hash set.
    final int bucketMask = (hashSet.length - 1);
    int slot = hash(start, len) & bucketMask;
    for (int i = 0;;) {
      int state = hashSet[slot];
      if (state == 0) {
        // Empty slot: no equivalent state registered, so intern a new one.
        state = hashSet[slot] = serialize(activePathIndex);
        // Keep the table at most half full.
        if (++hashSize > hashSet.length / 2) {
          expandAndRehash();
        }
        return state;
      } else if (equivalent(state, start, len)) {
        return state;
      }

      // Collision: probe with an increasing step (1, 2, 3, ...).
      slot = (slot + (++i)) & bucketMask;
    }
  }

  /**
   * Reallocate and rehash the hash set. The new table has twice as many slots.
   */
  private void expandAndRehash() {
    final int[] newHashSet = new int[hashSet.length * 2];
    final int bucketMask = (newHashSet.length - 1);

    for (int j = 0; j < hashSet.length; j++) {
      final int state = hashSet[j];
      if (state > 0) {
        int slot = hash(state, stateLength(state)) & bucketMask;
        // Same probing sequence as in freezeState.
        for (int i = 0; newHashSet[slot] > 0;) {
          slot = (slot + (++i)) & bucketMask;
        }
        newHashSet[slot] = state;
      }
    }
    this.hashSet = newHashSet;
  }

  /**
   * The total length of the serialized state data (all arcs), in bytes.
   */
  private int stateLength(int state) {
    int arc = state;
    while (!isArcLast(arc)) {
      arc += ARC_SIZE;
    }
    // Include the last arc itself.
    return arc - state + ARC_SIZE;
  }

  /**
   * Return true if two regions in {@link #serialized} are
   * identical. Regions extending past the used part of the buffer are never
   * considered equivalent.
   */
  private boolean equivalent(int start1, int start2, int len) {
    if (start1 + len > size || start2 + len > size) {
      return false;
    }

    while (len-- > 0) {
      if (serialized[start1++] != serialized[start2++]) {
        return false;
      }
    }

    return true;
  }

  /**
   * Serialize a given state on the active path: copy its arcs to the end of
   * the used part of {@link #serialized}.
   *
   * @return Address of the newly serialized state.
   */
  private int serialize(final int activePathIndex) {
    expandBuffers();

    final int newState = size;
    final int start = activePath[activePathIndex];
    final int len = nextArcOffset[activePathIndex] - start;
    System.arraycopy(serialized, start, serialized, newState, len);

    size += len;
    return newState;
  }

  /**
   * Hash code of a fragment of {@link #serialized} array. Combines the label,
   * the target address and the final flag of every arc in the fragment.
   */
  private int hash(int start, int byteCount) {
    assert byteCount % ARC_SIZE == 0 : "Not an arc multiply?";

    int h = 0;
    for (int arcs = byteCount / ARC_SIZE; --arcs >= 0; start += ARC_SIZE) {
      h = 17 * h + getArcLabel(start);
      h = 17 * h + getArcTarget(start);
      if (isArcFinal(start)) {
        h += 17;
      }
    }

    return h;
  }

  /**
   * Make sure the active path can hold at least the given number of mutable
   * states, allocating new ones as needed.
   */
  private void expandActivePath(int requiredLength) {
    if (activePath.length < requiredLength) {
      final int p = activePath.length;
      activePath = Arrays.copyOf(activePath, requiredLength);
      nextArcOffset = Arrays.copyOf(nextArcOffset, requiredLength);

      for (int i = p; i < requiredLength; i++) {
        // A fresh state has no arcs yet, so its next-arc offset is its start.
        nextArcOffset[i] = activePath[i] = allocateState(/* assume max labels count */MAX_LABELS);
      }
    }
  }

  /**
   * Expand internal buffers for the next state, so that at least a state
   * with {@link #MAX_LABELS} arcs fits after the used part of the buffer.
   */
  private void expandBuffers() {
    if (this.serialized.length < size + ARC_SIZE * MAX_LABELS) {
      // One growth step suffices: bufferGrowthSize >= ARC_SIZE * MAX_LABELS.
      serialized = Arrays.copyOf(serialized, serialized.length + bufferGrowthSize);
      serializationBufferReallocations++;
    }
  }

  /**
   * Allocate space for a state with the given number of outgoing labels.
   *
   * @return state offset
   */
  private int allocateState(int labels) {
    expandBuffers();
    final int state = size;
    size += labels * ARC_SIZE;
    return state;
  }

  /**
   * Copy the current sequence into an internal buffer so the next call to
   * {@link #add(byte[], int, int)} can verify the input order.
   *
   * @return Always <code>true</code>, so the call can be wrapped in an
   *         <code>assert</code> and skipped when assertions are disabled.
   */
  private boolean setPrevious(byte[] sequence, int start, int length) {
    if (previous == null || previous.length < length) {
      previous = new byte[length];
    }

    System.arraycopy(sequence, start, previous, 0, length);
    previousLength = length;
    return true;
  }

  /**
   * Lexicographic order of input sequences. By default, consistent with the "C"
   * sort (absolute value of bytes, 0-255). When one sequence is a prefix of
   * the other, the shorter one orders first.
   */
  private static int compare(byte[] s1, int start1, int lens1, byte[] s2, int start2, int lens2) {
    final int max = Math.min(lens1, lens2);

    for (int i = 0; i < max; i++) {
      final byte c1 = s1[start1++];
      final byte c2 = s2[start2++];
      if (c1 != c2) {
        // Mask to compare as unsigned values.
        return (c1 & 0xff) - (c2 & 0xff);
      }
    }

    return lens1 - lens2;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy