All Downloads are FREE. Search and download functionalities are using the official Maven repository.

morfologik.fsa.FSA Maven / Gradle / Ivy

There is a newer version: 2.1.9
Show newest version
package morfologik.fsa;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.BitSet;
import java.util.Collections;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;

/**
 * This is a top abstract class for handling finite state automata. These
 * automata are arc-based, a design described in Jan Daciuk's Incremental
 * Construction of Finite-State Automata and Transducers, and Their Use in the
 * Natural Language Processing (PhD thesis, Technical University of Gdansk).
 */
public abstract class FSA implements Iterable {
  /**
   * @return Returns the identifier of the root node of this automaton. Returns
   *         0 if the start node is also the end node (the automaton is empty).
   */
  public abstract int getRootNode();

  /**
   * @param node
   *          Identifier of the node.
   * @return Returns the identifier of the first arc leaving node
   *         or 0 if the node has no outgoing arcs.
   */
  public abstract int getFirstArc(int node);

  /**
   * @param arc
   *          The arc's identifier.
   * @return Returns the identifier of the next arc after arc and
   *         leaving node. Zero is returned if no more arcs are
   *         available for the node.
   */
  public abstract int getNextArc(int arc);

  /**
   * @param node
   *          Identifier of the node.
   * @param label
   *          The arc's label.
   * @return Returns the identifier of an arc leaving node and
   *         labeled with label. An identifier equal to 0 means the
   *         node has no outgoing arc labeled label.
   */
  public abstract int getArc(int node, byte label);

  /**
   * @param arc
   *          The arc's identifier.
   * @return Return the label associated with a given arc.
   */
  public abstract byte getArcLabel(int arc);

  /**
   * @param arc
   *          The arc's identifier.
   * @return Returns true if the destination node at the end of
   *         this arc corresponds to an input sequence created when
   *         building this automaton.
   */
  public abstract boolean isArcFinal(int arc);

  /**
   * @param arc
   *          The arc's identifier.
   * @return Returns true if this arc does not have a
   *         terminating node (@link {@link #getEndNode(int)} will throw an
   *         exception). Implies {@link #isArcFinal(int)}.
   */
  public abstract boolean isArcTerminal(int arc);

  /**
   * @param arc
   *          The arc's identifier.
   * @return Return the end node pointed to by a given arc.
   *         Terminal arcs (those that point to a terminal state) have no end
   *         node representation and throw a runtime exception.
   */
  public abstract int getEndNode(int arc);

  /**
   * @return Returns a set of flags for this FSA instance.
   */
  public abstract Set getFlags();

  /**
   * @param node
   *          Identifier of the node.
   * @return Calculates and returns the number of arcs of a given node.
   */
  public int getArcCount(int node) {
    int count = 0;
    for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
      count++;
    }
    return count;
  }

  /**
   * @param node
   *          Identifier of the node.
   * 
   * @return Returns the number of sequences reachable from the given state if
   *         the automaton was compiled with {@link FSAFlags#NUMBERS}. The size
   *         of the right language of the state, in other words.
   * 
   * @throws UnsupportedOperationException
   *           If the automaton was not compiled with {@link FSAFlags#NUMBERS}.
   *           The value can then be computed by manual count of
   *           {@link #getSequences}.
   */
  public int getRightLanguageCount(int node) {
    throw new UnsupportedOperationException("Automaton not compiled with " + FSAFlags.NUMBERS);
  }

  /**
   * Returns an iterator over all binary sequences starting at the given FSA
   * state (node) and ending in final nodes. This corresponds to a set of
   * suffixes of a given prefix from all sequences stored in the automaton.
   * 
   * 

* The returned iterator is a {@link ByteBuffer} whose contents changes on * each call to {@link Iterator#next()}. The keep the contents between calls * to {@link Iterator#next()}, one must copy the buffer to some other * location. *

* *

* Important. It is guaranteed that the returned byte buffer is backed * by a byte array and that the content of the byte buffer starts at the * array's index 0. *

* * @param node * Identifier of the starting node from which to return subsequences. * @return An iterable over all sequences encoded starting at the given node. */ public Iterable getSequences(final int node) { if (node == 0) { return Collections. emptyList(); } return new Iterable() { public Iterator iterator() { return new ByteSequenceIterator(FSA.this, node); } }; } /** * An alias of calling {@link #iterator} directly ({@link FSA} is also * {@link Iterable}). * * @return Returns all sequences encoded in the automaton. */ public final Iterable getSequences() { return getSequences(getRootNode()); } /** * Returns an iterator over all binary sequences starting from the initial FSA * state (node) and ending in final nodes. The returned iterator is a * {@link ByteBuffer} whose contents changes on each call to * {@link Iterator#next()}. The keep the contents between calls to * {@link Iterator#next()}, one must copy the buffer to some other location. * *

* Important. It is guaranteed that the returned byte buffer is backed * by a byte array and that the content of the byte buffer starts at the * array's index 0. *

*/ public final Iterator iterator() { return getSequences().iterator(); } /** * Visit all states. The order of visiting is undefined. This method may be * faster than traversing the automaton in post or preorder since it can scan * states linearly. Returning false from {@link StateVisitor#accept(int)} * immediately terminates the traversal. * * @param v Visitor to receive traversal calls. * @param A subclass of {@link StateVisitor}. * @return Returns the argument (for access to anonymous class fields). */ public T visitAllStates(T v) { return visitInPostOrder(v); } /** * Same as {@link #visitInPostOrder(StateVisitor, int)}, starting from root * automaton node. * * @param v Visitor to receive traversal calls. * @param A subclass of {@link StateVisitor}. * @return Returns the argument (for access to anonymous class fields). */ public T visitInPostOrder(T v) { return visitInPostOrder(v, getRootNode()); } /** * Visits all states reachable from node in postorder. Returning * false from {@link StateVisitor#accept(int)} immediately terminates the * traversal. * * @param v Visitor to receive traversal calls. * @param A subclass of {@link StateVisitor}. * @param node Identifier of the node. * @return Returns the argument (for access to anonymous class fields). */ public T visitInPostOrder(T v, int node) { visitInPostOrder(v, node, new BitSet()); return v; } /** Private recursion. */ private boolean visitInPostOrder(StateVisitor v, int node, BitSet visited) { if (visited.get(node)) return true; visited.set(node); for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { if (!isArcTerminal(arc)) { if (!visitInPostOrder(v, getEndNode(arc), visited)) return false; } } return v.accept(node); } /** * Same as {@link #visitInPreOrder(StateVisitor, int)}, starting from root * automaton node. * * @param v Visitor to receive traversal calls. * @param A subclass of {@link StateVisitor}. * @return Returns the argument (for access to anonymous class fields). */ public T visitInPreOrder(T v) { return visitInPreOrder(v, getRootNode()); } /** * Visits all states in preorder. Returning false from * {@link StateVisitor#accept(int)} skips traversal of all sub-states of a * given state. * * @param v Visitor to receive traversal calls. * @param A subclass of {@link StateVisitor}. * @param node Identifier of the node. * @return Returns the argument (for access to anonymous class fields). */ public T visitInPreOrder(T v, int node) { visitInPreOrder(v, node, new BitSet()); return v; } /** * @param in The input stream. * @return Reads all remaining bytes from an input stream and returns * them as a byte array. * @throws IOException Rethrown if an I/O exception occurs. */ protected static final byte[] readRemaining(InputStream in) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); byte[] buffer = new byte[1024 * 8]; int len; while ((len = in.read(buffer)) >= 0) { baos.write(buffer, 0, len); } return baos.toByteArray(); } /** Private recursion. */ private void visitInPreOrder(StateVisitor v, int node, BitSet visited) { if (visited.get(node)) { return; } visited.set(node); if (v.accept(node)) { for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { if (!isArcTerminal(arc)) { visitInPreOrder(v, getEndNode(arc), visited); } } } } /** * A factory for reading automata in any of the supported versions. * * @param stream * The input stream to read automaton data from. The stream is not * closed. * @return Returns an instantiated automaton. Never null. * @throws IOException * If the input stream does not represent an automaton or is * otherwise invalid. */ public static FSA read(InputStream stream) throws IOException { final FSAHeader header = FSAHeader.read(stream); switch (header.version) { case FSA5.VERSION: return new FSA5(stream); case CFSA.VERSION: return new CFSA(stream); case CFSA2.VERSION: return new CFSA2(stream); default: throw new IOException( String.format(Locale.ROOT, "Unsupported automaton version: 0x%02x", header.version & 0xFF)); } } /** * A factory for reading a specific FSA subclass, including proper casting. * * @param stream * The input stream to read automaton data from. The stream is not * closed. * @param clazz A subclass of {@link FSA} to cast the read automaton to. * @param A subclass of {@link FSA} to cast the read automaton to. * @return Returns an instantiated automaton. Never null. * @throws IOException * If the input stream does not represent an automaton, is otherwise * invalid or the class of the automaton read from the input stream * is not assignable to clazz. */ public static T read(InputStream stream, Class clazz) throws IOException { FSA fsa = read(stream); if (!clazz.isInstance(fsa)) { throw new IOException(String.format(Locale.ROOT, "Expected FSA type %s, but read an incompatible type %s.", clazz.getName(), fsa.getClass().getName())); } return clazz.cast(fsa); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy