All Downloads are FREE. Search and download functionalities are using the official Maven repository.

morfologik.fsa.FSA Maven / Gradle / Ivy

There is a newer version: 2.1.9
Show newest version
package morfologik.fsa;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.util.Iterator;

import morfologik.util.FileUtils;

/**
 * This class implements Finite State Automaton traversal as described in Jan
 * Daciuk's Incremental Construction of Finite-State Automata and
 * Transducers, and Their Use in the Natural Language Processing (PhD
 * thesis, Technical University of Gdansk).
 * 
 * 

* This is an abstract base class for all forms of binary storage present in Jan * Daciuk's FSA package. */ public abstract class FSA implements Iterable { /** * Version number for version 5 of the automaton. */ public final static byte VERSION_5 = 5; /** * Dictionary version (derived from the combination of flags). */ protected byte version; /** * The meaning of this field is not clear (check the FSA documentation). */ protected byte filler; /** * Size of transition's destination node "address". This field may also have * different interpretation, or may not be used at all. It depends on the * combination of flags used for building FSA. */ protected byte gotoLength; /** * Annotation separator is a special character used for separating "tokens" * in a FSA. For instance an inflected form of a word may be separated from * the base form. */ private byte annotationSeparator; /** * The encoding (codepage) in which the dictionary has been compiled; * byte-to-character conversion scheme. */ private String dictionaryEncoding; /** * Creates a new automaton reading the FSA automaton from an input stream. * * @param fsaStream * An input stream with FSA automaton. * @throws IOException * if the dictionary file cannot be read, or version of the file * is not supported. */ protected FSA(InputStream fsaStream, String dictionaryEncoding) throws IOException { if (fsaStream == null) { throw new IllegalArgumentException( "The input stream must not be null."); } if (dictionaryEncoding == null) { throw new IllegalArgumentException( "Dictionary encoding must not be null."); } this.dictionaryEncoding = dictionaryEncoding; /* * This implementation requires the length of stream to be known in * advance. Preload the dictionary entirely. */ final byte[] fsa = readFully(fsaStream); DataInputStream input = null; try { input = new DataInputStream(new ByteArrayInputStream(fsa)); readHeader(input, fsa.length); } finally { FileUtils.close(input); } } /** * Returns the version number of the binary representation of this FSA. * *

* The version number is a derivation of combination of flags and is exactly * the same as in Jan Daciuk's FSA package. */ public final int getVersion() { return version; } /** * Returns a set of flags for this FSA instance. Each flag is represented by * a unique bit in the integer returned. Therefore to check whether the * dictionary has been built using {@link FSAFlags#FLEXIBLE} flag, one must * perform a bitwise AND: * boolean isFlexible = ((dict.getFlags() & FSA.FSA_FLEXIBLE ) != 0) */ public final int getFlags() { return FSAHelpers.getFlags(version); } /** * Return the annotation separator character, converted to a character * according to the encoding scheme passed in in the constructor of this * class. */ public final char getAnnotationSeparator() { try { final String annotationChar = new String( new byte[] { this.annotationSeparator }, this.dictionaryEncoding); if (annotationChar.length() != 1) { throw new RuntimeException( "Unexpected annotation character length (should be 1): " + annotationChar.length()); } return annotationChar.charAt(0); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } } /** * Return the filler character, converted to a character according to the * encoding scheme passed in in the constructor of this class. */ public final char getFillerCharacter() { try { final String fillerChar = new String(new byte[] { this.filler }, this.dictionaryEncoding); if (fillerChar.length() != 1) { throw new RuntimeException( "Unexpected filler character length (should be 1): " + fillerChar.length()); } return fillerChar.charAt(0); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } } /** * Returns the number of arcs in this automaton. Depending on the * representation of the automaton, this method may take a long time to * finish. */ public abstract int getNumberOfArcs(); /** * Returns the number of nodes in this automaton. Depending on the * representation of the automaton, this method may take a long time to * finish. */ public abstract int getNumberOfNodes(); /** * Returns an object which can be used to walk the edges of this finite * state automaton and match arbitrary sequences against its states. */ public FSATraversalHelper getTraversalHelper() { return new FSATraversalHelper(this); } /** * This static method will attempt to instantiate an appropriate * implementation of the FSA for the version found in file given in the * input argument. * * @throws IOException * An exception is thrown if no corresponding FSA parser is * found or if the input file cannot be opened. */ public static FSA getInstance(File fsaFile, String dictionaryEncoding) throws IOException { if (!fsaFile.exists()) { throw new IOException("File does not exist: " + fsaFile.getAbsolutePath()); } return getInstance(new FileInputStream(fsaFile), dictionaryEncoding); } /** * This static method will attempt to instantiate an appropriate * implementation of the FSA for the version found in file given in the * input argument. * * @throws IOException * An exception is thrown if no corresponding FSA parser is * found or if the input file cannot be opened. */ public static FSA getInstance(InputStream fsaStream, String dictionaryEncoding) throws IOException { if (fsaStream == null) throw new IllegalArgumentException("FSA stream cannot be null."); final PushbackInputStream stream = new PushbackInputStream(fsaStream, 5); final byte[] header = new byte[5]; for (int bytesRead = 0; bytesRead < header.length;) { bytesRead += stream.read(header, bytesRead, header.length - bytesRead); } if (header[0] == '\\' && header[1] == 'f' && header[2] == 's' && header[3] == 'a') { // Read FSA version final byte version = header[4]; // put back header info stream.unread(header); switch (version) { case 0x05: return new FSAVer5Impl(stream, dictionaryEncoding); } // No supporting implementation found. throw new IOException("Cannot read FSA: support for version " + version + " (" + FSAHelpers.flagsToString(FSAHelpers.getFlags(version)) + ") not implemented."); } else { throw new IOException( "Cannot read FSA: file does not begin with a valid magic number."); } } /** * Reads a FSA header from a stream. * * @throws IOException * If the stream is not a dictionary, or if the version is not * supported. */ protected void readHeader(DataInput in, long fileSize) throws IOException { final byte[] magic = new byte[4]; in.readFully(magic); if (magic[0] == '\\' && magic[1] == 'f' && magic[2] == 's' && magic[3] == 'a') { version = in.readByte(); filler = in.readByte(); annotationSeparator = in.readByte(); gotoLength = in.readByte(); } else { throw new IOException( "Cannot read FSA: File does not begin with a valid magic number."); } } /** * Reads all bytes from an input stream. * * @param stream * @return Returns an array of read bytes. */ protected byte[] readFully(InputStream stream) throws IOException { final ByteArrayOutputStream baos = new ByteArrayOutputStream(1024 * 16); final byte[] buffer = new byte[1024 * 8]; int bytesCount; while ((bytesCount = stream.read(buffer)) > 0) { baos.write(buffer, 0, bytesCount); } return baos.toByteArray(); } /** * Returns an iterator over all binary sequences starting from the initial * FSA state and ending in final nodes. The returned iterator is a * {@link ByteBuffer} that changes on each call to {@link Iterator#next()}, * so if the content should be preserved, it must be copied somewhere else. * *

* It is guaranteed that the returned byte buffer is backed by a byte array * and that the content of the byte buffer starts at the array's index 0. */ public Iterator iterator() { return getTraversalHelper().getAllSubsequences(getRootNode()); } /** * Returns the identifier of the root node of this automaton. May return 0 * if the start node is also the end node. * * @see #getTraversalHelper() */ public abstract int getRootNode(); /** * Returns the identifier of the first arc leaving node or 0 if * the node has no outgoing arcs. * * @see #getTraversalHelper() */ public abstract int getFirstArc(int node); /** * Returns the identifier of an arc leaving node and labeled * with label. An identifier equal to 0 means the node has no * outgoing arc labeled label. * * @see #getTraversalHelper() */ public abstract int getArc(int node, byte label); /** * Returns the identifier of the next arc after arc and leaving * node. Zero is returned if no more arcs are available for the * node. * * @see #getTraversalHelper() */ public abstract int getNextArc(int node, int arc); /** * Return the end node pointed to by a given arc. Terminal arcs * (those that point to a terminal state) have no end node representation * and throw a runtime exception. * * @see #getTraversalHelper() */ public abstract int getEndNode(int arc); /** * Return the label associated with a given arc. */ public abstract byte getArcLabel(int arc); /** * Returns true if the destination node at the end of this * arc corresponds to an input sequence created when building * this automaton. */ public abstract boolean isArcFinal(int arc); /** * Returns true if this arc does not have a * terminating node. */ public abstract boolean isArcTerminal(int arc); }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy