
morfologik.fsa.CFSA2 Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of morfologik-stemming Show documentation
Show all versions of morfologik-stemming Show documentation
Morfologik provides high quality lemmatisation for the Polish language,
along with tools for building and using byte-based finite state automata.
package morfologik.fsa;
import static morfologik.util.FileUtils.readFully;
import java.io.IOException;
import java.io.InputStream;
import java.util.EnumSet;
import java.util.Set;
import morfologik.util.FileUtils;
/**
* CFSA (Compact Finite State Automaton) binary format implementation, version 2:
*
* - {@link #BIT_TARGET_NEXT} applicable on all arcs, not necessarily the last one.
* - v-coded goto field
* - v-coded perfect hashing numbers, if any
* - 31 most frequent labels integrated with flags byte
*
*
* The encoding of automaton body is as follows.
*
*
* ---- CFSA header
* Byte Description
* +-+-+-+-+-+-+-+-+\
* 0 | | | | | | | | | +------ '\'
* +-+-+-+-+-+-+-+-+/
* +-+-+-+-+-+-+-+-+\
* 1 | | | | | | | | | +------ 'f'
* +-+-+-+-+-+-+-+-+/
* +-+-+-+-+-+-+-+-+\
* 2 | | | | | | | | | +------ 's'
* +-+-+-+-+-+-+-+-+/
* +-+-+-+-+-+-+-+-+\
* 3 | | | | | | | | | +------ 'a'
* +-+-+-+-+-+-+-+-+/
* +-+-+-+-+-+-+-+-+\
* 4 | | | | | | | | | +------ version (fixed 0xc6)
* +-+-+-+-+-+-+-+-+/
* +-+-+-+-+-+-+-+-+\
* 5 | | | | | | | | | +----\
* +-+-+-+-+-+-+-+-+/ \ flags [MSB first]
* +-+-+-+-+-+-+-+-+\ /
* 6 | | | | | | | | | +----/
* +-+-+-+-+-+-+-+-+/
* +-+-+-+-+-+-+-+-+\
* 7 | | | | | | | | | +------ label lookup table size
* +-+-+-+-+-+-+-+-+/
* +-+-+-+-+-+-+-+-+\
* 8-32 | | | | | | | | | +------ label value lookup table
* : : : : : : : : : |
* +-+-+-+-+-+-+-+-+/
*
* ---- Start of a node; only if automaton was compiled with NUMBERS option.
*
* Byte
* +-+-+-+-+-+-+-+-+\
* 0 | | | | | | | | | \
* +-+-+-+-+-+-+-+-+ +
* 1 | | | | | | | | | | number of strings recognized
* +-+-+-+-+-+-+-+-+ +----- by the automaton starting
* : : : : : : : : : | from this node. v-coding
* +-+-+-+-+-+-+-+-+ +
* | | | | | | | | | /
* +-+-+-+-+-+-+-+-+/
*
* ---- A vector of this node's arcs. An arc's layout depends on the combination of flags.
*
* 1) NEXT bit set, mapped arc label.
*
* +----------------------- node pointed to is next
* | +--------------------- the last arc of the node
* | | +------------------- this arc leads to a final state (acceptor)
* | | | _______+--------- arc's label; indexed if M > 0, otherwise explicit label follows
* | | | / | | | |
* +-+-+-+-+-+-+-+-+\
* 0 |N|L|F|M|M|M|M|M| +------ flags + (M) index of the mapped label.
* +-+-+-+-+-+-+-+-+/
* +-+-+-+-+-+-+-+-+\
* 1 | | | | | | | | | +------ optional label if M == 0
* +-+-+-+-+-+-+-+-+/
* : : : : : : : : :
* +-+-+-+-+-+-+-+-+\
* |A|A|A|A|A|A|A|A| +------ v-coded goto address
* +-+-+-+-+-+-+-+-+/
*
*/
public final class CFSA2 extends FSA {
/**
* Automaton header version value.
*/
public static final byte VERSION = (byte) 0xc6;
/**
* The target node of this arc follows the last arc of the current state
* (no goto field).
*/
public static final int BIT_TARGET_NEXT = 1 << 7;
/**
* The arc is the last one from the current node's arcs list.
*/
public static final int BIT_LAST_ARC = 1 << 6;
/**
* The arc corresponds to the last character of a sequence
* available when building the automaton (acceptor transition).
*/
public static final int BIT_FINAL_ARC = 1 << 5;
/**
* The count of bits assigned to storing an indexed label.
*/
static final int LABEL_INDEX_BITS = 5;
/**
* Masks only the M bits of a flag byte.
*/
static final int LABEL_INDEX_MASK = (1 << LABEL_INDEX_BITS) - 1;
/**
* Maximum size of the labels index.
*/
static final int LABEL_INDEX_SIZE = (1 << LABEL_INDEX_BITS) - 1;
/**
* An array of bytes with the internal representation of the automaton.
* Please see the documentation of this class for more information on how
* this structure is organized.
*/
public byte[] arcs;
/**
* Flags for this automaton version.
*/
private final EnumSet flags;
/**
* Label mapping for M-indexed labels.
*/
public final byte[] labelMapping;
/**
* If true
states are prepended with numbers.
*/
private final boolean hasNumbers;
/**
* Epsilon node's offset.
*/
private final int epsilon = 0;
/**
* Reads an automaton from a byte stream.
*/
public CFSA2(InputStream in) throws IOException {
// Read the header first.
if (FSAHeader.FSA_MAGIC != FileUtils.readInt(in))
throw new IOException("Invalid file header magic bytes.");
// Ensure we have the correct version.
final int version = FileUtils.readByte(in);
if (version != VERSION) {
throw new IOException("This class can only read FSA version: " + VERSION);
}
// Read flags.
short flagBits = FileUtils.readShort(in);
flags = EnumSet.noneOf(FSAFlags.class);
for (FSAFlags f : FSAFlags.values()) {
if (FSAFlags.isSet(flagBits, f))
flags.add(f);
}
if (flagBits != FSAFlags.asShort(flags))
throw new IOException("Unrecognized flags remained: 0x" + Integer.toHexString(flagBits));
this.hasNumbers = flags.contains(FSAFlags.NUMBERS);
/*
* Read mapping dictionary.
*/
int labelMappingSize = FileUtils.readByte(in) & 0xff;
labelMapping = new byte[labelMappingSize];
readFully(in, labelMapping);
/*
* Read arcs' data.
*/
arcs = readFully(in);
}
/**
* {@inheritDoc}
*/
@Override
public int getRootNode() {
// Skip dummy node marking terminating state.
return getDestinationNodeOffset(getFirstArc(epsilon));
}
/**
* {@inheritDoc}
*/
@Override
public final int getFirstArc(int node) {
if (hasNumbers) {
return skipVInt(node);
} else {
return node;
}
}
/**
* {@inheritDoc}
*/
@Override
public final int getNextArc(int arc) {
if (isArcLast(arc))
return 0;
else
return skipArc(arc);
}
/**
* {@inheritDoc}
*/
@Override
public int getArc(int node, byte label) {
for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
if (getArcLabel(arc) == label)
return arc;
}
// An arc labeled with "label" not found.
return 0;
}
/**
* {@inheritDoc}
*/
@Override
public int getEndNode(int arc) {
final int nodeOffset = getDestinationNodeOffset(arc);
assert nodeOffset != 0 : "Can't follow a terminal arc: " + arc;
assert nodeOffset < arcs.length : "Node out of bounds.";
return nodeOffset;
}
/**
* {@inheritDoc}
*/
@Override
public byte getArcLabel(int arc) {
int index = arcs[arc] & LABEL_INDEX_MASK;
if (index > 0) {
return this.labelMapping[index];
} else {
return arcs[arc + 1];
}
}
/**
* {@inheritDoc}
*/
@Override
public int getRightLanguageCount(int node) {
assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS.";
return readVInt(arcs, node);
}
/**
* {@inheritDoc}
*/
@Override
public boolean isArcFinal(int arc) {
return (arcs[arc] & BIT_FINAL_ARC) != 0;
}
/**
* {@inheritDoc}
*/
@Override
public boolean isArcTerminal(int arc) {
return (0 == getDestinationNodeOffset(arc));
}
/**
* Returns true
if this arc has NEXT
bit set.
*
* @see #BIT_LAST_ARC
*/
public boolean isArcLast(int arc) {
return (arcs[arc] & BIT_LAST_ARC) != 0;
}
/**
* @see #BIT_TARGET_NEXT
*/
public boolean isNextSet(int arc) {
return (arcs[arc] & BIT_TARGET_NEXT) != 0;
}
/**
* {@inheritDoc}
*/
public Set getFlags() {
return flags;
}
/**
* Returns the address of the node pointed to by this arc.
*/
final int getDestinationNodeOffset(int arc) {
if (isNextSet(arc)) {
/* Follow until the last arc of this state. */
while (!isArcLast(arc)) {
arc = getNextArc(arc);
}
/* And return the byte right after it. */
return skipArc(arc);
} else {
/*
* The destination node address is v-coded. v-code starts either
* at the next byte (label indexed) or after the next byte (label explicit).
*/
return readVInt(arcs, arc + ((arcs[arc] & LABEL_INDEX_MASK) == 0 ? 2 : 1));
}
}
/**
* Read the arc's layout and skip as many bytes, as needed, to skip it.
*/
private int skipArc(int offset) {
int flag = arcs[offset++];
// Explicit label?
if ((flag & LABEL_INDEX_MASK) == 0) {
offset++;
}
// Explicit goto?
if ((flag & BIT_TARGET_NEXT) == 0) {
offset = skipVInt(offset);
}
assert offset < this.arcs.length;
return offset;
}
/**
* Read a v-int.
*/
static int readVInt(byte[] array, int offset) {
byte b = array[offset];
int value = b & 0x7F;
for (int shift = 7; b < 0; shift += 7) {
b = array[++offset];
value |= (b & 0x7F) << shift;
}
return value;
}
/**
* Write a v-int to a byte array.
*/
static int writeVInt(byte[] array, int offset, int value) {
assert value >= 0 : "Can't v-code negative ints.";
while (value > 0x7F) {
array[offset++] = (byte) (0x80 | (value & 0x7F));
value >>= 7;
}
array[offset++] = (byte) value;
return offset;
}
/**
* Return the byte-length of a v-coded int.
*/
static int vIntLength(int value) {
assert value >= 0 : "Can't v-code negative ints.";
int bytes;
for (bytes = 1; value >= 0x80; bytes++) {
value >>= 7;
}
return bytes;
}
/**
* Skip a v-int.
*/
private int skipVInt(int offset) {
while (arcs[offset++] < 0);
return offset;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy