All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.util.fst.FST Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util.fst;


import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.RamUsageEstimator;

import static org.apache.lucene.util.fst.FST.Arc.BitTable;

// TODO: break this into WritableFST and ReadOnlyFST.. then
// we can have subclasses of ReadOnlyFST to handle the
// different byte[] level encodings (packed or
// not)... and things like nodeCount, arcCount are read only

// TODO: if FST is pure prefix trie we can do a more compact
// job, ie, once we are at a 'suffix only', just store the
// completion labels as a string not as a series of arcs.

// NOTE: while the FST is able to represent a non-final
// dead-end state (NON_FINAL_END_NODE=0), the layers above
// (FSTEnum, Util) have problems with this!!

/** Represents an finite state machine (FST), using a
 *  compact byte[] format.
 *  

The format is similar to what's used by Morfologik * (https://github.com/morfologik/morfologik-stemming). * *

See the {@link org.apache.lucene.util.fst package * documentation} for some simple examples. * * @lucene.experimental */ public final class FST implements Accountable { /** Specifies allowed range of each int input label for * this FST. */ public enum INPUT_TYPE {BYTE1, BYTE2, BYTE4} private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FST.class); private static final long ARC_SHALLOW_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(Arc.class); private static final int BIT_FINAL_ARC = 1 << 0; static final int BIT_LAST_ARC = 1 << 1; static final int BIT_TARGET_NEXT = 1 << 2; // TODO: we can free up a bit if we can nuke this: private static final int BIT_STOP_NODE = 1 << 3; /** This flag is set if the arc has an output. */ public static final int BIT_ARC_HAS_OUTPUT = 1 << 4; private static final int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; /** Value of the arc flags to declare a node with fixed length arcs * designed for binary search. */ // We use this as a marker because this one flag is illegal by itself. public static final byte ARCS_FOR_BINARY_SEARCH = BIT_ARC_HAS_FINAL_OUTPUT; /** Value of the arc flags to declare a node with fixed length arcs * and bit table designed for direct addressing. */ static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6; /** * @see #shouldExpandNodeWithFixedLengthArcs */ static final int FIXED_LENGTH_ARC_SHALLOW_DEPTH = 3; // 0 => only root node. /** * @see #shouldExpandNodeWithFixedLengthArcs */ static final int FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS = 5; /** * @see #shouldExpandNodeWithFixedLengthArcs */ static final int FIXED_LENGTH_ARC_DEEP_NUM_ARCS = 10; /** * Maximum oversizing factor allowed for direct addressing compared to binary search when expansion * credits allow the oversizing. This factor prevents expansions that are obviously too costly even * if there are sufficient credits. * * @see #shouldExpandNodeWithDirectAddressing */ private static final float DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR = 1.66f; // Increment version to change it private static final String FILE_FORMAT_NAME = "FST"; private static final int VERSION_START = 6; private static final int VERSION_CURRENT = 7; // Never serialized; just used to represent the virtual // final node w/ no arcs: private static final long FINAL_END_NODE = -1; // Never serialized; just used to represent the virtual // non-final node w/ no arcs: private static final long NON_FINAL_END_NODE = 0; /** If arc has this label then that arc is final/accepted */ public static final int END_LABEL = -1; final INPUT_TYPE inputType; // if non-null, this FST accepts the empty string and // produces this output T emptyOutput; /** A {@link BytesStore}, used during building, or during reading when * the FST is very large (more than 1 GB). If the FST is less than 1 * GB then bytesArray is set instead. */ final BytesStore bytes; private final FSTStore fstStore; private long startNode = -1; public final Outputs outputs; /** Represents a single arc. */ public static final class Arc { //*** Arc fields. private int label; private T output; private long target; private byte flags; private T nextFinalOutput; private long nextArc; private byte nodeFlags; //*** Fields for arcs belonging to a node with fixed length arcs. // So only valid when bytesPerArc != 0. // nodeFlags == ARCS_FOR_BINARY_SEARCH || nodeFlags == ARCS_FOR_DIRECT_ADDRESSING. private int bytesPerArc; private long posArcsStart; private int arcIdx; private int numArcs; //*** Fields for a direct addressing node. nodeFlags == ARCS_FOR_DIRECT_ADDRESSING. /** Start position in the {@link FST.BytesReader} of the presence bits for a direct addressing node, aka the bit-table */ private long bitTableStart; /** First label of a direct addressing node. */ private int firstLabel; /** * Index of the current label of a direct addressing node. While {@link #arcIdx} is the current index in the label * range, {@link #presenceIndex} is its corresponding index in the list of actually present labels. It is equal * to the number of bits set before the bit at {@link #arcIdx} in the bit-table. This field is a cache to avoid * to count bits set repeatedly when iterating the next arcs. */ private int presenceIndex; /** Returns this */ public Arc copyFrom(Arc other) { label = other.label(); target = other.target(); flags = other.flags(); output = other.output(); nextFinalOutput = other.nextFinalOutput(); nextArc = other.nextArc(); nodeFlags = other.nodeFlags(); bytesPerArc = other.bytesPerArc(); // Fields for arcs belonging to a node with fixed length arcs. // We could avoid copying them if bytesPerArc() == 0 (this was the case with previous code, and the current code // still supports that), but it may actually help external uses of FST to have consistent arc state, and debugging // is easier. posArcsStart = other.posArcsStart(); arcIdx = other.arcIdx(); numArcs = other.numArcs(); bitTableStart = other.bitTableStart; firstLabel = other.firstLabel(); presenceIndex = other.presenceIndex; return this; } boolean flag(int flag) { return FST.flag(flags, flag); } public boolean isLast() { return flag(BIT_LAST_ARC); } public boolean isFinal() { return flag(BIT_FINAL_ARC); } @Override public String toString() { StringBuilder b = new StringBuilder(); b.append(" target=").append(target()); b.append(" label=0x").append(Integer.toHexString(label())); if (flag(BIT_FINAL_ARC)) { b.append(" final"); } if (flag(BIT_LAST_ARC)) { b.append(" last"); } if (flag(BIT_TARGET_NEXT)) { b.append(" targetNext"); } if (flag(BIT_STOP_NODE)) { b.append(" stop"); } if (flag(BIT_ARC_HAS_OUTPUT)) { b.append(" output=").append(output()); } if (flag(BIT_ARC_HAS_FINAL_OUTPUT)) { b.append(" nextFinalOutput=").append(nextFinalOutput()); } if (bytesPerArc() != 0) { b.append(" arcArray(idx=").append(arcIdx()).append(" of ").append(numArcs()).append(")") .append("(").append(nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING ? "da" : "bs").append(")"); } return b.toString(); } public int label() { return label; } public T output() { return output; } /** Ord/address to target node. */ public long target() { return target; } public byte flags() { return flags; } public T nextFinalOutput() { return nextFinalOutput; } /** Address (into the byte[]) of the next arc - only for list of variable length arc. * Or ord/address to the next node if label == {@link #END_LABEL}. */ long nextArc() { return nextArc; } /** Where we are in the array; only valid if bytesPerArc != 0. */ public int arcIdx() { return arcIdx; } /** Node header flags. Only meaningful to check if the value is either * {@link #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} * (other value when bytesPerArc == 0). */ public byte nodeFlags() { return nodeFlags; } /** Where the first arc in the array starts; only valid if bytesPerArc != 0 */ public long posArcsStart() { return posArcsStart; } /** Non-zero if this arc is part of a node with fixed length arcs, which means all * arcs for the node are encoded with a fixed number of bytes so * that we binary search or direct address. We do when there are enough * arcs leaving one node. It wastes some bytes but gives faster lookups. */ public int bytesPerArc() { return bytesPerArc; } /** How many arcs; only valid if bytesPerArc != 0 (fixed length arcs). * For a node designed for binary search this is the array size. * For a node designed for direct addressing, this is the label range. */ public int numArcs() { return numArcs; } /** First label of a direct addressing node. * Only valid if nodeFlags == {@link #ARCS_FOR_DIRECT_ADDRESSING}. */ int firstLabel() { return firstLabel; } /** * Helper methods to read the bit-table of a direct addressing node. * Only valid for {@link Arc} with {@link Arc#nodeFlags()} == {@code ARCS_FOR_DIRECT_ADDRESSING}. */ static class BitTable { /** See {@link BitTableUtil#isBitSet(int, FST.BytesReader)}. */ static boolean isBitSet(int bitIndex, Arc arc, FST.BytesReader in) throws IOException { assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; in.setPosition(arc.bitTableStart); return BitTableUtil.isBitSet(bitIndex, in); } /** * See {@link BitTableUtil#countBits(int, FST.BytesReader)}. * The count of bit set is the number of arcs of a direct addressing node. */ static int countBits(Arc arc, FST.BytesReader in) throws IOException { assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; in.setPosition(arc.bitTableStart); return BitTableUtil.countBits(getNumPresenceBytes(arc.numArcs()), in); } /** See {@link BitTableUtil#countBitsUpTo(int, FST.BytesReader)}. */ static int countBitsUpTo(int bitIndex, Arc arc, FST.BytesReader in) throws IOException { assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; in.setPosition(arc.bitTableStart); return BitTableUtil.countBitsUpTo(bitIndex, in); } /** See {@link BitTableUtil#nextBitSet(int, int, FST.BytesReader)}. */ static int nextBitSet(int bitIndex, Arc arc, FST.BytesReader in) throws IOException { assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; in.setPosition(arc.bitTableStart); return BitTableUtil.nextBitSet(bitIndex, getNumPresenceBytes(arc.numArcs()), in); } /** See {@link BitTableUtil#previousBitSet(int, FST.BytesReader)}. */ static int previousBitSet(int bitIndex, Arc arc, FST.BytesReader in) throws IOException { assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; in.setPosition(arc.bitTableStart); return BitTableUtil.previousBitSet(bitIndex, in); } /** * Asserts the bit-table of the provided {@link Arc} is valid. */ static boolean assertIsValid(Arc arc, FST.BytesReader in) throws IOException { assert arc.bytesPerArc() > 0; assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; // First bit must be set. assert isBitSet(0, arc, in); // Last bit must be set. assert isBitSet(arc.numArcs() - 1, arc, in); // No bit set after the last arc. assert nextBitSet(arc.numArcs() - 1, arc, in) == -1; return true; } } } private static boolean flag(int flags, int bit) { return (flags & bit) != 0; } // make a new empty FST, for building; Builder invokes this FST(INPUT_TYPE inputType, Outputs outputs, int bytesPageBits) { this.inputType = inputType; this.outputs = outputs; fstStore = null; bytes = new BytesStore(bytesPageBits); // pad: ensure no node gets address 0 which is reserved to mean // the stop state w/ no arcs bytes.writeByte((byte) 0); emptyOutput = null; } private static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28; /** Load a previously saved FST. */ public FST(DataInput metaIn, DataInput in, Outputs outputs) throws IOException { this(metaIn, in, outputs, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS)); } /** Load a previously saved FST; maxBlockBits allows you to * control the size of the byte[] pages used to hold the FST bytes. */ public FST(DataInput metaIn, DataInput in, Outputs outputs, FSTStore fstStore) throws IOException { bytes = null; this.fstStore = fstStore; this.outputs = outputs; // NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have // back-compat promise for FSTs (they are experimental), but we are sometimes able to offer it CodecUtil.checkHeader(metaIn, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT); if (metaIn.readByte() == 1) { // accepts empty string // 1 KB blocks: BytesStore emptyBytes = new BytesStore(10); int numBytes = metaIn.readVInt(); emptyBytes.copyBytes(metaIn, numBytes); // De-serialize empty-string output: BytesReader reader = emptyBytes.getReverseReader(); // NoOutputs uses 0 bytes when writing its output, // so we have to check here else BytesStore gets // angry: if (numBytes > 0) { reader.setPosition(numBytes-1); } emptyOutput = outputs.readFinalOutput(reader); } else { emptyOutput = null; } final byte t = metaIn.readByte(); switch(t) { case 0: inputType = INPUT_TYPE.BYTE1; break; case 1: inputType = INPUT_TYPE.BYTE2; break; case 2: inputType = INPUT_TYPE.BYTE4; break; default: throw new CorruptIndexException("invalid input type " + t, in); } startNode = metaIn.readVLong(); long numBytes = metaIn.readVLong(); this.fstStore.init(in, numBytes); } @Override public long ramBytesUsed() { long size = BASE_RAM_BYTES_USED; if (this.fstStore != null) { size += this.fstStore.ramBytesUsed(); } else { size += bytes.ramBytesUsed(); } return size; } @Override public String toString() { return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs; } void finish(long newStartNode) throws IOException { assert newStartNode <= bytes.getPosition(); if (startNode != -1) { throw new IllegalStateException("already finished"); } if (newStartNode == FINAL_END_NODE && emptyOutput != null) { newStartNode = 0; } startNode = newStartNode; bytes.finish(); } public T getEmptyOutput() { return emptyOutput; } void setEmptyOutput(T v) { if (emptyOutput != null) { emptyOutput = outputs.merge(emptyOutput, v); } else { emptyOutput = v; } } public void save(DataOutput metaOut, DataOutput out) throws IOException { if (startNode == -1) { throw new IllegalStateException("call finish first"); } CodecUtil.writeHeader(metaOut, FILE_FORMAT_NAME, VERSION_CURRENT); // TODO: really we should encode this as an arc, arriving // to the root node, instead of special casing here: if (emptyOutput != null) { // Accepts empty string metaOut.writeByte((byte) 1); // Serialize empty-string output: RAMOutputStream ros = new RAMOutputStream(); outputs.writeFinalOutput(emptyOutput, ros); byte[] emptyOutputBytes = new byte[(int) ros.getFilePointer()]; ros.writeTo(emptyOutputBytes, 0); int emptyLen = emptyOutputBytes.length; // reverse final int stopAt = emptyLen / 2; int upto = 0; while(upto < stopAt) { final byte b = emptyOutputBytes[upto]; emptyOutputBytes[upto] = emptyOutputBytes[emptyLen - upto - 1]; emptyOutputBytes[emptyLen - upto - 1] = b; upto++; } metaOut.writeVInt(emptyLen); metaOut.writeBytes(emptyOutputBytes, 0, emptyLen); } else { metaOut.writeByte((byte) 0); } final byte t; if (inputType == INPUT_TYPE.BYTE1) { t = 0; } else if (inputType == INPUT_TYPE.BYTE2) { t = 1; } else { t = 2; } metaOut.writeByte(t); metaOut.writeVLong(startNode); if (bytes != null) { long numBytes = bytes.getPosition(); metaOut.writeVLong(numBytes); bytes.writeTo(out); } else { assert fstStore != null; fstStore.writeTo(out); } } /** * Writes an automaton to a file. */ public void save(final Path path) throws IOException { try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))) { DataOutput out = new OutputStreamDataOutput(os); save(out, out); } } /** * Reads an automaton from a file. */ public static FST read(Path path, Outputs outputs) throws IOException { try (InputStream is = Files.newInputStream(path)) { DataInput in = new InputStreamDataInput(new BufferedInputStream(is)); return new FST<>(in, in, outputs); } } private void writeLabel(DataOutput out, int v) throws IOException { assert v >= 0: "v=" + v; if (inputType == INPUT_TYPE.BYTE1) { assert v <= 255: "v=" + v; out.writeByte((byte) v); } else if (inputType == INPUT_TYPE.BYTE2) { assert v <= 65535: "v=" + v; out.writeShort((short) v); } else { out.writeVInt(v); } } /** Reads one BYTE1/2/4 label from the provided {@link DataInput}. */ public int readLabel(DataInput in) throws IOException { final int v; if (inputType == INPUT_TYPE.BYTE1) { // Unsigned byte: v = in.readByte() & 0xFF; } else if (inputType == INPUT_TYPE.BYTE2) { // Unsigned short: v = in.readShort() & 0xFFFF; } else { v = in.readVInt(); } return v; } /** returns true if the node at this address has any * outgoing arcs */ public static boolean targetHasArcs(Arc arc) { return arc.target() > 0; } // serializes new node by appending its bytes to the end // of the current byte[] long addNode(Builder builder, Builder.UnCompiledNode nodeIn) throws IOException { T NO_OUTPUT = outputs.getNoOutput(); //System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs); if (nodeIn.numArcs == 0) { if (nodeIn.isFinal) { return FINAL_END_NODE; } else { return NON_FINAL_END_NODE; } } final long startAddress = builder.bytes.getPosition(); //System.out.println(" startAddr=" + startAddress); final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(builder, nodeIn); if (doFixedLengthArcs) { //System.out.println(" fixed length arcs"); if (builder.numBytesPerArc.length < nodeIn.numArcs) { builder.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)]; builder.numLabelBytesPerArc = new int[builder.numBytesPerArc.length]; } } builder.arcCount += nodeIn.numArcs; final int lastArc = nodeIn.numArcs-1; long lastArcStart = builder.bytes.getPosition(); int maxBytesPerArc = 0; int maxBytesPerArcWithoutLabel = 0; for(int arcIdx=0; arcIdx < nodeIn.numArcs; arcIdx++) { final Builder.Arc arc = nodeIn.arcs[arcIdx]; final Builder.CompiledNode target = (Builder.CompiledNode) arc.target; int flags = 0; //System.out.println(" arc " + arcIdx + " label=" + arc.label + " -> target=" + target.node); if (arcIdx == lastArc) { flags += BIT_LAST_ARC; } if (builder.lastFrozenNode == target.node && !doFixedLengthArcs) { // TODO: for better perf (but more RAM used) we // could avoid this except when arc is "near" the // last arc: flags += BIT_TARGET_NEXT; } if (arc.isFinal) { flags += BIT_FINAL_ARC; if (arc.nextFinalOutput != NO_OUTPUT) { flags += BIT_ARC_HAS_FINAL_OUTPUT; } } else { assert arc.nextFinalOutput == NO_OUTPUT; } boolean targetHasArcs = target.node > 0; if (!targetHasArcs) { flags += BIT_STOP_NODE; } if (arc.output != NO_OUTPUT) { flags += BIT_ARC_HAS_OUTPUT; } builder.bytes.writeByte((byte) flags); long labelStart = builder.bytes.getPosition(); writeLabel(builder.bytes, arc.label); int numLabelBytes = (int) (builder.bytes.getPosition() - labelStart); // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.output)); if (arc.output != NO_OUTPUT) { outputs.write(arc.output, builder.bytes); //System.out.println(" write output"); } if (arc.nextFinalOutput != NO_OUTPUT) { //System.out.println(" write final output"); outputs.writeFinalOutput(arc.nextFinalOutput, builder.bytes); } if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { assert target.node > 0; //System.out.println(" write target"); builder.bytes.writeVLong(target.node); } // just write the arcs "like normal" on first pass, but record how many bytes each one took // and max byte size: if (doFixedLengthArcs) { int numArcBytes = (int) (builder.bytes.getPosition() - lastArcStart); builder.numBytesPerArc[arcIdx] = numArcBytes; builder.numLabelBytesPerArc[arcIdx] = numLabelBytes; lastArcStart = builder.bytes.getPosition(); maxBytesPerArc = Math.max(maxBytesPerArc, numArcBytes); maxBytesPerArcWithoutLabel = Math.max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes); //System.out.println(" arcBytes=" + numArcBytes + " labelBytes=" + numLabelBytes); } } // TODO: try to avoid wasteful cases: disable doFixedLengthArcs in that case /* * * LUCENE-4682: what is a fair heuristic here? * It could involve some of these: * 1. how "busy" the node is: nodeIn.inputCount relative to frontier[0].inputCount? * 2. how much binSearch saves over scan: nodeIn.numArcs * 3. waste: numBytes vs numBytesExpanded * * the one below just looks at #3 if (doFixedLengthArcs) { // rough heuristic: make this 1.25 "waste factor" a parameter to the phd ctor???? int numBytes = lastArcStart - startAddress; int numBytesExpanded = maxBytesPerArc * nodeIn.numArcs; if (numBytesExpanded > numBytes*1.25) { doFixedLengthArcs = false; } } */ if (doFixedLengthArcs) { assert maxBytesPerArc > 0; // 2nd pass just "expands" all arcs to take up a fixed byte size int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1; assert labelRange > 0; if (shouldExpandNodeWithDirectAddressing(builder, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) { writeNodeForDirectAddressing(builder, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange); builder.directAddressingNodeCount++; } else { writeNodeForBinarySearch(builder, nodeIn, startAddress, maxBytesPerArc); builder.binarySearchNodeCount++; } } final long thisNodeAddress = builder.bytes.getPosition()-1; builder.bytes.reverse(startAddress, thisNodeAddress); builder.nodeCount++; return thisNodeAddress; } /** * Returns whether the given node should be expanded with fixed length arcs. * Nodes will be expanded depending on their depth (distance from the root node) and their number * of arcs. *

* Nodes with fixed length arcs use more space, because they encode all arcs with a fixed number * of bytes, but they allow either binary search or direct addressing on the arcs (instead of linear * scan) on lookup by arc label. */ private boolean shouldExpandNodeWithFixedLengthArcs(Builder builder, Builder.UnCompiledNode node) { return builder.allowFixedLengthArcs && ((node.depth <= FIXED_LENGTH_ARC_SHALLOW_DEPTH && node.numArcs >= FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) || node.numArcs >= FIXED_LENGTH_ARC_DEEP_NUM_ARCS); } /** * Returns whether the given node should be expanded with direct addressing instead of binary search. *

* Prefer direct addressing for performance if it does not oversize binary search byte size too much, * so that the arcs can be directly addressed by label. * * @see Builder#getDirectAddressingMaxOversizingFactor() */ private boolean shouldExpandNodeWithDirectAddressing(Builder builder, Builder.UnCompiledNode nodeIn, int numBytesPerArc, int maxBytesPerArcWithoutLabel, int labelRange) { // Anticipate precisely the size of the encodings. int sizeForBinarySearch = numBytesPerArc * nodeIn.numArcs; int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + builder.numLabelBytesPerArc[0] + maxBytesPerArcWithoutLabel * nodeIn.numArcs; // Determine the allowed oversize compared to binary search. // This is defined by a parameter of FST Builder (default 1: no oversize). int allowedOversize = (int) (sizeForBinarySearch * builder.getDirectAddressingMaxOversizingFactor()); int expansionCost = sizeForDirectAddressing - allowedOversize; // Select direct addressing if either: // - Direct addressing size is smaller than binary search. // In this case, increment the credit by the reduced size (to use it later). // - Direct addressing size is larger than binary search, but the positive credit allows the oversizing. // In this case, decrement the credit by the oversize. // In addition, do not try to oversize to a clearly too large node size // (this is the DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR parameter). if (expansionCost <= 0 || (builder.directAddressingExpansionCredit >= expansionCost && sizeForDirectAddressing <= allowedOversize * DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR)) { builder.directAddressingExpansionCredit -= expansionCost; return true; } return false; } private void writeNodeForBinarySearch(Builder builder, Builder.UnCompiledNode nodeIn, long startAddress, int maxBytesPerArc) { // Build the header in a buffer. // It is a false/special arc which is in fact a node header with node flags followed by node metadata. builder.fixedLengthArcsBuffer .resetPosition() .writeByte(ARCS_FOR_BINARY_SEARCH) .writeVInt(nodeIn.numArcs) .writeVInt(maxBytesPerArc); int headerLen = builder.fixedLengthArcsBuffer.getPosition(); // Expand the arcs in place, backwards. long srcPos = builder.bytes.getPosition(); long destPos = startAddress + headerLen + nodeIn.numArcs * maxBytesPerArc; assert destPos >= srcPos; if (destPos > srcPos) { builder.bytes.skipBytes((int) (destPos - srcPos)); for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { destPos -= maxBytesPerArc; int arcLen = builder.numBytesPerArc[arcIdx]; srcPos -= arcLen; if (srcPos != destPos) { assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " arcLen=" + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs; builder.bytes.copyBytes(srcPos, destPos, arcLen); } } } // Write the header. builder.bytes.writeBytes(startAddress, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen); } private void writeNodeForDirectAddressing(Builder builder, Builder.UnCompiledNode nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange) { // Expand the arcs backwards in a buffer because we remove the labels. // So the obtained arcs might occupy less space. This is the reason why this // whole method is more complex. // Drop the label bytes since we can infer the label based on the arc index, // the presence bits, and the first label. Keep the first label. int headerMaxLen = 11; int numPresenceBytes = getNumPresenceBytes(labelRange); long srcPos = builder.bytes.getPosition(); int totalArcBytes = builder.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel; int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes; byte[] buffer = builder.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes(); // Copy the arcs to the buffer, dropping all labels except first one. for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { bufferOffset -= maxBytesPerArcWithoutLabel; int srcArcLen = builder.numBytesPerArc[arcIdx]; srcPos -= srcArcLen; int labelLen = builder.numLabelBytesPerArc[arcIdx]; // Copy the flags. builder.bytes.copyBytes(srcPos, buffer, bufferOffset, 1); // Skip the label, copy the remaining. int remainingArcLen = srcArcLen - 1 - labelLen; if (remainingArcLen != 0) { builder.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen); } if (arcIdx == 0) { // Copy the label of the first arc only. bufferOffset -= labelLen; builder.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen); } } assert bufferOffset == headerMaxLen + numPresenceBytes; // Build the header in the buffer. // It is a false/special arc which is in fact a node header with node flags followed by node metadata. builder.fixedLengthArcsBuffer .resetPosition() .writeByte(ARCS_FOR_DIRECT_ADDRESSING) .writeVInt(labelRange) // labelRange instead of numArcs. .writeVInt(maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc. int headerLen = builder.fixedLengthArcsBuffer.getPosition(); // Prepare the builder byte store. Enlarge or truncate if needed. long nodeEnd = startAddress + headerLen + numPresenceBytes + totalArcBytes; long currentPosition = builder.bytes.getPosition(); if (nodeEnd >= currentPosition) { builder.bytes.skipBytes((int) (nodeEnd - currentPosition)); } else { builder.bytes.truncate(nodeEnd); } assert builder.bytes.getPosition() == nodeEnd; // Write the header. long writeOffset = startAddress; builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen); writeOffset += headerLen; // Write the presence bits writePresenceBits(builder, nodeIn, writeOffset, numPresenceBytes); writeOffset += numPresenceBytes; // Write the first label and the arcs. builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); } private void writePresenceBits(Builder builder, Builder.UnCompiledNode nodeIn, long dest, int numPresenceBytes) { long bytePos = dest; byte presenceBits = 1; // The first arc is always present. int presenceIndex = 0; int previousLabel = nodeIn.arcs[0].label; for (int arcIdx = 1; arcIdx < nodeIn.numArcs; arcIdx++) { int label = nodeIn.arcs[arcIdx].label; assert label > previousLabel; presenceIndex += label - previousLabel; while (presenceIndex >= Byte.SIZE) { builder.bytes.writeByte(bytePos++, presenceBits); presenceBits = 0; presenceIndex -= Byte.SIZE; } // Set the bit at presenceIndex to flag that the corresponding arc is present. presenceBits |= 1 << presenceIndex; previousLabel = label; } assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8; assert presenceBits != 0; // The last byte is not 0. assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present. builder.bytes.writeByte(bytePos++, presenceBits); assert bytePos - dest == numPresenceBytes; } /** Gets the number of bytes required to flag the presence of each arc in the given label range, one bit per arc. */ private static int getNumPresenceBytes(int labelRange) { assert labelRange >= 0; return (labelRange + 7) >> 3; } /** * Reads the presence bits of a direct-addressing node. * Actually we don't read them here, we just keep the pointer to the bit-table start and we skip them. */ private void readPresenceBytes(Arc arc, BytesReader in) throws IOException { assert arc.bytesPerArc() > 0; assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; arc.bitTableStart = in.getPosition(); in.skipBytes(getNumPresenceBytes(arc.numArcs())); } /** Fills virtual 'start' arc, ie, an empty incoming arc to the FST's start node */ public Arc getFirstArc(Arc arc) { T NO_OUTPUT = outputs.getNoOutput(); if (emptyOutput != null) { arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; arc.nextFinalOutput = emptyOutput; if (emptyOutput != NO_OUTPUT) { arc.flags = (byte) (arc.flags() | BIT_ARC_HAS_FINAL_OUTPUT); } } else { arc.flags = BIT_LAST_ARC; arc.nextFinalOutput = NO_OUTPUT; } arc.output = NO_OUTPUT; // If there are no nodes, ie, the FST only accepts the // empty string, then startNode is 0 arc.target = startNode; return arc; } /** Follows the follow arc and reads the last * arc of its target; this changes the provided * arc (2nd arg) in-place and returns it. * * @return Returns the second argument * (arc). */ Arc readLastTargetArc(Arc follow, Arc arc, BytesReader in) throws IOException { //System.out.println("readLast"); if (!targetHasArcs(follow)) { //System.out.println(" end node"); assert follow.isFinal(); arc.label = END_LABEL; arc.target = FINAL_END_NODE; arc.output = follow.nextFinalOutput(); arc.flags = BIT_LAST_ARC; arc.nodeFlags = arc.flags; return arc; } else { in.setPosition(follow.target()); byte flags = arc.nodeFlags = in.readByte(); if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { // Special arc which is actually a node header for fixed length arcs. // Jump straight to end to find the last arc. arc.numArcs = in.readVInt(); arc.bytesPerArc = in.readVInt(); //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); if (flags == ARCS_FOR_DIRECT_ADDRESSING) { readPresenceBytes(arc, in); arc.firstLabel = readLabel(in); arc.posArcsStart = in.getPosition(); readLastArcByDirectAddressing(arc, in); } else { arc.arcIdx = arc.numArcs() - 2; arc.posArcsStart = in.getPosition(); readNextRealArc(arc, in); } } else { arc.flags = flags; // non-array: linear scan arc.bytesPerArc = 0; //System.out.println(" scan"); while(!arc.isLast()) { // skip this arc: readLabel(in); if (arc.flag(BIT_ARC_HAS_OUTPUT)) { outputs.skipOutput(in); } if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { outputs.skipFinalOutput(in); } if (arc.flag(BIT_STOP_NODE)) { } else if (arc.flag(BIT_TARGET_NEXT)) { } else { readUnpackedNodeTarget(in); } arc.flags = in.readByte(); } // Undo the byte flags we read: in.skipBytes(-1); arc.nextArc = in.getPosition(); readNextRealArc(arc, in); } assert arc.isLast(); return arc; } } private long readUnpackedNodeTarget(BytesReader in) throws IOException { return in.readVLong(); } /** * Follow the follow arc and read the first arc of its target; * this changes the provided arc (2nd arg) in-place and returns * it. * * @return Returns the second argument (arc). */ public Arc readFirstTargetArc(Arc follow, Arc arc, BytesReader in) throws IOException { //int pos = address; //System.out.println(" readFirstTarget follow.target=" + follow.target + " isFinal=" + follow.isFinal()); if (follow.isFinal()) { // Insert "fake" final first arc: arc.label = END_LABEL; arc.output = follow.nextFinalOutput(); arc.flags = BIT_FINAL_ARC; if (follow.target() <= 0) { arc.flags |= BIT_LAST_ARC; } else { // NOTE: nextArc is a node (not an address!) in this case: arc.nextArc = follow.target(); } arc.target = FINAL_END_NODE; arc.nodeFlags = arc.flags; //System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + arc.isLast() + " output=" + outputs.outputToString(arc.output)); return arc; } else { return readFirstRealTargetArc(follow.target(), arc, in); } } public Arc readFirstRealTargetArc(long nodeAddress, Arc arc, final BytesReader in) throws IOException { in.setPosition(nodeAddress); //System.out.println(" flags=" + arc.flags); byte flags = arc.nodeFlags = in.readByte(); if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { //System.out.println(" fixed length arc"); // Special arc which is actually a node header for fixed length arcs. arc.numArcs = in.readVInt(); arc.bytesPerArc = in.readVInt(); arc.arcIdx = -1; if (flags == ARCS_FOR_DIRECT_ADDRESSING) { readPresenceBytes(arc, in); arc.firstLabel = readLabel(in); arc.presenceIndex = -1; } arc.posArcsStart = in.getPosition(); //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos); } else { arc.nextArc = nodeAddress; arc.bytesPerArc = 0; } return readNextRealArc(arc, in); } /** * Returns whether arc's target points to a node in expanded format (fixed length arcs). */ boolean isExpandedTarget(Arc follow, BytesReader in) throws IOException { if (!targetHasArcs(follow)) { return false; } else { in.setPosition(follow.target()); byte flags = in.readByte(); return flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING; } } /** In-place read; returns the arc. */ public Arc readNextArc(Arc arc, BytesReader in) throws IOException { if (arc.label() == END_LABEL) { // This was a fake inserted "final" arc if (arc.nextArc() <= 0) { throw new IllegalArgumentException("cannot readNextArc when arc.isLast()=true"); } return readFirstRealTargetArc(arc.nextArc(), arc, in); } else { return readNextRealArc(arc, in); } } /** Peeks at next arc's label; does not alter arc. Do * not call this if arc.isLast()! */ int readNextArcLabel(Arc arc, BytesReader in) throws IOException { assert !arc.isLast(); if (arc.label() == END_LABEL) { //System.out.println(" nextArc fake " + arc.nextArc); // Next arc is the first arc of a node. // Position to read the first arc label. in.setPosition(arc.nextArc()); byte flags = in.readByte(); if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { //System.out.println(" nextArc fixed length arc"); // Special arc which is actually a node header for fixed length arcs. int numArcs = in.readVInt(); in.readVInt(); // Skip bytesPerArc. if (flags == ARCS_FOR_BINARY_SEARCH) { in.readByte(); // Skip arc flags. } else { in.skipBytes(getNumPresenceBytes(numArcs)); } } } else { if (arc.bytesPerArc() != 0) { //System.out.println(" nextArc real array"); // Arcs have fixed length. if (arc.nodeFlags() == ARCS_FOR_BINARY_SEARCH) { // Point to next arc, -1 to skip arc flags. in.setPosition(arc.posArcsStart() - (1 + arc.arcIdx()) * arc.bytesPerArc() - 1); } else { assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; // Direct addressing node. The label is not stored but rather inferred // based on first label and arc index in the range. assert BitTable.assertIsValid(arc, in); assert BitTable.isBitSet(arc.arcIdx(), arc, in); int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in); assert nextIndex != -1; return arc.firstLabel() + nextIndex; } } else { // Arcs have variable length. //System.out.println(" nextArc real list"); // Position to next arc, -1 to skip flags. in.setPosition(arc.nextArc() - 1); } } return readLabel(in); } public Arc readArcByIndex(Arc arc, final BytesReader in, int idx) throws IOException { assert arc.bytesPerArc() > 0; assert arc.nodeFlags() == ARCS_FOR_BINARY_SEARCH; assert idx >= 0 && idx < arc.numArcs(); in.setPosition(arc.posArcsStart() - idx * arc.bytesPerArc()); arc.arcIdx = idx; arc.flags = in.readByte(); return readArc(arc, in); } /** * Reads a present direct addressing node arc, with the provided index in the label range. * * @param rangeIndex The index of the arc in the label range. It must be present. * The real arc offset is computed based on the presence bits of * the direct addressing node. */ public Arc readArcByDirectAddressing(Arc arc, final BytesReader in, int rangeIndex) throws IOException { assert BitTable.assertIsValid(arc, in); assert rangeIndex >= 0 && rangeIndex < arc.numArcs(); assert BitTable.isBitSet(rangeIndex, arc, in); int presenceIndex = BitTable.countBitsUpTo(rangeIndex, arc, in); return readArcByDirectAddressing(arc, in, rangeIndex, presenceIndex); } /** * Reads a present direct addressing node arc, with the provided index in the label range and its corresponding * presence index (which is the count of presence bits before it). */ private Arc readArcByDirectAddressing(Arc arc, final BytesReader in, int rangeIndex, int presenceIndex) throws IOException { in.setPosition(arc.posArcsStart() - presenceIndex * arc.bytesPerArc()); arc.arcIdx = rangeIndex; arc.presenceIndex = presenceIndex; arc.flags = in.readByte(); return readArc(arc, in); } /** * Reads the last arc of a direct addressing node. * This method is equivalent to call {@link #readArcByDirectAddressing(Arc, BytesReader, int)} with {@code rangeIndex} * equal to {@code arc.numArcs() - 1}, but it is faster. */ public Arc readLastArcByDirectAddressing(Arc arc, final BytesReader in) throws IOException { assert BitTable.assertIsValid(arc, in); int presenceIndex = BitTable.countBits(arc, in) - 1; return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex); } /** Never returns null, but you should never call this if * arc.isLast() is true. */ public Arc readNextRealArc(Arc arc, final BytesReader in) throws IOException { // TODO: can't assert this because we call from readFirstArc // assert !flag(arc.flags, BIT_LAST_ARC); switch (arc.nodeFlags()) { case ARCS_FOR_BINARY_SEARCH: assert arc.bytesPerArc() > 0; arc.arcIdx++; assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs(); in.setPosition(arc.posArcsStart() - arc.arcIdx() * arc.bytesPerArc()); arc.flags = in.readByte(); break; case ARCS_FOR_DIRECT_ADDRESSING: assert BitTable.assertIsValid(arc, in); assert arc.arcIdx() == -1 || BitTable.isBitSet(arc.arcIdx(), arc, in); int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in); return readArcByDirectAddressing(arc, in, nextIndex, arc.presenceIndex + 1); default: // Variable length arcs - linear search. assert arc.bytesPerArc() == 0; in.setPosition(arc.nextArc()); arc.flags = in.readByte(); } return readArc(arc, in); } /** * Reads an arc. *
Precondition: The arc flags byte has already been read and set; * the given BytesReader is positioned just after the arc flags byte. */ private Arc readArc(Arc arc, BytesReader in) throws IOException { if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING) { arc.label = arc.firstLabel() + arc.arcIdx(); } else { arc.label = readLabel(in); } if (arc.flag(BIT_ARC_HAS_OUTPUT)) { arc.output = outputs.read(in); } else { arc.output = outputs.getNoOutput(); } if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { arc.nextFinalOutput = outputs.readFinalOutput(in); } else { arc.nextFinalOutput = outputs.getNoOutput(); } if (arc.flag(BIT_STOP_NODE)) { if (arc.flag(BIT_FINAL_ARC)) { arc.target = FINAL_END_NODE; } else { arc.target = NON_FINAL_END_NODE; } arc.nextArc = in.getPosition(); // Only useful for list. } else if (arc.flag(BIT_TARGET_NEXT)) { arc.nextArc = in.getPosition(); // Only useful for list. // TODO: would be nice to make this lazy -- maybe // caller doesn't need the target and is scanning arcs... if (!arc.flag(BIT_LAST_ARC)) { if (arc.bytesPerArc() == 0) { // must scan seekToNextNode(in); } else { int numArcs = arc.nodeFlags == ARCS_FOR_DIRECT_ADDRESSING ? BitTable.countBits(arc, in) : arc.numArcs(); in.setPosition(arc.posArcsStart() - arc.bytesPerArc() * numArcs); } } arc.target = in.getPosition(); } else { arc.target = readUnpackedNodeTarget(in); arc.nextArc = in.getPosition(); // Only useful for list. } return arc; } static Arc readEndArc(Arc follow, Arc arc) { if (follow.isFinal()) { if (follow.target() <= 0) { arc.flags = FST.BIT_LAST_ARC; } else { arc.flags = 0; // NOTE: nextArc is a node (not an address!) in this case: arc.nextArc = follow.target(); } arc.output = follow.nextFinalOutput(); arc.label = FST.END_LABEL; return arc; } else { return null; } } // TODO: could we somehow [partially] tableize arc lookups // like automaton? /** Finds an arc leaving the incoming arc, replacing the arc in place. * This returns null if the arc was not found, else the incoming arc. */ public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc, BytesReader in) throws IOException { if (labelToMatch == END_LABEL) { if (follow.isFinal()) { if (follow.target() <= 0) { arc.flags = BIT_LAST_ARC; } else { arc.flags = 0; // NOTE: nextArc is a node (not an address!) in this case: arc.nextArc = follow.target(); } arc.output = follow.nextFinalOutput(); arc.label = END_LABEL; arc.nodeFlags = arc.flags; return arc; } else { return null; } } if (!targetHasArcs(follow)) { return null; } in.setPosition(follow.target()); // System.out.println("fta label=" + (char) labelToMatch); byte flags = arc.nodeFlags = in.readByte(); if (flags == ARCS_FOR_DIRECT_ADDRESSING) { arc.numArcs = in.readVInt(); // This is in fact the label range. arc.bytesPerArc = in.readVInt(); readPresenceBytes(arc, in); arc.firstLabel = readLabel(in); arc.posArcsStart = in.getPosition(); int arcIndex = labelToMatch - arc.firstLabel(); if (arcIndex < 0 || arcIndex >= arc.numArcs()) { return null; // Before or after label range. } else if (!BitTable.isBitSet(arcIndex, arc, in)) { return null; // Arc missing in the range. } return readArcByDirectAddressing(arc, in, arcIndex); } else if (flags == ARCS_FOR_BINARY_SEARCH) { arc.numArcs = in.readVInt(); arc.bytesPerArc = in.readVInt(); arc.posArcsStart = in.getPosition(); // Array is sparse; do binary search: int low = 0; int high = arc.numArcs() - 1; while (low <= high) { //System.out.println(" cycle"); int mid = (low + high) >>> 1; // +1 to skip over flags in.setPosition(arc.posArcsStart() - (arc.bytesPerArc() * mid + 1)); int midLabel = readLabel(in); final int cmp = midLabel - labelToMatch; if (cmp < 0) { low = mid + 1; } else if (cmp > 0) { high = mid - 1; } else { arc.arcIdx = mid - 1; //System.out.println(" found!"); return readNextRealArc(arc, in); } } return null; } // Linear scan readFirstRealTargetArc(follow.target(), arc, in); while(true) { //System.out.println(" non-bs cycle"); // TODO: we should fix this code to not have to create // object for the output of every arc we scan... only // for the matching arc, if found if (arc.label() == labelToMatch) { //System.out.println(" found!"); return arc; } else if (arc.label() > labelToMatch) { return null; } else if (arc.isLast()) { return null; } else { readNextRealArc(arc, in); } } } private void seekToNextNode(BytesReader in) throws IOException { while(true) { final int flags = in.readByte(); readLabel(in); if (flag(flags, BIT_ARC_HAS_OUTPUT)) { outputs.skipOutput(in); } if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) { outputs.skipFinalOutput(in); } if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) { readUnpackedNodeTarget(in); } if (flag(flags, BIT_LAST_ARC)) { return; } } } /** Returns a {@link BytesReader} for this FST, positioned at * position 0. */ public BytesReader getBytesReader() { if (this.fstStore != null) { return this.fstStore.getReverseBytesReader(); } else { return bytes.getReverseReader(); } } /** Reads bytes stored in an FST. */ public static abstract class BytesReader extends DataInput { /** Get current read position. */ public abstract long getPosition(); /** Set current read position. */ public abstract void setPosition(long pos); /** Returns true if this reader uses reversed bytes * under-the-hood. */ public abstract boolean reversed(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy