com.ibm.icu.text.RBBITableBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support
/*
**********************************************************************
* Copyright (c) 2002-2009, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
package com.ibm.icu.text;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
//
// class RBBITableBuilder is part of the RBBI rule compiler.
// It builds the state transition table used by the RBBI runtime
// from the expression syntax tree generated by the rule scanner.
//
// This class is part of the RBBI implementation only.
// There is no user-visible public API here.
//
class RBBITableBuilder {
//
// RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
// one for each state.
static class RBBIStateDescriptor {
boolean fMarked;
int fAccepting;
int fLookAhead;
SortedSet fTagVals;
int fTagsIdx;
Set fPositions; // Set of parse tree positions associated
// with this state. Unordered (it's a set).
// UVector contents are RBBINode *
int[] fDtran; // Transitions out of this state.
// indexed by input character
// contents is int index of dest state
// in RBBITableBuilder.fDStates
RBBIStateDescriptor(int maxInputSymbol) {
fTagVals = new TreeSet();
fPositions = new HashSet();
fDtran = new int[maxInputSymbol+1]; // fDtran needs to be pre-sized.
// It is indexed by input symbols, and will
// hold the next state number for each
// symbol.
}
}
private RBBIRuleBuilder fRB;
private int fRootIx; // The array index into RBBIRuleBuilder.fTreeRoots
// for the parse tree to operate on.
// Too bad Java can't do indirection more easily!
private List fDStates; // D states (Aho's terminology)
// Index is state number
// Contents are RBBIStateDescriptor pointers.
//-----------------------------------------------------------------------------
//
// Constructor for RBBITableBuilder.
//
// rootNode is an index into the array of root nodes that is held by
// the overall RBBIRuleBuilder.
//-----------------------------------------------------------------------------
RBBITableBuilder(RBBIRuleBuilder rb, int rootNodeIx) {
fRootIx = rootNodeIx;
fRB = rb;
fDStates = new ArrayList();
}
//-----------------------------------------------------------------------------
//
// RBBITableBuilder::build - This is the main function for building the DFA state transtion
// table from the RBBI rules parse tree.
//
//-----------------------------------------------------------------------------
void build() {
// If there were no rules, just return. This situation can easily arise
// for the reverse rules.
if (fRB.fTreeRoots[fRootIx]==null) {
return;
}
//
// Walk through the tree, replacing any references to $variables with a copy of the
// parse tree for the substition expression.
//
fRB.fTreeRoots[fRootIx] = fRB.fTreeRoots[fRootIx].flattenVariables();
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("ftree")>=0) {
System.out.println("Parse tree after flattening variable references.");
fRB.fTreeRoots[fRootIx].printTree(true);
}
//
// If the rules contained any references to {bof}
// add a {bof} to the
// tree. Means that all matches must start out with the
// {bof} fake character.
//
if (fRB.fSetBuilder.sawBOF()) {
RBBINode bofTop = new RBBINode(RBBINode.opCat);
RBBINode bofLeaf = new RBBINode(RBBINode.leafChar);
bofTop.fLeftChild = bofLeaf;
bofTop.fRightChild = fRB.fTreeRoots[fRootIx];
bofLeaf.fParent = bofTop;
bofLeaf.fVal = 2; // Reserved value for {bof}.
fRB.fTreeRoots[fRootIx] = bofTop;
}
//
// Add a unique right-end marker to the expression.
// Appears as a cat-node, left child being the original tree,
// right child being the end marker.
//
RBBINode cn = new RBBINode(RBBINode.opCat);
cn.fLeftChild = fRB.fTreeRoots[fRootIx];
fRB.fTreeRoots[fRootIx].fParent = cn;
cn.fRightChild = new RBBINode(RBBINode.endMark);
cn.fRightChild.fParent = cn;
fRB.fTreeRoots[fRootIx] = cn;
//
// Replace all references to UnicodeSets with the tree for the equivalent
// expression.
//
fRB.fTreeRoots[fRootIx].flattenSets();
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("stree")>=0) {
System.out.println("Parse tree after flattening Unicode Set references.");
fRB.fTreeRoots[fRootIx].printTree(true);
}
//
// calculate the functions nullable, firstpos, lastpos and followpos on
// nodes in the parse tree.
// See the alogrithm description in Aho.
// Understanding how this works by looking at the code alone will be
// nearly impossible.
//
calcNullable(fRB.fTreeRoots[fRootIx]);
calcFirstPos(fRB.fTreeRoots[fRootIx]);
calcLastPos(fRB.fTreeRoots[fRootIx]);
calcFollowPos(fRB.fTreeRoots[fRootIx]);
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("pos")>=0) {
System.out.print("\n");
printPosSets(fRB.fTreeRoots[fRootIx]);
}
//
// For "chained" rules, modify the followPos sets
//
if (fRB.fChainRules) {
calcChainedFollowPos(fRB.fTreeRoots[fRootIx]);
}
//
// BOF (start of input) test fixup.
//
if (fRB.fSetBuilder.sawBOF()) {
bofFixup();
}
//
// Build the DFA state transition tables.
//
buildStateTable();
flagAcceptingStates();
flagLookAheadStates();
flagTaggedStates();
//
// Update the global table of rule status {tag} values
// The rule builder has a global vector of status values that are common
// for all tables. Merge the ones from this table into the global set.
//
mergeRuleStatusVals();
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("states")>=0) {printStates();}
}
//-----------------------------------------------------------------------------
//
// calcNullable. Impossible to explain succinctly. See Aho, section 3.9
//
//-----------------------------------------------------------------------------
void calcNullable(RBBINode n) {
if (n == null) {
return;
}
if (n.fType == RBBINode.setRef ||
n.fType == RBBINode.endMark ) {
// These are non-empty leaf node types.
n.fNullable = false;
return;
}
if (n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) {
// Lookahead marker node. It's a leaf, so no recursion on children.
// It's nullable because it does not match any literal text from the input stream.
n.fNullable = true;
return;
}
// The node is not a leaf.
// Calculate nullable on its children.
calcNullable(n.fLeftChild);
calcNullable(n.fRightChild);
// Apply functions from table 3.40 in Aho
if (n.fType == RBBINode.opOr) {
n.fNullable = n.fLeftChild.fNullable || n.fRightChild.fNullable;
}
else if (n.fType == RBBINode.opCat) {
n.fNullable = n.fLeftChild.fNullable && n.fRightChild.fNullable;
}
else if (n.fType == RBBINode.opStar || n.fType == RBBINode.opQuestion) {
n.fNullable = true;
}
else {
n.fNullable = false;
}
}
//-----------------------------------------------------------------------------
//
// calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
//
//-----------------------------------------------------------------------------
void calcFirstPos(RBBINode n) {
if (n == null) {
return;
}
if (n.fType == RBBINode.leafChar ||
n.fType == RBBINode.endMark ||
n.fType == RBBINode.lookAhead ||
n.fType == RBBINode.tag) {
// These are non-empty leaf node types.
n.fFirstPosSet.add(n);
return;
}
// The node is not a leaf.
// Calculate firstPos on its children.
calcFirstPos(n.fLeftChild);
calcFirstPos(n.fRightChild);
// Apply functions from table 3.40 in Aho
if (n.fType == RBBINode.opOr) {
n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
}
else if (n.fType == RBBINode.opCat) {
n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
if (n.fLeftChild.fNullable) {
n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
}
}
else if (n.fType == RBBINode.opStar ||
n.fType == RBBINode.opQuestion ||
n.fType == RBBINode.opPlus) {
n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
}
}
//-----------------------------------------------------------------------------
//
// calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
//
//-----------------------------------------------------------------------------
void calcLastPos(RBBINode n) {
if (n == null) {
return;
}
if (n.fType == RBBINode.leafChar ||
n.fType == RBBINode.endMark ||
n.fType == RBBINode.lookAhead ||
n.fType == RBBINode.tag) {
// These are non-empty leaf node types.
n.fLastPosSet.add(n);
return;
}
// The node is not a leaf.
// Calculate lastPos on its children.
calcLastPos(n.fLeftChild);
calcLastPos(n.fRightChild);
// Apply functions from table 3.40 in Aho
if (n.fType == RBBINode.opOr) {
n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
}
else if (n.fType == RBBINode.opCat) {
n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
if (n.fRightChild.fNullable) {
n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
}
}
else if (n.fType == RBBINode.opStar ||
n.fType == RBBINode.opQuestion ||
n.fType == RBBINode.opPlus) {
n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
}
}
//-----------------------------------------------------------------------------
//
// calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
//
//-----------------------------------------------------------------------------
void calcFollowPos(RBBINode n) {
if (n == null ||
n.fType == RBBINode.leafChar ||
n.fType == RBBINode.endMark) {
return;
}
calcFollowPos(n.fLeftChild);
calcFollowPos(n.fRightChild);
// Aho rule #1
if (n.fType == RBBINode.opCat) {
for (RBBINode i /* is 'i' in Aho's description */ : n.fLeftChild.fLastPosSet) {
i.fFollowPos.addAll(n.fRightChild.fFirstPosSet);
}
}
// Aho rule #2
if (n.fType == RBBINode.opStar ||
n.fType == RBBINode.opPlus) {
for (RBBINode i /* again, n and i are the names from Aho's description */ : n.fLastPosSet) {
i.fFollowPos.addAll(n.fFirstPosSet);
}
}
}
//-----------------------------------------------------------------------------
//
// calcChainedFollowPos. Modify the previously calculated followPos sets
// to implement rule chaining. NOT described by Aho
//
//-----------------------------------------------------------------------------
void calcChainedFollowPos(RBBINode tree) {
List endMarkerNodes = new ArrayList();
List leafNodes = new ArrayList();
// get a list of all endmarker nodes.
tree.findNodes(endMarkerNodes, RBBINode.endMark);
// get a list all leaf nodes
tree.findNodes(leafNodes, RBBINode.leafChar);
// Get all nodes that can be the start a match, which is FirstPosition()
// of the portion of the tree corresponding to user-written rules.
// See the tree description in bofFixup().
RBBINode userRuleRoot = tree;
if (fRB.fSetBuilder.sawBOF()) {
userRuleRoot = tree.fLeftChild.fRightChild;
}
Assert.assrt(userRuleRoot != null);
Set matchStartNodes = userRuleRoot.fFirstPosSet;
// Iteratate over all leaf nodes,
//
for (RBBINode tNode : leafNodes) {
RBBINode endNode = null;
// Identify leaf nodes that correspond to overall rule match positions.
// These include an endMarkerNode in their followPos sets.
for (RBBINode endMarkerNode : endMarkerNodes) {
if (tNode.fFollowPos.contains(endMarkerNode)) {
endNode = tNode;
break;
}
}
if (endNode == null) {
// node wasn't an end node. Try again with the next.
continue;
}
// We've got a node that can end a match.
// Line Break Specific hack: If this node's val correspond to the $CM char class,
// don't chain from it.
// TODO: Add rule syntax for this behavior, get specifics out of here and
// into the rule file.
if (fRB.fLBCMNoChain) {
int c = this.fRB.fSetBuilder.getFirstChar(endNode.fVal);
if (c != -1) {
// c == -1 occurs with sets containing only the {eof} marker string.
int cLBProp = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
if (cLBProp == UCharacter.LineBreak.COMBINING_MARK) {
continue;
}
}
}
// Now iterate over the nodes that can start a match, looking for ones
// with the same char class as our ending node.
for (RBBINode startNode : matchStartNodes) {
if (startNode.fType != RBBINode.leafChar) {
continue;
}
if (endNode.fVal == startNode.fVal) {
// The end val (character class) of one possible match is the
// same as the start of another.
// Add all nodes from the followPos of the start node to the
// followPos set of the end node, which will have the effect of
// letting matches transition from a match state at endNode
// to the second char of a match starting with startNode.
endNode.fFollowPos.addAll(startNode.fFollowPos);
}
}
}
}
//-----------------------------------------------------------------------------
//
// bofFixup. Fixup for state tables that include {bof} beginning of input testing.
// Do an swizzle similar to chaining, modifying the followPos set of
// the bofNode to include the followPos nodes from other {bot} nodes
// scattered through the tree.
//
// This function has much in common with calcChainedFollowPos().
//
//-----------------------------------------------------------------------------
void bofFixup() {
//
// The parse tree looks like this ...
// fTree root --.
// / \
// <#end node>
// / \
// rest
// of tree
//
// We will be adding things to the followPos set of the
//
RBBINode bofNode = fRB.fTreeRoots[fRootIx].fLeftChild.fLeftChild;
Assert.assrt(bofNode.fType == RBBINode.leafChar);
Assert.assrt(bofNode.fVal == 2);
// Get all nodes that can be the start a match of the user-written rules
// (excluding the fake bofNode)
// We want the nodes that can start a match in the
// part labeled "rest of tree"
//
Set matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet;
for (RBBINode startNode : matchStartNodes) {
if (startNode.fType != RBBINode.leafChar) {
continue;
}
if (startNode.fVal == bofNode.fVal) {
// We found a leaf node corresponding to a {bof} that was
// explicitly written into a rule.
// Add everything from the followPos set of this node to the
// followPos set of the fake bofNode at the start of the tree.
//
bofNode.fFollowPos.addAll(startNode.fFollowPos);
}
}
}
//-----------------------------------------------------------------------------
//
// buildStateTable() Determine the set of runtime DFA states and the
// transition tables for these states, by the algorithm
// of fig. 3.44 in Aho.
//
// Most of the comments are quotes of Aho's psuedo-code.
//
//-----------------------------------------------------------------------------
void buildStateTable() {
//
// Add a dummy state 0 - the stop state. Not from Aho.
int lastInputSymbol = fRB.fSetBuilder.getNumCharCategories() - 1;
RBBIStateDescriptor failState = new RBBIStateDescriptor(lastInputSymbol);
fDStates.add(failState);
// initially, the only unmarked state in Dstates is firstpos(root),
// where toot is the root of the syntax tree for (r)#;
RBBIStateDescriptor initialState = new RBBIStateDescriptor(lastInputSymbol);
initialState.fPositions.addAll(fRB.fTreeRoots[fRootIx].fFirstPosSet);
fDStates.add(initialState);
// while there is an unmarked state T in Dstates do begin
for (;;) {
RBBIStateDescriptor T = null;
int tx;
for (tx=1; tx U = null;
for (RBBINode p : T.fPositions) {
if ((p.fType == RBBINode.leafChar) && (p.fVal == a)) {
if (U == null) {
U = new HashSet();
}
U.addAll(p.fFollowPos);
}
}
// if U is not empty and not in DStates then
int ux = 0;
boolean UinDstates = false;
if (U != null) {
Assert.assrt(U.size() > 0);
int ix;
for (ix=0; ix endMarkerNodes = new ArrayList();
RBBINode endMarker;
int i;
int n;
fRB.fTreeRoots[fRootIx].findNodes(endMarkerNodes, RBBINode.endMark);
for (i=0; i= 0) {
if (sd.fPositions.contains(endMarker)) {
// Any non-zero value for fAccepting means this is an accepting node.
// The value is what will be returned to the user as the break status.
// If no other value was specified, force it to -1.
if (sd.fAccepting==0) {
// State hasn't been marked as accepting yet. Do it now.
sd.fAccepting = endMarker.fVal;
if (sd.fAccepting == 0) {
sd.fAccepting = -1;
}
}
if (sd.fAccepting==-1 && endMarker.fVal != 0) {
// Both lookahead and non-lookahead accepting for this state.
// Favor the look-ahead. Expedient for line break.
// TODO: need a more elegant resolution for conflicting rules.
sd.fAccepting = endMarker.fVal;
}
// implicit else:
// if sd.fAccepting already had a value other than 0 or -1, leave it be.
// If the end marker node is from a look-ahead rule, set
// the fLookAhead field or this state also.
if (endMarker.fLookAheadEnd) {
// TODO: don't change value if already set?
// TODO: allow for more than one active look-ahead rule in engine.
// Make value here an index to a side array in engine?
sd.fLookAhead = sd.fAccepting;
}
}
}
}
}
//-----------------------------------------------------------------------------
//
// flagLookAheadStates Very similar to flagAcceptingStates, above.
//
//-----------------------------------------------------------------------------
void flagLookAheadStates() {
List lookAheadNodes = new ArrayList();
RBBINode lookAheadNode;
int i;
int n;
fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead);
for (i=0; i tagNodes = new ArrayList();
RBBINode tagNode;
int i;
int n;
fRB.fTreeRoots[fRootIx].findNodes(tagNodes, RBBINode.tag);
for (i=0; i s0 = new TreeSet();
Integer izero = Integer.valueOf(0);
fRB.fStatusSets.put(s0, izero);
SortedSet s1 = new TreeSet();
s1.add(izero);
fRB.fStatusSets.put(s0, izero);
}
// For each state, check whether the state's status tag values are
// already entered into the status values array, and add them if not.
for (n=0; n statusVals = sd.fTagVals;
Integer arrayIndexI = fRB.fStatusSets.get(statusVals);
if (arrayIndexI == null) {
// This is the first encounter of this set of status values.
// Add them to the statusSets map, This map associates
// the set of status values with an index in the runtime status
// values array.
arrayIndexI = Integer.valueOf(fRB.fRuleStatusVals.size());
fRB.fStatusSets.put(statusVals, arrayIndexI);
// Add the new set of status values to the vector of values that
// will eventually become the array used by the runtime engine.
fRB.fRuleStatusVals.add(Integer.valueOf(statusVals.size()));
fRB.fRuleStatusVals.addAll(statusVals);
}
// Save the runtime array index back into the state descriptor.
sd.fTagsIdx = arrayIndexI.intValue();
}
}
//-----------------------------------------------------------------------------
//
// printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
// for each node in the tree.
//
//-----------------------------------------------------------------------------
void printPosSets(RBBINode n) {
if (n==null) {
return;
}
RBBINode.printNode(n);
System.out.print(" Nullable: " + n.fNullable);
System.out.print(" firstpos: ");
printSet(n.fFirstPosSet);
System.out.print(" lastpos: ");
printSet(n.fLastPosSet);
System.out.print(" followpos: ");
printSet(n.fFollowPos);
printPosSets(n.fLeftChild);
printPosSets(n.fRightChild);
}
//-----------------------------------------------------------------------------
//
// getTableSize() Calculate the size in bytes of the runtime form of this
// state transition table.
//
// Note: Refer to common/rbbidata.h from ICU4C for the declarations
// of the structures being matched by this calculation.
//
//-----------------------------------------------------------------------------
int getTableSize() {
int size = 0;
int numRows;
int numCols;
int rowSize;
if (fRB.fTreeRoots[fRootIx] == null) {
return 0;
}
size = /*sizeof(RBBIStateTable) - 4 */ 16; // The header, with no rows to the table.
numRows = fDStates.size();
numCols = fRB.fSetBuilder.getNumCharCategories();
// Note The declaration of RBBIStateTableRow is for a table of two columns.
// Therefore we subtract two from numCols when determining
// how much storage to add to a row for the total columns.
// rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
rowSize = 8 + 2*numCols;
size += numRows * rowSize;
while (size % 8 > 0) { // Size must be multiple of 8 bytes in size.
size++;
}
return size;
}
//-----------------------------------------------------------------------------
//
// exportTable() export the state transition table in the ICU4C format.
//
// Most of the table is 16 bit shorts. This function exports
// the whole thing as an array of shorts.
//
// The size of the array must be rounded up to a multiple of
// 8 bytes.
//
// See struct RBBIStateTable in ICU4C, common/rbbidata.h
//
//-----------------------------------------------------------------------------
short [] exportTable() {
int state;
int col;
if (fRB.fTreeRoots[fRootIx] == null) {
return new short[0];
}
Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
fDStates.size() < 0x7fff);
int numStates = fDStates.size();
// Size of table size in shorts.
// the "4" is the size of struct RBBIStateTableRow, the row header part only.
int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories();
int tableSize = getTableSize() / 2;
short [] table = new short[tableSize];
//
// Fill in the header fields.
// Annoying because they really want to be ints, not shorts.
//
// RBBIStateTable.fNumStates
table[RBBIDataWrapper.NUMSTATES] = (short)(numStates >>> 16);
table[RBBIDataWrapper.NUMSTATES+1] = (short)(numStates & 0x0000ffff);
// RBBIStateTable.fRowLen
table[RBBIDataWrapper.ROWLEN] = (short)(rowLen >>> 16);
table[RBBIDataWrapper.ROWLEN+1] = (short)(rowLen & 0x0000ffff);
// RBBIStateTable.fFlags
int flags = 0;
if (fRB.fLookAheadHardBreak) {
flags |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK;
}
if (fRB.fSetBuilder.sawBOF()) {
flags |= RBBIDataWrapper.RBBI_BOF_REQUIRED;
}
table[RBBIDataWrapper.FLAGS] = (short)(flags >>> 16);
table[RBBIDataWrapper.FLAGS+1] = (short)(flags & 0x0000ffff);
int numCharCategories = fRB.fSetBuilder.getNumCharCategories();
for (state=0; state s) {
for (RBBINode n : s) {
RBBINode.printInt(n.fSerialNum, 8);
}
System.out.println();
}
//-----------------------------------------------------------------------------
//
// printStates Debug Function. Dump the fully constructed state transition table.
//
//-----------------------------------------------------------------------------
void printStates() {
int c; // input "character"
int n; // state number
System.out.print("state | i n p u t s y m b o l s \n");
System.out.print(" | Acc LA Tag");
for (c=0; c tbl = fRB.fRuleStatusVals;
System.out.print("index | tags \n");
System.out.print("-------------------\n");
while (nextRecord < tbl.size()) {
thisRecord = nextRecord;
nextRecord = thisRecord + tbl.get(thisRecord).intValue() + 1;
RBBINode.printInt(thisRecord, 7);
for (i=thisRecord+1; i