com.ibm.icu.text.RBBIRuleBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
// Copyright (C) 2002-2014, International Business Machines Corporation and others.
// All Rights Reserved.
//
//
package com.ibm.icu.text;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.impl.RBBIDataWrapper;
class RBBIRuleBuilder {
// This is the main class for building (compiling) break rules into the tables
// required by the runtime RBBI engine.
//
String fDebugEnv; // controls debug trace output
String fRules; // The rule string that we are compiling
StringBuilder fStrippedRules; // The rule string, with comments stripped.
RBBIRuleScanner fScanner; // The scanner.
//
// There are four separate parse trees generated, one for each of the
// forward rules, reverse rules, safe forward rules and safe reverse rules.
// This array references the root of each of the trees.
// Only fForwardTree data is actually used to generate a state table.
// The other three are retained for back compatibility with old rule files,
// which may have safe and reverse rules. These are still parsed.
//
RBBINode[] fTreeRoots = new RBBINode[4];
static final int fForwardTree = 0; // Indexes into the above fTreeRoots array
static final int fReverseTree = 1; // for each of the trees.
static final int fSafeFwdTree = 2; // (in C, these are pointer variables and
static final int fSafeRevTree = 3; // there is no array.)
int fDefaultTree = fForwardTree; // For rules not qualified with a !
// the tree to which they belong to.
boolean fChainRules; // True for chained Unicode TR style rules.
// False for traditional regexp rules.
boolean fLBCMNoChain; // True: suppress chaining of rules on
// chars with LineBreak property == CM.
boolean fLookAheadHardBreak; // True: Look ahead matches cause an
// immediate break, no continuing for the
// longest match.
RBBISetBuilder fSetBuilder; // Set and Character Category builder.
List fUSetNodes; // Vector of all used nodes.
RBBITableBuilder fForwardTable; // State transition tables
//
// Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.).
//
Map, Integer> fStatusSets = new HashMap<>(); // Status value sets encountered so far.
// Map Key is the set of values.
// Map Value is the runtime array index.
List fRuleStatusVals; // List of Integer objects. Has same layout as the
// runtime array of status (tag) values -
// number of values in group 1
// first status value in group 1
// 2nd status value in group 1
// ...
// number of values in group 2
// first status value in group 2
// etc.
//
// Error codes from ICU4C.
// using these simplified the porting, and consolidated the
// creation of Java exceptions
//
static final int U_ILLEGAL_CHAR_FOUND = 12;
/**< Character conversion: Illegal input sequence/combination of input units. */
static final int U_BRK_ERROR_START = 0x10200;
/**< Start of codes indicating Break Iterator failures */
static final int U_BRK_INTERNAL_ERROR = 0x10201;
/**< An internal error (bug) was detected. */
static final int U_BRK_HEX_DIGITS_EXPECTED = 0x10202;
/**< Hex digits expected as part of a escaped char in a rule. */
static final int U_BRK_SEMICOLON_EXPECTED = 0x10203;
/**< Missing ';' at the end of a RBBI rule. */
static final int U_BRK_RULE_SYNTAX = 0x10204;
/**< Syntax error in RBBI rule. */
static final int U_BRK_UNCLOSED_SET = 0x10205;
/**< UnicodeSet writing an RBBI rule missing a closing ']'. */
static final int U_BRK_ASSIGN_ERROR = 0x10206;
/**< Syntax error in RBBI rule assignment statement. */
static final int U_BRK_VARIABLE_REDFINITION = 0x10207;
/**< RBBI rule $Variable redefined. */
static final int U_BRK_MISMATCHED_PAREN = 0x10208;
/**< Mis-matched parentheses in an RBBI rule. */
static final int U_BRK_NEW_LINE_IN_QUOTED_STRING = 0x10209;
/**< Missing closing quote in an RBBI rule. */
static final int U_BRK_UNDEFINED_VARIABLE = 0x1020a;
/**< Use of an undefined $Variable in an RBBI rule. */
static final int U_BRK_INIT_ERROR = 0x1020b;
/**< Initialization failure. Probable missing ICU Data. */
static final int U_BRK_RULE_EMPTY_SET = 0x1020c;
/**< Rule contains an empty Unicode Set. */
static final int U_BRK_UNRECOGNIZED_OPTION = 0x1020d;
/**< !!option in RBBI rules not recognized. */
static final int U_BRK_MALFORMED_RULE_TAG = 0x1020e;
/**< The {nnn} tag on a rule is mal formed */
static final int U_BRK_MALFORMED_SET = 0x1020f;
static final int U_BRK_ERROR_LIMIT = 0x10210;
/**< This must always be the last value to indicate the limit for Break Iterator failures */
//----------------------------------------------------------------------------------------
//
// Constructor.
//
//----------------------------------------------------------------------------------------
RBBIRuleBuilder(String rules)
{
fDebugEnv = ICUDebug.enabled("rbbi") ?
ICUDebug.value("rbbi") : null;
fRules = rules;
fStrippedRules = new StringBuilder(rules);
fUSetNodes = new ArrayList<>();
fRuleStatusVals = new ArrayList<>();
fScanner = new RBBIRuleScanner(this);
fSetBuilder = new RBBISetBuilder(this);
}
//----------------------------------------------------------------------------------------
//
// flattenData() - Collect up the compiled RBBI rule data and put it into
// the format for saving in ICU data files,
//
// See the ICU4C file common/rbidata.h for a detailed description.
//
//----------------------------------------------------------------------------------------
static final int align8(int i)
{
return (i + 7) & 0xfffffff8;
}
void flattenData(OutputStream os) throws IOException {
DataOutputStream dos = new DataOutputStream(os);
int i;
// Remove whitespace from the rules to make it smaller.
// The rule parser has already removed comments.
String strippedRules = RBBIRuleScanner.stripRules(fStrippedRules.toString());
// Calculate the size of each section in the data in bytes.
// Sizes here are padded up to a multiple of 8 for better memory alignment.
// Sections sizes actually stored in the header are for the actual data
// without the padding.
//
int headerSize = RBBIDataWrapper.DH_SIZE * 4; // align8(sizeof(RBBIDataHeader));
int forwardTableSize = align8(fForwardTable.getTableSize());
int reverseTableSize = align8(fForwardTable.getSafeTableSize());
int trieSize = align8(fSetBuilder.getTrieSize());
int statusTableSize = align8(fRuleStatusVals.size() * 4);
byte[] strippedRulesUTF8 = strippedRules.getBytes(StandardCharsets.UTF_8);
int rulesSize = align8(strippedRulesUTF8.length + 1);
int totalSize = headerSize
+ forwardTableSize
+ reverseTableSize
+ statusTableSize + trieSize + rulesSize;
int outputPos = 0; // Track stream position, starting from RBBIDataHeader.
//
// Write out an ICU Data Header
//
ICUBinary.writeHeader(RBBIDataWrapper.DATA_FORMAT, RBBIDataWrapper.FORMAT_VERSION, 0, dos);
//
// Write out the RBBIDataHeader
//
int[] header = new int[RBBIDataWrapper.DH_SIZE]; // sizeof struct RBBIDataHeader
header[RBBIDataWrapper.DH_MAGIC] = 0xb1a0;
header[RBBIDataWrapper.DH_FORMATVERSION] = RBBIDataWrapper.FORMAT_VERSION;
header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections.
header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories();
header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable
header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fTableLen
header[RBBIDataWrapper.DH_RTABLE] = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable
header[RBBIDataWrapper.DH_RTABLELEN] = reverseTableSize; // fRTableLen
header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_RTABLE]
+ header[RBBIDataWrapper.DH_RTABLELEN]; // fTrie
header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen
header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE]
+ trieSize;
header[RBBIDataWrapper.DH_STATUSTABLELEN] = statusTableSize; // fStatusTableLen
header[RBBIDataWrapper.DH_RULESOURCE] = header[RBBIDataWrapper.DH_STATUSTABLE]
+ statusTableSize;
header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRulesUTF8.length;
for (i = 0; i < header.length; i++) {
dos.writeInt(header[i]);
outputPos += 4;
}
// Write out the actual state tables.
RBBIDataWrapper.RBBIStateTable table = fForwardTable.exportTable();
assert(outputPos == header[RBBIDataWrapper.DH_FTABLE]);
outputPos += table.put(dos);
table = fForwardTable.exportSafeTable();
Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RTABLE]);
outputPos += table.put(dos);
// write out the Trie table
Assert.assrt(outputPos == header[RBBIDataWrapper.DH_TRIE]);
fSetBuilder.serializeTrie(os);
outputPos += header[RBBIDataWrapper.DH_TRIELEN];
while (outputPos % 8 != 0) { // pad to an 8 byte boundary
dos.write(0);
outputPos += 1;
}
// Write out the status {tag} table.
Assert.assrt(outputPos == header[RBBIDataWrapper.DH_STATUSTABLE]);
for (Integer val : fRuleStatusVals) {
dos.writeInt(val.intValue());
outputPos += 4;
}
while (outputPos % 8 != 0) { // pad to an 8 byte boundary
dos.write(0);
outputPos += 1;
}
// Write out the stripped rules (rules with extra spaces removed
// These go last in the data area, even though they are not last in the header.
Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RULESOURCE]);
dos.write(strippedRulesUTF8, 0, strippedRulesUTF8.length);
dos.write(0); // Null termination
outputPos += strippedRulesUTF8.length + 1;
while (outputPos % 8 != 0) { // pad to an 8 byte boundary
dos.write(0);
outputPos += 1;
}
}
//----------------------------------------------------------------------------------------
//
// compileRules compile source rules, placing the compiled form into a output stream
// The compiled form is identical to that from ICU4C (Big Endian).
//
//----------------------------------------------------------------------------------------
static void compileRules(String rules, OutputStream os) throws IOException
{
//
// Read the input rules, generate a parse tree, symbol table,
// and list of all Unicode Sets referenced by the rules.
//
RBBIRuleBuilder builder = new RBBIRuleBuilder(rules);
builder.build(os);
}
/**
* Compile rules to the binary form, write that to an output stream.
*
*/
void build(OutputStream os) throws IOException {
fScanner.parse();
//
// UnicodeSet processing.
// Munge the Unicode Sets to create an initial set of character categories.
//
fSetBuilder.buildRanges();
//
// Generate the DFA state transition table.
//
fForwardTable = new RBBITableBuilder(this, fForwardTree);
fForwardTable.buildForwardTable();
// State table and character category optimization.
// Merge equivalent rows and columns.
// Note that this process alters the the initial set of character categories,
// causing the representation of UnicodeSets in the parse tree to become invalid.
optimizeTables();
fForwardTable.buildSafeReverseTable();
if (fDebugEnv != null
&& fDebugEnv.indexOf("states") >= 0) {
fForwardTable.printStates();
fForwardTable.printRuleStatusTable();
fForwardTable.printReverseTable();
}
// Generate the mapping tables (TRIE) from input code points to
// the character categories.
//
fSetBuilder.buildTrie();
//
// Package up the compiled data, writing it to an output stream
// in the serialization format. This is the same as the ICU4C runtime format.
//
flattenData(os);
}
static class IntPair {
int first = 0;
int second = 0;
IntPair() {};
IntPair(int f, int s) {
first = f;
second = s;
}
}
void optimizeTables() {
boolean didSomething;
do {
didSomething = false;
// Begin looking for duplicates with char class 3.
// Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
// and should not have other categories merged into them.
IntPair duplPair = new IntPair(3, 0);
while (fForwardTable.findDuplCharClassFrom(duplPair)) {
fSetBuilder.mergeCategories(duplPair);
fForwardTable.removeColumn(duplPair.second);
didSomething = true;
}
while (fForwardTable.removeDuplicateStates() > 0) {
didSomething = true;
};
} while (didSomething);
}
}