com.ibm.icu.text.BreakCTDictionary Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support
/*
*******************************************************************************
* Copyright (C) 1996-2011, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.CharacterIterator;
import com.ibm.icu.impl.ICUBinary;
/*
* This is a class used to load in the compact trie dictionary file
* used for dictionary based break iteration.
*/
class BreakCTDictionary {
private CompactTrieHeader fData;
static class CompactTrieHeader {
int size; // Size of data in bytes
int magic; // Magic number (including versions)
int nodeCount; // Number of entries in offsets[]
int root; // Node number of the root node
int []offset; // Offsets to nodes from start of data
CompactTrieHeader() {
size = 0;
magic = 0;
nodeCount = 0;
root = 0;
offset = null;
}
}
static final class CompactTrieNodeFlags {
static final int kVerticalNode = 0x1000; // This is a vertical node
static final int kParentEndsWord = 0x2000; // The node whose equal link
// points to this ends a
// word
static final int kReservedFlag1 = 0x4000;
static final int kReservedFlag2 = 0x8000;
static final int kCountMask = 0x0FFF; // The count portion of
// flagscount
static final int kFlagMask = 0xF000; // The flags portion of
// flagscount
}
// The two node types are distinguished by the kVerticalNode flag.
static class CompactTrieHorizontalNode {
char ch; // UChar
int equal; // Equal link node index
CompactTrieHorizontalNode(char newCh, int newEqual) {
ch = newCh;
equal = newEqual;
}
}
static class CompactTrieVerticalNode {
int equal; // Equal link node index
char chars[]; // Code units
CompactTrieVerticalNode() {
equal = 0;
chars = null;
}
}
private CompactTrieNodes getCompactTrieNode(int node) {
return nodes[node];
}
// private class to hold both node information
static class CompactTrieNodes {
short flagscount; // Count of sub-entries, plus flags
CompactTrieHorizontalNode[] hnode;
CompactTrieVerticalNode vnode;
CompactTrieNodes() {
flagscount = 0;
hnode = null;
vnode = null;
}
}
private CompactTrieNodes[] nodes;
// Constructor
public BreakCTDictionary(InputStream is) throws IOException {
ICUBinary.readHeader(is, DATA_FORMAT_ID, null);
DataInputStream in = new DataInputStream(is);
// Get header information
fData = new CompactTrieHeader();
fData.size = in.readInt();
fData.magic = in.readInt();
fData.nodeCount = in.readShort();
fData.root = in.readShort();
loadBreakCTDictionary(in);
}
// Loads the compact trie dictionary file into the CompactTrieNodes
private void loadBreakCTDictionary(DataInputStream in) throws IOException {
// skip over offset information
for (int i = 0; i < fData.nodeCount; i++) {
in.readInt();
}
// Create compact trie dictionary
nodes = new CompactTrieNodes[fData.nodeCount];
nodes[0] = new CompactTrieNodes();
// Load in compact trie dictionary
for (int j = 1; j < fData.nodeCount; j++) {
nodes[j] = new CompactTrieNodes();
nodes[j].flagscount = in.readShort();
int count = nodes[j].flagscount & CompactTrieNodeFlags.kCountMask;
if (count != 0) {
boolean isVerticalNode = (nodes[j].flagscount & CompactTrieNodeFlags.kVerticalNode) != 0;
// Vertical node
if (isVerticalNode) {
nodes[j].vnode = new CompactTrieVerticalNode();
nodes[j].vnode.equal = in.readShort();
nodes[j].vnode.chars = new char[count];
for (int l = 0; l < count; l++) {
nodes[j].vnode.chars[l] = in.readChar();
}
} else { // Horizontal node
nodes[j].hnode = new CompactTrieHorizontalNode[count];
for (int n = 0; n < count; n++) {
nodes[j].hnode[n] = new CompactTrieHorizontalNode(in
.readChar(), in.readShort());
}
}
}
}
}
/**
* Find dictionary words that match the text.
*
* @param text A CharacterIterator representing the text. The iterator is
* left after the longest prefix match in the dictionary.
* @param maxLength The maximum number of code units to match.
* @param lengths An array that is filled with the lengths of words that matched.
* @param count Filled with the number of elements output in lengths.
* @param limit The size of the lengths array; this limits the number of words output.
* @return The number of characters in text that were matched.
*/
public int matches(CharacterIterator text, int maxLength, int lengths[],
int count[], int limit) {
// Current implementation works in UTF-16 space
CompactTrieNodes node = getCompactTrieNode(fData.root);
int mycount = 0;
char uc = text.current();
int i = 0;
boolean exitFlag = false;
while (node != null) {
// Check if the node we just exited ends a word
if (limit > 0
&& (node.flagscount & CompactTrieNodeFlags.kParentEndsWord) != 0) {
lengths[mycount++] = i;
--limit;
}
// Check that we haven't exceeded the maximum number of input
// characters.
// We have to do that here rather than in the while condition so
// that
// we can check for ending of a word above.
if (i >= maxLength) {
break;
}
int nodeCount = (node.flagscount & CompactTrieNodeFlags.kCountMask);
if (nodeCount == 0) {
// Special terminal node; return now
break;
}
if ((node.flagscount & CompactTrieNodeFlags.kVerticalNode) != 0) {
// Vertical node; check all the characters in it
CompactTrieVerticalNode vnode = node.vnode;
for (int j = 0; j < nodeCount && i < maxLength; j++) {
if (uc != vnode.chars[j]) {
// We hit a non equal character return;
exitFlag = true;
break;
}
text.next();
uc = text.current();
i++;
}
if (exitFlag) {
break;
}
// To get here, we must have come through the whole list successfully;
// go on to the next node. Note that a word cannot end in the middle
// of a vertical node.
node = getCompactTrieNode(vnode.equal);
} else {
// Horizontal node; do binary search
CompactTrieHorizontalNode[] hnode = node.hnode;
int low = 0;
int high = nodeCount - 1;
int middle;
node = null; // If we don't find a match, we'll fall out of the loop
while (high >= low) {
middle = (high + low) >>> 1;
if (uc == hnode[middle].ch) {
// We hit a match; get the next node and next character
node = getCompactTrieNode(hnode[middle].equal);
text.next();
uc = text.current();
i++;
break;
} else if (uc < hnode[middle].ch) {
high = middle - 1;
} else {
low = middle + 1;
}
}
}
}
count[0] = mycount;
return i;
}
// Use for reading the header portion of the file
private static final byte DATA_FORMAT_ID[] = { (byte) 0x54, (byte) 0x72,
(byte) 0x44, (byte) 0x63 };
}