com.ibm.icu.text.DictionaryBreakEngine Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support
/*
*******************************************************************************
* Copyright (C) 2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.text.CharacterIterator;
import java.util.BitSet;
import com.ibm.icu.impl.CharacterIteration;
abstract class DictionaryBreakEngine implements LanguageBreakEngine {
/* Helper class for improving readability of the Thai/Lao/Khmer word break
* algorithm.
*/
static class PossibleWord {
// List size, limited by the maximum number of words in the dictionary
// that form a nested sequence.
private final static int POSSIBLE_WORD_LIST_MAX = 20;
//list of word candidate lengths, in increasing length order
private int lengths[];
private int count[]; // Count of candidates
private int prefix; // The longest match with a dictionary word
private int offset; // Offset in the text of these candidates
private int mark; // The preferred candidate's offset
private int current; // The candidate we're currently looking at
// Default constructor
public PossibleWord() {
lengths = new int[POSSIBLE_WORD_LIST_MAX];
count = new int[1]; // count needs to be an array of 1 so that it can be pass as reference
offset = -1;
}
// Fill the list of candidates if needed, select the longest, and return the number found
public int candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd) {
int start = fIter.getIndex();
if (start != offset) {
offset = start;
prefix = dict.matches(fIter, rangeEnd - start, lengths, count, lengths.length);
// Dictionary leaves text after longest prefix, not longest word. Back up.
if (count[0] <= 0) {
fIter.setIndex(start);
}
}
if (count[0] > 0) {
fIter.setIndex(start + lengths[count[0]-1]);
}
current = count[0] - 1;
mark = current;
return count[0];
}
// Select the currently marked candidate, point after it in the text, and invalidate self
public int acceptMarked(CharacterIterator fIter) {
fIter.setIndex(offset + lengths[mark]);
return lengths[mark];
}
// Backup from the current candidate to the next shorter one; return true if that exists
// and point the text after it
public boolean backUp(CharacterIterator fIter) {
if (current > 0) {
fIter.setIndex(offset + lengths[--current]);
return true;
}
return false;
}
// Return the longest prefix this candidate location shares with a dictionary word
public int longestPrefix() {
return prefix;
}
// Mark the current candidate as the one we like
public void markCurrent() {
mark = current;
}
}
/**
* A deque-like structure holding raw ints.
* Partial, limited implementation, only what is needed by the dictionary implementation.
* For internal use only.
* @internal
*/
static class DequeI {
private int[] data = new int[50];
private int lastIdx = 4; // or base of stack. Index of element.
private int firstIdx = 4; // or Top of Stack. Index of element + 1.
int size() {
return firstIdx - lastIdx;
}
boolean isEmpty() {
return size() == 0;
}
private void grow() {
int[] newData = new int[data.length * 2];
System.arraycopy(data, 0, newData, 0, data.length);
data = newData;
}
void offer(int v) {
// Note that the actual use cases of offer() add at most one element.
// We make no attempt to handle more than a few.
assert lastIdx > 0;
data[--lastIdx] = v;
}
void push(int v) {
if (firstIdx >= data.length) {
grow();
}
data[firstIdx++] = v;
}
int pop() {
assert size() > 0;
return data[--firstIdx];
}
int peek() {
assert size() > 0;
return data[firstIdx - 1];
}
int peekLast() {
assert size() > 0;
return data[lastIdx];
}
int pollLast() {
assert size() > 0;
return data[lastIdx++];
}
boolean contains(int v) {
for (int i=lastIdx; i< firstIdx; i++) {
if (data[i] == v) {
return true;
}
}
return false;
}
}
UnicodeSet fSet = new UnicodeSet();
private BitSet fTypes = new BitSet(32);
/**
* @param breakTypes The types of break iterators that can use this engine.
* For example, BreakIterator.KIND_LINE
*/
public DictionaryBreakEngine(Integer... breakTypes) {
for (Integer type: breakTypes) {
fTypes.set(type);
}
}
public boolean handles(int c, int breakType) {
return fTypes.get(breakType) && // this type can use us
fSet.contains(c); // we recognize the character
}
public int findBreaks(CharacterIterator text, int startPos, int endPos,
boolean reverse, int breakType, DequeI foundBreaks) {
int result = 0;
// Find the span of characters included in the set.
// The span to break begins at the current position int the text, and
// extends towards the start or end of the text, depending on 'reverse'.
int start = text.getIndex();
int current;
int rangeStart;
int rangeEnd;
int c = CharacterIteration.current32(text);
if (reverse) {
boolean isDict = fSet.contains(c);
while ((current = text.getIndex()) > startPos && isDict) {
c = CharacterIteration.previous32(text);
isDict = fSet.contains(c);
}
rangeStart = (current < startPos) ? startPos :
current + (isDict ? 0 : 1);
rangeEnd = start + 1;
} else {
while ((current = text.getIndex()) < endPos && fSet.contains(c)) {
CharacterIteration.next32(text);
c = CharacterIteration.current32(text);
}
rangeStart = start;
rangeEnd = current;
}
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
text.setIndex(current);
return result;
}
void setCharacters(UnicodeSet set) {
fSet = new UnicodeSet(set);
fSet.compact();
}
/**
* Divide up a range of known dictionary characters handled by this break engine.
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks Output of break positions. Positions are pushed.
* Pre-existing contents of the output stack are unaltered.
* @return The number of breaks found
*/
abstract int divideUpDictionaryRange(CharacterIterator text,
int rangeStart,
int rangeEnd,
DequeI foundBreaks );
}