com.ibm.icu.text.DictionaryBreakEngine Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of virtdata-lib-realer Show documentation
Show all versions of virtdata-lib-realer Show documentation
With inspiration from other libraries
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.text.CharacterIterator;
import java.util.BitSet;
import com.ibm.icu.impl.CharacterIteration;
abstract class DictionaryBreakEngine implements LanguageBreakEngine {
/* Helper class for improving readability of the Thai/Lao/Khmer word break
* algorithm.
*/
static class PossibleWord {
// List size, limited by the maximum number of words in the dictionary
// that form a nested sequence.
private final static int POSSIBLE_WORD_LIST_MAX = 20;
//list of word candidate lengths, in increasing length order
private int lengths[];
private int count[]; // Count of candidates
private int prefix; // The longest match with a dictionary word
private int offset; // Offset in the text of these candidates
private int mark; // The preferred candidate's offset
private int current; // The candidate we're currently looking at
// Default constructor
public PossibleWord() {
lengths = new int[POSSIBLE_WORD_LIST_MAX];
count = new int[1]; // count needs to be an array of 1 so that it can be pass as reference
offset = -1;
}
// Fill the list of candidates if needed, select the longest, and return the number found
public int candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd) {
int start = fIter.getIndex();
if (start != offset) {
offset = start;
prefix = dict.matches(fIter, rangeEnd - start, lengths, count, lengths.length);
// Dictionary leaves text after longest prefix, not longest word. Back up.
if (count[0] <= 0) {
fIter.setIndex(start);
}
}
if (count[0] > 0) {
fIter.setIndex(start + lengths[count[0]-1]);
}
current = count[0] - 1;
mark = current;
return count[0];
}
// Select the currently marked candidate, point after it in the text, and invalidate self
public int acceptMarked(CharacterIterator fIter) {
fIter.setIndex(offset + lengths[mark]);
return lengths[mark];
}
// Backup from the current candidate to the next shorter one; return true if that exists
// and point the text after it
public boolean backUp(CharacterIterator fIter) {
if (current > 0) {
fIter.setIndex(offset + lengths[--current]);
return true;
}
return false;
}
// Return the longest prefix this candidate location shares with a dictionary word
public int longestPrefix() {
return prefix;
}
// Mark the current candidate as the one we like
public void markCurrent() {
mark = current;
}
}
/**
* A deque-like structure holding raw ints.
* Partial, limited implementation, only what is needed by the dictionary implementation.
* For internal use only.
* @internal
*/
static class DequeI {
private int[] data = new int[50];
private int lastIdx = 4; // or base of stack. Index of element.
private int firstIdx = 4; // or Top of Stack. Index of element + 1.
int size() {
return firstIdx - lastIdx;
}
boolean isEmpty() {
return size() == 0;
}
private void grow() {
int[] newData = new int[data.length * 2];
System.arraycopy(data, 0, newData, 0, data.length);
data = newData;
}
void offer(int v) {
// Note that the actual use cases of offer() add at most one element.
// We make no attempt to handle more than a few.
assert lastIdx > 0;
data[--lastIdx] = v;
}
void push(int v) {
if (firstIdx >= data.length) {
grow();
}
data[firstIdx++] = v;
}
int pop() {
assert size() > 0;
return data[--firstIdx];
}
int peek() {
assert size() > 0;
return data[firstIdx - 1];
}
int peekLast() {
assert size() > 0;
return data[lastIdx];
}
int pollLast() {
assert size() > 0;
return data[lastIdx++];
}
boolean contains(int v) {
for (int i=lastIdx; i< firstIdx; i++) {
if (data[i] == v) {
return true;
}
}
return false;
}
}
UnicodeSet fSet = new UnicodeSet();
private BitSet fTypes = new BitSet(32);
/**
* @param breakTypes The types of break iterators that can use this engine.
* For example, BreakIterator.KIND_LINE
*/
public DictionaryBreakEngine(Integer... breakTypes) {
for (Integer type: breakTypes) {
fTypes.set(type);
}
}
@Override
public boolean handles(int c, int breakType) {
return fTypes.get(breakType) && // this type can use us
fSet.contains(c); // we recognize the character
}
@Override
public int findBreaks(CharacterIterator text, int startPos, int endPos,
boolean reverse, int breakType, DequeI foundBreaks) {
int result = 0;
// Find the span of characters included in the set.
// The span to break begins at the current position int the text, and
// extends towards the start or end of the text, depending on 'reverse'.
int start = text.getIndex();
int current;
int rangeStart;
int rangeEnd;
int c = CharacterIteration.current32(text);
if (reverse) {
boolean isDict = fSet.contains(c);
while ((current = text.getIndex()) > startPos && isDict) {
c = CharacterIteration.previous32(text);
isDict = fSet.contains(c);
}
rangeStart = (current < startPos) ? startPos :
current + (isDict ? 0 : 1);
rangeEnd = start + 1;
} else {
while ((current = text.getIndex()) < endPos && fSet.contains(c)) {
CharacterIteration.next32(text);
c = CharacterIteration.current32(text);
}
rangeStart = start;
rangeEnd = current;
}
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
text.setIndex(current);
return result;
}
void setCharacters(UnicodeSet set) {
fSet = new UnicodeSet(set);
fSet.compact();
}
/**
* Divide up a range of known dictionary characters handled by this break engine.
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks Output of break positions. Positions are pushed.
* Pre-existing contents of the output stack are unaltered.
* @return The number of breaks found
*/
abstract int divideUpDictionaryRange(CharacterIterator text,
int rangeStart,
int rangeEnd,
DequeI foundBreaks );
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy