com.ibm.icu.text.CjkBreakEngine Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support
/*
*******************************************************************************
* Copyright (C) 2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import static com.ibm.icu.impl.CharacterIteration.DONE32;
import static com.ibm.icu.impl.CharacterIteration.current32;
import static com.ibm.icu.impl.CharacterIteration.next32;
import java.io.IOException;
import java.text.CharacterIterator;
import java.util.Stack;
import com.ibm.icu.impl.Assert;
class CjkBreakEngine implements LanguageBreakEngine {
private static final UnicodeSet fHangulWordSet = new UnicodeSet();
private static final UnicodeSet fHanWordSet = new UnicodeSet();
private static final UnicodeSet fKatakanaWordSet = new UnicodeSet();
private static final UnicodeSet fHiraganaWordSet = new UnicodeSet();
static {
fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]");
fHanWordSet.applyPattern("[:Han:]");
fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]");
fHiraganaWordSet.applyPattern("[:Hiragana:]");
// freeze them all
fHangulWordSet.freeze();
fHanWordSet.freeze();
fKatakanaWordSet.freeze();
fHiraganaWordSet.freeze();
}
private final UnicodeSet fWordSet;
private DictionaryMatcher fDictionary = null;
public CjkBreakEngine(boolean korean) throws IOException {
fDictionary = DictionaryData.loadDictionaryFor("Hira");
if (korean) {
fWordSet = fHangulWordSet;
} else {
fWordSet = new UnicodeSet();
fWordSet.addAll(fHanWordSet);
fWordSet.addAll(fKatakanaWordSet);
fWordSet.addAll(fHiraganaWordSet);
fWordSet.add("\\uff70\\u30fc");
}
}
public boolean handles(int c, int breakType) {
return (breakType == BreakIterator.KIND_WORD) &&
(fWordSet.contains(c));
}
private static final int kMaxKatakanaLength = 8;
private static final int kMaxKatakanaGroupLength = 20;
private static final int maxSnlp = 255;
private static final int kint32max = Integer.MAX_VALUE;
private static int getKatakanaCost(int wordlength) {
int katakanaCost[] = new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 };
return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength];
}
private static boolean isKatakana(int value) {
return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
(value >= 0xFF66 && value <= 0xFF9F);
}
public int findBreaks(CharacterIterator inText, int startPos, int endPos,
boolean reverse, int breakType, Stack foundBreaks) {
if (startPos >= endPos) {
return 0;
}
inText.setIndex(startPos);
int inputLength = endPos - startPos;
int[] charPositions = new int[inputLength + 1];
StringBuffer s = new StringBuffer("");
inText.setIndex(startPos);
while (inText.getIndex() < endPos) {
s.append(inText.current());
inText.next();
}
String prenormstr = s.toString();
boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
CharacterIterator text = inText;
int numChars = 0;
if (isNormalized) {
int index = 0;
charPositions[0] = 0;
while (index < prenormstr.length()) {
int codepoint = prenormstr.codePointAt(index);
index += Character.charCount(codepoint);
numChars++;
charPositions[numChars] = index;
}
} else {
String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
text = new java.text.StringCharacterIterator(normStr);
charPositions = new int[normStr.length() + 1];
Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0);
int index = 0;
charPositions[0] = 0;
while (index < normalizer.endIndex()) {
normalizer.next();
numChars++;
index = normalizer.getIndex();
charPositions[numChars] = index;
}
}
// From here on out, do the algorithm. Note that our indices
// refer to indices within the normalized string.
int[] bestSnlp = new int[numChars + 1];
bestSnlp[0] = 0;
for (int i = 1; i <= numChars; i++) {
bestSnlp[i] = kint32max;
}
int[] prev = new int[numChars + 1];
for (int i = 0; i <= numChars; i++) {
prev[i] = -1;
}
final int maxWordSize = 20;
int values[] = new int[numChars];
int lengths[] = new int[numChars];
// dynamic programming to find the best segmentation
boolean is_prev_katakana = false;
for (int i = 0; i < numChars; i++) {
text.setIndex(i);
if (bestSnlp[i] == kint32max) {
continue;
}
int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
int[] count_ = new int[1];
fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
int count = count_[0];
// if there are no single character matches found in the dictionary
// starting with this character, treat character as a 1-character word
// with the highest value possible (i.e. the least likely to occur).
// Exclude Korean characters from this treatment, as they should be
// left together by default.
if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
values[count] = maxSnlp;
lengths[count] = 1;
count++;
}
for (int j = 0; j < count; j++) {
int newSnlp = bestSnlp[i] + values[j];
if (newSnlp < bestSnlp[lengths[j] + i]) {
bestSnlp[lengths[j] + i] = newSnlp;
prev[lengths[j] + i] = i;
}
}
// In Japanese, single-character Katakana words are pretty rare.
// So we apply the following heuristic to Katakana: any continuous
// run of Katakana characters is considered a candidate word with
// a default cost specified in the katakanaCost table according
// to its length.
text.setIndex(i);
boolean is_katakana = isKatakana(current32(text));
if (!is_prev_katakana && is_katakana) {
int j = i + 1;
next32(text);
while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
next32(text);
++j;
}
if ((j - i) < kMaxKatakanaGroupLength) {
int newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
if (newSnlp < bestSnlp[j]) {
bestSnlp[j] = newSnlp;
prev[j] = i;
}
}
}
is_prev_katakana = is_katakana;
}
int t_boundary[] = new int[numChars + 1];
int numBreaks = 0;
if (bestSnlp[numChars] == kint32max) {
t_boundary[numBreaks] = numChars;
numBreaks++;
} else {
for (int i = numChars; i > 0; i = prev[i]) {
t_boundary[numBreaks] = i;
numBreaks++;
}
Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0);
}
if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) {
t_boundary[numBreaks++] = 0;
}
for (int i = numBreaks - 1; i >= 0; i--) {
int pos = charPositions[t_boundary[i]] + startPos;
if (!(foundBreaks.contains(pos) || pos == startPos))
foundBreaks.push(charPositions[t_boundary[i]] + startPos);
}
if (!foundBreaks.empty() && foundBreaks.peek() == endPos)
foundBreaks.pop();
if (!foundBreaks.empty())
inText.setIndex(foundBreaks.peek());
return 0;
}
}