com.ibm.icu.impl.coll.FCDIterCollationIterator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Components for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support.
The newest version!
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* FCDIterCollationIterator.java, ported from uitercollationiterator.h/.cpp
*
* C++ version created on: 2012sep23 (from utf16collationiterator.h)
* created by: Markus W. Scherer
*/
package com.ibm.icu.impl.coll;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.text.UCharacterIterator;
/**
* Incrementally checks the input text for FCD and normalizes where necessary.
*/
public final class FCDIterCollationIterator extends IterCollationIterator {
/**
 * Creates an iterator that starts out in the forward-checking state.
 *
 * @param data collation data; its {@code nfcImpl} member is retained for
 *             FCD lookups and for normalizing segments
 * @param numeric handed through unchanged to the superclass constructor
 * @param ui the text iterator, handed through to the superclass constructor
 * @param startIndex initial value of the segment {@code start} index
 */
public FCDIterCollationIterator(CollationData data, boolean numeric,
        UCharacterIterator ui, int startIndex) {
    super(data, numeric, ui);
    this.nfcImpl = data.nfcImpl;
    this.start = startIndex;
    this.state = State.ITER_CHECK_FWD;
}
/**
 * Repositions the iterator via the superclass and restarts incremental
 * forward FCD checking from the new offset.
 *
 * @param newOffset offset passed to the superclass; it also becomes the new
 *                  segment {@code start}
 */
@Override
public void resetToOffset(int newOffset) {
    super.resetToOffset(newOffset);
    this.state = State.ITER_CHECK_FWD;
    this.start = newOffset;
}
/**
 * Reports the current offset in the input text.
 *
 * @return the text iterator's index while checking in either direction;
 *         {@code pos} while inside an FCD segment; and, while inside a
 *         normalized segment, {@code start} when {@code pos} is 0 and
 *         {@code limit} otherwise
 */
@Override
public int getOffset() {
    switch(state) {
    case ITER_CHECK_FWD:
    case ITER_CHECK_BWD:
        // Incremental checking: the text iterator itself holds the position.
        return iter.getIndex();
    case ITER_IN_FCD_SEGMENT:
        // pos is a text index in this state.
        return pos;
    default:
        // Normalized segment: pos indexes the normalized buffer, so only the
        // segment boundaries can be reported as text offsets.
        return (pos == 0) ? start : limit;
    }
}
/**
 * Returns the next code point, moving forward.
 * Depending on the current state, the code point is read from the text
 * iterator while checking FCD incrementally, from the text iterator inside an
 * already checked FCD segment, or from the normalized buffer. When none of
 * these applies, the state is switched with switchToForward() and the
 * dispatch is retried.
 *
 * @return the next code point; a negative value returned by the text iterator
 *         while checking forward is passed through unchanged
 */
@Override
public int nextCodePoint() {
int c;
// Each pass either returns a code point or changes the state and retries.
for(;;) {
if(state == State.ITER_CHECK_FWD) {
// Incremental forward check: read a single code unit.
c = iter.next();
if(c < 0) {
return c;
}
if(CollationFCD.hasTccc(c)) {
// c is flagged by hasTccc(). The segment-level check is only needed if c
// may be a Tibetan composite vowel or the following code unit is flagged
// by hasLccc().
if(CollationFCD.maybeTibetanCompositeVowel(c) ||
CollationFCD.hasLccc(iter.current())) {
// Un-read c so that nextSegment() starts collecting at it.
iter.previous();
// nextSegment() as written in this class always returns true,
// so this early return is defensive.
if(!nextSegment()) {
return Collation.SENTINEL_CP;
}
// The state has changed; dispatch again.
continue;
}
}
if(isLeadSurrogate(c)) {
// Try to pair the lead surrogate with the code unit that follows it.
int trail = iter.next();
if(isTrailSurrogate(trail)) {
return Character.toCodePoint((char)c, (char)trail);
} else if(trail >= 0) {
// A code unit was read but it is not a trail surrogate: un-read it.
iter.previous();
}
}
// Single code unit (possibly an unpaired surrogate).
return c;
} else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
// Inside a checked FCD segment: read directly from the text and keep
// pos (a text index in this state) in step with the iterator.
c = iter.nextCodePoint();
pos += Character.charCount(c);
assert(c >= 0);
return c;
} else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 &&
pos != normalized.length()) {
// Inside a normalized segment: pos indexes the normalized buffer.
c = normalized.codePointAt(pos);
pos += Character.charCount(c);
return c;
} else {
// The current segment is exhausted, or we were checking backward.
switchToForward();
}
}
}
/**
 * Returns the previous code point, moving backward.
 * Mirror image of nextCodePoint(): reads from the text iterator while
 * checking FCD incrementally backward, from the text iterator inside an
 * already checked FCD segment, or from the normalized buffer. When none of
 * these applies, the state is switched with switchToBackward() and the
 * dispatch is retried.
 *
 * @return the previous code point, or Collation.SENTINEL_CP when the text
 *         iterator returns a negative value while checking backward
 */
@Override
public int previousCodePoint() {
int c;
// Each pass either returns a code point or changes the state and retries.
for(;;) {
if(state == State.ITER_CHECK_BWD) {
// Incremental backward check: read a single code unit.
c = iter.previous();
if(c < 0) {
// The iterator returned a negative value (presumably the start of the
// text was reached): record an empty position at index 0.
start = pos = 0;
state = State.ITER_IN_FCD_SEGMENT;
return Collation.SENTINEL_CP;
}
if(CollationFCD.hasLccc(c)) {
// prev keeps the sentinel value (tested below with prev < 0 / prev >= 0)
// until the preceding code unit is actually read.
int prev = Collation.SENTINEL_CP;
// Note: the right-hand operand reads the preceding code unit into prev,
// but only when the Tibetan-vowel test on the left is false.
if(CollationFCD.maybeTibetanCompositeVowel(c) ||
CollationFCD.hasTccc(prev = iter.previous())) {
// Segment-level check needed. Move the iterator back to just after c:
// once to undo the read of prev (only if a code unit was read), and
// once to undo the read of c.
iter.next();
if(prev >= 0) {
iter.next();
}
// previousSegment() as written in this class always returns true,
// so this early return is defensive.
if(!previousSegment()) {
return Collation.SENTINEL_CP;
}
// The state has changed; dispatch again.
continue;
}
// hasLccc(trail)=true for all trail surrogates
if(isTrailSurrogate(c)) {
// Try to pair the trail surrogate with the code unit before it.
if(prev < 0) {
prev = iter.previous();
}
if(isLeadSurrogate(prev)) {
return Character.toCodePoint((char)prev, (char)c);
}
}
if(prev >= 0) {
// prev was read only for inspection; step forward again so that the
// iterator sits just before c.
iter.next();
}
}
// Single code unit (possibly an unpaired surrogate).
return c;
} else if(state == State.ITER_IN_FCD_SEGMENT && pos != start) {
// Inside a checked FCD segment: read directly from the text and keep
// pos (a text index in this state) in step with the iterator.
c = iter.previousCodePoint();
pos -= Character.charCount(c);
assert(c >= 0);
return c;
} else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos != 0) {
// Inside a normalized segment: pos indexes the normalized buffer.
c = normalized.codePointBefore(pos);
pos -= Character.charCount(c);
return c;
} else {
// The current segment is exhausted, or we were checking forward.
switchToBackward();
}
}
}
/**
 * Reads the next UTF-16 code unit, moving forward, and returns it together
 * with the value that the trie yields for it, packed by
 * makeCodePointAndCE32Pair().
 * Unlike nextCodePoint(), this method always consumes exactly one code unit;
 * a following trail surrogate is presumably fetched by the caller through
 * handleGetTrailSurrogate().
 *
 * @return the packed (code unit, CE32) pair; NO_CP_AND_CE32 when the text
 *         iterator returns a negative value while checking forward;
 *         Collation.FALLBACK_CE32 if nextSegment() reports failure
 */
@Override
protected long handleNextCE32() {
    int c;
    // Each pass either obtains a code unit (break) or changes the state and retries.
    for(;;) {
        if(state == State.ITER_CHECK_FWD) {
            // Incremental forward check: read a single code unit.
            c = iter.next();
            if(c < 0) {
                return NO_CP_AND_CE32;
            }
            if(CollationFCD.hasTccc(c)) {
                // The segment-level check is only needed if c may be a Tibetan
                // composite vowel or the following code unit is flagged by hasLccc().
                if(CollationFCD.maybeTibetanCompositeVowel(c) ||
                        CollationFCD.hasLccc(iter.current())) {
                    // Un-read c so that nextSegment() starts collecting at it.
                    iter.previous();
                    // nextSegment() as written in this class always returns true,
                    // so this early return is defensive.
                    if(!nextSegment()) {
                        return Collation.FALLBACK_CE32;
                    }
                    // The state has changed; dispatch again.
                    continue;
                }
            }
            break;
        } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
            // Inside a checked FCD segment: one code unit, so pos advances by one.
            c = iter.next();
            ++pos;
            assert(c >= 0);
            break;
        } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 &&
                pos != normalized.length()) {
            // Inside a normalized segment: pos indexes the normalized buffer.
            c = normalized.charAt(pos++);
            break;
        } else {
            // The current segment is exhausted, or we were checking backward.
            switchToForward();
        }
    }
    return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead((char)c));
}
/**
 * Reads the code unit at the current position and consumes it only if it is
 * a trail (low) surrogate.
 *
 * @return the code unit that was inspected, cast to {@code char}; when
 *         reading from the text iterator this is the cast of whatever
 *         {@code iter.next()} returned, including a negative value
 */
@Override
protected char handleGetTrailSurrogate() {
    if(state.compareTo(State.ITER_IN_FCD_SEGMENT) > 0) {
        // Normalized segment: peek into the normalized buffer at pos.
        assert(pos < normalized.length());
        final char candidate = normalized.charAt(pos);
        if(Character.isLowSurrogate(candidate)) {
            pos++;
        }
        return candidate;
    }

    // Checking states or FCD segment: read from the text iterator.
    final int unit = iter.next();
    if(!isTrailSurrogate(unit)) {
        // Not a trail surrogate: un-read it if a code unit was actually read.
        if(unit >= 0) {
            iter.previous();
        }
    } else if(state == State.ITER_IN_FCD_SEGMENT) {
        // pos mirrors the text index only inside an FCD segment.
        pos++;
    }
    return (char)unit;
}
/**
 * Moves forward by up to {@code num} code points, stopping early as soon as
 * {@link #nextCodePoint()} returns a negative value.
 *
 * @param num maximum number of code points to skip; nothing is read when it
 *            is not positive
 */
@Override
protected void forwardNumCodePoints(int num) {
    for(int remaining = num; remaining > 0; --remaining) {
        if(nextCodePoint() < 0) {
            break;
        }
    }
}
/**
 * Moves backward by up to {@code num} code points, stopping early as soon as
 * {@link #previousCodePoint()} returns a negative value.
 *
 * @param num maximum number of code points to skip; nothing is read when it
 *            is not positive
 */
@Override
protected void backwardNumCodePoints(int num) {
    for(int remaining = num; remaining > 0; --remaining) {
        if(previousCodePoint() < 0) {
            break;
        }
    }
}
/**
 * Switches to forward checking if possible.
 * When turning around from backward checking, the iterator index becomes the
 * new {@code start}; if that index is below {@code limit}, the text up to
 * {@code limit} has already passed the check and the iterator stays in an
 * FCD segment instead. Otherwise the current segment has been fully consumed
 * and incremental forward checking resumes after it.
 */
private void switchToForward() {
    assert(state == State.ITER_CHECK_BWD ||
            (state == State.ITER_IN_FCD_SEGMENT && pos == limit) ||
            (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == normalized.length()));

    if(state == State.ITER_CHECK_BWD) {
        // Turn around from backward checking.
        final int index = iter.getIndex();
        pos = index;
        start = index;
        // index == limit: nothing checked ahead, so check forward.
        // index < limit: stay in the already checked FCD segment.
        state = (index == limit) ? State.ITER_CHECK_FWD : State.ITER_IN_FCD_SEGMENT;
        return;
    }

    // Reached the end of the segment. An FCD segment is simply extended
    // forward; a normalized segment is left behind.
    if(state != State.ITER_IN_FCD_SEGMENT) {
        if(state == State.IN_NORM_ITER_AT_START) {
            // The text iterator is still at the segment start; skip over the segment.
            iter.moveIndex(limit - start);
        }
        start = limit;
    }
    state = State.ITER_CHECK_FWD;
}
/**
 * Extends the FCD text segment forward or normalizes around pos.
 * Starting at the text iterator's current index, collects characters into the
 * reusable buffer s while checking them against the FCD condition. If the
 * check passes, the iterator is moved back to the segment start and the state
 * becomes ITER_IN_FCD_SEGMENT. If it fails, the text up to the next FCD
 * boundary is collected and normalized, the iterator is left at the segment
 * limit, and the state becomes IN_NORM_ITER_AT_LIMIT with pos = 0.
 *
 * @return true if success; every return statement in this implementation
 *         returns true
 */
private boolean nextSegment() {
assert(state == State.ITER_CHECK_FWD);
// The input text [start..(iter index)[ passes the FCD check.
pos = iter.getIndex();
// Collect the characters being checked, in case they need to be normalized.
if(s == null) {
s = new StringBuilder();
} else {
s.setLength(0);
}
// Trailing combining class of the previously collected character (0 = none yet).
int prevCC = 0;
for(;;) {
// Fetch the next character and its fcd16 value.
int c = iter.nextCodePoint();
if(c < 0) { break; }
int fcd16 = nfcImpl.getFCD16(c);
// The upper byte of fcd16 is used as the lead combining class,
// the lower byte (see prevCC below) as the trailing one.
int leadCC = fcd16 >> 8;
if(leadCC == 0 && s.length() != 0) {
// FCD boundary before this character.
// Un-read it; it is not part of this segment.
iter.previousCodePoint();
break;
}
s.appendCodePoint(c);
if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
// Fails FCD check. Find the next FCD boundary and normalize.
for(;;) {
c = iter.nextCodePoint();
if(c < 0) { break; }
// fcd16 <= 0xff means the upper (lead) byte is zero: a boundary before c.
if(nfcImpl.getFCD16(c) <= 0xff) {
iter.previousCodePoint();
break;
}
s.appendCodePoint(c);
}
normalize(s);
// [start..limit[ is the range of the original text that was normalized;
// pos now indexes the normalized buffer.
start = pos;
limit = pos + s.length();
state = State.IN_NORM_ITER_AT_LIMIT;
pos = 0;
return true;
}
prevCC = fcd16 & 0xff;
if(prevCC == 0) {
// FCD boundary after the last character.
break;
}
}
// The collected text passes the FCD check: rewind the iterator to the
// segment start (pos) so that the segment can be read directly from the text.
limit = pos + s.length();
assert(pos != limit);
iter.moveIndex(-s.length());
state = State.ITER_IN_FCD_SEGMENT;
return true;
}
/**
 * Switches to backward checking.
 * When turning around from forward checking, the iterator index becomes the
 * new {@code limit}; if that index is above {@code start}, the text back to
 * {@code start} has already passed the check and the iterator stays in an
 * FCD segment instead. Otherwise the current segment has been fully consumed
 * and incremental backward checking resumes before it.
 */
private void switchToBackward() {
    assert(state == State.ITER_CHECK_FWD ||
            (state == State.ITER_IN_FCD_SEGMENT && pos == start) ||
            (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == 0));

    if(state == State.ITER_CHECK_FWD) {
        // Turn around from forward checking.
        final int index = iter.getIndex();
        pos = index;
        limit = index;
        // index == start: nothing checked behind, so check backward.
        // index > start: stay in the already checked FCD segment.
        state = (index == start) ? State.ITER_CHECK_BWD : State.ITER_IN_FCD_SEGMENT;
        return;
    }

    // Reached the start of the segment. An FCD segment is simply extended
    // backward; a normalized segment is left behind.
    if(state != State.ITER_IN_FCD_SEGMENT) {
        if(state == State.IN_NORM_ITER_AT_LIMIT) {
            // The text iterator is still at the segment limit; move back over the segment.
            iter.moveIndex(start - limit);
        }
        limit = start;
    }
    state = State.ITER_CHECK_BWD;
}
/**
 * Extends the FCD text segment backward or normalizes around pos.
 * Starting at the text iterator's current index, collects characters (in
 * reverse order) into the reusable buffer s while checking them against the
 * FCD condition. If the check passes, the iterator is moved forward to the
 * segment limit and the state becomes ITER_IN_FCD_SEGMENT. If it fails, the
 * text back to the previous FCD boundary is collected, put into forward
 * order and normalized, the iterator is left at the segment start, and the
 * state becomes IN_NORM_ITER_AT_START with pos at the end of the normalized
 * buffer.
 *
 * @return true if success; every return statement in this implementation
 *         returns true
 */
private boolean previousSegment() {
assert(state == State.ITER_CHECK_BWD);
// The input text [(iter index)..limit[ passes the FCD check.
pos = iter.getIndex();
// Collect the characters being checked, in case they need to be normalized.
if(s == null) {
s = new StringBuilder();
} else {
s.setLength(0);
}
// Lead combining class of the character that follows in text order (0 = none yet).
int nextCC = 0;
for(;;) {
// Fetch the previous character and its fcd16 value.
int c = iter.previousCodePoint();
if(c < 0) { break; }
int fcd16 = nfcImpl.getFCD16(c);
// The lower byte of fcd16 is used as the trailing combining class,
// the upper byte (see nextCC below) as the lead one.
int trailCC = fcd16 & 0xff;
if(trailCC == 0 && s.length() != 0) {
// FCD boundary after this character.
// Un-read it; it is not part of this segment.
iter.nextCodePoint();
break;
}
s.appendCodePoint(c);
if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
// Fails FCD check. Find the previous FCD boundary and normalize.
// Keep going while the most recently collected character has a non-zero
// upper (lead) byte, i.e. there is no boundary before it yet.
while(fcd16 > 0xff) {
c = iter.previousCodePoint();
if(c < 0) { break; }
fcd16 = nfcImpl.getFCD16(c);
if(fcd16 == 0) {
// Both bytes are zero: c is outside the segment, so un-read it.
iter.nextCodePoint();
break;
}
s.appendCodePoint(c);
}
// Characters were appended back to front; restore text order.
// (StringBuilder.reverse() keeps surrogate pairs intact.)
s.reverse();
normalize(s);
// [start..limit[ is the range of the original text that was normalized;
// pos now indexes the normalized buffer, starting at its end.
limit = pos;
start = pos - s.length();
state = State.IN_NORM_ITER_AT_START;
pos = normalized.length();
return true;
}
nextCC = fcd16 >> 8;
if(nextCC == 0) {
// FCD boundary before the following character.
break;
}
}
// The collected text passes the FCD check: move the iterator forward to the
// segment limit (pos) so that the segment can be read backward from the text.
start = pos - s.length();
assert(pos != start);
iter.moveIndex(s.length());
state = State.ITER_IN_FCD_SEGMENT;
return true;
}
/**
 * Decomposes a collected text segment into the reusable {@code normalized}
 * buffer, creating that buffer on first use.
 * NOTE(review): callers read the buffer from index 0 up to its length, so
 * this relies on decompose() replacing rather than appending to the
 * destination -- confirm against Normalizer2Impl.
 *
 * @param s the text segment to normalize
 */
private void normalize(CharSequence s) {
    StringBuilder target = normalized;
    if(target == null) {
        target = new StringBuilder();
        normalized = target;
    }
    // NFD without argument checking.
    nfcImpl.decompose(s, target);
}
/**
 * Iteration states.
 * The declaration order is significant: the enclosing class compares states
 * with compareTo(), treating the two checking states as the lowest values and
 * the two normalized-segment states as the highest. Do not reorder.
 */
private enum State {
/**
 * The input text [start..(iter index)[ passes the FCD check.
 * Moving forward checks incrementally.
 * pos & limit are undefined.
 */
ITER_CHECK_FWD,
/**
 * The input text [(iter index)..limit[ passes the FCD check.
 * Moving backward checks incrementally.
 * start & pos are undefined.
 */
ITER_CHECK_BWD,
/**
 * The input text [start..limit[ passes the FCD check.
 * pos tracks the current text index.
 */
ITER_IN_FCD_SEGMENT,
/**
 * The input text [start..limit[ failed the FCD check and was normalized.
 * pos tracks the current index in the normalized string.
 * The text iterator is at the limit index.
 */
IN_NORM_ITER_AT_LIMIT,
/**
 * The input text [start..limit[ failed the FCD check and was normalized.
 * pos tracks the current index in the normalized string.
 * The text iterator is at the start index.
 */
IN_NORM_ITER_AT_START
}
/** Current iteration state; it determines which of start, pos and limit are meaningful. */
private State state;
/** Start index of the current segment in the input text. */
private int start;
/**
 * Current index: a text index in the ITER_IN_FCD_SEGMENT state,
 * an index into the normalized buffer in the IN_NORM_* states.
 */
private int pos;
/** Limit (exclusive end) index of the current segment in the input text. */
private int limit;
/** Taken from the CollationData in the constructor; used for getFCD16() lookups and decompose(). */
private final Normalizer2Impl nfcImpl;
/** Reusable buffer for the characters of the segment being checked; created on first use. */
private StringBuilder s;
/** Reusable buffer that receives the decomposed segment; created on first use by normalize(). */
private StringBuilder normalized;
}