com.ibm.icu.impl.coll.FCDUTF16CollationIterator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Components for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support.
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* FCDUTF16CollationIterator.java, ported from utf16collationiterator.h/.cpp
*
* C++ version created on: 2010oct27
* created by: Markus W. Scherer
*/
package com.ibm.icu.impl.coll;
import com.ibm.icu.impl.Normalizer2Impl;
/**
* Incrementally checks the input text for FCD and normalizes where necessary.
*/
public final class FCDUTF16CollationIterator extends UTF16CollationIterator {
/**
 * Partial constructor: passes the collation data to the superclass and keeps
 * the data's normalizer for later use. No text is attached by this constructor.
 * See {@link CollationIterator#CollationIterator(CollationData)}.
 *
 * @param d the collation data; its {@code nfcImpl} is stored in this iterator
 */
public FCDUTF16CollationIterator(CollationData d) {
    super(d);
    this.nfcImpl = d.nfcImpl;
}
/**
 * Creates an iterator over the text {@code s} that starts out in
 * forward-checking mode ({@code checkDir == 1}).
 *
 * @param data    the collation data; also supplies the normalizer
 * @param numeric passed through unchanged to the superclass constructor
 * @param s       the input text; its full length becomes the raw limit
 * @param p       the index passed to the superclass; also used as the initial segment start
 */
public FCDUTF16CollationIterator(CollationData data, boolean numeric, CharSequence s, int p) {
    super(data, numeric, s, p);
    this.rawSeq = s;
    this.segmentStart = p;
    this.rawLimit = s.length();
    this.nfcImpl = data.nfcImpl;
    this.checkDir = 1;  // Check forward.
}
/**
 * Compares the iteration state of this iterator with that of another one.
 * Only iterator state is compared, not the text: the caller is assumed to
 * compare the text itself.
 *
 * <p>Earlier code tried to reach the grandparent implementation with
 * {@code ((CollationIterator)this).equals(other)}. A cast does not change
 * virtual dispatch in Java, so that call re-entered this very override and
 * recursed until a {@code StackOverflowError} whenever {@code other} was a
 * {@code CollationIterator}. Java offers no way to skip an intermediate
 * superclass, so this method delegates to {@code super.equals(other)}.
 *
 * <p>NOTE(review): the direct superclass is not visible from this file; if its
 * {@code equals()} adds checks beyond those of {@code CollationIterator},
 * they now apply here as well -- confirm that this is acceptable.
 *
 * @param other the object to compare with; may be {@code null}
 * @return {@code true} if {@code other} is an {@code FCDUTF16CollationIterator}
 *         whose state matches this iterator's state
 */
@Override
public boolean equals(Object other) {
    // The instanceof test also rejects null and avoids calling the parent for foreign types.
    if (!(other instanceof FCDUTF16CollationIterator) || !super.equals(other)) {
        return false;
    }
    FCDUTF16CollationIterator o = (FCDUTF16CollationIterator) other;
    // Compare the iterator state but not the text: Assume that the caller does that.
    if (checkDir != o.checkDir) {
        return false;
    }
    // Inside a segment, both iterators must agree on whether they read
    // the raw text or a normalized buffer.
    if (checkDir == 0 && (seq == rawSeq) != (o.seq == o.rawSeq)) {
        return false;
    }
    if (checkDir != 0 || seq == rawSeq) {
        // Reading the raw text: positions are directly comparable.
        // rawStart is a static constant, so it is the same for both iterators.
        return (pos - rawStart) == (o.pos - rawStart);
    } else {
        // Reading a normalized buffer: compare the segment's location in the
        // raw text and the position within the buffer.
        return (segmentStart - rawStart) == (o.segmentStart - rawStart)
                && (pos - start) == (o.pos - o.start);
    }
}
/**
 * Not designed for use: trips an assertion when assertions are enabled,
 * and otherwise returns a fixed constant.
 *
 * @return the constant 42
 */
@Override
public int hashCode() {
    assert false : "hashCode not designed";
    // Any arbitrary constant will do.
    final int arbitraryHash = 42;
    return arbitraryHash;
}
/**
 * Resets the iterator and repositions it at {@code newOffset} in the raw
 * input text, in forward-checking mode.
 *
 * @param newOffset the offset relative to {@code rawStart}
 */
@Override
public void resetToOffset(int newOffset) {
    reset();
    final int target = rawStart + newOffset;
    // Go back to reading the raw text, with an empty segment at the target.
    seq = rawSeq;
    pos = target;
    segmentStart = target;
    start = target;
    limit = rawLimit;
    checkDir = 1;
}
/**
 * Returns the current offset relative to {@code rawStart}.
 * While reading the raw text this is derived from {@code pos}.
 * While reading a normalized buffer it is the segment start if {@code pos}
 * is at the buffer start, and the segment limit otherwise.
 *
 * @return the offset in the raw input text
 */
@Override
public int getOffset() {
    final boolean readingRawText = checkDir != 0 || seq == rawSeq;
    if (readingRawText) {
        return pos - rawStart;
    }
    final int segmentEdge = (pos == start) ? segmentStart : segmentLimit;
    return segmentEdge - rawStart;
}
/**
 * Attaches new input text and switches to forward-checking mode.
 *
 * @param numeric passed through unchanged to the superclass
 * @param s       the new input text; its full length becomes the raw limit and the current limit
 * @param p       the index passed to the superclass; also used as the new segment start
 */
@Override
public void setText(boolean numeric, CharSequence s, int p) {
    super.setText(numeric, s, p);
    rawSeq = s;
    segmentStart = p;
    final int textLength = s.length();
    limit = textLength;
    rawLimit = textLength;
    checkDir = 1;
}
/**
 * Reads the code point at the current position and moves past it,
 * running the incremental FCD check first when in forward-checking mode.
 *
 * @return the next code point, or {@code Collation.SENTINEL_CP} when the
 *         position is at the limit in forward-checking mode
 */
@Override
public int nextCodePoint() {
    char unit;
    while (true) {
        if (checkDir == 0 && pos != limit) {
            // Inside an already checked (or normalized) segment: just read.
            unit = seq.charAt(pos++);
            break;
        }
        if (checkDir <= 0) {
            // Either checking backward, or at the end of the current segment.
            switchToForward();
            continue;
        }
        // checkDir > 0: check forward incrementally.
        if (pos == limit) {
            return Collation.SENTINEL_CP;
        }
        unit = seq.charAt(pos++);
        final boolean needsSegmentCheck = CollationFCD.hasTccc(unit)
                && (CollationFCD.maybeTibetanCompositeVowel(unit)
                        || (pos != limit && CollationFCD.hasLccc(seq.charAt(pos))));
        if (needsSegmentCheck) {
            // Back up, let nextSegment() extend or normalize the segment, then re-read.
            --pos;
            nextSegment();
            unit = seq.charAt(pos++);
        }
        break;
    }
    // Combine a lead surrogate with a following trail surrogate, if there is one.
    if (Character.isHighSurrogate(unit) && pos != limit) {
        final char trail = seq.charAt(pos);
        if (Character.isLowSurrogate(trail)) {
            ++pos;
            return Character.toCodePoint(unit, trail);
        }
    }
    return unit;
}
/**
 * Reads the code point before the current position and moves back over it,
 * running the incremental FCD check first when in backward-checking mode.
 *
 * @return the previous code point, or {@code Collation.SENTINEL_CP} when the
 *         position is at the start in backward-checking mode
 */
@Override
public int previousCodePoint() {
    char unit;
    while (true) {
        if (checkDir == 0 && pos != start) {
            // Inside an already checked (or normalized) segment: just read.
            unit = seq.charAt(--pos);
            break;
        }
        if (checkDir >= 0) {
            // Either checking forward, or at the start of the current segment.
            switchToBackward();
            continue;
        }
        // checkDir < 0: check backward incrementally.
        if (pos == start) {
            return Collation.SENTINEL_CP;
        }
        unit = seq.charAt(--pos);
        final boolean needsSegmentCheck = CollationFCD.hasLccc(unit)
                && (CollationFCD.maybeTibetanCompositeVowel(unit)
                        || (pos != start && CollationFCD.hasTccc(seq.charAt(pos - 1))));
        if (needsSegmentCheck) {
            // Step forward again, let previousSegment() extend or normalize, then re-read.
            ++pos;
            previousSegment();
            unit = seq.charAt(--pos);
        }
        break;
    }
    // Combine a trail surrogate with a preceding lead surrogate, if there is one.
    if (Character.isLowSurrogate(unit) && pos != start) {
        final char lead = seq.charAt(pos - 1);
        if (Character.isHighSurrogate(lead)) {
            --pos;
            return Character.toCodePoint(lead, unit);
        }
    }
    return unit;
}
/**
 * Reads the next UTF-16 code unit (after the incremental FCD check when in
 * forward-checking mode) and pairs it with its trie value.
 * Surrogate pairs are not combined here; the single code unit is looked up
 * with {@code trie.getFromU16SingleLead()}.
 *
 * @return {@code NO_CP_AND_CE32} when the position is at the limit in
 *         forward-checking mode, otherwise the result of
 *         {@code makeCodePointAndCE32Pair()} for the code unit and its trie value
 */
@Override
protected long handleNextCE32() {
    char unit;
    while (true) {
        if (checkDir == 0 && pos != limit) {
            // Inside an already checked (or normalized) segment: just read.
            unit = seq.charAt(pos++);
            break;
        }
        if (checkDir <= 0) {
            // Either checking backward, or at the end of the current segment.
            switchToForward();
            continue;
        }
        // checkDir > 0: check forward incrementally.
        if (pos == limit) {
            return NO_CP_AND_CE32;
        }
        unit = seq.charAt(pos++);
        final boolean needsSegmentCheck = CollationFCD.hasTccc(unit)
                && (CollationFCD.maybeTibetanCompositeVowel(unit)
                        || (pos != limit && CollationFCD.hasLccc(seq.charAt(pos))));
        if (needsSegmentCheck) {
            // Back up, let nextSegment() extend or normalize the segment, then re-read.
            --pos;
            nextSegment();
            unit = seq.charAt(pos++);
        }
        break;
    }
    final int ce32 = trie.getFromU16SingleLead(unit);
    return makeCodePointAndCE32Pair(unit, ce32);
}
/* boolean foundNULTerminator(); */
/**
 * Moves forward over up to {@code num} code points.
 * Stops early as soon as {@link #nextCodePoint()} returns a negative value.
 *
 * @param num the maximum number of code points to skip
 */
@Override
protected void forwardNumCodePoints(int num) {
    // nextCodePoint() is only called while there are code points left to skip.
    for (; num > 0 && nextCodePoint() >= 0; --num) {
        // All the work happens in the loop condition.
    }
}
/**
 * Moves backward over up to {@code num} code points.
 * Stops early as soon as {@link #previousCodePoint()} returns a negative value.
 *
 * @param num the maximum number of code points to skip
 */
@Override
protected void backwardNumCodePoints(int num) {
    // previousCodePoint() is only called while there are code points left to skip.
    for (; num > 0 && previousCodePoint() >= 0; --num) {
        // All the work happens in the loop condition.
    }
}
/**
 * Switches to forward checking if possible.
 * Precondition: checkDir < 0 || (checkDir == 0 && pos == limit).
 * Postcondition: checkDir > 0 || (checkDir == 0 && pos != limit).
 */
private void switchToForward() {
    assert (checkDir < 0 && seq == rawSeq) || (checkDir == 0 && pos == limit);
    if (checkDir < 0) {
        // Turning around from backward checking: the checked text now starts at pos.
        segmentStart = pos;
        start = pos;
        if (pos != segmentLimit) {
            // pos < segmentLimit: there is still checked text ahead; stay in the FCD segment.
            checkDir = 0;
            return;
        }
    } else if (seq != rawSeq) {
        // Reached the end of a segment that had to be normalized.
        // Return to the raw text and continue checking right after that segment.
        // Note: If this segment is at the end of the input text,
        // then it might help to signal that, so that we do not have to
        // re-check and normalize when we turn around and go backwards.
        // However, that would complicate the call sites for an optimization
        // of an unusual case.
        seq = rawSeq;
        segmentStart = segmentLimit;
        start = segmentLimit;
        pos = segmentLimit;
    }
    // Otherwise the segment was already FCD in the raw text and is simply extended forward.
    limit = rawLimit;
    checkDir = 1;  // Check forward.
}
/**
* Extend the FCD text segment forward or normalize around pos.
* To be called when checkDir > 0 && pos != limit.
* Returns with checkDir == 0 and pos != limit.
*
* Scans code points forward from pos, looking at the fcd16 value of each one
* (upper 8 bits as leadCC, lower 8 bits as prevCC for the next iteration).
* Either the scan reaches a boundary and the segment limit is set there
* while still reading the raw text, or the FCD check fails and the text
* from pos up to the next boundary is passed to normalize().
*/
private void nextSegment() {
assert(checkDir > 0 && seq == rawSeq && pos != limit);
// The input text [segmentStart..pos[ passes the FCD check.
int p = pos;
int prevCC = 0;
for(;;) {
// Fetch the next character's fcd16 value.
// q is the index of the character, p the index just after it.
int q = p;
int c = Character.codePointAt(seq, p);
p += Character.charCount(c);
int fcd16 = nfcImpl.getFCD16(c);
int leadCC = fcd16 >> 8;
if(leadCC == 0 && q != pos) {
// FCD boundary before the [q, p[ character.
// (Not applied to the first character at pos, so that the segment is never empty.)
limit = segmentLimit = q;
break;
}
if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
// Fails FCD check. Find the next FCD boundary and normalize.
// The loop ends at the end of the text, or after reading a character
// whose fcd16 has a zero upper byte; in the latter case q still points
// before that character, so it is excluded from the normalized range.
do {
q = p;
if(p == rawLimit) { break; }
c = Character.codePointAt(seq, p);
p += Character.charCount(c);
} while(nfcImpl.getFCD16(c) > 0xff);
normalize(pos, q);
// normalize() switched seq/start/limit to the normalized buffer;
// continue reading from the beginning of that buffer.
pos = start;
break;
}
prevCC = fcd16 & 0xff;
if(p == rawLimit || prevCC == 0) {
// FCD boundary after the last character.
limit = segmentLimit = p;
break;
}
}
assert(pos != limit);
checkDir = 0;
}
/**
 * Switches to backward checking.
 * Precondition: checkDir > 0 || (checkDir == 0 && pos == start).
 * Postcondition: checkDir < 0 || (checkDir == 0 && pos != start).
 */
private void switchToBackward() {
    assert (checkDir > 0 && seq == rawSeq) || (checkDir == 0 && pos == start);
    if (checkDir > 0) {
        // Turning around from forward checking: the checked text now ends at pos.
        segmentLimit = pos;
        limit = pos;
        if (pos != segmentStart) {
            // pos > segmentStart: there is still checked text behind; stay in the FCD segment.
            checkDir = 0;
            return;
        }
    } else if (seq != rawSeq) {
        // Reached the start of a segment that had to be normalized.
        // Return to the raw text and continue checking right before that segment.
        seq = rawSeq;
        segmentLimit = segmentStart;
        limit = segmentStart;
        pos = segmentStart;
    }
    // Otherwise the segment was already FCD in the raw text and is simply extended backward.
    start = rawStart;
    checkDir = -1;  // Check backward.
}
/**
* Extend the FCD text segment backward or normalize around pos.
* To be called when checkDir < 0 && pos != start.
* Returns with checkDir == 0 and pos != start.
*
* Scans code points backward from pos, looking at the fcd16 value of each one
* (lower 8 bits as trailCC, upper 8 bits as nextCC for the next iteration).
* Either the scan reaches a boundary and the segment start is set there
* while still reading the raw text, or the FCD check fails and the text
* from the previous boundary up to pos is passed to normalize().
*/
private void previousSegment() {
assert(checkDir < 0 && seq == rawSeq && pos != start);
// The input text [pos..segmentLimit[ passes the FCD check.
int p = pos;
int nextCC = 0;
for(;;) {
// Fetch the previous character's fcd16 value.
// q is the index just after the character, p the index of the character.
int q = p;
int c = Character.codePointBefore(seq, p);
p -= Character.charCount(c);
int fcd16 = nfcImpl.getFCD16(c);
int trailCC = fcd16 & 0xff;
if(trailCC == 0 && q != pos) {
// FCD boundary after the [p, q[ character.
// (Not applied to the first character before pos, so that the segment is never empty.)
start = segmentStart = q;
break;
}
if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
// Fails FCD check. Find the previous FCD boundary and normalize.
// The loop ends when the character at p has a zero upper fcd16 byte
// (it is included in the range), at the start of the text, or after
// stepping over a character whose fcd16 is 0; in the last case q still
// points after that character, so it is excluded from the normalized range.
do {
q = p;
if(fcd16 <= 0xff || p == rawStart) { break; }
c = Character.codePointBefore(seq, p);
p -= Character.charCount(c);
} while((fcd16 = nfcImpl.getFCD16(c)) != 0);
normalize(q, pos);
// normalize() switched seq/start/limit to the normalized buffer;
// continue reading backward from the end of that buffer.
pos = limit;
break;
}
nextCC = fcd16 >> 8;
if(p == rawStart || nextCC == 0) {
// FCD boundary before the following character.
start = segmentStart = p;
break;
}
}
assert(pos != start);
checkDir = 0;
}
/**
 * Decomposes rawSeq[from, to[ into the normalization buffer and redirects
 * iteration (seq, start, limit) to that buffer.
 * The buffer is allocated on first use. The caller sets pos afterwards.
 *
 * NOTE(review): this relies on nfcImpl.decompose() replacing any previous
 * contents of the buffer rather than appending to them -- confirm.
 *
 * @param from start index of the segment in the raw text (becomes segmentStart)
 * @param to   limit index of the segment in the raw text (becomes segmentLimit)
 */
private void normalize(int from, int to) {
    StringBuilder buffer = normalized;
    if (buffer == null) {
        buffer = new StringBuilder();
        normalized = buffer;
    }
    // NFD without argument checking.
    final int lengthEstimate = to - from;
    nfcImpl.decompose(rawSeq, from, to, buffer, lengthEstimate);
    // Remember which part of the raw text the buffer stands for.
    segmentStart = from;
    segmentLimit = to;
    // Switch collation processing into the FCD buffer.
    seq = buffer;
    start = 0;
    limit = buffer.length();
}
// Text pointers: The input text is rawSeq[rawStart, rawLimit[.
// (In C++, these are const UChar * pointers.
// In Java, we use CharSequence rawSeq and the parent class' seq
// together with int indexes.)
//
// checkDir > 0:
//
// The input text rawSeq[segmentStart..pos[ passes the FCD check.
// Moving forward checks incrementally.
// segmentLimit is undefined. seq == rawSeq. limit == rawLimit.
//
// checkDir < 0:
//
// The input text rawSeq[pos..segmentLimit[ passes the FCD check.
// Moving backward checks incrementally.
// segmentStart is undefined. seq == rawSeq. start == rawStart.
//
// checkDir == 0:
//
// The input text rawSeq[segmentStart..segmentLimit[ is being processed.
// These pointers are at FCD boundaries.
// Either this text segment already passes the FCD check
// and seq==rawSeq && segmentStart==start<=pos<=limit==segmentLimit,
// or the current segment had to be normalized so that
// rawSeq[segmentStart..segmentLimit[ turned into the normalized string,
// corresponding to seq==normalized && 0==start<=pos<=limit==start+normalized.length().

// The original input text, as passed to the constructor or setText().
private CharSequence rawSeq;
// Start index of the input text; always 0 in this Java port.
private static final int rawStart = 0;
// Start of the current segment in rawSeq (see the checkDir states above).
private int segmentStart;
// Limit of the current segment in rawSeq (see the checkDir states above).
private int segmentLimit;
// Limit index of the input text; set to the length of rawSeq.
private int rawLimit;
// Normalizer taken from the CollationData; used for fcd16 lookups and decomposition.
private final Normalizer2Impl nfcImpl;
// Buffer for the decomposed form of a segment; allocated on first use in normalize().
private StringBuilder normalized;
// Direction of incremental FCD check. See comments before rawStart.
private int checkDir;
}