// com.ibm.icu.impl.coll.FCDUTF16CollationIterator Maven / Gradle / Ivy
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* FCDUTF16CollationIterator.java, ported from utf16collationiterator.h/.cpp
*
* C++ version created on: 2010oct27
* created by: Markus W. Scherer
*/
package com.ibm.icu.impl.coll;
import com.ibm.icu.impl.Normalizer2Impl;
/**
* Incrementally checks the input text for FCD and normalizes where necessary.
*/
public final class FCDUTF16CollationIterator extends UTF16CollationIterator {
/**
 * Partial constructor, see {@link CollationIterator#CollationIterator(CollationData)}.
 * Apart from delegating to the superclass, it only records the normalizer
 * implementation; none of the text-related fields of this class are set here.
 *
 * @param d the collation data; its {@code nfcImpl} is kept for the FCD checks
 */
public FCDUTF16CollationIterator(CollationData d) {
    super(d);
    this.nfcImpl = d.nfcImpl;
}
/**
 * Creates an iterator over the text {@code s}, positioned at index {@code p}
 * and set up to check FCD incrementally in the forward direction.
 *
 * @param data    the collation data; its {@code nfcImpl} is kept for the FCD checks
 * @param numeric passed through to the superclass constructor
 * @param s       the input text; also remembered as the raw (un-normalized) text
 * @param p       the start index, which is also the start of the first FCD segment
 */
public FCDUTF16CollationIterator(CollationData data, boolean numeric, CharSequence s, int p) {
    super(data, numeric, s, p);
    this.nfcImpl = data.nfcImpl;

    // The raw input is the whole sequence, up to its length.
    this.rawSeq = s;
    this.rawLimit = s.length();

    // Nothing has been checked yet: the checked segment starts at p, moving forward.
    this.segmentStart = p;
    this.checkDir = 1;
}
/**
 * Compares the iteration state of this iterator with that of {@code other}.
 * The text itself is not compared; the caller is expected to do that.
 *
 * NOTE(review): as written, this method cannot return normally for any non-null
 * CollationIterator argument; see the note on the first condition below.
 *
 * @param other the object to compare with
 * @return false if {@code other} is not a CollationIterator; otherwise intended to
 *         return whether both iterators are an FCDUTF16CollationIterator in the same state
 */
@Override
public boolean equals(Object other) {
// Skip the UTF16CollationIterator and call its parent.
// NOTE(review): the cast below does NOT skip anything. Java always dispatches
// equals() on the runtime class, and this class is final and overrides equals(),
// so ((CollationIterator)this).equals(other) calls this very method again with the
// same argument. For any other that is a CollationIterator this recurses until
// StackOverflowError. Java has no "super.super" call; a fix presumably needs a
// separate non-overridden comparison helper on CollationIterator (not visible
// here) -- confirm and fix there. Plain super.equals(other) would run the
// UTF16CollationIterator check that the comment above says must be skipped.
if (!(other instanceof CollationIterator)
|| !((CollationIterator)this).equals(other)
|| !(other instanceof FCDUTF16CollationIterator))
{
return false;
}
FCDUTF16CollationIterator o = (FCDUTF16CollationIterator)other;
// Compare the iterator state but not the text: Assume that the caller does that.
if (checkDir != o.checkDir) {
return false;
}
// Within a segment (checkDir == 0), both iterators must agree on whether they
// read the raw text or the normalized buffer.
if (checkDir == 0 && (seq == rawSeq) != (o.seq == o.rawSeq)) {
return false;
}
if (checkDir != 0 || seq == rawSeq) {
// pos indexes the raw text: compare raw offsets.
// (rawStart is a static constant, hence the commented-out "o.".)
return (pos - rawStart) == (o.pos - /*o.*/ rawStart);
}
else {
// pos indexes the normalized buffer: compare where the raw segment starts
// and how far into the buffer each iterator is.
return (segmentStart - rawStart) == (o.segmentStart - /*o.*/ rawStart) &&
(pos - start) == (o.pos - o.start);
}
}
/**
 * Hashing is not supported for this class.
 * With assertions enabled this always fails the assertion; with assertions
 * disabled it returns the same constant for every instance.
 *
 * @return the constant 42 (only reached when assertions are disabled)
 */
@Override
public int hashCode() {
assert false : "hashCode not designed";
return 42; // any arbitrary constant will do
}
/**
 * Resets the iterator and repositions it at {@code newOffset} within the raw text,
 * reading the raw text directly and checking FCD forward from there.
 *
 * @param newOffset the new position, relative to the start of the raw text
 */
@Override
public void resetToOffset(int newOffset) {
    reset();

    final int target = rawStart + newOffset;

    // Drop any normalized segment: iterate over the raw text again.
    seq = rawSeq;
    limit = rawLimit;

    // The (empty) checked segment begins at the new position.
    pos = target;
    segmentStart = target;
    start = target;

    checkDir = 1;
}
/**
 * Returns the current position as an offset into the raw text.
 * While reading from the normalized buffer, the position is reported as one of
 * the two edges of the raw segment that was normalized.
 *
 * @return the offset relative to the start of the raw text
 */
@Override
public int getOffset() {
    final boolean posIndexesRawText = checkDir != 0 || seq == rawSeq;
    if (posIndexesRawText) {
        return pos - rawStart;
    }
    // pos indexes the normalized buffer: at the buffer's start report the raw
    // segment start, anywhere else report the raw segment limit.
    final int rawIndex = (pos == start) ? segmentStart : segmentLimit;
    return rawIndex - rawStart;
}
/**
 * Sets new input text and restarts incremental forward FCD checking at index {@code p}.
 *
 * @param numeric passed through to the superclass
 * @param s       the new input text; also remembered as the raw (un-normalized) text
 * @param p       the start index, which is also the start of the first FCD segment
 */
@Override
public void setText(boolean numeric, CharSequence s, int p) {
    super.setText(numeric, s, p);

    final int textLength = s.length();
    rawSeq = s;
    rawLimit = textLength;
    limit = textLength;

    segmentStart = p;
    checkDir = 1;
}
/**
 * Returns the next code point and advances past it, checking FCD and
 * normalizing on the fly where necessary.
 *
 * @return the next code point, or {@code Collation.SENTINEL_CP} when the position
 *         is at the limit while checking forward
 */
@Override
public int nextCodePoint() {
    char unit;
    while (true) {
        if (checkDir > 0) {
            // Incremental forward check over the raw text.
            if (pos == limit) {
                return Collation.SENTINEL_CP;
            }
            unit = seq.charAt(pos++);
            // A unit with trailing combining class is suspicious if it may be a
            // Tibetan composite vowel, or if the following unit has a leading
            // combining class. Only then is the full segment check needed.
            final boolean needsSegmentCheck = CollationFCD.hasTccc(unit)
                    && (CollationFCD.maybeTibetanCompositeVowel(unit)
                        || (pos != limit && CollationFCD.hasLccc(seq.charAt(pos))));
            if (needsSegmentCheck) {
                // Un-read the unit, let nextSegment() establish a checked or
                // normalized segment, then read again from wherever pos now is.
                --pos;
                nextSegment();
                unit = seq.charAt(pos++);
            }
            break;
        }
        if (checkDir == 0 && pos != limit) {
            // Inside an already-checked (or normalized) segment: just read.
            unit = seq.charAt(pos++);
            break;
        }
        // Either checking backward, or at the end of the current segment.
        switchToForward();
    }

    // Combine a surrogate pair into one supplementary code point.
    if (Character.isHighSurrogate(unit) && pos != limit) {
        final char trail = seq.charAt(pos);
        if (Character.isLowSurrogate(trail)) {
            ++pos;
            return Character.toCodePoint(unit, trail);
        }
    }
    return unit;
}
/**
 * Returns the code point before the current position and moves back over it,
 * checking FCD and normalizing on the fly where necessary.
 *
 * @return the previous code point, or {@code Collation.SENTINEL_CP} when the
 *         position is at the start while checking backward
 */
@Override
public int previousCodePoint() {
    char unit;
    while (true) {
        if (checkDir < 0) {
            // Incremental backward check over the raw text.
            if (pos == start) {
                return Collation.SENTINEL_CP;
            }
            unit = seq.charAt(--pos);
            // A unit with leading combining class is suspicious if it may be a
            // Tibetan composite vowel, or if the preceding unit has a trailing
            // combining class. Only then is the full segment check needed.
            final boolean needsSegmentCheck = CollationFCD.hasLccc(unit)
                    && (CollationFCD.maybeTibetanCompositeVowel(unit)
                        || (pos != start && CollationFCD.hasTccc(seq.charAt(pos - 1))));
            if (needsSegmentCheck) {
                // Un-read the unit, let previousSegment() establish a checked or
                // normalized segment, then read again from wherever pos now is.
                ++pos;
                previousSegment();
                unit = seq.charAt(--pos);
            }
            break;
        }
        if (checkDir == 0 && pos != start) {
            // Inside an already-checked (or normalized) segment: just read.
            unit = seq.charAt(--pos);
            break;
        }
        // Either checking forward, or at the start of the current segment.
        switchToBackward();
    }

    // Combine a surrogate pair into one supplementary code point.
    if (Character.isLowSurrogate(unit) && pos != start) {
        final char lead = seq.charAt(pos - 1);
        if (Character.isHighSurrogate(lead)) {
            --pos;
            return Character.toCodePoint(lead, unit);
        }
    }
    return unit;
}
/**
 * Reads the next UTF-16 code unit, checking FCD and normalizing on the fly where
 * necessary, and looks up its trie value.
 *
 * @return the code unit combined with its trie value via
 *         {@code makeCodePointAndCE32Pair}, or {@code NO_CP_AND_CE32} when the
 *         position is at the limit while checking forward
 */
@Override
protected long handleNextCE32() {
    char unit;
    while (true) {
        if (checkDir > 0) {
            // Incremental forward check over the raw text.
            if (pos == limit) {
                return NO_CP_AND_CE32;
            }
            unit = seq.charAt(pos++);
            // Same quick test as in nextCodePoint(): only run the full segment
            // check when this unit and its successor might violate FCD.
            final boolean needsSegmentCheck = CollationFCD.hasTccc(unit)
                    && (CollationFCD.maybeTibetanCompositeVowel(unit)
                        || (pos != limit && CollationFCD.hasLccc(seq.charAt(pos))));
            if (needsSegmentCheck) {
                --pos;
                nextSegment();
                unit = seq.charAt(pos++);
            }
            break;
        }
        if (checkDir == 0 && pos != limit) {
            // Inside an already-checked (or normalized) segment: just read.
            unit = seq.charAt(pos++);
            break;
        }
        // Either checking backward, or at the end of the current segment.
        switchToForward();
    }
    // The lookup is by single code unit; surrogates are not paired up here.
    return makeCodePointAndCE32Pair(unit, trie.getFromU16SingleLead(unit));
}
/* boolean foundNULTerminator(); */
/**
 * Moves forward by up to {@code num} code points.
 * Stops early as soon as {@link #nextCodePoint()} returns a negative value
 * (presumably the end-of-text sentinel).
 *
 * @param num the number of code points to skip; values &lt;= 0 do nothing
 */
@Override
protected void forwardNumCodePoints(int num) {
    for (int remaining = num; remaining > 0; --remaining) {
        if (nextCodePoint() < 0) {
            break;
        }
    }
}
/**
 * Moves backward by up to {@code num} code points.
 * Stops early as soon as {@link #previousCodePoint()} returns a negative value
 * (presumably the start-of-text sentinel).
 *
 * @param num the number of code points to go back; values &lt;= 0 do nothing
 */
@Override
protected void backwardNumCodePoints(int num) {
    for (int remaining = num; remaining > 0; --remaining) {
        if (previousCodePoint() < 0) {
            break;
        }
    }
}
/**
 * Switches to forward checking if possible.
 * To be called when checkDir < 0 || (checkDir == 0 && pos == limit).
 * Returns with checkDir > 0 || (checkDir == 0 && pos != limit).
 */
private void switchToForward() {
    assert((checkDir < 0 && seq == rawSeq) || (checkDir == 0 && pos == limit));

    if (checkDir < 0) {
        // Turning around from backward checking: the checked text now begins at pos.
        segmentStart = pos;
        start = pos;
        if (pos == segmentLimit) {
            // Nothing checked lies ahead: check forward over the rest of the raw text.
            limit = rawLimit;
            checkDir = 1;
        } else {
            // pos < segmentLimit: there is still checked text ahead, stay in the segment.
            checkDir = 0;
        }
        return;
    }

    // checkDir == 0: reached the end of the current FCD segment.
    if (seq != rawSeq) {
        // The segment had been normalized; go back to the raw text and continue
        // right after the raw segment.
        // Note: If this segment is at the end of the input text, it might help to
        // signal that, so that turning around and going backward would not have to
        // re-check and normalize. That would complicate the call sites for an
        // optimization of an unusual case.
        seq = rawSeq;
        segmentStart = segmentLimit;
        start = segmentLimit;
        pos = segmentLimit;
    }
    // Otherwise the raw segment already was FCD and is simply extended forward.
    limit = rawLimit;
    checkDir = 1;
}
/**
* Extend the FCD text segment forward or normalize around pos.
* To be called when checkDir > 0 && pos != limit.
* Returns with checkDir == 0 and pos != limit.
*
* Scans forward from pos one code point at a time, using each character's
* fcd16 value: the upper byte is used as its leading combining class and the
* lower byte as its trailing combining class. The scan ends in one of three ways:
* at a boundary before a character, at a boundary after a character,
* or with an FCD failure, in which case the text from pos to the next boundary
* is normalized and iteration continues in the normalized buffer.
*/
private void nextSegment() {
assert(checkDir > 0 && seq == rawSeq && pos != limit);
// The input text [segmentStart..pos[ passes the FCD check.
int p = pos;
// Trailing combining class of the previously scanned character (0 before the first).
int prevCC = 0;
for(;;) {
// Fetch the next character's fcd16 value.
// q remembers where this character starts; p moves past it.
int q = p;
int c = Character.codePointAt(seq, p);
p += Character.charCount(c);
int fcd16 = nfcImpl.getFCD16(c);
int leadCC = fcd16 >> 8;
// The q != pos test keeps the segment from being empty: a boundary right at
// pos is not accepted as the segment limit.
if(leadCC == 0 && q != pos) {
// FCD boundary before the [q, p[ character.
limit = segmentLimit = q;
break;
}
if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
// Fails FCD check. Find the next FCD boundary and normalize.
// The loop leaves q at the start of the first following character whose
// fcd16 is <= 0xff (upper byte zero), or at rawLimit if there is none.
do {
q = p;
if(p == rawLimit) { break; }
c = Character.codePointAt(seq, p);
p += Character.charCount(c);
} while(nfcImpl.getFCD16(c) > 0xff);
// normalize() switches seq/start/limit to the normalized buffer;
// continue reading from the beginning of that buffer.
normalize(pos, q);
pos = start;
break;
}
prevCC = fcd16 & 0xff;
if(p == rawLimit || prevCC == 0) {
// FCD boundary after the last character.
limit = segmentLimit = p;
break;
}
}
assert(pos != limit);
// From here on the caller iterates inside the established segment.
checkDir = 0;
}
/**
 * Switches to backward checking.
 * To be called when checkDir > 0 || (checkDir == 0 && pos == start).
 * Returns with checkDir < 0 || (checkDir == 0 && pos != start).
 */
private void switchToBackward() {
    assert((checkDir > 0 && seq == rawSeq) || (checkDir == 0 && pos == start));

    if (checkDir > 0) {
        // Turning around from forward checking: the checked text now ends at pos.
        segmentLimit = pos;
        limit = pos;
        if (pos == segmentStart) {
            // Nothing checked lies behind: check backward over the rest of the raw text.
            start = rawStart;
            checkDir = -1;
        } else {
            // pos > segmentStart: there is still checked text behind, stay in the segment.
            checkDir = 0;
        }
        return;
    }

    // checkDir == 0: reached the start of the current FCD segment.
    if (seq != rawSeq) {
        // The segment had been normalized; go back to the raw text and continue
        // right before the raw segment.
        seq = rawSeq;
        segmentLimit = segmentStart;
        limit = segmentStart;
        pos = segmentStart;
    }
    // Otherwise the raw segment already was FCD and is simply extended backward.
    start = rawStart;
    checkDir = -1;
}
/**
* Extend the FCD text segment backward or normalize around pos.
* To be called when checkDir < 0 && pos != start.
* Returns with checkDir == 0 and pos != start.
*
* Mirror image of nextSegment(): scans backward from pos one code point at a
* time, using the lower byte of each character's fcd16 value as its trailing
* combining class and the upper byte as its leading combining class. The scan
* ends at a boundary after a character, at a boundary before a character, or
* with an FCD failure, in which case the text from the previous boundary up to
* pos is normalized and iteration continues at the end of the normalized buffer.
*/
private void previousSegment() {
assert(checkDir < 0 && seq == rawSeq && pos != start);
// The input text [pos..segmentLimit[ passes the FCD check.
int p = pos;
// Leading combining class of the character scanned just before this one,
// i.e. the one that follows it in the text (0 before the first).
int nextCC = 0;
for(;;) {
// Fetch the previous character's fcd16 value.
// q remembers where this character ends; p moves to its start.
int q = p;
int c = Character.codePointBefore(seq, p);
p -= Character.charCount(c);
int fcd16 = nfcImpl.getFCD16(c);
int trailCC = fcd16 & 0xff;
// The q != pos test keeps the segment from being empty: a boundary right at
// pos is not accepted as the segment start.
if(trailCC == 0 && q != pos) {
// FCD boundary after the [p, q[ character.
start = segmentStart = q;
break;
}
if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
// Fails FCD check. Find the previous FCD boundary and normalize.
// Each round sets q to the start of the character just examined. The loop
// stops there if that character's fcd16 is <= 0xff (upper byte zero) or
// the start of the text is reached; it also stops, leaving q unchanged,
// when the character before it has an fcd16 of 0.
do {
q = p;
if(fcd16 <= 0xff || p == rawStart) { break; }
c = Character.codePointBefore(seq, p);
p -= Character.charCount(c);
} while((fcd16 = nfcImpl.getFCD16(c)) != 0);
// normalize() switches seq/start/limit to the normalized buffer;
// continue reading backward from the end of that buffer.
normalize(q, pos);
pos = limit;
break;
}
nextCC = fcd16 >> 8;
if(p == rawStart || nextCC == 0) {
// FCD boundary before the following character.
start = segmentStart = p;
break;
}
}
assert(pos != start);
// From here on the caller iterates inside the established segment.
checkDir = 0;
}
/**
 * Decomposes the raw text range [from, to[ into the {@code normalized} buffer and
 * redirects iteration to that buffer.
 * On return, {@code seq} is the buffer with {@code start == 0} and {@code limit}
 * at its length, while {@code segmentStart}/{@code segmentLimit} remember the
 * raw range that was replaced. {@code pos} is left for the caller to set.
 *
 * @param from start index of the raw range (inclusive)
 * @param to   limit index of the raw range (exclusive)
 */
private void normalize(int from, int to) {
    // The buffer is allocated on first use and then reused.
    if (normalized == null) {
        normalized = new StringBuilder();
    }

    // NFD without argument checking; the raw length serves as the size estimate.
    // NOTE(review): this relies on decompose() replacing the buffer contents
    // rather than appending to them -- confirm against Normalizer2Impl.
    final int lengthEstimate = to - from;
    nfcImpl.decompose(rawSeq, from, to, normalized, lengthEstimate);

    // Remember which raw range [segmentStart, segmentLimit[ the buffer stands for.
    segmentStart = from;
    segmentLimit = to;

    // Switch collation processing into the FCD buffer.
    seq = normalized;
    start = 0;
    limit = normalized.length();
}
// Text pointers: The input text is rawSeq[rawStart, rawLimit[.
// (In C++, these are const UChar * pointers.
// In Java, we use CharSequence rawSeq and the parent class' seq
// together with int indexes.)
//
// checkDir > 0:
//
// The input text rawSeq[segmentStart..pos[ passes the FCD check.
// Moving forward checks incrementally.
// segmentLimit is undefined. seq == rawSeq. limit == rawLimit.
//
// checkDir < 0:
// The input text rawSeq[pos..segmentLimit[ passes the FCD check.
// Moving backward checks incrementally.
// segmentStart is undefined. seq == rawSeq. start == rawStart.
//
// checkDir == 0:
//
// The input text rawSeq[segmentStart..segmentLimit[ is being processed.
// These pointers are at FCD boundaries.
// Either this text segment already passes the FCD check
// and seq==rawSeq && segmentStart==start<=pos<=limit==segmentLimit,
// or the current segment had to be normalized so that
// rawSeq[segmentStart..segmentLimit[ turned into the normalized string,
// corresponding to seq==normalized && 0==start<=pos<=limit==start+normalized.length().
// The original, un-normalized input text, as passed to the constructor or setText().
private CharSequence rawSeq;
// Start index of the input text within rawSeq; a constant 0 in this Java port.
private static final int rawStart = 0;
// Start of the current segment, as an index into rawSeq (undefined while checkDir < 0).
private int segmentStart;
// Limit of the current segment, as an index into rawSeq (undefined while checkDir > 0).
private int segmentLimit;
// Limit index of the input text within rawSeq; set to the text's length.
private int rawLimit;
// Normalizer implementation taken from the CollationData;
// used for fcd16 lookups and for decomposing segments that fail the FCD check.
private final Normalizer2Impl nfcImpl;
// Buffer holding the normalized form of the current segment;
// allocated on first use in normalize() and reused afterwards.
private StringBuilder normalized;
// Direction of incremental FCD check. See comments before rawStart.
// > 0: checking forward, < 0: checking backward, 0: inside an established segment.
private int checkDir;
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy