com.ibm.icu.impl.coll.FCDUTF16CollationIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
International Components for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and globalization support.
There is a newer version: 76.1
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* FCDUTF16CollationIterator.java, ported from utf16collationiterator.h/.cpp
*
* C++ version created on: 2010oct27
* created by: Markus W. Scherer
*/

package com.ibm.icu.impl.coll;

import com.ibm.icu.impl.Normalizer2Impl;

/**
 * Incrementally checks the input text for FCD and normalizes where necessary.
 *
 * <p>The iterator reads the raw input text directly for as long as it passes the
 * incremental FCD check. When a stretch of text fails the check, that stretch is
 * decomposed into a private buffer ({@code normalized}) and iteration continues over
 * the buffer; once the buffer is exhausted, iteration returns to the raw text.
 * The state invariants for each value of {@code checkDir} are described in the
 * comment block above the field declarations at the end of this class.
 */
public final class FCDUTF16CollationIterator extends UTF16CollationIterator {
    /**
     * Partial constructor, see {@link CollationIterator#CollationIterator(CollationData)}.
     * Only the collation data and the normalizer implementation are set up here;
     * no text is attached until {@link #setText(boolean, CharSequence, int)} is called.
     *
     * @param d the collation data; its {@code nfcImpl} is used for FCD checks and decomposition
     */
    public FCDUTF16CollationIterator(CollationData d) {
        super(d);
        nfcImpl = d.nfcImpl;
    }

    /**
     * Creates an iterator over {@code s} that starts out checking FCD in the forward direction.
     *
     * @param data    the collation data; its {@code nfcImpl} is used for FCD checks and decomposition
     * @param numeric passed through to the superclass constructor
     * @param s       the raw input text
     * @param p       index into {@code s} that is passed to the superclass constructor and
     *                becomes the start of the first FCD-checked segment
     */
    public FCDUTF16CollationIterator(CollationData data, boolean numeric, CharSequence s, int p) {
        super(data, numeric, s, p);
        rawSeq = s;
        segmentStart = p;
        rawLimit = s.length();
        nfcImpl = data.nfcImpl;
        checkDir = 1;
    }

    /**
     * Compares the iteration state of two FCD iterators.
     * The text itself is not compared; the caller is expected to do that.
     *
     * @param other the object to compare with
     * @return true if {@code other} is an {@code FCDUTF16CollationIterator} whose
     *         superclass state, check direction and position match this iterator's
     */
    @Override
    public boolean equals(Object other) {
        // instanceof is false for null, so this also rejects a null argument.
        if (!(other instanceof FCDUTF16CollationIterator)) {
            return false;
        }
        // The C++ original skips UTF16CollationIterator and calls CollationIterator's
        // comparison directly. Java cannot do that: casting this to CollationIterator and
        // calling equals() still dispatches virtually to this very method, which recursed
        // until the stack overflowed. super.equals() is the closest call the language
        // allows; any check the direct superclass adds on top of CollationIterator's
        // is therefore applied in addition to the FCD-specific comparison below.
        if (!super.equals(other)) {
            return false;
        }
        FCDUTF16CollationIterator o = (FCDUTF16CollationIterator)other;
        // Compare the iterator state but not the text: Assume that the caller does that.
        if (checkDir != o.checkDir) {
            return false;
        }
        // Both must be either in the raw text or in a normalized buffer.
        if (checkDir == 0 && (seq == rawSeq) != (o.seq == o.rawSeq)) {
            return false;
        }
        if (checkDir != 0 || seq == rawSeq) {
            // Iterating over the raw text: pos is a raw-text index.
            return (pos - rawStart) == (o.pos - /*o.*/ rawStart);
        }
        else {
            // Iterating over the normalized buffer: compare the raw segment that was
            // normalized and the position inside the buffer.
            return (segmentStart - rawStart) == (o.segmentStart - /*o.*/ rawStart) &&
                    (pos - start) == (o.pos - o.start);
        }
    }

    /**
     * Not designed to be used: fails an assertion when assertions are enabled,
     * otherwise returns a constant.
     */
    @Override
    public int hashCode() {
        assert false : "hashCode not designed";
        return 42; // any arbitrary constant will do
    }

    /**
     * Resets the iterator and repositions it in the raw text,
     * checking FCD forward from the new position.
     *
     * @param newOffset offset relative to the start of the raw text
     */
    @Override
    public void resetToOffset(int newOffset) {
        reset();
        seq = rawSeq;
        start = segmentStart = pos = rawStart + newOffset;
        limit = rawLimit;
        checkDir = 1;
    }

    /**
     * Returns the current offset in terms of the raw text.
     * While iterating over a normalized buffer, the offset is reported as the start of
     * the normalized raw segment if the position is at the start of the buffer,
     * and as the end of that segment otherwise.
     *
     * @return the offset relative to the start of the raw text
     */
    @Override
    public int getOffset() {
        if(checkDir != 0 || seq == rawSeq) {
            return pos - rawStart;
        } else if(pos == start) {
            return segmentStart - rawStart;
        } else {
            return segmentLimit - rawStart;
        }
    }

    /**
     * Attaches new raw text and starts checking FCD forward from {@code p}.
     *
     * @param numeric passed through to the superclass
     * @param s       the new raw input text
     * @param p       index into {@code s} that is passed to the superclass and
     *                becomes the start of the first FCD-checked segment
     */
    @Override
    public void setText(boolean numeric, CharSequence s, int p) {
        super.setText(numeric, s, p);
        rawSeq = s;
        segmentStart = p;
        rawLimit = limit = s.length();
        checkDir = 1;
    }

    /**
     * Returns the next code point, checking FCD and normalizing on the way as needed.
     *
     * @return the next code point, or {@code Collation.SENTINEL_CP} at the end of the text
     */
    @Override
    public int nextCodePoint() {
        char c;
        for(;;) {
            if(checkDir > 0) {
                // Checking forward in the raw text.
                if(pos == limit) {
                    return Collation.SENTINEL_CP;
                }
                c = seq.charAt(pos++);
                if(CollationFCD.hasTccc(c)) {
                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
                            (pos != limit && CollationFCD.hasLccc(seq.charAt(pos)))) {
                        // Possible FCD violation: back up and let nextSegment()
                        // delimit or normalize the segment, then re-read from it.
                        --pos;
                        nextSegment();
                        c = seq.charAt(pos++);
                    }
                }
                break;
            } else if(checkDir == 0 && pos != limit) {
                // Inside an already-checked or normalized segment.
                c = seq.charAt(pos++);
                break;
            } else {
                switchToForward();
            }
        }
        // Combine a surrogate pair into one supplementary code point.
        char trail;
        if(Character.isHighSurrogate(c) && pos != limit &&
                Character.isLowSurrogate(trail = seq.charAt(pos))) {
            ++pos;
            return Character.toCodePoint(c, trail);
        } else {
            return c;
        }
    }

    /**
     * Returns the previous code point, checking FCD and normalizing on the way as needed.
     *
     * @return the previous code point, or {@code Collation.SENTINEL_CP} at the start of the text
     */
    @Override
    public int previousCodePoint() {
        char c;
        for(;;) {
            if(checkDir < 0) {
                // Checking backward in the raw text.
                if(pos == start) {
                    return Collation.SENTINEL_CP;
                }
                c = seq.charAt(--pos);
                if(CollationFCD.hasLccc(c)) {
                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
                            (pos != start && CollationFCD.hasTccc(seq.charAt(pos - 1)))) {
                        // Possible FCD violation: step forward again and let previousSegment()
                        // delimit or normalize the segment, then re-read from it.
                        ++pos;
                        previousSegment();
                        c = seq.charAt(--pos);
                    }
                }
                break;
            } else if(checkDir == 0 && pos != start) {
                // Inside an already-checked or normalized segment.
                c = seq.charAt(--pos);
                break;
            } else {
                switchToBackward();
            }
        }
        // Combine a surrogate pair into one supplementary code point.
        char lead;
        if(Character.isLowSurrogate(c) && pos != start &&
                Character.isHighSurrogate(lead = seq.charAt(pos - 1))) {
            --pos;
            return Character.toCodePoint(lead, c);
        } else {
            return c;
        }
    }

    /**
     * Reads the next UTF-16 code unit, checking FCD and normalizing as needed,
     * and looks up its CE32 in the trie.
     * The lookup uses the single code unit; a following trail surrogate is not consumed here.
     *
     * @return the code unit paired with its CE32, or {@code NO_CP_AND_CE32} at the end of the text
     */
    @Override
    protected long handleNextCE32() {
        char c;
        for(;;) {
            if(checkDir > 0) {
                // Checking forward in the raw text.
                if(pos == limit) {
                    return NO_CP_AND_CE32;
                }
                c = seq.charAt(pos++);
                if(CollationFCD.hasTccc(c)) {
                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
                            (pos != limit && CollationFCD.hasLccc(seq.charAt(pos)))) {
                        // Possible FCD violation: back up and let nextSegment()
                        // delimit or normalize the segment, then re-read from it.
                        --pos;
                        nextSegment();
                        c = seq.charAt(pos++);
                    }
                }
                break;
            } else if(checkDir == 0 && pos != limit) {
                // Inside an already-checked or normalized segment.
                c = seq.charAt(pos++);
                break;
            } else {
                switchToForward();
            }
        }
        return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead(c));
    }

    /* boolean foundNULTerminator(); */

    /**
     * Moves forward by up to {@code num} code points, stopping early at the end of the text.
     *
     * @param num the number of code points to skip
     */
    @Override
    protected void forwardNumCodePoints(int num) {
        // The C++ original qualifies this call with the class name to avoid a
        // virtual-function indirection. Here the class is declared final instead.
        while(num > 0 && nextCodePoint() >= 0) {
            --num;
        }
    }

    /**
     * Moves backward by up to {@code num} code points, stopping early at the start of the text.
     *
     * @param num the number of code points to skip
     */
    @Override
    protected void backwardNumCodePoints(int num) {
        // The C++ original qualifies this call with the class name to avoid a
        // virtual-function indirection. Here the class is declared final instead.
        while(num > 0 && previousCodePoint() >= 0) {
            --num;
        }
    }

    /**
     * Switches to forward checking if possible.
     * To be called when {@code checkDir < 0 || (checkDir == 0 && pos == limit)}.
     * Returns with {@code checkDir > 0 || (checkDir == 0 && pos != limit)}.
     */
    private void switchToForward() {
        assert((checkDir < 0 && seq == rawSeq) || (checkDir == 0 && pos == limit));
        if(checkDir < 0) {
            // Turn around from backward checking.
            start = segmentStart = pos;
            if(pos == segmentLimit) {
                limit = rawLimit;
                checkDir = 1;  // Check forward.
            } else {  // pos < segmentLimit
                checkDir = 0;  // Stay in FCD segment.
            }
        } else {
            // Reached the end of the FCD segment.
            if(seq == rawSeq) {
                // The input text segment is FCD, extend it forward.
            } else {
                // The input text segment needed to be normalized.
                // Switch to checking forward from it.
                seq = rawSeq;
                pos = start = segmentStart = segmentLimit;
                // Note: If this segment is at the end of the input text,
                // then it might help to return false to indicate that, so that
                // we do not have to re-check and normalize when we turn around and go backwards.
                // However, that would complicate the call sites for an optimization of an unusual case.
            }
            limit = rawLimit;
            checkDir = 1;
        }
    }

    /**
     * Extend the FCD text segment forward or normalize around pos.
     * To be called when {@code checkDir > 0 && pos != limit}.
     * Returns with {@code checkDir == 0} and {@code pos != limit}.
     */
    private void nextSegment() {
        assert(checkDir > 0 && seq == rawSeq && pos != limit);
        // The input text [segmentStart..pos[ passes the FCD check.
        int p = pos;
        int prevCC = 0;
        for(;;) {
            // Fetch the next character's fcd16 value.
            int q = p;
            int c = Character.codePointAt(seq, p);
            p += Character.charCount(c);
            int fcd16 = nfcImpl.getFCD16(c);
            // The upper byte of fcd16 is the lead combining class, the lower byte the trail one.
            int leadCC = fcd16 >> 8;
            if(leadCC == 0 && q != pos) {
                // FCD boundary before the [q, p[ character.
                limit = segmentLimit = q;
                break;
            }
            if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
                // Fails FCD check. Find the next FCD boundary and normalize.
                do {
                    q = p;
                    if(p == rawLimit) { break; }
                    c = Character.codePointAt(seq, p);
                    p += Character.charCount(c);
                } while(nfcImpl.getFCD16(c) > 0xff);
                normalize(pos, q);
                // Continue from the beginning of the normalized buffer.
                pos = start;
                break;
            }
            prevCC = fcd16 & 0xff;
            if(p == rawLimit || prevCC == 0) {
                // FCD boundary after the last character.
                limit = segmentLimit = p;
                break;
            }
        }
        assert(pos != limit);
        checkDir = 0;
    }

    /**
     * Switches to backward checking.
     * To be called when {@code checkDir > 0 || (checkDir == 0 && pos == start)}.
     * Returns with {@code checkDir < 0 || (checkDir == 0 && pos != start)}.
     */
    private void switchToBackward() {
        assert((checkDir > 0 && seq == rawSeq) || (checkDir == 0 && pos == start));
        if(checkDir > 0) {
            // Turn around from forward checking.
            limit = segmentLimit = pos;
            if(pos == segmentStart) {
                start = rawStart;
                checkDir = -1;  // Check backward.
            } else {  // pos > segmentStart
                checkDir = 0;  // Stay in FCD segment.
            }
        } else {
            // Reached the start of the FCD segment.
            if(seq == rawSeq) {
                // The input text segment is FCD, extend it backward.
            } else {
                // The input text segment needed to be normalized.
                // Switch to checking backward from it.
                seq = rawSeq;
                pos = limit = segmentLimit = segmentStart;
            }
            start = rawStart;
            checkDir = -1;
        }
    }

    /**
     * Extend the FCD text segment backward or normalize around pos.
     * To be called when {@code checkDir < 0 && pos != start}.
     * Returns with {@code checkDir == 0} and {@code pos != start}.
     */
    private void previousSegment() {
        assert(checkDir < 0 && seq == rawSeq && pos != start);
        // The input text [pos..segmentLimit[ passes the FCD check.
        int p = pos;
        int nextCC = 0;
        for(;;) {
            // Fetch the previous character's fcd16 value.
            int q = p;
            int c = Character.codePointBefore(seq, p);
            p -= Character.charCount(c);
            int fcd16 = nfcImpl.getFCD16(c);
            // The lower byte of fcd16 is the trail combining class, the upper byte the lead one.
            int trailCC = fcd16 & 0xff;
            if(trailCC == 0 && q != pos) {
                // FCD boundary after the [p, q[ character.
                start = segmentStart = q;
                break;
            }
            if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
                                CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
                // Fails FCD check. Find the previous FCD boundary and normalize.
                do {
                    q = p;
                    if(fcd16 <= 0xff || p == rawStart) { break; }
                    c = Character.codePointBefore(seq, p);
                    p -= Character.charCount(c);
                } while((fcd16 = nfcImpl.getFCD16(c)) != 0);
                normalize(q, pos);
                // Continue from the end of the normalized buffer.
                pos = limit;
                break;
            }
            nextCC = fcd16 >> 8;
            if(p == rawStart || nextCC == 0) {
                // FCD boundary before the following character.
                start = segmentStart = p;
                break;
            }
        }
        assert(pos != start);
        checkDir = 0;
    }

    /**
     * Decomposes the raw text range [from, to[ into the {@code normalized} buffer
     * and redirects iteration to that buffer.
     * The caller is responsible for setting {@code pos} and {@code checkDir} afterwards.
     *
     * @param from start index of the raw text range (inclusive)
     * @param to   end index of the raw text range (exclusive)
     */
    private void normalize(int from, int to) {
        // The buffer is allocated on first use.
        if(normalized == null) {
            normalized = new StringBuilder();
        }
        // NFD without argument checking.
        nfcImpl.decompose(rawSeq, from, to, normalized, to - from);
        // Switch collation processing into the FCD buffer
        // with the result of normalizing [segmentStart, segmentLimit[.
        segmentStart = from;
        segmentLimit = to;
        seq = normalized;
        start = 0;
        limit = start + normalized.length();
    }

    // Text pointers: The input text is rawSeq[rawStart, rawLimit[.
    // (In C++, these are const UChar * pointers.
    // In Java, we use CharSequence rawSeq and the parent class' seq
    // together with int indexes.)
    //
    // checkDir > 0:
    //
    // The input text rawSeq[segmentStart..pos[ passes the FCD check.
    // Moving forward checks incrementally.
    // segmentLimit is undefined. seq == rawSeq. limit == rawLimit.
    //
    // checkDir < 0:
    //
    // The input text rawSeq[pos..segmentLimit[ passes the FCD check.
    // Moving backward checks incrementally.
    // segmentStart is undefined. seq == rawSeq. start == rawStart.
    //
    // checkDir == 0:
    //
    // The input text rawSeq[segmentStart..segmentLimit[ is being processed.
    // These pointers are at FCD boundaries.
    // Either this text segment already passes the FCD check
    // and seq==rawSeq && segmentStart==start<=pos<=limit==segmentLimit,
    // or the current segment had to be normalized so that
    // rawSeq[segmentStart..segmentLimit[ turned into the normalized string,
    // corresponding to seq==normalized && 0==start<=pos<=limit==start+normalized.length().
    private CharSequence rawSeq;
    private static final int rawStart = 0;
    private int segmentStart;
    private int segmentLimit;
    private int rawLimit;

    private final Normalizer2Impl nfcImpl;
    // Buffer holding the decomposed form of the current segment; allocated on first use.
    private StringBuilder normalized;
    // Direction of incremental FCD check. See comments before rawStart.
    private int checkDir;
}