com.ibm.icu.impl.coll.FCDIterCollationIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support
The newest version!
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* FCDIterCollationIterator.java, ported from uitercollationiterator.h/.cpp
*
* C++ version created on: 2012sep23 (from utf16collationiterator.h)
* created by: Markus W. Scherer
*/

package com.ibm.icu.impl.coll;

import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.text.UCharacterIterator;

/**
 * Incrementally checks the input text for FCD and normalizes where necessary.
 */
public final class FCDIterCollationIterator extends IterCollationIterator {
    public FCDIterCollationIterator(CollationData data, boolean numeric,
            UCharacterIterator ui, int startIndex) {
        super(data, numeric, ui);
        state = State.ITER_CHECK_FWD;
        start = startIndex;
        nfcImpl = data.nfcImpl;
    }

    @Override
    public void resetToOffset(int newOffset) {
        super.resetToOffset(newOffset);
        start = newOffset;
        state = State.ITER_CHECK_FWD;
    }

    @Override
    public int getOffset() {
        if(state.compareTo(State.ITER_CHECK_BWD) <= 0) {
            return iter.getIndex();
        } else if(state == State.ITER_IN_FCD_SEGMENT) {
            return pos;
        } else if(pos == 0) {
            return start;
        } else {
            return limit;
        }
    }

    @Override
    public int nextCodePoint() {
        int c;
        for(;;) {
            if(state == State.ITER_CHECK_FWD) {
                c = iter.next();
                if(c < 0) {
                    return c;
                }
                if(CollationFCD.hasTccc(c)) {
                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
                            CollationFCD.hasLccc(iter.current())) {
                        iter.previous();
                        if(!nextSegment()) {
                            return Collation.SENTINEL_CP;
                        }
                        continue;
                    }
                }
                if(isLeadSurrogate(c)) {
                    int trail = iter.next();
                    if(isTrailSurrogate(trail)) {
                        return Character.toCodePoint((char)c, (char)trail);
                    } else if(trail >= 0) {
                        iter.previous();
                    }
                }
                return c;
            } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
                c = iter.nextCodePoint();
                pos += Character.charCount(c);
                assert(c >= 0);
                return c;
            } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 &&
                    pos != normalized.length()) {
                c = normalized.codePointAt(pos);
                pos += Character.charCount(c);
                return c;
            } else {
                switchToForward();
            }
        }
    }

    @Override
    public int previousCodePoint() {
        int c;
        for(;;) {
            if(state == State.ITER_CHECK_BWD) {
                c = iter.previous();
                if(c < 0) {
                    start = pos = 0;
                    state = State.ITER_IN_FCD_SEGMENT;
                    return Collation.SENTINEL_CP;
                }
                if(CollationFCD.hasLccc(c)) {
                    int prev = Collation.SENTINEL_CP;
                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
                            CollationFCD.hasTccc(prev = iter.previous())) {
                        iter.next();
                        if(prev >= 0) {
                            iter.next();
                        }
                        if(!previousSegment()) {
                            return Collation.SENTINEL_CP;
                        }
                        continue;
                    }
                    // hasLccc(trail)=true for all trail surrogates
                    if(isTrailSurrogate(c)) {
                        if(prev < 0) {
                            prev = iter.previous();
                        }
                        if(isLeadSurrogate(prev)) {
                            return Character.toCodePoint((char)prev, (char)c);
                        }
                    }
                    if(prev >= 0) {
                        iter.next();
                    }
                }
                return c;
            } else if(state == State.ITER_IN_FCD_SEGMENT && pos != start) {
                c = iter.previousCodePoint();
                pos -= Character.charCount(c);
                assert(c >= 0);
                return c;
            } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos != 0) {
                c = normalized.codePointBefore(pos);
                pos -= Character.charCount(c);
                return c;
            } else {
                switchToBackward();
            }
        }
    }

    @Override
    protected long handleNextCE32() {
        int c;
        for(;;) {
            if(state == State.ITER_CHECK_FWD) {
                c = iter.next();
                if(c < 0) {
                    return NO_CP_AND_CE32;
                }
                if(CollationFCD.hasTccc(c)) {
                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
                            CollationFCD.hasLccc(iter.current())) {
                        iter.previous();
                        if(!nextSegment()) {
                            c = Collation.SENTINEL_CP;
                            return Collation.FALLBACK_CE32;
                        }
                        continue;
                    }
                }
                break;
            } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
                c = iter.next();
                ++pos;
                assert(c >= 0);
                break;
            } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 &&
                    pos != normalized.length()) {
                c = normalized.charAt(pos++);
                break;
            } else {
                switchToForward();
            }
        }
        return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead((char)c));
    }

    @Override
    protected char handleGetTrailSurrogate() {
        if(state.compareTo(State.ITER_IN_FCD_SEGMENT) <= 0) {
            int trail = iter.next();
            if(isTrailSurrogate(trail)) {
                if(state == State.ITER_IN_FCD_SEGMENT) { ++pos; }
            } else if(trail >= 0) {
                iter.previous();
            }
            return (char)trail;
        } else {
            assert(pos < normalized.length());
            char trail;
            if(Character.isLowSurrogate(trail = normalized.charAt(pos))) { ++pos; }
            return trail;
        }
    }

    @Override
    protected void forwardNumCodePoints(int num) {
        // Specify the class to avoid a virtual-function indirection.
        // In Java, we would declare this class final.
        while(num > 0 && nextCodePoint() >= 0) {
            --num;
        }
    }

    @Override
    protected void backwardNumCodePoints(int num) {
        // Specify the class to avoid a virtual-function indirection.
        // In Java, we would declare this class final.
        while(num > 0 && previousCodePoint() >= 0) {
            --num;
        }
    }

    /**
     * Switches to forward checking if possible.
     */
    private void switchToForward() {
        assert(state == State.ITER_CHECK_BWD ||
                (state == State.ITER_IN_FCD_SEGMENT && pos == limit) ||
                (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == normalized.length()));
        if(state == State.ITER_CHECK_BWD) {
            // Turn around from backward checking.
            start = pos = iter.getIndex();
            if(pos == limit) {
                state = State.ITER_CHECK_FWD;  // Check forward.
            } else {  // pos < limit
                state = State.ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
            }
        } else {
            // Reached the end of the FCD segment.
            if(state == State.ITER_IN_FCD_SEGMENT) {
                // The input text segment is FCD, extend it forward.
            } else {
                // The input text segment needed to be normalized.
                // Switch to checking forward from it.
                if(state == State.IN_NORM_ITER_AT_START) {
                    iter.moveIndex(limit - start);
                }
                start = limit;
            }
            state = State.ITER_CHECK_FWD;
        }
    }

    /**
     * Extends the FCD text segment forward or normalizes around pos.
     * @return true if success
     */
    private boolean nextSegment() {
        assert(state == State.ITER_CHECK_FWD);
        // The input text [start..(iter index)[ passes the FCD check.
        pos = iter.getIndex();
        // Collect the characters being checked, in case they need to be normalized.
        if(s == null) {
            s = new StringBuilder();
        } else {
            s.setLength(0);
        }
        int prevCC = 0;
        for(;;) {
            // Fetch the next character and its fcd16 value.
            int c = iter.nextCodePoint();
            if(c < 0) { break; }
            int fcd16 = nfcImpl.getFCD16(c);
            int leadCC = fcd16 >> 8;
            if(leadCC == 0 && s.length() != 0) {
                // FCD boundary before this character.
                iter.previousCodePoint();
                break;
            }
            s.appendCodePoint(c);
            if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
                // Fails FCD check. Find the next FCD boundary and normalize.
                for(;;) {
                    c = iter.nextCodePoint();
                    if(c < 0) { break; }
                    if(nfcImpl.getFCD16(c) <= 0xff) {
                        iter.previousCodePoint();
                        break;
                    }
                    s.appendCodePoint(c);
                }
                normalize(s);
                start = pos;
                limit = pos + s.length();
                state = State.IN_NORM_ITER_AT_LIMIT;
                pos = 0;
                return true;
            }
            prevCC = fcd16 & 0xff;
            if(prevCC == 0) {
                // FCD boundary after the last character.
                break;
            }
        }
        limit = pos + s.length();
        assert(pos != limit);
        iter.moveIndex(-s.length());
        state = State.ITER_IN_FCD_SEGMENT;
        return true;
    }

    /**
     * Switches to backward checking.
     */
    private void switchToBackward() {
        assert(state == State.ITER_CHECK_FWD ||
                (state == State.ITER_IN_FCD_SEGMENT && pos == start) ||
                (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == 0));
        if(state == State.ITER_CHECK_FWD) {
            // Turn around from forward checking.
            limit = pos = iter.getIndex();
            if(pos == start) {
                state = State.ITER_CHECK_BWD;  // Check backward.
            } else {  // pos > start
                state = State.ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
            }
        } else {
            // Reached the start of the FCD segment.
            if(state == State.ITER_IN_FCD_SEGMENT) {
                // The input text segment is FCD, extend it backward.
            } else {
                // The input text segment needed to be normalized.
                // Switch to checking backward from it.
                if(state == State.IN_NORM_ITER_AT_LIMIT) {
                    iter.moveIndex(start - limit);
                }
                limit = start;
            }
            state = State.ITER_CHECK_BWD;
        }
    }

    /**
     * Extends the FCD text segment backward or normalizes around pos.
     * @return true if success
     */
    private boolean previousSegment() {
        assert(state == State.ITER_CHECK_BWD);
        // The input text [(iter index)..limit[ passes the FCD check.
        pos = iter.getIndex();
        // Collect the characters being checked, in case they need to be normalized.
        if(s == null) {
            s = new StringBuilder();
        } else {
            s.setLength(0);
        }
        int nextCC = 0;
        for(;;) {
            // Fetch the previous character and its fcd16 value.
            int c = iter.previousCodePoint();
            if(c < 0) { break; }
            int fcd16 = nfcImpl.getFCD16(c);
            int trailCC = fcd16 & 0xff;
            if(trailCC == 0 && s.length() != 0) {
                // FCD boundary after this character.
                iter.nextCodePoint();
                break;
            }
            s.appendCodePoint(c);
            if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
                                CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
                // Fails FCD check. Find the previous FCD boundary and normalize.
                while(fcd16 > 0xff) {
                    c = iter.previousCodePoint();
                    if(c < 0) { break; }
                    fcd16 = nfcImpl.getFCD16(c);
                    if(fcd16 == 0) {
                        iter.nextCodePoint();
                        break;
                    }
                    s.appendCodePoint(c);
                }
                s.reverse();
                normalize(s);
                limit = pos;
                start = pos - s.length();
                state = State.IN_NORM_ITER_AT_START;
                pos = normalized.length();
                return true;
            }
            nextCC = fcd16 >> 8;
            if(nextCC == 0) {
                // FCD boundary before the following character.
                break;
            }
        }
        start = pos - s.length();
        assert(pos != start);
        iter.moveIndex(s.length());
        state = State.ITER_IN_FCD_SEGMENT;
        return true;
    }

    private void normalize(CharSequence s) {
        if(normalized == null) {
            normalized = new StringBuilder();
        }
        // NFD without argument checking.
        nfcImpl.decompose(s, normalized);
    }

    private enum State {
        /**
         * The input text [start..(iter index)[ passes the FCD check.
         * Moving forward checks incrementally.
         * pos & limit are undefined.
         */
        ITER_CHECK_FWD,
        /**
         * The input text [(iter index)..limit[ passes the FCD check.
         * Moving backward checks incrementally.
         * start & pos are undefined.
         */
        ITER_CHECK_BWD,
        /**
         * The input text [start..limit[ passes the FCD check.
         * pos tracks the current text index.
         */
        ITER_IN_FCD_SEGMENT,
        /**
         * The input text [start..limit[ failed the FCD check and was normalized.
         * pos tracks the current index in the normalized string.
         * The text iterator is at the limit index.
         */
        IN_NORM_ITER_AT_LIMIT,
        /**
         * The input text [start..limit[ failed the FCD check and was normalized.
         * pos tracks the current index in the normalized string.
         * The text iterator is at the start index.
         */
        IN_NORM_ITER_AT_START
    }

    private State state;

    private int start;
    private int pos;
    private int limit;

    private final Normalizer2Impl nfcImpl;
    private StringBuilder s;
    private StringBuilder normalized;
}