All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.impl.UnicodeSetStringSpan Maven / Gradle / Ivy

Go to download

International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support

There is a newer version: 76.1
Show newest version
/*
 ******************************************************************************
 *
 *   Copyright (C) 2009-2012, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
 */

package com.ibm.icu.impl;

import java.util.ArrayList;

import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSet.SpanCondition;

/*
 * Implement span() etc. for a set with strings.
 * Avoid recursion because of its exponential complexity.
 * Instead, try multiple paths at once and track them with an IndexList.
 */
public class UnicodeSetStringSpan {

    /*
     * Which span() variant will be used? The object is either built for one variant and used once, or built for all and
     * may be used many times.
     */
    public static final int FWD           = 0x20;
    public static final int BACK          = 0x10;
    public static final int UTF16         = 8;
    public static final int CONTAINED     = 2;
    public static final int NOT_CONTAINED = 1;

    public static final int ALL = 0x3f;

    public static final int FWD_UTF16_CONTAINED      = FWD  | UTF16 |     CONTAINED;
    public static final int FWD_UTF16_NOT_CONTAINED  = FWD  | UTF16 | NOT_CONTAINED;
    public static final int BACK_UTF16_CONTAINED     = BACK | UTF16 |     CONTAINED;
    public static final int BACK_UTF16_NOT_CONTAINED = BACK | UTF16 | NOT_CONTAINED;

    // Special spanLength short values. (since Java has not unsigned byte type)
    // All code points in the string are contained in the parent set.
    static final short ALL_CP_CONTAINED = 0xff;
    // The spanLength is >=0xfe.
    static final short LONG_SPAN = ALL_CP_CONTAINED - 1;

    // Set for span(). Same as parent but without strings.
    private UnicodeSet spanSet;

    // Set for span(not contained).
    // Same as spanSet, plus characters that start or end strings.
    private UnicodeSet spanNotSet;

    // The strings of the parent set.
    private ArrayList strings;

    // the lengths of span(), spanBack() etc. for each string.
    private short[] spanLengths;

    // Maximum lengths of relevant strings.
    private int maxLength16;

    // Set up for all variants of span()?
    private boolean all;

    // Span helper
    private OffsetList offsets;

    // Construct for all variants of span(), or only for any one variant.
    // Initialize as little as possible, for single use.
    public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList setStrings, int which) {
        spanSet = new UnicodeSet(0, 0x10ffff);
        strings = setStrings;
        all = (which == ALL);
        spanSet.retainAll(set);
        if (0 != (which & NOT_CONTAINED)) {
            // Default to the same sets.
            // addToSpanNotSet() will create a separate set if necessary.
            spanNotSet = spanSet;
        }
        offsets = new OffsetList();

        // Determine if the strings even need to be taken into account at all for span() etc.
        // If any string is relevant, then all strings need to be used for
        // span(longest match) but only the relevant ones for span(while contained).
        // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
        // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
        // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
        // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
        int stringsLength = strings.size();

        int i, spanLength;
        boolean someRelevant = false;
        for (i = 0; i < stringsLength; ++i) {
            String string = strings.get(i);
            int length16 = string.length();
            spanLength = spanSet.span(string, SpanCondition.CONTAINED);
            if (spanLength < length16) { // Relevant string.
                someRelevant = true;
            }
            if ((0 != (which & UTF16)) && length16 > maxLength16) {
                maxLength16 = length16;
            }
        }
        if (!someRelevant) {
            maxLength16 = 0;
            return;
        }

        // Freeze after checking for the need to use strings at all because freezing
        // a set takes some time and memory which are wasted if there are no relevant strings.
        if (all) {
            spanSet.freeze();
        }

        int spanBackLengthsOffset;

        // Allocate a block of meta data.
        int allocSize;
        if (all) {
            // 2 sets of span lengths
            allocSize = stringsLength * (2);
        } else {
            allocSize = stringsLength; // One set of span lengths.
        }
        spanLengths = new short[allocSize];

        if (all) {
            // Store span lengths for all span() variants.
            spanBackLengthsOffset = stringsLength;
        } else {
            // Store span lengths for only one span() variant.
            spanBackLengthsOffset = 0;
        }

        // Set the meta data and spanNotSet and write the UTF-8 strings.

        for (i = 0; i < stringsLength; ++i) {
            String string = strings.get(i);
            int length16 = string.length();
            spanLength = spanSet.span(string, SpanCondition.CONTAINED);
            if (spanLength < length16) { // Relevant string.
                if (0 != (which & UTF16)) {
                    if (0 != (which & CONTAINED)) {
                        if (0 != (which & FWD)) {
                            spanLengths[i] = makeSpanLengthByte(spanLength);
                        }
                        if (0 != (which & BACK)) {
                            spanLength = length16
                                    - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
                            spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
                        }
                    } else /* not CONTAINED, not all, but NOT_CONTAINED */{
                        spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
                                                                                     // flag.
                    }
                }
                if (0 != (which & NOT_CONTAINED)) {
                    // Add string start and end code points to the spanNotSet so that
                    // a span(while not contained) stops before any string.
                    int c;
                    if (0 != (which & FWD)) {
                        c = string.codePointAt(0);
                        addToSpanNotSet(c);
                    }
                    if (0 != (which & BACK)) {
                        c = string.codePointBefore(length16);
                        addToSpanNotSet(c);
                    }
                }
            } else { // Irrelevant string.
                if (all) {
                    spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
                } else {
                    // All spanXYZLengths pointers contain the same address.
                    spanLengths[i] = ALL_CP_CONTAINED;
                }
            }
        }

        // Finish.
        if (all) {
            spanNotSet.freeze();
        }
    }

    /**
     * Constructs a copy of an existing UnicodeSetStringSpan.
     * Assumes which==ALL for a frozen set.
     */
    public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, final ArrayList newParentSetStrings) {
        spanSet = otherStringSpan.spanSet;
        strings = newParentSetStrings;
        maxLength16 = otherStringSpan.maxLength16;
        all = true;
        if (otherStringSpan.spanNotSet == otherStringSpan.spanSet) {
            spanNotSet = spanSet;
        } else {
            spanNotSet = (UnicodeSet) otherStringSpan.spanNotSet.clone();
        }
        offsets = new OffsetList();

        spanLengths = otherStringSpan.spanLengths.clone();
    }

    /*
     * Do the strings need to be checked in span() etc.?
     * 
     * @return TRUE if strings need to be checked (call span() here), FALSE if not (use a BMPSet for best performance).
     */
    public boolean needsStringSpanUTF16() {
        return (maxLength16 != 0);
    }

    // For fast UnicodeSet::contains(c).
    public boolean contains(int c) {
        return spanSet.contains(c);
    }

    // Add a starting or ending string character to the spanNotSet
    // so that a character span ends before any string.
    private void addToSpanNotSet(int c) {
        if (spanNotSet == null || spanNotSet == spanSet) {
            if (spanSet.contains(c)) {
                return; // Nothing to do.
            }
            spanNotSet = spanSet.cloneAsThawed();
        }
        spanNotSet.add(c);
    }

    /*
     * Note: In span() when spanLength==0 (after a string match, or at the beginning after an empty code point span) and
     * in spanNot() and spanNotUTF8(), string matching could use a binary search because all string matches are done
     * from the same start index.
     * 
     * For UTF-8, this would require a comparison function that returns UTF-16 order.
     * 
     * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
     * with strings have very few very short strings. For cases with many strings, it might be better to use a different
     * API and implementation with a DFA (state machine).
     */

    /*
     * Algorithm for span(SpanCondition.CONTAINED)
     * 
     * Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
     * is in the set, then remember to continue after it. + If a set string matches at the current position, then
     * remember to continue after it. + Either recursively span for each code point or string match, or recursively span
     * for all but the shortest one and iteratively continue the span with the shortest local match. + Remember the
     * longest recursive span (the farthest end point). + If there is no match at the current position, neither for the
     * code point there nor for any set string, then stop and return the longest recursive span length.
     * 
     * Optimized implementation:
     * 
     * (We assume that most sets will have very few very short strings. A span using a string-less set is extremely
     * fast.)
     * 
     * Create and cache a spanSet which contains all of the single code points of the original set but none of its
     * strings.
     * 
     * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - Loop: + Try to match each set
     * string at the end of the spanLength. ~ Set strings that start with set-contained code points must be matched with
     * a partial overlap because the recursive algorithm would have tried to match them at every position. ~ Set strings
     * that entirely consist of set-contained code points are irrelevant for span(SpanCondition.CONTAINED)
     * because the recursive algorithm would continue after them anyway and find the longest recursive match from their
     * end. ~ Rather than recursing, note each end point of a set string match. + If no set string matched after
     * spanSet.span(), then return with where the spanSet.span() ended. + If at least one set string matched after
     * spanSet.span(), then pop the shortest string match end point and continue the loop, trying to match all set
     * strings from there. + If at least one more set string matched after a previous string match, then test if the
     * code point after the previous string match is also contained in the set. Continue the loop with the shortest end
     * point of either this code point or a matching set string. + If no more set string matched after a previous string
     * match, then try another spanLength=spanSet.span(SpanCondition.CONTAINED). Stop if spanLength==0,
     * otherwise continue the loop.
     * 
     * By noting each end point of a set string match, the function visits each string position at most once and
     * finishes in linear time.
     * 
     * The recursive algorithm may visit the same string position many times if multiple paths lead to it and finishes
     * in exponential time.
     */

    /*
     * Algorithm for span(SIMPLE)
     * 
     * Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
     * is in the set, then remember to continue after it. + If a set string matches at the current position, then
     * remember to continue after it. + Continue from the farthest match position and ignore all others. + If there is
     * no match at the current position, then stop and return the current position.
     * 
     * Optimized implementation:
     * 
     * (Same assumption and spanSet as above.)
     * 
     * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - Loop: + Try to match each set
     * string at the end of the spanLength. ~ Set strings that start with set-contained code points must be matched with
     * a partial overlap because the standard algorithm would have tried to match them earlier. ~ Set strings that
     * entirely consist of set-contained code points must be matched with a full overlap because the longest-match
     * algorithm would hide set string matches that end earlier. Such set strings need not be matched earlier inside the
     * code point span because the standard algorithm would then have continued after the set string match anyway. ~
     * Remember the longest set string match (farthest end point) from the earliest starting point. + If no set string
     * matched after spanSet.span(), then return with where the spanSet.span() ended. + If at least one set string
     * matched, then continue the loop after the longest match from the earliest position. + If no more set string
     * matched after a previous string match, then try another
     * spanLength=spanSet.span(SpanCondition.CONTAINED). Stop if spanLength==0, otherwise continue the
     * loop.
     */
    /**
     * Span a string.
     * 
     * @param s The string to be spanned
     * @param start The start index that the span begins
     * @param spanCondition The span condition
     * @return the length of the span
     */
    public synchronized int span(CharSequence s, int start, int length, SpanCondition spanCondition) {
        if (spanCondition == SpanCondition.NOT_CONTAINED) {
            return spanNot(s, start, length);
        }
        int spanLength = spanSet.span(s.subSequence(start, start + length), SpanCondition.CONTAINED);
        if (spanLength == length) {
            return length;
        }

        // Consider strings; they may overlap with the span.
        int initSize = 0;
        if (spanCondition == SpanCondition.CONTAINED) {
            // Use offset list to try all possibilities.
            initSize = maxLength16;
        }
        offsets.setMaxLength(initSize);
        int pos = start + spanLength, rest = length - spanLength;
        int i, stringsLength = strings.size();
        for (;;) {
            if (spanCondition == SpanCondition.CONTAINED) {
                for (i = 0; i < stringsLength; ++i) {
                    int overlap = spanLengths[i];
                    if (overlap == ALL_CP_CONTAINED) {
                        continue; // Irrelevant string.
                    }
                    String string = strings.get(i);

                    int length16 = string.length();

                    // Try to match this string at pos-overlap..pos.
                    if (overlap >= LONG_SPAN) {
                        overlap = length16;
                        // While contained: No point matching fully inside the code point span.
                        overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
                                                                          // point.
                    }
                    if (overlap > spanLength) {
                        overlap = spanLength;
                    }
                    int inc = length16 - overlap; // Keep overlap+inc==length16.
                    for (;;) {
                        if (inc > rest) {
                            break;
                        }
                        // Try to match if the increment is not listed already.
                        if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
                            if (inc == rest) {
                                return length; // Reached the end of the string.
                            }
                            offsets.addOffset(inc);
                        }
                        if (overlap == 0) {
                            break;
                        }
                        --overlap;
                        ++inc;
                    }
                }
            } else /* SIMPLE */{
                int maxInc = 0, maxOverlap = 0;
                for (i = 0; i < stringsLength; ++i) {
                    int overlap = spanLengths[i];
                    // For longest match, we do need to try to match even an all-contained string
                    // to find the match from the earliest start.

                    String string = strings.get(i);

                    int length16 = string.length();

                    // Try to match this string at pos-overlap..pos.
                    if (overlap >= LONG_SPAN) {
                        overlap = length16;
                        // Longest match: Need to match fully inside the code point span
                        // to find the match from the earliest start.
                    }
                    if (overlap > spanLength) {
                        overlap = spanLength;
                    }
                    int inc = length16 - overlap; // Keep overlap+inc==length16.
                    for (;;) {
                        if (inc > rest || overlap < maxOverlap) {
                            break;
                        }
                        // Try to match if the string is longer or starts earlier.
                        if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
                                && matches16CPB(s, pos - overlap, length, string, length16)) {
                            maxInc = inc; // Longest match from earliest start.
                            maxOverlap = overlap;
                            break;
                        }
                        --overlap;
                        ++inc;
                    }
                }

                if (maxInc != 0 || maxOverlap != 0) {
                    // Longest-match algorithm, and there was a string match.
                    // Simply continue after it.
                    pos += maxInc;
                    rest -= maxInc;
                    if (rest == 0) {
                        return length; // Reached the end of the string.
                    }
                    spanLength = 0; // Match strings from after a string match.
                    continue;
                }
            }
            // Finished trying to match all strings at pos.

            if (spanLength != 0 || pos == 0) {
                // The position is after an unlimited code point span (spanLength!=0),
                // not after a string match.
                // The only position where spanLength==0 after a span is pos==0.
                // Otherwise, an unlimited code point span is only tried again when no
                // strings match, and if such a non-initial span fails we stop.
                if (offsets.isEmpty()) {
                    return pos - start; // No strings matched after a span.
                }
                // Match strings from after the next string match.
            } else {
                // The position is after a string match (or a single code point).
                if (offsets.isEmpty()) {
                    // No more strings matched after a previous string match.
                    // Try another code point span from after the last string match.
                    spanLength = spanSet.span(s.subSequence(pos, pos + rest), SpanCondition.CONTAINED);
                    if (spanLength == rest || // Reached the end of the string, or
                            spanLength == 0 // neither strings nor span progressed.
                    ) {
                        return pos + spanLength - start;
                    }
                    pos += spanLength;
                    rest -= spanLength;
                    continue; // spanLength>0: Match strings from after a span.
                } else {
                    // Try to match only one code point from after a string match if some
                    // string matched beyond it, so that we try all possible positions
                    // and don't overshoot.
                    spanLength = spanOne(spanSet, s, pos, rest);
                    if (spanLength > 0) {
                        if (spanLength == rest) {
                            return length; // Reached the end of the string.
                        }
                        // Match strings after this code point.
                        // There cannot be any increments below it because UnicodeSet strings
                        // contain multiple code points.
                        pos += spanLength;
                        rest -= spanLength;
                        offsets.shift(spanLength);
                        spanLength = 0;
                        continue; // Match strings from after a single code point.
                    }
                    // Match strings from after the next string match.
                }
            }
            int minOffset = offsets.popMinimum();
            pos += minOffset;
            rest -= minOffset;
            spanLength = 0; // Match strings from after a string match.
        }
    }

    /**
     * Span a string backwards.
     * 
     * @param s The string to be spanned
     * @param spanCondition The span condition
     * @return The string index which starts the span (i.e. inclusive).
     */
    public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
        if (spanCondition == SpanCondition.NOT_CONTAINED) {
            return spanNotBack(s, length);
        }
        int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
        if (pos == 0) {
            return 0;
        }
        int spanLength = length - pos;

        // Consider strings; they may overlap with the span.
        int initSize = 0;
        if (spanCondition == SpanCondition.CONTAINED) {
            // Use offset list to try all possibilities.
            initSize = maxLength16;
        }
        offsets.setMaxLength(initSize);
        int i, stringsLength = strings.size();
        int spanBackLengthsOffset = 0;
        if (all) {
            spanBackLengthsOffset = stringsLength;
        }
        for (;;) {
            if (spanCondition == SpanCondition.CONTAINED) {
                for (i = 0; i < stringsLength; ++i) {
                    int overlap = spanLengths[spanBackLengthsOffset + i];
                    if (overlap == ALL_CP_CONTAINED) {
                        continue; // Irrelevant string.
                    }
                    String string = strings.get(i);

                    int length16 = string.length();

                    // Try to match this string at pos-(length16-overlap)..pos-length16.
                    if (overlap >= LONG_SPAN) {
                        overlap = length16;
                        // While contained: No point matching fully inside the code point span.
                        int len1 = 0;
                        len1 = string.offsetByCodePoints(0, 1);
                        overlap -= len1; // Length of the string minus the first code point.
                    }
                    if (overlap > spanLength) {
                        overlap = spanLength;
                    }
                    int dec = length16 - overlap; // Keep dec+overlap==length16.
                    for (;;) {
                        if (dec > pos) {
                            break;
                        }
                        // Try to match if the decrement is not listed already.
                        if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
                            if (dec == pos) {
                                return 0; // Reached the start of the string.
                            }
                            offsets.addOffset(dec);
                        }
                        if (overlap == 0) {
                            break;
                        }
                        --overlap;
                        ++dec;
                    }
                }
            } else /* SIMPLE */{
                int maxDec = 0, maxOverlap = 0;
                for (i = 0; i < stringsLength; ++i) {
                    int overlap = spanLengths[spanBackLengthsOffset + i];
                    // For longest match, we do need to try to match even an all-contained string
                    // to find the match from the latest end.

                    String string = strings.get(i);

                    int length16 = string.length();

                    // Try to match this string at pos-(length16-overlap)..pos-length16.
                    if (overlap >= LONG_SPAN) {
                        overlap = length16;
                        // Longest match: Need to match fully inside the code point span
                        // to find the match from the latest end.
                    }
                    if (overlap > spanLength) {
                        overlap = spanLength;
                    }
                    int dec = length16 - overlap; // Keep dec+overlap==length16.
                    for (;;) {
                        if (dec > pos || overlap < maxOverlap) {
                            break;
                        }
                        // Try to match if the string is longer or ends later.
                        if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
                                && matches16CPB(s, pos - dec, length, string, length16)) {
                            maxDec = dec; // Longest match from latest end.
                            maxOverlap = overlap;
                            break;
                        }
                        --overlap;
                        ++dec;
                    }
                }

                if (maxDec != 0 || maxOverlap != 0) {
                    // Longest-match algorithm, and there was a string match.
                    // Simply continue before it.
                    pos -= maxDec;
                    if (pos == 0) {
                        return 0; // Reached the start of the string.
                    }
                    spanLength = 0; // Match strings from before a string match.
                    continue;
                }
            }
            // Finished trying to match all strings at pos.

            if (spanLength != 0 || pos == length) {
                // The position is before an unlimited code point span (spanLength!=0),
                // not before a string match.
                // The only position where spanLength==0 before a span is pos==length.
                // Otherwise, an unlimited code point span is only tried again when no
                // strings match, and if such a non-initial span fails we stop.
                if (offsets.isEmpty()) {
                    return pos; // No strings matched before a span.
                }
                // Match strings from before the next string match.
            } else {
                // The position is before a string match (or a single code point).
                if (offsets.isEmpty()) {
                    // No more strings matched before a previous string match.
                    // Try another code point span from before the last string match.
                    int oldPos = pos;
                    pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
                    spanLength = oldPos - pos;
                    if (pos == 0 || // Reached the start of the string, or
                            spanLength == 0 // neither strings nor span progressed.
                    ) {
                        return pos;
                    }
                    continue; // spanLength>0: Match strings from before a span.
                } else {
                    // Try to match only one code point from before a string match if some
                    // string matched beyond it, so that we try all possible positions
                    // and don't overshoot.
                    spanLength = spanOneBack(spanSet, s, pos);
                    if (spanLength > 0) {
                        if (spanLength == pos) {
                            return 0; // Reached the start of the string.
                        }
                        // Match strings before this code point.
                        // There cannot be any decrements below it because UnicodeSet strings
                        // contain multiple code points.
                        pos -= spanLength;
                        offsets.shift(spanLength);
                        spanLength = 0;
                        continue; // Match strings from before a single code point.
                    }
                    // Match strings from before the next string match.
                }
            }
            pos -= offsets.popMinimum();
            spanLength = 0; // Match strings from before a string match.
        }
    }

    /*
     * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
     * 
     * Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
     * is in the set, then return with the current position. + If a set string matches at the current position, then
     * return with the current position.
     * 
     * Optimized implementation:
     * 
     * (Same assumption as for span() above.)
     * 
     * Create and cache a spanNotSet which contains all of the single code points of the original set but none of its
     * strings. For each set string add its initial code point to the spanNotSet. (Also add its final code point for
     * spanNotBack().)
     * 
     * - Loop:
     *   + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
     *   + If the current code point is in the original set, then return the current position.
     *   + If any set string matches at the current position, then return the current position.
     *   + If there is no match at the current position, neither for the code point
     * there nor for any set string, then skip this code point and continue the loop. This happens for
     * set-string-initial code points that were added to spanNotSet when there is not actually a match for such a set
     * string.
     *
     * @return the length of the span
     */
    private int spanNot(CharSequence s, int start, int length) {
        int pos = start, rest = length;
        int i, stringsLength = strings.size();
        do {
            // Span until we find a code point from the set,
            // or a code point that starts or ends some string.
            i = spanNotSet.span(s.subSequence(pos, pos + rest), SpanCondition.NOT_CONTAINED);
            if (i == rest) {
                return length; // Reached the end of the string.
            }
            pos += i;
            rest -= i;

            // Check whether the current code point is in the original set,
            // without the string starts and ends.
            int cpLength = spanOne(spanSet, s, pos, rest);
            if (cpLength > 0) {
                return pos - start; // There is a set element at pos.
            }

            // Try to match the strings at pos.
            for (i = 0; i < stringsLength; ++i) {
                if (spanLengths[i] == ALL_CP_CONTAINED) {
                    continue; // Irrelevant string.
                }
                String string = strings.get(i);

                int length16 = string.length();
                if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
                    return pos - start; // There is a set element at pos.
                }
            }

            // The span(while not contained) ended on a string start/end which is
            // not in the original set. Skip this code point and continue.
            // cpLength<0
            pos -= cpLength;
            rest += cpLength;
        } while (rest != 0);
        return length; // Reached the end of the string.
    }

    private int spanNotBack(CharSequence s, int length) {
        int pos = length;
        int i, stringsLength = strings.size();
        do {
            // Span until we find a code point from the set,
            // or a code point that starts or ends some string.
            pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
            if (pos == 0) {
                return 0; // Reached the start of the string.
            }

            // Check whether the current code point is in the original set,
            // without the string starts and ends.
            int cpLength = spanOneBack(spanSet, s, pos);
            if (cpLength > 0) {
                return pos; // There is a set element at pos.
            }

            // Try to match the strings at pos.
            for (i = 0; i < stringsLength; ++i) {
                // Use spanLengths rather than a spanLengths pointer because
                // it is easier and we only need to know whether the string is irrelevant
                // which is the same in either array.
                if (spanLengths[i] == ALL_CP_CONTAINED) {
                    continue; // Irrelevant string.
                }
                String string = strings.get(i);

                int length16 = string.length();
                if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
                    return pos; // There is a set element at pos.
                }
            }

            // The span(while not contained) ended on a string start/end which is
            // not in the original set. Skip this code point and continue.
            // cpLength<0
            pos += cpLength;
        } while (pos != 0);
        return 0; // Reached the start of the string.
    }

    static short makeSpanLengthByte(int spanLength) {
        // 0xfe==UnicodeSetStringSpan::LONG_SPAN
        return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
    }

    // Compare strings without any argument checks. Requires length>0.
    private static boolean matches16(CharSequence s, int start, final String t, int length) {
        int end = start + length;
        while (length-- > 0) {
            if (s.charAt(--end) != t.charAt(length)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Compare 16-bit Unicode strings (which may be malformed UTF-16)
     * at code point boundaries.
     * That is, each edge of a match must not be in the middle of a surrogate pair.
     * @param start   The start index of s.
     * @param slength The length of s from start.
     * @param tlength The length of t.
     */
    static boolean matches16CPB(CharSequence s, int start, int slength, final String t, int tlength) {
        return !(0 < start && com.ibm.icu.text.UTF16.isLeadSurrogate (s.charAt(start - 1)) &&
                              com.ibm.icu.text.UTF16.isTrailSurrogate(s.charAt(start + 0)))
                && !(tlength < slength && com.ibm.icu.text.UTF16.isLeadSurrogate (s.charAt(start + tlength - 1)) &&
                                       com.ibm.icu.text.UTF16.isTrailSurrogate(s.charAt(start + tlength)))
                && matches16(s, start, t, tlength);
    }

    // Does the set contain the next code point?
    // If so, return its length; otherwise return its negative length.
    static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
        char c = s.charAt(start);
        if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
            char c2 = s.charAt(start + 1);
            if (com.ibm.icu.text.UTF16.isTrailSurrogate(c2)) {
                int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
                return set.contains(supplementary) ? 2 : -2;
            }
        }
        return set.contains(c) ? 1 : -1;
    }

    static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
        char c = s.charAt(length - 1);
        if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
            char c2 = s.charAt(length - 2);
            if (com.ibm.icu.text.UTF16.isLeadSurrogate(c2)) {
                int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
                return set.contains(supplementary) ? 2 : -2;
            }
        }
        return set.contains(c) ? 1 : -1;
    }


    /*
     * Helper class for UnicodeSetStringSpan.
     *
     * List of offsets from the current position from where to try matching a code point or a string. Store offsets rather
     * than indexes to simplify the code and use the same list for both increments (in span()) and decrements (in
     * spanBack()).
     * 
     * Assumption: The maximum offset is limited, and the offsets that are stored at any one time are relatively dense, that
     * is, there are normally no gaps of hundreds or thousands of offset values.
     * 
     * The implementation uses a circular buffer of byte flags, each indicating whether the corresponding offset is in the
     * list. This avoids inserting into a sorted list of offsets (or absolute indexes) and physically moving part of the
     * list.
     * 
     * Note: In principle, the caller should setMaxLength() to the maximum of the max string length and U16_LENGTH/U8_LENGTH
     * to account for "long" single code points.
     * 
     * Note: If maxLength were guaranteed to be no more than 32 or 64, the list could be stored as bit flags in a single
     * integer. Rather than handling a circular buffer with a start list index, the integer would simply be shifted when
     * lower offsets are removed. UnicodeSet does not have a limit on the lengths of strings.
     */
    static class OffsetList {
        private boolean[] list;
        private int length;
        private int start;

        public OffsetList() {
            list = new boolean[16];  // default size
        }

        public void setMaxLength(int maxLength) {
            if (maxLength > list.length) {
                list = new boolean[maxLength];
            }
            clear();
        }

        public void clear() {
            for (int i = list.length; i-- > 0;) {
                list[i] = false;
            }
            start = length = 0;
        }

        public boolean isEmpty() {
            return (length == 0);
        }

        // Reduce all stored offsets by delta, used when the current position
        // moves by delta.
        // There must not be any offsets lower than delta.
        // If there is an offset equal to delta, it is removed.
        // delta=[1..maxLength]
        public void shift(int delta) {
            int i = start + delta;
            if (i >= list.length) {
                i -= list.length;
            }
            if (list[i]) {
                list[i] = false;
                --length;
            }
            start = i;
        }

        // Add an offset. The list must not contain it yet.
        // offset=[1..maxLength]
        public void addOffset(int offset) {
            int i = start + offset;
            if (i >= list.length) {
                i -= list.length;
            }
            list[i] = true;
            ++length;
        }

        // offset=[1..maxLength]
        public boolean containsOffset(int offset) {
            int i = start + offset;
            if (i >= list.length) {
                i -= list.length;
            }
            return list[i];
        }

        // Find the lowest stored offset from a non-empty list, remove it,
        // and reduce all other offsets by this minimum.
        // Returns [1..maxLength].
        public int popMinimum() {
            // Look for the next offset in list[start+1..list.length-1].
            int i = start, result;
            while (++i < list.length) {
                if (list[i]) {
                    list[i] = false;
                    --length;
                    result = i - start;
                    start = i;
                    return result;
                }
            }
            // i==list.length

            // Wrap around and look for the next offset in list[0..start].
            // Since the list is not empty, there will be one.
            result = list.length - start;
            i = 0;
            while (!list[i]) {
                ++i;
            }
            list[i] = false;
            --length;
            start = i;
            return result + i;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy