com.ibm.icu.impl.StringSegment Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support
There is a newer version: 76.1
Show newest version
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.impl;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UnicodeSet;

/**
 * A mutable String wrapper with a variable offset and length and
 * support for case folding. The charAt, length, and subSequence methods all
 * operate relative to the fixed offset into the String.
 *
 * Intended to be useful for parsing.
 *
 * CAUTION: Since this class is mutable, it must not be used anywhere that an
 * immutable object is required, like in a cache or as the key of a hash map.
 *
 * @author sffc (Shane Carr)
 */
public class StringSegment implements CharSequence {
    private final String str;
    private int start;
    private int end;
    private boolean foldCase;

    public StringSegment(String str, boolean foldCase) {
        this.str = str;
        this.start = 0;
        this.end = str.length();
        this.foldCase = foldCase;
    }

    public int getOffset() {
        return start;
    }

    public void setOffset(int start) {
        assert start <= end;
        this.start = start;
    }

    /**
     * Equivalent to setOffset(getOffset()+delta).
     *
     * 
     * Number parsing note: This method is usually called by a Matcher to register that a char was
     * consumed. If the char is strong (it usually is, except for things like whitespace), follow this
     * with a call to ParsedNumber#setCharsConsumed(). For more information on strong chars, see that
     * method.
     */
    public void adjustOffset(int delta) {
        assert start + delta >= 0;
        assert start + delta <= end;
        start += delta;
    }

    /**
     * Adjusts the offset by the width of the current lead code point, either 1 or 2 chars.
     */
    public void adjustOffsetByCodePoint() {
        start += Character.charCount(getCodePoint());
    }

    public void setLength(int length) {
        assert length >= 0;
        assert start + length <= str.length();
        end = start + length;
    }

    public void resetLength() {
        end = str.length();
    }

    @Override
    public int length() {
        return end - start;
    }

    @Override
    public char charAt(int index) {
        return str.charAt(index + start);
    }

    @Override
    public CharSequence subSequence(int start, int end) {
        return str.subSequence(start + this.start, end + this.start);
    }

    /**
     * Returns the first code point in the string segment.
     *
     * 

     * Important: Most of the time, you should use {@link #startsWith}, which handles
     * case folding logic, instead of this method.
     */
    public int getCodePoint() {
        assert start < end;
        char lead = str.charAt(start);
        char trail;
        if (Character.isHighSurrogate(lead)
                && start + 1 < end
                && Character.isLowSurrogate(trail = str.charAt(start + 1))) {
            return Character.toCodePoint(lead, trail);
        }
        return lead;
    }

    /**
     * Returns the code point at the given index relative to the current offset.
     */
    public int codePointAt(int index) {
        return str.codePointAt(start + index);
    }

    /**
     * Returns true if the first code point of this StringSegment equals the given code point.
     *
     * 

     * This method will perform case folding if case folding is enabled for the parser.
     */
    public boolean startsWith(int otherCp) {
        return codePointsEqual(getCodePoint(), otherCp, foldCase);
    }

    /**
     * Returns true if the first code point of this StringSegment is in the given UnicodeSet.
     */
    public boolean startsWith(UnicodeSet uniset) {
        // TODO: Move UnicodeSet case-folding logic here.
        // TODO: Handle string matches here instead of separately.
        int cp = getCodePoint();
        if (cp == -1) {
            return false;
        }
        return uniset.contains(cp);
    }

    /**
     * Returns true if there is at least one code point of overlap between this StringSegment and the
     * given CharSequence. Null-safe.
     */
    public boolean startsWith(CharSequence other) {
        if (other == null || other.length() == 0 || length() == 0) {
            return false;
        }
        int cp1 = Character.codePointAt(this, 0);
        int cp2 = Character.codePointAt(other, 0);
        return codePointsEqual(cp1, cp2, foldCase);
    }

    /**
     * Returns the length of the prefix shared by this StringSegment and the given CharSequence. For
     * example, if this string segment is "aab", and the char sequence is "aac", this method returns 2,
     * since the first 2 characters are the same.
     *
     * 

     * This method only returns offsets along code point boundaries.
     *
     * 

     * This method will perform case folding if case folding was enabled in the constructor.
     *
     * 

     * IMPORTANT: The given CharSequence must not be empty! It is the caller's responsibility to check.
     */
    public int getCommonPrefixLength(CharSequence other) {
        return getPrefixLengthInternal(other, foldCase);
    }

    /**
     * Like {@link #getCommonPrefixLength}, but never performs case folding, even if case folding was
     * enabled in the constructor.
     */
    public int getCaseSensitivePrefixLength(CharSequence other) {
        return getPrefixLengthInternal(other, false);
    }

    private int getPrefixLengthInternal(CharSequence other, boolean foldCase) {
        assert other.length() != 0;
        int offset = 0;
        for (; offset < Math.min(length(), other.length());) {
            // TODO: case-fold code points, not chars
            int cp1 = Character.codePointAt(this, offset);
            int cp2 = Character.codePointAt(other, offset);
            if (!codePointsEqual(cp1, cp2, foldCase)) {
                break;
            }
            offset += Character.charCount(cp1);
        }
        return offset;
    }

    private static final boolean codePointsEqual(int cp1, int cp2, boolean foldCase) {
        if (cp1 == cp2) {
            return true;
        }
        if (!foldCase) {
            return false;
        }
        cp1 = UCharacter.foldCase(cp1, true);
        cp2 = UCharacter.foldCase(cp2, true);
        return cp1 == cp2;
    }

    /**
     * Returns true if this segment contains the same characters as the other CharSequence.
     *
     * This method does not perform case folding; if you want case-insensitive equality, use
     * {@link #getCommonPrefixLength}.
     */
    public boolean contentEquals(CharSequence other) {
        return Utility.charSequenceEquals(this, other);
    }

    /** Returns a string representation useful for debugging. */
    @Override
    public String toString() {
        return str.substring(0, start) + "[" + str.substring(start, end) + "]" + str.substring(end);
    }

    /** Returns a String that is equivalent to the CharSequence representation. */
    public String asString() {
        return str.substring(start, end);
    }
}