// com.ibm.icu.impl.StringSegment Maven / Gradle / Ivy
// Show all versions of icu4j Show documentation
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.impl;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UnicodeSet;
/**
 * A mutable String wrapper with a variable offset and length and
 * support for case folding. The charAt, length, and subSequence methods all
 * operate relative to the current offset into the String.
 *
 * <p>Intended to be useful for parsing.
 *
 * <p>CAUTION: Since this class is mutable, it must not be used anywhere that an
 * immutable object is required, like in a cache or as the key of a hash map.
 *
 * @author sffc (Shane Carr)
 */
public class StringSegment implements CharSequence {
    /** The full backing string; the segment is the window {@code [start, end)} into it. */
    private final String str;
    /** Index into {@link #str} of the first char of the segment (inclusive). */
    private int start;
    /** Index into {@link #str} one past the last char of the segment (exclusive). */
    private int end;
    /** Whether the {@code startsWith} and {@link #getCommonPrefixLength} comparisons fold case. */
    private final boolean foldCase;

    /**
     * Creates a segment that initially spans the whole string.
     *
     * @param str the backing string; must not be null
     * @param foldCase whether code point comparisons should be performed with case folding
     */
    public StringSegment(String str, boolean foldCase) {
        this.str = str;
        this.start = 0;
        this.end = str.length();
        this.foldCase = foldCase;
    }

    /** Returns the current offset of the segment into the backing string. */
    public int getOffset() {
        return start;
    }

    /**
     * Sets the offset of the segment into the backing string. The end of the segment does not
     * move, so the length of the segment changes accordingly.
     *
     * @param start the new offset; must be between 0 and the current end of the segment
     */
    public void setOffset(int start) {
        assert start >= 0;
        assert start <= end;
        this.start = start;
    }

    /**
     * Equivalent to <code>setOffset(getOffset()+delta)</code>.
     *
     * <p>Number parsing note: This method is usually called by a Matcher to register that a char was
     * consumed. If the char is strong (it usually is, except for things like whitespace), follow this
     * with a call to ParsedNumber#setCharsConsumed(). For more information on strong chars, see that
     * method.
     *
     * @param delta the number of chars by which to move the offset; may be negative
     */
    public void adjustOffset(int delta) {
        assert start + delta >= 0;
        assert start + delta <= end;
        start += delta;
    }

    /**
     * Adjusts the offset by the width of the current lead code point, either 1 or 2 chars.
     * The segment must not be empty.
     */
    public void adjustOffsetByCodePoint() {
        start += Character.charCount(getCodePoint());
    }

    /**
     * Sets the length of the segment, measured in chars from the current offset.
     *
     * @param length the new length; the resulting end must not exceed the backing string
     */
    public void setLength(int length) {
        assert length >= 0;
        assert start + length <= str.length();
        end = start + length;
    }

    /** Extends the segment so that it ends at the end of the backing string. */
    public void resetLength() {
        end = str.length();
    }

    /** Returns the number of chars between the current offset and the end of the segment. */
    @Override
    public int length() {
        return end - start;
    }

    /** Returns the char at the given index, relative to the current offset. */
    @Override
    public char charAt(int index) {
        return str.charAt(index + start);
    }

    /** Returns a subsequence of the backing string; both indices are relative to the current offset. */
    @Override
    public CharSequence subSequence(int start, int end) {
        return str.subSequence(start + this.start, end + this.start);
    }

    /**
     * Returns the first code point in the string segment. The segment must not be empty.
     *
     * <p>A high surrogate is combined with the following char only if that char is a low
     * surrogate lying inside the segment; otherwise the lead char value is returned as-is.
     *
     * <p>Important: Most of the time, you should use {@link #startsWith}, which handles
     * case folding logic, instead of this method.
     */
    public int getCodePoint() {
        assert start < end;
        char lead = str.charAt(start);
        char trail;
        if (Character.isHighSurrogate(lead)
                && start + 1 < end
                && Character.isLowSurrogate(trail = str.charAt(start + 1))) {
            return Character.toCodePoint(lead, trail);
        }
        return lead;
    }

    /**
     * Returns the code point at the given index relative to the current offset.
     *
     * <p>Like {@link #getCodePoint}, a surrogate pair is only combined when the trail surrogate
     * lies inside the segment, so a char beyond the end of the segment never contributes to the
     * result.
     */
    public int codePointAt(int index) {
        // Going through the CharSequence view (rather than str.codePointAt) bounds the
        // trail-surrogate lookup by length(), i.e. by the end of the segment.
        return Character.codePointAt(this, index);
    }

    /**
     * Returns true if the first code point of this StringSegment equals the given code point.
     * Returns false if the segment is empty.
     *
     * <p>This method will perform case folding if case folding is enabled for the parser.
     */
    public boolean startsWith(int otherCp) {
        if (start >= end) {
            return false;
        }
        return codePointsEqual(getCodePoint(), otherCp, foldCase);
    }

    /**
     * Returns true if the first code point of this StringSegment is in the given UnicodeSet.
     * Returns false if the segment is empty.
     */
    public boolean startsWith(UnicodeSet uniset) {
        // TODO: Move UnicodeSet case-folding logic here.
        // TODO: Handle string matches here instead of separately.
        if (start >= end) {
            // No code point to test; getCodePoint() requires a non-empty segment.
            return false;
        }
        return uniset.contains(getCodePoint());
    }

    /**
     * Returns true if there is at least one code point of overlap between this StringSegment and the
     * given CharSequence. Null-safe.
     *
     * <p>This method will perform case folding if case folding was enabled in the constructor.
     */
    public boolean startsWith(CharSequence other) {
        if (other == null || other.length() == 0 || length() == 0) {
            return false;
        }
        int cp1 = Character.codePointAt(this, 0);
        int cp2 = Character.codePointAt(other, 0);
        return codePointsEqual(cp1, cp2, foldCase);
    }

    /**
     * Returns the length of the prefix shared by this StringSegment and the given CharSequence. For
     * example, if this string segment is "aab", and the char sequence is "aac", this method returns 2,
     * since the first 2 characters are the same.
     *
     * <p>This method only returns offsets along code point boundaries.
     *
     * <p>This method will perform case folding if case folding was enabled in the constructor.
     *
     * <p>IMPORTANT: The given CharSequence must not be empty! It is the caller's responsibility to check.
     */
    public int getCommonPrefixLength(CharSequence other) {
        return getPrefixLengthInternal(other, foldCase);
    }

    /**
     * Like {@link #getCommonPrefixLength}, but never performs case folding, even if case folding was
     * enabled in the constructor.
     */
    public int getCaseSensitivePrefixLength(CharSequence other) {
        return getPrefixLengthInternal(other, false);
    }

    /**
     * Walks both sequences code point by code point and returns the number of chars in the
     * longest common prefix.
     */
    private int getPrefixLengthInternal(CharSequence other, boolean foldCase) {
        assert other.length() != 0;
        // Neither sequence is modified inside the loop, so the bound is computed once.
        final int limit = Math.min(length(), other.length());
        int offset = 0;
        while (offset < limit) {
            int cp1 = Character.codePointAt(this, offset);
            int cp2 = Character.codePointAt(other, offset);
            if (!codePointsEqual(cp1, cp2, foldCase)) {
                break;
            }
            // NOTE(review): the same offset indexes both sequences, which assumes that code
            // points considered equal occupy the same number of chars -- confirm for case folding.
            offset += Character.charCount(cp1);
        }
        return offset;
    }

    /** Compares two code points, applying case folding to both only when {@code foldCase} is set. */
    private static boolean codePointsEqual(int cp1, int cp2, boolean foldCase) {
        if (cp1 == cp2) {
            return true;
        }
        if (!foldCase) {
            return false;
        }
        cp1 = UCharacter.foldCase(cp1, true);
        cp2 = UCharacter.foldCase(cp2, true);
        return cp1 == cp2;
    }

    /**
     * Returns true if this segment contains the same characters as the other CharSequence.
     *
     * <p>This method does not perform case folding; if you want case-insensitive equality, use
     * {@link #getCommonPrefixLength}.
     */
    public boolean contentEquals(CharSequence other) {
        return Utility.charSequenceEquals(this, other);
    }

    /**
     * Returns a string representation useful for debugging: the whole backing string, with the
     * current segment enclosed in square brackets.
     */
    @Override
    public String toString() {
        return str.substring(0, start) + "[" + str.substring(start, end) + "]" + str.substring(end);
    }

    /** Returns a String that is equivalent to the CharSequence representation. */
    public String asString() {
        return str.substring(start, end);
    }
}