// com.google.common.base.CharMatcher Maven / Gradle / Ivy
/*
* Copyright (C) 2008 The Guava Authors
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.common.base;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkPositionIndex;
import com.google.common.annotations.GwtCompatible;
import com.google.common.annotations.GwtIncompatible;
import com.google.common.annotations.VisibleForTesting;
import java.util.Arrays;
import java.util.BitSet;
/**
* Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
* for any {@link Object}. Also offers basic text processing methods based on this function.
* Implementations are strongly encouraged to be side-effect-free and immutable.
*
* Throughout the documentation of this class, the phrase "matching character" is used to mean
* "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}".
*
* <p><b>Warning:</b> This class deals only with {@code char} values, that is, BMP characters. It
* does not understand supplementary Unicode code points in the range {@code 0x10000} to {@code
* 0x10FFFF}, which includes the majority of assigned characters, including important CJK
* characters and emoji.
*
* <p>Supplementary characters are encoded into a {@code String} using surrogate pairs, and a
* {@code CharMatcher} treats these just as two separate characters. {@link #countIn} counts each
* supplementary character as 2 {@code char}s.
*
* <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for
* supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For
* basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner.
*
*
* <p>Example usages:
*
* <pre>
*   String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput);
*   if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
*
* <p>See the Guava User Guide article on {@code CharMatcher}.
*
* @author Kevin Bourrillion
* @since 1.0
*/
@GwtCompatible(emulated = true)
public abstract class CharMatcher implements Predicate {
/*
* N777777777NO
* N7777777777777N
* M777777777777777N
* $N877777777D77777M
* N M77777777ONND777M
* MN777777777NN D777
* N7ZN777777777NN ~M7778
* N777777777777MMNN88777N
* N777777777777MNZZZ7777O
* DZN7777O77777777777777
* N7OONND7777777D77777N
* 8$M++++?N???$77777$
* M7++++N+M77777777N
* N77O777777777777$ M
* DNNM$$$$777777N D
* N$N:=N$777N7777M NZ
* 77Z::::N777777777 ODZZZ
* 77N::::::N77777777M NNZZZ$
* $777:::::::77777777MN ZM8ZZZZZ
* 777M::::::Z7777777Z77 N++ZZZZNN
* 7777M:::::M7777777$777M $++IZZZZM
* M777$:::::N777777$M7777M +++++ZZZDN
* NN$::::::7777$$M777777N N+++ZZZZNZ
* N::::::N:7$O:77777777 N++++ZZZZN
* M::::::::::::N77777777+ +?+++++ZZZM
* 8::::::::::::D77777777M O+++++ZZ
* ::::::::::::M777777777N O+?D
* M:::::::::::M77777777778 77=
* D=::::::::::N7777777777N 777
* INN===::::::=77777777777N I777N
* ?777N========N7777777777787M N7777
* 77777$D======N77777777777N777N? N777777
* I77777$$$N7===M$$77777777$77777777$MMZ77777777N
* $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON
* M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND
* O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N
* 7 :N MNN$$$$M$ $$$777$8 8D8I
* NMM.:7O 777777778
* 7777777MN
* M NO .7:
* M : M
* 8
*/
// Constant matcher factory methods
/**
* Returns a matcher that matches any character; that is, a matcher whose {@code matches(c)} is
* {@code true} for every {@code char} value.
*
* <p>Every call returns the same shared instance, {@code Any.INSTANCE}.
*
* @return the matcher that matches every character
* @since 19.0 (since 1.0 as constant {@code ANY})
*/
public static CharMatcher any() {
return Any.INSTANCE;
}
/**
* Returns a matcher that matches no characters; that is, a matcher whose {@code matches(c)} is
* {@code false} for every {@code char} value.
*
* <p>Every call returns the same shared instance, {@code None.INSTANCE}.
*
* @return the matcher that matches no character
* @since 19.0 (since 1.0 as constant {@code NONE})
*/
public static CharMatcher none() {
return None.INSTANCE;
}
/**
* Determines whether a character is whitespace according to the latest Unicode standard. This is
* not the same definition used by other Java APIs; several differing definitions of "whitespace"
* exist.
*
* <p>All Unicode White_Space characters are on the BMP and thus supported by this API.
*
* <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to
* date.
*
* @return the shared {@code Whitespace.INSTANCE} matcher
* @since 19.0 (since 1.0 as constant {@code WHITESPACE})
*/
public static CharMatcher whitespace() {
return Whitespace.INSTANCE;
}
/**
* Determines whether a character is a breaking whitespace (that is, a whitespace which can be
* interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a
* discussion of that term.
*
* @return the shared {@code BreakingWhitespace.INSTANCE} matcher
* @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE})
*/
public static CharMatcher breakingWhitespace() {
return BreakingWhitespace.INSTANCE;
}
/**
* Determines whether a character is ASCII, meaning that its code point is less than 128.
*
* @return the shared {@code Ascii.INSTANCE} matcher
* @since 19.0 (since 1.0 as constant {@code ASCII})
*/
public static CharMatcher ascii() {
return Ascii.INSTANCE;
}
/**
* Determines whether a character is a BMP digit according to Unicode. If you only care to match
* ASCII digits, you can use {@code inRange('0', '9')}.
*
* @return the shared {@code Digit.INSTANCE} matcher
* @deprecated Many digits are supplementary characters; see the class documentation.
* @since 19.0 (since 1.0 as constant {@code DIGIT})
*/
@Deprecated
public static CharMatcher digit() {
return Digit.INSTANCE;
}
/**
* Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char)
* Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0',
* '9')}.
*
* @return the shared {@code JavaDigit.INSTANCE} matcher
* @deprecated Many digits are supplementary characters; see the class documentation.
* @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT})
*/
@Deprecated
public static CharMatcher javaDigit() {
return JavaDigit.INSTANCE;
}
/**
* Determines whether a character is a BMP letter according to {@linkplain
* Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin
* alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}.
*
* @return the shared {@code JavaLetter.INSTANCE} matcher
* @deprecated Most letters are supplementary characters; see the class documentation.
* @since 19.0 (since 1.0 as constant {@code JAVA_LETTER})
*/
@Deprecated
public static CharMatcher javaLetter() {
return JavaLetter.INSTANCE;
}
/**
* Determines whether a character is a BMP letter or digit according to {@linkplain
* Character#isLetterOrDigit(char) Java's definition}.
*
* @return the shared {@code JavaLetterOrDigit.INSTANCE} matcher
* @deprecated Most letters and digits are supplementary characters; see the class documentation.
* @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT}).
*/
@Deprecated
public static CharMatcher javaLetterOrDigit() {
return JavaLetterOrDigit.INSTANCE;
}
/**
* Determines whether a BMP character is upper case according to {@linkplain
* Character#isUpperCase(char) Java's definition}.
*
* @return the shared {@code JavaUpperCase.INSTANCE} matcher
* @deprecated Some uppercase characters are supplementary characters; see the class
* documentation.
* @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE})
*/
@Deprecated
public static CharMatcher javaUpperCase() {
return JavaUpperCase.INSTANCE;
}
/**
* Determines whether a BMP character is lower case according to {@linkplain
* Character#isLowerCase(char) Java's definition}.
*
* @return the shared {@code JavaLowerCase.INSTANCE} matcher
* @deprecated Some lowercase characters are supplementary characters; see the class
* documentation.
* @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE})
*/
@Deprecated
public static CharMatcher javaLowerCase() {
return JavaLowerCase.INSTANCE;
}
/**
* Determines whether a character is an ISO control character as specified by {@link
* Character#isISOControl(char)}.
*
* <p>All ISO control codes are on the BMP and thus supported by this API.
*
* @return the shared {@code JavaIsoControl.INSTANCE} matcher
* @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL})
*/
public static CharMatcher javaIsoControl() {
return JavaIsoControl.INSTANCE;
}
/**
* Determines whether a character is invisible; that is, if its Unicode category is any of
* SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
* PRIVATE_USE according to ICU4J.
*
* <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU).
*
* @return the shared {@code Invisible.INSTANCE} matcher
* @deprecated Most invisible characters are supplementary characters; see the class
* documentation.
* @since 19.0 (since 1.0 as constant {@code INVISIBLE})
*/
@Deprecated
public static CharMatcher invisible() {
return Invisible.INSTANCE;
}
/**
* Determines whether a character is single-width (not double-width). When in doubt, this matcher
* errs on the side of returning {@code false} (that is, it tends to assume a character is
* double-width).
*
* <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to
* date.
*
* <p>See also UAX #11 East Asian Width.
*
* @return the shared {@code SingleWidth.INSTANCE} matcher
* @deprecated Many such characters are supplementary characters; see the class documentation.
* @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH})
*/
@Deprecated
public static CharMatcher singleWidth() {
return SingleWidth.INSTANCE;
}
// Static factories
/**
* Returns a {@code char} matcher that matches only one specified BMP character.
*
* @param match the single character the returned matcher accepts
* @return a new {@code Is} matcher for {@code match}
*/
public static CharMatcher is(final char match) {
return new Is(match);
}
/**
* Returns a {@code char} matcher that matches any character except the BMP character specified.
*
* <p>To negate another {@code CharMatcher}, use {@link #negate()}.
*
* @param match the single character the returned matcher rejects
* @return a new {@code IsNot} matcher for {@code match}
*/
public static CharMatcher isNot(final char match) {
return new IsNot(match);
}
/**
 * Returns a {@code char} matcher that matches any BMP character present in the given character
 * sequence. Returns a bogus matcher if the sequence contains supplementary characters.
 *
 * <p>Sequences of length zero, one and two are mapped onto {@link #none()}, {@link #is} and the
 * two-character matcher respectively; longer sequences get a dedicated {@code AnyOf} matcher.
 *
 * @param sequence the characters that the returned matcher must accept
 * @return a matcher that matches exactly the characters contained in {@code sequence}
 */
public static CharMatcher anyOf(final CharSequence sequence) {
  final int size = sequence.length();
  if (size == 0) {
    return none();
  }
  if (size == 1) {
    return is(sequence.charAt(0));
  }
  if (size == 2) {
    return isEither(sequence.charAt(0), sequence.charAt(1));
  }
  // TODO(lowasser): is it potentially worth just going ahead and building a precomputed
  // matcher?
  return new AnyOf(sequence);
}
/**
 * Returns a {@code char} matcher that matches any BMP character not present in the given
 * character sequence. The result is the negation of {@link #anyOf anyOf(sequence)}, so, like
 * that method, it returns a bogus matcher if the sequence contains supplementary characters.
 *
 * @param sequence the characters that the returned matcher must reject
 * @return a matcher that matches exactly the characters not contained in {@code sequence}
 */
public static CharMatcher noneOf(CharSequence sequence) {
  CharMatcher listed = anyOf(sequence);
  return listed.negate();
}
/**
* Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints
* are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
* CharMatcher.inRange('a', 'z')}.
*
* @param startInclusive the lowest character the returned matcher accepts
* @param endInclusive the highest character the returned matcher accepts
* @return a new {@code InRange} matcher covering both endpoints
* @throws IllegalArgumentException if {@code endInclusive < startInclusive}
*/
public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
return new InRange(startInclusive, endInclusive);
}
/**
* Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
* which operates on primitive {@code char} instances instead.
*/
public static CharMatcher forPredicate(final Predicate super Character> predicate) {
return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate);
}
// Constructors
/**
* Constructor for use by subclasses. It performs no work of its own. When subclassing, you may
* want to override {@code toString()} to provide a useful description.
*/
protected CharMatcher() {}
// Abstract methods
/**
* Determines a true or false value for the given character. Every text processing method of this
* class is ultimately defined in terms of this single abstract operation.
*
* @param c the {@code char} value to test
* @return {@code true} if {@code c} is a matching character
*/
public abstract boolean matches(char c);
// Non-static factories
/**
* Returns a matcher that matches any character not matched by this matcher.
*
* @return a new {@code Negated} matcher wrapping this one
*/
// @Override under Java 8 but not under Java 7
@Override
public CharMatcher negate() {
return new Negated(this);
}
/**
* Returns a matcher that matches any character matched by both this matcher and {@code other}.
*
* @param other the matcher to combine with this one
* @return a new {@code And} matcher built from this matcher and {@code other}, in that order
*/
public CharMatcher and(CharMatcher other) {
return new And(this, other);
}
/**
* Returns a matcher that matches any character matched by either this matcher or {@code other}.
*
* @param other the matcher to combine with this one
* @return a new {@code Or} matcher built from this matcher and {@code other}, in that order
*/
public CharMatcher or(CharMatcher other) {
return new Or(this, other);
}
/**
* Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
* query than the original; your mileage may vary. Precomputation takes time and is likely to be
* worthwhile only if the precomputed matcher is queried many thousands of times.
*
* <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
* precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
* worthwhile tradeoff in a browser.
*
* <p>The work is delegated to {@code Platform.precomputeCharMatcher}, which is what allows the
* behavior to differ between platforms.
*
* @return a matcher equivalent to this one
*/
public CharMatcher precomputed() {
return Platform.precomputeCharMatcher(this);
}
/**
* The number of distinct {@code char} values, i.e. the size of the range from {@link
* Character#MIN_VALUE} to {@link Character#MAX_VALUE} inclusive (65536).
*/
private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1;
/**
 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
 * on {@link Platform} so that we can have different behavior in GWT.
 *
 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the
 * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables
 * for matchers that only match a few characters, and so on. In the worst-case scenario, it
 * constructs an eight-kilobyte bit array and queries that. In many situations this produces a
 * matcher which is faster to query than the original.
 *
 * @return a precomputed matcher whose {@code toString()} equals this matcher's
 */
@GwtIncompatible // SmallCharMatcher
CharMatcher precomputedInternal() {
  final BitSet matched = new BitSet();
  setBits(matched);
  int matchCount = matched.cardinality();

  if (matchCount * 2 > DISTINCT_CHARS) {
    // More than half of all chars match: precompute the complement and wrap it in a negation.
    // TODO(lowasser): is it worth it to worry about the last character of large matchers?
    matched.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
    int complementCount = DISTINCT_CHARS - matchCount;

    final String negateSuffix = ".negate()";
    final String ownDescription = toString();
    // Describe the complement by toggling a trailing ".negate()" on our own description.
    String complementDescription;
    if (ownDescription.endsWith(negateSuffix)) {
      complementDescription =
          ownDescription.substring(0, ownDescription.length() - negateSuffix.length());
    } else {
      complementDescription = ownDescription + negateSuffix;
    }

    CharMatcher complement =
        precomputedPositive(complementCount, matched, complementDescription);
    return new NegatedFastMatcher(complement) {
      @Override
      public String toString() {
        return ownDescription;
      }
    };
  }

  return precomputedPositive(matchCount, matched, toString());
}
/**
 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper;
 * it builds a matcher for exactly the bits set in {@code table}.
 *
 * @param totalCharacters the number of set bits in {@code table}
 * @param table the matching {@code char} values, one bit per value
 * @param description the description handed to the table-backed matchers
 * @return {@link #none()} for zero characters, a one- or two-character matcher for one or two,
 *     and otherwise a {@code SmallCharMatcher} or {@code BitSetMatcher} as chosen by {@code
 *     isSmall}
 */
@GwtIncompatible // SmallCharMatcher
private static CharMatcher precomputedPositive(
    int totalCharacters, BitSet table, String description) {
  if (totalCharacters == 0) {
    return none();
  }
  if (totalCharacters == 1) {
    char only = (char) table.nextSetBit(0);
    return is(only);
  }
  if (totalCharacters == 2) {
    char lower = (char) table.nextSetBit(0);
    char upper = (char) table.nextSetBit(lower + 1);
    return isEither(lower, upper);
  }
  if (isSmall(totalCharacters, table.length())) {
    return SmallCharMatcher.from(table, description);
  }
  return new BitSetMatcher(table, description);
}
/**
 * Decides whether a {@code SmallCharMatcher} should be preferred over a {@code BitSetMatcher}.
 *
 * @param totalCharacters the number of matching characters
 * @param tableLength the logical length of the bit table holding them
 * @return {@code true} only if the character count does not exceed {@code
 *     SmallCharMatcher.MAX_SIZE} and the table is longer than {@code totalCharacters * 4 *
 *     Character.SIZE}
 */
@GwtIncompatible // SmallCharMatcher
private static boolean isSmall(int totalCharacters, int tableLength) {
  if (totalCharacters > SmallCharMatcher.MAX_SIZE) {
    return false;
  }
  // err on the side of BitSetMatcher
  int sparseThreshold = totalCharacters * 4 * Character.SIZE;
  return tableLength > sparseThreshold;
}
/**
 * Sets bits in {@code table} matched by this matcher. Every {@code char} value is tested, from
 * {@link Character#MAX_VALUE} down to {@link Character#MIN_VALUE}.
 *
 * @param table the bit set to update; bits that are already set are left untouched
 */
@GwtIncompatible // used only from other GwtIncompatible code
void setBits(BitSet table) {
  int codeUnit = Character.MAX_VALUE;
  while (codeUnit >= Character.MIN_VALUE) {
    if (matches((char) codeUnit)) {
      table.set(codeUnit);
    }
    codeUnit--;
  }
}
// Text processing routines
/**
 * Returns {@code true} if a character sequence contains at least one matching BMP character.
 * Equivalent to {@code !matchesNoneOf(sequence)}.
 *
 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
 * character, until this returns {@code true} or the end is reached.
 *
 * @param sequence the character sequence to examine, possibly empty
 * @return {@code true} if this matcher matches at least one character in the sequence
 * @since 8.0
 */
public boolean matchesAnyOf(CharSequence sequence) {
  boolean nothingMatches = matchesNoneOf(sequence);
  return !nothingMatches;
}
/**
 * Returns {@code true} if a character sequence contains only matching BMP characters.
 *
 * <p>The default implementation iterates over the sequence from its last character to its first,
 * invoking {@link #matches} for each character, until this returns {@code false} or the start is
 * reached.
 *
 * @param sequence the character sequence to examine, possibly empty
 * @return {@code true} if this matcher matches every character in the sequence, including when
 *     the sequence is empty
 */
public boolean matchesAllOf(CharSequence sequence) {
  int index = sequence.length();
  while (--index >= 0) {
    if (!matches(sequence.charAt(index))) {
      return false;
    }
  }
  return true;
}
/**
 * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to
 * {@code !matchesAnyOf(sequence)}.
 *
 * <p>The default implementation asks {@link #indexIn(CharSequence)} for the first match, which
 * invokes {@link #matches} for each character until it returns {@code true} or the end is
 * reached.
 *
 * @param sequence the character sequence to examine, possibly empty
 * @return {@code true} if this matcher matches no characters in the sequence, including when the
 *     sequence is empty
 */
public boolean matchesNoneOf(CharSequence sequence) {
  int firstMatch = indexIn(sequence);
  return firstMatch == -1;
}
/**
* Returns the index of the first matching BMP character in a character sequence, or {@code -1} if
* no matching character is present.
*
* <p>The default implementation delegates to {@link #indexIn(CharSequence, int)} with a start
* index of {@code 0}, which iterates over the sequence in forward order calling {@link #matches}
* for each character.
*
* @param sequence the character sequence to examine from the beginning
* @return an index, or {@code -1} if no character matches
*/
public int indexIn(CharSequence sequence) {
return indexIn(sequence, 0);
}
/**
 * Returns the index of the first matching BMP character in a character sequence, starting from a
 * given position, or {@code -1} if no character matches after that position.
 *
 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
 * start}, calling {@link #matches} for each character.
 *
 * @param sequence the character sequence to examine
 * @param start the first index to examine; must be nonnegative and no greater than {@code
 *     sequence.length()}
 * @return the index of the first matching character, guaranteed to be no less than {@code start},
 *     or {@code -1} if no character matches
 * @throws IndexOutOfBoundsException if start is negative or greater than {@code
 *     sequence.length()}
 */
public int indexIn(CharSequence sequence, int start) {
  final int end = sequence.length();
  checkPositionIndex(start, end);
  int index = start;
  while (index < end) {
    if (matches(sequence.charAt(index))) {
      return index;
    }
    index++;
  }
  return -1;
}
/**
 * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if
 * no matching character is present.
 *
 * <p>The default implementation iterates over the sequence in reverse order calling {@link
 * #matches} for each character.
 *
 * @param sequence the character sequence to examine from the end
 * @return an index, or {@code -1} if no character matches
 */
public int lastIndexIn(CharSequence sequence) {
  int index = sequence.length();
  while (index > 0) {
    index--;
    if (matches(sequence.charAt(index))) {
      return index;
    }
  }
  return -1;
}
/**
 * Returns the number of matching {@code char}s found in a character sequence.
 *
 * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}().
 *
 * @param sequence the character sequence to count matching characters in
 * @return how many positions of {@code sequence} hold a matching {@code char}
 */
public int countIn(CharSequence sequence) {
  int matched = 0;
  int index = 0;
  while (index < sequence.length()) {
    matched += matches(sequence.charAt(index)) ? 1 : 0;
    index++;
  }
  return matched;
}
/**
* Returns a string containing all non-matching characters of a character sequence, in order. For
* example:
*
* <pre>{@code
* CharMatcher.is('a').removeFrom("bazaar")
* }</pre>
*
* ... returns {@code "bzr"}.
*
* <p>If no character matches, the result is {@code sequence.toString()} itself; otherwise the
* retained characters are compacted in place inside a copy of the character array.
*
* @param sequence the character sequence to remove matching characters from
* @return the characters of {@code sequence} that this matcher does not match, in their original
* order
*/
public String removeFrom(CharSequence sequence) {
String string = sequence.toString();
int pos = indexIn(string);
if (pos == -1) {
// Nothing matches, so there is nothing to remove.
return string;
}
char[] chars = string.toCharArray();
// Number of matching characters seen so far; chars[pos - spread] is the next write position.
int spread = 1;
// This unusual loop comes from extensive benchmarking
OUT:
while (true) {
// Step past the matching character at pos.
pos++;
while (true) {
if (pos == chars.length) {
break OUT;
}
if (matches(chars[pos])) {
// Another character to drop: leave the copy loop so that spread gets incremented.
break;
}
// Shift the retained character left over the gap opened by the removed ones.
chars[pos - spread] = chars[pos];
pos++;
}
spread++;
}
// Here pos == chars.length, so pos - spread is the number of retained characters.
return new String(chars, 0, pos - spread);
}
/**
 * Returns a string containing all matching BMP characters of a character sequence, in order. For
 * example:
 *
 * <pre>{@code
 * CharMatcher.is('a').retainFrom("bazaar")
 * }</pre>
 *
 * ... returns {@code "aaa"}.
 *
 * <p>Implemented as {@link #removeFrom} on the {@linkplain #negate() negation} of this matcher.
 *
 * @param sequence the character sequence to keep matching characters from
 * @return the matching characters of {@code sequence}, in their original order
 */
public String retainFrom(CharSequence sequence) {
  CharMatcher nonMatching = negate();
  return nonMatching.removeFrom(sequence);
}
/**
 * Returns a string copy of the input character sequence, with each matching BMP character
 * replaced by a given replacement character. For example:
 *
 * <pre>{@code
 * CharMatcher.is('a').replaceFrom("radar", 'o')
 * }</pre>
 *
 * ... returns {@code "rodor"}.
 *
 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
 * character.
 *
 * @param sequence the character sequence to replace matching characters in
 * @param replacement the character to append to the result string in place of each matching
 *     character in {@code sequence}
 * @return the new string
 */
public String replaceFrom(CharSequence sequence, char replacement) {
  String text = sequence.toString();
  int firstMatch = indexIn(text);
  if (firstMatch == -1) {
    // No match at all: the input's own string form is the answer.
    return text;
  }
  char[] buffer = text.toCharArray();
  buffer[firstMatch] = replacement;
  int index = firstMatch + 1;
  while (index < buffer.length) {
    if (matches(buffer[index])) {
      buffer[index] = replacement;
    }
    index++;
  }
  return new String(buffer);
}
/**
 * Returns a string copy of the input character sequence, with each matching BMP character
 * replaced by a given replacement sequence. For example:
 *
 * <pre>{@code
 * CharMatcher.is('a').replaceFrom("yaha", "oo")
 * }</pre>
 *
 * ... returns {@code "yoohoo"}.
 *
 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
 * off calling {@link #replaceFrom(CharSequence, char)} directly.
 *
 * @param sequence the character sequence to replace matching characters in
 * @param replacement the characters to append to the result string in place of each matching
 *     character in {@code sequence}
 * @return the new string
 */
public String replaceFrom(CharSequence sequence, CharSequence replacement) {
  // Empty and single-character replacements have cheaper dedicated implementations.
  switch (replacement.length()) {
    case 0:
      return removeFrom(sequence);
    case 1:
      return replaceFrom(sequence, replacement.charAt(0));
    default:
      break;
  }

  String text = sequence.toString();
  int firstMatch = indexIn(text);
  if (firstMatch == -1) {
    return text;
  }

  int textLength = text.length();
  StringBuilder out = new StringBuilder((textLength * 3 / 2) + 16);
  // Index of the first character of text that has not been copied or replaced yet.
  int copiedUpTo = 0;
  for (int match = firstMatch; match != -1; match = indexIn(text, copiedUpTo)) {
    out.append(text, copiedUpTo, match);
    out.append(replacement);
    copiedUpTo = match + 1;
  }
  out.append(text, copiedUpTo, textLength);
  return out.toString();
}
/**
 * Returns a substring of the input character sequence that omits all matching BMP characters from
 * the beginning and from the end of the string. For example:
 *
 * <pre>{@code
 * CharMatcher.anyOf("ab").trimFrom("abacatbab")
 * }</pre>
 *
 * ... returns {@code "cat"}.
 *
 * <p>Note that:
 *
 * <pre>{@code
 * CharMatcher.inRange('\0', ' ').trimFrom(str)
 * }</pre>
 *
 * ... is equivalent to {@link String#trim()}.
 *
 * @param sequence the character sequence to trim
 * @return the part of {@code sequence} between its first and last non-matching characters, or
 *     the empty string if every character matches
 */
public String trimFrom(CharSequence sequence) {
  final int length = sequence.length();

  // Skip the leading run of matching characters.
  int begin = 0;
  while (begin < length && matches(sequence.charAt(begin))) {
    begin++;
  }

  // Skip the trailing run, never moving back past begin.
  int endInclusive = length - 1;
  while (endInclusive > begin && matches(sequence.charAt(endInclusive))) {
    endInclusive--;
  }

  return sequence.subSequence(begin, endInclusive + 1).toString();
}
/**
 * Returns a substring of the input character sequence that omits all matching BMP characters from
 * the beginning of the string. For example:
 *
 * <pre>{@code
 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")
 * }</pre>
 *
 * ... returns {@code "catbab"}.
 *
 * @param sequence the character sequence to trim
 * @return the part of {@code sequence} starting at its first non-matching character, or the
 *     empty string if every character matches
 */
public String trimLeadingFrom(CharSequence sequence) {
  final int length = sequence.length();
  int begin = 0;
  while (begin < length && matches(sequence.charAt(begin))) {
    begin++;
  }
  if (begin == length) {
    // Every character matched (or the input was empty).
    return "";
  }
  return sequence.subSequence(begin, length).toString();
}
/**
 * Returns a substring of the input character sequence that omits all matching BMP characters from
 * the end of the string. For example:
 *
 * <pre>{@code
 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")
 * }</pre>
 *
 * ... returns {@code "abacat"}.
 *
 * @param sequence the character sequence to trim
 * @return the part of {@code sequence} ending at its last non-matching character, or the empty
 *     string if every character matches
 */
public String trimTrailingFrom(CharSequence sequence) {
  int endInclusive = sequence.length() - 1;
  while (endInclusive >= 0 && matches(sequence.charAt(endInclusive))) {
    endInclusive--;
  }
  if (endInclusive < 0) {
    // Every character matched (or the input was empty).
    return "";
  }
  return sequence.subSequence(0, endInclusive + 1).toString();
}
/**
 * Returns a string copy of the input character sequence, with each group of consecutive matching
 * BMP characters replaced by a single replacement character. For example:
 *
 * <pre>{@code
 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')
 * }</pre>
 *
 * ... returns {@code "b-p-r"}.
 *
 * <p>The default implementation scans for the first group that actually needs rewriting, calling
 * {@link #matches(char)} for each character, and only then allocates a builder and hands the
 * remainder of the sequence to {@code finishCollapseFrom}.
 *
 * @param sequence the character sequence to replace matching groups of characters in
 * @param replacement the character to append to the result string in place of each group of
 *     matching characters in {@code sequence}
 * @return the new string
 */
public String collapseFrom(CharSequence sequence, char replacement) {
  // This implementation avoids unnecessary allocation.
  final int length = sequence.length();
  int index = 0;
  while (index < length) {
    char current = sequence.charAt(index);
    if (!matches(current)) {
      index++;
      continue;
    }
    // A lone matching character that already equals the replacement collapses to itself.
    boolean alreadyCollapsed =
        current == replacement
            && (index == length - 1 || !matches(sequence.charAt(index + 1)));
    if (!alreadyCollapsed) {
      StringBuilder builder = new StringBuilder(length);
      builder.append(sequence, 0, index).append(replacement);
      return finishCollapseFrom(sequence, index + 1, length, replacement, builder, true);
    }
    // a no-op replacement: also step over the next character, which is known not to match
    index += 2;
  }
  // no replacement needed
  return sequence.toString();
}
/**
 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
 * groups of matching BMP characters at the start or end of the sequence are removed without
 * replacement.
 *
 * @param sequence the character sequence to trim and collapse
 * @param replacement the character that stands in for each interior group of matching characters
 * @return the new string
 */
public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
  // This implementation avoids unnecessary allocation.
  final int length = sequence.length();

  int begin = 0;
  while (begin < length && matches(sequence.charAt(begin))) {
    begin++;
  }
  int endInclusive = length - 1;
  while (endInclusive > begin && matches(sequence.charAt(endInclusive))) {
    endInclusive--;
  }

  boolean nothingTrimmed = begin == 0 && endInclusive == length - 1;
  if (nothingTrimmed) {
    return collapseFrom(sequence, replacement);
  }
  StringBuilder builder = new StringBuilder(endInclusive + 1 - begin);
  return finishCollapseFrom(sequence, begin, endInclusive + 1, replacement, builder, false);
}
/**
 * Appends the collapsed form of {@code sequence[start, end)} to {@code builder} and returns the
 * builder's contents.
 *
 * @param sequence the character sequence being collapsed
 * @param start the index of the first character to process
 * @param end the index after the last character to process
 * @param replacement the character appended once per group of matching characters
 * @param builder the builder holding the output produced so far
 * @param inMatchingGroup whether the character just before {@code start} belongs to a group
 *     whose replacement has already been appended
 * @return {@code builder.toString()} after processing the range
 */
private String finishCollapseFrom(
    CharSequence sequence,
    int start,
    int end,
    char replacement,
    StringBuilder builder,
    boolean inMatchingGroup) {
  boolean withinGroup = inMatchingGroup;
  for (int index = start; index < end; index++) {
    char current = sequence.charAt(index);
    if (!matches(current)) {
      builder.append(current);
      withinGroup = false;
      continue;
    }
    // Only the first character of a matching group emits the replacement.
    if (!withinGroup) {
      builder.append(replacement);
      withinGroup = true;
    }
  }
  return builder.toString();
}
/**
 * Unboxes the given character and delegates to {@link #matches}.
 *
 * @param character the character to test
 * @return the result of {@code matches} for the unboxed value
 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches}
 *     instead.
 */
@Deprecated
@Override
public boolean apply(Character character) {
  char unboxed = character.charValue();
  return matches(unboxed);
}
/**
* Returns a string representation of this {@code CharMatcher}, such as {@code
* CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
*
* <p>This base implementation simply returns {@code super.toString()}; a descriptive string is
* produced only by subclasses that override this method.
*/
@Override
public String toString() {
return super.toString();
}
/**
 * Returns the Java Unicode escape sequence for the given {@code char}: a backslash, the letter
 * {@code u}, and the four uppercase hexadecimal digits representing the 16-bit code unit.
 *
 * @param c the code unit to render
 * @return a six-character escape string
 */
private static String showCharacter(char c) {
  final String hexDigits = "0123456789ABCDEF";
  char[] escaped = new char[6];
  escaped[0] = '\\';
  escaped[1] = 'u';
  int remaining = c;
  // Fill the four digit slots from least to most significant nibble.
  for (int slot = 5; slot >= 2; slot--) {
    escaped[slot] = hexDigits.charAt(remaining & 0xF);
    remaining >>= 4;
  }
  return String.copyValueOf(escaped);
}
// Fast matchers
/**
* A matcher for which precomputation will not yield any significant benefit. {@link
* #precomputed()} therefore returns the matcher itself, and negation produces a {@code
* NegatedFastMatcher}.
*/
abstract static class FastMatcher extends CharMatcher {
/** Returns {@code this}; no precomputation is performed. */
@Override
public final CharMatcher precomputed() {
return this;
}
/** Returns a new {@code NegatedFastMatcher} wrapping this matcher. */
@Override
public CharMatcher negate() {
return new NegatedFastMatcher(this);
}
}
/**
 * {@link FastMatcher} which overrides {@code toString()} with a custom name supplied at
 * construction time.
 */
abstract static class NamedFastMatcher extends FastMatcher {

  /** The text returned by {@link #toString()}; never {@code null}. */
  private final String description;

  /**
   * @param description the string representation of this matcher
   * @throws NullPointerException if {@code description} is {@code null}
   */
  NamedFastMatcher(String description) {
    checkNotNull(description);
    this.description = description;
  }

  /** Returns the description given to the constructor. */
  @Override
  public final String toString() {
    return description;
  }
}
/**
* Negation of a {@link FastMatcher}. Behaves like {@code Negated}, except that {@link
* #precomputed()} returns the matcher itself.
*/
static class NegatedFastMatcher extends Negated {
/** Wraps {@code original}; all handling of it is left to the {@code Negated} superclass. */
NegatedFastMatcher(CharMatcher original) {
super(original);
}
/** Returns {@code this}; no precomputation is performed. */
@Override
public final CharMatcher precomputed() {
return this;
}
}
/** Fast matcher using a {@link BitSet} table of matching characters. */
@GwtIncompatible // used only from other GwtIncompatible code
private static final class BitSetMatcher extends NamedFastMatcher {

  /** One bit per {@code char} value; a set bit means the character matches. */
  private final BitSet table;

  /**
   * @param table the matching characters; stored as is, or as a clone when its allocated size
   *     exceeds its logical length by more than {@link Long#SIZE} bits
   * @param description the string representation of this matcher
   */
  private BitSetMatcher(BitSet table, String description) {
    super(description);
    boolean oversized = table.length() + Long.SIZE < table.size();
    // If only we could actually call BitSet.trimToSize() ourselves...
    this.table = oversized ? (BitSet) table.clone() : table;
  }

  /** Looks the character up in the bit table. */
  @Override
  public boolean matches(char c) {
    return table.get(c);
  }

  /** Merges the whole table into {@code bitSet} in a single bulk operation. */
  @Override
  void setBits(BitSet bitSet) {
    bitSet.or(table);
  }
}
// Static constant implementation classes
/**
 * Implementation of {@link #any()}. Because every character matches, each text processing method
 * is answered directly from the length of the input instead of by scanning it.
 */
private static final class Any extends NamedFastMatcher {

  static final Any INSTANCE = new Any();

  private Any() {
    super("CharMatcher.any()");
  }

  /** Always {@code true}. */
  @Override
  public boolean matches(char c) {
    return true;
  }

  /** The first character, if there is one, is the first match. */
  @Override
  public int indexIn(CharSequence sequence) {
    if (sequence.length() == 0) {
      return -1;
    }
    return 0;
  }

  /** The character at {@code start}, if there is one, is the first match. */
  @Override
  public int indexIn(CharSequence sequence, int start) {
    int size = sequence.length();
    checkPositionIndex(start, size);
    if (start == size) {
      return -1;
    }
    return start;
  }

  /** The last character is the last match; yields {@code -1} for an empty sequence. */
  @Override
  public int lastIndexIn(CharSequence sequence) {
    int size = sequence.length();
    return size - 1;
  }

  /** Always {@code true} for a non-null sequence. */
  @Override
  public boolean matchesAllOf(CharSequence sequence) {
    checkNotNull(sequence);
    return true;
  }

  /** Only an empty sequence contains no matching character. */
  @Override
  public boolean matchesNoneOf(CharSequence sequence) {
    int size = sequence.length();
    return size == 0;
  }

  /** Removing every character always leaves the empty string. */
  @Override
  public String removeFrom(CharSequence sequence) {
    checkNotNull(sequence);
    return "";
  }

  /** Produces {@code sequence.length()} copies of {@code replacement}. */
  @Override
  public String replaceFrom(CharSequence sequence, char replacement) {
    char[] filled = new char[sequence.length()];
    Arrays.fill(filled, replacement);
    return new String(filled);
  }

  /** Produces {@code replacement} repeated once per character of {@code sequence}. */
  @Override
  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
    StringBuilder repeated = new StringBuilder(sequence.length() * replacement.length());
    int appended = 0;
    while (appended < sequence.length()) {
      repeated.append(replacement);
      appended++;
    }
    return repeated.toString();
  }

  /** A non-empty sequence is one single group and collapses to one replacement character. */
  @Override
  public String collapseFrom(CharSequence sequence, char replacement) {
    if (sequence.length() == 0) {
      return "";
    }
    return String.valueOf(replacement);
  }

  /** Trimming every character always leaves the empty string. */
  @Override
  public String trimFrom(CharSequence sequence) {
    checkNotNull(sequence);
    return "";
  }

  /** Every character counts. */
  @Override
  public int countIn(CharSequence sequence) {
    return sequence.length();
  }

  /** Intersecting with "any" yields the other matcher unchanged. */
  @Override
  public CharMatcher and(CharMatcher other) {
    checkNotNull(other);
    return other;
  }

  /** The union with "any" is still "any". */
  @Override
  public CharMatcher or(CharMatcher other) {
    checkNotNull(other);
    return this;
  }

  /** The negation of "any" is {@link #none()}. */
  @Override
  public CharMatcher negate() {
    return none();
  }
}
/**
 * Implementation of {@link #none()}. Because no character matches, each text processing method
 * is answered without scanning the input; arguments are still dereferenced or null-checked.
 */
private static final class None extends NamedFastMatcher {

  static final None INSTANCE = new None();

  private None() {
    super("CharMatcher.none()");
  }

  /** Always {@code false}. */
  @Override
  public boolean matches(char c) {
    return false;
  }

  /** There is never a match, so the answer is always {@code -1}. */
  @Override
  public int indexIn(CharSequence sequence) {
    checkNotNull(sequence);
    return -1;
  }

  /** Validates {@code start} against the sequence length, then reports no match. */
  @Override
  public int indexIn(CharSequence sequence, int start) {
    checkPositionIndex(start, sequence.length());
    return -1;
  }

  /** There is never a match, so the answer is always {@code -1}. */
  @Override
  public int lastIndexIn(CharSequence sequence) {
    checkNotNull(sequence);
    return -1;
  }

  /** Only the empty sequence consists solely of matching characters. */
  @Override
  public boolean matchesAllOf(CharSequence sequence) {
    int size = sequence.length();
    return size == 0;
  }

  /** Always {@code true} for a non-null sequence. */
  @Override
  public boolean matchesNoneOf(CharSequence sequence) {
    checkNotNull(sequence);
    return true;
  }

  /** Nothing is removed; the input's string form is returned. */
  @Override
  public String removeFrom(CharSequence sequence) {
    String unchanged = sequence.toString();
    return unchanged;
  }

  /** Nothing is replaced; the input's string form is returned. */
  @Override
  public String replaceFrom(CharSequence sequence, char replacement) {
    String unchanged = sequence.toString();
    return unchanged;
  }

  /** Nothing is replaced; the replacement is only null-checked. */
  @Override
  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
    checkNotNull(replacement);
    String unchanged = sequence.toString();
    return unchanged;
  }

  /** Nothing is collapsed; the input's string form is returned. */
  @Override
  public String collapseFrom(CharSequence sequence, char replacement) {
    String unchanged = sequence.toString();
    return unchanged;
  }

  /** Nothing is trimmed; the input's string form is returned. */
  @Override
  public String trimFrom(CharSequence sequence) {
    String unchanged = sequence.toString();
    return unchanged;
  }

  /** Nothing is trimmed; the input's string form is returned. */
  @Override
  public String trimLeadingFrom(CharSequence sequence) {
    String unchanged = sequence.toString();
    return unchanged;
  }

  /** Nothing is trimmed; the input's string form is returned. */
  @Override
  public String trimTrailingFrom(CharSequence sequence) {
    String unchanged = sequence.toString();
    return unchanged;
  }

  /** No character ever counts. */
  @Override
  public int countIn(CharSequence sequence) {
    checkNotNull(sequence);
    return 0;
  }

  /** The intersection with "none" is still "none". */
  @Override
  public CharMatcher and(CharMatcher other) {
    checkNotNull(other);
    return this;
  }

  /** The union with "none" yields the other matcher unchanged. */
  @Override
  public CharMatcher or(CharMatcher other) {
    checkNotNull(other);
    return other;
  }

  /** The negation of "none" is {@link #any()}. */
  @Override
  public CharMatcher negate() {
    return any();
  }
}
/** Implementation of {@link #whitespace()}. */
@VisibleForTesting
static final class Whitespace extends NamedFastMatcher {
  // TABLE is a precomputed 32-slot hash set holding the 25 whitespace characters; the slots that
  // are not needed repeat U+3000. Multiplying a char by MULTIPLIER and keeping the top bits of
  // the int product (via SHIFT) selects the slot, and the multiplier was picked so that the 25
  // characters land in distinct slots. This trades readability for speed on purpose; the trick
  // is rarely worth copying elsewhere.
  static final String TABLE =
      "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000"
          + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680"
          + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009"
          + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000";

  static final int MULTIPLIER = 1682554634;

  // Number of low bits to discard so that the remaining high bits index into TABLE.
  static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1);

  static final Whitespace INSTANCE = new Whitespace();

  Whitespace() {
    super("CharMatcher.whitespace()");
  }

  /** Hashes {@code c} to its slot and reports whether that slot holds exactly {@code c}. */
  @Override
  public boolean matches(char c) {
    int slot = (MULTIPLIER * c) >>> SHIFT;
    return TABLE.charAt(slot) == c;
  }

  /** Sets the bit of every character stored in {@link #TABLE}. */
  @GwtIncompatible // used only from other GwtIncompatible code
  @Override
  void setBits(BitSet table) {
    for (char member : TABLE.toCharArray()) {
      table.set(member);
    }
  }
}
/** Implementation of {@link #breakingWhitespace()}. */
private static final class BreakingWhitespace extends CharMatcher {
  static final CharMatcher INSTANCE = new BreakingWhitespace();

  /**
   * Matches a fixed list of whitespace characters plus the block U+2000 through U+200A, with
   * U+2007 carved out of that block.
   */
  @Override
  public boolean matches(char c) {
    // U+2007 sits inside the U+2000..U+200A block but is explicitly excluded.
    if (c == '\u2007') {
      return false;
    }
    if ('\u2000' <= c && c <= '\u200a') {
      return true;
    }
    // Everything else matches only if it is one of the individually listed characters.
    switch (c) {
      case '\t':
      case '\n':
      case '\013':
      case '\f':
      case '\r':
      case ' ':
      case '\u0085':
      case '\u1680':
      case '\u2028':
      case '\u2029':
      case '\u205f':
      case '\u3000':
        return true;
      default:
        return false;
    }
  }

  @Override
  public String toString() {
    return "CharMatcher.breakingWhitespace()";
  }
}
/** Implementation of {@link #ascii()}. */
private static final class Ascii extends NamedFastMatcher {
  static final Ascii INSTANCE = new Ascii();

  Ascii() {
    super("CharMatcher.ascii()");
  }

  /** Matches the characters whose value is at most 0x7F. */
  @Override
  public boolean matches(char c) {
    return c < 0x80;
  }
}
/** Implementation that matches characters that fall within multiple ranges. */
private static class RangesMatcher extends CharMatcher {
  private final String description;
  private final char[] rangeStarts;
  private final char[] rangeEnds;

  /**
   * @param description the text returned by {@link #toString()}
   * @param rangeStarts first character of each range
   * @param rangeEnds last character (inclusive) of each range, parallel to {@code rangeStarts}
   * @throws IllegalArgumentException if the arrays differ in length, a range ends before it
   *     starts, or a range does not end strictly before the next one starts
   */
  RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) {
    this.description = description;
    this.rangeStarts = rangeStarts;
    this.rangeEnds = rangeEnds;

    int rangeCount = rangeStarts.length;
    checkArgument(rangeCount == rangeEnds.length);
    for (int i = 0; i < rangeCount; i++) {
      // Each range must be non-empty ...
      checkArgument(rangeStarts[i] <= rangeEnds[i]);
      // ... and must finish strictly before its successor begins, which keeps rangeStarts
      // ascending as the binary search in matches() requires.
      boolean hasSuccessor = i + 1 < rangeCount;
      if (hasSuccessor) {
        checkArgument(rangeEnds[i] < rangeStarts[i + 1]);
      }
    }
  }

  /** Binary-searches the range starts, then checks {@code c} against the matching range end. */
  @Override
  public boolean matches(char c) {
    int found = Arrays.binarySearch(rangeStarts, c);
    if (found >= 0) {
      // c is exactly the first character of some range.
      return true;
    }
    // A miss encodes the insertion point as its bitwise complement; the only range that could
    // contain c is the one starting just before that point.
    int insertionPoint = ~found;
    int candidate = insertionPoint - 1;
    return candidate >= 0 && c <= rangeEnds[candidate];
  }

  @Override
  public String toString() {
    return description;
  }
}
/** Implementation of {@link #digit()}. */
private static final class Digit extends RangesMatcher {
  // To regenerate: plug the following UnicodeSet pattern into
  // https://unicode.org/cldr/utility/list-unicodeset.jsp
  //   [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]]
  // and copy the zeroes from the result. They must stay in ascending order.
  private static final String ZEROES =
      "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6"
          + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0"
          + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10";

  /** Range starts: the zero character of each digit block. */
  private static char[] zeroes() {
    return ZEROES.toCharArray();
  }

  /** Range ends: each zero character shifted up by nine. */
  private static char[] nines() {
    char[] nines = ZEROES.toCharArray();
    for (int i = 0; i < nines.length; i++) {
      nines[i] += 9;
    }
    return nines;
  }

  static final Digit INSTANCE = new Digit();

  private Digit() {
    super("CharMatcher.digit()", zeroes(), nines());
  }
}
/** Implementation of {@link #javaDigit()}; defers to {@link Character#isDigit(char)}. */
private static final class JavaDigit extends CharMatcher {
  static final JavaDigit INSTANCE = new JavaDigit();

  @Override
  public String toString() {
    return "CharMatcher.javaDigit()";
  }

  @Override
  public boolean matches(char c) {
    boolean digit = Character.isDigit(c);
    return digit;
  }
}
/** Implementation of {@link #javaLetter()}; defers to {@link Character#isLetter(char)}. */
private static final class JavaLetter extends CharMatcher {
  static final JavaLetter INSTANCE = new JavaLetter();

  @Override
  public String toString() {
    return "CharMatcher.javaLetter()";
  }

  @Override
  public boolean matches(char c) {
    boolean letter = Character.isLetter(c);
    return letter;
  }
}
/**
 * Implementation of {@link #javaLetterOrDigit()}; defers to
 * {@link Character#isLetterOrDigit(char)}.
 */
private static final class JavaLetterOrDigit extends CharMatcher {
  static final JavaLetterOrDigit INSTANCE = new JavaLetterOrDigit();

  @Override
  public String toString() {
    return "CharMatcher.javaLetterOrDigit()";
  }

  @Override
  public boolean matches(char c) {
    boolean letterOrDigit = Character.isLetterOrDigit(c);
    return letterOrDigit;
  }
}
/** Implementation of {@link #javaUpperCase()}; defers to {@link Character#isUpperCase(char)}. */
private static final class JavaUpperCase extends CharMatcher {
  static final JavaUpperCase INSTANCE = new JavaUpperCase();

  @Override
  public String toString() {
    return "CharMatcher.javaUpperCase()";
  }

  @Override
  public boolean matches(char c) {
    boolean upperCase = Character.isUpperCase(c);
    return upperCase;
  }
}
/** Implementation of {@link #javaLowerCase()}; defers to {@link Character#isLowerCase(char)}. */
private static final class JavaLowerCase extends CharMatcher {
  static final JavaLowerCase INSTANCE = new JavaLowerCase();

  @Override
  public String toString() {
    return "CharMatcher.javaLowerCase()";
  }

  @Override
  public boolean matches(char c) {
    boolean lowerCase = Character.isLowerCase(c);
    return lowerCase;
  }
}
/** Implementation of {@link #javaIsoControl()}. */
private static final class JavaIsoControl extends NamedFastMatcher {
  static final JavaIsoControl INSTANCE = new JavaIsoControl();

  private JavaIsoControl() {
    super("CharMatcher.javaIsoControl()");
  }

  /** Matches the two blocks U+0000 through U+001F and U+007F through U+009F. */
  @Override
  public boolean matches(char c) {
    if (c <= 0x1F) {
      return true;
    }
    return 0x7F <= c && c <= 0x9F;
  }
}
/** Implementation of {@link #invisible()}. */
private static final class Invisible extends RangesMatcher {
  // To regenerate: plug the following UnicodeSet pattern into
  // https://unicode.org/cldr/utility/list-unicodeset.jsp
  //   [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]]
  // with the "Abbreviate" option, and copy the ranges from the result.
  private static final String RANGE_STARTS =
      "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u08e2\u1680\u180e\u2000\u2028\u205f\u2066"
          + "\u3000\ud800\ufeff\ufff9";

  // Inclusive ends, position-for-position parallel to RANGE_STARTS.
  private static final String RANGE_ENDS =
      "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u08e2\u1680\u180e\u200f\u202f\u2064\u206f"
          + "\u3000\uf8ff\ufeff\ufffb";

  /** First character of each range. */
  private static char[] starts() {
    return RANGE_STARTS.toCharArray();
  }

  /** Last character (inclusive) of each range. */
  private static char[] inclusiveEnds() {
    return RANGE_ENDS.toCharArray();
  }

  static final Invisible INSTANCE = new Invisible();

  private Invisible() {
    super("CharMatcher.invisible()", starts(), inclusiveEnds());
  }
}
/** Implementation of {@link #singleWidth()}. */
private static final class SingleWidth extends RangesMatcher {
  // First character of each matched range.
  private static final String RANGE_STARTS =
      "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61";

  // Last character (inclusive) of each matched range, parallel to RANGE_STARTS.
  private static final String RANGE_ENDS =
      "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc";

  static final SingleWidth INSTANCE = new SingleWidth();

  private SingleWidth() {
    super("CharMatcher.singleWidth()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray());
  }
}
// Implementation classes backing the non-static (instance) factory methods: negate(), and(), or()
/** Implementation of {@link #negate()}: matches exactly what the wrapped matcher rejects. */
private static class Negated extends CharMatcher {
  final CharMatcher original;

  /**
   * @param original the matcher to invert
   * @throws NullPointerException if {@code original} is null
   */
  Negated(CharMatcher original) {
    this.original = checkNotNull(original);
  }

  @Override
  public boolean matches(char c) {
    boolean acceptedByOriginal = original.matches(c);
    return !acceptedByOriginal;
  }

  /** All characters match here exactly when none of them match the wrapped matcher. */
  @Override
  public boolean matchesAllOf(CharSequence sequence) {
    return original.matchesNoneOf(sequence);
  }

  /** No character matches here exactly when all of them match the wrapped matcher. */
  @Override
  public boolean matchesNoneOf(CharSequence sequence) {
    return original.matchesAllOf(sequence);
  }

  /** Counts the characters that the wrapped matcher does not count. */
  @Override
  public int countIn(CharSequence sequence) {
    int total = sequence.length();
    int countedByOriginal = original.countIn(sequence);
    return total - countedByOriginal;
  }

  @GwtIncompatible // used only from other GwtIncompatible code
  @Override
  void setBits(BitSet table) {
    // Collect the wrapped matcher's bits separately, invert them over the whole char range,
    // and only then merge, so bits already present in the caller's table are left alone.
    BitSet inverted = new BitSet();
    original.setBits(inverted);
    inverted.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
    table.or(inverted);
  }

  /** Negating twice gives back the wrapped matcher itself. */
  @Override
  public CharMatcher negate() {
    return original;
  }

  @Override
  public String toString() {
    return original + ".negate()";
  }
}
/** Implementation of {@link #and(CharMatcher)}: matches what both operands match. */
private static final class And extends CharMatcher {
  final CharMatcher first;
  final CharMatcher second;

  /** @throws NullPointerException if either operand is null */
  And(CharMatcher a, CharMatcher b) {
    first = checkNotNull(a);
    second = checkNotNull(b);
  }

  @Override
  public boolean matches(char c) {
    // The second operand is consulted only when the first one accepts.
    if (!first.matches(c)) {
      return false;
    }
    return second.matches(c);
  }

  @GwtIncompatible // used only from other GwtIncompatible code
  @Override
  void setBits(BitSet table) {
    // Intersect the operands in scratch sets, then merge the result into the caller's table.
    BitSet firstBits = new BitSet();
    first.setBits(firstBits);
    BitSet secondBits = new BitSet();
    second.setBits(secondBits);
    firstBits.and(secondBits);
    table.or(firstBits);
  }

  @Override
  public String toString() {
    return "CharMatcher.and(" + first + ", " + second + ")";
  }
}
/** Implementation of {@link #or(CharMatcher)}: matches what either operand matches. */
private static final class Or extends CharMatcher {
  final CharMatcher first;
  final CharMatcher second;

  /** @throws NullPointerException if either operand is null */
  Or(CharMatcher a, CharMatcher b) {
    first = checkNotNull(a);
    second = checkNotNull(b);
  }

  @Override
  public boolean matches(char c) {
    // The second operand is consulted only when the first one rejects.
    if (first.matches(c)) {
      return true;
    }
    return second.matches(c);
  }

  @GwtIncompatible // used only from other GwtIncompatible code
  @Override
  void setBits(BitSet table) {
    // Both operands write straight into the same table, which yields their union.
    first.setBits(table);
    second.setBits(table);
  }

  @Override
  public String toString() {
    return "CharMatcher.or(" + first + ", " + second + ")";
  }
}
// Implementation classes backing the static factory methods: is(), isNot(), anyOf(), inRange(), forPredicate()
/** Implementation of {@link #is(char)}: matches one specific character. */
private static final class Is extends FastMatcher {
  private final char match;

  Is(char match) {
    this.match = match;
  }

  @Override
  public boolean matches(char c) {
    return match == c;
  }

  /** Delegates to {@link String#replace(char, char)} on the sequence's string form. */
  @Override
  public String replaceFrom(CharSequence sequence, char replacement) {
    String text = sequence.toString();
    return text.replace(match, replacement);
  }

  /** Yields this matcher if {@code other} accepts the character, otherwise {@link #none()}. */
  @Override
  public CharMatcher and(CharMatcher other) {
    if (other.matches(match)) {
      return this;
    }
    return none();
  }

  /** Yields {@code other} if it already accepts the character, otherwise the general union. */
  @Override
  public CharMatcher or(CharMatcher other) {
    if (other.matches(match)) {
      return other;
    }
    return super.or(other);
  }

  @Override
  public CharMatcher negate() {
    return isNot(match);
  }

  @GwtIncompatible // used only from other GwtIncompatible code
  @Override
  void setBits(BitSet table) {
    table.set(match);
  }

  @Override
  public String toString() {
    return "CharMatcher.is('" + showCharacter(match) + "')";
  }
}
/** Implementation of {@link #isNot(char)}: matches everything except one character. */
private static final class IsNot extends FastMatcher {
  private final char match;

  IsNot(char match) {
    this.match = match;
  }

  @Override
  public boolean matches(char c) {
    return match != c;
  }

  /** Yields {@code other} if it already rejects the character, otherwise the general form. */
  @Override
  public CharMatcher and(CharMatcher other) {
    if (other.matches(match)) {
      return super.and(other);
    }
    return other;
  }

  /** Yields {@link #any()} if {@code other} accepts the character, otherwise this matcher. */
  @Override
  public CharMatcher or(CharMatcher other) {
    if (other.matches(match)) {
      return any();
    }
    return this;
  }

  @GwtIncompatible // used only from other GwtIncompatible code
  @Override
  void setBits(BitSet table) {
    // Set everything below the excluded character, then everything above it.
    table.set(0, match);
    table.set(match + 1, Character.MAX_VALUE + 1);
  }

  @Override
  public CharMatcher negate() {
    return is(match);
  }

  @Override
  public String toString() {
    return "CharMatcher.isNot('" + showCharacter(match) + "')";
  }
}
/** Creates a matcher that accepts exactly {@code c1} and {@code c2}. */
private static CharMatcher.IsEither isEither(char c1, char c2) {
  CharMatcher.IsEither matcher = new CharMatcher.IsEither(c1, c2);
  return matcher;
}
/** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */
private static final class IsEither extends FastMatcher {
  private final char match1;
  private final char match2;

  IsEither(char match1, char match2) {
    this.match1 = match1;
    this.match2 = match2;
  }

  @Override
  public boolean matches(char c) {
    if (c == match1) {
      return true;
    }
    return c == match2;
  }

  @GwtIncompatible // used only from other GwtIncompatible code
  @Override
  void setBits(BitSet table) {
    table.set(match1);
    table.set(match2);
  }

  @Override
  public String toString() {
    return "CharMatcher.anyOf(\""
        + showCharacter(match1)
        + showCharacter(match2)
        + "\")";
  }
}
/** Implementation of {@link #anyOf(CharSequence)} for three or more characters. */
private static final class AnyOf extends CharMatcher {
  // Kept sorted so that matches() can binary-search it.
  private final char[] chars;

  /** Copies the characters of {@code chars} into a private array and sorts that copy. */
  public AnyOf(CharSequence chars) {
    char[] sorted = chars.toString().toCharArray();
    Arrays.sort(sorted);
    this.chars = sorted;
  }

  @Override
  public boolean matches(char c) {
    int position = Arrays.binarySearch(chars, c);
    return position >= 0;
  }

  @Override
  @GwtIncompatible // used only from other GwtIncompatible code
  void setBits(BitSet table) {
    for (int i = 0; i < chars.length; i++) {
      table.set(chars[i]);
    }
  }

  /** Lists the characters in their sorted order, not the order they were supplied in. */
  @Override
  public String toString() {
    StringBuilder description = new StringBuilder("CharMatcher.anyOf(\"");
    for (int i = 0; i < chars.length; i++) {
      description.append(showCharacter(chars[i]));
    }
    return description.append("\")").toString();
  }
}
/** Implementation of {@link #inRange(char, char)}; both bounds are inclusive. */
private static final class InRange extends FastMatcher {
  private final char startInclusive;
  private final char endInclusive;

  /** @throws IllegalArgumentException if {@code endInclusive} is below {@code startInclusive} */
  InRange(char startInclusive, char endInclusive) {
    checkArgument(startInclusive <= endInclusive);
    this.startInclusive = startInclusive;
    this.endInclusive = endInclusive;
  }

  @Override
  public boolean matches(char c) {
    if (c < startInclusive) {
      return false;
    }
    return c <= endInclusive;
  }

  @GwtIncompatible // used only from other GwtIncompatible code
  @Override
  void setBits(BitSet table) {
    // BitSet ranges are end-exclusive, hence the + 1 on the inclusive upper bound.
    int endExclusive = endInclusive + 1;
    table.set(startInclusive, endExclusive);
  }

  @Override
  public String toString() {
    return "CharMatcher.inRange('" + showCharacter(startInclusive)
        + "', '" + showCharacter(endInclusive) + "')";
  }
}
/** Implementation of {@link #forPredicate(Predicate)}. */
private static final class ForPredicate extends CharMatcher {
private final Predicate super Character> predicate;
ForPredicate(Predicate super Character> predicate) {
this.predicate = checkNotNull(predicate);
}
@Override
public boolean matches(char c) {
return predicate.apply(c);
}
@SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily
@Override
public boolean apply(Character character) {
return predicate.apply(checkNotNull(character));
}
@Override
public String toString() {
return "CharMatcher.forPredicate(" + predicate + ")";
}
}
}