All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.common.base.CharMatcher Maven / Gradle / Ivy

Go to download

This artifact provides a single jar that contains all classes required to use remote Jakarta Enterprise Beans and Jakarta Messaging, including all dependencies. It is intended for use by those not using maven, maven users should just import the Jakarta Enterprise Beans and Jakarta Messaging BOM's instead (shaded JAR's cause lots of problems with maven, as it is very easy to inadvertently end up with different versions on classes on the class path).

There is a newer version: 35.0.0.Final
Show newest version
/*
 * Copyright (C) 2008 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.common.base;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkPositionIndex;

import com.google.common.annotations.GwtCompatible;
import com.google.common.annotations.GwtIncompatible;
import com.google.common.annotations.VisibleForTesting;
import java.util.Arrays;
import java.util.BitSet;

/**
 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
 * for any {@link Object}. Also offers basic text processing methods based on this function.
 * Implementations are strongly encouraged to be side-effect-free and immutable.
 *
 * 

Throughout the documentation of this class, the phrase "matching character" is used to mean * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}". * *

Warning: This class deals only with {@code char} values, that is, BMP characters. It does not understand * supplementary Unicode code * points in the range {@code 0x10000} to {@code 0x10FFFF} which includes the majority of * assigned characters, including important CJK characters and emoji. * *

Supplementary characters are encoded * into a {@code String} using surrogate pairs, and a {@code CharMatcher} treats these just as * two separate characters. {@link #countIn} counts each supplementary character as 2 {@code char}s. * *

For up-to-date Unicode character properties (digit, letter, etc.) and support for * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For * basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner. * *

Example usages: * *

 *   String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput);
 *   if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }
* *

See the Guava User Guide article on {@code CharMatcher} * . * * @author Kevin Bourrillion * @since 1.0 */ @GwtCompatible(emulated = true) @ElementTypesAreNonnullByDefault public abstract class CharMatcher implements Predicate { /* * N777777777NO * N7777777777777N * M777777777777777N * $N877777777D77777M * N M77777777ONND777M * MN777777777NN D777 * N7ZN777777777NN ~M7778 * N777777777777MMNN88777N * N777777777777MNZZZ7777O * DZN7777O77777777777777 * N7OONND7777777D77777N * 8$M++++?N???$77777$ * M7++++N+M77777777N * N77O777777777777$ M * DNNM$$$$777777N D * N$N:=N$777N7777M NZ * 77Z::::N777777777 ODZZZ * 77N::::::N77777777M NNZZZ$ * $777:::::::77777777MN ZM8ZZZZZ * 777M::::::Z7777777Z77 N++ZZZZNN * 7777M:::::M7777777$777M $++IZZZZM * M777$:::::N777777$M7777M +++++ZZZDN * NN$::::::7777$$M777777N N+++ZZZZNZ * N::::::N:7$O:77777777 N++++ZZZZN * M::::::::::::N77777777+ +?+++++ZZZM * 8::::::::::::D77777777M O+++++ZZ * ::::::::::::M777777777N O+?D * M:::::::::::M77777777778 77= * D=::::::::::N7777777777N 777 * INN===::::::=77777777777N I777N * ?777N========N7777777777787M N7777 * 77777$D======N77777777777N777N? N777777 * I77777$$$N7===M$$77777777$77777777$MMZ77777777N * $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON * M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND * O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N * 7 :N MNN$$$$M$ $$$777$8 8D8I * NMM.:7O 777777778 * 7777777MN * M NO .7: * M : M * 8 */ // Constant matcher factory methods /** * Matches any character. * * @since 19.0 (since 1.0 as constant {@code ANY}) */ public static CharMatcher any() { return Any.INSTANCE; } /** * Matches no characters. * * @since 19.0 (since 1.0 as constant {@code NONE}) */ public static CharMatcher none() { return None.INSTANCE; } /** * Determines whether a character is whitespace according to the latest Unicode standard, as * illustrated here. * This is not the same definition used by other Java APIs. (See a comparison of several definitions of "whitespace".) * *

All Unicode White_Space characters are on the BMP and thus supported by this API. * *

Note: as the Unicode definition evolves, we will modify this matcher to keep it up to * date. * * @since 19.0 (since 1.0 as constant {@code WHITESPACE}) */ public static CharMatcher whitespace() { return Whitespace.INSTANCE; } /** * Determines whether a character is a breaking whitespace (that is, a whitespace which can be * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a * discussion of that term. * * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE}) */ public static CharMatcher breakingWhitespace() { return BreakingWhitespace.INSTANCE; } /** * Determines whether a character is ASCII, meaning that its code point is less than 128. * * @since 19.0 (since 1.0 as constant {@code ASCII}) */ public static CharMatcher ascii() { return Ascii.INSTANCE; } /** * Determines whether a character is a BMP digit according to Unicode. If * you only care to match ASCII digits, you can use {@code inRange('0', '9')}. * * @deprecated Many digits are supplementary characters; see the class documentation. * @since 19.0 (since 1.0 as constant {@code DIGIT}) */ @Deprecated public static CharMatcher digit() { return Digit.INSTANCE; } /** * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char) * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0', * '9')}. * * @deprecated Many digits are supplementary characters; see the class documentation. * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT}) */ @Deprecated public static CharMatcher javaDigit() { return JavaDigit.INSTANCE; } /** * Determines whether a character is a BMP letter according to {@linkplain * Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin * alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}. * * @deprecated Most letters are supplementary characters; see the class documentation. * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER}) */ @Deprecated public static CharMatcher javaLetter() { return JavaLetter.INSTANCE; } /** * Determines whether a character is a BMP letter or digit according to {@linkplain * Character#isLetterOrDigit(char) Java's definition}. * * @deprecated Most letters and digits are supplementary characters; see the class documentation. * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT}). */ @Deprecated public static CharMatcher javaLetterOrDigit() { return JavaLetterOrDigit.INSTANCE; } /** * Determines whether a BMP character is upper case according to {@linkplain * Character#isUpperCase(char) Java's definition}. * * @deprecated Some uppercase characters are supplementary characters; see the class * documentation. * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE}) */ @Deprecated public static CharMatcher javaUpperCase() { return JavaUpperCase.INSTANCE; } /** * Determines whether a BMP character is lower case according to {@linkplain * Character#isLowerCase(char) Java's definition}. * * @deprecated Some lowercase characters are supplementary characters; see the class * documentation. * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE}) */ @Deprecated public static CharMatcher javaLowerCase() { return JavaLowerCase.INSTANCE; } /** * Determines whether a character is an ISO control character as specified by {@link * Character#isISOControl(char)}. * *

All ISO control codes are on the BMP and thus supported by this API. * * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL}) */ public static CharMatcher javaIsoControl() { return JavaIsoControl.INSTANCE; } /** * Determines whether a character is invisible; that is, if its Unicode category is any of * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and * PRIVATE_USE according to ICU4J. * *

See also the Unicode Default_Ignorable_Code_Point property (available via ICU). * * @deprecated Most invisible characters are supplementary characters; see the class * documentation. * @since 19.0 (since 1.0 as constant {@code INVISIBLE}) */ @Deprecated public static CharMatcher invisible() { return Invisible.INSTANCE; } /** * Determines whether a character is single-width (not double-width). When in doubt, this matcher * errs on the side of returning {@code false} (that is, it tends to assume a character is * double-width). * *

Note: as the reference file evolves, we will modify this matcher to keep it up to * date. * *

See also UAX #11 East Asian Width. * * @deprecated Many such characters are supplementary characters; see the class documentation. * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH}) */ @Deprecated public static CharMatcher singleWidth() { return SingleWidth.INSTANCE; } // Static factories /** Returns a {@code char} matcher that matches only one specified BMP character. */ public static CharMatcher is(final char match) { return new Is(match); } /** * Returns a {@code char} matcher that matches any character except the BMP character specified. * *

To negate another {@code CharMatcher}, use {@link #negate()}. */ public static CharMatcher isNot(final char match) { return new IsNot(match); } /** * Returns a {@code char} matcher that matches any BMP character present in the given character * sequence. Returns a bogus matcher if the sequence contains supplementary characters. */ public static CharMatcher anyOf(final CharSequence sequence) { switch (sequence.length()) { case 0: return none(); case 1: return is(sequence.charAt(0)); case 2: return isEither(sequence.charAt(0), sequence.charAt(1)); default: // TODO(lowasser): is it potentially worth just going ahead and building a precomputed // matcher? return new AnyOf(sequence); } } /** * Returns a {@code char} matcher that matches any BMP character not present in the given * character sequence. Returns a bogus matcher if the sequence contains supplementary characters. */ public static CharMatcher noneOf(CharSequence sequence) { return anyOf(sequence).negate(); } /** * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints * are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code * CharMatcher.inRange('a', 'z')}. * * @throws IllegalArgumentException if {@code endInclusive < startInclusive} */ public static CharMatcher inRange(final char startInclusive, final char endInclusive) { return new InRange(startInclusive, endInclusive); } /** * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but * which operates on primitive {@code char} instances instead. */ public static CharMatcher forPredicate(final Predicate predicate) { return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate); } // Constructors /** * Constructor for use by subclasses. When subclassing, you may want to override {@code * toString()} to provide a useful description. */ protected CharMatcher() {} // Abstract methods /** Determines a true or false value for the given character. */ public abstract boolean matches(char c); // Non-static factories /** Returns a matcher that matches any character not matched by this matcher. */ // @Override under Java 8 but not under Java 7 @Override public CharMatcher negate() { return new Negated(this); } /** * Returns a matcher that matches any character matched by both this matcher and {@code other}. */ public CharMatcher and(CharMatcher other) { return new And(this, other); } /** * Returns a matcher that matches any character matched by either this matcher or {@code other}. */ public CharMatcher or(CharMatcher other) { return new Or(this, other); } /** * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to * query than the original; your mileage may vary. Precomputation takes time and is likely to be * worthwhile only if the precomputed matcher is queried many thousands of times. * *

This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a * worthwhile tradeoff in a browser. */ public CharMatcher precomputed() { return Platform.precomputeCharMatcher(this); } private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; /** * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method * on {@link Platform} so that we can have different behavior in GWT. * *

This implementation tries to be smart in a number of ways. It recognizes cases where the * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables * for matchers that only match a few characters, and so on. In the worst-case scenario, it * constructs an eight-kilobyte bit array and queries that. In many situations this produces a * matcher which is faster to query than the original. */ @GwtIncompatible // SmallCharMatcher CharMatcher precomputedInternal() { final BitSet table = new BitSet(); setBits(table); int totalCharacters = table.cardinality(); if (totalCharacters * 2 <= DISTINCT_CHARS) { return precomputedPositive(totalCharacters, table, toString()); } else { // TODO(lowasser): is it worth it to worry about the last character of large matchers? table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); int negatedCharacters = DISTINCT_CHARS - totalCharacters; String suffix = ".negate()"; final String description = toString(); String negatedDescription = description.endsWith(suffix) ? description.substring(0, description.length() - suffix.length()) : description + suffix; return new NegatedFastMatcher( precomputedPositive(negatedCharacters, table, negatedDescription)) { @Override public String toString() { return description; } }; } } /** * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. */ @GwtIncompatible // SmallCharMatcher private static CharMatcher precomputedPositive( int totalCharacters, BitSet table, String description) { switch (totalCharacters) { case 0: return none(); case 1: return is((char) table.nextSetBit(0)); case 2: char c1 = (char) table.nextSetBit(0); char c2 = (char) table.nextSetBit(c1 + 1); return isEither(c1, c2); default: return isSmall(totalCharacters, table.length()) ? SmallCharMatcher.from(table, description) : new BitSetMatcher(table, description); } } @GwtIncompatible // SmallCharMatcher private static boolean isSmall(int totalCharacters, int tableLength) { return totalCharacters <= SmallCharMatcher.MAX_SIZE && tableLength > (totalCharacters * 4 * Character.SIZE); // err on the side of BitSetMatcher } /** Sets bits in {@code table} matched by this matcher. */ @GwtIncompatible // used only from other GwtIncompatible code void setBits(BitSet table) { for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { if (matches((char) c)) { table.set(c); } } } // Text processing routines /** * Returns {@code true} if a character sequence contains at least one matching BMP character. * Equivalent to {@code !matchesNoneOf(sequence)}. * *

The default implementation iterates over the sequence, invoking {@link #matches} for each * character, until this returns {@code true} or the end is reached. * * @param sequence the character sequence to examine, possibly empty * @return {@code true} if this matcher matches at least one character in the sequence * @since 8.0 */ public boolean matchesAnyOf(CharSequence sequence) { return !matchesNoneOf(sequence); } /** * Returns {@code true} if a character sequence contains only matching BMP characters. * *

The default implementation iterates over the sequence, invoking {@link #matches} for each * character, until this returns {@code false} or the end is reached. * * @param sequence the character sequence to examine, possibly empty * @return {@code true} if this matcher matches every character in the sequence, including when * the sequence is empty */ public boolean matchesAllOf(CharSequence sequence) { for (int i = sequence.length() - 1; i >= 0; i--) { if (!matches(sequence.charAt(i))) { return false; } } return true; } /** * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to * {@code !matchesAnyOf(sequence)}. * *

The default implementation iterates over the sequence, invoking {@link #matches} for each * character, until this returns {@code true} or the end is reached. * * @param sequence the character sequence to examine, possibly empty * @return {@code true} if this matcher matches no characters in the sequence, including when the * sequence is empty */ public boolean matchesNoneOf(CharSequence sequence) { return indexIn(sequence) == -1; } /** * Returns the index of the first matching BMP character in a character sequence, or {@code -1} if * no matching character is present. * *

The default implementation iterates over the sequence in forward order calling {@link * #matches} for each character. * * @param sequence the character sequence to examine from the beginning * @return an index, or {@code -1} if no character matches */ public int indexIn(CharSequence sequence) { return indexIn(sequence, 0); } /** * Returns the index of the first matching BMP character in a character sequence, starting from a * given position, or {@code -1} if no character matches after that position. * *

The default implementation iterates over the sequence in forward order, beginning at {@code * start}, calling {@link #matches} for each character. * * @param sequence the character sequence to examine * @param start the first index to examine; must be nonnegative and no greater than {@code * sequence.length()} * @return the index of the first matching character, guaranteed to be no less than {@code start}, * or {@code -1} if no character matches * @throws IndexOutOfBoundsException if start is negative or greater than {@code * sequence.length()} */ public int indexIn(CharSequence sequence, int start) { int length = sequence.length(); checkPositionIndex(start, length); for (int i = start; i < length; i++) { if (matches(sequence.charAt(i))) { return i; } } return -1; } /** * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if * no matching character is present. * *

The default implementation iterates over the sequence in reverse order calling {@link * #matches} for each character. * * @param sequence the character sequence to examine from the end * @return an index, or {@code -1} if no character matches */ public int lastIndexIn(CharSequence sequence) { for (int i = sequence.length() - 1; i >= 0; i--) { if (matches(sequence.charAt(i))) { return i; } } return -1; } /** * Returns the number of matching {@code char}s found in a character sequence. * *

Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}(). */ public int countIn(CharSequence sequence) { int count = 0; for (int i = 0; i < sequence.length(); i++) { if (matches(sequence.charAt(i))) { count++; } } return count; } /** * Returns a string containing all non-matching characters of a character sequence, in order. For * example: * *

{@code
   * CharMatcher.is('a').removeFrom("bazaar")
   * }
* * ... returns {@code "bzr"}. */ public String removeFrom(CharSequence sequence) { String string = sequence.toString(); int pos = indexIn(string); if (pos == -1) { return string; } char[] chars = string.toCharArray(); int spread = 1; // This unusual loop comes from extensive benchmarking OUT: while (true) { pos++; while (true) { if (pos == chars.length) { break OUT; } if (matches(chars[pos])) { break; } chars[pos - spread] = chars[pos]; pos++; } spread++; } return new String(chars, 0, pos - spread); } /** * Returns a string containing all matching BMP characters of a character sequence, in order. For * example: * *
{@code
   * CharMatcher.is('a').retainFrom("bazaar")
   * }
* * ... returns {@code "aaa"}. */ public String retainFrom(CharSequence sequence) { return negate().removeFrom(sequence); } /** * Returns a string copy of the input character sequence, with each matching BMP character * replaced by a given replacement character. For example: * *
{@code
   * CharMatcher.is('a').replaceFrom("radar", 'o')
   * }
* * ... returns {@code "rodor"}. * *

The default implementation uses {@link #indexIn(CharSequence)} to find the first matching * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each * character. * * @param sequence the character sequence to replace matching characters in * @param replacement the character to append to the result string in place of each matching * character in {@code sequence} * @return the new string */ public String replaceFrom(CharSequence sequence, char replacement) { String string = sequence.toString(); int pos = indexIn(string); if (pos == -1) { return string; } char[] chars = string.toCharArray(); chars[pos] = replacement; for (int i = pos + 1; i < chars.length; i++) { if (matches(chars[i])) { chars[i] = replacement; } } return new String(chars); } /** * Returns a string copy of the input character sequence, with each matching BMP character * replaced by a given replacement sequence. For example: * *

{@code
   * CharMatcher.is('a').replaceFrom("yaha", "oo")
   * }
* * ... returns {@code "yoohoo"}. * *

Note: If the replacement is a fixed string with only one character, you are better * off calling {@link #replaceFrom(CharSequence, char)} directly. * * @param sequence the character sequence to replace matching characters in * @param replacement the characters to append to the result string in place of each matching * character in {@code sequence} * @return the new string */ public String replaceFrom(CharSequence sequence, CharSequence replacement) { int replacementLen = replacement.length(); if (replacementLen == 0) { return removeFrom(sequence); } if (replacementLen == 1) { return replaceFrom(sequence, replacement.charAt(0)); } String string = sequence.toString(); int pos = indexIn(string); if (pos == -1) { return string; } int len = string.length(); StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); int oldpos = 0; do { buf.append(string, oldpos, pos); buf.append(replacement); oldpos = pos + 1; pos = indexIn(string, oldpos); } while (pos != -1); buf.append(string, oldpos, len); return buf.toString(); } /** * Returns a substring of the input character sequence that omits all matching BMP characters from * the beginning and from the end of the string. For example: * *

{@code
   * CharMatcher.anyOf("ab").trimFrom("abacatbab")
   * }
* * ... returns {@code "cat"}. * *

Note that: * *

{@code
   * CharMatcher.inRange('\0', ' ').trimFrom(str)
   * }
* * ... is equivalent to {@link String#trim()}. */ public String trimFrom(CharSequence sequence) { int len = sequence.length(); int first; int last; for (first = 0; first < len; first++) { if (!matches(sequence.charAt(first))) { break; } } for (last = len - 1; last > first; last--) { if (!matches(sequence.charAt(last))) { break; } } return sequence.subSequence(first, last + 1).toString(); } /** * Returns a substring of the input character sequence that omits all matching BMP characters from * the beginning of the string. For example: * *
{@code
   * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")
   * }
* * ... returns {@code "catbab"}. */ public String trimLeadingFrom(CharSequence sequence) { int len = sequence.length(); for (int first = 0; first < len; first++) { if (!matches(sequence.charAt(first))) { return sequence.subSequence(first, len).toString(); } } return ""; } /** * Returns a substring of the input character sequence that omits all matching BMP characters from * the end of the string. For example: * *
{@code
   * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")
   * }
* * ... returns {@code "abacat"}. */ public String trimTrailingFrom(CharSequence sequence) { int len = sequence.length(); for (int last = len - 1; last >= 0; last--) { if (!matches(sequence.charAt(last))) { return sequence.subSequence(0, last + 1).toString(); } } return ""; } /** * Returns a string copy of the input character sequence, with each group of consecutive matching * BMP characters replaced by a single replacement character. For example: * *
{@code
   * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')
   * }
* * ... returns {@code "b-p-r"}. * *

The default implementation uses {@link #indexIn(CharSequence)} to find the first matching * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each * character. * * @param sequence the character sequence to replace matching groups of characters in * @param replacement the character to append to the result string in place of each group of * matching characters in {@code sequence} * @return the new string */ public String collapseFrom(CharSequence sequence, char replacement) { // This implementation avoids unnecessary allocation. int len = sequence.length(); for (int i = 0; i < len; i++) { char c = sequence.charAt(i); if (matches(c)) { if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { // a no-op replacement i++; } else { StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement); return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); } } } // no replacement needed return sequence.toString(); } /** * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that * groups of matching BMP characters at the start or end of the sequence are removed without * replacement. */ public String trimAndCollapseFrom(CharSequence sequence, char replacement) { // This implementation avoids unnecessary allocation. int len = sequence.length(); int first = 0; int last = len - 1; while (first < len && matches(sequence.charAt(first))) { first++; } while (last > first && matches(sequence.charAt(last))) { last--; } return (first == 0 && last == len - 1) ? collapseFrom(sequence, replacement) : finishCollapseFrom( sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false); } private String finishCollapseFrom( CharSequence sequence, int start, int end, char replacement, StringBuilder builder, boolean inMatchingGroup) { for (int i = start; i < end; i++) { char c = sequence.charAt(i); if (matches(c)) { if (!inMatchingGroup) { builder.append(replacement); inMatchingGroup = true; } } else { builder.append(c); inMatchingGroup = false; } } return builder.toString(); } /** * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} * instead. */ @Deprecated @Override public boolean apply(Character character) { return matches(character); } /** * Returns a string representation of this {@code CharMatcher}, such as {@code * CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. */ @Override public String toString() { return super.toString(); } /** * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" where * "12AB" is the four hexadecimal digits representing the 16-bit code unit. */ private static String showCharacter(char c) { String hex = "0123456789ABCDEF"; char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; for (int i = 0; i < 4; i++) { tmp[5 - i] = hex.charAt(c & 0xF); c = (char) (c >> 4); } return String.copyValueOf(tmp); } // Fast matchers /** A matcher for which precomputation will not yield any significant benefit. */ abstract static class FastMatcher extends CharMatcher { @Override public final CharMatcher precomputed() { return this; } @Override public CharMatcher negate() { return new NegatedFastMatcher(this); } } /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */ abstract static class NamedFastMatcher extends FastMatcher { private final String description; NamedFastMatcher(String description) { this.description = checkNotNull(description); } @Override public final String toString() { return description; } } /** Negation of a {@link FastMatcher}. */ static class NegatedFastMatcher extends Negated { NegatedFastMatcher(CharMatcher original) { super(original); } @Override public final CharMatcher precomputed() { return this; } } /** Fast matcher using a {@link BitSet} table of matching characters. */ @GwtIncompatible // used only from other GwtIncompatible code private static final class BitSetMatcher extends NamedFastMatcher { private final BitSet table; private BitSetMatcher(BitSet table, String description) { super(description); if (table.length() + Long.SIZE < table.size()) { table = (BitSet) table.clone(); // If only we could actually call BitSet.trimToSize() ourselves... } this.table = table; } @Override public boolean matches(char c) { return table.get(c); } @Override void setBits(BitSet bitSet) { bitSet.or(table); } } // Static constant implementation classes /** Implementation of {@link #any()}. */ private static final class Any extends NamedFastMatcher { static final Any INSTANCE = new Any(); private Any() { super("CharMatcher.any()"); } @Override public boolean matches(char c) { return true; } @Override public int indexIn(CharSequence sequence) { return (sequence.length() == 0) ? -1 : 0; } @Override public int indexIn(CharSequence sequence, int start) { int length = sequence.length(); checkPositionIndex(start, length); return (start == length) ? -1 : start; } @Override public int lastIndexIn(CharSequence sequence) { return sequence.length() - 1; } @Override public boolean matchesAllOf(CharSequence sequence) { checkNotNull(sequence); return true; } @Override public boolean matchesNoneOf(CharSequence sequence) { return sequence.length() == 0; } @Override public String removeFrom(CharSequence sequence) { checkNotNull(sequence); return ""; } @Override public String replaceFrom(CharSequence sequence, char replacement) { char[] array = new char[sequence.length()]; Arrays.fill(array, replacement); return new String(array); } @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { StringBuilder result = new StringBuilder(sequence.length() * replacement.length()); for (int i = 0; i < sequence.length(); i++) { result.append(replacement); } return result.toString(); } @Override public String collapseFrom(CharSequence sequence, char replacement) { return (sequence.length() == 0) ? "" : String.valueOf(replacement); } @Override public String trimFrom(CharSequence sequence) { checkNotNull(sequence); return ""; } @Override public int countIn(CharSequence sequence) { return sequence.length(); } @Override public CharMatcher and(CharMatcher other) { return checkNotNull(other); } @Override public CharMatcher or(CharMatcher other) { checkNotNull(other); return this; } @Override public CharMatcher negate() { return none(); } } /** Implementation of {@link #none()}. */ private static final class None extends NamedFastMatcher { static final None INSTANCE = new None(); private None() { super("CharMatcher.none()"); } @Override public boolean matches(char c) { return false; } @Override public int indexIn(CharSequence sequence) { checkNotNull(sequence); return -1; } @Override public int indexIn(CharSequence sequence, int start) { int length = sequence.length(); checkPositionIndex(start, length); return -1; } @Override public int lastIndexIn(CharSequence sequence) { checkNotNull(sequence); return -1; } @Override public boolean matchesAllOf(CharSequence sequence) { return sequence.length() == 0; } @Override public boolean matchesNoneOf(CharSequence sequence) { checkNotNull(sequence); return true; } @Override public String removeFrom(CharSequence sequence) { return sequence.toString(); } @Override public String replaceFrom(CharSequence sequence, char replacement) { return sequence.toString(); } @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { checkNotNull(replacement); return sequence.toString(); } @Override public String collapseFrom(CharSequence sequence, char replacement) { return sequence.toString(); } @Override public String trimFrom(CharSequence sequence) { return sequence.toString(); } @Override public String trimLeadingFrom(CharSequence sequence) { return sequence.toString(); } @Override public String trimTrailingFrom(CharSequence sequence) { return sequence.toString(); } @Override public int countIn(CharSequence sequence) { checkNotNull(sequence); return 0; } @Override public CharMatcher and(CharMatcher other) { checkNotNull(other); return this; } @Override public CharMatcher or(CharMatcher other) { return checkNotNull(other); } @Override public CharMatcher negate() { return any(); } } /** Implementation of {@link #whitespace()}. */ @VisibleForTesting static final class Whitespace extends NamedFastMatcher { // TABLE is a precomputed hashset of whitespace characters. MULTIPLIER serves as a hash function // whose key property is that it maps 25 characters into the 32-slot table without collision. // Basically this is an opportunistic fast implementation as opposed to "good code". For most // other use-cases, the reduction in readability isn't worth it. static final String TABLE = "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; static final int MULTIPLIER = 1682554634; static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1); static final Whitespace INSTANCE = new Whitespace(); Whitespace() { super("CharMatcher.whitespace()"); } @Override public boolean matches(char c) { return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c; } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { for (int i = 0; i < TABLE.length(); i++) { table.set(TABLE.charAt(i)); } } } /** Implementation of {@link #breakingWhitespace()}. */ private static final class BreakingWhitespace extends CharMatcher { static final CharMatcher INSTANCE = new BreakingWhitespace(); @Override public boolean matches(char c) { switch (c) { case '\t': case '\n': case '\013': case '\f': case '\r': case ' ': case '\u0085': case '\u1680': case '\u2028': case '\u2029': case '\u205f': case '\u3000': return true; case '\u2007': return false; default: return c >= '\u2000' && c <= '\u200a'; } } @Override public String toString() { return "CharMatcher.breakingWhitespace()"; } } /** Implementation of {@link #ascii()}. */ private static final class Ascii extends NamedFastMatcher { static final Ascii INSTANCE = new Ascii(); Ascii() { super("CharMatcher.ascii()"); } @Override public boolean matches(char c) { return c <= '\u007f'; } } /** Implementation that matches characters that fall within multiple ranges. */ private static class RangesMatcher extends CharMatcher { private final String description; private final char[] rangeStarts; private final char[] rangeEnds; RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { this.description = description; this.rangeStarts = rangeStarts; this.rangeEnds = rangeEnds; checkArgument(rangeStarts.length == rangeEnds.length); for (int i = 0; i < rangeStarts.length; i++) { checkArgument(rangeStarts[i] <= rangeEnds[i]); if (i + 1 < rangeStarts.length) { checkArgument(rangeEnds[i] < rangeStarts[i + 1]); } } } @Override public boolean matches(char c) { int index = Arrays.binarySearch(rangeStarts, c); if (index >= 0) { return true; } else { index = ~index - 1; return index >= 0 && c <= rangeEnds[index]; } } @Override public String toString() { return description; } } /** Implementation of {@link #digit()}. */ private static final class Digit extends RangesMatcher { // Plug the following UnicodeSet pattern into // https://unicode.org/cldr/utility/list-unicodeset.jsp // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]] // and get the zeroes from there. // Must be in ascending order. private static final String ZEROES = "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6" + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0" + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10"; private static char[] zeroes() { return ZEROES.toCharArray(); } private static char[] nines() { char[] nines = new char[ZEROES.length()]; for (int i = 0; i < ZEROES.length(); i++) { nines[i] = (char) (ZEROES.charAt(i) + 9); } return nines; } static final Digit INSTANCE = new Digit(); private Digit() { super("CharMatcher.digit()", zeroes(), nines()); } } /** Implementation of {@link #javaDigit()}. */ private static final class JavaDigit extends CharMatcher { static final JavaDigit INSTANCE = new JavaDigit(); @Override public boolean matches(char c) { return Character.isDigit(c); } @Override public String toString() { return "CharMatcher.javaDigit()"; } } /** Implementation of {@link #javaLetter()}. */ private static final class JavaLetter extends CharMatcher { static final JavaLetter INSTANCE = new JavaLetter(); @Override public boolean matches(char c) { return Character.isLetter(c); } @Override public String toString() { return "CharMatcher.javaLetter()"; } } /** Implementation of {@link #javaLetterOrDigit()}. */ private static final class JavaLetterOrDigit extends CharMatcher { static final JavaLetterOrDigit INSTANCE = new JavaLetterOrDigit(); @Override public boolean matches(char c) { return Character.isLetterOrDigit(c); } @Override public String toString() { return "CharMatcher.javaLetterOrDigit()"; } } /** Implementation of {@link #javaUpperCase()}. */ private static final class JavaUpperCase extends CharMatcher { static final JavaUpperCase INSTANCE = new JavaUpperCase(); @Override public boolean matches(char c) { return Character.isUpperCase(c); } @Override public String toString() { return "CharMatcher.javaUpperCase()"; } } /** Implementation of {@link #javaLowerCase()}. */ private static final class JavaLowerCase extends CharMatcher { static final JavaLowerCase INSTANCE = new JavaLowerCase(); @Override public boolean matches(char c) { return Character.isLowerCase(c); } @Override public String toString() { return "CharMatcher.javaLowerCase()"; } } /** Implementation of {@link #javaIsoControl()}. */ private static final class JavaIsoControl extends NamedFastMatcher { static final JavaIsoControl INSTANCE = new JavaIsoControl(); private JavaIsoControl() { super("CharMatcher.javaIsoControl()"); } @Override public boolean matches(char c) { return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f'); } } /** Implementation of {@link #invisible()}. */ private static final class Invisible extends RangesMatcher { // Plug the following UnicodeSet pattern into // https://unicode.org/cldr/utility/list-unicodeset.jsp // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]] // with the "Abbreviate" option, and get the ranges from there. private static final String RANGE_STARTS = "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u0890\u08e2\u1680\u180e\u2000\u2028\u205f\u2066" + "\u3000\ud800\ufeff\ufff9"; private static final String RANGE_ENDS = // inclusive ends "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u0891\u08e2\u1680\u180e\u200f\u202f\u2064\u206f" + "\u3000\uf8ff\ufeff\ufffb"; static final Invisible INSTANCE = new Invisible(); private Invisible() { super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray()); } } /** Implementation of {@link #singleWidth()}. */ private static final class SingleWidth extends RangesMatcher { static final SingleWidth INSTANCE = new SingleWidth(); private SingleWidth() { super( "CharMatcher.singleWidth()", "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); } } // Non-static factory implementation classes /** Implementation of {@link #negate()}. */ private static class Negated extends CharMatcher { final CharMatcher original; Negated(CharMatcher original) { this.original = checkNotNull(original); } @Override public boolean matches(char c) { return !original.matches(c); } @Override public boolean matchesAllOf(CharSequence sequence) { return original.matchesNoneOf(sequence); } @Override public boolean matchesNoneOf(CharSequence sequence) { return original.matchesAllOf(sequence); } @Override public int countIn(CharSequence sequence) { return sequence.length() - original.countIn(sequence); } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { BitSet tmp = new BitSet(); original.setBits(tmp); tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); table.or(tmp); } @Override public CharMatcher negate() { return original; } @Override public String toString() { return original + ".negate()"; } } /** Implementation of {@link #and(CharMatcher)}. */ private static final class And extends CharMatcher { final CharMatcher first; final CharMatcher second; And(CharMatcher a, CharMatcher b) { first = checkNotNull(a); second = checkNotNull(b); } @Override public boolean matches(char c) { return first.matches(c) && second.matches(c); } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { BitSet tmp1 = new BitSet(); first.setBits(tmp1); BitSet tmp2 = new BitSet(); second.setBits(tmp2); tmp1.and(tmp2); table.or(tmp1); } @Override public String toString() { return "CharMatcher.and(" + first + ", " + second + ")"; } } /** Implementation of {@link #or(CharMatcher)}. */ private static final class Or extends CharMatcher { final CharMatcher first; final CharMatcher second; Or(CharMatcher a, CharMatcher b) { first = checkNotNull(a); second = checkNotNull(b); } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { first.setBits(table); second.setBits(table); } @Override public boolean matches(char c) { return first.matches(c) || second.matches(c); } @Override public String toString() { return "CharMatcher.or(" + first + ", " + second + ")"; } } // Static factory implementations /** Implementation of {@link #is(char)}. */ private static final class Is extends FastMatcher { private final char match; Is(char match) { this.match = match; } @Override public boolean matches(char c) { return c == match; } @Override public String replaceFrom(CharSequence sequence, char replacement) { return sequence.toString().replace(match, replacement); } @Override public CharMatcher and(CharMatcher other) { return other.matches(match) ? this : none(); } @Override public CharMatcher or(CharMatcher other) { return other.matches(match) ? other : super.or(other); } @Override public CharMatcher negate() { return isNot(match); } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { table.set(match); } @Override public String toString() { return "CharMatcher.is('" + showCharacter(match) + "')"; } } /** Implementation of {@link #isNot(char)}. */ private static final class IsNot extends FastMatcher { private final char match; IsNot(char match) { this.match = match; } @Override public boolean matches(char c) { return c != match; } @Override public CharMatcher and(CharMatcher other) { return other.matches(match) ? super.and(other) : other; } @Override public CharMatcher or(CharMatcher other) { return other.matches(match) ? any() : this; } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { table.set(0, match); table.set(match + 1, Character.MAX_VALUE + 1); } @Override public CharMatcher negate() { return is(match); } @Override public String toString() { return "CharMatcher.isNot('" + showCharacter(match) + "')"; } } private static CharMatcher.IsEither isEither(char c1, char c2) { return new CharMatcher.IsEither(c1, c2); } /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */ private static final class IsEither extends FastMatcher { private final char match1; private final char match2; IsEither(char match1, char match2) { this.match1 = match1; this.match2 = match2; } @Override public boolean matches(char c) { return c == match1 || c == match2; } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { table.set(match1); table.set(match2); } @Override public String toString() { return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")"; } } /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. */ private static final class AnyOf extends CharMatcher { private final char[] chars; public AnyOf(CharSequence chars) { this.chars = chars.toString().toCharArray(); Arrays.sort(this.chars); } @Override public boolean matches(char c) { return Arrays.binarySearch(chars, c) >= 0; } @Override @GwtIncompatible // used only from other GwtIncompatible code void setBits(BitSet table) { for (char c : chars) { table.set(c); } } @Override public String toString() { StringBuilder description = new StringBuilder("CharMatcher.anyOf(\""); for (char c : chars) { description.append(showCharacter(c)); } description.append("\")"); return description.toString(); } } /** Implementation of {@link #inRange(char, char)}. */ private static final class InRange extends FastMatcher { private final char startInclusive; private final char endInclusive; InRange(char startInclusive, char endInclusive) { checkArgument(endInclusive >= startInclusive); this.startInclusive = startInclusive; this.endInclusive = endInclusive; } @Override public boolean matches(char c) { return startInclusive <= c && c <= endInclusive; } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { table.set(startInclusive, endInclusive + 1); } @Override public String toString() { return "CharMatcher.inRange('" + showCharacter(startInclusive) + "', '" + showCharacter(endInclusive) + "')"; } } /** Implementation of {@link #forPredicate(Predicate)}. */ private static final class ForPredicate extends CharMatcher { private final Predicate predicate; ForPredicate(Predicate predicate) { this.predicate = checkNotNull(predicate); } @Override public boolean matches(char c) { return predicate.apply(c); } @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily @Override public boolean apply(Character character) { return predicate.apply(checkNotNull(character)); } @Override public String toString() { return "CharMatcher.forPredicate(" + predicate + ")"; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy