All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.common.base.CharMatcher Maven / Gradle / Ivy

There is a newer version: 3.0.0-alpha-3
Show newest version
/*
 * Copyright (C) 2008 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.common.base;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkPositionIndex;

import com.google.common.annotations.GwtCompatible;
import com.google.common.annotations.GwtIncompatible;
import com.google.common.annotations.VisibleForTesting;
import java.util.Arrays;
import java.util.BitSet;

/**
 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
 * for any {@link Object}. Also offers basic text processing methods based on this function.
 * Implementations are strongly encouraged to be side-effect-free and immutable.
 *
 * 

Throughout the documentation of this class, the phrase "matching character" is used to mean * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}". * *

Warning: This class deals only with {@code char} values, that is, BMP characters. It does not understand * supplementary Unicode code * points in the range {@code 0x10000} to {@code 0x10FFFF} which includes the majority of * assigned characters, including important CJK characters and emoji. * *

Supplementary characters are encoded * into a {@code String} using surrogate pairs, and a {@code CharMatcher} treats these just as * two separate characters. {@link #countIn} counts each supplementary character as 2 {@code char}s. * *

For up-to-date Unicode character properties (digit, letter, etc.) and support for * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For * basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner. * *

Example usages: * *

 *   String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput);
 *   if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }
* *

See the Guava User Guide article on {@code CharMatcher} * . * * @author Kevin Bourrillion * @since 1.0 */ @GwtCompatible(emulated = true) @ElementTypesAreNonnullByDefault public abstract class CharMatcher implements Predicate { /* * N777777777NO * N7777777777777N * M777777777777777N * $N877777777D77777M * N M77777777ONND777M * MN777777777NN D777 * N7ZN777777777NN ~M7778 * N777777777777MMNN88777N * N777777777777MNZZZ7777O * DZN7777O77777777777777 * N7OONND7777777D77777N * 8$M++++?N???$77777$ * M7++++N+M77777777N * N77O777777777777$ M * DNNM$$$$777777N D * N$N:=N$777N7777M NZ * 77Z::::N777777777 ODZZZ * 77N::::::N77777777M NNZZZ$ * $777:::::::77777777MN ZM8ZZZZZ * 777M::::::Z7777777Z77 N++ZZZZNN * 7777M:::::M7777777$777M $++IZZZZM * M777$:::::N777777$M7777M +++++ZZZDN * NN$::::::7777$$M777777N N+++ZZZZNZ * N::::::N:7$O:77777777 N++++ZZZZN * M::::::::::::N77777777+ +?+++++ZZZM * 8::::::::::::D77777777M O+++++ZZ * ::::::::::::M777777777N O+?D * M:::::::::::M77777777778 77= * D=::::::::::N7777777777N 777 * INN===::::::=77777777777N I777N * ?777N========N7777777777787M N7777 * 77777$D======N77777777777N777N? N777777 * I77777$$$N7===M$$77777777$77777777$MMZ77777777N * $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON * M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND * O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N * 7 :N MNN$$$$M$ $$$777$8 8D8I * NMM.:7O 777777778 * 7777777MN * M NO .7: * M : M * 8 */ // Constant matcher factory methods /** * Matches any character. * * @since 19.0 (since 1.0 as constant {@code ANY}) */ public static CharMatcher any() { return Any.INSTANCE; } /** * Matches no characters. * * @since 19.0 (since 1.0 as constant {@code NONE}) */ public static CharMatcher none() { return None.INSTANCE; } /** * Determines whether a character is whitespace according to the latest Unicode standard, as * illustrated here. * This is not the same definition used by other Java APIs. (See a comparison of several definitions of "whitespace".) * *

All Unicode White_Space characters are on the BMP and thus supported by this API. * *

Note: as the Unicode definition evolves, we will modify this matcher to keep it up to * date. * * @since 19.0 (since 1.0 as constant {@code WHITESPACE}) */ public static CharMatcher whitespace() { return Whitespace.INSTANCE; } /** * Determines whether a character is a breaking whitespace (that is, a whitespace which can be * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a * discussion of that term. * * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE}) */ public static CharMatcher breakingWhitespace() { return BreakingWhitespace.INSTANCE; } /** * Determines whether a character is ASCII, meaning that its code point is less than 128. * * @since 19.0 (since 1.0 as constant {@code ASCII}) */ public static CharMatcher ascii() { return Ascii.INSTANCE; } /** * Determines whether a character is a BMP digit according to Unicode. If * you only care to match ASCII digits, you can use {@code inRange('0', '9')}. * * @deprecated Many digits are supplementary characters; see the class documentation. * @since 19.0 (since 1.0 as constant {@code DIGIT}) */ @Deprecated public static CharMatcher digit() { return Digit.INSTANCE; } /** * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char) * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0', * '9')}. * * @deprecated Many digits are supplementary characters; see the class documentation. * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT}) */ @Deprecated public static CharMatcher javaDigit() { return JavaDigit.INSTANCE; } /** * Determines whether a character is a BMP letter according to {@linkplain * Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin * alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}. * * @deprecated Most letters are supplementary characters; see the class documentation. * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER}) */ @Deprecated public static CharMatcher javaLetter() { return JavaLetter.INSTANCE; } /** * Determines whether a character is a BMP letter or digit according to {@linkplain * Character#isLetterOrDigit(char) Java's definition}. * * @deprecated Most letters and digits are supplementary characters; see the class documentation. * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT}). */ @Deprecated public static CharMatcher javaLetterOrDigit() { return JavaLetterOrDigit.INSTANCE; } /** * Determines whether a BMP character is upper case according to {@linkplain * Character#isUpperCase(char) Java's definition}. * * @deprecated Some uppercase characters are supplementary characters; see the class * documentation. * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE}) */ @Deprecated public static CharMatcher javaUpperCase() { return JavaUpperCase.INSTANCE; } /** * Determines whether a BMP character is lower case according to {@linkplain * Character#isLowerCase(char) Java's definition}. * * @deprecated Some lowercase characters are supplementary characters; see the class * documentation. * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE}) */ @Deprecated public static CharMatcher javaLowerCase() { return JavaLowerCase.INSTANCE; } /** * Determines whether a character is an ISO control character as specified by {@link * Character#isISOControl(char)}. * *

All ISO control codes are on the BMP and thus supported by this API. * * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL}) */ public static CharMatcher javaIsoControl() { return JavaIsoControl.INSTANCE; } /** * Determines whether a character is invisible; that is, if its Unicode category is any of * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and * PRIVATE_USE according to ICU4J. * *

See also the Unicode Default_Ignorable_Code_Point property (available via ICU). * * @deprecated Most invisible characters are supplementary characters; see the class * documentation. * @since 19.0 (since 1.0 as constant {@code INVISIBLE}) */ @Deprecated public static CharMatcher invisible() { return Invisible.INSTANCE; } /** * Determines whether a character is single-width (not double-width). When in doubt, this matcher * errs on the side of returning {@code false} (that is, it tends to assume a character is * double-width). * *

Note: as the reference file evolves, we will modify this matcher to keep it up to * date. * *

See also UAX #11 East Asian Width. * * @deprecated Many such characters are supplementary characters; see the class documentation. * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH}) */ @Deprecated public static CharMatcher singleWidth() { return SingleWidth.INSTANCE; } // Static factories /** Returns a {@code char} matcher that matches only one specified BMP character. */ public static CharMatcher is(final char match) { return new Is(match); } /** * Returns a {@code char} matcher that matches any character except the BMP character specified. * *

To negate another {@code CharMatcher}, use {@link #negate()}. */ public static CharMatcher isNot(final char match) { return new IsNot(match); } /** * Returns a {@code char} matcher that matches any BMP character present in the given character * sequence. Returns a bogus matcher if the sequence contains supplementary characters. */ public static CharMatcher anyOf(final CharSequence sequence) { switch (sequence.length()) { case 0: return none(); case 1: return is(sequence.charAt(0)); case 2: return isEither(sequence.charAt(0), sequence.charAt(1)); default: // TODO(lowasser): is it potentially worth just going ahead and building a precomputed // matcher? return new AnyOf(sequence); } } /** * Returns a {@code char} matcher that matches any BMP character not present in the given * character sequence. Returns a bogus matcher if the sequence contains supplementary characters. */ public static CharMatcher noneOf(CharSequence sequence) { return anyOf(sequence).negate(); } /** * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints * are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code * CharMatcher.inRange('a', 'z')}. * * @throws IllegalArgumentException if {@code endInclusive < startInclusive} */ public static CharMatcher inRange(final char startInclusive, final char endInclusive) { return new InRange(startInclusive, endInclusive); } /** * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but * which operates on primitive {@code char} instances instead. */ public static CharMatcher forPredicate(final Predicate predicate) { return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate); } // Constructors /** * Constructor for use by subclasses. When subclassing, you may want to override {@code * toString()} to provide a useful description. */ protected CharMatcher() {} // Abstract methods /** Determines a true or false value for the given character. */ public abstract boolean matches(char c); // Non-static factories /** Returns a matcher that matches any character not matched by this matcher. */ // @Override under Java 8 but not under Java 7 @Override public CharMatcher negate() { return new Negated(this); } /** * Returns a matcher that matches any character matched by both this matcher and {@code other}. */ public CharMatcher and(CharMatcher other) { return new And(this, other); } /** * Returns a matcher that matches any character matched by either this matcher or {@code other}. */ public CharMatcher or(CharMatcher other) { return new Or(this, other); } /** * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to * query than the original; your mileage may vary. Precomputation takes time and is likely to be * worthwhile only if the precomputed matcher is queried many thousands of times. * *

This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a * worthwhile tradeoff in a browser. */ public CharMatcher precomputed() { return Platform.precomputeCharMatcher(this); } private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; /** * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method * on {@link Platform} so that we can have different behavior in GWT. * *

This implementation tries to be smart in a number of ways. It recognizes cases where the * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables * for matchers that only match a few characters, and so on. In the worst-case scenario, it * constructs an eight-kilobyte bit array and queries that. In many situations this produces a * matcher which is faster to query than the original. */ @GwtIncompatible // SmallCharMatcher CharMatcher precomputedInternal() { final BitSet table = new BitSet(); setBits(table); int totalCharacters = table.cardinality(); if (totalCharacters * 2 <= DISTINCT_CHARS) { return precomputedPositive(totalCharacters, table, toString()); } else { // TODO(lowasser): is it worth it to worry about the last character of large matchers? table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); int negatedCharacters = DISTINCT_CHARS - totalCharacters; String suffix = ".negate()"; final String description = toString(); String negatedDescription = description.endsWith(suffix) ? description.substring(0, description.length() - suffix.length()) : description + suffix; return new NegatedFastMatcher( precomputedPositive(negatedCharacters, table, negatedDescription)) { @Override public String toString() { return description; } }; } } /** * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. */ @GwtIncompatible // SmallCharMatcher private static CharMatcher precomputedPositive( int totalCharacters, BitSet table, String description) { switch (totalCharacters) { case 0: return none(); case 1: return is((char) table.nextSetBit(0)); case 2: char c1 = (char) table.nextSetBit(0); char c2 = (char) table.nextSetBit(c1 + 1); return isEither(c1, c2); default: return isSmall(totalCharacters, table.length()) ? SmallCharMatcher.from(table, description) : new BitSetMatcher(table, description); } } @GwtIncompatible // SmallCharMatcher private static boolean isSmall(int totalCharacters, int tableLength) { return totalCharacters <= SmallCharMatcher.MAX_SIZE && tableLength > (totalCharacters * 4 * Character.SIZE); // err on the side of BitSetMatcher } /** Sets bits in {@code table} matched by this matcher. */ @GwtIncompatible // used only from other GwtIncompatible code void setBits(BitSet table) { for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { if (matches((char) c)) { table.set(c); } } } // Text processing routines /** * Returns {@code true} if a character sequence contains at least one matching BMP character. * Equivalent to {@code !matchesNoneOf(sequence)}. * *

The default implementation iterates over the sequence, invoking {@link #matches} for each * character, until this returns {@code true} or the end is reached. * * @param sequence the character sequence to examine, possibly empty * @return {@code true} if this matcher matches at least one character in the sequence * @since 8.0 */ public boolean matchesAnyOf(CharSequence sequence) { return !matchesNoneOf(sequence); } /** * Returns {@code true} if a character sequence contains only matching BMP characters. * *

The default implementation iterates over the sequence, invoking {@link #matches} for each * character, until this returns {@code false} or the end is reached. * * @param sequence the character sequence to examine, possibly empty * @return {@code true} if this matcher matches every character in the sequence, including when * the sequence is empty */ public boolean matchesAllOf(CharSequence sequence) { for (int i = sequence.length() - 1; i >= 0; i--) { if (!matches(sequence.charAt(i))) { return false; } } return true; } /** * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to * {@code !matchesAnyOf(sequence)}. * *

The default implementation iterates over the sequence, invoking {@link #matches} for each * character, until this returns {@code true} or the end is reached. * * @param sequence the character sequence to examine, possibly empty * @return {@code true} if this matcher matches no characters in the sequence, including when the * sequence is empty */ public boolean matchesNoneOf(CharSequence sequence) { return indexIn(sequence) == -1; } /** * Returns the index of the first matching BMP character in a character sequence, or {@code -1} if * no matching character is present. * *

The default implementation iterates over the sequence in forward order calling {@link * #matches} for each character. * * @param sequence the character sequence to examine from the beginning * @return an index, or {@code -1} if no character matches */ public int indexIn(CharSequence sequence) { return indexIn(sequence, 0); } /** * Returns the index of the first matching BMP character in a character sequence, starting from a * given position, or {@code -1} if no character matches after that position. * *

The default implementation iterates over the sequence in forward order, beginning at {@code * start}, calling {@link #matches} for each character. * * @param sequence the character sequence to examine * @param start the first index to examine; must be nonnegative and no greater than {@code * sequence.length()} * @return the index of the first matching character, guaranteed to be no less than {@code start}, * or {@code -1} if no character matches * @throws IndexOutOfBoundsException if start is negative or greater than {@code * sequence.length()} */ public int indexIn(CharSequence sequence, int start) { int length = sequence.length(); checkPositionIndex(start, length); for (int i = start; i < length; i++) { if (matches(sequence.charAt(i))) { return i; } } return -1; } /** * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if * no matching character is present. * *

The default implementation iterates over the sequence in reverse order calling {@link * #matches} for each character. * * @param sequence the character sequence to examine from the end * @return an index, or {@code -1} if no character matches */ public int lastIndexIn(CharSequence sequence) { for (int i = sequence.length() - 1; i >= 0; i--) { if (matches(sequence.charAt(i))) { return i; } } return -1; } /** * Returns the number of matching {@code char}s found in a character sequence. * *

Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}(). */ public int countIn(CharSequence sequence) { int count = 0; for (int i = 0; i < sequence.length(); i++) { if (matches(sequence.charAt(i))) { count++; } } return count; } /** * Returns a string containing all non-matching characters of a character sequence, in order. For * example: * *

{@code
   * CharMatcher.is('a').removeFrom("bazaar")
   * }
* * ... returns {@code "bzr"}. */ public String removeFrom(CharSequence sequence) { String string = sequence.toString(); int pos = indexIn(string); if (pos == -1) { return string; } char[] chars = string.toCharArray(); int spread = 1; // This unusual loop comes from extensive benchmarking OUT: while (true) { pos++; while (true) { if (pos == chars.length) { break OUT; } if (matches(chars[pos])) { break; } chars[pos - spread] = chars[pos]; pos++; } spread++; } return new String(chars, 0, pos - spread); } /** * Returns a string containing all matching BMP characters of a character sequence, in order. For * example: * *
{@code
   * CharMatcher.is('a').retainFrom("bazaar")
   * }
* * ... returns {@code "aaa"}. */ public String retainFrom(CharSequence sequence) { return negate().removeFrom(sequence); } /** * Returns a string copy of the input character sequence, with each matching BMP character * replaced by a given replacement character. For example: * *
{@code
   * CharMatcher.is('a').replaceFrom("radar", 'o')
   * }
* * ... returns {@code "rodor"}. * *

The default implementation uses {@link #indexIn(CharSequence)} to find the first matching * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each * character. * * @param sequence the character sequence to replace matching characters in * @param replacement the character to append to the result string in place of each matching * character in {@code sequence} * @return the new string */ public String replaceFrom(CharSequence sequence, char replacement) { String string = sequence.toString(); int pos = indexIn(string); if (pos == -1) { return string; } char[] chars = string.toCharArray(); chars[pos] = replacement; for (int i = pos + 1; i < chars.length; i++) { if (matches(chars[i])) { chars[i] = replacement; } } return new String(chars); } /** * Returns a string copy of the input character sequence, with each matching BMP character * replaced by a given replacement sequence. For example: * *

{@code
   * CharMatcher.is('a').replaceFrom("yaha", "oo")
   * }
* * ... returns {@code "yoohoo"}. * *

Note: If the replacement is a fixed string with only one character, you are better * off calling {@link #replaceFrom(CharSequence, char)} directly. * * @param sequence the character sequence to replace matching characters in * @param replacement the characters to append to the result string in place of each matching * character in {@code sequence} * @return the new string */ public String replaceFrom(CharSequence sequence, CharSequence replacement) { int replacementLen = replacement.length(); if (replacementLen == 0) { return removeFrom(sequence); } if (replacementLen == 1) { return replaceFrom(sequence, replacement.charAt(0)); } String string = sequence.toString(); int pos = indexIn(string); if (pos == -1) { return string; } int len = string.length(); StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); int oldpos = 0; do { buf.append(string, oldpos, pos); buf.append(replacement); oldpos = pos + 1; pos = indexIn(string, oldpos); } while (pos != -1); buf.append(string, oldpos, len); return buf.toString(); } /** * Returns a substring of the input character sequence that omits all matching BMP characters from * the beginning and from the end of the string. For example: * *

{@code
   * CharMatcher.anyOf("ab").trimFrom("abacatbab")
   * }
* * ... returns {@code "cat"}. * *

Note that: * *

{@code
   * CharMatcher.inRange('\0', ' ').trimFrom(str)
   * }
* * ... is equivalent to {@link String#trim()}. */ public String trimFrom(CharSequence sequence) { int len = sequence.length(); int first; int last; for (first = 0; first < len; first++) { if (!matches(sequence.charAt(first))) { break; } } for (last = len - 1; last > first; last--) { if (!matches(sequence.charAt(last))) { break; } } return sequence.subSequence(first, last + 1).toString(); } /** * Returns a substring of the input character sequence that omits all matching BMP characters from * the beginning of the string. For example: * *
{@code
   * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")
   * }
* * ... returns {@code "catbab"}. */ public String trimLeadingFrom(CharSequence sequence) { int len = sequence.length(); for (int first = 0; first < len; first++) { if (!matches(sequence.charAt(first))) { return sequence.subSequence(first, len).toString(); } } return ""; } /** * Returns a substring of the input character sequence that omits all matching BMP characters from * the end of the string. For example: * *
{@code
   * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")
   * }
* * ... returns {@code "abacat"}. */ public String trimTrailingFrom(CharSequence sequence) { int len = sequence.length(); for (int last = len - 1; last >= 0; last--) { if (!matches(sequence.charAt(last))) { return sequence.subSequence(0, last + 1).toString(); } } return ""; } /** * Returns a string copy of the input character sequence, with each group of consecutive matching * BMP characters replaced by a single replacement character. For example: * *
{@code
   * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')
   * }
* * ... returns {@code "b-p-r"}. * *

The default implementation uses {@link #indexIn(CharSequence)} to find the first matching * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each * character. * * @param sequence the character sequence to replace matching groups of characters in * @param replacement the character to append to the result string in place of each group of * matching characters in {@code sequence} * @return the new string */ public String collapseFrom(CharSequence sequence, char replacement) { // This implementation avoids unnecessary allocation. int len = sequence.length(); for (int i = 0; i < len; i++) { char c = sequence.charAt(i); if (matches(c)) { if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { // a no-op replacement i++; } else { StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement); return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); } } } // no replacement needed return sequence.toString(); } /** * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that * groups of matching BMP characters at the start or end of the sequence are removed without * replacement. */ public String trimAndCollapseFrom(CharSequence sequence, char replacement) { // This implementation avoids unnecessary allocation. int len = sequence.length(); int first = 0; int last = len - 1; while (first < len && matches(sequence.charAt(first))) { first++; } while (last > first && matches(sequence.charAt(last))) { last--; } return (first == 0 && last == len - 1) ? collapseFrom(sequence, replacement) : finishCollapseFrom( sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false); } private String finishCollapseFrom( CharSequence sequence, int start, int end, char replacement, StringBuilder builder, boolean inMatchingGroup) { for (int i = start; i < end; i++) { char c = sequence.charAt(i); if (matches(c)) { if (!inMatchingGroup) { builder.append(replacement); inMatchingGroup = true; } } else { builder.append(c); inMatchingGroup = false; } } return builder.toString(); } /** * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} * instead. */ @Deprecated @Override public boolean apply(Character character) { return matches(character); } /** * Returns a string representation of this {@code CharMatcher}, such as {@code * CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. */ @Override public String toString() { return super.toString(); } /** * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" where * "12AB" is the four hexadecimal digits representing the 16-bit code unit. */ private static String showCharacter(char c) { String hex = "0123456789ABCDEF"; char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; for (int i = 0; i < 4; i++) { tmp[5 - i] = hex.charAt(c & 0xF); c = (char) (c >> 4); } return String.copyValueOf(tmp); } // Fast matchers /** A matcher for which precomputation will not yield any significant benefit. */ abstract static class FastMatcher extends CharMatcher { @Override public final CharMatcher precomputed() { return this; } @Override public CharMatcher negate() { return new NegatedFastMatcher(this); } } /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */ abstract static class NamedFastMatcher extends FastMatcher { private final String description; NamedFastMatcher(String description) { this.description = checkNotNull(description); } @Override public final String toString() { return description; } } /** Negation of a {@link FastMatcher}. */ private static class NegatedFastMatcher extends Negated { NegatedFastMatcher(CharMatcher original) { super(original); } @Override public final CharMatcher precomputed() { return this; } } /** Fast matcher using a {@link BitSet} table of matching characters. */ @GwtIncompatible // used only from other GwtIncompatible code private static final class BitSetMatcher extends NamedFastMatcher { private final BitSet table; private BitSetMatcher(BitSet table, String description) { super(description); if (table.length() + Long.SIZE < table.size()) { table = (BitSet) table.clone(); // If only we could actually call BitSet.trimToSize() ourselves... } this.table = table; } @Override public boolean matches(char c) { return table.get(c); } @Override void setBits(BitSet bitSet) { bitSet.or(table); } } // Static constant implementation classes /** Implementation of {@link #any()}. */ private static final class Any extends NamedFastMatcher { static final CharMatcher INSTANCE = new Any(); private Any() { super("CharMatcher.any()"); } @Override public boolean matches(char c) { return true; } @Override public int indexIn(CharSequence sequence) { return (sequence.length() == 0) ? -1 : 0; } @Override public int indexIn(CharSequence sequence, int start) { int length = sequence.length(); checkPositionIndex(start, length); return (start == length) ? -1 : start; } @Override public int lastIndexIn(CharSequence sequence) { return sequence.length() - 1; } @Override public boolean matchesAllOf(CharSequence sequence) { checkNotNull(sequence); return true; } @Override public boolean matchesNoneOf(CharSequence sequence) { return sequence.length() == 0; } @Override public String removeFrom(CharSequence sequence) { checkNotNull(sequence); return ""; } @Override public String replaceFrom(CharSequence sequence, char replacement) { char[] array = new char[sequence.length()]; Arrays.fill(array, replacement); return new String(array); } @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { StringBuilder result = new StringBuilder(sequence.length() * replacement.length()); for (int i = 0; i < sequence.length(); i++) { result.append(replacement); } return result.toString(); } @Override public String collapseFrom(CharSequence sequence, char replacement) { return (sequence.length() == 0) ? "" : String.valueOf(replacement); } @Override public String trimFrom(CharSequence sequence) { checkNotNull(sequence); return ""; } @Override public int countIn(CharSequence sequence) { return sequence.length(); } @Override public CharMatcher and(CharMatcher other) { return checkNotNull(other); } @Override public CharMatcher or(CharMatcher other) { checkNotNull(other); return this; } @Override public CharMatcher negate() { return none(); } } /** Implementation of {@link #none()}. */ private static final class None extends NamedFastMatcher { static final CharMatcher INSTANCE = new None(); private None() { super("CharMatcher.none()"); } @Override public boolean matches(char c) { return false; } @Override public int indexIn(CharSequence sequence) { checkNotNull(sequence); return -1; } @Override public int indexIn(CharSequence sequence, int start) { int length = sequence.length(); checkPositionIndex(start, length); return -1; } @Override public int lastIndexIn(CharSequence sequence) { checkNotNull(sequence); return -1; } @Override public boolean matchesAllOf(CharSequence sequence) { return sequence.length() == 0; } @Override public boolean matchesNoneOf(CharSequence sequence) { checkNotNull(sequence); return true; } @Override public String removeFrom(CharSequence sequence) { return sequence.toString(); } @Override public String replaceFrom(CharSequence sequence, char replacement) { return sequence.toString(); } @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { checkNotNull(replacement); return sequence.toString(); } @Override public String collapseFrom(CharSequence sequence, char replacement) { return sequence.toString(); } @Override public String trimFrom(CharSequence sequence) { return sequence.toString(); } @Override public String trimLeadingFrom(CharSequence sequence) { return sequence.toString(); } @Override public String trimTrailingFrom(CharSequence sequence) { return sequence.toString(); } @Override public int countIn(CharSequence sequence) { checkNotNull(sequence); return 0; } @Override public CharMatcher and(CharMatcher other) { checkNotNull(other); return this; } @Override public CharMatcher or(CharMatcher other) { return checkNotNull(other); } @Override public CharMatcher negate() { return any(); } } /** Implementation of {@link #whitespace()}. */ @VisibleForTesting static final class Whitespace extends NamedFastMatcher { // TABLE is a precomputed hashset of whitespace characters. MULTIPLIER serves as a hash function // whose key property is that it maps 25 characters into the 32-slot table without collision. // Basically this is an opportunistic fast implementation as opposed to "good code". For most // other use-cases, the reduction in readability isn't worth it. static final String TABLE = "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; static final int MULTIPLIER = 1682554634; static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1); static final CharMatcher INSTANCE = new Whitespace(); Whitespace() { super("CharMatcher.whitespace()"); } @Override public boolean matches(char c) { return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c; } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { for (int i = 0; i < TABLE.length(); i++) { table.set(TABLE.charAt(i)); } } } /** Implementation of {@link #breakingWhitespace()}. */ private static final class BreakingWhitespace extends CharMatcher { static final CharMatcher INSTANCE = new BreakingWhitespace(); @Override public boolean matches(char c) { switch (c) { case '\t': case '\n': case '\013': case '\f': case '\r': case ' ': case '\u0085': case '\u1680': case '\u2028': case '\u2029': case '\u205f': case '\u3000': return true; case '\u2007': return false; default: return c >= '\u2000' && c <= '\u200a'; } } @Override public String toString() { return "CharMatcher.breakingWhitespace()"; } } /** Implementation of {@link #ascii()}. */ private static final class Ascii extends NamedFastMatcher { static final CharMatcher INSTANCE = new Ascii(); Ascii() { super("CharMatcher.ascii()"); } @Override public boolean matches(char c) { return c <= '\u007f'; } } /** Implementation that matches characters that fall within multiple ranges. */ private static class RangesMatcher extends CharMatcher { private final String description; private final char[] rangeStarts; private final char[] rangeEnds; RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { this.description = description; this.rangeStarts = rangeStarts; this.rangeEnds = rangeEnds; checkArgument(rangeStarts.length == rangeEnds.length); for (int i = 0; i < rangeStarts.length; i++) { checkArgument(rangeStarts[i] <= rangeEnds[i]); if (i + 1 < rangeStarts.length) { checkArgument(rangeEnds[i] < rangeStarts[i + 1]); } } } @Override public boolean matches(char c) { int index = Arrays.binarySearch(rangeStarts, c); if (index >= 0) { return true; } else { index = ~index - 1; return index >= 0 && c <= rangeEnds[index]; } } @Override public String toString() { return description; } } /** Implementation of {@link #digit()}. */ private static final class Digit extends RangesMatcher { // Plug the following UnicodeSet pattern into // https://unicode.org/cldr/utility/list-unicodeset.jsp // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]] // and get the zeroes from there. // Must be in ascending order. private static final String ZEROES = "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6" + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0" + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10"; private static char[] zeroes() { return ZEROES.toCharArray(); } private static char[] nines() { char[] nines = new char[ZEROES.length()]; for (int i = 0; i < ZEROES.length(); i++) { nines[i] = (char) (ZEROES.charAt(i) + 9); } return nines; } static final CharMatcher INSTANCE = new Digit(); private Digit() { super("CharMatcher.digit()", zeroes(), nines()); } } /** Implementation of {@link #javaDigit()}. */ private static final class JavaDigit extends CharMatcher { static final CharMatcher INSTANCE = new JavaDigit(); @Override public boolean matches(char c) { return Character.isDigit(c); } @Override public String toString() { return "CharMatcher.javaDigit()"; } } /** Implementation of {@link #javaLetter()}. */ private static final class JavaLetter extends CharMatcher { static final CharMatcher INSTANCE = new JavaLetter(); @Override public boolean matches(char c) { return Character.isLetter(c); } @Override public String toString() { return "CharMatcher.javaLetter()"; } } /** Implementation of {@link #javaLetterOrDigit()}. */ private static final class JavaLetterOrDigit extends CharMatcher { static final CharMatcher INSTANCE = new JavaLetterOrDigit(); @Override public boolean matches(char c) { return Character.isLetterOrDigit(c); } @Override public String toString() { return "CharMatcher.javaLetterOrDigit()"; } } /** Implementation of {@link #javaUpperCase()}. */ private static final class JavaUpperCase extends CharMatcher { static final CharMatcher INSTANCE = new JavaUpperCase(); @Override public boolean matches(char c) { return Character.isUpperCase(c); } @Override public String toString() { return "CharMatcher.javaUpperCase()"; } } /** Implementation of {@link #javaLowerCase()}. */ private static final class JavaLowerCase extends CharMatcher { static final CharMatcher INSTANCE = new JavaLowerCase(); @Override public boolean matches(char c) { return Character.isLowerCase(c); } @Override public String toString() { return "CharMatcher.javaLowerCase()"; } } /** Implementation of {@link #javaIsoControl()}. */ private static final class JavaIsoControl extends NamedFastMatcher { static final CharMatcher INSTANCE = new JavaIsoControl(); private JavaIsoControl() { super("CharMatcher.javaIsoControl()"); } @Override public boolean matches(char c) { return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f'); } } /** Implementation of {@link #invisible()}. */ private static final class Invisible extends RangesMatcher { // Plug the following UnicodeSet pattern into // https://unicode.org/cldr/utility/list-unicodeset.jsp // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]] // with the "Abbreviate" option, and get the ranges from there. private static final String RANGE_STARTS = "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u0890\u08e2\u1680\u180e\u2000\u2028\u205f\u2066" + "\u3000\ud800\ufeff\ufff9"; private static final String RANGE_ENDS = // inclusive ends "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u0891\u08e2\u1680\u180e\u200f\u202f\u2064\u206f" + "\u3000\uf8ff\ufeff\ufffb"; static final CharMatcher INSTANCE = new Invisible(); private Invisible() { super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray()); } } /** Implementation of {@link #singleWidth()}. */ private static final class SingleWidth extends RangesMatcher { static final CharMatcher INSTANCE = new SingleWidth(); private SingleWidth() { super( "CharMatcher.singleWidth()", "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); } } // Non-static factory implementation classes /** Implementation of {@link #negate()}. */ private static class Negated extends CharMatcher { final CharMatcher original; Negated(CharMatcher original) { this.original = checkNotNull(original); } @Override public boolean matches(char c) { return !original.matches(c); } @Override public boolean matchesAllOf(CharSequence sequence) { return original.matchesNoneOf(sequence); } @Override public boolean matchesNoneOf(CharSequence sequence) { return original.matchesAllOf(sequence); } @Override public int countIn(CharSequence sequence) { return sequence.length() - original.countIn(sequence); } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { BitSet tmp = new BitSet(); original.setBits(tmp); tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); table.or(tmp); } @Override public CharMatcher negate() { return original; } @Override public String toString() { return original + ".negate()"; } } /** Implementation of {@link #and(CharMatcher)}. */ private static final class And extends CharMatcher { final CharMatcher first; final CharMatcher second; And(CharMatcher a, CharMatcher b) { first = checkNotNull(a); second = checkNotNull(b); } @Override public boolean matches(char c) { return first.matches(c) && second.matches(c); } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { BitSet tmp1 = new BitSet(); first.setBits(tmp1); BitSet tmp2 = new BitSet(); second.setBits(tmp2); tmp1.and(tmp2); table.or(tmp1); } @Override public String toString() { return "CharMatcher.and(" + first + ", " + second + ")"; } } /** Implementation of {@link #or(CharMatcher)}. */ private static final class Or extends CharMatcher { final CharMatcher first; final CharMatcher second; Or(CharMatcher a, CharMatcher b) { first = checkNotNull(a); second = checkNotNull(b); } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { first.setBits(table); second.setBits(table); } @Override public boolean matches(char c) { return first.matches(c) || second.matches(c); } @Override public String toString() { return "CharMatcher.or(" + first + ", " + second + ")"; } } // Static factory implementations /** Implementation of {@link #is(char)}. */ private static final class Is extends FastMatcher { private final char match; Is(char match) { this.match = match; } @Override public boolean matches(char c) { return c == match; } @Override public String replaceFrom(CharSequence sequence, char replacement) { return sequence.toString().replace(match, replacement); } @Override public CharMatcher and(CharMatcher other) { return other.matches(match) ? this : none(); } @Override public CharMatcher or(CharMatcher other) { return other.matches(match) ? other : super.or(other); } @Override public CharMatcher negate() { return isNot(match); } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { table.set(match); } @Override public String toString() { return "CharMatcher.is('" + showCharacter(match) + "')"; } } /** Implementation of {@link #isNot(char)}. */ private static final class IsNot extends FastMatcher { private final char match; IsNot(char match) { this.match = match; } @Override public boolean matches(char c) { return c != match; } @Override public CharMatcher and(CharMatcher other) { return other.matches(match) ? super.and(other) : other; } @Override public CharMatcher or(CharMatcher other) { return other.matches(match) ? any() : this; } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { table.set(0, match); table.set(match + 1, Character.MAX_VALUE + 1); } @Override public CharMatcher negate() { return is(match); } @Override public String toString() { return "CharMatcher.isNot('" + showCharacter(match) + "')"; } } private static CharMatcher.IsEither isEither(char c1, char c2) { return new CharMatcher.IsEither(c1, c2); } /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */ private static final class IsEither extends FastMatcher { private final char match1; private final char match2; IsEither(char match1, char match2) { this.match1 = match1; this.match2 = match2; } @Override public boolean matches(char c) { return c == match1 || c == match2; } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { table.set(match1); table.set(match2); } @Override public String toString() { return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")"; } } /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. */ private static final class AnyOf extends CharMatcher { private final char[] chars; public AnyOf(CharSequence chars) { this.chars = chars.toString().toCharArray(); Arrays.sort(this.chars); } @Override public boolean matches(char c) { return Arrays.binarySearch(chars, c) >= 0; } @Override @GwtIncompatible // used only from other GwtIncompatible code void setBits(BitSet table) { for (char c : chars) { table.set(c); } } @Override public String toString() { StringBuilder description = new StringBuilder("CharMatcher.anyOf(\""); for (char c : chars) { description.append(showCharacter(c)); } description.append("\")"); return description.toString(); } } /** Implementation of {@link #inRange(char, char)}. */ private static final class InRange extends FastMatcher { private final char startInclusive; private final char endInclusive; InRange(char startInclusive, char endInclusive) { checkArgument(endInclusive >= startInclusive); this.startInclusive = startInclusive; this.endInclusive = endInclusive; } @Override public boolean matches(char c) { return startInclusive <= c && c <= endInclusive; } @GwtIncompatible // used only from other GwtIncompatible code @Override void setBits(BitSet table) { table.set(startInclusive, endInclusive + 1); } @Override public String toString() { return "CharMatcher.inRange('" + showCharacter(startInclusive) + "', '" + showCharacter(endInclusive) + "')"; } } /** Implementation of {@link #forPredicate(Predicate)}. */ private static final class ForPredicate extends CharMatcher { private final Predicate predicate; ForPredicate(Predicate predicate) { this.predicate = checkNotNull(predicate); } @Override public boolean matches(char c) { return predicate.apply(c); } @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily @Override public boolean apply(Character character) { return predicate.apply(checkNotNull(character)); } @Override public String toString() { return "CharMatcher.forPredicate(" + predicate + ")"; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy