de.unkrig.commons.lang.Characters Maven / Gradle / Ivy
/*
* de.unkrig.commons - A general-purpose Java class library
*
* Copyright (c) 2016, Arno Unkrig
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
* following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
* following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
* following disclaimer in the documentation and/or other materials provided with the distribution.
* 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package de.unkrig.commons.lang;
import java.lang.Character.UnicodeBlock;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import de.unkrig.commons.lang.OptionalMethods.MethodWrapper1;
import de.unkrig.commons.lang.protocol.NoException;
import de.unkrig.commons.lang.protocol.Predicate;
import de.unkrig.commons.nullanalysis.Nullable;
/**
* Extensions for the JRE's {@link Character} class.
*/
public final
class Characters {
private Characters() {}
private abstract static
class IntegerPredicate implements Predicate {
private final String toString;
IntegerPredicate(String toString) { this.toString = toString; }
@Override public String
toString() { return this.toString; }
}
/**
* Evaluates whether a given code point lies in the POSIX character class "lower" ({@code [a-z]}).
*
* @see The Open
* Group Base Specifications Issue 7, section 7.3.1: LC_CTYPE
*/
public static final Predicate
IS_POSIX_LOWER = Characters.rangePredicate("posixLower", 'a', 'z');
/**
* Evaluates whether a given code point lies in the POSIX character class "upper" ({@code [A-Z]}).
*
* @see The Open
* Group Base Specifications Issue 7, section 7.3.1: LC_CTYPE
*/
public static final Predicate
IS_POSIX_UPPER = Characters.rangePredicate("posixUpper", 'A', 'Z');
/**
* Evaluates whether a given code point is in the ASCII range (0-127).
*/
public static final Predicate
IS_POSIX_ASCII = Characters.rangePredicate("posixAscii", 0, 0x7f);
/**
* Evaluates whether a given code point lies in the POSIX character class "alpha" ({@code [A-Za-z]}).
*
* @see The Open
* Group Base Specifications Issue 7, section 7.3.1: LC_CTYPE
*/
public static final Predicate
IS_POSIX_ALPHA = new IntegerPredicate("posixAlpha") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
return (cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z');
}
};
/**
* Evaluates whether a given code point lies in the POSIX character class "digit" ({@code [0-9]}).
*
* @see The Open
* Group Base Specifications Issue 7, section 7.3.1: LC_CTYPE
*/
public static final Predicate
IS_POSIX_DIGIT = Characters.rangePredicate("posixDigit", '0', '9');
/**
* Evaluates whether a given code point lies in the POSIX character class "alnum" ({@code [A-Za-z0-9]}).
*
* @see The Open
* Group Base Specifications Issue 7, section 7.3.1: LC_CTYPE
*/
public static final Predicate
IS_POSIX_ALNUM = new IntegerPredicate("posixAlnum") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
return (cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z') || (cp >= '0' && cp <= '9');
}
};
/**
* Evaluates whether a given code point lies in the POSIX character class "punct" (one of ! " # $ % &
* ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~).
*
* @see The Open
* Group Base Specifications Issue 7, section 7.3.1: LC_CTYPE
*/
public static final Predicate
IS_POSIX_PUNCT = new IntegerPredicate("posixPunct") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
return (
(cp >= '!' && cp <= '/') // !"#$%&'()*+,-./ (33...47)
|| (cp >= ':' && cp <= '@') // :;<=>?@ (58...64)
|| (cp >= '[' && cp <= '`') // [\]^_` (91...96)
|| (cp >= '{' && cp <= '~') // {|}~ (123...126)
);
}
};
/**
* Evaluates whether a given code point lies in the POSIX character class "graph"; the union of classes "alpha",
* "digit", and "punct".
*
* @see The Open
* Group Base Specifications Issue 7, section 7.3.1: LC_CTYPE
*/
public static final Predicate
IS_POSIX_GRAPH = new IntegerPredicate("posixGraph") {
@Override public boolean
evaluate(Integer subject) {
return Characters.IS_POSIX_ALNUM.evaluate(subject) || Characters.IS_POSIX_PUNCT.evaluate(subject);
}
};
/**
* Evaluates whether a given code point lies in the POSIX character class "print"; the union of classes "alpha",
* "digit" and "punct", and the SPACE character.
*
* @see The Open
* Group Base Specifications Issue 7, section 7.3.1: LC_CTYPE
*/
public static final Predicate
IS_POSIX_PRINT = new IntegerPredicate("posixPrint") {
@Override public boolean
evaluate(Integer subject) {
return (
subject == ' '
|| Characters.IS_POSIX_ALNUM.evaluate(subject)
|| Characters.IS_POSIX_PUNCT.evaluate(subject)
);
}
};
/**
* Evaluates whether a given code point lies in the POSIX character class "blank"; which consists of the SPACE
* character and the TAB character.
*
* @see The Open
* Group Base Specifications Issue 7, section 7.3.1: LC_CTYPE
*/
public static final Predicate
IS_POSIX_BLANK = new IntegerPredicate("posixBlank") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
return cp == ' ' || cp == '\t';
}
};
/**
* Evaluates whether a given code point lies in the POSIX character class "cntrl" ({@code [\0-\x1f\x7f]}).
*
* @see The Open
* Group Base Specifications Issue 7, section 7.3.1: LC_CTYPE
*/
public static final Predicate
IS_POSIX_CNTRL = new IntegerPredicate("posixCntrl") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
return cp <= 0x1f || cp == 0x7f;
}
};
/**
* Evaluates whether a given code point lies in the POSIX character class "xdigit" ({@code [0-9a-fA-F]}).
*
* @see The Open
* Group Base Specifications Issue 7, section 7.3.1: LC_CTYPE
*/
public static final Predicate
IS_POSIX_XDIGIT = new IntegerPredicate("posixXdigit") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
return (cp >= '0' && cp <= '9') || (cp >= 'a' && cp <= 'f') || (cp >= 'A' && cp <= 'F');
}
};
/**
* Evaluates whether a given code point lies in the POSIX character class "space" (consisting of the tab, newline,
* vertical-tab, form-feed, carriage-return and space characters).
*
* @see The Open
* Group Base Specifications Issue 7, section 7.3.1: LC_CTYPE
*/
public static final Predicate
IS_POSIX_SPACE = new IntegerPredicate("posixSpace") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
return (
(cp <= 13 && cp >= 9) // 0x09=tab, 0x0a=newline, 0x0b=vertical-tab, 0x0c=form-feed, 0x0d=carriage-return
|| cp == ' ' // 0x20=space
);
}
};
/**
* @return A predicate-of-int that implements the given "simple java character type".
* @see Pattern
*/
@Nullable public static Predicate
javaCharacterClassFromName(String name) {
return (
"javaAlphabetic".equals(name) ? Characters.IS_UNICODE_ALPHA :
"javaIdeographic".equals(name) ? Characters.IS_UNICODE_IDEOGRAPHIC :
"javaLetter".equals(name) ? Characters.IS_UNICODE_LETTER :
"javaLowerCase".equals(name) ? Characters.IS_UNICODE_LOWER :
"javaUpperCase".equals(name) ? Characters.IS_UNICODE_UPPER :
"javaTitleCase".equals(name) ? Characters.IS_UNICODE_TITLE :
"javaWhitespace".equals(name) ? Characters.IS_WHITESPACE :
"javaMirrored".equals(name) ? Characters.IS_MIRRORED :
"javaDigit".equals(name) ? Characters.IS_UNICODE_DIGIT :
"javaLetterOrDigit".equals(name) ? Characters.IS_LETTER_OR_DIGIT :
"javaDefined".equals(name) ? Characters.IS_DEFINED :
"javaJavaIdentifierStart".equals(name) ? Characters.IS_JAVA_IDENTIFIER_START :
"javaJavaIdentifierPart".equals(name) ? Characters.IS_JAVA_IDENTIFIER_PART :
"javaIdentifierIgnorable".equals(name) ? Characters.IS_IDENTIFIER_IGNORABLE :
"javaUnicodeIdentifierStart".equals(name) ? Characters.IS_UNICODE_IDENTIFIER_START :
"javaUnicodeIdentifierPart".equals(name) ? Characters.IS_UNICODE_IDENTIFIER_PART :
"javaSpaceChar".equals(name) ? Characters.IS_SPACE_CHAR :
"javaISOControl".equals(name) ? Characters.IS_ISO_CONTROL :
null
);
}
/** A predicate for {@link Character#isISOControl(int)}. */
public static final Predicate
IS_ISO_CONTROL = new IntegerPredicate("javaISOControl") {
@Override public boolean evaluate(Integer subject) { return Character.isISOControl(subject); }
};
/** A predicate for {@link Character#isSpaceChar(int)}. */
public static final Predicate
IS_SPACE_CHAR = new IntegerPredicate("javaSpaceChar") {
@Override public boolean evaluate(Integer subject) { return Character.isSpaceChar(subject); }
};
/** A predicate for {@link Character#isDefined(int)}. */
public static final Predicate
IS_DEFINED = new IntegerPredicate("javaDefined") {
@Override public boolean evaluate(Integer subject) { return Character.isDefined(subject); }
};
/** A predicate for {@link Character#isJavaIdentifierStart(int)}. */
public static final Predicate
IS_JAVA_IDENTIFIER_START = new IntegerPredicate("isJavaIdentifierStart") {
@Override public boolean evaluate(Integer subject) { return Character.isJavaIdentifierStart(subject); }
};
/** A predicate for {@link Character#isJavaIdentifierPart(int)}. */
public static final Predicate
IS_JAVA_IDENTIFIER_PART = new IntegerPredicate("isJavaIdentifierPart") {
@Override public boolean evaluate(Integer subject) { return Character.isJavaIdentifierPart(subject); }
};
/** A predicate for {@link Character#isIdentifierIgnorable(int)}. */
public static final Predicate
IS_IDENTIFIER_IGNORABLE = new IntegerPredicate("isIdentifierIgnorable") {
@Override public boolean evaluate(Integer subject) { return Character.isIdentifierIgnorable(subject); }
};
/** A predicate for {@link Character#isUnicodeIdentifierStart(int)}. */
public static final Predicate
IS_UNICODE_IDENTIFIER_START = new IntegerPredicate("isUnicodeIdentifierStart") {
@Override public boolean evaluate(Integer subject) { return Character.isUnicodeIdentifierStart(subject); }
};
/** A predicate for {@link Character#isUnicodeIdentifierPart(int)}. */
public static final Predicate
IS_UNICODE_IDENTIFIER_PART = new IntegerPredicate("isUnicodeIdentifierPart") {
@Override public boolean evaluate(Integer subject) { return Character.isUnicodeIdentifierPart(subject); }
};
/** A predicate for {@link Character#isLetterOrDigit(int)}. */
public static final Predicate
IS_LETTER_OR_DIGIT = new IntegerPredicate("javaDefined") {
@Override public boolean evaluate(Integer subject) { return Character.isLetterOrDigit(subject); }
};
/** A predicate for {@link Character#isWhitespace(int)}. */
public static final Predicate
IS_WHITESPACE = new IntegerPredicate("javaWhitespace") {
@Override public boolean evaluate(Integer subject) { return Character.isWhitespace(subject); }
};
/** A predicate for {@link Characters#isHorizontalWhitespace(int)}. */
public static final Predicate
IS_HORIZONTAL_WHITESPACE = new IntegerPredicate("horizontalWhitespace") {
@Override public boolean evaluate(Integer subject) { return Characters.isHorizontalWhitespace(subject); }
};
/**
* @return Whether the given codePoint is a "horizontal whitespace character" (which matches pattern
* {@code "\h"})
*/
public static boolean
isHorizontalWhitespace(int codePoint) {
return (
" \t\u00A0\u1680\u180e\u202f\u205f\u3000".indexOf(codePoint) != -1
|| (codePoint >= '\u2000' && codePoint <= '\u200a')
);
}
/** A predicate for {@link Character#isMirrored(int)}. */
public static final Predicate
IS_MIRRORED = new IntegerPredicate("mirrored") {
@Override public boolean evaluate(Integer subject) { return Character.isMirrored(subject); }
};
/** A word character: [a-zA-Z_0-9] */
public static final Predicate
IS_WORD = new IntegerPredicate("posixWord") {
@Override public boolean evaluate(Integer subject) { return Characters.isWordCharacter(subject); }
};
/** A word character: [a-zA-Z_0-9] */
public static boolean
isWordCharacter(int codePoint) {
return (
(codePoint >= 'a' && codePoint <= 'z')
|| (codePoint >= 'A' && codePoint <= 'Z')
|| (codePoint >= '0' && codePoint <= '9')
|| codePoint == '_'
);
}
// =============================== UNICODE CATEGORIES ===============================
// SUPPRESS CHECKSTYLE LineLength|JavadocVariable:31
public static final Predicate IS_UNICODE_UNASSIGNED = Characters.unicodeGeneralCategoryPredicate("unicodeUnassigned", Character.UNASSIGNED);
public static final Predicate IS_UNICODE_LETTER = new IntegerPredicate("unicodeLetter") { @Override public boolean evaluate(Integer subject) { return Character.isLetter(subject); } };
public static final Predicate IS_UNICODE_UPPER = new IntegerPredicate("unicodeUpper") { @Override public boolean evaluate(Integer subject) { return Character.isUpperCase(subject); } };
public static final Predicate IS_UNICODE_LOWER = new IntegerPredicate("unicodeLower") { @Override public boolean evaluate(Integer subject) { return Character.isLowerCase(subject); } };
public static final Predicate IS_UNICODE_TITLE = new IntegerPredicate("unicodeTitle") { @Override public boolean evaluate(Integer subject) { return Character.isTitleCase(subject); } };
public static final Predicate IS_UNICODE_MODIFIER_LETTER = Characters.unicodeGeneralCategoryPredicate("unicodeModifier", Character.MODIFIER_LETTER);
public static final Predicate IS_UNICODE_OTHER_LETTER = Characters.unicodeGeneralCategoryPredicate("unicodeOther", Character.OTHER_LETTER);
public static final Predicate IS_UNICODE_NON_SPACING_MARK = Characters.unicodeGeneralCategoryPredicate("unicodeNonSpacingNark", Character.NON_SPACING_MARK);
public static final Predicate IS_UNICODE_ENCLOSING_MARK = Characters.unicodeGeneralCategoryPredicate("unicodeEnclosingMark", Character.ENCLOSING_MARK);
public static final Predicate IS_UNICODE_COMBINING_SPACING_MARK = Characters.unicodeGeneralCategoryPredicate("unicodeCombiningSpacingMark", Character.COMBINING_SPACING_MARK);
public static final Predicate IS_UNICODE_DECIMAL_DIGIT_NUMBER = Characters.unicodeGeneralCategoryPredicate("unicodeDecimalDigitNumber", Character.DECIMAL_DIGIT_NUMBER);
public static final Predicate IS_UNICODE_LETTER_NUMBER = Characters.unicodeGeneralCategoryPredicate("unicodeLetterNumber", Character.LETTER_NUMBER);
public static final Predicate IS_UNICODE_OTHER_NUMBER = Characters.unicodeGeneralCategoryPredicate("unicodeOtherNumber", Character.OTHER_NUMBER);
public static final Predicate IS_UNICODE_SPACE_SEPARATOR = Characters.unicodeGeneralCategoryPredicate("unicodeSpaceSeparator", Character.SPACE_SEPARATOR);
public static final Predicate IS_UNICODE_LINE_SEPARATOR = Characters.unicodeGeneralCategoryPredicate("unicodeLineSeparator", Character.LINE_SEPARATOR);
public static final Predicate IS_UNICODE_PARAGRAPH_SEPARATOR = Characters.unicodeGeneralCategoryPredicate("unicodeParagraphSeparator", Character.PARAGRAPH_SEPARATOR);
public static final Predicate IS_UNICODE_CONTROL = Characters.unicodeGeneralCategoryPredicate("unicodeControl", Character.CONTROL);
public static final Predicate IS_UNICODE_FORMAT = Characters.unicodeGeneralCategoryPredicate("unicodeFormat", Character.FORMAT);
public static final Predicate IS_UNICODE_PRIVATE_USE = Characters.unicodeGeneralCategoryPredicate("unicodePrivateUse", Character.PRIVATE_USE);
public static final Predicate IS_UNICODE_SURROGATE = Characters.unicodeGeneralCategoryPredicate("unicodeSurrogate", Character.SURROGATE);
public static final Predicate IS_UNICODE_DASH_PUNCTUATION = Characters.unicodeGeneralCategoryPredicate("unicodeDashPunctuation", Character.DASH_PUNCTUATION);
public static final Predicate IS_UNICODE_START_PUNCTUATION = Characters.unicodeGeneralCategoryPredicate("unicodeStartPunctuation", Character.START_PUNCTUATION);
public static final Predicate IS_UNICODE_END_PUNCTUATION = Characters.unicodeGeneralCategoryPredicate("unicodeEndPunctuation", Character.END_PUNCTUATION);
public static final Predicate IS_UNICODE_CONNECTOR_PUNCTUATION = Characters.unicodeGeneralCategoryPredicate("unicodeConnectorPunctuation", Character.CONNECTOR_PUNCTUATION);
public static final Predicate IS_UNICODE_OTHER_PUNCTUATION = Characters.unicodeGeneralCategoryPredicate("unicodeOtherPunctuation", Character.OTHER_PUNCTUATION);
public static final Predicate IS_UNICODE_MATH_SYMBOL = Characters.unicodeGeneralCategoryPredicate("unicodeMATH_Symbol", Character.MATH_SYMBOL);
public static final Predicate IS_UNICODE_CURRENCY_SYMBOL = Characters.unicodeGeneralCategoryPredicate("unicodeCurrencySymbol", Character.CURRENCY_SYMBOL);
public static final Predicate IS_UNICODE_MODIFIER_SYMBOL = Characters.unicodeGeneralCategoryPredicate("unicodeModifierSymbol", Character.MODIFIER_SYMBOL);
public static final Predicate IS_UNICODE_OTHER_SYMBOL = Characters.unicodeGeneralCategoryPredicate("unicodeOtherSymbol", Character.OTHER_SYMBOL);
public static final Predicate IS_UNICODE_INITIAL_QUOTE_PUNCTUATION = Characters.unicodeGeneralCategoryPredicate("unicodeInitialQuotePunctuation", Character.INITIAL_QUOTE_PUNCTUATION);
public static final Predicate IS_UNICODE_FINAL_QUOTE_PUNCTUATION = Characters.unicodeGeneralCategoryPredicate("unicodeFinalQuotePunctuation", Character.FINAL_QUOTE_PUNCTUATION);
/**
* Wrapper for {@link Characters#isAlphabetic(Integer)}; {@link Predicate#evaluate(Object)} throws an {@link
* UnsupportedOperationException} iff the JRE is pre-1.7.
*/
public static final Predicate
IS_UNICODE_ALPHA = new IntegerPredicate("unicodeAlpha") {
@Override public boolean
evaluate(Integer subject) { return Characters.isAlphabetic(subject); }
};
/**
* Calls {@code java.lang.Character.isAlphabetic(int)}, which is available only sind JRE 1.7.
*
* @throws UnsupportedOperationException The JRE is pre-1.7
*/
@SuppressWarnings("null") public static boolean
isAlphabetic(Integer codePoint) {
return Characters.CHARACTER_IS_ALPHABETIC.invoke(null, codePoint);
}
private static final MethodWrapper1
CHARACTER_IS_ALPHABETIC = OptionalMethods.get1(
"IsAlphabetic only available in JRE 7+", // message
Character.class, // declaringClass
"isAlphabetic", // methodName
int.class, // parameterType
NoException.class // checkedException
);
/**
* A predicate that implements the "ideographic" Unicode binary property.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_IDEOGRAPHIC = new IntegerPredicate("unicodeIdeographic") {
@SuppressWarnings("null") @Override public boolean
evaluate(Integer subject) { return Characters.CHARACTER_IS_IDEOGRAPHIC.invoke(null, subject); }
};
private static final MethodWrapper1
CHARACTER_IS_IDEOGRAPHIC = OptionalMethods.get1(
"IsIdeographic only available in JRE 7+", // message
Character.class, // declaringClass
"isIdeographic", // methodName
int.class, // parameterType
NoException.class // checkedException
);
/**
* A predicate that implements the "White_space" Unicode binary property.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_WHITE_SPACE = new IntegerPredicate("unicodeWhiteSpace") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
int type = Character.getType(cp);
return (
type == Character.SPACE_SEPARATOR
|| type == Character.LINE_SEPARATOR
|| type == Character.PARAGRAPH_SEPARATOR
|| (cp >= 0x9 && cp <= 0xd)
|| (cp == 0x85)
);
}
@Override public String
toString() { return "unicodeWhiteSpace"; }
};
/**
* A predicate that implements the Unicode "control character" class.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_CNTRL = Characters.unicodeGeneralCategoryPredicate("unicodeCntrl", Character.CONTROL);
/**
* A predicate that implements the Unicode "punctuation character" class.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_PUNCT = new IntegerPredicate("unicodePunct") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
// See "UnicodeProp.PUNCTUATION"
int type = Character.getType(cp);
return (
type == Character.CONNECTOR_PUNCTUATION
|| type == Character.DASH_PUNCTUATION
|| type == Character.START_PUNCTUATION
|| type == Character.END_PUNCTUATION
|| type == Character.OTHER_PUNCTUATION
|| type == Character.INITIAL_QUOTE_PUNCTUATION
|| type == Character.FINAL_QUOTE_PUNCTUATION
);
}
};
/**
* A predicate that implements the Unicode "hexadecimal digit" class.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_HEX_DIGIT = new IntegerPredicate("unicodeHexDigit") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
// See "UnicodeProp.HEX_DIGIT"
return (
Character.isDigit(cp)
|| (cp >= '0' && cp <= '9')
|| (cp >= 'A' && cp <= 'F')
|| (cp >= 'a' && cp <= 'f')
|| (cp >= 0xFF10 && cp <= 0xFF19)
|| (cp >= 0xFF21 && cp <= 0xFF26)
|| (cp >= 0xFF41 && cp <= 0xFF46)
);
}
};
/**
* A predicate that implements the Unicode "Assigned" binary property.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_ASSIGNED = new IntegerPredicate("unicodeAssigned") {
@Override public boolean
evaluate(Integer subject) { return Character.getType(subject) != Character.UNASSIGNED; }
};
/**
* A predicate that implements the Unicode "Noncharacter_Code_Point" binary property.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_NONCHARACTER = new IntegerPredicate("unicodeNoncharacter") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
return (cp & 0xfffe) == 0xfffe || (cp >= 0xfdd0 && cp <= 0xfdef);
}
};
/**
* A predicate that implements the Unicode "Digit" binary property.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_DIGIT = new IntegerPredicate("unicodeDigit") {
@Override public boolean evaluate(Integer subject) { return Character.isDigit(subject); }
};
/**
* A predicate that implements the Unicode "alphanumeric character" character class.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_ALNUM = new IntegerPredicate("unicodeAlnum") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
return Characters.IS_UNICODE_ALPHA.evaluate(cp) || Characters.IS_UNICODE_DIGIT.evaluate(cp);
}
};
/**
* A predicate that implements the Unicode "space or tab" character class.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_BLANK = new IntegerPredicate("unicodeBlank") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
return Character.getType(cp) == Character.SPACE_SEPARATOR || cp == 0x9;
}
};
/**
* A predicate that implements the Unicode "Graph" character class.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_GRAPH = new IntegerPredicate("unicodeGraph") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
// See "UnicodeProp.GRAPH"
int type = Character.getType(cp);
return (
type != Character.SPACE_SEPARATOR
&& type != Character.LINE_SEPARATOR
&& type != Character.PARAGRAPH_SEPARATOR
&& type != Character.CONTROL
&& type != Character.SURROGATE
&& type != Character.UNASSIGNED
);
}
};
/**
* A predicate that implements the Unicode "Print" character class.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_PRINT = new IntegerPredicate("unicodePrint") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
// See "UnicodeProp.PRINT"
return (
(Characters.IS_UNICODE_GRAPH.evaluate(cp) || Characters.IS_UNICODE_BLANK.evaluate(cp))
&& !Characters.IS_UNICODE_CNTRL.evaluate(cp)
);
}
};
/**
* A predicate that implements the Unicode "word character" class.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_WORD = new IntegerPredicate("unicodeWord") {
@Override public boolean evaluate(Integer subject) { return Characters.isUnicodeWord(subject); }
};
/**
* A predicate that implements the Unicode "Join_Control" character class.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_JOIN_CONTROL = new IntegerPredicate("unicodeJoinControl") {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
return cp == 0x200C || cp == 0x200D;
}
};
/**
* A predicate that implements the union of the "non-spacing mark", "enclosing mark" and "combining spacing mark"
* general categories.
*/
public static final Predicate
IS_UNICODE_MARK = Characters.unicodeGeneralCategoryPredicate(
"unicodeMark",
Character.NON_SPACING_MARK,
Character.ENCLOSING_MARK,
Character.COMBINING_SPACING_MARK
);
/**
* A predicate that implements the union of the "decimal digit number", "letter number" and "other number" general
* categories.
*/
public static final Predicate
IS_UNICODE_NUMBER = Characters.unicodeGeneralCategoryPredicate(
"unicodeNumber",
Character.DECIMAL_DIGIT_NUMBER,
Character.LETTER_NUMBER,
Character.OTHER_NUMBER
);
/**
* A predicate that implements the union of the "space separator", "line separator" and "paragraph separator"
* general categories.
*/
public static final Predicate
IS_UNICODE_SEPARATOR = Characters.unicodeGeneralCategoryPredicate(
"unicodeSeparator",
Character.SPACE_SEPARATOR,
Character.LINE_SEPARATOR,
Character.PARAGRAPH_SEPARATOR
);
/**
* A predicate that implements the union of the "control", "format", "private use" and "surrogate" general
* categories.
*/
public static final Predicate
IS_UNICODE_SPECIAL = Characters.unicodeGeneralCategoryPredicate(
"unicodeSpecial",
Character.CONTROL,
Character.FORMAT,
Character.PRIVATE_USE,
Character.SURROGATE
);
/**
* A predicate that implements the union of the "math symbol", "currency symbol", "modifier symbol" and "other
* symbol" general categories.
*/
public static final Predicate
IS_UNICODE_SYMBOL = Characters.unicodeGeneralCategoryPredicate(
"unicodeSymbol",
Character.MATH_SYMBOL,
Character.CURRENCY_SYMBOL,
Character.MODIFIER_SYMBOL,
Character.OTHER_SYMBOL
);
/**
* A predicate that implements the union of the "uppercase letter", "lowercase letter" and "titlecase letter"
* binary Unicode property.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_UPPER_LOWER_TITLE = Characters.unicodeGeneralCategoryPredicate(
"unicodeUpperLowerTitle",
Character.UPPERCASE_LETTER,
Character.LOWERCASE_LETTER,
Character.TITLECASE_LETTER
);
/**
* A predicate that implements the union of the "uppercase letter", "lowercase letter", "titlecase letter",
* "modifier letter", "other letter" and "decimal digit number" Unicode character classes.
*
* @see Pattern
*/
public static final Predicate
IS_UNICODE_ALPHA2 = Characters.unicodeGeneralCategoryPredicate(
"unicodeAlpha2",
Character.UPPERCASE_LETTER,
Character.LOWERCASE_LETTER,
Character.TITLECASE_LETTER,
Character.MODIFIER_LETTER,
Character.OTHER_LETTER,
Character.DECIMAL_DIGIT_NUMBER
);
/**
* A predicate for the ISO Latin 1 character range (0...255).
*/
public static final Predicate
IS_UNICODE_LATIN1 = Characters.rangePredicate("unicodeAlpha2", 0, 0xff);
/**
* @return Whether the codePoint is a "word character" in the sense of the regular expression {@code
* "\w"}, with the {@link Pattern#UNICODE_CHARACTER_CLASS} flag set
*/
public static boolean
isUnicodeWord(int codePoint) {
int type = Character.getType(codePoint);
return (
(type >= 1 && type <= 10)
// || type == Character.UPPERCASE_LETTER // 1
// || type == Character.LOWERCASE_LETTER // 2
// || type == Character.TITLECASE_LETTER // 3
// || type == Character.MODIFIER_LETTER // 4
// || type == Character.OTHER_LETTER // 5
// || type == Character.NON_SPACING_MARK // 6
// || type == Character.ENCLOSING_MARK // 7
// || type == Character.COMBINING_SPACING_MARK // 8
// || type == Character.DECIMAL_DIGIT_NUMBER // 9
// || type == Character.LETTER_NUMBER // 10
|| (
type == Character.OTHER_SYMBOL // 28
&& codePoint >= 0x24b6
&& codePoint <= 0x24e9
)
|| type == Character.CONNECTOR_PUNCTUATION // 23
|| codePoint == 0x200C || codePoint == 0x200D // JOIN CONTROL
);
}
/**
* @return {@code null} iff the named category is unknown
*/
@Nullable public static Predicate
unicodeCategoryFromName(String name) {
return Characters.UNICODE_CATEGORIES.get(name.toUpperCase(Locale.US));
}
private static final Map> UNICODE_CATEGORIES;
static {
Map> m = new HashMap>();
m.put("CN", Characters.IS_UNICODE_UNASSIGNED);
m.put("LU", Characters.IS_UNICODE_UPPER);
m.put("LL", Characters.IS_UNICODE_LOWER);
m.put("LT", Characters.IS_UNICODE_TITLE);
m.put("LM", Characters.IS_UNICODE_MODIFIER_LETTER);
m.put("LO", Characters.IS_UNICODE_OTHER_LETTER);
m.put("MN", Characters.IS_UNICODE_NON_SPACING_MARK);
m.put("ME", Characters.IS_UNICODE_ENCLOSING_MARK);
m.put("MC", Characters.IS_UNICODE_COMBINING_SPACING_MARK);
m.put("ND", Characters.IS_UNICODE_DECIMAL_DIGIT_NUMBER);
m.put("NL", Characters.IS_UNICODE_LETTER_NUMBER);
m.put("NO", Characters.IS_UNICODE_OTHER_NUMBER);
m.put("ZS", Characters.IS_UNICODE_SPACE_SEPARATOR);
m.put("ZL", Characters.IS_UNICODE_LINE_SEPARATOR);
m.put("ZP", Characters.IS_UNICODE_PARAGRAPH_SEPARATOR);
m.put("CC", Characters.IS_UNICODE_CONTROL);
m.put("CF", Characters.IS_UNICODE_FORMAT);
m.put("CO", Characters.IS_UNICODE_PRIVATE_USE);
m.put("CS", Characters.IS_UNICODE_SURROGATE);
m.put("PD", Characters.IS_UNICODE_DASH_PUNCTUATION);
m.put("PS", Characters.IS_UNICODE_START_PUNCTUATION);
m.put("PE", Characters.IS_UNICODE_END_PUNCTUATION);
m.put("PC", Characters.IS_UNICODE_CONNECTOR_PUNCTUATION);
m.put("PO", Characters.IS_UNICODE_OTHER_PUNCTUATION);
m.put("SM", Characters.IS_UNICODE_MATH_SYMBOL);
m.put("SC", Characters.IS_UNICODE_CURRENCY_SYMBOL);
m.put("SK", Characters.IS_UNICODE_MODIFIER_SYMBOL);
m.put("SO", Characters.IS_UNICODE_OTHER_SYMBOL);
m.put("PI", Characters.IS_UNICODE_INITIAL_QUOTE_PUNCTUATION);
m.put("PF", Characters.IS_UNICODE_FINAL_QUOTE_PUNCTUATION);
m.put("L", Characters.IS_UNICODE_LETTER);
m.put("M", Characters.IS_UNICODE_MARK);
m.put("N", Characters.IS_UNICODE_NUMBER);
m.put("Z", Characters.IS_UNICODE_SEPARATOR);
m.put("C", Characters.IS_UNICODE_SPECIAL);
m.put("P", Characters.IS_UNICODE_PUNCT);
m.put("S", Characters.IS_UNICODE_SYMBOL);
m.put("LC", Characters.IS_UNICODE_UPPER_LOWER_TITLE);
m.put("LD", Characters.IS_UNICODE_ALPHA2);
m.put("L1", Characters.IS_UNICODE_LATIN1);
m.put("ALPHABETIC", Characters.IS_UNICODE_ALPHA);
m.put("LETTER", Characters.IS_UNICODE_LETTER);
m.put("IDEOGRAPHIC", Characters.IS_UNICODE_IDEOGRAPHIC);
m.put("LOWERCASE", Characters.IS_UNICODE_LOWER);
m.put("UPPERCASE", Characters.IS_UNICODE_UPPER);
m.put("TITLECASE", Characters.IS_UNICODE_TITLE);
m.put("WHITE_SPACE", Characters.IS_UNICODE_WHITE_SPACE);
m.put("CONTROL", Characters.IS_UNICODE_CNTRL);
m.put("PUNCTUATION", Characters.IS_UNICODE_PUNCT);
m.put("HEX_DIGIT", Characters.IS_UNICODE_HEX_DIGIT);
m.put("ASSIGNED", Characters.IS_UNICODE_ASSIGNED);
m.put("NONCHARACTER_CODE_POINT", Characters.IS_UNICODE_NONCHARACTER);
m.put("DIGIT", Characters.IS_UNICODE_DIGIT);
m.put("ALNUM", Characters.IS_UNICODE_ALNUM);
m.put("BLANK", Characters.IS_UNICODE_BLANK);
m.put("GRAPH", Characters.IS_UNICODE_GRAPH);
m.put("PRINT", Characters.IS_UNICODE_PRINT);
m.put("WORD", Characters.IS_UNICODE_WORD);
m.put("JOIN_CONTROL", Characters.IS_UNICODE_JOIN_CONTROL);
// Aliases.
m.put("WHITESPACE", m.get("WHITE_SPACE"));
m.put("HEXDIGIT", m.get("HEX_DIGIT"));
m.put("NONCHARACTERCODEPOINT", m.get("NONCHARACTER_CODE_POINT"));
m.put("JOINCONTROL", m.get("JOIN_CONTROL"));
UNICODE_CATEGORIES = Collections.unmodifiableMap(m);
}
/**
* A predicate that implements the named "Unicode binary property".
*
* @see Pattern
*/
@Nullable public static Predicate
unicodeBinaryPropertyFromName(String name) {
return Characters.UNICODE_PROPERTIES.get(name.toUpperCase(Locale.US));
}
private static final Map> UNICODE_PROPERTIES;
static {
Map> m = new HashMap>();
m.put("ALPHABETIC", Characters.IS_UNICODE_ALPHA);
m.put("IDEOGRAPHIC", Characters.IS_UNICODE_IDEOGRAPHIC);
m.put("LETTER", Characters.IS_UNICODE_LETTER);
m.put("LOWERCASE", Characters.IS_UNICODE_LOWER);
m.put("UPPERCASE", Characters.IS_UNICODE_UPPER);
m.put("TITLECASE", Characters.IS_UNICODE_TITLE);
m.put("PUNCTUATION", Characters.IS_UNICODE_PUNCT);
m.put("CONTROL", Characters.IS_UNICODE_CNTRL);
m.put("WHITE_SPACE", Characters.IS_UNICODE_WHITE_SPACE);
m.put("DIGIT", Characters.IS_UNICODE_DIGIT);
m.put("HEX_DIGIT", Characters.IS_UNICODE_HEX_DIGIT);
m.put("JOIN_CONTROL", Characters.IS_UNICODE_JOIN_CONTROL);
m.put("NONCHARACTER_CODE_POINT", Characters.IS_UNICODE_NONCHARACTER);
m.put("ASSIGNED", Characters.IS_UNICODE_ASSIGNED);
UNICODE_PROPERTIES = Collections.unmodifiableMap(m);
}
// @Nullable public static Predicate
// unicodePredefinedCharacterClassFromName(String name) {
// return Characters.UNICODE_PREDEFINIED_CHARACTER_CLASSES.get(name.toUpperCase(Locale.US));
// }
// private static final Map> UNICODE_PREDEFINIED_CHARACTER_CLASSES;
// static {
// Map> m = new HashMap>();
//
// m.put("ALNUM", Characters.IS_UNICODE_ALNUM);
// m.put("BLANK", Characters.IS_UNICODE_BLANK);
// m.put("GRAPH", Characters.IS_UNICODE_GRAPH);
// m.put("PRINT", Characters.IS_UNICODE_PRINT);
// m.put("WORD", Characters.IS_UNICODE_WORD);
//
// // Aliases.
// m.put("WHITESPACE", m.get("WHITE_SPACE"));
// m.put("HEXDIGIT", m.get("HEX_DIGIT"));
// m.put("NONCHARACTERCODEPOINT", m.get("NONCHARACTER_CODE_POINT"));
// m.put("JOINCONTROL", m.get("JOIN_CONTROL"));
//
// UNICODE_PREDEFINIED_CHARACTER_CLASSES = Collections.unmodifiableMap(m);
// }
/**
* @return A wrapper predicate around {@link Character#getType(int)}
*/
private static IntegerPredicate
unicodeGeneralCategoryPredicate(String toString, final byte generalCategory) {
return new IntegerPredicate(toString) {
@Override public boolean evaluate(Integer subject) { return Character.getType(subject) == generalCategory; }
};
}
/**
* @return A wrapper predicate around {@link Character#getType(int)}
*/
private static IntegerPredicate
unicodeGeneralCategoryPredicate(String toString, final byte gc1, final byte gc2, final byte... gc3) {
int mask = 1 << gc1 | 1 << gc2;
for (byte gc : gc3) mask |= 1 << gc;
final int finalMask = mask;
return new IntegerPredicate(toString) {
@Override public boolean
evaluate(Integer subject) { return (finalMask & 1 << Character.getType(subject)) != 0; }
};
}
private static IntegerPredicate
rangePredicate(String toString, final int minCp, final int maxCp) {
return new IntegerPredicate(toString) {
@Override public boolean
evaluate(Integer subject) {
int cp = subject;
return cp >= minCp && cp <= maxCp;
}
};
}
/**
* @return A predicate that implements the named "Unicode predefined character class"
* @see Pattern
*/
@Nullable public static Predicate
unicodePredefinedCharacterClassFromName(String name) {
return Characters.UNICODE_PREDEFINED_CHARACTER_CLASSES.get(name.toUpperCase(Locale.US));
}
private static final Map> UNICODE_PREDEFINED_CHARACTER_CLASSES;
static {
Map> m = new HashMap>();
m.put("LOWER", Characters.IS_UNICODE_LOWER);
m.put("UPPER", Characters.IS_UNICODE_UPPER);
m.put("ASCII", Characters.IS_POSIX_ASCII);
m.put("ALPHA", Characters.IS_UNICODE_ALPHA);
m.put("DIGIT", Characters.IS_UNICODE_DIGIT);
m.put("ALNUM", Characters.IS_UNICODE_ALNUM);
m.put("PUNCT", Characters.IS_UNICODE_PUNCT);
m.put("GRAPH", Characters.IS_UNICODE_GRAPH);
m.put("PRINT", Characters.IS_UNICODE_PRINT);
m.put("BLANK", Characters.IS_UNICODE_BLANK);
m.put("CNTRL", Characters.IS_UNICODE_CNTRL);
m.put("XDIGIT", Characters.IS_UNICODE_HEX_DIGIT);
m.put("SPACE", Characters.IS_UNICODE_WHITE_SPACE);
UNICODE_PREDEFINED_CHARACTER_CLASSES = Collections.unmodifiableMap(m);
}
/**
* @return A predicate that implements the named "POSIX character class"
* @see Pattern
*/
@Nullable public static Predicate
posixCharacterClassFromName(String name) {
return Characters.POSIX_CHARACTER_CLASSES.get(name.toUpperCase(Locale.US));
}
private static final Map> POSIX_CHARACTER_CLASSES;
static {
Map> m = new HashMap>();
m.put("LOWER", Characters.IS_POSIX_LOWER);
m.put("UPPER", Characters.IS_POSIX_UPPER);
m.put("ASCII", Characters.IS_POSIX_ASCII);
m.put("ALPHA", Characters.IS_POSIX_ALPHA);
m.put("DIGIT", Characters.IS_POSIX_DIGIT);
m.put("ALNUM", Characters.IS_POSIX_ALNUM);
m.put("PUNCT", Characters.IS_POSIX_PUNCT);
m.put("GRAPH", Characters.IS_POSIX_GRAPH);
m.put("PRINT", Characters.IS_POSIX_PRINT);
m.put("BLANK", Characters.IS_POSIX_BLANK);
m.put("CNTRL", Characters.IS_POSIX_CNTRL);
m.put("XDIGIT", Characters.IS_POSIX_XDIGIT);
m.put("SPACE", Characters.IS_POSIX_SPACE);
POSIX_CHARACTER_CLASSES = Collections.unmodifiableMap(m);
}
/**
* @return A predicate that implements the named "Unicode block"
* @see Pattern
*/
@Nullable public static Predicate
unicodeBlockFromName(String name) {
final UnicodeBlock block;
try {
block = Character.UnicodeBlock.forName(name);
} catch (IllegalArgumentException iae) {
return null;
}
return new Predicate() {
@Override public boolean evaluate(Integer subject) { return Character.UnicodeBlock.of(subject) == block; }
@Override public String toString() { return "inUnicodeBlock(" + block + ")"; }
};
}
private static final MethodWrapper1, Object, String, NoException>
UNICODE_SCRIPT_FOR_NAME = OptionalMethods.get1(
"Unicode scripts only available in JRE 7+", // message
null, // classLoader
"java.lang.Character$UnicodeScript", // declaringClassName
"forName", // methodName
String.class, // parameterType
null // checkedException
);
private static final MethodWrapper1, Object, Integer, NoException>
UNICODE_SCRIPT_OF = OptionalMethods.get1(
"Unicode scripts only available in JRE 7+", // message
null, // classLoader
"java.lang.Character$UnicodeScript", // declaringClassName
"of", // methodName
int.class, // parameterType
null // checkedException
);
private static final boolean
UNICODE_SCRIPT_AVAILABLE = (
Characters.UNICODE_SCRIPT_FOR_NAME.isAvailable()
&& Characters.UNICODE_SCRIPT_OF.isAvailable()
);
/**
* @return Whether this JRE supports Unicode scripts (because it is 1.7 or later)
*/
public static boolean
unicodeScriptAvailable() { return Characters.UNICODE_SCRIPT_AVAILABLE; }
/**
* @return A predicate that tests if a given code point is in the named script;
* {@code null} iff a Unicode script with the name is unknown
* @throws UnsupportedOperationException This JRE does not support Unicode scripts (because it is pre-1.7)
* @see #unicodeScriptAvailable()
*/
@Nullable public static Predicate
unicodeScriptPredicate(String name) {
final Object unicodeScript1;
try {
unicodeScript1 = Characters.UNICODE_SCRIPT_FOR_NAME.invoke(null, name);
} catch (IllegalArgumentException iae) {
// Script name is unknown.
return null;
}
return new Predicate() {
@Override public boolean
evaluate(Integer subject) {
Object unicodeScript2 = Characters.UNICODE_SCRIPT_OF.invoke(null, subject);
return unicodeScript1 == unicodeScript2;
}
@Override public String
toString() { return "unicodeScript(" + unicodeScript1 + ")"; }
};
}
/**
* @return The set of codepoints that are regarded as case-insensitively "equal", including the cp, e.g.
* { 'a', 'A' }, or {@code null} iff no other codepoints are case-insensitively equal with
* cp
*/
@Nullable public static String
caseInsensitivelyEqualCharacters(int cp) {
{
String s = Characters.SPECIAL_CASES.get(cp);
if (s != null) return s;
}
int lc = Character.toLowerCase(cp);
int uc = Character.toUpperCase(cp);
int tc = Character.toTitleCase(cp);
if (lc == uc) {
if (uc == tc) return null; // xxx
return new String(new int[] { lc, tc }, 0, 2); // xxy
} else
if (lc == tc) {
return new String(new int[] { lc, uc }, 0, 2); // xyx
} else
if (uc == tc) {
return new String(new int[] { lc, uc }, 0, 2); // xyy
} else
{
return new String(new int[] { lc, uc, tc }, 0, 3); // xyz
}
}
private static final Map SPECIAL_CASES;
static {
Map m = new HashMap();
for (String s : new String[] {
"I" + "i" + "\u0130" + "\u0131",
"K" + "k" + "\u212a",
"S" + "s" + "\u017f",
"µ" + "\u039c" + "\u03bc",
"Å" + "å" + "\u212b",
"\u0345" + "\u0399" + "\u03b9" + "\u1fbe",
"\u0392" + "\u03b2" + "\u03d0",
"\u0395" + "\u03b5" + "\u03f5",
"\u0398" + "\u03b8" + "\u03d1" + "\u03f4",
"\u039a" + "\u03ba" + "\u03f0",
"\u03a0" + "\u03c0" + "\u03d6",
"\u03a1" + "\u03c1" + "\u03f1",
"\u03a3" + "\u03c2" + "\u03c3",
"\u03a6" + "\u03c6" + "\u03d5",
"\u03a9" + "\u03c9" + "\u2126",
"\u1e60" + "\u1e61" + "\u1e9b",
}) {
for (int c : s.toCharArray()) m.put(c, s);
}
SPECIAL_CASES = m;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy