// com.ibm.icu.text.SpoofChecker Maven / Gradle / Ivy
// Show all versions of icu4j Show documentation
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
***************************************************************************
* Copyright (C) 2008-2016 International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
*
* Unicode Spoof Detection
*/
package com.ibm.icu.text;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.MissingResourceException;
import java.util.Set;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUBinary.Authenticate;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacter.IdentifierType;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.ULocale;
/**
*
* This class, based on Unicode Technical Report #36 and
* Unicode Technical Standard #39, has two main functions:
*
*
* - Checking whether two strings are visually confusable with each other, such as "desparejado" and
* "ԁеѕрагејаԁо".
* - Checking whether an individual string is likely to be an attempt at confusing the reader (spoof
* detection), such as "pаypаl" spelled with Cyrillic 'а' characters.
*
*
*
* Although originally designed as a method for flagging suspicious identifier strings such as URLs,
* {@code SpoofChecker} has a number of other practical use cases, such as preventing attempts to evade bad-word
* content filters.
*
*
* <h2>Confusables</h2>
*
* <p>
* The following example shows how to use {@code SpoofChecker} to check for confusability between two strings:
*
*
*
* SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
* int result = sc.areConfusable("desparejado", "ԁеѕрагејаԁо");
* System.out.println(result != 0); // true
*
*
*
*
* {@code SpoofChecker} uses a builder paradigm: options are specified within the context of a lightweight
* {@link SpoofChecker.Builder} object, and upon calling {@link SpoofChecker.Builder#build}, expensive data loading
* operations are performed, and an immutable {@code SpoofChecker} is returned.
*
*
* The first line of the example creates a {@code SpoofChecker} object with confusable-checking enabled; the second
* line performs the confusability test. For best performance, the instance should be created once (e.g., upon
* application startup), and the more efficient {@link SpoofChecker#areConfusable} method can be used at runtime.
*
*
* If the paragraph direction used to display the strings is known, it should be passed to {@link SpoofChecker#areConfusable}:
*
*
*
* // These strings look identical when rendered in a left-to-right context.
* // They look distinct in a right-to-left context.
* String s1 = "A1\u05D0"; // A1א
* String s2 = "A\u05D01"; // Aא1
*
* SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
* int result = sc.areConfusable(Bidi.DIRECTION_LEFT_TO_RIGHT, s1, s2);
* System.out.println(result != 0); // true
*
*
*
*
* UTS 39 defines two strings to be confusable if they map to the same skeleton. A skeleton is a
* sequence of families of confusable characters, where each family has a single exemplar character.
* {@link SpoofChecker#getSkeleton} computes the skeleton for a particular string, so the following snippet is
* equivalent to the example above:
*
*
*
* SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
* boolean result = sc.getSkeleton("desparejado").equals(sc.getSkeleton("ԁеѕрагејаԁо"));
* System.out.println(result); // true
*
*
*
*
* If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling
* {@link SpoofChecker#areConfusable} many times in a loop, {@link SpoofChecker#getSkeleton} can be used instead, as
* shown below:
*
*
* // Setup:
* String[] DICTIONARY = new String[]{ "lorem", "ipsum" }; // example
* SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
* HashSet<String> skeletons = new HashSet<String>();
* for (String word : DICTIONARY) {
* skeletons.add(sc.getSkeleton(word));
* }
*
* // Live Check:
* boolean result = skeletons.contains(sc.getSkeleton("1orern"));
* System.out.println(result); // true
*
*
*
* Note: Since the Unicode confusables mapping table is frequently updated, confusable skeletons are not
* guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons
* at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons.
*
*
* <h2>Spoof Detection</h2>
*
* <p>
* The following snippet shows a minimal example of using {@code SpoofChecker} to perform spoof detection on a
* string:
*
*
* SpoofChecker sc = new SpoofChecker.Builder()
* .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION))
* .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE)
* .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE)
* .build();
* boolean result = sc.failsChecks("pаypаl"); // with Cyrillic 'а' characters
* System.out.println(result); // true
*
*
*
* As in the case for confusability checking, it is good practice to create one {@code SpoofChecker} instance at
* startup, and call the cheaper {@link SpoofChecker#failsChecks} online. In the second line, we specify the set of
* allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. In the
* third line, the CONFUSABLE checks are disabled. It is good practice to disable them if you won't be using the
* instance to perform confusability checking.
*
*
* To get more details on why a string failed the checks, use a {@link SpoofChecker.CheckResult}:
*
*
*
* SpoofChecker sc = new SpoofChecker.Builder()
* .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION))
* .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE)
* .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE)
* .build();
* SpoofChecker.CheckResult checkResult = new SpoofChecker.CheckResult();
* boolean result = sc.failsChecks("pаypаl", checkResult);
* System.out.println(checkResult.checks); // 16
*
*
*
*
* The return value is a bitmask of the checks that failed. In this case, there was one check that failed:
* {@link SpoofChecker#RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are:
*
*
* <ul>
* <li>{@code RESTRICTION_LEVEL}: flags strings that violate the Restriction Level test as specified in UTS
* 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li>
* <li>{@code INVISIBLE}: flags strings that contain invisible characters, such as zero-width spaces, or character
* sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li>
* <li>{@code CHAR_LIMIT}: flags strings that contain characters outside of a specified set of acceptable
* characters. See {@link SpoofChecker.Builder#setAllowedChars} and {@link SpoofChecker.Builder#setAllowedLocales}.</li>
* <li>{@code MIXED_NUMBERS}: flags strings that contain digits from multiple different numbering systems.</li>
* </ul>
*
*
*
* These checks can be enabled independently of each other. For example, if you were interested in checking for only the
* INVISIBLE and MIXED_NUMBERS conditions, you could do:
*
*
*
* SpoofChecker sc = new SpoofChecker.Builder()
* .setChecks(SpoofChecker.INVISIBLE | SpoofChecker.MIXED_NUMBERS)
* .build();
* boolean result = sc.failsChecks("৪8");
* System.out.println(result); // true
*
*
*
*
* Note: The Restriction Level is the most powerful of the checks. The full logic is documented in
* UTS 39, but the basic idea is that strings
* are restricted to contain characters from only a single script, except that most scripts are allowed to have
* Latin characters interspersed. Although the default restriction level is {@code HIGHLY_RESTRICTIVE}, it is
* recommended that users set their restriction level to {@code MODERATELY_RESTRICTIVE}, which allows Latin mixed
* with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
* the levels, see UTS 39 or {@link SpoofChecker.RestrictionLevel}. The Restriction Level test is aware of the set of
* allowed characters set in {@link SpoofChecker.Builder#setAllowedChars}. Note that characters which have script code
* COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
* scripts.
*
*
* <h2>Advanced bidirectional usage</h2>
*
* If the paragraph direction with which the identifiers will be displayed is not known, there are
* multiple options for confusable detection depending on the circumstances.
*
*
* In some circumstances, the only concern is confusion between identifiers displayed with the same
* paragraph direction.
*
*
* An example is the case where identifiers are usernames prefixed with the @ symbol.
* That symbol will appear to the left in a left-to-right context, and to the right in a
* right-to-left context, so that an identifier displayed in a left-to-right context can never be
* confused with an identifier displayed in a right-to-left context:
*
* -
* The usernames "A1א" (A one aleph) and "Aא1" (A aleph 1)
* would be considered confusable, since they both appear as @A1א in a left-to-right context, and the
* usernames "אA_1" (aleph A underscore one) and "א1_A" (aleph one underscore A) would be considered
* confusable, since they both appear as A_1א@ in a right-to-left context.
*
* -
* The username "Mark_" would not be considered confusable with the username "_Mark",
* even though the latter would appear as Mark_@ in a right-to-left context, and the
* former as @Mark_ in a left-to-right context.
*
*
*
* In that case, the caller should check for both LTR-confusability and RTL-confusability:
*
*
*
* boolean confusableInEitherDirection =
* sc.areConfusable(Bidi.DIRECTION_LEFT_TO_RIGHT, id1, id2) ||
* sc.areConfusable(Bidi.DIRECTION_RIGHT_TO_LEFT, id1, id2);
*
*
*
* If the bidiSkeleton is used, the LTR and RTL skeleta should be kept separately and compared, LTR
* with LTR and RTL with RTL.
*
*
* In cases where confusability between the visual appearances of an identifier displayed in a
* left-to-right context with another identifier displayed in a right-to-left context is a concern,
* the LTR skeleton of one can be compared with the RTL skeleton of the other. However, this
* very broad definition of confusability may have unexpected results; for instance, it treats the
* ASCII identifiers "Mark_" and "_Mark" as confusable.
*
*
* <h2>Additional Information</h2>
*
* <p>
* A {@code SpoofChecker} instance may be used repeatedly to perform checks on any number of identifiers.
*
*
* <b>Thread Safety:</b> The methods on {@code SpoofChecker} objects are thread safe. The test functions for
* checking a single identifier, or for testing whether two identifiers are potentially confusable, may be called
* concurrently from multiple threads using the same {@code SpoofChecker} instance.
*
* @stable ICU 4.6
*/
public class SpoofChecker {
/**
* Constants from UTS 39 for use in setRestrictionLevel.
*
* @stable ICU 53
*/
public enum RestrictionLevel {
    /**
     * ASCII-Only: every character of the string belongs to the identifier profile and lies within the
     * ASCII range.
     *
     * @stable ICU 53
     */
    ASCII,

    /**
     * Single Script: the string qualifies as ASCII-Only, or every character belongs to the identifier
     * profile and the string is single-script as defined in UTS 39 section 5.1.
     *
     * @stable ICU 53
     */
    SINGLE_SCRIPT_RESTRICTIVE,

    /**
     * Highly Restrictive: the string qualifies as Single Script, or every character belongs to the
     * identifier profile and the string is covered by one of these script combinations (UTS 39
     * section 5.1):
     * <ul>
     * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
     * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
     * <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li>
     * </ul>
     *
     * @stable ICU 53
     */
    HIGHLY_RESTRICTIVE,

    /**
     * Moderately Restrictive: the string qualifies as Highly Restrictive, or every character belongs
     * to the identifier profile and the string is covered by Latin plus any one other Recommended or
     * Aspirational script, excluding Cyrillic, Greek, and Cherokee.
     *
     * @stable ICU 53
     */
    MODERATELY_RESTRICTIVE,

    /**
     * Minimally Restrictive: every character belongs to the identifier profile, but scripts may be
     * mixed arbitrarily, as in &Omega;mega, Te&chi;, H&lambda;LF-LIFE, Toys-&#x42F;-Us.
     *
     * @stable ICU 53
     */
    MINIMALLY_RESTRICTIVE,

    /**
     * Unrestricted: any valid identifier, even one using characters outside of the Identifier
     * Profile, such as I&hearts;NY.org
     *
     * @stable ICU 53
     */
    UNRESTRICTIVE
}
/**
* Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}.
*
* @stable ICU 58
*/
public static final UnicodeSet INCLUSION =
// Built from the IDENTIFIER_TYPE property with value INCLUSION, then frozen because this is a shared
// constant; the class documentation obtains a modifiable copy through cloneAsThawed().
new UnicodeSet().
applyIntPropertyValue(UProperty.IDENTIFIER_TYPE, IdentifierType.INCLUSION.ordinal()).
freeze();
/**
* Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}: the set of
* characters whose IDENTIFIER_TYPE property value is RECOMMENDED.
*
* @stable ICU 58
*/
public static final UnicodeSet RECOMMENDED =
// Same construction as INCLUSION, selecting the RECOMMENDED identifier type; frozen for the same reason.
new UnicodeSet().
applyIntPropertyValue(UProperty.IDENTIFIER_TYPE, IdentifierType.RECOMMENDED.ordinal()).
freeze();
/*
* Constants for the kinds of checks that SpoofChecker can perform. These values are used both to select the set of
* checks that will be performed, and to report results from the check function.
*
* The individual flags below are distinct single bits (1, 2, 4, ... 256) so they can be OR-ed into one int bitmask;
* CONFUSABLE, SINGLE_SCRIPT and ALL_CHECKS are combinations or aliases of them.
*/
/**
* When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
* that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
* 4.
*
* @stable ICU 4.6
*/
public static final int SINGLE_SCRIPT_CONFUSABLE = 1;
/**
* When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
* that the two strings are visually confusable and that they are not from the same script, according to UTS
* 39 section 4.
*
* @stable ICU 4.6
*/
public static final int MIXED_SCRIPT_CONFUSABLE = 2;
/**
* When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
* that the two strings are visually confusable and that they are not from the same script but both of them are
* single-script strings, according to UTS 39 section 4.
*
* @stable ICU 4.6
*/
public static final int WHOLE_SCRIPT_CONFUSABLE = 4;
/**
* Enable this flag in {@link SpoofChecker.Builder#setChecks} to turn on all types of confusables. You may set the
* checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to make
* {@link SpoofChecker#areConfusable} return only those types of confusables.
*
* @stable ICU 58
*/
public static final int CONFUSABLE = SINGLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | WHOLE_SCRIPT_CONFUSABLE;
/**
* This flag is deprecated and no longer affects the behavior of SpoofChecker.
*
* @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was
* deprecated.
*/
@Deprecated
public static final int ANY_CASE = 8;
/**
* Check that an identifier satisfies the requirements for the restriction level specified in
* {@link SpoofChecker.Builder#setRestrictionLevel}. The default restriction level is
* {@link RestrictionLevel#HIGHLY_RESTRICTIVE}.
*
* @stable ICU 58
*/
public static final int RESTRICTION_LEVEL = 16;
/**
* Check that an identifier contains only characters from a single script (plus chars from the common and inherited
* scripts.) Applies to checks of a single identifier only. This constant has the same value as
* {@link #RESTRICTION_LEVEL}.
*
* @deprecated ICU 51 Use RESTRICTION_LEVEL
*/
@Deprecated
public static final int SINGLE_SCRIPT = RESTRICTION_LEVEL;
/**
* Check an identifier for the presence of invisible characters, such as zero-width spaces, or character sequences
* that are likely not to display, such as multiple occurrences of the same non-spacing mark. This check does not
* test the input string as a whole for conformance to any particular syntax for identifiers.
*
* @stable ICU 4.6
*/
public static final int INVISIBLE = 32;
/**
* Check that an identifier contains only characters from a specified set of acceptable characters. See
* {@link Builder#setAllowedChars} and {@link Builder#setAllowedLocales}. Note that a string that fails this check
* will also fail the {@link #RESTRICTION_LEVEL} check.
*
* @stable ICU 4.6
*/
public static final int CHAR_LIMIT = 64;
/**
* Check that an identifier does not mix numbers from different numbering systems. For more information, see UTS 39
* section 5.3.
*
* @stable ICU 58
*/
public static final int MIXED_NUMBERS = 128;
/**
* Check that an identifier does not have a combining character following a character in which that
* combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
*
* More specifically, the following characters are forbidden from preceding a U+0307:
*
* - Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')
* - Latin lowercase letter 'l'
* - Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)
* - Any character whose confusable prototype ends with such a character
* (Soft_Dotted, 'l', 'ı', or 'ȷ')
*
* In addition, combining characters are allowed between the above characters and U+0307 except those
* with combining class 0 or combining class "Above" (230, same class as U+0307).
*
* This list and the number of combining characters considered by this check may grow over time.
*
* @stable ICU 62
*/
public static final int HIDDEN_OVERLAY = 256;
// Update CheckResult.toString() when a new check is added.
/**
* Enable all spoof checks. Every bit of the int is set, so the mask also covers bit positions that are not
* currently assigned to any check.
*
* @stable ICU 4.6
*/
public static final int ALL_CHECKS = 0xFFFFFFFF;
// Used for checking for ASCII-Only restriction level: the frozen set of code points U+0000..U+007F.
static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze();
/**
* private constructor: a SpoofChecker has to be built by the builder
*/
private SpoofChecker() {
// Intentionally empty: Builder.build() creates the instance and then assigns its fields directly.
}
/**
* SpoofChecker Builder. To create a SpoofChecker, first instantiate a SpoofChecker.Builder, set the desired
* checking options on the builder, then call the build() function to create a SpoofChecker instance.
*
* @stable ICU 4.6
*/
public static class Builder {
int fChecks; // Bit vector of checks to perform (OR of the SpoofChecker check constants).
// Compiled confusable data. May be null, in which case build() loads the default data.
SpoofData fSpoofData;
final UnicodeSet fAllowedCharsSet = new UnicodeSet(0, 0x10ffff); // The UnicodeSet of allowed characters
// for this Spoof Checker. Defaults to all chars (U+0000..U+10FFFF).
final Set fAllowedLocales = new LinkedHashSet<>(); // The list of allowed locales.
// Restriction level copied into the checker by build(); the no-arg constructor sets HIGHLY_RESTRICTIVE.
private RestrictionLevel fRestrictionLevel;
/**
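* Constructor: Create a default Unicode Spoof Checker Builder with every check bit enabled
* ({@link SpoofChecker#ALL_CHECKS}) and the {@link RestrictionLevel#HIGHLY_RESTRICTIVE} restriction level. Because
* the default allowed-character set contains all code points, {@link SpoofChecker#CHAR_LIMIT} has nothing to reject
* until {@link #setAllowedChars} or {@link #setAllowedLocales} is called. Note that additional checks may be added
* in the future, resulting in changes to the default checking behavior.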
*
* @stable ICU 4.6
*/
public Builder() {
    // Defaults: the HIGHLY_RESTRICTIVE level, every check bit switched on, and no spoof data yet
    // (build() loads the default data when it finds none).
    this.fRestrictionLevel = RestrictionLevel.HIGHLY_RESTRICTIVE;
    this.fSpoofData = null;
    this.fChecks = ALL_CHECKS;
}
/**
* Constructor: Create a Spoof Checker Builder, and set the configuration from an existing SpoofChecker.
*
* @param src
* The existing checker.
* @stable ICU 4.6
*/
public Builder(SpoofChecker src) {
    // Plain settings are taken over by value.
    this.fRestrictionLevel = src.fRestrictionLevel;
    this.fChecks = src.fChecks;

    // The allowed locales and characters are loaded into this builder's own collection objects.
    this.fAllowedLocales.addAll(src.fAllowedLocales);
    this.fAllowedCharsSet.set(src.fAllowedCharsSet);

    // The spoof data reference is shared with the source checker for now. Depending on what the
    // caller does with this builder, the data is either used as-is or the reference is dropped
    // in favour of newly generated data.
    this.fSpoofData = src.fSpoofData;
}
/**
* Create a SpoofChecker with current configuration.
*
* @return SpoofChecker
* @stable ICU 4.6
*/
public SpoofChecker build() {
    // TODO: Make this data loading be lazy (see #12696).
    if (this.fSpoofData == null) {
        // No data supplied through setData(): fall back to the default binary data.
        this.fSpoofData = SpoofData.getDefault();
    }

    // Everything handed to the new checker is copied or cloned, so that re-using this builder
    // afterwards cannot alter a checker that was already built.
    UnicodeSet allowedCharsCopy = (UnicodeSet) (this.fAllowedCharsSet.clone());
    allowedCharsCopy.freeze();

    SpoofChecker checker = new SpoofChecker();
    checker.fChecks = this.fChecks;
    checker.fRestrictionLevel = this.fRestrictionLevel;
    checker.fAllowedLocales = new HashSet<>(this.fAllowedLocales);
    checker.fAllowedCharsSet = allowedCharsCopy;
    // The one exception to the copy rule: the SpoofData is shared by reference. If the builder
    // later needs different data it creates a new SpoofData object instead of modifying this one.
    checker.fSpoofData = this.fSpoofData;
    return checker;
}
/**
* Specify the source form of the spoof data for this Spoof Checker. The input corresponds to the Unicode data file
* confusables.txt as described in Unicode UTS 39. The syntax of the source data is as described in UTS 39 for
* this file, and the content of this file is acceptable input.
*
* @param confusables
* the Reader of confusable characters definitions, as found in file confusables.txt from
* unicode.org.
* @return self
* @throws ParseException
* To report syntax errors in the input.
* @throws IOException
* If reading from {@code confusables} fails.
*
* @stable ICU 58
*/
public Builder setData(Reader confusables) throws ParseException, IOException {
    // Compile the binary data from the source (text) format into a brand-new SpoofData.
    // Any pre-existing data is never modified, because it may be in use by an already-built
    // checker.
    SpoofData compiled = new SpoofData();
    ConfusabledataBuilder.buildConfusableData(confusables, compiled);

    // Publish the new data only after compilation succeeded. If a ParseException or IOException
    // is thrown above, the builder keeps its previous data (or its "load the default data in
    // build()" state) instead of being left with a SpoofData that was never fully compiled.
    fSpoofData = compiled;
    return this;
}
/**
* Deprecated as of ICU 58; use {@link SpoofChecker.Builder#setData(Reader confusables)} instead.
*
* @param confusables
* the Reader of confusable characters definitions, as found in file confusables.txt from
* unicode.org.
* @param confusablesWholeScript
* No longer supported.
* @throws ParseException
* To report syntax errors in the input.
*
* @deprecated ICU 58
*/
@Deprecated
public Builder setData(Reader confusables, Reader confusablesWholeScript) throws ParseException, IOException {
// confusablesWholeScript is deliberately ignored; only the confusables reader is compiled.
setData(confusables);
return this;
}
/**
* Specify the bitmask of checks that will be performed by {@link SpoofChecker#failsChecks}. Calling this method
* overwrites any checks that may have already been enabled. By default, all checks are enabled.
*
* To enable specific checks and disable all others,
* OR together only the bit constants for the desired checks.
* For example, to fail strings containing characters outside of
* the set specified by {@link #setAllowedChars} and
* also strings that contain digits from mixed numbering systems:
*
*
* {@code
* builder.setChecks(SpoofChecker.CHAR_LIMIT | SpoofChecker.MIXED_NUMBERS);
* }
*
*
* To disable specific checks and enable all others,
* start with ALL_CHECKS and "AND away" the not-desired checks.
* For example, if you are not planning to use the {@link SpoofChecker#areConfusable} functionality,
* it is good practice to disable the CONFUSABLE check:
*
*
* {@code
* builder.setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CONFUSABLE);
* }
*
*
* Note that methods such as {@link #setAllowedChars}, {@link #setAllowedLocales}, and
* {@link #setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they
* enable onto the existing bitmask specified by this method. For more details, see the documentation of those
* methods.
*
* @param checks
* The set of checks that this spoof checker will perform. The value is an 'or' of the desired
* checks.
* @return self
* @stable ICU 4.6
*/
public Builder setChecks(int checks) {
    // Reject any requested bit that lies outside the mask of known checks.
    // NOTE: ALL_CHECKS is currently 0xFFFFFFFF, so no int has bits outside the mask and this
    // guard cannot fire until ALL_CHECKS is narrowed.
    final int unknownBits = checks & ~SpoofChecker.ALL_CHECKS;
    if (unknownBits != 0) {
        throw new IllegalArgumentException("Bad Spoof Checks value.");
    }
    // Replaces (does not merge with) whatever checks were enabled before.
    fChecks = checks & SpoofChecker.ALL_CHECKS;
    return this;
}
/**
* Limit characters that are acceptable in identifiers being checked to those normally used with the languages
* associated with the specified locales. Any previously specified list of locales is replaced by the new
* settings.
*
* A set of languages is determined from the locale(s), and from those a set of acceptable Unicode scripts is
* determined. Characters from this set of scripts, along with characters from the "common" and "inherited"
* Unicode Script categories will be permitted.
*
* Supplying an empty set removes all restrictions; characters from any script will be allowed.
*
* The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker when calling this function with a
* non-empty list of locales.
*
* The Unicode Set of characters that will be allowed is accessible via the {@link #getAllowedChars} function.
* setAllowedLocales() will replace any previously applied set of allowed characters.
*
* Adjustments, such as additions or deletions of certain classes of characters, can be made to the result of
* {@link #setAllowedChars} by fetching the resulting set with {@link #getAllowedChars}, manipulating it with
* the Unicode Set API, then resetting the spoof detectors limits with {@link #setAllowedChars}.
*
* @param locales
* A Set of ULocales, from which the language and associated script are extracted. If the locales Set
* is null, no restrictions will be placed on the allowed characters.
*
* @return self
* @stable ICU 4.6
*/
public Builder setAllowedLocales(Set<ULocale> locales) {
    // A null set (documented as "no restrictions") and an empty set both lift the character
    // limit: every code point is allowed again and the CHAR_LIMIT check is switched off.
    // Deciding this before any state is touched means a null argument can no longer fail
    // half-way through and leave the allowed-character set empty.
    if (locales == null || locales.isEmpty()) {
        fAllowedLocales.clear();
        fAllowedCharsSet.clear();
        fAllowedCharsSet.add(0, 0x10ffff);
        fChecks &= ~CHAR_LIMIT;
        return this;
    }

    // Rebuild the allowed set from scratch: the union of the script characters of each locale ...
    fAllowedCharsSet.clear();
    for (ULocale locale : locales) {
        addScriptChars(locale, fAllowedCharsSet);
    }

    // ... plus all characters of the Common and Inherited scripts.
    UnicodeSet scriptSet = new UnicodeSet();
    scriptSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON);
    fAllowedCharsSet.addAll(scriptSet);
    scriptSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED);
    fAllowedCharsSet.addAll(scriptSet);

    // Store the updated state: replace the remembered locales and enable the check.
    fAllowedLocales.clear();
    fAllowedLocales.addAll(locales);
    fChecks |= CHAR_LIMIT;
    return this;
}
/**
* Limit characters that are acceptable in identifiers being checked to those normally used with the languages
* associated with the specified locales. Any previously specified list of locales is replaced by the new
* settings.
*
* @param locales
* A Set of Locales, from which the language and associated script are extracted. If the locales Set
* is null, no restrictions will be placed on the allowed characters.
*
* @return self
* @stable ICU 54
*/
public Builder setAllowedJavaLocales(Set<Locale> locales) {
    // Null is documented as "no restrictions"; hand it on as an empty set, which
    // setAllowedLocales treats as "remove all restrictions", instead of failing on size().
    if (locales == null) {
        return setAllowedLocales(Collections.<ULocale>emptySet());
    }

    // Convert each java.util.Locale to its ULocale counterpart and delegate.
    Set<ULocale> converted = new HashSet<>(locales.size());
    for (Locale javaLocale : locales) {
        converted.add(ULocale.forLocale(javaLocale));
    }
    return setAllowedLocales(converted);
}
// Add (union) to the UnicodeSet all of the characters for the scripts
// used for the specified locale. Part of the implementation of
// setAllowedLocales.
private void addScriptChars(ULocale locale, UnicodeSet allowedChars) {
    final int[] scriptCodes = UScript.getCode(locale);
    if (scriptCodes == null) {
        // Unknown script: nothing to add.
        // Maybe they asked for the script of "zxx", which refers to no linguistic content.
        // Maybe they asked for the script of a newer locale that we don't know in the older version of ICU.
        return;
    }
    // One scratch set is re-applied for each script code and unioned into the caller's set.
    UnicodeSet scriptChars = new UnicodeSet();
    for (int scriptCode : scriptCodes) {
        scriptChars.applyIntPropertyValue(UProperty.SCRIPT, scriptCode);
        allowedChars.addAll(scriptChars);
    }
}
/**
* Limit the acceptable characters to those specified by a Unicode Set. Any previously specified character limit
* is replaced by the new settings. This includes limits on characters that were set with the
* setAllowedLocales() function. Note that the RESTRICTED set is useful.
*
* The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker by this function.
*
* @param chars
* A Unicode Set containing the list of characters that are permitted. The incoming set is cloned by
* this function, so there are no restrictions on modifying or deleting the UnicodeSet after calling
* this function. Note that this clears the allowedLocales set.
* @return self
* @stable ICU 4.6
*/
public Builder setAllowedChars(UnicodeSet chars) {
    // Load the caller's characters into the builder's own set object.
    fAllowedCharsSet.set(chars);
    // An explicit character set replaces any locale-derived limit: enable the check and forget
    // the previously remembered locales.
    fChecks = fChecks | CHAR_LIMIT;
    fAllowedLocales.clear();
    return this;
}
/**
* Set the loosest restriction level allowed for strings. The default if this is not called is
* {@link RestrictionLevel#HIGHLY_RESTRICTIVE}. Calling this method enables the {@link #RESTRICTION_LEVEL} and
* {@link #MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
* to be performed by {@link SpoofChecker#failsChecks}, see {@link #setChecks}.
*
* @param restrictionLevel
* The loosest restriction level allowed.
* @return self
* @stable ICU 58
*/
public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) {
    // Choosing a level also switches on the two checks that depend on it; other enabled checks
    // are left untouched.
    final int enabledByThisCall = RESTRICTION_LEVEL | MIXED_NUMBERS;
    this.fChecks = this.fChecks | enabledByThisCall;
    this.fRestrictionLevel = restrictionLevel;
    return this;
}
/*
* *****************************************************************************
* Internal classes for compiling confusable data into its binary (runtime) form.
* *****************************************************************************
*/
// ---------------------------------------------------------------------
//
// buildConfusableData Compile the source confusable data, as defined by
// the Unicode data file confusables.txt, into the binary
// structures used by the confusable detector.
//
// The binary structures are described in uspoof_impl.h
//
// 1. parse the data, making a hash table mapping from a codepoint to a String.
//
// 2. Sort all of the strings encountered by length, since they will need to
// be stored in that order in the final string table.
// TODO: Sorting these strings by length is no longer needed since the removal of
// the string lengths table. This logic can be removed to save processing time
// when building confusables data.
//
// 3. Build a list of keys (UChar32s) from the mapping table. Sort the
// list because that will be the ordering of our runtime table.
//
// 4. Generate the run time string table. This is generated before the key & value
// table because we need the string indexes when building those tables.
//
// 5. Build the run-time key and value table. These are parallel tables, and
// are built at the same time
// class ConfusabledataBuilder
// An instance of this class exists while the confusable data is being built from source.
// It encapsulates the intermediate data structures that are used for building.
// It exports one static function, to do a confusable data build.
private static class ConfusabledataBuilder {
private Hashtable fTable;
private UnicodeSet fKeySet; // A set of all keys (UChar32s) that go into the
// four mapping tables.
// The compiled data is first assembled into the following four collections,
// then output to the builder's SpoofData object.
private StringBuffer fStringTable;
private ArrayList fKeyVec;
private ArrayList fValueVec;
private SPUStringPool stringPool;
private Pattern fParseLine;
private Pattern fParseHexNum;
private int fLineNum;
ConfusabledataBuilder() {
    // Every build starts from empty intermediate structures. The parse patterns are not set
    // here; build() compiles them.
    this.stringPool = new SPUStringPool();
    this.fValueVec = new ArrayList<>();
    this.fKeyVec = new ArrayList<>();
    this.fKeySet = new UnicodeSet();
    this.fTable = new Hashtable<>();
}
void build(Reader confusables, SpoofData dest) throws ParseException, java.io.IOException {
StringBuffer fInput = new StringBuffer();
// Convert the user input data from UTF-8 to char (UTF-16)
LineNumberReader lnr = new LineNumberReader(confusables);
do {
String line = lnr.readLine();
if (line == null) {
break;
}
fInput.append(line);
fInput.append('\n');
} while (true);
// Regular Expression to parse a line from Confusables.txt. The expression will match
// any line. What was matched is determined by examining which capture groups have a match.
// Capture Group 1: the source char
// Capture Group 2: the replacement chars
// Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated)
// Capture Group 7: A blank or comment only line.
// Capture Group 8: A syntactically invalid line. Anything that didn't match before.
// Example Line from the confusables.txt source file:
// "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
fParseLine = Pattern.compile("(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" + // Match the source char
"[ \\t]*([0-9A-Fa-f]+" + // Match the replacement char(s)
"(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" + // (continued)
"\\s*(?:(SL)|(SA)|(ML)|(MA))" + // Match the table type
"[ \\t]*(?:#.*?)?$" + // Match any trailing #comment
"|^([ \\t]*(?:#.*?)?)$" + // OR match empty lines or lines with only a #comment
"|^(.*?)$"); // OR match any line, which catches illegal lines.
// Regular expression for parsing a hex number out of a space-separated list of them.
// Capture group 1 gets the number, with spaces removed.
fParseHexNum = Pattern.compile("\\s*([0-9A-F]+)");
// Zap any Byte Order Mark at the start of input. Changing it to a space
// is benign given the syntax of the input.
if (fInput.charAt(0) == 0xfeff) {
fInput.setCharAt(0, (char) 0x20);
}
// Parse the input, one line per iteration of this loop.
Matcher matcher = fParseLine.matcher(fInput);
while (matcher.find()) {
fLineNum++;
if (matcher.start(7) >= 0) {
// this was a blank or comment line.
continue;
}
if (matcher.start(8) >= 0) {
// input file syntax error.
// status = U_PARSE_ERROR;
throw new ParseException(
"Confusables, line " + fLineNum + ": Unrecognized Line: " + matcher.group(8),
matcher.start(8));
}
// We have a good input line. Extract the key character and mapping
// string, and
// put them into the appropriate mapping table.
int keyChar = Integer.parseInt(matcher.group(1), 16);
if (keyChar > 0x10ffff) {
throw new ParseException(
"Confusables, line " + fLineNum + ": Bad code point: " + matcher.group(1),
matcher.start(1));
}
Matcher m = fParseHexNum.matcher(matcher.group(2));
StringBuilder mapString = new StringBuilder();
while (m.find()) {
int c = Integer.parseInt(m.group(1), 16);
if (c > 0x10ffff) {
throw new ParseException(
"Confusables, line " + fLineNum + ": Bad code point: " + Integer.toString(c, 16),
matcher.start(2));
}
mapString.appendCodePoint(c);
}
assert (mapString.length() >= 1);
// Put the map (value) string into the string pool
// This a little like a Java intern() - any duplicates will be
// eliminated.
SPUString smapString = stringPool.addString(mapString.toString());
// Add the char . string mapping to the table.
// For Unicode 8, the SL, SA and ML tables have been discontinued.
// All input data from confusables.txt is tagged MA.
fTable.put(keyChar, smapString);
fKeySet.add(keyChar);
}
// Input data is now all parsed and collected.
// Now create the run-time binary form of the data.
//
// This is done in two steps. First the data is assembled into vectors and strings,
// for ease of construction, then the contents of these collections are copied
// into the actual SpoofData object.
// Build up the string array, and record the index of each string therein
// in the (build time only) string pool.
// Strings of length one are not entered into the strings array.
// (Strings in the table are sorted by length)
stringPool.sort();
fStringTable = new StringBuffer();
int poolSize = stringPool.size();
int i;
for (i = 0; i < poolSize; i++) {
SPUString s = stringPool.getByIndex(i);
int strLen = s.fStr.length();
int strIndex = fStringTable.length();
if (strLen == 1) {
// strings of length one do not get an entry in the string table.
// Keep the single string character itself here, which is the same
// convention that is used in the final run-time string table index.
s.fCharOrStrTableIndex = s.fStr.charAt(0);
} else {
s.fCharOrStrTableIndex = strIndex;
fStringTable.append(s.fStr);
}
}
// Construct the compile-time Key and Value table.
//
// The keys in the Key table follow the format described in uspoof.h for the
// Cfu confusables data structure.
//
// Starting in ICU 58, each code point has exactly one entry in the data
// structure.
for (String keyCharStr : fKeySet) {
int keyChar = keyCharStr.codePointAt(0);
SPUString targetMapping = fTable.get(keyChar);
assert targetMapping != null;
// Throw a sane exception if trying to consume a long string. Otherwise,
// codePointAndLengthToKey will throw an assertion error.
if (targetMapping.fStr.length() > 256) {
throw new IllegalArgumentException("Confusable prototypes cannot be longer than 256 entries.");
}
int key = ConfusableDataUtils.codePointAndLengthToKey(keyChar, targetMapping.fStr.length());
int value = targetMapping.fCharOrStrTableIndex;
fKeyVec.add(key);
fValueVec.add(value);
}
// Put the assembled data into the destination SpoofData object.
// The Key Table
// While copying the keys to the output array,
// also sanity check that the keys are sorted.
int numKeys = fKeyVec.size();
dest.fCFUKeys = new int[numKeys];
int previousCodePoint = 0;
for (i = 0; i < numKeys; i++) {
int key = fKeyVec.get(i);
int codePoint = ConfusableDataUtils.keyToCodePoint(key);
// strictly greater because there can be only one entry per code point
assert codePoint > previousCodePoint;
dest.fCFUKeys[i] = key;
previousCodePoint = codePoint;
}
// The Value Table, parallels the key table
int numValues = fValueVec.size();
assert (numKeys == numValues);
dest.fCFUValues = new short[numValues];
i = 0;
for (int value : fValueVec) {
assert (value < 0xffff);
dest.fCFUValues[i++] = (short) value;
}
// The Strings Table.
dest.fCFUStrings = fStringTable.toString();
}
public static void buildConfusableData(Reader confusables, SpoofData dest)
throws java.io.IOException, ParseException {
ConfusabledataBuilder builder = new ConfusabledataBuilder();
builder.build(confusables, dest);
}
/*
* *****************************************************************************
* Internal classes for compiling confusable data into its binary (runtime) form.
* *****************************************************************************
*/
// SPUString
// Holds a string that is the result of one of the mappings defined
// by the confusable mapping data (confusables.txt from Unicode.org)
// Instances of SPUString exist during the compilation process only.
private static class SPUString {
String fStr; // The actual string.
int fCharOrStrTableIndex; // Index into the final runtime data for this string.
// (or, for length 1, the single string char itself,
// there being no string table entry for it.)
SPUString(String s) {
fStr = s;
fCharOrStrTableIndex = 0;
}
}
// Comparison function for ordering strings in the string pool.
// Compare by length first, then, within a group of the same length,
// by code point order.
private static class SPUStringComparator implements Comparator {
@Override
public int compare(SPUString sL, SPUString sR) {
int lenL = sL.fStr.length();
int lenR = sR.fStr.length();
if (lenL < lenR) {
return -1;
} else if (lenL > lenR) {
return 1;
} else {
return sL.fStr.compareTo(sR.fStr);
}
}
final static SPUStringComparator INSTANCE = new SPUStringComparator();
}
// String Pool A utility class for holding the strings that are the result of
// the spoof mappings. These strings will utimately end up in the
// run-time String Table.
// This is sort of like a sorted set of strings, except that ICU's anemic
// built-in collections don't support those, so it is implemented with a
// combination of a uhash and a Vector.
private static class SPUStringPool {
public SPUStringPool() {
fVec = new Vector<>();
fHash = new Hashtable<>();
}
public int size() {
return fVec.size();
}
// Get the n-th string in the collection.
public SPUString getByIndex(int index) {
SPUString retString = fVec.elementAt(index);
return retString;
}
// Add a string. Return the string from the table.
// If the input parameter string is already in the table, delete the
// input parameter and return the existing string.
public SPUString addString(String src) {
SPUString hashedString = fHash.get(src);
if (hashedString == null) {
hashedString = new SPUString(src);
fHash.put(src, hashedString);
fVec.addElement(hashedString);
}
return hashedString;
}
// Sort the contents; affects the ordering of getByIndex().
public void sort() {
Collections.sort(fVec, SPUStringComparator.INSTANCE);
}
private Vector fVec; // Elements are SPUString *
private Hashtable fHash; // Key: Value:
}
}
}
/**
 * Returns the restriction level this checker has been configured to test against.
 *
 * @return The configured restriction level
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public RestrictionLevel getRestrictionLevel() {
    return this.fRestrictionLevel;
}
/**
 * Returns the bit set of checks that this Spoof Checker has been configured to perform.
 *
 * @return The set of checks that this spoof checker will perform, as a bitwise OR of check constants.
 * @stable ICU 4.6
 */
public int getChecks() {
    return this.fChecks;
}
/**
 * Get a read-only set of locales for the scripts that are acceptable in strings to be checked. If no limitations on
 * scripts have been specified, an empty set will be returned.
 *
 * setAllowedChars() will reset the list of allowed locales to be empty.
 *
 * The returned set may not be identical to the originally specified set that is supplied to setAllowedLocales();
 * the information other than languages from the originally specified locales may be omitted.
 *
 * @return An unmodifiable view of the set of locales corresponding to the acceptable scripts.
 *
 * @stable ICU 4.6
 */
public Set<ULocale> getAllowedLocales() {
    // Wrap rather than copy: callers can read the set but cannot modify it.
    return Collections.unmodifiableSet(fAllowedLocales);
}
/**
 * Get a set of {@link java.util.Locale} instances for the scripts that are acceptable in strings to be checked. If
 * no limitations on scripts have been specified, an empty set will be returned.
 *
 * The returned set is a new collection built on each call by converting every allowed
 * {@link ULocale} with {@link ULocale#toLocale()}.
 *
 * @return A set of locales corresponding to the acceptable scripts.
 * @stable ICU 54
 */
public Set<Locale> getAllowedJavaLocales() {
    Set<Locale> locales = new HashSet<>(fAllowedLocales.size());
    for (ULocale uloc : fAllowedLocales) {
        locales.add(uloc.toLocale());
    }
    return locales;
}
/**
 * Returns the UnicodeSet of characters permitted in an identifier, as configured through the Set Allowed
 * Characters functions. Restrictions that come from other checks are not reflected in this set.
 *
 * The returned set will be frozen, meaning that it cannot be modified by the caller.
 *
 * @return A UnicodeSet containing the characters that are permitted by the CHAR_LIMIT test.
 * @stable ICU 4.6
 */
public UnicodeSet getAllowedChars() {
    return this.fAllowedCharsSet;
}
/**
 * A struct-like class to hold the results of a Spoof Check operation. Tells which check(s) have failed.
 *
 * @stable ICU 4.6
 */
public static class CheckResult {
    /**
     * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests
     * in question: RESTRICTION_LEVEL, CHAR_LIMIT, and so on.
     *
     * @stable ICU 4.6
     * @see Builder#setChecks
     */
    public int checks;
    /**
     * The index of the first string position that failed a check.
     *
     * @deprecated ICU 51. No longer supported. Always set to zero.
     */
    @Deprecated
    public int position;
    /**
     * The numerics found in the string, if MIXED_NUMBERS was set; otherwise null. The set will contain the zero
     * digit from each decimal number system found in the input string.
     *
     * @stable ICU 58
     */
    public UnicodeSet numerics;
    /**
     * The restriction level that the text meets, if RESTRICTION_LEVEL is set; otherwise null.
     *
     * @stable ICU 58
     */
    public RestrictionLevel restrictionLevel;

    /**
     * Default constructor
     *
     * @stable ICU 4.6
     */
    public CheckResult() {
        checks = 0;
        position = 0;
    }

    /**
     * Returns a human-readable summary of the failed checks, the numerics, the position and the
     * restriction level. Fields that are null (for example {@link #numerics} when MIXED_NUMBERS
     * was not requested) are rendered as "null".
     *
     * @stable ICU 4.6
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("checks:");
        if (checks == 0) {
            sb.append(" none");
        } else if (checks == ALL_CHECKS) {
            sb.append(" all");
        } else {
            if ((checks & SINGLE_SCRIPT_CONFUSABLE) != 0) {
                sb.append(" SINGLE_SCRIPT_CONFUSABLE");
            }
            if ((checks & MIXED_SCRIPT_CONFUSABLE) != 0) {
                sb.append(" MIXED_SCRIPT_CONFUSABLE");
            }
            if ((checks & WHOLE_SCRIPT_CONFUSABLE) != 0) {
                sb.append(" WHOLE_SCRIPT_CONFUSABLE");
            }
            if ((checks & ANY_CASE) != 0) {
                sb.append(" ANY_CASE");
            }
            if ((checks & RESTRICTION_LEVEL) != 0) {
                sb.append(" RESTRICTION_LEVEL");
            }
            if ((checks & INVISIBLE) != 0) {
                sb.append(" INVISIBLE");
            }
            if ((checks & CHAR_LIMIT) != 0) {
                sb.append(" CHAR_LIMIT");
            }
            if ((checks & MIXED_NUMBERS) != 0) {
                sb.append(" MIXED_NUMBERS");
            }
            // failsChecks() can report this flag too, so it must be printable.
            if ((checks & HIDDEN_OVERLAY) != 0) {
                sb.append(" HIDDEN_OVERLAY");
            }
        }
        // numerics stays null unless MIXED_NUMBERS was checked; do not dereference it blindly.
        sb.append(", numerics: ").append(numerics == null ? "null" : numerics.toPattern(false));
        sb.append(", position: ").append(position);
        sb.append(", restrictionLevel: ").append(restrictionLevel);
        return sb.toString();
    }
}
/**
 * Check the specified string for possible security issues. The text to be checked will typically be an identifier
 * of some sort. The set of checks to be performed was specified when building the SpoofChecker.
 *
 * @param text
 *            A String to be checked for possible security issues.
 * @param checkResult
 *            Output parameter, indicates which specific tests failed. May be null if the information is not wanted.
 * @return True if any issue is found with the input string.
 * @stable ICU 4.8
 */
public boolean failsChecks(String text, CheckResult checkResult) {
    int length = text.length();
    int result = 0;
    if (checkResult != null) {
        // Reset the output fields that are only filled in when their check is enabled.
        checkResult.position = 0;
        checkResult.numerics = null;
        checkResult.restrictionLevel = null;
    }

    if (0 != (this.fChecks & RESTRICTION_LEVEL)) {
        RestrictionLevel textRestrictionLevel = getRestrictionLevel(text);
        if (textRestrictionLevel.compareTo(fRestrictionLevel) > 0) {
            result |= RESTRICTION_LEVEL;
        }
        if (checkResult != null) {
            checkResult.restrictionLevel = textRestrictionLevel;
        }
    }

    if (0 != (this.fChecks & MIXED_NUMBERS)) {
        UnicodeSet numerics = new UnicodeSet();
        getNumerics(text, numerics);
        if (numerics.size() > 1) {
            result |= MIXED_NUMBERS;
        }
        if (checkResult != null) {
            checkResult.numerics = numerics;
        }
    }

    if (0 != (this.fChecks & HIDDEN_OVERLAY)) {
        int index = findHiddenOverlay(text);
        if (index != -1) {
            result |= HIDDEN_OVERLAY;
        }
    }

    if (0 != (this.fChecks & CHAR_LIMIT)) {
        int i;
        int c;
        for (i = 0; i < length;) {
            // U16_NEXT(text, i, length, c);
            c = Character.codePointAt(text, i);
            i = Character.offsetByCodePoints(text, i, 1);
            if (!this.fAllowedCharsSet.contains(c)) {
                result |= CHAR_LIMIT;
                break;
            }
        }
    }

    if (0 != (this.fChecks & INVISIBLE)) {
        // This check needs to be done on NFD input
        String nfdText = nfdNormalizer.normalize(text);
        // The scan below walks nfdText, so it must be bounded by the NFD length: normalization
        // can make the string longer or shorter than the original text.
        int nfdLength = nfdText.length();

        // scan for more than one occurrence of the same non-spacing mark
        // in a sequence of non-spacing marks.
        int i;
        int c;
        int firstNonspacingMark = 0;
        boolean haveMultipleMarks = false;
        UnicodeSet marksSeenSoFar = new UnicodeSet(); // Set of combining marks in a
                                                      // single combining sequence.
        for (i = 0; i < nfdLength;) {
            c = Character.codePointAt(nfdText, i);
            i = Character.offsetByCodePoints(nfdText, i, 1);
            if (Character.getType(c) != UCharacterCategory.NON_SPACING_MARK) {
                // End of a mark sequence (if any); start over.
                firstNonspacingMark = 0;
                if (haveMultipleMarks) {
                    marksSeenSoFar.clear();
                    haveMultipleMarks = false;
                }
                continue;
            }
            if (firstNonspacingMark == 0) {
                // First mark of a sequence: remember it, but only materialize the set
                // if a second mark follows.
                firstNonspacingMark = c;
                continue;
            }
            if (!haveMultipleMarks) {
                marksSeenSoFar.add(firstNonspacingMark);
                haveMultipleMarks = true;
            }
            if (marksSeenSoFar.contains(c)) {
                // report the error, and stop scanning.
                // No need to find more than the first failure.
                result |= INVISIBLE;
                break;
            }
            marksSeenSoFar.add(c);
        }
    }

    if (checkResult != null) {
        checkResult.checks = result;
    }
    return (0 != result);
}
/**
 * Check the specified string for possible security issues, without reporting which individual checks failed.
 * The text to be checked will typically be an identifier of some sort. The set of checks to be performed was
 * specified when building the SpoofChecker.
 *
 * @param text
 *            A String to be checked for possible security issues.
 * @return True if any issue is found with the input string.
 * @stable ICU 4.8
 */
public boolean failsChecks(String text) {
    // No detailed result is wanted, so no CheckResult is supplied.
    final CheckResult noDetails = null;
    return failsChecks(text, noDetails);
}
/**
 * Check whether two specified strings are visually confusable. The types of confusability to be tested - single
 * script, mixed script, or whole script - are determined by the check options set for the SpoofChecker.
 *
 * The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE MIXED_SCRIPT_CONFUSABLE
 * WHOLE_SCRIPT_CONFUSABLE At least one of these tests must be selected.
 *
 * ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. If identifiers are case
 * folded for comparison and display to the user, do not select the ANY_CASE option.
 *
 *
 * @param s1
 *            The first of the two strings to be compared for confusability.
 * @param s2
 *            The second of the two strings to be compared for confusability.
 * @return Non-zero if s1 and s2 are confusable. If not 0, the value will indicate the type(s) of confusability
 *         found, as defined by spoof check test constants.
 * @throws IllegalArgumentException if none of the confusable checks is enabled on this checker.
 * @stable ICU 4.6
 */
public int areConfusable(String s1, String s2) {
    // See section 4 of UTS #39 for the algorithm and for the definitions of single-,
    // whole- and mixed-script confusables. Only the confusable flags matter here.
    // TODO: is throwing really the right thing to do? It's probably an error on
    // the caller's part, but logically we would just return 0 (no error).
    boolean anyConfusableCheck = (this.fChecks & CONFUSABLE) != 0;
    if (!anyConfusableCheck) {
        throw new IllegalArgumentException("No confusable checks are enabled.");
    }

    // Strings with different skeletons are not confusable at all.
    String firstSkeleton = getSkeleton(s1);
    String secondSkeleton = getSkeleton(s2);
    if (!firstSkeleton.equals(secondSkeleton)) {
        return 0;
    }

    // The strings are confusable; classify the kind of confusability from the
    // resolved script sets of the two inputs (UTS 39 section 4).
    ScriptSet firstScripts = new ScriptSet();
    getResolvedScriptSet(s1, firstScripts);
    ScriptSet secondScripts = new ScriptSet();
    getResolvedScriptSet(s2, secondScripts);

    int flags;
    if (firstScripts.intersects(secondScripts)) {
        flags = SINGLE_SCRIPT_CONFUSABLE;
    } else if (firstScripts.isEmpty() || secondScripts.isEmpty()) {
        flags = MIXED_SCRIPT_CONFUSABLE;
    } else {
        flags = MIXED_SCRIPT_CONFUSABLE | WHOLE_SCRIPT_CONFUSABLE;
    }

    // Report only the classes of confusable the user asked for.
    return flags & fChecks;
}
/**
 * Check whether two specified strings are visually confusable when displayed in a paragraph with the given
 * direction. The types of confusability to be tested—single script, mixed script, or whole script—are determined
 * by the check options set for the SpoofChecker.
 *
 * The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE MIXED_SCRIPT_CONFUSABLE
 * WHOLE_SCRIPT_CONFUSABLE At least one of these tests must be selected.
 *
 * ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. If identifiers are case
 * folded for comparison and display to the user, do not select the ANY_CASE option.
 *
 *
 * @param direction The paragraph direction with which the identifiers are displayed.
 *            Must be either {@link Bidi#DIRECTION_LEFT_TO_RIGHT} or {@link Bidi#DIRECTION_RIGHT_TO_LEFT}.
 * @param s1
 *            The first of the two strings to be compared for confusability.
 * @param s2
 *            The second of the two strings to be compared for confusability.
 * @return Non-zero if s1 and s2 are confusable. If not 0, the value will indicate the type(s) of confusability
 *         found, as defined by spoof check test constants.
 * @throws IllegalArgumentException if none of the confusable checks is enabled on this checker.
 * @stable ICU 74
 */
public int areConfusable(int direction, CharSequence s1, CharSequence s2) {
    // See section 4 of UTS #39 for the algorithm and for the definitions of single-,
    // whole- and mixed-script confusables. Only the confusable flags matter here.
    // TODO: is throwing really the right thing to do? It's probably an error on
    // the caller's part, but logically we would just return 0 (no error).
    if (0 == (this.fChecks & CONFUSABLE)) {
        throw new IllegalArgumentException("No confusable checks are enabled.");
    }

    // Identical bidi skeletons are the precondition for any kind of confusability.
    String leftSkeleton = getBidiSkeleton(direction, s1);
    String rightSkeleton = getBidiSkeleton(direction, s2);
    boolean confusable = leftSkeleton.equals(rightSkeleton);
    if (!confusable) {
        return 0;
    }

    // Classify the confusability from the resolved script sets (UTS 39 section 4).
    ScriptSet leftScripts = new ScriptSet();
    getResolvedScriptSet(s1, leftScripts);
    ScriptSet rightScripts = new ScriptSet();
    getResolvedScriptSet(s2, rightScripts);

    int kinds = 0;
    if (!leftScripts.intersects(rightScripts)) {
        kinds |= MIXED_SCRIPT_CONFUSABLE;
        boolean bothResolved = !leftScripts.isEmpty() && !rightScripts.isEmpty();
        if (bothResolved) {
            kinds |= WHOLE_SCRIPT_CONFUSABLE;
        }
    } else {
        kinds |= SINGLE_SCRIPT_CONFUSABLE;
    }

    // Mask out the kinds the user did not ask for.
    return kinds & fChecks;
}
/**
 * Get the "bidiSkeleton" for an identifier string and a direction.
 * Skeletons are a transformation of the input string;
 * Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
 * they are RTL-confusable if their RTL bidiSkeletons are identical.
 * See Unicode Technical Standard #39 for additional information:
 * https://www.unicode.org/reports/tr39/#Confusable_Detection.
 *
 * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some
 * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons.
 *
 * Skeletons are computed using the algorithm and data described in UTS #39.
 *
 * @param direction The paragraph direction with which the string is displayed.
 *            Must be either {@link Bidi#DIRECTION_LEFT_TO_RIGHT} or {@link Bidi#DIRECTION_RIGHT_TO_LEFT}.
 * @param str The input string whose bidiSkeleton will be generated.
 * @return The output skeleton string.
 * @throws IllegalArgumentException if direction is neither of the two permitted values.
 *
 * @stable ICU 74
 */
public String getBidiSkeleton(int direction, CharSequence str) {
    boolean leftToRight = direction == Bidi.DIRECTION_LEFT_TO_RIGHT;
    boolean rightToLeft = direction == Bidi.DIRECTION_RIGHT_TO_LEFT;
    if (!(leftToRight || rightToLeft)) {
        throw new IllegalArgumentException("direction should be DIRECTION_LEFT_TO_RIGHT or DIRECTION_RIGHT_TO_LEFT");
    }
    // Reorder the text for display first, then take the ordinary skeleton of the result.
    Bidi paragraph = new Bidi(str.toString(), direction);
    String reordered = paragraph.writeReordered(Bidi.KEEP_BASE_COMBINING | Bidi.DO_MIRRORING);
    return getSkeleton(reordered);
}
/**
 * Get the "skeleton" for an identifier string. Skeletons are a transformation of the input string; Two strings are
 * confusable if their skeletons are identical. See Unicode UAX 39 for additional information.
 *
 * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some
 * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons.
 *
 * Skeletons are computed using the algorithm and data described in Unicode UAX 39.
 *
 * @param str
 *            The input string whose skeleton will be generated.
 * @return The output skeleton string.
 *
 * @stable ICU 58
 */
public String getSkeleton(CharSequence str) {
    // Work on the NFD form of the input.
    String normalized = nfdNormalizer.normalize(str);
    int limit = normalized.length();

    // Map every code point that is not default-ignorable through the confusables data.
    // The accumulated result is not necessarily normalized.
    StringBuilder mapped = new StringBuilder();
    int pos = 0;
    while (pos < limit) {
        int cp = Character.codePointAt(normalized, pos);
        pos += Character.charCount(cp);
        if (UCharacter.hasBinaryProperty(cp, UProperty.DEFAULT_IGNORABLE_CODE_POINT)) {
            continue;
        }
        this.fSpoofData.confusableLookup(cp, mapped);
    }

    // Re-normalize the mapped text to obtain the final skeleton.
    return nfdNormalizer.normalize(mapped.toString());
}
/**
 * Calls {@link SpoofChecker#getSkeleton(CharSequence id)}. Starting with ICU 55, the "type" parameter has been
 * ignored, and starting with ICU 58, this function has been deprecated.
 *
 * @param type
 *            No longer supported. Prior to ICU 55, was used to specify the mapping table SL, SA, ML, or MA.
 * @param id
 *            The input identifier whose skeleton will be generated.
 * @return The output skeleton string.
 *
 * @deprecated ICU 58
 */
@Deprecated
public String getSkeleton(int type, String id) {
    // 'type' is deliberately unused; delegate to the single-argument form.
    CharSequence identifier = id;
    return getSkeleton(identifier);
}
/**
 * Equality function. Return true if the two SpoofChecker objects incorporate the same confusable data and have
 * enabled the same set of checks. The allowed locales, the allowed characters and the restriction level
 * are compared as well.
 *
 * @param other
 *            the SpoofChecker being compared with.
 * @return true if the two SpoofCheckers are equal.
 * @stable ICU 4.6
 */
@Override
public boolean equals(Object other) {
    if (!(other instanceof SpoofChecker)) {
        return false;
    }
    SpoofChecker otherSC = (SpoofChecker) other;
    // Objects.equals treats "null vs. non-null" as unequal in both directions, which keeps
    // equals() symmetric even if one of the object-valued fields is unset.
    return fChecks == otherSC.fChecks
            && fRestrictionLevel == otherSC.fRestrictionLevel
            && java.util.Objects.equals(fSpoofData, otherSC.fSpoofData)
            && java.util.Objects.equals(fAllowedLocales, otherSC.fAllowedLocales)
            && java.util.Objects.equals(fAllowedCharsSet, otherSC.fAllowedCharsSet);
}
/**
 * Overrides {@link Object#hashCode()}. Combines the same fields that {@link #equals(Object)} compares;
 * an unset (null) field contributes 0 instead of causing a NullPointerException.
 * @stable ICU 4.6
 */
@Override
public int hashCode() {
    return fChecks
            ^ java.util.Objects.hashCode(fSpoofData)
            ^ java.util.Objects.hashCode(fAllowedLocales)
            ^ java.util.Objects.hashCode(fAllowedCharsSet)
            ^ (fRestrictionLevel == null ? 0 : fRestrictionLevel.ordinal());
}
/**
 * Computes the augmented script set for a code point, according to UTS 39 section 5.1.
 * The previous contents of {@code result} are discarded.
 */
private static void getAugmentedScriptSet(int codePoint, ScriptSet result) {
    // Start from the plain script extensions of the code point.
    result.clear();
    UScript.getScriptExtensions(codePoint, result);

    // Section 5.1 step 1: add the combined CJK writing systems implied by each script.
    if (result.get(UScript.HAN)) {
        result.set(UScript.HAN_WITH_BOPOMOFO);
        result.set(UScript.JAPANESE);
        result.set(UScript.KOREAN);
    }
    if (result.get(UScript.HIRAGANA)) {
        result.set(UScript.JAPANESE);
    }
    if (result.get(UScript.KATAKANA)) {
        result.set(UScript.JAPANESE);
    }
    if (result.get(UScript.HANGUL)) {
        result.set(UScript.KOREAN);
    }
    if (result.get(UScript.BOPOMOFO)) {
        result.set(UScript.HAN_WITH_BOPOMOFO);
    }

    // Section 5.1 step 2: Common and Inherited characters match every script.
    boolean matchesEverything = result.get(UScript.COMMON) || result.get(UScript.INHERITED);
    if (matchesEverything) {
        result.setAll();
    }
}
/**
 * Computes the resolved script set for a string, according to UTS 39 section 5.1.
 * Every character of the input takes part in the computation.
 */
private void getResolvedScriptSet(CharSequence input, ScriptSet result) {
    // CODE_LIMIT is the "exclude nothing" marker understood by getResolvedScriptSetWithout.
    final int excludeNothing = UScript.CODE_LIMIT;
    getResolvedScriptSetWithout(input, excludeNothing, result);
}
/**
 * Computes the resolved script set for a string, omitting characters having the specified script. If
 * UScript.CODE_LIMIT is passed as the second argument, all characters are included.
 */
private void getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result) {
    // Begin with "all scripts" and narrow it down character by character.
    result.setAll();
    boolean includeEverything = (script == UScript.CODE_LIMIT);
    ScriptSet augmented = new ScriptSet();
    int pos = 0;
    while (pos < input.length()) {
        int cp = Character.codePointAt(input, pos);
        pos += Character.charCount(cp);

        // Augmented script set of the current character.
        getAugmentedScriptSet(cp, augmented);

        // Characters carrying the excluded script do not constrain the result.
        if (includeEverything || !augmented.get(script)) {
            result.and(augmented);
        }
    }
}
/**
 * Computes the set of numerics for a string, according to UTS 39 section 5.3.
 * The previous contents of {@code result} are discarded.
 */
private void getNumerics(String input, UnicodeSet result) {
    result.clear();
    int pos = 0;
    while (pos < input.length()) {
        int cp = Character.codePointAt(input, pos);
        pos += Character.charCount(cp);
        if (UCharacter.getType(cp) != UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
            continue;
        }
        // Each decimal number system is represented by its zero digit.
        // Unicode guarantees that the zero is at codePoint - value.
        int zeroDigit = cp - UCharacter.getNumericValue(cp);
        result.add(zeroDigit);
    }
}
/**
 * Computes the restriction level of a string, according to UTS 39 section 5.2.
 */
private RestrictionLevel getRestrictionLevel(String input) {
    // Step 1: any character outside the allowed set makes the string unrestricted.
    if (!fAllowedCharsSet.containsAll(input)) {
        return RestrictionLevel.UNRESTRICTIVE;
    }

    // Step 2: pure ASCII.
    if (ASCII.containsAll(input)) {
        return RestrictionLevel.ASCII;
    }

    // Steps 3 and 4: a non-empty resolved script set means a single script.
    ScriptSet allScripts = new ScriptSet();
    getResolvedScriptSet(input, allScripts);
    if (!allScripts.isEmpty()) {
        return RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE;
    }

    // Step 5: resolve again, this time ignoring Latin characters.
    ScriptSet scriptsBesidesLatin = new ScriptSet();
    getResolvedScriptSetWithout(input, UScript.LATIN, scriptsBesidesLatin);

    // Step 6: Latin combined with one of the CJK writing systems.
    boolean latinPlusCjk = scriptsBesidesLatin.get(UScript.HAN_WITH_BOPOMOFO)
            || scriptsBesidesLatin.get(UScript.JAPANESE)
            || scriptsBesidesLatin.get(UScript.KOREAN);
    if (latinPlusCjk) {
        return RestrictionLevel.HIGHLY_RESTRICTIVE;
    }

    // Step 7: Latin plus one other script, provided it is not Cyrillic, Greek or Cherokee.
    boolean disqualified = scriptsBesidesLatin.isEmpty()
            || scriptsBesidesLatin.get(UScript.CYRILLIC)
            || scriptsBesidesLatin.get(UScript.GREEK)
            || scriptsBesidesLatin.get(UScript.CHEROKEE);
    if (!disqualified) {
        return RestrictionLevel.MODERATELY_RESTRICTIVE;
    }

    // Step 8: everything else.
    return RestrictionLevel.MINIMALLY_RESTRICTIVE;
}
int findHiddenOverlay(String input) {
boolean sawLeadCharacter = false;
StringBuilder sb = new StringBuilder();
for (int i=0; i fAllowedLocales; // The Set of allowed locales.
private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters.
private RestrictionLevel fRestrictionLevel;
private static Normalizer2 nfdNormalizer = Normalizer2.getNFDInstance();
// Confusable Mappings Data Structures, version 2.0
//
// This description and the corresponding implementation are to be kept
// in-sync with the copy in icu4c uspoof_impl.h.
//
// For the confusable data, we are essentially implementing a map,
// key: a code point
// value: a string. Most commonly one char in length, but can be more.
//
// The keys are stored as a sorted array of 32 bit ints.
// bits 0-23 a code point value
// bits 24-31 length of value string, in UChars (between 1 and 256 UChars).
// The key table is sorted in ascending code point order. (not on the
// 32 bit int value, the flag bits do not participate in the sorting.)
//
// Lookup is done by means of a binary search in the key table.
//
// The corresponding values are kept in a parallel array of 16 bit ints.
// If the value string is of length 1, it is literally in the value array.
// For longer strings, the value array contains an index into the strings
// table.
//
// String Table:
// The strings table contains all of the value strings (those of length two or greater)
// concatenated together into one long char (UTF-16) array.
//
// There is no nul character or other mark between adjacent strings.
//
//----------------------------------------------------------------------------
//
// Changes from format version 1 to format version 2:
// 1) Removal of the whole-script confusable data tables.
// 2) Removal of the SL/SA/ML/MA and multi-table flags in the key bitmask.
// 3) Expansion of string length value in the key bitmask from 2 bits to 8 bits.
// 4) Removal of the string lengths table since 8 bits is sufficient for the
// lengths of all entries in confusables.txt.
//
// Helpers for packing and unpacking entries of the confusables key table.
// A key holds a code point in bits 0-23 and (value string length - 1) in bits 24-31.
private static final class ConfusableDataUtils {
    public static final int FORMAT_VERSION = 2; // version for ICU 58

    // Extract the code point (bits 0-23) from a key.
    public static final int keyToCodePoint(int key) {
        return key & 0x00ffffff;
    }

    // Extract the value string length (1..256) from a key.
    public static final int keyToLength(int key) {
        // Unsigned shift: for lengths above 128 the top bit of the key is set, and a
        // signed shift would sign-extend and yield a negative length.
        return ((key & 0xff000000) >>> 24) + 1;
    }

    // Pack a code point and a value string length (1..256) into a key.
    public static final int codePointAndLengthToKey(int codePoint, int length) {
        assert (codePoint & 0x00ffffff) == codePoint;
        assert length >= 1 && length <= 256;
        return codePoint | ((length - 1) << 24);
    }
}
// -------------------------------------------------------------------------------------
//
// SpoofData
//
// This class corresponds to the ICU SpoofCheck data.
//
// The data can originate with the Binary ICU data that is generated in ICU4C,
// or it can originate from source rules that are compiled in ICU4J.
//
// This class does not include the set of checks to be performed, but only
// data that is serialized into the ICU binary data.
//
// Because Java cannot easily wrap binary data like ICU4C, the binary data is
// copied into Java structures that are convenient for use by the run time code.
//
// ---------------------------------------------------------------------------------------
private static class SpoofData {
    // Java data structures holding the confusable mappings.
    // fCFUKeys: sorted keys, see ConfusableDataUtils for the bit layout.
    int[] fCFUKeys;
    // fCFUValues: parallel to fCFUKeys; either the single mapped char or an
    // (unsigned 16 bit) index into fCFUStrings.
    short[] fCFUValues;
    // fCFUStrings: all value strings of length two or more, concatenated.
    String fCFUStrings;

    private static final int DATA_FORMAT = 0x43667520; // "Cfu "

    private static final class IsAcceptable implements Authenticate {
        /**
         * Accept only data whose major format version is the one this code can parse.
         * The previous expression OR-ed in "any minor byte is non-zero", which let
         * data of a different major format version through.
         */
        @Override
        public boolean isDataVersionAcceptable(byte version[]) {
            return version[0] == ConfusableDataUtils.FORMAT_VERSION;
        }
    }

    private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();

    private static final class DefaultData {
        private static SpoofData INSTANCE = null;
        private static IOException EXCEPTION = null;

        static {
            // Note: Although this is static, the Java runtime can delay execution of this block until
            // the data is actually requested via SpoofData.getDefault().
            try {
                INSTANCE = new SpoofData(ICUBinary.getRequiredData("confusables.cfu"));
            } catch (IOException e) {
                EXCEPTION = e;
            }
        }
    }

    /**
     * @return instance for Unicode standard data
     * @throws MissingResourceException
     *             if the default confusables data could not be loaded; the original
     *             IOException is attached as the cause.
     */
    public static SpoofData getDefault() {
        if (DefaultData.EXCEPTION != null) {
            MissingResourceException failure = new MissingResourceException(
                    "Could not load default confusables data: " + DefaultData.EXCEPTION.getMessage(),
                    "SpoofChecker", "");
            // MissingResourceException has no public cause constructor; attach it explicitly
            // so the underlying I/O failure is not lost.
            failure.initCause(DefaultData.EXCEPTION);
            throw failure;
        }
        return DefaultData.INSTANCE;
    }

    // SpoofChecker Data constructor for use from data builder.
    // Initializes a new, empty data area that will be populated later.
    private SpoofData() {
    }

    // Constructor for use when creating from prebuilt default data.
    // A ByteBuffer is what the ICU internal data loading functions provide.
    private SpoofData(ByteBuffer bytes) throws java.io.IOException {
        ICUBinary.readHeader(bytes, DATA_FORMAT, IS_ACCEPTABLE);
        // readData() seeks relative to this mark (the first byte after the ICU header).
        bytes.mark();
        readData(bytes);
    }

    /**
     * Two SpoofData objects are equal when their key tables, value tables and
     * string tables are all equal. Null tables are equal only to null tables.
     */
    @Override
    public boolean equals(Object other) {
        if (!(other instanceof SpoofData)) {
            return false;
        }
        SpoofData otherData = (SpoofData) other;
        if (!Arrays.equals(fCFUKeys, otherData.fCFUKeys)) {
            return false;
        }
        if (!Arrays.equals(fCFUValues, otherData.fCFUValues)) {
            return false;
        }
        // Null-safe and symmetric: a null string table only equals another null one.
        if (fCFUStrings == null) {
            return otherData.fCFUStrings == null;
        }
        return fCFUStrings.equals(otherData.fCFUStrings);
    }

    @Override
    public int hashCode() {
        // fCFUStrings is null for a freshly constructed, not yet populated instance.
        int stringsHash = (fCFUStrings == null) ? 0 : fCFUStrings.hashCode();
        return Arrays.hashCode(fCFUKeys)
                ^ Arrays.hashCode(fCFUValues)
                ^ stringsHash;
    }

    // Set the SpoofChecker data from pre-built binary data in a byte buffer.
    // The binary data format is as described for ICU4C spoof data.
    // The buffer's mark must have been set at the start of the spoof data
    // (the position of the magic number); all offsets are relative to it.
    private void readData(ByteBuffer bytes) throws java.io.IOException {
        int magic = bytes.getInt();
        if (magic != 0x3845fdef) {
            throw new IllegalArgumentException("Bad Spoof Check Data.");
        }
        // The next two header fields are not used, but must be read to advance the buffer.
        @SuppressWarnings("unused")
        int dataFormatVersion = bytes.getInt();
        @SuppressWarnings("unused")
        int dataLength = bytes.getInt();

        int CFUKeysOffset = bytes.getInt();
        int CFUKeysSize = bytes.getInt();

        int CFUValuesOffset = bytes.getInt();
        int CFUValuesSize = bytes.getInt();

        int CFUStringTableOffset = bytes.getInt();
        int CFUStringTableSize = bytes.getInt();

        // We have now read the file header, and obtained the position for each
        // of the data items. Now read each in turn, first seeking the
        // input stream to the position of the data item.
        bytes.reset();
        ICUBinary.skipBytes(bytes, CFUKeysOffset);
        fCFUKeys = ICUBinary.getInts(bytes, CFUKeysSize, 0);

        bytes.reset();
        ICUBinary.skipBytes(bytes, CFUValuesOffset);
        fCFUValues = ICUBinary.getShorts(bytes, CFUValuesSize, 0);

        bytes.reset();
        ICUBinary.skipBytes(bytes, CFUStringTableOffset);
        fCFUStrings = ICUBinary.getString(bytes, CFUStringTableSize, 0);
    }

    /**
     * Append the confusable skeleton transform for a single code point to a StringBuilder. The string to be
     * appended will be between 1 and 18 characters as of Unicode 9.
     *
     * This is the heart of the confusable skeleton generation implementation.
     *
     * @param inChar
     *            The code point to look up.
     * @param dest
     *            The StringBuilder to which the mapping (or inChar itself, if there is no
     *            mapping) is appended.
     */
    public void confusableLookup(int inChar, StringBuilder dest) {
        int hi = length();
        // An empty table has no mappings; every code point maps to itself.
        // (Without this guard the search below would index element 0 of an empty array.)
        if (hi == 0) {
            dest.appendCodePoint(inChar);
            return;
        }

        // Perform a binary search.
        // [lo, hi), i.e lo is inclusive, hi is exclusive.
        // The result after the loop will be in lo.
        int lo = 0;
        do {
            int mid = (lo + hi) >>> 1;
            int midCodePoint = codePointAt(mid);
            if (midCodePoint > inChar) {
                hi = mid;
            } else if (midCodePoint < inChar) {
                lo = mid;
            } else {
                // Found result. Break early.
                lo = mid;
                break;
            }
        } while (hi - lo > 1);

        // Did we find an entry? If not, the char maps to itself.
        if (codePointAt(lo) != inChar) {
            dest.appendCodePoint(inChar);
            return;
        }

        // Add the element to the string builder and return.
        appendValueTo(lo, dest);
    }

    /**
     * Return the number of confusable entries in this SpoofData.
     *
     * @return The number of entries.
     */
    public int length() {
        return fCFUKeys.length;
    }

    /**
     * Return the code point (key) at the specified index.
     *
     * @param index
     *            The index within the SpoofData.
     * @return The code point.
     */
    public int codePointAt(int index) {
        return ConfusableDataUtils.keyToCodePoint(fCFUKeys[index]);
    }

    /**
     * Append the confusable skeleton at the specified index to the StringBuilder dest.
     *
     * @param index
     *            The index within the SpoofData.
     * @param dest
     *            The StringBuilder to which to append the skeleton.
     */
    public void appendValueTo(int index, StringBuilder dest) {
        int stringLength = ConfusableDataUtils.keyToLength(fCFUKeys[index]);

        // Value is either a char (for strings of length 1) or
        // an index into the string table (for longer strings).
        // The stored 16 bit value is unsigned; mask it so that string table
        // offsets of 0x8000 and above do not become negative.
        int value = fCFUValues[index] & 0xffff;
        if (stringLength == 1) {
            dest.append((char) value);
        } else {
            dest.append(fCFUStrings, value, value + stringLength);
        }
    }
}
// -------------------------------------------------------------------------------
//
// ScriptSet - Script code bit sets.
// Extends Java BitSet with input/output support and a few helper methods.
// Note: The I/O is not currently being used, so it has been commented out. If
// it is needed again, the code can be restored.
//
// -------------------------------------------------------------------------------
static class ScriptSet extends BitSet {

    // Eclipse default value to quell warnings:
    private static final long serialVersionUID = 1L;

    // // The serialized version of this class can hold INT_CAPACITY * 32 scripts.
    // private static final int INT_CAPACITY = 6;
    // private static final long serialVersionUID = INT_CAPACITY;
    // static {
    // assert ScriptSet.INT_CAPACITY * Integer.SIZE <= UScript.CODE_LIMIT;
    // }
    //
    // public ScriptSet() {
    // }
    //
    // public ScriptSet(ByteBuffer bytes) throws java.io.IOException {
    // for (int i = 0; i < INT_CAPACITY; i++) {
    // int bits = bytes.getInt();
    // for (int j = 0; j < Integer.SIZE; j++) {
    // if ((bits & (1 << j)) != 0) {
    // set(i * Integer.SIZE + j);
    // }
    // }
    // }
    // }
    //
    // public void output(DataOutputStream os) throws java.io.IOException {
    // for (int i = 0; i < INT_CAPACITY; i++) {
    // int bits = 0;
    // for (int j = 0; j < Integer.SIZE; j++) {
    // if (get(i * Integer.SIZE + j)) {
    // bits |= (1 << j);
    // }
    // }
    // os.writeInt(bits);
    // }
    // }

    /**
     * Intersect this set with the single script code given: every bit in
     * [0, UScript.CODE_LIMIT) other than {@code script} is cleared, and the
     * bit for {@code script} itself is left as it was.
     *
     * @param script
     *            The script code to keep.
     */
    public void and(int script) {
        this.clear(0, script);
        this.clear(script + 1, UScript.CODE_LIMIT);
    }

    /** Set the bits for all script codes in [0, UScript.CODE_LIMIT). */
    public void setAll() {
        this.set(0, UScript.CODE_LIMIT);
    }

    /**
     * @return true if the number of set bits equals UScript.CODE_LIMIT.
     */
    public boolean isFull() {
        return cardinality() == UScript.CODE_LIMIT;
    }

    /**
     * Append a readable form of this set to sb: "{ - }" for an empty set,
     * "{ * }" for a full set, otherwise the short names of the contained
     * scripts separated by spaces, e.g. "{ Cyrl Latn }".
     *
     * @param sb
     *            The StringBuilder to append to.
     */
    public void appendStringTo(StringBuilder sb) {
        sb.append("{ ");
        if (isEmpty()) {
            sb.append("- ");
        } else if (isFull()) {
            sb.append("* ");
        } else {
            for (int script = 0; script < UScript.CODE_LIMIT; script++) {
                if (get(script)) {
                    sb.append(UScript.getShortName(script));
                    sb.append(" ");
                }
            }
        }
        sb.append("}");
    }

    /**
     * @return A debugging representation of the form "&lt;ScriptSet { ... }&gt;", with the
     *         contents formatted by {@link #appendStringTo(StringBuilder)}.
     */
    @Override
    public String toString() {
        // Previously only an empty string was appended, so toString() always returned ""
        // and hid the inherited BitSet representation.
        StringBuilder sb = new StringBuilder();
        sb.append("<ScriptSet ");
        appendStringTo(sb);
        sb.append(">");
        return sb.toString();
    }
}
}