All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.impl.number.parse.UnicodeSetStaticCache Maven / Gradle / Ivy

There is a newer version: 2.12.15
Show newest version
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl.number.parse;

import java.util.EnumMap;
import java.util.Map;

import com.ibm.icu.text.UnicodeSet;

/**
 * This class statically initializes UnicodeSets useful for number parsing. Microbenchmarks show this to
 * bring a very sizeable performance boost.
 *
 * IMPORTANT ASSUMPTION: All of the sets contain code points (no strings) and they are all case-folded.
 * If this assumption were ever broken, logic in classes such as SymbolMatcher would need to be updated
 * in order to return well-formed sets upon calls to getLeadCodePoints().
 *
 * @author sffc
 */
public class UnicodeSetStaticCache {
    public static enum Key {
        // Ignorables
        BIDI,
        WHITESPACE,
        DEFAULT_IGNORABLES,
        STRICT_IGNORABLES,

        // Separators
        // Notes:
        // - COMMA is a superset of STRICT_COMMA
        // - PERIOD is a superset of SCRICT_PERIOD
        // - ALL_SEPARATORS is the union of COMMA, PERIOD, and OTHER_GROUPING_SEPARATORS
        // - STRICT_ALL_SEPARATORS is the union of STRICT_COMMA, STRICT_PERIOD, and OTHER_GRP_SEPARATORS
        COMMA,
        PERIOD,
        STRICT_COMMA,
        STRICT_PERIOD,
        OTHER_GROUPING_SEPARATORS,
        ALL_SEPARATORS,
        STRICT_ALL_SEPARATORS,

        // Symbols
        // TODO: NaN?
        MINUS_SIGN,
        PLUS_SIGN,
        PERCENT_SIGN,
        PERMILLE_SIGN,
        INFINITY,

        // Other
        DIGITS,
        NAN_LEAD,
        SCIENTIFIC_LEAD,
        CWCF, // TODO: Check if this is being used and remove it if not.

        // Combined Separators with Digits (for lead code points)
        DIGITS_OR_ALL_SEPARATORS,
        DIGITS_OR_STRICT_ALL_SEPARATORS,
    };

    private static final Map unicodeSets = new EnumMap(Key.class);

    public static UnicodeSet get(Key key) {
        return unicodeSets.get(key);
    }

    public static Key chooseFrom(String str, Key key1) {
        return get(key1).contains(str) ? key1 : null;
    }

    public static Key chooseFrom(String str, Key key1, Key key2) {
        return get(key1).contains(str) ? key1 : chooseFrom(str, key2);
    }

    public static Key chooseFrom(String str, Key key1, Key key2, Key key3) {
        return get(key1).contains(str) ? key1 : chooseFrom(str, key2, key3);
    }

    private static UnicodeSet computeUnion(Key k1, Key k2) {
        return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).freeze();
    }

    private static UnicodeSet computeUnion(Key k1, Key k2, Key k3) {
        return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).addAll(get(k3)).freeze();
    }

    static {
        // BiDi characters are skipped over and ignored at any point in the string, even in strict mode.
        unicodeSets.put(Key.BIDI, new UnicodeSet("[[\\u200E\\u200F\\u061C]]").freeze());

        // This set was decided after discussion with icu-design@. See ticket #13309.
        // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
        unicodeSets.put(Key.WHITESPACE, new UnicodeSet("[[:Zs:][\\u0009]]").freeze());

        unicodeSets.put(Key.DEFAULT_IGNORABLES, computeUnion(Key.BIDI, Key.WHITESPACE));
        unicodeSets.put(Key.STRICT_IGNORABLES, get(Key.BIDI));

        // TODO: Re-generate these sets from the UCD. They probably haven't been updated in a while.
        unicodeSets.put(Key.COMMA, new UnicodeSet("[,،٫、︐︑﹐﹑,、]").freeze());
        unicodeSets.put(Key.STRICT_COMMA, new UnicodeSet("[,٫︐﹐,]").freeze());
        unicodeSets.put(Key.PERIOD, new UnicodeSet("[.․。︒﹒.。]").freeze());
        unicodeSets.put(Key.STRICT_PERIOD, new UnicodeSet("[.․﹒.。]").freeze());
        unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS,
                new UnicodeSet("['٬‘’'\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]").freeze());
        unicodeSets.put(Key.ALL_SEPARATORS,
                computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
        unicodeSets.put(Key.STRICT_ALL_SEPARATORS,
                computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));

        unicodeSets.put(Key.MINUS_SIGN, new UnicodeSet("[-⁻₋−➖﹣-]").freeze());
        unicodeSets.put(Key.PLUS_SIGN, new UnicodeSet("[+⁺₊➕﬩﹢+]").freeze());

        // TODO: Fill in the next three sets.
        unicodeSets.put(Key.PERCENT_SIGN, new UnicodeSet("[%٪]").freeze());
        unicodeSets.put(Key.PERMILLE_SIGN, new UnicodeSet("[‰؉]").freeze());
        unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze());

        unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());
        unicodeSets.put(Key.NAN_LEAD,
                new UnicodeSet("[NnТтmeՈոс¤НнчTtsҳ\u975e\u1002\u0e9a\u10d0\u0f68\u0644\u0646]")
                        .freeze());
        unicodeSets.put(Key.SCIENTIFIC_LEAD, new UnicodeSet("[Ee×·е\u0627]").freeze());
        unicodeSets.put(Key.CWCF, new UnicodeSet("[:CWCF:]").freeze());

        unicodeSets.put(Key.DIGITS_OR_ALL_SEPARATORS, computeUnion(Key.DIGITS, Key.ALL_SEPARATORS));
        unicodeSets.put(Key.DIGITS_OR_STRICT_ALL_SEPARATORS,
                computeUnion(Key.DIGITS, Key.STRICT_ALL_SEPARATORS));
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy