Lib.unicodedata.py Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jython-standalone Show documentation
Jython is an implementation of the high-level, dynamic, object-oriented language Python written in 100% Pure Java, and seamlessly integrated with the Java platform. It thus allows you to run Python on any Java platform.
There is a newer version: 2.7.4
Show newest version
import java.lang.Character
try:
    # import from jarjar-ed version
    from org.python.icu.text import Normalizer
    from org.python.icu.lang import UCharacter, UProperty
    from org.python.icu.util import VersionInfo
    from org.python.icu.lang.UCharacter import EastAsianWidth, DecompositionType
    from org.python.icu.lang.UCharacterEnums import ECharacterCategory, ECharacterDirection
except ImportError:
    # development version of Jython, so use extlibs
    from com.ibm.icu.text import Normalizer
    from com.ibm.icu.lang import UCharacter, UProperty
    from com.ibm.icu.util import VersionInfo
    from com.ibm.icu.lang.UCharacter import EastAsianWidth, DecompositionType
    from com.ibm.icu.lang.UCharacterEnums import ECharacterCategory, ECharacterDirection


# Names exported by ``from unicodedata import *``.
__all__ = (
    "bidirectional",
    "category",
    "combining",
    "decimal",
    "decomposition",
    "digit",
    "east_asian_width",
    "lookup",
    "mirrored",
    "name",
    "normalize",
    "numeric",
    "unidata_version",
)


# Normalization form name -> the ICU Normalizer constant of the same name;
# consulted by normalize().
_forms = dict(
    (form_name, getattr(Normalizer, form_name))
    for form_name in ("NFC", "NFKC", "NFD", "NFKD"))

# Unique "no default supplied" marker.  None cannot play this role because
# callers may legitimately pass None as the default they want returned.
Nonesuch = object()


def _validate_unichr(unichr):
    """Raise TypeError unless *unichr* is a unicode string of length one.

    Returns None when the argument is acceptable.
    """
    if not isinstance(unichr, unicode):
        actual = type(unichr).__name__
        raise TypeError("must be unicode, not {}".format(actual))
    # Both the empty string and multi-character strings are rejected.
    if len(unichr) != 1:
        raise TypeError("need a single Unicode character as parameter")


def _get_codepoint(unichr):
    """Validate *unichr* and return its code point as an integer.

    Raises TypeError (via _validate_unichr) for anything other than a
    single-character unicode string.
    """
    _validate_unichr(unichr)
    codepoint = ord(unichr)
    return codepoint


def name(unichr, default=Nonesuch):
    """Return the name ICU reports for the character *unichr*.

    When ICU has no name for the character (it returns None), *default* is
    returned if one was supplied; otherwise ValueError is raised.
    """
    found = UCharacter.getName(_get_codepoint(unichr))
    if found is not None:
        return found
    if default is Nonesuch:
        raise ValueError("no such name")
    return default


def lookup(name):
    """Return the character whose Unicode name is *name*.

    Raises KeyError when ICU does not find a character with that name.
    """
    codepoint = UCharacter.getCharFromName(name)
    if codepoint == -1:
        # -1 is the "not found" result; quote the offending name on both
        # sides so the message reads correctly.
        raise KeyError("undefined character name '{}'".format(name))
    return unichr(codepoint)


def digit(unichr, default=Nonesuch):
    """Return the digit value ICU assigns to the character *unichr*.

    ICU signals "no digit value" with -1; in that case *default* is returned
    if one was supplied, otherwise ValueError is raised.
    """
    value = UCharacter.digit(_get_codepoint(unichr))
    if value != -1:
        return value
    if default is Nonesuch:
        raise ValueError("not a digit")
    return default


def decimal(unichr, default=Nonesuch):
    """Return the decimal value (0 through 9) of the character *unichr*.

    Any ICU numeric value outside 0..9 is treated as "not a decimal":
    *default* is returned if one was supplied, otherwise ValueError is
    raised.
    """
    value = UCharacter.getNumericValue(_get_codepoint(unichr))
    if 0 <= value <= 9:
        return value
    if default is Nonesuch:
        raise ValueError("not a decimal")
    return default


def numeric(unichr, default=Nonesuch):
    """Return the numeric value ICU assigns to the character *unichr*.

    When ICU reports UCharacter.NO_NUMERIC_VALUE, *default* is returned if
    one was supplied, otherwise ValueError is raised.
    """
    value = UCharacter.getUnicodeNumericValue(_get_codepoint(unichr))
    if value != UCharacter.NO_NUMERIC_VALUE:
        return value
    if default is Nonesuch:
        raise ValueError("not a numeric")
    return default


# ICU DecompositionType constant -> tag text that decomposition() places
# between angle brackets.  NONE maps to None, meaning "no tag".
# The tags use the spelling found in UnicodeData.txt, which is why the
# no-break tag is camel-cased as "noBreak".
_decomp = {
    DecompositionType.CANONICAL: "canonical",
    DecompositionType.CIRCLE: "circle",
    DecompositionType.COMPAT: "compat",
    DecompositionType.FINAL: "final",
    DecompositionType.FONT: "font",
    DecompositionType.FRACTION: "fraction",
    DecompositionType.INITIAL: "initial",
    DecompositionType.ISOLATED: "isolated",
    DecompositionType.MEDIAL: "medial",
    DecompositionType.NARROW: "narrow",
    DecompositionType.NOBREAK: "noBreak",
    DecompositionType.NONE: None,
    DecompositionType.SMALL: "small",
    DecompositionType.SQUARE: "square",
    DecompositionType.SUB: "sub",
    DecompositionType.SUPER: "super",
    DecompositionType.VERTICAL: "vertical",
    DecompositionType.WIDE: "wide",
}

def _get_decomp_type(unichr):
    """Return the decomposition tag for *unichr*, or None if it has none.

    The tag is looked up in _decomp from ICU's DECOMPOSITION_TYPE property.
    No validation of *unichr* is performed here.
    """
    # FRACTION SLASH is special-cased for CPython compatibility even though
    # this returns as not being combining, eg, see
    # http://www.fileformat.info/info/unicode/char/2044/index.htm
    if unichr == u"\u2044":
        return "fraction"
    type_code = UCharacter.getIntPropertyValue(ord(unichr), UProperty.DECOMPOSITION_TYPE)
    return _decomp[type_code]

def decomposition(unichr):
    """Return the decomposition mapping of *unichr* as a string.

    The result is the space-separated, four-digit (minimum) uppercase hex
    code points of ``Normalizer.decompose(unichr, True)``, prefixed with
    ``<tag> `` when a decomposition tag is found.  An empty string is
    returned when the decomposition is a single character with no tag.

    Raises TypeError if *unichr* is not a single unicode character.
    """
    _validate_unichr(unichr)
    expanded = Normalizer.decompose(unichr, True)

    if len(expanded) == 1:
        # Single-character result: the tag comes from the input itself.
        tag = _get_decomp_type(unichr)
    else:
        # Otherwise take the first tag found among the resulting characters.
        # NOTE(review): this inspects the decomposed characters rather than
        # the input character - confirm this is the intended source of the tag.
        tag = None
        for piece in expanded:
            tag = _get_decomp_type(piece)
            if tag is not None:
                break

    codepoints = " ".join(["{0:04X}".format(ord(piece)) for piece in expanded])
    if tag:
        return "<{}> {}".format(tag, codepoints)
    if len(expanded) == 1:
        return ""
    return codepoints


# Mapping the ICU4J enumerations for category, bidirection, and
# east_asian_width to the underlying property values that Python uses
# from UnicodeData.txt required manually cross-referencing the following
# two files:
#
# http://icu-project.org/apiref/icu4j/constant-values.html
# http://www.unicode.org/Public/6.3.0/ucd/PropertyValueAliases.txt

# ICU general-category constant -> two-letter general category code
# returned by category().
_cat = {
    ECharacterCategory.COMBINING_SPACING_MARK: "Mc",
    ECharacterCategory.CONNECTOR_PUNCTUATION: "Pc",
    ECharacterCategory.CONTROL: "Cc",
    ECharacterCategory.CURRENCY_SYMBOL: "Sc",
    ECharacterCategory.DASH_PUNCTUATION: "Pd",
    ECharacterCategory.DECIMAL_DIGIT_NUMBER: "Nd",
    ECharacterCategory.ENCLOSING_MARK: "Me",
    ECharacterCategory.END_PUNCTUATION: "Pe",
    ECharacterCategory.FINAL_PUNCTUATION: "Pf",
    ECharacterCategory.FORMAT: "Cf",
    # per http://icu-project.org/apiref/icu4j/com/ibm/icu/lang/UCharacterEnums.ECharacterCategory.html#GENERAL_OTHER_TYPES
    # this is the "Cn Not Assigned" type - no characters in [UnicodeData.txt]
    # have this property.  It is an alias for the same category as UNASSIGNED
    # below, so it must carry the same code: with a different value here the
    # table would only be correct because the later duplicate key wins.
    ECharacterCategory.GENERAL_OTHER_TYPES: "Cn",
    ECharacterCategory.INITIAL_PUNCTUATION: "Pi",
    ECharacterCategory.LETTER_NUMBER: "Nl",
    ECharacterCategory.LINE_SEPARATOR: "Zl",
    ECharacterCategory.LOWERCASE_LETTER: "Ll",
    ECharacterCategory.MATH_SYMBOL: "Sm",
    ECharacterCategory.MODIFIER_LETTER: "Lm",
    ECharacterCategory.MODIFIER_SYMBOL: "Sk",
    ECharacterCategory.NON_SPACING_MARK: "Mn",
    ECharacterCategory.OTHER_LETTER: "Lo",
    ECharacterCategory.OTHER_NUMBER: "No",
    ECharacterCategory.OTHER_PUNCTUATION: "Po",
    ECharacterCategory.OTHER_SYMBOL: "So",
    ECharacterCategory.PARAGRAPH_SEPARATOR: "Zp",
    ECharacterCategory.PRIVATE_USE: "Co",
    ECharacterCategory.SPACE_SEPARATOR: "Zs",
    ECharacterCategory.START_PUNCTUATION: "Ps",
    ECharacterCategory.SURROGATE: "Cs",
    ECharacterCategory.TITLECASE_LETTER: "Lt",
    ECharacterCategory.UNASSIGNED: "Cn",
    ECharacterCategory.UPPERCASE_LETTER: "Lu",
}

def category(unichr):
    """Return the general category code (e.g. "Lu") of the character *unichr*.

    Raises TypeError if *unichr* is not a single unicode character.
    """
    type_code = UCharacter.getType(_get_codepoint(unichr))
    return _cat[type_code]


# ICU character-direction constant -> Bidi_Class short alias returned by
# bidirectional().  The aliases are spelled as in PropertyValueAliases.txt,
# where they are all upper case (Arabic_Number is "AN").
_dir = {
    ECharacterDirection.ARABIC_NUMBER: "AN",
    ECharacterDirection.BLOCK_SEPARATOR: "B",
    ECharacterDirection.BOUNDARY_NEUTRAL: "BN",
    ECharacterDirection.COMMON_NUMBER_SEPARATOR: "CS",
    ECharacterDirection.DIR_NON_SPACING_MARK: "NSM",
    ECharacterDirection.EUROPEAN_NUMBER: "EN",
    ECharacterDirection.EUROPEAN_NUMBER_SEPARATOR: "ES",
    ECharacterDirection.EUROPEAN_NUMBER_TERMINATOR: "ET",
    ECharacterDirection.FIRST_STRONG_ISOLATE: "FSI",
    ECharacterDirection.LEFT_TO_RIGHT: "L",
    ECharacterDirection.LEFT_TO_RIGHT_EMBEDDING: "LRE",
    ECharacterDirection.LEFT_TO_RIGHT_ISOLATE: "LRI",
    ECharacterDirection.LEFT_TO_RIGHT_OVERRIDE: "LRO",
    ECharacterDirection.OTHER_NEUTRAL: "ON",
    ECharacterDirection.POP_DIRECTIONAL_FORMAT: "PDF",
    ECharacterDirection.POP_DIRECTIONAL_ISOLATE: "PDI",
    ECharacterDirection.RIGHT_TO_LEFT: "R",
    ECharacterDirection.RIGHT_TO_LEFT_ARABIC: "AL",
    ECharacterDirection.RIGHT_TO_LEFT_EMBEDDING: "RLE",
    ECharacterDirection.RIGHT_TO_LEFT_ISOLATE: "RLI",
    ECharacterDirection.RIGHT_TO_LEFT_OVERRIDE: "RLO",
    ECharacterDirection.SEGMENT_SEPARATOR: "S",
    ECharacterDirection.WHITE_SPACE_NEUTRAL: "WS",
}

def bidirectional(unichr):
    """Return the bidirectional class alias (e.g. "L") of the character *unichr*.

    Raises TypeError if *unichr* is not a single unicode character.
    """
    direction_code = UCharacter.getDirection(_get_codepoint(unichr))
    return _dir[direction_code]


def combining(unichr):
    """Return the combining class ICU reports for the character *unichr*.

    Raises TypeError if *unichr* is not a single unicode character.
    """
    codepoint = _get_codepoint(unichr)
    return UCharacter.getCombiningClass(codepoint)


def mirrored(unichr):
    """Return the result of ICU's isMirrored() for the character *unichr*.

    Raises TypeError if *unichr* is not a single unicode character.
    """
    codepoint = _get_codepoint(unichr)
    return UCharacter.isMirrored(codepoint)


# ICU EastAsianWidth constant -> width code returned by east_asian_width().
# See http://www.unicode.org/reports/tr11/
_eaw = {
    EastAsianWidth.AMBIGUOUS: "A",
    # COUNT is apparently not used (see the TR above); "?" is a placeholder.
    EastAsianWidth.COUNT: "?",
    EastAsianWidth.FULLWIDTH: "F",
    EastAsianWidth.HALFWIDTH: "H",
    EastAsianWidth.NARROW: "Na",
    EastAsianWidth.NEUTRAL: "N",
    EastAsianWidth.WIDE: "W",
}

def east_asian_width(unichr):
    """Return the East Asian width code (e.g. "W") of the character *unichr*.

    Raises TypeError if *unichr* is not a single unicode character.
    """
    codepoint = _get_codepoint(unichr)
    width_code = UCharacter.getIntPropertyValue(codepoint, UProperty.EAST_ASIAN_WIDTH)
    return _eaw[width_code]


def normalize(form, unistr):
    """
    Return the Unicode string unistr converted to the normal form 'form'.

    'form' must be one of 'NFC', 'NFKC', 'NFD', or 'NFKD'; any other value
    raises ValueError.
    """
    if form not in _forms:
        raise ValueError('invalid normalization form')
    return Normalizer.normalize(unistr, _forms[form])


def get_icu_version():
    """Return the highest Unicode version exposed by ICU as "major.minor.milli".

    Every attribute of VersionInfo whose name starts with "UNICODE_" is
    considered, and the largest (major, minor, milli) triple wins.
    """
    candidates = [
        getattr(VersionInfo, attr)
        for attr in VersionInfo.__dict__.iterkeys()
        if attr.startswith("UNICODE_")
    ]
    newest = max((v.getMajor(), v.getMinor(), v.getMilli()) for v in candidates)
    return ".".join(map(str, newest))


# Computed once, when the module is imported.
unidata_version = get_icu_version()