Lib.unicodedata.py Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jython Show documentation
Show all versions of jython Show documentation
Jython is an implementation of the high-level, dynamic, object-oriented
language Python written in 100% Pure Java, and seamlessly integrated with
the Java platform. It thus allows you to run Python on any Java platform.
import java.lang.Character
try:
# import from jarjar-ed version
from org.python.icu.text import Normalizer
from org.python.icu.lang import UCharacter, UProperty
from org.python.icu.util import VersionInfo
from org.python.icu.lang.UCharacter import EastAsianWidth, DecompositionType
from org.python.icu.lang.UCharacterEnums import ECharacterCategory, ECharacterDirection
except ImportError:
# development version of Jython, so use extlibs
from com.ibm.icu.text import Normalizer
from com.ibm.icu.lang import UCharacter, UProperty
from com.ibm.icu.util import VersionInfo
from com.ibm.icu.lang.UCharacter import EastAsianWidth, DecompositionType
from com.ibm.icu.lang.UCharacterEnums import ECharacterCategory, ECharacterDirection
# Names exported by ``from unicodedata import *``.
__all__ = (
    "bidirectional",
    "category",
    "combining",
    "decimal",
    "decomposition",
    "digit",
    "east_asian_width",
    "lookup",
    "mirrored",
    "name",
    "normalize",
    "numeric",
    "unidata_version",
)

# Normalization form names accepted by normalize(), keyed to the matching
# ICU Normalizer mode constants.
_forms = {
    'NFC': Normalizer.NFC,
    'NFD': Normalizer.NFD,
    'NFKC': Normalizer.NFKC,
    'NFKD': Normalizer.NFKD,
}

# Sentinel for "no default supplied"; needed because None is itself a valid
# default/return value for some of the functions below.
Nonesuch = object()
def _validate_unichr(unichr):
    """Check that ``unichr`` is a unicode string of length exactly one.

    Returns None when the argument is acceptable.

    Raises:
        TypeError: if the argument is not a ``unicode`` instance, or if its
            length is anything other than 1.
    """
    if not isinstance(unichr, unicode):
        actual_type = type(unichr).__name__
        raise TypeError("must be unicode, not {}".format(actual_type))
    if len(unichr) != 1:
        raise TypeError("need a single Unicode character as parameter")
def _get_codepoint(unichr):
    """Return the integer code point of the single character ``unichr``.

    The argument is first checked by ``_validate_unichr``, so a TypeError
    propagates for anything that is not a one-character unicode string.
    """
    _validate_unichr(unichr)
    codepoint = ord(unichr)
    return codepoint
def name(unichr, default=Nonesuch):
    """Return the name ICU reports for the character ``unichr``.

    Args:
        unichr: a one-character unicode string.
        default: value to return when ICU has no name for the character.
            May be None; omission is detected via the ``Nonesuch`` sentinel.

    Raises:
        ValueError: if ICU has no name and no ``default`` was supplied.
    """
    found = UCharacter.getName(_get_codepoint(unichr))
    if found is not None:
        return found
    if default is Nonesuch:
        raise ValueError("no such name")
    return default
def lookup(name):
    """Return the character whose name is ``name``.

    Args:
        name: the character name, passed unchanged to
            ``UCharacter.getCharFromName``.

    Returns:
        The one-character unicode string built from the code point that ICU
        returns.

    Raises:
        KeyError: if ICU reports no character for the name (it signals this
            by returning -1).
    """
    codepoint = UCharacter.getCharFromName(name)
    if codepoint == -1:
        # The closing quote was previously missing from this message.
        raise KeyError("undefined character name '{}'".format(name))
    return unichr(codepoint)
def digit(unichr, default=Nonesuch):
    """Return the digit value ICU assigns to the character ``unichr``.

    Args:
        unichr: a one-character unicode string.
        default: value to return when ``UCharacter.digit`` yields -1.
            May be None; omission is detected via the ``Nonesuch`` sentinel.

    Raises:
        ValueError: if there is no digit value and no ``default`` was given.
    """
    value = UCharacter.digit(_get_codepoint(unichr))
    if value != -1:
        return value
    if default is Nonesuch:
        raise ValueError("not a digit")
    return default
def decimal(unichr, default=Nonesuch):
    """Return the decimal value (0-9) of the character ``unichr``.

    The value comes from ``UCharacter.getNumericValue``; anything outside
    the range 0..9 is treated as "not a decimal".

    Args:
        unichr: a one-character unicode string.
        default: value to return for non-decimal characters. May be None;
            omission is detected via the ``Nonesuch`` sentinel.

    Raises:
        ValueError: if the character is not a decimal and no ``default``
            was given.
    """
    value = UCharacter.getNumericValue(_get_codepoint(unichr))
    if 0 <= value <= 9:
        return value
    if default is Nonesuch:
        raise ValueError("not a decimal")
    return default
def numeric(unichr, default=Nonesuch):
    """Return the numeric value ICU assigns to the character ``unichr``.

    Args:
        unichr: a one-character unicode string.
        default: value to return when ICU reports
            ``UCharacter.NO_NUMERIC_VALUE``. May be None; omission is
            detected via the ``Nonesuch`` sentinel.

    Raises:
        ValueError: if there is no numeric value and no ``default`` was
            given.
    """
    value = UCharacter.getUnicodeNumericValue(_get_codepoint(unichr))
    if value != UCharacter.NO_NUMERIC_VALUE:
        return value
    if default is Nonesuch:
        raise ValueError("not a numeric")
    return default
# Maps ICU DecompositionType constants to the tag text that decomposition()
# places between angle brackets. NONE maps to None, meaning "no tag".
_decomp = {
    DecompositionType.CANONICAL: "canonical",
    DecompositionType.CIRCLE: "circle",
    DecompositionType.COMPAT: "compat",
    DecompositionType.FINAL: "final",
    DecompositionType.FONT: "font",
    DecompositionType.FRACTION: "fraction",
    DecompositionType.INITIAL: "initial",
    DecompositionType.ISOLATED: "isolated",
    DecompositionType.MEDIAL: "medial",
    DecompositionType.NARROW: "narrow",
    # UnicodeData.txt spells this tag "<noBreak>" (capital B), and that is
    # what CPython's unicodedata.decomposition() returns.
    DecompositionType.NOBREAK: "noBreak",
    DecompositionType.NONE: None,
    DecompositionType.SMALL: "small",
    DecompositionType.SQUARE: "square",
    DecompositionType.SUB: "sub",
    DecompositionType.SUPER: "super",
    DecompositionType.VERTICAL: "vertical",
    DecompositionType.WIDE: "wide",
}
def _get_decomp_type(unichr):
    """Return the decomposition tag for ``unichr``, or None if it has none.

    The tag is looked up in ``_decomp`` from ICU's DECOMPOSITION_TYPE
    property, except for U+2044 which is hard-wired (see below).
    """
    if unichr != u"\u2044":
        icu_type = UCharacter.getIntPropertyValue(
            ord(unichr), UProperty.DECOMPOSITION_TYPE)
        return _decomp[icu_type]
    # FRACTION SLASH: special-cased for CPython compatibility even though
    # this returns as not being combining, eg, see
    # http://www.fileformat.info/info/unicode/char/2044/index.htm
    return "fraction"
def decomposition(unichr):
    """Return the decomposition mapping of ``unichr`` as a string.

    The result is the space-separated, upper-case, at-least-4-digit hex code
    points of ``Normalizer.decompose(unichr, True)``, prefixed with
    ``<tag> `` when a decomposition tag is found. When the decomposition is
    a single character and no tag applies, the empty string is returned.

    NOTE(review): the ``True`` argument is presumably ICU's ``compat`` flag
    (compatibility decomposition) -- confirm against the ICU4J docs.

    Raises:
        TypeError: if ``unichr`` is not a one-character unicode string.
    """
    _validate_unichr(unichr)
    expanded = Normalizer.decompose(unichr, True)

    if len(expanded) == 1:
        # Single-character result: the tag comes from the input character.
        tag = _get_decomp_type(unichr)
    else:
        # Otherwise use the first non-None tag among the decomposed
        # characters, evaluated lazily so the scan stops at the first hit.
        tags = (_get_decomp_type(ch) for ch in expanded)
        tag = next((t for t in tags if t is not None), None)

    code_points = " ".join("{0:04X}".format(ord(ch)) for ch in expanded)

    if tag:
        return "<{}> {}".format(tag, code_points)
    if len(expanded) == 1:
        return ""
    return code_points
# To map from ICU4J enumerations for category, bidirection, and
# east_asian_width to the underlying property values that Python uses
# from UnicodeData.txt required a manual mapping between the following
# two files:
#
# http://icu-project.org/apiref/icu4j/constant-values.html
# http://www.unicode.org/Public/6.3.0/ucd/PropertyValueAliases.txt
# Maps ICU ECharacterCategory constants to the two-letter General_Category
# abbreviations returned by category().
_cat = {
    ECharacterCategory.COMBINING_SPACING_MARK: "Mc",
    ECharacterCategory.CONNECTOR_PUNCTUATION: "Pc",
    ECharacterCategory.CONTROL: "Cc",
    ECharacterCategory.CURRENCY_SYMBOL: "Sc",
    ECharacterCategory.DASH_PUNCTUATION: "Pd",
    ECharacterCategory.DECIMAL_DIGIT_NUMBER: "Nd",
    ECharacterCategory.ENCLOSING_MARK: "Me",
    ECharacterCategory.END_PUNCTUATION: "Pe",
    ECharacterCategory.FINAL_PUNCTUATION: "Pf",
    ECharacterCategory.FORMAT: "Cf",
    # per http://icu-project.org/apiref/icu4j/com/ibm/icu/lang/UCharacterEnums.ECharacterCategory.html#GENERAL_OTHER_TYPES
    # - no characters in [UnicodeData.txt] have this property
    # The value was previously "Cn Not Assigned", which is not a valid
    # General_Category abbreviation; the short alias is just "Cn".
    # NOTE(review): this constant appears to share its value with UNASSIGNED
    # below, in which case the later entry wins anyway -- confirm.
    ECharacterCategory.GENERAL_OTHER_TYPES: "Cn",
    ECharacterCategory.INITIAL_PUNCTUATION: "Pi",
    ECharacterCategory.LETTER_NUMBER: "Nl",
    ECharacterCategory.LINE_SEPARATOR: "Zl",
    ECharacterCategory.LOWERCASE_LETTER: "Ll",
    ECharacterCategory.MATH_SYMBOL: "Sm",
    ECharacterCategory.MODIFIER_LETTER: "Lm",
    ECharacterCategory.MODIFIER_SYMBOL: "Sk",
    ECharacterCategory.NON_SPACING_MARK: "Mn",
    ECharacterCategory.OTHER_LETTER: "Lo",
    ECharacterCategory.OTHER_NUMBER: "No",
    ECharacterCategory.OTHER_PUNCTUATION: "Po",
    ECharacterCategory.OTHER_SYMBOL: "So",
    ECharacterCategory.PARAGRAPH_SEPARATOR: "Zp",
    ECharacterCategory.PRIVATE_USE: "Co",
    ECharacterCategory.SPACE_SEPARATOR: "Zs",
    ECharacterCategory.START_PUNCTUATION: "Ps",
    ECharacterCategory.SURROGATE: "Cs",
    ECharacterCategory.TITLECASE_LETTER: "Lt",
    ECharacterCategory.UNASSIGNED: "Cn",
    ECharacterCategory.UPPERCASE_LETTER: "Lu",
}
def category(unichr):
    """Return the general category abbreviation for the character ``unichr``.

    The ICU type from ``UCharacter.getType`` is translated through ``_cat``.
    Raises TypeError if ``unichr`` is not a one-character unicode string.
    """
    icu_type = UCharacter.getType(_get_codepoint(unichr))
    return _cat[icu_type]
# Maps ICU ECharacterDirection constants to the Bidi_Class short aliases
# returned by bidirectional().
_dir = {
    # The alias for Arabic_Number is upper-case "AN" (it was "An" here),
    # matching PropertyValueAliases.txt and CPython's unicodedata.
    ECharacterDirection.ARABIC_NUMBER: "AN",
    ECharacterDirection.BLOCK_SEPARATOR: "B",
    ECharacterDirection.BOUNDARY_NEUTRAL: "BN",
    ECharacterDirection.COMMON_NUMBER_SEPARATOR: "CS",
    ECharacterDirection.DIR_NON_SPACING_MARK: "NSM",
    ECharacterDirection.EUROPEAN_NUMBER: "EN",
    ECharacterDirection.EUROPEAN_NUMBER_SEPARATOR: "ES",
    ECharacterDirection.EUROPEAN_NUMBER_TERMINATOR: "ET",
    ECharacterDirection.FIRST_STRONG_ISOLATE: "FSI",
    ECharacterDirection.LEFT_TO_RIGHT: "L",
    ECharacterDirection.LEFT_TO_RIGHT_EMBEDDING: "LRE",
    ECharacterDirection.LEFT_TO_RIGHT_ISOLATE: "LRI",
    ECharacterDirection.LEFT_TO_RIGHT_OVERRIDE: "LRO",
    ECharacterDirection.OTHER_NEUTRAL: "ON",
    ECharacterDirection.POP_DIRECTIONAL_FORMAT: "PDF",
    ECharacterDirection.POP_DIRECTIONAL_ISOLATE: "PDI",
    ECharacterDirection.RIGHT_TO_LEFT: "R",
    ECharacterDirection.RIGHT_TO_LEFT_ARABIC: "AL",
    ECharacterDirection.RIGHT_TO_LEFT_EMBEDDING: "RLE",
    ECharacterDirection.RIGHT_TO_LEFT_ISOLATE: "RLI",
    ECharacterDirection.RIGHT_TO_LEFT_OVERRIDE: "RLO",
    ECharacterDirection.SEGMENT_SEPARATOR: "S",
    ECharacterDirection.WHITE_SPACE_NEUTRAL: "WS",
}
def bidirectional(unichr):
    """Return the bidirectional class alias for the character ``unichr``.

    The ICU direction from ``UCharacter.getDirection`` is translated through
    ``_dir``. Raises TypeError if ``unichr`` is not a one-character unicode
    string.
    """
    icu_direction = UCharacter.getDirection(_get_codepoint(unichr))
    return _dir[icu_direction]
def combining(unichr):
    """Return the combining class ICU reports for the character ``unichr``.

    Raises TypeError if ``unichr`` is not a one-character unicode string.
    """
    codepoint = _get_codepoint(unichr)
    return UCharacter.getCombiningClass(codepoint)
def mirrored(unichr):
    """Return ICU's ``isMirrored`` result for the character ``unichr``.

    Raises TypeError if ``unichr`` is not a one-character unicode string.
    """
    codepoint = _get_codepoint(unichr)
    return UCharacter.isMirrored(codepoint)
# Maps ICU EastAsianWidth constants to the abbreviations returned by
# east_asian_width(); see http://www.unicode.org/reports/tr11/
_eaw = {
    EastAsianWidth.AMBIGUOUS: "A",
    EastAsianWidth.FULLWIDTH: "F",
    EastAsianWidth.HALFWIDTH: "H",
    EastAsianWidth.NARROW: "Na",
    EastAsianWidth.NEUTRAL: "N",
    EastAsianWidth.WIDE: "W",
    # apparently not used, see the TR referenced above
    EastAsianWidth.COUNT: "?",
}
def east_asian_width(unichr):
    """Return the East Asian width abbreviation for the character ``unichr``.

    ICU's EAST_ASIAN_WIDTH property value is translated through ``_eaw``.
    Raises TypeError if ``unichr`` is not a one-character unicode string.
    """
    codepoint = _get_codepoint(unichr)
    icu_width = UCharacter.getIntPropertyValue(
        codepoint, UProperty.EAST_ASIAN_WIDTH)
    return _eaw[icu_width]
def normalize(form, unistr):
    """
    Return the normal form 'form' for the Unicode string unistr.

    Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'; any other
    value raises ValueError.
    """
    if form not in _forms:
        raise ValueError('invalid normalization form')
    return Normalizer.normalize(unistr, _forms[form])
def get_icu_version():
    """Return the highest ``VersionInfo.UNICODE_*`` version as a dotted string.

    Every attribute of ``VersionInfo`` whose name starts with "UNICODE_" is
    read as a (major, minor, milli) tuple; the largest tuple is joined with
    dots, e.g. "6.3.0".
    """
    unicode_versions = [
        getattr(VersionInfo, attr)
        for attr in VersionInfo.__dict__.iterkeys()
        if attr.startswith("UNICODE_")
    ]
    newest = max(
        (v.getMajor(), v.getMinor(), v.getMilli()) for v in unicode_versions)
    return ".".join(str(part) for part in newest)
# Computed once at import time from the UNICODE_* constants that the bundled
# ICU VersionInfo class exposes (see get_icu_version).
unidata_version = get_icu_version()