com.ibm.icu.impl.UCharacterProperty Maven / Gradle / Ivy
Show all versions of icu4j Show documentation
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 1996-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.MissingResourceException;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacter.HangulSyllableType;
import com.ibm.icu.lang.UCharacter.IdentifierStatus;
import com.ibm.icu.lang.UCharacter.IdentifierType;
import com.ibm.icu.lang.UCharacter.NumericType;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.CodePointMap;
import com.ibm.icu.util.CodePointTrie;
import com.ibm.icu.util.ICUException;
import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.VersionInfo;
/**
* Internal class used for the Unicode character property database.
* This class stores binary data read from uprops.icu.
* It does not have the capability to parse the data into more high-level
* information. It only returns bytes of information when required.
* Due to the form most commonly used for retrieval, an array of char is used
* to store the binary data.
* This class also contains information on accessing indexes to
* significant points in the binary data.
* Responsibility for molding the binary data into a more meaningful form lies
* with UCharacter.
* @author Syn Wee Quek
* @since release 2.1, february 1st 2002
*/
public final class UCharacterProperty
{
// public data members -----------------------------------------------
/*
* public singleton instance
*/
public static final UCharacterProperty INSTANCE;
/**
* Trie data
*/
public Trie2_16 m_trie_;
/**
* Unicode version
*/
public VersionInfo m_unicodeVersion_;
/**
* Latin capital letter i with dot above
*/
public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130;
/**
* Latin small letter dotless i
*/
public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131;
/**
* Latin lowercase i
*/
public static final char LATIN_SMALL_LETTER_I_ = 0x69;
/**
* Character type mask
*/
public static final int TYPE_MASK = 0x1F;
// uprops.h enum UPropertySource --------------------------------------- ***
/** No source, not a supported property. */
public static final int SRC_NONE=0;
/** From uchar.c/uprops.icu main trie */
public static final int SRC_CHAR=1;
/** From uchar.c/uprops.icu properties vectors trie */
public static final int SRC_PROPSVEC=2;
/** From unames.c/unames.icu */
public static final int SRC_NAMES=3;
/** From ucase.c/ucase.icu */
public static final int SRC_CASE=4;
/** From ubidi_props.c/ubidi.icu */
public static final int SRC_BIDI=5;
/** From uchar.c/uprops.icu main trie as well as properties vectors trie */
public static final int SRC_CHAR_AND_PROPSVEC=6;
/** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */
public static final int SRC_CASE_AND_NORM=7;
/** From normalizer2impl.cpp/nfc.nrm */
public static final int SRC_NFC=8;
/** From normalizer2impl.cpp/nfkc.nrm */
public static final int SRC_NFKC=9;
/** From normalizer2impl.cpp/nfkc_cf.nrm */
public static final int SRC_NFKC_CF=10;
/** From normalizer2impl.cpp/nfc.nrm canonical iterator data */
public static final int SRC_NFC_CANON_ITER=11;
// Text layout properties.
/** Indic_Positional_Category; LayoutProps serves it from the inpcTrie read out of ulayout.icu. */
public static final int SRC_INPC=12;
/** Indic_Syllabic_Category; LayoutProps serves it from the inscTrie read out of ulayout.icu. */
public static final int SRC_INSC=13;
/** Vertical_Orientation; LayoutProps serves it from the voTrie read out of ulayout.icu. */
public static final int SRC_VO=14;
/** Source reported by the emoji binary properties (see EmojiBinaryProperty). */
public static final int SRC_EMOJI=15;
// NOTE(review): presumably IDS_Unary_Operator; no use is visible in this part of the file -- confirm.
public static final int SRC_IDSU=16;
/** Source reported by the ID_Compat_Math_* binary properties (see MathCompatBinaryProperty). */
public static final int SRC_ID_COMPAT_MATH=17;
/** Source reported by the BLOCK int property, which is looked up in m_blockTrie_. */
public static final int SRC_BLOCK=18;
/** Source reported by the Modifier_Combining_Mark binary property (see MCMBinaryProperty). */
public static final int SRC_MCM=19;
/** One more than the highest UPropertySource (SRC_) constant. */
public static final int SRC_COUNT=20;
/**
 * Holder for the text layout property data loaded from "ulayout.icu":
 * one optional code point trie per property plus the maximum value of each property.
 */
private static final class LayoutProps {
    /** Accepts only data whose first version byte is 1. */
    private static final class IsAcceptable implements ICUBinary.Authenticate {
        @Override
        public boolean isDataVersionAcceptable(byte[] version) {
            return version[0] == 1;
        }
    }

    private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
    private static final int DATA_FORMAT = 0x4c61796f;  // "Layo"

    // Layout of indexes[]:
    //   [0]     length of indexes[] itself (IX_INDEXES_LENGTH)
    //   [1..7]  tops of consecutive code point tries;
    //           no trie is stored if the difference between two of these is less than 16
    //           ([4] = IX_RESERVED_TOP and [7] = IX_TRIES_TOP are not read here)
    //   [9]     packed maximum values
    // The length of indexes[] (IX_COUNT = 12) is a multiple of 4 to 16-align the tries.
    private static final int IX_INPC_TRIE_TOP = 1;
    private static final int IX_INSC_TRIE_TOP = 2;
    private static final int IX_VO_TRIE_TOP = 3;
    private static final int IX_MAX_VALUES = 9;

    // Bit positions of the per-property maxima inside indexes[IX_MAX_VALUES].
    private static final int MAX_INPC_SHIFT = 24;
    private static final int MAX_INSC_SHIFT = 16;
    private static final int MAX_VO_SHIFT = 8;

    static final LayoutProps INSTANCE = new LayoutProps();

    CodePointTrie inpcTrie;  // Indic_Positional_Category
    CodePointTrie inscTrie;  // Indic_Syllabic_Category
    CodePointTrie voTrie;    // Vertical_Orientation

    int maxInpcValue;
    int maxInscValue;
    int maxVoValue;

    /**
     * Loads and parses "ulayout.icu".
     * An IOException raised while reading is rethrown as ICUUncheckedIOException.
     */
    LayoutProps() {
        ByteBuffer bytes = ICUBinary.getRequiredData("ulayout.icu");
        try {
            ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
            final int startPos = bytes.position();

            final int indexesLength = bytes.getInt();  // inIndexes[IX_INDEXES_LENGTH]
            if (indexesLength < 12) {
                throw new ICUUncheckedIOException(
                        "Text layout properties data: not enough indexes");
            }
            final int[] inIndexes = new int[indexesLength];
            inIndexes[0] = indexesLength;
            for (int i = 1; i < indexesLength; ++i) {
                inIndexes[i] = bytes.getInt();
            }

            // The tries follow the indexes back to back; each one ends at its "top" offset.
            int bottom = indexesLength * 4;
            int top = inIndexes[IX_INPC_TRIE_TOP];
            inpcTrie = readTrie(bytes, startPos, bottom, top);

            bottom = top;
            top = inIndexes[IX_INSC_TRIE_TOP];
            inscTrie = readTrie(bytes, startPos, bottom, top);

            bottom = top;
            top = inIndexes[IX_VO_TRIE_TOP];
            voTrie = readTrie(bytes, startPos, bottom, top);

            final int packedMax = inIndexes[IX_MAX_VALUES];
            maxInpcValue = packedMax >>> MAX_INPC_SHIFT;
            maxInscValue = (packedMax >> MAX_INSC_SHIFT) & 0xff;
            maxVoValue = (packedMax >> MAX_VO_SHIFT) & 0xff;
        } catch (IOException e) {
            throw new ICUUncheckedIOException(e);
        }
    }

    /**
     * Reads the trie occupying [bottom, top) of the data (offsets relative to startPos),
     * then advances the buffer to top.
     *
     * @return the trie, or null if the slot is smaller than 16 bytes (no trie stored)
     */
    private static CodePointTrie readTrie(ByteBuffer bytes, int startPos, int bottom, int top)
            throws IOException {
        CodePointTrie trie = null;
        if (top - bottom >= 16) {
            trie = CodePointTrie.fromBinary(null, null, bytes);
        }
        final int pos = bytes.position() - startPos;
        assert top >= pos;
        ICUBinary.skipBytes(bytes, top - pos);  // skip padding after trie bytes
        return trie;
    }

    /**
     * Adds to set the start code point of each same-value range of the trie selected by src.
     *
     * @param src one of SRC_INPC, SRC_INSC, SRC_VO
     * @param set the set to add to
     * @return set, for chaining
     * @throws IllegalStateException if src is none of the three sources
     * @throws MissingResourceException if no trie was loaded for src
     */
    public UnicodeSet addPropertyStarts(int src, UnicodeSet set) {
        final CodePointTrie trie;
        if (src == SRC_INPC) {
            trie = inpcTrie;
        } else if (src == SRC_INSC) {
            trie = inscTrie;
        } else if (src == SRC_VO) {
            trie = voTrie;
        } else {
            throw new IllegalStateException();
        }

        if (trie == null) {
            throw new MissingResourceException(
                    "no data for one of the text layout properties; src=" + src,
                    "LayoutProps", "");
        }

        // Walk the ranges: each iteration starts right after the previous range's end.
        final CodePointMap.Range range = new CodePointMap.Range();
        for (int start = 0; trie.getRange(start, null, range); start = range.getEnd() + 1) {
            set.add(start);
        }
        return set;
    }
}
// public methods ----------------------------------------------------
/**
 * Looks up the value stored in the main properties trie for a code point.
 * @param ch code point to look up
 * @return the main-trie value for ch
 */
public final int getProperty(int ch)
{
    final int props = m_trie_.get(ch);
    return props;
}
/**
 * Returns one word of the additional-properties vector for a code point.
 * Java version of C u_getUnicodeProperties().
 * @param codepoint code point whose additional properties are to be retrieved
 * @param column index of the vector word; must not be negative
 * @return the requested word, or 0 if column is not below the number of
 *         additional columns
 */
public int getAdditional(int codepoint, int column) {
    assert column >= 0;
    if (column < m_additionalColumnsCount_) {
        // The trie value is the index of the first word of this code point's vector.
        final int vectorStart = m_additionalTrie_.get(codepoint);
        return m_additionalVectors_[vectorStart + column];
    }
    return 0;
}
static final int MY_MASK = UCharacterProperty.TYPE_MASK
& ((1<Get the "age" of the code point.
* The "age" is the Unicode version when the code point was first
* designated (as a non-character or for Private Use) or assigned a
* character.
* This can be useful to avoid emitting code points to receiving
* processes that do not accept newer characters.
* The data is from the UCD file DerivedAge.txt.
* This API does not check the validity of the codepoint.
* @param codepoint The code point.
* @return the Unicode version number
*/
public VersionInfo getAge(int codepoint)
{
    // The age sits in the top bits of vector word 0.
    final int packed = getAdditional(codepoint, 0) >>> AGE_SHIFT_;
    // Low two bits are the minor version, the remaining bits the major version.
    final int major = packed >> 2;
    final int minor = packed & 3;
    return VersionInfo.getInstance(major, minor, 0, 0);
}
// Single-bit masks (1 << general category, see getMask()) for the categories
// that isgraphPOSIX() excludes.
private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED);
private static final int GC_CC_MASK = getMask(UCharacter.CONTROL);
private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE);
private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR);
private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR);
private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR);
/** Mask constant for multiple UCharCategory bits (Z Separators). */
private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK;
/**
 * Tests whether c is in
 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
 * with space=\p{Whitespace} and Control=Cc.
 * Implements UCHAR_POSIX_GRAPH.
 * @internal
 */
private static final boolean isgraphPOSIX(int c) {
    // \p{space}\p{gc=Control} == \p{gc=Z}\p{Control}
    final int excludedCategories = GC_CC_MASK | GC_CS_MASK | GC_CN_MASK | GC_Z_MASK;
    final int categoryBit = getMask(UCharacter.getType(c));
    // No overlap with the excluded categories means c is a POSIX graph character.
    return (categoryBit & excludedCategories) == 0;
}
// binary properties --------------------------------------------------- ***
/**
 * Base implementation of one binary property.
 * With a non-zero mask the property is a bit test on an additional-properties
 * vector word; with mask==0 the "column" field holds the data source instead
 * and subclasses are expected to override contains().
 */
private class BinaryProperty {
    int column;  // SRC_PROPSVEC column, or "source" if mask==0
    int mask;

    /** Property stored directly as bits of vector word "column". */
    BinaryProperty(int column, int mask) {
        this.column = column;
        this.mask = mask;
    }

    /** Property computed from the given source rather than stored as vector bits. */
    BinaryProperty(int source) {
        this(source, 0);
    }

    /** Returns the SRC_ constant for the data this property is derived from. */
    final int getSource() {
        if (mask != 0) {
            return SRC_PROPSVEC;
        }
        return column;
    }

    /** Returns true if code point c has this property. */
    boolean contains(int c) {
        // systematic, directly stored properties
        final int bits = getAdditional(c, column) & mask;
        return bits != 0;
    }
}
/** Case mapping properties; answered by UCaseProps and reported with source SRC_CASE. */
private class CaseBinaryProperty extends BinaryProperty {
    int which;

    CaseBinaryProperty(int which) {
        super(SRC_CASE);
        this.which = which;
    }

    @Override
    boolean contains(int c) {
        final UCaseProps caseProps = UCaseProps.INSTANCE;
        return caseProps.hasBinaryProperty(c, which);
    }
}
/** Emoji properties; answered by EmojiProps and reported with source SRC_EMOJI. */
private class EmojiBinaryProperty extends BinaryProperty {
    int which;

    EmojiBinaryProperty(int which) {
        super(SRC_EMOJI);
        this.which = which;
    }

    @Override
    boolean contains(int c) {
        final EmojiProps emojiProps = EmojiProps.INSTANCE;
        return emojiProps.hasBinaryProperty(c, which);
    }
}
/** UCHAR_NF*_INERT properties; answered by the matching normalizer instance. */
private class NormInertBinaryProperty extends BinaryProperty {
    int which;

    NormInertBinaryProperty(int source, int which) {
        super(source);
        this.which = which;
    }

    @Override
    boolean contains(int c) {
        // The normalizer is selected by the property's offset from NFD_INERT.
        final int mode = which - UProperty.NFD_INERT;
        return Norm2AllModes.getN2WithImpl(mode).isInert(c);
    }
}
/** Ranges (start/limit pairs) of ID_Compat_Math_Continue (only), from UCD PropList.txt. */
private static final int[] ID_COMPAT_MATH_CONTINUE = {
0x00B2, 0x00B3 + 1,
0x00B9, 0x00B9 + 1,
0x2070, 0x2070 + 1,
0x2074, 0x207E + 1,
0x2080, 0x208E + 1
};
/** ID_Compat_Math_Start characters, from UCD PropList.txt. */
private static final int[] ID_COMPAT_MATH_START = {
0x2202,
0x2207,
0x221E,
0x1D6C1,
0x1D6DB,
0x1D6FB,
0x1D715,
0x1D735,
0x1D74F,
0x1D76F,
0x1D789,
0x1D7A9,
0x1D7C3
};
/** Ranges (start/limit pairs) of Modifier_Combining_Mark (only), from UCD PropList.txt. */
private static final int[] MODIFIER_COMBINING_MARK = {
0x0654, 0x0655 + 1,
0x0658, 0x0658 + 1, // U+0658
0x06DC, 0x06DC + 1, // U+06DC
0x06E3, 0x06E3 + 1, // U+06E3
0x06E7, 0x06E8 + 1,
0x08CA, 0x08CB + 1,
0x08CD, 0x08CF + 1,
0x08D3, 0x08D3 + 1, // U+08D3
0x08F3, 0x08F3 + 1 // U+08F3
};
/**
 * ID_Compat_Math_Start / ID_Compat_Math_Continue, answered from the hardcoded
 * ID_COMPAT_MATH_CONTINUE ranges and ID_COMPAT_MATH_START list.
 */
private class MathCompatBinaryProperty extends BinaryProperty {
    int which;

    MathCompatBinaryProperty(int which) {
        super(SRC_ID_COMPAT_MATH);
        this.which = which;
    }

    @Override
    boolean contains(int c) {
        if (which == UProperty.ID_COMPAT_MATH_CONTINUE) {
            // Walk the ascending start/limit pairs of the Continue-only ranges.
            int i = 0;
            while (i < ID_COMPAT_MATH_CONTINUE.length) {
                final int start = ID_COMPAT_MATH_CONTINUE[i++];
                final int limit = ID_COMPAT_MATH_CONTINUE[i++];
                if (c < start) {
                    return false;  // below range start
                }
                if (c < limit) {
                    return true;  // below range limit
                }
            }
            // Above all Continue-only ranges: fall through to the Start characters.
        }
        if (c >= ID_COMPAT_MATH_START[0]) {  // otherwise: fastpath for common scripts
            for (int k = 0; k < ID_COMPAT_MATH_START.length; ++k) {
                if (ID_COMPAT_MATH_START[k] == c) {
                    return true;
                }
            }
        }
        return false;
    }
}
/** Modifier_Combining_Mark, answered from the hardcoded MODIFIER_COMBINING_MARK ranges. */
private class MCMBinaryProperty extends BinaryProperty {
    MCMBinaryProperty() {
        super(SRC_MCM);
    }

    @Override
    boolean contains(int c) {
        // Walk the ascending start/limit pairs; stop as soon as c is placed.
        int i = 0;
        while (i < MODIFIER_COMBINING_MARK.length) {
            final int start = MODIFIER_COMBINING_MARK[i++];
            final int limit = MODIFIER_COMBINING_MARK[i++];
            if (c < start) {
                return false;  // below range start
            }
            if (c < limit) {
                return true;  // below range limit
            }
        }
        return false;
    }
}
BinaryProperty[] binProps={
/*
* Binary-property implementations must be in order of corresponding UProperty,
* and there must be exactly one entry per binary UProperty.
*/
new BinaryProperty(1, (1<=0x41 && (c<=0x46 || c>=0x61)) ||
(c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41))
) {
return true;
}
return UCharacter.getType(c)==UCharacter.DECIMAL_DIGIT_NUMBER;
}
},
new CaseBinaryProperty(UProperty.CASED),
new CaseBinaryProperty(UProperty.CASE_IGNORABLE),
new CaseBinaryProperty(UProperty.CHANGES_WHEN_LOWERCASED),
new CaseBinaryProperty(UProperty.CHANGES_WHEN_UPPERCASED),
new CaseBinaryProperty(UProperty.CHANGES_WHEN_TITLECASED),
new BinaryProperty(SRC_CASE_AND_NORM) { // UCHAR_CHANGES_WHEN_CASEFOLDED
@Override
boolean contains(int c) {
String nfd=Norm2AllModes.getNFCInstance().impl.getDecomposition(c);
if(nfd!=null) {
/* c has a decomposition */
c=nfd.codePointAt(0);
if(Character.charCount(c)!=nfd.length()) {
/* multiple code points */
c=-1;
}
} else if(c<0) {
return false; /* protect against bad input */
}
if(c>=0) {
/* single code point */
UCaseProps csp=UCaseProps.INSTANCE;
UCaseProps.dummyStringBuilder.setLength(0);
return csp.toFullFolding(c, UCaseProps.dummyStringBuilder,
UCharacter.FOLD_CASE_DEFAULT)>=0;
} else {
String folded=UCharacter.foldCase(nfd, true);
return !folded.equals(nfd);
}
}
},
new CaseBinaryProperty(UProperty.CHANGES_WHEN_CASEMAPPED),
new BinaryProperty(SRC_NFKC_CF) { // UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
@Override
boolean contains(int c) {
Normalizer2Impl kcf=Norm2AllModes.getNFKC_CFInstance().impl;
String src=UTF16.valueOf(c);
StringBuilder dest=new StringBuilder();
// Small destCapacity for NFKC_CF(c).
Normalizer2Impl.ReorderingBuffer buffer=new Normalizer2Impl.ReorderingBuffer(kcf, dest, 5);
kcf.compose(src, 0, src.length(), false, true, buffer);
return !Normalizer2Impl.UTF16Plus.equal(dest, src);
}
},
new EmojiBinaryProperty(UProperty.EMOJI),
new EmojiBinaryProperty(UProperty.EMOJI_PRESENTATION),
new EmojiBinaryProperty(UProperty.EMOJI_MODIFIER),
new EmojiBinaryProperty(UProperty.EMOJI_MODIFIER_BASE),
new EmojiBinaryProperty(UProperty.EMOJI_COMPONENT),
new BinaryProperty(SRC_PROPSVEC) { // REGIONAL_INDICATOR
// Property starts are a subset of lb=RI etc.
@Override
boolean contains(int c) {
return 0x1F1E6<=c && c<=0x1F1FF;
}
},
new BinaryProperty(1, 1<>>shift;
}
int getMaxValue(int which) {
return (getMaxValues(column)&mask)>>>shift;
}
}
/** Int property with source SRC_BIDI whose maximum value is supplied by UBiDiProps. */
private class BiDiIntProperty extends IntProperty {
    BiDiIntProperty() {
        super(SRC_BIDI);
    }

    @Override
    int getMaxValue(int which) {
        final UBiDiProps bidiProps = UBiDiProps.INSTANCE;
        return bidiProps.getMaxValue(which);
    }
}
/** Int property for combining classes; its maximum value is fixed at 0xff. */
private class CombiningClassIntProperty extends IntProperty {
    CombiningClassIntProperty(int source) {
        super(source);
    }

    @Override
    int getMaxValue(int which) {
        final int maxCombiningClass = 0xff;
        return maxCombiningClass;
    }
}
/** UCHAR_NF*_QUICK_CHECK properties; answered by the matching normalizer instance. */
private class NormQuickCheckIntProperty extends IntProperty {
    int which;
    int max;

    NormQuickCheckIntProperty(int source, int which, int max) {
        super(source);
        this.which = which;
        this.max = max;
    }

    @Override
    int getValue(int c) {
        // The normalizer is selected by the property's offset from NFD_QUICK_CHECK.
        final int mode = this.which - UProperty.NFD_QUICK_CHECK;
        return Norm2AllModes.getN2WithImpl(mode).getQuickCheck(c);
    }

    @Override
    int getMaxValue(int which) {
        // The maximum was supplied at construction time; the argument is not consulted.
        return this.max;
    }
}
IntProperty intProps[]={
new BiDiIntProperty() { // BIDI_CLASS
@Override
int getValue(int c) {
return UBiDiProps.INSTANCE.getClass(c);
}
},
new IntProperty(SRC_BLOCK) { // BLOCK
@Override
int getValue(int c) {
// We store Block values indexed by the code point shifted right 4 bits
// and use a "small" UCPTrie=CodePointTrie for minimal data size.
// This works because blocks have xxx0..xxxF ranges.
int c4 = c;
// Shift unless out of range, in which case we fetch the trie's error value.
if (c4 <= 0x10ffff) {
c4 >>= 4;
}
return m_blockTrie_.get(c4);
}
@Override
int getMaxValue(int which) {
return m_maxValuesOther_ & MAX_BLOCK;
}
},
new CombiningClassIntProperty(SRC_NFC) { // CANONICAL_COMBINING_CLASS
@Override
int getValue(int c) {
return Normalizer2.getNFDInstance().getCombiningClass(c);
}
},
new IntProperty(2, DECOMPOSITION_TYPE_MASK_, 0),
new IntProperty(0, EAST_ASIAN_MASK_, EAST_ASIAN_SHIFT_),
new IntProperty(SRC_CHAR) { // GENERAL_CATEGORY
@Override
int getValue(int c) {
return getType(c);
}
@Override
int getMaxValue(int which) {
return UCharacterCategory.CHAR_CATEGORY_COUNT-1;
}
},
new BiDiIntProperty() { // JOINING_GROUP
@Override
int getValue(int c) {
return UBiDiProps.INSTANCE.getJoiningGroup(c);
}
},
new BiDiIntProperty() { // JOINING_TYPE
@Override
int getValue(int c) {
return UBiDiProps.INSTANCE.getJoiningType(c);
}
},
new IntProperty(2, LB_MASK, LB_SHIFT), // LINE_BREAK
new IntProperty(SRC_CHAR) { // NUMERIC_TYPE
@Override
int getValue(int c) {
return ntvGetType(getNumericTypeValue(getProperty(c)));
}
@Override
int getMaxValue(int which) {
return NumericType.COUNT-1;
}
},
new IntProperty(SRC_PROPSVEC) {
@Override
int getValue(int c) {
return UScript.getScript(c);
}
@Override
int getMaxValue(int which) {
return getMaxValues(0)&MAX_SCRIPT;
}
},
new IntProperty(SRC_PROPSVEC) { // HANGUL_SYLLABLE_TYPE
@Override
int getValue(int c) {
// Ignore supplementary code points: They all have HST=NA.
// This is a simple way to handle the GCB!=hst cases since Unicode 16
// (Kirat Rai vowels).
if(c>0xffff) {
return HangulSyllableType.NOT_APPLICABLE;
}
/* see comments on gcbToHst[] above */
int gcb=(getAdditional(c, 2)&GCB_MASK)>>>GCB_SHIFT;
if(gcb>8;
}
},
new CombiningClassIntProperty(SRC_NFC) { // TRAIL_CANONICAL_COMBINING_CLASS
@Override
int getValue(int c) {
return Norm2AllModes.getNFCInstance().impl.getFCD16(c)&0xff;
}
},
new IntProperty(2, GCB_MASK, GCB_SHIFT), // GRAPHEME_CLUSTER_BREAK
new IntProperty(2, SB_MASK, SB_SHIFT), // SENTENCE_BREAK
new IntProperty(2, WB_MASK, WB_SHIFT), // WORD_BREAK
new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE
@Override
int getValue(int c) {
return UBiDiProps.INSTANCE.getPairedBracketType(c);
}
},
new IntProperty(SRC_INPC) {
@Override
int getValue(int c) {
CodePointTrie trie = LayoutProps.INSTANCE.inpcTrie;
return trie != null ? trie.get(c) : 0;
}
@Override
int getMaxValue(int which) {
return LayoutProps.INSTANCE.maxInpcValue;
}
},
new IntProperty(SRC_INSC) {
@Override
int getValue(int c) {
CodePointTrie trie = LayoutProps.INSTANCE.inscTrie;
return trie != null ? trie.get(c) : 0;
}
@Override
int getMaxValue(int which) {
return LayoutProps.INSTANCE.maxInscValue;
}
},
new IntProperty(SRC_VO) {
@Override
int getValue(int c) {
CodePointTrie trie = LayoutProps.INSTANCE.voTrie;
return trie != null ? trie.get(c) : 0;
}
@Override
int getMaxValue(int which) {
return LayoutProps.INSTANCE.maxVoValue;
}
},
new IntProperty(SRC_PROPSVEC) { // IDENTIFIER_STATUS
@Override
int getValue(int c) {
int value = getAdditional(c, 2) >>> ID_TYPE_SHIFT;
return value >= ID_TYPE_ALLOWED_MIN ?
IdentifierStatus.ALLOWED.ordinal() : IdentifierStatus.RESTRICTED.ordinal();
}
@Override
int getMaxValue(int which) {
return IdentifierStatus.ALLOWED.ordinal();
}
},
new IntProperty(0, INCB_MASK, INCB_SHIFT), // INDIC_CONJUNCT_BREAK
};
public int getIntPropertyValue(int c, int which) {
if(which
* Unicode property names and property value names are compared
* "loosely". Property[Value]Aliases.txt say:
*
* "With loose matching of property names, the case distinctions,
* whitespace, and '_' are ignored."
*
*
*
* This function does just that, for ASCII (char *) name strings.
* It is almost identical to ucnv_compareNames() but also ignores
* ASCII White_Space characters (U+0009..U+000d).
*
* @param name1 name to compare
* @param name2 name to compare
* @return 0 if names are equal, < 0 if name1 is less than name2 and > 0
* if name1 is greater than name2.
*/
/* to be implemented in 2.4
* public static int comparePropertyNames(String name1, String name2)
{
int result = 0;
int i1 = 0;
int i2 = 0;
while (true) {
char ch1 = 0;
char ch2 = 0;
// Ignore delimiters '-', '_', and ASCII White_Space
if (i1 < name1.length()) {
ch1 = name1.charAt(i1 ++);
}
while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t'
|| ch1 == '\n' // synwee what is || ch1 == '\v'
|| ch1 == '\f' || ch1=='\r') {
if (i1 < name1.length()) {
ch1 = name1.charAt(i1 ++);
}
else {
ch1 = 0;
}
}
if (i2 < name2.length()) {
ch2 = name2.charAt(i2 ++);
}
while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t'
|| ch2 == '\n' // synwee what is || ch1 == '\v'
|| ch2 == '\f' || ch2=='\r') {
if (i2 < name2.length()) {
ch2 = name2.charAt(i2 ++);
}
else {
ch2 = 0;
}
}
// If we reach the ends of both strings then they match
if (ch1 == 0 && ch2 == 0) {
return 0;
}
// Case-insensitive comparison
if (ch1 != ch2) {
result = Character.toLowerCase(ch1)
- Character.toLowerCase(ch2);
if (result != 0) {
return result;
}
}
}
}
*/
/**
 * Gets the packed maximum values for some enum/int properties.
 * @param column properties vector column: 0 selects m_maxBlockScriptValue_,
 *               2 selects m_maxJTGValue_
 * @return the packed maximum values for that column, or 0 for any other column
 */
public int getMaxValues(int column)
{
    if (column == 0) {
        return m_maxBlockScriptValue_;
    }
    if (column == 2) {
        return m_maxJTGValue_;
    }
    return 0;
}
/**
 * Builds the single-bit mask for a character type.
 * @param type character type, used as the bit position
 * @return 1 shifted left by type
 */
public static final int getMask(int type)
{
    final int mask = 1 << type;
    return mask;
}
/**
 * Returns the digit value (10..35) of a Latin letter 'A'-'Z' or 'a'-'z',
 * in either its ASCII or its fullwidth (U+FF21.., U+FF41..) form.
 * This method assumes that the other digit characters are checked by the
 * calling method.
 * @param ch character to test
 * @return -1 if ch is not such a letter, otherwise its digit value
 *         ('A'/'a' is 10, 'Z'/'z' is 35)
 */
public static int getEuropeanDigit(int ch) {
    final int firstLetter;
    if (ch >= 0x41 && ch <= 0x5a) {
        firstLetter = 0x41;    // 'A'..'Z'
    } else if (ch >= 0x61 && ch <= 0x7a) {
        firstLetter = 0x61;    // 'a'..'z'
    } else if (ch >= 0xff21 && ch <= 0xff3a) {
        firstLetter = 0xff21;  // fullwidth 'A'..'Z'
    } else if (ch >= 0xff41 && ch <= 0xff5a) {
        firstLetter = 0xff41;  // fullwidth 'a'..'z'
    } else {
        return -1;
    }
    return ch - firstLetter + 10;
}
/**
 * Returns the decimal digit value of c.
 * @param c code point to test
 * @return the numeric-type-value of c minus NTV_DECIMAL_START_ when that
 *         difference is at most 9, otherwise -1
 */
public int digit(int c) {
    final int ntv = getNumericTypeValue(getProperty(c));
    final int offset = ntv - NTV_DECIMAL_START_;
    return (offset > 9) ? -1 : offset;
}
public int getNumericValue(int c) {
// slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit()
int ntv = getNumericTypeValue(getProperty(c));
if(ntv==NTV_NONE_) {
return getEuropeanDigit(c);
} else if(ntv>5)-14;
int exp=(ntv&0x1f)+2;
if(exp<9 || (exp==9 && mant<=2)) {
int numValue=mant;
do {
numValue*=10;
} while(--exp>0);
return numValue;
} else {
return -2;
}
} else if(ntv>2)-0xbf;
int exp=(ntv&3)+1;
switch(exp) {
case 4:
numValue*=60*60*60*60;
break;
case 3:
numValue*=60*60*60;
break;
case 2:
numValue*=60*60;
break;
case 1:
numValue*=60;
break;
case 0:
default:
break;
}
return numValue;
} else if(ntv>4)-12;
int denominator=(ntv&0xf)+1;
return (double)numerator/denominator;
} else if(ntv>5)-14;
int exp=(ntv&0x1f)+2;
numValue=mant;
/* multiply by 10^exp without math.h */
while(exp>=4) {
numValue*=10000.;
exp-=4;
}
switch(exp) {
case 3:
numValue*=1000.;
break;
case 2:
numValue*=100.;
break;
case 1:
numValue*=10.;
break;
case 0:
default:
break;
}
return numValue;
} else if(ntv>2)-0xbf;
int exp=(ntv&3)+1;
switch(exp) {
case 4:
numValue*=60*60*60*60;
break;
case 3:
numValue*=60*60*60;
break;
case 2:
numValue*=60*60;
break;
case 1:
numValue*=60;
break;
case 0:
default:
break;
}
return numValue;
} else if(ntv>2);
return (double)numerator/denominator;
} else if(ntv>2);
return (double)numerator/denominator;
} else {
/* reserved */
return UCharacter.NO_NUMERIC_VALUE;
}
}
// protected variables -----------------------------------------------
/**
* Extra property trie
*/
Trie2_16 m_additionalTrie_;
/**
* Extra property vectors, 1st column for age and second for binary
* properties.
*/
int m_additionalVectors_[];
/**
* Number of additional columns
*/
int m_additionalColumnsCount_;
/**
* Maximum values for block, bits used as in vector word
* 0
*/
int m_maxBlockScriptValue_;
/**
* Maximum values for the properties stored in vector word 2
* (this is what getMaxValues(2) returns)
*/
int m_maxJTGValue_;
/** maximum values for other code values */
int m_maxValuesOther_;
/**
* Script_Extensions data
*/
public char[] m_scriptExtensions_;
CodePointTrie m_blockTrie_;
// private variables -------------------------------------------------
/**
* Default name of the datafile
*/
private static final String DATA_FILE_NAME_ = "uprops.icu";
// property data constants -------------------------------------------------
/**
* Numeric types and values in the main properties words.
*/
private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
/** Extracts the numeric type-and-value field from a main properties word. */
private static final int getNumericTypeValue(int props) {
    final int ntv = props >> NUMERIC_TYPE_VALUE_SHIFT_;
    return ntv;
}
/* constants for the storage form of numeric types and values */
/** No numeric value. */
private static final int NTV_NONE_ = 0;
/** Decimal digits: nv=0..9 */
private static final int NTV_DECIMAL_START_ = 1;
/** Other digits: nv=0..9 */
private static final int NTV_DIGIT_START_ = 11;
/** Small integers: nv=0..154 */
private static final int NTV_NUMERIC_START_ = 21;
/** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */
private static final int NTV_FRACTION_START_ = 0xb0;
/**
* Large integers:
* ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
* (only one significant decimal digit)
*/
private static final int NTV_LARGE_START_ = 0x1e0;
/**
* Sexagesimal numbers:
* ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
*/
private static final int NTV_BASE60_START_=0x300;
/**
* Fraction-20 values:
* frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640
* numerator: num = 2*(frac20&3)+1
* denominator: den = 20<<(frac20>>2)
*/
private static final int NTV_FRACTION20_START_ = NTV_BASE60_START_ + 36; // 0x300+9*4=0x324
/**
* Fraction-32 values:
* frac32 = ntv-0x34c = 0..15 -> 1|3|5|7 / 32|64|128|256
* numerator: num = 2*(frac32&3)+1
* denominator: den = 32<<(frac32>>2)
*/
private static final int NTV_FRACTION32_START_ = NTV_FRACTION20_START_ + 24; // 0x324+6*4=0x34c
/** No numeric value (yet). */
private static final int NTV_RESERVED_START_ = NTV_FRACTION32_START_ + 16; // 0x34c+4*4=0x35c
private static final int ntvGetType(int ntv) {
return
(ntv==NTV_NONE_) ? NumericType.NONE :
(ntv expectedTrieLength) {
throw new IOException("uprops.icu: not enough bytes for main trie");
}
// skip padding after trie bytes
ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
// skip unused intervening data structures
ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);
if(m_additionalColumnsCount_ > 0) {
// reads the additional property block
m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
trieLength = m_additionalTrie_.getSerializedLength();
if(trieLength > expectedTrieLength) {
throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
}
// skip padding after trie bytes
ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
// additional properties
int size = scriptExtensionsOffset - additionalVectorsOffset;
m_additionalVectors_ = ICUBinary.getInts(bytes, size, 0);
}
// Script_Extensions
int numChars = (blockTrieOffset - scriptExtensionsOffset) * 2;
if(numChars > 0) {
m_scriptExtensions_ = ICUBinary.getChars(bytes, numChars, 0);
}
// Read the blockTrie.
int partLength = (reservedOffset8 - blockTrieOffset) * 4;
int triePosition = bytes.position();
m_blockTrie_ = CodePointTrie.fromBinary(null, CodePointTrie.ValueWidth.BITS_16, bytes);
trieLength = bytes.position() - triePosition;
if (trieLength > partLength) {
throw new ICUUncheckedIOException("uprops.icu: not enough bytes for blockTrie");
}
ICUBinary.skipBytes(bytes, partLength - trieLength); // skip padding after trie bytes
}
/** Accepts only data whose first version byte is 9. */
private static final class IsAcceptable implements ICUBinary.Authenticate {
    @Override
    public boolean isDataVersionAcceptable(byte[] version) {
        final int formatVersion = version[0];
        return formatVersion == 9;
    }
}
private static final int DATA_FORMAT = 0x5550726F; // "UPro"
// private methods -------------------------------------------------------
/*
* Compare additional properties to see if it has argument type
* @param property 32 bit properties
* @param type character type
* @return true if property has type
*/
/*private boolean compareAdditionalType(int property, int type)
{
return (property & (1 << type)) != 0;
}*/
// property starts for UnicodeSet -------------------------------------- ***
private static final int TAB = 0x0009;
//private static final int LF = 0x000a;
//private static final int FF = 0x000c;
private static final int CR = 0x000d;
private static final int U_A = 0x0041;
private static final int U_F = 0x0046;
private static final int U_Z = 0x005a;
private static final int U_a = 0x0061;
private static final int U_f = 0x0066;
private static final int U_z = 0x007a;
private static final int DEL = 0x007f;
private static final int NL = 0x0085;
private static final int NBSP = 0x00a0;
private static final int CGJ = 0x034f;
private static final int FIGURESP= 0x2007;
private static final int HAIRSP = 0x200a;
//private static final int ZWNJ = 0x200c;
//private static final int ZWJ = 0x200d;
private static final int RLM = 0x200f;
private static final int NNBSP = 0x202f;
private static final int WJ = 0x2060;
private static final int INHSWAP = 0x206a;
private static final int NOMDIG = 0x206f;
private static final int U_FW_A = 0xff21;
private static final int U_FW_F = 0xff26;
private static final int U_FW_Z = 0xff3a;
private static final int U_FW_a = 0xff41;
private static final int U_FW_f = 0xff46;
private static final int U_FW_z = 0xff5a;
private static final int ZWNBSP = 0xfeff;
/**
 * Adds to set every code point at which a property served from the main trie,
 * or one of the hardcoded character tests listed below, may change value.
 *
 * @param set the set to add the range starts to
 * @return set, for chaining
 */
public UnicodeSet addPropertyStarts(UnicodeSet set) {
    /* add the start code point of each same-value range of the main trie */
    // The iterator must be typed: with a raw Iterator, next() yields Object and the
    // assignment to a Trie2.Range below does not compile.
    Iterator<Trie2.Range> trieIterator = m_trie_.iterator();
    Trie2.Range range;
    // Stop at the first range flagged leadSurrogate; only code point ranges are wanted.
    while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
        set.add(range.startCodePoint);
    }

    /* add code points with hardcoded properties, plus the ones following them */

    /* add for u_isblank() */
    set.add(TAB);
    set.add(TAB+1);

    /* add for IS_THAT_CONTROL_SPACE() */
    set.add(CR+1); /* range TAB..CR */
    set.add(0x1c);
    set.add(0x1f+1);
    set.add(NL);
    set.add(NL+1);

    /* add for u_isIDIgnorable() what was not added above */
    set.add(DEL); /* range DEL..NBSP-1, NBSP added below */
    set.add(HAIRSP);
    set.add(RLM+1);
    set.add(INHSWAP);
    set.add(NOMDIG+1);
    set.add(ZWNBSP);
    set.add(ZWNBSP+1);

    /* add no-break spaces for u_isWhitespace() what was not added above */
    set.add(NBSP);
    set.add(NBSP+1);
    set.add(FIGURESP);
    set.add(FIGURESP+1);
    set.add(NNBSP);
    set.add(NNBSP+1);

    /* add for u_charDigitValue() */
    // TODO remove when UCharacter.getHanNumericValue() is changed to just return
    // Unicode numeric values
    set.add(0x3007);
    set.add(0x3008);
    set.add(0x4e00);
    set.add(0x4e01);
    set.add(0x4e8c);
    set.add(0x4e8d);
    set.add(0x4e09);
    set.add(0x4e0a);
    set.add(0x56db);
    set.add(0x56dc);
    set.add(0x4e94);
    set.add(0x4e95);
    set.add(0x516d);
    set.add(0x516e);
    set.add(0x4e03);
    set.add(0x4e04);
    set.add(0x516b);
    set.add(0x516c);
    set.add(0x4e5d);
    set.add(0x4e5e);

    /* add for u_digit() */
    set.add(U_a);
    set.add(U_z+1);
    set.add(U_A);
    set.add(U_Z+1);
    set.add(U_FW_a);
    set.add(U_FW_z+1);
    set.add(U_FW_A);
    set.add(U_FW_Z+1);

    /* add for u_isxdigit() */
    set.add(U_f+1);
    set.add(U_F+1);
    set.add(U_FW_f+1);
    set.add(U_FW_F+1);

    /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
    set.add(WJ); /* range WJ..NOMDIG */
    set.add(0xfff0);
    set.add(0xfffb+1);
    set.add(0xe0000);
    set.add(0xe0fff+1);

    /* add for UCHAR_GRAPHEME_BASE and others */
    set.add(CGJ);
    set.add(CGJ+1);

    return set; // for chaining
}
/**
 * Adds to {@code set} the start code point of each same-value range of the
 * properties vectors trie. Does nothing when there are no additional
 * columns, because the trie may then be absent.
 *
 * @param set the set to add the range starts to
 */
public void upropsvec_addPropertyStarts(UnicodeSet set) {
    /* add the start code point of each same-value range of the properties vectors trie */
    if (m_additionalColumnsCount_ > 0) {
        /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
        // The iterator must be typed: with a raw Iterator, next() yields Object
        // and the assignment to Trie2.Range below does not compile.
        Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();
        Trie2.Range range;
        // Stop at the first range that is flagged as a lead-surrogate range.
        while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
            set.add(range.startCodePoint);
        }
    }
}
/**
 * Forwards to the shared LayoutProps instance to add the property starts
 * for the given property source.
 *
 * @param src the property source identifier, passed through unchanged
 * @param set the set to add the range starts to
 * @return whatever LayoutProps.INSTANCE.addPropertyStarts(src, set) returns
 */
static UnicodeSet ulayout_addPropertyStarts(int src, UnicodeSet set) {
    UnicodeSet result = LayoutProps.INSTANCE.addPropertyStarts(src, set);
    return result;
}
/**
 * Adds the boundaries needed to enumerate the ID_Compat_Math properties.
 *
 * @param set the set to add the boundaries to
 */
static void mathCompat_addPropertyStarts(UnicodeSet set) {
    // ID_COMPAT_MATH_CONTINUE holds range limits: each entry is itself a boundary.
    for (int boundary : ID_COMPAT_MATH_CONTINUE) {
        set.add(boundary);
    }
    // ID_COMPAT_MATH_START holds single characters: add each one and its successor.
    for (int single : ID_COMPAT_MATH_START) {
        int following = single + 1;
        set.add(single);
        set.add(following);
    }
}
/**
 * Adds every entry of MODIFIER_COMBINING_MARK to {@code set}.
 *
 * @param set the set to add the boundaries to
 */
static void mcm_addPropertyStarts(UnicodeSet set) {
    // The table entries are range limits, so each one is added as-is.
    for (int boundary : MODIFIER_COMBINING_MARK) {
        set.add(boundary);
    }
}
/**
 * Adds to {@code set} the first code point of each same-value range of the
 * block trie.
 *
 * @param set the set to add the range starts to
 */
public void ublock_addPropertyStarts(UnicodeSet set) {
    // Block values are stored indexed by the code point shifted right 4 bits
    // (see ublock_getCode()), so trie indexes are shifted back when added.
    final int indexLimit = 0x11000; // (max code point + 1) >> 4
    final CodePointMap.Range span = new CodePointMap.Range();
    for (int index = 0;
            index < indexLimit && m_blockTrie_.getRange(index, null, span);
            index = span.getEnd() + 1) {
        set.add(index << 4);
    }
}
/**
 * Tests whether the code point has the identifier type with the given index.
 *
 * @param c the code point to look up
 * @param typeIndex index into idTypeToEncoded; out-of-range values yield false
 * @return true if the data stored for {@code c} matches the encoded type
 */
public boolean hasIDType(int c, int typeIndex) {
    final boolean indexInRange = 0 <= typeIndex && typeIndex < idTypeToEncoded.length;
    if (!indexInRange) {
        return false;
    }
    final int encoded = idTypeToEncoded[typeIndex];
    final int stored = getAdditional(c, 2) >>> ID_TYPE_SHIFT;
    final boolean isBitType = (encoded & ID_TYPE_BIT) != 0;
    // Bit-encoded types match when the stored value is below ID_TYPE_FORBIDDEN
    // and shares a bit with the encoded type; other types need an exact match.
    return isBitType
            ? stored < ID_TYPE_FORBIDDEN && (stored & encoded) != 0
            : stored == encoded;
}
/**
 * Tests whether the code point has the given identifier type, by
 * delegating to the index-based overload with the type's ordinal.
 *
 * @param c the code point to look up
 * @param type the identifier type to test for
 * @return the result of hasIDType(c, type.ordinal())
 */
public boolean hasIDType(int c, IdentifierType type) {
    final int typeIndex = type.ordinal();
    return hasIDType(c, typeIndex);
}
/**
 * Adds {@code t} to {@code types} if {@code bit} is set in {@code value}.
 *
 * @param value the encoded identifier-type bits to test
 * @param bit the bit mask to test for
 * @param t the type to add when the mask matches
 * @param types the set that receives the type
 */
private static void maybeAddType(int value, int bit, IdentifierType t,
        EnumSet<IdentifierType> types) {
    if ((value & bit) != 0) {
        types.add(t);
    }
}
/**
 * Replaces the contents of {@code types} with the identifier types of the
 * code point, decoded from getAdditional(c, 2) shifted right by ID_TYPE_SHIFT.
 *
 * @param c the code point to look up
 * @param types output set; cleared first, then filled with the types of {@code c}
 * @return the number of types placed in {@code types}
 * @throws IllegalStateException if the stored value is in the single-value
 *         form but matches none of the known single-value types
 */
public int getIDTypes(int c, EnumSet<IdentifierType> types) {
    types.clear();
    int value = getAdditional(c, 2) >>> ID_TYPE_SHIFT;
    if ((value & ID_TYPE_FORBIDDEN) == ID_TYPE_FORBIDDEN || value == ID_TYPE_NOT_CHARACTER) {
        // single value
        IdentifierType t;
        switch (value) {
        case ID_TYPE_NOT_CHARACTER: t = IdentifierType.NOT_CHARACTER; break;
        case ID_TYPE_DEPRECATED: t = IdentifierType.DEPRECATED; break;
        case ID_TYPE_DEFAULT_IGNORABLE: t = IdentifierType.DEFAULT_IGNORABLE; break;
        case ID_TYPE_NOT_NFKC: t = IdentifierType.NOT_NFKC; break;
        case ID_TYPE_INCLUSION: t = IdentifierType.INCLUSION; break;
        case ID_TYPE_RECOMMENDED: t = IdentifierType.RECOMMENDED; break;
        default:
            throw new IllegalStateException(
                    String.format("unknown IdentifierType data value 0x%02x", value));
        }
        types.add(t);
        return 1;
    } else {
        // one or more combinable bits
        maybeAddType(value, ID_TYPE_NOT_XID, IdentifierType.NOT_XID, types);
        maybeAddType(value, ID_TYPE_EXCLUSION, IdentifierType.EXCLUSION, types);
        maybeAddType(value, ID_TYPE_OBSOLETE, IdentifierType.OBSOLETE, types);
        maybeAddType(value, ID_TYPE_TECHNICAL, IdentifierType.TECHNICAL, types);
        maybeAddType(value, ID_TYPE_UNCOMMON_USE, IdentifierType.UNCOMMON_USE, types);
        maybeAddType(value, ID_TYPE_LIMITED_USE, IdentifierType.LIMITED_USE, types);
        return types.size();
    }
}
// This static initializer block must be placed after
// other static member initialization
static {
    try {
        INSTANCE = new UCharacterProperty();
    }
    catch (IOException e) {
        // MissingResourceException has no public constructor taking a cause,
        // so attach the IOException explicitly to keep its stack trace.
        MissingResourceException missing = new MissingResourceException(e.getMessage(), "", "");
        missing.initCause(e);
        throw missing;
    }
}
/*----------------------------------------------------------------
* Inclusions list
*----------------------------------------------------------------*/
/*
* Return a set of characters for property enumeration.
* The set implicitly contains 0x110000 as well, which is one more than the highest
* Unicode code point.
*
* This set is used as an ordered list - its code points are ordered, and
* consecutive code points (in Unicode code point order) in the set define a range.
* For each two consecutive characters (start, limit) in the set,
* all of the UCD/normalization and related properties for
* all code points start..limit-1 are all the same,
* except for character names and ISO comments.
*
* All Unicode code points U+0000..U+10ffff are covered by these ranges.
* The ranges define a partition of the Unicode code space.
* ICU uses the inclusions set to enumerate properties for generating
* UnicodeSets containing all code points that have a certain property value.
*
* The Inclusion List is generated from the UCD. It is generated
* by enumerating the data tries, and code points for hardcoded properties
* are added as well.
*
* --------------------------------------------------------------------------
*
* The following are ideas for getting properties-unique code point ranges,
* with possible optimizations beyond the current implementation.
* These optimizations would require more code and be more fragile.
* The current implementation generates one single list (set) for all properties.
*
* To enumerate properties efficiently, one needs to know ranges of
* repetitive values, so that the value of only each start code point
* can be applied to the whole range.
* This information is in principle available in the uprops.icu/unorm.icu data.
*
* There are two obstacles:
*
* 1. Some properties are computed from multiple data structures,
* making it necessary to get repetitive ranges by intersecting
* ranges from multiple tries.
*
* 2. It is not economical to write code for getting repetitive ranges
* that are precise for each of some 50 properties.
*
* Compromise ideas:
*
* - Get ranges per trie, not per individual property.
* Each range contains the same values for a whole group of properties.
* This would generate currently five range sets, two for uprops.icu tries
* and three for unorm.icu tries.
*
* - Combine sets of ranges for multiple tries to get sufficient sets
* for properties, e.g., the uprops.icu main and auxiliary tries
* for all non-normalization properties.
*
* Ideas for representing ranges and combining them:
*
* - A UnicodeSet could hold just the start code points of ranges.
* Multiple sets are easily combined by or-ing them together.
*
* - Alternatively, a UnicodeSet could hold each even-numbered range.
* All ranges could be enumerated by using each start code point
* (for the even-numbered ranges) as well as each limit (end+1) code point
* (for the odd-numbered ranges).
* It should be possible to combine two such sets by xor-ing them,
* but no more than two.
*
* The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
* but the first one is certainly simpler and applicable for combining more than
* two range sets.
*
* It is possible to combine all range sets for all uprops/unorm tries into one
* set that can be used for all properties.
* As an optimization, there could be less-combined range sets for certain
* groups of properties.
* The relationship of which less-combined range set to use for which property
* depends on the implementation of the properties and must be hardcoded
* - somewhat error-prone and higher maintenance but can be tested easily
* by building property sets "the simple way" in test code.
*
* ---
*
* Do not use a UnicodeSet pattern because that causes infinite recursion;
* UnicodeSet depends on the inclusions set.
*
* ---
*
* getInclusions() is commented out starting 2005-feb-12 because
* UnicodeSet now calls the uxyz_addPropertyStarts() directly,
* and only for the relevant property source.
*/
/*
public UnicodeSet getInclusions() {
UnicodeSet set = new UnicodeSet();
NormalizerImpl.addPropertyStarts(set);
addPropertyStarts(set);
return set;
}
*/
}