// com.ibm.icu.impl.CaseMapImpl -- Maven / Gradle / Ivy
// Show all versions of icu4j. Show documentation.
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl;
import java.io.IOException;
import java.text.CharacterIterator;
import java.util.Locale;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.Edits;
import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.ULocale;
public final class CaseMapImpl {
/**
 * Implementation of UCaseProps.ContextIterator, iterates over a String.
 * See ustrcase.c/utf16_caseContextIterator().
 *
 * <p>The iterator keeps two independent positions: the current code point
 * [cpStart..cpLimit[ that is being case-mapped (advanced by {@link #nextCaseMapCP()}),
 * and a context index that {@link #next()} moves away from that code point
 * after {@link #reset(int)} selected a direction.
 */
public static final class StringContextIterator implements UCaseProps.ContextIterator {
    /**
     * Constructor.
     * @param src String to iterate over.
     */
    public StringContextIterator(CharSequence src) {
        this.s=src;
        limit=src.length();
        cpStart=cpLimit=index=0;
        dir=0;
    }

    /**
     * Constructor.
     * @param src String to iterate over.
     * @param cpStart Start index of the current code point.
     * @param cpLimit Limit index of the current code point.
     */
    public StringContextIterator(CharSequence src, int cpStart, int cpLimit) {
        s = src;
        index = 0;
        limit = src.length();
        this.cpStart = cpStart;
        this.cpLimit = cpLimit;
        dir = 0;
    }

    /**
     * Set the iteration limit for nextCaseMapCP() to an index within the string.
     * If the limit parameter is negative or past the string, then the
     * string length is restored as the iteration limit.
     *
     * <p>This limit does not affect the next() function which always
     * iterates to the very end of the string.
     *
     * @param lim The iteration limit.
     */
    public void setLimit(int lim) {
        if(0<=lim && lim<=s.length()) {
            limit=lim;
        } else {
            limit=s.length();
        }
    }

    /**
     * Move to the iteration limit without fetching code points up to there.
     */
    public void moveToLimit() {
        cpStart=cpLimit=limit;
    }

    /**
     * Iterate forward through the string to fetch the next code point
     * to be case-mapped, and set the context indexes for it.
     *
     * <p>When the iteration limit is reached (and -1 is returned),
     * getCPStart() will be at the iteration limit.
     *
     * <p>Iteration with next() does not affect the position for nextCaseMapCP().
     *
     * @return The next code point to be case-mapped, or &lt;0 when the iteration is done.
     */
    public int nextCaseMapCP() {
        cpStart=cpLimit;
        if(cpLimit<limit) {
            int c=Character.codePointAt(s, cpLimit);
            cpLimit+=Character.charCount(c);
            return c;
        } else {
            return -1;
        }
    }

    /**
     * Sets the current code point boundaries directly and puts the context
     * iteration back into its initial (no direction) state.
     *
     * @param s Start index of the current code point.
     * @param l Limit index of the current code point.
     */
    public void setCPStartAndLimit(int s, int l) {
        cpStart = s;
        cpLimit = l;
        dir = 0;
    }

    /**
     * @return The start index of the current code point.
     */
    public int getCPStart() {
        return cpStart;
    }

    /**
     * @return The limit index of the current code point.
     */
    public int getCPLimit() {
        return cpLimit;
    }

    /**
     * @return The length, in chars, of the current code point.
     */
    public int getCPLength() {
        return cpLimit-cpStart;
    }

    // implement UCaseProps.ContextIterator

    /**
     * Starts context iteration away from the current code point.
     *
     * @param direction &gt;0 to iterate forward from cpLimit,
     *                  &lt;0 to iterate backward from cpStart.
     */
    @Override
    public void reset(int direction) {
        if(direction>0) {
            /* reset for forward iteration */
            dir=1;
            index=cpLimit;
        } else if(direction<0) {
            /* reset for backward iteration */
            dir=-1;
            index=cpStart;
        } else {
            // not a valid direction
            dir=0;
            index=0;
        }
    }

    /**
     * Returns the next context code point in the direction selected by reset().
     * Forward iteration runs to the end of the string regardless of setLimit().
     *
     * @return The next code point, or -1 if there is none or no direction is set.
     */
    @Override
    public int next() {
        int c;

        if(dir>0 && index<s.length()) {
            c=Character.codePointAt(s, index);
            index+=Character.charCount(c);
            return c;
        } else if(dir<0 && index>0) {
            c=Character.codePointBefore(s, index);
            index-=Character.charCount(c);
            return c;
        }
        return -1;
    }

    // variables
    protected CharSequence s;
    protected int index, limit, cpStart, cpLimit;
    protected int dir; // 0=initial state  >0=forward  <0=backward
}
/**
 * Titlecasing iterator option: getTitleBreakIterator() answers this option with a
 * WholeStringBreakIterator, i.e. the whole string is treated as a single segment.
 */
public static final int TITLECASE_WHOLE_STRING = 0x20;
/**
 * Titlecasing iterator option: getTitleBreakIterator() answers this option with a
 * sentence break iterator.
 */
public static final int TITLECASE_SENTENCES = 0x40;
/**
 * Bit mask for the titlecasing iterator options bit field.
 * Currently only 3 out of 8 values are used:
 * 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES.
 * See stringoptions.h.
 * @internal
 */
private static final int TITLECASE_ITERATOR_MASK = 0xe0;
/**
 * Titlecasing index adjustment option bit (one of the bits in TITLECASE_ADJUSTMENT_MASK).
 * NOTE(review): by its name this moves the titlecasing index to the next cased
 * character -- confirm against the titlecasing loop.
 */
public static final int TITLECASE_ADJUST_TO_CASED = 0x400;
/**
 * Bit mask for the titlecasing index adjustment options bit set.
 * Currently two bits are defined:
 * TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED.
 * See stringoptions.h.
 * @internal
 */
private static final int TITLECASE_ADJUSTMENT_MASK = 0x600;
/**
 * Adds a titlecasing index adjustment option to an options bit set.
 *
 * @param options the current options bit set
 * @param newOption the adjustment option to add
 * @return options with newOption set
 * @throws IllegalArgumentException if options already carries a different
 *         index adjustment option
 */
public static int addTitleAdjustmentOption(int options, int newOption) {
    final int existing = options & TITLECASE_ADJUSTMENT_MASK;
    final boolean conflicting = existing != 0 && existing != newOption;
    if (conflicting) {
        throw new IllegalArgumentException("multiple titlecasing index adjustment options");
    }
    return newOption | options;
}
/**
 * Bit set of general categories, one bit per UCharacterCategory value (1 << category),
 * that isLNS() accepts unconditionally: letters (except modifier letters),
 * numbers, symbols, and private use.
 */
private static final int LNS =
(1 << UCharacterCategory.UPPERCASE_LETTER) |
(1 << UCharacterCategory.LOWERCASE_LETTER) |
(1 << UCharacterCategory.TITLECASE_LETTER) |
// Not MODIFIER_LETTER: We count only cased modifier letters.
(1 << UCharacterCategory.OTHER_LETTER) |
(1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) |
(1 << UCharacterCategory.LETTER_NUMBER) |
(1 << UCharacterCategory.OTHER_NUMBER) |
(1 << UCharacterCategory.MATH_SYMBOL) |
(1 << UCharacterCategory.CURRENCY_SYMBOL) |
(1 << UCharacterCategory.MODIFIER_SYMBOL) |
(1 << UCharacterCategory.OTHER_SYMBOL) |
(1 << UCharacterCategory.PRIVATE_USE);
/**
 * Tests whether a code point is a letter, number, symbol, or private use code point
 * (private use because those are typically used as letters or numbers).
 * A modifier letter qualifies only if it is cased.
 *
 * @param c the code point to test
 * @return true if c is in one of the LNS categories, or is a cased modifier letter
 */
private static boolean isLNS(int c) {
    final int category = UCharacterProperty.INSTANCE.getType(c);
    if ((LNS & (1 << category)) != 0) {
        return true;
    }
    if (category != UCharacterCategory.MODIFIER_LETTER) {
        return false;
    }
    // Modifier letter: accept it only when it has a case type.
    return UCaseProps.NONE != UCaseProps.INSTANCE.getType(c);
}
/**
 * Adds a titlecasing iterator option to an options bit set.
 *
 * @param options the current options bit set
 * @param newOption the iterator option to add
 * @return options with newOption set
 * @throws IllegalArgumentException if options already carries a different
 *         iterator option
 */
public static int addTitleIteratorOption(int options, int newOption) {
    final int existing = options & TITLECASE_ITERATOR_MASK;
    final boolean conflicting = existing != 0 && existing != newOption;
    if (conflicting) {
        throw new IllegalArgumentException("multiple titlecasing iterator options");
    }
    return newOption | options;
}
/**
 * Returns the break iterator to use for titlecasing.
 *
 * @param locale locale for a newly created word or sentence iterator
 * @param options options bit set; only the titlecasing iterator bits are examined
 * @param iter an explicit iterator, or null to create one according to the options
 * @return iter if it is not null, otherwise a new iterator selected by the options
 * @throws IllegalArgumentException if an iterator option is combined with an explicit
 *         iterator, or if the iterator option value is unknown
 */
public static BreakIterator getTitleBreakIterator(
        Locale locale, int options, BreakIterator iter) {
    final int iterOption = options & TITLECASE_ITERATOR_MASK;
    if (iter != null) {
        if (iterOption != 0) {
            throw new IllegalArgumentException(
                    "titlecasing iterator option together with an explicit iterator");
        }
        return iter;
    }
    if (iterOption == 0) {
        return BreakIterator.getWordInstance(locale);
    }
    if (iterOption == TITLECASE_WHOLE_STRING) {
        return new WholeStringBreakIterator();
    }
    if (iterOption == TITLECASE_SENTENCES) {
        return BreakIterator.getSentenceInstance(locale);
    }
    throw new IllegalArgumentException("unknown titlecasing iterator option");
}
/**
 * Returns the break iterator to use for titlecasing (ULocale variant).
 *
 * @param locale locale for a newly created word or sentence iterator
 * @param options options bit set; only the titlecasing iterator bits are examined
 * @param iter an explicit iterator, or null to create one according to the options
 * @return iter if it is not null, otherwise a new iterator selected by the options
 * @throws IllegalArgumentException if an iterator option is combined with an explicit
 *         iterator, or if the iterator option value is unknown
 */
public static BreakIterator getTitleBreakIterator(
        ULocale locale, int options, BreakIterator iter) {
    final int iterOption = options & TITLECASE_ITERATOR_MASK;
    if (iter != null) {
        if (iterOption != 0) {
            throw new IllegalArgumentException(
                    "titlecasing iterator option together with an explicit iterator");
        }
        return iter;
    }
    if (iterOption == 0) {
        return BreakIterator.getWordInstance(locale);
    }
    if (iterOption == TITLECASE_WHOLE_STRING) {
        return new WholeStringBreakIterator();
    }
    if (iterOption == TITLECASE_SENTENCES) {
        return BreakIterator.getSentenceInstance(locale);
    }
    throw new IllegalArgumentException("unknown titlecasing iterator option");
}
/**
 * Omit unchanged text when case-mapping with Edits.
 * When this bit is set, appendResult() and appendUnchanged() still record
 * unchanged spans in the Edits but do not copy them to the destination.
 */
public static final int OMIT_UNCHANGED_TEXT = 0x4000;
/**
 * Minimal break iterator that treats the whole text as one segment:
 * first() is 0 and next() is the text length.
 * Only first(), next() and the setText() overloads are supported;
 * every other operation throws UnsupportedOperationException.
 */
private static final class WholeStringBreakIterator extends BreakIterator {
    /** Length of the most recently set text. */
    private int textLength;

    private static void notImplemented() {
        throw new UnsupportedOperationException("should not occur");
    }

    // ---- supported operations ----

    @Override
    public void setText(String newText) {
        textLength = newText.length();
    }

    @Override
    public void setText(CharSequence newText) {
        textLength = newText.length();
    }

    @Override
    public void setText(CharacterIterator newText) {
        textLength = newText.getEndIndex();
    }

    @Override
    public int first() {
        return 0;
    }

    @Override
    public int next() {
        return textLength;
    }

    // ---- unsupported operations ----

    @Override
    public int last() {
        notImplemented();
        return 0;
    }

    @Override
    public int next(int n) {
        notImplemented();
        return 0;
    }

    @Override
    public int previous() {
        notImplemented();
        return 0;
    }

    @Override
    public int following(int offset) {
        notImplemented();
        return 0;
    }

    @Override
    public int current() {
        notImplemented();
        return 0;
    }

    @Override
    public CharacterIterator getText() {
        notImplemented();
        return null;
    }
}
/**
 * Appends one code point to an Appendable as UTF-16.
 *
 * @param a the destination
 * @param c the code point to append
 * @return the number of chars appended: 1, or 2 for a surrogate pair
 * @throws IOException if a.append() fails
 */
private static int appendCodePoint(Appendable a, int c) throws IOException {
    final boolean needsSurrogatePair = c > Character.MAX_VALUE;
    if (!needsSurrogatePair) {
        a.append((char)c);
        return 1;
    }
    // 0xd7c0 is the lead surrogate base 0xd800 minus (0x10000 >> 10).
    final char lead = (char)((c >> 10) + 0xd7c0);
    final char trail = (char)((c & 0x3ff) + Character.MIN_LOW_SURROGATE);
    a.append(lead);
    a.append(trail);
    return 2;
}
/**
 * Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}.
 *
 * <p>The result is decoded as follows: a negative value is the bitwise complement of
 * the unchanged original code point; a value up to MAX_STRING_LENGTH is the length of
 * a string mapping that is already in dest; any larger value is a single code point
 * that replaces the original.
 *
 * @param result the encoded mapping result
 * @param dest the destination
 * @param cpLength the length, in chars, of the source code point
 * @param options options bit set; OMIT_UNCHANGED_TEXT suppresses copying an unchanged code point
 * @param edits if not null, records the unchanged or replaced span
 * @throws IOException if dest.append() fails
 */
private static void appendResult(int result, Appendable dest,
        int cpLength, int options, Edits edits) throws IOException {
    final boolean recordEdits = edits != null;
    if (result < 0) {
        // Unchanged: ~result is the original code point.
        if (recordEdits) {
            edits.addUnchanged(cpLength);
        }
        if ((options & OMIT_UNCHANGED_TEXT) == 0) {
            appendCodePoint(dest, ~result);
        }
        return;
    }
    if (result <= UCaseProps.MAX_STRING_LENGTH) {
        // String mapping: its chars are already in dest, result is their count.
        if (recordEdits) {
            edits.addReplace(cpLength, result);
        }
        return;
    }
    // Single code point mapping.
    final int written = appendCodePoint(dest, result);
    if (recordEdits) {
        edits.addReplace(cpLength, written);
    }
}
/**
 * Records and, unless OMIT_UNCHANGED_TEXT is set, copies a span of unchanged source text.
 * Does nothing if length is not positive.
 *
 * @param src the source text
 * @param start start index of the unchanged span
 * @param length length of the unchanged span
 * @param dest the destination
 * @param options options bit set
 * @param edits if not null, records the unchanged span
 * @throws IOException if dest.append() fails
 */
private static final void appendUnchanged(CharSequence src, int start, int length,
        Appendable dest, int options, Edits edits) throws IOException {
    if (length <= 0) {
        return;
    }
    if (edits != null) {
        edits.addUnchanged(length);
    }
    final boolean omit = (options & OMIT_UNCHANGED_TEXT) != 0;
    if (!omit) {
        dest.append(src, start, start + length);
    }
}
/**
 * Builds the result string from the source text, the collected replacement chars,
 * and the Edits that tie them together.
 *
 * @param src the source text
 * @param replacementChars the concatenated replacement text of all changes
 * @param edits the recorded edits
 * @return src.toString() if there are no changes, otherwise the edited string
 */
private static String applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits) {
    if (!edits.hasChanges()) {
        return src.toString();
    }
    final StringBuilder out = new StringBuilder(src.length() + edits.lengthDelta());
    final Edits.Iterator span = edits.getCoarseIterator();
    while (span.next()) {
        if (!span.hasChange()) {
            // Unchanged span: copy from the source.
            final int from = span.sourceIndex();
            out.append(src, from, from + span.oldLength());
        } else {
            // Changed span: copy from the replacement chars.
            final int from = span.replacementIndex();
            out.append(replacementChars, from, from + span.newLength());
        }
    }
    return out.toString();
}
/** Case properties trie, used by the fast paths of internalToLower() and internalToUpper(). */
private static final Trie2_16 CASE_TRIE = UCaseProps.getTrie();
/**
 * Lowercases or case-folds a range of src and writes the result to dest.
 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
 * caseLocale < 0: Case-folds [srcStart..srcLimit[.
 *
 * @param caseLocale a UCaseProps.LOC_* value for lowercasing, or a negative value for case folding
 * @param options options bit set; passed on to the append helpers and,
 *        when folding, to UCaseProps.toFullFolding()
 * @param src the source text
 * @param srcStart start index of the range to map
 * @param srcLimit limit index (exclusive) of the range to map
 * @param iter context iterator to reuse when lowercasing, or null to create one on demand
 * @param dest the destination
 * @param edits if not null, records unchanged and replaced spans
 * @throws IOException if dest.append() fails
 */
private static void internalToLower(int caseLocale, int options,
CharSequence src, int srcStart, int srcLimit, StringContextIterator iter,
Appendable dest, Edits edits) throws IOException {
// Pick the Latin fast-path table: the normal table for the root locale, for lowercasing
// in any locale other than Turkish/Lithuanian, and for default case folding;
// the TR_LT table otherwise.
byte[] latinToLower;
if (caseLocale == UCaseProps.LOC_ROOT ||
(caseLocale >= 0 ?
!(caseLocale == UCaseProps.LOC_TURKISH || caseLocale == UCaseProps.LOC_LITHUANIAN) :
(options & UCaseProps.FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT)) {
latinToLower = UCaseProps.LatinCase.TO_LOWER_NORMAL;
} else {
latinToLower = UCaseProps.LatinCase.TO_LOWER_TR_LT;
}
// [prev..srcIndex[ is source text that has been scanned but not yet written or recorded.
int prev = srcStart;
int srcIndex = srcStart;
outerLoop:
for (;;) {
// fast path for simple cases
char lead;
for (;;) {
if (srcIndex >= srcLimit) {
break outerLoop;
}
lead = src.charAt(srcIndex);
int delta;
if (lead < UCaseProps.LatinCase.LONG_S) {
// Table lookup: EXC sends the char to the slow path, 0 means unchanged.
byte d = latinToLower[lead];
if (d == UCaseProps.LatinCase.EXC) { break; }
++srcIndex;
if (d == 0) { continue; }
delta = d;
} else if (lead >= 0xd800) {
break; // surrogate or higher
} else {
// Single BMP char below the surrogates: use the trie properties directly
// unless they carry an exception, which needs the slow path.
int props = CASE_TRIE.getFromU16SingleLead(lead);
if (UCaseProps.propsHasException(props)) { break; }
++srcIndex;
if (!UCaseProps.isUpperOrTitleFromProps(props) ||
(delta = UCaseProps.getDelta(props)) == 0) {
continue;
}
}
// Simple one-char mapping. srcIndex is already past the char, so the pending
// unchanged run ends at srcIndex - 1.
lead += delta;
appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
dest.append(lead);
if (edits != null) {
edits.addReplace(1, 1);
}
prev = srcIndex;
}
// slow path
int cpStart = srcIndex++;
char trail;
int c;
// Assemble a supplementary code point from a surrogate pair if there is one within range.
if (Character.isHighSurrogate(lead) && srcIndex < srcLimit &&
Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
c = Character.toCodePoint(lead, trail);
++srcIndex;
} else {
c = lead;
}
// We need to append unchanged text before calling the UCaseProps.toFullXyz() methods
// because they will sometimes append their mapping to dest,
// and that must be after copying the previous text.
appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
prev = cpStart;
if (caseLocale >= 0) {
// Lowercasing needs context: point the (lazily created) iterator at this code point.
if (iter == null) {
iter = new StringContextIterator(src, cpStart, srcIndex);
} else {
iter.setCPStartAndLimit(cpStart, srcIndex);
}
c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
} else {
c = UCaseProps.INSTANCE.toFullFolding(c, dest, options);
}
// A negative result is not written here: prev stays at cpStart so that this
// code point is flushed together with the following unchanged text.
if (c >= 0) {
appendResult(c, dest, srcIndex - cpStart, options, edits);
prev = srcIndex;
}
}
// Flush the trailing unchanged run.
appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
}
/**
 * Uppercases all of src and writes the result to dest.
 *
 * @param caseLocale a UCaseProps.LOC_* value; Turkish selects a different Latin fast-path table
 * @param options options bit set, passed on to the append helpers
 * @param src the source text
 * @param dest the destination
 * @param edits if not null, records unchanged and replaced spans
 * @throws IOException if dest.append() fails
 */
private static void internalToUpper(int caseLocale, int options,
CharSequence src, Appendable dest, Edits edits) throws IOException {
// Context iterator for the slow path, created on first use.
StringContextIterator iter = null;
byte[] latinToUpper;
if (caseLocale == UCaseProps.LOC_TURKISH) {
latinToUpper = UCaseProps.LatinCase.TO_UPPER_TR;
} else {
latinToUpper = UCaseProps.LatinCase.TO_UPPER_NORMAL;
}
// [prev..srcIndex[ is source text that has been scanned but not yet written or recorded.
int prev = 0;
int srcIndex = 0;
int srcLength = src.length();
outerLoop:
for (;;) {
// fast path for simple cases
char lead;
for (;;) {
if (srcIndex >= srcLength) {
break outerLoop;
}
lead = src.charAt(srcIndex);
int delta;
if (lead < UCaseProps.LatinCase.LONG_S) {
// Table lookup: EXC sends the char to the slow path, 0 means unchanged.
byte d = latinToUpper[lead];
if (d == UCaseProps.LatinCase.EXC) { break; }
++srcIndex;
if (d == 0) { continue; }
delta = d;
} else if (lead >= 0xd800) {
break; // surrogate or higher
} else {
// Single BMP char below the surrogates: use the trie properties directly
// unless they carry an exception, which needs the slow path.
int props = CASE_TRIE.getFromU16SingleLead(lead);
if (UCaseProps.propsHasException(props)) { break; }
++srcIndex;
if (UCaseProps.getTypeFromProps(props) != UCaseProps.LOWER ||
(delta = UCaseProps.getDelta(props)) == 0) {
continue;
}
}
// Simple one-char mapping. srcIndex is already past the char, so the pending
// unchanged run ends at srcIndex - 1.
lead += delta;
appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
dest.append(lead);
if (edits != null) {
edits.addReplace(1, 1);
}
prev = srcIndex;
}
// slow path
int cpStart = srcIndex++;
char trail;
int c;
// Assemble a supplementary code point from a surrogate pair if there is one.
if (Character.isHighSurrogate(lead) && srcIndex < srcLength &&
Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
c = Character.toCodePoint(lead, trail);
++srcIndex;
} else {
c = lead;
}
if (iter == null) {
iter = new StringContextIterator(src, cpStart, srcIndex);
} else {
iter.setCPStartAndLimit(cpStart, srcIndex);
}
// We need to append unchanged text before calling UCaseProps.toFullUpper()
// because it will sometimes append its mapping to dest,
// and that must be after copying the previous text.
appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
prev = cpStart;
c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale);
// A negative result is not written here: prev stays at cpStart so that this
// code point is flushed together with the following unchanged text.
if (c >= 0) {
appendResult(c, dest, srcIndex - cpStart, options, edits);
prev = srcIndex;
}
}
// Flush the trailing unchanged run.
appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
}
/**
 * Lowercases (caseLocale >= 0) or case-folds (caseLocale < 0) src into a new String.
 *
 * <p>For short input without OMIT_UNCHANGED_TEXT, only the changes are collected and
 * then applied to the source; otherwise the output is built directly.
 *
 * @param caseLocale a UCaseProps.LOC_* value, or negative for case folding
 * @param options options bit set
 * @param src the source text
 * @return the mapped string
 */
public static String toLower(int caseLocale, int options, CharSequence src) {
    final boolean collectChangesOnly =
            src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0;
    if (!collectChangesOnly) {
        StringBuilder out = new StringBuilder(src.length());
        return toLower(caseLocale, options, src, out, null).toString();
    }
    if (src.length() == 0) {
        return src.toString();
    }
    // Collect and apply only changes.
    // Good if no or few changes. Bad (slow) if many changes.
    final Edits changes = new Edits();
    final StringBuilder replacements = toLower(
            caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), changes);
    return applyEdits(src, replacements, changes);
}
/**
 * Lowercases (caseLocale >= 0) or case-folds (caseLocale < 0) src into dest.
 *
 * @param <A> the destination type
 * @param caseLocale a UCaseProps.LOC_* value, or negative for case folding
 * @param options options bit set
 * @param src the source text
 * @param dest the destination
 * @param edits if not null, is reset and then records the edits
 * @return dest
 * @throws ICUUncheckedIOException if dest.append() throws an IOException
 */
public static <A extends Appendable> A toLower(int caseLocale, int options,
        CharSequence src, A dest, Edits edits) {
    try {
        if (edits != null) {
            edits.reset();
        }
        internalToLower(caseLocale, options, src, 0, src.length(), null, dest, edits);
        return dest;
    } catch (IOException e) {
        throw new ICUUncheckedIOException(e);
    }
}
/**
 * Uppercases src into a new String.
 *
 * <p>For short input without OMIT_UNCHANGED_TEXT, only the changes are collected and
 * then applied to the source; otherwise the output is built directly.
 *
 * @param caseLocale a UCaseProps.LOC_* value
 * @param options options bit set
 * @param src the source text
 * @return the uppercased string
 */
public static String toUpper(int caseLocale, int options, CharSequence src) {
    final boolean collectChangesOnly =
            src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0;
    if (!collectChangesOnly) {
        StringBuilder out = new StringBuilder(src.length());
        return toUpper(caseLocale, options, src, out, null).toString();
    }
    if (src.length() == 0) {
        return src.toString();
    }
    // Collect and apply only changes.
    // Good if no or few changes. Bad (slow) if many changes.
    final Edits changes = new Edits();
    final StringBuilder replacements = toUpper(
            caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), changes);
    return applyEdits(src, replacements, changes);
}
/**
 * Uppercases src into dest. Greek is delegated to GreekUpper.
 *
 * @param <A> the destination type
 * @param caseLocale a UCaseProps.LOC_* value
 * @param options options bit set
 * @param src the source text
 * @param dest the destination
 * @param edits if not null, is reset and then records the edits
 * @return dest
 * @throws ICUUncheckedIOException if dest.append() throws an IOException
 */
public static <A extends Appendable> A toUpper(int caseLocale, int options,
        CharSequence src, A dest, Edits edits) {
    try {
        if (edits != null) {
            edits.reset();
        }
        if (caseLocale == UCaseProps.LOC_GREEK) {
            return GreekUpper.toUpper(options, src, dest, edits);
        }
        internalToUpper(caseLocale, options, src, dest, edits);
        return dest;
    } catch (IOException e) {
        throw new ICUUncheckedIOException(e);
    }
}
/**
 * Titlecases src into a new String.
 *
 * <p>For short input without OMIT_UNCHANGED_TEXT, only the changes are collected and
 * then applied to the source; otherwise the output is built directly.
 *
 * @param caseLocale a UCaseProps.LOC_* value
 * @param options options bit set
 * @param iter the break iterator that delimits the segments to titlecase
 * @param src the source text
 * @return the titlecased string
 */
public static String toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src) {
    final boolean collectChangesOnly =
            src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0;
    if (!collectChangesOnly) {
        StringBuilder out = new StringBuilder(src.length());
        return toTitle(caseLocale, options, iter, src, out, null).toString();
    }
    if (src.length() == 0) {
        return src.toString();
    }
    // Collect and apply only changes.
    // Good if no or few changes. Bad (slow) if many changes.
    final Edits changes = new Edits();
    final StringBuilder replacements = toTitle(
            caseLocale, options | OMIT_UNCHANGED_TEXT, iter, src,
            new StringBuilder(), changes);
    return applyEdits(src, replacements, changes);
}
public static A toTitle(
int caseLocale, int options, BreakIterator titleIter,
CharSequence src, A dest, Edits edits) {
try {
if (edits != null) {
edits.reset();
}
/* set up local variables */
StringContextIterator iter = new StringContextIterator(src);
int srcLength = src.length();
int prev=0;
boolean isFirstIndex=true;
/* titlecasing loop */
while(prevsrcLength) {
index=srcLength;
}
/*
* Segment [prev..index[ into 3 parts:
* a) skipped characters (copy as-is) [prev..titleStart[
* b) first letter (titlecase) [titleStart..titleLimit[
* c) subsequent characters (lowercase) [titleLimit..index[
*/
if(prev=0) {}
// If c<0 then we have only uncased characters in [prev..index[
// and stopped with titleStart==titleLimit==index.
titleStart=iter.getCPStart();
if (prev < titleStart) {
appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
}
}
if(titleStart A fold(int options,
CharSequence src, A dest, Edits edits) {
try {
if (edits != null) {
edits.reset();
}
internalToLower(-1, options, src, 0, src.length(), null, dest, edits);
return dest;
} catch (IOException e) {
throw new ICUUncheckedIOException(e);
}
}
private static final class GreekUpper {
// Data bits.
// Each letter data entry (see the data tables and getLetterData()) combines a
// code point in its low bits with the HAS_* flags below.
// Mask for the code point part of a data entry.
private static final int UPPER_MASK = 0x3ff;
private static final int HAS_VOWEL = 0x1000;
private static final int HAS_YPOGEGRAMMENI = 0x2000;
private static final int HAS_ACCENT = 0x4000;
private static final int HAS_DIALYTIKA = 0x8000;
// Further bits during data building and processing, not stored in the data map.
// (They do not fit into the char-typed table entries; getDiacriticData() returns them.)
private static final int HAS_COMBINING_DIALYTIKA = 0x10000;
private static final int HAS_OTHER_GREEK_DIACRITIC = 0x20000;
// Convenience combinations of the flags above.
private static final int HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
private static final int HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
private static final int HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;
// State bits.
// NOTE(review): presumably the state of the Greek uppercasing state machine;
// its code is not visible in this section -- confirm.
private static final int AFTER_CASED = 1;
private static final int AFTER_VOWEL_WITH_ACCENT = 2;
// Data generated by prototype code, see
// http://site.icu-project.org/design/case/greek-upper
// TODO: Move this data into ucase.icu.
//
// Letter data for U+0370..03FF, indexed by (c - 0x370); see getLetterData().
// Each entry is a code point OR'ed with HAS_* flag bits; 0 means no data.
// The trailing comment on a line shows the character that the entry belongs to.
private static final char[] data0370 = {
// U+0370..03FF
0x0370, // Ͱ
0x0370, // ͱ
0x0372, // Ͳ
0x0372, // ͳ
0,
0,
0x0376, // Ͷ
0x0376, // ͷ
0,
0,
0x037A, // ͺ
0x03FD, // ͻ
0x03FE, // ͼ
0x03FF, // ͽ
0,
0x037F, // Ϳ
0,
0,
0,
0,
0,
0,
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ά
0,
0x0395 | HAS_VOWEL | HAS_ACCENT, // Έ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ή
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ί
0,
0x039F | HAS_VOWEL | HAS_ACCENT, // Ό
0,
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ύ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ώ
0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΐ
0x0391 | HAS_VOWEL, // Α
0x0392, // Β
0x0393, // Γ
0x0394, // Δ
0x0395 | HAS_VOWEL, // Ε
0x0396, // Ζ
0x0397 | HAS_VOWEL, // Η
0x0398, // Θ
0x0399 | HAS_VOWEL, // Ι
0x039A, // Κ
0x039B, // Λ
0x039C, // Μ
0x039D, // Ν
0x039E, // Ξ
0x039F | HAS_VOWEL, // Ο
0x03A0, // Π
0x03A1, // Ρ
0,
0x03A3, // Σ
0x03A4, // Τ
0x03A5 | HAS_VOWEL, // Υ
0x03A6, // Φ
0x03A7, // Χ
0x03A8, // Ψ
0x03A9 | HAS_VOWEL, // Ω
0x0399 | HAS_VOWEL | HAS_DIALYTIKA, // Ϊ
0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, // Ϋ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ά
0x0395 | HAS_VOWEL | HAS_ACCENT, // έ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ή
0x0399 | HAS_VOWEL | HAS_ACCENT, // ί
0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΰ
0x0391 | HAS_VOWEL, // α
0x0392, // β
0x0393, // γ
0x0394, // δ
0x0395 | HAS_VOWEL, // ε
0x0396, // ζ
0x0397 | HAS_VOWEL, // η
0x0398, // θ
0x0399 | HAS_VOWEL, // ι
0x039A, // κ
0x039B, // λ
0x039C, // μ
0x039D, // ν
0x039E, // ξ
0x039F | HAS_VOWEL, // ο
0x03A0, // π
0x03A1, // ρ
0x03A3, // ς
0x03A3, // σ
0x03A4, // τ
0x03A5 | HAS_VOWEL, // υ
0x03A6, // φ
0x03A7, // χ
0x03A8, // ψ
0x03A9 | HAS_VOWEL, // ω
0x0399 | HAS_VOWEL | HAS_DIALYTIKA, // ϊ
0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, // ϋ
0x039F | HAS_VOWEL | HAS_ACCENT, // ό
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ύ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ώ
0x03CF, // Ϗ
0x0392, // ϐ
0x0398, // ϑ
0x03D2, // ϒ
0x03D2 | HAS_ACCENT, // ϓ
0x03D2 | HAS_DIALYTIKA, // ϔ
0x03A6, // ϕ
0x03A0, // ϖ
0x03CF, // ϗ
0x03D8, // Ϙ
0x03D8, // ϙ
0x03DA, // Ϛ
0x03DA, // ϛ
0x03DC, // Ϝ
0x03DC, // ϝ
0x03DE, // Ϟ
0x03DE, // ϟ
0x03E0, // Ϡ
0x03E0, // ϡ
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0x039A, // ϰ
0x03A1, // ϱ
0x03F9, // ϲ
0x037F, // ϳ
0x03F4, // ϴ
0x0395 | HAS_VOWEL, // ϵ
0,
0x03F7, // Ϸ
0x03F7, // ϸ
0x03F9, // Ϲ
0x03FA, // Ϻ
0x03FA, // ϻ
0x03FC, // ϼ
0x03FD, // Ͻ
0x03FE, // Ͼ
0x03FF, // Ͽ
};
// Letter data for U+1F00..1FFF, indexed by (c - 0x1f00); see getLetterData().
// Each entry is a code point OR'ed with HAS_* flag bits; 0 means no data.
// The trailing comment on a line shows the character that the entry belongs to.
private static final char[] data1F00 = {
// U+1F00..1FFF
0x0391 | HAS_VOWEL, // ἀ
0x0391 | HAS_VOWEL, // ἁ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἂ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἃ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἄ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἅ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἆ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἇ
0x0391 | HAS_VOWEL, // Ἀ
0x0391 | HAS_VOWEL, // Ἁ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἂ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἃ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἄ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἅ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἆ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἇ
0x0395 | HAS_VOWEL, // ἐ
0x0395 | HAS_VOWEL, // ἑ
0x0395 | HAS_VOWEL | HAS_ACCENT, // ἒ
0x0395 | HAS_VOWEL | HAS_ACCENT, // ἓ
0x0395 | HAS_VOWEL | HAS_ACCENT, // ἔ
0x0395 | HAS_VOWEL | HAS_ACCENT, // ἕ
0,
0,
0x0395 | HAS_VOWEL, // Ἐ
0x0395 | HAS_VOWEL, // Ἑ
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἒ
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἓ
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἔ
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἕ
0,
0,
0x0397 | HAS_VOWEL, // ἠ
0x0397 | HAS_VOWEL, // ἡ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἢ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἣ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἤ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἥ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἦ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἧ
0x0397 | HAS_VOWEL, // Ἠ
0x0397 | HAS_VOWEL, // Ἡ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἢ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἣ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἤ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἥ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἦ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἧ
0x0399 | HAS_VOWEL, // ἰ
0x0399 | HAS_VOWEL, // ἱ
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἲ
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἳ
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἴ
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἵ
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἶ
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἷ
0x0399 | HAS_VOWEL, // Ἰ
0x0399 | HAS_VOWEL, // Ἱ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἲ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἳ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἴ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἵ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἶ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἷ
0x039F | HAS_VOWEL, // ὀ
0x039F | HAS_VOWEL, // ὁ
0x039F | HAS_VOWEL | HAS_ACCENT, // ὂ
0x039F | HAS_VOWEL | HAS_ACCENT, // ὃ
0x039F | HAS_VOWEL | HAS_ACCENT, // ὄ
0x039F | HAS_VOWEL | HAS_ACCENT, // ὅ
0,
0,
0x039F | HAS_VOWEL, // Ὀ
0x039F | HAS_VOWEL, // Ὁ
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὂ
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὃ
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὄ
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὅ
0,
0,
0x03A5 | HAS_VOWEL, // ὐ
0x03A5 | HAS_VOWEL, // ὑ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὒ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὓ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὔ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὕ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὖ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὗ
0,
0x03A5 | HAS_VOWEL, // Ὑ
0,
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὓ
0,
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὕ
0,
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὗ
0x03A9 | HAS_VOWEL, // ὠ
0x03A9 | HAS_VOWEL, // ὡ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὢ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὣ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὤ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὥ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὦ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὧ
0x03A9 | HAS_VOWEL, // Ὠ
0x03A9 | HAS_VOWEL, // Ὡ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὢ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὣ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὤ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὥ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὦ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὧ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ὰ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ά
0x0395 | HAS_VOWEL | HAS_ACCENT, // ὲ
0x0395 | HAS_VOWEL | HAS_ACCENT, // έ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ὴ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ή
0x0399 | HAS_VOWEL | HAS_ACCENT, // ὶ
0x0399 | HAS_VOWEL | HAS_ACCENT, // ί
0x039F | HAS_VOWEL | HAS_ACCENT, // ὸ
0x039F | HAS_VOWEL | HAS_ACCENT, // ό
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὺ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ύ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὼ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ώ
0,
0,
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾀ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾁ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾂ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾃ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾄ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾅ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾆ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾇ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾈ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾉ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾊ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾋ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾌ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾍ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾎ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾏ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾐ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾑ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾒ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾓ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾔ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾕ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾖ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾗ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾘ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾙ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾚ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾛ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾜ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾝ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾞ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾟ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾠ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾡ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾢ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾣ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾤ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾥ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾦ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾧ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾨ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾩ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾪ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾫ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾬ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾭ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾮ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾯ
0x0391 | HAS_VOWEL, // ᾰ
0x0391 | HAS_VOWEL, // ᾱ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾲ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾳ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾴ
0,
0x0391 | HAS_VOWEL | HAS_ACCENT, // ᾶ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾷ
0x0391 | HAS_VOWEL, // Ᾰ
0x0391 | HAS_VOWEL, // Ᾱ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ὰ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ά
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾼ
0,
0x0399 | HAS_VOWEL, // ι
0,
0,
0,
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῂ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῃ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῄ
0,
0x0397 | HAS_VOWEL | HAS_ACCENT, // ῆ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῇ
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ὲ
0x0395 | HAS_VOWEL | HAS_ACCENT, // Έ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ὴ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ή
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῌ
0,
0,
0,
0x0399 | HAS_VOWEL, // ῐ
0x0399 | HAS_VOWEL, // ῑ
0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῒ
0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΐ
0,
0,
0x0399 | HAS_VOWEL | HAS_ACCENT, // ῖ
0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῗ
0x0399 | HAS_VOWEL, // Ῐ
0x0399 | HAS_VOWEL, // Ῑ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ὶ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ί
0,
0,
0,
0,
0x03A5 | HAS_VOWEL, // ῠ
0x03A5 | HAS_VOWEL, // ῡ
0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῢ
0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΰ
0x03A1, // ῤ
0x03A1, // ῥ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ῦ
0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῧ
0x03A5 | HAS_VOWEL, // Ῠ
0x03A5 | HAS_VOWEL, // Ῡ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὺ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ύ
0x03A1, // Ῥ
0,
0,
0,
0,
0,
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῲ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῳ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῴ
0,
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ῶ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῷ
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὸ
0x039F | HAS_VOWEL | HAS_ACCENT, // Ό
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὼ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ώ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῼ
0,
0,
0,
};
// U+2126 Ohm sign
// Single letter data entry in the same format as the table entries;
// returned by getLetterData() for c == 0x2126.
private static final char data2126 = 0x03A9 | HAS_VOWEL; // Ω
/**
 * Looks up the letter data for a code point.
 *
 * @param c the code point
 * @return the data entry for c if it is in U+0370..03FF, U+1F00..1FFF or is U+2126;
 *         otherwise 0
 */
private static final int getLetterData(int c) {
    if (0x370 <= c && c <= 0x3ff) {
        return data0370[c - 0x370];
    }
    if (0x1f00 <= c && c <= 0x1fff) {
        return data1F00[c - 0x1f00];
    }
    if (c == 0x2126) {
        return data2126;
    }
    return 0;
}
/**
 * Returns a non-zero value for each of the Greek combining diacritics
 * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
 * plus some perispomeni look-alikes.
 *
 * @param c the code point
 * @return the HAS_* flag bits for the diacritic, or 0 if c is not one of them
 */
private static final int getDiacriticData(int c) {
    final int data;
    switch (c) {
    case 0x0300:  // varia
    case 0x0301:  // tonos = oxia
    case 0x0302:  // circumflex can look like perispomeni
    case 0x0303:  // tilde can look like perispomeni
    case 0x0311:  // inverted breve can look like perispomeni
    case 0x0342:  // perispomeni
        data = HAS_ACCENT;
        break;
    case 0x0308:  // dialytika = diaeresis
        data = HAS_COMBINING_DIALYTIKA;
        break;
    case 0x0344:  // dialytika tonos
        data = HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
        break;
    case 0x0345:  // ypogegrammeni = iota subscript
        data = HAS_YPOGEGRAMMENI;
        break;
    case 0x0304:  // macron
    case 0x0306:  // breve
    case 0x0313:  // comma above
    case 0x0314:  // reversed comma above
    case 0x0343:  // koronis
        data = HAS_OTHER_GREEK_DIACRITIC;
        break;
    default:
        data = 0;
        break;
    }
    return data;
}
/**
 * Tests whether the text starting at index i consists of zero or more
 * case-ignorable code points followed by a cased letter.
 *
 * @param s the text
 * @param i the index to start looking at
 * @return true if a cased letter follows, with only case-ignorable code points before it
 */
private static boolean isFollowedByCasedLetter(CharSequence s, int i) {
    int pos = i;
    while (pos < s.length()) {
        final int cp = Character.codePointAt(s, pos);
        final int type = UCaseProps.INSTANCE.getTypeOrIgnorable(cp);
        if ((type & UCaseProps.IGNORABLE) == 0) {
            // Not case-ignorable: the answer depends on whether it is cased.
            return type != UCaseProps.NONE;
        }
        // Case-ignorable, keep looking.
        pos += Character.charCount(cp);
    }
    return false;  // Not followed by cased letter.
}
/**
* Greek string uppercasing with a state machine.
* Probably simpler than a stateless function that has to figure out complex context-before
* for each character.
* TODO: Try to re-consolidate one way or another with the non-Greek function.
*
*