All downloads are free. The search and download functionality uses the official Maven repository.

com.ibm.icu.impl.CaseMapImpl Maven / Gradle / Ivy

There is a newer version: 2.12.15
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl;

import java.io.IOException;
import java.text.CharacterIterator;
import java.util.Locale;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.Edits;
import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.ULocale;

public final class CaseMapImpl {
    /**
     * Implementation of UCaseProps.ContextIterator, iterates over a String.
     * See ustrcase.c/utf16_caseContextIterator().
     */
    public static final class StringContextIterator implements UCaseProps.ContextIterator {
        /**
         * Creates an iterator over the whole of {@code src}, with an empty
         * current-code-point range at offset 0 and no iteration direction
         * selected yet.
         *
         * @param src String to iterate over.
         */
        public StringContextIterator(CharSequence src) {
            // Same initial state as the three-argument constructor with
            // cpStart == cpLimit == 0.
            this(src, 0, 0);
        }

        /**
         * Creates an iterator over {@code src} whose current code point occupies
         * the range [cpStart, cpLimit). The iteration limit is the full length of
         * {@code src}, and no iteration direction is selected yet.
         *
         * @param src String to iterate over.
         * @param cpStart Start index of the current code point.
         * @param cpLimit Limit index of the current code point.
         */
        public StringContextIterator(CharSequence src, int cpStart, int cpLimit) {
            this.s = src;
            this.limit = src.length();
            this.cpStart = cpStart;
            this.cpLimit = cpLimit;
            this.index = 0;
            this.dir = 0; // 0 = initial state, neither forward nor backward
        }

        /**
         * Sets the iteration limit used by nextCaseMapCP() to an index within the string.
         * A limit that is negative or greater than the string length restores the
         * string length as the iteration limit.
         *
         * <p>This limit does not affect the next() function which always
         * iterates to the very end of the string.
         *
         * @param lim The iteration limit.
         */
        public void setLimit(int lim) {
            final int length = s.length();
            // Out-of-range values fall back to the full string length.
            limit = (lim < 0 || lim > length) ? length : lim;
        }

        /**
         * Moves to the iteration limit without fetching code points up to there.
         */
        public void moveToLimit() {
            cpLimit = limit;
            cpStart = limit;
        }

        /**
         * Iterate forward through the string to fetch the next code point
         * to be case-mapped, and set the context indexes for it.
         *
         * <p>

When the iteration limit is reached (and -1 is returned), * getCPStart() will be at the iteration limit. * *

Iteration with next() does not affect the position for nextCaseMapCP(). * * @return The next code point to be case-mapped, or <0 when the iteration is done. */ public int nextCaseMapCP() { cpStart=cpLimit; if(cpLimit0) { /* reset for forward iteration */ dir=1; index=cpLimit; } else if(direction<0) { /* reset for backward iteration */ dir=-1; index=cpStart; } else { // not a valid direction dir=0; index=0; } } @Override public int next() { int c; if(dir>0 && index0) { c=Character.codePointBefore(s, index); index-=Character.charCount(c); return c; } return -1; } // variables protected CharSequence s; protected int index, limit, cpStart, cpLimit; protected int dir; // 0=initial state >0=forward <0=backward } public static final int TITLECASE_WHOLE_STRING = 0x20; public static final int TITLECASE_SENTENCES = 0x40; /** * Bit mask for the titlecasing iterator options bit field. * Currently only 3 out of 8 values are used: * 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES. * See stringoptions.h. * @internal */ private static final int TITLECASE_ITERATOR_MASK = 0xe0; public static final int TITLECASE_ADJUST_TO_CASED = 0x400; /** * Bit mask for the titlecasing index adjustment options bit set. * Currently two bits are defined: * TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED. * See stringoptions.h. * @internal */ private static final int TITLECASE_ADJUSTMENT_MASK = 0x600; public static int addTitleAdjustmentOption(int options, int newOption) { int adjOptions = options & TITLECASE_ADJUSTMENT_MASK; if (adjOptions !=0 && adjOptions != newOption) { throw new IllegalArgumentException("multiple titlecasing index adjustment options"); } return options | newOption; } private static final int LNS = (1 << UCharacterCategory.UPPERCASE_LETTER) | (1 << UCharacterCategory.LOWERCASE_LETTER) | (1 << UCharacterCategory.TITLECASE_LETTER) | // Not MODIFIER_LETTER: We count only cased modifier letters. 
(1 << UCharacterCategory.OTHER_LETTER) | (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) | (1 << UCharacterCategory.LETTER_NUMBER) | (1 << UCharacterCategory.OTHER_NUMBER) | (1 << UCharacterCategory.MATH_SYMBOL) | (1 << UCharacterCategory.CURRENCY_SYMBOL) | (1 << UCharacterCategory.MODIFIER_SYMBOL) | (1 << UCharacterCategory.OTHER_SYMBOL) | (1 << UCharacterCategory.PRIVATE_USE); private static boolean isLNS(int c) { // Letter, number, symbol, // or a private use code point because those are typically used as letters or numbers. // Consider modifier letters only if they are cased. int gc = UCharacterProperty.INSTANCE.getType(c); return ((1 << gc) & LNS) != 0 || (gc == UCharacterCategory.MODIFIER_LETTER && UCaseProps.INSTANCE.getType(c) != UCaseProps.NONE); } public static int addTitleIteratorOption(int options, int newOption) { int iterOptions = options & TITLECASE_ITERATOR_MASK; if (iterOptions !=0 && iterOptions != newOption) { throw new IllegalArgumentException("multiple titlecasing iterator options"); } return options | newOption; } public static BreakIterator getTitleBreakIterator( Locale locale, int options, BreakIterator iter) { options &= TITLECASE_ITERATOR_MASK; if (options != 0 && iter != null) { throw new IllegalArgumentException( "titlecasing iterator option together with an explicit iterator"); } if (iter == null) { switch (options) { case 0: iter = BreakIterator.getWordInstance(locale); break; case TITLECASE_WHOLE_STRING: iter = new WholeStringBreakIterator(); break; case TITLECASE_SENTENCES: iter = BreakIterator.getSentenceInstance(locale); break; default: throw new IllegalArgumentException("unknown titlecasing iterator option"); } } return iter; } public static BreakIterator getTitleBreakIterator( ULocale locale, int options, BreakIterator iter) { options &= TITLECASE_ITERATOR_MASK; if (options != 0 && iter != null) { throw new IllegalArgumentException( "titlecasing iterator option together with an explicit iterator"); } if (iter == null) { switch 
(options) { case 0: iter = BreakIterator.getWordInstance(locale); break; case TITLECASE_WHOLE_STRING: iter = new WholeStringBreakIterator(); break; case TITLECASE_SENTENCES: iter = BreakIterator.getSentenceInstance(locale); break; default: throw new IllegalArgumentException("unknown titlecasing iterator option"); } } return iter; } /** * Omit unchanged text when case-mapping with Edits. */ public static final int OMIT_UNCHANGED_TEXT = 0x4000; private static final class WholeStringBreakIterator extends BreakIterator { private int length; private static void notImplemented() { throw new UnsupportedOperationException("should not occur"); } @Override public int first() { return 0; } @Override public int last() { notImplemented(); return 0; } @Override public int next(int n) { notImplemented(); return 0; } @Override public int next() { return length; } @Override public int previous() { notImplemented(); return 0; } @Override public int following(int offset) { notImplemented(); return 0; } @Override public int current() { notImplemented(); return 0; } @Override public CharacterIterator getText() { notImplemented(); return null; } @Override public void setText(CharacterIterator newText) { length = newText.getEndIndex(); } @Override public void setText(CharSequence newText) { length = newText.length(); } @Override public void setText(String newText) { length = newText.length(); } } private static int appendCodePoint(Appendable a, int c) throws IOException { if (c <= Character.MAX_VALUE) { a.append((char)c); return 1; } else { a.append((char)(0xd7c0 + (c >> 10))); a.append((char)(Character.MIN_LOW_SURROGATE + (c & 0x3ff))); return 2; } } /** * Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}. * @throws IOException */ private static void appendResult(int result, Appendable dest, int cpLength, int options, Edits edits) throws IOException { // Decode the result. 
if (result < 0) { // (not) original code point if (edits != null) { edits.addUnchanged(cpLength); } if ((options & OMIT_UNCHANGED_TEXT) != 0) { return; } appendCodePoint(dest, ~result); } else if (result <= UCaseProps.MAX_STRING_LENGTH) { // The mapping has already been appended to result. if (edits != null) { edits.addReplace(cpLength, result); } } else { // Append the single-code point mapping. int length = appendCodePoint(dest, result); if (edits != null) { edits.addReplace(cpLength, length); } } } private static final void appendUnchanged(CharSequence src, int start, int length, Appendable dest, int options, Edits edits) throws IOException { if (length > 0) { if (edits != null) { edits.addUnchanged(length); } if ((options & OMIT_UNCHANGED_TEXT) != 0) { return; } dest.append(src, start, start + length); } } private static String applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits) { if (!edits.hasChanges()) { return src.toString(); } StringBuilder result = new StringBuilder(src.length() + edits.lengthDelta()); for (Edits.Iterator ei = edits.getCoarseIterator(); ei.next();) { if (ei.hasChange()) { int i = ei.replacementIndex(); result.append(replacementChars, i, i + ei.newLength()); } else { int i = ei.sourceIndex(); result.append(src, i, i + ei.oldLength()); } } return result.toString(); } private static final Trie2_16 CASE_TRIE = UCaseProps.getTrie(); /** * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account. * caseLocale < 0: Case-folds [srcStart..srcLimit[. */ private static void internalToLower(int caseLocale, int options, CharSequence src, int srcStart, int srcLimit, StringContextIterator iter, Appendable dest, Edits edits) throws IOException { byte[] latinToLower; if (caseLocale == UCaseProps.LOC_ROOT || (caseLocale >= 0 ? 
!(caseLocale == UCaseProps.LOC_TURKISH || caseLocale == UCaseProps.LOC_LITHUANIAN) : (options & UCaseProps.FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT)) { latinToLower = UCaseProps.LatinCase.TO_LOWER_NORMAL; } else { latinToLower = UCaseProps.LatinCase.TO_LOWER_TR_LT; } int prev = srcStart; int srcIndex = srcStart; outerLoop: for (;;) { // fast path for simple cases char lead; for (;;) { if (srcIndex >= srcLimit) { break outerLoop; } lead = src.charAt(srcIndex); int delta; if (lead < UCaseProps.LatinCase.LONG_S) { byte d = latinToLower[lead]; if (d == UCaseProps.LatinCase.EXC) { break; } ++srcIndex; if (d == 0) { continue; } delta = d; } else if (lead >= 0xd800) { break; // surrogate or higher } else { int props = CASE_TRIE.getFromU16SingleLead(lead); if (UCaseProps.propsHasException(props)) { break; } ++srcIndex; if (!UCaseProps.isUpperOrTitleFromProps(props) || (delta = UCaseProps.getDelta(props)) == 0) { continue; } } lead += delta; appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits); dest.append(lead); if (edits != null) { edits.addReplace(1, 1); } prev = srcIndex; } // slow path int cpStart = srcIndex++; char trail; int c; if (Character.isHighSurrogate(lead) && srcIndex < srcLimit && Character.isLowSurrogate(trail = src.charAt(srcIndex))) { c = Character.toCodePoint(lead, trail); ++srcIndex; } else { c = lead; } if (caseLocale >= 0) { if (iter == null) { iter = new StringContextIterator(src, cpStart, srcIndex); } else { iter.setCPStartAndLimit(cpStart, srcIndex); } c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale); } else { c = UCaseProps.INSTANCE.toFullFolding(c, dest, options); } if (c >= 0) { appendUnchanged(src, prev, cpStart - prev, dest, options, edits); appendResult(c, dest, srcIndex - cpStart, options, edits); prev = srcIndex; } } appendUnchanged(src, prev, srcIndex - prev, dest, options, edits); } private static void internalToUpper(int caseLocale, int options, CharSequence src, Appendable dest, Edits edits) 
throws IOException { StringContextIterator iter = null; byte[] latinToUpper; if (caseLocale == UCaseProps.LOC_TURKISH) { latinToUpper = UCaseProps.LatinCase.TO_UPPER_TR; } else { latinToUpper = UCaseProps.LatinCase.TO_UPPER_NORMAL; } int prev = 0; int srcIndex = 0; int srcLength = src.length(); outerLoop: for (;;) { // fast path for simple cases char lead; for (;;) { if (srcIndex >= srcLength) { break outerLoop; } lead = src.charAt(srcIndex); int delta; if (lead < UCaseProps.LatinCase.LONG_S) { byte d = latinToUpper[lead]; if (d == UCaseProps.LatinCase.EXC) { break; } ++srcIndex; if (d == 0) { continue; } delta = d; } else if (lead >= 0xd800) { break; // surrogate or higher } else { int props = CASE_TRIE.getFromU16SingleLead(lead); if (UCaseProps.propsHasException(props)) { break; } ++srcIndex; if (UCaseProps.getTypeFromProps(props) != UCaseProps.LOWER || (delta = UCaseProps.getDelta(props)) == 0) { continue; } } lead += delta; appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits); dest.append(lead); if (edits != null) { edits.addReplace(1, 1); } prev = srcIndex; } // slow path int cpStart = srcIndex++; char trail; int c; if (Character.isHighSurrogate(lead) && srcIndex < srcLength && Character.isLowSurrogate(trail = src.charAt(srcIndex))) { c = Character.toCodePoint(lead, trail); ++srcIndex; } else { c = lead; } if (iter == null) { iter = new StringContextIterator(src, cpStart, srcIndex); } else { iter.setCPStartAndLimit(cpStart, srcIndex); } c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale); if (c >= 0) { appendUnchanged(src, prev, cpStart - prev, dest, options, edits); appendResult(c, dest, srcIndex - cpStart, options, edits); prev = srcIndex; } } appendUnchanged(src, prev, srcIndex - prev, dest, options, edits); } public static String toLower(int caseLocale, int options, CharSequence src) { if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { if (src.length() == 0) { return src.toString(); } // Collect and apply only 
changes. // Good if no or few changes. Bad (slow) if many changes. Edits edits = new Edits(); StringBuilder replacementChars = toLower( caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits); return applyEdits(src, replacementChars, edits); } else { return toLower(caseLocale, options, src, new StringBuilder(src.length()), null).toString(); } } public static A toLower(int caseLocale, int options, CharSequence src, A dest, Edits edits) { try { if (edits != null) { edits.reset(); } internalToLower(caseLocale, options, src, 0, src.length(), null, dest, edits); return dest; } catch (IOException e) { throw new ICUUncheckedIOException(e); } } public static String toUpper(int caseLocale, int options, CharSequence src) { if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { if (src.length() == 0) { return src.toString(); } // Collect and apply only changes. // Good if no or few changes. Bad (slow) if many changes. Edits edits = new Edits(); StringBuilder replacementChars = toUpper( caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits); return applyEdits(src, replacementChars, edits); } else { return toUpper(caseLocale, options, src, new StringBuilder(src.length()), null).toString(); } } public static A toUpper(int caseLocale, int options, CharSequence src, A dest, Edits edits) { try { if (edits != null) { edits.reset(); } if (caseLocale == UCaseProps.LOC_GREEK) { return GreekUpper.toUpper(options, src, dest, edits); } internalToUpper(caseLocale, options, src, dest, edits); return dest; } catch (IOException e) { throw new ICUUncheckedIOException(e); } } public static String toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src) { if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { if (src.length() == 0) { return src.toString(); } // Collect and apply only changes. // Good if no or few changes. Bad (slow) if many changes. 
Edits edits = new Edits(); StringBuilder replacementChars = toTitle( caseLocale, options | OMIT_UNCHANGED_TEXT, iter, src, new StringBuilder(), edits); return applyEdits(src, replacementChars, edits); } else { return toTitle(caseLocale, options, iter, src, new StringBuilder(src.length()), null).toString(); } } public static A toTitle( int caseLocale, int options, BreakIterator titleIter, CharSequence src, A dest, Edits edits) { try { if (edits != null) { edits.reset(); } /* set up local variables */ StringContextIterator iter = new StringContextIterator(src); int srcLength = src.length(); int prev=0; boolean isFirstIndex=true; /* titlecasing loop */ while(prevsrcLength) { index=srcLength; } /* * Segment [prev..index[ into 3 parts: * a) skipped characters (copy as-is) [prev..titleStart[ * b) first letter (titlecase) [titleStart..titleLimit[ * c) subsequent characters (lowercase) [titleLimit..index[ */ if(prev=0) {} // If c<0 then we have only uncased characters in [prev..index[ // and stopped with titleStart==titleLimit==index. titleStart=iter.getCPStart(); if (prev < titleStart) { appendUnchanged(src, prev, titleStart-prev, dest, options, edits); } } if(titleStart A fold(int options, CharSequence src, A dest, Edits edits) { try { if (edits != null) { edits.reset(); } internalToLower(-1, options, src, 0, src.length(), null, dest, edits); return dest; } catch (IOException e) { throw new ICUUncheckedIOException(e); } } private static final class GreekUpper { // Data bits. private static final int UPPER_MASK = 0x3ff; private static final int HAS_VOWEL = 0x1000; private static final int HAS_YPOGEGRAMMENI = 0x2000; private static final int HAS_ACCENT = 0x4000; private static final int HAS_DIALYTIKA = 0x8000; // Further bits during data building and processing, not stored in the data map. 
private static final int HAS_COMBINING_DIALYTIKA = 0x10000; private static final int HAS_OTHER_GREEK_DIACRITIC = 0x20000; private static final int HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT; private static final int HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA = HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA; private static final int HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA; // State bits. private static final int AFTER_CASED = 1; private static final int AFTER_VOWEL_WITH_ACCENT = 2; // Data generated by prototype code, see // http://site.icu-project.org/design/case/greek-upper // TODO: Move this data into ucase.icu. private static final char[] data0370 = { // U+0370..03FF 0x0370, // Ͱ 0x0370, // ͱ 0x0372, // Ͳ 0x0372, // ͳ 0, 0, 0x0376, // Ͷ 0x0376, // ͷ 0, 0, 0x037A, // ͺ 0x03FD, // ͻ 0x03FE, // ͼ 0x03FF, // ͽ 0, 0x037F, // Ϳ 0, 0, 0, 0, 0, 0, 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ά 0, 0x0395 | HAS_VOWEL | HAS_ACCENT, // Έ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ή 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ί 0, 0x039F | HAS_VOWEL | HAS_ACCENT, // Ό 0, 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ύ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ώ 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΐ 0x0391 | HAS_VOWEL, // Α 0x0392, // Β 0x0393, // Γ 0x0394, // Δ 0x0395 | HAS_VOWEL, // Ε 0x0396, // Ζ 0x0397 | HAS_VOWEL, // Η 0x0398, // Θ 0x0399 | HAS_VOWEL, // Ι 0x039A, // Κ 0x039B, // Λ 0x039C, // Μ 0x039D, // Ν 0x039E, // Ξ 0x039F | HAS_VOWEL, // Ο 0x03A0, // Π 0x03A1, // Ρ 0, 0x03A3, // Σ 0x03A4, // Τ 0x03A5 | HAS_VOWEL, // Υ 0x03A6, // Φ 0x03A7, // Χ 0x03A8, // Ψ 0x03A9 | HAS_VOWEL, // Ω 0x0399 | HAS_VOWEL | HAS_DIALYTIKA, // Ϊ 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, // Ϋ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ά 0x0395 | HAS_VOWEL | HAS_ACCENT, // έ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ή 0x0399 | HAS_VOWEL | HAS_ACCENT, // ί 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΰ 0x0391 | HAS_VOWEL, // α 0x0392, // β 0x0393, // γ 0x0394, // δ 0x0395 | HAS_VOWEL, // ε 0x0396, // ζ 0x0397 | 
HAS_VOWEL, // η 0x0398, // θ 0x0399 | HAS_VOWEL, // ι 0x039A, // κ 0x039B, // λ 0x039C, // μ 0x039D, // ν 0x039E, // ξ 0x039F | HAS_VOWEL, // ο 0x03A0, // π 0x03A1, // ρ 0x03A3, // ς 0x03A3, // σ 0x03A4, // τ 0x03A5 | HAS_VOWEL, // υ 0x03A6, // φ 0x03A7, // χ 0x03A8, // ψ 0x03A9 | HAS_VOWEL, // ω 0x0399 | HAS_VOWEL | HAS_DIALYTIKA, // ϊ 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, // ϋ 0x039F | HAS_VOWEL | HAS_ACCENT, // ό 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ύ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ώ 0x03CF, // Ϗ 0x0392, // ϐ 0x0398, // ϑ 0x03D2, // ϒ 0x03D2 | HAS_ACCENT, // ϓ 0x03D2 | HAS_DIALYTIKA, // ϔ 0x03A6, // ϕ 0x03A0, // ϖ 0x03CF, // ϗ 0x03D8, // Ϙ 0x03D8, // ϙ 0x03DA, // Ϛ 0x03DA, // ϛ 0x03DC, // Ϝ 0x03DC, // ϝ 0x03DE, // Ϟ 0x03DE, // ϟ 0x03E0, // Ϡ 0x03E0, // ϡ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x039A, // ϰ 0x03A1, // ϱ 0x03F9, // ϲ 0x037F, // ϳ 0x03F4, // ϴ 0x0395 | HAS_VOWEL, // ϵ 0, 0x03F7, // Ϸ 0x03F7, // ϸ 0x03F9, // Ϲ 0x03FA, // Ϻ 0x03FA, // ϻ 0x03FC, // ϼ 0x03FD, // Ͻ 0x03FE, // Ͼ 0x03FF, // Ͽ }; private static final char[] data1F00 = { // U+1F00..1FFF 0x0391 | HAS_VOWEL, // ἀ 0x0391 | HAS_VOWEL, // ἁ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἂ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἃ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἄ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἅ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἆ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἇ 0x0391 | HAS_VOWEL, // Ἀ 0x0391 | HAS_VOWEL, // Ἁ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἂ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἃ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἄ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἅ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἆ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἇ 0x0395 | HAS_VOWEL, // ἐ 0x0395 | HAS_VOWEL, // ἑ 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἒ 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἓ 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἔ 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἕ 0, 0, 0x0395 | HAS_VOWEL, // Ἐ 0x0395 | HAS_VOWEL, // Ἑ 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἒ 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἓ 0x0395 | HAS_VOWEL | 
HAS_ACCENT, // Ἔ 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἕ 0, 0, 0x0397 | HAS_VOWEL, // ἠ 0x0397 | HAS_VOWEL, // ἡ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἢ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἣ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἤ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἥ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἦ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἧ 0x0397 | HAS_VOWEL, // Ἠ 0x0397 | HAS_VOWEL, // Ἡ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἢ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἣ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἤ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἥ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἦ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἧ 0x0399 | HAS_VOWEL, // ἰ 0x0399 | HAS_VOWEL, // ἱ 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἲ 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἳ 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἴ 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἵ 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἶ 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἷ 0x0399 | HAS_VOWEL, // Ἰ 0x0399 | HAS_VOWEL, // Ἱ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἲ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἳ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἴ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἵ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἶ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἷ 0x039F | HAS_VOWEL, // ὀ 0x039F | HAS_VOWEL, // ὁ 0x039F | HAS_VOWEL | HAS_ACCENT, // ὂ 0x039F | HAS_VOWEL | HAS_ACCENT, // ὃ 0x039F | HAS_VOWEL | HAS_ACCENT, // ὄ 0x039F | HAS_VOWEL | HAS_ACCENT, // ὅ 0, 0, 0x039F | HAS_VOWEL, // Ὀ 0x039F | HAS_VOWEL, // Ὁ 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὂ 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὃ 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὄ 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὅ 0, 0, 0x03A5 | HAS_VOWEL, // ὐ 0x03A5 | HAS_VOWEL, // ὑ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὒ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὓ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὔ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὕ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὖ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὗ 0, 0x03A5 | HAS_VOWEL, // Ὑ 0, 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὓ 0, 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὕ 0, 0x03A5 | HAS_VOWEL 
| HAS_ACCENT, // Ὗ 0x03A9 | HAS_VOWEL, // ὠ 0x03A9 | HAS_VOWEL, // ὡ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὢ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὣ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὤ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὥ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὦ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὧ 0x03A9 | HAS_VOWEL, // Ὠ 0x03A9 | HAS_VOWEL, // Ὡ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὢ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὣ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὤ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὥ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὦ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὧ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ὰ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ά 0x0395 | HAS_VOWEL | HAS_ACCENT, // ὲ 0x0395 | HAS_VOWEL | HAS_ACCENT, // έ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ὴ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ή 0x0399 | HAS_VOWEL | HAS_ACCENT, // ὶ 0x0399 | HAS_VOWEL | HAS_ACCENT, // ί 0x039F | HAS_VOWEL | HAS_ACCENT, // ὸ 0x039F | HAS_VOWEL | HAS_ACCENT, // ό 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὺ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ύ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὼ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ώ 0, 0, 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾀ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾁ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾂ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾃ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾄ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾅ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾆ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾇ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾈ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾉ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾊ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾋ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾌ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾍ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾎ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾏ 0x0397 | 
HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾐ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾑ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾒ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾓ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾔ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾕ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾖ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾗ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾘ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾙ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾚ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾛ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾜ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾝ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾞ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾟ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾠ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾡ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾢ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾣ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾤ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾥ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾦ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾧ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾨ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾩ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾪ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾫ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾬ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾭ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾮ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾯ 0x0391 | HAS_VOWEL, // ᾰ 0x0391 | HAS_VOWEL, // ᾱ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾲ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾳ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾴ 0, 0x0391 | HAS_VOWEL | HAS_ACCENT, // ᾶ 
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾷ 0x0391 | HAS_VOWEL, // Ᾰ 0x0391 | HAS_VOWEL, // Ᾱ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ὰ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ά 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾼ 0, 0x0399 | HAS_VOWEL, // ι 0, 0, 0, 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῂ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῃ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῄ 0, 0x0397 | HAS_VOWEL | HAS_ACCENT, // ῆ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῇ 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ὲ 0x0395 | HAS_VOWEL | HAS_ACCENT, // Έ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ὴ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ή 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῌ 0, 0, 0, 0x0399 | HAS_VOWEL, // ῐ 0x0399 | HAS_VOWEL, // ῑ 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῒ 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΐ 0, 0, 0x0399 | HAS_VOWEL | HAS_ACCENT, // ῖ 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῗ 0x0399 | HAS_VOWEL, // Ῐ 0x0399 | HAS_VOWEL, // Ῑ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ὶ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ί 0, 0, 0, 0, 0x03A5 | HAS_VOWEL, // ῠ 0x03A5 | HAS_VOWEL, // ῡ 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῢ 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΰ 0x03A1, // ῤ 0x03A1, // ῥ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ῦ 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῧ 0x03A5 | HAS_VOWEL, // Ῠ 0x03A5 | HAS_VOWEL, // Ῡ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὺ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ύ 0x03A1, // Ῥ 0, 0, 0, 0, 0, 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῲ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῳ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῴ 0, 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ῶ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῷ 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὸ 0x039F | HAS_VOWEL | HAS_ACCENT, // Ό 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὼ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ώ 0x03A9 | HAS_VOWEL | 
HAS_YPOGEGRAMMENI, // ῼ 0, 0, 0, }; // U+2126 Ohm sign private static final char data2126 = 0x03A9 | HAS_VOWEL; // Ω private static final int getLetterData(int c) { if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) { return 0; } else if (c <= 0x3ff) { return data0370[c - 0x370]; } else if (c <= 0x1fff) { return data1F00[c - 0x1f00]; } else if (c == 0x2126) { return data2126; } else { return 0; } } /** * Returns a non-zero value for each of the Greek combining diacritics * listed in The Unicode Standard, version 8, chapter 7.2 Greek, * plus some perispomeni look-alikes. */ private static final int getDiacriticData(int c) { switch (c) { case '\u0300': // varia case '\u0301': // tonos = oxia case '\u0342': // perispomeni case '\u0302': // circumflex can look like perispomeni case '\u0303': // tilde can look like perispomeni case '\u0311': // inverted breve can look like perispomeni return HAS_ACCENT; case '\u0308': // dialytika = diaeresis return HAS_COMBINING_DIALYTIKA; case '\u0344': // dialytika tonos return HAS_COMBINING_DIALYTIKA | HAS_ACCENT; case '\u0345': // ypogegrammeni = iota subscript return HAS_YPOGEGRAMMENI; case '\u0304': // macron case '\u0306': // breve case '\u0313': // comma above case '\u0314': // reversed comma above case '\u0343': // koronis return HAS_OTHER_GREEK_DIACRITIC; default: return 0; } } private static boolean isFollowedByCasedLetter(CharSequence s, int i) { while (i < s.length()) { int c = Character.codePointAt(s, i); int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c); if ((type & UCaseProps.IGNORABLE) != 0) { // Case-ignorable, continue with the loop. i += Character.charCount(c); } else if (type != UCaseProps.NONE) { return true; // Followed by cased letter. } else { return false; // Uncased and not case-ignorable. } } return false; // Not followed by cased letter. } /** * Greek string uppercasing with a state machine. 
* Probably simpler than a stateless function that has to figure out complex context-before * for each character. * TODO: Try to re-consolidate one way or another with the non-Greek function. * *

Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8). * @throws IOException */ private static A toUpper(int options, CharSequence src, A dest, Edits edits) throws IOException { int state = 0; for (int i = 0; i < src.length();) { int c = Character.codePointAt(src, i); int nextIndex = i + Character.charCount(c); int nextState = 0; int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c); if ((type & UCaseProps.IGNORABLE) != 0) { // c is case-ignorable nextState |= (state & AFTER_CASED); } else if (type != UCaseProps.NONE) { // c is cased nextState |= AFTER_CASED; } int data = getLetterData(c); if (data > 0) { int upper = data & UPPER_MASK; // Add a dialytika to this iota or ypsilon vowel // if we removed a tonos from the previous vowel, // and that previous vowel did not also have (or gain) a dialytika. // Adding one only to the final vowel in a longer sequence // (which does not occur in normal writing) would require lookahead. // Set the same flag as for preserving an existing dialytika. if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 && (upper == 'Ι' || upper == 'Υ')) { data |= HAS_DIALYTIKA; } int numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota. if ((data & HAS_YPOGEGRAMMENI) != 0) { numYpogegrammeni = 1; } // Skip combining diacritics after this Greek letter. while (nextIndex < src.length()) { int diacriticData = getDiacriticData(src.charAt(nextIndex)); if (diacriticData != 0) { data |= diacriticData; if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) { ++numYpogegrammeni; } ++nextIndex; } else { break; // not a Greek diacritic } } if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) { nextState |= AFTER_VOWEL_WITH_ACCENT; } // Map according to Greek rules. 
boolean addTonos = false; if (upper == 'Η' && (data & HAS_ACCENT) != 0 && numYpogegrammeni == 0 && (state & AFTER_CASED) == 0 && !isFollowedByCasedLetter(src, nextIndex)) { // Keep disjunctive "or" with (only) a tonos. // We use the same "word boundary" conditions as for the Final_Sigma test. if (i == nextIndex) { upper = 'Ή'; // Preserve the precomposed form. } else { addTonos = true; } } else if ((data & HAS_DIALYTIKA) != 0) { // Preserve a vowel with dialytika in precomposed form if it exists. if (upper == 'Ι') { upper = 'Ϊ'; data &= ~HAS_EITHER_DIALYTIKA; } else if (upper == 'Υ') { upper = 'Ϋ'; data &= ~HAS_EITHER_DIALYTIKA; } } boolean change; if (edits == null && (options & OMIT_UNCHANGED_TEXT) == 0) { change = true; // common, simple usage } else { // Find out first whether we are changing the text. change = src.charAt(i) != upper || numYpogegrammeni > 0; int i2 = i + 1; if ((data & HAS_EITHER_DIALYTIKA) != 0) { change |= i2 >= nextIndex || src.charAt(i2) != 0x308; ++i2; } if (addTonos) { change |= i2 >= nextIndex || src.charAt(i2) != 0x301; ++i2; } int oldLength = nextIndex - i; int newLength = (i2 - i) + numYpogegrammeni; change |= oldLength != newLength; if (change) { if (edits != null) { edits.addReplace(oldLength, newLength); } } else { if (edits != null) { edits.addUnchanged(oldLength); } // Write unchanged text? change = (options & OMIT_UNCHANGED_TEXT) == 0; } } if (change) { dest.append((char)upper); if ((data & HAS_EITHER_DIALYTIKA) != 0) { dest.append('\u0308'); // restore or add a dialytika } if (addTonos) { dest.append('\u0301'); } while (numYpogegrammeni > 0) { dest.append('Ι'); --numYpogegrammeni; } } } else { c = UCaseProps.INSTANCE.toFullUpper(c, null, dest, UCaseProps.LOC_GREEK); appendResult(c, dest, nextIndex - i, options, edits); } i = nextIndex; state = nextState; } return dest; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy