All downloads are free. Search and download functionality uses the official Maven repository.

com.ibm.icu.impl.CaseMap Maven / Gradle / Ivy

There is a newer version: 2.12.15
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl;

import com.ibm.icu.util.ULocale;

public final class CaseMap {
    /**
     * Implementation of UCaseProps.ContextIterator, iterates over a String.
     * See ustrcase.c/utf16_caseContextIterator().
     */
    public static final class StringContextIterator implements UCaseProps.ContextIterator {
        /**
         * Constructor.
         * @param s String to iterate over.
         */
        public StringContextIterator(String s) {
            this.s=s;
            limit=s.length();
            cpStart=cpLimit=index=0;
            dir=0;
        }

        /**
         * Set the iteration limit for nextCaseMapCP() to an index within the string.
         * If the limit parameter is negative or past the string, then the
         * string length is restored as the iteration limit.
         *
         * 

This limit does not affect the next() function which always * iterates to the very end of the string. * * @param lim The iteration limit. */ public void setLimit(int lim) { if(0<=lim && lim<=s.length()) { limit=lim; } else { limit=s.length(); } } /** * Move to the iteration limit without fetching code points up to there. */ public void moveToLimit() { cpStart=cpLimit=limit; } /** * Iterate forward through the string to fetch the next code point * to be case-mapped, and set the context indexes for it. * *

When the iteration limit is reached (and -1 is returned), * getCPStart() will be at the iteration limit. * *

Iteration with next() does not affect the position for nextCaseMapCP(). * * @return The next code point to be case-mapped, or <0 when the iteration is done. */ public int nextCaseMapCP() { cpStart=cpLimit; if(cpLimit0) { /* reset for forward iteration */ dir=1; index=cpLimit; } else if(direction<0) { /* reset for backward iteration */ dir=-1; index=cpStart; } else { // not a valid direction dir=0; index=0; } } @Override public int next() { int c; if(dir>0 && index0) { c=s.codePointBefore(index); index-=Character.charCount(c); return c; } return -1; } // variables protected String s; protected int index, limit, cpStart, cpLimit; protected int dir; // 0=initial state >0=forward <0=backward } /** Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}. */ private static final void appendResult(int c, StringBuilder result) { // Decode the result. if (c < 0) { // (not) original code point result.appendCodePoint(~c); } else if (c <= UCaseProps.MAX_STRING_LENGTH) { // The mapping has already been appended to result. } else { // Append the single-code point mapping. result.appendCodePoint(c); } } // TODO: Move the other string case mapping functions from UCharacter to here, too. public static String toUpper(ULocale locale, String str) { if (locale == null) { locale = ULocale.getDefault(); } int[] locCache = new int[] { UCaseProps.getCaseLocale(locale, null) }; if (locCache[0] == UCaseProps.LOC_GREEK) { return GreekUpper.toUpper(str, locCache); } StringContextIterator iter = new StringContextIterator(str); StringBuilder result = new StringBuilder(str.length()); int c; while((c=iter.nextCaseMapCP())>=0) { c = UCaseProps.INSTANCE.toFullUpper(c, iter, result, locale, locCache); appendResult(c, result); } return result.toString(); } private static final class GreekUpper { // Data bits. 
private static final int UPPER_MASK = 0x3ff; private static final int HAS_VOWEL = 0x1000; private static final int HAS_YPOGEGRAMMENI = 0x2000; private static final int HAS_ACCENT = 0x4000; private static final int HAS_DIALYTIKA = 0x8000; // Further bits during data building and processing, not stored in the data map. private static final int HAS_COMBINING_DIALYTIKA = 0x10000; private static final int HAS_OTHER_GREEK_DIACRITIC = 0x20000; private static final int HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT; private static final int HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA = HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA; private static final int HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA; // State bits. private static final int AFTER_CASED = 1; private static final int AFTER_VOWEL_WITH_ACCENT = 2; // Data generated by prototype code, see // http://site.icu-project.org/design/case/greek-upper // TODO: Move this data into ucase.icu. private static final char[] data0370 = { // U+0370..03FF 0x0370, // Ͱ 0x0370, // ͱ 0x0372, // Ͳ 0x0372, // ͳ 0, 0, 0x0376, // Ͷ 0x0376, // ͷ 0, 0, 0x037A, // ͺ 0x03FD, // ͻ 0x03FE, // ͼ 0x03FF, // ͽ 0, 0x037F, // Ϳ 0, 0, 0, 0, 0, 0, 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ά 0, 0x0395 | HAS_VOWEL | HAS_ACCENT, // Έ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ή 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ί 0, 0x039F | HAS_VOWEL | HAS_ACCENT, // Ό 0, 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ύ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ώ 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΐ 0x0391 | HAS_VOWEL, // Α 0x0392, // Β 0x0393, // Γ 0x0394, // Δ 0x0395 | HAS_VOWEL, // Ε 0x0396, // Ζ 0x0397 | HAS_VOWEL, // Η 0x0398, // Θ 0x0399 | HAS_VOWEL, // Ι 0x039A, // Κ 0x039B, // Λ 0x039C, // Μ 0x039D, // Ν 0x039E, // Ξ 0x039F | HAS_VOWEL, // Ο 0x03A0, // Π 0x03A1, // Ρ 0, 0x03A3, // Σ 0x03A4, // Τ 0x03A5 | HAS_VOWEL, // Υ 0x03A6, // Φ 0x03A7, // Χ 0x03A8, // Ψ 0x03A9 | HAS_VOWEL, // Ω 0x0399 | HAS_VOWEL | HAS_DIALYTIKA, // Ϊ 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, // Ϋ 0x0391 
| HAS_VOWEL | HAS_ACCENT, // ά 0x0395 | HAS_VOWEL | HAS_ACCENT, // έ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ή 0x0399 | HAS_VOWEL | HAS_ACCENT, // ί 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΰ 0x0391 | HAS_VOWEL, // α 0x0392, // β 0x0393, // γ 0x0394, // δ 0x0395 | HAS_VOWEL, // ε 0x0396, // ζ 0x0397 | HAS_VOWEL, // η 0x0398, // θ 0x0399 | HAS_VOWEL, // ι 0x039A, // κ 0x039B, // λ 0x039C, // μ 0x039D, // ν 0x039E, // ξ 0x039F | HAS_VOWEL, // ο 0x03A0, // π 0x03A1, // ρ 0x03A3, // ς 0x03A3, // σ 0x03A4, // τ 0x03A5 | HAS_VOWEL, // υ 0x03A6, // φ 0x03A7, // χ 0x03A8, // ψ 0x03A9 | HAS_VOWEL, // ω 0x0399 | HAS_VOWEL | HAS_DIALYTIKA, // ϊ 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, // ϋ 0x039F | HAS_VOWEL | HAS_ACCENT, // ό 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ύ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ώ 0x03CF, // Ϗ 0x0392, // ϐ 0x0398, // ϑ 0x03D2, // ϒ 0x03D2 | HAS_ACCENT, // ϓ 0x03D2 | HAS_DIALYTIKA, // ϔ 0x03A6, // ϕ 0x03A0, // ϖ 0x03CF, // ϗ 0x03D8, // Ϙ 0x03D8, // ϙ 0x03DA, // Ϛ 0x03DA, // ϛ 0x03DC, // Ϝ 0x03DC, // ϝ 0x03DE, // Ϟ 0x03DE, // ϟ 0x03E0, // Ϡ 0x03E0, // ϡ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x039A, // ϰ 0x03A1, // ϱ 0x03F9, // ϲ 0x037F, // ϳ 0x03F4, // ϴ 0x0395 | HAS_VOWEL, // ϵ 0, 0x03F7, // Ϸ 0x03F7, // ϸ 0x03F9, // Ϲ 0x03FA, // Ϻ 0x03FA, // ϻ 0x03FC, // ϼ 0x03FD, // Ͻ 0x03FE, // Ͼ 0x03FF, // Ͽ }; private static final char[] data1F00 = { // U+1F00..1FFF 0x0391 | HAS_VOWEL, // ἀ 0x0391 | HAS_VOWEL, // ἁ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἂ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἃ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἄ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἅ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἆ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἇ 0x0391 | HAS_VOWEL, // Ἀ 0x0391 | HAS_VOWEL, // Ἁ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἂ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἃ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἄ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἅ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἆ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἇ 0x0395 | HAS_VOWEL, // ἐ 0x0395 | HAS_VOWEL, // ἑ 
0x0395 | HAS_VOWEL | HAS_ACCENT, // ἒ 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἓ 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἔ 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἕ 0, 0, 0x0395 | HAS_VOWEL, // Ἐ 0x0395 | HAS_VOWEL, // Ἑ 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἒ 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἓ 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἔ 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἕ 0, 0, 0x0397 | HAS_VOWEL, // ἠ 0x0397 | HAS_VOWEL, // ἡ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἢ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἣ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἤ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἥ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἦ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἧ 0x0397 | HAS_VOWEL, // Ἠ 0x0397 | HAS_VOWEL, // Ἡ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἢ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἣ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἤ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἥ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἦ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἧ 0x0399 | HAS_VOWEL, // ἰ 0x0399 | HAS_VOWEL, // ἱ 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἲ 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἳ 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἴ 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἵ 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἶ 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἷ 0x0399 | HAS_VOWEL, // Ἰ 0x0399 | HAS_VOWEL, // Ἱ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἲ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἳ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἴ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἵ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἶ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἷ 0x039F | HAS_VOWEL, // ὀ 0x039F | HAS_VOWEL, // ὁ 0x039F | HAS_VOWEL | HAS_ACCENT, // ὂ 0x039F | HAS_VOWEL | HAS_ACCENT, // ὃ 0x039F | HAS_VOWEL | HAS_ACCENT, // ὄ 0x039F | HAS_VOWEL | HAS_ACCENT, // ὅ 0, 0, 0x039F | HAS_VOWEL, // Ὀ 0x039F | HAS_VOWEL, // Ὁ 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὂ 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὃ 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὄ 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὅ 0, 0, 0x03A5 | HAS_VOWEL, // ὐ 0x03A5 | HAS_VOWEL, // ὑ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὒ 0x03A5 | 
HAS_VOWEL | HAS_ACCENT, // ὓ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὔ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὕ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὖ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὗ 0, 0x03A5 | HAS_VOWEL, // Ὑ 0, 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὓ 0, 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὕ 0, 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὗ 0x03A9 | HAS_VOWEL, // ὠ 0x03A9 | HAS_VOWEL, // ὡ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὢ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὣ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὤ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὥ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὦ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὧ 0x03A9 | HAS_VOWEL, // Ὠ 0x03A9 | HAS_VOWEL, // Ὡ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὢ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὣ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὤ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὥ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὦ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὧ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ὰ 0x0391 | HAS_VOWEL | HAS_ACCENT, // ά 0x0395 | HAS_VOWEL | HAS_ACCENT, // ὲ 0x0395 | HAS_VOWEL | HAS_ACCENT, // έ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ὴ 0x0397 | HAS_VOWEL | HAS_ACCENT, // ή 0x0399 | HAS_VOWEL | HAS_ACCENT, // ὶ 0x0399 | HAS_VOWEL | HAS_ACCENT, // ί 0x039F | HAS_VOWEL | HAS_ACCENT, // ὸ 0x039F | HAS_VOWEL | HAS_ACCENT, // ό 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὺ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ύ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὼ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ώ 0, 0, 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾀ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾁ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾂ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾃ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾄ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾅ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾆ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾇ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾈ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾉ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | 
HAS_ACCENT, // ᾊ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾋ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾌ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾍ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾎ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾏ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾐ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾑ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾒ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾓ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾔ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾕ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾖ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾗ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾘ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾙ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾚ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾛ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾜ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾝ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾞ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾟ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾠ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾡ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾢ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾣ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾤ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾥ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾦ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾧ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾨ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾩ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾪ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾫ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾬ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾭ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // 
ᾮ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾯ 0x0391 | HAS_VOWEL, // ᾰ 0x0391 | HAS_VOWEL, // ᾱ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾲ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾳ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾴ 0, 0x0391 | HAS_VOWEL | HAS_ACCENT, // ᾶ 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾷ 0x0391 | HAS_VOWEL, // Ᾰ 0x0391 | HAS_VOWEL, // Ᾱ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ὰ 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ά 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾼ 0, 0x0399 | HAS_VOWEL, // ι 0, 0, 0, 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῂ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῃ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῄ 0, 0x0397 | HAS_VOWEL | HAS_ACCENT, // ῆ 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῇ 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ὲ 0x0395 | HAS_VOWEL | HAS_ACCENT, // Έ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ὴ 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ή 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῌ 0, 0, 0, 0x0399 | HAS_VOWEL, // ῐ 0x0399 | HAS_VOWEL, // ῑ 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῒ 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΐ 0, 0, 0x0399 | HAS_VOWEL | HAS_ACCENT, // ῖ 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῗ 0x0399 | HAS_VOWEL, // Ῐ 0x0399 | HAS_VOWEL, // Ῑ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ὶ 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ί 0, 0, 0, 0, 0x03A5 | HAS_VOWEL, // ῠ 0x03A5 | HAS_VOWEL, // ῡ 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῢ 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΰ 0x03A1, // ῤ 0x03A1, // ῥ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ῦ 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῧ 0x03A5 | HAS_VOWEL, // Ῠ 0x03A5 | HAS_VOWEL, // Ῡ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὺ 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ύ 0x03A1, // Ῥ 0, 0, 0, 0, 0, 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῲ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῳ 0x03A9 | HAS_VOWEL | 
HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῴ 0, 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ῶ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῷ 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὸ 0x039F | HAS_VOWEL | HAS_ACCENT, // Ό 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὼ 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ώ 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῼ 0, 0, 0, }; // U+2126 Ohm sign private static final char data2126 = 0x03A9 | HAS_VOWEL; // Ω private static final int getLetterData(int c) { if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) { return 0; } else if (c <= 0x3ff) { return data0370[c - 0x370]; } else if (c <= 0x1fff) { return data1F00[c - 0x1f00]; } else if (c == 0x2126) { return data2126; } else { return 0; } } /** * Returns a non-zero value for each of the Greek combining diacritics * listed in The Unicode Standard, version 8, chapter 7.2 Greek, * plus some perispomeni look-alikes. */ private static final int getDiacriticData(int c) { switch (c) { case '\u0300': // varia case '\u0301': // tonos = oxia case '\u0342': // perispomeni case '\u0302': // circumflex can look like perispomeni case '\u0303': // tilde can look like perispomeni case '\u0311': // inverted breve can look like perispomeni return HAS_ACCENT; case '\u0308': // dialytika = diaeresis return HAS_COMBINING_DIALYTIKA; case '\u0344': // dialytika tonos return HAS_COMBINING_DIALYTIKA | HAS_ACCENT; case '\u0345': // ypogegrammeni = iota subscript return HAS_YPOGEGRAMMENI; case '\u0304': // macron case '\u0306': // breve case '\u0313': // comma above case '\u0314': // reversed comma above case '\u0343': // koronis return HAS_OTHER_GREEK_DIACRITIC; default: return 0; } } private static boolean isFollowedByCasedLetter(CharSequence s, int i) { while (i < s.length()) { int c = Character.codePointAt(s, i); int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c); if ((type & UCaseProps.IGNORABLE) != 0) { // Case-ignorable, continue with the loop. 
} else if (type != UCaseProps.NONE) { return true; // Followed by cased letter. } else { return false; // Uncased and not case-ignorable. } } return false; // Not followed by cased letter. } /** * Greek string uppercasing with a state machine. * Probably simpler than a stateless function that has to figure out complex context-before * for each character. * TODO: Try to re-consolidate one way or another with the non-Greek function. * *

Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8). */ private static String toUpper(CharSequence s, int[] locCache) { StringBuilder result = new StringBuilder(s.length()); int state = 0; for (int i = 0; i < s.length();) { int c = Character.codePointAt(s, i); int nextIndex = i + Character.charCount(c); int nextState = 0; int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c); if ((type & UCaseProps.IGNORABLE) != 0) { // c is case-ignorable nextState |= (state & AFTER_CASED); } else if (type != UCaseProps.NONE) { // c is cased nextState |= AFTER_CASED; } int data = getLetterData(c); if (data > 0) { int upper = data & UPPER_MASK; // Add a dialytika to this iota or ypsilon vowel // if we removed a tonos from the previous vowel, // and that previous vowel did not also have (or gain) a dialytika. // Adding one only to the final vowel in a longer sequence // (which does not occur in normal writing) would require lookahead. // Set the same flag as for preserving an existing dialytika. if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 && (upper == 'Ι' || upper == 'Υ')) { data |= HAS_DIALYTIKA; } int numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota. if ((data & HAS_YPOGEGRAMMENI) != 0) { numYpogegrammeni = 1; } // Skip combining diacritics after this Greek letter. while (nextIndex < s.length()) { int diacriticData = getDiacriticData(s.charAt(nextIndex)); if (diacriticData != 0) { data |= diacriticData; if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) { ++numYpogegrammeni; } ++nextIndex; } else { break; // not a Greek diacritic } } if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) { nextState |= AFTER_VOWEL_WITH_ACCENT; } // Map according to Greek rules. boolean addTonos = false; if (upper == 'Η' && (data & HAS_ACCENT) != 0 && numYpogegrammeni == 0 && (state & AFTER_CASED) == 0 && !isFollowedByCasedLetter(s, nextIndex)) { // Keep disjunctive "or" with (only) a tonos. 
// We use the same "word boundary" conditions as for the Final_Sigma test. if (i == nextIndex) { upper = 'Ή'; // Preserve the precomposed form. } else { addTonos = true; } } else if ((data & HAS_DIALYTIKA) != 0) { // Preserve a vowel with dialytika in precomposed form if it exists. if (upper == 'Ι') { upper = 'Ϊ'; data &= ~HAS_EITHER_DIALYTIKA; } else if (upper == 'Υ') { upper = 'Ϋ'; data &= ~HAS_EITHER_DIALYTIKA; } } result.appendCodePoint(upper); if ((data & HAS_EITHER_DIALYTIKA) != 0) { result.append('\u0308'); // restore or add a dialytika } if (addTonos) { result.append('\u0301'); } while (numYpogegrammeni > 0) { result.append('Ι'); --numYpogegrammeni; } } else { c = UCaseProps.INSTANCE.toFullUpper(c, null, result, null, locCache); appendResult(c, result); } i = nextIndex; state = nextState; } return result.toString(); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy