All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.lang.UScriptRun Maven / Gradle / Ivy

Go to download

International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support

There is a newer version: 76.1
Show newest version
/*
 *******************************************************************************
 *
 *   Copyright (C) 1999-2009, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 */

package com.ibm.icu.lang;

import com.ibm.icu.text.UTF16;

/**
 * UScriptRun is used to find runs of characters in
 * the same script, as defined in the UScript class.
 * It implements a simple iterator over an array of characters.
 * The iterator will assign COMMON and INHERITED
 * characters to the same script as the preceeding characters. If the
 * COMMON and INHERITED characters are first, they will be assigned to
 * the same script as the following characters.
 *
 * The iterator will try to match paired punctuation. If it sees an
 * opening punctuation character, it will remember the script that
 * was assigned to that character, and assign the same script to the
 * matching closing punctuation.
 *
 * No attempt is made to combine related scripts into a single run. In
 * particular, Hiragana, Katakana, and Han characters will appear in separate
 * runs.

 * Here is an example of how to iterate over script runs:
 * 
 * void printScriptRuns(char[] text)
 * {
 *     UScriptRun scriptRun = new UScriptRun(text);
 *
 *     while (scriptRun.next()) {
 *         int start  = scriptRun.getScriptStart();
 *         int limit  = scriptRun.getScriptLimit();
 *         int script = scriptRun.getScriptCode();
 *
 *         System.out.println("Script \"" + UScript.getName(script) + "\" from " +
 *                            start + " to " + limit + ".");
 *     }
 *  }
 * 
* * @internal * @deprecated This API is ICU internal only. */ public final class UScriptRun { /** * Construct an empty UScriptRun object. The next() * method will return false the first time it is called. * * @internal * @deprecated This API is ICU internal only. */ public UScriptRun() { char[] nullChars = null; reset(nullChars, 0, 0); } /** * Construct a UScriptRun object which iterates over the * characters in the given string. * * @param text the string of characters over which to iterate. * * @internal * @deprecated This API is ICU internal only. */ public UScriptRun(String text) { reset (text); } /** * Construct a UScriptRun object which iterates over a subrange * of the characetrs in the given string. * * @param text the string of characters over which to iterate. * @param start the index of the first character over which to iterate * @param count the number of characters over which to iterate * * @internal * @deprecated This API is ICU internal only. */ public UScriptRun(String text, int start, int count) { reset(text, start, count); } /** * Construct a UScriptRun object which iterates over the given * characetrs. * * @param chars the array of characters over which to iterate. * * @internal * @deprecated This API is ICU internal only. */ public UScriptRun(char[] chars) { reset(chars); } /** * Construct a UScriptRun object which iterates over a subrange * of the given characetrs. * * @param chars the array of characters over which to iterate. * @param start the index of the first character over which to iterate * @param count the number of characters over which to iterate * * @internal * @deprecated This API is ICU internal only. */ public UScriptRun(char[] chars, int start, int count) { reset(chars, start, count); } /** * Reset the iterator to the start of the text. * * @internal * @deprecated This API is ICU internal only. */ public final void reset() { // empty any old parenStack contents. // NOTE: this is not the most efficient way // to do this, but it's the easiest to write... while (stackIsNotEmpty()) { pop(); } scriptStart = textStart; scriptLimit = textStart; scriptCode = UScript.INVALID_CODE; parenSP = -1; pushCount = 0; fixupCount = 0; textIndex = textStart; } /** * Reset the iterator to iterate over the given range of the text. Throws * IllegalArgumentException if the range is outside of the bounds of the * character array. * * @param start the index of the new first character over which to iterate * @param count the new number of characters over which to iterate. * @exception IllegalArgumentException If invalid arguments are passed. * * @internal * @deprecated This API is ICU internal only. */ public final void reset(int start, int count) throws IllegalArgumentException { int len = 0; if (text != null) { len = text.length; } if (start < 0 || count < 0 || start > len - count) { throw new IllegalArgumentException(); } textStart = start; textLimit = start + count; reset(); } /** * Reset the iterator to iterate over count characters * in chars starting at start. This allows * clients to reuse an iterator. * * @param chars the new array of characters over which to iterate. * @param start the index of the first character over which to iterate. * @param count the number of characters over which to iterate. * * @internal * @deprecated This API is ICU internal only. */ public final void reset(char[] chars, int start, int count) { if (chars == null) { chars = emptyCharArray; } text = chars; reset(start, count); } /** * Reset the iterator to iterate over the characters * in chars. This allows clients to reuse an iterator. * * @param chars the new array of characters over which to iterate. * * @internal * @deprecated This API is ICU internal only. */ public final void reset(char[] chars) { int length = 0; if (chars != null) { length = chars.length; } reset(chars, 0, length); } /** * Reset the iterator to iterate over count characters * in text starting at start. This allows * clients to reuse an iterator. * * @param str the new string of characters over which to iterate. * @param start the index of the first character over which to iterate. * @param count the nuber of characters over which to iterate. * * @internal * @deprecated This API is ICU internal only. */ public final void reset(String str, int start, int count) { char[] chars = null; if (str != null) { chars = str.toCharArray(); } reset(chars, start, count); } /** * Reset the iterator to iterate over the characters * in text. This allows clients to reuse an iterator. * * @param str the new string of characters over which to iterate. * * @internal * @deprecated This API is ICU internal only. */ public final void reset(String str) { int length = 0; if (str != null) { length = str.length(); } reset(str, 0, length); } /** * Get the starting index of the current script run. * * @return the index of the first character in the current script run. * * @internal * @deprecated This API is ICU internal only. */ public final int getScriptStart() { return scriptStart; } /** * Get the index of the first character after the current script run. * * @return the index of the first character after the current script run. * * @internal * @deprecated This API is ICU internal only. */ public final int getScriptLimit() { return scriptLimit; } /** * Get the script code for the script of the current script run. * * @return the script code for the script of the current script run. * @see com.ibm.icu.lang.UScript * * @internal * @deprecated This API is ICU internal only. */ public final int getScriptCode() { return scriptCode; } /** * Find the next script run. Returns false if there * isn't another run, returns true if there is. * * @return false if there isn't another run, true if there is. * * @internal * @deprecated This API is ICU internal only. */ public final boolean next() { // if we've fallen off the end of the text, we're done if (scriptLimit >= textLimit) { return false; } scriptCode = UScript.COMMON; scriptStart = scriptLimit; syncFixup(); while (textIndex < textLimit) { int ch = UTF16.charAt(text, textStart, textLimit, textIndex - textStart); int codePointCount = UTF16.getCharCount(ch); int sc = UScript.getScript(ch); int pairIndex = getPairIndex(ch); textIndex += codePointCount; // Paired character handling: // // if it's an open character, push it onto the stack. // if it's a close character, find the matching open on the // stack, and use that script code. Any non-matching open // characters above it on the stack will be poped. if (pairIndex >= 0) { if ((pairIndex & 1) == 0) { push(pairIndex, scriptCode); } else { int pi = pairIndex & ~1; while (stackIsNotEmpty() && top().pairIndex != pi) { pop(); } if (stackIsNotEmpty()) { sc = top().scriptCode; } } } if (sameScript(scriptCode, sc)) { if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) { scriptCode = sc; fixup(scriptCode); } // if this character is a close paired character, // pop the matching open character from the stack if (pairIndex >= 0 && (pairIndex & 1) != 0) { pop(); } } else { // We've just seen the first character of // the next run. Back over it so we'll see // it again the next time. textIndex -= codePointCount; break; } } scriptLimit = textIndex; return true; } /** * Compare two script codes to see if they are in the same script. If one script is * a strong script, and the other is INHERITED or COMMON, it will compare equal. * * @param scriptOne one of the script codes. * @param scriptTwo the other script code. * @return true if the two scripts are the same. * @see com.ibm.icu.lang.UScript */ private static boolean sameScript(int scriptOne, int scriptTwo) { return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED || scriptOne == scriptTwo; } /* * An internal class which holds entries on the paren stack. */ private static final class ParenStackEntry { int pairIndex; int scriptCode; public ParenStackEntry(int thePairIndex, int theScriptCode) { pairIndex = thePairIndex; scriptCode = theScriptCode; } } private static final int mod(int sp) { return sp % PAREN_STACK_DEPTH; } private static final int inc(int sp, int count) { return mod(sp + count); } private static final int inc(int sp) { return inc(sp, 1); } private static final int dec(int sp, int count) { return mod(sp + PAREN_STACK_DEPTH - count); } private static final int dec(int sp) { return dec(sp, 1); } private static final int limitInc(int count) { if (count < PAREN_STACK_DEPTH) { count += 1; } return count; } private final boolean stackIsEmpty() { return pushCount <= 0; } private final boolean stackIsNotEmpty() { return ! stackIsEmpty(); } private final void push(int pairIndex, int scrptCode) { pushCount = limitInc(pushCount); fixupCount = limitInc(fixupCount); parenSP = inc(parenSP); parenStack[parenSP] = new ParenStackEntry(pairIndex, scrptCode); } private final void pop() { if (stackIsEmpty()) { return; } parenStack[parenSP] = null; if (fixupCount > 0) { fixupCount -= 1; } pushCount -= 1; parenSP = dec(parenSP); // If the stack is now empty, reset the stack // pointers to their initial values. if (stackIsEmpty()) { parenSP = -1; } } private final ParenStackEntry top() { return parenStack[parenSP]; } private final void syncFixup() { fixupCount = 0; } private final void fixup(int scrptCode) { int fixupSP = dec(parenSP, fixupCount); while (fixupCount-- > 0) { fixupSP = inc(fixupSP); parenStack[fixupSP].scriptCode = scrptCode; } } private char[] emptyCharArray = {}; private char[] text; private int textIndex; private int textStart; private int textLimit; private int scriptStart; private int scriptLimit; private int scriptCode; private static int PAREN_STACK_DEPTH = 32; private static ParenStackEntry parenStack[] = new ParenStackEntry[PAREN_STACK_DEPTH]; private int parenSP = -1; private int pushCount = 0; private int fixupCount = 0; /** * Find the highest bit that's set in a word. Uses a binary search through * the bits. * * @param n the word in which to find the highest bit that's set. * @return the bit number (counting from the low order bit) of the highest bit. */ private static final byte highBit(int n) { if (n <= 0) { return -32; } byte bit = 0; if (n >= 1 << 16) { n >>= 16; bit += 16; } if (n >= 1 << 8) { n >>= 8; bit += 8; } if (n >= 1 << 4) { n >>= 4; bit += 4; } if (n >= 1 << 2) { n >>= 2; bit += 2; } if (n >= 1 << 1) { n >>= 1; bit += 1; } return bit; } /** * Search the pairedChars array for the given character. * * @param ch the character for which to search. * @return the index of the character in the table, or -1 if it's not there. */ private static int getPairIndex(int ch) { int probe = pairedCharPower; int index = 0; if (ch >= pairedChars[pairedCharExtra]) { index = pairedCharExtra; } while (probe > (1 << 0)) { probe >>= 1; if (ch >= pairedChars[index + probe]) { index += probe; } } if (pairedChars[index] != ch) { index = -1; } return index; } private static int pairedChars[] = { 0x0028, 0x0029, // ascii paired punctuation 0x003c, 0x003e, 0x005b, 0x005d, 0x007b, 0x007d, 0x00ab, 0x00bb, // guillemets 0x2018, 0x2019, // general punctuation 0x201c, 0x201d, 0x2039, 0x203a, 0x3008, 0x3009, // chinese paired punctuation 0x300a, 0x300b, 0x300c, 0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3014, 0x3015, 0x3016, 0x3017, 0x3018, 0x3019, 0x301a, 0x301b }; private static int pairedCharPower = 1 << highBit(pairedChars.length); private static int pairedCharExtra = pairedChars.length - pairedCharPower; }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy