com.ibm.icu.lang.UScriptRun Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hydraql-template_1.4 Show documentation
The newest version!
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 *
 *   Copyright (C) 1999-2014, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 */

package com.ibm.icu.lang;

import com.ibm.icu.text.UTF16;

/**
 * UScriptRun is used to find runs of characters in
 * the same script, as defined in the UScript class.
 * It implements a simple iterator over an array of characters.
 * The iterator will assign COMMON and INHERITED
 * characters to the same script as the preceding characters. If the
 * COMMON and INHERITED characters are first, they will be assigned to
 * the same script as the following characters.
 *
 * The iterator will try to match paired punctuation. If it sees an
 * opening punctuation character, it will remember the script that
 * was assigned to that character, and assign the same script to the
 * matching closing punctuation.
 *
 * No attempt is made to combine related scripts into a single run. In
 * particular, Hiragana, Katakana, and Han characters will appear in separate
 * runs.

 * Here is an example of how to iterate over script runs:
 *  * void printScriptRuns(char[] text)
 * {
 *     UScriptRun scriptRun = new UScriptRun(text);
 *
 *     while (scriptRun.next()) {
 *         int start  = scriptRun.getScriptStart();
 *         int limit  = scriptRun.getScriptLimit();
 *         int script = scriptRun.getScriptCode();
 *
 *         System.out.println("Script \"" + UScript.getName(script) + "\" from " +
 *                            start + " to " + limit + ".");
 *     }
 *  }
 * 
 *
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public final class UScriptRun
{
    /**
     * Construct an empty UScriptRun object. The next()
     * method will return false the first time it is called.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public UScriptRun()
    {
        char[] nullChars = null;
        
        reset(nullChars, 0, 0);
    }
    
    /**
     * Construct a UScriptRun object which iterates over the
     * characters in the given string.
     *
     * @param text the string of characters over which to iterate.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public UScriptRun(String text)
    {
        reset (text);
    }
    
    /**
     * Construct a UScriptRun object which iterates over a subrange
     * of the characters in the given string.
     *
     * @param text the string of characters over which to iterate.
     * @param start the index of the first character over which to iterate
     * @param count the number of characters over which to iterate
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public UScriptRun(String text, int start, int count)
    {
        reset(text, start, count);
    }

    /**
     * Construct a UScriptRun object which iterates over the given
     * characters.
     *
     * @param chars the array of characters over which to iterate.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public UScriptRun(char[] chars)
    {
        reset(chars);
    }

    /**
     * Construct a UScriptRun object which iterates over a subrange
     * of the given characters.
     *
     * @param chars the array of characters over which to iterate.
     * @param start the index of the first character over which to iterate
     * @param count the number of characters over which to iterate
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public UScriptRun(char[] chars, int start, int count)
    {
        reset(chars, start, count);
    }


    /**
     * Reset the iterator to the start of the text.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public final void reset()
    {
        // empty any old parenStack contents.
        // NOTE: this is not the most efficient way
        // to do this, but it's the easiest to write...
        while (stackIsNotEmpty()) {
            pop();
        }
        
        scriptStart = textStart;
        scriptLimit = textStart;
        scriptCode  = UScript.INVALID_CODE;
        parenSP     = -1;
        pushCount   =  0;
        fixupCount  =  0;
        
        textIndex = textStart;
    }

    /**
     * Reset the iterator to iterate over the given range of the text. Throws
     * IllegalArgumentException if the range is outside of the bounds of the
     * character array.
     *
     * @param start the index of the new first character over which to iterate
     * @param count the new number of characters over which to iterate.
     * @exception IllegalArgumentException If invalid arguments are passed.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public final void reset(int start, int count)
    throws IllegalArgumentException
    {
        int len = 0;
        
        if (text != null) {
            len = text.length;
        }
        
        if (start < 0 || count < 0 || start > len - count) {
            throw new IllegalArgumentException();
        }
        
        textStart = start;
        textLimit = start + count;

        reset();
    }

    /**
     * Reset the iterator to iterate over count characters
     * in chars starting at start. This allows
     * clients to reuse an iterator.
     *
     * @param chars the new array of characters over which to iterate.
     * @param start the index of the first character over which to iterate.
     * @param count the number of characters over which to iterate.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public final void reset(char[] chars, int start, int count)
    {
        if (chars == null) {
            chars = emptyCharArray;
        }
        
        text = chars;

        reset(start, count);
    }
    
    /**
     * Reset the iterator to iterate over the characters
     * in chars. This allows clients to reuse an iterator.
     *
     * @param chars the new array of characters over which to iterate.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public final void reset(char[] chars)
    {
        int length = 0;
        
        if (chars != null) {
            length = chars.length;
        }
        
        reset(chars, 0, length);
    }
    
    /**
     * Reset the iterator to iterate over count characters
     * in text starting at start. This allows
     * clients to reuse an iterator.
     *
     * @param str the new string of characters over which to iterate.
     * @param start the index of the first character over which to iterate.
     * @param count the nuber of characters over which to iterate.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public final void reset(String str, int start, int count)
    {
        char[] chars = null;
        
        if (str != null) {
            chars = str.toCharArray();
        }
        
        reset(chars, start, count);
    }
    
    /**
     * Reset the iterator to iterate over the characters
     * in text. This allows clients to reuse an iterator.
     *
     * @param str the new string of characters over which to iterate.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public final void reset(String str)
    {
        int length   = 0;
        
        if (str != null) {
            length = str.length();
        }
        
        reset(str, 0, length);
    }
        


    /**
     * Get the starting index of the current script run.
     *
     * @return the index of the first character in the current script run.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public final int getScriptStart()
    {
        return scriptStart;
    }

    /**
     * Get the index of the first character after the current script run.
     *
     * @return the index of the first character after the current script run.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public final int getScriptLimit()
    {
        return scriptLimit;
    }

    /**
     * Get the script code for the script of the current script run.
     *
     * @return the script code for the script of the current script run.
     * @see com.ibm.icu.lang.UScript
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public final int getScriptCode()
    {
        return scriptCode;
    }

    /**
     * Find the next script run. Returns false if there
     * isn't another run, returns true if there is.
     *
     * @return false if there isn't another run, true if there is.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public final boolean next()
    {
        // if we've fallen off the end of the text, we're done
        if (scriptLimit >= textLimit) {
            return false;
        }
    
        scriptCode  = UScript.COMMON;
        scriptStart = scriptLimit;
        
        syncFixup();
        
        while (textIndex < textLimit) {
            int ch = UTF16.charAt(text, textStart, textLimit, textIndex - textStart);
            int codePointCount = UTF16.getCharCount(ch);
            int sc = UScript.getScript(ch);
            int pairIndex = getPairIndex(ch);

            textIndex += codePointCount;
            
            // Paired character handling:
            //
            // if it's an open character, push it onto the stack.
            // if it's a close character, find the matching open on the
            // stack, and use that script code. Any non-matching open
            // characters above it on the stack will be poped.
            if (pairIndex >= 0) {
                if ((pairIndex & 1) == 0) {
                    push(pairIndex, scriptCode);
                } else {
                    int pi = pairIndex & ~1;

                    while (stackIsNotEmpty() && top().pairIndex != pi) {
                        pop();
                    }

                    if (stackIsNotEmpty()) {
                        sc = top().scriptCode;
                    }
                }
            }

            if (sameScript(scriptCode, sc)) {
                if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) {
                    scriptCode = sc;

                    fixup(scriptCode);
                }

                // if this character is a close paired character,
                // pop the matching open character from the stack
                if (pairIndex >= 0 && (pairIndex & 1) != 0) {
                    pop();
                }
            } else {
                // We've just seen the first character of
                // the next run. Back over it so we'll see
                // it again the next time.
                textIndex -= codePointCount;
                break;
            }
        }

        scriptLimit = textIndex;
        return true;
    }

    /**
     * Compare two script codes to see if they are in the same script. If one script is
     * a strong script, and the other is INHERITED or COMMON, it will compare equal.
     *
     * @param scriptOne one of the script codes.
     * @param scriptTwo the other script code.
     * @return true if the two scripts are the same.
     * @see com.ibm.icu.lang.UScript
     */
    private static boolean sameScript(int scriptOne, int scriptTwo)
    {
        return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED || scriptOne == scriptTwo;
    }

    /*
     * An internal class which holds entries on the paren stack.
     */
    private static final class ParenStackEntry
    {
        int pairIndex;
        int scriptCode;
        
        public ParenStackEntry(int thePairIndex, int theScriptCode)
        {
            pairIndex  = thePairIndex;
            scriptCode = theScriptCode;
        }
    }
    
    private static final int mod(int sp)
    {
        return sp % PAREN_STACK_DEPTH;
    }
    
    private static final int inc(int sp, int count)
    {
        return mod(sp + count);
    }
    
    private static final int inc(int sp)
    {
        return inc(sp, 1);
    }
    
    private static final int dec(int sp, int count)
    {
        return mod(sp + PAREN_STACK_DEPTH - count);
    }
    
    private static final int dec(int sp)
    {
        return dec(sp, 1);
    }
    
    private static final int limitInc(int count)
    {
        if (count < PAREN_STACK_DEPTH) {
            count += 1;
        }
        
        return count;
    }
    
    private final boolean stackIsEmpty()
    {
        return pushCount <= 0;
    }
    
    private final boolean stackIsNotEmpty()
    {
        return ! stackIsEmpty();
    }
    
    private final void push(int pairIndex, int scrptCode)
    {
        pushCount  = limitInc(pushCount);
        fixupCount = limitInc(fixupCount);
        
        parenSP = inc(parenSP);
        parenStack[parenSP] = new ParenStackEntry(pairIndex, scrptCode);
    }
    
    private final void pop()
    {
        
        if (stackIsEmpty()) {
            return;
        }
        
        parenStack[parenSP] = null;
        
        if (fixupCount > 0) {
            fixupCount -= 1;
        }
        
        pushCount -= 1;
        parenSP = dec(parenSP);
        
        // If the stack is now empty, reset the stack
        // pointers to their initial values.
        if (stackIsEmpty()) {
            parenSP = -1;
        }
    }
    
    private final ParenStackEntry top()
    {
        return parenStack[parenSP];
    }
    
    private final void syncFixup()
    {
        fixupCount = 0;
    }
    
    private final void fixup(int scrptCode)
    {
        int fixupSP = dec(parenSP, fixupCount);
        
        while (fixupCount-- > 0) {
            fixupSP = inc(fixupSP);
            parenStack[fixupSP].scriptCode = scrptCode;
        }
    }
    
    private char[] emptyCharArray = {};

    private char[] text;

    private int textIndex;
    private int  textStart;
    private int  textLimit;
    
    private int  scriptStart;
    private int  scriptLimit;
    private int  scriptCode;

    private static int PAREN_STACK_DEPTH = 32;
    private static ParenStackEntry parenStack[] = new ParenStackEntry[PAREN_STACK_DEPTH];
    private int parenSP = -1;
    private int pushCount = 0;
    private int fixupCount = 0;

    /**
     * Find the highest bit that's set in a word. Uses a binary search through
     * the bits.
     *
     * @param n the word in which to find the highest bit that's set.
     * @return the bit number (counting from the low order bit) of the highest bit.
     */
    private static final byte highBit(int n)
    {
        if (n <= 0) {
            return -32;
        }

        byte bit = 0;

        if (n >= 1 << 16) {
            n >>= 16;
            bit += 16;
        }

        if (n >= 1 << 8) {
            n >>= 8;
            bit += 8;
        }

        if (n >= 1 << 4) {
            n >>= 4;
            bit += 4;
        }

        if (n >= 1 << 2) {
            n >>= 2;
            bit += 2;
        }

        if (n >= 1 << 1) {
            n >>= 1;
            bit += 1;
        }

        return bit;
    }

    /**
     * Search the pairedChars array for the given character.
     *
     * @param ch the character for which to search.
     * @return the index of the character in the table, or -1 if it's not there.
     */
    private static int getPairIndex(int ch)
    {
        int probe = pairedCharPower;
        int index = 0;

        if (ch >= pairedChars[pairedCharExtra]) {
            index = pairedCharExtra;
        }

        while (probe > (1 << 0)) {
            probe >>= 1;

            if (ch >= pairedChars[index + probe]) {
                index += probe;
            }
        }

        if (pairedChars[index] != ch) {
            index = -1;
        }

        return index;
    }

    private static int pairedChars[] = {
        0x0028, 0x0029, // ascii paired punctuation
        0x003c, 0x003e,
        0x005b, 0x005d,
        0x007b, 0x007d,
        0x00ab, 0x00bb, // guillemets
        0x2018, 0x2019, // general punctuation
        0x201c, 0x201d,
        0x2039, 0x203a,
        0x3008, 0x3009, // chinese paired punctuation
        0x300a, 0x300b,
        0x300c, 0x300d,
        0x300e, 0x300f,
        0x3010, 0x3011,
        0x3014, 0x3015,
        0x3016, 0x3017,
        0x3018, 0x3019,
        0x301a, 0x301b
    };

    private static int pairedCharPower = 1 << highBit(pairedChars.length);
    private static int pairedCharExtra = pairedChars.length - pairedCharPower;
}