All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.adobe.xfa.ut.UniCharIterator Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
package com.adobe.xfa.ut;

/**
 * Allow iteration by Unicode characters over a Java UTF-16 encoded
 * string.
 *
 * <p>
 * A Java character is only a 16-bit quantity. Unicode characters can
 * have values up to 0x10FFFF, which exceeds the space available in a
 * Java character. When such Unicode characters appear in a Java string
 * they are encoded using the UTF-16 encoding and occupy two
 * consecutive Java characters, known as a surrogate pair.
 * </p>
 *
 * <p>
 * This class allows the caller to step through a Java string in true
 * Unicode character amounts. It also provides some static methods to
 * generate Java characters from Unicode characters.
 * </p>
 *
 * <p>
 * An iterator instance is associated with an instance of the Java
 * CharSequence interface. This interface is implemented by both the
 * String and StringBuilder classes.
 * </p>
 *
 * <p>
 * At any given time, one can think of the iterator as being positioned
 * between characters in the associated character sequence. It can also
 * be positioned before the first character and after the last.
 * Operations move the iterator forward or backward in the underlying
 * character sequence and return the Unicode character passed over.
 * </p>
 *
 * <p>
 * The iterator carries an index number that can be useful for indexing
 * into the character sequence independently of the iterator. Index
 * values start at zero and count up to the number of Java characters in
 * the sequence. Index zero is before the first character, index one is
 * between the first and second characters, and so on.
 * </p>
 *
 * <p>
 * It does not make sense for the iterator to be positioned between the
 * two Java characters making up a surrogate pair. When assertions are
 * enabled, stepping over an unpaired surrogate raises an assertion
 * error. When assertions are disabled, the unpaired surrogate is
 * returned unchanged as if it were a character in its own right and
 * the iterator moves by exactly one Java character.
 * </p>
 *
 * <p>
 * Note: The iterator caches the length of the given character sequence.
 * If the caller is using an iterator and modifies the sequence in such
 * a way that its length changes, it must call an attach() overload
 * to re-establish the length.
 * </p>
 *
 * @exclude from published api.
 */
public class UniCharIterator {
    private CharSequence mCharSequence;
    private int mLength;
    private int mIndex;

    /**
     * Default constructor.
     * <p>
     * The iterator is not associated with any character sequence, and is
     * not particularly useful until the attach() method is called. Until
     * then it behaves like an iterator over an empty sequence.
     * </p>
     */
    public UniCharIterator () {
    }

    /**
     * Construct an iterator associated with a given character sequence.
     * The iterator is initially positioned before the first character in
     * the sequence.
     * @param charSequence Character sequence to associate the iterator
     * with.
     */
    public UniCharIterator (CharSequence charSequence) {
        attach (charSequence);
    }

    /**
     * Construct an iterator associated with a given sequence, and initially
     * positioned at a specified index.
     * @param charSequence Character sequence to associate the iterator
     * with.
     * @param index Index number into the character sequence, with meaning
     * as described above.
     */
    public UniCharIterator (CharSequence charSequence, int index) {
        attach (charSequence, index);
    }

    /**
     * Append a Unicode character to a Java StringBuilder. This method
     * determines whether the Unicode character can be represented as a
     * single Java character or must be a surrogate pair. It then adds the
     * appropriate Java character(s) to the given string buffer.
     * @param s String buffer to add to.
     * @param c Unicode character to be added. Expected to lie in the
     * range 0 to 0x10FFFF; this is checked by assertion only.
     */
    public static void append (StringBuilder s, int c) {
        assert ((c >= Character.MIN_CODE_POINT) && (c <= Character.MAX_CODE_POINT));
        if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
            s.append ((char) c);
        } else {
            // UTF-16: remove the 0x10000 bias, then split the remaining
            // 20 bits into two 10-bit halves.
            int offset = c - Character.MIN_SUPPLEMENTARY_CODE_POINT;
            s.append ((char) ((offset >> 10) | Character.MIN_HIGH_SURROGATE));
            s.append ((char) ((offset & 0x3FF) | Character.MIN_LOW_SURROGATE));
        }
    }

    /**
     * Attach the iterator to a given character sequence. The iterator is
     * initially positioned before the first character in the sequence.
     * @param charSequence Character sequence to associate the iterator
     * with.
     */
    public void attach (CharSequence charSequence) {
        attach (charSequence, 0);
    }

    /**
     * Attach the iterator to a given sequence, and initially positioned at
     * a specified index. The index is clamped to the range from zero to
     * the length of the sequence.
     * @param charSequence Character sequence to associate the iterator
     * with.
     * @param index Index number into the character sequence, with meaning
     * as described above.
     * @throws NullPointerException if charSequence is null.
     */
    public void attach (CharSequence charSequence, int index) {
        mCharSequence = charSequence;
        mLength = charSequence.length();
        // Clamp on both sides so that a bad index can never make a later
        // next()/prev() call read outside the sequence.
        mIndex = Math.max (0, Math.min (index, mLength));
    }

    /**
     * Get the current Java character index number of the iterator.
     * @return Index number, as described above.
     */
    public int getIndex () {
        return mIndex;
    }

    /**
     * Query whether the iterator is at the end of the text.
     * @return True if the iterator is positioned after the last character
     * in the underlying text; false if not.
     */
    public boolean isAtEnd () {
        return mIndex >= mLength;
    }

    /**
     * Query whether the iterator is at the start of the text.
     * @return True if the iterator is positioned before the first character
     * in the underlying text; false if not.
     */
    public boolean isAtStart () {
        return mIndex == 0;
    }

    /**
     * Advance the iterator by one Unicode character. The iterator will not
     * be advanced if it is already positioned after the last Java
     * character in the sequence. The iterator's index will increase by one
     * or two, depending on the makeup of the Unicode character it advances
     * over.
     * @return Unicode character advanced over, or zero if the iterator
     * was already at the end. Because zero is also a valid character,
     * use isAtEnd() to tell the two cases apart.
     */
    public int next () {
        if (mIndex >= mLength) {
            return 0;
        }
        assert (mCharSequence != null);

        // A Java char is unsigned, so widening it to int never sign-extends.
        char first = mCharSequence.charAt (mIndex++);
        assert (! Character.isLowSurrogate (first)) : "iterator positioned inside a surrogate pair";

        if (Character.isHighSurrogate (first)) {
            assert (mIndex < mLength) : "high surrogate at end of sequence";
            if (mIndex < mLength) {
                char second = mCharSequence.charAt (mIndex);
                assert (Character.isLowSurrogate (second)) : "high surrogate not followed by low surrogate";
                if (Character.isLowSurrogate (second)) {
                    mIndex++;
                    return Character.toCodePoint (first, second);
                }
            }
        }

        // BMP character, or an unpaired surrogate passed through as is.
        return first;
    }

    /**
     * Back up the iterator by one Unicode character. The iterator will not
     * be moved if it is already positioned before the first Java character
     * in the sequence. The iterator's index will decrease by one or two,
     * depending on the makeup of the Unicode character it moves over.
     * @return Unicode character passed over, or zero if the iterator was
     * already at the start. Because zero is also a valid character, use
     * isAtStart() to tell the two cases apart.
     */
    public int prev () {
        if (mIndex <= 0) {
            return 0;
        }
        assert (mCharSequence != null);

        // A Java char is unsigned, so widening it to int never sign-extends.
        char last = mCharSequence.charAt (--mIndex);
        assert (! Character.isHighSurrogate (last)) : "iterator positioned inside a surrogate pair";

        if (Character.isLowSurrogate (last)) {
            assert (mIndex > 0) : "low surrogate at start of sequence";
            if (mIndex > 0) {
                char before = mCharSequence.charAt (mIndex - 1);
                assert (Character.isHighSurrogate (before)) : "low surrogate not preceded by high surrogate";
                if (Character.isHighSurrogate (before)) {
                    mIndex--;
                    return Character.toCodePoint (before, last);
                }
            }
        }

        // BMP character, or an unpaired surrogate passed through as is.
        return last;
    }

    /**
     * Set the iterator's index. This method changes the index, but keeps
     * the iterator associated with the same character sequence. The index
     * is range-checked by assertion only; it is stored unchanged.
     * @param index New index to set for this iterator.
     */
    public void setIndex (int index) {
        // An unattached iterator is treated as having an empty sequence so
        // that the range check itself cannot fail with a null dereference.
        assert ((index >= 0) && (index <= ((mCharSequence == null) ? 0 : mCharSequence.length())));
        mIndex = index;
    }

    /**
     * Return a Java string that represents the given Unicode character.
     * @param c Unicode character to convert to a Java string.
     * @return Resulting String. If the character is less than 0x10000, the
     * result will simply contain the single character passed in. Otherwise
     * it will contain the two characters making up the surrogate pair.
     */
    public static String toString (int c) {
        StringBuilder s = new StringBuilder();
        append (s, c);
        return s.toString();
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy