// com.adobe.xfa.ut.UniCharIterator Maven / Gradle / Ivy
// Show all versions of aem-sdk-api Show documentation
package com.adobe.xfa.ut;
/**
* Allow iteration by Unicode characters over a Java UTF-16 encoded
* string.
*
* A Java character is only a 16-bit quantity. Unicode characters can
* have values up to 0x10FFFF, which exceeds the space available in a
* Java character. When such Unicode characters appear in a Java string
* they are encoded using the UTF-16 encoding and occupy two
* consecutive Java characters, known as a surrogate pair.
*
*
* This class allows the caller to step through a Java string in true
* Unicode character amounts. It also provides some static methods to
* generate Java characters from Unicode characters.
*
*
* An iterator instance is associated with an instance of the Java
* CharSequence interface. This interface is implemented by both the
* String and StringBuilder classes.
*
*
* At any given time, one can think of the iterator as being positioned
* between characters in the associated character sequence. It can also
* be positioned before the first character and after the last.
* Operations move the iterator forward or backward in the underlying
* character sequence and return the Unicode character passed over.
*
*
* The iterator carries an index number that can be useful for indexing
* into the character sequence independently of the iterator. Index
* values start at zero and count up to the number of Java characters in
* the sequence. Index zero is before the first character, index one is
* between the first and second characters, and so on.
*
*
* It does not make sense for the iterator to be positioned between the
* two Java characters making up a surrogate pair. Subsequent
* operations could lead to assertion errors and unpredictable results.
*
*
* Note: The iterator caches the length of the given character sequence.
* If the caller is using an iterator and modifies the sequence in such
* a way that its length changes, it must call an attach() overload
* to re-establish the length.
*
* @exclude from published api.
*/
public class UniCharIterator {
    /** Character sequence being iterated; null until attach() is called. */
    private CharSequence mCharSequence;
    /** Length of the sequence, cached when attach() was last called. */
    private int mLength;
    /** Current position, counted in Java (UTF-16) characters. */
    private int mIndex;

    /**
     * Default constructor.
     *
     * The iterator is not associated with any character sequence, and is
     * not particularly useful until the attach() method is called.
     */
    public UniCharIterator () {
    }

    /**
     * Construct an iterator associated with a given character sequence.
     * The iterator is initially positioned before the first character in
     * the sequence.
     * @param charSequence Character sequence to associate the iterator
     * with.
     */
    public UniCharIterator (CharSequence charSequence) {
        attach (charSequence);
    }

    /**
     * Construct an iterator associated with a given sequence, and initially
     * positioned at a specified index.
     * @param charSequence Character sequence to associate the iterator
     * with.
     * @param index Index number into the character sequence, counted in
     * Java characters. Values outside the sequence are clamped.
     */
    public UniCharIterator (CharSequence charSequence, int index) {
        attach (charSequence, index);
    }

    /**
     * Append a Unicode character to a Java StringBuilder. This method
     * determines whether the Unicode character can be represented as a
     * single Java character or must be a surrogate pair. It then adds the
     * appropriate Java character(s) to the given string builder.
     * @param s String builder to add to.
     * @param c Unicode character to be added; asserted to lie in the
     * range 0 to 0x10FFFF.
     */
    public static void append (StringBuilder s, int c) {
        assert ((c >= 0) && (c <= 0x10FFFF));
        if (c < 0x10000) {
            s.append ((char) c);
        } else {
            // UTF-16: remove the 0x10000 offset, then split the remaining
            // 20 bits into a high (top 10) and low (bottom 10) surrogate.
            int offset = c - 0x10000;
            s.append ((char) ((offset >> 10) | 0xD800));
            s.append ((char) ((offset & 0x3FF) | 0xDC00));
        }
    }

    /**
     * Attach the iterator to a given character sequence. The iterator is
     * initially positioned before the first character in the sequence.
     * @param charSequence Character sequence to associate the iterator
     * with.
     */
    public void attach (CharSequence charSequence) {
        attach (charSequence, 0);
    }

    /**
     * Attach the iterator to a given sequence, and initially positioned at
     * a specified index. The length of the sequence is cached at this
     * point.
     * @param charSequence Character sequence to associate the iterator
     * with. Must not be null.
     * @param index Index number into the character sequence, counted in
     * Java characters. A negative value is treated as zero and a value
     * beyond the end of the sequence is treated as the sequence length.
     */
    public void attach (CharSequence charSequence, int index) {
        mCharSequence = charSequence;
        mLength = charSequence.length();
        // Clamp into [0, mLength] so that next() and prev() never index
        // outside the sequence.
        mIndex = Math.max (0, Math.min (index, mLength));
    }

    /**
     * Get the current Java character index number of the iterator.
     * @return Index number, counted in Java characters from the start of
     * the sequence.
     */
    public int getIndex () {
        return mIndex;
    }

    /**
     * Query whether the iterator is at the end of the text.
     * @return True if the iterator is positioned after the last character
     * in the underlying text; false if not.
     */
    public boolean isAtEnd () {
        return mIndex >= mLength;
    }

    /**
     * Query whether the iterator is at the start of the text.
     * @return True if the iterator is positioned before the first character
     * in the underlying text; false if not.
     */
    public boolean isAtStart () {
        return mIndex == 0;
    }

    /**
     * Advance the iterator by one Unicode character. The iterator will not
     * be advanced if it is already positioned after the last Java
     * character in the sequence. The iterator's index will increase by one
     * or two, depending on the makeup of the Unicode character it advances
     * over.
     *
     * Malformed input (an unpaired surrogate) trips an assertion when
     * assertions are enabled; otherwise the lone surrogate is returned
     * as-is and the index advances by one.
     * @return Unicode character advanced over, or zero if the iterator was
     * already at the end.
     */
    public int next () {
        assert (mCharSequence != null);
        if (mIndex >= mLength) {
            return 0;
        }
        // Java char widens to int without sign extension, so the value can
        // be used directly as a code unit.
        char lead = mCharSequence.charAt (mIndex++);
        assert (! Character.isLowSurrogate (lead));
        if (Character.isHighSurrogate (lead)) {
            assert (mIndex < mLength);
            if (mIndex < mLength) {
                char trail = mCharSequence.charAt (mIndex);
                assert (Character.isLowSurrogate (trail));
                if (Character.isLowSurrogate (trail)) {
                    mIndex++;
                    return Character.toCodePoint (lead, trail);
                }
            }
        }
        return lead;
    }

    /**
     * Back up the iterator by one Unicode character. The iterator will not
     * be moved if it is already positioned before the first Java character
     * in the sequence. The iterator's index will decrease by one or two,
     * depending on the makeup of the Unicode character it moves over.
     *
     * Malformed input (an unpaired surrogate) trips an assertion when
     * assertions are enabled; otherwise the lone surrogate is returned
     * as-is and the index moves back by one.
     * @return Unicode character passed over, or zero if the iterator was
     * already at the start.
     */
    public int prev () {
        if (mIndex <= 0) {
            return 0;
        }
        assert (mCharSequence != null);
        char trail = mCharSequence.charAt (--mIndex);
        assert (! Character.isHighSurrogate (trail));
        if (Character.isLowSurrogate (trail)) {
            assert (mIndex > 0);
            if (mIndex > 0) {
                char lead = mCharSequence.charAt (mIndex - 1);
                assert (Character.isHighSurrogate (lead));
                if (Character.isHighSurrogate (lead)) {
                    mIndex--;
                    return Character.toCodePoint (lead, trail);
                }
            }
        }
        return trail;
    }

    /**
     * Set the iterator's index. This method changes the index, but keeps
     * the iterator associated with the same character sequence. The index
     * is asserted to lie within the sequence's current length; it is not
     * otherwise validated or clamped.
     * @param index New index to set for this iterator.
     */
    public void setIndex (int index) {
        assert (mCharSequence != null);
        assert ((index >= 0) && (index <= mCharSequence.length()));
        mIndex = index;
    }

    /**
     * Return a Java string that represents the given Unicode character.
     * @param c Unicode character to convert to a Java string.
     * @return Resulting String. If the character is less than 0x10000, the
     * result will simply contain the single character passed in. Otherwise
     * it will contain the two characters making up the surrogate pair.
     */
    public static String toString (int c) {
        StringBuilder s = new StringBuilder();
        append (s, c);
        return s.toString();
    }
}