com.adobe.xfa.ut.UniCharIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of aem-sdk-api Show documentation
The Adobe Experience Manager SDK
There is a newer version: 2024.11.18751.20241128T090041Z-241100
package com.adobe.xfa.ut;

/**
 * Allow iteration by Unicode characters over a Java UTF-16 encoded
 * string.
 * 
 * A Java character is only a 16-bit quantity.  Unicode characters can
 * have values up to 0x10FFFF, which exceeds the space available in a
 * Java character.  When such Unicode characters appear in a Java string
 * they are encoded using the UTF-16 encoding and occupy two
 * consecutive Java characters, known as a surrogate pair.
 * 
 * 
 * This class allows the caller to step through a Java string in true
 * Unicode character amounts.  It also provides some static methods to
 * generate Java characters from Unicode characters.
 * 
 * 
 * An iterator instance is associated with an instance of the Java
 * CharSequence interface.	This interface is implemented by both the
 * String and StringBuilder classes.
 * 
 * 
 * At any given time, one can think of the iterator as being positioned
 * between characters in the associated character sequence.  It can also
 * be positioned before the first character and after the last.
 * Operations move the iterator forward or backward in the underlying
 * character sequence and return the Unicode character passed over.
 * 
 * 
 * The iterator carries an index number that can be useful for indexing
 * into the character sequence independently of the iterator.  Index
 * values start at zero and count up to the number of Java characters in
 * the sequence.  Index zero is before the first character, index one is
 * between the first and second characters, and so on.
 * 
 * 
 * It does not make sense for the iterator to be positioned between the
 * two Java characters making up a surrogate pair.	Subsequent
 * operations could lead to assertion errors and unpredictable results.
 * 
 * 
 * Note: The iterator caches the length of the given character sequence.
 * If the caller is using an iterator and modifies the sequence in such
 * a way that its length changes, it must call an attach() overload
 * to re-establish the length.
 * 
 * @exclude from published api.
 */

public class UniCharIterator {
	/** Character sequence being iterated; null until attach() is called. */
	private CharSequence mCharSequence;
	/** Length of mCharSequence, cached when attach() is called. */
	private int mLength;
	/** Current position, counted in Java (UTF-16) characters. */
	private int mIndex;

/**
 * Default constructor.
 * 
 * The iterator is not associated with any character sequence, and is
 * not particularly useful until the attach() method is called.
 * 
 */
	public UniCharIterator () {
	}

/**
 * Construct an iterator associated with a given character sequence.
 * The iterator is initially positioned before the first character in
 * the sequence.
 * @param charSequence Character sequence to associate the iterator
 * with.
 */
	public UniCharIterator (CharSequence charSequence) {
		attach (charSequence);
	}

/**
 * Construct an iterator associated with a given sequence, and initially
 * positioned at a specified index.
 * @param charSequence Character sequence to associate the iterator
 * with.
 * @param index Index number into the character sequence, counted in
 * Java characters.  Values outside the sequence are clamped to the
 * nearest valid position.
 */
	public UniCharIterator (CharSequence charSequence, int index) {
		attach (charSequence, index);
	}

/**
 * Append a Unicode character to a Java StringBuilder.  This method
 * determines whether the Unicode character can be represented as a
 * single Java character or must be a surrogate pair.  It then adds the
 * appropriate Java character(s) to the given string builder.
 * @param s String builder to add to.
 * @param c Unicode character to be added; must be in the range
 * 0 to 0x10FFFF (checked by assertion only).
 */
	public static void append (StringBuilder s, int c) {
		assert ((c >= 0) && (c <= 0x10FFFF));
		if (c < 0x10000) {
			s.append ((char) c);
		} else {
			// UTF-16: subtract the supplementary-plane offset, then split the
			// remaining 20 bits into a high (top 10) and low (bottom 10) half.
			c -= 0x10000;
			s.append ((char) ((c >> 10) | 0xD800));
			s.append ((char) ((c & 0x3FF) | 0xDC00));
		}
	}

/**
 * Attach the iterator to a given character sequence.  The iterator is
 * initially positioned before the first character in the sequence.
 * @param charSequence Character sequence to associate the iterator
 * with.
 */
	public void attach (CharSequence charSequence) {
		attach (charSequence, 0);
	}

/**
 * Attach the iterator to a given sequence, and initially positioned at
 * a specified index.  The length of the sequence is cached at this
 * point.
 * @param charSequence Character sequence to associate the iterator
 * with.
 * @param index Index number into the character sequence, counted in
 * Java characters.  An index larger than the sequence length is
 * clamped to the length; a negative index is clamped to zero.
 */
	public void attach (CharSequence charSequence, int index) {
		mCharSequence = charSequence;
		mLength = charSequence.length();
		// Clamp both ends so a bad index cannot make a later charAt() throw.
		if (index < 0) {
			mIndex = 0;
		} else if (index > mLength) {
			mIndex = mLength;
		} else {
			mIndex = index;
		}
	}

/**
 * Get the current Java character index number of the iterator.
 * @return Index number, counted in Java characters from the start of
 * the sequence.
 */
	public int getIndex () {
		return mIndex;
	}

/**
 * Query whether the iterator is at the end of the text.
 * @return True if the iterator is positioned after the last character
 * in the underlying text; false if not.
 */
	public boolean isAtEnd () {
		return mIndex >= mLength;
	}

/**
 * Query whether the iterator is at the start of the text.
 * @return True if the iterator is positioned before the first character
 * in the underlying text; false if not.
 */
	public boolean isAtStart () {
		return mIndex == 0;
	}

/**
 * Advance the iterator by one Unicode character.  The iterator will not
 * be advanced if it is already positioned after the last Java
 * character in the sequence.  The iterator's index will increase by one
 * or two, depending on the makeup of the Unicode character it advances
 * over.
 * @return Unicode character (code point) advanced over, or zero if the
 * iterator was already at the end.
 */
	public int next () {
		assert (mCharSequence != null);
		if (mIndex >= mLength) {
			return 0;
		}
		// A Java char is an unsigned 16-bit value, so widening to int never
		// sign-extends.
		char first = mCharSequence.charAt (mIndex++);
		// Starting on a low surrogate means the iterator was positioned in
		// the middle of a pair.
		assert (! Character.isLowSurrogate (first));
		if (! Character.isHighSurrogate (first)) {
			return first;
		}
		assert (mIndex < mLength);
		char low = mCharSequence.charAt (mIndex++);
		assert (Character.isLowSurrogate (low));
		// Exact inverse of append(): recombine the two 10-bit halves and add
		// back the 0x10000 offset.
		return Character.toCodePoint (first, low);
	}

/**
 * Back up the iterator by one Unicode character.  The iterator will not
 * be moved if it is already positioned before the first Java character
 * in the sequence.  The iterator's index will decrease by one or two,
 * depending on the makeup of the Unicode character it moves over.
 * @return Unicode character (code point) passed over, or zero if the
 * iterator was already at the start.
 */
	public int prev () {
		if (mIndex <= 0) {
			return 0;
		}
		assert (mCharSequence != null);
		// A Java char is an unsigned 16-bit value, so widening to int never
		// sign-extends.
		char last = mCharSequence.charAt (--mIndex);
		// Landing on a high surrogate when moving backward means the iterator
		// was positioned in the middle of a pair.
		assert (! Character.isHighSurrogate (last));
		if (! Character.isLowSurrogate (last)) {
			return last;
		}
		assert (mIndex > 0);
		char high = mCharSequence.charAt (--mIndex);
		assert (Character.isHighSurrogate (high));
		// Exact inverse of append(): recombine the two 10-bit halves and add
		// back the 0x10000 offset.
		return Character.toCodePoint (high, last);
	}

/**
 * Set the iterator's index.  This method changes the index, but keeps
 * the iterator associated with the same character sequence.
 * @param index New index to set for this iterator, counted in Java
 * characters.  The range is checked by assertion only.
 */
	public void setIndex (int index) {
		assert ((index >= 0) && (index <= mCharSequence.length()));
		mIndex = index;
	}

/**
 * Return a Java string that represents the given Unicode character.
 * @param c Unicode character to convert to a Java string.
 * @return Resulting String.  If the character is less than 0x10000, the
 * result will simply contain the single character passed in.  Otherwise
 * it will contain the two characters making up the surrogate pair.
 */
	public static String toString (int c) {
		StringBuilder s = new StringBuilder();
		append (s, c);
		return s.toString();
	}
}