
// com.upokecenter.text.NormalizingCharacterInput (Maven / Gradle / Ivy)
package com.upokecenter.text;
/*
Written by Peter O. in 2014.
Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/
If you like this, you should donate to Peter O.
at: http://upokecenter.dreamhosters.com/articles/donate-now-2/
*/
import java.util.*;
import com.upokecenter.text.encoders.*;
/**
* A character input class that implements the Unicode normalization
* algorithm and contains methods and functionality to test and convert
* text strings for normalization. This is similar to the Normalizer
* class, except it implements the ICharacterInput interface.
* NOTICE: While this class's source code is in the public domain,
* the class uses a class, called NormalizationData, that
* includes data derived from the Unicode Character Database. In case
* doing so is required, the permission notice for the Unicode Character
* Database is given here:
COPYRIGHT AND PERMISSION NOTICE
* Copyright (c) 1991-2014 Unicode, Inc. All rights reserved.
* Distributed under the Terms of Use in
* http://www.unicode.org/copyright.html.
Permission is hereby
* granted, free of charge, to any person obtaining a copy of the
* Unicode data files and any associated documentation (the "Data
* Files") or Unicode software and any associated documentation (the
* "Software") to deal in the Data Files or Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, and/or sell copies of the Data
* Files or Software, and to permit persons to whom the Data Files or
* Software are furnished to do so, provided that (a) this copyright and
* permission notice appear with all copies of the Data Files or
* Software, (b) this copyright and permission notice appear in
* associated documentation, and (c) there is clear notice in each
* modified Data File or in the Software as well as in the documentation
* associated with the Data File(s) or Software that the data or
* software has been modified.
THE DATA FILES AND SOFTWARE ARE
* PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY
* RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
* THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
* CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE
* USE OR PERFORMANCE OF THE DATA FILES OR SOFTWARE.
Except as
* contained in this notice, the name of a copyright holder shall not be
* used in advertising or otherwise to promote the sale, use or other
* dealings in these Data Files or Software without prior written
* authorization of the copyright holder.
*/
public final class NormalizingCharacterInput implements ICharacterInput
{
/**
 * Writes the fully expanded decomposition of a code point into a buffer,
 * re-decomposing each piece of the first-level mapping recursively.
 * NOTE(review): the return value of UnicodeDatabase.GetDecomposition is
 * used here as the index just past the last element it wrote; confirm
 * against that method.
 * @param ch The code point to decompose.
 * @param compat Passed through to UnicodeDatabase.GetDecomposition
 * (callers in this class pass true for the NFKC/NFKD forms).
 * @param buffer Destination array.
 * @param index Position in {@code buffer} where writing starts.
 * @return The position in {@code buffer} just past the written data.
 */
static int DecompToBufferInternal(
    int ch,
    boolean compat,
    int[] buffer,
    int index) {
  int end = UnicodeDatabase.GetDecomposition(ch, compat, buffer, index);
  if (buffer[index] == ch) {
    // The first slot still holds the code point itself, so there is
    // nothing further to expand.
    return end;
  }
  // Snapshot the first-level mapping: the recursive calls below write
  // over the same region of the buffer.
  int[] firstLevel = new int[end - index];
  System.arraycopy(buffer, index, firstLevel, 0, firstLevel.length);
  int writePos = index;
  for (int piece : firstLevel) {
    writePos = DecompToBufferInternal(piece, compat, buffer, writePos);
  }
  return writePos;
}
/**
 * Writes the decomposition of a code point into a buffer. Precomposed
 * Hangul syllables (0xAC00 through 0xAC00 + 11171) are split
 * arithmetically into their jamo; every other code point is delegated to
 * DecompToBufferInternal.
 * @param ch The code point to decompose.
 * @param compat Passed through to DecompToBufferInternal.
 * @param buffer Destination array.
 * @param index Position in {@code buffer} where writing starts.
 * @return The position in {@code buffer} just past the written data.
 */
static int DecompToBuffer(
    int ch,
    boolean compat,
    int[] buffer,
    int index) {
  final int syllableBase = 0xac00;
  final int syllableCount = 11172;
  if (ch < syllableBase || ch >= syllableBase + syllableCount) {
    return DecompToBufferInternal(ch, compat, buffer, index);
  }
  int syllableIndex = ch - syllableBase;
  // 0x11a7 is one below the first trailing consonant; a remainder of 0
  // means the syllable has no trailing consonant.
  int trailing = 0x11a7 + (syllableIndex % 28);
  int next = index;
  buffer[next++] = 0x1100 + (syllableIndex / 588);
  buffer[next++] = 0x1161 + ((syllableIndex % 588) / 28);
  if (trailing != 0x11a7) {
    buffer[next++] = trailing;
  }
  return next;
}
/**
 * Determines whether a code point is treated as "stable" for the given
 * normalization form: UnicodeDatabase reports it as stable and it is not
 * one of the two Hebrew code points excluded below.
 * @param cp The code point to test.
 * @param form The normalization form.
 * @return True if the code point is treated as stable; otherwise, false.
 */
static boolean IsStableCodePoint(int cp, Normalization form) {
  boolean stableInDatabase = UnicodeDatabase.IsStableCodePoint(cp, form);
  // HIRIQ (0x5b4) and YOD (0x5d9) are never treated as stable because of
  // Corrigendum 2.
  boolean excluded = cp == 0x5b4 || cp == 0x5d9;
  return stableInDatabase && !excluded;
}
/**
 * Sorts a region of a buffer in place by combining class using repeated
 * adjacent swaps. A code point with combining class 0 is never moved
 * ahead of the code point before it.
 * @param buffer The array holding the code points.
 * @param index Position of the first code point of the region.
 * @param length Number of code points in the region.
 */
static void ReorderBuffer(int[] buffer, int index, int length) {
  if (length < 2) {
    return;
  }
  boolean swapped;
  do {
    swapped = false;
    int prevClass = UnicodeDatabase.GetCombiningClass(buffer[index]);
    for (int step = 1; step < length; ++step) {
      int pos = index + step;
      int curClass = UnicodeDatabase.GetCombiningClass(buffer[pos]);
      if (curClass == 0 || prevClass <= curClass) {
        // Already in order: the current code point becomes the new lead.
        prevClass = curClass;
        continue;
      }
      // Out of order: exchange the pair. The former lead now sits at
      // pos, so prevClass stays as it is.
      int moved = buffer[pos - 1];
      buffer[pos - 1] = buffer[pos];
      buffer[pos] = moved;
      swapped = true;
    }
  } while (swapped);
}
/**
 * Composes the first {@code length} code points of an array in place:
 * Hangul jamo are combined arithmetically and other pairs are combined
 * through UnicodeDatabase.GetComposedPair. Code points absorbed into a
 * composition are removed and the remainder is compacted to the front.
 * @param array The code points to compose.
 * @param length Number of code points to consider, starting at index 0.
 * @return The number of code points remaining after composition.
 */
static int ComposeBuffer(int[] array, int length) {
  if (length < 2) {
    return length;
  }
  // Marker written over code points that were absorbed; it is one past
  // the highest value GetNextChar accepts.
  final int removed = 0x110000;
  int starterIndex = 0;
  int starter = array[0];
  int remaining = length;
  // 256 is above every combining class compared below, so nothing can
  // combine with a leading non-starter.
  int prevClass = (UnicodeDatabase.GetCombiningClass(starter) != 0) ? 256 : 0;
  boolean anyComposed = false;
  for (int pos = 0; pos < length; ++pos) {
    int ch = array[pos];
    int chClass = UnicodeDatabase.GetCombiningClass(ch);
    // A code point may combine with the starter only if it is adjacent
    // to it (prevClass == 0) or the previous code point has a lower
    // combining class.
    boolean unblocked = prevClass == 0 || prevClass < chClass;
    int merged = -1;
    if (pos > 0) {
      int leadIndex = starter - 0x1100;
      int vowelIndex = ch - 0x1161;
      if (0 <= leadIndex && leadIndex < 19 &&
          0 <= vowelIndex && vowelIndex < 21 && unblocked) {
        // Hangul L jamo followed by V jamo
        merged = 0xac00 + (((leadIndex * 21) + vowelIndex) * 28);
      }
      if (merged < 0) {
        int syllableIndex = starter - 0xac00;
        int trailIndex = ch - 0x11a7;
        if (0 <= syllableIndex && syllableIndex < 11172 &&
            (syllableIndex % 28) == 0 &&
            0 < trailIndex && trailIndex < 28 && unblocked) {
          // Hangul LV syllable followed by T jamo
          merged = starter + trailIndex;
        }
      }
    }
    if (merged < 0) {
      int pair = UnicodeDatabase.GetComposedPair(starter, ch);
      if (pair >= 0 && unblocked) {
        merged = pair;
      }
    }
    if (merged >= 0) {
      starter = merged;
      array[starterIndex] = merged;
      array[pos] = removed;
      anyComposed = true;
      --remaining;
      // prevClass is deliberately left unchanged here
      continue;
    }
    if (chClass == 0) {
      starterIndex = pos;
      starter = ch;
    }
    prevClass = chClass;
  }
  if (anyComposed) {
    // Squeeze out the removed slots.
    int writePos = 0;
    for (int readPos = 0; readPos < length; ++readPos) {
      if (array[readPos] != removed) {
        array[writePos++] = array[readPos];
      }
    }
  }
  return remaining;
}
/**
 * Gets a list of normalized code points after reading from a string.
 * @param str A string object.
 * @param form Specifies the normalization form to use when normalizing the
 * text.
 * @return A list of the normalized Unicode code points.
 * @throws NullPointerException The parameter {@code str} is null.
 */
public static List<Integer> GetChars(String str, Normalization form) {
  if (str == null) {
    throw new NullPointerException("str");
  }
  // Wrap the string as a character input and use the stream overload.
  return GetChars(new StringCharacterInput(str), form);
}
/**
 * Gets a list of normalized code points after reading from a character
 * stream.
 * @param str An object that implements a stream of Unicode characters.
 * @param form Specifies the normalization form to use when normalizing the
 * text.
 * @return A list of the normalized Unicode code points.
 * @throws NullPointerException The parameter {@code str} is null.
 */
public static List<Integer> GetChars(ICharacterInput str, Normalization form) {
  if (str == null) {
    throw new NullPointerException("str");
  }
  NormalizingCharacterInput norm = new NormalizingCharacterInput(str, form);
  int[] buffer = new int[64];
  List<Integer> ret = new ArrayList<Integer>(24);
  int count;
  // Read returns a non-positive value once the input is exhausted.
  while ((count = norm.Read(buffer, 0, buffer.length)) > 0) {
    for (int i = 0; i < count; ++i) {
      ret.add(buffer[i]);
    }
  }
  return ret;
}
// Index in buffer of the last stable code point found by LoadMoreData;
// -1 until one has been found. Data from this index on is unprocessed.
private int lastStableIndex;
// Index just past the last decomposed code point stored in buffer.
private int endIndex;
// Working buffer of decomposed code points; allocated by LoadMoreData.
private int[] buffer;
// True for the NFKC and NFKD forms.
private boolean compatMode;
// The normalization form requested at construction.
private Normalization form;
// Number of code points at the start of buffer that are ready for output.
private int processedIndex;
// Number of ready code points already copied out by Read.
private int flushIndex;
// Input stream, or null when reading from characterList instead.
private ICharacterInput iterator;
// Input list of code points (used when iterator is null).
private List<Integer> characterList;
// Position of the next element to read from characterList.
private int characterListPos;
/**
 * Initializes a new instance of the NormalizingCharacterInput class using
 * Normalization Form C.
 * @param characterList A list of Unicode code points specifying the text to
 * normalize.
 * @throws NullPointerException The parameter {@code characterList} is null.
 */
public NormalizingCharacterInput(List<Integer> characterList) {
  this(characterList, Normalization.NFC);
}
/**
 * Creates a normalizing input over a string, using Normalization Form C.
 * @param str A string specifying the text to normalize.
 */
public NormalizingCharacterInput(String str) {
  this(str, Normalization.NFC);
}
/**
 * Creates a normalizing input over another character input, using
 * Normalization Form C.
 * @param input An ICharacterInput object supplying the text to normalize.
 * @throws NullPointerException The parameter {@code input} is null.
 */
public NormalizingCharacterInput(ICharacterInput input) {
  this(input, Normalization.NFC);
}
/**
 * Initializes a new instance of the NormalizingCharacterInput class using the
 * given normalization form.
 * @param characterList A list of Unicode code points specifying the text to
 * normalize.
 * @param form Specifies the normalization form to use when normalizing the
 * text.
 * @throws NullPointerException The parameter {@code characterList} is null.
 */
public NormalizingCharacterInput(
    List<Integer> characterList,
    Normalization form) {
  if (characterList == null) {
    throw new NullPointerException("characterList");
  }
  // -1 means no stable code point has been located in the buffer yet.
  this.lastStableIndex = -1;
  this.characterList = characterList;
  this.form = form;
  this.compatMode = form == Normalization.NFKC ||
    form == Normalization.NFKD;
}
/**
 * Creates a normalizing input over a portion of a string. The three
 * string arguments are handed unchanged to
 * {@code StringCharacterInput(str, index, length)}.
 * @param str A string object.
 * @param index Presumably the zero-based position in {@code str} where the
 * portion begins (as interpreted by StringCharacterInput; confirm there).
 * @param length Presumably the length of the portion in {@code str} (as
 * interpreted by StringCharacterInput; confirm there).
 * @param form Specifies the normalization form to use when normalizing the
 * text.
 */
public NormalizingCharacterInput(
    String str,
    int index,
    int length,
    Normalization form) {
  this(new StringCharacterInput(str, index, length), form);
}
/**
 * Creates a normalizing input over a whole string, wrapping it in a
 * StringCharacterInput.
 * @param str A string object.
 * @param form Specifies the normalization form to use when normalizing the
 * text.
 */
public NormalizingCharacterInput(
    String str,
    Normalization form) {
  this(new StringCharacterInput(str), form);
}
/**
 * Creates a normalizing input that reads its text from another character
 * input.
 * @param stream An ICharacterInput object supplying the text to normalize.
 * @param form Specifies the normalization form to use when normalizing the
 * text.
 * @throws NullPointerException The parameter {@code stream} is null.
 */
public NormalizingCharacterInput(
    ICharacterInput stream,
    Normalization form) {
  if (stream == null) {
    throw new NullPointerException("stream");
  }
  boolean compatibilityForm = Normalization.NFKC == form ||
    Normalization.NFKD == form;
  this.iterator = stream;
  this.form = form;
  this.compatMode = compatibilityForm;
  // No stable code point has been located in the buffer yet.
  this.lastStableIndex = -1;
}
/**
 * Determines whether the text provided by a character input is normalized.
 * The input is read to its end (or until a surrogate code point is seen)
 * and collected into a list, which is then checked with the list overload.
 * @param chars An object that implements a streamable character input.
 * @param form Specifies the normalization form to use when normalizing the
 * text.
 * @return True if the text is normalized; otherwise, false. Returns false
 * if the input yields a surrogate code point (0xD800 to 0xDFFF).
 * @throws NullPointerException The parameter {@code chars} is null.
 */
public static boolean IsNormalized(ICharacterInput chars, Normalization form) {
  if (chars == null) {
    throw new NullPointerException("chars");
  }
  List<Integer> list = new ArrayList<Integer>();
  int ch;
  while ((ch = chars.ReadChar()) >= 0) {
    if ((ch & 0x1ff800) == 0xd800) {
      // Surrogate code points are never valid in normalized text.
      return false;
    }
    list.add(ch);
  }
  return IsNormalized(list, form);
}
/**
 * Normalizes a portion of a list of code points and reports whether the
 * result is identical to that portion.
 * @param charList A list of Unicode code points.
 * @param start Index of the first element of the portion.
 * @param length Number of elements in the portion.
 * @param form The normalization form to check against.
 * @return True if normalizing the portion yields exactly the same code
 * points, in the same number; otherwise, false.
 */
private static boolean NormalizeAndCheck(
    List<Integer> charList,
    int start,
    int length,
    Normalization form) {
  List<Integer> normalized = NormalizingCharacterInput.GetChars(
    new PartialListCharacterInput(charList, start, length),
    form);
  int i = 0;
  for (int ch : normalized) {
    if (i >= length) {
      // Normalized text is longer than the portion.
      return false;
    }
    if (ch != charList.get(start + i)) {
      return false;
    }
    ++i;
  }
  // Normalized text shorter than the portion also means "not normalized";
  // this matches the length check done for the string variant.
  return i == length;
}
/**
 * Returns the given string converted to a Unicode normalization form.
 * Strings of at most 1024 characters that are already in the requested
 * form are returned as is, without conversion.
 * @param str An arbitrary string.
 * @param form The Unicode normalization form to convert to.
 * @return The parameter {@code str} converted to the given normalization
 * form.
 * @throws NullPointerException The parameter {@code str} is null.
 */
public static String Normalize(String str, Normalization form) {
  if (str == null) {
    throw new NullPointerException("str");
  }
  boolean worthPrechecking = str.length() <= 1024;
  if (worthPrechecking && IsNormalized(str, form)) {
    return str;
  }
  NormalizingCharacterInput input = new NormalizingCharacterInput(str, form);
  return Encodings.InputToString(input);
}
/**
 * Checks whether a string is already in the given Unicode normalization
 * form. The string is scanned for runs of code points that are not
 * stable; each run is normalized separately and compared with the
 * original text.
 * @param str An arbitrary string.
 * @param form Specifies the normalization form to use when normalizing the
 * text.
 * @return True if the given string is in the given Unicode normalization
 * form; otherwise, false. Returns false if the string contains an unpaired
 * surrogate.
 * @throws NullPointerException The parameter {@code str} is null.
 */
public static boolean IsNormalized(String str, Normalization form) {
  if (str == null) {
    throw new NullPointerException("str");
  }
  // Code points that fit in this mask qualify for the quick check below:
  // up to 0xff for NFC, up to 0x7f for every other form.
  final int quickMask = (form == Normalization.NFC) ? 0xff : 0x7f;
  int runStart = -1;
  int i = 0;
  while (i < str.length()) {
    int c = str.charAt(i);
    boolean hasNext = i + 1 < str.length();
    if ((c & 0xfc00) == 0xd800 && hasNext &&
        str.charAt(i + 1) >= 0xdc00 && str.charAt(i + 1) <= 0xdfff) {
      // Combine the surrogate pair into one code point
      c = 0x10000 + ((c - 0xd800) << 10) + (str.charAt(i + 1) - 0xdc00);
    } else if ((c & 0xf800) == 0xd800) {
      // Unpaired surrogate
      return false;
    }
    boolean stable;
    if ((c & quickMask) == c &&
        (!hasNext || (str.charAt(i + 1) & quickMask) == str.charAt(i + 1))) {
      // Quick check: a character within the mask that is followed by
      // another such character, or by the end of the string, is
      // treated as stable.
      stable = true;
    } else {
      stable = NormalizingCharacterInput.IsStableCodePoint(c, form);
    }
    if (!stable) {
      if (runStart < 0) {
        // First non-stable code point of a new run
        runStart = i;
      }
    } else if (runStart >= 0) {
      // A stable code point ends the run; verify the run.
      if (!NormalizeAndCheckString(str, runStart, i - runStart, form)) {
        return false;
      }
      runStart = -1;
    }
    // Supplementary code points occupy two chars.
    i += (c >= 0x10000) ? 2 : 1;
  }
  // Verify a run that extends to the end of the string, if any.
  return runStart < 0 ||
    NormalizeAndCheckString(str, runStart, str.length() - runStart, form);
}
/**
 * Normalizes a portion of a string and reports whether the result is
 * identical to that portion, code point by code point. Unpaired
 * surrogates in the source are compared as 0xFFFD.
 * @param charString The string containing the portion.
 * @param start Index of the first char of the portion.
 * @param length Number of chars in the portion.
 * @param form The normalization form to check against.
 * @return True if normalizing the portion yields exactly the same text;
 * otherwise, false.
 */
private static boolean NormalizeAndCheckString(
    String charString,
    int start,
    int length,
    Normalization form) {
  int end = start + length;
  int i = start;
  NormalizingCharacterInput norm = new NormalizingCharacterInput(
    charString,
    start,
    length,
    form);
  int ch;
  while ((ch = norm.ReadChar()) >= 0) {
    if (i >= end) {
      // The normalized text is longer than the portion; stop before
      // reading past the portion (or past the end of the string).
      return false;
    }
    int c = charString.charAt(i);
    if ((c & 0x1ffc00) == 0xd800 && i + 1 < charString.length() &&
        charString.charAt(i + 1) >= 0xdc00 &&
        charString.charAt(i + 1) <= 0xdfff) {
      // Get the Unicode code point for the surrogate pair
      c = 0x10000 + ((c - 0xd800) << 10) + (charString.charAt(i + 1) - 0xdc00);
      ++i;
    } else if ((c & 0x1ff800) == 0xd800) {
      // unpaired surrogate
      c = 0xfffd;
    }
    ++i;
    if (c != ch) {
      return false;
    }
  }
  // The whole portion must have been consumed, no more and no less.
  return i == end;
}
/**
 * Determines whether the given list of characters is in the given Unicode
 * normalization form. The list is scanned for runs of code points that
 * are not stable; each run is normalized separately and compared with
 * the original elements.
 * @param charList A list of Unicode code points.
 * @param form Specifies the normalization form to use when normalizing the
 * text.
 * @return True if the given list of characters is in the given Unicode
 * normalization form; otherwise, false. Returns false if an element is
 * negative, greater than 0x10FFFF, or a surrogate code point.
 * @throws NullPointerException The parameter {@code charList} is null.
 */
public static boolean IsNormalized(List<Integer> charList, Normalization form) {
  if (charList == null) {
    throw new NullPointerException("charList");
  }
  int nonStableStart = -1;
  // Code points that fit in this mask qualify for the quick check below:
  // up to 0xff for NFC, up to 0x7f for every other form.
  int mask = (form == Normalization.NFC) ? 0xff : 0x7f;
  for (int i = 0; i < charList.size(); ++i) {
    int c = charList.get(i);
    if (c < 0 || c > 0x10ffff || ((c & 0x1ff800) == 0xd800)) {
      return false;
    }
    boolean quick = false;
    if ((c & mask) == c) {
      if (i + 1 == charList.size()) {
        quick = true;
      } else {
        int next = charList.get(i + 1);
        quick = (next & mask) == next;
      }
    }
    // Quick check: a code point within the mask that is followed by
    // another such code point, or by the end of the list, is treated
    // as stable.
    boolean isStable = quick || IsStableCodePoint(c, form);
    if (nonStableStart < 0 && !isStable) {
      // First non-stable code point in a row
      nonStableStart = i;
    } else if (nonStableStart >= 0 && isStable) {
      // We have at least one non-stable code point,
      // normalize these code points.
      if (!NormalizeAndCheck(
          charList,
          nonStableStart,
          i - nonStableStart,
          form)) {
        return false;
      }
      nonStableStart = -1;
    }
  }
  if (nonStableStart >= 0) {
    // Verify a run that extends to the end of the list.
    if (!NormalizeAndCheck(
        charList,
        nonStableStart,
        charList.size() - nonStableStart,
        form)) {
      return false;
    }
  }
  return true;
}
// One-element scratch array used by ReadChar.
private int[] readbuffer = new int[1];
/**
 * Reads the next normalized Unicode code point by requesting a single
 * code point from {@code Read}.
 * @return Either a Unicode code point (from 0-0xd7ff or from 0xe000 to
 * 0x10ffff), or the value -1 indicating the end of the source.
 */
public int ReadChar() {
  int count = this.Read(this.readbuffer, 0, 1);
  if (count != 1) {
    return -1;
  }
  return this.readbuffer[0];
}
// Set once the underlying source has reported its end.
private boolean endOfString;
// The value most recently obtained from the source by GetNextChar.
private int lastChar = -1;
// When true, the next GetNextChar call returns lastChar again.
private boolean ungetting;

/**
 * Pushes back the code point most recently returned by GetNextChar, so
 * that the next call to GetNextChar returns it again.
 */
private void Unget() {
  this.ungetting = true;
}
/**
 * Gets the next code point from the underlying source (the character
 * input if there is one, otherwise the code point list), honoring a
 * pending Unget.
 * @return The next code point, or a negative value at the end of the
 * source.
 * @throws IllegalArgumentException The source produced a value greater
 * than 0x10FFFF or a surrogate code point.
 */
private int GetNextChar() {
  if (this.ungetting) {
    // Hand back the pushed-back value exactly once.
    this.ungetting = false;
    return this.lastChar;
  }
  int ch;
  if (this.iterator != null) {
    ch = this.iterator.ReadChar();
  } else if (this.characterListPos < this.characterList.size()) {
    // Explicit conversion so the element can be stored in an int
    // regardless of how the list field is declared.
    ch = ((Integer) this.characterList.get(this.characterListPos++)).intValue();
  } else {
    ch = -1;
  }
  if (ch < 0) {
    this.endOfString = true;
  } else if (ch > 0x10ffff || ((ch & 0x1ff800) == 0xd800)) {
    throw new IllegalArgumentException("Invalid character: " + ch);
  }
  this.lastChar = ch;
  return ch;
}
/**
 * Reads a sequence of normalized Unicode code points from the data source.
 * @param chars Output buffer.
 * @param index A zero-based index showing where the desired portion of {@code
 * chars} begins.
 * @param length The number of elements in the desired portion of {@code chars}
 * (but not more than {@code chars} 's length).
 * @return The number of Unicode code points read. Returns 0 if {@code
 * length} is 0, and -1 if the end of the source is reached before any code
 * point could be read.
 * @throws NullPointerException The parameter {@code chars} is null.
 * @throws IllegalArgumentException Either {@code index} or {@code length} is less
 * than 0 or greater than {@code chars} 's length, or {@code chars} 's
 * length minus {@code index} is less than {@code length}.
 */
public int Read(int[] chars, int index, int length) {
  if (chars == null) {
    throw new NullPointerException("chars");
  }
  if (index < 0) {
    throw new IllegalArgumentException("index (" + index + ") is less than " +
      "0");
  }
  if (index > chars.length) {
    throw new IllegalArgumentException("index (" + index + ") is more than " +
      chars.length);
  }
  if (length < 0) {
    throw new IllegalArgumentException("length (" + length + ") is less than " +
      "0");
  }
  if (length > chars.length) {
    throw new IllegalArgumentException("length (" + length + ") is more than " +
      chars.length);
  }
  if (chars.length - index < length) {
    throw new IllegalArgumentException("chars's length minus " + index + " (" +
      (chars.length - index) + ") is less than " + length);
  }
  if (length == 0) {
    return 0;
  }
  int total = 0;
  int count = 0;
  if (this.processedIndex == this.flushIndex && this.flushIndex == 0) {
    // Nothing has been buffered yet: copy stable code points straight
    // from the source to the output.
    while (total < length) {
      int c = this.GetNextChar();
      if (c < 0) {
        return (total == 0) ? -1 : total;
      }
      if (IsStableCodePoint(c, this.form)) {
        chars[index] = c;
        ++total;
        ++index;
      } else {
        this.Unget();
        break;
      }
    }
    if (total == length) {
      return total;
    }
  }
  do {
    count = Math.min(this.processedIndex - this.flushIndex, length - total);
    if (count < 0) {
      count = 0;
    }
    if (count != 0) {
      // Fill buffer with processed code points
      System.arraycopy(this.buffer, this.flushIndex, chars, index, count);
    }
    index += count;
    total += count;
    this.flushIndex += count;
    // Code points from lastStableIndex up to endIndex are still waiting
    // in the internal buffer and precede anything not yet read from the
    // source; they must be output first.
    boolean pendingInBuffer =
      this.endIndex > Math.max(this.lastStableIndex, 0);
    // Try to fill buffer with stable code points,
    // as an optimization (only when nothing is pending, so that
    // output order is preserved)
    while (!pendingInBuffer && total < length) {
      int c = this.GetNextChar();
      if (c < 0) {
        this.endOfString = true;
        break;
      }
      if (IsStableCodePoint(c, this.form)) {
        chars[index++] = c;
        ++total;
      } else {
        this.Unget();
        break;
      }
    }
    // Ensure that more data is available
    if (total < length && this.flushIndex == this.processedIndex) {
      if (this.lastStableIndex > 0) {
        // Move unprocessed data to the beginning of
        // the buffer
        System.arraycopy(
          this.buffer,
          this.lastStableIndex,
          this.buffer,
          0,
          this.buffer.length - this.lastStableIndex);
        this.endIndex -= this.lastStableIndex;
        this.lastStableIndex = 0;
      } else {
        this.endIndex = 0;
      }
      if (!this.LoadMoreData()) {
        break;
      }
    }
  } while (total < length);
  // Fill buffer with processed code points
  count = Math.max(
    0,
    Math.min(this.processedIndex - this.flushIndex, length - total));
  System.arraycopy(this.buffer, this.flushIndex, chars, index, count);
  index += count;
  total += count;
  this.flushIndex += count;
  return (total == 0) ? -1 : total;
}
/**
 * Refills the internal buffer with decomposed code points from the
 * source, then reorders (and, for NFC/NFKC, composes) the part of the
 * buffer before the last stable code point. On return, flushIndex is 0
 * and processedIndex is the number of code points ready for output.
 * @return False if the buffer holds no data after the attempt; otherwise,
 * true.
 */
private boolean LoadMoreData() {
boolean done = false;
while (!done) {
// Allocate the buffer lazily on first use.
this.buffer = (this.buffer == null) ? ((new int[32])) : this.buffer;
// Fill buffer with decompositions until the buffer is full
// or the end of the String is reached.
// (The loop keeps at least 18 free slots before each decomposition;
// presumably 18 is the longest possible decomposition -- confirm
// against UnicodeDatabase.)
while (this.endIndex + 18 <= this.buffer.length) {
int c = this.GetNextChar();
if (c < 0) {
this.endOfString = true;
break;
}
this.endIndex = DecompToBuffer(
c,
this.compatMode,
this.buffer,
this.endIndex);
}
// Check for the last stable code point if the
// end of the String is not reached yet
if (!this.endOfString) {
boolean haveNewStable = false;
// NOTE: lastStableIndex begins at -1
// Search backward from the end of the data for a stable code point
// beyond the previously known one.
for (int i = this.endIndex - 1; i > this.lastStableIndex; --i) {
// System.out.println("stable({0:X4})=" +
// (IsStableCodePoint(this.buffer[i], this.form)));
if (IsStableCodePoint(this.buffer[i], this.form)) {
this.lastStableIndex = i;
haveNewStable = true;
break;
}
}
if (!haveNewStable || this.lastStableIndex <= 0) {
// No stable code point was found (or last stable
// code point is at beginning of buffer), increase
// the buffer size
int[] newBuffer = new int[(this.buffer.length + 4) * 2];
System.arraycopy(this.buffer, 0, newBuffer, 0, this.buffer.length);
this.buffer = newBuffer;
// Go around again to read more data into the larger buffer.
continue;
}
} else {
// End of String
// Everything in the buffer can be processed.
this.lastStableIndex = this.endIndex;
}
done = true;
}
// No data in buffer
if (this.endIndex == 0) {
return false;
}
this.flushIndex = 0;
// Canonical reordering
// Only the data before lastStableIndex is processed; the rest stays
// in the buffer for a later call.
ReorderBuffer(this.buffer, 0, this.lastStableIndex);
if (this.form == Normalization.NFC || this.form == Normalization.NFKC) {
// Composition
this.processedIndex = ComposeBuffer(
this.buffer,
this.lastStableIndex);
} else {
this.processedIndex = this.lastStableIndex;
}
return true;
}
}