// net.sf.saxon.serialize.codenorm.Normalizer
// From the saxon-he artifact (an OSGi bundle for Saxon-HE).
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2013 Saxonica Limited.
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
package net.sf.saxon.serialize.codenorm;
import net.sf.saxon.Configuration;
import net.sf.saxon.serialize.charcode.UTF16CharacterSet;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.tree.util.FastStringBuffer;
/**
* Implements Unicode Normalization Forms C, D, KC, KD.
* Copyright (c) 1991-2005 Unicode, Inc.
* For terms of use, see http://www.unicode.org/terms_of_use.html
* For documentation, see UAX#15.
* The Unicode Consortium makes no expressed or implied warranty of any
* kind, and assumes no liability for errors or omissions.
* No liability is assumed for incidental and consequential damages
* in connection with or arising out of the use of the information here.
* @author Mark Davis
* Updates for supplementary code points: Vladimir Weinstein & Markus Scherer
* Modified to remove dependency on ICU code: Michael Kay
*/
public class Normalizer {

    /**
     * Masks for the form selector. A form value is a combination of these bits:
     * COMPATIBILITY_MASK selects compatibility (rather than canonical) decomposition,
     * COMPOSITION_MASK requests a composition pass after decomposition.
     */
    static final int
            COMPATIBILITY_MASK = 1,
            COMPOSITION_MASK = 2;

    /**
     * Normalization Form Selector. {@link #NO_ACTION} makes {@link #normalize(CharSequence)}
     * return its input unchanged.
     */
    public static final int
            D = 0,
            C = COMPOSITION_MASK,
            KD = COMPATIBILITY_MASK,
            KC = COMPATIBILITY_MASK + COMPOSITION_MASK,
            NO_ACTION = 8;

    /**
     * Contains normalization data from the Unicode Character Database.
     * Shared by all instances; built by the first constructor call that finds it null.
     */
    /*@Nullable*/ private static NormalizerData data = null;

    /**
     * The requested normalization form.
     */
    private final int form;

    /**
     * Create a normalizer for a given form.
     *
     * @param form   the normalization form required: for example {@link Normalizer#C}, {@link Normalizer#D}
     * @param config the Saxon configuration, passed to the builder of the normalization data
     * @throws XPathException if building the normalization data fails
     */
    public Normalizer(int form, Configuration config) throws XPathException {
        this.form = form;
        // NOTE(review): this lazy initialization of the shared static is not synchronized;
        // confirm whether Normalizer instances can be constructed concurrently.
        if (data == null) {
            data = UnicodeDataParserFromXML.build(config); // load 1st time
        }
    }

    /**
     * Normalizes text according to the chosen form.
     *
     * @param source the original text, unnormalized
     * @return the resulting normalized text; the source itself is returned when the form is
     *         {@link #NO_ACTION} or the source is empty
     */
    public CharSequence normalize(CharSequence source) {
        if (form == NO_ACTION || source.length() == 0) {
            return source;
        }
        // First decompose the source into target,
        // then compose if the form requires.
        FastStringBuffer target = new FastStringBuffer(source.length() + 8);
        internalDecompose(source, target);
        if ((form & COMPOSITION_MASK) != 0) {
            internalCompose(target);
        }
        return target;
    }

    /**
     * Decomposes text, either canonical or compatibility, adding the result to the target buffer
     * with combining marks reordered by canonical class. If the COMPATIBILITY_MASK bit is set in
     * the form of this normalizer, the recursive compatibility decomposition is used, otherwise
     * the recursive canonical decomposition.
     * <p>An unpaired surrogate in the input is processed as a single code unit rather than being
     * combined with whatever follows it.</p>
     *
     * @param source the original text, unnormalized
     * @param target the buffer that receives the decomposed text
     */
    private void internalDecompose(CharSequence source, FastStringBuffer target) {
        FastStringBuffer buffer = new FastStringBuffer(8);
        boolean canonical = (form & COMPATIBILITY_MASK) == 0;
        for (int i = 0; i < source.length();) {
            buffer.setLength(0);
            int ch32 = source.charAt(i++);
            if (ch32 < 128) {
                // fast path - for ASCII characters, decomposition is a no-op
                target.append((char) ch32);
                continue;
            }
            // Only combine a genuine pair; a trailing or unmatched high surrogate must not
            // cause a read past the end of the source or swallow the following character.
            if (UTF16CharacterSet.isHighSurrogate(ch32) && isLowSurrogateAt(source, i)) {
                ch32 = UTF16CharacterSet.combinePair((char) ch32, source.charAt(i++));
            }
            data.getRecursiveDecomposition(canonical, ch32, buffer);

            // add all of the characters in the decomposition.
            // (may be just the original character, if there was
            // no decomposition mapping)
            for (int j = 0; j < buffer.length();) {
                int ch = buffer.charAt(j++);
                if (UTF16CharacterSet.isHighSurrogate(ch) && isLowSurrogateAt(buffer, j)) {
                    ch = UTF16CharacterSet.combinePair((char) ch, buffer.charAt(j++));
                }
                int chClass = data.getCanonicalClass(ch);
                int k = target.length(); // insertion point
                if (chClass != 0) {
                    // bubble-sort combining marks as necessary: walk backwards over characters
                    // whose canonical class is greater than that of the new character
                    while (k > 0) {
                        int step = 1;
                        int ch2 = target.charAt(k - 1);
                        // Step over two code units only when they really form a pair
                        if (k >= 2
                                && Character.isLowSurrogate((char) ch2)
                                && UTF16CharacterSet.isHighSurrogate(target.charAt(k - 2))) {
                            step = 2;
                            char high = target.charAt(k - 2);
                            ch2 = UTF16CharacterSet.combinePair(high, (char) ch2);
                        }
                        if (data.getCanonicalClass(ch2) <= chClass) {
                            break;
                        }
                        k -= step;
                    }
                }
                if (ch < 65536) {
                    target.insert(k, (char) ch);
                } else {
                    target.insertWideChar(k, ch);
                }
            }
        }
    }

    /**
     * Composes text in place. Target must already
     * have been decomposed.
     *
     * @param target input: decomposed text.
     *               output: the resulting normalized text.
     */
    private void internalCompose(FastStringBuffer target) {
        if (target.length() == 0) {
            return; // nothing to compose
        }
        int starterPos = 0;
        int starterCh = target.charAt(0);
        int compPos = 1; // write position: end of the composed text so far
        if (UTF16CharacterSet.isHighSurrogate(starterCh) && isLowSurrogateAt(target, 1)) {
            starterCh = UTF16CharacterSet.combinePair((char) starterCh, target.charAt(1));
            compPos++;
        }
        int lastClass = data.getCanonicalClass(starterCh);
        if (lastClass != 0) {
            lastClass = 256; // fix for strings starting with a combining mark
        }

        int oldLen = target.length();

        // Loop on the decomposed characters, combining where possible
        for (int decompPos = compPos; decompPos < target.length();) {
            int ch = target.charAt(decompPos++);
            if (UTF16CharacterSet.isHighSurrogate(ch) && isLowSurrogateAt(target, decompPos)) {
                ch = UTF16CharacterSet.combinePair((char) ch, target.charAt(decompPos++));
            }
            int chClass = data.getCanonicalClass(ch);
            int composite = data.getPairwiseComposition(starterCh, ch);
            if (composite != NormalizerData.NOT_COMPOSITE && (lastClass < chClass || lastClass == 0)) {
                setCharAt(target, starterPos, composite);
                // we know that we will only be replacing non-supplementaries by non-supplementaries
                // so we don't have to adjust the decompPos
                starterCh = composite;
            } else {
                if (chClass == 0) {
                    // a new starter: later combining marks are composed against this character
                    starterPos = compPos;
                    starterCh = ch;
                }
                lastClass = chClass;
                setCharAt(target, compPos, ch);
                if (target.length() != oldLen) {
                    // setCharAt changed the buffer length, so the read position shifts with it
                    decompPos += target.length() - oldLen;
                    oldLen = target.length();
                }
                compPos += (ch < 65536 ? 1 : 2);
            }
        }
        target.setLength(compPos);
    }

    /**
     * Set the 32-bit character at a particular 16-bit offset in a string buffer,
     * replacing the previous character at that position, and taking account of the
     * fact that either, both, or neither of the characters might be a surrogate pair.
     * The existing character is treated as a pair only if a high surrogate at the offset is
     * immediately followed by a low surrogate.
     *
     * @param target the StringBuffer in which the data is to be inserted
     * @param offset the position at which the data is to be inserted
     * @param ch32   the character to be inserted, as a 32-bit Unicode codepoint
     */
    private static void setCharAt(FastStringBuffer target, int offset, int ch32) {
        // Evaluate before writing: the write below overwrites the code unit being tested
        boolean replacingPair = UTF16CharacterSet.isHighSurrogate(target.charAt(offset))
                && isLowSurrogateAt(target, offset + 1);
        if (ch32 < 65536) {
            target.setCharAt(offset, (char) ch32);
            if (replacingPair) {
                // two code units shrink to one
                target.removeCharAt(offset + 1);
            }
        } else {
            target.setCharAt(offset, UTF16CharacterSet.highSurrogate(ch32));
            if (replacingPair) {
                target.setCharAt(offset + 1, UTF16CharacterSet.lowSurrogate(ch32));
            } else {
                // one code unit grows to two
                target.insert(offset + 1, UTF16CharacterSet.lowSurrogate(ch32));
            }
        }
    }

    /**
     * Test whether there is a low surrogate at a given position of a character sequence.
     *
     * @param text  the character sequence to examine
     * @param index the 16-bit offset to test; may be equal to or beyond the length of the text
     * @return true if the index is within the text and the code unit there is a low surrogate
     */
    private static boolean isLowSurrogateAt(CharSequence text, int index) {
        return index < text.length() && Character.isLowSurrogate(text.charAt(index));
    }
}
// * The class is derived from the sample program Normalizer.java published by the
// * Unicode consortium.
// * Updates for supplementary code points: Vladimir Weinstein & Markus Scherer
// * Modified to remove dependency on ICU code: Michael Kay, Saxonica Limited