org.apache.abdera.i18n.text.Normalizer Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. The ASF licenses this file to You
* under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. For additional information regarding
* copyright in this work, please see the NOTICE file in the top level
* directory of this distribution.
*/
package org.apache.abdera.i18n.text;
import java.io.IOException;
import org.apache.abdera.i18n.text.data.UnicodeCharacterDatabase;
/**
 * Performs Unicode Normalization (Forms D, C, KD and KC).
 * <p>
 * Normalization runs in two passes over a working {@link StringBuilder}:
 * every code point of the input is first decomposed and its combining marks
 * are put into canonical-class order; for the composing forms (C and KC) the
 * result is then recomposed in place.
 */
public final class Normalizer {

    /**
     * Feature bits that a {@link Form} is assembled from. Each constant
     * carries an explicit bit value so that the flags do not depend on the
     * declaration order of the constants.
     */
    private enum Mask {
        NONE(0), COMPATIBILITY(1), COMPOSITION(2);

        /** Bit contributed to {@link Form}'s mask. */
        private final int bit;

        Mask(int bit) {
            this.bit = bit;
        }
    }

    /**
     * The supported normalization forms.
     */
    public enum Form {
        D, C(Mask.COMPOSITION), KD(Mask.COMPATIBILITY), KC(Mask.COMPATIBILITY, Mask.COMPOSITION);

        /** Bitwise OR of the {@link Mask} bits selected for this form. */
        private final int mask;

        Form(Mask... masks) {
            int bits = 0;
            for (Mask m : masks) {
                bits |= m.bit;
            }
            this.mask = bits;
        }

        /**
         * @return true for the compatibility forms (KD and KC)
         */
        public boolean isCompatibility() {
            return (mask & Mask.COMPATIBILITY.bit) != 0;
        }

        /**
         * @return true for the canonical forms (D and C), i.e. whenever
         *         {@link #isCompatibility()} is false
         */
        public boolean isCanonical() {
            return !isCompatibility();
        }

        /**
         * @return true for the composing forms (C and KC)
         */
        public boolean isComposition() {
            return (mask & Mask.COMPOSITION.bit) != 0;
        }
    }

    /** Static utility class; not instantiable. */
    private Normalizer() {
    }

    /**
     * Normalize the string using NFKC.
     *
     * @param source the text to normalize
     * @return the normalized text
     */
    public static String normalize(CharSequence source) {
        return normalize(source, Form.KC);
    }

    /**
     * Normalize the string using the specified Form.
     *
     * @param source the text to normalize
     * @param form the normalization form to apply
     * @return the normalized text
     */
    public static String normalize(CharSequence source, Form form) {
        return normalize(source, form, new StringBuilder());
    }

    /**
     * Normalize the string into the given StringBuilder using the given Form.
     * <p>
     * The builder is used as the working buffer: decomposed code points are
     * inserted into it and, for composing forms, it is recomposed from its
     * first character onwards. Any content already present in {@code buf}
     * therefore takes part in reordering and composition and is included in
     * the returned string. When {@code source} is empty the builder is left
     * untouched and its current content is returned.
     *
     * @param source the text to normalize
     * @param form the normalization form to apply
     * @param buf the working buffer that receives the normalized text
     * @return the content of {@code buf} after normalization
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *         decomposing or composing
     */
    public static String normalize(CharSequence source, Form form, StringBuilder buf) {
        if (source.length() != 0) {
            try {
                decompose(source, form, buf);
                compose(form, buf);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
        return buf.toString();
    }

    /**
     * Decomposes every code point of {@code source} and inserts the resulting
     * code points into {@code buf}, each at the position chosen by
     * {@link #findInsertionPoint(StringBuilder, int)}.
     */
    private static void decompose(CharSequence source, Form form, StringBuilder buf) throws IOException {
        // Scratch buffer reused for the decomposition of each source code point.
        StringBuilder internal = new StringBuilder();
        CodepointIterator ci = CodepointIterator.forCharSequence(source);
        boolean canonical = form.isCanonical();
        while (ci.hasNext()) {
            Codepoint c = ci.next();
            internal.setLength(0);
            UnicodeCharacterDatabase.decompose(c.getValue(), canonical, internal);
            CodepointIterator ii = CodepointIterator.forCharSequence(internal);
            while (ii.hasNext()) {
                Codepoint ch = ii.next();
                int i = findInsertionPoint(buf, ch.getValue());
                buf.insert(i, CharUtils.toString(ch.getValue()));
            }
        }
    }

    /**
     * Finds the index in {@code buf} at which code point {@code c} has to be
     * inserted. A code point with canonical class 0 always goes at the end.
     * Any other code point is moved backwards past every trailing code point
     * whose canonical class is strictly greater than its own.
     *
     * @param buf the buffer built so far
     * @param c the code point about to be inserted
     * @return the char index at which to insert {@code c}
     */
    private static int findInsertionPoint(StringBuilder buf, int c) {
        int cc = UnicodeCharacterDatabase.getCanonicalClass(c);
        int i = buf.length();
        if (cc != 0) {
            while (i > 0) {
                // NOTE(review): relies on CharUtils.codepointAt returning the
                // whole code point when i - 1 is the trailing half of a
                // surrogate pair - confirm against CharUtils.
                int ch = CharUtils.codepointAt(buf, i - 1).getValue();
                if (UnicodeCharacterDatabase.getCanonicalClass(ch) <= cc) {
                    break;
                }
                // Step back by the width of the code point just examined, not
                // by the width of the one being inserted; otherwise a
                // supplementary mark skips over BMP marks unchecked and can
                // push the index below zero.
                i -= CharUtils.length(ch);
            }
        }
        return i;
    }

    /**
     * Recomposes {@code buf} in place when {@code form} is a composing form;
     * does nothing otherwise. The buffer is read at {@code dpos} and rewritten
     * at {@code cpos}; composites replace the code point at {@code pos}.
     * The buffer is finally truncated to the rewritten length.
     */
    private static void compose(Form form, StringBuilder buf) throws IOException {
        if (!form.isComposition())
            return;
        // pos/lc: position and value of the code point that later marks may combine with.
        int pos = 0;
        int lc = CharUtils.codepointAt(buf, pos).getValue();
        // cpos: next write position in the compacted output.
        int cpos = CharUtils.length(lc);
        // lcc: canonical class of the last code point that was kept uncombined.
        int lcc = UnicodeCharacterDatabase.getCanonicalClass(lc);
        if (lcc != 0)
            // The text starts with a non-starter: use a value that satisfies
            // neither "lcc < cc" nor "lcc == 0" for the next code point
            // (assumes canonical classes are below 256 - TODO confirm).
            lcc = 256;
        // len: last known buffer length, used to detect length changes caused by setChar.
        int len = buf.length();
        int c;
        for (int dpos = cpos; dpos < buf.length(); dpos += CharUtils.length(c)) {
            c = CharUtils.codepointAt(buf, dpos).getValue();
            int cc = UnicodeCharacterDatabase.getCanonicalClass(c);
            int composite = UnicodeCharacterDatabase.getPairComposition(lc, c);
            // '\uFFFF' is treated as "no composite for this pair".
            if (composite != '\uFFFF' && (lcc < cc || lcc == 0)) {
                // Combine: overwrite the code point at pos and consume c.
                CharUtils.setChar(buf, pos, composite);
                lc = composite;
            } else {
                if (cc == 0) {
                    // c becomes the new code point to combine with.
                    pos = cpos;
                    lc = c;
                }
                lcc = cc;
                // Keep c: copy it to the write position.
                CharUtils.setChar(buf, cpos, c);
                if (buf.length() != len) {
                    // The write changed the buffer length; shift the read
                    // position by the same amount.
                    dpos += buf.length() - len;
                    len = buf.length();
                }
                cpos += CharUtils.length(c);
            }
        }
        // Drop whatever remains beyond the compacted output.
        buf.setLength(cpos);
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy