Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/**
* Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
*
* Author: Oliver Hitz
*
* This file is part of GNU Libidn.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package gnu.inet.encoding;
/**
* This class offers static methods for Unicode NFKC string normalization.
*/
public class NFKC
{
/**
* Applies NFKC normalization to a string.
*
* @param in The string to normalize.
* @return An NFKC normalized string.
*/
public static String normalizeNFKC(String in)
{
StringBuffer out = new StringBuffer();
for (int i = 0; i < in.length(); i++) {
char code = in.charAt(i);
// In Unicode 3.0, Hangul was defined as the block from U+AC00
// to U+D7A3, however, since Unicode 3.2 the block extends until
// U+D7AF. The decomposeHangul function only decomposes until
// U+D7A3. Should this be changed?
if (code >= 0xAC00 && code <= 0xD7AF) {
out.append(decomposeHangul(code));
} else {
int index = decomposeIndex(code);
if (index == -1) {
out.append(code);
} else {
out.append(DecompositionMappings.m[index]);
}
}
}
// Bring the stringbuffer into canonical order.
canonicalOrdering(out);
// Do the canonical composition.
int last_cc = 0;
int last_start = 0;
for (int i = 0; i < out.length(); i++) {
int cc = combiningClass(out.charAt(i));
if (i > 0 && (last_cc == 0 || last_cc != cc)) {
// Try to combine characters
char a = out.charAt(last_start);
char b = out.charAt(i);
int c = compose(a, b);
if (c != -1) {
out.setCharAt(last_start, (char) c);
out.deleteCharAt(i);
i--;
if (i == last_start) {
last_cc = 0;
} else {
last_cc = combiningClass(out.charAt(i-1));
}
continue;
}
}
if (cc == 0) {
last_start = i;
}
last_cc = cc;
}
return out.toString();
}
/**
* Returns the index inside the decomposition table, implemented
* using a binary search.
*
* @param c Character to look up.
* @return Index if found, -1 otherwise.
*/
static int decomposeIndex(char c)
{
int start = 0;
int end = DecompositionKeys.k.length/2;
while (true) {
int half = (start + end) / 2;
int code = DecompositionKeys.k[half*2];
if (c == code) {
return DecompositionKeys.k[half*2 + 1];
}
if (half == start) {
// Character not found
return -1;
} else if (c > code) {
start = half;
} else {
end = half;
}
}
}
/**
* Returns the combining class of a given character.
*
* @param c The character.
* @return The combining class.
*/
static int combiningClass(char c)
{
int h = c >> 8;
int l = c & 0xff;
int i = CombiningClass.i[h];
if (i > -1) {
return CombiningClass.c[i][l];
} else {
return 0;
}
}
/**
* Rearranges characters in a stringbuffer in order to respect the
* canonical ordering properties.
*
* @param The StringBuffer to rearrange.
*/
static void canonicalOrdering(StringBuffer in)
{
boolean isOrdered = false;
while (!isOrdered) {
isOrdered = true;
int lastCC = combiningClass(in.charAt(0));
for (int i = 0; i < in.length()-1; i++) {
int nextCC = combiningClass(in.charAt(i+1));
if (nextCC != 0 && lastCC > nextCC) {
for (int j = i+1; j > 0; j--) {
if (combiningClass(in.charAt(j-1)) <= nextCC) {
break;
}
char t = in.charAt(j);
in.setCharAt(j, in.charAt(j-1));
in.setCharAt(j-1, t);
isOrdered = false;
}
nextCC = lastCC;
}
lastCC = nextCC;
}
}
}
/**
* Returns the index inside the composition table.
*
* @param a Character to look up.
* @return Index if found, -1 otherwise.
*/
static int composeIndex(char a)
{
if (a>>8 >= Composition.composePage.length) {
return -1;
}
int ap = Composition.composePage[a>>8];
if (ap == -1) {
return -1;
}
return Composition.composeData[ap][a & 0xff];
}
/**
* Tries to compose two characters canonically.
*
* @param a First character.
* @param b Second character.
* @return The composed character or -1 if no composition could be
* found.
*/
static int compose(char a, char b)
{
int h = composeHangul(a, b);
if (h != -1) {
return h;
}
int ai = composeIndex(a);
if (ai >= Composition.singleFirstStart && ai < Composition.singleSecondStart) {
if (b == Composition.singleFirst[ai - Composition.singleFirstStart][0]) {
return Composition.singleFirst[ai - Composition.singleFirstStart][1];
} else {
return -1;
}
}
int bi = composeIndex(b);
if (bi >= Composition.singleSecondStart) {
if (a == Composition.singleSecond[bi - Composition.singleSecondStart][0]) {
return Composition.singleSecond[bi - Composition.singleSecondStart][1];
} else {
return -1;
}
}
if (ai >= 0 && ai < Composition.multiSecondStart &&
bi >= Composition.multiSecondStart && bi < Composition.singleFirstStart) {
char[] f = Composition.multiFirst[ai];
if (bi - Composition.multiSecondStart < f.length) {
char r = f[bi - Composition.multiSecondStart];
if (r == 0) {
return -1;
} else {
return r;
}
}
}
return -1;
}
/**
* Entire hangul code copied from:
* http://www.unicode.org/unicode/reports/tr15/
*
* Several hangul specific constants
*/
static final int SBase = 0xAC00;
static final int LBase = 0x1100;
static final int VBase = 0x1161;
static final int TBase = 0x11A7;
static final int LCount = 19;
static final int VCount = 21;
static final int TCount = 28;
static final int NCount = VCount * TCount;
static final int SCount = LCount * NCount;
/**
* Decomposes a hangul character.
*
* @param s A character to decompose.
* @return A string containing the hangul decomposition of the input
* character. If no hangul decomposition can be found, a string
* containing the character itself is returned.
*/
static String decomposeHangul(char s)
{
int SIndex = s - SBase;
if (SIndex < 0 || SIndex >= SCount) {
return String.valueOf(s);
}
StringBuffer result = new StringBuffer();
int L = LBase + SIndex / NCount;
int V = VBase + (SIndex % NCount) / TCount;
int T = TBase + SIndex % TCount;
result.append((char)L);
result.append((char)V);
if (T != TBase) result.append((char)T);
return result.toString();
}
/**
* Composes two hangul characters.
*
* @param a First character.
* @param b Second character.
* @return Returns the composed character or -1 if the two
* characters cannot be composed.
*/
static int composeHangul(char a, char b)
{
// 1. check to see if two current characters are L and V
int LIndex = a - LBase;
if (0 <= LIndex && LIndex < LCount) {
int VIndex = b - VBase;
if (0 <= VIndex && VIndex < VCount) {
// make syllable of form LV
return SBase + (LIndex * VCount + VIndex) * TCount;
}
}
// 2. check to see if two current characters are LV and T
int SIndex = a - SBase;
if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0) {
int TIndex = b - TBase;
if (0 <= TIndex && TIndex <= TCount) {
// make syllable of form LVT
return a+TIndex;
}
}
return -1;
}
}