src.org.python.modules.ucnhash Maven / Gradle / Ivy
// Copyright 1998 Finn Bock.
package org.python.modules;
import java.io.*;
import org.python.core.*;
public class ucnhash implements ucnhashAPI {
// Parameters for the word hash.
private static int n;
private static int m;
private static int minchar;
private static int maxchar;
private static int alphasz;
private static int maxlen;
private static int maxidx;
private static int maxklen;
private static short[] G;
private static short[] T0;
private static short[] T1;
private static short[] T2;
// Map the hashed values into the text (as bytes).
private static byte[] worddata;
private static short[] wordoffs;
// wordindex greate then cutoff is stored as into two bytes.
private static short wordstart;
private static short wordcutoff;
// The raw data and indexes into start of each name
// The rawindex is sorted based on the wordindexes.
private static byte[] rawdata;
private static char[] rawindex;
// The mapping from raw data index to unicode code points.
private static char[] codepoint;
public static String[] __depends__ = new String[] {
"/org/python/modules/ucnhash.dat",
};
public static void loadTables() throws Exception {
InputStream instream = ucnhash.class.
getResourceAsStream("ucnhash.dat");
if (instream == null)
throw new IOException("Unicode name database not found: " +
"ucnhash.dat");
DataInputStream in = new DataInputStream(
new BufferedInputStream(instream));
n = in.readShort();
m = in.readShort();
minchar= in.readShort();
maxchar = in.readShort();
alphasz = in.readShort();
maxlen = in.readShort();
maxidx = maxlen*alphasz-minchar;
G = readShortTable(in);
if (in.readShort() != 3)
throw new IOException("UnicodeNameMap file corrupt, " +
"unknown dimension");
T0 = readShortTable(in);
T1 = readShortTable(in);
T2 = readShortTable(in);
wordoffs = readShortTable(in);
worddata = readByteTable(in);
wordstart = in.readShort();
wordcutoff = in.readShort();
maxklen = in.readShort();
rawdata = readByteTable(in);
rawindex = readCharTable(in);
codepoint = readCharTable(in);
}
private static short[] readShortTable(DataInputStream in)
throws IOException
{
if (in.read() != 't')
throw new IOException("UnicodeNameMap file corrupt, shorttable");
int n = in.readUnsignedShort() / 2;
short[] table = new short[n];
for (int i = 0; i < n; i++) {
table[i] = in.readShort();
}
return table;
}
private static char[] readCharTable(DataInputStream in)
throws IOException
{
if (in.read() != 't')
throw new IOException("UnicodeNameMap file corrupt, chartable");
int n = in.readUnsignedShort() / 2;
char[] table = new char[n];
for (int i = 0; i < n; i++) {
table[i] = in.readChar();
}
return table;
}
private static byte[] readByteTable(DataInputStream in)
throws IOException
{
if (in.read() != 't')
throw new IOException("UnicodeNameMap file corrupt, byte table");
int n = in.readUnsignedShort();
byte[] table = new byte[n];
in.readFully(table);
return table;
}
public static int hash(String key) {
return hash(key, 0, key.length());
}
public static int hash(String key, int start, int end) {
int i, j;
int f0, f1, f2;
for (j = start, i=-minchar, f0=f1=f2=0; j < end; j++) {
char ch = key.charAt(j);
if (ch >= 'a' && ch <= 'z')
ch = (char) (ch - 'a' + 'A');
f0 += T0[i + ch];
f1 += T1[i + ch];
f2 += T2[i + ch];
i += alphasz;
if (i >= maxidx)
i = -minchar;
}
f0 %= n;
f1 %= n;
f2 %= n;
return (G[f0] + G[f1] + G[f2]) % m;
}
private static final char[] charmap =
" ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-".toCharArray();
private static String getWord(int idx) {
int offset = wordoffs[idx];
int end = worddata.length;
if (idx < wordoffs.length-1)
end = wordoffs[idx+1];
StringBuilder buf = new StringBuilder();
for (int i = offset; i < end; i++)
buf.append(charmap[worddata[i]]);
return buf.toString();
}
private static boolean match(int idx, byte[] raw, int begin, int end) {
int woff = wordoffs[idx];
int wend = worddata.length;
if (idx < wordoffs.length-1)
wend = wordoffs[idx+1];
if (end-begin != wend - woff)
return false;
int l = end-begin;
for (int i = 0; i < l; i++) {
if (worddata[woff + i] != raw[begin + i])
return false;
}
return true;
}
private static int compare(byte[] a1, int off1, int len1,
byte[] a2, int off2, int len2)
{
for (int i = 0; i < len1 && i < len2; i++) {
int d = (a1[off1 + i] & 0xFF) - (a2[off2 + i] & 0xFF);
if (d != 0)
return d;
}
return len1 - len2;
}
private static int binarysearch(byte[] rawlist, int start, int end) {
int floor = 0;
int ceiling = (rawindex.length) / 5;
while (floor < ceiling - 1) {
int middle = (floor + ceiling) / 2;
if (debug)
System.out.println("floor:" + floor + " ceiling:" +
ceiling +" => " + middle);
int off = rawindex[middle*5];
int len = rawindex[middle*5+4] & 0x1F;
int d = compare(rawlist, start, end - start, rawdata, off, len);
if (d < 0)
ceiling = middle;
else if (d > 0)
floor = middle;
else
return middle * 12;
}
int tmp = floor*5;
int off = rawindex[tmp++];
long lengths = ((long) rawindex[tmp++] << 48) |
((long) rawindex[tmp++] << 32) |
((long) rawindex[tmp++] << 16) |
rawindex[tmp++];
floor *= 12;
for (int i = 0; i < 12; i++) {
int len = (int) (lengths >> (i * 5)) & 0x1F;
if (compare(rawlist, start, end, rawdata, off, len) == 0)
return floor;
off += len;
floor++;
}
return -1;
}
public static int lookup(String name) {
return lookup(name, 0, name.length());
}
private static int lookup(String name, int start, int end) {
byte[] rawlist = new byte[32];
int ridx = 0;
int rbegin = 0;
int rstart = 0;
int i;
while (true) {
rbegin = ridx;
int begin = start;
for (i = start; i < end; i++) {
char ch = name.charAt(i);
if (ch == ' ') {
start = i+1;
break;
}
int v;
if (ch >= 'a' && ch <= 'z')
ch = (char) (ch - 'a' + 'A');
if (ch >= 'A' && ch <= 'Z')
v = ch - 'A' + 1;
else if (ch >= '0' && ch <= '9')
v = ch - '0' + 27;
else if (ch == '-')
v = 37;
else
return -1;
rawlist[ridx++] = (byte) v;
if (ch == '-' && start != i) {
start = ++i;
break;
}
}
int hash = hash(name, begin, i);
if (debug)
System.out.println(name.substring(begin, i) + " " + hash);
boolean isWord = hash >= 0 &&
ridx - rbegin > 1 &&
match(hash, rawlist, rbegin, ridx);
if (isWord) {
if (debug)
System.out.println("match " + getWord(hash));
hash += wordstart;
ridx = rstart;
if (hash > wordcutoff) {
rawlist[ridx++] = (byte) ((hash >> 8) + wordcutoff);
rawlist[ridx++] = (byte) (hash & 0xFF);
} else
rawlist[ridx++] = (byte) hash;
}
rstart = ridx;
if (i >= end)
break;
if (!isWord) {
rawlist[ridx++] = 0;
}
}
if (debug) {
System.out.print("rawdata: ");
for (int k = 0; k < ridx; k++)
System.out.print((rawlist[k] & 0xFF) + " ");
System.out.println();
}
int idx = binarysearch(rawlist, 0, ridx);
if (idx < 0)
return idx;
if (debug) {
System.out.println("idx:" + idx);
System.out.println("codepoint:" + codepoint[idx] + " " +
Integer.toHexString(codepoint[idx]));
}
return codepoint[idx];
}
// From the ucnhashAPI interface
public int getCchMax() {
if (!initialized())
return -1;
return maxklen;
}
private static String cjkPrefix = "CJK COMPATIBILITY IDEOGRAPH-";
private static int cjkPrefixLen = cjkPrefix.length();
// From the ucnhashAPI interface
public int getValue(String s, int start, int end) {
if (!initialized())
return -1;
if (s.regionMatches(start, cjkPrefix, 0, cjkPrefixLen)) {
try {
String hex = s.substring(start + cjkPrefixLen, end);
int v = Integer.parseInt(hex, 16);
return v;
} catch (NumberFormatException exc) {
return -1; // Maybe fallthrough to the main algorithme.
}
}
return lookup(s, start, end);
}
private static boolean initialized = false;
private static boolean loaded = false;
private synchronized boolean initialized() {
if (initialized && loaded)
return true;
if (initialized)
return false;
try {
loadTables();
loaded = true;
} catch (Exception exc) {
return false;
}
initialized = true;
return true;
}
private static boolean debug = false;
public static void main(String[] args) throws Exception {
loadTables();
debug = true;
/*
System.out.println(getWord(hash("ARABIC")));
System.out.println(getWord(hash("SMALL")));
System.out.println(getWord(hash("YI")));
System.out.println(getWord(hash("SYLLABLE")));
System.out.println(getWord(hash("WITH")));
System.out.println(getWord(hash("LETTER")));
System.out.println(lookup("NULL"));
System.out.println(lookup("LATIN CAPITAL LETTER AFRICAN D"));
System.out.println(lookup("GURMUKHI TIPPI"));
System.out.println(lookup("TIBETAN MARK GTER YIG MGO -UM " +
"RNAM BCAD MA"));
System.out.println(lookup("HANGUL CHOSEONG PIEUP"));
System.out.println(lookup("SINGLE LOW-9 QUOTATION MARK"));
*/
System.out.println(lookup("BACKSPACE"));
// System.out.println(lookup("ACTIVATE SYMMETRIC SWAPPING"));
/*
System.out.println(lookup("LATIN CAPITAL LETTER A"));
System.out.println(lookup("GREATER-THAN SIGN"));
System.out.println(lookup("EURO-CURRENCY SIGN"));
*/
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy