org.python.modules.ucnhash Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jython-slim Show documentation
Show all versions of jython-slim Show documentation
Jython is an implementation of the high-level, dynamic, object-oriented
language Python written in 100% Pure Java, and seamlessly integrated with
the Java platform. It thus allows you to run Python on any Java platform.
/* Copyright 1998 Finn Bock.
* Updated 2017 by Stefan Richthofer to support Unicode 9.0
*/
package org.python.modules;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.BufferedInputStream;
import java.io.IOException;
import org.python.core.ucnhashAPI;
public class ucnhash implements ucnhashAPI {
// Parameters for the word hash.
private static int n;
private static int m;
private static int minchar;
private static int alphasz;
private static int maxlen;
private static int maxidx;
private static int maxklen;
private static short[] G;
private static short[] T0;
private static short[] T1;
private static short[] T2;
// Map the hashed values into the text (as bytes).
private static byte[] worddata;
private static short[] wordoffs;
// wordindex greate then cutoff is stored as into two bytes.
private static short wordstart;
private static short wordcutoff;
// The raw data and indexes into start of each name
// The rawindex is sorted based on the wordindexes.
private static byte[] rawdata;
private static int[] rawindex;
// The mapping from raw data index to unicode code points.
private static int[] codepoint;
public static String[] __depends__ = new String[] {
"/org/python/modules/ucnhash.dat",
};
public static void loadTables() throws Exception {
InputStream instream =
ucnhash.class.getResourceAsStream("ucnhash.dat");
if (instream == null)
throw new IOException(
"Unicode name database not found: ucnhash.dat");
DataInputStream in =
new DataInputStream(new BufferedInputStream(instream));
n = in.readShort();
m = in.readShort();
minchar = in.readShort();
alphasz = in.readShort();
maxlen = in.readShort();
maxidx = maxlen*alphasz-minchar;
/*
if (debug) {
System.out.println("n "+n+" m "+m+" maxlen "+maxlen+
" minchar "+minchar+" alphasz "+alphasz);
} */
G = readShortTable(in);
if (in.readShort() != 3)
throw new IOException("UnicodeNameMap file corrupt, " +
"unknown dimension");
T0 = readShortTable(in);
T1 = readShortTable(in);
T2 = readShortTable(in);
wordoffs = readShortTable(in);
worddata = readByteTable(in);
/*
if (debug) {
System.out.println("G "+G.length+" T0 "+T0.length+
" T1 "+T1.length+" T2 "+T2.length);
System.out.println("wordoffs: "+wordoffs.length+
" worddata: "+worddata.length);
}*/
wordstart = in.readShort();
wordcutoff = in.readShort();
maxklen = in.readShort();
rawdata = readByteTable(in);
// Formerly rawindex and codepoint were 16 bit
//rawindex = readCharTable(in);
//codepoint = readCharTable(in);
rawindex = readIntTable(in);
codepoint = readIntTable(in);
/*
if (debug) {
System.out.println("wordstart: "+wordstart+
" wordcutoff: "+wordcutoff+" maxklen: "+maxklen);
System.out.println("rawdata: "+rawdata.length);
System.out.println("rawindex: "+rawindex.length+
" codepoint: "+codepoint.length);
}*/
}
private static short[] readShortTable(DataInputStream in)
throws IOException
{
if (in.read() != 't') {
throw new IOException("UnicodeNameMap file corrupt, shorttable");
}
int n = in.readInt() / 2;
short[] table = new short[n];
for (int i = 0; i < n; i++) {
table[i] = in.readShort();
}
return table;
}
private static int[] readIntTable(DataInputStream in)
throws IOException
{
if (in.read() != 't') {
throw new IOException("UnicodeNameMap file corrupt, inttable");
}
int n = in.readInt() / 4;
int[] table = new int[n];
for (int i = 0; i < n; i++) {
table[i] = in.readInt();
}
return table;
}
private static char[] readCharTable(DataInputStream in)
throws IOException
{
if (in.read() != 't') {
throw new IOException("UnicodeNameMap file corrupt, chartable");
}
int n = in.readInt() / 2;
char[] table = new char[n];
for (int i = 0; i < n; i++) {
table[i] = in.readChar();
}
return table;
}
private static byte[] readByteTable(DataInputStream in)
throws IOException
{
if (in.read() != 't') {
throw new IOException("UnicodeNameMap file corrupt, byte table");
}
int n = in.readInt();
byte[] table = new byte[n];
in.readFully(table);
return table;
}
public static int hash(String key) {
return hash(key, 0, key.length());
}
public static int hash(String key, int start, int end) {
int i, j;
int f0, f1, f2;
for (j = start, i=-minchar, f0=f1=f2=0; j < end; j++) {
char ch = key.charAt(j);
if (ch >= 'a' && ch <= 'z')
ch = (char) (ch - 'a' + 'A');
f0 += T0[i + ch];
f1 += T1[i + ch];
f2 += T2[i + ch];
i += alphasz;
if (i >= maxidx)
i = -minchar;
}
f0 %= n;
f1 %= n;
f2 %= n;
return (G[f0] + G[f1] + G[f2]) % m;
}
private static final char[] charmap =
" ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-()".toCharArray();
private static String getWord(int idx) {
int offset = wordoffs[idx];
int end = worddata.length;
if (idx < wordoffs.length-1)
end = wordoffs[idx+1];
StringBuilder buf = new StringBuilder();
for (int i = offset; i < end; i++)
buf.append(charmap[worddata[i]]);
return buf.toString();
}
private static boolean match(int idx, byte[] raw, int begin, int end) {
int woff = wordoffs[idx];
int wend = worddata.length;
if (idx < wordoffs.length-1) {
wend = wordoffs[idx+1];
}
if (end-begin != wend - woff) {
return false;
}
int l = end-begin;
for (int i = 0; i < l; i++) {
if (worddata[woff + i] != raw[begin + i]) {
return false;
}
}
return true;
}
private static int compare(byte[] a1, int off1, int len1,
byte[] a2, int off2, int len2)
{
for (int i = 0; i < len1 && i < len2; i++) {
int d = (a1[off1 + i] & 0xFF) - (a2[off2 + i] & 0xFF);
if (d != 0) {
return d;
}
}
return len1 - len2;
}
// Was formerly 5, before rawindex changed to 32 bit:
private static final int raw_block = 3;
private static int binarysearch(byte[] rawlist, int start, int end) {
int floor = 0;
int ceiling = (rawindex.length) / raw_block;
int middle, off, len, d;
while (floor < ceiling - 1) {
middle = (floor + ceiling) / 2;
/*if (debug)
System.out.println("floor:" + floor + " ceiling:" +
ceiling +" => " + middle); */
off = rawindex[middle*raw_block];
len = rawindex[middle*raw_block+raw_block-1] & 0x1F;
d = compare(rawlist, start, end - start, rawdata, off, len);
if (d < 0)
ceiling = middle;
else if (d > 0)
floor = middle;
else
return middle * 12;
}
int tmp = floor*raw_block;
off = rawindex[tmp++];
long lengths = (long) rawindex[tmp++] << 32 |
rawindex[tmp++] & 0xFFFFFFFFL;
floor *= 12;
for (int i = 0; i < 12; i++) {
len = (int) (lengths >> (i * 5)) & 0x1F;
if (compare(rawlist, start, end, rawdata, off, len) == 0)
return floor;
off += len;
floor++;
}
return -1;
}
public static int lookup(String name) {
return lookup(name, 0, name.length());
}
private static int lookup(String name, int start, int end) {
byte[] rawlist = new byte[32];
int ridx = 0;
int rbegin = 0;
int rstart = 0;
int i, begin; char ch; byte v;
while (true) {
rbegin = ridx;
begin = start;
lfor: for (i = start; i < end; i++) {
ch = name.charAt(i);
/*
if (ch == ' ') {
start = i+1;
break;
}
if (ch >= 'a' && ch <= 'z')
ch = (char) (ch - 'a' + 'A');
if (ch >= 'A' && ch <= 'Z')
v = ch - 'A' + 1;
else if (ch >= '0' && ch <= '9')
v = ch - '0' + 27;
else {
switch (ch) {
case '-': v = 37; break;
case '(': v = 38; break;
case ')': v = 39; break;
default: return -1;
}
} */ // Unfold this logic into one switch:
// (generated by printCharCases(), see below)
switch (ch) {
case ' ': start = i+1; break lfor;
case 'a': v = 1; break;
case 'b': v = 2; break;
case 'c': v = 3; break;
case 'd': v = 4; break;
case 'e': v = 5; break;
case 'f': v = 6; break;
case 'g': v = 7; break;
case 'h': v = 8; break;
case 'i': v = 9; break;
case 'j': v = 10; break;
case 'k': v = 11; break;
case 'l': v = 12; break;
case 'm': v = 13; break;
case 'n': v = 14; break;
case 'o': v = 15; break;
case 'p': v = 16; break;
case 'q': v = 17; break;
case 'r': v = 18; break;
case 's': v = 19; break;
case 't': v = 20; break;
case 'u': v = 21; break;
case 'v': v = 22; break;
case 'w': v = 23; break;
case 'x': v = 24; break;
case 'y': v = 25; break;
case 'z': v = 26; break;
case 'A': v = 1; break;
case 'B': v = 2; break;
case 'C': v = 3; break;
case 'D': v = 4; break;
case 'E': v = 5; break;
case 'F': v = 6; break;
case 'G': v = 7; break;
case 'H': v = 8; break;
case 'I': v = 9; break;
case 'J': v = 10; break;
case 'K': v = 11; break;
case 'L': v = 12; break;
case 'M': v = 13; break;
case 'N': v = 14; break;
case 'O': v = 15; break;
case 'P': v = 16; break;
case 'Q': v = 17; break;
case 'R': v = 18; break;
case 'S': v = 19; break;
case 'T': v = 20; break;
case 'U': v = 21; break;
case 'V': v = 22; break;
case 'W': v = 23; break;
case 'X': v = 24; break;
case 'Y': v = 25; break;
case 'Z': v = 26; break;
case '0': v = 27; break;
case '1': v = 28; break;
case '2': v = 29; break;
case '3': v = 30; break;
case '4': v = 31; break;
case '5': v = 32; break;
case '6': v = 33; break;
case '7': v = 34; break;
case '8': v = 35; break;
case '9': v = 36; break;
case '-': v = 37; break;
case '(': v = 38; break;
case ')': v = 39; break;
default: return -1;
}
rawlist[ridx++] = v;
if (ch == '-' && start != i) {
start = ++i;
break;
}
}
int hash = hash(name, begin, i);
/*
We skip this try for now, because the issue doesn't occur
with Unicode 9.0 ucnhash.dat bundled with Jython.
Anyway, this might point to some subtle bug.
Todo: Investigate
int hash;
// Currently needed if with older Unicode a
// name containing '(' or ')' is searched:
try {
hash = hash(name, begin, i);
} catch (ArrayIndexOutOfBoundsException aexc) {
return -1;
}
*/
// if (debug) System.out.println(name.substring(begin, i) + " " + hash);
if (hash >= 0 && ridx - rbegin > 1 &&
match(hash, rawlist, rbegin, ridx)) {
// if (debug) System.out.println("match " + getWord(hash));
hash += wordstart;
ridx = rstart;
if (hash > wordcutoff) {
rawlist[ridx++] = (byte) ((hash >> 8) + wordcutoff);
rawlist[ridx++] = (byte) (hash & 0xFF);
} else {
rawlist[ridx++] = (byte) hash;
}
rstart = ridx;
if (i >= end) {
break;
}
} else {
rstart = ridx;
if (i >= end) {
break;
}
rawlist[ridx++] = 0;
}
}
/*
if (debug) {
System.out.print("rawdata: ");
for (int k = 0; k < ridx; k++)
System.out.print((rawlist[k] & 0xFF) + " ");
System.out.println();
} */
int idx = binarysearch(rawlist, 0, ridx);
if (idx < 0) {
return idx;
}
/*
if (debug) {
System.out.println("idx:" + idx);
System.out.println("codepoint:" + codepoint[idx] + " " +
Integer.toHexString(codepoint[idx]));
} */
return codepoint[idx];
}
// From the ucnhashAPI interface
public int getCchMax() {
if (!initialized())
return -1;
return maxklen;
}
private static String cjkPrefix = "CJK COMPATIBILITY IDEOGRAPH-";
private static int cjkPrefixLen = cjkPrefix.length();
// From the ucnhashAPI interface
public int getValue(String s, int start, int end) {
if (!initialized()) {
return -1;
}
if (s.regionMatches(start, cjkPrefix, 0, cjkPrefixLen)) {
try {
String hex = s.substring(start + cjkPrefixLen, end);
int v = Integer.parseInt(hex, 16);
return v;
} catch (NumberFormatException exc) {
return -1; // Maybe fall through to the main algorithm.
}
}
return lookup(s, start, end);
}
private static boolean initialized = false;
private static boolean loaded = false;
private synchronized boolean initialized() {
if (initialized) {
return loaded;
}
try {
loadTables();
loaded = true;
} catch (Exception exc) {
return false;
}
initialized = true;
return true;
}
/*
public static int lookupChar(char ch) {
int v;
if (ch >= 'a' && ch <= 'z')
ch = (char) (ch - 'a' + 'A');
if (ch >= 'A' && ch <= 'Z')
v = ch - 'A' + 1;
else if (ch >= '0' && ch <= '9')
v = ch - '0' + 27;
else {
switch (ch) {
case '-': v = 37; break;
case '(': v = 38; break;
case ')': v = 39; break;
default: return -1;
}
}
return v;
}
public static void printCharCases() {
char[] charmapFull =
" abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-()"
.toCharArray();
for (char ch: charmapFull) {
System.out.println("case \'"+ch+"\': v = "+lookupChar(ch)+"; break;");
}
} */
/*
private static boolean debug = false;
public static void main(String[] args) throws Exception {
loadTables();
debug = true;
System.out.println(getWord(hash("ARABIC")));
System.out.println(getWord(hash("SMALL")));
System.out.println(getWord(hash("YI")));
System.out.println(getWord(hash("SYLLABLE")));
System.out.println(getWord(hash("WITH")));
System.out.println(getWord(hash("LETTER")));
System.out.println(lookup("NULL")); // 0
System.out.println(lookup("LATIN CAPITAL LETTER AFRICAN D")); // 393
System.out.println(lookup("DOUBLE-STRUCK ITALIC SMALL D")); // 8518
System.out.println(lookup("GURMUKHI TIPPI")); // 2672
System.out.println(lookup("TIBETAN MARK GTER YIG MGO -UM" +
" RNAM BCAD MA")); // 3842
System.out.println(lookup("HANGUL CHOSEONG PIEUP")); // 4359
System.out.println(lookup("SINGLE LOW-9 QUOTATION MARK")); // 8218
System.out.println(lookup("BACKSPACE")); // 8
System.out.println(lookup("ACTIVATE SYMMETRIC SWAPPING")); // 8299
System.out.println(lookup("LATIN CAPITAL LETTER A")); // 65
System.out.println(lookup("GREATER-THAN SIGN")); // 62
System.out.println(lookup("EURO-CURRENCY SIGN")); // 8352
System.out.println(lookup("FORM FEED (FF)")); // 12
System.out.println(lookup("FORM FEED (F")); // -1
} */
}