// org.elasticsearch.xpack.core.ml.inference.preprocessing.customwordembedding.ScriptDetector Maven / Gradle / Ivy
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*
* This Java port of CLD3 was derived from Google's CLD3 project at https://github.com/google/cld3
*/
package org.elasticsearch.xpack.core.ml.inference.preprocessing.customwordembedding;
import static java.lang.Character.UnicodeBlock.ARABIC;
import static java.lang.Character.UnicodeBlock.CYRILLIC;
import static java.lang.Character.UnicodeBlock.GREEK;
import static java.lang.Character.UnicodeBlock.HANGUL_JAMO;
import static java.lang.Character.UnicodeBlock.HEBREW;
import static java.lang.Character.UnicodeBlock.HIRAGANA;
import static java.lang.Character.UnicodeBlock.KATAKANA;
/**
* Derived from https://github.com/google/cld3/blob/master/src/script_detector.h
*
* We take advantage of Java codepoints to determine the specific script value we care about
*/
public final class ScriptDetector {

    /** Static holder for {@link Script}; never instantiated. */
    private ScriptDetector() { }

    // Unicode scripts we care about. To get compact and fast code, we detect only
    // a few Unicode scripts that offer a strong indication about the language of
    // the text (e.g., Hiragana -> Japanese).
    public enum Script {
        // Special value to indicate internal errors in the script detection code.
        kScriptError(0),

        // Special values for all Unicode scripts that we do not detect. One special
        // value for Unicode characters of 1, 2, 3, respectively 4 bytes (as we
        // already have that information, we use it). kScriptOtherUtf8OneByte means
        // ~Latin and kScriptOtherUtf8FourBytes means ~Han.
        kScriptOtherUtf8OneByte(1),
        kScriptOtherUtf8TwoBytes(2),
        kScriptOtherUtf8ThreeBytes(3),
        kScriptOtherUtf8FourBytes(4),

        kScriptGreek(5),
        kScriptCyrillic(6),
        kScriptHebrew(7),
        kScriptArabic(8),
        kScriptHangulJamo(9), // Used primarily for Korean.
        kScriptHiragana(10), // Used primarily for Japanese.
        kScriptKatakana(11); // Used primarily for Japanese.

        // Exclusive upper bounds of the code point ranges whose UTF-8 encoding
        // takes one, two and three bytes respectively.
        private static final int UTF8_ONE_BYTE_LIMIT = 0x80;
        private static final int UTF8_TWO_BYTE_LIMIT = 0x800;
        private static final int UTF8_THREE_BYTE_LIMIT = 0x10000;

        private final int code;

        Script(int code) {
            this.code = code;
        }

        /**
         * @return the integer code that was assigned to this constant in its declaration
         */
        public int toInt() {
            return code;
        }

        /**
         * Classifies a single Unicode code point.
         *
         * <p>Code points in the Greek, Cyrillic, Arabic, Hebrew, Katakana, Hiragana and
         * Hangul Jamo Unicode blocks map to their dedicated constant. Every other code
         * point is bucketed by the number of bytes its UTF-8 encoding needs.
         *
         * @param codePoint the code point to classify
         * @return the matching script; {@link #kScriptError} when {@code codePoint} is
         *         {@code 0} or is not a valid Unicode code point (negative or above
         *         {@code U+10FFFF})
         */
        public static Script fromCodePoint(int codePoint) {
            // Character.UnicodeBlock.of throws IllegalArgumentException for invalid
            // code points, so reject them up front and report them via kScriptError.
            if (Character.isValidCodePoint(codePoint) == false) {
                return kScriptError;
            }

            // Unicode blocks are used here for the HANGUL vs HANGUL_JAMO distinction,
            // if one exists. This still needs to be investigated.
            // Note: block is null for code points that belong to no block; the
            // constant-first equals calls below tolerate that.
            Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
            if (GREEK.equals(block)) {
                return kScriptGreek;
            }
            if (CYRILLIC.equals(block)) {
                return kScriptCyrillic;
            }
            if (ARABIC.equals(block)) {
                return kScriptArabic;
            }
            if (HEBREW.equals(block)) {
                return kScriptHebrew;
            }
            if (KATAKANA.equals(block)) {
                return kScriptKatakana;
            }
            if (HIRAGANA.equals(block)) {
                return kScriptHiragana;
            }
            if (HANGUL_JAMO.equals(block)) {
                return kScriptHangulJamo;
            }

            // Not one of our special cases, need to determine the UTF-8 byte size.
            // Code point 0 (NUL) is not bucketed; it is reported as kScriptError.
            if (codePoint == 0) {
                return kScriptError;
            }
            if (codePoint < UTF8_ONE_BYTE_LIMIT) {
                return kScriptOtherUtf8OneByte;
            }
            if (codePoint < UTF8_TWO_BYTE_LIMIT) {
                return kScriptOtherUtf8TwoBytes;
            }
            if (codePoint < UTF8_THREE_BYTE_LIMIT) {
                return kScriptOtherUtf8ThreeBytes;
            }
            // Validity was checked above, so anything left is in U+10000..U+10FFFF.
            return kScriptOtherUtf8FourBytes;
        }
    }
}