
org.jcodings.unicode.UnicodeEncoding Maven / Gradle / Ivy
/*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished to do
* so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package org.jcodings.unicode;
import java.io.DataInputStream;
import java.io.IOException;
import org.jcodings.ApplyAllCaseFoldFunction;
import org.jcodings.CaseFoldCodeItem;
import org.jcodings.CodeRange;
import org.jcodings.Config;
import org.jcodings.IntHolder;
import org.jcodings.MultiByteEncoding;
import org.jcodings.constants.CharacterType;
import org.jcodings.exception.CharacterPropertyException;
import org.jcodings.exception.EncodingError;
import org.jcodings.exception.ErrorMessages;
import org.jcodings.util.ArrayReader;
import org.jcodings.util.CaseInsensitiveBytesHash;
import org.jcodings.util.IntArrayHash;
import org.jcodings.util.IntHash;
public abstract class UnicodeEncoding extends MultiByteEncoding {
private static final int PROPERTY_NAME_MAX_SIZE = UnicodeCodeRange.MAX_WORD_LENGTH + 1;
static final int I_WITH_DOT_ABOVE = 0x0130;
static final int DOTLESS_i = 0x0131;
static final int DOT_ABOVE = 0x0307;
protected UnicodeEncoding(String name, int minLength, int maxLength, int[]EncLen, int[][]Trans) {
// ASCII type tables for all Unicode encodings
super(name, minLength, maxLength, EncLen, Trans, UNICODE_ISO_8859_1_CTypeTable);
isUnicode = true;
}
protected UnicodeEncoding(String name, int minLength, int maxLength, int[]EncLen) {
this(name, minLength, maxLength, EncLen, null);
}
@Override
public String getCharsetName() {
return new String(getName());
}
// onigenc_unicode_is_code_ctype
@Override
public boolean isCodeCType(int code, int ctype) {
if (Config.USE_UNICODE_PROPERTIES) {
if (ctype <= CharacterType.MAX_STD_CTYPE && code < 256)
return isCodeCTypeInternal(code, ctype);
} else {
if (code < 256) return isCodeCTypeInternal(code, ctype);
}
if (ctype > UnicodeCodeRange.CodeRangeTable.length) throw new InternalError(ErrorMessages.ERR_TYPE_BUG);
return CodeRange.isInCodeRange(UnicodeCodeRange.CodeRangeTable[ctype].getRange(), code);
}
public static boolean isInCodeRange(UnicodeCodeRange range, int code) {
return CodeRange.isInCodeRange(range.getRange(), code);
}
// onigenc_unicode_ctype_code_range
protected final int[]ctypeCodeRange(int ctype) {
if (ctype >= UnicodeCodeRange.CodeRangeTable.length) throw new InternalError(ErrorMessages.ERR_TYPE_BUG);
return UnicodeCodeRange.CodeRangeTable[ctype].getRange();
}
// onigenc_unicode_property_name_to_ctype
@Override
public int propertyNameToCType(byte[]name, int p, int end) {
byte[]buf = new byte[PROPERTY_NAME_MAX_SIZE];
int len = 0;
for(int p_ = p; p_ < end; p_+= length(name, p_, end)) {
int code = mbcToCode(name, p_, end);
if (code == ' ' || code == '-' || code == '_') continue;
if (code >= 0x80) throw new CharacterPropertyException(EncodingError.ERR_INVALID_CHAR_PROPERTY_NAME, name, p, end);
buf[len++] = (byte)code;
if (len >= PROPERTY_NAME_MAX_SIZE) throw new CharacterPropertyException(EncodingError.ERR_INVALID_CHAR_PROPERTY_NAME, name, p, end);
}
Integer ctype = CTypeName.Values.get(buf, 0, len);
if (ctype == null) throw new CharacterPropertyException(EncodingError.ERR_INVALID_CHAR_PROPERTY_NAME, name, p, end);
return ctype;
}
// onigenc_unicode_mbc_case_fold
@Override
public int mbcCaseFold(int flag, byte[]bytes, IntHolder pp, int end, byte[]fold) {
int p = pp.value;
int foldP = 0;
int code = mbcToCode(bytes, p, end);
int len = length(bytes, p, end);
pp.value += len;
if (Config.USE_UNICODE_CASE_FOLD_TURKISH_AZERI) {
if ((flag & Config.CASE_FOLD_TURKISH_AZERI) != 0) {
if (code == 'I') {
return codeToMbc(DOTLESS_i, fold, foldP);
} else if (code == I_WITH_DOT_ABOVE) {
return codeToMbc('i', fold, foldP);
}
}
}
CodeList to = CaseFold.Values.get(code);
if (to != null) {
if (to.codes.length == 1) {
return codeToMbc(to.codes[0], fold, foldP);
} else {
int rlen = 0;
for (int i=0; i= 'a' && code <= 'z') {
if ((flags & Config.CASE_UPCASE) != 0) {
flags |= Config.CASE_MODIFIED;
if ((flags & Config.CASE_FOLD_TURKISH_AZERI) != 0 && code == 'i') code = I_WITH_DOT_ABOVE; else code += 'A' - 'a';
}
} else if (code >= 'A' && code <= 'Z') {
if ((flags & (Config.CASE_DOWNCASE | Config.CASE_FOLD)) != 0) {
flags |= Config.CASE_MODIFIED;
if ((flags & Config.CASE_FOLD_TURKISH_AZERI) != 0 && code == 'I') code = DOTLESS_i; else code += 'a' - 'A';
}
}
} else if ((flags & Config.CASE_ASCII_ONLY) == 0 && code >= 0x00B5) {
CodeList folded;
if (code == I_WITH_DOT_ABOVE) {
if ((flags & (Config.CASE_DOWNCASE | Config.CASE_FOLD)) != 0) {
flags |= Config.CASE_MODIFIED;
code = 'i';
if ((flags & Config.CASE_FOLD_TURKISH_AZERI) == 0) {
toP += codeToMbc(code, to, toP);
code = DOT_ABOVE;
}
}
} else if (code == DOTLESS_i) {
if ((flags & Config.CASE_UPCASE) != 0) {
flags |= Config.CASE_MODIFIED;
code = 'I';
}
} else if ((folded = CaseFold.Values.get(code)) != null) { /* data about character found in CaseFold_Table */
if ((flags & Config.CASE_TITLECASE) != 0 && code >= 0x1C90 && code <= 0x1CBF) { /* Georgian MTAVRULI */
flags |= Config.CASE_MODIFIED;
code += 0x10D0 - 0x1C90;
} else if ((flags & Config.CASE_TITLECASE) != 0 && (folded.flags & Config.CASE_IS_TITLECASE) != 0) { /* Titlecase needed, but already Titlecase */
/* already Titlecase, no changes needed */
} else if ((flags & folded.flags) != 0) {
final int[]codes;
final int start;
final int finish;
boolean specialCopy = false;
flags |= Config.CASE_MODIFIED;
if ((flags & folded.flags & Config.CASE_SPECIALS) != 0) {
codes = CaseMappingSpecials.Values;
int specialStart = (folded.flags & Config.SpecialIndexMask) >>> Config.SpecialIndexShift;
if ((folded.flags & Config.CASE_IS_TITLECASE) != 0) {
if ((flags & (Config.CASE_UPCASE | Config.CASE_DOWNCASE)) == (Config.CASE_UPCASE | Config.CASE_DOWNCASE))
specialCopy = true;
else
specialStart += extractLength(codes[specialStart]);
}
if (!specialCopy && (folded.flags & Config.CASE_TITLECASE) != 0) {
if ((flags & Config.CASE_TITLECASE) != 0)
specialCopy = true;
else
specialStart += extractLength(codes[specialStart]);
}
if (!specialCopy && (folded.flags & Config.CASE_DOWN_SPECIAL) != 0) {
if ((flags & Config.CASE_DOWN_SPECIAL) == 0)
specialStart += extractLength(codes[specialStart]);
}
start = specialStart;
finish = start + extractLength(codes[specialStart]);
code = extractCode(codes[specialStart]);
} else {
codes = folded.codes;
start = 0;
finish = folded.codes.length;
code = codes[0];
}
for (int i = start + 1; i < finish; i++) {
toP += codeToMbc(code, to, toP);
code = codes[i];
}
}
} else if ((folded = CaseUnfold11.Values.get(code)) != null) { /* data about character found in CaseUnfold_11_Table */
if ((flags & Config.CASE_TITLECASE) != 0 && (folded.flags & Config.CASE_IS_TITLECASE) != 0) { /* Titlecase needed, but already Titlecase */
/* already Titlecase, no changes needed */
} else if ((flags & folded.flags) != 0) { /* needs and data availability match */
flags |= Config.CASE_MODIFIED;
code = folded.codes[(flags & folded.flags & Config.CASE_TITLECASE) != 0 ? 1 : 0];
}
}
}
toP += codeToMbc(code, to, toP);
if ((flags & Config.CASE_TITLECASE) != 0) {
flags ^= (Config.CASE_UPCASE | Config.CASE_DOWNCASE | Config.CASE_TITLECASE | Config.CASE_UP_SPECIAL | Config.CASE_DOWN_SPECIAL);
}
} // while
flagP.value = flags;
return toP - toStart;
}
static final short UNICODE_ISO_8859_1_CTypeTable[] = {
0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,
0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
};
static class CTypeName {
private static final CaseInsensitiveBytesHash Values = initializeCTypeNameTable();
private static CaseInsensitiveBytesHash initializeCTypeNameTable() {
CaseInsensitiveBytesHash table = new CaseInsensitiveBytesHash();
for (int i = 0; i < UnicodeCodeRange.CodeRangeTable.length; i++) {
table.putDirect(UnicodeCodeRange.CodeRangeTable[i].name, i);
}
return table;
}
}
private static class CodeList {
CodeList(DataInputStream dis) throws IOException {
int packed = dis.readInt();
flags = packed & ~Config.CodePointMask;
int length = packed & Config.CodePointMask;
codes = new int[length];
for (int j = 0; j < length; j++) {
codes[j] = dis.readInt();
}
}
final int[]codes;
final int flags;
}
private static class CaseFold {
static IntHash read(String table) {
try {
DataInputStream dis = ArrayReader.openStream(table);
int size = dis.readInt();
IntHash hash = new IntHash(size);
for (int i = 0; i < size; i++) {
hash.putDirect(dis.readInt(), new CodeList(dis));
}
dis.close();
return hash;
} catch (IOException iot) {
throw new RuntimeException(iot);
}
}
static final IntHashValues = read("CaseFold");
}
private static class CaseUnfold11 {
private static final int From[];
private static final CodeList To[];
private static final int Locale_From[];
private static final CodeList Locale_To[];
static Object[] read(String table) {
try {
DataInputStream dis = ArrayReader.openStream(table);
int size = dis.readInt();
int[]from = new int[size];
CodeList[]to = new CodeList[size];
for (int i = 0; i < size; i++) {
from[i] = dis.readInt();
to[i] = new CodeList(dis);
}
dis.close();
return new Object[] {from, to};
} catch (IOException iot) {
throw new RuntimeException(iot);
}
}
static {
Object[]unfold;
unfold = read("CaseUnfold_11");
From = (int[])unfold[0];
To = (CodeList[])unfold[1];
unfold = read("CaseUnfold_11_Locale");
Locale_From = (int[])unfold[0];
Locale_To = (CodeList[])unfold[1];
}
static IntHash initializeUnfold1Hash() {
IntHash hash = new IntHash(From.length + Locale_From.length);
for (int i = 0; i < From.length; i++) {
hash.putDirect(From[i], To[i]);
}
for (int i = 0; i < Locale_From.length; i++) {
hash.putDirect(Locale_From[i], Locale_To[i]);
}
return hash;
}
static final IntHash Values = initializeUnfold1Hash();
}
private static Object[] readFoldN(int fromSize, String table) {
try {
DataInputStream dis = ArrayReader.openStream(table);
int size = dis.readInt();
int[][]from = new int[size][];
CodeList[]to = new CodeList[size];
for (int i = 0; i < size; i++) {
from[i] = new int[fromSize];
for (int j = 0; j < fromSize; j++) {
from[i][j] = dis.readInt();
}
to[i] = new CodeList(dis);
}
dis.close();
return new Object[] {from, to};
} catch (IOException iot) {
throw new RuntimeException(iot);
}
}
private static class CaseUnfold12 {
private static final int From[][];
private static final CodeList To[];
private static final int Locale_From[][];
private static final CodeList Locale_To[];
static {
Object[]unfold;
unfold = readFoldN(2, "CaseUnfold_12");
From = (int[][])unfold[0];
To = (CodeList[])unfold[1];
unfold = readFoldN(2, "CaseUnfold_12_Locale");
Locale_From = (int[][])unfold[0];
Locale_To = (CodeList[])unfold[1];
}
private static IntArrayHash initializeUnfold2Hash() {
IntArrayHash unfold2 = new IntArrayHash(From.length + Locale_From.length);
for (int i = 0; i < From.length; i++) {
unfold2.putDirect(From[i], To[i]);
}
for (int i = 0; i < Locale_From.length; i++) {
unfold2.putDirect(Locale_From[i], Locale_To[i]);
}
return unfold2;
}
static final IntArrayHash Values = initializeUnfold2Hash();
}
private static class CaseUnfold13 {
private static final int From[][];
private static final CodeList To[];
static {
Object[]unfold;
unfold = readFoldN(3, "CaseUnfold_13");
From = (int[][])unfold[0];
To = (CodeList[])unfold[1];
}
private static IntArrayHash initializeUnfold3Hash() {
IntArrayHash unfold3 = new IntArrayHash(From.length);
for (int i = 0; i < From.length; i++) {
unfold3.putDirect(From[i], To[i]);
}
return unfold3;
}
static final IntArrayHash Values = initializeUnfold3Hash();
}
private static int extractLength(int packed) {
return packed >>> Config.SpecialsLengthOffset;
}
private static int extractCode(int packed) {
return packed & ((1 << Config.SpecialsLengthOffset) - 1);
}
private static class CaseMappingSpecials {
static final int[] Values = ArrayReader.readIntArray("CaseMappingSpecials");
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy