org.jcodings.unicode.UnicodeEncoding Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcodings Show documentation
Show all versions of jcodings Show documentation
Byte based encoding support library for java
The newest version!
/*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished to do
* so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package org.jcodings.unicode;
import java.io.DataInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.jcodings.ApplyAllCaseFoldFunction;
import org.jcodings.CaseFoldCodeItem;
import org.jcodings.CodeRange;
import org.jcodings.Config;
import org.jcodings.IntHolder;
import org.jcodings.MultiByteEncoding;
import org.jcodings.constants.CharacterType;
import org.jcodings.exception.CharacterPropertyException;
import org.jcodings.exception.EncodingError;
import org.jcodings.exception.ErrorMessages;
import org.jcodings.util.ArrayReader;
import org.jcodings.util.CaseInsensitiveBytesHash;
import org.jcodings.util.IntArrayHash;
import org.jcodings.util.IntHash;
public abstract class UnicodeEncoding extends MultiByteEncoding {
// Size of the scratch buffer used by propertyNameToCType; a normalized
// property name that reaches this length is rejected as invalid.
private static final int PROPERTY_NAME_MAX_SIZE = UnicodeCodeRange.MAX_WORD_LENGTH + 1;
// U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
static final int I_WITH_DOT_ABOVE = 0x0130;
// U+0131 LATIN SMALL LETTER DOTLESS I
static final int DOTLESS_i = 0x0131;
// U+0307 COMBINING DOT ABOVE
static final int DOT_ABOVE = 0x0307;
/**
 * Creates a Unicode encoding backed by the shared ISO-8859-1 ctype table.
 *
 * @param name      encoding name, forwarded to the superclass
 * @param minLength minimum character length, forwarded to the superclass
 * @param maxLength maximum character length, forwarded to the superclass
 * @param EncLen    length table, forwarded to the superclass
 * @param Trans     transition table, forwarded to the superclass (may be {@code null})
 */
protected UnicodeEncoding(String name, int minLength, int maxLength, int[] EncLen, int[][] Trans) {
    // Every Unicode encoding shares the same ASCII/Latin-1 type table.
    super(name, minLength, maxLength, EncLen, Trans, UNICODE_ISO_8859_1_CTypeTable);
    this.isUnicode = true;
}
/**
 * Convenience constructor for encodings without a transition table;
 * delegates to the five-argument constructor with {@code null} for it.
 *
 * @param name      encoding name
 * @param minLength minimum character length
 * @param maxLength maximum character length
 * @param EncLen    length table
 */
protected UnicodeEncoding(String name, int minLength, int maxLength, int[] EncLen) {
    this(name, minLength, maxLength, EncLen, (int[][]) null);
}
/**
 * Returns the encoding name as a {@link String}.
 *
 * <p>The bytes from {@code getName()} are decoded with an explicit charset
 * so the result does not depend on the JVM's platform default charset.
 * NOTE(review): assumes {@code getName()} yields the name as plain ASCII
 * bytes - confirm against the base encoding class.
 *
 * @return the encoding name
 */
@Override
public String getCharsetName() {
    return new String(getName(), StandardCharsets.US_ASCII);
}
// onigenc_unicode_is_code_ctype
/**
 * Tests whether {@code code} belongs to the character type {@code ctype}.
 *
 * <p>Code points below 256 are answered by {@code isCodeCTypeInternal}; when
 * {@code Config.USE_UNICODE_PROPERTIES} is enabled this shortcut is taken only
 * for ctypes up to {@code CharacterType.MAX_STD_CTYPE}. Everything else is
 * looked up in {@code UnicodeCodeRange.CodeRangeTable}.
 *
 * @param code  the code point to test
 * @param ctype the character type, used as an index into the code range table
 * @return {@code true} if {@code code} is in the range set for {@code ctype}
 * @throws InternalError if {@code ctype} is at or beyond the end of the code range table
 */
@Override
public boolean isCodeCType(int code, int ctype) {
    if (Config.USE_UNICODE_PROPERTIES) {
        if (ctype <= CharacterType.MAX_STD_CTYPE && code < 256) {
            return isCodeCTypeInternal(code, ctype);
        }
    } else {
        if (code < 256) {
            return isCodeCTypeInternal(code, ctype);
        }
    }
    // '>=' rather than '>': an index equal to the table length is already out of
    // bounds and must be reported as a type bug, matching ctypeCodeRange.
    if (ctype >= UnicodeCodeRange.CodeRangeTable.length) {
        throw new InternalError(ErrorMessages.ERR_TYPE_BUG);
    }
    return CodeRange.isInCodeRange(UnicodeCodeRange.CodeRangeTable[ctype].getRange(), code);
}
/**
 * Tests whether {@code code} falls inside the given Unicode code range.
 *
 * @param range the code range whose table is consulted
 * @param code  the code point to test
 * @return the result of {@code CodeRange.isInCodeRange} for the range's table
 */
public static boolean isInCodeRange(UnicodeCodeRange range, int code) {
    final int[] rangeTable = range.getRange();
    return CodeRange.isInCodeRange(rangeTable, code);
}
// onigenc_unicode_ctype_code_range
/**
 * Returns the code range table registered for {@code ctype}.
 *
 * @param ctype index into {@code UnicodeCodeRange.CodeRangeTable}
 * @return the range array of the selected entry
 * @throws InternalError if {@code ctype} is at or beyond the end of the table
 */
protected final int[] ctypeCodeRange(int ctype) {
    if (ctype < UnicodeCodeRange.CodeRangeTable.length) {
        return UnicodeCodeRange.CodeRangeTable[ctype].getRange();
    }
    throw new InternalError(ErrorMessages.ERR_TYPE_BUG);
}
// onigenc_unicode_property_name_to_ctype
/**
 * Resolves a character property name to its ctype number.
 *
 * <p>The name is normalized by dropping spaces, hyphens and underscores, then
 * looked up in {@code CTypeName.Values}.
 *
 * @param name bytes containing the property name
 * @param p    offset of the first byte of the name
 * @param end  offset just past the last byte of the name
 * @return the ctype registered for the normalized name
 * @throws CharacterPropertyException if the name contains a code point of 0x80
 *         or above, is too long, or is not a known property name
 */
@Override
public int propertyNameToCType(byte[] name, int p, int end) {
    final byte[] normalized = new byte[PROPERTY_NAME_MAX_SIZE];
    int used = 0;
    for (int pos = p; pos < end; pos += length(name, pos, end)) {
        final int code = mbcToCode(name, pos, end);
        final boolean separator = code == ' ' || code == '-' || code == '_';
        if (!separator) {
            if (code >= 0x80) {
                throw new CharacterPropertyException(EncodingError.ERR_INVALID_CHAR_PROPERTY_NAME, name, p, end);
            }
            normalized[used++] = (byte) code;
            // Reject the name as soon as the scratch buffer is full.
            if (used >= PROPERTY_NAME_MAX_SIZE) {
                throw new CharacterPropertyException(EncodingError.ERR_INVALID_CHAR_PROPERTY_NAME, name, p, end);
            }
        }
    }
    Integer ctype = CTypeName.Values.get(normalized, 0, used);
    if (ctype == null) {
        throw new CharacterPropertyException(EncodingError.ERR_INVALID_CHAR_PROPERTY_NAME, name, p, end);
    }
    return ctype.intValue();
}
// onigenc_unicode_mbc_case_fold
// Case-folds the single character at pp.value: decodes it, advances pp.value
// past it, and writes the folded form into 'fold' via codeToMbc.
@Override
public int mbcCaseFold(int flag, byte[]bytes, IntHolder pp, int end, byte[]fold) {
int p = pp.value;
int foldP = 0;
int code = mbcToCode(bytes, p, end);
int len = length(bytes, p, end);
// consume the character regardless of which branch produces the result
pp.value += len;
if (Config.USE_UNICODE_CASE_FOLD_TURKISH_AZERI) {
if ((flag & Config.CASE_FOLD_TURKISH_AZERI) != 0) {
// Turkish/Azeri folding: 'I' -> dotless i, I-with-dot-above -> plain 'i'
if (code == 'I') {
return codeToMbc(DOTLESS_i, fold, foldP);
} else if (code == I_WITH_DOT_ABOVE) {
return codeToMbc('i', fold, foldP);
}
}
}
CodeList to = CaseFold.Values.get(code);
if (to != null) {
if (to.codes.length == 1) {
// one-to-one fold: emit the single replacement code
return codeToMbc(to.codes[0], fold, foldP);
} else {
int rlen = 0;
// NOTE(review): the next line is truncated. Text between "i" and "= 'a'" is
// missing (looks like '<...>' was stripped during extraction), so the rest of
// mbcCaseFold and the head of the following case-mapping loop are not visible
// here. 'flags', 'to', 'toP', 'toStart' and 'flagP' used below are declared in
// the missing part. Restore from the upstream source before compiling.
for (int i=0; i= 'a' && code <= 'z') {
// ASCII lower-case letter: upcase on request ('i' becomes I-with-dot-above under the Turkish/Azeri flag)
if ((flags & Config.CASE_UPCASE) != 0) {
flags |= Config.CASE_MODIFIED;
if ((flags & Config.CASE_FOLD_TURKISH_AZERI) != 0 && code == 'i') code = I_WITH_DOT_ABOVE; else code += 'A' - 'a';
}
} else if (code >= 'A' && code <= 'Z') {
// ASCII upper-case letter: downcase/fold on request ('I' becomes dotless i under the Turkish/Azeri flag)
if ((flags & (Config.CASE_DOWNCASE | Config.CASE_FOLD)) != 0) {
flags |= Config.CASE_MODIFIED;
if ((flags & Config.CASE_FOLD_TURKISH_AZERI) != 0 && code == 'I') code = DOTLESS_i; else code += 'a' - 'A';
}
}
} else if ((flags & Config.CASE_ASCII_ONLY) == 0 && code >= 0x00B5) {
// table-driven path, skipped when CASE_ASCII_ONLY is set
CodeList folded;
if (code == I_WITH_DOT_ABOVE) {
if ((flags & (Config.CASE_DOWNCASE | Config.CASE_FOLD)) != 0) {
flags |= Config.CASE_MODIFIED;
code = 'i';
if ((flags & Config.CASE_FOLD_TURKISH_AZERI) == 0) {
// without the Turkish/Azeri flag the result is 'i' followed by DOT_ABOVE:
// 'i' is written here, DOT_ABOVE by the shared codeToMbc call further down
toP += codeToMbc(code, to, toP);
code = DOT_ABOVE;
}
}
} else if (code == DOTLESS_i) {
if ((flags & Config.CASE_UPCASE) != 0) {
flags |= Config.CASE_MODIFIED;
code = 'I';
}
} else if ((folded = CaseFold.Values.get(code)) != null) { /* data about character found in CaseFold_Table */
if ((flags & Config.CASE_TITLECASE) != 0 && code >= 0x1C90 && code <= 0x1CBF) { /* Georgian MTAVRULI */
flags |= Config.CASE_MODIFIED;
code += 0x10D0 - 0x1C90;
} else if ((flags & Config.CASE_TITLECASE) != 0 && (folded.flags & Config.CASE_IS_TITLECASE) != 0) { /* Titlecase needed, but already Titlecase */
/* already Titlecase, no changes needed */
} else if ((flags & folded.flags) != 0) {
final int[]codes;
final int start;
final int finish;
boolean specialCopy = false;
flags |= Config.CASE_MODIFIED;
if ((flags & folded.flags & Config.CASE_SPECIALS) != 0) {
// special mapping: pick the right entry inside CaseMappingSpecials.Values;
// each entry's first element packs a length and a code (extractLength / extractCode),
// and skipping an entry means advancing by its length
codes = CaseMappingSpecials.Values;
int specialStart = (folded.flags & Config.SpecialIndexMask) >>> Config.SpecialIndexShift;
if ((folded.flags & Config.CASE_IS_TITLECASE) != 0) {
if ((flags & (Config.CASE_UPCASE | Config.CASE_DOWNCASE)) == (Config.CASE_UPCASE | Config.CASE_DOWNCASE))
specialCopy = true;
else
specialStart += extractLength(codes[specialStart]);
}
if (!specialCopy && (folded.flags & Config.CASE_TITLECASE) != 0) {
if ((flags & Config.CASE_TITLECASE) != 0)
specialCopy = true;
else
specialStart += extractLength(codes[specialStart]);
}
if (!specialCopy && (folded.flags & Config.CASE_DOWN_SPECIAL) != 0) {
if ((flags & Config.CASE_DOWN_SPECIAL) == 0)
specialStart += extractLength(codes[specialStart]);
}
start = specialStart;
finish = start + extractLength(codes[specialStart]);
code = extractCode(codes[specialStart]);
} else {
// regular mapping: use the code list stored with the fold entry
codes = folded.codes;
start = 0;
finish = folded.codes.length;
code = codes[0];
}
// write every mapped code except the last; the last one stays in 'code'
// and is written by the shared codeToMbc call after the branch chain
for (int i = start + 1; i < finish; i++) {
toP += codeToMbc(code, to, toP);
code = codes[i];
}
}
} else if ((folded = CaseUnfold11.Values.get(code)) != null) { /* data about character found in CaseUnfold_11_Table */
if ((flags & Config.CASE_TITLECASE) != 0 && (folded.flags & Config.CASE_IS_TITLECASE) != 0) { /* Titlecase needed, but already Titlecase */
/* already Titlecase, no changes needed */
} else if ((flags & folded.flags) != 0) { /* needs and data availability match */
flags |= Config.CASE_MODIFIED;
// index 1 is used when a titlecase mapping is both requested and available, index 0 otherwise
code = folded.codes[(flags & folded.flags & Config.CASE_TITLECASE) != 0 ? 1 : 0];
}
}
}
// write the (possibly remapped) current code
toP += codeToMbc(code, to, toP);
if ((flags & Config.CASE_TITLECASE) != 0) {
// toggles the listed case bits once a character has been processed in titlecase mode;
// presumably this switches the remaining input to down-casing - confirm against upstream
flags ^= (Config.CASE_UPCASE | Config.CASE_DOWNCASE | Config.CASE_TITLECASE | Config.CASE_UP_SPECIAL | Config.CASE_DOWN_SPECIAL);
}
} // while
// hand the updated flags back to the caller and report the number of bytes written
flagP.value = flags;
return toP - toStart;
}
// Character-type table handed to the superclass constructor by every Unicode
// encoding. It holds 256 entries (32 rows of 8); presumably entry N is the
// ctype bit mask for code point N of ISO-8859-1 - confirm against the
// superclass that consumes it.
static final short UNICODE_ISO_8859_1_CTypeTable[] = {
0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,
0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
};
/**
 * Holder for the property-name lookup table: maps each entry name in
 * {@code UnicodeCodeRange.CodeRangeTable} to its index, which serves as the
 * ctype number. The table is built once, when this class is initialized.
 */
static class CTypeName {
    // Parameterized with Integer so lookups yield the ctype index without a raw-type cast.
    private static final CaseInsensitiveBytesHash<Integer> Values = initializeCTypeNameTable();

    /** Builds the name-to-index table from {@code UnicodeCodeRange.CodeRangeTable}. */
    private static CaseInsensitiveBytesHash<Integer> initializeCTypeNameTable() {
        CaseInsensitiveBytesHash<Integer> table = new CaseInsensitiveBytesHash<Integer>();
        for (int i = 0; i < UnicodeCodeRange.CodeRangeTable.length; i++) {
            table.putDirect(UnicodeCodeRange.CodeRangeTable[i].name, i);
        }
        return table;
    }
}
/**
 * A list of code points plus flag bits, read from a binary table stream.
 * The first int read packs both: the bits covered by
 * {@code Config.CodePointMask} give the number of codes that follow, the
 * remaining bits are kept as {@code flags}.
 */
private static class CodeList {
    final int[] codes;
    final int flags;

    /**
     * Reads one packed header int followed by that many code ints.
     *
     * @param dis stream positioned at the start of the entry
     * @throws IOException if reading from the stream fails
     */
    CodeList(DataInputStream dis) throws IOException {
        final int header = dis.readInt();
        this.flags = header & ~Config.CodePointMask;
        final int count = header & Config.CodePointMask;
        final int[] values = new int[count];
        int idx = 0;
        while (idx < count) {
            values[idx] = dis.readInt();
            idx++;
        }
        this.codes = values;
    }
}
private static class CaseFold {
static IntHash read(String table) {
try {
DataInputStream dis = ArrayReader.openStream(table);
int size = dis.readInt();
IntHash hash = new IntHash(size);
for (int i = 0; i < size; i++) {
hash.putDirect(dis.readInt(), new CodeList(dis));
}
dis.close();
return hash;
} catch (IOException iot) {
throw new RuntimeException(iot);
}
}
static final IntHashValues = read("CaseFold");
}
/**
 * Holder for the one-code-point unfold table. The "CaseUnfold_11" and
 * "CaseUnfold_11_Locale" resources are read when this class is initialized
 * and merged into a single hash keyed by code point.
 */
private static class CaseUnfold11 {
    private static final int[] From;
    private static final CodeList[] To;
    private static final int[] Locale_From;
    private static final CodeList[] Locale_To;

    /**
     * Reads a table of (code point, CodeList) pairs preceded by an entry count.
     *
     * @param table name of the resource passed to {@code ArrayReader.openStream}
     * @return a two-element array: {@code int[]} source codes, then {@code CodeList[]} targets
     * @throws RuntimeException wrapping any {@link IOException} raised while reading
     */
    static Object[] read(String table) {
        // try-with-resources closes the stream on failure as well as on success
        try (DataInputStream dis = ArrayReader.openStream(table)) {
            int size = dis.readInt();
            int[] from = new int[size];
            CodeList[] to = new CodeList[size];
            for (int i = 0; i < size; i++) {
                from[i] = dis.readInt();
                to[i] = new CodeList(dis);
            }
            return new Object[] {from, to};
        } catch (IOException iot) {
            throw new RuntimeException(iot);
        }
    }

    static {
        Object[] unfold;
        unfold = read("CaseUnfold_11");
        From = (int[]) unfold[0];
        To = (CodeList[]) unfold[1];
        unfold = read("CaseUnfold_11_Locale");
        Locale_From = (int[]) unfold[0];
        Locale_To = (CodeList[]) unfold[1];
    }

    /** Merges the regular and locale-specific entries into one hash. */
    static IntHash<CodeList> initializeUnfold1Hash() {
        IntHash<CodeList> hash = new IntHash<CodeList>(From.length + Locale_From.length);
        for (int i = 0; i < From.length; i++) {
            hash.putDirect(From[i], To[i]);
        }
        for (int i = 0; i < Locale_From.length; i++) {
            hash.putDirect(Locale_From[i], Locale_To[i]);
        }
        return hash;
    }

    // Must stay below the static block: it reads From/To, which that block assigns.
    static final IntHash<CodeList> Values = initializeUnfold1Hash();
}
/**
 * Reads an unfold table whose keys are fixed-length sequences of code points.
 * The stream starts with an entry count; each entry is {@code fromSize} ints
 * followed by one {@code CodeList}.
 *
 * @param fromSize number of code points in each key
 * @param table    name of the resource passed to {@code ArrayReader.openStream}
 * @return a two-element array: {@code int[][]} keys, then {@code CodeList[]} targets
 * @throws RuntimeException wrapping any {@link IOException} raised while reading
 */
private static Object[] readFoldN(int fromSize, String table) {
    // try-with-resources closes the stream on failure as well as on success
    try (DataInputStream dis = ArrayReader.openStream(table)) {
        int size = dis.readInt();
        int[][] from = new int[size][];
        CodeList[] to = new CodeList[size];
        for (int i = 0; i < size; i++) {
            from[i] = new int[fromSize];
            for (int j = 0; j < fromSize; j++) {
                from[i][j] = dis.readInt();
            }
            to[i] = new CodeList(dis);
        }
        return new Object[] {from, to};
    } catch (IOException iot) {
        throw new RuntimeException(iot);
    }
}
/**
 * Holder for the two-code-point unfold table. The "CaseUnfold_12" and
 * "CaseUnfold_12_Locale" resources are read when this class is initialized
 * and merged into a single hash keyed by the two-element code sequence.
 */
private static class CaseUnfold12 {
    private static final int[][] From;
    private static final CodeList[] To;
    private static final int[][] Locale_From;
    private static final CodeList[] Locale_To;

    static {
        Object[] unfold;
        unfold = readFoldN(2, "CaseUnfold_12");
        From = (int[][]) unfold[0];
        To = (CodeList[]) unfold[1];
        unfold = readFoldN(2, "CaseUnfold_12_Locale");
        Locale_From = (int[][]) unfold[0];
        Locale_To = (CodeList[]) unfold[1];
    }

    /** Merges the regular and locale-specific entries into one hash. */
    private static IntArrayHash<CodeList> initializeUnfold2Hash() {
        // Parameterized like the other fold tables so lookups return CodeList without a cast.
        IntArrayHash<CodeList> unfold2 = new IntArrayHash<CodeList>(From.length + Locale_From.length);
        for (int i = 0; i < From.length; i++) {
            unfold2.putDirect(From[i], To[i]);
        }
        for (int i = 0; i < Locale_From.length; i++) {
            unfold2.putDirect(Locale_From[i], Locale_To[i]);
        }
        return unfold2;
    }

    // Must stay below the static block: it reads From/To, which that block assigns.
    static final IntArrayHash<CodeList> Values = initializeUnfold2Hash();
}
/**
 * Holder for the three-code-point unfold table, read from the
 * "CaseUnfold_13" resource when this class is initialized and keyed by the
 * three-element code sequence.
 */
private static class CaseUnfold13 {
    private static final int[][] From;
    private static final CodeList[] To;

    static {
        Object[] unfold;
        unfold = readFoldN(3, "CaseUnfold_13");
        From = (int[][]) unfold[0];
        To = (CodeList[]) unfold[1];
    }

    /** Builds the hash from the loaded key/target arrays. */
    private static IntArrayHash<CodeList> initializeUnfold3Hash() {
        // Parameterized like the other fold tables so lookups return CodeList without a cast.
        IntArrayHash<CodeList> unfold3 = new IntArrayHash<CodeList>(From.length);
        for (int i = 0; i < From.length; i++) {
            unfold3.putDirect(From[i], To[i]);
        }
        return unfold3;
    }

    // Must stay below the static block: it reads From/To, which that block assigns.
    static final IntArrayHash<CodeList> Values = initializeUnfold3Hash();
}
/**
 * Returns the length field of a packed special-mapping entry: the bits at and
 * above {@code Config.SpecialsLengthOffset}, shifted down without sign extension.
 *
 * @param packed the packed entry
 * @return the length field
 */
private static int extractLength(int packed) {
    final int shift = Config.SpecialsLengthOffset;
    return packed >>> shift;
}
/**
 * Returns the code field of a packed special-mapping entry: the bits below
 * {@code Config.SpecialsLengthOffset}.
 *
 * @param packed the packed entry
 * @return the code field
 */
private static int extractCode(int packed) {
    final int codeMask = (1 << Config.SpecialsLengthOffset) - 1;
    return packed & codeMask;
}
/**
 * Holder for the special case-mapping table, loaded from the
 * "CaseMappingSpecials" resource when this class is initialized.
 */
private static class CaseMappingSpecials {
    static final int[] Values;

    static {
        Values = ArrayReader.readIntArray("CaseMappingSpecials");
    }
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy