// src.org.python.modules._codecs Maven / Gradle / Ivy
/*
* Copyright 2000 Finn Bock
*
* This program contains material copyrighted by:
* Copyright (c) Corporation for National Research Initiatives.
* Originally written by Marc-Andre Lemburg (mal@lemburg.com).
*/
package org.python.modules;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Iterator;
import org.python.core.Py;
import org.python.core.PyDictionary;
import org.python.core.PyInteger;
import org.python.core.PyNone;
import org.python.core.PyObject;
import org.python.core.PyString;
import org.python.core.PySystemState;
import org.python.core.PyTuple;
import org.python.core.PyUnicode;
import org.python.core.codecs;
import org.python.expose.ExposedType;
public class _codecs {
/**
 * Registers a codec search function by handing it to {@code codecs.register}.
 *
 * @param search_function the search function object, passed through unchanged
 */
public static void register(final PyObject search_function) {
    codecs.register(search_function);
}
/**
 * Looks up a codec by delegating to {@code codecs.lookup}.
 *
 * @param encoding the encoding name, passed through unchanged
 * @return the tuple returned by {@code codecs.lookup}
 */
public static PyTuple lookup(final String encoding) {
    final PyTuple codecEntry = codecs.lookup(encoding);
    return codecEntry;
}
/**
 * Fetches an error handler by delegating to {@code codecs.lookup_error}.
 *
 * @param handlerName the handler name, passed through unchanged
 * @return the object returned by {@code codecs.lookup_error}
 */
public static PyObject lookup_error(final String handlerName) {
    final PyObject handler = codecs.lookup_error(handlerName);
    return handler;
}
/**
 * Registers an error handler by delegating to {@code codecs.register_error}.
 *
 * @param name the name to register the handler under
 * @param errorHandler the handler object, passed through unchanged
 */
public static void register_error(final String name, final PyObject errorHandler) {
    codecs.register_error(name, errorHandler);
}
/**
 * Builds an encoder mapping from a decoding table by delegating to
 * {@code EncodingMap.buildEncodingMap}.
 *
 * @param map the unicode decoding table
 * @return whatever {@code EncodingMap.buildEncodingMap} produces for {@code map}
 */
public static PyObject charmap_build(final PyUnicode map) {
    final PyObject encoderMapping = EncodingMap.buildEncodingMap(map);
    return encoderMapping;
}
/**
 * Packs a decoding result into a two-element tuple: the text as a {@code PyUnicode}
 * followed by the count as an integer object.
 *
 * @param s the decoded text
 * @param len the number to report as the second tuple element
 * @return the {@code (unicode, int)} tuple
 */
private static PyTuple decode_tuple(String s, int len) {
    final PyObject text = new PyUnicode(s);
    final PyObject count = Py.newInteger(len);
    return new PyTuple(text, count);
}
/**
 * Packs a decoding result into a two-element tuple whose first element is a
 * {@code PyString} (not a {@code PyUnicode}) and whose second is the count.
 *
 * @param s the decoded text
 * @param len the number to report as the second tuple element
 * @return the {@code (str, int)} tuple
 */
private static PyTuple decode_tuple_str(String s, int len) {
    final PyObject text = new PyString(s);
    final PyObject count = Py.newInteger(len);
    return new PyTuple(text, count);
}
/**
 * Packs an encoding result into a two-element tuple: the encoded data as a
 * {@code PyString} followed by the count as an integer object.
 *
 * @param s the encoded data
 * @param len the number to report as the second tuple element
 * @return the {@code (str, int)} tuple
 */
private static PyTuple encode_tuple(String s, int len) {
    final PyObject data = new PyString(s);
    final PyObject count = Py.newInteger(len);
    return new PyTuple(data, count);
}
/* --- UTF-8 Codec --------------------------------------------------- */
/**
 * Decodes UTF-8 with no error-handler name and {@code final_ == false}.
 *
 * @param str the data to decode
 * @return a {@code (unicode, consumed)} tuple
 */
public static PyTuple utf_8_decode(String str) {
    return utf_8_decode(str, null);
}

/**
 * Decodes UTF-8 with {@code final_ == false}.
 *
 * @param str the data to decode
 * @param errors the error-handler name, may be {@code null}
 * @return a {@code (unicode, consumed)} tuple
 */
public static PyTuple utf_8_decode(String str, String errors) {
    return utf_8_decode(str, errors, false);
}

/**
 * Decodes UTF-8 through {@code codecs.PyUnicode_DecodeUTF8Stateful}.
 *
 * @param str the data to decode
 * @param errors the error-handler name, may be {@code null}
 * @param final_ when true no consumed-count array is passed to the decoder and the whole
 *            length of {@code str} is reported; otherwise the count written by the
 *            decoder is reported
 * @return a {@code (unicode, consumed)} tuple
 */
public static PyTuple utf_8_decode(String str, String errors, boolean final_) {
    if (final_) {
        final String text = codecs.PyUnicode_DecodeUTF8Stateful(str, errors, (int[]) null);
        return decode_tuple(text, str.length());
    }
    final int[] used = new int[1];
    final String text = codecs.PyUnicode_DecodeUTF8Stateful(str, errors, used);
    return decode_tuple(text, used[0]);
}
/**
 * Encodes to UTF-8 with no error-handler name.
 *
 * @param str the text to encode
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple utf_8_encode(String str) {
    return utf_8_encode(str, null);
}

/**
 * Encodes to UTF-8 through {@code codecs.PyUnicode_EncodeUTF8}.
 *
 * @param str the text to encode
 * @param errors the error-handler name, may be {@code null}
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple utf_8_encode(String str, String errors) {
    final int inputLength = str.length();
    final String encoded = codecs.PyUnicode_EncodeUTF8(str, errors);
    return encode_tuple(encoded, inputLength);
}
/* --- UTF-7 Codec --------------------------------------------------- */
/**
 * Decodes UTF-7 with no error-handler name.
 *
 * @param str the data to decode
 * @return a {@code (unicode, length of str)} tuple
 */
public static PyTuple utf_7_decode(String str) {
    return utf_7_decode(str, null);
}

/**
 * Decodes UTF-7 through {@code codecs.PyUnicode_DecodeUTF7}.
 *
 * @param str the data to decode
 * @param errors the error-handler name, may be {@code null}
 * @return a {@code (unicode, length of str)} tuple
 */
public static PyTuple utf_7_decode(String str, String errors) {
    final int inputLength = str.length();
    final String text = codecs.PyUnicode_DecodeUTF7(str, errors);
    return decode_tuple(text, inputLength);
}
/**
 * Encodes to UTF-7 with no error-handler name.
 *
 * @param str the text to encode
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple utf_7_encode(String str) {
    return utf_7_encode(str, null);
}

/**
 * Encodes to UTF-7 through {@code codecs.PyUnicode_EncodeUTF7}, passing {@code false}
 * for both of that method's boolean options.
 *
 * @param str the text to encode
 * @param errors the error-handler name, may be {@code null}
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple utf_7_encode(String str, String errors) {
    final int inputLength = str.length();
    final String encoded = codecs.PyUnicode_EncodeUTF7(str, false, false, errors);
    return encode_tuple(encoded, inputLength);
}
/**
 * Decodes escape sequences with no error-handler name.
 *
 * @param str the escaped data
 * @return a {@code (str, length of str)} tuple
 */
public static PyTuple escape_decode(String str) {
    return escape_decode(str, null);
}

/**
 * Decodes escape sequences over the whole of {@code str} via
 * {@code PyString.decode_UnicodeEscape} and returns the result as a byte string.
 *
 * @param str the escaped data
 * @param errors the error-handler name, may be {@code null}
 * @return a {@code (str, length of str)} tuple
 */
public static PyTuple escape_decode(String str, String errors) {
    final int end = str.length();
    final String unescaped = PyString.decode_UnicodeEscape(str, 0, end, errors, true);
    return decode_tuple_str(unescaped, str.length());
}
/**
 * Escapes {@code str} with no error-handler name.
 *
 * @param str the data to escape
 * @return an {@code (escaped, length of str)} tuple
 */
public static PyTuple escape_encode(String str) {
    return escape_encode(str, null);
}

/**
 * Escapes {@code str} via {@code PyString.encode_UnicodeEscape(str, false)}.
 *
 * @param str the data to escape
 * @param errors accepted for signature compatibility; not used by this method
 * @return an {@code (escaped, length of str)} tuple
 */
public static PyTuple escape_encode(String str, String errors) {
    final String escaped = PyString.encode_UnicodeEscape(str, false);
    return encode_tuple(escaped, str.length());
}
/* --- Character Mapping Codec --------------------------------------- */
/**
 * Decodes {@code str} through a character mapping, treating characters that have no
 * entry in {@code mapping} as errors.
 *
 * @param str the data to decode, one byte value per char
 * @param errors the error-handler name, may be {@code null}
 * @param mapping object whose {@code __finditem__} is queried with each byte value
 * @return a {@code (unicode, length of str)} tuple
 */
public static PyTuple charmap_decode(String str, String errors, PyObject mapping) {
    return charmap_decode(str, errors, mapping, false);
}

/**
 * Decodes {@code str} through a character mapping.
 *
 * For every char of {@code str} the mapping is queried with the char's integer value:
 * <ul>
 * <li>an integer result is appended as that code point (it must lie between 0 and
 * {@code PySystemState.maxunicode}, otherwise a TypeError is raised);</li>
 * <li>a string result is appended as is;</li>
 * <li>{@code None} is routed to the error handler;</li>
 * <li>a missing entry is either copied through ({@code ignoreUnmapped}) or routed to
 * the error handler;</li>
 * <li>any other result raises a TypeError.</li>
 * </ul>
 * Chars above 0xFF are not valid byte values and are routed to the error handler
 * without consulting the mapping.
 *
 * @param str the data to decode, one byte value per char
 * @param errors the error-handler name, may be {@code null}
 * @param mapping object whose {@code __finditem__} is queried with each byte value
 * @param ignoreUnmapped when true, chars without a mapping entry are appended unchanged
 * @return a {@code (unicode, length of str)} tuple
 */
public static PyTuple charmap_decode(String str, String errors, PyObject mapping,
        boolean ignoreUnmapped) {
    int size = str.length();
    StringBuilder v = new StringBuilder(size);
    for (int i = 0; i < size; i++) {
        char ch = str.charAt(i);
        if (ch > 0xFF) {
            // The "- 1" compensates for the loop's own i++ after the resume position
            // handed back by the error machinery.
            i = codecs.insertReplacementAndGetResume(v,
                    errors,
                    "charmap",
                    str,
                    i,
                    i + 1,
                    "ordinal not in range(256)") - 1;
            continue;
        }
        PyObject w = Py.newInteger(ch);
        PyObject x = mapping.__finditem__(w);
        if (x == null) {
            if (ignoreUnmapped) {
                v.append(ch);
            } else {
                i = codecs.insertReplacementAndGetResume(v,
                        errors,
                        "charmap",
                        str,
                        i,
                        i + 1,
                        "no mapping found") - 1;
            }
            continue;
        }
        /* Apply mapping */
        if (x instanceof PyInteger) {
            int value = ((PyInteger) x).getValue();
            if (value < 0 || value > PySystemState.maxunicode) {
                throw Py.TypeError("character mapping must return "
                        + "integer greater than 0 and less than sys.maxunicode");
            }
            // appendCodePoint rather than a (char) cast: values above 0xFFFF are accepted
            // by the range check above and must become a surrogate pair, not be truncated.
            v.appendCodePoint(value);
        } else if (x == Py.None) {
            i = codecs.insertReplacementAndGetResume(v,
                    errors,
                    "charmap",
                    str,
                    i,
                    i + 1,
                    "character maps to <undefined>") - 1;
        } else if (x instanceof PyString) {
            v.append(x.toString());
        } else {
            /* wrong return value */
            throw Py.TypeError("character mapping must return " + "integer, None or str");
        }
    }
    return decode_tuple(v.toString(), size);
}
/**
 * Translates {@code str} code point by code point through {@code mapping}; parallel to
 * CPython's PyUnicode_TranslateCharmap.
 *
 * For each code point the mapping is queried with its integer value: a missing entry
 * copies the code point unchanged, {@code None} drops it, an integer is appended as a
 * code point (it must lie between 0 and {@code PySystemState.maxunicode}), and a
 * unicode object is appended as is. Any other result raises a TypeError.
 *
 * @param str the text to translate
 * @param errors the error-handler name; not used by this method
 * @param mapping object whose {@code __finditem__} is queried with each code point
 * @return the translated text as a new {@code PyUnicode}
 */
public static PyObject translateCharmap(PyUnicode str, String errors, PyObject mapping) {
    StringBuilder buf = new StringBuilder(str.toString().length());
    // The iterator must be typed: with a raw Iterator, next() yields Object and cannot be
    // assigned to an int.
    for (Iterator<Integer> iter = str.newSubsequenceIterator(); iter.hasNext();) {
        int codePoint = iter.next();
        PyObject result = mapping.__finditem__(Py.newInteger(codePoint));
        if (result == null) {
            // No mapping found means: use 1:1 mapping
            buf.appendCodePoint(codePoint);
        } else if (result == Py.None) {
            // XXX: We don't support the fancier error handling CPython does here of
            // capturing regions of chars removed by the None mapping to optionally
            // pass to an error handler. Though we don't seem to even use this
            // functionality anywhere either
            continue;
        } else if (result instanceof PyInteger) {
            int value = result.asInt();
            if (value < 0 || value > PySystemState.maxunicode) {
                throw Py.TypeError(String.format("character mapping must be in range(0x%x)",
                        PySystemState.maxunicode + 1));
            }
            buf.appendCodePoint(value);
        } else if (result instanceof PyUnicode) {
            buf.append(result.toString());
        } else {
            // wrong return value
            throw Py.TypeError("character mapping must return integer, None or unicode");
        }
    }
    return new PyUnicode(buf.toString());
}
/**
 * Encodes {@code str} through a character mapping; a {@code null} mapping means Latin-1.
 *
 * @param str the text to encode
 * @param errors the error-handler name, may be {@code null}
 * @param mapping the encoder mapping, or {@code null} to fall back to
 *            {@link #latin_1_encode(String, String)}
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple charmap_encode(String str, String errors, PyObject mapping) {
    if (mapping != null) {
        final StringBuilder out = new StringBuilder(str.length());
        return charmap_encode_internal(str, errors, mapping, out, true);
    }
    // Default to Latin-1
    return latin_1_encode(str, errors);
}
/**
 * Encodes {@code str} through {@code mapping}, appending the output to {@code v}.
 *
 * When {@code mapping} is an {@code EncodingMap} its {@code lookup} method is used
 * directly; otherwise {@code mapping.__finditem__} is queried with each char's integer
 * value. An integer result must be in 0..255 and is appended as one char; a byte string
 * result (a {@code PyString} that is not a {@code PyUnicode}) is appended as is;
 * {@code None} goes to {@link #handleBadMapping}; anything else raises a TypeError.
 *
 * @param str the text to encode
 * @param errors the error-handler name, may be {@code null}
 * @param mapping the encoder mapping
 * @param v buffer that receives the encoded output
 * @param letLookupHandleError when true a missing entry is passed to
 *            {@link #handleBadMapping}; when false it raises a UnicodeEncodeError
 *            directly (used while encoding replacement text)
 * @return an {@code (contents of v, length of str)} tuple
 */
private static PyTuple charmap_encode_internal(String str,
        String errors,
        PyObject mapping,
        StringBuilder v,
        boolean letLookupHandleError) {
    EncodingMap encodingMap = mapping instanceof EncodingMap ? (EncodingMap) mapping : null;
    int size = str.length();
    for (int i = 0; i < size; i++) {
        char ch = str.charAt(i);
        PyObject x;
        if (encodingMap != null) {
            // EncodingMap.lookup reports "no entry" as -1
            int result = encodingMap.lookup(ch);
            if (result == -1) {
                x = null;
            } else {
                x = Py.newInteger(result);
            }
        } else {
            x = mapping.__finditem__(Py.newInteger(ch));
        }
        if (x == null) {
            if (letLookupHandleError) {
                i = handleBadMapping(str, errors, mapping, v, size, i);
            } else {
                throw Py.UnicodeEncodeError("charmap",
                        str,
                        i,
                        i + 1,
                        "character maps to <undefined>");
            }
        } else if (x instanceof PyInteger) {
            int value = ((PyInteger) x).getValue();
            if (value < 0 || value > 255) {
                throw Py.TypeError("character mapping must be in range(256)");
            }
            v.append((char) value);
        } else if (x instanceof PyString && !(x instanceof PyUnicode)) {
            v.append(x.toString());
        } else if (x instanceof PyNone) {
            i = handleBadMapping(str, errors, mapping, v, size, i);
        } else {
            /* wrong return value */
            throw Py.TypeError("character mapping must return " + "integer, None or str");
        }
    }
    return encode_tuple(v.toString(), size);
}
/**
 * Applies the error policy for a char of {@code str} that the mapping cannot encode.
 *
 * The built-in policies named by {@code codecs.IGNORE}, {@code codecs.REPLACE},
 * {@code codecs.XMLCHARREFREPLACE} and {@code codecs.BACKSLASHREPLACE} are handled
 * inline; their replacement text (if any) is itself encoded through {@code mapping}
 * with error delegation switched off. Any other policy, including {@code null}, is
 * passed to {@code codecs.encoding_error}, and the first item of its result is
 * encoded the same way.
 *
 * @param str the text being encoded
 * @param errors the error-handler name, may be {@code null}
 * @param mapping the encoder mapping
 * @param v buffer that receives the encoded replacement
 * @param size the length of {@code str}
 * @param i index of the offending char
 * @return the value the caller stores back into its loop index: {@code i} for the
 *         inline policies, otherwise the position from {@code codecs.calcNewPosition}
 *         minus one (the caller's loop increments it afterwards)
 */
private static int handleBadMapping(String str,
        String errors,
        PyObject mapping,
        StringBuilder v,
        int size,
        int i) {
    if (errors != null) {
        if (errors.equals(codecs.IGNORE)) {
            return i;
        } else if (errors.equals(codecs.REPLACE)) {
            charmap_encode_internal("?", errors, mapping, v, false);
            return i;
        } else if (errors.equals(codecs.XMLCHARREFREPLACE)) {
            charmap_encode_internal(codecs.xmlcharrefreplace(i, i + 1, str).toString(),
                    errors, mapping, v, false);
            return i;
        } else if (errors.equals(codecs.BACKSLASHREPLACE)) {
            charmap_encode_internal(codecs.backslashreplace(i, i + 1, str).toString(),
                    errors, mapping, v, false);
            return i;
        }
    }
    PyObject replacement = codecs.encoding_error(errors,
            "charmap",
            str,
            i,
            i + 1,
            "character maps to <undefined>");
    String replStr = replacement.__getitem__(0).toString();
    charmap_encode_internal(replStr, errors, mapping, v, false);
    return codecs.calcNewPosition(size, replacement) - 1;
}
/**
 * Decodes ASCII with no error-handler name.
 *
 * @param str the data to decode
 * @return a {@code (unicode, length of str)} tuple
 */
public static PyTuple ascii_decode(String str) {
    return ascii_decode(str, null);
}

/**
 * Decodes ASCII through {@code codecs.PyUnicode_DecodeASCII}.
 *
 * @param str the data to decode
 * @param errors the error-handler name, may be {@code null}
 * @return a {@code (unicode, length of str)} tuple
 */
public static PyTuple ascii_decode(String str, String errors) {
    final int length = str.length();
    final String text = codecs.PyUnicode_DecodeASCII(str, length, errors);
    return decode_tuple(text, length);
}
/**
 * Encodes to ASCII with no error-handler name.
 *
 * @param str the text to encode
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple ascii_encode(String str) {
    return ascii_encode(str, null);
}

/**
 * Encodes to ASCII through {@code codecs.PyUnicode_EncodeASCII}.
 *
 * @param str the text to encode
 * @param errors the error-handler name, may be {@code null}
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple ascii_encode(String str, String errors) {
    final int length = str.length();
    final String encoded = codecs.PyUnicode_EncodeASCII(str, length, errors);
    return encode_tuple(encoded, length);
}
/* --- Latin-1 Codec -------------------------------------------- */
/**
 * Decodes Latin-1 with no error-handler name.
 *
 * @param str the data to decode
 * @return a {@code (unicode, length of str)} tuple
 */
public static PyTuple latin_1_decode(String str) {
    return latin_1_decode(str, null);
}

/**
 * Decodes Latin-1 through {@code codecs.PyUnicode_DecodeLatin1}.
 *
 * @param str the data to decode
 * @param errors the error-handler name, may be {@code null}
 * @return a {@code (unicode, length of str)} tuple
 */
public static PyTuple latin_1_decode(String str, String errors) {
    final int length = str.length();
    final String text = codecs.PyUnicode_DecodeLatin1(str, length, errors);
    return decode_tuple(text, length);
}
/**
 * Encodes to Latin-1 with no error-handler name.
 *
 * @param str the text to encode
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple latin_1_encode(String str) {
    return latin_1_encode(str, null);
}

/**
 * Encodes to Latin-1 through {@code codecs.PyUnicode_EncodeLatin1}.
 *
 * @param str the text to encode
 * @param errors the error-handler name, may be {@code null}
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple latin_1_encode(String str, String errors) {
    final int length = str.length();
    final String encoded = codecs.PyUnicode_EncodeLatin1(str, length, errors);
    return encode_tuple(encoded, length);
}
/* --- UTF16 Codec -------------------------------------------- */
/**
 * Encodes to UTF-16 with byte order 0 and no error-handler name.
 *
 * @param str the text to encode
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple utf_16_encode(String str) {
    return utf_16_encode(str, null);
}

/**
 * Encodes to UTF-16 with byte order 0.
 *
 * @param str the text to encode
 * @param errors the error-handler name, forwarded to {@link #encode_UTF16}
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple utf_16_encode(String str, String errors) {
    final String encoded = encode_UTF16(str, errors, 0);
    return encode_tuple(encoded, str.length());
}

/**
 * Encodes to UTF-16 with an explicit byte order.
 *
 * @param str the text to encode
 * @param errors the error-handler name, forwarded to {@link #encode_UTF16}
 * @param byteorder byte-order selector, forwarded to {@link #encode_UTF16}
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple utf_16_encode(String str, String errors, int byteorder) {
    final String encoded = encode_UTF16(str, errors, byteorder);
    return encode_tuple(encoded, str.length());
}
/**
 * Encodes to little-endian UTF-16 with no error-handler name.
 *
 * @param str the text to encode
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple utf_16_le_encode(String str) {
    return utf_16_le_encode(str, null);
}

/**
 * Encodes to little-endian UTF-16 (byte order -1 of {@link #encode_UTF16}).
 *
 * @param str the text to encode
 * @param errors the error-handler name, forwarded to {@link #encode_UTF16}
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple utf_16_le_encode(String str, String errors) {
    final String encoded = encode_UTF16(str, errors, -1);
    return encode_tuple(encoded, str.length());
}
/**
 * Encodes to big-endian UTF-16 with no error-handler name.
 *
 * @param str the text to encode
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple utf_16_be_encode(String str) {
    return utf_16_be_encode(str, null);
}

/**
 * Encodes to big-endian UTF-16 (byte order 1 of {@link #encode_UTF16}).
 *
 * @param str the text to encode
 * @param errors the error-handler name, forwarded to {@link #encode_UTF16}
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple utf_16_be_encode(String str, String errors) {
    final String encoded = encode_UTF16(str, errors, 1);
    return encode_tuple(encoded, str.length());
}
/**
 * Encodes {@code str} as UTF-16 and returns the bytes as a string holding one char
 * (0..255) per byte.
 *
 * @param str the text to encode
 * @param errors accepted for signature compatibility; not used by this method
 * @param byteorder 0 selects the Java "UTF-16" charset, -1 selects "UTF-16LE", any
 *            other value selects "UTF-16BE"
 * @return the encoded bytes, one char per byte
 */
public static String encode_UTF16(String str, String errors, int byteorder) {
    final String charsetName;
    switch (byteorder) {
        case 0:
            charsetName = "UTF-16";
            break;
        case -1:
            charsetName = "UTF-16LE";
            break;
        default:
            charsetName = "UTF-16BE";
            break;
    }
    final ByteBuffer encoded = Charset.forName(charsetName).encode(str);
    final StringBuilder out = new StringBuilder(encoded.limit());
    while (encoded.hasRemaining()) {
        // Mask to 0..255 so negative Java bytes become their unsigned value.
        out.append((char) (encoded.get() & 0xFF));
    }
    return out.toString();
}
/**
 * Decodes UTF-16 with no error-handler name and {@code final_ == false}.
 *
 * @param str the data to decode, one byte value per char
 * @return a {@code (unicode, consumed)} tuple
 */
public static PyTuple utf_16_decode(String str) {
    return utf_16_decode(str, null);
}

/**
 * Decodes UTF-16 with {@code final_ == false}.
 *
 * @param str the data to decode, one byte value per char
 * @param errors the error-handler name, may be {@code null}
 * @return a {@code (unicode, consumed)} tuple
 */
public static PyTuple utf_16_decode(String str, String errors) {
    return utf_16_decode(str, errors, false);
}

/**
 * Decodes UTF-16 starting from byte order 0.
 *
 * @param str the data to decode, one byte value per char
 * @param errors the error-handler name, may be {@code null}
 * @param final_ when true the whole length of {@code str} is reported as consumed;
 *            otherwise the count written by the decoder is reported
 * @return a {@code (unicode, consumed)} tuple
 */
public static PyTuple utf_16_decode(String str, String errors, boolean final_) {
    final int[] order = { 0 };
    if (final_) {
        final String text = decode_UTF16(str, errors, order, null);
        return decode_tuple(text, str.length());
    }
    final int[] used = new int[1];
    final String text = decode_UTF16(str, errors, order, used);
    return decode_tuple(text, used[0]);
}
/**
 * Decodes little-endian UTF-16 with no error-handler name and {@code final_ == false}.
 *
 * @param str the data to decode, one byte value per char
 * @return a {@code (unicode, consumed)} tuple
 */
public static PyTuple utf_16_le_decode(String str) {
    return utf_16_le_decode(str, null);
}

/**
 * Decodes little-endian UTF-16 with {@code final_ == false}.
 *
 * @param str the data to decode, one byte value per char
 * @param errors the error-handler name, may be {@code null}
 * @return a {@code (unicode, consumed)} tuple
 */
public static PyTuple utf_16_le_decode(String str, String errors) {
    return utf_16_le_decode(str, errors, false);
}

/**
 * Decodes UTF-16 starting from byte order -1 (little-endian).
 *
 * @param str the data to decode, one byte value per char
 * @param errors the error-handler name, may be {@code null}
 * @param final_ when true the whole length of {@code str} is reported as consumed;
 *            otherwise the count written by the decoder is reported
 * @return a {@code (unicode, consumed)} tuple
 */
public static PyTuple utf_16_le_decode(String str, String errors, boolean final_) {
    final int[] order = { -1 };
    if (final_) {
        final String text = decode_UTF16(str, errors, order, null);
        return decode_tuple(text, str.length());
    }
    final int[] used = new int[1];
    final String text = decode_UTF16(str, errors, order, used);
    return decode_tuple(text, used[0]);
}
/**
 * Decodes big-endian UTF-16 with no error-handler name and {@code final_ == false}.
 *
 * @param str the data to decode, one byte value per char
 * @return a {@code (unicode, consumed)} tuple
 */
public static PyTuple utf_16_be_decode(String str) {
    return utf_16_be_decode(str, null);
}

/**
 * Decodes big-endian UTF-16 with {@code final_ == false}.
 *
 * @param str the data to decode, one byte value per char
 * @param errors the error-handler name, may be {@code null}
 * @return a {@code (unicode, consumed)} tuple
 */
public static PyTuple utf_16_be_decode(String str, String errors) {
    return utf_16_be_decode(str, errors, false);
}

/**
 * Decodes UTF-16 starting from byte order 1 (big-endian).
 *
 * @param str the data to decode, one byte value per char
 * @param errors the error-handler name, may be {@code null}
 * @param final_ when true the whole length of {@code str} is reported as consumed;
 *            otherwise the count written by the decoder is reported
 * @return a {@code (unicode, consumed)} tuple
 */
public static PyTuple utf_16_be_decode(String str, String errors, boolean final_) {
    final int[] order = { 1 };
    if (final_) {
        final String text = decode_UTF16(str, errors, order, null);
        return decode_tuple(text, str.length());
    }
    final int[] used = new int[1];
    final String text = decode_UTF16(str, errors, order, used);
    return decode_tuple(text, used[0]);
}
/**
 * Decodes UTF-16 with byte order 0, no error-handler name and {@code final_ == false}.
 *
 * @param str the data to decode, one byte value per char
 * @return a {@code (unicode, consumed, byteorder)} tuple
 */
public static PyTuple utf_16_ex_decode(String str) {
    return utf_16_ex_decode(str, null);
}

/**
 * Decodes UTF-16 with byte order 0 and {@code final_ == false}.
 *
 * @param str the data to decode, one byte value per char
 * @param errors the error-handler name, may be {@code null}
 * @return a {@code (unicode, consumed, byteorder)} tuple
 */
public static PyTuple utf_16_ex_decode(String str, String errors) {
    return utf_16_ex_decode(str, errors, 0);
}

/**
 * Decodes UTF-16 with {@code final_ == false}.
 *
 * @param str the data to decode, one byte value per char
 * @param errors the error-handler name, may be {@code null}
 * @param byteorder the byte order to start decoding with
 * @return a {@code (unicode, consumed, byteorder)} tuple
 */
public static PyTuple utf_16_ex_decode(String str, String errors, int byteorder) {
    return utf_16_ex_decode(str, errors, byteorder, false);
}

/**
 * Decodes UTF-16 and additionally reports the byte order in effect when decoding ended.
 *
 * @param str the data to decode, one byte value per char
 * @param errors the error-handler name, may be {@code null}
 * @param byteorder the byte order to start decoding with; the decoder treats -1 as
 *            little-endian and any other value as big-endian, and byte-order marks in
 *            the data override it
 * @param final_ when true the whole length of {@code str} is reported as consumed;
 *            otherwise the count written by the decoder is reported
 * @return a {@code (unicode, consumed, byteorder)} tuple
 */
public static PyTuple utf_16_ex_decode(String str, String errors, int byteorder,
        boolean final_) {
    // Seed the decoder with the caller's byte order instead of discarding it.
    int[] bo = new int[] { byteorder };
    int[] consumed = final_ ? null : new int[1];
    String decoded = decode_UTF16(str, errors, bo, consumed);
    // Decoded text is unicode, as for every other decoder in this class; a byte string
    // cannot hold the decoded characters.
    return new PyTuple(new PyUnicode(decoded),
            Py.newInteger(final_ ? str.length() : consumed[0]),
            Py.newInteger(bo[0]));
}
/**
 * Decodes UTF-16 without tracking how much input was consumed.
 *
 * @param str the data to decode, one byte value per char
 * @param errors the error-handler name, may be {@code null}
 * @param byteorder one-element in/out array holding the byte order, or {@code null}
 * @return the decoded text
 */
private static String decode_UTF16(String str,
        String errors,
        int[] byteorder) {
    return decode_UTF16(str, errors, byteorder, null);
}

/**
 * Decodes UTF-16 data held one byte per char in {@code str}.
 *
 * Bytes are taken in pairs. The pairs FE FF and FF FE are byte-order marks: they switch
 * the working byte order to 1 or -1 respectively and produce no output. A working byte
 * order of -1 combines each pair little-endian; any other value combines it big-endian.
 * High surrogates are combined with the following low surrogate into one code point.
 *
 * @param str the data to decode, one byte value per char
 * @param errors the error-handler name passed to
 *            {@code codecs.insertReplacementAndGetResume}, may be {@code null}
 * @param byteorder one-element in/out array: read as the initial byte order and
 *            overwritten with the byte order in effect at the end; may be {@code null},
 *            in which case decoding starts from 0
 * @param consumed when non-null, decoding stops quietly in front of an incomplete
 *            trailing unit and element 0 receives the index where decoding stopped;
 *            when {@code null}, incomplete trailing data is reported through the
 *            error handler
 * @return the decoded text
 */
private static String decode_UTF16(String str,
        String errors,
        int[] byteorder,
        int[] consumed) {
    int bo = 0;
    if (byteorder != null) {
        bo = byteorder[0];
    }
    int size = str.length();
    StringBuilder v = new StringBuilder(size / 2);
    int i;
    for (i = 0; i < size; i += 2) {
        char ch1 = str.charAt(i);
        if (i + 1 == size) {
            // A single byte is left over: not a complete code unit.
            if (consumed != null) {
                break;
            }
            i = codecs.insertReplacementAndGetResume(v,
                    errors,
                    "utf-16",
                    str,
                    i,
                    i + 1,
                    "truncated data");
            continue;
        }
        char ch2 = str.charAt(i + 1);
        if (ch1 == 0xFE && ch2 == 0xFF) {
            bo = 1;
            continue;
        } else if (ch1 == 0xFF && ch2 == 0xFE) {
            bo = -1;
            continue;
        }
        int W1;
        if (bo == -1) {
            W1 = (ch2 << 8 | ch1);
        } else {
            W1 = (ch1 << 8 | ch2);
        }
        if (W1 < 0xD800 || W1 > 0xDFFF) {
            v.appendCodePoint(W1);
            continue;
        } else if (W1 >= 0xD800 && W1 <= 0xDBFF && i < size - 1) {
            if (i + 3 >= size) {
                // The low surrogate (bytes i+2 and i+3) is missing or incomplete.
                // Reading it anyway would run past the end of str, so treat it like
                // the single-trailing-byte case above.
                if (consumed != null) {
                    break;
                }
                i = codecs.insertReplacementAndGetResume(v,
                        errors,
                        "utf-16",
                        str,
                        i,
                        size,
                        "unexpected end of data");
                continue;
            }
            i += 2;
            char ch3 = str.charAt(i);
            char ch4 = str.charAt(i + 1);
            int W2;
            if (bo == -1) {
                W2 = (ch4 << 8 | ch3);
            } else {
                W2 = (ch3 << 8 | ch4);
            }
            if (W2 >= 0xDC00 && W2 <= 0xDFFF) {
                int U = (((W1 & 0x3FF) << 10) | (W2 & 0x3FF)) + 0x10000;
                v.appendCodePoint(U);
                continue;
            }
            i = codecs.insertReplacementAndGetResume(v,
                    errors,
                    "utf-16",
                    str,
                    i,
                    i + 1,
                    "illegal UTF-16 surrogate");
            continue;
        }
        // Only a low surrogate with no preceding high surrogate reaches this point.
        i = codecs.insertReplacementAndGetResume(v,
                errors,
                "utf-16",
                str,
                i,
                i + 1,
                "illegal encoding");
    }
    if (byteorder != null) {
        byteorder[0] = bo;
    }
    if (consumed != null) {
        consumed[0] = i;
    }
    return v.toString();
}
/* --- RawUnicodeEscape Codec ----------------------------------------- */
/**
 * Raw-unicode-escape encodes {@code str} with no error-handler name.
 *
 * @param str the text to encode
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple raw_unicode_escape_encode(String str) {
    return raw_unicode_escape_encode(str, null);
}

/**
 * Raw-unicode-escape encodes {@code str} through
 * {@code codecs.PyUnicode_EncodeRawUnicodeEscape}, passing {@code false} as that
 * method's boolean option.
 *
 * @param str the text to encode
 * @param errors the error-handler name, may be {@code null}
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple raw_unicode_escape_encode(String str, String errors) {
    final String encoded = codecs.PyUnicode_EncodeRawUnicodeEscape(str, errors, false);
    return encode_tuple(encoded, str.length());
}
/**
 * Raw-unicode-escape decodes {@code str} with no error-handler name.
 *
 * @param str the data to decode
 * @return a {@code (unicode, length of str)} tuple
 */
public static PyTuple raw_unicode_escape_decode(String str) {
    return raw_unicode_escape_decode(str, null);
}

/**
 * Raw-unicode-escape decodes {@code str} through
 * {@code codecs.PyUnicode_DecodeRawUnicodeEscape}.
 *
 * @param str the data to decode
 * @param errors the error-handler name, may be {@code null}
 * @return a {@code (unicode, length of str)} tuple
 */
public static PyTuple raw_unicode_escape_decode(String str, String errors) {
    final String text = codecs.PyUnicode_DecodeRawUnicodeEscape(str, errors);
    return decode_tuple(text, str.length());
}
/* --- UnicodeEscape Codec -------------------------------------------- */
/**
 * Unicode-escape encodes {@code str} with no error-handler name.
 *
 * @param str the text to encode
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple unicode_escape_encode(String str) {
    return unicode_escape_encode(str, null);
}

/**
 * Unicode-escape encodes {@code str} via {@code PyString.encode_UnicodeEscape(str, false)}.
 *
 * @param str the text to encode
 * @param errors accepted for signature compatibility; not used by this method
 * @return an {@code (encoded, length of str)} tuple
 */
public static PyTuple unicode_escape_encode(String str, String errors) {
    final String escaped = PyString.encode_UnicodeEscape(str, false);
    return encode_tuple(escaped, str.length());
}
/**
 * Unicode-escape decodes {@code str} with no error-handler name.
 *
 * @param str the data to decode
 * @return a {@code (unicode, length of str)} tuple
 */
public static PyTuple unicode_escape_decode(String str) {
    return unicode_escape_decode(str, null);
}

/**
 * Unicode-escape decodes the whole of {@code str} via
 * {@code PyString.decode_UnicodeEscape}.
 *
 * @param str the data to decode
 * @param errors the error-handler name, may be {@code null}
 * @return a {@code (unicode, length of str)} tuple
 */
public static PyTuple unicode_escape_decode(String str, String errors) {
    final int length = str.length();
    final String text = PyString.decode_UnicodeEscape(str, 0, length, errors, true);
    return decode_tuple(text, length);
}
/* --- UnicodeInternal Codec ------------------------------------------ */
/**
 * Unicode-internal encodes {@code str} with no error-handler name.
 *
 * @param str the text to encode
 * @return a {@code (str, length of str)} tuple
 */
public static PyTuple unicode_internal_encode(String str) {
    return unicode_internal_encode(str, null);
}

/**
 * Returns {@code str} unchanged, wrapped as an encoding result.
 *
 * @param str the text to encode
 * @param errors accepted for signature compatibility; not used by this method
 * @return a {@code (str, length of str)} tuple
 */
public static PyTuple unicode_internal_encode(String str, String errors) {
    final int length = str.length();
    return encode_tuple(str, length);
}
/**
 * Unicode-internal decodes {@code str} with no error-handler name.
 *
 * @param str the data to decode
 * @return a {@code (unicode, length of str)} tuple
 */
public static PyTuple unicode_internal_decode(String str) {
    return unicode_internal_decode(str, null);
}

/**
 * Returns {@code str} unchanged, wrapped as a decoding result.
 *
 * @param str the data to decode
 * @param errors accepted for signature compatibility; not used by this method
 * @return a {@code (unicode, length of str)} tuple
 */
public static PyTuple unicode_internal_decode(String str, String errors) {
    final int length = str.length();
    return decode_tuple(str, length);
}
/**
 * Optimized charmap encoder mapping.
 *
 * Stores the inverse of a 256-entry decoding table as a three-level trie rather than a
 * dictionary, so that looking up a char does not need to create integer objects.
 * A char is split into its top 5 bits (level 1), the next 4 bits (level 2) and the low
 * 7 bits (level 3).
 */
@ExposedType(name = "EncodingMap", isBaseType = false)
public static class EncodingMap extends PyObject {

    /** Level-1 table indexed by the top 5 bits of a char; 0xFF marks an unused slot. */
    char[] level1;

    /**
     * Level-2 blocks (16 entries each, 0xFF = unused) followed by level-3 blocks
     * (128 entries each, 0 = unmapped).
     */
    char[] level23;

    /** Number of level-2 blocks; the level-3 area starts at {@code 16 * count2}. */
    int count2;

    /** Number of level-3 blocks. */
    int count3;

    private EncodingMap(char[] level1, char[] level23, int count2, int count3) {
        this.level1 = level1;
        this.level23 = level23;
        this.count2 = count2;
        this.count3 = count3;
    }

    /**
     * Create and populate an EncodingMap from a 256 length PyUnicode char. Returns a
     * PyDictionary if the mapping isn't easily optimized.
     *
     * A dictionary is used when entry 0 is not char 0, when any later entry is char 0,
     * or when more than 0xFF level-2 or level-3 blocks would be needed. Entries equal
     * to 0xFFFE are treated as unmapped and left out of the trie.
     *
     * @param string a 256 length unicode mapping
     * @return an encoder mapping
     */
    public static PyObject buildEncodingMap(PyObject string) {
        if (!(string instanceof PyUnicode) || string.__len__() != 256) {
            throw Py.TypeError("bad argument type for built-in operation");
        }

        final String table = string.toString();
        final char[] firstLevel = new char[32];
        final char[] secondSeen = new char[512];
        for (int k = 0; k < firstLevel.length; k++) {
            firstLevel[k] = 0xFF;
        }
        for (int k = 0; k < secondSeen.length; k++) {
            secondSeen[k] = 0xFF;
        }

        // Pass 1: count how many level-2 and level-3 blocks the table needs.
        boolean dictRequired = table.charAt(0) != 0;
        int secondCount = 0;
        int thirdCount = 0;
        for (int b = 1; b < 256; b++) {
            final char target = table.charAt(b);
            if (target == 0) {
                dictRequired = true;
            }
            if (target == 0xFFFE) {
                // unmapped character
                continue;
            }
            final int top = target >> 11;
            final int upper = target >> 7;
            if (firstLevel[top] == 0xFF) {
                firstLevel[top] = (char) secondCount++;
            }
            if (secondSeen[upper] == 0xFF) {
                secondSeen[upper] = (char) thirdCount++;
            }
        }
        if (secondCount > 0xFF || thirdCount > 0xFF) {
            dictRequired = true;
        }

        if (dictRequired) {
            final PyObject dict = new PyDictionary();
            for (int b = 0; b < 256; b++) {
                dict.__setitem__(Py.newInteger(table.charAt(b)), Py.newInteger(b));
            }
            return dict;
        }

        // Pass 2: create the three-level trie. Level-2 slots start out as 0xFF; the
        // level-3 area relies on Java zero-initialising the freshly allocated array.
        final int secondSize = 16 * secondCount;
        final int thirdSize = 128 * thirdCount;
        final char[] packed = new char[secondSize + thirdSize];
        for (int k = 0; k < secondSize; k++) {
            packed[k] = 0xFF;
        }
        int nextThird = 0;
        for (int b = 1; b < 256; b++) {
            final char target = table.charAt(b);
            if (target == 0xFFFE) {
                // unmapped character
                continue;
            }
            final int slot2 = 16 * firstLevel[target >> 11] + ((target >> 7) & 0xF);
            if (packed[slot2] == 0xFF) {
                packed[slot2] = (char) nextThird++;
            }
            final int slot3 = 128 * packed[slot2] + (target & 0x7F);
            packed[secondSize + slot3] = (char) b;
        }
        return new EncodingMap(firstLevel, packed, secondCount, thirdCount);
    }

    /**
     * Lookup a char in the EncodingMap.
     *
     * @param c a char
     * @return an int, -1 for failure
     */
    public int lookup(char c) {
        if (c == 0) {
            return 0;
        }
        // level 1
        int node = level1[c >> 11];
        if (node == 0xFF) {
            return -1;
        }
        // level 2
        node = level23[16 * node + ((c >> 7) & 0xF)];
        if (node == 0xFF) {
            return -1;
        }
        // level 3
        node = level23[16 * count2 + 128 * node + (c & 0x7F)];
        return node == 0 ? -1 : node;
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy