/*
 * src.org.python.modules._codecs Maven / Gradle / Ivy
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of jython Show documentation
 *
 * Jython is an implementation of the high-level, dynamic, object-oriented
 * language Python written in 100% Pure Java, and seamlessly integrated with
 * the Java platform. It thus allows you to run Python on any Java platform.
 */
/*
* Copyright 2000 Finn Bock
*
* This program contains material copyrighted by:
* Copyright (c) Corporation for National Research Initiatives.
* Originally written by Marc-Andre Lemburg (mal@lemburg.com).
*/
package org.python.modules;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Iterator;
import org.python.core.Py;
import org.python.core.PyDictionary;
import org.python.core.PyInteger;
import org.python.core.PyNone;
import org.python.core.PyObject;
import org.python.core.PyString;
import org.python.core.PySystemState;
import org.python.core.PyTuple;
import org.python.core.PyUnicode;
import org.python.core.codecs;
import org.python.expose.ExposedType;
public class _codecs {
/**
 * Registers a codec search function by forwarding it to {@code codecs.register}.
 *
 * @param search_function the search function object to register
 */
public static void register(PyObject search_function) {
    codecs.register(search_function);
}
/**
 * Looks up the codec registered for an encoding name by delegating to
 * {@code codecs.lookup}.
 *
 * @param encoding the encoding name to look up
 * @return the tuple returned by {@code codecs.lookup}
 */
public static PyTuple lookup(String encoding) {
    return codecs.lookup(encoding);
}
/**
 * Returns the error handler registered under the given name by delegating to
 * {@code codecs.lookup_error}.
 *
 * @param handlerName the name of the error handler
 * @return the object returned by {@code codecs.lookup_error}
 */
public static PyObject lookup_error(String handlerName) {
    return codecs.lookup_error(handlerName);
}
/**
 * Registers an error handler under a name by forwarding both arguments to
 * {@code codecs.register_error}.
 *
 * @param name the name to register the handler under
 * @param errorHandler the handler object
 */
public static void register_error(String name, PyObject errorHandler) {
    codecs.register_error(name, errorHandler);
}
/**
 * Builds an encoder mapping from a decoding table by delegating to
 * {@link EncodingMap#buildEncodingMap(PyObject)}.
 *
 * @param map the unicode decoding table
 * @return the mapping object produced by {@code EncodingMap.buildEncodingMap}
 */
public static PyObject charmap_build(PyUnicode map) {
    return EncodingMap.buildEncodingMap(map);
}
/**
 * Packs a decoding result into a two-element tuple: the text wrapped in a
 * {@code PyUnicode}, followed by {@code len} as a Python integer.
 *
 * @param s the decoded text
 * @param len the count to report alongside the text
 * @return the {@code (unicode, int)} tuple
 */
private static PyTuple decode_tuple(String s, int len) {
    PyUnicode text = new PyUnicode(s);
    return new PyTuple(text, Py.newInteger(len));
}
/**
 * Packs a decoding result into a two-element tuple whose first element is a
 * {@code PyString} (rather than a {@code PyUnicode}), followed by {@code len}
 * as a Python integer.
 *
 * @param s the decoded string
 * @param len the count to report alongside the string
 * @return the {@code (str, int)} tuple
 */
private static PyTuple decode_tuple_str(String s, int len) {
    PyString text = new PyString(s);
    return new PyTuple(text, Py.newInteger(len));
}
/**
 * Packs an encoding result into a two-element tuple: the encoded data wrapped
 * in a {@code PyString}, followed by {@code len} as a Python integer.
 *
 * @param s the encoded data
 * @param len the count to report alongside the data
 * @return the {@code (str, int)} tuple
 */
private static PyTuple encode_tuple(String s, int len) {
    PyString data = new PyString(s);
    return new PyTuple(data, Py.newInteger(len));
}
/* --- UTF-8 Codec --------------------------------------------------- */
/**
 * Decodes UTF-8 data with no explicit error policy ({@code errors} is null).
 *
 * @param str the data to decode
 * @return a tuple of the decoded unicode object and a count
 */
public static PyTuple utf_8_decode(String str) {
    return utf_8_decode(str, null);
}

/**
 * Decodes UTF-8 data in non-final mode ({@code final_} is false).
 *
 * @param str the data to decode
 * @param errors the error handling name, may be null
 * @return a tuple of the decoded unicode object and a count
 */
public static PyTuple utf_8_decode(String str, String errors) {
    return utf_8_decode(str, errors, false);
}

/**
 * Decodes UTF-8 data via {@code codecs.PyUnicode_DecodeUTF8Stateful}.
 *
 * @param str the data to decode
 * @param errors the error handling name, may be null
 * @param final_ when true the whole input length is reported as the count; when
 *        false the count is whatever the decoder stored in its one-element
 *        {@code consumed} array
 * @return a tuple of the decoded unicode object and the count
 */
public static PyTuple utf_8_decode(String str, String errors, boolean final_) {
    // Only the non-final case hands the decoder a slot to report its progress in.
    int[] consumed = null;
    if (!final_) {
        consumed = new int[1];
    }
    String decoded = codecs.PyUnicode_DecodeUTF8Stateful(str, errors, consumed);
    int used;
    if (final_) {
        used = str.length();
    } else {
        used = consumed[0];
    }
    return decode_tuple(decoded, used);
}
/**
 * Encodes to UTF-8 with no explicit error policy ({@code errors} is null).
 *
 * @param str the text to encode
 * @return a tuple of the encoded string and the input length
 */
public static PyTuple utf_8_encode(String str) {
    return utf_8_encode(str, null);
}

/**
 * Encodes to UTF-8 via {@code codecs.PyUnicode_EncodeUTF8}.
 *
 * @param str the text to encode
 * @param errors the error handling name, may be null
 * @return a tuple of the encoded string and {@code str.length()}
 */
public static PyTuple utf_8_encode(String str, String errors) {
    int inputLength = str.length();
    String encoded = codecs.PyUnicode_EncodeUTF8(str, errors);
    return encode_tuple(encoded, inputLength);
}
/* --- UTF-7 Codec --------------------------------------------------- */
/**
 * Decodes UTF-7 data with no explicit error policy ({@code errors} is null).
 *
 * @param str the data to decode
 * @return a tuple of the decoded unicode object and the input length
 */
public static PyTuple utf_7_decode(String str) {
    return utf_7_decode(str, null);
}

/**
 * Decodes UTF-7 data via {@code codecs.PyUnicode_DecodeUTF7}.
 *
 * @param str the data to decode
 * @param errors the error handling name, may be null
 * @return a tuple of the decoded unicode object and {@code str.length()}
 */
public static PyTuple utf_7_decode(String str, String errors) {
    int inputLength = str.length();
    String decoded = codecs.PyUnicode_DecodeUTF7(str, errors);
    return decode_tuple(decoded, inputLength);
}
/**
 * Encodes to UTF-7 with no explicit error policy ({@code errors} is null).
 *
 * @param str the text to encode
 * @return a tuple of the encoded string and the input length
 */
public static PyTuple utf_7_encode(String str) {
    return utf_7_encode(str, null);
}

/**
 * Encodes to UTF-7 via {@code codecs.PyUnicode_EncodeUTF7}.
 *
 * @param str the text to encode
 * @param errors the error handling name, may be null
 * @return a tuple of the encoded string and {@code str.length()}
 */
public static PyTuple utf_7_encode(String str, String errors) {
    int inputLength = str.length();
    // NOTE(review): both boolean flags are passed as false; their meaning is defined
    // by codecs.PyUnicode_EncodeUTF7 and is not visible here - confirm against it.
    String encoded = codecs.PyUnicode_EncodeUTF7(str, false, false, errors);
    return encode_tuple(encoded, inputLength);
}
/**
 * Decodes backslash escapes with no explicit error policy ({@code errors} is null).
 *
 * @param str the escaped input
 * @return a tuple of the unescaped {@code str} object and the input length
 */
public static PyTuple escape_decode(String str) {
    return escape_decode(str, null);
}

/**
 * Decodes backslash escapes over the whole of {@code str} using
 * {@code PyString.decode_UnicodeEscape}; the result is returned as a
 * {@code PyString}, not a {@code PyUnicode}.
 *
 * @param str the escaped input
 * @param errors the error handling name, may be null
 * @return a tuple of the unescaped {@code str} object and {@code str.length()}
 */
public static PyTuple escape_decode(String str, String errors) {
    int inputLength = str.length();
    String unescaped = PyString.decode_UnicodeEscape(str, 0, inputLength, errors, true);
    return decode_tuple_str(unescaped, inputLength);
}
/**
 * Escapes a string with no explicit error policy ({@code errors} is null).
 *
 * @param str the input to escape
 * @return a tuple of the escaped string and the input length
 */
public static PyTuple escape_encode(String str) {
    return escape_encode(str, null);
}

/**
 * Escapes a string using {@code PyString.encode_UnicodeEscape(str, false)}.
 *
 * @param str the input to escape
 * @param errors accepted for signature compatibility; not used by this method
 * @return a tuple of the escaped string and {@code str.length()}
 */
public static PyTuple escape_encode(String str, String errors) {
    String escaped = PyString.encode_UnicodeEscape(str, false);
    return encode_tuple(escaped, str.length());
}
/* --- Character Mapping Codec --------------------------------------- */
/**
 * Decodes byte data through a character mapping, reporting unmapped bytes as errors.
 *
 * @param str the byte data to decode (one byte per char)
 * @param errors the error handling name, may be null
 * @param mapping the decoding table, looked up by integer byte value
 * @return a tuple of the decoded unicode object and the input length
 */
public static PyTuple charmap_decode(String str,
                                     String errors,
                                     PyObject mapping) {
    return charmap_decode(str, errors, mapping, false);
}

/**
 * Decodes byte data through a character mapping.
 *
 * Each input char is looked up in {@code mapping} by its integer value. The
 * mapped value may be an integer code point, a string (appended as is) or
 * {@code None} (treated as undefined and routed to the error handler).
 *
 * @param str the byte data to decode (one byte per char)
 * @param errors the error handling name, may be null
 * @param mapping the decoding table; when null or {@code None} the data is
 *        decoded as Latin-1, mirroring the default used by {@code charmap_encode}
 * @param ignoreUnmapped when true, bytes missing from the mapping are copied
 *        through unchanged instead of being reported as errors
 * @return a tuple of the decoded unicode object and the input length
 */
public static PyTuple charmap_decode(String str,
                                     String errors,
                                     PyObject mapping,
                                     boolean ignoreUnmapped) {
    // Default to Latin-1 instead of failing on mapping.__finditem__.
    if (mapping == null || mapping == Py.None) {
        return latin_1_decode(str, errors);
    }
    int size = str.length();
    StringBuilder v = new StringBuilder(size);
    for (int i = 0; i < size; i++) {
        char ch = str.charAt(i);
        if (ch > 0xFF) {
            // The handler returns the index to resume at; "- 1" offsets the loop's i++.
            i = codecs.insertReplacementAndGetResume(v,
                                                     errors,
                                                     "charmap",
                                                     str,
                                                     i,
                                                     i + 1,
                                                     "ordinal not in range(256)") - 1;
            continue;
        }
        PyObject w = Py.newInteger(ch);
        PyObject x = mapping.__finditem__(w);
        if (x == null) {
            if (ignoreUnmapped) {
                v.append(ch);
            } else {
                i = codecs.insertReplacementAndGetResume(v,
                                                         errors,
                                                         "charmap",
                                                         str,
                                                         i,
                                                         i + 1,
                                                         "no mapping found") - 1;
            }
            continue;
        }
        /* Apply mapping */
        if (x instanceof PyInteger) {
            int value = ((PyInteger) x).getValue();
            if (value < 0 || value > PySystemState.maxunicode) {
                throw Py.TypeError("character mapping must return "
                        + "integer greater than 0 and less than sys.maxunicode");
            }
            // appendCodePoint keeps code points above U+FFFF intact; a (char) cast
            // would silently truncate them to 16 bits.
            v.appendCodePoint(value);
        } else if (x == Py.None) {
            i = codecs.insertReplacementAndGetResume(v,
                                                     errors,
                                                     "charmap",
                                                     str,
                                                     i,
                                                     i + 1,
                                                     "character maps to <undefined>") - 1;
        } else if (x instanceof PyString) {
            v.append(x.toString());
        } else {
            /* wrong return value */
            throw Py.TypeError("character mapping must return " + "integer, None or str");
        }
    }
    return decode_tuple(v.toString(), size);
}
// parallel to CPython's PyUnicode_TranslateCharmap
/**
 * Translates a unicode object code point by code point through a mapping.
 *
 * For each code point the mapping is consulted by integer key: a missing entry
 * copies the code point through, {@code None} drops it, an integer replaces it
 * with that code point and a unicode object is appended as is.
 *
 * @param str the unicode object to translate
 * @param errors the error handling name; not consulted by this implementation
 * @param mapping the translation table
 * @return a new unicode object holding the translated text
 */
public static PyObject translateCharmap(PyUnicode str, String errors, PyObject mapping) {
    StringBuilder buf = new StringBuilder(str.toString().length());
    // The iterator must be typed: with a raw Iterator, next() yields Object and
    // cannot be assigned to an int.
    for (Iterator<Integer> iter = str.newSubsequenceIterator(); iter.hasNext();) {
        int codePoint = iter.next();
        PyObject result = mapping.__finditem__(Py.newInteger(codePoint));
        if (result == null) {
            // No mapping found means: use 1:1 mapping
            buf.appendCodePoint(codePoint);
        } else if (result == Py.None) {
            // Mapped to None: the code point is deleted from the output.
            // XXX: We don't support the fancier error handling CPython does here of
            // capturing regions of chars removed by the None mapping to optionally
            // pass to an error handler. Though we don't seem to even use this
            // functionality anywhere either
        } else if (result instanceof PyInteger) {
            int value = result.asInt();
            if (value < 0 || value > PySystemState.maxunicode) {
                throw Py.TypeError(String.format("character mapping must be in range(0x%x)",
                                                 PySystemState.maxunicode + 1));
            }
            buf.appendCodePoint(value);
        } else if (result instanceof PyUnicode) {
            buf.append(result.toString());
        } else {
            // wrong return value
            throw Py.TypeError("character mapping must return integer, None or unicode");
        }
    }
    return new PyUnicode(buf.toString());
}
/**
 * Encodes text through a character mapping.
 *
 * @param str the text to encode
 * @param errors the error handling name, may be null
 * @param mapping the encoding table; when null the text is encoded as Latin-1
 * @return a tuple of the encoded string and the input length
 */
public static PyTuple charmap_encode(String str, String errors,
                                     PyObject mapping) {
    if (mapping != null) {
        StringBuilder output = new StringBuilder(str.length());
        return charmap_encode_internal(str, errors, mapping, output, true);
    }
    // No mapping supplied: default to Latin-1.
    return latin_1_encode(str, errors);
}
/**
 * Encodes {@code str} through {@code mapping}, appending the output to {@code v}.
 *
 * An {@link EncodingMap} is queried through its trie lookup; any other mapping
 * is queried with {@code __finditem__} on the integer char value. A mapped
 * integer must lie in 0..255; a mapped plain {@code str} is appended as is.
 *
 * @param str the text to encode
 * @param errors the error handling name, may be null
 * @param mapping the encoding table
 * @param v the builder receiving the encoded output
 * @param letLookupHandleError when true an unmapped char is passed to
 *        {@code handleBadMapping}; when false it raises a UnicodeEncodeError
 *        directly (used while encoding replacement text, to avoid recursing)
 * @return a tuple of the builder's content and the input length
 */
private static PyTuple charmap_encode_internal(String str,
                                               String errors,
                                               PyObject mapping,
                                               StringBuilder v,
                                               boolean letLookupHandleError) {
    EncodingMap encodingMap = mapping instanceof EncodingMap ? (EncodingMap) mapping : null;
    int size = str.length();
    for (int i = 0; i < size; i++) {
        char ch = str.charAt(i);
        PyObject x;
        if (encodingMap != null) {
            int result = encodingMap.lookup(ch);
            if (result == -1) {
                x = null;
            } else {
                x = Py.newInteger(result);
            }
        } else {
            x = mapping.__finditem__(Py.newInteger(ch));
        }
        if (x == null) {
            if (letLookupHandleError) {
                i = handleBadMapping(str, errors, mapping, v, size, i);
            } else {
                // The reason text had lost its "<undefined>" suffix.
                throw Py.UnicodeEncodeError("charmap",
                                            str,
                                            i,
                                            i + 1,
                                            "character maps to <undefined>");
            }
        } else if (x instanceof PyInteger) {
            int value = ((PyInteger) x).getValue();
            if (value < 0 || value > 255) {
                throw Py.TypeError("character mapping must be in range(256)");
            }
            v.append((char) value);
        } else if (x instanceof PyString && !(x instanceof PyUnicode)) {
            v.append(x.toString());
        } else if (x instanceof PyNone) {
            // None in the table means "undefined": same treatment as a missing entry.
            i = handleBadMapping(str, errors, mapping, v, size, i);
        } else {
            /* wrong return value */
            throw Py.TypeError("character mapping must return " + "integer, None or str");
        }
    }
    return encode_tuple(v.toString(), size);
}
/**
 * Applies the error policy for the unencodable char at index {@code i} of
 * {@code str} and returns the loop index the caller should continue from.
 *
 * The built-in policies are handled inline: ignore appends nothing, while
 * replace, xmlcharrefreplace and backslashreplace push their replacement text
 * back through the mapping. Any other policy (including null) is resolved via
 * {@code codecs.encoding_error}, whose replacement text is encoded the same way.
 *
 * @param str the text being encoded
 * @param errors the error handling name, may be null
 * @param mapping the encoding table used to encode replacement text
 * @param v the builder receiving the encoded output
 * @param size the length of {@code str}
 * @param i the index of the offending char
 * @return the index the caller's loop should hold before its next increment
 */
private static int handleBadMapping(String str,
                                    String errors,
                                    PyObject mapping,
                                    StringBuilder v,
                                    int size,
                                    int i) {
    if (errors != null) {
        if (errors.equals(codecs.IGNORE)) {
            return i;
        } else if (errors.equals(codecs.REPLACE)) {
            charmap_encode_internal("?", errors, mapping, v, false);
            return i;
        } else if (errors.equals(codecs.XMLCHARREFREPLACE)) {
            charmap_encode_internal(codecs.xmlcharrefreplace(i, i + 1, str).toString(),
                                    errors, mapping, v, false);
            return i;
        } else if (errors.equals(codecs.BACKSLASHREPLACE)) {
            charmap_encode_internal(codecs.backslashreplace(i, i + 1, str).toString(),
                                    errors, mapping, v, false);
            return i;
        }
    }
    // The reason text had lost its "<undefined>" suffix.
    PyObject replacement = codecs.encoding_error(errors,
                                                 "charmap",
                                                 str,
                                                 i,
                                                 i + 1,
                                                 "character maps to <undefined>");
    String replStr = replacement.__getitem__(0).toString();
    charmap_encode_internal(replStr, errors, mapping, v, false);
    // "- 1" offsets the i++ of the calling loop.
    return codecs.calcNewPosition(size, replacement) - 1;
}
/**
 * Decodes ASCII data with no explicit error policy ({@code errors} is null).
 *
 * @param str the data to decode
 * @return a tuple of the decoded unicode object and the input length
 */
public static PyTuple ascii_decode(String str) {
    return ascii_decode(str, null);
}

/**
 * Decodes ASCII data via {@code codecs.PyUnicode_DecodeASCII}.
 *
 * @param str the data to decode
 * @param errors the error handling name, may be null
 * @return a tuple of the decoded unicode object and {@code str.length()}
 */
public static PyTuple ascii_decode(String str, String errors) {
    int inputLength = str.length();
    String decoded = codecs.PyUnicode_DecodeASCII(str, inputLength, errors);
    return decode_tuple(decoded, inputLength);
}
/**
 * Encodes to ASCII with no explicit error policy ({@code errors} is null).
 *
 * @param str the text to encode
 * @return a tuple of the encoded string and the input length
 */
public static PyTuple ascii_encode(String str) {
    return ascii_encode(str, null);
}

/**
 * Encodes to ASCII via {@code codecs.PyUnicode_EncodeASCII}.
 *
 * @param str the text to encode
 * @param errors the error handling name, may be null
 * @return a tuple of the encoded string and {@code str.length()}
 */
public static PyTuple ascii_encode(String str, String errors) {
    int inputLength = str.length();
    String encoded = codecs.PyUnicode_EncodeASCII(str, inputLength, errors);
    return encode_tuple(encoded, inputLength);
}
/* --- Latin-1 Codec -------------------------------------------- */
/**
 * Decodes Latin-1 data with no explicit error policy ({@code errors} is null).
 *
 * @param str the data to decode
 * @return a tuple of the decoded unicode object and the input length
 */
public static PyTuple latin_1_decode(String str) {
    return latin_1_decode(str, null);
}

/**
 * Decodes Latin-1 data via {@code codecs.PyUnicode_DecodeLatin1}.
 *
 * @param str the data to decode
 * @param errors the error handling name, may be null
 * @return a tuple of the decoded unicode object and {@code str.length()}
 */
public static PyTuple latin_1_decode(String str, String errors) {
    int inputLength = str.length();
    String decoded = codecs.PyUnicode_DecodeLatin1(str, inputLength, errors);
    return decode_tuple(decoded, inputLength);
}
/**
 * Encodes to Latin-1 with no explicit error policy ({@code errors} is null).
 *
 * @param str the text to encode
 * @return a tuple of the encoded string and the input length
 */
public static PyTuple latin_1_encode(String str) {
    return latin_1_encode(str, null);
}

/**
 * Encodes to Latin-1 via {@code codecs.PyUnicode_EncodeLatin1}.
 *
 * @param str the text to encode
 * @param errors the error handling name, may be null
 * @return a tuple of the encoded string and {@code str.length()}
 */
public static PyTuple latin_1_encode(String str, String errors) {
    int inputLength = str.length();
    String encoded = codecs.PyUnicode_EncodeLatin1(str, inputLength, errors);
    return encode_tuple(encoded, inputLength);
}
/* --- UTF16 Codec -------------------------------------------- */
/**
 * Encodes to UTF-16 with byte order 0 and no explicit error policy.
 *
 * @param str the text to encode
 * @return a tuple of the encoded string and the input length
 */
public static PyTuple utf_16_encode(String str) {
    return utf_16_encode(str, null);
}

/**
 * Encodes to UTF-16 with byte order 0.
 *
 * @param str the text to encode
 * @param errors the error handling name, may be null
 * @return a tuple of the encoded string and {@code str.length()}
 */
public static PyTuple utf_16_encode(String str, String errors) {
    String encoded = encode_UTF16(str, errors, 0);
    return encode_tuple(encoded, str.length());
}

/**
 * Encodes to UTF-16 with the requested byte order.
 *
 * @param str the text to encode
 * @param errors the error handling name, may be null
 * @param byteorder the byte order selector handed to {@code encode_UTF16}
 * @return a tuple of the encoded string and {@code str.length()}
 */
public static PyTuple utf_16_encode(String str, String errors,
                                    int byteorder) {
    String encoded = encode_UTF16(str, errors, byteorder);
    return encode_tuple(encoded, str.length());
}
/**
 * Encodes to little-endian UTF-16 with no explicit error policy.
 *
 * @param str the text to encode
 * @return a tuple of the encoded string and the input length
 */
public static PyTuple utf_16_le_encode(String str) {
    return utf_16_le_encode(str, null);
}

/**
 * Encodes to little-endian UTF-16 (byte order selector -1).
 *
 * @param str the text to encode
 * @param errors the error handling name, may be null
 * @return a tuple of the encoded string and {@code str.length()}
 */
public static PyTuple utf_16_le_encode(String str, String errors) {
    String encoded = encode_UTF16(str, errors, -1);
    return encode_tuple(encoded, str.length());
}
/**
 * Encodes to big-endian UTF-16 with no explicit error policy.
 *
 * @param str the text to encode
 * @return a tuple of the encoded string and the input length
 */
public static PyTuple utf_16_be_encode(String str) {
    return utf_16_be_encode(str, null);
}

/**
 * Encodes to big-endian UTF-16 (byte order selector 1).
 *
 * @param str the text to encode
 * @param errors the error handling name, may be null
 * @return a tuple of the encoded string and {@code str.length()}
 */
public static PyTuple utf_16_be_encode(String str, String errors) {
    String encoded = encode_UTF16(str, errors, 1);
    return encode_tuple(encoded, str.length());
}
/**
 * Encodes text as UTF-16 and returns the bytes as a string with one char
 * (0..255) per encoded byte.
 *
 * @param str the text to encode
 * @param errors the error handling name; not consulted by this implementation
 * @param byteorder 0 selects the Java "UTF-16" charset, -1 selects "UTF-16LE",
 *        any other value selects "UTF-16BE"
 * @return the encoded bytes, one per char
 */
public static String encode_UTF16(String str, String errors, int byteorder) {
    final String charsetName;
    switch (byteorder) {
        case 0:
            charsetName = "UTF-16";
            break;
        case -1:
            charsetName = "UTF-16LE";
            break;
        default:
            charsetName = "UTF-16BE";
            break;
    }
    final ByteBuffer encoded = Charset.forName(charsetName).encode(str);
    final StringBuilder out = new StringBuilder(encoded.limit());
    while (encoded.hasRemaining()) {
        // Mask to read the signed byte as an unsigned value in 0..255.
        out.append((char) (encoded.get() & 0xFF));
    }
    return out.toString();
}
/**
 * Decodes UTF-16 data with no explicit error policy ({@code errors} is null).
 *
 * @param str the byte data to decode (one byte per char)
 * @return a tuple of the decoded unicode object and a count
 */
public static PyTuple utf_16_decode(String str) {
    return utf_16_decode(str, null);
}

/**
 * Decodes UTF-16 data in non-final mode ({@code final_} is false).
 *
 * @param str the byte data to decode (one byte per char)
 * @param errors the error handling name, may be null
 * @return a tuple of the decoded unicode object and a count
 */
public static PyTuple utf_16_decode(String str, String errors) {
    return utf_16_decode(str, errors, false);
}

/**
 * Decodes UTF-16 data starting from byte order 0.
 *
 * @param str the byte data to decode (one byte per char)
 * @param errors the error handling name, may be null
 * @param final_ when true the whole input length is reported as the count; when
 *        false the count is the value {@code decode_UTF16} stored in its
 *        {@code consumed} slot
 * @return a tuple of the decoded unicode object and the count
 */
public static PyTuple utf_16_decode(String str, String errors, boolean final_) {
    int[] order = { 0 };
    int[] consumed = null;
    if (!final_) {
        consumed = new int[1];
    }
    String decoded = decode_UTF16(str, errors, order, consumed);
    int used;
    if (final_) {
        used = str.length();
    } else {
        used = consumed[0];
    }
    return decode_tuple(decoded, used);
}
/**
 * Decodes little-endian UTF-16 data with no explicit error policy.
 *
 * @param str the byte data to decode (one byte per char)
 * @return a tuple of the decoded unicode object and a count
 */
public static PyTuple utf_16_le_decode(String str) {
    return utf_16_le_decode(str, null);
}

/**
 * Decodes little-endian UTF-16 data in non-final mode.
 *
 * @param str the byte data to decode (one byte per char)
 * @param errors the error handling name, may be null
 * @return a tuple of the decoded unicode object and a count
 */
public static PyTuple utf_16_le_decode(String str, String errors) {
    return utf_16_le_decode(str, errors, false);
}

/**
 * Decodes UTF-16 data starting from byte order -1 (little-endian).
 *
 * @param str the byte data to decode (one byte per char)
 * @param errors the error handling name, may be null
 * @param final_ when true the whole input length is reported as the count; when
 *        false the count is the value {@code decode_UTF16} stored in its
 *        {@code consumed} slot
 * @return a tuple of the decoded unicode object and the count
 */
public static PyTuple utf_16_le_decode(String str, String errors, boolean final_) {
    int[] order = { -1 };
    int[] consumed = null;
    if (!final_) {
        consumed = new int[1];
    }
    String decoded = decode_UTF16(str, errors, order, consumed);
    int used;
    if (final_) {
        used = str.length();
    } else {
        used = consumed[0];
    }
    return decode_tuple(decoded, used);
}
/**
 * Decodes big-endian UTF-16 data with no explicit error policy.
 *
 * @param str the byte data to decode (one byte per char)
 * @return a tuple of the decoded unicode object and a count
 */
public static PyTuple utf_16_be_decode(String str) {
    return utf_16_be_decode(str, null);
}

/**
 * Decodes big-endian UTF-16 data in non-final mode.
 *
 * @param str the byte data to decode (one byte per char)
 * @param errors the error handling name, may be null
 * @return a tuple of the decoded unicode object and a count
 */
public static PyTuple utf_16_be_decode(String str, String errors) {
    return utf_16_be_decode(str, errors, false);
}

/**
 * Decodes UTF-16 data starting from byte order 1 (big-endian).
 *
 * @param str the byte data to decode (one byte per char)
 * @param errors the error handling name, may be null
 * @param final_ when true the whole input length is reported as the count; when
 *        false the count is the value {@code decode_UTF16} stored in its
 *        {@code consumed} slot
 * @return a tuple of the decoded unicode object and the count
 */
public static PyTuple utf_16_be_decode(String str, String errors, boolean final_) {
    int[] order = { 1 };
    int[] consumed = null;
    if (!final_) {
        consumed = new int[1];
    }
    String decoded = decode_UTF16(str, errors, order, consumed);
    int used;
    if (final_) {
        used = str.length();
    } else {
        used = consumed[0];
    }
    return decode_tuple(decoded, used);
}
/**
 * Decodes UTF-16 data with byte order 0 and no explicit error policy.
 *
 * @param str the byte data to decode (one byte per char)
 * @return a tuple of the decoded unicode object, a count and the byte order
 */
public static PyTuple utf_16_ex_decode(String str) {
    return utf_16_ex_decode(str, null);
}

/**
 * Decodes UTF-16 data with byte order 0.
 *
 * @param str the byte data to decode (one byte per char)
 * @param errors the error handling name, may be null
 * @return a tuple of the decoded unicode object, a count and the byte order
 */
public static PyTuple utf_16_ex_decode(String str, String errors) {
    return utf_16_ex_decode(str, errors, 0);
}

/**
 * Decodes UTF-16 data in non-final mode with the requested initial byte order.
 *
 * @param str the byte data to decode (one byte per char)
 * @param errors the error handling name, may be null
 * @param byteorder the initial byte order: -1 little-endian, 1 big-endian, 0 undecided
 * @return a tuple of the decoded unicode object, a count and the byte order
 */
public static PyTuple utf_16_ex_decode(String str, String errors, int byteorder) {
    return utf_16_ex_decode(str, errors, byteorder, false);
}

/**
 * Decodes UTF-16 data and additionally reports the byte order in effect when
 * decoding finished.
 *
 * @param str the byte data to decode (one byte per char)
 * @param errors the error handling name, may be null
 * @param byteorder the initial byte order: -1 little-endian, 1 big-endian, 0 undecided
 * @param final_ when true the whole input length is reported as the count; when
 *        false the count is the value {@code decode_UTF16} stored in its
 *        {@code consumed} slot
 * @return a three-element tuple: decoded unicode object, count, byte order
 */
public static PyTuple utf_16_ex_decode(String str, String errors, int byteorder,
                                       boolean final_) {
    // Seed the decoder with the caller's byte order; it used to be ignored in
    // favour of a hard-coded 0.
    int[] bo = new int[] { byteorder };
    int[] consumed = final_ ? null : new int[1];
    String decoded = decode_UTF16(str, errors, bo, consumed);
    // The decoded text is unicode, so wrap it in PyUnicode exactly as decode_tuple
    // does for the other decoders (a byte string cannot represent it in general).
    return new PyTuple(new PyUnicode(decoded),
                       Py.newInteger(final_ ? str.length() : consumed[0]),
                       Py.newInteger(bo[0]));
}
/**
 * Decodes UTF-16 data in final mode (no {@code consumed} reporting).
 *
 * @param str the byte data to decode (one byte per char)
 * @param errors the error handling name, may be null
 * @param byteorder one-element in/out array holding the byte order, may be null
 * @return the decoded text
 */
private static String decode_UTF16(String str,
                                   String errors,
                                   int[] byteorder) {
    return decode_UTF16(str, errors, byteorder, null);
}

/**
 * Decodes UTF-16 data held as one byte per char.
 *
 * The byte order starts from {@code byteorder[0]} (-1 little-endian, 1
 * big-endian, 0 undecided; 0 is decoded as big-endian unless a byte order mark
 * says otherwise). A byte order mark is only honoured as the very first unit
 * and only when the order is undecided; anywhere else those bytes are ordinary
 * data, so a forced byte order is never overridden and U+FEFF inside the text
 * is preserved.
 *
 * @param str the byte data to decode
 * @param errors the error handling name, may be null
 * @param byteorder one-element in/out array: read for the initial byte order and
 *        updated with the order in effect at the end; may be null
 * @param consumed when non-null, decoding stops quietly at an incomplete
 *        trailing unit (odd byte or unpaired trailing high surrogate) and
 *        {@code consumed[0]} receives the index decoding stopped at; when null
 *        such trailing data is reported through the error handler
 * @return the decoded text
 */
private static String decode_UTF16(String str,
                                   String errors,
                                   int[] byteorder,
                                   int[] consumed) {
    int bo = 0;
    if (byteorder != null) {
        bo = byteorder[0];
    }
    // Only sniff for a BOM when the caller left the byte order undecided.
    final boolean detectBom = bo == 0;
    int size = str.length();
    StringBuilder v = new StringBuilder(size / 2);
    int i;
    // NOTE(review): after an error handler returns a resume index, the loop still
    // adds 2 to it; this pre-existing behaviour is kept - confirm it is intended.
    for (i = 0; i < size; i += 2) {
        char ch1 = str.charAt(i);
        if (i + 1 == size) {
            // Odd trailing byte.
            if (consumed != null) {
                break;
            }
            i = codecs.insertReplacementAndGetResume(v,
                                                     errors,
                                                     "utf-16",
                                                     str,
                                                     i,
                                                     i + 1,
                                                     "truncated data");
            continue;
        }
        char ch2 = str.charAt(i + 1);
        if (detectBom && i == 0) {
            if (ch1 == 0xFE && ch2 == 0xFF) {
                bo = 1;
                continue;
            } else if (ch1 == 0xFF && ch2 == 0xFE) {
                bo = -1;
                continue;
            }
        }
        int W1;
        if (bo == -1) {
            W1 = (ch2 << 8 | ch1);
        } else {
            W1 = (ch1 << 8 | ch2);
        }
        if (W1 < 0xD800 || W1 > 0xDFFF) {
            v.appendCodePoint(W1);
            continue;
        } else if (W1 <= 0xDBFF) {
            // High surrogate: a complete low surrogate needs two more bytes.
            if (i + 3 >= size) {
                // Previously this read past the end of the input and threw
                // StringIndexOutOfBoundsException.
                if (consumed != null) {
                    break;
                }
                i = codecs.insertReplacementAndGetResume(v,
                                                         errors,
                                                         "utf-16",
                                                         str,
                                                         i,
                                                         size,
                                                         "unexpected end of data");
                continue;
            }
            i += 2;
            char ch3 = str.charAt(i);
            char ch4 = str.charAt(i + 1);
            int W2;
            if (bo == -1) {
                W2 = (ch4 << 8 | ch3);
            } else {
                W2 = (ch3 << 8 | ch4);
            }
            if (W2 >= 0xDC00 && W2 <= 0xDFFF) {
                int U = (((W1 & 0x3FF) << 10) | (W2 & 0x3FF)) + 0x10000;
                v.appendCodePoint(U);
                continue;
            }
            i = codecs.insertReplacementAndGetResume(v,
                                                     errors,
                                                     "utf-16",
                                                     str,
                                                     i,
                                                     i + 1,
                                                     "illegal UTF-16 surrogate");
            continue;
        }
        // Lone low surrogate (0xDC00..0xDFFF).
        i = codecs.insertReplacementAndGetResume(v,
                                                 errors,
                                                 "utf-16",
                                                 str,
                                                 i,
                                                 i + 1,
                                                 "illegal encoding");
    }
    if (byteorder != null) {
        byteorder[0] = bo;
    }
    if (consumed != null) {
        consumed[0] = i;
    }
    return v.toString();
}
/* --- RawUnicodeEscape Codec ----------------------------------------- */
/**
 * Encodes with the raw-unicode-escape codec and no explicit error policy.
 *
 * @param str the text to encode
 * @return a tuple of the encoded string and the input length
 */
public static PyTuple raw_unicode_escape_encode(String str) {
    return raw_unicode_escape_encode(str, null);
}

/**
 * Encodes via {@code codecs.PyUnicode_EncodeRawUnicodeEscape}.
 *
 * @param str the text to encode
 * @param errors the error handling name, may be null
 * @return a tuple of the encoded string and {@code str.length()}
 */
public static PyTuple raw_unicode_escape_encode(String str,
                                                String errors) {
    // NOTE(review): the trailing boolean is passed as false; its meaning is defined
    // by codecs.PyUnicode_EncodeRawUnicodeEscape - confirm against it.
    String encoded = codecs.PyUnicode_EncodeRawUnicodeEscape(str, errors, false);
    return encode_tuple(encoded, str.length());
}
/**
 * Decodes with the raw-unicode-escape codec and no explicit error policy.
 *
 * @param str the data to decode
 * @return a tuple of the decoded unicode object and the input length
 */
public static PyTuple raw_unicode_escape_decode(String str) {
    return raw_unicode_escape_decode(str, null);
}

/**
 * Decodes via {@code codecs.PyUnicode_DecodeRawUnicodeEscape}.
 *
 * @param str the data to decode
 * @param errors the error handling name, may be null
 * @return a tuple of the decoded unicode object and {@code str.length()}
 */
public static PyTuple raw_unicode_escape_decode(String str,
                                                String errors) {
    String decoded = codecs.PyUnicode_DecodeRawUnicodeEscape(str, errors);
    return decode_tuple(decoded, str.length());
}
/* --- UnicodeEscape Codec -------------------------------------------- */
/**
 * Encodes with the unicode-escape codec and no explicit error policy.
 *
 * @param str the text to encode
 * @return a tuple of the escaped string and the input length
 */
public static PyTuple unicode_escape_encode(String str) {
    return unicode_escape_encode(str, null);
}

/**
 * Encodes using {@code PyString.encode_UnicodeEscape(str, false)}.
 *
 * @param str the text to encode
 * @param errors accepted for signature compatibility; not used by this method
 * @return a tuple of the escaped string and {@code str.length()}
 */
public static PyTuple unicode_escape_encode(String str, String errors) {
    String escaped = PyString.encode_UnicodeEscape(str, false);
    return encode_tuple(escaped, str.length());
}
/**
 * Decodes with the unicode-escape codec and no explicit error policy.
 *
 * @param str the escaped input
 * @return a tuple of the decoded unicode object and the input length
 */
public static PyTuple unicode_escape_decode(String str) {
    return unicode_escape_decode(str, null);
}

/**
 * Decodes the whole of {@code str} using {@code PyString.decode_UnicodeEscape}.
 *
 * @param str the escaped input
 * @param errors the error handling name, may be null
 * @return a tuple of the decoded unicode object and {@code str.length()}
 */
public static PyTuple unicode_escape_decode(String str, String errors) {
    int inputLength = str.length();
    String unescaped = PyString.decode_UnicodeEscape(str, 0, inputLength, errors, true);
    return decode_tuple(unescaped, inputLength);
}
/* --- UnicodeInternal Codec ------------------------------------------ */
/**
 * Encodes with the unicode-internal codec and no explicit error policy.
 *
 * @param str the text to encode
 * @return a tuple of the unchanged string and the input length
 */
public static PyTuple unicode_internal_encode(String str) {
    return unicode_internal_encode(str, null);
}

/**
 * Returns the input unchanged, paired with its length.
 *
 * @param str the text to encode
 * @param errors accepted for signature compatibility; not used by this method
 * @return a tuple of {@code str} as a byte string and {@code str.length()}
 */
public static PyTuple unicode_internal_encode(String str, String errors) {
    int inputLength = str.length();
    return encode_tuple(str, inputLength);
}
/**
 * Decodes with the unicode-internal codec and no explicit error policy.
 *
 * @param str the data to decode
 * @return a tuple of the unchanged text and the input length
 */
public static PyTuple unicode_internal_decode(String str) {
    return unicode_internal_decode(str, null);
}

/**
 * Returns the input unchanged as a unicode object, paired with its length.
 *
 * @param str the data to decode
 * @param errors accepted for signature compatibility; not used by this method
 * @return a tuple of {@code str} as unicode and {@code str.length()}
 */
public static PyTuple unicode_internal_decode(String str, String errors) {
    int inputLength = str.length();
    return decode_tuple(str, inputLength);
}
/**
* Optimized charmap encoder mapping.
*
* Uses a trie structure instead of a dictionary; the speedup primarily comes from not
* creating integer objects in the process. The trie is created by inverting the
* encoding map.
*/
@ExposedType(name = "EncodingMap", isBaseType = false)
public static class EncodingMap extends PyObject {
/** First trie level: indexed by the top five bits of a char ({@code c >> 11}). */
char[] level1;

/**
 * Second and third trie levels stored back to back: {@code 16 * count2}
 * level-2 slots followed by the 128-entry level-3 blocks.
 */
char[] level23;

/** Number of 16-slot level-2 blocks. */
int count2;

/** Number of 128-entry level-3 blocks. */
int count3;

/**
 * Stores the prepared trie arrays and block counts; instances are created by
 * {@code buildEncodingMap}.
 */
private EncodingMap(char[] level1, char[] level23, int count2, int count3) {
    this.count2 = count2;
    this.count3 = count3;
    this.level1 = level1;
    this.level23 = level23;
}
/**
 * Builds an encoder mapping by inverting a 256-entry decoding table.
 *
 * Returns an {@code EncodingMap} trie when the table is easily optimized, and
 * otherwise a {@code PyDictionary} mapping each table char to its index.
 *
 * @param string a 256 length unicode mapping
 * @return an encoder mapping
 */
public static PyObject buildEncodingMap(PyObject string) {
    boolean validTable = string instanceof PyUnicode && string.__len__() == 256;
    if (!validTable) {
        throw Py.TypeError("bad argument type for built-in operation");
    }

    // Scratch tables for the counting pass; 0xFF marks "not yet assigned".
    char[] level1 = new char[32];
    char[] scratch = new char[512];
    String decode = string.toString();
    for (int k = 0; k < level1.length; k++) {
        level1[k] = 0xFF;
    }
    for (int k = 0; k < scratch.length; k++) {
        scratch[k] = 0xFF;
    }

    // The trie cannot express char 0 mapping anywhere but byte 0, nor any other
    // byte decoding to char 0; those tables fall back to a dictionary.
    boolean needDict = decode.charAt(0) != 0;
    int count2 = 0;
    int count3 = 0;
    for (int k = 1; k < 256; k++) {
        char mapped = decode.charAt(k);
        if (mapped == 0) {
            needDict = true;
        }
        if (mapped == 0xFFFE) {
            // unmapped character
            continue;
        }
        int top = mapped >> 11;
        int mid = mapped >> 7;
        if (level1[top] == 0xFF) {
            level1[top] = (char) count2++;
        }
        if (scratch[mid] == 0xFF) {
            scratch[mid] = (char) count3++;
        }
    }

    if (needDict || count2 > 0xFF || count3 > 0xFF) {
        PyObject dict = new PyDictionary();
        for (int k = 0; k < 256; k++) {
            dict.__setitem__(Py.newInteger(decode.charAt(k)), Py.newInteger(k));
        }
        return dict;
    }

    // Create a three-level trie. Levels two and three share one array: the
    // level-2 region is pre-filled with the 0xFF sentinel, while the level-3
    // region relies on Java's zero-initialised arrays (0 means "no mapping").
    int length2 = 16 * count2;
    int length3 = 128 * count3;
    char[] trie = new char[length2 + length3];
    for (int k = 0; k < length2; k++) {
        trie[k] = 0xFF;
    }

    int nextBlock = 0;
    for (int k = 1; k < 256; k++) {
        char mapped = decode.charAt(k);
        if (mapped == 0xFFFE) {
            // unmapped character
            continue;
        }
        int slot2 = 16 * level1[mapped >> 11] + ((mapped >> 7) & 0xF);
        if (trie[slot2] == 0xFF) {
            trie[slot2] = (char) nextBlock++;
        }
        int slot3 = 128 * trie[slot2] + (mapped & 0x7F);
        trie[length2 + slot3] = (char) k;
    }
    return new EncodingMap(level1, trie, count2, count3);
}
/**
 * Looks up the byte value a char encodes to by walking the three trie levels.
 *
 * @param c a char
 * @return the mapped value, or -1 when the char has no mapping
 */
public int lookup(char c) {
    // Char 0 always maps to byte 0 without touching the tables.
    if (c == 0) {
        return 0;
    }

    // level 1: top five bits select a level-2 block
    int block2 = level1[c >> 11];
    if (block2 == 0xFF) {
        return -1;
    }

    // level 2: next four bits select a level-3 block
    int block3 = level23[16 * block2 + ((c >> 7) & 0xF)];
    if (block3 == 0xFF) {
        return -1;
    }

    // level 3: low seven bits select the entry; 0 marks an empty slot
    int value = level23[16 * count2 + 128 * block3 + (c & 0x7F)];
    return value == 0 ? -1 : value;
}
}
}