org.python.modules._codecs Maven / Gradle / Ivy
Show all versions of jython-slim Show documentation
/*
* Copyright (c)2013 Jython Developers. Original Java version copyright 2000 Finn Bock.
*
* This program contains material copyrighted by: Copyright (c) Corporation for National Research
* Initiatives. Originally written by Marc-Andre Lemburg (mal@lemburg.com).
*/
package org.python.modules;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Iterator;
import org.python.core.Py;
import org.python.core.PyDictionary;
import org.python.core.PyInteger;
import org.python.core.PyNone;
import org.python.core.PyObject;
import org.python.core.PyString;
import org.python.core.PySystemState;
import org.python.core.PyTuple;
import org.python.core.PyUnicode;
import org.python.core.codecs;
import org.python.core.Untraversable;
import org.python.expose.ExposedType;
/**
* This class corresponds to the Python _codecs module, which in turn lends its functions to the
* codecs module (in Lib/codecs.py). It exposes the implementing functions of several codec families
* called out in the Python codecs library Lib/encodings/*.py, where it is usually claimed that they
* are bound "as C functions". Obviously, C stands for "compiled" in this context, rather than
* dependence on a particular implementation language. Actual transcoding methods often come from
* the related {@link codecs} class.
*/
public class _codecs {
/**
 * Register a codec search function by delegating to {@code codecs.register}.
 *
 * @param search_function object handed unchanged to the codec registry
 */
public static void register(PyObject search_function) {
codecs.register(search_function);
}
/**
 * Convert an optional Python string argument to a Java {@code String}.
 *
 * @param pystr string object supplied by the caller, or {@code null}
 * @return {@code null} if {@code pystr} is {@code null}; the string value as-is when
 *         {@code pystr} is a {@link PyUnicode}; otherwise the result of passing the value
 *         through {@code codecs.PyUnicode_EncodeASCII}
 */
private static String _castString(PyString pystr) {
    // Jython used to treat String as equivalent to PyString, or maybe PyUnicode, as
    // it made sense. We need to be more careful now! This check is inserted wherever
    // it is needed to ensure the appropriate compliance.
    if (pystr == null) {
        return null;
    }
    final String text = pystr.toString();
    // The ASCII path may throw UnicodeEncodeError, per CPython behavior
    return (pystr instanceof PyUnicode) ? text
            : codecs.PyUnicode_EncodeASCII(text, text.length(), null);
}
/**
 * Look up a codec by name, delegating to {@code codecs.lookup} after converting the name
 * with {@code _castString}.
 *
 * @param encoding name of the encoding
 * @return the tuple returned by {@code codecs.lookup}
 */
public static PyTuple lookup(PyString encoding) {
return codecs.lookup(_castString(encoding));
}
/**
 * Look up an error handler by name, delegating to {@code codecs.lookup_error} after
 * converting the name with {@code _castString}.
 *
 * @param handlerName name of the error handling policy
 * @return the object returned by {@code codecs.lookup_error}
 */
public static PyObject lookup_error(PyString handlerName) {
return codecs.lookup_error(_castString(handlerName));
}
/**
 * Register an error handler under a name by delegating to {@code codecs.register_error}.
 *
 * @param name name of the error handling policy
 * @param errorHandler handler object passed unchanged to the registry
 */
public static void register_error(String name, PyObject errorHandler) {
codecs.register_error(name, errorHandler);
}
/**
 * Decode {@code bytes} using the system default encoding (see
 * {@link codecs#getDefaultEncoding()}). Decoding errors raise a {@code ValueError}.
 *
 * @param bytes to be decoded
 * @return Unicode string decoded from {@code bytes}
 */
public static PyObject decode(PyString bytes) {
return decode(bytes, null, null);
}
/**
 * Decode {@code bytes} using the codec registered for the {@code encoding}. The
 * {@code encoding} defaults to the system default encoding (see
 * {@link codecs#getDefaultEncoding()}). Decoding errors raise a {@code ValueError}.
 *
 * @param bytes to be decoded
 * @param encoding name of encoding (to look up in codec registry)
 * @return Unicode string decoded from {@code bytes}
 */
public static PyObject decode(PyString bytes, PyString encoding) {
return decode(bytes, encoding, null);
}
/**
 * Decode {@code bytes} using the codec registered for the {@code encoding}. The
 * {@code encoding} defaults to the system default encoding (see
 * {@link codecs#getDefaultEncoding()}). The string {@code errors} may name a different
 * error handling policy (built-in or registered with
 * {@link #register_error(String, PyObject)}). The default error policy is 'strict' meaning
 * that decoding errors raise a {@code ValueError}.
 *
 * @param bytes to be decoded
 * @param encoding name of encoding (to look up in codec registry)
 * @param errors error policy name (e.g. "ignore")
 * @return Unicode string decoded from {@code bytes}
 */
public static PyObject decode(PyString bytes, PyString encoding, PyString errors) {
// Both names go through _castString before being handed to codecs.decode
return codecs.decode(bytes, _castString(encoding), _castString(errors));
}
/**
 * Encode {@code unicode} using the system default encoding (see
 * {@link codecs#getDefaultEncoding()}). Encoding errors raise a {@code ValueError}.
 *
 * @param unicode string to be encoded
 * @return bytes object encoding {@code unicode}
 */
public static PyString encode(PyUnicode unicode) {
return encode(unicode, null, null);
}
/**
 * Encode {@code unicode} using the codec registered for the {@code encoding}. The
 * {@code encoding} defaults to the system default encoding (see
 * {@link codecs#getDefaultEncoding()}). Encoding errors raise a {@code ValueError}.
 *
 * @param unicode string to be encoded
 * @param encoding name of encoding (to look up in codec registry)
 * @return bytes object encoding {@code unicode}
 */
public static PyString encode(PyUnicode unicode, PyString encoding) {
return encode(unicode, encoding, null);
}
/**
 * Encode {@code unicode} using the codec registered for the {@code encoding}. The
 * {@code encoding} defaults to the system default encoding (see
 * {@link codecs#getDefaultEncoding()}). The string {@code errors} may name a different
 * error handling policy (built-in or registered with
 * {@link #register_error(String, PyObject)}). The default error policy is 'strict' meaning
 * that encoding errors raise a {@code ValueError}.
 *
 * @param unicode string to be encoded
 * @param encoding name of encoding (to look up in codec registry)
 * @param errors error policy name (e.g. "ignore")
 * @return bytes object encoding {@code unicode}
 */
public static PyString encode(PyUnicode unicode, PyString encoding, PyString errors) {
// Both names go through _castString; the codecs.encode result is wrapped by Py.newString
return Py.newString(codecs.encode(unicode, _castString(encoding), _castString(errors)));
}
/* --- Some codec support methods -------------------------------------------- */
/**
 * Build an encoding map from a unicode string by delegating to
 * {@code EncodingMap.buildEncodingMap}.
 *
 * @param map unicode string from which the map is built
 * @return the object produced by {@code EncodingMap.buildEncodingMap}
 */
public static PyObject charmap_build(PyUnicode map) {
return EncodingMap.buildEncodingMap(map);
}
/**
 * Enumeration representing the possible endianness of UTF-32 (possibly UTF-16) encodings.
 * Python uses integers {@code -1, 0, 1}, but we can be more expressive. For encoding
 * UNDEFINED means choose the endianness of the platform and insert a byte order mark (BOM). But
 * since the platform is Java, that is always big-endian. For decoding it means read the BOM
 * from the stream, and it is an error not to find one (compare
 * {@code Lib/encodings/utf_32.py}).
 */
enum ByteOrder {
    LE(-1), UNDEFINED(0), BE(1);

    /** The Python integer code, stored explicitly so it does not depend on declaration order. */
    private final int code;

    ByteOrder(int code) {
        this.code = code;
    }

    /** Returns the Python equivalent code -1 = LE, 0 = as marked/platform, +1 = BE */
    int code() {
        return code;
    }

    /**
     * Returns equivalent to the Python code -1 = LE, 0 = as marked/platform, +1 = BE. Any value
     * other than -1 or +1 yields {@link #UNDEFINED}.
     */
    static ByteOrder fromInt(int byteorder) {
        switch (byteorder) {
            case -1:
                return LE;
            case 1:
                return BE;
            default:
                return UNDEFINED;
        }
    }
}
/**
 * Convenience method to construct the return value of decoders, providing the Unicode result as
 * a String, and the number of bytes consumed.
 *
 * @param u the unicode result as a UTF-16 Java String
 * @param bytesConsumed the number of bytes consumed
 * @return the tuple (unicode(u), bytesConsumed)
 */
private static PyTuple decode_tuple(String u, int bytesConsumed) {
return new PyTuple(new PyUnicode(u), Py.newInteger(bytesConsumed));
}
/**
 * Convenience method to construct the return value of decoders, providing the Unicode result as
 * a String, and the number of bytes consumed in decoding as either a single-element array or an
 * int to be used if the array argument is null.
 *
 * @param u the unicode result as a UTF-16 Java String
 * @param consumed if not null, element [0] is the number of bytes consumed
 * @param defConsumed if consumed==null, use this as the number of bytes consumed
 * @return the tuple (unicode(u), bytesConsumed)
 */
private static PyTuple decode_tuple(String u, int[] consumed, int defConsumed) {
// Prefer the count reported in the array; fall back to the caller's default
return decode_tuple(u, consumed != null ? consumed[0] : defConsumed);
}
/**
 * Convenience method to construct the return value of decoders that infer the byte order from
 * the byte-order mark.
 *
 * @param u the unicode result as a UTF-16 Java String
 * @param bytesConsumed the number of bytes consumed
 * @param order the byte order (deduced by codec)
 * @return the tuple (unicode(u), bytesConsumed, byteOrder)
 */
private static PyTuple decode_tuple(String u, int bytesConsumed, ByteOrder order) {
// Report the byte order using the integer returned by ByteOrder.code()
int bo = order.code();
return new PyTuple(new PyUnicode(u), Py.newInteger(bytesConsumed), Py.newInteger(bo));
}
/**
 * Construct a decoder return value whose first element is a {@link PyString} rather than a
 * {@link PyUnicode}.
 *
 * @param s the decoded result
 * @param len the count to report as the second element
 * @return the tuple (PyString(s), len)
 */
private static PyTuple decode_tuple_str(String s, int len) {
return new PyTuple(new PyString(s), Py.newInteger(len));
}
/**
 * Construct the return value of encoders.
 *
 * @param s the encoded result, held in a Java String
 * @param len the count to report as the second element
 * @return the tuple (PyString(s), len)
 */
private static PyTuple encode_tuple(String s, int len) {
return new PyTuple(new PyString(s), Py.newInteger(len));
}
/* --- UTF-8 Codec --------------------------------------------------- */
/** Equivalent to {@code utf_8_decode(str, null)}. */
public static PyTuple utf_8_decode(String str) {
return utf_8_decode(str, null);
}
/** Equivalent to {@code utf_8_decode(str, errors, false)}. */
public static PyTuple utf_8_decode(String str, String errors) {
return utf_8_decode(str, errors, false);
}
/** Variant taking the final flag as a Python object; its truth value is used. */
public static PyTuple utf_8_decode(String str, String errors, PyObject final_) {
return utf_8_decode(str, errors, final_.__nonzero__());
}
/**
 * Decode UTF-8 by delegating to {@code codecs.PyUnicode_DecodeUTF8Stateful}.
 *
 * @param str bytes to decode, held in a Java String
 * @param errors error policy name, passed through to the decoder
 * @param final_ if true, no consumed-count array is supplied to the decoder and the reported
 *            count is {@code str.length()}; if false the count is what the decoder stored
 * @return the tuple (decoded unicode, count)
 */
public static PyTuple utf_8_decode(String str, String errors, boolean final_) {
// A one-element array lets the decoder report how much input it consumed
int[] consumed = final_ ? null : new int[1];
return decode_tuple(codecs.PyUnicode_DecodeUTF8Stateful(str, errors, consumed), final_
? str.length() : consumed[0]);
}
/** Equivalent to {@code utf_8_encode(str, null)}. */
public static PyTuple utf_8_encode(String str) {
return utf_8_encode(str, null);
}
/**
 * Encode as UTF-8 by delegating to {@code codecs.PyUnicode_EncodeUTF8}.
 *
 * @param str string to encode
 * @param errors error policy name, passed through to the encoder
 * @return the tuple (encoded bytes, {@code str.length()})
 */
public static PyTuple utf_8_encode(String str, String errors) {
int size = str.length();
return encode_tuple(codecs.PyUnicode_EncodeUTF8(str, errors), size);
}
/* --- UTF-7 Codec --------------------------------------------------- */
/** Equivalent to {@code utf_7_decode(bytes, null)}. */
public static PyTuple utf_7_decode(String bytes) {
    return utf_7_decode(bytes, null);
}

/**
 * Equivalent to {@code utf_7_decode(bytes, errors, false)}.
 *
 * @param bytes to decode, held in a Java String
 * @param errors error policy name
 * @return the tuple (decoded unicode, bytes consumed)
 */
public static PyTuple utf_7_decode(String bytes, String errors) {
    // Forward the caller's error policy (it used to be replaced by null here)
    return utf_7_decode(bytes, errors, false);
}

/**
 * Decode UTF-7 by delegating to {@code codecs.PyUnicode_DecodeUTF7Stateful}.
 *
 * @param bytes to decode, held in a Java String
 * @param errors error policy name, passed through to the decoder
 * @param finalFlag if true, no consumed-count array is supplied to the decoder and the
 *            reported count is {@code bytes.length()}; if false the count is what the decoder
 *            stored in the array
 * @return the tuple (decoded unicode, count)
 */
public static PyTuple utf_7_decode(String bytes, String errors, boolean finalFlag) {
    int[] consumed = finalFlag ? null : new int[1];
    String decoded = codecs.PyUnicode_DecodeUTF7Stateful(bytes, errors, consumed);
    return decode_tuple(decoded, consumed, bytes.length());
}
/** Equivalent to {@code utf_7_encode(str, null)}. */
public static PyTuple utf_7_encode(String str) {
return utf_7_encode(str, null);
}
/**
 * Encode as UTF-7 by delegating to {@code codecs.PyUnicode_EncodeUTF7}, passing
 * {@code false} for both of its boolean arguments.
 *
 * @param str string to encode
 * @param errors error policy name, passed through to the encoder
 * @return the tuple (encoded bytes, {@code str.length()})
 */
public static PyTuple utf_7_encode(String str, String errors) {
int size = str.length();
return encode_tuple(codecs.PyUnicode_EncodeUTF7(str, false, false, errors), size);
}
/* --- string-escape Codec -------------------------------------------- */
/** Equivalent to {@code escape_decode(str, null)}. */
public static PyTuple escape_decode(String str) {
return escape_decode(str, null);
}
/**
 * Decode escapes in the whole of {@code str} by delegating to
 * {@code PyString.decode_UnicodeEscape} over the range {@code [0, str.length())}.
 *
 * @param str string containing escapes
 * @param errors error policy name, passed through to the decoder
 * @return the tuple (PyString(decoded), {@code str.length()})
 */
public static PyTuple escape_decode(String str, String errors) {
return decode_tuple_str(PyString.decode_UnicodeEscape(str, 0, str.length(), errors, true),
str.length());
}
/** Equivalent to {@code escape_encode(str, null)}. */
public static PyTuple escape_encode(String str) {
return escape_encode(str, null);
}
/**
 * Encode {@code str} with escapes by delegating to {@code PyString.encode_UnicodeEscape}.
 *
 * @param str string to encode
 * @param errors accepted for signature compatibility; not used by this method
 * @return the tuple (encoded bytes, {@code str.length()})
 */
public static PyTuple escape_encode(String str, String errors) {
return encode_tuple(PyString.encode_UnicodeEscape(str, false), str.length());
}
/* --- Character Mapping Codec --------------------------------------- */
/**
 * Equivalent to {@code charmap_decode(bytes, null, null)}. This method is here so the
 * error and mapping arguments can be optional at the Python level.
 *
 * @param bytes sequence of bytes to decode
 * @return decoded string and number of bytes consumed
 */
public static PyTuple charmap_decode(String bytes) {
return charmap_decode(bytes, null, null);
}
/**
 * Equivalent to {@code charmap_decode(bytes, errors, null)}. This method is here so the
 * mapping argument can be optional at the Python level.
 *
 * @param bytes sequence of bytes to decode
 * @param errors error policy
 * @return decoded string and number of bytes consumed
 */
public static PyTuple charmap_decode(String bytes, String errors) {
return charmap_decode(bytes, errors, null);
}
/**
 * Decode a sequence of bytes into Unicode characters via a mapping supplied as a container to
 * be indexed by the byte values (as unsigned integers). If the mapping is null or None, decode
 * with latin-1 (essentially treating bytes as character codes directly).
 *
 * @param bytes sequence of bytes to decode
 * @param errors error policy
 * @param mapping to convert bytes to characters
 * @return decoded string and number of bytes consumed
 */
public static PyTuple charmap_decode(String bytes, String errors, PyObject mapping) {
if (mapping == null || mapping == Py.None) {
// Default to Latin-1
return latin_1_decode(bytes, errors);
} else {
// A real mapping: unmapped bytes are treated as errors (ignoreUnmapped = false)
return charmap_decode(bytes, errors, mapping, false);
}
}
/**
 * Decode a sequence of bytes into Unicode characters via a mapping supplied as a container to
 * be indexed by the byte values (as unsigned integers).
 * <p>
 * For each byte the mapping may yield: nothing (an error, unless {@code ignoreUnmapped}); an
 * integer (used as a code point); {@code None} (an error); or a string (appended as-is, an
 * empty string mapping the byte to nothing, and a string starting with U+FFFE being treated
 * as "undefined"). Errors are handled through
 * {@code codecs.insertReplacementAndGetResume} according to {@code errors}.
 *
 * @param bytes sequence of bytes to decode
 * @param errors error policy
 * @param mapping to convert bytes to characters
 * @param ignoreUnmapped if true, pass unmapped byte values as character codes [0..256)
 * @return decoded string and number of bytes consumed
 */
public static PyTuple charmap_decode(String bytes, String errors, PyObject mapping,
        boolean ignoreUnmapped) {
    // XXX bytes: would prefer to accept any object with buffer API
    int size = bytes.length();
    StringBuilder v = new StringBuilder(size);

    for (int i = 0; i < size; i++) {
        // Process the i.th input byte
        int b = bytes.charAt(i);
        if (b > 0xff) {
            // Not a byte value. The "- 1" compensates for the loop's i++.
            i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
                    i, i + 1, "ordinal not in range(256)") - 1;
            continue;
        }

        // Map the byte to an output character code (or possibly string)
        PyObject w = Py.newInteger(b);
        PyObject x = mapping.__finditem__(w);

        // Apply to the output
        if (x == null) {
            // Error case: mapping not found
            if (ignoreUnmapped) {
                v.appendCodePoint(b);
            } else {
                i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
                        i, i + 1, "no mapping found") - 1;
            }

        } else if (x instanceof PyInteger) {
            // Mapping was to an int: treat as character code
            int value = ((PyInteger)x).getValue();
            if (value < 0 || value > PySystemState.maxunicode) {
                throw Py.TypeError("character mapping must return "
                        + "integer greater than 0 and less than sys.maxunicode");
            }
            v.appendCodePoint(value);

        } else if (x == Py.None) {
            i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
                    i, i + 1, "character maps to <undefined>") - 1;

        } else if (x instanceof PyString) {
            String s = x.toString();
            // Guard against an empty string before inspecting its first character
            if (!s.isEmpty() && s.charAt(0) == 0xfffe) {
                // Invalid indicates "undefined" see C-API PyUnicode_DecodeCharmap()
                i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
                        i, i + 1, "character maps to <undefined>") - 1;
            } else {
                // One-to-many mapping; an empty string simply drops the byte
                v.append(s);
            }

        } else {
            /* wrong return value */
            throw Py.TypeError("character mapping must return " + "integer, None or str");
        }
    }

    return decode_tuple(v.toString(), size);
}
/**
 * Translate the code points of a unicode string through a mapping; parallel to CPython's
 * PyUnicode_TranslateCharmap.
 * <p>
 * For each code point the mapping may yield: nothing (the code point is copied unchanged);
 * {@code None} (the code point is dropped); an integer (used as the replacement code point);
 * or a unicode string (appended as-is).
 *
 * @param str unicode string to translate
 * @param errors error policy name; not used by this implementation
 * @param mapping container indexed by code point using {@code __finditem__}
 * @return the translated unicode string
 */
public static PyObject translateCharmap(PyUnicode str, String errors, PyObject mapping) {
    StringBuilder buf = new StringBuilder(str.toString().length());

    // A typed iterator is required: a raw Iterator yields Object, which cannot become int
    for (Iterator<Integer> iter = str.newSubsequenceIterator(); iter.hasNext();) {
        int codePoint = iter.next();
        PyObject result = mapping.__finditem__(Py.newInteger(codePoint));
        if (result == null) {
            // No mapping found means: use 1:1 mapping
            buf.appendCodePoint(codePoint);
        } else if (result == Py.None) {
            // XXX: We don't support the fancier error handling CPython does here of
            // capturing regions of chars removed by the None mapping to optionally
            // pass to an error handler. Though we don't seem to even use this
            // functionality anywhere either
        } else if (result instanceof PyInteger) {
            int value = result.asInt();
            if (value < 0 || value > PySystemState.maxunicode) {
                throw Py.TypeError(String.format("character mapping must be in range(0x%x)",
                        PySystemState.maxunicode + 1));
            }
            buf.appendCodePoint(value);
        } else if (result instanceof PyUnicode) {
            buf.append(result.toString());
        } else {
            // wrong return value
            throw Py.TypeError("character mapping must return integer, None or unicode");
        }
    }
    return new PyUnicode(buf.toString());
}
/**
 * Equivalent to {@code charmap_encode(str, null, null)}. This method is here so the error
 * and mapping arguments can be optional at the Python level.
 *
 * @param str to be encoded
 * @return (encoded data, size(str)) as a pair
 */
public static PyTuple charmap_encode(String str) {
return charmap_encode(str, null, null);
}
/**
 * Equivalent to {@code charmap_encode(str, errors, null)}. This method is here so the
 * mapping can be optional at the Python level.
 *
 * @param str to be encoded
 * @param errors error policy name (e.g. "ignore")
 * @return (encoded data, size(str)) as a pair
 */
public static PyTuple charmap_encode(String str, String errors) {
return charmap_encode(str, errors, null);
}
/**
 * Encoder based on an optional character mapping. This mapping is either an
 * {@code EncodingMap} of 256 entries, or an arbitrary container indexable with integers
 * using {@code __finditem__} and yielding byte strings. If the mapping is null or None,
 * latin-1 (effectively a mapping of character code to the numerically-equal byte) is used.
 *
 * @param str to be encoded
 * @param errors error policy name (e.g. "ignore")
 * @param mapping from character code to output byte (or string)
 * @return (encoded data, size(str)) as a pair
 */
public static PyTuple charmap_encode(String str, String errors, PyObject mapping) {
if (mapping == null || mapping == Py.None) {
// Default to Latin-1
return latin_1_encode(str, errors);
} else {
// Fresh output buffer; failed look-ups are routed to the error policy (last arg true)
return charmap_encode_internal(str, errors, mapping, new StringBuilder(str.length()),
true);
}
}
/**
 * Helper to implement the several variants of {@code charmap_encode}, given an optional
 * mapping. This mapping is either an {@code EncodingMap} of 256 entries, or an arbitrary
 * container indexable with integers using {@code __finditem__} and yielding byte strings.
 * Encoded output is appended to {@code v}, which may already hold earlier output.
 *
 * @param str to be encoded
 * @param errors error policy name (e.g. "ignore")
 * @param mapping from character code to output byte (or string)
 * @param v to contain the encoded bytes
 * @param letLookupHandleError if true, a character with no mapping is passed to
 *            {@code handleBadMapping} to apply the error policy; if false it raises a
 *            {@code UnicodeEncodeError} immediately
 * @return (encoded data, size(str)) as a pair
 */
private static PyTuple charmap_encode_internal(String str, String errors, PyObject mapping,
        StringBuilder v, boolean letLookupHandleError) {

    EncodingMap encodingMap = mapping instanceof EncodingMap ? (EncodingMap)mapping : null;
    int size = str.length();

    for (int i = 0; i < size; i++) {

        // Map the i.th character of str to some value
        char ch = str.charAt(i);
        PyObject x;
        if (encodingMap != null) {
            // The mapping given was an EncodingMap [0,256) => non-negative int
            int result = encodingMap.lookup(ch);
            x = (result == -1) ? null : Py.newInteger(result);
        } else {
            // The mapping was a map or similar: non-negative int -> object
            x = mapping.__finditem__(Py.newInteger(ch));
        }

        // And map this object to an output character
        if (x == null) {
            // Error during lookup
            if (letLookupHandleError) {
                // Some kind of substitute can be placed in the output
                i = handleBadMapping(str, errors, mapping, v, size, i);
            } else {
                // Hard error
                throw Py.UnicodeEncodeError("charmap", str, i, i + 1,
                        "character maps to <undefined>");
            }

        } else if (x instanceof PyInteger) {
            // Look-up had integer result: output as byte value
            int value = ((PyInteger)x).getValue();
            if (value < 0 || value > 255) {
                throw Py.TypeError("character mapping must be in range(256)");
            }
            v.append((char)value);

        } else if (x instanceof PyString && !(x instanceof PyUnicode)) {
            // Look-up had str (but not unicode) result: output as Java String
            // XXX: (Py3k) Look-up had bytes or str result: output as ... this is a problem
            v.append(x.toString());

        } else if (x instanceof PyNone) {
            i = handleBadMapping(str, errors, mapping, v, size, i);

        } else {
            /* wrong return value */
            throw Py.TypeError("character mapping must return " + "integer, None or str");
        }
    }

    return encode_tuple(v.toString(), size);
}
/**
 * Helper for {@link #charmap_encode_internal(String, String, PyObject, StringBuilder, boolean)}
 * called when we need some kind of substitute in the output for an invalid input.
 * <p>
 * The built-in policies "ignore", "replace", "xmlcharrefreplace" and "backslashreplace" are
 * handled here directly; the replacement text (if any) is itself encoded through the mapping,
 * with look-up failures in the replacement being hard errors. Any other policy (including
 * null) is delegated to {@code codecs.encoding_error}.
 *
 * @param str to be encoded
 * @param errors error policy name (e.g. "ignore")
 * @param mapping from character code to output byte (or string)
 * @param v to contain the encoded bytes
 * @param size of str
 * @param i index in str of current (and problematic) character
 * @return index of last character of problematic section
 */
private static int handleBadMapping(String str, String errors, PyObject mapping,
        StringBuilder v, int size, int i) {

    // If error policy specified, execute it
    if (errors != null) {
        if (errors.equals(codecs.IGNORE)) {
            // Drop the character: nothing is written
            return i;
        }

        // Work out the substitute text for the built-in replacing policies
        String builtinRepl = null;
        if (errors.equals(codecs.REPLACE)) {
            builtinRepl = "?";
        } else if (errors.equals(codecs.XMLCHARREFREPLACE)) {
            builtinRepl = codecs.xmlcharrefreplace(i, i + 1, str).toString();
        } else if (errors.equals(codecs.BACKSLASHREPLACE)) {
            builtinRepl = codecs.backslashreplace(i, i + 1, str).toString();
        }

        if (builtinRepl != null) {
            charmap_encode_internal(builtinRepl, errors, mapping, v, false);
            return i;
        }
    }

    // Default behaviour (error==null or does not match known case)
    String msg = "character maps to <undefined>";
    PyObject replacement = codecs.encoding_error(errors, "charmap", str, i, i + 1, msg);
    String replStr = replacement.__getitem__(0).toString();
    charmap_encode_internal(replStr, errors, mapping, v, false);

    // The "- 1" compensates for the increment in the caller's loop
    return codecs.calcNewPosition(size, replacement) - 1;
}
/* --- ascii Codec ---------------------------------------------- */
/** Equivalent to {@code ascii_decode(str, null)}. */
public static PyTuple ascii_decode(String str) {
return ascii_decode(str, null);
}
/**
 * Decode ASCII by delegating to {@code codecs.PyUnicode_DecodeASCII}.
 *
 * @param str bytes to decode, held in a Java String
 * @param errors error policy name, passed through to the decoder
 * @return the tuple (decoded unicode, {@code str.length()})
 */
public static PyTuple ascii_decode(String str, String errors) {
int size = str.length();
return decode_tuple(codecs.PyUnicode_DecodeASCII(str, size, errors), size);
}
/** Equivalent to {@code ascii_encode(str, null)}. */
public static PyTuple ascii_encode(String str) {
return ascii_encode(str, null);
}
/**
 * Encode as ASCII by delegating to {@code codecs.PyUnicode_EncodeASCII}.
 *
 * @param str string to encode
 * @param errors error policy name, passed through to the encoder
 * @return the tuple (encoded bytes, {@code str.length()})
 */
public static PyTuple ascii_encode(String str, String errors) {
int size = str.length();
return encode_tuple(codecs.PyUnicode_EncodeASCII(str, size, errors), size);
}
/* --- Latin-1 Codec -------------------------------------------- */
/** Equivalent to {@code latin_1_decode(str, null)}. */
public static PyTuple latin_1_decode(String str) {
return latin_1_decode(str, null);
}
/**
 * Decode Latin-1 by delegating to {@code codecs.PyUnicode_DecodeLatin1}.
 *
 * @param str bytes to decode, held in a Java String
 * @param errors error policy name, passed through to the decoder
 * @return the tuple (decoded unicode, {@code str.length()})
 */
public static PyTuple latin_1_decode(String str, String errors) {
int size = str.length();
return decode_tuple(codecs.PyUnicode_DecodeLatin1(str, size, errors), size);
}
/** Equivalent to {@code latin_1_encode(str, null)}. */
public static PyTuple latin_1_encode(String str) {
return latin_1_encode(str, null);
}
/**
 * Encode as Latin-1 by delegating to {@code codecs.PyUnicode_EncodeLatin1}.
 *
 * @param str string to encode
 * @param errors error policy name, passed through to the encoder
 * @return the tuple (encoded bytes, {@code str.length()})
 */
public static PyTuple latin_1_encode(String str, String errors) {
int size = str.length();
return encode_tuple(codecs.PyUnicode_EncodeLatin1(str, size, errors), size);
}
/* --- UTF-16 Codec ------------------------------------------- */
/** Equivalent to {@code utf_16_encode(str, null)}. */
public static PyTuple utf_16_encode(String str) {
return utf_16_encode(str, null);
}
/**
 * Encode as UTF-16 using {@code encode_UTF16} with byte order 0.
 *
 * @param str string to encode
 * @param errors error policy name, passed through to {@code encode_UTF16}
 * @return the tuple (encoded bytes, {@code str.length()})
 */
public static PyTuple utf_16_encode(String str, String errors) {
return encode_tuple(encode_UTF16(str, errors, 0), str.length());
}
/**
 * Encode as UTF-16 using {@code encode_UTF16} with the given byte order.
 *
 * @param str string to encode
 * @param errors error policy name, passed through to {@code encode_UTF16}
 * @param byteorder byte order code, passed through to {@code encode_UTF16}
 * @return the tuple (encoded bytes, {@code str.length()})
 */
public static PyTuple utf_16_encode(String str, String errors, int byteorder) {
return encode_tuple(encode_UTF16(str, errors, byteorder), str.length());
}
/** Equivalent to {@code utf_16_le_encode(str, null)}. */
public static PyTuple utf_16_le_encode(String str) {
return utf_16_le_encode(str, null);
}
/**
 * Encode as UTF-16 using {@code encode_UTF16} with byte order -1 (little-endian).
 *
 * @param str string to encode
 * @param errors error policy name, passed through to {@code encode_UTF16}
 * @return the tuple (encoded bytes, {@code str.length()})
 */
public static PyTuple utf_16_le_encode(String str, String errors) {
return encode_tuple(encode_UTF16(str, errors, -1), str.length());
}
/** Equivalent to {@code utf_16_be_encode(str, null)}. */
public static PyTuple utf_16_be_encode(String str) {
return utf_16_be_encode(str, null);
}
/**
 * Encode as UTF-16 using {@code encode_UTF16} with byte order 1 (big-endian).
 *
 * @param str string to encode
 * @param errors error policy name, passed through to {@code encode_UTF16}
 * @return the tuple (encoded bytes, {@code str.length()})
 */
public static PyTuple utf_16_be_encode(String str, String errors) {
return encode_tuple(encode_UTF16(str, errors, 1), str.length());
}
/**
 * Encode a Java String as UTF-16 using the JDK charsets, returning the bytes as a String in
 * which each char holds one byte value (0..255).
 *
 * @param str string to encode
 * @param errors error policy name; ignored by this method
 * @param byteorder 0 selects the "UTF-16" charset, -1 selects "UTF-16LE", and any other
 *            value selects "UTF-16BE"
 * @return the encoded bytes, one per char
 */
public static String encode_UTF16(String str, String errors, int byteorder) {
    final String charsetName;
    switch (byteorder) {
        case 0:
            charsetName = "UTF-16";
            break;
        case -1:
            charsetName = "UTF-16LE";
            break;
        default:
            charsetName = "UTF-16BE";
            break;
    }

    // XXX errors argument ignored: Java's codecs implement "replace"
    final ByteBuffer encoded = Charset.forName(charsetName).encode(str);

    final StringBuilder out = new StringBuilder(encoded.limit());
    while (encoded.hasRemaining()) {
        // Mask to treat the signed Java byte as an unsigned value 0..255
        out.append((char)(encoded.get() & 0xff));
    }
    return out.toString();
}
/** Equivalent to {@code utf_16_decode(str, null)}. */
public static PyTuple utf_16_decode(String str) {
return utf_16_decode(str, null);
}
/** Equivalent to {@code utf_16_decode(str, errors, false)}. */
public static PyTuple utf_16_decode(String str, String errors) {
return utf_16_decode(str, errors, false);
}
/**
 * Decode UTF-16 using {@code decode_UTF16}, starting with byte order 0.
 *
 * @param str bytes to decode, held in a Java String
 * @param errors error policy name, passed through to the decoder
 * @param final_ if true, no consumed-count array is supplied and the reported count is
 *            {@code str.length()}; if false the count is what the decoder stored
 * @return the tuple (decoded unicode, count)
 */
public static PyTuple utf_16_decode(String str, String errors, boolean final_) {
// Initial byte order 0; the value the decoder writes back is not used here
int[] bo = new int[] {0};
int[] consumed = final_ ? null : new int[1];
return decode_tuple(decode_UTF16(str, errors, bo, consumed), final_ ? str.length()
: consumed[0]);
}
/** Equivalent to {@code utf_16_le_decode(str, null)}. */
public static PyTuple utf_16_le_decode(String str) {
return utf_16_le_decode(str, null);
}
/** Equivalent to {@code utf_16_le_decode(str, errors, false)}. */
public static PyTuple utf_16_le_decode(String str, String errors) {
return utf_16_le_decode(str, errors, false);
}
/**
 * Decode UTF-16 using {@code decode_UTF16}, starting with byte order -1 (little-endian).
 *
 * @param str bytes to decode, held in a Java String
 * @param errors error policy name, passed through to the decoder
 * @param final_ if true, no consumed-count array is supplied and the reported count is
 *            {@code str.length()}; if false the count is what the decoder stored
 * @return the tuple (decoded unicode, count)
 */
public static PyTuple utf_16_le_decode(String str, String errors, boolean final_) {
// Initial byte order -1; the value the decoder writes back is not used here
int[] bo = new int[] {-1};
int[] consumed = final_ ? null : new int[1];
return decode_tuple(decode_UTF16(str, errors, bo, consumed), final_ ? str.length()
: consumed[0]);
}
/** Equivalent to {@code utf_16_be_decode(str, null)}. */
public static PyTuple utf_16_be_decode(String str) {
return utf_16_be_decode(str, null);
}
/** Equivalent to {@code utf_16_be_decode(str, errors, false)}. */
public static PyTuple utf_16_be_decode(String str, String errors) {
return utf_16_be_decode(str, errors, false);
}
/**
 * Decode UTF-16 using {@code decode_UTF16}, starting with byte order 1 (big-endian).
 *
 * @param str bytes to decode, held in a Java String
 * @param errors error policy name, passed through to the decoder
 * @param final_ if true, no consumed-count array is supplied and the reported count is
 *            {@code str.length()}; if false the count is what the decoder stored
 * @return the tuple (decoded unicode, count)
 */
public static PyTuple utf_16_be_decode(String str, String errors, boolean final_) {
// Initial byte order 1; the value the decoder writes back is not used here
int[] bo = new int[] {1};
int[] consumed = final_ ? null : new int[1];
return decode_tuple(decode_UTF16(str, errors, bo, consumed), final_ ? str.length()
: consumed[0]);
}
/** Equivalent to {@code utf_16_ex_decode(str, null)}. */
public static PyTuple utf_16_ex_decode(String str) {
    return utf_16_ex_decode(str, null);
}

/** Equivalent to {@code utf_16_ex_decode(str, errors, 0)}. */
public static PyTuple utf_16_ex_decode(String str, String errors) {
    return utf_16_ex_decode(str, errors, 0);
}

/** Equivalent to {@code utf_16_ex_decode(str, errors, byteorder, false)}. */
public static PyTuple utf_16_ex_decode(String str, String errors, int byteorder) {
    return utf_16_ex_decode(str, errors, byteorder, false);
}

/**
 * Decode UTF-16 using {@code decode_UTF16} and additionally report the byte order in force at
 * the end of decoding.
 *
 * @param str bytes to decode, held in a Java String
 * @param errors error policy name, passed through to the decoder
 * @param byteorder initial byte order in the Python convention (-1 = LE, 0 = undefined,
 *            +1 = BE)
 * @param final_ if true, no consumed-count array is supplied and the reported count is
 *            {@code str.length()}; if false the count is what the decoder stored
 * @return the tuple (decoded unicode, count, byte order)
 */
public static PyTuple
        utf_16_ex_decode(String str, String errors, int byteorder, boolean final_) {
    // Seed the decoder with the caller's byte order (it used to be overridden with 0)
    int[] bo = new int[] {byteorder};
    int[] consumed = final_ ? null : new int[1];
    String decoded = decode_UTF16(str, errors, bo, consumed);
    return new PyTuple(new PyUnicode(decoded), Py.newInteger(final_ ? str.length()
            : consumed[0]), Py.newInteger(bo[0]));
}
/** Equivalent to {@code decode_UTF16(str, errors, byteorder, null)}. */
private static String decode_UTF16(String str, String errors, int[] byteorder) {
    return decode_UTF16(str, errors, byteorder, null);
}

/**
 * Decode a String of byte values as UTF-16 into a Java String.
 * <p>
 * Input is processed two bytes at a time. The byte pairs FE FF and FF FE switch the byte
 * order to big- and little-endian respectively and produce no output. Incomplete input at the
 * end (a single odd byte, or a lead surrogate whose trail unit is not fully present) is left
 * unconsumed when {@code consumed} is supplied, and is otherwise an error handled according to
 * {@code errors}.
 *
 * @param str bytes to decode, one byte value per char
 * @param errors error policy name, passed to {@code codecs.insertReplacementAndGetResume}
 * @param byteorder if not null, element [0] is the initial byte order (-1 = LE, otherwise
 *            big-endian is assumed) and receives the byte order in force at the end
 * @param consumed if not null, element [0] receives the loop index at which decoding
 *            stopped
 * @return the decoded text
 */
private static String decode_UTF16(String str, String errors, int[] byteorder, int[] consumed) {
    int bo = 0;
    if (byteorder != null) {
        bo = byteorder[0];
    }
    int size = str.length();
    StringBuilder v = new StringBuilder(size / 2);
    int i;
    // NOTE(review): after an error the resume index returned by the handler is further
    // advanced by the loop's "i += 2" - confirm that this is the intended resynchronisation.
    for (i = 0; i < size; i += 2) {
        char ch1 = str.charAt(i);
        if (i + 1 == size) {
            // Odd trailing byte
            if (consumed != null) {
                break;
            }
            i = codecs.insertReplacementAndGetResume(v, errors, "utf-16", str, //
                    i, i + 1, "truncated data");
            continue;
        }
        char ch2 = str.charAt(i + 1);
        if (ch1 == 0xFE && ch2 == 0xFF) {
            bo = 1;
            continue;
        } else if (ch1 == 0xFF && ch2 == 0xFE) {
            bo = -1;
            continue;
        }
        int W1;
        if (bo == -1) {
            W1 = (ch2 << 8 | ch1);
        } else {
            W1 = (ch1 << 8 | ch2);
        }

        if (W1 < 0xD800 || W1 > 0xDFFF) {
            v.appendCodePoint(W1);
            continue;
        } else if (W1 >= 0xD800 && W1 <= 0xDBFF) {
            // Lead surrogate: a complete trail unit needs two more bytes
            if (size - i < 4) {
                if (consumed != null) {
                    // Stateful decoding: leave the incomplete pair for the next call
                    break;
                }
                i = codecs.insertReplacementAndGetResume(v, errors, "utf-16", str, //
                        i, size, "unexpected end of data");
                continue;
            }
            i += 2;
            char ch3 = str.charAt(i);
            char ch4 = str.charAt(i + 1);
            int W2;
            if (bo == -1) {
                W2 = (ch4 << 8 | ch3);
            } else {
                W2 = (ch3 << 8 | ch4);
            }
            if (W2 >= 0xDC00 && W2 <= 0xDFFF) {
                int U = (((W1 & 0x3FF) << 10) | (W2 & 0x3FF)) + 0x10000;
                v.appendCodePoint(U);
                continue;
            }
            i = codecs.insertReplacementAndGetResume(v, errors, "utf-16", str, //
                    i, i + 1, "illegal UTF-16 surrogate");
            continue;
        }

        // A trail surrogate in lead position
        i = codecs.insertReplacementAndGetResume(v, errors, "utf-16", str, //
                i, i + 1, "illegal encoding");
    }
    if (byteorder != null) {
        byteorder[0] = bo;
    }
    if (consumed != null) {
        consumed[0] = i;
    }
    return v.toString();
}
/* --- UTF-32 Codec ------------------------------------------- */
/**
 * Encode a Unicode Java String as UTF-32 with byte order mark. (Encoding is in platform byte
 * order, which is big-endian for Java.)
 *
 * @param unicode to be encoded
 * @return tuple (encoded_bytes, unicode_consumed)
 */
public static PyTuple utf_32_encode(String unicode) {
return utf_32_encode(unicode, null);
}
/**
 * Encode a Unicode Java String as UTF-32 with byte order mark. (Encoding is in platform byte
 * order, which is big-endian for Java.)
 *
 * @param unicode to be encoded
 * @param errors error policy name or null meaning "strict"
 * @return tuple (encoded_bytes, unicode_consumed)
 */
public static PyTuple utf_32_encode(String unicode, String errors) {
return PyUnicode_EncodeUTF32(unicode, errors, ByteOrder.UNDEFINED);
}
/**
 * Encode a Unicode Java String as UTF-32 in the specified byte order. The integer is
 * converted with {@code ByteOrder.fromInt}, so any value other than -1 or +1 is treated as
 * undefined, which is the case in which a byte order mark is written.
 *
 * @param unicode to be encoded
 * @param errors error policy name or null meaning "strict"
 * @param byteorder encoding "endianness" specified (in the Python -1, 0, +1 convention)
 * @return tuple (encoded_bytes, unicode_consumed)
 */
public static PyTuple utf_32_encode(String unicode, String errors, int byteorder) {
ByteOrder order = ByteOrder.fromInt(byteorder);
return PyUnicode_EncodeUTF32(unicode, errors, order);
}
/**
 * Encode a Unicode Java String as UTF-32 with little-endian byte order. No byte-order mark is
 * generated. Equivalent to {@code utf_32_le_encode(unicode, null)}.
 *
 * @param unicode to be encoded
 * @return tuple (encoded_bytes, unicode_consumed)
 */
public static PyTuple utf_32_le_encode(String unicode) {
return utf_32_le_encode(unicode, null);
}
/**
 * Encode a Unicode Java String as UTF-32 with little-endian byte order. No byte-order mark is
 * generated. Delegates to {@code PyUnicode_EncodeUTF32} with {@code ByteOrder.LE}.
 *
 * @param unicode to be encoded
 * @param errors error policy name or null meaning "strict"
 * @return tuple (encoded_bytes, unicode_consumed)
 */
public static PyTuple utf_32_le_encode(String unicode, String errors) {
return PyUnicode_EncodeUTF32(unicode, errors, ByteOrder.LE);
}
/**
 * Encode a Unicode Java String as UTF-32 with big-endian byte order. No byte-order mark is
 * generated. Equivalent to {@code utf_32_be_encode(unicode, null)}.
 *
 * @param unicode to be encoded
 * @return tuple (encoded_bytes, unicode_consumed)
 */
public static PyTuple utf_32_be_encode(String unicode) {
return utf_32_be_encode(unicode, null);
}
/**
 * Encode a Unicode Java String as UTF-32 with big-endian byte order. No byte-order mark is
 * generated. Delegates to {@code PyUnicode_EncodeUTF32} with {@code ByteOrder.BE}.
 *
 * @param unicode to be encoded
 * @param errors error policy name or null meaning "strict"
 * @return tuple (encoded_bytes, unicode_consumed)
 */
public static PyTuple utf_32_be_encode(String unicode, String errors) {
return PyUnicode_EncodeUTF32(unicode, errors, ByteOrder.BE);
}
/**
 * Encode a Unicode Java String as UTF-32 in specified byte order. A byte-order mark is
 * generated if {@code order = ByteOrder.UNDEFINED}, and the byte order in that case will
 * be the platform default, which is BE since the platform is Java.
 * <p>
 * The input String must be valid UTF-16, in particular, if it contains surrogate code
 * units they must be ordered and paired correctly. The last char in {@code unicode} is not
 * allowed to be an unpaired surrogate. These criteria will be met if the String
 * {@code unicode} is the contents of a valid {@link PyUnicode} or {@link PyString}.
 *
 * @param unicode to be encoded
 * @param errors error policy name or null meaning "strict"
 * @param order byte order to use BE, LE or UNDEFINED (a BOM will be written)
 * @return tuple (encoded_bytes, unicode_consumed)
 */
private static PyTuple PyUnicode_EncodeUTF32(String unicode, String errors, ByteOrder order) {

    // The builder holds encoded bytes, one per char: 4 per input char plus room for a BOM
    final StringBuilder bytes = new StringBuilder(4 * (unicode.length() + 1));

    // An undefined order means: emit a BOM, then proceed big-endian
    ByteOrder effective = order;
    if (effective == ByteOrder.UNDEFINED) {
        bytes.append("\u0000\u0000\u00fe\u00ff");
        effective = ByteOrder.BE;
    }

    final int consumed;
    if (effective == ByteOrder.LE) {
        consumed = PyUnicode_EncodeUTF32LELoop(bytes, unicode, errors);
    } else {
        consumed = PyUnicode_EncodeUTF32BELoop(bytes, unicode, errors);
    }

    // XXX Issue #2002: should probably report length consumed in Unicode characters
    return encode_tuple(bytes.toString(), consumed);
}
/**
 * Helper to {@link #PyUnicode_EncodeUTF32(String, String, ByteOrder)} when big-endian encoding
 * is to be carried out.
 * <p>
 * Surrogate errors are passed to {@code PyUnicode_EncodeUTF32Error}, whose return value is
 * used as the index from which reading continues.
 *
 * @param v output buffer building String of bytes (Jython PyString convention)
 * @param unicode character input
 * @param errors error policy name (e.g. "ignore", "replace")
 * @return number of Java characters consumed from unicode
 */
private static int PyUnicode_EncodeUTF32BELoop(StringBuilder v, String unicode, String errors) {

    int len = unicode.length();
    int uptr = 0;
    // The low-order bytes are written at buf[3..5]; buf[0..2] stay zero, so appending a
    // 4-element window starting at index 0, 1 or 2 supplies the leading zero bytes.
    char[] buf = new char[6]; // first 3 elements always zero

    /*
     * Main codec loop outputs arrays of 4 bytes at a time.
     */
    while (uptr < len) {

        int ch = unicode.charAt(uptr++);

        if ((ch & 0xF800) == 0xD800) {
            /*
             * This is a surrogate. In Jython, unicode should always be the internal value of a
             * PyUnicode, and since this should never contain invalid data, it should be a lead
             * surrogate, uptr < len, and the next char must be the trail surrogate. We ought
             * not to have to check that, however ...
             */
            if ((ch & 0x0400) == 0) {
                // Yes, it's a lead surrogate
                if (uptr < len) {
                    // And there is something to follow
                    int ch2 = unicode.charAt(uptr++);
                    if ((ch2 & 0xFC00) == 0xDC00) {
                        // And it is a trail surrogate, so we can get on with the encoding
                        ch = ((ch & 0x3ff) << 10) + (ch2 & 0x3ff) + 0x10000;
                        buf[3] = (char)((ch >> 16) & 0xff);
                        buf[4] = (char)((ch >> 8) & 0xff);
                        buf[5] = (char)(ch & 0xff);
                        v.append(buf, 2, 4);
                    } else {
                        // The trail surrogate was missing: accuse ch at uptr-2
                        uptr = PyUnicode_EncodeUTF32Error(v, errors, ByteOrder.BE, //
                                unicode, uptr - 2, uptr - 1, "second surrogate missing");
                    }
                } else {
                    // End of input instead of trail surrogate: accuse ch at uptr-1
                    uptr = PyUnicode_EncodeUTF32Error(v, errors, ByteOrder.BE, //
                            unicode, uptr - 1, len, "truncated data");
                }
            } else {
                // The trail encountered in lead position. Only one char has been read, so
                // the offender is at uptr-1 (not uptr-2, which could even be -1).
                uptr = PyUnicode_EncodeUTF32Error(v, errors, ByteOrder.BE, //
                        unicode, uptr - 1, uptr, "unexpected second surrogate");
            }

        } else if (ch > 255) {
            // This is a BMP character: only two bytes non-zero
            buf[3] = (char)((ch >> 8) & 0xff);
            buf[4] = (char)(ch & 0xff);
            v.append(buf, 1, 4);

        } else {
            // This is one-byte BMP character: only one byte non-zero
            buf[3] = (char)(ch & 0xff);
            v.append(buf, 0, 4);
        }
    }

    // XXX Issue #2002: should probably report length consumed in Unicode characters
    return uptr;
}
/**
 * Helper to {@link #PyUnicode_EncodeUTF32(String, String, ByteOrder)} when little-endian
 * encoding is to be carried out. Each Unicode character is appended to <code>v</code> as four
 * chars, each holding one byte, least significant first. A UTF-16 surrogate pair in the input
 * is combined into one supplementary code point; an unpaired surrogate is referred to
 * {@link #PyUnicode_EncodeUTF32Error(StringBuilder, String, ByteOrder, String, int, int, String)}.
 *
 * @param v output buffer building String of bytes (Jython PyString convention)
 * @param unicode character input
 * @param errors error policy name (e.g. "ignore", "replace")
 * @return number of Java characters consumed from unicode
 */
private static int PyUnicode_EncodeUTF32LELoop(StringBuilder v, String unicode, String errors) {
    int len = unicode.length();
    int uptr = 0;
    // Scratch area: elements 3..5 are never written, so they supply the trailing zero bytes
    char[] buf = new char[6];
    /*
     * Main codec loop outputs arrays of 4 bytes at a time.
     */
    while (uptr < len) {
        int ch = unicode.charAt(uptr++);
        if ((ch & 0xF800) == 0xD800) {
            /*
             * This is a surrogate. In Jython, unicode should always be the internal value of a
             * PyUnicode, and since this should never contain invalid data, it should be a lead
             * surrogate, uptr < len, and the next char must be the trail surrogate. We ought
             * not to have to check that, however ...
             */
            if ((ch & 0x0400) == 0) {
                // Yes, it's a lead surrogate
                if (uptr < len) {
                    // And there is something to follow
                    int ch2 = unicode.charAt(uptr++);
                    if ((ch2 & 0xFC00) == 0xDC00) {
                        // And it is a trail surrogate, so we can get on with the encoding
                        ch = ((ch & 0x3ff) << 10) + (ch2 & 0x3ff) + 0x10000;
                        buf[0] = (char)(ch & 0xff);
                        buf[1] = (char)((ch >> 8) & 0xff);
                        buf[2] = (char)((ch >> 16) & 0xff);
                        // buf[0..3] = xx xx xx 00
                        v.append(buf, 0, 4);
                    } else {
                        // The trail surrogate was missing: accuse ch at uptr-2 (ch2 is at uptr-1)
                        uptr = PyUnicode_EncodeUTF32Error(v, errors, ByteOrder.LE, //
                                unicode, uptr - 2, uptr - 1, "second surrogate missing");
                    }
                } else {
                    // End of input instead of trail surrogate: accuse ch at uptr-1
                    uptr = PyUnicode_EncodeUTF32Error(v, errors, ByteOrder.LE, //
                            unicode, uptr - 1, len, "truncated data");
                }
            } else {
                /*
                 * A trail surrogate encountered in lead position. Only one char has been read,
                 * so the offender is at uptr-1 and the problem range is [uptr-1, uptr). Accusing
                 * [uptr-2, uptr-1) would blame the previous character (or index -1) and, under
                 * "ignore" or "replace", resume on this same char and never terminate.
                 */
                uptr = PyUnicode_EncodeUTF32Error(v, errors, ByteOrder.LE, //
                        unicode, uptr - 1, uptr, "unexpected second surrogate");
            }
        } else if (ch > 255) {
            // This is a BMP character: only two bytes non-zero
            buf[1] = (char)(ch & 0xff);
            buf[2] = (char)((ch >> 8) & 0xff);
            // buf[1..4] = xx xx 00 00
            v.append(buf, 1, 4);
        } else {
            // This is one-byte BMP character: only one byte non-zero
            buf[2] = (char)(ch & 0xff);
            // buf[2..5] = xx 00 00 00
            v.append(buf, 2, 4);
        }
    }
    // XXX Issue #2002: should probably report length consumed in Unicode characters
    return uptr;
}
/**
 * Specific UTF-32 encoder error handler. This is a helper called in the inner loop of
 * {@link #PyUnicode_EncodeUTF32(String, String, ByteOrder)} when the Unicode input is invalid.
 * In theory, since the input Unicode data should come from a {@link PyUnicode}, there should
 * never be any errors.
 * <p>
 * The "ignore" and "replace" policies are dealt with here. Any other policy (including
 * <code>null</code>) is passed to {@link codecs#encoding_error}, and the replacement text it
 * supplies is encoded into <code>v</code> in the byte order given by <code>order</code>.
 *
 * @param v output buffer building String of bytes (Jython PyString convention)
 * @param errors error policy name (e.g. "ignore", "replace")
 * @param order LE or BE indicator
 * @param toEncode character input
 * @param start index of first problematic character
 * @param end index of character after the last problematic character
 * @param reason text contribution to the exception raised (if any)
 * @return position within input at which to restart
 */
private static int PyUnicode_EncodeUTF32Error(StringBuilder v, String errors, ByteOrder order,
        String toEncode, int start, int end, String reason) {
    // Handle special cases locally
    if (errors != null) {
        if (errors.equals(codecs.IGNORE)) {
            // Just skip to the first non-problem character
            return end;
        } else if (errors.equals(codecs.REPLACE)) {
            // Insert one UTF-32 encoded '?' per problematic character and skip
            for (int i = start; i < end; i++) {
                if (order != ByteOrder.LE) {
                    v.append("\000\000\000?");
                } else {
                    v.append("?\000\000\000");
                }
            }
            return end;
        }
    }
    // If errors not one of those, invoke the generic mechanism
    PyObject replacementSpec =
            codecs.encoding_error(errors, "utf-32", toEncode, start, end, reason);
    // Note the replacement is unicode text that still needs to be encoded
    String u = replacementSpec.__getitem__(0).toString();
    // Encode it in the same byte order as the surrounding output (it used to be BE always)
    if (order != ByteOrder.LE) {
        PyUnicode_EncodeUTF32BELoop(v, u, errors);
    } else {
        PyUnicode_EncodeUTF32LELoop(v, u, errors);
    }
    // Return the index in toEncode at which we should resume
    return codecs.calcNewPosition(toEncode.length(), replacementSpec);
}
/**
 * Decode (perhaps partially) a sequence of bytes representing the UTF-32 encoded form of a
 * Unicode string, using the default ("strict") error policy, and return as a tuple the unicode
 * text and the amount of input consumed. The endianness used will have been deduced from a
 * byte-order mark, if present, or will be big-endian. The unicode text is presented as a Java
 * String (the UTF-16 representation used by {@link PyUnicode}). This is not a "final" call:
 * trailing bytes that do not make up a whole 4-byte unit are left unconsumed.
 *
 * @param bytes to be decoded (Jython {@link PyString} convention)
 * @return tuple (unicode_result, bytes_consumed)
 */
public static PyTuple utf_32_decode(String bytes) {
    final String strict = null; // null selects the default error policy
    return utf_32_decode(bytes, strict);
}
/**
 * Decode (perhaps partially) a sequence of bytes representing the UTF-32 encoded form of a
 * Unicode string and return as a tuple the unicode text and the amount of input consumed. The
 * endianness used will have been deduced from a byte-order mark, if present, or will be
 * big-endian. The unicode text is presented as a Java String (the UTF-16 representation used by
 * {@link PyUnicode}). This is not a "final" call: trailing bytes that do not make up a whole
 * 4-byte unit are left unconsumed.
 *
 * @param bytes to be decoded (Jython {@link PyString} convention)
 * @param errors error policy name (e.g. "ignore", "replace")
 * @return tuple (unicode_result, bytes_consumed)
 */
public static PyTuple utf_32_decode(String bytes, String errors) {
    final boolean isFinal = false; // more input may follow
    return utf_32_decode(bytes, errors, isFinal);
}
/**
 * Decode (perhaps partially) a sequence of bytes representing the UTF-32 encoded form of a
 * Unicode string and return as a tuple the unicode text and the amount of input consumed. The
 * byte order starts out undefined, so it is taken from a byte-order mark, if one leads the
 * input, and otherwise the data are treated as big-endian. The unicode text is presented as a
 * Java String (the UTF-16 representation used by {@link PyUnicode}).
 *
 * @param bytes to be decoded (Jython {@link PyString} convention)
 * @param errors error policy name (e.g. "ignore", "replace")
 * @param isFinal if a "final" call, meaning the input must all be consumed
 * @return tuple (unicode_result, bytes_consumed)
 */
public static PyTuple utf_32_decode(String bytes, String errors, boolean isFinal) {
    final ByteOrder initialOrder = ByteOrder.UNDEFINED; // let a BOM decide
    final boolean reportOrder = false; // two-element tuple wanted
    return PyUnicode_DecodeUTF32Stateful(bytes, errors, initialOrder, isFinal, reportOrder);
}
/**
 * Decode (perhaps partially) a sequence of bytes representing the UTF-32 little-endian encoded
 * form of a Unicode string, using the default ("strict") error policy, and return as a tuple
 * the unicode text and the amount of input consumed. A (correctly-oriented) byte-order mark
 * will pass as a zero-width non-breaking space. The unicode text is presented as a Java String
 * (the UTF-16 representation used by {@link PyUnicode}). This is not a "final" call: trailing
 * bytes that do not make up a whole 4-byte unit are left unconsumed.
 *
 * @param bytes to be decoded (Jython {@link PyString} convention)
 * @return tuple (unicode_result, bytes_consumed)
 */
public static PyTuple utf_32_le_decode(String bytes) {
    final String strict = null; // null selects the default error policy
    return utf_32_le_decode(bytes, strict);
}
/**
 * Decode (perhaps partially) a sequence of bytes representing the UTF-32 little-endian encoded
 * form of a Unicode string and return as a tuple the unicode text and the amount of input
 * consumed. A (correctly-oriented) byte-order mark will pass as a zero-width non-breaking
 * space. The unicode text is presented as a Java String (the UTF-16 representation used by
 * {@link PyUnicode}). This is not a "final" call: trailing bytes that do not make up a whole
 * 4-byte unit are left unconsumed.
 *
 * @param bytes to be decoded (Jython {@link PyString} convention)
 * @param errors error policy name (e.g. "ignore", "replace")
 * @return tuple (unicode_result, bytes_consumed)
 */
public static PyTuple utf_32_le_decode(String bytes, String errors) {
    final boolean isFinal = false; // more input may follow
    return utf_32_le_decode(bytes, errors, isFinal);
}
/**
 * Decode (perhaps partially) a sequence of bytes representing the UTF-32 little-endian encoded
 * form of a Unicode string and return as a tuple the unicode text and the amount of input
 * consumed. Because the byte order is fixed, a (correctly-oriented) byte-order mark is not
 * stripped but passes as a zero-width non-breaking space. The unicode text is presented as a
 * Java String (the UTF-16 representation used by {@link PyUnicode}).
 *
 * @param bytes to be decoded (Jython {@link PyString} convention)
 * @param errors error policy name (e.g. "ignore", "replace")
 * @param isFinal if a "final" call, meaning the input must all be consumed
 * @return tuple (unicode_result, bytes_consumed)
 */
public static PyTuple utf_32_le_decode(String bytes, String errors, boolean isFinal) {
    final boolean reportOrder = false; // two-element tuple wanted
    return PyUnicode_DecodeUTF32Stateful(bytes, errors, ByteOrder.LE, isFinal, reportOrder);
}
/**
 * Decode (perhaps partially) a sequence of bytes representing the UTF-32 big-endian encoded
 * form of a Unicode string, using the default ("strict") error policy, and return as a tuple
 * the unicode text and the amount of input consumed. A (correctly-oriented) byte-order mark
 * will pass as a zero-width non-breaking space. The unicode text is presented as a Java String
 * (the UTF-16 representation used by {@link PyUnicode}). This is not a "final" call: trailing
 * bytes that do not make up a whole 4-byte unit are left unconsumed.
 *
 * @param bytes to be decoded (Jython {@link PyString} convention)
 * @return tuple (unicode_result, bytes_consumed)
 */
public static PyTuple utf_32_be_decode(String bytes) {
    final String strict = null; // null selects the default error policy
    return utf_32_be_decode(bytes, strict);
}
/**
 * Decode (perhaps partially) a sequence of bytes representing the UTF-32 big-endian encoded
 * form of a Unicode string and return as a tuple the unicode text and the amount of input
 * consumed. A (correctly-oriented) byte-order mark will pass as a zero-width non-breaking
 * space. The unicode text is presented as a Java String (the UTF-16 representation used by
 * {@link PyUnicode}). This is not a "final" call: trailing bytes that do not make up a whole
 * 4-byte unit are left unconsumed.
 *
 * @param bytes to be decoded (Jython {@link PyString} convention)
 * @param errors error policy name (e.g. "ignore", "replace")
 * @return tuple (unicode_result, bytes_consumed)
 */
public static PyTuple utf_32_be_decode(String bytes, String errors) {
    final boolean isFinal = false; // more input may follow
    return utf_32_be_decode(bytes, errors, isFinal);
}
/**
 * Decode (perhaps partially) a sequence of bytes representing the UTF-32 big-endian encoded
 * form of a Unicode string and return as a tuple the unicode text and the amount of input
 * consumed. Because the byte order is fixed, a (correctly-oriented) byte-order mark is not
 * stripped but passes as a zero-width non-breaking space. The unicode text is presented as a
 * Java String (the UTF-16 representation used by {@link PyUnicode}).
 *
 * @param bytes to be decoded (Jython {@link PyString} convention)
 * @param errors error policy name (e.g. "ignore", "replace")
 * @param isFinal if a "final" call, meaning the input must all be consumed
 * @return tuple (unicode_result, bytes_consumed)
 */
public static PyTuple utf_32_be_decode(String bytes, String errors, boolean isFinal) {
    final boolean reportOrder = false; // two-element tuple wanted
    return PyUnicode_DecodeUTF32Stateful(bytes, errors, ByteOrder.BE, isFinal, reportOrder);
}
/**
 * Decode (perhaps partially) a sequence of bytes representing the UTF-32 encoded form of a
 * Unicode string and return as a tuple the unicode text, the amount of input consumed, and the
 * decoding "endianness" used (in the Python -1, 0, +1 convention). The endianness, if
 * unspecified (=0), will be deduced from a byte-order mark and returned. (This codec entrypoint
 * is used in that way in the <code>utf_32.py</code> codec, but only until the byte order is
 * known.) When not defined by a BOM, processing assumes big-endian coding, but returns
 * "unspecified". (The <code>utf_32.py</code> codec treats this as an error, once more than 4
 * bytes have been processed.) The unicode text is presented as a Java String (the UTF-16
 * representation used by {@link PyUnicode}). This is not a "final" call.
 *
 * @param bytes to be decoded (Jython {@link PyString} convention)
 * @param errors error policy name (e.g. "ignore", "replace")
 * @param byteorder decoding "endianness" specified (in the Python -1, 0, +1 convention)
 * @return tuple (unicode_result, bytes_consumed, endianness)
 */
public static PyTuple utf_32_ex_decode(String bytes, String errors, int byteorder) {
    final boolean isFinal = false; // more input may follow
    return utf_32_ex_decode(bytes, errors, byteorder, isFinal);
}
/**
 * Decode (perhaps partially) a sequence of bytes representing the UTF-32 encoded form of a
 * Unicode string and return as a tuple the unicode text, the amount of input consumed, and the
 * decoding "endianness" used (in the Python -1, 0, +1 convention). The endianness will be that
 * specified, or will have been deduced from a byte-order mark, if present. It may still be
 * undefined if no byte-order mark was found (in which case the data were decoded as
 * big-endian) or if fewer than 4 bytes were presented. (This codec entrypoint is used in the
 * utf-32 codec only until the byte order is known.) The unicode text is presented as a Java
 * String (the UTF-16 representation used by {@link PyUnicode}).
 *
 * @param bytes to be decoded (Jython {@link PyString} convention)
 * @param errors error policy name (e.g. "ignore", "replace")
 * @param byteorder decoding "endianness" specified (in the Python -1, 0, +1 convention)
 * @param isFinal if a "final" call, meaning the input must all be consumed
 * @return tuple (unicode_result, bytes_consumed, endianness)
 */
public static PyTuple utf_32_ex_decode(String bytes, String errors, int byteorder,
        boolean isFinal) {
    // Translate the Python integer convention, then ask for the three-element result
    return PyUnicode_DecodeUTF32Stateful(bytes, errors, ByteOrder.fromInt(byteorder), isFinal,
            true);
}
/**
 * Decode (perhaps partially) a sequence of bytes representing the UTF-32 encoded form of a
 * Unicode string and return as a tuple the (Jython internal representation of) the unicode
 * text, the amount of input consumed, and if requested, the decoding "endianness" used (in
 * Python -1, 0, +1 conventions). The state we preserve is our read position, i.e. how many
 * bytes we have consumed and the byte order (endianness). If the input ends part way through a
 * UTF-32 sequence (4 bytes) the data reported as consumed is just that up to and not including
 * the first of these bytes. The Java String in the returned tuple is a UTF-16 representation of
 * the Unicode result, in line with Java conventions, where Unicode characters above the BMP are
 * represented as surrogate pairs.
 *
 * @param bytes input represented as String (Jython PyString convention)
 * @param errors error policy name (e.g. "ignore", "replace")
 * @param order LE, BE or UNDEFINED (meaning bytes may begin with a byte order mark)
 * @param isFinal if a "final" call, meaning the input must all be consumed
 * @param findOrder if the returned tuple should include a report of the byte order
 * @return tuple (unicode_result, bytes_consumed [, endianness])
 */
private static PyTuple PyUnicode_DecodeUTF32Stateful(String bytes, String errors,
        ByteOrder order, boolean isFinal, boolean findOrder) {
    // Bytes on offer, and the part of that which consists of whole 4-byte units
    final int available = bytes.length();
    final int wholeUnits = available & ~0x3;
    // Decoded text accumulates here as UTF-16
    StringBuilder text = new StringBuilder(1 + wholeUnits / 4);
    int consumed = 0;

    if (wholeUnits > 0) {
        /*
         * Only when the byte order is unknown do the first 4 bytes get examined as a possible
         * byte-order mark. A recognised mark fixes the order and is dropped from the output.
         * When the order is already LE or BE, such bytes are a ZERO WIDTH NON-BREAKING SPACE
         * and are left for the main loop to decode as data.
         */
        if (order == ByteOrder.UNDEFINED) {
            if (bytes.startsWith("\u00ff\u00fe\u0000\u0000")) {
                // Little-endian mark (0xff 0xfe 0x00 0x00)
                order = ByteOrder.LE;
                consumed = 4;
            } else if (bytes.startsWith("\u0000\u0000\u00fe\u00ff")) {
                // Big-endian mark (0x00 0x00 0xfe 0xff)
                order = ByteOrder.BE;
                consumed = 4;
            }
            /*
             * Without a mark the order stays undefined. That is an error to utf_32.py, but
             * here the data are simply decoded as big-endian.
             */
        }

        // One loop per endianness; each consumes 4 bytes per code point up to wholeUnits
        consumed = (order == ByteOrder.LE) //
                ? PyUnicode_DecodeUTF32LELoop(text, bytes, consumed, wholeUnits, errors)
                : PyUnicode_DecodeUTF32BELoop(text, bytes, consumed, wholeUnits, errors);
    }

    // Left-over bytes cannot be held for a later call if this call is the last
    if (isFinal && consumed < available) {
        consumed = codecs.insertReplacementAndGetResume(text, errors, "utf-32", //
                bytes, consumed, available, "truncated data");
    }

    // The shape of the result depends on whether the byte order was asked for
    String result = text.toString();
    return findOrder ? decode_tuple(result, consumed, order) : decode_tuple(result, consumed);
}
/**
 * Helper to {@link #PyUnicode_DecodeUTF32Stateful(String, String, ByteOrder, boolean, boolean)}
 * when big-endian decoding is to be carried out. Each pass takes 4 bytes and emits one code
 * point, until the read position reaches <code>limit</code>.
 *
 * @param unicode character output
 * @param bytes input represented as String (Jython PyString convention)
 * @param q number of elements already consumed from <code>bytes</code> array
 * @param limit (multiple of 4) first byte not to process
 * @param errors error policy name (e.g. "ignore", "replace")
 * @return number of elements consumed now from <code>bytes</code> array
 */
private static int PyUnicode_DecodeUTF32BELoop(StringBuilder unicode, String bytes, int q,
        int limit, String errors) {
    while (q < limit) {
        // Assemble the 4 bytes as two 16-bit halves, most significant byte first
        int upper = (bytes.charAt(q) << 8) | bytes.charAt(q + 1);
        int lower = (bytes.charAt(q + 2) << 8) | bytes.charAt(q + 3);

        if (upper == 0) {
            // Upper half clear: a BMP character, emitted directly
            unicode.append((char)lower);
            q += 4;
            continue;
        }

        // Otherwise the value may lie outside the Unicode range and has to be checked
        int codePoint = (upper << 16) + lower;
        if (Character.isValidCodePoint(codePoint)) {
            unicode.appendCodePoint(codePoint);
            q += 4;
        } else {
            q = codecs.insertReplacementAndGetResume(unicode, errors, "utf-32", //
                    bytes, q, q + 4, "codepoint not in range(0x110000)");
        }
    }
    return q;
}
/**
 * Helper to {@link #PyUnicode_DecodeUTF32Stateful(String, String, ByteOrder, boolean, boolean)}
 * when little-endian decoding is to be carried out. Each pass takes 4 bytes and emits one code
 * point, until the read position reaches <code>limit</code>.
 *
 * @param unicode character output
 * @param bytes input represented as String (Jython PyString convention)
 * @param q number of elements already consumed from <code>bytes</code> array
 * @param limit (multiple of 4) first byte not to process
 * @param errors error policy name (e.g. "ignore", "replace")
 * @return number of elements consumed now from <code>bytes</code> array
 */
private static int PyUnicode_DecodeUTF32LELoop(StringBuilder unicode, String bytes, int q,
        int limit, String errors) {
    while (q < limit) {
        // Assemble the 4 bytes as two 16-bit halves, least significant byte first
        int upper = (bytes.charAt(q + 3) << 8) | bytes.charAt(q + 2);
        int lower = (bytes.charAt(q + 1) << 8) | bytes.charAt(q);

        if (upper == 0) {
            // Upper half clear: a BMP character, emitted directly
            unicode.append((char)lower);
            q += 4;
            continue;
        }

        // Otherwise the value may lie outside the Unicode range and has to be checked
        int codePoint = (upper << 16) + lower;
        if (Character.isValidCodePoint(codePoint)) {
            unicode.appendCodePoint(codePoint);
            q += 4;
        } else {
            q = codecs.insertReplacementAndGetResume(unicode, errors, "utf-32", //
                    bytes, q, q + 4, "codepoint not in range(0x110000)");
        }
    }
    return q;
}
/* --- RawUnicodeEscape Codec ----------------------------------------- */
/**
 * Encode unicode text in the raw-unicode-escape form using the default error policy.
 *
 * @param str unicode text to be encoded
 * @return tuple (encoded_bytes, unicode_consumed)
 */
public static PyTuple raw_unicode_escape_encode(String str) {
    final String defaultPolicy = null;
    return raw_unicode_escape_encode(str, defaultPolicy);
}
/**
 * Encode unicode text in the raw-unicode-escape form. The work is done by
 * {@link codecs#PyUnicode_EncodeRawUnicodeEscape}; the whole of <code>str</code> is reported as
 * consumed.
 *
 * @param str unicode text to be encoded
 * @param errors error policy name passed on to the encoder
 * @return tuple (encoded_bytes, unicode_consumed)
 */
public static PyTuple raw_unicode_escape_encode(String str, String errors) {
    String encoded = codecs.PyUnicode_EncodeRawUnicodeEscape(str, errors, false);
    return encode_tuple(encoded, str.length());
}
/**
 * Decode bytes in the raw-unicode-escape form using the default error policy.
 *
 * @param str bytes to be decoded (Jython {@link PyString} convention)
 * @return tuple (unicode_result, bytes_consumed)
 */
public static PyTuple raw_unicode_escape_decode(String str) {
    final String defaultPolicy = null;
    return raw_unicode_escape_decode(str, defaultPolicy);
}
/**
 * Decode bytes in the raw-unicode-escape form. The work is done by
 * {@link codecs#PyUnicode_DecodeRawUnicodeEscape}; the whole of <code>str</code> is reported as
 * consumed.
 *
 * @param str bytes to be decoded (Jython {@link PyString} convention)
 * @param errors error policy name passed on to the decoder
 * @return tuple (unicode_result, bytes_consumed)
 */
public static PyTuple raw_unicode_escape_decode(String str, String errors) {
    String decoded = codecs.PyUnicode_DecodeRawUnicodeEscape(str, errors);
    return decode_tuple(decoded, str.length());
}
/* --- unicode-escape Codec ------------------------------------------- */
/**
 * Encode unicode text in the unicode-escape form using the default error policy.
 *
 * @param str unicode text to be encoded
 * @return tuple (encoded_bytes, unicode_consumed)
 */
public static PyTuple unicode_escape_encode(String str) {
    final String defaultPolicy = null;
    return unicode_escape_encode(str, defaultPolicy);
}
/**
 * Encode unicode text in the unicode-escape form. The work is done by
 * {@link PyString#encode_UnicodeEscape}; the whole of <code>str</code> is reported as consumed.
 *
 * @param str unicode text to be encoded
 * @param errors error policy name (accepted but not used by this method)
 * @return tuple (encoded_bytes, unicode_consumed)
 */
public static PyTuple unicode_escape_encode(String str, String errors) {
    String escaped = PyString.encode_UnicodeEscape(str, false);
    return encode_tuple(escaped, str.length());
}
/**
 * Decode bytes in the unicode-escape form using the default error policy.
 *
 * @param str bytes to be decoded (Jython {@link PyString} convention)
 * @return tuple (unicode_result, bytes_consumed)
 */
public static PyTuple unicode_escape_decode(String str) {
    final String defaultPolicy = null;
    return unicode_escape_decode(str, defaultPolicy);
}
/**
 * Decode bytes in the unicode-escape form. The work is done by
 * {@link PyString#decode_UnicodeEscape} over the full extent of <code>str</code>, all of which
 * is reported as consumed.
 *
 * @param str bytes to be decoded (Jython {@link PyString} convention)
 * @param errors error policy name passed on to the decoder
 * @return tuple (unicode_result, bytes_consumed)
 */
public static PyTuple unicode_escape_decode(String str, String errors) {
    final int length = str.length();
    String decoded = PyString.decode_UnicodeEscape(str, 0, length, errors, true);
    return decode_tuple(decoded, length);
}
/* --- UnicodeInternal Codec ------------------------------------------ */
/*
* This codec is supposed to deal with an encoded form equal to the internal representation of
* the unicode object considered as bytes in memory. This was confusing in CPython as it varied
* with machine architecture (width and endian-ness). In Jython, where both are fixed, the most
* compatible choice is UTF-32BE. The codec is deprecated in v3.3 as irrelevant, or impossible,
* in view of the flexible string representation (which Jython emulates in its own way).
*
* See http://mail.python.org/pipermail/python-dev/2011-November/114415.html
*/
/**
 * Legacy method to encode given unicode in CPython wide-build internal format (equivalent
 * UTF-32BE), using the default error policy.
 *
 * @param unicode text to be encoded
 * @return the result of <code>utf_32_be_encode</code> for the same text
 */
@Deprecated
public static PyTuple unicode_internal_encode(String unicode) {
    final String defaultPolicy = null;
    return utf_32_be_encode(unicode, defaultPolicy);
}
/**
 * Legacy method to encode given unicode in CPython wide-build internal format (equivalent
 * UTF-32BE).
 *
 * @param unicode text to be encoded
 * @param errors error policy name (e.g. "ignore", "replace")
 * @return the result of <code>utf_32_be_encode</code> for the same arguments
 */
@Deprecated
public static PyTuple unicode_internal_encode(String unicode, String errors) {
    PyTuple encoded = utf_32_be_encode(unicode, errors);
    return encoded;
}
/**
 * Legacy method to decode given bytes as if CPython wide-build internal format (equivalent
 * UTF-32BE), using the default error policy. The call is treated as "final", so there must be
 * a multiple of 4 bytes.
 *
 * @param bytes to be decoded (Jython {@link PyString} convention)
 * @return tuple (unicode_result, bytes_consumed)
 */
@Deprecated
public static PyTuple unicode_internal_decode(String bytes) {
    final String defaultPolicy = null;
    final boolean isFinal = true; // nothing may be left over
    return utf_32_be_decode(bytes, defaultPolicy, isFinal);
}
/**
 * Legacy method to decode given bytes as if CPython wide-build internal format (equivalent
 * UTF-32BE). The call is treated as "final", so there must be a multiple of 4 bytes.
 *
 * @param bytes to be decoded (Jython {@link PyString} convention)
 * @param errors error policy name (e.g. "ignore", "replace")
 * @return tuple (unicode_result, bytes_consumed)
 */
@Deprecated
public static PyTuple unicode_internal_decode(String bytes, String errors) {
    final boolean isFinal = true; // nothing may be left over
    return utf_32_be_decode(bytes, errors, isFinal);
}
/**
 * Optimized charmap encoder mapping.
 *
 * Uses a trie structure instead of a dictionary; the speedup primarily comes from not creating
 * integer objects in the process. The trie is created by inverting the encoding map.
 * <p>
 * A character is split into three fields: the top 5 bits index <code>level1</code>, the next 4
 * bits index a 16-entry block in the first part of <code>level23</code>, and the low 7 bits
 * index a 128-entry block in the second part of <code>level23</code>, where the encoded byte is
 * found. The value 0xFF marks an unused entry in the first two levels and 0 marks an unmapped
 * character in the third.
 */
@Untraversable
@ExposedType(name = "EncodingMap", isBaseType = false)
public static class EncodingMap extends PyObject {

    /** Level 1: block number at level 2 for each value of the top 5 bits, or 0xFF. */
    char[] level1;
    /** Level 2 blocks (16 entries each) followed by level 3 blocks (128 entries each). */
    char[] level23;
    /** Number of 16-entry blocks at level 2. */
    int count2;
    /** Number of 128-entry blocks at level 3. */
    int count3;

    private EncodingMap(char[] level1, char[] level23, int count2, int count3) {
        this.level1 = level1;
        this.level23 = level23;
        this.count2 = count2;
        this.count3 = count3;
    }

    /**
     * Create and populate an EncodingMap from a 256 length PyUnicode char. Returns a
     * PyDictionary if the mapping isn't easily optimized, that is, if character 0 is not
     * mapped to byte 0, if a zero appears elsewhere in the table, if the table contains
     * characters that need a surrogate pair, or if the trie would need too many blocks.
     *
     * @param string a 256 length unicode mapping
     * @return an encoder mapping
     */
    public static PyObject buildEncodingMap(PyObject string) {
        if (!(string instanceof PyUnicode) || string.__len__() != 256) {
            throw Py.TypeError("bad argument type for built-in operation");
        }
        String decode = string.toString();
        /*
         * The trie is keyed by char and the loops below index the table with charAt(i). That
         * is only valid when each of the 256 entries occupies exactly one char. A longer
         * String means some entry is a surrogate pair, which would shift every later entry, so
         * fall back to a dictionary keyed by code point.
         * NOTE(review): assumes __len__() of a PyUnicode counts code points while toString()
         * is UTF-16 -- confirm. If it counts chars this test is never true and nothing changes.
         */
        if (decode.length() != 256) {
            return buildDictionary(decode);
        }
        boolean needDict = false;
        char[] level1 = new char[32];
        char[] level23 = new char[512];
        int i;
        int count2 = 0;
        int count3 = 0;
        for (i = 0; i < level1.length; i++) {
            level1[i] = 0xFF;
        }
        for (i = 0; i < level23.length; i++) {
            level23[i] = 0xFF;
        }
        // lookup() hard-wires 0 -> 0 and uses 0 for "unmapped", so zero needs care
        if (decode.charAt(0) != 0) {
            needDict = true;
        }
        // First pass: count the blocks needed at levels 2 and 3
        for (i = 1; i < 256; i++) {
            int l1, l2;
            char charAt = decode.charAt(i);
            if (charAt == 0) {
                needDict = true;
            }
            if (charAt == 0xFFFE) {
                // unmapped character
                continue;
            }
            l1 = charAt >> 11;
            l2 = charAt >> 7;
            if (level1[l1] == 0xFF) {
                level1[l1] = (char)count2++;
            }
            if (level23[l2] == 0xFF) {
                level23[l2] = (char)count3++;
            }
        }
        // A block number of 0xFF would be indistinguishable from the "unused" marker
        if (count2 > 0xFF || count3 > 0xFF) {
            needDict = true;
        }
        if (needDict) {
            return buildDictionary(decode);
        }
        // Create a three-level trie
        int length2 = 16 * count2;
        int length3 = 128 * count3;
        level23 = new char[length2 + length3];
        PyObject result = new EncodingMap(level1, level23, count2, count3);
        for (i = 0; i < length2; i++) {
            level23[i] = 0xFF;
        }
        for (i = length2; i < length2 + length3; i++) {
            level23[i] = 0;
        }
        // Second pass: allocate level 3 blocks in order of first use and store the bytes
        count3 = 0;
        for (i = 1; i < 256; i++) {
            int o1, o2, o3, i2, i3;
            char charAt = decode.charAt(i);
            if (charAt == 0xFFFE) {
                // unmapped character
                continue;
            }
            o1 = charAt >> 11;
            o2 = (charAt >> 7) & 0xF;
            i2 = 16 * level1[o1] + o2;
            if (level23[i2] == 0xFF) {
                level23[i2] = (char)count3++;
            }
            o3 = charAt & 0x7F;
            i3 = 128 * level23[i2] + o3;
            level23[length2 + i3] = (char)i;
        }
        return result;
    }

    /**
     * Build the fall-back mapping: a dictionary from each code point in the decoding table to
     * its position in the table (counted in code points).
     *
     * @param decode the decoding table as UTF-16
     * @return dictionary mapping code point to position
     */
    private static PyObject buildDictionary(String decode) {
        PyObject result = new PyDictionary();
        int index = 0;
        for (int p = 0; p < decode.length(); index++) {
            int codePoint = decode.codePointAt(p);
            result.__setitem__(Py.newInteger(codePoint), Py.newInteger(index));
            p += Character.charCount(codePoint);
        }
        return result;
    }

    /**
     * Lookup a char in the EncodingMap. Character 0 always yields 0.
     *
     * @param c a char
     * @return an int, -1 for failure
     */
    public int lookup(char c) {
        int l1 = c >> 11;
        int l2 = (c >> 7) & 0xF;
        int l3 = c & 0x7F;
        int i;
        if (c == 0) {
            return 0;
        }
        // level 1
        i = level1[l1];
        if (i == 0xFF) {
            return -1;
        }
        // level 2
        i = level23[16 * i + l2];
        if (i == 0xFF) {
            return -1;
        }
        // level 3
        i = level23[16 * count2 + 128 * i + l3];
        if (i == 0) {
            return -1;
        }
        return i;
    }
}
}