All downloads are free. Search and download functionality uses the official Maven repository.

src.org.python.modules._codecs Maven / Gradle / Ivy

Go to download

Jython is an implementation of the high-level, dynamic, object-oriented language Python written in 100% Pure Java, and seamlessly integrated with the Java platform. It thus allows you to run Python on any Java platform.

There is a newer version: 2.7.4
Show newest version
/*
 * Copyright (c)2013 Jython Developers. Original Java version copyright 2000 Finn Bock.
 *
 * This program contains material copyrighted by: Copyright (c) Corporation for National Research
 * Initiatives. Originally written by Marc-Andre Lemburg (mal@lemburg.com).
 */
package org.python.modules;

import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Iterator;

import org.python.core.Py;
import org.python.core.PyDictionary;
import org.python.core.PyInteger;
import org.python.core.PyNone;
import org.python.core.PyObject;
import org.python.core.PyString;
import org.python.core.PySystemState;
import org.python.core.PyTuple;
import org.python.core.PyUnicode;
import org.python.core.codecs;
import org.python.core.Untraversable;
import org.python.expose.ExposedType;

/**
 * This class corresponds to the Python _codecs module, which in turn lends its functions to the
 * codecs module (in Lib/codecs.py). It exposes the implementing functions of several codec families
 * called out in the Python codecs library Lib/encodings/*.py, where it is usually claimed that they
 * are bound "as C functions". Obviously, C stands for "compiled" in this context, rather than
 * dependence on a particular implementation language. Actual transcoding methods often come from
 * the related {@link codecs} class.
 */
public class _codecs {

    /**
     * Register a codec search function. The argument is forwarded unchanged to
     * <code>codecs.register</code>.
     *
     * @param search_function object passed on to the codec registry
     */
    public static void register(PyObject search_function) {
        codecs.register(search_function);
    }

    /**
     * Convert a Python string argument to a Java <code>String</code>. A <code>PyUnicode</code>
     * yields its string value directly; any other <code>PyString</code> is passed through
     * <code>codecs.PyUnicode_EncodeASCII</code> first.
     *
     * @param pystr the argument to convert (may be <code>null</code>)
     * @return <code>null</code> when <code>pystr</code> is <code>null</code>, otherwise the Java
     *         string as described above
     */
    private static String _castString(PyString pystr) {
        // Jython once treated String as interchangeable with PyString (or PyUnicode) wherever
        // that seemed sensible. This check is inserted where stricter compliance is needed.
        if (pystr == null) {
            return null;
        }
        final String text = pystr.toString();
        if (!(pystr instanceof PyUnicode)) {
            // May throw UnicodeEncodeError, per CPython behavior
            return codecs.PyUnicode_EncodeASCII(text, text.length(), null);
        }
        return text;
    }

    /**
     * Look up a codec by name, after converting the name with
     * {@link #_castString(PyString)}.
     *
     * @param encoding name of the encoding
     * @return whatever <code>codecs.lookup</code> returns for that name
     */
    public static PyTuple lookup(PyString encoding) {
        final String encodingName = _castString(encoding);
        return codecs.lookup(encodingName);
    }

    /**
     * Look up an error handler by name, after converting the name with
     * {@link #_castString(PyString)}.
     *
     * @param handlerName name of the error handler
     * @return whatever <code>codecs.lookup_error</code> returns for that name
     */
    public static PyObject lookup_error(PyString handlerName) {
        final String name = _castString(handlerName);
        return codecs.lookup_error(name);
    }

    /**
     * Register an error handler under a name. Both arguments are forwarded unchanged to
     * <code>codecs.register_error</code>.
     *
     * @param name name under which the handler is registered
     * @param errorHandler the handler object
     */
    public static void register_error(String name, PyObject errorHandler) {
        codecs.register_error(name, errorHandler);
    }

    /**
     * Decode bytes using the system default encoding (see
     * {@link codecs#getDefaultEncoding()}). Decoding errors raise a ValueError.
     *
     * @param bytes to be decoded
     * @return Unicode string decoded from <code>bytes</code>
     */
    public static PyObject decode(PyString bytes) {
        return decode(bytes, null, null);
    }

    /**
     * Decode bytes using the codec registered for the <code>encoding</code>. The
     * <code>encoding</code> defaults to the system default encoding (see
     * {@link codecs#getDefaultEncoding()}). Decoding errors raise a ValueError.
     *
     * @param bytes to be decoded
     * @param encoding name of encoding (to look up in codec registry)
     * @return Unicode string decoded from <code>bytes</code>
     */
    public static PyObject decode(PyString bytes, PyString encoding) {
        return decode(bytes, encoding, null);
    }

    /**
     * Decode bytes using the codec registered for the <code>encoding</code>. The
     * <code>encoding</code> defaults to the system default encoding (see
     * {@link codecs#getDefaultEncoding()}). The string <code>errors</code> may name a different
     * error handling policy (built-in or registered with
     * {@link #register_error(String, PyObject)}). The default error policy is 'strict', meaning
     * that decoding errors raise a ValueError.
     *
     * @param bytes to be decoded
     * @param encoding name of encoding (to look up in codec registry)
     * @param errors error policy name (e.g. "ignore")
     * @return Unicode string decoded from <code>bytes</code>
     */
    public static PyObject decode(PyString bytes, PyString encoding, PyString errors) {
        // Both names go through the same string check before reaching the codec machinery.
        final String encodingName = _castString(encoding);
        final String errorPolicy = _castString(errors);
        return codecs.decode(bytes, encodingName, errorPolicy);
    }

    /**
     * Encode unicode using the system default encoding (see
     * {@link codecs#getDefaultEncoding()}). Encoding errors raise a ValueError.
     *
     * @param unicode string to be encoded
     * @return bytes object encoding <code>unicode</code>
     */
    public static PyString encode(PyUnicode unicode) {
        return encode(unicode, null, null);
    }

    /**
     * Encode unicode using the codec registered for the <code>encoding</code>. The
     * <code>encoding</code> defaults to the system default encoding (see
     * {@link codecs#getDefaultEncoding()}). Encoding errors raise a ValueError.
     *
     * @param unicode string to be encoded
     * @param encoding name of encoding (to look up in codec registry)
     * @return bytes object encoding <code>unicode</code>
     */
    public static PyString encode(PyUnicode unicode, PyString encoding) {
        return encode(unicode, encoding, null);
    }

    /**
     * Encode unicode using the codec registered for the <code>encoding</code>. The
     * <code>encoding</code> defaults to the system default encoding (see
     * {@link codecs#getDefaultEncoding()}). The string <code>errors</code> may name a different
     * error handling policy (built-in or registered with
     * {@link #register_error(String, PyObject)}). The default error policy is 'strict', meaning
     * that encoding errors raise a ValueError.
     *
     * @param unicode string to be encoded
     * @param encoding name of encoding (to look up in codec registry)
     * @param errors error policy name (e.g. "ignore")
     * @return bytes object encoding <code>unicode</code>
     */
    public static PyString encode(PyUnicode unicode, PyString encoding, PyString errors) {
        // Both names go through the same string check before reaching the codec machinery.
        final String encodingName = _castString(encoding);
        final String errorPolicy = _castString(errors);
        return Py.newString(codecs.encode(unicode, encodingName, errorPolicy));
    }

    /* --- Some codec support methods -------------------------------------------- */

    /**
     * Build an encoding map from a unicode string by delegating to
     * <code>EncodingMap.buildEncodingMap</code>.
     *
     * @param map the unicode string from which the map is built
     * @return the object returned by <code>EncodingMap.buildEncodingMap(map)</code>
     */
    public static PyObject charmap_build(PyUnicode map) {
        return EncodingMap.buildEncodingMap(map);
    }

    /**
     * Enumeration representing the possible endianness of UTF-32 (possibly UTF-16) encodings.
     * Python uses integers <code>{-1, 0, 1}</code>, but we can be more expressive. For encoding
     * UNDEFINED means choose the endianness of the platform and insert a byte order mark (BOM). But
     * since the platform is Java, that is always big-endian. For decoding it means read the BOM
     * from the stream, and it is an error not to find one (compare
     * <code>Lib/encodings/utf_32.py</code>).
     */
    enum ByteOrder {
        LE(-1), UNDEFINED(0), BE(1);

        /** The Python integer code, stored explicitly so it does not depend on declaration order. */
        private final int code;

        ByteOrder(int code) {
            this.code = code;
        }

        /** Returns the Python equivalent code -1 = LE, 0 = as marked/platform, +1 = BE */
        int code() {
            return code;
        }

        /**
         * Returns the constant equivalent to the Python code -1 = LE, +1 = BE. Every other value
         * (including 0 = as marked/platform) maps to UNDEFINED.
         */
        static ByteOrder fromInt(int byteorder) {
            switch (byteorder) {
                case -1:
                    return LE;
                case 1:
                    return BE;
                default:
                    return UNDEFINED;
            }
        }
    }

    /**
     * Build the usual decoder result from the decoded text and a byte count.
     *
     * @param u the unicode result as a UTF-16 Java String
     * @param bytesConsumed the number of bytes consumed
     * @return the tuple <code>(unicode(u), bytesConsumed)</code>
     */
    private static PyTuple decode_tuple(String u, int bytesConsumed) {
        final PyUnicode text = new PyUnicode(u);
        return new PyTuple(text, Py.newInteger(bytesConsumed));
    }

    /**
     * Build the usual decoder result from the decoded text, taking the byte count from a
     * one-element array when it is supplied and from a fallback value when it is not.
     *
     * @param u the unicode result as a UTF-16 Java String
     * @param consumed if not null, element [0] is the number of bytes consumed
     * @param defConsumed if consumed==null, use this as the number of bytes consumed
     * @return the tuple <code>(unicode(u), bytesConsumed)</code>
     */
    private static PyTuple decode_tuple(String u, int[] consumed, int defConsumed) {
        if (consumed == null) {
            return decode_tuple(u, defConsumed);
        }
        return decode_tuple(u, consumed[0]);
    }

    /**
     * Build the result of decoders that infer the byte order from the byte-order mark: the
     * decoded text, a byte count and the integer code of the byte order.
     *
     * @param u the unicode result as a UTF-16 Java String
     * @param bytesConsumed the number of bytes consumed
     * @param order the byte order (deduced by codec)
     * @return the tuple <code>(unicode(u), bytesConsumed, byteOrder)</code>
     */
    private static PyTuple decode_tuple(String u, int bytesConsumed, ByteOrder order) {
        // Resolve the code first, before any Python objects are created.
        final int orderCode = order.code();
        final PyUnicode text = new PyUnicode(u);
        return new PyTuple(text, Py.newInteger(bytesConsumed), Py.newInteger(orderCode));
    }

    /**
     * Build a decoder result whose first element is a (byte) <code>PyString</code> rather than a
     * <code>PyUnicode</code>.
     *
     * @param s the decoded content
     * @param len the count to report as the second element
     * @return the tuple <code>(str(s), len)</code>
     */
    private static PyTuple decode_tuple_str(String s, int len) {
        final PyString content = new PyString(s);
        return new PyTuple(content, Py.newInteger(len));
    }

    /**
     * Build the usual encoder result: the encoded content as a <code>PyString</code> and a count.
     *
     * @param s the encoded content
     * @param len the count to report as the second element
     * @return the tuple <code>(str(s), len)</code>
     */
    private static PyTuple encode_tuple(String s, int len) {
        final PyString content = new PyString(s);
        return new PyTuple(content, Py.newInteger(len));
    }

    /* --- UTF-8 Codec --------------------------------------------------- */
    /**
     * Decode UTF-8 bytes with a <code>null</code> error policy and <code>final_</code> false.
     *
     * @param str bytes to decode
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_8_decode(String str) {
        return utf_8_decode(str, null);
    }

    /**
     * Decode UTF-8 bytes with <code>final_</code> false.
     *
     * @param str bytes to decode
     * @param errors error policy name
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_8_decode(String str, String errors) {
        return utf_8_decode(str, errors, false);
    }

    /**
     * Decode UTF-8 bytes, taking the final flag from the truth value of a Python object.
     *
     * @param str bytes to decode
     * @param errors error policy name
     * @param final_ object whose <code>__nonzero__()</code> supplies the final flag
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_8_decode(String str, String errors, PyObject final_) {
        final boolean isFinal = final_.__nonzero__();
        return utf_8_decode(str, errors, isFinal);
    }

    /**
     * Decode UTF-8 bytes via <code>codecs.PyUnicode_DecodeUTF8Stateful</code>. When
     * <code>final_</code> is true no counter is passed to the decoder and the reported count is
     * the full input length; otherwise the count is whatever the decoder stored in the counter.
     *
     * @param str bytes to decode
     * @param errors error policy name
     * @param final_ true if this is the last piece of input
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_8_decode(String str, String errors, boolean final_) {
        final int[] consumed = final_ ? null : new int[1];
        final String decoded = codecs.PyUnicode_DecodeUTF8Stateful(str, errors, consumed);
        // consumed is null exactly when final_ is true, so the helper picks the right count.
        return decode_tuple(decoded, consumed, str.length());
    }

    /**
     * Encode a string as UTF-8 with a <code>null</code> error policy.
     *
     * @param str to be encoded
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple utf_8_encode(String str) {
        return utf_8_encode(str, null);
    }

    /**
     * Encode a string as UTF-8 via <code>codecs.PyUnicode_EncodeUTF8</code>.
     *
     * @param str to be encoded
     * @param errors error policy name
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple utf_8_encode(String str, String errors) {
        final int charCount = str.length();
        final String encoded = codecs.PyUnicode_EncodeUTF8(str, errors);
        return encode_tuple(encoded, charCount);
    }

    /* --- UTF-7 Codec --------------------------------------------------- */
    /**
     * Decode UTF-7 bytes with a <code>null</code> error policy, by delegating to the
     * two-argument overload.
     *
     * @param bytes to be decoded
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_7_decode(String bytes) {
        return utf_7_decode(bytes, null);
    }

    /**
     * Decode UTF-7 bytes with the given error policy, treating the input as not final.
     *
     * @param bytes to be decoded
     * @param errors error policy name (e.g. "ignore")
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_7_decode(String bytes, String errors) {
        // Forward the caller's policy: this used to pass null and silently drop it.
        return utf_7_decode(bytes, errors, false);
    }

    /**
     * Decode UTF-7 bytes via <code>codecs.PyUnicode_DecodeUTF7Stateful</code>. When
     * <code>finalFlag</code> is true no counter is passed to the decoder and the reported count
     * is the full input length; otherwise the count is whatever the decoder stored in the
     * counter.
     *
     * @param bytes to be decoded
     * @param errors error policy name
     * @param finalFlag true if this is the last piece of input
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_7_decode(String bytes, String errors, boolean finalFlag) {
        final int[] consumed = finalFlag ? null : new int[1];
        final String text = codecs.PyUnicode_DecodeUTF7Stateful(bytes, errors, consumed);
        final int inputLength = bytes.length();
        final int used = (consumed != null) ? consumed[0] : inputLength;
        return decode_tuple(text, used);
    }

    /**
     * Encode a string as UTF-7 with a <code>null</code> error policy.
     *
     * @param str to be encoded
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple utf_7_encode(String str) {
        return utf_7_encode(str, null);
    }

    /**
     * Encode a string as UTF-7 via <code>codecs.PyUnicode_EncodeUTF7</code>, with both of that
     * method's boolean options set to false.
     *
     * @param str to be encoded
     * @param errors error policy name
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple utf_7_encode(String str, String errors) {
        final int charCount = str.length();
        final String encoded = codecs.PyUnicode_EncodeUTF7(str, false, false, errors);
        return encode_tuple(encoded, charCount);
    }

    /* --- string-escape Codec -------------------------------------------- */
    /**
     * Decode a string-escape encoded string with a <code>null</code> error policy.
     *
     * @param str to be decoded
     * @return tuple (decoded str, length of <code>str</code>)
     */
    public static PyTuple escape_decode(String str) {
        return escape_decode(str, null);
    }

    /**
     * Decode a string-escape encoded string by applying
     * <code>PyString.decode_UnicodeEscape</code> to the whole of <code>str</code>.
     *
     * @param str to be decoded
     * @param errors error policy name
     * @return tuple (decoded str, length of <code>str</code>)
     */
    public static PyTuple escape_decode(String str, String errors) {
        final int size = str.length();
        final String decoded = PyString.decode_UnicodeEscape(str, 0, size, errors, true);
        return decode_tuple_str(decoded, size);
    }

    /**
     * Encode a string with the string-escape codec and a <code>null</code> error policy.
     *
     * @param str to be encoded
     * @return tuple (encoded str, length of <code>str</code>)
     */
    public static PyTuple escape_encode(String str) {
        return escape_encode(str, null);
    }

    /**
     * Encode a string with the string-escape codec via
     * <code>PyString.encode_UnicodeEscape</code>. The <code>errors</code> argument is accepted
     * but not used.
     *
     * @param str to be encoded
     * @param errors error policy name (ignored)
     * @return tuple (encoded str, length of <code>str</code>)
     */
    public static PyTuple escape_encode(String str, String errors) {
        final String escaped = PyString.encode_UnicodeEscape(str, false);
        return encode_tuple(escaped, str.length());
    }

    /* --- Character Mapping Codec --------------------------------------- */

    /**
     * Equivalent to <code>charmap_decode(bytes, null, null)</code>. This method is here so the
     * error and mapping arguments can be optional at the Python level.
     *
     * @param bytes sequence of bytes to decode
     * @return decoded string and number of bytes consumed
     */
    public static PyTuple charmap_decode(String bytes) {
        return charmap_decode(bytes, null, null);
    }

    /**
     * Equivalent to <code>charmap_decode(bytes, errors, null)</code>. This method is here so the
     * mapping argument can be optional at the Python level.
     *
     * @param bytes sequence of bytes to decode
     * @param errors error policy
     * @return decoded string and number of bytes consumed
     */
    public static PyTuple charmap_decode(String bytes, String errors) {
        return charmap_decode(bytes, errors, null);
    }

    /**
     * Decode a sequence of bytes into Unicode characters via a mapping supplied as a container to
     * be indexed by the byte values (as unsigned integers). If the mapping is null or None, decode
     * with latin-1 (essentially treating bytes as character codes directly).
     *
     * @param bytes sequence of bytes to decode
     * @param errors error policy
     * @param mapping to convert bytes to characters
     * @return decoded string and number of bytes consumed
     */
    public static PyTuple charmap_decode(String bytes, String errors, PyObject mapping) {
        final boolean haveMapping = mapping != null && mapping != Py.None;
        if (haveMapping) {
            return charmap_decode(bytes, errors, mapping, false);
        }
        // No usable mapping: default to Latin-1
        return latin_1_decode(bytes, errors);
    }

    /**
     * Decode a sequence of bytes into Unicode characters via a mapping supplied as a container to
     * be indexed by the byte values (as unsigned integers).
     * <p>
     * For each byte the mapping may yield: nothing (an error, unless <code>ignoreUnmapped</code>);
     * an integer, used as a character code; <code>None</code> (an error); or a string, which is
     * appended to the output unless its first character is U+FFFE, which marks "undefined" and
     * is an error. An empty string contributes nothing to the output. Errors are dealt with
     * according to <code>errors</code> by <code>codecs.insertReplacementAndGetResume</code>.
     *
     * @param bytes sequence of bytes to decode
     * @param errors error policy
     * @param mapping to convert bytes to characters
     * @param ignoreUnmapped if true, pass unmapped byte values as character codes [0..256)
     * @return decoded string and number of bytes consumed
     */
    public static PyTuple charmap_decode(String bytes, String errors, PyObject mapping,
            boolean ignoreUnmapped) {
        // XXX bytes: would prefer to accept any object with buffer API
        int size = bytes.length();
        StringBuilder v = new StringBuilder(size);

        for (int i = 0; i < size; i++) {

            // Process the i.th input byte
            int b = bytes.charAt(i);
            if (b > 0xff) {
                // The "- 1" here and below compensates for the loop's own i++
                i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
                        i, i + 1, "ordinal not in range(256)") - 1;
                continue;
            }

            // Map the byte to an output character code (or possibly string)
            PyObject w = Py.newInteger(b);
            PyObject x = mapping.__finditem__(w);

            // Apply to the output
            if (x == null) {
                // Error case: mapping not found
                if (ignoreUnmapped) {
                    v.appendCodePoint(b);
                } else {
                    i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
                            i, i + 1, "no mapping found") - 1;
                }

            } else if (x instanceof PyInteger) {
                // Mapping was to an int: treat as character code
                int value = ((PyInteger)x).getValue();
                if (value < 0 || value > PySystemState.maxunicode) {
                    throw Py.TypeError("character mapping must return "
                            + "integer greater than 0 and less than sys.maxunicode");
                }
                v.appendCodePoint(value);

            } else if (x == Py.None) {
                i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
                        i, i + 1, "character maps to <undefined>") - 1;

            } else if (x instanceof PyString) {
                String s = x.toString();
                // Guard the charAt: an empty string is a legitimate "map to nothing" result.
                if (s.length() > 0 && s.charAt(0) == 0xfffe) {
                    // Invalid indicates "undefined" see C-API PyUnicode_DecodeCharmap()
                    i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
                            i, i + 1, "character maps to <undefined>") - 1;
                } else {
                    v.append(s);
                }

            } else {
                /* wrong return value */
                throw Py.TypeError("character mapping must return " + "integer, None or str");
            }
        }

        return decode_tuple(v.toString(), size);
    }

    /**
     * Translate a unicode string code point by code point through a mapping, parallel to
     * CPython's PyUnicode_TranslateCharmap. For each code point the mapping may yield: nothing
     * (the code point is copied unchanged); <code>None</code> (the code point is dropped); an
     * integer (used as the replacement code point); or a <code>PyUnicode</code> (appended in
     * full).
     *
     * @param str the string to translate
     * @param errors error policy name (not used by this implementation)
     * @param mapping container indexed by code point using <code>__finditem__</code>
     * @return the translated string as a new <code>PyUnicode</code>
     */
    public static PyObject translateCharmap(PyUnicode str, String errors, PyObject mapping) {
        StringBuilder buf = new StringBuilder(str.toString().length());

        // The iterator must be typed: a raw Iterator cannot be unboxed into an int.
        for (Iterator<Integer> iter = str.newSubsequenceIterator(); iter.hasNext();) {
            int codePoint = iter.next();
            PyObject result = mapping.__finditem__(Py.newInteger(codePoint));
            if (result == null) {
                // No mapping found means: use 1:1 mapping
                buf.appendCodePoint(codePoint);
            } else if (result == Py.None) {
                // XXX: We don't support the fancier error handling CPython does here of
                // capturing regions of chars removed by the None mapping to optionally
                // pass to an error handler. Though we don't seem to even use this
                // functionality anywhere either
                continue;
            } else if (result instanceof PyInteger) {
                int value = result.asInt();
                if (value < 0 || value > PySystemState.maxunicode) {
                    throw Py.TypeError(String.format("character mapping must be in range(0x%x)",
                            PySystemState.maxunicode + 1));
                }
                buf.appendCodePoint(value);
            } else if (result instanceof PyUnicode) {
                buf.append(result.toString());
            } else {
                // wrong return value
                throw Py.TypeError("character mapping must return integer, None or unicode");
            }
        }
        return new PyUnicode(buf.toString());
    }

    /**
     * Equivalent to <code>charmap_encode(str, null, null)</code>. This method is here so the error
     * and mapping arguments can be optional at the Python level.
     *
     * @param str to be encoded
     * @return (encoded data, size(str)) as a pair
     */
    public static PyTuple charmap_encode(String str) {
        return charmap_encode(str, null, null);
    }

    /**
     * Equivalent to <code>charmap_encode(str, errors, null)</code>. This method is here so the
     * mapping can be optional at the Python level.
     *
     * @param str to be encoded
     * @param errors error policy name (e.g. "ignore")
     * @return (encoded data, size(str)) as a pair
     */
    public static PyTuple charmap_encode(String str, String errors) {
        return charmap_encode(str, errors, null);
    }

    /**
     * Encoder based on an optional character mapping. This mapping is either an
     * <code>EncodingMap</code> of 256 entries, or an arbitrary container indexable with integers
     * using <code>__finditem__</code> and yielding byte strings. If the mapping is null or None,
     * latin-1 (effectively a mapping of character code to the numerically-equal byte) is used.
     *
     * @param str to be encoded
     * @param errors error policy name (e.g. "ignore")
     * @param mapping from character code to output byte (or string)
     * @return (encoded data, size(str)) as a pair
     */
    public static PyTuple charmap_encode(String str, String errors, PyObject mapping) {
        final boolean haveMapping = mapping != null && mapping != Py.None;
        if (!haveMapping) {
            // Default to Latin-1
            return latin_1_encode(str, errors);
        }
        // Encode into a fresh buffer, letting failed look-ups go to the error policy.
        final StringBuilder output = new StringBuilder(str.length());
        return charmap_encode_internal(str, errors, mapping, output, true);
    }

    /**
     * Helper to implement the several variants of <code>charmap_encode</code>, given an optional
     * mapping. This mapping is either an <code>EncodingMap</code> of 256 entries, or an arbitrary
     * container indexable with integers using <code>__finditem__</code> and yielding byte strings.
     * Encoded output is appended to <code>v</code>, which may already hold earlier output.
     *
     * @param str to be encoded
     * @param errors error policy name (e.g. "ignore")
     * @param mapping from character code to output byte (or string)
     * @param v to contain the encoded bytes
     * @param letLookupHandleError if true, a character with no mapping is passed to
     *            {@link #handleBadMapping(String, String, PyObject, StringBuilder, int, int)};
     *            if false it raises UnicodeEncodeError at once
     * @return (encoded data, size(str)) as a pair
     */
    private static PyTuple charmap_encode_internal(String str, String errors, PyObject mapping,
            StringBuilder v, boolean letLookupHandleError) {

        EncodingMap encodingMap = mapping instanceof EncodingMap ? (EncodingMap)mapping : null;
        int size = str.length();

        for (int i = 0; i < size; i++) {

            // Map the i.th character of str to some value
            char ch = str.charAt(i);
            PyObject x;
            if (encodingMap != null) {
                // The mapping given was an EncodingMap [0,256) => non-negative int
                int result = encodingMap.lookup(ch);
                x = (result == -1) ? null : Py.newInteger(result);
            } else {
                // The mapping was a map or similar: non-negative int -> object
                x = mapping.__finditem__(Py.newInteger(ch));
            }

            // And map this object to an output character
            if (x == null) {
                // Error during lookup
                if (letLookupHandleError) {
                    // Some kind of substitute can be placed in the output
                    i = handleBadMapping(str, errors, mapping, v, size, i);
                } else {
                    // Hard error
                    throw Py.UnicodeEncodeError("charmap", str, i, i + 1,
                            "character maps to <undefined>");
                }

            } else if (x instanceof PyInteger) {
                // Look-up had integer result: output as byte value
                int value = ((PyInteger)x).getValue();
                if (value < 0 || value > 255) {
                    throw Py.TypeError("character mapping must be in range(256)");
                }
                v.append((char)value);

            } else if (x instanceof PyString && !(x instanceof PyUnicode)) {
                // Look-up had a (byte) str result: output as Java String
                // XXX: (Py3k) Look-up had bytes or str result: output as ... this is a problem
                v.append(x.toString());

            } else if (x instanceof PyNone) {
                i = handleBadMapping(str, errors, mapping, v, size, i);

            } else {
                /* wrong return value */
                throw Py.TypeError("character mapping must return " + "integer, None or str");
            }
        }

        return encode_tuple(v.toString(), size);
    }

    /**
     * Helper for {@link #charmap_encode_internal(String, String, PyObject, StringBuilder, boolean)}
     * called when we need some kind of substitute in the output for an invalid input. The
     * policies "ignore", "replace", "xmlcharrefreplace" and "backslashreplace" are handled here;
     * for the last three the substitute text is itself encoded through the mapping, with
     * look-up failures treated as hard errors. Any other policy (or none) goes to
     * <code>codecs.encoding_error</code>.
     *
     * @param str to be encoded
     * @param errors error policy name (e.g. "ignore")
     * @param mapping from character code to output byte (or string)
     * @param v to contain the encoded bytes
     * @param size of str
     * @param i index in str of current (and problematic) character
     * @return index of last character of problematic section
     */
    private static int handleBadMapping(String str, String errors, PyObject mapping,
            StringBuilder v, int size, int i) {

        // If error policy specified, execute it
        if (errors != null) {

            if (errors.equals(codecs.IGNORE)) {
                return i;

            } else if (errors.equals(codecs.REPLACE)) {
                String replStr = "?";
                charmap_encode_internal(replStr, errors, mapping, v, false);
                return i;

            } else if (errors.equals(codecs.XMLCHARREFREPLACE)) {
                String replStr = codecs.xmlcharrefreplace(i, i + 1, str).toString();
                charmap_encode_internal(replStr, errors, mapping, v, false);
                return i;

            } else if (errors.equals(codecs.BACKSLASHREPLACE)) {
                String replStr = codecs.backslashreplace(i, i + 1, str).toString();
                charmap_encode_internal(replStr, errors, mapping, v, false);
                return i;
            }
        }

        // Default behaviour (error==null or does not match known case)
        String msg = "character maps to <undefined>";
        PyObject replacement = codecs.encoding_error(errors, "charmap", str, i, i + 1, msg);
        String replStr = replacement.__getitem__(0).toString();
        charmap_encode_internal(replStr, errors, mapping, v, false);

        // The caller's loop increments the index, hence one less than the resume position
        return codecs.calcNewPosition(size, replacement) - 1;
    }

    /* --- ascii Codec ---------------------------------------------- */
    /**
     * Decode ASCII bytes with a <code>null</code> error policy.
     *
     * @param str bytes to decode
     * @return tuple (decoded text, length of <code>str</code>)
     */
    public static PyTuple ascii_decode(String str) {
        return ascii_decode(str, null);
    }

    /**
     * Decode ASCII bytes via <code>codecs.PyUnicode_DecodeASCII</code>.
     *
     * @param str bytes to decode
     * @param errors error policy name
     * @return tuple (decoded text, length of <code>str</code>)
     */
    public static PyTuple ascii_decode(String str, String errors) {
        final int byteCount = str.length();
        final String decoded = codecs.PyUnicode_DecodeASCII(str, byteCount, errors);
        return decode_tuple(decoded, byteCount);
    }

    /**
     * Encode a string as ASCII with a <code>null</code> error policy.
     *
     * @param str to be encoded
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple ascii_encode(String str) {
        return ascii_encode(str, null);
    }

    /**
     * Encode a string as ASCII via <code>codecs.PyUnicode_EncodeASCII</code>.
     *
     * @param str to be encoded
     * @param errors error policy name
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple ascii_encode(String str, String errors) {
        final int charCount = str.length();
        final String encoded = codecs.PyUnicode_EncodeASCII(str, charCount, errors);
        return encode_tuple(encoded, charCount);
    }

    /* --- Latin-1 Codec -------------------------------------------- */
    /**
     * Decode Latin-1 bytes with a <code>null</code> error policy.
     *
     * @param str bytes to decode
     * @return tuple (decoded text, length of <code>str</code>)
     */
    public static PyTuple latin_1_decode(String str) {
        return latin_1_decode(str, null);
    }

    /**
     * Decode Latin-1 bytes via <code>codecs.PyUnicode_DecodeLatin1</code>.
     *
     * @param str bytes to decode
     * @param errors error policy name
     * @return tuple (decoded text, length of <code>str</code>)
     */
    public static PyTuple latin_1_decode(String str, String errors) {
        final int byteCount = str.length();
        final String decoded = codecs.PyUnicode_DecodeLatin1(str, byteCount, errors);
        return decode_tuple(decoded, byteCount);
    }

    /**
     * Encode a string as Latin-1 with a <code>null</code> error policy.
     *
     * @param str to be encoded
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple latin_1_encode(String str) {
        return latin_1_encode(str, null);
    }

    /**
     * Encode a string as Latin-1 via <code>codecs.PyUnicode_EncodeLatin1</code>.
     *
     * @param str to be encoded
     * @param errors error policy name
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple latin_1_encode(String str, String errors) {
        final int charCount = str.length();
        final String encoded = codecs.PyUnicode_EncodeLatin1(str, charCount, errors);
        return encode_tuple(encoded, charCount);
    }

    /* --- UTF-16 Codec ------------------------------------------- */
    /**
     * Encode a string as UTF-16 with a <code>null</code> error policy and byte order 0.
     *
     * @param str to be encoded
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple utf_16_encode(String str) {
        return utf_16_encode(str, null);
    }

    /**
     * Encode a string as UTF-16 with byte order 0.
     *
     * @param str to be encoded
     * @param errors error policy name
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple utf_16_encode(String str, String errors) {
        return utf_16_encode(str, errors, 0);
    }

    /**
     * Encode a string as UTF-16 via {@link #encode_UTF16(String, String, int)}.
     *
     * @param str to be encoded
     * @param errors error policy name
     * @param byteorder byte order code passed to <code>encode_UTF16</code>
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple utf_16_encode(String str, String errors, int byteorder) {
        final String encoded = encode_UTF16(str, errors, byteorder);
        return encode_tuple(encoded, str.length());
    }

    /**
     * Encode a string as little-endian UTF-16 with a <code>null</code> error policy.
     *
     * @param str to be encoded
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple utf_16_le_encode(String str) {
        return utf_16_le_encode(str, null);
    }

    /**
     * Encode a string as UTF-16 via {@link #encode_UTF16(String, String, int)} with byte order
     * code -1.
     *
     * @param str to be encoded
     * @param errors error policy name
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple utf_16_le_encode(String str, String errors) {
        final int littleEndian = -1;
        final String encoded = encode_UTF16(str, errors, littleEndian);
        return encode_tuple(encoded, str.length());
    }

    /**
     * Encode a string as big-endian UTF-16 with a <code>null</code> error policy.
     *
     * @param str to be encoded
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple utf_16_be_encode(String str) {
        return utf_16_be_encode(str, null);
    }

    /**
     * Encode a string as UTF-16 via {@link #encode_UTF16(String, String, int)} with byte order
     * code 1.
     *
     * @param str to be encoded
     * @param errors error policy name
     * @return tuple (encoded bytes, length of <code>str</code>)
     */
    public static PyTuple utf_16_be_encode(String str, String errors) {
        final int bigEndian = 1;
        final String encoded = encode_UTF16(str, errors, bigEndian);
        return encode_tuple(encoded, str.length());
    }

    /**
     * Encode a string as UTF-16 using the Java charset selected by <code>byteorder</code>, and
     * return the bytes as a string with one character (in the range 0 to 255) per byte.
     *
     * @param str to be encoded
     * @param errors error policy name (ignored)
     * @param byteorder 0 selects "UTF-16", -1 selects "UTF-16LE", anything else "UTF-16BE"
     * @return the encoded bytes, one per character
     */
    public static String encode_UTF16(String str, String errors, int byteorder) {
        final String charsetName;
        switch (byteorder) {
            case 0:
                charsetName = "UTF-16";
                break;
            case -1:
                charsetName = "UTF-16LE";
                break;
            default:
                charsetName = "UTF-16BE";
                break;
        }

        // XXX errors argument ignored: Java's codecs implement "replace"

        final ByteBuffer encoded = Charset.forName(charsetName).encode(str);
        final StringBuilder out = new StringBuilder(encoded.limit());
        while (encoded.hasRemaining()) {
            // Mask to read the (signed) Java byte as an unsigned value
            out.append((char)(encoded.get() & 0xff));
        }
        return out.toString();
    }

    /**
     * Decode UTF-16 bytes with a <code>null</code> error policy, treating the input as not final.
     *
     * @param str bytes to decode
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_16_decode(String str) {
        return utf_16_decode(str, null);
    }

    /**
     * Decode UTF-16 bytes, treating the input as not final.
     *
     * @param str bytes to decode
     * @param errors error policy name
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_16_decode(String str, String errors) {
        return utf_16_decode(str, errors, false);
    }

    /**
     * Decode UTF-16 bytes starting from byte order code 0. When <code>final_</code> is true the
     * reported count is the full input length; otherwise it is the count stored by the decoder.
     *
     * @param str bytes to decode
     * @param errors error policy name
     * @param final_ true if this is the last piece of input
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_16_decode(String str, String errors, boolean final_) {
        final int[] order = {0};
        final int[] consumed = final_ ? null : new int[1];
        final String decoded = decode_UTF16(str, errors, order, consumed);
        // consumed is null exactly when final_ is true, so the helper picks the right count.
        return decode_tuple(decoded, consumed, str.length());
    }

    /**
     * Decode little-endian UTF-16 bytes with a <code>null</code> error policy, treating the input
     * as not final.
     *
     * @param str bytes to decode
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_16_le_decode(String str) {
        return utf_16_le_decode(str, null);
    }

    /**
     * Decode little-endian UTF-16 bytes, treating the input as not final.
     *
     * @param str bytes to decode
     * @param errors error policy name
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_16_le_decode(String str, String errors) {
        return utf_16_le_decode(str, errors, false);
    }

    /**
     * Decode UTF-16 bytes starting from byte order code -1. When <code>final_</code> is true the
     * reported count is the full input length; otherwise it is the count stored by the decoder.
     *
     * @param str bytes to decode
     * @param errors error policy name
     * @param final_ true if this is the last piece of input
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_16_le_decode(String str, String errors, boolean final_) {
        final int[] order = {-1};
        final int[] consumed = final_ ? null : new int[1];
        final String decoded = decode_UTF16(str, errors, order, consumed);
        // consumed is null exactly when final_ is true, so the helper picks the right count.
        return decode_tuple(decoded, consumed, str.length());
    }

    /**
     * Decode big-endian UTF-16 bytes with a <code>null</code> error policy, treating the input as
     * not final.
     *
     * @param str bytes to decode
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_16_be_decode(String str) {
        return utf_16_be_decode(str, null);
    }

    /**
     * Decode big-endian UTF-16 bytes, treating the input as not final.
     *
     * @param str bytes to decode
     * @param errors error policy name
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_16_be_decode(String str, String errors) {
        return utf_16_be_decode(str, errors, false);
    }

    /**
     * Decode UTF-16 bytes starting from byte order code 1. When <code>final_</code> is true the
     * reported count is the full input length; otherwise it is the count stored by the decoder.
     *
     * @param str bytes to decode
     * @param errors error policy name
     * @param final_ true if this is the last piece of input
     * @return tuple (decoded text, bytes consumed)
     */
    public static PyTuple utf_16_be_decode(String str, String errors, boolean final_) {
        final int[] order = {1};
        final int[] consumed = final_ ? null : new int[1];
        final String decoded = decode_UTF16(str, errors, order, consumed);
        // consumed is null exactly when final_ is true, so the helper picks the right count.
        return decode_tuple(decoded, consumed, str.length());
    }

    /**
     * Decode UTF-16 bytes with a <code>null</code> error policy, by delegating to the
     * two-argument overload.
     *
     * @param str bytes to decode
     * @return the tuple produced by the four-argument overload
     */
    public static PyTuple utf_16_ex_decode(String str) {
        return utf_16_ex_decode(str, null);
    }

    /**
     * Decode UTF-16 bytes, supplying 0 as the byte order code to the three-argument overload.
     *
     * @param str bytes to decode
     * @param errors error policy name
     * @return the tuple produced by the four-argument overload
     */
    public static PyTuple utf_16_ex_decode(String str, String errors) {
        return utf_16_ex_decode(str, errors, 0);
    }

    /**
     * Decode UTF-16 bytes, supplying false as the final flag to the four-argument overload.
     *
     * @param str bytes to decode
     * @param errors error policy name
     * @param byteorder byte order code, passed on unchanged
     * @return the tuple produced by the four-argument overload
     */
    public static PyTuple utf_16_ex_decode(String str, String errors, int byteorder) {
        return utf_16_ex_decode(str, errors, byteorder, false);
    }

    /**
     * Decode UTF-16 bytes starting from the given byte order, and report the byte order in force
     * at the end of decoding (which a byte-order mark in the data may have changed).
     *
     * @param str bytes to decode
     * @param errors error policy name
     * @param byteorder initial byte order code: -1 = little-endian, 1 = big-endian, 0 = take it
     *            from a byte-order mark
     * @param final_ true if this is the last piece of input; the reported count is then the full
     *            input length, otherwise the count stored by the decoder
     * @return the tuple (decoded text, bytes consumed, byte order code)
     */
    public static PyTuple utf_16_ex_decode(String str, String errors, int byteorder,
            boolean final_) {
        // Seed the decoder with the caller's byte order: this used to be hard-coded to 0,
        // so the byteorder argument had no effect.
        int[] bo = new int[] {byteorder};
        int[] consumed = final_ ? null : new int[1];
        String decoded = decode_UTF16(str, errors, bo, consumed);
        return new PyTuple(new PyUnicode(decoded), Py.newInteger(final_ ? str.length()
                : consumed[0]), Py.newInteger(bo[0]));
    }

    /**
     * Decode UTF-16 bytes without reporting how many were consumed: delegates to the
     * four-argument overload with a <code>null</code> counter.
     *
     * @param str bytes to decode, one per character
     * @param errors error policy name
     * @param byteorder if not null, element [0] holds the byte order code
     * @return the decoded text
     */
    private static String decode_UTF16(String str, String errors, int[] byteorder) {
        return decode_UTF16(str, errors, byteorder, null);
    }

    /**
     * Decode UTF-16 bytes, presented one per character of <code>str</code>, into a Java String.
     * A byte-order mark found at any code-unit position switches the byte order and produces no
     * output. Surrogate pairs are combined into a single code point.
     * <p>
     * When <code>consumed</code> is not null the input is treated as possibly incomplete:
     * decoding stops quietly before a trailing odd byte or a high surrogate whose partner has not
     * fully arrived, and the index reached is stored in <code>consumed[0]</code>. When it is null
     * those conditions are reported through <code>codecs.insertReplacementAndGetResume</code>.
     *
     * @param str bytes to decode, one per character
     * @param errors error policy name
     * @param byteorder if not null, element [0] supplies the initial byte order code (-1 means
     *            little-endian, anything else is read big-endian) and receives the final one
     * @param consumed if not null, element [0] receives the number of bytes consumed
     * @return the decoded text
     */
    private static String decode_UTF16(String str, String errors, int[] byteorder, int[] consumed) {
        int bo = 0;
        if (byteorder != null) {
            bo = byteorder[0];
        }
        int size = str.length();
        StringBuilder v = new StringBuilder(size / 2);
        int i;
        // NOTE(review): after an error the index returned by the handler is advanced by the
        // loop's "i += 2" as well - confirm that is the intended resume position.
        for (i = 0; i < size; i += 2) {
            char ch1 = str.charAt(i);
            if (i + 1 == size) {
                // Odd byte at the end of the input
                if (consumed != null) {
                    break;
                }
                i = codecs.insertReplacementAndGetResume(v, errors, "utf-16", str, //
                        i, i + 1, "truncated data");
                continue;
            }
            char ch2 = str.charAt(i + 1);
            if (ch1 == 0xFE && ch2 == 0xFF) {
                bo = 1;
                continue;
            } else if (ch1 == 0xFF && ch2 == 0xFE) {
                bo = -1;
                continue;
            }
            int W1;
            if (bo == -1) {
                W1 = (ch2 << 8 | ch1);
            } else {
                W1 = (ch1 << 8 | ch2);
            }

            if (W1 < 0xD800 || W1 > 0xDFFF) {
                v.appendCodePoint(W1);
                continue;
            } else if (W1 >= 0xD800 && W1 <= 0xDBFF) {
                if (i + 3 >= size) {
                    // High surrogate, but fewer than two bytes remain for its partner.
                    // Reading on would run past the end of str.
                    if (consumed != null) {
                        // Leave the high surrogate unconsumed for the next call
                        break;
                    }
                    i = codecs.insertReplacementAndGetResume(v, errors, "utf-16", str, //
                            i, size, "unexpected end of data");
                    continue;
                }
                i += 2;
                char ch3 = str.charAt(i);
                char ch4 = str.charAt(i + 1);
                int W2;
                if (bo == -1) {
                    W2 = (ch4 << 8 | ch3);
                } else {
                    W2 = (ch3 << 8 | ch4);
                }
                if (W2 >= 0xDC00 && W2 <= 0xDFFF) {
                    int U = (((W1 & 0x3FF) << 10) | (W2 & 0x3FF)) + 0x10000;
                    v.appendCodePoint(U);
                    continue;
                }
                i = codecs.insertReplacementAndGetResume(v, errors, "utf-16", str, //
                        i, i + 1, "illegal UTF-16 surrogate");
                continue;
            }

            // Only an unpaired low surrogate reaches this point
            i = codecs.insertReplacementAndGetResume(v, errors, "utf-16", str, //
                    i, i + 1, "illegal encoding");
        }
        if (byteorder != null) {
            byteorder[0] = bo;
        }
        if (consumed != null) {
            consumed[0] = i;
        }
        return v.toString();
    }

    /* --- UTF-32 Codec ------------------------------------------- */

    /**
     * Encode unicode text as UTF-32, delegating to {@link #utf_32_encode(String, String)} with a
     * null error policy name.
     *
     * @param unicode text to be encoded
     * @return tuple (encoded_bytes, unicode_consumed)
     */
    public static PyTuple utf_32_encode(String unicode) {
        final String noPolicy = null;
        return utf_32_encode(unicode, noPolicy);
    }

    /**
     * Encode unicode text as UTF-32 without naming a byte order: the encoder is called with
     * <code>ByteOrder.UNDEFINED</code>.
     *
     * @param unicode text to be encoded
     * @param errors error policy name or null meaning "strict"
     * @return tuple (encoded_bytes, unicode_consumed)
     */
    public static PyTuple utf_32_encode(String unicode, String errors) {
        final ByteOrder order = ByteOrder.UNDEFINED;
        return PyUnicode_EncodeUTF32(unicode, errors, order);
    }

    /**
     * Encode unicode text as UTF-32 in the byte order given as an integer, which is converted by
     * <code>ByteOrder.fromInt</code> before the encoder is called.
     *
     * @param unicode text to be encoded
     * @param errors error policy name or null meaning "strict"
     * @param byteorder encoding "endianness" specified (in the Python -1, 0, +1 convention)
     * @return tuple (encoded_bytes, unicode_consumed)
     */
    public static PyTuple utf_32_encode(String unicode, String errors, int byteorder) {
        return PyUnicode_EncodeUTF32(unicode, errors, ByteOrder.fromInt(byteorder));
    }

    /**
     * Encode unicode text as little-endian UTF-32, delegating to
     * {@link #utf_32_le_encode(String, String)} with a null error policy name.
     *
     * @param unicode text to be encoded
     * @return tuple (encoded_bytes, unicode_consumed)
     */
    public static PyTuple utf_32_le_encode(String unicode) {
        final String noPolicy = null;
        return utf_32_le_encode(unicode, noPolicy);
    }

    /**
     * Encode unicode text as little-endian UTF-32: the encoder is called with
     * <code>ByteOrder.LE</code>.
     *
     * @param unicode text to be encoded
     * @param errors error policy name or null meaning "strict"
     * @return tuple (encoded_bytes, unicode_consumed)
     */
    public static PyTuple utf_32_le_encode(String unicode, String errors) {
        final ByteOrder order = ByteOrder.LE;
        return PyUnicode_EncodeUTF32(unicode, errors, order);
    }

    /**
     * Encode unicode text as big-endian UTF-32, delegating to
     * {@link #utf_32_be_encode(String, String)} with a null error policy name.
     *
     * @param unicode text to be encoded
     * @return tuple (encoded_bytes, unicode_consumed)
     */
    public static PyTuple utf_32_be_encode(String unicode) {
        final String noPolicy = null;
        return utf_32_be_encode(unicode, noPolicy);
    }

    /**
     * Encode unicode text as big-endian UTF-32: the encoder is called with
     * <code>ByteOrder.BE</code>.
     *
     * @param unicode text to be encoded
     * @param errors error policy name or null meaning "strict"
     * @return tuple (encoded_bytes, unicode_consumed)
     */
    public static PyTuple utf_32_be_encode(String unicode, String errors) {
        final ByteOrder order = ByteOrder.BE;
        return PyUnicode_EncodeUTF32(unicode, errors, order);
    }

    /**
     * Encode a Unicode Java String as UTF-32 in specified byte order. A byte-order mark is
     * generated if order = ByteOrder.UNDEFINED, and the byte order in that case will
     * be the platform default, which is BE since the platform is Java.
     * 

* The input String must be valid UTF-16, in particular, if it contains surrogate code * units they must be ordered and paired correctly. The last char in unicode is not * allowed to be an unpaired surrogate. These criteria will be met if the String * unicode is the contents of a valid {@link PyUnicode} or {@link PyString}. * * @param unicode to be encoded * @param errors error policy name or null meaning "strict" * @param order byte order to use BE, LE or UNDEFINED (a BOM will be written) * @return tuple (encoded_bytes, unicode_consumed) */ private static PyTuple PyUnicode_EncodeUTF32(String unicode, String errors, ByteOrder order) { // We use a StringBuilder but we are really storing encoded bytes StringBuilder v = new StringBuilder(4 * (unicode.length() + 1)); int uptr = 0; // Write a BOM (if required to) if (order == ByteOrder.UNDEFINED) { v.append("\u0000\u0000\u00fe\u00ff"); order = ByteOrder.BE; } if (order != ByteOrder.LE) { uptr = PyUnicode_EncodeUTF32BELoop(v, unicode, errors); } else { uptr = PyUnicode_EncodeUTF32LELoop(v, unicode, errors); } // XXX Issue #2002: should probably report length consumed in Unicode characters return encode_tuple(v.toString(), uptr); } /** * Helper to {@link #PyUnicode_EncodeUTF32(String, String, ByteOrder)} when big-endian encoding * is to be carried out. * * @param v output buffer building String of bytes (Jython PyString convention) * @param unicode character input * @param errors error policy name (e.g. "ignore", "replace") * @return number of Java characters consumed from unicode */ private static int PyUnicode_EncodeUTF32BELoop(StringBuilder v, String unicode, String errors) { int len = unicode.length(); int uptr = 0; char[] buf = new char[6]; // first 3 elements always zero /* * Main codec loop outputs arrays of 4 bytes at a time. */ while (uptr < len) { int ch = unicode.charAt(uptr++); if ((ch & 0xF800) == 0xD800) { /* * This is a surrogate. 
In Jython, unicode should always be the internal value of a * PyUnicode, and since this should never contain invalid data, it should be a lead * surrogate, uptr < len, and the next char must be the trail surrogate. We ought * not to have to chech that, however ... */ if ((ch & 0x0400) == 0) { // Yes, it's a lead surrogate if (uptr < len) { // And there is something to follow int ch2 = unicode.charAt(uptr++); if ((ch2 & 0xFC00) == 0xDC00) { // And it is a trail surrogate, so we can get on with the encoding ch = ((ch & 0x3ff) << 10) + (ch2 & 0x3ff) + 0x10000; buf[3] = (char)((ch >> 16) & 0xff); buf[4] = (char)((ch >> 8) & 0xff); buf[5] = (char)(ch & 0xff); v.append(buf, 2, 4); } else { // The trail surrogate was missing: accuse ch at uptr-2 uptr = PyUnicode_EncodeUTF32Error(v, errors, ByteOrder.BE, // unicode, uptr - 2, uptr - 1, "second surrogate missing"); } } else { // End of input instread of trail surrogate: accuse ch at uptr-1 uptr = PyUnicode_EncodeUTF32Error(v, errors, ByteOrder.BE, // unicode, uptr - 1, len, "truncated data"); } } else { // The trail encountered in lead position: accuse ch at uptr-2 uptr = PyUnicode_EncodeUTF32Error(v, errors, ByteOrder.BE, // unicode, uptr - 2, uptr - 1, "unexpected second surrogate"); } } else if (ch > 255) { // This is a BMP character: only two bytes non-zero buf[3] = (char)((ch >> 8) & 0xff); buf[4] = (char)(ch & 0xff); v.append(buf, 1, 4); } else { // This is one-byte BMP character: only one byte non-zero buf[3] = (char)(ch & 0xff); v.append(buf, 0, 4); } } // XXX Issue #2002: should probably report length consumed in Unicode characters return uptr; } /** * Helper to {@link #PyUnicode_EncodeUTF32(String, String, ByteOrder)} when big-endian encoding * is to be carried out. * * @param v output buffer building String of bytes (Jython PyString convention) * @param unicode character input * @param errors error policy name (e.g. 
"ignore", "replace") * @return number of Java characters consumed from unicode */ private static int PyUnicode_EncodeUTF32LELoop(StringBuilder v, String unicode, String errors) { int len = unicode.length(); int uptr = 0; char[] buf = new char[6]; // last 3 elements always zero /* * Main codec loop outputs arrays of 4 bytes at a time. */ while (uptr < len) { int ch = unicode.charAt(uptr++); if ((ch & 0xF800) == 0xD800) { /* * This is a surrogate. In Jython, unicode should always be the internal value of a * PyUnicode, and since this should never contain invalid data, it should be a lead * surrogate, uptr < len, and the next char must be the trail surrogate. We ought * not to have to chech that, however ... */ if ((ch & 0x0400) == 0) { // Yes, it's a lead surrogate if (uptr < len) { // And there is something to follow int ch2 = unicode.charAt(uptr++); if ((ch2 & 0xFC00) == 0xDC00) { // And it is a trail surrogate, so we can get on with the encoding ch = ((ch & 0x3ff) << 10) + (ch2 & 0x3ff) + 0x10000; buf[0] = (char)(ch & 0xff); buf[1] = (char)((ch >> 8) & 0xff); buf[2] = (char)((ch >> 16) & 0xff); v.append(buf, 0, 4); } else { // The trail surrogate was missing: accuse ch at uptr-2 uptr = PyUnicode_EncodeUTF32Error(v, errors, ByteOrder.LE, // unicode, uptr - 2, uptr - 1, "second surrogate missing"); } } else { // End of input instread of trail surrogate: accuse ch at uptr-1 uptr = PyUnicode_EncodeUTF32Error(v, errors, ByteOrder.LE, // unicode, uptr - 1, len, "truncated data"); } } else { // The trail encountered in lead position: accuse ch at uptr-2 uptr = PyUnicode_EncodeUTF32Error(v, errors, ByteOrder.LE, // unicode, uptr - 2, uptr - 1, "unexpected second surrogate"); } } else if (ch > 255) { // This is a BMP character: only two bytes non-zero buf[1] = (char)(ch & 0xff); buf[2] = (char)((ch >> 8) & 0xff); v.append(buf, 1, 4); } else { // This is one-byte BMP character: only one byte non-zero buf[2] = (char)(ch & 0xff); v.append(buf, 2, 4); } } // XXX Issue #2002: 
should probably report length consumed in Unicode characters return uptr; } /** * Specific UTF-32 encoder error handler. This is a helper called in the inner loop of * {@link #PyUnicode_EncodeUTF32(String, String, ByteOrder)} when the Unicode input is in valid. * In theory, since the input Unicode data should come from a {@link PyUnicode}, there should * never be any errors. * * @param v output buffer building String of bytes (Jython PyString convention) * @param errors error policy name (e.g. "ignore", "replace") * @param order LE or BE indicator * @param toEncode character input * @param start index of first problematic character * @param end index of character after the last problematic character * @param reason text contribution to the exception raised (if any) * @return position within input at which to restart */ private static int PyUnicode_EncodeUTF32Error(StringBuilder v, String errors, ByteOrder order, String toEncode, int start, int end, String reason) { // Handle special cases locally if (errors != null) { if (errors.equals(codecs.IGNORE)) { // Just skip to the first non-problem byte return end; } else if (errors.equals(codecs.REPLACE)) { // Insert a replacement UTF-32 character(s) and skip for (int i = start; i < end; i++) { if (order != ByteOrder.LE) { v.append("\000\000\000?"); } else { v.append("?\000\000\000"); } } return end; } } // If errors not one of those, invoke the generic mechanism PyObject replacementSpec = codecs.encoding_error(errors, "utf-32", toEncode, start, end, reason); // Note the replacement is unicode text that still needs to be encoded String u = replacementSpec.__getitem__(0).toString(); PyUnicode_EncodeUTF32BELoop(v, u, errors); // Return the index in toEncode at which we should resume return codecs.calcNewPosition(toEncode.length(), replacementSpec); } /** * Decode (perhaps partially) a sequence of bytes representing the UTF-32 encoded form of a * Unicode string and return as a tuple the unicode text, and the amount of input 
consumed. The * endianness used will have been deduced from a byte-order mark, if present, or will be * big-endian (Java platform default). The unicode text is presented as a Java String (the * UTF-16 representation used by {@link PyUnicode}). It is an error for the input bytes not to * form a whole number of valid UTF-32 codes. * * @param bytes to be decoded (Jython {@link PyString} convention) * @return tuple (unicode_result, bytes_consumed) */ public static PyTuple utf_32_decode(String bytes) { return utf_32_decode(bytes, null); } /** * Decode a sequence of bytes representing the UTF-32 encoded form of a Unicode string and * return as a tuple the unicode text, and the amount of input consumed. The endianness used * will have been deduced from a byte-order mark, if present, or will be big-endian (Java * platform default). The unicode text is presented as a Java String (the UTF-16 representation * used by {@link PyUnicode}). It is an error for the input bytes not to form a whole number of * valid UTF-32 codes. * * @param bytes to be decoded (Jython {@link PyString} convention) * @param errors error policy name (e.g. "ignore", "replace") * @return tuple (unicode_result, bytes_consumed) */ public static PyTuple utf_32_decode(String bytes, String errors) { return utf_32_decode(bytes, errors, false); } /** * Decode (perhaps partially) a sequence of bytes representing the UTF-32 encoded form of a * Unicode string and return as a tuple the unicode text, and the amount of input consumed. The * endianness used will have been deduced from a byte-order mark, if present, or will be * big-endian (Java platform default). The unicode text is presented as a Java String (the * UTF-16 representation used by {@link PyUnicode}). * * @param bytes to be decoded (Jython {@link PyString} convention) * @param errors error policy name (e.g. 
"ignore", "replace") * @param isFinal if a "final" call, meaning the input must all be consumed * @return tuple (unicode_result, bytes_consumed) */ public static PyTuple utf_32_decode(String bytes, String errors, boolean isFinal) { return PyUnicode_DecodeUTF32Stateful(bytes, errors, ByteOrder.UNDEFINED, isFinal, false); } /** * Decode a sequence of bytes representing the UTF-32 little-endian encoded form of a Unicode * string and return as a tuple the unicode text, and the amount of input consumed. A * (correctly-oriented) byte-order mark will pass as a zero-width non-breaking space. The * unicode text is presented as a Java String (the UTF-16 representation used by * {@link PyUnicode}). It is an error for the input bytes not to form a whole number of valid * UTF-32 codes. * * @param bytes to be decoded (Jython {@link PyString} convention) * @return tuple (unicode_result, bytes_consumed) */ public static PyTuple utf_32_le_decode(String bytes) { return utf_32_le_decode(bytes, null); } /** * Decode a sequence of bytes representing the UTF-32 little-endian encoded form of a Unicode * string and return as a tuple the unicode text, and the amount of input consumed. A * (correctly-oriented) byte-order mark will pass as a zero-width non-breaking space. The * unicode text is presented as a Java String (the UTF-16 representation used by * {@link PyUnicode}). It is an error for the input bytes not to form a whole number of valid * UTF-32 codes. * * @param bytes to be decoded (Jython {@link PyString} convention) * @param errors error policy name (e.g. "ignore", "replace") * @return tuple (unicode_result, bytes_consumed) */ public static PyTuple utf_32_le_decode(String bytes, String errors) { return utf_32_le_decode(bytes, errors, false); } /** * Decode (perhaps partially) a sequence of bytes representing the UTF-32 little-endian encoded * form of a Unicode string and return as a tuple the unicode text, and the amount of input * consumed. 
A (correctly-oriented) byte-order mark will pass as a zero-width non-breaking * space. The unicode text is presented as a Java String (the UTF-16 representation used by * {@link PyUnicode}). * * @param bytes to be decoded (Jython {@link PyString} convention) * @param errors error policy name (e.g. "ignore", "replace") * @param isFinal if a "final" call, meaning the input must all be consumed * @return tuple (unicode_result, bytes_consumed) */ public static PyTuple utf_32_le_decode(String bytes, String errors, boolean isFinal) { return PyUnicode_DecodeUTF32Stateful(bytes, errors, ByteOrder.LE, isFinal, false); } /** * Decode a sequence of bytes representing the UTF-32 big-endian encoded form of a Unicode * string and return as a tuple the unicode text, and the amount of input consumed. A * (correctly-oriented) byte-order mark will pass as a zero-width non-breaking space. The * unicode text is presented as a Java String (the UTF-16 representation used by * {@link PyUnicode}). It is an error for the input bytes not to form a whole number of valid * UTF-32 codes. * * @param bytes to be decoded (Jython {@link PyString} convention) * @return tuple (unicode_result, bytes_consumed) */ public static PyTuple utf_32_be_decode(String bytes) { return utf_32_be_decode(bytes, null); } /** * Decode a sequence of bytes representing the UTF-32 big-endian encoded form of a Unicode * string and return as a tuple the unicode text, and the amount of input consumed. A * (correctly-oriented) byte-order mark will pass as a zero-width non-breaking space. The * unicode text is presented as a Java String (the UTF-16 representation used by * {@link PyUnicode}). It is an error for the input bytes not to form a whole number of valid * UTF-32 codes. * * @param bytes to be decoded (Jython {@link PyString} convention) * @param errors error policy name (e.g. 
"ignore", "replace") * @return tuple (unicode_result, bytes_consumed) */ public static PyTuple utf_32_be_decode(String bytes, String errors) { return utf_32_be_decode(bytes, errors, false); } /** * Decode (perhaps partially) a sequence of bytes representing the UTF-32 big-endian encoded * form of a Unicode string and return as a tuple the unicode text, and the amount of input * consumed. A (correctly-oriented) byte-order mark will pass as a zero-width non-breaking * space. Unicode string and return as a tuple the unicode text, the amount of input consumed. * The unicode text is presented as a Java String (the UTF-16 representation used by * {@link PyUnicode}). * * @param bytes to be decoded (Jython {@link PyString} convention) * @param errors error policy name (e.g. "ignore", "replace") * @param isFinal if a "final" call, meaning the input must all be consumed * @return tuple (unicode_result, bytes_consumed) */ public static PyTuple utf_32_be_decode(String bytes, String errors, boolean isFinal) { return PyUnicode_DecodeUTF32Stateful(bytes, errors, ByteOrder.BE, isFinal, false); } /** * Decode a sequence of bytes representing the UTF-32 encoded form of a Unicode string and * return as a tuple the unicode text, the amount of input consumed, and the decoding * "endianness" used (in the Python -1, 0, +1 convention). The endianness, if not unspecified * (=0), will be deduced from a byte-order mark and returned. (This codec entrypoint is used in * that way in the utf_32.py codec, but only until the byte order is known.) When * not defined by a BOM, processing assumes big-endian coding (Java platform default), but * returns "unspecified". (The utf_32.py codec treats this as an error, once more * than 4 bytes have been processed.) (Java platform default). The unicode text is presented as * a Java String (the UTF-16 representation used by {@link PyUnicode}). * * @param bytes to be decoded (Jython {@link PyString} convention) * @param errors error policy name (e.g. 
"ignore", "replace") * @param byteorder decoding "endianness" specified (in the Python -1, 0, +1 convention) * @return tuple (unicode_result, bytes_consumed, endianness) */ public static PyTuple utf_32_ex_decode(String bytes, String errors, int byteorder) { return utf_32_ex_decode(bytes, errors, byteorder, false); } /** * Decode (perhaps partially) a sequence of bytes representing the UTF-32 encoded form of a * Unicode string and return as a tuple the unicode text, the amount of input consumed, and the * decoding "endianness" used (in the Python -1, 0, +1 convention). The endianness will be that * specified, will have been deduced from a byte-order mark, if present, or will be big-endian * (Java platform default). Or it may still be undefined if fewer than 4 bytes are presented. * (This codec entrypoint is used in the utf-32 codec only untile the byte order is known.) The * unicode text is presented as a Java String (the UTF-16 representation used by * {@link PyUnicode}). * * @param bytes to be decoded (Jython {@link PyString} convention) * @param errors error policy name (e.g. "ignore", "replace") * @param byteorder decoding "endianness" specified (in the Python -1, 0, +1 convention) * @param isFinal if a "final" call, meaning the input must all be consumed * @return tuple (unicode_result, bytes_consumed, endianness) */ public static PyTuple utf_32_ex_decode(String bytes, String errors, int byteorder, boolean isFinal) { ByteOrder order = ByteOrder.fromInt(byteorder); return PyUnicode_DecodeUTF32Stateful(bytes, errors, order, isFinal, true); } /** * Decode (perhaps partially) a sequence of bytes representing the UTF-32 encoded form of a * Unicode string and return as a tuple the (Jython internal representation of) the unicode * text, the amount of input consumed, and if requested, the decoding "endianness" used (in * Python -1, 0, +1 conventions). The state we preserve is our read position, i.e. how many * bytes we have consumed and the byte order (endianness). 
If the input ends part way through a * UTF-32 sequence (4 bytes) the data reported as consumed is just that up to and not including * the first of these bytes. The Java String in the returned tuple is a UTF-16 representation of * the Unicode result, in line with Java conventions, where Unicode characters above the BMP are * represented as surrogate pairs. * * @param bytes input represented as String (Jython PyString convention) * @param errors error policy name (e.g. "ignore", "replace") * @param order LE, BE or UNDEFINED (meaning bytes may begin with a byte order mark) * @param isFinal if a "final" call, meaning the input must all be consumed * @param findOrder if the returned tuple should include a report of the byte order * @return tuple (unicode_result, bytes_consumed [, endianness]) */ private static PyTuple PyUnicode_DecodeUTF32Stateful(String bytes, String errors, ByteOrder order, boolean isFinal, boolean findOrder) { int size = bytes.length(); // Number of bytes waiting (not necessarily multiple of 4) int limit = size & ~0x3; // First index at which fewer than 4 bytes will be available // Output Unicode characters will build up here (as UTF-16: StringBuilder unicode = new StringBuilder(1 + limit / 4); int q = 0; // Read pointer in bytes if (limit > 0) { /* * Check for BOM (U+FEFF) in the input and adjust current byte order setting * accordingly. If we know the byte order (it is LE or BE) then bytes ressembling a byte * order mark are actually a ZERO WIDTH NON-BREAKING SPACE and will be passed through to * the output in the main codec loop as such. */ if (order == ByteOrder.UNDEFINED) { /* * The byte order is not known. If the first 4 bytes is a BOM for LE or BE, that * will set the byte order and the BOM will not be copied to the output. Otherwise * these bytes are data and will be left for the main codec loop to consume. 
*/ char a = bytes.charAt(q); if (a == 0xff) { if (bytes.charAt(q + 1) == 0xfe && bytes.charAt(q + 2) == 0 && bytes.charAt(q + 3) == 0) { // Somebody set up us the BOM (0xff 0xfe 0x00 0x00) - LE order = ByteOrder.LE; q += 4; } } else if (a == 0) { if (bytes.charAt(q + 1) == 0 && bytes.charAt(q + 2) == 0xfe && bytes.charAt(q + 3) == 0xff) { // Other (big-endian) BOM (0x00 0x00 0xfe 0xff) - already set BE order = ByteOrder.BE; q += 4; } } /* * If no BOM found, order is still undefined. This is an error to utf_32.py, but * here is treated as big-endian. */ } /* * Main codec loop consumes 4 bytes and emits one code point with each pass, until there * are fewer than 4 bytes left. There's a version for each endianness */ if (order != ByteOrder.LE) { q = PyUnicode_DecodeUTF32BELoop(unicode, bytes, q, limit, errors); } else { q = PyUnicode_DecodeUTF32LELoop(unicode, bytes, q, limit, errors); } } /* * We have processed all we can: if we have some bytes left over that we can't store for * next time, that's an error. */ if (isFinal && q < size) { q = codecs.insertReplacementAndGetResume(unicode, errors, "utf-32", // bytes, q, size, "truncated data"); } // Finally, the return depends whether we were asked to work out the byte order if (findOrder) { return decode_tuple(unicode.toString(), q, order); } else { return decode_tuple(unicode.toString(), q); } } /** * Helper to {@link #PyUnicode_DecodeUTF32Stateful(String, String, ByteOrder, boolean, boolean)} * when big-endian decoding is to be carried out. * * @param unicode character output * @param bytes input represented as String (Jython PyString convention) * @param q number of elements already consumed from bytes array * @param limit (multiple of 4) first byte not to process * @param errors error policy name (e.g. 
"ignore", "replace") * @return number of elements consumed now from bytes array */ private static int PyUnicode_DecodeUTF32BELoop(StringBuilder unicode, String bytes, int q, int limit, String errors) { /* * Main codec loop consumes 4 bytes and emits one code point with each pass, until there are * fewer than 4 bytes left. */ while (q < limit) { // Read 4 bytes in two 16-bit chunks according to byte order int hi, lo; hi = (bytes.charAt(q) << 8) | bytes.charAt(q + 1); lo = (bytes.charAt(q + 2) << 8) | bytes.charAt(q + 3); if (hi == 0) { // It's a BMP character so we can't go wrong unicode.append((char)lo); q += 4; } else { // Code may be invalid: let the appendCodePoint method detect that try { unicode.appendCodePoint((hi << 16) + lo); q += 4; } catch (IllegalArgumentException e) { q = codecs.insertReplacementAndGetResume(unicode, errors, "utf-32", // bytes, q, q + 4, "codepoint not in range(0x110000)"); } } } return q; } /** * Helper to {@link #PyUnicode_DecodeUTF32Stateful(String, String, ByteOrder, boolean, boolean)} * when little-endian decoding is to be carried out. * * @param unicode character output * @param bytes input represented as String (Jython PyString convention) * @param q number of elements already consumed from bytes array * @param limit (multiple of 4) first byte not to process * @param errors error policy name (e.g. "ignore", "replace") * @return number of elements consumed now from bytes array */ private static int PyUnicode_DecodeUTF32LELoop(StringBuilder unicode, String bytes, int q, int limit, String errors) { /* * Main codec loop consumes 4 bytes and emits one code point with each pass, until there are * fewer than 4 bytes left. 
*/ while (q < limit) { // Read 4 bytes in two 16-bit chunks according to byte order int hi, lo; hi = (bytes.charAt(q + 3) << 8) | bytes.charAt(q + 2); lo = (bytes.charAt(q + 1) << 8) | bytes.charAt(q); if (hi == 0) { // It's a BMP character so we can't go wrong unicode.append((char)lo); q += 4; } else { // Code may be invalid: let the appendCodePoint method detect that try { unicode.appendCodePoint((hi << 16) + lo); q += 4; } catch (IllegalArgumentException e) { q = codecs.insertReplacementAndGetResume(unicode, errors, "utf-32", // bytes, q, q + 4, "codepoint not in range(0x110000)"); } } } return q; } /* --- RawUnicodeEscape Codec ----------------------------------------- */ public static PyTuple raw_unicode_escape_encode(String str) { return raw_unicode_escape_encode(str, null); } public static PyTuple raw_unicode_escape_encode(String str, String errors) { return encode_tuple(codecs.PyUnicode_EncodeRawUnicodeEscape(str, errors, false), str.length()); } public static PyTuple raw_unicode_escape_decode(String str) { return raw_unicode_escape_decode(str, null); } public static PyTuple raw_unicode_escape_decode(String str, String errors) { return decode_tuple(codecs.PyUnicode_DecodeRawUnicodeEscape(str, errors), str.length()); } /* --- unicode-escape Codec ------------------------------------------- */ public static PyTuple unicode_escape_encode(String str) { return unicode_escape_encode(str, null); } public static PyTuple unicode_escape_encode(String str, String errors) { return encode_tuple(PyString.encode_UnicodeEscape(str, false), str.length()); } public static PyTuple unicode_escape_decode(String str) { return unicode_escape_decode(str, null); } public static PyTuple unicode_escape_decode(String str, String errors) { int n = str.length(); return decode_tuple(PyString.decode_UnicodeEscape(str, 0, n, errors, true), n); } /* --- UnicodeInternal Codec ------------------------------------------ */ /* * This codec is supposed to deal with an encoded form equal to the 
internal representation of * the unicode object considered as bytes in memory. This was confusing in CPython as it varied * with machine architecture (width and endian-ness). In Jython, where both are fixed, the most * compatible choice is UTF-32BE. The codec is deprecated in v3.3 as irrelevant, or impossible, * in view of the flexible string representation (which Jython emulates in its own way). * * See http://mail.python.org/pipermail/python-dev/2011-November/114415.html */ /** * Legacy method to encode given unicode in CPython wide-build internal format (equivalent * UTF-32BE). */ @Deprecated public static PyTuple unicode_internal_encode(String unicode) { return utf_32_be_encode(unicode, null); } /** * Legacy method to encode given unicode in CPython wide-build internal format (equivalent * UTF-32BE). There must be a multiple of 4 bytes. */ @Deprecated public static PyTuple unicode_internal_encode(String unicode, String errors) { return utf_32_be_encode(unicode, errors); } /** * Legacy method to decode given bytes as if CPython wide-build internal format (equivalent * UTF-32BE). There must be a multiple of 4 bytes. */ @Deprecated public static PyTuple unicode_internal_decode(String bytes) { return utf_32_be_decode(bytes, null, true); } /** * Legacy method to decode given bytes as if CPython wide-build internal format (equivalent * UTF-32BE). There must be a multiple of 4 bytes. */ @Deprecated public static PyTuple unicode_internal_decode(String bytes, String errors) { return utf_32_be_decode(bytes, errors, true); } /** * Optimized charmap encoder mapping. * * Uses a trie structure instead of a dictionary; the speedup primarily comes from not creating * integer objects in the process. The trie is created by inverting the encoding map. 
*/ @Untraversable @ExposedType(name = "EncodingMap", isBaseType = false) public static class EncodingMap extends PyObject { char[] level1; char[] level23; int count2; int count3; private EncodingMap(char[] level1, char[] level23, int count2, int count3) { this.level1 = level1; this.level23 = level23; this.count2 = count2; this.count3 = count3; } /** * Create and populate an EncodingMap from a 256 length PyUnicode char. Returns a * PyDictionary if the mapping isn't easily optimized. * * @param string a 256 length unicode mapping * @return an encoder mapping */ public static PyObject buildEncodingMap(PyObject string) { if (!(string instanceof PyUnicode) || string.__len__() != 256) { throw Py.TypeError("bad argument type for built-in operation"); } boolean needDict = false; char[] level1 = new char[32]; char[] level23 = new char[512]; int i; int count2 = 0; int count3 = 0; String decode = string.toString(); for (i = 0; i < level1.length; i++) { level1[i] = 0xFF; } for (i = 0; i < level23.length; i++) { level23[i] = 0xFF; } if (decode.charAt(0) != 0) { needDict = true; } for (i = 1; i < 256; i++) { int l1, l2; char charAt = decode.charAt(i); if (charAt == 0) { needDict = true; } if (charAt == 0xFFFE) { // unmapped character continue; } l1 = charAt >> 11; l2 = charAt >> 7; if (level1[l1] == 0xFF) { level1[l1] = (char)count2++; } if (level23[l2] == 0xFF) { level23[l2] = (char)count3++; } } if (count2 > 0xFF || count3 > 0xFF) { needDict = true; } if (needDict) { PyObject result = new PyDictionary(); for (i = 0; i < 256; i++) { result.__setitem__(Py.newInteger(decode.charAt(i)), Py.newInteger(i)); } return result; } // Create a three-level trie int length2 = 16 * count2; int length3 = 128 * count3; level23 = new char[length2 + length3]; PyObject result = new EncodingMap(level1, level23, count2, count3); for (i = 0; i < length2; i++) { level23[i] = 0xFF; } for (i = length2; i < length2 + length3; i++) { level23[i] = 0; } count3 = 0; for (i = 1; i < 256; i++) { int o1, o2, 
o3, i2, i3; char charAt = decode.charAt(i); if (charAt == 0xFFFE) { // unmapped character continue; } o1 = charAt >> 11; o2 = (charAt >> 7) & 0xF; i2 = 16 * level1[o1] + o2; if (level23[i2] == 0xFF) { level23[i2] = (char)count3++; } o3 = charAt & 0x7F; i3 = 128 * level23[i2] + o3; level23[length2 + i3] = (char)i; } return result; } /** * Lookup a char in the EncodingMap. * * @param c a char * @return an int, -1 for failure */ public int lookup(char c) { int l1 = c >> 11; int l2 = (c >> 7) & 0xF; int l3 = c & 0x7F; int i; if (c == 0) { return 0; } // level 1 i = level1[l1]; if (i == 0xFF) { return -1; } // level 2 i = level23[16 * i + l2]; if (i == 0xFF) { return -1; } // level 3 i = level23[16 * count2 + 128 * i + l3]; if (i == 0) { return -1; } return i; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy