/*
* Copyright (c)2013 Jython Developers. Original Java version copyright 2000 Finn Bock.
*
* This program contains material copyrighted by: Copyright (c) Corporation for National Research
* Initiatives. Originally written by Marc-Andre Lemburg ([email protected]).
*/
package org.python.core;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import org.python.core.util.StringUtil;
import org.python.modules._codecs;
/**
 * This class implements the codec registry and utility methods supporting codecs, such as those
 * providing the standard replacement strategies ("ignore", "backslashreplace", etc.). The _codecs
 * module relies heavily on apparatus implemented here, and therefore so does the Python
 * {@code codecs} module (in {@code Lib/codecs.py}). It corresponds approximately to CPython's
 * {@code Python/codecs.c}.
 *
 * The class also contains the inner methods of the standard Unicode codecs, available for
 * transcoding of text at the Java level. These also are exposed through the {@code _codecs}
 * module. In CPython, the implementations are found in {@code Objects/unicodeobject.c}.
 *
 * @since Jython 2.0
 */
public class codecs {
public static final String BACKSLASHREPLACE = "backslashreplace";
public static final String IGNORE = "ignore";
public static final String REPLACE = "replace";
public static final String XMLCHARREFREPLACE = "xmlcharrefreplace";
private static char Py_UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
public static String getDefaultEncoding() {
return Py.getSystemState().getCodecState().getDefaultEncoding();
}
public static void setDefaultEncoding(String encoding) {
Py.getSystemState().getCodecState().setDefaultEncoding(encoding);
}
public static PyObject lookup_error(String handlerName) {
return Py.getSystemState().getCodecState().lookup_error(handlerName);
}
public static void register_error(String name, PyObject error) {
Py.getSystemState().getCodecState().register_error(name, error);
}
public static void register(PyObject search_function) {
Py.getSystemState().getCodecState().register(search_function);
}
public static PyTuple lookup(String encoding) {
return Py.getSystemState().getCodecState().lookup(encoding);
}
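/*
 * Usage sketch (assumes an initialised Jython runtime): the registry entry for an encoding is
 * the 4-tuple returned by a registered search function, whose first two items are the encode
 * and decode functions used by encode() and decode() below.
 *
 *     PyTuple entry = codecs.lookup("UTF-8");   // name is normalised to "utf-8"
 *     PyObject encoder = entry.__getitem__(0);  // item 0: encode function
 *     PyObject decoder = entry.__getitem__(1);  // item 1: decode function
 */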
private static String normalizestring(String string) {
return string.toLowerCase().replace(' ', '-');
}
/**
 * Decode the bytes {@code v} using the codec registered for the {@code encoding}. The
 * {@code encoding} defaults to the system default encoding (see
 * {@link codecs#getDefaultEncoding()}). The string {@code errors} may name a different
 * error handling policy (built-in or registered with
 * {@link #register_error(String, PyObject)}). The default error policy is 'strict', meaning
 * that decoding errors raise a {@code ValueError}. This method is exposed through the
 * _codecs module as {@link _codecs#decode(PyString, PyString, PyString)}.
*
* @param v bytes to be decoded
* @param encoding name of encoding (to look up in codec registry)
* @param errors error policy name (e.g. "ignore", "replace")
* @return Unicode string decoded from bytes
*/
public static PyObject decode(PyString v, String encoding, String errors) {
if (encoding == null) {
encoding = getDefaultEncoding();
} else {
encoding = normalizestring(encoding);
}
if (errors != null) {
errors = errors.intern();
}
/* Shortcut for ascii encoding */
if (encoding.equals("ascii")) {
return wrapDecodeResult(PyUnicode_DecodeASCII(v.toString(), v.__len__(), errors));
}
/* Decode via the codec registry */
PyObject decoder;
try {
decoder = lookup(encoding).__getitem__(1);
} catch (PyException ex) {
if (ex.match(Py.LookupError)) {
// If we couldn't find an encoding, see if we have a builtin
if (encoding.equals("utf-8")) {
return wrapDecodeResult(PyUnicode_DecodeUTF8(v.toString(), errors));
} else if (encoding.equals("utf-7")) {
return wrapDecodeResult(PyUnicode_DecodeUTF7(v.toString(), errors));
} else if (encoding.equals("latin-1")) {
return wrapDecodeResult(PyUnicode_DecodeLatin1(v.toString(), v.__len__(),
errors));
}
}
throw ex;
}
PyObject result;
if (errors != null) {
result = decoder.__call__(v, new PyString(errors));
} else {
result = decoder.__call__(v);
}
if (!(result instanceof PyTuple) || result.__len__() != 2) {
throw Py.TypeError("decoder must return a tuple (object,integer)");
}
return result.__getitem__(0);
}
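/*
 * Usage sketch (assumes an initialised Jython runtime). With the "replace" policy, an
 * undecodable byte is replaced by U+FFFD in the result:
 *
 *     PyString bytes = new PyString("caf\u00e9");              // char 0xE9 stands for byte 0xE9
 *     PyObject u = codecs.decode(bytes, "utf-8", "replace");   // "caf" followed by U+FFFD
 */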
private static PyUnicode wrapDecodeResult(String result) {
return new PyUnicode(result);
}
/**
 * Encode {@code v} using the codec registered for the {@code encoding}. The {@code encoding}
 * defaults to the system default encoding (see {@link codecs#getDefaultEncoding()}). The
 * string {@code errors} may name a different error handling policy (built-in or registered
 * with {@link #register_error(String, PyObject)}). The default error policy is 'strict',
 * meaning that encoding errors raise a {@code ValueError}.
*
* @param v unicode string to be encoded
* @param encoding name of encoding (to look up in codec registry)
* @param errors error policy name (e.g. "ignore")
* @return bytes object encoding v
*/
// XXX v should probably be declared PyUnicode (or thing delivering unicode code points)
public static String encode(PyString v, String encoding, String errors) {
if (encoding == null) {
encoding = getDefaultEncoding();
} else {
encoding = normalizestring(encoding);
}
if (errors != null) {
errors = errors.intern();
}
/*
* Shortcuts for common default encodings. latin-1 must not use the lookup registry for the
* encodings module to work correctly
*/
if (encoding.equals("latin-1")) {
return PyUnicode_EncodeLatin1(v.toString(), v.__len__(), errors);
} else if (encoding.equals("ascii")) {
return PyUnicode_EncodeASCII(v.toString(), v.__len__(), errors);
}
/* Encode via the codec registry */
PyObject encoder;
try {
encoder = lookup(encoding).__getitem__(0);
} catch (PyException ex) {
if (ex.match(Py.LookupError)) {
// If we couldn't find an encoding, see if we have a builtin
if (encoding.equals("utf-8")) {
return PyUnicode_EncodeUTF8(v.toString(), errors);
} else if (encoding.equals("utf-7")) {
return codecs.PyUnicode_EncodeUTF7(v.toString(), false, false, errors);
}
}
throw ex;
}
PyObject result;
if (errors != null) {
result = encoder.__call__(v, new PyString(errors));
} else {
result = encoder.__call__(v);
}
if (!(result instanceof PyTuple) || result.__len__() != 2) {
throw Py.TypeError("encoder must return a tuple (object,integer)");
}
PyObject encoded = result.__getitem__(0);
if (encoded instanceof PyString) {
return encoded.toString();
} else {
throw Py.TypeError("encoder did not return a string/unicode object (type="
+ encoded.getType().fastGetName() + ")");
}
}
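/*
 * Usage sketch (assumes an initialised Jython runtime). The returned String carries the
 * encoded bytes in the low 8 bits of its chars, the PyString convention used throughout:
 *
 *     String bytes = codecs.encode(new PyUnicode("caf\u00e9"), "latin-1", "strict");
 *     // bytes.charAt(3) == 0xE9
 */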
public static PyObject strict_errors(PyObject[] args, String[] kws) {
ArgParser ap = new ArgParser("strict_errors", args, kws, "exc");
PyObject exc = ap.getPyObject(0);
if (Py.isInstance(exc, Py.UnicodeDecodeError)) {
throw new PyException(Py.UnicodeDecodeError, exc);
} else if (Py.isInstance(exc, Py.UnicodeEncodeError)) {
throw new PyException(Py.UnicodeEncodeError, exc);
} else if (Py.isInstance(exc, Py.UnicodeTranslateError)) {
throw new PyException(Py.UnicodeTranslateError, exc);
}
throw wrong_exception_type(exc);
}
public static PyObject ignore_errors(PyObject[] args, String[] kws) {
ArgParser ap = new ArgParser("ignore_errors", args, kws, "exc");
PyObject exc = ap.getPyObject(0);
if (!isUnicodeError(exc)) {
throw wrong_exception_type(exc);
}
PyObject end = exc.__getattr__("end");
return new PyTuple(Py.EmptyUnicode, end);
}
private static boolean isUnicodeError(PyObject exc) {
return Py.isInstance(exc, Py.UnicodeDecodeError)
|| Py.isInstance(exc, Py.UnicodeEncodeError)
|| Py.isInstance(exc, Py.UnicodeTranslateError);
}
public static PyObject replace_errors(PyObject[] args, String[] kws) {
ArgParser ap = new ArgParser("replace_errors", args, kws, "exc");
PyObject exc = ap.getPyObject(0);
if (Py.isInstance(exc, Py.UnicodeEncodeError)) {
int end = exceptions.getEnd(exc, true);
return new PyTuple(new PyUnicode("?"), Py.newInteger(end));
} else if (Py.isInstance(exc, Py.UnicodeDecodeError)) {
int end = exceptions.getEnd(exc, false);
return new PyTuple(new PyUnicode(Py_UNICODE_REPLACEMENT_CHARACTER), Py.newInteger(end));
} else if (Py.isInstance(exc, Py.UnicodeTranslateError)) {
int end = exceptions.getEnd(exc, true);
return new PyTuple(new PyUnicode(Py_UNICODE_REPLACEMENT_CHARACTER), Py.newInteger(end));
}
throw wrong_exception_type(exc);
}
public static PyObject xmlcharrefreplace_errors(PyObject[] args, String[] kws) {
ArgParser ap = new ArgParser("xmlcharrefreplace_errors", args, kws, "exc");
PyObject exc = ap.getPyObject(0);
if (!Py.isInstance(exc, Py.UnicodeEncodeError)) {
throw wrong_exception_type(exc);
}
int start = ((PyInteger)exc.__getattr__("start")).getValue();
int end = ((PyInteger)exc.__getattr__("end")).getValue();
String object = exc.__getattr__("object").toString();
StringBuilder replacement = new StringBuilder();
xmlcharrefreplace_internal(start, end, object, replacement);
return new PyTuple(Py.java2py(replacement.toString()), exc.__getattr__("end"));
}
public static StringBuilder xmlcharrefreplace(int start, int end, String toReplace) {
StringBuilder replacement = new StringBuilder();
xmlcharrefreplace_internal(start, end, toReplace, replacement);
return replacement;
}
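/*
 * Worked example: for the single character U+00E9 (decimal 233) the loop below produces the
 * decimal character reference "&#233;", so
 *
 *     codecs.xmlcharrefreplace(0, 1, "\u00e9").toString()   // -> "&#233;"
 */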
private static void xmlcharrefreplace_internal(int start, int end, String object,
StringBuilder replacement) {
for (int i = start; i < end; i++) {
replacement.append("&#"); // open a decimal XML character reference, closed by ';' below
char cur = object.charAt(i);
int digits;
int base;
if (cur < 10) {
digits = 1;
base = 1;
} else if (cur < 100) {
digits = 2;
base = 10;
} else if (cur < 1000) {
digits = 3;
base = 100;
} else if (cur < 10000) {
digits = 4;
base = 1000;
} else if (cur < 100000) {
digits = 5;
base = 10000;
} else if (cur < 1000000) {
digits = 6;
base = 100000;
} else {
digits = 7;
base = 1000000;
}
while (digits-- > 0) {
replacement.append((char)('0' + cur / base));
cur %= base;
base /= 10;
}
replacement.append(';');
}
}
private static PyException wrong_exception_type(PyObject exc) {
PyObject excClass = exc.__getattr__("__class__");
PyObject className = excClass.__getattr__("__name__");
return new PyException(Py.TypeError, "Don't know how to handle " + className
+ " in error callback");
}
static char hexdigits[] = {//@formatter:off
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
}; //@formatter:on
public static PyObject backslashreplace_errors(PyObject[] args, String[] kws) {
ArgParser ap = new ArgParser("backslashreplace_errors", args, kws, "exc");
PyObject exc = ap.getPyObject(0);
if (!Py.isInstance(exc, Py.UnicodeEncodeError)) {
throw wrong_exception_type(exc);
}
int start = ((PyInteger)exc.__getattr__("start")).getValue();
int end = ((PyInteger)exc.__getattr__("end")).getValue();
String object = exc.__getattr__("object").toString();
StringBuilder replacement = new StringBuilder();
backslashreplace_internal(start, end, object, replacement);
return new PyTuple(Py.java2py(replacement.toString()), exc.__getattr__("end"));
}
public static StringBuilder backslashreplace(int start, int end, String toReplace) {
StringBuilder replacement = new StringBuilder();
backslashreplace_internal(start, end, toReplace, replacement);
return replacement;
}
private static void backslashreplace_internal(int start, int end, String object,
StringBuilder replacement) {
for (Iterator iter = new StringSubsequenceIterator(object, start, end, 1); iter
.hasNext();) {
int c = iter.next();
replacement.append('\\');
if (c >= 0x00010000) {
replacement.append('U');
replacement.append(hexdigits[(c >> 28) & 0xf]);
replacement.append(hexdigits[(c >> 24) & 0xf]);
replacement.append(hexdigits[(c >> 20) & 0xf]);
replacement.append(hexdigits[(c >> 16) & 0xf]);
replacement.append(hexdigits[(c >> 12) & 0xf]);
replacement.append(hexdigits[(c >> 8) & 0xf]);
} else if (c >= 0x100) {
replacement.append('u');
replacement.append(hexdigits[(c >> 12) & 0xf]);
replacement.append(hexdigits[(c >> 8) & 0xf]);
} else {
replacement.append('x');
}
replacement.append(hexdigits[(c >> 4) & 0xf]);
replacement.append(hexdigits[c & 0xf]);
}
}
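/*
 * Worked examples of the escape forms chosen above, shown as Java string literals:
 *
 *     codecs.backslashreplace(0, 1, "\u00e9").toString()   // -> "\\xe9"   (code point < 0x100)
 *     codecs.backslashreplace(0, 1, "\u20ac").toString()   // -> "\\u20ac" (0x100..0xFFFF)
 */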
/* --- UTF-7 Codec -------------------------------------------------------- */
/*
* This codec was converted to Java from the CPython v2.7.3 final. See RFC2152 for details of
* the encoding scheme. We encode conservatively and decode liberally.
*/
/* //@formatter:off
* The UTF-7 encoder treats ASCII characters differently according to whether they are Set D,
* Set O, Whitespace, or special (i.e. none of the above). See RFC2152. This array identifies
* these different sets:
* 0 : "Set D"
* alphanumeric and '(),-./:?
* 1 : "Set O"
* !"#$%&*;<=>@[]^_`{|}
* 2 : "whitespace"
* ht nl cr sp
* 3 : special (must be base64 encoded)
* everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
*/
private static final byte[] utf7_category = {
/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* sp ! " # $ % & ' ( ) * + , - . / */
2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
/* @ A B C D E F G H I J K L M N O */
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* P Q R S T U V W X Y Z [ \ ] ^ _ */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
/* ` a b c d e f g h i j k l m n o */
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* p q r s t u v w x y z { | } ~ del */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};//@formatter:on
/**
* Determine whether, in the UTF-7 encoder, this character should be encoded as itself. The
* answer depends on whether we are encoding set O (optional special characters) as itself, and
* also on whether we are encoding whitespace as itself. RFC2152 makes it clear that the answers
* to these questions vary between applications, so this code needs to be flexible.
*
* @param c code point of the character
* @param directO true if characters in "set O" may be encoded as themselves
* @param directWS true if whitespace characters may be encoded as themselves
* @return {@code true} if {@code c} should be encoded as itself
*/
private static boolean ENCODE_DIRECT(int c, boolean directO, boolean directWS) {
if (c >= 128 || c < 0) {
return false; // Character not in table is always special
} else {
switch (utf7_category[c]) {
case 0: // Set D: a regular character, encoded as itself
return true;
case 1: // Set O: an optional special character
return directO;
case 2: // A whitespace character
return directWS;
default: // This is always a special character (including '+')
return false;
}
}
}
/** Look-up for the Base64 encoded byte [0..0x3f] */
private static final String B64_CHARS =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/** What is the Base64 encoded byte for (the bottom 6 bits of) n? */
private static char TO_BASE64(int n) {
return B64_CHARS.charAt(n & 0x3f);
}
/**
 * Is c the code point of a Base64 character? And if so, what is the 6-bit quantity to be
 * decoded from c? Return the 6-bit equivalent of c in a Base64 segment, -1 if it cannot be used
 * in a Base64 segment, and -2 for the special case of '-' ending the segment.
 */
private static int FROM_BASE64(int c) {
return (c >= 128) ? -1 : BASE64_VALUE[c];
}
/**
* Look-up table to convert ASCII byte to 6-bit Base64 value, -1 if not Base64, and -2 if
* special terminator '-'.
*/
private static final byte[] BASE64_VALUE = {//@formatter:off
// nul soh stx etx eot enq ack bel bs ht nl vt np cr so si
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
// dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
// sp ! " # $ % & ' ( ) * + , - . /
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
// @ A B C D E F G H I J K L M N O
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
// P Q R S T U V W X Y Z [ \ ] ^ _
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
// ` a b c d e f g h i j k l m n o
-1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
// p q r s t u v w x y z { | } ~ del
41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
};//@formatter:on
/**
* Enumeration of the error causes during decoding of the Base64 segment of UTF-7
*/
static enum UTF7Error {
NONE("No error"), // No error
PADDING("non-zero padding bits in shift sequence"), // Error when at end
PARTIAL("partial character in shift sequence"), // Error when at end
TRUNCATED("second surrogate missing at end of shift sequence"), // Error when at end
MISSING("second surrogate missing"), // Lead surrogate followed by another, or BMP
TRAIL("unexpected second surrogate"); // Trail surrogate not preceded by lead
/** Suitable error message */
final String msg;
private UTF7Error(String msg) {
this.msg = msg;
}
}
/**
* Decode (perhaps partially) a sequence of bytes representing the UTF-7 encoded form of a
* Unicode string and return the (Jython internal representation of) the unicode object, and
* amount of input consumed. The only state we preserve is our read position, i.e. how many
* bytes we have consumed. So if the input ends part way through a Base64 sequence the data
* reported as consumed is just that up to and not including the Base64 start marker ('+').
* Performance will be poor (quadratic cost) on runs of Base64 data long enough to exceed the
* input quantum in incremental decoding. The returned Java String is a UTF-16 representation of
* the Unicode result, in line with Java conventions. Unicode characters above the BMP are
* represented as surrogate pairs.
*
* @param bytes input represented as String (Jython PyString convention)
* @param errors error policy name (e.g. "ignore", "replace")
* @param consumed returns number of bytes consumed in element 0, or is null if a "final" call
* @return unicode result (as UTF-16 Java String)
*/
public static String PyUnicode_DecodeUTF7Stateful(String bytes, String errors, int[] consumed) {
int s; // Index in the input bytes
boolean inBase64 = false; // Whether s is currently in a Base64 segment
long base64buffer = 0; // Stored bits buffer during Base64 decoding
int base64bits = 0; // Number of valid bits buffered during Base64 decoding
int startInBytes = 0; // Place in input bytes where most recent Base64 segment begins
int syncInBytes = 0; // Place in input bytes where stored bits buffer last empty
int startInUnicode = 0; // Place in output unicode where most recent Base64 segment begins
int size = bytes.length();
StringBuilder unicode = new StringBuilder(size);
for (s = 0; s < size; s++) { // In error cases s may skip forwards in bytes
// Next byte to process
int b = bytes.charAt(s);
if (b >= 128) {
// The input was supposed to be 7-bit clean
s = insertReplacementAndGetResume(unicode, errors, "utf-7", //
bytes, s, s + 1, "unexpected special character") - 1;
} else if (inBase64) {
// We are currently processing a Base64 section
if (base64bits == 0) {
// Mark this point as latest easy error recovery point (bits buffer empty)
syncInBytes = s;
}
int sixBits = FROM_BASE64(b); // returns -ve if not Base64
if (sixBits >= 0) {
// And we continue processing a Base64 section
base64buffer = (base64buffer << 6) | sixBits;
base64bits += 6;
if (base64bits >= 32) {
// We have enough bits for a code point
base64bits = emitCodePoints(unicode, base64buffer, base64bits);
if (base64bits >= 32) {
// We stopped prematurely. Why?
UTF7Error error = emitCodePointsDiagnosis(base64buffer, base64bits);
// Difficult to know exactly what input characters to blame
s = insertReplacementAndGetResume(unicode, errors, "utf-7", //
bytes, syncInBytes, s + 1, error.msg) - 1;
// Discard one UTF-16 output and hope for the best
base64bits -= 16;
}
}
} else {
// We are now leaving a Base64 section
inBase64 = false;
// We should have a whole number of code points and < 6 bits zero padding
if (base64bits > 0) {
// Try to emit them all
base64bits = emitCodePoints(unicode, base64buffer, base64bits);
// Now check for errors
UTF7Error error = emitCodePointsDiagnosis(base64buffer, base64bits);
if (error != UTF7Error.NONE) {
// Difficult to know exactly what input characters to blame
s = insertReplacementAndGetResume(unicode, errors, "utf-7", //
bytes, s, s + 1, error.msg) - 1;
}
// We are, in any case, discarding whatever is in the buffer
base64bits = 0;
}
if (b == '-') {
/*
* '-' signals the end of Base64. The byte is simply absorbed, but in the
* special case where it is the first byte of the Base64 segment, the
* zero-length segment '+-' actually encodes "+".
*/
if (s == startInBytes + 1) {
unicode.append('+');
}
} else {
/*
* This b is a US-ASCII byte for some character.
*/
unicode.appendCodePoint(b);
}
}
} else if (b == '+') {
/*
* We are not currently processing a Base64 section, but this starts one. Remember
* where it starts, in the input bytes and the output unicode so that, if we hit the
* end of input before it ends, we can leave it unprocessed for next time.
*/
startInBytes = s;
startInUnicode = unicode.length();
// Initialise the Base64 decoder
base64bits = 0;
inBase64 = true;
} else {
/*
* This b is a US-ASCII byte for some character. We are permissive on decoding; the
* only ASCII byte not decoding to itself is the + which begins a base64 string.
*/
unicode.appendCodePoint(b);
}
}
/*
* We hit the end of the input. If we were part way through some Base64 processing, since we
* don't store all that state (inBase64, base64bits, base64buffer), the strategy is to back
* up the input pointer to the '+' that started the current Base64 segment.
*/
if (inBase64) {
// Restore state to beginning of last Base64 sequence
s = startInBytes;
unicode.setLength(startInUnicode);
}
if (consumed != null) {
// Not a final call, so report how much consumed in the consumed argument
consumed[0] = s;
} else if (s < size) {
// This was final but we didn't exhaust the input: that's an error.
s = insertReplacementAndGetResume(unicode, errors, "utf-7", //
bytes, startInBytes, size, "unterminated shift sequence");
}
return unicode.toString();
}
/**
* Decode completely a sequence of bytes representing the UTF-7 encoded form of a Unicode string
* and return the (Jython internal representation of) the unicode object. The returned Java
* String is a UTF-16 representation of the Unicode result, in line with Java conventions.
* Unicode characters above the BMP are represented as surrogate pairs.
*
* @param bytes input represented as String (Jython PyString convention)
* @param errors error policy name (e.g. "ignore", "replace")
* @return unicode result (as UTF-16 Java String)
*/
public static String PyUnicode_DecodeUTF7(String bytes, String errors) {
return PyUnicode_DecodeUTF7Stateful(bytes, errors, null);
}
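/*
 * Usage sketch: the Base64 run "+AOk" carries the 16 bits of U+00E9, and the following '-'
 * (simply absorbed) or any other non-Base64 byte ends the run:
 *
 *     codecs.PyUnicode_DecodeUTF7("Hi +AOk-!", "strict")   // -> "Hi " + U+00E9 + "!"
 */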
/**
 * Helper for {@link #PyUnicode_DecodeUTF7Stateful(String, String, int[])} to emit characters
 * that accumulated as UTF-16 code units in the bits of a long integer (from Base64 decoding,
 * say). The buffer variable may hold any number of bits (up to its 64-bit capacity). The number
 * of valid bits is given by argument {@code n} and they are the {@code n} least significant
 * bits of the buffer.
 *
 * Only complete Unicode characters are emitted, which are obtained by consuming 16 bits (when
 * those bits identify a BMP character), or 32 bits (when those bits form a surrogate pair).
 * Consumed bits are not cleared from the buffer (it is passed by value), and there is no need
 * for the client to clear them, but the method returns the new number of valid bits n1, which
 * are in the least significant positions (that is, bits {@code n1-1} to {@code 0}).
*
* If the method returns with 32 or more bits unconsumed, it has encountered an invalid sequence
* of bits: the leading bits will then either be an "unaccompanied" trail surrogate, or a lead
* surrogate not followed by a trail surrogate.
*
* @param v output UTF-16 sequence
* @param buffer holding the bits
* @param n the number of bits held (<=64)
* @return the number of bits not emitted (<32 unless error)
*/
private static int emitCodePoints(StringBuilder v, long buffer, int n) {
// Emit code points until too few in the buffer to process.
while (n >= 16) {
/*
* Get the top 16 bits of the buffer to bottom of an int. Note no 0xffff mask as bits to
* left of bit-15 are harmless
*/
int unit = (int)(buffer >>> (n - 16));
boolean unitIsSurrogate = ((unit & 0xF800) == 0xD800);
if (!unitIsSurrogate) {
// This (or rather its bottom 16 bits) is a BMP codepoint: easy
v.append((char)unit);
n -= 16;
} else if (n >= 32) {
// This a surrogate unit and we have enough bits for the whole code point.
if ((unit & 0x0400) == 0) {
// This is a lead surrogate as expected ... get the trail surrogate.
int unit2 = (int)(buffer >>> (n - 32));
if ((unit2 & 0xFC00) == 0xDC00) {
// And this is the trail surrogate we expected
v.appendCodePoint(0x10000 + ((unit & 0x3ff) << 10) + (unit2 & 0x3ff));
n -= 32;
} else {
// But this isn't a trail surrogate: jam at >=32
return n;
}
} else {
// This is an unaccompanied trail surrogate: jam at >=32
return n;
}
} else {
// This a non-BMP code point but we don't have enough bits to deal with it yet
return n;
}
}
return n;
}
/**
 * Helper for {@link #PyUnicode_DecodeUTF7Stateful(String, String, int[])} to diagnose what went
 * wrong in {@link #emitCodePoints(StringBuilder, long, int)}. When called with fewer than 32
 * bits in the buffer, it assumes we are in the run-down of processing at the end of the
 * decoder, where partial output characters are an error. For 32 bits or more, it duplicates
 * some logic, but is called only during abnormal processing. The possible return values are:
 *
 * NONE: no error.
 * PADDING: non-zero padding bits in shift sequence (error if at end of shift sequence).
 * PARTIAL: partial character in shift sequence (error if at end of shift sequence).
 * TRUNCATED: second surrogate missing at end of shift sequence.
 * MISSING: second surrogate missing.
 * TRAIL: unexpected second surrogate.
 *
 * We are compatible with CPython in using the term "second surrogate" in error messages rather
 * than "trail surrogate" (which is used in the code).
 *
 * Note that CPython (see Issue13333) allows this codec to decode lone surrogates into the
 * internal data of unicode objects. It is difficult to reconcile this with the v3.3 statement
 * that "Strings contain Unicode characters", but that reconciliation is probably to be found in
 * PEP383, not implemented in Jython.
*
* @param buffer holding the bits
* @param n the number of bits held (<=64)
* @return the diagnosis
*/
private static UTF7Error emitCodePointsDiagnosis(long buffer, int n) {
if (n >= 16) {
/*
* Get the top 16 bits of the buffer to bottom of an int. Note no 0xffff mask as bits to
* left of bit-15 are harmless
*/
int unit = (int)(buffer >>> (n - 16));
boolean unitIsSurrogate = ((unit & 0xF800) == 0xD800);
if (!unitIsSurrogate) {
// No problem. In practice, we should never land here.
return UTF7Error.NONE;
} else if (n >= 32) {
if ((unit & 0x0400) == 0) {
// This is a lead surrogate, which is valid: check the next 16 bits.
int unit2 = ((int)(buffer >>> (n - 32))) & 0xffff;
if ((unit2 & 0xFC00) == 0xDC00) {
// Hmm ... why was I called?
return UTF7Error.NONE;
} else {
// Not trail surrogate: that's the problem
return UTF7Error.MISSING;
}
} else {
// This is an unexpected trail surrogate
return UTF7Error.TRAIL;
}
} else {
// Note that 32 > n >= 16, so we are at the end of decoding
if ((unit & 0x0400) == 0) {
/*
* This is a lead surrogate, but since decoding stopped we must have reached the
* end of a Base64 segment without the trail surrogate appearing.
*/
return UTF7Error.TRUNCATED;
} else {
// This is an unexpected trail surrogate
return UTF7Error.TRAIL;
}
}
} else if (n >= 6) {
// Fewer than 16 bits: at end of decoding with Base64 characters left over
return UTF7Error.PARTIAL;
} else {
// Fewer than 6 bits, which should all be zero. Make a mask to extract them.
int validBits = (1 << n) - 1;
int padding = ((int)buffer) & validBits;
if (padding != 0) {
// At end of decoding with non-zero padding
return UTF7Error.PADDING;
} else {
// Any bits left are zero: that's ok then.
return UTF7Error.NONE;
}
}
}
/**
* Encode a UTF-16 Java String as UTF-7 bytes represented by the low bytes of the characters in
* a String. (String representation for byte data is chosen so that it may immediately become a
* PyString.)
*
* This method differs from the CPython equivalent (in {@code Objects/unicodeobject.c}),
* which works with an array of code points that are, in a wide build, Unicode code points.
*
* @param unicode to be encoded
* @param base64SetO true if characters in "set O" should be translated to base64
* @param base64WhiteSpace true if white-space characters should be translated to base64
* @param errors error policy name (e.g. "ignore", "replace")
* @return bytes representing the encoded unicode string
*/
public static String PyUnicode_EncodeUTF7(String unicode, boolean base64SetO,
boolean base64WhiteSpace, String errors) {
boolean inBase64 = false;
int base64bits = 0;
long base64buffer = 0;
int size = unicode.length();
// Output bytes here: sized for ASCII + a few non-BMP characters
// We use a StringBuilder and return a String, but we are really storing encoded bytes
StringBuilder v = new StringBuilder(size + size / 8 + 10);
for (int i = 0; i < size; i++) {
// Next UTF-16 code unit to process
int ch = unicode.charAt(i);
/*
* Decide what to output and prepare for it. Mainly, decide whether to represent this
* UTF-16 code unit in Base64 or US-ASCII, and switch modes, with output, accordingly.
*/
if (inBase64) {
// Currently we are in Base64 encoding: should we switch out?
if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
/*
* The next character is one for which we do not need to be in Base64, so pad
* out to 6n the Base64 bits we currently have buffered and emit them. Then
* switch to US-ASCII.
*/
emitBase64Padded(v, base64buffer, base64bits);
inBase64 = false;
if (FROM_BASE64(ch) != -1) {
// Character is in the Base64 set, or is a '-': must signal end explicitly.
v.append('-');
}
}
} else {
// Not currently in Base64 encoding: should we switch in?
if (ch == '+') {
// Special case for + since it would otherwise flag a start
v.append('+');
ch = '-'; // Comes out as +-
} else if (!ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
/*
* The next character is one for which we need to be in Base64, so switch to it
* and emit the Base64 start marker and initialise the coder.
*/
v.append('+');
inBase64 = true;
base64bits = 0;
}
}
/*
* We have decided what to do (US-ASCII or Base64) but we haven't done it yet.
*/
if (!inBase64) {
// We decided to encode the current character as US-ASCII and are in that mode
v.append((char)ch);
} else {
// We decided to encode the current character as Base64 and are in that mode
/*
* In the present implementation the characters are supplied as a UTF-16 Java
* String. The UTF-7 approach to characters beyond the BMP is to encode the
* surrogate pair as two 16-bit pseudo-characters, which is how Jython represents it
* already, so the first part is already done for us by accessing the internal
* representation.
*/
// XXX see issue #2002: we should only count surrogate pairs as one character
// if ((ch & 0xFC00)==0xD800) { count++; }
if (base64bits > 48) {
// No room for the next 16 bits: emit all we have
base64bits = emitBase64(v, base64buffer, base64bits);
}
base64bits += 16;
base64buffer = (base64buffer << 16) + ch;
}
}
/*
* We've run out of input to encode. If we are currently in US-ASCII mode, we can just stop.
* If we are in Base64 mode, we have to come to a clean stop, since there is no opportunity
* to store this fact as state for next time (and there may be no next time).
*/
if (inBase64) {
/*
* Currently we are in Base64 encoding and must switch out. Pad out to 6n the bits we
* currently have buffered and emit them. We don't know what might come next so emit a
* '-' to round out the segment.
*/
emitBase64Padded(v, base64buffer, base64bits);
v.append('-');
}
return v.toString();
}
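/*
 * Usage sketch, with Set O and whitespace passed through directly (base64SetO and
 * base64WhiteSpace both false, the arguments used by encode() above): a single non-ASCII
 * character becomes one padded, '-'-terminated Base64 run:
 *
 *     codecs.PyUnicode_EncodeUTF7("\u00e9", false, false, null)   // -> "+AOk-"
 */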
/**
 * Helper for {@link #PyUnicode_EncodeUTF7(String, boolean, boolean, String)} to emit 6-bit
 * Base64 code units as bytes to the output. The buffer variable may hold any number of bits
 * (up to its 64-bit capacity). The number of valid bits is given by argument {@code n} and
 * they are the {@code n} least significant bits of the buffer. Bits will be emitted in groups
 * of 6, represented by their Base64 character, starting with the 6 most-significant valid bits
 * of the buffer (that is, bits {@code n-6} to {@code n-1}). The buffer is not cleared (it is
 * passed by value), but the method returns the new number of valid bits n1, which are in the
 * least significant positions (that is, bits {@code n1-1} to {@code 0}).
*
* @param v output byte array
* @param buffer holding the bits
* @param n the number of bits held (<=64)
* @return the number of bits (<6) not emitted
*/
private static int emitBase64(StringBuilder v, long buffer, int n) {
while (n >= 6) {
n -= 6;
long sixBits = buffer >>> n;
char b64byte = TO_BASE64((int)sixBits);
v.append(b64byte);
}
return n;
}
/**
 * Helper for {@link #PyUnicode_EncodeUTF7(String, boolean, boolean, String)} to emit 6-bit
 * Base64 code units as bytes to the output. The buffer variable may hold any number of bits
 * (up to 60 bits). The number of valid bits is given by argument {@code n} and they are the
 * {@code n} least significant bits of the buffer. The buffer will be padded, by shifting in
 * zeros at the least significant end, until the number of valid bits is a multiple of 6. Bits
 * will then be emitted in groups of 6, represented by their Base64 character, starting with
 * the 6 most-significant valid bits of the buffer (that is, bits {@code n-6} to {@code n-1}).
 * The buffer is not cleared (it is passed by value), but can be considered empty.
*
* @param v output byte array
* @param buffer holding the bits
* @param n the number of bits held (<=60)
*/
private static void emitBase64Padded(StringBuilder v, long buffer, int n) {
if (n > 0) {
int npad = 5 - (n + 5) % 6; // smallest such that (n+npad) mod 6 == 0
emitBase64(v, buffer << npad, n + npad); // == 0 as a result of the padding
}
}
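/*
 * Worked example of the padding arithmetic: for n == 16 buffered bits (one UTF-16 unit),
 * npad == 5 - (16 + 5) % 6 == 2, so 18 bits are emitted as exactly three Base64 characters.
 */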
/* --- UTF-8 Codec ---------------------------------------------------- */
private static byte utf8_code_length[] = {//@formatter:off
/* Map UTF-8 encoded prefix byte to sequence length. zero means
illegal prefix. see RFC 2279 for details */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
}; //@formatter:on
// TODO: need to modify to use a codepoint approach (which is almost the case now,
// ch is an
public static String PyUnicode_DecodeUTF8(String str, String errors) {
return PyUnicode_DecodeUTF8Stateful(str, errors, null);
}
public static String PyUnicode_DecodeUTF8Stateful(String str, String errors, int[] consumed) {
int size = str.length();
StringBuilder unicode = new StringBuilder(size);
/* Unpack UTF-8 encoded data */
int i;
for (i = 0; i < size;) {
int ch = str.charAt(i);
if (ch < 0x80) {
unicode.append((char)ch);
i++;
continue;
}
if (ch > 0xFF) {
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
i, i + 1, "ordinal not in range(255)");
continue;
}
int n = utf8_code_length[ch];
if (i + n > size) {
if (consumed != null) {
break;
}
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
i, i + 1, "unexpected end of data");
continue;
}
switch (n) {
case 0:
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
i, i + 1, "unexpected code byte");
continue;
case 1:
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
i, i + 1, "internal error");
continue;
case 2:
char ch1 = str.charAt(i + 1);
if ((ch1 & 0xc0) != 0x80) {
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
i, i + 2, "invalid data");
continue;
}
ch = ((ch & 0x1f) << 6) + (ch1 & 0x3f);
if (ch < 0x80) {
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
i, i + 2, "illegal encoding");
continue;
} else {
unicode.appendCodePoint(ch);
}
break;
case 3:
ch1 = str.charAt(i + 1);
char ch2 = str.charAt(i + 2);
if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80) {
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
i, i + 3, "invalid data");
continue;
}
ch = ((ch & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f);
if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
i, i + 3, "illegal encoding");
continue;
} else {
unicode.appendCodePoint(ch);
}
break;
case 4:
ch1 = str.charAt(i + 1);
ch2 = str.charAt(i + 2);
char ch3 = str.charAt(i + 3);
if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80 || (ch3 & 0xc0) != 0x80) {
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
i, i + 4, "invalid data");
continue;
}
ch = ((ch & 0x7) << 18) + ((ch1 & 0x3f) << 12) + //
((ch2 & 0x3f) << 6) + (ch3 & 0x3f);
// validate and convert to UTF-16
if ((ch < 0x10000) || // minimum value allowed for 4 byte encoding
(ch > 0x10ffff)) { // maximum value allowed for UTF-16
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
i, i + 4, "illegal encoding");
continue;
}
unicode.appendCodePoint(ch);
break;
default:
// TODO: support
/* Other sizes are only needed for UCS-4 */
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
i, i + n, "unsupported Unicode code range");
continue;
}
i += n;
}
if (consumed != null) {
consumed[0] = i;
}
return unicode.toString();
}
public static String PyUnicode_EncodeUTF8(String str, String errors) {
return StringUtil.fromBytes(Charset.forName("UTF-8").encode(str));
}
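/*
 * Round-trip sketch (byte values carried in the low 8 bits of chars, as elsewhere in this
 * class):
 *
 *     String bytes = codecs.PyUnicode_EncodeUTF8("\u00e9", null);   // chars 0xC3, 0xA9
 *     codecs.PyUnicode_DecodeUTF8(bytes, null)                      // -> "\u00e9" again
 */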
/* --- ASCII and Latin-1 Codecs --------------------------------------- */
public static String PyUnicode_DecodeASCII(String str, int size, String errors) {
return PyUnicode_DecodeIntLimited(str, size, errors, "ascii", 128);
}
public static String PyUnicode_DecodeLatin1(String str, int size, String errors) {
return PyUnicode_DecodeIntLimited(str, size, errors, "latin-1", 256);
}
private static String PyUnicode_DecodeIntLimited(String str, int size, String errors,
String encoding, int limit) {
StringBuilder v = new StringBuilder(size);
String reason = "ordinal not in range(" + limit + ")";
for (int i = 0; i < size; i++) {
char ch = str.charAt(i);
if (ch < limit) {
v.append(ch);
} else {
i = insertReplacementAndGetResume(v, errors, encoding, str, i, i + 1, reason) - 1;
}
}
return v.toString();
}
public static String PyUnicode_EncodeASCII(String str, int size, String errors) {
return PyUnicode_EncodeIntLimited(str, size, errors, "ascii", 128);
}
public static String PyUnicode_EncodeLatin1(String str, int size, String errors) {
return PyUnicode_EncodeIntLimited(str, size, errors, "latin-1", 256);
}
private static String PyUnicode_EncodeIntLimited(String str, int size, String errors,
String encoding, int limit) {
String reason = "ordinal not in range(" + limit + ")";
StringBuilder v = new StringBuilder(size);
for (int i = 0; i < size; i++) {
char ch = str.charAt(i);
if (ch >= limit) {
int nextGood = i + 1;
for (; nextGood < size; nextGood++) {
if (str.charAt(nextGood) < limit) {
break;
}
}
if (errors != null) {
if (errors.equals(IGNORE)) {
i = nextGood - 1;
continue;
} else if (errors.equals(REPLACE)) {
for (int j = i; j < nextGood; j++) {
v.append('?');
}
i = nextGood - 1;
continue;
} else if (errors.equals(XMLCHARREFREPLACE)) {
v.append(xmlcharrefreplace(i, nextGood, str));
i = nextGood - 1;
continue;
} else if (errors.equals(BACKSLASHREPLACE)) {
v.append(backslashreplace(i, nextGood, str));
i = nextGood - 1;
continue;
}
}
PyObject replacement = encoding_error(errors, encoding, str, i, nextGood, reason);
String replStr = replacement.__getitem__(0).toString();
for (int j = 0; j < replStr.length(); j++) {
if (replStr.charAt(j) >= limit) {
throw Py.UnicodeEncodeError(encoding, str, i + j, i + j + 1, reason);
}
}
v.append(replStr);
i = calcNewPosition(size, replacement) - 1;
} else {
v.append(ch);
}
}
return v.toString();
}
/* --- RawUnicodeEscape Codec ---------------------------------------- */
private static char[] hexdigit = "0123456789ABCDEF".toCharArray();
// The modified flag is used by cPickle.
public static String
PyUnicode_EncodeRawUnicodeEscape(String str, String errors, boolean modified) {
StringBuilder v = new StringBuilder(str.length());
for (Iterator iter = new PyUnicode(str).newSubsequenceIterator(); iter.hasNext();) {
int codePoint = iter.next();
if (codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
// Map 32-bit characters to '\\Uxxxxxxxx'
v.append("\\U");
v.append(hexdigit[(codePoint >> 28) & 0xF]);
v.append(hexdigit[(codePoint >> 24) & 0xF]);
v.append(hexdigit[(codePoint >> 20) & 0xF]);
v.append(hexdigit[(codePoint >> 16) & 0xF]);
v.append(hexdigit[(codePoint >> 12) & 0xF]);
v.append(hexdigit[(codePoint >> 8) & 0xF]);
v.append(hexdigit[(codePoint >> 4) & 0xF]);
v.append(hexdigit[codePoint & 0xF]);
} else if (codePoint >= 256 || (modified && (codePoint == '\\' || codePoint == '\n'))) {
// Map 16-bit characters to '\\uxxxx'
v.append("\\u");
v.append(hexdigit[(codePoint >> 12) & 0xF]);
v.append(hexdigit[(codePoint >> 8) & 0xF]);
v.append(hexdigit[(codePoint >> 4) & 0xF]);
v.append(hexdigit[codePoint & 0xF]);
} else {
v.append((char)codePoint);
}
}
return v.toString();
}
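/*
 * Usage sketch: code points below 256 pass through (unless the cPickle-compatible flag makes
 * '\\' and '\n' escape), BMP characters 256 and above become \\uxxxx escapes, and
 * supplementary characters become \\Uxxxxxxxx escapes:
 *
 *     codecs.PyUnicode_EncodeRawUnicodeEscape("a\u20ac", "strict", false)   // -> "a\\u20ac"
 */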
public static String PyUnicode_DecodeRawUnicodeEscape(String str, String errors) {
int size = str.length();
StringBuilder v = new StringBuilder(size);
for (int i = 0; i < size;) {
char ch = str.charAt(i);
// Non-escape characters are interpreted as Unicode ordinals
if (ch != '\\') {
v.append(ch);
i++;
continue;
}
// \\u-escapes are only interpreted if the number of leading backslashes is
// odd
int bs = i;
while (i < size) {
ch = str.charAt(i);
if (ch != '\\') {
break;
}
v.append(ch);
i++;
}
if (((i - bs) & 1) == 0 || i >= size || (ch != 'u' && ch != 'U')) {
continue;
}
v.setLength(v.length() - 1);
int count = ch == 'u' ? 4 : 8;
i++;
// \\uXXXX with 4 hex digits, \Uxxxxxxxx with 8
int codePoint = 0, asDigit = -1;
for (int j = 0; j < count; i++, j++) {
if (i == size) {
// EOF in a truncated escape
asDigit = -1;
break;
}
ch = str.charAt(i);
asDigit = Character.digit(ch, 16);
if (asDigit == -1) {
break;
}
codePoint = ((codePoint << 4) & ~0xF) + asDigit;
}
if (asDigit == -1) {
i = codecs.insertReplacementAndGetResume(v, errors, "rawunicodeescape", str, //
bs, i, "truncated \\uXXXX");
} else {
v.appendCodePoint(codePoint);
}
}
return v.toString();
}
private static class Punycode {
// specified by punycode, http://www.ietf.org/rfc/rfc3492.txt
private static final int BASE = 36;
private static final int TMIN = 1;
private static final int TMAX = 26;
private static final int SKEW = 38;
private static final int DAMP = 700;
private static final int INITIAL_BIAS = 72;
private static final int INITIAL_N = 128;
private static final int BASIC = 0x80;
private Punycode() {
}
private static int adapt(int delta, int numpoints, boolean firsttime) {
delta = firsttime ? delta / DAMP : delta >> 1;
delta += delta / numpoints;
int k = 0;
while (delta > (((BASE - TMIN) * TMAX) / 2)) {
delta /= BASE - TMIN;
k += BASE;
}
return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
}
private static boolean isBasic(int codePoint) {
return codePoint < BASIC;
}
}
public static String PyUnicode_EncodePunycode(PyUnicode input, String errors) {
int n = Punycode.INITIAL_N;
int delta = 0;
long guard_delta;
int bias = Punycode.INITIAL_BIAS;
int b = 0;
final StringBuilder buffer = new StringBuilder();
for (Iterator iter = input.iterator(); iter.hasNext();) {
int c = iter.next();
if (Punycode.isBasic(c)) {
buffer.appendCodePoint(c);
b++;
}
}
if (b > 0) {
buffer.appendCodePoint('-');
}
int h = b;
int size = input.getCodePointCount();
while (h < size) {
int m = Integer.MAX_VALUE;
int i = 0;
int codePointIndex = 0;
for (Iterator iter = input.iterator(); iter.hasNext(); i++) {
int c = iter.next();
if (c > n && c < m) {
m = c;
codePointIndex = i;
}
}
guard_delta = delta + ((m - n) * (h + 1));
if (guard_delta > Integer.MAX_VALUE) {
throw Py.UnicodeEncodeError("punycode", input.getString(), codePointIndex,
codePointIndex + 1, "overflow");
}
delta = (int)guard_delta;
n = m;
i = 0;
for (Iterator iter = input.iterator(); iter.hasNext(); i++) {
int c = iter.next();
if (c < n) {
guard_delta = delta + 1;
if (guard_delta > Integer.MAX_VALUE) {
throw Py.UnicodeEncodeError("punycode", input.getString(), i, i + 1,
"overflow");
}
delta = (int)guard_delta;
}
if (c == n) {
int q = delta;
for (int k = Punycode.BASE;; k += Punycode.BASE) {
int t = k <= bias ? Punycode.TMIN : //
(k >= bias + Punycode.TMAX ? Punycode.TMAX : k - bias);
if (q < t) {
break;
}
buffer.appendCodePoint(t + ((q - t) % (Punycode.BASE - t)));
q = (q - t) / (Punycode.BASE - t);
}
buffer.appendCodePoint(q);
bias = Punycode.adapt(delta, h + 1, h == b);
delta = 0;
h++;
}
}
delta++;
n++;
}
return buffer.toString();
}
public static PyUnicode PyUnicode_DecodePunycode(String input, String errors) {
int input_size = input.length();
int output_size = 0;
ArrayList ucs4 = new ArrayList(input_size);
int j = 0;
for (; j < input_size; j++) {
int c = input.charAt(j);
if (!Punycode.isBasic(c)) {
throw Py.UnicodeDecodeError("punycode", input, j, j + 1, "not basic");
} else if (c == '-') {
break;
} else {
ucs4.add(c);
output_size++;
}
}
int n = Punycode.INITIAL_N;
int i = 0;
int bias = Punycode.INITIAL_BIAS;
while (j < input_size) {
int old_i = i;
int w = 1;
for (int k = Punycode.BASE;; k += Punycode.BASE) {
int c = input.charAt(j++);
int digit = c - '0';
long guard_i = i + digit * w;
if (guard_i > Integer.MAX_VALUE) {
throw Py.UnicodeDecodeError("punycode", input, j, j + 1, "overflow");
}
i = (int)guard_i;
int t = k <= bias ? Punycode.TMIN : //
(k >= bias + Punycode.TMAX ? Punycode.TMAX : k - bias);
if (digit < t) {
break;
}
long guard_w = w * Punycode.BASE - t;
if (guard_w > Integer.MAX_VALUE) {
throw Py.UnicodeDecodeError("punycode", input, j, j + 1, "overflow");
}
}
bias = Punycode.adapt(i - old_i, output_size + 1, old_i == 0);
n += i / (output_size + 1);
i %= output_size + 1;
ucs4.add(i, n);
}
return new PyUnicode(ucs4);
}
public static String PyUnicode_EncodeIDNA(PyUnicode input, String errors) {
throw new UnsupportedOperationException();
// 1. If the sequence contains any code points outside the ASCII range
// (0..7F) then proceed to step 2, otherwise skip to step 3.
//
// 2. Perform the steps specified in [NAMEPREP] and fail if there is an
// error. The AllowUnassigned flag is used in [NAMEPREP].
// this basically entails changing out space, etc.
//
// 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
//
// (a) Verify the absence of non-LDH ASCII code points; that is, the
// absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
//
// (b) Verify the absence of leading and trailing hyphen-minus; that
// is, the absence of U+002D at the beginning and end of the
// sequence.
//
// 4. If the sequence contains any code points outside the ASCII range
// (0..7F) then proceed to step 5, otherwise skip to step 8.
//
// 5. Verify that the sequence does NOT begin with the ACE prefix.
//
// 6. Encode the sequence using the encoding algorithm in [PUNYCODE] and
// fail if there is an error.
//
// 7. Prepend the ACE prefix.
//
// 8. Verify that the number of code points is in the range 1 to 63
// inclusive.
}
public static PyUnicode PyUnicode_DecodeIDNA(String input, String errors) {
throw new UnsupportedOperationException();
}
/* --- Utility methods -------------------------------------------- */
/**
* Invoke a user-defined error-handling mechanism, for errors encountered during encoding, as
* registered through {@link #register_error(String, PyObject)}. The return value is the return
* from the error handler indicating the replacement codec input and the position at
* which to resume encoding. Invokes the mechanism described in PEP-293.
*
* @param errors name of the error policy (or null meaning "strict")
* @param encoding name of encoding that encountered the error
* @param toEncode unicode string being encoded
* @param start index of first char it couldn't encode
* @param end index+1 of last char it couldn't encode (usually becomes the resume point)
* @param reason contribution to error message if any
* @return must be a tuple (replacement_unicode, resume_index)
*/
public static PyObject encoding_error(String errors, String encoding, String toEncode,
int start, int end, String reason) {
// Retrieve handler registered through register_error(). null is equivalent to "strict".
PyObject errorHandler = lookup_error(errors);
// Construct an exception to hand to the error handler
PyException exc = Py.UnicodeEncodeError(encoding, toEncode, start, end, reason);
exc.normalize();
// And invoke the handler.
PyObject replacement = errorHandler.__call__(new PyObject[] {exc.value});
checkErrorHandlerReturn(errors, replacement);
return replacement;
}
/**
* Handler for errors encountered during decoding, adjusting the output buffer contents and
* returning the correct position to resume decoding (if the handler does not simply raise an
* exception).
*
* @param partialDecode output buffer of unicode (as UTF-16) that the codec is building
* @param errors name of the error policy (or null meaning "strict")
* @param encoding name of encoding that encountered the error
* @param toDecode bytes being decoded
* @param start index of first byte it couldn't decode
* @param end index+1 of last byte it couldn't decode (usually becomes the resume point)
* @param reason contribution to error message if any
* @return the resume position: index of next byte to decode
*/
public static int insertReplacementAndGetResume(StringBuilder partialDecode, String errors,
String encoding, String toDecode, int start, int end, String reason) {
// Handle the two special cases "ignore" and "replace" locally
if (errors != null) {
if (errors.equals(IGNORE)) {
// Just skip to the first non-problem byte
return end;
} else if (errors.equals(REPLACE)) {
// Insert *one* Unicode replacement character and skip
partialDecode.appendCodePoint(Py_UNICODE_REPLACEMENT_CHARACTER);
return end;
}
}
// If errors not one of those, invoke the generic mechanism
PyObject replacementSpec = decoding_error(errors, encoding, toDecode, start, end, reason);
// Deliver the replacement unicode text to the output buffer
partialDecode.append(replacementSpec.__getitem__(0).toString());
// Return the index in toDecode at which we should resume
return calcNewPosition(toDecode.length(), replacementSpec);
}
/**
* Invoke a user-defined error-handling mechanism, for errors encountered during decoding, as
* registered through {@link #register_error(String, PyObject)}. The return value is the return
* from the error handler indicating the replacement codec output and the position at
* which to resume decoding. Invokes the mechanism described in PEP-293.
*
* @param errors name of the error policy (or null meaning "strict")
* @param encoding name of encoding that encountered the error
* @param toDecode bytes being decoded
* @param start index of first byte it couldn't decode
* @param end index+1 of last byte it couldn't decode (usually becomes the resume point)
* @param reason contribution to error message if any
* @return must be a tuple (replacement_unicode, resume_index)
*/
public static PyObject decoding_error(String errors, String encoding, String toDecode,
int start, int end, String reason) {
// Retrieve handler registered through register_error(). null is equivalent to "strict".
PyObject errorHandler = lookup_error(errors);
// Construct an exception to hand to the error handler
PyException exc = Py.UnicodeDecodeError(encoding, toDecode, start, end, reason);
exc.normalize();
// And invoke the handler.
PyObject replacementSpec = errorHandler.__call__(new PyObject[] {exc.value});
checkErrorHandlerReturn(errors, replacementSpec);
return replacementSpec;
}
/**
 * Check that the error handler returned a tuple
 * {@code (replacement_unicode, resume_index)}.
*
* @param errors name of the error policy
* @param replacementSpec from error handler
*/
private static void checkErrorHandlerReturn(String errors, PyObject replacementSpec) {
if (!(replacementSpec instanceof PyTuple) || replacementSpec.__len__() != 2
|| !(replacementSpec.__getitem__(0) instanceof PyBaseString)
|| !(replacementSpec.__getitem__(1) instanceof PyInteger)) {
throw new PyException(Py.TypeError, "error_handler " + errors
+ " must return a tuple of (replacement, new position)");
}
}
/**
 * Given the return from some codec error handler (invoked while encoding or decoding), which
 * specifies a resume position, and the length of the input being encoded or decoded, check and
 * interpret the resume position. Negative indexes in the error handler return are interpreted
 * as "from the end". If the result would be out of bounds in the input, an {@code IndexError}
 * exception is raised.
*
* @param size of byte buffer being decoded
* @param errorTuple returned from error handler
* @return absolute resume position.
*/
public static int calcNewPosition(int size, PyObject errorTuple) {
int newPosition = ((PyInteger)errorTuple.__getitem__(1)).getValue();
if (newPosition < 0) {
newPosition = size + newPosition;
}
if (newPosition > size || newPosition < 0) {
throw Py.IndexError(newPosition + " out of bounds of encoded string");
}
return newPosition;
}
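/*
 * Worked example: if the input being decoded is 6 bytes long and the error handler returns
 * (u"?", -1), the resume position is 6 + (-1) == 5, i.e. processing resumes at the last byte.
 */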
public static class CodecState {
private PyList searchPath;
private PyStringMap searchCache;
private PyStringMap errorHandlers;
private String default_encoding = "ascii";
public static final String[] BUILTIN_ERROR_HANDLERS = new String[]{"strict",
IGNORE,
REPLACE,
XMLCHARREFREPLACE,
BACKSLASHREPLACE
};
public CodecState() {
searchPath = new PyList();
searchCache = new PyStringMap();
errorHandlers = new PyStringMap();
for (String builtinErrorHandler : BUILTIN_ERROR_HANDLERS) {
register_error(builtinErrorHandler, Py.newJavaFunc(codecs.class,
builtinErrorHandler + "_errors"));
}
}
public String getDefaultEncoding() {
return default_encoding;
}
public void setDefaultEncoding(String encoding) {
lookup(encoding);
default_encoding = encoding;
}
public void register_error(String name, PyObject error) {
if (!error.isCallable()) {
throw Py.TypeError("argument must be callable");
}
errorHandlers.__setitem__(name.intern(), error);
}
public void register(PyObject search_function) {
if (!search_function.isCallable()) {
throw Py.TypeError("argument must be callable");
}
searchPath.append(search_function);
}
public PyTuple lookup(String encoding) {
PyString v = new PyString(normalizestring(encoding));
PyObject cached = searchCache.__finditem__(v);
if (cached != null) {
return (PyTuple)cached;
}
if (searchPath.__len__() == 0) {
throw new PyException(Py.LookupError,
"no codec search functions registered: can't find encoding '" + encoding + "'");
}
for (PyObject func : searchPath.asIterable()) {
PyObject created = func.__call__(v);
if (created == Py.None) {
continue;
}
if (!(created instanceof PyTuple) || created.__len__() != 4) {
throw Py.TypeError("codec search functions must return 4-tuples");
}
searchCache.__setitem__(v, created);
return (PyTuple)created;
}
throw new PyException(Py.LookupError, "unknown encoding '" + encoding + "'");
}
public PyObject lookup_error(String handlerName) {
if (handlerName == null) {
handlerName = "strict";
}
PyObject handler = errorHandlers.__finditem__(handlerName.intern());
if (handler == null) {
throw new PyException(Py.LookupError,
"unknown error handler name '" + handlerName + "'");
}
return handler;
}
}
}
class StringSubsequenceIterator implements Iterator {
private final String s;
private int current, k, start, stop, step;
StringSubsequenceIterator(String s, int start, int stop, int step) {
// System.out.println("s=" + s.length() + ",start=" + start + ",stop=" + stop);
this.s = s;
k = 0;
current = start;
this.start = start;
this.stop = stop;
this.step = step;
/*
 * This bounds checking is necessary to convert between the use of code units elsewhere and
 * code points here. It would be nice if it were unnecessary!
 */
int count = getCodePointCount(s);
if (start >= count) {
this.stop = -1;
} else if (stop >= count) {
this.stop = count;
}
for (int i = 0; i < start; i++) {
nextCodePoint();
}
}
StringSubsequenceIterator(String s) {
this(s, 0, getCodePointCount(s), 1);
}
private static int getCodePointCount(String s) {
return s.codePointCount(0, s.length());
}
@Override
public boolean hasNext() {
return current < stop;
}
@Override
public Object next() {
int codePoint = nextCodePoint();
current += 1;
for (int j = 1; j < step && hasNext(); j++) {
nextCodePoint();
current += 1;
}
return codePoint;
}
private int nextCodePoint() {
int U;
// System.out.println("k=" + k);
int W1 = s.charAt(k);
if (W1 >= 0xD800 && W1 < 0xDC00) {
int W2 = s.charAt(k + 1);
U = (((W1 & 0x3FF) << 10) | (W2 & 0x3FF)) + 0x10000;
k += 2;
} else {
U = W1;
k += 1;
}
return U;
}
@Override
public void remove() {
throw new UnsupportedOperationException("Not supported on String objects (immutable)");
}
}