org.python.core.PyString Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of jython-slim Show documentation
Jython is an implementation of the high-level, dynamic, object-oriented language Python written in 100% Pure Java, and seamlessly integrated with the Java platform. It thus allows you to run Python on any Java platform.
The newest version!
// Copyright (c) Corporation for National Research Initiatives
package org.python.core;

import java.lang.ref.Reference;
import java.lang.ref.SoftReference;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.python.core.buffer.BaseBuffer;
import org.python.core.buffer.SimpleStringBuffer;
import org.python.core.stringlib.FieldNameIterator;
import org.python.core.stringlib.FloatFormatter;
import org.python.core.stringlib.IntegerFormatter;
import org.python.core.stringlib.InternalFormat;
import org.python.core.stringlib.InternalFormat.Formatter;
import org.python.core.stringlib.InternalFormat.Spec;
import org.python.core.stringlib.MarkupIterator;
import org.python.core.stringlib.TextFormatter;
import org.python.core.util.StringUtil;
import org.python.expose.ExposedMethod;
import org.python.expose.ExposedNew;
import org.python.expose.ExposedType;
import org.python.expose.MethodType;

/**
 * A builtin python string.
 */
@Untraversable
@ExposedType(name = "str", base = PyBaseString.class, doc = BuiltinDocs.str_doc)
public class PyString extends PyBaseString implements BufferProtocol {

    /** The Python type object for {@code str}, obtained from this class. */
    public static final PyType TYPE = PyType.fromClass(PyString.class);
    /** The wrapped Java String, as returned by {@link #getString()}. */
    protected String string; // cannot make final because of Python intern support
    /** True once {@link #string} is known to be an interned String (see internedString()). */
    protected transient boolean interned = false;
    /**
     * Supports the buffer API, see {@link #getBuffer(int)}.
     * NOTE(review): this is a raw type. getBuffer stores a SoftReference to a BaseBuffer here, so
     * the declaration was presumably Reference&lt;BaseBuffer&gt; -- confirm and restore the type
     * argument.
     */
    private Reference export;

    /**
     * Return the Java String wrapped by this object.
     *
     * @return the value of the {@code string} field
     */
    public String getString() {
        return string;
    }

    // for PyJavaClass.init()
    /** Construct an empty string of type {@link #TYPE}; no range check is needed or made. */
    public PyString() {
        this(TYPE, "", true);
    }

    /**
     * Base constructor on which the others rely. The characters are range-checked unless the
     * caller vouches for them through {@code isBytes}.
     *
     * @param subType the actual type being constructed
     * @param string a Java String to be wrapped (not null)
     * @param isBytes true if the caller guarantees every character code is below 256
     * @throws IllegalArgumentException if {@code string} is null, or if {@code isBytes} is false
     *             and some character does not fit in 8 bits
     */
    protected PyString(PyType subType, String string, boolean isBytes) {
        super(subType);
        if (string == null) {
            throw new IllegalArgumentException("Cannot create PyString from null");
        }
        // Only inspect the characters when the caller has not guaranteed them.
        if (!(isBytes || charsFitWidth(string, 8))) {
            throw new IllegalArgumentException(nonByteStringMsg(string));
        }
        this.string = string;
    }

    /**
     * Build the message reported when a string that should hold only byte values contains a
     * wider character.
     *
     * @param s the offending string
     * @return the message, showing at most 500 characters of the escaped, quoted string
     */
    private static String nonByteStringMsg(String s) {
        final String escaped = encode_UnicodeEscape(s, true);
        return String.format("Cannot create PyString with non-byte value: %.500s", escaped);
    }

    /**
     * Fundamental constructor for PyString objects when the client provides a Java
     * String, necessitating that we range check the characters.
     *
     * @param subType the actual type being constructed
     * @param string a Java String to be wrapped
     * @throws IllegalArgumentException if {@code string} is null or contains a character that
     *             does not fit in 8 bits (raised by the constructor this delegates to)
     */
    public PyString(PyType subType, String string) {
        this(subType, string, false);
    }

    /**
     * Wrap a Java String as an object of type {@link #TYPE}, range-checking its characters.
     *
     * @param string a Java String to be wrapped
     * @throws IllegalArgumentException if {@code string} is null or has a character above 255
     */
    public PyString(String string) {
        this(TYPE, string);
    }

    /**
     * Construct a one-character string of type {@link #TYPE}. The range check is skipped only
     * when {@code c < 256}; otherwise the delegate constructor performs it and rejects the value.
     *
     * @param c the single character
     * @throws IllegalArgumentException if {@code c} is 256 or more
     */
    public PyString(char c) {
        this(TYPE, String.valueOf(c), c < 256);
    }

    /**
     * Construct from the current content of a {@code StringBuilder}, with the usual range check.
     *
     * @param buffer source of the characters
     */
    PyString(StringBuilder buffer) {
        this(TYPE, buffer.toString());
    }

    /**
     * Construct from the {@code toString()} of a {@code PyBuffer}, without a range check.
     * NOTE(review): presumably PyBuffer.toString() yields one character per byte, which is what
     * justifies passing {@code isBytes = true} -- confirm.
     *
     * @param buffer source of the bytes
     */
    PyString(PyBuffer buffer) {
        this(TYPE, buffer.toString(), true);
    }

    /**
     * Local-use constructor in which the client is allowed to guarantee that the
     * String argument contains only characters in the byte range. We do not then
     * range-check the characters. The object created has type {@link #TYPE}.
     *
     * @param string a Java String to be wrapped (not null)
     * @param isBytes true if the client guarantees we are dealing with bytes; if false, the
     *            characters are range-checked as usual
     */
    PyString(String string, boolean isBytes) {
        this(TYPE, string, isBytes);
    }

    /**
     * Test whether every character of a Java {@code String} has a code in the range 0 to
     * 2^width-1. With {@code width} 8 this is the "byte-like" test, and with 7 the ASCII test.
     *
     * @param s string to test
     * @param width number of bits within which each character must fit (<16)
     * @return true if and only if every character has a code less than 2^width
     */
    static boolean charsFitWidth(String s, int width) {

        final int length = s.length();
        if (length == 0) {
            // Nothing to violate the limit.
            return true;
        }

        // One more than the largest acceptable character code.
        final int limit = 1 << width;
        // Length of the leading part that divides into whole blocks of 8 (may equal length).
        final int blockEnd = length & ~7;

        int pos = 0;
        while (pos < blockEnd) {
            // Bitwise-or 8 character codes together so that one comparison tests them all.
            int merged = s.charAt(pos) | s.charAt(pos + 1) | s.charAt(pos + 2)
                    | s.charAt(pos + 3) | s.charAt(pos + 4) | s.charAt(pos + 5)
                    | s.charAt(pos + 6) | s.charAt(pos + 7);
            if (merged >= limit) {
                return false;
            }
            pos += 8;
        }

        // Fold in the remaining characters, fewer than 8 of them, and test once.
        int tail = 0;
        while (pos < length) {
            tail |= s.charAt(pos++);
        }
        return tail < limit;
    }

    /**
     * Wrap an already interned {@code String} representing bytes as a {@code PyString}, marking
     * the result so that it will not be interned again where an interned String is required.
     * The caller guarantees that all character codes are < 256 (this is only verified by an
     * assertion), as is the case for compiled code and identifiers.
     *
     * @param interned {@code String} representing bytes
     * @return {@code PyString} for those bytes
     */
    public static PyString fromInterned(String interned) {
        assert charsFitWidth(interned, 8);
        final PyString result = new PyString(TYPE, interned, true);
        // Record that the wrapped String needs no further interning.
        result.interned = true;
        return result;
    }

    /**
     * Determine whether the string consists entirely of basic-plane characters. For a
     * {@link PyString}, of course, it is always true, but this is useful in cases
     * where either a PyString or a {@link PyUnicode} is acceptable. This implementation
     * returns the constant without examining the content.
     *
     * @return true
     */
    public boolean isBasicPlane() {
        return true;
    }

    /**
     * Implementation of {@code str.__new__}: accepts one optional argument {@code object} and
     * builds a string from its {@code __str__()} result, or an empty string when it is absent.
     *
     * @param new_ wrapper whose {@code for_type} is compared with {@code subtype}
     * @param init not used here
     * @param subtype the type of object requested
     * @param args positional (then keyword) argument values
     * @param keywords names of the keyword arguments
     * @return a {@code PyString} if {@code subtype} is the wrapper's own type, otherwise a
     *         {@code PyStringDerived} of the given subtype
     */
    @ExposedNew
    static PyObject str_new(PyNewWrapper new_, boolean init, PyType subtype, PyObject[] args,
            String[] keywords) {
        ArgParser parser = new ArgParser("str", args, keywords, new String[] {"object"}, 0);
        PyObject arg = parser.getPyObject(0, null);

        // The text of the new object, in str/bytes form (empty when no argument was given).
        String text = "";
        if (arg != null) {
            // Ask the object for its representation: the answer may be str or unicode.
            PyObject shown = arg.__str__();
            if (shown instanceof PyUnicode) {
                // Encoding will raise UnicodeEncodeError if not 7-bit clean.
                text = codecs.encode((PyUnicode) shown, null, null);
            } else {
                // Must be str/bytes, and should be 8-bit clean already.
                text = shown.toString();
            }
        }

        if (new_.for_type != subtype) {
            return new PyStringDerived(subtype, text);
        }
        return new PyString(text);
    }

    /**
     * Return the content as an array of integers, one per character of the wrapped String, each
     * holding that character's code.
     *
     * @return a new array of the same length as the string
     */
    public int[] toCodePoints() {
        // Fetch the String once rather than on every pass through the loop.
        final String text = getString();
        final int n = text.length();
        int[] codePoints = new int[n];
        for (int i = 0; i < n; i++) {
            codePoints[i] = text.charAt(i);
        }
        return codePoints;
    }

    /**
     * Return a read-only buffer view of the contents of the string, treating it as a sequence of
     * unsigned bytes. The caller specifies its requirements and navigational capabilities in the
     * flags argument (see the constants in interface {@link PyBUF} for an
     * explanation). The method may return the same PyBuffer object to more than one consumer.
     *
     * @param flags consumer requirements
     * @return the requested buffer
     */
    @Override
    public synchronized PyBuffer getBuffer(int flags) {
        // If we have already exported a buffer it may still be available for re-use
        BaseBuffer pybuf = getExistingBuffer(flags);
        if (pybuf == null) {
            /*
             * No existing export we can re-use. Return a buffer, but specialised to defer
             * construction of the buf object, and cache a soft reference to it.
             */
            pybuf = new SimpleStringBuffer(flags, this, getString());
            // Parameterised (not raw) so the referent type is checked by the compiler.
            export = new SoftReference<BaseBuffer>(pybuf);
        }
        return pybuf;
    }

    /**
     * Helper for {@link #getBuffer(int)} that tries to re-use an existing exported buffer, or
     * returns null if can't.
     *
     * @param flags consumer requirements, passed on to the existing buffer
     * @return the buffer obtained from the existing export, or null if nothing was ever exported
     *         or the referent is no longer held
     */
    private BaseBuffer getExistingBuffer(int flags) {
        if (export == null) {
            // No buffer was ever exported.
            return null;
        }
        /*
         * The explicit cast keeps this assignment valid even when the reference is held through
         * the raw Reference type, whose get() returns Object.
         */
        BaseBuffer pybuf = (BaseBuffer) export.get();
        if (pybuf == null) {
            // The reference has been cleared.
            return null;
        }
        /*
         * The buffer still exists. Even in the case where the buffer has been released by all its
         * consumers, it remains safe to re-acquire it because the target String has not changed.
         */
        return pybuf.getBufferAgain(flags);
    }

    /**
     * Return a substring of this object as a Java String. The indices are passed unchanged to
     * {@code String.substring(int, int)} on the value of {@link #getString()}, so Python-style
     * negative indices are not interpreted here.
     *
     * @param start the beginning index, inclusive.
     * @param end the ending index, exclusive.
     * @return the specified substring.
     * @throws IndexOutOfBoundsException as thrown by {@code String.substring} for bad indices
     */
    public String substring(int start, int end) {
        return getString().substring(start, end);
    }

    /** The {@code __str__} special method; delegates to {@link #str___str__()}. */
    @Override
    public PyString __str__() {
        return str___str__();
    }

    /**
     * Exposed {@code str.__str__}. An object whose class is exactly {@code PyString} is returned
     * as it is; an instance of any subclass is copied into a new plain {@code PyString} (without
     * a range check).
     *
     * @return this object or a plain {@code PyString} with the same content
     */
    @ExposedMethod(doc = BuiltinDocs.str___str___doc)
    final PyString str___str__() {
        final boolean exactType = getClass() == PyString.class;
        return exactType ? this : new PyString(getString(), true);
    }

    /**
     * The {@code __unicode__} special method: constructs a {@link PyUnicode} from this object.
     *
     * @return a new {@code PyUnicode}
     */
    @Override
    public PyUnicode __unicode__() {
        return new PyUnicode(this);  // Decodes with default codec.
    }

    /** The {@code __len__} special method; delegates to {@link #str___len__()}. */
    @Override
    public int __len__() {
        return str___len__();
    }

    /**
     * Exposed {@code str.__len__}.
     *
     * @return the length of the String returned by {@link #getString()}
     */
    @ExposedMethod(doc = BuiltinDocs.str___len___doc)
    final int str___len__() {
        return getString().length();
    }

    /**
     * Return the content as a Java String, without quoting or escaping.
     *
     * @return the String returned by {@link #getString()}
     */
    @Override
    public String toString() {
        return getString();
    }

    /**
     * Return the content as an interned Java String. On the first call for an object not already
     * marked as interned, the wrapped String is replaced by its interned equivalent and the
     * object is marked, so that later calls do no further work.
     *
     * @return the (interned) content
     */
    public String internedString() {
        if (!interned) {
            // Swap in the canonical instance and remember that we have done so.
            string = getString().intern();
            interned = true;
        }
        return getString();
    }

    /** The {@code __repr__} special method; delegates to {@link #str___repr__()}. */
    @Override
    public PyString __repr__() {
        return str___repr__();
    }

    /**
     * Exposed {@code str.__repr__}: the content escaped by
     * {@link #encode_UnicodeEscape(String, boolean)} with quotation marks requested, wrapped as a
     * new {@code PyString}.
     *
     * @return the quoted, escaped representation
     */
    @ExposedMethod(doc = BuiltinDocs.str___repr___doc)
    final PyString str___repr__() {
        return new PyString(encode_UnicodeEscape(getString(), true));
    }

    /** Lower-case hexadecimal digits, indexed by value 0 to 15. */
    private static char[] hexdigit = "0123456789abcdef".toCharArray();

    /**
     * Produce an ASCII representation of a string using Python escape sequences, optionally
     * wrapped in quotation marks chosen by Python's rules.
     *
     * @param str the string to encode
     * @param use_quotes true to wrap the result in quotation marks, false for none
     * @return encoded string (the same object as {@code str} if nothing needed to change)
     */
    public static String encode_UnicodeEscape(String str, boolean use_quotes) {
        char quote = use_quotes ? '?' : 0;
        return encode_UnicodeEscape(str, quote);
    }

    /**
     * The inner logic of the string __repr__ producing an ASCII representation of the target
     * string, optionally in quotations. The caller can determine whether the returned string will
     * be wrapped in quotation marks, and whether Python rules are used to choose them through
     * {@code quote}.
     * <p>
     * Backslash and (when quoting) the quote character are preceded by a backslash; tab, newline
     * and carriage return become {@code \t}, {@code \n} and {@code \r}; other characters below
     * space or from 127 to 255 become {@code \xNN}; a valid surrogate pair becomes
     * {@code \UXXXXXXXX}; and any other character from 256 up (including an isolated surrogate)
     * becomes {@code \}{@code uXXXX}.
     *
     * @param str the string to encode
     * @param quote '"' or '\'' use that, '?' = let Python choose, 0 or anything = no quotes
     * @return encoded string (the same object as {@code str} if nothing needed to change)
     */
    static String encode_UnicodeEscape(String str, char quote) {

        // Choose whether to quote and the actual quote character
        boolean use_quotes;
        switch (quote) {
            case '?':
                use_quotes = true;
                // Python rules: use ' unless the text contains a ' and no "
                quote = str.indexOf('\'') >= 0 && str.indexOf('"') == -1 ? '"' : '\'';
                break;
            case '"':
            case '\'':
                use_quotes = true;
                break;
            default:
                use_quotes = false;
                break;
        }

        // Allocate a buffer for the result (25% bigger and room for quotes)
        final int length = str.length();
        StringBuilder v = new StringBuilder(length + (length >> 2) + 2);

        if (use_quotes) {
            v.append(quote);
        }

        // Now chunter through the original string a character at a time
        for (int i = 0; i < length;) {
            int ch = str.charAt(i++);

            // Escape quotes and backslash
            if ((use_quotes && ch == quote) || ch == '\\') {
                v.append('\\');
                v.append((char) ch);
                continue;
            }

            /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
            if (i < length && ch >= 0xD800 && ch < 0xDC00) {
                char ch2 = str.charAt(i);
                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
                    i++; // consume the low surrogate too
                    int ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
                    v.append('\\');
                    v.append('U');
                    for (int shift = 28; shift >= 0; shift -= 4) {
                        v.append(hexdigit[(ucs >> shift) & 0xf]);
                    }
                    continue;
                }
                /* Fall through: an isolated high surrogate is escaped by the 16-bit rule below */
            }

            if (ch >= 256) {
                /* Map 16-bit characters to '\\uxxxx' */
                v.append('\\');
                v.append('u');
                for (int shift = 12; shift >= 0; shift -= 4) {
                    v.append(hexdigit[(ch >> shift) & 0xf]);
                }
            } else if (ch == '\t') {
                /* Map special whitespace to '\t', \n', '\r' */
                v.append("\\t");
            } else if (ch == '\n') {
                v.append("\\n");
            } else if (ch == '\r') {
                v.append("\\r");
            } else if (ch < ' ' || ch >= 127) {
                /* Map non-printable US ASCII to '\xNN' */
                v.append('\\');
                v.append('x');
                v.append(hexdigit[(ch >> 4) & 0xf]);
                v.append(hexdigit[ch & 0xf]);
            } else {
                /* Copy everything else as-is */
                v.append((char) ch);
            }
        }

        if (use_quotes) {
            v.append(quote);
        }

        /*
         * Return the original string if we didn't quote or escape anything. Every quote or escape
         * makes the result longer than the input, so equal length means identical content. (The
         * comparison must be with the input length, not with a counter the loop has consumed.)
         */
        return v.length() > length ? v.toString() : str;
    }

    /**
     * Table of Unicode character names used to decode {@code \N{...}} escapes. It is null until
     * decode_UnicodeEscape first needs it, at which point it is obtained by importing "ucnhash".
     */
    private static ucnhashAPI pucnHash = null;

    /**
     * Decode the Python escape sequences in a range of a string. Characters other than backslash
     * are copied unchanged. Recognised escapes are backslash-newline (dropped), {@code \\},
     * {@code \'}, {@code \"}, {@code \b}, {@code \f}, {@code \t}, {@code \n}, {@code \r},
     * {@code \v}, {@code \a}, up to three octal digits, and {@code \xXX}. When {@code unicode} is
     * true, the four-digit and eight-digit hexadecimal Unicode escapes and {@code \N{name}} are
     * decoded as well; otherwise those three are copied through literally. An unrecognised
     * escape is copied through with its backslash.
     * <p>
     * Malformed escapes are handed to {@code codecs.insertReplacementAndGetResume} together with
     * {@code errors}, and decoding continues from the position it returns. NOTE(review):
     * presumably {@code errors} names a codec error policy ("strict", "ignore", ...) and the
     * strict policy raises -- confirm against {@code codecs}.
     *
     * @param str string containing the text to decode
     * @param start index of the first character to decode
     * @param end index one beyond the last character to decode
     * @param errors passed to the error handler when an escape is malformed
     * @param unicode true to decode the Unicode-only escapes as well
     * @return the decoded text
     */
    public static String decode_UnicodeEscape(String str, int start, int end, String errors,
            boolean unicode) {
        StringBuilder v = new StringBuilder(end - start);
        for (int s = start; s < end;) {
            char ch = str.charAt(s);
            /* Non-escape characters are interpreted as Unicode ordinals */
            if (ch != '\\') {
                v.append(ch);
                s++;
                continue;
            }
            // Remember where this escape began, for error reporting.
            int loopStart = s;
            /* \ - Escapes */
            s++;
            if (s == end) {
                s = codecs.insertReplacementAndGetResume(v, errors, "unicodeescape", //
                        str, loopStart, s + 1, "\\ at end of string");
                continue;
            }
            ch = str.charAt(s++);
            switch (ch) {
                /* Backslash-newline is a line continuation: it contributes nothing. */
                case '\n':
                    break;
                /* Single-character escapes */
                case '\\':
                    v.append('\\');
                    break;
                case '\'':
                    v.append('\'');
                    break;
                case '\"':
                    v.append('\"');
                    break;
                case 'b':
                    v.append('\b');
                    break;
                case 'f':
                    v.append('\014');
                    break; /* FF */
                case 't':
                    v.append('\t');
                    break;
                case 'n':
                    v.append('\n');
                    break;
                case 'r':
                    v.append('\r');
                    break;
                case 'v':
                    v.append('\013');
                    break; /* VT */
                case 'a':
                    v.append('\007');
                    break; /* BEL, not classic C */
                /* \OOO (octal) escapes */
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                    int x = Character.digit(ch, 8);
                    // Accept up to two further octal digits.
                    for (int j = 0; j < 2 && s < end; j++, s++) {
                        ch = str.charAt(s);
                        if (ch < '0' || ch > '7') {
                            break;
                        }
                        x = (x << 3) + Character.digit(ch, 8);
                    }
                    v.append((char) x);
                    break;
                /* \xXX escapes */
                case 'x':
                    s = hexescape(v, errors, 2, s, str, end, "truncated \\xXX");
                    break;
                case 'u':
                    if (!unicode) {
                        v.append('\\');
                        v.append('u');
                        break;
                    }
                    s = hexescape(v, errors, 4, s, str, end, "truncated \\uXXXX");
                    break;
                case 'U':
                    if (!unicode) {
                        v.append('\\');
                        v.append('U');
                        break;
                    }
                    s = hexescape(v, errors, 8, s, str, end, "truncated \\UXXXXXXXX");
                    break;
                case 'N':
                    if (!unicode) {
                        v.append('\\');
                        v.append('N');
                        break;
                    }
                    /*
                     * Ok, we need to deal with Unicode Character Names now, make sure we've
                     * imported the hash table data...
                     */
                    if (pucnHash == null) {
                        PyObject mod = imp.importName("ucnhash", true);
                        mod = mod.__call__();
                        ucnhashAPI loaded = (ucnhashAPI) mod.__tojava__(Object.class);
                        if (loaded.getCchMax() < 0) {
                            throw Py.UnicodeError("Unicode names not loaded");
                        }
                        // Cache the table only once it has passed the check above, so that a
                        // failed load is reported again next time rather than silently used.
                        pucnHash = loaded;
                    }
                    // Guard against \N being the last thing in the range before looking for '{'.
                    if (s < end && str.charAt(s) == '{') {
                        int startName = s + 1;
                        int endBrace = startName;
                        /*
                         * look for either the closing brace, or we exceed the maximum length of the
                         * unicode character names
                         */
                        int maxLen = pucnHash.getCchMax();
                        while (endBrace < end && str.charAt(endBrace) != '}'
                                && (endBrace - startName) <= maxLen) {
                            endBrace++;
                        }
                        if (endBrace != end && str.charAt(endBrace) == '}') {
                            int value = pucnHash.getValue(str, startName, endBrace);
                            if (storeUnicodeCharacter(value, v)) {
                                s = endBrace + 1;
                            } else {
                                s = codecs.insertReplacementAndGetResume( //
                                        v, errors, "unicodeescape", //
                                        str, loopStart, endBrace + 1, "illegal Unicode character");
                            }
                        } else {
                            s = codecs.insertReplacementAndGetResume(v, errors, "unicodeescape", //
                                    str, loopStart, endBrace, "malformed \\N character escape");
                        }
                    } else {
                        s = codecs.insertReplacementAndGetResume(v, errors, "unicodeescape", //
                                str, loopStart, s + 1, "malformed \\N character escape");
                    }
                    break;
                default:
                    /* Not an escape we recognise: keep the backslash and the character. */
                    v.append('\\');
                    v.append(str.charAt(s - 1));
                    break;
            }
        }
        return v.toString();
    }

    /**
     * Decode the hexadecimal digits of a {@code \x}, four-digit or eight-digit Unicode escape and
     * append the character they denote. The two characters before {@code hexDigitStart} are taken
     * to be the backslash and the escape letter when reporting an error.
     *
     * @param partialDecode receives the decoded character (or whatever the error handler inserts)
     * @param errors passed to {@code codecs.insertReplacementAndGetResume} on error
     * @param digits number of hexadecimal digits required
     * @param hexDigitStart index in {@code str} of the first digit
     * @param str the text being decoded
     * @param size index one beyond the last character available
     * @param errorMessage message for a truncated or malformed escape
     * @return index from which decoding should resume
     */
    private static int hexescape(StringBuilder partialDecode, String errors, int digits,
            int hexDigitStart, String str, int size, String errorMessage) {
        if (hexDigitStart + digits > size) {
            // Not enough characters left for the required number of digits.
            return codecs.insertReplacementAndGetResume(partialDecode, errors, "unicodeescape", str,
                    hexDigitStart - 2, size, errorMessage);
        }
        int i = 0;
        int x = 0;
        for (; i < digits; ++i) {
            char c = str.charAt(hexDigitStart + i);
            /*
             * Character.digit also recognises non-ASCII decimal digits and fullwidth letters.
             * Only the ASCII hexadecimal digits are valid in an escape, so reject the rest here.
             */
            int d = c < 128 ? Character.digit(c, 16) : -1;
            if (d == -1) {
                return codecs.insertReplacementAndGetResume(partialDecode, errors, "unicodeescape",
                        str, hexDigitStart - 2, hexDigitStart + i + 1, errorMessage);
            }
            // Shift in the next four bits.
            x = (x << 4) | d;
        }
        if (storeUnicodeCharacter(x, partialDecode)) {
            return hexDigitStart + i;
        } else {
            return codecs.insertReplacementAndGetResume(partialDecode, errors, "unicodeescape", str,
                    hexDigitStart - 2, hexDigitStart + i + 1, "illegal Unicode character");
        }
    }

    /**
     * Append a code point to the partial result if it is acceptable: not negative, not in the
     * surrogate range 0xD800 to 0xDFFF, and not above {@code PySystemState.maxunicode}. The
     * value is an int since this can be a UCS-4 character.
     *
     * @param value the code point
     * @param partialDecode receives the character(s) for the code point
     * @return true if the code point was appended, false if it was rejected
     */
    private static boolean storeUnicodeCharacter(int value, StringBuilder partialDecode) {
        final boolean surrogate = value >= 0xD800 && value <= 0xDFFF;
        if (value < 0 || surrogate || value > PySystemState.maxunicode) {
            return false;
        }
        partialDecode.appendCodePoint(value);
        return true;
    }

    /**
     * Exposed {@code str.__getitem__}: looks the index up with {@code seq___finditem__} and
     * raises {@code IndexError} when that finds nothing.
     *
     * @param index the index (or other key accepted by {@code seq___finditem__})
     * @return the item found
     */
    @ExposedMethod(doc = BuiltinDocs.str___getitem___doc)
    final PyObject str___getitem__(PyObject index) {
        final PyObject item = seq___finditem__(index);
        if (item != null) {
            return item;
        }
        throw Py.IndexError("string index out of range");
    }

    // XXX: need doc
    /**
     * Exposed {@code str.__getslice__}; passes its arguments unchanged to
     * {@code seq___getslice__} and returns the result. The annotation supplies {@code null} as a
     * default. NOTE(review): presumably that default applies to the trailing {@code step}
     * argument -- confirm against the exposer.
     */
    @ExposedMethod(defaults = "null")
    final PyObject str___getslice__(PyObject start, PyObject stop, PyObject step) {
        return seq___getslice__(start, stop, step);
    }

    /** The {@code __cmp__} special method; delegates to {@link #str___cmp__(PyObject)}. */
    @Override
    public int __cmp__(PyObject other) {
        return str___cmp__(other);
    }

    /**
     * Exposed {@code str.__cmp__}: three-way comparison of the wrapped Strings using
     * {@code String.compareTo}.
     *
     * @param other object to compare with
     * @return -1, 0 or 1 according to the ordering, or -2 if {@code other} is not a
     *         {@code PyString}
     */
    @ExposedMethod(type = MethodType.CMP)
    final int str___cmp__(PyObject other) {
        if (other instanceof PyString) {
            final String theirs = ((PyString) other).getString();
            // Reduce whatever compareTo returns to -1, 0 or 1.
            return Integer.signum(getString().compareTo(theirs));
        }
        return -2;
    }

    /** The {@code __eq__} special method; delegates to {@link #str___eq__(PyObject)}. */
    @Override
    public PyObject __eq__(PyObject other) {
        return str___eq__(other);
    }

    /**
     * Exposed {@code str.__eq__}.
     *
     * @param other object to compare with
     * @return {@code Py.True} or {@code Py.False} according to equality of content, or null if
     *         {@code other} cannot be interpreted as bytes by {@code coerce}
     */
    @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.str___eq___doc)
    final PyObject str___eq__(PyObject other) {
        final String theirs = coerce(other);
        if (theirs != null) {
            return getString().equals(theirs) ? Py.True : Py.False;
        }
        return null;
    }

    /** The {@code __ne__} special method; delegates to {@link #str___ne__(PyObject)}. */
    @Override
    public PyObject __ne__(PyObject other) {
        return str___ne__(other);
    }

    /**
     * Exposed {@code str.__ne__}.
     *
     * @param other object to compare with
     * @return {@code Py.True} if the contents differ, {@code Py.False} if they are equal, or
     *         null if {@code other} cannot be interpreted as bytes by {@code coerce}
     */
    @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.str___ne___doc)
    final PyObject str___ne__(PyObject other) {
        final String theirs = coerce(other);
        if (theirs != null) {
            return !getString().equals(theirs) ? Py.True : Py.False;
        }
        return null;
    }

    /** The {@code __lt__} special method; delegates to {@link #str___lt__(PyObject)}. */
    @Override
    public PyObject __lt__(PyObject other) {
        return str___lt__(other);
    }

    /**
     * Exposed {@code str.__lt__}, ordering by {@code String.compareTo}.
     *
     * @param other object to compare with
     * @return {@code Py.True} if this sorts strictly before {@code other}, else {@code Py.False},
     *         or null if {@code other} cannot be interpreted as bytes by {@code coerce}
     */
    @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.str___lt___doc)
    final PyObject str___lt__(PyObject other) {
        final String theirs = coerce(other);
        if (theirs != null) {
            return getString().compareTo(theirs) < 0 ? Py.True : Py.False;
        }
        return null;
    }

    /** The {@code __le__} special method; delegates to {@link #str___le__(PyObject)}. */
    @Override
    public PyObject __le__(PyObject other) {
        return str___le__(other);
    }

    /**
     * Exposed {@code str.__le__}, ordering by {@code String.compareTo}.
     *
     * @param other object to compare with
     * @return {@code Py.True} if this sorts before or equal to {@code other}, else
     *         {@code Py.False}, or null if {@code other} cannot be interpreted as bytes by
     *         {@code coerce}
     */
    @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.str___le___doc)
    final PyObject str___le__(PyObject other) {
        final String theirs = coerce(other);
        if (theirs != null) {
            return getString().compareTo(theirs) <= 0 ? Py.True : Py.False;
        }
        return null;
    }

    /** The {@code __gt__} special method; delegates to {@link #str___gt__(PyObject)}. */
    @Override
    public PyObject __gt__(PyObject other) {
        return str___gt__(other);
    }

    /**
     * Exposed {@code str.__gt__}, ordering by {@code String.compareTo}.
     *
     * @param other object to compare with
     * @return {@code Py.True} if this sorts strictly after {@code other}, else {@code Py.False},
     *         or null if {@code other} cannot be interpreted as bytes by {@code coerce}
     */
    @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.str___gt___doc)
    final PyObject str___gt__(PyObject other) {
        final String theirs = coerce(other);
        if (theirs != null) {
            return getString().compareTo(theirs) > 0 ? Py.True : Py.False;
        }
        return null;
    }

    /** The {@code __ge__} special method; delegates to {@link #str___ge__(PyObject)}. */
    @Override
    public PyObject __ge__(PyObject other) {
        return str___ge__(other);
    }

    /**
     * Exposed {@code str.__ge__}, ordering by {@code String.compareTo}.
     *
     * @param other object to compare with
     * @return {@code Py.True} if this sorts after or equal to {@code other}, else
     *         {@code Py.False}, or null if {@code other} cannot be interpreted as bytes by
     *         {@code coerce}
     */
    @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.str___ge___doc)
    final PyObject str___ge__(PyObject other) {
        final String theirs = coerce(other);
        if (theirs != null) {
            return getString().compareTo(theirs) >= 0 ? Py.True : Py.False;
        }
        return null;
    }

    /**
     * Interpret the object as a Java String representing bytes, if it is a {@code PyString} that
     * is not also a {@code PyUnicode}.
     *
     * @param o object to interpret
     * @return the object's {@code toString()}, or null if it is not such an object
     */
    private static String coerce(PyObject o) {
        final boolean byteString = o instanceof PyString && !(o instanceof PyUnicode);
        return byteString ? o.toString() : null;
    }

    /** Java hash code; delegates to {@link #str___hash__()}. */
    @Override
    public int hashCode() {
        return str___hash__();
    }

    /**
     * Exposed {@code str.__hash__}.
     *
     * @return the {@code hashCode()} of the String returned by {@link #getString()}
     */
    @ExposedMethod(doc = BuiltinDocs.str___hash___doc)
    final int str___hash__() {
        return getString().hashCode();
    }

    /**
     * Convert the content to bytes by passing the wrapped String to {@code StringUtil.toBytes}.
     *
     * @return a byte array with one byte for each char in this object's underlying String. Each
     *         byte contains the low-order bits of its corresponding char.
     */
    public byte[] toBytes() {
        return StringUtil.toBytes(getString());
    }

    @Override
    public Object __tojava__(Class c) {
        if (c.isAssignableFrom(String.class)) {
            /*
             * If c is a CharSequence we assume the caller is prepared to get maybe not an actual
             * String. In that case we avoid conversion so the caller can do special stuff with the
             * returned PyString or PyUnicode or whatever. (If c is Object.class, the caller usually
             * expects to get actually a String)
             */
            return c == CharSequence.class ? this : getString();
        }

        if (c == Character.TYPE || c == Character.class) {
            if (getString().length() == 1) {
                return getString().charAt(0);
            }
        }

        if (c.isArray()) {
            if (c.getComponentType() == Byte.TYPE) {
                return toBytes();
            }
            if (c.getComponentType() == Character.TYPE) {
                return getString().toCharArray();
            }
        }

        if (c.isAssignableFrom(Collection.class)) {
            List