All downloads are free. The search and download features use the official Maven repository.

org.python.core.PyUnicode Maven / Gradle / Ivy

Go to download

Jython is an implementation of the high-level, dynamic, object-oriented language Python written in 100% Pure Java, and seamlessly integrated with the Java platform. It thus allows you to run Python on any Java platform.

There is a newer version: 2.7.4
Show newest version
package org.python.core;

import java.io.Serializable;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.python.core.stringlib.FieldNameIterator;
import org.python.core.stringlib.MarkupIterator;
import org.python.expose.ExposedMethod;
import org.python.expose.ExposedNew;
import org.python.expose.ExposedType;
import org.python.expose.MethodType;
import org.python.modules._codecs;
import org.python.util.Generic;

import com.google.common.base.CharMatcher;

/**
 * a builtin python unicode string.
 */
@Untraversable
@ExposedType(name = "unicode", base = PyBaseString.class, doc = BuiltinDocs.unicode_doc)
public class PyUnicode extends PyString implements Iterable {

    /**
     * Nearly every significant method comes in two versions: one applicable when the string
     * contains only basic plane characters, and one that is correct when supplementary characters
     * are also present. Set this constant true to treat all strings as containing
     * supplementary characters, so that these versions will be exercised in tests.
     */
    private static final boolean DEBUG_NON_BMP_METHODS = false;

    public static final PyType TYPE = PyType.fromClass(PyUnicode.class);

    /**
     * Construct an empty {@code PyUnicode} of the built-in type. The empty string is declared
     * basic-plane to the fundamental constructor, so no scan for supplementary characters is made.
     */
    // for PyJavaClass.init()
    public PyUnicode() {
        this(TYPE, "", true);
    }

    /**
     * Construct a PyUnicode interpreting the Java String argument as UTF-16. Nothing is assumed
     * about the content: the fundamental constructor is told the string is not known to be
     * basic-plane, so it chooses the index translator by examining the string.
     *
     * @param string UTF-16 string encoding the characters (as Java).
     */
    public PyUnicode(String string) {
        this(TYPE, string, false);
    }

    /**
     * Construct a PyUnicode interpreting the Java String argument as UTF-16. If it is known that
     * the string contains no supplementary characters, argument isBasic may be set true by the
     * caller. If it is false, the PyUnicode will scan the string to find out. The object created
     * is of the built-in type {@link #TYPE}.
     *
     * @param string UTF-16 string encoding the characters (as Java).
     * @param isBasic true if it is known that only BMP characters are present.
     */
    public PyUnicode(String string, boolean isBasic) {
        this(TYPE, string, isBasic);
    }

    /**
     * Construct an instance of the given type, interpreting the Java String argument as UTF-16.
     * The string is not assumed to be basic-plane.
     *
     * @param subtype actual type to create.
     * @param string UTF-16 string encoding the characters (as Java).
     */
    public PyUnicode(PyType subtype, String string) {
        this(subtype, string, false);
    }

    /**
     * Construct a PyUnicode of the built-in type from a {@code PyString}, by delegation to
     * {@link #PyUnicode(PyType, PyString)}.
     *
     * @param pystring source of the characters.
     */
    public PyUnicode(PyString pystring) {
        this(TYPE, pystring);
    }

    /**
     * Construct an instance of the given type from a {@code PyString}. If the argument is itself a
     * {@code PyUnicode} its implementation string is used directly; otherwise the argument is
     * decoded (with the default arguments of {@code decode()}) and the result used.
     * <p>
     * The basic-plane property reported by the argument is trusted only when the argument is a
     * {@code PyUnicode}. For any other {@code PyString} that property describes the object before
     * decoding, and the decoded text could contain supplementary characters, so in that case the
     * new string is scanned.
     *
     * @param subtype actual type to create.
     * @param pystring source of the characters.
     */
    public PyUnicode(PyType subtype, PyString pystring) {
        this(subtype, //
                pystring instanceof PyUnicode ? pystring.string : pystring.decode().toString(), //
                pystring instanceof PyUnicode && pystring.isBasicPlane());
    }

    /**
     * Construct a PyUnicode of the built-in type containing the single UTF-16 code unit given.
     * The string is declared basic-plane to the fundamental constructor without inspection.
     * NOTE(review): a surrogate code unit passed here is therefore not detected -- confirm that
     * callers never do so.
     *
     * @param c the character.
     */
    public PyUnicode(char c) {
        this(TYPE, String.valueOf(c), true);
    }

    /**
     * Construct a PyUnicode of the built-in type containing the single character with the given
     * code point, which is validated by {@link #checkedCPString(int)}.
     *
     * @param codepoint of the character.
     * @throws PyException {@code ValueError} if not a valid Unicode code point.
     */
    public PyUnicode(int codepoint) {
        this(TYPE, checkedCPString(codepoint));
    }

    /**
     * Construct a PyUnicode of the built-in type from an array of code points, which are
     * validated by {@link #checkedCPString(int[])}.
     *
     * @param codepoints of the characters, in order.
     * @throws PyException {@code ValueError} if any element is not a valid Unicode code point.
     */
    public PyUnicode(int[] codepoints) {
        this(TYPE, checkedCPString(codepoints));
    }

    /**
     * Construct a PyUnicode of the built-in type from the current content of a buffer, treated as
     * UTF-16 and not assumed to be basic-plane.
     *
     * @param buffer holding the UTF-16 code units.
     */
    PyUnicode(StringBuilder buffer) {
        this(TYPE, buffer.toString());
    }

    /**
     * Produce the Java String (one or two UTF-16 code units) that encodes a single code point,
     * having first verified that the code point is valid. Checking here means an invalid value is
     * reported as a Python exception rather than as a Java one.
     *
     * @param codePoint to translate
     * @return String from codepoint
     * @throws PyException(ValueError) if not a valid Unicode codepoint.
     */
    private static String checkedCPString(int codePoint) throws PyException {
        if (!Character.isValidCodePoint(codePoint)) {
            String msg = String.format("character U+%08x is not in Unicode range", codePoint);
            throw Py.ValueError(msg);
        }
        char[] units = Character.toChars(codePoint);
        return new String(units);
    }

    /**
     * Produce the Java String that encodes a sequence of code points, verifying that every one of
     * them is valid. An invalid element is reported as a Python exception rather than as a Java
     * one.
     *
     * @param codePoints to translate
     * @return String from codepoint
     * @throws PyException(ValueError) if any element is not a valid Unicode codepoint.
     */
    private static String checkedCPString(int[] codePoints) throws PyException {
        final int count = codePoints.length;
        try {
            // The fast path: Java validates and converts the whole array at once.
            return new String(codePoints, 0, count);
        } catch (IllegalArgumentException invalid) {
            /*
             * Java rejected some element. Go through them singly, since the single code point
             * version raises the error with the more helpful message.
             */
            for (int i = 0; i < count; i++) {
                checkedCPString(codePoints[i]);
            }
        }
        // Only reached if the rescan found nothing to complain of (not expected in practice).
        return "";
    }

    /**
     * Accumulate the UTF-16 representation of a sequence of code points, validating each one with
     * {@link #checkedCPString(int)}.
     *
     * @param iter supplying the code points in order
     * @return buffer containing the equivalent UTF-16 code units
     * @throws PyException {@code ValueError} if any element is not a valid Unicode code point.
     */
    private static StringBuilder fromCodePoints(Iterator<Integer> iter) {
        StringBuilder buffer = new StringBuilder();
        while (iter.hasNext()) {
            // Unboxes to int, selecting the single code point version of checkedCPString.
            buffer.append(checkedCPString(iter.next()));
        }
        return buffer;
    }

    /**
     * Construct a PyUnicode of the built-in type from the code points supplied by an iterator.
     *
     * @param iter supplying the code points in order
     * @throws PyException {@code ValueError} if any element is not a valid Unicode code point.
     */
    public PyUnicode(Iterator<Integer> iter) {
        this(fromCodePoints(iter));
    }

    /**
     * Construct a PyUnicode of the built-in type from a collection of code points, taken in the
     * iteration order of the collection.
     *
     * @param ucs4 the code points
     * @throws PyException {@code ValueError} if any element is not a valid Unicode code point.
     */
    public PyUnicode(Collection<Integer> ucs4) {
        this(ucs4.iterator());
    }

    /**
     * Fundamental all-features constructor on which the others depend. If it is known that the
     * string contains no supplementary characters, argument isBasic may be set true by the caller.
     * If it is false, the PyUnicode will scan the string to find out.
     *
     * @param subtype actual type to create.
     * @param string UTF-16 string encoding the characters (as Java).
     * @param isBasic true if it is known that only BMP characters are present.
     */
    private PyUnicode(PyType subtype, String string, boolean isBasic) {
        // The superclass is constructed on an empty string and the real one assigned afterwards.
        super(subtype, "", true);
        this.string = string;
        // chooseIndexTranslator() reads this.string, so it has to follow the assignment above.
        translator = isBasic ? BASIC : this.chooseIndexTranslator();
    }

    /**
     * Return the content of this string as an array of code points, one element for each
     * character as Python counts them (so a surrogate pair contributes one element).
     *
     * @return the code points of this string, in order
     */
    @Override
    public int[] toCodePoints() {
        int n = getCodePointCount();
        int[] codePoints = new int[n];
        int i = 0;
        // The element type is needed so that next() may be unboxed into the int array.
        for (Iterator<Integer> iter = newSubsequenceIterator(); iter.hasNext(); i++) {
            codePoints[i] = iter.next();
        }
        return codePoints;
    }

    /**
     * {@code PyUnicode} implements the interface {@link BufferProtocol} technically by
     * inheritance from {@link PyString}, but does not provide a buffer (in CPython). We therefore
     * arrange that all calls to {@code getBuffer} raise an error.
     *
     * @param flags not used: the call fails whatever is requested
     * @return always throws a {@code ClassCastException}
     * @throws ClassCastException always
     */
    @Override
    public synchronized PyBuffer getBuffer(int flags) throws ClassCastException {
        throw new ClassCastException("'unicode' does not support the buffer protocol");
    }

    // ------------------------------------------------------------------------------------------
    // Index translation for Unicode beyond the BMP
    // ------------------------------------------------------------------------------------------

    /**
     * Index translation between code point index (as seen by Python) and UTF-16 index (as used in
     * the Java String).
     */
    private interface IndexTranslator extends Serializable {

        /** Number of supplementary characters (hence the length in code points may be found). */
        public int suppCount();

        /** Translate a UTF-16 code unit index to its equivalent code point index. */
        public int codePointIndex(int utf16Index);

        /** Translate a code point index to its equivalent UTF-16 code unit index. */
        public int utf16Index(int codePointIndex);
    }

    /**
     * The instance of index translation in use in this string. It will be set to either
     * {@link #BASIC} or an instance of {@link PyUnicode.Supplementary}.
     */
    private final IndexTranslator translator;

    /**
     * A singleton provides the translation service (which is a pass-through) for all BMP strings.
     * With no supplementary characters present, a code point index and the corresponding UTF-16
     * index are the same number.
     */
    static final IndexTranslator BASIC = new IndexTranslator() {

        /** There are no supplementary characters in a BMP string. */
        @Override
        public int suppCount() {
            return 0;
        }

        /** The code point index equals the UTF-16 index. */
        @Override
        public int codePointIndex(int u) {
            return u;
        }

        /** The UTF-16 index equals the code point index. */
        @Override
        public int utf16Index(int i) {
            return i;
        }
    };

    /**
     * A class of index translation that uses the cumulative count so far of supplementary
     * characters, tabulated in blocks of a standard size. The count is then used as an offset
     * between the code point index and the corresponding point in the UTF-16 representation.
     * <p>
     * In what follows, <code>count[k]</code> is the number of supplementary characters to the left
     * of code point index <code>(k+1)*M</code>, so that the UTF-16 index of code point
     * <code>k*M</code> is <code>k*M + count[k-1]</code> (or zero when <code>k==0</code>).
     */
    private final class Supplementary implements IndexTranslator {

        /** Tabulates cumulative count so far of supplementary characters, by blocks of size M. */
        final int[] count;

        /** Configure the block size M, as this power of 2. */
        static final int LOG2M = 4;
        /** The block size used for indexing (power of 2). */
        static final int M = 1 << LOG2M;
        /** A mask used to separate the block number and offset in the block. */
        static final int MASK = M - 1;

        /**
         * The constructor works on a count array prepared by
         * {@link PyUnicode#getSupplementaryCounts(String)}.
         */
        Supplementary(int[] count) {
            this.count = count;
        }

        /**
         * Translate a UTF-16 code unit index to its equivalent code point index, by a binary
         * search of the count array for the block concerned, followed by a sequential scan of
         * the string within that block.
         *
         * @param u UTF-16 index
         * @return the code point index j for which <code>utf16Index(j) == u</code>
         */
        @Override
        public int codePointIndex(int u) {
            /*
             * Let the desired result be j such that utf16Index(j) = u. As we have only a forward
             * index of the string, we have to conduct a search. In principle, we bound j by a pair
             * of values (j1,j2) such that j1<=j<j2, and narrow that range until j is determined.
             * In practice we work with block numbers (k1,k2), such that k1*M<=j<k2*M. Since j<=u,
             * we certainly have j<k2*M when:
             */
            int k2 = (u >> LOG2M) + 1;
            // The count of supplementary characters before the start of block k2 is:
            int c2 = count[k2 - 1];
            /*
             * Since the count array is non-decreasing, and j < k2*M, we have u-j <= count[k2-1].
             * That is, j >= k1*M, where:
             */
            int k1 = Math.max(0, u - c2) >> LOG2M;
            // The count of supplementary characters before the start of block k1 is:
            int c1 = (k1 == 0) ? 0 : count[k1 - 1];

            /*
             * Now, j (to be found) is in an unknown block k, where k1<=k<k2. We make a binary
             * search, maintaining k1*M+c1 <= u < k2*M+c2, until only one block is a candidate.
             */
            while (true) {
                if (c2 == c1) {
                    // No supplementary characters lie between k1*M and k2*M, so the offset is c1.
                    return u - c1;
                }
                // Choose a candidate block boundary in the middle of the range.
                int k = (k1 + k2) / 2;
                if (k == k1) {
                    // We must have k2=k1+1, so j is in block k1.
                    break;
                } else {
                    // c is the count of supplementary characters to the left of code point k*M.
                    int c = count[k - 1];
                    if ((k << LOG2M) + c > u) {
                        // k*M+c > u therefore j is not in block k but to its left.
                        k2 = k;
                        c2 = c;
                    } else {
                        // k*M+c <= u therefore j must be in block k, or to its right.
                        k1 = k;
                        c1 = c;
                    }
                }
            }

            /*
             * At this point, j is known to be in block k1 (and k2=k1+1). c1 is the number of
             * supplementary characters to the left of code point index k1*M and c2 is the number of
             * supplementary characters to the left of code point index (k1+1)*M. We have to search
             * this block sequentially. The current position in the UTF-16 is:
             */
            int p = (k1 << LOG2M) + c1;
            while (p < u) {
                if (Character.isHighSurrogate(string.charAt(p++))) {
                    // c1 tracks the number of supplementary characters to the left of p
                    c1 += 1;
                    if (c1 == c2) {
                        // We have found all supplementary characters in the block.
                        break;
                    }
                    // Skip the trailing surrogate.
                    p++;
                }
            }
            // c1 is the number of supplementary characters to the left of u, so the result j is:
            return u - c1;
        }

        /**
         * Translate a code point index to its equivalent UTF-16 code unit index, using the count
         * array to find the offset at the start of the block and, only if the block contains
         * supplementary characters, scanning the string from there.
         *
         * @param i code point index
         * @return the UTF-16 index of that code point in the implementation string
         */
        @Override
        public int utf16Index(int i) {
            // The code point index i lies in the k-th block where:
            int k = i >> LOG2M;
            // The offset for the code point index k*M is exactly
            int d = (k == 0) ? 0 : count[k - 1];
            // The offset for the code point index (k+1)*M is exactly
            int e = count[k];
            if (d == e) {
                /*
                 * The offset for the code point index (k+1)*M is the same, and since this is a
                 * non-decreasing function of k, it is also the value for i.
                 */
                return i + d;
            } else {
                /*
                 * The offset for the code point index (k+1)*M is different (higher). We must scan
                 * along until we have found all the supplementary characters that precede i,
                 * starting the scan at code point index k*M.
                 */
                for (int q = i & ~MASK; q < i; q++) {
                    // q+d is the UTF-16 index of code point q, as d counts those passed so far.
                    if (Character.isHighSurrogate(string.charAt(q + d))) {
                        d += 1;
                        if (d == e) {
                            /*
                             * We have found all the supplementary characters in this block, so we
                             * must have found all those to the left of i.
                             */
                            break;
                        }
                    }
                }

                // d counts all the supplementary characters to the left of i.
                return i + d;
            }
        }

        /**
         * @return the number of supplementary characters in the whole string
         */
        @Override
        public int suppCount() {
            // The last element of the count array is the total number of supplementary characters.
            return count[count.length - 1];
        }
    }

    /**
     * Generate the table that is used by the class {@link Supplementary} to accelerate access to
     * the implementation string. The method returns <code>null</code> if the string passed
     * contains no surrogate pairs, in which case we'll use {@link #BASIC} as the translator. This
     * method is sensitive to {@link #DEBUG_NON_BMP_METHODS} which if true will prevent it returning
     * null, hence we will always use a {@link Supplementary} {@link #translator}.
     * <p>
     * NOTE(review): the part of this method after the allocation of the count array was lost when
     * the source was extracted, and has been rebuilt from the documented meaning of the array and
     * the surviving comments. Confirm against the upstream source.
     *
     * @param string to index
     * @return the index (counts) or null if basic plane
     * @throws PyException {@code ValueError} if a lone surrogate is found in the string.
     */
    private static int[] getSupplementaryCounts(final String string) {

        final int n = string.length();
        int p; // Index of the current UTF-16 code unit.

        /*
         * We scan to the first surrogate code unit, in a simple loop. If we hit the end before we
         * find one, no count array will be necessary and we'll use BASIC. If we find a surrogate it
         * may be half a supplementary character, or a lone surrogate: we'll find out later.
         */
        for (p = 0; p < n; p++) {
            if (Character.isSurrogate(string.charAt(p))) {
                break;
            }
        }

        if (p == n && !DEBUG_NON_BMP_METHODS) {
            // There are no supplementary characters so the 1:1 translator is fine.
            return null;

        } else {
            /*
             * We have to do this properly, using a scheme in which code point indexes are
             * efficiently translatable to UTF-16 indexes through a table called here count[]. In
             * this array, count[k] contains the total number of supplementary characters up to the
             * end of the k.th block, that is, to the left of code point (k+1)M. We have to fill
             * this array by scanning the string.
             */
            int q = p; // The current code point index (q = p-s, s being supplementaries passed).
            int k = q >> Supplementary.LOG2M; // The block number k = q/M.

            /*
             * When addressing with a code point index q<=L (the length in code points) we will
             * index the count array with k = q/M. We have q<=L<=n, therefore q/M <= n/M, and
             * 1 + n/M elements are enough. A q>=L should raise IndexOutOfBoundsException, but it
             * doesn't matter whether that's from indexing this array, or the string later.
             * Elements for blocks wholly before the first surrogate are left at zero.
             */
            int[] count = new int[1 + (n >> Supplementary.LOG2M)];

            /*
             * To get the generation of count[] going efficiently, we need to advance the next whole
             * block. The next loop will complete processing of the block containing the first
             * supplementary character. Note that in all these loops, if we exit because p reaches a
             * limit, the count for the last partial block is known from p-q and we take care of
             * that right at the end of this method. The limit of these loops is n-1, so if we spot
             * a lead surrogate, then we may access the low-surrogate confident that p+1<n.
             */
            while (p < n - 1) {
                // Catch supplementary characters and lone surrogate code units.
                p += calcAdvance(string, p);
                // Advance the code point index and break if we reach a block boundary.
                if ((++q & Supplementary.MASK) == 0) {
                    count[k++] = p - q;
                    break;
                }
            }

            /*
             * If the string is long enough, we can work in whole blocks of M. A block consumes at
             * most 2M code units, so a whole block may be processed, without a check on p at each
             * character, as long as p+2M<n.
             */
            while (p + 2 * Supplementary.M < n) {
                for (int i = 0; i < Supplementary.M; i++) {
                    // Catch supplementary characters and lone surrogate code units.
                    p += calcAdvance(string, p);
                }
                // Advance the code point index one whole block.
                q += Supplementary.M;
                // The number of supplementary characters to the left of code point index q is:
                count[k++] = p - q;
            }

            // Process the remaining UTF-16 code units, except possibly the last.
            while (p < n - 1) {
                // Catch supplementary characters and lone surrogate code units.
                p += calcAdvance(string, p);
                // Advance the code point index and record the count at each block boundary.
                if ((++q & Supplementary.MASK) == 0) {
                    count[k++] = p - q;
                }
            }

            // There is one code unit left if the last thing processed was not a surrogate pair.
            if (p < n) {
                // With nothing to follow it, any surrogate here is an error.
                char c = string.charAt(p++);
                if (Character.isSurrogate(c)) {
                    throw unpairedSurrogate(p - 1, c);
                }
                if ((++q & Supplementary.MASK) == 0) {
                    count[k++] = p - q;
                }
            }

            /*
             * There may still be some elements of count[] we haven't set, so we fill to the end
             * with the total count. This also takes care of an incomplete final block.
             */
            int total = p - q;
            while (k < count.length) {
                count[k++] = total;
            }

            return count;
        }
    }

    /**
     * Called at each code point index, returns 2 if this is a surrogate pair, 1 otherwise, and
     * detects lone surrogates as an error. The return is the amount to advance the UTF-16 index. An
     * exception is raised if at <code>p</code> we find a lead surrogate without a trailing one
     * following, or a trailing surrogate directly. It should not be called on the final code unit,
     * when p==string.length()-1, since it may check the next code unit as well.
     *
     * @param string of UTF-16 code units
     * @param p index into that string
     * @return 2 if a surrogate pair stands at p, 1 if not
     * @throws PyException {@code ValueError} if a lone surrogate stands at p.
     */
    private static int calcAdvance(String string, int p) throws PyException {

        final char unit = string.charAt(p);

        // Anything outside the surrogate range, which includes 0xE000 to 0xFFFF, stands alone.
        if (!Character.isSurrogate(unit)) {
            return 1;
        }

        /*
         * A surrogate is only acceptable as the lead of a pair. Meeting a trailing surrogate first
         * is an error, and so is a lead surrogate whose successor is not a trailing one. (The
         * successor is only examined in the case of a lead surrogate.)
         */
        if (Character.isLowSurrogate(unit) || !Character.isLowSurrogate(string.charAt(p + 1))) {
            throw unpairedSurrogate(p, unit);
        }

        // A well-formed pair: step over both code units.
        return 2;
    }

    /**
     * Build (but do not throw) the {@code ValueError} that reports a surrogate code unit found
     * without its partner.
     *
     * @param p index within the UTF-16 sequence of the problematic code unit
     * @param c the code unit
     * @return an exception
     */
    private static PyException unpairedSurrogate(int p, int c) {
        return Py.ValueError(String.format("unpaired surrogate %#4x at code unit %d", c, p));
    }

    /**
     * Select the {@link IndexTranslator} appropriate to the content of the
     * {@link PyString#string}: the shared {@link #BASIC} translator where possible, otherwise a
     * {@link Supplementary} built on the counts for this string.
     *
     * @return chosen IndexTranslator
     */
    private IndexTranslator chooseIndexTranslator() {
        final int[] counts = getSupplementaryCounts(string);
        // When debugging the non-BMP methods, BASIC is never chosen.
        if (!DEBUG_NON_BMP_METHODS && counts == null) {
            return BASIC;
        }
        return new Supplementary(counts);
    }

    /**
     * {@inheritDoc}
     * 

* In the PyUnicode version, the arguments are code point indices, such as are * received from the Python caller, while the first two elements of the returned array have been * translated to UTF-16 indices in the implementation string. */ @Override protected int[] translateIndices(PyObject start, PyObject end) { int[] indices = super.translateIndices(start, end); indices[0] = translator.utf16Index(indices[0]); indices[1] = translator.utf16Index(indices[1]); // indices[2] and [3] remain Unicode indices (and may be out of bounds) relative to len() return indices; } // ------------------------------------------------------------------------------------------ /** Table used by {@link #from(char)} to intern single byte strings. */ private static final PyUnicode[] unichars = new PyUnicode[128]; static { for (char j = 0; j < 128; j++) { PyUnicode uni = new PyUnicode(TYPE, String.valueOf(j).intern(), true); uni.interned = true; unichars[j] = uni; } } /** * {@inheritDoc} The indices are code point indices, not UTF-16 (char) indices. For * example: * *

     * PyUnicode u = new PyUnicode("..\ud800\udc02\ud800\udc03...");
     * // (Python) u = u'..\U00010002\U00010003...'
     *
     * String s = u.substring(2, 4);  // = "\ud800\udc02\ud800\udc03" (Java)
     * 
*/ @Override public String substring(int start, int end) { return super.substring(translator.utf16Index(start), translator.utf16Index(end)); } /** * Returns a PyUnicode from an already interned String. Just means it won't be re-interned if * used in a place that requires interned Strings. */ public static PyUnicode fromInterned(String s) { int n = s.length(); if (n > 1) { PyUnicode uni = new PyUnicode(TYPE, s, false); uni.interned = true; return uni; } else if (n == 1) { return from(s.charAt(0)); } else { return Py.EmptyUnicode; } } /** * Return a not-necessarily new {@link PyUnicode} from a Java {@code String}. * @param s UTF-16 string encoding the characters (as Java). * @param isBasic true if it is known that only BMP characters are present. * @return a new or re-used {@code PyUnicode} */ public static PyUnicode fromString(String s, boolean isBasic) { int n = s.length(); if (n > 1) { PyUnicode uni = new PyUnicode(TYPE, s, isBasic); return uni; } else if (n == 1) { return from(s.charAt(0)); } else { return Py.EmptyUnicode; } } /** * Return a not-necessarily new {@link PyUnicode} from a Java {@code char}. Some low index chars * (ASCII) return a re-used {@code PyUnicode}. This method does not assume the character is * basic-plane. * * @param c to convert to a {@code PyUnicode}. * @return a new or re-used {@code PyUnicode} */ public static PyUnicode from(char c) { if (c >= 0 && c < unichars.length) { return unichars[c]; } else { return new PyUnicode(c); } } /** * Return a not-necessarily new {@code PyUnicode} from a Java code point. 
* * @param codepoint of the single character required * @return a new or cached {@code PyUnicode} for the character */ public static PyUnicode fromCodepoint(int codepoint) { if (codepoint >= 0 && codepoint < unichars.length) { return unichars[codepoint]; } else { return new PyUnicode(codepoint); } } /** * {@inheritDoc} * * @return true if the string consists only of BMP characters */ @Override public boolean isBasicPlane() { return translator == BASIC; } public int getCodePointCount() { return string.length() - translator.suppCount(); } public static String checkEncoding(String s) { if (s == null || CharMatcher.ascii().matchesAllOf(s)) { return s; } return codecs.PyUnicode_EncodeASCII(s, s.length(), null); } @ExposedNew final static PyObject unicode_new(PyNewWrapper new_, boolean init, PyType subtype, PyObject[] args, String[] keywords) { ArgParser ap = new ArgParser("unicode", args, keywords, new String[] {"string", "encoding", "errors"}, 0); PyObject S = ap.getPyObject(0, null); String encoding = checkEncoding(ap.getString(1, null)); String errors = checkEncoding(ap.getString(2, null)); if (new_.for_type == subtype) { if (S == null) { return new PyUnicode(""); } if (S instanceof PyUnicode) { return new PyUnicode(((PyUnicode) S).getString()); } if (S instanceof PyString) { if (S.getType() != PyString.TYPE && encoding == null && errors == null) { return S.__unicode__(); } PyObject decoded = codecs.decode((PyString) S, encoding, errors); if (decoded instanceof PyUnicode) { return new PyUnicode(((PyUnicode) decoded).getString()); } else { throw Py.TypeError("decoder did not return a unicode object (type=" + decoded.getType().fastGetName() + ")"); } } return S.__unicode__(); } else { if (S == null) { return new PyUnicodeDerived(subtype, Py.EmptyString); } if (S instanceof PyUnicode) { return new PyUnicodeDerived(subtype, (PyUnicode) S); } else { return new PyUnicodeDerived(subtype, S.__str__()); } } } @Override public PyString createInstance(String string) { return 
new PyUnicode(string); } /** * @param string UTF-16 string encoding the characters (as Java). * @param isBasic true if it is known that only BMP characters are present. */ @Override protected PyString createInstance(String string, boolean isBasic) { return fromString(string, false); } @Override public PyObject __mod__(PyObject other) { return unicode___mod__(other); } @ExposedMethod(doc = BuiltinDocs.unicode___mod___doc) final PyObject unicode___mod__(PyObject other) { StringFormatter fmt = new StringFormatter(getString(), true); return fmt.format(other); } @Override public PyUnicode __unicode__() { return this; } @Override public PyString __str__() { return unicode___str__(); } @ExposedMethod(doc = BuiltinDocs.unicode___str___doc) final PyString unicode___str__() { return new PyString(encode()); } @Override public int __len__() { return unicode___len__(); } @ExposedMethod(doc = BuiltinDocs.unicode___len___doc) final int unicode___len__() { return getCodePointCount(); } @Override public PyString __repr__() { return unicode___repr__(); } @ExposedMethod(doc = BuiltinDocs.unicode___repr___doc) final PyString unicode___repr__() { return new PyString("u" + encode_UnicodeEscape(getString(), true)); } @ExposedMethod(doc = BuiltinDocs.unicode___getitem___doc) final PyObject unicode___getitem__(PyObject index) { return str___getitem__(index); } @ExposedMethod(defaults = "null", doc = BuiltinDocs.unicode___getslice___doc) final PyObject unicode___getslice__(PyObject start, PyObject stop, PyObject step) { return seq___getslice__(start, stop, step); } @Override protected PyObject getslice(int start, int stop, int step) { if (isBasicPlane()) { return super.getslice(start, stop, step); } if (step > 0 && stop < start) { stop = start; } StringBuilder buffer = new StringBuilder(sliceLength(start, stop, step)); for (Iterator iter = newSubsequenceIterator(start, stop, step); iter.hasNext();) { buffer.appendCodePoint(iter.next()); } return createInstance(buffer.toString()); } 
@ExposedMethod(type = MethodType.CMP) final int unicode___cmp__(PyObject other) { // XXX needs proper coercion like __eq__, then UCS-32 code point order :( return str___cmp__(other); } @Override public PyObject __eq__(PyObject other) { return unicode___eq__(other); } @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___eq___doc) final PyObject unicode___eq__(PyObject other) { try { String s = coerceForComparison(other); if (s == null) { return null; } return getString().equals(s) ? Py.True : Py.False; } catch (PyException e) { // Decoding failed: treat as unequal return Py.False; } } @Override public PyObject __ne__(PyObject other) { return unicode___ne__(other); } @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___ne___doc) final PyObject unicode___ne__(PyObject other) { try { String s = coerceForComparison(other); if (s == null) { return null; } return getString().equals(s) ? Py.False : Py.True; } catch (PyException e) { // Decoding failed: treat as unequal return Py.True; } } @Override public PyObject __lt__(PyObject other) { return unicode___lt__(other); } @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___lt___doc) final PyObject unicode___lt__(PyObject other) { String s = coerceForComparison(other); if (s == null) { return null; } return getString().compareTo(s) < 0 ? Py.True : Py.False; } @Override public PyObject __le__(PyObject other) { return unicode___le__(other); } @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___le___doc) final PyObject unicode___le__(PyObject other) { String s = coerceForComparison(other); if (s == null) { return null; } return getString().compareTo(s) <= 0 ? 
Py.True : Py.False; } @Override public PyObject __gt__(PyObject other) { return unicode___gt__(other); } @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___gt___doc) final PyObject unicode___gt__(PyObject other) { String s = coerceForComparison(other); if (s == null) { return null; } return getString().compareTo(s) > 0 ? Py.True : Py.False; } @Override public PyObject __ge__(PyObject other) { return unicode___ge__(other); } @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___ge___doc) final PyObject unicode___ge__(PyObject other) { String s = coerceForComparison(other); if (s == null) { return null; } return getString().compareTo(s) >= 0 ? Py.True : Py.False; } @ExposedMethod(doc = BuiltinDocs.unicode___hash___doc) final int unicode___hash__() { return str___hash__(); } @Override protected PyObject pyget(int i) { int codepoint = getString().codePointAt(translator.utf16Index(i)); return PyUnicode.fromCodepoint(codepoint); } @Override public int getInt(int i) { return getString().codePointAt(translator.utf16Index(i)); } /** * An iterator returning code points from this array, for use when not basic plane. */ private class SubsequenceIteratorImpl extends SubsequenceIteratorBasic { private int k; // UTF-16 index (of current) SubsequenceIteratorImpl(int start, int stop, int step) { super(start, stop, step); k = translator.utf16Index(current); } SubsequenceIteratorImpl() { this(0, getCodePointCount(), 1); } @Override protected int nextCodePoint() { int U; int W1 = getString().charAt(k); if (W1 >= 0xD800 && W1 < 0xDC00) { int W2 = getString().charAt(k + 1); U = (((W1 & 0x3FF) << 10) | (W2 & 0x3FF)) + 0x10000; k += 2; } else { U = W1; k += 1; } current += 1; return U; } } /** * An iterator returning code points from this array, for use when basic plane. 
*/ private class SubsequenceIteratorBasic implements Iterator { protected int current, stop, step; // Character indexes SubsequenceIteratorBasic(int start, int stop, int step) { current = start; this.stop = stop; this.step = step; } SubsequenceIteratorBasic() { this(0, getCodePointCount(), 1); } @Override public boolean hasNext() { return current < stop; } @Override public Integer next() { int codePoint = nextCodePoint(); for (int j = 1; j < step && hasNext(); j++) { nextCodePoint(); } return codePoint; } protected int nextCodePoint() { return getString().charAt(current++); } @Override public void remove() { throw new UnsupportedOperationException( "Not supported on PyUnicode objects (immutable)"); } } private static class SteppedIterator implements Iterator { private final Iterator iter; private final int step; private T lookahead = null; public SteppedIterator(int step, Iterator iter) { this.iter = iter; this.step = step; lookahead = advance(); } private T advance() { if (iter.hasNext()) { T elem = iter.next(); for (int i = 1; i < step && iter.hasNext(); i++) { iter.next(); } return elem; } else { return null; } } @Override public boolean hasNext() { return lookahead != null; } @Override public T next() { T old = lookahead; if (iter.hasNext()) { lookahead = iter.next(); for (int i = 1; i < step && iter.hasNext(); i++) { iter.next(); } } else { lookahead = null; } return old; } @Override public void remove() { throw new UnsupportedOperationException(); } } // XXX: Parameterize SubsequenceIteratorImpl and friends (and make them Iterable) /** Get an iterator over the code point sequence. */ public Iterator newSubsequenceIterator() { if (isBasicPlane()) { return new SubsequenceIteratorBasic(); } else { return new SubsequenceIteratorImpl(); } } /** Get an iterator over a slice of the code point sequence. 
*/ public Iterator newSubsequenceIterator(int start, int stop, int step) { if (isBasicPlane()) { if (step < 0) { return new SteppedIterator(step * -1, new ReversedIterator( new SubsequenceIteratorBasic(stop + 1, start + 1, 1))); } else { return new SubsequenceIteratorBasic(start, stop, step); } } else { if (step < 0) { return new SteppedIterator(step * -1, new ReversedIterator( new SubsequenceIteratorImpl(stop + 1, start + 1, 1))); } else { return new SubsequenceIteratorImpl(start, stop, step); } } } /** * Interpret the object as a Java String representing characters as UTF-16, or * return null if the type does not admit this conversion. From a * PyUnicode we return its internal string. A byte argument is decoded with the * default encoding. * * @param o the object to coerce * @return an equivalent String */ private static String coerceToStringOrNull(PyObject o) { if (o instanceof PyUnicode) { return ((PyUnicode) o).getString(); } else if (o instanceof PyString) { return ((PyString) o).decode().toString(); } else if (o instanceof BufferProtocol) { // PyByteArray, PyMemoryView, Py2kBuffer ... // We ought to be able to call codecs.decode on o but see Issue #2164 try (PyBuffer buf = ((BufferProtocol) o).getBuffer(PyBUF.FULL_RO)) { PyString s = new PyString(buf); // For any sensible codec, the return is unicode and toString() is getString(). return s.decode().toString(); } } else { // o is some type not allowed: return null; } } /** * Interpret the object as a Java String for use in comparison. The return * represents characters as UTF-16. From a PyUnicode we return its internal string. * A str and buffer argument is decoded with the default encoding. *

* This method could be replaced by {@link #coerceToStringOrNull(PyObject)} if we were content * to allowing a wider range of types to be supported in comparison operations than (C)Python * unicode.__eq__. * * @param o the object to coerce * @return an equivalent String */ private static String coerceForComparison(PyObject o) { if (o instanceof PyUnicode) { return ((PyUnicode) o).getString(); } else if (o instanceof PyString) { return ((PyString) o).decode().toString(); } else if (o instanceof Py2kBuffer) { // We ought to be able to call codecs.decode on o but see Issue #2164 try (PyBuffer buf = ((BufferProtocol) o).getBuffer(PyBUF.FULL_RO)) { PyString s = new PyString(buf); // For any sensible codec, the return is unicode and toString() is getString(). return s.decode().toString(); } } else { // o is some type not allowed: return null; } } /** * Interpret the object as a Java String representing characters as UTF-16, or * raise an error if the type does not admit this conversion. A byte argument is decoded with * the default encoding. * * @param o the object to coerce * @return an equivalent String (and never null) */ private static String coerceToString(PyObject o) { String s = coerceToStringOrNull(o); if (s == null) { throw errorCoercingToUnicode(o); } return s; } /** * Interpret the object as a Java String representing characters as UTF-16, or * optionally as null (for a null or None argument if the * second argument is true). Raise an error if the type does not admit this * conversion. * * @param o the object to coerce * @param allowNullArgument iff true allow a null or none argument * @return an equivalent String or null */ private static String coerceToString(PyObject o, boolean allowNullArgument) { if (allowNullArgument && (o == null || o == Py.None)) { return null; } else { return coerceToString(o); } } /** Construct exception "coercing to Unicode: ..." 
*/ private static PyException errorCoercingToUnicode(PyObject o) { return Py.TypeError("coercing to Unicode: need string or buffer, " + (o == null ? Py.None : o).getType().fastGetName() + " found"); } /** * Interpret the object as a PyUnicode, or return null if the type * does not admit this conversion. From a PyUnicode we return itself. A byte * argument is decoded with the default encoding. * * @param o the object to coerce * @return an equivalent PyUnicode (or o itself) */ private static PyUnicode coerceToUnicodeOrNull(PyObject o) { if (o instanceof PyUnicode) { return (PyUnicode) o; } else if (o instanceof PyString) { // For any sensible codec, the return here is unicode. PyObject u = ((PyString) o).decode(); return (u instanceof PyUnicode) ? (PyUnicode) u : new PyUnicode(o.toString()); } else if (o instanceof BufferProtocol) { // PyByteArray, PyMemoryView, Py2kBuffer ... // We ought to be able to call codecs.decode on o but see Issue #2164 try (PyBuffer buf = ((BufferProtocol) o).getBuffer(PyBUF.FULL_RO)) { PyString s = new PyString(buf); // For any sensible codec, the return is unicode and toString() is getString(). PyObject u = s.decode(); return (u instanceof PyUnicode) ? (PyUnicode) u : new PyUnicode(o.toString()); } } else { // o is some type not allowed: return null; } } /** * Interpret the object as a PyUnicode, or raise a TypeError if the * type does not admit this conversion. From a PyUnicode we return itself. A byte * argument is decoded with the default encoding. 
* * @param o the object to coerce * @return an equivalent PyUnicode (or o itself) */ private static PyUnicode coerceToUnicode(PyObject o) { PyUnicode u = coerceToUnicodeOrNull(o); if (u == null) { throw errorCoercingToUnicode(o); } return u; } @Override public boolean __contains__(PyObject o) { return unicode___contains__(o); } @ExposedMethod(doc = BuiltinDocs.unicode___contains___doc) final boolean unicode___contains__(PyObject o) { String other = coerceToString(o); return getString().indexOf(other) >= 0; } @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___getslice___doc) final PyObject unicode___mul__(PyObject o) { return str___mul__(o); } @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___getslice___doc) final PyObject unicode___rmul__(PyObject o) { return str___rmul__(o); } @Override public PyObject __add__(PyObject other) { return unicode___add__(other); } @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___getslice___doc) final PyObject unicode___add__(PyObject other) { // Interpret other as a Java String String s = coerceToStringOrNull(other); return s == null ? 
null : new PyUnicode(getString().concat(s)); } @ExposedMethod(doc = BuiltinDocs.unicode_lower_doc) final PyObject unicode_lower() { return new PyUnicode(getString().toLowerCase()); } @ExposedMethod(doc = BuiltinDocs.unicode_upper_doc) final PyObject unicode_upper() { return new PyUnicode(getString().toUpperCase()); } @ExposedMethod(doc = BuiltinDocs.unicode_title_doc) final PyObject unicode_title() { StringBuilder buffer = new StringBuilder(getString().length()); boolean previous_is_cased = false; for (Iterator iter = newSubsequenceIterator(); iter.hasNext();) { int codePoint = iter.next(); if (previous_is_cased) { buffer.appendCodePoint(Character.toLowerCase(codePoint)); } else { buffer.appendCodePoint(Character.toTitleCase(codePoint)); } if (Character.isLowerCase(codePoint) || Character.isUpperCase(codePoint) || Character.isTitleCase(codePoint)) { previous_is_cased = true; } else { previous_is_cased = false; } } return new PyUnicode(buffer); } @ExposedMethod(doc = BuiltinDocs.unicode_swapcase_doc) final PyObject unicode_swapcase() { StringBuilder buffer = new StringBuilder(getString().length()); for (Iterator iter = newSubsequenceIterator(); iter.hasNext();) { int codePoint = iter.next(); if (Character.isUpperCase(codePoint)) { buffer.appendCodePoint(Character.toLowerCase(codePoint)); } else if (Character.isLowerCase(codePoint)) { buffer.appendCodePoint(Character.toUpperCase(codePoint)); } else { buffer.appendCodePoint(codePoint); } } return new PyUnicode(buffer); } /** Define what characters are to be treated as a space according to Python 2. 
*/ private static boolean isPythonSpace(int ch) { // Use the Java built-in methods as far as possible return Character.isWhitespace(ch) // catches the ASCII spaces and some others || Character.isSpaceChar(ch) // catches remaining Unicode spaces || ch == 0x0085 // NEXT LINE (not a space in Java) || ch == 0x180e; // MONGOLIAN VOWEL SEPARATOR (not a space in Java 9+ or Python 3) } private static class StripIterator implements Iterator { private final Iterator iter; private int lookahead = -1; public StripIterator(PyUnicode sep, Iterator iter) { this.iter = iter; if (sep != null) { Set sepSet = Generic.set(); for (Iterator sepIter = sep.newSubsequenceIterator(); sepIter.hasNext();) { sepSet.add(sepIter.next()); } while (iter.hasNext()) { int codePoint = iter.next(); if (!sepSet.contains(codePoint)) { lookahead = codePoint; return; } } } else { while (iter.hasNext()) { int codePoint = iter.next(); if (!isPythonSpace(codePoint)) { lookahead = codePoint; return; } } } } @Override public boolean hasNext() { return lookahead != -1; } @Override public Integer next() { int old = lookahead; if (iter.hasNext()) { lookahead = iter.next(); } else { lookahead = -1; } return old; } @Override public void remove() { throw new UnsupportedOperationException(); } } // Compliance requires a bit of inconsistency with other coercions used. /** * Helper used in .strip() to "coerce" a method argument into a * PyUnicode (which it may already be). A null argument or a * PyNone causes null to be returned. A buffer type is not acceptable * to (Unicode) .strip(). This is the difference from * {@link #coerceToUnicode(PyObject, boolean)}. 
* * @param o the object to coerce * @param name of method * @return an equivalent PyUnicode (or o itself, or null) */ private static PyUnicode coerceStripSepToUnicode(PyObject o, String name) { if (o == null) { return null; } else if (o instanceof PyUnicode) { return (PyUnicode) o; } else if (o instanceof PyString) { PyObject u = ((PyString) o).decode(); return (u instanceof PyUnicode) ? (PyUnicode) u : new PyUnicode(u.toString()); } else if (o == Py.None) { return null; } else { throw Py.TypeError(name + " arg must be None, unicode or str"); } } @ExposedMethod(defaults = "null", doc = BuiltinDocs.unicode_strip_doc) final PyObject unicode_strip(PyObject sepObj) { PyUnicode sep = coerceStripSepToUnicode(sepObj, "strip"); if (isBasicPlane()) { // this contains only basic plane characters if (sep == null) { // And we're stripping whitespace, so use the PyString implementation return new PyUnicode(_strip()); } else if (sep.isBasicPlane()) { // And the strip characters are basic plane too, so use the PyString implementation return new PyUnicode(_strip(sep.getString())); } } // Not basic plane: have to do real Unicode return new PyUnicode(new ReversedIterator(new StripIterator(sep, new ReversedIterator<>(new StripIterator(sep, newSubsequenceIterator()))))); } @ExposedMethod(defaults = "null", doc = BuiltinDocs.unicode_lstrip_doc) final PyObject unicode_lstrip(PyObject sepObj) { PyUnicode sep = coerceStripSepToUnicode(sepObj, "lstrip"); if (isBasicPlane()) { // this contains only basic plane characters if (sep == null) { // And we're stripping whitespace, so use the PyString implementation return new PyUnicode(_lstrip()); } else if (sep.isBasicPlane()) { // And the strip characters are basic plane too, so use the PyString implementation return new PyUnicode(_lstrip(sep.getString())); } } // Not basic plane: have to do real Unicode return new PyUnicode(new StripIterator(sep, newSubsequenceIterator())); } @ExposedMethod(defaults = "null", doc = 
BuiltinDocs.unicode_rstrip_doc) final PyObject unicode_rstrip(PyObject sepObj) { PyUnicode sep = coerceStripSepToUnicode(sepObj, "rstrip"); if (isBasicPlane()) { // this contains only basic plane characters if (sep == null) { // And we're stripping whitespace, so use the PyString implementation return new PyUnicode(_rstrip()); } else if (sep.isBasicPlane()) { // And the strip characters are basic plane too, so use the PyString implementation return new PyUnicode(_rstrip(sep.getString())); } } // Not basic plane: have to do real Unicode return new PyUnicode(new ReversedIterator( new StripIterator(sep, new ReversedIterator<>(newSubsequenceIterator())))); } /** {@inheritDoc} */ @Override protected int _findLeft(int right) { String s = getString(); for (int left = 0; left < right; left++) { if (!isPythonSpace(s.charAt(left))) { return left; } } return right; } /** {@inheritDoc} */ @Override protected int _findRight() { String s = getString(); for (int right = s.length(); --right >= 0;) { if (!isPythonSpace(s.charAt(right))) { return right; } } return -1; } @Override public PyTuple partition(PyObject sep) { return unicode_partition(sep); } @ExposedMethod(doc = BuiltinDocs.unicode_partition_doc) final PyTuple unicode_partition(PyObject sep) { return unicodePartition(coerceToUnicode(sep)); } private abstract class SplitIterator implements Iterator { protected final int maxsplit; protected final Iterator iter = newSubsequenceIterator(); protected final LinkedList lookahead = new LinkedList(); protected int numSplits = 0; protected boolean completeSeparator = false; SplitIterator(int maxsplit) { this.maxsplit = maxsplit; } @Override public boolean hasNext() { return lookahead.peek() != null || (iter.hasNext() && (maxsplit == -1 || numSplits <= maxsplit)); } protected void addLookahead(StringBuilder buffer) { for (int codepoint : lookahead) { buffer.appendCodePoint(codepoint); } lookahead.clear(); } @Override public void remove() { throw new UnsupportedOperationException(); 
} public boolean getEndsWithSeparator() { return completeSeparator && !hasNext(); } } private class WhitespaceSplitIterator extends SplitIterator { WhitespaceSplitIterator(int maxsplit) { super(maxsplit); } @Override public PyUnicode next() { StringBuilder buffer = new StringBuilder(); addLookahead(buffer); if (numSplits == maxsplit) { while (iter.hasNext()) { buffer.appendCodePoint(iter.next()); } return new PyUnicode(buffer); } boolean inSeparator = false; boolean atBeginning = numSplits == 0; while (iter.hasNext()) { int codepoint = iter.next(); if (isPythonSpace(codepoint)) { completeSeparator = true; if (!atBeginning) { inSeparator = true; } } else if (!inSeparator) { completeSeparator = false; buffer.appendCodePoint(codepoint); } else { completeSeparator = false; lookahead.add(codepoint); break; } atBeginning = false; } numSplits++; return new PyUnicode(buffer); } } private static class PeekIterator implements Iterator { private T lookahead = null; private final Iterator iter; public PeekIterator(Iterator iter) { this.iter = iter; next(); } public T peek() { return lookahead; } @Override public boolean hasNext() { return lookahead != null; } @Override public T next() { T peeked = lookahead; lookahead = iter.hasNext() ? 
iter.next() : null; return peeked; } @Override public void remove() { throw new UnsupportedOperationException(); } } private static class ReversedIterator implements Iterator { private final List reversed = Generic.list(); private final Iterator iter; ReversedIterator(Iterator iter) { while (iter.hasNext()) { reversed.add(iter.next()); } Collections.reverse(reversed); this.iter = reversed.iterator(); } @Override public boolean hasNext() { return iter.hasNext(); } @Override public T next() { return iter.next(); } @Override public void remove() { throw new UnsupportedOperationException(); } } private class LineSplitIterator implements Iterator { private final PeekIterator iter = new PeekIterator<>(newSubsequenceIterator()); private final boolean keepends; LineSplitIterator(boolean keepends) { this.keepends = keepends; } @Override public boolean hasNext() { return iter.hasNext(); } @Override public PyObject next() { StringBuilder buffer = new StringBuilder(); while (iter.hasNext()) { int codepoint = iter.next(); if (codepoint == '\r' && iter.peek() != null && iter.peek() == '\n') { if (keepends) { buffer.appendCodePoint(codepoint); buffer.appendCodePoint(iter.next()); } else { iter.next(); } break; } else if (codepoint == '\n' || codepoint == '\r' || Character.getType(codepoint) == Character.LINE_SEPARATOR) { if (keepends) { buffer.appendCodePoint(codepoint); } break; } else { buffer.appendCodePoint(codepoint); } } return new PyUnicode(buffer); } @Override public void remove() { throw new UnsupportedOperationException(); } } private class SepSplitIterator extends SplitIterator { private final PyUnicode sep; SepSplitIterator(PyUnicode sep, int maxsplit) { super(maxsplit); this.sep = sep; } @Override public PyUnicode next() { StringBuilder buffer = new StringBuilder(); addLookahead(buffer); if (numSplits == maxsplit) { while (iter.hasNext()) { buffer.appendCodePoint(iter.next()); } return new PyUnicode(buffer); } boolean inSeparator = true; while (iter.hasNext()) { // 
TODO: should cache the first codepoint inSeparator = true; for (Iterator sepIter = sep.newSubsequenceIterator(); sepIter.hasNext();) { int codepoint = iter.next(); if (codepoint != sepIter.next()) { addLookahead(buffer); buffer.appendCodePoint(codepoint); inSeparator = false; break; } else { lookahead.add(codepoint); } } if (inSeparator) { lookahead.clear(); break; } } numSplits++; completeSeparator = inSeparator; return new PyUnicode(buffer); } } private SplitIterator newSplitIterator(PyUnicode sep, int maxsplit) { if (sep == null) { return new WhitespaceSplitIterator(maxsplit); } else if (sep.getCodePointCount() == 0) { throw Py.ValueError("empty separator"); } else { return new SepSplitIterator(sep, maxsplit); } } @Override public PyTuple rpartition(PyObject sep) { return unicode_rpartition(sep); } @ExposedMethod(doc = BuiltinDocs.unicode_rpartition_doc) final PyTuple unicode_rpartition(PyObject sep) { return unicodeRpartition(coerceToUnicode(sep)); } @ExposedMethod(defaults = {"null", "-1"}, doc = BuiltinDocs.unicode_split_doc) final PyList unicode_split(PyObject sepObj, int maxsplit) { String sep = coerceToString(sepObj, true); if (sep != null) { return _split(sep, maxsplit); } else { return _split(null, maxsplit); } } /** * {@inheritDoc} The split sections will be {@link PyUnicode} and use the Python * unicode definition of "space". */ @Override protected PyList splitfields(int maxsplit) { /* * Result built here is a list of split parts, exactly as required for s.split(None, * maxsplit). If there are to be n splits, there will be n+1 elements in L. 
*/ PyList list = new PyList(); String s = getString(); int length = s.length(), start = 0, splits = 0, index; if (maxsplit < 0) { // Make all possible splits: there can't be more than: maxsplit = length; } // start is always the first character not consumed into a piece on the list while (start < length) { // Find the next occurrence of non-whitespace while (start < length) { if (!isPythonSpace(s.charAt(start))) { // Break leaving start pointing at non-whitespace break; } start++; } if (start >= length) { // Only found whitespace so there is no next segment break; } else if (splits >= maxsplit) { // The next segment is the last and contains all characters up to the end index = length; } else { // The next segment runs up to the next next whitespace or end for (index = start; index < length; index++) { if (isPythonSpace(s.charAt(index))) { // Break leaving index pointing at whitespace break; } } } // Make a piece from start up to index list.append(fromSubstring(start, index)); splits++; // Start next segment search at that point start = index; } return list; } @ExposedMethod(defaults = {"null", "-1"}, doc = BuiltinDocs.unicode_rsplit_doc) final PyList unicode_rsplit(PyObject sepObj, int maxsplit) { String sep = coerceToString(sepObj, true); if (sep != null) { return _rsplit(sep, maxsplit); } else { return _rsplit(null, maxsplit); } } /** * {@inheritDoc} The split sections will be {@link PyUnicode} and use the Python * unicode definition of "space". */ @Override protected PyList rsplitfields(int maxsplit) { /* * Result built here (in reverse) is a list of split parts, exactly as required for * s.rsplit(None, maxsplit). If there are to be n splits, there will be n+1 elements. 
*/ PyList list = new PyList(); String s = getString(); int length = s.length(), end = length - 1, splits = 0, index; if (maxsplit < 0) { // Make all possible splits: there can't be more than: maxsplit = length; } // end is always the rightmost character not consumed into a piece on the list while (end >= 0) { // Find the next occurrence of non-whitespace (working leftwards) while (end >= 0) { if (!isPythonSpace(s.charAt(end))) { // Break leaving end pointing at non-whitespace break; } --end; } if (end < 0) { // Only found whitespace so there is no next segment break; } else if (splits >= maxsplit) { // The next segment is the last and contains all characters back to the beginning index = -1; } else { // The next segment runs back to the next next whitespace or beginning for (index = end; index >= 0; --index) { if (isPythonSpace(s.charAt(index))) { // Break leaving index pointing at whitespace break; } } } // Make a piece from index+1 start up to end+1 list.append(fromSubstring(index + 1, end + 1)); splits++; // Start next segment search at that point end = index; } list.reverse(); return list; } @ExposedMethod(defaults = "false", doc = BuiltinDocs.unicode___getslice___doc) final PyList unicode_splitlines(boolean keepends) { return new PyList(new LineSplitIterator(keepends)); } @Override protected PyString fromSubstring(int begin, int end) { assert (isBasicPlane()); // can only be used on a codepath from str_ equivalents return fromString(getString().substring(begin, end), true); } @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_index_doc) final int unicode_index(PyObject subObj, PyObject start, PyObject end) { final String sub = coerceToString(subObj); // Now use the mechanics of the PyString on the UTF-16. 
return checkIndex(_find(sub, start, end)); } @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_index_doc) final int unicode_rindex(PyObject subObj, PyObject start, PyObject end) { final String sub = coerceToString(subObj); // Now use the mechanics of the PyString on the UTF-16. return checkIndex(_rfind(sub, start, end)); } @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_count_doc) final int unicode_count(PyObject subObj, PyObject start, PyObject end) { final PyUnicode sub = coerceToUnicode(subObj); if (isBasicPlane()) { return _count(sub.getString(), start, end); } int[] indices = super.translateIndices(start, end); // do not convert to utf-16 indices. int count = 0; for (Iterator mainIter = newSubsequenceIterator(indices[0], indices[1], 1); mainIter.hasNext();) { int matched = sub.getCodePointCount(); for (Iterator subIter = sub.newSubsequenceIterator(); mainIter.hasNext() && subIter.hasNext();) { if (mainIter.next() != subIter.next()) { break; } matched--; } if (matched == 0) { count++; } } return count; } @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_find_doc) final int unicode_find(PyObject subObj, PyObject start, PyObject end) { int found = _find(coerceToString(subObj), start, end); return found < 0 ? -1 : translator.codePointIndex(found); } @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_rfind_doc) final int unicode_rfind(PyObject subObj, PyObject start, PyObject end) { int found = _rfind(coerceToString(subObj), start, end); return found < 0 ? 
-1 : translator.codePointIndex(found); } private static String padding(int n, int pad) { StringBuilder buffer = new StringBuilder(n); for (int i = 0; i < n; i++) { buffer.appendCodePoint(pad); } return buffer.toString(); } private static int parse_fillchar(String function, String fillchar) { if (fillchar == null) { return ' '; } if (fillchar.codePointCount(0, fillchar.length()) != 1) { throw Py.TypeError(function + "() argument 2 must be char, not str"); } return fillchar.codePointAt(0); } @ExposedMethod(defaults = "null", doc = BuiltinDocs.unicode___getslice___doc) final PyObject unicode_ljust(int width, String padding) { int n = width - getCodePointCount(); if (n <= 0) { return new PyUnicode(getString()); } else { return new PyUnicode(getString() + padding(n, parse_fillchar("ljust", padding))); } } @ExposedMethod(defaults = "null", doc = BuiltinDocs.unicode___getslice___doc) final PyObject unicode_rjust(int width, String padding) { int n = width - getCodePointCount(); if (n <= 0) { return new PyUnicode(getString()); } else { return new PyUnicode(padding(n, parse_fillchar("ljust", padding)) + getString()); } } @ExposedMethod(defaults = "null", doc = BuiltinDocs.unicode___getslice___doc) final PyObject unicode_center(int width, String padding) { int n = width - getCodePointCount(); if (n <= 0) { return new PyUnicode(getString()); } int half = n / 2; if (n % 2 > 0 && width % 2 > 0) { half += 1; } int pad = parse_fillchar("center", padding); return new PyUnicode(padding(half, pad) + getString() + padding(n - half, pad)); } @ExposedMethod(doc = BuiltinDocs.unicode_zfill_doc) final PyObject unicode_zfill(int width) { int n = getCodePointCount(); if (n >= width) { return new PyUnicode(getString()); } if (isBasicPlane()) { return new PyUnicode(str_zfill(width)); } StringBuilder buffer = new StringBuilder(width); int nzeros = width - n; boolean first = true; boolean leadingSign = false; for (Iterator iter = newSubsequenceIterator(); iter.hasNext();) { int codePoint = 
iter.next(); if (first) { first = false; if (codePoint == '+' || codePoint == '-') { buffer.appendCodePoint(codePoint); leadingSign = true; } for (int i = 0; i < nzeros; i++) { buffer.appendCodePoint('0'); } if (!leadingSign) { buffer.appendCodePoint(codePoint); } } else { buffer.appendCodePoint(codePoint); } } if (first) { for (int i = 0; i < nzeros; i++) { buffer.appendCodePoint('0'); } } return new PyUnicode(buffer); } @ExposedMethod(defaults = "8", doc = BuiltinDocs.unicode___getslice___doc) final PyObject unicode_expandtabs(int tabsize) { return new PyUnicode(str_expandtabs(tabsize)); } @ExposedMethod(doc = BuiltinDocs.unicode_capitalize_doc) final PyObject unicode_capitalize() { if (getString().length() == 0) { return this; } StringBuilder buffer = new StringBuilder(getString().length()); boolean first = true; for (Iterator iter = newSubsequenceIterator(); iter.hasNext();) { if (first) { buffer.appendCodePoint(Character.toUpperCase(iter.next())); first = false; } else { buffer.appendCodePoint(Character.toLowerCase(iter.next())); } } return new PyUnicode(buffer); } @ExposedMethod(defaults = "-1", doc = BuiltinDocs.unicode_replace_doc) final PyString unicode_replace(PyObject oldPieceObj, PyObject newPieceObj, int count) { // Convert other argument types to PyUnicode (or error) PyUnicode newPiece = coerceToUnicode(newPieceObj); PyUnicode oldPiece = coerceToUnicode(oldPieceObj); if (isBasicPlane() && newPiece.isBasicPlane() && oldPiece.isBasicPlane()) { // Use the mechanics of PyString, since all is basic plane return _replace(oldPiece.getString(), newPiece.getString(), count); } else { // A Unicode-specific implementation is needed working in code points StringBuilder buffer = new StringBuilder(); if (oldPiece.getCodePointCount() == 0) { Iterator iter = newSubsequenceIterator(); for (int i = 1; (count == -1 || i < count) && iter.hasNext(); i++) { if (i == 1) { buffer.append(newPiece.getString()); } buffer.appendCodePoint(iter.next()); 
buffer.append(newPiece.getString());
                }
                // Copy any code points the loop above did not consume, unchanged.
                while (iter.hasNext()) {
                    buffer.appendCodePoint(iter.next());
                }

            } else {
                // Non-empty pattern: split on oldPiece and re-join the pieces with newPiece.
                SplitIterator iter = newSplitIterator(oldPiece, count);
                int numSplits = 0;
                while (iter.hasNext()) {
                    buffer.append(iter.next().getString());
                    if (iter.hasNext()) {
                        buffer.append(newPiece.getString());
                    }
                    numSplits++;
                }
                // NOTE(review): presumably getEndsWithSeparator() reports that the text ended
                // with oldPiece, so one more newPiece is due - confirm against SplitIterator.
                if (iter.getEndsWithSeparator() && (count == -1 || numSplits <= count)) {
                    buffer.append(newPiece.getString());
                }
            }
            return new PyUnicode(buffer);
        }
    }

    // end utf-16 aware

    /**
     * Join the items of a sequence using this string as the separator. Delegates to
     * {@link #unicode_join(PyObject)}.
     *
     * @param seq the items to join
     * @return the joined string
     */
    @Override
    public PyString join(PyObject seq) {
        return unicode_join(seq);
    }

    /**
     * Exposed implementation of <code>unicode.join</code>; delegates to <code>unicodeJoin</code>.
     *
     * @param seq the items to join
     * @return the joined string
     */
    @ExposedMethod(doc = BuiltinDocs.unicode_join_doc)
    final PyUnicode unicode_join(PyObject seq) {
        return unicodeJoin(seq);
    }

    /**
     * Equivalent to the Python <code>unicode.startswith</code> method, testing whether a string
     * starts with a specified prefix, where a sub-range is specified by <code>[start:end]</code>.
     * Arguments <code>start</code> and <code>end</code> are interpreted as in slice notation, with
     * null or {@link Py#None} representing "missing". <code>prefix</code> can also be a tuple of
     * prefixes to look for.
     *
     * @param prefix string to check for (or a <code>PyTuple</code> of them).
     * @param start start of slice.
     * @param end end of slice.
     * @return <code>true</code> if this string slice starts with a specified prefix, otherwise
     *         <code>false</code>.
     */
    @Override
    public boolean startswith(PyObject prefix, PyObject start, PyObject end) {
        return unicode_startswith(prefix, start, end);
    }

    /**
     * Exposed implementation of <code>unicode.startswith</code>.
     *
     * @param prefix string to check for (or a <code>PyTuple</code> of them)
     * @param startObj start of slice, or <code>null</code> (the exposed default)
     * @param endObj end of slice, or <code>null</code> (the exposed default)
     * @return <code>true</code> if the slice starts with the prefix (or any prefix in the tuple)
     */
    @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_startswith_doc)
    final boolean unicode_startswith(PyObject prefix, PyObject startObj, PyObject endObj) {
        // The translated indices are applied directly to getString() below.
        int[] indices = translateIndices(startObj, endObj);
        int start = indices[0];
        int sliceLen = indices[1] - start;

        if (!(prefix instanceof PyTuple)) {
            // It ought to be PyUnicode or some kind of bytes with the buffer API to decode.
String s = coerceToString(prefix);
            // The prefix must fit inside the slice as well as match at its start.
            return sliceLen >= s.length() && getString().startsWith(s, start);

        } else {
            // Loop will return true if this slice starts with any prefix in the tuple
            for (PyObject prefixObj : ((PyTuple) prefix).getArray()) {
                // It ought to be PyUnicode or some kind of bytes with the buffer API.
                String s = coerceToString(prefixObj);
                if (sliceLen >= s.length() && getString().startsWith(s, start)) {
                    return true;
                }
            }
            // None matched
            return false;
        }
    }

    /**
     * Equivalent to the Python <code>unicode.endswith</code> method, testing whether a string ends
     * with a specified suffix, where a sub-range is specified by <code>[start:end]</code>.
     * Arguments <code>start</code> and <code>end</code> are interpreted as in slice notation, with
     * null or {@link Py#None} representing "missing". <code>suffix</code> can also be a tuple of
     * suffixes to look for.
     *
     * @param suffix string to check for (or a <code>PyTuple</code> of them).
     * @param start start of slice.
     * @param end end of slice.
     * @return <code>true</code> if this string slice ends with a specified suffix, otherwise
     *         <code>false</code>.
     */
    @Override
    public boolean endswith(PyObject suffix, PyObject start, PyObject end) {
        return unicode_endswith(suffix, start, end);
    }

    /**
     * Exposed implementation of <code>unicode.endswith</code>.
     *
     * @param suffix string to check for (or a <code>PyTuple</code> of them)
     * @param startObj start of slice, or <code>null</code> (the exposed default)
     * @param endObj end of slice, or <code>null</code> (the exposed default)
     * @return <code>true</code> if the slice ends with the suffix (or any suffix in the tuple)
     */
    @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_endswith_doc)
    final boolean unicode_endswith(PyObject suffix, PyObject startObj, PyObject endObj) {
        // Take the slice once; every candidate suffix is tested against it.
        int[] indices = translateIndices(startObj, endObj);
        String substr = getString().substring(indices[0], indices[1]);

        if (!(suffix instanceof PyTuple)) {
            // It ought to be PyUnicode or some kind of bytes with the buffer API.
            String s = coerceToString(suffix);
            return substr.endsWith(s);

        } else {
            // Loop will return true if this slice ends with any suffix in the tuple
            for (PyObject suffixObj : ((PyTuple) suffix).getArray()) {
                // It ought to be PyUnicode or some kind of bytes with the buffer API.
String s = coerceToString(suffixObj); if (substr.endsWith(s)) { return true; } } // None matched return false; } } @ExposedMethod(doc = BuiltinDocs.unicode_translate_doc) final PyObject unicode_translate(PyObject table) { return _codecs.translateCharmap(this, "ignore", table); } @ExposedMethod(doc = BuiltinDocs.unicode_islower_doc) final boolean unicode_islower() { boolean cased = false; for (Iterator iter = newSubsequenceIterator(); iter.hasNext();) { int codepoint = iter.next(); if (Character.isUpperCase(codepoint) || Character.isTitleCase(codepoint)) { return false; } else if (!cased && Character.isLowerCase(codepoint)) { cased = true; } } return cased; } @ExposedMethod(doc = BuiltinDocs.unicode_isupper_doc) final boolean unicode_isupper() { boolean cased = false; for (Iterator iter = newSubsequenceIterator(); iter.hasNext();) { int codepoint = iter.next(); if (Character.isLowerCase(codepoint) || Character.isTitleCase(codepoint)) { return false; } else if (!cased && Character.isUpperCase(codepoint)) { cased = true; } } return cased; } @ExposedMethod(doc = BuiltinDocs.unicode_isalpha_doc) final boolean unicode_isalpha() { if (getCodePointCount() == 0) { return false; } for (Iterator iter = newSubsequenceIterator(); iter.hasNext();) { if (!Character.isLetter(iter.next())) { return false; } } return true; } @ExposedMethod(doc = BuiltinDocs.unicode_isalnum_doc) final boolean unicode_isalnum() { if (getCodePointCount() == 0) { return false; } for (Iterator iter = newSubsequenceIterator(); iter.hasNext();) { int codePoint = iter.next(); if (!(Character.isLetterOrDigit(codePoint) || // Character.getType(codePoint) == Character.LETTER_NUMBER)) { return false; } } return true; } @ExposedMethod(doc = BuiltinDocs.unicode_isdecimal_doc) final boolean unicode_isdecimal() { if (getCodePointCount() == 0) { return false; } for (Iterator iter = newSubsequenceIterator(); iter.hasNext();) { if (Character.getType(iter.next()) != Character.DECIMAL_DIGIT_NUMBER) { return false; } } 
return true; } @ExposedMethod(doc = BuiltinDocs.unicode_isdigit_doc) final boolean unicode_isdigit() { if (getCodePointCount() == 0) { return false; } for (Iterator iter = newSubsequenceIterator(); iter.hasNext();) { if (!Character.isDigit(iter.next())) { return false; } } return true; } @ExposedMethod(doc = BuiltinDocs.unicode_isnumeric_doc) final boolean unicode_isnumeric() { if (getCodePointCount() == 0) { return false; } for (Iterator iter = newSubsequenceIterator(); iter.hasNext();) { int type = Character.getType(iter.next()); if (type != Character.DECIMAL_DIGIT_NUMBER && type != Character.LETTER_NUMBER && type != Character.OTHER_NUMBER) { return false; } } return true; } @ExposedMethod(doc = BuiltinDocs.unicode_istitle_doc) final boolean unicode_istitle() { if (getCodePointCount() == 0) { return false; } boolean cased = false; boolean previous_is_cased = false; for (Iterator iter = newSubsequenceIterator(); iter.hasNext();) { int codePoint = iter.next(); if (Character.isUpperCase(codePoint) || Character.isTitleCase(codePoint)) { if (previous_is_cased) { return false; } previous_is_cased = true; cased = true; } else if (Character.isLowerCase(codePoint)) { if (!previous_is_cased) { return false; } previous_is_cased = true; cased = true; } else { previous_is_cased = false; } } return cased; } @ExposedMethod(doc = BuiltinDocs.unicode_isspace_doc) final boolean unicode_isspace() { if (getCodePointCount() == 0) { return false; } for (Iterator iter = newSubsequenceIterator(); iter.hasNext();) { if (!isPythonSpace(iter.next())) { return false; } } return true; } // end utf-16 aware @ExposedMethod(doc = "isunicode is deprecated.") final boolean unicode_isunicode() { Py.warning(Py.DeprecationWarning, "isunicode is deprecated."); return true; } @ExposedMethod(doc = BuiltinDocs.unicode_encode_doc) final String unicode_encode(PyObject[] args, String[] keywords) { return str_encode(args, keywords); } @ExposedMethod(doc = BuiltinDocs.unicode_decode_doc) final PyObject 
unicode_decode(PyObject[] args, String[] keywords) {
        return str_decode(args, keywords);
    }

    /**
     * Exposed implementation of <code>__getnewargs__</code>.
     *
     * @return a one-element tuple holding a new <code>PyUnicode</code> built from this string's
     *         value
     */
    @ExposedMethod(doc = BuiltinDocs.unicode___getnewargs___doc)
    final PyTuple unicode___getnewargs__() {
        return new PyTuple(new PyUnicode(this.getString()));
    }

    /**
     * Format this string according to a format specification. Delegates to
     * {@link #unicode___format__(PyObject)}.
     *
     * @param formatSpec the format specification
     * @return the formatted result
     */
    @Override
    public PyObject __format__(PyObject formatSpec) {
        return unicode___format__(formatSpec);
    }

    /**
     * Exposed implementation of <code>unicode.__format__</code>.
     *
     * @param formatSpec the format specification
     * @return the formatted result
     */
    @ExposedMethod(doc = BuiltinDocs.unicode___format___doc)
    final PyObject unicode___format__(PyObject formatSpec) {
        // Re-use the str implementation, which adapts itself to unicode.
        return str___format__(formatSpec);
    }

    /**
     * Exposed implementation of <code>_formatter_parser</code>.
     *
     * @return a new <code>MarkupIterator</code> over this string
     */
    @ExposedMethod(doc = BuiltinDocs.unicode__formatter_parser_doc)
    final PyObject unicode__formatter_parser() {
        return new MarkupIterator(this);
    }

    /**
     * Exposed implementation of <code>_formatter_field_name_split</code>.
     *
     * @return a two-element tuple: the iterator's <code>pyHead()</code> and the
     *         <code>FieldNameIterator</code> itself
     */
    @ExposedMethod(doc = BuiltinDocs.unicode__formatter_field_name_split_doc)
    final PyObject unicode__formatter_field_name_split() {
        FieldNameIterator iterator = new FieldNameIterator(this);
        return new PyTuple(iterator.pyHead(), iterator);
    }

    /**
     * Exposed implementation of <code>unicode.format</code>.
     *
     * @param args positional and keyword argument values
     * @param keywords names of the keyword arguments
     * @return the formatted text as a new <code>PyUnicode</code>
     */
    @ExposedMethod(doc = BuiltinDocs.unicode_format_doc)
    final PyObject unicode_format(PyObject[] args, String[] keywords) {
        try {
            return new PyUnicode(buildFormattedString(args, keywords, null, null));
        } catch (IllegalArgumentException e) {
            // Re-raise as a Python ValueError carrying the same message.
            throw Py.ValueError(e.getMessage());
        }
    }

    /**
     * Return the iterator obtained from <code>newSubsequenceIterator()</code>.
     *
     * @return an iterator over this string
     */
    @Override
    public Iterator iterator() {
        return newSubsequenceIterator();
    }

    /**
     * Convert to complex by applying <code>PyString.__complex__</code> to the decimal encoding
     * of this string.
     */
    @Override
    public PyComplex __complex__() {
        return new PyString(encodeDecimal()).__complex__();
    }

    /**
     * Convert to int by applying <code>PyString.atoi</code> to the decimal encoding of this
     * string.
     *
     * @param base passed through to <code>PyString.atoi</code>
     */
    @Override
    public int atoi(int base) {
        return new PyString(encodeDecimal()).atoi(base);
    }

    /**
     * Convert to long by applying <code>PyString.atol</code> to the decimal encoding of this
     * string.
     *
     * @param base passed through to <code>PyString.atol</code>
     */
    @Override
    public PyLong atol(int base) {
        return new PyString(encodeDecimal()).atol(base);
    }

    /**
     * Convert to float by applying <code>PyString.atof</code> to the decimal encoding of this
     * string.
     */
    @Override
    public double atof() {
        return new PyString(encodeDecimal()).atof();
    }

    /**
     * Encode unicode into a valid decimal String. Throws a UnicodeEncodeError on invalid
     * characters.
*
     * Space characters (per <code>isPythonSpace</code>) become an ASCII space, decimal digits
     * become their ASCII digit, and other code points in the range 1..255 are copied unchanged.
     *
     * @return a valid decimal as an encoded String
     */
    private String encodeDecimal() {
        if (isBasicPlane()) {
            return encodeDecimalBasic();
        }

        StringBuilder sb = new StringBuilder();
        int i = 0; // counts code points, not chars
        // The iterator yields code points; the type argument is needed for the unboxing below.
        for (Iterator<Integer> iter = newSubsequenceIterator(); iter.hasNext(); i++) {
            int codePoint = iter.next();
            if (isPythonSpace(codePoint)) {
                sb.append(' ');
                continue;
            }
            int digit = Character.digit(codePoint, 10);
            if (digit >= 0) {
                sb.append(digit);
                continue;
            }
            if (0 < codePoint && codePoint < 256) {
                sb.appendCodePoint(codePoint);
                continue;
            }
            // All other characters are considered unencodable
            // NOTE(review): i is a code point index while getString() is UTF-16; confirm which
            // kind of index codecs.encoding_error expects after a supplementary character.
            codecs.encoding_error("strict", "decimal", getString(), i, i + 1,
                    "invalid decimal Unicode string");
        }
        return sb.toString();
    }

    /**
     * Encode unicode in the basic plane into a valid decimal String. Throws a UnicodeEncodeError on
     * invalid characters.
     *
     * Space characters (per <code>isPythonSpace</code>) become an ASCII space, decimal digits
     * become their ASCII digit, and other chars in the range 1..255 are copied unchanged.
     *
     * @return a valid decimal as an encoded String
     */
    private String encodeDecimalBasic() {
        // Fetch the backing string once rather than on every iteration.
        String s = getString();
        int length = s.length();
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < length; i++) {
            char ch = s.charAt(i);
            if (isPythonSpace(ch)) {
                sb.append(' ');
                continue;
            }
            int digit = Character.digit(ch, 10);
            if (digit >= 0) {
                sb.append(digit);
                continue;
            }
            if (0 < ch && ch < 256) {
                sb.append(ch);
                continue;
            }
            // All other characters are considered unencodable
            codecs.encoding_error("strict", "decimal", s, i, i + 1,
                    "invalid decimal Unicode string");
        }
        return sb.toString();
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy