![JAR search and dependency download from the Maven repository](/logo.png)
org.python.core.PyUnicode Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jython-standalone Show documentation
Show all versions of jython-standalone Show documentation
Jython is an implementation of the high-level, dynamic, object-oriented
language Python written in 100% Pure Java, and seamlessly integrated with
the Java platform. It thus allows you to run Python on any Java platform.
package org.python.core;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.python.expose.ExposedMethod;
import org.python.expose.ExposedNew;
import org.python.expose.ExposedType;
import org.python.expose.MethodType;
import org.python.modules._codecs;
import org.python.util.Generic;
/**
* a builtin python unicode string.
*/
@ExposedType(name = "unicode", base = PyBaseString.class)
public class PyUnicode extends PyString implements Iterable {
// Cached classification of this string, maintained by isBasicPlane():
// UNKNOWN = not determined yet, BASIC = UTF-16 length equals the code point
// count, ASTRAL = the two differ.
private enum Plane {
UNKNOWN, BASIC, ASTRAL
}
// Set by the (String, boolean) constructor or lazily by isBasicPlane().
private volatile Plane plane = Plane.UNKNOWN;
// Lazily filled in by getCodePointCount(); a negative value means "not computed yet".
private volatile int codePointCount = -1;
public static final PyType TYPE = PyType.fromClass(PyUnicode.class);
/** Creates an empty unicode object of type {@link #TYPE}. */
// for PyJavaClass.init()
public PyUnicode() {
this(TYPE, "");
}
/** Creates a unicode object of type {@link #TYPE} wrapping {@code string}. */
public PyUnicode(String string) {
this(TYPE, string);
}
/**
 * Creates a unicode object and optionally pre-seeds the plane cache.
 *
 * @param string the UTF-16 contents
 * @param isBasic when true the plane is recorded as BASIC; when false it is
 *        left UNKNOWN and determined later by isBasicPlane()
 */
public PyUnicode(String string, boolean isBasic) {
this(TYPE, string);
plane = isBasic ? Plane.BASIC : Plane.UNKNOWN;
}
/** Creates a unicode object of the given type wrapping {@code string}; delegates to the superclass constructor. */
public PyUnicode(PyType subtype, String string) {
super(subtype, string);
}
/** Creates a unicode object of type {@link #TYPE} from a PyString (see the two-argument form for the conversion rule). */
public PyUnicode(PyString pystring) {
this(TYPE, pystring);
}
/**
 * Creates a unicode object of the given type from a PyString. If the argument
 * is already a PyUnicode its backing string is reused; otherwise the string is
 * obtained from {@code pystring.decode().toString()}.
 */
public PyUnicode(PyType subtype, PyString pystring) {
this(subtype, pystring instanceof PyUnicode ? pystring.string : pystring.decode().toString());
}
/** Creates a one-character unicode object from a single UTF-16 code unit. */
public PyUnicode(char c) {
this(TYPE, String.valueOf(c));
}
/** Creates a unicode object holding the single given code point (encoded via {@code new String(int[], int, int)}). */
public PyUnicode(int codepoint) {
this(TYPE, new String(new int[]{codepoint}, 0, 1));
}
/** Creates a unicode object from an array of code points, in array order. */
public PyUnicode(int[] codepoints) {
this(new String(codepoints, 0, codepoints.length));
}
/** Package-private: creates a unicode object from a copy of the builder's current contents. */
PyUnicode(StringBuilder buffer) {
this(TYPE, new String(buffer));
}
/**
 * Collects the code points produced by {@code iter} into a UTF-16 buffer.
 *
 * @param iter source of Unicode code points; consumed until exhausted
 * @return a new builder holding the encoded code points in iteration order
 */
private static StringBuilder fromCodePoints(Iterator<Integer> iter) {
    StringBuilder buffer = new StringBuilder();
    while (iter.hasNext()) {
        // Typed element so that appendCodePoint(int) is applicable via unboxing.
        buffer.appendCodePoint(iter.next());
    }
    return buffer;
}
/** Creates a unicode object from the code points yielded by {@code iter} (see fromCodePoints). */
public PyUnicode(Iterator iter) {
this(fromCodePoints(iter));
}
/** Creates a unicode object from a collection of code points, in the collection's iteration order. */
public PyUnicode(Collection ucs4) {
this(ucs4.iterator());
}
/**
 * Decodes the backing UTF-16 string into an array of code points.
 *
 * @return a new array with one entry per code point, of length
 *         {@link #getCodePointCount()}
 */
@Override
public int[] toCodePoints() {
    int n = getCodePointCount();
    int[] codePoints = new int[n];
    // Walk the string with the same decoding rules String.codePointCount uses,
    // so the array is always filled exactly (including unpaired surrogates).
    int k = 0;
    for (int i = 0; i < n; i++) {
        int codePoint = string.codePointAt(k);
        codePoints[i] = codePoint;
        k += Character.charCount(codePoint);
    }
    return codePoints;
}
/**
 * Returns the substring between two indices. For basic-plane strings this
 * defers to the superclass; otherwise {@code start} and {@code end} are handed
 * to the code point iterator, a temporary PyUnicode is built from it and its
 * backing string is returned.
 */
// modified to know something about codepoints; we just need to return the
// corresponding substring; darn UTF16!
// TODO: we could avoid doing this unnecessary copy
@Override
public String substring(int start, int end) {
if (isBasicPlane()) {
return super.substring(start, end);
}
return new PyUnicode(newSubsequenceIterator(start, end, 1)).string;
}
/**
 * Wraps a String that the caller has already interned and marks the new
 * object as interned, so code that requires interned strings will not
 * intern it again.
 *
 * @param interned an already-interned String
 * @return a PyUnicode of type {@link #TYPE} flagged as interned
 */
public static PyUnicode fromInterned(String interned) {
    PyUnicode result = new PyUnicode(TYPE, interned);
    result.interned = true;
    return result;
}
/**
 * Reports whether every code point occupies a single UTF-16 unit, i.e. the
 * string length equals the code point count. The answer is computed on first
 * use and cached in {@code plane}.
 *
 * @return true if the string is classified as BASIC
 */
public boolean isBasicPlane() {
    Plane known = plane;
    if (known == Plane.UNKNOWN) {
        known = (string.length() == getCodePointCount()) ? Plane.BASIC : Plane.ASTRAL;
        plane = known;
    }
    return known == Plane.BASIC;
}
// RETAIN THE BELOW CODE, it facilitates testing astral support more completely
// public boolean isBasicPlane() {
// return false;
// }
// END RETAIN
/**
 * Returns the number of Unicode code points in the string, computing it on
 * first use and caching it in {@code codePointCount}.
 */
public int getCodePointCount() {
    int cached = codePointCount;
    if (cached < 0) {
        cached = string.codePointCount(0, string.length());
        codePointCount = cached;
    }
    return cached;
}
/**
 * Implements {@code unicode(string, encoding, errors)}.
 *
 * When a subtype is being instantiated a PyUnicodeDerived is returned, built
 * from the empty string, from the unicode argument itself, or from
 * {@code S.__str__()}. Otherwise: no argument yields an empty unicode; a
 * unicode argument is copied; a str argument is decoded with the given
 * encoding/errors (str subclasses with neither given use {@code __unicode__});
 * anything else uses {@code __unicode__}.
 *
 * Throws a Python TypeError if the decoder returns a non-unicode object.
 */
@ExposedNew
final static PyObject unicode_new(PyNewWrapper new_, boolean init, PyType subtype,
        PyObject[] args, String[] keywords) {
    String[] paramNames = {"string", "encoding", "errors"};
    ArgParser parser = new ArgParser("unicode", args, keywords, paramNames, 0);
    PyObject S = parser.getPyObject(0, null);
    String encoding = parser.getString(1, null);
    String errors = parser.getString(2, null);

    // Subtype construction.
    if (new_.for_type != subtype) {
        if (S == null) {
            return new PyUnicodeDerived(subtype, Py.EmptyString);
        }
        if (S instanceof PyUnicode) {
            return new PyUnicodeDerived(subtype, (PyUnicode) S);
        }
        return new PyUnicodeDerived(subtype, S.__str__());
    }

    // Exact-type construction.
    if (S == null) {
        return new PyUnicode("");
    }
    if (S instanceof PyUnicode) {
        return new PyUnicode(((PyUnicode) S).string);
    }
    if (!(S instanceof PyString)) {
        return S.__unicode__();
    }
    if (S.getType() != PyString.TYPE && encoding == null && errors == null) {
        return S.__unicode__();
    }
    PyObject decoded = codecs.decode((PyString) S, encoding, errors);
    if (!(decoded instanceof PyUnicode)) {
        throw Py.TypeError("decoder did not return an unicode object (type=" +
                decoded.getType().fastGetName() + ")");
    }
    return new PyUnicode((PyUnicode) decoded);
}
/** Factory hook: builds a PyUnicode (rather than the superclass type) for {@code str}. */
@Override
public PyString createInstance(String str) {
return new PyUnicode(str);
}
// Unicode ops consisting of basic strings can only produce basic strings;
// this may not be the case for astral ones - they also might be basic, in
// case of deletes. So optimize by providing a tainting mechanism.
/** Factory hook that forwards the caller's {@code isBasic} hint to the (String, boolean) constructor. */
@Override
protected PyString createInstance(String str, boolean isBasic) {
return new PyUnicode(str, isBasic);
}
/** Python {@code self % other}; delegates to unicode___mod__. */
@Override
public PyObject __mod__(PyObject other) {
return unicode___mod__(other);
}
/**
 * Exposed implementation of {@code %} formatting: formats {@code other}
 * using this string as the format specification.
 */
@ExposedMethod(doc = BuiltinDocs.unicode___mod___doc)
final PyObject unicode___mod__(PyObject other) {
    // The second constructor argument presumably selects unicode output - confirm in StringFormatter.
    return new StringFormatter(string, true).format(other);
}
/** Returns this object itself; it is already unicode. */
@Override
public PyUnicode __unicode__() {
return this;
}
/** Python {@code str(self)}; delegates to unicode___str__. */
@Override
public PyString __str__() {
return unicode___str__();
}
/** Exposed {@code __str__}: wraps the result of {@code encode()} in a new PyString. */
@ExposedMethod(doc = BuiltinDocs.unicode___str___doc)
final PyString unicode___str__() {
return new PyString(encode());
}
/** Python {@code len(self)}; delegates to unicode___len__. */
@Override
public int __len__() {
return unicode___len__();
}
/** Exposed {@code __len__}: the length is the number of code points, not UTF-16 units. */
@ExposedMethod(doc = BuiltinDocs.unicode___len___doc)
final int unicode___len__() {
return getCodePointCount();
}
/** Python {@code repr(self)}; delegates to unicode___repr__. */
@Override
public PyString __repr__() {
return unicode___repr__();
}
/** Exposed {@code __repr__}: the letter {@code u} followed by the result of encode_UnicodeEscape on the backing string. */
@ExposedMethod(doc = BuiltinDocs.unicode___repr___doc)
final PyString unicode___repr__() {
return new PyString("u" + encode_UnicodeEscape(string, true));
}
/** Exposed {@code __getitem__}; delegates to the inherited str___getitem__. */
@ExposedMethod(doc = BuiltinDocs.unicode___getitem___doc)
final PyObject unicode___getitem__(PyObject index) {
return str___getitem__(index);
}
/** Exposed {@code __getslice__}; delegates to the inherited seq___getslice__. {@code step} defaults to null. */
@ExposedMethod(defaults = "null", doc = BuiltinDocs.unicode___getslice___doc)
final PyObject unicode___getslice__(PyObject start, PyObject stop, PyObject step) {
return seq___getslice__(start, stop, step);
}
/**
 * Returns the slice {@code [start:stop:step]} measured in code points.
 * Basic-plane strings use the superclass implementation; otherwise the slice
 * is assembled code point by code point from the subsequence iterator.
 */
@Override
protected PyObject getslice(int start, int stop, int step) {
    if (isBasicPlane()) {
        return super.getslice(start, stop, step);
    }
    // An inverted forward range is treated as empty.
    if (step > 0 && stop < start) {
        stop = start;
    }
    StringBuilder buffer = new StringBuilder(sliceLength(start, stop, step));
    // Typed iterator so elements unbox to the int that appendCodePoint requires.
    Iterator<Integer> iter = newSubsequenceIterator(start, stop, step);
    while (iter.hasNext()) {
        buffer.appendCodePoint(iter.next());
    }
    return createInstance(buffer.toString());
}
/** Exposed {@code __cmp__}; delegates to the inherited str___cmp__. */
@ExposedMethod(type = MethodType.CMP, doc = BuiltinDocs.unicode___getslice___doc)
final int unicode___cmp__(PyObject other) {
return str___cmp__(other);
}
/** Exposed {@code __eq__}; delegates to the inherited str___eq__. */
@ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___getslice___doc)
final PyObject unicode___eq__(PyObject other) {
return str___eq__(other);
}
/** Exposed {@code __ne__}; delegates to the inherited str___ne__. */
@ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___getslice___doc)
final PyObject unicode___ne__(PyObject other) {
return str___ne__(other);
}
/** Exposed {@code __hash__}; delegates to the inherited str___hash__. */
@ExposedMethod(doc = BuiltinDocs.unicode___hash___doc)
final int unicode___hash__() {
return str___hash__();
}
/**
 * Returns the character at code point index {@code i} as a Python object.
 *
 * For basic-plane strings the index is a direct UTF-16 offset. Otherwise the
 * offset is found with {@link String#offsetByCodePoints(int, int)}, which
 * treats an unpaired surrogate as one code point (the same rule
 * {@code String.codePointCount} uses), so indexing stays consistent with
 * {@code len()}.
 */
@Override
protected PyObject pyget(int i) {
    if (isBasicPlane()) {
        return Py.makeCharacter(string.charAt(i), true);
    }
    // Non-positive indices address the first code point, as before.
    int k = (i > 0) ? string.offsetByCodePoints(0, i) : 0;
    int codepoint = string.codePointAt(k);
    return Py.makeCharacter(codepoint, true);
}
/**
 * Forward iterator over the code points of the enclosing string in the
 * half-open code point range {@code [start, stop)}, yielding every
 * {@code step}-th code point.
 *
 * Decoding uses {@link String#codePointAt(int)}, so an unpaired surrogate is
 * returned as a single code point instead of being combined with whatever
 * follows it (or reading past the end of the string).
 */
private class SubsequenceIteratorImpl implements Iterator<Integer> {
    // current: code point index of the next element; k: its UTF-16 offset.
    private int current, k, stop, step;

    SubsequenceIteratorImpl(int start, int stop, int step) {
        k = 0;
        current = start;
        this.stop = stop;
        this.step = step;
        // Advance the UTF-16 cursor to the first requested code point.
        for (int i = 0; i < start; i++) {
            nextCodePoint();
        }
    }

    SubsequenceIteratorImpl() {
        this(0, getCodePointCount(), 1);
    }

    public boolean hasNext() {
        return current < stop;
    }

    public Integer next() {
        int codePoint = nextCodePoint();
        current += 1;
        // Skip the step - 1 code points between this element and the next one.
        for (int j = 1; j < step && hasNext(); j++) {
            nextCodePoint();
            current += 1;
        }
        return codePoint;
    }

    // Decodes the code point at the cursor and moves the cursor past it.
    private int nextCodePoint() {
        int codePoint = string.codePointAt(k);
        k += Character.charCount(codePoint);
        return codePoint;
    }

    public void remove() {
        throw new UnsupportedOperationException("Not supported on PyUnicode objects (immutable)");
    }
}
private static class SteppedIterator implements Iterator {
private final Iterator iter;
private final int step;
private T lookahead = null;
public SteppedIterator(int step, Iterator iter) {
this.iter = iter;
this.step = step;
lookahead = advance();
}
private T advance() {
if (iter.hasNext()) {
T elem = iter.next();
for (int i = 1; i < step && iter.hasNext(); i++) {
iter.next();
}
return elem;
} else {
return null;
}
}
public boolean hasNext() {
return lookahead != null;
}
public T next() {
T old = lookahead;
if (iter.hasNext()) {
lookahead = iter.next();
for (int i = 1; i < step && iter.hasNext(); i++) {
iter.next();
}
} else {
lookahead = null;
}
return old;
}
public void remove() {
throw new UnsupportedOperationException();
}
}
// XXX: Parameterize SubsequenceIteratorImpl and friends (and make them Iterable)
/** Returns an iterator over all code points of this string, in order. */
public Iterator newSubsequenceIterator() {
return new SubsequenceIteratorImpl();
}
/**
 * Returns an iterator over the code points selected by a slice.
 *
 * For a non-negative {@code step} the range {@code [start, stop)} is walked
 * forwards. For a negative {@code step} the range
 * {@code [stop + 1, start + 1)} is collected, reversed, and then sampled
 * every {@code -step} elements.
 */
public Iterator newSubsequenceIterator(int start, int stop, int step) {
    if (step >= 0) {
        return new SubsequenceIteratorImpl(start, stop, step);
    }
    Iterator forward = new SubsequenceIteratorImpl(stop + 1, start + 1, 1);
    return new SteppedIterator(-step, new ReversedIterator(forward));
}
/**
 * Coerces a method argument to PyUnicode.
 *
 * @param o the argument; may be null or {@code Py.None}
 * @return {@code o} itself if it is a PyUnicode, a new PyUnicode built from
 *         {@code o.toString()} if it is a PyString, or null for null/None
 * @throws PyException a Python TypeError for any other type
 */
private PyUnicode coerceToUnicode(PyObject o) {
    if (o == null || o == Py.None) {
        return null;
    }
    if (o instanceof PyUnicode) {
        return (PyUnicode) o;
    }
    if (o instanceof PyString) {
        return new PyUnicode(o.toString());
    }
    // The leading space before "found" was missing, which ran the type name
    // and the word together in the message.
    throw Py.TypeError("coercing to Unicode: need string or buffer, " +
            o.getType().fastGetName() + " found");
}
/** Exposed {@code __contains__}; delegates to the inherited str___contains__. */
@ExposedMethod(doc = BuiltinDocs.unicode___contains___doc)
final boolean unicode___contains__(PyObject o) {
return str___contains__(o);
}
/** Exposed {@code __mul__}; delegates to the inherited str___mul__. */
@ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___getslice___doc)
final PyObject unicode___mul__(PyObject o) {
return str___mul__(o);
}
/** Exposed {@code __rmul__}; delegates to the inherited str___rmul__. */
@ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___getslice___doc)
final PyObject unicode___rmul__(PyObject o) {
return str___rmul__(o);
}
/** Python {@code self + other}; delegates to unicode___add__. */
@Override
public PyObject __add__(PyObject other) {
return unicode___add__(other);
}
/**
 * Exposed {@code __add__}: concatenates this string with another unicode, or
 * with a str after decoding it.
 *
 * @return the concatenation as a new PyUnicode, or null when {@code other}
 *         is neither a PyUnicode nor a PyString
 */
@ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___getslice___doc)
final PyObject unicode___add__(PyObject other) {
    final PyUnicode rhs;
    if (other instanceof PyUnicode) {
        rhs = (PyUnicode) other;
    } else if (other instanceof PyString) {
        rhs = (PyUnicode) ((PyString) other).decode();
    } else {
        return null;
    }
    return new PyUnicode(string.concat(rhs.string));
}
/** Exposed {@code lower()}: wraps the result of the inherited str_lower() in a new PyUnicode. */
@ExposedMethod(doc = BuiltinDocs.unicode_lower_doc)
final PyObject unicode_lower() {
return new PyUnicode(str_lower());
}
/** Exposed {@code upper()}: wraps the result of the inherited str_upper() in a new PyUnicode. */
@ExposedMethod(doc = BuiltinDocs.unicode_upper_doc)
final PyObject unicode_upper() {
return new PyUnicode(str_upper());
}
/**
 * Exposed {@code title()}: title-cases each code point that does not follow
 * a cased code point and lower-cases each one that does. Basic-plane strings
 * use the inherited str_title().
 */
@ExposedMethod(doc = BuiltinDocs.unicode_title_doc)
final PyObject unicode_title() {
    if (isBasicPlane()) {
        return new PyUnicode(str_title());
    }
    int length = string.length();
    StringBuilder buffer = new StringBuilder(length);
    boolean previousIsCased = false;
    // Scan the UTF-16 string directly by code point; this avoids pulling
    // untyped elements out of the subsequence iterator.
    int k = 0;
    while (k < length) {
        int codePoint = string.codePointAt(k);
        k += Character.charCount(codePoint);
        buffer.appendCodePoint(previousIsCased
                ? Character.toLowerCase(codePoint)
                : Character.toTitleCase(codePoint));
        previousIsCased = Character.isLowerCase(codePoint)
                || Character.isUpperCase(codePoint)
                || Character.isTitleCase(codePoint);
    }
    return new PyUnicode(buffer);
}
/**
 * Exposed {@code swapcase()}: upper-case code points become lower-case and
 * vice versa; all others are copied unchanged. Basic-plane strings use the
 * inherited str_swapcase().
 */
@ExposedMethod(doc = BuiltinDocs.unicode_swapcase_doc)
final PyObject unicode_swapcase() {
    if (isBasicPlane()) {
        return new PyUnicode(str_swapcase());
    }
    int length = string.length();
    StringBuilder buffer = new StringBuilder(length);
    // Direct code point scan; no untyped iterator elements involved.
    int k = 0;
    while (k < length) {
        int codePoint = string.codePointAt(k);
        k += Character.charCount(codePoint);
        if (Character.isUpperCase(codePoint)) {
            buffer.appendCodePoint(Character.toLowerCase(codePoint));
        } else if (Character.isLowerCase(codePoint)) {
            buffer.appendCodePoint(Character.toUpperCase(codePoint));
        } else {
            buffer.appendCodePoint(codePoint);
        }
    }
    return new PyUnicode(buffer);
}
/**
 * Iterator adapter that drops leading code points to be stripped and then
 * passes the rest of the wrapped iterator through unchanged.
 *
 * With a non-null {@code sep} the code points contained in {@code sep} are
 * stripped; with a null {@code sep} whitespace is stripped. A look-ahead of
 * -1 marks exhaustion.
 */
private static class StripIterator implements Iterator<Integer> {
    private final Iterator<Integer> iter;
    private int lookahead = -1;

    public StripIterator(PyUnicode sep, Iterator<Integer> iter) {
        this.iter = iter;
        final Set<Integer> sepSet;
        if (sep != null) {
            Set<Integer> chars = Generic.set();
            for (Iterator<Integer> sepIter = sep.newSubsequenceIterator(); sepIter.hasNext();) {
                chars.add(sepIter.next());
            }
            sepSet = chars;
        } else {
            sepSet = null;
        }
        // Consume until the first code point that must be kept.
        while (iter.hasNext()) {
            int codePoint = iter.next();
            boolean strip = (sepSet != null)
                    ? sepSet.contains(codePoint)
                    : Character.isWhitespace(codePoint);
            if (!strip) {
                lookahead = codePoint;
                return;
            }
        }
    }

    public boolean hasNext() {
        return lookahead != -1;
    }

    public Integer next() {
        int old = lookahead;
        if (iter.hasNext()) {
            lookahead = iter.next();
        } else {
            lookahead = -1;
        }
        return old;
    }

    public void remove() {
        throw new UnsupportedOperationException();
    }
}
// compliance requires that we need to support a bit of inconsistency
// compared to other coercion used
/**
 * Coerces the argument of strip/lstrip/rstrip. null and None map to null, a
 * PyUnicode is returned as is, a PyString is decoded into a new PyUnicode,
 * and any other type raises a Python TypeError.
 */
private PyUnicode coerceStripSepToUnicode(PyObject o) {
    if (o == null || o == Py.None) {
        return null;
    }
    if (o instanceof PyUnicode) {
        return (PyUnicode) o;
    }
    if (o instanceof PyString) {
        PyString bytes = (PyString) o;
        return new PyUnicode(bytes.decode().toString());
    }
    throw Py.TypeError("strip arg must be None, unicode or str");
}
/**
 * Exposed {@code strip([chars])}. When both this string and the separator
 * are basic-plane (or no separator is given) the inherited str_strip is
 * used. Otherwise the code points are stripped from the front, reversed,
 * stripped again, and reversed back.
 */
@ExposedMethod(defaults = "null", doc = BuiltinDocs.unicode___getslice___doc)
final PyObject unicode_strip(PyObject sepObj) {
    PyUnicode sep = coerceStripSepToUnicode(sepObj);
    if (isBasicPlane() && (sep == null || sep.isBasicPlane())) {
        String chars = (sep == null) ? null : sep.string;
        return new PyUnicode(str_strip(chars));
    }
    Iterator leftStripped = new StripIterator(sep, newSubsequenceIterator());
    Iterator bothStrippedReversed = new StripIterator(sep, new ReversedIterator(leftStripped));
    return new PyUnicode(new ReversedIterator(bothStrippedReversed));
}
/**
 * Exposed {@code lstrip([chars])}. Uses the inherited str_lstrip when both
 * this string and the separator are basic-plane (or no separator is given);
 * otherwise strips leading code points with a StripIterator.
 */
@ExposedMethod(defaults = "null", doc = BuiltinDocs.unicode___getslice___doc)
final PyObject unicode_lstrip(PyObject sepObj) {
    PyUnicode sep = coerceStripSepToUnicode(sepObj);
    if (isBasicPlane() && (sep == null || sep.isBasicPlane())) {
        String chars = (sep == null) ? null : sep.string;
        return new PyUnicode(str_lstrip(chars));
    }
    Iterator leftStripped = new StripIterator(sep, newSubsequenceIterator());
    return new PyUnicode(leftStripped);
}
/**
 * Exposed {@code rstrip([chars])}. Uses the inherited str_rstrip when both
 * this string and the separator are basic-plane (or no separator is given);
 * otherwise reverses the code points, strips from the front, and reverses
 * the result back.
 */
@ExposedMethod(defaults = "null", doc = BuiltinDocs.unicode___getslice___doc)
final PyObject unicode_rstrip(PyObject sepObj) {
    PyUnicode sep = coerceStripSepToUnicode(sepObj);
    if (isBasicPlane() && (sep == null || sep.isBasicPlane())) {
        String chars = (sep == null) ? null : sep.string;
        return new PyUnicode(str_rstrip(chars));
    }
    Iterator reversed = new ReversedIterator(newSubsequenceIterator());
    Iterator strippedReversed = new StripIterator(sep, reversed);
    return new PyUnicode(new ReversedIterator(strippedReversed));
}
/** Python {@code partition(sep)}; delegates to unicode_partition. */
@Override
public PyTuple partition(PyObject sep) {
return unicode_partition(sep);
}
/** Exposed {@code partition(sep)}; delegates to unicodePartition. */
@ExposedMethod(doc = BuiltinDocs.unicode_partition_doc)
final PyTuple unicode_partition(PyObject sep) {
return unicodePartition(sep);
}
/**
 * Shared state for the code-point-aware split iterators. Subclasses supply
 * {@code next()}, which returns the next piece of the enclosing string.
 *
 * {@code maxsplit} of -1 means no limit. {@code lookahead} holds code points
 * already read from {@code iter} that belong to the following piece.
 */
private abstract class SplitIterator implements Iterator {
    protected final int maxsplit;
    // Typed so subclasses can use the elements as int code points.
    protected final Iterator<Integer> iter = newSubsequenceIterator();
    protected final LinkedList<Integer> lookahead = new LinkedList<Integer>();
    protected int numSplits = 0;
    protected boolean completeSeparator = false;

    SplitIterator(int maxsplit) {
        this.maxsplit = maxsplit;
    }

    public boolean hasNext() {
        return !lookahead.isEmpty() ||
                (iter.hasNext() && (maxsplit == -1 || numSplits <= maxsplit));
    }

    // Moves any buffered look-ahead code points to the start of the next piece.
    protected void addLookahead(StringBuilder buffer) {
        for (int codepoint : lookahead) {
            buffer.appendCodePoint(codepoint);
        }
        lookahead.clear();
    }

    public void remove() {
        throw new UnsupportedOperationException();
    }

    // True once iteration is finished and the last thing consumed was a full separator.
    public boolean getEndsWithSeparator() {
        return completeSeparator && !hasNext();
    }
}
/**
 * SplitIterator that splits on runs of whitespace (the separator-less form
 * of split). Each call to next() returns one piece as a PyUnicode.
 */
private class WhitespaceSplitIterator extends SplitIterator {
WhitespaceSplitIterator(int maxsplit) {
super(maxsplit);
}
public PyUnicode next() {
StringBuilder buffer = new StringBuilder();
// Start the piece with whatever was read ahead during the previous call.
addLookahead(buffer);
// Split budget used up: the rest of the input forms the final piece.
if (numSplits == maxsplit) {
while (iter.hasNext()) {
buffer.appendCodePoint(iter.next());
}
return new PyUnicode(buffer);
}
boolean inSeparator = false;
// Whitespace seen before any other code point on the first call is skipped
// rather than treated as a separator.
boolean atBeginning = numSplits == 0;
while (iter.hasNext()) {
int codepoint = iter.next();
if (Character.isWhitespace(codepoint)) {
completeSeparator = true;
if (!atBeginning) {
inSeparator = true;
}
} else if (!inSeparator) {
completeSeparator = false;
buffer.appendCodePoint(codepoint);
} else {
// First non-whitespace code point after a separator: it belongs to the
// next piece, so stash it and finish this one.
completeSeparator = false;
lookahead.add(codepoint);
break;
}
atBeginning = false;
}
numSplits++;
return new PyUnicode(buffer);
}
}
private static class PeekIterator implements Iterator {
private T lookahead = null;
private final Iterator iter;
public PeekIterator(Iterator iter) {
this.iter = iter;
next();
}
public T peek() {
return lookahead;
}
public boolean hasNext() {
return lookahead != null;
}
public T next() {
T peeked = lookahead;
lookahead = iter.hasNext() ? iter.next() : null;
return peeked;
}
public void remove() {
throw new UnsupportedOperationException();
}
}
/**
 * Iterator that yields the elements of another iterator in reverse order.
 * The source iterator is drained into a list when this object is built.
 *
 * @param <T> element type
 */
private static class ReversedIterator<T> implements Iterator<T> {
    private final List<T> reversed = Generic.list();
    private final Iterator<T> iter;

    ReversedIterator(Iterator<T> iter) {
        while (iter.hasNext()) {
            reversed.add(iter.next());
        }
        Collections.reverse(reversed);
        this.iter = reversed.iterator();
    }

    public boolean hasNext() {
        return iter.hasNext();
    }

    public T next() {
        return iter.next();
    }

    public void remove() {
        throw new UnsupportedOperationException();
    }
}
/**
 * Iterator over the lines of the enclosing string, used by splitlines() for
 * non-basic-plane strings. A line ends at "\r\n", "\n", "\r" or any code
 * point whose type is Character.LINE_SEPARATOR; the terminator is included
 * in the returned line only when {@code keepends} is true.
 */
private class LineSplitIterator implements Iterator {
private final PeekIterator iter = new PeekIterator(newSubsequenceIterator());
private final boolean keepends;
LineSplitIterator(boolean keepends) {
this.keepends = keepends;
}
public boolean hasNext() {
return iter.hasNext();
}
public Object next() {
StringBuilder buffer = new StringBuilder();
while (iter.hasNext()) {
int codepoint = iter.next();
// "\r\n" counts as a single terminator: peek to see whether "\n" follows.
if (codepoint == '\r' && iter.peek() != null && iter.peek() == '\n') {
if (keepends) {
buffer.appendCodePoint(codepoint);
buffer.appendCodePoint(iter.next());
} else {
iter.next();
}
break;
} else if (codepoint == '\n' || codepoint == '\r' ||
Character.getType(codepoint) == Character.LINE_SEPARATOR) {
if (keepends) {
buffer.appendCodePoint(codepoint);
}
break;
} else {
buffer.appendCodePoint(codepoint);
}
}
return new PyUnicode(buffer);
}
public void remove() {
throw new UnsupportedOperationException();
}
}
/**
 * SplitIterator that splits on an explicit, non-empty separator. Each call
 * to next() returns the piece up to the next occurrence of {@code sep}.
 */
private class SepSplitIterator extends SplitIterator {
private final PyUnicode sep;
SepSplitIterator(PyUnicode sep, int maxsplit) {
super(maxsplit);
this.sep = sep;
}
public PyUnicode next() {
StringBuilder buffer = new StringBuilder();
addLookahead(buffer);
// Split budget used up: the rest of the input forms the final piece.
if (numSplits == maxsplit) {
while (iter.hasNext()) {
buffer.appendCodePoint(iter.next());
}
return new PyUnicode(buffer);
}
boolean inSeparator = true;
while (iter.hasNext()) {
// TODO: should cache the first codepoint
inSeparator = true;
// Try to match the whole separator starting at the current input position.
// NOTE(review): iter.next() is called here without checking iter.hasNext(),
// so input that ends part-way through a separator match looks like it reads
// past the end - confirm. A mismatch also does not rewind the code points
// already matched, so overlapping prefixes may be missed - confirm.
for (Iterator sepIter = sep.newSubsequenceIterator();
sepIter.hasNext();) {
int codepoint = iter.next();
if (codepoint != sepIter.next()) {
// Mismatch: the partially matched code points are ordinary content.
addLookahead(buffer);
buffer.appendCodePoint(codepoint);
inSeparator = false;
break;
} else {
lookahead.add(codepoint);
}
}
if (inSeparator) {
// Full separator matched: discard it and end this piece.
lookahead.clear();
break;
}
}
numSplits++;
completeSeparator = inSeparator;
return new PyUnicode(buffer);
}
}
/**
 * Chooses the split iterator for a separator: whitespace splitting when
 * {@code sep} is null, separator splitting otherwise.
 *
 * Raises a Python ValueError ("empty separator") when {@code sep} has no
 * code points.
 */
private SplitIterator newSplitIterator(PyUnicode sep, int maxsplit) {
    if (sep == null) {
        return new WhitespaceSplitIterator(maxsplit);
    }
    if (sep.getCodePointCount() == 0) {
        throw Py.ValueError("empty separator");
    }
    return new SepSplitIterator(sep, maxsplit);
}
/** Python {@code rpartition(sep)}; delegates to unicode_rpartition. */
@Override
public PyTuple rpartition(PyObject sep) {
return unicode_rpartition(sep);
}
/** Exposed {@code rpartition(sep)}; delegates to unicodeRpartition. */
@ExposedMethod(doc = BuiltinDocs.unicode_rpartition_doc)
final PyTuple unicode_rpartition(PyObject sep) {
return unicodeRpartition(sep);
}
/**
 * Exposed {@code split([sep[, maxsplit]])}: coerces the separator to unicode
 * and delegates to the inherited str_split, passing null when no separator
 * was given.
 */
@ExposedMethod(defaults = {"null", "-1"}, doc = BuiltinDocs.unicode___getslice___doc)
final PyList unicode_split(PyObject sepObj, int maxsplit) {
    PyUnicode sep = coerceToUnicode(sepObj);
    String sepString = (sep == null) ? null : sep.string;
    return str_split(sepString, maxsplit);
}
/**
 * Exposed {@code rsplit([sep[, maxsplit]])}: coerces the separator to
 * unicode and delegates to the inherited str_rsplit, passing null when no
 * separator was given.
 */
@ExposedMethod(defaults = {"null", "-1"}, doc = BuiltinDocs.unicode___getslice___doc)
final PyList unicode_rsplit(PyObject sepObj, int maxsplit) {
    PyUnicode sep = coerceToUnicode(sepObj);
    String sepString = (sep == null) ? null : sep.string;
    return str_rsplit(sepString, maxsplit);
}
/**
 * Exposed {@code splitlines([keepends])}. Basic-plane strings use the
 * inherited str_splitlines; otherwise the lines come from a LineSplitIterator.
 */
@ExposedMethod(defaults = "false", doc = BuiltinDocs.unicode___getslice___doc)
final PyList unicode_splitlines(boolean keepends) {
if (isBasicPlane()) {
return str_splitlines(keepends);
}
return new PyList(new LineSplitIterator(keepends));
}
/**
 * Builds a PyUnicode from the UTF-16 range {@code [begin, end)} of the
 * backing string. The indices are UTF-16 offsets, hence the assertion that
 * the string is basic-plane.
 */
@Override
protected PyString fromSubstring(int begin, int end) {
assert(isBasicPlane()); // can only be used on a codepath from str_ equivalents
return new PyUnicode(string.substring(begin, end));
}
/** Exposed {@code index(sub[, start[, end]])}; delegates to the inherited str_index. */
@ExposedMethod(defaults = {"0", "null"}, doc = BuiltinDocs.unicode___getslice___doc)
final int unicode_index(String sub, int start, PyObject end) {
return str_index(sub, start, end);
}
/** Exposed {@code rindex(sub[, start[, end]])}; delegates to the inherited str_rindex. */
@ExposedMethod(defaults = {"0", "null"}, doc = BuiltinDocs.unicode___getslice___doc)
final int unicode_rindex(String sub, int start, PyObject end) {
return str_rindex(sub, start, end);
}
/**
 * Exposed {@code count(sub[, start[, end]])}: counts non-overlapping
 * occurrences of {@code sub} in the code point range {@code [start, end)}.
 *
 * Basic-plane strings use the inherited str_count. Otherwise both strings
 * are decoded to code point arrays and searched directly. The previous
 * iterator-based scan did not resume correctly after a partial match (for
 * example it missed "ab" in "aab") and never terminated for an empty
 * {@code sub}.
 *
 * Raises a Python TypeError when {@code subObj} is None.
 */
@ExposedMethod(defaults = {"0", "null"}, doc = BuiltinDocs.unicode___getslice___doc)
final int unicode_count(PyObject subObj, int start, PyObject end) {
    final PyUnicode sub = coerceToUnicode(subObj);
    if (sub == null) {
        throw Py.TypeError("coercing to Unicode: need string or buffer, NoneType found");
    }
    if (isBasicPlane()) {
        return str_count(sub.string, start, end);
    }
    // presumably {start, end} after Python slice normalisation - confirm in PyString
    int[] indices = translateIndices(start, end);
    int[] haystack = toCodePoints();
    int[] needle = sub.toCodePoints();
    // Clamp to the code point array in case the bounds refer to UTF-16 units.
    int from = Math.max(indices[0], 0);
    int to = Math.min(indices[1], haystack.length);
    if (needle.length == 0) {
        // The empty string occurs once at every position in the range, ends included.
        return from <= to ? to - from + 1 : 0;
    }
    int count = 0;
    int pos = from;
    while (pos + needle.length <= to) {
        int matched = 0;
        while (matched < needle.length && haystack[pos + matched] == needle[matched]) {
            matched++;
        }
        if (matched == needle.length) {
            count++;
            pos += needle.length; // occurrences do not overlap
        } else {
            pos++;
        }
    }
    return count;
}
/** Exposed {@code find(sub[, start[, end]])}; delegates to the inherited str_find. */
@ExposedMethod(defaults = {"0", "null"}, doc = BuiltinDocs.unicode___getslice___doc)
final int unicode_find(String sub, int start, PyObject end) {
return str_find(sub, start, end);
}
/** Exposed {@code rfind(sub[, start[, end]])}; delegates to the inherited str_rfind. */
@ExposedMethod(defaults = {"0", "null"}, doc = BuiltinDocs.unicode___getslice___doc)
final int unicode_rfind(String sub, int start, PyObject end) {
return str_rfind(sub, start, end);
}
private static String padding(int n, int pad) {
StringBuilder buffer = new StringBuilder(n);
for (int i=0; i 0 && width % 2 > 0) {
half += 1;
}
int pad = parse_fillchar("center", padding);
return new PyUnicode(padding(half, pad) + string + padding(n - half, pad));
}
/**
 * Exposed {@code zfill(width)}: left-pads with '0' up to {@code width} code
 * points, keeping a leading '+' or '-' in front of the zeros. Strings already
 * at least {@code width} long are returned as a copy; basic-plane strings use
 * the inherited str_zfill.
 */
@ExposedMethod(doc = BuiltinDocs.unicode_zfill_doc)
final PyObject unicode_zfill(int width) {
    int n = getCodePointCount();
    if (n >= width) {
        return new PyUnicode(string);
    }
    if (isBasicPlane()) {
        return new PyUnicode(str_zfill(width));
    }
    StringBuilder buffer = new StringBuilder(width);
    // A sign is always a single UTF-16 unit, so only the first char needs inspecting.
    int signLength = 0;
    if (string.length() > 0) {
        char first = string.charAt(0);
        if (first == '+' || first == '-') {
            buffer.append(first);
            signLength = 1;
        }
    }
    for (int nzeros = width - n; nzeros > 0; nzeros--) {
        buffer.append('0');
    }
    // The remainder is copied verbatim.
    buffer.append(string, signLength, string.length());
    return new PyUnicode(buffer);
}
/** Exposed {@code expandtabs([tabsize])}: wraps the result of the inherited str_expandtabs; {@code tabsize} defaults to 8. */
@ExposedMethod(defaults = "8", doc = BuiltinDocs.unicode___getslice___doc)
final PyObject unicode_expandtabs(int tabsize) {
return new PyUnicode(str_expandtabs(tabsize));
}
/**
 * Exposed {@code capitalize()}: upper-cases the first code point and
 * lower-cases the rest. The empty string is returned unchanged (the same
 * object); basic-plane strings use the inherited str_capitalize().
 */
@ExposedMethod(doc = BuiltinDocs.unicode_capitalize_doc)
final PyObject unicode_capitalize() {
    int length = string.length();
    if (length == 0) {
        return this;
    }
    if (isBasicPlane()) {
        return new PyUnicode(str_capitalize());
    }
    StringBuilder buffer = new StringBuilder(length);
    int first = string.codePointAt(0);
    buffer.appendCodePoint(Character.toUpperCase(first));
    int k = Character.charCount(first);
    while (k < length) {
        int codePoint = string.codePointAt(k);
        k += Character.charCount(codePoint);
        buffer.appendCodePoint(Character.toLowerCase(codePoint));
    }
    return new PyUnicode(buffer);
}
/**
 * Exposed {@code replace(old, new[, count])}.
 *
 * When this string and both pieces are basic-plane the inherited
 * {@code replace} is used. Otherwise:
 * <ul>
 * <li>for an empty {@code old}, {@code new} is inserted before each code
 *     point and once at the end, limited to {@code maxsplit} insertions (a
 *     negative value means unlimited). The earlier loop inserted nothing for
 *     {@code maxsplit == 1} and nothing for an empty receiver;</li>
 * <li>for a non-empty {@code old}, the string is split on {@code old} with a
 *     SplitIterator and the pieces are re-joined with {@code new}.</li>
 * </ul>
 */
@ExposedMethod(defaults = "-1", doc = BuiltinDocs.unicode___getslice___doc)
final PyObject unicode_replace(PyObject oldPieceObj, PyObject newPieceObj, int maxsplit) {
    PyUnicode newPiece = coerceToUnicode(newPieceObj);
    PyUnicode oldPiece = coerceToUnicode(oldPieceObj);
    if (isBasicPlane() && newPiece.isBasicPlane() && oldPiece.isBasicPlane()) {
        return replace(oldPiece, newPiece, maxsplit);
    }

    StringBuilder buffer = new StringBuilder();
    if (oldPiece.getCodePointCount() == 0) {
        boolean unlimited = maxsplit < 0;
        int remaining = maxsplit;
        int length = string.length();
        int k = 0;
        // Insert the replacement before each code point while budget remains.
        while (k < length && (unlimited || remaining > 0)) {
            buffer.append(newPiece.string);
            if (!unlimited) {
                remaining--;
            }
            int codePoint = string.codePointAt(k);
            buffer.appendCodePoint(codePoint);
            k += Character.charCount(codePoint);
        }
        if (k == length && (unlimited || remaining > 0)) {
            // Budget left after the last code point: one more insertion at the end.
            buffer.append(newPiece.string);
        } else {
            buffer.append(string, k, length);
        }
        return new PyUnicode(buffer);
    }

    SplitIterator iter = newSplitIterator(oldPiece, maxsplit);
    int numSplits = 0;
    while (iter.hasNext()) {
        buffer.append(((PyUnicode) iter.next()).string);
        if (iter.hasNext()) {
            buffer.append(newPiece.string);
        }
        numSplits++;
    }
    // A trailing separator produces no further piece, so its replacement is added here.
    if (iter.getEndsWithSeparator() && (maxsplit == -1 || numSplits <= maxsplit)) {
        buffer.append(newPiece.string);
    }
    return new PyUnicode(buffer);
}
// end utf-16 aware
/** Python {@code join(seq)}; delegates to unicode_join. */
@Override
public PyString join(PyObject seq) {
return unicode_join(seq);
}
/** Exposed {@code join(seq)}; delegates to unicodeJoin. */
@ExposedMethod(doc = BuiltinDocs.unicode_join_doc)
final PyUnicode unicode_join(PyObject seq) {
return unicodeJoin(seq);
}
/** Exposed {@code startswith(prefix[, start[, end]])}; delegates to the inherited str_startswith. */
@ExposedMethod(defaults = {"0", "null"}, doc = BuiltinDocs.unicode___getslice___doc)
final boolean unicode_startswith(PyObject prefix, int start, PyObject end) {
return str_startswith(prefix, start, end);
}
/** Exposed {@code endswith(suffix[, start[, end]])}; delegates to the inherited str_endswith. */
@ExposedMethod(defaults = {"0", "null"}, doc = BuiltinDocs.unicode___getslice___doc)
final boolean unicode_endswith(PyObject suffix, int start, PyObject end) {
return str_endswith(suffix, start, end);
}
/**
 * Exposed {@code translate(table)}: runs the backing string through
 * _codecs.translate_charmap with the "ignore" error mode and wraps element 0
 * of the result in a new PyUnicode.
 */
@ExposedMethod(doc = BuiltinDocs.unicode_translate_doc)
final PyObject unicode_translate(PyObject table) {
String trans = _codecs.translate_charmap(string, "ignore", table, true).__getitem__(0).toString();
return new PyUnicode(trans);
}
// these tests need to be UTF-16 aware because they are character-by-character tests,
// so we can only use equivalent str_XXX tests if we are in basic plane
/**
 * Exposed {@code islower()}: true if there is at least one lower-case code
 * point and no upper-case or title-case code point.
 */
@ExposedMethod(doc = BuiltinDocs.unicode_islower_doc)
final boolean unicode_islower() {
    if (isBasicPlane()) {
        return str_islower();
    }
    boolean cased = false;
    int length = string.length();
    int k = 0;
    while (k < length) {
        int codepoint = string.codePointAt(k);
        k += Character.charCount(codepoint);
        if (Character.isUpperCase(codepoint) || Character.isTitleCase(codepoint)) {
            return false;
        }
        if (!cased && Character.isLowerCase(codepoint)) {
            cased = true;
        }
    }
    return cased;
}
/**
 * Exposed {@code isupper()}: true if there is at least one upper-case code
 * point and no lower-case or title-case code point. Basic-plane strings use
 * the inherited str_isupper().
 */
@ExposedMethod(doc = BuiltinDocs.unicode_isupper_doc)
final boolean unicode_isupper() {
    if (isBasicPlane()) {
        return str_isupper();
    }
    boolean cased = false;
    int length = string.length();
    int k = 0;
    while (k < length) {
        int codepoint = string.codePointAt(k);
        k += Character.charCount(codepoint);
        if (Character.isLowerCase(codepoint) || Character.isTitleCase(codepoint)) {
            return false;
        }
        if (!cased && Character.isUpperCase(codepoint)) {
            cased = true;
        }
    }
    return cased;
}
/**
 * Exposed {@code isalpha()}: true if the string is non-empty and every code
 * point is a letter. Basic-plane strings use the inherited str_isalpha().
 */
@ExposedMethod(doc = BuiltinDocs.unicode_isalpha_doc)
final boolean unicode_isalpha() {
    if (isBasicPlane()) {
        return str_isalpha();
    }
    if (getCodePointCount() == 0) {
        return false;
    }
    int length = string.length();
    int k = 0;
    while (k < length) {
        int codePoint = string.codePointAt(k);
        k += Character.charCount(codePoint);
        if (!Character.isLetter(codePoint)) {
            return false;
        }
    }
    return true;
}
/**
 * Exposed {@code isalnum()}: true if the string is non-empty and every code
 * point is a letter, a digit, or of type LETTER_NUMBER. Basic-plane strings
 * use the inherited str_isalnum().
 */
@ExposedMethod(doc = BuiltinDocs.unicode_isalnum_doc)
final boolean unicode_isalnum() {
    if (isBasicPlane()) {
        return str_isalnum();
    }
    if (getCodePointCount() == 0) {
        return false;
    }
    int length = string.length();
    int k = 0;
    while (k < length) {
        int codePoint = string.codePointAt(k);
        k += Character.charCount(codePoint);
        boolean alnum = Character.isLetterOrDigit(codePoint)
                || Character.getType(codePoint) == Character.LETTER_NUMBER;
        if (!alnum) {
            return false;
        }
    }
    return true;
}
/**
 * Exposed {@code isdecimal()}: true if the string is non-empty and every
 * code point has type DECIMAL_DIGIT_NUMBER. Basic-plane strings use the
 * inherited str_isdecimal().
 */
@ExposedMethod(doc = BuiltinDocs.unicode_isdecimal_doc)
final boolean unicode_isdecimal() {
    if (isBasicPlane()) {
        return str_isdecimal();
    }
    if (getCodePointCount() == 0) {
        return false;
    }
    int length = string.length();
    int k = 0;
    while (k < length) {
        int codePoint = string.codePointAt(k);
        k += Character.charCount(codePoint);
        if (Character.getType(codePoint) != Character.DECIMAL_DIGIT_NUMBER) {
            return false;
        }
    }
    return true;
}
/**
 * Exposed {@code isdigit()}: true if the string is non-empty and every code
 * point satisfies {@link Character#isDigit(int)}. Basic-plane strings use the
 * inherited str_isdigit().
 */
@ExposedMethod(doc = BuiltinDocs.unicode_isdigit_doc)
final boolean unicode_isdigit() {
    if (isBasicPlane()) {
        return str_isdigit();
    }
    if (getCodePointCount() == 0) {
        return false;
    }
    int length = string.length();
    int k = 0;
    while (k < length) {
        int codePoint = string.codePointAt(k);
        k += Character.charCount(codePoint);
        if (!Character.isDigit(codePoint)) {
            return false;
        }
    }
    return true;
}
/**
 * Exposed {@code isnumeric()}: true if the string is non-empty and every
 * code point has type DECIMAL_DIGIT_NUMBER, LETTER_NUMBER or OTHER_NUMBER.
 * Basic-plane strings use the inherited str_isnumeric().
 */
@ExposedMethod(doc = BuiltinDocs.unicode_isnumeric_doc)
final boolean unicode_isnumeric() {
    if (isBasicPlane()) {
        return str_isnumeric();
    }
    if (getCodePointCount() == 0) {
        return false;
    }
    int length = string.length();
    int k = 0;
    while (k < length) {
        int codePoint = string.codePointAt(k);
        k += Character.charCount(codePoint);
        int type = Character.getType(codePoint);
        boolean numeric = type == Character.DECIMAL_DIGIT_NUMBER
                || type == Character.LETTER_NUMBER
                || type == Character.OTHER_NUMBER;
        if (!numeric) {
            return false;
        }
    }
    return true;
}
/**
 * Exposed {@code istitle()}: true if the string contains at least one cased
 * code point, upper/title-case code points only follow uncased ones, and
 * lower-case code points only follow cased ones. Basic-plane strings use the
 * inherited str_istitle().
 */
@ExposedMethod(doc = BuiltinDocs.unicode_istitle_doc)
final boolean unicode_istitle() {
    if (isBasicPlane()) {
        return str_istitle();
    }
    if (getCodePointCount() == 0) {
        return false;
    }
    boolean cased = false;
    boolean previousIsCased = false;
    int length = string.length();
    int k = 0;
    while (k < length) {
        int codePoint = string.codePointAt(k);
        k += Character.charCount(codePoint);
        if (Character.isUpperCase(codePoint) || Character.isTitleCase(codePoint)) {
            // A capital may only start a word.
            if (previousIsCased) {
                return false;
            }
            previousIsCased = true;
            cased = true;
        } else if (Character.isLowerCase(codePoint)) {
            // A lower-case letter may only continue a word.
            if (!previousIsCased) {
                return false;
            }
            previousIsCased = true;
            cased = true;
        } else {
            previousIsCased = false;
        }
    }
    return cased;
}
/**
 * Exposed {@code isspace()}: true if the string is non-empty and every code
 * point satisfies {@link Character#isWhitespace(int)}. Basic-plane strings
 * use the inherited str_isspace().
 */
@ExposedMethod(doc = BuiltinDocs.unicode_isspace_doc)
final boolean unicode_isspace() {
    if (isBasicPlane()) {
        return str_isspace();
    }
    if (getCodePointCount() == 0) {
        return false;
    }
    int length = string.length();
    int k = 0;
    while (k < length) {
        int codePoint = string.codePointAt(k);
        k += Character.charCount(codePoint);
        if (!Character.isWhitespace(codePoint)) {
            return false;
        }
    }
    return true;
}
// end utf-16 aware
/** Deprecated exposed method: issues a DeprecationWarning and always returns true. */
@ExposedMethod(doc = "isunicode is deprecated.")
final boolean unicode_isunicode() {
Py.warning(Py.DeprecationWarning, "isunicode is deprecated.");
return true;
}
/** Exposed {@code encode([encoding[, errors]])}; delegates to the inherited str_encode. Both arguments default to null. */
@ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode___getslice___doc)
final String unicode_encode(String encoding, String errors) {
return str_encode(encoding, errors);
}
/** Exposed {@code decode([encoding[, errors]])}; delegates to the inherited str_decode. Both arguments default to null. */
@ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode___getslice___doc)
final PyObject unicode_decode(String encoding, String errors) {
return str_decode(encoding, errors);
}
/** Exposed {@code __getnewargs__}: a one-element tuple holding a plain PyUnicode copy of this string. */
@ExposedMethod(doc = BuiltinDocs.unicode___getnewargs___doc)
final PyTuple unicode___getnewargs__() {
return new PyTuple(new PyUnicode(this.string));
}
/** Iterable support: iterates over the code points of this string, in order. */
public Iterator iterator() {
return newSubsequenceIterator();
}
/** Converts to complex by handing the output of encodeDecimal() to PyString's __complex__. */
@Override
public PyComplex __complex__() {
return new PyString(encodeDecimal()).__complex__();
}
/** Parses an int in the given base by handing the output of encodeDecimal() to PyString's atoi. */
@Override
public int atoi(int base) {
return new PyString(encodeDecimal()).atoi(base);
}
/** Parses a long in the given base by handing the output of encodeDecimal() to PyString's atol. */
@Override
public PyLong atol(int base) {
return new PyString(encodeDecimal()).atol(base);
}
/** Parses a float by handing the output of encodeDecimal() to PyString's atof. */
@Override
public double atof() {
return new PyString(encodeDecimal()).atof();
}
/**
 * Encode unicode into a valid decimal String. Whitespace code points become
 * a space, code points with a decimal digit value become that digit, other
 * code points in the range 1..255 are copied, and everything else is
 * reported through {@code codecs.encoding_error} in "strict" mode.
 *
 * @return a valid decimal as an encoded String
 */
private String encodeDecimal() {
    if (isBasicPlane()) {
        return encodeDecimalBasic();
    }
    StringBuilder sb = new StringBuilder();
    int length = string.length();
    int k = 0;
    // i counts code points; it is the position reported on error.
    for (int i = 0; k < length; i++) {
        int codePoint = string.codePointAt(k);
        k += Character.charCount(codePoint);
        if (Character.isWhitespace(codePoint)) {
            sb.append(' ');
            continue;
        }
        int digit = Character.digit(codePoint, 10);
        if (digit >= 0) {
            sb.append(digit);
            continue;
        }
        if (0 < codePoint && codePoint < 256) {
            sb.appendCodePoint(codePoint);
            continue;
        }
        // All other characters are considered unencodable
        codecs.encoding_error("strict", "decimal", string, i, i + 1,
                "invalid decimal Unicode string");
    }
    return sb.toString();
}
/**
 * Basic-plane variant of the decimal encoder: works one UTF-16 char at a
 * time. Whitespace becomes a space, chars with a decimal digit value become
 * that digit, other chars in the range 1..255 are copied, and anything else
 * is reported through {@code codecs.encoding_error} in "strict" mode.
 *
 * @return a valid decimal as an encoded String
 */
private String encodeDecimalBasic() {
    StringBuilder encoded = new StringBuilder();
    for (int pos = 0; pos < string.length(); pos++) {
        char ch = string.charAt(pos);
        if (Character.isWhitespace(ch)) {
            encoded.append(' ');
        } else {
            int value = Character.digit(ch, 10);
            if (value >= 0) {
                encoded.append(value);
            } else if (0 < ch && ch < 256) {
                encoded.append(ch);
            } else {
                // All other characters are considered unencodable
                codecs.encoding_error("strict", "decimal", string, pos, pos + 1,
                        "invalid decimal Unicode string");
            }
        }
    }
    return encoded.toString();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy