org.jruby.util.StringSupport Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jruby-complete Show documentation
There is a newer version: 9.4.12.0
/***** BEGIN LICENSE BLOCK *****
 * Version: EPL 2.0/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Eclipse Public
 * License Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.eclipse.org/legal/epl-v20.html
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the EPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the EPL, the GPL or the LGPL.
 ***** END LICENSE BLOCK *****/

package org.jruby.util;

import static org.jcodings.Encoding.CHAR_INVALID;
import static org.jruby.RubyEnumerator.enumeratorize;

import org.jcodings.Config;
import org.jcodings.Encoding;
import org.jcodings.IntHolder;
import org.jcodings.ascii.AsciiTables;
import org.jcodings.constants.CharacterType;
import org.jcodings.exception.EncodingError;
import org.jcodings.exception.EncodingException;
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jcodings.util.IntHash;
import org.joni.Matcher;
import org.jruby.ObjectFlags;
import org.jruby.Ruby;
import org.jruby.RubyArray;
import org.jruby.RubyEncoding;
import org.jruby.RubyIO;
import org.jruby.RubyString;
import org.jruby.RubySymbol;
import org.jruby.ast.util.ArgsUtil;
import org.jruby.runtime.Block;
import org.jruby.runtime.Helpers;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.util.collections.IntHashMap;
import org.jruby.util.io.EncodingUtils;
import sun.misc.Unsafe;

import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public final class StringSupport {
    // Code range flag bits as defined by ObjectFlags.
    public static final int CR_7BIT_F    = ObjectFlags.CR_7BIT_F;
    public static final int CR_VALID_F   = ObjectFlags.CR_VALID_F;
    // No code range bit set; codeRangeAsString reports this value as "unknown".
    public static final int CR_UNKNOWN   = 0;

    // We hardcode these so they can be used in a switch below. The asserts verify they match the ObjectFlags values
    // (CR_7BIT_F / CR_VALID_F); they only run when assertions are enabled.
    public static final int CR_7BIT      = 16;
    public static final int CR_VALID     = 32;
    static {
        assert CR_7BIT == CR_7BIT_F : "CR_7BIT = " + CR_7BIT + " but should be " + CR_7BIT_F;
        assert CR_VALID == CR_VALID_F : "CR_VALID = " + CR_VALID + " but should be " + CR_VALID_F;
    }

    // Both bits set means "broken"; the same bit pattern doubles as the mask covering all code range bits.
    public static final int CR_BROKEN    = CR_7BIT | CR_VALID;
    public static final int CR_MASK      = CR_7BIT | CR_VALID;

    // Offset of the first element of a byte[] as reported by Unsafe, or 0 when no Unsafe instance is available.
    // utf8Length treats a value > 0 as the signal that the Unsafe fast path may be used.
    static final int ARRAY_BYTE_BASE_OFFSET;
    static {
        final Unsafe unsafe = org.jruby.util.unsafe.UnsafeHolder.U;
        ARRAY_BYTE_BASE_OFFSET = unsafe != null ? unsafe.arrayBaseOffset(byte[].class) : 0;
    }

    // NOTE(review): presumably the size of a per-byte translation table (one slot per byte value) - confirm at use sites.
    public static final int TRANS_SIZE = 256;

    // Shared zero-length arrays.
    public static final ByteList[] EMPTY_BYTELIST_ARRAY = new ByteList[0];
    public static final String[] EMPTY_STRING_ARRAY = new String[0];

    /**
     * Split string into sub-parts.
     *
     * <p>Note: we differ from the non-limited {@link String#split(String)} in handling consecutive separator
     * chars at the end of string. While {@code "1;;".split(";")} returns {@code [ "1" ]} this version returns
     * {@code [ "1", "" ]} which is consistent when consecutive separators occur elsewhere.
     *
     * @param str the string
     * @param sep the separator
     * @return the parts of {@code str}; a single-element list holding {@code str} itself when it is empty
     * @see String#split(String)
     */
    public static List<String> split(final String str, final char sep) {
        return split(str, sep, 0);
    }

    /**
     * Split string into (limited) sub-parts.
     *
     * @param str the string
     * @param sep the separator
     * @param lim has same effect as with {@link String#split(String, int)}
     * @return the parts of {@code str}; when {@code lim > 0} at most {@code lim} parts, the last one holding
     *         the unsplit remainder
     */
    public static List<String> split(final String str, final char sep, final int lim) {
        final int len = str.length();
        if ( len == 0 ) return Collections.singletonList(str);

        final ArrayList<String> result = new ArrayList<>(lim <= 0 ? 8 : lim);

        int e; int s = 0; int count = 0;
        while ( (e = str.indexOf(sep, s)) != -1 ) {
            if ( lim == ++count ) { // limited (lim > 0) case: the rest of the string is the final part
                result.add(str.substring(s));
                return result;
            }
            result.add(str.substring(s, e));
            s = e + 1;
        }
        // Trailing part: always added when non-empty; an empty trailing part is only kept in the limited case.
        if ( s < len || ( s == len && lim > 0 ) ) result.add(str.substring(s));

        return result;
    }

    /**
     * {@link String#startsWith(String)} for an arbitrary {@link CharSequence}.
     *
     * @param str the sequence to inspect
     * @param prefix the expected leading characters
     * @return true if the first {@code prefix.length()} chars of {@code str} equal {@code prefix}
     */
    public static boolean startsWith(final CharSequence str, final String prefix) {
        final int prefixLen = prefix.length();
        if ( str.length() < prefixLen ) return false;
        for ( int idx = 0; idx < prefixLen; idx++ ) {
            if ( str.charAt(idx) != prefix.charAt(idx) ) return false;
        }
        return true;
    }

    /**
     * @return true if {@code str} is non-empty and its first char is {@code c}
     */
    public static boolean startsWith(final CharSequence str, final char c) {
        if ( str.length() < 1 ) return false;
        return str.charAt(0) == c;
    }

    /**
     * @return true if {@code str} has at least two chars and they are {@code c1} followed by {@code c2}
     */
    public static boolean startsWith(final CharSequence str, final char c1, final char c2) {
        if ( str.length() < 2 ) return false;
        if ( str.charAt(0) != c1 ) return false;
        return str.charAt(1) == c2;
    }

    /**
     * Backwards search for a char, without any char[] array copying (StringBuilder only has lastIndexOf(String)).
     *
     * @param str the sequence to search
     * @param c the char to look for
     * @param index the position to start at, scanning towards 0
     * @return the highest position {@code <= index} holding {@code c}, or -1 if there is none
     */
    public static int lastIndexOf(final CharSequence str, final char c, int index) {
        for ( int pos = index; pos >= 0; pos-- ) {
            if ( str.charAt(pos) == c ) return pos;
        }
        return -1;
    }

    /**
     * @return true if {@code str} consists of exactly the one char {@code chr}
     */
    public static boolean contentEquals(final CharSequence str, final int chr) {
        if ( str.length() != 1 ) return false;
        return str.charAt(0) == chr;
    }

    /**
     * @return true if {@code str} consists of exactly the two chars {@code chr1}, {@code chr2} (in that order)
     */
    public static boolean contentEquals(final CharSequence str, final int chr1, final int chr2) {
        if ( str.length() != 2 ) return false;
        if ( str.charAt(0) != chr1 ) return false;
        return str.charAt(1) == chr2;
    }

    /**
     * Concatenates two sequences into a new, exactly pre-sized {@link StringBuilder}.
     *
     * @return the builder holding {@code str1} followed by {@code str2}
     */
    public static CharSequence concat(final CharSequence str1, final CharSequence str2) {
        final StringBuilder joined = new StringBuilder(str1.length() + str2.length());
        joined.append(str1);
        joined.append(str2);
        return joined;
    }

    /**
     * Removes every occurrence of a char from a string; equivalent to {@code str.replace(String.valueOf(c), "")}.
     *
     * @param str the source string
     * @param c the char to remove
     * @return {@code str} itself when it does not contain {@code c}, otherwise a new string without any {@code c}
     */
    public static String delete(final String str, final char c) {
        int hit = str.indexOf(c);
        if (hit == -1) return str; // nothing to remove: hand back the same instance

        final int len = str.length();
        final StringBuilder out = new StringBuilder(len - 1); // at least one char is dropped
        int from = 0;
        do {
            out.append(str, from, hit); // segment between the previous occurrence and this one
            from = hit + 1;
        } while ((hit = str.indexOf(c, from)) != -1);
        out.append(str, from, len);     // text after the last occurrence must be kept too
        return out.toString();
    }

    /**
     * Replaces the first (case-sensitive) occurrence of {@code sub} in {@code str} with {@code repl}.
     *
     * @return whatever {@code replaceImpl} returns for a replacement limit of 1
     */
    public static CharSequence replaceFirst(final String str, final String sub, final String repl) {
        final int maxReplacements = 1;
        return replaceImpl(str, sub, repl, maxReplacements, false);
    }

    /**
     * Replaces every (case-sensitive) occurrence of {@code sub} in {@code str} with {@code repl}.
     *
     * @return whatever {@code replaceImpl} returns for an unlimited (-1) replacement count
     */
    public static CharSequence replaceAll(final String str, final String sub, final String repl) {
        final int unlimited = -1;
        return replaceImpl(str, sub, repl, unlimited, false);
    }

    /**
     * Literal (non-regex) substring replacement, borrowed from commons-lang StringUtils.
     *
     * @param str the text to search in
     * @param sub the literal text to look for
     * @param repl the replacement text
     * @param max maximum number of replacements; a value {@code <= 0} replaces every occurrence
     * @param ignoreCase whether to match on lower-cased copies of {@code str} and {@code sub}
     * @return {@code str} itself when either input is empty or nothing matches, otherwise a
     *         {@link StringBuilder} holding the result
     */
    private static CharSequence replaceImpl(final String str, String sub, final String repl, int max, final boolean ignoreCase) {
        if (str.length() == 0 || sub.length() == 0) return str;

        final String haystack;
        if (ignoreCase) {
            haystack = str.toLowerCase();
            sub = sub.toLowerCase();
        } else {
            haystack = str;
        }

        int hit = haystack.indexOf(sub, 0);
        if (hit == -1) return str;

        final int subLen = sub.length();
        // Pre-size the buffer: per-replacement growth times an estimated number of replacements.
        final int growth = Math.max(repl.length() - subLen, 0);
        final int estimate = max < 0 ? 16 : Math.min(max, 64);
        final StringBuilder out = new StringBuilder(str.length() + growth * estimate);

        int from = 0;
        do {
            out.append(str, from, hit).append(repl);
            from = hit + subLen;
            if (--max == 0) break;
            hit = haystack.indexOf(sub, from);
        } while (hit != -1);

        out.append(str, from, str.length());
        return out;
    }

    /**
     * Copies {@code slen} chars of {@code str}, starting at {@code soff}, into {@code dest} at {@code doff}.
     *
     * @param str the source string
     * @param soff first source index to copy
     * @param slen number of chars to copy; nothing is copied when it is {@code <= 0}
     * @param dest the destination array
     * @param doff first destination index to write
     * @return the destination index just past the last char written
     */
    private static int copy(final String str, final int soff, final int slen, final char[] dest, int doff) {
        if (slen <= 0) return doff;
        // The copy must cover [soff, soff + slen); bounding the loop by slen alone drops segments of
        // four or more chars whenever soff > 0.
        str.getChars(soff, soff + slen, dest, doff);
        return doff + slen;
    }

    /**
     * Returns a short human-readable name for a code range value.
     *
     * @param codeRange one of CR_UNKNOWN, CR_7BIT, CR_VALID or CR_BROKEN
     * @return "unknown", "7bit", "valid" or "broken"; "???" for any other value
     */
    public static String codeRangeAsString(int codeRange) {
        final String name;
        switch (codeRange) {
            case CR_UNKNOWN: name = "unknown"; break;
            case CR_7BIT:    name = "7bit";    break;
            case CR_VALID:   name = "valid";   break;
            case CR_BROKEN:  name = "broken";  break;
            default:         name = "???";     break; // Not reached unless something seriously boned
        }
        return name;
    }

    // rb_enc_fast_mbclen
    // Returns enc.length(bytes, p, e) as-is, without any validation of the result.
    public static int encFastMBCLen(byte[] bytes, int p, int e, Encoding enc) {
        final int charLen = enc.length(bytes, p, e);
        return charLen;
    }

    // rb_enc_mbclen
    // Length of the character at p: the encoding's answer when it found a character that fits
    // within [p, end), otherwise the encoding's minimum length capped to the bytes available.
    public static int length(Encoding enc, byte[]bytes, int p, int end) {
        final int available = end - p;
        final int n = enc.length(bytes, p, end);
        if (MBCLEN_CHARFOUND_P(n)) {
            final int found = MBCLEN_CHARFOUND_LEN(n);
            if (found <= available) return found;
        }
        return Math.min(enc.minLength(), available);
    }

    // rb_enc_precise_mbclen
    // Returns enc.length(...) unless the range is empty or the reported length overruns end,
    // in which case a MBCLEN_NEEDMORE value for the number of missing bytes is returned.
    public static int preciseLength(Encoding enc, byte[]bytes, int p, int end) {
        if (p >= end) return MBCLEN_NEEDMORE(1);
        final int remaining = end - p;
        final int n = enc.length(bytes, p, end);
        return n > remaining ? MBCLEN_NEEDMORE(n - remaining) : n;
    }

    // MBCLEN_NEEDMORE_P, ONIGENC_MBCLEN_NEEDMORE_P
    // True for "need more bytes" results, i.e. anything below -1.
    public static boolean MBCLEN_NEEDMORE_P(int r) {
        return -1 > r;
    }

    // MBCLEN_NEEDMORE, ONIGENC_MBCLEN_NEEDMORE
    // Encodes "n more bytes needed" as -1 - n (the bitwise complement of n).
    public static int MBCLEN_NEEDMORE(int n) {
        return ~n;
    }

    // MBCLEN_NEEDMORE_LEN, ONIGENC_MBCLEN_NEEDMORE_LEN
    // Decodes a MBCLEN_NEEDMORE result back into the number of missing bytes (-1 - r).
    public static int MBCLEN_NEEDMORE_LEN(int r) {
        return ~r;
    }

    // MBCLEN_INVALID_P, ONIGENC_MBCLEN_INVALID_P
    // True only for the "invalid" result, which is exactly -1.
    public static boolean MBCLEN_INVALID_P(int r) {
        final boolean invalid = (-1 == r);
        return invalid;
    }

    // MBCLEN_CHARFOUND_LEN, ONIGENC_MBCLEN_CHARFOUND_LEN
    // A "character found" result is the character length itself, so this is the identity.
    public static int MBCLEN_CHARFOUND_LEN(int r) {
        final int charLen = r;
        return charLen;
    }

    // MBCLEN_CHARFOUND_P, ONIGENC_MBCLEN_CHARFOUND_P
    // True for "character found" results, i.e. any strictly positive value.
    public static boolean MBCLEN_CHARFOUND_P(int r) {
        return r > 0;
    }

    // CONSTRUCT_MBCLEN_CHARFOUND, ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND
    // Builds a "character found" result for a character of n bytes; the encoding is the identity.
    public static int CONSTRUCT_MBCLEN_CHARFOUND(int n) {
        final int result = n;
        return result;
    }

    // MRI: search_nonascii
    // Returns the index of the first byte in [p, end) that Encoding.isAscii rejects, or -1 if there is none.
    public static int searchNonAscii(byte[]bytes, int p, int end) {
        for (int i = p; i < end; i++) {
            if (!Encoding.isAscii(bytes[i])) return i;
        }
        return -1;
    }

    /**
     * Searches the live region of a ByteList (begin .. begin + realSize) for a non-ASCII byte.
     *
     * @return the index into the backing array of the first non-ASCII byte, or -1
     */
    public static int searchNonAscii(ByteList bytes) {
        final byte[] raw = bytes.getUnsafeBytes();
        final int begin = bytes.getBegin();
        final int end = begin + bytes.getRealSize();
        return searchNonAscii(raw, begin, end);
    }

    /**
     * Determines the code range of {@code len} bytes starting at {@code p}.
     *
     * @return CR_7BIT, CR_VALID or CR_BROKEN; for ASCIIEncoding any non-ASCII byte yields CR_VALID
     */
    public static int codeRangeScan(Encoding enc, byte[]bytes, int p, int len) {
        if (enc == ASCIIEncoding.INSTANCE) {
            final boolean allAscii = searchNonAscii(bytes, p, p + len) == -1;
            return allAscii ? CR_7BIT : CR_VALID;
        }
        return enc.isAsciiCompatible()
                ? codeRangeScanAsciiCompatible(enc, bytes, p, len)
                : codeRangeScanNonAsciiCompatible(enc, bytes, p, len);
    }

    // Code range scan for ASCII-compatible encodings: skips over ASCII runs and validates each
    // non-ASCII character with preciseLength.
    private static int codeRangeScanAsciiCompatible(Encoding enc, byte[]bytes, int p, int len) {
        final int end = p + len;
        int pos = searchNonAscii(bytes, p, end);
        if (pos == -1) return CR_7BIT; // nothing but ASCII

        while (pos < end) {
            final int cl = preciseLength(enc, bytes, pos, end);
            if (cl <= 0) return CR_BROKEN;
            pos += cl;
            if (pos >= end) break;
            // jump to the next non-ASCII byte; none left means the rest is plain ASCII
            pos = searchNonAscii(bytes, pos, end);
            if (pos == -1) return CR_VALID;
        }
        if (pos > end) return CR_BROKEN;
        return CR_VALID;
    }

    // Code range scan for encodings that are not ASCII-compatible: every character is validated
    // with preciseLength, so the result is either CR_VALID or CR_BROKEN.
    private static int codeRangeScanNonAsciiCompatible(Encoding enc, byte[]bytes, int p, int len) {
        final int end = p + len;
        int pos = p;
        while (pos < end) {
            final int cl = preciseLength(enc, bytes, pos, end);
            if (cl <= 0) return CR_BROKEN;
            pos += cl;
        }
        if (pos > end) return CR_BROKEN;
        return CR_VALID;
    }

    /**
     * Code range scan over the live region of a ByteList, interpreted using {@code enc}.
     */
    public static int codeRangeScan(Encoding enc, ByteList bytes) {
        final byte[] raw = bytes.getUnsafeBytes();
        final int begin = bytes.getBegin();
        final int size = bytes.getRealSize();
        return codeRangeScan(enc, raw, begin, size);
    }

    /**
     * Code range scan that can continue from a previously determined code range.
     *
     * @param enc encoding of the bytes
     * @param bytes bytes to scan
     * @param s start offset
     * @param end end offset (exclusive)
     * @param cr code range determined so far
     * @return {@code pack(n, codeRange)} where n is the number of bytes consumed; when a character
     *         fails validation n stops at that character and the code range is CR_BROKEN for
     *         CHAR_INVALID, CR_UNKNOWN otherwise
     */
    public static long codeRangeScanRestartable(Encoding enc, byte[]bytes, int s, int end, int cr) {
        if (cr == CR_BROKEN) return pack(end - s, cr);

        if (enc == ASCIIEncoding.INSTANCE) {
            final boolean sevenBit = searchNonAscii(bytes, s, end) == -1 && cr != CR_VALID;
            return pack(end - s, sevenBit ? CR_7BIT : CR_VALID);
        }

        int p = s;
        if (enc.isAsciiCompatible()) {
            p = searchNonAscii(bytes, p, end);
            if (p == -1) return pack(end - s, cr == CR_VALID ? cr : CR_7BIT);

            while (p < end) {
                final int cl = preciseLength(enc, bytes, p, end);
                if (cl <= 0) return pack(p - s, cl == CHAR_INVALID ? CR_BROKEN : CR_UNKNOWN);
                p += cl;
                if (p >= end) break;

                p = searchNonAscii(bytes, p, end);
                if (p == -1) return pack(end - s, CR_VALID);
            }
        } else {
            while (p < end) {
                final int cl = preciseLength(enc, bytes, p, end);
                if (cl <= 0) return pack(p - s, cl == CHAR_INVALID ? CR_BROKEN : CR_UNKNOWN);
                p += cl;
            }
        }

        final int finalRange = p > end ? CR_BROKEN : CR_VALID;
        return pack(p - s, finalRange);
    }

    // 0x80 (the high bit) in each of the 8 byte lanes of a long.
    private static final long NONASCII_MASK = 0x8080808080808080L;
    /**
     * Counts, among the 8 bytes packed into {@code d}, those that are NOT of the form 10xxxxxx
     * (i.e. not UTF-8 continuation bytes), processing all lanes at once.
     *
     * @param d eight bytes packed into a long
     * @return the number of such bytes, 0..8
     */
    private static int countUtf8LeadBytes(long d) {
        d |= ~(d >>> 1);            // bit 6 of each byte becomes (bit6 | !bit7): 0 only for 10xxxxxx bytes
        d >>>= 6;                   // bring that bit down to bit 0 of each byte
        d &= NONASCII_MASK >>> 7;   // mask 0x0101..01: keep only that single flag bit per byte
        d += (d >>> 8);             // fold the eight 0/1 flags together ...
        d += (d >>> 16);
        d += (d >>> 32);            // ... so the lowest byte now holds their sum
        return (int)(d & 0xf);      // the sum is at most 8, so four bits are enough
    }

    // Number of bytes read per word in the fast path, and the mask of the index bits below that size.
    private static final int LONG_SIZE = 8;
    private static final int LOWBITS = LONG_SIZE - 1;
    /**
     * Counts the bytes in {@code [p, end)} that are not UTF-8 continuation bytes (10xxxxxx), which for
     * well-formed UTF-8 is the number of characters.
     *
     * @param bytes the byte array to scan
     * @param p start offset (inclusive)
     * @param end end offset (exclusive)
     * @return the number of non-continuation bytes in the range
     */
    @SuppressWarnings("deprecation")
    public static int utf8Length(byte[] bytes, int p, int end) {
        int len = 0;
        if (ARRAY_BYTE_BASE_OFFSET > 0) { // Unsafe
            // word-at-a-time path, only taken for ranges longer than two words
            if (end - p > LONG_SIZE * 2) {
                // count byte-by-byte up to the next index that is a multiple of LONG_SIZE
                int ep = ~LOWBITS & (p + LOWBITS);
                while (p < ep) {
                    if ((bytes[p++] & 0xc0 /*utf8 lead byte*/) != 0x80) len++;
                }
                final Unsafe unsafe = org.jruby.util.unsafe.UnsafeHolder.U;
                // last multiple of LONG_SIZE not beyond end: whole words are read up to here
                int eend = ~LOWBITS & end;
                while (p < eend) {
                    len += countUtf8LeadBytes(unsafe.getLong(bytes, (long) (ARRAY_BYTE_BASE_OFFSET + p)));
                    p += LONG_SIZE;
                }
            }
        }
        // remaining tail (or the whole range when the fast path was not taken)
        while (p < end) {
            if ((bytes[p++] & 0xc0 /*utf8 lead byte*/) != 0x80) len++;
        }
        return len;
    }

    /**
     * UTF-8 length of the live region of a ByteList (begin .. begin + realSize).
     */
    public static int utf8Length(ByteList bytes) {
        final byte[] raw = bytes.getUnsafeBytes();
        final int begin = bytes.getBegin();
        final int end = begin + bytes.getRealSize();
        return utf8Length(raw, begin, end);
    }

    // MRI: rb_enc_strlen
    // Character count of [p, end) when nothing is known about the code range.
    public static int strLength(Encoding enc, byte[]bytes, int p, int end) {
        final int unknownRange = CR_UNKNOWN;
        return strLength(enc, bytes, p, end, unknownRange);
    }

    // MRI: enc_strlen
    // Character count of [p, e). Fixed-width encodings are computed arithmetically (rounding up);
    // ASCII-compatible ones skip ASCII runs and use the unchecked encFastMBCLen when cr is
    // CR_7BIT or CR_VALID; everything else steps character by character with length().
    public static int strLength(Encoding enc, byte[]bytes, int p, int e, int cr) {
        if (enc.isFixedWidth()) {
            return (e - p + enc.minLength() - 1) / enc.minLength();
        }

        int count = 0;
        if (!enc.isAsciiCompatible()) {
            while (p < e) {
                p += length(enc, bytes, p, e);
                count++;
            }
            return count;
        }

        final boolean knownGood = cr == CR_7BIT || cr == CR_VALID;
        while (p < e) {
            if (Encoding.isAscii(bytes[p])) {
                final int q = searchNonAscii(bytes, p, e);
                if (q == -1) return count + (e - p); // ASCII all the way to the end
                count += q - p;
                p = q;
            }
            p += knownGood ? encFastMBCLen(bytes, p, e, enc) : length(enc, bytes, p, e);
            count++;
        }
        return count;
    }

    /**
     * Character count of the live region of a ByteList, using the ByteList's own encoding.
     */
    public static int strLength(ByteList bytes) {
        final int begin = bytes.getBegin();
        final int end = begin + bytes.getRealSize();
        return strLength(bytes.getEncoding(), bytes.getUnsafeBytes(), begin, end);
    }

    /**
     * Character count of {@code [p, end)} combined with a code range in one long.
     *
     * @return for fixed-width encodings the plain character count (no code range bits are packed);
     *         otherwise the packed result of the ASCII-compatible or non-ASCII-compatible scan
     */
    public static long strLengthWithCodeRange(Encoding enc, byte[]bytes, int p, int end) {
        if (enc.isFixedWidth()) {
            return (end - p + enc.minLength() - 1) / enc.minLength();
        }
        if (enc.isAsciiCompatible()) {
            return strLengthWithCodeRangeAsciiCompatible(enc, bytes, p, end);
        }
        return strLengthWithCodeRangeNonAsciiCompatible(enc, bytes, p, end);
    }

    /**
     * Counts characters and determines the code range for an ASCII-compatible encoding.
     * An invalid character counts as one character of one byte and marks the range CR_BROKEN.
     *
     * @return {@code pack(characterCount, codeRange)}; the code range is CR_7BIT when no
     *         non-ASCII character was seen
     */
    public static long strLengthWithCodeRangeAsciiCompatible(Encoding enc, byte[]bytes, int p, int end) {
        int range = 0;
        int chars = 0;
        while (p < end) {
            if (Encoding.isAscii(bytes[p])) {
                final int q = searchNonAscii(bytes, p, end);
                if (q == -1) return pack(chars + (end - p), range == 0 ? CR_7BIT : range);
                chars += q - p;
                p = q;
            }
            final int cl = preciseLength(enc, bytes, p, end);
            if (cl <= 0) {
                range = CR_BROKEN;
                p++;
            } else {
                range |= CR_VALID;
                p += cl;
            }
            chars++;
        }
        return pack(chars, range == 0 ? CR_7BIT : range);
    }

    /**
     * Counts characters and determines the code range for an encoding that is not ASCII-compatible.
     * An invalid character counts as one character of one byte and marks the range CR_BROKEN.
     *
     * @return {@code pack(characterCount, codeRange)}; the code range is CR_7BIT for an empty range
     */
    public static long strLengthWithCodeRangeNonAsciiCompatible(Encoding enc, byte[]bytes, int p, int end) {
        int range = 0;
        int chars = 0;
        while (p < end) {
            final int cl = preciseLength(enc, bytes, p, end);
            if (cl <= 0) {
                range = CR_BROKEN;
                p++;
            } else {
                range |= CR_VALID;
                p += cl;
            }
            chars++;
        }
        return pack(chars, range == 0 ? CR_7BIT : range);
    }

    /**
     * Length-with-code-range of the live region of a ByteList, using the ByteList's own encoding.
     */
    public static long strLengthWithCodeRange(ByteList bytes) {
        final int begin = bytes.getBegin();
        final int end = begin + bytes.getRealSize();
        return strLengthWithCodeRange(bytes.getEncoding(), bytes.getUnsafeBytes(), begin, end);
    }

    /**
     * Length-with-code-range of the live region of a ByteList, interpreted using the given encoding
     * rather than the ByteList's own.
     */
    public static long strLengthWithCodeRange(ByteList bytes, Encoding enc) {
        final int begin = bytes.getBegin();
        final int end = begin + bytes.getRealSize();
        return strLengthWithCodeRange(enc, bytes.getUnsafeBytes(), begin, end);
    }

    // arg cannot be negative
    // Packs two ints into one long: arg shifted left by 31 bits, OR-ed with result
    // (result is widened to long as a signed value).
    public static long pack(int result, int arg) {
        final long high = (long) arg << 31;
        return high | result;
    }

    /**
     * Extracts the low 31 bits of a value produced by pack(result, arg).
     */
    public static int unpackResult(long len) {
        final int low = (int) len;
        return low & Integer.MAX_VALUE;
    }

    /**
     * Extracts the part of a packed value above the low 31 bits (the arg given to pack).
     */
    public static int unpackArg(long cr) {
        final long shifted = cr >>> 31;
        return (int) shifted;
    }

    /**
     * Decodes the code point of the character starting at {@code p}.
     *
     * @throws IllegalArgumentException if the range is empty ("empty string") or preciseLength
     *         does not report a character at {@code p}
     */
    public static int codePoint(Encoding enc, byte[] bytes, int p, int end) {
        if (p >= end) throw new IllegalArgumentException("empty string");
        if (preciseLength(enc, bytes, p, end) <= 0) {
            throw new IllegalArgumentException("invalid byte sequence in " + enc);
        }
        return enc.mbcToCode(bytes, p, end);
    }

    /**
     * Same as {@code codePoint(enc, bytes, p, end)}, but an IllegalArgumentException is rethrown as
     * the runtime's argument error carrying the same message.
     */
    public static int codePoint(Ruby runtime, Encoding enc, byte[] bytes, int p, int end) {
        final int code;
        try {
            code = codePoint(enc, bytes, p, end);
        } catch (IllegalArgumentException ex) {
            throw runtime.newArgumentError(ex.getMessage());
        }
        return code;
    }

    /**
     * Code point of the first character in the live region of a ByteList, using the encoding
     * obtained from EncodingUtils.getEncoding.
     */
    public static int codePoint(final Ruby runtime, final ByteList value) {
        final Encoding enc = EncodingUtils.getEncoding(value);
        final int begin = value.getBegin();
        final int end = begin + value.getRealSize();
        return codePoint(runtime, enc, value.getUnsafeBytes(), begin, end);
    }

    /**
     * Byte length of code point {@code c} in {@code enc}, passed through checkCodepointError so
     * that negative (error) results are raised as exceptions.
     */
    public static int codeLength(Encoding enc, int c) {
        return checkCodepointError(enc.codeToMbcLength(c));
    }

    /**
     * Passes non-negative values through and turns negative error codes into exceptions.
     *
     * @param i a length, or a negative error code
     * @return {@code i} when it is not negative
     * @throws EncodingException built from EncodingError.fromCode(i) when {@code i} is negative
     */
    public static int checkCodepointError(int i) {
        if (i >= 0) return i;
        // for backward compat with code expecting exceptions
        throw new EncodingException(EncodingError.fromCode(i));
    }

    /**
     * Shorthand for {@code getAscii(enc, bytes, p, end, 0)}.
     */
    public static long getAscii(Encoding enc, byte[]bytes, int p, int end) {
        final int noLength = 0;
        return getAscii(enc, bytes, p, end, noLength);
    }

    /**
     * Reads the character at {@code p} if it is an ASCII character.
     *
     * @param len 0 to not report a character width; any other value to request it
     * @return {@code pack(-1, len)} when the range is empty, the character is invalid or it is not
     *         ASCII; otherwise {@code pack(c, w)} where w is 0 if {@code len == 0}, else the byte
     *         width of the character
     */
    public static long getAscii(Encoding enc, byte[]bytes, int p, int end, int len) {
        if (p >= end) return pack(-1, len);

        final int c;
        final int width;
        if (enc.isAsciiCompatible()) {
            c = bytes[p] & 0xff;
            width = 1;
        } else {
            width = preciseLength(enc, bytes, p, end);
            if (width <= 0) return pack(-1, len);
            c = enc.mbcToCode(bytes, p, end);
        }

        if (!Encoding.isAscii(c)) return pack(-1, len);
        return pack(c, len == 0 ? 0 : width);
    }

    /**
     * Code point of the character at {@code p}, or -1 when preciseLength does not report a
     * character there.
     */
    public static int preciseCodePoint(Encoding enc, byte[]bytes, int p, int end) {
        final boolean found = preciseLength(enc, bytes, p, end) > 0;
        return found ? enc.mbcToCode(bytes, p, end) : -1;
    }

    /**
     * Finds the byte offset of the nth UTF-8 character, counting only bytes that are not
     * continuation bytes (10xxxxxx).
     *
     * @param bytes bytes to scan
     * @param p start offset
     * @param e end offset (exclusive)
     * @param nth zero-based character index
     * @return the offset of the nth character's first byte, or the position where scanning
     *         stopped ({@code e} for a non-empty range) when there are fewer characters
     */
    @SuppressWarnings("deprecation")
    public static int utf8Nth(byte[]bytes, int p, int e, int nth) {
        // FIXME: Missing our UNSAFE impl because it was doing the wrong thing: See GH #1986
        for (; p < e; p++) {
            if ((bytes[p] & 0xc0 /*utf8 lead byte*/) == 0x80) continue; // continuation byte
            if (nth == 0) break;
            nth--;
        }
        return p;
    }

    /**
     * Position of the nth character, letting the encoding decide whether it is single-byte.
     */
    public static int nth(Encoding enc, byte[]bytes, int p, int end, int n) {
        final boolean singlebyte = enc.isSingleByte();
        return nth(enc, bytes, p, end, n, singlebyte);
    }

    /**
     * Get the position of the nth character in the given byte array, using the given encoding and range.
     *
     * @param enc encoding to use
     * @param bytes bytes to scan
     * @param p starting byte offset
     * @param end ending byte offset
     * @param n index of character for which to find byte offset
     * @param singlebyte whether the byte contents are in a single byte encoding
     * @return the offset of the nth character, clamped to {@code end} when it lies beyond the range;
     *         -1 if the computed offset is negative
     */
    public static int nth(Encoding enc, byte[]bytes, int p, int end, int n, boolean singlebyte) {
        final int pos;
        if (singlebyte) {
            pos = p + n;
        } else if (enc.isFixedWidth()) {
            pos = p + n * enc.maxLength();
        } else {
            pos = enc.isAsciiCompatible()
                    ? nthAsciiCompatible(enc, bytes, p, end, n)
                    : nthNonAsciiCompatible(enc, bytes, p, end, n);
        }
        if (pos < 0) return -1;
        return Math.min(pos, end);
    }

    // Offset of the nth character for an ASCII-compatible encoding. ASCII runs are skipped in
    // bulk; returns end when the range holds fewer than n characters.
    private static int nthAsciiCompatible(Encoding enc, byte[]bytes, int p, int end, int n) {
        int pos = p;
        int remaining = n;
        while (pos < end && remaining > 0) {
            // even if every remaining character were a single byte we could not get further than this
            final int limit = pos + remaining;
            if (end < limit) return end;
            if (Encoding.isAscii(bytes[pos])) {
                final int nonAscii = searchNonAscii(bytes, pos, limit);
                if (nonAscii == -1) return limit;
                remaining -= nonAscii - pos;
                pos = nonAscii;
            }
            pos += length(enc, bytes, pos, end);
            remaining--;
        }
        if (remaining != 0) return end;
        return pos;
    }

    // Offset of the nth character for an encoding that is not ASCII-compatible: steps over one
    // character at a time until n characters were skipped or end is reached.
    private static int nthNonAsciiCompatible(Encoding enc, byte[]bytes, int p, int end, int n) {
        for (int remaining = n; p < end && remaining != 0; remaining--) {
            p += length(enc, bytes, p, end);
        }
        return p;
    }

    /**
     * Byte offset, relative to {@code p}, of the nth UTF-8 character.
     *
     * @return {@code utf8Nth(...) - p}, or {@code end - p} should utf8Nth report -1
     */
    public static int utf8Offset(byte[]bytes, int p, int end, int n) {
        final int pos = utf8Nth(bytes, p, end, n);
        if (pos == -1) return end - p;
        return pos - p;
    }

    /**
     * Byte offset, relative to {@code p}, of the nth character.
     *
     * @return {@code nth(...) - p}, or {@code end - p} when nth reports -1
     */
    public static int offset(Encoding enc, byte[]bytes, int p, int end, int n) {
        final int pos = nth(enc, bytes, p, end, n);
        if (pos == -1) return end - p;
        return pos - p;
    }

    /**
     * Byte offset, relative to {@code p}, of the nth character, with the single-byte decision
     * supplied by the caller.
     *
     * @return {@code nth(...) - p}, or {@code end - p} when nth reports -1
     */
    public static int offset(Encoding enc, byte[]bytes, int p, int end, int n, boolean singlebyte) {
        final int pos = nth(enc, bytes, p, end, n, singlebyte);
        if (pos == -1) return end - p;
        return pos - p;
    }

    /**
     * Byte offset of character index {@code pos} within a RubyString, relative to the start of
     * its ByteList's live region and using the string's encoding.
     */
    public static int offset(RubyString str, int pos) {
        final ByteList value = str.getByteList();
        final int begin = value.getBegin();
        final int end = begin + value.getRealSize();
        return offset(str.getEncoding(), value.getUnsafeBytes(), begin, end, pos);
    }

    /**
     * ASCII-only lower-casing via AsciiTables.ToLowerCaseTable; non-ASCII values are returned
     * unchanged. The {@code enc} argument is not used.
     */
    @Deprecated
    public static int toLower(Encoding enc, int c) {
        if (!Encoding.isAscii(c)) return c;
        return AsciiTables.ToLowerCaseTable[c];
    }

    /**
     * ASCII-only upper-casing via AsciiTables.ToUpperCaseTable; non-ASCII values are returned
     * unchanged. The {@code enc} argument is not used.
     */
    @Deprecated
    public static int toUpper(Encoding enc, int c) {
        if (!Encoding.isAscii(c)) return c;
        return AsciiTables.ToUpperCaseTable[c];
    }

    /**
     * Compares {@code len} bytes of two arrays, treating bytes as unsigned values.
     *
     * @return 0 when all {@code len} bytes are equal; otherwise 1 or -1 depending on whether the
     *         first differing byte of {@code bytes1} is greater or smaller
     */
    public static int caseCmp(byte[]bytes1, int p1, byte[]bytes2, int p2, int len) {
        for (int i = 0; i < len; i++) {
            final int left = bytes1[p1 + i] & 0xff;
            final int right = bytes2[p2 + i] & 0xff;
            if (left != right) return left > right ? 1 : -1;
        }
        return 0;
    }

    /**
     * Parses up to {@code len} hex digits starting at {@code p}, classifying digits with ASCIIEncoding.
     */
    public static int scanHex(byte[]bytes, int p, int len) {
        final Encoding ascii = ASCIIEncoding.INSTANCE;
        return scanHex(bytes, p, len, ascii);
    }

    /**
     * Parses up to {@code len} hex digits starting at {@code p}, stopping at the first byte that
     * {@code enc} does not consider a hex digit.
     *
     * @return the accumulated value (0 when no digit was read)
     */
    public static int scanHex(byte[]bytes, int p, int len, Encoding enc) {
        int value = 0;
        for (int i = 0; i < len; i++) {
            final int c = bytes[p + i] & 0xff;
            if (!enc.isXDigit(c)) break;
            value = (value << 4) + enc.xdigitVal(c);
        }
        return value;
    }

    /**
     * Counts the leading hex digits (at most {@code len}) starting at {@code p}, classifying
     * digits with ASCIIEncoding.
     */
    public static int hexLength(byte[]bytes, int p, int len) {
        final Encoding ascii = ASCIIEncoding.INSTANCE;
        return hexLength(bytes, p, len, ascii);
    }

    /**
     * Counts the leading bytes (at most {@code len}) starting at {@code p} that {@code enc}
     * considers hex digits.
     */
    public static int hexLength(byte[]bytes, int p, int len, Encoding enc) {
        int count = 0;
        for (int i = 0; i < len; i++) {
            if (!enc.isXDigit(bytes[p + i] & 0xff)) break;
            count++;
        }
        return count;
    }

    /**
     * Parses up to {@code len} octal digits starting at {@code p}, classifying digits with ASCIIEncoding.
     */
    public static int scanOct(byte[]bytes, int p, int len) {
        final Encoding ascii = ASCIIEncoding.INSTANCE;
        return scanOct(bytes, p, len, ascii);
    }

    /**
     * Parses up to {@code len} octal digits starting at {@code p}, stopping at the first byte that
     * is not a digit according to {@code enc} or is not below '8'.
     *
     * @return the accumulated value (0 when no digit was read)
     */
    public static int scanOct(byte[]bytes, int p, int len, Encoding enc) {
        int value = 0;
        for (int i = 0; i < len; i++) {
            final int c = bytes[p + i] & 0xff;
            if (!enc.isDigit(c) || c >= '8') break;
            value = (value << 3) + Encoding.digitVal(c);
        }
        return value;
    }

    /**
     * Counts the leading octal digits (at most {@code len}) starting at {@code p}, classifying
     * digits with ASCIIEncoding.
     */
    public static int octLength(byte[]bytes, int p, int len) {
        final Encoding ascii = ASCIIEncoding.INSTANCE;
        return octLength(bytes, p, len, ascii);
    }

    /**
     * Counts the leading bytes (at most {@code len}) starting at {@code p} that are digits
     * according to {@code enc} and below '8'.
     */
    public static int octLength(byte[]bytes, int p, int len, Encoding enc) {
        int count = 0;
        for (int i = 0; i < len; i++) {
            final int c = bytes[p + i] & 0xff;
            if (!enc.isDigit(c) || c >= '8') break;
            count++;
        }
        return count;
    }

    /**
     * Check whether input object's string value contains a null byte, and if so
     * throw SecurityError.
     *
     * @param runtime the runtime used to create the error
     * @param value the object whose string value (via {@code asString()}) is checked
     */
    public static final void checkStringSafety(Ruby runtime, IRubyObject value) {
        final ByteList bl = value.asString().getByteList();
        final byte[] array = bl.getUnsafeBytes();
        final int begin = bl.begin();
        // The live region is [begin, begin + length): using length alone as the bound would skip
        // part (or all) of the content whenever the ByteList does not start at offset 0.
        final int end = begin + bl.length();
        for (int i = begin; i < end; ++i) {
            if (array[i] == (byte) 0) {
                throw runtime.newSecurityError("string contains null byte");
            }
        }
    }

    /**
     * Chooses the format string used to print a character in escaped form.
     *
     * @param c the code point (or byte value) to be printed, treated as an unsigned 32-bit value
     * @param isUnicode whether Unicode style escapes are wanted
     * @return for Unicode: "%c" for printable ASCII below 0x7F, a 4-digit \\u escape below 0x10000,
     *         the braced \\u{..} form otherwise; for non-Unicode: a 2-digit \\x escape below 0x100,
     *         the braced \\x{..} form otherwise
     */
    public static String escapedCharFormat(int c, boolean isUnicode) {
        // c comparisons must be unsigned 32-bit, so every range check goes through this widened value
        final long code = c & 0xFFFFFFFFL;
        if (isUnicode) {
            if (code < 0x7F && Encoding.isAscii(c) && ASCIIEncoding.INSTANCE.isPrint(c)) {
                return "%c";
            }
            return code < 0x10000 ? "\\u%04X" : "\\u{%X}";
        }
        return code < 0x100 ? "\\x%02X" : "\\x{%X}";
    }

    // mri: ONIGENC_MBCLEN_NEEDMORE_P - onigurama.h
    // True when a character length result signals that more bytes are needed (any value below -1).
    public static boolean isIncompleteChar(int b) {
        return -1 > b;
    }

    /**
     * ByteList convenience form: uses the ByteList's backing array, begin, real size and encoding.
     */
    public static int bytesToFixBrokenTrailingCharacter(ByteList val, int usingLength) {
        final byte[] raw = val.getUnsafeBytes();
        final int begin = val.getBegin();
        final int size = val.getRealSize();
        return bytesToFixBrokenTrailingCharacter(raw, begin, size, val.getEncoding(), usingLength);
    }

    /**
     * Computes how many additional bytes are needed to complete the last character of the first
     * {@code usingLength} bytes.
     *
     * @param bytes backing array
     * @param begin start of the string within {@code bytes}
     * @param byteSize size of the string in bytes; nothing is computed unless it is positive
     * @param encoding encoding of the string
     * @param usingLength number of bytes (from {@code begin}) being used
     * @return 0 when {@code byteSize <= 0}; otherwise the length of the last character (as
     *         reported by the encoding for its head byte) minus the bytes of it already included
     */
    public static int bytesToFixBrokenTrailingCharacter(byte[] bytes, int begin, int byteSize, Encoding encoding, int usingLength) {
        if (byteSize <= 0) return 0;

        // head of the character that the last used byte belongs to, relative to begin
        final int usedEnd = begin + usingLength;
        final int charHead = encoding.leftAdjustCharHead(bytes, begin, usedEnd - 1, usedEnd) - begin;

        // total bytes that character needs, judged from its head byte
        final byte byteHead = (byte)(bytes[begin + charHead] & 0xFF);
        final int charLength = encoding.length(byteHead);

        // subtract what we already have
        final int alreadyHave = usingLength - charHead;
        return charLength - alreadyHave;
    }

    /**
     * Finds the first occurrence of a byte value, like C's {@code memchr}.
     *
     * @param ptr the bytes to search
     * @param start first index to examine
     * @param find the value to look for; as in C it is narrowed to a byte first, so both
     *        {@code 0xFF} and {@code (byte) 0xFF} locate the same byte
     * @param len number of bytes to examine
     * @return the index (into {@code ptr}) of the first match, or -1 if there is none
     */
    public static int memchr(byte[] ptr, int start, final int find, int len) {
        // Comparing a (sign-extended) byte against the raw int can never match values 0x80..0xFF.
        final byte target = (byte) find;
        final int end = start + len;
        for (int i = start; i < end; i++) {
            if ( ptr[i] == target ) return i;
        }
        return -1;
    }

    // StringValueCStr, rb_string_value_cstr without trailing null addition
    // Converts ptr to a string and raises an argument error if it contains an embedded null:
    // a whole null character for encodings whose minimum length exceeds one byte (the string is
    // then passed through strFillTerm), a null byte otherwise.
    public static RubyString checkEmbeddedNulls(Ruby runtime, IRubyObject ptr) {
        final RubyString s = ptr.convertToString();
        final ByteList sByteList = s.getByteList();
        final byte[] sBytes = sByteList.unsafeBytes();
        final int beg = sByteList.begin();
        final int len = sByteList.length();
        final Encoding enc = s.getEncoding();
        final int minlen = enc.minLength();

        if (minlen <= 1) {
            if (memchr(sBytes, beg, '\0', len) != -1) {
                throw runtime.newArgumentError("string contains null byte");
            }
            //if (s[len]) {
            //    s = str_fill_term(str, s, len, minlen);
            //}
            return s;
        }

        if (strNullChar(sBytes, beg, len, minlen, enc) != -1) {
            throw runtime.newArgumentError("string contains null char");
        }
        return strFillTerm(s, sBytes, beg, len, minlen);
    }

    // MRI: str_null_char
    // Walks the string character by character and returns the offset of the first character
    // whose first minlen bytes are all zero, or -1 if there is none.
    private static int strNullChar(byte[] sBytes, int s, int len, final int minlen, Encoding enc) {
        final int e = s + len;
        int pos = s;
        while (pos + minlen <= e) {
            if (zeroFilled(sBytes, pos, minlen)) return pos;
            pos += enc.length(sBytes, pos, e);
        }
        return -1;
    }

    // MRI: zero_filled
    // True when the n bytes starting at s are all zero (trivially true for n <= 0).
    private static boolean zeroFilled(byte[] sBytes, int s, int n) {
        for (int i = s, stop = s + n; i < stop; i++) {
            if (sBytes[i] != 0) return false;
        }
        return true;
    }

    // MRI: str_fill_term
    // Makes sure termlen zero bytes follow the len content bytes. The string is made independent
    // (with room for the terminator) first when the backing array is too small, or when it is not
    // independent and the terminator bytes are not already zero.
    private static RubyString strFillTerm(RubyString str, byte[] sBytes, int beg, int len, int termlen) {
        final int needed = len + termlen;
        final int capa = sBytes.length - beg;

        final boolean mustCopy = capa < needed
                || ( ! str.independent() && ! zeroFilled(sBytes, beg + len, termlen) );
        if (mustCopy) {
            // rb_check_lockedtmp(str);
            str = str.makeIndependent(needed);
            final ByteList fresh = str.getByteList();
            sBytes = fresh.unsafeBytes();
            beg = str.getByteList().begin();
        }

        TERM_FILL(sBytes, beg, len, termlen);
        return str;
    }

    /**
     * Writes {@code termlen} zero bytes into {@code ptr}, starting right after the
     * {@code len} content bytes that begin at {@code beg}.
     */
    private static void TERM_FILL(byte[] ptr, final int beg, final int len, final int termlen) {
        final int from = beg + len;
        final int to = from + termlen;
        Arrays.fill(ptr, from, to, (byte) 0);
    }

    /**
     * rb_str_scan
     *
     * Computes the position that follows the current match. A non-empty match yields
     * the match end. An empty match (begin == end) is pushed forward so it is not
     * returned unchanged: by the length of the character at {@code begin + end} when
     * the value has bytes beyond the match end, otherwise by one.
     *
     * @param value   the bytes being scanned
     * @param matcher the matcher holding the current match's begin and end
     * @param enc     encoding used to measure the character after an empty match
     * @param begin   offset added to the match end to index into the backing array
     * @param range   end bound passed to {@code enc.length}
     * @return the position after the current match
     */
    public static int positionEndForScan(ByteList value, Matcher matcher, Encoding enc, int begin, int range) {
        final int matchEnd = matcher.getEnd();

        if (matcher.getBegin() != matchEnd) {
            return matchEnd;
        }
        if (value.getRealSize() <= matchEnd) {
            return matchEnd + 1;
        }
        return matchEnd + enc.length(value.getUnsafeBytes(), begin + matchEnd, range);
    }

    /**
     * rb_str_dump
     *
     * Convenience overload that delegates to the three-argument form with
     * {@code quoteOnlyIfNeeded} set to {@code false}.
     *
     * @param runtime  the current runtime
     * @param bytelist the bytes to dump
     * @return the dumped representation
     */
    public static ByteList dumpCommon(Ruby runtime, ByteList bytelist) {
        final boolean quoteOnlyIfNeeded = false;
        return dumpCommon(runtime, bytelist, quoteOnlyIfNeeded);
    }

    /**
     * Produces an escaped, ASCII-only rendering of the given bytes.
     *
     * Works in two passes over the same input: the first computes how many output bytes
     * are needed, the second writes the escaped bytes into a buffer allocated at exactly
     * that size.
     *
     * @param runtime           the current runtime (passed to codePoint and Sprintf)
     * @param byteList          the bytes to dump; its encoding decides whether
     *                          {@code \\u} escapes are used
     * @param quoteOnlyIfNeeded when false the result is always wrapped in double quotes;
     *                          when true the quotes are written only if a byte that is
     *                          neither specially escaped nor printable was seen
     * @return a new ByteList holding the escaped representation
     */
    public static ByteList dumpCommon(Ruby runtime, ByteList byteList, boolean quoteOnlyIfNeeded) {
        Encoding enc = byteList.getEncoding();
        // set only by bytes that reach the default branch below and fail isPrint
        boolean includingsNonprintable = false;

        int p = byteList.getBegin();
        int end = p + byteList.getRealSize();
        byte[]bytes = byteList.getUnsafeBytes();

        // Pass 1: size the output. Start at 2 to leave room for the two surrounding quotes.
        int len = 2;
        while (p < end) {
            int c = bytes[p++] & 0xff;

            switch (c) {
            // each of these becomes a backslash plus one character
            case '"':case '\\':case '\n':case '\r':case '\t':case '\f':
            case '\013': case '\010': case '\007': case '\033':
                len += 2;
                break;
            case '#':
                // '#' is escaped only when followed by '$', '@' or '{'
                len += isEVStr(bytes, p, end) ? 2 : 1;
                break;
            default:
                if (ASCIIEncoding.INSTANCE.isPrint(c)) {
                    len++;
                } else {
                    includingsNonprintable = true;
                    if (enc.isUTF8() && c > 0x7F) {
                        // n is the number of bytes that follow the lead byte
                        int n = preciseLength(enc, bytes, p - 1, end) - 1;
                        if (MBCLEN_CHARFOUND_LEN(n) > 0) {
                            int cc = codePoint(runtime, enc, bytes, p - 1, end);
                            if (cc <= 0xFFFF) {
                                len += 6; // \\uXXXX
                            } else if (cc <= 0xFFFFF) {
                                len += 9; // \\u{XXXXX}
                            } else {
                                len += 10; // \\u{XXXXXX}
                            }
                            // NOTE(review): n already excludes the lead byte, yet one more is
                            // subtracted here while pass 2 advances by n. If MBCLEN_CHARFOUND_LEN
                            // returns its argument unchanged, the last byte of the character is
                            // visited again and adds 4 more, so len over-estimates - confirm.
                            p += MBCLEN_CHARFOUND_LEN(n) - 1;
                            break;
                        }
                    }
                    len += 4; // \\xNN
                }
                break;
            }
        }

        if (!enc.isAsciiCompatible()) {
            // Room is reserved for a force_encoding suffix, but nothing in this method
            // writes it - presumably the caller appends it; verify against callers.
            len += ".force_encoding(\"".length() + enc.getName().length + "\")".length();
        }

        // Pass 2: write the escaped bytes straight into the backing array via index q.
        ByteList outBytes = new ByteList(len);
        byte out[] = outBytes.getUnsafeBytes();
        int q = 0;
        p = byteList.getBegin();
        end = p + byteList.getRealSize();

        // equivalent to: !quoteOnlyIfNeeded || includingsNonprintable
        if ((quoteOnlyIfNeeded && includingsNonprintable) || !quoteOnlyIfNeeded) out[q++] = '"';
        while (p < end) {
            int c = bytes[p++] & 0xff;
            switch (c) {
                case '"': case '\\':
                    out[q++] = '\\'; out[q++] = (byte)c; break;
                case '#':
                    if (isEVStr(bytes, p, end)) out[q++] = '\\';
                    out[q++] = '#';
                    break;
                case '\n':
                    out[q++] = '\\'; out[q++] = 'n'; break;
                case '\r':
                    out[q++] = '\\'; out[q++] = 'r'; break;
                case '\t':
                    out[q++] = '\\'; out[q++] = 't'; break;
                case '\f':
                    out[q++] = '\\'; out[q++] = 'f'; break;
                case '\013':
                    out[q++] = '\\'; out[q++] = 'v'; break;
                case '\010':
                    out[q++] = '\\'; out[q++] = 'b'; break;
                case '\007':
                    out[q++] = '\\'; out[q++] = 'a'; break;
                case '\033':
                    out[q++] = '\\'; out[q++] = 'e'; break;
                default:
                    if (ASCIIEncoding.INSTANCE.isPrint(c)) {
                        out[q++] = (byte)c;
                    } else {
                        out[q++] = '\\';
                        // Publish the bytes written so far before Sprintf runs; q is re-read
                        // from the real size afterwards, so Sprintf presumably appends there.
                        outBytes.setRealSize(q);
                        if (enc.isUTF8()) {
                            // n is the number of bytes that follow the lead byte
                            int n = preciseLength(enc, bytes, p - 1, end) - 1;
                            if (MBCLEN_CHARFOUND_LEN(n) > 0) {
                                int cc = codePoint(runtime, enc, bytes, p - 1, end);
                                outBytes.setRealSize(q);
                                p += n; // skip the rest of the multi-byte character
                                if (cc <= 0xFFFF) {
                                    Sprintf.sprintf(runtime, outBytes, "u%04X", cc);
                                } else {
                                    Sprintf.sprintf(runtime, outBytes, "u{%X}", cc);
                                }
                                q = outBytes.getRealSize();
                                continue;
                            }
                        }
                        // single byte: hex escape
                        Sprintf.sprintf(runtime, outBytes, "x%02X", c);
                        q = outBytes.getRealSize();
                    }
            }
        }
        if ((quoteOnlyIfNeeded && includingsNonprintable) || !quoteOnlyIfNeeded) out[q++] = '"';
        outBytes.setRealSize(q);
        // Direct writes through `out` would be lost if the backing array had been replaced.
        assert out == outBytes.getUnsafeBytes(); // must not reallocate

        return outBytes;
    }

    /**
     * Tells whether the byte at {@code p} is one of the characters recognised by
     * {@link #isEVStr(int)}.
     *
     * @param bytes the array to inspect
     * @param p     index of the byte to test
     * @param end   exclusive upper bound; positions at or beyond it yield false
     * @return true when {@code p < end} and the unsigned byte at {@code p} qualifies
     */
    public static boolean isEVStr(byte[] bytes, int p, int end) {
        if (p >= end) {
            return false;
        }
        return isEVStr(bytes[p] & 0xff);
    }

    /**
     * @param c the character code to test
     * @return true when {@code c} is {@code '$'}, {@code '@'} or <code>'{'</code>
     */
    public static boolean isEVStr(int c) {
        switch (c) {
            case '$':
            case '@':
            case '{':
                return true;
            default:
                return false;
        }
    }

    /**
     * rb_str_count
     *
     * Counts the characters of {@code str} that are selected by the given tables.
     * For ASCII-compatible encodings, bytes below 0x80 are looked up directly in
     * {@code table}; everything else is decoded to a code point and checked with
     * {@code trFind}.
     *
     * @param str    the bytes to scan
     * @param table  lookup table indexed by byte value for the single-byte fast path
     * @param tables tables passed to {@code trFind} for decoded code points
     * @param enc    the encoding used to decode characters
     * @return the number of matching characters
     */
    public static int strCount(ByteList str, boolean[] table, TrTables tables, Encoding enc) {
        final byte[] data = str.getUnsafeBytes();
        int pos = str.getBegin();
        final int limit = pos + str.getRealSize();
        final boolean asciiCompat = enc.isAsciiCompatible();

        int matches = 0;
        while (pos < limit) {
            if (asciiCompat) {
                final int single = data[pos] & 0xff;
                if (single < 0x80) {
                    if (table[single]) {
                        matches++;
                    }
                    pos++;
                    continue;
                }
            }

            final int codepoint = codePoint(enc, data, pos, limit);
            final int width = codeLength(enc, codepoint);
            if (trFind(codepoint, table, tables)) {
                matches++;
            }
            pos += width;
        }

        return matches;
    }

    /**
     * Same as {@link #strCount(ByteList, boolean[], TrTables, Encoding)}, except that an
     * {@link IllegalArgumentException} raised while counting is rethrown as an
     * argument error created by {@code runtime}, carrying the same message.
     */
    public static int strCount(ByteList str, Ruby runtime, boolean[] table, TrTables tables, Encoding enc) {
        final int count;
        try {
            count = strCount(str, table, tables, enc);
        } catch (IllegalArgumentException ex) {
            throw runtime.newArgumentError(ex.getMessage());
        }
        return count;
    }

    /**
     * Legacy name kept for existing callers; simply forwards to the renamed method.
     *
     * @deprecated renamed to {@link #strCount(ByteList, Ruby, boolean[], TrTables, Encoding)}
     */
    @Deprecated
    public static int countCommon19(ByteList str, Ruby runtime, boolean[] table, TrTables tables, Encoding enc) {
        return strCount(str, runtime, table, tables, enc);
    }

    /**
     * MRI: rb_str_rindex
     *
     * Searches backwards for the substring, starting from character position {@code pos}.
     *
     * @param source                 the bytes searched in
     * @param sourceChars            length of {@code source} in characters
     * @param subChars               length of the substring in characters
     * @param pos                    character position to start from; lowered to
     *                               {@code sourceChars - subChars} when the substring
     *                               would not fit from there
     * @param subStringCodeRangeable the substring; a broken code range never matches
     * @param enc                    encoding used to walk characters
     * @return the character index of the match, or -1 when there is none
     */
    public static int rindex(ByteList source, int sourceChars, int subChars, int pos, CodeRangeable subStringCodeRangeable, Encoding enc) {
        if (subStringCodeRangeable.scanForCodeRange() == CR_BROKEN) {
            return -1;
        }

        final ByteList needle = subStringCodeRangeable.getByteList();
        final int haystackBytes = source.getRealSize();
        final int needleBytes = needle.getRealSize();

        // the substring cannot be longer than the source, in characters or in bytes
        if (sourceChars < subChars || haystackBytes < needleBytes) {
            return -1;
        }
        if (sourceChars - pos < subChars) {
            pos = sourceChars - subChars;
        }
        if (sourceChars == 0) {
            return pos;
        }

        final byte[] haystack = source.getUnsafeBytes();
        final int haystackBeg = source.getBegin();

        if (pos == 0) {
            // only one candidate position: compare at the very start
            final int cmp = ByteList.memcmp(haystack, haystackBeg, needle.getUnsafeBytes(), needle.getBegin(), needleBytes);
            return cmp == 0 ? 0 : -1;
        }

        final int startByte = nth(enc, haystack, haystackBeg, haystackBeg + haystackBytes, pos);

        return strRindex(haystack, haystackBeg, haystackBytes, needle.getUnsafeBytes(), needle.getBegin(), needleBytes, startByte, pos, enc);
    }

    /**
     * Walks backwards one character at a time from byte offset {@code s} (character
     * index {@code pos}), comparing the substring at each stop.
     *
     * @return the character index of the first match found, or -1 once character
     *         index 0 has been tried or the offset drops below {@code strBeg}
     */
    private static int strRindex(final byte[] strBytes, final int strBeg, final int strLen,
                                 final byte[] subBytes, final int subBeg, final int subLen,
                                 int s, int pos, final Encoding enc) {

        final int strEnd = strBeg + strLen;
        int offset = s;
        int charIndex = pos;

        while (offset >= strBeg) {
            final boolean fits = offset + subLen <= strEnd;
            if (fits && ByteList.memcmp(strBytes, offset, subBytes, subBeg, subLen) == 0) {
                return charIndex;
            }
            if (charIndex == 0) {
                break;
            }
            charIndex--;
            offset = enc.prevCharHead(strBytes, strBeg, offset, strEnd);
        }

        return -1;
    }

    /**
     * Returns the length of {@code string} measured with {@code enc}: the byte size when
     * the string is single-byte optimizable, otherwise the result of the full computation.
     */
    public static int strLengthFromRubyString(CodeRangeable string, Encoding enc) {
        final ByteList content = string.getByteList();

        return isSingleByteOptimizable(string, enc)
                ? content.getRealSize()
                : strLengthFromRubyStringFull(string, content, enc);
    }

    /**
     * Returns the length of {@code string} measured with the encoding of its own byte
     * list: the byte size when the string is single-byte optimizable, otherwise the
     * result of the full computation.
     */
    public static int strLengthFromRubyString(CodeRangeable string) {
        final ByteList content = string.getByteList();

        return isSingleByteOptimizable(string, content.getEncoding())
                ? content.getRealSize()
                : strLengthFromRubyStringFull(string, content, content.getEncoding());
    }

    /**
     * Returns the length of {@code bytes} measured with {@code enc}, without updating
     * the code range of {@code string}.
     *
     * NOTE: same steps as strLengthFromRubyStringFull, minus string.setCodeRange(..)
     */
    public static int strLengthFromRubyString(CodeRangeable string, final ByteList bytes, final Encoding enc) {
        if (isSingleByteOptimizable(string, enc)) {
            return bytes.getRealSize();
        }
        if (string.isCodeRangeValid() && enc.isUTF8()) {
            return utf8Length(bytes);
        }
        // the scan result packs length and code range together; only the length is used
        return unpackResult(strLengthWithCodeRange(bytes, enc));
    }

    /**
     * Computes the length of {@code bytes} and, when the scan reports a non-zero code
     * range, stores that code range on {@code string}.
     */
    private static int strLengthFromRubyStringFull(CodeRangeable string, ByteList bytes, Encoding enc) {
        if (string.isCodeRangeValid() && enc.isUTF8()) {
            return utf8Length(bytes);
        }

        final long packed = strLengthWithCodeRange(bytes, enc);
        final int scannedRange = unpackArg(packed);
        if (scannedRange != 0) {
            string.setCodeRange(scannedRange);
        }
        return unpackResult(packed);
    }

    /**
     * rb_str_tr / rb_str_tr_bang
     *
     * Mutable cursor over the bytes of a ByteList: {@code buf} is the backing array,
     * {@code p} starts at the list's begin offset and {@code pend} is that offset plus
     * the real size. {@code now}, {@code max} and {@code gen} start out as 0, 0 and false.
     */
    public static final class TR {
        final byte[] buf;
        int p;
        int pend;
        int now;
        int max;
        boolean gen;

        public TR(ByteList bytes) {
            this.p = bytes.getBegin();
            this.pend = bytes.getRealSize() + this.p;
            this.buf = bytes.getUnsafeBytes();
            this.now = 0;
            this.max = 0;
            this.gen = false;
        }
    }

    /**
     * tr_setup_table
     */
    public static final class TrTables {
        IntHashMap