net.sf.saxon.str.StringTool Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of Saxon-HE Show documentation
The XSLT and XQuery Processor
There is a newer version: 12.5
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2023 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.str;

import net.sf.saxon.serialize.charcode.UTF16CharacterSet;
import net.sf.saxon.transpile.CSharpInnerClass;
import net.sf.saxon.value.Whitespace;
import net.sf.saxon.z.IntIterator;

import java.util.Arrays;

public class StringTool {

    /**
     * Get the length of a string, as defined in XPath. This is not the same as the Java length,
     * as a Unicode surrogate pair counts as a single character.
     *
     * @param s The string whose length is required
     * @return the length of the string in Unicode code points
     */

    public static int getStringLength(/*@NotNull*/ CharSequence s) {
        // Begin with the number of UTF-16 code units and subtract one for every high
        // surrogate (D800 to DBFF), so that a surrogate pair is counted only once
        // (through its low surrogate).
        final int units = s.length();
        int count = units;
        for (int pos = 0; pos < units; pos++) {
            if (Character.isHighSurrogate(s.charAt(pos))) {
                count--;
            }
        }
        return count;
    }

    /**
     * Expand a string into an array of 32-bit characters
     *
     * @param s the string to be expanded
     * @return an array of integers representing the Unicode code points
     */

    public static int[] expand(UnicodeString s) {
        // Allocate one slot per code point (as reported by length32()), then ask the
        // string to write its 32-bit code points into the array starting at index 0.
        final int[] codePoints = new int[s.length32()];
        s.copy32bit(codePoints, 0);
        return codePoints;
    }

    /**
     * Ask whether a string contains astral characters (represented as surrogate pairs)
     * @param str the string to be tested
     * @return true if the string contains surrogate characters
     */

    public static boolean containsSurrogates(CharSequence str) {
        for (int i=0; i> 16) & 0xff);
                    triples[j++] = (byte) ((cp >> 8) & 0xff);
                    triples[j++] = (byte) (cp & 0xff);
                } else {
                    triples[j++] = 0;
                    triples[j++] = (byte) ((c >> 8) & 0xff);
                    triples[j++] = (byte) (c & 0xff);
                }
            }
            return new Twine24(triples);
        }
    }

    public static UnicodeString fromLatin1(String str) {
        byte[] bytes = new byte[str.length()];
        for (int i = 0; i= 0x20 && c <= 0x7e) {
                fsb.append(c);
            } else {
                fsb.append("\\u");
                for (int shift = 12; shift >= 0; shift -= 4) {
                    fsb.append("0123456789ABCDEF".charAt((c >> shift) & 0xF));
                }
            }
        }
        return fsb.toString();
    }

    /**
     * Insert a character at the start of a StringBuilder; a wide character (codepoint above 0xFFFF)
     * is inserted as a surrogate pair
     * @param builder the string builder
     * @param ch the codepoint of the character to be inserted
     */

    public static void prependWideChar(StringBuilder builder, int ch) {
        // A codepoint that fits in 16 bits is inserted directly as a single char.
        if (ch <= 0xffff) {
            builder.insert(0, (char) ch);
            return;
        }
        // Anything above 0xFFFF is inserted as a high surrogate followed by a low surrogate.
        final char[] surrogates = {
                UTF16CharacterSet.highSurrogate(ch),
                UTF16CharacterSet.lowSurrogate(ch)
        };
        builder.insert(0, surrogates);
    }

    /**
     * Insert repeated occurrences of a given character at the start of a StringBuilder
     * @param builder the string builder
     * @param ch the character to be inserted
     * @param count the number of repetitions
     */

    public static void prependRepeated(StringBuilder builder, char ch, int count) {
        // A zero or negative count means there is nothing to insert. Without this guard a
        // negative count (for example a padding width that is already exceeded) would fail
        // with NegativeArraySizeException when the array below is allocated.
        if (count <= 0) {
            return;
        }
        // Build the whole run first and insert it in one call, so the existing content of
        // the builder is shifted once rather than once per inserted character.
        char[] run = new char[count];
        Arrays.fill(run, ch);
        builder.insert(0, run);
    }

    /**
     * Insert repeated occurrences of a given character at the end of a StringBuilder
     *
     * @param builder the string builder
     * @param ch      the character to be inserted
     * @param count   the number of repetitions
     */

    public static void appendRepeated(StringBuilder builder, char ch, int count) {
        for (int i=0; i=0; i--) {
            if (str.codePointAt(i) == codePoint) {
                return i;
            }
        }
        return -1L;
    }


    /**
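     * Compress a range of UTF-16 characters into a compact UnicodeString representation chosen
     * according to its content. This is the first thing we do to an incoming text node.
     * If whitespace compression is requested and the range consists entirely of whitespace, a
     * compressed whitespace representation is returned; otherwise the characters are stored
     * using one, two, or three bytes per character, depending on whether all characters are
     * below 256 and on whether the range contains surrogates.
     *
     * @param in the array of UTF-16 characters containing the text to be compressed
     * @param offset the start position of the substring we are interested in
     * @param len the length of the substring we are interested in
     * @param compressWS set to true if whitespace compression is to be attempted
     * @return the compressed whitespace if the range is all whitespace and compression was requested;
     * or a UnicodeString holding the characters in the chosen representation otherwise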
     */

    /*@NotNull*/
    public static UnicodeString compress(char[] in, int offset, int len, boolean compressWS) {
        if (len == 0) {
            return EmptyUnicodeString.getInstance();
        }
        final int end = offset + len;
        int pos = offset;

        // Phase 1 (optional): skip the leading run of whitespace. If the run reaches the end
        // of the range, the whole range is whitespace and can be returned in compressed form.
        if (compressWS) {
            while (pos < end) {
                int unit = in[pos];
                if (!Whitespace.isWhite(unit)) {
                    break;
                }
                pos++;
            }
            if (pos >= end) {
                return CompressedWhitespace.compressWS(in, offset, end);
            }
        }

        // Phase 2: scan the remainder, OR-ing all code units together and counting surrogate
        // code units. Whitespace skipped in phase 1 cannot affect either result. Starting
        // the accumulator at 255 means it stays below 256 exactly when no unit exceeds 255.
        int orBits = 255;
        int surrogateUnits = 0;
        for (; pos < end; pos++) {
            int unit = in[pos];
            orBits |= unit;
            if (UTF16CharacterSet.isSurrogate(unit)) {
                surrogateUnits++;
            }
        }

        // Every character fits in 8 bits: one byte per character.
        if (orBits < 256) {
            byte[] narrow = new byte[len];
            for (int src = offset, dst = 0; src < end; src++, dst++) {
                narrow[dst] = (byte) in[src];
            }
            return new Twine8(narrow);
        }

        // No surrogates: the 16-bit code units can be copied unchanged.
        if (surrogateUnits == 0) {
            return new Twine16(Arrays.copyOfRange(in, offset, end));
        }

        // Surrogates present: store three bytes per character, most significant byte first.
        // Each surrogate pair collapses to a single character, hence the reduced length.
        byte[] packed = new byte[3 * (len - surrogateUnits / 2)];
        int dst = 0;
        int src = offset;
        while (src < end) {
            char unit = in[src++];
            // A surrogate consumes the following code unit as the other half of its pair;
            // any other character is its own codepoint (whose top byte is therefore zero).
            int cp = UTF16CharacterSet.isSurrogate(unit)
                    ? UTF16CharacterSet.combinePair(unit, in[src++])
                    : unit;
            packed[dst++] = (byte) ((cp & 0xffffff) >> 16);
            packed[dst++] = (byte) ((cp & 0xffff) >> 8);
            packed[dst++] = (byte) (cp & 0xff);
        }
        return new Twine24(packed);
    }

    /**
     * Copy from an array of 8-bit characters to an array holding 16-bit characters.
     * The caller is responsible for ensuring that the offsets are in range and that the
     * destination array is large enough.
     * @param source the source array
     * @param sourcePos the position in the source array where copying is to start
     * @param dest the destination array
     * @param destPos the position in the destination array where copying is to start
     * @param count the number of characters (codepoints) to copy
     */

    public static void copy8to16(byte[] source, int sourcePos, char[] dest, int destPos, int count) {
        int last = sourcePos + count;
        for (int i=sourcePos, j=destPos; i> 8) & 0xff);
            dest[j++] = (byte) (c & 0xff);
        }
    }
}