net.sf.saxon.str.StringTool Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of Saxon-HE Show documentation
Show all versions of Saxon-HE Show documentation
The XSLT and XQuery Processor
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2023 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
package net.sf.saxon.str;
import net.sf.saxon.serialize.charcode.UTF16CharacterSet;
import net.sf.saxon.transpile.CSharpInnerClass;
import net.sf.saxon.value.Whitespace;
import net.sf.saxon.z.IntIterator;
import java.util.Arrays;
public class StringTool {
/**
 * Compute the XPath length of a string, i.e. the number of Unicode code points
 * rather than the number of UTF-16 code units reported by {@link CharSequence#length()}.
 *
 * <p>Every char in the high-surrogate range (D800 to DBFF) is left out of the count,
 * so a well-formed surrogate pair contributes exactly one to the result. Note that an
 * unpaired high surrogate therefore contributes nothing, while an unpaired low
 * surrogate contributes one.</p>
 *
 * @param s The string whose length is required
 * @return the length of the string in Unicode code points
 */
public static int getStringLength(/*@NotNull*/ CharSequence s) {
    final int units = s.length();
    // Start from the UTF-16 length and discount one unit for each high surrogate.
    int codePoints = units;
    for (int pos = 0; pos < units; pos++) {
        if (Character.isHighSurrogate(s.charAt(pos))) {
            codePoints--;
        }
    }
    return codePoints;
}
/**
 * Expand a string into a freshly allocated array holding one 32-bit value per character.
 *
 * <p>The array is sized from {@code s.length32()} and populated by
 * {@code s.copy32bit}, starting at index 0.</p>
 *
 * @param s the string to be expanded
 * @return an array of integers representing the Unicode code points
 */
public static int[] expand(UnicodeString s) {
    final int[] codePoints = new int[s.length32()];
    s.copy32bit(codePoints, 0);
    return codePoints;
}
/**
* Ask whether a string contains astral characters (represented as surrogate pairs)
* @param str the string to be tested
* @return true if the string contains surrogate characters
*/
// NOTE(review): the text of this method looks truncated in this copy. The loop header on
// the next line stops after "i" and resumes in the middle of a shift expression, as if
// everything between a less-than sign and a later greater-than sign was lost when the
// file was extracted. The remaining statements (packing a code point into three bytes of
// "triples" and returning a Twine24) appear to be the tail of a different routine, not of
// containsSurrogates. Restore both methods from the upstream source before relying on this.
public static boolean containsSurrogates(CharSequence str) {
for (int i=0; i> 16) & 0xff);
triples[j++] = (byte) ((cp >> 8) & 0xff);
triples[j++] = (byte) (cp & 0xff);
} else {
// Character below 0x10000: the high byte of the 24-bit cell is zero
triples[j++] = 0;
triples[j++] = (byte) ((c >> 8) & 0xff);
triples[j++] = (byte) (c & 0xff);
}
}
return new Twine24(triples);
}
}
// NOTE(review): the text of this method looks truncated in this copy. It starts by
// allocating one byte per char of the input, but the loop header two lines below stops
// after "i" and resumes inside a printable-ASCII range test, as if the text between a
// less-than sign and a later greater-than sign was lost in extraction. The statements
// that follow (appending to "fsb" and returning a String) cannot belong to a method
// declared to return UnicodeString; they appear to be the tail of a separate routine that
// writes characters outside 0x20..0x7e as a backslash-u escape with four upper-case hex
// digits. Restore both methods from the upstream source before relying on this.
public static UnicodeString fromLatin1(String str) {
byte[] bytes = new byte[str.length()];
for (int i = 0; i= 0x20 && c <= 0x7e) {
fsb.append(c);
} else {
fsb.append("\\u");
// Emit the four hex digits, most significant nibble first
for (int shift = 12; shift >= 0; shift -= 4) {
fsb.append("0123456789ABCDEF".charAt((c >> shift) & 0xF));
}
}
}
return fsb.toString();
}
/**
 * Insert a single character, given as a code point, at the start of a StringBuilder.
 *
 * <p>A code point above 0xFFFF is inserted as two chars obtained from
 * {@code UTF16CharacterSet.highSurrogate} and {@code UTF16CharacterSet.lowSurrogate};
 * any other value is cast to {@code char} and inserted as a single unit.</p>
 *
 * @param builder the string builder
 * @param ch the codepoint of the character to be inserted
 */
public static void prependWideChar(StringBuilder builder, int ch) {
    if (ch <= 0xffff) {
        // Fits in one UTF-16 code unit
        builder.insert(0, (char) ch);
        return;
    }
    final char high = UTF16CharacterSet.highSurrogate(ch);
    final char low = UTF16CharacterSet.lowSurrogate(ch);
    builder.insert(0, new char[]{high, low});
}
/**
 * Insert {@code count} copies of a character at the start of a StringBuilder.
 *
 * <p>The copies are built in a temporary char array and inserted with a single call,
 * so the existing content of the builder is shifted only once.</p>
 *
 * @param builder the string builder
 * @param ch the character to be inserted
 * @param count the number of repetitions
 */
public static void prependRepeated(StringBuilder builder, char ch, int count) {
    final char[] padding = new char[count];
    Arrays.fill(padding, ch);
    builder.insert(0, padding);
}
/**
* Insert repeated occurrences of a given character at the end of a StringBuilder
*
* @param builder the string builder
* @param ch the character to be inserted
* @param count the number of repetitions
*/
// NOTE(review): the text of this method looks truncated in this copy. The loop header on
// the next line stops after "i" and resumes with a descending-loop condition, as if the
// text between a less-than sign and a later greater-than sign was lost in extraction.
// The remaining statements (comparing str.codePointAt(i) with "codePoint" and returning
// an index or -1L) cannot belong to a void method; they appear to be the tail of a
// separate backwards search over a string. Restore both methods from the upstream source
// before relying on this.
public static void appendRepeated(StringBuilder builder, char ch, int count) {
for (int i=0; i=0; i--) {
if (str.codePointAt(i) == codePoint) {
return i;
}
}
// No position matched
return -1L;
}
/**
 * Build a UnicodeString from a slice of a char array, choosing the representation
 * according to the characters found in the slice:
 * <ul>
 *     <li>a zero-length slice yields the {@code EmptyUnicodeString} instance;</li>
 *     <li>if {@code compressWS} is set and every char satisfies {@code Whitespace.isWhite},
 *     the result comes from {@code CompressedWhitespace.compressWS};</li>
 *     <li>if every char is below 256, a {@code Twine8} with one byte per char;</li>
 *     <li>if there are no surrogates, a {@code Twine16} holding a copy of the slice;</li>
 *     <li>otherwise a {@code Twine24} with three bytes per code point, most significant
 *     byte first, each surrogate pair being combined into a single code point.</li>
 * </ul>
 *
 * @param in         the char array containing the text
 * @param offset     the start position of the substring we are interested in
 * @param len        the length of the substring we are interested in
 * @param compressWS set to true if whitespace compression is to be attempted
 * @return the compressed sequence if it can be compressed; or the uncompressed UnicodeString otherwise
 */
/*@NotNull*/
public static UnicodeString compress(char[] in, int offset, int len, boolean compressWS) {
    if (len == 0) {
        return EmptyUnicodeString.getInstance();
    }
    final int end = offset + len;
    int pos = offset;

    if (compressWS) {
        // Advance over the leading run of whitespace; stop at the first other character.
        boolean sawNonWhite = false;
        while (pos < end) {
            int c = in[pos];
            if (!Whitespace.isWhite(c)) {
                sawNonWhite = true;
                break;
            }
            pos++;
        }
        if (!sawNonWhite) {
            return CompressedWhitespace.compressWS(in, offset, end);
        }
    }

    // Scan the rest of the slice. Any chars skipped above were whitespace, so they can
    // neither push the OR-ed value past 255 nor be surrogates.
    int orBits = 255;
    int surrogateUnits = 0;
    for (; pos < end; pos++) {
        int c = in[pos];
        orBits |= c;
        if (UTF16CharacterSet.isSurrogate(c)) {
            surrogateUnits++;
        }
    }

    if (orBits < 256) {
        // Every char fits in a single byte
        byte[] narrow = new byte[len];
        for (int j = 0; j < len; j++) {
            narrow[j] = (byte) in[offset + j];
        }
        return new Twine8(narrow);
    }

    if (surrogateUnits == 0) {
        return new Twine16(Arrays.copyOfRange(in, offset, end));
    }

    // Two surrogate code units collapse into one 24-bit cell
    byte[] triples = new byte[3 * (len - surrogateUnits / 2)];
    int out = 0;
    int src = offset;
    while (src < end) {
        char c = in[src++];
        // For a non-surrogate char the top byte of the cell is necessarily zero
        int cp = UTF16CharacterSet.isSurrogate(c)
                ? UTF16CharacterSet.combinePair(c, in[src++])
                : c;
        triples[out++] = (byte) ((cp & 0xffffff) >> 16);
        triples[out++] = (byte) ((cp & 0xffff) >> 8);
        triples[out++] = (byte) (cp & 0xff);
    }
    return new Twine24(triples);
}
/**
* Copy from an array of 8-bit characters to an array holding 16-bit characters.
* The caller is responsible for ensuring that the offsets are in range and that the
* destination array is large enough.
* @param source the source array
* @param sourcePos the position in the source array where copying is to start
* @param dest the destination array
* @param destPos the position in the destination array where copying is to start
* @param count the number of characters (codepoints) to copy
*/
// NOTE(review): the text of this method looks truncated in this copy. The loop header
// below stops after "i" and resumes in the middle of a shift expression, as if the text
// between a less-than sign and a later greater-than sign was lost in extraction. The
// byte-valued stores that follow read a variable "c" that is not declared in the visible
// text, and appear to be the tail of a different copy routine. Restore from the upstream
// source before relying on this.
public static void copy8to16(byte[] source, int sourcePos, char[] dest, int destPos, int count) {
// Exclusive upper bound of the source range
int last = sourcePos + count;
for (int i=sourcePos, j=destPos; i> 8) & 0xff);
dest[j++] = (byte) (c & 0xff);
}
}
}