com.yahoo.text.StringUtilities Maven / Gradle / Ivy
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.text;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.List;
import java.io.ByteArrayOutputStream;
import java.util.Set;
/**
* Escapes strings into and out of a format where they only contain printable characters.
*
* Need to duplicate escape / unescape of strings as we have in C++ for java version of system states.
*
* @author Haakon Humberset
*/
// TODO: Text utilities should which are still needed should move to Text. This should be deprecated.
public class StringUtilities {
private static final Charset UTF8 = StandardCharsets.UTF_8;
private static byte toHex(int val) { return (byte) (val < 10 ? '0' + val : 'a' + (val - 10)); }
private static class ReplacementCharacters {
public byte[] needEscape = new byte[256];
public byte[] replacement1 = new byte[256];
public byte[] replacement2 = new byte[256];
public ReplacementCharacters() {
for (int i=0; i<256; ++i) {
if (i >= 32 && i <= 126) {
needEscape[i] = 0;
} else {
needEscape[i] = 3;
replacement1[i] = toHex((i >> 4) & 0xF);
replacement2[i] = toHex(i & 0xF);
}
}
makeSimpleEscape('"', '"');
makeSimpleEscape('\\', '\\');
makeSimpleEscape('\t', 't');
makeSimpleEscape('\n', 'n');
makeSimpleEscape('\r', 'r');
makeSimpleEscape('\f', 'f');
}
private void makeSimpleEscape(char source, char dest) {
needEscape[source] = 1;
replacement1[source] = '\\';
replacement2[source] = (byte) dest;
}
}
private final static ReplacementCharacters replacementCharacters = new ReplacementCharacters();
public static String escape(String source) { return escape(source, '\0'); }
/**
* Escapes strings into a format with only printable ASCII characters.
*
* @param source The string to escape
* @param delimiter Escape this character too, even if it is printable.
* @return The escaped string
*/
public static String escape(String source, char delimiter) {
byte[] bytes = source.getBytes(UTF8);
ByteArrayOutputStream result = new ByteArrayOutputStream();
for (byte b : bytes) {
int val = b;
if (val < 0) val += 256;
if (b == delimiter) {
result.write('\\');
result.write('x');
result.write(toHex((val >> 4) & 0xF));
result.write(toHex(val & 0xF));
} else if (replacementCharacters.needEscape[val] == 0) {
result.write(b);
} else {
if (replacementCharacters.needEscape[val] == 3) {
result.write('\\');
result.write('x');
}
result.write(replacementCharacters.replacement1[val]);
result.write(replacementCharacters.replacement2[val]);
}
}
return result.toString(UTF8);
}
public static String unescape(String source) {
byte[] bytes = source.getBytes(UTF8);
ByteArrayOutputStream result = new ByteArrayOutputStream();
for (int i=0; i result.write('\\');
case '"' -> result.write('"');
case 't' -> result.write('\t');
case 'n' -> result.write('\n');
case 'r' -> result.write('\r');
case 'f' -> result.write('\f');
default -> throw new IllegalArgumentException("Illegal escape sequence \\" + ((char) bytes[i + 1]) + " found");
}
++i;
continue;
}
if (i + 3 >= bytes.length) throw new IllegalArgumentException("Found \\x at end of input");
String hexdigits = "" + ((char) bytes[i + 2]) + ((char) bytes[i + 3]);
result.write((byte) Integer.parseInt(hexdigits, 16));
i += 3;
}
return result.toString(UTF8);
}
/**
* Returns the given array flattened to string, with the given separator string
* @param array the array
* @param sepString or null
* @return imploded array
*/
public static String implode(String[] array, String sepString) {
if (array==null) return null;
StringBuilder ret = new StringBuilder();
if (sepString==null) sepString="";
for (int i = 0 ; i lines) {
if (lines==null) return null;
return implode(lines.toArray(new String[0]), "\n");
}
/**
* This will truncate sequences in a string of the same character that exceed the maximum
* allowed length.
*
* @return The same string or a new one if truncation is done.
*/
public static String truncateSequencesIfNecessary(String text, int maxConsecutiveLength) {
char prev = 0;
int sequenceCount = 1;
for (int i = 0, m = text.length(); i < m ; i++) {
char curr = text.charAt(i);
if (prev == curr) {
sequenceCount++;
if (sequenceCount > maxConsecutiveLength) {
return truncateSequences(text, maxConsecutiveLength, i);
}
} else {
sequenceCount = 1;
prev = curr;
}
}
return text;
}
private static String truncateSequences(String text, int maxConsecutiveLength, int firstTruncationPos) {
char [] truncated = text.toCharArray();
char prev = truncated[firstTruncationPos];
int sequenceCount = maxConsecutiveLength + 1;
int wp=firstTruncationPos;
for (int rp=wp+1; rp < truncated.length; rp++) {
char curr = truncated[rp];
if (prev == curr) {
sequenceCount++;
if (sequenceCount <= maxConsecutiveLength) {
truncated[wp++] = curr;
}
} else {
truncated[wp++] = curr;
sequenceCount = 1;
prev = curr;
}
}
return String.copyValueOf(truncated, 0, wp);
}
public static String stripSuffix(String string, String suffix) {
int index = string.lastIndexOf(suffix);
return index == -1 ? string : string.substring(0, index);
}
/**
* Adds single quotes around object.toString
* Example: '12'
*/
public static String quote(Object object) {
return "'" + object.toString() + "'";
}
/** Splits a string on both space and comma */
public static Set split(String s) {
if (s == null || s.isEmpty()) return Set.of();
Set b = new HashSet<>();
for (String item : s.split("[\\s,]"))
if ( ! item.isEmpty())
b.add(item);
return Set.copyOf(b);
}
}