All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.text.StringUtilities Maven / Gradle / Ivy

// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.text;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.List;
import java.io.ByteArrayOutputStream;
import java.util.Set;

/**
 * Escapes strings into and out of a format where they only contain printable ASCII characters,
 * plus a few small string helpers (join, run truncation, suffix stripping, quoting, splitting).
 *
 * The escape / unescape logic duplicates what we have in C++, for the Java version of system states.
 *
 * @author Haakon Humberset
 */
// TODO: Text utilities which are still needed should move to Text. This should be deprecated.
public class StringUtilities {

    private static final Charset UTF8 = StandardCharsets.UTF_8;

    /** Escape kind: the byte is written unchanged. */
    private static final byte NO_ESCAPE = 0;
    /** Escape kind: the byte is written as a backslash followed by one mnemonic character. */
    private static final byte SIMPLE_ESCAPE = 1;
    /** Escape kind: the byte is written as backslash, 'x' and two lower case hex digits. */
    private static final byte HEX_ESCAPE = 3;

    /** Returns the lower case ASCII hex digit for a value in the range 0-15. */
    private static byte toHex(int val) { return (byte) (val < 10 ? '0' + val : 'a' + (val - 10)); }

    /**
     * Returns the numeric value of an ASCII hex digit (either case).
     *
     * @throws NumberFormatException if the byte is not one of 0-9, a-f or A-F
     */
    private static int hexValue(byte digit) {
        if (digit >= '0' && digit <= '9') return digit - '0';
        if (digit >= 'a' && digit <= 'f') return digit - 'a' + 10;
        if (digit >= 'A' && digit <= 'F') return digit - 'A' + 10;
        throw new NumberFormatException("Illegal hex digit '" + ((char) (digit & 0xFF)) + "' in \\x escape sequence");
    }

    /**
     * Lookup tables, indexed by unsigned byte value, describing how each byte is escaped.
     * Printable ASCII (32-126) is left alone, a handful of characters get a two-character
     * backslash escape, and every other byte value gets a hex escape.
     */
    private static class ReplacementCharacters {

        /** One of NO_ESCAPE, SIMPLE_ESCAPE or HEX_ESCAPE for each byte value. */
        final byte[] needEscape = new byte[256];
        /** First replacement byte: a backslash for simple escapes, the high hex digit for hex escapes. */
        final byte[] replacement1 = new byte[256];
        /** Second replacement byte: the mnemonic for simple escapes, the low hex digit for hex escapes. */
        final byte[] replacement2 = new byte[256];

        ReplacementCharacters() {
            for (int i = 0; i < 256; ++i) {
                if (i >= 32 && i <= 126) {
                    needEscape[i] = NO_ESCAPE;
                } else {
                    needEscape[i] = HEX_ESCAPE;
                    replacement1[i] = toHex((i >> 4) & 0xF);
                    replacement2[i] = toHex(i & 0xF);
                }
            }
            // These override the defaults set above, so they must come after the loop.
            makeSimpleEscape('"', '"');
            makeSimpleEscape('\\', '\\');
            makeSimpleEscape('\t', 't');
            makeSimpleEscape('\n', 'n');
            makeSimpleEscape('\r', 'r');
            makeSimpleEscape('\f', 'f');
        }

        /** Registers that source is to be written as a backslash followed by dest. */
        private void makeSimpleEscape(char source, char dest) {
            needEscape[source] = SIMPLE_ESCAPE;
            replacement1[source] = '\\';
            replacement2[source] = (byte) dest;
        }
    }

    private final static ReplacementCharacters replacementCharacters = new ReplacementCharacters();

    /**
     * Escapes a string into a format with only printable ASCII characters, without any extra delimiter.
     *
     * @param source The string to escape
     * @return The escaped string
     */
    public static String escape(String source) { return escape(source, '\0'); }

    /**
     * Escapes strings into a format with only printable ASCII characters.
     * The string is encoded as UTF-8 and escaped byte by byte, so a non-ASCII character
     * becomes one hex escape per UTF-8 byte.
     *
     * @param source The string to escape
     * @param delimiter Escape this character too, even if it is printable. It is always written as a hex escape.
     * @return The escaped string
     */
    public static String escape(String source, char delimiter) {
        byte[] bytes = source.getBytes(UTF8);
        ByteArrayOutputStream result = new ByteArrayOutputStream();
        for (byte b : bytes) {
            int val = b & 0xFF; // unsigned value, used as table index
            if (b == delimiter) {
                result.write('\\');
                result.write('x');
                result.write(toHex((val >> 4) & 0xF));
                result.write(toHex(val & 0xF));
            } else if (replacementCharacters.needEscape[val] == NO_ESCAPE) {
                result.write(b);
            } else {
                if (replacementCharacters.needEscape[val] == HEX_ESCAPE) {
                    result.write('\\');
                    result.write('x');
                }
                result.write(replacementCharacters.replacement1[val]);
                result.write(replacementCharacters.replacement2[val]);
            }
        }
        return result.toString(UTF8);
    }

    /**
     * Reverses {@link #escape(String, char)}: decodes backslash escapes and hex escapes
     * back into the bytes they represent, and interprets the resulting bytes as UTF-8.
     *
     * @param source The escaped string
     * @return The unescaped string
     * @throws IllegalArgumentException if the input ends in the middle of an escape sequence,
     *         contains an unknown escape character, or has a hex escape with non-hex digits
     *         (the latter is reported as a NumberFormatException)
     */
    public static String unescape(String source) {
        byte[] bytes = source.getBytes(UTF8);
        ByteArrayOutputStream result = new ByteArrayOutputStream();
        for (int i = 0; i < bytes.length; ++i) {
            if (bytes[i] != '\\') {
                result.write(bytes[i]);
                continue;
            }
            if (i + 1 == bytes.length) throw new IllegalArgumentException("Found backslash at end of input");

            if (bytes[i + 1] != (byte) 'x') {
                switch (bytes[i + 1]) {
                    case '\\' -> result.write('\\');
                    case '"' -> result.write('"');
                    case 't' -> result.write('\t');
                    case 'n' -> result.write('\n');
                    case 'r' -> result.write('\r');
                    case 'f' -> result.write('\f');
                    default -> throw new IllegalArgumentException("Illegal escape sequence \\" + ((char) bytes[i + 1]) + " found");
                }
                ++i; // skip the escape character
                continue;
            }

            // A hex escape needs two digits after the 'x'
            if (i + 3 >= bytes.length) throw new IllegalArgumentException("Found \\x at end of input");

            // Validate each digit explicitly: a radix-16 integer parse would also accept a sign character.
            result.write((hexValue(bytes[i + 2]) << 4) | hexValue(bytes[i + 3]));
            i += 3; // skip 'x' and both digits
        }
        return result.toString(UTF8);
    }

    /**
     * Returns the given array flattened to string, with the given separator string
     * between the elements. Null elements are rendered as "null".
     *
     * @param array the array
     * @param sepString or null
     * @return imploded array, or null if the array is null
     */
    public static String implode(String[] array, String sepString) {
        if (array == null) return null;
        return String.join(sepString == null ? "" : sepString, array);
    }

    /**
     * Returns the given list flattened to one string, with a newline between the elements.
     *
     * @param lines the lines to join
     * @return flattened string, or null if the list is null
     */
    public static String implodeMultiline(List<String> lines) {
        if (lines == null) return null;
        return implode(lines.toArray(new String[0]), "\n");
    }

    /**
     * This will truncate sequences in a string of the same character that exceed the maximum
     * allowed length. The first character of a run is always kept.
     *
     * @param text the text to check
     * @param maxConsecutiveLength the longest run of one character to keep
     * @return The same string or a new one if truncation is done.
     */
    public static String truncateSequencesIfNecessary(String text, int maxConsecutiveLength) {
        char prev = 0;
        int sequenceCount = 1;
        for (int i = 0, m = text.length(); i < m ; i++) {
            char curr = text.charAt(i);
            // i > 0: the first character never continues a run, even when it equals the initial value of prev
            if (i > 0 && prev == curr) {
                sequenceCount++;
                if (sequenceCount > maxConsecutiveLength) {
                    return truncateSequences(text, maxConsecutiveLength, i);
                }
            } else {
                sequenceCount = 1;
                prev = curr;
            }
        }
        return text;
    }

    /**
     * Does the actual truncation once a too long run has been found.
     *
     * @param text the text to truncate
     * @param maxConsecutiveLength the longest run of one character to keep
     * @param firstTruncationPos index of the first character to drop; everything before it is kept as is
     * @return the truncated string
     */
    private static String truncateSequences(String text, int maxConsecutiveLength, int firstTruncationPos) {
        char [] truncated = text.toCharArray();
        char prev = truncated[firstTruncationPos];
        int sequenceCount = maxConsecutiveLength + 1;
        int wp = firstTruncationPos; // write position; the character here is dropped by being overwritten
        for (int rp = wp + 1; rp < truncated.length; rp++) {
            char curr = truncated[rp];
            if (prev == curr) {
                sequenceCount++;
                if (sequenceCount <= maxConsecutiveLength) {
                    truncated[wp++] = curr;
                }
            } else {
                truncated[wp++] = curr;
                sequenceCount = 1;
                prev = curr;
            }
        }
        return String.copyValueOf(truncated, 0, wp);
    }

    /**
     * Removes the given suffix from the end of the string, if the string ends with it.
     *
     * @param string the string to strip
     * @param suffix the suffix to remove
     * @return the string without the suffix, or the string itself if it does not end with the suffix
     */
    public static String stripSuffix(String string, String suffix) {
        // Only a match at the very end counts; an occurrence earlier in the string must not cut it short.
        if ( ! string.endsWith(suffix)) return string;
        return string.substring(0, string.length() - suffix.length());
    }

    /**
     * Adds single quotes around object.toString
     * Example:  '12'
     */
    public static String quote(Object object) {
        return "'" + object.toString() + "'";
    }

    /**
     * Splits a string on both whitespace and comma, dropping empty items.
     *
     * @param s the string to split, or null
     * @return an unmodifiable set of the non-empty items; empty if s is null or empty
     */
    public static Set<String> split(String s) {
        if (s == null || s.isEmpty()) return Set.of();
        Set<String> b = new HashSet<>();
        for (String item : s.split("[\\s,]"))
            if ( ! item.isEmpty())
                b.add(item);
        return Set.copyOf(b);
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy