htsjdk.samtools.util.StringUtil Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of htsjdk Show documentation
htsjdk
There is a newer version: 4.1.3
/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package htsjdk.samtools.util;

import java.util.Arrays;
import java.util.Collection;
import java.util.List;

/**
 * Grab-bag of stateless String-oriented utilities.
 */
public class StringUtil {
    private static final byte UPPER_CASE_OFFSET = 'A' - 'a';

    /**
     * @param separator String to interject between each string in strings arg
     * @param objs List of objs to be joined
     * @return String that concatenates the result of each item's to String method for all items in objs, with separator between each of them.
     */
    public static  String join(final String separator, final Collection objs) {
        if (objs.isEmpty()) {
            return "";
        }

        boolean notFirst = false;

        final StringBuilder ret = new StringBuilder();
        for (final Object obj : objs) {
            if(notFirst) {
                ret.append(separator);
            }
            ret.append(obj.toString());
            notFirst = true;
        }
        return ret.toString();
    }

    public static  String join(final String separator, final T... objs) {
        final List values = Arrays.asList(objs);
        return join(separator, values);
    }


    /**
     * Split the string into tokens separated by the given delimiter.  Profiling has
     * revealed that the standard string.split() method typically takes > 1/2
     * the total time when used for parsing ascii files.
     * Note that if tokens arg is not large enough to all the tokens in the string, excess tokens are discarded.
     *
     * @param aString  the string to split
     * @param tokens an array to hold the parsed tokens
     * @param delim  character that delimits tokens
     * @return the number of tokens parsed
     */
    public static int split(final String aString, final String[] tokens, final char delim) {

        final int maxTokens = tokens.length;
        int nTokens = 0;
        int start = 0;
        int end = aString.indexOf(delim);
        if(end < 0) {
            tokens[nTokens++] = aString;
            return nTokens;
        }
        while ((end >= 0) && (nTokens < maxTokens))
        {
            tokens[nTokens++] = aString.substring(start, end);
            start = end + 1;
            end = aString.indexOf(delim, start);

        }
        // Add the trailing string,  if there is room and if it is not empty.
        if (nTokens < maxTokens)
        {
            final String trailingString = aString.substring(start);
            if (!trailingString.isEmpty())
            {
                tokens[nTokens++] = trailingString;
            }
        }
        return nTokens;
    }

    /**
     * Split the string into tokens separated by the given delimiter.  Profiling has
     * revealed that the standard string.split() method typically takes > 1/2
     * the total time when used for parsing ascii files.
     * Note that the string is split into no more elements than tokens arg will hold, so the final tokenized
     * element may contain delimiter chars.
     *
     * @param aString  the string to split
     * @param tokens an array to hold the parsed tokens
     * @param delim  character that delimits tokens
     * @return the number of tokens parsed
     */
    public static int splitConcatenateExcessTokens(final String aString, final String[] tokens, final char delim) {

        final int maxTokens = tokens.length;
        int nTokens = 0;
        int start = 0;
        int end = aString.indexOf(delim);
        if(end < 0) {
            tokens[nTokens++] = aString;
            return nTokens;
        }
        while ((end >= 0) && (nTokens < maxTokens - 1))
        {
            tokens[nTokens++] = aString.substring(start, end);
            start = end + 1;
            end = aString.indexOf(delim, start);

        }
        // Add the trailing string,  if it is not empty.
        final String trailingString = aString.substring(start);
        if (!trailingString.isEmpty())
        {
            tokens[nTokens++] = trailingString;
        }
        return nTokens;
    }

    /**
     * @param b ASCII character
     * @return lowercase version of arg if it was uppercase, otherwise returns arg
     */
    public static byte toLowerCase(final byte b) {
        if (b < 'A' || b > 'Z') {
            return b;
        }
        return (byte)(b - UPPER_CASE_OFFSET);
    }

    /**
     * @param b ASCII character
     * @return uppercase version of arg if it was lowercase, otherwise returns arg
     */
    public static byte toUpperCase(final byte b) {
        if (b < 'a' || b > 'z') {
            return b;
        }
        return (byte)(b + UPPER_CASE_OFFSET);
    }

    /**
     * Converts in place all lower case letters to upper case in the byte array provided.
     */
    public static void toUpperCase(final byte[] bytes) {
        final int length = bytes.length;
        for (int i=0; i= 'a' && bytes[i] <= 'z') {
                bytes[i] = (byte) (bytes[i] + UPPER_CASE_OFFSET);
            }
        }
    }


    /**
     * Checks that a String doesn't contain one or more characters of interest.
     *
     * @param illegalChars the String to check
     * @param chars the characters to check for
     * @return String the input String for convenience
     * @throws IllegalArgumentException if the String contains one or more of the characters
     */
    public static String assertCharactersNotInString(final String illegalChars, final char... chars) {
        for (final char illegalChar : illegalChars.toCharArray()) {
            for (final char ch: chars) {
                if (illegalChar == ch) {
                    throw new IllegalArgumentException("Supplied String contains illegal character '" + illegalChar + "'.");
                }
            }
        }

        return illegalChars;
    }

    /**
     * Return input string with newlines inserted to ensure that all lines
     * have length <= maxLineLength.  if a word is too long, it is simply broken
     * at maxLineLength.  Does not handle tabs intelligently (due to implementer laziness).
     */
    public static String wordWrap(final String s, final int maxLineLength) {
        final String[] lines = s.split("\n");
        final StringBuilder sb = new StringBuilder();
        for (final String line: lines) {
            if (sb.length() > 0) {
                sb.append('\n');
            }
            sb.append(wordWrapSingleLine(line, maxLineLength));
        }
        if (s.endsWith("\n")) {
            sb.append('\n');
        }
        return sb.toString();
    }

    public static String wordWrapSingleLine(final String s, final int maxLineLength) {
        if (s.length() <= maxLineLength) {
            return s;
        }
        final StringBuilder sb = new StringBuilder();
        int startCopyFrom = 0;
        while (startCopyFrom < s.length()) {
            int lastSpaceIndex = startCopyFrom;
            int i;
            // Find break point (if it exists)
            for (i = startCopyFrom; i < s.length() && i - startCopyFrom < maxLineLength; ++i) {
                if (Character.isWhitespace(s.charAt(i))) {
                    lastSpaceIndex = i;
                }
            }
            if (i - startCopyFrom < maxLineLength) {
                lastSpaceIndex = i;
            }
            // Include any trailing whitespace
            for (; lastSpaceIndex < s.length() && Character.isWhitespace(s.charAt(lastSpaceIndex)); ++lastSpaceIndex) {}
            if (sb.length() > 0) {
                sb.append('\n');
            }
            // Handle situation in which there is no word break.  Just break the word in the middle.
            if (lastSpaceIndex == startCopyFrom) {
                lastSpaceIndex = i;
            }
            sb.append(s.substring(startCopyFrom, lastSpaceIndex));
            startCopyFrom = lastSpaceIndex;
        }
        return sb.toString();
    }


    public static String intValuesToString(final int[] intVals) {
        final StringBuilder sb = new StringBuilder(intVals.length);
        if(intVals.length > 0) {
            sb.append(String.valueOf(intVals[0]));
            for(int i = 1; i < intVals.length; i++) {
                sb.append(", ");
                sb.append(String.valueOf(intVals[i]));
            }
        }

        return sb.toString();
    }

    public static String intValuesToString(final short[] shortVals) {
        final StringBuilder sb = new StringBuilder(shortVals.length);
        if(shortVals.length > 0) {
            sb.append(String.valueOf(shortVals[0]));
            for(int i = 1; i < shortVals.length; i++) {
                sb.append(", ");
                sb.append(String.valueOf(shortVals[i]));
            }
        }

        return sb.toString();
    }

    ////////////////////////////////////////////////////////////////////
    // The following methods all convert btw bytes and Strings, without
    // using the Java character set mechanism.
    ////////////////////////////////////////////////////////////////////

    public static String bytesToString(final byte[] data) {
        if (data == null) {
            return null;
        }
        return bytesToString(data, 0, data.length);
    }

    @SuppressWarnings("deprecation")
    public static String bytesToString(final byte[] buffer, final int offset, final int length) {
/*
        The non-deprecated way, that requires allocating char[]
        final char[] charBuffer = new char[length];
        for (int i = 0; i < length; ++i) {
            charBuffer[i] = (char)buffer[i+offset];
        }
        return new String(charBuffer);
*/
        return new String(buffer, 0, offset, length);
    }

    @SuppressWarnings("deprecation")
    public static byte[] stringToBytes(final String s) {
/*
        The non-deprecated way, that requires allocating char[]
        final byte[] byteBuffer = new byte[s.length()];
        final char[] charBuffer = s.toCharArray();
        for (int i = 0; i < charBuffer.length; ++i) {
            byteBuffer[i] = (byte)(charBuffer[i] & 0xff);
        }
        return byteBuffer;
*/
        if (s == null) {
            return null;
        }
        final byte[] byteBuffer = new byte[s.length()];
        s.getBytes(0, byteBuffer.length, byteBuffer, 0);
        return byteBuffer;
    }

    @SuppressWarnings("deprecation")
    public static byte[] stringToBytes(final String s, final int offset, final int length) {
        if (s == null) {
            return null;
        }
        final byte[] byteBuffer = new byte[length];
        s.getBytes(offset, offset + length, byteBuffer, 0);
        return byteBuffer;
    }

    // This method might more appropriately live in BinaryCodec, but all the byte <=> char conversion
    // should be in the same place.
    public static String readNullTerminatedString(final BinaryCodec binaryCodec) {
        final StringBuilder ret = new StringBuilder();
        for (byte b = binaryCodec.readByte(); b != 0; b = binaryCodec.readByte()) {
            ret.append((char)(b & 0xff));
        }
        return ret.toString();
    }

    /**
     * Convert chars to bytes merely by casting
     * @param chars input chars
     * @param charOffset where to start converting from chars array
     * @param length how many chars to convert
     * @param bytes where to put the converted output
     * @param byteOffset where to start writing the converted output.
     */
    public static void charsToBytes(final char[] chars, final int charOffset, final int length,
                                    final byte[] bytes, final int byteOffset) {
        for (int i = 0; i < length; ++i) {
            bytes[byteOffset + i] = (byte)chars[charOffset + i];
        }
    }

    /**
     * Convert ASCII char to byte.
     */
    public static byte charToByte(final char c) {
        return (byte)c;
    }

    /**
     * Convert ASCII byte to ASCII char.
     */
    public static char byteToChar(final byte b) {
        return (char)(b & 0xff);
    }

    /**
     * Convert a byte array into a String hex representation.
     * @param data Input to be converted.
     * @return String twice as long as data.length with hex representation of data.
     */
    public static String bytesToHexString(final byte[] data) {
        final char[] chars = new char[2 * data.length];
        for (int i = 0; i < data.length; i++) {
            final byte b = data[i];
            chars[2*i] = toHexDigit((b >> 4) & 0xF);
            chars[2*i+1] = toHexDigit(b & 0xF);
        }
        return new String(chars);
    }

    /**
     * Convert a String containing hex characters into an array of bytes with the binary representation
     * of the hex string
     * @param s Hex string.  Length must be even because each pair of hex chars is converted into a byte.
     * @return byte array with binary representation of hex string.
     * @throws NumberFormatException
     */
    public static byte[] hexStringToBytes(final String s)  throws NumberFormatException {
        if (s.length() % 2 != 0) {
            throw new NumberFormatException("Hex representation of byte string does not have even number of hex chars: " + s);
        }
        final byte[] ret = new byte[s.length() / 2];
        for (int i = 0; i < ret.length; ++i) {
            ret[i] = (byte) ((fromHexDigit(s.charAt(i * 2)) << 4) | fromHexDigit(s.charAt(i * 2 + 1)));
        }
        return ret;
    }

    public static char toHexDigit(final int value) {
        return (char) ((value < 10) ? ('0' + value) : ('A' + value - 10));
    }

    public static int fromHexDigit(final char c) throws NumberFormatException {
        final int ret = Character.digit(c, 16);
        if (ret == -1) {
            throw new NumberFormatException("Not a valid hex digit: " + c);
        }
        return ret;
    }

    /**
     * Reverse the given string.  Does not check for null.
     * @param s String to be reversed.
     * @return New string that is the reverse of the input string.
     */
    public static String reverseString(final String s) {
        final StringBuilder sb = new StringBuilder(s);
        sb.reverse();
        return sb.toString();
    }

    /**
     * Checks if a String is whitespace, empty ("") or null.
     *
     *      * StringUtils.isBlank(null)      = true
     * StringUtils.isBlank("")        = true
     * StringUtils.isBlank(" ")       = true
     * StringUtils.isBlank("sam")     = false
     * StringUtils.isBlank("  sam  ") = false
     * 
     *
     * @param str  the String to check, may be null
     * @return true if the String is null, empty or whitespace
     */
    public static boolean isBlank(String str) {
        int strLen;
        if (str == null || (strLen = str.length()) == 0) {
            return true;
        }
        for (int i = 0; i < strLen; i++) {
            if (!Character.isWhitespace(str.charAt(i)) ) {
                return false;
            }
        }
        return true;
    }

     /* Generates a string of one character to a specified length
     *
     * @param c  the Character to repeat
     * @param repeatNumber the number of times to repeat the character
     * @return String with the character c repeated repeatNumber times
     */
    public static String repeatCharNTimes(char c, int repeatNumber) {
        char[] output = new char[repeatNumber];
        Arrays.fill(output, c);
        return String.valueOf(output);
    }

    /** Returns {@link Object#toString()} of the provided value if it isn't null; "" otherwise. */
    public static final String EMPTY_STRING = "";
    public static String asEmptyIfNull(final Object string) {
        return string == null ? EMPTY_STRING : string.toString();
    }

    /*
    * This is from GIT!
    *  This function implements the Damerau-Levenshtein algorithm to
    * calculate a distance between strings.
    *
    * Basically, it says how many letters need to be swapped, substituted,
    * deleted from, or added to string1, at least, to get string2.
    *
    * The idea is to build a distance matrix for the substrings of both
    * strings.  To avoid a large space complexity, only the last three rows
    * are kept in memory (if swaps had the same or higher cost as one deletion
    * plus one insertion, only two rows would be needed).
    *
    * At any stage, "i + 1" denotes the length of the current substring of
    * string1 that the distance is calculated for.
    *
    * row2 holds the current row, row1 the previous row (i.e. for the substring
    * of string1 of length "i"), and row0 the row before that.
    *
    * In other words, at the start of the big loop, row2[j + 1] contains the
    * Damerau-Levenshtein distance between the substring of string1 of length
    * "i" and the substring of string2 of length "j + 1".
    *
    * All the big loop does is determine the partial minimum-cost paths.
    *
    * It does so by calculating the costs of the path ending in characters
    * i (in string1) and j (in string2), respectively, given that the last
    * operation is a substitution, a swap, a deletion, or an insertion.
    *
    * This implementation allows the costs to be weighted:
    *
    * Note that this algorithm calculates a distance _iff_ d == a.
    */
    public static int levenshteinDistance(final String string1, final String string2, int swap, int substitution, int insertion, int deletion) {
        int i, j;

        int[] row0 = new int[(string2.length() + 1)];
        int[] row1 = new int[(string2.length() + 1)];
        int[] row2 = new int[(string2.length() + 1)];
        int[] dummy;

        final byte[] str1 = string1.getBytes();
        final byte[] str2 = string2.getBytes();

        for (j = 0; j < str2.length; j++) {
            row1[j] = j * insertion;
        }
        for (i = 0; i < str1.length; i++) {
            row2[0] = (i + 1) * deletion;
            for (j = 0; j < str2.length; j++) {
                /* substitution */
                row2[j + 1] = row1[j];
                if (str1[i] != str2[j]) {
                    row2[j + 1] += substitution;
                }
                /* swap */
                if (i > 0 && j > 0 && str1[i - 1] == str2[j] &&
                        str1[i] == str2[j - 1] &&
                        row2[j + 1] > row0[j - 1] + swap) {
                    row2[j + 1] = row0[j - 1] + swap;
                }
                /* deletion */
                if (row2[j + 1] > row1[j + 1] + deletion) {
                    row2[j + 1] = row1[j + 1] + deletion;
                }
                /* insertion */
                if (row2[j + 1] > row2[j] + insertion) {
                    row2[j + 1] = row2[j] + insertion;
                }
            }

            dummy = row0;
            row0 = row1;
            row1 = row2;
            row2 = dummy;
        }

        i = row1[str2.length];

        return i;
    }

    /**
     * Calculates the Hamming distance (number of character mismatches) between two strings s1 and s2.
     * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if
     * the two strings are of different lengths.  Hamming distance is case sensitive and does not have
     * any special treatment for DNA.
     *
     * @param s1 The first string to compare
     * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical.
     * @return Hamming distance between s1 and s2.
     * @throws IllegalArgumentException If the two strings have differing lengths.
     */
    public static int hammingDistance(final String s1, final String s2) {
        if (s1.length() != s2.length()) {
            throw new IllegalArgumentException("Attempted to determine Hamming distance of strings with differing lengths. " +
                    "The first string has length " + s1.length() + " and the second string has length " + s2.length() + ".");
        }
        int measuredDistance = 0;
        for (int i = 0;i < s1.length();i++) {
            if (s1.charAt(i) != s2.charAt(i)) {
                measuredDistance++;
            }
        }
        return measuredDistance;
    }

    /**
     * Determines if two strings s1 and s2 are within maxHammingDistance of each other using the Hamming distance metric.
     * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if
     * the two strings are of different lengths.  Hamming distance is case sensitive and does not have any
     * special treatment for DNA.
     *
     * @param s1 The first string to compare
     * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical.
     * @param maxHammingDistance The largest Hamming distance the strings can have for this function to return true.
     * @return true if the two strings are within maxHammingDistance of each other, false otherwise.
     * @throws IllegalArgumentException If the two strings have differing lengths.
     */
    public static boolean isWithinHammingDistance(final String s1, final String s2, final int maxHammingDistance) {
        if (s1.length() != s2.length()) {
            throw new IllegalArgumentException("Attempted to determine if two strings of different length were within a specified edit distance.");
        }
        int measuredDistance = 0;
        for (int i = 0;i < s1.length();i++) {
            if (s1.charAt(i) != s2.charAt(i)) {
                measuredDistance++;
                // If the measuredDistance is larger than the maxHammingDistance we can short circuit and return
                // false, there is no need to continue evaluating the distance.
                if (measuredDistance > maxHammingDistance) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Takes a long value representing the number of bytes and produces a human readable byte count.
     * @param bytes The number of bytes to create a human readable string for.
     * @return A human readable string of the number of bytes given.
     */
    public static String humanReadableByteCount(long bytes) {
        if (bytes < 1024) {
            return bytes + " B";
        }

        int exp = (int) (Math.log(bytes) / Math.log(1024));
        return String.format("%.1f %sB", bytes / Math.pow(1024, exp), "kMGTPE".charAt(exp - 1));
    }
}