All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.cedarsoftware.util.StringUtilities Maven / Gradle / Ivy

The newest version!
package com.cedarsoftware.util;

import java.io.UnsupportedEncodingException;
import java.util.Optional;
import java.util.Random;

import static java.lang.Character.toLowerCase;

/**
 * Useful String utilities for common tasks
 *
 * @author Ken Partlow
 * @author John DeRegnaucourt ([email protected])
 *         
* Copyright (c) Cedar Software LLC *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at *

* License *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ public final class StringUtilities { private static char[] _hex = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; public static String FOLDER_SEPARATOR = "/"; public static String EMPTY = ""; /** *

Constructor is declared private since all methods are static.

*/ private StringUtilities() { } /** * Compares two CharSequences, returning {@code true} if they represent * equal sequences of characters. * *

{@code null}s are handled without exceptions. Two {@code null} * references are considered to be equal. The comparison is case-sensitive.

* * @param cs1 the first CharSequence, may be {@code null} * @param cs2 the second CharSequence, may be {@code null} * @return {@code true} if the CharSequences are equal (case-sensitive), or both {@code null} * @see #equalsIgnoreCase(CharSequence, CharSequence) */ public static boolean equals(CharSequence cs1, CharSequence cs2) { if (cs1 == cs2) { return true; } if (cs1 == null || cs2 == null) { return false; } if (cs1.length() != cs2.length()) { return false; } if (cs1 instanceof String && cs2 instanceof String) { return cs1.equals(cs2); } // Step-wise comparison int length = cs1.length(); for (int i = 0; i < length; i++) { if (cs1.charAt(i) != cs2.charAt(i)) { return false; } } return true; } /** * @see StringUtilities#equals(CharSequence, CharSequence) */ public static boolean equals(String s1, String s2) { return equals((CharSequence) s1, (CharSequence) s2); } /** * Compares two CharSequences, returning {@code true} if they represent * equal sequences of characters, ignoring case. * *

{@code null}s are handled without exceptions. Two {@code null} * references are considered equal. The comparison is case insensitive.

* * @param cs1 the first CharSequence, may be {@code null} * @param cs2 the second CharSequence, may be {@code null} * @return {@code true} if the CharSequences are equal (case-insensitive), or both {@code null} * @see #equals(CharSequence, CharSequence) */ public static boolean equalsIgnoreCase(CharSequence cs1, CharSequence cs2) { if (cs1 == cs2) { return true; } if (cs1 == null || cs2 == null) { return false; } if (cs1.length() != cs2.length()) { return false; } return regionMatches(cs1, true, 0, cs2, 0, cs1.length()); } /** * @see StringUtilities#equalsIgnoreCase(CharSequence, CharSequence) */ public static boolean equalsIgnoreCase(String s1, String s2) { return equalsIgnoreCase((CharSequence) s1, (CharSequence) s2); } /** * Green implementation of regionMatches. * * @param cs the {@link CharSequence} to be processed * @param ignoreCase whether to be case-insensitive * @param thisStart the index to start on the {@code cs} CharSequence * @param substring the {@link CharSequence} to be looked for * @param start the index to start on the {@code substring} CharSequence * @param length character length of the region * @return whether the region matched */ static boolean regionMatches(CharSequence cs, boolean ignoreCase, int thisStart, CharSequence substring, int start, int length) { Convention.throwIfNull(cs, "cs to be processed cannot be null"); Convention.throwIfNull(substring, "substring cannot be null"); if (cs instanceof String && substring instanceof String) { return ((String) cs).regionMatches(ignoreCase, thisStart, (String) substring, start, length); } int index1 = thisStart; int index2 = start; int tmpLen = length; // Extract these first so we detect NPEs the same as the java.lang.String version int srcLen = cs.length() - thisStart; int otherLen = substring.length() - start; // Check for invalid parameters if (thisStart < 0 || start < 0 || length < 0) { return false; } // Check that the regions are long enough if (srcLen < length || otherLen < length) { return false; } while (tmpLen-- > 0) { char c1 = cs.charAt(index1++); char c2 = substring.charAt(index2++); if (c1 == c2) { continue; } if (!ignoreCase) { return false; } // The real same check as in String.regionMatches(): char u1 = Character.toUpperCase(c1); char u2 = Character.toUpperCase(c2); if (u1 != u2 && toLowerCase(u1) != toLowerCase(u2)) { return false; } } return true; } public static boolean equalsWithTrim(String s1, String s2) { if (s1 == null || s2 == null) { return s1 == s2; } return s1.trim().equals(s2.trim()); } public static boolean equalsIgnoreCaseWithTrim(String s1, String s2) { if (s1 == null || s2 == null) { return s1 == s2; } return s1.trim().equalsIgnoreCase(s2.trim()); } /** * Checks if a CharSequence is empty (""), null, or only whitespace. * * @param cs the CharSequence to check, may be null * @return {@code true} if the CharSequence is empty or null */ public static boolean isEmpty(CharSequence cs) { return isWhitespace(cs); } /** * @see StringUtilities#isEmpty(CharSequence) */ public static boolean isEmpty(String s) { return isWhitespace(s); } /** * Checks if a CharSequence is empty (""), null or whitespace only. * * @param cs the CharSequence to check, may be null * @return {@code true} if the CharSequence is null, empty or whitespace only */ public static boolean isWhitespace(CharSequence cs) { int strLen = length(cs); if (strLen == 0) { return true; } for (int i = 0; i < strLen; i++) { if (!Character.isWhitespace(cs.charAt(i))) { return false; } } return true; } /** * Checks if a String is not empty (""), not null and not whitespace only. * * @param s the CharSequence to check, may be null * @return {@code true} if the CharSequence is * not empty and not null and not whitespace only */ public static boolean hasContent(String s) { return !isWhitespace(s); } /** * Gets a CharSequence length or {@code 0} if the CharSequence is {@code null}. * * @param cs a CharSequence or {@code null} * @return CharSequence length or {@code 0} if the CharSequence is {@code null}. */ public static int length(CharSequence cs) { return cs == null ? 0 : cs.length(); } /** * @see StringUtilities#length(CharSequence) */ public static int length(String s) { return s == null ? 0 : s.length(); } /** * @param s a String or {@code null} * @return the trimmed length of the String or 0 if the string is null. */ public static int trimLength(String s) { return trimToEmpty(s).length(); } public static int lastIndexOf(String path, char ch) { if (path == null) { return -1; } return path.lastIndexOf(ch); } // Turn hex String into byte[] // If string is not even length, return null. public static byte[] decode(String s) { int len = s.length(); if (len % 2 != 0) { return null; } byte[] bytes = new byte[len / 2]; int pos = 0; for (int i = 0; i < len; i += 2) { byte hi = (byte) Character.digit(s.charAt(i), 16); byte lo = (byte) Character.digit(s.charAt(i + 1), 16); bytes[pos++] = (byte) (hi * 16 + lo); } return bytes; } /** * Convert a byte array into a printable format containing a * String of hex digit characters (two per byte). * * @param bytes array representation */ public static String encode(byte[] bytes) { StringBuilder sb = new StringBuilder(bytes.length << 1); for (byte aByte : bytes) { sb.append(convertDigit(aByte >> 4)); sb.append(convertDigit(aByte & 0x0f)); } return sb.toString(); } /** * Convert the specified value (0 .. 15) to the corresponding hex digit. * * @param value to be converted * @return '0'..'F' in char format. */ private static char convertDigit(int value) { return _hex[value & 0x0f]; } public static int count(String s, char c) { return count(s, EMPTY + c); } /** * Count the number of times that 'token' occurs within 'content'. * * @return int count (0 if it never occurs, null is the source string, or null is the token). */ public static int count(CharSequence content, CharSequence token) { if (content == null || token == null) { return 0; } String source = content.toString(); if (source.isEmpty()) { return 0; } String sub = token.toString(); if (sub.isEmpty()) { return 0; } int answer = 0; int idx = 0; while (true) { idx = source.indexOf(sub, idx); if (idx < answer) { return answer; } ++answer; ++idx; } } /** * Convert strings containing DOS-style '*' or '?' to a regex String. */ public static String wildcardToRegexString(String wildcard) { int len = wildcard.length(); StringBuilder s = new StringBuilder(len); s.append('^'); for (int i = 0; i < len; i++) { char c = wildcard.charAt(i); switch (c) { case '*': s.append(".*"); break; case '?': s.append('.'); break; // escape special regexp-characters case '(': case ')': case '[': case ']': case '$': case '^': case '.': case '{': case '}': case '|': case '\\': s.append('\\'); s.append(c); break; default: s.append(c); break; } } s.append('$'); return s.toString(); } /** * The Levenshtein distance is a string metric for measuring the difference between two sequences. * Informally, the Levenshtein distance between two words is the minimum number of single-character edits * (i.e. insertions, deletions or substitutions) required to change one word into the other. The phrase * 'edit distance' is often used to refer specifically to Levenshtein distance. * * @param s String one * @param t String two * @return the 'edit distance' (Levenshtein distance) between the two strings. */ public static int levenshteinDistance(CharSequence s, CharSequence t) { // degenerate cases if (s == null || EMPTY.contentEquals(s)) { return t == null || EMPTY.contentEquals(t) ? 0 : t.length(); } else if (t == null || EMPTY.contentEquals(t)) { return s.length(); } // create two work vectors of integer distances int[] v0 = new int[t.length() + 1]; int[] v1 = new int[t.length() + 1]; // initialize v0 (the previous row of distances) // this row is A[0][i]: edit distance for an empty s // the distance is just the number of characters to delete from t for (int i = 0; i < v0.length; i++) { v0[i] = i; } int sLen = s.length(); int tLen = t.length(); for (int i = 0; i < sLen; i++) { // calculate v1 (current row distances) from the previous row v0 // first element of v1 is A[i+1][0] // edit distance is delete (i+1) chars from s to match empty t v1[0] = i + 1; // use formula to fill in the rest of the row for (int j = 0; j < tLen; j++) { int cost = (s.charAt(i) == t.charAt(j)) ? 0 : 1; v1[j + 1] = (int) MathUtilities.minimum(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost); } // copy v1 (current row) to v0 (previous row) for next iteration System.arraycopy(v1, 0, v0, 0, v0.length); } return v1[t.length()]; } /** * Calculate the Damerau-Levenshtein Distance between two strings. The basic difference * between this algorithm and the general Levenshtein algorithm is that damerau-Levenshtein * counts a swap of two characters next to each other as 1 instead of 2. This breaks the * 'triangular equality', which makes it unusable for Metric trees. See Wikipedia pages on * both Levenshtein and Damerau-Levenshtein and then make your decision as to which algorithm * is appropriate for your situation. * * @param source Source input string * @param target Target input string * @return The number of substitutions it would take * to make the source string identical to the target * string */ public static int damerauLevenshteinDistance(CharSequence source, CharSequence target) { if (source == null || EMPTY.contentEquals(source)) { return target == null || EMPTY.contentEquals(target) ? 0 : target.length(); } else if (target == null || EMPTY.contentEquals(target)) { return source.length(); } int srcLen = source.length(); int targetLen = target.length(); int[][] distanceMatrix = new int[srcLen + 1][targetLen + 1]; // We need indexers from 0 to the length of the source string. // This sequential set of numbers will be the row "headers" // in the matrix. for (int srcIndex = 0; srcIndex <= srcLen; srcIndex++) { distanceMatrix[srcIndex][0] = srcIndex; } // We need indexers from 0 to the length of the target string. // This sequential set of numbers will be the // column "headers" in the matrix. for (int targetIndex = 0; targetIndex <= targetLen; targetIndex++) { // Set the value of the first cell in the column // equivalent to the current value of the iterator distanceMatrix[0][targetIndex] = targetIndex; } for (int srcIndex = 1; srcIndex <= srcLen; srcIndex++) { for (int targetIndex = 1; targetIndex <= targetLen; targetIndex++) { // If the current characters in both strings are equal int cost = source.charAt(srcIndex - 1) == target.charAt(targetIndex - 1) ? 0 : 1; // Find the current distance by determining the shortest path to a // match (hence the 'minimum' calculation on distances). distanceMatrix[srcIndex][targetIndex] = (int) MathUtilities.minimum( // Character match between current character in // source string and next character in target distanceMatrix[srcIndex - 1][targetIndex] + 1, // Character match between next character in // source string and current character in target distanceMatrix[srcIndex][targetIndex - 1] + 1, // No match, at current, add cumulative penalty distanceMatrix[srcIndex - 1][targetIndex - 1] + cost); // We don't want to do the next series of calculations on // the first pass because we would get an index out of bounds // exception. if (srcIndex == 1 || targetIndex == 1) { continue; } // transposition check (if the current and previous // character are switched around (e.g.: t[se]t and t[es]t)... if (source.charAt(srcIndex - 1) == target.charAt(targetIndex - 2) && source.charAt(srcIndex - 2) == target.charAt(targetIndex - 1)) { // What's the minimum cost between the current distance // and a transposition. distanceMatrix[srcIndex][targetIndex] = (int) MathUtilities.minimum( // Current cost distanceMatrix[srcIndex][targetIndex], // Transposition distanceMatrix[srcIndex - 2][targetIndex - 2] + cost); } } } return distanceMatrix[srcLen][targetLen]; } /** * @param random Random instance * @param minLen minimum number of characters * @param maxLen maximum number of characters * @return String of alphabetical characters, with the first character uppercase (Proper case strings). */ public static String getRandomString(Random random, int minLen, int maxLen) { StringBuilder s = new StringBuilder(); int len = minLen + random.nextInt(maxLen - minLen + 1); for (int i = 0; i < len; i++) { s.append(getRandomChar(random, i == 0)); } return s.toString(); } public static String getRandomChar(Random random, boolean upper) { int r = random.nextInt(26); return upper ? EMPTY + (char) ((int) 'A' + r) : EMPTY + (char) ((int) 'a' + r); } /** * Convert a String into a byte[] with a particular encoding. * Preferable used when the encoding is one of the guaranteed Java types * and you don't want to have to catch the UnsupportedEncodingException * required by Java * * @param s string to encode into bytes * @param encoding encoding to use */ public static byte[] getBytes(String s, String encoding) { try { return s == null ? null : s.getBytes(encoding); } catch (UnsupportedEncodingException e) { throw new IllegalArgumentException(String.format("Encoding (%s) is not supported by your JVM", encoding), e); } } /** * Convert a byte[] into a UTF-8 String. Preferable used when the encoding * is one of the guaranteed Java types and you don't want to have to catch * the UnsupportedEncodingException required by Java * * @param bytes bytes to encode into a string */ public static String createUtf8String(byte[] bytes) { return createString(bytes, "UTF-8"); } /** * Convert a String into a byte[] encoded by UTF-8. * * @param s string to encode into bytes */ public static byte[] getUTF8Bytes(String s) { return getBytes(s, "UTF-8"); } /** * Convert a byte[] into a String with a particular encoding. * Preferable used when the encoding is one of the guaranteed Java types * and you don't want to have to catch the UnsupportedEncodingException * required by Java * * @param bytes bytes to encode into a string * @param encoding encoding to use */ public static String createString(byte[] bytes, String encoding) { try { return bytes == null ? null : new String(bytes, encoding); } catch (UnsupportedEncodingException e) { throw new IllegalArgumentException(String.format("Encoding (%s) is not supported by your JVM", encoding), e); } } /** * Convert a byte[] into a UTF-8 encoded String. * * @param bytes bytes to encode into a string */ public static String createUTF8String(byte[] bytes) { return createString(bytes, "UTF-8"); } /** * Get the hashCode of a String, insensitive to case, without any new Strings * being created on the heap. * * @param s String input * @return int hashCode of input String insensitive to case */ public static int hashCodeIgnoreCase(String s) { if (s == null) { return 0; } final int len = s.length(); int hash = 0; for (int i = 0; i < len; i++) { hash = 31 * hash + toLowerCase(s.charAt(i)); } return hash; } /** * Removes control characters (char <= 32) from both * ends of this String, handling {@code null} by returning * {@code null}. * *

The String is trimmed using {@link String#trim()}. * Trim removes start and end characters <= 32. * * @param str the String to be trimmed, may be null * @return the trimmed string, {@code null} if null String input */ public static String trim(String str) { return str == null ? null : str.trim(); } /** * Trims a string, its null safe and null will return empty string here.. * * @param value string input * @return String trimmed string, if value was null this will be empty */ public static String trimToEmpty(String value) { return value == null ? EMPTY : value.trim(); } /** * Trims a string, If the string trims to empty then we return null. * * @param value string input * @return String, trimmed from value. If the value was empty we return null. */ public static String trimToNull(String value) { String ts = trim(value); return isEmpty(ts) ? null : ts; } /** * Trims a string, If the string trims to empty then we return the default. * * @param value string input * @param defaultValue value to return on empty or null * @return trimmed string, or defaultValue when null or empty */ public static String trimEmptyToDefault(String value, String defaultValue) { return Optional.ofNullable(value).map(StringUtilities::trimToNull).orElse(defaultValue); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy