All downloads are free. Search and download functionality uses the official Maven repository.

com.cinchapi.common.base.AnyStrings Maven / Gradle / Ivy

Go to download

Accent4J is a suite of libraries, helpers and data structures that make Java programming idioms more fluent.

There is a newer version: 1.13.1
Show newest version
/*
 * Copyright (c) 2015 Cinchapi Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cinchapi.common.base;

import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.text.MessageFormat;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import javax.annotation.Nullable;

import com.google.common.base.MoreObjects;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.primitives.Doubles;
import com.google.common.primitives.Floats;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;

/**
 * A collection of functions that efficiently operate on {@link String strings}.
 * 

* In some cases, we provide optimized implementations of functionality that * exists within the {@link String} class itself or other frameworks. *

* * @author Jeff Nelson */ public class AnyStrings { /** * The set of all unicode double quotation mark characters * * See http://www.unicode.org/Public/security/revision-03/confusablesSummary. * txt for the list of characters *

**/ private static final Set DOUBLE_QUOTE_UNICODE_CHARS = ImmutableSet .of('ʺ', '˝', 'ˮ', '˶', 'ײ', '״', '“', '”', '‟', '″', '‶', '〃', '"'); /** * The set of all unicode single quotation mark characters * * See http://www.unicode.org/Public/security/revision-03/confusablesSummary. * txt for the list of characters *

**/ private static final Set SINGLE_QUOTE_UNICODE_CHARS = ImmutableSet .of('`', 'ꞌ', 'ʻ', 'ʼ', 'י', 'ʹ', 'ʽ', 'ʾ', 'ˊ', 'ˋ', 'ߴ', 'ߵ', 'ʹ', '׳', '’', '˴', '՚', '՝', '‘', '‛', '′', '‵', '´', '΄', '᾽', '᾿', '`', '´', '῾', ''', '`'); /** * The start of a placeholder sequence for the * {@link #format(String, Object...)} method. */ private static final char PLACEHOLDER_BEGIN = '{'; /** * The end of a placeholder sequence for the * {@link #format(String, Object...)} method. */ private static final char PLACEHOLDER_END = '}'; /** * Perform a {@link String#compareToIgnoreCase(String) case insensitive * comparison} between {@code s1} and {@code s2}. * * @param s1 * @param s2 * @return the result of the comparison */ public static int compareToIgnoreCase(String s1, String s2) { return s1.compareToIgnoreCase(s2); } /** * Ensure that {@code string} ends with {@code suffix} by appending it to * {@code string} if and only if it is not already the last sequence of * characters in the string. * * @param string the {@link String} to that should end with {@code suffix} * @param suffix the {@link String} of characters with which {@code string} * should end * @return {@code string} if it already ends with {@code suffix} or a new * {@link String} that contains {@code suffix} appended to * {@code string} */ public static String ensureEndsWith(String string, String suffix) { if(string.endsWith(suffix)) { return string; } else { return joinSimple(string, suffix); } } /** * Ensure that {@code string} starts with {@code prefix} by prepending it to * {@code string} if and only if it is not already the first sequence of * characters in the string. 
* * @param string the {@link String} to that should start with {@code prefix} * @param prefix the {@link String} of characters with which {@code string} * should start * @return {@code string} if it already begins with {@code prefix} or a new * {@link String} that contains {@code prefix} prepended to * {@code string} */ public static String ensureStartsWith(String string, String prefix) { if(string.startsWith(prefix)) { return string; } else { return joinSimple(prefix, string); } } /** * Ensure that {@code string} is surrounded by quotes. If that is not the * case, alter the string so that it is and return the altered form. * *

* Calling {@link Strings#isWithinQuotes(String)} on the result of this * method will always return {@code true}. *

* * @param string the string that must be quoted * @return {@code string} or {@code string} surrounded by quotes if it is * not already */ public static String ensureWithinQuotes(String string) { return isWithinQuotes(string) ? string : joinSimple("\"", string, "\""); } /** * Wrap {@code string} within quotes if it is necessary to do so. Otherwise, * return the original {@code string}. * *

* The original {@code string} will be wrapped in quotes and returned as * such if: *

    *
  • it is not already wrapped {@link #isWithinQuotes(String) within * quotes}, and
  • *
  • {@code delimiter} appears at least once
  • *
* If those conditions are met, the original string will be wrapped in * either *
    *
  • double quotes if a single quote appears in the original string, * or
  • *
  • single quotes if a double quote appears in the original string, * or
  • *
  • double quotes if both a single and double quote appear in the * original string; furthermore, all instances of double quotes within the * original string will be escaped
  • *
*

* * @param string the string to potentially quote * @param delimiter the delimiter that determines whether quoting should * happen * @return the original {@code string} or a properly quoted alternative */ public static String ensureWithinQuotesIfNeeded(String string, char delimiter) { boolean foundDouble = false; boolean foundSingle = false; boolean foundDelimiter = false; StringBuilder escaped = new StringBuilder(); escaped.append('"'); if(!isWithinQuotes(string)) { char[] chars = string.toCharArray(); for (int i = 0; i < chars.length; ++i) { char c = chars[i]; if(c == delimiter) { foundDelimiter = true; } else if(c == '"') { foundDouble = true; escaped.append('\\'); } else if(c == '\'') { foundSingle = true; } escaped.append(c); } escaped.append('"'); if(foundDelimiter) { if(foundDouble && foundSingle) { return escaped.toString(); } else if(foundDouble) { return format("'{}'", string); } else { // foundSingle or found no quotes return format("\"{}\"", string); } } } return string; } /** * Wrap {@code string} within quotes if it is necessary to do so. Otherwise, * return the original {@code string}. * *

* The original {@code string} will be wrapped in quotes and returned as * such if: *

    *
  • it is not already wrapped {@link #isWithinQuotes(String) within * quotes}, and
  • *
  • {@code delimiter} appears at least once
  • *
* If those conditions are met, the original string will be wrapped in * either *
    *
  • double quotes if a single quote appears in the original string, * or
  • *
  • single quotes if a double quote appears in the original string, * or
  • *
  • double quotes if both a single and double quote appear in the * original string; furthermore, all instances of double quotes within the * original string will be escaped
  • *
*

* * @param string the string to potentially quote * @param delimiters the delimiters that determines whether quoting should * happen * @return the original {@code string} or a properly quoted alternative */ public static String ensureWithinQuotesIfNeeded(String string, Character... delimiters) { Set _delimiters = Sets.newHashSet(delimiters); _delimiters.remove('\''); _delimiters.remove('"'); boolean foundDouble = false; boolean foundSingle = false; boolean foundDelimiter = false; StringBuilder escaped = new StringBuilder(); escaped.append('"'); if(!isWithinQuotes(string)) { char[] chars = string.toCharArray(); for (int i = 0; i < chars.length; ++i) { char c = chars[i]; if(_delimiters.contains(c)) { foundDelimiter = true; } else if(c == '"') { foundDouble = true; escaped.append('\\'); } else if(c == '\'') { foundSingle = true; } escaped.append(c); } escaped.append('"'); if(foundDelimiter) { if(foundDouble && foundSingle) { return escaped.toString(); } else if(foundDouble) { return "'" + string + "'"; } else { // foundSingle or found no quotes return "\"" + string + "\""; } } } return string; } /** * Efficiently escape inner occurrences of each of the {@code characters} * within the {@code string}, if necessary. *

* Escaped characters are prepended with the backslash ('\') character. *

*

* An "inner occurrence" for a character is one that is not at the head or * tail of the string. *

* * @param string the string to escape * @param characters the characters to escape within the {@code string} * @return the escaped {@code string} */ public static String escapeInner(String string, char... characters) { char c = '\0'; char pchar = '\0'; StringBuilder sb = null; Set chars = null; if(characters.length == 1) { c = characters[0]; } else { chars = Sets.newHashSetWithExpectedSize(characters.length); for (char ch : characters) { chars.add(ch); } } char[] schars = string.toCharArray(); int offset = 0; int i = 0; while (i < schars.length) { if(i > 0 && i < schars.length - 1) { char schar = schars[i]; if(pchar != '\\' && ((c != '\0' && c == schar) || (chars != null && chars.contains(schar)))) { sb = MoreObjects.firstNonNull(sb, new StringBuilder()); sb.append(schars, offset, i - offset); sb.append('\\'); char escaped = Characters .getEscapedCharOrNullLiteral(schar); if(escaped != '0') { sb.append(escaped); } else { sb.append(schar); } offset = i + 1; } pchar = schar; } ++i; } if(sb != null) { sb.append(schars, offset, i - offset); return sb.toString(); } else { return string; } } /** * Inspired by the SLF4J logging framework! *

* Take a {@code template} string that contains placeholders ({}) and inject * each of the {@code args} in their place, respectively. *

*

* This method behaves similarly to {@link MessageFormat#format(Object)} * except the placeholders do not need to be numbered (i.e. "foo {} bar" vs * "foo {0} bar"). In most cases, the implementation of this method is also * much more efficient. *

*

* If there are more placeholders than {@code args}, the extra placeholders * will be retained formatted string. *

*

* If there are more {@code args} than placeholders, the extra {@code args} * will be placed at the end of the formatted string. *

*

* If an {@link Exception} is included in the injected {@code args}, only * the Exception's message will be included in the formatted. If you want to * include the full stack trace of an Exception, place it as the last of the * {@code args} and make sure that there is no correspond placeholder. *

Example (borrowed from the SLf4J docs)

* *
     * String s = "Hello world";
     * try {
     *     Integer i = Integer.valueOf(s);
     * }
     * catch (NumberFormatException e) {
     *     System.out.println(AnyStrings.format("Failed to format {}", s, e));
     * }
     * 
* *

* * @param template a template that may or may not contain placeholders for * variable {@code args} * @param args the values to inject in the {@code template} placeholders, * respectively * @return the formatted string */ public static String format(String template, Object... args) { if(args == null || args.length == 0) { return template; } else if(template.isEmpty()) { return formatExtraArgs(new StringBuilder(), args, 0, args.length) .toString(); } else { StringBuilder sb = new StringBuilder(); char[] chars = template.toCharArray(); int templateLength = chars.length; int argsLength = args.length; int templateIndex = 0; int argsIndex = 0; int copyOffset = 0; int copyLength = 0; while (templateIndex < templateLength) { char c = chars[templateIndex]; int next = templateIndex + 1; char nextc = next < templateLength ? chars[next] : Characters.NULL; if(c == Characters.ESCAPE && nextc == PLACEHOLDER_BEGIN) { // When escaping the placeholder, simply append any // characters that are currently buffered (see copyLength) // and then skip the escape character before proceeding if(copyLength > 0) { sb.append(chars, copyOffset, copyLength); } sb.append(chars, templateIndex + 1, 1); // append // PLACEHOLDER_BEGIN // so that we don't // execute the block // below on the next // loop templateIndex += 2; copyOffset = templateIndex; copyLength = 0; } else if(c == PLACEHOLDER_BEGIN && nextc == PLACEHOLDER_END && argsIndex < argsLength) { sb.append(chars, copyOffset, copyLength); sb.append(String.valueOf(args[argsIndex])); templateIndex = next + 1; ++argsIndex; copyOffset = templateIndex; copyLength = 0; } else { ++templateIndex; ++copyLength; } } sb.append(chars, copyOffset, copyLength); if(argsIndex < argsLength) { // If there are remaining args that weren't represented in the // template with variable markers, simply append them as a list sb.append(": "); formatExtraArgs(sb, args, argsIndex, argsLength); } return sb.toString(); } } /** * Return a set that contains every possible 
substring of {@code string} * excluding pure whitespace strings. * * @param string the string to divide into substrings * @return the set of substrings */ public static Set getAllSubStrings(String string) { Set result = Sets.newHashSet(); for (int i = 0; i < string.length(); ++i) { for (int j = i + 1; j <= string.length(); ++j) { String substring = string.substring(i, j).trim(); if(!com.google.common.base.Strings.isNullOrEmpty(substring)) { result.add(substring); } } } return result; } /** * An optimized version of {@link String#contains(CharSequence)} to see if * {@code needle} is a substring of {@code haystack}. * * @param needle the substring for which to search * @param haystack the string in which to search for the substring * @return {@code true} if {@code needle} is a substring */ public static boolean isSubString(String needle, String haystack) { if(needle.length() > haystack.length()) { return false; } else if(needle.length() == haystack.length()) { return needle.equals(haystack); } else { char[] n = needle.toCharArray(); char[] h = haystack.toCharArray(); int npos = 0; int hpos = 0; int stop = h.length - n.length; int hstart = -1; while (hpos < h.length && npos < n.length) { char hi = h[hpos]; char ni = n[npos]; if(hi == ni) { if(hstart == -1) { hstart = hpos; } ++npos; ++hpos; } else { if(npos > 0) { npos = 0; hpos = hstart + 1; hstart = -1; } else { ++hpos; } if(hpos > stop) { return false; } } } return npos == n.length; } } /** * Return {@code true} if {@code string} both starts and ends with single or * double quotes. * * @param string * @param forceTreatAsNonQuoteChar a list of characters may be * technically quotes but should not be considered when checking * if {@code string} is within quotes * @return {@code true} if the string is between quotes */ public static boolean isWithinQuotes(String string, Character... 
forceTreatAsNonQuoteChar) { string = replaceUnicodeConfusables(string, forceTreatAsNonQuoteChar); if(string.length() > 2) { char first = string.charAt(0); if(first == '"' || first == '\'') { char last = string.charAt(string.length() - 1); return first == last; } } return false; } /** * Concatenates the {@link Object#toString string} representation of all the * {@code args}, separated by the {@code separator} char in an efficient * manner. * * @param separator the separator to place between each of the {@code args} * @param args the args to join * @return the resulting String */ public static String join(char separator, Object... args) { StringBuilder builder = new StringBuilder(); for (int i = 0; i < args.length; ++i) { builder.append(args[i]); builder.append(separator); } builder.deleteCharAt(builder.length() - 1); return builder.toString(); } /** * Concatenates the {@link Object#toString string} representation of all the * {@code args}, separated by the {@code separator} string in an efficient * manner. * * @param separator the separator to place between each of the {@code args} * @param args the args to join * @return the resulting String */ public static String join(String separator, Object... args) { StringBuilder builder = new StringBuilder(); for (int i = 0; i < args.length; ++i) { builder.append(args[i]); builder.append(separator); } builder.deleteCharAt(builder.length() - 1); return builder.toString(); } /** * Concatenates the toString values of all the {@code args} in an efficient * manner. * * @param args * @return the resulting String */ public static String joinSimple(Object... args) { StringBuilder builder = new StringBuilder(); for (int i = 0; i < args.length; ++i) { builder.append(args[i]); } return builder.toString(); } /** * Concatenates the toString values of all the {@code args}, separated by * whitespace in an efficient manner. * * @param args * @return the resulting String */ public static String joinWithSpace(Object... 
args) { return join(' ', args); } /** * Replace all instances of "confusable" unicode characters with a * canoncial/normalized character. *

* See http://www.unicode.org/Public/security/revision-03/confusablesSummary. * txt for a list of characters that are considered to be confusable. *

* * @param string the {@link String} in which the replacements should occur * @param a list of characters that should not be replaced, even if they are * a confusable * @return a {@link String} free of confusable unicode characters */ public static String replaceUnicodeConfusables(String string, Character... preserve) { Set preserved = Arrays.stream(preserve) .collect(Collectors.toSet()); char[] chars = string.toCharArray(); for (int i = 0; i < chars.length; ++i) { char c = chars[i]; if(DOUBLE_QUOTE_UNICODE_CHARS.contains(c) && !preserved.contains(c)) { c = '"'; } else if(SINGLE_QUOTE_UNICODE_CHARS.contains(c) && !preserved.contains(c)) { c = '\''; } chars[i] = c; } return String.valueOf(chars); } /** * Split a string, using whitespace as a delimiter, as long as the * whitespace is not wrapped in double or single quotes. * * @param string * @return the tokens that result from the split * @deprecated in version 0.5.0, use {@link QuoteAwareStringSplitter} * instead. */ @Deprecated public static String[] splitButRespectQuotes(String string) { return splitStringByDelimiterButRespectQuotes(string, " "); } /** * Split a camel case {@code string} into tokens that represent the distinct * words. *

*

Example

*
    * thisIsACamelCaseSTRING -> [this, Is, A, Camel, Case, S, T, R, I, N, G] *
*
    * ThisIsACamelCaseSTRING -> [This, Is, A, Camel, Case, S, T, R, I, N, G] *
*
    * thisisacamelcasestring -> [thisisacamelcasestring] *
*

* * @param string * @return a list of tokens after splitting the string on camel case word * boundaries */ public static List splitCamelCase(String string) { List words = Lists.newArrayList(); char[] chars = string.toCharArray(); StringBuilder word = new StringBuilder(); for (int i = 0; i < chars.length; ++i) { char c = chars[i]; if(Character.isUpperCase(c) || c == '$') { if(word.length() > 0) { words.add(word.toString()); } word.setLength(0); } word.append(c); } words.add(word.toString()); return words; } /** * Split a string on a delimiter as long as that delimiter is not wrapped in * double or single quotes. *

* If {@code delimiter} is a single character string, it is more efficient * to use a {@link StringSplitter} as opposed to this method. *

* * @param string the string to split * @param delimiter the delimiting string/regex on which the input * {@code string} is split * @return the tokens that result from the split */ public static String[] splitStringByDelimiterButRespectQuotes(String string, String delimiter) { // This is pretty inefficient: convert all single quotes to double // quotes (except one off single quotes that are used as apostrophes) so // the regex below works string = string.replaceAll(" '", " \""); string = string.replaceAll("' ", "\" "); string = string.replaceAll("'$", "\""); return string.split(delimiter + "(?=([^\"]*\"[^\"]*\")*[^\"]*$)"); } /** * This method efficiently tries to parse {@code value} into a * {@link Boolean} object if possible. If the string is not a boolean, then * the method returns {@code null} as quickly as possible. * * @param value * @return a Boolean object that represents the string or {@code null} if it * is not possible to parse the string into a boolean */ public static Boolean tryParseBoolean(String value) { if(value.equalsIgnoreCase("true")) { return true; } else if(value.equalsIgnoreCase("false")) { return false; } else { return null; } } /** * This method efficiently tries to parse {@code value} into a * {@link Number} object if possible. If the string is not a number, then * the method returns {@code null} as quickly as possible. * * @param value * @return a Number object that represents the string or {@code null} if it * is not possible to parse the string into a number */ @Nullable public static Number tryParseNumber(String value) { int size = value.length(); if(value == null || size == 0) { return null; } else if(value.charAt(0) == '0' && size > 1 && value.charAt(1) != '.') { // Do not parse a string as a number if it has a leading 0 that is // not followed by a decimal (i.e. 
007) return null; } boolean decimal = false; boolean scientific = false; for (int i = 0; i < size; ++i) { char c = value.charAt(i); if(!Character.isDigit(c)) { if(i == 0 && c == '-' || (scientific && (c == '-' || c == '+'))) { continue; } else if(c == '.') { if(!decimal && size > 1) { decimal = true; } else { // Since we've already seen a decimal, the appearance of // another one suggests this is an IP address instead of // a number return null; } } else if(i == size - 1 && c == 'D' && size > 1) { // Respect the convention to coerce numeric strings to // Double objects by appending a single 'D' character. return Double.valueOf(value.substring(0, i)); } else if((c == 'E' || c == 'e') && i < size - 1) { // CON-627: Account for valid representations of scientific // notation if(!scientific) { scientific = true; } else { // Since we've already seen a scientific notation // indicator, another one suggests that this is not // really a number return null; } } else { return null; } } } try { if(decimal || scientific) { // Try to return a float (for space compactness) if it is // possible to fit the entire decimal without any loss of // precision. In order to do this, we have to compare the string // output of both the parsed double and the parsed float. This // is kind of inefficient, so substitute for a better way if it // exists. double d = Doubles.tryParse(value); float f = Floats.tryParse(value); if(String.valueOf(d).equals(String.valueOf(f))) { return f; } else { return d; } } else if(value.equals("-")) { // CON-597 return null; } else { return MoreObjects.firstNonNull(Ints.tryParse(value), Longs.tryParse(value)); } } catch (NullPointerException e) { throw new NumberFormatException(format( "{} appears to be a number but cannot be parsed as such", value)); } } /** * A stricter version of {@link #tryParseNumber(String)} that does not parse * strings that masquerade as numbers (i.e. 3.124D). 
Instead this method * will only parse the string into a Number if it contains characters that * are either a decimal digit, a decimal separator or a negative sign. * * @param value * @return a Number object that represents the string or {@code null} if it * is not possible to parse the string into a number */ @Nullable public static Number tryParseNumberStrict(String value) { if(value == null || value.length() == 0) { return null; } char last = value.charAt(value.length() - 1); if(Character.isDigit(last)) { return tryParseNumber(value); } else { return null; } } /** * Similar to the {@link String#valueOf(char)} method, but this one will * return a cached copy of the string for frequently used characters. * * @param c the character to convert * @return a string of length 1 containing the input char */ public static String valueOfCached(char c) { if(c == '(') { return "("; } else if(c == ')') { return ")"; } else { return String.valueOf(c); } } /** * Add the {@code wrapper} character to the beginning and end of the * {@code string} and escape any instance of the {@code wrapper} within the * {@code string} with the specified {@code escape} character. * * @param string * @param wrapper * @param escape * @return the wrapped (and escaped, if necessary) string */ public static String wrap(String string, char wrapper, char escape) { StringBuilder sb = new StringBuilder(); sb.append(wrapper); char[] chars = string.toCharArray(); for (char c : chars) { if(c == wrapper) { sb.append(escape); } sb.append(c); } sb.append(wrapper); return sb.toString(); } /** * Execute the rules for formatting {@code args} with no corresponding * placeholders in the original template. 
* * @param sb the {@link StringBuilder} that contains the formatted string; * modified in-place within this method and returned for * convenience * @param args all of the placeholder values * @param argsIndex the index of the first of the {@code args} that is * "extra" * @param argsLength the length of {@code args}, required so it is not * computed twice * @return {@code sb} for convenience */ private static StringBuilder formatExtraArgs(StringBuilder sb, Object[] args, int argsIndex, int argsLength) { for (int i = argsIndex; i < argsLength; ++i) { Object arg = args[i]; int nextIndex = i + 1; if(nextIndex == argsLength && arg instanceof Exception) { ByteArrayOutputStream baos = new ByteArrayOutputStream(); ((Exception) arg).printStackTrace(new PrintStream(baos)); sb.append(baos.toString()); } else { sb.append(String.valueOf(arg)); } if(nextIndex != argsLength) { sb.append(' '); } } return sb; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy