tech.tablesaw.util.StringUtils Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package tech.tablesaw.util;
import com.google.common.base.Strings;
import java.util.ArrayList;
import java.util.List;
/**
* Operations on {@link java.lang.String} that are
* {@code null} safe.
*
*
{@code StringUtils} handles {@code null} input Strings quietly.
* That is to say that a {@code null} input will return {@code null}.
* Where a {@code boolean} or {@code int} is being returned
* details vary by method.
*
*
A side effect of the {@code null} handling is that a
* {@code NullPointerException} should be considered a bug in
* {@code StringUtils}.
*
*
Methods in this class give sample code to explain their operation.
* The symbol {@code *} is used to indicate any input including {@code null}.
*
*
#ThreadSafe#
* @see java.lang.String
* @since 1.0
*/
//@Immutable
public class StringUtils {
// Performance testing notes (JDK 1.4, Jul03, scolebourne)
// Whitespace:
// Character.isWhitespace() is faster than WHITESPACE.indexOf()
// where WHITESPACE is a string of all whitespace characters
//
// Character access:
// String.charAt(n) versus toCharArray(), then array[n]
// String.charAt(n) is about 15% worse for a 10K string
// They are about equal for a length 50 string
// String.charAt(n) is about 4 times better for a length 3 string
// String.charAt(n) is best bet overall
//
// Append:
// String.concat about twice as fast as StringBuffer.append
// (not sure who tested this)
/**
* The empty String {@code ""}.
* @since 2.0
*/
private static final String EMPTY = "";
/**
*
The maximum size to which the padding constant(s) can expand.
*/
private static final int PAD_LIMIT = 8192;
private StringUtils() {}
// Empty checks
//-----------------------------------------------------------------------
/**
*
Splits a String by Character type as returned by
* {@code java.lang.Character.getType(char)}. Groups of contiguous
* characters of the same type are returned as complete tokens, with the
* following exception: if {@code camelCase} is {@code true},
* the character of type {@code Character.UPPERCASE_LETTER}, if any,
* immediately preceding a token of type {@code Character.LOWERCASE_LETTER}
* will belong to the following token rather than to the preceding, if any,
* {@code Character.UPPERCASE_LETTER} token.
* @param str the String to split, may be {@code null}
* @return an array of parsed Strings, {@code null} if null String input
* @since 2.4
*/
public static String[] splitByCharacterTypeCamelCase(final String str) {
if (str == null) {
return null;
}
if (str.isEmpty()) {
return new String[0];
}
final char[] c = str.toCharArray();
final List list = new ArrayList<>();
int tokenStart = 0;
int currentType = Character.getType(c[tokenStart]);
for (int pos = tokenStart + 1; pos < c.length; pos++) {
final int type = Character.getType(c[pos]);
if (type == currentType) {
continue;
}
if (type == Character.LOWERCASE_LETTER && currentType == Character.UPPERCASE_LETTER) {
final int newTokenStart = pos - 1;
if (newTokenStart != tokenStart) {
list.add(new String(c, tokenStart, newTokenStart - tokenStart));
tokenStart = newTokenStart;
}
} else {
list.add(new String(c, tokenStart, pos - tokenStart));
tokenStart = pos;
}
currentType = type;
}
list.add(new String(c, tokenStart, c.length - tokenStart));
return list.toArray(new String[0]);
}
// Joining
//-----------------------------------------------------------------------
/**
* Joins the elements of the provided array into a single String
* containing the provided list of elements.
*
*
No delimiter is added before or after the list.
* Null objects or empty strings within the array are represented by
* empty strings.
*
*
* StringUtils.join(null, *) = null
* StringUtils.join([], *) = ""
* StringUtils.join([null], *) = ""
* StringUtils.join(["a", "b", "c"], ';') = "a;b;c"
* StringUtils.join(["a", "b", "c"], null) = "abc"
* StringUtils.join([null, "", "a"], ';') = ";;a"
*
*
* @param array the array of values to join together, may be null
* @param separator the separator character to use
* @return the joined String, {@code null} if null array input
* @since 2.0
*/
public static String join(final Object[] array, final char separator) {
if (array == null) {
return null;
}
return join(array, separator, 0, array.length);
}
/**
* Joins the elements of the provided array into a single String
* containing the provided list of elements.
*
*
No delimiter is added before or after the list.
* Null objects or empty strings within the array are represented by
* empty strings.
*
*
* StringUtils.join(null, *) = null
* StringUtils.join([], *) = ""
* StringUtils.join([null], *) = ""
* StringUtils.join(["a", "b", "c"], ';') = "a;b;c"
* StringUtils.join(["a", "b", "c"], null) = "abc"
* StringUtils.join([null, "", "a"], ';') = ";;a"
*
*
* @param array the array of values to join together, may be null
* @param separator the separator character to use
* @param startIndex the first index to start joining from. It is
* an error to pass in an end index past the end of the array
* @param endIndex the index to stop joining from (exclusive). It is
* an error to pass in an end index past the end of the array
* @return the joined String, {@code null} if null array input
* @since 2.0
*/
private static String join(final Object[] array, final char separator, final int startIndex, final int endIndex) {
if (array == null) {
return null;
}
final int noOfItems = endIndex - startIndex;
if (noOfItems <= 0) {
return EMPTY;
}
final StringBuilder buf = new StringBuilder(noOfItems * 16);
for (int i = startIndex; i < endIndex; i++) {
if (i > startIndex) {
buf.append(separator);
}
if (array[i] != null) {
buf.append(array[i]);
}
}
return buf.toString();
}
// Padding
//-----------------------------------------------------------------------
/**
* Repeat a String {@code repeat} times to form a
* new String.
*
*
* StringUtils.repeat(null, 2) = null
* StringUtils.repeat("", 0) = ""
* StringUtils.repeat("", 2) = ""
* StringUtils.repeat("a", 3) = "aaa"
* StringUtils.repeat("ab", 2) = "abab"
* StringUtils.repeat("a", -2) = ""
*
*
* @param str the String to repeat, may be null
* @param repeat number of times to repeat str, negative treated as zero
* @return a new String consisting of the original String repeated,
* {@code null} if null String input
*/
public static String repeat(final String str, final int repeat) {
// Performance tuned for 2.0 (JDK1.4)
if (str == null) {
return null;
}
if (repeat <= 0) {
return EMPTY;
}
final int inputLength = str.length();
if (repeat == 1 || inputLength == 0) {
return str;
}
if (inputLength == 1 && repeat <= PAD_LIMIT) {
return repeat(str.charAt(0), repeat);
}
final int outputLength = inputLength * repeat;
switch (inputLength) {
case 1 :
return repeat(str.charAt(0), repeat);
case 2 :
final char ch0 = str.charAt(0);
final char ch1 = str.charAt(1);
final char[] output2 = new char[outputLength];
for (int i = repeat * 2 - 2; i >= 0; i--, i--) {
output2[i] = ch0;
output2[i + 1] = ch1;
}
return new String(output2);
default :
final StringBuilder buf = new StringBuilder(outputLength);
for (int i = 0; i < repeat; i++) {
buf.append(str);
}
return buf.toString();
}
}
/**
* Returns padding using the specified delimiter repeated
* to a given length.
*
*
* StringUtils.repeat('e', 0) = ""
* StringUtils.repeat('e', 3) = "eee"
* StringUtils.repeat('e', -2) = ""
*
*
* Note: this method does not support padding with
* Unicode Supplementary Characters
* as they require a pair of {@code char}s to be represented.
* If you are needing to support full I18N of your applications
* consider using {@link #repeat(String, int)} instead.
*
*
* @param ch character to repeat
* @param repeat number of times to repeat char, negative treated as zero
* @return String with repeated character
* @see #repeat(String, int)
*/
private static String repeat(final char ch, final int repeat) {
if (repeat <= 0) {
return EMPTY;
}
final char[] buf = new char[repeat];
for (int i = repeat - 1; i >= 0; i--) {
buf[i] = ch;
}
return new String(buf);
}
/**
* Gets a CharSequence length or {@code 0} if the CharSequence is
* {@code null}.
*
* @param cs
* a CharSequence or {@code null}
* @return CharSequence length or {@code 0} if the CharSequence is
* {@code null}.
* @since 2.4
* @since 3.0 Changed signature from length(String) to length(CharSequence)
*/
public static int length(final CharSequence cs) {
return cs == null ? 0 : cs.length();
}
// Case conversion
//-----------------------------------------------------------------------
/**
*
Capitalizes a String changing the first character to title case as
* per {@link Character#toTitleCase(int)}. No other characters are changed.
*
* A {@code null} input String returns {@code null}.
*
*
* StringUtils.capitalize(null) = null
* StringUtils.capitalize("") = ""
* StringUtils.capitalize("cat") = "Cat"
* StringUtils.capitalize("cAt") = "CAt"
* StringUtils.capitalize("'cat'") = "'cat'"
*
*
* @param str the String to capitalize, may be null
* @return the capitalized String, {@code null} if null String input
* @since 2.0
*/
public static String capitalize(final String str) {
int strLen;
if (str == null || (strLen = str.length()) == 0) {
return str;
}
final int firstCodepoint = str.codePointAt(0);
final int newCodePoint = Character.toTitleCase(firstCodepoint);
if (firstCodepoint == newCodePoint) {
// already capitalized
return str;
}
final int newCodePoints[] = new int[strLen]; // cannot be longer than the char array
int outOffset = 0;
newCodePoints[outOffset++] = newCodePoint; // copy the first codepoint
for (int inOffset = Character.charCount(firstCodepoint); inOffset < strLen; ) {
final int codepoint = str.codePointAt(inOffset);
newCodePoints[outOffset++] = codepoint; // copy the remaining ones
inOffset += Character.charCount(codepoint);
}
return new String(newCodePoints, 0, outOffset);
}
// Character Tests
//-----------------------------------------------------------------------
/**
* Checks if the CharSequence contains only Unicode letters.
*
*
{@code null} will return {@code false}.
* An empty CharSequence (length()=0) will return {@code false}.
*
*
* StringUtils.isAlpha(null) = false
* StringUtils.isAlpha("") = false
* StringUtils.isAlpha(" ") = false
* StringUtils.isAlpha("abc") = true
* StringUtils.isAlpha("ab2c") = false
* StringUtils.isAlpha("ab-c") = false
*
*
* @param cs the CharSequence to check, may be null
* @return {@code true} if only contains letters, and is non-null
* @since 3.0 Changed signature from isAlpha(String) to isAlpha(CharSequence)
* @since 3.0 Changed "" to return false and not true
*/
public static boolean isAlpha(final String cs) {
if (Strings.isNullOrEmpty(cs)) {
return false;
}
final int sz = cs.length();
for (int i = 0; i < sz; i++) {
if (!Character.isLetter(cs.charAt(i))) {
return false;
}
}
return true;
}
/**
* Checks if the CharSequence contains only Unicode letters or digits.
*
*
{@code null} will return {@code false}.
* An empty CharSequence (length()=0) will return {@code false}.
*
*
* StringUtils.isAlphanumeric(null) = false
* StringUtils.isAlphanumeric("") = false
* StringUtils.isAlphanumeric(" ") = false
* StringUtils.isAlphanumeric("abc") = true
* StringUtils.isAlphanumeric("ab c") = false
* StringUtils.isAlphanumeric("ab2c") = true
* StringUtils.isAlphanumeric("ab-c") = false
*
*
* @param cs the CharSequence to check, may be null
* @return {@code true} if only contains letters or digits,
* and is non-null
* @since 3.0 Changed signature from isAlphanumeric(String) to isAlphanumeric(CharSequence)
* @since 3.0 Changed "" to return false and not true
*/
public static boolean isAlphanumeric(final String cs) {
if (Strings.isNullOrEmpty(cs)) {
return false;
}
final int sz = cs.length();
for (int i = 0; i < sz; i++) {
if (!Character.isLetterOrDigit(cs.charAt(i))) {
return false;
}
}
return true;
}
/**
* Checks if the CharSequence contains only Unicode digits.
* A decimal point is not a Unicode digit and returns false.
*
*
{@code null} will return {@code false}.
* An empty CharSequence (length()=0) will return {@code false}.
*
*
Note that the method does not allow for a leading sign, either positive or negative.
* Also, if a String passes the numeric test, it may still generate a NumberFormatException
* when parsed by Integer.parseInt or Long.parseLong, e.g. if the value is outside the range
* for int or long respectively.
*
*
* StringUtils.isNumeric(null) = false
* StringUtils.isNumeric("") = false
* StringUtils.isNumeric(" ") = false
* StringUtils.isNumeric("123") = true
* StringUtils.isNumeric("\u0967\u0968\u0969") = true
* StringUtils.isNumeric("12 3") = false
* StringUtils.isNumeric("ab2c") = false
* StringUtils.isNumeric("12-3") = false
* StringUtils.isNumeric("12.3") = false
* StringUtils.isNumeric("-123") = false
* StringUtils.isNumeric("+123") = false
*
*
* @param cs the CharSequence to check, may be null
* @return {@code true} if only contains digits, and is non-null
* @since 3.0 Changed signature from isNumeric(String) to isNumeric(CharSequence)
* @since 3.0 Changed "" to return false and not true
*/
public static boolean isNumeric(final String cs) {
if (Strings.isNullOrEmpty(cs)) {
return false;
}
final int sz = cs.length();
for (int i = 0; i < sz; i++) {
if (!Character.isDigit(cs.charAt(i))) {
return false;
}
}
return true;
}
/**
* Checks if the CharSequence contains only lowercase characters.
*
*
{@code null} will return {@code false}.
* An empty CharSequence (length()=0) will return {@code false}.
*
*
* StringUtils.isAllLowerCase(null) = false
* StringUtils.isAllLowerCase("") = false
* StringUtils.isAllLowerCase(" ") = false
* StringUtils.isAllLowerCase("abc") = true
* StringUtils.isAllLowerCase("abC") = false
* StringUtils.isAllLowerCase("ab c") = false
* StringUtils.isAllLowerCase("ab1c") = false
* StringUtils.isAllLowerCase("ab/c") = false
*
*
* @param cs the CharSequence to check, may be null
* @return {@code true} if only contains lowercase characters, and is non-null
* @since 2.5
* @since 3.0 Changed signature from isAllLowerCase(String) to isAllLowerCase(CharSequence)
*/
public static boolean isAllLowerCase(final String cs) {
if (Strings.isNullOrEmpty(cs)) {
return false;
}
final int sz = cs.length();
for (int i = 0; i < sz; i++) {
if (!Character.isLowerCase(cs.charAt(i))) {
return false;
}
}
return true;
}
/**
* Checks if the CharSequence contains only uppercase characters.
*
*
{@code null} will return {@code false}.
* An empty String (length()=0) will return {@code false}.
*
*
* StringUtils.isAllUpperCase(null) = false
* StringUtils.isAllUpperCase("") = false
* StringUtils.isAllUpperCase(" ") = false
* StringUtils.isAllUpperCase("ABC") = true
* StringUtils.isAllUpperCase("aBC") = false
* StringUtils.isAllUpperCase("A C") = false
* StringUtils.isAllUpperCase("A1C") = false
* StringUtils.isAllUpperCase("A/C") = false
*
*
* @param cs the CharSequence to check, may be null
* @return {@code true} if only contains uppercase characters, and is non-null
* @since 2.5
* @since 3.0 Changed signature from isAllUpperCase(String) to isAllUpperCase(CharSequence)
*/
public static boolean isAllUpperCase(final String cs) {
if (Strings.isNullOrEmpty(cs)) {
return false;
}
final int sz = cs.length();
for (int i = 0; i < sz; i++) {
if (!Character.isUpperCase(cs.charAt(i))) {
return false;
}
}
return true;
}
// Abbreviating
//-----------------------------------------------------------------------
/**
* Abbreviates a String using ellipses. This will turn
* "Now is the time for all good men" into "Now is the time for..."
*
*
Specifically:
*
* - If the number of characters in {@code str} is less than or equal to
* {@code maxWidth}, return {@code str}.
* - Else abbreviate it to {@code (substring(str, 0, max-3) + "...")}.
* - If {@code maxWidth} is less than {@code 4}, throw an
* {@code IllegalArgumentException}.
* - In no case will it return a String of length greater than
* {@code maxWidth}.
*
*
*
* StringUtils.abbreviate(null, *) = null
* StringUtils.abbreviate("", 4) = ""
* StringUtils.abbreviate("abcdefg", 6) = "abc..."
* StringUtils.abbreviate("abcdefg", 7) = "abcdefg"
* StringUtils.abbreviate("abcdefg", 8) = "abcdefg"
* StringUtils.abbreviate("abcdefg", 4) = "a..."
* StringUtils.abbreviate("abcdefg", 3) = IllegalArgumentException
*
*
* @param str the String to check, may be null
* @param abbrevMarker the String indicate abbreviation
* @param maxWidth maximum length of result String, must be at least 4
* @return abbreviated String, {@code null} if null String input
* @throws IllegalArgumentException if the width is too small
* @since 2.0
*/
public static String abbreviate(final String str, final String abbrevMarker, final int maxWidth) {
return abbreviate(str, abbrevMarker, 0, maxWidth);
}
/**
* Abbreviates a String using a given replacement marker. This will turn
* "Now is the time for all good men" into "...is the time for..." if "..." was defined
* as the replacement marker.
*
*
Works like {@code abbreviate(String, String, int)}, but allows you to specify
* a "left edge" offset. Note that this left edge is not necessarily going to
* be the leftmost character in the result, or the first character following the
* replacement marker, but it will appear somewhere in the result.
*
*
In no case will it return a String of length greater than {@code maxWidth}.
*
*
* StringUtils.abbreviate(null, null, *, *) = null
* StringUtils.abbreviate("abcdefghijklmno", null, *, *) = "abcdefghijklmno"
* StringUtils.abbreviate("", "...", 0, 4) = ""
* StringUtils.abbreviate("abcdefghijklmno", "---", -1, 10) = "abcdefg---"
* StringUtils.abbreviate("abcdefghijklmno", ",", 0, 10) = "abcdefghi,"
* StringUtils.abbreviate("abcdefghijklmno", ",", 1, 10) = "abcdefghi,"
* StringUtils.abbreviate("abcdefghijklmno", ",", 2, 10) = "abcdefghi,"
* StringUtils.abbreviate("abcdefghijklmno", "::", 4, 10) = "::efghij::"
* StringUtils.abbreviate("abcdefghijklmno", "...", 6, 10) = "...ghij..."
* StringUtils.abbreviate("abcdefghijklmno", "*", 9, 10) = "*ghijklmno"
* StringUtils.abbreviate("abcdefghijklmno", "'", 10, 10) = "'ghijklmno"
* StringUtils.abbreviate("abcdefghijklmno", "!", 12, 10) = "!ghijklmno"
* StringUtils.abbreviate("abcdefghij", "abra", 0, 4) = IllegalArgumentException
* StringUtils.abbreviate("abcdefghij", "...", 5, 6) = IllegalArgumentException
*
*
* @param str the String to check, may be null
* @param abbrevMarker the String used as replacement marker
* @param offset left edge of source String
* @param maxWidth maximum length of result String, must be at least 4
* @return abbreviated String, {@code null} if null String input
* @throws IllegalArgumentException if the width is too small
* @since 3.6
*/
private static String abbreviate(final String str, final String abbrevMarker, int offset, final int maxWidth) {
if (Strings.isNullOrEmpty(str) || Strings.isNullOrEmpty(abbrevMarker)) {
return str;
}
final int abbrevMarkerLength = abbrevMarker.length();
final int minAbbrevWidth = abbrevMarkerLength + 1;
final int minAbbrevWidthOffset = abbrevMarkerLength + abbrevMarkerLength + 1;
if (maxWidth < minAbbrevWidth) {
throw new IllegalArgumentException(String.format("Minimum abbreviation width is %d", minAbbrevWidth));
}
if (str.length() <= maxWidth) {
return str;
}
if (offset > str.length()) {
offset = str.length();
}
if (str.length() - offset < maxWidth - abbrevMarkerLength) {
offset = str.length() - (maxWidth - abbrevMarkerLength);
}
if (offset <= abbrevMarkerLength+1) {
return str.substring(0, maxWidth - abbrevMarkerLength) + abbrevMarker;
}
if (maxWidth < minAbbrevWidthOffset) {
throw new IllegalArgumentException(String.format("Minimum abbreviation width with offset is %d", minAbbrevWidthOffset));
}
if (offset + maxWidth - abbrevMarkerLength < str.length()) {
return abbrevMarker + abbreviate(str.substring(offset), abbrevMarker, maxWidth - abbrevMarkerLength);
}
return abbrevMarker + str.substring(str.length() - (maxWidth - abbrevMarkerLength));
}
}