// com.force.i18n.commons.text.TextUtil (from the grammaticus artifact; Maven / Gradle / Ivy)
// Web-page residue kept as a comment: "Show all versions of grammaticus", "Show documentation"
/*
* Copyright (c) 2017, salesforce.com, inc.
* All rights reserved.
* Licensed under the BSD 3-Clause license.
* For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
package com.force.i18n.commons.text;
import java.text.CollationKey;
import java.text.Collator;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.annotations.Beta;
import com.google.common.collect.Iterators;
/**
* A package of generic text utility functions.
*
* Beta class. Classes under com.force.i18n.commons package will be moved into a dedicated project.
*
* @author davem,pnakada,jjordano,et. al.
*/
@Beta
public final class TextUtil {
private static final Logger logger = Logger.getLogger(TextUtil.class.getName());

// Length thresholds applied by logValue to strings handed to the escaping methods.
private static final int MIN_LOG_LENGTH = 1000000; // above 1,000,000 chars: reported via logLongString
public static final int MIN_GACK_LENGTH = 23000000; // above 23,000,000 chars: reported via logHugeString
private static final int MIN_REJECT_LENGTH = 30000000; // above 30,000,000 chars: rejected with an exception
/* package */ static final String LOG_REJECT_MESSAGE = " attempt to process a string of length ";

// Source labels passed to logValue by the two escaping entry points.
private static final String ESCAPE_TO_XML = "escapeToXml";
private static final String ESCAPE_TO_HTML = "escapeToHtml";

// Default run length of non-whitespace characters after which a break marker is inserted.
private static final int defaultMaxWordLength = 30;
// Cache of compiled "long word" patterns, keyed by maximum word length.
// Typed explicitly so lookups yield a Pattern without casts.
private static final Map<Integer, Pattern> longWords = new HashMap<Integer, Pattern>();
// Marker appended after each over-long run of non-whitespace characters.
// NOTE(review): the name suggests this was an HTML word-break tag that may have been lost
// when this file was extracted; the value is left unchanged here - confirm against upstream.
private static final String wbr = " ";

// Optional sink for suspicious-length reports; nothing is reported while this is null.
private static SuspiciousValueLogger SUSPICIOUS_LOGGER = null;
/**
* Callback interface for reporting suspiciously long strings that are passed to the escaping
* methods of this class. {@code logValue} invokes these callbacks on the configured instance,
* when one is set.
*/
public static interface SuspiciousValueLogger {
/**
* Called when a value longer than 1,000,000 characters is processed.
* @param logger the logger for the TextUtil class
* @param source the source label specified by the caller
* @param length the length of the long string
* @param first1000Chars the first 1000 chars of the string
*/
void logLongString(Logger logger, String source, int length, String first1000Chars);
/**
* Called when a value longer than 23,000,000 characters is processed.
* {@code logLongString} is called first for the same value.
* @param logger the logger for the TextUtil class
* @param source the source label specified by the caller
* @param length the length of the huge string
* @param ex an exception created inside the logValue method (carries the calling stack trace)
*/
void logHugeString(Logger logger, String source, int length, Exception ex);
/**
* Called when an unexpected exception is raised inside the logValue method while it is
* inspecting or reporting a value, so that the failure does not propagate to the caller.
* @param logger the logger for the TextUtil class
* @param source the source label specified by the caller
* @param ex an exception supplied by the logValue method
*/
void logExceptionWhenLogging(Logger logger, String source, Exception ex);
}
/**
 * Inspects the length of a value about to be escaped and reports or rejects it when it is
 * unreasonably long.
 *
 * Values longer than {@code MIN_LOG_LENGTH} are reported through
 * {@link SuspiciousValueLogger#logLongString}, values longer than {@code MIN_GACK_LENGTH}
 * additionally through {@link SuspiciousValueLogger#logHugeString}, and values longer than
 * {@code MIN_REJECT_LENGTH} are rejected. Reporting only happens when a logger is configured;
 * rejection happens regardless.
 *
 * @param source label identifying the calling operation; used in reports and the error message
 * @param value the value to inspect; {@code null} is ignored
 * @throws IllegalArgumentException if the value is longer than {@code MIN_REJECT_LENGTH}
 */
protected static void logValue(String source, CharSequence value) {
    if (value == null) {
        return;
    }
    try {
        int length = value.length();
        if (length > MIN_LOG_LENGTH) {
            // Only pass along the first 1000 characters so the report itself stays small; that is
            // enough to judge whether the string is legitimate or should be blocked.
            if (SUSPICIOUS_LOGGER != null) {
                SUSPICIOUS_LOGGER.logLongString(logger, source, length, value.subSequence(0, 1000).toString());
            }
        }
        if (length > MIN_GACK_LENGTH) {
            if (SUSPICIOUS_LOGGER != null) {
                // A fresh exception is created purely to capture the current stack trace.
                SUSPICIOUS_LOGGER.logHugeString(logger, source, length, new Exception());
            }
        }
        if (length > MIN_REJECT_LENGTH) {
            throw new IllegalArgumentException(source + LOG_REJECT_MESSAGE + length);
        }
    } catch (IllegalArgumentException e) {
        // Rethrow so the rejection is not swallowed by the generic handler below.
        throw e;
    } catch (Exception e) {
        // Reporting must never make the actual operation fail, so everything else is contained
        // here. Hand over the exception that was actually caught so its cause is not lost.
        if (SUSPICIOUS_LOGGER != null) {
            SUSPICIOUS_LOGGER.logExceptionWhenLogging(logger, source, e);
        }
    }
}
/**
* Escapes a string into valid XML, additionally replacing control characters with spaces.
* Delegates to the three-argument overload with {@code allowNewLines} and
* {@code convertNulls} both {@code false}.
*
* Unless you are writing an element class or writing something that doesn't use elements,
* you probably shouldn't call this method.
*
* Calling this method in conjunction with an element is an error and will result in double-escaping.
*
* The convention in the app is that all escaping is done at output time by Elements
* and output should go through elements when possible. If you are using this method, you should think
* carefully about what you are doing and decide if it's truly necessary to bypass elements.
* @param input the text to escape
* @return the string after escaping
*/
public static String escapeToXml(CharSequence input) {
return escapeToXml(input, false, false);
}
/**
* Escapes a string into valid XML. Delegates to the four-argument overload with
* {@code escapeApos} set to {@code false}.
* @param input the text to escape
* @param allowNewLines passed through to the full overload (newline handling)
* @param convertNulls passed through to the full overload (null handling)
* @return the string after escaping
*/
public static String escapeToXml(CharSequence input, boolean allowNewLines, boolean convertNulls) {
return escapeToXml(input, allowNewLines, convertNulls, false);
}
/**
* Escapes a string into valid XML. Delegates to the five-argument overload with
* {@code preserveWhitespace} set to {@code false}.
* @param input the text to escape
* @param allowNewLines passed through to the full overload (newline handling)
* @param convertNulls passed through to the full overload (null handling)
* @param escapeApos passed through to the full overload (apostrophe handling)
* @return the string after escaping
*/
public static String escapeToXml(CharSequence input, boolean allowNewLines, boolean convertNulls,
boolean escapeApos) {
return escapeToXml(input, allowNewLines, convertNulls, escapeApos, false);
}
/**
* Escapes String
s into valid xml. Similar to escapeInput
except that it will also
* replace control characters with spaces.
*
* Unless you are writing an element class or writing something that doesn't use elements,
* you probably shouldn't call this method.
*
* Calling this method in conjunction with an element is an error and will result in double-escaping.
*
* The convention in the app is that all escaping is done at output time by Elements
* and output should go through elements when possible. If you are using this method, you should think
* carefully about what you are doing and decide if it's truly necessary to bypass elements.
* @return the string after escaping
* @param input
* the text to escape
* @param allowNewLines
* if false, newlines (\r or \n) are converted to spaces instead
* @param convertNulls
* convert nulls to the empty string if treu
* @param escapeApos
* Add a backslash in front of apostrophes to deal with MSXML's nonsense
* @param preserveWhitespace
* if true, whitespace chars (as defined by {@link Character#isWhitespace(char)}) are not converted to
* spaces. If false, they may be converted to spaces if they are control characters. This argument is weaker
* than {@code allowNewLines}, so if {@code allowNewLines} is true but this argument is false, newlines will
* be preserved anyway. Since newlines are also whitespace, if {@code allowNewLines} is false but this
* argument is true, then newlines will still be preserved.
*/
public static String escapeToXml(CharSequence input, boolean allowNewLines, boolean convertNulls,
boolean escapeApos, boolean preserveWhitespace) {
if (input == null || input.length() == 0) {
return convertNulls ? "" : input == null ? null : "";
}
logValue(ESCAPE_TO_XML, input);
int limit = input.length();
DeferredStringBuilder buf = new DeferredStringBuilder(input);
for (int i = 0; i < limit; i++) {
char c = input.charAt(i);
switch (c) {
case '\n':
buf.append(allowNewLines ? '\n' : ' ');
break;
case '\r':
buf.append(allowNewLines ? '\r' : ' ');
break;
case '<':
buf.append("<");
break;
case '>':
buf.append(">");
break;
case '&':
buf.append("&");
break;
case '"':
buf.append(""");
break;
case '\'':
buf.append(escapeApos ? "'" : "\'");
break;
default:
if (!(preserveWhitespace && Character.isWhitespace(c)) && isIsoControlOrOddUnicode(c)) {
buf.append(' ');
} else {
buf.append(c);
}
break;
}
}
return buf.toString();
}
/**
 * Tells whether a character should be treated as unusable text: an ISO control character,
 * a code unit with no defined Unicode meaning, or any surrogate code unit.
 *
 * @param c the character to test
 * @return true if the character is an ISO control, undefined, or in one of the surrogate blocks
 */
public static boolean isIsoControlOrOddUnicode(char c) {
    if (Character.isISOControl(c)) {
        return true;
    }
    if (!Character.isDefined(c)) {
        return true;
    }
    final Character.UnicodeBlock unicodeBlock = Character.UnicodeBlock.of(c);
    return unicodeBlock == Character.UnicodeBlock.HIGH_SURROGATES
            || unicodeBlock == Character.UnicodeBlock.HIGH_PRIVATE_USE_SURROGATES
            || unicodeBlock == Character.UnicodeBlock.LOW_SURROGATES;
}
/**
* Escape output being sent to the user to be safe in HTML. Replaces the characters
* less-than, greater-than, ampersand, double quote etc. with their HTML escape sequences.
* Does not translate \n's: this delegates to the two-argument overload with
* {@code escapeNewline} set to {@code false}.
*
* Unless you are writing an element class or writing something that doesn't use elements,
* you probably shouldn't call this method.
*
* Calling this method in conjunction with an element is an error and will result in double-escaping.
*
* The convention in the app is that all escaping is done at output time by Elements
* and output should go through elements when possible. If you are using this method, you should think
* carefully about what you are doing and decide if it's truly necessary to bypass elements.
* @param value the string to escape to HTML
* @return the escaped string
*/
public static String escapeToHtml(String value) {
return TextUtil.escapeToHtml(value, false);
}
/**
* Escape output being sent to the user to be safe in HTML. Replaces < > & " etc. with their HTML escape
* sequences. Also translates '\n' to <br> if escapeNewline
is true
.
*
* Unless you are writing an element class or writing something that doesn't use elements,
* you probably shouldn't call this method.
*
* Calling this method in conjunction with an element is an error and will result in double-escaping.
*
* The convention in the app is that all escaping is done at output time by Elements
* and output should go through elements when possible. If you are using this method, you should think
* carefully about what you are doing and decide if it's truly necessary to bypass elements.
* @param value the string to escape to HTML
* @param escapeNewline if the new lines should be converted to <br>
* @return the escaped string
*/
public static String escapeToHtml(String value, boolean escapeNewline) {
if (value == null || value.length() == 0) {
return value;
}
logValue(ESCAPE_TO_HTML, value);
DeferredStringBuilder buf = new DeferredStringBuilder(value);
// Optimized version of appendEscapedOutput where we can use appendQuicklyForEscapingWithoutSkips
final int length = value.length();
for (int i = 0; i < length; i++) {
char c = value.charAt(i);
// TODO: Is this switch statement faster than an IntHashMap? I'm guessing it is.
switch (c) {
case '\n': if (escapeNewline) {
buf.append("
");
} else {
buf.appendQuicklyForEscapingWithoutSkips(c);
} break;
case '<': buf.append("<"); break;
case '>': buf.append(">"); break;
case '&': buf.appendAsDifferent("&"); break;
case '"': buf.append("""); break;
case '\'': buf.append("'"); break;
case '\u2028': buf.append("
"); break;
case '\u2029': buf.append(""); break;
case '\u00a9': buf.append("©"); break; // ©
default: buf.appendQuicklyForEscapingWithoutSkips(c);
}
}
return buf.toString();
}
/**
* Returns a copy of the string with all leading and trailing whitespace characters omitted.
* This is different from {@code java.lang.String.trim()}, which only trims characters up to
* U+0020 but not characters like the wide space character in Japanese (U+3000).
* Returns an empty String if the input String is all whitespace, and null if the input is null.
* @param str the string to trim
* @return the trimmed string
*/
public static String trim(String str) {
return TextUtil.trim(str, false);
}
/**
 * Null-safe variant of {@link #isEmptyOrWhitespace(CharSequence)}.
 *
 * @param str the text to inspect, may be null
 * @return true if the text is null, empty, or made up solely of whitespace
 */
public static boolean isNullEmptyOrWhitespace(CharSequence str) {
    return str == null || isEmptyOrWhitespace(str);
}
/**
 * Tells whether the text contains nothing but whitespace. A character counts as whitespace
 * when it is at or below the space character or when {@link Character#isWhitespace(char)}
 * accepts it.
 *
 * @param str the text to inspect; must not be null
 * @return true if the text is empty or consists only of whitespace characters
 */
public static boolean isEmptyOrWhitespace(CharSequence str) {
    final int length = str.length();
    int pos = 0;
    while (pos < length) {
        final char current = str.charAt(pos);
        if (current > ' ' && !Character.isWhitespace(current)) {
            return false;
        }
        pos++;
    }
    return true;
}
/**
 * Strips leading and trailing whitespace, where whitespace is any character at or below the
 * space character or accepted by {@link Character#isWhitespace(char)}.
 *
 * @param str
 *        String to be trimmed
 * @param returnNullIfEmptyString
 *        whether an input that is empty after trimming yields null instead of an empty string
 * @return null if the input is null; null if nothing remains after trimming and
 *         {@code returnNullIfEmptyString} is true; the input itself if nothing was trimmed;
 *         otherwise the trimmed substring
 */
private static String trim(String str, boolean returnNullIfEmptyString) {
    if (str == null) {
        return null;
    }
    final int length = str.length();

    // Advance past leading whitespace.
    int first = 0;
    for (; first < length; first++) {
        final char ch = str.charAt(first);
        if (ch > ' ' && !Character.isWhitespace(ch)) {
            break;
        }
    }

    // Back up over trailing whitespace, never crossing the leading boundary.
    int last = length;
    for (; last > first; last--) {
        final char ch = str.charAt(last - 1);
        if (ch > ' ' && !Character.isWhitespace(ch)) {
            break;
        }
    }

    if (returnNullIfEmptyString && first == last) {
        return null;
    }
    if (first == 0 && last == length) {
        return str;
    }
    return str.substring(first, last);
}
/**
 * Splits the given string str using the given delimiter and returns the result as a string list. If str is null, then
 * null is returned.
 *
 * The returned string list is an ArrayList that is constructed using 4 as the ArrayList's initial size. If you
 * expect to have more than four elements more than just on the rare occasion, then please consider using another
 * splitSimple overload that lets you pass in the expected size.
 *
 * The delimiter is matched literally; no regular expression is involved.
 *
 * CAUTION: The str and delimiter parameters are in an order that differs from other string splitting
 * methods. Be absolutely sure that you get the str and delimiter parameter arguments correct.
 *
 * @param delimiter
 *        The delimiter to split the string using
 * @param str
 *        The string to split
 * @return String list or, if str was null, then null
 */
public static List<String> splitSimple(String delimiter, String str) {
    return splitSimple(delimiter, str, 4);
}
/**
 * Splits the given string str using the given delimiter and returns the result as a string list. If str is null, then
 * null is returned.
 *
 * The returned string list is an ArrayList that is constructed using the given expected size as the ArrayList's
 * initial size. If you are not aware of the expected size, then use 0, which will cause this method to use a
 * LinkedList instead of an ArrayList.
 *
 * The delimiter is matched literally; no regular expression is involved.
 *
 * CAUTION: The str and delimiter parameters are in an order that differs from other string splitting
 * methods. Be absolutely sure that you get the str and delimiter parameter arguments correct.
 *
 * @param delimiter
 *        The delimiter to split the string using
 * @param str
 *        The string to split
 * @param expectedSize
 *        The expected number of elements in the output list. If you don't know, or if it could be arbitrarily
 *        large, and if you will only access the returned list sequentially with an iterator, then use 0 to tell
 *        this method to use a LinkedList
 * @return String list or, if str was null, then null
 */
public static List<String> splitSimple(String delimiter, String str, int expectedSize) {
    // The private implementation takes the string first, so the arguments are swapped here on purpose.
    return splitSimple(str, delimiter, expectedSize, false);
}
/**
 * Splits the given string str using the given delimiter, trims each element, and returns the result as a string
 * list. If str is null, then null is returned.
 *
 * The returned string list is an ArrayList that is constructed using the given expected size as the ArrayList's
 * initial size. If you are not aware of the expected size, then use 0, which will cause this method to use a
 * LinkedList instead of an ArrayList.
 *
 * The delimiter is matched literally; no regular expression is involved.
 *
 * @param str
 *        The string to split
 * @param delimiter
 *        The delimiter to split the string using
 * @param expectedSize
 *        The expected number of elements in the output list. If you don't know, or if it could be arbitrarily
 *        large, and if you will only access the returned list sequentially with an iterator, then use 0 to tell
 *        this method to use a LinkedList
 * @return String list or, if str was null, then null
 */
public static List<String> splitSimpleAndTrim(String str, String delimiter, int expectedSize) {
    return splitSimple(str, delimiter, expectedSize, true);
}
/**
 * Splits {@code s} on the literal delimiter {@code split}, keeping trailing empty elements.
 *
 * @param s the string to split; null yields null
 * @param split the literal delimiter
 * @param expectedSize initial capacity of the result list, or 0 to use a LinkedList
 * @param shouldTrim whether each element is trimmed
 * @return the list of elements, or null if {@code s} was null
 */
private static List<String> splitSimple(String s, String split, int expectedSize, boolean shouldTrim) {
    return splitSimple(s, split, expectedSize, shouldTrim, false);
}
/**
 * Splits a string on a literal delimiter.
 *
 * @param str the string to split; null yields null
 * @param delimiter the literal delimiter; must not be null
 * @param expectedSize initial capacity of the result list, or 0 to use a LinkedList
 * @param shouldTrim whether each element is passed through {@link String#trim()}
 * @param ignoreTrailingEmpty whether empty elements at the end of the result are discarded; this
 *        mode also accepts an empty delimiter, in which case the result is an empty string
 *        followed by one element per character
 * @return the list of elements, or null if {@code str} was null
 * @throws IllegalArgumentException if the delimiter is empty and {@code ignoreTrailingEmpty} is false
 */
private static List<String> splitSimple(String str, String delimiter, int expectedSize, boolean shouldTrim,
        boolean ignoreTrailingEmpty) {
    if (str == null) {
        return null;
    }
    List<String> result = (expectedSize == 0) ? new LinkedList<String>() : new ArrayList<String>(expectedSize);

    if (delimiter.length() == 0) {
        if (!ignoreTrailingEmpty) {
            throw new IllegalArgumentException("delimiter must not be empty");
        }
        // Special case to match java's behavior: a leading empty element, then every character.
        result.add("");
        for (int i = 0; i < str.length(); i++) {
            result.add(String.valueOf(str.charAt(i)));
        }
        return result;
    }

    // Special case to match java's behavior: an empty input yields a single empty element.
    if (ignoreTrailingEmpty && str.length() == 0) {
        result.add("");
        return result;
    }

    int start = 0;
    int indexof;
    while ((indexof = str.indexOf(delimiter, start)) != -1) {
        String substring = str.substring(start, indexof);
        result.add(shouldTrim ? substring.trim() : substring);
        start = indexof + delimiter.length();
        if (start >= str.length()) {
            break;
        }
    }
    if (start == str.length()) {
        // The string ended right after a delimiter (or was empty): record the final empty element.
        result.add("");
    } else if (start < str.length()) {
        String substring = str.substring(start);
        result.add(shouldTrim ? substring.trim() : substring);
    }

    if (ignoreTrailingEmpty) {
        // Discard empty substrings at the end.
        while (!result.isEmpty() && result.get(result.size() - 1).length() == 0) {
            result.remove(result.size() - 1);
        }
    }
    return result;
}
/**
 * Upper-cases the first character of a string, leaving the remainder untouched.
 *
 * @param in the string to capitalize; may be null
 * @return the input itself when it is null, empty, or already starts with an upper-case
 *         character; otherwise a string whose first character has been upper-cased
 */
public static String initCap(String in) {
    if (in == null || in.length() == 0) {
        return in;
    }
    final char first = in.charAt(0);
    if (Character.isUpperCase(first)) {
        return in;
    }
    if (in.length() == 1) {
        return in.toUpperCase();
    }
    final char[] chars = in.toCharArray();
    chars[0] = Character.toUpperCase(first);
    return new String(chars);
}
/**
* Escapes single quotes and curly brackets in a string so it can be used as a
* {@code java.text.MessageFormat} pattern. Delegates to the three-argument overload with a
* new StringBuilder sized to the input and {@code check} set to {@code true}.
* @param str the source string; must not be null because its length is read here
* @return the escaped string
*/
public static String escapeForMessageFormat(String str) {
return escapeForMessageFormat(str, new StringBuilder(str.length()), true).toString();
}
/**
 * Single-quotes and curly brackets are special characters used by Java's MessageFormat class.
 * Specify check = true to only escape if the string contains a left curly bracket at all.
 * The escaping rules are:
 * 1) All single quotes become two single quotes
 * 2) All left curly brackets that are not part of a numbered param (eg. {0}) are wrapped with
 *    single quotes
 * NOTE: There's only support for wrapping left curly brackets with single-quotes; any
 * other use of single-quotes will be escaped. An already-doubled single quote and an
 * already-wrapped left curly bracket are copied through unchanged.
 *
 * @param src the source string; must not be null
 * @param sb the string builder to append to
 * @param check if true and the source contains no left curly bracket, the source is appended
 *        unmodified (single quotes are then not escaped either)
 * @return sb
 */
public static StringBuilder escapeForMessageFormat(String src, StringBuilder sb, boolean check) {
    if (check && src.indexOf('{') < 0) {
        // Label doesn't specify any substitutions, ie. {0}
        return sb.append(src);
    }
    final int length = src.length();
    for (int idx = 0; idx < length; idx++) {
        char ch = src.charAt(idx);
        if (ch == '{') {
            sb.append(needsBraceEscape(src, idx) ? "'{'" : "{");
        } else if (ch == '\'') {
            if (idx + 1 < length && src.charAt(idx + 1) == '\'') {
                // Single-quote is already escaped; copy the pair and skip its second half.
                sb.append("''");
                idx++;
            } else if (idx + 2 < length && src.charAt(idx + 1) == '{' && src.charAt(idx + 2) == '\'') {
                // Found an escaped curly left bracket, so leave it alone. The bounds check
                // matters: an input ending in quote + bracket must not be read past its end.
                sb.append("'{'");
                idx += 2;
            } else {
                // Plain single quote (including one at the very end): double it.
                sb.append("''");
            }
        } else {
            sb.append(ch);
        }
    }
    return sb;
}

/**
 * Decides whether the left curly bracket at {@code braceIdx} must be wrapped in single quotes,
 * i.e. whether it does NOT start a numbered param such as {0}, {0, number} or
 * {0, number, integer}.
 */
private static boolean needsBraceEscape(String src, int braceIdx) {
    final int length = src.length();
    if (braceIdx + 1 == length) {
        // Always escape if it's the last char
        return true;
    }
    for (int i = braceIdx + 1; i < length; i++) {
        char nextCh = src.charAt(i);
        if (nextCh == '}') {
            // Only digits were seen so far; escape only if there were none at all, i.e. {}
            return i == braceIdx + 1;
        }
        if (nextCh > '9' || nextCh < '0' || i == length - 1) {
            // A non-digit, or the end of the string without a closing bracket. The bracket is
            // still a numbered param when at least one digit is followed by a comma.
            return nextCh != ',' || i == braceIdx + 1;
        }
    }
    return false;
}
/**
* Break long words and escape markup to HTML, using the default maximum word length
* ({@code defaultMaxWordLength}, 30). This method does not activate links.
* @param text the text to break and escape
* @param preserveNewLinesInHtml passed to {@code escapeToHtml} as its {@code escapeNewline} flag
* @return the escaped and broken string
*/
public static String breakLongWordsAndEscapeToHTML(String text, boolean preserveNewLinesInHtml) {
return breakLongWordsAndEscapeToHTML(text, preserveNewLinesInHtml, defaultMaxWordLength);
}
/**
 * Break long words and escape markup to HTML. This method does not activate links.
 *
 * The text is cut after every run of {@code maxWordLength} consecutive non-whitespace
 * characters; each piece is HTML-escaped and followed by the {@code wbr} break marker. The
 * compiled pattern for each distinct {@code maxWordLength} is cached in {@code longWords}.
 *
 * @param text the text to break and escape
 * @param preserveNewLinesInHtml passed to {@code escapeToHtml} as its {@code escapeNewline} flag
 * @param maxWordLength length of a non-whitespace run after which a break marker is inserted
 * @return the escaped and broken string, or null if {@code text} is null
 */
public static String breakLongWordsAndEscapeToHTML(String text, boolean preserveNewLinesInHtml, int maxWordLength) {
    if (text == null) {
        return null;
    }
    Pattern longWordPattern = longWords.get(maxWordLength);
    if (longWordPattern == null) {
        longWordPattern = Pattern.compile("\\S{" + maxWordLength + "}");
        longWords.put(maxWordLength, longWordPattern);
    }

    final StringBuilder out = new StringBuilder();
    int consumed = 0;
    // Each match ends exactly where a break marker belongs.
    for (Matcher matcher = longWordPattern.matcher(text); matcher.find(); ) {
        final int breakAt = matcher.end();
        out.append(escapeToHtml(text.substring(consumed, breakAt), preserveNewLinesInHtml));
        out.append(wbr);
        consumed = breakAt;
    }
    // Whatever follows the last break is escaped without a trailing marker.
    out.append(escapeToHtml(text.substring(consumed), preserveNewLinesInHtml));
    return out.toString();
}
/**
* Concatenate the string values of zero or more strings. Delegates to the
* {@code Object...} overload, passing the strings as its array argument.
*
* @return the concatenated strings
* @param separator - string that will delimit the result.
* @param strings - the strings whose values will be concatenated
*/
public static String join(String separator, String ... strings) {
// The cast passes the array itself as the varargs array rather than as a single element.
return join(separator, (Object[])strings);
}
/**
 * Concatenate the string values of zero or more objects.
 *
 * No objects yields the empty string and a single object yields its {@code toString()}
 * value; two or more are handed to the iterator-based {@code join} overload.
 *
 * @param separator - string that will delimit the result.
 * @param objects - the objects whose string values will be concatenated
 * @return the concatenated objects
 */
public static String join(String separator, Object ... objects) {
    final int count = objects.length;
    if (count == 0) {
        return "";
    }
    if (count == 1) {
        return objects[0].toString();
    }
    return join(separator, Iterators.forArray(objects));
}
/**
 * Creates a string comparator that orders by the given collator and caches the collation
 * keys it computes.
 *
 * @param collator the collator to use for comparison
 * @param size the initial capacity of the collation-key cache
 * @return a new comparator for strings for the given collator
 */
public static Comparator<String> getComparator(Collator collator, int size) {
    return new CollatingComparator(collator, size);
}
/**
* A String comparator that uses the current collation, but
* @author stamm
*/
static class CollatingComparator implements Comparator {
private final Collator collator;
private final Map cKeyMap;
CollatingComparator(Collator collator) {
this(collator, 16);
}
CollatingComparator(Collator collator, int defaultSize) {
this.collator = collator;
cKeyMap = new HashMap(defaultSize);
}
@Override
public int compare(String o1, String o2) {
if (o1 == o2) {
return 0;
} else if (o2 == null) {
return 1;
} else if (o1 == null) {
return -1;
}
return getCollationKey(o1).compareTo(getCollationKey(o2));
}
private CollationKey getCollationKey(String comp) {
CollationKey key = cKeyMap.get(comp);
if (key == null) {
key = collator.getCollationKey(comp);
cKeyMap.put(comp, key);
}
return key;
}
}
// Characters replaced when escaping for JSON: backslash, backspace, form feed, newline,
// carriage return, tab and double quote.
private static final String[] JSON_IN = new String[] { "\\", "\b", "\f", "\n", "\r", "\t", "\"" };
// Replacement for the entry at the same index in JSON_IN: the backslash-escaped form.
private static final String[] JSON_OUT = new String[] { "\\\\", "\\b", "\\f", "\\n", "\\r", "\\t", "\\\""};
// Matcher built once from the two parallel arrays above.
// NOTE(review): TrieMatcher is declared elsewhere; presumably it pairs JSON_IN[i] with JSON_OUT[i] - confirm.
private static final TrieMatcher JSON_SEARCH_REPLACE = TrieMatcher.compile(JSON_IN, JSON_OUT);
/**
* Escapes a string for use inside a JSON string literal by applying the
* {@code JSON_SEARCH_REPLACE} matcher: backslash, double quote, backspace, form feed,
* newline, carriage return and tab are replaced by their backslash-escaped forms.
* NOTE: refer to RFC 8259 / ECMA 404. This method does not escape solidus (\x2f), which is
* acceptable in JSON either as a plain slash or escaped.
* NOTE(review): control characters other than those listed above are not in the replacement
* table, so they presumably pass through unchanged - confirm against TrieMatcher.replaceMultiple.
* @see RFC 8259 #7 Strings
* @param in the string to escape
* @return the escaped string for json
*/
public static String escapeForJsonString(String in) {
return TrieMatcher.replaceMultiple(in, JSON_SEARCH_REPLACE);
}
}