// org.jsoup.internal.StringUtil (Maven / Gradle / Ivy)
package org.jsoup.internal;
import org.jsoup.helper.Validate;
import org.jspecify.annotations.Nullable;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.Stack;
import java.util.regex.Pattern;
/**
A minimal String utility class. Designed for internal jsoup use only - the API and outcome may change without
notice.
*/
public final class StringUtil {
// memoised padding up to 21 (blocks 0 to 20 spaces)
static final String[] padding = {"", " ", " ", " ", " ", " ", " ", " ", " ",
" ", " ", " ", " ", " ", " ", " ",
" ", " ", " ", " ", " "};
/**
* Join a collection of strings by a separator
* @param strings collection of string objects
* @param sep string to place between strings
* @return joined string
*/
public static String join(Collection> strings, String sep) {
return join(strings.iterator(), sep);
}
/**
* Join a collection of strings by a separator
* @param strings iterator of string objects
* @param sep string to place between strings
* @return joined string
*/
public static String join(Iterator> strings, String sep) {
if (!strings.hasNext())
return "";
String start = strings.next().toString();
if (!strings.hasNext()) // only one, avoid builder
return start;
StringJoiner j = new StringJoiner(sep);
j.add(start);
while (strings.hasNext()) {
j.add(strings.next());
}
return j.complete();
}
/**
* Join an array of strings by a separator
* @param strings collection of string objects
* @param sep string to place between strings
* @return joined string
*/
public static String join(String[] strings, String sep) {
return join(Arrays.asList(strings), sep);
}
/**
A StringJoiner allows incremental / filtered joining of a set of stringable objects.
@since 1.14.1
*/
public static class StringJoiner {
@Nullable StringBuilder sb = borrowBuilder(); // sets null on builder release so can't accidentally be reused
final String separator;
boolean first = true;
/**
Create a new joiner, that uses the specified separator. MUST call {@link #complete()} or will leak a thread
local string builder.
@param separator the token to insert between strings
*/
public StringJoiner(String separator) {
this.separator = separator;
}
/**
Add another item to the joiner, will be separated
*/
public StringJoiner add(Object stringy) {
Validate.notNull(sb); // don't reuse
if (!first)
sb.append(separator);
sb.append(stringy);
first = false;
return this;
}
/**
Append content to the current item; not separated
*/
public StringJoiner append(Object stringy) {
Validate.notNull(sb); // don't reuse
sb.append(stringy);
return this;
}
/**
Return the joined string, and release the builder back to the pool. This joiner cannot be reused.
*/
public String complete() {
String string = releaseBuilder(sb);
sb = null;
return string;
}
}
/**
* Returns space padding (up to the default max of 30). Use {@link #padding(int, int)} to specify a different limit.
* @param width amount of padding desired
* @return string of spaces * width
* @see #padding(int, int)
*/
public static String padding(int width) {
return padding(width, 30);
}
/**
* Returns space padding, up to a max of maxPaddingWidth.
* @param width amount of padding desired
* @param maxPaddingWidth maximum padding to apply. Set to {@code -1} for unlimited.
* @return string of spaces * width
*/
public static String padding(int width, int maxPaddingWidth) {
Validate.isTrue(width >= 0, "width must be >= 0");
Validate.isTrue(maxPaddingWidth >= -1);
if (maxPaddingWidth != -1)
width = Math.min(width, maxPaddingWidth);
if (width < padding.length)
return padding[width];
char[] out = new char[width];
for (int i = 0; i < width; i++)
out[i] = ' ';
return String.valueOf(out);
}
/**
* Tests if a string is blank: null, empty, or only whitespace (" ", \r\n, \t, etc)
* @param string string to test
* @return if string is blank
*/
public static boolean isBlank(final String string) {
if (string == null || string.length() == 0)
return true;
int l = string.length();
for (int i = 0; i < l; i++) {
if (!StringUtil.isWhitespace(string.codePointAt(i)))
return false;
}
return true;
}
/**
Tests if a string starts with a newline character
@param string string to test
@return if its first character is a newline
*/
public static boolean startsWithNewline(final String string) {
if (string == null || string.length() == 0)
return false;
return string.charAt(0) == '\n';
}
/**
* Tests if a string is numeric, i.e. contains only digit characters
* @param string string to test
* @return true if only digit chars, false if empty or null or contains non-digit chars
*/
public static boolean isNumeric(String string) {
if (string == null || string.length() == 0)
return false;
int l = string.length();
for (int i = 0; i < l; i++) {
if (!Character.isDigit(string.codePointAt(i)))
return false;
}
return true;
}
/**
* Tests if a code point is "whitespace" as defined in the HTML spec. Used for output HTML.
* @param c code point to test
* @return true if code point is whitespace, false otherwise
* @see #isActuallyWhitespace(int)
*/
public static boolean isWhitespace(int c){
return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r';
}
/**
* Tests if a code point is "whitespace" as defined by what it looks like. Used for Element.text etc.
* @param c code point to test
* @return true if code point is whitespace, false otherwise
*/
public static boolean isActuallyWhitespace(int c){
return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r' || c == 160;
// 160 is (non-breaking space). Not in the spec but expected.
}
public static boolean isInvisibleChar(int c) {
return c == 8203 || c == 173; // zero width sp, soft hyphen
// previously also included zw non join, zw join - but removing those breaks semantic meaning of text
}
/**
* Normalise the whitespace within this string; multiple spaces collapse to a single, and all whitespace characters
* (e.g. newline, tab) convert to a simple space.
* @param string content to normalise
* @return normalised string
*/
public static String normaliseWhitespace(String string) {
StringBuilder sb = StringUtil.borrowBuilder();
appendNormalisedWhitespace(sb, string, false);
return StringUtil.releaseBuilder(sb);
}
/**
* After normalizing the whitespace within a string, appends it to a string builder.
* @param accum builder to append to
* @param string string to normalize whitespace within
* @param stripLeading set to true if you wish to remove any leading whitespace
*/
public static void appendNormalisedWhitespace(StringBuilder accum, String string, boolean stripLeading) {
boolean lastWasWhite = false;
boolean reachedNonWhite = false;
int len = string.length();
int c;
for (int i = 0; i < len; i+= Character.charCount(c)) {
c = string.codePointAt(i);
if (isActuallyWhitespace(c)) {
if ((stripLeading && !reachedNonWhite) || lastWasWhite)
continue;
accum.append(' ');
lastWasWhite = true;
}
else if (!isInvisibleChar(c)) {
accum.appendCodePoint(c);
lastWasWhite = false;
reachedNonWhite = true;
}
}
}
public static boolean in(final String needle, final String... haystack) {
final int len = haystack.length;
for (int i = 0; i < len; i++) {
if (haystack[i].equals(needle))
return true;
}
return false;
}
public static boolean inSorted(String needle, String[] haystack) {
return Arrays.binarySearch(haystack, needle) >= 0;
}
/**
Tests that a String contains only ASCII characters.
@param string scanned string
@return true if all characters are in range 0 - 127
*/
public static boolean isAscii(String string) {
Validate.notNull(string);
for (int i = 0; i < string.length(); i++) {
int c = string.charAt(i);
if (c > 127) { // ascii range
return false;
}
}
return true;
}
private static final Pattern extraDotSegmentsPattern = Pattern.compile("^/((\\.{1,2}/)+)");
/**
* Create a new absolute URL, from a provided existing absolute URL and a relative URL component.
* @param base the existing absolute base URL
* @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned)
* @return the resolved absolute URL
* @throws MalformedURLException if an error occurred generating the URL
*/
public static URL resolve(URL base, String relUrl) throws MalformedURLException {
relUrl = stripControlChars(relUrl);
// workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
if (relUrl.startsWith("?"))
relUrl = base.getPath() + relUrl;
// workaround: //example.com + ./foo = //example.com/./foo, not //example.com/foo
URL url = new URL(base, relUrl);
String fixedFile = extraDotSegmentsPattern.matcher(url.getFile()).replaceFirst("/");
if (url.getRef() != null) {
fixedFile = fixedFile + "#" + url.getRef();
}
return new URL(url.getProtocol(), url.getHost(), url.getPort(), fixedFile);
}
/**
* Create a new absolute URL, from a provided existing absolute URL and a relative URL component.
* @param baseUrl the existing absolute base URL
* @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned)
* @return an absolute URL if one was able to be generated, or the empty string if not
*/
public static String resolve(String baseUrl, String relUrl) {
// workaround: java will allow control chars in a path URL and may treat as relative, but Chrome / Firefox will strip and may see as a scheme. Normalize to browser's view.
baseUrl = stripControlChars(baseUrl); relUrl = stripControlChars(relUrl);
try {
URL base;
try {
base = new URL(baseUrl);
} catch (MalformedURLException e) {
// the base is unsuitable, but the attribute/rel may be abs on its own, so try that
URL abs = new URL(relUrl);
return abs.toExternalForm();
}
return resolve(base, relUrl).toExternalForm();
} catch (MalformedURLException e) {
// it may still be valid, just that Java doesn't have a registered stream handler for it, e.g. tel
// we test here vs at start to normalize supported URLs (e.g. HTTP -> http)
return validUriScheme.matcher(relUrl).find() ? relUrl : "";
}
}
private static final Pattern validUriScheme = Pattern.compile("^[a-zA-Z][a-zA-Z0-9+-.]*:");
private static final Pattern controlChars = Pattern.compile("[\\x00-\\x1f]*"); // matches ascii 0 - 31, to strip from url
private static String stripControlChars(final String input) {
return controlChars.matcher(input).replaceAll("");
}
private static final ThreadLocal> threadLocalBuilders = ThreadLocal.withInitial(Stack::new);
/**
* Maintains cached StringBuilders in a flyweight pattern, to minimize new StringBuilder GCs. The StringBuilder is
* prevented from growing too large.
*
* Care must be taken to release the builder once its work has been completed, with {@link #releaseBuilder}
* @return an empty StringBuilder
*/
public static StringBuilder borrowBuilder() {
Stack builders = threadLocalBuilders.get();
return builders.empty() ?
new StringBuilder(MaxCachedBuilderSize) :
builders.pop();
}
/**
* Release a borrowed builder. Care must be taken not to use the builder after it has been returned, as its
* contents may be changed by this method, or by a concurrent thread.
* @param sb the StringBuilder to release.
* @return the string value of the released String Builder (as an incentive to release it!).
*/
public static String releaseBuilder(StringBuilder sb) {
Validate.notNull(sb);
String string = sb.toString();
if (sb.length() > MaxCachedBuilderSize)
sb = new StringBuilder(MaxCachedBuilderSize); // make sure it hasn't grown too big
else
sb.delete(0, sb.length()); // make sure it's emptied on release
Stack builders = threadLocalBuilders.get();
builders.push(sb);
while (builders.size() > MaxIdleBuilders) {
builders.pop();
}
return string;
}
private static final int MaxCachedBuilderSize = 8 * 1024;
private static final int MaxIdleBuilders = 8;
}