// edu.isi.nlp.StringUtils (Maven / Gradle / Ivy)
package edu.isi.nlp;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import com.google.common.annotations.Beta;
import com.google.common.base.CharMatcher;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Predicate;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.io.CharSource;
import com.google.common.io.Files;
import com.google.common.io.LineProcessor;
import edu.isi.nlp.strings.offsets.CharOffset;
import edu.isi.nlp.strings.offsets.OffsetRange;
import java.io.File;
import java.io.IOException;
import java.text.Normalizer;
import java.util.Locale;
import java.util.Set;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
@Beta
public final class StringUtils {
/** Prevents instantiation: always throws, so not even reflection can create an instance. */
private StringUtils() {
throw new UnsupportedOperationException();
}
/**
 * Wraps a plain Java string in a {@link UnicodeFriendlyString}; see that interface's Javadoc for
 * more details.
 *
 * <p>The implementation is chosen by comparing the number of code points with the number of
 * UTF-16 code units: if they differ, at least one code point occupies two {@code char}s.
 *
 * @param utf16CodeUnits the string to wrap; must not be null
 */
public static UnicodeFriendlyString unicodeFriendly(String utf16CodeUnits) {
  final int numCodeUnits = utf16CodeUnits.length();
  final boolean hasSurrogatePairs =
      utf16CodeUnits.codePointCount(0, numCodeUnits) != numCodeUnits;
  if (hasSurrogatePairs) {
    return StringWithNonBmp.of(utf16CodeUnits);
  }
  return StringWithoutNonBmp.of(utf16CodeUnits);
}
/**
 * Applies {@link #unicodeFriendly(String)} to the contents of a list, preserving iteration
 * order.
 *
 * @param strings the strings to convert; neither the iterable nor any element may be null
 * @return an immutable list holding the converted strings
 */
public static ImmutableList<UnicodeFriendlyString> unicodeFriendlyList(
    Iterable<String> strings) {
  final ImmutableList.Builder<UnicodeFriendlyString> ret = ImmutableList.builder();
  for (final String s : strings) {
    ret.add(unicodeFriendly(s));
  }
  return ret.build();
}
/**
 * Applies {@link #unicodeFriendly(String)} to the contents of a set.
 *
 * @param strings the strings to convert; neither the iterable nor any element may be null
 * @return an immutable set holding the converted strings
 */
public static ImmutableSet<UnicodeFriendlyString> unicodeFriendlySet(Iterable<String> strings) {
  final ImmutableSet.Builder<UnicodeFriendlyString> ret = ImmutableSet.builder();
  for (final String s : strings) {
    ret.add(unicodeFriendly(s));
  }
  return ret.build();
}
/**
 * Applies {@link #unicodeFriendly(String)} to each of the given strings and collects the results
 * into a set.
 *
 * @param s1 the first string; required so the method cannot be called with no arguments
 * @param strings any further strings
 * @return an immutable set holding the converted strings
 */
public static ImmutableSet<UnicodeFriendlyString> unicodeFriendlySet(
    String s1, String... strings) {
  final ImmutableSet.Builder<UnicodeFriendlyString> ret = ImmutableSet.builder();
  ret.add(unicodeFriendly(s1));
  for (final String s : strings) {
    ret.add(unicodeFriendly(s));
  }
  return ret.build();
}
/**
 * Converts a single Unicode code point into a {@link String}. The result has one {@code char}
 * for a BMP code point and two (a surrogate pair) for a supplementary one.
 *
 * @throws IllegalArgumentException if {@code codePoint} is not a valid Unicode code point
 */
public static String codepointToString(int codePoint) {
  final char[] utf16 = Character.toChars(codePoint);
  return String.valueOf(utf16);
}
/**
 * Returns a string which is the result of replacing every match of {@code regex} in the input
 * string with the result of applying {@code replacementFunction} to the match. This is a
 * candidate to be moved to a more general utility package.
 *
 * <p>The regex is compiled on every call; callers in a hot path should compile the pattern once
 * and use the {@link Pattern} overload instead.
 *
 * @param input the string to search
 * @param regex the regular expression to compile and match
 * @param replacementFunction May not return null.
 */
public static String replaceAll(
    final String input,
    final String regex,
    final Function<MatchResult, String> replacementFunction) {
  return replaceAll(input, Pattern.compile(regex), replacementFunction);
}
/**
 * Returns a string which is the result of replacing every match of {@code regex} in the input
 * string with the result of applying {@code replacementFunction} to the match. This is a
 * candidate to be moved to a more general utility package.
 *
 * <p>The function's output is inserted literally: {@code $} and {@code \} in it are not
 * interpreted as group references or escapes.
 *
 * @param input the string to search
 * @param regex the pattern to match
 * @param replacementFunction May not return null.
 * @throws IllegalArgumentException if {@code replacementFunction} returns null for some match
 */
public static String replaceAll(
    final String input,
    final Pattern regex,
    final Function<MatchResult, String> replacementFunction) {
  // StringBuffer, not StringBuilder: Matcher#appendReplacement only accepts a StringBuilder
  // from Java 9 onwards.
  final StringBuffer output = new StringBuffer();
  final Matcher matcher = regex.matcher(input);
  while (matcher.find()) {
    final MatchResult match = matcher.toMatchResult();
    final String replacement = replacementFunction.apply(match);
    if (replacement == null) {
      throw new IllegalArgumentException(
          String.format("Replacement function returned null for match %s", match.group()));
    }
    // Quote the replacement so appendReplacement treats it as literal text rather than as a
    // template containing $-group references.
    matcher.appendReplacement(output, Matcher.quoteReplacement(replacement));
  }
  matcher.appendTail(output);
  return output.toString();
}
/**
 * Returns the index of the {@code n}-th occurrence of {@code needle} in {@code s}, counting
 * occurrences from 1. If {@code needle} appears fewer than {@code n} times in {@code s}
 * (including not at all), returns -1.
 *
 * @param s The string to search. Cannot be null.
 * @param needle The character to search for.
 * @param n Return the {@code n}-th occurrence. Must be positive.
 */
public static int nthOccurrenceOf(final String s, final char needle, int n) {
  checkNotNull(s);
  checkArgument(n > 0);
  int matchIdx = -1;
  for (int remaining = n; remaining > 0; --remaining) {
    // resume the search just past the previous hit
    matchIdx = s.indexOf(needle, matchIdx + 1);
    if (matchIdx < 0) {
      return -1;
    }
  }
  return matchIdx;
}
/**
 * Reads {@code stringFile} as UTF-8 and delegates to {@link #stringSetFrom(CharSource)}.
 *
 * @throws IOException if the file cannot be read
 */
public static Set<String> stringSetFrom(final File stringFile) throws IOException {
  return stringSetFrom(Files.asCharSource(stringFile, Charsets.UTF_8));
}
public static Set stringSetFrom(final CharSource supplier) throws IOException {
final LineProcessor> callback =
new LineProcessor>() {
private final ImmutableSet.Builder builder = ImmutableSet.builder();
@Override
public boolean processLine(final String s) {
builder.add(s.trim());
return true;
}
@Override
public Set getResult() {
return builder.build();
}
};
supplier.readLines(callback);
return callback.getResult();
}
/** Returns a Function which will join the string with the specified separator */
public static Function, String> joinFunction(final Joiner joiner) {
return new Function, String>() {
@Override
public String apply(final Iterable> list) {
return joiner.join(list);
}
};
}
// Joiner instances are immutable, so each accessor below hands out one shared constant instead
// of allocating a new Joiner per call.
private static final Joiner SPACE_JOINER = Joiner.on(" ");
private static final Joiner COMMA_SPACE_JOINER = Joiner.on(", ");
private static final Joiner OR_JOINER = Joiner.on("|");
private static final Joiner UNIX_NEWLINE_JOINER = Joiner.on("\n");
private static final Joiner COMMA_JOINER = Joiner.on(",");
private static final Joiner DOT_JOINER = Joiner.on(".");

/** A {@link Joiner} which joins on a single space. */
public static Joiner spaceJoiner() {
  return SPACE_JOINER;
}

/** A {@link Joiner} which joins on a comma followed by a space. */
public static Joiner commaSpaceJoiner() {
  return COMMA_SPACE_JOINER;
}

/** A {@link Joiner} which joins on |. Handy for constructing regular expressions. */
public static Joiner pipeJoiner() {
  return OR_JOINER;
}

/** A {@link Joiner} which joins on the Unix newline character {@code \n}. */
public static Joiner unixNewlineJoiner() {
  return UNIX_NEWLINE_JOINER;
}

/** A {@link Joiner} which joins on a comma with no surrounding whitespace. */
public static Joiner commaJoiner() {
  return COMMA_JOINER;
}

/** A {@link Joiner} which joins on a period. */
public static Joiner dotJoiner() {
  return DOT_JOINER;
}
// ************ Splitters ********************
// Splitter instances are immutable, so each accessor below hands out one shared constant
// instead of allocating a new Splitter per call.
private static final Splitter ON_TABS = Splitter.on("\t").trimResults().omitEmptyStrings();
private static final Splitter ON_SPACES = Splitter.on(" ").trimResults().omitEmptyStrings();
private static final Splitter ON_UNIX_NEWLINES =
    Splitter.on("\n").trimResults().omitEmptyStrings();
private static final Splitter ON_COMMAS = Splitter.on(",").trimResults().omitEmptyStrings();
private static final Splitter ON_DOTS = Splitter.on(".").trimResults().omitEmptyStrings();
private static final Splitter ON_DASHES = Splitter.on("-").trimResults().omitEmptyStrings();

/** Splits on tab, omitting empty strings and trimming results. */
public static Splitter onTabs() {
  return ON_TABS;
}

/** Splits on spaces, omitting empty strings and trimming results. */
public static Splitter onSpaces() {
  return ON_SPACES;
}

/** Splits on Unix newlines, omitting empty strings and trimming results. */
public static Splitter onUnixNewlines() {
  return ON_UNIX_NEWLINES;
}

/** Splits on commas, omitting empty strings and trimming results. */
public static Splitter onCommas() {
  return ON_COMMAS;
}

/** Splits on periods, omitting empty strings and trimming results. */
public static Splitter onDots() {
  return ON_DOTS;
}

/** Splits on dashes, omitting empty strings and trimming results. */
public static Splitter onDashes() {
  return ON_DASHES;
}
/**
 * A Guava function for converting strings to lowercase.
 *
 * @param locale the locale whose case-mapping rules are passed to {@link
 *     String#toLowerCase(Locale)}
 * @return a function that lowercases its input; its {@code toString()} names the locale
 */
public static Function<String, String> toLowerCaseFunction(final Locale locale) {
  // An anonymous class rather than a lambda because toString() is overridden.
  return new Function<String, String>() {
    @Override
    public String apply(final String s) {
      return s.toLowerCase(locale);
    }

    @Override
    public String toString() {
      return "toLowercase(" + locale + ")";
    }
  };
}
/**
 * A Guava function for converting strings to uppercase.
 *
 * @param locale the locale whose case-mapping rules are passed to {@link
 *     String#toUpperCase(Locale)}
 * @return a function that uppercases its input; its {@code toString()} names the locale
 */
public static Function<String, String> toUpperCaseFunction(final Locale locale) {
  // An anonymous class rather than a lambda because toString() is overridden.
  return new Function<String, String>() {
    @Override
    public String apply(final String s) {
      return s.toUpperCase(locale);
    }

    @Override
    public String toString() {
      return "toUppercase(" + locale + ")";
    }
  };
}
/**
 * A predicate which returns true for a {@code String} iff at least one of its code points is a
 * letter or digit according to {@link Character#isLetterOrDigit(int)}.
 *
 * <p>The string is examined code point by code point, not {@code char} by {@code char}, so
 * letters and digits outside the Basic Multilingual Plane are recognized too.
 */
public static final Predicate<String> ContainsLetterOrDigit =
    s -> s.codePoints().anyMatch(Character::isLetterOrDigit);
/** Returns a function which prepends {@code prefix} to its input. */
public static final Function<String, String> prefixWithFunction(final String prefix) {
  return s -> prefix + s;
}
/** Returns a function which appends {@code suffix} to its input. */
public static final Function<String, String> suffixWithFunction(final String suffix) {
  return s -> s + suffix;
}
/**
 * Gets a predicate which returns true for a {@code String} iff it starts with {@code prefix}, as
 * determined by {@link String#startsWith(String)}.
 */
public static final Predicate<String> startsWith(final String prefix) {
  return x -> x.startsWith(prefix);
}
/**
 * Strips {@code badSuffix} from the end of {@code name} if {@code name} ends with it; otherwise
 * returns {@code name} unchanged. At most one occurrence of the suffix is removed.
 */
public static final String removeSuffixIfPresent(final String name, final String badSuffix) {
  if (!name.endsWith(badSuffix)) {
    return name;
  }
  final int keptLength = name.length() - badSuffix.length();
  return name.substring(0, keptLength);
}
/**
 * Gets a predicate which returns true for a {@code String} iff it contains {@code probe} as a
 * substring.
 */
public static final Predicate<String> containsPredicate(final String probe) {
  return x -> x.contains(probe);
}
/**
 * A predicate which returns true for a {@code String} iff at least one of its characters matches
 * the provided {@link CharMatcher}.
 *
 * <p>NOTE(review): the input is annotated {@code @Nullable} but is passed straight to {@link
 * CharMatcher#matchesAnyOf(CharSequence)}; presumably a null input fails there. Confirm before
 * relying on null inputs.
 */
public static final Predicate<String> anyCharMatches(final CharMatcher matcher) {
  return new Predicate<String>() {
    @Override
    public boolean apply(@Nullable final String input) {
      return matcher.matchesAnyOf(input);
    }
  };
}
/**
 * A predicate which returns true for a {@code String} iff it is empty. Applying it to {@code
 * null} throws {@link IllegalArgumentException}.
 */
public static final Predicate<String> isEmpty() {
  return input -> {
    checkArgument(input != null);
    return input.isEmpty();
  };
}
/**
 * Just like {@link java.lang.String#indexOf(String, int)}, except it searches for all strings in
 * {@code probes}. If none are found, returns -1. If any are found, returns the earliest index of
 * a match. The current implementation naively searches for each string separately. If speed is
 * important, consider an alternative approach.
 *
 * @param s the string to search
 * @param probes the substrings to look for; may be empty, in which case -1 is returned
 * @param from the index in {@code s} at which every search starts
 */
public static int earliestIndexOfAny(String s, Iterable<String> probes, int from) {
  int earliestIdx = -1;
  for (final String probe : probes) {
    final int probeIdx = s.indexOf(probe, from);
    if (probeIdx < 0) {
      // this probe does not occur at or after 'from'
      continue;
    }
    // first hit so far, or a hit earlier than the best one seen
    if (earliestIdx == -1 || probeIdx < earliestIdx) {
      earliestIdx = probeIdx;
    }
  }
  return earliestIdx;
}
/**
 * Returns the number of Unicode code points in {@code input}, as computed by the shared
 * code-point-counting function.
 */
public static int codepointCount(final String input) {
  final Integer numCodepoints = CodepointCountFunction.INSTANCE.apply(input);
  return numCodepoints;
}
/** Returns a function that computes the number of code points in a string. */
public static Function<String, Integer> codepointCountFunction() {
  return CodepointCountFunction.INSTANCE;
}
/**
 * Extracts the part of {@code s} that begins at Unicode code point offset {@code
 * startIndexInclusive} and stops just before code point offset {@code endIndexExclusive}.
 * Prefer this to {@link String#substring(int)}, which counts UTF-16 code units and therefore
 * fails badly in the presence of non-BMP characters.
 *
 * <p>Beware this takes linear time according to the end index of the substring, rather than the
 * substring's length, like for {@link String#substring(int)}
 */
public static String substringByCodepoints(
    String s, int startIndexInclusive, int endIndexExclusive) {
  // strict mode: out-of-range indices are not clipped
  final boolean lax = false;
  return substringByCodepoints(s, startIndexInclusive, endIndexExclusive, lax);
}
/**
 * Returns the substring of {@code s} indicated by {@code substringBounds}, where the character
 * offsets are interpreted as Unicode code point offsets. You should typically use this instead of
 * {@link String#substring(int)} because the latter will fail badly in the presence of non-BMP
 * characters.
 *
 * <p>Beware this takes linear time according to the end index of the substring, rather than the
 * substring's length, like for {@link String#substring(int)}
 *
 * @param s the string to take the substring of
 * @param substringBounds the code point range to extract; both of its ends are inclusive
 */
public static String substringByCodepoints(String s, OffsetRange<CharOffset> substringBounds) {
  return substringByCodepoints(
      s,
      substringBounds.startInclusive().asInt(),
      // +1 because called method takes exclusive end offset like String#substring
      substringBounds.endInclusive().asInt() + 1);
}
/**
 * Acts just like {@link #substringByCodepoints(String, int, int)} except that if either index is
 * out-of-bounds, it is clipped to the most extreme legal value. This guarantees that as long as
 * {@code s} is non-null and {@code endIndexExclusive>=startIndexInclusive}, no exception will be
 * thrown when calling this method.
 *
 * <p>Because both indices are clipped, a range lying entirely before or entirely after the
 * string yields the empty string.
 *
 * @throws IllegalArgumentException if {@code startIndexInclusive > endIndexExclusive}
 */
public static String laxSubstringByCodepoints(
    String s, int startIndexInclusive, int endIndexExclusive) {
  return substringByCodepoints(s, startIndexInclusive, endIndexExclusive, true);
}

/**
 * Shared implementation of the strict and lax code-point substring methods.
 *
 * @param s the string to take the substring of
 * @param startIndexInclusive code point offset of the first code point to include
 * @param endIndexExclusive code point offset just past the last code point to include
 * @param lax if true, both offsets are clipped into {@code [0, number of code points in s]}
 *     before use; if false, out-of-range offsets cause an exception
 * @throws IllegalArgumentException if {@code startIndexInclusive > endIndexExclusive}, or if
 *     {@code startIndexInclusive} is negative in strict mode
 * @throws IndexOutOfBoundsException in strict mode, if an offset lies beyond the end of {@code s}
 */
private static String substringByCodepoints(
    String s, int startIndexInclusive, int endIndexExclusive, boolean lax) {
  if (startIndexInclusive > endIndexExclusive) {
    throw new IllegalArgumentException(
        String.format(
            "Start index %d is after end index %d", startIndexInclusive, endIndexExclusive));
  }
  if (lax) {
    // Clip both ends into the legal range. Clipping is monotone, so start <= end still holds.
    // Counting the code points is only done in lax mode, which keeps strict mode linear in the
    // end index rather than in the length of the whole string.
    final int numCodePoints = s.codePointCount(0, s.length());
    startIndexInclusive = Math.min(Math.max(startIndexInclusive, 0), numCodePoints);
    endIndexExclusive = Math.min(Math.max(endIndexExclusive, 0), numCodePoints);
  }
  if (startIndexInclusive < 0) {
    throw new IllegalArgumentException(
        String.format("Start index %d is negative", startIndexInclusive));
  }
  // offsetByCodePoints throws IndexOutOfBoundsException when asked to walk past the end of the
  // string, which is the desired strict-mode failure.
  final int startCharIdx = s.offsetByCodePoints(0, startIndexInclusive);
  // Walking exactly (end - start) code points forward lands on the first char after the last
  // included code point, i.e. the exclusive char offset String#substring wants. A zero-length
  // range walks zero code points and so produces the empty string.
  final int endCharIdxExclusive =
      s.offsetByCodePoints(startCharIdx, endIndexExclusive - startIndexInclusive);
  return s.substring(startCharIdx, endCharIdxExclusive);
}
/**
 * Verifies that {@code s} is not the empty string and hands it back, so the call can be used
 * inline. An empty {@code s} results in an {@link java.lang.IllegalArgumentException} carrying
 * {@code msg}.
 */
public static String checkNonEmpty(String s, String msg) {
  final boolean nonEmpty = !s.isEmpty();
  checkArgument(nonEmpty, msg);
  return s;
}
/**
 * Renders a non-negative integer with leading zeros so that it has as many digits as {@code
 * maxValue} does. For example, padding 7 against a maximum of 100 gives {@code "007"}.
 *
 * @throws IllegalArgumentException if {@code numToPad} is negative or exceeds {@code maxValue}
 */
public static String padWithMax(final int numToPad, final int maxValue) {
  checkArgument(numToPad >= 0);
  checkArgument(numToPad <= maxValue);
  final String digits = Integer.toString(numToPad);
  final int targetWidth = Integer.toString(maxValue).length();
  return Strings.repeat("0", targetWidth - digits.length()) + digits;
}
/**
 * Returns the code points contained in a {@link String}, in order. Use {@link
 * #toCodepointStrings(String)} to get the {@link String} representation for each codepoint.
 *
 * @see #toCodepointStrings(String)
 */
public static ImmutableList<Integer> toCodepoints(final String s) {
  final ImmutableList.Builder<Integer> ret = ImmutableList.builder();
  for (int offset = 0; offset < s.length(); ) {
    final int codePoint = s.codePointAt(offset);
    ret.add(codePoint);
    // advance by one or two chars depending on whether this code point is a surrogate pair
    offset += Character.charCount(codePoint);
  }
  return ret.build();
}
/**
 * Returns each code point in a {@link String} converted into a {@link String}. Useful for
 * iterating over {@link String}s in a Unicode-aware fashion. Use {@link #toCodepoints(String)} to
 * get the codepoints themselves.
 *
 * @see #toCodepoints(String)
 */
public static ImmutableList<String> toCodepointStrings(final String s) {
  final ImmutableList.Builder<String> ret = ImmutableList.builder();
  for (int offset = 0; offset < s.length(); ) {
    final int codePoint = s.codePointAt(offset);
    ret.add(new String(Character.toChars(codePoint)));
    // advance by one or two chars depending on whether this code point is a surrogate pair
    offset += Character.charCount(codePoint);
  }
  return ret.build();
}
/**
 * Guava {@link Function} which returns {@link String#length()} of its input, i.e. the number of
 * UTF-16 code units, not code points.
 */
public static Function<String, Integer> lengthFunction() {
  return String::length;
}
/** Guava {@link Function} which runs {@link String#trim()} on all inputs. */
public static Function<String, String> trimFunction() {
  return TrimFunction.INSTANCE;
}
/** Single-instance function which applies {@link String#trim()} to its input. */
private enum TrimFunction implements Function<String, String> {
  INSTANCE;

  @Override
  public String apply(final String input) {
    return input.trim();
  }
}
/** Single-instance function which returns the number of Unicode code points in its input. */
private enum CodepointCountFunction implements Function<String, Integer> {
  INSTANCE;

  @Override
  public Integer apply(final String input) {
    return input.codePointCount(0, input.length());
  }
}
// Matches any single character in the Unicode "mark" category (\p{M}).
private static final Pattern ACCENT_STRIPPER = Pattern.compile("[\\p{M}]");

/**
 * Deletes every Unicode mark from {@code input}. The text is first decomposed with NFD
 * normalization, so the result is NFD-normalized as a side effect.
 */
public static UnicodeFriendlyString stripAccents(final UnicodeFriendlyString input) {
  // Decompose-then-delete-marks technique from
  // http://stackoverflow.com/questions/3322152/is-there-a-way-to-get-rid-of-accents-and-convert-a-whole-string-to-regular-lette
  final String decomposed = Normalizer.normalize(input.utf16CodeUnits(), Normalizer.Form.NFD);
  // replacing each mark with the empty string deletes it
  final String withoutMarks = ACCENT_STRIPPER.matcher(decomposed).replaceAll("");
  return unicodeFriendly(withoutMarks);
}
}