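// edu.isi.nlp.StringUtils (Maven / Gradle / Ivy artifact: common-core-open, newest version).
//
// The following navigation text from the artifact's download page was captured together with this
// listing and is not part of the source: "Go to download", "Show more of this group", "Show more
// artifacts with this name", "Show all versions of common-core-open", "Show documentation".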
package edu.isi.nlp;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.annotations.Beta;
import com.google.common.base.CharMatcher;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Predicate;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.io.CharSource;
import com.google.common.io.Files;
import com.google.common.io.LineProcessor;
import edu.isi.nlp.strings.offsets.CharOffset;
import edu.isi.nlp.strings.offsets.OffsetRange;
import java.io.File;
import java.io.IOException;
import java.text.Normalizer;
import java.util.Locale;
import java.util.Set;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;

@Beta
public final class StringUtils {

  private StringUtils() {
    throw new UnsupportedOperationException();
  }

  /**
   * Makes a string into a {@link UnicodeFriendlyString}. See that interface's Javadoc for more
   * details.
   */
  public static UnicodeFriendlyString unicodeFriendly(String utf16CodeUnits) {
    if (utf16CodeUnits.codePointCount(0, utf16CodeUnits.length()) == utf16CodeUnits.length()) {
      return StringWithoutNonBmp.of(utf16CodeUnits);
    } else {
      return StringWithNonBmp.of(utf16CodeUnits);
    }
  }

  /** Applies {@link #unicodeFriendly(String)} to the contents of a list. */
  public static ImmutableList unicodeFriendlyList(Iterable strings) {
    final ImmutableList.Builder ret = ImmutableList.builder();

    for (final String s : strings) {
      ret.add(unicodeFriendly(s));
    }

    return ret.build();
  }

  /** Applies {@link #unicodeFriendly(String)} to the contents of a set. */
  public static ImmutableSet unicodeFriendlySet(Iterable strings) {
    final ImmutableSet.Builder ret = ImmutableSet.builder();

    for (final String s : strings) {
      ret.add(unicodeFriendly(s));
    }

    return ret.build();
  }

  public static ImmutableSet unicodeFriendlySet(
      String s1, String... strings) {
    final ImmutableSet.Builder ret = ImmutableSet.builder();

    ret.add(unicodeFriendly(s1));
    for (final String s : strings) {
      ret.add(unicodeFriendly(s));
    }

    return ret.build();
  }

  public static String codepointToString(int codePoint) {
    return new String(Character.toChars(codePoint));
  }

  /**
   * Returns a string which is the result of replacing every match of regex in the input string with
   * the results of applying replacementFunction to the matched string. This is a candidate to be
   * moved to a more general utility package.
   *
   * @param replacementFunction May not return null.
   */
  public static String replaceAll(
      final String input,
      final String regex,
      final Function replacementFunction) {
    return replaceAll(input, Pattern.compile(regex), replacementFunction);
  }

  /**
   * Returns a string which is the result of replacing every match of regex in the input string with
   * the results of applying replacementFunction to the matched string. This is a candidate to be
   * moved to a more general utility package.
   *
   * @param replacementFunction May not return null.
   */
  public static String replaceAll(
      final String input,
      final Pattern regex,
      final Function replacementFunction) {
    final StringBuffer output = new StringBuffer();
    final Matcher matcher = regex.matcher(input);
    while (matcher.find()) {
      final MatchResult match = matcher.toMatchResult();
      final String replacement = replacementFunction.apply(match);
      if (replacement == null) {
        throw new IllegalArgumentException(
            String.format("Replacement function returned null for match %s", match.group()));
      }
      if (!replacement.equals(match.group())) {
        matcher.appendReplacement(output, replacement);
      }
    }
    matcher.appendTail(output);
    return output.toString();
  }

  /**
   * * Returns the index of the {@code n}-th occurence of {@code needle} in {@code s}. If {@code
   * needle} does not appear in {@code s}, returns -1.
   *
   * @param s The string to search. Cannot be null.
   * @param needle The character to search for.
   * @param n Return the {@code n}-th occurence
   */
  public static int nthOccurrenceOf(final String s, final char needle, int n) {
    checkNotNull(s);
    checkArgument(n > 0);
    for (int i = 0; i < s.length(); ++i) {
      if (needle == s.charAt(i)) {
        --n;
        if (n == 0) {
          return i;
        }
      }
    }
    return -1;
  }

  public static Set stringSetFrom(final File stringFile) throws IOException {
    return stringSetFrom(Files.asCharSource(stringFile, Charsets.UTF_8));
  }

  public static Set stringSetFrom(final CharSource supplier) throws IOException {
    final LineProcessor> callback =
        new LineProcessor>() {
          private final ImmutableSet.Builder builder = ImmutableSet.builder();

          @Override
          public boolean processLine(final String s) {
            builder.add(s.trim());
            return true;
          }

          @Override
          public Set getResult() {
            return builder.build();
          }
        };

    supplier.readLines(callback);
    return callback.getResult();
  }

  /** Returns a Function which will join the string with the specified separator */
  public static Function, String> joinFunction(final Joiner joiner) {
    return new Function, String>() {
      @Override
      public String apply(final Iterable list) {
        return joiner.join(list);
      }
    };
  }

  public static Joiner spaceJoiner() {
    return Joiner.on(" ");
  }

  public static Joiner commaSpaceJoiner() {
    return Joiner.on(", ");
  }

  private static final Joiner OR_JOINER = Joiner.on("|");

  /** A {@link Joiner} which joins on |. Handy for constructing regular expressions. */
  public static Joiner pipeJoiner() {
    return OR_JOINER;
  }

  public static Joiner unixNewlineJoiner() {
    return Joiner.on("\n");
  }

  public static Joiner commaJoiner() {
    return Joiner.on(",");
  }

  public static Joiner dotJoiner() {
    return Joiner.on(".");
  }

  /** *********** Splitters ******************* */

  /** Splits on tab, omitting empty strings and trimming results. */
  public static Splitter onTabs() {
    return Splitter.on("\t").trimResults().omitEmptyStrings();
  }

  /** Splits on spaces, omitting empty strings and trimming results. */
  public static Splitter onSpaces() {
    return Splitter.on(" ").trimResults().omitEmptyStrings();
  }

  /** Splits on Unix newlines, omitting empty strings and trimming results. */
  public static Splitter onUnixNewlines() {
    return Splitter.on("\n").trimResults().omitEmptyStrings();
  }

  /** Splits on commas, omitting empty strings and trimming results. */
  public static Splitter onCommas() {
    return Splitter.on(",").trimResults().omitEmptyStrings();
  }

  private static final Splitter onDots = Splitter.on(".").trimResults().omitEmptyStrings();

  /** Splits on periods, omitting empty strings and trimming results. */
  public static Splitter onDots() {
    return onDots;
  }

  private static final Splitter onDashes = Splitter.on("-").trimResults().omitEmptyStrings();

  /** Splits on dashes, omitting empty strings and trimming results. */
  public static Splitter onDashes() {
    return onDashes;
  }

  /**
   * A Guava function for converting strings to lowercase.
   *
   * @param locale
   * @return
   */
  public static Function toLowerCaseFunction(final Locale locale) {
    return new Function() {
      @Override
      public String apply(final String s) {
        return s.toLowerCase(locale);
      }

      @Override
      public String toString() {
        return "toLowercase(" + locale + ")";
      }
    };
  }

  public static Function toUpperCaseFunction(final Locale locale) {
    return new Function() {
      @Override
      public String apply(final String s) {
        return s.toUpperCase(locale);
      }

      @Override
      public String toString() {
        return "toUppercase(" + locale + ")";
      }
    };
  }

  public static final Predicate ContainsLetterOrDigit =
      s -> {
        for (int i = 0; i < s.length(); ++i) {
          if (Character.isLetterOrDigit(s.charAt(i))) {
            return true;
          }
        }
        return false;
      };

  public static final Function prefixWithFunction(final String prefix) {
    return new Function() {
      @Override
      public String apply(final String s) {
        return prefix + s;
      }
    };
  }

  public static final Function suffixWithFunction(final String suffix) {
    return new Function() {
      @Override
      public String apply(final String s) {
        return s + suffix;
      }
    };
  }

  public static final Predicate startsWith(final String prefix) {
    return new Predicate() {
      @Override
      public boolean apply(final String x) {
        return x.startsWith(prefix);
      }
    };
  }

  public static final String removeSuffixIfPresent(final String name, final String badSuffix) {
    if (name.endsWith(badSuffix)) {
      return name.substring(0, name.length() - badSuffix.length());
    } else {
      return name;
    }
  }

  /**
   * Gets a predicate which returns true for a {@code String} iff it contains {@code probe} as a
   * substring.
   */
  @SuppressWarnings("unchecked")
  public static final Predicate containsPredicate(final String probe) {
    return x -> x.contains(probe);
  }

  /**
   * A predicate which returns true for a {@code String} iff at least one of its characters matches
   * the provided {@link CharMatcher}
   */
  public static final Predicate anyCharMatches(final CharMatcher matcher) {
    return new Predicate() {
      @Override
      public boolean apply(@Nullable final String input) {
        return matcher.matchesAnyOf(input);
      }
    };
  }

  public static final Predicate isEmpty() {
    return new Predicate() {
      @Override
      public boolean apply(final String input) {
        checkArgument(input != null);
        return input.isEmpty();
      }
    };
  }

  /**
   * Just like {@link java.lang.String#indexOf(String, int)}, except it searches for all strings in
   * {@code probes}. If none are found, returns -1. If any are found, returns the earliest index of
   * a match. The current implementation naively searches for each string separately. If speed is
   * important, consider an alternative approach.
   */
  public static int earliestIndexOfAny(String s, Iterable probes, int from) {
    int earliestIdx = -1;

    for (final String probe : probes) {
      final int probeIdx = s.indexOf(probe, from);
      // if we found something for this probe
      if (probeIdx >= 0
          // and either we haven't found anything else yet or
          // this is earlier than anything we've found yet
          && (earliestIdx == -1 || probeIdx < earliestIdx)) {
        // then this is our new earliest match
        earliestIdx = probeIdx;
      }
    }

    return earliestIdx;
  }

  /** Returns the number of codepoints in a string. */
  public static int codepointCount(final String input) {
    return CodepointCountFunction.INSTANCE.apply(input);
  }

  /** Returns a function that computes the number of code points in a string. */
  public static Function codepointCountFunction() {
    return CodepointCountFunction.INSTANCE;
  }

  /**
   * Returns the substring of {@code s} which starts at the Unicode codepoint offset at {@code
   * startIndexInclusive} and ends before the Unicode codepoint offset at {@code endIndexExclusive}.
   * You should typically use this instead of {@link String#substring(int)} because the latter will
   * fail badly in the presence of non-BMP characters.
   *
   * Beware this takes linear time according to the end index of the substring, rather than the
   * substring's length, like for {@link String#substring(int)}
   */
  public static String substringByCodepoints(
      String s, int startIndexInclusive, int endIndexExclusive) {
    return substringByCodepoints(s, startIndexInclusive, endIndexExclusive, false);
  }

  /**
   * Returns the substring of {@code s} indicated by {@code substringBounds}, where the character
   * offsets are interpreted as Unicode code point offsets. You should typically use this instead of
   * {@link String#substring(int)} because the latter will fail badly in the presence of non-BMP
   * characters.
   *
   * Beware this takes linear time according to the end index of the substring, rather than the
   * substring's length, like for {@link String#substring(int)}
   */
  public static String substringByCodepoints(String s, OffsetRange substringBounds) {
    return substringByCodepoints(
        s,
        substringBounds.startInclusive().asInt(),
        // +1 because called method takes exclusive end offset like String#substring
        substringBounds.endInclusive().asInt() + 1);
  }

  /**
   * Acts just like {@link #laxSubstringByCodepoints(String, int, int)} except that if either index
   * is out-of-bounds, it is clipped to the most extreme legal value. This guarantees that as long
   * as {@code s} is non-null and {@code endIndexExclusive>=startIndexInclusive}, no exception will
   * be thrown when calling this method.
   */
  public static String laxSubstringByCodepoints(
      String s, int startIndexInclusive, int endIndexExclusive) {
    return substringByCodepoints(s, startIndexInclusive, endIndexExclusive, true);
  }

  private static String substringByCodepoints(
      String s, int startIndexInclusive, int endIndexExclusive, boolean lax) {
    checkArgument(startIndexInclusive <= endIndexExclusive);
    if (lax) {
      startIndexInclusive = Math.max(startIndexInclusive, 0);
    }
    checkArgument(startIndexInclusive >= 0);

    final int startCharIdx = s.offsetByCodePoints(0, startIndexInclusive);

    final int substringCodePointLength = endIndexExclusive - startIndexInclusive;
    final int endCharIdxInclusive;
    try {
      endCharIdxInclusive = s.offsetByCodePoints(startCharIdx, substringCodePointLength - 1);
    } catch (IndexOutOfBoundsException ibe) {
      if (lax) {
        // handle clipping at the end in lax mode
        return s.substring(startCharIdx, s.length());
      } else {
        throw ibe;
      }
    }

    // we want an exclusive character offset for toString below, so we need to go one more
    // codepoint.
    // However, that may be one or two characters depending on exactly what the last code point is
    final int lastCodePoint = s.codePointAt(endCharIdxInclusive);
    int endCharIdxExclusive = endCharIdxInclusive + Character.charCount(lastCodePoint);

    if (lax && endCharIdxExclusive > s.length()) {
      // if we are requested to substring "safely", clip the substring to the end of the string
      endCharIdxExclusive = s.length();
    }
    return s.substring(startCharIdx, endCharIdxExclusive);
  }

  /**
   * Checks that the supplied string is non-empty. If it is empty, an {@link
   * java.lang.IllegalArgumentException} is thrown with the supplied message.
   */
  public static String checkNonEmpty(String s, String msg) {
    checkArgument(!s.isEmpty(), msg);
    return s;
  }

  /**
   * Produces a string representation of a positive integer padded with leading zeros. Enough zeros
   * are adding so that the supplied {@code maxValue} would have the same number of digits.
   */
  public static String padWithMax(final int numToPad, final int maxValue) {
    checkArgument(numToPad >= 0);
    checkArgument(numToPad <= maxValue);
    final int maxLength = Integer.toString(maxValue).length();
    final String baseString = Integer.toString(numToPad);
    final String padding = Strings.repeat("0", maxLength - baseString.length());
    return padding + numToPad;
  }

  /**
   * Returns the code points contained in a {@link String}. Use {@link #toCodepointStrings(String)}
   * to get the {@link String} representation for each codepoint.
   *
   * @see #toCodepoints(String)
   */
  public static ImmutableList toCodepoints(final String s) {
    final ImmutableList.Builder ret = ImmutableList.builder();
    for (int offset = 0; offset < s.length(); ) {
      final int codePoint = s.codePointAt(offset);
      ret.add(codePoint);
      offset += Character.charCount(codePoint);
    }
    return ret.build();
  }

  /**
   * Returns each code point in a {@link String} converted into a {@link String}. Useful for
   * iterating over {@link String}s in a Unicode-aware fashion. Use {@link #toCodepoints(String)} to
   * get the codepoints themselves.
   *
   * @see #toCodepoints(String)
   */
  public static ImmutableList toCodepointStrings(final String s) {
    final ImmutableList.Builder ret = ImmutableList.builder();
    for (int codePoint : toCodepoints(s)) {
      ret.add(new String(Character.toChars(codePoint)));
    }
    return ret.build();
  }

  @SuppressWarnings("deprecation")
  public static Function lengthFunction() {
    return x -> x.length();
  }

  /** Guava {@link Function} which runs {@link String#trim()} on all inputs. */
  public static Function trimFunction() {
    return TrimFunction.INSTANCE;
  }

  private enum TrimFunction implements Function {
    INSTANCE;

    @Override
    public String apply(final String input) {
      return input.trim();
    }
  }

  private enum CodepointCountFunction implements Function {
    INSTANCE;

    @Override
    public Integer apply(final String input) {
      return input.codePointCount(0, input.length());
    }
  }

  // \p{M} means all Unicode "marks"
  private static final Pattern ACCENT_STRIPPER = Pattern.compile("[\\p{M}]");

  /** Removes all Unicode marks from a string. As a side effect, applies NFD normalization. */
  public static UnicodeFriendlyString stripAccents(final UnicodeFriendlyString input) {
    // this nifty normalization courtesy of
    // http://stackoverflow.com/questions/3322152/is-there-a-way-to-get-rid-of-accents-and-convert-a-whole-string-to-regular-lette
    return StringUtils.unicodeFriendly(
        ACCENT_STRIPPER
            .matcher(Normalizer.normalize(input.utf16CodeUnits(), Normalizer.Form.NFD))
            // note this replaceAll is really deleteAll
            .replaceAll(""));
  }
}