All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.isi.nlp.StringUtils Maven / Gradle / Ivy

The newest version!
package edu.isi.nlp;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.annotations.Beta;
import com.google.common.base.CharMatcher;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Predicate;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.io.CharSource;
import com.google.common.io.Files;
import com.google.common.io.LineProcessor;
import edu.isi.nlp.strings.offsets.CharOffset;
import edu.isi.nlp.strings.offsets.OffsetRange;
import java.io.File;
import java.io.IOException;
import java.text.Normalizer;
import java.util.Locale;
import java.util.Set;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;

@Beta
public final class StringUtils {

  private StringUtils() {
    throw new UnsupportedOperationException();
  }

  /**
   * Makes a string into a {@link UnicodeFriendlyString}. See that interface's Javadoc for more
   * details.
   */
  public static UnicodeFriendlyString unicodeFriendly(String utf16CodeUnits) {
    if (utf16CodeUnits.codePointCount(0, utf16CodeUnits.length()) == utf16CodeUnits.length()) {
      return StringWithoutNonBmp.of(utf16CodeUnits);
    } else {
      return StringWithNonBmp.of(utf16CodeUnits);
    }
  }

  /** Applies {@link #unicodeFriendly(String)} to the contents of a list. */
  public static ImmutableList unicodeFriendlyList(Iterable strings) {
    final ImmutableList.Builder ret = ImmutableList.builder();

    for (final String s : strings) {
      ret.add(unicodeFriendly(s));
    }

    return ret.build();
  }

  /** Applies {@link #unicodeFriendly(String)} to the contents of a set. */
  public static ImmutableSet unicodeFriendlySet(Iterable strings) {
    final ImmutableSet.Builder ret = ImmutableSet.builder();

    for (final String s : strings) {
      ret.add(unicodeFriendly(s));
    }

    return ret.build();
  }

  public static ImmutableSet unicodeFriendlySet(
      String s1, String... strings) {
    final ImmutableSet.Builder ret = ImmutableSet.builder();

    ret.add(unicodeFriendly(s1));
    for (final String s : strings) {
      ret.add(unicodeFriendly(s));
    }

    return ret.build();
  }

  public static String codepointToString(int codePoint) {
    return new String(Character.toChars(codePoint));
  }

  /**
   * Returns a string which is the result of replacing every match of regex in the input string with
   * the results of applying replacementFunction to the matched string. This is a candidate to be
   * moved to a more general utility package.
   *
   * @param replacementFunction May not return null.
   */
  public static String replaceAll(
      final String input,
      final String regex,
      final Function replacementFunction) {
    return replaceAll(input, Pattern.compile(regex), replacementFunction);
  }

  /**
   * Returns a string which is the result of replacing every match of regex in the input string with
   * the results of applying replacementFunction to the matched string. This is a candidate to be
   * moved to a more general utility package.
   *
   * @param replacementFunction May not return null.
   */
  public static String replaceAll(
      final String input,
      final Pattern regex,
      final Function replacementFunction) {
    final StringBuffer output = new StringBuffer();
    final Matcher matcher = regex.matcher(input);
    while (matcher.find()) {
      final MatchResult match = matcher.toMatchResult();
      final String replacement = replacementFunction.apply(match);
      if (replacement == null) {
        throw new IllegalArgumentException(
            String.format("Replacement function returned null for match %s", match.group()));
      }
      if (!replacement.equals(match.group())) {
        matcher.appendReplacement(output, replacement);
      }
    }
    matcher.appendTail(output);
    return output.toString();
  }

  /**
   * * Returns the index of the {@code n}-th occurence of {@code needle} in {@code s}. If {@code
   * needle} does not appear in {@code s}, returns -1.
   *
   * @param s The string to search. Cannot be null.
   * @param needle The character to search for.
   * @param n Return the {@code n}-th occurence
   */
  public static int nthOccurrenceOf(final String s, final char needle, int n) {
    checkNotNull(s);
    checkArgument(n > 0);
    for (int i = 0; i < s.length(); ++i) {
      if (needle == s.charAt(i)) {
        --n;
        if (n == 0) {
          return i;
        }
      }
    }
    return -1;
  }

  public static Set stringSetFrom(final File stringFile) throws IOException {
    return stringSetFrom(Files.asCharSource(stringFile, Charsets.UTF_8));
  }

  public static Set stringSetFrom(final CharSource supplier) throws IOException {
    final LineProcessor> callback =
        new LineProcessor>() {
          private final ImmutableSet.Builder builder = ImmutableSet.builder();

          @Override
          public boolean processLine(final String s) {
            builder.add(s.trim());
            return true;
          }

          @Override
          public Set getResult() {
            return builder.build();
          }
        };

    supplier.readLines(callback);
    return callback.getResult();
  }

  /** Returns a Function which will join the string with the specified separator */
  public static Function, String> joinFunction(final Joiner joiner) {
    return new Function, String>() {
      @Override
      public String apply(final Iterable list) {
        return joiner.join(list);
      }
    };
  }

  public static Joiner spaceJoiner() {
    return Joiner.on(" ");
  }

  public static Joiner commaSpaceJoiner() {
    return Joiner.on(", ");
  }

  private static final Joiner OR_JOINER = Joiner.on("|");

  /** A {@link Joiner} which joins on |. Handy for constructing regular expressions. */
  public static Joiner pipeJoiner() {
    return OR_JOINER;
  }

  public static Joiner unixNewlineJoiner() {
    return Joiner.on("\n");
  }

  public static Joiner commaJoiner() {
    return Joiner.on(",");
  }

  public static Joiner dotJoiner() {
    return Joiner.on(".");
  }

  /** *********** Splitters ******************* */

  /** Splits on tab, omitting empty strings and trimming results. */
  public static Splitter onTabs() {
    return Splitter.on("\t").trimResults().omitEmptyStrings();
  }

  /** Splits on spaces, omitting empty strings and trimming results. */
  public static Splitter onSpaces() {
    return Splitter.on(" ").trimResults().omitEmptyStrings();
  }

  /** Splits on Unix newlines, omitting empty strings and trimming results. */
  public static Splitter onUnixNewlines() {
    return Splitter.on("\n").trimResults().omitEmptyStrings();
  }

  /** Splits on commas, omitting empty strings and trimming results. */
  public static Splitter onCommas() {
    return Splitter.on(",").trimResults().omitEmptyStrings();
  }

  private static final Splitter onDots = Splitter.on(".").trimResults().omitEmptyStrings();

  /** Splits on periods, omitting empty strings and trimming results. */
  public static Splitter onDots() {
    return onDots;
  }

  private static final Splitter onDashes = Splitter.on("-").trimResults().omitEmptyStrings();

  /** Splits on dashes, omitting empty strings and trimming results. */
  public static Splitter onDashes() {
    return onDashes;
  }

  /**
   * A Guava function for converting strings to lowercase.
   *
   * @param locale
   * @return
   */
  public static Function toLowerCaseFunction(final Locale locale) {
    return new Function() {
      @Override
      public String apply(final String s) {
        return s.toLowerCase(locale);
      }

      @Override
      public String toString() {
        return "toLowercase(" + locale + ")";
      }
    };
  }

  public static Function toUpperCaseFunction(final Locale locale) {
    return new Function() {
      @Override
      public String apply(final String s) {
        return s.toUpperCase(locale);
      }

      @Override
      public String toString() {
        return "toUppercase(" + locale + ")";
      }
    };
  }

  public static final Predicate ContainsLetterOrDigit =
      s -> {
        for (int i = 0; i < s.length(); ++i) {
          if (Character.isLetterOrDigit(s.charAt(i))) {
            return true;
          }
        }
        return false;
      };

  public static final Function prefixWithFunction(final String prefix) {
    return new Function() {
      @Override
      public String apply(final String s) {
        return prefix + s;
      }
    };
  }

  public static final Function suffixWithFunction(final String suffix) {
    return new Function() {
      @Override
      public String apply(final String s) {
        return s + suffix;
      }
    };
  }

  public static final Predicate startsWith(final String prefix) {
    return new Predicate() {
      @Override
      public boolean apply(final String x) {
        return x.startsWith(prefix);
      }
    };
  }

  public static final String removeSuffixIfPresent(final String name, final String badSuffix) {
    if (name.endsWith(badSuffix)) {
      return name.substring(0, name.length() - badSuffix.length());
    } else {
      return name;
    }
  }

  /**
   * Gets a predicate which returns true for a {@code String} iff it contains {@code probe} as a
   * substring.
   */
  @SuppressWarnings("unchecked")
  public static final Predicate containsPredicate(final String probe) {
    return x -> x.contains(probe);
  }

  /**
   * A predicate which returns true for a {@code String} iff at least one of its characters matches
   * the provided {@link CharMatcher}
   */
  public static final Predicate anyCharMatches(final CharMatcher matcher) {
    return new Predicate() {
      @Override
      public boolean apply(@Nullable final String input) {
        return matcher.matchesAnyOf(input);
      }
    };
  }

  public static final Predicate isEmpty() {
    return new Predicate() {
      @Override
      public boolean apply(final String input) {
        checkArgument(input != null);
        return input.isEmpty();
      }
    };
  }

  /**
   * Just like {@link java.lang.String#indexOf(String, int)}, except it searches for all strings in
   * {@code probes}. If none are found, returns -1. If any are found, returns the earliest index of
   * a match. The current implementation naively searches for each string separately. If speed is
   * important, consider an alternative approach.
   */
  public static int earliestIndexOfAny(String s, Iterable probes, int from) {
    int earliestIdx = -1;

    for (final String probe : probes) {
      final int probeIdx = s.indexOf(probe, from);
      // if we found something for this probe
      if (probeIdx >= 0
          // and either we haven't found anything else yet or
          // this is earlier than anything we've found yet
          && (earliestIdx == -1 || probeIdx < earliestIdx)) {
        // then this is our new earliest match
        earliestIdx = probeIdx;
      }
    }

    return earliestIdx;
  }

  /** Returns the number of codepoints in a string. */
  public static int codepointCount(final String input) {
    return CodepointCountFunction.INSTANCE.apply(input);
  }

  /** Returns a function that computes the number of code points in a string. */
  public static Function codepointCountFunction() {
    return CodepointCountFunction.INSTANCE;
  }

  /**
   * Returns the substring of {@code s} which starts at the Unicode codepoint offset at {@code
   * startIndexInclusive} and ends before the Unicode codepoint offset at {@code endIndexExclusive}.
   * You should typically use this instead of {@link String#substring(int)} because the latter will
   * fail badly in the presence of non-BMP characters.
   *
   * 

Beware this takes linear time according to the end index of the substring, rather than the * substring's length, like for {@link String#substring(int)} */ public static String substringByCodepoints( String s, int startIndexInclusive, int endIndexExclusive) { return substringByCodepoints(s, startIndexInclusive, endIndexExclusive, false); } /** * Returns the substring of {@code s} indicated by {@code substringBounds}, where the character * offsets are interpreted as Unicode code point offsets. You should typically use this instead of * {@link String#substring(int)} because the latter will fail badly in the presence of non-BMP * characters. * *

Beware this takes linear time according to the end index of the substring, rather than the * substring's length, like for {@link String#substring(int)} */ public static String substringByCodepoints(String s, OffsetRange substringBounds) { return substringByCodepoints( s, substringBounds.startInclusive().asInt(), // +1 because called method takes exclusive end offset like String#substring substringBounds.endInclusive().asInt() + 1); } /** * Acts just like {@link #laxSubstringByCodepoints(String, int, int)} except that if either index * is out-of-bounds, it is clipped to the most extreme legal value. This guarantees that as long * as {@code s} is non-null and {@code endIndexExclusive>=startIndexInclusive}, no exception will * be thrown when calling this method. */ public static String laxSubstringByCodepoints( String s, int startIndexInclusive, int endIndexExclusive) { return substringByCodepoints(s, startIndexInclusive, endIndexExclusive, true); } private static String substringByCodepoints( String s, int startIndexInclusive, int endIndexExclusive, boolean lax) { checkArgument(startIndexInclusive <= endIndexExclusive); if (lax) { startIndexInclusive = Math.max(startIndexInclusive, 0); } checkArgument(startIndexInclusive >= 0); final int startCharIdx = s.offsetByCodePoints(0, startIndexInclusive); final int substringCodePointLength = endIndexExclusive - startIndexInclusive; final int endCharIdxInclusive; try { endCharIdxInclusive = s.offsetByCodePoints(startCharIdx, substringCodePointLength - 1); } catch (IndexOutOfBoundsException ibe) { if (lax) { // handle clipping at the end in lax mode return s.substring(startCharIdx, s.length()); } else { throw ibe; } } // we want an exclusive character offset for toString below, so we need to go one more // codepoint. // However, that may be one or two characters depending on exactly what the last code point is final int lastCodePoint = s.codePointAt(endCharIdxInclusive); int endCharIdxExclusive = endCharIdxInclusive + Character.charCount(lastCodePoint); if (lax && endCharIdxExclusive > s.length()) { // if we are requested to substring "safely", clip the substring to the end of the string endCharIdxExclusive = s.length(); } return s.substring(startCharIdx, endCharIdxExclusive); } /** * Checks that the supplied string is non-empty. If it is empty, an {@link * java.lang.IllegalArgumentException} is thrown with the supplied message. */ public static String checkNonEmpty(String s, String msg) { checkArgument(!s.isEmpty(), msg); return s; } /** * Produces a string representation of a positive integer padded with leading zeros. Enough zeros * are adding so that the supplied {@code maxValue} would have the same number of digits. */ public static String padWithMax(final int numToPad, final int maxValue) { checkArgument(numToPad >= 0); checkArgument(numToPad <= maxValue); final int maxLength = Integer.toString(maxValue).length(); final String baseString = Integer.toString(numToPad); final String padding = Strings.repeat("0", maxLength - baseString.length()); return padding + numToPad; } /** * Returns the code points contained in a {@link String}. Use {@link #toCodepointStrings(String)} * to get the {@link String} representation for each codepoint. * * @see #toCodepoints(String) */ public static ImmutableList toCodepoints(final String s) { final ImmutableList.Builder ret = ImmutableList.builder(); for (int offset = 0; offset < s.length(); ) { final int codePoint = s.codePointAt(offset); ret.add(codePoint); offset += Character.charCount(codePoint); } return ret.build(); } /** * Returns each code point in a {@link String} converted into a {@link String}. Useful for * iterating over {@link String}s in a Unicode-aware fashion. Use {@link #toCodepoints(String)} to * get the codepoints themselves. * * @see #toCodepoints(String) */ public static ImmutableList toCodepointStrings(final String s) { final ImmutableList.Builder ret = ImmutableList.builder(); for (int codePoint : toCodepoints(s)) { ret.add(new String(Character.toChars(codePoint))); } return ret.build(); } @SuppressWarnings("deprecation") public static Function lengthFunction() { return x -> x.length(); } /** Guava {@link Function} which runs {@link String#trim()} on all inputs. */ public static Function trimFunction() { return TrimFunction.INSTANCE; } private enum TrimFunction implements Function { INSTANCE; @Override public String apply(final String input) { return input.trim(); } } private enum CodepointCountFunction implements Function { INSTANCE; @Override public Integer apply(final String input) { return input.codePointCount(0, input.length()); } } // \p{M} means all Unicode "marks" private static final Pattern ACCENT_STRIPPER = Pattern.compile("[\\p{M}]"); /** Removes all Unicode marks from a string. As a side effect, applies NFD normalization. */ public static UnicodeFriendlyString stripAccents(final UnicodeFriendlyString input) { // this nifty normalization courtesy of // http://stackoverflow.com/questions/3322152/is-there-a-way-to-get-rid-of-accents-and-convert-a-whole-string-to-regular-lette return StringUtils.unicodeFriendly( ACCENT_STRIPPER .matcher(Normalizer.normalize(input.utf16CodeUnits(), Normalizer.Form.NFD)) // note this replaceAll is really deleteAll .replaceAll("")); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy