org.languagetool.tools.StringTools Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of languagetool-core Show documentation
LanguageTool is Open Source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect, such as mixing up there/their, and it also detects some grammar problems.
There is a newer version: 6.5
Show newest version
/* LanguageTool, a natural language style checker 
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.tools;

import com.google.common.xml.XmlEscapers;
import org.jetbrains.annotations.Contract;
import org.jetbrains.annotations.Nullable;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Tools for working with strings.
 * 
 * @author Daniel Naber
 */
public final class StringTools {

  /**
   * Constants selecting how rule matches are printed as XML/JSON: either as a
   * complete document on every call, or as the start, the continuation or the
   * end of a document.
   */
  public enum ApiPrintMode {
    /**
     * Normally output the rule matches by starting and
     * ending the XML/JSON output on every call.
     */
    NORMAL_API,
    /**
     * Start XML/JSON output by printing the preamble and the
     * start of the root element.
     */
    START_API,
    /**
     * End XML/JSON output by closing the root element.
     */
    END_API,
    /**
     * Simply continue rule match output, printing neither the
     * preamble nor the closing of the root element.
     */
    CONTINUE_API
  }

  // Matches one complete XML comment; reluctant so that adjacent comments are matched
  // separately, DOTALL so that a comment may span several lines.
  private static final Pattern XML_COMMENT_PATTERN = Pattern.compile("<!--.*?-->", Pattern.DOTALL);
  // Matches one XML tag whose opening '<' is not directly preceded by another '<'.
  private static final Pattern XML_PATTERN = Pattern.compile("(?<!<)<[^<>]+>", Pattern.DOTALL);
  /** The 24 uppercase letters of the Greek alphabet, each as a one-character string (unmodifiable). */
  public static final Set<String> UPPERCASE_GREEK_LETTERS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList("Α","Β","Γ","Δ","Ε","Ζ","Η","Θ","Ι","Κ","Λ","Μ","Ν","Ξ","Ο","Π","Ρ","Σ","Τ","Υ","Φ","Χ","Ψ","Ω")));
  /** The 24 lowercase letters of the Greek alphabet, each as a one-character string (unmodifiable). */
  public static final Set<String> LOWERCASE_GREEK_LETTERS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList("α","β","γ","δ","ε","ζ","η","θ","ι","κ","λ","μ","ν","ξ","ο","π","ρ","σ","τ","υ","φ","χ","ψ","ω")));
  // Matches a single Unicode punctuation character or an apostrophe.
  private static final Pattern PUNCTUATION_PATTERN = Pattern.compile("[\\p{IsPunctuation}']", Pattern.DOTALL);
  // Matches a single character that is not a letter.
  private static final Pattern NOT_WORD_CHARACTER = Pattern.compile("[^\\p{L}]", Pattern.DOTALL);

  // Matches a non-empty run of characters none of which is a letter.
  private static final Pattern NOT_WORD_STR = Pattern.compile("[^\\p{L}]+", Pattern.DOTALL);

  /** Private so that this final class, which only has static members, cannot be instantiated. */
  private StringTools() {
    // only static stuff
  }

  /**
   * Throws an exception if the given string is {@code null}, empty or consists of whitespace only.
   *
   * @param s the value to validate
   * @param varName name of the validated variable, used in the exception messages
   * @throws NullPointerException if {@code varName} or {@code s} is {@code null}
   * @throws IllegalArgumentException if {@code s} is empty or whitespace only
   */
  public static void assureSet(String s, String varName) {
    Objects.requireNonNull(varName);
    // Report a null value with a message naming the variable, rather than
    // through an anonymous NullPointerException raised by s.trim().
    Objects.requireNonNull(s, varName + " cannot be null");
    if (s.trim().isEmpty()) {
      throw new IllegalArgumentException(varName + " cannot be empty or whitespace only");
    }
  }

  /**
   * Reads the complete stream as text, decoding it with the given encoding.
   * The stream is closed once reading has finished or failed.
   *
   * @param stream the stream to be read
   * @param encoding the stream's character encoding, e.g. {@code utf-8}, or {@code null} to use the system encoding
   * @return the stream's content with every line terminated by {@code \n} (so {@code \n} is
   *  appended to the last line even if the stream does not end with a line break)
   * @throws IOException if the encoding is not supported or reading fails
   * @since 2.3
   */
  public static String readStream(InputStream stream, String encoding) throws IOException {
    InputStreamReader decoder = encoding == null
        ? new InputStreamReader(stream)
        : new InputStreamReader(stream, encoding);
    StringBuilder text = new StringBuilder();
    // closing the buffered reader also closes the decoder and the stream below it
    try (BufferedReader lineReader = new BufferedReader(decoder)) {
      for (String line = lineReader.readLine(); line != null; line = lineReader.readLine()) {
        text.append(line).append('\n');
      }
    }
    return text.toString();
  }

  /**
   * Checks that the string contains no lowercase letter. Characters without an
   * upper-/lowercase distinction (digits, punctuation, ...) are ignored, so a
   * string without any letters, including the empty string, counts as all-uppercase.
   *
   * @param str the string to check
   * @return {@code false} as soon as a lowercase letter is found, {@code true} otherwise
   */
  public static boolean isAllUppercase(String str) {
    for (char ch : str.toCharArray()) {
      boolean lowercaseLetter = Character.isLetter(ch) && Character.isLowerCase(ch);
      if (lowercaseLetter) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns true if the given list of strings is made up of all-uppercase words.
   * A list that contains only letter-free strings (numbers, punctuation marks)
   * is not considered all-uppercase, and neither is an empty list.
   *
   * @param strList the strings to check
   * @return true if no element contains a lowercase letter and at least one
   *  element is neither a letter-free string nor a punctuation mark
   */
  public static boolean isAllUppercase(List<String> strList) {
    boolean isInputAllUppercase = true;
    boolean isAllNotLetters = true;
    for (String str : strList) {
      isInputAllUppercase = isInputAllUppercase && StringTools.isAllUppercase(str);
      isAllNotLetters = isAllNotLetters
        && (StringTools.isNotWordString(str) || StringTools.isPunctuationMark(str));
    }
    return isInputAllUppercase && !isAllNotLetters;
  }

  /**
   * Returns true if the given string is mixed case, like {@code MixedCase} or {@code mixedCase}
   * (but not {@code Mixedcase}): it is neither all-uppercase, nor a capitalized
   * word, nor all-lowercase.
   *
   * @param str input str
   * @return true if {@code str} is mixed case
   */
  public static boolean isMixedCase(String str) {
    if (isAllUppercase(str)) {
      return false;
    }
    if (isCapitalizedWord(str)) {
      return false;
    }
    return isNotAllLowercase(str);
  }

  /**
   * Checks whether the string contains at least one letter that is not lowercase.
   * Characters that are not letters are ignored, so a string without letters
   * (including the empty string) yields {@code false}.
   *
   * @param str the string to check
   * @return true if a non-lowercase letter is found
   * @since 2.5
   */
  public static boolean isNotAllLowercase(String str) {
    int idx = 0;
    while (idx < str.length()) {
      char ch = str.charAt(idx);
      idx++;
      if (Character.isLetter(ch) && !Character.isLowerCase(ch)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Checks for the capitalization pattern of a word such as {@code Word}.
   *
   * @param str input string, may be {@code null}
   * @return true if the first character is uppercase and every later letter is lowercase
   *  (non-letters after the first character are ignored); false for {@code null} and
   *  the empty string
   */
  @Contract("null -> false")
  public static boolean isCapitalizedWord(@Nullable String str) {
    if (isEmpty(str) || !Character.isUpperCase(str.charAt(0))) {
      return false;
    }
    for (char ch : str.substring(1).toCharArray()) {
      boolean nonLowercaseLetter = Character.isLetter(ch) && !Character.isLowerCase(ch);
      if (nonLowercaseLetter) {
        return false;
      }
    }
    return true;
  }

  /**
   * Whether the first character of str is an uppercase character.
   *
   * @param str the string to check, may be {@code null}
   * @return false for {@code null} and the empty string
   */
  public static boolean startsWithUppercase(String str) {
    return !isEmpty(str) && Character.isUpperCase(str.charAt(0));
  }

  /**
   * Whether the first character of str is a lowercase character.
   *
   * @param str the string to check, may be {@code null}
   * @return false for {@code null} and the empty string
   * @since 4.9
   */
  public static boolean startsWithLowercase(String str) {
    return !isEmpty(str) && Character.isLowerCase(str.charAt(0));
  }

  /**
   * Return str modified so that its first character is now an
   * uppercase character. If str starts with characters that are neither
   * letters nor digits, such as quotes or parentheses, the first letter
   * or digit is the character that gets changed.
   *
   * @param str the string to change, may be {@code null}
   * @return the changed string, or {@code str} itself if it is {@code null} or empty
   */
  @Contract("!null -> !null")
  @Nullable
  public static String uppercaseFirstChar(@Nullable String str) {
    return changeFirstCharCase(str, true);
  }

  /**
   * Like {@link #uppercaseFirstChar(String)}, but handles a special case for Dutch (IJ in
   * e.g. {@code ijsselmeer} becomes {@code IJsselmeer}).
   *
   * @param str the string to change, may be {@code null}
   * @param language the language, will be ignored if it's {@code null}
   * @return the changed string, or {@code str} itself if it is {@code null} or empty
   * @since 2.7
   */
  @Contract("!null, _ -> !null")
  @Nullable
  public static String uppercaseFirstChar(@Nullable String str, Language language) {
    boolean isDutch = language != null && "nl".equals(language.getShortCode());
    // Locale.ROOT keeps the prefix check independent of the JVM's default locale:
    // with a Turkish default locale "IJ" would lowercase to a dotless i plus "j"
    // and the Dutch special case would silently be skipped.
    if (isDutch && str != null && str.toLowerCase(Locale.ROOT).startsWith("ij")) {
      // hack to fix https://github.com/languagetool-org/languagetool/issues/148
      return "IJ" + str.substring(2);
    }
    return changeFirstCharCase(str, true);
  }

  /**
   * Return str modified so that its first character is now a
   * lowercase character. If str starts with characters that are neither
   * letters nor digits, such as quotes or parentheses, the first letter
   * or digit is the character that gets changed.
   *
   * @param str the string to change, may be {@code null}
   * @return the changed string, or {@code str} itself if it is {@code null} or empty
   */
  @Contract("!null -> !null")
  @Nullable
  public static String lowercaseFirstChar(@Nullable String str) {
    return changeFirstCharCase(str, false);
  }

  /**
   * Lowercases the first character of str, but only if str is a capitalized
   * word according to {@link #isCapitalizedWord(String)}. Any other input
   * (all-uppercase, mixed case, lowercase, empty or {@code null}) is returned
   * unchanged.
   *
   * @param str the string to change, may be {@code null}
   * @return str with its first character lowercased if it was capitalized, otherwise str itself
   */
  @Contract("!null -> !null")
  @Nullable
  public static String lowercaseFirstCharIfCapitalized(@Nullable String str) {
    if (!isCapitalizedWord(str)) {
      return str;
    }
    return changeFirstCharCase(str, false);
  }

  /**
   * Return str modified so that its first character is now a
   * lowercase or uppercase character, depending on toUpperCase.
   * If str starts with characters that are neither letters nor digits,
   * such as quotes or parentheses, the first letter or digit is changed
   * instead. If there is no letter or digit at all, the last character is
   * the one passed through the case conversion.
   *
   * @param str the string to change, may be {@code null}
   * @param toUpperCase true to uppercase, false to lowercase
   * @return the changed string, or {@code str} itself if it is {@code null} or empty
   */
  @Contract("!null, _ -> !null")
  @Nullable
  private static String changeFirstCharCase(@Nullable String str, boolean toUpperCase) {
    if (isEmpty(str)) {
      return str;
    }
    if (str.length() == 1) {
      // Use one fixed locale for both directions so that the result does not
      // depend on the JVM's default locale (e.g. "I" lowercasing to a dotless i
      // under a Turkish default locale).
      return toUpperCase ? str.toUpperCase(Locale.ENGLISH) : str.toLowerCase(Locale.ENGLISH);
    }
    int pos = 0;
    int lastIndex = str.length() - 1;
    // skip leading characters that are neither letters nor digits, but stop at the last character
    while (pos < lastIndex && !Character.isLetterOrDigit(str.charAt(pos))) {
      pos++;
    }
    char firstChar = str.charAt(pos);
    char changedChar = toUpperCase ? Character.toUpperCase(firstChar) : Character.toLowerCase(firstChar);
    return str.substring(0, pos) + changedChar + str.substring(pos + 1);
  }

  /**
   * Reads everything the reader delivers and returns it as one string.
   * Reading stops at the first call that delivers no characters. The reader
   * is not closed by this method.
   *
   * @param reader the reader to drain
   * @return all characters read
   * @throws IOException if reading fails
   */
  public static String readerToString(Reader reader) throws IOException {
    char[] buffer = new char[4000];
    StringBuilder content = new StringBuilder();
    int count = reader.read(buffer, 0, 4000);
    while (count > 0) {
      content.append(buffer, 0, count);
      count = reader.read(buffer, 0, 4000);
    }
    return content.toString();
  }

  /**
   * Reads the complete stream and returns its content as a string, decoded
   * with the named charset. The reader wrapped around the stream is closed
   * by the try-with-resources statement when reading is done.
   *
   * @param is the stream to read
   * @param charsetName name of the charset used for decoding, e.g. {@code utf-8}
   * @return the decoded content of the stream
   * @throws IOException if the charset is not supported or reading fails
   */
  public static String streamToString(InputStream is, String charsetName) throws IOException {
    try (InputStreamReader isr = new InputStreamReader(is, charsetName)) {
      return readerToString(isr);
    }
  }
  
  /**
   * Calls {@link #escapeHTML(String)}, i.e. both methods return the same
   * result for the same input.
   *
   * @param s the text to escape
   * @return the escaped text
   */
  public static String escapeXML(String s) {
    return escapeHTML(s);
  }

  /**
   * Escapes the string with Guava's {@code XmlEscapers.xmlAttributeEscaper()}.
   *
   * @param s the text to escape
   * @return the escaped text
   * @since 2.9
   */
  public static String escapeForXmlAttribute(String s) {
    return XmlEscapers.xmlAttributeEscaper().escape(s);
  }

  /**
   * Escapes the string with Guava's {@code XmlEscapers.xmlContentEscaper()}.
   *
   * @param s the text to escape
   * @return the escaped text
   * @since 2.9
   */
  public static String escapeForXmlContent(String s) {
    return XmlEscapers.xmlContentEscaper().escape(s);
  }

  /**
   * Escapes these characters: less than, greater than, quote, ampersand.
   */
  public static String escapeHTML(String s) {
    // this version is much faster than using s.replaceAll()
    StringBuilder sb = new StringBuilder();
    int n = s.length();
    for (int i = 0; i < n; i++) {
      char c = s.charAt(i);
      switch (c) {
        case '<':
          sb.append("<");
          break;
        case '>':
          sb.append(">");
          break;
        case '&':
          sb.append("&");
          break;
        case '"':
          sb.append(""");
          break;
        default:
          sb.append(c);
          break;
      }
    }
    return sb.toString();
  }

  /**
   * Filters whitespace characters (here: every character up to and including
   * the space character). Useful for trimming the contents of token elements
   * that cannot possibly contain any spaces, with the exception of a single
   * space in a word (for example, if the language supports numbers formatted
   * with spaces as single tokens, as Catalan in LanguageTool).
   * <p>
   * Leading and trailing whitespace is trimmed, runs of two or more whitespace
   * characters are dropped completely, and a single whitespace character is kept
   * unless it is a line feed, tab or carriage return.
   *
   * @param s String to be filtered.
   * @return Filtered s.
   */
  public static String trimWhitespace(String s) {
    String trimmed = s.trim();
    int length = trimmed.length();
    StringBuilder kept = new StringBuilder();
    int idx = 0;
    while (idx < length) {
      // Step over whitespace that has a whitespace neighbour. The trimmed string
      // never ends with whitespace, so idx + 1 is always a valid index here.
      while (trimmed.charAt(idx) <= ' '
          && (trimmed.charAt(idx + 1) <= ' ' || idx > 1 && trimmed.charAt(idx - 1) <= ' ')) {
        idx++;
      }
      char current = trimmed.charAt(idx);
      boolean lineBreakOrTab = current == '\n' || current == '\t' || current == '\r';
      if (!lineBreakOrTab) {
        kept.append(current);
      }
      idx++;
    }
    if (kept.length() == length) {
      return trimmed;
    }
    return kept.toString();
  }

  /**
   * Eliminates special (Unicode) characters, e.g. soft hyphens.
   *
   * @since 4.3
   * @param s String to filter
   * @return s, with every character deleted that is neither whitespace, nor
   *  alphanumeric, nor punctuation
   */
  public static String trimSpecialCharacters(String s) {
    // (?U) switches the character classes to their Unicode definitions; the
    // negated class lists everything that is allowed to stay
    Pattern disallowed = Pattern.compile("(?U)[^\\p{Space}\\p{Alnum}\\p{Punct}]");
    return disallowed.matcher(s).replaceAll("");
  }

  /**
   * Decides whether a space has to be put in front of a word: every word gets
   * one, except for certain single punctuation characters.
   *
   * @param word Word to add the preceding space.
   * @param language
   *          Language of the word (to check typography conventions). For French
   *          (short code {@code fr}) no space is added only before '.' and ',';
   *          for all other languages no space is added before any of
   *          {@code . , ; : ? !}. Only consulted if {@code word} is a single character.
   * @return String containing a space or an empty string.
   */
  public static String addSpace(String word, Language language) {
    if (word.length() != 1) {
      return " ";
    }
    String noSpaceBefore = "fr".equals(language.getShortCode()) ? ".," : ".,;:?!";
    boolean suppressSpace = noSpaceBefore.indexOf(word.charAt(0)) >= 0;
    return suppressSpace ? "" : " ";
  }

  /**
   * Checks if a string is whitespace. A string counts as whitespace if it is
   * empty after {@link String#trim()}, or if it is one of:
   * <ul>
   *   <li>a single character for which {@link Character#isWhitespace(char)} is true</li>
   *   <li>the non-breaking space (U+00A0)</li>
   *   <li>the narrow non-breaking space (U+202F)</li>
   *   <li>the zero width space (U+200B), used in Khmer</li>
   *   <li>the character U+FEFF</li>
   * </ul>
   * The single-character strings U+0001 and U+0002 (field markers in OOo) are
   * never whitespace.
   *
   * @param str String to check
   * @return true if the string is a whitespace character
   */
  public static boolean isWhitespace(String str) {
    boolean isBreakableField = "\u0001".equals(str);   // breakable field in OOo
    boolean isUnbreakableField = "\u0002".equals(str); // unbreakable field, e.g. a footnote number in OOo
    if (isUnbreakableField || isBreakableField) {
      return false;
    }
    if ("\uFEFF".equals(str)) {
      return true;
    }
    String trimStr = str.trim();
    if (isEmpty(trimStr)) {
      return true;
    }
    if (trimStr.length() != 1) {
      return false;
    }
    // We need u200B to be detected as whitespace for Khmer, as it was the case before Java 7.
    boolean isZeroWidthSpace = "\u200B".equals(str);
    // non-breaking space and narrow non-breaking space
    boolean isNonBreakingSpace = "\u00A0".equals(str) || "\u202F".equals(str);
    return isZeroWidthSpace || isNonBreakingSpace || Character.isWhitespace(trimStr.charAt(0));
  }
  
  /**
   * Checks if a string is exactly the non-breaking space character (U+00A0).
   *
   * @param str the string to check, may be {@code null}
   * @return true only if {@code str} consists of that single character
   * @since 2.1
   */
  public static boolean isNonBreakingWhitespace(String str) {
    return "\u00A0".equals(str);
  }

  /**
   * Tests for a non-zero ASCII digit.
   *
   * @param ch Character to check
   * @return True if the character is a positive number (decimal digit from 1 to 9).
   */
  public static boolean isPositiveNumber(char ch) {
    int digitValue = ch - '0';
    return digitValue >= 1 && digitValue <= 9;
  }

  /**
   * Null-safe emptiness check, a helper to replace calls to {@code "".equals()}.
   *
   * @param str String to check
   * @return true if string is empty or {@code null}
   */
  public static boolean isEmpty(@Nullable String str) {
    if (str == null) {
      return true;
    }
    return str.isEmpty();
  }

  /**
   * Simple XML filtering for XML tags: every match of the XML comment pattern
   * is replaced by a single space, then every match of the XML tag pattern is
   * removed.
   *
   * @param str XML string to be filtered.
   * @return Filtered string without XML tags; {@code str} itself if it contains no {@code <}.
   */
  public static String filterXML(String str) {
    // don't run slow regex unless we have to
    if (!str.contains("<")) {
      return str;
    }
    String withoutComments = XML_COMMENT_PATTERN.matcher(str).replaceAll(" ");
    return XML_PATTERN.matcher(withoutComments).replaceAll("");
  }
  
  /**
   * Whether {@link #removeDiacritics(String)} would change the given string.
   *
   * @param str the string to check
   * @return true if {@code str} differs from the result of {@code removeDiacritics(str)}
   */
  public static boolean hasDiacritics(String str) {
    return !str.equals(removeDiacritics(str));
  }
  
  /**
   * Strips diacritics: the string is decomposed (Unicode normalization form
   * NFD) and all characters from the "Combining Diacritical Marks" block are
   * deleted. The result stays in decomposed form.
   *
   * @param str the string to process
   * @return the decomposed string without combining diacritical marks
   */
  public static String removeDiacritics(String str) {
    return Normalizer.normalize(str, Normalizer.Form.NFD)
        .replaceAll("[\\p{InCombiningDiacriticalMarks}]", "");
  }
  
  /**
   * Normalizes the string to Unicode normalization form NFKC using {@link Normalizer}.
   *
   * @param str the string to normalize
   * @return the normalized string
   */
  public static String normalizeNFKC(String str) {
    return Normalizer.normalize(str, Normalizer.Form.NFKC);
  }
  
  /**
   * Normalizes the string to Unicode normalization form NFC using {@link Normalizer}.
   *
   * @param str the string to normalize
   * @return the normalized string
   */
  public static String normalizeNFC(String str) {
    return Normalizer.normalize(str, Normalizer.Form.NFC);
  }
  
  /**
   * Applies the casing of modelString to inputString:
   * a capitalized model yields the lowercased input with its first character
   * uppercased, an all-uppercase model yields the uppercased input. For any
   * other model (empty, all-lowercase, mixed case) the input is returned unchanged.
   *
   * @param inputString the string whose casing is adapted
   * @param modelString the string whose casing is used as the model
   * @return inputString in the casing of modelString
   */
  public static String preserveCase(String inputString, String modelString) {
    if (!modelString.isEmpty()) {
      // capitalization is tested first: modelString="L'" is ambiguous, apply capitalization
      if (isCapitalizedWord(modelString)) {
        String lowercased = inputString.toLowerCase();
        return uppercaseFirstChar(lowercased);
      }
      if (isAllUppercase(modelString)) {
        return inputString.toUpperCase();
      }
    }
    // note: an all-lowercase model does not lowercase the input
    return inputString;
  }

  /**
   * Null-safe conversion of a character sequence to a string.
   *
   * @param s the sequence to convert, may be {@code null}
   * @return {@code s.toString()}, or {@code null} if {@code s} is {@code null}
   */
  @Nullable
  public static String asString(CharSequence s) {
    return s == null ? null : s.toString();
  }

  /**
   * Checks whether the sentence ends with a paragraph break.
   *
   * @param sentence the sentence text, including its trailing line breaks
   * @param singleLineBreaksMarksPara if true, a single trailing line break
   *  ({@code \n} or {@code \n\r}) ends a paragraph; otherwise two consecutive line breaks
   *  ({@code \n\n}, {@code \n\r\n\r} or {@code \r\n\r\n}) are required
   * @return true if the sentence ends a paragraph
   * @since 4.3
   */
  public static boolean isParagraphEnd(String sentence, boolean singleLineBreaksMarksPara) {
    if (singleLineBreaksMarksPara) {
      return sentence.endsWith("\n") || sentence.endsWith("\n\r");
    }
    return sentence.endsWith("\n\n")
        || sentence.endsWith("\n\r\n\r")
        || sentence.endsWith("\r\n\r\n");
  }

  /**
   * Loads a UTF-8 encoded file from the resource directory line by line,
   * ignoring empty lines and comments (lines starting with {@code #}).
   *
   * @param path path in resource dir
   * @return an unmodifiable list of the remaining lines, in file order
   * @throws RuntimeException if reading the file fails
   * @since 4.6
   * @deprecated use DataBroker#getFromResourceDirAsLines(java.lang.String) instead (NOTE: it won't handle comments)
   */
  @Deprecated
  public static List<String> loadLines(String path) {
    List<String> lines = new ArrayList<>();
    // the stream is opened inside the try-with-resources so that it is closed on every path
    try (
      InputStream stream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(path);
      InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8);
      BufferedReader br = new BufferedReader(reader)
    ) {
      String line;
      while ((line = br.readLine()) != null) {
        if (line.isEmpty() || line.charAt(0) == '#') {   // ignore empty lines and comments
          continue;
        }
        lines.add(line);
      }
    } catch (IOException e) {
      throw new RuntimeException("Could not load data from " + path, e);
    }
    return Collections.unmodifiableList(lines);
  }

  /**
   * Will turn a string into a typical rule ID, i.e. uppercase and
   * "_" instead of spaces. Apostrophes become {@code _Q_}.
   *
   * Every character other than the ASCII letters A-Z and the
   * Latin-1 ranges U+00C0-U+00D6 and U+00D8-U+00DE is replaced with "_";
   * this includes digits and all remaining punctuation.
   *
   * "de" locales have a special implementation (ä => ae, etc.).
   *
   * @param input the text to turn into an ID
   * @param language LT language object, used to apply language-specific normalisation rules.
   * @return the normalised ID
   *
   * @since 5.1
   */
  public static String toId(String input, Language language) {
    String languageCode = language.getShortCode();
    // Locale.ROOT makes the ID independent of the JVM's default locale (with a Turkish
    // default locale "i" would uppercase to a dotted capital I and end up as "_").
    // The locale-independent uppercasing still converts ß to SS, so that'll be done for all
    // languages and there's no need to run a separate replace here.
    String normalisedId = input.toUpperCase(Locale.ROOT).trim()
      .replace(' ', '_')
      .replace("'", "_Q_");
    if (Objects.equals(languageCode, "de")) {
      normalisedId = normalisedId
        .replace("Ä", "AE")
        .replace("Ü", "UE")
        .replace("Ö", "OE");
    }
    return normalisedId.replaceAll("[^A-Z\\u00c0-\\u00D6\\u00D8-\\u00DE]", "_");
  }

  /**
   * Whether the string is camelCase: one or more lowercase ASCII letters, then an
   * uppercase ASCII letter, then at least one more ASCII letter.
   * Works only with ASCII input and with single words.
   *
   * @param token the word to check
   * @return true if the whole token has the camelCase shape
   * @since 5.3
   */
  public static boolean isCamelCase(String token) {
    return Pattern.matches("[a-z]+[A-Z][A-Za-z]+", token);
  }
  
  /**
   * Whether the string is a punctuation mark, i.e. consists of exactly one
   * character that is either Unicode punctuation or an apostrophe.
   *
   * @param input the string to check
   * @return true if the whole string is a single punctuation mark
   * @since 5.5
   */
  public static boolean isPunctuationMark(String input) {
    return PUNCTUATION_PATTERN.matcher(input).matches();
  }
  
  /**
   * Whether the string consists of exactly one character that is not a letter
   * (e.g. a digit, a space or a punctuation mark).
   *
   * @param input the string to check
   * @return true if the whole string is a single non-letter character
   * @since 6.1
   */
  public static boolean isNotWordCharacter(String input) {
    return NOT_WORD_CHARACTER.matcher(input).matches();
  }
  
  
  /**
   * Difference between two strings (only one difference): the longest common
   * prefix and the longest common suffix are split off, the rest is the difference.
   *
   * @param s1 the first string
   * @param s2 the second string
   * @return list of four strings: 0: common string at the start; 1: diff in string1;
   *  2: diff in string2; 3: common string at the end. For equal strings the
   *  whole string is reported as the common start.
   * @since 6.2
   */
  public static List<String> getDifference(String s1, String s2) {
    List<String> results = new ArrayList<>(4);
    if (s1.equals(s2)) {
      results.add(s1);
      results.add("");
      results.add("");
      results.add("");
      return results;
    }
    int l1 = s1.length();
    int l2 = s2.length();
    int fromStart = 0;
    while (fromStart < l1 && fromStart < l2 && s1.charAt(fromStart) == s2.charAt(fromStart)) {
      fromStart++;
    }
    int fromEnd = 0;
    while (fromEnd < l1 && fromEnd < l2 && s1.charAt(l1 - 1 - fromEnd) == s2.charAt(l2 - 1 - fromEnd)) {
      fromEnd++;
    }
    // correction (e.g. stress vs stresses): the common suffix must not overlap
    // the common prefix in either string
    fromEnd = Math.min(fromEnd, Math.min(l1, l2) - fromStart);
    // common string at start
    results.add(s1.substring(0, fromStart));
    // diff in string1
    results.add(s1.substring(fromStart, l1 - fromEnd));
    // diff in string2
    results.add(s2.substring(fromStart, l2 - fromEnd));
    // common string at end
    results.add(s1.substring(l1 - fromEnd, l1));
    return results;
  }
  
  
  /**
   * Invent a wrong word to find possible replacements: the first vowel from
   * a fixed list that occurs in the word is replaced, in all its occurrences,
   * by another variant of that vowel. If no listed vowel occurs, a hyphen
   * is appended.
   *
   * @param s the word to distort
   * @return the distorted word
   */
  public static String makeWrong(String s) {
    // {vowel to look for, replacement}; the order decides which vowel wins
    String[][] substitutions = {
      {"a", "ä"}, {"e", "ë"}, {"i", "ï"}, {"o", "ö"}, {"u", "ù"},
      {"á", "ä"}, {"é", "ë"}, {"í", "ï"}, {"ó", "ö"}, {"ú", "ù"},
      {"à", "ä"}, {"è", "ë"}, {"ì", "i"}, {"ò", "ö"},
      {"ï", "ì"}, {"ü", "ù"}
    };
    for (String[] substitution : substitutions) {
      String vowel = substitution[0];
      if (s.contains(vowel)) {
        return s.replace(vowel, substitution[1]);
      }
    }
    return s + "-";
  }

  /**
   * Return str without tashkeel characters, i.e. without the Arabic
   * diacritics listed below and without the tatweel character.
   *
   * @param str input str
   * @return str with all listed characters deleted
   */
  public static String removeTashkeel(String str) {
    String tashkeelCharacterClass = "["
      + "\u064B"  // Fathatan
      + "\u064C"  // Dammatan
      + "\u064D"  // Kasratan
      + "\u064E"  // Fatha
      + "\u064F"  // Damma
      + "\u0650"  // Kasra
      + "\u0651"  // Shadda
      + "\u0652"  // Sukun
      + "\u0653"  // Maddah Above
      + "\u0654"  // Hamza Above
      + "\u0655"  // Hamza Below
      + "\u0656"  // Subscript Alef
      + "\u0640"  // Tatweel
      + "]";
    return str.replaceAll(tashkeelCharacterClass, "");
  }

  /**
   * Whether the string is non-empty and contains no letter at all
   * (e.g. a number or a sequence of punctuation marks).
   *
   * @param input the string to check
   * @return true if the whole string consists of one or more non-letter characters
   */
  public static boolean isNotWordString(String input) {
    return NOT_WORD_STR.matcher(input).matches();
  }
}