opennlp.tools.util.StringUtil Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
There is a newer version: 2.5.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.util;

public class StringUtil {

  /**
   * Determines if the specified character is a whitespace.
   *
   * A character is considered a whitespace when one
   * of the following conditions is meet:
   *
   * 
   * Its a {@link Character#isWhitespace(int)} whitespace.
   * Its a part of the Unicode Zs category ({@link Character#SPACE_SEPARATOR}).
   * 
   *
   * Character.isWhitespace(int) does not include no-break spaces.
   * In OpenNLP no-break spaces are also considered as white spaces.
   *
   * @param charCode
   * @return true if white space otherwise false
   */
  public static boolean isWhitespace(char charCode) {
    return Character.isWhitespace(charCode)  ||
        Character.getType(charCode) == Character.SPACE_SEPARATOR;
  }

  /**
   * Determines if the specified character is a whitespace.
   *
   * A character is considered a whitespace when one
   * of the following conditions is meet:
   *
   * 
   * Its a {@link Character#isWhitespace(int)} whitespace.
   * Its a part of the Unicode Zs category ({@link Character#SPACE_SEPARATOR}).
   * 
   *
   * Character.isWhitespace(int) does not include no-break spaces.
   * In OpenNLP no-break spaces are also considered as white spaces.
   *
   * @param charCode
   * @return true if white space otherwise false
   */
  public static boolean isWhitespace(int charCode) {
    return Character.isWhitespace(charCode)  ||
        Character.getType(charCode) == Character.SPACE_SEPARATOR;
  }


  /**
   * Converts to lower case independent of the current locale via
   * {@link Character#toLowerCase(char)} which uses mapping information
   * from the UnicodeData file.
   *
   * @param string
   * @return lower cased String
   */
  public static String toLowerCase(CharSequence string) {
    char lowerCaseChars[] = new char[string.length()];

    for (int i = 0; i < string.length(); i++) {
      lowerCaseChars[i] = Character.toLowerCase(string.charAt(i));
    }

    return new String(lowerCaseChars);
  }

  /**
   * Converts to upper case independent of the current locale via
   * {@link Character#toUpperCase(char)} which uses mapping information
   * from the UnicodeData file.
   *
   * @param string
   * @return upper cased String
   */
  public static String toUpperCase(CharSequence string) {
    char upperCaseChars[] = new char[string.length()];

    for (int i = 0; i < string.length(); i++) {
      upperCaseChars[i] = Character.toUpperCase(string.charAt(i));
    }

    return new String(upperCaseChars);
  }

  /**
   * Returns true if {@link CharSequence#length()} is
   * 0 or null.
   *
   * @return true if {@link CharSequence#length()} is 0, otherwise
   *         false
   *
   * @since 1.5.1
   */
  public static boolean isEmpty(CharSequence theString) {
    return theString.length() == 0;
  }

  /**
   * Get mininum of three values.
   * @param a number a
   * @param b number b
   * @param c number c
   * @return the minimum
   */
  private static int minimum(int a, int b, int c) {
    int minValue;
    minValue = a;
    if (b < minValue) {
      minValue = b;
    }
    if (c < minValue) {
      minValue = c;
    }
    return minValue;
  }

  /**
   * Computes the Levenshtein distance of two strings in a matrix.
   * Based on pseudo-code provided here:
   * https://en.wikipedia.org/wiki/Levenshtein_distance#Computing_Levenshtein_distance
   * which in turn is based on the paper Wagner, Robert A.; Fischer, Michael J. (1974),
   * "The String-to-String Correction Problem", Journal of the ACM 21 (1): 168-173
   * @param wordForm the form
   * @param lemma the lemma
   * @return the distance
   */
  public static int[][] levenshteinDistance(String wordForm, String lemma) {
    int wordLength = wordForm.length();
    int lemmaLength = lemma.length();
    int cost;
    int[][] distance = new int[wordLength + 1][lemmaLength + 1];

    if (wordLength == 0) {
      return distance;
    }
    if (lemmaLength == 0) {
      return distance;
    }
    //fill in the rows of column 0
    for (int i = 0; i <= wordLength; i++) {
      distance[i][0] = i;
    }
    //fill in the columns of row 0
    for (int j = 0; j <= lemmaLength; j++) {
      distance[0][j] = j;
    }
    //fill in the rest of the matrix calculating the minimum distance
    for (int i = 1; i <= wordLength; i++) {
      int s_i = wordForm.charAt(i - 1);
      for (int j = 1; j <= lemmaLength; j++) {
        if (s_i == lemma.charAt(j - 1)) {
          cost = 0;
        } else {
          cost = 1;
        }
        //obtain minimum distance from calculating deletion, insertion, substitution
        distance[i][j] = minimum(distance[i - 1][j] + 1, distance[i][j - 1]
            + 1, distance[i - 1][j - 1] + cost);
      }
    }
    return distance;
  }

  /**
   * Computes the Shortest Edit Script (SES) to convert a word into its lemma.
   * This is based on Chrupala's PhD thesis (2008).
   * @param wordForm the token
   * @param lemma the target lemma
   * @param distance the levenshtein distance
   * @param permutations the number of permutations
   */
  public static void computeShortestEditScript(String wordForm, String lemma,
      int[][] distance, StringBuffer permutations) {

    int n = distance.length;
    int m = distance[0].length;

    int wordFormLength = n - 1;
    int lemmaLength = m - 1;
    while (true) {

      if (distance[wordFormLength][lemmaLength] == 0) {
        break;
      }
      if ((lemmaLength > 0 && wordFormLength > 0) && (distance[wordFormLength - 1][lemmaLength - 1]
          < distance[wordFormLength][lemmaLength])) {
        permutations.append('R').append(Integer.toString(wordFormLength - 1))
            .append(wordForm.charAt(wordFormLength - 1)).append(lemma.charAt(lemmaLength - 1));
        lemmaLength--;
        wordFormLength--;
        continue;
      }
      if (lemmaLength > 0 && (distance[wordFormLength][lemmaLength - 1]
          < distance[wordFormLength][lemmaLength])) {
        permutations.append('I').append(Integer.toString(wordFormLength))
            .append(lemma.charAt(lemmaLength - 1));
        lemmaLength--;
        continue;
      }
      if (wordFormLength > 0 && (distance[wordFormLength - 1][lemmaLength]
          < distance[wordFormLength][lemmaLength])) {
        permutations.append('D').append(Integer.toString(wordFormLength - 1))
            .append(wordForm.charAt(wordFormLength - 1));
        wordFormLength--;
        continue;
      }
      if ((wordFormLength > 0 && lemmaLength > 0) && (distance[wordFormLength - 1][lemmaLength - 1]
          == distance[wordFormLength][lemmaLength])) {
        wordFormLength--;
        lemmaLength--;
        continue ;
      }
      if (wordFormLength > 0 && (distance[wordFormLength - 1][lemmaLength]
          == distance[wordFormLength][lemmaLength])) {
        wordFormLength--;
        continue;
      }
      if (lemmaLength > 0 && (distance[wordFormLength][lemmaLength - 1]
          == distance[wordFormLength][lemmaLength])) {
        lemmaLength--;
      }
    }
  }

  /**
   * Read predicted SES by the lemmatizer model and apply the
   * permutations to obtain the lemma from the wordForm.
   * @param wordForm the wordForm
   * @param permutations the permutations predicted by the lemmatizer model
   * @return the lemma
   */
  public static String decodeShortestEditScript(String wordForm, String permutations) {

    StringBuffer lemma = new StringBuffer(wordForm).reverse();

    int permIndex = 0;
    while (true) {
      if (permutations.length() <= permIndex) {
        break;
      }
      //read first letter of permutation string
      char nextOperation = permutations.charAt(permIndex);
      //System.err.println("-> NextOP: " + nextOperation);
      //go to the next permutation letter
      permIndex++;
      if (nextOperation == 'R') {
        String charAtPerm = Character.toString(permutations.charAt(permIndex));
        int charIndex = Integer.parseInt(charAtPerm);
        // go to the next character in the permutation buffer
        // which is the replacement character
        permIndex++;
        char replace = permutations.charAt(permIndex);
        //go to the next char in the permutation buffer
        // which is the candidate character
        permIndex++;
        char with = permutations.charAt(permIndex);

        if (lemma.length() <= charIndex) {
          return wordForm;
        }
        if (lemma.charAt(charIndex) == replace) {
          lemma.setCharAt(charIndex, with);
        }
        //System.err.println("-> ROP: " + lemma.toString());
        //go to next permutation
        permIndex++;

      } else if (nextOperation == 'I') {
        String charAtPerm = Character.toString(permutations.charAt(permIndex));
        int charIndex = Integer.parseInt(charAtPerm);
        permIndex++;
        //character to be inserted
        char in = permutations.charAt(permIndex);

        if (lemma.length() < charIndex) {
          return wordForm;
        }
        lemma.insert(charIndex, in);
        //System.err.println("-> IOP " + lemma.toString());
        //go to next permutation
        permIndex++;
      } else if (nextOperation == 'D') {
        String charAtPerm = Character.toString(permutations.charAt(permIndex));
        int charIndex = Integer.parseInt(charAtPerm);
        if (lemma.length() <= charIndex) {
          return wordForm;
        }
        lemma.deleteCharAt(charIndex);
        permIndex++;
        // go to next permutation
        permIndex++;
      }
    }
    return lemma.reverse().toString();
  }

  /**
   * Get the SES required to go from a word to a lemma.
   * @param wordForm the word
   * @param lemma the lemma
   * @return the shortest edit script
   */
  public static String getShortestEditScript(String wordForm, String lemma) {
    String reversedWF = new StringBuffer(wordForm.toLowerCase()).reverse().toString();
    String reversedLemma = new StringBuffer(lemma.toLowerCase()).reverse().toString();
    StringBuffer permutations = new StringBuffer();
    String ses;
    if (!reversedWF.equals(reversedLemma)) {
      int[][]levenDistance = StringUtil.levenshteinDistance(reversedWF, reversedLemma);
      StringUtil.computeShortestEditScript(reversedWF, reversedLemma, levenDistance, permutations);
      ses = permutations.toString();
    } else {
      ses = "O";
    }
    return ses;
  }

}