opennlp.tools.util.StringUtil Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
There is a newer version: 2.5.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.util;

public class StringUtil {

  /**
   * Determines if the specified character is a whitespace.
   *
   * A character is considered a whitespace when one
   * of the following conditions is meet:
   *
   * 
   * Its a {@link Character#isWhitespace(int)} whitespace.
   * Its a part of the Unicode Zs category ({@link Character#SPACE_SEPARATOR}).
   * 
   *
   * Character.isWhitespace(int) does not include no-break spaces.
   * In OpenNLP no-break spaces are also considered as white spaces.
   *
   * @param charCode
   * @return true if white space otherwise false
   */
  public static boolean isWhitespace(char charCode) {
    return Character.isWhitespace(charCode)  ||
    Character.getType(charCode) == Character.SPACE_SEPARATOR;
  }

  /**
   * Determines if the specified character is a whitespace.
   *
   * A character is considered a whitespace when one
   * of the following conditions is meet:
   *
   * 
   * Its a {@link Character#isWhitespace(int)} whitespace.
   * Its a part of the Unicode Zs category ({@link Character#SPACE_SEPARATOR}).
   * 
   *
   * Character.isWhitespace(int) does not include no-break spaces.
   * In OpenNLP no-break spaces are also considered as white spaces.
   *
   * @param charCode
   * @return true if white space otherwise false
   */
  public static boolean isWhitespace(int charCode) {
    return Character.isWhitespace(charCode)  ||
        Character.getType(charCode) == Character.SPACE_SEPARATOR;
  }


  /**
   * Converts to lower case independent of the current locale via
   * {@link Character#toLowerCase(char)} which uses mapping information
   * from the UnicodeData file.
   *
   * @param string
   * @return lower cased String
   */
  public static String toLowerCase(CharSequence string) {

    char lowerCaseChars[] = new char[string.length()];

    for (int i = 0; i < string.length(); i++) {
      lowerCaseChars[i] = Character.toLowerCase(string.charAt(i));
    }

    return new String(lowerCaseChars);
  }

  /**
   * Converts to upper case independent of the current locale via
   * {@link Character#toUpperCase(char)} which uses mapping information
   * from the UnicodeData file.
   *
   * @param string
   * @return upper cased String
   */
  public static String toUpperCase(CharSequence string) {
    char upperCaseChars[] = new char[string.length()];

    for (int i = 0; i < string.length(); i++) {
      upperCaseChars[i] = Character.toUpperCase(string.charAt(i));
    }

    return new String(upperCaseChars);
  }

  /**
    * Returns true if {@link CharSequence#length()} is
    * 0 or null.
    *
    * @return true if {@link CharSequence#length()} is 0, otherwise
    *         false
    *
    * @since 1.5.1
    */
  public static boolean isEmpty(CharSequence theString) {
	return theString.length() == 0;
  }
  
  /**
   * Get mininum of three values.
   * @param a number a
   * @param b number b
   * @param c number c
   * @return the minimum
   */
  private static int minimum(int a, int b, int c) {
      int minValue;
      minValue = a;
      if (b < minValue) {
        minValue = b;
      }
      if (c < minValue) {
        minValue = c;
      }
      return minValue;
  }
  
  /**
   * Computes the Levenshtein distance of two strings in a matrix.
   * Based on pseudo-code provided here:
   * https://en.wikipedia.org/wiki/Levenshtein_distance#Computing_Levenshtein_distance
   * which in turn is based on the paper Wagner, Robert A.; Fischer, Michael J. (1974),
   * "The String-to-String Correction Problem", Journal of the ACM 21 (1): 168-173
   * @param wordForm the form
   * @param lemma the lemma
   * @return the distance
   */
  public static int[][] levenshteinDistance(String wordForm, String lemma) {

    int wordLength = wordForm.length();
    int lemmaLength = lemma.length();
    int cost;
    int[][] distance = new int[wordLength + 1][lemmaLength + 1];
    
    if (wordLength == 0) {
      return distance;
    }
    if (lemmaLength == 0) {
      return distance;
    }
    //fill in the rows of column 0
    for (int i = 0; i <= wordLength; i++) {
      distance[i][0] = i;
    }
    //fill in the columns of row 0
    for (int j = 0; j <= lemmaLength; j++) {
      distance[0][j] = j;
    }
    //fill in the rest of the matrix calculating the minimum distance
    for (int i = 1; i <= wordLength; i++) {
      int s_i = wordForm.charAt(i - 1);
      for (int j = 1; j <= lemmaLength; j++) {
        if (s_i == lemma.charAt(j - 1)) {
          cost = 0;
        } else {
          cost = 1;
        }
        //obtain minimum distance from calculating deletion, insertion, substitution
        distance[i][j] = minimum(distance[i - 1][j] + 1, distance[i][j - 1] + 1, distance[i - 1][j - 1] + cost);
      }
    }
    return distance;
  }
  
  /**
   * Computes the Shortest Edit Script (SES) to convert a word into its lemma.
   * This is based on Chrupala's PhD thesis (2008).
 * @param wordForm the token
 * @param lemma the target lemma
 * @param distance the levenshtein distance
 * @param permutations the number of permutations
 */
public static void computeShortestEditScript(String wordForm, String lemma, int[][] distance, StringBuffer permutations) {
    
    int n = distance.length;
    int m = distance[0].length;
    
    int wordFormLength = n - 1;
    int lemmaLength = m - 1;
    while(true) {
        
        if (distance[wordFormLength][lemmaLength] == 0) {
          break;
        }
        if ((lemmaLength > 0 && wordFormLength > 0) && (distance[wordFormLength - 1][lemmaLength - 1] < distance[wordFormLength][lemmaLength])) {
            permutations.append('R').append(Integer.toString(wordFormLength - 1)).append(wordForm.charAt(wordFormLength - 1)).append(lemma.charAt(lemmaLength - 1));
            lemmaLength--;
            wordFormLength--;
            continue;
        }
        if (lemmaLength > 0 && (distance[wordFormLength][lemmaLength - 1] < distance[wordFormLength][lemmaLength])) {
            permutations.append('I').append(Integer.toString(wordFormLength)).append(lemma.charAt(lemmaLength - 1));
            lemmaLength--;
            continue;
        }
        if (wordFormLength > 0 && (distance[wordFormLength - 1][lemmaLength] < distance[wordFormLength][lemmaLength])) {
            permutations.append('D').append(Integer.toString(wordFormLength - 1)).append(wordForm.charAt(wordFormLength - 1));
            wordFormLength--;
            continue;
        }
        if ((wordFormLength > 0 && lemmaLength > 0) && (distance[wordFormLength - 1][lemmaLength - 1] == distance[wordFormLength][lemmaLength])) {
            wordFormLength--; lemmaLength--;
            continue ;
        }
        if (wordFormLength > 0 && (distance[wordFormLength - 1][lemmaLength] == distance[wordFormLength][lemmaLength])) {
            wordFormLength--;
            continue;
        }
        if (lemmaLength > 0 && (distance[wordFormLength][lemmaLength - 1] == distance[wordFormLength][lemmaLength])) {
            lemmaLength--;
            continue;
        }   
    }
}

/**
 * Read predicted SES by the lemmatizer model and apply the
 * permutations to obtain the lemma from the wordForm.
 * @param wordForm the wordForm
 * @param permutations the permutations predicted by the lemmatizer model
 * @return the lemma
 */
public static String decodeShortestEditScript(String wordForm, String permutations) {
  
  StringBuffer lemma = new StringBuffer(wordForm).reverse();
  
  int permIndex = 0;
  while(true) {
      if (permutations.length() <= permIndex) {
        break;
      }
      //read first letter of permutation string
      char nextOperation = permutations.charAt(permIndex);
      //System.err.println("-> NextOP: " + nextOperation);
      //go to the next permutation letter
      permIndex++;
      if (nextOperation == 'R') {
          String charAtPerm = Character.toString(permutations.charAt(permIndex));
          int charIndex = Integer.parseInt(charAtPerm);
          // go to the next character in the permutation buffer
          // which is the replacement character
          permIndex++;
          char replace = permutations.charAt(permIndex);
          //go to the next char in the permutation buffer
          // which is the candidate character
          permIndex++;
          char with = permutations.charAt(permIndex);
          
          if (lemma.length() <= charIndex) {
            return wordForm; 
          }
          if (lemma.charAt(charIndex) == replace) {
            lemma.setCharAt(charIndex, with);
          }
          //System.err.println("-> ROP: " + lemma.toString());
          //go to next permutation
          permIndex++;
          
      } else if (nextOperation == 'I') {
          String charAtPerm = Character.toString(permutations.charAt(permIndex));
          int charIndex = Integer.parseInt(charAtPerm);
          permIndex++;
          //character to be inserted
          char in = permutations.charAt(permIndex);
      
          if (lemma.length() < charIndex) {
            return wordForm; 
          }
          lemma.insert(charIndex, in);
          //System.err.println("-> IOP " + lemma.toString());
          //go to next permutation
          permIndex++;
      } else if (nextOperation == 'D') {
          String charAtPerm = Character.toString(permutations.charAt(permIndex));
          int charIndex = Integer.parseInt(charAtPerm);
          if (lemma.length() <= charIndex) {
            return wordForm;
          }
          lemma.deleteCharAt(charIndex);
          permIndex++;
          // go to next permutation
          permIndex++;
      }
  }
  return lemma.reverse().toString();
}

/**
 * Get the SES required to go from a word to a lemma.
 * @param wordForm the word
 * @param lemma the lemma
 * @return the shortest edit script
 */
public static String getShortestEditScript(String wordForm, String lemma) {
  String reversedWF = new StringBuffer(wordForm.toLowerCase()).reverse().toString();
  String reversedLemma = new StringBuffer(lemma.toLowerCase()).reverse().toString();
  StringBuffer permutations = new StringBuffer();
  String ses;
  if (!reversedWF.equals(reversedLemma)) {
    int[][]levenDistance = StringUtil.levenshteinDistance(reversedWF, reversedLemma);
    StringUtil.computeShortestEditScript(reversedWF, reversedLemma, levenDistance, permutations);
    ses = permutations.toString();
  } else {
    ses = "O";
  }
  return ses;
}

}