opennlp.tools.util.StringUtil Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
There is a newer version: 2.5.3
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.util;

import java.nio.CharBuffer;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class StringUtil {

  private static final Logger logger = LoggerFactory.getLogger(StringUtil.class);

  /**
   * Checks whether the given {@code char} counts as whitespace in OpenNLP.
   * <p>
   * A character qualifies when at least one of the following holds:
   * <ul>
   *   <li>{@link Character#isWhitespace(char)} reports it as whitespace.</li>
   *   <li>Its general category is Unicode Zs ({@link Character#SPACE_SEPARATOR}).</li>
   * </ul>
   * <p>
   * The category check is needed because {@link Character#isWhitespace(char)} does not
   * include no-break spaces, whereas OpenNLP treats them as white spaces as well.
   *
   * @param charCode The character to check.
   *
   * @return {@code true} if {@code charCode} represents a white space, {@code false} otherwise.
   */
  public static boolean isWhitespace(char charCode) {
    // Zs covers the no-break spaces that Character.isWhitespace deliberately excludes.
    if (Character.getType(charCode) == Character.SPACE_SEPARATOR) {
      return true;
    }
    return Character.isWhitespace(charCode);
  }

  /**
   * Checks whether the given Unicode code point counts as whitespace in OpenNLP.
   * <p>
   * A code point qualifies when at least one of the following holds:
   * <ul>
   *   <li>{@link Character#isWhitespace(int)} reports it as whitespace.</li>
   *   <li>Its general category is Unicode Zs ({@link Character#SPACE_SEPARATOR}).</li>
   * </ul>
   * <p>
   * The category check is needed because {@link Character#isWhitespace(int)} does not
   * include no-break spaces, whereas OpenNLP treats them as white spaces as well.
   *
   * @param charCode An int representation of a character to check.
   *
   * @return {@code true} if {@code charCode} represents a white space, {@code false} otherwise.
   */
  public static boolean isWhitespace(int charCode) {
    // Zs covers the no-break spaces that Character.isWhitespace deliberately excludes.
    if (Character.getType(charCode) == Character.SPACE_SEPARATOR) {
      return true;
    }
    return Character.isWhitespace(charCode);
  }


  /**
   * Lower-cases a {@link CharSequence} one Unicode code point at a time.
   * <p>
   * The conversion does not depend on the current {@link java.util.Locale}: every code
   * point is mapped through {@link Character#toLowerCase(int)}, which uses the mapping
   * information from the UnicodeData file.
   *
   * @param string The {@link CharSequence} to transform.
   * @return The lower-cased String.
   */
  public static String toLowerCase(CharSequence string) {
    StringBuilder lowered = new StringBuilder(string.length());
    string.codePoints()
        .forEach(codePoint -> lowered.appendCodePoint(Character.toLowerCase(codePoint)));
    return lowered.toString();
  }

  /**
   * Lower-cases a {@link CharSequence} into a newly allocated {@link CharBuffer}.
   * <p>
   * Each Unicode code point of {@code sequence} is mapped through
   * {@link Character#toLowerCase(int)} and its UTF-16 representation is written to the
   * buffer. The buffer is allocated with a capacity of {@code sequence.length()} and is
   * returned with its position reset to {@code 0} and its limit set to its capacity.
   *
   * @param sequence The {@link CharSequence} to transform.
   * @return A {@link CharBuffer} holding the lower-cased characters.
   */
  public static CharBuffer toLowerCaseCharBuffer(CharSequence sequence) {
    final int length = sequence.length();
    final CharBuffer lowered = CharBuffer.allocate(length);
    int index = 0;
    while (index < length) {
      final int codePoint = Character.codePointAt(sequence, index);
      lowered.put(Character.toChars(Character.toLowerCase(codePoint)));
      // a supplementary code point occupies two chars of the input
      index += Character.charCount(codePoint);
    }
    // reset the position so that readers start at the first character
    lowered.clear();
    return lowered;
  }

  /*
  public static CharBuffer toLowerCaseCharBuffer(CharSequence string) {
    int[] cp = string.codePoints().map(Character::toLowerCase).toArray();
    CharBuffer result = CharBuffer.allocate(string.length());
    for (int j : cp) {
      char[] chars = Character.toChars(j);
      result.put(chars, 0, chars.length);
    }
    result.clear();
    return result;
  }
  */

  /**
   * Converts a {@link CharSequence} to upper case, independent of the current
   * {@link java.util.Locale} via {@link Character#toUpperCase(int)} which uses
   * mapping information from the UnicodeData file.
   * <p>
   * The conversion works on Unicode code points rather than on individual UTF-16
   * {@code char} values, so characters outside the Basic Multilingual Plane (which are
   * stored as surrogate pairs) are upper-cased as well.
   *
   * @param string The {@link CharSequence} to transform.
   * @return The upper-cased String
   */
  public static String toUpperCase(CharSequence string) {
    // Map whole code points: a lone surrogate char has no case mapping of its own.
    int[] cp = string.codePoints().map(Character::toUpperCase).toArray();
    return new String(cp, 0, cp.length);
  }

  /**
   * Checks whether a {@link CharSequence} is {@code null} or has no characters.
   *
   * @param theString The {@link CharSequence} to check; may be {@code null}.
   *
   * @return {@code true} if {@code theString} is {@code null} or its
   *         {@link CharSequence#length()} is {@code 0}, otherwise {@code false}
   *
   * @since 1.5.1
   */
  public static boolean isEmpty(CharSequence theString) {
    return theString == null || theString.length() == 0;
  }

  /**
   * Get the minimum of three values.
   *
   * @param a number a
   * @param b number b
   * @param c number c
   * @return the minimum among the three parameters {@code a}, {@code b} or {@code c}.
   */
  private static int minimum(int a, int b, int c) {
    return Math.min(a, Math.min(b, c));
  }

  /**
   * Computes the Levenshtein distance of two strings in a matrix.
   * <p>
   * The matrix is filled with the dynamic-programming scheme described in
   * Wagner, Robert A.; Fischer, Michael J. (1974),
   * "The String-to-String Correction Problem", Journal of the ACM 21 (1): 168-173.
   * <p>
   * The result has {@code wordForm.length() + 1} rows and {@code lemma.length() + 1}
   * columns. Cell {@code [i][j]} holds the edit distance between the first {@code i}
   * characters of {@code wordForm} and the first {@code j} characters of {@code lemma};
   * the bottom-right cell therefore holds the distance between the complete strings.
   * Row {@code 0} and column {@code 0} are always populated, also when one of the inputs
   * is empty, so the distance to an empty string equals the length of the other string.
   *
   * @param wordForm The form as input.
   * @param lemma The target lemma.
   * @return A 2-dimensional Levenshtein distance matrix.
   */
  public static int[][] levenshteinDistance(String wordForm, String lemma) {
    int wordLength = wordForm.length();
    int lemmaLength = lemma.length();
    int[][] distance = new int[wordLength + 1][lemmaLength + 1];

    //fill in the rows of column 0: deleting i characters yields the empty prefix
    for (int i = 0; i <= wordLength; i++) {
      distance[i][0] = i;
    }
    //fill in the columns of row 0: inserting j characters builds the lemma prefix
    for (int j = 0; j <= lemmaLength; j++) {
      distance[0][j] = j;
    }
    //fill in the rest of the matrix calculating the minimum distance
    for (int i = 1; i <= wordLength; i++) {
      char wordChar = wordForm.charAt(i - 1);
      for (int j = 1; j <= lemmaLength; j++) {
        int cost = wordChar == lemma.charAt(j - 1) ? 0 : 1;
        //obtain minimum distance from calculating deletion, insertion, substitution
        distance[i][j] = minimum(
            distance[i - 1][j] + 1,
            distance[i][j - 1] + 1,
            distance[i - 1][j - 1] + cost);
      }
    }
    return distance;
  }

  /**
   * Computes the Shortest Edit Script (SES) to convert a word into its lemma.
   * This is based on Chrupala's PhD thesis (2008).
   * <p>
   * The matrix is traced back from its bottom-right cell until a cell with the value
   * {@code 0} is reached. Every step that lowers the distance appends one operation to
   * {@code permutations}:
   * <ul>
   *   <li>{@code R<index><wordChar><lemmaChar>} - replace {@code wordChar} at
   *       {@code index} of {@code wordForm} with {@code lemmaChar}.</li>
   *   <li>{@code I<index><lemmaChar>} - insert {@code lemmaChar} at {@code index}.</li>
   *   <li>{@code D<index><wordChar>} - delete {@code wordChar} at {@code index}.</li>
   * </ul>
   * Steps that keep the distance unchanged append nothing. The index is appended in its
   * decimal form and can therefore span more than one character for positions beyond 9.
   *
   * @param wordForm The token.
   * @param lemma The target lemma.
   * @param distance A 2-dimensional Levenshtein distance matrix with at least one row.
   * @param permutations The buffer the operations of the edit script are appended to.
   *
   * @throws IllegalArgumentException Thrown if {@code distance} is inconsistent so that
   *         no step of the back-trace applies at a cell with a non-zero value.
   */
  public static void computeShortestEditScript(String wordForm, String lemma,
      int[][] distance, StringBuffer permutations) {

    // start at the bottom-right cell, i.e. the distance between the complete strings
    int wordPos = distance.length - 1;
    int lemmaPos = distance[0].length - 1;

    while (distance[wordPos][lemmaPos] != 0) {
      final int current = distance[wordPos][lemmaPos];
      final boolean hasDiagonal = lemmaPos > 0 && wordPos > 0;

      // cost-lowering steps come first: replacement, then insertion, then deletion
      if (hasDiagonal && distance[wordPos - 1][lemmaPos - 1] < current) {
        permutations.append('R').append(wordPos - 1)
            .append(wordForm.charAt(wordPos - 1)).append(lemma.charAt(lemmaPos - 1));
        lemmaPos--;
        wordPos--;
        continue;
      }
      if (lemmaPos > 0 && distance[wordPos][lemmaPos - 1] < current) {
        permutations.append('I').append(wordPos)
            .append(lemma.charAt(lemmaPos - 1));
        lemmaPos--;
        continue;
      }
      if (wordPos > 0 && distance[wordPos - 1][lemmaPos] < current) {
        permutations.append('D').append(wordPos - 1)
            .append(wordForm.charAt(wordPos - 1));
        wordPos--;
        continue;
      }
      // cost-neutral steps: move on without emitting an operation
      if (hasDiagonal && distance[wordPos - 1][lemmaPos - 1] == current) {
        wordPos--;
        lemmaPos--;
        continue;
      }
      if (wordPos > 0 && distance[wordPos - 1][lemmaPos] == current) {
        wordPos--;
        continue;
      }
      if (lemmaPos > 0 && distance[wordPos][lemmaPos - 1] == current) {
        lemmaPos--;
        continue;
      }
      // No neighbouring cell is <= the current one: the cursor can never move again,
      // so fail fast instead of looping forever.
      throw new IllegalArgumentException("Inconsistent distance matrix: no back-trace step "
          + "applies at cell [" + wordPos + "][" + lemmaPos + "] with value " + current);
    }
  }

  /**
   * Reads the predicted Shortest Edit Script (SES) by a lemmatizer model and applies the
   * permutations to obtain the lemma from the {@code wordForm}.
   * <p>
   * The script is applied to the reversed {@code wordForm}; the result is reversed again
   * before it is returned. The following operations are understood, each index being a
   * single digit that refers to a position in the reversed word:
   * <ul>
   *   <li>{@code R<index><old><new>} - if the character at {@code index} equals
   *       {@code old}, it is replaced with {@code new}.</li>
   *   <li>{@code I<index><char>} - {@code char} is inserted at {@code index}.</li>
   *   <li>{@code D<index><char>} - the character at {@code index} is deleted;
   *       {@code char} is skipped without being inspected.</li>
   * </ul>
   * Any other character in operation position is skipped. If an operation cannot be
   * applied - its index is not a digit, lies outside the current word, or the script
   * ends before the required operands - the unmodified {@code wordForm} is returned.
   *
   * @param wordForm The wordForm as input.
   * @param permutations The permutations predicted by the lemmatizer model.
   * @return The decoded lemma, or {@code wordForm} if the script could not be applied.
   */
  public static String decodeShortestEditScript(String wordForm, String permutations) {

    // purely local, so the unsynchronized StringBuilder is sufficient
    StringBuilder lemma = new StringBuilder(wordForm).reverse();
    final int scriptLength = permutations.length();

    int permIndex = 0;
    while (permIndex < scriptLength) {
      //read first letter of permutation string
      char nextOperation = permutations.charAt(permIndex);
      if (logger.isTraceEnabled()) {
        logger.trace("-> NextOP: {}", nextOperation);
      }
      //go to the first operand of the operation
      permIndex++;
      if (nextOperation == 'R') {
        // requires three operands: index, character to replace, replacement character
        if (scriptLength - permIndex < 3) {
          return wordForm;
        }
        // Character.digit yields -1 for anything that is not a decimal digit
        int charIndex = Character.digit(permutations.charAt(permIndex), 10);
        char replace = permutations.charAt(permIndex + 1);
        char with = permutations.charAt(permIndex + 2);

        if (charIndex < 0 || lemma.length() <= charIndex) {
          return wordForm;
        }
        if (lemma.charAt(charIndex) == replace) {
          lemma.setCharAt(charIndex, with);
        }
        if (logger.isTraceEnabled()) {
          logger.trace("-> ROP: {}", lemma);
        }
        //go to next permutation
        permIndex += 3;

      } else if (nextOperation == 'I') {
        // requires two operands: index, character to be inserted
        if (scriptLength - permIndex < 2) {
          return wordForm;
        }
        int charIndex = Character.digit(permutations.charAt(permIndex), 10);
        char in = permutations.charAt(permIndex + 1);

        // inserting at lemma.length() (i.e. appending) is allowed
        if (charIndex < 0 || lemma.length() < charIndex) {
          return wordForm;
        }
        lemma.insert(charIndex, in);

        if (logger.isTraceEnabled()) {
          logger.trace("-> IOP {}", lemma);
        }
        //go to next permutation
        permIndex += 2;

      } else if (nextOperation == 'D') {
        // only the index is read; the deleted character that follows is skipped unseen
        if (scriptLength - permIndex < 1) {
          return wordForm;
        }
        int charIndex = Character.digit(permutations.charAt(permIndex), 10);
        if (charIndex < 0 || lemma.length() <= charIndex) {
          return wordForm;
        }
        lemma.deleteCharAt(charIndex);
        // go to next permutation
        permIndex += 2;
      }
    }
    return lemma.reverse().toString();
  }

  /**
   * Retrieves the Shortest Edit Script (SES) required to go from a word to a lemma.
   * <p>
   * Both arguments are lower-cased with {@link java.util.Locale#ROOT}, so the result does
   * not depend on the default locale of the JVM, and are then reversed before the script
   * is computed. Positions in the script therefore count from the end of the word.
   *
   * @param wordForm The word as input.
   * @param lemma The target lemma.
   * @return The edit script, or {@code "O"} if the lower-cased {@code wordForm} and
   *         {@code lemma} are equal.
   */
  public static String getShortestEditScript(String wordForm, String lemma) {
    String reversedWF =
        new StringBuilder(wordForm.toLowerCase(java.util.Locale.ROOT)).reverse().toString();
    String reversedLemma =
        new StringBuilder(lemma.toLowerCase(java.util.Locale.ROOT)).reverse().toString();

    if (reversedWF.equals(reversedLemma)) {
      // nothing to edit
      return "O";
    }

    // StringBuffer is dictated by the signature of computeShortestEditScript
    StringBuffer permutations = new StringBuffer();
    int[][] levenDistance = StringUtil.levenshteinDistance(reversedWF, reversedLemma);
    StringUtil.computeShortestEditScript(reversedWF, reversedLemma, levenDistance, permutations);
    return permutations.toString();
  }

}