All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.util.UTF8EquivalenceFunction Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.util;

import java.io.Serializable;
import java.util.function.Function;

/**
 * A word function that can be applied to Chinese text in the tagger
 * or similar systems to make it treat ( and ( the same.
 * That is, it maps regular ASCII-range printable characters to their
 * full-width equivalents
 *
 * @author John Bauer
 */
public class UTF8EquivalenceFunction implements Function, Serializable {

  public static String replaceAscii(String w) {
    return StringUtils.tr(w,
                          "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~",
                          "\uFF01\uFF02\uFF03\uFF04\uFF05\uFF06\uFF07\uFF08\uFF09\uFF0A\uFF0B\uFF0C\uFF0D\uFF0E\uFF0F\uFF10\uFF11\uFF12\uFF13\uFF14\uFF15\uFF16\uFF17\uFF18\uFF19\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF1F\uFF20\uFF21\uFF22\uFF23\uFF24\uFF25\uFF26\uFF27\uFF28\uFF29\uFF2A\uFF2B\uFF2C\uFF2D\uFF2E\uFF2F\uFF30\uFF31\uFF32\uFF33\uFF34\uFF35\uFF36\uFF37\uFF38\uFF39\uFF3A\uFF3B\uFF3C\uFF3D\uFF3E\uFF3F\uFF40\uFF41\uFF42\uFF43\uFF44\uFF45\uFF46\uFF47\uFF48\uFF49\uFF4A\uFF4B\uFF4C\uFF4D\uFF4E\uFF4F\uFF50\uFF51\uFF52\uFF53\uFF54\uFF55\uFF56\uFF57\uFF58\uFF59\uFF5A\uFF5B\uFF5C\uFF5D\uFF5E");
  }

  @Override
  public String apply(String input) {
    if (input == null) {
      return null;
    } else if (input.equals(".$.") || input.equals(".$$.")) {
      return input;
    } else {
      return replaceAscii(input);
    }
  }

  private static final long serialVersionUID = 1L;

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy