edu.stanford.nlp.util.UTF8EquivalenceFunction Maven / Gradle / Ivy
package edu.stanford.nlp.util;
import java.io.Serializable;
import java.util.function.Function;
/**
* A word function that can be applied to Chinese text in the tagger
* or similar systems to make it treat ( and ( the same.
*
* @author John Bauer
*/
public class UTF8EquivalenceFunction implements Function, Serializable {
public static String replaceAscii(String w) {
return StringUtils.tr(w,
"!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~",
"\uFF01\uFF02\uFF03\uFF04\uFF05\uFF06\uFF07\uFF08\uFF09\uFF0A\uFF0B\uFF0C\uFF0D\uFF0E\uFF0F\uFF10\uFF11\uFF12\uFF13\uFF14\uFF15\uFF16\uFF17\uFF18\uFF19\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF1F\uFF20\uFF21\uFF22\uFF23\uFF24\uFF25\uFF26\uFF27\uFF28\uFF29\uFF2A\uFF2B\uFF2C\uFF2D\uFF2E\uFF2F\uFF30\uFF31\uFF32\uFF33\uFF34\uFF35\uFF36\uFF37\uFF38\uFF39\uFF3A\uFF3B\uFF3C\uFF3D\uFF3E\uFF3F\uFF40\uFF41\uFF42\uFF43\uFF44\uFF45\uFF46\uFF47\uFF48\uFF49\uFF4A\uFF4B\uFF4C\uFF4D\uFF4E\uFF4F\uFF50\uFF51\uFF52\uFF53\uFF54\uFF55\uFF56\uFF57\uFF58\uFF59\uFF5A\uFF5B\uFF5C\uFF5D\uFF5E");
}
public String apply(String input) {
if (input == null) {
return null;
}
if (input.equals(".$.") || input.equals(".$$.")) {
return input;
} else {
return replaceAscii(input);
}
}
private static final long serialVersionUID = 1L;
}