edu.stanford.nlp.process.WordShapeClassifier Maven / Gradle / Ivy
package edu.stanford.nlp.process;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Timing;
// TODO: put in a regexp for ordinals, fraction num/num and perhaps even 30-5/8
/**
* Provides static methods which
* map any String to another String indicative of its "word shape" -- e.g.,
* whether capitalized, numeric, etc. Different implementations may
* implement quite different, normally language specific ideas of what
* word shapes are useful.
*
* @author Christopher Manning
* @author Dan Klein
*/
public class WordShapeClassifier {
public static final int NOWORDSHAPE = -1;
public static final int WORDSHAPEDAN1 = 0;
public static final int WORDSHAPECHRIS1 = 1;
public static final int WORDSHAPEDAN2 = 2;
public static final int WORDSHAPEDAN2USELC = 3;
public static final int WORDSHAPEDAN2BIO = 4;
public static final int WORDSHAPEDAN2BIOUSELC = 5;
public static final int WORDSHAPEJENNY1 = 6;
public static final int WORDSHAPEJENNY1USELC = 7;
public static final int WORDSHAPECHRIS2 = 8;
public static final int WORDSHAPECHRIS2USELC = 9;
public static final int WORDSHAPECHRIS3 = 10;
public static final int WORDSHAPECHRIS3USELC = 11;
public static final int WORDSHAPECHRIS4 = 12;
public static final int WORDSHAPEDIGITS = 13;
public static final int WORDSHAPECHINESE = 14;
public static final int WORDSHAPECLUSTER1 = 15;
// This class cannot be instantiated
private WordShapeClassifier() {
}
/** Look up a shaper by a short String name.
*
* @param name Shaper name. Known names have patterns along the lines of:
* dan[12](bio)?(UseLC)?, jenny1(useLC)?, chris[1234](useLC)?, cluster1.
* @return An integer constant for the shaper
*/
public static int lookupShaper(String name) {
if (name == null) {
return NOWORDSHAPE;
} else if (name.equalsIgnoreCase("dan1")) {
return WORDSHAPEDAN1;
} else if (name.equalsIgnoreCase("chris1")) {
return WORDSHAPECHRIS1;
} else if (name.equalsIgnoreCase("dan2")) {
return WORDSHAPEDAN2;
} else if (name.equalsIgnoreCase("dan2useLC")) {
return WORDSHAPEDAN2USELC;
} else if (name.equalsIgnoreCase("dan2bio")) {
return WORDSHAPEDAN2BIO;
} else if (name.equalsIgnoreCase("dan2bioUseLC")) {
return WORDSHAPEDAN2BIOUSELC;
} else if (name.equalsIgnoreCase("jenny1")) {
return WORDSHAPEJENNY1;
} else if (name.equalsIgnoreCase("jenny1useLC")) {
return WORDSHAPEJENNY1USELC;
} else if (name.equalsIgnoreCase("chris2")) {
return WORDSHAPECHRIS2;
} else if (name.equalsIgnoreCase("chris2useLC")) {
return WORDSHAPECHRIS2USELC;
} else if (name.equalsIgnoreCase("chris3")) {
return WORDSHAPECHRIS3;
} else if (name.equalsIgnoreCase("chris3useLC")) {
return WORDSHAPECHRIS3USELC;
} else if (name.equalsIgnoreCase("chris4")) {
return WORDSHAPECHRIS4;
} else if (name.equalsIgnoreCase("digits")) {
return WORDSHAPEDIGITS;
} else if (name.equalsIgnoreCase("chinese")) {
return WORDSHAPECHINESE;
} else if (name.equalsIgnoreCase("cluster1")) {
return WORDSHAPECLUSTER1;
} else {
return NOWORDSHAPE;
}
}
/**
* Returns true if the specified word shaper doesn't use
* known lower case words, even if a list of them is present.
* This is used for backwards compatibility. It is suggested that
* new word shape functions are either passed a non-null list of
* lowercase words or not, depending on whether you want knownLC marking
* (if it is available in a shaper). This is how chris4 works.
*
* @param shape One of the defined shape constants
* @return true if the specified word shaper uses
* known lower case words.
*/
private static boolean dontUseLC(int shape) {
return shape == WORDSHAPEDAN2 ||
shape == WORDSHAPEDAN2BIO ||
shape == WORDSHAPEJENNY1 ||
shape == WORDSHAPECHRIS2 ||
shape == WORDSHAPECHRIS3;
}
/**
* Specify the String and the int identifying which word shaper to
* use and this returns the result of using that wordshaper on the String.
*
* @param inStr String to calculate word shape of
* @param wordShaper Constant for which shaping formula to use
* @return The wordshape String
*/
public static String wordShape(String inStr, int wordShaper) {
return wordShape(inStr, wordShaper, null);
}
/**
* Specify the string and the int identifying which word shaper to
* use and this returns the result of using that wordshaper on the String.
*
* @param inStr String to calculate word shape of
* @param wordShaper Constant for which shaping formula to use
* @param knownLCWords A Collection of known lowercase words, which some shapers use
* to decide the class of capitalized words.
* Note: while this code works with any Collection, you should
* provide a Set for decent performance. If this parameter is
* null or empty, then this option is not used (capitalized words
* are treated the same, regardless of whether the lowercased
* version of the String has been seen).
* @return The wordshape String
*/
public static String wordShape(String inStr, int wordShaper, Collection knownLCWords) {
// this first bit is for backwards compatibility with how things were first
// implemented, where the word shaper name encodes whether to useLC.
// If the shaper is in the old compatibility list, then a specified
// list of knownLCwords is ignored
if (knownLCWords != null && dontUseLC(wordShaper)) {
knownLCWords = null;
}
switch (wordShaper) {
case NOWORDSHAPE:
return inStr;
case WORDSHAPEDAN1:
return wordShapeDan1(inStr);
case WORDSHAPECHRIS1:
return wordShapeChris1(inStr);
case WORDSHAPEDAN2:
return wordShapeDan2(inStr, knownLCWords);
case WORDSHAPEDAN2USELC:
return wordShapeDan2(inStr, knownLCWords);
case WORDSHAPEDAN2BIO:
return wordShapeDan2Bio(inStr, knownLCWords);
case WORDSHAPEDAN2BIOUSELC:
return wordShapeDan2Bio(inStr, knownLCWords);
case WORDSHAPEJENNY1:
return wordShapeJenny1(inStr, knownLCWords);
case WORDSHAPEJENNY1USELC:
return wordShapeJenny1(inStr, knownLCWords);
case WORDSHAPECHRIS2:
return wordShapeChris2(inStr, false, knownLCWords);
case WORDSHAPECHRIS2USELC:
return wordShapeChris2(inStr, false, knownLCWords);
case WORDSHAPECHRIS3:
return wordShapeChris2(inStr, true, knownLCWords);
case WORDSHAPECHRIS3USELC:
return wordShapeChris2(inStr, true, knownLCWords);
case WORDSHAPECHRIS4:
return wordShapeChris4(inStr, false, knownLCWords);
case WORDSHAPEDIGITS:
return wordShapeDigits(inStr);
case WORDSHAPECHINESE:
return wordShapeChinese(inStr);
case WORDSHAPECLUSTER1:
return wordShapeCluster1(inStr);
default:
throw new IllegalStateException("Bad WordShapeClassifier");
}
}
/**
* A fairly basic 5-way classifier, that notes digits, and upper
* and lower case, mixed, and non-alphanumeric.
*
* @param s String to find word shape of
* @return Its word shape: a 5 way classification
*/
private static String wordShapeDan1(String s) {
boolean digit = true;
boolean upper = true;
boolean lower = true;
boolean mixed = true;
for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i);
if (!Character.isDigit(c)) {
digit = false;
}
if (!Character.isLowerCase(c)) {
lower = false;
}
if (!Character.isUpperCase(c)) {
upper = false;
}
if ((i == 0 && !Character.isUpperCase(c)) || (i >= 1 && !Character.isLowerCase(c))) {
mixed = false;
}
}
if (digit) {
return "ALL-DIGITS";
}
if (upper) {
return "ALL-UPPER";
}
if (lower) {
return "ALL-LOWER";
}
if (mixed) {
return "MIXED-CASE";
}
return "OTHER";
}
/**
* A fine-grained word shape classifier, that equivalence classes
* lower and upper case and digits, and collapses sequences of the
* same type, but keeps all punctuation, etc.
* Note: We treat '_' as a lowercase letter, sort of like many
* programming languages. We do this because we use '_' joining of
* tokens in some applications like RTE.
*
* @param s The String whose shape is to be returned
* @param knownLCWords If this is non-null and non-empty, mark words whose
* lower case form is found in the
* Collection of known lower case words
* @return The word shape
*/
private static String wordShapeDan2(String s, Collection knownLCWords) {
StringBuilder sb = new StringBuilder("WT-");
char lastM = '~';
boolean nonLetters = false;
int len = s.length();
for (int i = 0; i < len; i++) {
char c = s.charAt(i);
char m = c;
if (Character.isDigit(c)) {
m = 'd';
} else if (Character.isLowerCase(c) || c == '_') {
m = 'x';
} else if (Character.isUpperCase(c)) {
m = 'X';
}
if (m != 'x' && m != 'X') {
nonLetters = true;
}
if (m != lastM) {
sb.append(m);
}
lastM = m;
}
if (len <= 3) {
sb.append(':').append(len);
}
if (knownLCWords != null) {
if (!nonLetters && knownLCWords.contains(s.toLowerCase())) {
sb.append('k');
}
}
// System.err.println("wordShapeDan2: " + s + " became " + sb);
return sb.toString();
}
private static String wordShapeJenny1(String s, Collection knownLCWords) {
StringBuilder sb = new StringBuilder("WT-");
char lastM = '~';
boolean nonLetters = false;
for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i);
char m = c;
if (Character.isDigit(c)) {
m = 'd';
} else if (Character.isLowerCase(c)) {
m = 'x';
} else if (Character.isUpperCase(c)) {
m = 'X';
}
for (String gr : greek) {
if (s.startsWith(gr, i)) {
m = 'g';
i = i + gr.length() - 1;
//System.out.println(s + " :: " + s.substring(i+1));
break;
}
}
if (m != 'x' && m != 'X') {
nonLetters = true;
}
if (m != lastM) {
sb.append(m);
}
lastM = m;
}
if (s.length() <= 3) {
sb.append(':').append(s.length());
}
if (knownLCWords != null) {
if ( ! nonLetters && knownLCWords.contains(s.toLowerCase())) {
sb.append('k');
}
}
//System.out.println(s+" became "+sb);
return sb.toString();
}
/** Note: the optimizations in wordShapeChris2 would break if BOUNDARY_SIZE
* was greater than the shortest greek word, so valid values are: 0, 1, 2, 3.
*/
private static final int BOUNDARY_SIZE = 2;
/**
* This one picks up on Dan2 ideas, but seeks to make less distinctions
* mid sequence by sorting for long words, but to maintain extra
* distinctions for short words. It exactly preserves the character shape
* of the first and last 2 (i.e., BOUNDARY_SIZE) characters and then
* will record shapes that occur between them (perhaps only if they are
* different)
*
* @param s The String to find the word shape of
* @param omitIfInBoundary If true, character classes present in the
* first or last two (i.e., BOUNDARY_SIZE) letters
* of the word are not also registered
* as classes that appear in the middle of the word.
* @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
* that are in this list when lowercased (representing
* that the word is "known" as a lowercase word).
* @return A word shape for the word.
*/
private static String wordShapeChris2(String s, boolean omitIfInBoundary, Collection knownLCWords) {
int len = s.length();
if (len <= BOUNDARY_SIZE * 2) {
return wordShapeChris2Short(s, len, knownLCWords);
} else {
return wordShapeChris2Long(s, omitIfInBoundary, len, knownLCWords);
}
}
// Do the simple case of words <= BOUNDARY_SIZE * 2 (i.e., 4) with only 1 object allocation!
private static String wordShapeChris2Short(String s, int len, Collection knownLCWords) {
int sbLen = (knownLCWords != null) ? len + 1: len; // markKnownLC makes String 1 longer
final StringBuilder sb = new StringBuilder(sbLen);
boolean nonLetters = false;
for (int i = 0; i < len; i++) {
char c = s.charAt(i);
char m = c;
if (Character.isDigit(c)) {
m = 'd';
} else if (Character.isLowerCase(c)) {
m = 'x';
} else if (Character.isUpperCase(c) || Character.isTitleCase(c)) {
m = 'X';
}
for (String gr : greek) {
if (s.startsWith(gr, i)) {
m = 'g';
//System.out.println(s + " :: " + s.substring(i+1));
i += gr.length() - 1;
// System.out.println("Position skips to " + i);
break;
}
}
if (m != 'x' && m != 'X') {
nonLetters = true;
}
sb.append(m);
}
if (knownLCWords != null) {
if ( ! nonLetters && knownLCWords.contains(s.toLowerCase())) {
sb.append('k');
}
}
// System.out.println(s + " became " + sb);
return sb.toString();
}
// introduce sizes and optional allocation to reduce memory churn demands;
// this class could blow a lot of memory if used in a tight loop,
// as the naive version allocates lots of kind of heavyweight objects
// endSB should be of length BOUNDARY_SIZE
// sb is maximally of size s.length() + 1, but is usually (much) shorter. The +1 might happen if markKnownLC is true and it applies
// boundSet is maximally of size BOUNDARY_SIZE * 2 (and is often smaller)
// seenSet is maximally of size s.length() - BOUNDARY_SIZE * 2, but might often be of size <= 4. But it has no initial size allocation
// But we want the initial size to be greater than BOUNDARY_SIZE * 2 * (4/3) since the default loadfactor is 3/4.
// That is, of size 6, which become 8, since HashMaps are powers of 2. Still, it's half the size
private static String wordShapeChris2Long(String s, boolean omitIfInBoundary, int len, Collection knownLCWords) {
final char[] beginChars = new char[BOUNDARY_SIZE];
final char[] endChars = new char[BOUNDARY_SIZE];
int beginUpto = 0;
int endUpto = 0;
final Set seenSet = new TreeSet<>(); // TreeSet guarantees stable ordering; has no size parameter
boolean nonLetters = false;
for (int i = 0; i < len; i++) {
int iIncr = 0;
char c = s.charAt(i);
char m = c;
if (Character.isDigit(c)) {
m = 'd';
} else if (Character.isLowerCase(c)) {
m = 'x';
} else if (Character.isUpperCase(c) || Character.isTitleCase(c)) {
m = 'X';
}
for (String gr : greek) {
if (s.startsWith(gr, i)) {
m = 'g';
//System.out.println(s + " :: " + s.substring(i+1));
iIncr = gr.length() - 1;
break;
}
}
if (m != 'x' && m != 'X') {
nonLetters = true;
}
if (i < BOUNDARY_SIZE) {
beginChars[beginUpto++] = m;
} else if (i < len - BOUNDARY_SIZE) {
seenSet.add(Character.valueOf(m));
} else {
endChars[endUpto++] = m;
}
i += iIncr;
// System.out.println("Position skips to " + i);
}
// Calculate size. This may be an upperbound, but is often correct
int sbSize = beginUpto + endUpto + seenSet.size();
if (knownLCWords != null) { sbSize++; }
final StringBuilder sb = new StringBuilder(sbSize);
// put in the beginning chars
sb.append(beginChars, 0, beginUpto);
// put in the stored ones sorted
if (omitIfInBoundary) {
for (Character chr : seenSet) {
char ch = chr.charValue();
boolean insert = true;
for (int i = 0; i < beginUpto; i++) {
if (beginChars[i] == ch) {
insert = false;
break;
}
}
for (int i = 0; i < endUpto; i++) {
if (endChars[i] == ch) {
insert = false;
break;
}
}
if (insert) {
sb.append(ch);
}
}
} else {
for (Character chr : seenSet) {
sb.append(chr.charValue());
}
}
// and add end ones
sb.append(endChars, 0, endUpto);
if (knownLCWords != null) {
if (!nonLetters && knownLCWords.contains(s.toLowerCase())) {
sb.append('k');
}
}
// System.out.println(s + " became " + sb);
return sb.toString();
}
private static char chris4equivalenceClass(final char c) {
int type = Character.getType(c);
if (Character.isDigit(c) || type == Character.LETTER_NUMBER
|| type == Character.OTHER_NUMBER
|| "一二三四五六七八九十零〇百千万亿兩○◯".indexOf(c) > 0) {
// include Chinese numbers that are just of unicode type OTHER_LETTER (and a couple of round symbols often used (by mistake?) for zeroes)
return 'd';
} else if (c == '第') {
return 'o'; // detect those Chinese ordinals!
} else if (c == '年' || c == '月' || c == '日') { // || c == '号') {
return 'D'; // Chinese date characters.
} else if (Character.isLowerCase(c)) {
return 'x';
} else if (Character.isUpperCase(c) || Character.isTitleCase(c)) {
return 'X';
} else if (Character.isWhitespace(c) || Character.isSpaceChar(c)) {
return 's';
} else if (type == Character.OTHER_LETTER) {
return 'c'; // Chinese characters, etc. without case
} else if (type == Character.CURRENCY_SYMBOL) {
return '$';
} else if (type == Character.MATH_SYMBOL) {
return '+';
} else if (type == Character.OTHER_SYMBOL || c == '|') {
return '|';
} else if (type == Character.START_PUNCTUATION) {
return '(';
} else if (type == Character.END_PUNCTUATION) {
return ')';
} else if (type == Character.INITIAL_QUOTE_PUNCTUATION) {
return '`';
} else if (type == Character.FINAL_QUOTE_PUNCTUATION || c == '\'') {
return '\'';
} else if (c == '%') {
return '%';
} else if (type == Character.OTHER_PUNCTUATION) {
return '.';
} else if (type == Character.CONNECTOR_PUNCTUATION) {
return '_';
} else if (type == Character.DASH_PUNCTUATION) {
return '-';
} else {
return 'q';
}
}
public static String wordShapeChris4(String s) {
return wordShapeChris4(s, false, null);
}
/**
* This one picks up on Dan2 ideas, but seeks to make less distinctions
* mid sequence by sorting for long words, but to maintain extra
* distinctions for short words, by always recording the class of the
* first and last two characters of the word.
* Compared to chris2 on which it is based,
* it uses more Unicode classes, and so collapses things like
* punctuation more, and might work better with real unicode.
*
* @param s The String to find the word shape of
* @param omitIfInBoundary If true, character classes present in the
* first or last two (i.e., BOUNDARY_SIZE) letters
* of the word are not also registered
* as classes that appear in the middle of the word.
* @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
* that are in this list when lowercased (representing
* that the word is "known" as a lowercase word).
* @return A word shape for the word.
*/
private static String wordShapeChris4(String s, boolean omitIfInBoundary, Collection knownLCWords) {
int len = s.length();
if (len <= BOUNDARY_SIZE * 2) {
return wordShapeChris4Short(s, len, knownLCWords);
} else {
return wordShapeChris4Long(s, omitIfInBoundary, len, knownLCWords);
}
}
// Do the simple case of words <= BOUNDARY_SIZE * 2 (i.e., 4) with only 1 object allocation!
private static String wordShapeChris4Short(String s, int len, Collection knownLCWords) {
int sbLen = (knownLCWords != null) ? len + 1: len; // markKnownLC makes String 1 longer
final StringBuilder sb = new StringBuilder(sbLen);
boolean nonLetters = false;
for (int i = 0; i < len; i++) {
char c = s.charAt(i);
char m = chris4equivalenceClass(c);
for (String gr : greek) {
if (s.startsWith(gr, i)) {
m = 'g';
//System.out.println(s + " :: " + s.substring(i+1));
i += gr.length() - 1;
// System.out.println("Position skips to " + i);
break;
}
}
if (m != 'x' && m != 'X') {
nonLetters = true;
}
sb.append(m);
}
if (knownLCWords != null) {
if ( ! nonLetters && knownLCWords.contains(s.toLowerCase())) {
sb.append('k');
}
}
// System.out.println(s + " became " + sb);
return sb.toString();
}
private static String wordShapeChris4Long(String s, boolean omitIfInBoundary, int len, Collection knownLCWords) {
StringBuilder sb = new StringBuilder(s.length() + 1);
StringBuilder endSB = new StringBuilder(BOUNDARY_SIZE);
Set boundSet = Generics.newHashSet(BOUNDARY_SIZE * 2);
Set seenSet = new TreeSet<>(); // TreeSet guarantees stable ordering
boolean nonLetters = false;
for (int i = 0; i < len; i++) {
char c = s.charAt(i);
char m = chris4equivalenceClass(c);
int iIncr = 0;
for (String gr : greek) {
if (s.startsWith(gr, i)) {
m = 'g';
iIncr = gr.length() - 1;
//System.out.println(s + " :: " + s.substring(i+1));
break;
}
}
if (m != 'x' && m != 'X') {
nonLetters = true;
}
if (i < BOUNDARY_SIZE) {
sb.append(m);
boundSet.add(Character.valueOf(m));
} else if (i < len - BOUNDARY_SIZE) {
seenSet.add(Character.valueOf(m));
} else {
boundSet.add(Character.valueOf(m));
endSB.append(m);
}
// System.out.println("Position " + i + " --> " + m);
i += iIncr;
}
// put in the stored ones sorted and add end ones
for (Character chr : seenSet) {
if (!omitIfInBoundary || !boundSet.contains(chr)) {
char ch = chr.charValue();
sb.append(ch);
}
}
sb.append(endSB);
if (knownLCWords != null) {
if (!nonLetters && knownLCWords.contains(s.toLowerCase())) {
sb.append('k');
}
}
// System.out.println(s + " became " + sb);
return sb.toString();
}
/**
* Returns a fine-grained word shape classifier, that equivalence classes
* lower and upper case and digits, and collapses sequences of the
* same type, but keeps all punctuation. This adds an extra recognizer
* for a greek letter embedded in the String, which is useful for bio.
*/
private static String wordShapeDan2Bio(String s, Collection knownLCWords) {
if (containsGreekLetter(s)) {
return wordShapeDan2(s, knownLCWords) + "-GREEK";
} else {
return wordShapeDan2(s, knownLCWords);
}
}
/** List of greek letters for bio. We omit eta, mu, nu, xi, phi, chi, psi.
* Maybe should omit rho too, but it is used in bio "Rho kinase inhibitor".
*/
private static final String[] greek = {"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "theta", "iota", "kappa", "lambda", "omicron", "rho", "sigma", "tau", "upsilon", "omega"};
private static final Pattern biogreek = Pattern.compile("alpha|beta|gamma|delta|epsilon|zeta|theta|iota|kappa|lambda|omicron|rho|sigma|tau|upsilon|omega", Pattern.CASE_INSENSITIVE);
/**
* Somewhat ad-hoc list of only greek letters that bio people use, partly
* to avoid false positives on short ones.
* @param s String to check for Greek
* @return true iff there is a greek lette embedded somewhere in the String
*/
private static boolean containsGreekLetter(String s) {
Matcher m = biogreek.matcher(s);
return m.find();
}
/** This one equivalence classes all strings into one of 24 semantically
* informed classes, somewhat similarly to the function specified in the
* BBN Nymble NER paper (Bikel et al. 1997).
*
* Note that it regards caseless non-Latin letters as lowercase.
*
* @param s String to word class
* @return The string's class
*/
private static String wordShapeChris1(String s) {
int length = s.length();
if (length == 0) {
return "SYMBOL"; // unclear if this is sensible, but it's what a length 0 String becomes....
}
boolean cardinal = false;
boolean number = true;
boolean seenDigit = false;
boolean seenNonDigit = false;
for (int i = 0; i < length; i++) {
char ch = s.charAt(i);
boolean digit = Character.isDigit(ch);
if (digit) {
seenDigit = true;
} else {
seenNonDigit = true;
}
// allow commas, decimals, and negative numbers
digit = digit || ch == '.' || ch == ',' || (i == 0 && (ch == '-' || ch == '+'));
if (!digit) {
number = false;
}
}
if ( ! seenDigit) {
number = false;
} else if ( ! seenNonDigit) {
cardinal = true;
}
if (cardinal) {
if (length < 4) {
return "CARDINAL13";
} else if (length == 4) {
return "CARDINAL4";
} else {
return "CARDINAL5PLUS";
}
} else if (number) {
return "NUMBER";
}
boolean seenLower = false;
boolean seenUpper = false;
boolean allCaps = true;
boolean allLower = true;
boolean initCap = false;
boolean dash = false;
boolean period = false;
for (int i = 0; i < length; i++) {
char ch = s.charAt(i);
boolean up = Character.isUpperCase(ch);
boolean let = Character.isLetter(ch);
boolean tit = Character.isTitleCase(ch);
if (ch == '-') {
dash = true;
} else if (ch == '.') {
period = true;
}
if (tit) {
seenUpper = true;
allLower = false;
seenLower = true;
allCaps = false;
} else if (up) {
seenUpper = true;
allLower = false;
} else if (let) {
seenLower = true;
allCaps = false;
}
if (i == 0 && (up || tit)) {
initCap = true;
}
}
if (length == 2 && initCap && period) {
return "ACRONYM1";
} else if (seenUpper && allCaps && !seenDigit && period) {
return "ACRONYM";
} else if (seenDigit && dash && !seenUpper && !seenLower) {
return "DIGIT-DASH";
} else if (initCap && seenLower && seenDigit && dash) {
return "CAPITALIZED-DIGIT-DASH";
} else if (initCap && seenLower && seenDigit) {
return "CAPITALIZED-DIGIT";
} else if (initCap && seenLower && dash) {
return "CAPITALIZED-DASH";
} else if (initCap && seenLower) {
return "CAPITALIZED";
} else if (seenUpper && allCaps && seenDigit && dash) {
return "ALLCAPS-DIGIT-DASH";
} else if (seenUpper && allCaps && seenDigit) {
return "ALLCAPS-DIGIT";
} else if (seenUpper && allCaps && dash) {
return "ALLCAPS";
} else if (seenUpper && allCaps) {
return "ALLCAPS";
} else if (seenLower && allLower && seenDigit && dash) {
return "LOWERCASE-DIGIT-DASH";
} else if (seenLower && allLower && seenDigit) {
return "LOWERCASE-DIGIT";
} else if (seenLower && allLower && dash) {
return "LOWERCASE-DASH";
} else if (seenLower && allLower) {
return "LOWERCASE";
} else if (seenLower && seenDigit) {
return "MIXEDCASE-DIGIT";
} else if (seenLower) {
return "MIXEDCASE";
} else if (seenDigit) {
return "SYMBOL-DIGIT";
} else {
return "SYMBOL";
}
}
/**
* Just collapses digits to 9 characters.
* Does lazy copying of String.
*
* @param s String to find word shape of
* @return The same string except digits are equivalence classed to 9.
*/
private static String wordShapeDigits(final String s) {
char[] outChars = null;
for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i);
if (Character.isDigit(c)) {
if (outChars == null) {
outChars = s.toCharArray();
}
outChars[i] = '9';
}
}
if (outChars == null) {
// no digit found
return s;
} else {
return new String(outChars);
}
}
/**
* Uses distributional similarity clusters for unknown words. Except that
* numbers are just turned into NUMBER.
* This one uses ones from a fixed file that we've used for NER.
*
* @param s String to find word shape of
* @return Its word shape
*/
private static String wordShapeCluster1(String s) {
boolean digit = true;
for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i);
if ( ! (Character.isDigit(c) || c == '.' || c == ',' || (i == 0 && (c == '-' || c == '+')))) {
digit = false;
}
}
if (digit) {
return "NUMBER";
} else {
String cluster = DistributionalClusters.cluster1.get(s);
if (cluster == null) {
cluster = "NULL";
}
return cluster;
}
}
private static String wordShapeChinese(final String s) {
return ChineseUtils.shapeOf(s, true, true);
}
private static class DistributionalClusters {
private DistributionalClusters() {}
public static Map cluster1 = loadWordClusters("/u/nlp/data/pos_tags_are_useless/egw.bnc.200",
"alexClark");
private static class LcMap extends HashMap {
private static final long serialVersionUID = -457913281600751901L;
@Override
public V get(Object key) {
return super.get(key.toString().toLowerCase());
}
}
public static Map loadWordClusters(String file, String format) {
Timing.startDoing("Loading distsim lexicon from " + file);
Map lexicon = new LcMap<>();
if ("terryKoo".equals(format)) {
for (String line : ObjectBank.getLineIterator(file)) {
String[] bits = line.split("\\t");
String word = bits[1];
// for now, always lowercase, but should revisit this
word = word.toLowerCase();
String wordClass = bits[0];
lexicon.put(word, wordClass);
}
} else {
// "alexClark"
for (String line : ObjectBank.getLineIterator(file)) {
String[] bits = line.split("\\s+");
String word = bits[0];
// for now, always lowercase, but should revisit this
word = word.toLowerCase();
lexicon.put(word, bits[1]);
}
}
Timing.endDoing();
return lexicon;
}
}
/**
* Usage: java edu.stanford.nlp.process.WordShapeClassifier
* [-wordShape name] string+
* where name
is an argument to lookupShaper
.
* Known names have patterns along the lines of: dan[12](bio)?(UseLC)?,
* jenny1(useLC)?, chris[1234](useLC)?, cluster1.
* If you don't specify a word shape function, you get chris1.
*
* @param args Command-line arguments, as above.
*/
public static void main(String[] args) {
int i = 0;
int classifierToUse = WORDSHAPECHRIS1;
if (args.length == 0) {
System.out.println("edu.stanford.nlp.process.WordShapeClassifier [-wordShape name] string+");
} else if (args[0].charAt(0) == '-') {
if (args[0].equals("-wordShape") && args.length >= 2) {
classifierToUse = lookupShaper(args[1]);
i += 2;
} else {
System.err.println("Unknown flag: " + args[0]);
i++;
}
}
for (; i < args.length; i++) {
System.out.print(args[i] + ": ");
System.out.println(wordShape(args[i], classifierToUse));
}
}
}