All downloads are free. The search and download functionality uses the official Maven repository.

edu.stanford.nlp.process.WordShapeClassifier Maven / Gradle / Ivy

package edu.stanford.nlp.process;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils;

import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Timing;

// TODO: put in a regexp for ordinals, fraction num/num and perhaps even 30-5/8


/**
 * Provides static methods which
 * map any String to another String indicative of its "word shape" -- e.g.,
 * whether capitalized, numeric, etc.  Different implementations may
 * implement quite different, normally language specific ideas of what
 * word shapes are useful.
 *
 * @author Christopher Manning
 * @author Dan Klein
 */
public class WordShapeClassifier {

  public static final int NOWORDSHAPE = -1;
  public static final int WORDSHAPEDAN1 = 0;
  public static final int WORDSHAPECHRIS1 = 1;
  public static final int WORDSHAPEDAN2 = 2;
  public static final int WORDSHAPEDAN2USELC = 3;
  public static final int WORDSHAPEDAN2BIO = 4;
  public static final int WORDSHAPEDAN2BIOUSELC = 5;
  public static final int WORDSHAPEJENNY1 = 6;
  public static final int WORDSHAPEJENNY1USELC = 7;
  public static final int WORDSHAPECHRIS2 = 8;
  public static final int WORDSHAPECHRIS2USELC = 9;
  public static final int WORDSHAPECHRIS3 = 10;
  public static final int WORDSHAPECHRIS3USELC = 11;
  public static final int WORDSHAPECHRIS4 = 12;
  public static final int WORDSHAPEDIGITS = 13;
  public static final int WORDSHAPECHINESE = 14;
  public static final int WORDSHAPECLUSTER1 = 15;


  // This class cannot be instantiated
  private WordShapeClassifier() {
  }


  /** Look up a shaper by a short String name.
   *
   * @param name Shaper name.  Known names have patterns along the lines of:
   *             dan[12](bio)?(UseLC)?, jenny1(useLC)?, chris[1234](useLC)?, cluster1.
   * @return An integer constant for the shaper
   */
  public static int lookupShaper(String name) {
    if (name == null) {
      return NOWORDSHAPE;
    } else if (name.equalsIgnoreCase("dan1")) {
      return WORDSHAPEDAN1;
    } else if (name.equalsIgnoreCase("chris1")) {
      return WORDSHAPECHRIS1;
    } else if (name.equalsIgnoreCase("dan2")) {
      return WORDSHAPEDAN2;
    } else if (name.equalsIgnoreCase("dan2useLC")) {
      return WORDSHAPEDAN2USELC;
    } else if (name.equalsIgnoreCase("dan2bio")) {
      return WORDSHAPEDAN2BIO;
    } else if (name.equalsIgnoreCase("dan2bioUseLC")) {
      return WORDSHAPEDAN2BIOUSELC;
    } else if (name.equalsIgnoreCase("jenny1")) {
      return WORDSHAPEJENNY1;
    } else if (name.equalsIgnoreCase("jenny1useLC")) {
      return WORDSHAPEJENNY1USELC;
    } else if (name.equalsIgnoreCase("chris2")) {
      return WORDSHAPECHRIS2;
    } else if (name.equalsIgnoreCase("chris2useLC")) {
      return WORDSHAPECHRIS2USELC;
    } else if (name.equalsIgnoreCase("chris3")) {
      return WORDSHAPECHRIS3;
    } else if (name.equalsIgnoreCase("chris3useLC")) {
      return WORDSHAPECHRIS3USELC;
    } else if (name.equalsIgnoreCase("chris4")) {
      return WORDSHAPECHRIS4;
    } else if (name.equalsIgnoreCase("digits")) {
      return WORDSHAPEDIGITS;
    } else if (name.equalsIgnoreCase("chinese")) {
      return WORDSHAPECHINESE;
    } else if (name.equalsIgnoreCase("cluster1")) {
      return WORDSHAPECLUSTER1;
    } else {
      return NOWORDSHAPE;
    }
  }

  /**
   * Returns true if the specified word shaper doesn't use
   * known lower case words, even if a list of them is present.
   * This is used for backwards compatibility. It is suggested that
   * new word shape functions are either passed a non-null list of
   * lowercase words or not, depending on whether you want knownLC marking
   * (if it is available in a shaper).  This is how chris4 works.
   *
   * @param shape One of the defined shape constants
   * @return true if the specified word shaper uses
   *     known lower case words.
   */
  private static boolean dontUseLC(int shape) {
    return shape == WORDSHAPEDAN2 ||
            shape == WORDSHAPEDAN2BIO ||
            shape == WORDSHAPEJENNY1 ||
            shape == WORDSHAPECHRIS2 ||
            shape == WORDSHAPECHRIS3;
  }


  /**
   * Specify the String and the int identifying which word shaper to
   * use and this returns the result of using that wordshaper on the String.
   *
   * @param inStr String to calculate word shape of
   * @param wordShaper Constant for which shaping formula to use
   * @return The wordshape String
   */
  public static String wordShape(String inStr, int wordShaper) {
    return wordShape(inStr, wordShaper, null);
  }


  /**
   * Specify the string and the int identifying which word shaper to
   * use and this returns the result of using that wordshaper on the String.
   *
   * @param inStr String to calculate word shape of
   * @param wordShaper Constant for which shaping formula to use
   * @param knownLCWords A Collection of known lowercase words, which some shapers use
   *           to decide the class of capitalized words.
   *           Note: while this code works with any Collection, you should
   *           provide a Set for decent performance.  If this parameter is
   *           null or empty, then this option is not used (capitalized words
   *           are treated the same, regardless of whether the lowercased
   *           version of the String has been seen).
   * @return The wordshape String
   */
  public static String wordShape(String inStr, int wordShaper, Collection knownLCWords) {
    // this first bit is for backwards compatibility with how things were first
    // implemented, where the word shaper name encodes whether to useLC.
    // If the shaper is in the old compatibility list, then a specified
    // list of knownLCwords is ignored
    if (knownLCWords != null && dontUseLC(wordShaper)) {
      knownLCWords = null;
    }
    switch (wordShaper) {
      case NOWORDSHAPE:
        return inStr;
      case WORDSHAPEDAN1:
        return wordShapeDan1(inStr);
      case WORDSHAPECHRIS1:
        return wordShapeChris1(inStr);
      case WORDSHAPEDAN2:
        return wordShapeDan2(inStr, knownLCWords);
      case WORDSHAPEDAN2USELC:
        return wordShapeDan2(inStr, knownLCWords);
      case WORDSHAPEDAN2BIO:
        return wordShapeDan2Bio(inStr, knownLCWords);
      case WORDSHAPEDAN2BIOUSELC:
        return wordShapeDan2Bio(inStr, knownLCWords);
      case WORDSHAPEJENNY1:
        return wordShapeJenny1(inStr, knownLCWords);
      case WORDSHAPEJENNY1USELC:
        return wordShapeJenny1(inStr, knownLCWords);
      case WORDSHAPECHRIS2:
        return wordShapeChris2(inStr, false, knownLCWords);
      case WORDSHAPECHRIS2USELC:
        return wordShapeChris2(inStr, false, knownLCWords);
      case WORDSHAPECHRIS3:
        return wordShapeChris2(inStr, true, knownLCWords);
      case WORDSHAPECHRIS3USELC:
        return wordShapeChris2(inStr, true, knownLCWords);
      case WORDSHAPECHRIS4:
        return wordShapeChris4(inStr, false, knownLCWords);
      case WORDSHAPEDIGITS:
        return wordShapeDigits(inStr);
      case WORDSHAPECHINESE:
        return wordShapeChinese(inStr);
      case WORDSHAPECLUSTER1:
        return wordShapeCluster1(inStr);
      default:
        throw new IllegalStateException("Bad WordShapeClassifier");
    }
  }

  /**
   * A fairly basic 5-way classifier, that notes digits, and upper
   * and lower case, mixed, and non-alphanumeric.
   *
   * @param s String to find word shape of
   * @return Its word shape: a 5 way classification
   */
  private static String wordShapeDan1(String s) {
    boolean digit = true;
    boolean upper = true;
    boolean lower = true;
    boolean mixed = true;
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      if (!Character.isDigit(c)) {
        digit = false;
      }
      if (!Character.isLowerCase(c)) {
        lower = false;
      }
      if (!Character.isUpperCase(c)) {
        upper = false;
      }
      if ((i == 0 && !Character.isUpperCase(c)) || (i >= 1 && !Character.isLowerCase(c))) {
        mixed = false;
      }
    }
    if (digit) {
      return "ALL-DIGITS";
    }
    if (upper) {
      return "ALL-UPPER";
    }
    if (lower) {
      return "ALL-LOWER";
    }
    if (mixed) {
      return "MIXED-CASE";
    }
    return "OTHER";
  }


  /**
   * A fine-grained word shape classifier, that equivalence classes
   * lower and upper case and digits, and collapses sequences of the
   * same type, but keeps all punctuation, etc. 

* Note: We treat '_' as a lowercase letter, sort of like many * programming languages. We do this because we use '_' joining of * tokens in some applications like RTE. * * @param s The String whose shape is to be returned * @param knownLCWords If this is non-null and non-empty, mark words whose * lower case form is found in the * Collection of known lower case words * @return The word shape */ private static String wordShapeDan2(String s, Collection knownLCWords) { StringBuilder sb = new StringBuilder("WT-"); char lastM = '~'; boolean nonLetters = false; int len = s.length(); for (int i = 0; i < len; i++) { char c = s.charAt(i); char m = c; if (Character.isDigit(c)) { m = 'd'; } else if (Character.isLowerCase(c) || c == '_') { m = 'x'; } else if (Character.isUpperCase(c)) { m = 'X'; } if (m != 'x' && m != 'X') { nonLetters = true; } if (m != lastM) { sb.append(m); } lastM = m; } if (len <= 3) { sb.append(':').append(len); } if (knownLCWords != null) { if (!nonLetters && knownLCWords.contains(s.toLowerCase())) { sb.append('k'); } } // System.err.println("wordShapeDan2: " + s + " became " + sb); return sb.toString(); } private static String wordShapeJenny1(String s, Collection knownLCWords) { StringBuilder sb = new StringBuilder("WT-"); char lastM = '~'; boolean nonLetters = false; for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); char m = c; if (Character.isDigit(c)) { m = 'd'; } else if (Character.isLowerCase(c)) { m = 'x'; } else if (Character.isUpperCase(c)) { m = 'X'; } for (String gr : greek) { if (s.startsWith(gr, i)) { m = 'g'; i = i + gr.length() - 1; //System.out.println(s + " :: " + s.substring(i+1)); break; } } if (m != 'x' && m != 'X') { nonLetters = true; } if (m != lastM) { sb.append(m); } lastM = m; } if (s.length() <= 3) { sb.append(':').append(s.length()); } if (knownLCWords != null) { if ( ! 
nonLetters && knownLCWords.contains(s.toLowerCase())) { sb.append('k'); } } //System.out.println(s+" became "+sb); return sb.toString(); } /** Note: the optimizations in wordShapeChris2 would break if BOUNDARY_SIZE * was greater than the shortest greek word, so valid values are: 0, 1, 2, 3. */ private static final int BOUNDARY_SIZE = 2; /** * This one picks up on Dan2 ideas, but seeks to make less distinctions * mid sequence by sorting for long words, but to maintain extra * distinctions for short words. It exactly preserves the character shape * of the first and last 2 (i.e., BOUNDARY_SIZE) characters and then * will record shapes that occur between them (perhaps only if they are * different) * * @param s The String to find the word shape of * @param omitIfInBoundary If true, character classes present in the * first or last two (i.e., BOUNDARY_SIZE) letters * of the word are not also registered * as classes that appear in the middle of the word. * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words * that are in this list when lowercased (representing * that the word is "known" as a lowercase word). * @return A word shape for the word. */ private static String wordShapeChris2(String s, boolean omitIfInBoundary, Collection knownLCWords) { int len = s.length(); if (len <= BOUNDARY_SIZE * 2) { return wordShapeChris2Short(s, len, knownLCWords); } else { return wordShapeChris2Long(s, omitIfInBoundary, len, knownLCWords); } } // Do the simple case of words <= BOUNDARY_SIZE * 2 (i.e., 4) with only 1 object allocation! private static String wordShapeChris2Short(String s, int len, Collection knownLCWords) { int sbLen = (knownLCWords != null) ? 
len + 1: len; // markKnownLC makes String 1 longer final StringBuilder sb = new StringBuilder(sbLen); boolean nonLetters = false; for (int i = 0; i < len; i++) { char c = s.charAt(i); char m = c; if (Character.isDigit(c)) { m = 'd'; } else if (Character.isLowerCase(c)) { m = 'x'; } else if (Character.isUpperCase(c) || Character.isTitleCase(c)) { m = 'X'; } for (String gr : greek) { if (s.startsWith(gr, i)) { m = 'g'; //System.out.println(s + " :: " + s.substring(i+1)); i += gr.length() - 1; // System.out.println("Position skips to " + i); break; } } if (m != 'x' && m != 'X') { nonLetters = true; } sb.append(m); } if (knownLCWords != null) { if ( ! nonLetters && knownLCWords.contains(s.toLowerCase())) { sb.append('k'); } } // System.out.println(s + " became " + sb); return sb.toString(); } // introduce sizes and optional allocation to reduce memory churn demands; // this class could blow a lot of memory if used in a tight loop, // as the naive version allocates lots of kind of heavyweight objects // endSB should be of length BOUNDARY_SIZE // sb is maximally of size s.length() + 1, but is usually (much) shorter. The +1 might happen if markKnownLC is true and it applies // boundSet is maximally of size BOUNDARY_SIZE * 2 (and is often smaller) // seenSet is maximally of size s.length() - BOUNDARY_SIZE * 2, but might often be of size <= 4. But it has no initial size allocation // But we want the initial size to be greater than BOUNDARY_SIZE * 2 * (4/3) since the default loadfactor is 3/4. // That is, of size 6, which become 8, since HashMaps are powers of 2. 
Still, it's half the size private static String wordShapeChris2Long(String s, boolean omitIfInBoundary, int len, Collection knownLCWords) { final char[] beginChars = new char[BOUNDARY_SIZE]; final char[] endChars = new char[BOUNDARY_SIZE]; int beginUpto = 0; int endUpto = 0; final Set seenSet = new TreeSet<>(); // TreeSet guarantees stable ordering; has no size parameter boolean nonLetters = false; for (int i = 0; i < len; i++) { int iIncr = 0; char c = s.charAt(i); char m = c; if (Character.isDigit(c)) { m = 'd'; } else if (Character.isLowerCase(c)) { m = 'x'; } else if (Character.isUpperCase(c) || Character.isTitleCase(c)) { m = 'X'; } for (String gr : greek) { if (s.startsWith(gr, i)) { m = 'g'; //System.out.println(s + " :: " + s.substring(i+1)); iIncr = gr.length() - 1; break; } } if (m != 'x' && m != 'X') { nonLetters = true; } if (i < BOUNDARY_SIZE) { beginChars[beginUpto++] = m; } else if (i < len - BOUNDARY_SIZE) { seenSet.add(Character.valueOf(m)); } else { endChars[endUpto++] = m; } i += iIncr; // System.out.println("Position skips to " + i); } // Calculate size. 
This may be an upperbound, but is often correct int sbSize = beginUpto + endUpto + seenSet.size(); if (knownLCWords != null) { sbSize++; } final StringBuilder sb = new StringBuilder(sbSize); // put in the beginning chars sb.append(beginChars, 0, beginUpto); // put in the stored ones sorted if (omitIfInBoundary) { for (Character chr : seenSet) { char ch = chr.charValue(); boolean insert = true; for (int i = 0; i < beginUpto; i++) { if (beginChars[i] == ch) { insert = false; break; } } for (int i = 0; i < endUpto; i++) { if (endChars[i] == ch) { insert = false; break; } } if (insert) { sb.append(ch); } } } else { for (Character chr : seenSet) { sb.append(chr.charValue()); } } // and add end ones sb.append(endChars, 0, endUpto); if (knownLCWords != null) { if (!nonLetters && knownLCWords.contains(s.toLowerCase())) { sb.append('k'); } } // System.out.println(s + " became " + sb); return sb.toString(); } private static char chris4equivalenceClass(final char c) { int type = Character.getType(c); if (Character.isDigit(c) || type == Character.LETTER_NUMBER || type == Character.OTHER_NUMBER || "一二三四五六七八九十零〇百千万亿兩○◯".indexOf(c) > 0) { // include Chinese numbers that are just of unicode type OTHER_LETTER (and a couple of round symbols often used (by mistake?) for zeroes) return 'd'; } else if (c == '第') { return 'o'; // detect those Chinese ordinals! } else if (c == '年' || c == '月' || c == '日') { // || c == '号') { return 'D'; // Chinese date characters. } else if (Character.isLowerCase(c)) { return 'x'; } else if (Character.isUpperCase(c) || Character.isTitleCase(c)) { return 'X'; } else if (Character.isWhitespace(c) || Character.isSpaceChar(c)) { return 's'; } else if (type == Character.OTHER_LETTER) { return 'c'; // Chinese characters, etc. 
without case } else if (type == Character.CURRENCY_SYMBOL) { return '$'; } else if (type == Character.MATH_SYMBOL) { return '+'; } else if (type == Character.OTHER_SYMBOL || c == '|') { return '|'; } else if (type == Character.START_PUNCTUATION) { return '('; } else if (type == Character.END_PUNCTUATION) { return ')'; } else if (type == Character.INITIAL_QUOTE_PUNCTUATION) { return '`'; } else if (type == Character.FINAL_QUOTE_PUNCTUATION || c == '\'') { return '\''; } else if (c == '%') { return '%'; } else if (type == Character.OTHER_PUNCTUATION) { return '.'; } else if (type == Character.CONNECTOR_PUNCTUATION) { return '_'; } else if (type == Character.DASH_PUNCTUATION) { return '-'; } else { return 'q'; } } public static String wordShapeChris4(String s) { return wordShapeChris4(s, false, null); } /** * This one picks up on Dan2 ideas, but seeks to make less distinctions * mid sequence by sorting for long words, but to maintain extra * distinctions for short words, by always recording the class of the * first and last two characters of the word. * Compared to chris2 on which it is based, * it uses more Unicode classes, and so collapses things like * punctuation more, and might work better with real unicode. * * @param s The String to find the word shape of * @param omitIfInBoundary If true, character classes present in the * first or last two (i.e., BOUNDARY_SIZE) letters * of the word are not also registered * as classes that appear in the middle of the word. * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words * that are in this list when lowercased (representing * that the word is "known" as a lowercase word). * @return A word shape for the word. 
*/ private static String wordShapeChris4(String s, boolean omitIfInBoundary, Collection knownLCWords) { int len = s.length(); if (len <= BOUNDARY_SIZE * 2) { return wordShapeChris4Short(s, len, knownLCWords); } else { return wordShapeChris4Long(s, omitIfInBoundary, len, knownLCWords); } } // Do the simple case of words <= BOUNDARY_SIZE * 2 (i.e., 4) with only 1 object allocation! private static String wordShapeChris4Short(String s, int len, Collection knownLCWords) { int sbLen = (knownLCWords != null) ? len + 1: len; // markKnownLC makes String 1 longer final StringBuilder sb = new StringBuilder(sbLen); boolean nonLetters = false; for (int i = 0; i < len; i++) { char c = s.charAt(i); char m = chris4equivalenceClass(c); for (String gr : greek) { if (s.startsWith(gr, i)) { m = 'g'; //System.out.println(s + " :: " + s.substring(i+1)); i += gr.length() - 1; // System.out.println("Position skips to " + i); break; } } if (m != 'x' && m != 'X') { nonLetters = true; } sb.append(m); } if (knownLCWords != null) { if ( ! 
nonLetters && knownLCWords.contains(s.toLowerCase())) { sb.append('k'); } } // System.out.println(s + " became " + sb); return sb.toString(); } private static String wordShapeChris4Long(String s, boolean omitIfInBoundary, int len, Collection knownLCWords) { StringBuilder sb = new StringBuilder(s.length() + 1); StringBuilder endSB = new StringBuilder(BOUNDARY_SIZE); Set boundSet = Generics.newHashSet(BOUNDARY_SIZE * 2); Set seenSet = new TreeSet<>(); // TreeSet guarantees stable ordering boolean nonLetters = false; for (int i = 0; i < len; i++) { char c = s.charAt(i); char m = chris4equivalenceClass(c); int iIncr = 0; for (String gr : greek) { if (s.startsWith(gr, i)) { m = 'g'; iIncr = gr.length() - 1; //System.out.println(s + " :: " + s.substring(i+1)); break; } } if (m != 'x' && m != 'X') { nonLetters = true; } if (i < BOUNDARY_SIZE) { sb.append(m); boundSet.add(Character.valueOf(m)); } else if (i < len - BOUNDARY_SIZE) { seenSet.add(Character.valueOf(m)); } else { boundSet.add(Character.valueOf(m)); endSB.append(m); } // System.out.println("Position " + i + " --> " + m); i += iIncr; } // put in the stored ones sorted and add end ones for (Character chr : seenSet) { if (!omitIfInBoundary || !boundSet.contains(chr)) { char ch = chr.charValue(); sb.append(ch); } } sb.append(endSB); if (knownLCWords != null) { if (!nonLetters && knownLCWords.contains(s.toLowerCase())) { sb.append('k'); } } // System.out.println(s + " became " + sb); return sb.toString(); } /** * Returns a fine-grained word shape classifier, that equivalence classes * lower and upper case and digits, and collapses sequences of the * same type, but keeps all punctuation. This adds an extra recognizer * for a greek letter embedded in the String, which is useful for bio. 
*/ private static String wordShapeDan2Bio(String s, Collection knownLCWords) { if (containsGreekLetter(s)) { return wordShapeDan2(s, knownLCWords) + "-GREEK"; } else { return wordShapeDan2(s, knownLCWords); } } /** List of greek letters for bio. We omit eta, mu, nu, xi, phi, chi, psi. * Maybe should omit rho too, but it is used in bio "Rho kinase inhibitor". */ private static final String[] greek = {"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "theta", "iota", "kappa", "lambda", "omicron", "rho", "sigma", "tau", "upsilon", "omega"}; private static final Pattern biogreek = Pattern.compile("alpha|beta|gamma|delta|epsilon|zeta|theta|iota|kappa|lambda|omicron|rho|sigma|tau|upsilon|omega", Pattern.CASE_INSENSITIVE); /** * Somewhat ad-hoc list of only greek letters that bio people use, partly * to avoid false positives on short ones. * @param s String to check for Greek * @return true iff there is a greek lette embedded somewhere in the String */ private static boolean containsGreekLetter(String s) { Matcher m = biogreek.matcher(s); return m.find(); } /** This one equivalence classes all strings into one of 24 semantically * informed classes, somewhat similarly to the function specified in the * BBN Nymble NER paper (Bikel et al. 1997). *

* Note that it regards caseless non-Latin letters as lowercase. * * @param s String to word class * @return The string's class */ private static String wordShapeChris1(String s) { int length = s.length(); if (length == 0) { return "SYMBOL"; // unclear if this is sensible, but it's what a length 0 String becomes.... } boolean cardinal = false; boolean number = true; boolean seenDigit = false; boolean seenNonDigit = false; for (int i = 0; i < length; i++) { char ch = s.charAt(i); boolean digit = Character.isDigit(ch); if (digit) { seenDigit = true; } else { seenNonDigit = true; } // allow commas, decimals, and negative numbers digit = digit || ch == '.' || ch == ',' || (i == 0 && (ch == '-' || ch == '+')); if (!digit) { number = false; } } if ( ! seenDigit) { number = false; } else if ( ! seenNonDigit) { cardinal = true; } if (cardinal) { if (length < 4) { return "CARDINAL13"; } else if (length == 4) { return "CARDINAL4"; } else { return "CARDINAL5PLUS"; } } else if (number) { return "NUMBER"; } boolean seenLower = false; boolean seenUpper = false; boolean allCaps = true; boolean allLower = true; boolean initCap = false; boolean dash = false; boolean period = false; for (int i = 0; i < length; i++) { char ch = s.charAt(i); boolean up = Character.isUpperCase(ch); boolean let = Character.isLetter(ch); boolean tit = Character.isTitleCase(ch); if (ch == '-') { dash = true; } else if (ch == '.') { period = true; } if (tit) { seenUpper = true; allLower = false; seenLower = true; allCaps = false; } else if (up) { seenUpper = true; allLower = false; } else if (let) { seenLower = true; allCaps = false; } if (i == 0 && (up || tit)) { initCap = true; } } if (length == 2 && initCap && period) { return "ACRONYM1"; } else if (seenUpper && allCaps && !seenDigit && period) { return "ACRONYM"; } else if (seenDigit && dash && !seenUpper && !seenLower) { return "DIGIT-DASH"; } else if (initCap && seenLower && seenDigit && dash) { return "CAPITALIZED-DIGIT-DASH"; } else if (initCap && 
seenLower && seenDigit) { return "CAPITALIZED-DIGIT"; } else if (initCap && seenLower && dash) { return "CAPITALIZED-DASH"; } else if (initCap && seenLower) { return "CAPITALIZED"; } else if (seenUpper && allCaps && seenDigit && dash) { return "ALLCAPS-DIGIT-DASH"; } else if (seenUpper && allCaps && seenDigit) { return "ALLCAPS-DIGIT"; } else if (seenUpper && allCaps && dash) { return "ALLCAPS"; } else if (seenUpper && allCaps) { return "ALLCAPS"; } else if (seenLower && allLower && seenDigit && dash) { return "LOWERCASE-DIGIT-DASH"; } else if (seenLower && allLower && seenDigit) { return "LOWERCASE-DIGIT"; } else if (seenLower && allLower && dash) { return "LOWERCASE-DASH"; } else if (seenLower && allLower) { return "LOWERCASE"; } else if (seenLower && seenDigit) { return "MIXEDCASE-DIGIT"; } else if (seenLower) { return "MIXEDCASE"; } else if (seenDigit) { return "SYMBOL-DIGIT"; } else { return "SYMBOL"; } } /** * Just collapses digits to 9 characters. * Does lazy copying of String. * * @param s String to find word shape of * @return The same string except digits are equivalence classed to 9. */ private static String wordShapeDigits(final String s) { char[] outChars = null; for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); if (Character.isDigit(c)) { if (outChars == null) { outChars = s.toCharArray(); } outChars[i] = '9'; } } if (outChars == null) { // no digit found return s; } else { return new String(outChars); } } /** * Uses distributional similarity clusters for unknown words. Except that * numbers are just turned into NUMBER. * This one uses ones from a fixed file that we've used for NER. * * @param s String to find word shape of * @return Its word shape */ private static String wordShapeCluster1(String s) { boolean digit = true; for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); if ( ! (Character.isDigit(c) || c == '.' 
|| c == ',' || (i == 0 && (c == '-' || c == '+')))) { digit = false; } } if (digit) { return "NUMBER"; } else { String cluster = DistributionalClusters.cluster1.get(s); if (cluster == null) { cluster = "NULL"; } return cluster; } } private static String wordShapeChinese(final String s) { return ChineseUtils.shapeOf(s, true, true); } private static class DistributionalClusters { private DistributionalClusters() {} public static Map cluster1 = loadWordClusters("/u/nlp/data/pos_tags_are_useless/egw.bnc.200", "alexClark"); private static class LcMap extends HashMap { private static final long serialVersionUID = -457913281600751901L; @Override public V get(Object key) { return super.get(key.toString().toLowerCase()); } } public static Map loadWordClusters(String file, String format) { Timing.startDoing("Loading distsim lexicon from " + file); Map lexicon = new LcMap<>(); if ("terryKoo".equals(format)) { for (String line : ObjectBank.getLineIterator(file)) { String[] bits = line.split("\\t"); String word = bits[1]; // for now, always lowercase, but should revisit this word = word.toLowerCase(); String wordClass = bits[0]; lexicon.put(word, wordClass); } } else { // "alexClark" for (String line : ObjectBank.getLineIterator(file)) { String[] bits = line.split("\\s+"); String word = bits[0]; // for now, always lowercase, but should revisit this word = word.toLowerCase(); lexicon.put(word, bits[1]); } } Timing.endDoing(); return lexicon; } } /** * Usage: java edu.stanford.nlp.process.WordShapeClassifier * [-wordShape name] string+
* where name is an argument to lookupShaper. * Known names have patterns along the lines of: dan[12](bio)?(UseLC)?, * jenny1(useLC)?, chris[1234](useLC)?, cluster1. * If you don't specify a word shape function, you get chris1. * * @param args Command-line arguments, as above. */ public static void main(String[] args) { int i = 0; int classifierToUse = WORDSHAPECHRIS1; if (args.length == 0) { System.out.println("edu.stanford.nlp.process.WordShapeClassifier [-wordShape name] string+"); } else if (args[0].charAt(0) == '-') { if (args[0].equals("-wordShape") && args.length >= 2) { classifierToUse = lookupShaper(args[1]); i += 2; } else { System.err.println("Unknown flag: " + args[0]); i++; } } for (; i < args.length; i++) { System.out.print(args[i] + ": "); System.out.println(wordShape(args[i], classifierToUse)); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy