edu.stanford.nlp.parser.lexparser.EnglishUnknownWordModel Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
// Stanford Parser -- a probabilistic lexicalized NL CFG parser
// Copyright (c) 2002, 2003, 2004, 2005, 2008 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//    [email protected]
//    http://nlp.stanford.edu/software/lex-parser.shtml

package edu.stanford.nlp.parser.lexparser; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.process.DistSimClassifier;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.util.Index;


/**
 * This is a basic unknown word model for English.  It supports 5 different
 * types of feature modeling; see {@link #getSignature(String, int)}.
 *
 * Implementation note: the contents of this class tend to overlap somewhat
 * with {@link ArabicUnknownWordModel} and were originally included in {@link BaseLexicon}.
 *
 * @author Dan Klein
 * @author Galen Andrew
 * @author Christopher Manning
 * @author Anna Rafferty
 */
public class EnglishUnknownWordModel extends BaseUnknownWordModel  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(EnglishUnknownWordModel.class);

  private static final long serialVersionUID = 4825624957364628770L;

  private static final boolean DEBUG_UWM = false;

  protected final boolean smartMutation;

  protected final int unknownSuffixSize;
  protected final int unknownPrefixSize;

  protected final String wordClassesFile;

  private static final int MIN_UNKNOWN = 0;
  private static final int MAX_UNKNOWN = 8;

  public EnglishUnknownWordModel(Options op, Lexicon lex,
                                 Index wordIndex,
                                 Index tagIndex,
                                 ClassicCounter unSeenCounter) {
    super(op, lex, wordIndex, tagIndex, unSeenCounter, null, null, null);
    if (unknownLevel < MIN_UNKNOWN || unknownLevel > MAX_UNKNOWN) {
      throw new IllegalArgumentException("Invalid value for useUnknownWordSignatures: " + unknownLevel);
    }
    this.smartMutation = op.lexOptions.smartMutation;
    this.unknownSuffixSize = op.lexOptions.unknownSuffixSize;
    this.unknownPrefixSize = op.lexOptions.unknownPrefixSize;
    wordClassesFile = op.lexOptions.wordClassesFile;
  }

  /**
   * This constructor creates an UWM with empty data structures.  Only
   * use if loading in the data separately, such as by reading in text
   * lines containing the data.
   */
  public EnglishUnknownWordModel(Options op, Lexicon lex,
                                 Index wordIndex,
                                 Index tagIndex) {
    this(op, lex, wordIndex, tagIndex, new ClassicCounter<>());
  }

  @Override
  public float score(IntTaggedWord iTW, int loc, double c_Tseen, double total, double smooth, String word) {
    double pb_T_S = scoreProbTagGivenWordSignature(iTW, loc, smooth, word);
    double p_T = (c_Tseen / total);
    double p_W = 1.0 / total;
    double pb_W_T = Math.log(pb_T_S * p_W / p_T);

    if (pb_W_T > -100.0) {
      if (DEBUG_UWM) {
        log.info(iTW + " tagging has probability " + pb_W_T);
      }
      return (float) pb_W_T;
    }
    if (DEBUG_UWM) {
      log.info(iTW + " tagging is impossible.");
    }
    return Float.NEGATIVE_INFINITY;
  } // end score()


  /** Calculate P(Tag|Signature) with Bayesian smoothing via just P(Tag|Unknown) */
  @Override
  public double scoreProbTagGivenWordSignature(IntTaggedWord iTW, int loc, double smooth, String word) {
    // iTW.tag = nullTag;
    // double c_W = ((BaseLexicon) l).getCount(iTW);
    // iTW.tag = tag;

    // unknown word model for P(T|S)

    int wordSig = getSignatureIndex(iTW.word, loc, word);
    IntTaggedWord temp = new IntTaggedWord(wordSig, iTW.tag);
    double c_TS = unSeenCounter.getCount(temp);
    temp = new IntTaggedWord(wordSig, nullTag);
    double c_S = unSeenCounter.getCount(temp);
    double c_U = unSeenCounter.getCount(NULL_ITW);
    temp = new IntTaggedWord(nullWord, iTW.tag);
    double c_T = unSeenCounter.getCount(temp);

    double p_T_U = c_T / c_U;
    if (unknownLevel == 0) {
      c_TS = 0;
      c_S = 0;
    }
    return (c_TS + smooth * p_T_U) / (c_S + smooth);
  }


  /**
   * Returns the index of the signature of the word numbered wordIndex, where
   * the signature is the String representation of unknown word features.
   */
  @Override
  public int getSignatureIndex(int index, int sentencePosition, String word) {
    String uwSig = getSignature(word, sentencePosition);
    int sig = wordIndex.addToIndex(uwSig);
    if (DEBUG_UWM) {
      log.info("Signature (" + unknownLevel + "): mapped " + word +
                         " (" + index + ") to " + uwSig + " (" + sig + ")");
    }
    return sig;
  }

  /**
   * This routine returns a String that is the "signature" of the class of a
   * word. For, example, it might represent whether it is a number of ends in
   * -s. The strings returned by convention matches the pattern UNK(-.+)? ,
   * which is just assumed to not match any real word. Behavior depends on the
   * unknownLevel (-uwm flag) passed in to the class. The recognized numbers are
   * 1-5: 5 is fairly English-specific; 4, 3, and 2 look for various word
   * features (digits, dashes, etc.) which are only vaguely English-specific; 1
   * uses the last two characters combined with a simple classification by
   * capitalization.
   *
   * @param word The word to make a signature for
   * @param loc Its position in the sentence (mainly so sentence-initial
   *          capitalized words can be treated differently)
   * @return A String that is its signature (equivalence class)
   */
  @Override
  public String getSignature(String word, int loc) {
    StringBuilder sb = new StringBuilder("UNK");
    switch (unknownLevel) {
      case 8:
        getSignature8(word, sb);
        break;
      case 7:
        getSignature7(word, loc, sb);
        break;
      case 6:
        getSignature6(word, loc, sb);
        break;
      case 5:
        getSignature5(word, loc, sb);
        break;
      case 4:
        getSignature4(word, loc, sb);
        break;
      case 3:
        getSignature3(word, loc, sb);
        break;
      case 2:
        getSignature2(word, loc, sb);
        break;
      case 1:
        getSignature1(word, loc, sb);
        break;
      default:
        // 0 = do nothing so it just stays as "UNK"
    } // end switch (unknownLevel)
    // log.info("Summarized " + word + " to " + sb.toString());
    return sb.toString();
  } // end getSignature()


  private static void getSignature7(String word, int loc, StringBuilder sb) {
    // New Sep 2008. Like 2 but rely more on Caps somewhere than initial Caps
    // {-ALLC, -INIT, -UC somewhere, -LC, zero} +
    // {-DASH, zero} +
    // {-NUM, -DIG, zero} +
    // {lowerLastChar, zeroIfShort}
    boolean hasDigit = false;
    boolean hasNonDigit = false;
    boolean hasLower = false;
    boolean hasUpper = false;
    boolean hasDash = false;
    int wlen = word.length();
    for (int i = 0; i < wlen; i++) {
      char ch = word.charAt(i);
      if (Character.isDigit(ch)) {
        hasDigit = true;
      } else {
        hasNonDigit = true;
        if (Character.isLetter(ch)) {
          if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
            hasLower = true;
          } else {
            hasUpper = true;
          }
        } else if (ch == '-') {
          hasDash = true;
        }
      }
    }
    if (wlen > 0 && hasUpper) {
      if ( ! hasLower) {
        sb.append("-ALLC");
      } else if (loc == 0) {
        sb.append("-INIT");
      } else {
        sb.append("-UC");
      }
    } else if (hasLower) { // if (Character.isLowerCase(word.charAt(0))) {
      sb.append("-LC");
    }
    // no suffix = no (lowercase) letters
    if (hasDash) {
      sb.append("-DASH");
    }
    if (hasDigit) {
      if (!hasNonDigit) {
        sb.append("-NUM");
      } else {
        sb.append("-DIG");
      }
    } else if (wlen > 3) {
      // don't do for very short words: "yes" isn't an "-es" word
      // try doing to lower for further densening and skipping digits
      char ch = word.charAt(word.length() - 1);
      sb.append(Character.toLowerCase(ch));
    }
    // no suffix = short non-number, non-alphabetic
  }


  private void getSignature6(String word, int loc, StringBuilder sb) {
    // New Sep 2008. Like 5 but rely more on Caps somewhere than initial Caps
    // { -INITC, -CAPS, (has) -CAP, -LC lowercase, 0 } +
    // { -KNOWNLC, 0 } + [only for INITC]
    // { -NUM, 0 } +
    // { -DASH, 0 } +
    // { -last lowered char(s) if known discriminating suffix, 0}
    int wlen = word.length();
    int numCaps = 0;
    boolean hasDigit = false;
    boolean hasDash = false;
    boolean hasLower = false;
    for (int i = 0; i < wlen; i++) {
      char ch = word.charAt(i);
      if (Character.isDigit(ch)) {
        hasDigit = true;
      } else if (ch == '-') {
        hasDash = true;
      } else if (Character.isLetter(ch)) {
        if (Character.isLowerCase(ch)) {
          hasLower = true;
        } else if (Character.isTitleCase(ch)) {
          hasLower = true;
          numCaps++;
        } else {
          numCaps++;
        }
      }
    }
    String lowered = word.toLowerCase();
    if (numCaps > 1) {
      sb.append("-CAPS");
    } else if (numCaps > 0) {
      if (loc == 0) {
        sb.append("-INITC");
        if (getLexicon().isKnown(lowered)) {
          sb.append("-KNOWNLC");
        }
      } else {
        sb.append("-CAP");
      }
    } else if (hasLower) { // (Character.isLowerCase(ch0)) {
      sb.append("-LC");
    }
    if (hasDigit) {
      sb.append("-NUM");
    }
    if (hasDash) {
      sb.append("-DASH");
    }
    if (lowered.endsWith("s") && wlen >= 3) {
      // here length 3, so you don't miss out on ones like 80s
      char ch2 = lowered.charAt(wlen - 2);
      // not -ess suffixes or greek/latin -us, -is
      if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') {
        sb.append("-s");
      }
    } else if (word.length() >= 5 && !hasDash && !(hasDigit && numCaps > 0)) {
      // don't do for very short words;
      // Implement common discriminating suffixes
      if (lowered.endsWith("ed")) {
        sb.append("-ed");
      } else if (lowered.endsWith("ing")) {
        sb.append("-ing");
      } else if (lowered.endsWith("ion")) {
        sb.append("-ion");
      } else if (lowered.endsWith("er")) {
        sb.append("-er");
      } else if (lowered.endsWith("est")) {
        sb.append("-est");
      } else if (lowered.endsWith("ly")) {
        sb.append("-ly");
      } else if (lowered.endsWith("ity")) {
        sb.append("-ity");
      } else if (lowered.endsWith("y")) {
        sb.append("-y");
      } else if (lowered.endsWith("al")) {
        sb.append("-al");
        // } else if (lowered.endsWith("ble")) {
        // sb.append("-ble");
        // } else if (lowered.endsWith("e")) {
        // sb.append("-e");
      }
    }
  }


  private void getSignature5(String word, int loc, StringBuilder sb) {
    // Reformed Mar 2004 (cdm); hopefully better now.
    // { -CAPS, -INITC ap, -LC lowercase, 0 } +
    // { -KNOWNLC, 0 } + [only for INITC]
    // { -NUM, 0 } +
    // { -DASH, 0 } +
    // { -last lowered char(s) if known discriminating suffix, 0}
    int wlen = word.length();
    int numCaps = 0;
    boolean hasDigit = false;
    boolean hasDash = false;
    boolean hasLower = false;
    for (int i = 0; i < wlen; i++) {
      char ch = word.charAt(i);
      if (Character.isDigit(ch)) {
        hasDigit = true;
      } else if (ch == '-') {
        hasDash = true;
      } else if (Character.isLetter(ch)) {
        if (Character.isLowerCase(ch)) {
          hasLower = true;
        } else if (Character.isTitleCase(ch)) {
          hasLower = true;
          numCaps++;
        } else {
          numCaps++;
        }
      }
    }
    char ch0 = word.charAt(0);
    String lowered = word.toLowerCase();
    if (Character.isUpperCase(ch0) || Character.isTitleCase(ch0)) {
      if (loc == 0 && numCaps == 1) {
        sb.append("-INITC");
        if (getLexicon().isKnown(lowered)) {
          sb.append("-KNOWNLC");
        }
      } else {
        sb.append("-CAPS");
      }
    } else if (!Character.isLetter(ch0) && numCaps > 0) {
      sb.append("-CAPS");
    } else if (hasLower) { // (Character.isLowerCase(ch0)) {
      sb.append("-LC");
    }
    if (hasDigit) {
      sb.append("-NUM");
    }
    if (hasDash) {
      sb.append("-DASH");
    }
    if (lowered.endsWith("s") && wlen >= 3) {
      // here length 3, so you don't miss out on ones like 80s
      char ch2 = lowered.charAt(wlen - 2);
      // not -ess suffixes or greek/latin -us, -is
      if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') {
        sb.append("-s");
      }
    } else if (word.length() >= 5 && !hasDash && !(hasDigit && numCaps > 0)) {
      // don't do for very short words;
      // Implement common discriminating suffixes
      if (lowered.endsWith("ed")) {
        sb.append("-ed");
      } else if (lowered.endsWith("ing")) {
        sb.append("-ing");
      } else if (lowered.endsWith("ion")) {
        sb.append("-ion");
      } else if (lowered.endsWith("er")) {
        sb.append("-er");
      } else if (lowered.endsWith("est")) {
        sb.append("-est");
      } else if (lowered.endsWith("ly")) {
        sb.append("-ly");
      } else if (lowered.endsWith("ity")) {
        sb.append("-ity");
      } else if (lowered.endsWith("y")) {
        sb.append("-y");
      } else if (lowered.endsWith("al")) {
        sb.append("-al");
        // } else if (lowered.endsWith("ble")) {
        // sb.append("-ble");
        // } else if (lowered.endsWith("e")) {
        // sb.append("-e");
      }
    }
  }


  private static void getSignature4(String word, int loc, StringBuilder sb) {
    boolean hasDigit = false;
    boolean hasNonDigit = false;
    boolean hasLetter = false;
    boolean hasLower = false;
    boolean hasDash = false;
    boolean hasPeriod = false;
    boolean hasComma = false;
    for (int i = 0; i < word.length(); i++) {
      char ch = word.charAt(i);
      if (Character.isDigit(ch)) {
        hasDigit = true;
      } else {
        hasNonDigit = true;
        if (Character.isLetter(ch)) {
          hasLetter = true;
          if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
            hasLower = true;
          }
        } else {
          if (ch == '-') {
            hasDash = true;
          } else if (ch == '.') {
            hasPeriod = true;
          } else if (ch == ',') {
            hasComma = true;
          }
        }
      }
    }
    // 6 way on letters
    if (Character.isUpperCase(word.charAt(0)) || Character.isTitleCase(word.charAt(0))) {
      if (!hasLower) {
        sb.append("-AC");
      } else if (loc == 0) {
        sb.append("-SC");
      } else {
        sb.append("-C");
      }
    } else if (hasLower) {
      sb.append("-L");
    } else if (hasLetter) {
      sb.append("-U");
    } else {
      // no letter
      sb.append("-S");
    }
    // 3 way on number
    if (hasDigit && !hasNonDigit) {
      sb.append("-N");
    } else if (hasDigit) {
      sb.append("-n");
    }
    // binary on period, dash, comma
    if (hasDash) {
      sb.append("-H");
    }
    if (hasPeriod) {
      sb.append("-P");
    }
    if (hasComma) {
      sb.append("-C");
    }
    if (word.length() > 3) {
      // don't do for very short words: "yes" isn't an "-es" word
      // try doing to lower for further densening and skipping digits
      char ch = word.charAt(word.length() - 1);
      if (Character.isLetter(ch)) {
        sb.append('-');
        sb.append(Character.toLowerCase(ch));
      }
    }
  }


  private static void getSignature3(String word, int loc, StringBuilder sb) {
    // This basically works right, except note that 'S' is applied to all
    // capitalized letters in first word of sentence, not just first....
    sb.append('-');
    char lastClass = '-'; // i.e., nothing
    int num = 0;
    for (int i = 0; i < word.length(); i++) {
      char ch = word.charAt(i);
      char newClass;
      if (Character.isUpperCase(ch) || Character.isTitleCase(ch)) {
        if (loc == 0) {
          newClass = 'S';
        } else {
          newClass = 'L';
        }
      } else if (Character.isLetter(ch)) {
        newClass = 'l';
      } else if (Character.isDigit(ch)) {
        newClass = 'd';
      } else if (ch == '-') {
        newClass = 'h';
      } else if (ch == '.') {
        newClass = 'p';
      } else {
        newClass = 's';
      }
      if (newClass != lastClass) {
        lastClass = newClass;
        sb.append(lastClass);
        num = 1;
      } else {
        if (num < 2) {
          sb.append('+');
        }
        num++;
      }
    }
    if (word.length() > 3) {
      // don't do for very short words: "yes" isn't an "-es" word
      // try doing to lower for further densening and skipping digits
      char ch = Character.toLowerCase(word.charAt(word.length() - 1));
      sb.append('-');
      sb.append(ch);
    }
  }


  private static void getSignature2(String word, int loc, StringBuilder sb) {
    // {-ALLC, -INIT, -UC, -LC, zero} +
    // {-DASH, zero} +
    // {-NUM, -DIG, zero} +
    // {lowerLastChar, zeroIfShort}
    boolean hasDigit = false;
    boolean hasNonDigit = false;
    boolean hasLower = false;
    int wlen = word.length();
    for (int i = 0; i < wlen; i++) {
      char ch = word.charAt(i);
      if (Character.isDigit(ch)) {
        hasDigit = true;
      } else {
        hasNonDigit = true;
        if (Character.isLetter(ch)) {
          if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
            hasLower = true;
          }
        }
      }
    }
    if (wlen > 0
            && (Character.isUpperCase(word.charAt(0)) || Character.isTitleCase(word.charAt(0)))) {
      if (!hasLower) {
        sb.append("-ALLC");
      } else if (loc == 0) {
        sb.append("-INIT");
      } else {
        sb.append("-UC");
      }
    } else if (hasLower) { // if (Character.isLowerCase(word.charAt(0))) {
      sb.append("-LC");
    }
    // no suffix = no (lowercase) letters
    if (word.indexOf('-') >= 0) {
      sb.append("-DASH");
    }
    if (hasDigit) {
      if (!hasNonDigit) {
        sb.append("-NUM");
      } else {
        sb.append("-DIG");
      }
    } else if (wlen > 3) {
      // don't do for very short words: "yes" isn't an "-es" word
      // try doing toLower for further densening and skipping digits
      char ch = word.charAt(word.length() - 1);
      sb.append(Character.toLowerCase(ch));
    }
    // no suffix = short non-number, non-alphabetic
  }


  private static void getSignature1(String word, int loc, StringBuilder sb) {
    sb.append('-');
    sb.append(word.substring(Math.max(word.length() - 2, 0), word.length()));
    sb.append('-');
    if (Character.isLowerCase(word.charAt(0))) {
      sb.append("LOWER");
    } else {
      if (Character.isUpperCase(word.charAt(0))) {
        if (loc == 0) {
          sb.append("INIT");
        } else {
          sb.append("UPPER");
        }
      } else {
        sb.append("OTHER");
      }
    }
  }


  private void getSignature8(String word, StringBuilder sb) {
    sb.append('-');
    boolean digit = true;
    for (int i = 0; i < word.length(); i++) {
      char c = word.charAt(i);
      if ( ! (Character.isDigit(c) || c == '.' || c == ',' || (i == 0 && (c == '-' || c == '+')))) {
        digit = false;
      }
    }
    // digit = false;  // todo: Just turned off while we test it.
    if (digit) {
      sb.append("NUMBER");
    } else {
      if (distSim == null) {
        distSim = new DistSimClassifier(wordClassesFile, false, true);
        // todo XXXX booleans depend on distsim file; need more options
      }

      String cluster = distSim.distSimClass(word);
      if (cluster == null) {
        cluster = "NULL";
      }
      sb.append(cluster);
    }
  }

  private transient DistSimClassifier distSim;

} // end class