edu.stanford.nlp.parser.lexparser.ChineseUnknownWordModel Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.parser.lexparser; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.Map;
import java.util.Set;

import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.WordTag;
import edu.stanford.nlp.ling.Tag;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Index;

/**
 * Stores, trains, and scores with an unknown word model.  A couple
 * of filters deterministically force rewrites for certain proper
 * nouns, dates, and cardinal and ordinal numbers; when none of these
 * filters are met, either the distribution of terminals with the same
 * first character is used, or Good-Turing smoothing is used. Although
 * this is developed for Chinese, the training and storage methods
 * could be used cross-linguistically.
 *
 * @author Roger Levy
 */
public class ChineseUnknownWordModel extends BaseUnknownWordModel  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ChineseUnknownWordModel.class);

  private static final String encoding = "GB18030"; // used only for debugging

  private final boolean useUnicodeType;


  /* These strings are stored in ascii-type Unicode encoding.  To
   * edit them, either use the Unicode codes or use native2ascii or a
   * similar program to convert the file into a Chinese encoding, then
   * convert back. */
  private static final String numberMatch = ".*[0-9\uff10-\uff19\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e\u5343\u4e07\u4ebf\u96F6\u3007\u25cb\u25ef].*";
  private static final String dateMatch = numberMatch + "[\u5e74\u6708\u65e5\u53f7]";
  private static final String ordinalMatch = "\u7b2c.*";
  // uses midDot characters as one clue of being proper name
  private static final String properNameMatch = ".*[\u00b7\u0387\u2022\u2024\u2027\u2219\u22C5\u30FB].*";

  private final Set seenFirst;


  public ChineseUnknownWordModel(Options op, Lexicon lex,
                                 Index wordIndex,
                                 Index tagIndex,
                                 ClassicCounter unSeenCounter,
                                 Map> tagHash,
                                 Map unknownGT,
                                 boolean useGT,
                                 Set seenFirst) {
    super(op, lex, wordIndex, tagIndex,
          unSeenCounter, tagHash, unknownGT, null);
    this.useFirst = !useGT;
    this.useGT = useGT;
    this.useUnicodeType = op.lexOptions.useUnicodeType;
    this.seenFirst = seenFirst;
  }

  /**
   * This constructor creates an UWM with empty data structures.  Only
   * use if loading in the data separately, such as by reading in text
   * lines containing the data.
   * TODO: would need to set useGT correctly if you saved a model with
   * useGT and then wanted to recover it from text.
   */
  public ChineseUnknownWordModel(Options op, Lexicon lex,
                                 Index wordIndex,
                                 Index tagIndex) {
    this(op, lex, wordIndex, tagIndex,
            new ClassicCounter<>(),
         Generics.>newHashMap(),
         Generics.newHashMap(),
         false, Generics.newHashSet());
  }

  @Override
  public float score(IntTaggedWord itw, String word) {
    // Label tagL = itw.tagLabel();
    // String tag = tagL.value();
    String tag = itw.tagString(tagIndex);
    Label tagL = new Tag(tag);

    float logProb;

    if (VERBOSE) EncodingPrintWriter.out.println("Scoring unknown word |" + word + "| with tag " + tag, encoding);

    if (word.matches(dateMatch)) {
      //EncodingPrintWriter.out.println("Date match for " + word,encoding);
      if (tag.equals("NT")) {
        logProb = 0.0f;
      } else {
        logProb = Float.NEGATIVE_INFINITY;
      }
    } else if (word.matches(numberMatch)) {
      //EncodingPrintWriter.out.println("Number match for " + word,encoding);
      if (tag.equals("CD") && (!word.matches(ordinalMatch))) {
        logProb = 0.0f;
      } else if (tag.equals("OD") && word.matches(ordinalMatch)) {
        logProb = 0.0f;
      } else {
        logProb = Float.NEGATIVE_INFINITY;
      }
    } else if (word.matches(properNameMatch)) {
      //EncodingPrintWriter.out.println("Proper name match for " + word,encoding);
      if (tag.equals("NR")) {
        logProb = 0.0f;
      } else {
        logProb = Float.NEGATIVE_INFINITY;
      }
    /* -------------
      // this didn't seem to work -- too categorical
      int type = Character.getType(word.charAt(0));
      // the below may not normalize probs over options, but is probably okay
      if (type == Character.START_PUNCTUATION) {
        if (tag.equals("PU-LPAREN") || tag.equals("PU-PAREN") ||
            tag.equals("PU-LQUOTE") || tag.equals("PU-QUOTE") ||
            tag.equals("PU")) {
          // if (VERBOSE) log.info("ChineseUWM: unknown L Punc");
          logProb = 0.0f;
        } else {
          logProb = Float.NEGATIVE_INFINITY;
        }
      } else if (type == Character.END_PUNCTUATION) {
        if (tag.equals("PU-RPAREN") || tag.equals("PU-PAREN") ||
            tag.equals("PU-RQUOTE") || tag.equals("PU-QUOTE") ||
            tag.equals("PU")) {
          // if (VERBOSE) log.info("ChineseUWM: unknown R Punc");
          logProb = 0.0f;
        } else {
          logProb = Float.NEGATIVE_INFINITY;
        }
      } else {
        if (tag.equals("PU-OTHER") || tag.equals("PU-ENDSENT") ||
            tag.equals("PU")) {
          // if (VERBOSE) log.info("ChineseUWM: unknown O Punc");
          logProb = 0.0f;
        } else {
          logProb = Float.NEGATIVE_INFINITY;
        }
      }
    ------------- */
    } else {
      first:
        if (useFirst) {
          String first = word.substring(0, 1);
          if (useUnicodeType) {
            char ch = word.charAt(0);
            int type = Character.getType(ch);
            if (type != Character.OTHER_LETTER) {
              // standard Chinese characters are of type "OTHER_LETTER"!!
              first = Integer.toString(type);
            }
          }
          if ( ! seenFirst.contains(first)) {
            if (useGT) {
              logProb = scoreGT(tag);
              break first;
            } else {
              first = unknown;
            }
          }

          /* get the Counter of terminal rewrites for the relevant tag */
          ClassicCounter wordProbs = tagHash.get(tagL);

          /* if the proposed tag has never been seen before, issue a
             warning and return probability 0. */
          if (wordProbs == null) {
            if (VERBOSE) log.info("Warning: proposed tag is unseen in training data!");
            logProb = Float.NEGATIVE_INFINITY;
          } else if (wordProbs.containsKey(first)) {
            logProb = (float) wordProbs.getCount(first);
          } else {
            logProb = (float) wordProbs.getCount(unknown);
          }
        } else if (useGT) {
          logProb = scoreGT(tag);
        } else {
          if (VERBOSE) log.info("Warning: no unknown word model in place!\nGiving the combination " + word + " " + tag + " zero probability.");
          logProb = Float.NEGATIVE_INFINITY; // should never get this!
        }
    }

    if (VERBOSE) EncodingPrintWriter.out.println("Unknown word estimate for " + word + " as " + tag + ": " + logProb, encoding);
    return logProb;
  }



  public static void main(String[] args) {
    System.out.println("Testing unknown matching");
    String s = "\u5218\u00b7\u9769\u547d";
    if (s.matches(properNameMatch)) {
      System.out.println("hooray names!");
    } else {
      System.out.println("Uh-oh names!");
    }
    String s1 = "\uff13\uff10\uff10\uff10";
    if (s1.matches(numberMatch)) {
      System.out.println("hooray numbers!");
    } else {
      System.out.println("Uh-oh numbers!");
    }
    String s11 = "\u767e\u5206\u4e4b\u56db\u5341\u4e09\u70b9\u4e8c";
    if (s11.matches(numberMatch)) {
      System.out.println("hooray numbers!");
    } else {
      System.out.println("Uh-oh numbers!");
    }
    String s12 = "\u767e\u5206\u4e4b\u4e09\u5341\u516b\u70b9\u516d";
    if (s12.matches(numberMatch)) {
      System.out.println("hooray numbers!");
    } else {
      System.out.println("Uh-oh numbers!");
    }
    String s2 = "\u4e09\u6708";
    if (s2.matches(dateMatch)) {
      System.out.println("hooray dates!");
    } else {
      System.out.println("Uh-oh dates!");
    }

    System.out.println("Testing tagged word");
    ClassicCounter c = new ClassicCounter<>();
    TaggedWord tw1 = new TaggedWord("w", "t");
    c.incrementCount(tw1);
    TaggedWord tw2 = new TaggedWord("w", "t2");
    System.out.println(c.containsKey(tw2));
    System.out.println(tw1.equals(tw2));

    WordTag wt1 = toWordTag(tw1);
    WordTag wt2 = toWordTag(tw2);
    WordTag wt3 = new WordTag("w", "t2");
    System.out.println(wt1.equals(wt2));
    System.out.println(wt2.equals(wt3));
  }

  private static WordTag toWordTag(TaggedWord tw) {
    return new WordTag(tw.word(), tw.tag());
  }

  private static final long serialVersionUID = 221L;


  @Override
  public String getSignature(String word, int loc) {
    throw new UnsupportedOperationException();
  }

}