All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.parser.lexparser.FactoredLexicon Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.parser.lexparser; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification;
import edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification.MorphoFeatureType;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalIntCounter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Treebank;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Pair;

/**
 *
 * @author Spence Green
 *
 */
public class FactoredLexicon extends BaseLexicon  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(FactoredLexicon.class);

  private static final long serialVersionUID = -744693222804176489L;

  /** Enables verbose per-score debug output on stderr (see score()). */
  private static final boolean DEBUG = false;

  /** Language-specific mapping from rich morphological strings to reduced feature bundles. */
  private MorphoFeatureSpecification morphoSpec;

  /** Sentinel stored in morphIndex when a word has no morphological analysis. */
  private static final String NO_MORPH_ANALYSIS = "xXxNONExXx";

  /** Index over reduced morphological tag strings (parallels wordIndex/tagIndex). */
  private Index<String> morphIndex = new HashIndex<>();

  // Joint counts of (wordId, tagId) plus a tag distribution used to smooth rare/unseen words.
  private TwoDimensionalIntCounter<Integer,Integer> wordTag = new TwoDimensionalIntCounter<>(40000);
  private Counter<Integer> wordTagUnseen = new ClassicCounter<>(500);

  // Joint counts of (lemmaId, tagId) plus a tag distribution used to smooth rare/unseen lemmas.
  private TwoDimensionalIntCounter<Integer,Integer> lemmaTag = new TwoDimensionalIntCounter<>(40000);
  private Counter<Integer> lemmaTagUnseen = new ClassicCounter<>(500);

  // Joint counts of (morphId, tagId) plus a tag distribution used to smooth rare/unseen analyses.
  private TwoDimensionalIntCounter<Integer,Integer> morphTag = new TwoDimensionalIntCounter<>(500);
  private Counter<Integer> morphTagUnseen = new ClassicCounter<>(500);

  /** Marginal tag counts over the training data. */
  private Counter<Integer> tagCounter = new ClassicCounter<>(300);

  /**
   * Creates a factored lexicon with default options.
   *
   * @param morphoSpec language-specific morphological feature specification
   * @param wordIndex shared index over word (and lemma) strings
   * @param tagIndex shared index over tag strings
   */
  public FactoredLexicon(MorphoFeatureSpecification morphoSpec, Index<String> wordIndex, Index<String> tagIndex) {
    super(wordIndex, tagIndex);
    this.morphoSpec = morphoSpec;
  }

  /**
   * Creates a factored lexicon with explicit parser options.
   *
   * @param op parser options passed through to {@link BaseLexicon}
   * @param morphoSpec language-specific morphological feature specification
   * @param wordIndex shared index over word (and lemma) strings
   * @param tagIndex shared index over tag strings
   */
  public FactoredLexicon(Options op, MorphoFeatureSpecification morphoSpec, Index<String> wordIndex, Index<String> tagIndex) {
    super(op, wordIndex, tagIndex);
    this.morphoSpec = morphoSpec;
  }

  /**
   * Rule table is lemmas. So isKnown() is slightly trickier.
   */
  /**
   * Rule table is lemmas. So isKnown() is slightly trickier.
   *
   * @param word index of the surface word form
   * @param loc position of the word in the sentence (unused here)
   * @param featureSpec morphological feature string (unused here)
   * @return iterator over the (word, tag) lexicon rules applicable to this word
   */
  @Override
  public Iterator<IntTaggedWord> ruleIteratorByWord(int word, int loc, String featureSpec) {

    if (word == wordIndex.indexOf(BOUNDARY)) {
      // Deterministic tagging of the boundary symbol
      return rulesWithWord[word].iterator();

    } else if (isKnown(word)) {
      // Strict lexical tagging for seen *lemma* types
      // We need to copy the word form into the rules, which currently have lemmas in them
      return rulesWithWord[word].iterator();

    } else {
      if (DEBUG) log.info("UNKNOWN WORD");
      // Unknown word signatures: reuse the rules of the generic unknown-word
      // symbol, but inject this word's id so downstream scoring sees the word
      // rather than the UW signature.
      Set<IntTaggedWord> lexRules = Generics.newHashSet(10);
      List<IntTaggedWord> uwRules = rulesWithWord[wordIndex.indexOf(UNKNOWN_WORD)];
      for (IntTaggedWord iTW : uwRules) {
        lexRules.add(new IntTaggedWord(word, iTW.tag));
      }
      return lexRules.iterator();
    }
  }

  /**
   * Scores a (word, tag) pair as a log probability by combining the factored
   * estimates p(word|tag), p(lemma|tag), and p(morph|tag). The lemma factor is
   * currently disabled (its log term is fixed at 0.0; see commented-out call).
   * The boundary symbol is scored deterministically so the 1-best path must
   * pass through it.
   *
   * @param iTW the (word id, tag id) pair to score
   * @param loc position of the word in the sentence
   * @param word surface form of the word
   * @param featureSpec rich morphological analysis string ("lemma|||morph")
   * @return log probability, or {@code Float.NEGATIVE_INFINITY} for taggings
   *         below the -100.0 log-prob cutoff
   */
  @Override
  public float score(IntTaggedWord iTW, int loc, String word, String featureSpec) {
    final int wordId = iTW.word();
    final int tagId = iTW.tag();

    // Force 1-best path to go through the boundary symbol
    // (deterministic tagging)
    final int boundaryId = wordIndex.indexOf(BOUNDARY);
    final int boundaryTagId = tagIndex.indexOf(BOUNDARY_TAG);
    if (wordId == boundaryId && tagId == boundaryTagId) {
      return 0.0f;
    }

    // Morphological features: split the spec into (lemma, rich morph tag),
    // then reduce the rich tag per the language-specific morphoSpec.
    String tag = tagIndex.get(iTW.tag());
    Pair<String,String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(word, featureSpec);
    String lemma = lemmaMorph.first();
    int lemmaId = wordIndex.indexOf(lemma);
    String richMorphTag = lemmaMorph.second();
    String reducedMorphTag = morphoSpec.strToFeatures(richMorphTag).toString().trim();
    reducedMorphTag = reducedMorphTag.length() == 0 ? NO_MORPH_ANALYSIS : reducedMorphTag;
    // NOTE: addToIndex mutates morphIndex at scoring time for novel analyses.
    int morphId = morphIndex.addToIndex(reducedMorphTag);

    // Score the factors and create the rule score p_W_T
    double p_W_Tf = Math.log(probWordTag(word, loc, wordId, tagId));
//    double p_L_T = Math.log(probLemmaTag(word, loc, tagId, lemmaId));
    double p_L_T = 0.0;
    double p_M_T = Math.log(probMorphTag(tagId, morphId));
    double p_W_T = p_W_Tf + p_L_T + p_M_T;

    if (DEBUG) {
//      String tag = tagIndex.get(tagId);
      System.err.printf("WSGDEBUG: %s --> %s %s %s ||  %.10f (%.5f / %.5f / %.5f)%n", tag, word, lemma,
          reducedMorphTag, p_W_T, p_W_Tf, p_L_T, p_M_T);
    }

    // Filter low probability taggings
    return p_W_T > -100.0 ? (float) p_W_T : Float.NEGATIVE_INFINITY;
  }

  /**
   * Estimates p(word | tag) via Bayes' rule: p(tag | word) * p(word) / p(tag).
   * Frequent words (count &gt; 100) use the raw relative frequency; rarer seen
   * words are interpolated with the unseen-word tag distribution using
   * smooth[1]; never-seen words fall back to the unknown-word (signature)
   * model.
   *
   * @param word surface form (passed to the unknown-word model)
   * @param loc position of the word in the sentence
   * @param wordId index of the word in wordIndex
   * @param tagId index of the tag in tagIndex
   * @return an estimate of p(word | tag); may be 0.0 if counts are degenerate
   */
  private double probWordTag(String word, int loc, int wordId, int tagId) {
    double cW = wordTag.totalCount(wordId);
    double cWT = wordTag.getCount(wordId, tagId);

    // p_W: marginal probability of the word
    double p_W = cW / wordTag.totalCount();

    // p_T: marginal probability of the tag
    double cTseen = tagCounter.getCount(tagId);
    double p_T = cTseen / tagCounter.totalCount();

    // p_W_T: p(word | tag)
    double p_W_T = 0.0;
    if (cW > 0.0) { // Seen word
      double p_T_W = 0.0;
      if (cW > 100.0 && cWT > 0.0) {
        // Frequent word: trust the relative frequency directly
        p_T_W = cWT / cW;
      } else {
        // Rare word: interpolate with the unseen-word tag distribution
        double cTunseen = wordTagUnseen.getCount(tagId);
        // TODO p_T_U is 0?
        double p_T_U = cTunseen / wordTagUnseen.totalCount();
        p_T_W = (cWT + smooth[1]*p_T_U) / (cW + smooth[1]);
      }
      p_W_T = p_T_W * p_W / p_T;

    } else { // Unseen word. Score based on the word signature (of the surface form)
      IntTaggedWord iTW = new IntTaggedWord(wordId, tagId);
      double c_T = tagCounter.getCount(tagId);
      p_W_T = Math.exp(getUnknownWordModel().score(iTW, loc, c_T, tagCounter.totalCount(), smooth[0], word));
    }

    return p_W_T;
  }

  /**
   * This method should never return 0!!
   */
  /**
   * This method should never return 0!!
   *
   * Estimates p(lemma | tag) via Bayes' rule, mirroring the structure of
   * probWordTag(): frequent lemmas use relative frequency, rare lemmas are
   * interpolated with the unseen-lemma tag distribution, and unseen lemmas
   * use a crude count-based fallback. NOTE: this factor is currently disabled
   * in score(), where its log term is hardcoded to 0.0.
   *
   * @param word surface form (only used by the commented-out fallback)
   * @param loc position of the word in the sentence (likewise unused now)
   * @param tagId index of the tag in tagIndex
   * @param lemmaId index of the lemma in wordIndex
   * @return an estimate of p(lemma | tag)
   */
  private double probLemmaTag(String word, int loc, int tagId, int lemmaId) {
    double cL = lemmaTag.totalCount(lemmaId);
    double cLT = lemmaTag.getCount(lemmaId, tagId);

    // p_L: marginal probability of the lemma
    double p_L = cL / lemmaTag.totalCount();

    // p_T: marginal probability of the tag
    double cTseen = tagCounter.getCount(tagId);
    double p_T = cTseen / tagCounter.totalCount();

    // p_T_L
    double p_L_T = 0.0;
    if (cL > 0.0) { // Seen lemma
      double p_T_L = 0.0;
      if (cL > 100.0 && cLT > 0.0) {
        // Frequent lemma: trust the relative frequency directly
        p_T_L = cLT / cL;
      } else {
        // Rare lemma: interpolate with the unseen-lemma tag distribution
        double cTunseen = lemmaTagUnseen.getCount(tagId);
        // TODO(spenceg): p_T_U is 0??
        double p_T_U = cTunseen / lemmaTagUnseen.totalCount();
        p_T_L = (cLT + smooth[1]*p_T_U) / (cL + smooth[1]);
      }
      p_L_T = p_T_L * p_L / p_T;

    } else { // Unseen lemma. Score based on the word signature (of the surface form)
      // Hack
      double cTunseen = lemmaTagUnseen.getCount(tagId);
      p_L_T = cTunseen / tagCounter.totalCount();

      //      int wordId = wordIndex.indexOf(word);
//      IntTaggedWord iTW = new IntTaggedWord(wordId, tagId);
//      double c_T = tagCounter.getCount(tagId);
//      p_L_T = Math.exp(getUnknownWordModel().score(iTW, loc, c_T, tagCounter.totalCount(), smooth[0], word));
    }

    return p_L_T;
  }

  /**
   * This method should never return 0!
   */
  /**
   * This method should never return 0!
   *
   * Estimates p(morph | tag) via Bayes' rule. Only frequent analyses
   * (count &gt; 100 with a nonzero joint count) use relative frequency; all
   * other cases — including seen-but-rare analyses, since the interpolation
   * branch is commented out — fall through to a single add-one-style
   * smoothed constant.
   *
   * @param tagId index of the tag in tagIndex
   * @param morphId index of the reduced morphological analysis in morphIndex
   * @return an estimate of p(morph | tag); always positive
   */
  private double probMorphTag(int tagId, int morphId) {
    double cM = morphTag.totalCount(morphId);
    double cMT = morphTag.getCount(morphId, tagId);

    // p_M: marginal probability of the morphological analysis
    double p_M = cM / morphTag.totalCount();

    // p_T: marginal probability of the tag
    double cTseen = tagCounter.getCount(tagId);
    double p_T = cTseen / tagCounter.totalCount();

    double p_M_T = 0.0;
    if (cM > 100.0 && cMT > 0.0) {
      // Frequent analysis: trust the relative frequency directly
      double p_T_M = cMT / cM;

//      else {
//        double cTunseen = morphTagUnseen.getCount(tagId);
//        double p_T_U = cTunseen / morphTagUnseen.totalCount();
//        p_T_M = (cMT + smooth[1]*p_T_U) / (cM + smooth[1]);
//      }
      p_M_T = p_T_M * p_M / p_T;

    } else { // Unseen morphological analysis
      // Hack....unseen morph tags are extremely rare
      // Add+1 smoothing
      p_M_T = 1.0 / (morphTag.totalCount() + tagIndex.size() + 1.0);
    }

    return p_M_T;
  }

  /**
   * This method should populate wordIndex, tagIndex, and morphIndex.
   */
  @Override
  public void train(Collection trees, Collection rawTrees) {
    double weight = 1.0;
    // Train uw model on words
    uwModelTrainer.train(trees, weight);

    final double numTrees = trees.size();
    Iterator rawTreesItr = rawTrees == null ? null : rawTrees.iterator();
    Iterator treeItr = trees.iterator();

    // Train factored lexicon on lemmas and morph tags
    int treeId = 0;
    while (treeItr.hasNext()) {
      Tree tree = treeItr.next();
      // CoreLabels, with morph analysis in the originalText annotation
      List




© 2015 - 2024 Weber Informatics LLC | Privacy Policy