All downloads are FREE. The search and download functionalities use the official Maven repository.

edu.stanford.nlp.parser.lexparser.FactoredLexicon Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.parser.lexparser; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification;
import edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification.MorphoFeatureType;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalIntCounter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Treebank;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Pair;

/**
 *
 * @author Spence Green
 *
 */
public class FactoredLexicon extends BaseLexicon  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(FactoredLexicon.class);

  private static final long serialVersionUID = -744693222804176489L;

  /** Enables verbose per-tagging debug output on stderr. */
  private static final boolean DEBUG = false;

  /** Language-specific reduction of rich morphological strings to feature bundles. */
  private MorphoFeatureSpecification morphoSpec;

  /** Sentinel stored in morphIndex for tokens with no morphological analysis. */
  private static final String NO_MORPH_ANALYSIS = "xXxNONExXx";

  /** Maps reduced morphological tag strings to integer ids. */
  private Index<String> morphIndex = new HashIndex<>();

  // Joint (word id, tag id) counts, plus per-tag counts reserved for rare/unseen words.
  private TwoDimensionalIntCounter<Integer,Integer> wordTag = new TwoDimensionalIntCounter<>(40000);
  private Counter<Integer> wordTagUnseen = new ClassicCounter<>(500);

  // Joint (lemma id, tag id) counts and the corresponding unseen-lemma tag counts.
  private TwoDimensionalIntCounter<Integer,Integer> lemmaTag = new TwoDimensionalIntCounter<>(40000);
  private Counter<Integer> lemmaTagUnseen = new ClassicCounter<>(500);

  // Joint (morph id, tag id) counts and the corresponding unseen-morph tag counts.
  private TwoDimensionalIntCounter<Integer,Integer> morphTag = new TwoDimensionalIntCounter<>(500);
  private Counter<Integer> morphTagUnseen = new ClassicCounter<>(500);

  // Marginal counts of each tag id.
  private Counter<Integer> tagCounter = new ClassicCounter<>(300);

  /**
   * Creates a factored lexicon with the given morphological feature specification.
   *
   * @param morphoSpec language-specific morphological feature extractor
   * @param wordIndex index over word/lemma strings
   * @param tagIndex index over POS tag strings
   */
  public FactoredLexicon(MorphoFeatureSpecification morphoSpec, Index<String> wordIndex, Index<String> tagIndex) {
    super(wordIndex, tagIndex);
    this.morphoSpec = morphoSpec;
  }

  /**
   * Creates a factored lexicon with parser options and a morphological
   * feature specification.
   *
   * @param op parser options forwarded to {@link BaseLexicon}
   * @param morphoSpec language-specific morphological feature extractor
   * @param wordIndex index over word/lemma strings
   * @param tagIndex index over POS tag strings
   */
  public FactoredLexicon(Options op, MorphoFeatureSpecification morphoSpec, Index<String> wordIndex, Index<String> tagIndex) {
    super(op, wordIndex, tagIndex);
    this.morphoSpec = morphoSpec;
  }

  /**
   * Returns the lexical rules (word/tag pairs) applicable to a word.
   * The rule table is keyed by lemmas, so isKnown() is slightly trickier.
   *
   * @param word index of the word in wordIndex
   * @param loc position of the word in the sentence (unused here)
   * @param featureSpec morphological feature specification string (unused here)
   * @return iterator over candidate IntTaggedWord taggings for this word
   */
  @Override
  public Iterator<IntTaggedWord> ruleIteratorByWord(int word, int loc, String featureSpec) {

    if (word == wordIndex.indexOf(BOUNDARY)) {
      // Deterministic tagging of the boundary symbol
      return rulesWithWord[word].iterator();

    } else if (isKnown(word)) {
      // Strict lexical tagging for seen *lemma* types
      // We need to copy the word form into the rules, which currently have lemmas in them
      return rulesWithWord[word].iterator();

    } else {
      if (DEBUG) log.info("UNKNOWN WORD");
      // Unknown word: borrow the tags licensed for the unknown-word signature,
      // re-keyed to this word id instead of the UW signature.
      Set<IntTaggedWord> lexRules = Generics.newHashSet(10);
      List<IntTaggedWord> uwRules = rulesWithWord[wordIndex.indexOf(UNKNOWN_WORD)];
      for (IntTaggedWord iTW : uwRules) {
        lexRules.add(new IntTaggedWord(word, iTW.tag));
      }
      return lexRules.iterator();
    }
  }

  /**
   * Scores a word/tag pair by factoring the lexical probability into word,
   * lemma, and morphological-tag components:
   * log p ≈ log p(word|tag) + log p(lemma|tag) + log p(morph|tag).
   * The lemma factor is currently disabled (contributes 0 — see the
   * commented-out call below).
   *
   * @param iTW word/tag pair as indices into wordIndex/tagIndex
   * @param loc position of the word in the sentence
   * @param word token string; together with featureSpec it splits into a
   *             lemma and a rich morphological tag
   * @param featureSpec morphological analysis string for this token
   * @return log probability, or Float.NEGATIVE_INFINITY for very
   *         low-probability taggings
   */
  @Override
  public float score(IntTaggedWord iTW, int loc, String word, String featureSpec) {
    final int wordId = iTW.word();
    final int tagId = iTW.tag();

    // Force 1-best path to go through the boundary symbol
    // (deterministic tagging)
    final int boundaryId = wordIndex.indexOf(BOUNDARY);
    final int boundaryTagId = tagIndex.indexOf(BOUNDARY_TAG);
    if (wordId == boundaryId && tagId == boundaryTagId) {
      return 0.0f;
    }

    // Split the token into lemma + rich morphological tag, then reduce the
    // rich tag to the language-specific feature bundle used during training.
    String tag = tagIndex.get(iTW.tag());
    Pair<String,String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(word, featureSpec);
    String lemma = lemmaMorph.first();
    int lemmaId = wordIndex.indexOf(lemma);
    String richMorphTag = lemmaMorph.second();
    String reducedMorphTag = morphoSpec.strToFeatures(richMorphTag).toString().trim();
    reducedMorphTag = reducedMorphTag.length() == 0 ? NO_MORPH_ANALYSIS : reducedMorphTag;
    int morphId = morphIndex.addToIndex(reducedMorphTag);

    // Score the factors and create the rule score p_W_T
    double p_W_Tf = Math.log(probWordTag(word, loc, wordId, tagId));
//    double p_L_T = Math.log(probLemmaTag(word, loc, tagId, lemmaId));
    double p_L_T = 0.0;  // lemma factor disabled
    double p_M_T = Math.log(probMorphTag(tagId, morphId));
    double p_W_T = p_W_Tf + p_L_T + p_M_T;

    if (DEBUG) {
      System.err.printf("WSGDEBUG: %s --> %s %s %s ||  %.10f (%.5f / %.5f / %.5f)%n", tag, word, lemma,
          reducedMorphTag, p_W_T, p_W_Tf, p_L_T, p_M_T);
    }

    // Filter low probability taggings
    return p_W_T > -100.0 ? (float) p_W_T : Float.NEGATIVE_INFINITY;
  }

  /**
   * p(word | tag), computed generatively as p(tag|word) * p(word) / p(tag)
   * for seen words; unseen words fall back to the unknown-word model's
   * signature-based score.
   */
  private double probWordTag(String word, int loc, int wordId, int tagId) {
    // Count of this word across all tags, and jointly with this tag
    double cW = wordTag.totalCount(wordId);
    double cWT = wordTag.getCount(wordId, tagId);

    // p_W: marginal probability of the word
    double p_W = cW / wordTag.totalCount();

    // p_T: marginal probability of the tag
    double cTseen = tagCounter.getCount(tagId);
    double p_T = cTseen / tagCounter.totalCount();

    // p_W_T via Bayes inversion of p_T_W
    double p_W_T = 0.0;
    if (cW > 0.0) { // Seen lemma
      double p_T_W = 0.0;
      if (cW > 100.0 && cWT > 0.0) {
        // Frequent word: plain relative frequency p(tag|word)
        p_T_W = cWT / cW;
      } else {
        // Rare word: smooth toward the tag distribution over unseen words
        double cTunseen = wordTagUnseen.getCount(tagId);
        // TODO p_T_U is 0?
        double p_T_U = cTunseen / wordTagUnseen.totalCount();
        p_T_W = (cWT + smooth[1]*p_T_U) / (cW + smooth[1]);
      }
      // p(word|tag) = p(tag|word) * p(word) / p(tag)
      p_W_T = p_T_W * p_W / p_T;

    } else { // Unseen word. Score based on the word signature (of the surface form)
      IntTaggedWord iTW = new IntTaggedWord(wordId, tagId);
      double c_T = tagCounter.getCount(tagId);
      // Unknown-word model returns a log score; convert back to a probability
      p_W_T = Math.exp(getUnknownWordModel().score(iTW, loc, c_T, tagCounter.totalCount(), smooth[0], word));
    }

    return p_W_T;
  }

  /**
   * p(lemma | tag), computed generatively as p(tag|lemma) * p(lemma) / p(tag).
   * This method should never return 0!!
   * (Currently unused by score(); the lemma factor is disabled there.)
   */
  private double probLemmaTag(String word, int loc, int tagId, int lemmaId) {
    double lemmaCount = lemmaTag.totalCount(lemmaId);
    double lemmaWithTag = lemmaTag.getCount(lemmaId, tagId);

    // Marginal probabilities of the lemma and of the tag
    double pLemma = lemmaCount / lemmaTag.totalCount();
    double cTseen = tagCounter.getCount(tagId);
    double pTag = cTseen / tagCounter.totalCount();

    if (lemmaCount > 0.0) {
      // Seen lemma: relative frequency for frequent lemmas; otherwise smooth
      // toward the tag distribution observed over unseen lemmas.
      double pTagGivenLemma;
      if (lemmaCount > 100.0 && lemmaWithTag > 0.0) {
        pTagGivenLemma = lemmaWithTag / lemmaCount;
      } else {
        double cTunseen = lemmaTagUnseen.getCount(tagId);
        // TODO(spenceg): p_T_U is 0??
        double pTagUnseen = cTunseen / lemmaTagUnseen.totalCount();
        pTagGivenLemma = (lemmaWithTag + smooth[1]*pTagUnseen) / (lemmaCount + smooth[1]);
      }
      // Bayes inversion
      return pTagGivenLemma * pLemma / pTag;
    }

    // Unseen lemma: crude back-off to the unseen-lemma tag counts
    // (original author marked this a "Hack"; an alternative scored via the
    // unknown-word model on the surface form was left disabled upstream).
    double cTunseen = lemmaTagUnseen.getCount(tagId);
    return cTunseen / tagCounter.totalCount();
  }

  /**
   * p(morph | tag), computed generatively as p(tag|morph) * p(morph) / p(tag).
   * This method should never return 0!
   */
  private double probMorphTag(int tagId, int morphId) {
    double morphCount = morphTag.totalCount(morphId);
    double morphWithTag = morphTag.getCount(morphId, tagId);

    if (morphCount > 100.0 && morphWithTag > 0.0) {
      // Frequent analysis: Bayes inversion of the relative frequency p(tag|morph)
      double pTagGivenMorph = morphWithTag / morphCount;
      double pMorph = morphCount / morphTag.totalCount();
      double cTseen = tagCounter.getCount(tagId);
      double pTag = cTseen / tagCounter.totalCount();
      return pTagGivenMorph * pMorph / pTag;
    }

    // Unseen morphological analysis.
    // Hack: unseen morph tags are extremely rare, so use add-1 smoothing.
    return 1.0 / (morphTag.totalCount() + tagIndex.size() + 1.0);
  }

  /**
   * This method should populate wordIndex, tagIndex, and morphIndex.
   */
  @Override
  public void train(Collection trees, Collection rawTrees) {
    double weight = 1.0;
    // Train uw model on words
    uwModelTrainer.train(trees, weight);

    final double numTrees = trees.size();
    Iterator rawTreesItr = rawTrees == null ? null : rawTrees.iterator();
    Iterator treeItr = trees.iterator();

    // Train factored lexicon on lemmas and morph tags
    int treeId = 0;
    while (treeItr.hasNext()) {
      Tree tree = treeItr.next();
      // CoreLabels, with morph analysis in the originalText annotation
      List




© 2015 - 2024 Weber Informatics LLC | Privacy Policy