edu.stanford.nlp.parser.lexparser.FactoredLexicon Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification;
import edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification.MorphoFeatureType;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalIntCounter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Treebank;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Pair;
/**
 * A lexicon that factors the word score into surface-form, lemma, and
 * morphological-tag components: {@code score()} combines log p(word|tag),
 * log p(lemma|tag) (currently disabled there), and log p(morph|tag).
 * Morphological analyses are reduced via a language-specific
 * {@link MorphoFeatureSpecification}.
 *
 * @author Spence Green
 *
 */
public class FactoredLexicon extends BaseLexicon {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(FactoredLexicon.class);
private static final long serialVersionUID = -744693222804176489L;
// Enables verbose stderr tracing in score()
private static final boolean DEBUG = false;
// Language-specific mapping from rich morphological strings to reduced feature tags
private MorphoFeatureSpecification morphoSpec;
// Sentinel stored when a token has no morphological analysis
private static final String NO_MORPH_ANALYSIS = "xXxNONExXx";
// Index over reduced morphological tag strings (ids used as keys into morphTag)
private Index morphIndex = new HashIndex<>();
// Joint (word, tag) counts, plus the tag distribution used to smooth rare words
private TwoDimensionalIntCounter wordTag = new TwoDimensionalIntCounter<>(40000);
private Counter wordTagUnseen = new ClassicCounter<>(500);
// Joint (lemma, tag) counts, plus the tag distribution used to smooth rare lemmas
private TwoDimensionalIntCounter lemmaTag = new TwoDimensionalIntCounter<>(40000);
private Counter lemmaTagUnseen = new ClassicCounter<>(500);
// Joint (morph tag, POS tag) counts, plus the tag distribution for rare analyses
private TwoDimensionalIntCounter morphTag = new TwoDimensionalIntCounter<>(500);
private Counter morphTagUnseen = new ClassicCounter<>(500);
// Marginal POS tag counts
private Counter tagCounter = new ClassicCounter<>(300);
/**
 * Builds a factored lexicon backed by the given shared indices.
 *
 * @param morphoSpec language-specific morphological feature specification
 * @param wordIndex shared index mapping word strings to ids
 * @param tagIndex shared index mapping tag strings to ids
 */
public FactoredLexicon(MorphoFeatureSpecification morphoSpec, Index wordIndex, Index tagIndex) {
super(wordIndex, tagIndex);
this.morphoSpec = morphoSpec;
}
/**
 * Builds a factored lexicon with explicit parser options.
 *
 * @param op parser options passed through to the base lexicon
 * @param morphoSpec language-specific morphological feature specification
 * @param wordIndex shared index mapping word strings to ids
 * @param tagIndex shared index mapping tag strings to ids
 */
public FactoredLexicon(Options op, MorphoFeatureSpecification morphoSpec, Index wordIndex, Index tagIndex) {
super(op, wordIndex, tagIndex);
this.morphoSpec = morphoSpec;
}
/**
 * Returns the taggings for a word. The rule table is keyed by lemmas, so
 * isKnown() is slightly trickier: known words (and the boundary symbol)
 * take their stored rules directly, while unknown words borrow the rules
 * of the unknown-word signature, re-keyed to this word's id.
 */
@Override
public Iterator ruleIteratorByWord(int word, int loc, String featureSpec) {
  // Boundary symbol tags deterministically; seen types use strict lexical
  // tagging. Both cases read the stored rule list directly.
  if (word == wordIndex.indexOf(BOUNDARY) || isKnown(word)) {
    return rulesWithWord[word].iterator();
  }
  if (DEBUG) log.info("UNKNOWN WORD");
  // Unknown word: copy the unknown-word signature's rules, substituting
  // this word's id for the signature id.
  Set taggings = Generics.newHashSet(10);
  List signatureRules = rulesWithWord[wordIndex.indexOf(UNKNOWN_WORD)];
  for (IntTaggedWord sigRule : signatureRules) {
    taggings.add(new IntTaggedWord(word, sigRule.tag));
  }
  return taggings.iterator();
}
/**
 * Scores a (word, tag) pair as a sum of log factors:
 * log p(word|tag) + log p(lemma|tag) + log p(morph|tag).
 * The lemma factor is currently disabled (fixed at 0.0).
 *
 * @param iTW the word/tag pair to score
 * @param loc token position in the sentence (passed to the unknown-word model)
 * @param word surface form
 * @param featureSpec rich morphological string attached to the token
 * @return the log score, or {@code Float.NEGATIVE_INFINITY} for very
 *         low-probability taggings
 */
@Override
public float score(IntTaggedWord iTW, int loc, String word, String featureSpec) {
  final int wId = iTW.word();
  final int tId = iTW.tag();
  // The boundary symbol scores log prob 0 so the 1-best path is forced
  // through it (deterministic tagging).
  if (wId == wordIndex.indexOf(BOUNDARY) && tId == tagIndex.indexOf(BOUNDARY_TAG)) {
    return 0.0f;
  }
  // Split the token into lemma and rich morph tag, then reduce the morph
  // tag through the language-specific feature specification.
  String tag = tagIndex.get(iTW.tag());
  Pair lemmaMorph = MorphoFeatureSpecification.splitMorphString(word, featureSpec);
  String lemma = lemmaMorph.first();
  int lemmaId = wordIndex.indexOf(lemma);
  String richMorphTag = lemmaMorph.second();
  String reducedMorphTag = morphoSpec.strToFeatures(richMorphTag).toString().trim();
  if (reducedMorphTag.length() == 0) {
    reducedMorphTag = NO_MORPH_ANALYSIS;
  }
  int morphId = morphIndex.addToIndex(reducedMorphTag);
  // Combine the factors in log space.
  double logPWordGivenTag = Math.log(probWordTag(word, loc, wId, tId));
  // Lemma factor disabled; probLemmaTag(word, loc, tId, lemmaId) would supply it.
  double logPLemmaGivenTag = 0.0;
  double logPMorphGivenTag = Math.log(probMorphTag(tId, morphId));
  double logScore = logPWordGivenTag + logPLemmaGivenTag + logPMorphGivenTag;
  if (DEBUG) {
    System.err.printf("WSGDEBUG: %s --> %s %s %s || %.10f (%.5f / %.5f / %.5f)%n", tag, word, lemma,
        reducedMorphTag, logScore, logPWordGivenTag, logPLemmaGivenTag, logPMorphGivenTag);
  }
  // Filter low probability taggings
  if (logScore > -100.0) {
    return (float) logScore;
  }
  return Float.NEGATIVE_INFINITY;
}
/**
 * p(word | tag), computed by Bayes' rule from counts for seen words and by
 * the unknown-word model (over surface signatures) for unseen ones.
 *
 * @param word surface form (used only by the unknown-word model)
 * @param loc token position (used only by the unknown-word model)
 * @param wordId id of the word in wordIndex
 * @param tagId id of the tag in tagIndex
 */
private double probWordTag(String word, int loc, int wordId, int tagId) {
  double wordCount = wordTag.totalCount(wordId);
  double wordTagCount = wordTag.getCount(wordId, tagId);
  // Marginal p(word)
  double pWord = wordCount / wordTag.totalCount();
  // Marginal p(tag)
  double pTag = tagCounter.getCount(tagId) / tagCounter.totalCount();
  if (wordCount > 0.0) {
    // Seen word: estimate p(tag|word) and invert with Bayes' rule.
    double pTagGivenWord;
    if (wordCount > 100.0 && wordTagCount > 0.0) {
      // Frequent word: trust the MLE.
      pTagGivenWord = wordTagCount / wordCount;
    } else {
      // Rare word: smooth with the tag distribution over unseen words.
      // NOTE(review): wordTagUnseen may be empty, making pTagUnseen 0 (original TODO).
      double pTagUnseen = wordTagUnseen.getCount(tagId) / wordTagUnseen.totalCount();
      pTagGivenWord = (wordTagCount + smooth[1] * pTagUnseen) / (wordCount + smooth[1]);
    }
    return pTagGivenWord * pWord / pTag;
  }
  // Unseen word: score via the word-signature unknown-word model.
  IntTaggedWord itw = new IntTaggedWord(wordId, tagId);
  double tagCount = tagCounter.getCount(tagId);
  return Math.exp(getUnknownWordModel().score(itw, loc, tagCount, tagCounter.totalCount(), smooth[0], word));
}
/**
 * p(lemma | tag), computed by Bayes' rule from counts for seen lemmas and
 * by a crude unseen-lemma back-off otherwise. Currently not used by
 * score(), which fixes the lemma factor at log prob 0.
 * This method should never return 0!!
 *
 * @param word surface form (unused in the active code path)
 * @param loc token position (unused in the active code path)
 * @param tagId id of the tag in tagIndex
 * @param lemmaId id of the lemma in wordIndex
 */
private double probLemmaTag(String word, int loc, int tagId, int lemmaId) {
  double lemmaCount = lemmaTag.totalCount(lemmaId);
  double lemmaTagCount = lemmaTag.getCount(lemmaId, tagId);
  // Marginal p(lemma)
  double pLemma = lemmaCount / lemmaTag.totalCount();
  // Marginal p(tag)
  double pTag = tagCounter.getCount(tagId) / tagCounter.totalCount();
  if (lemmaCount > 0.0) {
    // Seen lemma: estimate p(tag|lemma) and invert with Bayes' rule.
    double pTagGivenLemma;
    if (lemmaCount > 100.0 && lemmaTagCount > 0.0) {
      // Frequent lemma: trust the MLE.
      pTagGivenLemma = lemmaTagCount / lemmaCount;
    } else {
      // Rare lemma: smooth with the tag distribution over unseen lemmas.
      // NOTE(review): lemmaTagUnseen may be empty, making pTagUnseen 0 (original TODO).
      double pTagUnseen = lemmaTagUnseen.getCount(tagId) / lemmaTagUnseen.totalCount();
      pTagGivenLemma = (lemmaTagCount + smooth[1] * pTagUnseen) / (lemmaCount + smooth[1]);
    }
    return pTagGivenLemma * pLemma / pTag;
  }
  // Unseen lemma: hack back-off based on the unseen-lemma tag counts.
  return lemmaTagUnseen.getCount(tagId) / tagCounter.totalCount();
}
/**
 * p(morph | tag), computed by Bayes' rule from counts for frequent
 * morphological analyses and by an add-one style floor otherwise.
 * This method should never return 0!
 *
 * @param tagId id of the tag in tagIndex
 * @param morphId id of the reduced morphological tag in morphIndex
 */
private double probMorphTag(int tagId, int morphId) {
  double morphCount = morphTag.totalCount(morphId);
  double morphTagCount = morphTag.getCount(morphId, tagId);
  // Marginal p(morph)
  double pMorph = morphCount / morphTag.totalCount();
  // Marginal p(tag)
  double pTag = tagCounter.getCount(tagId) / tagCounter.totalCount();
  if (morphCount > 100.0 && morphTagCount > 0.0) {
    // Frequent analysis: invert p(tag|morph) with Bayes' rule.
    double pTagGivenMorph = morphTagCount / morphCount;
    return pTagGivenMorph * pMorph / pTag;
  }
  // Unseen/rare analysis (extremely rare in practice): add-one style floor.
  return 1.0 / (morphTag.totalCount() + tagIndex.size() + 1.0);
}
/**
* This method should populate wordIndex, tagIndex, and morphIndex.
*/
@Override
public void train(Collection trees, Collection rawTrees) {
double weight = 1.0;
// Train uw model on words
uwModelTrainer.train(trees, weight);
final double numTrees = trees.size();
Iterator rawTreesItr = rawTrees == null ? null : rawTrees.iterator();
Iterator treeItr = trees.iterator();
// Train factored lexicon on lemmas and morph tags
int treeId = 0;
while (treeItr.hasNext()) {
Tree tree = treeItr.next();
// CoreLabels, with morph analysis in the originalText annotation
List