edu.stanford.nlp.tagger.maxent.ExtractorVerbalVBNZero Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7

Show newest version

package edu.stanford.nlp.tagger.maxent; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.regex.Pattern;


/**
 * Look for verbs selecting a VBN verb.
 * This is now a zeroth order observed data only feature.
 * But reminiscent of what was done in Toutanova and Manning 2000.
 * It doesn't seem to help tagging performance any more.
 * <p>
 * The extracted value is {@code "1"} when, scanning leftwards from the
 * current word over at most {@code bound} words, a have/be/get form
 * (see {@code vbnWord}) is found before any conjunction/punctuation
 * "stopper" or the {@code "NA"} placeholder word. In every other case the
 * value is {@code zeroSt}, a constant that is not declared in this class and
 * is therefore resolved from the superclass chain.
 *
 * @author Christopher Manning
 */
public class ExtractorVerbalVBNZero extends DictionaryExtractor  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ExtractorVerbalVBNZero.class);

  /**
   * Flip to {@code true} to trace precondition checks and detected cues.
   * Off by default so that normal use does not emit one INFO line (and pay
   * for the string concatenation) on every call.
   */
  private static final boolean DEBUG = false;

  private static final String vbnTag = "VBN";
  private static final String vbdTag = "VBD";
  private static final String jjTag = "JJ";
  private static final String edSuff = "ed";
  private static final String enSuff = "en";
  /** Feature value returned when a preceding cue verb was found. */
  private static final String oneSt = "1";
  /** Placeholder word that terminates the leftward scan when encountered. */
  private static final String naWord = "NA";

  /** Maximum number of preceding words inspected by {@link #extract}. */
  private final int bound;
  /** Words that end the leftward scan without producing a cue (matched case-insensitively, whole word). */
  private static final Pattern stopper = Pattern.compile("(?i:and|or|but|,|;|-|--)");
  /** Forms of have/be/get (and clitics) that count as a cue (matched case-insensitively, whole word). */
  private static final Pattern vbnWord = Pattern.compile("(?i:have|has|having|had|is|am|are|was|were|be|being|been|'ve|'s|s|'d|'re|'m|gotten|got|gets|get|getting)"); // cf. list in EnglishPTBTreebankCorrector


  /**
   * @param bound how many words to the left of the current word are examined
   *              when looking for a cue verb; a value of 0 or less means no
   *              preceding word is examined
   */
  public ExtractorVerbalVBNZero(int bound) {
    this.bound = bound;
  }


  /**
   * Accepts only the tags this feature is meant to discriminate between.
   *
   * @param tag the candidate tag
   * @return {@code true} iff {@code tag} is VBN, VBD or JJ
   */
  @Override
  public boolean precondition(String tag) {
    boolean applicable = tag.equals(vbnTag) || tag.equals(vbdTag) || tag.equals(jjTag);
    if (DEBUG) {
      log.info("VBN: Testing precondition on " + tag + ": " + applicable);
    }
    return applicable;
  }


  /**
   * Computes the feature value for the current position of {@code h}.
   *
   * @param h  the history identifying the current position
   * @param pH supplies the words at offsets relative to {@code h}
   * @return {@code "1"} if a cue verb precedes the current word within the
   *         window (see class comment), otherwise {@code zeroSt}
   */
  @Override
  String extract(History h, PairsHolder pH) {
    String cword = pH.getWord(h, 0);
    int allCount = dict.sum(cword);
    int vBNCount = dict.getCount(cword, vbnTag);
    int vBDCount = dict.getCount(cword, vbdTag);

    // Conditions for deciding inapplicable
    // (1) word with a zero dictionary count that does not end in -ed / -en
    if (allCount == 0 && ! (cword.endsWith(edSuff) || cword.endsWith(enSuff))) {
      return zeroSt;
    }
    // (2) word with a non-zero count whose VBN+VBD count is at most 1% of it.
    //     Integer division: for allCount < 100 the threshold is 0, i.e. the
    //     word must have no VBN/VBD count at all to be ruled out here.
    if (allCount > 0 && vBNCount + vBDCount <= allCount / 100) {
      return zeroSt;
    }

    // Scan every preceding position -1, -2, ..., -bound. The loop header is
    // the only place the index is advanced: an additional decrement inside
    // the body used to make the scan skip every second word, so cue verbs and
    // stoppers at even offsets were never seen.
    // NOTE(review): models trained with the skipping behaviour may have seen
    // different values for this feature in some contexts - confirm impact or
    // retrain.
    for (int index = -1; index >= -bound; index--) {
      String word2 = pH.getWord(h, index);
      if (naWord.equals(word2) || stopper.matcher(word2).matches()) {
        break;
      }
      if (vbnWord.matcher(word2).matches()) {
        if (DEBUG) {
          log.info("VBN: For " + cword + ", found preceding VBN cue " + word2);
        }
        return oneSt;
      }
    }

    return zeroSt;
  }

  @Override
  public String toString() {
    return "ExtractorVerbalVBNZero(bound=" + bound + ')';
  }



  private static final long serialVersionUID = -5881204185400060636L;

}