
edu.stanford.nlp.sequences.ObjectBankWrapper Maven / Gradle / Ivy


Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
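For orientation, here is a minimal sketch of driving these analyses through the standard StanfordCoreNLP pipeline API. The demo class name and input sentence are illustrative, and it assumes the matching CoreNLP models jar is on the classpath:

import java.util.Properties;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class PipelineDemo {
  public static void main(String[] args) {
    // Standard annotator chain: tokenize, sentence-split, POS-tag, lemmatize, NER.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    CoreDocument doc = new CoreDocument("Stanford University was founded in 1885.");
    pipeline.annotate(doc);

    for (CoreLabel tok : doc.tokens()) {
      // word, part of speech, lemma, and named-entity tag per token
      System.out.printf("%s\t%s\t%s\t%s%n", tok.word(), tok.tag(), tok.lemma(), tok.ner());
    }
  }
}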

package edu.stanford.nlp.sequences;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.process.Americanize;
import edu.stanford.nlp.process.WordShapeClassifier;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.util.AbstractIterator;
import edu.stanford.nlp.util.CoreMap;

import java.util.*;
import java.util.regex.Pattern;


/**
 * This class is used to wrap the ObjectBank used by the sequence
 * models and is where any sort of general processing, like the IOB mapping
 * stuff and wordshape stuff, should go.
 * It checks the SeqClassifierFlags to decide what to do.
 * <p>
 * TODO: We should rearchitect this so that the FeatureFactory-specific
 * stuff is done by a callback to the relevant FeatureFactory.
 *
 * @author Jenny Finkel
 */
public class ObjectBankWrapper<IN extends CoreMap> extends ObjectBank<List<IN>> {

  private static final long serialVersionUID = -3838331732026362075L;

  private final SeqClassifierFlags flags;
  private final ObjectBank<List<IN>> wrapped;
  private final Set<String> knownLCWords;


  public ObjectBankWrapper(SeqClassifierFlags flags, ObjectBank<List<IN>> wrapped,
                           Set<String> knownLCWords) {
    super(null, null);
    this.flags = flags;
    this.wrapped = wrapped;
    this.knownLCWords = knownLCWords;
  }


  @Override
  public Iterator<List<IN>> iterator() {
    return new WrappedIterator(wrapped.iterator());
  }

  private class WrappedIterator extends AbstractIterator<List<IN>> {

    Iterator<List<IN>> wrappedIter;
    Iterator<List<IN>> spilloverIter;

    public WrappedIterator(Iterator<List<IN>> wrappedIter) {
      this.wrappedIter = wrappedIter;
    }

    @Override
    public boolean hasNext() {
      while ((spilloverIter == null || ! spilloverIter.hasNext()) &&
             wrappedIter.hasNext()) {
        List<IN> doc = wrappedIter.next();
        List<List<IN>> docs = new ArrayList<>();
        docs.add(doc);
        fixDocLengths(docs);
        spilloverIter = docs.iterator();
      }
      return wrappedIter.hasNext() ||
             (spilloverIter != null && spilloverIter.hasNext());
    }

    @Override
    public List<IN> next() {
      // this while loop now is redundant because it should
      // have already been done in "hasNext".
      // I'm keeping it so that the diff is minimal.
      // -pichuan
      while (spilloverIter == null || ! spilloverIter.hasNext()) {
        List<IN> doc = wrappedIter.next();
        List<List<IN>> docs = new ArrayList<>();
        docs.add(doc);
        fixDocLengths(docs);
        spilloverIter = docs.iterator();
      }
      return processDocument(spilloverIter.next());
    }

  } // end class WrappedIterator


  public List<IN> processDocument(List<IN> doc) {
    if (flags.mergeTags) { mergeTags(doc); }
    if (flags.iobTags) { iobTags(doc); }
    doBasicStuff(doc);
    return doc;
  }

  private String intern(String s) {
    if (flags.intern) {
      return s.intern();
    } else {
      return s;
    }
  }

  private final Pattern monthDayPattern = Pattern.compile("Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|January|February|March|April|May|June|July|August|September|October|November|December", Pattern.CASE_INSENSITIVE);

  private String fix(String word) {
    if (flags.normalizeTerms || flags.normalizeTimex) {
      // Same case for days/months: map to lowercase
      if (monthDayPattern.matcher(word).matches()) {
        return word.toLowerCase();
      }
    }
    if (flags.normalizeTerms) {
      return Americanize.americanize(word, false);
    }
    return word;
  }

  private void doBasicStuff(List<IN> doc) {
    int position = 0;
    for (IN fl : doc) {
      // position in document
      fl.set(CoreAnnotations.PositionAnnotation.class, Integer.toString(position++));

      // word shape
      if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) && ( ! flags.useShapeStrings)) {
        // TODO: if we pass in a FeatureFactory, as suggested by an earlier comment,
        // we should use that FeatureFactory's getWord function
        String word = fl.get(CoreAnnotations.TextAnnotation.class);
        if (flags.wordFunction != null) {
          word = flags.wordFunction.apply(word);
        }
        if ( ! word.isEmpty() && Character.isLowerCase(word.codePointAt(0))) {
          knownLCWords.add(word);
        }
        String s = intern(WordShapeClassifier.wordShape(word, flags.wordShape, knownLCWords));
        fl.set(CoreAnnotations.ShapeAnnotation.class, s);
      }

      // normalizing and interning
      // was the following; should presumably now be
      // if ("CTBSegDocumentReader".equalsIgnoreCase(flags.documentReader)) {
      if ("edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter".equalsIgnoreCase(flags.readerAndWriter)) {
        // for Chinese segmentation, "word" is no use and ignore goldAnswer for memory efficiency.
        fl.set(CoreAnnotations.CharAnnotation.class, intern(fix(fl.get(CoreAnnotations.CharAnnotation.class))));
      } else {
        fl.set(CoreAnnotations.TextAnnotation.class, intern(fix(fl.get(CoreAnnotations.TextAnnotation.class))));
        // only override GoldAnswer if not set - so that a DocumentReaderAndWriter can set it right in the first place.
        if (fl.get(CoreAnnotations.GoldAnswerAnnotation.class) == null) {
          fl.set(CoreAnnotations.GoldAnswerAnnotation.class, fl.get(CoreAnnotations.AnswerAnnotation.class));
        }
      }
    }
  }

  /**
   * Take a {@link List} of documents (which are themselves {@link List}s
   * of something that extends {@link CoreMap}, CoreLabel by default),
   * and if any are longer than the length specified by flags.maxDocSize,
   * split them up. If maxDocSize is negative, nothing is changed.
   * In practice, documents need to be not too long or else the CRF
   * inference will fail due to numerical problems.
   * This method tries to be smart and split on sentence boundaries,
   * but this is hard-coded to English.
   *
   * @param docs The list of documents whose length might be adjusted.
   */
  private void fixDocLengths(List<List<IN>> docs) {
    final int maxDocSize = flags.maxDocSize;

    WordToSentenceProcessor<IN> wts = new WordToSentenceProcessor<>();
    List<List<IN>> newDocuments = new ArrayList<>();
    for (List<IN> document : docs) {
      if (maxDocSize <= 0 || document.size() <= maxDocSize) {
        if (flags.keepEmptySentences || ! document.isEmpty()) {
          newDocuments.add(document);
        }
        continue;
      }
      List<List<IN>> sentences = wts.process(document);
      List<IN> newDocument = new ArrayList<>();
      for (List<IN> sentence : sentences) {
        if (newDocument.size() + sentence.size() > maxDocSize) {
          if ( ! newDocument.isEmpty()) {
            newDocuments.add(newDocument);
          }
          newDocument = new ArrayList<>();
        }
        newDocument.addAll(sentence);
      }
      if (flags.keepEmptySentences || ! newDocument.isEmpty()) {
        newDocuments.add(newDocument);
      }
    }

    docs.clear();
    docs.addAll(newDocuments);
  }

  private void iobTags(List<IN> doc) {
    String lastTag = "";
    for (IN wi : doc) {
      String answer = wi.get(CoreAnnotations.AnswerAnnotation.class);
      if (answer != null && ! flags.backgroundSymbol.equals(answer)) {
        int index = answer.indexOf('-');
        String prefix;
        String label;
        if (index < 0) {
          prefix = "";
          label = answer;
        } else {
          prefix = answer.substring(0, index);
          label = answer.substring(index + 1);
        }
        if ( ! prefix.equals("B")) {
          if ( ! label.equals(lastTag)) {
            wi.set(CoreAnnotations.AnswerAnnotation.class, "B-" + label);
          } else {
            wi.set(CoreAnnotations.AnswerAnnotation.class, "I-" + label);
          }
        }
        lastTag = label;
      } else {
        lastTag = answer;
      }
    }
  }

  /** Change some form of IOB/IOE encoding via forms like "I-PERS" to
   *  IO encoding as just "PERS".
   *
   *  @param doc The document for which the AnswerAnnotation will be changed (in place)
   */
  private void mergeTags(List<IN> doc) {
    for (IN wi : doc) {
      String answer = wi.get(CoreAnnotations.AnswerAnnotation.class);
      if (answer == null) {
        continue;
      }
      if ( ! answer.equals(flags.backgroundSymbol)) {
        int index = answer.indexOf('-');
        if (index >= 0) {
          answer = answer.substring(index + 1);
        }
      }
      wi.set(CoreAnnotations.AnswerAnnotation.class, answer);
    }
  }


  // all the other crap from ObjectBank

  @Override
  public boolean add(List<IN> o) { return wrapped.add(o); }

  @Override
  public boolean addAll(Collection<? extends List<IN>> c) { return wrapped.addAll(c); }

  @Override
  public void clear() { wrapped.clear(); }

  @Override
  public void clearMemory() { wrapped.clearMemory(); }

  public boolean contains(List<IN> o) { return wrapped.contains(o); }

  @Override
  public boolean containsAll(Collection<?> c) { return wrapped.containsAll(c); }

  @Override
  public boolean isEmpty() { return wrapped.isEmpty(); }

  @Override
  public void keepInMemory(boolean keep) { wrapped.keepInMemory(keep); }

  public boolean remove(List<IN> o) { return wrapped.remove(o); }

  @Override
  public boolean removeAll(Collection<?> c) { return wrapped.removeAll(c); }

  @Override
  public boolean retainAll(Collection<?> c) { return wrapped.retainAll(c); }

  @Override
  public int size() { return wrapped.size(); }

  @Override
  public Object[] toArray() { return wrapped.toArray(); }

  public List<IN>[] toArray(List<IN>[] o) { return wrapped.toArray(o); }

} // end class ObjectBankWrapper
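As a rough illustration of how the wrapper behaves, here is a minimal sketch that feeds it one in-memory document. The demo class name is hypothetical, and the anonymous ObjectBank stand-in replaces the ObjectBank a DocumentReaderAndWriter would normally produce; in CoreNLP this wrapper is constructed internally (e.g., by the sequence classifiers) rather than by end users:

import java.util.*;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.sequences.ObjectBankWrapper;
import edu.stanford.nlp.sequences.SeqClassifierFlags;

public class ObjectBankWrapperDemo {
  public static void main(String[] args) {
    SeqClassifierFlags flags = new SeqClassifierFlags();
    flags.iobTags = true;   // re-derive consistent B-/I- prefixes on AnswerAnnotation
    flags.maxDocSize = -1;  // negative: never split documents (per the fixDocLengths javadoc)

    // One tiny two-token "document": "John Smith", both tagged I-PER.
    CoreLabel t1 = new CoreLabel();
    t1.set(CoreAnnotations.TextAnnotation.class, "John");
    t1.set(CoreAnnotations.AnswerAnnotation.class, "I-PER");
    CoreLabel t2 = new CoreLabel();
    t2.set(CoreAnnotations.TextAnnotation.class, "Smith");
    t2.set(CoreAnnotations.AnswerAnnotation.class, "I-PER");
    final List<List<CoreLabel>> docs = Collections.singletonList(Arrays.asList(t1, t2));

    // In-memory stand-in for the ObjectBank a DocumentReaderAndWriter would produce,
    // using the same super(null, null) + iterator() override trick as the wrapper itself.
    ObjectBank<List<CoreLabel>> raw = new ObjectBank<List<CoreLabel>>(null, null) {
      @Override public Iterator<List<CoreLabel>> iterator() { return docs.iterator(); }
    };

    ObjectBankWrapper<CoreLabel> bank =
        new ObjectBankWrapper<>(flags, raw, new HashSet<String>());
    for (List<CoreLabel> doc : bank) {
      for (CoreLabel tok : doc) {
        // with iobTags on, the first entity token becomes "B-PER", the second stays "I-PER"
        System.out.println(tok.get(CoreAnnotations.TextAnnotation.class) + " " +
                           tok.get(CoreAnnotations.AnswerAnnotation.class));
      }
    }
  }
}

Setting flags.mergeTags instead would run the opposite normalization: mergeTags strips the prefixes, collapsing both tokens to plain "PER" (IO encoding).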
