
edu.stanford.nlp.sequences.ObjectBankWrapper

package edu.stanford.nlp.sequences;

import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.GoldAnswerAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PositionAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.ShapeAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.process.Americanize;
import edu.stanford.nlp.process.WordShapeClassifier;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.util.AbstractIterator;
import edu.stanford.nlp.util.CoreMap;

import java.util.*;
import java.util.regex.Pattern;


/**
 * This class is used to wrap the ObjectBank used by the sequence
 * models, and is where any sort of general processing, like the IOB
 * mapping and word-shape computation, should go.
 * It checks the SeqClassifierFlags to decide what to do.
 * <p>
 * TODO: We should rearchitect this so that the FeatureFactory-specific
 * stuff is done by a callback to the relevant FeatureFactory.
 *
 * @author Jenny Finkel
 */
public class ObjectBankWrapper<IN extends CoreMap> extends ObjectBank<List<IN>> {

  private static final long serialVersionUID = -3838331732026362075L;

  private final SeqClassifierFlags flags;
  private final ObjectBank<List<IN>> wrapped;
  private final Set<String> knownLCWords;


  public ObjectBankWrapper(SeqClassifierFlags flags, ObjectBank<List<IN>> wrapped,
                           Set<String> knownLCWords) {
    super(null, null);
    this.flags = flags;
    this.wrapped = wrapped;
    this.knownLCWords = knownLCWords;
  }


  @Override
  public Iterator<List<IN>> iterator() {
    Iterator<List<IN>> iter = new WrappedIterator(wrapped.iterator());
    return iter;
  }


  private class WrappedIterator extends AbstractIterator<List<IN>> {

    Iterator<List<IN>> wrappedIter;
    Iterator<List<IN>> spilloverIter;

    public WrappedIterator(Iterator<List<IN>> wrappedIter) {
      this.wrappedIter = wrappedIter;
    }

    @Override
    public boolean hasNext() {
      while ((spilloverIter == null || !spilloverIter.hasNext()) &&
             wrappedIter.hasNext()) {
        List<IN> doc = wrappedIter.next();
        List<List<IN>> docs = new ArrayList<List<IN>>();
        docs.add(doc);
        fixDocLengths(docs);
        spilloverIter = docs.iterator();
      }
      return wrappedIter.hasNext() ||
             (spilloverIter != null && spilloverIter.hasNext());
    }

    @Override
    public List<IN> next() {
      // This while loop is now redundant because the work should already
      // have been done in hasNext(); it is kept so that the diff is
      // minimal. -pichuan
      while (spilloverIter == null || !spilloverIter.hasNext()) {
        List<IN> doc = wrappedIter.next();
        List<List<IN>> docs = new ArrayList<List<IN>>();
        docs.add(doc);
        fixDocLengths(docs);
        spilloverIter = docs.iterator();
      }
      return processDocument(spilloverIter.next());
    }
  }


  public List<IN> processDocument(List<IN> doc) {
    if (flags.mergeTags) { mergeTags(doc); }
    if (flags.iobTags) { iobTags(doc); }
    doBasicStuff(doc);

    return doc;
  }


  private String intern(String s) {
    if (flags.intern) {
      return s.intern();
    } else {
      return s;
    }
  }


  private final Pattern monthDayPattern = Pattern.compile(
      "Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|January|February|March|April|May|June|July|August|September|October|November|December",
      Pattern.CASE_INSENSITIVE);

  private String fix(String word) {
    if (flags.normalizeTerms || flags.normalizeTimex) {
      // Same case for days/months: map to lowercase
      if (monthDayPattern.matcher(word).matches()) {
        return word.toLowerCase();
      }
    }
    if (flags.normalizeTerms) {
      return Americanize.americanize(word, false);
    }
    return word;
  }


  private void doBasicStuff(List<IN> doc) {
    int position = 0;
    for (IN fl : doc) {
      // position in document
      fl.set(PositionAnnotation.class, Integer.toString(position++));

      // word shape
      if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) &&
          (!flags.useShapeStrings)) {
        // TODO: if we pass in a FeatureFactory, as suggested by an
        // earlier comment, we should use that FeatureFactory's
        // getWord function
        String word = fl.get(TextAnnotation.class);
        if (flags.wordFunction != null) {
          word = flags.wordFunction.apply(word);
        }
        if (word.length() > 0) {
          char ch = word.charAt(0);
          if (Character.isLowerCase(ch)) {
            knownLCWords.add(word);
          }
        }
        String s = intern(WordShapeClassifier.wordShape(word, flags.wordShape, knownLCWords));
        fl.set(ShapeAnnotation.class, s);
      }

      // normalizing and interning
      // was the following; should presumably now be
      // if ("CTBSegDocumentReader".equalsIgnoreCase(flags.documentReader)) {
      if ("edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter".equalsIgnoreCase(flags.readerAndWriter)) {
        // for Chinese segmentation, "word" is of no use, and goldAnswer is
        // ignored for memory efficiency
        fl.set(CharAnnotation.class, intern(fix(fl.get(CharAnnotation.class))));
      } else {
        fl.set(TextAnnotation.class, intern(fix(fl.get(TextAnnotation.class))));
        fl.set(GoldAnswerAnnotation.class, fl.get(AnswerAnnotation.class));
      }
    }
  }


  /**
   * Take a {@link List} of documents (which are themselves {@link List}s
   * of something that extends {@link CoreMap}, CoreLabel by default),
   * and if any are longer than the length specified by flags.maxDocSize,
   * split them up. If maxDocSize is negative, nothing is changed. In
   * practice, documents must not be too long, or the CRF inference will
   * fail due to numerical problems. This method tries to be smart and
   * split on sentence boundaries, but the sentence splitting is
   * hard-coded to English.
   *
   * @param docs The list of documents whose length might be adjusted.
   */
  private void fixDocLengths(List<List<IN>> docs) {
    final int maxDocSize = flags.maxDocSize;

    WordToSentenceProcessor<IN> wts = new WordToSentenceProcessor<IN>();
    List<List<IN>> newDocuments = new ArrayList<List<IN>>();
    for (List<IN> document : docs) {
      if (maxDocSize <= 0 || document.size() <= maxDocSize) {
        if (!document.isEmpty()) {
          newDocuments.add(document);
        }
        continue;
      }
      List<List<IN>> sentences = wts.process(document);
      List<IN> newDocument = new ArrayList<IN>();
      for (List<IN> sentence : sentences) {
        if (newDocument.size() + sentence.size() > maxDocSize) {
          if (!newDocument.isEmpty()) {
            newDocuments.add(newDocument);
          }
          newDocument = new ArrayList<IN>();
        }
        newDocument.addAll(sentence);
      }
      if (!newDocument.isEmpty()) {
        newDocuments.add(newDocument);
      }
    }

    docs.clear();
    docs.addAll(newDocuments);
  }


  private void iobTags(List<IN> doc) {
    String lastTag = "";
    for (IN wi : doc) {
      String answer = wi.get(AnswerAnnotation.class);
      if (!answer.equals(flags.backgroundSymbol)) {
        int index = answer.indexOf('-');
        String prefix;
        String label;
        if (index < 0) {
          prefix = "";
          label = answer;
        } else {
          prefix = answer.substring(0, index);
          label = answer.substring(index + 1);
        }
        if (!prefix.equals("B")) {
          if (!lastTag.equals(label)) {
            wi.set(AnswerAnnotation.class, "B-" + label);
          } else {
            wi.set(AnswerAnnotation.class, "I-" + label);
          }
        }
        lastTag = label;
      } else {
        lastTag = answer;
      }
    }
  }


  private void mergeTags(List<IN> doc) {
    for (IN wi : doc) {
      String answer = wi.get(AnswerAnnotation.class);
      if (answer == null) { continue; }
      if (!answer.equals(flags.backgroundSymbol)) {
        int index = answer.indexOf('-');
        if (index >= 0) {
          answer = answer.substring(index + 1);
        }
      }
      wi.set(AnswerAnnotation.class, answer);
    }
  }


  // all the other crap from ObjectBank
  @Override
  public boolean add(List<IN> o) { return wrapped.add(o); }
  @Override
  public boolean addAll(Collection<? extends List<IN>> c) { return wrapped.addAll(c); }
  @Override
  public void clear() { wrapped.clear(); }
  @Override
  public void clearMemory() { wrapped.clearMemory(); }
  public boolean contains(List<IN> o) { return wrapped.contains(o); }
  @Override
  public boolean containsAll(Collection<?> c) { return wrapped.containsAll(c); }
  @Override
  public boolean isEmpty() { return wrapped.isEmpty(); }
  @Override
  public void keepInMemory(boolean keep) { wrapped.keepInMemory(keep); }
  public boolean remove(List<IN> o) { return wrapped.remove(o); }
  @Override
  public boolean removeAll(Collection<?> c) { return wrapped.removeAll(c); }
  @Override
  public boolean retainAll(Collection<?> c) { return wrapped.retainAll(c); }
  @Override
  public int size() { return wrapped.size(); }
  @Override
  public Object[] toArray() { return wrapped.toArray(); }
  public List<IN>[] toArray(List<IN>[] o) { return wrapped.toArray(o); }

} // end class ObjectBankWrapper
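
For orientation, here is a minimal sketch of the per-document processing in isolation. The class name ObjectBankWrapperSketch, the flag settings, and the token values are illustrative assumptions, not part of the class above; the wrapped bank is passed as null only because processDocument(...) never touches it. With iobTags enabled and the default background symbol "O", a bare "ORGANIZATION" answer should come back as "B-ORGANIZATION", with a PositionAnnotation of "0".

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.sequences.ObjectBankWrapper;
import edu.stanford.nlp.sequences.SeqClassifierFlags;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;

public class ObjectBankWrapperSketch {
  public static void main(String[] args) {
    SeqClassifierFlags flags = new SeqClassifierFlags();
    flags.iobTags = true;  // assumed setting: rewrite answers into B-/I- form

    // Build a tiny one-token "document" by hand (illustrative values).
    CoreLabel token = new CoreLabel();
    token.set(CoreAnnotations.TextAnnotation.class, "Stanford");
    token.set(CoreAnnotations.AnswerAnnotation.class, "ORGANIZATION");
    List<CoreLabel> doc = new ArrayList<>();
    doc.add(token);

    // processDocument() never uses the wrapped bank, so null suffices here.
    ObjectBankWrapper<CoreLabel> wrapper =
        new ObjectBankWrapper<>(flags, null, new HashSet<String>());
    wrapper.processDocument(doc);

    // Expect "B-ORGANIZATION" and position "0".
    System.out.println(token.get(CoreAnnotations.AnswerAnnotation.class));
    System.out.println(token.get(CoreAnnotations.PositionAnnotation.class));
  }
}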

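The splitting policy in fixDocLengths(...) is greedy packing: whole sentences are appended to the current chunk until adding the next sentence would exceed maxDocSize, at which point the chunk is flushed (a single sentence longer than maxDocSize still ends up as one oversized chunk). Below is a self-contained re-implementation of just that packing policy on plain lists; it is a sketch for illustration, not Stanford NLP API.

import java.util.ArrayList;
import java.util.List;

public class GreedySentencePacking {
  /** Greedily pack whole sentences into chunks of at most maxSize tokens,
   *  mirroring the policy of ObjectBankWrapper.fixDocLengths(). */
  static <T> List<List<T>> pack(List<List<T>> sentences, int maxSize) {
    List<List<T>> chunks = new ArrayList<>();
    List<T> current = new ArrayList<>();
    for (List<T> sentence : sentences) {
      if (current.size() + sentence.size() > maxSize) {
        if (!current.isEmpty()) { chunks.add(current); }
        current = new ArrayList<>();
      }
      current.addAll(sentence);  // an oversized sentence still becomes one chunk
    }
    if (!current.isEmpty()) { chunks.add(current); }
    return chunks;
  }

  public static void main(String[] args) {
    List<List<String>> sents = List.of(
        List.of("a", "b", "c"), List.of("d", "e"), List.of("f"));
    System.out.println(pack(sents, 4));  // [[a, b, c], [d, e, f]]
  }
}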

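Finally, mergeTags(...) and iobTags(...) together canonicalize a label sequence: mergeTags strips any existing B-/I- prefix, and iobTags then marks the first token of each run of a label with B- and the rest with I-. The standalone sketch below replays that round trip on plain strings, simplified on the assumption that mergeTags has already removed all prefixes; the background symbol "O" mirrors the SeqClassifierFlags default.

import java.util.ArrayList;
import java.util.List;

public class IobRoundTrip {
  public static void main(String[] args) {
    List<String> answers = new ArrayList<>(
        List.of("I-PER", "I-PER", "O", "B-LOC", "I-LOC"));

    // mergeTags: strip any B-/I- prefix from non-background labels.
    for (int i = 0; i < answers.size(); i++) {
      String a = answers.get(i);
      int dash = a.indexOf('-');
      if (!a.equals("O") && dash >= 0) { answers.set(i, a.substring(dash + 1)); }
    }
    // now: [PER, PER, O, LOC, LOC]

    // iobTags: B- starts a run of a label, I- continues it.
    String last = "";
    for (int i = 0; i < answers.size(); i++) {
      String a = answers.get(i);
      if (!a.equals("O")) {
        answers.set(i, (a.equals(last) ? "I-" : "B-") + a);
        last = a;
      } else {
        last = a;
      }
    }
    System.out.println(answers);  // [B-PER, I-PER, O, B-LOC, I-LOC]
  }
}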


