edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.sequences; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.ErasureUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.XMLUtils;

import java.io.*;
import java.lang.reflect.Method;
import java.util.*;
import java.util.regex.*;

/**
 * This class provides methods for reading plain text documents and writing out
 * those documents once classified in several different formats.
 * The output formats are named: slashTags, xml, inlineXML, tsv, tabbedEntities.
 * 
 * Implementation note: see
 * itest/src/edu/stanford/nlp/ie/crf/CRFClassifierITest.java for examples and
 * test cases for the output options.
 *
 * This class works over a list of anything that extends {@link CoreMap}.
 * The usual case is {@link CoreLabel}.
 *
 * @author Jenny Finkel
 * @author Christopher Manning (new output options organization)
 * @author Sonal Gupta (made the class generic)
 */
public class PlainTextDocumentReaderAndWriter implements DocumentReaderAndWriter  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(PlainTextDocumentReaderAndWriter.class);

  private static final long serialVersionUID = -2420535144980273136L;

  public enum OutputStyle {
    SLASH_TAGS    ("slashTags"),
    XML           ("xml"),
    INLINE_XML    ("inlineXML"),
    TSV           ("tsv"),
    TABBED        ("tabbedEntities");

    private final String shortName;
    OutputStyle(String shortName) {
      this.shortName = shortName;
    }

    private static final Map shortNames = Generics.newHashMap();

    static {
      for (OutputStyle style : OutputStyle.values())
        shortNames.put(style.shortName, style);
    }

    /** Convert a String expressing an output format to its internal
     *  coding as an OutputStyle.
     *
     *  @param name The String name
     *  @return OutputStyle The internal constant
     */
    public static OutputStyle fromShortName(String name) {
      OutputStyle result = shortNames.get(name);
      if (result == null)
        throw new IllegalArgumentException(name + " is not an OutputStyle");
      return result;
    }

    public static boolean defaultToPreserveSpacing(String str) {
      return str.equals(XML.shortName) || str.equals(INLINE_XML.shortName);
    }

  } // end enum Output style

  private static final Pattern sgml = Pattern.compile("<[^>]*>");
  private final WordToSentenceProcessor wts =
          new WordToSentenceProcessor<>(WordToSentenceProcessor.NewlineIsSentenceBreak.ALWAYS);

  private SeqClassifierFlags flags; // = null;
  private TokenizerFactory tokenizerFactory;

  /**
   * Construct a PlainTextDocumentReaderAndWriter. You should call init() after
   * using the constructor.
   */
  public PlainTextDocumentReaderAndWriter() {
  }

  @Override
  public void init(SeqClassifierFlags flags) {
    String options = "tokenizeNLs=false,invertible=true";
    if (flags.tokenizerOptions != null) {
      options = options + ',' + flags.tokenizerOptions;
    }
    TokenizerFactory factory;
    if (flags.tokenizerFactory != null) {
      try {
        Class> clazz = ErasureUtils.uncheckedCast(Class.forName(flags.tokenizerFactory));
        Method factoryMethod = clazz.getMethod("newCoreLabelTokenizerFactory", String.class);
        factory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, options));
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    } else {
      factory = ErasureUtils.uncheckedCast(PTBTokenizer.PTBTokenizerFactory.newCoreLabelTokenizerFactory(options));
    }
    init(flags, factory);
  }

  public void init(SeqClassifierFlags flags, TokenizerFactory tokenizerFactory) {
    this.flags = flags;
    this.tokenizerFactory = tokenizerFactory;
  }

  // todo: give options for document splitting. A line or the whole file or sentence splitting as now
  @Override
  public Iterator> getIterator(Reader r) {
    Tokenizer tokenizer = tokenizerFactory.getTokenizer(r);
    // PTBTokenizer.newPTBTokenizer(r, false, true);
    List words = new ArrayList<>();
    IN previous = null;
    StringBuilder prepend = new StringBuilder();

    /*
     * This changes SGML tags into whitespace -- it should maybe be moved
     * elsewhere
     */
    while (tokenizer.hasNext()) {
      IN w = tokenizer.next();
      String word = w.get(CoreAnnotations.TextAnnotation.class);
      Matcher m = sgml.matcher(word);
      if (m.matches()) {

        String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class));
        String after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class));
        prepend.append(before).append(word);
        if (previous != null) {
          String previousTokenAfter = StringUtils.getNotNullString(previous.get(CoreAnnotations.AfterAnnotation.class));
          previous.set(CoreAnnotations.AfterAnnotation.class, previousTokenAfter + word + after);
        }
        // previous.appendAfter(w.word() + w.after());
      } else {

        String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class));
        if (prepend.length() > 0) {
          prepend.append(before);
          w.set(CoreAnnotations.BeforeAnnotation.class, prepend.toString());
          prepend = new StringBuilder();
        }
        words.add(w);
        previous = w;
      }
    }

    List> sentences = wts.process(words);
    String after = "";
    IN last = null;
    for (List sentence : sentences) {
      int pos = 0;
      for (IN w : sentence) {
        w.set(CoreAnnotations.PositionAnnotation.class, Integer.toString(pos));
        after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class));
        w.remove(CoreAnnotations.AfterAnnotation.class);
        last = w;
      }
    }
    if (last != null) {
      last.set(CoreAnnotations.AfterAnnotation.class, after);
    }

    return sentences.iterator();
  }


  /**
   * Print the classifications for the document to the given Writer. This method
   * now checks the outputFormat property, and can print in
   * slashTags, inlineXML, xml (stand-Off XML), tsv, or a 3-column tabbed format
   * for easy entity retrieval. For both the XML output
   * formats, it preserves spacing, while for the other formats, it prints
   * tokenized (since preserveSpacing output is somewhat dysfunctional with these
   * formats, but you can control this by calling getAnswers()).
   *
   * @param list List of tokens with classifier answers
   * @param out Where to print the output to
   */
  @Override
  public void printAnswers(List list, PrintWriter out) {
    String style = null;
    if (flags != null) {
      style = flags.outputFormat;
    }
    if (style == null || style.isEmpty()) {
      style = "slashTags";
    }
    OutputStyle outputStyle = OutputStyle.fromShortName(style);
    printAnswers(list, out, outputStyle, OutputStyle.defaultToPreserveSpacing(style));
  }

  public String getAnswers(List l,
                           OutputStyle outputStyle, boolean preserveSpacing) {
    StringWriter sw = new StringWriter();
    PrintWriter pw = new PrintWriter(sw);
    printAnswers(l, pw, outputStyle, preserveSpacing);
    pw.flush();
    return sw.toString();
  }

  public void printAnswers(List l, PrintWriter out,
                           OutputStyle outputStyle, boolean preserveSpacing) {
    switch (outputStyle) {
    case SLASH_TAGS:
      if (preserveSpacing) {
        printAnswersAsIsText(l, out);
      } else {
        printAnswersTokenizedText(l, out);
      }
      break;
    case XML:
      if (preserveSpacing) {
        printAnswersXML(l, out);
      } else {
        printAnswersTokenizedXML(l, out);
      }
      break;
    case INLINE_XML:
      if (preserveSpacing) {
        printAnswersInlineXML(l, out);
      } else {
        printAnswersTokenizedInlineXML(l, out);
      }
      break;
      case TSV:
        if (preserveSpacing) {
          printAnswersAsIsTextTsv(l, out);
        } else {
          printAnswersTokenizedTextTsv(l, out);
        }
        break;
      case TABBED:
        if (preserveSpacing) {
          printAnswersAsIsTextTabbed(l, out);
        } else {
          printAnswersTokenizedTextTabbed(l, out);
        }
        break;
    default:
      throw new IllegalArgumentException(outputStyle +
                                         " is an unsupported OutputStyle");
    }
  }

  private static  void printAnswersTokenizedText(List l, PrintWriter out) {
    for (IN wi : l) {
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class)));
      out.print('/');
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.AnswerAnnotation.class)));
      out.print(' ');
    }
    out.println(); // put a single newline at the end [added 20091024].
  }

  private static  void printAnswersAsIsText(List l, PrintWriter out) {
    for (IN wi : l) {
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.BeforeAnnotation.class)));
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class)));
      out.print('/');
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.AnswerAnnotation.class)));
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.AfterAnnotation.class)));
    }
  }

  private static  void printAnswersTokenizedTextTsv(List l, PrintWriter out) {
    for (IN wi : l) {
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class)));
      out.print('\t');
      out.println(StringUtils.getNotNullString(wi.get(CoreAnnotations.AnswerAnnotation.class)));
    }
    out.println(); // put a single newline at the end [added 20091024].
  }

  private static  void printAnswersAsIsTextTsv(List l, PrintWriter out) {
    for (IN wi : l) {
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.BeforeAnnotation.class)));
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class)));
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.AfterAnnotation.class)));
      out.print('\t');
      out.println(StringUtils.getNotNullString(wi.get(CoreAnnotations.AnswerAnnotation.class)));
    }
  }

  private void printAnswersAsIsTextTabbed(List l, PrintWriter out) {
    final String background = flags.backgroundSymbol;
    String lastEntityType = null;
    for (IN wi : l) {
      String entityType = wi.get(CoreAnnotations.AnswerAnnotation.class);
      String token = StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class));
      String before = StringUtils.getNotNullString(wi.get(CoreAnnotations.BeforeAnnotation.class));
      String after = StringUtils.getNotNullString(wi.get(CoreAnnotations.AfterAnnotation.class));

      if (entityType.equals(lastEntityType)) {
        // continue the same entity in column 1 or 3
        out.print(before);
        out.print(token);
        out.print(after);
      } else {
        if (lastEntityType != null &&  ! background.equals(lastEntityType)) {
          // different entity type.  If previous not background/start, write in column 2
          out.print('\t');
          out.print(lastEntityType);
        }
        if (background.equals(entityType)) {
          // we'll print it in column 3. Normally, we're in column 2, unless we were at the start of doc
          if (lastEntityType == null) {
            out.print('\t');
          }
          out.print('\t');
        } else {
          // otherwise we're printing in column 1 again
          out.println();
        }
        out.print(before);
        out.print(token);
        out.print(after);
        lastEntityType = entityType;
      }
    }
    // if we're in the middle of printing an entity, then we should print its type
    if (lastEntityType != null && ! background.equals(lastEntityType)) {
      out.print('\t');
      out.print(lastEntityType);
    }
    // finish line then add blank line
    out.println();
    out.println();
  }

  private void printAnswersTokenizedTextTabbed(List l, PrintWriter out) {
    final String background = flags.backgroundSymbol;
    String lastEntityType = null;
    for (IN wi : l) {
      String entityType = wi.get(CoreAnnotations.AnswerAnnotation.class);
      String token = StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class));
      if (entityType.equals(lastEntityType)) {
        // continue the same entity in column 1 or 3
        out.print(' ');
        out.print(token);
      } else {
        if (lastEntityType != null && ! background.equals(lastEntityType)) {
          // different entity type.  If previous not background/start, write in column 2
          out.print('\t');
          out.print(lastEntityType);
        }
        if (background.equals(entityType)) {
          // we'll print it in column 3. Normally, we're in column 2, unless we were at the start of doc
          if (lastEntityType == null) {
            out.print('\t');
          }
          out.print('\t');
        } else {
          // otherwise we're printing in column 1 again
          out.println();
        }
        out.print(token);
        lastEntityType = entityType;
      }
    }
    // if we're in the middle of printing an entity, then we should print its type
    if (lastEntityType != null && ! background.equals(lastEntityType)) {
      out.print('\t');
      out.print(lastEntityType);
    }
    // finish line then add blank line
    out.println();
    out.println();
  }

  private static  void printAnswersXML(List doc, PrintWriter out) {
    int num = 0;
    for (IN wi : doc) {
      String prev = StringUtils.getNotNullString(wi.get(CoreAnnotations.BeforeAnnotation.class));
      out.print(prev);
      out.print("");
      out.print(XMLUtils.escapeXML(StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class))));
      out.print("");
      String after = StringUtils.getNotNullString(wi.get(CoreAnnotations.AfterAnnotation.class));
      out.print(after);
    }
  }

  private static  void printAnswersTokenizedXML(List doc, PrintWriter out) {
    int num = 0;
    for (IN wi : doc) {
      out.print("");
      out.print(XMLUtils.escapeXML(StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class))));
      out.println("");
    }
  }

  private void printAnswersInlineXML(List doc, PrintWriter out) {
    final String background = flags.backgroundSymbol;
    String prevTag = background;
    for (Iterator wordIter = doc.iterator(); wordIter.hasNext();) {
      IN wi = wordIter.next();
      String tag = StringUtils.getNotNullString(wi.get(CoreAnnotations.AnswerAnnotation.class));

      String before = StringUtils.getNotNullString(wi.get(CoreAnnotations.BeforeAnnotation.class));

      String current = StringUtils.getNotNullString(wi.get(CoreAnnotations.OriginalTextAnnotation.class));
      if (!tag.equals(prevTag)) {
        if (!prevTag.equals(background) && !tag.equals(background)) {
          out.print("');
          out.print(before);
          out.print('<');
          out.print(tag);
          out.print('>');
        } else if (!prevTag.equals(background)) {
          out.print("');
          out.print(before);
        } else if (!tag.equals(background)) {
          out.print(before);
          out.print('<');
          out.print(tag);
          out.print('>');
        }
      } else {
        out.print(before);
      }
      out.print(current);
      String afterWS = StringUtils.getNotNullString(wi.get(CoreAnnotations.AfterAnnotation.class));

      if (!tag.equals(background) && !wordIter.hasNext()) {
        out.print("');
        prevTag = background;
      } else {
        prevTag = tag;
      }
      out.print(afterWS);
    }
  }

  private void printAnswersTokenizedInlineXML(List doc, PrintWriter out) {
    final String background = flags.backgroundSymbol;
    String prevTag = background;
    boolean first = true;
    for (Iterator wordIter = doc.iterator(); wordIter.hasNext();) {
      IN wi = wordIter.next();
      String tag = StringUtils.getNotNullString(wi.get(CoreAnnotations.AnswerAnnotation.class));
      if (!tag.equals(prevTag)) {
        if (!prevTag.equals(background) && !tag.equals(background)) {
          out.print(" <");
          out.print(tag);
          out.print('>');
        } else if (!prevTag.equals(background)) {
          out.print(" ");
        } else if (!tag.equals(background)) {
          if (!first) {
            out.print(' ');
          }
          out.print('<');
          out.print(tag);
          out.print('>');
        }
      } else {
        if (!first) {
          out.print(' ');
        }
      }
      first = false;
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.OriginalTextAnnotation.class)));

      if (!wordIter.hasNext()) {
        if (!tag.equals(background)) {
          out.print("');
        }
        out.print(' ');
        prevTag = background;
      } else {
        prevTag = tag;
      }
    }
    out.println();
  }

}

/*
 * This is old stuff from a brief period when this DocumentReaderAndWriter tried
 * to handle treating SGML tags as part of white space, even though they were
 * returned as tokens by the tokenizer. If this is to be revived, it seems like
 * this handling should be moved down into the tokenizer.
 *
 * These first two class declarations used to be in CoreAnnotations. The rest
 * used to be in this class.
 *
 * public static class PrevSGMLAnnotation implements CoreAnnotation<String> {
 * public Class<String> getType() { return String.class; } }
 *
 * public static class AfterSGMLAnnotation implements CoreAnnotation<String> {
 * public Class<String> getType() { return String.class; } }
 *
 *
 *
 * public Iterator<List<IN>> getIterator(Reader r) { PTBTokenizer ptb =
 * PTBTokenizer.newPTBTokenizer(r, false, true); List firstSplit = new
 * ArrayList(); List d = new ArrayList();
 *
 * while (ptb.hasNext()) { IN w = ptb.next(); Matcher m =
 * sgml.matcher(w.word()); if (m.matches()) { if (d.size() > 0) {
 * firstSplit.add(d); d = new ArrayList(); } firstSplit.add(w); continue; }
 * d.add(w); } if (d.size() > 0) { firstSplit.add(d); }
 *
 * List secondSplit = new ArrayList(); for (Object o : firstSplit) { if (o
 * instanceof List) { secondSplit.addAll(wts.process((List) o)); } else {
 * secondSplit.add(o); } }
 *
 * String prevTags = ""; IN lastWord = null;
 *
 * List<List<IN>> documents = new ArrayList<List<IN>>();
 *
 * boolean first = true;
 *
 * for (Object o : secondSplit) { if (o instanceof List) { List doc = (List) o;
 * List document = new ArrayList(); int pos = 0; for (Iterator wordIter
 * = doc.iterator(); wordIter.hasNext(); pos++) { IN w = (IN) wordIter.next();
 * w.set(CoreAnnotations.PositionAnnotation.class, Integer.toString(pos)); if (first &&
 * prevTags.length() > 0) { w.set(PrevSGMLAnnotation.class, prevTags); } first =
 * false; lastWord = w; document.add(w); } documents.add(document); } else {
 * //String tag = ((Word) o).word(); IN word = (IN) o; String tag =
 * word.before() + word.current(); if (first) { log.info(word);
 * prevTags = tag; } else { String t =
 * lastWord.getString(AfterSGMLAnnotation.class); tag = t + tag;
 * lastWord.set(AfterSGMLAnnotation.class, tag); } } }
 *
 * // this is a hack to deal with the incorrect assumption in the above code
 * that // SGML only occurs between sentences and never inside of them. List
 * allWords = new ArrayList(); for (List doc : documents) {
 * allWords.addAll(doc); }
 *
 * List<List<IN>> documentsFinal = wts.process(allWords);
 * log.info(documentsFinal.get(0).get(0)); System.exit(0);
 *
 * return documentsFinal.iterator(); // return documents.iterator(); }
 *
 *
 * public void printAnswersInlineXML(List doc, PrintWriter out) { final
 * String background = flags.backgroundSymbol; String prevTag = background; for
 * (Iterator wordIter = doc.iterator(); wordIter.hasNext(); ) { IN wi =
 * wordIter.next(); String prev = wi.getString(PrevSGMLAnnotation.class);
 * out.print(prev); if (prev.length() > 0) { prevTag = background; } String tag
 * = wi.getString(CoreAnnotations.AnswerAnnotation.class); if ( ! tag.equals(prevTag)) { if ( !
 * prevTag.equals(background) && ! tag.equals(background)) { out.print("</"); out.print(prevTag); out.print('>');
 * out.print(wi.getString(CoreAnnotations.BeforeAnnotation.class)); out.print('<');
 * out.print(tag); out.print('>'); } else if ( ! prevTag.equals(background)) {
 * out.print("</"); out.print(prevTag); out.print('>');
 * out.print(wi.getString(CoreAnnotations.BeforeAnnotation.class)); } else if ( !
 * tag.equals(background)) { out.print(wi.getString(CoreAnnotations.BeforeAnnotation.class));
 * out.print('<'); out.print(tag); out.print('>'); } } else {
 * out.print(wi.getString(CoreAnnotations.BeforeAnnotation.class)); }
 * out.print(wi.getString(CoreAnnotations.OriginalTextAnnotation.class)); String after =
 * wi.getString(AfterSGMLAnnotation.class); String afterWS =
 * wi.getString(CoreAnnotations.AfterAnnotation.class);
 *
 * if ( ! tag.equals(background) && ( ! wordIter.hasNext() || after.length() >
 * 0)) { out.print("</"); out.print(tag); out.print('>'); prevTag = background;
 * } else { prevTag = tag; } out.print(afterWS); out.print(after); } }
 */