edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.sequences;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.ErasureUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.XMLUtils;

import java.io.*;
import java.lang.reflect.Method;
import java.util.*;
import java.util.regex.*;

/**
 * This class provides methods for reading plain text documents and writing out
 * those documents once classified in several different formats.
 * The output formats are named: slashTags, xml, inlineXML, tsv, tabbedEntities.
 * 
 * Implementation note: see
 * itest/src/edu/stanford/nlp/ie/crf/CRFClassifierITest.java for examples and
 * test cases for the output options.
 *
 * This class works over a list of anything that extends {@link CoreMap}.
 * The usual case is {@link CoreLabel}.
 *
 * @author Jenny Finkel
 * @author Christopher Manning (new output options organization)
 * @author Sonal Gupta (made the class generic)
 */
public class PlainTextDocumentReaderAndWriter implements DocumentReaderAndWriter {

  private static final long serialVersionUID = -2420535144980273136L;

  public enum OutputStyle {
    SLASH_TAGS    ("slashTags"),
    XML           ("xml"),
    INLINE_XML    ("inlineXML"),
    TSV           ("tsv"),
    TABBED        ("tabbedEntities");

    private final String shortName;
    OutputStyle(String shortName) {
      this.shortName = shortName;
    }

    private static final Map shortNames = Generics.newHashMap();

    static {
      for (OutputStyle style : OutputStyle.values())
        shortNames.put(style.shortName, style);
    }

    /** Convert a String expressing an output format to its internal
     *  coding as an OutputStyle.
     *
     *  @param name The String name
     *  @return OutputStyle The internal constant
     */
    public static OutputStyle fromShortName(String name) {
      OutputStyle result = shortNames.get(name);
      if (result == null)
        throw new IllegalArgumentException(name + " is not an OutputStyle");
      return result;
    }

    public static boolean defaultToPreserveSpacing(String str) {
      return str.equals(XML.shortName) || str.equals(INLINE_XML.shortName);
    }

  } // end enum Output style

  private static final Pattern sgml = Pattern.compile("<[^>]*>");
  private final WordToSentenceProcessor wts =
          new WordToSentenceProcessor<>(WordToSentenceProcessor.NewlineIsSentenceBreak.ALWAYS);

  private SeqClassifierFlags flags; // = null;
  private TokenizerFactory tokenizerFactory;

  /**
   * Construct a PlainTextDocumentReaderAndWriter. You should call init() after
   * using the constructor.
   */
  public PlainTextDocumentReaderAndWriter() {
  }

  @Override
  public void init(SeqClassifierFlags flags) {
    String options = "tokenizeNLs=false,invertible=true";
    if (flags.tokenizerOptions != null) {
      options = options + ',' + flags.tokenizerOptions;
    }
    TokenizerFactory factory;
    if (flags.tokenizerFactory != null) {
      try {
        Class> clazz = ErasureUtils.uncheckedCast(Class.forName(flags.tokenizerFactory));
        Method factoryMethod = clazz.getMethod("newCoreLabelTokenizerFactory", String.class);
        factory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, options));
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    } else {
      factory = ErasureUtils.uncheckedCast(PTBTokenizer.PTBTokenizerFactory.newCoreLabelTokenizerFactory(options));
    }
    init(flags, factory);
  }

  public void init(SeqClassifierFlags flags, TokenizerFactory tokenizerFactory) {
    this.flags = flags;
    this.tokenizerFactory = tokenizerFactory;
  }

  // todo: give options for document splitting. A line or the whole file or sentence splitting as now
  @Override
  public Iterator> getIterator(Reader r) {
    Tokenizer tokenizer = tokenizerFactory.getTokenizer(r);
    // PTBTokenizer.newPTBTokenizer(r, false, true);
    List words = new ArrayList<>();
    IN previous = null;
    StringBuilder prepend = new StringBuilder();

    /*
     * This changes SGML tags into whitespace -- it should maybe be moved
     * elsewhere
     */
    while (tokenizer.hasNext()) {
      IN w = tokenizer.next();
      String word = w.get(CoreAnnotations.TextAnnotation.class);
      Matcher m = sgml.matcher(word);
      if (m.matches()) {

        String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class));
        String after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class));
        prepend.append(before).append(word);
        if (previous != null) {
          String previousTokenAfter = StringUtils.getNotNullString(previous.get(CoreAnnotations.AfterAnnotation.class));
          previous.set(CoreAnnotations.AfterAnnotation.class, previousTokenAfter + word + after);
        }
        // previous.appendAfter(w.word() + w.after());
      } else {

        String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class));
        if (prepend.length() > 0) {
          prepend.append(before);
          w.set(CoreAnnotations.BeforeAnnotation.class, prepend.toString());
          prepend = new StringBuilder();
        }
        words.add(w);
        previous = w;
      }
    }

    List> sentences = wts.process(words);
    String after = "";
    IN last = null;
    for (List sentence : sentences) {
      int pos = 0;
      for (IN w : sentence) {
        w.set(CoreAnnotations.PositionAnnotation.class, Integer.toString(pos));
        after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class));
        w.remove(CoreAnnotations.AfterAnnotation.class);
        last = w;
      }
    }
    if (last != null) {
      last.set(CoreAnnotations.AfterAnnotation.class, after);
    }

    return sentences.iterator();
  }


  /**
   * Print the classifications for the document to the given Writer. This method
   * now checks the outputFormat property, and can print in
   * slashTags, inlineXML, xml (stand-Off XML), tsv, or a 3-column tabbed format
   * for easy entity retrieval. For both the XML output
   * formats, it preserves spacing, while for the other formats, it prints
   * tokenized (since preserveSpacing output is somewhat dysfunctional with these
   * formats, but you can control this by calling getAnswers()).
   *
   * @param list List of tokens with classifier answers
   * @param out Where to print the output to
   */
  @Override
  public void printAnswers(List list, PrintWriter out) {
    String style = null;
    if (flags != null) {
      style = flags.outputFormat;
    }
    if (style == null || style.isEmpty()) {
      style = "slashTags";
    }
    OutputStyle outputStyle = OutputStyle.fromShortName(style);
    printAnswers(list, out, outputStyle, OutputStyle.defaultToPreserveSpacing(style));
  }

  public String getAnswers(List l,
                           OutputStyle outputStyle, boolean preserveSpacing) {
    StringWriter sw = new StringWriter();
    PrintWriter pw = new PrintWriter(sw);
    printAnswers(l, pw, outputStyle, preserveSpacing);
    pw.flush();
    return sw.toString();
  }

  public void printAnswers(List l, PrintWriter out,
                           OutputStyle outputStyle, boolean preserveSpacing) {
    switch (outputStyle) {
    case SLASH_TAGS:
      if (preserveSpacing) {
        printAnswersAsIsText(l, out);
      } else {
        printAnswersTokenizedText(l, out);
      }
      break;
    case XML:
      if (preserveSpacing) {
        printAnswersXML(l, out);
      } else {
        printAnswersTokenizedXML(l, out);
      }
      break;
    case INLINE_XML:
      if (preserveSpacing) {
        printAnswersInlineXML(l, out);
      } else {
        printAnswersTokenizedInlineXML(l, out);
      }
      break;
      case TSV:
        if (preserveSpacing) {
          printAnswersAsIsTextTsv(l, out);
        } else {
          printAnswersTokenizedTextTsv(l, out);
        }
        break;
      case TABBED:
        if (preserveSpacing) {
          printAnswersAsIsTextTabbed(l, out);
        } else {
          printAnswersTokenizedTextTabbed(l, out);
        }
        break;
    default:
      throw new IllegalArgumentException(outputStyle +
                                         " is an unsupported OutputStyle");
    }
  }

  private static  void printAnswersTokenizedText(List l, PrintWriter out) {
    for (IN wi : l) {
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class)));
      out.print('/');
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.AnswerAnnotation.class)));
      out.print(' ');
    }
    out.println(); // put a single newline at the end [added 20091024].
  }

  private static  void printAnswersAsIsText(List l, PrintWriter out) {
    for (IN wi : l) {
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.BeforeAnnotation.class)));
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class)));
      out.print('/');
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.AnswerAnnotation.class)));
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.AfterAnnotation.class)));
    }
  }

  private static  void printAnswersTokenizedTextTsv(List l, PrintWriter out) {
    for (IN wi : l) {
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class)));
      out.print('\t');
      out.println(StringUtils.getNotNullString(wi.get(CoreAnnotations.AnswerAnnotation.class)));
    }
    out.println(); // put a single newline at the end [added 20091024].
  }

  private static  void printAnswersAsIsTextTsv(List l, PrintWriter out) {
    for (IN wi : l) {
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.BeforeAnnotation.class)));
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class)));
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.AfterAnnotation.class)));
      out.print('\t');
      out.println(StringUtils.getNotNullString(wi.get(CoreAnnotations.AnswerAnnotation.class)));
    }
  }

  private void printAnswersAsIsTextTabbed(List l, PrintWriter out) {
    final String background = flags.backgroundSymbol;
    String lastEntityType = null;
    for (IN wi : l) {
      String entityType = wi.get(CoreAnnotations.AnswerAnnotation.class);
      String token = StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class));
      String before = StringUtils.getNotNullString(wi.get(CoreAnnotations.BeforeAnnotation.class));
      String after = StringUtils.getNotNullString(wi.get(CoreAnnotations.AfterAnnotation.class));

      if (entityType.equals(lastEntityType)) {
        // continue the same entity in column 1 or 3
        out.print(before);
        out.print(token);
        out.print(after);
      } else {
        if (lastEntityType != null &&  ! background.equals(lastEntityType)) {
          // different entity type.  If previous not background/start, write in column 2
          out.print('\t');
          out.print(lastEntityType);
        }
        if (background.equals(entityType)) {
          // we'll print it in column 3. Normally, we're in column 2, unless we were at the start of doc
          if (lastEntityType == null) {
            out.print('\t');
          }
          out.print('\t');
        } else {
          // otherwise we're printing in column 1 again
          out.println();
        }
        out.print(before);
        out.print(token);
        out.print(after);
        lastEntityType = entityType;
      }
    }
    // if we're in the middle of printing an entity, then we should print its type
    if (lastEntityType != null && ! background.equals(lastEntityType)) {
      out.print('\t');
      out.print(lastEntityType);
    }
    // finish line then add blank line
    out.println();
    out.println();
  }

  private void printAnswersTokenizedTextTabbed(List l, PrintWriter out) {
    final String background = flags.backgroundSymbol;
    String lastEntityType = null;
    for (IN wi : l) {
      String entityType = wi.get(CoreAnnotations.AnswerAnnotation.class);
      String token = StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class));
      if (entityType.equals(lastEntityType)) {
        // continue the same entity in column 1 or 3
        out.print(' ');
        out.print(token);
      } else {
        if (lastEntityType != null && ! background.equals(lastEntityType)) {
          // different entity type.  If previous not background/start, write in column 2
          out.print('\t');
          out.print(lastEntityType);
        }
        if (background.equals(entityType)) {
          // we'll print it in column 3. Normally, we're in column 2, unless we were at the start of doc
          if (lastEntityType == null) {
            out.print('\t');
          }
          out.print('\t');
        } else {
          // otherwise we're printing in column 1 again
          out.println();
        }
        out.print(token);
        lastEntityType = entityType;
      }
    }
    // if we're in the middle of printing an entity, then we should print its type
    if (lastEntityType != null && ! background.equals(lastEntityType)) {
      out.print('\t');
      out.print(lastEntityType);
    }
    // finish line then add blank line
    out.println();
    out.println();
  }

  private static  void printAnswersXML(List doc, PrintWriter out) {
    int num = 0;
    for (IN wi : doc) {
      String prev = StringUtils.getNotNullString(wi.get(CoreAnnotations.BeforeAnnotation.class));
      out.print(prev);
      out.print("");
      out.print(XMLUtils.escapeXML(StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class))));
      out.print("");
      String after = StringUtils.getNotNullString(wi.get(CoreAnnotations.AfterAnnotation.class));
      out.print(after);
    }
  }

  private static  void printAnswersTokenizedXML(List doc, PrintWriter out) {
    int num = 0;
    for (IN wi : doc) {
      out.print("");
      out.print(XMLUtils.escapeXML(StringUtils.getNotNullString(wi.get(CoreAnnotations.TextAnnotation.class))));
      out.println("");
    }
  }

  private void printAnswersInlineXML(List doc, PrintWriter out) {
    final String background = flags.backgroundSymbol;
    String prevTag = background;
    for (Iterator wordIter = doc.iterator(); wordIter.hasNext();) {
      IN wi = wordIter.next();
      String tag = StringUtils.getNotNullString(wi.get(CoreAnnotations.AnswerAnnotation.class));

      String before = StringUtils.getNotNullString(wi.get(CoreAnnotations.BeforeAnnotation.class));

      String current = StringUtils.getNotNullString(wi.get(CoreAnnotations.OriginalTextAnnotation.class));
      if (!tag.equals(prevTag)) {
        if (!prevTag.equals(background) && !tag.equals(background)) {
          out.print("');
          out.print(before);
          out.print('<');
          out.print(tag);
          out.print('>');
        } else if (!prevTag.equals(background)) {
          out.print("');
          out.print(before);
        } else if (!tag.equals(background)) {
          out.print(before);
          out.print('<');
          out.print(tag);
          out.print('>');
        }
      } else {
        out.print(before);
      }
      out.print(current);
      String afterWS = StringUtils.getNotNullString(wi.get(CoreAnnotations.AfterAnnotation.class));

      if (!tag.equals(background) && !wordIter.hasNext()) {
        out.print("');
        prevTag = background;
      } else {
        prevTag = tag;
      }
      out.print(afterWS);
    }
  }

  private void printAnswersTokenizedInlineXML(List doc, PrintWriter out) {
    final String background = flags.backgroundSymbol;
    String prevTag = background;
    boolean first = true;
    for (Iterator wordIter = doc.iterator(); wordIter.hasNext();) {
      IN wi = wordIter.next();
      String tag = StringUtils.getNotNullString(wi.get(CoreAnnotations.AnswerAnnotation.class));
      if (!tag.equals(prevTag)) {
        if (!prevTag.equals(background) && !tag.equals(background)) {
          out.print(" <");
          out.print(tag);
          out.print('>');
        } else if (!prevTag.equals(background)) {
          out.print(" ");
        } else if (!tag.equals(background)) {
          if (!first) {
            out.print(' ');
          }
          out.print('<');
          out.print(tag);
          out.print('>');
        }
      } else {
        if (!first) {
          out.print(' ');
        }
      }
      first = false;
      out.print(StringUtils.getNotNullString(wi.get(CoreAnnotations.OriginalTextAnnotation.class)));

      if (!wordIter.hasNext()) {
        if (!tag.equals(background)) {
          out.print("');
        }
        out.print(' ');
        prevTag = background;
      } else {
        prevTag = tag;
      }
    }
    out.println();
  }

}

/*
 * This is old stuff from a brief period when this DocumentReaderAndWriter tried
 * to handle treating SGML tags as part of white space, even though they were
 * returned as tokens by the tokenizer. If this is to be revived, it seems like
 * this handling should be moved down into the tokenizer.
 *
 * These first two class declarations used to be in CoreAnnotations. The rest
 * used to be in this class.
 *
 * public static class PrevSGMLAnnotation implements CoreAnnotation<String> {
 * public Class<String> getType() { return String.class; } }
 *
 * public static class AfterSGMLAnnotation implements CoreAnnotation<String> {
 * public Class<String> getType() { return String.class; } }
 *
 *
 *
 * public Iterator<List<IN>> getIterator(Reader r) { PTBTokenizer ptb =
 * PTBTokenizer.newPTBTokenizer(r, false, true); List firstSplit = new
 * ArrayList(); List d = new ArrayList();
 *
 * while (ptb.hasNext()) { IN w = ptb.next(); Matcher m =
 * sgml.matcher(w.word()); if (m.matches()) { if (d.size() > 0) {
 * firstSplit.add(d); d = new ArrayList(); } firstSplit.add(w); continue; }
 * d.add(w); } if (d.size() > 0) { firstSplit.add(d); }
 *
 * List secondSplit = new ArrayList(); for (Object o : firstSplit) { if (o
 * instanceof List) { secondSplit.addAll(wts.process((List) o)); } else {
 * secondSplit.add(o); } }
 *
 * String prevTags = ""; IN lastWord = null;
 *
 * List<List<IN>> documents = new ArrayList<List<IN>>();
 *
 * boolean first = true;
 *
 * for (Object o : secondSplit) { if (o instanceof List) { List doc = (List) o;
 * List document = new ArrayList(); int pos = 0; for (Iterator wordIter
 * = doc.iterator(); wordIter.hasNext(); pos++) { IN w = (IN) wordIter.next();
 * w.set(CoreAnnotations.PositionAnnotation.class, Integer.toString(pos)); if (first &&
 * prevTags.length() > 0) { w.set(PrevSGMLAnnotation.class, prevTags); } first =
 * false; lastWord = w; document.add(w); } documents.add(document); } else {
 * //String tag = ((Word) o).word(); IN word = (IN) o; String tag =
 * word.before() + word.current(); if (first) { System.err.println(word);
 * prevTags = tag; } else { String t =
 * lastWord.getString(AfterSGMLAnnotation.class); tag = t + tag;
 * lastWord.set(AfterSGMLAnnotation.class, tag); } } }
 *
 * // this is a hack to deal with the incorrect assumption in the above code
 * that // SGML only occurs between sentences and never inside of them. List
 * allWords = new ArrayList(); for (List doc : documents) {
 * allWords.addAll(doc); }
 *
 * List<List<IN>> documentsFinal = wts.process(allWords);
 * System.err.println(documentsFinal.get(0).get(0)); System.exit(0);
 *
 * return documentsFinal.iterator(); // return documents.iterator(); }
 *
 *
 * public void printAnswersInlineXML(List doc, PrintWriter out) { final
 * String background = flags.backgroundSymbol; String prevTag = background; for
 * (Iterator wordIter = doc.iterator(); wordIter.hasNext(); ) { IN wi =
 * wordIter.next(); String prev = wi.getString(PrevSGMLAnnotation.class);
 * out.print(prev); if (prev.length() > 0) { prevTag = background; } String tag
 * = wi.getString(CoreAnnotations.AnswerAnnotation.class); if ( ! tag.equals(prevTag)) { if ( !
 * prevTag.equals(background) && ! tag.equals(background)) { out.print("</");
 * out.print(prevTag); out.print('>');
 * out.print(wi.getString(CoreAnnotations.BeforeAnnotation.class)); out.print('<');
 * out.print(tag); out.print('>'); } else if ( ! prevTag.equals(background)) {
 * out.print("</"); out.print(prevTag); out.print('>');
 * out.print(wi.getString(CoreAnnotations.BeforeAnnotation.class)); } else if ( !
 * tag.equals(background)) { out.print(wi.getString(CoreAnnotations.BeforeAnnotation.class));
 * out.print('<'); out.print(tag); out.print('>'); } } else {
 * out.print(wi.getString(CoreAnnotations.BeforeAnnotation.class)); }
 * out.print(wi.getString(CoreAnnotations.OriginalTextAnnotation.class)); String after =
 * wi.getString(AfterSGMLAnnotation.class); String afterWS =
 * wi.getString(CoreAnnotations.AfterAnnotation.class);
 *
 * if ( ! tag.equals(background) && ( ! wordIter.hasNext() || after.length() >
 * 0)) { out.print("</"); out.print(tag); out.print('>'); prevTag = background;
 * } else { prevTag = tag; } out.print(afterWS); out.print(after); } }
 */