
edu.stanford.nlp.patterns.surface.AnnotatedTextReader Maven / Gradle / Ivy


Stanford CoreNLP provides a set of natural language analysis tools that can take raw English-language text and give the base forms of words, their parts of speech, and whether they are names of companies, people, etc.; normalize dates, times, and numeric quantities; mark up the structure of sentences in terms of phrases and word dependencies; and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher-level text understanding applications.
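
For orientation, here is a minimal sketch of driving that pipeline. It is illustrative only: the annotator list, example sentence, and class name are not part of this artifact, though the annotators and annotation keys shown are standard CoreNLP API.

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class PipelineSketch {
  public static void main(String[] args) {
    // Tokenize, split sentences, POS-tag, lemmatize, and NER-tag raw text.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation doc = new Annotation("John Smith visited Paris in June 2014.");
    pipeline.annotate(doc);

    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        System.out.printf("%-8s %-5s %s%n", token.word(),
            token.get(CoreAnnotations.PartOfSpeechAnnotation.class),
            token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
      }
    }
  }
}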

package edu.stanford.nlp.patterns.surface;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.patterns.DataInstance;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
import edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.TypesafeMap;

/**
 * CanNOT handle overlapping labeled text (that is, one token cannot belong to
 * multiple labels)! Note that there have to be spaces around the tags for the
 * reader to work correctly!
 * 
 * @author Sonal Gupta ([email protected])
 * 
 */
public class AnnotatedTextReader {

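    /**
     * Illustrative note: the method below consumes CoNLL-style column input,
     * one token per line with its answer label in the answer column (the
     * SeqClassifierFlags default column mapping is word=0,tag=1,answer=2),
     * and documents separated by -DOCSTART- or boundary lines. For example:
     *
     *   John     NNP  PERSON
     *   visited  VBD  O
     *   Paris    NNP  LOCATION
     */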
    public static Map<String, DataInstance> parseColumnFile(BufferedReader reader,
                                                Set<String> categoriesAllowed,
                                                Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels,
                                                boolean setGoldClass, String sentIDprefix) {

      CoNLLDocumentReaderAndWriter conllreader = new CoNLLDocumentReaderAndWriter();
      Properties props = new Properties();
      SeqClassifierFlags flags = new SeqClassifierFlags(props);
      flags.entitySubclassification = "noprefix";
      flags.retainEntitySubclassification = false;
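      // With entitySubclassification = "noprefix", the CoNLL reader normalizes
      // IOB-style answer labels by stripping their prefixes (e.g. B-PER and
      // I-PER are both read as PER) rather than retaining the subclassified form.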
      conllreader.init(flags);

      Iterator<List<CoreLabel>> dociter = conllreader.getIterator(reader);
      int num = -1;
      Map<String, DataInstance> sents = new HashMap<>();
      while(dociter.hasNext()){

        List<CoreLabel> doc = dociter.next();

        List<String> words = new ArrayList<>();
        List<CoreLabel> sentcore = new ArrayList<>();


        int tokenindex = 0;
        for(CoreLabel l: doc){

          if(l.word().equals(CoNLLDocumentReaderAndWriter.BOUNDARY) || l.word().equals("-DOCSTART-")){
            if(words.size() > 0){
              num++;
              String docid = sentIDprefix + "-" + num;
              DataInstance sentInst = DataInstance.getNewSurfaceInstance(sentcore);
              sents.put(docid, sentInst);

              words = new ArrayList<>();
              sentcore = new ArrayList<>();
              tokenindex = 0;
            }
            continue;
          }
          tokenindex++;
          words.add(l.word());

          l.set(CoreAnnotations.IndexAnnotation.class, tokenindex);
          l.set(CoreAnnotations.ValueAnnotation.class, l.word());
          String label = l.get(CoreAnnotations.AnswerAnnotation.class);

          assert label != null : "label cannot be null";

          l.set(CoreAnnotations.TextAnnotation.class, l.word());
          l.set(CoreAnnotations.OriginalTextAnnotation.class, l.word());

          if (setGoldClass){
            l.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
          }

          if (setClassForTheseLabels != null
            && setClassForTheseLabels.containsKey(label))
            l.set(setClassForTheseLabels.get(label), label);

          sentcore.add(l);

        }

        if(words.size() > 0){
          num++;
          String docid = sentIDprefix + "-" + num;
          DataInstance sentInst = DataInstance.getNewSurfaceInstance(sentcore);
          sents.put(docid, sentInst);
        }
      }
      return sents;

    }

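  /**
   * Illustrative note: the method below expects plain text in which labeled
   * spans are wrapped in space-separated inline tags drawn from
   * categoriesAllowed, e.g. "I met <PERSON> John Smith </PERSON> ." Each line
   * may optionally start with a sentence id followed by a tab.
   */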
  public static List<CoreMap> parseFile(
      BufferedReader reader,
      Set<String> categoriesAllowed,
      Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels,
      boolean setGoldClass, String sentIDprefix)
      throws IOException {

    Pattern startingLabelToken = Pattern.compile("<("
        + StringUtils.join(categoriesAllowed, "|") + ")>");
    Pattern endLabelToken = Pattern.compile("</("
        + StringUtils.join(categoriesAllowed, "|") + ")>");
    String backgroundSymbol = "O";

    List<CoreMap> sentences = new ArrayList<>();
    int lineNum = -1;
    String l = null;

    while ((l = reader.readLine()) != null) {
      lineNum++;
      String[] t = l.split("\t", 2);
      String id = null;
      String text = null;
      if (t.length == 2) {
        id = t[0];
        text = t[1];
      } else if (t.length == 1) {
        text = t[0];
        id = String.valueOf(lineNum);
      }
      id = sentIDprefix + id;
      DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
      PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizerFactory
          .newCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
      dp.setTokenizerFactory(tokenizerFactory);

      String label = backgroundSymbol;
      int sentNum = -1;

      for (List<HasWord> sentence : dp) {
        sentNum++;
        String sentStr = "";
        List<CoreLabel> sent = new ArrayList<>();
        for (HasWord tokw : sentence) {
          String tok = tokw.word();
          Matcher startingMatcher = startingLabelToken.matcher(tok);
          Matcher endMatcher = endLabelToken.matcher(tok);
          if (startingMatcher.matches()) {
            //System.out.println("matched starting");
            label = startingMatcher.group(1);
          } else if (endMatcher.matches()) {
            //System.out.println("matched end");
            label = backgroundSymbol;
          } else {

            CoreLabel c = new CoreLabel();

            List<String> toks = new ArrayList<>();

            toks.add(tok);

            for (String toksplit : toks) {

              sentStr += " " + toksplit;

              c.setWord(toksplit);
              c.setLemma(toksplit);
              c.setValue(toksplit);
              c.set(CoreAnnotations.TextAnnotation.class, toksplit);
              c.set(CoreAnnotations.OriginalTextAnnotation.class, tok);

              if (setGoldClass) {
                c.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
              }
              
              if (setClassForTheseLabels != null
                  && setClassForTheseLabels.containsKey(label))
                c.set(setClassForTheseLabels.get(label), label);

              sent.add(c);
            }
          }
        }
        CoreMap sentcm = new ArrayCoreMap();
        sentcm.set(CoreAnnotations.TextAnnotation.class, sentStr.trim());
        sentcm.set(CoreAnnotations.TokensAnnotation.class, sent);
        sentcm.set(CoreAnnotations.DocIDAnnotation.class, id + "-" + sentNum);
        sentences.add(sentcm);
      }
    }
    return sentences;
  }
}
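
A minimal, hypothetical usage sketch for the reader above (the file name, label set, and sentence-ID prefix are invented; the parseFile signature is the one defined in this class). Note the spaces around the inline tags, as required by the class Javadoc:

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.patterns.surface.AnnotatedTextReader;
import edu.stanford.nlp.util.CoreMap;

public class AnnotatedTextReaderDemo {
  public static void main(String[] args) throws Exception {
    // Hypothetical input file; each line is optionally "id<TAB>text", with
    // labeled spans wrapped in space-separated tags, e.g.:
    //   s1	I met <PERSON> John Smith </PERSON> in <LOCATION> Paris </LOCATION> .
    Set<String> categories = new HashSet<>(Arrays.asList("PERSON", "LOCATION"));
    try (BufferedReader r = new BufferedReader(new FileReader("tagged.txt"))) {
      List<CoreMap> sents = AnnotatedTextReader.parseFile(r, categories, null, true, "demo-");
      for (CoreMap s : sents) {
        System.out.println(s.get(CoreAnnotations.DocIDAnnotation.class) + "\t"
            + s.get(CoreAnnotations.TextAnnotation.class));
      }
    }
  }
}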



