All downloads are free. The search and download features use the official Maven repository.

edu.stanford.nlp.parser.charniak.CharniakScoredParsesReaderWriter Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.parser.charniak;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.StringOutputStream;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Trees;
import edu.stanford.nlp.util.AbstractIterator;
import edu.stanford.nlp.util.IterableIterator;
import edu.stanford.nlp.util.ScoredObject;
import edu.stanford.nlp.util.Timing;

/**
 * Utility routines for printing/reading scored parses for the Charniak Parser
 *
 * @author Angel Chang
 */
public class CharniakScoredParsesReaderWriter {
  private final static Logger logger = Logger.getLogger(CharniakScoredParsesReaderWriter.class.getName());

  private final static Pattern wsDelimiter = Pattern.compile("\\s+");

  /**
   * Reads scored parses from the charniak parser
   *
   * File format of the scored parses
   * <# of parses>\t
   * 
   * 
   * 
   * 
   * ...
   *
   * @param filename  - File to read parses from
   * @return iterable with list of scored parse trees
   */
  public Iterable>> readScoredTrees(String filename)
  {
    try {
      ScoredParsesIterator iter = new ScoredParsesIterator(filename);
      return new IterableIterator<>(iter);
    } catch (IOException ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * Reads scored parses from the charniak parser
   *
   * @param inputDesc - Description of input used in log messages
   * @param br - input reader
   * @return iterable with list of scored parse trees
   */
  public Iterable>> readScoredTrees(String inputDesc, BufferedReader br)
  {
    ScoredParsesIterator iter = new ScoredParsesIterator(inputDesc, br);
    return new IterableIterator<>(iter);
  }

  /**
   * Convert string representing scored parses (in the charniak parser output format)
   *   to list of scored parse trees
   * @param parseStr
   * @return list of scored parse trees
   */
  public List> stringToParses(String parseStr)
  {
    try {
      BufferedReader br = new BufferedReader(new StringReader(parseStr));
      Iterable>> trees = readScoredTrees("", br);
      List> res = null;
      if (trees != null) {
        Iterator>> iter = trees.iterator();
        if (iter != null && iter.hasNext()) {
          res = iter.next();
        }
      }
      br.close();
      return res;
    } catch (IOException ex) {
      throw new RuntimeException(ex);
    }
  }


  /**
   * Convert list of scored parse trees to string representing scored parses
   *   (in the charniak parser output format)
   * @param parses - list of scored parse trees
   * @return string representing scored parses
   */
  public String parsesToString(List> parses)
  {
    if (parses == null) return null;
    StringOutputStream os = new StringOutputStream();
    PrintWriter pw = new PrintWriter(os);
    printScoredTrees(pw, 0, parses);
    pw.close();
    return os.toString();
  }

  /**
   * Print scored parse trees in format used by charniak parser
   * @param trees - trees to output
   * @param filename - file to output to
   */
  public void printScoredTrees(Iterable>> trees, String filename)
  {
    try {
      PrintWriter pw = IOUtils.getPrintWriter(filename);
      int i = 0;
      for (List> treeList:trees) {
        printScoredTrees(pw, i, treeList);
        i++;
      }
      pw.close();
    } catch (IOException ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * Print scored parse trees for one sentence in format used by charniak parser
   * @param pw - printwriter
   * @param id - sentence id
   * @param trees - trees to output
   */
  public void printScoredTrees(PrintWriter pw, int id, List> trees)
  {
    pw.println(trees.size() + "\t" + id);
    for (ScoredObject scoredTree:trees) {
      pw.println(scoredTree.score());
      pw.println(scoredTree.object());
    }
  }

  private static class ScoredParsesIterator extends AbstractIterator>>
  {
    String inputDesc;
    BufferedReader br;
    List> next;
    Timing timing;
    int processed = 0;
    boolean done = false;
    boolean closeBufferNeeded = true;
    boolean expectConsecutiveSentenceIds = true;
    int lastSentenceId = -1;

    private ScoredParsesIterator(String filename) throws IOException {
      this(filename, IOUtils.getBufferedFileReader(filename));
    }


    private ScoredParsesIterator(String inputDesc, BufferedReader br) {
      this.inputDesc = inputDesc;
      this.br = br;
      logger.info("Reading cached parses from " + inputDesc);
      timing = new Timing();
      timing.start();
      next = getNext();
      done = next == null;
    }

    private List> getNext()
    {
      try {
        String line;
        int parsesExpected = 0;
        int sentenceId = lastSentenceId;
        ScoredObject curParse = null;
        Double score = null;
        List> curParses = null;
        while ((line = br.readLine()) != null) {
          line = line.trim();
          if (line.length() > 0) {
            if (parsesExpected == 0) {
              // Finished processing parses
              String[] fields = wsDelimiter.split(line, 2);
              parsesExpected = Integer.parseInt(fields[0]);
              sentenceId = Integer.parseInt(fields[1]);
              if (expectConsecutiveSentenceIds) {
               if (sentenceId != lastSentenceId+1) {
                  if (lastSentenceId < sentenceId) {
                    StringBuilder sb = new StringBuilder("Missing sentences");
                    for (int i = lastSentenceId+1; i < sentenceId; i++) {
                      sb.append(" ").append(i);
                    }
                    logger.warning(sb.toString());
                  } else {
                    logger.warning("sentenceIds are not increasing (last="
                          + lastSentenceId + ", curr=" + sentenceId + ")");
                  }
                }
              }
              lastSentenceId = sentenceId;
              curParses = new ArrayList<>(parsesExpected);
            } else {
              if (score == null) {
                // read score
                score = Double.parseDouble(wsDelimiter.split(line, 2)[0]);
              } else {
                // Reading a parse
                curParse = new ScoredObject<>(Trees.readTree(line), score);
                curParses.add(curParse);
                curParse = null;
                score = null;
                parsesExpected--;
                if (parsesExpected == 0) {
                  return curParses;
                }
              }
            }
          }
        }
      } catch (IOException ex) {
        throw new RuntimeException(ex);
      }
      return null;
    }

    public boolean hasNext()
    {
      return !done;
    }

    public List> next()
    {
      if (!done) {
        List> cur = next;
        next = getNext();
        processed++;
        if (next == null) {
          logger.finer("Read " + processed + " trees, from "
                  + inputDesc + " in " + timing.toSecondsString() + " secs");
          done = true;
          if (closeBufferNeeded) {
            try { br.close();  } catch (IOException ex) {};
          }
        }
        return cur;
      } else {
        throw new NoSuchElementException("No more elements from " + inputDesc);
      }
    }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy