edu.stanford.nlp.trees.DiskTreebank Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.trees; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;
import java.util.*;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.HasIndex;

/**
 * A {@code DiskTreebank} is a {@code Collection} of {@code Tree}s.
 * A {@code DiskTreebank} object stores merely the information to
 * get at a corpus of trees that is stored on disk.  Access is usually
 * via apply()'ing a TreeVisitor to each Tree in the Treebank or by using
 * an iterator() to get an iteration over the Trees.
 * <p>
 * If the root Label of the Tree objects built by the TreeReader
 * implements HasIndex, then the filename and index of the tree in
 * a corpus will be inserted as they are read in.
 *
 * @author Christopher Manning
 * @author Spence Green
 */
public final class DiskTreebank extends Treebank {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(DiskTreebank.class);

  /**
   * When true, each file name is logged as the iterator opens it.
   * This flag is static, so it is shared by every DiskTreebank instance.
   */
  private static boolean PRINT_FILENAMES = false;

  /** Files/directories registered via loadPath(); parallel to {@link #fileFilters}. */
  private final List<File> filePaths = new ArrayList<>();
  /** Filter to apply to the path at the same index of {@link #filePaths}. */
  private final List<FileFilter> fileFilters = new ArrayList<>();

  /*
   * Absolute path of the file currently being read.
   */
  private String currentFilename; // = null;


  /**
   * Create a new DiskTreebank. The trees are made with a LabeledScoredTreeReaderFactory.
   */
  public DiskTreebank() {
    this(new LabeledScoredTreeReaderFactory());
  }

  /**
   * Create a new treebank, set the encoding for file access.
   *
   * @param encoding The charset encoding to use for treebank file decoding
   */
  public DiskTreebank(String encoding) {
    this(new LabeledScoredTreeReaderFactory(), encoding);
  }

  /**
   * Create a new DiskTreebank.
   *
   * @param trf the factory class to be called to create a new
   *            TreeReader
   */
  public DiskTreebank(TreeReaderFactory trf) {
    super(trf);
  }

  /**
   * Create a new DiskTreebank.
   *
   * @param trf      the factory class to be called to create a new
   *                 TreeReader
   * @param encoding The charset encoding to use for treebank file decoding
   */
  public DiskTreebank(TreeReaderFactory trf, String encoding) {
    super(trf, encoding);
  }

  /**
   * Create a new Treebank. The trees are made with a LabeledScoredTreeReaderFactory.
   *
   * @param initialCapacity The initial size of the underlying Collection.
   *                        For a DiskTreebank, this parameter is ignored.
   */
  public DiskTreebank(int initialCapacity) {
    this(initialCapacity, new LabeledScoredTreeReaderFactory());
  }

  /**
   * Create a new Treebank.
   *
   * @param initialCapacity The initial size of the underlying Collection,
   *                        For a DiskTreebank, this parameter is ignored.
   * @param trf             the factory class to be called to create a new
   *                        TreeReader
   */
  public DiskTreebank(int initialCapacity, TreeReaderFactory trf) {
    this(trf);
  }

  /**
   * Empty a Treebank. Only the recorded paths and filters are forgotten;
   * nothing on disk is touched.
   */
  @Override
  public void clear() {
    filePaths.clear();
    fileFilters.clear();
  }

  /**
   * Load trees from given directory.  This version just records
   * the paths to be processed, and actually processes them at iteration
   * (or apply) time. A path that does not exist is reported on stderr
   * and skipped.
   *
   * @param path file or directory to load from
   * @param filt a FileFilter of files to load
   */
  @Override
  public void loadPath(File path, FileFilter filt) {
    if (path.exists()) {
      filePaths.add(path);
      fileFilters.add(filt);
    } else {
      System.err.printf("%s: File/path %s does not exist. Skipping.%n" , this.getClass().getName(), path.getPath());
    }
  }

  /**
   * Applies the TreeVisitor to all trees in the Treebank.
   *
   * @param tp A class that can process trees.
   */
  @Override
  public void apply(final TreeVisitor tp) {
    for (Tree t : this) {
      tp.visitTree(t);
    }
  }

  /**
   * Returns the absolute path of the file currently being read.
   *
   * @return the absolute path of the file most recently opened by an
   *         iterator of this treebank, or null if no file has been opened yet
   */
  public String getCurrentFilename() {
    return currentFilename;
  }

  /**
   * Returns a read-only view of the paths registered so far.
   *
   * @return an unmodifiable view of the recorded files/directories
   */
  public List<File> getCurrentPaths() {
    return Collections.unmodifiableList(filePaths);
  }

  /**
   * Turns on logging of each file name as it is opened.
   * The underlying flag is static, so this affects all DiskTreebank instances.
   */
  public void printFileNames() {
    PRINT_FILENAMES = true;
  }

  /**
   * Iterator that reads trees from the registered paths, one file at a time.
   * One tree is always read ahead and held in {@code storedTree}.
   */
  private class DiskTreebankIterator implements Iterator<Tree> {

    private TreeReader tr = null;
    private Tree storedTree = null;  // null means iterator is exhausted (or not yet constructed)

    //Create local copies so that calls to loadPath() in the parent class
    //don't cause exceptions i.e., this iterator is valid over the state of DiskTreebank
    //when the iterator is created.
    private final List<File> localPathList;
    private final List<FileFilter> localFilterList;
    private int fileListPtr = 0;

    private File currentFile;
    private int curLineId = 1;

    private List<File> curFileList;
    private Iterator<File> curPathIter;

    private DiskTreebankIterator() {
      localPathList = new ArrayList<>(filePaths);
      localFilterList = new ArrayList<>(fileFilters);

      if (primeNextPath() && primeNextFile()) {
        storedTree = primeNextTree();
      }
    }

    /**
     * Advances to the next registered path that yields at least one plain file
     * and points {@code curPathIter} at its (sorted) files. Subdirectories found
     * while listing a directory are appended to the end of the work list with the
     * same filter, so they are visited after all earlier entries (a BFS through
     * the directory structure).
     *
     * @return true if a non-empty file list was primed, false if no paths remain
     */
    private boolean primeNextPath() {
      while (fileListPtr < localPathList.size() && fileListPtr < localFilterList.size()) {
        final File nextPath = localPathList.get(fileListPtr);
        final FileFilter nextFilter = localFilterList.get(fileListPtr);
        fileListPtr++;

        final List<File> pathListing;
        if (nextPath.isDirectory()) {
          // listFiles() returns null (not an empty array) when the directory
          // cannot be read; skip it instead of failing with a NullPointerException.
          final File[] children = nextPath.listFiles(nextFilter);
          if (children == null) {
            System.err.printf("%s: Unable to list directory %s. Skipping.%n", this.getClass().getName(), nextPath.getPath());
            continue;
          }
          Arrays.sort(children);
          pathListing = Arrays.asList(children);
        } else {
          pathListing = Collections.singletonList(nextPath);
        }

        curFileList = new ArrayList<>();
        for (File path : pathListing) {
          if (path.isDirectory()) {
            localPathList.add(path);
            localFilterList.add(nextFilter);
          } else {
            curFileList.add(path);
          }
        }

        if ( ! curFileList.isEmpty()) {
          curPathIter = curFileList.iterator();
          return true;
        }
      }

      return false;
    }

    /** Closes the current tree reader, if any, and forgets it. */
    private void closeCurrentReader() throws IOException {
      if (tr != null) {
        tr.close();
        tr = null;
      }
    }

    /**
     * Opens a TreeReader on the next file, closing the previous reader first,
     * and resets the per-file tree counter.
     *
     * @return true if a new file was opened, false if there are no more files
     * @throws RuntimeException wrapping any IOException raised while closing
     *         the old reader or opening the new file
     */
    private boolean primeNextFile() {
      try {
        if (curPathIter.hasNext() || (primeNextPath() && curPathIter.hasNext())) {
          currentFile = curPathIter.next();
          currentFilename = currentFile.getAbsolutePath();
          if (PRINT_FILENAMES) log.info(currentFile);

          closeCurrentReader();
          tr = treeReaderFactory().newTreeReader(IOUtils.readerFromFile(currentFile, encoding()));
          curLineId = 1;
          return true;
        }

      } catch (UnsupportedEncodingException e) {
        System.err.printf("%s: Filesystem does not support encoding:%n%s%n", this.getClass().getName(), e.toString());
        throw new RuntimeException(e);
      } catch (FileNotFoundException e) {
        System.err.printf("%s: File does not exist:%n%s%n", this.getClass().getName(),e.toString());
        throw new RuntimeException(e);
      } catch (IOException e) {
        System.err.printf("%s: Unable to close open tree reader:%n%s%n", this.getClass().getName(),currentFile.getPath());
        throw new RuntimeException(e);
      }
      return false;
    }

    /**
     * Reads the next tree, moving on to following files as needed.
     *
     * @return the next tree, or null once every file has been exhausted
     * @throws RuntimeException wrapping any IOException raised while reading
     */
    private Tree primeNextTree() {
      Tree t = null;

      try {
        t = tr.readTree();
        // Keep advancing while the current file is exhausted: a file that
        // contains no trees must not end the iteration while files remain.
        while (t == null && primeNextFile()) {
          t = tr.readTree();
        }

        if (t == null) {
          // Nothing left to read anywhere: release the last open reader.
          closeCurrentReader();
        } else if (t.label() instanceof HasIndex) {
          //Associate this tree with a file and line number
          HasIndex lab = (HasIndex) t.label();
          lab.setSentIndex(curLineId++);
          lab.setDocID(currentFile.getName());
        }

      } catch (IOException e) {
        System.err.printf("%s: Error reading from file %s:%n%s%n", this.getClass().getName(), currentFile.getPath(), e.toString());
        throw new RuntimeException(e);
      }

      return t;
    }

    /**
     * Returns true if the iteration has more elements.
     */
    @Override
    public boolean hasNext() { return storedTree != null; }

    /**
     * Returns the next element in the iteration.
     *
     * @throws NoSuchElementException if the iteration is exhausted
     */
    @Override
    public Tree next() {
      if (storedTree == null) {
        throw new NoSuchElementException();
      }

      Tree ret = storedTree;
      storedTree = primeNextTree();
      return ret;
    }

    /**
     * Not supported
     */
    @Override
    public void remove() { throw new UnsupportedOperationException(); }
  }


  /**
   * Return an Iterator over Trees in the Treebank.  Trees are read
   * from disk one at a time as the iterator advances. The iterator works on a
   * snapshot of the paths registered at the time of this call, so later
   * loadPath() calls do not affect it.
   */
  @Override
  public Iterator<Tree> iterator() {
    return new DiskTreebankIterator();
  }

}