All downloads are free. Search and download functionality uses the official Maven repository.

edu.berkeley.nlp.io.PennTreebankReader Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
package edu.berkeley.nlp.io;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.AbstractCollection;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;

import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.syntax.Trees;
import edu.berkeley.nlp.util.ConcatenationIterator;

/**
 * @author Dan Klein
 */
public class PennTreebankReader {

  static class TreeCollection extends AbstractCollection> {

    List files;
    Charset charset;
    
    static class TreeIteratorIterator implements Iterator>> {
      Iterator fileIterator;
      Iterator> nextTreeIterator;
      Charset charset;
      BufferedReader currentFileReader, lastReader, readerToClose;

      public boolean hasNext() {
        return nextTreeIterator != null;
      }

      public Iterator> next() {
        Iterator> currentTreeIterator = nextTreeIterator;
        advance();
        return currentTreeIterator;
      }

      public void remove() {
        throw new UnsupportedOperationException();
      }

      private void advance() {
        nextTreeIterator = null;
        while (nextTreeIterator == null && fileIterator.hasNext()) {
        	File file = fileIterator.next();
//        	System.out.println(file);
          try {
          	if (readerToClose!=null) {
//          		System.out.println("closing "+lastReader.toString());
          		readerToClose.close();
          	}
          	readerToClose = lastReader;
          	lastReader = currentFileReader;
//          	currentFileReader = new BufferedReader(
//								new InputStreamReader(new FileInputStream(file), this.charset));
//            nextTreeIterator = new Trees.PennTreeReader(currentFileReader);
            nextTreeIterator = new Trees.PennTreeReader(new BufferedReader(
								new InputStreamReader(new FileInputStream(file), this.charset)));
          } catch (FileNotFoundException e) {
          } catch (UnsupportedCharsetException e) {
          	throw new Error("Unsupported charset in file "+file.getPath());
          } 
          catch (IOException e) {
          	new Error("Error closing file handle");
          }
        }
      	if (readerToClose!=null){
					try {
						readerToClose.close();
					} catch (IOException e) {
						// TODO Auto-generated catch block
						e.printStackTrace();
					}
      	}
      }

      TreeIteratorIterator(List files, Charset charset) {
        this.fileIterator = files.iterator();
        this.charset = charset;
        advance();
      }
    }

    @Override
		public Iterator> iterator() {
      return new ConcatenationIterator>(new TreeIteratorIterator(files, this.charset));
    }

    @Override
		public int size() {
      int size = 0;
      Iterator i = iterator();
      while (i.hasNext()) {
        size++;
        i.next();
      }
      return size;
    }

    private List getFilesUnder(String path, FileFilter fileFilter) {
      File root = new File(path);
      List files = new ArrayList();
      addFilesUnder(root, files, fileFilter);
      return files;
    }

    private void addFilesUnder(File root, List files, FileFilter fileFilter) {
      if (! fileFilter.accept(root)) return;
      if (root.isFile()) {
        files.add(root);
        return;
      }
      if (root.isDirectory()) {
        SortedSet children = new TreeSet(Arrays.asList(root.listFiles()));
        for (File child : children) {
         
          addFilesUnder(child, files, fileFilter);
        }
      }
    }

    public TreeCollection(String path, int lowFileNum, int highFileNum, Charset charset) {
      FileFilter fileFilter = new NumberRangeFileFilter(".mrg", lowFileNum, highFileNum, true);
      this.files = getFilesUnder(path, fileFilter);
//      for (File f : files) System.out.println(f.toString());
      this.charset = charset;
    }
    public TreeCollection(String path, int lowFileNum, int highFileNum, String charsetName) {
    	this(path,lowFileNum,highFileNum,Charset.forName(charsetName));
    }
    public TreeCollection(String path, int lowFileNum, int highFileNum) {
    	this(path,lowFileNum,highFileNum,Charset.defaultCharset());
    }
  }

  public static Collection> readTrees(String path, Charset charset) {
    return readTrees(path, -1, Integer.MAX_VALUE, charset);
  }

  public static Collection> readTrees(String path, int lowFileNum, int highFileNumber, Charset charset) {
    return new TreeCollection(path, lowFileNum, highFileNumber, charset);
  }

  public static void main(String[] args) {
    Collection> trees = readTrees(args[0], Charset.defaultCharset());
    for (Tree tree : trees) {
      tree = (new Trees.StandardTreeNormalizer()).transformTree(tree);
      System.out.println(Trees.PennTreeRenderer.render(tree));
    }
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy