All downloads are free. Search and download functionality uses the official Maven repository.

edu.berkeley.nlp.treebank.TreebankFetcher Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
package edu.berkeley.nlp.treebank;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.syntax.Trees.TreeTransformer;
import edu.berkeley.nlp.util.Logger;
import edu.berkeley.nlp.util.StopWatch;

public class TreebankFetcher {

	private final List> transformers = new ArrayList>();
	private int maxLength = Integer.MAX_VALUE;
	private int minLength = 0;
	private int maxTrees = Integer.MAX_VALUE;
	private final boolean verbose ;

	public TreebankFetcher(boolean verbose) {
		this.verbose = verbose;
	}

	public TreebankFetcher() {
		this(false);
	}

	public void setMaxLength(int maxLength) {
		this.maxLength = maxLength;
	}

	public void setMinLength(int minLength) {
		this.minLength = minLength;
	}

	public void setMaxTrees(int maxTrees) {
		this.maxTrees = maxTrees;
	}



	public void addTransformer(TreeTransformer transformer) {
		transformers.add ( transformer );
	}

	public Iterable> getTrees(String path) {
		return getTrees(path,-1,Integer.MAX_VALUE);
	}

  public Iterable> getTrees(String path, String ext) {
    return getTrees(path,ext,-1,Integer.MAX_VALUE);
  }


  public Iterable> getTrees(String path, int start, int stop) {
    return getTrees(path, "mrg", start,stop);
  }
  
	public Iterable> getTrees(String path, String ext, int start, int stop) {
		StopWatch stopwatch = new StopWatch();
		if (verbose) {
			Logger.i().logs("Loading Trees from %s [%d,%d]...", path, start, stop);
			System.err.flush();
			stopwatch.start();
		}
		final Collection> rawTrees =  PennTreebankReader.readTrees(path, start * 100, stop * 100);
		if (verbose) {
			stopwatch.accumStop();
			Logger.i().logs("Done loaded %d trees in %.3f seconds\n", rawTrees.size(), stopwatch.ms);
			Logger.i().logs("Applying transformers %s...\n", transformers.toString());
			stopwatch.reset();
			stopwatch.start();
		}
		
		return new Iterable>() {

			public Iterator> iterator() {
				final Iterator> rawIt = rawTrees.iterator();;
				return new Iterator>() {

					Tree nextTree = null;
					int count = 0;
					
					public boolean hasNext() {
						// TODO Auto-generated method stub
						if (count >= maxTrees) { return false; }
						queueNext();
						return nextTree != null;						
					}

					public Tree next() {
						queueNext();						
						Tree retTree = nextTree;
						nextTree = null;
						++count;
						return retTree;
					}
					
					private void queueNext() {
						if (nextTree != null) {
							return;
						}
						if (!rawIt.hasNext()) { return ; }
						Tree tree = rawIt.next();
						for (TreeTransformer transformer: transformers) {
							tree = transformer.transformTree(tree);
						}
						if (tree.getYield().size() > maxLength) {
							queueNext();
							return;
						}
						if (tree.getYield().size() < minLength) {
							queueNext();
							return;
						}
						nextTree = tree;
					}

					public void remove() {
						// TODO Auto-generated method stub
						throw new RuntimeException();
					}
					
				};
			}			
		};
//		for (Tree tree: rawTrees) {
//			for (TreeTransformer transformer: transformers) {
//				tree = transformer.transformTree(tree);
//			}
//			if (tree.getYield().size() > maxLength) {
//				continue;
//			}
//
//			if (tree.getYield().size() < minLength) {
//				continue;
//			}
//			trees.add(tree);
//			if (trees.size() >= maxTrees) {
//				break;
//			}
//		}
//		if (verbose) {
//			stopwatch.accumStop();
//			LogInfo.logs("Transformed %d trees in %.3f seconds\n", trees.size(), stopwatch.ms);
//		}
//		return trees;
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy