All downloads are free. The search and download functionality uses the official Maven repository.

edu.berkeley.nlp.PCFGLA.TreeLabeler Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
/**
 * 
 */
package edu.berkeley.nlp.PCFGLA;


import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;

import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.syntax.Trees;
import edu.berkeley.nlp.syntax.Trees.PennTreeReader;
import edu.berkeley.nlp.syntax.Trees.PennTreeRenderer;
import edu.berkeley.nlp.util.Numberer;

/**
 * @author petrov
 *
 */
public class TreeLabeler {

	public static class Options {

		@Option(name = "-gr", usage = "Input File for Grammar (Required)\n")
		public String inFileName;

		@Option(name = "-labelLevel", usage = "Parse with projected grammar from this level (yielding 2^level substates) (Default: -1 = input grammar)")
		public int labelLevel = -1;

		@Option(name = "-scores", usage = "Output inside scores. (Default: false)")
		public boolean scores;

		@Option(name = "-getYield", usage = "Get the sentences only")
		public boolean getyield;

		@Option(name = "-labelOnlyPOS", usage = "Labels only the POS categories")
		public boolean labelOnlyPOS;

		@Option(name = "-onlyConfidence", usage = "Output only confidence measure, i.e. tree likelihood: P(T|w) (Default: false)")
		public boolean onlyConfidence;

		@Option(name = "-maxLength", usage = "Remove sentences that are longer than this (doesn't print an empty line)")
		public int maxLength = 1000;		

		@Option(name = "-inputFile", usage = "Read input from this file instead of reading it from STDIN.")
		public String inputFile;

		@Option(name = "-outputFile", usage = "Store output in this file instead of printing it to STDOUT.")
		public String outputFile;

		@Option(name = "-prettyPrint", usage = "Print in human readable form rather than one tree per line")
		public boolean prettyPrint;

		@Option(name = "-getPOSandYield", usage = "Get POS and words in CoNLL format")
		public boolean getPOSandYield;

		@Option(name = "-annotateTrees", usage = "Binarize and annotate trees")
		public boolean annotateTrees;

		@Option(name = "-escapeChars", usage = "Escape parantheses and backslashes")
		public boolean escapeChars;

		@Option(name = "-keepFunctionLabels", usage = "Retain function labels")
		public boolean keepFunctionLabels;

		@Option(name = "-horizontalMarkovization", usage = "Level of horizontal Markovization (Default: 0, i.e. no sibling information)")
		public int h_markov = 0;    

		@Option(name = "-verticalMarkovization", usage = "Level of vertical Markovization (Default: 1, i.e. no parent information)")
		public int v_markov = 1;    

		@Option(name = "-b", usage = "LEFT/RIGHT Binarization (Default: RIGHT)")
		public Binarization binarization = Binarization.RIGHT;

	}


	/**
	 * @param grammar
	 * @param lexicon
	 * @param labelLevel
	 */
	Grammar grammar;
	SophisticatedLexicon lexicon;
	ArrayParser labeler;
	CoarseToFineMaxRuleParser parser;
	Numberer tagNumberer;
	Binarization binarization;

	public TreeLabeler(Grammar grammar, SophisticatedLexicon lexicon, int labelLevel, Binarization bin) {
		if (labelLevel==-1){
			this.grammar = grammar.copyGrammar(false);
			this.lexicon = lexicon.copyLexicon();
		} else { // need to project
			int[][] fromMapping = grammar.computeMapping(1);
			int[][] toSubstateMapping = grammar.computeSubstateMapping(labelLevel);
			int[][] toMapping = grammar.computeToMapping(labelLevel,toSubstateMapping);
			double[] condProbs = grammar.computeConditionalProbabilities(fromMapping,toMapping);

			this.grammar = grammar.projectGrammar(condProbs,fromMapping,toSubstateMapping);
			this.lexicon = lexicon.projectLexicon(condProbs,fromMapping,toSubstateMapping);
			this.grammar.splitRules();
			double filter = 1.0e-10;
			this.grammar.removeUnlikelyRules(filter,1.0);
			this.lexicon.removeUnlikelyTags(filter,1.0);
		}
		this.grammar.logarithmMode();
		this.lexicon.logarithmMode();
		this.labeler = new ArrayParser(this.grammar, this.lexicon);
		this.parser = new CoarseToFineMaxRuleParser(grammar, lexicon, 
				1,-1,true,false, false, false, false, false, true);      
		this.tagNumberer = Numberer.getGlobalNumberer("tags");
		this.binarization = bin;
	}


	public static void main(String[] args) {
		OptionParser optParser = new OptionParser(Options.class);
		Options opts = (Options) optParser.parse(args, true);
		// provide feedback on command-line arguments
		System.err.println("Calling with " + optParser.getPassedInOptions());

		String inFileName = opts.inFileName;
		Grammar grammar = null;
		SophisticatedLexicon lexicon = null;
		TreeLabeler treeLabeler = null;
		boolean labelTree = false;
		ParserData pData = null;
		short[] numSubstates = null;
		if (inFileName==null) {
			System.err.println("Did not provide a grammar.");
		}
		else {
			labelTree = true;
			System.err.println("Loading grammar from "+inFileName+".");

			pData = ParserData.Load(inFileName);
			if (pData==null) {
				System.out.println("Failed to load grammar from file"+inFileName+".");
				System.exit(1);
			}
			grammar = pData.getGrammar();
			grammar.splitRules();
			lexicon = (SophisticatedLexicon)pData.getLexicon();

			Numberer.setNumberers(pData.getNumbs());

			int labelLevel = opts.labelLevel;
			if (labelLevel!=-1) System.err.println("Labeling with projected grammar from level "+labelLevel+".");
			treeLabeler = new TreeLabeler(grammar, lexicon, labelLevel, pData.bin);
			numSubstates = treeLabeler.grammar.numSubStates;

		}
		Numberer tagNumberer =  Numberer.getGlobalNumberer("tags");

		Trees.TreeTransformer treeTransformer = (opts.keepFunctionLabels) ? new Trees.FunctionLabelRetainingTreeNormalizer() : new Trees.StandardTreeNormalizer();
		try{
	    	BufferedReader inputData = (opts.inputFile==null) ? new BufferedReader(new InputStreamReader(System.in)) : new BufferedReader(new InputStreamReader(new FileInputStream(opts.inputFile), "UTF-8"));
			PrintWriter outputData = (opts.outputFile==null) ? new PrintWriter(new OutputStreamWriter(System.out)) : new PrintWriter(new OutputStreamWriter(new FileOutputStream(opts.outputFile), "UTF-8"), true);

			Tree tree = null;
	    	String line = "";
	    	while((line=inputData.readLine()) != null){
	    		if (line.equals("")) {
	    			outputData.write("\n");
	    			continue;
	    		}
	    		if (line.equals("(())")) {
	    			outputData.write("(())\n");
	    			continue;
	    		}
	    		tree = PennTreeReader.parseEasy(line);
	    		if (tree==null) continue;
	    		if (tree.getYield().get(0).equals("")){ // empty tree -> parse failure
	    			outputData.write("(())\n");
	    			continue;
	    		}
				if (tree.getChildren().size() == 0 || tree.getChildren().get(0).getLabel().equals("(") || tree.getYield().get(0).equals("")){ // empty tree -> parse failure
					outputData.write("(())\n");
					continue;
				}
				if (tree.getYield().size() > opts.maxLength) continue;

				if (!labelTree){
					if (opts.getyield){
						List words = tree.getYield();
						for (String word : words){
							outputData.write(word+" ");
						}
						outputData.write("\n");
					} else if (opts.annotateTrees) {
						tree = TreeAnnotations.processTree(tree, opts.v_markov, opts.h_markov, opts.binarization, false);
						outputData.write(tree+"\n");
					}
					else {
						Tree normalizedTree = treeTransformer.transformTree(tree);
						if (opts.getPOSandYield){
							List> leafs = normalizedTree.getPreTerminals();
							for (Tree leaf : leafs){
								outputData.write(leaf.getChild(0).getLabel()+"\t"+leaf.getLabel()+"\n");
							}
							outputData.write("\n");
						}
						else {
							if (opts.escapeChars) {
								outputData.write(normalizedTree.toEscapedString()+"\n");
							} else {
								outputData.write(normalizedTree+"\n");
							}
						}
					}
					continue;
				}


				tree = TreeAnnotations.processTree(tree,pData.v_markov, pData.h_markov,pData.bin,false);
				List sentence = tree.getYield();
				Tree stateSetTree = StateSetTreeList.stringTreeToStatesetTree(tree, numSubstates, false, tagNumberer);
				allocate(stateSetTree);
				Tree labeledTree = treeLabeler.label(stateSetTree, sentence, opts.scores, opts.labelOnlyPOS);
				if (opts.onlyConfidence) {
					double treeLL = stateSetTree.getLabel().getIScore(0);
					outputData.write(treeLL+"\n");
					outputData.flush();
					continue;
				}

				if (labeledTree!=null && labeledTree.getChildren().size()>0) {
					if (opts.prettyPrint){
						outputData.write(PennTreeRenderer.render(labeledTree)+"\n");
					} else {
						if (opts.labelOnlyPOS){
							labeledTree = TreeAnnotations.debinarizeTree(labeledTree);
						}
						if (opts.escapeChars) {
							outputData.write("( "+labeledTree.getChildren().get(0).toEscapedString()+")\n");
						} else {
							outputData.write("( "+labeledTree.getChildren().get(0)+")\n");
						}

					}
				}
				else {
					if (opts.labelOnlyPOS){
						List> pos = tree.getPreTerminals();
						tree = TreeAnnotations.unAnnotateTree(tree, opts.keepFunctionLabels);
						for (Tree tag : pos){
							String t = tag.getLabel();
							t = t + "-0";
							tag.setLabel(t);
						}
						if (opts.escapeChars) {
							outputData.write("( "+tree.getChildren().get(0).toEscapedString()+")\n");
						} else {
							outputData.write("( "+tree.getChildren().get(0)+")\n");
						}
					} else {
						outputData.write("(())\n");
					}
				}
				outputData.flush();
			}
			outputData.close();
		}catch (Exception ex) {
			ex.printStackTrace();
		}
		System.exit(0);
	}



	/**
	 * @param stateSetTree
	 * @return
	 */
	private Tree label(Tree stateSetTree, List sentence, boolean outputScores, boolean labelOnlyPOS) {
		Tree tree = labeler.getBestViterbiDerivation(stateSetTree,outputScores, labelOnlyPOS);
		//		if (tree==null){ // max-rule tree had no viterbi derivation
		//			tree = parser.getBestConstrainedParse(sentence, null);
		//			tree = TreeAnnotations.processTree(tree,1, 0, binarization,false);
		////			System.out.println(tree);
		//			stateSetTree = StateSetTreeList.stringTreeToStatesetTree(tree, this.grammar.numSubStates, false, tagNumberer);
		//			allocate(stateSetTree);
		//			tree = labeler.getBestViterbiDerivation(stateSetTree,outputScores);
		//		}
		return tree;
	}

	/*
	 * Allocate the inside and outside score arrays for the whole tree
	 */
	static void allocate(Tree tree) {
		tree.getLabel().allocate();
		for (Tree child : tree.getChildren()) {
			allocate(child);
		}
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy