All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.berkeley.nlp.PCFGLA.TreeGenerator Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
package edu.berkeley.nlp.PCFGLA;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;

import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.syntax.Trees;
import edu.berkeley.nlp.util.Numberer;

public class TreeGenerator {

	static Grammar grammar;
	static SophisticatedLexicon lexicon;
	static Numberer tagNumberer;
	/**
	 * @param args
	 */
	public static void main(String[] args) {
		if (args.length <3) {
			System.out.println("usage: java TreeGenerator   \n");
			System.exit(2);
		}
		String inFileName = args[0];
		int maxLength = Integer.parseInt(args[1]);
		int nTrees = Integer.parseInt(args[2]);

		System.out.println("Loading grammar from " + inFileName + ".");
		ParserData pData = ParserData.Load(inFileName);
		if (pData == null) {
			System.out.println("Failed to load grammar from file" + inFileName + ".");
			System.exit(1);
		}
		
		grammar = pData.getGrammar();
		lexicon = (SophisticatedLexicon)pData.getLexicon();
		Numberer.setNumberers(pData.getNumbs());
		tagNumberer = Numberer.getGlobalNumberer("tags");
		grammar.splitRules();
		
		int nGen = 0;
		while (nGen < nTrees){
			Tree artTree = generateTree(0, 0);
			System.out.println(artTree.getYield().toString());
		  Tree tree = TreeAnnotations.unAnnotateTree(artTree, false);
		  if (tree.getYield().size() > maxLength) continue;
		  System.out.println("Generated tree of length "+tree.getYield().size()+".\n"+Trees.PennTreeRenderer.render(tree)+"\n");
		  nGen++;
		}

	}

	private static Tree generateTree(int pState, int pSubState) {
   	String root = (String)tagNumberer.object(pState);
   	//System.out.println("Current parent: "+root+"-"+pSubState);
		BinaryRule[] bRules = grammar.splitRulesWithP(pState);
    //System.out.println("Number of binary rules: " +bRules.length);
    double randval = GrammarTrainer.RANDOM.nextDouble();
    double sum=0;
    ArrayList> children = new ArrayList>();
    for (int i = 0; i < bRules.length; i++) {
    	double[][][] scores = bRules[i].scores;
    	for (int lC=0; lCrandval){
    	      children.add( generateTree(bRules[i].leftChildState, lC) );
    	      children.add( generateTree(bRules[i].rightChildState, rC) );
    	      return new Tree( root, children );
    			}
    		}
    	}
   	}
    List uRulesList = grammar.getUnaryRulesByParent(pState); //)  getClosedViterbiUnaryRulesByParent(
    //for (int i = 0; i < uRules.length; i++) {
    	//double[][] scores = uRules[i].scores;
    for (UnaryRule uRule : uRulesList){
    	double[][] scores = uRule.scores;
  		for (int uC=0; uCrandval){
  	      children.add( generateTree(uRule.childState, uC) );
  	      return new Tree( root, children );
  			}
  		}
   	}    
    
    if (sum==0) {
    	//System.out.println("There are no rules with "+root+" as parent.");
    	String word = sampleWord(pState, pSubState);
    	List> child = Collections.singletonList( new Tree(word) );
      return new Tree( root, child );
    }
    else
      throw new Error("rule probability sum "+sum+" is more than 1!");
 	}

	// P(T|W) = P(W|T)*P(W)/P(T)
	private static String sampleWord(int tag, int substate) {
		String w = (String)tagNumberer.object(tag);
		double randval = GrammarTrainer.RANDOM.nextDouble();
    double sum=0;
    HashMap wordToTagCounter = lexicon.wordToTagCounters[tag];
    for (String word : wordToTagCounter.keySet()){
	    double c_TW = 0;
			if (lexicon.wordToTagCounters[tag]!=null &&
					lexicon.wordToTagCounters[tag].get(word)!=null) {
				c_TW = wordToTagCounter.get(word)[substate];
			}
			
	    double c_W = lexicon.wordCounter.getCount(word);
	    double c_T = lexicon.tagCounter[tag][substate];
			double total = lexicon.totalTokens;
			double pb_T_W = c_TW / c_W;
	
			double p_T = (c_T / total);
			double p_W = (c_W / total);
			double pb_W_T = pb_T_W * p_W / p_T;
			sum += pb_W_T;
			if (sum>randval) 
				return word;
    }

    return w;
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy