edu.berkeley.nlp.PCFGLA.PosteriorMerger Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of berkeleyparser Show documentation
The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).
The newest version!
/**
 * 
 */
package edu.berkeley.nlp.PCFGLA;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.zip.GZIPInputStream;

import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.util.ArrayUtil;
import edu.berkeley.nlp.util.Numberer;
import edu.berkeley.nlp.util.ScalingTools;


public class PosteriorMerger{

	/**
	 * Command-line options for {@link PosteriorMerger}. An instance is produced by
	 * {@code OptionParser} in {@code main}, which reads the {@code @Option} annotations
	 * declared on the fields below.
	 */
	public static class Options {

		/** Grammar input file(s); mandatory. {@code main} throws an {@code Error} when this is null. */
		@Option(name = "-grammarFiles", required = true, usage = "Input Files for Grammars.")
		public String grammarFiles;

		/** Optional path to read the input from; per the usage text, STDIN is used otherwise. */
		@Option(name = "-inputFile", usage = "Read input from this file instead of reading it from STDIN.")
		public String inputFile;

		/** Optional path to write the output to; per the usage text, STDOUT is used otherwise. */
		@Option(name = "-outputFile", usage = "Store output in this file instead of printing it to STDOUT.")
		public String outputFile;

		/**
		 * Number of grammars to combine. {@code main} sizes its per-grammar arrays with this value.
		 * NOTE(review): not marked required and has no explicit default, so it is 0 unless
		 * passed on the command line -- confirm that callers always supply it.
		 */
		@Option(name = "-nGrammars", usage = "Number of Grammars")
		public int nGrammars;

		/** Maximum sentence length; defaults to 200. */
		@Option(name = "-maxLength", usage = "Maximum sentence length (Default = 200).")
		public int maxLength = 200;
	}

	// Max-rule chart tables, all indexed [start][end][state]. They are re-allocated with
	// dimensions [length][length + 1][numStates] for each sentence by the scoring code below.
	// NOTE(review): these are static and mutable, i.e. shared by every caller in the JVM --
	// presumably this class is only meant to be driven from a single thread; confirm.
	static double[][][] maxcScore;  // start, end, state --> logProb (initialised to negative infinity)
	static int[][][] maxcSplit;  // start, end, state -> split position of the best binary rule
	static int[][][] maxcChild;  // start, end, state -> unary child (if any)
	static int[][][] maxcLeftChild;  // start, end, state -> left child of the best binary rule
	static int[][][] maxcRightChild;  // start, end, state -> right child of the best binary rule


	/**
	 * Command-line entry point: parses the {@link Options}, then reads the input line by
	 * line, treats each line as one space-separated sentence and writes one parse tree per
	 * line to the output.
	 *
	 * NOTE(review): this copy of the method appears to have lost text in three places
	 * (the loop headers that begin with "for (int gr=0; gr"). The grammar loading, the
	 * opening of the input/output streams, the posterior loading and the per-grammar
	 * scoring loop are therefore not visible here; restore them from the upstream
	 * source before changing any logic.
	 *
	 * @param args command-line arguments, see {@link Options}
	 */
	public static void main(String[] args) {
		OptionParser optParser = new OptionParser(Options.class);
		Options opts = (Options) optParser.parse(args, true);
		// provide feedback on command-line arguments
		System.err.println("Calling with " + optParser.getPassedInOptions());


		String inFileName = opts.grammarFiles;
		if (inFileName==null) {
			throw new Error("Did not provide a grammar.");
		}

		// One slot per grammar for its substate counts, rules and lexicon.
		short[][] numSubstates = new short[opts.nGrammars][];
		Grammar[] grammars = new Grammar[opts.nGrammars];
		Lexicon[] lexicons = new Lexicon[opts.nGrammars];
		// NOTE(review): the next line is truncated in this copy (first gap, see method comment).
		for (int gr=0; gr[] posteriors = null;
			while ((line = inputData.readLine()) != null) {
				// A sentence is the input line split on single spaces.
				List sentence = Arrays.asList(line.split(" "));
				// (Re)load posteriors when none are loaded yet or the current batch is used up.
				if (posteriors == null || lineIndex == posteriors[0].size()){
					posteriors = new ArrayList[nGrammars];
					// NOTE(review): truncated line (second gap). The surviving tail compares
					// something against opts.maxLength -- presumably the sentence length; confirm.
					for (int gr=0; gr opts.maxLength){
					//    			lineIndex++;
					// Skipped sentences are reported as an empty tree.
					outputData.write("(())\n");
					continue;
				}

				// Per-grammar inside/outside scores and their scales for this sentence.
				List iScores = new ArrayList(nGrammars);
				List oScores = new ArrayList(nGrammars);
				List iScales = new ArrayList(nGrammars);
				List oScales = new ArrayList(nGrammars);
				boolean[][][] allowedStates = null;

				boolean skip = false;
				// NOTE(review): truncated line (third gap); the per-grammar loop body and the
				// declaration of parsedTree are missing here.
				for (int gr=0; gr parsedTree = parser.extractBestMaxRuleParse(0, sentence.size(), sentence);
				// Strip the annotations before printing the tree.
				parsedTree = TreeAnnotations.unAnnotateTree(parsedTree, false);
				outputData.write(parsedTree+"\n");
				outputData.flush();
			}



			outputData.flush();
			outputData.close();
		}catch (Exception ex) {
			// NOTE(review): failures are only printed; the exit below still reports status 0.
			ex.printStackTrace();
		}
		System.exit(0);
	}



	/**
	 * Combines two allowed-state masks. When {@code allowedStates} is {@code null} the
	 * second mask is returned unchanged.
	 *
	 * NOTE(review): in this copy of the file the remainder of this method, and the name
	 * and first parameter of the method that follows it, appear to be missing: the text
	 * jumps from a loop header straight into a parameter list. Everything after that
	 * parameter list is the body of that following method. As far as it is visible, it:
	 * <ul>
	 * <li>allocates the static maxc* tables as [length][length + 1][numStates] and fills
	 *     maxcScore with negative infinity;</li>
	 * <li>in the "diff > 1" branch scores binary rules: for every grammar it sums
	 *     outside(parent) * rule * inside(left) * inside(right) over all substates,
	 *     divides by that grammar's logNormalizer entry, and adds the log of the sum to
	 *     gScore; the best-scoring split and children are stored in maxcScore,
	 *     maxcSplit, maxcLeftChild and maxcRightChild;</li>
	 * <li>in the "diff == 1" branch, and in the unary scoring code after it, does the
	 *     analogous computation with one child and records the winner in maxcChild.</li>
	 * </ul>
	 * Several loop headers in that body are truncated as well, so the iteration ranges
	 * and the initialisation of ruleScore, gScore, scoreToBeat and maxcScoreStartEnd are
	 * not visible here. Restore the upstream text before changing any logic.
	 */
	private static boolean[][][] mergeAllowedStates(boolean[][][] allowedStates,
			boolean[][][] allowedStates2) {
		if (allowedStates==null) return allowedStates2;
		// NOTE(review): text is missing after "i" on the next line; what follows is the
		// parameter list of a different method.
		for (int i=0; i sentence, 
			List iScores, List oScores, 
			List iScales, List oScales,
			boolean[][][] allowedStates, 
			Grammar[] grammars, Lexicon[] lexicons,
			short[][] numSubstates, boolean scale) {

		int length = sentence.size();
		int nGrammars = numSubstates.length;
		// The number of states is taken from the first grammar.
		// NOTE(review): presumably all grammars share the same state inventory -- confirm.
		int numStates = numSubstates[0].length;
		boolean[] grammarTags = grammars[0].isGrammarTag;
		Numberer tagNumberer = Numberer.getGlobalNumberer("tags");

		// Fresh chart tables for this sentence; scores start at negative infinity.
		maxcScore = new double[length][length + 1][numStates];
		maxcSplit = new int[length][length + 1][numStates];
		maxcChild      = new int[length][length + 1][numStates];
		maxcLeftChild  = new int[length][length + 1][numStates];
		maxcRightChild = new int[length][length + 1][numStates];
		ArrayUtil.fill(maxcScore, Double.NEGATIVE_INFINITY);
		
		// One normaliser per grammar. It is used below as a plain divisor of the
		// outside * rule * inside products.
		// NOTE(review): despite the name it is divided by, not subtracted -- presumably it
		// holds a probability rather than a log value; confirm against the missing code.
		double[] logNormalizer = new double[nGrammars];
		// NOTE(review): the next line is truncated; the computation of logNormalizer and the
		// start/end/diff loops are missing. The binary-rule section that follows also has a
		// truncated loop header.
		for (int i=0; i 1) {
					// diff > 1: Try binary rules
					for (short pState=0; pState "+(String)tagNumberer.object(lState)
												+" "+(String)tagNumberer.object(rState)+" in grammar "+gr);
										continue;
									}
									double[][][] scores = rule.getScores2();
									int nParentStates = numSubstates[gr][pState]; // == scores[0][0].length;
									int nLeftChildStates = numSubstates[gr][lState]; // == scores.length;
									int nRightChildStates = numSubstates[gr][rState]; // == scores[0].length;

									for (int lp = 0; lp < nLeftChildStates; lp++) {
										double lIS = iScores.get(gr)[start][split][lState][lp];
										if (lIS == 0) continue;

										for (int rp = 0; rp < nRightChildStates; rp++) {
											if (scores[lp][rp]==null) continue;
											double rIS = iScores.get(gr)[split][end][rState][rp];
											if (rIS == 0) continue;
											for (int np = 0; np < nParentStates; np++) {
												double pOS = oScores.get(gr)[start][end][pState][np];
												if (pOS == 0) continue;

												double ruleS = scores[lp][rp][np];
												if (ruleS == 0) continue;
												ruleScore += (pOS * ruleS * lIS * rIS) / logNormalizer[gr];
											}
										}
									}
//									if (ruleScore==0) continue;
									gScore += Math.log(ruleScore);
								}	


								if (gScore > scoreToBeat) {
									scoreToBeat = gScore;
									maxcScore[start][end][pState] = gScore;
									maxcSplit[start][end][pState] = split;
									maxcLeftChild[start][end][pState] = lState;
									maxcRightChild[start][end][pState] = rState;
								}
							}
						} 
					}
				} else { // diff == 1
					// We treat TAG --> word exactly as if it was a unary rule, except the score of the rule is
					// given by the lexicon rather than the grammar and that we allow another unary on top of it.
					//for (int tag : lexicon.getAllTags()){
					for (int tag=0; tag "+(String)tagNumberer.object(cState)+" in grammar "+gr);
								continue;
							}
							// NOTE(review): the loop header above is truncated; from here on the
							// visible code scores a unary rule pState -> cState in grammar gr.
							double[][] scores = rule.getScores2();

							int nChildStates = numSubstates[gr][cState]; // == scores.length;
							int nParentStates = numSubstates[gr][pState]; // == scores[0].length;

							// Sum outside(parent) * rule * inside(child) over all substate pairs.
							for (int cp = 0; cp < nChildStates; cp++) {
								double cIS = iScores.get(gr)[start][end][cState][cp];
								if (cIS == 0) continue;

								if (scores[cp]==null) continue;
								for (int np = 0; np < nParentStates; np++) {
									double pOS = oScores.get(gr)[start][end][pState][np];
									// NOTE(review): the binary-rule code above skips on pOS == 0,
									// this one on pOS < 0 -- possibly an inconsistency; confirm.
									if (pOS < 0) continue;

									double ruleS = scores[cp][np];
									if (ruleS == 0) continue;
									ruleScore += (pOS * ruleS * cIS) / logNormalizer[gr];
								}
							}
//							if (ruleScore==0) continue;
							// With the skip above disabled, a ruleScore of 0 contributes
							// Math.log(0), i.e. negative infinity, to gScore.
							gScore += Math.log(ruleScore);
						}

						// Keep the best-scoring unary child for this parent state.
						if (gScore > maxcScoreStartEnd[pState]) {
							maxcScoreStartEnd[pState] = gScore;
							maxcChild[start][end][pState] = cState;
						}
					}
				}
				// Publish the per-span score row into the chart.
				maxcScore[start][end] = maxcScoreStartEnd;
			}
		}
	}


	/**
	 * Loads a serialized {@link List} from a gzip-compressed file.
	 *
	 * The file is read as a GZIP stream wrapping a Java object stream, and the first
	 * object in it is returned. All streams are closed before returning, whether the
	 * read succeeded or failed.
	 *
	 * NOTE(review): this uses native Java deserialization, which must only be applied
	 * to files from a trusted source.
	 *
	 * @param fileName path of the gzip-compressed serialized file
	 * @return the deserialized list, or {@code null} if the file cannot be opened or
	 *         read, or if a class needed for deserialization is missing (a message is
	 *         printed to standard output in both cases)
	 * @throws ClassCastException if the stored object is not a {@link List}
	 */
	public static List loadPosteriors(String fileName) {
		FileInputStream fis = null;
		GZIPInputStream gzis = null;
		ObjectInputStream in = null;
		try {
			fis = new FileInputStream(fileName); // Load from file
			gzis = new GZIPInputStream(fis); // Compressed
			in = new ObjectInputStream(gzis); // Load objects
			return (List)in.readObject(); // Read the mix of grammars
		} catch (IOException e) {
			System.out.println("IOException\n"+e);
			return null;
		} catch (ClassNotFoundException e) {
			System.out.println("Class not found!");
			return null;
		} finally {
			// Close the outermost stream that was successfully created; closing it also
			// closes the streams it wraps. This runs on every path, so a failure while
			// opening or reading no longer leaks the underlying file handle.
			try {
				if (in != null) {
					in.close();
				} else if (gzis != null) {
					gzis.close();
				} else if (fis != null) {
					fis.close();
				}
			} catch (IOException ignored) {
				// Best effort: a failure while closing must not discard data already read.
			}
		}
	}


}