All Downloads are FREE. The search and download features use the official Maven repository.

edu.berkeley.nlp.PCFGLA.DumpGrammar Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
/**
 * 
 */
package edu.berkeley.nlp.PCFGLA;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;

import edu.berkeley.nlp.util.Numberer;

/**
 * @author petrov
 *
 */
public class DumpGrammar {

	/**
	 * @param args
	 */
	public static void main(String[] args) {
		
		if (args.length<2) {
      System.out.println(
          "usage: java -cp berkeleyParser.jar edu/berkeley/nlp/parser/DumpGrammar   [] \n " +
          "reads in a serialized grammar file and writes it to a text file."
                	);
      System.exit(2);
    }

		String inFileName = args[0];
		String outName = args[1];
		
		System.out.println("Loading grammar from file "+inFileName+".");
		ParserData pData = ParserData.Load(inFileName);
		if (pData == null) {
			System.out.println("Failed to load grammar from file" + inFileName + ".");
			System.exit(1);
		}
		
		Grammar grammar = pData.getGrammar();
//		if (grammar instanceof HierarchicalGrammar)
//			grammar = (HierarchicalGrammar)grammar;
		SophisticatedLexicon lexicon = (SophisticatedLexicon)pData.getLexicon();
		
		
		Numberer.setNumberers(pData.getNumbs());
		dumpGrammar(outName, grammar, lexicon);

	}

	/**
	 * @param args
	 * @param outName
	 * @param pData
	 * @param grammar
	 * @param lexicon
	 */
	public static void dumpGrammar(String outName, Grammar grammar, SophisticatedLexicon lexicon) {
		Numberer tagNumberer = Numberer.getGlobalNumberer("tags");
		grammar.splitRules();
		

    Numberer n = Numberer.getGlobalNumberer("tags");

		System.out.println("Writing output to files "+outName+".xxx");
		PrintWriter out, outN;
		try {

			// write binary rules
			out = new PrintWriter(new BufferedWriter(new FileWriter(outName+".binary")));
			outN = new PrintWriter(new BufferedWriter(new FileWriter(outName+".num.binary")));

			for (int state = 0; state < grammar.numStates; state++) {
				BinaryRule[] parentRules = grammar.splitRulesWithP(state);
				for (int i = 0; i < parentRules.length; i++) {
					int number = 0;
					BinaryRule r = parentRules[i];
					double[][][] scores = r.getScores2();
			    
			    String lState = (String)n.object(r.leftChildState);
			    if (lState.endsWith("^g")) lState = lState.substring(0,lState.length()-2);
			    String rState = (String)n.object(r.rightChildState);
			    if (rState.endsWith("^g")) rState = rState.substring(0,rState.length()-2);
			    String pState = (String)n.object(r.parentState);
			    if (pState.endsWith("^g")) pState = pState.substring(0,pState.length()-2);
			    StringBuilder sb = new StringBuilder();

			    for (int lS=0; lS0) {
			    				sb.append(pState+"_"+pS+ " "+lState+"_"+lS+ " "+rState+"_"+rS +" "+p+"\n");
			    				number++;
			    			}
			    		}
			    	}
			    }
			    out.print(sb.toString());
			    outN.print(number+"\n");
				}
			}
			out.flush();
			outN.flush();
			out.close();
			outN.close();

			// write unary rules
			out = new PrintWriter(new BufferedWriter(new FileWriter(outName+".unary")));
			outN = new PrintWriter(new BufferedWriter(new FileWriter(outName+".num.unary")));

			for (int state = 0; state < grammar.numStates; state++) {
				UnaryRule[] unaries = grammar.getClosedViterbiUnaryRulesByParent(state);
				for (int r = 0; r < unaries.length; r++) {
					int number = 0;
					UnaryRule ur = unaries[r];
					double[][] scores = ur.getScores2();
					
			    String cState = (String)n.object(ur.childState);
			    if (cState.endsWith("^g")) cState = cState.substring(0,cState.length()-2);
			    String pState = (String)n.object(ur.parentState);
			    if (pState.endsWith("^g")) pState = pState.substring(0,pState.length()-2);
			    StringBuilder sb = new StringBuilder();

			    for (int cS=0; cS0) {
			  				sb.append(pState+"_"+pS+ " " + cState+"_"+cS +" "+p+"\n");
			  				number++;
			  			}
			  		}
			    }
			    out.print(sb.toString());
			    outN.print(number+"\n");
				}
			}
			out.flush();
			outN.flush();
			out.close();
			outN.close();
			
			// split trees
			grammar.writeSplitTrees(new BufferedWriter(new FileWriter(outName+".hierarchy")));

			// numbsubstates
			out = new PrintWriter(new BufferedWriter(new FileWriter(outName+".numstates")));
			for (int state = 0; state < grammar.numStates; state++) {
				String tag = (String)tagNumberer.object(state);
				if (tag.endsWith("^g")) tag = tag.substring(0,tag.length()-2);
				out.write(tag+"\t"+grammar.numSubStates[state]+"\n");
			}
			out.flush();
			out.close();
			
			// lexicon
			out = new PrintWriter(new BufferedWriter(new FileWriter(outName+".lexicon")));
			out.write(lexicon.toString());
			out.flush();
			out.close();

			// words
			out = new PrintWriter(new BufferedWriter(new FileWriter(outName+".words")));
			for (String word : lexicon.wordCounter.keySet())
				out.write(word + "\n");
			out.flush();
			out.close();

		} catch (IOException ex) { ex.printStackTrace();}
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy