All downloads are free. The search and download features use the official Maven repository.

edu.berkeley.nlp.PCFGLA.SentenceSegmenter Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
package edu.berkeley.nlp.PCFGLA;

import edu.berkeley.nlp.PCFGLA.smoothing.SmoothAcrossParentBits;
import edu.berkeley.nlp.PCFGLA.smoothing.SmoothAcrossParentSubstate;
import edu.berkeley.nlp.PCFGLA.smoothing.Smoother;
import edu.berkeley.nlp.io.PTBLineLexer;
import edu.berkeley.nlp.io.PTBTokenizer;
import edu.berkeley.nlp.io.PTBLexer;
import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.syntax.Trees;
import edu.berkeley.nlp.ui.TreeJPanel;
import edu.berkeley.nlp.util.CommandLineUtils;
import edu.berkeley.nlp.util.Numberer;
import edu.berkeley.nlp.util.Pair;

import java.awt.AlphaComposite;
import java.awt.Graphics2D;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import javax.imageio.ImageIO;
import javax.swing.JFrame;


/**
 *
 * @author Slav Petrov
 */
public class SentenceSegmenter  {
	static TreeJPanel tjp;
	static JFrame frame;
	
	public static class Options {

		@Option(name = "-gr", required = true, usage = "Grammarfile (Required)\n")
		public String grFileName;

		@Option(name = "-tokenize", usage = "Tokenize input first. (Default: false=text is already tokenized)")
		public boolean tokenize;
		
		@Option(name = "-accurate", usage = "Set thresholds for accuracy. (Default: set thresholds for efficiency)")
		public boolean accurate;

		@Option(name = "-constituent", usage = "Instead of sentence probabilities return constituent probabilities")
		public boolean constituent = false;

		@Option(name = "-inputFile", usage = "Read input from this file instead of reading it from STDIN.")
		public String inputFile;

		@Option(name = "-outputFile", usage = "Store output in this file instead of printing it to STDOUT.")
		public String outputFile;
	}
	
  @SuppressWarnings("unchecked")
	public static void main(String[] args) {
		OptionParser optParser = new OptionParser(Options.class);
		Options opts = (Options) optParser.parse(args, true);

    double threshold = 1.0;
    
    String inFileName = opts.grFileName;
    ParserData pData = ParserData.Load(inFileName);
    if (pData==null) {
      System.out.println("Failed to load grammar from file"+inFileName+".");
      System.exit(1);
    }
    Grammar grammar = pData.getGrammar();
    Lexicon lexicon = pData.getLexicon();
    Numberer.setNumberers(pData.getNumbs());
    
    
    CoarseToFineMaxRuleParser parser = null;
    parser = new CoarseToFineMaxRuleParser(grammar, lexicon, threshold,-1,false,false,false, opts.accurate, false, true, true);
    parser.binarization = pData.getBinarization();
    
    
    try{
    	BufferedReader inputData = (opts.inputFile==null) ? new BufferedReader(new InputStreamReader(System.in)) : new BufferedReader(new InputStreamReader(new FileInputStream(opts.inputFile), "UTF-8"));
    	PrintWriter outputData = (opts.outputFile==null) ? new PrintWriter(new OutputStreamWriter(System.out)) : new PrintWriter(new OutputStreamWriter(new FileOutputStream(opts.outputFile), "UTF-8"), true);
    	PTBLineLexer tokenizer = null;
    	if (opts.tokenize) tokenizer = new PTBLineLexer();

    	String line = "";
    	while((line=inputData.readLine()) != null){
      	List sentence = null;
      	List posTags = null;
    		
      	String[] parts = line.split("\t");
      	if (parts.length<3) continue;
      	int nPoints = Integer.parseInt(parts[0]);
      	List> points = new ArrayList>(nPoints);
      	
      	String[] segments = parts[1].split("\\(");
      	for (int i=1; i<=nPoints; i++){
      		String[] numbers = segments[i].split(" ");
      		String n0 = numbers[0];
      		String n1 = numbers[1].substring(0,numbers[1].length()-1);
      		Pair number = new Pair(Integer.parseInt(n0), Integer.parseInt(n1));
      		points.add(number);
      	}
      	
    		if (!opts.tokenize) sentence = Arrays.asList(parts[parts.length-1].split(" "));
    		else sentence = tokenizer.tokenizeLine(parts[parts.length-1]);
    		
//    		if (sentence.size()==0) { outputData.write("\n"); continue; }//break;
    		if (sentence.size()>=200) { sentence = new ArrayList(); System.err.println("Skipping sentence with "+sentence.size()+" words since it is too long.");continue; }//break;
    		
	  		Tree parsedTree = parser.getBestConstrainedParse(sentence,posTags,null);
				double allLL = (parsedTree.getChildren().isEmpty()) ? Double.NEGATIVE_INFINITY : parser.getLogLikelihood();
				outputData.write(allLL+" ");
				for (Pair point : points){
					double partLL = parser.getSentenceProbability(point.getFirst(), point.getSecond(), opts.constituent);
					outputData.write(partLL+" ");
				}
				outputData.write("\n");
    	}
  		outputData.flush();
  		outputData.close();
    } catch (Exception ex) {
      ex.printStackTrace();
    }
    System.exit(0);
  }

  
  
//  /**
//	 * @param parsedTree
//	 * @param outputData
//	 * @param opts
//	 */
//	private static void outputTrees(List> parseTrees, PrintWriter outputData, 
//			CoarseToFineMaxRuleParser parser, Options opts) {
//		for (Tree parsedTree : parseTrees){
//			double allLL = (parsedTree.getChildren().isEmpty()) ? Double.NEGATIVE_INFINITY : parser.getLogLikelihood();
//			outputData.write(allLL+"\n");
////				continue;
//			}
//			if (!opts.binarize) parsedTree = TreeAnnotations.unAnnotateTree(parsedTree);
//			if (opts.confidence) {
//				double treeLL = (parsedTree.getChildren().isEmpty()) ? Double.NEGATIVE_INFINITY : parser.getLogLikelihood(parsedTree);
//				outputData.write(treeLL+"\t");
//			}
//			if (!parsedTree.getChildren().isEmpty()) { 
//	       			if (true) outputData.write("( "+parsedTree.getChildren().get(0)+" )\n");
////	       			else outputData.write(parsedTree.getChildren().get(0)+"\n\n");
//	    } else {
//	    	if (true) outputData.write("(())\n");
////	    	else outputData.write("()\n\n");
//	    }
//		}
//	}



	public static void writeTreeToImage(Tree tree, String fileName) throws IOException{
  	tjp.setTree(tree);
    
    BufferedImage bi =new BufferedImage(tjp.width(),tjp.height(),BufferedImage.TYPE_INT_ARGB);
    int t=tjp.height();
    Graphics2D g2 = bi.createGraphics();
    
    
    g2.setComposite(AlphaComposite.getInstance(AlphaComposite.CLEAR, 1.0f));
    Rectangle2D.Double rect = new Rectangle2D.Double(0,0,tjp.width(),tjp.height()); 
    g2.fill(rect);
    
    g2.setComposite(AlphaComposite.getInstance(AlphaComposite.SRC_OVER, 1.0f));
    
    tjp.paintComponent(g2); //paint the graphic to the offscreen image
    g2.dispose();
    
    ImageIO.write(bi,"png",new File(fileName)); //save as png format DONE!
  }

}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy