All downloads are free. Search and download functionality uses the official Maven repository.

edu.berkeley.nlp.conll.DepToPTB Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
/**
 * 
 */
package edu.berkeley.nlp.conll;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

import edu.berkeley.nlp.PCFGLA.Option;
import edu.berkeley.nlp.PCFGLA.OptionParser;
import edu.berkeley.nlp.PCFGLA.GrammarTrainer.Options;
import edu.berkeley.nlp.syntax.Tree;

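/**
 * Converts dependency-annotated sentences into bracketed trees: reads the
 * input file sentence by sentence and prints one tree per sentence.
 *
 * @author petrov
 */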
public class DepToPTB {

	/**
	 * Command-line options for this tool. An instance is produced from the
	 * program arguments by {@code OptionParser} in {@code main}.
	 *
	 * Inside {@code DepToPTB} this nested class takes precedence over the
	 * imported {@code GrammarTrainer.Options} of the same simple name.
	 */
	public static class Options {

		/** Path of the input file that {@code main} reads line by line. */
		@Option(name = "-in", required = true, usage = "Input file with dependency-annotated sentences (Required)")
		public String inFileName;

		/**
		 * When true, {@code turnIntoTree} takes the tag from field 4 of each
		 * token line instead of field 3.
		 */
		@Option(name = "-finePOStags", usage = "Use fine POS tags (Default: false=coarse)")
		public boolean useFinePOS = false;

	}
	
	/**
	 * Command-line entry point. Reads the file named by {@code -in} as UTF-8,
	 * echoes every input line to standard output and, at the end of each
	 * sentence, prints the tree built by {@code turnIntoTree} wrapped in
	 * {@code "( " ... ")"}.
	 *
	 * Sentences are separated by blank (empty or whitespace-only) lines. A
	 * final sentence that is not followed by a blank line is still converted,
	 * and runs of blank lines do not produce empty sentences.
	 *
	 * Any exception raised while reading or converting is printed with its
	 * stack trace and ends processing; the reader is closed in every case.
	 *
	 * @param args command-line arguments, parsed into {@link Options}
	 */
	public static void main(String[] args) {
		// Sample token rows for one sentence, kept from the original author:
//		String[] sentence = {
//				"1 The _ DT DT _ 4 NMOD _ _\n",
//				"2 luxury _ NN NN _ 4 NMOD _ _\n",
//				"3 auto _ NN NN _ 4 NMOD _ _\n",
//				"4 maker _ NN NN _ 7 SBJ _ _\n",
//				"5 last _ JJ JJ _ 6 NMOD _ _\n",
//				"6 year _ NN NN _ 7 VMOD _ _\n",
//				"7 sold _ VB VBD _ 0 ROOT _ _\n",
//				"8 1,214 _ CD CD _ 9 NMOD _ _\n",
//				"9 cars _ NN NNS _ 7 OBJ _ _\n",
//				"10 in _ IN IN _ 7 ADV _ _\n",
//				"11 the _ DT DT _ 12 NMOD _ _\n",
//				"12 U.S. _ NN NNP _ 10 PMOD _ _\n"};
		OptionParser optParser = new OptionParser(Options.class);
		Options opts = (Options) optParser.parse(args, true);
		// provide feedback on command-line arguments
//		System.out.println("Calling with " + optParser.getPassedInOptions());

		try {
			BufferedReader input = new BufferedReader(new InputStreamReader(
					new FileInputStream(opts.inFileName), Charset.forName("UTF-8")));
			try {
				List<String> sentence = new ArrayList<String>();
				String line;
				while ((line = input.readLine()) != null) {
					System.out.println(line);
					if (line.trim().length() == 0) {
						// a blank line ends the current sentence
						printTree(sentence, opts.useFinePOS);
						sentence = new ArrayList<String>();
					} else {
						sentence.add(line);
					}
				}
				// the file may end without a trailing blank line
				printTree(sentence, opts.useFinePOS);
			} finally {
				// always release the file handle, even when conversion fails
				input.close();
			}
		} catch (Exception ex) {
			ex.printStackTrace();
		}

	}

	/** Prints the bracketed tree for one sentence; does nothing when the sentence has no token lines. */
	private static void printTree(List<String> sentence, boolean useFinePOS) {
		if (sentence.isEmpty()) {
			return;
		}
		System.out.println("( " + turnIntoTree(sentence, useFinePOS) + ")");
	}

	/**
	 * Builds a tree for one sentence from its token lines, using each token's
	 * head field to decide which node it is attached under.
	 *
	 * NOTE(review): several lines of this method look damaged in this copy
	 * (loop headers, the declarations of 'fields' and 'word', and generic type
	 * arguments are missing, apparently where angle-bracketed text was
	 * stripped), so it does not compile as shown. Restore those lines from the
	 * upstream source rather than guessing.
	 *
	 * @param sentence the token lines of one sentence, one entry per word
	 * @param useFinePOS if true the tag is read from field 4, otherwise from field 3
	 * @return the tree stored at the root word's index, or the single
	 *         tag-over-word tree when the sentence has exactly one word
	 */
	private static Tree turnIntoTree(List sentence, boolean useFinePOS) {
		// index of the field that supplies the node label for each word
		int posIndex = (useFinePOS) ? 4 : 3;
		int nWords = sentence.size();
		Tree[] trees = new Tree[nWords];
		List[] childIndices = new List[nWords];
		int[] freeKids = new int[nWords];
		int[] parentIndices = new int[nWords];
		// stays -1 unless some word's head field parses to 0 (see pIndex below)
		int rootIndex = -1;
		
		// NOTE(review): loop header and body damaged; presumably this gives every
		// word an empty childIndices list - confirm against upstream
		for (int i=0; i();
		}
		
		// NOTE(review): loop header damaged; 'fields' and 'word' are used below but
		// their declarations are missing here. Presumably 'fields' holds the
		// columns of sentence line i - confirm against upstream
		for (int i=0; i child = new Tree(word);
			List> childList = new ArrayList>(1);
			childList.add(child);
			String tag = fields[posIndex];
			// both bracket tags are relabelled "LRB"
			// NOTE(review): ")" may have been meant to become "RRB" - confirm
			if (tag.equals("(")||tag.equals(")")) tag = "LRB";
			// tag node with the word node as its only child
			trees[i] = new Tree(tag, childList);
			// field 6 is parsed as the head's position; after subtracting 1 it is
			// used as an array index, and -1 marks the root word
			int pIndex = Integer.parseInt(fields[6])-1;
			parentIndices[i] = pIndex;
			if (pIndex==-1) rootIndex=i;
			else childIndices[pIndex].add(i);
			// every word is also listed in its own child list
			childIndices[i].add(i);
		}
		
		// a one-word sentence is returned as the bare tag-over-word tree
		if (nWords == 1) return trees[0];
		
		// NOTE(review): the next two lines are damaged; the statement ending in
		// ">0){" and the inner loop's test are missing, so the condition under
		// which a node gets assembled cannot be read from this copy
		for (int i=0; i0){
			for (int i=0; i attach them
					List> childList = new ArrayList>();
					// gather the current trees of every index listed for word i
					// (per the loop above, that list includes i itself)
					for (Integer c : childIndices[i]){
						childList.add(trees[c]);
					}		
					// the new node reuses word i's current label with "*" appended
					Tree newTree = new Tree(trees[i].getLabel()+"*",childList);
					trees[i] = newTree;
					// bump the parent's counter; presumably freeKids counts children
					// that have already been assembled - confirm
					if (parentIndices[i]>=0) freeKids[parentIndices[i]]++;
					// empty the list once word i's node has been built
					childIndices[i] = new LinkedList();
				}
			}
		}
		// NOTE(review): rootIndex is still -1 here if no word had head 0, which
		// makes this index out of bounds
		return trees[rootIndex];
	}
	
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy