/**
* Copyright (c) 2011, Regents of the University of Colorado
* All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* For a complete copy of the license please see the file LICENSE distributed
* with the cleartk-syntax-berkeley project or visit
* http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
*/
package org.cleartk.syntax.berkeley;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.zip.GZIPInputStream;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.util.IOUtil;
import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.factory.ConfigurationParameterFactory;
import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser;
import edu.berkeley.nlp.PCFGLA.Grammar;
import edu.berkeley.nlp.PCFGLA.Lexicon;
import edu.berkeley.nlp.PCFGLA.ParserData;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.util.Numberer;
/**
*
* Copyright (c) 2011, Regents of the University of Colorado
* All rights reserved.
*
*
* @author Philip Ogren
*/
public class ParserAnnotator<TOKEN_TYPE extends Annotation, SENTENCE_TYPE extends Annotation, TOP_NODE_TYPE extends Annotation>
extends ParserWrapper_ImplBase<TOKEN_TYPE, SENTENCE_TYPE, Tree<String>, TOP_NODE_TYPE> {
public static final String PARAM_PARSER_MODEL_PATH = ConfigurationParameterFactory.createConfigurationParameterName(
ParserAnnotator.class,
"parserModelPath");
@ConfigurationParameter
private String parserModelPath;
protected CoarseToFineMaxRuleParser parser;
private int parseFailureCount = 0;
private int sentenceCount = 0;
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
super.initialize(context);
System.out.print("initializing Berkeley Parser: " + parserModelPath + " ... ");
long start = System.nanoTime();
InputStream modelInputStream;
try {
modelInputStream = IOUtil.getInputStream(ParserAnnotator.class, parserModelPath);
GZIPInputStream gzis = new GZIPInputStream(modelInputStream); // Compressed
ObjectInputStream in = new ObjectInputStream(gzis); // Load objects
ParserData parserData = (ParserData) in.readObject(); // Read the mix of grammars
Grammar grammar = parserData.getGrammar();
Lexicon lexicon = parserData.getLexicon();
Numberer.setNumberers(parserData.getNumbs());
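// Construct the max-rule parser. Following the argument order of the Berkeley parser's
// CoarseToFineMaxRuleParser constructor, the five false flags leave the optional
// viterbi/substate/score/accurate/variational decoding modes off, while the final two
// true flags enable gold-POS constraints and cascade initialization.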
parser = new CoarseToFineMaxRuleParser(
grammar,
lexicon,
1.0,
-1,
false,
false,
false,
false,
false,
true,
true);
long stop = System.nanoTime();
float seconds = (float) (stop - start) / 1000000000;
System.out.println("done. Loaded in: " + seconds + " seconds");
} catch (IOException e) {
throw new ResourceInitializationException(e);
} catch (ClassNotFoundException e) {
throw new ResourceInitializationException(e);
}
}
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
List<SENTENCE_TYPE> sentenceList = inputTypesHelper.getSentences(jCas);
for (SENTENCE_TYPE sentence : sentenceList) {
sentenceCount++;
List<TOKEN_TYPE> tokens = inputTypesHelper.getTokens(jCas, sentence);
List<String> words = new ArrayList<String>();
List<String> tags = new ArrayList<String>();
for (TOKEN_TYPE token : tokens) {
words.add(token.getCoveredText());
String tag = inputTypesHelper.getPosTag(token);
tags.add(tag);
}
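// Hand the words and their existing POS tags to the parser; the tags serve as constraints
// on the parse. A bare leaf in the returned tree indicates that no parse could be found.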
Tree<String> tree = parser.getBestConstrainedParse(words, tags, null);
if (tree.isLeaf()) {
System.out.println("words: " + words.size() + " " + words);
System.out.println("tags: " + tags.size() + " " + tags);
System.out.println("unable to parse sentence: " + sentence.getCoveredText());
parseFailureCount++;
} else {
outputTypesHelper.addParse(jCas, tree, sentence, tokens);
}
}
}
@Override
public void collectionProcessComplete() throws AnalysisEngineProcessException {
System.out.println("total number of sentences that were not parsed was: " + parseFailureCount
+ " out of " + sentenceCount);
}
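// Standalone smoke test: loads a small grammar from the test resources and parses one
// example sentence using its gold POS tags.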
public static void main(String[] args) {
// ParserData parserData = ParserData.Load("data/experiment/berkeley/fold1.gr");
ParserData parserData = ParserData.Load("src/test/resources/models/11597317.gr");
Grammar grammar = parserData.getGrammar();
Lexicon lexicon = parserData.getLexicon();
Numberer.setNumberers(parserData.getNumbs());
CoarseToFineMaxRuleParser parser = new CoarseToFineMaxRuleParser(
grammar,
lexicon,
1.0,
-1,
false,
false,
false,
false,
false,
true,
true);
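// Example sentence, one word per element, followed by the matching Penn Treebank POS tags.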
List<String> sentence = Arrays.asList(new String[] {
"The",
"striatum",
"plays",
"a",
"pivotal",
"role",
"in",
"modulating",
"motor",
"activity",
"and",
"higher",
"cognitive",
"function",
"." });
List<String> posTags = Arrays.asList(new String[] {
"DT",
"NN",
"VBZ",
"DT",
"JJ",
"NN",
"IN",
"VBG",
"NN",
"NN",
"CC",
"JJR",
"JJ",
"NN",
"." });
System.out.println("sentence size=" + sentence.size());
System.out.println("posTags size=" + posTags.size());
Tree<String> parsedTree = parser.getBestConstrainedParse(sentence, posTags, null);
System.out.println(parsedTree);
}
}