/**
* Copyright (c) 2011, Regents of the University of Colorado
* All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* For a complete copy of the license please see the file LICENSE distributed
* with the cleartk-syntax-berkeley project or visit
* http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
*/
package org.cleartk.syntax.berkeley;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.zip.GZIPInputStream;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.util.IOUtil;
import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.factory.ConfigurationParameterFactory;
import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser;
import edu.berkeley.nlp.PCFGLA.Grammar;
import edu.berkeley.nlp.PCFGLA.Lexicon;
import edu.berkeley.nlp.PCFGLA.ParserData;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.util.Numberer;
/**
*
* Copyright (c) 2011, Regents of the University of Colorado
* All rights reserved.
*
*
* @author Philip Ogren
*/
public class ParserAnnotator<TOKEN_TYPE extends Annotation, SENTENCE_TYPE extends Annotation, TOP_NODE_TYPE extends Annotation>
extends ParserWrapper_ImplBase<TOKEN_TYPE, SENTENCE_TYPE, Tree<String>, TOP_NODE_TYPE> {
public static final String PARAM_PARSER_MODEL_PATH = ConfigurationParameterFactory.createConfigurationParameterName(
ParserAnnotator.class,
"parserModelPath");
@ConfigurationParameter
private String parserModelPath;
protected CoarseToFineMaxRuleParser parser;
private int parseFailureCount = 0;
private int sentenceCount = 0;
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
super.initialize(context);
System.out.print("initializing Berkeley Parser: " + parserModelPath + " ... ");
long start = System.nanoTime();
InputStream modelInputStream;
try {
modelInputStream = IOUtil.getInputStream(ParserAnnotator.class, parserModelPath);
GZIPInputStream gzis = new GZIPInputStream(modelInputStream); // Compressed
ObjectInputStream in = new ObjectInputStream(gzis); // Load objects
ParserData parserData = (ParserData) in.readObject(); // Read the mix of grammars
Grammar grammar = parserData.getGrammar();
Lexicon lexicon = parserData.getLexicon();
Numberer.setNumberers(parserData.getNumbs());
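// Construct the max-rule parser. Following the argument order of the Berkeley parser's
// CoarseToFineMaxRuleParser constructor, the five false flags leave the optional
// viterbi/substate/score/accurate/variational decoding modes off, while the final two
// true flags enable gold-POS constraints and cascade initialization.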
parser = new CoarseToFineMaxRuleParser(
grammar,
lexicon,
1.0,
-1,
false,
false,
false,
false,
false,
true,
true);
long stop = System.nanoTime();
float seconds = (float) (stop - start) / 1000000000;
System.out.println("done. Loaded in: " + seconds + " seconds");
} catch (IOException e) {
throw new ResourceInitializationException(e);
} catch (ClassNotFoundException e) {
throw new ResourceInitializationException(e);
}
}
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
List<SENTENCE_TYPE> sentenceList = inputTypesHelper.getSentences(jCas);
for (SENTENCE_TYPE sentence : sentenceList) {
sentenceCount++;
List<TOKEN_TYPE> tokens = inputTypesHelper.getTokens(jCas, sentence);
List<String> words = new ArrayList<String>();
List<String> tags = new ArrayList<String>();
for (TOKEN_TYPE token : tokens) {
words.add(token.getCoveredText());
String tag = inputTypesHelper.getPosTag(token);
tags.add(tag);
}
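// Hand the words and their existing POS tags to the parser; the tags serve as constraints
// on the parse. A bare leaf in the returned tree indicates that no parse could be found.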
Tree<String> tree = parser.getBestConstrainedParse(words, tags, null);
if (tree.isLeaf()) {
System.out.println("words: " + words.size() + " " + words);
System.out.println("tags: " + tags.size() + " " + tags);
System.out.println("unable to parse sentence: " + sentence.getCoveredText());
parseFailureCount++;
} else {
outputTypesHelper.addParse(jCas, tree, sentence, tokens);
}
}
}
@Override
public void collectionProcessComplete() throws AnalysisEngineProcessException {
System.out.println("total number of sentences that were not parsed was: " + parseFailureCount
+ " out of " + sentenceCount);
}
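// Standalone smoke test: loads a small grammar from the test resources and parses one
// example sentence using its gold POS tags.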
public static void main(String[] args) {
// ParserData parserData = ParserData.Load("data/experiment/berkeley/fold1.gr");
ParserData parserData = ParserData.Load("src/test/resources/models/11597317.gr");
Grammar grammar = parserData.getGrammar();
Lexicon lexicon = parserData.getLexicon();
Numberer.setNumberers(parserData.getNumbs());
CoarseToFineMaxRuleParser parser = new CoarseToFineMaxRuleParser(
grammar,
lexicon,
1.0,
-1,
false,
false,
false,
false,
false,
true,
true);
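// Example sentence, one word per element, followed by the matching Penn Treebank POS tags.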
List<String> sentence = Arrays.asList(new String[] {
"The",
"striatum",
"plays",
"a",
"pivotal",
"role",
"in",
"modulating",
"motor",
"activity",
"and",
"higher",
"cognitive",
"function",
"." });
List<String> posTags = Arrays.asList(new String[] {
"DT",
"NN",
"VBZ",
"DT",
"JJ",
"NN",
"IN",
"VBG",
"NN",
"NN",
"CC",
"JJR",
"JJ",
"NN",
"." });
System.out.println("sentence size=" + sentence.size());
System.out.println("posTags size=" + posTags.size());
Tree<String> parsedTree = parser.getBestConstrainedParse(sentence, posTags, null);
System.out.println(parsedTree);
}
}