All downloads are free. The search and download functionality uses the official Maven repository.

opennlp.tools.parser.AbstractContextGenerator Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package opennlp.tools.parser;

import java.util.Collection;
import java.util.List;
import java.util.Set;

/**
 * Abstract class containing many of the methods used to generate contexts for parsing.
 */
public abstract class AbstractContextGenerator {

  protected static final String EOS = "eos";

  protected boolean zeroBackOff;
  /** Set of punctuation to be used in generating features. */
  protected Set punctSet;
  protected boolean useLabel;

  /**
   * Creates punctuation feature for the specified punctuation at the specified index
   * {@code i} based on the {@code punctuation} mark.
   *
   * @param punct The punctuation which is in context.
   * @param i The index of the {@code punctuation} with relative to the parse.
   * @return Punctuation feature for the parse and the punctuation at the specified index.
   */
  protected String punct(Parse punct, int i) {
    return i + "=" + punct.getCoveredText();
  }

  /**
   * Creates punctuation feature for the specified punctuation at the specified index
   * {@code i} based on the {@code punctuation}'s tag.
   * 
   * @param punct The punctuation which is in context.
   * @param i The index of the {@code punctuation} relative to the parse.
   * @return Punctuation feature for the parse and the punctuation at the specified index.
   */
  protected String punctbo(Parse punct, int i) {
    return i + "=" + punct.getType();
  }

  protected String cons(Parse p, int i) {
    StringBuilder feat = new StringBuilder(20);
    feat.append(i).append("=");
    if (p != null) {
      if (useLabel && i < 0) {
        feat.append(p.getLabel()).append("|");
      }
      feat.append(p.getType()).append("|").append(p.getHead().getCoveredText());
    }
    else {
      feat.append(EOS);
    }
    return feat.toString();
  }

  protected String consbo(Parse p, int i) { //cons back-off
    StringBuilder feat = new StringBuilder(20);
    feat.append(i).append("*=");
    if (p != null) {
      if (useLabel && i < 0) {
        feat.append(p.getLabel()).append("|");
      }
      feat.append(p.getType());
    }
    else {
      feat.append(EOS);
    }
    return feat.toString();
  }

  /**
   * Generates a string representing the grammar rule production that the specified parse
   * is starting.
   * 

* The rule is of the form {@code p.type -> c.children[0..n].type}. * * @param p The {@link Parse} which stats the production. * @param includePunctuation Whether punctuation should be included in the production. * @return A string representing the grammar rule production that the specified parse * is starting. */ protected String production(Parse p, boolean includePunctuation) { StringBuilder production = new StringBuilder(20); production.append(p.getType()).append("->"); Parse[] children = AbstractBottomUpParser.collapsePunctuation(p.getChildren(),punctSet); for (int ci = 0; ci < children.length; ci++) { production.append(children[ci].getType()); if (ci + 1 != children.length) { production.append(","); Collection nextPunct = children[ci].getNextPunctuationSet(); if (includePunctuation && nextPunct != null) { //TODO: make sure multiple punctuation comes out the same for (Parse punct : nextPunct) { production.append(punct.getType()).append(","); } } } } return production.toString(); } protected void cons2(List features, Cons c0, Cons c1, Collection punct1s, boolean bigram) { if (punct1s != null) { for (Parse p : punct1s) { String punctbo = punctbo(p, c1.index <= 0 ? 
c1.index - 1 : c1.index); //punctbo(1); features.add(punctbo); if (c0.index == 0) { //TODO look at removing case //cons(0)punctbo(1) if (c0.unigram) features.add(c0.cons + "," + punctbo); features.add(c0.consbo + "," + punctbo); } if (c1.index == 0) { //TODO look at removing case //punctbo(1)cons(1) if (c1.unigram) features.add(punctbo + "," + c1.cons); features.add(punctbo + "," + c1.consbo); } //cons(0)punctbo(1)cons(1) if (bigram) features.add(c0.cons + "," + punctbo + "," + c1.cons); if (c1.unigram) features.add(c0.consbo + "," + punctbo + "," + c1.cons); if (c0.unigram) features.add(c0.cons + "," + punctbo + "," + c1.consbo); features.add(c0.consbo + "," + punctbo + "," + c1.consbo); } } else { //cons(0),cons(1) if (bigram) features.add(c0.cons + "," + c1.cons); if (c1.unigram) features.add(c0.consbo + "," + c1.cons); if (c0.unigram) features.add(c0.cons + "," + c1.consbo); features.add(c0.consbo + "," + c1.consbo); } } /** * Creates cons features involving the 3 specified nodes and adds them to the specified feature list. * * @param features The list of features. * @param c0 The first {@link Cons node}. * @param c1 The second {@link Cons node}. * @param c2 The third {@link Cons node}. * @param punct1s The punctuation between {@code c0} and {@code c1}. * @param punct2s The punctuation between {@code c1} and {@code c2}. * @param trigram Specifies whether lexical tri-gram features between these nodes * should be generated. * @param bigram1 Specifies whether lexical bi-gram features between {@code c0} and {@code c1} * should be generated. * @param bigram2 Specifies whether lexical bi-gram features between {@code c1} and {@code c2} * should be generated. 
*/ protected void cons3(List features, Cons c0, Cons c1, Cons c2, Collection punct1s, Collection punct2s, boolean trigram, boolean bigram1, boolean bigram2) { // features.add("stage=cons(0),cons(1),cons(2)"); if (punct1s != null) { if (c0.index == -2) { for (Parse p : punct1s) { // String punct = punct(p,c1.index); String punctbo = punctbo(p, c1.index <= 0 ? c1.index - 1 : c1.index); //punct(-2) //TODO consider changing //features.add(punct); //punctbo(-2) features.add(punctbo); } } } if (punct2s != null) { if (c2.index == 2) { for (Parse p : punct2s) { // String punct = punct(p,c2.index); String punctbo = punctbo(p, c2.index); //punct(2) //TODO consider changing //features.add(punct); //punctbo(2) features.add(punctbo); } } if (punct1s != null) { //cons(0),punctbo(1),cons(1),punctbo(2),cons(2) for (Parse punct2 : punct2s) { String punctbo2 = punctbo(punct2, c2.index <= 0 ? c2.index - 1 : c2.index); for (Parse punct1 : punct1s) { String punctbo1 = punctbo(punct1, c1.index <= 0 ? c1.index - 1 : c1.index); if (trigram) features.add(c0.cons + "," + punctbo1 + "," + c1.cons + "," + punctbo2 + "," + c2.cons); if (bigram2) features.add(c0.consbo + "," + punctbo1 + "," + c1.cons + "," + punctbo2 + "," + c2.cons); if (c0.unigram && c2.unigram) features.add(c0.cons + "," + punctbo1 + "," + c1.consbo + "," + punctbo2 + "," + c2.cons); if (bigram1) features.add(c0.cons + "," + punctbo1 + "," + c1.cons + "," + punctbo2 + "," + c2.consbo); if (c2.unigram) features.add(c0.consbo + "," + punctbo1 + "," + c1.consbo + "," + punctbo2 + "," + c2.cons); if (c1.unigram) features.add(c0.consbo + "," + punctbo1 + "," + c1.cons + "," + punctbo2 + "," + c2.consbo); if (c0.unigram) features.add(c0.cons + "," + punctbo1 + "," + c1.consbo + "," + punctbo2 + "," + c2.consbo); features.add(c0.consbo + "," + punctbo1 + "," + c1.consbo + "," + punctbo2 + "," + c2.consbo); if (zeroBackOff) { if (bigram1) features.add(c0.cons + "," + punctbo1 + "," + c1.cons + "," + punctbo2); if (c1.unigram) 
features.add(c0.consbo + "," + punctbo1 + "," + c1.cons + "," + punctbo2); if (c0.unigram) features.add(c0.cons + "," + punctbo1 + "," + c1.consbo + "," + punctbo2); features.add(c0.consbo + "," + punctbo1 + "," + c1.consbo + "," + punctbo2); } } } } else { //punct1s == null //cons(0),cons(1),punctbo(2),cons(2) for (Parse punct2 : punct2s) { String punctbo2 = punctbo(punct2, c2.index <= 0 ? c2.index - 1 : c2.index); if (trigram) features.add(c0.cons + "," + c1.cons + "," + punctbo2 + "," + c2.cons); if (bigram2) features.add(c0.consbo + "," + c1.cons + "," + punctbo2 + "," + c2.cons); if (c0.unigram && c2.unigram) features.add(c0.cons + "," + c1.consbo + "," + punctbo2 + "," + c2.cons); if (bigram1) features.add(c0.cons + "," + c1.cons + "," + punctbo2 + "," + c2.consbo); if (c2.unigram) features.add(c0.consbo + "," + c1.consbo + "," + punctbo2 + "," + c2.cons); if (c1.unigram) features.add(c0.consbo + "," + c1.cons + "," + punctbo2 + "," + c2.consbo); if (c0.unigram) features.add(c0.cons + "," + c1.consbo + "," + punctbo2 + "," + c2.consbo); features.add(c0.consbo + "," + c1.consbo + "," + punctbo2 + "," + c2.consbo); if (zeroBackOff) { if (bigram1) features.add(c0.cons + "," + c1.cons + "," + punctbo2); if (c1.unigram) features.add(c0.consbo + "," + c1.cons + "," + punctbo2); if (c0.unigram) features.add(c0.cons + "," + c1.consbo + "," + punctbo2); features.add(c0.consbo + "," + c1.consbo + "," + punctbo2); } } } } else { if (punct1s != null) { //cons(0),punctbo(1),cons(1),cons(2) for (Parse punct1 : punct1s) { String punctbo1 = punctbo(punct1, c1.index <= 0 ? 
c1.index - 1 : c1.index); if (trigram) features.add(c0.cons + "," + punctbo1 + "," + c1.cons + "," + c2.cons); if (bigram2) features.add(c0.consbo + "," + punctbo1 + "," + c1.cons + "," + c2.cons); if (c0.unigram && c2.unigram) features.add(c0.cons + "," + punctbo1 + "," + c1.consbo + "," + c2.cons); if (bigram1) features.add(c0.cons + "," + punctbo1 + "," + c1.cons + "," + c2.consbo); if (c2.unigram) features.add(c0.consbo + "," + punctbo1 + "," + c1.consbo + "," + c2.cons); if (c1.unigram) features.add(c0.consbo + "," + punctbo1 + "," + c1.cons + "," + c2.consbo); if (c0.unigram) features.add(c0.cons + "," + punctbo1 + "," + c1.consbo + "," + c2.consbo); features.add(c0.consbo + "," + punctbo1 + "," + c1.consbo + "," + c2.consbo); //zero backoff case covered by cons(0)cons(1) } } else { //cons(0),cons(1),cons(2) if (trigram) features.add(c0.cons + "," + c1.cons + "," + c2.cons); if (bigram2) features.add(c0.consbo + "," + c1.cons + "," + c2.cons); if (c0.unigram && c2.unigram) features.add(c0.cons + "," + c1.consbo + "," + c2.cons); if (bigram1) features.add(c0.cons + "," + c1.cons + "," + c2.consbo); if (c2.unigram) features.add(c0.consbo + "," + c1.consbo + "," + c2.cons); if (c1.unigram) features.add(c0.consbo + "," + c1.cons + "," + c2.consbo); if (c0.unigram) features.add(c0.cons + "," + c1.consbo + "," + c2.consbo); features.add(c0.consbo + "," + c1.consbo + "," + c2.consbo); } } } /** * Generates features for nodes surrounding a completed node of the specified {@code type}. * * @param node A surrounding {@link Parse node}. * @param i The index of the surrounding {@code node} with respect to the completed node. * @param type The type of the completed node. * @param punctuation The punctuation adjacent and between the specified surrounding node. * @param features A list to which features are added. 
*/ protected void surround(Parse node, int i, String type, Collection punctuation, List features) { StringBuilder feat = new StringBuilder(20); feat.append("s").append(i).append("="); if (punctuation != null) { for (Parse punct : punctuation) { if (node != null) { feat.append(node.getHead().getCoveredText()).append("|").append(type) .append("|").append(node.getType()).append("|").append(punct.getType()); } else { feat.append(type).append("|").append(EOS).append("|").append(punct.getType()); } features.add(feat.toString()); feat.setLength(0); feat.append("s").append(i).append("*="); if (node != null) { feat.append(type).append("|").append(node.getType()).append("|").append(punct.getType()); } else { feat.append(type).append("|").append(EOS).append("|").append(punct.getType()); } features.add(feat.toString()); feat.setLength(0); feat.append("s").append(i).append("*="); feat.append(type).append("|").append(punct.getType()); features.add(feat.toString()); } } else { if (node != null) { feat.append(node.getHead().getCoveredText()).append("|").append(type) .append("|").append(node.getType()); } else { feat.append(type).append("|").append(EOS); } features.add(feat.toString()); feat.setLength(0); feat.append("s").append(i).append("*="); if (node != null) { feat.append(type).append("|").append(node.getType()); } else { feat.append(type).append("|").append(EOS); } features.add(feat.toString()); } } /** * Produces features to determine whether the specified child node is part of * a complete constituent of the specified type and adds those features to the * specified list. * * @param child The {@link Parse node} to consider. * @param i A string indicating the position of the child node. * @param type The type of constituent being built. * @param features List to add features to. 
*/ protected void checkcons(Parse child, String i, String type, List features) { StringBuilder feat = new StringBuilder(20); feat.append("c").append(i).append("=").append(child.getType()).append("|") .append(child.getHead().getCoveredText()).append("|").append(type); features.add(feat.toString()); feat.setLength(0); feat.append("c").append(i).append("*=").append(child.getType()).append("|").append(type); features.add(feat.toString()); } protected void checkcons(Parse p1, Parse p2, String type, List features) { StringBuilder feat = new StringBuilder(20); feat.append("cil=").append(type).append(",").append(p1.getType()).append("|") .append(p1.getHead().getCoveredText()).append(",").append(p2.getType()) .append("|").append(p2.getHead().getCoveredText()); features.add(feat.toString()); feat.setLength(0); feat.append("ci*l=").append(type).append(",").append(p1.getType()).append(",") .append(p2.getType()).append("|").append(p2.getHead().getCoveredText()); features.add(feat.toString()); feat.setLength(0); feat.append("cil*=").append(type).append(",").append(p1.getType()).append("|") .append(p1.getHead().getCoveredText()).append(",").append(p2.getType()); features.add(feat.toString()); feat.setLength(0); feat.append("ci*l*=").append(type).append(",").append(p1.getType()) .append(",").append(p2.getType()); features.add(feat.toString()); } /** * Populates specified nodes array with left-most right frontier * node with a unique head. If the right frontier doesn't contain * enough nodes, then {@code nulls} are placed in the array elements. * * @param rf The current right frontier. * @param nodes The array to be populated. 
*/ protected void getFrontierNodes(List rf, Parse[] nodes) { int leftIndex = 0; int prevHeadIndex = -1; for (Parse fn : rf) { int headIndex = fn.getHeadIndex(); if (headIndex != prevHeadIndex) { nodes[leftIndex] = fn; leftIndex++; prevHeadIndex = headIndex; if (leftIndex == nodes.length) { break; } } } for (int ni = leftIndex; ni < nodes.length; ni++) { nodes[ni] = null; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy