opennlp.tools.parser.chunking.BuildContextGenerator Maven / Gradle / Ivy

Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.parser.chunking;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.parser.AbstractContextGenerator;
import opennlp.tools.parser.Cons;
import opennlp.tools.parser.Parse;
import opennlp.tools.util.StringList;

/**
 * Class to generator predictive contexts for deciding how constituents should be combined together.
 */
public class BuildContextGenerator extends AbstractContextGenerator {

  private Dictionary dict;
  private String[] unigram;
  private String[] bigram;
  private String[] trigram;

  /**
   * Creates a new context generator for making decisions about combining constitients togehter.
   *
   */
  public BuildContextGenerator() {
    super();
    zeroBackOff = false;
    useLabel = true;
  }

  public BuildContextGenerator(Dictionary dict) {
    this();
    this.dict = dict;
    unigram = new String[1];
    bigram = new String[2];
    trigram = new String[3];
  }

  public String[] getContext(Object o) {
    Object[] params = (Object[]) o;
    return getContext((Parse[]) params[0], (Integer) params[1]);
  }

  /**
   * Returns the predictive context used to determine how constituent at the specified index
   * should be combined with other contisuents.
   * @param constituents The constituents which have yet to be combined into new constituents.
   * @param index The index of the constituent whcihi is being considered.
   * @return the context for building constituents at the specified index.
   */
  public String[] getContext(Parse[] constituents, int index) {
    List features = new ArrayList<>(100);
    int ps = constituents.length;

    // cons(-2), cons(-1), cons(0), cons(1), cons(2)
    // cons(-2)

    Collection punct2s = null;
    Collection punct_2s = null;

    Parse p_2 = null;
    if (index - 2 >= 0) {
      p_2 = constituents[index - 2];
    }

    Parse p_1 = null;
    if (index - 1 >= 0) {
      p_1 = constituents[index - 1];
      punct_2s = p_1.getPreviousPunctuationSet();
    }
    Parse p0 = constituents[index];
    Collection punct_1s = p0.getPreviousPunctuationSet();
    Collection punct1s = p0.getNextPunctuationSet();

    Parse p1 = null;
    if (index + 1 < ps) {
      p1 = constituents[index + 1];
      punct2s = p1.getNextPunctuationSet();
    }

    Parse p2 = null;
    if (index + 2 < ps) {
      p2 = constituents[index + 2];
    }

    boolean u_2 = true;
    boolean u_1 = true;
    boolean u0 = true;
    boolean u1 = true;
    boolean u2 = true;
    boolean b_2_1 = true;
    boolean b_10 = true;
    boolean b01 = true;
    boolean b12 = true;
    boolean t_2_10 = true;
    boolean t_101 = true;
    boolean t012 = true;

    if (dict != null) {

      if (p_2 != null) {
        unigram[0] = p_2.getHead().getCoveredText();
        u_2 = dict.contains(new StringList(unigram));
      }

      if (p2 != null) {
        unigram[0] = p2.getHead().getCoveredText();
        u2 = dict.contains(new StringList(unigram));
      }

      unigram[0] = p0.getHead().getCoveredText();
      u0 = dict.contains(new StringList(unigram));

      if (p_2 != null && p_1 != null) {
        bigram[0] = p_2.getHead().getCoveredText();
        bigram[1] = p_1.getHead().getCoveredText();
        b_2_1 = dict.contains(new StringList(bigram));

        trigram[0] = p_2.getHead().getCoveredText();
        trigram[1] = p_1.getHead().getCoveredText();
        trigram[2] = p0.getHead().getCoveredText();
        t_2_10 = dict.contains(new StringList(trigram));
      }
      if (p_1 != null && p1 != null) {
        trigram[0] = p_1.getHead().getCoveredText();
        trigram[1] = p0.getHead().getCoveredText();
        trigram[2] = p1.getHead().getCoveredText();
        t_101 = dict.contains(new StringList(trigram));
      }
      if (p_1 != null) {
        unigram[0] = p_1.getHead().getCoveredText();
        u_1 = dict.contains(new StringList(unigram));

        //extra check for 2==null case
        b_2_1 = b_2_1 && u_1 & u_2;
        t_2_10 = t_2_10 && u_1 & u_2 & u0;
        t_101 = t_101 && u_1 & u0 && u1;

        bigram[0] = p_1.getHead().getCoveredText();
        bigram[1] = p0.getHead().getCoveredText();
        b_10 = dict.contains(new StringList(bigram)) && u_1 && u0;
      }
      if (p1 != null && p2 != null) {
        bigram[0] = p1.getHead().getCoveredText();
        bigram[1] = p2.getHead().getCoveredText();
        b12 = dict.contains(new StringList(bigram));

        trigram[0] = p0.getHead().getCoveredText();
        trigram[1] = p1.getHead().getCoveredText();
        trigram[2] = p2.getHead().getCoveredText();
        t012 = dict.contains(new StringList(trigram));
      }
      if (p1 != null) {
        unigram[0] = p1.getHead().getCoveredText();
        u1 = dict.contains(new StringList(unigram));

        //extra check for 2==null case
        b12 = b12 && u1 && u2;
        t012 = t012 && u1 && u2 && u0;
        t_101 = t_101 && u0 && u_1 && u1;

        bigram[0] = p0.getHead().getCoveredText();
        bigram[1] = p1.getHead().getCoveredText();
        b01 = dict.contains(new StringList(bigram));
        b01 = b01 && u0 && u1;
      }
    }

    String consp_2 = cons(p_2, -2);
    String consp_1 = cons(p_1, -1);
    String consp0 = cons(p0, 0);
    String consp1 = cons(p1, 1);
    String consp2 = cons(p2, 2);

    String consbop_2 = consbo(p_2, -2);
    String consbop_1 = consbo(p_1, -1);
    String consbop0 = consbo(p0, 0);
    String consbop1 = consbo(p1, 1);
    String consbop2 = consbo(p2, 2);

    Cons c_2 = new Cons(consp_2,consbop_2,-2,u_2);
    Cons c_1 = new Cons(consp_1,consbop_1,-1,u_1);
    Cons c0 = new Cons(consp0,consbop0,0,u0);
    Cons c1 = new Cons(consp1,consbop1,1,u1);
    Cons c2 = new Cons(consp2,consbop2,2,u2);

    //default
    features.add("default");
    //first constituent label
    //features.add("fl="+constituents[0].getLabel());

    // features.add("stage=cons(i)");
    // cons(-2), cons(-1), cons(0), cons(1), cons(2)
    if (u0) features.add(consp0);
    features.add(consbop0);

    if (u_2) features.add(consp_2);
    features.add(consbop_2);
    if (u_1) features.add(consp_1);
    features.add(consbop_1);
    if (u1) features.add(consp1);
    features.add(consbop1);
    if (u2) features.add(consp2);
    features.add(consbop2);

    //cons(0),cons(1)
    cons2(features,c0,c1,punct1s,b01);
    //cons(-1),cons(0)
    cons2(features,c_1,c0,punct_1s,b_10);
    //features.add("stage=cons(0),cons(1),cons(2)");
    cons3(features,c0,c1,c2,punct1s,punct2s,t012,b01,b12);
    cons3(features,c_2,c_1,c0,punct_2s,punct_1s,t_2_10,b_2_1,b_10);
    cons3(features,c_1,c0,c1,punct_1s,punct1s,t_101,b_10,b01);
    //features.add("stage=other");
    String p0Tag = p0.getType();
    if (p0Tag.equals("-RRB-")) {
      for (int pi = index - 1; pi >= 0; pi--) {
        Parse p = constituents[pi];
        if (p.getType().equals("-LRB-")) {
          features.add("bracketsmatch");
          break;
        }
        if (p.getLabel().startsWith(Parser.START)) {
          break;
        }
      }
    }
    if (p0Tag.equals("-RCB-")) {
      for (int pi = index - 1; pi >= 0; pi--) {
        Parse p = constituents[pi];
        if (p.getType().equals("-LCB-")) {
          features.add("bracketsmatch");
          break;
        }
        if (p.getLabel().startsWith(Parser.START)) {
          break;
        }
      }
    }
    if (p0Tag.equals("''")) {
      for (int pi = index - 1; pi >= 0; pi--) {
        Parse p = constituents[pi];
        if (p.getType().equals("``")) {
          features.add("quotesmatch");
          break;
        }
        if (p.getLabel().startsWith(Parser.START)) {
          break;
        }
      }
    }
    if (p0Tag.equals("'")) {
      for (int pi = index - 1; pi >= 0; pi--) {
        Parse p = constituents[pi];
        if (p.getType().equals("`")) {
          features.add("quotesmatch");
          break;
        }
        if (p.getLabel().startsWith(Parser.START)) {
          break;
        }
      }
    }
    if (p0Tag.equals(",")) {
      for (int pi = index - 1; pi >= 0; pi--) {
        Parse p = constituents[pi];
        if (p.getType().equals(",")) {
          features.add("iscomma");
          break;
        }
        if (p.getLabel().startsWith(Parser.START)) {
          break;
        }
      }
    }
    if (p0Tag.equals(".") && index == ps - 1) {
      for (int pi = index - 1; pi >= 0; pi--) {
        Parse p = constituents[pi];
        if (p.getLabel().startsWith(Parser.START)) {
          if (pi == 0) {
            features.add("endofsentence");
          }
          break;
        }
      }
    }
    return features.toArray(new String[features.size()]);
  }
}