opennlp.tools.tokenize.DefaultTokenContextGenerator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
There is a newer version: 2.5.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.tokenize;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import opennlp.tools.util.StringUtil;

/**
 * Generate events for maxent decisions for tokenization.
 */
public class DefaultTokenContextGenerator implements TokenContextGenerator {

  protected final Set inducedAbbreviations;

  /**
   * Creates a default context generator for tokenizer.
   */
  public DefaultTokenContextGenerator() {
    this(Collections.emptySet());
  }

  /**
   * Creates a default context generator for tokenizer.
   *
   * @param inducedAbbreviations the induced abbreviations
   */
  public DefaultTokenContextGenerator(Set inducedAbbreviations) {
    this.inducedAbbreviations = inducedAbbreviations;
  }

  /* (non-Javadoc)
   * @see opennlp.tools.tokenize.TokenContextGenerator#getContext(java.lang.String, int)
   */
  public String[] getContext(String sentence, int index) {
    List preds = createContext(sentence, index);
    String[] context = new String[preds.size()];
    preds.toArray(context);
    return context;
  }

  /**
   * Returns an {@link ArrayList} of features for the specified sentence string
   * at the specified index. Extensions of this class can override this method
   * to create a customized {@link TokenContextGenerator}
   *
   * @param sentence
   *          the token been analyzed
   * @param index
   *          the index of the character been analyzed
   * @return an {@link ArrayList} of features for the specified sentence string
   *         at the specified index.
   */
  protected List createContext(String sentence, int index) {
    List preds = new ArrayList<>();
    String prefix = sentence.substring(0, index);
    String suffix = sentence.substring(index);
    preds.add("p=" + prefix);
    preds.add("s=" + suffix);
    if (index > 0) {
      addCharPreds("p1", sentence.charAt(index - 1), preds);
      if (index > 1) {
        addCharPreds("p2", sentence.charAt(index - 2), preds);
        preds.add("p21=" + sentence.charAt(index - 2) + sentence.charAt(index - 1));
      }
      else {
        preds.add("p2=bok");
      }
      preds.add("p1f1=" + sentence.charAt(index - 1) + sentence.charAt(index));
    }
    else {
      preds.add("p1=bok");
    }
    addCharPreds("f1", sentence.charAt(index), preds);
    if (index + 1 < sentence.length()) {
      addCharPreds("f2", sentence.charAt(index + 1), preds);
      preds.add("f12=" + sentence.charAt(index) + sentence.charAt(index + 1));
    }
    else {
      preds.add("f2=bok");
    }
    if (sentence.charAt(0) == '&' && sentence.charAt(sentence.length() - 1) == ';') {
      preds.add("cc");//character code
    }

    if (index == sentence.length() - 1 && inducedAbbreviations.contains(sentence)) {
      preds.add("pabb");
    }

    return preds;
  }


  /**
   * Helper function for getContext.
   */
  protected void addCharPreds(String key, char c, List preds) {
    preds.add(key + "=" + c);
    if (Character.isLetter(c)) {
      preds.add(key + "_alpha");
      if (Character.isUpperCase(c)) {
        preds.add(key + "_caps");
      }
    }
    else if (Character.isDigit(c)) {
      preds.add(key + "_num");
    }
    else if (StringUtil.isWhitespace(c)) {
      preds.add(key + "_ws");
    }
    else {
      if (c == '.' || c == '?' || c == '!') {
        preds.add(key + "_eos");
      }
      else if (c == '`' || c == '"' || c == '\'') {
        preds.add(key + "_quote");
      }
      else if (c == '[' || c == '{' || c == '(') {
        preds.add(key + "_lp");
      }
      else if (c == ']' || c == '}' || c == ')') {
        preds.add(key + "_rp");
      }
    }
  }
}