edu.stanford.nlp.sequences.FeatureFactory Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7

Show newest version

package edu.stanford.nlp.sequences;

import java.util.*;
import java.io.Serializable;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.PaddedList;


/**
 * This is the abstract class that all feature factories must
 * subclass.  It also defines most of the basic {@link Clique}s
 * that you would want to make features over.  It contains a
 * convenient method, getCliques(maxLeft, maxRight) which will give
 * you all the cliques within the specified limits.
 *
 * @param <IN> The type of the items in the PaddedList from which features
 *     are extracted
 *
 * @author Jenny Finkel
 */
public abstract class FeatureFactory<IN> implements Serializable {

  private static final long serialVersionUID = 7249250071983091694L;

  /** Configuration for this factory; stays {@code null} until {@link #init} is called. */
  protected SeqClassifierFlags flags;

  public FeatureFactory() {}

  /**
   * Stores the flags that the other methods of this factory consult.
   * {@link #getCliques()} and {@link #getWord} dereference the flags, so
   * this must be called before either of them.
   *
   * @param flags The flags to use for feature extraction
   */
  public void init (SeqClassifierFlags flags) {
    this.flags = flags;
  }

  // The predefined cliques. Each one is built from an array of int offsets;
  // in the names, "C" appears to denote offset 0, "CpN" offset -N and "Cn"
  // offset +1 (inferred from the arrays below).
  public static final Clique cliqueC = Clique.valueOf(new int[] {0});
  public static final Clique cliqueCpC = Clique.valueOf(new int[] {-1, 0});
  public static final Clique cliqueCp2C = Clique.valueOf(new int[] {-2, 0});
  public static final Clique cliqueCp3C = Clique.valueOf(new int[] {-3, 0});
  public static final Clique cliqueCp4C = Clique.valueOf(new int[] {-4, 0});
  public static final Clique cliqueCp5C = Clique.valueOf(new int[] {-5, 0});
  public static final Clique cliqueCpCp2C = Clique.valueOf(new int[] {-2, -1, 0});
  public static final Clique cliqueCpCp2Cp3C = Clique.valueOf(new int[] {-3, -2, -1, 0});
  public static final Clique cliqueCpCp2Cp3Cp4C = Clique.valueOf(new int[] {-4, -3, -2, -1, 0});
  public static final Clique cliqueCpCp2Cp3Cp4Cp5C = Clique.valueOf(new int[] {-5, -4, -3, -2, -1, 0});
  public static final Clique cliqueCnC = Clique.valueOf(new int[] {0, 1});
  public static final Clique cliqueCpCnC = Clique.valueOf(new int[] {-1, 0, 1});

  /** Every predefined clique above, in declaration order. */
  public static final List<Clique> knownCliques = Arrays.asList(cliqueC, cliqueCpC, cliqueCp2C, cliqueCp3C, cliqueCp4C, cliqueCp5C, cliqueCpCp2C, cliqueCpCp2Cp3C, cliqueCpCp2Cp3Cp4C, cliqueCpCp2Cp3Cp4Cp5C, cliqueCnC, cliqueCpCnC);

  /**
   * Returns the known cliques that fit within the limits
   * {@code flags.maxLeft} and {@code flags.maxRight}.
   * Requires that {@link #init} has been called.
   *
   * @return A new list of the cliques within the configured limits
   */
  public List<Clique> getCliques() {
    return getCliques(flags.maxLeft, flags.maxRight);
  }

  /**
   * Returns the members of {@link #knownCliques} that fit within the
   * given limits, in the order in which they appear in that list.
   *
   * @param maxLeft The limit compared against the negated
   *     {@code maxLeft()} of each clique
   * @param maxRight The limit compared against the {@code maxRight()}
   *     of each clique
   * @return A new, mutable list of the cliques within the limits
   */
  public static List<Clique> getCliques(int maxLeft, int maxRight) {
    List<Clique> cliques = new ArrayList<>();
    for (Clique c : knownCliques) {
      // c.maxLeft() is negated before the comparison -- presumably it
      // reports the leftmost offset as a non-positive number.
      if (-c.maxLeft() <= maxLeft && c.maxRight() <= maxRight) {
        cliques.add(c);
      }
    }
    return cliques;
  }

  /**
   * This method returns a {@link Collection} of the features
   * calculated for the word at the specified position in info (the list of
   * words) for the specified {@link Clique}.
   * It should return the actual String features, NOT wrapped in any
   * other object, as the wrapping
   * will be done automatically.
   * Because it takes a {@link PaddedList} you don't
   * need to worry about indices which are outside of the list.
   *
   * @param info A PaddedList of the feature-value pairs
   * @param position The current position to extract features at
   * @param clique The particular clique for which to extract features. It
   *     should be a member of the knownCliques list.
   * @return A {@link Collection} of the features
   *     calculated for the word at the specified position in info.
   */
  public abstract Collection<String> getCliqueFeatures(PaddedList<IN> info, int position, Clique clique);


  /** Makes more complete feature names out of partial feature names, by
   *  adding a suffix to the String feature name, adding results to an
   *  accumulator.  When the suffix is null or empty, the features are
   *  added unchanged; otherwise each feature gets {@code '|' + suffix}
   *  appended.  Despite the method name, no interning is performed.
   *
   * @param accumulator The output features are added here
   * @param addend The base set of features
   * @param suffix The suffix added to each feature in the addend set
   */
  @SuppressWarnings({"MethodMayBeStatic"})
  protected void addAllInterningAndSuffixing(Collection<String> accumulator, Collection<String> addend, String suffix) {
    boolean nonNullSuffix = suffix != null && ! suffix.isEmpty();
    if (nonNullSuffix) {
      // Build the separator-prefixed suffix once, outside the loop.
      suffix = '|' + suffix;
    }
    for (String feat : addend) {
      if (nonNullSuffix) {
        feat = feat.concat(suffix);
      }
      accumulator.add(feat);
    }
  }

  /**
   * Convenience methods for subclasses which use CoreLabel.  Gets the
   * word after applying any wordFunction present in the
   * SeqClassifierFlags.  Requires that {@link #init} has been called.
   *
   * @param label A CoreLabel
   * @return The TextAnnotation of the label, perhaps after passing it through
   *     a function (flags.wordFunction)
   */
  protected String getWord(CoreLabel label) {
    String word = label.getString(CoreAnnotations.TextAnnotation.class);
    if (flags.wordFunction != null) {
      word = flags.wordFunction.apply(word);
    }
    return word;
  }

}