cc.mallet.extract.Extractor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */




/** 
   @author Andrew McCallum [email protected]
 */

package cc.mallet.extract;


import java.io.Serializable;
import java.util.Iterator;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.Instance;

// Analogous to base.classify.Classifier

/**
 * Common contract for components that perform information extraction.
 *
 * <p>The typical use is pulling database records (see {@link Record}) out of
 * Strings, although nothing in this interface is tied to that particular case.
 */
// TODO: Possibly, in the future, create Document and Corpus objects.
// (This would allow calling an extractor on multiple documents in a type-safe manner.)
public interface Extractor extends Serializable {

  /**
   * Runs extraction on a single raw object. The object is passed through
   * the extractor's pipe.
   *
   * @param o the document to extract from (often a String)
   * @return the results of performing extraction
   */
  Extraction extract(Object o);

  /**
   * Runs extraction on input that has already been tokenized. The spans are
   * passed through the extractor's pipe.
   *
   * @param toks a tokenized document
   * @return the results of performing extraction
   */
  Extraction extract(Tokenization toks);

  /**
   * Runs extraction on a set of raw documents. The Instances produced by
   * {@code source} are passed through both the tokenization pipe and the
   * feature extraction pipe.
   *
   * <p>The parameter is declared as a raw {@code Iterator}; as described
   * above, its elements are expected to be Instances.
   *
   * @param source a source of raw documents
   * @return the results of performing extraction
   */
  Extraction extract(Iterator source);

  /**
   * Returns the feature pipe used by this extractor. The pipe takes an
   * Instance and converts it into a form usable by the particular extraction
   * algorithm; for example, pipes often perform feature extraction. This pipe
   * expects the Instance's data field to be a Tokenization. The type of raw
   * object expected by the pipe depends on the particular subclass of
   * extractor.
   *
   * @return a pipe
   */
  Pipe getFeaturePipe();

  /**
   * Returns the pipe this extractor uses to tokenize its input. The kind of
   * Instance this pipe expects is specific to the individual extractor. The
   * pipe returns an Instance whose data is a Tokenization.
   *
   * @return a pipe
   */
  Pipe getTokenizationPipe();

  /**
   * Sets the pipe this extractor uses for tokenization. The pipe should take
   * a raw object and convert it into a Tokenization.
   *
   * <p>The pipe {@code CharSequence2TokenSequence} is an example of a pipe
   * that could be used here.
   *
   * @param pipe the tokenization pipe to use
   */
  void setTokenizationPipe(Pipe pipe);

  /**
   * Returns the alphabet of the features used by the extractor. The alphabet
   * maps strings describing the features to indices.
   *
   * @return the input alphabet
   */
  Alphabet getInputAlphabet();

  /**
   * Returns the alphabet of the labels used by the extractor. Labels include
   * entity types (such as PERSON) and slot names (such as EMPLOYEE-OF).
   *
   * @return the target alphabet
   */
  LabelAlphabet getTargetAlphabet();
}