edu.stanford.nlp.pipeline.Annotator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
The newest version!
package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.ling.CoreAnnotation;
import java.util.*;
/**
* This is an interface for adding annotations to a partially annotated
* Annotation. In some ways, it is just a glorified function, except
* that it explicitly operates in-place on Annotation objects. Annotators
* should be given to an AnnotationPipeline in order to make
* annotation pipelines (the whole motivation of this package), and
* therefore implementers of this interface should be designed to play
* well with other Annotators and in their javadocs they should
* explicitly state what annotations they are assuming already exist
* in the annotation (like parse, POS tag, etc), what keys they are
* expecting them under (see, for instance, the ones in CoreAnnotations),
* and what annotations they will add (or modify) and the keys
* for them as well. If you would like to look at the code for a
* relatively simple Annotator, I recommend NERAnnotator. For a lot
* of code you could just add the implements directly, but I recommend
* wrapping instead because I believe that it will help to keep the
* pipeline code more manageable.
*
* An Annotator should also provide a description of what it produces and
* a description of what it requires to have been produced by using Sets
* of requirements.
* The StanfordCoreNLP version of the AnnotationPipeline can
* enforce requirements, throwing an exception if an annotator does
* not have all of its prerequisites met. An Annotator which does not
* participate in this system can simply return Collections.emptySet()
* for both requires() and requirementsSatisfied().
*
* Properties
*
* We extensively use Properties objects to configure each Annotator.
* In particular, CoreNLP has most of its properties in an informal
* namespace with properties names like "parse.maxlen" to specify that
* a property only applies to a parser annotator. There can also be
* global properties; they should not have any periods in their names.
* Each Annotator knows its own name; we assume these don't collide badly,
* though possibly two parsers could share the "parse.*" namespace.
* An Annotator should have a constructor that simply takes a Properties
* object. At this point, the Annotator should expect to be getting
* properties in namespaces. The classes that annotators call (like
* a concrete parser, tagger, or whatever) mainly expect properties
* not in namespaces. In general the annotator should subset the
* passed in properties to keep only global properties and ones in
* its own namespace, and then strip the namespace prefix from the
* latter properties.
*
* @author Jenny Finkel
*/
public interface Annotator {
/**
* Given an Annotation, perform a task on this Annotation.
*/
void annotate(Annotation annotation);
/**
* A block of code called when this annotator unmounts from the
* {@link AnnotatorPool}.
* By default, nothing is done.
*/
default void unmount() { }
/**
* Returns a set of requirements for which tasks this annotator can
* provide. For example, the POS annotator will return "pos".
*/
Set> requirementsSatisfied();
/**
* Returns the set of tasks which this annotator requires in order
* to perform. For example, the POS annotator will return
* "tokenize", "ssplit".
*/
Set> requires();
/**
* These are annotators which StanfordCoreNLP knows how to create.
* Add new annotators and/or annotators from other groups here!
*/
String STANFORD_TOKENIZE = "tokenize";
String STANFORD_CLEAN_XML = "cleanxml";
String STANFORD_SSPLIT = "ssplit";
String STANFORD_POS = "pos";
String STANFORD_LEMMA = "lemma";
String STANFORD_NER = "ner";
String STANFORD_REGEXNER = "regexner";
String STANFORD_TOKENSREGEX = "tokensregex";
String STANFORD_ENTITY_MENTIONS = "entitymentions";
String STANFORD_GENDER = "gender";
String STANFORD_TRUECASE = "truecase";
String STANFORD_PARSE = "parse";
String STANFORD_DETERMINISTIC_COREF = "dcoref";
String STANFORD_COREF = "coref";
String STANFORD_COREF_MENTION = "coref.mention"; // TODO(jebolton) Merge with entitymention
String STANFORD_RELATION = "relation";
String STANFORD_SENTIMENT = "sentiment";
String STANFORD_COLUMN_DATA_CLASSIFIER = "cdc";
String STANFORD_DEPENDENCIES = "depparse";
String STANFORD_NATLOG = "natlog";
String STANFORD_OPENIE = "openie";
String STANFORD_QUOTE = "quote";
String STANFORD_QUOTE_ATTRIBUTION = "quote.attribution";
String STANFORD_UD_FEATURES = "udfeats";
String STANFORD_LINK = "entitylink";
String STANFORD_KBP = "kbp";
/**
* A mapping from an annotator to a its default transitive dependencies.
* Note that this is not guaranteed to be accurate, as properties set in the annotator
* can change the annotator's dependencies; but, it's a reasonable guess if you're using
* things out-of-the-box.
*/
@SuppressWarnings("ArraysAsListWithZeroOrOneArgument")
Map> DEFAULT_REQUIREMENTS = new HashMap>(){{
put(STANFORD_TOKENIZE, new LinkedHashSet<>(Arrays.asList()));
put(STANFORD_CLEAN_XML, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_SSPLIT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA)));
put(STANFORD_TOKENSREGEX, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_PARSE)));
put(STANFORD_COLUMN_DATA_CLASSIFIER, new LinkedHashSet<>(Arrays.asList()));
put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES)));
put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
}};
}