All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.ling.CoreAnnotations Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.ling;

import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.util.*;

import java.util.Calendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;

/**
 * 

 * Set of common annotations for {@link CoreMap}s. The classes
 * defined here are typesafe keys for getting and setting annotation
 * values. These classes need not be instantiated outside of this
 * class. For example, {@link TextAnnotation}.class serves as the key and a
 * {@code String} serves as the value containing the
 * corresponding word.
 *

* *

 * <p>
 * New types of {@link CoreAnnotation} can be defined anywhere that is
 * convenient in the source tree - they are just classes. This file exists to
 * hold widely used "core" annotations and others inherited from the
 * {@link Label} family. In general, most keys should be placed in this file as
 * they may often be reused throughout the code. This architecture allows for
 * flexibility, but in many ways it should be considered as equivalent to an
 * enum in which everything should be defined.
 *

* *

 * <p>
 * The getType method required by CoreAnnotation must return the same class type
 * as its value type parameter. It feels like one should be able to get away
 * without that method, but because Java erases the generic type signature, that
 * info disappears at runtime. See {@link ValueAnnotation} for an example.
 *

* * @author dramage * @author rafferty * @author bethard */ public class CoreAnnotations { private CoreAnnotations() { } // only static members /** * The CoreMap key identifying the annotation's text. * * Note that this key is intended to be used with many different kinds of * annotations - documents, sentences and tokens all have their own text. */ public static class TextAnnotation implements CoreAnnotation { @Override public Class getType() { return String.class; } } /** * The CoreMap key for getting the lemma (morphological stem) of a token. * * This key is typically set on token annotations. * * TODO: merge with StemAnnotation? */ public static class LemmaAnnotation implements CoreAnnotation { @Override public Class getType() { return String.class; } } /** * The CoreMap key for getting the Penn part of speech of a token. * * This key is typically set on token annotations. */ public static class PartOfSpeechAnnotation implements CoreAnnotation { @Override public Class getType() { return String.class; } } /** * The CoreMap key for getting the token-level named entity tag (e.g., DATE, * PERSON, etc.) * * This key is typically set on token annotations. */ public static class NamedEntityTagAnnotation implements CoreAnnotation { @Override public Class getType() { return String.class; } } /** * The CoreMap key for getting the token-level named entity tag (e.g., DATE, * PERSON, etc.) from a previous NER tagger. NERFeatureFactory is sensitive to * this tag and will turn the annotations from the previous NER tagger into * new features. This is currently used to implement one level of stacking -- * we may later change it to take a list as needed. * * This key is typically set on token annotations. 
*/
  public static class StackedNamedEntityTagAnnotation implements CoreAnnotation {
    // Value type reported for this key: String.
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * The CoreMap key for getting the token-level true case annotation (e.g.,
   * INIT_UPPER)
   *
   * This key is typically set on token annotations.
   * The value type reported by getType() is String.
   */
  public static class TrueCaseAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * The CoreMap key identifying the annotation's true-cased text.
   *
   * Note that this key is intended to be used with many different kinds of
   * annotations - documents, sentences and tokens all have their own text.
   * The value type reported by getType() is String.
   */
  public static class TrueCaseTextAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * The CoreMap key for getting the tokens contained by an annotation.
   *
   * This key should be set for any annotation that contains tokens. It can be
   * done without much memory overhead using List.subList.
   *
   * NOTE(review): the generic type arguments on this declaration appear to
   * have been lost (a stray {@code >} follows CoreAnnotation and Class); the
   * List element type cannot be determined from this file - restore it from
   * the upstream source before compiling.
   */
  public static class TokensAnnotation implements CoreAnnotation> {
    @Override
    public Class> getType() {
      // The raw List class literal is passed through ErasureUtils.uncheckedCast,
      // presumably to match the parameterized return type.
      return ErasureUtils.uncheckedCast(List.class);
    }
  }

  /**
   * The CoreMap key for getting the tokens (can be words, phrases or anything that are of type CoreMap) contained by an annotation.
   *
   * This key should be set for any annotation that contains tokens (words, phrases etc). It can be
   * done without much memory overhead using List.subList.
   *
   * NOTE(review): generic type arguments appear to have been lost here as well
   * (stray {@code >}); the comment above suggests the elements are CoreMaps -
   * confirm against the upstream source.
   */
  public static class GenericTokensAnnotation implements CoreAnnotation> {
    @Override
    public Class> getType() {
      // Raw List class literal, cast via ErasureUtils.uncheckedCast.
      return ErasureUtils.uncheckedCast(List.class);
    }
  }

  /**
   * The CoreMap key for getting the sentences contained in an annotation.
   * The sentences are represented as a {@code List}.
   * Each sentence might typically have annotations such as {@code TextAnnotation},
   * {@code TokensAnnotation}, {@code SentenceIndexAnnotation}, and {@code BasicDependenciesAnnotation}.
   *
   * This key is typically set only on document annotations.
*/
  public static class SentencesAnnotation implements CoreAnnotation> {
    // NOTE(review): the type arguments on CoreAnnotation/Class appear to have
    // been stripped (stray '>'); the List element type must be restored from
    // the upstream source before this compiles.
    @Override
    public Class> getType() {
      // Raw List class literal, cast via ErasureUtils.uncheckedCast.
      return ErasureUtils.uncheckedCast(List.class);
    }
  }

  /**
   * The CoreMap key for getting the quotations contained by an annotation.
   *
   * This key is typically set only on document annotations.
   *
   * NOTE(review): generic type arguments appear to have been lost (stray
   * {@code >}); the List element type is not recoverable from this file.
   */
  public static class QuotationsAnnotation implements CoreAnnotation> {
    @Override
    public Class> getType() {
      // Raw List class literal, cast via ErasureUtils.uncheckedCast.
      return ErasureUtils.uncheckedCast(List.class);
    }
  }

  /**
   * Unique identifier within a document for a given quotation.
   * The value type reported by getType() is Integer.
   */
  public static class QuotationIndexAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return Integer.class;
    }
  }

  /**
   * The index of the sentence that this annotation begins in.
   * The value type reported by getType() is Integer.
   */
  public static class SentenceBeginAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return Integer.class;
    }
  }

  /**
   * The index of the sentence that this annotation ends in.
   * The value type reported by getType() is Integer.
   *
   * NOTE(review): the original comment read "begins in", identical to
   * SentenceBeginAnnotation - presumably a copy-paste slip. Confirm with the
   * callers whether the end index is inclusive.
   */
  public static class SentenceEndAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return Integer.class;
    }
  }

  /**
   * The CoreMap key for getting the paragraphs contained by an annotation.
   *
   * This key is typically set only on document annotations.
   *
   * NOTE(review): generic type arguments appear to have been lost (stray
   * {@code >}); the List element type is not recoverable from this file.
   */
  public static class ParagraphsAnnotation implements CoreAnnotation> {
    @Override
    public Class> getType() {
      // Raw List class literal, cast via ErasureUtils.uncheckedCast.
      return ErasureUtils.uncheckedCast(List.class);
    }
  }

  /**
   * The CoreMap key identifying the first token included in an annotation. The
   * token with index 0 is the first token in the document.
   *
   * This key should be set for any annotation that contains tokens.
   * The value type reported by getType() is Integer.
   */
  public static class TokenBeginAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return Integer.class;
    }
  }

  /**
   * The CoreMap key identifying the last token after the end of an annotation.
   * The token with index 0 is the first token in the document.
   *
   * This key should be set for any annotation that contains tokens.
*/
  public static class TokenEndAnnotation implements CoreAnnotation<Integer> {
    /** @return {@code Integer.class}, the value type stored under this key */
    @Override
    public Class<Integer> getType() {
      return Integer.class;
    }
  }

  /**
   * The CoreMap key identifying the date and time associated with an
   * annotation.
   *
   * This key is typically set on document annotations.
   */
  public static class CalendarAnnotation implements CoreAnnotation<Calendar> {
    /** @return {@code Calendar.class}, the value type stored under this key */
    @Override
    public Class<Calendar> getType() {
      return Calendar.class;
    }
  }

  /*
   * These are the keys hashed on by IndexedWord
   */

  /**
   * This refers to the unique identifier for a "document", where document may
   * vary based on your application.
   */
  public static class DocIDAnnotation implements CoreAnnotation<String> {
    /** @return {@code String.class}, the value type stored under this key */
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /**
   * This indexes a token number inside a sentence. Standardly, tokens are
   * indexed within a sentence starting at 1 (not 0: we follow common parlance
   * whereby we speak of the first word of a sentence).
   * This is generally an individual word or feature index - it is local, and
   * may not be uniquely identifying without other identifiers such as sentence
   * and doc. However, if these are the same, the index annotation should be a
   * unique identifier for differentiating objects.
   */
  public static class IndexAnnotation implements CoreAnnotation<Integer> {
    /** @return {@code Integer.class}, the value type stored under this key */
    @Override
    public Class<Integer> getType() {
      return Integer.class;
    }
  }

  /**
   * This indexes the beginning of a span of words, e.g., a constituent in a
   * tree. See {@link edu.stanford.nlp.trees.Tree#indexSpans(int)}.
   * This annotation counts tokens.
   * It standardly indexes from 1 (like IndexAnnotation). The reasons for
   * this are: (i) Talking about the first word of a sentence is kind of
   * natural, and (ii) We use index 0 to refer to an imaginary root in
   * dependency output.
   */
  public static class BeginIndexAnnotation implements CoreAnnotation<Integer> {
    /** @return {@code Integer.class}, the value type stored under this key */
    @Override
    public Class<Integer> getType() {
      return Integer.class;
    }
  }

  /**
   * This indexes the end of a span of words, e.g., a constituent in a
   * tree. See {@link edu.stanford.nlp.trees.Tree#indexSpans(int)}.
This annotation
   * counts tokens. It standardly indexes from 1 (like IndexAnnotation).
   * The end index is not a fencepost: its value is equal to the
   * IndexAnnotation of the last word in the span.
   */
  public static class EndIndexAnnotation implements CoreAnnotation<Integer> {
    /** @return {@code Integer.class}, the value type stored under this key */
    @Override
    public Class<Integer> getType() {
      return Integer.class;
    }
  }

  /**
   * This indicates that starting at this token, the sentence should not be ended until
   * we see a ForcedSentenceEndAnnotation. Used to force the ssplit annotator
   * (eg the WordToSentenceProcessor) to keep tokens in the same sentence
   * until ForcedSentenceEndAnnotation is seen.
   */
  public static class ForcedSentenceUntilEndAnnotation implements CoreAnnotation<Boolean> {
    /** @return {@code Boolean.class}, the value type stored under this key */
    @Override
    public Class<Boolean> getType() {
      return Boolean.class;
    }
  }

  /**
   * This indicates the sentence should end at this token. Used to
   * force the ssplit annotator (eg the WordToSentenceProcessor) to
   * start a new sentence at the next token.
   */
  public static class ForcedSentenceEndAnnotation implements CoreAnnotation<Boolean> {
    /** @return {@code Boolean.class}, the value type stored under this key */
    @Override
    public Class<Boolean> getType() {
      return Boolean.class;
    }
  }

  /**
   * Unique identifier within a document for a given sentence.
   */
  public static class SentenceIndexAnnotation implements CoreAnnotation<Integer> {
    /** @return {@code Integer.class}, the value type stored under this key */
    @Override
    public Class<Integer> getType() {
      return Integer.class;
    }
  }

  /**
   * Line number for a sentence in a document delimited by newlines
   * instead of punctuation. May skip numbers if there are blank
   * lines not represented as sentences. Indexed from 1 rather than 0.
   */
  public static class LineNumberAnnotation implements CoreAnnotation<Integer> {
    /** @return {@code Integer.class}, the value type stored under this key */
    @Override
    public Class<Integer> getType() {
      return Integer.class;
    }
  }

  /**
   * Contains the "value" - an ill-defined string used widely in MapLabel.
   */
  public static class ValueAnnotation implements CoreAnnotation<String> {
    /** @return {@code String.class}, the value type stored under this key */
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /**
   * Key whose value is a String, as reported by getType().
   *
   * NOTE(review): the original declaration carried no description of what the
   * category denotes - document it once confirmed with the callers.
   */
  public static class CategoryAnnotation implements CoreAnnotation<String> {
    /** @return {@code String.class}, the value type stored under this key */
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /**
   * The exact original surface form of a token.
This is created in the
   * invertible PTBTokenizer. The tokenizer may normalize the token form to
   * match what appears in the PTB, but this key will hold the original characters.
   */
  public static class OriginalTextAnnotation implements CoreAnnotation {
    // Value type reported for this key: String.
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * Annotation for the whitespace characters appearing before this word. This
   * can be filled in by the tokenizer so that the original text string can be
   * reconstructed. The value type reported by getType() is String.
   */
  public static class BeforeAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * Annotation for the whitespace characters appearing after this word. This can
   * be filled in by the tokenizer so that the original text string can be
   * reconstructed. The value type reported by getType() is String.
   */
  public static class AfterAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * CoNLL dep parsing - coarser POS tags.
   * The value type reported by getType() is String.
   */
  public static class CoarseTagAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * CoNLL dep parsing - the dependency type
   *
   * NOTE(review): this description is identical to the one on
   * CoNLLDepTypeAnnotation, yet getType() here returns CoreMap.class rather
   * than String.class - the comment may be a copy-paste; confirm what the
   * CoreMap value holds.
   */
  public static class CoNLLDepAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return CoreMap.class;
    }
  }

  /**
   * CoNLL SRL/dep parsing - whether the word is a predicate
   * The value type reported by getType() is Boolean.
   */
  public static class CoNLLPredicateAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return Boolean.class;
    }
  }

  /**
   * CoNLL SRL/dep parsing - map which, for the current word, specifies its
   * specific role for each predicate
   *
   * NOTE(review): generic type arguments appear to have been lost (stray
   * {@code >}); the Map key/value types are not recoverable from this file.
   */
  public static class CoNLLSRLAnnotation implements CoreAnnotation> {
    @Override
    public Class> getType() {
      // Raw Map class literal, cast via ErasureUtils.uncheckedCast.
      return ErasureUtils.uncheckedCast(Map.class);
    }
  }

  /**
   * CoNLL dep parsing - the dependency type
   * The value type reported by getType() is String.
   */
  public static class CoNLLDepTypeAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * CoNLL-U dep parsing - span of multiword tokens
   *
   * NOTE(review): getType() casts Pair.class, so the value is presumably a
   * Pair or a Pair subtype; the declared type argument is not visible here -
   * confirm against the upstream source.
   */
  public static class CoNLLUTokenSpanAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      // Pair class literal, cast via ErasureUtils.uncheckedCast.
      return ErasureUtils.uncheckedCast(Pair.class);
    }
  }

  /**
   * CoNLL-U dep parsing - List of secondary dependencies
   *
   * NOTE(review): generic type arguments appear to have been lost (stray
   * {@code >}). Also, getType() casts Pair.class - the same literal used by
   * CoNLLUTokenSpanAnnotation - although the description speaks of a list;
   * this looks like a possible copy-paste slip. Confirm the intended value type.
   */
  public static class CoNLLUSecondaryDepsAnnotation implements CoreAnnotation> {
    @Override
    public Class> getType() {
      return ErasureUtils.uncheckedCast(Pair.class);
    }
  }

  /**
   * CoNLL-U dep parsing - List of morphological features
   *
   * NOTE(review): generic type arguments appear to have been lost (stray
   * {@code >}); getType() casts HashMap.class, so the value is a HashMap whose
   * key/value types are not recoverable from this file.
   */
  public static class CoNLLUFeats implements CoreAnnotation> {
    @Override
    public Class> getType() {
      // Raw HashMap class literal, cast via ErasureUtils.uncheckedCast.
      return ErasureUtils.uncheckedCast(HashMap.class);
    }
  }

  /**
   * CoNLL-U dep parsing - Any other annotation
   * The value type reported by getType() is String.
   */
  public static class CoNLLUMisc implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * CoNLL dep parsing - the index of the word which is the parent of this word
   * in the dependency tree
   * The value type reported by getType() is Integer.
   */
  public static class CoNLLDepParentIndexAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return Integer.class;
    }
  }

  /**
   * Inverse document frequency of the word this label represents
   * The value type reported by getType() is Double.
   */
  public static class IDFAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return Double.class;
    }
  }

  /**
   * The standard key for a propbank label which is of type Argument
   * The value type reported by getType() is String.
   */
  public static class ArgumentAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * Another key used for propbank - to signify core arg nodes or predicate
   * nodes
   * The value type reported by getType() is String.
   */
  public static class MarkingAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * The standard key for Semantic Head Word which is a String
   */
  public static class SemanticHeadWordAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * The standard key for Semantic Head Word POS which is a String
   */
  public static class SemanticHeadTagAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * Propbank key for the Verb sense given in the Propbank Annotation, should
   * only be in the verbnode
   * The value type reported by getType() is String.
   */
  public static class VerbSenseAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * The standard key for storing category with functional tags.
   * The value type reported by getType() is String.
   */
  public static class CategoryFunctionalTagAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * This is an NER ID annotation (in case the all caps parsing didn't work out
   * for you...)
   * The value type reported by getType() is String.
   */
  public static class NERIDAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * The key for the normalized value of numeric named entities.
   * The value type reported by getType() is String.
   */
  public static class NormalizedNamedEntityTagAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * Value type of {@link SRLIDAnnotation}.
   *
   * NOTE(review): the meaning of the individual constants is not documented in
   * this file.
   */
  public enum SRL_ID {
    ARG, NO, ALL_NO, REL
  }

  /**
   * The key for semantic role labels (Note: please add to this description if
   * you use this key)
   * The value type reported by getType() is {@link SRL_ID}.
   */
  public static class SRLIDAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return SRL_ID.class;
    }
  }

  /**
   * The standard key for the "shape" of a word: a String representing the type
   * of characters in a word, such as "Xx" for a capitalized word. See
   * {@link edu.stanford.nlp.process.WordShapeClassifier} for functions for
   * making shape strings.
*/
  public static class ShapeAnnotation implements CoreAnnotation<String> {
    /** @return {@code String.class}, the value type stored under this key */
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /**
   * The Standard key for storing the left terminal number relative to the root
   * of the tree of the leftmost terminal dominated by the current node
   */
  public static class LeftTermAnnotation implements CoreAnnotation<Integer> {
    /** @return {@code Integer.class}, the value type stored under this key */
    @Override
    public Class<Integer> getType() {
      return Integer.class;
    }
  }

  /**
   * The standard key for the parent which is a String
   */
  public static class ParentAnnotation implements CoreAnnotation<String> {
    /** @return {@code String.class}, the value type stored under this key */
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /**
   * Key whose value is a String, as reported by getType().
   *
   * NOTE(review): the original declaration carried no description of what
   * "IN" denotes - document it once confirmed with the callers.
   */
  public static class INAnnotation implements CoreAnnotation<String> {
    /** @return {@code String.class}, the value type stored under this key */
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /**
   * The standard key for span which is an IntPair
   */
  public static class SpanAnnotation implements CoreAnnotation<IntPair> {
    /** @return {@code IntPair.class}, the value type stored under this key */
    @Override
    public Class<IntPair> getType() {
      return IntPair.class;
    }
  }

  /**
   * The standard key for the answer which is a String
   */
  public static class AnswerAnnotation implements CoreAnnotation<String> {
    /** @return {@code String.class}, the value type stored under this key */
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /**
   * The standard key for gold answer which is a String
   */
  public static class GoldAnswerAnnotation implements CoreAnnotation<String> {
    /** @return {@code String.class}, the value type stored under this key */
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /**
   * The standard key for the features.
   *
   * NOTE(review): the original comment described the value as a Collection,
   * but getType() returns String.class; the declared type argument here
   * follows the code. Confirm which of the two was intended.
   */
  public static class FeaturesAnnotation implements CoreAnnotation<String> {
    /** @return {@code String.class}, the value type stored under this key */
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /**
   * The standard key for the semantic interpretation
   */
  public static class InterpretationAnnotation implements CoreAnnotation<String> {
    /** @return {@code String.class}, the value type stored under this key */
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /**
   * The standard key for the semantic role label of a phrase.
*/
  public static class RoleAnnotation implements CoreAnnotation {
    // Value type reported for this key: String.
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * The standard key for the gazetteer information
   *
   * NOTE(review): generic type arguments appear to have been lost (stray
   * {@code >}); the List element type is not recoverable from this file.
   */
  public static class GazetteerAnnotation implements CoreAnnotation> {
    @Override
    public Class> getType() {
      // Raw List class literal, cast via ErasureUtils.uncheckedCast.
      return ErasureUtils.uncheckedCast(List.class);
    }
  }

  /**
   * Morphological stem of the word this label represents
   * The value type reported by getType() is String.
   */
  public static class StemAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * Key whose value type, as reported by getType(), is String.
   * NOTE(review): undocumented in the original; meaning of the polarity value
   * is not shown in this file.
   */
  public static class PolarityAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * Key whose value type, as reported by getType(), is String.
   * NOTE(review): undocumented in the original; presumably morphological
   * number - confirm with the callers.
   */
  public static class MorphoNumAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * Key whose value type, as reported by getType(), is String.
   * NOTE(review): undocumented in the original; presumably morphological
   * person - confirm with the callers.
   */
  public static class MorphoPersAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * Key whose value type, as reported by getType(), is String.
   * NOTE(review): undocumented in the original; presumably morphological
   * gender - confirm with the callers.
   */
  public static class MorphoGenAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * Key whose value type, as reported by getType(), is String.
   * NOTE(review): undocumented in the original; presumably morphological
   * case - confirm with the callers.
   */
  public static class MorphoCaseAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * For Chinese: character level information, segmentation. Used for representing
   * a single character as a token.
   * The value type reported by getType() is String.
   */
  public static class ChineseCharAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /** For Chinese: the segmentation info existing in the original text. */
  public static class ChineseOrigSegAnnotation implements CoreAnnotation {
    // Value type reported for this key: String.
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /** For Chinese: the segmentation information from the segmenter.
   * Either a "1" for a new word starting at this position or a "0".
*/
  public static class ChineseSegAnnotation implements CoreAnnotation<String> {
    /** @return {@code String.class}, the value type stored under this key */
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /**
   * Not sure exactly what this is, but it is different from
   * ChineseSegAnnotation and seems to indicate if the text is segmented
   */
  public static class ChineseIsSegmentedAnnotation implements CoreAnnotation<Boolean> {
    /** @return {@code Boolean.class}, the value type stored under this key */
    @Override
    public Class<Boolean> getType() {
      return Boolean.class;
    }
  }

  /**
   * for Arabic: character level information, segmentation
   */
  public static class ArabicCharAnnotation implements CoreAnnotation<String> {
    /** @return {@code String.class}, the value type stored under this key */
    @Override  // added for consistency with every other key in this file
    public Class<String> getType() {
      return String.class;
    }
  }

  /** For Arabic: the segmentation information from the segmenter. */
  public static class ArabicSegAnnotation implements CoreAnnotation<String> {
    /** @return {@code String.class}, the value type stored under this key */
    @Override  // added for consistency with every other key in this file
    public Class<String> getType() {
      return String.class;
    }
  }

  /**
   * The CoreMap key identifying the offset of the first character of an
   * annotation. The character with index 0 is the first character in the
   * document.
   *
   * This key should be set for any annotation that represents a span of text.
   */
  public static class CharacterOffsetBeginAnnotation implements CoreAnnotation<Integer> {
    /** @return {@code Integer.class}, the value type stored under this key */
    @Override
    public Class<Integer> getType() {
      return Integer.class;
    }
  }

  /**
   * The CoreMap key identifying the offset of the last character after the end
   * of an annotation. The character with index 0 is the first character in the
   * document.
   *
   * This key should be set for any annotation that represents a span of text.
*/
  public static class CharacterOffsetEndAnnotation implements CoreAnnotation {
    // Value type reported for this key: Integer.
    @Override
    public Class getType() {
      return Integer.class;
    }
  }

  /**
   * Key for relative value of a word - used in RTE
   * The value type reported by getType() is Double.
   */
  public static class CostMagnificationAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return Double.class;
    }
  }

  /**
   * Key whose value type, as reported by getType(), is String.
   * NOTE(review): undocumented in the original; the format of the word sense
   * string is not shown in this file.
   */
  public static class WordSenseAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return String.class;
    }
  }

  /**
   * Key whose value is some nested List structure (getType() casts List.class).
   *
   * NOTE(review): undocumented in the original, and the generic type arguments
   * appear to have been lost (stray {@code >>>}); the nested element types are
   * not recoverable from this file - restore them from the upstream source.
   */
  public static class SRLInstancesAnnotation implements CoreAnnotation>>> {
    @Override
    public Class>>> getType() {
      // Raw List class literal, cast via ErasureUtils.uncheckedCast.
      return ErasureUtils.uncheckedCast(List.class);
    }
  }

  /**
   * Used by RTE to track number of text sentences, to determine when hyp
   * sentences begin.
   * The value type reported by getType() is Integer.
   */
  public static class NumTxtSentencesAnnotation implements CoreAnnotation {
    @Override
    public Class getType() {
      return Integer.class;
    }
  }

  /**
   * Used in Trees
   */
  public static class TagLabelAnnotation implements CoreAnnotation