edu.stanford.nlp.pipeline.POSTaggerAnnotator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.*;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.concurrent.MulticoreWrapper;
import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor;
/**
* Wrapper for the maxent part of speech tagger.
*
* @author Anna Rafferty
*/
public class POSTaggerAnnotator implements Annotator {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(POSTaggerAnnotator.class);
private final MaxentTagger pos;
private final int maxSentenceLength;
private final int nThreads;
private final boolean reuseTags;
/** Create a tagger annotator using the default English tagger from the models jar
* (and non-verbose initialization).
*/
public POSTaggerAnnotator() {
this(false);
}
public POSTaggerAnnotator(boolean verbose) {
this(System.getProperty("pos.model", MaxentTagger.DEFAULT_JAR_PATH), verbose);
}
public POSTaggerAnnotator(String posLoc, boolean verbose) {
this(posLoc, verbose, Integer.MAX_VALUE, 1);
}
/** Create a POS tagger annotator.
*
* @param posLoc Location of POS tagger model (may be file path, classpath resource, or URL
* @param verbose Whether to show verbose information on model loading
* @param maxSentenceLength Sentences longer than this length will be skipped in processing
* @param numThreads The number of threads for the POS tagger annotator to use
*/
public POSTaggerAnnotator(String posLoc, boolean verbose, int maxSentenceLength, int numThreads) {
this(loadModel(posLoc, verbose), maxSentenceLength, numThreads);
}
public POSTaggerAnnotator(MaxentTagger model) {
this(model, Integer.MAX_VALUE, 1);
}
public POSTaggerAnnotator(MaxentTagger model, int maxSentenceLength, int numThreads) {
this.pos = model;
this.maxSentenceLength = maxSentenceLength;
this.nThreads = numThreads;
this.reuseTags = false;
}
public POSTaggerAnnotator(String annotatorName, Properties props) {
String posLoc = props.getProperty(annotatorName + ".model");
if (posLoc == null) {
posLoc = DefaultPaths.DEFAULT_POS_MODEL;
}
boolean verbose = PropertiesUtils.getBool(props, annotatorName + ".verbose", false);
this.pos = loadModel(posLoc, verbose);
this.maxSentenceLength = PropertiesUtils.getInt(props, annotatorName + ".maxlen", Integer.MAX_VALUE);
this.nThreads = PropertiesUtils.getInt(props, annotatorName + ".nthreads", PropertiesUtils.getInt(props, "nthreads", 1));
this.reuseTags = PropertiesUtils.getBool(props, annotatorName + ".reuseTags", false);
}
private static MaxentTagger loadModel(String loc, boolean verbose) {
Timing timer = null;
if (verbose) {
timer = new Timing();
timer.doing("Loading POS Model [" + loc + ']');
}
MaxentTagger tagger = new MaxentTagger(loc);
if (verbose) {
timer.done();
}
return tagger;
}
@Override
public void annotate(Annotation annotation) {
// turn the annotation into a sentence
if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
if (nThreads == 1) {
for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
doOneSentence(sentence);
}
} else {
MulticoreWrapper wrapper = new MulticoreWrapper<>(nThreads, new POSTaggerProcessor());
for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
wrapper.put(sentence);
while (wrapper.peek()) {
wrapper.poll();
}
}
wrapper.join();
while (wrapper.peek()) {
wrapper.poll();
}
}
} else {
throw new RuntimeException("unable to find words/tokens in: " + annotation);
}
}
private class POSTaggerProcessor implements ThreadsafeProcessor {
@Override
public CoreMap process(CoreMap sentence) {
return doOneSentence(sentence);
}
@Override
public ThreadsafeProcessor newInstance() {
return this;
}
}
private CoreMap doOneSentence(CoreMap sentence) {
List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
List tagged = null;
if (tokens.size() <= maxSentenceLength) {
try {
tagged = pos.tagSentence(tokens, this.reuseTags);
} catch (OutOfMemoryError e) {
log.info("WARNING: Tagging of sentence ran out of memory. " +
"Will ignore and continue: " +
SentenceUtils.listToString(tokens));
}
}
if (tagged != null) {
for (int i = 0, sz = tokens.size(); i < sz; i++) {
tokens.get(i).set(CoreAnnotations.PartOfSpeechAnnotation.class, tagged.get(i).tag());
}
} else {
for (CoreLabel token : tokens) {
token.set(CoreAnnotations.PartOfSpeechAnnotation.class, "X");
}
}
return sentence;
}
@Override
public Set> requires() {
return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
CoreAnnotations.TextAnnotation.class,
CoreAnnotations.TokensAnnotation.class,
CoreAnnotations.CharacterOffsetBeginAnnotation.class,
CoreAnnotations.CharacterOffsetEndAnnotation.class,
CoreAnnotations.SentencesAnnotation.class
)));
}
@Override
public Set> requirementsSatisfied() {
return Collections.singleton(CoreAnnotations.PartOfSpeechAnnotation.class);
}
}