edu.stanford.nlp.process.WordToSentenceProcessor Maven / Gradle / Ivy
package edu.stanford.nlp.process;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
// todo [cdm Feb 2012]: Rewrite the Set's as List's since while conceptually
// sets, we just don't need to be hashing things here!
/**
* Transforms a Document of Words into a Document of Sentences by grouping the
* Words. The word stream is assumed to already be adequately tokenized,
* and this class just divides the list into sentences, perhaps discarding
* some separator tokens based on the setting of the following three sets:
*
* - sentenceBoundaryTokens are tokens that are left in a sentence, but are
* to be regarded as ending a sentence. A canonical example is a period.
* If two of these follow each other, the second will be a sentence
* consisting of only the sentenceBoundaryToken.
*
- sentenceBoundaryFollowers are tokens that are left in a sentence, and
* which can follow a sentenceBoundaryToken while still belonging to
* the previous sentence. They cannot begin a sentence (except at the
* beginning of a document). A canonical example is a close parenthesis
* ')'.
*
- sentenceBoundaryToDiscard are tokens which separate sentences and
* which should be thrown away. In web documents, a typical example would
* be a '{@code
}' tag. If two of these follow each other, they are
* coalesced: no empty Sentence is output. The end-of-file is not
* represented in this Set, but the code behaves as if it were a member.
*
- sentenceRegionBeginPattern A regular expression for marking the start
* of a sentence region. Not included in the sentence.
*
- sentenceRegionEndPattern A regular expression for marking the end
* of a sentence region. Not included in the sentence.
*
* See DocumentPreprocessor for a class with a main method that will call this
* and cut a text file up into sentences.
*
* @author Joseph Smarr ([email protected])
* @author Christopher Manning
* @author Teg Grenager ([email protected])
* @author Sarah Spikes ([email protected]) (Templatization)
*
* @param The type of the tokens in the sentences
*/
public class WordToSentenceProcessor implements ListProcessor> {
private static final boolean DEBUG = false;
/**
* Regex for tokens (Strings) that qualify as sentence-final tokens.
*/
private final Pattern sentenceBoundaryTokenPattern;
/**
* Set of tokens (Strings) that qualify as tokens that can follow
* what normally counts as an end of sentence token, and which are
* attributed to the preceding sentence. For example ")" coming after
* a period.
*/
private final Set sentenceBoundaryFollowers;
/**
* List of regex Pattern that are sentence boundaries to be discarded.
*/
private List sentenceBoundaryToDiscard;
private final Pattern sentenceRegionBeginPattern;
private final Pattern sentenceRegionEndPattern;
private boolean isOneSentence;
public void setSentenceBoundaryToDiscard(Set regexSet) {
sentenceBoundaryToDiscard = new ArrayList(regexSet.size());
for (String s: regexSet) {
sentenceBoundaryToDiscard.add(Pattern.compile(Pattern.quote(s)));
}
}
public boolean isOneSentence() {
return isOneSentence;
}
public void setOneSentence(boolean oneSentence) {
isOneSentence = oneSentence;
}
public void addHtmlSentenceBoundaryToDiscard(Set set) {
if (sentenceBoundaryToDiscard == null) {
sentenceBoundaryToDiscard = new ArrayList();
}
for (String s: set) {
sentenceBoundaryToDiscard.add(Pattern.compile("<\\s*/?\\s*" + s + "\\s*/?\\s*>", Pattern.CASE_INSENSITIVE));
sentenceBoundaryToDiscard.add(Pattern.compile("<\\s*" + s + "\\s+[^>]+>", Pattern.CASE_INSENSITIVE));
}
}
private boolean matchesSentenceBoundaryToDiscard(String word) {
for(Pattern p: sentenceBoundaryToDiscard){
Matcher m = p.matcher(word);
if(m.matches()){
return true;
}
}
return false;
}
public List> process(List words) {
if (isOneSentence) {
List> sentences = Generics.newArrayList();
sentences.add(new ArrayList(words));
return sentences;
} else {
return wordsToSentences(words);
}
}
/**
* Returns a List of Lists where each element is built from a run
* of Words in the input Document. Specifically, reads through each word in
* the input document and breaks off a sentence after finding a valid
* sentence boundary token or end of file.
* Note that for this to work, the words in the
* input document must have been tokenized with a tokenizer that makes
* sentence boundary tokens their own tokens (e.g., {@link PTBTokenizer}).
*
* @param words A list of already tokenized words (must implement HasWord or be a String)
* @return A list of Sentence
* @see #WordToSentenceProcessor(String, Set, Set, Pattern, Pattern)
*/
public List> wordsToSentences(List words) {
List> sentences = Generics.newArrayList();
List currentSentence = null;
List lastSentence = null;
boolean insideRegion = false;
for (IN o: words) {
String word;
if (o instanceof HasWord) {
HasWord h = (HasWord) o;
word = h.word();
} else if (o instanceof String) {
word = (String) o;
} else if (o instanceof CoreMap) {
word = ((CoreMap)o).get(CoreAnnotations.TextAnnotation.class);
} else {
throw new RuntimeException("Expected token to be either Word or String.");
}
boolean forcedEnd = false;
if (o instanceof CoreMap) {
Boolean forcedEndValue =
((CoreMap)o).get(CoreAnnotations.ForcedSentenceEndAnnotation.class);
if (forcedEndValue != null)
forcedEnd = forcedEndValue;
}
if (DEBUG) {
EncodingPrintWriter.err.println("Word is " + word, "UTF-8");
}
if (currentSentence == null) {
currentSentence = new ArrayList();
}
if (sentenceRegionBeginPattern != null && ! insideRegion) {
if (sentenceRegionBeginPattern.matcher(word).matches()) {
insideRegion = true;
}
if (DEBUG) {
System.err.println(" outside region");
}
continue;
}
if (sentenceBoundaryFollowers.contains(word) && lastSentence != null && currentSentence.isEmpty()) {
lastSentence.add(o);
if (DEBUG) {
System.err.println(" added to last");
}
} else {
boolean newSent = false;
if (matchesSentenceBoundaryToDiscard(word)) {
newSent = true;
} else if (sentenceRegionEndPattern != null && sentenceRegionEndPattern.matcher(word).matches()) {
insideRegion = false;
newSent = true;
} else if (sentenceBoundaryTokenPattern.matcher(word).matches()) {
currentSentence.add(o);
if (DEBUG) {
System.err.println(" is sentence boundary; added to current");
}
newSent = true;
} else if (forcedEnd) {
currentSentence.add(o);
newSent = true;
if (DEBUG) {
System.err.println(" annotated to be the end of a sentence");
}
} else {
currentSentence.add(o);
if (DEBUG) {
System.err.println(" added to current");
}
}
if (newSent && currentSentence.size() > 0) {
if (DEBUG) {
System.err.println(" beginning new sentence");
}
sentences.add(currentSentence);
// adds this sentence now that it's complete
lastSentence = currentSentence;
currentSentence = null; // clears the current sentence
}
}
}
// add any words at the end, even if there isn't a sentence
// terminator at the end of file
if (currentSentence != null && currentSentence.size() > 0) {
sentences.add(currentSentence); // adds last sentence
}
return sentences;
}
public Document> processDocument(Document in) {
Document> doc = in.blankDocument();
doc.addAll(process(in));
return doc;
}
/**
* Create a WordToSentenceProcessor
using a sensible default
* list of tokens to split on. The default set is: {".","?","!"} and
* any combination of ! or ?, as in !!!?!?!?!!!?!!?!!!
*/
public WordToSentenceProcessor() {
this("\\.|[!?]+");
}
/**
* Flexibly set the set of acceptable sentence boundary tokens, but with
* a default set of allowed boundary following tokens (based on English
* and Penn Treebank encoding).
* The allowed set of boundary followers is:
* {")","]","\"","\'", "''", "-RRB-", "-RSB-", "-RCB-"}.
*
* @param boundaryTokenRegex The set of boundary tokens
*/
public WordToSentenceProcessor(String boundaryTokenRegex) {
this(boundaryTokenRegex, Generics.newHashSet(Arrays.asList(")", "]", "\"", "\'", "''", "-RRB-", "-RSB-", "-RCB-")));
}
/**
* Flexibly set the set of acceptable sentence boundary tokens and
* also the set of tokens commonly following sentence boundaries, and
* the set of discarded separator tokens.
* The default set of discarded separator tokens is: {"\n"}.
*/
public WordToSentenceProcessor(String boundaryTokenRegex, Set boundaryFollowers) {
this(boundaryTokenRegex, boundaryFollowers, Collections.singleton("\n"));
}
/**
* Flexibly set the set of acceptable sentence boundary tokens,
* the set of tokens commonly following sentence boundaries, and also
* the set of tokens that are sentences boundaries that should be
* discarded.
*/
public WordToSentenceProcessor(String boundaryTokenRegex,
Set boundaryFollowers,
Set boundaryToDiscard) {
this(boundaryTokenRegex, boundaryFollowers, boundaryToDiscard, null, null);
}
public WordToSentenceProcessor(Pattern regionBeginPattern, Pattern regionEndPattern) {
this("", Collections.emptySet(),
Collections.emptySet(), regionBeginPattern, regionEndPattern);
}
/**
* Flexibly set a pattern that matches acceptable sentence boundaries,
* the set of tokens commonly following sentence boundaries, and also
* the set of tokens that are sentence boundaries that should be discarded.
* This is private because it is a dangerous constructor. It's not clear what the semantics
* should be if there are both boundary token sets, and patterns to match.
*/
private WordToSentenceProcessor(String boundaryTokenRegex, Set boundaryFollowers, Set boundaryToDiscard, Pattern regionBeginPattern, Pattern regionEndPattern) {
sentenceBoundaryTokenPattern = Pattern.compile(boundaryTokenRegex);
sentenceBoundaryFollowers = boundaryFollowers;
setSentenceBoundaryToDiscard(boundaryToDiscard);
sentenceRegionBeginPattern = regionBeginPattern;
sentenceRegionEndPattern = regionEndPattern;
if (DEBUG) {
EncodingPrintWriter.err.println("WordToSentenceProcessor: boundaryTokens=" + boundaryTokenRegex, "UTF-8");
EncodingPrintWriter.err.println(" boundaryFollowers=" + boundaryFollowers, "UTF-8");
EncodingPrintWriter.err.println(" boundaryToDiscard=" + boundaryToDiscard, "UTF-8");
}
}
}