All downloads are FREE. The search and download functionalities use the official Maven repository.

edu.stanford.nlp.process.WordToSentenceProcessor Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.process;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.MultiTokenTag;
import edu.stanford.nlp.ling.tokensregex.SequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.SequencePattern;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * Transforms a List of words into a List of Lists of words (that is, a List
 * of sentences), by grouping the words.  The word stream is assumed to
 * already be adequately tokenized, and this class just divides the List into
 * sentences, perhaps discarding some separator tokens as it goes.
 * 

* The main behavior is to look for sentence ending tokens like "." or "?!?", * and to split after them and any following sentence closers like ")". * Overlaid on this is an overall choice of state: The WordToSentenceProcessor * can be a non-splitter, which always returns one sentence. Otherwise, the * WordToSentenceProcessor will also split based on paragraphs using one of * these three states: (1) Ignore line breaks in splitting sentences, * (2) Treat each line as a separate paragraph, or (3) Treat two consecutive * line breaks as marking the end of a paragraph. The details of sentence * breaking within paragraphs is controlled based on the following three * variables: *

    *
  • sentenceBoundaryTokens are tokens that are left in a sentence, but are * to be regarded as ending a sentence. A canonical example is a period. * If two of these follow each other, the second will be a sentence * consisting of only the sentenceBoundaryToken. *
  • sentenceBoundaryFollowers are tokens that are left in a sentence, and * which can follow a sentenceBoundaryToken while still belonging to * the previous sentence. They cannot begin a sentence (except at the * beginning of a document). A canonical example is a close parenthesis * ')'. *
  • sentenceBoundaryToDiscard are tokens which separate sentences and * which should be thrown away. In web documents, a typical example would * be a '{@code

    }' tag. If two of these follow each other, they are * coalesced: no empty Sentence is output. The end-of-file is not * represented in this Set, but the code behaves as if it were a member. *

  • regionElementRegex A regular expression for element names containing * a sentence region. Only tokens in such elements will be included in * sentences. The start and end tags themselves are not included in the * sentence. *
* * Instances of this class are now immutable. ☺ * * @author Joseph Smarr ([email protected]) * @author Christopher Manning * @author Teg Grenager ([email protected]) * @author Sarah Spikes ([email protected]) (Templatization) * * @param The type of the tokens in the sentences */ public class WordToSentenceProcessor implements ListProcessor> { /** A logger for this class */ private static final Redwood.RedwoodChannels log = Redwood.channels(WordToSentenceProcessor.class); // todo [cdm Aug 2012]: This should be unified with the PlainTextIterator // in DocumentPreprocessor, perhaps by making this one implement Iterator. // (DocumentProcessor once used to use this class, but now doesn't....) public enum NewlineIsSentenceBreak { NEVER, ALWAYS, TWO_CONSECUTIVE } public static final String DEFAULT_BOUNDARY_REGEX = "\\.|[!?]+"; /** Pe = Close_Punctuation (close brackets), Pf = Final_Punctuation (close quotes); * add straight quotes, PTB escaped right brackets (-RRB-, etc.), greater than as close angle bracket, * and those forms in full width range. */ public static final String DEFAULT_BOUNDARY_FOLLOWERS_REGEX = "[\\p{Pe}\\p{Pf}\"'>"'>]|''|-R[CRS]B-"; public static final Set DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD = Collections.unmodifiableSet(Generics.newHashSet( Arrays.asList(WhitespaceLexer.NEWLINE, PTBLexer.NEWLINE_TOKEN))); private static final boolean DEBUG = false; /** * Regex for tokens (Strings) that qualify as sentence-final tokens. */ private final Pattern sentenceBoundaryTokenPattern; /** * Regex for multi token sequences that qualify as sentence-final tokens. * (i.e. use if you want to sentence split on 2 or more newlines) */ private final SequencePattern sentenceBoundaryMultiTokenPattern; /** * Regex for tokens (Strings) that qualify as tokens that can follow * what normally counts as an end of sentence token, and which are * attributed to the preceding sentence. For example ")" coming after * a period. 
*/ private final Pattern sentenceBoundaryFollowersPattern; /** * List of regex Pattern that are sentence boundaries to be discarded. * This is normally newline tokens or representations of them. */ private final Set sentenceBoundaryToDiscard; /** Patterns that match the start and end tags of XML elements. These will * be discarded, but taken to mark a sentence boundary. * The value will be null if there are no such elements being used * (for efficiency). */ private final List xmlBreakElementsToDiscard; /** * List of regex Patterns that are not to be treated as sentence boundaries but should be discarded * (i.e. these may have been used with context to identify sentence boundaries but are not needed any more) */ private final List tokenPatternsToDiscard; private final Pattern sentenceRegionBeginPattern; private final Pattern sentenceRegionEndPattern; private final NewlineIsSentenceBreak newlineIsSentenceBreak; private final boolean isOneSentence; private final boolean allowEmptySentences; public static NewlineIsSentenceBreak stringToNewlineIsSentenceBreak(String name) { if ("always".equals(name)) { return NewlineIsSentenceBreak.ALWAYS; } else if ("never".equals(name)) { return NewlineIsSentenceBreak.NEVER; } else if (name != null && name.contains("two")) { return NewlineIsSentenceBreak.TWO_CONSECUTIVE; } else { throw new IllegalArgumentException("Not a valid NewlineIsSentenceBreak name: '" + name + "' (should be one of 'always', 'never', 'two')"); } } /** This is a sort of hacked in other way to end sentences. * Tokens with the ForcedSentenceEndAnnotation set to true * will also end a sentence. 
*/ @SuppressWarnings("OverlyStrongTypeCast") private static boolean isForcedEndToken(Object o) { if (o instanceof CoreMap) { Boolean forcedEndValue = ((CoreMap)o).get(CoreAnnotations.ForcedSentenceEndAnnotation.class); return forcedEndValue != null && forcedEndValue; } else { return false; } } @SuppressWarnings("OverlyStrongTypeCast") private static String getString(Object o) { if (o instanceof HasWord) { HasWord h = (HasWord) o; return h.word(); } else if (o instanceof String) { return (String) o; } else if (o instanceof CoreMap) { return ((CoreMap) o).get(CoreAnnotations.TextAnnotation.class); } else { throw new RuntimeException("Expected token to be either Word or String."); } } private static boolean matches(List patterns, String word) { for (Pattern p: patterns) { Matcher m = p.matcher(word); if (m.matches()) { return true; } } return false; } private boolean matchesXmlBreakElementToDiscard(String word) { return matches(xmlBreakElementsToDiscard, word); } private boolean matchesTokenPatternsToDiscard(String word) { return matches(tokenPatternsToDiscard, word); } // todo [cdm 2016]: Should really sort out generics here so don't need to have extra list copying @Override public List> process(List words) { if (isOneSentence) { // put all the words in one sentence List> sentences = Generics.newArrayList(); sentences.add(new ArrayList<>(words)); return sentences; } else { return wordsToSentences(words); } } /** * Returns a List of Lists where each element is built from a run * of Words in the input Document. Specifically, reads through each word in * the input document and breaks off a sentence after finding a valid * sentence boundary token or end of file. * Note that for this to work, the words in the * input document must have been tokenized with a tokenizer that makes * sentence boundary tokens their own tokens (e.g., {@link PTBTokenizer}). * * @param words A list of already tokenized words (must implement HasWord or be a String). * @return A list of sentences. 
* @see #WordToSentenceProcessor(String, String, Set, Set, String, NewlineIsSentenceBreak, SequencePattern, Set, boolean, boolean) */ public List> wordsToSentences(List words) { IdentityHashMap isSentenceBoundary = null; // is null unless used by sentenceBoundaryMultiTokenPattern if (sentenceBoundaryMultiTokenPattern != null) { // Do initial pass using tokensregex to identify multi token patterns that need to be matched // and add the last token to our table of sentence boundary tokens isSentenceBoundary = new IdentityHashMap<>(); SequenceMatcher matcher = sentenceBoundaryMultiTokenPattern.getMatcher(words); while (matcher.find()) { List nodes = matcher.groupNodes(); if (nodes != null && ! nodes.isEmpty()) { isSentenceBoundary.put(nodes.get(nodes.size() - 1), true); } } } // Split tokens into sentences!!! List> sentences = Generics.newArrayList(); List currentSentence = new ArrayList<>(); List lastSentence = null; boolean insideRegion = false; boolean inWaitForForcedEnd = false; boolean lastTokenWasNewline = false; for (IN o: words) { String word = getString(o); boolean forcedEnd = isForcedEndToken(o); boolean inMultiTokenExpr = false; boolean discardToken = false; if (o instanceof CoreMap) { // Hacky stuff to ensure sentence breaks do not happen in certain cases CoreMap cm = (CoreMap) o; Boolean forcedUntilEndValue = cm.get(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class); if (!forcedEnd) { if (forcedUntilEndValue != null && forcedUntilEndValue) inWaitForForcedEnd = true; else { MultiTokenTag mt = cm.get(CoreAnnotations.MentionTokenAnnotation.class); if (mt != null && !mt.isEnd()) { // In the middle of a multi token mention, make sure sentence is not ended here inMultiTokenExpr = true; } } } } if (tokenPatternsToDiscard != null) { discardToken = matchesTokenPatternsToDiscard(word); } if (sentenceRegionBeginPattern != null && ! 
insideRegion) { if (DEBUG) { log.info("Word is " + word + "; outside region; deleted"); } if (sentenceRegionBeginPattern.matcher(word).matches()) { insideRegion = true; if (DEBUG) { log.info(" entering region"); } } lastTokenWasNewline = false; continue; } if (lastSentence != null && currentSentence.isEmpty() && sentenceBoundaryFollowersPattern.matcher(word).matches()) { if (!discardToken) { lastSentence.add(o); } if (DEBUG) { log.info("Word is " + word + (discardToken ? "discarded":" added to last sentence")); } lastTokenWasNewline = false; continue; } boolean newSent = false; String debugText = (discardToken)? "discarded": "added to current"; if (inWaitForForcedEnd && !forcedEnd) { if (!discardToken) currentSentence.add(o); if (DEBUG) { log.info("Word is " + word + "; is in wait for forced end; " + debugText); } } else if (inMultiTokenExpr && !forcedEnd) { if (!discardToken) currentSentence.add(o); if (DEBUG) { log.info("Word is " + word + "; is in multi token expr; " + debugText); } } else if (sentenceBoundaryToDiscard.contains(word)) { if (newlineIsSentenceBreak == NewlineIsSentenceBreak.ALWAYS) { newSent = true; } else if (newlineIsSentenceBreak == NewlineIsSentenceBreak.TWO_CONSECUTIVE) { if (lastTokenWasNewline) { newSent = true; } } lastTokenWasNewline = true; if (DEBUG) { log.info("Word is " + word + " discarded sentence boundary"); } } else { lastTokenWasNewline = false; Boolean isb; if (xmlBreakElementsToDiscard != null && matchesXmlBreakElementToDiscard(word)) { newSent = true; if (DEBUG) { log.info("Word is " + word + "; is XML break element; discarded"); } } else if (sentenceRegionEndPattern != null && sentenceRegionEndPattern.matcher(word).matches()) { insideRegion = false; newSent = true; // Marked sentence boundaries } else if ((isSentenceBoundary != null) && ((isb = isSentenceBoundary.get(o)) != null) && isb) { if (!discardToken) currentSentence.add(o); if (DEBUG) { log.info("Word is " + word + "; is sentence boundary (matched multi-token 
pattern); " + debugText); } newSent = true; } else if (sentenceBoundaryTokenPattern.matcher(word).matches()) { if (!discardToken) currentSentence.add(o); if (DEBUG) { log.info("Word is " + word + "; is sentence boundary; " + debugText); } newSent = true; } else if (forcedEnd) { if (!discardToken) currentSentence.add(o); inWaitForForcedEnd = false; newSent = true; if (DEBUG) { log.info("Word is " + word + "; annotated to be the end of a sentence; " + debugText); } } else { if (!discardToken) currentSentence.add(o); if (DEBUG) { log.info("Word is " + word + "; " + debugText); } } } if (newSent && (!currentSentence.isEmpty() || allowEmptySentences)) { if (DEBUG) { log.info(" beginning new sentence"); } sentences.add(currentSentence); // adds this sentence now that it's complete lastSentence = currentSentence; currentSentence = new ArrayList<>(); // clears the current sentence } } // add any words at the end, even if there isn't a sentence // terminator at the end of file if ( ! currentSentence.isEmpty()) { sentences.add(currentSentence); // adds last sentence } return sentences; } public Document> processDocument(Document in) { Document> doc = in.blankDocument(); doc.addAll(process(in)); return doc; } /* ---------- Constructors --------- */ /** * Create a {@code WordToSentenceProcessor} using a sensible default * list of tokens for sentence ending for English/Latin writing systems. * The default set is: {".","?","!"} and * any combination of ! or ?, as in !!!?!?!?!!!?!!?!!!. * A sequence of two or more consecutive line breaks is taken as a paragraph break * which also splits sentences. This is the usual constructor for sentence * breaking reasonable text, which uses hard-line breaking, so two * blank lines indicate a paragraph break. * People commonly use this constructor. */ public WordToSentenceProcessor() { this(false); } /** * Create a {@code WordToSentenceProcessor} using a sensible default * list of tokens for sentence ending for English/Latin writing systems. 
* The default set is: {".","?","!"} and * any combination of ! or ?, as in !!!?!?!?!!!?!!?!!!. * You can specify the treatment of newlines as sentence breaks as one * of ignored, every newline is a sentence break, or only two or more * consecutive newlines are a sentence break. * * @param newlineIsSentenceBreak Strategy for treating newlines as * paragraph breaks. */ public WordToSentenceProcessor(NewlineIsSentenceBreak newlineIsSentenceBreak) { this(DEFAULT_BOUNDARY_REGEX, newlineIsSentenceBreak, false); } /** * Create a {@code WordToSentenceProcessor} which never breaks the input * into multiple sentences. If the argument is true, the input stream * is always output as one sentence. (If it is false, this is * equivalent to the no argument constructor, so why use this?) * * @param isOneSentence Marker argument: true means to treat input * as one sentence */ public WordToSentenceProcessor(boolean isOneSentence) { this(DEFAULT_BOUNDARY_REGEX, NewlineIsSentenceBreak.TWO_CONSECUTIVE, isOneSentence); } /** * Set the set of Strings that will mark the end of a sentence, * and which will be discarded after doing so. * This constructor is used for, and usually only for, doing * one-sentence-per-line sentence splitting. Since in such cases, you * generally want to strictly preserve the set of lines in the input, * it preserves empty lines as empty sentences in the output. * * @param boundaryToDiscard A Set of String that will be matched * with .equals() and will mark an * end of sentence and be discarded. */ public WordToSentenceProcessor(Set boundaryToDiscard) { this("", "", boundaryToDiscard, null, null, NewlineIsSentenceBreak.ALWAYS, null, null, false, true); } /** * Create a basic {@code WordToSentenceProcessor} specifying just a few top-level options. 
* * @param boundaryTokenRegex The set of boundary tokens * @param newlineIsSentenceBreak Strategy for treating newlines as sentence breaks * @param isOneSentence Whether to treat whole text as one sentence * (if true, the other two parameters are ignored). */ public WordToSentenceProcessor(String boundaryTokenRegex, NewlineIsSentenceBreak newlineIsSentenceBreak, boolean isOneSentence) { this(boundaryTokenRegex, DEFAULT_BOUNDARY_FOLLOWERS_REGEX, DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD, null, null, newlineIsSentenceBreak, null, null, isOneSentence, false); } /** * Flexibly set the set of acceptable sentence boundary tokens, but with * a default set of allowed boundary following tokens. Also can set sentence boundary * to discard tokens and xmlBreakElementsToDiscard and set the treatment of newlines * (boundaryToDiscard) as sentence ends. * * This one is convenient in allowing any of the first 3 arguments to be null, * and then the usual defaults are substituted for it. * The allowed set of boundary followers is the regex: "[\\p{Pe}\\p{Pf}'\"]|''|-R[CRS]B-". * The default set of discarded separator tokens includes the * newline tokens used by WhitespaceLexer and PTBLexer. * * @param boundaryTokenRegex The regex of boundary tokens. If null, use default. * @param boundaryFollowersRegex The regex of boundary following tokens. If null, use default * @param boundaryToDiscard The set of regex for sentence boundary tokens that should be discarded. * If null, use default. * @param xmlBreakElementsToDiscard xml element names like "p", which will be recognized, * treated as sentence ends, and discarded. * If null, use none. * @param newlineIsSentenceBreak Strategy for counting line ends (boundaryToDiscard) as sentence ends. 
*/ public WordToSentenceProcessor(String boundaryTokenRegex, String boundaryFollowersRegex, Set boundaryToDiscard, Set xmlBreakElementsToDiscard, NewlineIsSentenceBreak newlineIsSentenceBreak, SequencePattern sentenceBoundaryMultiTokenPattern, Set tokenRegexesToDiscard) { this(boundaryTokenRegex == null ? DEFAULT_BOUNDARY_REGEX : boundaryTokenRegex, boundaryFollowersRegex == null ? DEFAULT_BOUNDARY_FOLLOWERS_REGEX: boundaryFollowersRegex, boundaryToDiscard == null || boundaryToDiscard.isEmpty() ? DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD : boundaryToDiscard, xmlBreakElementsToDiscard == null ? Collections.emptySet() : xmlBreakElementsToDiscard, null, newlineIsSentenceBreak, sentenceBoundaryMultiTokenPattern, tokenRegexesToDiscard, false, false); } /** * Configure all parameters for converting a list of tokens into sentences. * The whole enchilada. * * @param boundaryTokenRegex Tokens that match this regex will end a * sentence, but are retained at the end of * the sentence. Substantive value must be supplied. * @param boundaryFollowersRegex This is a Set of String that are matched with * .equals() which are allowed to be tacked onto * the end of a sentence after a sentence boundary * token, for example ")". Substantive value must be supplied. * @param boundariesToDiscard This is normally used for newline tokens if * they are included in the tokenization. They * may end the sentence (depending on the setting * of newlineIsSentenceBreak), but at any rate * are deleted from sentences in the output. * Substantive value must be supplied. * @param xmlBreakElementsToDiscard These are elements like "p" or "sent", * which will be wrapped into regex for * approximate XML matching. They will be * deleted in the output, and will always * trigger a sentence boundary. * May be null; means discard none. * @param regionElementRegex XML element name regex to delimit regions processed. * Tokens outside one of these elements are discarded. 
* May be null; means to not filter by regions * @param newlineIsSentenceBreak How to treat newlines. Must have substantive value. * @param sentenceBoundaryMultiTokenPattern A TokensRegex multi-token pattern for finding boundaries. * May be null; means that there are no such patterns. * @param tokenRegexesToDiscard Regex for tokens to discard. * May be null; means that no tokens are discarded in this way. * @param isOneSentence Whether to treat whole of input as one sentence regardless. * Must have substantive value. Overrides anything else. * @param allowEmptySentences Whether to allow empty sentences to be output * Must have substantive value. Often suppressed, but don't want that in things like * strict one-sentence-per-line mode. */ public WordToSentenceProcessor(String boundaryTokenRegex, String boundaryFollowersRegex, Set boundariesToDiscard, Set xmlBreakElementsToDiscard, String regionElementRegex, NewlineIsSentenceBreak newlineIsSentenceBreak, SequencePattern sentenceBoundaryMultiTokenPattern, Set tokenRegexesToDiscard, boolean isOneSentence, boolean allowEmptySentences) { sentenceBoundaryTokenPattern = Pattern.compile(boundaryTokenRegex); sentenceBoundaryFollowersPattern = Pattern.compile(boundaryFollowersRegex); sentenceBoundaryToDiscard = Collections.unmodifiableSet(boundariesToDiscard); if (xmlBreakElementsToDiscard == null || xmlBreakElementsToDiscard.isEmpty()) { this.xmlBreakElementsToDiscard = null; } else { this.xmlBreakElementsToDiscard = new ArrayList<>(xmlBreakElementsToDiscard.size()); for (String s: xmlBreakElementsToDiscard) { String regex = "<\\s*(?:/\\s*)?(?:" + s + ")(?:\\s+[^>]+?|\\s*(?:/\\s*)?)>"; // log.info("Regex is |" + regex + "|"); // todo: Historically case insensitive, but maybe better and more proper to make case sensitive? 
this.xmlBreakElementsToDiscard.add(Pattern.compile(regex, Pattern.CASE_INSENSITIVE)); } } if (regionElementRegex != null) { sentenceRegionBeginPattern = Pattern.compile("<\\s*(?:" + regionElementRegex + ")(?:\\s+[^>]+?)?>"); sentenceRegionEndPattern = Pattern.compile("<\\s*/\\s*(?:" + regionElementRegex + ")\\s*>"); } else { sentenceRegionBeginPattern = null; sentenceRegionEndPattern = null; } this.newlineIsSentenceBreak = newlineIsSentenceBreak; this.sentenceBoundaryMultiTokenPattern = sentenceBoundaryMultiTokenPattern; if (tokenRegexesToDiscard != null) { this.tokenPatternsToDiscard = new ArrayList<>(tokenRegexesToDiscard.size()); for (String s: tokenRegexesToDiscard) { this.tokenPatternsToDiscard.add(Pattern.compile(s)); } } else { this.tokenPatternsToDiscard = null; } this.isOneSentence = isOneSentence; this.allowEmptySentences = allowEmptySentences; if (DEBUG) { log.info("WordToSentenceProcessor: boundaryTokens=" + boundaryTokenRegex); log.info(" boundaryFollowers=" + boundaryFollowersRegex); log.info(" boundariesToDiscard=" + boundariesToDiscard); log.info(" xmlBreakElementsToDiscard=" + xmlBreakElementsToDiscard); log.info(" regionBeginPattern=" + sentenceRegionBeginPattern); log.info(" regionEndPattern=" + sentenceRegionEndPattern); log.info(" newlineIsSentenceBreak=" + newlineIsSentenceBreak); log.info(" sentenceBoundaryMultiTokenPattern=" + sentenceBoundaryMultiTokenPattern); log.info(" tokenPatternsToDiscard=" + tokenPatternsToDiscard); log.info(" isOneSentence=" + isOneSentence); log.info(" allowEmptySentences=" + allowEmptySentences); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy