/*
 * Stanford CoreNLP provides a set of natural language analysis tools which can take raw English
 * language text input and give the base forms of words, their parts of speech, whether they are
 * names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the
 * structure of sentences in terms of phrases and word dependencies, and indicate which noun
 * phrases refer to the same entities. It provides the foundational building blocks for higher
 * level text understanding applications.
 */
package edu.stanford.nlp.process;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.MultiTokenTag;
import edu.stanford.nlp.ling.tokensregex.SequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.SequencePattern;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.logging.Redwood;
/**
* Transforms a List of words into a List of Lists of words (that is, a List
* of sentences), by grouping the words. The word stream is assumed to
* already be adequately tokenized, and this class just divides the List into
* sentences, perhaps discarding some separator tokens as it goes.
*
* The main behavior is to look for sentence ending tokens like "." or "?!?",
* and to split after them and any following sentence closers like ")".
* Overlaid on this is an overall choice of state: The WordToSentenceProcessor
* can be a non-splitter, which always returns one sentence. Otherwise, the
* WordToSentenceProcessor will also split based on paragraphs using one of
* these three states: (1) Ignore line breaks in splitting sentences,
* (2) Treat each line as a separate paragraph, or (3) Treat two consecutive
* line breaks as marking the end of a paragraph. The details of sentence
* breaking within paragraphs is controlled based on the following three
* variables:
*
 * <ol>
 * <li>sentenceBoundaryTokens are tokens that are left in a sentence, but are
 * to be regarded as ending a sentence. A canonical example is a period.
 * If two of these follow each other, the second will be a sentence
 * consisting of only the sentenceBoundaryToken.
 * <li>sentenceBoundaryFollowers are tokens that are left in a sentence, and
 * which can follow a sentenceBoundaryToken while still belonging to
 * the previous sentence. They cannot begin a sentence (except at the
 * beginning of a document). A canonical example is a close parenthesis
 * ')'.
 * <li>sentenceBoundaryToDiscard are tokens which separate sentences and
 * which should be thrown away. In web documents, a typical example would
 * be a '{@code <p>}' tag. If two of these follow each other, they are
 * coalesced: no empty Sentence is output. The end-of-file is not
 * represented in this Set, but the code behaves as if it were a member.
 * <li>regionElementRegex A regular expression for element names containing
 * a sentence region. Only tokens in such elements will be included in
 * sentences. The start and end tags themselves are not included in the
 * sentence.
 * </ol>
 *
* Instances of this class are now immutable. ☺
*
* @author Joseph Smarr ([email protected])
* @author Christopher Manning
* @author Teg Grenager ([email protected])
* @author Sarah Spikes ([email protected]) (Templatization)
*
 * @param <IN> The type of the tokens in the sentences
*/
public class WordToSentenceProcessor implements ListProcessor> {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(WordToSentenceProcessor.class);
// todo [cdm Aug 2012]: This should be unified with the PlainTextIterator
// in DocumentPreprocessor, perhaps by making this one implement Iterator.
// (DocumentProcessor once used to use this class, but now doesn't....)
public enum NewlineIsSentenceBreak { NEVER, ALWAYS, TWO_CONSECUTIVE }
public static final String DEFAULT_BOUNDARY_REGEX = "\\.|[!?]+";
/** Pe = Close_Punctuation (close brackets), Pf = Final_Punctuation (close quotes);
* add straight quotes, PTB escaped right brackets (-RRB-, etc.), greater than as close angle bracket,
* and those forms in full width range.
*/
public static final String DEFAULT_BOUNDARY_FOLLOWERS_REGEX = "[\\p{Pe}\\p{Pf}\"'>"'>]|''|-R[CRS]B-";
public static final Set DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD = Collections.unmodifiableSet(Generics.newHashSet(
Arrays.asList(WhitespaceLexer.NEWLINE, PTBLexer.NEWLINE_TOKEN)));
private static final boolean DEBUG = false;
/**
* Regex for tokens (Strings) that qualify as sentence-final tokens.
*/
private final Pattern sentenceBoundaryTokenPattern;
/**
* Regex for multi token sequences that qualify as sentence-final tokens.
* (i.e. use if you want to sentence split on 2 or more newlines)
*/
private final SequencePattern sentenceBoundaryMultiTokenPattern;
/**
* Regex for tokens (Strings) that qualify as tokens that can follow
* what normally counts as an end of sentence token, and which are
* attributed to the preceding sentence. For example ")" coming after
* a period.
*/
private final Pattern sentenceBoundaryFollowersPattern;
/**
* List of regex Pattern that are sentence boundaries to be discarded.
* This is normally newline tokens or representations of them.
*/
private final Set sentenceBoundaryToDiscard;
/** Patterns that match the start and end tags of XML elements. These will
* be discarded, but taken to mark a sentence boundary.
* The value will be null if there are no such elements being used
* (for efficiency).
*/
private final List xmlBreakElementsToDiscard;
/**
* List of regex Patterns that are not to be treated as sentence boundaries but should be discarded
* (i.e. these may have been used with context to identify sentence boundaries but are not needed any more)
*/
private final List tokenPatternsToDiscard;
private final Pattern sentenceRegionBeginPattern;
private final Pattern sentenceRegionEndPattern;
private final NewlineIsSentenceBreak newlineIsSentenceBreak;
private final boolean isOneSentence;
private final boolean allowEmptySentences;
public static NewlineIsSentenceBreak stringToNewlineIsSentenceBreak(String name) {
if ("always".equals(name)) {
return NewlineIsSentenceBreak.ALWAYS;
} else if ("never".equals(name)) {
return NewlineIsSentenceBreak.NEVER;
} else if (name != null && name.contains("two")) {
return NewlineIsSentenceBreak.TWO_CONSECUTIVE;
} else {
throw new IllegalArgumentException("Not a valid NewlineIsSentenceBreak name: '" + name + "' (should be one of 'always', 'never', 'two')");
}
}
/** This is a sort of hacked in other way to end sentences.
* Tokens with the ForcedSentenceEndAnnotation set to true
* will also end a sentence.
*/
@SuppressWarnings("OverlyStrongTypeCast")
private static boolean isForcedEndToken(Object o) {
if (o instanceof CoreMap) {
Boolean forcedEndValue =
((CoreMap)o).get(CoreAnnotations.ForcedSentenceEndAnnotation.class);
return forcedEndValue != null && forcedEndValue;
} else {
return false;
}
}
@SuppressWarnings("OverlyStrongTypeCast")
private static String getString(Object o) {
if (o instanceof HasWord) {
HasWord h = (HasWord) o;
return h.word();
} else if (o instanceof String) {
return (String) o;
} else if (o instanceof CoreMap) {
return ((CoreMap) o).get(CoreAnnotations.TextAnnotation.class);
} else {
throw new RuntimeException("Expected token to be either Word or String.");
}
}
private static boolean matches(List patterns, String word) {
for (Pattern p: patterns) {
Matcher m = p.matcher(word);
if (m.matches()) {
return true;
}
}
return false;
}
  /** Returns whether the word matches one of the XML break-element patterns to be discarded. */
  private boolean matchesXmlBreakElementToDiscard(String word) {
    return matches(xmlBreakElementsToDiscard, word);
  }
  /** Returns whether the word matches one of the token patterns that should simply be discarded. */
  private boolean matchesTokenPatternsToDiscard(String word) {
    return matches(tokenPatternsToDiscard, word);
  }
// todo [cdm 2016]: Should really sort out generics here so don't need to have extra list copying
@Override
public List> process(List words) {
if (isOneSentence) {
// put all the words in one sentence
List> sentences = Generics.newArrayList();
sentences.add(new ArrayList<>(words));
return sentences;
} else {
return wordsToSentences(words);
}
}
/**
* Returns a List of Lists where each element is built from a run
* of Words in the input Document. Specifically, reads through each word in
* the input document and breaks off a sentence after finding a valid
* sentence boundary token or end of file.
* Note that for this to work, the words in the
* input document must have been tokenized with a tokenizer that makes
* sentence boundary tokens their own tokens (e.g., {@link PTBTokenizer}).
*
* @param words A list of already tokenized words (must implement HasWord or be a String).
* @return A list of sentences.
* @see #WordToSentenceProcessor(String, String, Set, Set, String, NewlineIsSentenceBreak, SequencePattern, Set, boolean, boolean)
*/
public List> wordsToSentences(List words) {
IdentityHashMap