edu.stanford.nlp.ling.Sentence Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.ling;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* Sentence holds a couple of utility methods for lists.
* Those include a method that nicely prints a list and methods that
* construct lists of words from lists of strings.
*
* @author Dan Klein
* @author Christopher Manning (generified)
* @author John Bauer
* @version 2010
*/
public class Sentence {
private Sentence() {} // static methods
/**
* Create an ArrayList as a list of TaggedWord
from two
* lists of String
, one for the words, and the second for
* the tags.
*
* @param lex a list whose items are of type String
and
* are the words
* @param tags a list whose items are of type String
and
* are the tags
* @return The Sentence
*/
public static ArrayList toTaggedList(List lex, List tags) {
ArrayList sent = new ArrayList<>();
int ls = lex.size();
int ts = tags.size();
if (ls != ts) {
throw new IllegalArgumentException("Sentence.toSentence: lengths differ");
}
for (int i = 0; i < ls; i++) {
sent.add(new TaggedWord(lex.get(i), tags.get(i)));
}
return sent;
}
/**
* Create an ArrayList as a list of Word
from a
* list of String
.
*
* @param lex a list whose items are of type String
and
* are the words
* @return The Sentence
*/
//TODO wsg2010: This should be deprecated in favor of the method below with new labels
public static ArrayList toUntaggedList(List lex) {
ArrayList sent = new ArrayList<>();
for (String str : lex) {
sent.add(new Word(str));
}
return sent;
}
/**
* Create a Sentence as a list of Word
objects from
* an array of String objects.
*
* @param words The words to make it from
* @return The Sentence
*/
//TODO wsg2010: This should be deprecated in favor of the method below with new labels
public static ArrayList toUntaggedList(String... words) {
ArrayList sent = new ArrayList<>();
for (String str : words) {
sent.add(new Word(str));
}
return sent;
}
public static List toWordList(String... words) {
List sent = new ArrayList<>();
for (String word : words) {
CoreLabel cl = new CoreLabel();
cl.setValue(word);
cl.setWord(word);
sent.add(cl);
}
return sent;
}
/**
* Create a sentence as a List of CoreLabel
objects from
* an array (or varargs) of String objects.
*
* @param words The words to make it from
* @return The Sentence
*/
public static List toCoreLabelList(String... words) {
List sent = new ArrayList<>(words.length);
for (String word : words) {
CoreLabel cl = new CoreLabel();
cl.setValue(word);
cl.setWord(word);
sent.add(cl);
}
return sent;
}
/**
* Create a sentence as a List of CoreLabel
objects from
* a List of other label objects.
*
* @param words The words to make it from
* @return The Sentence
*/
public static List toCoreLabelList(List extends HasWord> words) {
List sent = new ArrayList<>(words.size());
for (HasWord word : words) {
CoreLabel cl = new CoreLabel();
if (word instanceof Label) {
cl.setValue(((Label) word).value());
}
cl.setWord(word.word());
if (word instanceof HasTag) {
cl.setTag(((HasTag) word).tag());
}
if (word instanceof HasLemma) {
cl.setLemma(((HasLemma) word).lemma());
}
sent.add(cl);
}
return sent;
}
/**
* Returns the sentence as a string with a space between words.
* It prints out the {@code value()} of each item -
* this will give the expected answer for a short form representation
* of the "sentence" over a range of cases. It is equivalent to
* calling {@code toString(true)}.
*
* TODO: Sentence used to be a subclass of ArrayList, with this
* method as the toString. Therefore, there may be instances of
* ArrayList being printed that expect this method to be used.
*
* @param list The tokenized sentence to print out
* @return The tokenized sentence as a String
*/
public static String listToString(List list) {
return listToString(list, true);
}
/**
* Returns the sentence as a string with a space between words.
* Designed to work robustly, even if the elements stored in the
* 'Sentence' are not of type Label.
*
* This one uses the default separators for any word type that uses
* separators, such as TaggedWord.
*
* @param list The tokenized sentence to print out
* @param justValue If true
and the elements are of type
* Label
, return just the
* value()
of the Label
of each word;
* otherwise,
* call the toString()
method on each item.
* @return The sentence in String form
*/
public static String listToString(List list, final boolean justValue) {
return listToString(list, justValue, null);
}
/**
* As already described, but if separator is not null, then objects
* such as TaggedWord
*
* @param separator The string used to separate Word and Tag
* in TaggedWord, etc
*/
public static String listToString(List list, final boolean justValue,
final String separator) {
StringBuilder s = new StringBuilder();
for (Iterator wordIterator = list.iterator(); wordIterator.hasNext();) {
T o = wordIterator.next();
s.append(wordToString(o, justValue, separator));
if (wordIterator.hasNext()) {
s.append(' ');
}
}
return s.toString();
}
/**
* Returns the sentence as a string, based on the original text and spacing
* prior to tokenization.
* This method assumes that this extra information has been encoded in CoreLabel
* objects for each token of the sentence, which do have the original spacing
* preserved (done with "invertible=true" for PTBTokenizer).
* However, the method has loose typing for easier inter-operation
* with old code that still works with a {@code List}.
*
* @param list The sentence (List of tokens) to print out
* @return The original sentence String, which may contain newlines or other artifacts of spacing
*/
public static String listToOriginalTextString(List list) {
return listToOriginalTextString(list, true);
}
/**
* Returns the sentence as a string, based on the original text and spacing
* prior to tokenization.
* This method assumes that this extra information has been encoded in CoreLabel
* objects for each token of the sentence, which do have the original spacing
* preserved (done with "invertible=true" for PTBTokenizer). If that information
* is not there, you will see null outputs, and if you do not pass in a List
* of CoreLabel objects, then the code will Exception.
* The method has loose typing for easier inter-operation
* with old code that still works with a {@code List}.
*
* @param list The sentence (List of tokens) to print out
* @param printBeforeBeforeStart Whether to print the BeforeAnnotation before the first token
* of the sentence. (In general, the BeforeAnnotation is the same
* as the AfterAnnotation of the preceding token. So, usually this
* is correct to do only for the first sentence of a text.)
* @return The original sentence String, which may contain newlines or other artifacts of spacing
*/
public static String listToOriginalTextString(List list, boolean printBeforeBeforeStart) {
StringBuilder s = new StringBuilder();
for (HasWord word : list) {
CoreLabel cl = (CoreLabel) word;
if (printBeforeBeforeStart) {
// Only print Before for first token, since otherwise same as After of previous token
// BUG: if you print a sequence of sentences, you double up between sentence spacing.
if (cl.get(CoreAnnotations.BeforeAnnotation.class) != null) {
s.append(cl.get(CoreAnnotations.BeforeAnnotation.class));
}
printBeforeBeforeStart = false;
}
s.append(cl.get(CoreAnnotations.OriginalTextAnnotation.class));
if (cl.get(CoreAnnotations.AfterAnnotation.class) != null) {
s.append(cl.get(CoreAnnotations.AfterAnnotation.class));
} else {
s.append(" ");
}
}
return s.toString();
}
public static String wordToString(T o, final boolean justValue) {
return wordToString(o, justValue, null);
}
public static String wordToString(T o, final boolean justValue,
final String separator) {
if (justValue && o instanceof Label) {
if (o instanceof CoreLabel) {
CoreLabel l = (CoreLabel) o;
String w = l.value();
if (w == null)
w = l.word();
return w;
} else {
return (((Label) o).value());
}
} else if (o instanceof CoreLabel) {
CoreLabel l = ((CoreLabel) o);
String w = l.value();
if (w == null)
w = l.word();
if (l.tag() != null) {
if (separator == null) {
return w + CoreLabel.TAG_SEPARATOR + l.tag();
} else {
return w + separator + l.tag();
}
}
return w;
// an interface that covered these next four cases would be
// nice, but we're moving away from these data types anyway
} else if (separator != null && o instanceof TaggedWord) {
return ((TaggedWord) o).toString(separator);
} else if (separator != null && o instanceof LabeledWord) {
return ((LabeledWord) o).toString(separator);
} else if (separator != null && o instanceof WordLemmaTag) {
return ((WordLemmaTag) o).toString(separator);
} else if (separator != null && o instanceof WordTag) {
return ((WordTag) o).toString(separator);
} else {
return (o.toString());
}
}
/**
* Returns the substring of the sentence from start (inclusive)
* to end (exclusive).
*
* @param start Leftmost index of the substring
* @param end Rightmost index of the ngram
* @return The ngram as a String
*/
public static String extractNgram(List list, int start, int end) {
if (start < 0 || end > list.size() || start >= end) return null;
final StringBuilder sb = new StringBuilder();
// TODO: iterator
for (int i = start; i < end; i++) {
T o = list.get(i);
if (sb.length() != 0) sb.append(" ");
sb.append((o instanceof HasWord) ? ((HasWord) o).word() : o.toString());
}
return sb.toString();
}
}