justhalf.nlp.sentencesplitter.StanfordSentenceSplitter Maven / Gradle / Ivy
package justhalf.nlp.sentencesplitter;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
import edu.stanford.nlp.process.TokenizerFactory;
/**
* An implementation of {@link SentenceSplitter} using Stanford CoreNLP
*/
public class StanfordSentenceSplitter implements SentenceSplitter {
private TokenizerFactory tokenizerFactory;
public StanfordSentenceSplitter() {
tokenizerFactory = PTBTokenizerFactory.newCoreLabelTokenizerFactory("ptb3Escaping=false,invertible=true");
}
@Override
public String[] splitToString(String input) {
List sentenceList = new ArrayList();
for(List sentenceTokenized: splitAndTokenize(input)){
StringBuilder sentence = new StringBuilder();
int lastIndex = -1;
for(CoreLabel word: sentenceTokenized){
int curIndex = word.beginPosition();
if(curIndex < lastIndex) continue;
if(sentence.length() > 0){
sentence.append(input.substring(lastIndex, curIndex));
}
sentence.append(word.word());
lastIndex = word.endPosition();
}
sentenceList.add(sentence.toString());
}
return sentenceList.toArray(new String[sentenceList.size()]);
}
@Override
public List split(String input){
List sentenceList = new ArrayList();
for(List sentenceTokenized: splitAndTokenize(input)){
StringBuilder sentenceText = new StringBuilder();
int sentenceBegin = sentenceTokenized.get(0).beginPosition();
String before = sentenceTokenized.get(0).before();
String after = sentenceTokenized.get(sentenceTokenized.size()-1).after();
int lastIndex = -1;
for(CoreLabel word: sentenceTokenized){
int curIndex = word.beginPosition();
if(curIndex < lastIndex) continue;
if(sentenceText.length() > 0){
sentenceText.append(input.substring(lastIndex, curIndex));
}
sentenceText.append(word.word());
lastIndex = word.endPosition();
}
CoreLabel sentence = new CoreLabel();
sentence.setBefore(before);
sentence.setAfter(after);
sentence.setBeginPosition(sentenceBegin);
sentence.setEndPosition(lastIndex);
sentence.setOriginalText(sentenceText.toString());
sentence.setWord(sentenceText.toString());
sentence.setValue(sentenceText.toString());
sentenceList.add(sentence);
}
return sentenceList;
}
/**
* Stanford CoreNLP's document processor also tokenize the input sentence while splitting
* the sentence. This method will return the original output of Stanford CoreNLP
* @param input
* A text which contains possibly multiple sentences.
* @return
* A list of sentences, each represented as a list of CoreLabel objects.
*/
public List> splitAndTokenize(String input){
DocumentPreprocessor splitter = new DocumentPreprocessor(new StringReader(input));
splitter.setTokenizerFactory(tokenizerFactory);
List> sentenceList = new ArrayList>();
for(List sentenceTokenized: splitter){
List sentence = new ArrayList();
for(HasWord word: sentenceTokenized){
sentence.add((CoreLabel)word);
}
sentenceList.add(sentence);
}
return sentenceList;
}
@Override
public boolean isThreadSafe(){
return true;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy