edu.stanford.nlp.process.WordSegmentingTokenizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.process;
import java.io.Reader;
import java.io.Serializable;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
/**
 * A tokenizer that works by calling a WordSegmenter.
 * This is used for Chinese and Arabic.
 * <p>
 * Input is first split by an underlying {@code Tokenizer} of {@code CoreLabel}s
 * (a CoreLabel whitespace tokenizer unless the caller supplies another one).
 * The word of each such token is handed to the {@code WordSegmenter}, and the
 * resulting segments are returned one at a time. A token whose word equals
 * {@code WhitespaceLexer.NEWLINE} is returned as-is instead of being segmented.
 *
 * @author Galen Andrew
 * @author Spence Green
 */
public class WordSegmentingTokenizer extends AbstractTokenizer<HasWord> {

  /** Iterator over the segments of the token currently being consumed; null before the first token. */
  private Iterator<HasWord> wordIter;

  /** Underlying tokenizer supplying the coarse tokens that get segmented. */
  private final Tokenizer<CoreLabel> tok;

  /** Segmenter applied to the word of every non-newline token. */
  private final WordSegmenter wordSegmenter;

  /**
   * Creates a tokenizer that reads from {@code r} through a CoreLabel
   * whitespace tokenizer and segments every token it yields.
   *
   * @param segmenter the segmenter applied to each token's word
   * @param r the reader to take input from
   */
  public WordSegmentingTokenizer(WordSegmenter segmenter, Reader r) {
    this(segmenter, WhitespaceTokenizer.newCoreLabelWhitespaceTokenizer(r));
  }

  /**
   * Creates a tokenizer that segments the tokens produced by {@code tokenizer}.
   *
   * @param segmenter the segmenter applied to each token's word
   * @param tokenizer the source of the coarse tokens to be segmented
   */
  public WordSegmentingTokenizer(WordSegmenter segmenter, Tokenizer<CoreLabel> tokenizer) {
    wordSegmenter = segmenter;
    tok = tokenizer;
  }

  /**
   * Returns the next segment, pulling and segmenting further tokens from the
   * underlying tokenizer whenever the current token's segments are used up.
   *
   * @return the next segment (or newline token), or {@code null} when the
   *         underlying tokenizer is exhausted or yields a token whose word is
   *         {@code null}
   */
  @Override
  protected HasWord getNext() {
    // Loop rather than branch once: a token may segment into an empty list,
    // in which case we must move on to the following token.
    while (wordIter == null || !wordIter.hasNext()) {
      if (!tok.hasNext()) {
        return null;
      }
      CoreLabel token = tok.next();
      String s = token.word();
      if (s == null) {
        return null;
      }
      if (s.equals(WhitespaceLexer.NEWLINE)) {
        // if newlines were significant, we should make sure to return
        // them when we see them
        List<HasWord> se = Collections.<HasWord>singletonList(token);
        wordIter = se.iterator();
      } else {
        List<HasWord> se = wordSegmenter.segment(s);
        wordIter = se.iterator();
      }
    }
    return wordIter.next();
  }

  /**
   * Returns a factory whose tokenizers segment their input with {@code wordSegmenter}.
   *
   * @param wordSegmenter the segmenter every created tokenizer will use
   * @return a new tokenizer factory bound to {@code wordSegmenter}
   */
  public static TokenizerFactory<HasWord> factory(WordSegmenter wordSegmenter) {
    return new WordSegmentingTokenizerFactory(wordSegmenter);
  }

  /**
   * Factory producing {@link WordSegmentingTokenizer}s that share one segmenter.
   * The only option it reads is the boolean property {@code tokenizeNLs}, which is
   * forwarded to the underlying whitespace tokenizer.
   */
  private static class WordSegmentingTokenizerFactory implements TokenizerFactory<HasWord>, Serializable {

    private static final long serialVersionUID = -4697961121607489828L;

    /** Default for the {@code tokenizeNLs} option; changed by {@link #setOptions(String)}. */
    boolean tokenizeNLs = false;

    /** Segmenter handed to every tokenizer this factory creates. */
    private final WordSegmenter segmenter;

    public WordSegmentingTokenizerFactory(WordSegmenter wordSegmenter) {
      segmenter = wordSegmenter;
    }

    /** Returns a tokenizer over {@code r}, viewed as a plain iterator. */
    public Iterator<HasWord> getIterator(Reader r) {
      return getTokenizer(r);
    }

    /** Returns a tokenizer over {@code r} using this factory's current options. */
    public Tokenizer<HasWord> getTokenizer(Reader r) {
      return getTokenizer(r, null);
    }

    /**
     * Returns a tokenizer over {@code r}.
     *
     * @param r the reader to take input from
     * @param extraOptions option string parsed into properties; a
     *        {@code tokenizeNLs} entry overrides this factory's setting for the
     *        returned tokenizer only. May be {@code null}.
     * @return a new word-segmenting tokenizer
     */
    public Tokenizer<HasWord> getTokenizer(Reader r, String extraOptions) {
      boolean tokenizeNewlines = this.tokenizeNLs;
      if (extraOptions != null) {
        Properties prop = StringUtils.stringToProperties(extraOptions);
        tokenizeNewlines = PropertiesUtils.getBool(prop, "tokenizeNLs", this.tokenizeNLs);
      }
      return new WordSegmentingTokenizer(segmenter, WhitespaceTokenizer.newCoreLabelWhitespaceTokenizer(r, tokenizeNewlines));
    }

    /**
     * Updates this factory's {@code tokenizeNLs} setting from an option string.
     *
     * @param options option string parsed into properties; when it carries no
     *        {@code tokenizeNLs} entry the current value is kept. A {@code null}
     *        string is ignored, matching how {@code getTokenizer} treats
     *        {@code null} extra options.
     */
    public void setOptions(String options) {
      if (options == null) {
        return;
      }
      Properties prop = StringUtils.stringToProperties(options);
      tokenizeNLs = PropertiesUtils.getBool(prop, "tokenizeNLs", tokenizeNLs);
    }

  }

}