All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.process.WordSegmentingTokenizer Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

The newest version!
package edu.stanford.nlp.process;

import java.io.Reader;
import java.io.Serializable;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;

/**
 * A tokenizer that works by calling a {@link WordSegmenter}.
 * This is used for Chinese and Arabic.
 *
 * <p>Input is first split by an underlying tokenizer (a whitespace tokenizer unless the caller
 * supplies another one). The word of each token it produces is handed to the segmenter, and the
 * words the segmenter returns are emitted one at a time.
 *
 *  @author Galen Andrew
 *  @author Spence Green
 */
public class WordSegmentingTokenizer extends AbstractTokenizer<HasWord> {

  /** Words produced from the most recently consumed token; {@code null} until the first token is read. */
  private Iterator<HasWord> wordIter;
  /** Underlying tokenizer that supplies the chunks to be segmented. */
  private final Tokenizer<CoreLabel> tok;
  /** Segmenter applied to the word of each chunk. */
  private final WordSegmenter wordSegmenter;

  /**
   * Creates a tokenizer that splits the reader's content with a CoreLabel whitespace tokenizer
   * and segments each resulting chunk.
   *
   * @param segmenter the segmenter applied to each chunk
   * @param r the reader to take input from
   */
  public WordSegmentingTokenizer(WordSegmenter segmenter, Reader r) {
    this(segmenter, WhitespaceTokenizer.newCoreLabelWhitespaceTokenizer(r));
  }

  /**
   * Creates a tokenizer that segments the word of every token produced by {@code tokenizer}.
   *
   * @param segmenter the segmenter applied to each token's word
   * @param tokenizer the underlying tokenizer supplying the tokens
   */
  public WordSegmentingTokenizer(WordSegmenter segmenter, Tokenizer<CoreLabel> tokenizer) {
    wordSegmenter = segmenter;
    tok = tokenizer;
  }

  /**
   * Returns the next segmented word.
   *
   * <p>When the current batch of words is used up, the next token is pulled from the underlying
   * tokenizer and segmented. Tokens for which the segmenter yields no words are skipped.
   *
   * @return the next word, or {@code null} when the underlying tokenizer has no more tokens or
   *     yields a token whose word is {@code null}
   */
  @Override
  protected HasWord getNext() {
    while (wordIter == null || ! wordIter.hasNext()) {
      if ( ! tok.hasNext()) {
        return null;
      }
      CoreLabel token = tok.next();
      String s = token.word();
      if (s == null) {
        return null;
      }
      if (s.equals(WhitespaceLexer.NEWLINE)) {
        // if newlines were significant, we should make sure to return
        // them when we see them
        List<HasWord> se = Collections.<HasWord>singletonList(token);
        wordIter = se.iterator();
      } else {
        List<HasWord> se = wordSegmenter.segment(s);
        wordIter = se.iterator();
      }
    }
    return wordIter.next();
  }

  /**
   * Returns a factory that builds {@code WordSegmentingTokenizer}s around the given segmenter.
   *
   * @param wordSegmenter the segmenter every created tokenizer will use
   * @return a new tokenizer factory
   */
  public static TokenizerFactory<HasWord> factory(WordSegmenter wordSegmenter) {
    return new WordSegmentingTokenizerFactory(wordSegmenter);
  }

  /** Factory producing {@link WordSegmentingTokenizer}s that share one segmenter. */
  private static class WordSegmentingTokenizerFactory implements TokenizerFactory<HasWord>, Serializable {
    private static final long serialVersionUID = -4697961121607489828L;

    /** Default for the newline flag passed to the whitespace tokenizer; changed via {@link #setOptions}. */
    boolean tokenizeNLs = false;
    private final WordSegmenter segmenter;

    public WordSegmentingTokenizerFactory(WordSegmenter wordSegmenter) {
      segmenter = wordSegmenter;
    }

    /** Same as {@link #getTokenizer(Reader)}; a tokenizer is itself an iterator. */
    @Override
    public Iterator<HasWord> getIterator(Reader r) {
      return getTokenizer(r);
    }

    /** Creates a tokenizer over {@code r} using this factory's current options. */
    @Override
    public Tokenizer<HasWord> getTokenizer(Reader r) {
      return getTokenizer(r, null);
    }

    /**
     * Creates a tokenizer over {@code r}.
     *
     * @param r the reader to take input from
     * @param extraOptions option string; a {@code tokenizeNLs} entry overrides the factory's
     *     setting for this tokenizer only. May be {@code null} to use the factory's setting.
     * @return a new tokenizer
     */
    @Override
    public Tokenizer<HasWord> getTokenizer(Reader r, String extraOptions) {
      boolean tokenizeNewlines = this.tokenizeNLs;
      if (extraOptions != null) {
        Properties prop = StringUtils.stringToProperties(extraOptions);
        tokenizeNewlines = PropertiesUtils.getBool(prop, "tokenizeNLs", this.tokenizeNLs);
      }

      return new WordSegmentingTokenizer(segmenter, WhitespaceTokenizer.newCoreLabelWhitespaceTokenizer(r, tokenizeNewlines));
    }

    /**
     * Updates the factory's {@code tokenizeNLs} setting from an option string; the current value
     * is kept when the option is absent.
     *
     * @param options option string to parse
     */
    @Override
    public void setOptions(String options) {
      Properties prop = StringUtils.stringToProperties(options);
      tokenizeNLs = PropertiesUtils.getBool(prop, "tokenizeNLs", tokenizeNLs);
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy