edu.stanford.nlp.pipeline.ArabicSegmenterAnnotator Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English-language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher-level text understanding applications.

There is a newer version: 4.5.7

Show newest version

package edu.stanford.nlp.pipeline;

import java.util.*;

import edu.stanford.nlp.international.arabic.process.ArabicSegmenter;
import edu.stanford.nlp.ling.SegmenterCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * This class will add segmentation information to an Annotation.
 * It assumes that the original document is a List of sentences under the
 * SentencesAnnotation.class key, and that each sentence has a
 * TextAnnotation.class key. This Annotator adds corresponding
 * information under a CharactersAnnotation.class key prior to segmentation,
 * and a TokensAnnotation.class key with value of a List of CoreLabel
 * after segmentation.
 *
 * Based on the ChineseSegmenterAnnotator by Pi-Chuan Chang.
 *
 * @author Will Monroe
 */
public class ArabicSegmenterAnnotator implements Annotator  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ArabicSegmenterAnnotator.class);

  private ArabicSegmenter segmenter;
  private final boolean VERBOSE;

  private static final String DEFAULT_SEG_LOC =
    "/u/nlp/data/arabic-segmenter/arabic-segmenter-atb+bn+arztrain.ser.gz";

  public ArabicSegmenterAnnotator() {
    this(DEFAULT_SEG_LOC, false);
  }

  public ArabicSegmenterAnnotator(boolean verbose) {
    this(DEFAULT_SEG_LOC, verbose);
  }

  public ArabicSegmenterAnnotator(String segLoc, boolean verbose) {
    VERBOSE = verbose;
    Properties props = new Properties();
    loadModel(segLoc, props);
  }

  public ArabicSegmenterAnnotator(String name, Properties props) {
    String model = null;
    // Keep only the properties that apply to this annotator
    Properties modelProps = new Properties();
    String desiredKey = name + '.';
    for (String key : props.stringPropertyNames()) {
      if (key.startsWith(desiredKey)) {
        // skip past name and the subsequent "."
        String modelKey = key.substring(desiredKey.length());
        if (modelKey.equals("model")) {
          model = props.getProperty(key);
        } else {
          modelProps.setProperty(modelKey, props.getProperty(key));
        }
      }
    }
    this.VERBOSE = PropertiesUtils.getBool(props, name + ".verbose", false);
    if (model == null) {
      throw new RuntimeException("Expected a property " + name + ".model");
    }
    loadModel(model, modelProps);
  }

  @SuppressWarnings("unused")
  private void loadModel(String segLoc) {
    // don't write very much, because the CRFClassifier already reports loading
    if (VERBOSE) {
      log.info("Loading segmentation model ... ");
    }
    Properties modelProps = new Properties();
    modelProps.setProperty("model", segLoc);
    segmenter = ArabicSegmenter.getSegmenter(modelProps);
  }

  private void loadModel(String segLoc, Properties props) {
    // don't write very much, because the CRFClassifier already reports loading
    if (VERBOSE) {
      log.info("Loading Segmentation Model ... ");
    }
    Properties modelProps = new Properties();
    modelProps.setProperty("model", segLoc);
    modelProps.putAll(props);
    try {
      segmenter = ArabicSegmenter.getSegmenter(modelProps);
    } catch (RuntimeException e) {
      throw e;
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      log.info("Adding Segmentation annotation ... ");
    }
    List sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    if (sentences != null) {
      for (CoreMap sentence : sentences) {
        doOneSentence(sentence);
      }
    } else {
      doOneSentence(annotation);
    }
  }

  private void doOneSentence(CoreMap annotation) {
    String text = annotation.get(CoreAnnotations.TextAnnotation.class);
    List tokens = segmenter.segmentStringToTokenList(text);
    annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
  }


  @Override
  public Set> requires() {
    return Collections.emptySet();
  }

  @Override
  public Set> requirementsSatisfied() {
    return new HashSet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.BeforeAnnotation.class,
        CoreAnnotations.AfterAnnotation.class,
        CoreAnnotations.TokenBeginAnnotation.class,
        CoreAnnotations.TokenEndAnnotation.class,
        CoreAnnotations.PositionAnnotation.class,
        CoreAnnotations.IndexAnnotation.class,
        CoreAnnotations.OriginalTextAnnotation.class,
        CoreAnnotations.ValueAnnotation.class
    ));
  }

}