edu.stanford.nlp.wordseg.TagAffixDetector Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7

Show newest version

package edu.stanford.nlp.wordseg;

import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * Answers "1"/"0" membership questions against two dictionary-backed helpers:
 * a {@link CorpusChar} (queried with a pair of strings) and an
 * {@link AffixDictionary} (queried with a single string). Both helpers are
 * built from files located under a corpora dictionary directory chosen from
 * the supplied {@link SeqClassifierFlags}.
 *
 * @author Huihsin Tseng
 */
class TagAffixDetector {

  private static final Redwood.RedwoodChannels logger = Redwood.channels(TagAffixDetector.class);
  private static final boolean VERBOSE = false;

  /** Directory used when {@code flags.sighanCorporaDict} is not set. */
  private static final String DEFAULT_CORPORA_DICT = "/u/nlp/data/gale/segtool/stanford-seg/data";

  private final CorpusChar cc;
  private final AffixDictionary aD;

  /**
   * Builds the detector from the dictionary files selected by {@code flags}.
   *
   * <p>The base directory is {@code flags.sighanCorporaDict}, or
   * {@link #DEFAULT_CORPORA_DICT} when that field is {@code null}; a trailing
   * {@code '/'} is appended to a non-empty directory that lacks one.
   *
   * <p>File selection:
   * <ul>
   *   <li>none of {@code useChPos}, {@code useCTBChar2}, {@code usePKChar2} set:
   *       {@code dict/pos_close/char.ctb.list} and {@code dict/in.ctb};</li>
   *   <li>{@code useCTBChar2} set: {@code dict/character_list} and {@code dict/in.ctb};</li>
   *   <li>otherwise {@code usePKChar2} set: {@code dict/pos_open/character_list.pku.utf8}
   *       and {@code dict/in.pk}.</li>
   * </ul>
   *
   * @param flags classifier flags carrying the dictionary location and corpus switches
   * @throws IllegalArgumentException if any of {@code useASBCChar2},
   *         {@code useHKChar2}, {@code useMSRChar2} is combined with
   *         {@code useChPos}/{@code useCTBChar2}/{@code usePKChar2}, or if
   *         {@code useChPos} is set without {@code useCTBChar2} or {@code usePKChar2}
   */
  public TagAffixDetector(SeqClassifierFlags flags) {
    String corporaDict =
        (flags.sighanCorporaDict != null) ? flags.sighanCorporaDict : DEFAULT_CORPORA_DICT;

    // An empty directory is left empty so the resulting paths stay relative.
    if ( ! corporaDict.isEmpty() && ! corporaDict.endsWith("/")) {
      corporaDict += '/';
    }

    final String ccPath;
    final String adPath;
    if (flags.useChPos || flags.useCTBChar2 || flags.usePKChar2) {
      // if we're using POS information, override the ccPath
      // For now we only have list for CTB and PK
      if (flags.useASBCChar2 || flags.useHKChar2 || flags.useMSRChar2) {
        throw new IllegalArgumentException("only support settings for CTB and PK now.");
      }
      if (flags.useCTBChar2) {
        ccPath = corporaDict + "dict/character_list";
        adPath = corporaDict + "dict/in.ctb";
      } else if (flags.usePKChar2) {
        ccPath = corporaDict + "dict/pos_open/character_list.pku.utf8";
        adPath = corporaDict + "dict/in.pk";
      } else {
        // Only useChPos was set: there is no character list to pair it with.
        throw new IllegalArgumentException("none of flags.useXXXChar2 are on");
      }
    } else {
      ccPath = corporaDict + "dict/pos_close/char.ctb.list";
      adPath = corporaDict + "dict/in.ctb";
    }

    if (VERBOSE) {
      logger.info("TagAffixDetector: useChPos=" + flags.useChPos +
              " | useCTBChar2=" + flags.useCTBChar2 + " | usePKChar2=" + flags.usePKChar2);
      logger.info("TagAffixDetector: building TagAffixDetector from " + ccPath + " and " + adPath);
    }

    cc = new CorpusChar(ccPath);
    aD = new AffixDictionary(adPath);
  }

  /**
   * Normalizes the {@link CorpusChar} lookup for the given pair to "1" or "0".
   *
   * @param t2 first lookup key (presumably a tag; confirm against {@code CorpusChar.getTag})
   * @param c2 second lookup key (presumably a character; confirm against {@code CorpusChar.getTag})
   * @return {@code "1"} if {@code cc.getTag(t2, c2)} is exactly {@code "1"};
   *         {@code "0"} for any other result, including {@code null}
   */
  String checkDic(String t2, String c2 ) {
    // Literal-first comparison keeps a null lookup result from throwing.
    if ("1".equals(cc.getTag(t2, c2))) {
      return "1";
    }
    return "0";
  }

  /**
   * Normalizes the {@link AffixDictionary} lookup for the given string to "1" or "0".
   *
   * @param c2 lookup key (presumably a character; confirm against {@code AffixDictionary.getInDict})
   * @return {@code "1"} if {@code aD.getInDict(c2)} is exactly {@code "1"};
   *         {@code "0"} for any other result, including {@code null}
   */
  String checkInDic(String c2 ){
    // Literal-first comparison keeps a null lookup result from throwing.
    if ("1".equals(aD.getInDict(c2))) {
      return "1";
    }
    return "0";
  }

}