edu.stanford.nlp.ie.regexp.ChineseNumberSequenceClassifier Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.ie.regexp;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.time.TimeExpressionExtractor;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PaddedList;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.PrintStream;
import java.util.*;
import java.util.regex.Pattern;

/**
 * A simple rule-based classifier that detects NUMBERs in a sequence of Chinese tokens. This classifier mimics the
 * behavior of {@link edu.stanford.nlp.ie.regexp.NumberSequenceClassifier} (without using SUTime) and works on Chinese sequence.
 *
 * TODO: An interface needs to be used to reuse code for NumberSequenceClassifier
 * TODO: Ideally a Chinese version of SUTime needs to be used to provide more flexibility and accuracy.
 *
 * @author Yuhao Zhang
 * @author Peng Qi
 */
public class ChineseNumberSequenceClassifier extends AbstractSequenceClassifier {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ChineseNumberSequenceClassifier.class);

  private static final boolean DEBUG = false;

  private final boolean useSUTime;

  public static final boolean USE_SUTIME_DEFAULT = false;
  public static final String USE_SUTIME_PROPERTY = "ner.useSUTime";
  public static final String USE_SUTIME_PROPERTY_BASE = "useSUTime";
  public static final String SUTIME_PROPERTY = "sutime";

  private final TimeExpressionExtractor timexExtractor;

  public ChineseNumberSequenceClassifier() {
    this(new Properties(), USE_SUTIME_DEFAULT, new Properties());
  }

  public ChineseNumberSequenceClassifier(boolean useSUTime) {
    this(new Properties(), useSUTime, new Properties());
  }

  public ChineseNumberSequenceClassifier(Properties props, boolean useSUTime, Properties sutimeProps) {
    super(props);
    this.useSUTime = useSUTime;
    if(this.useSUTime) {
      // TODO: Need a Chinese version of SUTime
      log.warn("SUTime currently does not support Chinese. Ignore property ner.useSUTime.");
    }
    this.timexExtractor = null;
  }

  // All the tags we need
  public static final String NUMBER_TAG = "NUMBER";
  public static final String DATE_TAG = "DATE";
  public static final String TIME_TAG = "TIME";
  public static final String MONEY_TAG = "MONEY";
  public static final String ORDINAL_TAG = "ORDINAL";
  public static final String PERCENT_TAG = "PERCENT";

  // Patterns we need
  public static final Pattern CURRENCY_WORD_PATTERN =
      Pattern.compile("元|刀|(?:美|欧|澳|加|日|韩)元|英?镑|法郎|卢比|卢布|马克|先令|克朗|泰?铢|(?:越南)?盾|美分|便士|块钱|毛钱|角钱");
  // In theory 块 钱 should be separated by segmenter, but just in case segmenter fails
  // TODO(yuhao): Need to add support for 块 钱, 毛 钱, 角 钱, 角, 五 块 二
  public static final Pattern PERCENT_WORD_PATTERN1 = Pattern.compile("(?:百分之|千分之).+");
  public static final Pattern PERCENT_WORD_PATTERN2 = Pattern.compile(".+%");
  public static final Pattern DATE_PATTERN1 = Pattern.compile(".+(?:年代?|月份?|日|号|世纪)");
  public static final Pattern DATE_PATTERN2 = Pattern.compile("(?:星期|周|礼拜).+");
  public static final Pattern DATE_PATTERN3 = Pattern.compile("[0-9一二三四五六七八九零〇十]{2,4}");
  public static final Pattern DATE_PATTERN4 = Pattern.compile("(?:[0-9]{2,4}[/\\-\\.][0-9]+[/\\-\\.][0-9]+|[0-9]+[/\\-\\.][0-9]+[/\\-\\.][0-9]{2,4}|[0-9]+[/\\-\\.]?[0-9]+)");
  public static final Pattern DATE_PATTERN5 = Pattern.compile("[昨今明][天晨晚夜早]");
  public static final Pattern TIME_PATTERN1 = Pattern.compile(".+(?::|点|时)(?:过|欠|差)?(?:.+(?::|分)?|整?|钟?|.+刻)?(?:.+秒?)"); // This only works when POS = NT

  private static final Pattern CHINESE_AND_ARABIC_NUMERALS_PATTERN = Pattern.compile("[一二三四五六七八九零十〇\\d]+");
  // This is used to capture a special case of date in Chinese: 70 后 or 七零 后
  private static final String DATE_AGE_LOCALIZER = "后";

  // order it by number of characters DESC for handy one-by-one matching of string suffix
  public static final String[] CURRENCY_WORDS_VALUES = new String[] {"越南盾", "美元", "欧元", "澳元", "加元", "日元", "韩元",
      "英镑", "法郎", "卢比", "卢布", "马克", "先令", "克朗", "泰铢", "盾", "铢", "刀", "镑", "元"};

  public static final String[] DATE_WORDS_VALUES = new String[] {"明天", "后天", "昨天", "前天", "明年", "后年", "去年", "前年",
      "昨日", "明日", "来年", "上月", "本月", "目前", "今后", "未来", "日前", "最近", "当时", "后来", "那时", "这时", "今", "今天",
      "当今", "如今", "之后", "当代", "以前", "现在", "将来", "此时", "此前", "元旦"};
  public static final HashSet DATE_WORDS = new HashSet<>(Arrays.asList(DATE_WORDS_VALUES));

  public static final String[] TIME_WORDS_VALUES = new String[] {"早晨", "清晨", "凌晨", "上午", "中午", "下午", "傍晚", "晚上",
      "夜间", "晨间", "晚间", "午前", "午后", "早", "晚"};
  public static final HashSet TIME_WORDS = new HashSet<>(Arrays.asList(TIME_WORDS_VALUES));

  /**
   * Use a set of heuristic rules to assign NER tags to tokens.
   * @param document A {@link List} of something that extends {@link CoreMap}.
   * @return
   */
  @Override
  public List classify(List document) {
    // The actual implementation of the classifier
    PaddedList pl = new PaddedList<>(document, pad);
    for (int i = 0, sz = pl.size(); i < sz; i++) {
      CoreLabel me = pl.get(i);
      CoreLabel prev = pl.get(i - 1);
      CoreLabel next = pl.get(i + 1);
      // by default set to be "O"
      me.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol);

      // If current word is OD, label it as ORDINAL
      if(me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("OD")) {
        me.set(CoreAnnotations.AnswerAnnotation.class, ORDINAL_TAG);
      } else if(CURRENCY_WORD_PATTERN.matcher(me.word()).matches() &&
          prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
        // If current word is currency word and prev word is a CD
        me.set(CoreAnnotations.AnswerAnnotation.class, MONEY_TAG);
      } else if(me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
        // TODO(yuhao): Need to support Chinese captial numbers like 叁拾 (This won't be POS-tagged as CD).
        // If current word is a CD
        if(PERCENT_WORD_PATTERN1.matcher(me.word()).matches() ||
            PERCENT_WORD_PATTERN2.matcher(me.word()).matches()) {
          // If current word is a percent
          me.set(CoreAnnotations.AnswerAnnotation.class, PERCENT_TAG);
        } else if(rightScanFindsMoneyWord(pl, i)) {
          // If one the right finds a currency word
          me.set(CoreAnnotations.AnswerAnnotation.class, MONEY_TAG);
        } else if(me.word().length() == 2 && CHINESE_AND_ARABIC_NUMERALS_PATTERN.matcher(me.word()).matches() &&
            DATE_AGE_LOCALIZER.equals(next.word())) {
          // This is to extract a special case of DATE: 70 后 or 七零 后
          me.set(CoreAnnotations.AnswerAnnotation.class, DATE_TAG);
        } else {
          // Otherwise we should safely label it as NUMBER
          me.set(CoreAnnotations.AnswerAnnotation.class, NUMBER_TAG);
        }
      } else if(me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NT")) {
        // If current word is a NT (temporal noun)
        if(DATE_PATTERN1.matcher(me.word()).matches() ||
            DATE_PATTERN2.matcher(me.word()).matches() ||
            DATE_PATTERN3.matcher(me.word()).matches() ||
            DATE_PATTERN4.matcher(me.word()).matches() ||
            DATE_PATTERN5.matcher(me.word()).matches() ||
            DATE_WORDS.contains(me.word())) {
          me.set(CoreAnnotations.AnswerAnnotation.class, DATE_TAG);
        } else if(TIME_PATTERN1.matcher(me.word()).matches() ||
            TIME_WORDS.contains(me.word())) {
          me.set(CoreAnnotations.AnswerAnnotation.class, TIME_TAG);
        } else {
          // TIME may have more variants (really?) so always add as TIME by default
          me.set(CoreAnnotations.AnswerAnnotation.class, TIME_TAG);
        }
      } else if(DATE_AGE_LOCALIZER.equals(me.word()) && prev.word().length() == 2 &&
          CHINESE_AND_ARABIC_NUMERALS_PATTERN.matcher(prev.word()).matches()) {
        // Label 后 as DATE if the sequence is 70 后 or 七零 后
        me.set(CoreAnnotations.AnswerAnnotation.class, DATE_TAG);
      }
    }
    return document;
  }

  /**
   * Look along CD words and see if next thing is a money word.
   *
   * @param pl The list of CoreLabel
   * @param i The position to scan right from
   * @return Whether a money word is found
   */
  private static boolean rightScanFindsMoneyWord(List pl, int i) {
    int j = i;
    if (DEBUG) {
      log.info("rightScan from: " + pl.get(j).word());
    }
    int sz = pl.size();
    while (j < sz && pl.get(j).getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
      j++;
    }
    if (j >= sz) {
      return false;
    }
    String tag = pl.get(j).getString(CoreAnnotations.PartOfSpeechAnnotation.class);
    String word = pl.get(j).word();
    if (DEBUG) {
      log.info("rightScan testing: " + word + '/' + tag + "; answer is: " + Boolean.toString((tag.equals("NN") || tag.equals("NNS")) && CURRENCY_WORD_PATTERN.matcher(word).matches()));
    }
    return (tag.equals("M") || tag.equals("NN") || tag.equals("NNS")) && CURRENCY_WORD_PATTERN.matcher(word).matches();
  }

  @Override
  public List classifyWithGlobalInformation(List tokenSequence, CoreMap document, CoreMap sentence) {
    if(useSUTime) {
      log.fatal("ChineseNumberSequenceClassifier does not have SUTime implementation.");
    }
    return classify(tokenSequence);
  }

  @Override
  public void train(Collection> docs, DocumentReaderAndWriter readerAndWriter) {
    // Train is not needed for this rule-based classifier
  }

  @Override
  public void serializeClassifier(String serializePath) {
  }

  @Override
  public void serializeClassifier(ObjectOutputStream oos) {
  }

  @Override
  public void loadClassifier(ObjectInputStream in, Properties props) throws IOException, ClassCastException, ClassNotFoundException {
  }

  public static void main(String[] args) throws IOException {
    Properties props = StringUtils.argsToProperties("-props", "/Users/yuhao/Research/tmp/ChineseNumberClassifierProps.properties");
//    Properties props = StringUtils.argsToProperties("-props", "/Users/yuhao/Research/tmp/EnglishNumberClassifierProps.properties");
    props.setProperty("outputFormat", "text");
    props.setProperty("ssplit.boundaryTokenRegex", "\\n"); // one sentence per line
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    String docFileName = "/Users/yuhao/Research/tmp/chinese_number_examples.txt";
//    String docFileName = "/Users/yuhao/Research/tmp/english_number_examples.txt";
    List docLines = IOUtils.linesFromFile(docFileName);
    PrintStream out = new PrintStream(docFileName + ".out");
    for (String docLine : docLines) {
      Annotation sentenceAnnotation = new Annotation(docLine);
      pipeline.annotate(sentenceAnnotation);
      pipeline.prettyPrint(sentenceAnnotation, out);
      pipeline.prettyPrint(sentenceAnnotation, System.out);
    }

    out.close();
  }
}