All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.ie.regexp.NumberSequenceClassifier Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.ie.regexp; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter;
import edu.stanford.nlp.time.TimeAnnotations;
import edu.stanford.nlp.time.TimeExpressionExtractor;
import edu.stanford.nlp.time.TimeExpressionExtractorFactory;
import edu.stanford.nlp.time.Timex;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PaddedList;
import edu.stanford.nlp.util.StringUtils;

import java.io.ObjectInputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;

/**
 * A set of deterministic rules for marking certain entities, to add
 * categories and to correct for failures of statistical NER taggers.
 * This is an extremely simple and ungeneralized implementation of
 * AbstractSequenceClassifier that was written for PASCAL RTE.
 * It could profitably be extended and generalized.
 * It marks a NUMBER category based on part-of-speech tags in a
 * deterministic manner.
 * It marks an ORDINAL category based on word form in a deterministic manner.
 * It tags as MONEY currency signs and things tagged CD after a currency sign.
 * It marks a number before a month name as a DATE.
 * It marks as a DATE a word of the form xx/xx/xxxx
 * (where x is a digit from a suitable range).
 * It marks as a TIME a word of the form x(x):xx (where x is a digit).
 * It marks everything else tagged "CD" as a NUMBER, and instances
 * of "and" appearing between CD tags in contexts suggestive of a number.
 * It requires text to be POS-tagged (have the getString(TagAnnotation.class) attribute).
 * Effectively these rules assume that
 * this classifier will be used as a secondary classifier by
 * code such as ClassifierCombiner: it will mark most CD as NUMBER, and it
 * is assumed that something else with higher priority is marking ones that
 * are PERCENT, ADDRESS, etc.
 *
 * @author Christopher Manning
 * @author Mihai (integrated with NumberNormalizer, SUTime)
 */
public class NumberSequenceClassifier extends AbstractSequenceClassifier  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(NumberSequenceClassifier.class);

  /** Compile-time switch for the tracing statements in this class. */
  private static final boolean DEBUG = false;

  /** Whether classification goes through SUTime or through the older rule-based path. */
  private final boolean useSUTime;

  /** Value of {@code useSUTime} used by the no-argument constructor. */
  public static final boolean USE_SUTIME_DEFAULT = TimeExpressionExtractorFactory.DEFAULT_EXTRACTOR_PRESENT;
  // NOTE(review): the two property names below are not read in the visible part of this file;
  // presumably callers use them to look up the useSUTime setting -- confirm against the callers.
  public static final String USE_SUTIME_PROPERTY = "ner.useSUTime";
  public static final String USE_SUTIME_PROPERTY_BASE = "useSUTime";
  /** Name handed to the extractor factory when the time expression extractor is created. */
  public static final String SUTIME_PROPERTY = "sutime";

  /** Time expression extractor; {@code null} when {@code useSUTime} is false. */
  private final TimeExpressionExtractor timexExtractor;

  public NumberSequenceClassifier() {
    this(new Properties(), USE_SUTIME_DEFAULT, new Properties());
    if (! CURRENCY_WORD_PATTERN.matcher("pounds").matches()) {
      log.info("NumberSequence: Currency pattern broken");
    }
  }

  /**
   * Creates a classifier with empty classifier properties and empty SUTime properties.
   *
   * @param useSUTime whether to classify through SUTime instead of the older rule-based path
   */
  public NumberSequenceClassifier(boolean useSUTime) {
    this(new Properties(), useSUTime, new Properties());
  }

  /**
   * Creates a classifier; the time expression extractor is only created when SUTime is requested.
   *
   * @param props properties handed to the superclass constructor
   * @param useSUTime whether to classify through SUTime
   * @param sutimeProps properties handed to the extractor factory; only read when {@code useSUTime} is true
   */
  public NumberSequenceClassifier(Properties props,
                                  boolean useSUTime, Properties sutimeProps) {
    super(props);
    this.useSUTime = useSUTime;
    this.timexExtractor = useSUTime
        ? TimeExpressionExtractorFactory.createExtractor(SUTIME_PROPERTY, sutimeProps)
        : null;
  }

  /**
   * Classify a {@link List} of {@link CoreLabel}s.
   * Delegates to {@link #classifyWithGlobalInformation} with neither a document
   * nor a sentence (both are passed as {@code null}).
   *
   * @param document A {@link List} of {@link CoreLabel}s.
   * @return the same {@link List}, but with the elements annotated
   *         with their answers.
   */
  @Override
  public List classify(List document) {
    return classifyWithGlobalInformation(document, null, null);
  }

  /**
   * Classifies the tokens through SUTime when it is enabled, and with the older
   * rule-based method otherwise. The document and sentence are only used on the SUTime path.
   *
   * @param tokens tokens to classify
   * @param document document-level annotations, forwarded to the SUTime path
   * @param sentence sentence the tokens belong to, forwarded to the SUTime path
   * @return the list returned by the chosen classification method
   */
  @Override
  public List classifyWithGlobalInformation(List tokens, final CoreMap document, final CoreMap sentence) {
    return useSUTime
        ? classifyWithSUTime(tokens, document, sentence)
        : classifyOld(tokens);
  }

  /**
   * Hands the document to the time expression extractor's {@code finalize} step.
   * Does nothing when SUTime is not in use.
   *
   * @param document the document passed on to the extractor
   */
  public void finalizeClassification(final CoreMap document) {
    if (!useSUTime) {
      return;
    }
    timexExtractor.finalize(document);
  }

  // todo [cdm, 2013]: Where does this call NumberNormalizer?  Is it the call buried in SUTime's TimeExpressionExtractorImpl?
  /**
   * Classification through SUTime: DATE/TIME labels come from the time expressions returned by
   * SUTime, number labels from the NumerizedTokensAnnotation read off the sentence afterwards.
   * Note: this runs multiple passes over the tokens (time expressions, numbers, CD-tagged tokens,
   *   then money/percent and ordinals).
   * The first three passes only write onto tokens whose answer is still the background symbol,
   * so an earlier pass wins over a later one; the money/percent and ordinal passes relabel
   * on top of that result.
   *
   * @param tokenSequence tokens to label in place
   * @param document document-level annotations handed to SUTime
   * @param sentence sentence the tokens belong to; when null a sentence is built from the tokens
   * @return {@code tokenSequence}
   */
  private List<CoreLabel> classifyWithSUTime(List<CoreLabel> tokenSequence, final CoreMap document, final CoreMap sentence) {
    final String background = flags.backgroundSymbol;

    //
    // set everything to "O" by default
    //
    for (CoreLabel token : tokenSequence) {
      if (token.get(CoreAnnotations.AnswerAnnotation.class) == null) {
        token.set(CoreAnnotations.AnswerAnnotation.class, background);
      }
    }

    //
    // run SUTime
    // note: SUTime requires TextAnnotation to be set at document/sent level and
    //   that the Character*Offset annotations be aligned with the token words.
    //
    CoreMap timeSentence = (sentence != null ?
        alignSentence(sentence) :
        buildSentenceFromTokens(tokenSequence));
    List<CoreMap> timeExpressions = runSUTime(timeSentence, document);
    List<CoreMap> numbers = timeSentence.get(CoreAnnotations.NumerizedTokensAnnotation.class);

    // The begin/end token indices used below are shifted by the sentence's TokenBeginAnnotation
    // (when there is one) to index into tokenSequence. The value does not change inside the
    // loops, so it is looked up once.
    int offset = 0;
    if (sentence != null) {
      Integer sentenceBegin = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
      if (sentenceBegin != null) {
        offset = sentenceBegin;
      }
    }

    //
    // store DATE and TIME
    //
    if (timeExpressions != null) {
      for (CoreMap timeExpression : timeExpressions) {
        // todo [cdm 2013]: We should also store these in the Sentence, but we've just got the list of tokens here
        Timex timex = timeExpression.get(TimeAnnotations.TimexAnnotation.class);
        if (timex == null) {
          continue;
        }
        int start = timeExpression.get(CoreAnnotations.TokenBeginAnnotation.class);
        int end = timeExpression.get(CoreAnnotations.TokenEndAnnotation.class);
        if (DEBUG) {
          log.info("FOUND DATE/TIME \"" + timeExpression +
              "\" with offsets " + start + " " + end +
              " and value " + timex);
          log.info("The above CoreMap has the following fields:");
        }
        String label = timex.timexType();
        for (int i = start; i < end; i++) {
          CoreLabel token = tokenSequence.get(i - offset);
          if (token.get(CoreAnnotations.AnswerAnnotation.class).equals(background)) {
            token.set(CoreAnnotations.AnswerAnnotation.class, label);
            token.set(TimeAnnotations.TimexAnnotation.class, timex);
          }
        }
      }
    }

    //
    // store the numbers found by SUTime as NUMBER if they are not part of anything else
    //
    if (numbers != null) {
      for (CoreMap number : numbers) {
        if (!number.containsKey(CoreAnnotations.NumericCompositeValueAnnotation.class)) {
          continue;
        }
        String type = number.get(CoreAnnotations.NumericCompositeTypeAnnotation.class);
        if (type == null) {
          continue;
        }
        int start = number.get(CoreAnnotations.TokenBeginAnnotation.class);
        int end = number.get(CoreAnnotations.TokenEndAnnotation.class);
        Number value = number.get(CoreAnnotations.NumericCompositeValueAnnotation.class);
        if (DEBUG) log.info("FOUND NUMBER \"" + number + "\" with offsets " + start + " " + end + " and value " + value + " and type " + type);
        for (int i = start; i < end; i++) {
          CoreLabel token = tokenSequence.get(i - offset);
          if (token.get(CoreAnnotations.AnswerAnnotation.class).equals(background)) {
            token.set(CoreAnnotations.AnswerAnnotation.class, type);
            if (value != null) {
              token.set(CoreAnnotations.NumericCompositeValueAnnotation.class, value);
            }
          }
        }
      }
    }

    // everything tagged as CD is also a number
    // NumberNormalizer probably catches these but let's be safe
    // use inverted "CD".equals() because tag could be null (if no POS info available)
    for (CoreLabel token : tokenSequence) {
      if ("CD".equals(token.tag()) &&
          token.get(CoreAnnotations.AnswerAnnotation.class).equals(background)) {
        token.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
      }
    }

    // extract money and percents
    moneyAndPercentRecognizer(tokenSequence);

    // ordinals
    // NumberNormalizer probably catches these but let's be safe
    ordinalRecognizer(tokenSequence);

    return tokenSequence;
  }

  /**
   * Returns a sentence whose text is available for SUTime.
   * A sentence that already has a TextAnnotation is returned as is. Otherwise a new sentence
   * is built from the tokens and character offsets of the given one, and the token begin/end
   * annotations of the given sentence are copied onto it.
   *
   * @param sentence the sentence to align
   * @return {@code sentence} itself, or the newly built replacement
   */
  public static CoreMap alignSentence(CoreMap sentence) {
    if (sentence.get(CoreAnnotations.TextAnnotation.class) != null) {
      // original text is preserved; no need to align anything
      return sentence;
    }

    final Integer charBegin = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    final Integer charEnd = sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    final CoreMap aligned = buildSentenceFromTokens(
        sentence.get(CoreAnnotations.TokensAnnotation.class), charBegin, charEnd);

    final Integer tokenBegin = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
    aligned.set(CoreAnnotations.TokenBeginAnnotation.class, tokenBegin);
    final Integer tokenEnd = sentence.get(CoreAnnotations.TokenEndAnnotation.class);
    aligned.set(CoreAnnotations.TokenEndAnnotation.class, tokenEnd);

    return aligned;
  }

  /**
   * Builds a sentence from tokens alone, without sentence-level character offsets
   * (both offsets are passed as {@code null} to the three-argument overload).
   */
  private static CoreMap buildSentenceFromTokens(List tokens) {
    return buildSentenceFromTokens(tokens, null, null);
  }

  /**
   * Builds a sentence (an {@link Annotation}) whose text is reconstructed from the tokens.
   * The text is taken from the tokens' OriginalTextAnnotation; when a token lacks it, the text is
   * rebuilt from TextAnnotation instead and the token character offsets are recomputed on a copy
   * of the tokens. Token begin/end are set to 0 and the number of tokens.
   *
   * @param tokens tokens of the sentence
   * @param characterOffsetStart sentence start offset; only used when both offsets are non-null
   *        and the token offsets did not have to be recomputed
   * @param characterOffsetEnd sentence end offset; same condition as the start offset
   * @return the new sentence
   * @throws RuntimeException when neither annotation yields a text for every token
   */
  private static CoreMap buildSentenceFromTokens(
      List<CoreLabel> tokens,
      Integer characterOffsetStart,
      Integer characterOffsetEnd) {

    //
    // Recover the sentence text:
    // a) build it from the OriginalTextAnnotation of each token
    // b) if not present, build it from the TextAnnotation of each token
    //
    boolean adjustCharacterOffsets = false;
    String text = buildText(tokens, CoreAnnotations.OriginalTextAnnotation.class);
    if (text == null) {
      text = buildText(tokens, CoreAnnotations.TextAnnotation.class);
      // character offset will point to the original tokens
      //   so we need to align them to the text built from normalized tokens
      adjustCharacterOffsets = true;
      if (text == null) {
        throw new RuntimeException("ERROR: to use SUTime, sentences must have TextAnnotation set, or the individual tokens must have OriginalTextAnnotation or TextAnnotation set!");
      }
    }

    // make sure token character offsets are aligned with text
    List<CoreLabel> tokenSequence = copyTokens(tokens, adjustCharacterOffsets, false);

    Annotation newSentence = new Annotation(text);
    newSentence.set(CoreAnnotations.TokensAnnotation.class, tokenSequence);
    if (!adjustCharacterOffsets &&
        characterOffsetStart != null &&
        characterOffsetEnd != null) {
      newSentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, characterOffsetStart);
      newSentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, characterOffsetEnd);
    } else if (tokenSequence.isEmpty()) {
      // no token to read offsets from; the reconstructed text is empty, so use an empty span
      newSentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, 0);
      newSentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, 0);
    } else {
      // span from the start of the first token to the end of the last one
      CoreLabel firstToken = tokenSequence.get(0);
      CoreLabel lastToken = tokenSequence.get(tokenSequence.size() - 1);
      int tokenCharStart = firstToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      int tokenCharEnd = lastToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      newSentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokenCharStart);
      newSentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokenCharEnd);
    }

    // some default token offsets
    newSentence.set(CoreAnnotations.TokenBeginAnnotation.class, 0);
    newSentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenSequence.size());

    return newSentence;
  }

  private static String buildText(List tokens, Class> textAnnotation) {
    StringBuilder os = new StringBuilder();
    for (int i = 0, sz = tokens.size(); i < sz; i ++) {
      CoreLabel crt = tokens.get(i);
      // System.out.println("\t" + crt.word() + "\t" + crt.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) + "\t" + crt.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
      if (i > 0) {
        CoreLabel prev = tokens.get(i - 1);
        int spaces = 1;
        if (crt.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
          spaces = crt.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) -
              prev.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        }
        while (spaces > 0) {
          os.append(' ');
          spaces--;
        }
      }
      String word = crt.get(textAnnotation);
      if (word == null) {
        // this annotation does not exist; bail out
        return null;
      }
      os.append(word);
    }
    return os.toString();
  }

  /**
   * Runs the time expression extractor on one sentence and returns its output unchanged.
   *
   * @param sentence sentence to extract time expressions from
   * @param document Contains document-level annotations such as DocDateAnnotation
   * @return the time expression CoreMaps produced by the extractor; the caller treats
   *         {@code null} as "nothing found"
   */
  private List<CoreMap> runSUTime(CoreMap sentence, final CoreMap document) {
    List<CoreMap> timeExpressions = timexExtractor.extractTimeExpressionCoreMaps(sentence, document);
    if (DEBUG && timeExpressions != null) {
      // routed through the class logger like the other debug output in this class
      log.info("FOUND TEMPORALS: " + timeExpressions);
    }
    return timeExpressions;
  }

  /**
   * Recognizes money and percents.
   * This accepts currency symbols (e.g., $) both before and after numbers; but it accepts units
   * (e.g., "dollar") only after numbers. A currency word or symbol after a number is skipped when
   * {@code leftScanFindsWeightWord} reports a weight word.
   * The matching token is relabeled, and so is the run of number tokens next to it.
   *
   * @param tokenSequence The list of tokens to find money and percents in
   */
  private void moneyAndPercentRecognizer(List<CoreLabel> tokenSequence) {
    for (int i = 0; i < tokenSequence.size(); i++) {
      CoreLabel crt = tokenSequence.get(i);
      CoreLabel next = (i < tokenSequence.size() - 1 ? tokenSequence.get(i + 1) : null);
      CoreLabel prev = (i > 0 ? tokenSequence.get(i - 1) : null);
      String word = crt.word();
      boolean currencySymbol = CURRENCY_SYMBOL_PATTERN.matcher(word).matches();

      // $5
      if (currencySymbol && next != null && isNumberOrCardinal(next)) {
        crt.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
        // continue the outer loop after the last token that was relabeled
        i = changeLeftToRight(tokenSequence, i + 1,
            next.get(CoreAnnotations.AnswerAnnotation.class),
            next.tag(), "MONEY") - 1;
      }

      // 5$, 5 dollars
      else if ((currencySymbol || CURRENCY_WORD_PATTERN.matcher(word).matches()) &&
               prev != null &&
               isNumberOrCardinal(prev) &&
               !leftScanFindsWeightWord(tokenSequence, i)) {
        crt.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
        changeRightToLeft(tokenSequence, i - 1,
            prev.get(CoreAnnotations.AnswerAnnotation.class),
            prev.tag(), "MONEY");
      }

      // 5%, 5 percent
      else if ((PERCENT_WORD_PATTERN.matcher(word).matches() ||
                PERCENT_SYMBOL_PATTERN.matcher(word).matches()) &&
               prev != null &&
               isNumberOrCardinal(prev)) {
        crt.set(CoreAnnotations.AnswerAnnotation.class, "PERCENT");
        changeRightToLeft(tokenSequence, i - 1,
            prev.get(CoreAnnotations.AnswerAnnotation.class),
            prev.tag(), "PERCENT");
      }
    }
  }

  /** Tells whether the token is labeled NUMBER or carries the POS tag CD. */
  private static boolean isNumberOrCardinal(CoreLabel token) {
    return "NUMBER".equals(token.get(CoreAnnotations.AnswerAnnotation.class)) ||
        "CD".equals(token.tag());
  }

  /**
   * Recognizes ordinal numbers: a token whose word matches {@code ORDINAL_PATTERN} is labeled
   * ORDINAL, provided its current answer is the background symbol or NUMBER.
   * Tokens without an answer are left alone.
   *
   * @param tokenSequence tokens to relabel in place
   */
  private void ordinalRecognizer(List<CoreLabel> tokenSequence) {
    for (CoreLabel crt : tokenSequence) {
      String answer = crt.get(CoreAnnotations.AnswerAnnotation.class);
      boolean relabelable = answer != null &&
          (answer.equals(flags.backgroundSymbol) || answer.equals("NUMBER"));
      if (relabelable && ORDINAL_PATTERN.matcher(crt.word()).matches()) {
        crt.set(CoreAnnotations.AnswerAnnotation.class, "ORDINAL");
      }
    }
  }

  /**
   * Relabels a run of tokens, walking towards the end of the list.
   * When {@code oldTag} is the background symbol the run consists of tokens with POS tag
   * {@code posTag}; otherwise it consists of tokens whose answer equals {@code oldTag}.
   *
   * @param tokens tokens to relabel in place
   * @param start index of the first token of the run
   * @param oldTag answer that identifies the run, or the background symbol to go by POS tag
   * @param posTag POS tag that identifies the run when {@code oldTag} is the background symbol
   * @param newTag answer to write onto every token of the run
   * @return index of the first token after the run ({@code tokens.size()} when the run reaches the end)
   */
  private int changeLeftToRight(List<CoreLabel> tokens,
      int start,
      String oldTag,
      String posTag,
      String newTag) {
    while (start < tokens.size()) {
      CoreLabel crt = tokens.get(start);
      if (oldTag.equals(flags.backgroundSymbol)) {
        // the NER tag is not set, so we scan for similar POS tags
        // (compared null-safely: a token may have no POS tag at all)
        String tag = crt.tag();
        boolean sameTag = (tag == null) ? posTag == null : tag.equals(posTag);
        if (!sameTag) {
          break;
        }
      } else if (!crt.get(CoreAnnotations.AnswerAnnotation.class).equals(oldTag)) {
        // we are scanning for a NER tag and found something different
        break;
      }

      crt.set(CoreAnnotations.AnswerAnnotation.class, newTag);
      start++;
    }
    return start;
  }

  /**
   * Relabels a run of tokens, walking towards the beginning of the list.
   * When {@code oldTag} is the background symbol the run consists of tokens with POS tag
   * {@code posTag}; otherwise it consists of tokens whose answer equals {@code oldTag}.
   *
   * @param tokens tokens to relabel in place
   * @param start index of the first (rightmost) token of the run
   * @param oldTag answer that identifies the run, or the background symbol to go by POS tag
   * @param posTag POS tag that identifies the run when {@code oldTag} is the background symbol
   * @param newTag answer to write onto every token of the run
   * @return index of the first token before the run ({@code -1} when the run reaches the beginning)
   */
  private int changeRightToLeft(List<CoreLabel> tokens,
      int start,
      String oldTag,
      String posTag,
      String newTag) {
    while (start >= 0) {
      CoreLabel crt = tokens.get(start);
      if (oldTag.equals(flags.backgroundSymbol)) {
        // the NER tag is not set, so we scan for similar POS tags
        // (compared null-safely: a token may have no POS tag at all)
        String tag = crt.tag();
        boolean sameTag = (tag == null) ? posTag == null : tag.equals(posTag);
        if (!sameTag) {
          break;
        }
      } else if (!crt.get(CoreAnnotations.AnswerAnnotation.class).equals(oldTag)) {
        // we are scanning for a NER tag and found something different
        break;
      }

      crt.set(CoreAnnotations.AnswerAnnotation.class, newTag);
      start--;
    }
    return start;
  }

  /**
   * Aligns the character offsets of these tokens with the actual text stored in each token.
   * The list is copied only when offsets must be adjusted or a copy is forced; otherwise the
   * original list is returned.
   * When adjusting, a token's length is that of its OriginalTextAnnotation if it has one, and of
   * its word otherwise. Tokens with both character offsets keep their start, shifted by the length
   * changes of the tokens before them; tokens without offsets are laid out one after another with
   * one space in between.
   *
   * @param srcList tokens to copy
   * @param adjustCharacterOffsets If true, it adjust the character offsets to match exactly with the token lengths
   * @param forceCopy if true, the tokens are copied even when no offsets need adjusting
   * @return {@code srcList} itself when nothing has to be done, otherwise a new list of copies
   */
  private static List<CoreLabel> copyTokens(List<CoreLabel> srcList,
      boolean adjustCharacterOffsets,
      boolean forceCopy) {
    // no need to adjust anything; use the original list
    if (!adjustCharacterOffsets && !forceCopy) {
      return srcList;
    }

    List<CoreLabel> dstList = new ArrayList<>(srcList.size());
    int adjustment = 0; // accumulated difference between new and original token lengths
    int offset = 0; // for when offsets are not available
    for (CoreLabel src : srcList) {
      if (!adjustCharacterOffsets) {
        // plain copy: offsets are taken over from the source token
        dstList.add(copyCoreLabel(src, null, null));
        continue;
      }

      int wordLength = src.containsKey(CoreAnnotations.OriginalTextAnnotation.class)
          ? src.get(CoreAnnotations.OriginalTextAnnotation.class).length()
          : src.word().length();

      // We try to preserve the old character offsets but they just don't work well for normalized token text
      // Also, in some cases, these offsets are not set
      if (src.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class) &&
          src.containsKey(CoreAnnotations.CharacterOffsetEndAnnotation.class)) {
        int origStart = src.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        int origEnd = src.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        int start = origStart + adjustment;
        dstList.add(copyCoreLabel(src, start, start + wordLength));
        adjustment += wordLength - (origEnd - origStart);
      } else {
        int start = offset;
        int end = start + wordLength;
        offset = end + 1; // allow for one space character
        dstList.add(copyCoreLabel(src, start, end));
      }
    }

    return dstList;
  }

  /**
   * Copies the numeric composite value, the numeric composite type and the Timex annotation
   * from one token to another. An annotation that the source does not contain is not touched
   * on the destination.
   *
   * @param src token to read the annotations from
   * @param dst token to write the annotations to
   */
  public static void transferAnnotations(CoreLabel src, CoreLabel dst) {
    // annotations potentially set by NumberNormalizer
    if (src.containsKey(CoreAnnotations.NumericCompositeValueAnnotation.class)) {
      Number numericValue = src.get(CoreAnnotations.NumericCompositeValueAnnotation.class);
      dst.set(CoreAnnotations.NumericCompositeValueAnnotation.class, numericValue);
    }
    if (src.containsKey(CoreAnnotations.NumericCompositeTypeAnnotation.class)) {
      String numericType = src.get(CoreAnnotations.NumericCompositeTypeAnnotation.class);
      dst.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, numericType);
    }

    // annotations set by SUTime
    if (src.containsKey(TimeAnnotations.TimexAnnotation.class)) {
      Timex timex = src.get(TimeAnnotations.TimexAnnotation.class);
      dst.set(TimeAnnotations.TimexAnnotation.class, timex);
    }
  }

  /**
   * Create a copy of srcTokens, detecting on the fly if character offsets need adjusting.
   * Offsets are adjusted when there is no sentence, the sentence has no TextAnnotation, there are
   * no tokens, or the first token has no OriginalTextAnnotation.
   *
   * @param srcTokens tokens to copy
   * @param srcSentence sentence the tokens belong to; may be null
   * @return a new list holding copies of the tokens
   */
  public static List<CoreLabel> copyTokens(List<CoreLabel> srcTokens, CoreMap srcSentence) {
    boolean adjustCharacterOffsets =
        srcSentence == null ||
        srcSentence.get(CoreAnnotations.TextAnnotation.class) == null ||
        srcTokens.isEmpty() ||
        srcTokens.get(0).get(CoreAnnotations.OriginalTextAnnotation.class) == null;

    return copyTokens(srcTokens, adjustCharacterOffsets, true);
  }

  /**
   * Copies only the fields required for numeric entity extraction into the new CoreLabel:
   * word, tag, OriginalTextAnnotation (when present), the character offsets, and whatever
   * {@code transferAnnotations} carries over.
   *
   * @param src Source CoreLabel to copy.
   * @param startOffset begin offset for the copy; when null the source's begin offset is used
   * @param endOffset end offset for the copy; when null the source's end offset is used
   * @return the new CoreLabel
   */
  private static CoreLabel copyCoreLabel(CoreLabel src, Integer startOffset, Integer endOffset) {
    final CoreLabel copy = new CoreLabel();
    copy.setWord(src.word());
    copy.setTag(src.tag());

    final boolean hasOriginalText = src.containsKey(CoreAnnotations.OriginalTextAnnotation.class);
    if (hasOriginalText) {
      String originalText = src.get(CoreAnnotations.OriginalTextAnnotation.class);
      copy.set(CoreAnnotations.OriginalTextAnnotation.class, originalText);
    }

    // an explicit offset wins; otherwise fall back to the offset stored on the source
    Integer begin = startOffset;
    if (begin == null) {
      begin = src.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    }
    copy.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);

    Integer end = endOffset;
    if (end == null) {
      end = src.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    }
    copy.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);

    transferAnnotations(src, copy);

    return copy;
  }

  /**
   * English month names and their abbreviations. Every abbreviation is accepted with or
   * without a trailing period (so both "Dec" and "Dec." match); "May" has no abbreviated form.
   */
  private static final Pattern MONTH_PATTERN = Pattern.compile("January|Jan\\.?|February|Feb\\.?|March|Mar\\.?|April|Apr\\.?|May|June|Jun\\.?|July|Jul\\.?|August|Aug\\.?|September|Sept?\\.?|October|Oct\\.?|November|Nov\\.?|December|Dec\\.?");

  /** A year: four digits starting with 1, 2 or 3, or two digits optionally preceded by an apostrophe. */
  private  static final Pattern YEAR_PATTERN = Pattern.compile("[1-3][0-9]{3}|'?[0-9]{2}");

  /** A day of the month, 1 to 31 without leading zero, optionally followed by "st", "nd" or "rd". */
  private static final Pattern DAY_PATTERN = Pattern.compile("(?:[1-9]|[12][0-9]|3[01])(?:st|nd|rd)?");

  /**
   * A date of the form xx/xx/xxxx: two groups of one or two digits and a four-digit year starting
   * with 1, 2 or 3. Each slash may be preceded by a backslash.
   */
  private static final Pattern DATE_PATTERN = Pattern.compile("(?:[1-9]|[0-3][0-9])\\\\?/(?:[1-9]|[0-3][0-9])\\\\?/[1-3][0-9]{3}");

  /**
   * A date of the form year, month, day separated by '-' or '/': a four-digit year starting with
   * 1 or 2, a month from 1 to 12 (leading zero optional), and a two-digit day.
   */
  private static final Pattern DATE_PATTERN2 = Pattern.compile("[12][0-9]{3}[-/](?:0?[1-9]|1[0-2])[-/][0-3][0-9]");

  /** A time of the form x:xx or xx:xx. */
  private static final Pattern TIME_PATTERN = Pattern.compile("[0-2]?[0-9]:[0-5][0-9]");

  /** A time of the form xx:xx:xx. */
  private static final Pattern TIME_PATTERN2 = Pattern.compile("[0-2][0-9]:[0-5][0-9]:[0-5][0-9]");

  /** "am" or "pm", each letter optionally followed by a period; case-insensitive. */
  private static final Pattern AM_PM = Pattern.compile("(a\\.?m\\.?)|(p\\.?m\\.?)", Pattern.CASE_INSENSITIVE);

  /** Currency unit words (dollar, cent, euro, pound with optional plural "s"; penny, pence, yen, yuan, won); case-insensitive. */
  public static final Pattern CURRENCY_WORD_PATTERN = Pattern.compile("(?:dollar|cent|euro|pound)s?|penny|pence|yen|yuan|won", Pattern.CASE_INSENSITIVE);

  // pattern matches: dollar; '#'; the two XML escapes of the pound sign (numeric 163 and named
  // "pound", both written as ampersand ... semicolon); pound sign, yen sign, euro, won;
  // other country dollars
  // TODO: Delete # as currency.  But doing this involves changing PTBTokenizer currency normalization rules
  // '#' was used for the pound sign in the ISO version of ASCII (ISO 646), and this is found in very old
  // materials, e.g., the 1999 Penn Treebank. It is STILL matched here, which wrongly recognizes currency
  // whenever someone refers to the #4 country etc.
  // The two XML escapes are written as one group so that the literal does not contain a complete
  // character entity.
  public static final Pattern CURRENCY_SYMBOL_PATTERN = Pattern.compile("\\$|#|&(?:#163|pound);|\u00A3|\u00A5|\u20AC|\u20A9|(?:US|HK|A|C|NT|S|NZ)\\$", Pattern.CASE_INSENSITIVE);  // TODO: No longer include archaic # for pound

  /**
   * Ordinals written with digits (e.g. 1st, 22nd, 13th, 40th, and 100th, 1000th, ...) or spelled out
   * (zeroth to thirty-first, the tens from fortieth to ninetieth, hundredth, thousandth, millionth).
   * The pattern starts with the inline case-insensitivity flag.
   */
  public static final Pattern ORDINAL_PATTERN = Pattern.compile("(?i)[2-9]?1st|[2-9]?2nd|[2-9]?3rd|1[0-9]th|[2-9]?[04-9]th|100+th|zeroth|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|twenty-first|twenty-second|twenty-third|twenty-fourth|twenty-fifth|twenty-sixth|twenty-seventh|twenty-eighth|twenty-ninth|thirtieth|thirty-first|fortieth|fiftieth|sixtieth|seventieth|eightieth|ninetieth|hundredth|thousandth|millionth");

  /** Four digits of which the first is 0, e.g. 0730. */
  public static final Pattern ARMY_TIME_MORNING = Pattern.compile("0([0-9])([0-9]){2}");

  /** Words naming a part of the day (lower case only; the pattern is case-sensitive). */
  public static final Pattern GENERIC_TIME_WORDS = Pattern.compile("(morning|evening|night|noon|midnight|teatime|lunchtime|dinnertime|suppertime|afternoon|midday|dusk|dawn|sunup|sundown|daybreak|day)");

  /** The word "percent"; case-insensitive. */
  public static final Pattern PERCENT_WORD_PATTERN = Pattern.compile("percent", Pattern.CASE_INSENSITIVE);
  /** The percent sign. */
  public static final Pattern PERCENT_SYMBOL_PATTERN = Pattern.compile("%");

  private List classifyOld(List document) {
    // if (DEBUG) { log.info("NumberSequenceClassifier tagging"); }
    PaddedList pl = new PaddedList<>(document, pad);
    for (int i = 0, sz = pl.size(); i < sz; i++) {
      CoreLabel me = pl.get(i);
      CoreLabel prev = pl.get(i - 1);
      CoreLabel next = pl.get(i + 1);
      CoreLabel next2 = pl.get(i + 2);
      //if (DEBUG) { log.info("Tagging:" + me.word()); }
      me.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol);
      if (CURRENCY_SYMBOL_PATTERN.matcher(me.word()).matches() &&
              (prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD") ||
               next.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD"))) {
        // dollar, pound, pound, yen,
        // Penn Treebank ancient # as pound, euro,
        if (DEBUG) {
          log.info("Found currency sign:" + me.word());
        }
        me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
      } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
        if (DEBUG) {
          log.info("Tagging CD:" + me.word());
        }

        if (TIME_PATTERN.matcher(me.word()).matches()) {
          me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
        } else if (TIME_PATTERN2.matcher(me.word()).matches()) {
            me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
        } else if (DATE_PATTERN.matcher(me.word()).matches()) {
          me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
        } else if (DATE_PATTERN2.matcher(me.word()).matches()) {
          me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");

        } else if (next.get(CoreAnnotations.TextAnnotation.class) != null &&
            me.get(CoreAnnotations.TextAnnotation.class) != null &&
            DAY_PATTERN.matcher(me.get(CoreAnnotations.TextAnnotation.class)).matches() &&
            MONTH_PATTERN.matcher(next.get(CoreAnnotations.TextAnnotation.class)).matches()) {
          // deterministically make DATE for British-style number before month
          me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
        } else if (prev.get(CoreAnnotations.TextAnnotation.class) != null &&
            MONTH_PATTERN.matcher(prev.get(CoreAnnotations.TextAnnotation.class)).matches() &&
            me.get(CoreAnnotations.TextAnnotation.class) != null &&
            DAY_PATTERN.matcher(me.get(CoreAnnotations.TextAnnotation.class)).matches()) {
          // deterministically make DATE for number after month
          me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
        } else if (rightScanFindsMoneyWord(pl, i) && ! leftScanFindsWeightWord(pl, i)) {
          me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
        } else if(ARMY_TIME_MORNING.matcher(me.word()).matches()) {
          me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
        } else
        if (YEAR_PATTERN.matcher(me.word()).matches() &&
            prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE") &&
            (MONTH_PATTERN.matcher(prev.word()).matches() ||
             pl.get(i - 2).get(CoreAnnotations.AnswerAnnotation.class).equals("DATE")))
        {
          me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
        } else {
          if (DEBUG) {
            log.info("Found number:" + me.word());
          }
          if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) {
            me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
          } else {
            me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
          }
        }
      } else if(AM_PM.matcher(me.word()).matches() &&
          prev.get(CoreAnnotations.AnswerAnnotation.class).equals("TIME")){
        me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
      } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class) != null &&
          me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals(",") &&
          prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE") &&
          next.word() != null && YEAR_PATTERN.matcher(next.word()).matches()) {
        me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
      } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNP") &&
          MONTH_PATTERN.matcher(me.word()).matches()) {
        if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE") ||
            next.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
          me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
        }
      } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class) != null &&
          me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CC")) {
        if (prev.tag() != null && prev.tag().equals("CD") &&
            next.tag() != null && next.tag().equals("CD") &&
            me.get(CoreAnnotations.TextAnnotation.class) != null &&
            me.get(CoreAnnotations.TextAnnotation.class).equalsIgnoreCase("and")) {
          if (DEBUG) {
            log.info("Found number and:" + me.word());
          }
          String wd = prev.word();
          if (wd.equalsIgnoreCase("hundred") ||
              wd.equalsIgnoreCase("thousand") ||
              wd.equalsIgnoreCase("million") ||
              wd.equalsIgnoreCase("billion") ||
              wd.equalsIgnoreCase("trillion"))
          {
            me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
          }
        }
      } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class) != null &&
          (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NN") ||
           me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNS"))) {
        if (CURRENCY_WORD_PATTERN.matcher(me.word()).matches()) {
          if (prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD") &&
              prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) {
            me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
          }
        } else if (me.word().equals("m") || me.word().equals("b")) {
          // could be metres, but it's probably million or billion in our
          // applications
          if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) {
            me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
          } else {
            me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
          }
        } else if (ORDINAL_PATTERN.matcher(me.word()).matches()) {
          if ((next.word() != null && MONTH_PATTERN.matcher(next.word()).matches()) ||
              (next.word() != null && next.word().equalsIgnoreCase("of") &&
               next2.word() != null && MONTH_PATTERN.matcher(next2.word()).matches())) {
            me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
          }
        } else if(GENERIC_TIME_WORDS.matcher(me.word()).matches()){
          me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
        }
      } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("JJ")) {
        if ((next.word() != null && MONTH_PATTERN.matcher(next.word()).matches()) ||
            next.word() != null && next.word().equalsIgnoreCase("of") &&
            next2.word() != null && MONTH_PATTERN.matcher(next2.word()).matches()) {
          me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
        } else if (ORDINAL_PATTERN.matcher(me.word()).matches()) {
          // don't do other tags: don't want 'second' as noun, or 'first' as adverb
          // introducing reasons
          me.set(CoreAnnotations.AnswerAnnotation.class, "ORDINAL");
        }
      } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("IN") &&
          me.word().equalsIgnoreCase("of")) {
        if (prev.get(CoreAnnotations.TextAnnotation.class) != null &&
            ORDINAL_PATTERN.matcher(prev.get(CoreAnnotations.TextAnnotation.class)).matches() &&
            next.get(CoreAnnotations.TextAnnotation.class) != null &&
            MONTH_PATTERN.matcher(next.get(CoreAnnotations.TextAnnotation.class)).matches()) {
          me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
        }
      }
    }
    return document;
  }

  /**
   * Look up to 3 tokens to the left of position {@code i} for something that
   * indicates weight not money (a word starting with "weigh").
   * The token at position {@code i} itself is not examined.
   *
   * @param pl The list of CoreLabel
   * @param i The position to scan left from
   * @return whether a weight word is found
   */
  private static boolean leftScanFindsWeightWord(List<CoreLabel> pl, int i) {
    if (DEBUG) {
      log.info("leftScan from: " + pl.get(i).word());
    }
    // Never step before the start of the list, and never more than 3 tokens back.
    int leftLimit = Math.max(0, i - 3);
    for (int j = i - 1; j >= leftLimit; j--) {
      String candidate = pl.get(j).word();
      // A token without text cannot be a weight word; skip it rather than throw.
      if (candidate != null && candidate.startsWith("weigh")) {
        if (DEBUG) {
          log.info("leftScan found weight: " + candidate);
        }
        return true;
      }
    }
    return false;
  }


  /**
   * Look along CD words and see if next thing is a money word
   * like cents or pounds. Starting at position {@code i}, tokens tagged CD are
   * skipped; the first token after that run is then tested: it must be tagged
   * NN or NNS and its word must match {@code CURRENCY_WORD_PATTERN}.
   *
   * @param pl The list of CoreLabel
   * @param i The position to scan right from
   * @return Whether a money word is found; false if the CD run reaches the end
   *         of the list
   */
  private static boolean rightScanFindsMoneyWord(List<CoreLabel> pl, int i) {
    int j = i;
    if (DEBUG) {
      log.info("rightScan from: " + pl.get(j).word());
    }
    int sz = pl.size();
    // Advance past the run of number (CD) tokens.
    while (j < sz && "CD".equals(pl.get(j).getString(CoreAnnotations.PartOfSpeechAnnotation.class))) {
      j++;
    }
    if (j >= sz) {
      return false;
    }
    CoreLabel following = pl.get(j);
    String tag = following.getString(CoreAnnotations.PartOfSpeechAnnotation.class);
    String word = following.word();
    // Compute the answer once so the debug output and the return value cannot diverge.
    boolean isMoneyWord = ("NN".equals(tag) || "NNS".equals(tag))
        && word != null
        && CURRENCY_WORD_PATTERN.matcher(word).matches();
    if (DEBUG) {
      log.info("rightScan testing: " + word + '/' + tag + "; answer is: " + isMoneyWord);
    }
    return isMoneyWord;
  }

  // Implement other methods of AbstractSequenceClassifier interface

  @Override
  public void train(Collection> docs,
                    DocumentReaderAndWriter readerAndWriter) {
  }

  /**
   * Logs a start message that names {@code serializePath}, then a completion
   * message. This method itself writes no data to that path.
   *
   * @param serializePath The path reported in the log output
   */
  @Override
  public void serializeClassifier(String serializePath) {
    final String startMessage = "Serializing classifier to " + serializePath + "...";
    log.info(startMessage);
    final String endMessage = "done.";
    log.info(endMessage);
  }

  /**
   * Does nothing: no data is written to {@code oos}.
   *
   * @param oos The output stream (unused)
   */
  public void serializeClassifier(ObjectOutputStream oos) {
    // Intentionally empty.
  }

  /**
   * Does nothing: neither {@code in} nor {@code props} is read.
   *
   * @param in The input stream (unused)
   * @param props The properties (unused)
   * @throws IOException declared in the signature; not thrown by this body
   * @throws ClassCastException declared in the signature; not thrown by this body
   * @throws ClassNotFoundException declared in the signature; not thrown by this body
   */
  @Override
  public void loadClassifier(ObjectInputStream in, Properties props)
      throws IOException, ClassCastException, ClassNotFoundException {
    // Intentionally empty.
  }

  /**
   * Command-line entry point. The arguments are converted to {@link Properties}
   * and used to construct a NumberSequenceClassifier; the classifier's flags
   * then select which steps run, in this order:
   * <ol>
   *   <li>{@code loadClassifier} set: load from that path and re-apply the
   *       properties; otherwise, if {@code trainFile} is set: train on it.</li>
   *   <li>{@code serializeTo} set: serialize to that path.</li>
   *   <li>{@code testFile} set: classify it using the classifier's own
   *       reader/writer.</li>
   *   <li>{@code textFile} set: classify it using a plain-text reader/writer.</li>
   * </ol>
   *
   * @param args Command-line arguments, parsed by {@code StringUtils.argsToProperties}
   * @throws Exception If any of the steps above fails
   */
  public static void main(String[] args) throws Exception {
    Properties props = StringUtils.argsToProperties(args);
    NumberSequenceClassifier nsc =
      new NumberSequenceClassifier(props, true, props);
    String trainFile = nsc.flags.trainFile;
    String testFile = nsc.flags.testFile;
    String textFile = nsc.flags.textFile;
    String loadPath = nsc.flags.loadClassifier;
    String serializeTo = nsc.flags.serializeTo;

    // Loading takes precedence over training.
    if (loadPath != null) {
      nsc.loadClassifierNoExceptions(loadPath);
      nsc.flags.setProperties(props);
    } else if (trainFile != null) {
      nsc.train(trainFile);
    }

    if (serializeTo != null) {
      nsc.serializeClassifier(serializeTo);
    }

    if (testFile != null) {
      nsc.classifyAndWriteAnswers(testFile, nsc.makeReaderAndWriter(), true);
    }

    if (textFile != null) {
      // Parameterized (not raw) so the call below is type-checked.
      DocumentReaderAndWriter<CoreLabel> readerAndWriter =
              new PlainTextDocumentReaderAndWriter<>();
      nsc.classifyAndWriteAnswers(textFile, readerAndWriter, false);
    }
  } // end main

}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy