package edu.stanford.nlp.ling.tokensregex;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.tokensregex.parser.ParseException;
import edu.stanford.nlp.ling.tokensregex.parser.TokenSequenceParseException;
import edu.stanford.nlp.ling.tokensregex.parser.TokenSequenceParser;
import edu.stanford.nlp.ling.tokensregex.types.Expression;
import edu.stanford.nlp.ling.tokensregex.types.Tags;
import edu.stanford.nlp.ling.tokensregex.types.Value;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.*;
import java.util.function.Predicate;


/**
 * Represents a list of assignment and extraction rules over sequence patterns.
 * See {@link SequenceMatchRules} for the syntax of rules.
 *
 * <p>
 * Assignment rules are used to assign a value to a variable for later use in
 * extraction rules or for expansions in patterns.
 *
 * <p>
 * Extraction rules are used to extract text/tokens matching regular expressions.
 * Extraction rules are grouped into stages, with each stage consisting of the following:
 * <ol>
 *   <li>Matching of rules over text and tokens.  These rules are applied directly
 *       on the text and tokens fields of the {@code CoreMap}.</li>
 *   <li>Matching of composite rules.  Matched expressions are merged, and composite rules
 *       are applied recursively until no more changes to the matched expressions are detected.</li>
 *   <li>Filtering of invalid expressions.  In this final phase, a filtering rule
 *       removes invalid expressions from the matched set.</li>
 * </ol>
 * The different stages are numbered and are applied in numeric order.
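 *
 * <p>
 * A minimal usage sketch (the rule file name {@code "rules.txt"} and the surrounding
 * pipeline setup are illustrative, not part of this class):
 * <pre>{@code
 * Env env = TokenSequencePattern.getNewEnv();
 * CoreMapExpressionExtractor<MatchedExpression> extractor =
 *     CoreMapExpressionExtractor.createExtractorFromFiles(env, "rules.txt");
 * for (CoreMap sentence : sentences) {
 *   List<MatchedExpression> matched = extractor.extractExpressions(sentence);
 * }
 * }</pre>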

 * @author Angel Chang
 * @see SequenceMatchRules
 */
public class CoreMapExpressionExtractor<T extends MatchedExpression> {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(CoreMapExpressionExtractor.class);

  private static boolean verbose = false;

  // TODO: Remove templating of MatchedExpressions (keep for now until TimeExpression rules can be decoupled)

  private final Env env;
  /* Keeps temporary tags created by extractor */
  private boolean keepTags = false;
  /* Collapses extraction rules - use with care */
  private boolean collapseExtractionRules = false;
  private final Class<CoreAnnotation<List<? extends CoreMap>>> tokensAnnotationKey;
  private final Map<Integer, Stage<T>> stages;

  /**
   * Describes one stage of extraction.
   * @param <T> Type of the matched expression
   */
  public static class Stage<T> {
    /** Whether to clear matched expressions from previous stages or not */
    boolean clearMatched = false;
    /**
     * Limit on the number of iterations for which the composite rules are applied
     * (prevents badly formed rules from iterating forever)
     */
    int limitIters = 50;
    /** Stage id (stages are applied in numeric order from low to high) */
    int stageId;
    /** Rules to extract matched expressions directly from tokens */
    SequenceMatchRules.ExtractRule<CoreMap, T> basicExtractRule;
    /** Rules to extract composite expressions (grouped in stages) */
    SequenceMatchRules.ExtractRule<List<? extends CoreMap>, T> compositeExtractRule;
    /** Filtering rule */
    Predicate<T> filterRule;

    private static <I,O> SequenceMatchRules.ExtractRule<I,O> addRule(SequenceMatchRules.ExtractRule<I,O> origRule,
                                                                     SequenceMatchRules.ExtractRule<I,O> rule) {
      SequenceMatchRules.ListExtractRule<I,O> r;
      if (origRule instanceof SequenceMatchRules.ListExtractRule) {
        r = (SequenceMatchRules.ListExtractRule<I,O>) origRule;
      } else {
        r = new SequenceMatchRules.ListExtractRule<>();
        if (origRule != null) r.addRules(origRule);
      }
      r.addRules(rule);
      return r;
    }

    private void addCompositeRule(SequenceMatchRules.ExtractRule<List<? extends CoreMap>, T> rule) {
      compositeExtractRule = addRule(compositeExtractRule, rule);
    }

    private void addBasicRule(SequenceMatchRules.ExtractRule<CoreMap, T> rule) {
      basicExtractRule = addRule(basicExtractRule, rule);
    }

    private void addFilterRule(Predicate<T> rule) {
      Filters.DisjFilter<T> r;
      if (filterRule instanceof Filters.DisjFilter) {
        r = (Filters.DisjFilter<T>) filterRule;
        r.addFilter(rule);
      } else {
        if (filterRule == null) {
          r = new Filters.DisjFilter<>(rule);
        } else {
          r = new Filters.DisjFilter<>(filterRule, rule);
        }
        filterRule = r;
      }
    }
  }

  /**
   * Creates an empty instance with no rules.
   */
  public CoreMapExpressionExtractor() {
    this(null);
  }

  /**
   * Creates a default instance with the specified environment
   * (using the default tokens annotation key as specified in the environment).
   * @param env Environment to use for binding variables and applying rules
   */
  public CoreMapExpressionExtractor(Env env) {
    this.stages = new HashMap<>();
    this.env = env;
    this.tokensAnnotationKey = EnvLookup.getDefaultTokensAnnotationKey(env);
    this.collapseExtractionRules = false;
    if (env != null) {
      this.collapseExtractionRules = Objects.equals((Boolean) env.get("collapseExtractionRules"), true);
      if (env.get("verbose") != null) {
        verbose = Objects.equals((Boolean) env.get("verbose"), true);
      }
    }
  }

  /**
   * Creates an instance with the specified environment and list of rules.
   * @param env Environment to use for binding variables and applying rules
   * @param rules List of rules for this extractor
   */
  public CoreMapExpressionExtractor(Env env, List<SequenceMatchRules.Rule> rules) {
    this(env);
    appendRules(rules);
  }
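
  // A sketch of how the environment flags read by the constructor above can be set
  // before construction (the bindings shown are illustrative, not required):
  //
  //   Env env = TokenSequencePattern.getNewEnv();
  //   env.bind("collapseExtractionRules", true);  // merge compatible tokens-regex rules
  //   CoreMapExpressionExtractor<MatchedExpression> extractor = new CoreMapExpressionExtractor<>(env);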

  /**
   * Add specified rules to this extractor.
   *
   * @param rules List of rules to add
   */
  public void appendRules(List<SequenceMatchRules.Rule> rules) {
    if (verbose) log.info("Read " + rules.size() + " rules");
    // Put rules into stages
    if (collapseExtractionRules) {
      rules = collapse(rules);
      if (verbose) log.info("Collapsed into " + rules.size() + " rules");
    }
    for (SequenceMatchRules.Rule r : rules) {
      if (r instanceof SequenceMatchRules.AssignmentRule) {
        // Nothing to do: assignments are added to the environment as they are parsed
        ((SequenceMatchRules.AssignmentRule) r).evaluate(env);
      } else if (r instanceof SequenceMatchRules.AnnotationExtractRule) {
        SequenceMatchRules.AnnotationExtractRule aer = (SequenceMatchRules.AnnotationExtractRule) r;
        Stage<T> stage = stages.get(aer.stage);
        if (stage == null) {
          stages.put(aer.stage, stage = new Stage<>());
          stage.stageId = aer.stage;
          Boolean clearMatched = (Boolean) env.getDefaults().get("stage.clearMatched");
          if (clearMatched != null) {
            stage.clearMatched = clearMatched;
          }
          Integer limitIters = (Integer) env.getDefaults().get("stage.limitIters");
          if (limitIters != null) {
            stage.limitIters = limitIters;
          }
        }
        if (aer.active) {
          if (SequenceMatchRules.FILTER_RULE_TYPE.equals(aer.ruleType)) {
            stage.addFilterRule(aer);
          } else if (aer.isComposite) {
            stage.addCompositeRule(aer);
          } else {
            stage.addBasicRule(aer);
          }
        } else {
          log.debug("Ignoring inactive rule: " + aer.name); // used to be INFO but annoyed Chris/users
        }
      }
    }
  }

  private SequenceMatchRules.AnnotationExtractRule createMergedRule(SequenceMatchRules.AnnotationExtractRule aerTemplate,
                                                                    List<TokenSequencePattern> patterns) {
    return SequenceMatchRules.createMultiTokenPatternRule(env, aerTemplate, patterns);
  }

  private List<SequenceMatchRules.Rule> collapse(List<SequenceMatchRules.Rule> rules) {
    List<SequenceMatchRules.Rule> collapsed = new ArrayList<>();
    List<TokenSequencePattern> patterns = null;
    SequenceMatchRules.AnnotationExtractRule aerTemplate = null;
    for (SequenceMatchRules.Rule rule : rules) {
      boolean ruleHandled = false;
      if (rule instanceof SequenceMatchRules.AnnotationExtractRule) {
        SequenceMatchRules.AnnotationExtractRule aer = (SequenceMatchRules.AnnotationExtractRule) rule;
        if (aer.hasTokensRegexPattern()) {
          if (aerTemplate == null || aerTemplate.isMostlyCompatible(aer)) {
            if (aerTemplate == null) {
              aerTemplate = aer;
            }
            if (patterns == null) {
              patterns = new ArrayList<>();
            }
            patterns.add((TokenSequencePattern) aer.pattern);
            ruleHandled = true;
          }
        }
      }
      // Did we handle this rule?
      if (!ruleHandled) {
        if (aerTemplate != null) {
          SequenceMatchRules.AnnotationExtractRule merged = createMergedRule(aerTemplate, patterns);
          collapsed.add(merged);
          aerTemplate = null;
          patterns = null;
        }
        collapsed.add(rule);
      }
    }
    if (aerTemplate != null) {
      SequenceMatchRules.AnnotationExtractRule merged = createMergedRule(aerTemplate, patterns);
      collapsed.add(merged);
    }
    return collapsed;
  }

  public Env getEnv() {
    return env;
  }

  public void setExtractRules(SequenceMatchRules.ExtractRule<CoreMap, T> basicExtractRule,
                              SequenceMatchRules.ExtractRule<List<? extends CoreMap>, T> compositeExtractRule,
                              Predicate<T> filterRule) {
    Stage<T> stage = new Stage<>();
    stage.basicExtractRule = basicExtractRule;
    stage.compositeExtractRule = compositeExtractRule;
    stage.filterRule = filterRule;
    this.stages.clear();
    this.stages.put(1, stage);
  }

  /**
   * Creates an extractor using the specified environment, reading the rules from the given filenames.
   * @param env Environment to use
   * @param filenames Rule files to read
   * @throws RuntimeException if a rule file cannot be parsed
   */
  public static CoreMapExpressionExtractor createExtractorFromFiles(Env env, String... filenames) throws RuntimeException {
    return createExtractorFromFiles(env, Arrays.asList(filenames));
  }

  /**
   * Creates an extractor using the specified environment, reading the rules from the given filenames.
   * @param env Environment to use
   * @param filenames Rule files to read
   * @throws RuntimeException if a rule file cannot be parsed
   */
  public static CoreMapExpressionExtractor createExtractorFromFiles(Env env, List<String> filenames) throws RuntimeException {
    CoreMapExpressionExtractor<MatchedExpression> extractor = new CoreMapExpressionExtractor<>(env);
    for (String filename : filenames) {
      try {
        if (verbose) log.info("Reading TokensRegex rules from " + filename);
        BufferedReader br = IOUtils.readerFromString(filename);
        TokenSequenceParser parser = new TokenSequenceParser();
        parser.updateExpressionExtractor(extractor, br);
        IOUtils.closeIgnoringExceptions(br);
      } catch (Exception ex) {
        throw new RuntimeException("Error parsing file: " + filename, ex);
      }
    }
    return extractor;
  }

  /**
   * Creates an extractor using the specified environment, reading the rules from the given filename.
   * @param env Environment to use
   * @param filename Rule file to read
   * @throws RuntimeException if the rule file cannot be parsed
   */
  public static CoreMapExpressionExtractor createExtractorFromFile(Env env, String filename) throws RuntimeException {
    return createExtractorFromFiles(env, Collections.singletonList(filename));
  }

  /**
   * Creates an extractor using the specified environment, reading the rules from the given string.
   * @param env Environment to use
   * @param str Rules in string form
   * @throws IOException if the string cannot be read
   * @throws ParseException if the rules cannot be parsed
   * @throws TokenSequenceParseException if a token sequence pattern is invalid
   */
  public static CoreMapExpressionExtractor createExtractorFromString(Env env, String str) throws IOException, ParseException, TokenSequenceParseException {
    TokenSequenceParser parser = new TokenSequenceParser();
    return parser.getExpressionExtractor(env, new StringReader(str));
  }
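
  // A minimal sketch of building an extractor from an inline rule string rather than a
  // rule file (the rule shown is illustrative; see SequenceMatchRules for the full syntax):
  //
  //   CoreMapExpressionExtractor extractor = CoreMapExpressionExtractor.createExtractorFromString(
  //       TokenSequencePattern.getNewEnv(),
  //       "{ ruleType: \"tokens\", pattern: ([{word:\"blue\"}]), result: \"COLOR\" }");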

  public Value getValue(String varname) {
    Expression expr = (Expression) env.get(varname);
    if (expr != null) {
      return expr.evaluate(env);
    } else {
      throw new RuntimeException("Unable to get expression for variable " + varname);
    }
  }

  private List<CoreMap> extractCoreMapsToList(List<CoreMap> res, CoreMap annotation) {
    List<T> exprs = extractExpressions(annotation);
    for (T expr : exprs) {
      res.add(expr.getAnnotation());
    }
    return res;
  }

  /**
   * Returns a list of CoreMaps that match the specified rules.
   *
   * @param annotation Annotation to match against
   */
  public List<CoreMap> extractCoreMaps(CoreMap annotation) {
    List<CoreMap> res = new ArrayList<>();
    return extractCoreMapsToList(res, annotation);
  }

  /**
   * Returns a list of merged tokens and original tokens.
   *
   * @param annotation Annotation to match against
   */
  public List<CoreMap> extractCoreMapsMergedWithTokens(CoreMap annotation) {
    List<CoreMap> res = extractCoreMaps(annotation);
    Integer startTokenOffset = annotation.get(CoreAnnotations.TokenBeginAnnotation.class);
    if (startTokenOffset == null) {
      startTokenOffset = 0;
    }
    final Integer startTokenOffsetFinal = startTokenOffset;
    List<CoreMap> merged = CollectionUtils.mergeListWithSortedMatchedPreAggregated(
        annotation.get(tokensAnnotationKey), res,
        (CoreMap in) -> Interval.toInterval(
            in.get(CoreAnnotations.TokenBeginAnnotation.class) - startTokenOffsetFinal,
            in.get(CoreAnnotations.TokenEndAnnotation.class) - startTokenOffsetFinal)
    );
    return merged;
  }

  public List<CoreMap> flatten(List<CoreMap> cms) {
    return flatten(cms, tokensAnnotationKey);
  }

  private static List<CoreMap> flatten(List<CoreMap> cms, Class key) {
    List<CoreMap> res = new ArrayList<>();
    for (CoreMap cm : cms) {
      if (cm.get(key) != null) {
        res.addAll((List<CoreMap>) cm.get(key));
      } else {
        res.add(cm);
      }
    }
    return res;
  }

  private void cleanupTags(Collection objs, Map<Object, Boolean> cleaned) {
    for (Object obj : objs) {
      if (!cleaned.containsKey(obj)) {
        cleaned.put(obj, false);
        if (obj instanceof CoreMap) {
          cleanupTags((CoreMap) obj, cleaned);
        } else if (obj instanceof Collection) {
          cleanupTags((Collection) obj, cleaned);
        }
        cleaned.put(obj, true);
      }
    }
  }

  private void cleanupTags(CoreMap cm) {
    cleanupTags(cm, new IdentityHashMap<>());
  }

  private void cleanupTags(CoreMap cm, Map<Object, Boolean> cleaned) {
    cm.remove(Tags.TagsAnnotation.class);
    for (Class key : cm.keySet()) {
      Object obj = cm.get(key);
      if (!cleaned.containsKey(obj)) {
        cleaned.put(obj, false);
        if (obj instanceof CoreMap) {
          cleanupTags((CoreMap) obj, cleaned);
        } else if (obj instanceof Collection) {
          cleanupTags((Collection) obj, cleaned);
        }
        cleaned.put(obj, true);
      }
    }
  }

  private Pair<List<? extends CoreMap>, List<T>> applyCompositeRule(
      SequenceMatchRules.ExtractRule<List<? extends CoreMap>, T> compositeExtractRule,
      List<? extends CoreMap> merged,
      List<T> matchedExpressions, int limit) {
    // Apply higher order rules
    boolean done = false;
    // Limit the number of times rules are applied, just in case
    int maxIters = limit;
    int iters = 0;
    while (!done) {
      List<T> newExprs = new ArrayList<>();
      boolean extracted = compositeExtractRule.extract(merged, newExprs);
      if (verbose && extracted) log.info("applyCompositeRule() extracting with " + compositeExtractRule + " from " + merged + " gives " + newExprs);
      if (extracted) {
        annotateExpressions(merged, newExprs);
        newExprs = MatchedExpression.removeNullValues(newExprs);
        if (!newExprs.isEmpty()) {
          newExprs = MatchedExpression.removeNested(newExprs);
          newExprs = MatchedExpression.removeOverlapping(newExprs);
          merged = MatchedExpression.replaceMerged(merged, newExprs);
          // Favor newly matched expressions over older ones
          newExprs.addAll(matchedExpressions);
          matchedExpressions = MatchedExpression.removeNested(newExprs);
          matchedExpressions = MatchedExpression.removeOverlapping(matchedExpressions);
        } else {
          extracted = false;
        }
      }
      done = !extracted;
      iters++;
      if (maxIters > 0 && iters >= maxIters) {
        if (verbose) {
          log.warn("Aborting application of composite rules: maximum iteration " + maxIters + " reached");
        }
        break;
      }
    }
    return new Pair<>(merged, matchedExpressions);
  }

  private static class CompositeMatchState<T> {
    List<? extends CoreMap> merged;
    List<T> matched;
    int iters;

    private CompositeMatchState(List<? extends CoreMap> merged, List<T> matched, int iters) {
      this.merged = merged;
      this.matched = matched;
      this.iters = iters;
    }
  }

  public List<T> extractExpressions(CoreMap annotation) {
    // Extract potential expressions
    List<T> matchedExpressions = new ArrayList<>();
    List<Integer> stageIds = new ArrayList<>(stages.keySet());
    Collections.sort(stageIds);
    for (int stageId : stageIds) {
      Stage<T> stage = stages.get(stageId);
      SequenceMatchRules.ExtractRule<CoreMap, T> basicExtractRule = stage.basicExtractRule;
      if (stage.clearMatched) {
        matchedExpressions.clear();
      }
      if (basicExtractRule != null) {
        basicExtractRule.extract(annotation, matchedExpressions);
        if (verbose && matchedExpressions != null) {
          log.info("extractExpressions() extracting with " + basicExtractRule + " from " + annotation + " gives " + matchedExpressions);
        }
        annotateExpressions(annotation, matchedExpressions);
        matchedExpressions = MatchedExpression.removeNullValues(matchedExpressions);
        matchedExpressions = MatchedExpression.removeNested(matchedExpressions);
        matchedExpressions = MatchedExpression.removeOverlapping(matchedExpressions);
      }
      List<? extends CoreMap> merged = MatchedExpression.replaceMergedUsingTokenOffsets(annotation.get(tokensAnnotationKey), matchedExpressions);
      SequenceMatchRules.ExtractRule<List<? extends CoreMap>, T> compositeExtractRule = stage.compositeExtractRule;
      if (compositeExtractRule != null) {
        Pair<List<? extends CoreMap>, List<T>> p = applyCompositeRule(
            compositeExtractRule, merged, matchedExpressions, stage.limitIters);
        merged = p.first();
        matchedExpressions = p.second();
      }
      matchedExpressions = filterInvalidExpressions(stage.filterRule, matchedExpressions);
    }
    Collections.sort(matchedExpressions, MatchedExpression.EXPR_TOKEN_OFFSETS_NESTED_FIRST_COMPARATOR);
    if (!keepTags) {
      cleanupTags(annotation);
    }
    return matchedExpressions;
  }

  private void annotateExpressions(CoreMap annotation, List<T> expressions) {
    // TODO: Logging can be excessive
    List<T> toDiscard = new ArrayList<>();
    for (T te : expressions) {
      // Add attributes and all
      if (te.annotation == null) {
        try {
          boolean extractOkay = te.extractAnnotation(env, annotation);
          if (verbose && extractOkay) {
            log.info("annotateExpressions() matched " + te + " from " + annotation);
          }
          if (!extractOkay) {
            // Things didn't turn out so well
            toDiscard.add(te);
            log.warn("Error extracting annotation from " + te /*+ ", " + te.getExtractErrorMessage() */);
          }
        } catch (Exception ex) {
          if (verbose) {
            log.warn("Error extracting annotation from " + te);
            log.warn(ex);
          }
        }
      }
    }
    expressions.removeAll(toDiscard);
  }

  private void annotateExpressions(List<? extends CoreMap> chunks, List<T> expressions) {
    // TODO: Logging can be excessive
    List<T> toDiscard = new ArrayList<>();
    for (T te : expressions) {
      // Add attributes and all
      try {
        boolean extractOkay = te.extractAnnotation(env, chunks);
        if (verbose && extractOkay) {
          log.info("annotateExpressions() matched " + te + " from " + chunks);
        }
        if (!extractOkay) {
          // Things didn't turn out so well
          toDiscard.add(te);
          log.warn("Error extracting annotation from " + te /*+ ", " + te.getExtractErrorMessage() */);
        }
      } catch (Exception ex) {
        if (verbose) {
          log.warn("Error extracting annotation from " + te);
          log.warn(ex);
        }
      }
    }
    expressions.removeAll(toDiscard);
  }

  private List<T> filterInvalidExpressions(Predicate<T> filterRule, List<T> expressions) {
    if (filterRule == null) return expressions;
    if (expressions.isEmpty()) return expressions;
    int nfiltered = 0;
    List<T> kept = new ArrayList<>(expressions.size()); // Approximate size
    for (T expr : expressions) {
      if (!filterRule.test(expr)) {
        kept.add(expr);
      } else {
        nfiltered++;
        // logger.warning("Filtering out " + expr.getText());
      }
    }
    if (nfiltered > 0 && verbose) {
      log.debug("Filtered " + nfiltered);
    }
    return kept;
  }

  /**
   * Keeps the temporary tags on the sentence after extraction has finished.
   * This can have potentially unexpected results if you run the same sentence through multiple extractors,
   * but it makes the extraction process 20+% faster.
   *
   * @return This object
   */
  public CoreMapExpressionExtractor keepTemporaryTags() {
    this.keepTags = true;
    return this;
  }

  public static void setVerbose(boolean v) {
    verbose = v;
  }

}



