edu.stanford.nlp.patterns.surface.SurfacePatternFactory Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.patterns.surface;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.patterns.CandidatePhrase;
import edu.stanford.nlp.patterns.ConstantsAndVariables;
import edu.stanford.nlp.patterns.DataInstance;
import edu.stanford.nlp.patterns.PatternFactory;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.ArgumentParser;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Triple;

import java.nio.charset.Charset;
import java.util.*;

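/**
 * Factory that generates surface patterns, i.e. patterns made of the context
 * tokens to the left and/or right of a target token in a sentence.
 *
 * Created by sonalg on 10/27/14.
 */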
public class SurfacePatternFactory extends PatternFactory {

  /**
   * Emit patterns whose target token carries a POS tag restriction. At least
   * one of this flag and {@code addPatWithoutPOS} must be true; {@code setUp}
   * rejects the configuration otherwise.
   */
  @ArgumentParser.Option(name = "usePOS4Pattern")
  public static boolean usePOS4Pattern = true;

  /**
   * Truncate the POS tag of the target token to (at most) its first two
   * characters before using it in a pattern.
   */
  @ArgumentParser.Option(name = "useCoarsePOS")
  public static boolean useCoarsePOS = true;

  /**
   * Also emit patterns whose target token has no POS tag restriction. At
   * least one of this flag and {@code usePOS4Pattern} must be true.
   */
  @ArgumentParser.Option(name = "addPatWithoutPOS")
  public static boolean addPatWithoutPOS = true;

  /**
   * Smallest context, in tokens, that may form a pattern: contexts with
   * fewer tokens than this are not turned into patterns.
   */
  @ArgumentParser.Option(name = "minWindow4Pattern")
  public static int minWindow4Pattern = 2;

  /**
   * Largest context, in tokens, collected on one side of the target token.
   * A pattern that uses both the left and the right context can therefore
   * span up to twice this many context tokens.
   */
  @ArgumentParser.Option(name = "maxWindow4Pattern")
  public static int maxWindow4Pattern = 4;

  /**
   * Build patterns from the tokens to the left of the target token.
   */
  @ArgumentParser.Option(name = "usePreviousContext")
  public static boolean usePreviousContext = true;

  /**
   * Build patterns from the tokens to the right of the target token.
   */
  @ArgumentParser.Option(name = "useNextContext")
  public static boolean useNextContext = false;

  /**
   * Threshold for contexts (left or right) that consist only of stop words:
   * such a context is used only when it has enough stop words. The intent is
   * to keep patterns like "I am on X" but drop "on X".
   * NOTE(review): the original wording says "equal or more than this", but
   * {@code getContext} compares the stop word count with a strict
   * {@code >} -- confirm which one is intended.
   */
  @ArgumentParser.Option(name = "numMinStopWordsToAdd")
  public static int numMinStopWordsToAdd = 3;


  /**
   * Add the tag of the target's parent in the parse tree as a restriction on
   * the target phrase of the patterns.
   */
  @ArgumentParser.Option(name = "useTargetParserParentRestriction")
  public static boolean useTargetParserParentRestriction = false;

  /**
   * When a context token's NER tag is not the background symbol, generalize
   * that context token to its NER tag.
   */
  @ArgumentParser.Option(name = "useContextNERRestriction")
  public static boolean useContextNERRestriction = false;

  /**
   * Treat filler words (such as "a", "an", "the") as ignorable: they are
   * skipped when collecting context tokens, and the filler token configured
   * in {@code setUp} is placed between context tokens.
   */
  @ArgumentParser.Option(name = "useFillerWordsInPat")
  public static boolean useFillerWordsInPat = true;



  public static enum Genre {
    PREV, NEXT, PREVNEXT
  };

  static Token fw, sw;

  public static void setUp(Properties props){
    ArgumentParser.fillOptions(PatternFactory.class, props);
    ArgumentParser.fillOptions(SurfacePatternFactory.class, props);
    ArgumentParser.fillOptions(SurfacePattern.class, props);

    if (!addPatWithoutPOS && !usePOS4Pattern) {
      throw new RuntimeException(
        "addPatWithoutPOS and usePOS4Pattern both cannot be false ");
    }

    fw = new Token(PatternType.SURFACE);
    if (useFillerWordsInPat) {
      fw.setEnvBindRestriction("$FILLER");
      fw.setNumOcc(0,2);
    }
    sw = new Token(PatternType.SURFACE);
    if (useStopWordsBeforeTerm) {
      sw.setEnvBindRestriction("$STOPWORD");
      sw.setNumOcc(0, 2);
    }
  }


  /**
   * Creates the surface patterns around the token at position {@code i}.
   *
   * For every window size from 1 to {@code maxWindow4Pattern} the method
   * collects up to that many context tokens on the left (when
   * {@code usePreviousContext}) and on the right (when
   * {@code useNextContext}) of the target token, skipping filler words when
   * {@code useFillerWordsInPat} is set. Qualifying contexts are turned into
   * {@code PREV}, {@code NEXT} and {@code PREVNEXT} patterns, each with a
   * POS-restricted target ({@code usePOS4Pattern}) and/or an unrestricted
   * target ({@code addPatWithoutPOS}).
   *
   * Assumes the target token has a POS tag when both {@code usePOS4Pattern}
   * and {@code useCoarsePOS} are set (the tag is dereferenced).
   *
   * @param sent tokens of the sentence
   * @param i index of the target token in {@code sent}
   * @param stopWords stop words, passed on to {@code doNotUse}
   * @return the union of the previous-, next- and both-sided patterns
   *         (a set of {@code SurfacePattern})
   */
  public static Set getContext(List<CoreLabel> sent, int i, Set stopWords) {

    Set<SurfacePattern> prevpatterns = new HashSet<>();
    Set<SurfacePattern> nextpatterns = new HashSet<>();
    Set<SurfacePattern> prevnextpatterns = new HashSet<>();
    CoreLabel token = sent.get(i);
    String tag = null;
    if (usePOS4Pattern) {
      String fulltag = token.tag();
      if (useCoarsePOS)
        tag = fulltag.substring(0, Math.min(fulltag.length(), 2));
      else
        tag = fulltag;
    }
    String nerTag = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
    for (int maxWin = 1; maxWin <= maxWindow4Pattern; maxWin++) {
      List<Token> previousTokens = new ArrayList<>();
      // Plain-text rendering of the previous context; only used for the ASCII check below.
      List<String> originalPrev = new ArrayList<>();
      List<Token> nextTokens = new ArrayList<>();

      int numStopWordsprev = 0, numStopWordsnext = 0;
      int numNonStopWordsNext = 0, numNonStopWordsPrev = 0;
      boolean useprev = false, usenext = false;

      // Target token without a POS restriction.
      PatternToken twithoutPOS = null;
      //TODO: right now using numWordsCompoundMax.
      if (addPatWithoutPOS) {
        twithoutPOS = new PatternToken(tag, false,
          numWordsCompoundMax > 1, numWordsCompoundMax,
          nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.get(CoreAnnotations.GrandparentAnnotation.class));
      }

      // Target token with a POS restriction.
      PatternToken twithPOS = null;
      if (usePOS4Pattern) {
        twithPOS = new PatternToken(tag, true,
          numWordsCompoundMax > 1, numWordsCompoundMax,
          nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.get(CoreAnnotations.GrandparentAnnotation.class));
      }

      if (usePreviousContext) {
        // Walk leftwards from the target; numTokens counts only tokens actually kept.
        int j = i - 1;
        int numTokens = 0;
        while (numTokens < maxWin && j >= 0) {
          CoreLabel tokenj = sent.get(j);

          String tokenjStr;
          if (useLemmaContextTokens)
            tokenjStr = tokenj.lemma();
          else
            tokenjStr = tokenj.word();

          // Filler words do not count towards the window. Locale.ROOT keeps the
          // lookup independent of the default locale.
          if (useFillerWordsInPat
            && fillerWords.contains(tokenj.word().toLowerCase(Locale.ROOT))) {
            j--;
            continue;
          }

          Triple<Boolean, Token, String> tr = getContextTokenStr(tokenj);
          boolean isLabeledO = tr.first;
          Token strgeneric = tr.second;
          String strOriginal = tr.third;

          if (!isLabeledO) {
            // Generalized token: use its label restriction instead of the word.
            previousTokens.add(0, strgeneric);
            originalPrev.add(0, strOriginal);
            numNonStopWordsPrev++;
          } else if (tokenj.word().startsWith("http")) {
            // A URL-like token discards the whole previous context for this window.
            previousTokens.clear();
            originalPrev.clear();
            break;
          } else {
            Token str = SurfacePattern.getContextToken(tokenj);
            previousTokens.add(0, str);
            originalPrev.add(0, tokenjStr);
            if (doNotUse(tokenjStr, stopWords)) {
              numStopWordsprev++;
            } else
              numNonStopWordsPrev++;
          }
          numTokens++;
          j--;
        }
      }

      if (useNextContext) {
        // Walk rightwards from the target; mirrors the previous-context loop.
        int numTokens = 0;
        int j = i + 1;
        while (numTokens < maxWin && j < sent.size()) {
          CoreLabel tokenj = sent.get(j);

          String tokenjStr;
          if (useLemmaContextTokens)
            tokenjStr = tokenj.lemma();
          else
            tokenjStr = tokenj.word();

          // Filler words do not count towards the window.
          if (useFillerWordsInPat
            && fillerWords.contains(tokenj.word().toLowerCase(Locale.ROOT))) {
            j++;
            continue;
          }

          Triple<Boolean, Token, String> tr = getContextTokenStr(tokenj);
          boolean isLabeledO = tr.first;
          Token strgeneric = tr.second;

          if (!isLabeledO) {
            numNonStopWordsNext++;
            nextTokens.add(strgeneric);
          } else if (tokenj.word().startsWith("http")) {
            // A URL-like token discards the whole next context for this window.
            nextTokens.clear();
            break;
          } else {
            Token str = SurfacePattern.getContextToken(tokenj);
            nextTokens.add(str);
            if (doNotUse(tokenjStr, stopWords)) {
              numStopWordsnext++;
            } else
              numNonStopWordsNext++;
          }
          j++;
          numTokens++;
        }
      }

      Token[] prevContext = null;
      // Use the previous context only if it is long enough and is not made up
      // of just a few stop words.
      if (previousTokens.size() >= minWindow4Pattern
        && (numNonStopWordsPrev > 0 || numStopWordsprev > numMinStopWordsToAdd)) {

        List<Token> prevContextList = new ArrayList<>();
        List<String> prevOriginal = new ArrayList<>();
        // Each context token is followed by the filler token (when configured).
        for (Token p : previousTokens) {
          prevContextList.add(p);
          if (!fw.isEmpty())
            prevContextList.add(fw);
        }

        // Same layout for the plain-text rendering.
        for (String p : originalPrev) {
          prevOriginal.add(p);
          if (!fw.isEmpty())
            prevOriginal.add(" FW ");
        }

        // The stop-word token sits between the context and the target.
        if (!sw.isEmpty()) {
          prevContextList.add(sw);
          prevOriginal.add(" SW ");
        }

        // Non-ASCII previous contexts are not turned into patterns.
        if (isASCII(StringUtils.join(prevOriginal))) {
          prevContext = prevContextList.toArray(new Token[0]);
          if (twithoutPOS != null) {
            SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS,
              null, Genre.PREV);
            prevpatterns.add(pat);
          }
          if (twithPOS != null) {
            SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS,
              null, Genre.PREV);
            prevpatterns.add(patPOS);
          }
          useprev = true;
        }
      }

      Token[] nextContext = null;
      // NOTE(review): unlike the previous context, the next context is not
      // passed through isASCII -- confirm whether that asymmetry is intended.
      if (nextTokens.size() > 0
        && (numNonStopWordsNext > 0 || numStopWordsnext > numMinStopWordsToAdd)) {
        List<Token> nextContextList = new ArrayList<>();

        // The stop-word token sits between the target and the context.
        if (!sw.isEmpty()) {
          nextContextList.add(sw);
        }

        // Each context token is preceded by the filler token (when configured).
        for (Token n : nextTokens) {
          if (!fw.isEmpty())
            nextContextList.add(fw);
          nextContextList.add(n);
        }

        if (nextTokens.size() >= minWindow4Pattern) {
          nextContext = nextContextList.toArray(new Token[0]);
          if (twithoutPOS != null) {
            SurfacePattern pat = new SurfacePattern(null, twithoutPOS,
              nextContext, Genre.NEXT);
            nextpatterns.add(pat);
          }
          if (twithPOS != null) {
            SurfacePattern patPOS = new SurfacePattern(null, twithPOS,
              nextContext, Genre.NEXT);
            nextpatterns.add(patPOS);
          }

        }
        // Set even when the next context alone is shorter than
        // minWindow4Pattern; nextContext is then still null below.
        usenext = true;

      }

      if (useprev && usenext) {
        if (previousTokens.size() + nextTokens.size() >= minWindow4Pattern) {

          if (twithoutPOS != null) {
            SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS,
              nextContext, Genre.PREVNEXT);
            prevnextpatterns.add(pat);
          }

          if (twithPOS != null) {
            SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS,
              nextContext, Genre.PREVNEXT);
            prevnextpatterns.add(patPOS);
          }
        }

      }
    }

    return CollectionUtils.unionAsSet(prevpatterns, nextpatterns, prevnextpatterns);
  }



  /**
   * Computes the generalized form of a context token.
   *
   * Every class returned by {@code ConstantsAndVariables.getGeneralizeClasses()}
   * whose value on the token differs from the background symbol adds an OR
   * restriction to the returned token. When {@code useContextNERRestriction}
   * is set, a non-background NER tag does the same.
   *
   * @param tokenj the context token
   * @return a triple of: {@code true} if no restriction was added; the
   *         restriction token; the matched labels joined with {@code "|"}
   *         (empty if none)
   * @throws RuntimeException if the token has no value for one of the
   *         generalize classes
   */
  static Triple getContextTokenStr(CoreLabel tokenj) {
    Token strgeneric = new Token(PatternType.SURFACE);
    String strOriginal = "";
    boolean isLabeledO = true;

    // Type arguments are required here: with a raw Map.Entry, getKey() is an
    // Object and cannot be assigned to a String.
    for (Map.Entry<String, Class> e : ConstantsAndVariables.getGeneralizeClasses().entrySet()) {
      String label = e.getKey();
      Class labelClass = e.getValue();

      if (!tokenj.containsKey(labelClass) || tokenj.get(labelClass) == null)
        throw new RuntimeException(" Why does the token not have the class " + labelClass + " set? Existing classes " + tokenj.toString(CoreLabel.OutputFormat.ALL));

      if (!tokenj.get(labelClass).equals(ConstantsAndVariables.backgroundSymbol)) {
        isLabeledO = false;
        strOriginal = strOriginal.isEmpty() ? label : strOriginal + "|" + label;
        strgeneric.addORRestriction(labelClass, label);
      }
    }

    if (useContextNERRestriction) {
      String nerTag = tokenj
        .get(CoreAnnotations.NamedEntityTagAnnotation.class);
      if (nerTag != null
        && !nerTag.equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL)) {
        isLabeledO = false;
        strOriginal = strOriginal.isEmpty() ? nerTag : strOriginal + "|" + nerTag;
        strgeneric.addORRestriction(CoreAnnotations.NamedEntityTagAnnotation.class, nerTag);
      }
    }

    return new Triple<>(isLabeledO, strgeneric,
            strOriginal);
  }

  /**
   * Tells whether {@code text} consists only of US-ASCII characters.
   *
   * @param text the string to check; must not be null
   * @return {@code true} if every char of {@code text} is in the range
   *         0-127 (so also for the empty string), {@code false} otherwise
   */
  public static boolean isASCII(String text) {
    final int length = text.length();
    for (int pos = 0; pos < length; pos++) {
      // Anything above 0x7F (including surrogates) is outside US-ASCII.
      if (text.charAt(pos) > 0x7F) {
        return false;
      }
    }
    return true;
  }

  /**
   * Creates the surface patterns around every token of a sentence.
   *
   * @param sent the sentence; its tokens are read via {@code getTokens()}
   * @param stopWords stop words; no patterns are created around a token for
   *        which {@code PatternFactory.doNotUse} returns true
   * @return a map from token index to the set of patterns around that token;
   *         every index is present, and skipped tokens map to an empty set
   */
  public static Map getPatternsAroundTokens(DataInstance sent, Set stopWords) {
    Map<Integer, Set> p = new HashMap<>();
    // The element type is required: a raw List would make tokens.get(i) an Object.
    List<CoreLabel> tokens = sent.getTokens();
    for (int i = 0; i < tokens.size(); i++) {
      CoreLabel token = tokens.get(i);
      // do not create patterns around stop words!
      if (PatternFactory.doNotUse(token.word(), stopWords)) {
        p.put(i, new HashSet<>());
        continue;
      }

      p.put(i, getContext(tokens, i, stopWords));
    }
    return p;
  }
}