edu.stanford.nlp.pipeline.QuoteAnnotator Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.pipeline; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.Timing;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * An annotator which picks quotations out of the given text. Allows
 * for embedded quotations so long as they are either directed unicode quotes or are
 * of a different type of quote than the outer quotations
 * (e.g. "'Gadzooks' is what he said to me" is legal whereas
 * "They called me "Danger" when I was..." is illegal).
 * Uses regular-expression-like rules to find quotes and does not
 * depend on the tokenizer, which allows quotes like ''Tis true!' to be
 * correctly identified.
 *
 * Considers regular ascii ("", '', ``'', and `') as well as "smart" and
 * international quotation marks as follows:
 * “”,‘’, «», ‹›, 「」, 『』, „”, and ‚’.
 *
 * Note: extracts everything within these pairs as a whole quote segment, which may or may
 * not be the desired behaviour for texts that use different formatting styles than
 * standard english ones.
 *
 * There are a number of options that can be passed to the quote annotator to
 * customize its behaviour:
 * 
 *   singleQuotes: "true" or "false", indicating whether or not to consider ' tokens
 *    to be quotation marks (default=false).
 *   maxLength: maximum character length of quotes to consider (default=-1).
 *   asciiQuotes: "true" or "false", indicating whether or not to convert all quotes
 *   to ascii quotes before processing (can help when there are errors in quote directionality)
 *   (default=false).
 *   allowEmbeddedSame: "true" or "false" indicating whether or not to allow smart/directed
 *   (everything except " and ') quotes of the same kind to be embedded within one another
 *   (default=false).
 * 
 *
 * The annotator adds a QuotationsAnnotation to the Annotation
 * whose value is a List of CoreMaps (one per quote), each of which
 * contains the following information:
 * 
 *  CharacterOffsetBeginAnnotation
 *  CharacterOffsetEndAnnotation
 *  QuotationIndexAnnotation
 *  QuotationsAnnotation (if there are embedded quotes)
 *  TokensAnnotation (if the tokenizer is run before the quote annotator)
 *  TokenBeginAnnotation (if the tokenizer is run before the quote annotator)
 *  TokenEndAnnotation (if the tokenizer is run before the quote annotator)
 *  SentenceBeginAnnotation (if the sentence splitter has been run before the quote annotator)
 *  SentenceEndAnnotation (if the sentence splitter has been run before the quote annotator)
 * 
 *
 * @author Grace Muzny
 */
public class QuoteAnnotator implements Annotator  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(QuoteAnnotator.class);

  // set once by the constructor; gates the timing/log output produced there
  private final boolean VERBOSE;
  // NOTE(review): not read anywhere in the visible part of this class
  private final boolean DEBUG = false;

  // whether or not to consider a lone single quote (') as quote-marking
  // (property "singleQuotes")
  public boolean USE_SINGLE = false;
  // max character length to consider for quotes (property "maxLength");
  // the limit is only enforced when this value is greater than zero
  public int MAX_LENGTH = -1;
  // whether to convert unicode quotes to non-unicode " and '
  // before processing (property "asciiQuotes")
  public boolean ASCII_QUOTES = false;
  // Whether or not to allow quotes of the same type embedded inside of each other
  // (property "allowEmbeddedSame")
  public boolean ALLOW_EMBEDDED_SAME = false;

  // Whether annotate() should run several extraction passes (as-is text,
  // optionally ascii-converted text, and the latter without single quotes)
  // and keep one result based on how many quotes each pass found
  // (property "smartQuotes")
  public boolean SMART_QUOTES = false;

  // TODO: implement this
//  public boolean closeUnclosedQuotes = false;
  //TODO: add directed quote/unicode quote understanding capabilities.
  // will need substantial logic, probably, as quotation mark conventions
  // vary widely.

  /**
   * Maps each directed (asymmetric) opening quotation mark to the mark that
   * closes it. Keys and values are whole quote strings, so the two-character
   * LaTeX-style opener {@code ``} is a single key. The map is unmodifiable.
   */
  public static final Map<String, String> DIRECTED_QUOTES;
  static {
    Map<String, String> tmp = new HashMap<>();
    tmp.put("“", "”");  // directed double inward
    tmp.put("‘", "’");  // directed single inward
    tmp.put("«", "»");  // guillemets
    tmp.put("‹","›");  // single guillemets
    tmp.put("「", "」");  // cjk brackets
    tmp.put("『", "』");  // cjk brackets
    tmp.put("„","”");  // directed double down/up left pointing
    tmp.put("‚","’");  // directed single down/up left pointing
    tmp.put("``","''");  // double latex -- single latex quotes don't belong here!
    DIRECTED_QUOTES = Collections.unmodifiableMap(tmp);
  }

  /**
   * Creates a non-verbose QuoteAnnotator configured from {@code props}.
   * Simply delegates to {@link #QuoteAnnotator(Properties, boolean)} with
   * {@code verbose = false}. If an unclosed quote appears, by default,
   * this quote will not be counted as a quote.
   *
   *  @param s String that is ignored but allows for creation of the
   *           QuoteAnnotator via a customAnnotatorClass
   *
   *  @param  props Properties object that contains the customizable properties
   *                 attributes.
   */
  public QuoteAnnotator(String s, Properties props) {
    this(props, false);
  }

  /**
   * Creates a non-verbose QuoteAnnotator that isolates quotes denoted by the
   * ASCII characters " and ' as well as a variety of smart and international quotes.
   * Simply delegates to {@link #QuoteAnnotator(Properties, boolean)} with
   * {@code verbose = false}.
   * If an unclosed quote appears, by default, this quote will not be counted as a quote.
   *
   *  @param  props Properties object that contains the customizable properties
   *                 attributes.
   */
  public QuoteAnnotator(Properties props) {
    this(props, false);
  }

  /**
   * Creates a QuoteAnnotator whose options are read from {@code props}.
   * Recognised keys and their defaults: {@code singleQuotes} (false),
   * {@code maxLength} (-1), {@code asciiQuotes} (false),
   * {@code allowEmbeddedSame} (false) and {@code smartQuotes} (false).
   *
   *  @param props Properties object that contains the customizable properties
   *                 attributes.
   *  @param verbose whether or not to output verbose information.
   */
  public QuoteAnnotator(Properties props, boolean verbose) {
    USE_SINGLE = Boolean.parseBoolean(props.getProperty("singleQuotes", "false"));
    MAX_LENGTH = Integer.parseInt(props.getProperty("maxLength", "-1"));
    ASCII_QUOTES = Boolean.parseBoolean(props.getProperty("asciiQuotes", "false"));
    ALLOW_EMBEDDED_SAME = Boolean.parseBoolean(props.getProperty("allowEmbeddedSame", "false"));
    SMART_QUOTES = Boolean.parseBoolean(props.getProperty("smartQuotes", "false"));

    VERBOSE = verbose;
    if (verbose) {
      // there is no set-up work between start and stop; only the two messages are emitted
      Timing timer = new Timing();
      log.info("Preparing quote annotator...");
      timer.stop("done.");
    }
  }

  /**
   * Finds the quotes in the annotation's text and stores them under
   * {@code CoreAnnotations.QuotationsAnnotation}.
   *
   * Without {@code SMART_QUOTES} a single extraction pass is run (on the
   * ascii-converted text when {@code ASCII_QUOTES} is set). With
   * {@code SMART_QUOTES} three passes are run -- the text as-is, the
   * (optionally) ascii-converted text, and the latter with single quotes
   * disabled -- and one result is kept based on the quote counts.
   *
   * Quote offsets are always applied to the original text, even when they
   * were found in the ascii-converted copy.
   *
   * @param annotation the document to annotate; its TextAnnotation is read,
   *                   and its tokens, sentences and doc id are used when present
   */
  @Override
  public void annotate(Annotation annotation) {
    String text = annotation.get(CoreAnnotations.TextAnnotation.class);

    // TODO: the following, if you want the quote annotator to get these truly correct
    // Pre-process to make word terminal apostrophes specially encoded (Jones' dog)
    List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);

    if (!SMART_QUOTES) {
      String quotesFrom = ASCII_QUOTES ? replaceUnicode(text) : text;
      List<Pair<Integer, Integer>> overall = getQuotes(quotesFrom);
      List<CoreMap> cmQuotes = getCoreMapQuotes(overall, tokens, sentences, text, docID);

      // add quotes to document
      annotation.set(CoreAnnotations.QuotationsAnnotation.class, cmQuotes);
      return;
    }

    // we're just going to try a bunch of different things and pick
    // whichever results in the most total quotes

    // try unicode
    List<Pair<Integer, Integer>> overall = getQuotes(text);
    List<CoreMap> cmQuotesUnicode = getCoreMapQuotes(overall, tokens, sentences, text, docID);
    int numUnicode = countQuotes(cmQuotesUnicode);

    // try ascii
    String quotesFrom = ASCII_QUOTES ? replaceUnicode(text) : text;
    overall = getQuotes(quotesFrom);
    List<CoreMap> cmQuotesAscii = getCoreMapQuotes(overall, tokens, sentences, text, docID);
    int numAsciiSingle = countQuotes(cmQuotesAscii);

    // don't allow single quotes -- but only for this one pass: the configured
    // value is put back afterwards so that later calls to annotate() still
    // honour the "singleQuotes" option instead of silently running with it off
    final boolean configuredUseSingle = USE_SINGLE;
    List<CoreMap> cmQuotesAsciiNoSingle;
    try {
      USE_SINGLE = false;
      overall = getQuotes(quotesFrom);
      cmQuotesAsciiNoSingle = getCoreMapQuotes(overall, tokens, sentences, text, docID);
    } finally {
      USE_SINGLE = configuredUseSingle;
    }
    int numAsciiNoSingle = countQuotes(cmQuotesAsciiNoSingle);

    log.info("Number of quotes + unicode - single : " + numUnicode);
    log.info("Number of quotes + ascii - single : " + numAsciiNoSingle);
    log.info("Number of quotes + ascii + single : " + numAsciiSingle);
    if (numUnicode >= numAsciiNoSingle && numUnicode > (numAsciiSingle / 2)) {
      annotation.set(CoreAnnotations.QuotationsAnnotation.class, cmQuotesUnicode);
      log.info("Using unicode quotes.");
    } else if (numAsciiSingle > (numAsciiNoSingle / 2)) {
      annotation.set(CoreAnnotations.QuotationsAnnotation.class, cmQuotesAscii);
      log.info("Using ascii quotes.");
    } else {
      annotation.set(CoreAnnotations.QuotationsAnnotation.class, cmQuotesAsciiNoSingle);
      log.info("Using ascii quotes with no single quotes.");
    }
  }

  /**
   * Counts the given top-level quotes plus the quotes stored directly under
   * each of them in their QuotationsAnnotation.
   *
   * @param quotes top-level quotes
   * @return number of top-level quotes plus their directly embedded quotes
   */
  //TODO: update this so that it goes more than 1 layer deep
  private int countQuotes(List<CoreMap> quotes) {
    int total = quotes.size();
    for (CoreMap quote : quotes) {
      List<CoreMap> innerQuotes = quote.get(CoreAnnotations.QuotationsAnnotation.class);
      if (innerQuotes != null) {
        total += innerQuotes.size();
      }
    }
    return total;
  }

  // Stolen from PTBLexer
  private static final Pattern asciiSingleQuote = Pattern.compile("'|[\u0091\u2018\u0092\u2019\u201A\u201B\u2039\u203A']");
  private static final Pattern asciiDoubleQuote = Pattern.compile(""|[\u0093\u201C\u0094\u201D\u201E\u00AB\u00BB\"]");

  private static String asciiQuotes(String in) {
    String s1 = in;
    s1 = asciiSingleQuote.matcher(s1).replaceAll("'");
    s1 = asciiDoubleQuote.matcher(s1).replaceAll("\"");
    return s1;
  }

  public static String replaceUnicode(String text) {
    return asciiQuotes(text);
  }

  /**
   * Returns a comparator that orders quotes by ascending
   * CharacterOffsetBeginAnnotation.
   *
   * @return comparator over quote CoreMaps
   */
  public static Comparator<CoreMap> getQuoteComparator() {
   return new Comparator<CoreMap>() {
     @Override
     public int compare(CoreMap o1, CoreMap o2) {
       int s1 = o1.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
       int s2 = o2.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
       // Integer.compare instead of subtraction: same ordering, no overflow
       return Integer.compare(s1, s2);
     }
   };
  }

  public static List getCoreMapQuotes(List> quotes,
                                               List tokens,
                                               List sentences,
                                              String text, String docID) {
    List cmQuotes = Generics.newArrayList();
    for (Pair p : quotes) {
      int begin = p.first();
      int end = p.second();

      // find the tokens for this quote
      List quoteTokens = new ArrayList<>();
      int tokenOffset = -1;
      int currTok = 0;
      if (tokens != null) {
        while (currTok < tokens.size() && tokens.get(currTok).beginPosition() < begin) {
          currTok++;
        }
        int i = currTok;
        tokenOffset = i;
        while (i < tokens.size() && tokens.get(i).endPosition() <= end) {
          quoteTokens.add(tokens.get(i));
          i++;
        }
      }

      // find the sentences for this quote
      int beginSentence = -1;
      int endSentence = -1;
      if (sentences != null) {
        for (CoreMap sentence : sentences) {
          int sentBegin = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
          int sentEnd = sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
          int sentIndex = sentence.get(CoreAnnotations.SentenceIndexAnnotation.class);
          if (sentBegin <= begin) {
            beginSentence = sentIndex;
          }
          if (sentEnd >= end && endSentence < 0) {
            endSentence = sentIndex;
          }
        }
      }

      // create a quote annotation with text and token offsets
      Annotation quote = makeQuote(text.substring(begin, end), begin, end, quoteTokens,
          tokenOffset, beginSentence, endSentence, docID);

      // add quote in
      cmQuotes.add(quote);
    }

    // sort quotes by beginning index
    Comparator quoteComparator = getQuoteComparator();
    Collections.sort(cmQuotes, quoteComparator);

    // embed quotes
    List toRemove = new ArrayList<>();
    for (CoreMap cmQuote : cmQuotes) {
      int start = cmQuote.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      int end = cmQuote.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      // See if we need to embed a quote
      List embeddedQuotes = new ArrayList<>();
      for (CoreMap cmQuoteComp : cmQuotes) {
        int startComp = cmQuoteComp.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        int endComp = cmQuoteComp.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        if (start < startComp && end >= endComp) {
          // p contains comp
          embeddedQuotes.add(cmQuoteComp);
          // now we want to remove it from the top-level quote list
          toRemove.add(cmQuoteComp);
        }
      }
      cmQuote.set(CoreAnnotations.QuotationsAnnotation.class, embeddedQuotes);
    }

    // Remove all the quotes that we want to.
    for (CoreMap r : toRemove) {
      // remove that quote from the overall list
      cmQuotes.remove(r);
    }

    // Set the quote index annotations properly
    setQuoteIndices(cmQuotes);
    return cmQuotes;
  }

  /**
   * Assigns consecutive QuotationIndexAnnotation values, starting at 0, to
   * the given quotes and to the quotes embedded in them. All quotes of one
   * nesting level are numbered before any quote of the next level. Each
   * quote's tokens receive the same index as the quote.
   *
   * @param topLevel the outermost quotes
   */
  private static void setQuoteIndices(List<CoreMap> topLevel) {
    List<CoreMap> level = topLevel;
    int index = 0;
    while (!level.isEmpty()) {
      List<CoreMap> nextLevel = Generics.newArrayList();
      for (CoreMap quote : level) {
        quote.set(CoreAnnotations.QuotationIndexAnnotation.class, index);
        List<CoreLabel> quoteTokens = quote.get(CoreAnnotations.TokensAnnotation.class);
        if (quoteTokens != null) {
          for (CoreLabel qt : quoteTokens) {
            qt.set(CoreAnnotations.QuotationIndexAnnotation.class, index);
          }
        }
        index++;
        // queue this quote's embedded quotes for the next round
        List<CoreMap> embedded = quote.get(CoreAnnotations.QuotationsAnnotation.class);
        if (embedded != null) {
          nextLevel.addAll(embedded);
        }
      }
      level = nextLevel;
    }
  }

  /**
   * Builds the annotation for a single quote.
   *
   * @param surfaceForm the quote text, used as the new annotation's text
   * @param begin character offset where the quote begins
   * @param end character offset where the quote ends
   * @param quoteTokens tokens inside the quote; when null no token
   *                    annotations are set
   * @param tokenOffset document index of the first token in {@code quoteTokens}
   * @param sentenceBeginIndex index of the sentence in which the quote begins
   * @param sentenceEndIndex index of the sentence in which the quote ends
   * @param docID document id; only set when non-null
   * @return the new quote annotation
   */
  public static Annotation makeQuote(String surfaceForm, int begin, int end,
                                     List<CoreLabel> quoteTokens,
                                     int tokenOffset,
                                     int sentenceBeginIndex,
                                     int sentenceEndIndex,
                                     String docID) {
    Annotation quote = new Annotation(surfaceForm);
    // create a quote annotation with text and token offsets
    quote.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
    quote.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
    if (docID != null) {
      quote.set(CoreAnnotations.DocIDAnnotation.class, docID);
    }

    if (quoteTokens != null) {
      quote.set(CoreAnnotations.TokensAnnotation.class, quoteTokens);
      quote.set(CoreAnnotations.TokenBeginAnnotation.class, tokenOffset);
      // index of the last token of the quote (inclusive)
      quote.set(CoreAnnotations.TokenEndAnnotation.class, tokenOffset + quoteTokens.size() - 1);
    }
    quote.set(CoreAnnotations.SentenceBeginAnnotation.class, sentenceBeginIndex);
    quote.set(CoreAnnotations.SentenceEndAnnotation.class, sentenceEndIndex);

    return quote;
  }

  public List> getQuotes(String text) {
    return recursiveQuotes(text, 0, null);
  }

  public List> recursiveQuotes(String text, int offset, String prevQuote) {
    Map>> quotesMap = new HashMap<>();
    int start = -1;
    int end = -1;
    String quote = null;
    int directed = 0;
    for (int i = 0 ; i < text.length(); i++) {
      // Either I'm not in any quote or this one matches
      // the kind that I am.
      String c = text.substring(i, i + 1);

      if (c.equals("`") && i < text.length() - 1 &&
          text.charAt(i + 1) == '`') {
        c += text.charAt(i + 1);
      } else if (c.equals("'") && (quote != null && (quote.equals("``") || quote.equals("`")))) {
        // we want to ignore it if unless is is the beginning of the
        // last set of ' of the proper length
        int curr = i;
        while (curr < text.length() && text.charAt(curr) == '\'') {
          curr++;
        }
        if (i == curr - quote.length() ||
            (directed > 0 && i == curr - (directed * quote.length()))) {
          for (int a = i + 1; a < i + quote.length(); a++) {
            c += text.charAt(a);
          }
        } else {
          continue;
        }
      }

      if (DIRECTED_QUOTES.containsKey(quote) &&
          DIRECTED_QUOTES.get(quote).equals(c)) {
        if (c.equals("’")) {
          if ((i == text.length() - 1 || isSingleQuoteEnd(text, i))) {
            // check to make sure that this isn't an apostrophe..
            directed--;
          }
        } else {
          // closing
          directed--;
        }
      }

      // opening
      if ((start < 0) && !matchesPrevQuote(c, prevQuote) &&
          (((isSingleQuoteWithUse(c) || c.equals("`")) && isSingleQuoteStart(text, i)) ||
            (c.equals("\"") || DIRECTED_QUOTES.containsKey(c)))) {
        start = i;
        quote = c;
      // closing
      } else if ((start >= 0 && end < 0) &&
          ((c.equals(quote) &&
           (((c.equals("'") || c.equals("`")) && isSingleQuoteEnd(text, i)) ||
            (c.equals("\"") && isDoubleQuoteEnd(text, i)))) ||
           (c.equals("'") && quote.equals("`") && isSingleQuoteEnd(text, i)) ||  // latex quotes are kind of problematic
           (DIRECTED_QUOTES.containsKey(quote) &&
               DIRECTED_QUOTES.get(quote).equals(c) &&
           directed == 0))) {
        end = i + c.length();
      }

      if (DIRECTED_QUOTES.containsKey(c) &&
          c.equals(quote)) {
        // opening of this kind of directed quote
        directed++;
      }

      if (start >= 0 && end > 0) {
        if (!quotesMap.containsKey(quote)) {
          quotesMap.put(quote, new ArrayList<>());
        }
        quotesMap.get(quote).add(new Pair(start, end));
        start = -1;
        end = -1;
        quote = null;
      }

      if (c.length() > 1) {
        i += c.length() - 1;
      }

      // forget about this quote
      if (MAX_LENGTH > 0 && start >= 0 &&
          i - start > MAX_LENGTH) {
        // go back to the right index after start
        i = start + quote.length();

        start = -1;
        end = -1;
        quote = null;
      }
    }

//    // TODO: determine if we want to be more strict w/ single quotes than double
//    // answer: we do want to.
//    // if we reached then end and we have an open quote, close it
//    if (closeUnclosedQuotes && start >= 0 && start < text.length() - 2) {
//      if (!quotesMap.containsKey(quote)) {
//        quotesMap.put(quote, new ArrayList<>());
//      }
//      quotesMap.get(quote).add(new Pair(start, text.length()));
//    } else
    if (start >= 0 && start < text.length() - 3) {
      String warning = text;
      if (text.length() > 150) {
        warning = text.substring(0, 150) + "...";
      }
      log.info("WARNING: unmatched quote of type " +
          quote + " found at index " + start + " in text segment: " + warning);
    }

    // recursively look for embedded quotes in these ones
    List> quotes = Generics.newArrayList();
    // If I didn't find any quotes, but did find a quote-beginning, try again,
    // but without the part of the text before the single quote
    if (quotesMap.isEmpty() && start >= 0 && start < text.length() - 3) {
      String toPass = text.substring(start + quote.length(), text.length());
      List> embedded = recursiveQuotes(toPass, offset, null);
      for (Pair e : embedded) {
        quotes.add(new Pair(e.first() + start + quote.length(),
            e.second() + start + 1));
      }
    } else {
      for (String qKind : quotesMap.keySet()) {
        for (Pair q : quotesMap.get(qKind)) {
          if (q.second() - q.first() >= qKind.length() * 2) {
            String toPass = text.substring(q.first() + qKind.length(),
                q.second() - qKind.length());
            String qKindToPass = null;
            if (!(DIRECTED_QUOTES.containsKey(qKind) || qKind.equals("`"))
                    || !ALLOW_EMBEDDED_SAME) {
              qKindToPass = qKind;
            }
            List> embedded = recursiveQuotes(toPass,
                q.first() + qKind.length() + offset, qKindToPass);
            for (Pair e : embedded) {
              // don't add offset here because the
              // recursive method already added it
              if (e.second() - e.first() > 2) {
                quotes.add(new Pair(e.first(), e.second()));
              }
            }
          }
          quotes.add(new Pair(q.first() + offset, q.second() + offset));
        }
      }
    }

    return quotes;
  }

  /**
   * Tells whether {@code c} is the ascii single quote and single quotes are
   * currently enabled through {@code USE_SINGLE}.
   *
   * @param c the candidate quote string
   * @return true only when {@code c} is {@code "'"} and USE_SINGLE is set
   */
  private boolean isSingleQuoteWithUse(String c) {
    return c.equals("'") && USE_SINGLE;
  }

  /**
   * Tells whether {@code c} equals the quote string {@code prev}.
   *
   * @param c the candidate quote string
   * @param prev the quote string to compare with; may be null
   * @return false when {@code prev} is null, otherwise {@code prev.equals(c)}
   */
  private static boolean matchesPrevQuote(String c, String prev) {
    if (prev == null) {
      return false;
    }
    return prev.equals(c);
  }

  /**
   * Tells whether a single quote at index {@code i} is in a position where it
   * can open a quote: at the very start of {@code text}, or directly after a
   * whitespace or punctuation character.
   *
   * @param text the text being scanned
   * @param i index of the quote character
   * @return true if the quote may open a quotation
   */
  private static boolean isSingleQuoteStart(String text, int i) {
    return i == 0 || isWhitespaceOrPunct(text.substring(i - 1, i));
  }

  /**
   * Tells whether a single quote at index {@code i} is in a position where it
   * can close a quote: as the last character of {@code text}, or directly
   * before a whitespace or punctuation character.
   *
   * @param text the text being scanned
   * @param i index of the quote character
   * @return true if the quote may close a quotation
   */
  private static boolean isSingleQuoteEnd(String text, int i) {
    return i == text.length() - 1 || isWhitespaceOrPunct(text.substring(i + 1, i + 2));
  }

  /**
   * Matches a string consisting of exactly one whitespace or punctuation
   * character. Compiled once instead of on every call.
   */
  private static final Pattern PUNCT_OR_WHITE =
      Pattern.compile("[\\s\\p{Punct}]", Pattern.UNICODE_CHARACTER_CLASS);

  /**
   * Tells whether a double quote at index {@code i} is in a position where it
   * can close a quote. It can when it is the last character, when it is
   * followed by whitespace or punctuation other than a single quote, or when
   * it is followed by a single quote that is itself followed by whitespace
   * or punctuation.
   *
   * @param text the text being scanned
   * @param i index of the quote character
   * @return true if the quote may close a quotation
   */
  private static boolean isDoubleQuoteEnd(String text, int i) {
    if (i == text.length() - 1) return true;
    String next = text.substring(i + 1, i + 2);
    if (i == text.length() - 2) {
      // Exactly one character follows, so there is no second character to
      // look at. Deciding here avoids reading past the end of the text,
      // which used to throw when that character was a letter or digit.
      return isWhitespaceOrPunct(next);
    }
    String nextNext = text.substring(i + 2, i + 3);
    return ((isWhitespaceOrPunct(next) &&
           !isSingleQuote(next)) || (isSingleQuote(next) && isWhitespaceOrPunct(nextNext)));
  }

  /**
   * Tells whether {@code c} is exactly one whitespace or punctuation character.
   *
   * @param c the string to test
   * @return true if the whole string is a single whitespace or punctuation
   *         character; false for the empty string and for longer strings
   */
  public static boolean isWhitespaceOrPunct(String c) {
    Matcher m = PUNCT_OR_WHITE.matcher(c);
    return m.matches();
  }

  /**
   * Tells whether {@code c} is the ascii single quote.
   *
   * @param c the string to test
   * @return true if {@code c} equals {@code "'"}
   */
  public static boolean isSingleQuote(String c) {
    return c.equals("'");
  }

  @Override
  public Set> requires() {
    return Collections.EMPTY_SET;
  }

  @Override
  public Set> requirementsSatisfied() {
    return Collections.singleton(CoreAnnotations.QuotationsAnnotation.class);
  }

}