All downloads are free. The search and download features use the official Maven repository.

edu.stanford.nlp.pipeline.QuoteAnnotator Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.pipeline; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.Timing;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * An annotator which picks quotations out of the given text. Allows
 * for embedded quotations so long as they are either directed unicode quotes or are
 * of a different type of quote than the outer quotations
 * (e.g. "'Gadzooks' is what he said to me" is legal whereas
 * "They called me "Danger" when I was..." is illegal).
 * Uses regular-expression-like rules to find quotes and does not
 * depend on the tokenizer, which allows quotes like ''Tis true!' to be
 * correctly identified.
 *
 * Considers regular ascii ("", '', ``'', and `') as well as "smart" and
 * international quotation marks as follows:
 * “”,‘’, «», ‹›, 「」, 『』, „”, and ‚’.
 *
 * Note: extracts everything within these pairs as a whole quote segment, which may or may
 * not be the desired behaviour for texts that use different formatting styles than
 * standard english ones.
 *
 * There are a number of options that can be passed to the quote annotator to
 * customize its' behaviour:
 * 
    *
  • singleQuotes: "true" or "false", indicating whether or not to consider ' tokens * to be quotation marks (default=false).
  • *
  • maxLength: maximum character length of quotes to consider (default=-1).
  • *
  • asciiQuotes: "true" or "false", indicating whether or not to convert all quotes * to ascii quotes before processing (can help when there are errors in quote directionality) * (default=false).
  • *
  • allowEmbeddedSame: "true" or "false" indicating whether or not to allow smart/directed * (everything except " and ') quotes of the same kind to be embedded within one another * (default=false).
  • *
* * The annotator adds a QuotationsAnnotation to the Annotation * which returns a List that * contain the following information: *
    *
  • CharacterOffsetBeginAnnotation
  • *
  • CharacterOffsetEndAnnotation
  • *
  • QuotationIndexAnnotation
  • *
  • QuotationsAnnotation (if there are embedded quotes)
  • *
  • TokensAnnotation (if the tokenizer is run before the quote annotator)
  • *
  • TokenBeginAnnotation (if the tokenizer is run before the quote annotator)
  • *
  • TokenEndAnnotation (if the tokenizer is run before the quote annotator)
  • *
  • SentenceBeginAnnotation (if the sentence splitter has bee run before the quote annotator)
  • *
  • SentenceEndAnnotation (if the sentence splitter has bee run before the quote annotator)
  • *
* * @author Grace Muzny */ public class QuoteAnnotator implements Annotator { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(QuoteAnnotator.class); private final boolean VERBOSE; private final boolean DEBUG = false; // whether or not to consider single single quotes as quote-marking public boolean USE_SINGLE = false; // max length to consider for quotes public int MAX_LENGTH = -1; // whether to convert unicode quotes to non-unicode " and ' // before processing public boolean ASCII_QUOTES = false; // Whether or not to allow quotes of the same type embedded inside of each other public boolean ALLOW_EMBEDDED_SAME = false; // Whether or not to allow quotes of the same type embedded inside of each other public boolean SMART_QUOTES = false; // TODO: implement this // public boolean closeUnclosedQuotes = false; //TODO: add directed quote/unicode quote understanding capabilities. // will need substantial logic, probably, as quotation mark conventions // vary widely. public static final Map DIRECTED_QUOTES; static { Map tmp = Generics.newHashMap(); tmp.put("“", "”"); // directed double inward tmp.put("‘", "’"); // directed single inward tmp.put("«", "»"); // guillemets tmp.put("‹","›"); // single guillemets tmp.put("「", "」"); // cjk brackets tmp.put("『", "』"); // cjk brackets tmp.put("„","”"); // directed double down/up left pointing tmp.put("‚","’"); // directed single down/up left pointing tmp.put("``","''"); // double latex -- single latex quotes don't belong here! DIRECTED_QUOTES = Collections.unmodifiableMap(tmp); } /** Return a QuoteAnnotator that isolates quotes denoted by the * ASCII characters " and '. If an unclosed quote appears, by default, * this quote will not be counted as a quote. * * @param s String that is ignored but allows for creation of the * QuoteAnnotator via a customAnnotatorClass * * @param props Properties object that contains the customizable properties * attributes. * @return A QuoteAnnotator. 
*/ public QuoteAnnotator(String s, Properties props) { this(props, false); } /** Return a QuoteAnnotator that isolates quotes denoted by the * ASCII characters " and ' as well as a variety of smart and international quotes. * If an unclosed quote appears, by default, this quote will not be counted as a quote. * * @param props Properties object that contains the customizable properties * attributes. * @return A QuoteAnnotator. */ public QuoteAnnotator(Properties props) { this(props, false); } /** Return a QuoteAnnotator that isolates quotes denoted by the * ASCII characters " and '. If an unclosed quote appears, by default, * this quote will not be counted as a quote. * * @param props Properties object that contains the customizable properties * attributes. * @param verbose whether or not to output verbose information. * @return A QuoteAnnotator. */ public QuoteAnnotator(Properties props, boolean verbose) { USE_SINGLE = Boolean.parseBoolean(props.getProperty("singleQuotes", "false")); MAX_LENGTH = Integer.parseInt(props.getProperty("maxLength", "-1")); ASCII_QUOTES = Boolean.parseBoolean(props.getProperty("asciiQuotes", "false")); ALLOW_EMBEDDED_SAME = Boolean.parseBoolean(props.getProperty("allowEmbeddedSame", "false")); SMART_QUOTES = Boolean.parseBoolean(props.getProperty("smartQuotes", "false")); VERBOSE = verbose; Timing timer = null; if (VERBOSE) { timer = new Timing(); log.info("Preparing quote annotator..."); } if (VERBOSE) { timer.stop("done."); } } @Override public void annotate(Annotation annotation) { String text = annotation.get(CoreAnnotations.TextAnnotation.class); // TODO: the following, if you want the quote annotator to get these truly correct // Pre-process to make word terminal apostrophes specially encoded (Jones' dog) List tokens = annotation.get(CoreAnnotations.TokensAnnotation.class); List sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); String quotesFrom = text; if (SMART_QUOTES) { // we're just going to try a bunch of 
different things and pick // whichever results in the most total quotes // try unicode List> overall = getQuotes(quotesFrom); String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class); List cmQuotesUnicode = getCoreMapQuotes(overall, tokens, sentences, text, docID); int numUnicode = countQuotes(cmQuotesUnicode); // try ascii if (ASCII_QUOTES) { quotesFrom = replaceUnicode(text); } overall = getQuotes(quotesFrom); docID = annotation.get(CoreAnnotations.DocIDAnnotation.class); List cmQuotesAscii = getCoreMapQuotes(overall, tokens, sentences, text, docID); int numAsciiSingle = countQuotes(cmQuotesAscii); // don't allow single quotes USE_SINGLE = false; overall = getQuotes(quotesFrom); docID = annotation.get(CoreAnnotations.DocIDAnnotation.class); List cmQuotesAsciiNoSingle = getCoreMapQuotes(overall, tokens, sentences, text, docID); int numAsciiNoSingle = countQuotes(cmQuotesAsciiNoSingle); log.info("Number of quotes + unicode - single : " + numUnicode); log.info("Number of quotes + ascii - single : " + numAsciiNoSingle); log.info("Number of quotes + ascii + single : " + numAsciiSingle); if (numUnicode >= numAsciiNoSingle && numUnicode > (numAsciiSingle / 2)) { annotation.set(CoreAnnotations.QuotationsAnnotation.class, cmQuotesUnicode); log.info("Using unicode quotes."); } else if (numAsciiSingle > (numAsciiNoSingle / 2)) { annotation.set(CoreAnnotations.QuotationsAnnotation.class, cmQuotesAscii); log.info("Using ascii quotes."); } else { annotation.set(CoreAnnotations.QuotationsAnnotation.class, cmQuotesAsciiNoSingle); log.info("Using ascii quotes with no single quotes."); } } else { if (ASCII_QUOTES) { quotesFrom = replaceUnicode(text); } List> overall = getQuotes(quotesFrom); String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class); List cmQuotes = getCoreMapQuotes(overall, tokens, sentences, text, docID); // add quotes to document annotation.set(CoreAnnotations.QuotationsAnnotation.class, cmQuotes); } } //TODO: update this so that it goes 
more than 1 layer deep private int countQuotes(List quotes) { int total = quotes.size(); for (CoreMap quote : quotes) { List innerQuotes = quote.get(CoreAnnotations.QuotationsAnnotation.class); if (innerQuotes != null) { total += innerQuotes.size(); } } return total; } // Stolen from PTBLexer private static final Pattern asciiSingleQuote = Pattern.compile("'|[\u0091\u2018\u0092\u2019\u201A\u201B\u2039\u203A']"); private static final Pattern asciiDoubleQuote = Pattern.compile(""|[\u0093\u201C\u0094\u201D\u201E\u00AB\u00BB\"]"); private static String asciiQuotes(String in) { String s1 = in; s1 = asciiSingleQuote.matcher(s1).replaceAll("'"); s1 = asciiDoubleQuote.matcher(s1).replaceAll("\""); return s1; } public static String replaceUnicode(String text) { return asciiQuotes(text); } public static Comparator getQuoteComparator() { return new Comparator() { @Override public int compare(CoreMap o1, CoreMap o2) { int s1 = o1.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); int s2 = o2.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); return s1 - s2; } }; } public static List getCoreMapQuotes(List> quotes, List tokens, List sentences, String text, String docID) { List cmQuotes = Generics.newArrayList(); for (Pair p : quotes) { int begin = p.first(); int end = p.second(); // find the tokens for this quote List quoteTokens = new ArrayList<>(); int tokenOffset = -1; int currTok = 0; if (tokens != null) { while (currTok < tokens.size() && tokens.get(currTok).beginPosition() < begin) { currTok++; } int i = currTok; tokenOffset = i; while (i < tokens.size() && tokens.get(i).endPosition() <= end) { quoteTokens.add(tokens.get(i)); i++; } } // find the sentences for this quote int beginSentence = -1; int endSentence = -1; if (sentences != null) { for (CoreMap sentence : sentences) { int sentBegin = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); int sentEnd = sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class); int sentIndex = 
sentence.get(CoreAnnotations.SentenceIndexAnnotation.class); if (sentBegin <= begin) { beginSentence = sentIndex; } if (sentEnd >= end && endSentence < 0) { endSentence = sentIndex; } } } // create a quote annotation with text and token offsets Annotation quote = makeQuote(text.substring(begin, end), begin, end, quoteTokens, tokenOffset, beginSentence, endSentence, docID); // add quote in cmQuotes.add(quote); } // sort quotes by beginning index Comparator quoteComparator = getQuoteComparator(); Collections.sort(cmQuotes, quoteComparator); // embed quotes List toRemove = new ArrayList<>(); for (CoreMap cmQuote : cmQuotes) { int start = cmQuote.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); int end = cmQuote.get(CoreAnnotations.CharacterOffsetEndAnnotation.class); // See if we need to embed a quote List embeddedQuotes = new ArrayList<>(); for (CoreMap cmQuoteComp : cmQuotes) { int startComp = cmQuoteComp.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); int endComp = cmQuoteComp.get(CoreAnnotations.CharacterOffsetEndAnnotation.class); if (start < startComp && end >= endComp) { // p contains comp embeddedQuotes.add(cmQuoteComp); // now we want to remove it from the top-level quote list toRemove.add(cmQuoteComp); } } cmQuote.set(CoreAnnotations.QuotationsAnnotation.class, embeddedQuotes); } // Remove all the quotes that we want to. 
for (CoreMap r : toRemove) { // remove that quote from the overall list cmQuotes.remove(r); } // Set the quote index annotations properly setQuoteIndices(cmQuotes); return cmQuotes; } private static void setQuoteIndices(List topLevel) { List level = topLevel; int index = 0; while (!level.isEmpty()) { List nextLevel = Generics.newArrayList(); for (CoreMap quote : level) { quote.set(CoreAnnotations.QuotationIndexAnnotation.class, index); List quoteTokens = quote.get(CoreAnnotations.TokensAnnotation.class); if (quoteTokens != null) { for (CoreLabel qt : quoteTokens) { qt.set(CoreAnnotations.QuotationIndexAnnotation.class, index); } } index++; if (quote.get(CoreAnnotations.QuotationsAnnotation.class) != null) { nextLevel.addAll(quote.get(CoreAnnotations.QuotationsAnnotation.class)); } } level = nextLevel; } } public static Annotation makeQuote(String surfaceForm, int begin, int end, List quoteTokens, int tokenOffset, int sentenceBeginIndex, int sentenceEndIndex, String docID) { Annotation quote = new Annotation(surfaceForm); // create a quote annotation with text and token offsets quote.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin); quote.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end); if (docID != null) { quote.set(CoreAnnotations.DocIDAnnotation.class, docID); } if (quoteTokens != null) { quote.set(CoreAnnotations.TokensAnnotation.class, quoteTokens); quote.set(CoreAnnotations.TokenBeginAnnotation.class, tokenOffset); quote.set(CoreAnnotations.TokenEndAnnotation.class, tokenOffset + quoteTokens.size() - 1); } quote.set(CoreAnnotations.SentenceBeginAnnotation.class, sentenceBeginIndex); quote.set(CoreAnnotations.SentenceEndAnnotation.class, sentenceEndIndex); return quote; } public List> getQuotes(String text) { return recursiveQuotes(text, 0, null); } public List> recursiveQuotes(String text, int offset, String prevQuote) { Map>> quotesMap = new HashMap<>(); int start = -1; int end = -1; String quote = null; int directed = 0; for (int 
i = 0 ; i < text.length(); i++) { // Either I'm not in any quote or this one matches // the kind that I am. String c = text.substring(i, i + 1); if (c.equals("`") && i < text.length() - 1 && text.charAt(i + 1) == '`') { c += text.charAt(i + 1); } else if (c.equals("'") && (quote != null && (quote.equals("``") || quote.equals("`")))) { // we want to ignore it if unless is is the beginning of the // last set of ' of the proper length int curr = i; while (curr < text.length() && text.charAt(curr) == '\'') { curr++; } if (i == curr - quote.length() || (directed > 0 && i == curr - (directed * quote.length()))) { for (int a = i + 1; a < i + quote.length(); a++) { c += text.charAt(a); } } else { continue; } } if (DIRECTED_QUOTES.containsKey(quote) && DIRECTED_QUOTES.get(quote).equals(c)) { if (c.equals("’")) { if ((i == text.length() - 1 || isSingleQuoteEnd(text, i))) { // check to make sure that this isn't an apostrophe.. directed--; } } else { // closing directed--; } } // opening if ((start < 0) && !matchesPrevQuote(c, prevQuote) && (((isSingleQuoteWithUse(c) || c.equals("`")) && isSingleQuoteStart(text, i)) || (c.equals("\"") || DIRECTED_QUOTES.containsKey(c)))) { start = i; quote = c; // closing } else if ((start >= 0 && end < 0) && ((c.equals(quote) && (((c.equals("'") || c.equals("`")) && isSingleQuoteEnd(text, i)) || (c.equals("\"") && isDoubleQuoteEnd(text, i)))) || (c.equals("'") && quote.equals("`") && isSingleQuoteEnd(text, i)) || // latex quotes are kind of problematic (DIRECTED_QUOTES.containsKey(quote) && DIRECTED_QUOTES.get(quote).equals(c) && directed == 0))) { end = i + c.length(); } if (DIRECTED_QUOTES.containsKey(c) && c.equals(quote)) { // opening of this kind of directed quote directed++; } if (start >= 0 && end > 0) { if (!quotesMap.containsKey(quote)) { quotesMap.put(quote, new ArrayList<>()); } quotesMap.get(quote).add(new Pair(start, end)); start = -1; end = -1; quote = null; } if (c.length() > 1) { i += c.length() - 1; } // forget about this 
quote if (MAX_LENGTH > 0 && start >= 0 && i - start > MAX_LENGTH) { // go back to the right index after start i = start + quote.length(); start = -1; end = -1; quote = null; } } // // TODO: determine if we want to be more strict w/ single quotes than double // // answer: we do want to. // // if we reached then end and we have an open quote, close it // if (closeUnclosedQuotes && start >= 0 && start < text.length() - 2) { // if (!quotesMap.containsKey(quote)) { // quotesMap.put(quote, new ArrayList<>()); // } // quotesMap.get(quote).add(new Pair(start, text.length())); // } else if (start >= 0 && start < text.length() - 3) { String warning = text; if (text.length() > 150) { warning = text.substring(0, 150) + "..."; } log.info("WARNING: unmatched quote of type " + quote + " found at index " + start + " in text segment: " + warning); } // recursively look for embedded quotes in these ones List> quotes = Generics.newArrayList(); // If I didn't find any quotes, but did find a quote-beginning, try again, // but without the part of the text before the single quote if (quotesMap.isEmpty() && start >= 0 && start < text.length() - 3) { String toPass = text.substring(start + quote.length(), text.length()); List> embedded = recursiveQuotes(toPass, offset, null); for (Pair e : embedded) { quotes.add(new Pair(e.first() + start + quote.length(), e.second() + start + 1)); } } else { for (String qKind : quotesMap.keySet()) { for (Pair q : quotesMap.get(qKind)) { if (q.second() - q.first() >= qKind.length() * 2) { String toPass = text.substring(q.first() + qKind.length(), q.second() - qKind.length()); String qKindToPass = null; if (!(DIRECTED_QUOTES.containsKey(qKind) || qKind.equals("`")) || !ALLOW_EMBEDDED_SAME) { qKindToPass = qKind; } List> embedded = recursiveQuotes(toPass, q.first() + qKind.length() + offset, qKindToPass); for (Pair e : embedded) { // don't add offset here because the // recursive method already added it if (e.second() - e.first() > 2) { quotes.add(new 
Pair(e.first(), e.second())); } } } quotes.add(new Pair(q.first() + offset, q.second() + offset)); } } } return quotes; } private boolean isSingleQuoteWithUse(String c) { return c.equals("'") && USE_SINGLE; } private static boolean matchesPrevQuote(String c, String prev) { return prev != null && prev.equals(c); } private static boolean isSingleQuoteStart(String text, int i) { if (i == 0) return true; String prev = text.substring(i - 1, i); return isWhitespaceOrPunct(prev); } private static boolean isSingleQuoteEnd(String text, int i) { if (i == text.length() - 1) return true; String next = text.substring(i + 1, i + 2); return isWhitespaceOrPunct(next); } private static boolean isDoubleQuoteEnd(String text, int i) { if (i == text.length() - 1) return true; String next = text.substring(i + 1, i + 2); if (i == text.length() - 2 && isWhitespaceOrPunct(next)) { return true; } String nextNext = text.substring(i + 2, i + 3); return ((isWhitespaceOrPunct(next) && !isSingleQuote(next)) || (isSingleQuote(next) && isWhitespaceOrPunct(nextNext))); } public static boolean isWhitespaceOrPunct(String c) { Pattern punctOrWhite = Pattern.compile("[\\s\\p{Punct}]", Pattern.UNICODE_CHARACTER_CLASS); Matcher m = punctOrWhite.matcher(c); return m.matches(); } public static boolean isSingleQuote(String c) { return c.equals("'"); } @Override public Set> requires() { return Collections.EMPTY_SET; } @Override public Set> requirementsSatisfied() { return Collections.singleton(CoreAnnotations.QuotationsAnnotation.class); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy