All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.coref.data.DocumentPreprocessor Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.coref.data;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import edu.stanford.nlp.classify.LogisticClassifier;
import edu.stanford.nlp.coref.CorefRules;
import edu.stanford.nlp.coref.CorefUtils;
import edu.stanford.nlp.coref.data.Document.DocType;
import edu.stanford.nlp.coref.data.Dictionaries.Number;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.UtteranceAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.math.NumberMatchingRegex;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.IntTuple;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * Coref document preprocessor.
 * @author Heeyoung Lee
 * @author Kevin Clark
 */
public class DocumentPreprocessor  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(DocumentPreprocessor.class);

  private DocumentPreprocessor() {}

  /**
   * Fill missing information in document including mention ID, mention attributes, syntactic relation, etc.
   *
   * @throws Exception
   */
  public static void preprocess(Document doc, Dictionaries dict, LogisticClassifier singletonPredictor, HeadFinder headFinder) throws Exception {
    // assign mention IDs, find twin mentions, fill mention positions, sentNum, headpositions
    initializeMentions(doc, dict, singletonPredictor, headFinder);

    // mention reordering
    mentionReordering(doc, headFinder);

    // find syntactic information
    fillSyntacticInfo(doc);

    // process discourse (speaker info etc)
    setParagraphAnnotation(doc);
    processDiscourse(doc, dict);

    // initialize cluster info
    initializeClusters(doc);

    // extract gold clusters if we have
    if(doc.goldMentions!=null) {
      extractGoldClusters(doc);
      int foundGoldCount = 0;
      for(Mention g : doc.goldMentionsByID.values()) {
        if(g.hasTwin) foundGoldCount++;
      }
      Redwood.log("debug-md", "# of found gold mentions: "+ foundGoldCount +
          " / # of gold mentions: "+ doc.goldMentionsByID.size());

    }

    // assign mention numbers
    assignMentionNumbers(doc);
  }

  /** Extract gold coref cluster information. */
  public static void extractGoldClusters(Document doc){
    doc.goldCorefClusters = Generics.newHashMap();
    for (List mentions : doc.goldMentions) {
      for (Mention m : mentions) {
        int id = m.goldCorefClusterID;
        if (id == -1) {
          throw new RuntimeException("No gold info");
        }
        CorefCluster c = doc.goldCorefClusters.get(id);
        if (c == null) {
          c = new CorefCluster(id);
          doc.goldCorefClusters.put(id, c);
        }
        c.corefMentions.add(m);
      }
    }
  }

  private static void assignMentionNumbers(Document document) {
    List mentionsList = CorefUtils.getSortedMentions(document);
    for (int i = 0; i < mentionsList.size(); i++) {
      mentionsList.get(i).mentionNum = i;
    }
  }


  private static void mentionReordering(Document doc, HeadFinder headFinder) throws Exception {
    List> mentions = doc.predictedMentions;
    List sentences = doc.annotation.get(SentencesAnnotation.class);

    for (int i=0 ; i mentionsInSent = mentions.get(i);
      mentions.set(i, mentionReorderingBySpan(mentionsInSent));
    }
  }

  protected static int getHeadIndex(Tree t, HeadFinder headFinder) {
    // The trees passed in do not have the CoordinationTransformer
    // applied, but that just means the SemanticHeadFinder results are
    // slightly worse.
    Tree ht = t.headTerminal(headFinder);
    if(ht==null) return -1;  // temporary: a key which is matched to nothing
    CoreLabel l = (CoreLabel) ht.label();
    return l.get(CoreAnnotations.IndexAnnotation.class);
  }

  private static List mentionReorderingBySpan(List mentionsInSent) {
    TreeSet ordering = new TreeSet<>(new Comparator() {
      @Override
      public int compare(Mention m1, Mention m2) {
        return (m1.appearEarlierThan(m2)) ? -1 : (m2.appearEarlierThan(m1)) ? 1 : 0;
      }
    });
    ordering.addAll(mentionsInSent);
    List orderedMentions = Generics.newArrayList(ordering);
    return orderedMentions;
  }

  private static void fillSyntacticInfo(Document doc) {

    List> mentions = doc.predictedMentions;
    List sentences = doc.annotation.get(SentencesAnnotation.class);

    for (int i=0 ; i mentionsInSent = mentions.get(i);
      findSyntacticRelationsFromDependency(mentionsInSent);
    }
  }

  /** assign mention IDs, find twin mentions, fill mention positions, initialize coref clusters, etc
   * @throws Exception */
  private static void initializeMentions(Document doc, Dictionaries dict, LogisticClassifier singletonPredictor, HeadFinder headFinder) throws Exception {
    boolean hasGold = (doc.goldMentions != null);
    assignMentionIDs(doc);
    if(hasGold) findTwinMentions(doc, true);
    fillMentionInfo(doc, dict, singletonPredictor, headFinder);
    doc.allPositions = Generics.newHashMap(doc.positions);    // allPositions retain all mentions even after postprocessing
  }

  private static void assignMentionIDs(Document doc) {
    boolean hasGold = (doc.goldMentions != null);
    int maxID = 0;
    if(hasGold) {
      for (List golds : doc.goldMentions) {
        for (Mention g : golds) {
          g.mentionID = maxID++;
        }
      }
    }
    for (List predicted : doc.predictedMentions) {
      for (Mention p : predicted) {
        p.mentionID = maxID++;
      }
    }
  }

  /** Mark twin mentions in gold and predicted mentions */
  protected static void findTwinMentions(Document doc, boolean strict){
    if(strict) findTwinMentionsStrict(doc);
    else findTwinMentionsRelaxed(doc);
  }

  /** Mark twin mentions: All mention boundaries should be matched */
  private static void findTwinMentionsStrict(Document doc){
    for(int sentNum = 0; sentNum < doc.goldMentions.size(); sentNum++) {
      List golds = doc.goldMentions.get(sentNum);
      List predicts = doc.predictedMentions.get(sentNum);

      // For CoNLL training there are some documents with gold mentions with the same position offsets
      // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
      //  (Packwood - Roth)
      CollectionValuedMap goldMentionPositions = new CollectionValuedMap<>();
      for(Mention g : golds) {
        IntPair ip = new IntPair(g.startIndex, g.endIndex);
        if (goldMentionPositions.containsKey(ip)) {
          StringBuilder existingMentions = new StringBuilder();
          for (Mention eg: goldMentionPositions.get(ip)) {
            if (existingMentions.length() > 0) {
              existingMentions.append(",");
            }
            existingMentions.append(eg.mentionID);
          }
          Redwood.log("debug-preprocessor", "WARNING: gold mentions with the same offsets: " + ip
                  + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString());
        }
        //assert(!goldMentionPositions.containsKey(ip));
        goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g);
      }
      for(Mention p : predicts) {
        IntPair pos = new IntPair(p.startIndex, p.endIndex);
        if(goldMentionPositions.containsKey(pos)) {
          Collection cm = goldMentionPositions.get(pos);
          int minId = Integer.MAX_VALUE;
          Mention g = null;
          for (Mention m : cm) {
            if (m.mentionID < minId) {
              g = m;
              minId = m.mentionID;
            }
          }
          cm.remove(g);
          p.mentionID = g.mentionID;
          p.hasTwin = true;
          g.hasTwin = true;
        }
      }
    }
  }

  /** Mark twin mentions: heads of the mentions are matched */
  private static void findTwinMentionsRelaxed(Document doc) {
    for(int sentNum = 0; sentNum < doc.goldMentions.size(); sentNum++) {
      List golds = doc.goldMentions.get(sentNum);
      List predicts = doc.predictedMentions.get(sentNum);

      Map goldMentionPositions = Generics.newHashMap();
      Map> goldMentionHeadPositions = Generics.newHashMap();
      for(Mention g : golds) {
        goldMentionPositions.put(new IntPair(g.startIndex, g.endIndex), g);
        if(!goldMentionHeadPositions.containsKey(g.headIndex)) {
          goldMentionHeadPositions.put(g.headIndex, new LinkedList<>());
        }
        goldMentionHeadPositions.get(g.headIndex).add(g);
      }

      List remains = new ArrayList<>();
      for (Mention p : predicts) {
        IntPair pos = new IntPair(p.startIndex, p.endIndex);
        if(goldMentionPositions.containsKey(pos)) {
          Mention g = goldMentionPositions.get(pos);
          p.mentionID = g.mentionID;
          p.hasTwin = true;
          g.hasTwin = true;
          goldMentionHeadPositions.get(g.headIndex).remove(g);
          if(goldMentionHeadPositions.get(g.headIndex).isEmpty()) {
            goldMentionHeadPositions.remove(g.headIndex);
          }
        }
        else remains.add(p);
      }
      for (Mention r : remains){
        if(goldMentionHeadPositions.containsKey(r.headIndex)) {
          Mention g = goldMentionHeadPositions.get(r.headIndex).poll();
          r.mentionID = g.mentionID;
          r.hasTwin = true;
          g.hasTwin = true;
          if(goldMentionHeadPositions.get(g.headIndex).isEmpty()) {
            goldMentionHeadPositions.remove(g.headIndex);
          }
        }
      }
    }
  }

  /** initialize several variables for mentions
   * @throws Exception
   */
  private static void fillMentionInfo(Document doc, Dictionaries dict,
      LogisticClassifier singletonPredictor, HeadFinder headFinder) throws Exception {
    List sentences = doc.annotation.get(SentencesAnnotation.class);

    for(int i = 0; i < doc.predictedMentions.size(); i ++){
      CoreMap sentence = sentences.get(i);
      for(int j = 0; j < doc.predictedMentions.get(i).size(); j ++){
        Mention m = doc.predictedMentions.get(i).get(j);
        doc.predictedMentionsByID.put(m.mentionID, m);      // mentionsByID

        IntTuple pos = new IntTuple(2);
        pos.set(0, i);
        pos.set(1, j);
        doc.positions.put(m, pos);        // positions
        m.sentNum = i;                    // sentNum

        IntTuple headPosition = new IntTuple(2);
        headPosition.set(0, i);
        headPosition.set(1, m.headIndex);
        doc.mentionheadPositions.put(headPosition, m);    // headPositions

        m.contextParseTree = sentence.get(TreeAnnotation.class);
//        m.sentenceWords = sentence.get(TokensAnnotation.class);
        m.basicDependency = sentence.get(BasicDependenciesAnnotation.class);
        m.enhancedDependency = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
        if (m.enhancedDependency == null) {
          m.enhancedDependency = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
        }

        // mentionSubTree (highest NP that has the same head) if constituency tree available
        if (m.contextParseTree != null) {
          Tree headTree = m.contextParseTree.getLeaves().get(m.headIndex);
          if (headTree == null) { throw new RuntimeException("Missing head tree for a mention!"); }
          Tree t = headTree;
          while ((t = t.parent(m.contextParseTree)) != null) {
            if (t.headTerminal(headFinder) == headTree && t.value().equals("NP")) {
              m.mentionSubTree = t;
            } else if(m.mentionSubTree != null){
              break;
            }
          }
          if (m.mentionSubTree == null) {
            m.mentionSubTree = headTree;
          }
        }

        m.process(dict, null, singletonPredictor);
      }
    }


    boolean hasGold = (doc.goldMentions != null);
    if(hasGold) {
      doc.goldMentionsByID = Generics.newHashMap();
      int sentNum = 0;
      for(List golds : doc.goldMentions) {
        for(Mention g : golds) {
          doc.goldMentionsByID.put(g.mentionID, g);
          g.sentNum = sentNum;
        }
        sentNum++;
      }
    }
  }

  private static void findSyntacticRelationsFromDependency(List orderedMentions) {
    if(orderedMentions.size()==0) return;
    markListMemberRelation(orderedMentions);
    SemanticGraph dependency = orderedMentions.get(0).enhancedDependency;

    // apposition
    Set> appos = Generics.newHashSet();
    List appositions = dependency.findAllRelns(UniversalEnglishGrammaticalRelations.APPOSITIONAL_MODIFIER);
    for(SemanticGraphEdge edge : appositions) {
      int sIdx = edge.getSource().index()-1;
      int tIdx = edge.getTarget().index()-1;
      appos.add(Pair.makePair(sIdx, tIdx));
    }
    markMentionRelation(orderedMentions, appos, "APPOSITION");

    // predicate nominatives
    Set> preNomi = Generics.newHashSet();
    List copula = dependency.findAllRelns(UniversalEnglishGrammaticalRelations.COPULA);
    for(SemanticGraphEdge edge : copula) {
      IndexedWord source = edge.getSource();
      IndexedWord target = dependency.getChildWithReln(source, UniversalEnglishGrammaticalRelations.NOMINAL_SUBJECT);
      if(target==null) target = dependency.getChildWithReln(source, UniversalEnglishGrammaticalRelations.CLAUSAL_SUBJECT);
      // TODO
      if(target == null) continue;

      // to handle relative clause: e.g., Tim who is a student,
      if(target.tag().startsWith("W")) {
        IndexedWord parent = dependency.getParent(source);
        if(parent!=null && dependency.reln(parent, source).equals(UniversalEnglishGrammaticalRelations.RELATIVE_CLAUSE_MODIFIER)) {
          target = parent;
        }
      }
      int sIdx = source.index()-1;
      int tIdx = target.index()-1;
      preNomi.add(Pair.makePair(tIdx, sIdx));
    }
    markMentionRelation(orderedMentions, preNomi, "PREDICATE_NOMINATIVE");


    // relative pronouns  TODO
    Set> relativePronounPairs = Generics.newHashSet();
    markMentionRelation(orderedMentions, relativePronounPairs, "RELATIVE_PRONOUN");
  }

  private static void initializeClusters(Document doc) {
    for (List predicted : doc.predictedMentions) {
      for (Mention p : predicted) {
        doc.corefClusters.put(p.mentionID, new CorefCluster(p.mentionID, Generics.newHashSet(Arrays.asList(p))));
        p.corefClusterID = p.mentionID;
      }
    }
    boolean hasGold = (doc.goldMentions != null);
    if(hasGold) {
      for(List golds : doc.goldMentions) {
        for(Mention g : golds) {
          doc.goldMentionsByID.put(g.mentionID, g);
        }
      }
    }
  }

  /** Find document type: Conversation or article  */
  private static DocType findDocType(Document doc) {
    boolean speakerChange = false;

    for(CoreMap sent : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      for(CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
        int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class);
        if(utterIndex!=0) speakerChange = true;
        if(speakerChange && utterIndex==0) return DocType.ARTICLE;
        if(doc.maxUtter < utterIndex) doc.maxUtter = utterIndex;
      }
    }
    if(!speakerChange) return DocType.ARTICLE;
    return DocType.CONVERSATION;  // in conversation, utter index keep increasing.
  }

  /** Set paragraph index */
  private static void setParagraphAnnotation(Document doc) {
    int paragraphIndex = 0;
    int previousOffset = -10;
    for(CoreMap sent : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      for(CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
        if(w.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
          if(w.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) > previousOffset+2) paragraphIndex++;
          w.set(CoreAnnotations.ParagraphAnnotation.class, paragraphIndex);
          previousOffset = w.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        } else {
          w.set(CoreAnnotations.ParagraphAnnotation.class, -1);
        }
      }
    }
    for(List l : doc.predictedMentions) {
      for(Mention m : l){
        m.paragraph = m.headWord.get(CoreAnnotations.ParagraphAnnotation.class);
      }
    }
    doc.numParagraph = paragraphIndex;
  }

  /** Process discourse information */
  protected static void processDiscourse(Document doc, Dictionaries dict) {
    Boolean useMarkedDiscourse =
        doc.annotation.get(CoreAnnotations.UseMarkedDiscourseAnnotation.class);
    if (useMarkedDiscourse == null || !useMarkedDiscourse) {
      for (CoreLabel l : doc.annotation.get(CoreAnnotations.TokensAnnotation.class)) {
        l.remove(CoreAnnotations.SpeakerAnnotation.class);
        l.remove(CoreAnnotations.UtteranceAnnotation.class);
      }
    }

    setUtteranceAndSpeakerAnnotation(doc);
//    markQuotations(this.annotation.get(CoreAnnotations.SentencesAnnotation.class), false);

    // mention utter setting
    for(Mention m : doc.predictedMentionsByID.values()) {
      m.utter = m.headWord.get(CoreAnnotations.UtteranceAnnotation.class);
    }

    doc.docType = findDocType(doc);
    findSpeakers(doc, dict);

    boolean debug = false;
    if(debug) {
      for(CoreMap sent : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        for(CoreLabel cl : sent.get(TokensAnnotation.class)) {
          log.info("   "+cl.word()+"-"+cl.get(UtteranceAnnotation.class)+"-"+cl.get(SpeakerAnnotation.class));
        }
      }


      for(Integer utter : doc.speakers.keySet()) {
        String speakerID = doc.speakers.get(utter);
        log.info("utterance: "+utter);
        log.info("speakers value: " + speakerID);
        log.info("mention for it: "+
            ( (NumberMatchingRegex.isDecimalInteger(speakerID))?
                doc.predictedMentionsByID.get(Integer.parseInt(doc.speakers.get(utter)))
                : "no mention for this speaker yet") );
      }
      log.info("AA SPEAKERS: "+ doc.speakers);
    }

    // build 'speakerInfo' from 'speakers'
    for(Integer utter : doc.speakers.keySet()) {
      String speaker = doc.speakers.get(utter);
      SpeakerInfo speakerInfo = doc.speakerInfoMap.get(speaker);
      if (speakerInfo == null) {
        doc.speakerInfoMap.put(speaker, speakerInfo = new SpeakerInfo(speaker));
      }
    }
    if(debug){
      log.info("BB SPEAKER INFO MAP: "+doc.speakerInfoMap);
    }

    // mention -> to its speakerID: m.headWord.get(SpeakerAnnotation.class)
    // speakerID -> more info: speakerInfoMap.get(speakerID)
    // if exists, set(mentionID, its speakerID pair): speakerPairs

    // for speakerInfo with real speaker name, find corresponding mention by strict/loose matching
    Map speakerConversion = Generics.newHashMap();
    for(String speaker : doc.speakerInfoMap.keySet()) {
      SpeakerInfo speakerInfo = doc.speakerInfoMap.get(speaker);
      if (speakerInfo.hasRealSpeakerName()) {   // do only for real name speaker, not mention ID
        boolean found = false;
        for(Mention m : doc.predictedMentionsByID.values()) {
          if (CorefRules.mentionMatchesSpeaker(m, speakerInfo, true)) {
            speakerConversion.put(speaker, m.mentionID);
            found = true;
            break;
          }
        }
        if(!found) {
          for(Mention m : doc.predictedMentionsByID.values()) {
            if (CorefRules.mentionMatchesSpeaker(m, speakerInfo, false)) {
              speakerConversion.put(speaker, m.mentionID);
              break;
            }
          }
        }
      }
    }

    if(debug) log.info("CC speaker conversion: " + speakerConversion);

    // convert real name speaker to speaker mention id
    for(Integer utter : doc.speakers.keySet()) {
      String speaker = doc.speakers.get(utter);
      if(speakerConversion.containsKey(speaker)) {
        int speakerID = speakerConversion.get(speaker);
        doc.speakers.put(utter, Integer.toString(speakerID));
      }
    }
    for(String speaker : speakerConversion.keySet()) {
      doc.speakerInfoMap.put( Integer.toString(speakerConversion.get(speaker)), doc.speakerInfoMap.get(speaker));
      doc.speakerInfoMap.remove(speaker);
    }

    // fix SpeakerAnnotation
    for(CoreLabel cl : doc.annotation.get(TokensAnnotation.class)) {
      int utter = cl.get(UtteranceAnnotation.class);
      if(doc.speakers.containsKey(utter)) {
        cl.set(CoreAnnotations.SpeakerAnnotation.class, doc.speakers.get(utter));
      }
    }

    // find speakerPairs
    for(Mention m : doc.predictedMentionsByID.values()) {
      String speaker = m.headWord.get(CoreAnnotations.SpeakerAnnotation.class);
      if(debug) log.info("DD: "+speaker);
      if (NumberMatchingRegex.isDecimalInteger(speaker)) {
        int speakerMentionID = Integer.parseInt(speaker);
        doc.speakerPairs.add(new Pair<>(m.mentionID, speakerMentionID));
      }
    }

    if(debug) {
      log.info("==========================================================================");
      for(Integer utter : doc.speakers.keySet()) {
        String speakerID = doc.speakers.get(utter);
        log.info("utterance: "+utter);
        log.info("speakers value: " + speakerID);
        log.info("mention for it: "+
            ( (NumberMatchingRegex.isDecimalInteger(speakerID))?
                doc.predictedMentionsByID.get(Integer.parseInt(doc.speakers.get(utter)))
                : "no mention for this speaker yet") );
      }
      log.info(doc.speakers);
    }
  }

  private static void setUtteranceAndSpeakerAnnotation(Document doc) {
    doc.speakerInfoGiven = false;
    int utterance = 0;
    int outsideQuoteUtterance = 0;   // the utterance of outside of quotation
    boolean insideQuotation = false;
    List tokens = doc.annotation.get(CoreAnnotations.TokensAnnotation.class);
    String preSpeaker = (tokens.size() > 0)? tokens.get(0).get(CoreAnnotations.SpeakerAnnotation.class) : null;

    for (CoreLabel l : tokens) {
      String curSpeaker = l.get(CoreAnnotations.SpeakerAnnotation.class);
      String w = l.get(CoreAnnotations.TextAnnotation.class);

      if (curSpeaker!=null && !curSpeaker.equals("-")) doc.speakerInfoGiven = true;

      boolean speakerChange = doc.speakerInfoGiven && curSpeaker!=null && !curSpeaker.equals(preSpeaker);
      boolean quoteStart = w.equals("``") || (!insideQuotation && w.equals("\""));
      boolean quoteEnd = w.equals("''") || (insideQuotation && w.equals("\""));

      if(speakerChange) {
        if(quoteStart) {
          utterance = doc.maxUtter + 1;
          outsideQuoteUtterance = utterance+1;
        } else {
          utterance = doc.maxUtter + 1;
          outsideQuoteUtterance = utterance;
        }
        preSpeaker = curSpeaker;
      } else {
        if(quoteStart) {
          utterance = doc.maxUtter + 1;
        }
      }
      if(quoteEnd) {
        utterance = outsideQuoteUtterance;
        insideQuotation = false;
      }
      if(doc.maxUtter < utterance) doc.maxUtter = utterance;

      l.set(CoreAnnotations.UtteranceAnnotation.class, utterance);
      if(quoteStart) l.set(CoreAnnotations.UtteranceAnnotation.class, outsideQuoteUtterance);   // quote start got outside utterance idx

      boolean noSpeakerInfo = !l.containsKey(CoreAnnotations.SpeakerAnnotation.class)
      || l.get(CoreAnnotations.SpeakerAnnotation.class).equals("")
      || l.get(CoreAnnotations.SpeakerAnnotation.class).startsWith("PER");

      if(noSpeakerInfo || insideQuotation){
        l.set(CoreAnnotations.SpeakerAnnotation.class, "PER"+utterance);
      }
      if(quoteStart) insideQuotation = true;
    }
  }

  /** Speaker extraction */
  private static void findSpeakers(Document doc, Dictionaries dict) {
    Boolean useMarkedDiscourseBoolean = doc.annotation.get(CoreAnnotations.UseMarkedDiscourseAnnotation.class);
    boolean useMarkedDiscourse = (useMarkedDiscourseBoolean != null)? useMarkedDiscourseBoolean: false;

    if(!useMarkedDiscourse) {
      if(doc.docType==DocType.CONVERSATION) findSpeakersInConversation(doc, dict);
      else if (doc.docType==DocType.ARTICLE) findSpeakersInArticle(doc, dict);
    }

    for(CoreMap sent : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      for(CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
        int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class);
        if(!doc.speakers.containsKey(utterIndex)) {
          doc.speakers.put(utterIndex, w.get(CoreAnnotations.SpeakerAnnotation.class));
        }
      }
    }
  }

  private static void findSpeakersInArticle(Document doc, Dictionaries dict) {
    List sentences = doc.annotation.get(CoreAnnotations.SentencesAnnotation.class);
    IntPair beginQuotation = null;
    IntPair endQuotation = null;
    boolean insideQuotation = false;
    int utterNum = -1;

    for (int i = 0 ; i < sentences.size(); i++) {
      List sent = sentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
      for(int j = 0 ; j < sent.size() ; j++) {
        int utterIndex = sent.get(j).get(CoreAnnotations.UtteranceAnnotation.class);

        if(utterIndex != 0 && !insideQuotation) {
          utterNum = utterIndex;
          insideQuotation = true;
          beginQuotation = new IntPair(i,j);
        } else if (utterIndex == 0 && insideQuotation) {
          insideQuotation = false;
          endQuotation = new IntPair(i,j);
          findQuotationSpeaker(doc, utterNum, sentences, beginQuotation, endQuotation, dict);
        }
      }
    }
    if(insideQuotation) {
      endQuotation = new IntPair(sentences.size()-1, sentences.get(sentences.size()-1).get(CoreAnnotations.TokensAnnotation.class).size()-1);
      findQuotationSpeaker(doc, utterNum, sentences, beginQuotation, endQuotation, dict);
    }
  }

  private static void findQuotationSpeaker(Document doc, int utterNum, List sentences,
      IntPair beginQuotation, IntPair endQuotation, Dictionaries dict) {

    if(findSpeaker(doc, utterNum, beginQuotation.get(0), sentences, 0, beginQuotation.get(1), dict))
      return ;

    if(findSpeaker(doc, utterNum, endQuotation.get(0), sentences, endQuotation.get(1),
        sentences.get(endQuotation.get(0)).get(CoreAnnotations.TokensAnnotation.class).size(), dict))
      return;

    if(beginQuotation.get(1) <= 1 && beginQuotation.get(0) > 0) {
      if(findSpeaker(doc, utterNum, beginQuotation.get(0)-1, sentences, 0,
          sentences.get(beginQuotation.get(0)-1).get(CoreAnnotations.TokensAnnotation.class).size(), dict))
        return;
    }

    if(endQuotation.get(1) >= sentences.get(endQuotation.get(0)).size()-2
        && sentences.size() > endQuotation.get(0)+1) {
      if(findSpeaker(doc, utterNum, endQuotation.get(0)+1, sentences, 0,
          sentences.get(endQuotation.get(0)+1).get(CoreAnnotations.TokensAnnotation.class).size(), dict))
        return;
    }
  }

  private static boolean findSpeaker(Document doc, int utterNum, int sentNum, List sentences,
      int startIndex, int endIndex, Dictionaries dict) {
    List sent = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class);
    for(int i = startIndex ; i < endIndex ; i++) {
      CoreLabel cl = sent.get(i);
      if(cl.get(CoreAnnotations.UtteranceAnnotation.class)!=0) continue;
      String lemma = cl.lemma();
      String word = cl.word();
      if(dict.reportVerb.contains(lemma) && cl.tag().startsWith("V")) {
        // find subject
        SemanticGraph dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
        if (dependency == null) {
          dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
        }
        IndexedWord w = dependency.getNodeByWordPattern(word);

        if (w != null) {
          if(findSubject(doc, dependency, w, sentNum, utterNum)) return true;
          for(IndexedWord p : dependency.getPathToRoot(w)) {
            if(!p.tag().startsWith("V") && !p.tag().startsWith("MD")) break;
            if(findSubject(doc, dependency, p, sentNum, utterNum)) return true;    // handling something like "was talking", "can tell"
          }
        } else {
          Redwood.log("debug-preprocessor", "Cannot find node in dependency for word " + word);
        }
      }
    }
    return false;
  }

  private static boolean findSubject(Document doc, SemanticGraph dependency, IndexedWord w, int sentNum, int utterNum) {
    for(Pair child : dependency.childPairs(w)){
      if(child.first().getShortName().equals("nsubj")) {
        String subjectString = child.second().word();
        int subjectIndex = child.second().index();  // start from 1
        IntTuple headPosition = new IntTuple(2);
        headPosition.set(0, sentNum);
        headPosition.set(1, subjectIndex-1);
        String speaker;
        if(doc.mentionheadPositions.containsKey(headPosition)) {
          speaker = Integer.toString(doc.mentionheadPositions.get(headPosition).mentionID);
        } else {
          speaker = subjectString;
        }
        doc.speakers.put(utterNum, speaker);
        return true;
      }
    }
    return false;
  }

  private static void findSpeakersInConversation(Document doc, Dictionaries dict) {
    for(List l : doc.predictedMentions) {
      for(Mention m : l){
        if(m.predicateNominatives == null) continue;
        for (Mention a : m.predicateNominatives){
          if(a.spanToString().toLowerCase().equals("i")) {
            doc.speakers.put(m.headWord.get(CoreAnnotations.UtteranceAnnotation.class), Integer.toString(m.mentionID));
          }
        }
      }
    }
    List paragraph = new ArrayList<>();
    int paragraphUtterIndex = 0;
    String nextParagraphSpeaker = "";
    int paragraphOffset = 0;
    for(CoreMap sent : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      paragraph.add(sent);
      int currentUtter = sent.get(CoreAnnotations.TokensAnnotation.class).get(0).get(CoreAnnotations.UtteranceAnnotation.class);
      if(paragraphUtterIndex!=currentUtter) {
        nextParagraphSpeaker = findParagraphSpeaker(doc, paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
        paragraphUtterIndex = currentUtter;
        paragraphOffset += paragraph.size();
        paragraph = new ArrayList<>();
      }
    }
    findParagraphSpeaker(doc, paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
  }

  private static String findParagraphSpeaker(Document doc, List paragraph,
            int paragraphUtterIndex, String nextParagraphSpeaker, int paragraphOffset, Dictionaries dict) {
    if ( ! doc.speakers.containsKey(paragraphUtterIndex)) {
      if ( ! nextParagraphSpeaker.isEmpty()) {
        doc.speakers.put(paragraphUtterIndex, nextParagraphSpeaker);
      } else {  // find the speaker of this paragraph (John, nbc news)
        // cdm [Sept 2015] added this check to try to avoid crash
        if (paragraph.isEmpty()) {
          Redwood.log("debug-preprocessor", "Empty paragraph; skipping findParagraphSpeaker");
          return "";
        }
        CoreMap lastSent = paragraph.get(paragraph.size()-1);
        String speaker = "";
        boolean hasVerb = false;
        for(int i = 0 ; i < lastSent.get(CoreAnnotations.TokensAnnotation.class).size() ; i++){
          CoreLabel w = lastSent.get(CoreAnnotations.TokensAnnotation.class).get(i);
          String pos = w.get(CoreAnnotations.PartOfSpeechAnnotation.class);
          String ner = w.get(CoreAnnotations.NamedEntityTagAnnotation.class);
          if(pos.startsWith("V")) {
            hasVerb = true;
            break;
          }
          if(ner.startsWith("PER")) {
            IntTuple headPosition = new IntTuple(2);
            headPosition.set(0, paragraph.size()-1 + paragraphOffset);
            headPosition.set(1, i);
            if(doc.mentionheadPositions.containsKey(headPosition)) {
              speaker = Integer.toString(doc.mentionheadPositions.get(headPosition).mentionID);
            }
          }
        }
        if(!hasVerb && !speaker.equals("")) {
          doc.speakers.put(paragraphUtterIndex, speaker);
        }
      }
    }
    return findNextParagraphSpeaker(doc, paragraph, paragraphOffset, dict);
  }

  private static String findNextParagraphSpeaker(Document doc, List paragraph, int paragraphOffset, Dictionaries dict) {
    if (paragraph.isEmpty()) {
      return "";
    }
    CoreMap lastSent = paragraph.get(paragraph.size()-1);
    String speaker = "";
    for(CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) {
      if(w.get(CoreAnnotations.LemmaAnnotation.class).equals("report") || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) {
        String word = w.get(CoreAnnotations.TextAnnotation.class);
        SemanticGraph dependency = lastSent.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
        if (dependency == null) {
          dependency = lastSent.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
        }
        IndexedWord t = dependency.getNodeByWordPattern(word);

        for(Pair child : dependency.childPairs(t)){
          if(child.first().getShortName().equals("nsubj")) {
            int subjectIndex = child.second().index();  // start from 1
            IntTuple headPosition = new IntTuple(2);
            headPosition.set(0, paragraph.size()-1 + paragraphOffset);
            headPosition.set(1, subjectIndex-1);
            if(doc.mentionheadPositions.containsKey(headPosition)
                && doc.mentionheadPositions.get(headPosition).nerString.startsWith("PER")) {
              speaker = Integer.toString(doc.mentionheadPositions.get(headPosition).mentionID);
            }
          }
        }
      }
    }
    return speaker;
  }

  /** Check one mention is the speaker of the other mention */
  public static boolean isSpeaker(Mention m, Mention ant, Dictionaries dict) {

    if(!dict.firstPersonPronouns.contains(ant.spanToString().toLowerCase())
        || ant.number==Number.PLURAL || ant.sentNum!=m.sentNum) return false;

    int countQuotationMark = 0;
    for(int i = Math.min(m.headIndex, ant.headIndex)+1 ; i < Math.max(m.headIndex, ant.headIndex) ; i++) {
      String word = m.sentenceWords.get(i).get(CoreAnnotations.TextAnnotation.class);
      if(word.equals("``") || word.equals("''")) countQuotationMark++;
    }
    if(countQuotationMark!=1) return false;

    IndexedWord w = m.enhancedDependency.getNodeByWordPattern(m.sentenceWords.get(m.headIndex).get(CoreAnnotations.TextAnnotation.class));
    if(w== null) return false;

    for(Pair parent : m.enhancedDependency.parentPairs(w)){
      if(parent.first().getShortName().equals("nsubj")
          && dict.reportVerb.contains(parent.second().get(CoreAnnotations.LemmaAnnotation.class))) {
        return true;
      }
    }
    return false;
  }

  private static void markListMemberRelation(List orderedMentions) {
    for(Mention m1 : orderedMentions){
      for(Mention m2 : orderedMentions){
        // Mark if m2 and m1 are in list relationship
        if (m1.isListMemberOf(m2)) {
          m2.addListMember(m1);
          m1.addBelongsToList(m2);
        } else if (m2.isListMemberOf(m1)) {
          m1.addListMember(m2);
          m2.addBelongsToList(m1);
        }
      }
    }
  }
  private static void markMentionRelation(List orderedMentions, Set> foundPairs, String flag) {
    for(Mention m1 : orderedMentions){
      for(Mention m2 : orderedMentions){
        if(m1==m2) continue;
        // Ignore if m2 and m1 are in list relationship
        if (m1.isListMemberOf(m2) || m2.isListMemberOf(m1) || m1.isMemberOfSameList(m2)) {
          //Redwood.log("debug-preprocessor", "Not checking '" + m1 + "' and '" + m2 + "' for " + flag + ": in list relationship");
          continue;
        }
        for(Pair foundPair: foundPairs){
          if (foundPair.first() == m1.headIndex && foundPair.second() == m2.headIndex) {
            if(flag.equals("APPOSITION")) {
              if ( ! foundPair.first().equals(foundPair.second()) || m2.insideIn(m1)) {
                m2.addApposition(m1);
              }
            }
            else if(flag.equals("PREDICATE_NOMINATIVE")) {
              m2.addPredicateNominatives(m1);
            }
            else if(flag.equals("RELATIVE_PRONOUN")) m2.addRelativePronoun(m1);
            else throw new RuntimeException("check flag in markMentionRelation (dcoref/MentionExtractor.java)");
          }
        }
      }
    }
  }

//  private static final TregexPattern relativePronounPattern = TregexPattern.compile("NP < (NP=m1 $.. (SBAR < (WHNP < WP|WDT=m2)))");
//  private static void findRelativePronouns(Tree tree, Set> relativePronounPairs) {
//    findTreePattern(tree, relativePronounPattern, relativePronounPairs);
//  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy