All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.dcoref.Document Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
//
// StanfordCoreNLP -- a suite of NLP tools
// Copyright (c) 2009-2010 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//

package edu.stanford.nlp.dcoref;

import java.io.Serializable;
import java.util.*;

import edu.stanford.nlp.dcoref.Dictionaries.Number;
import edu.stanford.nlp.dcoref.Dictionaries.Person;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.math.NumberMatchingRegex;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.IntTuple;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.TwoDimensionalMap;
import edu.stanford.nlp.util.TwoDimensionalSet;

public class Document implements Serializable {

  private static final long serialVersionUID = -4139866807494603953L;

  public enum DocType { CONVERSATION, ARTICLE }

  /** The type of document: conversational or article */
  public DocType docType;

  /** Document annotation */
  public Annotation annotation;

  /** for conll shared task 2011  */
  public CoNLL2011DocumentReader.Document conllDoc;

  /** The list of gold mentions */
  public List> goldOrderedMentionsBySentence;
  /** The list of predicted mentions */
  public List> predictedOrderedMentionsBySentence;

  /** return the list of predicted mentions */
  public List> getOrderedMentions() {
    return predictedOrderedMentionsBySentence;
  }

  /** Clusters for coreferent mentions */
  public Map corefClusters;

  /** Gold Clusters for coreferent mentions */
  public Map goldCorefClusters;

  /** For all mentions in a document, map mentionID to mention. */
  public Map allPredictedMentions;
  public Map allGoldMentions;

  /** Set of roles (in role apposition) in a document  */
  public Set roleSet;

  /**
   * Position of each mention in the input matrix
   * Each mention occurrence with sentence # and position within sentence
   * (Nth mention, not Nth token)
   */
  public Map positions;              // mentions may be removed from this due to post processing
  public Map allPositions;           // all mentions (mentions will not be removed from this)

  public final Map mentionheadPositions;

  /** List of gold links in a document by positions */
  private List> goldLinks;

  /** Map UtteranceAnnotation to String (speaker): mention ID or speaker string  */
  public Map speakers;

  /** Pair of mention id, and the mention's speaker id  */
  public Set> speakerPairs;

  public int maxUtter;
  public int numParagraph;
  public int numSentences;

  /** Set of incompatible clusters pairs */
  private TwoDimensionalSet incompatibles;
  private TwoDimensionalSet incompatibleClusters;

  protected TwoDimensionalMap acronymCache;

  /** Map of speaker name/id to speaker info */
  transient private Map speakerInfoMap = Generics.newHashMap();

  /**
   * Creates an empty document: every mention, cluster, speaker and cache
   * container is allocated with no entries.  {@code goldCorefClusters} is
   * explicitly left {@code null}; it is filled by
   * {@link #extractGoldCorefClusters()}.
   */
  public Document() {
    // mention bookkeeping
    allPredictedMentions = Generics.newHashMap();
    allGoldMentions = Generics.newHashMap();
    positions = Generics.newHashMap();
    mentionheadPositions = Generics.newHashMap();
    roleSet = Generics.newHashSet();

    // clusters
    corefClusters = Generics.newHashMap();
    goldCorefClusters = null;

    // speakers
    speakers = Generics.newHashMap();
    speakerPairs = Generics.newHashSet();

    // pairwise constraints and caches
    incompatibles = TwoDimensionalSet.hashSet();
    incompatibleClusters = TwoDimensionalSet.hashSet();
    acronymCache = TwoDimensionalMap.hashMap();
  }

  /**
   * Builds a document from an annotation and its mentions, then runs
   * initialization and discourse processing.
   *
   * @param anno              the document annotation; its sentence list determines {@code numSentences}
   * @param predictedMentions predicted mentions, one inner list per sentence
   * @param goldMentions      gold mentions, one inner list per sentence, or {@code null} when
   *                          no gold annotation is available
   * @param dict              dictionaries passed on to discourse processing
   */
  public Document(Annotation anno, List<List<Mention>> predictedMentions,
      List<List<Mention>> goldMentions, Dictionaries dict) {
    this();
    annotation = anno;
    numSentences = anno.get(CoreAnnotations.SentencesAnnotation.class).size();
    predictedOrderedMentionsBySentence = predictedMentions;
    goldOrderedMentionsBySentence = goldMentions;
    if (goldMentions != null) {
      // Pair predicted mentions with gold ones first: twinning rewrites the
      // predicted mention IDs that initialize() relies on below.
      findTwinMentions(true);
      // fill allGoldMentions
      for (List<Mention> sentenceGolds : goldOrderedMentionsBySentence) {
        for (Mention g : sentenceGolds) {
          allGoldMentions.put(g.mentionID, g);
        }
      }
    }
    // set original ID, initial coref clusters, paragraph annotation, mention positions
    initialize();
    processDiscourse(dict);
    printMentionDetection();
  }

  /**
   * Derives discourse-level information for the document.
   * <p>
   * In order: determines the document type, marks quotations on the sentence
   * tokens, resolves speakers, and then walks every predicted mention to
   * (a) register and attach a {@link SpeakerInfo}, (b) record
   * (mention ID, speaker mention ID) pairs for numeric speaker tags, and
   * (c) flag "you" followed by "know" as generic in non-article documents.
   * A second pass loosely matches still-unassigned mentions against speakers
   * that have a real name.
   *
   * @param dict dictionaries handed to document-type and speaker detection
   */
  protected void processDiscourse(Dictionaries dict) {
    docType = findDocType(dict);
    markQuotations(this.annotation.get(CoreAnnotations.SentencesAnnotation.class), false);
    findSpeakers(dict);

    // Pass 1: per-mention speaker bookkeeping.
    for (Mention mention : allPredictedMentions.values()) {
      int utterance = mention.headWord.get(CoreAnnotations.UtteranceAnnotation.class);
      String speakerTag = mention.headWord.get(CoreAnnotations.SpeakerAnnotation.class);

      if (speakerTag != null) {
        // A speaker tag seen for the first time is registered, and the strict
        // "this span is the speaker" test is only run on that occasion.
        SpeakerInfo known = speakerInfoMap.get(speakerTag);
        if (known == null) {
          known = new SpeakerInfo(speakerTag);
          speakerInfoMap.put(speakerTag, known);
          if (Rules.mentionMatchesSpeaker(mention, known, true)) {
            mention.speakerInfo = known;
          }
        }

        // A purely numeric tag is the mention ID of the speaker.
        if (NumberMatchingRegex.isDecimalInteger(speakerTag)) {
          try {
            int speakerMentionID = Integer.parseInt(speakerTag);
            if (utterance != 0) {
              // pair of (mention id, mention id of its speaker)
              speakerPairs.add(new Pair<>(mention.mentionID, speakerMentionID));
            }
          } catch (Exception e) {
            // no mention found for the speaker
            // nothing to do
          }
        }
      }

      // generic 'you', e.g. "you know" in conversation
      if (docType != DocType.ARTICLE && mention.person == Person.YOU) {
        boolean tokenAtEndIndexIsNotLast = mention.endIndex < mention.sentenceWords.size() - 1;
        if (tokenAtEndIndexIsNotLast) {
          String tokenAtEndIndex =
              mention.sentenceWords.get(mention.endIndex).get(CoreAnnotations.TextAnnotation.class);
          if (tokenAtEndIndex.equalsIgnoreCase("know")) {
            mention.generic = true;
          }
        }
      }
    }

    // Pass 2: all speakers are known now, so try a loose match for every
    // mention that has no speaker info yet (assumes few speakers).
    for (Mention mention : allPredictedMentions.values()) {
      if (mention.speakerInfo != null) {
        continue;
      }
      for (SpeakerInfo candidate : speakerInfoMap.values()) {
        if (candidate.hasRealSpeakerName()
            && Rules.mentionMatchesSpeaker(mention, candidate, false)) {
          mention.speakerInfo = candidate;
          break;
        }
      }
    }
  }

  /**
   * Initializes per-document state: assigns mention IDs when there is no gold
   * annotation, sets paragraph annotations, creates the singleton coref
   * clusters and finally snapshots {@code positions} into {@code allPositions}.
   */
  protected void initialize() {
    boolean noGoldMentions = (goldOrderedMentionsBySentence == null);
    if (noGoldMentions) {
      assignOriginalID();
    }
    setParagraphAnnotation();
    initializeCorefCluster();
    // copy taken after positions is fully populated
    this.allPositions = Generics.newHashMap(this.positions);
  }

  /**
   * Registers every predicted mention: records it by ID, stores its
   * (sentence, mention-order) position, puts it into its own singleton
   * {@link CorefCluster} whose ID equals the mention ID, and indexes it by
   * (sentence, head token index).  A mention ID that is already registered is
   * logged as a warning (and trips an assertion when assertions are enabled).
   */
  private void initializeCorefCluster() {
    final int sentenceCount = predictedOrderedMentionsBySentence.size();
    for (int sentIdx = 0; sentIdx < sentenceCount; sentIdx++) {
      final int mentionCount = predictedOrderedMentionsBySentence.get(sentIdx).size();
      for (int mentionIdx = 0; mentionIdx < mentionCount; mentionIdx++) {
        Mention m = predictedOrderedMentionsBySentence.get(sentIdx).get(mentionIdx);

        boolean alreadyRegistered = allPredictedMentions.containsKey(m.mentionID);
        if (alreadyRegistered) {
          Mention previous = allPredictedMentions.get(m.mentionID);
          SieveCoreferenceSystem.logger.warning("WARNING: Already contain mention " + m.mentionID);
          SieveCoreferenceSystem.logger.warning("OLD mention: " + previous.spanToString() + "[" + previous.startIndex + "," + previous.endIndex + "]");
          SieveCoreferenceSystem.logger.warning("NEW mention: " + m.spanToString() + "[" + m.startIndex + "," + m.endIndex + "]");
        }
        assert !alreadyRegistered;
        allPredictedMentions.put(m.mentionID, m);

        // position = (sentence index, order of the mention within the sentence)
        IntTuple mentionPosition = new IntTuple(2);
        mentionPosition.set(0, sentIdx);
        mentionPosition.set(1, mentionIdx);
        positions.put(m, mentionPosition);
        m.sentNum = sentIdx;

        // every mention starts out in a cluster of its own
        assert !corefClusters.containsKey(m.mentionID);
        CorefCluster singleton =
            new CorefCluster(m.mentionID, Generics.newHashSet(Collections.singletonList(m)));
        corefClusters.put(m.mentionID, singleton);
        m.corefClusterID = m.mentionID;

        // head position = (sentence index, head token index)
        IntTuple headKey = new IntTuple(2);
        headKey.set(0, sentIdx);
        headKey.set(1, m.headIndex);
        mentionheadPositions.put(headKey, m);
      }
    }
  }

  /**
   * Tells whether two clusters have been marked incompatible, i.e. whether
   * any pair of their mentions was marked as incompatible.
   *
   * @param c1 one cluster
   * @param c2 the other cluster (argument order does not matter)
   * @return {@code true} if the cluster pair is in the incompatible set
   */
  public boolean isIncompatible(CorefCluster c1, CorefCluster c2) {
    // pairs are stored as (smaller ID, larger ID)
    final int lowId;
    final int highId;
    if (c1.clusterID <= c2.clusterID) {
      lowId = c1.clusterID;
      highId = c2.clusterID;
    } else {
      lowId = c2.clusterID;
      highId = c1.clusterID;
    }
    return incompatibleClusters.contains(lowId, highId);
  }

  // Update incompatibles for two clusters that are about to be merged
  public void mergeIncompatibles(CorefCluster to, CorefCluster from) {
    List, Pair>> replacements =
            new ArrayList<>();
    for (Pair p : incompatibleClusters) {
      Integer other = null;
      if (p.first == from.clusterID) {
        other = p.second;
      } else if (p.second == from.clusterID) {
        other = p.first;
      }
      if (other != null && other != to.clusterID) {
        int cid1 = Math.min(other, to.clusterID);
        int cid2 = Math.max(other, to.clusterID);
        replacements.add(Pair.makePair(p, Pair.makePair(cid1, cid2)));
      }
    }
    for (Pair, Pair> r:replacements)  {
      incompatibleClusters.remove(r.first.first(), r.first.second());
      incompatibleClusters.add(r.second.first(), r.second.second());
    }
  }

  /**
   * Propagates positive acronym-cache entries across a cluster merge: for
   * every cached {@code true} entry that involves {@code from}, an equivalent
   * {@code true} entry is added for {@code to}.  Existing entries are not
   * removed.
   *
   * @param to   the cluster that survives the merge
   * @param from the cluster that is merged into {@code to}
   */
  public void mergeAcronymCache(CorefCluster to, CorefCluster from) {
    // Gather new entries first so the cache is not modified during iteration.
    TwoDimensionalSet<Integer, Integer> replacements = TwoDimensionalSet.hashSet();
    for (Integer first : acronymCache.firstKeySet()) {
      for (Integer second : acronymCache.get(first).keySet()) {
        // Boolean.TRUE.equals avoids unboxing a missing (null) cache value
        if (!Boolean.TRUE.equals(acronymCache.get(first, second))) {
          continue;
        }
        Integer other = null;
        if (first.intValue() == from.clusterID) {
          other = second;
        } else if (second.intValue() == from.clusterID) {
          other = first;
        }
        if (other != null && other.intValue() != to.clusterID) {
          int cid1 = Math.min(other, to.clusterID);
          int cid2 = Math.max(other, to.clusterID);
          replacements.add(cid1, cid2);
        }
      }
    }
    for (Integer first : replacements.firstKeySet()) {
      for (Integer second : replacements.secondKeySet(first)) {
        acronymCache.put(first, second, true);
      }
    }
  }

  /**
   * Tells whether two mentions have been marked incompatible.
   *
   * @param m1 one mention
   * @param m2 the other mention (argument order does not matter)
   * @return {@code true} if the mention pair is in the incompatible set
   */
  public boolean isIncompatible(Mention m1, Mention m2) {
    // pairs are stored as (smaller ID, larger ID)
    boolean firstIsSmaller = m1.mentionID <= m2.mentionID;
    int lowId = firstIsSmaller ? m1.mentionID : m2.mentionID;
    int highId = firstIsSmaller ? m2.mentionID : m1.mentionID;
    return incompatibles.contains(lowId, highId);
  }

  /**
   * Marks two mentions, and the clusters they currently belong to, as
   * incompatible.  Both pairs are stored as (smaller ID, larger ID).
   *
   * @param m1 one mention
   * @param m2 the other mention
   */
  public void addIncompatible(Mention m1, Mention m2) {
    int lowMention = Math.min(m1.mentionID, m2.mentionID);
    int highMention = Math.max(m1.mentionID, m2.mentionID);
    int lowCluster = Math.min(m1.corefClusterID, m2.corefClusterID);
    int highCluster = Math.max(m1.corefClusterID, m2.corefClusterID);

    incompatibles.add(lowMention, highMention);
    incompatibleClusters.add(lowCluster, highCluster);
  }

  /**
   * Marks twin mentions in gold and predicted mentions.
   *
   * @param strict {@code true} to require matching mention boundaries,
   *               {@code false} to also accept matching heads
   */
  protected void findTwinMentions(boolean strict){
    if (!strict) {
      findTwinMentionsRelaxed();
      return;
    }
    findTwinMentionsStrict();
  }

  /**
   * Mark twin mentions: All mention boundaries should be matched.
   * <p>
   * Per sentence, each predicted mention whose (startIndex, endIndex) equals
   * that of a not-yet-paired gold mention takes over the gold mention's ID,
   * and both are marked as not twinless.  Predicted mentions left without a
   * twin get 10000 added to their ID.
   */
  private void findTwinMentionsStrict(){
    for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
      List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
      List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);

      // For CoNLL training there are some documents with gold mentions with the same position offsets
      // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
      //  (Packwood - Roth)
      CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<>();
      for (Mention g : golds) {
        IntPair ip = new IntPair(g.startIndex, g.endIndex);
        if (goldMentionPositions.containsKey(ip)) {
          StringBuilder existingMentions = new StringBuilder();
          for (Mention eg : goldMentionPositions.get(ip)) {
            if (existingMentions.length() > 0) {
              existingMentions.append(",");
            }
            existingMentions.append(eg.mentionID);
          }
          SieveCoreferenceSystem.logger.warning("WARNING: gold mentions with the same offsets: " + ip
                  + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString());
        }
        goldMentionPositions.add(ip, g);
      }

      for (Mention p : predicts) {
        IntPair pos = new IntPair(p.startIndex, p.endIndex);
        if (!goldMentionPositions.containsKey(pos)) {
          continue;
        }
        Collection<Mention> cm = goldMentionPositions.get(pos);
        // The key stays in the map after its last gold mention was consumed,
        // so a further predicted mention with the same span finds an empty
        // collection here; it simply remains twinless.
        if (cm.isEmpty()) {
          continue;
        }
        Mention g = cm.iterator().next();
        cm.remove(g);
        p.mentionID = g.mentionID;
        p.twinless = false;
        g.twinless = false;
      }

      // temp: for making easy to recognize twinless mention
      for (Mention p : predicts) {
        if (p.twinless) {
          p.mentionID += 10000;
        }
      }
    }
  }

  /**
   * Mark twin mentions: heads of the mentions are matched.
   * <p>
   * Per sentence, predicted mentions are first paired with a gold mention of
   * identical (startIndex, endIndex); the remaining predicted mentions are
   * then paired with a still-unpaired gold mention that has the same head
   * index.  A paired predicted mention takes over the gold mention's ID.
   */
  private void findTwinMentionsRelaxed() {
    for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
      List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
      List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);

      Map<IntPair, Mention> goldMentionPositions = Generics.newHashMap();
      Map<Integer, LinkedList<Mention>> goldMentionHeadPositions = Generics.newHashMap();
      for (Mention g : golds) {
        goldMentionPositions.put(new IntPair(g.startIndex, g.endIndex), g);
        if (!goldMentionHeadPositions.containsKey(g.headIndex)) {
          goldMentionHeadPositions.put(g.headIndex, new LinkedList<>());
        }
        goldMentionHeadPositions.get(g.headIndex).add(g);
      }

      // Pass 1: exact span match.
      List<Mention> remains = new ArrayList<>();
      for (Mention p : predicts) {
        IntPair pos = new IntPair(p.startIndex, p.endIndex);
        if (goldMentionPositions.containsKey(pos)) {
          Mention g = goldMentionPositions.get(pos);
          p.mentionID = g.mentionID;
          p.twinless = false;
          g.twinless = false;
          // The head bucket may already be gone when an earlier predicted
          // mention with the same span consumed this gold mention.
          LinkedList<Mention> sameHead = goldMentionHeadPositions.get(g.headIndex);
          if (sameHead != null) {
            sameHead.remove(g);
            if (sameHead.isEmpty()) {
              goldMentionHeadPositions.remove(g.headIndex);
            }
          }
        } else {
          remains.add(p);
        }
      }

      // Pass 2: head match for whatever is left. Buckets are removed as soon
      // as they become empty, so poll() always yields a mention here.
      for (Mention r : remains) {
        LinkedList<Mention> sameHead = goldMentionHeadPositions.get(r.headIndex);
        if (sameHead == null) {
          continue;
        }
        Mention g = sameHead.poll();
        r.mentionID = g.mentionID;
        r.twinless = false;
        g.twinless = false;
        if (sameHead.isEmpty()) {
          goldMentionHeadPositions.remove(g.headIndex);
        }
      }
    }
  }

  /**
   * Set paragraph index.
   * <p>
   * Walks all tokens in document order.  A token that carries a character
   * begin offset starts a new paragraph when that offset is more than 2 past
   * the end offset of the previous such token; tokens without a begin offset
   * get paragraph index -1.  Afterwards each predicted mention takes the
   * paragraph index of its head word, and {@code numParagraph} is set to the
   * last index assigned.
   */
  private void setParagraphAnnotation() {
    int paragraphIndex = 0;
    int previousOffset = -10;  // far enough back that the first token with an offset opens paragraph 1
    for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
        if (w.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
          if (w.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) > previousOffset + 2) {
            paragraphIndex++;
          }
          w.set(CoreAnnotations.ParagraphAnnotation.class, paragraphIndex);
          previousOffset = w.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        } else {
          w.set(CoreAnnotations.ParagraphAnnotation.class, -1);
        }
      }
    }
    for (List<Mention> sentenceMentions : predictedOrderedMentionsBySentence) {
      for (Mention m : sentenceMentions) {
        m.paragraph = m.headWord.get(CoreAnnotations.ParagraphAnnotation.class);
      }
    }
    numParagraph = paragraphIndex;
  }

  /**
   * Find document type: Conversation or article.
   * <p>
   * The document is an article when no token has a non-zero utterance index,
   * or when the utterance index drops back to 0 after having been non-zero.
   * Otherwise it is a conversation.  As a side effect {@code maxUtter} is
   * raised to the largest utterance index seen before the method returns.
   *
   * @param dict supplies the first and second person pronoun lists
   * @return the detected document type, never {@code null}
   */
  private DocType findDocType(Dictionaries dict) {
    boolean speakerChange = false;
    // Utterances containing a first or second person pronoun.
    // NOTE(review): this set is filled but never read within this method.
    Set<Integer> discourseWithIorYou = Generics.newHashSet();

    for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
        int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class);
        if (utterIndex != 0) {
          speakerChange = true;
        }
        if (speakerChange && utterIndex == 0) {
          return DocType.ARTICLE;
        }
        // Locale.ROOT keeps the lookup independent of the JVM default locale
        // (e.g. "I" lower-cases to a dotless i under a Turkish locale).
        String lowerText = w.get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ROOT);
        if (dict.firstPersonPronouns.contains(lowerText)
            || dict.secondPersonPronouns.contains(lowerText)) {
          discourseWithIorYou.add(utterIndex);
        }
        if (maxUtter < utterIndex) {
          maxUtter = utterIndex;
        }
      }
    }
    if (!speakerChange) {
      return DocType.ARTICLE;
    }
    return DocType.CONVERSATION;  // in conversation, utter index keep increasing.
  }

  /**
   * When there is no mentionID information (without gold annotation), assign mention IDs.
   * <p>
   * If at least one predicted mention has ID -1, all predicted mentions are
   * renumbered 0, 1, 2, ... in sentence order and then mention order.
   * Otherwise the existing IDs are kept.
   */
  protected void assignOriginalID(){
    List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions();

    boolean hasOriginalID = true;
    for (List<Mention> sentenceMentions : orderedMentionsBySentence) {
      for (Mention m : sentenceMentions) {
        if (m.mentionID == -1) {
          hasOriginalID = false;
          break;  // one missing ID is enough to trigger renumbering
        }
      }
      if (!hasOriginalID) {
        break;
      }
    }

    if (!hasOriginalID) {
      int id = 0;
      for (List<Mention> sentenceMentions : orderedMentionsBySentence) {
        for (Mention m : sentenceMentions) {
          m.mentionID = id++;
        }
      }
    }
  }

  /**
   * Extract gold coref cluster information.
   * <p>
   * Rebuilds {@code goldCorefClusters} from scratch by grouping every gold
   * mention under its {@code goldCorefClusterID}.
   *
   * @throws RuntimeException if a gold mention has {@code goldCorefClusterID == -1}
   */
  public void extractGoldCorefClusters(){
    goldCorefClusters = Generics.newHashMap();
    for (List<Mention> mentions : goldOrderedMentionsBySentence) {
      for (Mention m : mentions) {
        int id = m.goldCorefClusterID;
        if (id == -1) {
          throw new RuntimeException("No gold info");
        }
        CorefCluster c = goldCorefClusters.get(id);
        if (c == null) {
          c = new CorefCluster(id);
          goldCorefClusters.put(id, c);
        }
        c.corefMentions.add(m);
      }
    }
  }

  protected List> getGoldLinks() {
    if(goldLinks==null) this.extractGoldLinks();
    return goldLinks;
  }

  /**
   * Extract gold coref link information.
   * <p>
   * Each link is a pair (source position, antecedent position), where a
   * position is (sentence index, order of the gold mention in the sentence).
   * Links follow each mention's {@code originalRef}; cataphoric references
   * (an antecedent positioned after the mention) are re-pointed by swapping
   * {@code originalRef} values, which modifies the gold mentions.  Links are
   * also added to the antecedents of the antecedent.  The result is stored
   * in {@code goldLinks}.
   *
   * @throws RuntimeException if a mention's initial {@code originalRef} names no gold mention
   */
  protected void extractGoldLinks() {
    List<Pair<IntTuple, IntTuple>> links = new ArrayList<>();

    // position of each mention in the input matrix, by id
    Map<Integer, IntTuple> positions = Generics.newHashMap();
    // positions of antecedents
    Map<Integer, List<IntTuple>> antecedents = Generics.newHashMap();
    for (int i = 0; i < goldOrderedMentionsBySentence.size(); i++) {
      for (int j = 0; j < goldOrderedMentionsBySentence.get(i).size(); j++) {
        Mention m = goldOrderedMentionsBySentence.get(i).get(j);
        int id = m.mentionID;
        IntTuple pos = new IntTuple(2);
        pos.set(0, i);
        pos.set(1, j);
        positions.put(id, pos);
        antecedents.put(id, new ArrayList<>());
      }
    }

    for (List<Mention> mentions : goldOrderedMentionsBySentence) {
      for (Mention m : mentions) {
        int id = m.mentionID;
        IntTuple src = positions.get(id);

        assert (src != null);
        if (m.originalRef >= 0) {
          IntTuple dst = positions.get(m.originalRef);
          if (dst == null) {
            throw new RuntimeException("Cannot find gold mention with ID=" + m.originalRef);
          }

          // to deal with cataphoric annotation
          while (dst.get(0) > src.get(0) || (dst.get(0) == src.get(0) && dst.get(1) > src.get(1))) {
            Mention dstMention = goldOrderedMentionsBySentence.get(dst.get(0)).get(dst.get(1));
            m.originalRef = dstMention.originalRef;
            dstMention.originalRef = id;

            if (m.originalRef < 0) break;
            dst = positions.get(m.originalRef);
          }
          if (m.originalRef < 0) continue;

          // A B C: if A<-B, A<-C => make a link B<-C
          for (int k = dst.get(0); k <= src.get(0); k++) {
            for (int col = 0; col < goldOrderedMentionsBySentence.get(k).size(); col++) {
              if (k == dst.get(0) && col < dst.get(1)) continue;  // before the antecedent
              if (k == src.get(0) && col > src.get(1)) break;     // past the source mention
              IntTuple missed = new IntTuple(2);
              missed.set(0, k);
              missed.set(1, col);
              if (links.contains(new Pair<>(missed, dst))) {
                antecedents.get(id).add(missed);
                links.add(new Pair<>(src, missed));
              }
            }
          }

          links.add(new Pair<>(src, dst));

          assert (antecedents.get(id) != null);
          antecedents.get(id).add(dst);

          // inherit every antecedent of the antecedent
          List<IntTuple> ants = antecedents.get(m.originalRef);
          assert (ants != null);
          for (IntTuple ant : ants) {
            antecedents.get(id).add(ant);
            links.add(new Pair<>(src, ant));
          }
        }
      }
    }
    goldLinks = links;
  }

  /**
   * set UtteranceAnnotation for quotations: default UtteranceAnnotation = 0 is given.
   * <p>
   * Each opening quotation mark increments {@code maxUtter}; the tokens that
   * follow, up to the closing mark, get that value as their utterance index.
   * The quotation mark tokens themselves are not given an utterance index
   * here.  Tokens other than opening marks whose speaker is missing, empty,
   * or starts with "PER" get the speaker "PER" followed by their utterance
   * index.  If no quotation was found using {@code ``}/{@code ''} only, the
   * method runs once more treating {@code "} as a quotation mark too.
   *
   * @param results              the sentences whose tokens are annotated in place
   * @param normalQuotationType  whether a plain {@code "} opens and closes quotations
   */
  private void markQuotations(List<CoreMap> results, boolean normalQuotationType) {
    boolean insideQuotation = false;
    for (CoreMap m : results) {
      for (CoreLabel l : m.get(CoreAnnotations.TokensAnnotation.class)) {
        String w = l.get(CoreAnnotations.TextAnnotation.class);

        boolean noSpeakerInfo = !l.containsKey(CoreAnnotations.SpeakerAnnotation.class)
            || l.get(CoreAnnotations.SpeakerAnnotation.class).equals("")
            || l.get(CoreAnnotations.SpeakerAnnotation.class).startsWith("PER");

        if (w.equals("``")
            || (!insideQuotation && normalQuotationType && w.equals("\""))) {
          insideQuotation = true;
          maxUtter++;
          continue;  // the opening mark itself is left untouched
        } else if (w.equals("''")
            || (insideQuotation && normalQuotationType && w.equals("\""))) {
          insideQuotation = false;
        }
        if (insideQuotation) {
          l.set(CoreAnnotations.UtteranceAnnotation.class, maxUtter);
        }
        if (noSpeakerInfo) {
          l.set(CoreAnnotations.SpeakerAnnotation.class, "PER" + l.get(CoreAnnotations.UtteranceAnnotation.class));
        }
      }
    }
    // nothing found with `` and '': retry once with plain double quotes
    if (maxUtter == 0 && !normalQuotationType) {
      markQuotations(results, true);
    }
  }

  /**
   * Speaker extraction.
   * <p>
   * When gold speaker tags or marked discourse are in use, the speaker of
   * each utterance is read straight from the token annotations.  Otherwise
   * speakers are detected according to the document type and then written
   * back onto the tokens of every utterance with a known speaker.
   *
   * @param dict dictionaries passed on to the speaker detectors
   */
  private void findSpeakers(Dictionaries dict) {
    Boolean markedFlag = annotation.get(CoreAnnotations.UseMarkedDiscourseAnnotation.class);
    boolean useMarkedDiscourse = (markedFlag != null) && markedFlag;

    if (Constants.USE_GOLD_SPEAKER_TAGS || useMarkedDiscourse) {
      // trust the annotation: utterance index -> annotated speaker
      for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        for (CoreLabel token : sent.get(CoreAnnotations.TokensAnnotation.class)) {
          int utterIndex = token.get(CoreAnnotations.UtteranceAnnotation.class);
          speakers.put(utterIndex, token.get(CoreAnnotations.SpeakerAnnotation.class));
        }
      }
      return;
    }

    if (docType == DocType.CONVERSATION) {
      findSpeakersInConversation(dict);
    } else if (docType == DocType.ARTICLE) {
      findSpeakersInArticle(dict);
    }

    // set speaker info to annotation
    for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel token : sent.get(CoreAnnotations.TokensAnnotation.class)) {
        int utterIndex = token.get(CoreAnnotations.UtteranceAnnotation.class);
        if (!speakers.containsKey(utterIndex)) {
          continue;
        }
        token.set(CoreAnnotations.SpeakerAnnotation.class, speakers.get(utterIndex));
      }
    }
  }
  /**
   * Finds speakers of quotations in an article.
   * <p>
   * Scans all tokens for runs with a non-zero utterance index.  A run starts
   * at the first such token and ends at the next token whose utterance index
   * is 0; for every completed run {@link #findQuotationSpeaker} is invoked.
   * A run that is still open at the end of the document is not processed.
   *
   * @param dict dictionaries passed on to the speaker search
   */
  private void findSpeakersInArticle(Dictionaries dict) {
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    // (sentence index, token index) of the first quoted token / first token after the quotation
    Pair<Integer, Integer> beginQuotation = new Pair<>();
    Pair<Integer, Integer> endQuotation = new Pair<>();
    boolean insideQuotation = false;
    int utterNum = -1;

    for (int i = 0; i < sentences.size(); i++) {
      List<CoreLabel> sent = sentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
      for (int j = 0; j < sent.size(); j++) {
        int utterIndex = sent.get(j).get(CoreAnnotations.UtteranceAnnotation.class);

        if (utterIndex != 0 && !insideQuotation) {
          utterNum = utterIndex;
          insideQuotation = true;
          beginQuotation.setFirst(i);
          beginQuotation.setSecond(j);
        } else if (utterIndex == 0 && insideQuotation) {
          insideQuotation = false;
          endQuotation.setFirst(i);
          endQuotation.setSecond(j);
          findQuotationSpeaker(utterNum, sentences, beginQuotation, endQuotation, dict);
        }
      }
    }
  }

  /**
   * Looks for the speaker of one quotation, trying in order: the text before
   * the quotation in the sentence where it starts, the text from the end
   * position to the end of that sentence, the whole previous sentence (only
   * when the quotation starts at token 0 or 1), and the whole next sentence
   * (only when the end position is the last token of its sentence).  The
   * search stops at the first location where {@link #findSpeaker} succeeds.
   *
   * @param utterNum       utterance index of the quotation
   * @param sentences      all sentences of the document
   * @param beginQuotation (sentence index, token index) where the quotation starts
   * @param endQuotation   (sentence index, token index) of the first token after the quotation
   * @param dict           dictionaries passed on to {@link #findSpeaker}
   */
  private void findQuotationSpeaker(int utterNum, List<CoreMap> sentences,
      Pair<Integer, Integer> beginQuotation, Pair<Integer, Integer> endQuotation, Dictionaries dict) {

    int beginSent = beginQuotation.first();
    int beginToken = beginQuotation.second();
    int endSent = endQuotation.first();
    int endToken = endQuotation.second();
    int endSentLength = sentences.get(endSent).get(CoreAnnotations.TokensAnnotation.class).size();

    if (findSpeaker(utterNum, beginSent, sentences, 0, beginToken, dict)) {
      return;
    }

    if (findSpeaker(utterNum, endSent, sentences, endToken, endSentLength, dict)) {
      return;
    }

    if (beginToken <= 1 && beginSent > 0) {
      int previousLength = sentences.get(beginSent - 1).get(CoreAnnotations.TokensAnnotation.class).size();
      if (findSpeaker(utterNum, beginSent - 1, sentences, 0, previousLength, dict)) {
        return;
      }
    }

    // Compare against the number of tokens in the sentence. Calling size()
    // on the sentence CoreMap itself counts its annotation keys, not tokens.
    if (endToken == endSentLength - 1 && sentences.size() > endSent + 1) {
      int nextLength = sentences.get(endSent + 1).get(CoreAnnotations.TokensAnnotation.class).size();
      findSpeaker(utterNum, endSent + 1, sentences, 0, nextLength, dict);
    }
  }

  /**
   * Searches a token range of one sentence for a reporting verb outside any
   * quotation and, if that verb has an {@code nsubj} child in the enhanced
   * dependency graph, records the subject as the speaker of the utterance.
   * The speaker is stored as the mention ID (as a string) when a mention is
   * headed at the subject token, otherwise as the subject word itself.
   *
   * @param utterNum   utterance index the speaker is recorded for
   * @param sentNum    index of the sentence to search
   * @param sentences  all sentences of the document
   * @param startIndex first token index to inspect (inclusive)
   * @param endIndex   token index at which to stop (exclusive)
   * @param dict       supplies the list of reporting verbs
   * @return {@code true} if a speaker was recorded
   */
  private boolean findSpeaker(int utterNum, int sentNum, List<CoreMap> sentences,
      int startIndex, int endIndex, Dictionaries dict) {
    List<CoreLabel> sent = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class);
    for (int i = startIndex; i < endIndex; i++) {
      // skip tokens that are themselves inside a quotation
      if (sent.get(i).get(CoreAnnotations.UtteranceAnnotation.class) != 0) continue;
      String lemma = sent.get(i).get(CoreAnnotations.LemmaAnnotation.class);
      String word = sent.get(i).get(CoreAnnotations.TextAnnotation.class);
      if (dict.reportVerb.contains(lemma)) {
        // find subject
        SemanticGraph dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
        // NOTE(review): the node is looked up by word text rather than by
        // token index i -- confirm which node is returned when the same word
        // occurs more than once in the sentence.
        IndexedWord w = dependency.getNodeByWordPattern(word);

        if (w != null) {
          for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(w)) {
            if (child.first().getShortName().equals("nsubj")) {
              String subjectString = child.second().word();
              int subjectIndex = child.second().index();  // start from 1
              IntTuple headPosition = new IntTuple(2);
              headPosition.set(0, sentNum);
              headPosition.set(1, subjectIndex - 1);
              String speaker;
              if (mentionheadPositions.containsKey(headPosition)) {
                speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
              } else {
                speaker = subjectString;
              }
              speakers.put(utterNum, speaker);
              return true;
            }
          }
        } else {
          SieveCoreferenceSystem.logger.warning("Cannot find node in dependency for word " + word);
        }
      }
    }
    return false;
  }

  /**
   * Finds speakers in a conversation.
   * <p>
   * First, any mention that has "I" as a predicate nominative is recorded as
   * the speaker of its own utterance.  Then consecutive sentences whose first
   * token shares an utterance index are grouped into a paragraph, and
   * {@link #findParagraphSpeaker} is applied to each paragraph in order,
   * carrying over the speaker it announces for the following paragraph.
   *
   * @param dict dictionaries passed on to the paragraph speaker search
   */
  private void findSpeakersInConversation(Dictionaries dict) {
    for (List<Mention> sentenceMentions : predictedOrderedMentionsBySentence) {
      for (Mention m : sentenceMentions) {
        if (m.predicateNominatives == null) continue;
        for (Mention a : m.predicateNominatives) {
          // Locale.ROOT: lower-casing "I" must not depend on the default locale
          if (a.spanToString().toLowerCase(Locale.ROOT).equals("i")) {
            speakers.put(m.headWord.get(CoreAnnotations.UtteranceAnnotation.class), Integer.toString(m.mentionID));
          }
        }
      }
    }

    List<CoreMap> paragraph = new ArrayList<>();
    int paragraphUtterIndex = 0;
    String nextParagraphSpeaker = "";
    int paragraphOffset = 0;  // number of sentences in all previous paragraphs
    for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      int currentUtter = sent.get(CoreAnnotations.TokensAnnotation.class).get(0).get(CoreAnnotations.UtteranceAnnotation.class);
      if (paragraphUtterIndex != currentUtter) {
        // The paragraph is still empty when the very first sentence already
        // has a non-zero utterance index; there is nothing to analyse then.
        if (!paragraph.isEmpty()) {
          nextParagraphSpeaker = findParagraphSpeaker(paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
        }
        paragraphUtterIndex = currentUtter;
        paragraphOffset += paragraph.size();
        paragraph = new ArrayList<>();
      }
      paragraph.add(sent);
    }
    // last paragraph (absent only when the document has no sentences)
    if (!paragraph.isEmpty()) {
      findParagraphSpeaker(paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
    }
  }

  /**
   * Determines the speaker of one paragraph, unless one is already known.
   * <p>
   * If the previous paragraph announced a speaker, that speaker is used.
   * Otherwise the last sentence of the paragraph is inspected: when it has
   * no verb before the end of the scan and contains a person-tagged token
   * that heads a mention, that mention becomes the speaker
   * (e.g. "John, nbc news").
   *
   * @param paragraph            sentences of the paragraph
   * @param paragraphUtterIndex  utterance index of the paragraph
   * @param nextParagraphSpeaker speaker announced by the previous paragraph, or ""
   * @param paragraphOffset      number of sentences before this paragraph
   * @param dict                 dictionaries passed on to {@link #findNextParagraphSpeaker}
   * @return the speaker this paragraph announces for the next one, or "" if
   *         none (also "" for an empty paragraph)
   */
  private String findParagraphSpeaker(List<CoreMap> paragraph,
      int paragraphUtterIndex, String nextParagraphSpeaker, int paragraphOffset, Dictionaries dict) {
    if (paragraph.isEmpty()) {
      // no last sentence to look at
      return "";
    }
    if (!speakers.containsKey(paragraphUtterIndex)) {
      if (!nextParagraphSpeaker.equals("")) {
        speakers.put(paragraphUtterIndex, nextParagraphSpeaker);
      } else {  // find the speaker of this paragraph (John, nbc news)
        int lastSentInParagraph = paragraph.size() - 1;
        List<CoreLabel> tokens = paragraph.get(lastSentInParagraph).get(CoreAnnotations.TokensAnnotation.class);
        String speaker = "";
        boolean hasVerb = false;
        for (int i = 0; i < tokens.size(); i++) {
          CoreLabel w = tokens.get(i);
          String pos = w.get(CoreAnnotations.PartOfSpeechAnnotation.class);
          String ner = w.get(CoreAnnotations.NamedEntityTagAnnotation.class);
          // tokens without a POS or NER tag are treated as non-matching
          if (pos != null && pos.startsWith("V")) {
            hasVerb = true;
            break;
          }
          if (ner != null && ner.startsWith("PER")) {
            IntTuple headPosition = new IntTuple(2);
            headPosition.set(0, lastSentInParagraph + paragraphOffset);
            headPosition.set(1, i);
            if (mentionheadPositions.containsKey(headPosition)) {
              speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
            }
          }
        }
        if (!hasVerb && !speaker.equals("")) {
          speakers.put(paragraphUtterIndex, speaker);
        }
      }
    }
    return findNextParagraphSpeaker(paragraph, paragraphOffset, dict);
  }

  /**
   * Looks in the last sentence of a paragraph for a token with lemma "report"
   * or "say" whose {@code nsubj} child heads a mention with a person NER
   * string; that mention is taken as the speaker of the following paragraph.
   * When several candidates qualify, the last one found wins.
   *
   * @param paragraph       sentences of the paragraph
   * @param paragraphOffset number of sentences before this paragraph
   * @param dict            not used by this method
   * @return the mention ID of the speaker as a string, or "" if none was found
   */
  private String findNextParagraphSpeaker(List<CoreMap> paragraph, int paragraphOffset, Dictionaries dict) {
    String speaker = "";
    if (paragraph.isEmpty()) {
      return speaker;
    }
    int lastSentInParagraph = paragraph.size() - 1;
    CoreMap lastSent = paragraph.get(lastSentInParagraph);
    for (CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) {
      String lemma = w.get(CoreAnnotations.LemmaAnnotation.class);
      // constant-first comparison tolerates tokens without a lemma
      if (!"report".equals(lemma) && !"say".equals(lemma)) {
        continue;
      }
      String word = w.get(CoreAnnotations.TextAnnotation.class);
      SemanticGraph dependency = lastSent.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
      IndexedWord t = dependency.getNodeByWordPattern(word);
      if (t == null) {
        // same handling as findSpeaker: a word missing from the graph is skipped
        SieveCoreferenceSystem.logger.warning("Cannot find node in dependency for word " + word);
        continue;
      }

      for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(t)) {
        if (child.first().getShortName().equals("nsubj")) {
          int subjectIndex = child.second().index();  // start from 1
          IntTuple headPosition = new IntTuple(2);
          headPosition.set(0, lastSentInParagraph + paragraphOffset);
          headPosition.set(1, subjectIndex - 1);
          if (mentionheadPositions.containsKey(headPosition)
              && mentionheadPositions.get(headPosition).nerString.startsWith("PER")) {
            speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
          }
        }
      }
    }
    return speaker;
  }

  /**
   * Returns the speaker info registered for a speaker name/id.
   *
   * @param speaker speaker name or id
   * @return the registered info, or {@code null} if the speaker is unknown
   */
  public SpeakerInfo getSpeakerInfo(String speaker) {
    // speakerInfoMap is transient: after deserialization it is null because
    // field initializers are not re-run, so treat that as "no speakers".
    if (speakerInfoMap == null) {
      return null;
    }
    return speakerInfoMap.get(speaker);
  }

  /**
   * Returns how many distinct speakers have been registered.
   *
   * @return the number of entries in the speaker info map
   */
  public int numberOfSpeakers() {
    // speakerInfoMap is transient: after deserialization it is null because
    // field initializers are not re-run, so report zero speakers then.
    return (speakerInfoMap == null) ? 0 : speakerInfoMap.size();
  }

  /**
   * Check one mention is the speaker of the other mention.
   * <p>
   * Holds when {@code ant} is a non-plural first person pronoun in the same
   * sentence as {@code m}, exactly one quotation mark token ({@code ``} or
   * {@code ''}) lies strictly between the two heads, and the head of
   * {@code m} is the {@code nsubj} of a reporting verb.
   *
   * @param m    candidate speaker mention
   * @param ant  candidate first person pronoun
   * @param dict supplies the first person pronouns and reporting verbs
   * @return {@code true} if {@code m} is judged to be the speaker of {@code ant}
   */
  public static boolean isSpeaker(Mention m, Mention ant, Dictionaries dict) {

    // Locale.ROOT: lower-casing "I" must not depend on the default locale
    if (!dict.firstPersonPronouns.contains(ant.spanToString().toLowerCase(Locale.ROOT))
        || ant.number == Number.PLURAL || ant.sentNum != m.sentNum) {
      return false;
    }

    // count quotation marks strictly between the two heads
    int countQuotationMark = 0;
    int from = Math.min(m.headIndex, ant.headIndex) + 1;
    int to = Math.max(m.headIndex, ant.headIndex);
    for (int i = from; i < to; i++) {
      String word = m.sentenceWords.get(i).get(CoreAnnotations.TextAnnotation.class);
      if (word.equals("``") || word.equals("''")) {
        countQuotationMark++;
      }
    }
    if (countQuotationMark != 1) {
      return false;
    }

    IndexedWord w = m.dependency.getNodeByWordPattern(m.sentenceWords.get(m.headIndex).get(CoreAnnotations.TextAnnotation.class));
    if (w == null) {
      return false;
    }

    for (Pair<GrammaticalRelation, IndexedWord> parent : m.dependency.parentPairs(w)) {
      if (parent.first().getShortName().equals("nsubj")
          && dict.reportVerb.contains(parent.second().get(CoreAnnotations.LemmaAnnotation.class))) {
        return true;
      }
    }
    return false;
  }

  /**
   * Logs, at level FINE, how many gold mentions were paired with a predicted
   * mention (i.e. are not twinless) out of all gold mentions.
   */
  protected void printMentionDetection() {
    int foundGoldCount = 0;
    for (Mention gold : allGoldMentions.values()) {
      foundGoldCount += gold.twinless ? 0 : 1;
    }
    String summary = "# of found gold mentions: " + foundGoldCount
        + " / # of gold mentions: " + allGoldMentions.size();
    SieveCoreferenceSystem.logger.fine(summary);
    SieveCoreferenceSystem.logger.fine("gold mentions == ");
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy