edu.stanford.nlp.coref.data.Document Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
//
// StanfordCoreNLP -- a suite of NLP tools
// Copyright (c) 2009-2010 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//

package edu.stanford.nlp.coref.data;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import edu.stanford.nlp.coref.docreader.CoNLLDocumentReader;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.IntTuple;
import edu.stanford.nlp.util.Pair;

public class Document implements Serializable {

  private static final long serialVersionUID = -4139866807494603953L;

  public enum DocType { CONVERSATION, ARTICLE }

  /** The type of document: conversational or article */
  public DocType docType;

  /** Document annotation */
  public Annotation annotation;

  /** for conll shared task 2011  */
  public CoNLLDocumentReader.CoNLLDocument conllDoc;

  /** The list of gold mentions */
  public List> goldMentions;
  /** The list of predicted mentions */
  public List> predictedMentions;

  /** return the list of predicted mentions */
  public List> getOrderedMentions() {
    return predictedMentions;
  }

  /** Clusters for coreferent mentions */
  public Map corefClusters;

  /** Gold Clusters for coreferent mentions */
  public Map goldCorefClusters;

  /** All mentions in a document {@literal mentionID -> mention} */
  public Map predictedMentionsByID;
  public Map goldMentionsByID;

  /** Set of roles (in role apposition) in a document  */
  public Set roleSet;

  /**
   * Position of each mention in the input matrix
   * Each mention occurrence with sentence # and position within sentence
   * (Nth mention, not Nth token)
   */
  public Map positions;              // mentions may be removed from this due to post processing
  public Map allPositions;           // all mentions (mentions will not be removed from this)

  public final Map mentionheadPositions;

  /** List of gold links in a document by positions */
  private List> goldLinks;

  /** UtteranceAnnotation {@literal ->} String (speaker): mention ID or speaker string
   *   e.g., the value can be "34" (mentionID), "Larry" (speaker string), or "PER3" (autoassigned speaker string)
   */
  public Map speakers;

  /** Pair of mention id, and the mention's speaker id
   *  the second value is the "speaker mention"'s id.
   *  e.g., Larry said, "San Francisco is a city.": (id(Larry), id(San Francisco))
   */
  public Set> speakerPairs;

  public boolean speakerInfoGiven;

  public int maxUtter;
  public int numParagraph;
  public int numSentences;

  /** Set of incompatible clusters pairs */
  private final Set> incompatibles;
  private final Set> incompatibleClusters;

  public Map, Boolean> acronymCache;

  /** Map of speaker name/id to speaker info
   *  the key is the value of the variable 'speakers'
   */
  public Map speakerInfoMap = Generics.newHashMap();

  // public Counter properNouns = new ClassicCounter<>();
  // public Counter phraseCounter = new ClassicCounter<>();
  // public Counter headwordCounter = new ClassicCounter<>();

  /** Additional information about the document. Can be used as features */
  public Map docInfo;

  public Document() {
    positions = Generics.newHashMap();
    mentionheadPositions = Generics.newHashMap();
    roleSet = Generics.newHashSet();
    corefClusters = Generics.newHashMap();
    goldCorefClusters = null;
    predictedMentionsByID = Generics.newHashMap();
//    goldMentionsByID = Generics.newHashMap();
    speakers = Generics.newHashMap();
    speakerPairs = Generics.newHashSet();
    incompatibles = Generics.newHashSet();
    incompatibleClusters = Generics.newHashSet();
    acronymCache = Generics.newHashMap();
  }

  public Document(Annotation anno, List> predictedMentions, List> goldMentions) {
    this();
    annotation = anno;
    this.predictedMentions = predictedMentions;
    this.goldMentions = goldMentions;
  }

  public Document(InputDoc input, List> mentions) {
    this();
    this.annotation = input.annotation;
    this.predictedMentions = mentions;
    this.goldMentions = input.goldMentions;
    this.docInfo = input.docInfo;
    this.numSentences = input.annotation.get(SentencesAnnotation.class).size();
    this.conllDoc = input.conllDoc;   // null if it's not conll input
  }

  public boolean isIncompatible(CorefCluster c1, CorefCluster c2) {
    // Was any of the pairs of mentions marked as incompatible
    int cid1 = Math.min(c1.clusterID, c2.clusterID);
    int cid2 = Math.max(c1.clusterID, c2.clusterID);
    return incompatibleClusters.contains(Pair.makePair(cid1,cid2));
  }

  // Update incompatibles for two clusters that are about to be merged
  public void mergeIncompatibles(CorefCluster to, CorefCluster from) {
    List, Pair>> replacements =
            new ArrayList<>();
    for (Pair p:incompatibleClusters) {
      Integer other = null;
      if (p.first == from.clusterID) {
        other = p.second;
      } else if (p.second == from.clusterID) {
        other = p.first;
      }
      if (other != null && other != to.clusterID) {
        int cid1 = Math.min(other, to.clusterID);
        int cid2 = Math.max(other, to.clusterID);
        replacements.add(Pair.makePair(p, Pair.makePair(cid1, cid2)));
      }
    }
    for (Pair, Pair> r:replacements)  {
      incompatibleClusters.remove(r.first);
      incompatibleClusters.add(r.second);
    }
  }
  public void mergeAcronymCache(CorefCluster to, CorefCluster from) {
    Map, Boolean> replacements = Generics.newHashMap();
    for(Pair p : acronymCache.keySet()) {
      if(acronymCache.get(p)) {
        Integer other = null;
        if(p.first==from.clusterID){
          other = p.second;
        } else if(p.second==from.clusterID) {
          other = p.first;
        }
        if(other != null && other != to.clusterID) {
          int cid1 = Math.min(other, to.clusterID);
          int cid2 = Math.max(other, to.clusterID);
          replacements.put(Pair.makePair(cid1, cid2), true);
        }
      }
    }
    for(Pair p : replacements.keySet()) {
      acronymCache.put(p, replacements.get(p));
    }
  }

  public boolean isIncompatible(Mention m1, Mention m2) {
    int mid1 = Math.min(m1.mentionID, m2.mentionID);
    int mid2 = Math.max(m1.mentionID, m2.mentionID);
    return incompatibles.contains(Pair.makePair(mid1,mid2));
  }

  public void addIncompatible(Mention m1, Mention m2) {
    int mid1 = Math.min(m1.mentionID, m2.mentionID);
    int mid2 = Math.max(m1.mentionID, m2.mentionID);
    incompatibles.add(Pair.makePair(mid1,mid2));
    int cid1 = Math.min(m1.corefClusterID, m2.corefClusterID);
    int cid2 = Math.max(m1.corefClusterID, m2.corefClusterID);
    incompatibleClusters.add(Pair.makePair(cid1,cid2));
  }

  public List> getGoldLinks() {
    if(goldLinks==null) this.extractGoldLinks();
    return goldLinks;
  }

  /** Extract gold coref link information */
  protected void extractGoldLinks() {
    //    List> orderedMentionsBySentence = this.getOrderedMentions();
    List> links = new ArrayList<>();

    // position of each mention in the input matrix, by id
    Map positions = Generics.newHashMap();
    // positions of antecedents
    Map> antecedents = Generics.newHashMap();
    for(int i = 0; i < goldMentions.size(); i ++){
      for(int j = 0; j < goldMentions.get(i).size(); j ++){
        Mention m = goldMentions.get(i).get(j);
        int id = m.mentionID;
        IntTuple pos = new IntTuple(2);
        pos.set(0, i);
        pos.set(1, j);
        positions.put(id, pos);
        antecedents.put(id, new ArrayList<>());
      }
    }

//    SieveCoreferenceSystem.debugPrintMentions(System.err, "", goldOrderedMentionsBySentence);
    for (List mentions : goldMentions) {
      for (Mention m : mentions) {
        int id = m.mentionID;
        IntTuple src = positions.get(id);

        assert (src != null);
        if (m.originalRef >= 0) {
          IntTuple dst = positions.get(m.originalRef);
          if (dst == null) {
            throw new RuntimeException("Cannot find gold mention with ID=" + m.originalRef);
          }

          // to deal with cataphoric annotation
          while (dst.get(0) > src.get(0) || (dst.get(0) == src.get(0) && dst.get(1) > src.get(1))) {
            Mention dstMention = goldMentions.get(dst.get(0)).get(dst.get(1));
            m.originalRef = dstMention.originalRef;
            dstMention.originalRef = id;

            if (m.originalRef < 0) break;
            dst = positions.get(m.originalRef);
          }
          if (m.originalRef < 0) continue;

          // A B C: if A<-B, A<-C => make a link B<-C
          for (int k = dst.get(0); k <= src.get(0); k++) {
            for (int l = 0; l < goldMentions.get(k).size(); l++) {
              if (k == dst.get(0) && l < dst.get(1)) continue;
              if (k == src.get(0) && l > src.get(1)) break;
              IntTuple missed = new IntTuple(2);
              missed.set(0, k);
              missed.set(1, l);
              if (links.contains(new Pair<>(missed, dst))) {
                antecedents.get(id).add(missed);
                links.add(new Pair<>(src, missed));
              }
            }
          }

          links.add(new Pair<>(src, dst));

          assert (antecedents.get(id) != null);
          antecedents.get(id).add(dst);

          List ants = antecedents.get(m.originalRef);
          assert (ants != null);
          for (IntTuple ant : ants) {
            antecedents.get(id).add(ant);
            links.add(new Pair<>(src, ant));
          }
        }
      }
    }
    goldLinks = links;
  }

  public SpeakerInfo getSpeakerInfo(String speaker) {
    return speakerInfoMap.get(speaker);
  }

  public int numberOfSpeakers() {
    return speakerInfoMap.size();
  }

  public boolean isCoref(Mention m1, Mention m2) {
    return this.goldMentionsByID.containsKey(m1.mentionID)
        && this.goldMentionsByID.containsKey(m2.mentionID)
        && this.goldMentionsByID.get(m1.mentionID).goldCorefClusterID == this.goldMentionsByID.get(m2.mentionID).goldCorefClusterID;
  }

}