org.clulab.reach.darpa.DarpaLinks.scala Maven / Gradle / Ivy
The newest version!
package org.clulab.reach.darpa
import org.clulab.coref.CorefUtils._
import org.clulab.coref.{AntecedentSelector, LinearSelector, Links}
import org.clulab.odin._
import org.clulab.reach.ReachConstants._
import org.clulab.reach.mentions._
import org.clulab.reach.utils.DependencyUtils._
class DarpaLinks extends Links {
val debug: Boolean = false
val verbose: Boolean = debug
val defaultSelector: AntecedentSelector = new LinearSelector
/**
* Link a mention to the closest prior mention with exactly the same string, excluding generic mentions (e.g. 'it'
* won't match with 'it').
*
* @param mentions All mentions that might be linkable
* @param selector Rule for selecting the best antecedent from candidates
* @return The same mentions but with new links (antecedents) added
*/
def exactStringMatch(mentions: Seq[CorefMention], selector: AntecedentSelector = defaultSelector): Seq[CorefMention] = {
if (mentions.isEmpty) return Nil
if (debug) println("\n=====Exact entity string matching=====")
val sameText = mentions
.filter(x => x.isInstanceOf[CorefTextBoundMention] &&
!x.asInstanceOf[CorefTextBoundMention].isGeneric &&
!x.asInstanceOf[CorefTextBoundMention].hasGenericMutation)
.groupBy(m => m.text.toLowerCase + "(" + m.mutants.map(_.text).mkString("/") + ")")
.filter(_._2.length > 1)
sameText.foreach {
case (ent, ms) =>
ms.foldLeft(Set.empty: Set[CorefMention])((prev, curr) => {
if (curr.antecedents.isEmpty && !curr.isGeneric) {
if (debug) println(s"${curr.text} matches ${prev.map(_.text).mkString(", ")}")
curr.antecedents ++= prev
}
Set(curr)
})
}
mentions
}
/**
* Link a mention to the closest prior mention with exactly the same grounding ID, excluding potentially generic
* mentions (e.g. 'it' won't match with 'it').
*
* @param mentions All mentions that might be linkable
* @param selector Rule for selecting the best antecedent from candidates
* @return The same mentions but with new links (antecedents) added
*/
def groundingMatch(mentions: Seq[CorefMention], selector: AntecedentSelector = defaultSelector): Seq[CorefMention] = {
if (mentions.isEmpty) return Nil
if (debug) println("\n=====Entity grounding matching=====")
// exact grounding
val sameGrounding = mentions
.filter(x => x.isInstanceOf[CorefTextBoundMention] &&
!x.asInstanceOf[CorefTextBoundMention].isGeneric &&
!x.hasGenericMutation)
.filter(x => x.asInstanceOf[CorefTextBoundMention].isGrounded)
.groupBy(m => m.asInstanceOf[CorefTextBoundMention].grounding.get.id)
sameGrounding.foreach {
case (gr, ms) =>
ms.foldLeft(Set.empty: Set[CorefMention])((prev, curr) => {
if (curr.antecedents.isEmpty && curr.nonGeneric) {
if (debug) println(s"${curr.text} matches ${prev.map(_.text).mkString(", ")}")
curr.antecedents ++= prev
}
Set(curr)
})
}
mentions
}
def mutantProteinMatch(mentions: Seq[CorefMention], selector: AntecedentSelector = defaultSelector): Seq[CorefMention] = {
if (mentions.isEmpty) return Nil
if (debug) println("\n=====Mutant protein matching=====")
val tbms = mentions.filter(_.isInstanceOf[CorefTextBoundMention])
val gms = tbms.filter(m => m.hasGenericMutation)
gms.foreach {
case gm =>
if (verbose) println(s"Searching for ${gm.number.toString} antecedents to '${gm.text} ${gm.mutants.find(_.isGeneric).get.text}'")
val cands = tbms.filter { m =>
m.precedes(gm) &&
!m.isGeneric &&
m.isGrounded &&
(m.grounding.get.equals(gm.grounding.get) | gm.isGeneric) &&
(m.mutants.nonEmpty | gm.tags.get.head.takeRight(1) == "$") &&
m.mutants.forall(mut => !mut.isGeneric)
}
if (verbose) println(s"Candidates are '${cands.map(c => c.text + c.mutants.map(_.text).mkString(" ", " ", "")).mkString("', '")}'")
val ants = selector(gm, cands diff Seq(gm), gm.number)
if (debug) ants.foreach { ant =>
println(s"${gm.text} ${gm.mutants.find(mut => mut.isGeneric).get.text} links " +
s"to ${ant.text} ${ant.mutants.map(_.text).mkString("/")}")
}
gm.antecedents ++= ants
gm.sieves += "mutantProteinMatch"
}
mentions
}
/**
* Match two mentions, at least one generic, in which the later mention's head is in the earlier one and the later
* mention's words are a subset of the earlier one.
*
* @param mentions All mentions
* @param selector Rule for selecting the best antecedent from candidates
* @return The same mentions but with new links (antecedents) added.
*/
def strictHeadMatch(mentions: Seq[CorefMention], selector: AntecedentSelector = defaultSelector): Seq[CorefMention] = {
if (mentions.isEmpty) return Nil
val doc = mentions.head.document
if (debug) println("\n=====Strict head matching=====")
// split TBMs from other mentions -- we'll only be working on TBMs
val (tbms, hasArgs) = mentions.partition(m => m.isInstanceOf[CorefTextBoundMention])
// split generic mentions from specific mentions -- we'll only be working on generics
val (gnrc, spcfc) = tbms.partition(_.isGeneric)
for {
g <- gnrc
// save pronominals for later -- only noun phrases are of interest here
if !g.isClosedClass & g.antecedents.isEmpty
} {
// expand the mention so we can find its head
val gExpanded = expand(g)
// get the head of the noun phrase the mention is in
val hd = doc.sentences(g.sentence)
.words(findHeadStrict(gExpanded, doc.sentences(g.sentence)).getOrElse(g.tokenInterval.end - 1)).toLowerCase
if (verbose) println(s"Searching for ${g.number.toString} antecedents to '${g.text}${g.mutants.map(_.text).mkString(" ", " ", "")}' expanded to " +
s"'${doc.sentences(g.sentence).words.slice(gExpanded.start, gExpanded.end).mkString(" ")}' with head '$hd'")
// what other tbms have this same head?
val cands = tbms.filter { m =>
val mExpanded = expand(m)
val wdsExpanded = m.sentenceObj.words.slice(mExpanded.start, mExpanded.end).toSeq
m.precedes(g) &&
!g.isGeneric &&
wdsExpanded.contains(hd) &&
m.labels == g.labels &&
!nested(gExpanded, mExpanded, doc.sentences(g.sentence), doc.sentences(m.sentence))
}
// use the selector to say which of the candidates is best
val ants = selector(g, cands, g.number)
if (debug) ants.foreach { ant =>
println(s"${g.text}${g.mutants.map(_.text).mkString(" ", " ", "")} links to" +
s" ${ant.text}${ant.mutants.map(_.text).mkString(" ", " ", "")}")
}
g.antecedents ++= ants
g.sieves += "strictHeadMatch"
}
mentions
}
/**
* Match two mentions where the latter mention is a closed-class anaphor, matching number
*
* @param mentions All mentions
* @param selector Rule for selecting the best antecedent from candidates
* @return The same mentions but with new links (antecedents) added.
*/
def pronominalMatch(mentions: Seq[CorefMention], selector: AntecedentSelector = defaultSelector): Seq[CorefMention] = {
if (mentions.isEmpty) return Nil
if (debug) println("\n=====Pronominal matching=====")
// separate out TBMs, so we can look only at arguments of events -- others are irrelevant
val (tbms, hasArgs) = mentions.partition(m => m.isInstanceOf[CorefTextBoundMention])
hasArgs.filter(_.antecedents.isEmpty).foreach {
case pronominal if pronominal.arguments.values.flatten.exists(arg => arg.toCorefMention.isClosedClass) => {
// separate the arguments into pronominal and non-pronominal
val proMap = pronominal.arguments.map(args => args._1 -> args._2.partition(arg => arg.toCorefMention.isClosedClass))
// exclude all the arguments of this event, plus this event itself,
// plus all the arguments of any events that have this event as an argument
var excludeThese = Seq(pronominal)
if (verbose) proMap.foreach(kv =>
println(s"${kv._1} has pronominal args (${kv._2._1.map(_.text).mkString(", ")}) and non-pronominals (${kv._2._2.map(_.text).mkString(", ")})"))
// look at each matching generic argument in turn, in textual order
proMap.map(pm => pm._2._1.map(v => (pm._1, v))).flatten.toSeq.sortBy(a => a._2).foreach { kv =>
val (lbl, g) = (kv._1, kv._2.toCorefMention)
if (verbose) println(s"Searching for ${g.number.toString} antecedents to '${g.text}' excluding ${excludeThese.map(_.text).mkString("'", "', '", "'")}")
val gTag = g.tags.get.headOption
if(gTag.isEmpty || gTag.head != "PRP$") {
excludeThese ++= pronominal.arguments.values.flatten.toSeq.map(_.toCorefMention) ++
hasArgs.filter(m => m.arguments.values.flatten.toSeq.contains(pronominal)).flatMap(_.arguments.values).flatten.map(_.toCorefMention)
}
val cands = lbl match {
// controlled and controller can be EventMentions; other argument types must be TextBoundMentions
case "controlled" => mentions.filter { m =>
m.precedes(g) &&
g.sentence - m.sentence < 2 &&
m.getClass == g.getClass &&
!m.isGeneric &&
!excludeThese.contains(m) &&
(m.matches("PossibleController") || m.isInstanceOf[CorefEventMention])
}
case "controller" => mentions.filter { m =>
m.precedes(g) &&
g.sentence - m.sentence < 2 &&
m.getClass == g.getClass &&
!m.isGeneric &&
!excludeThese.contains(m) &&
(m.matches("PossibleController") || m.isInstanceOf[CorefEventMention])
}
case _ => tbms.filter { m =>
m.precedes(g) &&
g.sentence - m.sentence < 2 &&
m.getClass == g.getClass &&
!m.isGeneric &&
!excludeThese.contains(m) &&
m.matches("PossibleController")
}
}
if (verbose) println(s"Candidates are '${cands.map(c => c.text + c.mutants.map(_.text).mkString(" ", " ", "")).mkString("', '")}'")
// apply selector to candidates
val ants = selector(g.asInstanceOf[CorefTextBoundMention], cands, g.toCorefMention.number)
if (verbose) println(s"matched '${ants.map(a => a.text + a.mutants.map(_.text).mkString(" ", " ", "")).mkString(", ")}'")
// We must check for the anaphor mention in the state, because if it's not, we'll get an error upon
// trying to link the two
val gInState = mentions.find(m => g == m)
if (gInState.isDefined) {
// Mark the anaphor as having the selected antecedent
gInState.get.antecedents ++= ants
// Make sure we don't link to the same antecedent for other arguments to this event
excludeThese ++= ants
pronominal.sieves += "pronominalMatch"
}
}
}
case _ => ()
}
mentions
}
/**
* Match two mentions where the latter mention is one of a specific set of generic mentions with a known class, e.g.
* 'this protein' is known to have the label 'Protein'
*
* @param mentions All mentions
* @param selector Rule for selecting the best antecedent from candidates
* @return The same mentions but with new links (antecedents) added.
*/
def nounPhraseMatch(mentions: Seq[CorefMention], selector: AntecedentSelector = defaultSelector): Seq[CorefMention] = {
if (mentions.isEmpty) return Nil
if (debug) println("\n=====Noun phrase matching=====")
// only apply this matcher to arguments to events -- others are irrelevant
val (tbms, hasArgs) = mentions.partition(m => m.isInstanceOf[CorefTextBoundMention])
hasArgs.filter(_.antecedents.isEmpty).foreach {
case np if np.arguments.values.flatten.exists(arg =>
arg.isInstanceOf[TextBoundMention] &&
arg.toCorefMention.isGenericNounPhrase &&
arg.toCorefMention.antecedents.isEmpty) => {
// separate the arguments into generic noun phrases and others
val npMap = np.arguments.map(args =>
args._1 -> args._2.partition(arg =>
arg.isInstanceOf[TextBoundMention] &&
arg.toCorefMention.isGenericNounPhrase &&
arg.toCorefMention.antecedents.isEmpty
)
)
// exclude all the arguments of this event, plus this event itself,
// plus all the arguments of any events that have this event as an argument
var excludeThese = np.arguments.values.flatten.toSeq ++
Seq(np) ++
hasArgs.filter(m => m.arguments.values.flatten.toSeq.contains(np)).flatMap(_.arguments.values).flatten
// look at each matching generic argument in turn, in textual order
// Note: Do *not* turn this into flatMap
npMap.map(npm => npm._2._1.map(v => (npm._1, v))).flatten.toSeq.sortBy(x => x._2).foreach { kv =>
val (lbl, g) = (kv._1, kv._2.toCorefMention)
if (verbose) println(s"Searching for ${g.number.toString} antecedents to '${g.text}${g.mutants.map(_.text).mkString(" ", " ", "")}' " +
s"excluding ${excludeThese.map(_.text).mkString("'", "', '", "'")}")
val cands = lbl match {
// controlled and controller can be EventMentions; other argument types must be TextBoundMentions
case "controlled" => mentions.filter { m =>
m.precedes(g) &&
g.sentence - m.sentence < 2 &&
!m.isGeneric &&
!excludeThese.contains(m) &&
(m.matches("PossibleController") || m.isInstanceOf[CorefEventMention]) &&
compatibleMutants(m, g) &&
g.labels.filter(l => l != "Generic_entity" && l != "Generic_event").forall(x => m.labels.contains(x))
}
case "controller" => mentions.filter { m =>
m.precedes(g) &&
g.sentence - m.sentence < 2 &&
!m.isGeneric &&
!excludeThese.contains(m) &&
(m.matches("PossibleController") || m.isInstanceOf[CorefEventMention]) &&
compatibleMutants(m, g) &&
g.labels.filter(l => l != "Generic_entity" && l != "Generic_event").forall(x => m.labels.contains(x))
}
case _ => tbms.filter { m =>
m.precedes(g) &&
g.sentence - m.sentence < 2 &&
!m.isGeneric &&
!excludeThese.contains(m) &&
m.matches("PossibleController") &&
compatibleMutants(m, g) &&
g.labels.filter(l => l != "Generic_entity" && l != "Generic_event").forall(x => m.labels.contains(x))
}
}
if (verbose) println(s"Candidates are '${cands.map(c => c.text + c.mutants.map(_.text).mkString(" ", " ", "")).mkString("', '")}'")
// apply selector to candidates
val ants = selector(g.asInstanceOf[CorefTextBoundMention], cands, g.toCorefMention.number)
if (verbose) println(s"matched '${ants.map(a => a.text + a.mutants.map(_.text).mkString(" ", " ", "")).mkString(", ")}'")
// We must check for the anaphor mention in the state, because if it's not, we'll get an error upon
// trying to link the two
val gInState = mentions.find(m => g == m)
if (gInState.isDefined) {
// Mark the anaphor as having the selected antecedent
gInState.get.antecedents ++= ants
// Make sure we don't link to the same antecedent for other arguments to this event
excludeThese ++= ants
np.sieves += "nounPhraseMatch"
}
}
}
case _ => ()
}
mentions
}
/**
* Examine complex events with generic simple events as arguments, searching for the best match of the same label,
* e.g. "ASPP1 promotes this phosphorylation." will search for phosphorylations before this sentence.
*
* @param mentions All mentions
* @param selector Rule for selecting the best antecedent from candidates
* @return The same mentions but with new links (antecedents) added.
*/
def simpleEventMatch(mentions: Seq[CorefMention], selector: AntecedentSelector = defaultSelector): Seq[CorefMention] = {
if (mentions.isEmpty) return Nil
if (debug) println("\n=====Simple event matching=====\n")
val seLabels = SIMPLE_EVENTS - "Generic_event"
// We're only looking for generic simple events that are arguments of complex events
val (complex, others) = mentions.partition(m => m matches "ComplexEvent")
val (simplex, _) = others.partition(m => m matches "SimpleEvent")
// We need to have the specific event mentions ready to match our anaphors with
val (_, specifics) = simplex.partition(m => m matches "Generic_event")
// ComplexEvent mentions one by one. Ignore ComplexEvents with no generic SimpleEvent arguments, and arguments that
// are merely triggers of more complete SimpleEvents
complex.filter(_.antecedents.isEmpty).foreach { case cx if cx.arguments.values.flatten.exists(arg => arg.matches("Generic_event") &&
!specifics.filter(_.isInstanceOf[EventMention]).exists(sfc =>
sfc.asInstanceOf[CorefEventMention].trigger == arg.asInstanceOf[CorefEventMention].trigger)) =>
// Create a map just like m.arguments but with the values partitioned into generic and non-generic
val argMap = cx.arguments.map(args => args._1 -> args._2.partition(arg => arg matches "Generic_event"))
// These are the mentions excluded from matching with generic arguments: mentions participating in this event
// already, and mentions participating in some parent event (that has this ComplexEvent as an argument). It's a
// var because we'll be adding found antecedents as we go.
var excludeThese = cx.arguments.values.flatten.toSeq ++
specifics.filter(sfc =>
cx.arguments.values.flatten.toSeq.filter(_.isInstanceOf[EventMention])
.map(_.asInstanceOf[EventMention].trigger).contains(sfc.asInstanceOf[EventMention].trigger))
// Looking just at the generic arguments, since we don't need to find antecedents to full mentions.
// Note: Do *not* turn this into flatMap
argMap.map(arg => arg._2._1.map(v => (arg._1, v))).flatten.toSeq.sortBy(x => x._2).foreach { kv =>
// the label for the argument type (e.g. "theme") and the generic argument we need an antecedent for.
val (lbl, g) = kv
val gInState = mentions.find(m => g == m)
// first check if g already has had an antecedent found during a different ComplexEvent's search
if (gInState.isDefined && gInState.get.antecedents.isEmpty) {
if (verbose) println(s"Searching for antecedents to '${g.text}' excluding ${excludeThese.map(_.text).mkString("'", "', '", "'")}")
// candidates for the generic mention g's antecedents are full SimpleEvent mentions that precede g by 0-1
// sentences, which haven't been found as an antecedent to another argument yet, and which has matching labels
val cands = specifics.filter(s => (s precedes g) && g.sentence - s.sentence < 2 && !excludeThese.contains(s) &&
(s matches g.labels.toSet.intersect(seLabels).toSeq.headOption.getOrElse("")))
if (verbose) println(s"Candidates are '${cands.map(_.text).mkString("', '")}'")
// use the selector to choose which among the candidates is best. Assume 1 antecedent for now.
val ant = selector(g.asInstanceOf[CorefMention], cands, 1)
if (verbose) println(s"matched '${ant.map(_.text).mkString(", ")}'")
gInState.get.antecedents ++= ant
excludeThese ++= ant
cx.sieves += "simpleEventMatch"
}
}
case _ => ()
}
mentions
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy