// org.clulab.reach.ReachSystem.scala Maven / Gradle / Ivy
// The newest version!
package org.clulab.reach
import org.clulab.coref.Coref
import org.clulab.odin._
import org.clulab.reach.grounding._
import org.clulab.reach.mentions._
import RuleReader.{Rules, readResource}
import org.clulab.processors.Document
import org.clulab.processors.bionlp.BioNLPProcessor
import scala.collection.immutable.HashSet
import scala.collection.mutable
import org.clulab.reach.context._
import org.clulab.reach.context.ContextEngineFactory.Engine._
import ai.lum.nxmlreader.NxmlDocument
import com.typesafe.scalalogging.LazyLogging
import org.clulab.reach.darpa.{DarpaActions, MentionFilter, NegationHandler}
/**
  * Reading pipeline: runs the entity, modification and event grammars over a document and
  * post-processes the resulting mentions (context assignment, grounding, coreference,
  * filtering of incomplete mentions, display labels).
  *
  * @param rules             optional pre-loaded rule set; when empty, each grammar is read with
  *                          `readResource` from the corresponding file named by `RuleReader`
  * @param proc              optional processor to reuse; when empty, a
  *                          `BioNLPProcessor(withChunks = false)` is created
  * @param contextEngineType kind of context engine built by `ContextEngineFactory` on every
  *                          `extractFrom` call
  * @param contextParams     parameters forwarded to `ContextEngineFactory.buildEngine`
  */
class ReachSystem(
  rules: Option[Rules] = None,
  proc: Option[BioNLPProcessor] = None,
  contextEngineType: Engine = Dummy,
  contextParams: Map[String, String] = Map()
) extends LazyLogging {

  import ReachSystem._

  // Prefer the caller-supplied rules; fall back to the bundled resources otherwise.
  // `getOrElse` takes its argument by name, so a resource is only read when `rules` is empty.
  val entityRules = rules.map(_.entities).getOrElse(readResource(RuleReader.entitiesMasterFile))
  val modificationRules = rules.map(_.modifications).getOrElse(readResource(RuleReader.modificationsMasterFile))
  val eventRules = rules.map(_.events).getOrElse(readResource(RuleReader.eventsMasterFile))
  val contextRules = rules.map(_.context).getOrElse(readResource(RuleReader.contextRelationsFile))

  // initialize actions object
  val actions = new DarpaActions
  // initialize entity lookup (find grounding candidates)
  val entityLookup = new ReachEntityLookup
  val grounder = new ReachGrounder

  // start entity extraction engine
  // this engine extracts all physical entities of interest
  val entityEngine = ExtractorEngine(entityRules, actions)
  // start modification engine
  // this engine extracts modification features and attaches them to the corresponding entity
  val modificationEngine = ExtractorEngine(modificationRules, actions)
  // start event extraction engine
  // this engine extracts simple and recursive events and applies coreference
  val eventEngine = ExtractorEngine(eventRules, actions, actions.cleanupEvents)

  // initialize processor
  val processor: BioNLPProcessor = proc.getOrElse(new BioNLPProcessor(withChunks = false))
  // The annotation result is discarded.
  // NOTE(review): presumably a warm-up call so models load at construction time -- confirm.
  processor.annotate("something")

  /** returns string with all rules used by the system */
  def allRules: String =
    Seq(entityRules, modificationRules, eventRules, contextRules).mkString("\n\n")

  /**
    * Annotates `text` (keeping the original text in the document) and assigns the document id.
    *
    * @param text    text to annotate
    * @param docId   document identifier
    * @param chunkId optional chunk identifier; when non-empty the id becomes `docId_chunkId`
    * @return the annotated document with `id` set
    */
  def mkDoc(text: String, docId: String, chunkId: String = ""): Document = {
    val doc = processor.annotate(text, keepText = true)
    val id = if (chunkId.isEmpty) docId else s"${docId}_${chunkId}"
    doc.id = Some(id)
    doc
  }

  /** Annotates the text of an nxml paper, using its PMC id as both document id and chunk id. */
  def mkDoc(nxml: NxmlDocument): Document = {
    // we are using the PMC as the chunk-id because we now read
    // the whole paper in a single chunk
    mkDoc(nxml.text, nxml.pmc, nxml.pmc)
  }

  /** Extracts mentions from a single entry, using its text, name and chunk id. */
  def extractFrom(entry: FriesEntry): Seq[BioMention] =
    extractFrom(entry.text, entry.name, entry.chunkId)

  /** Extracts mentions from an nxml paper; the paper is passed along so mentions can be grouped by standoff. */
  def extractFrom(nxml: NxmlDocument): Seq[BioMention] = {
    // the document is built by mkDoc(nxml), i.e. the PMC id serves as the chunk id
    extractFrom(mkDoc(nxml), Some(nxml))
  }

  /**
    * Runs the full pipeline on one annotated document.
    *
    * @param doc     annotated document
    * @param nxmlDoc the nxml paper `doc` was built from, if any; when present, mentions are
    *                grouped by standoff before coreference, otherwise they form a single group
    * @return the mentions left after coreference and filtering, with display labels resolved
    */
  def extractFrom(doc: Document, nxmlDoc: Option[NxmlDocument]): Seq[BioMention] = {
    // initialize the context engine
    val contextEngine = ContextEngineFactory.buildEngine(contextEngineType, contextParams)

    // entities first: the context engine infers from them before assigning context
    val entities = extractEntitiesFrom(doc)
    contextEngine.infer(entities)
    val entitiesWithContext = contextEngine.assign(entities)

    val unfilteredEvents = extractEventsFrom(doc, entitiesWithContext)
    logger.debug(s"${unfilteredEvents.size} unfilteredEvents: ${display.summarizeMentions(unfilteredEvents,doc)}")
    val events = MentionFilter.keepMostCompleteMentions(unfilteredEvents, State(unfilteredEvents))
    logger.debug(s"${events.size} events after MentionFilter.keepMostCompleteMentions: ${display.summarizeMentions(events, doc)}")

    contextEngine.update(events)
    val eventsWithContext = contextEngine.assign(events)
    logger.debug(s"${eventsWithContext.size} events after contextEngine.assign: ${display.summarizeMentions(eventsWithContext, doc)}")

    val grounded = grounder(eventsWithContext)
    logger.debug(s"${grounded.size} events after grounder: ${display.summarizeMentions(grounded, doc)}")

    // Coref expects to get all mentions grouped
    // we group according to the standoff, if there is one
    // else we just make one group with all the mentions
    val groundedAndGrouped = nxmlDoc match {
      case Some(nxml) => groupMentionsByStandoff(grounded, nxml)
      case None => Seq(grounded)
    }
    logger.debug(s"${groundedAndGrouped.flatten.size} events after groundedAndGrouped: ${display.summarizeMentions(groundedAndGrouped.flatten, doc)}")

    val resolved = resolveCoref(groundedAndGrouped)
    logger.debug(s"${resolved.size} events after coref: ${display.summarizeMentions(resolved, doc)}")

    // Coref introduced incomplete Mentions that now need to be pruned
    val complete = MentionFilter.keepMostCompleteMentions(resolved, State(resolved)).map(_.toCorefMention)
    logger.debug(s"${complete.size} events after coref + 2nd MentionFilter.keepMostCompleteMentions: ${display.summarizeMentions(complete, doc)}")

    logger.debug(s"Resolving display...")
    resolveDisplay(complete)
  }

  /** Annotates every entry with `mkDoc` and extracts mentions from the resulting documents. */
  def extractFrom(entries: Seq[FriesEntry]): Seq[BioMention] =
    extractFrom(entries, entries.map(e => mkDoc(e.text, e.name, e.chunkId)))

  /**
    * Runs the full pipeline over several documents, sharing one context engine among them.
    *
    * @param entries   entries the documents were built from; not read by this method
    * @param documents annotated documents; coreference receives the mentions grouped per
    *                  document, in this order
    * @return the mentions left after coreference and filtering, with display labels resolved
    */
  def extractFrom(entries: Seq[FriesEntry], documents: Seq[Document]): Seq[BioMention] = {
    // initialize the context engine
    val contextEngine = ContextEngineFactory.buildEngine(contextEngineType, contextParams)

    // context is inferred from the entities of all documents before any assignment
    val entitiesPerEntry = documents.map(extractEntitiesFrom)
    contextEngine.infer(entitiesPerEntry.flatten)
    val entitiesWithContextPerEntry = entitiesPerEntry.map(es => contextEngine.assign(es))

    // get events
    val eventsPerEntry = for ((doc, es) <- documents zip entitiesWithContextPerEntry) yield {
      val events = extractEventsFrom(doc, es)
      MentionFilter.keepMostCompleteMentions(events, State(events))
    }

    contextEngine.update(eventsPerEntry.flatten)
    val eventsWithContext = contextEngine.assign(eventsPerEntry.flatten)
    val grounded = grounder(eventsWithContext)

    // Coref expects to get all mentions grouped by document
    val resolved = resolveCoref(groupMentionsByDocument(grounded, documents))
    // Coref introduced incomplete Mentions that now need to be pruned
    val complete = MentionFilter.keepMostCompleteMentions(resolved, State(resolved)).map(_.toCorefMention)
    resolveDisplay(complete)
  }

  // this method groups the mentions by document
  // the sequence of documents should be sorted in order of appearance in the paper
  /** Returns one group per element of `documents` (same order) holding the mentions whose `document` equals it. */
  def groupMentionsByDocument(mentions: Seq[BioMention], documents: Seq[Document]): Seq[Seq[BioMention]] =
    documents.map(doc => mentions.filter(_.document == doc))

  /** group mentions according to their position in the nxml standoff */
  def groupMentionsByStandoff(mentions: Seq[BioMention], nxml: NxmlDocument): Seq[Seq[BioMention]] = {
    // NOTE(review): the order of groups coming out of groupBy(...).values is unspecified,
    // while resolveCoref is described as expecting order of appearance -- confirm this is acceptable.
    mentions.groupBy(m => nxml.standoff.getTerminals(m.startOffset, m.endOffset)).values.toVector
  }

  /** Annotates `text` with `mkDoc` and extracts mentions from the resulting document. */
  def extractFrom(text: String, docId: String, chunkId: String): Seq[BioMention] =
    extractFrom(mkDoc(text, docId, chunkId))

  /**
    * Extracts mentions from an annotated document that has no nxml counterpart.
    * Throws `IllegalArgumentException` (via `require`) when the document has no id or no text.
    */
  def extractFrom(doc: Document): Seq[BioMention] = {
    require(doc.id.isDefined, "document must have an id")
    require(doc.text.isDefined, "document should keep original text")
    extractFrom(doc, None) // no nxml
  }

  /**
    * Extracts entities, attaches modifications, splits entities carrying several mutations
    * into one mention per mutation, and runs the entity lookup on the result.
    */
  def extractEntitiesFrom(doc: Document): Seq[BioMention] = {
    // extract entities
    val entities = entityEngine.extractByType[BioMention](doc)
    // attach modification features to entities
    val modifiedEntities = modificationEngine.extractByType[BioMention](doc, State(entities))
    // only text-bound mentions are expanded per mutation; everything else passes through
    val mutationAddedEntities = modifiedEntities flatMap {
      case m: BioTextBoundMention => mutationsToMentions(m)
      case m => Seq(m)
    }
    // add grounding candidates to entities
    entityLookup(mutationAddedEntities)
  }

  /** If the given mention has many mutations attached to it, return a mention for each mutation. */
  def mutationsToMentions(mention: BioTextBoundMention): Seq[BioMention] = {
    val mutations = mention.modifications.filter(_.isInstanceOf[Mutant])
    if (mutations.size <= 1) {
      // zero or one mutation: the mention is returned as is
      Seq(mention)
    } else {
      mutations.map { mut =>
        val tbm = new BioTextBoundMention(mention.labels, mention.tokenInterval,
          mention.sentence, mention.document,
          mention.keep, mention.foundBy)
        // copy all attachments
        BioMention.copyAttachments(mention, tbm)
        // remove all mutations
        tbm.modifications = tbm.modifications diff mutations
        // add desired mutation only
        tbm.modifications += mut
        tbm
      }.toSeq
    }
  }

  /**
    * Runs the event grammar on `doc`, seeded with `entities`, and drops every mention
    * matching "ModificationTrigger".
    */
  def extractEventsFrom(doc: Document, entities: Seq[BioMention]): Seq[BioMention] = {
    val mentions = eventEngine.extractByType[BioMention](doc, State(entities))
    // clean modified entities
    // Make sure we don't have any "ModificationTrigger" Mentions
    val validMentions = mentions.filterNot(_ matches "ModificationTrigger")
    // handle multiple Negation modifications
    // the call's result is not used; presumably it updates the mentions in place -- confirm
    NegationHandler.handleNegations(validMentions)
    validMentions
  }

  // this method gets sequence composed of sequences of mentions, one per doc.
  // each doc corresponds to a chunk of the paper, and it expects them to be in order of appearance
  /** Applies a fresh `Coref` instance to the grouped mentions and flattens its output. */
  def resolveCoref(eventsPerDocument: Seq[Seq[BioMention]]): Seq[CorefMention] = {
    val coref = new Coref()
    coref(eventsPerDocument).flatten
  }
}
/** Helpers that set the display label of mentions once extraction is finished. */
object ReachSystem {

  /** Sets the display label (`displayLabel`) of each mention where applicable.
    * NB: By default the display label is set to the main label of the mention,
    * so, after extraction, it should never be null.
    *
    * @param ms mentions to update in place
    * @return the same sequence `ms`
    */
  def resolveDisplay(ms: Seq[CorefMention]): Seq[CorefMention] = {
    ms.foreach { m =>
      m match {
        // entities are labelled directly, without parent information
        case _: TextBoundMention with Display with Grounding =>
          resolveDisplayForEntity(m)
        // relations and events are traversed starting with an empty set of parent labels
        case _: RelationMention with Display with Grounding =>
          resolveDisplayForArguments(m, new HashSet[String])
        case _: EventMention with Display with Grounding =>
          resolveDisplayForArguments(m, new HashSet[String])
        case _ => // nothing to do
      }
    }
    ms
  }

  /** Recursively traverse the arguments of events and handle GPP entities.
    *
    * @param em      mention to process
    * @param parents labels of the events enclosing `em`
    */
  def resolveDisplayForArguments(em: CorefMention, parents: Set[String]): Unit = {
    if (em.labels.contains("Event")) { // recursively traverse the arguments of events
      // computed once: every argument of this event sees the same set of parent labels
      val newParents = parents + em.label
      for {
        args <- em.arguments.values
        arg <- args
      } {
        val crm = arg.asInstanceOf[CorefMention]
        // follow the coreference link when there is one, otherwise use the mention itself
        resolveDisplayForArguments(crm.antecedentOrElse(crm), newParents)
      }
    }
    else if (em.labels.contains("Gene_or_gene_product")) { // we only need to disambiguate these
      resolveDisplayForEntity(em, Some(parents))
    }
  }

  /** Set the displayLabel for a single mention, using optional parent label set information.
    *
    * Only mentions labelled "Gene_or_gene_product" are touched: they become "Family" when
    * grounded to a family, "Gene" when a parent label set containing "Transcription" is
    * given, and "Protein" otherwise.
    *
    * @param em      mention to update in place
    * @param parents labels of the enclosing events, if known
    */
  def resolveDisplayForEntity(em: CorefMention, parents: Option[Set[String]] = None): Unit = {
    if (em.labels.contains("Gene_or_gene_product")) {
      em.displayLabel =
        if (em.isGrounded && ReachKBUtils.isFamilyGrounded(em)) "Family"
        else if (parents.exists(_.contains("Transcription"))) "Gene"
        else "Protein"
    }
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy