All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.clulab.reach.utils.BratUtils.scala Maven / Gradle / Ivy

The newest version!
package org.clulab.reach.utils

import org.clulab.odin.{EventMention, Mention, TextBoundMention}
import org.clulab.processors.Document
import org.clulab.reach.brat.{Annotation, Brat, Event, TextBound}
import org.clulab.struct.Interval
import org.slf4j.LoggerFactory

import scala.collection.mutable

/**
 * Utility functions for converting Brat Annotations to Odin Mentions
 */
object BratUtils {

  // Both aliases name the same Interval type; they only document intent.
  // TokenInterval: a span of token indices within a sentence.
  type TokenInterval = Interval
  // CharInterval: a span of character offsets (built from a sentence's
  // startOffsets/endOffsets in this file).
  type CharInterval = Interval

  // Logger shared by the helpers in this object, named after the simple name
  // of this object's runtime class.
  val logger = LoggerFactory.getLogger(this.getClass.getSimpleName)

  // Lists the character span of every token in the document,
  // each paired with the index of the sentence it belongs to:
  // (SentenceIndex, CharacterInterval)
  def getTokenCharOffsets(doc: Document): Seq[(Int, CharInterval)] =
    for {
      sentenceIdx <- doc.sentences.indices
      (from, until) <- doc.sentences(sentenceIdx).startOffsets zip doc.sentences(sentenceIdx).endOffsets
    } yield (sentenceIdx, Interval(from, until))

  // builds the smallest single interval covering every given interval:
  // it runs from the lowest start to the highest end
  def totalSpan(intervals: Seq[Interval]): Interval = {
    val lowestStart = intervals.map(_.start).min
    val highestEnd = intervals.map(_.end).max
    Interval(lowestStart, highestEnd)
  }

  // groups (SentenceIndex, CharacterInterval) pairs by sentence,
  // producing a Map[SentenceIndex, Seq[CharacterInterval]]
  def getTokenCharOffsetsMap(
      tkIntervals: Seq[(Int, CharInterval)]
  ): Map[Int, Seq[CharInterval]] = {
    val bySentence = tkIntervals.groupBy { case (sentenceIdx, _) => sentenceIdx }
    bySentence.map { case (sentenceIdx, pairs) =>
      sentenceIdx -> pairs.map { case (_, span) => span }
    }
  }

  /**
   * Translates Brat-style annotations to Odin-style Mentions.
   *
   * Text-bound annotations become TextBoundMentions (only their first span is
   * used) and events become EventMentions; any other annotation type is
   * ignored. An annotation whose span overlaps no token is dropped, and so is
   * an event whose trigger does not resolve to a text-bound mention.
   *
   * @param doc a processors-style Document
   * @param annotations sequence of brat annotations
   * @return A sequence of Mention
   * @throws java.lang.RuntimeException if a span is not contained in any sentence
   * @throws java.util.NoSuchElementException if an event refers to an id that
   *         is not among `annotations`
   */
  def getMentionsfromAnnotations(doc: Document, annotations: Seq[Annotation]): Seq[Mention] = {
    val tokenOffsetsMap = getTokenCharOffsetsMap(getTokenCharOffsets(doc))
    // character span covered by each sentence, keyed by sentence index
    val sentenceOffsets = tokenOffsetsMap.map { case (i, ints) => (i, totalSpan(ints)) }
    // Should be exactly one annotation for an ID; if there are more, the first is kept
    val annotationLUT: Map[String, Annotation] =
      annotations.groupBy(_.id).transform((k, v) => v.head)

    // memoizes conversions so an annotation referenced by several events
    // is converted only once
    val mentionLUT = new mutable.HashMap[Annotation, Option[Mention]]

    // returns the index of a sentence that contains the interval,
    // failing with a RuntimeException when there is none
    def getSentence(int: CharInterval): Int =
      sentenceOffsets
        .collectFirst { case (i, m) if m.start <= int.start && m.end >= int.end => i }
        .getOrElse(sys.error(s"sentence not found for $int"))

    def lookupMention(name: String): Option[Mention] = getMention(annotationLUT(name))

    // maps a character interval onto the tokens of the given sentence that
    // overlap it; None when no token overlaps
    def characterToTokenInterval(
        sentenceIndex: Int,
        charInterval: CharInterval
    ): Option[TokenInterval] = {
      // indices of the tokens touched by the character interval
      val tokens = for {
        (int, i) <- tokenOffsetsMap(sentenceIndex).zipWithIndex
        if int overlaps charInterval
      } yield i
      (tokens.headOption, tokens.lastOption) match {
        case (Some(start), Some(end)) =>
          // token intervals are end-exclusive, hence the + 1
          Some(Interval(start, end + 1))
        case _ =>
          logger.debug(s"characterInterval:\t$charInterval")
          logger.debug(s"sentenceIndex:\t$sentenceIndex")
          logger.debug("sentence offsets:\n")
          logger.debug(s"${sentenceOffsets(sentenceIndex)}")
          None
      }
    }

    def getMention(annotation: Annotation): Option[Mention] = mentionLUT.get(annotation) match {
      case Some(result) => result
      case None =>
        val result = annotation match {

          case tb: TextBound =>
            val parentSentence = getSentence(tb.spans.head)
            // Convert character Interval to token Interval...
            val tokInt = characterToTokenInterval(parentSentence, tb.spans.head)
            tokInt map { interval =>
              new TextBoundMention(
                label = tb.label,
                tokenInterval = interval,
                sentence = parentSentence,
                document = doc,
                keep = true,
                foundBy = tb.id
              )
            }

          case event: Event =>
            val triggerId = event.trigger
            (annotationLUT(triggerId), lookupMention(triggerId)) match {
              case (triggerAnnotation: TextBound, Some(triggerMention: TextBoundMention)) =>
                // use trigger for sentence
                val parentSentence = getSentence(triggerAnnotation.spans.head)
                // get event arguments; arguments that yield no mention are left out
                val eventArgs: Map[String, Seq[Mention]] = event.arguments.transform {
                  (name, anns) => anns.flatMap(lookupMention)
                }

                Some(new EventMention(
                  label = event.label,
                  trigger = triggerMention,
                  arguments = eventArgs,
                  sentence = parentSentence,
                  document = doc,
                  keep = true,
                  foundBy = event.id
                ))

              case _ =>
                // the trigger is not text-bound or could not be aligned to
                // tokens: drop the event instead of failing the whole conversion
                logger.warn(
                  s"skipping event ${event.id}: trigger $triggerId did not yield a text-bound mention"
                )
                None
            }

          // ignore any other annotation type
          case _ => None
        }
        mentionLUT += (annotation -> result)
        result
    }

    // convert all annotations to mentions
    annotations.flatMap(getMention)
  }

  /**
   * Parses a Brat standoff string and converts the resulting annotations
   * into Odin-style Mentions.
   *
   * @param doc a processors-style Document
   * @param standoff a String of Brat standoff
   * @return A sequence of Odin-style Mentions
   */
  def getMentionsFromStandoff(doc: Document, standoff: String): Seq[Mention] = {
    val parsedAnnotations = Brat.readStandOff(standoff)
    getMentionsfromAnnotations(doc, parsedAnnotations)
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy