
// epic.corpora.MascUtil.scala — recovered from a Maven/Gradle/Ivy artifact listing
package epic.corpora
import scala.xml.{Source => _, _}
import java.io._
import io.Codec
import java.net.URL
import epic.slab._
import epic.trees.Span
import MascTransform._
/**
 * Convert native MASC xml into CONLL format for named entity recognition.
 *
 * @author jasonbaldridge
 */
object MascTransform {

  /** A MASC graph node that points at one or more target region ids. */
  case class MNode(id: String, targets: Seq[String])

  /** A MASC annotation: a label and feature map attached to the element `ref`. */
  case class MAnnotation(id: String, label: String, ref: String, features: Map[String,String])

  /** A directed edge between two MASC graph elements. */
  case class MEdge(id: String, from: String, to: String)

  /** A character region [start, end) in the raw text, ordered by start offset. */
  case class MRegion(id: String, start: Int, end: Int) extends Ordered[MRegion] {
    def span = Span(start, end)
    def compare(that: MRegion) = this.start - that.start
  }

  /**
   * Entry point. args(0) is the MASC root directory; optional args(1) is the
   * output directory (defaults to /tmp). Writes train/dev/test splits.
   */
  def main(args: Array[String]) {
    val mascDir = args(0)
    val outputDir = new File(if (args.length > 1) args(1) else "/tmp")
    outputDir.mkdirs

    val targets = collectTargets(new File(mascDir))

    // Get 3/5 for train, 1/5 for dev, and 1/5 for test
    val targetsAndIndices = targets.zipWithIndex
    val trainSet = targetsAndIndices.filter(_._2 % 5 < 3).unzip._1
    val devSet = targetsAndIndices.filter(_._2 % 5 == 3).unzip._1
    val testSet = targetsAndIndices.filter(_._2 % 5 == 4).unzip._1
    processSet(outputDir, "train", trainSet)
    processSet(outputDir, "dev", devSet)
    processSet(outputDir, "test", testSet)
  }

  /**
   * Recursively collect (directory, prefix) pairs for every ".txt" file under
   * `dir`, where prefix is the file name minus its ".txt" extension.
   */
  def collectTargets(dir: File): Seq[(File,String)] = {
    val files = dir.listFiles.toSeq
    val filesInDir = files
      .filter(_.getName.endsWith(".txt"))
      .map(file => (dir, file.getName.dropRight(4)))
    filesInDir ++ files.filter(_.isDirectory).flatMap(collectTargets)
  }

  /**
   * Write one corpus split to `parentDir/outputName`, producing:
   *  - <name>-sent.txt: one raw sentence per line (newlines replaced by spaces),
   *  - <name>-tok.txt:  tokenized sentences; tokens adjacent in the raw text
   *                     are joined with the "<SPLIT>" marker, others with " ",
   *  - <name>-ner.txt:  CONLL-style "token pos pos ner" lines, with a blank
   *                     line between sentences.
   */
  def processSet(parentDir: File, outputName: String, targets: Seq[(File, String)]) {
    System.err.println("Creating " + outputName)
    val outputDir = new File(parentDir, outputName)
    outputDir.mkdirs
    val outputSentences = new FileWriter(new File(outputDir, outputName + "-sent.txt"))
    val outputTokens = new FileWriter(new File(outputDir, outputName + "-tok.txt"))
    val outputNer = new FileWriter(new File(outputDir, outputName + "-ner.txt"))
    try {
      for (mfile <- MascFile(targets)) {
        for (sentence <- mfile.sentences) {
          val tokenizedSentence = new StringBuilder
          val MascSentence(tokens, postags, nerLabels, regions) = sentence
          (0 until sentence.numTokens).foreach { i =>
            val (tok, pos, ner, region) = (tokens(i), postags(i), nerLabels(i), regions(i))
            // Tokens should never contain spaces; warn with the file location.
            // (Fixed a stray doubled "+" in the original message construction.)
            if (tok.exists(_.isSpaceChar)) {
              println("Weird token! '" + tok + "' " + mfile.dir + "/" + mfile.prefix + ".txt:" + region.start + "-" + region.end)
            }
            // FIX: this expression was garbled in the checked-in source
            // ("if (i\" else \" \"") by an HTML stripper. Restored semantics:
            // join tokens that abut in the raw text with "<SPLIT>", else " ".
            val split =
              if (i < sentence.numTokens - 1 && region.end == regions(i + 1).start) "<SPLIT>"
              else " "
            tokenizedSentence.append(tok).append(split)
            outputNer.write(tok + " " + pos + " " + pos + " " + ner + "\n")
          }
          outputNer.write("\n")
          val sentStart = sentence.orderedRegions.head.start
          val sentEnd = sentence.orderedRegions.last.end
          val sentenceText = mfile.rawtext.slice(sentStart, sentEnd).replaceAll("\n", " ")
          outputSentences.write(sentenceText + "\n")
          outputTokens.write(tokenizedSentence.toString.trim + "\n")
        }
      }
    } finally {
      // Always release the file handles, even if a document fails mid-way.
      outputNer.flush()
      outputNer.close()
      outputSentences.flush()
      outputSentences.close()
      outputTokens.flush()
      outputTokens.close()
    }
    System.err.println()
  }
}
/**
 * One sentence's aligned annotation sequences: tokens, POS tags, BIO
 * named-entity labels, and the character regions the tokens came from.
 * All four sequences are index-aligned.
 */
case class MascSentence (
  orderedTokens: Seq[String],
  orderedPos: Seq[String],
  bioLabels: Seq[String],
  orderedRegions: Seq[MRegion]
) {
  /** Number of tokens in this sentence (computed once, on first access). */
  lazy val numTokens = orderedTokens.size
}
/**
 * A single MASC document: the directory and file prefix it was loaded from,
 * its raw text, and the sentences extracted from its standoff annotations.
 */
class MascFile (
  val dir: File,
  val prefix: String,
  val rawtext: String,
  val sentences: Seq[MascSentence]
) {
  /** Number of sentences in this document (computed once, on first access). */
  lazy val numSentences = sentences.size
}
object MascFile {

  import MascUtil._
  import io.Source

  // Sentinel NE annotation assigned to tokens that fall outside any named entity.
  lazy val outsideNe = MAnnotation("outside", "outside", "none", Map[String,String]())

  /**
   * Load every (directory, prefix) target as a MascFile, logging successes and
   * failures to stderr and dropping targets that fail to parse.
   */
  def apply(targets: Seq[(File, String)]): Iterator[MascFile] = {
    targets.toIterator.flatMap { case(file, prefix) => {
      try {
        val mfile = MascFile(file,prefix)
        System.err.println("Success: " + file + "," + prefix)
        Some(mfile)
      }
      // NOTE(review): `case e: Throwable` also swallows fatal errors (OOM,
      // StackOverflowError); scala.util.control.NonFatal would be safer —
      // confirm the intent before changing.
      catch { case e: Throwable =>
        System.err.println("Failure: " + file + "," + prefix)
        None
      }
    }}
  }

  /**
   * Parse one MASC document from its directory and file prefix: reads
   * <prefix>.txt (raw text) plus the -s, -seg, -penn and -ne standoff XML
   * files and aligns them into per-sentence token/POS/NE sequences.
   */
  def apply(dir: File, prefix: String): MascFile = {

    def dirFile(prefix: String) = new File(dir, prefix)
    def loadXML(file: File) = XML.load(new InputStreamReader( new FileInputStream(file), "UTF-8"))
    implicit val codec = Codec.UTF8

    // Raw text
    val rawtext = Source.fromFile(dirFile(prefix+".txt"))(codec).mkString

    // Sentence information
    val sentenceXml = loadXML(dirFile(prefix+"-s.xml"))
    val sentenceRegions = getRegions(sentenceXml).sorted

    // Basic segment information, keyed by region id.
    val segmentXml = loadXML(dirFile(prefix+"-seg.xml"))
    val segmentRegions = getRegions(segmentXml).map(r => (r.id -> r)).toMap

    // POS information: each Penn node targets one or more segment regions;
    // fold them into a single covering region per token id.
    val pennXml = loadXML(dirFile(prefix+"-penn.xml"))
    val tokenRegions = getNodes(pennXml).map { n =>
      val regions = n.targets.map(segmentRegions).sorted
      (n.id -> MRegion(n.id, regions.head.start, regions.last.end))
    }.toMap
    // Token surface strings, sliced out of the raw text by region offsets.
    val tokens = tokenRegions.mapValues(region => rawtext.slice(region.start, region.end))
    val posAnnotations = getAnnotations(pennXml).map(anno => (anno.ref -> anno)).toMap

    // NER information; tokens without an NE edge map to the "outside" sentinel.
    val neXml = loadXML(dirFile(prefix+"-ne.xml"))
    val neAnnotations =
      getAnnotations(neXml).map(anno => (anno.ref -> anno)).toMap.withDefault(x=>outsideNe)
    val neEdges =
      getEdges(neXml).map(edge => (edge.to -> edge.from)).toMap.withDefault(x=>"outside")

    // A helper function for pulling out the information associated with a
    // subsequence of the tokens in the document.
    def orderedTokPosNer(orderedRegions: Seq[MRegion]) = {
      if (orderedRegions.length == 0) None
      else {
        val orderedTokens = orderedRegions.map(reg=>tokens(reg.id))
        val (orderedPos, orderedNe) = orderedRegions.map { region => {
          val posAnno = posAnnotations(region.id)
          val neAnno = neAnnotations(neEdges(posAnno.ref))
          (getPos(posAnno), neAnno)
        }}.unzip
        // Convert per-token NE annotations to BIO labels: a token starts a new
        // entity ("B-") when its annotation id differs from the previous token's.
        val bioLabels = (outsideNe +: orderedNe).sliding(2).toSeq.map {
          case Seq(prev, curr) =>
            if (curr.label == "outside")
              nerLabelStandardizer(curr.label)
            else {
              val prefix = if (prev.id != curr.id) "B-" else "I-"
              prefix+nerLabelStandardizer(curr.label)
            }
        }
        Some(MascSentence(orderedTokens, orderedPos, bioLabels, orderedRegions))
      }
    }

    // Insert the "missing" sentences. (Content not marked as a sentence,
    // but containing tokens.)
    //val paddedSentenceRegionBuffer =
    //  collection.mutable.ListBuffer[MRegion](sentenceRegions.head)
    //
    //sentenceRegions.sliding(2).foreach {
    //  case Seq(prev, curr) => {
    //    if (prev.end + 1 < curr.start)
    //      paddedSentenceRegionBuffer.append(MRegion("", prev.end + 1, curr.start - 1))
    //    paddedSentenceRegionBuffer.append(curr)
    //  }
    //}
    //
    //val paddedSentenceRegions = paddedSentenceRegionBuffer.toSeq
    val paddedSentenceRegions = sentenceRegions

    // Pull out the sequence of token, pos, and NE for each sentence.
    val allOrderedTokRegions = tokenRegions.values.toIndexedSeq.sorted
    // `index` is a cursor into the ordered token list: successive sentences
    // consume disjoint, strictly advancing slices of tokens.
    var index = 0
    val allDataBySentence = paddedSentenceRegions.flatMap { region => {
      //val startIndex = math.max(index, region.start)
      val startIndex = math.max(index, allOrderedTokRegions.indexWhere(t=>t.start>=region.start,index))
      //val startIndex = index
      val endIndex = allOrderedTokRegions.indexWhere(t=>t.end>region.end,startIndex)
      //println(region.start + " -- " + region.end)
      //println(index + ": " + startIndex + " , " + endIndex)
      if (startIndex == endIndex) None
      else {
        val sentence = allOrderedTokRegions.slice(startIndex,endIndex)
        index = endIndex
        orderedTokPosNer(sentence)
      }
    }}

    new MascFile(dir, prefix, rawtext, allDataBySentence)
  }
}
/**
 * Simple objects and functions for working with MASC data.
 *
 * @author jasonbaldridge
 */
object MascUtil {

  /** Extract the `xml:id` attribute of a node as a String. */
  def xmlId(node: Node) = (node \ "@{http://www.w3.org/XML/1998/namespace}id").toString

  // Map MASC entity labels to CONLL-style labels; unknown labels become "O".
  lazy val nerLabelStandardizer = Map(
    "location" -> "LOC",
    "person" -> "PER",
    "org" -> "ORG",
    //"date" -> "DAT"
    "date" -> "MISC"
  ).withDefault(_ => "O")

  /** Read every <region> element into an MRegion from its "anchors" offsets. */
  def getRegions(doc: Elem) = (doc \\ "region").toSeq.map { regionXml =>
    val Array(startOffset, endOffset) = (regionXml \ "@anchors").toString.split(" ")
    MRegion(xmlId(regionXml), startOffset.toInt, endOffset.toInt)
  }

  /** Read every <node> element, resolving its <link> targets; fails if a node has no link. */
  def getNodes(doc: Elem) = (doc \\ "node").toSeq.flatMap { nodeXml =>
    (nodeXml \ "link").headOption match {
      case Some(link) =>
        val targetIds = (link \ "@targets").toString.split(" ").toSeq
        Some(MNode(xmlId(nodeXml), targetIds))
      case None =>
        throw new Exception("Missing link element.") //None OK?
    }
  }

  /** Read every <edge> element into an MEdge (from/to attribute pair). */
  def getEdges(doc: Elem) = (doc \\ "edge").toSeq.map { edgeXml =>
    MEdge(xmlId(edgeXml), (edgeXml \ "@from").toString, (edgeXml \ "@to").toString)
  }

  /**
   * Read every <a> (annotation) element, collecting its <f> feature elements
   * into a name->value map. A feature with an empty "value" attribute falls
   * back to the stringified child content of the <f> element.
   */
  def getAnnotations(doc: Elem) = (doc \\ "a").toSeq.map { annoXml =>
    val features = (annoXml \\ "f").toSeq.map { featNode =>
      val name = (featNode \ "@name").toString
      val value = (featNode \ "@value").toString
      name -> (if (value.nonEmpty) value else featNode.child.toString)
    }.toMap
    MAnnotation(xmlId(annoXml), (annoXml \ "@label").toString, (annoXml \ "@ref").toString, features)
  }

  // Have to go through some pains to make sure we get a POS for every token.
  def getPos(anno: MAnnotation) = anno.features.get("msd") match {
    case Some(msd) => msd
    case None =>
      if (anno.features.getOrElse("kind", "") == "urlAddress") "URL"
      else anno.features.getOrElse("categor", "UNK")
  }
}
object MascSlab {

  /**
   * Create a Slab from a MASC .txt file
   *
   * @param textFileUrl The URL of the MASC .txt (plain text) file.
   * @return A Slab of the text, with the URL saved as a Source annotation.
   */
  def apply(textFileUrl: URL): StringSlab[Source] = {
    val text = io.Source.fromURL(textFileUrl)(Codec.UTF8).mkString
    val slab = Slab(text)
    // Record the originating URL as a Source annotation spanning the whole text.
    slab.append(Span(0, text.length), new Source(textFileUrl))
  }

  /**
   * Add sentences to a MASC Slab using the MASC -s.xml file.
   *
   * Assumes there will be exactly one Source annotation, providing the URL of the MASC .txt file.
   *
   * @param slab The Slab containing the text and source URL
   * @return The Slab with added Sentence annotations as read from the MASC -s.xml file.
   */
  def s[I <: Source](slab: StringSlab[I]): Slab[String, Span, I with Sentence] = {
    // Fails fast (MatchError) if there is not exactly one Source annotation.
    val List((_, source)) = slab.iterator[Source].toList
    // The -s.xml standoff file lives alongside the .txt file at the same URL.
    val sentenceXml = XML.load(source.url.toString().replaceAll("[.]txt$", "-s.xml"))
    val sentences = for (region <- MascUtil.getRegions(sentenceXml)) yield {
      region.span -> Sentence(Some(region.id))
    }
    slab.++[Sentence](sentences)
  }

  /**
   * Add segments to a MASC Slab using the MASC -seg.xml file.
   *
   * Assumes there will be exactly one Source annotation, providing the URL of the MASC .txt file.
   *
   * @param slab The Slab containing the text and source URL
   * @return The Slab with added Segment annotations as read from the MASC -seg.xml file.
   */
  def seg[I <: Source](slab: StringSlab[I]): Slab[String, Span, I with Segment] = {
    val List((_, source)) = slab.iterator[Source].toList
    // Derive the sibling -seg.xml URL from the .txt URL.
    val segmentXml = XML.load(source.url.toString().replaceAll("[.]txt$", "-seg.xml"))
    val segments = for (region <- MascUtil.getRegions(segmentXml)) yield {
      region.span -> Segment(Some(region.id))
    }
    slab.++[Segment](segments)
  }

  /**
   * Adds Penn PartOfSpeech tags using the MASC -penn.xml file.
   *
   * Assumes there will be exactly one Source annotation, providing the URL of the MASC .txt file.
   *
   * @param slab The Slab containing the text, the source URL and the Segment annotations.
   * @return The Slab with added PartOfSpeech annotations as read from the MASC -penn.xml file.
   */
  def penn[I <: Source with Segment](slab: StringSlab[I]) = {
    val List((_, source)) = slab.iterator[Source].toList
    val pennXml = XML.load(source.url.toString().replaceAll("[.]txt$", "-penn.xml"))
    // Index Segment spans by their xml id so Penn nodes can resolve their targets.
    val idToSegment = (for ((span, s) <- slab.iterator[Segment]; id <- s.id.iterator) yield id -> span).toMap
    val idToPosRegion = MascUtil.getNodes(pennXml).map{ node =>
      // Sorted by (begin, -end): for equal begins, the wider span sorts first.
      val segments = node.targets.map(idToSegment).sortBy{ s => (s.begin, -s.end)}
      // Each POS region covers from the first target's begin to the last target's end.
      node.id -> MRegion(node.id, segments.head.begin, segments.last.end)
    }.toMap
    val partOfSpeechTags = for (annotation <- MascUtil.getAnnotations(pennXml)) yield {
      val region = idToPosRegion(annotation.ref)
      val tag = MascUtil.getPos(annotation)
      region.span -> PartOfSpeech(tag, Some(region.id))
    }
    // TODO: should probably create Stem annotations too, available as the MASC "base" feature
    // FIXME: should not be necesssary to sort, but Slab needs better implementation
    slab ++ partOfSpeechTags
  }

  /**
   * Adds EntityMention annotations using the MASC -ne.xml file.
   *
   * Assumes there will be exactly one Source annotation, providing the URL of the MASC .txt file.
   *
   * @param slab The Slab containing the text, the source URL and PartOfSpeech annotations.
   * @return The Slab with added EntityMention annotations as read from the MASC -ne.xml file.
   */
  def namedEntities[I <: Source with PartOfSpeech](slab: StringSlab[I]) = {
    val List((_, source)) = slab.iterator[Source].toList
    val neXml = XML.load(source.url.toString().replaceAll("[.]txt$", "-ne.xml"))
    // Index PartOfSpeech annotations (span and tag) by their xml id.
    val idToPos = (for ((span, p) <- slab.iterator[PartOfSpeech]; id <- p.id.iterator) yield id -> (span, p)).toMap
    // Group POS ids by named-entity id: one entity may cover several POS tokens.
    val neIdPosIdTuples = MascUtil.getEdges(neXml).map(e => (e.from -> e.to))
    val neIdToPosIds = neIdPosIdTuples.groupBy(_._1).mapValues(_.map(_._2))
    val entityMentions = for (annotation <- MascUtil.getAnnotations(neXml)) yield {
      val posTags = neIdToPosIds(annotation.ref).map(idToPos).sortBy{ case (span, p) => span.begin -> -span.end}
      // The mention spans from the first covered token's begin to the last one's end.
      val begin = posTags.head._1.begin
      val end = posTags.last._1.end
      Span(begin, end) -> EntityMention(annotation.label, Some(annotation.ref))
    }
    slab ++ entityMentions.iterator
  }
}
// © 2015 - 2025 Weber Informatics LLC (repository listing footer; not part of the original source)