All Downloads are FREE. Search and download functionalities are using the official Maven repository.

epic.sequences.Segmentation.scala Maven / Gradle / Ivy

The newest version!
package epic.sequences

import epic.trees.{Tree, Span}
import scala.collection.mutable.ArrayBuffer
import nak.data.Example

/**
 *
 * @author dlwh
 */
case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)],
                              words: IndexedSeq[W],
                              id: String = "") extends Example[IndexedSeq[(L, Span)], IndexedSeq[W]] {

  def segmentsWithOutside: Iterator[(Option[L], Span)] = {
    val segs = for {
      qq@IndexedSeq((pL, pSpan), (l, span)) <- (segments.headOption.map(pair => pair._1 -> Span(0, 0)).toIndexedSeq ++ segments).sliding(2)
      padding = Iterator.range(pSpan.end, span.begin).map(i => None -> Span(i, i + 1))
      pair <- padding ++ Iterator((Some(l), span))
    } yield {
      pair
    }


    val lastSpanEnd = segments.lastOption match {
      case Some((_, Span(_, end))) => end
      case _ => 0
    }

    segs ++ (lastSpanEnd until length).map(i => None -> Span(i, i + 1))
  }



  def render: String = {
    segmentsWithOutside.map {
      case (None, span) => words.slice(span.begin, span.end).mkString(" ")
      case (Some(l), span) => words.slice(span.begin, span.end).mkString(s"[$l: ", " ", "]")
    }.mkString(" ")
  }


  def features = words

  def length: Int = words.length

  def label: IndexedSeq[(L, Span)] = segments

  def asBIOSequence[LL>:L](outsideLabel: LL): TaggedSequence[BIOETag[L], W] = {
    val outLabels = new ArrayBuffer[BIOETag[L]]()
    for((l,span) <- segments if !span.isEmpty) {
      while(outLabels.length < span.begin) {
        outLabels += BIOETag.Outside
      }

      if(l == outsideLabel)
        outLabels += BIOETag.Outside
      else
        outLabels += BIOETag.Begin(l)
      for(i <- (span.begin+1) until (span.end) ) {
        outLabels += {if(l != outsideLabel) BIOETag.Inside(l) else BIOETag.Outside}
      }
    }
    while(outLabels.length < words.length) {
      outLabels += BIOETag.Outside
    }
    assert(outLabels.length == words.length)
    TaggedSequence(outLabels, words, id +"-bio")
  }

  def asFlatTaggedSequence[LL>:L]: TaggedSequence[Option[LL], W] = {
    val outLabels = new ArrayBuffer[Option[LL]]()
    for((l,span) <- segments if !span.isEmpty) {
      while(outLabels.length < span.begin) {
        outLabels += None
      }

      for(i <- (span.begin) until (span.end) ) {
        outLabels += Some(l)
      }
    }
    while(outLabels.length < words.length) {
      outLabels += None
    }
    assert(outLabels.length == words.length)
    TaggedSequence(outLabels, words, id +"-flattened")
  }

  def toTree[LL>:L](outsideLabel: LL):Tree[LL] = {
    val outLabels = new ArrayBuffer[(LL, Span)]()
    for((l,span) <- segments if !span.isEmpty) {
      val lastEnd = outLabels.lastOption.map(_._2.end).getOrElse(0)
      if(lastEnd < span.begin) {
        outLabels += (outsideLabel -> Span(lastEnd, span.begin))
      }

      outLabels += (l -> span)
    }

    val lastEnd = outLabels.lastOption.map(_._2.end).getOrElse(0)
    if(lastEnd < words.length) {
      outLabels += (outsideLabel -> Span(lastEnd, words.length))
    }

    val t = Tree(outsideLabel, outLabels.map { case (l, span) => Tree(l, IndexedSeq.empty, span)}, Span(0, words.length))
    assert(t.isValid, outLabels)
    t
  }
}

object Segmentation {
  def fromBIOSequence[L, W](seq: TaggedSequence[BIOETag[L], W], outsideLabel: L):Segmentation[L, W] = {
    import BIOETag._
    val spans = ArrayBuffer[(L, Span)]()
    var currentStart = 0
    var currentLabel = outsideLabel
    for(i <- 0 until seq.length) {
      seq.label(i) match {
        case Begin(l) =>
          if(currentStart < i)
           spans += (currentLabel -> Span(currentStart, i))
          currentStart = i
          currentLabel = l
        case Inside(l) =>
          if(currentLabel != l) {
            if(currentStart < i)
              spans += (currentLabel -> Span(currentStart, i))
            currentStart = i
            currentLabel = l
          }
        case End(l) =>
          if(currentLabel != l) {
            if(currentStart < i)
              spans += (currentLabel -> Span(currentStart, i))
            currentStart = i
            currentLabel = l
          }
          spans += (currentLabel -> Span(currentStart, i+1))
          currentStart = i + 1
        case Outside =>
          if(currentLabel != outsideLabel) {
            if(currentStart < i)
              spans += (currentLabel -> Span(currentStart, i))
            currentStart = i
            currentLabel = outsideLabel
          }
          spans += (currentLabel -> Span(currentStart, i+1))
          currentStart = i + 1

      }
    }
    if(currentStart < seq.length)
      spans += (currentLabel -> Span(currentStart, seq.length))
    Segmentation(spans, seq.words, seq.id.replaceAll("-bio","-seg"))
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy