All Downloads are FREE. Search and download functionalities are using the official Maven repository.

epic.slab.Slab.scala Maven / Gradle / Ivy

There is a newer version: 0.4.4
Show newest version
package epic.slab

import scala.reflect.ClassTag
import java.net.URL
import epic.util.BinarySearch
import epic.slab.AnnotatedSpan.{EndFirstSpanOrdering, SpanOrdering}
import epic.trees.Span

/**
 * A Slab is the core "document" type in Epic. It represents a document and a set of annotations on that document,
 * such as sentence boundaries, tokens, named entity spans, etc. The ContentType is the type of the document--typically
 * a string--and the AnnotationTypes parameter is an encoding of the kinds of annotations that are present. [[epic.slab.AnalysisFunction]]s
 * can be used to add new annotations to a Slab that have the prerequisite annotations.
 *
 * Every method that adds annotations returns a Slab whose annotation type parameter is widened with the
 * newly added annotation type.
 *
 * @tparam ContentType the type of the underlying document, typically a String
 * @tparam RegionType the type used to identify the part of the content an annotation is attached to
 * @tparam AnnotationTypes compound type (built up with `with`) recording the kinds of annotations added so far
 */
trait Slab[ContentType, RegionType, +AnnotationTypes] {

  /** The complete document this slab annotates. */
  val content: ContentType

  /**
   * Extracts the part of [[content]] identified by `region`.
   *
   * @param region the region to extract
   * @return the content of that region
   */
  def spanned(region: RegionType):ContentType

  /**
   * Adds a single annotation; equivalent to `this + (region -> annotation)`.
   *
   * @param region the region the annotation is attached to
   * @param annotation the annotation value
   * @return a slab that additionally records annotations of type `A`
   */
  def append[A](region: RegionType, annotation: A): Slab[ContentType, RegionType, AnnotationTypes with A] = {
    this.+[A](region -> annotation)
  }

  /**
   * Adds a single (region, annotation) pair by delegating to [[++]].
   *
   * @param pair the region and the annotation attached to it
   * @return a slab that additionally records annotations of type `A`
   */
  def +[A](pair: (RegionType, A)): Slab[ContentType, RegionType, AnnotationTypes with A] = {
    ++[A](Iterator(pair))
  }

  /**
   * Adds every (region, annotation) pair in `annotations`.
   *
   * @param annotations the pairs to add
   * @return a slab that additionally records annotations of type `A`
   */
  def ++[A](annotations: TraversableOnce[(RegionType, A)]): Slab[ContentType, RegionType, AnnotationTypes with A]

  /** All annotations of type `A` together with their regions. Ordering is left to the implementation. */
  def iterator[A >: AnnotationTypes: ClassTag]: Iterator[(RegionType, A)]

  /** The annotations of type `A` whose region lies within `region`. */
  def covered[A >: AnnotationTypes: ClassTag](region: RegionType): Iterator[(RegionType, A)]

  /** The annotations of type `A` located before `region`. */
  def preceding[A >: AnnotationTypes: ClassTag](region: RegionType): Iterator[(RegionType, A)]

  /** The annotations of type `A` located after `region`. */
  def following[A >: AnnotationTypes: ClassTag](region: RegionType): Iterator[(RegionType, A)]

  /**
   * Renders the annotations of type `A` for debugging.
   *
   * @return one (region, annotation) pair per line, in [[iterator]] order
   */
  def stringRep[A >: AnnotationTypes: ClassTag]: String = {
    iterator[A].mkString("\n")
  }

}

/*
abstract class SlabAnnotationOps[ContentType, RegionType, AnnotationTypes](
  val region: RegionType,
  val slab: Slab[ContentType, RegionType, AnnotationTypes]) {

  def content: ContentType

  def covered[A >: AnnotationTypes : ClassTag] = this.slab.covered[A](this.annotation)

  def preceding[A >: AnnotationTypes: ClassTag] = this.slab.preceding[A](this.annotation)

  def following[A >: AnnotationTypes: ClassTag] = this.slab.following[A](this.annotation)
}
*/

// =========================
// Annotation infrastructure
// =========================
/**
 * Something that occupies a range of integer offsets. The companion object supplies a
 * `Slab.HasBounds` instance that compares these ranges.
 */
trait AnnotatedSpan {
  /** Offset at which the span starts. */
  def begin: Int
  /** Offset at which the span stops (presumably exclusive -- TODO confirm against callers). */
  def end: Int
}


object AnnotatedSpan {

  /** Orders spans by `begin`, using `end` to break ties. */
  implicit object SpanOrdering extends Ordering[Span] {
    override def compare(x: Span, y: Span): Int =
      if (x.begin != y.begin) {
        // Primary key: where the span starts.
        if (x.begin < y.begin) -1 else 1
      } else if (x.end != y.end) {
        // Secondary key: where the span stops.
        if (x.end < y.end) -1 else 1
      } else {
        0
      }
  }

  /** Orders spans by `end`, using `begin` to break ties. */
  implicit object EndFirstSpanOrdering extends Ordering[Span] {
    override def compare(x: Span, y: Span): Int =
      if (x.end != y.end) {
        // Primary key: where the span stops.
        if (x.end < y.end) -1 else 1
      } else if (x.begin != y.begin) {
        // Secondary key: where the span starts.
        if (x.begin < y.begin) -1 else 1
      } else {
        0
      }
  }

  /*
  implicit class SpanInStringSlab(val span: AnnotatedSpan) extends AnyVal {
    def in[AnnotationTypes <: AnnotatedSpan](slab: StringSlab[AnnotationTypes]) =
      new StringSpanAnnotationOps(this.span, slab)
  }

  class StringSpanAnnotationOps[AnnotationType >: AnnotationTypes <: AnnotatedSpan: ClassTag, AnnotationTypes <: AnnotatedSpan](
    annotation: AnnotationType,
    slab: StringSlab[AnnotationTypes])
    extends SlabAnnotationOps[String, AnnotatedSpan, AnnotationType, AnnotationTypes](annotation, slab) {
    def content = this.slab.content.substring(this.annotation.begin, this.annotation.end)
  }
  */
  
  /** Bounds relations between two [[AnnotatedSpan]]s, based on their `begin` and `end` offsets. */
  implicit object StringAnnotationHasBounds extends Slab.HasBounds[AnnotatedSpan] {

    /** True when `annotation2` starts no earlier and ends no later than `annotation1`. */
    def covers(annotation1: AnnotatedSpan, annotation2: AnnotatedSpan): Boolean =
      if (annotation1.begin > annotation2.begin) false
      else annotation2.end <= annotation1.end

    /** True when `annotation1` ends at or before the offset where `annotation2` begins. */
    def precedes(annotation1: AnnotatedSpan, annotation2: AnnotatedSpan): Boolean =
      annotation1.end <= annotation2.begin

    /** True when `annotation1` begins at or after the offset where `annotation2` ends. */
    def follows(annotation1: AnnotatedSpan, annotation2: AnnotatedSpan): Boolean =
      // "a follows b" is the same test as "b precedes a".
      precedes(annotation2, annotation1)
  }
}

// ===========
// Annotations
// ===========
/** Annotation holding a URL (presumably where the document came from -- TODO confirm). */
case class Source(url: URL)
/** Sentence annotation with an optional identifier. */
case class Sentence(id: Option[String] = None)
/** Segment annotation with an optional identifier. */
case class Segment(id: Option[String] = None)
/** Token annotation carrying a string (presumably the token text -- TODO confirm). */
case class Token(token: String)
/** Part-of-speech annotation: a tag plus an optional identifier. */
case class PartOfSpeech(tag: String, id: Option[String] = None)
/** Entity-mention annotation: an entity type plus an optional identifier. */
case class EntityMention(entityType: String, id: Option[String] = None)


object Slab {

  /**
   * Type class for cutting the part identified by a region out of a piece of content.
   * The slab implementations in this object use it to implement `spanned`.
   *
   * @tparam Region the type that identifies a part of the content
   * @tparam T the content type; the extracted part has the same type
   */
  trait ExtractRegion[Region, T] {
    /** Returns the part of `t` identified by `region`. */
    def apply(region: Region, t: T):T
  }

  /** Extracts the substring of a String delimited by a Span's `begin` and `end`. */
  implicit object SpanStringExtractRegion extends ExtractRegion[Span, String] {
    def apply(region: Span, t: String): String = {
      val from = region.begin
      val until = region.end
      t.substring(from, until)
    }
  }

  /**
   * Creates a slab over a String with no annotations, backed by [[SortedSequenceSlab]].
   *
   * @param content the document text
   */
  def apply(content: String):StringSlab[Any] =
    new SortedSequenceSlab(content, annotations = Map.empty, reverseAnnotations = Map.empty)
  
  /**
   * Creates a slab with no annotations over arbitrary content, backed by [[HorribleInefficientSlab]].
   *
   * @param content the document
   * @param hasBounds bounds relations used to answer covered/preceding/following queries
   * @param extract cuts a region out of the content
   */
  def apply[ContentType, RegionType](content: ContentType)
                                    (implicit hasBounds: HasBounds[RegionType],
                                     extract: ExtractRegion[RegionType, ContentType]): Slab[ContentType, RegionType, Any] = {
    // Start from an empty annotation list and hand the type-class instances over explicitly.
    new HorribleInefficientSlab[ContentType, RegionType, Any](content, Seq.empty)(hasBounds, extract)
  }

  /**
   * This trait has the minimum necessary for the implementation below.
   *
   * An efficient implementation will probably need some other set of operations.
   *
   * [[HorribleInefficientSlab]] uses these three relations to filter its annotations by region.
   * The descriptions below follow the AnnotatedSpan instance defined in this file; other
   * instances are presumably expected to agree with it.
   *
   * @tparam AnnotationType the type whose bounds are compared (the slab's region type in this file)
   */
  trait HasBounds[AnnotationType] {
    /** True when `annotation2` lies within the bounds of `annotation1`. */
    def covers(annotation1: AnnotationType, annotation2: AnnotationType): Boolean
    /** True when `annotation1` ends at or before the point where `annotation2` begins. */
    def precedes(annotation1: AnnotationType, annotation2: AnnotationType): Boolean
    /** True when `annotation1` begins at or after the point where `annotation2` ends. */
    def follows(annotation1: AnnotationType, annotation2: AnnotationType): Boolean
  }

  /**
   * Simple slab that keeps every annotation in one unsorted sequence and answers each query with a
   * linear scan.
   *
   * @param content the document
   * @param _annotations all (region, annotation) pairs, in the order they were added
   * @param hasBounds bounds relations used to filter by region
   * @param extract cuts a region out of the content
   */
  private[slab] class HorribleInefficientSlab[ContentType, RegionType, AnnotationTypes](
    val content: ContentType,
    val _annotations: Seq[(RegionType, Any)] = Seq.empty)(
      implicit hasBounds: HasBounds[RegionType], extract: ExtractRegion[RegionType, ContentType])
    extends Slab[ContentType, RegionType, AnnotationTypes] {

    override def spanned(region: RegionType): ContentType = extract.apply(region, content)

    /** Returns a new slab holding the existing pairs followed by `annotations`. */
    def ++[AnnotationType](annotations: TraversableOnce[(RegionType, AnnotationType)]): Slab[ContentType, RegionType, AnnotationTypes with AnnotationType] = {
      // FIXME: this should keep the annotations sorted by offset
      val combined: Seq[(RegionType, Any)] = this._annotations ++ annotations
      new HorribleInefficientSlab[ContentType, RegionType, AnnotationTypes with AnnotationType](this.content, combined)
    }

    /** The pairs whose annotation is an instance of `A`, in insertion order. */
    def iterator[A >: AnnotationTypes: ClassTag]: Iterator[(RegionType, A)] = {
      // The ClassTag in scope turns the `_: A` pattern into a real runtime type test.
      val ofRequestedType: PartialFunction[(RegionType, Any), (RegionType, A)] = {
        case entry @ (_, _: A) => entry.asInstanceOf[(RegionType, A)]
      }
      this._annotations.iterator.collect(ofRequestedType)
    }

    /** The pairs of type `A` whose region is covered by `region`. */
    def covered[A >: AnnotationTypes: ClassTag](region: RegionType): Iterator[(RegionType, A)] =
      for (entry <- this.iterator[A] if hasBounds.covers(region, entry._1)) yield entry

    /** The pairs of type `A` whose region follows `region`. */
    def following[A >: AnnotationTypes: ClassTag](region: RegionType): Iterator[(RegionType, A)] =
      for (entry <- this.iterator[A] if hasBounds.follows(entry._1, region)) yield entry

    /** The pairs of type `A` whose region precedes `region`, in reverse insertion order. */
    def preceding[A >: AnnotationTypes: ClassTag](region: RegionType): Iterator[(RegionType, A)] = {
      val earlier = this.iterator[A].filter(entry => hasBounds.precedes(entry._1, region)).toSeq
      earlier.reverseIterator
    }

  }

  /**
   * This slab should be more efficient, especially for longer documents. It maintains the annotations in sorted order.
   *
   * Annotations are bucketed by their runtime class. Each bucket is stored twice: once sorted by
   * (begin, end) and once sorted by (end, begin), so that range queries can locate their starting
   * point with a binary search. Queries for a type `A` visit every bucket whose class is assignable to `A`.
   *
   * @param content the document
   * @param annotations per-class annotations; each vector must be sorted by `SpanOrdering` (begin first)
   * @param reverseAnnotations the same annotations; each vector must be sorted by `EndFirstSpanOrdering`
   * @tparam ContentType the type of the document
   * @tparam AnnotationType encoding of the kinds of annotations present
   */
  private[slab] class SortedSequenceSlab[ContentType,
                                         AnnotationType](val content: ContentType,
                                                         val annotations: Map[Class[_], Vector[(Span, Any)]] = Map.empty,
                                                         val reverseAnnotations: Map[Class[_], Vector[(Span, Any)]] = Map.empty)(implicit extract: ExtractRegion[Span, ContentType]) extends Slab[ContentType, Span, AnnotationType] {


    override def spanned(region: Span): ContentType = extract(region, content)

    /**
     * Returns a new slab with `annotations` merged into both indexes. Every bucket that receives new
     * entries is re-sorted; untouched buckets are shared with this slab.
     */
    override def ++[A](annotations: TraversableOnce[(Span, A)]): Slab[ContentType, Span, AnnotationType with A] = {
      var newAnnotations = this.annotations
      val grouped = annotations.toIndexedSeq.groupBy(_._2.getClass)
      for( (clss, group) <- grouped) {
        newAnnotations = newAnnotations + (clss -> (newAnnotations.getOrElse(clss, Vector.empty) ++ group).sortBy(_._1)(SpanOrdering))
      }

      var reverseAnnotations = this.reverseAnnotations
      for( (clss, group) <- grouped) {
        reverseAnnotations = reverseAnnotations + (clss -> (reverseAnnotations.getOrElse(clss, Vector.empty) ++ group).sortBy(_._1)(EndFirstSpanOrdering))
      }
      new SortedSequenceSlab(content, newAnnotations, reverseAnnotations)
    }

    /**
     * Index of the first entry whose key is at least `target`, or `sorted.length` when there is none.
     * Always lands on the first of several entries sharing the same key, which the range queries
     * below rely on. `sorted` must be ordered by non-decreasing `key`.
     */
    private def lowerBound(sorted: Vector[(Span, Any)], target: Int)(key: Span => Int): Int = {
      var lo = 0
      var hi = sorted.length
      while (lo < hi) {
        val mid = (lo + hi) >>> 1
        if (key(sorted(mid)._1) < target) lo = mid + 1 else hi = mid
      }
      lo
    }

    /** Annotations of type `A` that begin at or after `region.end`; within each class, ordered by begin. */
    override def following[A >: AnnotationType: ClassTag](region: Span): Iterator[(Span, A)] = {
      annotations.filterKeys(implicitly[ClassTag[A]].runtimeClass.isAssignableFrom).valuesIterator.flatMap { sorted =>
        val from = lowerBound(sorted, region.end)(_.begin)
        sorted.drop(from).iterator
      }.asInstanceOf[Iterator[(Span, A)]]
    }

    /** Annotations of type `A` that end at or before `region.begin`; within each class, latest end first. */
    override def preceding[A >: AnnotationType : ClassTag](region: Span): Iterator[(Span, A)] = {
      reverseAnnotations.filterKeys(implicitly[ClassTag[A]].runtimeClass.isAssignableFrom).valuesIterator.flatMap { sorted =>
        // First entry that ends strictly after region.begin; everything before it qualifies.
        val until = lowerBound(sorted, region.begin + 1)(_.end)
        sorted.take(until).reverseIterator
      }.asInstanceOf[Iterator[(Span, A)]]
    }

    /**
     * Annotations of type `A` that begin at or after `region.begin` and end at or before `region.end`;
     * within each class, ordered by begin.
     */
    override def covered[A >: AnnotationType : ClassTag](region: Span): Iterator[(Span, A)] = {
      annotations.filterKeys(implicitly[ClassTag[A]].runtimeClass.isAssignableFrom).valuesIterator.flatMap { sorted =>
        val from = lowerBound(sorted, region.begin)(_.begin)
        // The vector is ordered by begin, not by end, so an entry that ends too late must be
        // skipped rather than end the scan: a shorter entry that starts later may still fit.
        // The scan can stop once entries begin past region.end (assumes begin <= end for every span).
        sorted.drop(from).iterator
          .takeWhile(_._1.begin <= region.end)
          .filter(_._1.end <= region.end)
      }.asInstanceOf[Iterator[(Span, A)]]
    }

    /** All annotations of type `A`, grouped by class and ordered by begin within each class. */
    override def iterator[A >: AnnotationType : ClassTag]: Iterator[(Span, A)] = {
      annotations.filterKeys(implicitly[ClassTag[A]].runtimeClass.isAssignableFrom).valuesIterator.flatten.asInstanceOf[Iterator[(Span, A)]]
    }

  }



}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy