All Downloads are FREE. Search and download functionalities are using the official Maven repository.

au.id.tmm.fetch.aws.textract.model.Block.scala Maven / Gradle / Ivy

package au.id.tmm.fetch.aws.textract.model

import au.id.tmm.fetch.aws.textract.model.SelectionElement.Status
import io.circe._
import io.circe.syntax.{EncoderOps, KeyOps}

import scala.collection.immutable.ArraySeq

sealed trait HasBlockId {
  val id: BlockId
}

sealed trait ReadableText {
  def readableText: String

  override def toString: String = readableText
}

object ReadableText {
  def from(others: ArraySeq[ReadableText]): String = others.map(_.readableText).mkString(" ")
}

sealed trait Block extends HasBlockId {
  val pageNumber: PageNumber
  val geometry: Geometry
}

sealed trait AtomicBlock extends Block with ReadableText

object AtomicBlock {
  implicit val codec: Codec[AtomicBlock] = Codec.from(
    decodeA = c =>
      for {
        typeFlag <- c.get[String]("type")
        atomicBloc: AtomicBlock <- typeFlag match {
          case "SelectionElement" => c.as[SelectionElement]
          case "Word"             => c.as[Word]
          case _                  => Left(DecodingFailure("Unrecognised type", c.history))
        }
      } yield atomicBloc,
    encodeA = {
      case s: SelectionElement => s.asJsonObject.+:("type" := "SelectionElement").asJson
      case w: Word             => w.asJsonObject.+:("type" := "Word").asJson
    },
  )
}

final case class Line(
  id: BlockId,
  pageNumber: PageNumber,
  geometry: Geometry,
  text: String,
  children: ArraySeq[AtomicBlock],
) extends Block
    with ReadableText {
  override def readableText: String = text
}

object Line {
  implicit val codec: Codec[Line] = Codec.from(
    Decoder.forProduct5("id", "pageNumber", "geometry", "text", "children")(Line.apply),
    Encoder.forProduct5("id", "pageNumber", "geometry", "text", "children")(l =>
      (l.id, l.pageNumber, l.geometry, l.text, l.children),
    ),
  )
}

final case class Page(
  id: BlockId,
  pageNumber: PageNumber,
  geometry: Geometry,
  children: ArraySeq[Page.Child],
) extends Block {
  def lines: ArraySeq[Line] =
    children.collect { case Page.Child.OfLine(l) =>
      l
    }

  def tables: ArraySeq[Table] =
    children.collect { case Page.Child.OfTable(t) =>
      t
    }

  def keyValueSets: ArraySeq[KeyValueSet] =
    children.collect { case Page.Child.OfKeyValueSet(kvSet) =>
      kvSet
    }
}

object Page {
  sealed trait Child

  object Child {
    final case class OfLine(line: Line)                      extends Child
    final case class OfTable(table: Table)                   extends Child
    final case class OfKeyValueSet(keyValueSet: KeyValueSet) extends Child

    implicit val codec: Codec[Child] = Codec.from[Child](
      decodeA = c =>
        c.keys.map(_.toList) match {
          case Some(onlyKey :: Nil) =>
            onlyKey match {
              case "line"        => c.get[Line]("line").map(OfLine.apply)
              case "table"       => c.get[Table]("table").map(OfTable.apply)
              case "keyValueSet" => c.get[KeyValueSet]("keyValueSet").map(OfKeyValueSet.apply)
            }
          case None | Some(_) => Left(DecodingFailure("Bad keys", c.history))
        },
      encodeA = {
        case OfLine(line)               => Json.obj("line" := line)
        case OfTable(table)             => Json.obj("table" := table)
        case OfKeyValueSet(keyValueSet) => Json.obj("keyValueSet" := keyValueSet)
      },
    )
  }

  implicit val ordering: Ordering[Page] = Ordering.by(_.pageNumber)

  implicit val codec: Codec[Page] = Codec.from(
    Decoder.forProduct4("id", "pageNumber", "geometry", "children")(Page.apply),
    Encoder.forProduct4("id", "pageNumber", "geometry", "children")(p => (p.id, p.pageNumber, p.geometry, p.children)),
  )
}

final case class SelectionElement(
  id: BlockId,
  pageNumber: PageNumber,
  geometry: Geometry,
  status: SelectionElement.Status,
) extends AtomicBlock {
  override def readableText: String =
    status match {
      case Status.Selected    => "☑"
      case Status.NotSelected => "☐"
    }
}

object SelectionElement {
  sealed abstract class Status(val isSelected: Boolean)

  object Status {
    case object Selected    extends Status(isSelected = true)
    case object NotSelected extends Status(isSelected = false)

    implicit val codec: Codec[Status] = Codec.from(
      c =>
        c.get[Boolean]("isSelected").map {
          case true  => Selected
          case false => NotSelected
        },
      {
        case Selected    => Json.obj("isSelected" := true)
        case NotSelected => Json.obj("isSelected" := false)
      },
    )
  }

  implicit val decoder: Decoder[SelectionElement] =
    Decoder.forProduct4("id", "pageNumber", "geometry", "status")(SelectionElement.apply)
  implicit val encoder: Encoder.AsObject[SelectionElement] =
    Encoder.forProduct4("id", "pageNumber", "geometry", "status")(s => (s.id, s.pageNumber, s.geometry, s.status))
}

final case class Table(
  id: BlockId,
  pageNumber: PageNumber,
  geometry: Geometry,
  children: ArraySeq[Table.Cell],
  mergedCells: ArraySeq[Table.MergedCell],
) extends Block {
  def rows: ArraySeq[ArraySeq[Table.Cell]] =
    children
      .sortBy(c => (c.index.row, c.index.column))
      .groupBy(c => c.index.row)
      .to(ArraySeq)
      .sortBy(_._1)
      .map(_._2)
}

object Table {

  final case class Cell(
    id: BlockId,
    pageNumber: PageNumber,
    geometry: Geometry,
    index: Cell.Index,
    children: ArraySeq[AtomicBlock],
  ) extends Block
      with ReadableText {
    override def readableText: String = ReadableText.from(children)
  }

  object Cell {
    final case class Index(
      row: Int,
      column: Int,
    )

    implicit val codec: Codec[Cell] = Codec.from(
      Decoder.instance { c =>
        for {
          id          <- c.get[BlockId]("id")
          pageNumber  <- c.get[PageNumber]("pageNumber")
          geometry    <- c.get[Geometry]("geometry")
          columnIndex <- c.get[Int]("columnIndex")
          columnSpan  <- c.get[Option[Int]]("columnSpan")
          rowIndex    <- c.get[Int]("rowIndex")
          rowSpan     <- c.get[Option[Int]]("rowSpan")
          children    <- c.get[ArraySeq[AtomicBlock]]("children")

          _ <- Either.cond(
            columnSpan.isEmpty || columnSpan.contains(1),
            (),
            DecodingFailure(s"Bad column span $columnSpan", c.history),
          )
          _ <- Either.cond(
            rowSpan.isEmpty || rowSpan.contains(1),
            (),
            DecodingFailure(s"Bad row span $rowSpan", c.history),
          )
        } yield Cell(id, pageNumber, geometry, Index(rowIndex, columnIndex), children)
      },
      Encoder.forProduct6(
        "id",
        "pageNumber",
        "geometry",
        "columnIndex",
        "rowIndex",
        "children",
      )(c => (c.id, c.pageNumber, c.geometry, c.index.column, c.index.row, c.children)),
    )
  }

  final case class MergedCell(
    id: BlockId,
    pageNumber: PageNumber,
    geometry: Geometry,
    index: Cell.Index,
    span: MergedCell.Span,
    children: ArraySeq[Cell],
  ) extends Block
      with ReadableText {
    override def readableText: String = ReadableText.from(children)
  }

  object MergedCell {
    final case class Span(
      row: Int,
      column: Int,
    )

    object Span {
      implicit val codec: Codec[Span] = Codec.from(
        Decoder.forProduct2("row", "column")(Span.apply),
        Encoder.forProduct2("row", "column")(s => (s.row, s.column)),
      )
    }

    private implicit val indexCodec: Codec[Cell.Index] = Codec.from(
      Decoder.forProduct2("row", "column")(Cell.Index.apply),
      Encoder.forProduct2("row", "column")(s => (s.row, s.column)),
    )

    implicit val codec: Codec[MergedCell] = Codec.from(
      Decoder.forProduct6(
        "id",
        "pageNumber",
        "geometry",
        "index",
        "span",
        "children",
      )(MergedCell.apply),
      Encoder.forProduct6(
        "id",
        "pageNumber",
        "geometry",
        "index",
        "span",
        "children",
      )(c => (c.id, c.pageNumber, c.geometry, c.index, c.span, c.children)),
    )
  }

  implicit val codec: Codec[Table] = Codec.from(
    Decoder.forProduct5("id", "pageNumber", "geometry", "children", "mergedCells") {
      (
        id: BlockId,
        pageNumber: PageNumber,
        geometry: Geometry,
        children: ArraySeq[Cell],
        mergedCells: Option[ArraySeq[MergedCell]],
      ) =>
        Table(
          id,
          pageNumber,
          geometry,
          children,
          mergedCells.getOrElse(ArraySeq.empty), // TODO should probably remove the forgiving mergedCells decoding
        )
    },
    Encoder.forProduct5("id", "pageNumber", "geometry", "children", "mergedCells")(t =>
      (t.id, t.pageNumber, t.geometry, t.children, t.mergedCells),
    ),
  )

}

final case class Word(
  id: BlockId,
  pageNumber: PageNumber,
  geometry: Geometry,
  text: String,
  confidence: Confidence,
  textType: Word.TextType,
) extends AtomicBlock {
  override def readableText: String = text
}

object Word {
  sealed trait TextType

  object TextType {
    case object Printed     extends TextType
    case object Handwriting extends TextType

    implicit val codec: Codec[TextType] = Codec.from(
      Decoder[String].emap {
        case "printed"     => Right(Printed)
        case "handwriting" => Right(Handwriting)
        case badTextType   => Left(s"Bad text type $badTextType")
      },
      Encoder[String].contramap {
        case Printed     => "printed"
        case Handwriting => "handwriting"
      },
    )
  }

  implicit val decoder: Decoder[Word] =
    Decoder.forProduct6("id", "pageNumber", "geometry", "text", "confidence", "textType")(Word.apply)
  implicit val encoder: Encoder.AsObject[Word] =
    Encoder.forProduct6("id", "pageNumber", "geometry", "text", "confidence", "textType")(w =>
      (w.id, w.pageNumber, w.geometry, w.text, w.confidence, w.textType),
    )
}

final case class KeyValueSet(
  key: KeyValueSet.Key,
  value: KeyValueSet.Value,
)

object KeyValueSet {
  final case class Key(
    id: BlockId,
    pageNumber: PageNumber,
    geometry: Geometry,
    children: ArraySeq[AtomicBlock],
  ) extends Block
      with ReadableText {
    override def readableText: String = ReadableText.from(children)
  }

  object Key {
    implicit val codec: Codec[Key] = Codec.from(
      Decoder.forProduct4("id", "pageNumber", "geometry", "children")(Key.apply),
      Encoder.forProduct4("id", "pageNumber", "geometry", "children")(k => (k.id, k.pageNumber, k.geometry, k.children)),
    )
  }

  final case class Value(
    id: BlockId,
    pageNumber: PageNumber,
    geometry: Geometry,
    children: ArraySeq[AtomicBlock],
  ) extends Block
      with ReadableText {
    override def readableText: String = ReadableText.from(children)
  }

  object Value {
    implicit val codec: Codec[Value] = Codec.from(
      Decoder.forProduct4("id", "pageNumber", "geometry", "children")(Value.apply),
      Encoder.forProduct4("id", "pageNumber", "geometry", "children")(v => (v.id, v.pageNumber, v.geometry, v.children)),
    )
  }

  implicit val codec: Codec[KeyValueSet] = Codec.from(
    Decoder.forProduct2("key", "value")(KeyValueSet.apply),
    Encoder.forProduct2("key", "value")(kvs => (kvs.key, kvs.value)),
  )
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy