// All downloads are free. The search and download functionality uses the official Maven repository.
//
// org.suecarter.tablediff.TableDiff.scala Maven / Gradle / Ivy
//
// The newest version!
package org.suecarter.tablediff

import scala.annotation.tailrec
import ReportContent._
import scala.util.Properties

/**
  * A value that differs from one side of a diff to the other. None means the value is missing on that side
  * @tparam T type of the value held on each side
  * @param left Optional value on left of diff
  * @param right Optional value on right of diff
  */
case class EitherSide[T](left: Option[T], right: Option[T])

/**
  * Enumeration of the types of Diffs, only left, only right or both.
  * Sealed, so a match over the three case objects below is checked for exhaustiveness
  */
protected[tablediff] sealed trait DiffLocationType
// the value has an index in the left sequence only
protected[tablediff] case object OnlyLeft extends DiffLocationType
// the value has an index in the right sequence only
protected[tablediff] case object OnlyRight extends DiffLocationType
// the value has an index in both sequences
protected[tablediff] case object InBoth extends DiffLocationType

/**
  * One entry of a diff together with its position, if any, in the left and right sequences
  * @param value actual value
  * @param iLeft index into the left sequence, None when the value is absent from the left
  * @param iRight index into the right sequence, None when the value is absent from the right
  */
protected[tablediff] case class DiffLocation[T](value: T, iLeft: Option[Int], iRight: Option[Int]) {
  require(iLeft.isDefined || iRight.isDefined, "At least one of iLeft or iRight must have a value")

  /** true when the value is missing from one of the two sides */
  def hasANone: Boolean = locationType != InBoth

  /** Which side(s) the value was found on */
  def locationType: DiffLocationType =
    (iLeft, iRight) match {
      case (Some(_), Some(_)) => InBoth
      case (_, Some(_))       => OnlyRight
      case _                  => OnlyLeft // the require above rules out (None, None)
    }
}

/**
  * Functions to produce and handle diff reports
  */
object TableDiff {

  /**
    * Represents a value that could be a diff.
    * A Left is a diff and carries the (optional) value seen on each side.
    * A Right is not a diff; it holds the value itself, or None if there is no value
    */
  type ValueDiff[T] = Either[EitherSide[T], Option[T]]

  // Pivot a section: each pass peels the first element off every non-empty row and appends those
  // elements as one new row of the result. Rows that have run out of elements are dropped from the pass
  protected[tablediff] def pivotHeaders[T](reportSection: ReportSection[T]): ReportSection[T] = {
    @tailrec
    def peel(pivoted: ReportSection[T], remaining: ReportSection[T]): ReportSection[T] = {
      val populated = remaining.filter(_.nonEmpty)
      if (populated.isEmpty)
        pivoted
      else
        peel(pivoted :+ populated.map(_.head), populated.map(_.tail))
    }
    peel(emptyRow, reportSection)
  }

  // Turn the diff of a whole row into a row of per-cell diffs.
  // A row that differs is compared cell by cell (the shorter side is padded with None);
  // a row that does not differ has every cell wrapped as a non-diff
  private def splitSeqDiff[T](seqDiff: DiffLocation[ValueDiff[ReportRow[T]]]): ReportRow[ValueDiff[T]] =
    seqDiff.value match {
      case Left(sides) =>
        val leftCells = sides.left.getOrElse(emptyRow).map(Option(_))
        val rightCells = sides.right.getOrElse(emptyRow).map(Option(_))
        leftCells.zipAll(rightCells, None, None).map {
          case (leftCell, rightCell) =>
            if (leftCell == rightCell) Right(leftCell) else Left(EitherSide(leftCell, rightCell))
        }
      case Right(Some(row)) => row.map(cell => Right(Some(cell)))
      case Right(None)      => emptyRow
    }

  // i cells that are neither a diff nor a value (Right(None)); no cells at all when i is zero or negative
  private def emptyDiffCells(i: Int) = for (_ <- 0 until i) yield Right(None)

  // Put the row-column headers (top left corner of the table) next to the column headers, one flattened
  // row per header level. Blank corner rows are added on top when there are fewer row-column header rows
  // than report.columnDepth, and each part is padded with empty cells up to report.rowWidth and
  // report.columnCount respectively
  protected[tablediff] def flattenColumnHeaders[C](report: ReportContent[_, ValueDiff[C], _]) = {
    val missingCornerRows = (report.rowColumnHeaders.size until report.columnDepth).map(_ => Seq())
    val cornerRows = missingCornerRows ++ report.rowColumnHeaders
    cornerRows.zipAll(report.columnHeaders, Seq(), Seq()).map {
      case (corner, header) =>
        val cornerPadding = emptyDiffCells(report.rowWidth - corner.size)
        val headerPadding = emptyDiffCells(report.columnCount - header.size)
        corner ++ cornerPadding ++ header ++ headerPadding
    }
  }

  // Glue every row of row headers onto its row of main data. The header part is padded with empty cells
  // up to report.rowWidth and the data part up to report.mainDataColumnCount
  protected[tablediff] def flattenTableRows[T, R <: T, M <: T](
      report: ReportContent[ValueDiff[R], _, ValueDiff[M]]
  ): ReportSection[ValueDiff[T]] =
    report.rowHeaders.zipAll(report.mainData, Seq(), Seq()).map {
      case (header, data) =>
        val headerPadding = emptyDiffCells(report.rowWidth - header.size)
        val dataPadding = emptyDiffCells(report.mainDataColumnCount - data.size)
        val flattened = header ++ headerPadding ++ data ++ dataPadding
        // the compiler can't work it out, but the cast is safe because R <: T and M <: T
        flattened.asInstanceOf[ReportRow[ValueDiff[T]]]
    }

  /**
    * Reduce a diff report to the rows and columns in which something differs.
    * A row (or column) is kept when its header cells or its main data cells contain at least one diff.
    * When the report fills blank headers forward, the filtering is done on the filled headers and the
    * duplicates are removed again from the result
    * @param diffReport a report containing diffs
    * @tparam R Row header type
    * @tparam C Column header type
    * @tparam M Main data type
    * @return the input diff report filtered to just rows and columns containing diffs
    */
  def onlyTheDiffs[R, C, M](diffReport: ReportContent[ValueDiff[R], ValueDiff[C], ValueDiff[M]]) = {
    val fillForward = diffReport.fillForwardBlankHeaders
    val report = if (fillForward) fillForwardReportHeaders(diffReport) else diffReport
    // rows: keep those with a diff (a Left) in the header or in the data
    val (filteredRows, mainData) =
      report.rowHeaders
        .zipAll(report.mainData, Seq(Right(None)), Seq(Right(None)))
        .filter { case (header, data) => header.exists(_.isLeft) || data.exists(_.isLeft) }
        .unzip
    // columns: pivot so that each column reads as a row, then apply the same test
    val (cols, main) =
      pivotHeaders(report.columnHeaders)
        .zipAll(pivotHeaders(mainData), Seq(Right(None)), Seq(Right(None)))
        .filter { case (header, data) => header.exists(_.isLeft) || data.exists(_.isLeft) }
        .unzip
    // pivot the surviving columns back to their original orientation
    val filteredColumns = pivotHeaders(cols)
    val filteredMain = pivotHeaders(main)
    val filteredReport = ReportContent(
      filteredRows,
      filteredColumns,
      filteredMain,
      // the corner headers stay if any row survived or if they hold a diff themselves
      if (filteredRows.nonEmpty || report.rowColumnHeaders.exists(_.exists(_.isLeft)))
        report.rowColumnHeaders
      else emptySection
    )
    if (fillForward) removeDuplicateReportHeaders(filteredReport) else filteredReport
  }

  // Plain equality of the two optional values. Kept as a separate member (rather than an inline default
  // argument) so it can be used in the Java compatibility layer
  protected def defaultMainValueComparison[T]: (Option[T], Option[T]) => Boolean = _ == _

  /**
    * Produce a report that is the diff of the left and right report.
    * Row headers, column headers and row-column headers of the two reports are each lined up with
    * zipLongestCommonSubsequence; the main data cells are then compared at the lined-up positions
    * @param leftReport report on the left side of the diff
    * @param rightReport report on the right side of the diff
    * @param mainValueComparison override the comparison function to see if two elements are the same
    *                            default is ==
    * @param chunkSize passed on to zipLongestCommonSubsequence for every header section,
    *                  defaults to diffChunkSize
    * @tparam R Row header type
    * @tparam C Column header type
    * @tparam M Main data type
    * @return report containing a representation of any diffs
    */
  def produceReportDiff[R, C, M](
      leftReport: ReportContent[R, C, M],
      rightReport: ReportContent[R, C, M],
      mainValueComparison: (Option[M], Option[M]) => Boolean = defaultMainValueComparison,
      chunkSize: Int = diffChunkSize
  ): ReportContent[ValueDiff[R], ValueDiff[C], ValueDiff[M]] = {
    // get the value from the main data corresponding to this row and column indexes
    // None when either index is missing or lies beyond the end of the main data
    def mainValue(rowIndex: Option[Int], colIndex: Option[Int], report: ReportContent[R, C, M]): Option[M] =
      rowIndex.flatMap(r =>
        colIndex.flatMap(c =>
          if (r < report.mainData.size && c < report.mainData(r).size) {
            val a = report.mainData(r)(c)
            Some(a)
          } else None
        )
      )
    // a header found on one side only is a diff holding the value on that side and None on the other;
    // a header found on both sides is not a diff
    def headerValue[T](h: DiffLocation[T]): ValueDiff[T] =
      if (h.hasANone)
        Left(EitherSide(h.iLeft.map(x => h.value), h.iRight.map(a => h.value)))
      else
        Right(Some(h.value))
    // look up the main data cell for this row and column on each side and compare the two
    // with mainValueComparison
    def mainValueDiff(row: DiffLocation[_], col: DiffLocation[_]): ValueDiff[M] = {
      val left = mainValue(row.iLeft, col.iLeft, leftReport)
      val right = mainValue(row.iRight, col.iRight, rightReport)
      if (mainValueComparison(left, right)) Right(left) else Left(EitherSide(left, right))
    }
    // this logic tries to collapse headers diff back to a single ValueDiff if the mainData matches
    // (i.e. only the headerValue is a diff). Will only search "one" row down for match
    def collapseHeaders[T](
        headerRows: ReportRow[DiffLocation[T]],
        leftMainData: ReportSection[M],
        rightMainData: ReportSection[M]
    ): ReportRow[DiffLocation[ValueDiff[T]]] = {
      // walks headerRows front to back, appending one converted header to the accumulator per step
      @tailrec
      def inner[S](
          accumulator: ReportRow[DiffLocation[ValueDiff[S]]],
          headerRows: ReportRow[DiffLocation[S]],
          leftMainData: ReportSection[M],
          rightMainData: ReportSection[M]
      ): ReportRow[DiffLocation[ValueDiff[S]]] =
        if (headerRows.isEmpty)
          accumulator
        else {
          val (head, tailSection): (DiffLocation[ValueDiff[S]], ReportRow[DiffLocation[S]]) = headerRows match {
            case Seq(dlLeft @ DiffLocation(leftValue, Some(leftI), None), tail @ _*) => {
              // find row with just a left value
              val matchingRightValue: Option[(DiffLocation[TableDiff.ValueDiff[S]], ReportRow[DiffLocation[S]])] =
                tail.collectFirst {
                  // try to match to row with just a right value
                  // only the first right-only entry in the tail is considered; if it does not qualify
                  // the left header is emitted on its own
                  case dlRight @ DiffLocation(rightValue, None, Some(rightI)) => {
                    val leftI = dlLeft.iLeft.getOrElse(0)
                    val rightI = dlRight.iRight.getOrElse(0)
                    // last clause: the right index must directly follow the highest right index
                    // accumulated so far (or be 0 when none has been accumulated)
                    if (((leftMainData.isEmpty && rightMainData.isEmpty) // either main data is empty
                        || ((leftMainData.size >= leftI + 1) && // or indexes point to same elements, 1 row away
                        (rightMainData.size >= rightI + 1) &&
                        leftMainData(leftI) == rightMainData(rightI)))
                        && rightI == accumulator.flatMap(_.iRight).reduceOption(_ max _).getOrElse(-1) + 1)
                      // merge the pair into one diff entry and take the right-only entry out of the tail
                      (
                        DiffLocation(
                          Left(EitherSide(Some(dlLeft.value), Some(dlRight.value))),
                          Some(leftI),
                          Some(rightI)
                        ),
                        tail.filterNot(_ == dlRight)
                      )
                    else
                      (DiffLocation(headerValue(dlLeft), Some(leftI), None), tail)
                  }
                }
              // no right-only entry anywhere in the tail: keep the left-only header as it is
              matchingRightValue.getOrElse((DiffLocation(headerValue(dlLeft), Some(leftI), None), tail))
            }
            // anything that is not left-only is converted with headerValue and keeps its indexes
            case Seq(dl @ DiffLocation(_, leftI, rightI), tail @ _*) =>
              (DiffLocation(headerValue(dl), leftI, rightI), tail)
          }
          inner(accumulator :+ head, tailSection, leftMainData, rightMainData)
        }
      inner(emptyRow, headerRows, leftMainData, rightMainData)
    }
    // used to fill in headers such that they line up with the mainData section
    // produces value-less headers (Right(None)) whose indexes start at the given start index on each side;
    // a non-positive count contributes nothing for that side
    def extraHeaders[S](
        howManyLeft: Int,
        startIndexLeft: Int,
        howManyRight: Int,
        startIndexRight: Int
    ): ReportRow[DiffLocation[ValueDiff[S]]] =
      (0 until howManyLeft).map(Option(_)).zipAll((0 until howManyRight).map(Option(_)), None, None).map {
        case (l, r) =>
          DiffLocation[ValueDiff[S]](Right(None), l.map(_ + startIndexLeft), r.map(_ + startIndexRight))
      }

    // get LCSs for both ColumnHeader sections
    // sections are pivoted first, and filled forward when the owning report asks for it
    def zipLCSColumnSection(
        leftSection: ReportSection[C],
        rightSection: ReportSection[C]
    ): ReportRow[DiffLocation[ReportRow[C]]] =
      zipLongestCommonSubsequence(
        if (leftReport.fillForwardBlankHeaders)
          fillSectionHeaders(pivotHeaders(leftSection))
        else
          pivotHeaders(leftSection),
        if (rightReport.fillForwardBlankHeaders)
          fillSectionHeaders(pivotHeaders(rightSection))
        else
          pivotHeaders(rightSection),
        chunkSize = chunkSize
      )
    val resultCols = zipLCSColumnSection(leftReport.columnHeaders, rightReport.columnHeaders)
    val resultRowColHeaders = zipLCSColumnSection(leftReport.rowColumnHeaders, rightReport.rowColumnHeaders)

    // get LCSs for RowHeader section
    // a report whose rowSectionWidth is not positive gets one empty header row per main data row
    def flattenRowHeaderSection(report: ReportContent[R, _, _]) =
      if (report.rowSectionWidth > 0) report.rowHeaders else (0 until report.mainDataRows).map(x => Seq[R]())
    val leftRowHeaders = flattenRowHeaderSection(leftReport)
    val rightRowHeaders = flattenRowHeaderSection(rightReport)
    val resultRows = zipLongestCommonSubsequence(
      if (leftReport.fillForwardBlankHeaders) fillSectionHeaders(leftRowHeaders) else leftRowHeaders,
      if (rightReport.fillForwardBlankHeaders) fillSectionHeaders(rightRowHeaders) else rightRowHeaders,
      chunkSize = chunkSize
    )

    // process row and column headers in context of MainData section
    // extraHeaders covers main data rows / columns that lie beyond the last header
    val rows: ReportRow[DiffLocation[ValueDiff[ReportRow[R]]]] =
      collapseHeaders(resultRows, leftReport.mainData, rightReport.mainData) ++
        extraHeaders(
          leftReport.mainDataRows - leftRowHeaders.size,
          leftRowHeaders.size,
          rightReport.mainDataRows - rightRowHeaders.size,
          rightRowHeaders.size
        )
    val cols: ReportRow[DiffLocation[ValueDiff[ReportRow[C]]]] =
      collapseHeaders(resultCols, pivotHeaders(leftReport.mainData), pivotHeaders(rightReport.mainData)) ++
        extraHeaders(
          leftReport.mainDataColumnCount - leftReport.columnCount,
          leftReport.columnCount,
          rightReport.mainDataColumnCount - rightReport.columnCount,
          rightReport.columnCount
        )
    val rowsContent: ReportSection[ValueDiff[R]] = rows.map(splitSeqDiff)
    // row-column headers have no main data of their own, so they are collapsed against empty main data
    val rowColumnHeadersContent: ReportSection[ValueDiff[C]] =
      collapseHeaders(resultRowColHeaders, Seq(), Seq()).map(splitSeqDiff)
    val colsContent: ReportSection[ValueDiff[C]] = cols.map(splitSeqDiff)
    // the diff report only fills forward when both inputs do
    val diffReportFillForward = leftReport.fillForwardBlankHeaders && rightReport.fillForwardBlankHeaders
    ReportContent(
      if (diffReportFillForward) removeHeaderDuplicates(rowsContent) else rowsContent,
      pivotHeaders(if (diffReportFillForward) removeHeaderDuplicates(colsContent) else colsContent),
      // one main data cell per (row, column) pair of the lined-up headers
      rows.map(row => cols.map(col => mainValueDiff(row, col))),
      rowColumnHeaders = pivotHeaders(
        if (diffReportFillForward) removeHeaderDuplicates(rowColumnHeadersContent) else rowColumnHeadersContent
      ),
      fillForwardBlankHeaders = diffReportFillForward
    )
  }

  // Copy of the report with removeHeaderDuplicates applied to its row headers and to its column headers.
  // The column headers are pivoted before and pivoted back after the call
  protected[tablediff] def removeDuplicateReportHeaders[R, C, M](
      report: ReportContent[ValueDiff[R], ValueDiff[C], ValueDiff[M]]
  ) = {
    val dedupedRowHeaders = removeHeaderDuplicates(report.rowHeaders)
    val dedupedColumnHeaders = pivotHeaders(removeHeaderDuplicates(pivotHeaders(report.columnHeaders)))
    report.copy(rowHeaders = dedupedRowHeaders, columnHeaders = dedupedColumnHeaders)
  }

  // Copy of the report with fillSectionHeaders applied to its row headers and to its column headers.
  // The column headers are pivoted before and pivoted back after the call
  protected[tablediff] def fillForwardReportHeaders[R, C, M](
      report: ReportContent[ValueDiff[R], ValueDiff[C], ValueDiff[M]]
  ) = {
    val filledRowHeaders = fillSectionHeaders(report.rowHeaders)
    val filledColumnHeaders = pivotHeaders(fillSectionHeaders(pivotHeaders(report.columnHeaders)))
    report.copy(rowHeaders = filledRowHeaders, columnHeaders = filledColumnHeaders)
  }

  /**
    * Line up two sequences and report, for every element, where it sits on the left and / or the right.
    * Equal heads are matched directly. As soon as the heads differ, up to chunkSize elements are taken
    * from each side and aligned with a longest-common-subsequence table, so the table never grows
    * beyond (chunkSize + 1) x (chunkSize + 1) entries
    * @param fullLeftSeq left sequence
    * @param fullRightSeq right sequence
    * @param chunkSize maximum number of elements taken from each side per chunk, must be positive
    * @tparam T element type, compared with ==
    * @return the elements of both sequences with their optional left and right indexes
    * @throws IllegalArgumentException if chunkSize is zero or negative
    */
  protected[tablediff] def zipLongestCommonSubsequence[T](
      fullLeftSeq: ReportRow[T],
      fullRightSeq: ReportRow[T],
      chunkSize: Int = diffChunkSize
  ): ReportRow[DiffLocation[T]] = {
    // splitAt with a non-positive size returns an empty chunk and the untouched remainder, so checkHeads
    // would be called again with exactly the same arguments and never terminate. Fail fast instead
    require(chunkSize > 0, s"chunkSize must be a positive number of elements, but was $chunkSize")

    def zipLCSChunk(leftOffset: Int, rightOffset: Int, leftSeq: ReportRow[T], rightSeq: ReportRow[T]) = {
      // seqLengths(i)(j) holds the length of the longest common subsequence of the first i elements
      // of leftSeq and the first j elements of rightSeq
      val seqLengths = Array.ofDim[Int](leftSeq.size + 1, rightSeq.size + 1)
      val indexedRight = rightSeq.zipWithIndex // the same for every left element, so built once
      for ((left, i) <- leftSeq.zipWithIndex)
        for ((right, j) <- indexedRight)
          seqLengths(i + 1)(j + 1) =
            if (left == right)
              seqLengths(i)(j) + 1
            else
              math.max(seqLengths(i + 1)(j), seqLengths(i)(j + 1))

      // walk back through the table from the bottom right corner, prepending one DiffLocation per step
      @tailrec
      def diffWalk(accumulate: ReportRow[DiffLocation[T]], row: Int, col: Int): ReportRow[DiffLocation[T]] =
        if (row == 0 && col == 0)
          accumulate
        else {
          val (nextDiff, nextRow, nextCol) =
            if (row > 0 && col > 0 && leftSeq(row - 1) == rightSeq(col - 1)) {
              // same element on both sides
              (
                DiffLocation(leftSeq(row - 1), Some(row - 1 + leftOffset), Some(col - 1 + rightOffset)),
                row - 1,
                col - 1
              )
            } else if (col > 0 && (row == 0 || seqLengths(row)(col - 1) >= seqLengths(row - 1)(col))) {
              // element only on the right
              (DiffLocation(rightSeq(col - 1), None, Some(col - 1 + rightOffset)), row, col - 1)
            } else {
              // element only on the left
              (DiffLocation(leftSeq(row - 1), Some(row - 1 + leftOffset), None), row - 1, col)
            }
          diffWalk(nextDiff +: accumulate, nextRow, nextCol)
        }
      diffWalk(emptyRow, leftSeq.size, rightSeq.size)
    }
    // Check at least 'chunkSize' from both Sequences for matches
    @tailrec
    def checkHeads(
        leftOffSet: Int,
        rightOffSet: Int,
        acc: ReportRow[DiffLocation[T]],
        nextLeftSeq: ReportRow[T],
        nextRightSeq: ReportRow[T]
    ): ReportRow[DiffLocation[T]] =
      (nextLeftSeq.headOption, nextRightSeq.headOption) match {
        case (None, None) =>
          // both sequences are used up
          acc
        case (Some(headLeft), Some(headRight)) if headLeft == headRight =>
          // equal heads are matched without building a table
          checkHeads(
            leftOffSet + 1,
            rightOffSet + 1,
            acc :+ DiffLocation(headLeft, Some(leftOffSet), Some(rightOffSet)),
            nextLeftSeq.tail,
            nextRightSeq.tail
          )
        case _ =>
          val (leftChunk, leftRemain) = nextLeftSeq.splitAt(chunkSize)
          val (rightChunk, rightRemain) = nextRightSeq.splitAt(chunkSize)
          val chunkDiffs = zipLCSChunk(leftOffSet, rightOffSet, leftChunk, rightChunk)
          val anyMatches = chunkDiffs.exists(_.locationType == InBoth)
          // See if the edges of 2 chunks can simply be stitched back together: a run of one-sided entries
          // at the start of 'later' is merged with an equally long run of opposite-sided entries holding
          // the same values at the end of 'earlier'
          def stitchChunks(
              earlier: ReportRow[DiffLocation[T]],
              later: ReportRow[DiffLocation[T]]
          ): ReportRow[DiffLocation[T]] = {
            val stitched = for {
              laterDirection <- later.headOption.filter(_.hasANone).map(_.locationType)
              stitchableLater = later.takeWhile(d => laterDirection == d.locationType)
              stitchSize <- if (stitchableLater.nonEmpty) Some(stitchableLater.size) else None
              stitchableEarlier = earlier
                .takeRight(stitchSize)
                .takeWhile(
                  _.locationType ==
                    (if (laterDirection == OnlyLeft) OnlyRight else OnlyLeft)
                )
              pairs = stitchableEarlier.zip(stitchableLater)
              if stitchableEarlier.size == stitchSize && pairs.forall { case (l, r) => l.value == r.value }
            } yield {
              earlier.dropRight(stitchSize) ++
                pairs.map {
                  case (l, r) => DiffLocation(l.value, l.iLeft orElse r.iLeft, l.iRight orElse r.iRight)
                } ++ later.drop(stitchSize)
            }
            stitched.getOrElse(earlier ++ later)
          }
          // If we haven't found any matches and the remains are empty on one side, then keep the small side to match on
          // if not, move onto the next chunks
          if (!anyMatches && leftRemain.isEmpty && rightRemain.nonEmpty)
            checkHeads(
              leftOffSet,
              rightOffSet + rightChunk.size,
              stitchChunks(acc, chunkDiffs.filter(_.locationType == OnlyRight)),
              leftChunk,
              rightRemain
            )
          else if (!anyMatches && leftRemain.nonEmpty && rightRemain.isEmpty)
            checkHeads(
              leftOffSet + leftChunk.size,
              rightOffSet,
              stitchChunks(acc, chunkDiffs.filter(_.locationType == OnlyLeft)),
              leftRemain,
              rightChunk
            )
          else
            checkHeads(
              leftOffSet + leftChunk.size,
              rightOffSet + rightChunk.size,
              stitchChunks(acc, chunkDiffs),
              leftRemain,
              rightRemain
            )
      }
    checkHeads(0, 0, emptyRow, fullLeftSeq, fullRightSeq)
  }

  // chunk size used when no usable value is supplied
  private val defaultChunkSize = 1000
  // lazy, so the environment is only read the first time a chunk size is actually needed
  protected[tablediff] lazy val diffChunkSize = readChunkEnvVar()
  // name of the environment variable that overrides the default chunk size
  val chunkEnvVarName = "TABLEDIFFCHUNKSIZE"

  /**
    * Work out the diff chunk size.
    * The value is taken from envOverride if given, otherwise from the environment variable named by
    * chunkEnvVarName, otherwise it is defaultChunkSize. A value that is not an integer, or that is zero
    * or negative, is reported on System.err and replaced by defaultChunkSize; a non-positive chunk size
    * must never get through because chunking could not make progress with it
    * @param envOverride value to use instead of reading the environment
    * @return a chunk size that is always positive
    */
  protected[tablediff] def readChunkEnvVar(envOverride: Option[String] = None): Int = {
    val chunkEnvVar = envOverride.getOrElse(Properties.envOrElse(chunkEnvVarName, defaultChunkSize.toString))
    // report why the supplied value is unusable and fall back to the default
    def useDefault(reason: String): Int = {
      System.err.println(s"Unable to set diff chunk size $chunkEnvVarName=$chunkEnvVar $reason")
      System.err.println(s"Using defaultChunkSize $defaultChunkSize")
      defaultChunkSize
    }
    try {
      val parsed = chunkEnvVar.toInt
      if (parsed > 0) parsed else useDefault("chunk size must be a positive integer")
    } catch {
      case x: NumberFormatException => useDefault(x.getMessage)
    }
  }

  // Wraps a function and remembers, in a mutable map, the result computed for each argument so that
  // f is only evaluated the first time an argument is seen
  private case class Memoise[A, B](f: A => B) extends (A => B) {
    private val cache = scala.collection.mutable.Map.empty[A, B]

    def apply(x: A): B =
      cache.get(x) match {
        case Some(known) => known
        case None =>
          val computed = f(x)
          cache.update(x, computed)
          computed
      }
  }

  // This is a pretty algorithm, but I couldn't work out a way to make it tail recursive; it is kept here for
  // validating the main algorithm in the unit tests
  protected[tablediff] def zipLongestCommonSubsequencePretty[T](
      leftSeq: ReportRow[T],
      rightSeq: ReportRow[T]
  ): Seq[DiffLocation[T]] = {
    // memoised on the pair of remaining (value, index) suffixes
    lazy val zlcs: Memoise[(Seq[(T, Int)], Seq[(T, Int)]), Seq[DiffLocation[T]]] = Memoise {
      // right side used up: everything still on the left is left-only
      case (remainingLeft, Seq()) =>
        remainingLeft.map {
          case (value, index) => DiffLocation(value, Some(index), None)
        }
      // left side used up: everything still on the right is right-only
      case (Seq(), remainingRight) =>
        remainingRight.map {
          case (value, index) => DiffLocation(value, None, Some(index))
        }
      case (remainingLeft, remainingRight) =>
        val (leftValue, leftIndex) = remainingLeft.head
        val (rightValue, rightIndex) = remainingRight.head
        if (leftValue == rightValue)
          DiffLocation(leftValue, Some(leftIndex), Some(rightIndex)) +: zlcs(remainingLeft.tail, remainingRight.tail)
        else {
          val withoutRightHead = zlcs(remainingLeft, remainingRight.tail)
          val withoutLeftHead = zlcs(remainingLeft.tail, remainingRight)
          // version with the least Nones is the best
          if (withoutRightHead.count(_.hasANone) >= withoutLeftHead.count(_.hasANone))
            DiffLocation(leftValue, Some(leftIndex), None) +: withoutLeftHead
          else
            DiffLocation(rightValue, None, Some(rightIndex)) +: withoutRightHead
        }
    }
    zlcs(leftSeq.zipWithIndex, rightSeq.zipWithIndex)
  }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy