All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.specs2.data.EditDistance.scala Maven / Gradle / Ivy

The newest version!
package org.specs2
package data

/**
 * The EditDistance trait provides methods to compute the
 * distance between 2 sequences
 *
 * http://en.wikipedia.org/wiki/Edit_distance
 *
 */
trait EditDistance {

  /**
   * Edit matrix for 2 given sequences.
   *
   * The matrix is computed lazily, the first time `distance`, `showMatrix` or `operations` is used.
   *
   * @param s1    the sequence to start from
   * @param s2    the sequence to obtain
   * @param costs the cost model used to choose between a deletion, a substitution and an insertion
   */
  class EditMatrix[T](s1: IndexedSeq[T], s2: IndexedSeq[T], costs: EditDistanceCosts[T]) {

    type DistanceMatrix = Array[Array[EditDistanceOp]]

    /* matrix containing, for any prefix of s1 and s2, the last operation selected and its cumulated cost:
       matrix(i)(j).cost = edit distance(s1[0..i], s2[0..j]) */
    private lazy val matrix = createDistanceMatrix(s1, s2)

    /** fill the matrix row by row, so that the 3 neighbours needed by `cost` are always available */
    private def createDistanceMatrix(s1: IndexedSeq[T], s2: IndexedSeq[T]): DistanceMatrix = {
      val matrix = Array.ofDim[EditDistanceOp](s1.length + 1, s2.length + 1)

      // NOTE(review): the borders use a unit cost per element and do not consult costs.insertionDeletionCost
      for (i <- 0 to s1.length;
           j <- 0 to s2.length) {
        if (i == 0)      matrix(i)(j) = InsOp(j)                   // j insertions
        else if (j == 0) matrix(i)(j) = DelOp(i)                   // i suppressions
        else             matrix(i)(j) = cost(s1, s2, i, j, matrix) // otherwise
      }

      matrix
    }

    /**
     * Select the operation for the cell (i, j) from the cells above, on the diagonal and on the left.
     * i and j must both be >= 1 since the elements s1(i - 1) and s2(j - 1) are read.
     *
     * @return the cost for DistanceMatrix(i, j)
     */
    def cost(s1: IndexedSeq[T], s2: IndexedSeq[T], i: Int, j: Int, matrix: DistanceMatrix): EditDistanceOp = {
      val result = costs.lowerCost(s1(i - 1), s2(j - 1),
        matrix(i - 1)(j).cost     + costs.insertionDeletionCost(s1(i - 1)),       // suppression
        matrix(i - 1)(j - 1).cost + costs.substitutionCost(s1(i - 1), s2(j - 1)), // substitution
        matrix(i)(j - 1).cost     + costs.insertionDeletionCost(s2(j - 1)))       // insertion

      result match {
        // a substitution which did not add anything to the diagonal cost is reported as "same"
        case SubstOp(_) if matrix(i - 1)(j - 1).cost == result.cost => SameOp(result.cost)
        case _                                                      => result
      }
    }

    /** @return the edit distance between the 2 sequences */
    def distance: Int = matrix(s1.length)(s2.length).cost

    /** @return the edit matrix of the 2 sequences, one row per line, with cells separated by | */
    def showMatrix: String =
      matrix.map(_.mkString("|")).mkString("\n")

    /**
     * Show the differences between 2 sequences as a list of operations from one to the other.
     *
     * The matrix is walked backwards from the cell (s1.length, s2.length) to the origin, following the
     * operation stored in each cell and prepending the corresponding operation to the result.
     * Every cell, including (1, 1), is interpreted through its stored operation, so that the result
     * stays consistent with the matrix for any cost model.
     */
    def operations: IndexedSeq[EditDistanceOperation[T]] = {
      @scala.annotation.tailrec
      def allOperations(i: Int, j: Int, operations: IndexedSeq[EditDistanceOperation[T]]): IndexedSeq[EditDistanceOperation[T]] =
        // origin reached: everything accumulated so far is the result
        if (i == 0 && j == 0) operations
        // first column: only deletions of the remaining elements of s1 are possible
        else if (j < 1) s1.slice(0, i).map(Del.apply) ++ operations
        // first row: only insertions of the remaining elements of s2 are possible
        else if (i < 1) s2.slice(0, j).map(Add.apply) ++ operations
        else matrix(i)(j) match {
          case InsOp(_)   => allOperations(i,     j - 1, Add(s2(j - 1)) +: operations             )
          case DelOp(_)   => allOperations(i - 1, j,     Del(s1(i - 1)) +: operations             )
          case SubstOp(_) => allOperations(i - 1, j - 1, Subst(s1(i - 1), s2(j - 1)) +: operations)
          case _          => allOperations(i - 1, j - 1, Same(s1(i - 1)) +: operations            )
        }

      allOperations(s1.length, s2.length, IndexedSeq())
    }
  }

  /**
   * @return the operations to go from s1 to s2, using the costs returned by EditDistanceCosts.levenhsteinCosts
   */
  def levenhsteinDistance[T](s1: IndexedSeq[T], s2: IndexedSeq[T]): IndexedSeq[EditDistanceOperation[T]] = {
    val matrix = new EditMatrix[T](s1, s2, EditDistanceCosts.levenhsteinCosts[T])
    matrix.operations
  }

  /** One step of the transformation from one sequence to the other */
  trait EditDistanceOperation[T] {
    /** the element this operation applies to */
    def t: T
    /** the operation to use when going in the opposite direction */
    def inverse: EditDistanceOperation[T]
  }
  /** t is added */
  case class Add[T](t: T) extends EditDistanceOperation[T] {
    def inverse: Del[T] = Del(t)
  }
  /** t is removed */
  case class Del[T](t: T) extends EditDistanceOperation[T] {
    def inverse: Add[T] = Add(t)
  }
  /** t is kept */
  case class Same[T](t: T) extends EditDistanceOperation[T] {
    def inverse: Same[T] = Same(t)
  }
  /** t is replaced with t2 */
  case class Subst[T](t: T, t2: T) extends EditDistanceOperation[T] {
    def inverse: Subst[T] = Subst(t2, t)
  }

}

/** Object giving access to the members of the EditDistance trait without having to mix it in */
object EditDistance extends EditDistance

/** An operation stored in a cell of the edit matrix, with its associated cost */
trait EditDistanceOp {
  def cost: Int
}

/** Insertion, displayed with a leading + */
case class InsOp(cost: Int) extends EditDistanceOp {
  override def toString: String = s"+ $cost"
}

/** Deletion, displayed with a leading - */
case class DelOp(cost: Int) extends EditDistanceOp {
  override def toString: String = s"- $cost"
}

/** Substitution, displayed with a leading ~ */
case class SubstOp(cost: Int) extends EditDistanceOp {
  override def toString: String = s"~ $cost"
}

/** No change, displayed with a leading o */
case class SameOp(cost: Int) extends EditDistanceOp {
  override def toString: String = s"o $cost"
}

/**
 * Cost model for the computation of an edit distance between sequences of T
 */
trait EditDistanceCosts[T] {
  /** @return the cost of a substitution of a with b */
  def substitutionCost(a: T, b: T): Int

  /** @return the cost of an insertion or deletion of c */
  def insertionDeletionCost(c: T): Int

  /**
   * @param a     element of the first sequence
   * @param b     element of the second sequence
   * @param del   candidate cost when finishing with a deletion
   * @param subst candidate cost when finishing with a substitution
   * @param ins   candidate cost when finishing with an insertion
   * @return the lower cost and associated operation of a deletion, substitution or insertion
   */
  def lowerCost(a: T, b: T, del: Int, subst: Int, ins: Int): EditDistanceOp
}

/** Factory for cost models */
object EditDistanceCosts {
  /** @return a new anonymous instance of LevenhsteinCosts for elements of type T */
  def levenhsteinCosts[T]: EditDistanceCosts[T] = new LevenhsteinCosts[T] {}
}

trait LevenhsteinCosts[T] extends EditDistanceCosts[T] {
  /** @return 0 for equal elements, 1 otherwise */
  def substitutionCost(a: T, b: T): Int = if (a != b) 1 else 0

  /** @return 1, whatever the element is */
  def insertionDeletionCost(c: T): Int = 1

  /**
   * @return the lower cost and associated operation of a deletion, substitution or insertion.
   *
   *         The cheapest of the insertion and the deletion is considered first (the deletion when
   *         both have the same cost). It is selected over the substitution when it is strictly
   *         cheaper, or when it has the same cost and a == b. Preferring the insertion/suppression
   *         in that last case is meant to group all the differences together:
   *         diff("abcd", "acbd") ==> ("a[bc]d", "a[cb]d"). the distance is 2, otherwise
   *         diff("abcd", "acbd") ==> ("a[b]c[]d", "a[c]b[]d")
   */
  def lowerCost(a: T, b: T, del: Int, subst: Int, ins: Int): EditDistanceOp = {
    val insertionIsCheaper = ins < del
    val indelCost = if (insertionIsCheaper) ins else del
    val indelOp: EditDistanceOp = if (insertionIsCheaper) InsOp(ins) else DelOp(del)

    // on a tie with the substitution, the elements decide
    val indelWins = indelCost < subst || (indelCost == subst && a == b)

    if (indelWins) indelOp
    else SubstOp(subst)
  }

}

/** LevenhsteinCosts for sequences of characters */
object StringLevenhsteinCosts extends LevenhsteinCosts[Char]







© 2015 - 2024 Weber Informatics LLC | Privacy Policy