All Downloads are FREE. Search and download functionalities are using the official Maven repository.

commonMain.net.ormr.fuzzywuzzy.diffutils.DiffUtils.kt Maven / Gradle / Ivy

package net.ormr.fuzzywuzzy.diffutils

import net.ormr.fuzzywuzzy.diffutils.EditType.*

// This is a port of all the functions needed from python-levenshtein C implementation.
// The code was ported line by line, but unfortunately it was mostly undocumented,
// so it is mostly non-readable (e.g. var names)

@Suppress("NAME_SHADOWING")
internal data object DiffUtils {
    fun getEditOps(s1: String, s2: String): Array = getEditOps(s1.length, s1, s2.length, s2)

    private fun getEditOps(len1: Int, s1: String, len2: Int, s2: String): Array {
        var len1 = len1
        var len2 = len2
        val len2o: Int

        val c1 = s1.toCharArray()
        val c2 = s2.toCharArray()

        var p1 = 0
        var p2 = 0

        var len1o = 0

        while (len1 > 0 && len2 > 0 && c1[p1] == c2[p2]) {
            len1--
            len2--

            p1++
            p2++

            len1o++
        }

        len2o = len1o

        /* strip common suffix */
        while (len1 > 0 && len2 > 0 && c1[p1 + len1 - 1] == c2[p2 + len2 - 1]) {
            len1--
            len2--
        }

        len1++
        len2++

        val matrix = IntArray(len2 * len1)

        var i = 0
        while (i < len2) {
            matrix[i] = i
            i++
        }
        i = 1
        while (i < len1) {
            matrix[len2 * i] = i
            i++
        }

        i = 1
        while (i < len1) {
            var ptrPrev = (i - 1) * len2
            var ptrC = i * len2
            val ptrEnd = ptrC + len2 - 1

            val char1 = c1[p1 + i - 1]
            var ptrChar2 = p2

            var x = i

            ptrC++

            while (ptrC <= ptrEnd) {
                var c3 = matrix[ptrPrev++] + (if (char1 != c2[ptrChar2++]) 1 else 0)
                x++

                if (x > c3) {
                    x = c3
                }

                c3 = matrix[ptrPrev] + 1

                if (x > c3) {
                    x = c3
                }

                matrix[ptrC++] = x
            }

            i++
        }

        return editOpsFromCostMatrix(len1, c1, p1, len1o, len2, c2, p2, len2o, matrix)
    }

    private fun editOpsFromCostMatrix(
        len1: Int, c1: CharArray, p1: Int, o1: Int,
        len2: Int, c2: CharArray, p2: Int, o2: Int,
        matrix: IntArray,
    ): Array {
        var pos = matrix[len1 * len2 - 1]
        val ops: Array = arrayOfNulls(pos)
        var dir = 0
        var i = len1 - 1
        var j = len2 - 1
        var ptr = len1 * len2 - 1

        while (i > 0 || j > 0) {
            if (i != 0 && j != 0 && matrix[ptr] == matrix[ptr - len2 - 1] && c1[p1 + i - 1] == c2[p2 + j - 1]) {
                i--
                j--
                ptr -= len2 + 1
                dir = 0

                continue
            }

            if (dir < 0 && j != 0 && matrix[ptr] == matrix[ptr - 1] + 1) {
                val eop = EditOp(
                    type = INSERT,
                    sourcePos = i + o1,
                    destinationPos = --j + o2,
                )

                pos--
                ops[pos] = eop

                ptr--

                continue
            }

            if (dir > 0 && i != 0 && matrix[ptr] == matrix[ptr - len2] + 1) {
                val eop = EditOp(
                    type = DELETE,
                    sourcePos = --i + o1,
                    destinationPos = j + o2,
                )

                pos--
                ops[pos] = eop

                ptr -= len2

                continue
            }

            if (i != 0 && j != 0 && matrix[ptr] == matrix[ptr - len2 - 1] + 1) {
                pos--

                val eop = EditOp(
                    type = REPLACE,
                    sourcePos = --i + o1,
                    destinationPos = --j + o2,
                )
                ops[pos] = eop



                ptr -= len2 + 1
                dir = 0
                continue
            }

            if (dir == 0 && j != 0 && matrix[ptr] == matrix[ptr - 1] + 1) {
                pos--
                val eop = EditOp(
                    type = INSERT,
                    sourcePos = i + o1,
                    destinationPos = --j + o2,
                )
                ops[pos] = eop
                ptr--
                dir = -1

                continue
            }

            if (dir == 0 && i != 0 && matrix[ptr] == matrix[ptr - len2] + 1) {
                pos--
                val eop = EditOp(
                    type = DELETE,
                    sourcePos = --i + o1,
                    destinationPos = j + o2,
                )
                ops[pos] = eop

                ptr -= len2
                dir = 1
                continue
            }

            check(false)
        }

        return ops.requireNoNulls()
    }

    fun getMatchingBlocks(s1: String, s2: String): Array =
        getMatchingBlocks(s1.length, s2.length, getEditOps(s1, s2))

    fun getMatchingBlocks(len1: Int, len2: Int, ops: Array): Array {
        val n = ops.size
        var o = 0

        var noOfMB = 0

        var i = n
        while (i-- != 0) {
            if (ops[o].type == KEEP) {
                noOfMB++

                while (i != 0 && ops[o].type == KEEP) {
                    i--
                    o++
                }

                if (i == 0) break
            }

            o++
        }

        val matchingBlocks = arrayOfNulls(noOfMB + 1)
        var mb = 0
        o = 0
        matchingBlocks[mb] = MatchingBlock()

        i = n
        while (i != 0) {
            if (ops[o].type == KEEP) {
                matchingBlocks[mb]!!.spos = ops[o].sbeg
                matchingBlocks[mb]!!.dpos = ops[o].dbeg

                while (i != 0 && ops[o].type == KEEP) {
                    i--
                    o++
                }

                if (i == 0) {
                    matchingBlocks[mb]!!.length = len1 - matchingBlocks[mb]!!.spos
                    mb++
                    break
                }

                matchingBlocks[mb]!!.length = ops[o].sbeg - matchingBlocks[mb]!!.spos
                mb++
                matchingBlocks[mb] = MatchingBlock()
            }


            i--
            o++
        }

        check(mb == noOfMB)

        val finalBlock = MatchingBlock()
        finalBlock.spos = len1
        finalBlock.dpos = len2
        finalBlock.length = 0

        matchingBlocks[mb] = finalBlock

        return matchingBlocks.requireNoNulls()
    }

    private fun getMatchingBlocks(len1: Int, len2: Int, ops: Array): Array {
        val n = ops.size
        var spos: Int
        var dpos: Int

        var numberOfMatchingBlocks = 0

        var o = 0

        dpos = 0
        spos = dpos

        var type: EditType?

        var i = n
        while (i != 0) {
            while (ops[o].type == KEEP && --i != 0) {
                o++
            }

            if (i == 0) break

            if (spos < ops[o].sourcePos || dpos < ops[o].destinationPos) {
                numberOfMatchingBlocks++
                spos = ops[o].sourcePos
                dpos = ops[o].destinationPos
            }

            type = ops[o].type

            when (type) {
                REPLACE -> do {
                    spos++
                    dpos++
                    i--
                    o++
                } while (i != 0 && ops[o].type == type && spos == ops[o].sourcePos && dpos == ops[o].destinationPos)
                DELETE -> do {
                    spos++
                    i--
                    o++
                } while (i != 0 && ops[o].type == type && spos == ops[o].sourcePos && dpos == ops[o].destinationPos)
                INSERT -> do {
                    dpos++
                    i--
                    o++
                } while (i != 0 && ops[o].type == type && spos == ops[o].sourcePos && dpos == ops[o].destinationPos)
                else -> {}
            }
        }

        if (spos < len1 || dpos < len2) {
            numberOfMatchingBlocks++
        }

        val matchingBlocks = arrayOfNulls(numberOfMatchingBlocks + 1)

        o = 0
        dpos = 0
        spos = dpos
        var mbIndex = 0

        i = n
        while (i != 0) {
            while (ops[o].type == KEEP && --i != 0) o++

            if (i == 0) break

            if (spos < ops[o].sourcePos || dpos < ops[o].destinationPos) {
                val mb = MatchingBlock()

                mb.spos = spos
                mb.dpos = dpos
                mb.length = ops[o].sourcePos - spos
                spos = ops[o].sourcePos
                dpos = ops[o].destinationPos

                matchingBlocks[mbIndex++] = mb
            }

            type = ops[o].type

            when (type) {
                REPLACE -> do {
                    spos++
                    dpos++
                    i--
                    o++
                } while (i != 0 && ops[o].type == type && spos == ops[o].sourcePos && dpos == ops[o].destinationPos)
                DELETE -> do {
                    spos++
                    i--
                    o++
                } while (i != 0 && ops[o].type == type && spos == ops[o].sourcePos && dpos == ops[o].destinationPos)
                INSERT -> do {
                    dpos++
                    i--
                    o++
                } while (i != 0 && ops[o].type == type && spos == ops[o].sourcePos && dpos == ops[o].destinationPos)
                else -> {}
            }
        }

        if (spos < len1 || dpos < len2) {
            check(len1 - spos == len2 - dpos)

            val mb = MatchingBlock()
            mb.spos = spos
            mb.dpos = dpos
            mb.length = len1 - spos

            matchingBlocks[mbIndex++] = mb
        }

        check(numberOfMatchingBlocks == mbIndex)

        val finalBlock = MatchingBlock()
        finalBlock.spos = len1
        finalBlock.dpos = len2
        finalBlock.length = 0

        matchingBlocks[mbIndex] = finalBlock

        return matchingBlocks.requireNoNulls()
    }

    private fun editOpsToOpCodes(ops: Array, len1: Int, len2: Int): Array {
        val n = ops.size
        var spos: Int
        var dpos: Int
        var o = 0
        var type: EditType?

        var noOfBlocks = 0
        dpos = 0
        spos = dpos

        var i = n
        while (i != 0) {
            while (ops[o].type === KEEP && --i != 0) {
                o++
            }

            if (i == 0) break

            if (spos < ops[o].sourcePos || dpos < ops[o].destinationPos) {
                noOfBlocks++
                spos = ops[o].sourcePos
                dpos = ops[o].destinationPos
            }

            // TODO: Is this right?
            noOfBlocks++
            type = ops[o].type

            when (type) {
                REPLACE -> do {
                    spos++
                    dpos++
                    i--
                    o++
                } while (i != 0 && ops[o].type == type && spos == ops[o].sourcePos && dpos == ops[o].destinationPos)
                DELETE -> do {
                    spos++
                    i--
                    o++
                } while (i != 0 && ops[o].type == type && spos == ops[o].sourcePos && dpos == ops[o].destinationPos)
                INSERT -> do {
                    dpos++
                    i--
                    o++
                } while (i != 0 && ops[o].type == type && spos == ops[o].sourcePos && dpos == ops[o].destinationPos)
                else -> {}
            }
        }

        if (spos < len1 || dpos < len2) noOfBlocks++

        val opCodes = arrayOfNulls(noOfBlocks)

        o = 0
        dpos = 0
        spos = dpos
        var oIndex = 0

        i = n
        while (i != 0) {
            while (ops[o].type === KEEP && --i != 0) o++

            if (i == 0) break

            val oc = OpCode()
            opCodes[oIndex] = oc
            oc.sbeg = spos
            oc.dbeg = dpos

            if (spos < ops[o].sourcePos || dpos < ops[o].destinationPos) {
                oc.type = KEEP
                oc.send = ops[o].sourcePos
                spos = oc.send
                oc.dend = ops[o].destinationPos
                dpos = oc.dend

                oIndex++
                val oc2 = OpCode()
                opCodes[oIndex] = oc2
                oc2.sbeg = spos
                oc2.dbeg = dpos
            }

            type = ops[o].type

            when (type) {
                REPLACE -> do {
                    spos++
                    dpos++
                    i--
                    o++
                } while (i != 0 && ops[o].type == type && spos == ops[o].sourcePos && dpos == ops[o].destinationPos)
                DELETE -> do {
                    spos++
                    i--
                    o++
                } while (i != 0 && ops[o].type == type && spos == ops[o].sourcePos && dpos == ops[o].destinationPos)
                INSERT -> do {
                    dpos++
                    i--
                    o++
                } while (i != 0 && ops[o].type == type && spos == ops[o].sourcePos && dpos == ops[o].destinationPos)
                else -> {}
            }
            opCodes[oIndex]!!.type = type
            opCodes[oIndex]!!.send = spos
            opCodes[oIndex]!!.dend = dpos
            oIndex++
        }

        if (spos < len1 || dpos < len2) {
            check(len1 - spos == len2 - dpos)
            if (opCodes[oIndex] == null) opCodes[oIndex] = OpCode()
            opCodes[oIndex]!!.type = KEEP
            opCodes[oIndex]!!.sbeg = spos
            opCodes[oIndex]!!.dbeg = dpos
            opCodes[oIndex]!!.send = len1
            opCodes[oIndex]!!.dend = len2

            oIndex++
        }

        check(oIndex == noOfBlocks)

        return opCodes.requireNoNulls()
    }

    fun levEditDistance(s1: String, s2: String, xcost: Int): Int {
        var i: Int
        val half: Int

        var c1 = s1.toCharArray()
        var c2 = s2.toCharArray()

        var str1 = 0
        var str2 = 0

        var len1 = s1.length
        var len2 = s2.length

        /* strip common prefix */
        while (len1 > 0 && len2 > 0 && c1[str1] == c2[str2]) {
            len1--
            len2--
            str1++
            str2++
        }

        /* strip common suffix */
        while (len1 > 0 && len2 > 0 && c1[str1 + len1 - 1] == c2[str2 + len2 - 1]) {
            len1--
            len2--
        }

        /* catch trivial cases */
        if (len1 == 0) return len2
        if (len2 == 0) return len1

        /* make the inner cycle (i.e. str2) the longer one */
        if (len1 > len2) {
            val nx = len1
            val temp = str1

            len1 = len2
            len2 = nx

            str1 = str2
            str2 = temp

            val t = c2
            c2 = c1
            c1 = t
        }

        /* check len1 == 1 separately */
        if (len1 == 1) {
            return if (xcost != 0) {
                len2 + 1 - 2 * memchr(c2, str2, c1[str1], len2)
            } else {
                len2 - memchr(c2, str2, c1[str1], len2)
            }
        }

        len1++
        len2++
        half = len1 shr 1

        val row = IntArray(len2)
        var end = len2 - 1

        i = 0
        while (i < len2 - (if (xcost != 0) 0 else half)) {
            row[i] = i
            i++
        }


        /* go through the matrix and compute the costs.  yes, this is an extremely
         * obfuscated version, but also extremely memory-conservative and relatively
         * fast.  */
        if (xcost != 0) {
            i = 1
            while (i < len1) {
                var p = 1

                val ch1 = c1[str1 + i - 1]
                var c2p = str2

                var D = i
                var x = i

                while (p <= end) {
                    if (ch1 == c2[c2p++]) {
                        x = --D
                    } else {
                        x++
                    }
                    D = row[p]
                    D++

                    if (x > D) x = D
                    row[p++] = x
                }

                i++
            }
        } else {
            /* in this case we don't have to scan two corner triangles (of size len1/2)
                    * in the matrix because no best path can go throught them. note this
                    * breaks when len1 == len2 == 2 so the memchr() special case above is
                    * necessary */

            row[0] = len1 - half - 1
            i = 1
            while (i < len1) {
                var p: Int

                val ch1 = c1[str1 + i - 1]
                var c2p: Int

                var D: Int
                var x: Int

                /* skip the upper triangle */
                if (i >= len1 - half) {
                    val offset = i - (len1 - half)

                    c2p = str2 + offset
                    p = offset
                    val c3 = row[p++] + (if ((ch1 != c2[c2p++])) 1 else 0)
                    x = row[p]
                    x++
                    D = x
                    if (x > c3) {
                        x = c3
                    }
                    row[p++] = x
                } else {
                    p = 1
                    c2p = str2
                    x = i
                    D = x
                }
                /* skip the lower triangle */
                if (i <= half + 1) end = len2 + i - half - 2
                /* main */
                while (p <= end) {
                    val c3 = --D + (if ((ch1 != c2[c2p++])) 1 else 0)
                    x++
                    if (x > c3) {
                        x = c3
                    }
                    D = row[p]
                    D++
                    if (x > D) x = D
                    row[p++] = x
                }

                /* lower triangle sentinel */
                if (i <= half) {
                    val c3 = --D + (if ((ch1 != c2[c2p])) 1 else 0)
                    x++
                    if (x > c3) {
                        x = c3
                    }
                    row[p] = x
                }
                i++
            }
        }

        i = row[end]

        return i
    }

    private fun memchr(haystack: CharArray, offset: Int, needle: Char, num: Int): Int {
        var num = num
        if (num != 0) {
            var p = 0

            do {
                if (haystack[offset + p] == needle) return 1

                p++
            } while (--num != 0)
        }
        return 0
    }


    fun getRatio(s1: String, s2: String): Double {
        val len1 = s1.length
        val len2 = s2.length
        val lensum = len1 + len2

        val editDistance = levEditDistance(s1, s2, 1)

        return (lensum - editDistance) / lensum.toDouble()
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy