All downloads are free. The search and download functionality uses the official Maven repository.

me.xdrop.diffutils.DiffUtils Maven / Gradle / Ivy

Go to download

Fuzzy string searching implementation of the well-known fuzzywuzzy algorithm in Java

There is a newer version: 1.4.0
Show newest version
package me.xdrop.diffutils;

import me.xdrop.diffutils.structs.EditOp;
import me.xdrop.diffutils.structs.EditType;
import me.xdrop.diffutils.structs.MatchingBlock;
import me.xdrop.diffutils.structs.OpCode;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * This is a port of all the functions needed from the python-levenshtein C implementation.
 * The code was ported line by line, but the original was mostly undocumented,
 * so much of it is hard to read (e.g. variable names).
 */
public class DiffUtils {

    /**
     * Returns the edit operations obtained by comparing {@code s1} with {@code s2}.
     * Both strings are compared over their full length.
     *
     * @param s1 the source string
     * @param s2 the destination string
     * @return the edit operations computed for the two strings
     */
    public static EditOp[] getEditOps(String s1, String s2) {
        final int sourceLength = s1.length();
        final int destinationLength = s2.length();

        return getEditOps(sourceLength, s1, destinationLength, s2);
    }


    /**
     * Fills the cost matrix for the parts of the two strings that remain once their
     * common prefix and common suffix have been skipped, then passes the matrix to
     * {@code editOpsFromCostMatrix} to extract the edit operations.
     *
     * @param len1 number of characters of {@code s1} to compare
     * @param s1   the source string
     * @param len2 number of characters of {@code s2} to compare
     * @param s2   the destination string
     * @return the edit operations returned by {@code editOpsFromCostMatrix}
     */
    private static EditOp[] getEditOps(int len1, String s1, int len2, String s2) {

        final char[] source = s1.toCharArray();
        final char[] target = s2.toCharArray();

        /* strip common prefix; both strings advance by the same amount */
        int prefix = 0;
        while (len1 > 0 && len2 > 0 && source[prefix] == target[prefix]) {
            prefix++;
            len1--;
            len2--;
        }

        /* strip common suffix */
        while (len1 > 0 && len2 > 0
                && source[prefix + len1 - 1] == target[prefix + len2 - 1]) {
            len1--;
            len2--;
        }

        /* one extra row and column hold the costs against the empty string */
        final int rows = len1 + 1;
        final int cols = len2 + 1;
        final int[] costs = new int[cols * rows];

        /* first row and first column */
        for (int col = 0; col < cols; col++) {
            costs[col] = col;
        }
        for (int row = 1; row < rows; row++) {
            costs[cols * row] = row;
        }

        /* every other cell is the cheapest of its diagonal, left and upper neighbour */
        for (int row = 1; row < rows; row++) {

            final char sourceChar = source[prefix + row - 1];
            final int above = (row - 1) * cols;
            final int current = row * cols;

            for (int col = 1; col < cols; col++) {

                final int mismatch = (sourceChar == target[prefix + col - 1]) ? 0 : 1;

                final int viaDiagonal = costs[above + col - 1] + mismatch;
                final int viaLeft = costs[current + col - 1] + 1;
                final int viaAbove = costs[above + col] + 1;

                costs[current + col] = Math.min(viaDiagonal, Math.min(viaLeft, viaAbove));
            }
        }

        return editOpsFromCostMatrix(rows, source, prefix, prefix,
                cols, target, prefix, prefix, costs);
    }


    /**
     * Walks a filled cost matrix backwards, from its last cell to its first, and records
     * an {@link EditOp} for every insert, delete and replace step taken on the way.
     * Steps where the characters match are followed but not recorded.
     *
     * <p>The result array is filled from its end towards its start, so the returned
     * operations are ordered from the beginning of the strings to their end.
     *
     * @param len1   number of rows of {@code matrix}
     * @param c1     characters of the source string
     * @param p1     index in {@code c1} of the first character covered by the matrix
     * @param o1     offset added to every reported source position
     * @param len2   number of columns of {@code matrix} (the row stride)
     * @param c2     characters of the destination string
     * @param p2     index in {@code c2} of the first character covered by the matrix
     * @param o2     offset added to every reported destination position
     * @param matrix row-major cost matrix of {@code len1 * len2} cells
     * @return the recorded edit operations; its length is the value of the last matrix cell
     * @throws IllegalStateException if no step matches the current cell, i.e. the matrix
     *                               is inconsistent
     */
    private static EditOp[] editOpsFromCostMatrix(int len1, char[] c1, int p1, int o1,
                                                  int len2, char[] c2, int p2, int o2,
                                                  int[] matrix) {

        // The last cell holds the total cost, which is also the number of operations.
        int pos = matrix[len1 * len2 - 1];
        EditOp[] ops = new EditOp[pos];

        int i = len1 - 1;
        int j = len2 - 1;
        int ptr = len1 * len2 - 1;

        // Direction of the previous step: -1 after an insert (moved left),
        // 1 after a delete (moved up), 0 after a diagonal step or at the start.
        int dir = 0;

        while (i > 0 || j > 0) {

            /* prefer continuing in the same direction: another insert */
            if (dir < 0 && j != 0 && matrix[ptr] == matrix[ptr - 1] + 1) {

                EditOp eop = new EditOp();

                pos--;
                ops[pos] = eop;
                eop.type = EditType.INSERT;
                eop.spos = i + o1;
                eop.dpos = --j + o2;
                ptr--;

                continue;
            }

            /* prefer continuing in the same direction: another delete */
            if (dir > 0 && i != 0 && matrix[ptr] == matrix[ptr - len2] + 1) {

                EditOp eop = new EditOp();

                pos--;
                ops[pos] = eop;
                eop.type = EditType.DELETE;
                eop.spos = --i + o1;
                eop.dpos = j + o2;
                ptr -= len2;

                continue;
            }

            /* matching characters: move diagonally without recording anything */
            if (i != 0 && j != 0 && matrix[ptr] == matrix[ptr - len2 - 1]
                    && c1[p1 + i - 1] == c2[p2 + j - 1]) {

                i--;
                j--;
                ptr -= len2 + 1;
                dir = 0;

                continue;
            }

            /* replace: diagonal step that costs one */
            if (i != 0 && j != 0 && matrix[ptr] == matrix[ptr - len2 - 1] + 1) {

                pos--;

                EditOp eop = new EditOp();
                ops[pos] = eop;

                eop.type = EditType.REPLACE;
                eop.spos = --i + o1;
                eop.dpos = --j + o2;

                ptr -= len2 + 1;
                dir = 0;

                continue;
            }

            /* start a run of inserts; only allowed right after a diagonal step */
            if (dir == 0 && j != 0 && matrix[ptr] == matrix[ptr - 1] + 1) {

                pos--;
                EditOp eop = new EditOp();
                ops[pos] = eop;
                eop.type = EditType.INSERT;
                eop.spos = i + o1;
                eop.dpos = --j + o2;
                ptr--;
                dir = -1;

                continue;
            }

            /* start a run of deletes; only allowed right after a diagonal step */
            if (dir == 0 && i != 0 && matrix[ptr] == matrix[ptr - len2] + 1) {

                pos--;
                EditOp eop = new EditOp();
                ops[pos] = eop;

                eop.type = EditType.DELETE;
                eop.spos = --i + o1;
                eop.dpos = j + o2;
                ptr -= len2;
                dir = 1;

                continue;
            }

            // No step matched, so neither i nor j changed. Fail loudly: a bare assert is
            // skipped when assertions are disabled and this loop would then never end.
            throw new IllegalStateException(
                    "Inconsistent cost matrix at row " + i + ", column " + j);
        }

        return ops;
    }

    /**
     * Returns the matching blocks of two strings, derived from the edit operations
     * computed by {@code getEditOps(s1, s2)}.
     *
     * @param s1 the source string
     * @param s2 the destination string
     * @return the matching blocks of the two strings
     */
    public static MatchingBlock[] getMatchingBlocks(String s1, String s2) {

        final int sourceLength = s1.length();
        final int destinationLength = s2.length();
        final EditOp[] editOps = getEditOps(s1, s2);

        return getMatchingBlocks(sourceLength, destinationLength, editOps);
    }

    /**
     * Converts a list of opcodes into matching blocks. Every run of consecutive
     * {@code KEEP} opcodes becomes one block; a block of length 0 positioned at
     * ({@code len1}, {@code len2}) is appended as the last element.
     *
     * @param len1 length of the source string
     * @param len2 length of the destination string
     * @param ops  the opcodes to convert
     * @return one block per run of {@code KEEP} opcodes, followed by the terminating block
     */
    public static MatchingBlock[] getMatchingBlocks(int len1, int len2, OpCode[] ops) {

        final int total = ops.length;

        /* first pass: count the runs of KEEP opcodes */
        int keepRuns = 0;

        for (int idx = 0; idx < total; idx++) {

            if (ops[idx].type != EditType.KEEP) {
                continue;
            }

            keepRuns++;

            /* adjacent KEEP opcodes belong to the same run */
            while (idx < total && ops[idx].type == EditType.KEEP) {
                idx++;
            }
            /* idx is now at the opcode that ended the run; the loop increment skips it */
        }

        /* second pass: fill in the blocks */
        final MatchingBlock[] blocks = new MatchingBlock[keepRuns + 1];
        int filled = 0;

        for (int idx = 0; idx < total; idx++) {

            if (ops[idx].type != EditType.KEEP) {
                continue;
            }

            final MatchingBlock block = new MatchingBlock();
            block.spos = ops[idx].sbeg;
            block.dpos = ops[idx].dbeg;

            while (idx < total && ops[idx].type == EditType.KEEP) {
                idx++;
            }

            if (idx == total) {
                /* the run reaches the end of the opcodes: the block extends to len1 */
                block.length = len1 - block.spos;
            } else {
                /* the block ends where the next non-KEEP opcode begins */
                block.length = ops[idx].sbeg - block.spos;
            }

            blocks[filled++] = block;
        }

        assert filled == keepRuns;

        final MatchingBlock terminator = new MatchingBlock();
        terminator.spos = len1;
        terminator.dpos = len2;
        terminator.length = 0;

        blocks[filled] = terminator;

        return blocks;
    }


    /**
     * Converts a list of edit operations into matching blocks. A block is produced for
     * every stretch that lies between the position reached so far and the position of
     * the next edit operation, plus one for whatever is left after the last operation.
     * A block of length 0 positioned at ({@code len1}, {@code len2}) is appended as the
     * last element.
     *
     * @param len1 length of the source string
     * @param len2 length of the destination string
     * @param ops  the edit operations to convert
     * @return the matching blocks followed by the terminating block
     */
    private static MatchingBlock[] getMatchingBlocks(int len1, int len2, EditOp[] ops) {

        final int total = ops.length;

        int spos = 0;
        int dpos = 0;
        int blockCount = 0;
        int idx = 0;

        /* first pass: count the blocks */
        while (idx < total) {

            /* simply pretend there are no keep operations */
            if (ops[idx].type == EditType.KEEP) {
                idx++;
                continue;
            }

            /* anything between the current position and this operation is a block */
            if (spos < ops[idx].spos || dpos < ops[idx].dpos) {
                blockCount++;
                spos = ops[idx].spos;
                dpos = ops[idx].dpos;
            }

            final EditType runType = ops[idx].type;

            /* consume the run of same-typed operations at consecutive positions */
            switch (runType) {
                case REPLACE:
                    do {
                        spos++;
                        dpos++;
                        idx++;
                    } while (idx < total && ops[idx].type == runType
                            && ops[idx].spos == spos && ops[idx].dpos == dpos);
                    break;

                case DELETE:
                    do {
                        spos++;
                        idx++;
                    } while (idx < total && ops[idx].type == runType
                            && ops[idx].spos == spos && ops[idx].dpos == dpos);
                    break;

                case INSERT:
                    do {
                        dpos++;
                        idx++;
                    } while (idx < total && ops[idx].type == runType
                            && ops[idx].spos == spos && ops[idx].dpos == dpos);
                    break;

                default:
                    break;
            }
        }

        if (spos < len1 || dpos < len2) {
            blockCount++;
        }

        /* second pass: same walk, this time recording the blocks */
        final MatchingBlock[] blocks = new MatchingBlock[blockCount + 1];
        int filled = 0;

        spos = 0;
        dpos = 0;
        idx = 0;

        while (idx < total) {

            if (ops[idx].type == EditType.KEEP) {
                idx++;
                continue;
            }

            if (spos < ops[idx].spos || dpos < ops[idx].dpos) {
                final MatchingBlock gap = new MatchingBlock();
                gap.spos = spos;
                gap.dpos = dpos;
                gap.length = ops[idx].spos - spos;
                blocks[filled++] = gap;

                spos = ops[idx].spos;
                dpos = ops[idx].dpos;
            }

            final EditType runType = ops[idx].type;

            switch (runType) {
                case REPLACE:
                    do {
                        spos++;
                        dpos++;
                        idx++;
                    } while (idx < total && ops[idx].type == runType
                            && ops[idx].spos == spos && ops[idx].dpos == dpos);
                    break;

                case DELETE:
                    do {
                        spos++;
                        idx++;
                    } while (idx < total && ops[idx].type == runType
                            && ops[idx].spos == spos && ops[idx].dpos == dpos);
                    break;

                case INSERT:
                    do {
                        dpos++;
                        idx++;
                    } while (idx < total && ops[idx].type == runType
                            && ops[idx].spos == spos && ops[idx].dpos == dpos);
                    break;

                default:
                    break;
            }
        }

        /* whatever follows the last operation is the final real block */
        if (spos < len1 || dpos < len2) {
            assert len1 - spos == len2 - dpos;

            final MatchingBlock tail = new MatchingBlock();
            tail.spos = spos;
            tail.dpos = dpos;
            tail.length = len1 - spos;

            blocks[filled++] = tail;
        }

        assert blockCount == filled;

        final MatchingBlock terminator = new MatchingBlock();
        terminator.spos = len1;
        terminator.dpos = len2;
        terminator.length = 0;

        blocks[filled] = terminator;

        return blocks;
    }


    /**
     * Converts a list of edit operations into opcodes. Every run of same-typed
     * operations at consecutive positions becomes one opcode, and every stretch between
     * such runs (as well as the stretch after the last run) becomes a {@code KEEP} opcode.
     *
     * @param ops  the edit operations to convert
     * @param len1 length of the source string
     * @param len2 length of the destination string
     * @return the opcodes, each with its begin/end positions in source and destination
     */
    private static OpCode[] editOpsToOpCodes(EditOp[] ops, int len1, int len2) {

        final int total = ops.length;

        int spos = 0;
        int dpos = 0;
        int blockCount = 0;
        int idx = 0;

        /* first pass: count the opcodes */
        while (idx < total) {

            /* simply pretend there are no keep operations */
            if (ops[idx].type == EditType.KEEP) {
                idx++;
                continue;
            }

            /* a KEEP opcode for the stretch before this operation */
            if (spos < ops[idx].spos || dpos < ops[idx].dpos) {
                blockCount++;
                spos = ops[idx].spos;
                dpos = ops[idx].dpos;
            }

            /* one opcode for the run consumed below, matching what the second pass emits */
            blockCount++;

            final EditType runType = ops[idx].type;

            switch (runType) {
                case REPLACE:
                    do {
                        spos++;
                        dpos++;
                        idx++;
                    } while (idx < total && ops[idx].type == runType
                            && ops[idx].spos == spos && ops[idx].dpos == dpos);
                    break;

                case DELETE:
                    do {
                        spos++;
                        idx++;
                    } while (idx < total && ops[idx].type == runType
                            && ops[idx].spos == spos && ops[idx].dpos == dpos);
                    break;

                case INSERT:
                    do {
                        dpos++;
                        idx++;
                    } while (idx < total && ops[idx].type == runType
                            && ops[idx].spos == spos && ops[idx].dpos == dpos);
                    break;

                default:
                    break;
            }
        }

        if (spos < len1 || dpos < len2) {
            blockCount++;
        }

        /* second pass: same walk, this time emitting the opcodes */
        final OpCode[] opCodes = new OpCode[blockCount];
        int emitted = 0;

        spos = 0;
        dpos = 0;
        idx = 0;

        while (idx < total) {

            if (ops[idx].type == EditType.KEEP) {
                idx++;
                continue;
            }

            if (spos < ops[idx].spos || dpos < ops[idx].dpos) {
                final OpCode keep = new OpCode();
                keep.type = EditType.KEEP;
                keep.sbeg = spos;
                keep.dbeg = dpos;
                keep.send = ops[idx].spos;
                keep.dend = ops[idx].dpos;
                opCodes[emitted++] = keep;

                spos = keep.send;
                dpos = keep.dend;
            }

            final OpCode edit = new OpCode();
            opCodes[emitted] = edit;
            edit.sbeg = spos;
            edit.dbeg = dpos;

            final EditType runType = ops[idx].type;

            switch (runType) {
                case REPLACE:
                    do {
                        spos++;
                        dpos++;
                        idx++;
                    } while (idx < total && ops[idx].type == runType
                            && ops[idx].spos == spos && ops[idx].dpos == dpos);
                    break;

                case DELETE:
                    do {
                        spos++;
                        idx++;
                    } while (idx < total && ops[idx].type == runType
                            && ops[idx].spos == spos && ops[idx].dpos == dpos);
                    break;

                case INSERT:
                    do {
                        dpos++;
                        idx++;
                    } while (idx < total && ops[idx].type == runType
                            && ops[idx].spos == spos && ops[idx].dpos == dpos);
                    break;

                default:
                    break;
            }

            edit.type = runType;
            edit.send = spos;
            edit.dend = dpos;
            emitted++;
        }

        /* whatever follows the last operation is kept as is */
        if (spos < len1 || dpos < len2) {

            assert len1 - spos == len2 - dpos;

            final OpCode tail = new OpCode();
            tail.type = EditType.KEEP;
            tail.sbeg = spos;
            tail.dbeg = dpos;
            tail.send = len1;
            tail.dend = len2;

            opCodes[emitted++] = tail;
        }

        assert emitted == blockCount;

        return opCodes;
    }

    /**
     * Computes the edit distance between two strings while keeping only a single row
     * of the cost matrix in memory.
     *
     * <p>The common prefix and common suffix of the two strings are skipped first, and
     * the cases where one remainder is empty or a single character are answered
     * directly without building the row.
     *
     * @param s1    the first string
     * @param s2    the second string
     * @param xcost if non-zero, replacing a character costs 2 (as much as a delete plus
     *              an insert); if zero, it costs 1
     * @return the edit distance between {@code s1} and {@code s2}
     */
    public static int levEditDistance(String s1, String s2, int xcost) {

        int i;
        int half;

        char[] c1 = s1.toCharArray();
        char[] c2 = s2.toCharArray();

        /* start offsets into c1 / c2 of the parts that still have to be compared */
        int str1 = 0;
        int str2 = 0;

        int len1 = s1.length();
        int len2 = s2.length();

        /* strip common prefix */
        while (len1 > 0 && len2 > 0 && c1[str1] == c2[str2]) {

            len1--;
            len2--;
            str1++;
            str2++;

        }

        /* strip common suffix */
        while (len1 > 0 && len2 > 0 && c1[str1 + len1 - 1] == c2[str2 + len2 - 1]) {
            len1--;
            len2--;
        }

        /* catch trivial cases */
        if (len1 == 0)
            return len2;
        if (len2 == 0)
            return len1;

        /* make the inner cycle (i.e. str2) the longer one;
         * lengths, start offsets and arrays are all swapped together */
        if (len1 > len2) {

            int nx = len1;
            int temp = str1;

            len1 = len2;
            len2 = nx;

            str1 = str2;
            str2 = temp;

            char[] t = c2;
            c2 = c1;
            c1 = t;

        }

        /* check len1 == 1 separately; memchr yields 1 when the single remaining
         * character of c1 occurs in the remaining part of c2, and 0 otherwise */
        if (len1 == 1) {
            if (xcost != 0) {
                return len2 + 1 - 2 * memchr(c2, str2, c1[str1], len2);
            } else {
                return len2 - memchr(c2, str2, c1[str1], len2);
            }
        }

        /* from here on len1 / len2 are one larger than the remaining string lengths */
        len1++;
        len2++;
        half = len1 >> 1;

        /* the only row of costs that is kept */
        int[] row = new int[len2];
        int end = len2 - 1;

        /* initialize the first row; when xcost is zero the last `half` entries are
         * left at their default value of 0 */
        for (i = 0; i < len2 - (xcost != 0 ? 0 : half); i++)
            row[i] = i;


        /* go through the matrix and compute the costs.  yes, this is an extremely
         * obfuscated version, but also extremely memory-conservative and relatively
         * fast.  */

        if (xcost != 0) {

            for (i = 1; i < len1; i++) {

                /* p is the index into row, c2p the index into c2 */
                int p = 1;

                char ch1 = c1[str1 + i - 1];
                int c2p = str2;

                int D = i;
                int x = i;

                while (p <= end) {

                    /* on a match take D - 1, otherwise one more than the
                     * value just written to the previous entry */
                    if (ch1 == c2[c2p++]) {
                        x = --D;
                    } else {
                        x++;
                    }
                    /* D becomes the not yet overwritten entry at p, plus one */
                    D = row[p];
                    D++;

                    if (x > D)
                        x = D;
                    row[p++] = x;

                }

            }

        } else {

            /* in this case we don't have to scan two corner triangles (of size len1/2)
             * in the matrix because no best path can go through them. note this
             * breaks when len1 == len2 == 2 so the memchr() special case above is
             * necessary */

            row[0] = len1 - half - 1;
            for (i = 1; i < len1; i++) {
                int p;

                char ch1 = c1[str1 + i - 1];
                int c2p;

                int D, x;

                /* skip the upper triangle */
                if (i >= len1 - half) {
                    /* start this row `offset` entries in instead of at index 1 */
                    int offset = i - (len1 - half);
                    int c3;

                    c2p = str2 + offset;
                    p = offset;
                    c3 = row[p++] + ((ch1 != c2[c2p++]) ? 1 : 0);
                    x = row[p];
                    x++;
                    D = x;
                    if (x > c3) {
                        x = c3;
                    }
                    row[p++] = x;
                } else {
                    p = 1;
                    c2p = str2;
                    D = x = i;
                }
                /* skip the lower triangle */
                if (i <= half + 1)
                    end = len2 + i - half - 2;
                /* main */
                while (p <= end) {
                    /* c3: D - 1, plus one if the characters differ */
                    int c3 = --D + ((ch1 != c2[c2p++]) ? 1 : 0);
                    x++;
                    if (x > c3) {
                        x = c3;
                    }
                    D = row[p];
                    D++;
                    if (x > D)
                        x = D;
                    row[p++] = x;

                }

                /* lower triangle sentinel: writes one more entry just past `end`
                 * without reading the old value stored there */
                if (i <= half) {
                    int c3 = --D + ((ch1 != c2[c2p]) ? 1 : 0);
                    x++;
                    if (x > c3) {
                        x = c3;
                    }
                    row[p] = x;
                }
            }
        }

        /* the result is the row entry at `end` */
        i = row[end];

        return i;

    }

    /**
     * Reports whether {@code needle} occurs among the {@code num} characters of
     * {@code haystack} that start at index {@code offset}.
     *
     * @param haystack the characters to search
     * @param offset   index of the first character to look at
     * @param needle   the character to look for
     * @param num      number of characters to look at
     * @return 1 if {@code needle} was found, 0 otherwise
     */
    private static int memchr(char[] haystack, int offset, char needle, int num) {

        for (int scanned = 0; scanned != num; scanned++) {
            if (haystack[offset + scanned] == needle) {
                return 1;
            }
        }

        return 0;
    }


    /**
     * Computes a similarity ratio for two strings:
     * {@code (len(s1) + len(s2) - distance) / (len(s1) + len(s2))}, where
     * {@code distance} is {@code levEditDistance(s1, s2, 1)}.
     *
     * @param s1 the first string
     * @param s2 the second string
     * @return the similarity ratio; 1.0 when both strings are empty
     */
    public static double getRatio(String s1, String s2) {

        int len1 = s1.length();
        int len2 = s2.length();
        int lensum = len1 + len2;

        // Two empty strings are identical. Without this guard the division below
        // would be 0 / 0.0, which is NaN.
        if (lensum == 0) {
            return 1.0;
        }

        int editDistance = levEditDistance(s1, s2, 1);

        return (lensum - editDistance) / (double) lensum;

    }



}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy