All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.eclipse.compare.internal.LCS Maven / Gradle / Ivy

Go to download

docx4j is a library which helps you to work with the Office Open XML file format as used in docx documents, pptx presentations, and xlsx spreadsheets.

There is a newer version: 6.1.2
Show newest version
/*******************************************************************************
 * Copyright (c) 2000, 2006 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *******************************************************************************/
package org.eclipse.compare.internal;


/* Used to determine the change set responsible for each line */
public abstract class LCS {

    private int max_differences; // the maximum number of differences

    // from

    // each end to consider

    private int length;

    /**
     * Myers' algorithm for longest common subsequence. O((M + N)D) worst case
     * time, O(M + N + D^2) expected time, O(M + N) space
     * (http://citeseer.ist.psu.edu/myers86ond.html)
     * 
     * Note: Beyond implementing the algorithm as described in the paper I have
     * added diagonal range compression which helps when finding the LCS of a
     * very long and a very short sequence, also bound the running time to (N +
     * M)^1.5 when both sequences are very long.
     * 
     * After this method is called, the longest common subsequence is available
     * by calling getResult() where result[0] is composed of entries from l1 and
     * result[1] is composed of entries from l2
     * 
     * @param subMonitor
     */
    public void longestCommonSubsequence(LCSSettings settings) {
        int length1 = getLength1();
        int length2 = getLength2();
        if (length1 == 0 || length2 == 0) {
            length = 0;
            return;
        }

        max_differences = (length1 + length2 + 1) / 2; // ceil((N+M)/2)
        if ((double) length1 * (double) length2 > settings.getTooLong()) {
            // limit complexity to D^POW_LIMIT for long sequences
            max_differences = (int) Math.pow(max_differences, settings
                    .getPowLimit() - 1.0);
        }

        initializeLcs(length1);

        /*
         * The common prefixes and suffixes are always part of some LCS, include
         * them now to reduce our search space
         */
        int forwardBound;
        int max = Math.min(length1, length2);
        for (forwardBound = 0; forwardBound < max
                && isRangeEqual(forwardBound, forwardBound); forwardBound++) {
            setLcs(forwardBound, forwardBound);
        }

        int backBoundL1 = length1 - 1;
        int backBoundL2 = length2 - 1;

        while (backBoundL1 >= forwardBound && backBoundL2 >= forwardBound
                && isRangeEqual(backBoundL1, backBoundL2)) {
            setLcs(backBoundL1, backBoundL2);
            backBoundL1--;
            backBoundL2--;
        }

        length = forwardBound
                + length1
                - backBoundL1
                - 1
                + lcs_rec(forwardBound, backBoundL1, forwardBound, backBoundL2,
                        new int[2][length1 + length2 + 1], new int[3]);

    }

    /**
     * The recursive helper function for Myers' LCS. Computes the LCS of
     * l1[bottoml1 .. topl1] and l2[bottoml2 .. topl2] fills in the appropriate
     * location in lcs and returns the length
     * 
     * @param l1
     *                The 1st sequence
     * @param bottoml1
     *                Index in the 1st sequence to start from (inclusive)
     * @param topl1
     *                Index in the 1st sequence to end on (inclusive)
     * @param l2
     *                The 2nd sequence
     * @param bottoml2
     *                Index in the 2nd sequence to start from (inclusive)
     * @param topl2
     *                Index in the 2nd sequence to end on (inclusive)
     * @param V
     *                should be allocated as int[2][l1.length + l2.length + 1],
     *                used to store furthest reaching D-paths
     * @param snake
     *                should be allocated as int[3], used to store the beginning
     *                x, y coordinates and the length of the latest snake
     *                traversed
     * @param subMonitor
     * @param lcs
     *                should be allocated as TextLine[2][l1.length], used to
     *                store the common points found to be part of the LCS where
     *                lcs[0] references lines of l1 and lcs[1] references lines
     *                of l2.
     * 
     * @return the length of the LCS
     */
    private int lcs_rec(int bottoml1, int topl1, int bottoml2, int topl2,
            int[][] V, int[] snake) {

        // check that both sequences are non-empty
        if (bottoml1 > topl1 || bottoml2 > topl2) {
            return 0;
        }

        int d = find_middle_snake(bottoml1, topl1, bottoml2, topl2, V, snake);
        // System.out.println(snake[0] + " " + snake[1] + " " + snake[2]);

        // need to store these so we don't lose them when they're overwritten by
        // the recursion
        int len = snake[2];
        int startx = snake[0];
        int starty = snake[1];

        // the middle snake is part of the LCS, store it
        for (int i = 0; i < len; i++) {
            setLcs(startx + i, starty + i);
        }

        if (d > 1) {
            return len
                    + lcs_rec(bottoml1, startx - 1, bottoml2, starty - 1, V,
                            snake)
                    + lcs_rec(startx + len, topl1, starty + len, topl2, V,
                            snake);
        } else if (d == 1) {
            /*
             * In this case the sequences differ by exactly 1 line. We have
             * already saved all the lines after the difference in the for loop
             * above, now we need to save all the lines before the difference.
             */
            int max = Math.min(startx - bottoml1, starty - bottoml2);
            for (int i = 0; i < max; i++) {
                setLcs(bottoml1 + i, bottoml2 + i);
            }
            return max + len;
        }

        return len;
    }

    /**
     * Helper function for Myers' LCS algorithm to find the middle snake for
     * l1[bottoml1..topl1] and l2[bottoml2..topl2] The x, y coodrdinates of the
     * start of the middle snake are saved in snake[0], snake[1] respectively
     * and the length of the snake is saved in s[2].
     * 
     * @param l1
     *                The 1st sequence
     * @param bottoml1
     *                Index in the 1st sequence to start from (inclusive)
     * @param topl1
     *                Index in the 1st sequence to end on (inclusive)
     * @param l2
     *                The 2nd sequence
     * @param bottoml2
     *                Index in the 2nd sequence to start from (inclusive)
     * @param topl2
     *                Index in the 2nd sequence to end on (inclusive)
     * @param V
     *                should be allocated as int[2][l1.length + l2.length + 1],
     *                used to store furthest reaching D-paths
     * @param snake
     *                should be allocated as int[3], used to store the beginning
     *                x, y coordinates and the length of the middle snake
     * 
     * @return The number of differences (SES) between l1[bottoml1..topl1] and
     *         l2[bottoml2..topl2]
     */
    private int find_middle_snake(int bottoml1, int topl1, int bottoml2,
            int topl2, int[][] V, int[] snake) {
        int N = topl1 - bottoml1 + 1;
        int M = topl2 - bottoml2 + 1;
        // System.out.println("N: " + N + " M: " + M + " bottom: " + bottoml1 +
        // ", " +
        // bottoml2 + " top: " + topl1 + ", " + topl2);
        int delta = N - M;
        boolean isEven;
        if ((delta & 1) == 1) {
            isEven = false;
        } else {
            isEven = true;
        }

        int limit = Math.min(max_differences, (N + M + 1) / 2); // ceil((N+M)/2)

        int value_to_add_forward; // a 0 or 1 that we add to the start
        // offset
        // to make it odd/even
        if ((M & 1) == 1) {
            value_to_add_forward = 1;
        } else {
            value_to_add_forward = 0;
        }

        int value_to_add_backward;
        if ((N & 1) == 1) {
            value_to_add_backward = 1;
        } else {
            value_to_add_backward = 0;
        }

        int start_forward = -M;
        int end_forward = N;
        int start_backward = -N;
        int end_backward = M;

        V[0][limit + 1] = 0;
        V[1][limit - 1] = N;
        for (int d = 0; d <= limit; d++) {

            int start_diag = Math.max(value_to_add_forward + start_forward, -d);
            int end_diag = Math.min(end_forward, d);
            value_to_add_forward = 1 - value_to_add_forward;

            // compute forward furthest reaching paths
            for (int k = start_diag; k <= end_diag; k += 2) {
                int x;
                if (k == -d
                        || (k < d && V[0][limit + k - 1] < V[0][limit + k + 1])) {
                    x = V[0][limit + k + 1];
                } else {
                    x = V[0][limit + k - 1] + 1;
                }

                int y = x - k;

                snake[0] = x + bottoml1;
                snake[1] = y + bottoml2;
                snake[2] = 0;
                // System.out.println("1 x: " + x + " y: " + y + " k: " + k + "
                // d: " + d );
                while (x < N && y < M
                        && isRangeEqual(x + bottoml1, y + bottoml2)) {
                    x++;
                    y++;
                    snake[2]++;
                }
                V[0][limit + k] = x;
                // System.out.println(x + " " + V[1][limit+k -delta] + " " + k +
                // " " + delta);
                if (!isEven && k >= delta - d + 1 && k <= delta + d - 1
                        && x >= V[1][limit + k - delta]) {
                    // System.out.println("Returning: " + (2*d-1));
                    return 2 * d - 1;
                }

                // check to see if we can cut down the diagonal range
                if (x >= N && end_forward > k - 1) {
                    end_forward = k - 1;
                } else if (y >= M) {
                    start_forward = k + 1;
                    value_to_add_forward = 0;
                }
            }

            start_diag = Math.max(value_to_add_backward + start_backward, -d);
            end_diag = Math.min(end_backward, d);
            value_to_add_backward = 1 - value_to_add_backward;

            // compute backward furthest reaching paths
            for (int k = start_diag; k <= end_diag; k += 2) {
                int x;
                if (k == d
                        || (k != -d && V[1][limit + k - 1] < V[1][limit + k + 1])) {
                    x = V[1][limit + k - 1];
                } else {
                    x = V[1][limit + k + 1] - 1;
                }

                int y = x - k - delta;

                snake[2] = 0;
                // System.out.println("2 x: " + x + " y: " + y + " k: " + k + "
                // d: " + d);
                while (x > 0 && y > 0
                        && isRangeEqual(x - 1 + bottoml1, y - 1 + bottoml2)) {
                    x--;
                    y--;
                    snake[2]++;
                }
                V[1][limit + k] = x;

                if (isEven && k >= -delta - d && k <= d - delta
                        && x <= V[0][limit + k + delta]) {
                    // System.out.println("Returning: " + 2*d);
                    snake[0] = bottoml1 + x;
                    snake[1] = bottoml2 + y;

                    return 2 * d;
                }

                // check to see if we can cut down our diagonal range
                if (x <= 0) {
                    start_backward = k + 1;
                    value_to_add_backward = 0;
                } else if (y <= 0 && end_backward > k - 1) {
                    end_backward = k - 1;
                }
            }
        }

        /*
         * computing the true LCS is too expensive, instead find the diagonal
         * with the most progress and pretend a midle snake of length 0 occurs
         * there.
         */

        int[] most_progress = findMostProgress(M, N, limit, V);

        snake[0] = bottoml1 + most_progress[0];
        snake[1] = bottoml2 + most_progress[1];
        snake[2] = 0;
        return 5; /*
                     * HACK: since we didn't really finish the LCS computation
                     * we don't really know the length of the SES. We don't do
                     * anything with the result anyway, unless it's <=1. We know
                     * for a fact SES > 1 so 5 is as good a number as any to
                     * return here
                     */
    }

    /**
     * Takes the array with furthest reaching D-paths from an LCS computation
     * and returns the x,y coordinates and progress made in the middle diagonal
     * among those with maximum progress, both from the front and from the back.
     * 
     * @param M
     *                the length of the 1st sequence for which LCS is being
     *                computed
     * @param N
     *                the length of the 2nd sequence for which LCS is being
     *                computed
     * @param limit
     *                the number of steps made in an attempt to find the LCS
     *                from the front and back
     * @param V
     *                the array storing the furthest reaching D-paths for the
     *                LCS computation
     * @return The result as an array of 3 integers where result[0] is the x
     *         coordinate of the current location in the diagonal with the most
     *         progress, result[1] is the y coordinate of the current location
     *         in the diagonal with the most progress and result[2] is the
     *         amount of progress made in that diagonal
     */
    private static int[] findMostProgress(int M, int N, int limit, int[][] V) {
        int delta = N - M;

        int forward_start_diag;
        if ((M & 1) == (limit & 1)) {
            forward_start_diag = Math.max(-M, -limit);
        } else {
            forward_start_diag = Math.max(1 - M, -limit);
        }

        int forward_end_diag = Math.min(N, limit);

        int backward_start_diag;
        if ((N & 1) == (limit & 1)) {
            backward_start_diag = Math.max(-N, -limit);
        } else {
            backward_start_diag = Math.max(1 - N, -limit);
        }

        int backward_end_diag = Math.min(M, limit);

        int[][] max_progress = new int[Math.max(forward_end_diag
                - forward_start_diag, backward_end_diag - backward_start_diag) / 2 + 1][3];
        int num_progress = 0; // the 1st entry is current, it is initialized
        // with 0s

        // first search the forward diagonals
        for (int k = forward_start_diag; k <= forward_end_diag; k += 2) {
            int x = V[0][limit + k];
            int y = x - k;
            if (x > N || y > M) {
                continue;
            }

            int progress = x + y;
            if (progress > max_progress[0][2]) {
                num_progress = 0;
                max_progress[0][0] = x;
                max_progress[0][1] = y;
                max_progress[0][2] = progress;
            } else if (progress == max_progress[0][2]) {
                num_progress++;
                max_progress[num_progress][0] = x;
                max_progress[num_progress][1] = y;
                max_progress[num_progress][2] = progress;
            }
        }

        boolean max_progress_forward = true; // initially the maximum
        // progress is in the forward
        // direction

        // now search the backward diagonals
        for (int k = backward_start_diag; k <= backward_end_diag; k += 2) {
            int x = V[1][limit + k];
            int y = x - k - delta;
            if (x < 0 || y < 0) {
                continue;
            }

            int progress = N - x + M - y;
            if (progress > max_progress[0][2]) {
                num_progress = 0;
                max_progress_forward = false;
                max_progress[0][0] = x;
                max_progress[0][1] = y;
                max_progress[0][2] = progress;
            } else if (progress == max_progress[0][2] && !max_progress_forward) {
                num_progress++;
                max_progress[num_progress][0] = x;
                max_progress[num_progress][1] = y;
                max_progress[num_progress][2] = progress;
            }
        }

        // return the middle diagonal with maximal progress.
        return max_progress[num_progress / 2];
    }

    protected abstract int getLength2();

    protected abstract int getLength1();

    protected abstract boolean isRangeEqual(int i1, int i2);

    protected abstract void setLcs(int sl1, int sl2);

    protected abstract void initializeLcs(int lcsLength);

    public int getLength() {
        return length;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy