org.apache.commons.jrcs.diff.SimpleDiff Maven / Gradle / Ivy
Show all versions of sakai-rwiki-diff Show documentation
/*
* ====================================================================
*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999-2003 The Apache Software Foundation.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution, if
* any, must include the following acknowledgement:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgement may appear in the software itself,
* if and wherever such third-party acknowledgements normally appear.
*
* 4. The names "The Jakarta Project", "Commons", and "Apache Software
* Foundation" must not be used to endorse or promote products derived
* from this software without prior written permission. For written
* permission, please contact [email protected].
*
* 5. Products derived from this software may not be called "Apache"
* nor may "Apache" appear in their names without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* .
*
*/
package org.apache.commons.jrcs.diff;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
* Implements a simple differencing algortithm.
*
*
* @date $Date$
* @version $Revision$
* @author Juanco Anez
*
* Overview of Algorithm
*
*
* by bwm
*
*
* The algorithm is optimised for situations where the input sequences
* have few repeated objects. If it is given input with many repeated
* objects it will report sub-optimal changes. However, given
* appropriate input, it is fast, and linear in memory usage.
*
*
* The algorithm consists of the following steps:
*
*
* - compute an equivalence set for the input data
* - translate each element of the orginal and revised input
* sequences to a member of the equivalence set
* - match the the input sequences to determine the deltas, i.e. the
* differences between the original and revised sequences.
*
*
* The first step is to compute a an equivalence set for the input data.
* The equivalence set is computed from objects that are in the original
* input sequence
*
*
*
* eq(x) = the index of the first occurence of x in the original sequence.
*
*
*
* With this equivalence function, the algorithm can compare integers
* rather than strings, which is considerably more efficient.
*
*
* The second step is to compute the datastructure on which the
* algorithm will operate. Having computed the equivalence function in
* the previous step, we can compute two arrays where indx[i] =
* eqs(orig[i]) and jndx[i] = eqs(rev[i]). The algorithm can now operate
* on indx and jndx instead of orig and rev. Thus, comparisons are then
* on O(int == int) instead of O(Object.equals(Object)).
*
*
* The algorithm now matches indx and jndx. Whilst indx[i] == jndx[i] it
* skips matching objects in the sequence. In seeking to match objects
* in the input sequence it assumes that each object is likely to be
* unique. It uses the known characteristics of the unique equivalence
* function. It can tell from the eq value if this object appeared in
* the other sequence at all. If it did not, there is no point in
* searching for a match.
*
*
* Recall that the eq function value is the index earliest occurrence in
* the orig sequence. This information is used to search efficiently for
* the next match. The algorithm is perfect when all input objects are
* unique, but degrades when input objects are not unique. When input
* objects are not unique an optimal match may not be found, but a
* correct match will be.
*
*
* Having identified common matching objects in the orig and revised
* sequences, the differences between them are easily computed.
*
* @see Delta
* @see Revision Modifications: 27/Apr/2003 bwm Added some comments whilst
* trying to figure out the algorithm 03 May 2003 bwm Created this
* implementation class by refactoring it out of the Diff class to enable
* plug in difference algorithms
*/
public class SimpleDiff implements DiffAlgorithm
{
static final int NOT_FOUND_i = -2;
static final int NOT_FOUND_j = -1;
static final int EOS = Integer.MAX_VALUE;
public SimpleDiff()
{
}
protected int scan(int[] ndx, int i, int target)
{
while (ndx[i] < target)
{
i++;
}
return i;
}
/**
* Compute the difference between original and revised sequences.
*
* @param orig
* The original sequence.
* @param rev
* The revised sequence to be compared with the original.
* @return A Revision object describing the differences.
* @throws DifferenciationFailedException
* if the diff could not be computed.
*/
public Revision diff(Object[] orig, Object[] rev)
throws DifferentiationFailedException
{
// create map eqs, such that for each item in both orig and rev
// eqs(item) = firstOccurrence(item, orig);
Map eqs = buildEqSet(orig, rev);
// create an array such that
// indx[i] = NOT_FOUND_i if orig[i] is not in rev
// indx[i] = firstOccurrence(orig[i], orig)
int[] indx = buildIndex(eqs, orig, NOT_FOUND_i);
// create an array such that
// jndx[j] = NOT_FOUND_j if orig[j] is not in rev
// jndx[j] = firstOccurrence(rev[j], orig)
int[] jndx = buildIndex(eqs, rev, NOT_FOUND_j);
// what in effect has been done is to build a unique hash
// for each item that is in both orig and rev
// and to label each item in orig and new with that hash value
// or a marker that the item is not common to both.
eqs = null; // let gc know we're done with this
Revision deltas = new Revision(); // !!! new Revision()
int i = 0;
int j = 0;
// skip matching
// skip leading items that are equal
// could be written
// for (i=0; indx[i] != EOS && indx[i] == jndx[i]; i++);
// j = i;
for (; indx[i] != EOS && indx[i] == jndx[j]; i++, j++)
{
/* void */
}
while (indx[i] != jndx[j])
{ // only equal if both == EOS
// they are different
int ia = i;
int ja = j;
// size of this delta
do
{
// look down rev for a match
// stop at a match
// or if the FO(rev[j]) > FO(orig[i])
// or at the end
while (jndx[j] < 0 || jndx[j] < indx[i])
{
j++;
}
// look down orig for a match
// stop at a match
// or if the FO(orig[i]) > FO(rev[j])
// or at the end
while (indx[i] < 0 || indx[i] < jndx[j])
{
i++;
}
// this doesn't do a compare each line with each other line
// so it won't find all matching lines
}
while (indx[i] != jndx[j]);
// on exit we have a match
// they are equal, reverse any exedent matches
// it is possible to overshoot, so count back matching items
while (i > ia && j > ja && indx[i - 1] == jndx[j - 1])
{
--i;
--j;
}
deltas.addDelta(Delta.newDelta(new Chunk(orig, ia, i - ia),
new Chunk(rev, ja, j - ja)));
// skip matching
for (; indx[i] != EOS && indx[i] == jndx[j]; i++, j++)
{
/* void */
}
}
return deltas;
}
/**
* create a Map
from each common item in orig and rev to the
* index of its first occurrence in orig
*
* @param orig
* the original sequence of items
* @param rev
* the revised sequence of items
*/
protected Map buildEqSet(Object[] orig, Object[] rev)
{
// construct a set of the objects that orig and rev have in common
// first construct a set containing all the elements in orig
Set items = new HashSet(Arrays.asList(orig));
// then remove all those not in rev
items.retainAll(Arrays.asList(rev));
Map eqs = new HashMap();
for (int i = 0; i < orig.length; i++)
{
// if its a common item and hasn't been found before
if (items.contains(orig[i]))
{
// add it to the map
eqs.put(orig[i], Integer.valueOf(i));
// and make sure its not considered again
items.remove(orig[i]);
}
}
return eqs;
}
/**
* build a an array such each a[i] = eqs([i]) or NF if eqs([i]) undefined
*
* @param eqs
* a mapping from Object to Integer
* @param seq
* a sequence of objects
* @param NF
* the not found marker
*/
protected int[] buildIndex(Map eqs, Object[] seq, int NF)
{
int[] result = new int[seq.length + 1];
for (int i = 0; i < seq.length; i++)
{
Integer value = (Integer) eqs.get(seq[i]);
if (value == null || value.intValue() < 0)
{
result[i] = NF;
}
else
{
result[i] = value.intValue();
}
}
result[seq.length] = EOS;
return result;
}
}