// de.unkrig.zz.diff.DocumentDiff Maven / Gradle / Ivy
/*
* de.unkrig.diff - An advanced version of the UNIX DIFF utility
*
* Copyright (c) 2016, Arno Unkrig
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
* following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
* following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
* following disclaimer in the documentation and/or other materials provided with the distribution.
* 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package de.unkrig.zz.diff;
import static de.unkrig.commons.text.scanner.JavaScanner.TokenType.CXX_COMMENT;
import static de.unkrig.commons.text.scanner.JavaScanner.TokenType.C_COMMENT;
import static de.unkrig.commons.text.scanner.JavaScanner.TokenType.MULTI_LINE_C_COMMENT_BEGINNING;
import static de.unkrig.commons.text.scanner.JavaScanner.TokenType.MULTI_LINE_C_COMMENT_END;
import static de.unkrig.commons.text.scanner.JavaScanner.TokenType.MULTI_LINE_C_COMMENT_MIDDLE;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.Checksum;
import org.incava.util.diff.Difference;
import de.unkrig.commons.io.ByteFilterInputStream;
import de.unkrig.commons.lang.AssertionUtil;
import de.unkrig.commons.lang.ExceptionUtil;
import de.unkrig.commons.lang.protocol.Predicate;
import de.unkrig.commons.nullanalysis.Nullable;
import de.unkrig.commons.text.AbstractPrinter;
import de.unkrig.commons.text.Printer;
import de.unkrig.commons.text.Printers;
import de.unkrig.commons.text.scanner.AbstractScanner.Token;
import de.unkrig.commons.text.scanner.JavaScanner;
import de.unkrig.commons.text.scanner.JavaScanner.TokenType;
import de.unkrig.commons.text.scanner.ScanException;
import de.unkrig.commons.text.scanner.ScannerUtil;
import de.unkrig.commons.text.scanner.StringScanner;
/**
* Implementation of a document comparator, i.e. the core of the UNIX DIFF utility.
*
* It prints its output via the {@link Printers context printer}; if you want to modify the printing, then you'll
* have to set up your own {@link Printer} and use {@link AbstractPrinter#run(Runnable)} to run the DIFF.
*
*/
public
class DocumentDiff {
// Runs once during class initialization. Presumably switches on "assert" statements for this class
// independently of the JVM's "-ea" setting -- confirm against AssertionUtil.
static {
    AssertionUtil.enableAssertionsForThisClass();
}

/** Matches a run of one or more whitespace characters. */
private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
// private static final ExecutorService PARALLEL_EXECUTOR_SERVICE = new ScheduledThreadPoolExecutor(
// Runtime.getRuntime().availableProcessors() * 3,
// ThreadUtil.DAEMON_THREAD_FACTORY
// );
/**
* Iff the paths of the two contents sources match the {@link #pathPattern}, and the line from source 1 ("line 1")
* and the line from source 2 ("Line 2") both match the {@link #lineRegex}, and the capturing groups have
* equal text, then the two lines are regarded as "equal", although their texts may not be equal.
*/
public static
class LineEquivalence {
/**
* To which files / elements this object applies.
*/
public final Predicate super String> pathPattern;
/**
* The regex that is applied to each line.
*/
public final Pattern lineRegex;
public
LineEquivalence(Predicate super String> pathPattern, Pattern lineRegex) {
this.pathPattern = pathPattern;
this.lineRegex = lineRegex;
}
@Override public String
toString() {
return this.pathPattern + ":" + this.lineRegex;
}
}
/**
 * The output formats in which differences can be reported.
 */
public enum DocumentDiffMode {

    /** The 'normal' DIFF output format. */
    NORMAL,

    /** The 'context diff' output format. */
    CONTEXT,

    /** The 'unified diff' output format. */
    UNIFIED,
}
// Configuration parameters.
/**
 * The possible modes for tokenizing the documents to compare.
 */
public enum Tokenization {

    /** The documents are compared line by line. */
    LINE,

    /** The lines are split into tokens (presumably Java tokens) before the comparison. */
    JAVA,
}
// Registered via "addEquivalentLine()" and "addIgnore()"; the element type is spelled out so that the
// "for (LineEquivalence ...)" loops over these collections type-check.
private final Collection<LineEquivalence> equivalentLines = new ArrayList<LineEquivalence>();
private final Collection<LineEquivalence> ignores         = new ArrayList<LineEquivalence>();

// Boolean options start out as "false", i.e. disabled.
private boolean          ignoreWhitespace;
private boolean          disassembleClassFiles;
private boolean          disassembleClassFilesVerbose;
@Nullable private File   disassembleClassFilesSourceDirectory; // "null" means: source file loading disabled
private boolean          disassembleClassFilesButHideLines;
private boolean          disassembleClassFilesButHideVars;
private boolean          disassembleClassFilesSymbolicLabels;
private Charset          charset          = Charset.defaultCharset();
private DocumentDiffMode documentDiffMode = DocumentDiffMode.NORMAL;
private int              contextSize      = 3;
private Tokenization     tokenization     = Tokenization.LINE;
private boolean          ignoreCStyleComments;
private boolean          ignoreCPlusPlusStyleComments;
private boolean          ignoreDocComments;
// SETTERS FOR THE VARIOUS CONFIGURATION PARAMETERS
/**
 * Sets the "ignore whitespace" option, which is initially {@code false}.
 *
 * @param value The new value of the option
 */
public void setIgnoreWhitespace(boolean value) {
    this.ignoreWhitespace = value;
}
/**
 * Sets the "disassemble class files" option, which is initially {@code false}.
 *
 * @param value The new value of the option
 */
public void setDisassembleClassFiles(boolean value) {
    this.disassembleClassFiles = value;
}
/**
 * Sets the "verbose disassembly" option, which is initially {@code false}.
 *
 * @param value Whether the disassembly output includes a constant pool dump, constant pool indexes, and hex dumps
 *              of all attributes
 */
public void setDisassembleClassFilesVerbose(boolean value) {
    this.disassembleClassFilesVerbose = value;
}
/**
 * Sets the directory that is searched for source files. Source file loading is disabled by default.
 *
 * @param value The directory to look in, or {@code null} to disable source file loading
 */
public void setDisassembleClassFilesSourceDirectory(@Nullable File value) {
    this.disassembleClassFilesSourceDirectory = value;
}
/**
 * Sets the "hide line numbers" option of the disassembly.
 *
 * @param value {@code true} to suppress source line numbers in the disassembly (the default is {@code false})
 */
public void setDisassembleClassFilesButHideLines(boolean value) {
    this.disassembleClassFilesButHideLines = value;
}
/**
 * Sets the "hide local variable names" option of the disassembly.
 *
 * @param value {@code true} to suppress local variable names in the disassembly (the default is {@code false})
 */
public void setDisassembleClassFilesButHideVars(boolean value) {
    this.disassembleClassFilesButHideVars = value;
}
/**
 * Chooses between numeric labels ('#123') and symbolic labels ('L12') in the bytecode disassembly.
 *
 * @param value Presumably {@code true} selects symbolic labels and {@code false} (the initial value) numeric
 *              labels -- confirm against the code that reads this option
 */
public void setDisassembleClassFilesSymbolicLabels(boolean value) {
    this.disassembleClassFilesSymbolicLabels = value;
}
/**
 * Sets the charset option; initially, this is the platform's {@link Charset#defaultCharset() default charset}.
 *
 * @param value The charset to use from now on (presumably for decoding the documents to compare)
 */
public void setCharset(Charset value) {
    this.charset = value;
}
/**
 * Selects the format in which differences are reported; initially, this is {@link DocumentDiffMode#NORMAL}.
 *
 * @param value The output format to use from now on
 */
public void setDocumentDiffMode(DocumentDiffMode value) {
    this.documentDiffMode = value;
}
/**
 * Sets the number of (equal) lines to report before and after each change; the default is 3.
 *
 * This setting only matters for the diff modes {@link DocumentDiffMode#UNIFIED} and
 * {@link DocumentDiffMode#CONTEXT}.
 *
 * @param value The new number of context lines
 * @see #setDocumentDiffMode(DocumentDiffMode)
 */
public void setContextSize(int value) {
    this.contextSize = value;
}
/**
 * Selects how the documents are tokenized for the comparison; initially, this is {@link Tokenization#LINE}.
 *
 * @param value The tokenization mode to use from now on
 */
public void setTokenization(Tokenization value) {
    this.tokenization = value;
}
/**
* Whether C-style comments ("/*; ... */
") are relevant for comparison.
* Relevant iff {@link #setTokenization(Tokenization) tokenization} is {@link Tokenization#JAVA JAVA}.
*
* Doc comments ("/** ... */
") are handled differently, and are not regarded as C-style
* comments.
*
*
* The default is {@code false}.
*
*
* @see #setIgnoreDocComments(boolean)
*/
public void
setIgnoreCStyleComments(boolean value) { this.ignoreCStyleComments = value; }
/**
 * Sets the "ignore C++-style comments" flag, which controls whether C++-style comments (two slashes, up to the
 * end of the line) are relevant for comparison.
 * Relevant iff {@link #setTokenization(Tokenization) tokenization} is {@link Tokenization#JAVA JAVA}.
 *
 * The default is {@code false}.
 *
 * @param value The new value of the flag
 */
public void setIgnoreCPlusPlusStyleComments(boolean value) {
    this.ignoreCPlusPlusStyleComments = value;
}
/**
* Whether doc comments ("/** ... */
") are relevant for comparison.
* Relevant iff {@link #setTokenization(Tokenization) tokenization} is {@link Tokenization#JAVA JAVA}.
*
* The default is {@code false}.
*
*
* Strictly speaking, a doc comment is only a doc comment if it appears immediately before a declaration;
* however, this implementation regards any comment starting with "{@code /**}" as a doc comment.
*
*/
public void
setIgnoreDocComments(boolean value) { this.ignoreDocComments = value; }
/**
 * Registers another {@link LineEquivalence}. When documents are compared, those registered objects whose
 * {@link LineEquivalence#pathPattern} accepts the first document's path are taken into account.
 *
 * @param lineEquivalence The object to add to the collection of equivalent lines
 */
public void addEquivalentLine(LineEquivalence lineEquivalence) {
    this.equivalentLines.add(lineEquivalence);
}
/**
 * Registers another "ignore": Differences are not printed if all their deleted lines and all their added lines
 * contain a match of any of the registered {@link LineEquivalence#lineRegex}es.
 *
 * A registered {@link LineEquivalence} only takes effect if its {@link LineEquivalence#pathPattern} matches
 * the document's {@code path1}.
 *
 * @param lineEquivalence The object to add to the collection of ignores
 */
public void addIgnore(LineEquivalence lineEquivalence) {
    this.ignores.add(lineEquivalence);
}
/**
* Honors {@link #equivalentLines}, invokes {@link #diff2(Line[], Line[], Collection)} and reports that the
* contents is "equal" or has "changed", and, in the latter case, prints the actual differences. Reporting is
* done through {@link Printers#info(String)}.
*
* The two input streams are closed in any case, even on abrupt completion.
*
*
* @return The number of reported differences
*/
public long
diff(String path1, String path2, InputStream stream1, InputStream stream2) throws IOException {
// Determine which of the "equivalent lines" are effective for this path.
final Collection effectiveEquivalentLines = new ArrayList();
for (LineEquivalence le : this.equivalentLines) {
if (le.pathPattern.evaluate(path1)) effectiveEquivalentLines.add(le.lineRegex);
}
// Read the contents of the two pathes.
Line[] lines1 = this.readAllLines(stream1, effectiveEquivalentLines, path1);
Line[] lines2 = this.readAllLines(stream2, effectiveEquivalentLines, path2);
Printers.verbose(
"''{0}'' ({1} {1,choice,0#lines|1#line|1 effectiveIgnores = new ArrayList();
for (LineEquivalence ignore : DocumentDiff.this.ignores) {
if (ignore.pathPattern.evaluate(path1)) effectiveIgnores.add(ignore.lineRegex);
}
List differences = this.diff2(lines1, lines2, effectiveIgnores);
if (differences.isEmpty()) return 0;
// Report the actual differences.
switch (DocumentDiff.this.documentDiffMode) {
case NORMAL:
DocumentDiff.normalDiff(lines1, lines2, differences);
break;
case CONTEXT:
Printers.info("*** " + path1);
Printers.info("--- " + path2);
DocumentDiff.this.contextDiff(lines1, lines2, differences);
break;
case UNIFIED:
Printers.info("--- " + path1);
Printers.info("+++ " + path2);
DocumentDiff.this.unifiedDiff(lines1, lines2, differences);
break;
default:
throw new AssertionError();
}
return differences.size();
}
/**
* Determines the {@link Difference}s between the given two contents. Invokes {@link #diff3(Line[], Line[])} and
* adds the ignores feature.
*
* @return The found differences
*/
private List
diff2(Line[] lines1, Line[] lines2, Collection ignores) {
// Compute the contents differences.
List diffs = this.diff3(lines1, lines2);
Printers.verbose("{0} raw {0,choice,0#differences|1#difference|1 it = diffs.iterator(); it.hasNext();) {
Difference d = it.next();
if (d.getDeletedStart() != Difference.NONE) {
for (int i = d.getDeletedStart(); i <= d.getDeletedEnd(); i++) {
if (!DocumentDiff.contains(lines1[i].text, ignores)) continue IGNORABLE;
}
}
if (d.getAddedStart() != Difference.NONE) {
for (int i = d.getAddedStart(); i <= d.getAddedEnd(); i++) {
if (!DocumentDiff.contains(lines2[i].text, ignores)) continue IGNORABLE;
}
}
it.remove();
}
Printers.verbose("Reduced to {0} non-ignorable differences", diffs.size());
}
return diffs;
}
/**
* Determines the {@link Difference}s between the given two contents. Implements the {@link #tokenization}
* feature.
*
* @return The found differences
*/
private List
diff3(Line[] lines1, Line[] lines2) {
switch (this.tokenization) {
case LINE:
return new org.incava.util.diff.Diff(lines1, lines2).diff();
case JAVA:
Map tokenIndexToLineIndex1 = new HashMap(lines1.length);
Map tokenIndexToLineIndex2 = new HashMap(lines2.length);
List tokens1 = this.tokenize(lines1, tokenIndexToLineIndex1);
List tokens2 = this.tokenize(lines2, tokenIndexToLineIndex2);
// Transform the list of "token differences" into a list of "line differences". Since there can be more
// than one "token diff" per line, the resulting list could be shorter than the original list.
List diffs = new org.incava.util.diff.Diff