de.unkrig.zz.diff.DocumentDiff Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of zz-diff Show documentation
The newest version!

/*
 * de.unkrig.diff - An advanced version of the UNIX DIFF utility
 *
 * Copyright (c) 2016, Arno Unkrig
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
 * following conditions are met:
 *
 *    1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
 *       following disclaimer.
 *    2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
 *       following disclaimer in the documentation and/or other materials provided with the distribution.
 *    3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote
 *       products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package de.unkrig.zz.diff;

import static de.unkrig.commons.text.scanner.JavaScanner.TokenType.CXX_COMMENT;
import static de.unkrig.commons.text.scanner.JavaScanner.TokenType.C_COMMENT;
import static de.unkrig.commons.text.scanner.JavaScanner.TokenType.MULTI_LINE_C_COMMENT_BEGINNING;
import static de.unkrig.commons.text.scanner.JavaScanner.TokenType.MULTI_LINE_C_COMMENT_END;
import static de.unkrig.commons.text.scanner.JavaScanner.TokenType.MULTI_LINE_C_COMMENT_MIDDLE;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.Checksum;

import org.incava.util.diff.Difference;

import de.unkrig.commons.io.ByteFilterInputStream;
import de.unkrig.commons.lang.AssertionUtil;
import de.unkrig.commons.lang.ExceptionUtil;
import de.unkrig.commons.lang.protocol.Predicate;
import de.unkrig.commons.nullanalysis.Nullable;
import de.unkrig.commons.text.AbstractPrinter;
import de.unkrig.commons.text.Printer;
import de.unkrig.commons.text.Printers;
import de.unkrig.commons.text.scanner.AbstractScanner.Token;
import de.unkrig.commons.text.scanner.JavaScanner;
import de.unkrig.commons.text.scanner.JavaScanner.TokenType;
import de.unkrig.commons.text.scanner.ScanException;
import de.unkrig.commons.text.scanner.ScannerUtil;
import de.unkrig.commons.text.scanner.StringScanner;

/**
 * Implementation of a document comparator, i.e. the core of the UNIX DIFF utility.
 * <p>
 *   It prints its output via the {@link Printers context printer}; if you want to modify the printing, then you'll
 *   have to set up your own {@link Printer} and use {@link AbstractPrinter#run(Runnable)} to run the DIFF.
 * </p>
 */
public
class DocumentDiff {

    // NOTE(review): presumably switches on "assert" statements for this class independent of the JVM's "-ea"
    // setting; the helper lives in de.unkrig.commons and is not visible here - confirm.
    static {
        AssertionUtil.enableAssertionsForThisClass();
    }

    /**
     * Matches a run of one or more whitespace characters.
     */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

//    private static final ExecutorService PARALLEL_EXECUTOR_SERVICE = new ScheduledThreadPoolExecutor(
//        Runtime.getRuntime().availableProcessors() * 3,
//        ThreadUtil.DAEMON_THREAD_FACTORY
//    );

    /**
     * Pairs a path predicate with a line regex.
     * <p>
     *   Intended use: Iff the path of the contents source matches the {@link #pathPattern}, and the line from source 1
     *   ("line 1") and the line from source 2 ("line 2") both match the {@link #lineRegex}, and the capturing groups
     *   have equal text, then the two lines are regarded as "equal", although their texts may not be equal.
     * </p>
     * <p>
     *   Instances are immutable value holders; the matching itself is done by the code that consumes them.
     * </p>
     */
    public static
    class LineEquivalence {

        /**
         * To which files / elements this object applies; evaluated against the path of the document.
         */
        public final Predicate<? super String> pathPattern;

        /**
         * The regex that is applied to each line.
         */
        public final Pattern lineRegex;

        /**
         * @param pathPattern Selects the paths for which this object is effective
         * @param lineRegex   The regex that is applied to each line of a selected document
         */
        public
        LineEquivalence(Predicate<? super String> pathPattern, Pattern lineRegex) {
            this.pathPattern = pathPattern;
            this.lineRegex   = lineRegex;
        }

        /**
         * @return The path pattern and the line regex, separated by a colon
         */
        @Override public String
        toString() {
            return this.pathPattern + ":" + this.lineRegex;
        }
    }

    /**
     * The formats in which the found differences can be printed.
     */
    public
    enum DocumentDiffMode {

        /** The 'normal' DIFF output format. */
        NORMAL,

        /** The 'context diff' output format. */
        CONTEXT,

        /** The 'unified diff' output format. */
        UNIFIED
    }

    // Configuration parameters.

    /**
     * The possible modes for tokenizing the documents to compare.
     */
    public
    enum Tokenization {

        /** The documents are compared line by line. */
        LINE,

        /** The lines are first broken into tokens, and the token sequences are compared. */
        JAVA
    }

    // Both collections hold LineEquivalence objects (they are iterated as such by "diff()"), so they are declared
    // with that element type rather than as raw collections.
    private final Collection<LineEquivalence> equivalentLines = new ArrayList<LineEquivalence>();
    private final Collection<LineEquivalence> ignores         = new ArrayList<LineEquivalence>();
    private boolean                           ignoreWhitespace;
    private boolean                           disassembleClassFiles;
    private boolean                           disassembleClassFilesVerbose;
    @Nullable private File                    disassembleClassFilesSourceDirectory;
    private boolean                           disassembleClassFilesButHideLines;
    private boolean                           disassembleClassFilesButHideVars;
    private boolean                           disassembleClassFilesSymbolicLabels;
    private Charset                           charset          = Charset.defaultCharset();
    private DocumentDiffMode                  documentDiffMode = DocumentDiffMode.NORMAL;
    private int                               contextSize      = 3;
    private Tokenization                      tokenization     = Tokenization.LINE;
    private boolean                           ignoreCStyleComments;
    private boolean                           ignoreCPlusPlusStyleComments;
    private boolean                           ignoreDocComments;

    // SETTERS FOR THE VARIOUS CONFIGURATION PARAMETERS

    /**
     * Sets the "ignore whitespace" flag; it is {@code false} unless set.
     *
     * @param value The new value of the flag
     */
    public void
    setIgnoreWhitespace(boolean value) {
        this.ignoreWhitespace = value;
    }

    /**
     * Sets the "disassemble class files" flag; it is {@code false} unless set.
     *
     * @param value The new value of the flag
     */
    public void
    setDisassembleClassFiles(boolean value) {
        this.disassembleClassFiles = value;
    }

    /**
     * Configures the verbosity of the class file disassembly.
     *
     * @param value Whether the disassembly output includes a constant pool dump, constant pool indexes, and hex
     *              dumps of all attributes
     */
    public void
    setDisassembleClassFilesVerbose(boolean value) {
        this.disassembleClassFilesVerbose = value;
    }

    /**
     * Configures the directory in which source files are looked up for the class file disassembly.
     *
     * @param value The directory to search for source files, or {@code null} to disable source file loading (which
     *              is the default)
     */
    public void
    setDisassembleClassFilesSourceDirectory(@Nullable File value) {
        this.disassembleClassFilesSourceDirectory = value;
    }

    /**
     * Configures the suppression of source line numbers in the class file disassembly.
     *
     * @param value {@code true} to suppress source line numbers; the default is {@code false}
     */
    public void
    setDisassembleClassFilesButHideLines(boolean value) {
        this.disassembleClassFilesButHideLines = value;
    }

    /**
     * Configures the suppression of local variable names in the class file disassembly.
     *
     * @param value {@code true} to suppress local variable names; the default is {@code false}
     */
    public void
    setDisassembleClassFilesButHideVars(boolean value) {
        this.disassembleClassFilesButHideVars = value;
    }

    /**
     * Configures the label style of the bytecode disassembly.
     *
     * @param value Whether to use numeric labels ('#123') or symbolic labels ('L12') in the bytecode disassembly
     */
    public void
    setDisassembleClassFilesSymbolicLabels(boolean value) {
        this.disassembleClassFilesSymbolicLabels = value;
    }

    /**
     * Sets the charset of this differ; unless set, it is the {@link Charset#defaultCharset() platform default}.
     *
     * @param value The charset to use from now on
     */
    public void
    setCharset(Charset value) {
        this.charset = value;
    }

    /**
     * Selects the format in which {@link #diff(String, String, InputStream, InputStream)} prints the differences;
     * unless set, it is {@link DocumentDiffMode#NORMAL}.
     *
     * @param value The output format to use from now on
     */
    public void
    setDocumentDiffMode(DocumentDiffMode value) {
        this.documentDiffMode = value;
    }

    /**
     * Sets the number of (equal) lines to report before and after each change; unless set, it is 3.
     * <p>
     *   Only relevant for diff modes {@link DocumentDiffMode#UNIFIED} and {@link DocumentDiffMode#CONTEXT}.
     * </p>
     *
     * @param value The new number of context lines
     * @see #setDocumentDiffMode(DocumentDiffMode)
     */
    public void
    setContextSize(int value) {
        this.contextSize = value;
    }

    /**
     * Selects how the documents are broken into comparable units; unless set, it is {@link Tokenization#LINE}.
     *
     * @param value The tokenization mode to use from now on
     */
    public void
    setTokenization(Tokenization value) {
        this.tokenization = value;
    }

    /**
     * Whether C-style comments ("/*; ... */") are relevant for comparison.
     * Relevant iff {@link #setTokenization(Tokenization) tokenization} is {@link Tokenization#JAVA JAVA}.
     * 
     *   Doc comments ("/** ... */") are handled differently, and are not regarded as C-style
     *   comments.
     * 
     * 
     *   The default is {@code false}.
     * 
     *
     * @see #setIgnoreDocComments(boolean)
     */
    public void
    setIgnoreCStyleComments(boolean value) { this.ignoreCStyleComments = value; }

    /**
     * Whether C++-style comments ("// ...") are relevant for comparison.
     * Relevant iff {@link #setTokenization(Tokenization) tokenization} is {@link Tokenization#JAVA JAVA}.
     * <p>
     *   The default is {@code false}.
     * </p>
     *
     * @param value The new value of the "ignore C++-style comments" flag
     */
    public void
    setIgnoreCPlusPlusStyleComments(boolean value) {
        this.ignoreCPlusPlusStyleComments = value;
    }

    /**
     * Whether doc comments ("/** ... */") are relevant for comparison.
     * Relevant iff {@link #setTokenization(Tokenization) tokenization} is {@link Tokenization#JAVA JAVA}.
     * 
     *   The default is {@code false}.
     * 
     * 
     *   Strictly speaking, a doc comment is only a doc comment if it appears immediately before a declaration;
     *   however, this implementation regards any comment starting with "{@code /**}" as a doc comment.
     * 
     */
    public void
    setIgnoreDocComments(boolean value) { this.ignoreDocComments = value; }

    /**
     * Registers another {@link LineEquivalence} as an "equivalent line" definition.
     * <p>
     *   When a document is compared, only those definitions take effect where {@link LineEquivalence#pathPattern}
     *   matches the document's {@code path1}.
     * </p>
     *
     * @param lineEquivalence The definition to add
     */
    public void
    addEquivalentLine(LineEquivalence lineEquivalence) {
        this.equivalentLines.add(lineEquivalence);
    }

    /**
     * Registers another "ignore" definition: Differences where all deleted lines and all added lines contain matches
     * of any of {@link LineEquivalence#lineRegex} are not printed.
     * <p>
     *   Only {@link LineEquivalence}s take effect where {@link LineEquivalence#pathPattern} matches the document's
     *   {@code path1}.
     * </p>
     *
     * @param lineEquivalence The definition to add
     */
    public void
    addIgnore(LineEquivalence lineEquivalence) {
        this.ignores.add(lineEquivalence);
    }

    /**
     * Honors {@link #equivalentLines}, invokes {@link #diff2(Line[], Line[], Collection)} and reports that the
     * contents is "equal" or has "changed", and, in the latter case, prints the actual differences. Reporting is
     * done through {@link Printers#info(String)}.
     * 
     *   The two input streams are closed in any case, even on abrupt completion.
     * 
     *
     * @return The number of reported differences
     */
    public long
    diff(String path1, String path2, InputStream stream1, InputStream stream2) throws IOException {

        // Determine which of the "equivalent lines" are effective for this path.
        final Collection effectiveEquivalentLines = new ArrayList();
        for (LineEquivalence le : this.equivalentLines) {
            if (le.pathPattern.evaluate(path1)) effectiveEquivalentLines.add(le.lineRegex);
        }

        // Read the contents of the two pathes.
        Line[] lines1 = this.readAllLines(stream1, effectiveEquivalentLines, path1);
        Line[] lines2 = this.readAllLines(stream2, effectiveEquivalentLines, path2);

        Printers.verbose(
            "''{0}'' ({1} {1,choice,0#lines|1#line|1 effectiveIgnores = new ArrayList();
        for (LineEquivalence ignore : DocumentDiff.this.ignores) {
            if (ignore.pathPattern.evaluate(path1)) effectiveIgnores.add(ignore.lineRegex);
        }

        List differences = this.diff2(lines1, lines2, effectiveIgnores);

        if (differences.isEmpty()) return 0;

        // Report the actual differences.
        switch (DocumentDiff.this.documentDiffMode) {

        case NORMAL:
            DocumentDiff.normalDiff(lines1, lines2, differences);
            break;

        case CONTEXT:
            Printers.info("*** " + path1);
            Printers.info("--- " + path2);
            DocumentDiff.this.contextDiff(lines1, lines2, differences);
            break;

        case UNIFIED:
            Printers.info("--- " + path1);
            Printers.info("+++ " + path2);
            DocumentDiff.this.unifiedDiff(lines1, lines2, differences);
            break;

        default:
            throw new AssertionError();
        }

        return differences.size();
    }

    /**
     * Determines the {@link Difference}s between the given two contents. Invokes {@link #diff3(Line[], Line[])} and
     * adds the ignores feature.
     *
     * @return The found differences
     */
    private List
    diff2(Line[] lines1, Line[] lines2, Collection ignores) {

        // Compute the contents differences.
        List diffs = this.diff3(lines1, lines2);

        Printers.verbose("{0} raw {0,choice,0#differences|1#difference|1 it = diffs.iterator(); it.hasNext();) {
                Difference d = it.next();

                if (d.getDeletedStart() != Difference.NONE) {
                    for (int i = d.getDeletedStart(); i <= d.getDeletedEnd(); i++) {
                        if (!DocumentDiff.contains(lines1[i].text, ignores)) continue IGNORABLE;
                    }
                }
                if (d.getAddedStart() != Difference.NONE) {
                    for (int i = d.getAddedStart(); i <= d.getAddedEnd(); i++) {
                        if (!DocumentDiff.contains(lines2[i].text, ignores)) continue IGNORABLE;
                    }
                }
                it.remove();
            }
            Printers.verbose("Reduced to {0} non-ignorable differences", diffs.size());
        }

        return diffs;
    }

    /**
     * Determines the {@link Difference}s between the given two contents. Implements the {@link #tokenization}
     * feature.
     *
     * @return The found differences
     */
    private List
    diff3(Line[] lines1, Line[] lines2) {

        switch (this.tokenization) {

        case LINE:
            return new org.incava.util.diff.Diff(lines1, lines2).diff();

        case JAVA:
            Map tokenIndexToLineIndex1 = new HashMap(lines1.length);
            Map tokenIndexToLineIndex2 = new HashMap(lines2.length);

            List tokens1 = this.tokenize(lines1, tokenIndexToLineIndex1);
            List tokens2 = this.tokenize(lines2, tokenIndexToLineIndex2);

            // Transform the list of "token differences" into a list of "line differences". Since there can be more
            // than one "token diff" per line, the resulting list could be shorter than the original list.
            List diffs = new org.incava.util.diff.Diff