org.docx4j.diff.Differencer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of docx4j-diffx Show documentation
differencing of docx files
There is a newer version: 11.5.0
/*
 *  Copyright 2007-2008, Plutext Pty Ltd.
 *   
 *  This file is part of docx4j.

    docx4j is licensed under the Apache License, Version 2.0 (the "License"); 
    you may not use this file except in compliance with the License. 

    You may obtain a copy of the License at 

        http://www.apache.org/licenses/LICENSE-2.0 

    Unless required by applicable law or agreed to in writing, software 
    distributed under the License is distributed on an "AS IS" BASIS, 
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
    See the License for the specific language governing permissions and 
    limitations under the License.

 */

package org.docx4j.diff;


import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBElement;
import javax.xml.bind.Marshaller;
import javax.xml.bind.Unmarshaller;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.transform.Source;
import javax.xml.transform.Templates;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.stream.StreamSource;

import org.apache.commons.io.IOUtils;
import org.docx4j.XmlUtils;
import org.docx4j.jaxb.Context;
import org.docx4j.openpackaging.parts.Part;
import org.docx4j.openpackaging.parts.relationships.RelationshipsPart;
import org.docx4j.relationships.Relationship;
import org.docx4j.wml.P;
import org.docx4j.wml.R;
import org.eclipse.compare.StringComparator;
import org.eclipse.compare.rangedifferencer.RangeDifference;
import org.eclipse.compare.rangedifferencer.RangeDifferencer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;

import com.topologi.diffx.Docx4jDriver;
import com.topologi.diffx.Main;
import com.topologi.diffx.config.DiffXConfig;



/**
 * Capable of comparing a pair of:
 * - w:body (only lightly tested)
 * - w:sdtContent (used extensively)
 * - w:p (includes an algorithm aimed at producing a better diff)
 * 
 * See org.docx4j.samples.CompareDocuments for an example of how to use.
 * 
 * @author jason
 *
 */
public class Differencer {
	
	/*
	 * TODO:
	 * 
	 * - handle spaces properly (encode real spaces as something before splitting,
	 *   and add them back in at the end)
	 *    
	 */

	// Class-wide SLF4J logger; the static log(String) hook below writes to it as well.
	protected static Logger log = LoggerFactory.getLogger(Differencer.class);


	/**
	 * Logging hook intended for use from XSLT: writes the given message
	 * to this class's logger at INFO level.
	 *
	 * @param message the text to log
	 */
	public static void log(final String message) {
		log.info(message);
	}
	
	

	// Shared factory for creating org.docx4j.wml objects.
	static org.docx4j.wml.ObjectFactory wmlFactory = new org.docx4j.wml.ObjectFactory();
	
	/**
	 * The rels used in the resulting diff. Entries are added by
	 * registerRelationship: each key is the copied Relationship (carrying
	 * its new id) and each value is the Part that relationship resolved to.
	 */
	private Map composedRels = new HashMap();

	/**
	 * @return the live (not copied) map of rels composed so far
	 */
	public Map getComposedRels() {
		return this.composedRels;
	}
	
	
	
    // Formats the date value handed to the XSLT transforms.
    // NOTE(review): the 'Z' in the pattern is a quoted literal, so it is appended
    // as-is, and no time zone is set on the formatter in the code visible here;
    // confirm that consumers expect this.
    final private static SimpleDateFormat RFC3339_FORMAT 
    	= new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
    
		// SimpleDateFormat is not thread-safe see:
		//   http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6231579
		//   http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6178997
		// solution is to use stateless MessageFormat instead:
		// final private static String RFC3339_FORMAT = "yyyy-MM-dd'T'HH:mm:ss";
		// final private static String RFC3339_PATTERN = "{0,date," + RFC3339_FORMAT + "}";    	

    /** Compiled stylesheet used to transform diffx output into WordprocessingML. */
    static Templates xsltDiffx2Wml;

	/**
	 * Replaces the stylesheet used to transform the diff output into a
	 * Word docx with tracked changes. Unless this is called,
	 * org/docx4j/diff/diffx2wml.xslt is used.
	 *
	 * @param xsltDiffx2Wml the compiled stylesheet to use instead of the default
	 */
	public static void setXsltDiffx2Wml(final Templates xsltDiffx2Wml) {
		Differencer.xsltDiffx2Wml = xsltDiffx2Wml;
	}
    
    
    
    /** Compiled stylesheet applied by markupAsInsertion. */
    static Templates xsltMarkupInsert;
    /** Compiled stylesheet applied by markupAsDeletion. */
    static Templates xsltMarkupDelete;

    // Compile the bundled stylesheets once, when the class is loaded.
    // Each one is loaded independently, so a problem with one stylesheet
    // does not prevent the others from being available.
    static {
		xsltDiffx2Wml = loadBundledTemplates("org/docx4j/diff/diffx2wml.xslt");
		xsltMarkupInsert = loadBundledTemplates("org/docx4j/diff/MarkupInsert.xslt");
		xsltMarkupDelete = loadBundledTemplates("org/docx4j/diff/MarkupDelete.xslt");
    }

	/**
	 * Loads and compiles one stylesheet from the classpath.
	 *
	 * A failure is reported through the class logger (naming the resource)
	 * rather than thrown, so class initialisation still completes.
	 *
	 * @param resourcePath classpath location of the stylesheet
	 * @return the compiled stylesheet, or null if it could not be read or compiled
	 */
	private static Templates loadBundledTemplates(String resourcePath) {
		try {
			Source xsltSource = new StreamSource(
					org.docx4j.utils.ResourceUtils.getResource(resourcePath));
			return XmlUtils.getTransformerTemplate(xsltSource);
		} catch (IOException e) {
			log.error("Could not read stylesheet " + resourcePath, e);
		} catch (TransformerConfigurationException e) {
			log.error("Could not compile stylesheet " + resourcePath, e);
		}
		return null;
	}
    
//	/**
//	 * @param args
//	 */
//	public static void main(String[] args) throws Exception {
//		
//		String BASE_DIR = "/home/dev/workspace/docx4j/src/test/java/org/docx4j/diff/";
//		
//		// Test setup
//		String paraL = BASE_DIR + "t2R";		
//		String paraR = BASE_DIR + "t3L";
//		P pl = loadParagraph(paraL);
//		P pr = loadParagraph(paraR);
//		
//		// Result format
//		StreamResult result = new StreamResult(System.out);
//
//		// Run the diff - FIXME
//		Differencer pd = new Differencer();
//		pd.diff(pl, pr, result, null, null, null, null);
//		
//	}
	
	/**
	 * Counter behind getId(); holds the most recently issued id.
	 */
	public static Integer nextId = 0;

	/**
	 * Issues the id to be allocated to the ins/del: the shared counter
	 * is incremented and the incremented value is returned.
	 *
	 * @return the newly allocated id
	 */
	public final static Integer getId() {
		nextId = nextId + 1;
		return nextId;
	}

	
	/**
	 * Identifier passed into the XSLT (as the "relsDiffIdentifier"
	 * parameter), where it is used as part of the generated rel id.
	 *
	 * Because the resulting document might be built out of the results
	 * of a number of diffs, the ids need to be unique across these diffs.
	 */
	private String relsDiffIdentifier;

	/**
	 * @param relsDiffIdentifier the relsDiffIdentifier to set
	 */
	public void setRelsDiffIdentifier(final String relsDiffIdentifier) {
		this.relsDiffIdentifier = relsDiffIdentifier;
	}

	/**
	 * Xalan extension function, invoked from diffx2wml.xslt.
	 *
	 * Any rel which is present in the results of the comparison must point
	 * to a valid target of the correct type, or the resulting document will
	 * be broken. So the rels part is passed in, and the relationships which
	 * will be needed in the resulting document are accumulated on the
	 * Differencer.
	 *
	 * Because the resulting document might be built out of the results of a
	 * number of diffs, the caller supplies a new id which must be unique
	 * across those diffs.
	 *
	 * Nothing is registered if docPartRels is null, if it has no
	 * relationships object, or if relId cannot be found in it.
	 *
	 * @param pd the Differencer whose composed rels receive the new entry
	 * @param docPartRels the rels part in which relId is looked up
	 * @param relId id of the existing relationship
	 * @param newRelId id given to the copy of that relationship
	 */
	public static void registerRelationship(Differencer pd, 
			RelationshipsPart docPartRels, String relId,
			String newRelId ) {

		// (With a null rels part, Xalan won't even be able to find this function)
		if (docPartRels == null) {
			return;
		}

		if (docPartRels.getRelationships() == null) {
			log.warn("relationships object is null!");
			return;
		}

		log.debug("Looking for rel " + relId);
		final Relationship existing = docPartRels.getRelationshipByID(relId);
		if (existing == null) {
			log.error("Couldn't find rel " + relId);
			return;
		}

		// Resolve the target part before copying the relationship
		final Part targetPart = docPartRels.getPart(existing);

		// Work on a copy, so the relationship in docPartRels keeps its own id
		final Relationship renamed =
				(Relationship)XmlUtils.deepCopy(existing, Context.jcRelationships);
		renamed.setId(newRelId);
		log.debug(".. added rel " + newRelId + " -- " + renamed.getTarget() );

		pd.composedRels.put(renamed, targetPart);
	}

	/**
	 * Compare 2 p objects, returning a result containing
	 * w:ins and w:del elements.
	 *
	 * Delegates to the overload taking a preProcess flag, with that
	 * flag set to false.
	 *
	 * @param pl - the left paragraph
	 * @param pr - the right paragraph
	 * @param result - receives the comparison output
	 * @param author - passed through to the delegate
	 * @param date - passed through to the delegate
	 * @param docPartRelsLeft - rels part for the left paragraph
	 * @param docPartRelsRight - rels part for the right paragraph
	 */
	public void diff(P pl, P pr, javax.xml.transform.Result result, 
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsLeft, RelationshipsPart docPartRelsRight) {

		final boolean preProcess = false;
		this.diff(pl, pr, result, author, date,
				docPartRelsLeft, docPartRelsRight, preProcess);
	}

	/**
	 * Compare 2 w:sdtContent blocks: each is marshalled to a W3C DOM
	 * document, and the two document elements are handed to diffWorker.
	 *
	 * @param cbNewer - the newer content block
	 * @param cbOlder - the older content block
	 * @param result - receives the comparison output
	 * @param author - passed through to diffWorker
	 * @param date - passed through to diffWorker
	 * @param docPartRelsNewer - rels part for the newer content
	 * @param docPartRelsOlder - rels part for the older content
	 */
	public void diff(org.docx4j.wml.SdtContentBlock cbNewer, 
			org.docx4j.wml.SdtContentBlock cbOlder, 
			javax.xml.transform.Result result,
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsNewer, RelationshipsPart docPartRelsOlder) {

		final Node newerRoot =
				org.docx4j.XmlUtils.marshaltoW3CDomDocument(cbNewer).getDocumentElement();
		final Node olderRoot =
				org.docx4j.XmlUtils.marshaltoW3CDomDocument(cbOlder).getDocumentElement();

		diffWorker(newerRoot, olderRoot,
				result, author, date, docPartRelsNewer, docPartRelsOlder);
	}

	/**
	 * Compare 2 w:body objects: each is marshalled to a W3C DOM
	 * document, and the two document elements are handed to diffWorker.
	 *
	 * @param newer - the newer body
	 * @param older - the older body
	 * @param result - receives the comparison output
	 * @param author - passed through to diffWorker
	 * @param date - passed through to diffWorker
	 * @param docPartRelsNewer - rels part for the newer body
	 * @param docPartRelsOlder - rels part for the older body
	 */
	public void diff(org.docx4j.wml.Body newer, 
			org.docx4j.wml.Body older, 
			javax.xml.transform.Result result,
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsNewer, RelationshipsPart docPartRelsOlder) {

		final Node newerRoot =
				org.docx4j.XmlUtils.marshaltoW3CDomDocument(newer).getDocumentElement();
		final Node olderRoot =
				org.docx4j.XmlUtils.marshaltoW3CDomDocument(older).getDocumentElement();

		diffWorker(newerRoot, olderRoot,
				result, author, date, docPartRelsNewer, docPartRelsOlder);
	}
	
	/**
	 * Runs diffx over the two nodes, collecting its output in memory, and
	 * passes that output to toWML. Any exception is rethrown wrapped in a
	 * RuntimeException.
	 *
	 * This is private, in order to control what objects the user
	 * can invoke diff on.  At present there are public methods for
	 * pairs of w:body, w:sdtContent, and w:p.
	 *
	 * TODO: consider/test w:table!
	 */
	private void diffWorker(Node newer, 
			Node older, 
			javax.xml.transform.Result result,
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsNewer, RelationshipsPart docPartRelsOlder) {

		final Writer diffxOutput = new StringWriter();
		try {
			Docx4jDriver.diff(newer, older, diffxOutput);

			final String rawDiff = diffxOutput.toString();
			toWML(rawDiff, result, author, date,
					docPartRelsNewer, docPartRelsOlder);
		} catch (Exception exc) {
			throw new RuntimeException("diffWorker failed.", exc);
		} finally {
			IOUtils.closeQuietly(diffxOutput);
		}
	}
	
	/**
	 * Turns diffx output into WordprocessingML, writing it to the given result.
	 *
	 * The diffx output is first patched so that the a14, w14 and o namespace
	 * prefixes are declared on the first tag that carries a namespace
	 * declaration. It is then passed through combineAdjacent, and finally
	 * handed to transformDiffxOutputToWml.
	 *
	 * Failures are logged and not rethrown, so on error the result may be
	 * left unwritten.
	 *
	 * @param in the diffx output
	 * @param result receives the output of the transform
	 * @param author passed through to transformDiffxOutputToWml
	 * @param date passed through to transformDiffxOutputToWml; may be null
	 * @param docPartRelsNewer passed on as the "left" rels part
	 * @param docPartRelsOlder passed on as the "right" rels part
	 */
	public  void toWML(String in, javax.xml.transform.Result result, String author, java.util.Calendar date,
			RelationshipsPart docPartRelsNewer, RelationshipsPart docPartRelsOlder) {
		
		if (log.isDebugEnabled()) {
			log.debug("in: " + in);
		}
		
		if (in == null) {
			log.error("toWML: no diffx output to transform");
			return;
		}
		
		try {
			
			XMLInputFactory inputFactory = XMLInputFactory.newInstance();
			inputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
			inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, false); // a DTD is merely ignored, its presence doesn't cause an exception
			
			/* 2014 09 09
			 * 
			 * For unknown reasons, diffx may write elements which use a
			 * prefix without the namespace being declared.
			 * 
			 * Maybe something to do with the namespace being declared deep in the input?
			 * 
			 * This is a crude workaround: make sure the prefixes are declared
			 * on the first tag which carries a namespace declaration.
			 * (If there is no "xmlns:" at all, the first tag in the string is used.)
			 */
			int nsIndex = in.indexOf("xmlns:");
			int closeTag = in.indexOf(">", nsIndex);
			if (closeTag < 0) {
				log.error("toWML: diffx output contains no tag to add namespace declarations to");
				return;
			}
			log.debug(in.substring(0, closeTag));
			
			// For an empty-element tag (".../>") the declarations have to go before the '/'
			int insertAt = (closeTag > 0 && in.charAt(closeTag - 1) == '/') ? closeTag - 1 : closeTag;
			
			in = declareNamespaceIfMissing(in, insertAt, 
					"a14", "http://schemas.microsoft.com/office/drawing/2010/main");
			// 2017 10 02: workaround for where right side contains w14:paraId, but left side doesn't
			in = declareNamespaceIfMissing(in, insertAt, 
					"w14", "http://schemas.microsoft.com/office/word/2010/wordml");
			in = declareNamespaceIfMissing(in, insertAt, 
					"o", "urn:schemas-microsoft-com:office:office");
			
			if (log.isDebugEnabled() ) {
				log.debug("Diff result:" + in);
			} 
			Reader reader = new StringReader(in);
			
			String simplified;
			try {
				simplified = combineAdjacent(inputFactory.createXMLStreamReader(reader) );
			} catch (XMLStreamException e) {
				// Without the simplified diff there is nothing to transform
				log.error("toWML: could not parse diffx output", e);
				return;
			}
			
			log.debug("\n\n Diff'd input to transform: \n\n" + simplified );
			
			if (simplified == null) {
				log.error("toWML: combineAdjacent produced no output");
				return;
			}
							
			StreamSource src = new StreamSource(new StringReader(simplified));
			transformDiffxOutputToWml(result, author, date, docPartRelsNewer,
					docPartRelsOlder, src);
			
		} catch (Exception exc) {
			log.error("toWML failed", exc);
		}			
		
	}

	/**
	 * Adds a declaration for the given namespace prefix at insertAt, unless
	 * the text before insertAt already declares exactly that prefix.
	 *
	 * The check looks for "xmlns:prefix=" (including the '='), so that
	 * a declaration of a longer prefix which merely starts with the same
	 * characters is not mistaken for it.
	 *
	 * @param xml the XML text
	 * @param insertAt index in xml at which a missing declaration is inserted
	 * @param prefix the namespace prefix
	 * @param uri the namespace URI to bind the prefix to
	 * @return xml, with the declaration inserted if it was missing
	 */
	private static String declareNamespaceIfMissing(String xml, int insertAt, 
			String prefix, String uri) {
		
		String head = xml.substring(0, insertAt);
		if (head.contains("xmlns:" + prefix + "=")) {
			return xml;
		}
		return head + " xmlns:" + prefix + "=\"" + uri + "\"" + xml.substring(insertAt);
	}

	/**
	 * Applies the diffx2wml stylesheet to the (simplified) diffx output,
	 * supplying this Differencer, the date, the author, both rels parts
	 * and the relsDiffIdentifier as stylesheet parameters.
	 *
	 * @param result receives the output of the transform
	 * @param author value of the "author" parameter
	 * @param date formatted to give the "date" parameter; if null, a fixed
	 *        placeholder date is used instead
	 * @param docPartRelsLeft value of the "docPartRelsLeft" parameter
	 * @param docPartRelsRight value of the "docPartRelsRight" parameter
	 * @param src the input to the transform
	 * @throws Exception
	 */
	private void transformDiffxOutputToWml(javax.xml.transform.Result result,
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsLeft,
			RelationshipsPart docPartRelsRight, StreamSource src)
			throws Exception {

		// TODO FIXME - JAXB requires a real date.
		// What to give it when none was supplied?
		// The alternative is to change the xslt
		// to omit the @date entirely if its unknown
		final String dateString = (date == null)
				? "2009-03-11T17:57:00Z"
				: RFC3339_FORMAT.format(date.getTime());

		final Map params = new java.util.HashMap();
		params.put("Differencer", this);
		params.put("date", dateString);
		params.put("author", author);
		params.put("docPartRelsLeft",  docPartRelsLeft);
		params.put("docPartRelsRight", docPartRelsRight);
		params.put("relsDiffIdentifier", relsDiffIdentifier);

		log.debug("invoking xsltDiffx2Wml");
		XmlUtils.transform(src, xsltDiffx2Wml, params, result);
	}
	
	/**
	 * Marshals the content block to a DOM document and applies the
	 * MarkupInsert stylesheet to it, writing the output to the given result.
	 *
	 * Failures are logged and not rethrown, so on error the result may be
	 * left unwritten.
	 *
	 * @param cbLeft the content block to transform
	 * @param result receives the output of the transform
	 * @param author value of the "author" stylesheet parameter
	 * @param date formatted to give the "date" stylesheet parameter; if null,
	 *        no "date" parameter is set
	 * @param docPartRelsLeft value of the "docPartRelsLeft" stylesheet
	 *        parameter ("docPartRelsRight" is set to null)
	 */
	public void markupAsInsertion(org.docx4j.wml.SdtContentBlock cbLeft, 
			javax.xml.transform.Result result,
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsLeft) {

		try {

	    	// Marshal the content block; the DOM document is the input to the transform
			JAXBContext jc = Context.jc;
			Marshaller marshaller=jc.createMarshaller();
			org.w3c.dom.Document doc = org.docx4j.XmlUtils.neww3cDomDocument();

			marshaller.marshal(cbLeft, doc);
			
			
			Map transformParameters = new java.util.HashMap();
						
			if (date!=null) {				
				String dateString = RFC3339_FORMAT.format(date.getTime()) ;
				transformParameters.put("date", dateString);
			}
			
			transformParameters.put("Differencer", this);
			transformParameters.put("author", author);
			transformParameters.put("docPartRelsLeft",  docPartRelsLeft);
			transformParameters.put("docPartRelsRight", null);
			transformParameters.put("relsDiffIdentifier", relsDiffIdentifier);  
			XmlUtils.transform(doc, xsltMarkupInsert, transformParameters, result);
			
		} catch (Exception exc) {
			// Report through the class logger rather than on stderr
			log.error("markupAsInsertion failed", exc);
		}					

	}

	/**
	 * Marshals the content block to a DOM document and applies the
	 * MarkupDelete stylesheet to it, writing the output to the given result.
	 *
	 * Failures are logged and not rethrown, so on error the result may be
	 * left unwritten.
	 *
	 * @param cbLeft the content block to transform
	 * @param result receives the output of the transform
	 * @param author value of the "author" stylesheet parameter
	 * @param date formatted to give the "date" stylesheet parameter; if null,
	 *        no "date" parameter is set
	 * @param docPartRelsRight value of the "docPartRelsRight" stylesheet
	 *        parameter ("docPartRelsLeft" is set to null)
	 */
	public void markupAsDeletion(org.docx4j.wml.SdtContentBlock cbLeft, 
			javax.xml.transform.Result result,
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsRight) {

		try {

	    	// Marshal the content block; the DOM document is the input to the transform
			JAXBContext jc = Context.jc;
			Marshaller marshaller=jc.createMarshaller();
			org.w3c.dom.Document doc = org.docx4j.XmlUtils.neww3cDomDocument();

			marshaller.marshal(cbLeft, doc);
			
			
			Map transformParameters = new java.util.HashMap();
						
			if (date!=null) {				
				String dateString = RFC3339_FORMAT.format(date.getTime()) ;
				transformParameters.put("date", dateString);
			}
			
			transformParameters.put("Differencer", this);
			transformParameters.put("author", author);
			transformParameters.put("docPartRelsLeft",  null);
			transformParameters.put("docPartRelsRight", docPartRelsRight);
			transformParameters.put("relsDiffIdentifier", relsDiffIdentifier);  
			log.debug("applying xsltMarkupDelete");
			XmlUtils.transform(doc, xsltMarkupDelete, transformParameters, result);
			
		} catch (Exception exc) {
			// Report through the class logger rather than on stderr
			log.error("markupAsDeletion failed", exc);
		}					

	}
	
	
	/**
	 * Compare 2 p objects, returning a result containing
	 * w:ins and w:del elements  
	 * 
	 * @param pl - the left paragraph
	 * @param pr - the right paragraph
	 * @param result 
	 */
	public void diff(P pl, P pr, javax.xml.transform.Result result,
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsLeft, RelationshipsPart docPartRelsRight,
			boolean preProcess) {
		
		
		
		/* In order to get an optimal result when comparing 2 WML paragraphs,
		 * it helps if each can be made to contain matching runs.
		 * 
		 * TODO: ensure each w:r contains one and only one w:t 
		 * 
		 * The process for achieving this involves running the LCS algorithm
		 * on the string content of the paragraph.
		 * 
		 * At this point, you'd actually be done, if you didn't care about
		 * run formatting.  
		 * 
		 * But we do care about run formatting, so the relevant formatting 
		 * is then re-attached to each of the sets of runs.
		 * 
		 * The XML diff is then run on these 'normalised' paragraphs. 
		 * It will tell which of the w:t have been populated/deleted, and
		 * what formatting has changed on their w:r elements.   
		 * 
		 * In terms of actual performance (versus plain old diffx), the
		 * main case where the pre-processing helps:
		 * 
		 * 1. t2R cf t3L
		 * 
			  Left input 
					
					
					    
					        The quick brown 
					    
					    
					        
					            
					            
					            
					        
					        fox
					    
					    
					         jumped over the 
					    
					    
					        
					            
					        
					        lazy
					    
					    
					         dog.
					    
					 
					
					
			  Right input 
					
					
					    
					        The quick brown fox jumped high 
					    
					    
					        high over the lazy dog.
					    
							 
					
					    
		 * 
		 */

        String leftXmlOld = null;
        String rightXmlOld = null;
        if (!preProcess || log.isDebugEnabled() ) {
	        leftXmlOld = org.docx4j.XmlUtils.marshaltoString(pl, true, false);
	        rightXmlOld = org.docx4j.XmlUtils.marshaltoString(pr, true, false);
	        	// NB boolean prettyprint must be set to false
	        	// with diffxConfig 
				//    .setIgnoreWhiteSpace(false);
				//    .setPreserveWhiteSpace(true);
	        	// because otherwise we get ins, del around
	        	// indentation whitespace, and this 
	        	// breaks the transform to wml.

        }

		if (!preProcess) {
			
	        String naive = getDiffxOutput(leftXmlOld, rightXmlOld);

	        // Debug purposes only!
	        log.debug("\n\n naive difference \n\n" );	        
	        log.debug(naive) ;
	        
	        
	        log.info("\n\n  difference without preprocessing  \n\n" );
			try {
				
				XMLInputFactory inputFactory = XMLInputFactory.newInstance();
				inputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
				inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, false); // a DTD is merely ignored, its presence doesn't cause an exception
				
				//java.io.InputStream is = new java.io.ByteArrayInputStream(naive.getBytes("UTF-8"));
				Reader reader = new StringReader(naive);
				String simplified = combineAdjacent(inputFactory.createXMLStreamReader(reader) );
				
				log.debug("\n\n combineAdjacent: \n\n" + simplified );
								
				StreamSource src = new StreamSource(new StringReader(simplified));
				Map transformParameters = new java.util.HashMap();
				transformParameters.put("Differencer", this);
				transformParameters.put("author", author);
				transformParameters.put("docPartRelsLeft",  docPartRelsLeft);
				transformParameters.put("docPartRelsRight", docPartRelsRight);
				transformParameters.put("relsDiffIdentifier", relsDiffIdentifier);  
				XmlUtils.transform(src, xsltDiffx2Wml, transformParameters, result);
				
			} catch (Exception exc) {
				exc.printStackTrace();
			}			
			
			return;
		}
        
        
		// Compute LCS
		StringComparator left = new StringComparator(pl.toString());
		StringComparator right = new StringComparator(pr.toString());
		org.eclipse.compare.internal.LCSSettings settings = new org.eclipse.compare.internal.LCSSettings();
		
		RangeDifference[] rd = RangeDifferencer.findRanges(settings, left, right); 
		
		// Debug Output
		if (log.isDebugEnabled()) {
			log.debug("\n\n RangeDifferences \n\n");									
	        for (int x=0; x pLeftReplacement = new ArrayList