org.docx4j.diff.Differencer Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of docx4j Show documentation
docx4j is a library which helps you to work with the Office Open XML file format as used in docx documents, pptx presentations, and xlsx spreadsheets.
There is a newer version: 6.1.2
Show newest version
/*
 *  Copyright 2007-2008, Plutext Pty Ltd.
 *   
 *  This file is part of docx4j.

    docx4j is licensed under the Apache License, Version 2.0 (the "License"); 
    you may not use this file except in compliance with the License. 

    You may obtain a copy of the License at 

        http://www.apache.org/licenses/LICENSE-2.0 

    Unless required by applicable law or agreed to in writing, software 
    distributed under the License is distributed on an "AS IS" BASIS, 
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
    See the License for the specific language governing permissions and 
    limitations under the License.

 */

package org.docx4j.diff;


import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;


import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBElement;
import javax.xml.bind.Marshaller;
import javax.xml.bind.Unmarshaller;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Source;
import javax.xml.transform.Templates;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import javax.xml.stream.*;
import javax.xml.stream.events.*;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamWriter;

import org.apache.log4j.Logger;
import org.docx4j.XmlUtils;
import org.docx4j.wml.P;
import org.docx4j.wml.R;

import org.eclipse.compare.StringComparator;
import org.eclipse.compare.rangedifferencer.RangeDifference;
import org.docx4j.jaxb.Context;
import org.docx4j.openpackaging.parts.relationships.RelationshipsPart;
import org.docx4j.relationships.Relationship;

import org.eclipse.compare.rangedifferencer.RangeDifferencer;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;

import com.topologi.diffx.Docx4jDriver;
import com.topologi.diffx.Main;
import com.topologi.diffx.config.DiffXConfig;



/**
 * Capable of comparing a pair of:
 * - w:body (only lightly tested)
 * - w:sdtContent (used extensively)
 * - w:p (includes an algorithm aimed at producing a better diff)
 * 
 * See org.docx4j.samples.CompareDocuments for an example of how to use.
 * 
 * @author jason
 *
 */
public class Differencer {
	
	/*
	 * TODO:
	 * 
	 * - handle spaces properly (encode real spaces as something before splitting,
	 *   and add them back in at the end)
	 *    
	 */

	// log4j logger shared by all static and instance code in this class.
	protected static Logger log = Logger.getLogger(Differencer.class);


	/**
	 * Writes the given message to this class's logger at INFO level.
	 * 
	 * The method is public and static because it is intended for use from
	 * the XSLT (presumably as an extension function; confirm against the
	 * stylesheets, which are not part of this file).
	 * 
	 * @param message the text to log
	 */
	public static void log(String message ) {		
		log.info(message);
	}
	
	

	// Factory for org.docx4j.wml objects.
	// NOTE(review): no use of this field is visible in this part of the file.
	static org.docx4j.wml.ObjectFactory wmlFactory = new org.docx4j.wml.ObjectFactory();
	
	/**
	 * The rels used in the resulting diff.  registerRelationship appends a
	 * re-identified copy of a source relationship to this list each time
	 * it is invoked successfully.
	 */
	private List<Relationship> composedRels = new ArrayList<Relationship>();

	/**
	 * @return the live (not copied) list of relationships accumulated so
	 *         far for the result document; never null
	 */
	public List<Relationship> getComposedRels() {
		return composedRels;
	}
	
	
	
    // Formats the Calendar supplied by callers into the string handed to the
    // stylesheets as the "date" parameter.  The trailing 'Z' is a quoted
    // literal in the pattern; no time zone is set on the formatter here, so
    // the formatter's default time zone applies.
    final private static SimpleDateFormat RFC3339_FORMAT 
    	= new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
    
		// Known issue: SimpleDateFormat is not thread-safe, yet this instance
		// is static and therefore shared by every Differencer.  See:
		//   http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6231579
		//   http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6178997
		// One suggested solution is to use a stateless MessageFormat instead:
		// final private static String RFC3339_FORMAT = "yyyy-MM-dd'T'HH:mm:ss";
		// final private static String RFC3339_PATTERN = "{0,date," + RFC3339_FORMAT + "}";    	

    // Compiled stylesheet applied to the diffx output to produce WML.
    // The static initializer of this class sets it from
    // org/docx4j/diff/diffx2wml.xslt; setXsltDiffx2Wml can replace it.
    static Templates xsltDiffx2Wml;
    
	/**
	 * Replaces the stylesheet used to transform the diff output into
	 * a Word docx with tracked changes.
	 * 
	 * org/docx4j/diff/diffx2wml.xslt will be used by default; this
	 * method allows you to use your own xslt instead.  The field is
	 * static, so the replacement applies to every Differencer instance.
	 * 
	 * @param xsltDiffx2Wml the compiled stylesheet to use from now on;
	 *        it is stored as given, without any null check
	 */
	public static void setXsltDiffx2Wml(Templates xsltDiffx2Wml) {
		Differencer.xsltDiffx2Wml = xsltDiffx2Wml;
	}
    
    
    
    // Compiled org/docx4j/diff/MarkupInsert.xslt; applied by markupAsInsertion.
    static Templates xsltMarkupInsert;
    // Compiled org/docx4j/diff/MarkupDelete.xslt; applied by markupAsDeletion.
    static Templates xsltMarkupDelete;
    
    /*
     * Compiles the three stylesheets this class relies on, once, at class
     * load time.  They are loaded in order inside a single try block, so a
     * failure on one leaves it and every later template null.  
     * 
     * Loading stays best-effort (no exception escapes the initializer), but
     * the failure is now recorded through the class logger together with
     * its cause, rather than only being printed to stderr.
     */
    static {
		try {
			Source xsltSource = new StreamSource(org.docx4j.utils.ResourceUtils
					.getResource("org/docx4j/diff/diffx2wml.xslt"));
			xsltDiffx2Wml = XmlUtils.getTransformerTemplate(xsltSource);

			xsltSource = new StreamSource(org.docx4j.utils.ResourceUtils
					.getResource("org/docx4j/diff/MarkupInsert.xslt"));
			xsltMarkupInsert = XmlUtils.getTransformerTemplate(xsltSource);

			xsltSource = new StreamSource(org.docx4j.utils.ResourceUtils
					.getResource("org/docx4j/diff/MarkupDelete.xslt"));
			xsltMarkupDelete = XmlUtils.getTransformerTemplate(xsltSource);
		} catch (IOException e) {
			log.error("Couldn't read the diff stylesheets; "
					+ "diff and markup operations will fail", e);
		} catch (TransformerConfigurationException e) {
			log.error("Couldn't compile the diff stylesheets; "
					+ "diff and markup operations will fail", e);
		}
    	
    }
    
//	/**
//	 * @param args
//	 */
//	public static void main(String[] args) throws Exception {
//		
//		String BASE_DIR = "/home/dev/workspace/docx4j/src/test/java/org/docx4j/diff/";
//		
//		// Test setup
//		String paraL = BASE_DIR + "t2R";		
//		String paraR = BASE_DIR + "t3L";
//		P pl = loadParagraph(paraL);
//		P pr = loadParagraph(paraR);
//		
//		// Result format
//		StreamResult result = new StreamResult(System.out);
//
//		// Run the diff - FIXME
//		Differencer pd = new Differencer();
//		pd.diff(pl, pr, result, null, null, null, null);
//		
//	}
	
	/**
	 * Hands out the id to be allocated to the next ins/del.
	 * 
	 * Each call advances the shared counter {@link #nextId} by one and
	 * returns the advanced value, so the first call on a fresh class
	 * returns 1.  No synchronisation is performed.
	 * 
	 * @return the newly allocated id
	 */
	public final static Integer getId() {		
		nextId = nextId + 1;
		return nextId;
	}
	// Last id handed out by getId(); 0 until the first call.
	public static Integer nextId = 0;

	
	/**
	 * Identifier distinguishing the rels generated by this diff from
	 * those generated by other diffs.
	 * 
	 * Because the resulting document might be built out of the 
	 * results of a number of diffs, we need to be sure that the id's
	 * are unique across these diffs.
	 * 
	 * This is passed into the XSLT (as the "relsDiffIdentifier"
	 * parameter), where it is used as part of the generated rel id.
	 * It is null until setRelsDiffIdentifier is called.
	 */
	private String relsDiffIdentifier;  
	/**
	 * Sets the identifier handed to the XSLT for building rel ids.
	 * 
	 * @param relsDiffIdentifier the relsDiffIdentifier to set
	 */
	public void setRelsDiffIdentifier(String relsDiffIdentifier) {
		this.relsDiffIdentifier = relsDiffIdentifier;
	}

	/**
	 * Copies the relationship <code>relId</code> out of 
	 * <code>docPartRels</code>, gives the copy the id 
	 * <code>newRelId</code>, and appends it to the composed rels of 
	 * <code>pd</code>.
	 * 
	 * Any rel which is present in the results of the comparison must point to
	 * a valid target of the correct type, or the resulting document will
	 * be broken.  
	 * 
	 * So we pass the old and new rels objects, and
	 * progressively build up a List of relationships which will need to be
	 * in the resulting document.
	 * 
	 * Because the resulting document might be built out of the 
	 * results of a number of diffs, we need to be sure that the id's
	 * are unique across these diffs; hence the caller-supplied new id.
	 * 
	 * Nothing is registered (and nothing is thrown) if 
	 * <code>docPartRels</code> is null, if it has no relationships object, 
	 * or if it contains no relationship with the requested id.
	 * 
	 * @param pd          the Differencer whose composed rels list receives the copy
	 * @param docPartRels the rels part in which to look up <code>relId</code>
	 * @param relId       id of the relationship in <code>docPartRels</code>
	 * @param newRelId    id to give the copy
	 */
	public static void registerRelationship(Differencer pd, 
			RelationshipsPart docPartRels, String relId,
			String newRelId ) {

		
		if (docPartRels==null) {
			// (In this case, Xalan won't even be able to find this function)
			return;
		}
		
		if (docPartRels.getRelationships()==null) {
			log.warn("relationships object is null!");
			return;
		}
		
		
		// Routine progress message, so DEBUG; ERROR is kept for the
		// genuine failure below.
		log.debug("Looking for rel " + relId);
		Relationship r = docPartRels.getRelationshipByID(relId);
		if (r==null) {
			log.error("Couldn't find rel " + relId);
			return;
		}
		
		// Work on a copy, so that the source rels part is left untouched 
		// when the id is changed.
		Relationship r2 = (Relationship)XmlUtils.deepCopy(r, Context.jcRelationships);
		
		r2.setId(newRelId);
		log.debug(".. added rel " + newRelId + " -- " + r2.getTarget() );
		
		pd.composedRels.add(r2);
	}

	/**
	 * Compare 2 p objects, returning a result containing
	 * w:ins and w:del elements.
	 * 
	 * Convenience overload which delegates to the 8 argument form
	 * with pre-processing switched off.  
	 * 
	 * @param pl - the left paragraph
	 * @param pr - the right paragraph
	 * @param result - receives the output of the comparison
	 * @param author - passed through to the 8 argument form
	 * @param date - passed through to the 8 argument form
	 * @param docPartRelsLeft - rels part belonging to the left paragraph
	 * @param docPartRelsRight - rels part belonging to the right paragraph
	 */
	public void diff(P pl, P pr, javax.xml.transform.Result result, 
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsLeft, RelationshipsPart docPartRelsRight) {

		final boolean preProcess = false;
		this.diff(pl, pr, result, author, date, 
				docPartRelsLeft, docPartRelsRight, preProcess);
	}

	/**
	 * Compare 2 w:sdtContent objects.  Each is marshalled to a W3C DOM 
	 * document (the newer one first), and the two document elements 
	 * are handed to diffWorker together with the remaining arguments.
	 * 
	 * @param cbNewer - the newer content block
	 * @param cbOlder - the older content block
	 * @param result - receives the output of the comparison
	 * @param author - passed through to diffWorker
	 * @param date - passed through to diffWorker
	 * @param docPartRelsNewer - rels part belonging to the newer content
	 * @param docPartRelsOlder - rels part belonging to the older content
	 */
	public void diff(org.docx4j.wml.SdtContentBlock cbNewer, 
			org.docx4j.wml.SdtContentBlock cbOlder, 
			javax.xml.transform.Result result,
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsNewer, RelationshipsPart docPartRelsOlder) {
		
		Node newerRoot 
			= org.docx4j.XmlUtils.marshaltoW3CDomDocument(cbNewer).getDocumentElement();
		Node olderRoot 
			= org.docx4j.XmlUtils.marshaltoW3CDomDocument(cbOlder).getDocumentElement();

		diffWorker(newerRoot, olderRoot, 
				result, author, date, docPartRelsNewer, docPartRelsOlder);
	}

	/**
	 * Compare 2 w:body objects.  Each is marshalled to a W3C DOM 
	 * document (the newer one first), and the two document elements 
	 * are handed to diffWorker together with the remaining arguments.
	 * 
	 * @param newer - the newer body
	 * @param older - the older body
	 * @param result - receives the output of the comparison
	 * @param author - passed through to diffWorker
	 * @param date - passed through to diffWorker
	 * @param docPartRelsNewer - rels part belonging to the newer body
	 * @param docPartRelsOlder - rels part belonging to the older body
	 */
	public void diff(org.docx4j.wml.Body newer, 
			org.docx4j.wml.Body older, 
			javax.xml.transform.Result result,
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsNewer, RelationshipsPart docPartRelsOlder) {
		
		Node newerRoot 
			= org.docx4j.XmlUtils.marshaltoW3CDomDocument(newer).getDocumentElement();
		Node olderRoot 
			= org.docx4j.XmlUtils.marshaltoW3CDomDocument(older).getDocumentElement();

		diffWorker(newerRoot, olderRoot, 
				result, author, date, docPartRelsNewer, docPartRelsOlder);
	}
	
	/**
	 * Runs diffx over a pair of DOM nodes, post-processes the diffx output
	 * with combineAdjacent, and transforms what that returns to WML using 
	 * transformDiffxOutputToWml, which writes to <code>result</code>.
	 * 
	 * This is private, in order to control what objects the user
	 * can invoke diff on.  At present there are public methods for
	 * pairs of w:body, w:sdtContent, and w:p.  
	 * 
	 * No exception is propagated to the caller.  If any of the three steps 
	 * fails, the failure is logged at ERROR level and the method returns 
	 * without attempting the later steps.
	 * 
	 * TODO: consider/test w:table! 
	 * 
	 * @param newer  root node of the newer content
	 * @param older  root node of the older content
	 * @param result receives the transformed output
	 * @param author passed through to the transform
	 * @param date   passed through to the transform; may be null
	 * @param docPartRelsNewer rels part belonging to the newer content
	 * @param docPartRelsOlder rels part belonging to the older content
	 */
	private void diffWorker(Node newer, 
			Node older, 
			javax.xml.transform.Result result,
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsNewer, RelationshipsPart docPartRelsOlder) {

		// Step 1: the raw diffx comparison
		Writer diffxResult = new StringWriter();
		try {
			Docx4jDriver.diff(newer,
					   older,
					   diffxResult);
				// The signature which takes Reader objects appears to be broken
			diffxResult.close();
		} catch (Exception exc) {
			// Without diffx output there is nothing to transform, so stop
			// here instead of tripping over a missing result further down.
			log.error("diffx comparison failed; nothing to transform", exc);
			return;
		}
		
		String diffxOutput = diffxResult.toString();
		if (log.isDebugEnabled() ) {
			log.debug("Diff result:" + diffxOutput);
		}
		
		// Step 2: post-process the diffx output via StAX
		String simplified = null;
		try {
			/*
			 * With JDK 1.5, you need to supply a stax jar, or 
			 * XMLInputFactory.newInstance() will fail with:
			 * 
			 * javax.xml.stream.FactoryConfigurationError: Provider com.bea.xml.stream.MXParserFactory not found
			 * 
			 * This is not necessary if you use Java 6. 
			 * 
			 * From http://java.sun.com/webservices/docs/1.6/tutorial/doc/SJSXP4.html
			 * the implementation class is located by trying, in order:
			 * 
			 *   1. the javax.xml.stream.XMLInputFactory system property;
			 *   2. the lib/xml.stream.properties file in the JRE directory;
			 *   3. the Services API, if available (the 
			 *      META-INF/services/javax.xml.stream.XMLInputFactory 
			 *      files in jars available to the JRE);
			 *   4. the platform default XMLInputFactory instance.
			 */
			XMLInputFactory inputFactory = XMLInputFactory.newInstance();
			simplified = combineAdjacent(
					inputFactory.createXMLStreamReader(new StringReader(diffxOutput)) );
		} catch (Exception exc) {
			log.error("Couldn't post-process the diffx output", exc);
			return;
		}
		if (simplified==null) {
			// A StringReader can't be created over null
			log.error("Post-processing the diffx output produced nothing to transform");
			return;
		}
		
		log.debug("\n\n Diff'd input to transform: \n\n" + simplified );
						
		// Step 3: transform to WML
		try {
			StreamSource src = new StreamSource(new StringReader(simplified));
			transformDiffxOutputToWml(result, author, date, docPartRelsNewer,
					docPartRelsOlder, src);
		} catch (Exception exc) {
			log.error("Couldn't transform the diffx output to WML", exc);
		}			
		
	}

	/**
	 * Applies the xsltDiffx2Wml stylesheet to <code>src</code>, writing 
	 * the output to <code>result</code>.
	 * 
	 * The stylesheet is given these parameters: "Differencer" (this 
	 * object), "date", "author", "docPartRelsLeft", "docPartRelsRight" 
	 * and "relsDiffIdentifier".
	 * 
	 * @param result receives the output of the transform
	 * @param author value of the "author" parameter
	 * @param date   formatted with RFC3339_FORMAT to give the "date" 
	 *               parameter; if null, a fixed placeholder date is used
	 * @param docPartRelsLeft  value of the "docPartRelsLeft" parameter
	 * @param docPartRelsRight value of the "docPartRelsRight" parameter
	 * @param src    the input to the transform
	 * @throws Exception whatever XmlUtils.transform throws
	 */
	private void transformDiffxOutputToWml(javax.xml.transform.Result result,
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsLeft,
			RelationshipsPart docPartRelsRight, StreamSource src)
			throws Exception {
		Map<String, Object> transformParameters = new java.util.HashMap<String, Object>();
					
		String dateString;
		if (date!=null) {				
			// SimpleDateFormat is not thread-safe, and this instance is 
			// static, so guard the call.
			synchronized (RFC3339_FORMAT) {
				dateString = RFC3339_FORMAT.format(date.getTime()) ;
			}
		} else {
			// TODO FIXME - JAXB requires a real date.
			// What to give it?  
			// The alternative is to change the xslt
			// to omit the @date entirely if its unknown
			dateString = "2009-03-11T17:57:00Z";
		}
		transformParameters.put("Differencer", this);
		transformParameters.put("date", dateString);
		transformParameters.put("author", author);
		transformParameters.put("docPartRelsLeft",  docPartRelsLeft);
		transformParameters.put("docPartRelsRight", docPartRelsRight);
		transformParameters.put("relsDiffIdentifier", relsDiffIdentifier);  
		
		XmlUtils.transform(src, xsltDiffx2Wml, transformParameters, result);
	}
	
	/**
	 * Marshals the content block to a DOM document and applies the 
	 * xsltMarkupInsert stylesheet to it, writing the output to 
	 * <code>result</code>.  (The stylesheet is not part of this file; 
	 * going by its name, it marks the content up as inserted.)
	 * 
	 * The stylesheet is given the parameters "Differencer" (this object), 
	 * "author", "docPartRelsLeft", "docPartRelsRight" (always null here), 
	 * "relsDiffIdentifier", and, only when <code>date</code> is not null, 
	 * "date".
	 * 
	 * No exception is propagated to the caller; a failure is logged at 
	 * ERROR level.
	 * 
	 * @param cbLeft the content to mark up
	 * @param result receives the output of the transform
	 * @param author value of the "author" parameter
	 * @param date   formatted with RFC3339_FORMAT to give the "date" 
	 *               parameter; may be null
	 * @param docPartRelsLeft value of the "docPartRelsLeft" parameter
	 */
	public void markupAsInsertion(org.docx4j.wml.SdtContentBlock cbLeft, 
			javax.xml.transform.Result result,
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsLeft) {

		try {

	    	// Marshal the content; the DOM document is the transform's input
			Marshaller marshaller = Context.jc.createMarshaller();
			org.w3c.dom.Document doc = org.docx4j.XmlUtils.neww3cDomDocument();

			marshaller.marshal(cbLeft, doc);
			
			
			Map<String, Object> transformParameters = new java.util.HashMap<String, Object>();
						
			if (date!=null) {				
				String dateString;
				// SimpleDateFormat is not thread-safe, and this instance 
				// is static, so guard the call.
				synchronized (RFC3339_FORMAT) {
					dateString = RFC3339_FORMAT.format(date.getTime()) ;
				}
				transformParameters.put("date", dateString);
			}
			
			transformParameters.put("Differencer", this);
			transformParameters.put("author", author);
			transformParameters.put("docPartRelsLeft",  docPartRelsLeft);
			transformParameters.put("docPartRelsRight", null);
			transformParameters.put("relsDiffIdentifier", relsDiffIdentifier);  
			XmlUtils.transform(doc, xsltMarkupInsert, transformParameters, result);
			
		} catch (Exception exc) {
			log.error("Couldn't mark up content as an insertion", exc);
		}					

	}

	/**
	 * Marshals the content block to a DOM document and applies the 
	 * xsltMarkupDelete stylesheet to it, writing the output to 
	 * <code>result</code>.  (The stylesheet is not part of this file; 
	 * going by its name, it marks the content up as deleted.)
	 * 
	 * The stylesheet is given the parameters "Differencer" (this object), 
	 * "author", "docPartRelsLeft" (always null here), "docPartRelsRight", 
	 * "relsDiffIdentifier", and, only when <code>date</code> is not null, 
	 * "date".
	 * 
	 * No exception is propagated to the caller; a failure is logged at 
	 * ERROR level.
	 * 
	 * @param cbLeft the content to mark up
	 * @param result receives the output of the transform
	 * @param author value of the "author" parameter
	 * @param date   formatted with RFC3339_FORMAT to give the "date" 
	 *               parameter; may be null
	 * @param docPartRelsRight value of the "docPartRelsRight" parameter
	 */
	public void markupAsDeletion(org.docx4j.wml.SdtContentBlock cbLeft, 
			javax.xml.transform.Result result,
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsRight) {

		try {

	    	// Marshal the content; the DOM document is the transform's input
			Marshaller marshaller = Context.jc.createMarshaller();
			org.w3c.dom.Document doc = org.docx4j.XmlUtils.neww3cDomDocument();

			marshaller.marshal(cbLeft, doc);
			
			
			Map<String, Object> transformParameters = new java.util.HashMap<String, Object>();
						
			if (date!=null) {				
				String dateString;
				// SimpleDateFormat is not thread-safe, and this instance 
				// is static, so guard the call.
				synchronized (RFC3339_FORMAT) {
					dateString = RFC3339_FORMAT.format(date.getTime()) ;
				}
				transformParameters.put("date", dateString);
			}
			
			transformParameters.put("Differencer", this);
			transformParameters.put("author", author);
			transformParameters.put("docPartRelsLeft",  null);
			transformParameters.put("docPartRelsRight", docPartRelsRight);
			transformParameters.put("relsDiffIdentifier", relsDiffIdentifier);  
			XmlUtils.transform(doc, xsltMarkupDelete, transformParameters, result);
			
		} catch (Exception exc) {
			log.error("Couldn't mark up content as a deletion", exc);
		}					

	}
	
	
	/**
	 * Compare 2 p objects, returning a result containing
	 * w:ins and w:del elements  
	 * 
	 * @param pl - the left paragraph
	 * @param pr - the right paragraph
	 * @param result 
	 */
	public void diff(P pl, P pr, javax.xml.transform.Result result,
			String author, java.util.Calendar date,
			RelationshipsPart docPartRelsLeft, RelationshipsPart docPartRelsRight,
			boolean preProcess) {
		
		
		
		/* In order to get an optimal result when comparing 2 WML paragraphs,
		 * it helps if each can be made to contain matching runs.
		 * 
		 * TODO: ensure each w:r contains one and only one w:t 
		 * 
		 * The process for achieving this involves running the LCS algorithm
		 * on the string content of the paragraph.
		 * 
		 * At this point, you'd actually be done, if you didn't care about
		 * run formatting.  
		 * 
		 * But we do care about run formatting, so the relevant formatting 
		 * is then re-attached to each of the sets of runs.
		 * 
		 * The XML diff is then run on these 'normalised' paragraphs. 
		 * It will tell which of the w:t have been populated/deleted, and
		 * what formatting has changed on their w:r elements.   
		 * 
		 * In terms of actual performance (versus plain old diffx), the
		 * main case where the pre-processing helps:
		 * 
		 * 1. t2R cf t3L
		 * 
			  Left input 
					
					
					    
					        The quick brown 
					    
					    
					        
					            
					            
					            
					        
					        fox
					    
					    
					         jumped over the 
					    
					    
					        
					            
					        
					        lazy
					    
					    
					         dog.
					    
					 
					
					
			  Right input 
					
					
					    
					        The quick brown fox jumped high 
					    
					    
					        high over the lazy dog.
					    
							 
					
					    
		 * 
		 */

        String leftXmlOld = null;
        String rightXmlOld = null;
        if (!preProcess || log.isDebugEnabled() ) {
	        leftXmlOld = org.docx4j.XmlUtils.marshaltoString(pl, true, false);
	        rightXmlOld = org.docx4j.XmlUtils.marshaltoString(pr, true, false);
	        	// NB boolean prettyprint must be set to false
	        	// with diffxConfig 
				//    .setIgnoreWhiteSpace(false);
				//    .setPreserveWhiteSpace(true);
	        	// because otherwise we get ins, del around
	        	// indentation whitespace, and this 
	        	// breaks the transform to wml.

        }

		if (!preProcess) {
			
	        String naive = getDiffxOutput(leftXmlOld, rightXmlOld);

	        // Debug purposes only!
	        log.debug("\n\n naive difference \n\n" );	        
	        log.debug(naive) ;
	        
	        
	        log.info("\n\n  difference without preprocessing  \n\n" );
			try {
				
				XMLInputFactory inputFactory = XMLInputFactory.newInstance();
				//java.io.InputStream is = new java.io.ByteArrayInputStream(naive.getBytes("UTF-8"));
				Reader reader = new StringReader(naive);
				String simplified = combineAdjacent(inputFactory.createXMLStreamReader(reader) );
				
				log.debug("\n\n combineAdjacent: \n\n" + simplified );
								
				StreamSource src = new StreamSource(new StringReader(simplified));
				Map transformParameters = new java.util.HashMap();
				transformParameters.put("Differencer", this);
				transformParameters.put("author", author);
				transformParameters.put("docPartRelsLeft",  docPartRelsLeft);
				transformParameters.put("docPartRelsRight", docPartRelsRight);
				transformParameters.put("relsDiffIdentifier", relsDiffIdentifier);  
				XmlUtils.transform(src, xsltDiffx2Wml, transformParameters, result);
				
			} catch (Exception exc) {
				exc.printStackTrace();
			}			
			
			return;
		}
        
        
		// Compute LCS
		StringComparator left = new StringComparator(pl.toString());
		StringComparator right = new StringComparator(pr.toString());
		org.eclipse.compare.internal.LCSSettings settings = new org.eclipse.compare.internal.LCSSettings();
		
		RangeDifference[] rd = RangeDifferencer.findRanges(settings, left, right); 
		
		// Debug Output
		if (log.isDebugEnabled()) {
			log.debug("\n\n RangeDifferences \n\n");									
	        for (int x=0; x pLeftReplacement = new ArrayList