org.docx4j.model.datastorage.OpenDoPEHandler Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of docx4j Show documentation
docx4j is a library which helps you to work with the Office Open XML file format as used in docx documents, pptx presentations, and xlsx spreadsheets.
The newest version!
/**
 *  Copyright 2010-2013, Plutext Pty Ltd.
 *   
 *  This file is part of docx4j.

    docx4j is licensed under the Apache License, Version 2.0 (the "License"); 
    you may not use this file except in compliance with the License. 

    You may obtain a copy of the License at 

        http://www.apache.org/licenses/LICENSE-2.0 

    Unless required by applicable law or agreed to in writing, software 
    distributed under the License is distributed on an "AS IS" BASIS, 
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
    See the License for the specific language governing permissions and 
    limitations under the License.

 **/
package org.docx4j.model.datastorage;

import static org.docx4j.model.datastorage.XPathEnhancerParser.enhanceXPath;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.bind.JAXBElement;
import javax.xml.bind.JAXBException;

import org.apache.commons.lang3.StringUtils;
import org.docx4j.Docx4jProperties;
import org.docx4j.TraversalUtil;
import org.docx4j.TraversalUtil.CallbackImpl;
import org.docx4j.XmlUtils;
import org.docx4j.jaxb.Context;
import org.docx4j.model.sdt.QueryString;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.CustomXmlDataStoragePart;
import org.docx4j.openpackaging.parts.CustomXmlPart;
import org.docx4j.openpackaging.parts.WordprocessingML.FooterPart;
import org.docx4j.openpackaging.parts.WordprocessingML.HeaderPart;
import org.docx4j.openpackaging.parts.opendope.JaxbCustomXmlDataStoragePart;
import org.docx4j.openpackaging.parts.relationships.Namespaces;
import org.docx4j.openpackaging.parts.relationships.RelationshipsPart;
import org.docx4j.relationships.Relationship;
import org.docx4j.w15.CTSdtRepeatedSection;
import org.docx4j.wml.CTDataBinding;
import org.docx4j.wml.CTLock;
import org.docx4j.wml.ContentAccessor;
import org.docx4j.wml.Id;
import org.docx4j.wml.P;
import org.docx4j.wml.SdtContent;
import org.docx4j.wml.SdtElement;
import org.docx4j.wml.SdtPr;
import org.docx4j.wml.Tag;
import org.docx4j.wml.Tbl;
import org.docx4j.wml.Tc;
import org.docx4j.wml.Tr;
import org.opendope.conditions.Condition;
import org.opendope.xpaths.Xpaths;
import org.opendope.xpaths.Xpaths.Xpath;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;

public class OpenDoPEHandler {

	private static Logger log = LoggerFactory.getLogger(OpenDoPEHandler.class);
	
	public static boolean ENABLE_XPATH_CACHE = true;
	
	private Map xpathsMap = null; 
	private Map conditionsMap = null; 

	public OpenDoPEHandler(WordprocessingMLPackage wordMLPackage)
			throws Docx4JException {

		this.wordMLPackage = wordMLPackage;

		if (wordMLPackage.getMainDocumentPart().getXPathsPart() == null) {
			log.info("OpenDoPE XPaths part missing (ok if you are just processing w15 repeatingSection)");
//			xPaths = new org.opendope.xpaths.Xpaths();
			xpathsMap = new HashMap();
		} else {
			org.opendope.xpaths.Xpaths xPaths = wordMLPackage.getMainDocumentPart().getXPathsPart()
					.getJaxbElement();
            if(log.isDebugEnabled()) {
                log.debug(XmlUtils.marshaltoString(xPaths, true, true));
            }
			
			xpathsMap = new HashMap(2*xPaths.getXpath().size());
			
			for (Xpaths.Xpath xp : xPaths.getXpath() ) {
				
				if (xpathsMap.put(xp.getId(), xp)!=null) {
					log.error("Duplicates in XPaths part: " + xp.getId());
				}
				// TODO key should include storeItemID?
			}
			
		}
		
		if (wordMLPackage.getMainDocumentPart().getConditionsPart() != null) {
			org.opendope.conditions.Conditions conditions = wordMLPackage.getMainDocumentPart()
					.getConditionsPart().getJaxbElement();
            if(log.isDebugEnabled()) {
                log.debug(XmlUtils.marshaltoString(conditions, true, true));
            }
			
			conditionsMap = new HashMap(2*conditions.getCondition().size());
			
			for (Condition c : conditions.getCondition()) {
				if (conditionsMap.put(c.getId(), c)!=null) {
					log.error("Duplicates in Conditions part: " + c.getId());
				}
			}
		}
		
		if (wordMLPackage.getMainDocumentPart().getComponentsPart() != null) {
			components = wordMLPackage.getMainDocumentPart()
					.getComponentsPart().getJaxbElement();
            if(log.isDebugEnabled()) {
                log.debug(XmlUtils.marshaltoString(components, true, true));
            }
		}

		// Precalculate repeat counts, for approx 30% gain
		// @ since 3.3.6
		if (ENABLE_XPATH_CACHE) {
						
			// We're only caching the first one we encounter
			// (even though, in principle, there could be multiple, and ODH is set up for that)
			CustomXmlPart cxp =
					CustomXmlDataStoragePartSelector.getCustomXmlDataStoragePart(
							(WordprocessingMLPackage)wordMLPackage);
		
			if (cxp==null) {
				log.warn("No CustomXmlDataStoragePart found; can't cache.");
				/* TODO: would fail on StandardisedAnswersPart
				 * since that extends JaxbCustomXmlDataStoragePart
				 */
			} else if (cxp instanceof CustomXmlDataStoragePart) {
				
				CustomXmlDataStoragePart cdsp = (CustomXmlDataStoragePart)cxp;
				
				long start = System.currentTimeMillis();
			
				Document data = cdsp.getData().getDocument();
				
				domToXPathMap = new DomToXPathMap(data);
				domToXPathMap.map();
//				countMap = domToXPathMap.getCountMap();
				long end = System.currentTimeMillis();
				long time = end - start;
	
				log.debug("Mapped " + domToXPathMap.getCountMap().size() + " in " + time + "ms");
				
			} else if (cxp instanceof JaxbCustomXmlDataStoragePart) {
				
				Document data = XmlUtils.neww3cDomDocument();
				try {
					((JaxbCustomXmlDataStoragePart)cxp).marshal(data);
				} catch (JAXBException e) {
					throw new Docx4JException("Problem caching JaxbCustomXmlDataStoragePart", e);
				}
				domToXPathMap = new DomToXPathMap(data);
				domToXPathMap.map();
//				countMap = domToXPathMap.getCountMap();
				log.debug("Mapped " + domToXPathMap.getCountMap().size() );
				
			} else {
				log.warn("TODO: cache " + cxp.getClass().getName() );
			}
		}
				
		
		shallowTraversor = new ShallowTraversor();
		shallowTraversor.wordMLPackage = wordMLPackage;
		
		bookmarkRenumber = new BookmarkRenumber(wordMLPackage);
	}

	private DomToXPathMap domToXPathMap = null;
	public DomToXPathMap getDomToXPathMap() {
		return domToXPathMap;
	}

	private WordprocessingMLPackage wordMLPackage;
	private ShallowTraversor shallowTraversor;
	
	private BookmarkRenumber bookmarkRenumber;
	
	/**
	 * Provide a way to for user to fetch the starting bookmark ID number
	 * for use in the next stage (ie Binding Traverse).
	 * 
	 * If it isn't fetched/set, the value will have to be recalculated (less efficient).
	 *  
	 * @since 3.2.1
	 */	
	public AtomicInteger getNextBookmarkId() {
		return bookmarkRenumber.getBookmarkId();
	}

	public final static String BINDING_ROLE_XPATH = "od:xpath";

	public final static String BINDING_ROLE_CONDITIONAL = "od:condition";
	public final static String BINDING_RESULT_CONDITION_FALSE = "od:resultConditionFalse";

	public final static String BINDING_ROLE_REPEAT = "od:repeat";
	public final static String BINDING_RESULT_RPTD_ZERO = "od:resultRepeatZero";
	public final static String BINDING_RESULT_RPTD_ZERO_W15 = "w15:resultRepeatZero";
	public final static String BINDING_RESULT_RPTD = "od:rptd";
	
	// Repeat position condition (eg second last entry)
	public final static String BINDING_ROLE_RPT_POS_CON = "od:RptPosCon";  // see bind.xslt

	public final static String BINDING_ROLE_NARRATIVE = "od:narrative";

	public final static String BINDING_ROLE_COMPONENT = "od:component";
	public final static String BINDING_ROLE_COMPONENT_BEFORE = "od:continuousBefore";
	public final static String BINDING_ROLE_COMPONENT_AFTER = "od:continuousAfter";
	public final static String BINDING_ROLE_COMPONENT_CONTEXT = "od:context";

	public final static String BINDING_ROLE_FINISHER = "od:finish";	
	
	public final static String BINDING_CONTENTTYPE = "od:ContentType";
	public final static String BINDING_HANDLER = "od:Handler";
	public final static String BINDING_PROGID = "od:progid"; // eg =Word.Document
	/*
	 * --------------------------------------------------------------------------
	 * - Pre-processing of content controls which have a tag containing
	 * "bindingrole"
	 */

	private org.opendope.components.Components components;

// TODO consider whether to reinstate.  User would need to choose between 	
//  conditional SDT removal, and reverting functionality.
	
//	private boolean removeSdtCellsOnFailedCondition;
//
//	/**
//	 * Configure, how the preprocessor handles conditions on table cells.
//	 *
//	 * If set to false, conditional SDT cells are replaced by empty
//	 * cells. This is the default behavior.
//	 *
//	 * If set to true, conditional SDT cells are removed entirely.
//	 * Note that the table geometry is not changed; hence this works better
//	 * without dynamic table widths / no global width settings.
//	 *
//	 * This affects all future calls on the {@link #preprocess} method for this
//     * instance.
//	 *
//	 * @param removeSdtCellsOnFailedCondition
//	 *            The new value for the cell removal flag.
//	 */
//	public void setRemoveSdtCellsOnFailedCondition(
//			boolean removeSdtCellsOnFailedCondition) {
//		this.removeSdtCellsOnFailedCondition = removeSdtCellsOnFailedCondition;
//	}

	/**
	 * Preprocess content controls which have tag
	 * "od:condition|od:repeat|od:component".
	 *
	 * It is "preprocess" in the sense that it is "pre" opening in Word
	 *
	 * The algorithm is as follows:
	 *
	 * Inject components first.
	 *
	 * Look at each top level SDT (ShallowTraversor). If it does not have a real
	 * data binding, it might have a bindingrole tag we need to process
	 * (processBindingRoleIfAny).
	 *
	 * Conditionals are easy.
	 *
	 * processRepeat method:
	 *
	 * - clones the sdt n times
	 *
	 * - invokes DeepTraversor which changes xpath binding on descendant sdts
	 * (both sdts with real bindings and sdts with bindingrole tags).
	 *
	 * It is not the job of DeepTraversor to expand out any other repeats it
	 * might encounter, or to resolve conditionals.
	 *
	 * Those things are done by ShallowTraversor, to which control returns, as
	 * it continues its traverse.
	 *
	 * The implementation of 13 Sept 2010 replaced the previous XPath based
	 * implementation, which did not support nested repeats. I've chosen to
	 * build this around TraversalUtil, instead of using XSLT, and this seems to
	 * have worked out nicely.
	 *
	 * The implementation of 10 October 2010 replaced the v1 conventions
	 * implementation with a v2 implementation. The main method in this class
	 * can convert v1 documents to v2. The v2 implementation is not yet
	 * complete. All v1 features are implemented, but not the new v2 stuff (eg
	 * complex conditions).
	 *
	 * @param documentPart
	 * @throws Exception
	 */
	public WordprocessingMLPackage preprocess() throws Docx4JException {
		
		Set partList = getParts(wordMLPackage);
			
		// Process repeats and conditionals.
		try {
			for (ContentAccessor part : partList) {
				new TraversalUtil(part, shallowTraversor);
			}
		} catch (InputIntegrityException iie) { // RuntimeException
			throw new Docx4JException(iie.getMessage(), iie);
		}
		
		// Write our maps back to their parts
		// XPaths
		if (wordMLPackage.getMainDocumentPart().getXPathsPart() == null) { 
			// OpenDoPE XPaths part missing is ok if you are just processing w15 repeatingSection)");
			log.info("No XPaths part; ok if you are just processing w15 repeatingSection");
		} else {
			wordMLPackage.getMainDocumentPart().getXPathsPart().getContents().getXpath().clear();
			wordMLPackage.getMainDocumentPart().getXPathsPart().getContents().getXpath().addAll(xpathsMap.values());
		}
		// Conditions
		if (wordMLPackage.getMainDocumentPart().getConditionsPart() != null) {
			wordMLPackage.getMainDocumentPart().getConditionsPart().getContents().getCondition().clear();
			wordMLPackage.getMainDocumentPart().getConditionsPart().getContents().getCondition().addAll(conditionsMap.values());
		}
		
		log.debug(this.conditionTiming.toString());
		log.debug("conditions in total: " + this.conditionTimingTotal/1000);  // in seconds

		return wordMLPackage;
	}

	protected static Set getParts(WordprocessingMLPackage srcPackage) {

		Set partList = new HashSet();

		partList.add(srcPackage.getMainDocumentPart());

		// Add headers/footers
		RelationshipsPart rp = srcPackage.getMainDocumentPart()
				.getRelationshipsPart();
		for (Relationship r : rp.getRelationships().getRelationship()) {

			if (r.getType().equals(Namespaces.HEADER)) {
				partList.add((HeaderPart) rp.getPart(r));
			} else if (r.getType().equals(Namespaces.FOOTER)) {
				partList.add((FooterPart) rp.getPart(r));
			}
		}

		return partList;
	}



	// private static void preprocessRun(WordprocessingMLPackage wordMLPackage,
	// ContentAccessor content) throws Docx4JException {
	//
	// log.info("\n\n Preprocess run.. \n\n");
	//
	// MainDocumentPart documentPart = wordMLPackage.getMainDocumentPart();
	//
	// if (wordMLPackage.getMainDocumentPart().getXPathsPart()==null) {
	// throw new Docx4JException("OpenDoPE XPaths part missing");
	// } else {
	// xPaths =
	// wordMLPackage.getMainDocumentPart().getXPathsPart().getJaxbElement();
	// log.debug( XmlUtils.marshaltoString(xPaths, true, true));
	// }
	// if (wordMLPackage.getMainDocumentPart().getConditionsPart()!=null) {
	// conditions =
	// wordMLPackage.getMainDocumentPart().getConditionsPart().getJaxbElement();
	// log.debug( XmlUtils.marshaltoString(conditions, true, true));
	// }
	// if (wordMLPackage.getMainDocumentPart().getComponentsPart()!=null) {
	// components =
	// wordMLPackage.getMainDocumentPart().getComponentsPart().getJaxbElement();
	// log.debug( XmlUtils.marshaltoString(components, true, true));
	// }
	//
	// org.docx4j.wml.Document wmlDocumentEl =
	// (org.docx4j.wml.Document)documentPart.getJaxbElement();
	// Body body = wmlDocumentEl.getBody();
	//
	// shallowTraversor.wordMLPackage = wordMLPackage;
	//
	// new TraversalUtil(body, shallowTraversor);
	//
	// }

	// private static XPathsPart getXPathsPart(WordprocessingMLPackage
	// wordMLPackage) {
	// wordMLPackage.getParts().
	// }

	/**
	 * This traversor duplicates the repeats, and removes false conditonals
	 */
	private class ShallowTraversor implements TraversalUtil.Callback {

		// private static Logger log = LoggerFactory.getLogger(ShallowTraversor.class);

		WordprocessingMLPackage wordMLPackage;

		@Override
		public List