/*
 * org.docx4j.model.fields.merge.MailMerger Maven / Gradle / Ivy
 *
 * Show more of this group Show more artifacts with this name
 * Show all versions of docx4j-core Show documentation
 * docx4j is a library which helps you to work with the Office Open XML file format as used in docx documents, pptx presentations, and xlsx spreadsheets.
 * There is a newer version: 11.4.11
 * Show newest version
 */
package org.docx4j.model.fields.merge;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import javax.xml.bind.JAXBElement;
import javax.xml.transform.TransformerException;

import org.apache.commons.lang3.StringUtils;
import org.docx4j.Docx4jProperties;
import org.docx4j.TraversalUtil;
import org.docx4j.XmlUtils;
import org.docx4j.jaxb.Context;
import org.docx4j.model.fields.ComplexFieldLocator;
import org.docx4j.model.fields.FieldRef;
import org.docx4j.model.fields.FieldsPreprocessor;
import org.docx4j.model.fields.FldSimpleModel;
import org.docx4j.model.fields.FormattingSwitchHelper;
import org.docx4j.model.structure.PageDimensions;
import org.docx4j.model.structure.PageSizePaper;
import org.docx4j.model.structure.SectionWrapper;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.io.SaveToZipFile;
import org.docx4j.openpackaging.packages.OpcPackage;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.JaxbXmlPart;
import org.docx4j.openpackaging.parts.WordprocessingML.FooterPart;
import org.docx4j.openpackaging.parts.WordprocessingML.HeaderPart;
import org.docx4j.openpackaging.parts.relationships.Namespaces;
import org.docx4j.openpackaging.parts.relationships.RelationshipsPart;
import org.docx4j.openpackaging.parts.relationships.RelationshipsPart.AddPartBehaviour;
import org.docx4j.relationships.Relationship;
import org.docx4j.vml.CTTextbox;
import org.docx4j.wml.Body;
import org.docx4j.wml.BooleanDefaultTrue;
import org.docx4j.wml.CTFFData;
import org.docx4j.wml.CTFFName;
import org.docx4j.wml.CTFFTextInput;
import org.docx4j.wml.CTFFTextType;
import org.docx4j.wml.CTLanguage;
import org.docx4j.wml.CTPageNumber;
import org.docx4j.wml.CTRel;
import org.docx4j.wml.ContentAccessor;
import org.docx4j.wml.ObjectFactory;
import org.docx4j.wml.P;
import org.docx4j.wml.R;
import org.docx4j.wml.RPr;
import org.docx4j.wml.STFFTextType;
import org.docx4j.wml.SectPr;
import org.docx4j.wml.Tc;
import org.docx4j.wml.Text;
import org.jvnet.jaxb2_commons.ppp.Child;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;



/**
 * Perform a mail merge.
 * 
 * Instance values are merged into a docx containing
 * MERGEFIELD to produce output docx made up of
 * a copy of the input docx for each collection of 
 * input values.
 * 
 * The output can be a single docx, or multiple docx.
 * 
 * If you choose single docx, there are two ways to
 * do this:
 * 
 * One is using MergeDocx, which will ensure each 
 * constituent "document" doesn't affect the neighbouring
 * ones (eg numbering will restart).
 * 
 * The other is the "poor man's" approach, which 
 * puts them together, and just hopes for the best.
 * Images and hyperlinks should be ok. But numbering 
 * will continue, as will footnotes/endnotes. 
 * 
 * From 3.0, there is some support for formatting switches
 * (date/time, numeric, and general), and basic 
 * support for MERGEFORMAT.
 *  
 * LIMITATIONS:
 * - no support for text before (\b) and text after (\f)
 *   switches
 * - no support for \m and \v switches
 * - no support for multiple MERGEFIELD in a single
 *   instruction (eg MERGEFIELD CourtesyTitle \f " " MERGEFIELD FirstName \f " " MERGEFIELD LastName ) 
 * 
 * @author jharrop
 *
 */
public class MailMerger {

	/** Shared SLF4J logger for this class. */
	private static final Logger log = LoggerFactory.getLogger(MailMerger.class);

	/**
	 * A "poor man's" approach, which generates the mail merge  
	 * results as a single docx, and just hopes for the best.
	 * Images and hyperlinks should be ok. But numbering 
	 * will continue, as will footnotes/endnotes. 
	 * @param input
	 * @param data
	 * @return
	 * @throws Docx4JException
	 */
	public static WordprocessingMLPackage getConsolidatedResultCrude(WordprocessingMLPackage input, 
			List> data) throws Docx4JException {
		return getConsolidatedResultCrude(input, data, false);
	}
	
	/**
	 * A "poor man's" approach, which generates the mail merge  
	 * results as a single docx, and just hopes for the best.
	 * Images and hyperlinks should be ok. But numbering 
	 * will continue, as will footnotes/endnotes. 
	 * [Advert:] If this isn't working for you, the commercial Enterprise Edition of docx4j
	 * (MergeDocx component) will solve your problems. 
	 * @param input
	 * @param data
	 * @param processHeadersAndFooters process headers and footers in FIRST section only.
	 * If you have multiple sections in your input docx, performMerge is a better approach
	 * @return
	 * @throws Docx4JException
	 * @since 2.8.1
	 */
	public static WordprocessingMLPackage getConsolidatedResultCrude(WordprocessingMLPackage input, 
			List> data, boolean processHeadersAndFooters) throws Docx4JException {
		
		FormTextFieldNames formTextFieldNames = new FormTextFieldNames(); 		
		
		// create contents destined for the main document part
		FieldsPreprocessor.complexifyFields(input.getMainDocumentPart() );
        if(log.isDebugEnabled()) {
            log.debug("complexified: " + XmlUtils.marshaltoString(input.getMainDocumentPart().getJaxbElement(), true));
        }
		List> mdpResults = performOverList(input, input.getMainDocumentPart().getContent(), data, formTextFieldNames );

		// headers/footers
		Map hfTemplates = null;
		BooleanDefaultTrue titlePage = null;
		if (processHeadersAndFooters) {
			// then we need a clone/template of the headers/footers
			// in the first section
			
			hfTemplates = new HashMap();
			
			SectionWrapper sw = input.getDocumentModel().getSections().get(0);
			SectPr sectPr = sw.getSectPr();
			
			List hdrFtrRefs = sectPr.getEGHdrFtrReferences();
			titlePage = sectPr.getTitlePg();
			
			for (CTRel rel : hdrFtrRefs) {
				String relId = rel.getId();
				log.debug("for h|f relId: " + relId);
				
				JaxbXmlPart part = (JaxbXmlPart)input.getMainDocumentPart().getRelationshipsPart().getPart(relId);
				FieldsPreprocessor.complexifyFields(part );

                if(log.isDebugEnabled()) {
                    log.debug("complexified: " + XmlUtils.marshaltoString(part.getJaxbElement(), true));
                }
				
				hfTemplates.put(rel, part);
			}
		}
		
		// Create WordprocessingMLPackage target, by cloning
		OpcPackage result = null;
		ByteArrayOutputStream baos = new ByteArrayOutputStream();
		SaveToZipFile saver = new SaveToZipFile(input);
		saver.save(baos);
		byte[] template = baos.toByteArray();
		WordprocessingMLPackage target = WordprocessingMLPackage.load(
				new ByteArrayInputStream(template));
		
		
		// populate main document part
		SectPr documentSeparator = getDocumentSeparator(target);
		if (processHeadersAndFooters) {
			if (titlePage!=null
					&& titlePage.isVal()) {
				documentSeparator.setTitlePg(titlePage);
			}
			documentSeparator.getEGHdrFtrReferences().clear();
		}
		target.getMainDocumentPart().getContent().clear();
		
		/*
		 * What we're doing, effectively, is doing the 
		 * main content in a single hit (ie for all
		 * instances), and then, for each instance,
		 * doing the headers/footers.
		 * 
		 * It is this way because that is how the code
		 * has evolved.
		 * 
		 * Since we have to do the headers/footers 
		 * instance by instance, it would probably
		 * be neater to do the main content at 
		 * the same time (ie instead of using 
		 * performOverList at the start of this method).
		 */
		
		int i = 0;
		for (List