All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.docx4j.convert.out.flatOpcXml.FlatOpcXmlCreator Maven / Gradle / Ivy

/*
 *  Copyright 2007-2008, Plutext Pty Ltd.
 *
 *  This file is part of docx4j.

    docx4j is licensed under the Apache License, Version 2.0 (the "License"); 
    you may not use this file except in compliance with the License. 

    You may obtain a copy of the License at  

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software 
    distributed under the License is distributed on an "AS IS" BASIS, 
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
    See the License for the specific language governing permissions and 
    limitations under the License.

 */


package org.docx4j.convert.out.flatOpcXml;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Marshaller;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.transform.dom.DOMSource;

import org.docx4j.XmlUtils;
import org.docx4j.convert.out.Output;
import org.docx4j.jaxb.Context;
import org.docx4j.jaxb.McIgnorableNamespaceDeclarator;
import org.docx4j.jaxb.NamespacePrefixMapperUtils;
import org.docx4j.openpackaging.URIHelper;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.OpcPackage;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.JaxbXmlPart;
import org.docx4j.openpackaging.parts.Part;
import org.docx4j.openpackaging.parts.PartName;
import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPart;
import org.docx4j.openpackaging.parts.relationships.RelationshipsPart;
import org.docx4j.relationships.Relationship;
import org.docx4j.utils.XmlSerializerUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;

/**
 * Convert a Package object to org.docx4j.xmlPackage.Package
 * (ie the "pkg" single XML file format, sometimes called
 *  Flat OPC format).
 *
 * Microsoft Word and PowerPoint both support
 * saving to this format.  Excel doesn't, but you 
 * can still have a spreadsheet in this format if you want to. 
 *
 * Notes on the attributes of a pkg:part element:
 *    (@padding is only on rels)
 *   (@compression is only on binary parts)
 *
 * @author jharrop
 */
public class FlatOpcXmlCreator implements Output {
	private static final Logger log = LoggerFactory.getLogger(FlatOpcXmlCreator.class);

	/**
	 * Creates a converter for the given package.
	 *
	 * @param p the package to convert; stored in {@link #packageIn}
	 */
	public FlatOpcXmlCreator(OpcPackage p) {
		this.packageIn = p;
	}

	/** The package to save. */
	public OpcPackage packageIn;

	/**
	 * This HashMap is intended to prevent loops.
	 * It is keyed by part name with the leading '/' removed; a part whose
	 * name is already present is not saved again.
	 */
	private HashMap<String, String> handled = new HashMap<String, String>();

	/** Shared factory for the org.docx4j.xmlPackage object model. */
	private static final org.docx4j.xmlPackage.ObjectFactory factory = new org.docx4j.xmlPackage.ObjectFactory();

	/** The Flat OPC package being built; assigned by get(). */
	private org.docx4j.xmlPackage.Package pkgResult;
	public org.docx4j.xmlPackage.Package get() throws Docx4JException  {		
		 try {

			pkgResult = factory.createPackage();
			// In pkg format, we don't save [Content_Types].xml

			// Start with _rels/.rels

			RelationshipsPart rp = packageIn.getRelationshipsPart();
			saveRawXmlPart(rp ); 			
			// Now recursively 
			addPartsFromRelationships(rp );
	    } catch (Exception e) {
			e.printStackTrace() ;
			if (e instanceof Docx4JException) {
				throw (Docx4JException)e;
			} else {
				throw new Docx4JException("Failed to save package", e);

	    log.debug("...Done!" );		

		 return pkgResult;

	public void marshal(OutputStream os) throws Docx4JException {
		if (pkgResult==null) {
			if (packageIn==null) {
				throw new Docx4JException("No zipped package to convert to Flat OPC Package");
			} else {
		try {
			JAXBContext jc = Context.jcXmlPackage;
			Marshaller marshaller=jc.createMarshaller();
			// TODO 6.1.0 JAXB_FORMATTED_OUTPUT here doesn't work.
			// Changing org/docx4j/org/apache/xml/serializer/
			// to indent=yes doesn't help either.
			// But you can get formatted output using the approach demo'd in the main method below.
			marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, Boolean.TRUE);
			// .. marshall it 
			marshaller.marshal(pkgResult, os);				
		} catch (JAXBException e) {
			throw new Docx4JException("Couldn't marshall Flat OPC Package", e);

//	public void  saveRawXmlPart(Part part) throws Docx4JException {
//		// This is a neater signature and should be used where possible!
//		// Don't drop leading "/" for XmlPackage representation.
//		// It is needed if Word is to consume the result.
//		//String partName = part.getPartName().getName().substring(1);
//		saveRawXmlPart(part);
//	}

	public void  saveRawXmlPart(Part part) throws Docx4JException {
		org.docx4j.xmlPackage.Part partResult = createRawXmlPart(part);
	private static org.w3c.dom.Document marshaltoW3CDomDocument(Object o, JAXBContext jc, String ignorables) {
		try {

			Marshaller marshaller = jc.createMarshaller();

			org.w3c.dom.Document doc = XmlUtils.getNewDocumentBuilder().newDocument();
			marshaller.marshal(o, doc);
			// For FlatOPC, always canonicalize, since we want to
			// trim namespaces to make the file as small as possible
			// (since a key use case is OpenDoPE's UpdateXmlFromDocumentSurface).
			// Word requires the namespaces to be declared in each part in
			// the Flat OPC XML.  That is, you can't just declare them once on the root element!
			// It turns out that things work if you just do this.
			// ie no need for special processing at the package level.

//			if (true /* always canonicalize! */
//					|| Docx4jProperties.getProperty("docx4j.jaxb.marshal.canonicalize", false)) {

				byte[] bytes = XmlUtils.trimNamespaces(doc, ignorables);
				//log.debug(new String(bytes, "UTF-8"));
				/*MOXy issue where it looks like trimNamespaces drops w namespace!
					DEBUG org.docx4j.XmlUtils .trimNamespaces line 700 - Input to Canonicalizer: 
					DEBUG org.docx4j.XmlUtils .marshaltoW3CDomDocument line 903 - 
					[Fatal Error] :1:28: The prefix "w" for element "w:abstractNumId" is not bound.	
					where in fact the real problem is a missng @XmlRootElement annotation on the parent node
					which Sun/Oracle reports.  Once fixed, MOXy is happy as well.
				DocumentBuilder builder = XmlUtils.getDocumentBuilderFactory().newDocumentBuilder();
				return builder.parse(new ByteArrayInputStream(bytes));
//			} else {
//				return doc;
//			}
		} catch (Exception e) {
		    throw new RuntimeException(e);
	// Builds an org.docx4j.xmlPackage.Part from an XML-bearing part: the name is
	// given a leading '/' when it lacks one, the content type is copied when it
	// is non-null, and the root element of the part's XML is stored in an
	// XmlData object via setAny. Returns the new part; throws Docx4JException
	// when a part cannot be marshalled.
	//
	// NOTE(review): this method is visibly damaged as given (closing braces,
	// instanceof operands and some statements are missing). The notes below
	// mark each gap; restore them from the upstream source before compiling.
	public static org.docx4j.xmlPackage.Part createRawXmlPart(Part part) throws Docx4JException {
		String partName = part.getPartName().getName();
        org.docx4j.xmlPackage.Part partResult = factory.createPart();
        // NOTE(review): the branch for names that already start with '/' is
        // empty here, so no name is set in that case - presumably a
        // setName(partName) call was lost; confirm.
        if (partName.startsWith("/")) {       
        } else {
        	partResult.setName("/" + partName);
//        	log.error("@pkg:name must start with '/', or Word 2007 won't open it");
//        	throw new Docx4JException("@pkg:name must start with '/', or Word 2007 won't open it");
        String ct = part.getContentType();
        if (ct == null) {
        	// NB - Word can't consume it if the content type is not set 
        	// on the rels parts.
        	log.error("Content type not set! ");
        } else {
        	partResult.setContentType( ct );
        // NOTE(review): dataResult is filled in below but is never attached to
        // partResult in the visible code - confirm whether a call that sets
        // the part's XML data was lost.
        org.docx4j.xmlPackage.XmlData dataResult = factory.createXmlData();

		org.w3c.dom.Document w3cDoc = null;
		// NOTE(review): the instanceof operand is missing; the cast on the next
		// line suggests JaxbXmlPart - confirm.
		if (part instanceof {
			String mceIgnorable = ((JaxbXmlPart)part).getMceIgnorable(); 

			try {
//				w3cDoc = XmlUtils.getNewDocumentBuilder().newDocument();
//				(( w3cDoc, 
//						NamespacePrefixMapperUtils.getPrefixMapper() );
					// NOTE(review): the block comment opened below has no visible terminator.
					/* Force the RelationshipsPart to be marshalled using
					 * the normal non-rels part NamespacePrefixMapper,
					 * since otherwise (because we'd be using 2 namespace
					 * prefix mappers?) we end up with errant xmlns="",
					 * which is wrong and stops Word 2007 from loading the
					 * document.
					 * Note that xmlPackage.xsd defines:
					 * Note also that marshaltoString uses 
					 * just the normal non-rels part NamespacePrefixMapper,
					 * so if/when this is marshalled again, that could
					 * have been causing problems as well?? 

				// NOTE(review): the cast expression is truncated; presumably the part cast to JaxbXmlPart.
				JaxbXmlPart jaxbXmlPart = (;
				// Marshal the JAXB content with the mc:Ignorable and mc:Choice
				// namespace strings concatenated, then store its root element.
				w3cDoc = marshaltoW3CDomDocument(jaxbXmlPart.getJaxbElement(), 
						jaxbXmlPart.getJAXBContext(), mceIgnorable + jaxbXmlPart.getMcChoiceNamespaces());
		        dataResult.setAny( w3cDoc.getDocumentElement() );		        
				log.debug( "PUT SUCCESS: " + partName);		
			} catch (Exception e) {
				log.error("Problem saving part " + partName, e);
				throw new Docx4JException("Problem saving part " + partName, e);
		// NOTE(review): the instanceof operand is missing and this branch stores
		// nothing in dataResult as shown - the statement that did so appears to
		// have been lost.
		} else if (part instanceof {
			try {
				log.debug("PUT SUCCESS: " + partName);
			} catch (Exception e) {
				log.error("Problem saving part " + partName, e);
				throw new Docx4JException("Problem saving part " + partName, e);
		// NOTE(review): the instanceof operand and the expression producing the
		// Document are missing here.
		} else if (part instanceof {

		       Document doc =  ((;		       
		       dataResult.setAny( doc.getDocumentElement() );
		} else {
			// Shouldn't happen, since ContentTypeManagerImpl should
			// return an instance of one of the above, or throw an
			// Exception.
			log.error("PROBLEM - No suitable part found for: " + partName);
		return partResult;
	/* recursively 
		(i) get each Part listed in the relationships
		(ii) add the Part to the zip file
		(iii) traverse its relationship
	public void addPartsFromRelationships(RelationshipsPart rp )
	 throws Docx4JException {
//		for (Iterator it = rp.iterator(); it.hasNext(); ) {
//			Relationship r = (Relationship);
//"For Relationship Id=" + r.getId() + " Source is " + r.getSource().getPartName() + ", Target is " + r.getTargetURI() );

		for ( Relationship r : rp.getRelationships().getRelationship() ) {
			log.debug("For Relationship Id=" + r.getId() 
					+ " Source is " + rp.getSourceP().getPartName() 
					+ ", Target is " + r.getTarget() );
//			if (!r.getTargetMode().equals(TargetMode.INTERNAL) ) {
			if (r.getTargetMode() != null
					&& r.getTargetMode().equals("External") ) {
				//log.debug("Encountered external resource " + r.getTarget() + " of type " + r.getType() );
				// So
			try {
				//String resolvedPartUri = URIHelper.resolvePartUri(r.getSourceURI(), r.getTargetURI() ).toString();

				String resolvedPartUri = URIHelper.resolvePartUri(rp.getSourceURI(), new URI(r.getTarget() ) ).toString();		
				// Now drop leading "/'
				resolvedPartUri = resolvedPartUri.substring(1);				
				// Now normalise it .. ie abc/def/../ghi
				// becomes abc/ghi
				// Maybe this isn't necessary with a zip file,
				// - ZipFile class may be smart enough to do it.
				// But it is certainly necessary in the JCR case.
//				target = (new;
//"Normalised, it is " + target );				
//				Document contents = getDocumentFromZippedPart( zf,  target);
				// TODO - if this is already in our hashmap, skip
				// to the next				
				if (!false) {
					log.debug("Getting part /" + resolvedPartUri );
					Part part = packageIn.getParts().get(new PartName("/" + resolvedPartUri));
					if (part==null) {
						log.error("Part " + resolvedPartUri + " not found!");
					} else {
						log.debug(part.getClass().getName() );
			} catch (Exception e) {
				throw new Docx4JException("Failed to add parts from relationships", e);				

	 * @param out
	 * @param resolvedPartUri
	 * @param part
	 * @throws Docx4JException
	 * @throws IOException
	public void savePart(Part part)
			throws Docx4JException, IOException {
		// Drop the leading '/'
		String resolvedPartUri = part.getPartName().getName().substring(1);
		if (handled.get(resolvedPartUri)!=null) {
			log.debug(".. duplicate save avoided .." );
		if (part instanceof BinaryPart ) {
			log.debug(".. saving binary stuff" );
			saveRawBinaryPart( part );
		} else {
			log.debug(".. saving " );					
			saveRawXmlPart( part );
		handled.put(resolvedPartUri, resolvedPartUri);		
		// recurse via this parts relationships, if it has any
		if (part.getRelationshipsPart()!= null
				&& part.getRelationshipsPart().getJaxbElement()!=null
				&& part.getRelationshipsPart().getJaxbElement().getRelationship()!=null
				&& part.getRelationshipsPart().getJaxbElement().getRelationship().size()>0) {
			RelationshipsPart rrp = part.getRelationshipsPart();
			log.debug("Found relationships " + rrp.getPartName() );
			String relPart = PartName.getRelationshipsPartName(resolvedPartUri);
			log.debug("Cf constructed name " + relPart );
			saveRawXmlPart( rrp);
			//, "/" + relPart );  // '/' necessary for Xml Pkg format.
			addPartsFromRelationships( rrp );
		} else {
			log.debug("No relationships for " + resolvedPartUri );					
	protected void saveRawBinaryPart(Part part) throws Docx4JException {

		org.docx4j.xmlPackage.Part partResult = createRawBinaryPart(part);

	public static org.docx4j.xmlPackage.Part createRawBinaryPart(Part part) throws Docx4JException {
		String resolvedPartUri = part.getPartName().getName();
		// Don't drop leading "/" for XmlPackage representation.
		// It is needed if Word is to consume the result.
		//String resolvedPartUri = part.getPartName().getName().substring(1);

        org.docx4j.xmlPackage.Part partResult = factory.createPart();
        partResult.setContentType( part.getContentType() );

		try {

			partResult.setBinaryData( ((BinaryPart)part).getBytes() );
		} catch (Exception e ) {
			throw new Docx4JException("Failed to put binary part", e);			

		log.debug( "PUT SUCCESS: " + resolvedPartUri);		
		return partResult;
	/* It is sometimes useful to wrap a part in an appropriate pkg:part */
	public static String wrapInXmlPart(String xml, String partName, String contentType) {
		return  ""
				  		+ ""
				  			+ xml
				  		+ ""
				  	+ "";

	public static String wrapInBinaryPart(byte[] base64, String partName, String contentType) {
		try {
			return ""
					+ "" + new String(base64, "UTF-8")
					+ "" + "";
		} catch (UnsupportedEncodingException e) {
			// I assume system supports UTF-8 !!
			log.error(e.getMessage(), e);
			return null;
	 * Return the WordML package in Flat OPC format, as a W3C DOM document
	 * @return
	 * @throws Exception
	public static Document getFlatDomDocument(WordprocessingMLPackage wordMLPackage) throws Docx4JException {
		FlatOpcXmlCreator worker = new FlatOpcXmlCreator(wordMLPackage);
		org.docx4j.xmlPackage.Package pkg = worker.get();
		org.w3c.dom.Document doc;
		try {
			JAXBContext jc = Context.jcXmlPackage;
			Marshaller marshaller=jc.createMarshaller();
			doc = org.docx4j.XmlUtils.neww3cDomDocument();

			marshaller.marshal(pkg, doc);
		} catch (JAXBException e) {
			throw new Docx4JException("Couldn't marshal Flat OPC to DOM", e);
		return doc;
	// Implement the interface
	public void output(javax.xml.transform.Result result) throws Docx4JException {
		XmlSerializerUtil.serialize(new DOMSource( getFlatDomDocument( (WordprocessingMLPackage)packageIn)), result, 
				true, false);
				// we haven't explicitly set METHOD=xml here, but I don't see why we shouldn't. 
	// Manual demo: loads a package from a hard-coded path under the working
	// directory ("user.dir"), converts it to Flat OPC with get(), and passes
	// the result to marshaltoString with declaration suppression and pretty
	// printing switched on.
	//
	// NOTE(review): several statements here are truncated (apparent
	// extraction damage) and the closing brace is missing; restore them from
	// the upstream source before compiling.
	public static void main(String[] args) throws Exception {
//		String inputfilepath = System.getProperty("user.dir") + "/ole_tests/OUT_wmv_f.pptx";
		String inputfilepath = System.getProperty("user.dir") + "/ole_tests/wmv_CT.pptx";
		// NOTE(review): the argument to load(...) is cut off after "new";
		// presumably it wrapped inputfilepath in a File - confirm.
		OpcPackage wordMLPackage = OpcPackage.load(new;
		FlatOpcXmlCreator worker = new FlatOpcXmlCreator(wordMLPackage);
		org.docx4j.xmlPackage.Package result = worker.get();
		boolean suppressDeclaration = true;
		boolean prettyprint = true;
		// NOTE(review): the qualifier of marshaltoString, the end of its
		// argument list, and the call that receives the File on the following
		// line are missing - presumably the string was written to
		// wmv_CT.xml; confirm.
		String data = 
					marshaltoString(result, suppressDeclaration, prettyprint, 
				new File(System.getProperty("user.dir") + "/ole_tests/wmv_CT.xml"), 
		// Note - We don't bother adding:
		// 1. mso-application PI
		// 2. @padding on rels?
		// Since Word 2007 is happy to consume without either of these

© 2015 - 2024 Weber Informatics LLC | Privacy Policy