// com.itextpdf.xmp.impl.XMPMetaParser Maven / Gradle / Ivy

// Go to download
//Copyright (c) 2006, Adobe Systems Incorporated
//All rights reserved.
//
//        Redistribution and use in source and binary forms, with or without
//        modification, are permitted provided that the following conditions are met:
//        1. Redistributions of source code must retain the above copyright
//        notice, this list of conditions and the following disclaimer.
//        2. Redistributions in binary form must reproduce the above copyright
//        notice, this list of conditions and the following disclaimer in the
//        documentation and/or other materials provided with the distribution.
//        3. All advertising materials mentioning features or use of this software
//        must display the following acknowledgement:
//        This product includes software developed by the Adobe Systems Incorporated.
//        4. Neither the name of the Adobe Systems Incorporated nor the
//        names of its contributors may be used to endorse or promote products
//        derived from this software without specific prior written permission.
//
//        THIS SOFTWARE IS PROVIDED BY ADOBE SYSTEMS INCORPORATED ''AS IS'' AND ANY
//        EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
//        WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
//        DISCLAIMED. IN NO EVENT SHALL ADOBE SYSTEMS INCORPORATED BE LIABLE FOR ANY
//        DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
//        (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
//        LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
//        ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
//        (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
//        SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//        http://www.adobe.com/devnet/xmp/library/eula-xmp-library-java.html

package com.itextpdf.xmp.impl;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.ProcessingInstruction;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.itextpdf.xmp.XMPConst;
import com.itextpdf.xmp.XMPError;
import com.itextpdf.xmp.XMPException;
import com.itextpdf.xmp.XMPMeta;
import com.itextpdf.xmp.options.ParseOptions;


/**
 * This class replaces the ExpatAdapter.cpp and does the
 * XML-parsing and fixes the prefix. After the parsing several normalisations
 * are applied to the XMPTree.
 *
 * @since 01.02.2006
 */
public class XMPMetaParser
{
	/**  */
	private static final Object XMP_RDF = new Object();
	/** the DOM Parser Factory, options are set */
	private static DocumentBuilderFactory factory = createDocumentBuilderFactory();

	/**
	 * Hidden constructor, initialises the SAX parser handler.
	 */
	private XMPMetaParser()
	{
		// EMPTY
	}



	/**
	 * Parses the input source into an XMP metadata object, including
	 * de-aliasing and normalisation.
	 *
	 * @param input the input can be an InputStream, a String or
	 * 			a byte buffer containing the XMP packet.
	 * @param options the parse options
	 * @return Returns the resulting XMP metadata object
	 * @throws XMPException Thrown if parsing or normalisation fails.
	 */
	public static XMPMeta parse(Object input, ParseOptions options) throws XMPException
	{
		ParameterAsserts.assertNotNull(input);
		options = options != null ? options : new ParseOptions();

		Document document = parseXml(input, options);

		boolean xmpmetaRequired = options.getRequireXMPMeta();
		Object[] result = new Object[3];
		result = findRootNode(document, xmpmetaRequired, result);

		if (result != null  &&  result[1] == XMP_RDF)
		{
			XMPMetaImpl xmp = ParseRDF.parse((Node) result[0]);
			xmp.setPacketHeader((String) result[2]);

			// Check if the XMP object shall be normalized
			if (!options.getOmitNormalization())
			{
				return XMPNormalizer.process(xmp, options);
			}
			else
			{
				return xmp;
			}
		}
		else
		{
			// no appropriate root node found, return empty metadata object
			return new XMPMetaImpl();
		}
	}


	/**
	 * Parses the raw XML metadata packet considering the parsing options.
	 * Latin-1/ISO-8859-1 can be accepted when the input is a byte stream
	 * (some old toolkits versions such packets). The stream is
	 * then wrapped in another stream that converts Latin-1 to UTF-8.
	 * 
	 * If control characters shall be fixed, a reader is used that fixes the chars to spaces
	 * (if the input is a byte stream is has to be read as character stream).
	 * 

	 * Both options reduce the performance of the parser.
	 *
	 * @param input the input can be an InputStream, a String or
	 * 			a byte buffer containing the XMP packet.
	 * @param options the parsing options
	 * @return Returns the parsed XML document or an exception.
	 * @throws XMPException Thrown if the parsing fails for different reasons
	 */
	private static Document parseXml(Object input, ParseOptions options)
			throws XMPException
	{
		if (input instanceof InputStream)
		{
			return parseXmlFromInputStream((InputStream) input, options);
		}
		else if (input instanceof byte[])
		{
			return parseXmlFromBytebuffer(new ByteBuffer((byte[]) input), options);
		}
		else
		{
			return parseXmlFromString((String) input, options);
		}
	}


	/**
	 * Parses XML from an {@link InputStream},
	 * fixing the encoding (Latin-1 to UTF-8) and illegal control character optionally.
	 *
	 * @param stream an InputStream
	 * @param options the parsing options
	 * @return Returns an XML DOM-Document.
	 * @throws XMPException Thrown when the parsing fails.
	 */
	private static Document parseXmlFromInputStream(InputStream stream, ParseOptions options)
			throws XMPException
	{
		if (!options.getAcceptLatin1()  &&  !options.getFixControlChars())
		{
			return parseInputSource(new InputSource(stream));
		}
		else
		{
			// load stream into bytebuffer
			try
			{
				ByteBuffer buffer = new ByteBuffer(stream);
				return parseXmlFromBytebuffer(buffer, options);
			}
			catch (IOException e)
			{
				throw new XMPException("Error reading the XML-file",
						XMPError.BADSTREAM, e);
			}
		}
	}


	/**
	 * Parses XML from a byte buffer,
	 * fixing the encoding (Latin-1 to UTF-8) and illegal control character optionally.
	 *
	 * @param buffer a byte buffer containing the XMP packet
	 * @param options the parsing options
	 * @return Returns an XML DOM-Document.
	 * @throws XMPException Thrown when the parsing fails.
	 */
	private static Document parseXmlFromBytebuffer(ByteBuffer buffer, ParseOptions options)
		throws XMPException
	{
		InputSource source = new InputSource(buffer.getByteStream());
		try
		{
			return parseInputSource(source);
		}
		catch (XMPException e)
		{
			if (e.getErrorCode() == XMPError.BADXML  ||
				e.getErrorCode() == XMPError.BADSTREAM)
			{
				if (options.getAcceptLatin1())
				{
					buffer = Latin1Converter.convert(buffer);
				}

				if (options.getFixControlChars())
				{
					try
					{
						String encoding = buffer.getEncoding();
						Reader fixReader = new FixASCIIControlsReader(
							new InputStreamReader(
								buffer.getByteStream(), encoding));
						return parseInputSource(new InputSource(fixReader));
					}
					catch (UnsupportedEncodingException e1)
					{
						// can normally not happen as the encoding is provided by a util function
						throw new XMPException("Unsupported Encoding",
								XMPError.INTERNALFAILURE, e);
					}
				}
				source = new InputSource(buffer.getByteStream());
				return parseInputSource(source);
			}
			else
			{
				throw e;
			}
		}
	}


	/**
	 * Parses XML from a {@link String},
	 * fixing the illegal control character optionally.
	 *
	 * @param input a String containing the XMP packet
	 * @param options the parsing options
	 * @return Returns an XML DOM-Document.
	 * @throws XMPException Thrown when the parsing fails.
	 */
	private static Document parseXmlFromString(String input, ParseOptions options)
			throws XMPException
	{
		InputSource source = new InputSource(new StringReader(input));
		try
		{
			return parseInputSource(source);
		}
		catch (XMPException e)
		{
			if (e.getErrorCode() == XMPError.BADXML  &&  options.getFixControlChars())
			{
				source = new InputSource(new FixASCIIControlsReader(new StringReader(input)));
				return parseInputSource(source);
			}
			else
			{
				throw e;
			}
		}
	}


	/**
	 * Runs the XML-Parser.
	 * @param source an InputSource
	 * @return Returns an XML DOM-Document.
	 * @throws XMPException Wraps parsing and I/O-exceptions into an XMPException.
	 */
	private static Document parseInputSource(InputSource source) throws XMPException
	{
		try
		{
			DocumentBuilder builder = factory.newDocumentBuilder();
			builder.setErrorHandler(null);
			return builder.parse(source);
		}
		catch (SAXException e)
		{
			throw new XMPException("XML parsing failure", XMPError.BADXML, e);
		}
		catch (ParserConfigurationException e)
		{
			throw new XMPException("XML Parser not correctly configured",
					XMPError.UNKNOWN, e);
		}
		catch (IOException e)
		{
			throw new XMPException("Error reading the XML-file", XMPError.BADSTREAM, e);
		}
	}


	/**
	 * Find the XML node that is the root of the XMP data tree. Generally this
	 * will be an outer node, but it could be anywhere if a general XML document
	 * is parsed (e.g. SVG). The XML parser counted all rdf:RDF and
	 * pxmp:XMP_Packet nodes, and kept a pointer to the last one. If there is
	 * more than one possible root use PickBestRoot to choose among them.
	 * 

	 * If there is a root node, try to extract the version of the previous XMP
	 * toolkit.
	 * 

	 * Pick the first x:xmpmeta among multiple root candidates. If there aren't
	 * any, pick the first bare rdf:RDF if that is allowed. The returned root is
	 * the rdf:RDF child if an x:xmpmeta element was chosen. The search is
	 * breadth first, so a higher level candiate is chosen over a lower level
	 * one that was textually earlier in the serialized XML.
	 *
	 * @param root the root of the xml document
	 * @param xmpmetaRequired flag if the xmpmeta-tag is still required, might be set
	 * 		initially to true, if the parse option "REQUIRE_XMP_META" is set
	 * @param result The result array that is filled during the recursive process.
	 * @return Returns an array that contains the result or null.
	 * 		   The array contains:
	 * 

	 * 		[0] - the rdf:RDF-node
	 * 		
[1] - an object that is either XMP_RDF or XMP_PLAIN (the latter is decrecated)
	 * 		
[2] - the body text of the xpacket-instruction.
	 * 
	 *
	 */
	private static Object[] findRootNode(Node root, boolean xmpmetaRequired, Object[] result)
	{
		// Look among this parent's content for x:xapmeta or x:xmpmeta.
		// The recursion for x:xmpmeta is broader than the strictly defined choice,
		// but gives us smaller code.
		NodeList children = root.getChildNodes();
		for (int i = 0; i < children.getLength(); i++)
		{
			root = children.item(i);
			if (Node.PROCESSING_INSTRUCTION_NODE == root.getNodeType()  &&
				XMPConst.XMP_PI.equals(((ProcessingInstruction) root).getTarget()))

			{
				// Store the processing instructions content
				if (result != null)
				{
					result[2] = ((ProcessingInstruction) root).getData();
				}
			}
			else if (Node.TEXT_NODE != root.getNodeType()  &&
				Node.PROCESSING_INSTRUCTION_NODE != root.getNodeType())
			{
				String rootNS = root.getNamespaceURI();
				String rootLocal = root.getLocalName();
				if (
						(
							XMPConst.TAG_XMPMETA.equals(rootLocal)  ||
							XMPConst.TAG_XAPMETA.equals(rootLocal)
						)  &&
						XMPConst.NS_X.equals(rootNS)
				   )
				{
					// by not passing the RequireXMPMeta-option, the rdf-Node will be valid
					return findRootNode(root, false, result);
				}
				else if (!xmpmetaRequired  &&
						"RDF".equals(rootLocal)  &&
						 XMPConst.NS_RDF.equals(rootNS))
				{
					if (result != null)
					{
						result[0] = root;
						result[1] = XMP_RDF;
					}
					return result;
				}
				else
				{
					// continue searching
					Object[] newResult = findRootNode(root, xmpmetaRequired, result);
					if (newResult != null)
					{
						return newResult;
					}
					else
					{
						continue;
					}
				}
			}
		}

		// no appropriate node has been found
		return null;
		//     is extracted here in the C++ Toolkit
	}


	/**
	 * @return Creates, configures and returnes the document builder factory for
	 *         the Metadata Parser.
	 */
	private static DocumentBuilderFactory createDocumentBuilderFactory()
	{
		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
		factory.setNamespaceAware(true);
		factory.setIgnoringComments(true);

		try
		{
			// honor System parsing limits, e.g.
			// System.setProperty("entityExpansionLimit", "10");
			factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);

            //Security stuff. Protecting against XEE attacks as described here: https://www.owasp.org/index.php/XML_External_Entity_%28XXE%29_Processing
            // Xerces 1 - http://xerces.apache.org/xerces-j/features.html#external-general-entities
            // Xerces 2 - http://xerces.apache.org/xerces2-j/features.html#external-general-entities
            factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
            // Xerces 2 only - http://xerces.apache.org/xerces-j/features.html#external-general-entities
            factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false);
            // and these as well, per Timothy Morgan's 2014 paper: "XML Schema, DTD, and Entity Attacks" (see reference below)
            factory.setXIncludeAware(false);
            factory.setExpandEntityReferences(false);

		}
		catch (Exception e)
		{
			// Ignore IllegalArgumentException and ParserConfigurationException
			// in case the configured XML-Parser does not implement the feature.
		}
		return factory;
	}
}