All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.itextpdf.xmp.impl.XMPMetaParser Maven / Gradle / Ivy

//Copyright (c) 2006, Adobe Systems Incorporated
//All rights reserved.
//
//        Redistribution and use in source and binary forms, with or without
//        modification, are permitted provided that the following conditions are met:
//        1. Redistributions of source code must retain the above copyright
//        notice, this list of conditions and the following disclaimer.
//        2. Redistributions in binary form must reproduce the above copyright
//        notice, this list of conditions and the following disclaimer in the
//        documentation and/or other materials provided with the distribution.
//        3. All advertising materials mentioning features or use of this software
//        must display the following acknowledgement:
//        This product includes software developed by the Adobe Systems Incorporated.
//        4. Neither the name of the Adobe Systems Incorporated nor the
//        names of its contributors may be used to endorse or promote products
//        derived from this software without specific prior written permission.
//
//        THIS SOFTWARE IS PROVIDED BY ADOBE SYSTEMS INCORPORATED ''AS IS'' AND ANY
//        EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
//        WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
//        DISCLAIMED. IN NO EVENT SHALL ADOBE SYSTEMS INCORPORATED BE LIABLE FOR ANY
//        DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
//        (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
//        LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
//        ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
//        (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
//        SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//        http://www.adobe.com/devnet/xmp/library/eula-xmp-library-java.html

package com.itextpdf.xmp.impl;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.ProcessingInstruction;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.itextpdf.xmp.XMPConst;
import com.itextpdf.xmp.XMPError;
import com.itextpdf.xmp.XMPException;
import com.itextpdf.xmp.XMPMeta;
import com.itextpdf.xmp.options.ParseOptions;


/**
 * This class replaces the ExpatAdapter.cpp and does the
 * XML-parsing and fixes the prefix. After the parsing several normalisations
 * are applied to the XMPTree.
 *
 * @since 01.02.2006
 */
public class XMPMetaParser
{
	/**  */
	private static final Object XMP_RDF = new Object();
	/** the DOM Parser Factory, options are set */
	private static DocumentBuilderFactory factory = createDocumentBuilderFactory();

	/**
	 * Hidden constructor, initialises the SAX parser handler.
	 */
	private XMPMetaParser()
	{
		// EMPTY
	}



	/**
	 * Parses the input source into an XMP metadata object, including
	 * de-aliasing and normalisation.
	 *
	 * @param input the input can be an InputStream, a String or
	 * 			a byte buffer containing the XMP packet.
	 * @param options the parse options
	 * @return Returns the resulting XMP metadata object
	 * @throws XMPException Thrown if parsing or normalisation fails.
	 */
	public static XMPMeta parse(Object input, ParseOptions options) throws XMPException
	{
		ParameterAsserts.assertNotNull(input);
		options = options != null ? options : new ParseOptions();

		Document document = parseXml(input, options);

		boolean xmpmetaRequired = options.getRequireXMPMeta();
		Object[] result = new Object[3];
		result = findRootNode(document, xmpmetaRequired, result);

		if (result != null  &&  result[1] == XMP_RDF)
		{
			XMPMetaImpl xmp = ParseRDF.parse((Node) result[0]);
			xmp.setPacketHeader((String) result[2]);

			// Check if the XMP object shall be normalized
			if (!options.getOmitNormalization())
			{
				return XMPNormalizer.process(xmp, options);
			}
			else
			{
				return xmp;
			}
		}
		else
		{
			// no appropriate root node found, return empty metadata object
			return new XMPMetaImpl();
		}
	}


	/**
	 * Parses the raw XML metadata packet considering the parsing options.
	 * Latin-1/ISO-8859-1 can be accepted when the input is a byte stream
	 * (some old toolkits versions such packets). The stream is
	 * then wrapped in another stream that converts Latin-1 to UTF-8.
	 * 

* If control characters shall be fixed, a reader is used that fixes the chars to spaces * (if the input is a byte stream is has to be read as character stream). *

* Both options reduce the performance of the parser. * * @param input the input can be an InputStream, a String or * a byte buffer containing the XMP packet. * @param options the parsing options * @return Returns the parsed XML document or an exception. * @throws XMPException Thrown if the parsing fails for different reasons */ private static Document parseXml(Object input, ParseOptions options) throws XMPException { if (input instanceof InputStream) { return parseXmlFromInputStream((InputStream) input, options); } else if (input instanceof byte[]) { return parseXmlFromBytebuffer(new ByteBuffer((byte[]) input), options); } else { return parseXmlFromString((String) input, options); } } /** * Parses XML from an {@link InputStream}, * fixing the encoding (Latin-1 to UTF-8) and illegal control character optionally. * * @param stream an InputStream * @param options the parsing options * @return Returns an XML DOM-Document. * @throws XMPException Thrown when the parsing fails. */ private static Document parseXmlFromInputStream(InputStream stream, ParseOptions options) throws XMPException { if (!options.getAcceptLatin1() && !options.getFixControlChars()) { return parseInputSource(new InputSource(stream)); } else { // load stream into bytebuffer try { ByteBuffer buffer = new ByteBuffer(stream); return parseXmlFromBytebuffer(buffer, options); } catch (IOException e) { throw new XMPException("Error reading the XML-file", XMPError.BADSTREAM, e); } } } /** * Parses XML from a byte buffer, * fixing the encoding (Latin-1 to UTF-8) and illegal control character optionally. * * @param buffer a byte buffer containing the XMP packet * @param options the parsing options * @return Returns an XML DOM-Document. * @throws XMPException Thrown when the parsing fails. */ private static Document parseXmlFromBytebuffer(ByteBuffer buffer, ParseOptions options) throws XMPException { InputSource source = new InputSource(buffer.getByteStream()); try { return parseInputSource(source); } catch (XMPException e) { if (e.getErrorCode() == XMPError.BADXML || e.getErrorCode() == XMPError.BADSTREAM) { if (options.getAcceptLatin1()) { buffer = Latin1Converter.convert(buffer); } if (options.getFixControlChars()) { try { String encoding = buffer.getEncoding(); Reader fixReader = new FixASCIIControlsReader( new InputStreamReader( buffer.getByteStream(), encoding)); return parseInputSource(new InputSource(fixReader)); } catch (UnsupportedEncodingException e1) { // can normally not happen as the encoding is provided by a util function throw new XMPException("Unsupported Encoding", XMPError.INTERNALFAILURE, e); } } source = new InputSource(buffer.getByteStream()); return parseInputSource(source); } else { throw e; } } } /** * Parses XML from a {@link String}, * fixing the illegal control character optionally. * * @param input a String containing the XMP packet * @param options the parsing options * @return Returns an XML DOM-Document. * @throws XMPException Thrown when the parsing fails. */ private static Document parseXmlFromString(String input, ParseOptions options) throws XMPException { InputSource source = new InputSource(new StringReader(input)); try { return parseInputSource(source); } catch (XMPException e) { if (e.getErrorCode() == XMPError.BADXML && options.getFixControlChars()) { source = new InputSource(new FixASCIIControlsReader(new StringReader(input))); return parseInputSource(source); } else { throw e; } } } /** * Runs the XML-Parser. * @param source an InputSource * @return Returns an XML DOM-Document. * @throws XMPException Wraps parsing and I/O-exceptions into an XMPException. */ private static Document parseInputSource(InputSource source) throws XMPException { try { DocumentBuilder builder = factory.newDocumentBuilder(); builder.setErrorHandler(null); return builder.parse(source); } catch (SAXException e) { throw new XMPException("XML parsing failure", XMPError.BADXML, e); } catch (ParserConfigurationException e) { throw new XMPException("XML Parser not correctly configured", XMPError.UNKNOWN, e); } catch (IOException e) { throw new XMPException("Error reading the XML-file", XMPError.BADSTREAM, e); } } /** * Find the XML node that is the root of the XMP data tree. Generally this * will be an outer node, but it could be anywhere if a general XML document * is parsed (e.g. SVG). The XML parser counted all rdf:RDF and * pxmp:XMP_Packet nodes, and kept a pointer to the last one. If there is * more than one possible root use PickBestRoot to choose among them. *

* If there is a root node, try to extract the version of the previous XMP * toolkit. *

* Pick the first x:xmpmeta among multiple root candidates. If there aren't * any, pick the first bare rdf:RDF if that is allowed. The returned root is * the rdf:RDF child if an x:xmpmeta element was chosen. The search is * breadth first, so a higher level candiate is chosen over a lower level * one that was textually earlier in the serialized XML. * * @param root the root of the xml document * @param xmpmetaRequired flag if the xmpmeta-tag is still required, might be set * initially to true, if the parse option "REQUIRE_XMP_META" is set * @param result The result array that is filled during the recursive process. * @return Returns an array that contains the result or null. * The array contains: *

    *
  • [0] - the rdf:RDF-node *
  • [1] - an object that is either XMP_RDF or XMP_PLAIN (the latter is decrecated) *
  • [2] - the body text of the xpacket-instruction. *
* */ private static Object[] findRootNode(Node root, boolean xmpmetaRequired, Object[] result) { // Look among this parent's content for x:xapmeta or x:xmpmeta. // The recursion for x:xmpmeta is broader than the strictly defined choice, // but gives us smaller code. NodeList children = root.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { root = children.item(i); if (Node.PROCESSING_INSTRUCTION_NODE == root.getNodeType() && XMPConst.XMP_PI.equals(((ProcessingInstruction) root).getTarget())) { // Store the processing instructions content if (result != null) { result[2] = ((ProcessingInstruction) root).getData(); } } else if (Node.TEXT_NODE != root.getNodeType() && Node.PROCESSING_INSTRUCTION_NODE != root.getNodeType()) { String rootNS = root.getNamespaceURI(); String rootLocal = root.getLocalName(); if ( ( XMPConst.TAG_XMPMETA.equals(rootLocal) || XMPConst.TAG_XAPMETA.equals(rootLocal) ) && XMPConst.NS_X.equals(rootNS) ) { // by not passing the RequireXMPMeta-option, the rdf-Node will be valid return findRootNode(root, false, result); } else if (!xmpmetaRequired && "RDF".equals(rootLocal) && XMPConst.NS_RDF.equals(rootNS)) { if (result != null) { result[0] = root; result[1] = XMP_RDF; } return result; } else { // continue searching Object[] newResult = findRootNode(root, xmpmetaRequired, result); if (newResult != null) { return newResult; } else { continue; } } } } // no appropriate node has been found return null; // is extracted here in the C++ Toolkit } /** * @return Creates, configures and returnes the document builder factory for * the Metadata Parser. */ private static DocumentBuilderFactory createDocumentBuilderFactory() { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); factory.setIgnoringComments(true); try { // honor System parsing limits, e.g. // System.setProperty("entityExpansionLimit", "10"); factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); //Security stuff. Protecting against XEE attacks as described here: https://www.owasp.org/index.php/XML_External_Entity_%28XXE%29_Processing // Xerces 1 - http://xerces.apache.org/xerces-j/features.html#external-general-entities // Xerces 2 - http://xerces.apache.org/xerces2-j/features.html#external-general-entities factory.setFeature("http://xml.org/sax/features/external-general-entities", false); // Xerces 2 only - http://xerces.apache.org/xerces-j/features.html#external-general-entities factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false); // and these as well, per Timothy Morgan's 2014 paper: "XML Schema, DTD, and Entity Attacks" (see reference below) factory.setXIncludeAware(false); factory.setExpandEntityReferences(false); } catch (Exception e) { // Ignore IllegalArgumentException and ParserConfigurationException // in case the configured XML-Parser does not implement the feature. } return factory; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy