// com.itextpdf.xmp.impl.XMPMetaParser (Maven / Gradle / Ivy)
//Copyright (c) 2006, Adobe Systems Incorporated
//All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// 3. All advertising materials mentioning features or use of this software
// must display the following acknowledgement:
// This product includes software developed by the Adobe Systems Incorporated.
// 4. Neither the name of the Adobe Systems Incorporated nor the
// names of its contributors may be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY ADOBE SYSTEMS INCORPORATED ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL ADOBE SYSTEMS INCORPORATED BE LIABLE FOR ANY
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// http://www.adobe.com/devnet/xmp/library/eula-xmp-library-java.html
package com.itextpdf.xmp.impl;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.ProcessingInstruction;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import com.itextpdf.xmp.XMPConst;
import com.itextpdf.xmp.XMPError;
import com.itextpdf.xmp.XMPException;
import com.itextpdf.xmp.XMPMeta;
import com.itextpdf.xmp.options.ParseOptions;
/**
 * Parses serialized XMP packets into {@link XMPMeta} objects.
 * <p>
 * This class replaces the <code>ExpatAdapter.cpp</code> of the C++ toolkit: it runs the
 * XML parser, searches the DOM for the <code>rdf:RDF</code> root of the XMP data and
 * passes that node to {@link ParseRDF}. Unless the parse options ask to omit it,
 * the resulting tree is normalised by {@link XMPNormalizer} afterwards.
 *
 * @since 01.02.2006
 */
public class XMPMetaParser
{
    /** Marker stored in the result array of {@link #findRootNode} when an rdf:RDF root was found. */
    private static final Object XMP_RDF = new Object();
    /** The DOM parser factory; configured once (namespace aware, hardened against XXE). */
    private static final DocumentBuilderFactory factory = createDocumentBuilderFactory();


    /**
     * Hidden constructor; the class only offers static methods.
     */
    private XMPMetaParser()
    {
        // EMPTY
    }


    /**
     * Parses the input source into an XMP metadata object, including
     * de-aliasing and normalisation.
     *
     * @param input the input can be an <code>InputStream</code>, a <code>String</code> or
     *        a byte buffer (<code>byte[]</code>) containing the XMP packet.
     * @param options the parse options; <code>null</code> selects the default options
     * @return Returns the resulting XMP metadata object. If no suitable root node is
     *         found, an empty metadata object is returned.
     * @throws XMPException Thrown if parsing or normalisation fails.
     */
    public static XMPMeta parse(Object input, ParseOptions options) throws XMPException
    {
        ParameterAsserts.assertNotNull(input);
        if (options == null)
        {
            options = new ParseOptions();
        }

        Document document = parseXml(input, options);

        // slots: [0] rdf:RDF node, [1] XMP_RDF marker, [2] xpacket instruction data
        Object[] result = findRootNode(document, options.getRequireXMPMeta(), new Object[3]);
        if (result == null || result[1] != XMP_RDF)
        {
            // no appropriate root node found, return empty metadata object
            return new XMPMetaImpl();
        }

        XMPMetaImpl xmp = ParseRDF.parse((Node) result[0]);
        xmp.setPacketHeader((String) result[2]);

        // Check if the XMP object shall be normalized
        if (options.getOmitNormalization())
        {
            return xmp;
        }
        return XMPNormalizer.process(xmp, options);
    }


    /**
     * Parses the raw XML metadata packet considering the parsing options.
     * Latin-1/ISO-8859-1 can be accepted when the input is a byte stream
     * (some old toolkit versions wrote such packets). The stream is
     * then wrapped in another stream that converts Latin-1 to UTF-8.
     * <p>
     * If control characters shall be fixed, a reader is used that fixes the chars to spaces
     * (if the input is a byte stream it has to be read as character stream).
     * <p>
     * Both options reduce the performance of the parser.
     *
     * @param input the input can be an <code>InputStream</code>, a <code>String</code> or
     *        a byte buffer (<code>byte[]</code>) containing the XMP packet. Any input that
     *        is neither a stream nor a byte array is cast to <code>String</code>.
     * @param options the parsing options
     * @return Returns the parsed XML document or an exception.
     * @throws XMPException Thrown if the parsing fails for different reasons
     */
    private static Document parseXml(Object input, ParseOptions options)
        throws XMPException
    {
        if (input instanceof InputStream)
        {
            return parseXmlFromInputStream((InputStream) input, options);
        }
        if (input instanceof byte[])
        {
            return parseXmlFromBytebuffer(new ByteBuffer((byte[]) input), options);
        }
        return parseXmlFromString((String) input, options);
    }


    /**
     * Parses XML from an {@link InputStream},
     * fixing the encoding (Latin-1 to UTF-8) and illegal control character optionally.
     *
     * @param stream an <code>InputStream</code>
     * @param options the parsing options
     * @return Returns an XML DOM-Document.
     * @throws XMPException Thrown when the parsing fails.
     */
    private static Document parseXmlFromInputStream(InputStream stream, ParseOptions options)
        throws XMPException
    {
        if (!options.getAcceptLatin1() && !options.getFixControlChars())
        {
            // no recovery option requested: the stream can be parsed directly
            return parseInputSource(new InputSource(stream));
        }

        // a retry needs to read the data a second time, so load the stream into a byte buffer
        try
        {
            ByteBuffer buffer = new ByteBuffer(stream);
            return parseXmlFromBytebuffer(buffer, options);
        }
        catch (IOException e)
        {
            throw new XMPException("Error reading the XML-file",
                    XMPError.BADSTREAM, e);
        }
    }


    /**
     * Parses XML from a byte buffer,
     * fixing the encoding (Latin-1 to UTF-8) and illegal control character optionally.
     * <p>
     * The buffer is first parsed as is. Only if that attempt fails with
     * <code>BADXML</code> or <code>BADSTREAM</code> and at least one of the recovery
     * options is enabled, the buffer is converted and/or read through a
     * control-character fixing reader and parsed a second time.
     *
     * @param buffer a byte buffer containing the XMP packet
     * @param options the parsing options
     * @return Returns an XML DOM-Document.
     * @throws XMPException Thrown when the parsing fails.
     */
    private static Document parseXmlFromBytebuffer(ByteBuffer buffer, ParseOptions options)
        throws XMPException
    {
        try
        {
            return parseInputSource(new InputSource(buffer.getByteStream()));
        }
        catch (XMPException e)
        {
            boolean retryable = e.getErrorCode() == XMPError.BADXML ||
                    e.getErrorCode() == XMPError.BADSTREAM;
            boolean acceptLatin1 = options.getAcceptLatin1();
            boolean fixControlChars = options.getFixControlChars();
            if (!retryable || (!acceptLatin1 && !fixControlChars))
            {
                // nothing would change on a second attempt, so report the first failure
                throw e;
            }

            if (acceptLatin1)
            {
                buffer = Latin1Converter.convert(buffer);
            }

            if (fixControlChars)
            {
                try
                {
                    String encoding = buffer.getEncoding();
                    Reader fixReader = new FixASCIIControlsReader(
                        new InputStreamReader(
                            buffer.getByteStream(), encoding));
                    return parseInputSource(new InputSource(fixReader));
                }
                catch (UnsupportedEncodingException e1)
                {
                    // can normally not happen as the encoding is provided by a util function
                    throw new XMPException("Unsupported Encoding",
                            XMPError.INTERNALFAILURE, e1);
                }
            }

            return parseInputSource(new InputSource(buffer.getByteStream()));
        }
    }


    /**
     * Parses XML from a {@link String},
     * fixing the illegal control character optionally.
     *
     * @param input a <code>String</code> containing the XMP packet
     * @param options the parsing options
     * @return Returns an XML DOM-Document.
     * @throws XMPException Thrown when the parsing fails.
     */
    private static Document parseXmlFromString(String input, ParseOptions options)
        throws XMPException
    {
        try
        {
            return parseInputSource(new InputSource(new StringReader(input)));
        }
        catch (XMPException e)
        {
            if (e.getErrorCode() != XMPError.BADXML || !options.getFixControlChars())
            {
                throw e;
            }
            // second attempt, this time reading through the control-character fixing reader
            Reader fixReader = new FixASCIIControlsReader(new StringReader(input));
            return parseInputSource(new InputSource(fixReader));
        }
    }


    /**
     * Runs the XML-Parser.
     *
     * @param source an <code>InputSource</code>
     * @return Returns an XML DOM-Document.
     * @throws XMPException Wraps parsing and I/O-exceptions into an XMPException.
     */
    private static Document parseInputSource(InputSource source) throws XMPException
    {
        try
        {
            DocumentBuilder builder = factory.newDocumentBuilder();
            // null selects the parser implementation's default error handling
            builder.setErrorHandler(null);
            return builder.parse(source);
        }
        catch (SAXException e)
        {
            throw new XMPException("XML parsing failure", XMPError.BADXML, e);
        }
        catch (ParserConfigurationException e)
        {
            throw new XMPException("XML Parser not correctly configured",
                    XMPError.UNKNOWN, e);
        }
        catch (IOException e)
        {
            throw new XMPException("Error reading the XML-file", XMPError.BADSTREAM, e);
        }
    }


    /**
     * Find the XML node that is the root of the XMP data tree. Generally this
     * will be an outer node, but it could be anywhere if a general XML document
     * is parsed (e.g. SVG).
     * <p>
     * The children of <code>root</code> are visited in document order and the search
     * descends recursively into every child that is not itself a candidate:
     * <ul>
     * <li>an <code>xpacket</code> processing instruction has its data stored in the result,</li>
     * <li>an <code>x:xmpmeta</code> (or <code>x:xapmeta</code>) element is searched for an
     *     <code>rdf:RDF</code> node with the xmpmeta requirement lifted; the outcome of that
     *     search is returned as is,</li>
     * <li>an <code>rdf:RDF</code> element is accepted if <code>x:xmpmeta</code> is not
     *     (or no longer) required.</li>
     * </ul>
     *
     * @param root the root of the xml document
     * @param xmpmetaRequired flag if the xmpmeta-tag is still required, might be set
     *        initially to <code>true</code>, if the parse option "REQUIRE_XMP_META" is set
     * @param result The result array that is filled during the recursive process.
     * @return Returns an array that contains the result or <code>null</code>.
     *         The array contains:
     *         <ul>
     *         <li>[0] - the rdf:RDF-node</li>
     *         <li>[1] - the marker object <code>XMP_RDF</code></li>
     *         <li>[2] - the body text of the xpacket-instruction.</li>
     *         </ul>
     */
    private static Object[] findRootNode(Node root, boolean xmpmetaRequired, Object[] result)
    {
        // Look among this parent's content for x:xapmeta or x:xmpmeta.
        // The recursion for x:xmpmeta is broader than the strictly defined choice,
        // but gives us smaller code.
        NodeList children = root.getChildNodes();
        for (int i = 0; i < children.getLength(); i++)
        {
            Node child = children.item(i);
            short type = child.getNodeType();

            if (Node.PROCESSING_INSTRUCTION_NODE == type)
            {
                ProcessingInstruction pi = (ProcessingInstruction) child;
                if (XMPConst.XMP_PI.equals(pi.getTarget()) && result != null)
                {
                    // Store the processing instructions content
                    result[2] = pi.getData();
                }
                continue;
            }
            if (Node.TEXT_NODE == type)
            {
                continue;
            }

            String ns = child.getNamespaceURI();
            String local = child.getLocalName();

            boolean isXmpMeta =
                (XMPConst.TAG_XMPMETA.equals(local) || XMPConst.TAG_XAPMETA.equals(local)) &&
                XMPConst.NS_X.equals(ns);
            if (isXmpMeta)
            {
                // by not passing the RequireXMPMeta-option, the rdf-Node will be valid
                return findRootNode(child, false, result);
            }

            boolean isRdf = "RDF".equals(local) && XMPConst.NS_RDF.equals(ns);
            if (!xmpmetaRequired && isRdf)
            {
                if (result != null)
                {
                    result[0] = child;
                    result[1] = XMP_RDF;
                }
                return result;
            }

            // neither candidate matched: continue searching below this child
            Object[] nested = findRootNode(child, xmpmetaRequired, result);
            if (nested != null)
            {
                return nested;
            }
        }

        // no appropriate node has been found
        return null;
    }


    /**
     * Creates and configures the document builder factory for the metadata parser.
     * <p>
     * The factory is namespace aware and ignores comments. All security related
     * settings are applied best-effort and independently of each other, so that a
     * parser implementation that does not know one feature still receives the
     * remaining ones.
     *
     * @return Returns the configured document builder factory.
     */
    private static DocumentBuilderFactory createDocumentBuilderFactory()
    {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setNamespaceAware(true);
        dbf.setIgnoringComments(true);

        // honor System parsing limits, e.g.
        // System.setProperty("entityExpansionLimit", "10");
        setFeatureIfSupported(dbf, XMLConstants.FEATURE_SECURE_PROCESSING, true);

        // Protection against XXE attacks (XML External Entity processing):
        // never resolve external general or parameter entities and never fetch an
        // external DTD.
        setFeatureIfSupported(dbf,
                "http://xml.org/sax/features/external-general-entities", false);
        setFeatureIfSupported(dbf,
                "http://xml.org/sax/features/external-parameter-entities", false);
        setFeatureIfSupported(dbf,
                "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

        // Xerces 2 only. The value false keeps DOCTYPE declarations ALLOWED, so this
        // line is no protection by itself; the entity features above are what block XXE.
        // NOTE(review): switching to true would reject every document that carries a
        // DOCTYPE (e.g. many SVG files) - confirm before tightening.
        setFeatureIfSupported(dbf,
                "http://apache.org/xml/features/disallow-doctype-decl", false);

        // and these as well, per Timothy Morgan's 2014 paper
        // "XML Schema, DTD, and Entity Attacks"
        dbf.setExpandEntityReferences(false);
        try
        {
            dbf.setXIncludeAware(false);
        }
        catch (RuntimeException ignored)
        {
            // e.g. UnsupportedOperationException of parsers that predate XInclude support
        }

        return dbf;
    }


    /**
     * Sets a single parser feature and ignores parsers that do not implement it.
     *
     * @param dbf the factory to configure
     * @param feature the feature name (URI)
     * @param value the value to set
     */
    private static void setFeatureIfSupported(DocumentBuilderFactory dbf, String feature,
            boolean value)
    {
        try
        {
            dbf.setFeature(feature, value);
        }
        catch (Exception ignored)
        {
            // Ignore IllegalArgumentException and ParserConfigurationException
            // in case the configured XML-Parser does not implement the feature.
        }
    }
}