// Source listing of com.adobe.internal.xmp.impl.XMPMetaParser (artifact: xmpcore).
// =================================================================================================
// ADOBE SYSTEMS INCORPORATED
// Copyright 2006 Adobe Systems Incorporated
// All Rights Reserved
//
// NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms
// of the Adobe license agreement accompanying it.
// =================================================================================================
package com.adobe.internal.xmp.impl;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.ProcessingInstruction;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import com.adobe.internal.xmp.XMPConst;
import com.adobe.internal.xmp.XMPError;
import com.adobe.internal.xmp.XMPException;
import com.adobe.internal.xmp.XMPMeta;
import com.adobe.internal.xmp.options.ParseOptions;
/**
 * Parses XMP packets into {@link XMPMeta} objects. This class replaces the
 * <code>ExpatAdapter.cpp</code> of the C++ toolkit: it runs the XML parser, searches the
 * resulting DOM for the <code>rdf:RDF</code> root, passes that node to the RDF parser and,
 * unless the parse options say otherwise, normalises the resulting XMP tree.
 *
 * @author Stefan Makswit
 * @version $Revision$
 * @since 01.02.2006
 */
public class XMPMetaParser
{
    /** Marker stored in the root search result when an <code>rdf:RDF</code> node was chosen. */
    private static final Object XMP_RDF = new Object();

    // The Plain XMP format (pxmp:XMP_Packet) is disabled, therefore no XMP_PLAIN marker exists.

    /** Parser feature that rejects every document containing a DOCTYPE declaration. */
    private static final String FEATURE_DISALLOW_DOCTYPE =
        "http://apache.org/xml/features/disallow-doctype-decl";

    /** Beginning of the parser message that is checked to recognise a rejected DOCTYPE. */
    private static final String DOCTYPE_DISALLOWED_MESSAGE = "DOCTYPE is disallowed";

    /** The DOM parser factory shared by all parse calls; it is configured once at class load. */
    private static final DocumentBuilderFactory factory = createDocumentBuilderFactory();


    /**
     * Hidden constructor; the class only offers static methods.
     */
    private XMPMetaParser()
    {
        // EMPTY
    }


    /**
     * Parses the input source into an XMP metadata object, including
     * de-aliasing and normalisation.
     *
     * @param input the input can be an <code>InputStream</code>, a <code>String</code> or
     *        a byte buffer (<code>byte[]</code>) containing the XMP packet.
     * @param options the parse options; <code>null</code> selects a default
     *        <code>ParseOptions</code> instance
     * @return Returns the resulting XMP metadata object. An empty metadata object is
     *         returned when no suitable <code>rdf:RDF</code> root node is found.
     * @throws XMPException Thrown if parsing or normalisation fails.
     */
    public static XMPMeta parse(Object input, ParseOptions options) throws XMPException
    {
        ParameterAsserts.assertNotNull(input);
        options = options != null ? options : new ParseOptions();

        Document document = parseXml(input, options);

        Object[] result = findRootNode(document, options.getRequireXMPMeta(), new Object[3]);
        if (result == null || result[1] != XMP_RDF)
        {
            // no appropriate root node found, return empty metadata object
            return new XMPMetaImpl();
        }

        XMPMetaImpl xmp = ParseRDF.parse((Node) result[0], options);
        xmp.setPacketHeader((String) result[2]);

        // Check if the XMP object shall be normalized
        if (options.getOmitNormalization())
        {
            return xmp;
        }
        return XMPNormalizer.process(xmp, options);
    }


    /**
     * Parses the raw XML metadata packet considering the parsing options.
     * Latin-1/ISO-8859-1 can be accepted when the input is a byte stream
     * (some old toolkit versions wrote such packets). The buffer is then
     * converted from Latin-1 to UTF-8 before it is parsed again.
     * <p>
     * If control characters shall be fixed, a reader is used that fixes the chars to spaces
     * (if the input is a byte stream it has to be read as character stream).
     * <p>
     * Both options reduce the performance of the parser.
     * <p>
     * Any input that is neither an <code>InputStream</code> nor a <code>byte[]</code> is
     * cast to <code>String</code>, so other types fail with a <code>ClassCastException</code>.
     *
     * @param input the input can be an <code>InputStream</code>, a <code>String</code> or
     *        a byte buffer containing the XMP packet.
     * @param options the parsing options
     * @return Returns the parsed XML document.
     * @throws XMPException Thrown if the parsing fails for different reasons
     */
    private static Document parseXml(Object input, ParseOptions options)
        throws XMPException
    {
        if (input instanceof InputStream)
        {
            return parseXmlFromInputStream((InputStream) input, options);
        }
        else if (input instanceof byte[])
        {
            return parseXmlFromBytebuffer(new ByteBuffer((byte[]) input), options);
        }
        else
        {
            return parseXmlFromString((String) input, options);
        }
    }


    /**
     * Parses XML from an {@link InputStream},
     * fixing the encoding (Latin-1 to UTF-8) and illegal control characters optionally.
     * The stream is handed to the parser directly only when none of the options
     * "accept Latin-1", "fix control chars" and "disallow DOCTYPE" is set; otherwise it is
     * loaded into a byte buffer first so that it can be parsed more than once.
     *
     * @param stream an <code>InputStream</code>
     * @param options the parsing options
     * @return Returns an XML DOM-Document.
     * @throws XMPException Thrown when the parsing fails.
     */
    private static Document parseXmlFromInputStream(InputStream stream, ParseOptions options)
        throws XMPException
    {
        boolean needsBuffer = options.getAcceptLatin1()
            || options.getFixControlChars()
            || options.getDisallowDoctype();
        if (!needsBuffer)
        {
            return parseInputSource(new InputSource(stream));
        }

        // load stream into ByteBuffer to apply advanced options
        try
        {
            return parseXmlFromBytebuffer(new ByteBuffer(stream), options);
        }
        catch (IOException e)
        {
            throw new XMPException("Error reading the XML-file", XMPError.BADSTREAM, e);
        }
    }


    /**
     * Parses XML from a byte buffer,
     * fixing the encoding (Latin-1 to UTF-8) and illegal control characters optionally.
     * To improve the performance on legal files, it is first tried to parse normally,
     * while the character fixing is only done when the first pass fails.
     * <p>
     * A failure caused by a rejected DOCTYPE is reported immediately and never retried.
     * A failure is also rethrown unchanged when neither repair option is enabled, because
     * parsing the identical bytes a second time cannot succeed.
     *
     * @param buffer a byte buffer containing the XMP packet
     * @param options the parsing options
     * @return Returns an XML DOM-Document.
     * @throws XMPException Thrown when the parsing fails.
     */
    private static Document parseXmlFromBytebuffer(ByteBuffer buffer, ParseOptions options)
        throws XMPException
    {
        try
        {
            try
            {
                requestDoctypeRejection(options);
                return parseInputSource(new InputSource(buffer.getByteStream()));
            }
            catch (XMPException e)
            {
                if (isDoctypeRejection(e))
                {
                    // surface the parser's own message, but keep the cause for diagnostics
                    Throwable cause = e.getCause();
                    throw new XMPException(cause.getMessage(), XMPError.BADXML, cause);
                }

                boolean retryable = e.getErrorCode() == XMPError.BADXML
                    || e.getErrorCode() == XMPError.BADSTREAM;
                boolean repairEnabled = options.getAcceptLatin1() || options.getFixControlChars();
                if (!retryable || !repairEnabled)
                {
                    throw e;
                }

                if (options.getAcceptLatin1())
                {
                    buffer = Latin1Converter.convert(buffer);
                }

                if (options.getFixControlChars())
                {
                    String encoding = buffer.getEncoding();
                    Reader fixReader = new FixASCIIControlsReader(
                        new InputStreamReader(buffer.getByteStream(), encoding));
                    return parseInputSource(new InputSource(fixReader));
                }
                return parseInputSource(new InputSource(buffer.getByteStream()));
            }
        }
        catch (UnsupportedEncodingException e)
        {
            // can normally not happen as the encoding is provided by a util function
            throw new XMPException("Unsupported Encoding", XMPError.INTERNALFAILURE, e);
        }
    }


    /**
     * Parses XML from a {@link String},
     * fixing illegal control characters or disallowing DOCTYPEs optionally.
     * The control character fixing is only applied in a second pass, after the
     * plain parse failed with a <code>BADXML</code> error.
     *
     * @param input a <code>String</code> containing the XMP packet
     * @param options the parsing options
     * @return Returns an XML DOM-Document.
     * @throws XMPException Thrown when the parsing fails.
     */
    private static Document parseXmlFromString(String input, ParseOptions options)
        throws XMPException
    {
        try
        {
            requestDoctypeRejection(options);
            return parseInputSource(new InputSource(new StringReader(input)));
        }
        catch (XMPException e)
        {
            if (e.getErrorCode() == XMPError.BADXML && options.getFixControlChars())
            {
                Reader fixReader = new FixASCIIControlsReader(new StringReader(input));
                return parseInputSource(new InputSource(fixReader));
            }
            throw e;
        }
    }


    /**
     * Runs the XML-Parser.
     *
     * @param source an <code>InputSource</code>
     * @return Returns an XML DOM-Document.
     * @throws XMPException Wraps parsing and I/O-exceptions into an XMPException.
     */
    private static Document parseInputSource(InputSource source) throws XMPException
    {
        try
        {
            DocumentBuilder builder = factory.newDocumentBuilder();
            builder.setErrorHandler(null);
            return builder.parse(source);
        }
        catch (SAXException e)
        {
            throw new XMPException("XML parsing failure", XMPError.BADXML, e);
        }
        catch (ParserConfigurationException e)
        {
            throw new XMPException("XML Parser not correctly configured",
                XMPError.UNKNOWN, e);
        }
        catch (IOException e)
        {
            throw new XMPException("Error reading the XML-file", XMPError.BADSTREAM, e);
        }
    }


    /**
     * Finds the XML node that is the root of the XMP data tree. Generally this
     * will be an outer node, but it could be anywhere if a general XML document
     * is parsed (e.g. SVG).
     * <p>
     * The children of <code>root</code> are visited in document order and the search
     * descends into each child before its following siblings are examined (depth first).
     * <ul>
     * <li>The data of an xpacket processing instruction is stored in <code>result[2]</code>.</li>
     * <li>An <code>x:xmpmeta</code> or <code>x:xapmeta</code> element is searched with the
     *     "xmpmeta required" flag cleared, and the outcome of that search is returned
     *     even if it is <code>null</code>.</li>
     * <li>An <code>rdf:RDF</code> element is accepted if no xmpmeta element is required
     *     any more.</li>
     * <li>Every other non-text node is searched recursively.</li>
     * </ul>
     *
     * @param root the node whose children are searched, initially the XML document
     * @param xmpmetaRequired flag if the xmpmeta-tag is still required, might be set
     *        initially to <code>true</code>, if the parse option "REQUIRE_XMP_META" is set
     * @param result The result array that is filled during the recursive process.
     * @return Returns an array that contains the result or <code>null</code> if no root
     *         node was found. The array contains:
     *         <ul>
     *         <li>[0] - the rdf:RDF-node</li>
     *         <li>[1] - the marker object XMP_RDF (the Plain XMP marker is disabled)</li>
     *         <li>[2] - the body text of the xpacket-instruction.</li>
     *         </ul>
     */
    private static Object[] findRootNode(Node root, boolean xmpmetaRequired, Object[] result)
    {
        // Look among this parent's content for x:xapmeta or x:xmpmeta.
        // The recursion for x:xmpmeta is broader than the strictly defined choice,
        // but gives us smaller code.
        NodeList children = root.getChildNodes();
        for (int i = 0; i < children.getLength(); i++)
        {
            Node child = children.item(i);
            short type = child.getNodeType();

            if (Node.PROCESSING_INSTRUCTION_NODE == type)
            {
                ProcessingInstruction instruction = (ProcessingInstruction) child;
                if (XMPConst.XMP_PI.equals(instruction.getTarget()) && result != null)
                {
                    // Store the processing instruction's content
                    result[2] = instruction.getData();
                }
                // processing instructions are never root candidates
                continue;
            }
            if (Node.TEXT_NODE == type)
            {
                continue;
            }

            String childNS = child.getNamespaceURI();
            String childLocal = child.getLocalName();

            boolean isXmpMeta = XMPConst.NS_X.equals(childNS)
                && (XMPConst.TAG_XMPMETA.equals(childLocal)
                    || XMPConst.TAG_XAPMETA.equals(childLocal));
            if (isXmpMeta)
            {
                // by not passing the RequireXMPMeta-option, the rdf-Node will be valid
                return findRootNode(child, false, result);
            }

            boolean isRdf = "RDF".equals(childLocal) && XMPConst.NS_RDF.equals(childNS);
            if (!xmpmetaRequired && isRdf)
            {
                if (result != null)
                {
                    result[0] = child;
                    result[1] = XMP_RDF;
                }
                return result;
            }

            // continue searching below this child
            Object[] nested = findRootNode(child, xmpmetaRequired, result);
            if (nested != null)
            {
                return nested;
            }
        }

        // no appropriate node has been found
        // *** the version of the toolkit which generated this packet
        //     is extracted here in the C++ Toolkit
        return null;
    }


    /**
     * Creates, configures and returns the document builder factory for the Metadata Parser.
     * <p>
     * The factory is namespace aware, ignores comments and does not expand entity
     * references. The DTD and external-entity related features are applied one by one, so a
     * parser that does not implement one of them still receives all the others.
     *
     * @return Returns the configured factory.
     */
    private static DocumentBuilderFactory createDocumentBuilderFactory()
    {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setNamespaceAware(true);
        dbf.setIgnoringComments(true);
        dbf.setExpandEntityReferences(false);

        // Reject DOCTYPE declarations completely.
        trySetFeature(dbf, FEATURE_DISALLOW_DOCTYPE, true);

        // If DTDs cannot be disabled completely, at least keep external entities
        // and external DTDs from being loaded.
        // NOTE: earlier revisions also passed Xerces documentation URLs
        // ("http://xerces.apache.org/xerces2-j/features.html#...") as feature names; those
        // are not feature identifiers and, inside one shared try block, their rejection
        // skipped the settings that followed. They are therefore no longer set.
        trySetFeature(dbf, "http://xml.org/sax/features/external-general-entities", false);
        trySetFeature(dbf, "http://xml.org/sax/features/external-parameter-entities", false);
        trySetFeature(dbf, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

        // and this as well, per Timothy Morgan's 2014 paper: "XML Schema,
        // DTD, and Entity Attacks"
        try
        {
            dbf.setXIncludeAware(false);
        }
        catch (RuntimeException e)
        {
            // the configured XML parser does not support XInclude configuration; ignore
        }

        return dbf;
    }


    /**
     * Asks the shared factory to reject DOCTYPE declarations if the parse options request it.
     * The call is best effort: a parser that does not implement the feature is left unchanged.
     *
     * @param options the parsing options
     */
    private static void requestDoctypeRejection(ParseOptions options)
    {
        if (options.getDisallowDoctype())
        {
            trySetFeature(factory, FEATURE_DISALLOW_DOCTYPE, true);
        }
    }


    /**
     * Sets a single parser feature and ignores parsers that do not implement it.
     *
     * @param target the factory to configure
     * @param feature the feature identifier
     * @param value the requested state of the feature
     * @return Returns <code>true</code> if the factory accepted the feature.
     */
    private static boolean trySetFeature(DocumentBuilderFactory target, String feature,
        boolean value)
    {
        try
        {
            target.setFeature(feature, value);
            return true;
        }
        catch (ParserConfigurationException e)
        {
            // the feature is not recognised or not supported by the configured parser
            return false;
        }
        catch (RuntimeException e)
        {
            // e.g. IllegalArgumentException or UnsupportedOperationException
            return false;
        }
        catch (LinkageError e)
        {
            // e.g. AbstractMethodError of a parser built against an older JAXP version
            return false;
        }
    }


    /**
     * Checks whether a parse failure was caused by a rejected DOCTYPE declaration.
     * The check is based on the beginning of the message of the wrapped parser exception.
     *
     * @param e the exception thrown by {@link #parseInputSource(InputSource)}
     * @return Returns <code>true</code> if the cause reports a disallowed DOCTYPE.
     */
    private static boolean isDoctypeRejection(XMPException e)
    {
        Throwable cause = e.getCause();
        if (cause == null)
        {
            return false;
        }
        String message = cause.getMessage();
        return message != null && message.startsWith(DOCTYPE_DISALLOWED_MESSAGE);
    }
}