apoc.load.Xml Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of apoc-core Show documentation
Core package for Neo4j Procedures
There is a newer version: 5.24.0
/*
 * Copyright (c) "Neo4j"
 * Neo4j Sweden AB [http://neo4j.com]
 *
 * This file is part of Neo4j.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package apoc.load;

import static apoc.export.util.LimitedSizeInputStream.toLimitedIStream;
import static apoc.util.CompressionConfig.COMPRESSION;
import static apoc.util.FileUtils.getInputStreamFromBinary;
import static apoc.util.Util.ERROR_BYTES_OR_STRING;
import static apoc.util.Util.getStreamConnection;

import apoc.ApocConfig;
import apoc.export.util.CountingInputStream;
import apoc.generate.config.InvalidConfigException;
import apoc.result.MapResult;
import apoc.result.NodeResult;
import apoc.util.CompressionAlgo;
import apoc.util.CompressionConfig;
import apoc.util.FileUtils;
import apoc.util.StreamConnection;
import java.io.ByteArrayInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.lang3.BooleanUtils;
import org.apache.commons.lang3.StringUtils;
import org.neo4j.graphdb.Label;
import org.neo4j.graphdb.RelationshipType;
import org.neo4j.graphdb.Transaction;
import org.neo4j.graphdb.security.URLAccessChecker;
import org.neo4j.graphdb.security.URLAccessValidationError;
import org.neo4j.logging.Log;
import org.neo4j.procedure.Context;
import org.neo4j.procedure.Description;
import org.neo4j.procedure.Mode;
import org.neo4j.procedure.Name;
import org.neo4j.procedure.Procedure;
import org.neo4j.procedure.TerminationGuard;
import org.neo4j.procedure.UserFunction;
import org.w3c.dom.CharacterData;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXParseException;

public class Xml {

    private static final XMLInputFactory FACTORY = XMLInputFactory.newFactory();

    static {
        FACTORY.setProperty(XMLInputFactory.IS_COALESCING, true);
        FACTORY.setProperty(XMLInputFactory.SUPPORT_DTD, false);
        FACTORY.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
    }

    // The fields below carry Neo4j's @Context annotation; per the Neo4j procedure API they
    // are expected to be populated by the framework rather than by this class.

    // APOC configuration handle.
    @Context
    public ApocConfig apocConfig;

    // Neo4j transaction handle.
    @Context
    public Transaction tx;

    // Neo4j logger.
    @Context
    public Log log;

    // handleNode calls check() on this guard for every node it visits.
    @Context
    public TerminationGuard terminationGuard;

    // Handed to FileUtils.inputStreamFor when xmlXpathToMapResult opens the XML source.
    @Context
    public URLAccessChecker urlAccessChecker;

    /**
     * Procedure {@code apoc.load.xml}: reads XML from a URL or binary value, evaluates an
     * XPath expression against it and returns one {@link MapResult} per converted node.
     *
     * @param urlOrBinary the XML source; forwarded unchanged to {@code FileUtils.inputStreamFor}
     * @param path XPath expression selecting the nodes to convert (an empty value is treated as {@code "/"})
     * @param config optional settings; the keys read are {@code "failOnError"}, {@code "headers"}
     *     and the compression key ({@code CompressionConfig.COMPRESSION}); {@code null} is
     *     treated as an empty map
     * @param simpleMode forwarded to the node conversion; it changes the shape of the produced
     *     map (NOTE(review): exact effect is defined in handleNode - confirm there)
     * @return the converted nodes, or a single empty-map result when loading fails and
     *     {@code failOnError} is {@code false}
     * @throws Exception if loading or parsing fails and {@code failOnError} is {@code true}
     */
    @Procedure("apoc.load.xml")
    @Description("Loads a single nested `MAP` from an XML URL (e.g. web-API).")
    public Stream<MapResult> xml(
            @Name("urlOrBinary") Object urlOrBinary,
            @Name(value = "path", defaultValue = "/") String path,
            @Name(value = "config", defaultValue = "{}") Map<String, Object> config,
            @Name(value = "simple", defaultValue = "false") boolean simpleMode)
            throws Exception {
        return xmlXpathToMapResult(urlOrBinary, simpleMode, path, config);
    }

    /**
     * User function {@code apoc.xml.parse}: parses an XML string and returns the map built
     * for the first node matched by {@code path}.
     *
     * @param data the XML document as text; must not be {@code null}
     * @param path XPath expression selecting the node to convert (an empty value is treated as {@code "/"})
     * @param config optional settings; only {@code "failOnError"} (default {@code true}) is read;
     *     {@code null} is treated as an empty map
     * @param simpleMode forwarded to the node conversion; it changes the shape of the produced
     *     map (NOTE(review): exact effect is defined in handleNode - confirm there)
     * @return the map of the first matched node, or {@code null} when nothing matched
     * @throws Exception if parsing fails and {@code failOnError} is {@code true}
     */
    @UserFunction("apoc.xml.parse")
    @Description("Parses the given XML `STRING` as a `MAP`.")
    public Map<String, Object> parse(
            @Name("data") String data,
            @Name(value = "path", defaultValue = "/") String path,
            @Name(value = "config", defaultValue = "{}") Map<String, Object> config,
            @Name(value = "simple", defaultValue = "false") boolean simpleMode)
            throws Exception {
        if (config == null) {
            config = Collections.emptyMap();
        }
        boolean failOnError = (boolean) config.getOrDefault("failOnError", true);
        InputStream xmlBytes = new ByteArrayInputStream(data.getBytes(Charset.forName("UTF-8")));
        // Bind to a typed stream so the lambda below can read MapResult.value.
        @SuppressWarnings("unchecked")
        Stream<MapResult> parsed = parse(xmlBytes, simpleMode, path, failOnError);
        return parsed.map(mr -> mr.value).findFirst().orElse(null);
    }

    /**
     * Opens the XML source described by {@code urlOrBinary}, parses it and converts the nodes
     * matched by {@code path}.
     *
     * <p>The input stream opened here is closed before this method returns. That is safe
     * because {@code parse} collects all results into a list before handing back a stream.
     *
     * @param urlOrBinary the XML source; forwarded unchanged to {@code FileUtils.inputStreamFor}
     * @param simpleMode forwarded to the node conversion
     * @param path XPath expression selecting the nodes to convert
     * @param config optional settings ({@code "failOnError"}, {@code "headers"} and the
     *     compression key); {@code null} is treated as an empty map
     * @return the converted nodes, or a single empty-map result when anything fails and
     *     {@code failOnError} is {@code false}
     * @throws Exception if opening or parsing fails and {@code failOnError} is {@code true}
     */
    private Stream<MapResult> xmlXpathToMapResult(
            Object urlOrBinary, boolean simpleMode, String path, Map<String, Object> config) throws Exception {
        if (config == null) {
            config = Collections.emptyMap();
        }
        boolean failOnError = (boolean) config.getOrDefault("failOnError", true);
        try {
            // Config lookups stay inside the try so a badly typed value honours failOnError too.
            @SuppressWarnings("unchecked")
            Map<String, Object> headers =
                    (Map<String, Object>) config.getOrDefault("headers", Collections.emptyMap());
            String compression = (String) config.getOrDefault(COMPRESSION, CompressionAlgo.NONE.name());
            try (CountingInputStream is =
                    FileUtils.inputStreamFor(urlOrBinary, headers, null, compression, urlAccessChecker)) {
                @SuppressWarnings("unchecked")
                Stream<MapResult> parsed = parse(is, simpleMode, path, failOnError);
                return parsed;
            }
        } catch (Exception e) {
            if (!failOnError) {
                return Stream.of(new MapResult(Collections.emptyMap()));
            }
            throw e;
        }
    }

    private Stream parse(InputStream data, boolean simpleMode, String path, boolean failOnError)
            throws Exception {
        List result = new ArrayList<>();
        try {
            DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
            documentBuilderFactory.setNamespaceAware(true);
            documentBuilderFactory.setIgnoringElementContentWhitespace(true);
            documentBuilderFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
            DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
            documentBuilder.setEntityResolver((publicId, systemId) -> new InputSource(new StringReader("")));

            Document doc = documentBuilder.parse(data);
            XPathFactory xPathFactory = XPathFactory.newInstance();

            XPath xPath = xPathFactory.newXPath();

            path = StringUtils.isEmpty(path) ? "/" : path;
            XPathExpression xPathExpression = xPath.compile(path);
            NodeList nodeList = (NodeList) xPathExpression.evaluate(doc, XPathConstants.NODESET);

            for (int i = 0; i < nodeList.getLength(); i++) {
                final Deque> stack = new LinkedList<>();

                handleNode(stack, nodeList.item(i), simpleMode);
                for (int index = 0; index < stack.size(); index++) {
                    result.add(new MapResult(stack.pollFirst()));
                }
            }
        } catch (FileNotFoundException e) {
            if (!failOnError) return Stream.of(new MapResult(Collections.emptyMap()));
            else throw e;
        } catch (Exception e) {
            if (!failOnError) return Stream.of(new MapResult(Collections.emptyMap()));
            else if (e instanceof SAXParseException && e.getMessage().contains("DOCTYPE is disallowed"))
                throw generateXmlDoctypeException();
            else throw e;
        }
        return result.stream();
    }

    /**
     * Opens the XML source and wraps it in a StAX reader created by the shared {@code FACTORY}.
     *
     * <p>A {@code String} is treated as a URL: it is first passed through
     * {@code FileUtils.changeFileUrlIfImportDirectoryConstrained} and then opened with
     * {@code getStreamConnection}. A {@code byte[]} is read through
     * {@code getInputStreamFromBinary} using the compression algorithm from {@code config}.
     * In both cases the stream is wrapped by {@code toLimitedIStream}. When
     * {@code config.isFilterLeadingWhitespace()} is set, the stream is additionally wrapped in
     * a {@code SkipWhitespaceInputStream}.
     *
     * @param urlOrBinary a {@code String} URL or a {@code byte[]} holding the XML
     * @param config import settings (compression algorithm, leading-whitespace filtering)
     * @param urlAccessChecker checker handed to the URL helpers
     * @return a reader positioned on the opened stream
     * @throws RuntimeException if {@code urlOrBinary} is neither a {@code String} nor a {@code byte[]}
     * @throws XMLStreamException if the reader cannot be created; the opened stream is closed first
     */
    private XMLStreamReader getXMLStreamReader(
            Object urlOrBinary, XmlImportConfig config, URLAccessChecker urlAccessChecker)
            throws IOException, XMLStreamException, URISyntaxException, URLAccessValidationError {
        InputStream inputStream;
        if (urlOrBinary instanceof String) {
            String url = (String) urlOrBinary;
            url = FileUtils.changeFileUrlIfImportDirectoryConstrained(url, urlAccessChecker);
            StreamConnection streamConnection = getStreamConnection(url, null, null, urlAccessChecker);
            inputStream = toLimitedIStream(streamConnection.getInputStream(), streamConnection.getLength());
        } else if (urlOrBinary instanceof byte[]) {
            inputStream = toLimitedIStream(
                    getInputStreamFromBinary((byte[]) urlOrBinary, config.getCompressionAlgo()),
                    ((byte[]) urlOrBinary).length);
        } else {
            throw new RuntimeException(ERROR_BYTES_OR_STRING);
        }
        if (config.isFilterLeadingWhitespace()) {
            inputStream = new SkipWhitespaceInputStream(inputStream);
        }
        try {
            return FACTORY.createXMLStreamReader(inputStream);
        } catch (XMLStreamException | RuntimeException e) {
            // No reader is handed back on failure, so the caller has nothing to close:
            // release the stream here and keep the original failure as the primary error.
            try {
                inputStream.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    private void handleNode(Deque> stack, Node node, boolean simpleMode) {
        terminationGuard.check();

        // Handle document node
        if (node.getNodeType() == Node.DOCUMENT_NODE) {
            NodeList children = node.getChildNodes();
            for (int i = 0; i < children.getLength(); i++) {
                if (children.item(i).getLocalName() != null) {
                    handleNode(stack, children.item(i), simpleMode);
                    return;
                }
            }
        }

        Map elementMap = new LinkedHashMap<>();
        handleTypeAndAttributes(node, elementMap);

        // Set children
        NodeList children = node.getChildNodes();
        int count = 0;
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);

            // This is to deal with text between xml tags for example new line characters
            if (child.getNodeType() != Node.TEXT_NODE && child.getNodeType() != Node.CDATA_SECTION_NODE) {
                handleNode(stack, child, simpleMode);
                count++;
            } else {
                // Deal with text nodes
                handleTextNode(child, elementMap);
            }
        }

        if (children.getLength() > 0) {
            if (!stack.isEmpty()) {
                List