All downloads are free. Search and download functionality uses the official Maven repository.

apoc.load.Xml Maven / Gradle / Ivy

There is a newer version: 5.24.0
Show newest version
/*
 * Copyright (c) "Neo4j"
 * Neo4j Sweden AB [http://neo4j.com]
 *
 * This file is part of Neo4j.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package apoc.load;

import static apoc.export.util.LimitedSizeInputStream.toLimitedIStream;
import static apoc.util.CompressionConfig.COMPRESSION;
import static apoc.util.FileUtils.getInputStreamFromBinary;
import static apoc.util.Util.ERROR_BYTES_OR_STRING;
import static apoc.util.Util.getStreamConnection;

import apoc.ApocConfig;
import apoc.export.util.CountingInputStream;
import apoc.generate.config.InvalidConfigException;
import apoc.result.MapResult;
import apoc.result.NodeResult;
import apoc.util.CompressionAlgo;
import apoc.util.CompressionConfig;
import apoc.util.FileUtils;
import apoc.util.StreamConnection;
import java.io.ByteArrayInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.lang3.BooleanUtils;
import org.apache.commons.lang3.StringUtils;
import org.neo4j.graphdb.Label;
import org.neo4j.graphdb.RelationshipType;
import org.neo4j.graphdb.Transaction;
import org.neo4j.graphdb.security.URLAccessChecker;
import org.neo4j.logging.Log;
import org.neo4j.procedure.Context;
import org.neo4j.procedure.Description;
import org.neo4j.procedure.Mode;
import org.neo4j.procedure.Name;
import org.neo4j.procedure.Procedure;
import org.neo4j.procedure.TerminationGuard;
import org.neo4j.procedure.UserFunction;
import org.w3c.dom.CharacterData;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXParseException;

public class Xml {

    /** Shared StAX factory used to create the stream readers for XML import; configured once at class load. */
    private static final XMLInputFactory FACTORY = XMLInputFactory.newFactory();

    static {
        // DTD processing and external entity resolution are switched off.
        FACTORY.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        FACTORY.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        // Adjacent character data sections are reported as one event.
        FACTORY.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
    }

    // Used by getXMLStreamReader to check that reading a given URL is allowed.
    @Context
    public ApocConfig apocConfig;

    // Transaction in which importToGraph and createCharactersNode create their nodes.
    @Context
    public Transaction tx;

    // Receives a warning from importToGraph for XML event types it does not handle.
    @Context
    public Log log;

    // Checked at the start of every handleNode call while a DOM tree is converted.
    @Context
    public TerminationGuard terminationGuard;

    // Passed along whenever a URL is opened (FileUtils.inputStreamFor, getStreamConnection).
    @Context
    public URLAccessChecker urlAccessChecker;

    /**
     * Procedure {@code apoc.load.xml}: reads XML from a URL or binary value and streams the nodes
     * selected by {@code path} as nested maps. All work is delegated to
     * {@code xmlXpathToMapResult}.
     *
     * @param urlOrBinary source of the XML document, handed unchanged to the stream opener
     * @param path XPath expression selecting the nodes to convert; an empty value means {@code "/"}
     * @param config optional settings; {@code failOnError}, {@code headers} and the compression
     *     setting are read from it, {@code null} is treated as an empty map
     * @param simpleMode when {@code true}, children are stored under {@code "_" + localName}
     *     instead of {@code "_children"}
     * @return one {@link MapResult} per converted node
     * @throws Exception if loading or parsing fails and {@code failOnError} is not disabled
     */
    @Procedure("apoc.load.xml")
    @Description("Loads a single nested `MAP` from an XML URL (e.g. web-API).")
    public Stream<MapResult> xml(
            @Name("urlOrBinary") Object urlOrBinary,
            @Name(value = "path", defaultValue = "/") String path,
            @Name(value = "config", defaultValue = "{}") Map<String, Object> config,
            @Name(value = "simple", defaultValue = "false") boolean simpleMode)
            throws Exception {
        return xmlXpathToMapResult(urlOrBinary, simpleMode, path, config);
    }

    /**
     * User function {@code apoc.xml.parse}: parses an XML string and returns the first node
     * selected by {@code path} as a nested map.
     *
     * @param data XML document text; it is encoded as UTF-8 before parsing
     * @param path XPath expression selecting the node to convert; an empty value means {@code "/"}
     * @param config optional settings; only {@code failOnError} (default {@code true}) is read here,
     *     {@code null} is treated as an empty map
     * @param simpleMode when {@code true}, children are stored under {@code "_" + localName}
     *     instead of {@code "_children"}
     * @return the map of the first converted node, or {@code null} when nothing was produced
     * @throws Exception if parsing fails and {@code failOnError} is not disabled
     */
    @UserFunction("apoc.xml.parse")
    @Description("Parses the given XML `STRING` as a `MAP`.")
    public Map<String, Object> parse(
            @Name("data") String data,
            @Name(value = "path", defaultValue = "/") String path,
            @Name(value = "config", defaultValue = "{}") Map<String, Object> config,
            @Name(value = "simple", defaultValue = "false") boolean simpleMode)
            throws Exception {
        if (config == null) config = Collections.emptyMap();
        boolean failOnError = (boolean) config.getOrDefault("failOnError", true);

        InputStream xmlBytes = new ByteArrayInputStream(data.getBytes(Charset.forName("UTF-8")));
        // Typed local so the lambda below sees MapResult rather than Object.
        Stream<MapResult> parsed = parse(xmlBytes, simpleMode, path, failOnError);
        return parsed.map(mr -> mr.value).findFirst().orElse(null);
    }

    /**
     * Opens the XML source, parses it and converts the nodes selected by {@code path}.
     *
     * <p>The input stream is closed before this method returns. That is safe because the private
     * {@code parse} collects all results into a list before handing back a stream over it.
     *
     * @param urlOrBinary source of the XML document, handed unchanged to {@code FileUtils.inputStreamFor}
     * @param simpleMode forwarded to the node conversion
     * @param path XPath expression selecting the nodes to convert
     * @param config optional settings ({@code failOnError}, {@code headers}, compression);
     *     {@code null} is treated as an empty map
     * @return the converted nodes, or a single empty-map result when an error occurred and
     *     {@code failOnError} is {@code false}
     * @throws Exception any failure while opening or parsing, unless {@code failOnError} is {@code false}
     */
    private Stream<MapResult> xmlXpathToMapResult(
            Object urlOrBinary, boolean simpleMode, String path, Map<String, Object> config) throws Exception {
        if (config == null) config = Collections.emptyMap();
        boolean failOnError = (boolean) config.getOrDefault("failOnError", true);

        @SuppressWarnings("unchecked") // headers are supplied by the caller as a map keyed by header name
        Map<String, Object> headers =
                (Map<String, Object>) config.getOrDefault("headers", Collections.emptyMap());
        String compression = (String) config.getOrDefault(COMPRESSION, CompressionAlgo.NONE.name());

        try (CountingInputStream is =
                FileUtils.inputStreamFor(urlOrBinary, headers, null, compression, urlAccessChecker)) {
            return parse(is, simpleMode, path, failOnError);
        } catch (Exception e) {
            if (!failOnError) return Stream.of(new MapResult(Collections.emptyMap()));
            else throw e;
        }
    }

    private Stream parse(InputStream data, boolean simpleMode, String path, boolean failOnError)
            throws Exception {
        List result = new ArrayList<>();
        try {
            DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
            documentBuilderFactory.setNamespaceAware(true);
            documentBuilderFactory.setIgnoringElementContentWhitespace(true);
            documentBuilderFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
            DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
            documentBuilder.setEntityResolver((publicId, systemId) -> new InputSource(new StringReader("")));

            Document doc = documentBuilder.parse(data);
            XPathFactory xPathFactory = XPathFactory.newInstance();

            XPath xPath = xPathFactory.newXPath();

            path = StringUtils.isEmpty(path) ? "/" : path;
            XPathExpression xPathExpression = xPath.compile(path);
            NodeList nodeList = (NodeList) xPathExpression.evaluate(doc, XPathConstants.NODESET);

            for (int i = 0; i < nodeList.getLength(); i++) {
                final Deque> stack = new LinkedList<>();

                handleNode(stack, nodeList.item(i), simpleMode);
                for (int index = 0; index < stack.size(); index++) {
                    result.add(new MapResult(stack.pollFirst()));
                }
            }
        } catch (FileNotFoundException e) {
            if (!failOnError) return Stream.of(new MapResult(Collections.emptyMap()));
            else throw e;
        } catch (Exception e) {
            if (!failOnError) return Stream.of(new MapResult(Collections.emptyMap()));
            else if (e instanceof SAXParseException && e.getMessage().contains("DOCTYPE is disallowed"))
                throw generateXmlDoctypeException();
            else throw e;
        }
        return result.stream();
    }

    /**
     * Opens the XML source and wraps it in a StAX reader created by the shared {@code FACTORY}.
     *
     * <p>A {@code String} is treated as a URL: read access is checked, the URL is passed through
     * {@code FileUtils.changeFileUrlIfImportDirectoryConstrained} and then opened. A
     * {@code byte[]} is read directly using the compression algorithm from {@code config}. Both
     * kinds of stream are wrapped by {@code toLimitedIStream}.
     *
     * @param urlOrBinary a URL string or the raw document bytes
     * @param config import settings (compression algorithm, leading-whitespace filtering)
     * @param urlAccessChecker checker used when validating and opening the URL
     * @return a reader positioned at the start of the document
     * @throws IOException if the source cannot be opened
     * @throws XMLStreamException if the reader cannot be created
     * @throws RuntimeException if {@code urlOrBinary} is neither a {@code String} nor a {@code byte[]}
     */
    private XMLStreamReader getXMLStreamReader(
            Object urlOrBinary, XmlImportConfig config, URLAccessChecker urlAccessChecker)
            throws IOException, XMLStreamException {
        InputStream inputStream;
        if (urlOrBinary instanceof String) {
            String url = (String) urlOrBinary;
            apocConfig.checkReadAllowed(url, urlAccessChecker);
            url = FileUtils.changeFileUrlIfImportDirectoryConstrained(url);
            StreamConnection streamConnection = getStreamConnection(url, null, null, urlAccessChecker);
            inputStream = toLimitedIStream(streamConnection.getInputStream(), streamConnection.getLength());
        } else if (urlOrBinary instanceof byte[]) {
            byte[] binary = (byte[]) urlOrBinary;
            inputStream =
                    toLimitedIStream(getInputStreamFromBinary(binary, config.getCompressionAlgo()), binary.length);
        } else {
            throw new RuntimeException(ERROR_BYTES_OR_STRING);
        }

        try {
            if (config.isFilterLeadingWhitespace()) {
                inputStream = new SkipWhitespaceInputStream(inputStream);
            }
            return FACTORY.createXMLStreamReader(inputStream);
        } catch (Exception e) {
            // The caller never sees the stream when reader creation fails, so release it here.
            try {
                inputStream.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    private void handleNode(Deque> stack, Node node, boolean simpleMode) {
        terminationGuard.check();

        // Handle document node
        if (node.getNodeType() == Node.DOCUMENT_NODE) {
            NodeList children = node.getChildNodes();
            for (int i = 0; i < children.getLength(); i++) {
                if (children.item(i).getLocalName() != null) {
                    handleNode(stack, children.item(i), simpleMode);
                    return;
                }
            }
        }

        Map elementMap = new LinkedHashMap<>();
        handleTypeAndAttributes(node, elementMap);

        // Set children
        NodeList children = node.getChildNodes();
        int count = 0;
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);

            // This is to deal with text between xml tags for example new line characters
            if (child.getNodeType() != Node.TEXT_NODE && child.getNodeType() != Node.CDATA_SECTION_NODE) {
                handleNode(stack, child, simpleMode);
                count++;
            } else {
                // Deal with text nodes
                handleTextNode(child, elementMap);
            }
        }

        if (children.getLength() > 0) {
            if (!stack.isEmpty()) {
                List nodeChildren = new ArrayList<>();
                for (int i = 0; i < count; i++) {
                    nodeChildren.add(stack.pollLast());
                }
                String key = simpleMode ? "_" + node.getLocalName() : "_children";
                Collections.reverse(nodeChildren);
                if (nodeChildren.size() > 0) {
                    // Before adding the children we need to handle mixed text
                    Object text = elementMap.get("_text");
                    if (text instanceof List) {
                        for (Object element : (List) text) {
                            nodeChildren.add(element);
                        }
                        elementMap.remove("_text");
                    }

                    elementMap.put(key, nodeChildren);
                }
            }
        }

        if (!elementMap.isEmpty()) {
            stack.addLast(elementMap);
        }
    }

    /**
     * Copies the node's local name and attributes into {@code elementMap}.
     *
     * <p>The local name, when present, is stored under {@code "_type"}. Each attribute is stored
     * under its node name with its node value; an attribute with the same key as an earlier entry
     * replaces it.
     *
     * @param node DOM node to read from
     * @param elementMap map that receives the entries
     */
    private void handleTypeAndAttributes(Node node, Map<String, Object> elementMap) {
        // Set type
        String localName = node.getLocalName();
        if (localName != null) {
            elementMap.put("_type", localName);
        }

        // Set the attributes; node types without attributes return null here
        NamedNodeMap attributeMap = node.getAttributes();
        if (attributeMap == null) {
            return;
        }
        for (int i = 0; i < attributeMap.getLength(); i++) {
            Node attribute = attributeMap.item(i);
            elementMap.put(attribute.getNodeName(), attribute.getNodeValue());
        }
    }

    /**
     * Handle TEXT nodes and CDATA nodes: normalizes the node's text and records it under
     * {@code "_text"} in {@code elementMap}.
     *
     * <p>The first non-empty segment is stored as a plain string. Every further non-empty segment
     * turns the entry into a list holding all segments in document order. Text that is empty
     * after normalization is ignored.
     *
     * @param node text or CDATA node; any other node type contributes nothing
     * @param elementMap map of the element that owns the text
     */
    private void handleTextNode(Node node, Map<String, Object> elementMap) {
        String text;
        switch (node.getNodeType()) {
            case Node.TEXT_NODE:
                text = normalizeText(node.getNodeValue());
                break;
            case Node.CDATA_SECTION_NODE:
                text = normalizeText(((CharacterData) node).getData());
                break;
            default:
                text = "";
                break;
        }

        if (StringUtils.isEmpty(text)) {
            return;
        }

        Object previousText = elementMap.get("_text");
        if (previousText instanceof List) {
            // Third or later segment: extend the list rather than stringifying it.
            List<Object> segments = new ArrayList<>((List<?>) previousText);
            segments.add(text);
            elementMap.put("_text", segments);
        } else if (previousText != null) {
            // Second segment: switch from a single string to a list.
            elementMap.put("_text", Arrays.asList(previousText.toString(), text));
        } else {
            elementMap.put("_text", text);
        }
    }

    /**
     * Collapses multi-line text into a single line: the text is split on newline characters,
     * every line is trimmed, the lines are joined with a single space and the result is trimmed
     * once more.
     *
     * @param text raw text content
     * @return the normalized text, possibly empty
     */
    private String normalizeText(String text) {
        String[] lines = StringUtils.split(text, "\n");
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < lines.length; i++) {
            if (i > 0) {
                joined.append(' ');
            }
            joined.append(lines[i].trim());
        }
        return joined.toString().trim();
    }

    /**
     * Couples a parent node with the child most recently recorded for it. Equality and hash code
     * are based on the parent only.
     */
    public static class ParentAndChildPair {
        private final org.neo4j.graphdb.Node parent;
        private org.neo4j.graphdb.Node previousChild = null;

        public ParentAndChildPair(org.neo4j.graphdb.Node parent) {
            this.parent = parent;
        }

        /** @return the parent node given at construction */
        public org.neo4j.graphdb.Node getParent() {
            return this.parent;
        }

        /** @return the last child recorded via {@link #setPreviousChild}, or {@code null} if none */
        public org.neo4j.graphdb.Node getPreviousChild() {
            return this.previousChild;
        }

        public void setPreviousChild(org.neo4j.graphdb.Node previousChild) {
            this.previousChild = previousChild;
        }

        @Override
        public int hashCode() {
            return parent.hashCode();
        }

        @Override
        public boolean equals(Object o) {
            if (o == this) {
                return true;
            }
            if (o == null || o.getClass() != getClass()) {
                return false;
            }
            ParentAndChildPair other = (ParentAndChildPair) o;
            return parent.equals(other.parent);
        }
    }

    /**
     * Settings for {@code apoc.import.xml}, read from the procedure's config map.
     *
     * <p>Recognised keys: {@code connectCharacters}, {@code filterLeadingWhitespace},
     * {@code delimiter} (regular expression, default {@code \s}), {@code label} (default
     * {@code XmlCharacters}), {@code relType} (default {@code NE}) and {@code charactersForTag}.
     * Supplying {@code delimiter}, {@code label} or {@code relType} also switches
     * {@code connectCharacters} on.
     */
    private static class XmlImportConfig extends CompressionConfig {

        private boolean connectCharacters;
        private final Pattern delimiter;
        private Label label = Label.label("XmlCharacters");
        private RelationshipType relType = RelationshipType.withName("NE");
        private Map<String, String> charactersForTag = new HashMap<>();
        private final boolean filterLeadingWhitespace;

        /**
         * @param config procedure config; {@code null} is treated as empty for the keys read here
         *     (it is still passed as-is to the superclass constructor)
         * @throws InvalidConfigException if the removed {@code createNextWordRelationships} key is present
         */
        public XmlImportConfig(Map<String, Object> config) {
            super(config);
            if (config == null) {
                config = Collections.emptyMap();
            }
            connectCharacters = BooleanUtils.toBoolean((Boolean) config.get("connectCharacters"));
            filterLeadingWhitespace = BooleanUtils.toBoolean((Boolean) config.get("filterLeadingWhitespace"));

            String _delimiter = (String) config.get("delimiter");
            if (_delimiter != null) {
                connectCharacters = true;
            }
            delimiter = Pattern.compile(_delimiter == null ? "\\s" : _delimiter);

            String _label = (String) config.get("label");
            if (_label != null) {
                label = Label.label(_label);
                connectCharacters = true;
            }

            String _relType = (String) config.get("relType");
            if (_relType != null) {
                relType = RelationshipType.withName(_relType);
                connectCharacters = true;
            }

            @SuppressWarnings("unchecked") // importToGraph reads the values as String
            Map<String, String> _charactersForTag = (Map<String, String>) config.get("charactersForTag");
            if (_charactersForTag != null) {
                charactersForTag = _charactersForTag;
            }

            if (config.containsKey("createNextWordRelationships")) {
                throw new InvalidConfigException(
                        "usage of `createNextWordRelationships` is no longer allowed. Use `{relType:'NEXT_WORD', label:'XmlWord'}` instead.");
            }
        }

        /** @return pattern used to split character data into parts and delimiters */
        public Pattern getDelimiter() {
            return delimiter;
        }

        /** @return label given to nodes created for character data */
        public Label getLabel() {
            return label;
        }

        /** @return relationship type used to chain consecutive character nodes */
        public RelationshipType getRelType() {
            return relType;
        }

        /** @return whether consecutive character nodes are chained with {@link #getRelType()} */
        public boolean isConnectCharacters() {
            return connectCharacters;
        }

        /** @return text to emit as a character node when an element with the given name ends */
        public Map<String, String> getCharactersForTag() {
            return charactersForTag;
        }

        /** @return whether the input is wrapped in a {@code SkipWhitespaceInputStream} */
        public boolean isFilterLeadingWhitespace() {
            return filterLeadingWhitespace;
        }
    }

    /**
     * Mutable bookkeeping for one XML import: the stack of open parents, the most recently
     * created node, the most recently created character node and the running character offset.
     */
    private static class ImportState {
        private static final RelationshipType NEXT = RelationshipType.withName("NEXT");
        private static final RelationshipType IS_CHILD_OF = RelationshipType.withName("IS_CHILD_OF");
        private static final RelationshipType FIRST_CHILD_OF = RelationshipType.withName("FIRST_CHILD_OF");
        private static final RelationshipType NEXT_SIBLING = RelationshipType.withName("NEXT_SIBLING");

        private final Deque<ParentAndChildPair> parents = new ArrayDeque<>();
        private org.neo4j.graphdb.Node last;
        private org.neo4j.graphdb.Node lastWord;
        private int currentCharacterIndex = 0;

        /** @param initialNode node that acts as both the first "last node" and the first "last word" */
        public ImportState(org.neo4j.graphdb.Node initialNode) {
            this.last = initialNode;
            this.lastWord = initialNode;
        }

        /** Makes {@code parentAndChildPair} the current parent. */
        public void push(ParentAndChildPair parentAndChildPair) {
            parents.push(parentAndChildPair);
        }

        public org.neo4j.graphdb.Node getLastWord() {
            return lastWord;
        }

        public void setLastWord(org.neo4j.graphdb.Node lastWord) {
            this.lastWord = lastWord;
        }

        public int getCurrentCharacterIndex() {
            return currentCharacterIndex;
        }

        /** Removes and returns the current parent. */
        public ParentAndChildPair pop() {
            return parents.pop();
        }

        public boolean isEmpty() {
            return parents.isEmpty();
        }

        /**
         * Links {@code thisNode} into the graph: NEXT from the previously created node,
         * IS_CHILD_OF to the current parent, and either FIRST_CHILD_OF to the parent (first
         * child) or NEXT_SIBLING from the parent's previous child. Afterwards {@code thisNode}
         * is both the parent's previous child and the last created node.
         *
         * @param thisNode the node that was just created
         */
        public void updateLast(org.neo4j.graphdb.Node thisNode) {
            ParentAndChildPair parentAndChildPair = parents.peek();
            final org.neo4j.graphdb.Node parent = parentAndChildPair.getParent();
            final org.neo4j.graphdb.Node previousChild = parentAndChildPair.getPreviousChild();

            last.createRelationshipTo(thisNode, NEXT);
            thisNode.createRelationshipTo(parent, IS_CHILD_OF);
            if (previousChild == null) {
                thisNode.createRelationshipTo(parent, FIRST_CHILD_OF);
            } else {
                previousChild.createRelationshipTo(thisNode, NEXT_SIBLING);
            }
            parentAndChildPair.setPreviousChild(thisNode);
            last = thisNode;
        }

        /** Advances the running character offset by {@code length}. */
        public void addCurrentCharacterIndex(int length) {
            currentCharacterIndex += length;
        }
    }

    /**
     * Procedure {@code apoc.import.xml}: streams through the XML document and mirrors its
     * structure as nodes and relationships in the current transaction.
     *
     * <p>A root node labelled {@code XmlDocument} is created first. Elements become
     * {@code XmlTag} nodes (with {@code _name} and one property per attribute), processing
     * instructions become {@code XmlProcessingInstruction} nodes, and character data is split
     * with the configured delimiter into one node per part. Comments and ignorable whitespace are
     * skipped. A DTD event is rejected.
     *
     * @param urlOrBinary a URL string or the raw document bytes
     * @param config import settings, see {@code XmlImportConfig}
     * @return a single result holding the {@code XmlDocument} root node
     * @throws IOException if the source cannot be opened
     * @throws XMLStreamException if the document cannot be read
     * @throws IllegalStateException if parents are left open after the last event
     */
    @Procedure(mode = Mode.WRITE, value = "apoc.import.xml")
    @Description("Imports a graph from the provided XML file.")
    public Stream<NodeResult> importToGraph(
            @Name("urlOrBinary") Object urlOrBinary,
            @Name(value = "config", defaultValue = "{}") Map<String, Object> config)
            throws IOException, XMLStreamException {
        XmlImportConfig importConfig = new XmlImportConfig(config);
        // TODO: make labels, reltypes and magic properties configurable
        final Map<String, String> charactersByTag = importConfig.getCharactersForTag();

        final XMLStreamReader xml = getXMLStreamReader(urlOrBinary, importConfig, urlAccessChecker);
        try {
            // stores parents and their most recent child
            org.neo4j.graphdb.Node root = tx.createNode(Label.label("XmlDocument"));
            setPropertyIfNotNull(root, "_xmlVersion", xml.getVersion());
            setPropertyIfNotNull(root, "_xmlEncoding", xml.getEncoding());
            if (urlOrBinary instanceof String) {
                root.setProperty("url", urlOrBinary);
            }
            ImportState state = new ImportState(root);
            state.push(new ParentAndChildPair(root));

            while (xml.hasNext()) {
                final int eventType = xml.next();

                switch (eventType) {
                    case XMLStreamConstants.DTD:
                        throw generateXmlDoctypeException();

                    case XMLStreamConstants.START_DOCUMENT:
                        // the reader is already at START_DOCUMENT before the first next() - so ignore this one
                        break;

                    case XMLStreamConstants.PROCESSING_INSTRUCTION:
                        org.neo4j.graphdb.Node pi = tx.createNode(Label.label("XmlProcessingInstruction"));
                        pi.setProperty("_piData", xml.getPIData());
                        pi.setProperty("_piTarget", xml.getPITarget());
                        state.updateLast(pi);
                        break;

                    case XMLStreamConstants.START_ELEMENT:
                        final QName qName = xml.getName();
                        final org.neo4j.graphdb.Node tag = tx.createNode(Label.label("XmlTag"));
                        tag.setProperty("_name", qName.getLocalPart());
                        for (int i = 0; i < xml.getAttributeCount(); i++) {
                            tag.setProperty(xml.getAttributeLocalName(i), xml.getAttributeValue(i));
                        }

                        state.updateLast(tag);
                        state.push(new ParentAndChildPair(tag));
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        List<String> words =
                                parseTextIntoPartsAndDelimiters(xml.getText(), importConfig.getDelimiter());
                        for (String currentWord : words) {
                            createCharactersNode(currentWord, state, importConfig);
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        // Optional extra text configured for this tag is added as its last child.
                        String charactersForTag = charactersByTag.get(xml.getName().getLocalPart());
                        if (charactersForTag != null) {
                            createCharactersNode(charactersForTag, state, importConfig);
                        }
                        ParentAndChildPair parent = state.pop();
                        if (parent.getPreviousChild() != null) {
                            parent.getPreviousChild()
                                    .createRelationshipTo(
                                            parent.getParent(), RelationshipType.withName("LAST_CHILD_OF"));
                        }
                        break;

                    case XMLStreamConstants.END_DOCUMENT:
                        state.pop();
                        break;

                    case XMLStreamConstants.COMMENT:
                    case XMLStreamConstants.SPACE:
                        // intentionally do nothing
                        break;
                    default:
                        // The Neo4j Log takes printf-style formats, so "{}" would be printed literally.
                        log.warn("xml file contains a %s type structure - ignoring this.", eventType);
                }
            }
            if (!state.isEmpty()) {
                throw new IllegalStateException("non empty parents, this indicates a bug");
            }
            return Stream.of(new NodeResult(root));
        } finally {
            try {
                // Releases the reader's own resources; per the StAX contract this does not close
                // the underlying input stream.
                xml.close();
            } catch (XMLStreamException ignored) {
                // best-effort cleanup; must not replace the outcome of the import itself
            }
        }
    }

    /**
     * Creates a node for one piece of character data, stores its text together with its start and
     * end offsets, and links it into the graph via {@code state.updateLast}. When character
     * nodes are to be connected, it is additionally chained to the previous character node.
     *
     * @param currentWord text of the node
     * @param state running import state; its character offset is advanced by the text length
     * @param importConfig supplies the label, the relationship type and the connect flag
     */
    private void createCharactersNode(String currentWord, ImportState state, XmlImportConfig importConfig) {
        org.neo4j.graphdb.Node characters = tx.createNode(importConfig.getLabel());
        characters.setProperty("text", currentWord);

        int startIndex = state.getCurrentCharacterIndex();
        characters.setProperty("startIndex", startIndex);
        state.addCurrentCharacterIndex(currentWord.length());
        int endIndex = state.getCurrentCharacterIndex() - 1;
        characters.setProperty("endIndex", endIndex);

        state.updateLast(characters);
        if (!importConfig.isConnectCharacters()) {
            return;
        }
        state.getLastWord().createRelationshipTo(characters, importConfig.getRelType());
        state.setLastWord(characters);
    }

    /**
     * Splits {@code sourceString} into the alternating sequence of text parts and delimiter
     * matches, in order of appearance. Concatenating the returned strings reproduces the input.
     *
     * <p>Empty strings are never returned: empty parts between adjacent delimiters are omitted,
     * and zero-length delimiter matches (possible with patterns such as {@code \s*}) are skipped.
     *
     * @param sourceString text to split
     * @param delimiterPattern pattern whose matches are the delimiters
     * @return parts and delimiters in document order; empty for an empty input
     */
    List<String> parseTextIntoPartsAndDelimiters(String sourceString, Pattern delimiterPattern) {
        Matcher matcher = delimiterPattern.matcher(sourceString);
        List<String> result = new ArrayList<>();

        int prevEndIndex = 0;
        while (matcher.find()) {
            int start = matcher.start();
            int end = matcher.end();
            if (prevEndIndex != start) {
                result.add(sourceString.substring(prevEndIndex, start));
            }
            if (start != end) {
                result.add(sourceString.substring(start, end));
            }
            prevEndIndex = end;
        }
        if (prevEndIndex != sourceString.length()) {
            result.add(sourceString.substring(prevEndIndex));
        }
        return result;
    }

    /**
     * Sets {@code propertyKey} on {@code root} unless {@code value} is {@code null}, in which
     * case the node is left untouched.
     */
    private void setPropertyIfNotNull(org.neo4j.graphdb.Node root, String propertyKey, Object value) {
        if (value == null) {
            return;
        }
        root.setProperty(propertyKey, value);
    }

    /**
     * Builds the exception reported when an XML document contains a DOCTYPE declaration.
     * Callers are expected to throw the returned instance.
     *
     * @return a new exception carrying the user-facing message
     */
    private RuntimeException generateXmlDoctypeException() {
        return new RuntimeException("XML documents with a DOCTYPE are not allowed.");
    }
}