Please wait. This can take a few minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download/modify it as often as you want.
apoc.load.Xml Maven / Gradle / Ivy
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [http://neo4j.com]
*
* This file is part of Neo4j.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package apoc.load;
import static apoc.export.util.LimitedSizeInputStream.toLimitedIStream;
import static apoc.util.CompressionConfig.COMPRESSION;
import static apoc.util.FileUtils.getInputStreamFromBinary;
import static apoc.util.Util.ERROR_BYTES_OR_STRING;
import static apoc.util.Util.getStreamConnection;
import apoc.ApocConfig;
import apoc.export.util.CountingInputStream;
import apoc.generate.config.InvalidConfigException;
import apoc.result.MapResult;
import apoc.result.NodeResult;
import apoc.util.CompressionAlgo;
import apoc.util.CompressionConfig;
import apoc.util.FileUtils;
import apoc.util.StreamConnection;
import java.io.ByteArrayInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.lang3.BooleanUtils;
import org.apache.commons.lang3.StringUtils;
import org.neo4j.graphdb.Label;
import org.neo4j.graphdb.RelationshipType;
import org.neo4j.graphdb.Transaction;
import org.neo4j.graphdb.security.URLAccessChecker;
import org.neo4j.graphdb.security.URLAccessValidationError;
import org.neo4j.logging.Log;
import org.neo4j.procedure.Context;
import org.neo4j.procedure.Description;
import org.neo4j.procedure.Mode;
import org.neo4j.procedure.Name;
import org.neo4j.procedure.Procedure;
import org.neo4j.procedure.TerminationGuard;
import org.neo4j.procedure.UserFunction;
import org.w3c.dom.CharacterData;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXParseException;
public class Xml {
// StAX factory shared by all calls; getXMLStreamReader uses it to create the reader for apoc.import.xml.
private static final XMLInputFactory FACTORY = XMLInputFactory.newFactory();
static {
// Report adjacent character data as a single CHARACTERS event.
FACTORY.setProperty(XMLInputFactory.IS_COALESCING, true);
// Turn off DTD support and external entity resolution for documents read through this factory.
FACTORY.setProperty(XMLInputFactory.SUPPORT_DTD, false);
FACTORY.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
}
// The fields below are annotated with @Context; presumably the Neo4j procedure framework
// populates them before a procedure or function of this class runs.
// Not referenced by the code visible in this file.
@Context
public ApocConfig apocConfig;
// Transaction used by the import procedure to create nodes.
@Context
public Transaction tx;
// Logger; used to warn about unexpected XML event types during import.
@Context
public Log log;
// Checked at the start of every handleNode call while converting a DOM tree.
@Context
public TerminationGuard terminationGuard;
// Handed to the file/URL helpers whenever a source is opened.
@Context
public URLAccessChecker urlAccessChecker;
/**
 * Procedure {@code apoc.load.xml}: loads an XML document from a URL or binary value and returns
 * the nodes selected by an XPath expression as nested maps.
 *
 * @param urlOrBinary the document source; forwarded unchanged to {@code xmlXpathToMapResult}
 * @param path XPath expression selecting the nodes to convert (defaults to {@code "/"})
 * @param config optional settings; the keys {@code failOnError}, {@code headers} and the
 *     compression key are read by {@code xmlXpathToMapResult}
 * @param simpleMode when {@code true}, child elements are stored under {@code "_" + elementName}
 *     instead of {@code "_children"}
 * @return a stream of results, one per converted node
 * @throws Exception if loading or parsing fails and {@code failOnError} is not disabled
 */
@Procedure("apoc.load.xml")
@Description("Loads a single nested `MAP` from an XML URL (e.g. web-API).")
public Stream<MapResult> xml(
        @Name("urlOrBinary") Object urlOrBinary,
        @Name(value = "path", defaultValue = "/") String path,
        @Name(value = "config", defaultValue = "{}") Map<String, Object> config,
        @Name(value = "simple", defaultValue = "false") boolean simpleMode)
        throws Exception {
    return xmlXpathToMapResult(urlOrBinary, simpleMode, path, config);
}
/**
 * User function {@code apoc.xml.parse}: parses an XML string and returns the first node selected
 * by the XPath expression as a nested map.
 *
 * @param data the XML text; it is encoded as UTF-8 before parsing (a {@code null} value is not
 *     handled here)
 * @param path XPath expression selecting the node to convert (defaults to {@code "/"})
 * @param config optional settings; only {@code failOnError} (default {@code true}) is read here
 * @param simpleMode when {@code true}, child elements are stored under {@code "_" + elementName}
 *     instead of {@code "_children"}
 * @return the map for the first matching node, or {@code null} when nothing was produced
 * @throws Exception if parsing fails and {@code failOnError} is {@code true}
 */
@UserFunction("apoc.xml.parse")
@Description("Parses the given XML `STRING` as a `MAP`.")
public Map<String, Object> parse(
        @Name("data") String data,
        @Name(value = "path", defaultValue = "/") String path,
        @Name(value = "config", defaultValue = "{}") Map<String, Object> config,
        @Name(value = "simple", defaultValue = "false") boolean simpleMode)
        throws Exception {
    if (config == null) config = Collections.emptyMap();
    boolean failOnError = (boolean) config.getOrDefault("failOnError", true);

    InputStream xmlBytes = new ByteArrayInputStream(data.getBytes(Charset.forName("UTF-8")));
    // Bind to a typed stream so the lambda below sees MapResult rather than Object.
    Stream<MapResult> parsed = parse(xmlBytes, simpleMode, path, failOnError);
    return parsed.map(mr -> mr.value).findFirst().orElse(null);
}
/**
 * Opens the given source and converts the nodes selected by {@code path} into map results.
 *
 * <p>Config keys read here: {@code failOnError} (default {@code true}), {@code headers}
 * (default empty) and the compression key {@code COMPRESSION} (default
 * {@code CompressionAlgo.NONE}).
 *
 * @param urlOrBinary the document source handed to {@code FileUtils.inputStreamFor}
 * @param simpleMode forwarded to the DOM conversion
 * @param path XPath expression selecting the nodes to convert
 * @param config optional settings, may be {@code null}
 * @return the converted nodes, or a single empty-map result when an error occurs and
 *     {@code failOnError} is {@code false}
 * @throws Exception any failure while opening or parsing, when {@code failOnError} is {@code true}
 */
private Stream<MapResult> xmlXpathToMapResult(
        Object urlOrBinary, boolean simpleMode, String path, Map<String, Object> config) throws Exception {
    if (config == null) config = Collections.emptyMap();
    boolean failOnError = (boolean) config.getOrDefault("failOnError", true);
    try {
        @SuppressWarnings("unchecked") // the value comes from user-supplied config
        Map<String, Object> headers =
                (Map<String, Object>) config.getOrDefault("headers", Collections.emptyMap());
        String compression = (String) config.getOrDefault(COMPRESSION, CompressionAlgo.NONE.name());
        // parse() materialises every result before returning, so the stream can be released
        // as soon as it completes instead of being left open.
        try (CountingInputStream is =
                FileUtils.inputStreamFor(urlOrBinary, headers, null, compression, urlAccessChecker)) {
            return parse(is, simpleMode, path, failOnError);
        }
    } catch (Exception e) {
        if (!failOnError) return Stream.of(new MapResult(Collections.emptyMap()));
        throw e;
    }
}
private Stream parse(InputStream data, boolean simpleMode, String path, boolean failOnError)
throws Exception {
List result = new ArrayList<>();
try {
DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
documentBuilderFactory.setNamespaceAware(true);
documentBuilderFactory.setIgnoringElementContentWhitespace(true);
documentBuilderFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
documentBuilder.setEntityResolver((publicId, systemId) -> new InputSource(new StringReader("")));
Document doc = documentBuilder.parse(data);
XPathFactory xPathFactory = XPathFactory.newInstance();
XPath xPath = xPathFactory.newXPath();
path = StringUtils.isEmpty(path) ? "/" : path;
XPathExpression xPathExpression = xPath.compile(path);
NodeList nodeList = (NodeList) xPathExpression.evaluate(doc, XPathConstants.NODESET);
for (int i = 0; i < nodeList.getLength(); i++) {
final Deque> stack = new LinkedList<>();
handleNode(stack, nodeList.item(i), simpleMode);
for (int index = 0; index < stack.size(); index++) {
result.add(new MapResult(stack.pollFirst()));
}
}
} catch (FileNotFoundException e) {
if (!failOnError) return Stream.of(new MapResult(Collections.emptyMap()));
else throw e;
} catch (Exception e) {
if (!failOnError) return Stream.of(new MapResult(Collections.emptyMap()));
else if (e instanceof SAXParseException && e.getMessage().contains("DOCTYPE is disallowed"))
throw generateXmlDoctypeException();
else throw e;
}
return result.stream();
}
/**
 * Opens the import source and wraps it in a StAX reader created by the shared {@code FACTORY}.
 *
 * <p>A {@code String} is treated as a URL and opened through {@code getStreamConnection}; a
 * {@code byte[]} is read through {@code getInputStreamFromBinary} using the configured
 * compression algorithm. Both are wrapped by {@code toLimitedIStream}. When
 * {@code filterLeadingWhitespace} is configured the stream is additionally wrapped in a
 * {@code SkipWhitespaceInputStream}.
 *
 * @param urlOrBinary a {@code String} URL or a {@code byte[]} payload
 * @param config import configuration (compression, whitespace filtering)
 * @param urlAccessChecker checker passed to the URL helpers
 * @return a reader over the source
 * @throws RuntimeException if {@code urlOrBinary} is neither a {@code String} nor a {@code byte[]}
 */
private XMLStreamReader getXMLStreamReader(
        Object urlOrBinary, XmlImportConfig config, URLAccessChecker urlAccessChecker)
        throws IOException, XMLStreamException, URISyntaxException, URLAccessValidationError {
    InputStream inputStream;
    if (urlOrBinary instanceof String) {
        String url = (String) urlOrBinary;
        url = FileUtils.changeFileUrlIfImportDirectoryConstrained(url, urlAccessChecker);
        StreamConnection streamConnection = getStreamConnection(url, null, null, urlAccessChecker);
        inputStream = toLimitedIStream(streamConnection.getInputStream(), streamConnection.getLength());
    } else if (urlOrBinary instanceof byte[]) {
        byte[] binary = (byte[]) urlOrBinary;
        inputStream = toLimitedIStream(
                getInputStreamFromBinary(binary, config.getCompressionAlgo()), binary.length);
    } else {
        throw new RuntimeException(ERROR_BYTES_OR_STRING);
    }

    if (config.isFilterLeadingWhitespace()) {
        inputStream = new SkipWhitespaceInputStream(inputStream);
    }

    try {
        return FACTORY.createXMLStreamReader(inputStream);
    } catch (XMLStreamException | RuntimeException e) {
        // The caller never sees the stream when reader creation fails, so release it here.
        try {
            inputStream.close();
        } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}
private void handleNode(Deque> stack, Node node, boolean simpleMode) {
terminationGuard.check();
// Handle document node
if (node.getNodeType() == Node.DOCUMENT_NODE) {
NodeList children = node.getChildNodes();
for (int i = 0; i < children.getLength(); i++) {
if (children.item(i).getLocalName() != null) {
handleNode(stack, children.item(i), simpleMode);
return;
}
}
}
Map elementMap = new LinkedHashMap<>();
handleTypeAndAttributes(node, elementMap);
// Set children
NodeList children = node.getChildNodes();
int count = 0;
for (int i = 0; i < children.getLength(); i++) {
Node child = children.item(i);
// This is to deal with text between xml tags for example new line characters
if (child.getNodeType() != Node.TEXT_NODE && child.getNodeType() != Node.CDATA_SECTION_NODE) {
handleNode(stack, child, simpleMode);
count++;
} else {
// Deal with text nodes
handleTextNode(child, elementMap);
}
}
if (children.getLength() > 0) {
if (!stack.isEmpty()) {
List nodeChildren = new ArrayList<>();
for (int i = 0; i < count; i++) {
nodeChildren.add(stack.pollLast());
}
String key = simpleMode ? "_" + node.getLocalName() : "_children";
Collections.reverse(nodeChildren);
if (nodeChildren.size() > 0) {
// Before adding the children we need to handle mixed text
Object text = elementMap.get("_text");
if (text instanceof List) {
for (Object element : (List) text) {
nodeChildren.add(element);
}
elementMap.remove("_text");
}
elementMap.put(key, nodeChildren);
}
}
}
if (!elementMap.isEmpty()) {
stack.addLast(elementMap);
}
}
/**
 * Copies the node's type and attributes into {@code elementMap}.
 *
 * <p>The local name, when present, is stored under {@code _type}. Each attribute is stored under
 * its node name with its node value.
 *
 * @param node the DOM node to describe
 * @param elementMap the map that receives the entries
 */
private void handleTypeAndAttributes(Node node, Map<String, Object> elementMap) {
    // Set type
    String localName = node.getLocalName();
    if (localName != null) {
        elementMap.put("_type", localName);
    }

    // Set the attributes
    NamedNodeMap attributeMap = node.getAttributes();
    if (attributeMap != null) {
        for (int i = 0; i < attributeMap.getLength(); i++) {
            Node attribute = attributeMap.item(i);
            elementMap.put(attribute.getNodeName(), attribute.getNodeValue());
        }
    }
}
/**
 * Collects the normalized content of a TEXT or CDATA node under the {@code _text} key.
 *
 * <p>The first non-empty fragment is stored as a plain string. Every further fragment turns the
 * entry into a list, or extends the existing list, so that each fragment stays a separate
 * element. Fragments that are empty after normalization are ignored.
 *
 * @param node the text or CDATA node; other node types contribute nothing
 * @param elementMap the map of the enclosing element
 */
private void handleTextNode(Node node, Map<String, Object> elementMap) {
    final String text;
    switch (node.getNodeType()) {
        case Node.TEXT_NODE:
            text = normalizeText(node.getNodeValue());
            break;
        case Node.CDATA_SECTION_NODE:
            text = normalizeText(((CharacterData) node).getData());
            break;
        default:
            text = "";
            break;
    }

    // If the text is valid ...
    if (StringUtils.isEmpty(text)) {
        return;
    }

    // We check if we have already collected some text previously
    Object previousText = elementMap.get("_text");
    if (previousText == null) {
        elementMap.put("_text", text);
        return;
    }

    List<Object> fragments = new ArrayList<>();
    if (previousText instanceof List) {
        // Keep the earlier fragments as they are; calling toString() on the list would fold
        // them into a single "[a, b]" string.
        fragments.addAll((List<?>) previousText);
    } else {
        fragments.add(previousText.toString());
    }
    fragments.add(text);
    elementMap.put("_text", fragments);
}
/**
 * Collapses multi-line text into a single line.
 *
 * <p>The text is split on newline characters, each piece is trimmed, the pieces are joined with
 * a single space and the joined result is trimmed once more.
 *
 * @param text the raw text content
 * @return the single-line form of {@code text}
 */
private String normalizeText(String text) {
    final String[] lines = StringUtils.split(text, "\n");
    final StringBuilder joined = new StringBuilder();
    for (int idx = 0; idx < lines.length; idx++) {
        if (idx > 0) {
            joined.append(' ');
        }
        joined.append(lines[idx].trim());
    }
    return joined.toString().trim();
}
/**
 * Couples a parent graph node with the child that was attached to it most recently.
 *
 * <p>Two pairs are equal when they are of the same class and their parents are equal; the
 * remembered child plays no part in equality or hashing.
 */
public static class ParentAndChildPair {

    private final org.neo4j.graphdb.Node parent;
    // Stays null until the first child is recorded.
    private org.neo4j.graphdb.Node mostRecentChild = null;

    public ParentAndChildPair(org.neo4j.graphdb.Node parent) {
        this.parent = parent;
    }

    public org.neo4j.graphdb.Node getParent() {
        return this.parent;
    }

    public org.neo4j.graphdb.Node getPreviousChild() {
        return this.mostRecentChild;
    }

    public void setPreviousChild(org.neo4j.graphdb.Node previousChild) {
        this.mostRecentChild = previousChild;
    }

    @Override
    public boolean equals(Object o) {
        if (o == this) {
            return true;
        }
        if (o == null) {
            return false;
        }
        if (o.getClass() != getClass()) {
            return false;
        }
        final ParentAndChildPair other = (ParentAndChildPair) o;
        return this.parent.equals(other.parent);
    }

    @Override
    public int hashCode() {
        return this.parent.hashCode();
    }
}
/**
 * Configuration for {@code apoc.import.xml}, read from the user-supplied config map.
 *
 * <p>Recognised keys:
 * <ul>
 *   <li>{@code connectCharacters}: boolean
 *   <li>{@code filterLeadingWhitespace}: boolean
 *   <li>{@code delimiter}: regular expression, default {@code "\\s"}
 *   <li>{@code label}: default {@code XmlCharacters}
 *   <li>{@code relType}: default {@code NE}
 *   <li>{@code charactersForTag}: map from tag name to text
 * </ul>
 *
 * <p>Supplying {@code delimiter}, {@code label} or {@code relType} also switches
 * {@code connectCharacters} on. The key {@code createNextWordRelationships} is rejected.
 */
private static class XmlImportConfig extends CompressionConfig {

    private final boolean connectCharacters;
    private final Pattern delimiter;
    private final Label label;
    private final RelationshipType relType;
    private final Map<String, String> charactersForTag;
    private final boolean filterLeadingWhitespace;

    /**
     * @param config the user-supplied settings; {@code null} is treated as an empty map
     * @throws InvalidConfigException if {@code createNextWordRelationships} is present
     */
    public XmlImportConfig(Map<String, Object> config) {
        super(config);
        if (config == null) {
            config = Collections.emptyMap();
        }

        boolean connect = BooleanUtils.toBoolean((Boolean) config.get("connectCharacters"));
        filterLeadingWhitespace = BooleanUtils.toBoolean((Boolean) config.get("filterLeadingWhitespace"));

        String delimiterRegex = (String) config.get("delimiter");
        if (delimiterRegex != null) {
            connect = true;
        }
        delimiter = Pattern.compile(delimiterRegex == null ? "\\s" : delimiterRegex);

        String labelName = (String) config.get("label");
        if (labelName != null) {
            connect = true;
        }
        label = Label.label(labelName == null ? "XmlCharacters" : labelName);

        String relTypeName = (String) config.get("relType");
        if (relTypeName != null) {
            connect = true;
        }
        relType = RelationshipType.withName(relTypeName == null ? "NE" : relTypeName);

        @SuppressWarnings("unchecked") // the value comes from user-supplied config
        Map<String, String> configuredCharacters = (Map<String, String>) config.get("charactersForTag");
        if (configuredCharacters != null) {
            charactersForTag = configuredCharacters;
        } else {
            charactersForTag = new HashMap<>();
        }

        connectCharacters = connect;

        if (config.containsKey("createNextWordRelationships")) {
            throw new InvalidConfigException(
                    "usage of `createNextWordRelationships` is no longer allowed. Use `{relType:'NEXT_WORD', label:'XmlWord'}` instead.");
        }
    }

    public Pattern getDelimiter() {
        return delimiter;
    }

    public Label getLabel() {
        return label;
    }

    public RelationshipType getRelType() {
        return relType;
    }

    public boolean isConnectCharacters() {
        return connectCharacters;
    }

    public Map<String, String> getCharactersForTag() {
        return charactersForTag;
    }

    public boolean isFilterLeadingWhitespace() {
        return filterLeadingWhitespace;
    }
}
/**
 * Mutable bookkeeping for one {@code apoc.import.xml} run: the stack of open parents, the most
 * recently created node, the most recent characters node and the running character offset.
 */
private static class ImportState {

    private static final RelationshipType NEXT = RelationshipType.withName("NEXT");
    private static final RelationshipType IS_CHILD_OF = RelationshipType.withName("IS_CHILD_OF");
    private static final RelationshipType FIRST_CHILD_OF = RelationshipType.withName("FIRST_CHILD_OF");
    private static final RelationshipType NEXT_SIBLING = RelationshipType.withName("NEXT_SIBLING");

    // Innermost open parent is at the head of the deque.
    private final Deque<ParentAndChildPair> parents = new ArrayDeque<>();
    private org.neo4j.graphdb.Node last;
    private org.neo4j.graphdb.Node lastWord;
    private int currentCharacterIndex = 0;

    /**
     * @param initialNode the node that both {@code last} and {@code lastWord} start from
     */
    public ImportState(org.neo4j.graphdb.Node initialNode) {
        this.last = initialNode;
        this.lastWord = initialNode;
    }

    public void push(ParentAndChildPair parentAndChildPair) {
        parents.push(parentAndChildPair);
    }

    public org.neo4j.graphdb.Node getLastWord() {
        return lastWord;
    }

    public void setLastWord(org.neo4j.graphdb.Node lastWord) {
        this.lastWord = lastWord;
    }

    public int getCurrentCharacterIndex() {
        return currentCharacterIndex;
    }

    public ParentAndChildPair pop() {
        return parents.pop();
    }

    public boolean isEmpty() {
        return parents.isEmpty();
    }

    /**
     * Links {@code thisNode} into the graph and makes it the most recent node.
     *
     * <p>Creates {@code NEXT} from the previous node and {@code IS_CHILD_OF} to the innermost
     * open parent. It then creates {@code FIRST_CHILD_OF} to that parent when the parent has no
     * child yet, otherwise {@code NEXT_SIBLING} from the parent's previous child.
     *
     * @param thisNode the node that was just created
     */
    public void updateLast(org.neo4j.graphdb.Node thisNode) {
        ParentAndChildPair parentAndChildPair = parents.peek();
        final org.neo4j.graphdb.Node parent = parentAndChildPair.getParent();
        final org.neo4j.graphdb.Node previousChild = parentAndChildPair.getPreviousChild();

        last.createRelationshipTo(thisNode, NEXT);
        thisNode.createRelationshipTo(parent, IS_CHILD_OF);
        if (previousChild == null) {
            thisNode.createRelationshipTo(parent, FIRST_CHILD_OF);
        } else {
            previousChild.createRelationshipTo(thisNode, NEXT_SIBLING);
        }
        parentAndChildPair.setPreviousChild(thisNode);
        last = thisNode;
    }

    public void addCurrentCharacterIndex(int length) {
        currentCharacterIndex += length;
    }
}
/**
 * Procedure {@code apoc.import.xml}: reads an XML document event by event and mirrors it as a
 * graph.
 *
 * <p>An {@code XmlDocument} root node is created first. Each element becomes an {@code XmlTag}
 * node carrying {@code _name} and its attributes. Each processing instruction becomes an
 * {@code XmlProcessingInstruction} node. Character data is split with the configured delimiter
 * and each part becomes a characters node. Nodes are linked through {@code ImportState}; on
 * every closing tag the last child of the element is linked with {@code LAST_CHILD_OF}.
 * Comments and ignorable whitespace are skipped, and a DTD event aborts the import.
 *
 * @param urlOrBinary a {@code String} URL or a {@code byte[]} payload
 * @param config import settings, see {@code XmlImportConfig}
 * @return a single result holding the {@code XmlDocument} root node
 * @throws IllegalStateException if parents are still open after the last event
 */
@Procedure(mode = Mode.WRITE, value = "apoc.import.xml")
@Description("Imports a graph from the provided XML file.")
public Stream<NodeResult> importToGraph(
        @Name("urlOrBinary") Object urlOrBinary,
        @Name(value = "config", defaultValue = "{}") Map<String, Object> config)
        throws IOException, XMLStreamException, URISyntaxException, URLAccessValidationError {
    XmlImportConfig importConfig = new XmlImportConfig(config);
    final Map<String, String> charactersByTag = importConfig.getCharactersForTag();
    // TODO: make labels, reltypes and magic properties configurable

    final XMLStreamReader xml = getXMLStreamReader(urlOrBinary, importConfig, urlAccessChecker);
    try {
        org.neo4j.graphdb.Node root = tx.createNode(Label.label("XmlDocument"));
        setPropertyIfNotNull(root, "_xmlVersion", xml.getVersion());
        setPropertyIfNotNull(root, "_xmlEncoding", xml.getEncoding());
        if (urlOrBinary instanceof String) {
            root.setProperty("url", urlOrBinary);
        }

        // stores parents and their most recent child
        ImportState state = new ImportState(root);
        state.push(new ParentAndChildPair(root));

        while (xml.hasNext()) {
            xml.next();

            switch (xml.getEventType()) {
                case XMLStreamConstants.DTD:
                    throw generateXmlDoctypeException();
                case XMLStreamConstants.START_DOCUMENT:
                    // the reader is positioned at START_DOCUMENT before the first next() call,
                    // so there is nothing to do for it here
                    break;

                case XMLStreamConstants.PROCESSING_INSTRUCTION:
                    org.neo4j.graphdb.Node pi = tx.createNode(Label.label("XmlProcessingInstruction"));
                    pi.setProperty("_piData", xml.getPIData());
                    pi.setProperty("_piTarget", xml.getPITarget());
                    state.updateLast(pi);
                    break;

                case XMLStreamConstants.START_ELEMENT:
                    final QName qName = xml.getName();
                    final org.neo4j.graphdb.Node tag = tx.createNode(Label.label("XmlTag"));
                    tag.setProperty("_name", qName.getLocalPart());
                    for (int i = 0; i < xml.getAttributeCount(); i++) {
                        tag.setProperty(xml.getAttributeLocalName(i), xml.getAttributeValue(i));
                    }
                    state.updateLast(tag);
                    state.push(new ParentAndChildPair(tag));
                    break;

                case XMLStreamConstants.CHARACTERS:
                    List<String> words =
                            parseTextIntoPartsAndDelimiters(xml.getText(), importConfig.getDelimiter());
                    for (String currentWord : words) {
                        createCharactersNode(currentWord, state, importConfig);
                    }
                    break;

                case XMLStreamConstants.END_ELEMENT:
                    // Optional extra characters configured for this tag name.
                    String charactersForTag = charactersByTag.get(xml.getName().getLocalPart());
                    if (charactersForTag != null) {
                        createCharactersNode(charactersForTag, state, importConfig);
                    }
                    ParentAndChildPair parent = state.pop();
                    if (parent.getPreviousChild() != null) {
                        parent.getPreviousChild()
                                .createRelationshipTo(
                                        parent.getParent(), RelationshipType.withName("LAST_CHILD_OF"));
                    }
                    break;

                case XMLStreamConstants.END_DOCUMENT:
                    state.pop();
                    break;

                case XMLStreamConstants.COMMENT:
                case XMLStreamConstants.SPACE:
                    // intentionally do nothing
                    break;
                default:
                    // Neo4j's Log takes printf-style formats; a "{}" placeholder is not substituted.
                    log.warn("xml file contains a %s type structure - ignoring this.", xml.getEventType());
            }
        }
        if (!state.isEmpty()) {
            throw new IllegalStateException("non empty parents, this indicates a bug");
        }
        return Stream.of(new NodeResult(root));
    } finally {
        // Frees the parser's resources; per the StAX contract this does not close the
        // underlying input stream.
        try {
            xml.close();
        } catch (XMLStreamException ignored) {
            // A failure while releasing the parser must not mask the import result or the
            // original error.
        }
    }
}
/**
 * Creates the graph node for one piece of character data and links it into the import state.
 *
 * <p>The node gets the configured label and the properties {@code text}, {@code startIndex} and
 * {@code endIndex}. The running character offset in {@code state} advances by the length of the
 * text. When character connection is enabled, the previous characters node is linked to the new
 * one with the configured relationship type.
 *
 * @param currentWord the text of the node
 * @param state the running import state
 * @param importConfig supplies label, relationship type and the connect flag
 */
private void createCharactersNode(String currentWord, ImportState state, XmlImportConfig importConfig) {
    final org.neo4j.graphdb.Node charactersNode = tx.createNode(importConfig.getLabel());
    charactersNode.setProperty("text", currentWord);

    final int startIndex = state.getCurrentCharacterIndex();
    charactersNode.setProperty("startIndex", startIndex);
    state.addCurrentCharacterIndex(currentWord.length());
    final int endIndex = state.getCurrentCharacterIndex() - 1;
    charactersNode.setProperty("endIndex", endIndex);

    state.updateLast(charactersNode);

    if (!importConfig.isConnectCharacters()) {
        return;
    }
    final org.neo4j.graphdb.Node precedingWord = state.getLastWord();
    precedingWord.createRelationshipTo(charactersNode, importConfig.getRelType());
    state.setLastWord(charactersNode);
}
/**
 * Splits text into alternating parts and delimiters, keeping the delimiters.
 *
 * <p>Every match of {@code delimiterPattern} becomes its own element. Non-empty text before a
 * match and after the last match becomes an element too, so concatenating the result reproduces
 * {@code sourceString}.
 *
 * @param sourceString the text to split
 * @param delimiterPattern the pattern whose matches act as delimiters
 * @return the parts and delimiters in their original order
 */
List<String> parseTextIntoPartsAndDelimiters(String sourceString, Pattern delimiterPattern) {
    Matcher matcher = delimiterPattern.matcher(sourceString);
    List<String> result = new ArrayList<>();
    int prevEndIndex = 0;
    int length = sourceString.length();
    while (matcher.find()) {
        int start = matcher.start();
        int end = matcher.end();
        // Text between the previous delimiter and this one, if any.
        if (prevEndIndex != start) {
            result.add(sourceString.substring(prevEndIndex, start));
        }
        result.add(sourceString.substring(start, end));
        prevEndIndex = end;
    }
    // Trailing text after the last delimiter.
    if (prevEndIndex != length) {
        result.add(sourceString.substring(prevEndIndex, length));
    }
    return result;
}
/**
 * Stores {@code value} under {@code propertyKey} on the node, doing nothing when the value is
 * {@code null}.
 *
 * @param root the node to update
 * @param propertyKey the property name
 * @param value the property value, may be {@code null}
 */
private void setPropertyIfNotNull(org.neo4j.graphdb.Node root, String propertyKey, Object value) {
    if (value == null) {
        return;
    }
    root.setProperty(propertyKey, value);
}
/**
 * Builds the exception reported when an XML document contains a DOCTYPE declaration.
 *
 * <p>The exception is returned rather than thrown, so call sites written as
 * {@code throw generateXmlDoctypeException();} behave as they read.
 *
 * @return the exception for the caller to throw
 */
private RuntimeException generateXmlDoctypeException() {
    return new RuntimeException("XML documents with a DOCTYPE are not allowed.");
}
}