All downloads are free. The search and download features use the official Maven repository.

com.arcadedb.integration.importer.Neo4jImporter Maven / Gradle / Ivy

There is a newer version: 24.11.2
Show newest version
/*
 * Copyright © 2021-present Arcade Data Ltd ([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: 2021-present Arcade Data Ltd ([email protected])
 * SPDX-License-Identifier: Apache-2.0
 */
package com.arcadedb.integration.importer;

import com.arcadedb.Constants;
import com.arcadedb.GlobalConfiguration;
import com.arcadedb.database.Database;
import com.arcadedb.database.DatabaseFactory;
import com.arcadedb.database.Identifiable;
import com.arcadedb.exception.DuplicatedKeyException;
import com.arcadedb.exception.NeedRetryException;
import com.arcadedb.graph.MutableEdge;
import com.arcadedb.graph.MutableVertex;
import com.arcadedb.graph.Vertex;
import com.arcadedb.index.IndexCursor;
import com.arcadedb.index.lsm.LSMTreeIndexAbstract;
import com.arcadedb.schema.Schema;
import com.arcadedb.schema.Type;
import com.arcadedb.schema.VertexType;
import com.arcadedb.serializer.json.JSONArray;
import com.arcadedb.serializer.json.JSONException;
import com.arcadedb.serializer.json.JSONObject;
import com.arcadedb.utility.Callable;
import com.arcadedb.utility.Pair;

import java.io.*;
import java.math.*;
import java.text.*;
import java.util.*;
import java.util.concurrent.atomic.*;
import java.util.stream.*;
import java.util.zip.*;

/**
 * Importer of a Neo4j database exported in JSONL format. To export a Neo4j database follow the instructions in https://neo4j.com/labs/apoc/4.3/export/json/.
 * The resulting file contains one json per line.
 * 
* Neo4j is a registered mark of Neo4j, Inc. * * @author Luca Garulli */ public class Neo4jImporter { protected Database database; protected Callable parsingCallback; protected int indexPageSize = LSMTreeIndexAbstract.DEF_PAGE_SIZE; protected int bucketsPerType = 1; private boolean closeDatabaseAfterImport = true; private InputStream inputStream; private String databasePath; private String inputFile; private boolean overwriteDatabase = false; private Type typeForDecimals = Type.DECIMAL; private final Map totalVerticesByType = new HashMap<>(); private long totalVerticesParsed = 0L; private final Map totalEdgesByType = new HashMap<>(); private long totalEdgesParsed = 0L; private long totalAttributesParsed = 0L; private int batchSize = 10_000; private long beginTimeVerticesCreation; private long beginTimeEdgesCreation; private boolean error = false; private final ImporterContext context; private final Map> schemaProperties = new HashMap<>(); private final static SimpleDateFormat dateTimeISO8601Format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss"); private static final int MAX_RETRIES = 3; private enum PHASE {OFF, CREATE_SCHEMA, CREATE_VERTICES, CREATE_EDGES} public Neo4jImporter(final InputStream inputStream, final String... args) { parseArguments(args); this.inputStream = inputStream; this.context = new ImporterContext(); if (inputStream == null) syntaxError("Input Stream is null"); } public Neo4jImporter(final String... args) { parseArguments(args); this.context = new ImporterContext(); if (inputFile == null) syntaxError("Missing input file. 
Use -f "); } public Neo4jImporter(final Database database, final ImporterContext context) { this.database = database; this.context = context; this.closeDatabaseAfterImport = false; } public static void main(final String[] args) throws IOException { new Neo4jImporter(args).run(); } public void run() throws IOException { if (databasePath == null) log("Checking Neo4j database from file '%s'...", inputFile); else { log("Importing Neo4j database from file '%s' to '%s'", inputFile, databasePath); DatabaseFactory factory = new DatabaseFactory(databasePath); if (factory.exists()) { if (!overwriteDatabase) { error("Database already exists on path '%s'", databasePath); return; } else { database = factory.open(); error("Found existent database at '%s', dropping it and recreate a new one", databasePath); database.drop(); } } // CREATE THE DATABASE database = factory.create(); } try { context.startedOn = System.currentTimeMillis(); // PARSE THE FILE THE 1ST TIME TO CREATE THE SCHEMA AND CACHE EDGES IN RAM log("- Creation of the schema: types, properties and indexes"); syncSchema(); // PARSE THE FILE AGAIN TO CREATE VERTICES log("- Creation of vertices started"); beginTimeVerticesCreation = System.currentTimeMillis(); parseVertices(); // PARSE THE FILE AGAIN TO CREATE EDGES log("- Creation of edges started: creating edges between vertices"); beginTimeEdgesCreation = System.currentTimeMillis(); parseEdges(); final long elapsed = (System.currentTimeMillis() - context.startedOn) / 1000; log("***************************************************************************************************"); log("Import of Neo4j database completed in %,d secs with %,d errors and %,d warnings.", elapsed, context.errors.get(), context.warnings.get()); log("\nSUMMARY\n"); log("- Vertices.............: %,d", totalVerticesParsed); for (final Map.Entry entry : totalVerticesByType.entrySet()) { final String typeName = entry.getKey(); final Long verticesByClass = totalVerticesByType.get(typeName); final 
long entries = verticesByClass != null ? verticesByClass : 0L; final String additional = ""; log("-- %-20s: %,d %s", typeName, entries, additional); } log("- Edges................: %,d", totalEdgesParsed); for (final Map.Entry entry : totalEdgesByType.entrySet()) { final String typeName = entry.getKey(); final Long edgesByClass = totalEdgesByType.get(typeName); final long entries = edgesByClass != null ? edgesByClass : 0L; final String additional = ""; log("-- %-20s: %,d %s", typeName, entries, additional); } log("- Total attributes.....: %,d", totalAttributesParsed); log("***************************************************************************************************"); log(""); log("NOTES:"); if (database != null) log("- you can find your new ArcadeDB database in '" + database.getDatabasePath() + "'"); } finally { if (database != null && closeDatabaseAfterImport) database.close(); } } public boolean isError() { return error; } private void syncSchema() throws IOException { final VertexType rootNodeType = database.getSchema().buildVertexType().withName("Node") .withTotalBuckets(bucketsPerType).withIgnoreIfExists(true).create(); rootNodeType.getOrCreateProperty("id", Type.STRING); rootNodeType.getOrCreateTypeIndex(Schema.INDEX_TYPE.LSM_TREE, true, new String[] { "id" }, indexPageSize); readFile(json -> { switch (json.getString("type")) { case "node": final Pair> labels = typeNameFromLabels(json); if (!database.getSchema().existsType(labels.getFirst())) { final VertexType type = database.getSchema().buildVertexType().withName(labels.getFirst()) .withTotalBuckets(bucketsPerType).withIgnoreIfExists(true).create(); if (labels.getSecond() != null) { for (final String parent : labels.getSecond()) { final VertexType parentType = database.getSchema().getOrCreateVertexType(parent); parentType.addSuperType(rootNodeType); type.addSuperType(parentType); } } else type.addSuperType(rootNodeType); } inferPropertyType(json, labels.getFirst()); break; case "relationship": final 
String edgeLabel = json.has("label") && !json.isNull("label") ? json.getString("label") : null; if (edgeLabel != null) database.getSchema().buildEdgeType().withName(edgeLabel).withTotalBuckets(bucketsPerType).withIgnoreIfExists(true) .create(); inferPropertyType(json, edgeLabel); break; } return null; }); } private void inferPropertyType(final JSONObject json, final String label) { if (!json.has("properties")) return; // TRY TO INFER PROPERTY TYPES final Map typeProperties = schemaProperties.computeIfAbsent(label, k -> new HashMap<>()); final JSONObject properties = json.getJSONObject("properties"); for (final String propName : properties.keySet()) { ++totalAttributesParsed; Type currentType = typeProperties.get(propName); if (currentType == null) { Object propValue = properties.get(propName); if (!propValue.equals(JSONObject.NULL)) { if (propValue instanceof String) { // CHECK IF IT'S A DATE try { dateTimeISO8601Format.parse((String) propValue); currentType = Type.DATETIME; } catch (final ParseException e) { currentType = Type.STRING; } } else { if (propValue instanceof JSONObject) propValue = ((JSONObject) propValue).toMap(); else if (propValue instanceof JSONArray) propValue = ((JSONArray) propValue).toList(); currentType = Type.getTypeByValue(propValue); } typeProperties.put(propName, currentType); } } } } private void parseVertices() throws IOException { final AtomicInteger lineNumber = new AtomicInteger(); readFile(json -> { lineNumber.incrementAndGet(); switch (json.getString("type")) { case "node": context.parsed.incrementAndGet(); ++totalVerticesParsed; if (context.parsed.get() > 0 && context.parsed.get() % 1_000_000 == 0) { final long elapsed = System.currentTimeMillis() - beginTimeVerticesCreation; log("- Status update: created %,d vertices, skipped %,d edges (%,d vertices/sec)", context.createdVertices.get(), context.skippedEdges.get(), (context.createdVertices.get() / elapsed * 1000)); } final Pair> type = typeNameFromLabels(json); if (type == null) { 
log("- found vertex in line %d without labels. Skip it.", lineNumber.get()); context.warnings.incrementAndGet(); return null; } final String typeName = type.getFirst(); final String id = json.getString("id"); try { final MutableVertex vertex = database.newVertex(typeName); if (json.has("properties")) vertex.fromMap(setProperties(json.getJSONObject("properties"), schemaProperties.get(typeName))); vertex.set("id", id); vertex.save(); context.createdVertices.incrementAndGet(); incrementVerticesByType(typeName); } catch (Exception e) { error("- Error on saving vertex with id %s: %s", id, e.getMessage()); context.errors.incrementAndGet(); } break; case "relationship": context.skippedEdges.incrementAndGet(); break; } if (parsingCallback != null) parsingCallback.call(json); return null; }); final long elapsedInSecs = (System.currentTimeMillis() - context.startedOn) / 1000; log("- Creation of vertices completed: created %,d vertices, skipped %,d edges (%,d vertices/sec elapsed=%,d secs)", context.createdVertices.get(), context.skippedEdges.get(), elapsedInSecs > 0 ? (context.createdVertices.get() / elapsedInSecs) : 0, elapsedInSecs); } private void parseEdges() throws IOException { database.begin(); final AtomicInteger lineNumber = new AtomicInteger(); readFile(json -> { lineNumber.incrementAndGet(); switch (json.getString("type")) { case "node": break; case "relationship": context.parsed.incrementAndGet(); ++totalEdgesParsed; if (context.parsed.get() > 0 && context.parsed.get() % 1_000_000 == 0) { final long elapsed = System.currentTimeMillis() - beginTimeEdgesCreation; log("- Status update: created %,d edges %s (%,d edges/sec)", context.createdEdges.get(), totalEdgesByType, (context.createdEdges.get() / elapsed * 1000)); } final String type = json.getString("label"); if (type == null) { log("- found edge in line %d without labels. 
Skip it.", lineNumber.get()); context.warnings.incrementAndGet(); return null; } final JSONObject start = json.getJSONObject("start"); final Pair> startType = typeNameFromLabels(start); final String startId = start.getString("id"); final IndexCursor beginCursor = database.lookupByKey(startType.getFirst(), "id", startId); if (!beginCursor.hasNext()) { log("- cannot create relationship with id '%s'. Vertex id '%s' not found in type '%s'. Skip it.", json.getString("id"), startId, startType.getFirst()); context.warnings.incrementAndGet(); return null; } final Vertex fromVertex = beginCursor.next().asVertex(); final JSONObject end = json.getJSONObject("end"); final Pair> endType = typeNameFromLabels(end); final String endId = end.getString("id"); final IndexCursor endCursor = database.lookupByKey(endType.getFirst(), "id", endId); if (!endCursor.hasNext()) { log("- cannot create relationship with id '%s'. Vertex id '%s' not found for labels. Skip it.", json.getString("id"), endId); context.warnings.incrementAndGet(); return null; } final Identifiable toVertex = endCursor.next(); try { final MutableEdge edge = fromVertex.newEdge(type, toVertex, true); if (json.has("properties")) edge.fromMap(setProperties(json.getJSONObject("properties"), schemaProperties.get(type))); edge.save(); context.createdEdges.incrementAndGet(); incrementEdgesByType(type); } catch (Exception e) { error("- Error on saving edge between %s and %s: %s", fromVertex, toVertex, e.getMessage()); context.errors.incrementAndGet(); } if (context.parsed.get() > 0 && context.parsed.get() % batchSize == 0) { database.commit(); database.begin(); } break; } return null; }); database.commit(); final long elapsedInSecs = (System.currentTimeMillis() - context.startedOn) / 1000; log("- Creation of edged completed: created %,d edges, (%,d edges/sec elapsed=%,d secs)", context.createdEdges.get(), elapsedInSecs > 0 ? 
(context.createdEdges.get() / elapsedInSecs) : 0, elapsedInSecs); } private Map setProperties(final JSONObject properties, final Map typeSchema) { final Map result = new HashMap<>(); for (final String propName : properties.keySet()) { Object propValue = properties.get(propName); if (propValue == JSONObject.NULL) propValue = null; else if (propValue instanceof JSONObject) propValue = setProperties((JSONObject) propValue, null); else if (propValue instanceof JSONArray) propValue = ((JSONArray) propValue).toList(); else if (propValue instanceof String && typeSchema != null && typeSchema.get(propName) == Type.DATETIME) { try { propValue = dateTimeISO8601Format.parse((String) propValue).getTime(); } catch (final ParseException e) { log("Invalid date '%s', ignoring conversion to timestamp and leaving it as string", propValue); context.errors.incrementAndGet(); } } else if (propValue instanceof BigDecimal) { propValue = typeForDecimals.newInstance(propValue); } result.put(propName, propValue); } return result; } public InputStream openInputStream() throws IOException { if (inputStream != null) { inputStream.reset(); return inputStream; } final File file = new File(inputFile); if (!file.exists()) { error = true; throw new IllegalArgumentException("File '" + inputFile + "' not found"); } return file.getName().endsWith("gz") ? 
new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file); } private void readFile(final Callable callback) throws IOException { database.begin(); final InputStream inputStream = openInputStream(); try { final BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, DatabaseFactory.getDefaultCharset())); try { long lineNumberStartOfBatch = 0; final List transactionBuffer = new ArrayList<>(batchSize); String line = null; for (long lineNumber = 0; ; ++lineNumber) { try { line = reader.readLine(); if (line == null) { if (database.isTransactionActive()) database.commit(); transactionBuffer.clear(); break; } transactionBuffer.add(line); executeCallback(callback, line, lineNumber); if (context.parsed.get() > 0 && context.parsed.get() % batchSize == 0 && database.isTransactionActive()) { database.commit(); transactionBuffer.clear(); lineNumberStartOfBatch = lineNumber; database.begin(); } } catch (final NeedRetryException | DuplicatedKeyException e) { log("Transaction commit in error (%s), retrying the last transaction batch max %d times", e.getMessage(), MAX_RETRIES); // RETRY THE TRANSACTION for (int retry = 0; retry < MAX_RETRIES; retry++) { try { if (database.isTransactionActive()) database.rollback(); database.begin(); for (int i = 0; i < transactionBuffer.size(); i++) { line = transactionBuffer.get(i); executeCallback(callback, line, lineNumberStartOfBatch + i); } database.commit(); transactionBuffer.clear(); break; } catch (final NeedRetryException | DuplicatedKeyException e2) { log("Concurrent access to the database, retrying the last transaction batch (retry %d/%d)", retry + 1, MAX_RETRIES); } } } catch (final JSONException e) { log("Error on parsing json on line %d of the input JSONL file. The line will be ignored. 
JSON: %s", lineNumber, line); context.errors.incrementAndGet(); } } } finally { reader.close(); } } finally { inputStream.close(); } } private void executeCallback(final Callable callback, final String line, final long lineNumber) { final JSONObject json = new JSONObject(line); switch (json.getString("type")) { case "node": callback.call(json); break; case "relationship": callback.call(json); break; default: log("Invalid 'type' content on line %d of the input JSONL file. The line will be ignored. JSON: %s", lineNumber, line); context.errors.incrementAndGet(); } } private void incrementVerticesByType(final String label) { final Long vertices = totalVerticesByType.get(label); totalVerticesByType.put(label, vertices == null ? 1 : vertices + 1); } private void incrementEdgesByType(final String label) { final Long edges = totalEdgesByType.get(label); totalEdgesByType.put(label, edges == null ? 1 : edges + 1); } private Pair> typeNameFromLabels(final JSONObject json) { final JSONArray nodeLabels = json.has("labels") && !json.isNull("labels") ? json.getJSONArray("labels") : null; if (nodeLabels != null && !nodeLabels.isEmpty()) { if (nodeLabels.length() > 1) { // MULTI LABEL, CREATE A NEW MIXED TYPE THAT EXTEND ALL THE LABELS BY USING INHERITANCE final List list = nodeLabels.toList().stream().map(String.class::cast).sorted(Comparator.naturalOrder()) .collect(Collectors.toList()); return new Pair<>(String.join("_", list), list); } else return new Pair<>((String) nodeLabels.get(0), null); } return null; } private void log(final String text, final Object... args) { if (args.length == 0) System.out.println(text); else System.out.printf((text) + "%n", args); } private void error(final String text, final Object... 
args) { if (args.length == 0) System.out.println(text); else System.out.println(String.format(text, args)); } private void syntaxError(final String s) { log("Syntax error: " + s); error = true; printHelp(); } private void printHeader() { log(Constants.PRODUCT + " " + Constants.getVersion() + " - Neo4j Importer"); } private void printHelp() { log("Use:"); log("-d : create a database from the Neo4j export"); log("-i : path to the Neo4j export file in JSONL format"); log("-o: overwrite an existent database"); log("-decimalType : use for decimals. can be FLOAT, DOUBLE and DECIMAL. By default decimalType is DECIMAL."); } private void parseArguments(final String... args) { printHeader(); String state = null; for (final String arg : args) { if (arg.equals("-?")) printHelp(); else if (arg.equals("-d")) state = "databasePath"; else if (arg.equals("-i")) state = "inputFile"; else if (arg.equals("-o")) overwriteDatabase = true; else if (arg.equals("-b")) state = "batchSize"; else if (arg.equals("-decimalType")) state = "decimalType"; else if (state != null) { if (state.equals("databasePath")) databasePath = arg; else if (state.equals("inputFile")) inputFile = arg; else if (state.equals("batchSize")) batchSize = Integer.parseInt(arg); else if (state.equals("decimalType")) typeForDecimals = Type.valueOf(arg.toUpperCase(Locale.ENGLISH)); } } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy