All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.neo4j.importer.CsvImporter Maven / Gradle / Ivy

There is a newer version: 5.25.1
Show newest version
/*
 * Copyright (c) "Neo4j"
 * Neo4j Sweden AB [https://neo4j.com]
 *
 * This file is part of Neo4j.
 *
 * Neo4j is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.neo4j.importer;

import static java.util.Objects.requireNonNull;
import static org.apache.commons.lang3.exception.ExceptionUtils.indexOfThrowable;
import static org.neo4j.configuration.GraphDatabaseInternalSettings.duplication_user_messages;
import static org.neo4j.configuration.GraphDatabaseSettings.db_temporal_timezone;
import static org.neo4j.configuration.GraphDatabaseSettings.server_logging_config_path;
import static org.neo4j.internal.batchimport.input.Collectors.badCollector;
import static org.neo4j.internal.batchimport.input.Collectors.collect;
import static org.neo4j.internal.batchimport.input.InputEntityDecorators.NO_DECORATOR;
import static org.neo4j.internal.batchimport.input.InputEntityDecorators.additiveLabels;
import static org.neo4j.internal.batchimport.input.InputEntityDecorators.defaultRelationshipType;
import static org.neo4j.internal.batchimport.input.csv.DataFactories.data;
import static org.neo4j.internal.batchimport.input.csv.DataFactories.defaultFormatNodeFileHeader;
import static org.neo4j.internal.batchimport.input.csv.DataFactories.defaultFormatRelationshipFileHeader;
import static org.neo4j.io.ByteUnit.bytesToString;
import static org.neo4j.kernel.impl.scheduler.JobSchedulerFactory.createInitialisedScheduler;
import static org.neo4j.logging.log4j.LogConfig.createLoggerFromXmlConfig;
import static org.neo4j.storageengine.api.TransactionIdStore.BASE_TX_ID;

import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryNotEmptyException;
import java.nio.file.Path;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Supplier;
import org.neo4j.batchimport.api.Configuration;
import org.neo4j.batchimport.api.input.Collector;
import org.neo4j.batchimport.api.input.IdType;
import org.neo4j.batchimport.api.input.Input;
import org.neo4j.configuration.Config;
import org.neo4j.csv.reader.IllegalMultilineFieldException;
import org.neo4j.internal.batchimport.cache.idmapping.string.DuplicateInputIdException;
import org.neo4j.internal.batchimport.input.BadCollector;
import org.neo4j.internal.batchimport.input.InputException;
import org.neo4j.internal.batchimport.input.MissingRelationshipDataException;
import org.neo4j.internal.batchimport.input.csv.CsvInput;
import org.neo4j.internal.batchimport.input.csv.DataFactory;
import org.neo4j.io.fs.DefaultFileSystemAbstraction;
import org.neo4j.io.fs.FileSystemAbstraction;
import org.neo4j.io.layout.DatabaseLayout;
import org.neo4j.io.locker.FileLockException;
import org.neo4j.io.os.OsBeanUtil;
import org.neo4j.io.pagecache.context.CursorContextFactory;
import org.neo4j.io.pagecache.context.FixedVersionContextSupplier;
import org.neo4j.io.pagecache.tracing.PageCacheTracer;
import org.neo4j.kernel.internal.Version;
import org.neo4j.logging.InternalLogProvider;
import org.neo4j.logging.NullLogProvider;
import org.neo4j.logging.internal.PrefixedLogProvider;
import org.neo4j.logging.internal.SimpleLogService;
import org.neo4j.logging.log4j.Log4jLogProvider;
import org.neo4j.memory.EmptyMemoryTracker;
import org.neo4j.memory.MemoryTracker;
import org.neo4j.scheduler.JobScheduler;
import org.neo4j.util.Preconditions;

class CsvImporter {
    static final String DEFAULT_REPORT_FILE_NAME = "import.report";

    private final DatabaseLayout databaseLayout;
    private final Config databaseConfig;
    private final org.neo4j.csv.reader.Configuration csvConfig;
    private final org.neo4j.batchimport.api.Configuration importConfig;
    private final Path reportFile;
    private final IdType idType;
    private final Charset inputEncoding;
    private final boolean ignoreExtraColumns;
    private final boolean skipBadRelationships;
    private final boolean skipDuplicateNodes;
    private final boolean skipBadEntriesLogging;
    private final long badTolerance;
    private final boolean normalizeTypes;
    private final boolean verbose;
    private final boolean autoSkipHeaders;
    private final Map, List> nodeFiles;
    private final Map> relationshipFiles;
    private final FileSystemAbstraction fileSystem;
    private final PrintStream stdOut;
    private final PrintStream stdErr;
    private final PageCacheTracer pageCacheTracer;
    private final CursorContextFactory contextFactory;
    private final MemoryTracker memoryTracker;
    private final boolean force;
    private final ImportCommand.IncrementalStage incrementalStage;
    private final boolean incremental;
    private final InternalLogProvider logProvider;

    private CsvImporter(Builder b) {
        this.databaseLayout = requireNonNull(b.databaseLayout);
        this.databaseConfig = requireNonNull(b.databaseConfig);
        this.csvConfig = requireNonNull(b.csvConfig);
        this.importConfig = requireNonNull(b.importConfig);
        this.reportFile = requireNonNull(b.reportFile);
        this.idType = requireNonNull(b.idType);
        this.inputEncoding = requireNonNull(b.inputEncoding);
        this.ignoreExtraColumns = b.ignoreExtraColumns;
        this.skipBadRelationships = b.skipBadRelationships;
        this.skipDuplicateNodes = b.skipDuplicateNodes;
        this.skipBadEntriesLogging = b.skipBadEntriesLogging;
        this.badTolerance = b.badTolerance;
        this.normalizeTypes = b.normalizeTypes;
        this.verbose = b.verbose;
        this.autoSkipHeaders = b.autoSkipHeaders;
        this.nodeFiles = requireNonNull(b.nodeFiles);
        this.relationshipFiles = requireNonNull(b.relationshipFiles);
        this.fileSystem = requireNonNull(b.fileSystem);
        this.pageCacheTracer = requireNonNull(b.pageCacheTracer);
        this.contextFactory = requireNonNull(b.contextFactory);
        this.memoryTracker = requireNonNull(b.memoryTracker);
        this.stdOut = requireNonNull(b.stdOut);
        this.stdErr = requireNonNull(b.stdErr);
        this.logProvider = requireNonNull(b.logProvider);
        this.force = b.force;
        this.incremental = b.incremental;
        this.incrementalStage = b.incrementalStage;
    }

    void doImport(ImportCommand.Base type) throws IOException {
        if (force) {
            fileSystem.deleteRecursively(
                    databaseLayout.databaseDirectory(), path -> !path.equals(databaseLayout.databaseLockFile()));
            fileSystem.deleteRecursively(databaseLayout.getTransactionLogsDirectory());
        }

        try (OutputStream badOutput = fileSystem.openAsOutputStream(reportFile, false);
                Collector badCollector = getBadCollector(skipBadEntriesLogging, badOutput)) {
            // Extract the default time zone from the database configuration
            ZoneId dbTimeZone = databaseConfig.get(db_temporal_timezone);
            Supplier defaultTimeZone = () -> dbTimeZone;

            final var nodeData = nodeData();
            final var relationshipsData = relationshipData();

            try (CsvInput input = new CsvInput(
                    nodeData,
                    defaultFormatNodeFileHeader(defaultTimeZone, normalizeTypes),
                    relationshipsData,
                    defaultFormatRelationshipFileHeader(defaultTimeZone, normalizeTypes),
                    idType,
                    csvConfig,
                    autoSkipHeaders,
                    new CsvInput.PrintingMonitor(stdOut),
                    memoryTracker)) {
                doImport(input, badCollector, type);
            }
        }
    }

    private void doImport(Input input, Collector badCollector, ImportCommand.Base type) {
        boolean success = false;

        printOverview();

        try (JobScheduler jobScheduler = createInitialisedScheduler()) {
            // Let the storage engine factory be configurable in the tool later on...
            var logService = new SimpleLogService(
                    NullLogProvider.getInstance(),
                    new PrefixedLogProvider(logProvider, databaseLayout.getDatabaseName()),
                    databaseConfig.get(duplication_user_messages));
            type.doImport(
                    fileSystem,
                    databaseLayout,
                    databaseConfig,
                    jobScheduler,
                    logProvider,
                    pageCacheTracer,
                    contextFactory,
                    importConfig,
                    logService,
                    stdOut,
                    stdErr,
                    verbose,
                    badCollector,
                    memoryTracker,
                    input);
            success = true;
        } catch (Exception ex) {
            throw andPrintError(databaseLayout.getDatabaseName(), ex, incremental, stdErr);
        } finally {
            long numberOfBadEntries = badCollector.badEntries();
            if (badTolerance != BadCollector.UNLIMITED_TOLERANCE && numberOfBadEntries > badTolerance) {
                stdOut.println("Neo4j-admin aborted the import because " + numberOfBadEntries + " bad entries were "
                        + "found, which exceeds the set fault tolerance ("
                        + badTolerance + "). Import is optimized to import fault-free data.");
                stdOut.println();
                if (skipBadEntriesLogging) {
                    stdOut.println(
                            "Bad entry logging is disabled, enable it using --skip-bad-entries-logging=false" + ".");
                } else {
                    stdOut.println("Bad entries were logged to " + reportFile.toAbsolutePath() + ".");
                }
                stdOut.println();
                stdOut.println("We recommend that data should be cleaned before importing. The fault-tolerance can be "
                        + "increased using --bad-tolerance=, however this will dramatically affect the tool’s"
                        + " performance.");
                stdOut.println();
            }
            if (!success) {
                stdErr.println("WARNING Import failed. The store files in "
                        + databaseLayout.databaseDirectory().toAbsolutePath()
                        + " are left as they are, although they are likely in an unusable state. "
                        + "Starting a database on these store files will likely fail or observe inconsistent records so "
                        + "start at your own risk or delete the store manually.");
                stdOut.println();
            }
        }
    }

    /**
     * Method name looks strange, but look at how it's used and you'll see why it's named like that.
     *
     * @param databaseName the name of the database to receive the import data
     * @param e            the error that occurred
     * @param incremental  whether the import is incremental
     * @param err          the error output stream
     */
    private static RuntimeException andPrintError(
            String databaseName, Exception e, boolean incremental, PrintStream err) {
        // List of common errors that can be explained to the user
        if (DuplicateInputIdException.class.equals(e.getClass())) {
            err.println("Duplicate input ids that would otherwise clash can be put into separate id space.");
        } else if (MissingRelationshipDataException.class.equals(e.getClass())) {
            err.println("Relationship missing mandatory field");
        } else if (DirectoryNotEmptyException.class.equals(e.getClass())) {
            err.println(
                    "Database already exist. Re-run with `--overwrite-destination` to remove the database prior to import");
        } else if (FileLockException.class.equals(e.getClass())) {
            String string =
                    "%s can only be run against a database which is offline. The current state of database '%s' is online."
                            .formatted(incremental ? "Incremental import" : "Import", databaseName);
            err.println(string);
        }
        // This type of exception is wrapped since our input code throws InputException consistently,
        // and so IllegalMultilineFieldException comes from the csv component, which has no access to InputException
        // therefore it's wrapped.
        else if (indexOfThrowable(e, IllegalMultilineFieldException.class) != -1) {
            err.println("Detected field which spanned multiple lines for an import where "
                    + "--multiline-fields=false. If you know that your input data "
                    + "include fields containing new-line characters then import with this option set to "
                    + "true.");
        } else if (indexOfThrowable(e, InputException.class) != -1) {
            err.println("Error in input data");
        }
        err.println();

        return new CsvImportException(e); // throw in order to have process exit with !0
    }

    static class CsvImportException extends RuntimeException {
        CsvImportException(Throwable cause) {
            super(cause);
        }
    }

    private void printOverview() {
        stdOut.println("Neo4j version: " + Version.getNeo4jVersion());
        stdOut.println("Importing the contents of these files into " + databaseLayout.databaseDirectory() + ":");
        if (incrementalStage != null) {
            stdOut.println("Import mode: " + incrementalStage);
        }
        printInputFiles("Nodes", nodeFiles, stdOut);
        printInputFiles("Relationships", relationshipFiles, stdOut);
        stdOut.println();
        stdOut.println("Available resources:");
        printIndented("Total machine memory: " + bytesToString(OsBeanUtil.getTotalPhysicalMemory()), stdOut);
        printIndented("Free machine memory: " + bytesToString(OsBeanUtil.getFreePhysicalMemory()), stdOut);
        printIndented("Max heap memory : " + bytesToString(Runtime.getRuntime().maxMemory()), stdOut);
        printIndented("Max worker threads: " + importConfig.maxNumberOfWorkerThreads(), stdOut);
        printIndented("Configured max memory: " + bytesToString(importConfig.maxOffHeapMemory()), stdOut);
        printIndented("High parallel IO: " + importConfig.highIO(), stdOut);
        stdOut.println();
    }

    private static void printInputFiles(String name, Map> inputFiles, PrintStream out) {
        if (inputFiles.isEmpty()) {
            return;
        }

        out.println(name + ":");

        inputFiles.forEach((k, files) -> {
            if (!isEmptyKey(k)) {
                printIndented(k + ":", out);
            }

            for (Path[] arr : files) {
                for (final Path file : arr) {
                    printIndented(file, out);
                }
            }
            out.println();
        });
    }

    private static boolean isEmptyKey(Object k) {
        if (k instanceof String) {
            return ((String) k).isEmpty();
        } else if (k instanceof Set) {
            return ((Set) k).isEmpty();
        }
        return false;
    }

    private static void printIndented(Object value, PrintStream out) {
        out.println("  " + value);
    }

    private Iterable relationshipData() {
        final var result = new ArrayList();
        relationshipFiles.forEach((defaultTypeName, fileSets) -> {
            final var decorator = defaultRelationshipType(defaultTypeName);
            for (Path[] files : fileSets) {
                final var data = data(decorator, inputEncoding, files);
                result.add(data);
            }
        });
        return result;
    }

    private Iterable nodeData() {
        final var result = new ArrayList();
        nodeFiles.forEach((labels, fileSets) -> {
            final var decorator = labels.isEmpty() ? NO_DECORATOR : additiveLabels(labels.toArray(new String[0]));
            for (Path[] files : fileSets) {
                final var data = data(decorator, inputEncoding, files);
                result.add(data);
            }
        });
        return result;
    }

    private Collector getBadCollector(boolean skipBadEntriesLogging, OutputStream badOutput) {
        return badCollector(
                badOutput,
                badTolerance,
                collect(skipBadRelationships, skipDuplicateNodes, ignoreExtraColumns),
                skipBadEntriesLogging);
    }

    static InternalLogProvider createLogProvider(FileSystemAbstraction fileSystem, Config databaseConfig) {
        return new Log4jLogProvider(createLoggerFromXmlConfig(
                fileSystem,
                databaseConfig.get(server_logging_config_path),
                !databaseConfig.isExplicitlySet(server_logging_config_path),
                databaseConfig::configStringLookup));
    }

    static Builder builder() {
        return new Builder();
    }

    static class Builder {
        private DatabaseLayout databaseLayout;
        private Config databaseConfig;
        private org.neo4j.csv.reader.Configuration csvConfig = org.neo4j.csv.reader.Configuration.COMMAS;
        private Configuration importConfig = Configuration.DEFAULT;
        private Path reportFile;
        private IdType idType = IdType.STRING;
        private Charset inputEncoding = StandardCharsets.UTF_8;
        private boolean ignoreExtraColumns;
        private boolean skipBadRelationships;
        private boolean skipDuplicateNodes;
        private boolean skipBadEntriesLogging;
        private long badTolerance;
        private boolean normalizeTypes;
        private boolean verbose;
        private boolean autoSkipHeaders;
        private final Map, List> nodeFiles = new HashMap<>();
        private final Map> relationshipFiles = new HashMap<>();
        private FileSystemAbstraction fileSystem = new DefaultFileSystemAbstraction();
        private PageCacheTracer pageCacheTracer = PageCacheTracer.NULL;
        private CursorContextFactory contextFactory =
                new CursorContextFactory(pageCacheTracer, new FixedVersionContextSupplier(BASE_TX_ID));
        private MemoryTracker memoryTracker = EmptyMemoryTracker.INSTANCE;
        private PrintStream stdOut = System.out;
        private PrintStream stdErr = System.err;
        private boolean force;
        private boolean incremental = false;
        private ImportCommand.IncrementalStage incrementalStage = null;
        private InternalLogProvider logProvider = NullLogProvider.getInstance();

        Builder withDatabaseLayout(DatabaseLayout databaseLayout) {
            this.databaseLayout = databaseLayout;
            return this;
        }

        Builder withDatabaseConfig(Config databaseConfig) {
            this.databaseConfig = databaseConfig;
            return this;
        }

        Builder withCsvConfig(org.neo4j.csv.reader.Configuration csvConfig) {
            this.csvConfig = csvConfig;
            return this;
        }

        Builder withImportConfig(Configuration importConfig) {
            this.importConfig = importConfig;
            return this;
        }

        Builder withReportFile(Path reportFile) {
            this.reportFile = reportFile;
            return this;
        }

        Builder withIdType(IdType idType) {
            this.idType = idType;
            return this;
        }

        Builder withInputEncoding(Charset inputEncoding) {
            this.inputEncoding = inputEncoding;
            return this;
        }

        Builder withIgnoreExtraColumns(boolean ignoreExtraColumns) {
            this.ignoreExtraColumns = ignoreExtraColumns;
            return this;
        }

        Builder withSkipBadRelationships(boolean skipBadRelationships) {
            this.skipBadRelationships = skipBadRelationships;
            return this;
        }

        Builder withSkipDuplicateNodes(boolean skipDuplicateNodes) {
            this.skipDuplicateNodes = skipDuplicateNodes;
            return this;
        }

        Builder withSkipBadEntriesLogging(boolean skipBadEntriesLogging) {
            this.skipBadEntriesLogging = skipBadEntriesLogging;
            return this;
        }

        Builder withBadTolerance(long badTolerance) {
            this.badTolerance = badTolerance;
            return this;
        }

        Builder withNormalizeTypes(boolean normalizeTypes) {
            this.normalizeTypes = normalizeTypes;
            return this;
        }

        Builder withVerbose(boolean verbose) {
            this.verbose = verbose;
            return this;
        }

        Builder withAutoSkipHeaders(boolean autoSkipHeaders) {
            this.autoSkipHeaders = autoSkipHeaders;
            return this;
        }

        Builder addNodeFiles(Set labels, Path[] files) {
            final var list = nodeFiles.computeIfAbsent(labels, unused -> new ArrayList<>());
            list.add(files);
            return this;
        }

        Builder addRelationshipFiles(String defaultRelType, Path[] files) {
            final var list = relationshipFiles.computeIfAbsent(defaultRelType, unused -> new ArrayList<>());
            list.add(files);
            return this;
        }

        Builder withFileSystem(FileSystemAbstraction fileSystem) {
            this.fileSystem = fileSystem;
            return this;
        }

        Builder withPageCacheTracer(PageCacheTracer pageCacheTracer) {
            this.pageCacheTracer = pageCacheTracer;
            return this;
        }

        Builder withCursorContextFactory(CursorContextFactory contextFactory) {
            this.contextFactory = contextFactory;
            return this;
        }

        Builder withMemoryTracker(MemoryTracker memoryTracker) {
            this.memoryTracker = memoryTracker;
            return this;
        }

        Builder withStdOut(PrintStream stdOut) {
            this.stdOut = stdOut;
            return this;
        }

        Builder withStdErr(PrintStream stdErr) {
            this.stdErr = stdErr;
            return this;
        }

        Builder withForce(boolean force) {
            this.force = force;
            return this;
        }

        Builder withIncremental(boolean incremental) {
            this.incremental = incremental;
            return this;
        }

        Builder withIncrementalStage(ImportCommand.IncrementalStage mode) {
            this.incrementalStage = mode;
            return this;
        }

        Builder withLogProvider(InternalLogProvider logProvider) {
            this.logProvider = logProvider;
            return this;
        }

        CsvImporter build() {
            Preconditions.checkState(
                    !(force && incremental),
                    "--overwrite-destination doesn't work with incremental import",
                    incrementalStage);
            return new CsvImporter(this);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy