// All downloads are free. Search and download functionality uses the official Maven repository.
//
// com.marklogic.flux.impl.reprocess.ReprocessCommand (Maven / Gradle / Ivy)
//
// There is a newer version: 1.0.0.ea1
// Show newest version
/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */
package com.marklogic.flux.impl.reprocess;

import com.marklogic.flux.api.FluxException;
import com.marklogic.flux.api.Reprocessor;
import com.marklogic.flux.impl.AbstractCommand;
import com.marklogic.flux.impl.OptionsUtil;
import com.marklogic.spark.Options;
import org.apache.spark.sql.*;
import picocli.CommandLine;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Command registered with picocli under the name {@code reprocess}. It reads data from MarkLogic using the
 * options held by {@link ReadParams} and hands what was read to custom code selected by {@link WriteParams}.
 * <p>
 * NOTE(review): {@code AbstractCommand} is referenced without type arguments below; if it is a generic class,
 * its type argument was probably lost when this listing was extracted - confirm against the upstream source.
 */
@CommandLine.Command(
    name = "reprocess",
    description = "Read any data from MarkLogic via custom code and reprocess it via custom code."
)
public class ReprocessCommand extends AbstractCommand implements Reprocessor {

    /** Options for the reading side; filled in by picocli as a mixin or through {@code from(...)}. */
    @CommandLine.Mixin
    protected ReadParams readParams = new ReadParams();

    /** Options for the writing side; filled in by picocli as a mixin or through {@code to(...)}. */
    @CommandLine.Mixin
    protected WriteParams writeParams = new WriteParams();

    /**
     * Checks the read, partition and write settings by delegating to {@link ReadParams} and
     * {@link WriteParams}. The checks run in that order, so the first violated rule determines which
     * {@code FluxException} is thrown.
     */
    @Override
    protected void validateDuringApiUsage() {
        readParams.validateReader();
        readParams.validatePartitionReader();
        writeParams.validateWriter();
    }

    /**
     * Configures the given reader for the MarkLogic connector, applying the connection options first and the
     * read options second, and then loads the data.
     *
     * @param session the active Spark session; not used by this implementation
     * @param reader  the reader to configure
     * @return the rows returned by {@code load()}
     */
    @Override
    protected Dataset<Row> loadDataset(SparkSession session, DataFrameReader reader) {
        return reader.format(MARKLOGIC_CONNECTOR)
            .options(getConnectionParams().makeOptions())
            .options(readParams.get())
            .load();
    }

    /**
     * Configures the given writer for the MarkLogic connector, applying the connection options first and the
     * write options second, and then saves in {@code SaveMode.Append} mode.
     * <p>
     * NOTE(review): {@code DataFrameWriter} appears here as a raw type; the type argument was probably lost
     * when this listing was extracted. Restore it only together with the superclass declaration, since a
     * parameter type must match the overridden method - confirm against AbstractCommand.
     *
     * @param session the active Spark session; not used by this implementation
     * @param writer  the writer to configure and run
     */
    @Override
    protected void applyWriter(SparkSession session, DataFrameWriter writer) {
        writer.format(MARKLOGIC_CONNECTOR)
            .options(getConnectionParams().makeOptions())
            .options(writeParams.get())
            .mode(SaveMode.Append)
            .save();
    }

    /**
     * Runs the superclass validation and then the reprocess-specific checks on which read, partition and
     * write options were matched on the command line. The checks run in the order shown, so the first
     * failing one determines the error that is reported.
     *
     * @param parseResult the picocli parse result whose subcommand holds the matched options
     */
    @Override
    public void validateCommandLineOptions(CommandLine.ParseResult parseResult) {
        super.validateCommandLineOptions(parseResult);
        validateReadParams(parseResult);
        validatePartitionParams(parseResult);
        validateWriteParams(parseResult);
    }

    /**
     * Requires that exactly one of the options naming the code used for reading was given.
     *
     * @param parseResult the picocli parse result to inspect
     * @throws FluxException if none or more than one of the read options was matched
     */
    private void validateReadParams(CommandLine.ParseResult parseResult) {
        final String[] readCodeOptions = {
            "--read-invoke", "--read-javascript", "--read-xquery", "--read-javascript-file", "--read-xquery-file"
        };
        final int matched = getCountOfNonNullParams(parseResult, readCodeOptions);
        if (matched == 1) {
            return;
        }
        throw new FluxException(makeErrorMessage("Must specify one of ", readCodeOptions));
    }

    /**
     * Allows at most one of the options that define partitions for the reading code; giving none is valid.
     *
     * @param parseResult the picocli parse result to inspect
     * @throws FluxException if more than one partition option was matched
     */
    private void validatePartitionParams(CommandLine.ParseResult parseResult) {
        final String[] partitionOptions = {
            "--read-partitions-invoke", "--read-partitions-javascript", "--read-partitions-xquery",
            "--read-partitions-javascript-file", "--read-partitions-xquery-file"
        };
        final int matched = getCountOfNonNullParams(parseResult, partitionOptions);
        if (matched <= 1) {
            return;
        }
        throw new FluxException(makeErrorMessage("Can only specify one of ", partitionOptions));
    }

    /**
     * Requires that exactly one of the options naming the code used for writing was given.
     *
     * @param parseResult the picocli parse result to inspect
     * @throws FluxException if none or more than one of the write options was matched
     */
    private void validateWriteParams(CommandLine.ParseResult parseResult) {
        final String[] writeCodeOptions = {
            "--write-invoke", "--write-javascript", "--write-xquery", "--write-javascript-file", "--write-xquery-file"
        };
        final int matched = getCountOfNonNullParams(parseResult, writeCodeOptions);
        if (matched == 1) {
            return;
        }
        throw new FluxException(makeErrorMessage("Must specify one of ", writeCodeOptions));
    }

    /**
     * Builds a message of the form {@code <preamble>a, b, or c.} from the given option names.
     *
     * @param preamble text placed before the list; callers supply any trailing space themselves
     * @param args     option names to list; at least one is required
     * @return the preamble followed by the names separated by ", ", with "or" before the last name and a
     * closing period
     * @throws IllegalArgumentException if no option names are given
     */
    private String makeErrorMessage(String preamble, String... args) {
        if (args.length == 0) {
            throw new IllegalArgumentException("At least one option name is required to build the error message");
        }
        final int lastIndex = args.length - 1;
        if (lastIndex == 0) {
            // A single name needs no separator; joining an empty prefix would yield "<preamble>, or x.".
            return preamble + args[0] + ".";
        }
        final String leadingNames = String.join(", ", Arrays.copyOfRange(args, 0, lastIndex));
        return preamble + leadingNames + ", or " + args[lastIndex] + ".";
    }

    /**
     * Counts how many of the given option names were matched on the parsed subcommand.
     *
     * @param parseResult the picocli parse result whose subcommand is inspected
     * @param options     the option names to look for
     * @return the number of names for which {@code hasMatchedOption} is true
     */
    private int getCountOfNonNullParams(CommandLine.ParseResult parseResult, String... options) {
        return (int) Arrays.stream(options)
            .filter(name -> parseResult.subcommand().hasMatchedOption(name))
            .count();
    }

    public static class ReadParams implements Supplier>, ReadOptions {

        @CommandLine.Option(
            names = {"--read-invoke"},
            description = "The path to a module to invoke for reading data; the module must be in your application’s modules database."
        )
        private String readInvoke;

        @CommandLine.Option(
            names = {"--read-javascript"},
            description = "JavaScript code to execute for reading data."
        )
        private String readJavascript;

        @CommandLine.Option(
            names = {"--read-javascript-file"},
            description = "Local file containing JavaScript code to execute for reading data."
        )
        private String readJavascriptFile;

        @CommandLine.Option(
            names = {"--read-xquery"},
            description = "XQuery code to execute for reading data."
        )
        private String readXquery;

        @CommandLine.Option(
            names = {"--read-xquery-file"},
            description = "Local file containing XQuery code to execute for reading data."
        )
        private String readXqueryFile;

        @CommandLine.Option(
            names = {"--read-partitions-invoke"},
            description = "The path to a module to invoke to define partitions that are sent to your custom code for reading; the module must be in your application’s modules database."
        )
        private String readPartitionsInvoke;

        @CommandLine.Option(
            names = {"--read-partitions-javascript"},
            description = "JavaScript code to execute to define partitions that are sent to your custom code for reading."
        )
        private String readPartitionsJavascript;

        @CommandLine.Option(
            names = {"--read-partitions-javascript-file"},
            description = "Local file containing JavaScript code to execute to define partitions that are sent to your custom code for reading."
        )
        private String readPartitionsJavascriptFile;

        @CommandLine.Option(
            names = {"--read-partitions-xquery"},
            description = "XQuery code to execute to define partitions that are sent to your custom code for reading."
        )
        private String readPartitionsXquery;

        @CommandLine.Option(
            names = {"--read-partitions-xquery-file"},
            description = "Local file containing XQuery code to execute to define partitions that are sent to your custom code for reading."
        )
        private String readPartitionsXqueryFile;

        @CommandLine.Option(
            names = "--read-var", arity = "*",
            description = "Define variables to be sent to the code for reading data; e.g. '--read-var var1=value1'."
        )
        private List readVars = new ArrayList<>();

        @CommandLine.Option(
            names = "--log-read-progress",
            description = "Log a count of total items read every time this many items are read."
        )
        private int progressInterval = 10000;

        public void validateReader() {
            Map options = get();
            if (Stream.of(Options.READ_INVOKE, Options.READ_JAVASCRIPT, Options.READ_JAVASCRIPT_FILE,
                Options.READ_XQUERY, Options.READ_XQUERY_FILE).noneMatch(options::containsKey)) {
                throw new FluxException("Must specify either JavaScript code, XQuery code, or an invokable module for reading from MarkLogic");
            }
        }

        public void validatePartitionReader() {
            Map options = get();
            long count = Stream.of(Options.READ_PARTITIONS_JAVASCRIPT, Options.READ_PARTITIONS_JAVASCRIPT_FILE,
                    Options.READ_PARTITIONS_XQUERY, Options.READ_PARTITIONS_XQUERY_FILE, Options.READ_PARTITIONS_INVOKE)
                .filter(options::containsKey).count();
            if (count > 1) {
                throw new FluxException("Can only specify one approach for defining partitions that are sent to the code for reading from MarkLogic");
            }
        }

        @Override
        public Map get() {
            Map options = OptionsUtil.makeOptions(
                Options.READ_INVOKE, readInvoke,
                Options.READ_JAVASCRIPT, readJavascript,
                Options.READ_JAVASCRIPT_FILE, readJavascriptFile,
                Options.READ_XQUERY, readXquery,
                Options.READ_XQUERY_FILE, readXqueryFile,
                Options.READ_PARTITIONS_INVOKE, readPartitionsInvoke,
                Options.READ_PARTITIONS_JAVASCRIPT, readPartitionsJavascript,
                Options.READ_PARTITIONS_JAVASCRIPT_FILE, readPartitionsJavascriptFile,
                Options.READ_PARTITIONS_XQUERY, readPartitionsXquery,
                Options.READ_PARTITIONS_XQUERY_FILE, readPartitionsXqueryFile,
                Options.READ_LOG_PROGRESS, OptionsUtil.intOption(progressInterval)
            );

            if (readVars != null) {
                readVars.forEach(readVar -> {
                    int pos = readVar.indexOf("=");
                    if (pos < 0) {
                        throw new IllegalArgumentException("Value of --read-var argument must be 'varName=varValue'; invalid value: " + readVar);
                    }
                    options.put(Options.READ_VARS_PREFIX + readVar.substring(0, pos), readVar.substring(pos + 1));
                });
            }

            return options;
        }

        @Override
        public ReadOptions invoke(String modulePath) {
            this.readInvoke = modulePath;
            return this;
        }

        @Override
        public ReadOptions javascript(String query) {
            this.readJavascript = query;
            return this;
        }

        @Override
        public ReadOptions javascriptFile(String path) {
            this.readJavascriptFile = path;
            return this;
        }

        @Override
        public ReadOptions xquery(String query) {
            this.readXquery = query;
            return this;
        }

        @Override
        public ReadOptions xqueryFile(String path) {
            this.readXqueryFile = path;
            return this;
        }

        @Override
        public ReadOptions partitionsInvoke(String modulePath) {
            this.readPartitionsInvoke = modulePath;
            return this;
        }

        @Override
        public ReadOptions partitionsJavascript(String query) {
            this.readPartitionsJavascript = query;
            return this;
        }

        @Override
        public ReadOptions partitionsJavascriptFile(String path) {
            this.readPartitionsJavascriptFile = path;
            return this;
        }

        @Override
        public ReadOptions partitionsXquery(String query) {
            this.readPartitionsXquery = query;
            return this;
        }

        @Override
        public ReadOptions partitionsXqueryFile(String path) {
            this.readPartitionsXqueryFile = path;
            return this;
        }

        @Override
        public ReadOptions vars(Map namesAndValues) {
            this.readVars = namesAndValues.entrySet().stream()
                .map(entry -> entry.getKey() + "=" + entry.getValue())
                .collect(Collectors.toList());
            return this;
        }

        @Override
        public ReadOptions logProgress(int interval) {
            this.progressInterval = interval;
            return this;
        }
    }

    public static class WriteParams implements Supplier>, WriteOptions {

        @CommandLine.Option(
            names = {"--write-invoke"},
            description = "The path to a module to invoke for writing data; the module must be in your application’s modules database."
        )
        private String writeInvoke;

        @CommandLine.Option(
            names = {"--write-javascript"},
            description = "JavaScript code to execute for writing data."
        )
        private String writeJavascript;

        @CommandLine.Option(
            names = {"--write-javascript-file"},
            description = "Local file containing JavaScript code to execute for writing data."
        )
        private String writeJavascriptFile;

        @CommandLine.Option(
            names = {"--write-xquery"},
            description = "XQuery code to execute for writing data."
        )
        private String writeXquery;

        @CommandLine.Option(
            names = {"--write-xquery-file"},
            description = "Local file containing XQuery code to execute for writing data."
        )
        private String writeXqueryFile;

        @CommandLine.Option(
            names = {"--external-variable-name"},
            description = "Name of the external variable in the custom code for writing that will be populated with each value read from MarkLogic."
        )
        private String externalVariableName = "URI";

        @CommandLine.Option(
            names = {"--external-variable-delimiter"},
            description = "Delimiter used when multiple values are included in the external variable in the code for writing."
        )
        private String externalVariableDelimiter = ",";

        @CommandLine.Option(
            names = "--write-var", arity = "*",
            description = "Define variables to be sent to the code for writing data; e.g. '--write-var var1=value1'."
        )
        private List writeVars = new ArrayList<>();

        @CommandLine.Option(
            names = "--abort-on-write-failure",
            description = "Causes the command to fail when a batch of documents cannot be written to MarkLogic."
        )
        private boolean abortOnWriteFailure;

        @CommandLine.Option(
            names = "--batch-size",
            description = "The number of values sent to the code for writing data in a single call."
        )
        private int batchSize = 1;

        @CommandLine.Option(
            names = "--log-progress",
            description = "Log a count of total items processed every time this many items are processed."
        )
        private int progressInterval = 10000;

        public void validateWriter() {
            Map options = get();
            if (Stream.of(Options.WRITE_INVOKE, Options.WRITE_JAVASCRIPT, Options.WRITE_JAVASCRIPT_FILE,
                Options.WRITE_XQUERY, Options.WRITE_XQUERY_FILE).noneMatch(options::containsKey)) {
                throw new FluxException("Must specify either JavaScript code, XQuery code, or an invokable module for writing to MarkLogic");
            }
        }

        @Override
        public Map get() {
            Map options = OptionsUtil.makeOptions(
                Options.WRITE_INVOKE, writeInvoke,
                Options.WRITE_JAVASCRIPT, writeJavascript,
                Options.WRITE_JAVASCRIPT_FILE, writeJavascriptFile,
                Options.WRITE_XQUERY, writeXquery,
                Options.WRITE_XQUERY_FILE, writeXqueryFile,
                Options.WRITE_EXTERNAL_VARIABLE_NAME, externalVariableName,
                Options.WRITE_EXTERNAL_VARIABLE_DELIMITER, externalVariableDelimiter,
                Options.WRITE_ABORT_ON_FAILURE, abortOnWriteFailure ? "true" : "false",
                Options.WRITE_BATCH_SIZE, OptionsUtil.intOption(batchSize),
                Options.WRITE_LOG_PROGRESS, OptionsUtil.intOption(progressInterval)
            );

            if (writeVars != null) {
                writeVars.forEach(writeVar -> {
                    int pos = writeVar.indexOf("=");
                    if (pos < 0) {
                        throw new IllegalArgumentException("Value of --write-var argument must be 'varName=varValue'; invalid value: " + writeVar);
                    }
                    options.put(Options.WRITE_VARS_PREFIX + writeVar.substring(0, pos), writeVar.substring(pos + 1));
                });
            }

            return options;
        }

        @Override
        public WriteOptions invoke(String modulePath) {
            this.writeInvoke = modulePath;
            return this;
        }

        @Override
        public WriteOptions javascript(String query) {
            this.writeJavascript = query;
            return this;
        }

        @Override
        public WriteOptions javascriptFile(String path) {
            this.writeJavascriptFile = path;
            return this;
        }

        @Override
        public WriteOptions xquery(String query) {
            this.writeXquery = query;
            return this;
        }

        @Override
        public WriteOptions xqueryFile(String path) {
            this.writeXqueryFile = path;
            return this;
        }

        @Override
        public WriteOptions externalVariableName(String name) {
            this.externalVariableName = name;
            return this;
        }

        @Override
        public WriteOptions externalVariableDelimiter(String delimiter) {
            this.externalVariableDelimiter = delimiter;
            return this;
        }

        @Override
        public WriteOptions vars(Map namesAndValues) {
            this.writeVars = namesAndValues.entrySet().stream()
                .map(entry -> entry.getKey() + "=" + entry.getValue())
                .collect(Collectors.toList());
            return this;
        }

        @Override
        public WriteOptions abortOnWriteFailure(boolean value) {
            this.abortOnWriteFailure = value;
            return this;
        }

        @Override
        public WriteOptions batchSize(int batchSize) {
            this.batchSize = batchSize;
            return this;
        }

        @Override
        public WriteOptions logProgress(int interval) {
            this.progressInterval = interval;
            return this;
        }
    }

    /**
     * Lets the caller configure the reading side by passing this command's read options to the consumer.
     *
     * @param consumer receives the {@code ReadOptions} to configure
     * @return this command, for chaining
     */
    @Override
    public Reprocessor from(Consumer<ReadOptions> consumer) {
        consumer.accept(readParams);
        return this;
    }

    /**
     * Lets the caller configure the writing side by passing this command's write options to the consumer.
     *
     * @param consumer receives the {@code WriteOptions} to configure
     * @return this command, for chaining
     */
    @Override
    public Reprocessor to(Consumer<WriteOptions> consumer) {
        consumer.accept(writeParams);
        return this;
    }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy