/*
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
*/

package com.marklogic.flux.impl.export;

import com.marklogic.flux.api.CompressionType;
import com.marklogic.flux.api.FluxException;
import com.marklogic.flux.api.GenericFilesExporter;
import com.marklogic.flux.api.ReadDocumentsOptions;
import com.marklogic.flux.impl.AbstractCommand;
import com.marklogic.flux.impl.OptionsUtil;
import com.marklogic.flux.impl.S3Params;
import com.marklogic.spark.Options;
import org.apache.spark.sql.*;
import picocli.CommandLine;

import java.util.Map;
import java.util.function.Consumer;
import java.util.function.Supplier;
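
/**
 * Implements the "export-files" command, which reads documents from MarkLogic via the
 * MarkLogic Spark connector and writes each one as a file to a local filesystem, HDFS, or S3.
 * Usable both from the Flux CLI and, via the {@link GenericFilesExporter} interface, from the
 * Flux API.
 *
 * <p>A sketch of a CLI invocation; aside from "export-files" and "--streaming", the option
 * names shown are assumptions inferred from this class's parameters, so consult the Flux
 * documentation for the authoritative list:</p>
 *
 * <pre>
 * ./bin/flux export-files \
 *     --connection-string "user:password@localhost:8000" \
 *     --collections example \
 *     --path /tmp/export
 * </pre>
 */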
@CommandLine.Command(
    name = "export-files",
    description = "Read documents from MarkLogic and write them to a local filesystem, HDFS, or S3."
)
public class ExportFilesCommand extends AbstractCommand implements GenericFilesExporter {

    @CommandLine.Mixin
    private ReadDocumentParamsImpl readParams = new ReadDocumentParamsImpl();

    @CommandLine.Mixin
    protected WriteGenericFilesParams writeParams = new WriteGenericFilesParams();

    @CommandLine.Option(
        names = "--streaming",
        description = "Causes documents to be read from MarkLogic and streamed to the file source. Intended for " +
            "exporting large files that cannot be fully read into memory."
    )
    private boolean streaming;
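
    /**
     * Validation hook for Flux API usage, where picocli parsing is not involved: requires at
     * least one query option for selecting documents and a non-null file path to write to.
     */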
    @Override
    protected void validateDuringApiUsage() {
        readParams.verifyAtLeastOneQueryOptionIsSet("export");
        writeParams.validatePath();
    }
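
    /**
     * Validation hook for CLI usage: in addition to the superclass checks, verifies that at
     * least one of the document query options listed in ReadDocumentParams.REQUIRED_QUERY_OPTIONS
     * was provided.
     */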
    @Override
    public void validateCommandLineOptions(CommandLine.ParseResult parseResult) {
        super.validateCommandLineOptions(parseResult);
        OptionsUtil.verifyHasAtLeastOneOption(parseResult, ReadDocumentParams.REQUIRED_QUERY_OPTIONS);
    }
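
    /**
     * Loads the documents to export as a Spark dataset. When a zip file count has been
     * requested, the dataset is repartitioned to match it, since each partition writer
     * produces its own zip file.
     */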
    @Override
    protected Dataset<Row> loadDataset(SparkSession session, DataFrameReader reader) {
        final int zipFileCount = writeParams.zipFileCount;
        if (zipFileCount > 0) {
            getCommonParams().setRepartition(zipFileCount);
        }
        return reader.format(MARKLOGIC_CONNECTOR)
            .options(getConnectionParams().makeOptions())
            .options(buildReadOptions())
            .load();
    }
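
    /**
     * Writes the dataset via the MarkLogic Spark connector's file writer. Any S3 credentials
     * are first copied into the Hadoop configuration so that S3 paths can be written to.
     */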
    @Override
    protected void applyWriter(SparkSession session, DataFrameWriter<Row> writer) {
        writeParams.s3Params.addToHadoopConfiguration(session.sparkContext().hadoopConfiguration());
        writer.format(MARKLOGIC_CONNECTOR)
            .options(buildWriteOptions())
            // The connector only supports "Append" as Spark defines it, but it will always overwrite files.
            .mode(SaveMode.Append)
            .save(writeParams.path);
    }
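
    /**
     * Builds the options for reading documents from MarkLogic, adding the streaming flag
     * when --streaming was specified.
     */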
    protected final Map<String, String> buildReadOptions() {
        Map<String, String> options = readParams.makeOptions();
        if (this.streaming) {
            options.put(Options.STREAM_FILES, "true");
        }
        return options;
    }
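
    /**
     * Builds the options for writing files. When streaming, the writer also needs the
     * MarkLogic connection options so that it can retrieve each document as it writes it.
     */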
    protected final Map<String, String> buildWriteOptions() {
        Map<String, String> options = writeParams.get();
        if (this.streaming) {
            options.put(Options.STREAM_FILES, "true");
            // Need connection information so that the writer can retrieve documents from MarkLogic.
            options.putAll(getConnectionParams().makeOptions());
        }
        return options;
    }
    public static class WriteGenericFilesParams implements Supplier<Map<String, String>> {

        // The original listing is truncated here. The members below are a minimal sketch
        // reconstructed from how this class is used above (path, s3Params, zipFileCount,
        // validatePath, get); the option names, descriptions, and the
        // Options.WRITE_FILES_COMPRESSION constant are assumptions, not verbatim source.

        @CommandLine.Option(required = true, names = "--path", description = "Path expression for where files should be written.")
        private String path;

        @CommandLine.Mixin
        private S3Params s3Params = new S3Params();

        @CommandLine.Option(names = "--compression", description = "Set to 'ZIP' or 'GZIP' to compress the written files.")
        private CompressionType compression;

        @CommandLine.Option(names = "--zip-file-count", description = "How many zip files to write; the dataset is repartitioned to match.")
        private int zipFileCount;

        public void validatePath() {
            if (path == null || path.trim().isEmpty()) {
                throw new FluxException("Must specify a file path");
            }
        }

        @Override
        public Map<String, String> get() {
            Map<String, String> options = new java.util.HashMap<>();
            if (compression != null) {
                options.put(Options.WRITE_FILES_COMPRESSION, compression.name());
            }
            return options;
        }
    }

    // The GenericFilesExporter API methods implemented by this command (e.g. the "from" and
    // "to" configuration methods that accept a Consumer of ReadDocumentsOptions) are also
    // truncated in the listing and therefore omitted here.
}