/*
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
*/
package com.marklogic.flux.impl.export;
import com.marklogic.flux.api.ArchiveFilesExporter;
import com.marklogic.flux.impl.AbstractCommand;
import com.marklogic.flux.impl.OptionsUtil;
import com.marklogic.spark.Options;
import org.apache.spark.sql.*;
import picocli.CommandLine;
import java.util.Map;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import java.util.stream.Stream;
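// A minimal sketch of how this command might be invoked from the Flux CLI. The
// connection and query options shown (--connection-string, --collections, --path)
// come from the mixed-in connection and read/write parameter classes and are
// assumed here for illustration; all values are hypothetical.
//
//   ./bin/flux export-archive-files \
//       --connection-string "user:password@localhost:8000" \
//       --collections "example-collection" \
//       --path /tmp/archives \
//       --streaming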
@CommandLine.Command(
name = "export-archive-files",
description = "Read documents and their metadata from MarkLogic and write them to ZIP files on a local filesystem, HDFS, or S3."
)
public class ExportArchiveFilesCommand extends AbstractCommand implements ArchiveFilesExporter {
@CommandLine.Mixin
private ReadArchiveDocumentsParams readParams = new ReadArchiveDocumentsParams();
@CommandLine.Mixin
protected WriteArchiveFilesParams writeParams = new WriteArchiveFilesParams();
@CommandLine.Option(
names = "--streaming",
description = "Causes documents to be streamed from MarkLogic to archive files. Intended for " +
"exporting large documents that cannot be fully read into memory."
)
private boolean streaming;
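    // Set by picocli when --streaming is present; makeReadOptions() and
    // makeWriteOptions() below propagate it to the connector as STREAM_FILES.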
@Override
protected void validateDuringApiUsage() {
writeParams.validatePath();
readParams.verifyAtLeastOneQueryOptionIsSet("export");
}
@Override
public void validateCommandLineOptions(CommandLine.ParseResult parseResult) {
super.validateCommandLineOptions(parseResult);
OptionsUtil.verifyHasAtLeastOneOption(parseResult, ReadDocumentParams.REQUIRED_QUERY_OPTIONS);
}
@Override
    protected Dataset<Row> loadDataset(SparkSession session, DataFrameReader reader) {
final int fileCount = writeParams.getFileCount();
if (fileCount > 0) {
getCommonParams().setRepartition(fileCount);
}
return reader.format(MARKLOGIC_CONNECTOR)
.options(getConnectionParams().makeOptions())
.options(makeReadOptions())
.load();
}
@Override
    protected void applyWriter(SparkSession session, DataFrameWriter<Row> writer) {
writeParams.getS3Params().addToHadoopConfiguration(session.sparkContext().hadoopConfiguration());
writer.format(MARKLOGIC_CONNECTOR)
.options(makeWriteOptions())
.mode(SaveMode.Append)
.save(writeParams.getPath());
}
// Extracted for unit-testing.
    protected final Map<String, String> makeReadOptions() {
        Map<String, String> readOptions = readParams.makeOptions();
if (streaming) {
readOptions.put(Options.STREAM_FILES, "true");
}
return readOptions;
}
// Extracted for unit-testing.
    protected Map<String, String> makeWriteOptions() {
        Map<String, String> writeOptions = writeParams.get();
if (streaming) {
writeOptions.put(Options.STREAM_FILES, "true");
// The writer needs to know what metadata to retrieve when streaming.
writeOptions.put(Options.READ_DOCUMENTS_CATEGORIES, readParams.determineCategories());
}
// Need connection params so writer can read documents and metadata from MarkLogic.
writeOptions.putAll(getConnectionParams().makeOptions());
return writeOptions;
}
    public static class WriteArchiveFilesParams extends WriteFilesParams<WriteArchiveFilesOptions> implements WriteArchiveFilesOptions {
@CommandLine.Option(names = "--encoding", description = "Specify an encoding for writing files.")
private String encoding;
@Override
public WriteArchiveFilesOptions encoding(String encoding) {
this.encoding = encoding;
return this;
}
@Override
        public Map<String, String> get() {
return OptionsUtil.makeOptions(
Options.WRITE_FILES_COMPRESSION, "zip",
Options.WRITE_FILES_ENCODING, encoding
);
}
}
    public static class ReadArchiveDocumentsParams extends ReadDocumentParams<ReadArchiveDocumentOptions> implements ReadArchiveDocumentOptions {
@CommandLine.Option(names = "--categories", description = "Comma-delimited sequence of categories of data to include. " +
"Valid choices are: collections, permissions, quality, properties, and metadatavalues.")
private String categories;
@Override
        public Map<String, String> makeOptions() {
return OptionsUtil.addOptions(super.makeOptions(),
Options.READ_DOCUMENTS_CATEGORIES, determineCategories()
);
}
@Override
public ReadArchiveDocumentOptions categories(String... categories) {
this.categories = Stream.of(categories).collect(Collectors.joining(","));
return this;
}
        /**
         * While the "read documents" operation allows for reading only metadata, that isn't valid for an
         * archive - we always need the content to be returned as well. For example, categories of
         * "collections,quality" yields "content,collections,quality", while no categories yields
         * "content,metadata".
         *
         * @return a comma-delimited categories string that always begins with "content"
         */
private String determineCategories() {
if (categories != null && categories.trim().length() > 0) {
return "content," + categories;
}
return "content,metadata";
}
}
@Override
    public ArchiveFilesExporter from(Consumer<ReadArchiveDocumentOptions> consumer) {
consumer.accept(readParams);
return this;
}
@Override
public ArchiveFilesExporter streaming() {
this.streaming = true;
return this;
}
@Override
    public ArchiveFilesExporter to(Consumer<WriteArchiveFilesOptions> consumer) {
consumer.accept(writeParams);
return this;
}
@Override
public ArchiveFilesExporter to(String path) {
writeParams.path(path);
return this;
}
}
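
// A minimal API usage sketch, assuming the Flux entry point exposes this command
// through a factory method such as Flux.exportArchiveFiles() (method names and
// values here are assumptions for illustration, not the definitive API):
//
//   Flux.exportArchiveFiles()
//       .connectionString("user:password@localhost:8000")
//       .from(options -> options.collections("example-collection"))
//       .to(options -> options.path("/tmp/archives"))
//       .execute();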