com.marklogic.flux.impl.importdata.ImportDelimitedFilesCommand Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of flux-api Show documentation
Show all versions of flux-api Show documentation
Flux API for data movement with MarkLogic
/*
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
*/
package com.marklogic.flux.impl.importdata;
import com.marklogic.flux.api.DelimitedFilesImporter;
import com.marklogic.flux.api.WriteStructuredDocumentsOptions;
import com.marklogic.flux.impl.SparkUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import picocli.CommandLine;
import java.util.HashMap;
import java.util.Map;
import java.util.function.Consumer;
@CommandLine.Command(
name = "import-delimited-files",
description = "Read delimited text files from local, HDFS, and S3 locations using Spark's support " +
"defined at %nhttps://spark.apache.org/docs/latest/sql-data-sources-csv.html, and write JSON or XML documents " +
"to MarkLogic."
)
public class ImportDelimitedFilesCommand extends AbstractImportFilesCommand implements DelimitedFilesImporter {
@CommandLine.Mixin
private ReadDelimitedFilesParams readParams = new ReadDelimitedFilesParams();
@CommandLine.Mixin
private WriteStructuredDocumentParams writeParams = new WriteStructuredDocumentParams();
@Override
protected String getReadFormat() {
return "csv";
}
@Override
protected ReadFilesParams getReadParams() {
return readParams;
}
@Override
protected WriteDocumentParams getWriteParams() {
return writeParams;
}
public static class ReadDelimitedFilesParams extends ReadFilesParams implements ReadDelimitedFilesOptions {
@CommandLine.Option(names = "--delimiter", description = "Specify a delimiter; defaults to a comma.")
private String delimiter;
@CommandLine.Option(names = "--encoding", description = "Specify an encoding when reading files.")
private String encoding;
@CommandLine.Option(
names = "--uri-include-file-path",
description = "If true, each document URI will include the path of the originating file."
)
private boolean uriIncludeFilePath;
@CommandLine.Option(
names = "-P",
description = "Specify any Spark CSV option defined at " +
"%nhttps://spark.apache.org/docs/latest/sql-data-sources-csv.html; e.g. -PquoteAll=true."
)
private Map additionalOptions = new HashMap<>();
@CommandLine.Mixin
private AggregationParams aggregationParams = new AggregationParams();
@Override
public Map makeOptions() {
Map options = super.makeOptions();
options.putIfAbsent("header", "true");
options.putIfAbsent("inferSchema", "true");
if (delimiter != null) {
options.putIfAbsent("sep", delimiter);
}
if (encoding != null) {
options.putIfAbsent("encoding", encoding);
}
options.putAll(additionalOptions);
return options;
}
@Override
public ReadDelimitedFilesOptions delimiter(String delimiter) {
this.delimiter = delimiter;
return this;
}
@Override
public ReadDelimitedFilesOptions encoding(String encoding) {
this.encoding = encoding;
return this;
}
@Override
public ReadDelimitedFilesOptions additionalOptions(Map options) {
this.additionalOptions = options;
return this;
}
@Override
public ReadDelimitedFilesOptions uriIncludeFilePath(boolean value) {
this.uriIncludeFilePath = value;
return this;
}
@Override
public ReadDelimitedFilesOptions groupBy(String columnName) {
aggregationParams.setGroupBy(columnName);
return this;
}
@Override
public ReadDelimitedFilesOptions aggregateColumns(String newColumnName, String... columns) {
aggregationParams.addAggregationExpression(newColumnName, columns);
return this;
}
}
@Override
protected Dataset afterDatasetLoaded(Dataset dataset) {
if (readParams.uriIncludeFilePath) {
dataset = SparkUtil.addFilePathColumn(dataset);
}
return readParams.aggregationParams.applyGroupBy(dataset);
}
@Override
public DelimitedFilesImporter from(Consumer consumer) {
consumer.accept(readParams);
return this;
}
@Override
public DelimitedFilesImporter from(String... paths) {
readParams.paths(paths);
return this;
}
@Override
public DelimitedFilesImporter to(Consumer consumer) {
consumer.accept(writeParams);
return this;
}
}