/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */
package com.marklogic.flux.impl.export;

import com.marklogic.flux.api.Executor;
import com.marklogic.flux.api.WriteFilesOptions;
import com.marklogic.flux.impl.AbstractCommand;
import com.marklogic.flux.impl.SparkUtil;
import com.marklogic.spark.Util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.sql.*;
import picocli.CommandLine;

/**
 * Support class for concrete commands that run an Optic DSL query to read rows and then write them to one or
 * more files. Each subclass is expected to use a Spark data source for writing rows in a tabular file format.
 */
abstract class AbstractExportRowsToFilesCommand extends AbstractCommand {

    @CommandLine.Mixin
    protected final ReadRowsParams readParams = new ReadRowsParams();

    // Sonar complains about the use of the ? wildcard; not sure yet how to "fix" it, so ignoring.
    @SuppressWarnings("java:S1452")
    protected abstract WriteStructuredFilesParams<? extends WriteFilesOptions> getWriteFilesParams();

    /**
     * @return the name of the Spark data format (e.g. "parquet" or "csv") to use for the output files.
     */
    protected abstract String getWriteFormat();

    @Override
    protected void validateDuringApiUsage() {
        getWriteFilesParams().validatePath();
    }

    @Override
    protected Dataset<Row> loadDataset(SparkSession session, DataFrameReader reader) {
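        // Spark writes one file per partition, so a user-requested file count is honored by
        // repartitioning the dataset to that number of partitions.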
        final Integer fileCount = getWriteFilesParams().getFileCount();
        if (fileCount != null && fileCount > 0) {
            getCommonParams().setRepartition(fileCount);
        }
        return reader.format(MARKLOGIC_CONNECTOR)
            .options(getConnectionParams().makeOptions())
            .options(readParams.makeOptions())
            .load();
    }

    @Override
    protected void applyWriter(SparkSession session, DataFrameWriter<Row> writer) {
        WriteStructuredFilesParams<? extends WriteFilesOptions> writeParams = getWriteFilesParams();

        Configuration hadoopConf = session.sparkContext().hadoopConfiguration();
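        // Apply any user-provided S3 settings to the Hadoop configuration so that Spark can
        // write the exported files to S3 paths (e.g. "s3a://").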
        writeParams.getS3Params().addToHadoopConfiguration(hadoopConf);
        disableWriteChecksum(hadoopConf);

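        // Write using the format chosen by the subclass, the user's file-writing options, and the
        // user's save mode, translated to Spark's SaveMode.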
        writer.format(getWriteFormat())
            .options(writeParams.get())
            .mode(SparkUtil.toSparkSaveMode(writeParams.getSaveMode()))
            .save(writeParams.getPath());
    }

    /**
     * Spark defaults to writing checksum files (hidden ".crc" files alongside each output file), which will be
     * unfamiliar to Flux users who either are not familiar with Spark or, having used MLCP before, do not expect
     * them. Additionally, the other export commands that use the MarkLogic connector do not offer any support for
     * checksum files. So this feature is disabled for now; it may be exposed via an option later based on feedback.
     *
     * @param hadoopConf the Hadoop configuration associated with the Spark session
     */
    private void disableWriteChecksum(Configuration hadoopConf) {
        try {
            FileSystem.get(hadoopConf).setWriteChecksum(false);
        } catch (Exception e) {
            Util.MAIN_LOGGER.warn("Unable to disable writing Spark checksum files; cause: {}", e.getMessage());
        }
    }
}
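What follows is a minimal, hypothetical sketch (not part of the Flux source) of the subclass pattern this class supports: a concrete command supplies its write parameters and names the Spark data format to write. The class name ExportParquetFilesCommand and the params type WriteParquetFilesParams are illustrative assumptions; only AbstractExportRowsToFilesCommand, WriteStructuredFilesParams, and WriteFilesOptions come from the code above.

/*
 * Hypothetical sketch, not part of the Flux source: WriteParquetFilesParams is an assumed
 * params class; a real command would define its own options mixin.
 */
package com.marklogic.flux.impl.export;

import com.marklogic.flux.api.WriteFilesOptions;
import picocli.CommandLine;

@CommandLine.Command(name = "export-parquet-files")
class ExportParquetFilesCommand extends AbstractExportRowsToFilesCommand {

    @CommandLine.Mixin
    private final WriteParquetFilesParams writeParams = new WriteParquetFilesParams();

    @Override
    protected WriteStructuredFilesParams<? extends WriteFilesOptions> getWriteFilesParams() {
        return writeParams;
    }

    @Override
    protected String getWriteFormat() {
        // "parquet" is the name of Spark's built-in Parquet data source.
        return "parquet";
    }
}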