com.marklogic.flux.impl.SparkUtil Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of flux-api Show documentation
Flux API for data movement with MarkLogic
There is a newer version: 1.0.0.ea1
/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */
package com.marklogic.flux.impl;

import com.marklogic.flux.api.SaveMode;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SparkUtil {

    private SparkUtil() {
        // Required by Sonar.
    }

    public static SparkSession buildSparkSession() {
        return buildSparkSession("local[*]");
    }

    public static SparkSession buildSparkSession(String masterUrl) {
        SparkSession.Builder builder = SparkSession.builder()
            .master(masterUrl)

            // Gets rid of the SUCCESS files. A Flux user who is not familiar with Spark is likely to be confused by
            // these files. A Flux user experienced with Spark may eventually want this to be configurable, but is
            // also likely to be using spark-submit or directly using the MarkLogic Spark connector, in which case the
            // user has control over this.
            .config("spark.hadoop.mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")

            // Spark config options can be provided now or at runtime via spark.conf().set(). The downside to setting
            // options now that are defined by the user is that they won't work when used with spark-submit, which
            // handles constructing a SparkSession. We may eventually provide a feature though for providing options
            // at this point for local users that want more control over the SparkSession itself.
            .config("spark.sql.session.timeZone", "UTC");

        return builder.getOrCreate();
    }

    /**
     * Converts an instance of our SaveMode enum into Spark's SaveMode. This was originally created because JCommander
     * could not deal with Spark's SaveMode on account of the values being camel case. Not sure yet if picocli requires
     * this, so it's being left in place.
     *
     * @param saveMode
     * @return
     */
    public static org.apache.spark.sql.SaveMode toSparkSaveMode(SaveMode saveMode) {
        if (SaveMode.APPEND.equals(saveMode)) {
            return org.apache.spark.sql.SaveMode.Append;
        } else if (SaveMode.ERRORIFEXISTS.equals(saveMode)) {
            return org.apache.spark.sql.SaveMode.ErrorIfExists;
        } else if (SaveMode.IGNORE.equals(saveMode)) {
            return org.apache.spark.sql.SaveMode.Ignore;
        } else if (SaveMode.OVERWRITE.equals(saveMode)) {
            return org.apache.spark.sql.SaveMode.Overwrite;
        }
        return null;
    }

    public static Dataset addFilePathColumn(Dataset dataset) {
        // The MarkLogic Spark connector has special processing for this column. If it's found by the writer, the
        // value of this column will be used to construct an initial URI for the associated document. The column
        // will then have its value set to null so that the value is not included in the JSON or XML
        // serialization of the row.
        return dataset.withColumn("marklogic_spark_file_path", new Column("_metadata.file_path"));
    }
}