/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */

package com.marklogic.spark;

import com.marklogic.spark.reader.file.FileScanBuilder;
import com.marklogic.spark.writer.file.DocumentFileWriteBuilder;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.connector.read.ScanBuilder;
import org.apache.spark.sql.connector.write.LogicalWriteInfo;
import org.apache.spark.sql.connector.write.WriteBuilder;
import org.apache.spark.sql.execution.datasources.FileFormat;
import org.apache.spark.sql.execution.datasources.v2.FileTable;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;
import scala.Option;
import scala.collection.Seq;

/**
 * Extends Spark's FileTable class so that it can make use of that class's file index capabilities, which include
 * support for Spark options like recursiveFileLookup and pathGlobFilter as defined at
 * https://spark.apache.org/docs/latest/sql-data-sources-generic-options.html .
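 *
 * For example (illustrative only, not taken from this project's documentation; the format alias and option
 * values shown here are assumptions), a read that relies on those generic options might look like:
 * <pre>{@code
 * Dataset<Row> rows = sparkSession.read()
 *     .format("marklogic-file")
 *     .option("recursiveFileLookup", "true")
 *     .option("pathGlobFilter", "*.xml")
 *     .load("path/to/files");
 * }</pre>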
 *
 * A prototype that attempted to create an InMemoryFileIndex, and thus avoid the need to subclass FileTable, was not
 * successful; the following code could not be made to run in Java:
 *
 *   Seq hadoopPaths = DataSource.checkAndGlobPathIfNecessary(this.paths,
 *       session.sparkContext().hadoopConfiguration(), true, true, numThreads, true);
 *
 * For future attempts, the example at https://stackoverflow.com/a/45373345/3306099 is useful for converting a Java
 * map into an immutable Scala map.
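 *
 * A rough sketch of that conversion, for reference only (the variable names are hypothetical and the converter
 * calls are assumptions based on that answer, not code used by this project):
 * <pre>{@code
 * scala.collection.immutable.Map<String, String> scalaMap =
 *     scala.collection.JavaConverters.mapAsScalaMapConverter(javaMap).asScala()
 *         .toMap(scala.Predef.$conforms());
 * }</pre>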
 */
class MarkLogicFileTable extends FileTable {

    private final CaseInsensitiveStringMap options;
    private final StructType schema;

    MarkLogicFileTable(SparkSession sparkSession, CaseInsensitiveStringMap options, Seq<String> paths, StructType schema) {
        super(sparkSession, options, paths, Option.apply(schema));
        if (isWriteFilesOperation(options, paths)) {
            makeWritePath(paths.head(), sparkSession);
        }
        this.options = options;
        this.schema = schema;
    }
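
    // Context, not taken from this class: a caller would typically opt into the streaming behavior checked in
    // newScanBuilder below via something like .option(Options.STREAM_FILES, "true") on the Spark reader.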
    @Override
    public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) {
        if ("true".equalsIgnoreCase(options.get(Options.STREAM_FILES)) && Util.MAIN_LOGGER.isInfoEnabled()) {
            Util.MAIN_LOGGER.info("File streaming is enabled; will read files during writer phase.");
        }
        return new FileScanBuilder(options.asCaseSensitiveMap(), super.fileIndex());
    }

    @Override
    public WriteBuilder newWriteBuilder(LogicalWriteInfo info) {
        // Need to pass along a serializable object.
        return new DocumentFileWriteBuilder(this.options.asCaseSensitiveMap(), this.schema);
    }

    @Override
    public Option<StructType> inferSchema(Seq<FileStatus> files) {
        return Option.apply(this.schema);
    }

    @Override
    public String name() {
        return "marklogic-file";
    }

    @Override
    public String formatName() {
        // Per the docs in FileTable, this is providing an alias for supported file types. It does not appear to have
        // any impact on functionality.
        return "marklogic";
    }

    @Override
    public Class<? extends FileFormat> fallbackFileFormat() {
        // Per the docs in FileTable, this allows for returning a Spark V1 FileFormat. We don't have support for that,
        // so null is returned.
        return null;
    }

    private boolean isWriteFilesOperation(CaseInsensitiveStringMap options, Seq<String> paths) {
        // When writing files, a user is limited to a single path. So if the user provides multiple paths when
        // reading files, we immediately know it's not a write operation.
        if (paths.size() != 1) {
            return false;
        }
        return options.keySet().stream()
            // There are no required options to indicate whether this is a read or write. So we at least check for some
            // options that would indicate that the user is reading files, in which case we don't need to call mkdirs.
            .noneMatch(key -> key.startsWith("spark.marklogic.read.files")
                || key.startsWith("spark.marklogic.read.aggregates.xml")
            );
    }

    private void makeWritePath(String path, SparkSession sparkSession) {
        Configuration config = sparkSession.sparkContext().hadoopConfiguration();
        try {
            Path hadoopPath = new Path(path);
            FileSystem fileSystem = hadoopPath.getFileSystem(config);
            if (!fileSystem.exists(hadoopPath)) {
                if (Util.MAIN_LOGGER.isDebugEnabled()) {
                    Util.MAIN_LOGGER.debug("Calling mkdirs on path: {}", path);
                }
                fileSystem.mkdirs(hadoopPath);
            }
        } catch (Exception ex) {
            // We'll get an exception in one of two scenarios. First, this is a write operation and the mkdirs call
            // fails. In that scenario, the user will still get an AnalysisException from Spark noting that the path
            // does not exist. The user likely won't be able to make the path themselves - perhaps a permissions
            // issue - and thus has the info they need to proceed. In the other scenario, this is a read operation
            // and the mkdirs call didn't need to happen. In that scenario, the user will still get an
            // AnalysisException because the directory could not be found, so the user has enough information to
            // proceed there as well. For those reasons, this is only logged at the debug level, as the
            // AnalysisException should be sufficient for helping the user fix their problem.
            if (Util.MAIN_LOGGER.isDebugEnabled()) {
                Util.MAIN_LOGGER.debug("Unable to call mkdirs on path: {}; cause: {}", path, ex.getMessage());
            }
        }
    }
}