// com.marklogic.spark.MarkLogicFileTable — Maven / Gradle / Ivy
// (See the marklogic-spark-connector artifact page for all published versions and documentation.)
/*
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
*/
package com.marklogic.spark;
import com.marklogic.spark.reader.file.FileScanBuilder;
import com.marklogic.spark.writer.file.DocumentFileWriteBuilder;
import org.apache.hadoop.fs.FileStatus;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.connector.read.ScanBuilder;
import org.apache.spark.sql.connector.write.LogicalWriteInfo;
import org.apache.spark.sql.connector.write.WriteBuilder;
import org.apache.spark.sql.execution.datasources.FileFormat;
import org.apache.spark.sql.execution.datasources.v2.FileTable;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;
import scala.Option;
import scala.collection.Seq;
/**
* Extends Spark's FileTable class so that it can make use of that class's file index capabilities, which includes
* support for Spark options like recursiveFileLookup and pathGlobFilter as defined at
* https://spark.apache.org/docs/latest/sql-data-sources-generic-options.html .
*
* A prototype that attempted to create an InMemoryFileIndex, and thus avoid the need to subclass FileTable, was not
* successful. Could not get the following code to run in Java:
* Seq hadoopPaths = DataSource.checkAndGlobPathIfNecessary(this.paths,
* session.sparkContext().hadoopConfiguration(), true, true, numThreads, true);
* For future attempts, the example at https://stackoverflow.com/a/45373345/3306099 is useful for converting a Java
* map into an immutable Scala map.
*/
class MarkLogicFileTable extends FileTable {
private final CaseInsensitiveStringMap options;
private final StructType schema;
MarkLogicFileTable(SparkSession sparkSession, CaseInsensitiveStringMap options, Seq paths, StructType schema) {
super(sparkSession, options, paths, Option.apply(schema));
this.options = options;
this.schema = schema;
}
@Override
public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) {
if ("true".equalsIgnoreCase(options.get(Options.STREAM_FILES)) && Util.MAIN_LOGGER.isInfoEnabled()) {
Util.MAIN_LOGGER.info("File streaming is enabled; will read files during writer phase.");
}
return new FileScanBuilder(options.asCaseSensitiveMap(), super.fileIndex());
}
@Override
public WriteBuilder newWriteBuilder(LogicalWriteInfo info) {
// Need to pass along a serializable object.
return new DocumentFileWriteBuilder(this.options.asCaseSensitiveMap(), this.schema);
}
@Override
public Option inferSchema(Seq files) {
return Option.apply(this.schema);
}
@Override
public String name() {
return "marklogic-file";
}
@Override
public String formatName() {
// Per the docs in FileTable, this is providing an alias for supported file types. It does not appear to have
// any impact on functionality.
return "marklogic";
}
@Override
public Class extends FileFormat> fallbackFileFormat() {
// Per the docs in FileTable, this allows for returning a Spark V1 FileFormat. We don't have support for that,
// so null is returned.
return null;
}
}