com.marklogic.spark.reader.document.DocumentScanBuilder Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of marklogic-spark-connector Show documentation

Spark 3 connector for MarkLogic

There is a newer version: 2.4.2

Show newest version

/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */
package com.marklogic.spark.reader.document;

import com.marklogic.spark.Util;
import org.apache.spark.sql.connector.read.Scan;
import org.apache.spark.sql.connector.read.ScanBuilder;
import org.apache.spark.sql.connector.read.SupportsPushDownLimit;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

/**
 * {@link ScanBuilder} that creates a {@link DocumentScan} from a {@link DocumentContext} and accepts a
 * pushed-down row limit via {@link SupportsPushDownLimit}.
 */
class DocumentScanBuilder implements ScanBuilder, SupportsPushDownLimit {

    private final DocumentContext context;

    /**
     * Creates the {@link DocumentContext} shared with the resulting scan and, when file streaming is enabled,
     * logs a notice about it at info level.
     *
     * @param options the options used to construct the context
     * @param schema  the schema used to construct the context
     */
    DocumentScanBuilder(CaseInsensitiveStringMap options, StructType schema) {
        context = new DocumentContext(options, schema);
        logFileStreamingNotice();
    }

    /**
     * Records the limit on the context so that it is available to the scan built later.
     *
     * @param limit the maximum number of rows requested
     * @return always {@code true}, signalling that the limit was accepted
     */
    @Override
    public boolean pushLimit(int limit) {
        context.setLimit(limit);
        return true;
    }

    /**
     * @return always {@code true}; see the comment in the body for the reasoning
     */
    @Override
    public boolean isPartiallyPushed() {
        // Each partition reader can only guarantee that it does not go past the limit on its own. In the worst
        // case every reader returns "limit" rows, so this must report a partial push; Spark then trims the
        // combined dataset down to the requested limit.
        return true;
    }

    /**
     * @return a new {@link DocumentScan} backed by this builder's context
     */
    @Override
    public Scan build() {
        final Scan scan = new DocumentScan(context);
        return scan;
    }

    /**
     * Emits the file-streaming notice, but only when streaming is enabled and info logging is turned on.
     */
    private void logFileStreamingNotice() {
        if (!context.isStreamingFiles()) {
            return;
        }
        if (!Util.MAIN_LOGGER.isInfoEnabled()) {
            return;
        }
        Util.MAIN_LOGGER.info("File streaming is enabled; will read documents from MarkLogic during writer phase.");
    }
}