com.marklogic.spark.reader.optic.OpticScanBuilder
Spark 3 connector for MarkLogic
/*
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
*/
package com.marklogic.spark.reader.optic;
import com.marklogic.spark.Options;
import com.marklogic.spark.Util;
import com.marklogic.spark.reader.filter.FilterFactory;
import com.marklogic.spark.reader.filter.OpticFilter;
import org.apache.spark.sql.connector.expressions.SortOrder;
import org.apache.spark.sql.connector.expressions.aggregate.*;
import org.apache.spark.sql.connector.read.*;
import org.apache.spark.sql.sources.Filter;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.stream.Stream;

public class OpticScanBuilder implements ScanBuilder, SupportsPushDownFilters, SupportsPushDownLimit,
    SupportsPushDownTopN, SupportsPushDownAggregates, SupportsPushDownRequiredColumns {

    private static final Logger logger = LoggerFactory.getLogger(OpticScanBuilder.class);

    private final OpticReadContext opticReadContext;

    private List<Filter> pushedFilters;

    private static final Set<Class<? extends AggregateFunc>> SUPPORTED_AGGREGATE_FUNCTIONS = new HashSet<>();

    static {
        SUPPORTED_AGGREGATE_FUNCTIONS.add(Avg.class);
        SUPPORTED_AGGREGATE_FUNCTIONS.add(Count.class);
        SUPPORTED_AGGREGATE_FUNCTIONS.add(CountStar.class);
        SUPPORTED_AGGREGATE_FUNCTIONS.add(Max.class);
        SUPPORTED_AGGREGATE_FUNCTIONS.add(Min.class);
        SUPPORTED_AGGREGATE_FUNCTIONS.add(Sum.class);
    }
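
    // Illustrative only: hedged examples of Spark calls that surface the aggregate
    // functions above during push-down negotiation (column names are made up):
    //
    //   df.groupBy("CitationID").count();                        // Count / CountStar
    //   df.groupBy("CitationID").avg("LuckyNumber");             // Avg
    //   df.groupBy("CitationID").agg(
    //       org.apache.spark.sql.functions.min("LuckyNumber"),   // Min
    //       org.apache.spark.sql.functions.max("LuckyNumber"),   // Max
    //       org.apache.spark.sql.functions.sum("LuckyNumber")    // Sum
    //   );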

    public OpticScanBuilder(OpticReadContext opticReadContext) {
        this.opticReadContext = opticReadContext;
    }

    @Override
    public Scan build() {
        if (logger.isTraceEnabled()) {
            logger.trace("Creating new scan");
        }
        return new OpticScan(opticReadContext);
    }
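
    // Note on ordering: with the Spark DataSource V2 API, the push-down methods below
    // (pushFilters, pushLimit, pushTopN, pushAggregation, pruneColumns) are invoked by
    // Spark before build(), so the OpticReadContext handed to OpticScan already
    // reflects every successful push-down.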

    /**
     * @param filters the filters that Spark would like to push down
     * @return an array of the filters that were not pushed down, so that Spark knows it must apply them to the data
     * returned by the connector
     */
    @Override
    public Filter[] pushFilters(Filter[] filters) {
        pushedFilters = new ArrayList<>();
        if (opticReadContext.planAnalysisFoundNoRows()) {
            return filters;
        }

        List<Filter> unsupportedFilters = new ArrayList<>();
        List<OpticFilter> opticFilters = new ArrayList<>();
        if (logger.isDebugEnabled()) {
            logger.debug("Filter count: {}", filters.length);
        }
        for (Filter filter : filters) {
            OpticFilter opticFilter = FilterFactory.toPlanFilter(filter);
            if (opticFilter != null && opticFilter.isValid()) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Pushing down filter: {}", filter);
                }
                opticFilters.add(opticFilter);
                this.pushedFilters.add(filter);
            } else {
                if (logger.isDebugEnabled()) {
                    logger.debug("Unsupported filter, will be handled by Spark: {}", filter);
                }
                unsupportedFilters.add(filter);
            }
        }

        opticReadContext.pushDownFiltersIntoOpticQuery(opticFilters);
        return unsupportedFilters.toArray(new Filter[0]);
    }
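
    // Hedged sketch of the contract above, using a hypothetical column name. If Spark
    // collects a filter that FilterFactory can translate, it is absorbed into the
    // Optic query and omitted from the returned array:
    //
    //   Filter supported = new org.apache.spark.sql.sources.EqualTo("CitationID", 1);
    //   Filter[] remaining = scanBuilder.pushFilters(new Filter[]{supported});
    //   // remaining is empty, and scanBuilder.pushedFilters() now reports EqualTo
    //   // as being handled by MarkLogic rather than by Spark.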

    /**
     * @return array of filters that were pushed down to MarkLogic so that Spark knows not to apply them itself
     */
    @Override
    public Filter[] pushedFilters() {
        return pushedFilters.toArray(new Filter[0]);
    }

    @Override
    public boolean pushLimit(int limit) {
        if (opticReadContext.planAnalysisFoundNoRows()) {
            return false;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Pushing down limit: {}", limit);
        }
        opticReadContext.pushDownLimit(limit);
        return true;
    }
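
    // For example, df.limit(10) in a Spark program results in pushLimit(10) being
    // invoked here, with the limit then applied within the Optic query itself.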

    @Override
    public boolean pushTopN(SortOrder[] orders, int limit) {
        if (opticReadContext.planAnalysisFoundNoRows()) {
            return false;
        }
        // This is invoked when the user calls both orderBy and limit in their Spark program. If the user only
        // calls limit, then only pushLimit is called. If the user only calls orderBy and not limit, then neither
        // this method nor pushLimit is called.
        if (logger.isDebugEnabled()) {
            logger.debug("Pushing down topN: {}; limit: {}", Arrays.asList(orders), limit);
        }
        opticReadContext.pushDownTopN(orders, limit);
        return true;
    }
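
    // Hedged example, with a made-up column name:
    //
    //   df.orderBy(org.apache.spark.sql.functions.col("LastName").desc()).limit(5);
    //
    // produces a single pushTopN call with one descending SortOrder and a limit of 5,
    // whereas df.limit(5) on its own routes through pushLimit above instead.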

    @Override
    public boolean isPartiallyPushed() {
        // If a single bucket exists - i.e. a single call will be made to MarkLogic - then any limit/orderBy can be
        // fully pushed down to MarkLogic. Otherwise, we also need Spark to apply the limit/orderBy to the returned
        // rows to ensure that the user receives the correct response.
        return opticReadContext.getBucketCount() > 1;
    }
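
    // Concrete example: with 3 buckets and df.limit(10), each of the 3 requests to
    // MarkLogic applies the limit, so up to 30 rows may come back in total; Spark then
    // re-applies the limit (and any ordering) to the merged rows, hence "partially pushed".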

    /**
     * Per the Spark javadocs, this should return true if we can push down the entire aggregation. This is only
     * possible if every aggregation function is supported and if only one request will be made to MarkLogic. If
     * multiple requests are made to MarkLogic (based on the user-defined partition count and batch size), then
     * Spark has to apply the aggregation against the combined set of rows returned from all requests to MarkLogic.
     *
     * @param aggregation the aggregation that Spark would like to push down
     * @return true if the entire aggregation can be pushed down to MarkLogic
     */
    @Override
    public boolean supportCompletePushDown(Aggregation aggregation) {
        if (opticReadContext.planAnalysisFoundNoRows() || pushDownAggregatesIsDisabled()) {
            return false;
        }
        if (hasUnsupportedAggregateFunction(aggregation)) {
            if (Util.MAIN_LOGGER.isInfoEnabled()) {
                Util.MAIN_LOGGER.info("Aggregation contains one or more unsupported functions, " +
                    "so not pushing aggregation to MarkLogic: {}", describeAggregation(aggregation));
            }
            return false;
        }
        if (opticReadContext.getBucketCount() > 1) {
            if (Util.MAIN_LOGGER.isInfoEnabled()) {
                Util.MAIN_LOGGER.info("Multiple requests will be made to MarkLogic; aggregation will be applied by Spark as well: {}",
                    describeAggregation(aggregation));
            }
            return false;
        }
        return true;
    }
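
    // Hedged illustration: with a single bucket, something like df.groupBy("CitationID").count()
    // (hypothetical column) can be answered entirely by MarkLogic, so this returns true and
    // Spark skips its own aggregation. With multiple buckets this returns false, but Spark may
    // still call pushAggregation below and then combine the per-request partial results itself.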

    @Override
    public boolean pushAggregation(Aggregation aggregation) {
        // For the initial 2.0 release, there aren't any known unsupported aggregate functions that can be called
        // after a "groupBy". If one is detected though, the aggregation won't be pushed down, as it's uncertain
        // whether pushing it down would produce the correct results.
        if (opticReadContext.planAnalysisFoundNoRows() || hasUnsupportedAggregateFunction(aggregation)) {
            return false;
        }
        if (pushDownAggregatesIsDisabled()) {
            Util.MAIN_LOGGER.info("Push down of aggregates is disabled; Spark will handle all aggregations.");
            return false;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Pushing down aggregation: {}", describeAggregation(aggregation));
        }
        opticReadContext.pushDownAggregation(aggregation);
        return true;
    }
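
    // For example, df.groupBy("CitationID").count() (hypothetical column) reaches this
    // method with one groupBy expression and a count aggregate; both are supported, so
    // the aggregation is pushed into the Optic plan and this returns true.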

    @Override
    public void pruneColumns(StructType requiredSchema) {
        if (opticReadContext.planAnalysisFoundNoRows()) {
            return;
        }
        if (requiredSchema.equals(opticReadContext.getSchema())) {
            if (logger.isDebugEnabled()) {
                logger.debug("The schema to push down is equal to the existing schema, so not pushing it down.");
            }
        } else {
            // Keeping this at debug level as it can be fairly verbose.
            if (logger.isDebugEnabled()) {
                logger.debug("Pushing down required schema: {}", requiredSchema.json());
            }
            opticReadContext.pushDownRequiredSchema(requiredSchema);
        }
    }
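
    // For example, df.select("CitationID", "LastName") (hypothetical columns) causes
    // pruneColumns to be called with a two-field StructType, which is then pushed down
    // so that each row returned by MarkLogic contains only those two columns.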

    private boolean hasUnsupportedAggregateFunction(Aggregation aggregation) {
        return Stream
            .of(aggregation.aggregateExpressions())
            .anyMatch(func -> !SUPPORTED_AGGREGATE_FUNCTIONS.contains(func.getClass()));
    }

    private String describeAggregation(Aggregation aggregation) {
        return String.format("groupBy: %s; aggregates: %s",
            Arrays.asList(aggregation.groupByExpressions()),
            Arrays.asList(aggregation.aggregateExpressions()));
    }

    private boolean pushDownAggregatesIsDisabled() {
        return "false".equalsIgnoreCase(opticReadContext.getProperties().get(Options.READ_PUSH_DOWN_AGGREGATES));
    }
}
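
// Hedged end-to-end sketch of how this builder is typically exercised. The format name
// and option keys are assumptions based on the connector's documented options and may
// differ by version; the view and column names are made up.
//
//   Dataset<Row> df = sparkSession.read()
//       .format("marklogic")
//       .option("spark.marklogic.client.uri", "user:password@localhost:8000")
//       .option("spark.marklogic.read.opticQuery", "op.fromView('Example', 'Citations')")
//       .load();
//
//   // Each call below exercises one of the push-down methods in OpticScanBuilder:
//   df.filter("CitationID == 1")   // pushFilters
//       .select("LastName")        // pruneColumns
//       .orderBy("LastName")       // pushTopN (because limit is also present)
//       .limit(10)
//       .show();
//
//   // To force Spark to perform all aggregations itself, set
//   // "spark.marklogic.read.pushDownAggregates" to "false" (per pushDownAggregatesIsDisabled above).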