// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

package com.azure.cosmos.implementation.batch;

import com.azure.cosmos.BridgeInternal;
import com.azure.cosmos.CosmosAsyncClient;
import com.azure.cosmos.CosmosAsyncContainer;
import com.azure.cosmos.CosmosBridgeInternal;
import com.azure.cosmos.CosmosEndToEndOperationLatencyPolicyConfig;
import com.azure.cosmos.CosmosException;
import com.azure.cosmos.CosmosItemSerializer;
import com.azure.cosmos.ThrottlingRetryOptions;
import com.azure.cosmos.implementation.AsyncDocumentClient;
import com.azure.cosmos.implementation.CosmosBulkExecutionOptionsImpl;
import com.azure.cosmos.implementation.CosmosSchedulers;
import com.azure.cosmos.implementation.HttpConstants;
import com.azure.cosmos.implementation.ImplementationBridgeHelpers;
import com.azure.cosmos.implementation.OperationType;
import com.azure.cosmos.implementation.RequestOptions;
import com.azure.cosmos.implementation.ResourceType;
import com.azure.cosmos.implementation.apachecommons.lang.tuple.Pair;
import com.azure.cosmos.implementation.spark.OperationContextAndListenerTuple;
import com.azure.cosmos.models.CosmosBatchOperationResult;
import com.azure.cosmos.models.CosmosBatchResponse;
import com.azure.cosmos.models.CosmosBulkItemResponse;
import com.azure.cosmos.models.CosmosBulkOperationResponse;
import com.azure.cosmos.models.CosmosItemOperation;
import com.azure.cosmos.models.CosmosItemOperationType;
import com.azure.cosmos.models.ModelBridgeInternal;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import reactor.core.Disposable;
import reactor.core.Exceptions;
import reactor.core.publisher.Flux;
import reactor.core.publisher.FluxProcessor;
import reactor.core.publisher.FluxSink;
import reactor.core.publisher.GroupedFlux;
import reactor.core.publisher.Mono;
import reactor.core.publisher.SignalType;
import reactor.core.publisher.Sinks;
import reactor.core.publisher.UnicastProcessor;
import reactor.core.scheduler.Scheduler;
import reactor.util.function.Tuple2;

import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;

import static com.azure.core.util.FluxUtil.withContext;
import static com.azure.cosmos.implementation.guava25.base.Preconditions.checkNotNull;

/**
 * The core logic of bulk execution lives here.
 *
 * The actual execution of the flux of operations is done in the following steps:
 *
 * 1. Resolve the partition key range ID for each operation and group the operations by that ID.
 * 2. For the flux of operations in each group, buffer based on batch size and a maximum interval.
 * 3. Process each buffered batch with a single batch request and return a wrapper holding the request,
 *    the response (if any) and the exception (if any). Exactly one of response or exception will be set.
 * 4. Any internal retry is handled by an intermediate sink added for each grouped flux.
 * 5. Any operation which failed because its partition key range is gone is retried by putting it back into the
 *    main sink, which leads to re-resolution of the partition key range ID.
 * 6. At the end - and this is essential - all sinks are closed, because a sink keeps waiting for more elements,
 *    so execution would not finish even after all operations have been executed (detected via the completion
 *    call of the source).
 *
 * Note: Sink will move to a new interface from Reactor 3.5; this is the documentation for it:
 *    - https://github.com/reactor/reactor-core/blob/master/docs/asciidoc/processors.adoc
 *
 *    For our use case, Sinks.many().unicast() will work.
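 *
 * <p>Illustrative usage sketch only (a minimal sketch, not how the SDK itself wires this up) - BulkExecutor
 * is an internal type that is normally driven through the public bulk APIs on {@link CosmosAsyncContainer}.
 * The {@code container}, {@code bulkOptionsImpl} and {@code item} references below are assumed to be provided
 * by the caller and are not defined by this class:
 * <pre>{@code
 * Flux<CosmosItemOperation> operations = Flux.just(
 *     CosmosBulkOperations.getCreateItemOperation(item, new PartitionKey(item.getId())));
 * BulkExecutor<Object> executor = new BulkExecutor<>(container, operations, bulkOptionsImpl);
 * executor
 *     .execute()
 *     .subscribe(response -> System.out.println(response.getOperation().getId()));
 * }</pre>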
 */
public final class BulkExecutor<TContext> implements Disposable {

    private final static Logger logger = LoggerFactory.getLogger(BulkExecutor.class);
    private final static AtomicLong instanceCount = new AtomicLong(0);
    private static final ImplementationBridgeHelpers.CosmosAsyncClientHelper.CosmosAsyncClientAccessor clientAccessor =
        ImplementationBridgeHelpers.CosmosAsyncClientHelper.getCosmosAsyncClientAccessor();

    private final CosmosAsyncContainer container;
    private final int maxMicroBatchPayloadSizeInBytes;
    private final AsyncDocumentClient docClientWrapper;
    private final String operationContextText;
    private final OperationContextAndListenerTuple operationListener;
    private final ThrottlingRetryOptions throttlingRetryOptions;
    private final Flux<CosmosItemOperation> inputOperations;

    // Options for bulk execution.
    private final Long maxMicroBatchIntervalInMs;

    private final TContext batchContext;
    private final ConcurrentMap<String, PartitionScopeThresholds> partitionScopeThresholds;
    private final CosmosBulkExecutionOptionsImpl cosmosBulkExecutionOptions;

    // Handle gone error:
    private final AtomicBoolean mainSourceCompleted;
    private final AtomicBoolean isDisposed = new AtomicBoolean(false);
    private final AtomicBoolean isShutdown = new AtomicBoolean(false);
    private final AtomicInteger totalCount;
    private final static Sinks.EmitFailureHandler serializedEmitFailureHandler = new SerializedEmitFailureHandler();
    private final static Sinks.EmitFailureHandler serializedCompleteEmitFailureHandler =
        new SerializedCompleteEmitFailureHandler();
    private final Sinks.Many<CosmosItemOperation> mainSink;
    private final List<FluxSink<CosmosItemOperation>> groupSinks;
    private final CosmosAsyncClient cosmosClient;
    private final String bulkSpanName;
    private final AtomicReference<Disposable> scheduledFutureForFlush;
    private final String identifier = "BulkExecutor-" + instanceCount.incrementAndGet();
    private final BulkExecutorDiagnosticsTracker diagnosticsTracker;
    private final CosmosItemSerializer effectiveItemSerializer;
    private final Scheduler executionScheduler;

    @SuppressWarnings({"unchecked"})
    public BulkExecutor(CosmosAsyncContainer container,
                        Flux<CosmosItemOperation> inputOperations,
                        CosmosBulkExecutionOptionsImpl cosmosBulkOptions) {

        checkNotNull(container, "expected non-null container");
        checkNotNull(inputOperations, "expected non-null inputOperations");
        checkNotNull(cosmosBulkOptions, "expected non-null bulkOptions");

        this.maxMicroBatchPayloadSizeInBytes = cosmosBulkOptions.getMaxMicroBatchPayloadSizeInBytes();
        this.cosmosBulkExecutionOptions = cosmosBulkOptions;
        this.container = container;
        this.bulkSpanName = "nonTransactionalBatch." + this.container.getId();
        this.inputOperations = inputOperations;
        this.docClientWrapper = CosmosBridgeInternal.getAsyncDocumentClient(container.getDatabase());
        this.cosmosClient = ImplementationBridgeHelpers
            .CosmosAsyncDatabaseHelper
            .getCosmosAsyncDatabaseAccessor()
            .getCosmosAsyncClient(container.getDatabase());
        this.effectiveItemSerializer = this.docClientWrapper.getEffectiveItemSerializer(cosmosBulkOptions.getCustomItemSerializer());

        this.throttlingRetryOptions = docClientWrapper.getConnectionPolicy().getThrottlingRetryOptions();

        // Fill in the options first, to make the BulkProcessingOptions effectively immutable; if they were
        // accessed directly, we might get different values when a new group is created.
        maxMicroBatchIntervalInMs = cosmosBulkExecutionOptions.getMaxMicroBatchInterval().toMillis();
        batchContext = (TContext) cosmosBulkExecutionOptions.getLegacyBatchScopedContext();
        this.partitionScopeThresholds = ImplementationBridgeHelpers.CosmosBulkExecutionThresholdsStateHelper
            .getBulkExecutionThresholdsAccessor()
            .getPartitionScopeThresholds(cosmosBulkExecutionOptions.getThresholdsState());
        operationListener = cosmosBulkExecutionOptions.getOperationContextAndListenerTuple();
        if (operationListener != null &&
            operationListener.getOperationContext() != null) {
            operationContextText = identifier + "[" + operationListener.getOperationContext().toString() + "]";
        } else {
            operationContextText = identifier + "[n/a]";
        }

        this.diagnosticsTracker = cosmosBulkExecutionOptions.getDiagnosticsTracker();

        // Initialize sink for handling gone error.
        mainSourceCompleted = new AtomicBoolean(false);
        totalCount = new AtomicInteger(0);
        mainSink = Sinks.many().unicast().onBackpressureBuffer();
        groupSinks = new CopyOnWriteArrayList<>();

        this.scheduledFutureForFlush = new AtomicReference<>(CosmosSchedulers
            .BULK_EXECUTOR_FLUSH_BOUNDED_ELASTIC
            .schedulePeriodically(
                this::onFlush,
                this.maxMicroBatchIntervalInMs,
                this.maxMicroBatchIntervalInMs,
                TimeUnit.MILLISECONDS));

        Scheduler schedulerSnapshotFromOptions = cosmosBulkOptions.getSchedulerOverride();
        this.executionScheduler = schedulerSnapshotFromOptions != null
            ? schedulerSnapshotFromOptions
            : CosmosSchedulers.BULK_EXECUTOR_BOUNDED_ELASTIC;

        logger.debug("Instantiated BulkExecutor, Context: {}",
            this.operationContextText);
    }

    @Override
    public void dispose() {
        if (this.isDisposed.compareAndSet(false, true)) {
            long totalCountSnapshot = totalCount.get();
            if (totalCountSnapshot == 0) {
                completeAllSinks();
            } else {
                this.shutdown();
            }
        }
    }

    @Override
    public boolean isDisposed() {
        return this.isDisposed.get();
    }

    private void cancelFlushTask(boolean initializeAggressiveFlush) {
        long flushIntervalAfterDrainingIncomingFlux = Math.min(
            this.maxMicroBatchIntervalInMs,
            BatchRequestResponseConstants
                .DEFAULT_MAX_MICRO_BATCH_INTERVAL_AFTER_DRAINING_INCOMING_FLUX_IN_MILLISECONDS);

        Disposable newFlushTask = initializeAggressiveFlush
            ? CosmosSchedulers
                .BULK_EXECUTOR_FLUSH_BOUNDED_ELASTIC
                .schedulePeriodically(
                    this::onFlush,
                    flushIntervalAfterDrainingIncomingFlux,
                    flushIntervalAfterDrainingIncomingFlux,
                    TimeUnit.MILLISECONDS)
            : null;

        Disposable scheduledFutureSnapshot = this.scheduledFutureForFlush.getAndSet(newFlushTask);

        if (scheduledFutureSnapshot != null) {
            try {
                scheduledFutureSnapshot.dispose();
                logDebugOrWarning("Cancelled all future scheduled tasks {}, Context: {}", getThreadInfo(), this.operationContextText);
            } catch (Exception e) {
                logger.warn("Failed to cancel scheduled tasks{}, Context: {}", getThreadInfo(), this.operationContextText, e);
            }
        }
    }

    private void logInfoOrWarning(String msg, Object... args) {
        if (this.diagnosticsTracker == null || !this.diagnosticsTracker.verboseLoggingAfterReEnqueueingRetriesEnabled()) {
            logger.info(msg, args);
        } else {
            logger.warn(msg, args);
        }
    }

    private void logTraceOrWarning(String msg, Object... args) {
        if (this.diagnosticsTracker == null || !this.diagnosticsTracker.verboseLoggingAfterReEnqueueingRetriesEnabled()) {
            logger.trace(msg, args);
        } else {
            logger.warn(msg, args);
        }
    }

    private void logDebugOrWarning(String msg, Object... args) {
        if (this.diagnosticsTracker == null || !this.diagnosticsTracker.verboseLoggingAfterReEnqueueingRetriesEnabled()) {
            logger.debug(msg, args);
        } else {
            logger.warn(msg, args);
        }
    }

    private void shutdown() {
        if (this.isShutdown.compareAndSet(false, true)) {
            logDebugOrWarning("Shutting down, Context: {}", this.operationContextText);

            groupSinks.forEach(FluxSink::complete);
            logger.debug("All group sinks completed, Context: {}", this.operationContextText);

            this.cancelFlushTask(false);
        }
    }

    public Flux<CosmosBulkOperationResponse<TContext>> execute() {
        return this
            .executeCore()
            .doFinally((SignalType signal) -> {
                if (signal == SignalType.ON_COMPLETE) {
                    logDebugOrWarning("BulkExecutor.execute flux completed - # left items {}, Context: {}, {}",
                        this.totalCount.get(),
                        this.operationContextText,
                        getThreadInfo());
                } else {
                    int itemsLeftSnapshot = this.totalCount.get();
                    if (itemsLeftSnapshot > 0) {
                        logInfoOrWarning("BulkExecutor.execute flux terminated - Signal: {} - # left items {}, Context: {}, {}",
                            signal,
                            itemsLeftSnapshot,
                            this.operationContextText,
                            getThreadInfo());
                    } else {
                        logDebugOrWarning("BulkExecutor.execute flux terminated - Signal: {} - # left items {}, Context: {}, {}",
                            signal,
                            itemsLeftSnapshot,
                            this.operationContextText,
                            getThreadInfo());
                    }
                }

                this.dispose();
            });
    }

    private Flux<CosmosBulkOperationResponse<TContext>> executeCore() {

        // The groupBy below can hang if the flatMap that consumes the groups does not allow
        // a concurrency of at least the number of unique keys being grouped on.
        // The groupBy is used to isolate Cosmos physical partitions,
        // so when there is no config override we enforce that the flatMap uses a concurrency of
        // Math.max(default concurrency (256), # of partitions * 2 (to accommodate some splits)).
        // The config override can be used by the Spark connector when customers follow best practices and
        // repartition the data frame, so that no single Spark partition contains data spread across all
        // physical partitions. When repartitioning the incoming data it is possible to ensure that each
        // Spark partition only targets a subset of Cosmos partitions. This improves efficiency
        // and means fewer than # of partitions concurrent groups are needed for
        // large containers (with hundreds of physical partitions).
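        // Worked example (illustrative numbers): with no override, a container with 4 physical partitions
        // gets a concurrency of Math.max(256, 4 * 2) = 256, while a container with 200 physical partitions
        // gets Math.max(256, 200 * 2) = 400 concurrently processed partition groups.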
        Integer nullableMaxConcurrentCosmosPartitions = cosmosBulkExecutionOptions.getMaxConcurrentCosmosPartitions();
        Mono<Integer> maxConcurrentCosmosPartitionsMono = nullableMaxConcurrentCosmosPartitions != null ?
            Mono.just(Math.max(256, nullableMaxConcurrentCosmosPartitions)) :
            ImplementationBridgeHelpers
                .CosmosAsyncContainerHelper
                .getCosmosAsyncContainerAccessor()
                .getFeedRanges(this.container, false).map(ranges -> Math.max(256, ranges.size() * 2));

        return
            maxConcurrentCosmosPartitionsMono
            .subscribeOn(this.executionScheduler)
            .flatMapMany(maxConcurrentCosmosPartitions -> {

                logDebugOrWarning("BulkExecutor.execute with MaxConcurrentPartitions: {}, Context: {}",
                    maxConcurrentCosmosPartitions,
                    this.operationContextText);

                return this.inputOperations
                    .publishOn(this.executionScheduler)
                    .onErrorMap(throwable -> {
                        logger.warn("{}: Error observed when processing inputOperations. Cause: {}, Context: {}",
                            getThreadInfo(),
                            throwable.getMessage(),
                            this.operationContextText,
                            throwable);

                        return throwable;
                    })
                    .doOnNext((CosmosItemOperation cosmosItemOperation) -> {
                        // Set the retry policy before starting execution. This should only happen once.
                        BulkExecutorUtil.setRetryPolicyForBulk(
                            docClientWrapper,
                            this.container,
                            cosmosItemOperation,
                            this.throttlingRetryOptions);

                        if (cosmosItemOperation != FlushBuffersItemOperation.singleton()) {
                            totalCount.incrementAndGet();
                        }

                        logger.trace(
                            "SetupRetryPolicy, {}, TotalCount: {}, Context: {}, {}",
                            getItemOperationDiagnostics(cosmosItemOperation),
                            totalCount.get(),
                            this.operationContextText,
                            getThreadInfo()
                        );
                    })
                    .doOnComplete(() -> {
                        mainSourceCompleted.set(true);

                        long totalCountSnapshot = totalCount.get();
                        logDebugOrWarning("Main source completed - # left items {}, Context: {}",
                            totalCountSnapshot,
                            this.operationContextText);
                        if (totalCountSnapshot == 0) {
                            // This is needed because onComplete can be invoked after the last element was processed,
                            // so also complete the sinks here if the count is 0. If the source has completed and the
                            // count isn't zero, the last element in doOnNext will close them. The sink doesn't mind a double close.

                            completeAllSinks();
                        } else {
                            this.cancelFlushTask(true);
                            this.onFlush();

                            logDebugOrWarning("Scheduled new flush operation {}, Context: {}", getThreadInfo(), this.operationContextText);
                        }
                    })
                    .mergeWith(mainSink.asFlux())
                    .subscribeOn(this.executionScheduler)
                    .flatMap(
                        operation -> {
                            logger.trace("Before Resolve PkRangeId, {}, Context: {} {}",
                                getItemOperationDiagnostics(operation),
                                this.operationContextText,
                                getThreadInfo());

                            // Resolve the partition key range id again for operations which come into the main sink due to a gone retry.
                            return BulkExecutorUtil.resolvePartitionKeyRangeId(this.docClientWrapper, this.container, operation)
                                                   .map((String pkRangeId) -> {
                                                       PartitionScopeThresholds partitionScopeThresholds =
                                                           this.partitionScopeThresholds.computeIfAbsent(
                                                               pkRangeId,
                                                               (newPkRangeId) -> new PartitionScopeThresholds(newPkRangeId, this.cosmosBulkExecutionOptions));

                                                       logger.trace("Resolved PkRangeId, {}, PKRangeId: {} Context: {} {}",
                                                           getItemOperationDiagnostics(operation),
                                                           pkRangeId,
                                                           this.operationContextText,
                                                           getThreadInfo());

                                                       return Pair.of(partitionScopeThresholds, operation);
                                                   });
                        })
                    .groupBy(Pair::getKey, Pair::getValue)
                    .flatMap(
                        this::executePartitionedGroup,
                        maxConcurrentCosmosPartitions)
                    .subscribeOn(this.executionScheduler)
                    .doOnNext(requestAndResponse -> {

                        int totalCountAfterDecrement = totalCount.decrementAndGet();
                        boolean mainSourceCompletedSnapshot = mainSourceCompleted.get();
                        if (totalCountAfterDecrement == 0 && mainSourceCompletedSnapshot) {
                            // The count can be zero while more elements are still arriving from the source, which is
                            // why completion also requires the main source to be done. Count 0 also signifies that
                            // there are no pending elements in any sink.
                            logDebugOrWarning("All work completed, {}, TotalCount: {}, Context: {} {}",
                                getItemOperationDiagnostics(requestAndResponse.getOperation()),
                                totalCountAfterDecrement,
                                this.operationContextText,
                                getThreadInfo());
                            completeAllSinks();
                        } else {
                            if (totalCountAfterDecrement == 0) {
                                logDebugOrWarning(
                                    "No Work left - but mainSource not yet completed, Context: {} {}",
                                    this.operationContextText,
                                    getThreadInfo());
                            }
                            logTraceOrWarning(
                                "Work left - TotalCount after decrement: {}, main sink completed {}, {}, Context: {} {}",
                                totalCountAfterDecrement,
                                mainSourceCompletedSnapshot,
                                getItemOperationDiagnostics(requestAndResponse.getOperation()),
                                this.operationContextText,
                                getThreadInfo());
                        }
                    })
                    .doOnComplete(() -> {
                        int totalCountSnapshot = totalCount.get();
                        boolean mainSourceCompletedSnapshot = mainSourceCompleted.get();
                        if (totalCountSnapshot == 0 && mainSourceCompletedSnapshot) {
                            // The count can be zero while more elements are still arriving from the source, which is
                            // why completion also requires the main source to be done. Count 0 also signifies that
                            // there are no pending elements in any sink.
                            logDebugOrWarning("DoOnComplete: All work completed, Context: {}", this.operationContextText);
                            completeAllSinks();
                        } else {
                            logDebugOrWarning(
                                "DoOnComplete: Work left - TotalCount after decrement: {}, main sink completed {}, Context: {} {}",
                                totalCountSnapshot,
                                mainSourceCompletedSnapshot,
                                this.operationContextText,
                                getThreadInfo());
                        }
                    });
            });
    }

    private Flux<CosmosBulkOperationResponse<TContext>> executePartitionedGroup(
        GroupedFlux<PartitionScopeThresholds, CosmosItemOperation> partitionedGroupFluxOfInputOperations) {

        final PartitionScopeThresholds thresholds = partitionedGroupFluxOfInputOperations.key();

        final FluxProcessor<CosmosItemOperation, CosmosItemOperation> groupFluxProcessor =
            UnicastProcessor.<CosmosItemOperation>create().serialize();
        final FluxSink<CosmosItemOperation> groupSink = groupFluxProcessor.sink(FluxSink.OverflowStrategy.BUFFER);
        groupSinks.add(groupSink);

        AtomicLong firstRecordTimeStamp = new AtomicLong(-1);
        AtomicLong currentMicroBatchSize = new AtomicLong(0);
        AtomicInteger currentTotalSerializedLength = new AtomicInteger(0);

        return partitionedGroupFluxOfInputOperations
            .mergeWith(groupFluxProcessor)
            .onBackpressureBuffer()
            .timestamp()
            .subscribeOn(this.executionScheduler)
            .bufferUntil(timeStampItemOperationTuple -> {
                long timestamp = timeStampItemOperationTuple.getT1();
                CosmosItemOperation itemOperation = timeStampItemOperationTuple.getT2();

                logger.trace(
                    "BufferUntil - enqueued {}, {}, Context: {} {}",
                    timestamp,
                    getItemOperationDiagnostics(itemOperation),
                    this.operationContextText,
                    getThreadInfo());

                if (itemOperation == FlushBuffersItemOperation.singleton()) {
                    long currentMicroBatchSizeSnapshot = currentMicroBatchSize.get();
                    if (currentMicroBatchSizeSnapshot > 0) {
                        logger.trace(
                            "Flushing PKRange {} (batch size: {}) due to FlushItemOperation, Context: {} {}",
                            thresholds.getPartitionKeyRangeId(),
                            currentMicroBatchSizeSnapshot,
                            this.operationContextText,
                            getThreadInfo());

                        firstRecordTimeStamp.set(-1);
                        currentMicroBatchSize.set(0);
                        currentTotalSerializedLength.set(0);

                        return true;
                    }

                    // avoid counting flush operations for the micro batch size calculation
                    return false;
                }

                firstRecordTimeStamp.compareAndSet(-1, timestamp);
                long age = timestamp - firstRecordTimeStamp.get();
                long batchSize = currentMicroBatchSize.incrementAndGet();
                int totalSerializedLength = this.calculateTotalSerializedLength(currentTotalSerializedLength, itemOperation);

                if (batchSize >= thresholds.getTargetMicroBatchSizeSnapshot() ||
                    age >= this.maxMicroBatchIntervalInMs ||
                    totalSerializedLength >= this.maxMicroBatchPayloadSizeInBytes) {

                    logDebugOrWarning(
                        "BufferUntil - Flushing PKRange {} due to BatchSize ({}), payload size ({}) or age ({}), " +
                            "Triggering {}, Context: {} {}",
                        thresholds.getPartitionKeyRangeId(),
                        batchSize,
                        totalSerializedLength,
                        age,
                        getItemOperationDiagnostics(itemOperation),
                        this.operationContextText,
                        getThreadInfo());
                    firstRecordTimeStamp.set(-1);
                    currentMicroBatchSize.set(0);
                    currentTotalSerializedLength.set(0);
                    return true;
                }

                return false;
            })
            .flatMap(
                (List<Tuple2<Long, CosmosItemOperation>> timeStampAndItemOperationTuples) -> {
                    List<CosmosItemOperation> operations = new ArrayList<>(timeStampAndItemOperationTuples.size());
                    for (Tuple2<Long, CosmosItemOperation> timeStampAndItemOperationTuple :
                        timeStampAndItemOperationTuples) {

                        CosmosItemOperation itemOperation = timeStampAndItemOperationTuple.getT2();
                        if (itemOperation == FlushBuffersItemOperation.singleton()) {
                            continue;
                        }
                        operations.add(itemOperation);
                    }

                    logDebugOrWarning(
                        "Flushing PKRange {} micro batch with {} operations,  Context: {} {}",
                        thresholds.getPartitionKeyRangeId(),
                        operations.size(),
                        this.operationContextText,
                        getThreadInfo());

                    return executeOperations(operations, thresholds, groupSink);
                },
                this.cosmosBulkExecutionOptions.getMaxMicroBatchConcurrency());
    }

    private int calculateTotalSerializedLength(AtomicInteger currentTotalSerializedLength, CosmosItemOperation item) {
        if (item instanceof CosmosItemOperationBase) {
            return currentTotalSerializedLength.accumulateAndGet(
                ((CosmosItemOperationBase) item).getSerializedLength(this.effectiveItemSerializer),
                Integer::sum);
        }

        return currentTotalSerializedLength.get();
    }

    private Flux<CosmosBulkOperationResponse<TContext>> executeOperations(
        List<CosmosItemOperation> operations,
        PartitionScopeThresholds thresholds,
        FluxSink<CosmosItemOperation> groupSink) {

        if (operations.size() == 0) {
            logger.trace("Empty operations list, Context: {}", this.operationContextText);
            return Flux.empty();
        }

        String pkRange = thresholds.getPartitionKeyRangeId();
        ServerOperationBatchRequest serverOperationBatchRequest =
            BulkExecutorUtil.createBatchRequest(operations, pkRange, this.maxMicroBatchPayloadSizeInBytes, this.effectiveItemSerializer);
        if (serverOperationBatchRequest.getBatchPendingOperations().size() > 0) {
            serverOperationBatchRequest.getBatchPendingOperations().forEach(groupSink::next);
        }

        return Flux.just(serverOperationBatchRequest.getBatchRequest())
            .publishOn(this.executionScheduler)
            .flatMap((PartitionKeyRangeServerBatchRequest serverRequest) ->
                this.executePartitionKeyRangeServerBatchRequest(serverRequest, groupSink, thresholds));
    }

    private Flux<CosmosBulkOperationResponse<TContext>> executePartitionKeyRangeServerBatchRequest(
        PartitionKeyRangeServerBatchRequest serverRequest,
        FluxSink<CosmosItemOperation> groupSink,
        PartitionScopeThresholds thresholds) {

        String batchTrackingId = UUID.randomUUID().toString();
        logTraceOrWarning(
            "Executing batch of PKRangeId %s - batch TrackingId: %s",
            serverRequest.getPartitionKeyRangeId(),
            batchTrackingId);

        return this.executeBatchRequest(serverRequest)
            .subscribeOn(this.executionScheduler)
            .flatMapMany(response -> {

                logTraceOrWarning(
                    "Response for batch of PKRangeId %s - status code %s, ActivityId: %s, batch TrackingId %s",
                    serverRequest.getPartitionKeyRangeId(),
                    response.getStatusCode(),
                    response.getActivityId(),
                    batchTrackingId);
                if (diagnosticsTracker != null && response.getDiagnostics() != null) {
                    diagnosticsTracker.trackDiagnostics(response.getDiagnostics().getDiagnosticsContext());
                }

                return Flux
                    .fromIterable(response.getResults())
                    .publishOn(this.executionScheduler)
                    .flatMap((CosmosBatchOperationResult result) ->
                    handleTransactionalBatchOperationResult(response, result, groupSink, thresholds));
            })
            .onErrorResume((Throwable throwable) -> {

                if (!(throwable instanceof Exception)) {
                    throw Exceptions.propagate(throwable);
                }

                Exception exception = (Exception) throwable;

                return Flux
                    .fromIterable(serverRequest.getOperations())
                    .publishOn(this.executionScheduler)
                    .flatMap((CosmosItemOperation itemOperation) ->
                        handleTransactionalBatchExecutionException(itemOperation, exception, groupSink, thresholds));
            });
    }

    // Helper functions
    private Mono<CosmosBulkOperationResponse<TContext>> handleTransactionalBatchOperationResult(
        CosmosBatchResponse response,
        CosmosBatchOperationResult operationResult,
        FluxSink<CosmosItemOperation> groupSink,
        PartitionScopeThresholds thresholds) {

        CosmosBulkItemResponse cosmosBulkItemResponse = ModelBridgeInternal
            .createCosmosBulkItemResponse(operationResult, response);
        CosmosItemOperation itemOperation = operationResult.getOperation();
        TContext actualContext = this.getActualContext(itemOperation);

        logDebugOrWarning(
            "HandleTransactionalBatchOperationResult - PKRange {}, Response Status Code {}, " +
                "Operation Status Code, {}, {}, Context: {} {}",
            thresholds.getPartitionKeyRangeId(),
            response.getStatusCode(),
            operationResult.getStatusCode(),
            getItemOperationDiagnostics(itemOperation),
            this.operationContextText,
            getThreadInfo());

        if (!operationResult.isSuccessStatusCode()) {

            if (itemOperation instanceof ItemBulkOperation) {

                ItemBulkOperation<?, ?> itemBulkOperation = (ItemBulkOperation<?, ?>) itemOperation;
                return itemBulkOperation.getRetryPolicy().shouldRetry(operationResult).flatMap(
                    result -> {
                        if (result.shouldRetry) {
                            logDebugOrWarning(
                                "HandleTransactionalBatchOperationResult - enqueue retry, PKRange {}, Response " +
                                    "Status Code {}, Operation Status Code, {}, {}, Context: {} {}",
                                thresholds.getPartitionKeyRangeId(),
                                response.getStatusCode(),
                                operationResult.getStatusCode(),
                                getItemOperationDiagnostics(itemOperation),
                                this.operationContextText,
                                getThreadInfo());
                            return this.enqueueForRetry(result.backOffTime, groupSink, itemOperation, thresholds);
                        } else {
                            // reduce log noise level for commonly expected/normal status codes
                            if (response.getStatusCode() == HttpConstants.StatusCodes.CONFLICT ||
                                response.getStatusCode() == HttpConstants.StatusCodes.PRECONDITION_FAILED) {

                                logDebugOrWarning(
                                    "HandleTransactionalBatchOperationResult - Fail, PKRange {}, Response Status " +
                                        "Code {}, Operation Status Code {}, {}, Context: {} {}",
                                    thresholds.getPartitionKeyRangeId(),
                                    response.getStatusCode(),
                                    operationResult.getStatusCode(),
                                    getItemOperationDiagnostics(itemOperation),
                                    this.operationContextText,
                                    getThreadInfo());
                            } else {
                                logger.error(
                                    "HandleTransactionalBatchOperationResult - Fail, PKRange {}, Response Status " +
                                        "Code {}, Operation Status Code {}, {}, Context: {} {}",
                                    thresholds.getPartitionKeyRangeId(),
                                    response.getStatusCode(),
                                    operationResult.getStatusCode(),
                                    getItemOperationDiagnostics(itemOperation),
                                    this.operationContextText,
                                    getThreadInfo());
                            }
                            return Mono.just(ModelBridgeInternal.createCosmosBulkOperationResponse(
                                itemOperation, cosmosBulkItemResponse, actualContext));
                        }
                    });

            } else {
                throw new UnsupportedOperationException("Unknown CosmosItemOperation.");
            }
        }

        thresholds.recordSuccessfulOperation();
        return Mono.just(ModelBridgeInternal.createCosmosBulkOperationResponse(
            itemOperation,
            cosmosBulkItemResponse,
            actualContext));
    }

    @SuppressWarnings("unchecked")
    private TContext getActualContext(CosmosItemOperation itemOperation) {
        ItemBulkOperation<?, ?> itemBulkOperation = null;

        if (itemOperation instanceof ItemBulkOperation) {
            itemBulkOperation = (ItemBulkOperation<?, ?>) itemOperation;
        }

        if (itemBulkOperation == null) {
            return this.batchContext;
        }

        TContext operationContext = (TContext) itemBulkOperation.getContext();
        if (operationContext != null) {
            return operationContext;
        }

        return this.batchContext;
    }

    private Mono<CosmosBulkOperationResponse<TContext>> handleTransactionalBatchExecutionException(
        CosmosItemOperation itemOperation,
        Exception exception,
        FluxSink<CosmosItemOperation> groupSink,
        PartitionScopeThresholds thresholds) {

        logDebugOrWarning(
            "HandleTransactionalBatchExecutionException, PKRange {}, Error: {}, {}, Context: {} {}",
            thresholds.getPartitionKeyRangeId(),
            exception,
            getItemOperationDiagnostics(itemOperation),
            this.operationContextText,
            getThreadInfo());

        if (exception instanceof CosmosException && itemOperation instanceof ItemBulkOperation) {
            CosmosException cosmosException = (CosmosException) exception;
            ItemBulkOperation<?, ?> itemBulkOperation = (ItemBulkOperation<?, ?>) itemOperation;

            // First check whether the request failed due to a split, in which case the operations need to go into a
            // different pk range group, so add them back into the mainSink.

            return itemBulkOperation.getRetryPolicy()
                .shouldRetryForGone(cosmosException.getStatusCode(), cosmosException.getSubStatusCode(), itemBulkOperation, cosmosException)
                .flatMap(shouldRetryGone -> {
                    if (shouldRetryGone) {
                        logDebugOrWarning(
                            "HandleTransactionalBatchExecutionException - Retry due to split, PKRange {}, Error: " +
                                "{}, {}, Context: {} {}",
                            thresholds.getPartitionKeyRangeId(),
                            exception,
                            getItemOperationDiagnostics(itemOperation),
                            this.operationContextText,
                            getThreadInfo());
                        // retry - but don't mark as enqueued for retry in thresholds
                        mainSink.emitNext(itemOperation, serializedEmitFailureHandler);
                        return Mono.empty();
                    } else {
                        logDebugOrWarning(
                            "HandleTransactionalBatchExecutionException - Retry other, PKRange {}, Error: " +
                                "{}, {}, Context: {} {}",
                            thresholds.getPartitionKeyRangeId(),
                            exception,
                            getItemOperationDiagnostics(itemOperation),
                            this.operationContextText,
                            getThreadInfo());
                        return retryOtherExceptions(
                            itemOperation,
                            exception,
                            groupSink,
                            cosmosException,
                            itemBulkOperation,
                            thresholds);
                    }
                });
        }

        TContext actualContext = this.getActualContext(itemOperation);
        return Mono.just(ModelBridgeInternal.createCosmosBulkOperationResponse(itemOperation, exception, actualContext));
    }

    private Mono<CosmosBulkOperationResponse<TContext>> enqueueForRetry(
        Duration backOffTime,
        FluxSink<CosmosItemOperation> groupSink,
        CosmosItemOperation itemOperation,
        PartitionScopeThresholds thresholds) {

        thresholds.recordEnqueuedRetry();
        if (backOffTime == null || backOffTime.isZero()) {
            groupSink.next(itemOperation);
            return Mono.empty();
        } else {
            return Mono
                .delay(backOffTime)
                .flatMap((dummy) -> {
                    groupSink.next(itemOperation);
                    return Mono.empty();
                });
        }
    }

    private Mono<CosmosBulkOperationResponse<TContext>> retryOtherExceptions(
        CosmosItemOperation itemOperation,
        Exception exception,
        FluxSink<CosmosItemOperation> groupSink,
        CosmosException cosmosException,
        ItemBulkOperation<?, ?> itemBulkOperation,
        PartitionScopeThresholds thresholds) {

        TContext actualContext = this.getActualContext(itemOperation);
        return itemBulkOperation.getRetryPolicy().shouldRetry(cosmosException).flatMap(result -> {
            if (result.shouldRetry) {
                return this.enqueueForRetry(result.backOffTime, groupSink, itemBulkOperation, thresholds);
            } else {
                return Mono.just(ModelBridgeInternal.createCosmosBulkOperationResponse(
                    itemOperation, exception, actualContext));
            }
        });
    }

    private Mono<CosmosBatchResponse> executeBatchRequest(PartitionKeyRangeServerBatchRequest serverRequest) {
        RequestOptions options = new RequestOptions();
        options.setThroughputControlGroupName(cosmosBulkExecutionOptions.getThroughputControlGroupName());
        options.setExcludedRegions(cosmosBulkExecutionOptions.getExcludedRegions());
        options.setKeywordIdentifiers(cosmosBulkExecutionOptions.getKeywordIdentifiers());

        CosmosEndToEndOperationLatencyPolicyConfig e2eLatencyPolicySnapshot =
            cosmosBulkExecutionOptions.getCosmosEndToEndLatencyPolicyConfig();
        if (e2eLatencyPolicySnapshot != null) {
            options.setCosmosEndToEndLatencyPolicyConfig(e2eLatencyPolicySnapshot);
        }

        // This logic handles custom bulk options (headers) which can be passed in through the encryption
        // package or some other project layered on top of the SDK.
        Map<String, String> customOptions = cosmosBulkExecutionOptions.getHeaders();
        if (customOptions != null && !customOptions.isEmpty()) {
            for (Map.Entry<String, String> entry : customOptions.entrySet()) {
                options.setHeader(entry.getKey(), entry.getValue());
            }
        }
        options.setOperationContextAndListenerTuple(operationListener);

        // The request options here are used for the BulkRequest exchanged with the service.
        // If contentResponseOnWrite is not enabled here (or at the client level) the
        // service will not even send a bulk response payload - so all the
        // CosmosBulkItemRequestOptions are irrelevant and all payloads will be null.
        // Instead we automatically enforce contentResponseOnWrite for the whole
        // bulk request whenever at least one of the item operations requires a content response (either
        // because it is a read operation or because contentResponseOnWrite was enabled explicitly).
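        // Example (illustrative): a micro batch containing 99 create operations and a single read operation
        // will have contentResponseOnWrite enabled for the whole server request, because the read requires
        // a response payload.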
        if (!this.docClientWrapper.isContentResponseOnWriteEnabled() &&
            serverRequest.getOperations().size() > 0) {

            for (CosmosItemOperation itemOperation : serverRequest.getOperations()) {
                if (itemOperation instanceof ItemBulkOperation) {

                    ItemBulkOperation itemBulkOperation = (ItemBulkOperation) itemOperation;
                    if (itemBulkOperation.getOperationType() == CosmosItemOperationType.READ ||
                        (itemBulkOperation.getRequestOptions() != null &&
                            itemBulkOperation.getRequestOptions().isContentResponseOnWriteEnabled() != null &&
                            itemBulkOperation.getRequestOptions().isContentResponseOnWriteEnabled())) {

                        options.setContentResponseOnWriteEnabled(true);
                        break;
                    }
                }
            }
        }

        return withContext(context -> {
            final Mono<CosmosBatchResponse> responseMono = this.docClientWrapper.executeBatchRequest(
                BridgeInternal.getLink(this.container), serverRequest, options, false);

            return clientAccessor.getDiagnosticsProvider(this.cosmosClient)
                .traceEnabledBatchResponsePublisher(
                    responseMono,
                    context,
                    this.bulkSpanName,
                    this.container.getDatabase().getId(),
                    this.container.getId(),
                    this.cosmosClient,
                    options.getConsistencyLevel(),
                    OperationType.Batch,
                    ResourceType.Document,
                    options,
                    this.cosmosBulkExecutionOptions.getMaxMicroBatchSize());
        });
    }

    private void completeAllSinks() {
        logInfoOrWarning("Closing all sinks, Context: {}", this.operationContextText);

        logger.debug("Executor service shut down, Context: {}", this.operationContextText);
        mainSink.emitComplete(serializedCompleteEmitFailureHandler);

        this.shutdown();
    }

    private void onFlush() {
        try {
            this.groupSinks.forEach(sink -> sink.next(FlushBuffersItemOperation.singleton()));
        } catch(Throwable t) {
            logger.error("Callback invocation 'onFlush' failed. Context: {}", this.operationContextText,  t);
        }
    }

    private static String getItemOperationDiagnostics(CosmosItemOperation operation) {

        if (operation == FlushBuffersItemOperation.singleton()) {
            return "ItemOperation[Type: Flush]";
        }

        return "ItemOperation[Type: "
            + operation.getOperationType().toString()
            + ", PK: "
            + (operation.getPartitionKeyValue() != null ? operation.getPartitionKeyValue().toString() : "n/a")
            + ", id: "
            + operation.getId()
            + "]";
    }

    private static String getThreadInfo() {
        StringBuilder sb = new StringBuilder();
        Thread t = Thread.currentThread();
        sb
            .append("Thread[")
            .append("Name: ")
            .append(t.getName())
            .append(",Group: ")
            .append(t.getThreadGroup() != null ? t.getThreadGroup().getName() : "n/a")
            .append(", isDaemon: ")
            .append(t.isDaemon())
            .append(", Id: ")
            .append(t.getId())
            .append("]");

        return sb.toString();
    }

    private static class SerializedEmitFailureHandler implements Sinks.EmitFailureHandler {

        @Override
        public boolean onEmitFailure(SignalType signalType, Sinks.EmitResult emitResult) {
            if (emitResult.equals(Sinks.EmitResult.FAIL_NON_SERIALIZED)) {
                logger.debug("SerializedEmitFailureHandler.onEmitFailure - Signal:{}, Result: {}", signalType, emitResult);

                return true;
            }

            logger.error("SerializedEmitFailureHandler.onEmitFailure - Signal:{}, Result: {}", signalType, emitResult);
            return false;
        }
    }

    private static class SerializedCompleteEmitFailureHandler implements Sinks.EmitFailureHandler {

        @Override
        public boolean onEmitFailure(SignalType signalType, Sinks.EmitResult emitResult) {
            if (emitResult.equals(Sinks.EmitResult.FAIL_NON_SERIALIZED)) {
                logger.debug("SerializedCompleteEmitFailureHandler.onEmitFailure - Signal:{}, Result: {}", signalType, emitResult);

                return true;
            }

            if (emitResult == Sinks.EmitResult.FAIL_CANCELLED || emitResult == Sinks.EmitResult.FAIL_TERMINATED) {
                logger.debug("SerializedCompleteEmitFailureHandler.onEmitFailure - Main sink already completed, Signal:{}, Result: {}", signalType, emitResult);
                return false;
            }

            logger.error("SerializedCompleteEmitFailureHandler.onEmitFailure - Signal:{}, Result: {}", signalType, emitResult);
            return false;
        }
    }
}