/*
 * The MIT License (MIT)
 * Copyright (c) 2017 Microsoft Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
package com.microsoft.azure.documentdb.bulkexecutor;

import java.nio.charset.Charset;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import com.microsoft.azure.documentdb.internal.routing.PartitionKeyInternalHelper;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.util.concurrent.AsyncCallable;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.Futures.FutureCombiner;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import com.microsoft.azure.documentdb.Document;
import com.microsoft.azure.documentdb.DocumentClient;
import com.microsoft.azure.documentdb.DocumentClientException;
import com.microsoft.azure.documentdb.DocumentCollection;
import com.microsoft.azure.documentdb.Error;
import com.microsoft.azure.documentdb.FeedOptions;
import com.microsoft.azure.documentdb.FeedResponse;
import com.microsoft.azure.documentdb.PartitionKeyDefinition;
import com.microsoft.azure.documentdb.PartitionKeyRange;
import com.microsoft.azure.documentdb.RequestOptions;
import com.microsoft.azure.documentdb.RetryOptions;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BatchDeleter;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BatchInserter;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BatchUpdater;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BulkDeleteQuerySpec;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BulkImportStoredProcedureOptions;
import com.microsoft.azure.documentdb.bulkexecutor.internal.CongestionController;
import com.microsoft.azure.documentdb.bulkexecutor.internal.DocumentAnalyzer;
import com.microsoft.azure.documentdb.bulkexecutor.internal.ExceptionUtils;
import com.microsoft.azure.documentdb.internal.HttpConstants;
import com.microsoft.azure.documentdb.internal.routing.CollectionRoutingMap;
import com.microsoft.azure.documentdb.internal.routing.InMemoryCollectionRoutingMap;
import com.microsoft.azure.documentdb.internal.routing.PartitionKeyInternal;
import com.microsoft.azure.documentdb.internal.routing.Range;

public class DocumentBulkExecutor implements AutoCloseable {

    public static class Builder {

        private DocumentClient client;
        private String collectionLink;
        private int maxMiniBatchSize = (int) Math.floor(MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE * FRACTION_OF_MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE_ALLOWED);
        private int maxUpdateMiniBatchCount = 500;
        private final static int DEFAULT_RETRY_ATTEMPT_ON_THROTTLING_FOR_INIT = 200;
        private final static int DEFAULT_WAIT_TIME_ON_THROTTLING_FOR_INIT_IN_SECONDS = 60;

        private PartitionKeyDefinition partitionKeyDef;
        private int offerThroughput;

        private static RetryOptions DEFAULT_INIT_RETRY_OPTIONS;

        static {
            DEFAULT_INIT_RETRY_OPTIONS = new RetryOptions();
            DEFAULT_INIT_RETRY_OPTIONS.setMaxRetryAttemptsOnThrottledRequests(DEFAULT_RETRY_ATTEMPT_ON_THROTTLING_FOR_INIT);
            DEFAULT_INIT_RETRY_OPTIONS.setMaxRetryWaitTimeInSeconds(DEFAULT_WAIT_TIME_ON_THROTTLING_FOR_INIT_IN_SECONDS);
        }

        private RetryOptions retryOptions = DEFAULT_INIT_RETRY_OPTIONS;

        /**
         * Use the given {@link DocumentClient} to perform bulk operations against the target {@link DocumentCollection} at the specified allocated throughput.
         * @param client an instance of {@link DocumentClient}
         * @param databaseName name of the database
         * @param collectionName name of the collection
         * @param partitionKeyDef specifies the {@link PartitionKeyDefinition} of the collection
         * @param offerThroughput specifies the throughput allocated for bulk operations out of the collection's total throughput
         * @return an instance of {@link Builder}
         */
        public Builder from(DocumentClient client,
                String databaseName, 
                String collectionName,
                PartitionKeyDefinition partitionKeyDef,
                int offerThroughput) {

            // TODO: validate the retry options for the client
            this.client = client;
            this.collectionLink = String.format("/dbs/%s/colls/%s", databaseName, collectionName);
            this.partitionKeyDef = partitionKeyDef;
            this.offerThroughput = offerThroughput;

            return this;
        }

        /**
         * Use the given size to configure the max mini-batch size (specific to the bulk import API).
         *
         * If not specified, the default value of 220201 bytes (20% of the maximum sproc payload size) is used.
         * @param size specifies the size of a mini-batch used in the bulk import API.
         * @return {@link Builder}
         */
        public Builder withMaxMiniBatchSize(int size) {
            Preconditions.checkArgument(size > 0, "maxMiniBatchSize must be positive");
            Preconditions.checkArgument(size <= MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE, "maxMiniBatchSize cannot exceed MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE");

            this.maxMiniBatchSize = size;
            return this;
        }

        /**
         * Use the given count to configure the max update mini-batch count (specific to the bulk update API).
         *
         * If not specified, the default value of 500 is used.
         * @param count specifies the maximum number of update items in a mini-batch used in the bulk update API.
         * @return {@link Builder}
         */
        public Builder withMaxUpdateMiniBatchCount(int count) {
            Preconditions.checkArgument(count > 0, "maxUpdateMiniBatchCount must be positive");

            this.maxUpdateMiniBatchCount = count;
            return this;
        }

        /**
         * Use the given retry options to apply to {@link DocumentClient} used in initialization of {@link DocumentBulkExecutor}.
         * 
         * @param options an instance of {@link RetryOptions}
         * @return {@link Builder}
         */
        public Builder withInitializationRetryOptions(RetryOptions options) {
            this.retryOptions = options;
            return this;
        }

        /**
         * Instantiates {@link DocumentBulkExecutor} given the configured {@link Builder}.
         *
         * @return the newly instantiated instance of {@link DocumentBulkExecutor}
         * @throws Exception if there is any failure
         */
        public DocumentBulkExecutor build() throws Exception {
            DocumentBulkExecutor executor = new DocumentBulkExecutor(client, collectionLink, partitionKeyDef, offerThroughput);
            try {
                executor.setInitializationRetryOptions(retryOptions);
                executor.setMaxMiniBatchSize(maxMiniBatchSize);
                executor.setMaxUpdateMiniBatchCount(maxUpdateMiniBatchCount);

                executor.safeInit();
            } catch (Exception e) {
                executor.close();
                throw e;
            }
            return executor;
        }

        private Builder() {}
    }

    /**
     * Creates a new {@link DocumentBulkExecutor.Builder} instance
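     * <p>
     * A minimal usage sketch (illustrative; {@code HOST}, {@code MASTER_KEY}, the database and
     * collection names, and the throughput value are placeholders, not part of this class):
     * <pre>
     * {@code
     * ConnectionPolicy connectionPolicy = new ConnectionPolicy();
     * DocumentClient client = new DocumentClient(HOST, MASTER_KEY, connectionPolicy, null);
     * DocumentCollection collection = client.readCollection("/dbs/mydb/colls/mycol", null).getResource();
     * DocumentBulkExecutor executor = DocumentBulkExecutor.builder()
     *     .from(client, "mydb", "mycol", collection.getPartitionKey(), 20000)
     *     .build();
     * }
     * </pre>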
     * @return an instance of {@link DocumentBulkExecutor.Builder}
     */
    public static DocumentBulkExecutor.Builder builder() {
        return new DocumentBulkExecutor.Builder();
    }

    /**
     * The name of the system stored procedure for bulk import.
     */
    private final static String BULK_IMPORT_STORED_PROCECURE_NAME = "__.sys.commonBulkInsert";

    /**
     * The name of the stored procedure for bulk update.
     */
    private final static String BULK_UPDATE_STORED_PROCECURE_NAME = "__.sys.bulkPatch";
    
    /**
     * The name of the stored procedure for bulk delete.
     */
    private final static String BULK_DELETE_STORED_PROCECURE_NAME = "__.sys.commonDelete";

    /**
     * The maximal sproc payload size sent (as a fraction of 2MB).
     */
    private final static int MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE = (2202010 * 5) / 10;

    /**
     * The fraction of the maximum sproc payload size up to which documents are allowed to fill a mini-batch.
     */
    private final static double FRACTION_OF_MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE_ALLOWED = 0.20;
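
    // Note: with the two constants above, the Builder's default maxMiniBatchSize evaluates to
    // (int) Math.floor(1101005 * 0.20) = 220201 bytes (roughly 220 KB).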

    /**
     * Initialization sleep time (in milliseconds) applied when an initialization request is throttled.
     */
    private final static int INITIALIZATION_SLEEP_TIME_ON_THROTTLING = 500;

    /**
     * The default max batch size for bulk delete operations
     */
    private final static int DEFAULT_BULK_DELETE_BATCH_SIZE = 1000;

    /**
     * Default sleep time in milliseconds to wait prior to re-initializing the BulkExecutor and retrying previously failed batch(es).
     */
    private final static int SLEEP_TIME_FOR_RETRY_POST_SPLIT_IN_MILLIS = 65 * 1000;

    /**
     * Maximum number of retries when split related failures are encountered
     */
    private final static int MAX_RETRIES_ON_SPLIT_FAILURES = 10;

    /**
     * Logger
     */
    private static final Logger logger = LoggerFactory.getLogger(DocumentBulkExecutor.class);

    /**
     * Degree of parallelism for each partition which was inferred from previous batch execution.
     */
    private final Map<String, Integer> partitionKeyRangeIdToInferredDegreeOfParallelism = new ConcurrentHashMap<>();
    
    /**
     * Regex pattern for SQL query used to bulk delete documents
     */
    private final static String SQL_QUERY_REGEX_PATTERN = "(?i)select\\s+\\*\\s+(?i)from\\s+(?<root>c)\\s+(?i)where(?:\\s+(?<filter>.+))?";
    
    private final static Pattern BULK_DELETE_QUERY_SPEC_PATTERN = Pattern.compile(SQL_QUERY_REGEX_PATTERN);
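
    // Example of a bulk-delete query accepted by the pattern above (illustrative filter):
    //   SELECT * FROM c WHERE c.city = 'Seattle'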
    
    /**
     * Executor Service
     */
    private final ListeningExecutorService listeningExecutorService;

    /**
     * The DocumentDB client instance.
     */
    private final DocumentClient client;

    /**
     * The link to the document collection to which documents are to be bulk imported.
     */
    private final String collectionLink;

    /**
     * Partition Key Definition of the underlying collection.
     */
    private final PartitionKeyDefinition partitionKeyDefinition;

    /**
     * Partition Key Range Ids
     */
    private List<String> partitionKeyRangeIds;

    /**
     * Collection routing map used to retrieve partition key range Ids of a given collection
     */
    private CollectionRoutingMap collectionRoutingMap;

    /**
     * Bulk Import Stored Procedure Link relevant to the given collection
     */
    private String bulkImportStoredProcLink;

    /**
     * Bulk Update Stored Procedure Link relevant to the given collection
     */
    private String bulkUpdateStoredProcLink;
    
    /**
     * Bulk Delete Stored Procedure Link relevant to the given collection
     */
    private String bulkDeleteStoredProcLink;

    /**
     * Collection offer throughput
     */
    private int collectionThroughput;

    /**
     * Max Mini Batch Size
     */
    private int maxMiniBatchSize;

    /**
     * Max Update Mini Batch Count
     */
    private int maxUpdateMiniBatchCount;

    private RetryOptions retryOptions;

    private void setMaxMiniBatchSize(int size) {
        this.maxMiniBatchSize = size;
    }

    private void setMaxUpdateMiniBatchCount(int count) {
        this.maxUpdateMiniBatchCount = count;
    }

    private void setInitializationRetryOptions(RetryOptions options) {
        this.retryOptions = options;
    }

    /**
     * Initializes a new instance of {@link DocumentBulkExecutor}
     *
     * @param client {@link DocumentClient} instance to use
     * @param collectionLink specifies the link to the target Azure Cosmos DB collection
     * @param partitionKeyDefinition specifies the {@link PartitionKeyDefinition} of the collection
     * @param collectionOfferThroughput specifies the throughput allocated for bulk operations out of the collection's total throughput
     */
    private DocumentBulkExecutor(DocumentClient client, 
            String collectionLink,
            PartitionKeyDefinition partitionKeyDefinition,
            int collectionOfferThroughput) {
        Preconditions.checkNotNull(client, "client cannot be null");
        Preconditions.checkNotNull(partitionKeyDefinition, "partitionKeyDefinition cannot be null");
        Preconditions.checkNotNull(collectionLink, "collectionLink cannot be null");
        Preconditions.checkArgument(collectionOfferThroughput > 0, "collectionOfferThroughput must be positive");

        this.client = client;
        this.collectionLink = collectionLink;
        this.collectionThroughput =  collectionOfferThroughput;
        this.partitionKeyDefinition = partitionKeyDefinition;
        this.listeningExecutorService = MoreExecutors.listeningDecorator(Executors.newCachedThreadPool());
    }

    private void safeInit() throws Exception {
        int count = 0;
        long startTime = System.currentTimeMillis();
        while(true) {
            try {
                initialize();
                break;
            } catch (Exception e) {
                count++;
                DocumentClientException dce = ExceptionUtils.getThrottelingException(e);
                long now = System.currentTimeMillis();
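                // Retry only while within the configured attempt and time budget, and only when the
                // failure is a throttling response (HTTP 429); back off by the server-supplied
                // retry-after hint scaled by the attempt count, plus a fixed initialization delay.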
                if (count < retryOptions.getMaxRetryAttemptsOnThrottledRequests() 
                        && now - startTime < (retryOptions.getMaxRetryWaitTimeInSeconds() * 1000)
                        && dce != null
                        && dce.getStatusCode() == HttpConstants.StatusCodes.TOO_MANY_REQUESTS ) {
                    Thread.sleep(count * dce.getRetryAfterInMilliseconds() + INITIALIZATION_SLEEP_TIME_ON_THROTTLING);
                    continue;
                } else {
                    throw e;
                }
            }
        }
    }

    /**
     * Releases any internal resources.
     * It is the responsibility of the caller to close the {@link DocumentClient}.
     */
    @Override
    public void close() {
        // disable submission of new tasks
        listeningExecutorService.shutdown();
        try {
            // wait for existing tasks to terminate
            if (!listeningExecutorService.awaitTermination(60, TimeUnit.SECONDS)) {
                // cancel any currently running executing tasks
                listeningExecutorService.shutdownNow();
                // wait for cancelled tasks to terminate
                if (!listeningExecutorService.awaitTermination(60, TimeUnit.SECONDS)) {
                    logger.error("some tasks did not terminate");
                }
            }
        } catch (InterruptedException e) {
            listeningExecutorService.shutdownNow();
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Initializes {@link DocumentBulkExecutor}. This happens only once.
     * @throws DocumentClientException if the collection or its partition key ranges cannot be read
     */
    private void initialize() throws DocumentClientException {
        logger.debug("Initializing ...");

        this.bulkImportStoredProcLink = String.format("%s/sprocs/%s", collectionLink, BULK_IMPORT_STORED_PROCECURE_NAME);
        this.bulkUpdateStoredProcLink = String.format("%s/sprocs/%s", collectionLink, BULK_UPDATE_STORED_PROCECURE_NAME);
        this.bulkDeleteStoredProcLink = String.format("%s/sprocs/%s", collectionLink, BULK_DELETE_STORED_PROCECURE_NAME);
        
        logger.debug("Fetching partition map of collection");
        Range<String> fullRange = new Range<String>(
                PartitionKeyInternalHelper.MinimumInclusiveEffectivePartitionKey,
                PartitionKeyInternalHelper.MaximumExclusiveEffectivePartitionKey,
                true,
                false);

        // this assumes the database and collection already exist
        try {
            client.readCollection(collectionLink, null).getResource();
        } catch (DocumentClientException ex) {
            if (ex.getStatusCode() == 404) {
                logger.error("Unable to read resource for collection link " + collectionLink);
            }

            throw ex;
        }

        this.collectionRoutingMap = getCollectionRoutingMap(client, this.collectionLink);
        Collection<PartitionKeyRange> partitionKeyRanges = this.collectionRoutingMap.getOverlappingRanges(fullRange);

        this.partitionKeyRangeIds = partitionKeyRanges.stream().map(partitionKeyRange -> partitionKeyRange.getId()).collect(Collectors.toList());
        logger.debug("Initialization completed");
    }

    /**
     * Executes a bulk import in the Azure Cosmos DB database service.
     * 
     * <pre>
     * {@code
     * ConnectionPolicy connectionPolicy = new ConnectionPolicy();
     * RetryOptions retryOptions = new RetryOptions();
     * 
     * // Set client's retry options high for initialization
     * retryOptions.setMaxRetryWaitTimeInSeconds(120);
     * retryOptions.setMaxRetryAttemptsOnThrottledRequests(100);
     * connectionPolicy.setRetryOptions(retryOptions);
     * connectionPolicy.setMaxPoolSize(1000);
     *
     * DocumentClient client = new DocumentClient(HOST, MASTER_KEY, connectionPolicy, null);
     *
     * String collectionLink = String.format("/dbs/%s/colls/%s", "mydb", "mycol");
     * DocumentCollection collection = client.readCollection(collectionLink, null).getResource();
     *
     * DocumentBulkExecutor executor = DocumentBulkExecutor.builder().from(client, "mydb", "mycol",
     *     collection.getPartitionKey(), collectionOfferThroughput).build();
     *
     * // Set retries to 0 to pass control to bulk executor
     * client.getConnectionPolicy().getRetryOptions().setMaxRetryWaitTimeInSeconds(0);
     * client.getConnectionPolicy().getRetryOptions().setMaxRetryAttemptsOnThrottledRequests(0);
     * 
     * for(int i = 0; i < 10; i++) {
     *   List<String> documents = documentSource.getMoreDocuments();
     *
     *   BulkImportResponse bulkImportResponse = executor.importAll(documents, false, true, 40);
     *
     *   // Validate that all documents inserted to ensure no failure.
     *   if (bulkImportResponse.getNumberOfDocumentsImported() < documents.size()) {
     *      for(Exception e: bulkImportResponse.getErrors()) {
     *          // Validate why there were some failures.
     *          e.printStackTrace();
     *      }
     *      break;
     *   }
     * }
     *
     * executor.close();
     * client.close();
     * }
     * 
     * </pre>
* * @param documents specifies the collection of JSON-serialized documents to import * @param isUpsert indicates whether a document in the supplied collection needs to be overwritten if the id already exists * @param disableAutomaticIdGeneration indicates whether the id has to be automatically generated for a document if absent in the supplied collection * @param maxConcurrencyPerPartitionRange specifies the maximum degree of concurrency per partition key range (default value is 20 if set to null) * @return an instance of {@link BulkImportResponse} * @throws DocumentClientException if any failure happens */ public BulkImportResponse importAll(Collection documents, boolean isUpsert, boolean disableAutomaticIdGeneration, Integer maxConcurrencyPerPartitionRange) throws DocumentClientException { return executeBulkImportInternal(documents, isUpsert, disableAutomaticIdGeneration, maxConcurrencyPerPartitionRange, 0, 0, 0); } /** * @param documents specifies the collection of JSON-serialized documents to import * @param isUpsert indicates whether a document in the supplied collection needs to be overwritten if the id already exists * @param disableAutomaticIdGeneration indicates whether the id has to be automatically generated for a document if absent in the supplied collection * @param maxConcurrencyPerPartitionRange specifies the maximum degree of concurrency per partition key range (default value is 20 if set to null) * @param effectiveMaxMiniBatchImportSize specifies the maximum size of mini batch imports * @param numberOfParallelTasks specifies the number of parallel ingestion tasks * @param writeThroughputBudgetPerCosmosPartition specifies the user provided RU Budget that is split for each cosmos b physical partition * @return an instance of {@link BulkImportResponse} * @throws DocumentClientException if any failure happens */ public BulkImportResponse importAll(Collection documents, boolean isUpsert, boolean disableAutomaticIdGeneration, Integer maxConcurrencyPerPartitionRange, Integer effectiveMaxMiniBatchImportSize, Integer numberOfParallelTasks, Integer writeThroughputBudgetPerCosmosPartition) throws DocumentClientException { return executeBulkImportInternal(documents, isUpsert, disableAutomaticIdGeneration, maxConcurrencyPerPartitionRange, effectiveMaxMiniBatchImportSize, numberOfParallelTasks, writeThroughputBudgetPerCosmosPartition); } /** * Executes a bulk update in the Azure Cosmos DB database service. * * @param updateItems specifies the collection of update items each of which comprises the list of field update operations to be performed * on a document identified by an id and partition key value. * @param maxConcurrencyPerPartitionRange specifies the maximum degree of concurrency per partition key range (default value is 20 if set to null) * @return an instance of {@link BulkUpdateResponse} * @throws DocumentClientException if any failure happens */ public BulkUpdateResponse updateAll(Collection updateItems, Integer maxConcurrencyPerPartitionRange) throws DocumentClientException { return executeBulkUpdateInternal(updateItems, maxConcurrencyPerPartitionRange); } /** * Executes a bulk update in the Azure Cosmos DB database service with given set of patch documents. 
* * @param patchDocuments which are documents comprising id, partition key values and fields to set with the corresponding values * @param maxConcurrencyPerPartitionRange specifies the maximum degree of concurrency per partition key range (default value is 20 if set to null) * @return an instance of {@link BulkUpdateResponse} * @throws DocumentClientException if any failure happens */ public BulkUpdateResponse mergeAll(Collection patchDocuments, Integer maxConcurrencyPerPartitionRange) throws DocumentClientException { return executeBulkUpdateWithPatchInternal(patchDocuments, maxConcurrencyPerPartitionRange); } /** * Executes a bulk delete in the Azure Cosmos DB database service. * * @param pkIdPairsToDelete List of pairs of partition key and id values of documents to delete * @return an instance of {@link BulkDeleteResponse} * @throws DocumentClientException if any failure happens */ public BulkDeleteResponse deleteAll(List> pkIdPairsToDelete) throws DocumentClientException { return executeBulkDeleteInternalPkRowKeys(pkIdPairsToDelete); } @SuppressWarnings("unused") private BulkUpdateResponse updateDocument(String partitionKey, String id, List updateOperations) throws DocumentClientException { return executeUpdateDocumentInternal(partitionKey, id, updateOperations); } private BulkImportResponse executeBulkImportInternal(Collection input, boolean isUpsert, boolean disableAutomaticIdGeneration, Integer maxConcurrencyPerPartitionRange, Integer effectiveMaxMiniBatchImportSize, Integer numberOfParallelTasks, Integer writeThroughputBudgetPerCosmosPartition) throws DocumentClientException { Preconditions.checkNotNull(input, "document collection cannot be null"); try { Collection documentsToInsertOrRetry = new ArrayList<>(input); Collection documentsFailedToImportDueToSplits; List failedImports = new ArrayList<>(); int numRetriesDueToSplits = 0; List failures = new ArrayList<>(); List badInputDocuments = new ArrayList(); int numberOfDocumentsImported = 0; double totalRequestUnitsConsumed = 0; Duration timeTakenForInserts = Duration.ofSeconds(0); do { documentsFailedToImportDueToSplits = new ArrayList(); BulkImportResponse eachInsertOrRetryResponse = executeBulkImportAsyncImpl( documentsToInsertOrRetry, documentsFailedToImportDueToSplits, failedImports, isUpsert, disableAutomaticIdGeneration, maxConcurrencyPerPartitionRange, effectiveMaxMiniBatchImportSize, numberOfParallelTasks, writeThroughputBudgetPerCosmosPartition).get(); failures.addAll(eachInsertOrRetryResponse.getErrors()); badInputDocuments.addAll((eachInsertOrRetryResponse.getBadInputDocuments())); numberOfDocumentsImported += eachInsertOrRetryResponse.getNumberOfDocumentsImported(); totalRequestUnitsConsumed += eachInsertOrRetryResponse.getTotalRequestUnitsConsumed(); timeTakenForInserts = timeTakenForInserts.plus(eachInsertOrRetryResponse.getTotalTimeTaken()); if (documentsFailedToImportDueToSplits.size() > 0) { numRetriesDueToSplits++; this.initialize(); documentsToInsertOrRetry = new ArrayList<>(documentsFailedToImportDueToSplits); } } while (documentsFailedToImportDueToSplits.size() > 0 && numRetriesDueToSplits <= MAX_RETRIES_ON_SPLIT_FAILURES); if(numRetriesDueToSplits > MAX_RETRIES_ON_SPLIT_FAILURES) { Map responseHeaders = new HashMap(); responseHeaders.put(HttpConstants.HttpHeaders.SUB_STATUS, String.valueOf(HttpConstants.SubStatusCodes.PARTITION_KEY_RANGE_GONE)); BulkImportFailure bulkImportFailure = new BulkImportFailure(); bulkImportFailure.getDocumentsFailedToImport().addAll(documentsFailedToImportDueToSplits); 
bulkImportFailure.setBulkImportFailureException(new DocumentClientException(HttpStatus.SC_SERVICE_UNAVAILABLE, new Error("{ 'message': 'Max retries for BulkExecutor exhausted. Please re-initialize BulkExecutor and retry latest batch import.' }"), responseHeaders)); failedImports.add(bulkImportFailure); } BulkImportResponse bulkImportResponse = new BulkImportResponse( numberOfDocumentsImported, totalRequestUnitsConsumed, timeTakenForInserts, failures, badInputDocuments, failedImports); return bulkImportResponse; } catch (ExecutionException e) { logger.error("Failed to import documents", e); Throwable cause = e.getCause(); if (cause instanceof Exception) { throw toDocumentClientException((Exception) cause); } else { throw toDocumentClientException(e); } } catch(Exception e) { logger.error("Failed to import documents", e); throw toDocumentClientException(e); } } private BulkUpdateResponse executeBulkUpdateInternal(Collection updateItems, Integer maxConcurrencyPerPartitionRange) throws DocumentClientException { Preconditions.checkNotNull(updateItems, "update items cannot be null"); try { List updatesToAttemptOrRetry = new ArrayList<>(updateItems); List documentsFailedToUpdateDueToSplits = new ArrayList<>(); List bulkUpdateFailures = new ArrayList<>(); int numRetriesDueToSplits = 0; int numberOfDocumentsUpdated = 0; double totalRequestUnitsConsumed = 0; Duration totalTimeTaken = Duration.ofMillis(0); List errors = new ArrayList(); do { documentsFailedToUpdateDueToSplits = new ArrayList<>(); BulkUpdateResponse eachUpdateOrRetryResponse = executeBulkUpdateAsyncImpl(updatesToAttemptOrRetry, documentsFailedToUpdateDueToSplits, maxConcurrencyPerPartitionRange).get(); numberOfDocumentsUpdated += eachUpdateOrRetryResponse.getNumberOfDocumentsUpdated(); totalRequestUnitsConsumed += eachUpdateOrRetryResponse.getTotalRequestUnitsConsumed(); totalTimeTaken = totalTimeTaken.plus(eachUpdateOrRetryResponse.getTotalTimeTaken()); errors.addAll(eachUpdateOrRetryResponse.getErrors()); bulkUpdateFailures.addAll(eachUpdateOrRetryResponse.getFailedUpdates()); if(documentsFailedToUpdateDueToSplits.size() > 0) { numRetriesDueToSplits++; this.initialize(); updatesToAttemptOrRetry = new ArrayList<>(documentsFailedToUpdateDueToSplits); } } while (documentsFailedToUpdateDueToSplits.size() > 0 && numRetriesDueToSplits <= MAX_RETRIES_ON_SPLIT_FAILURES); if(numRetriesDueToSplits > MAX_RETRIES_ON_SPLIT_FAILURES) { Map responseHeaders = new HashMap(); responseHeaders.put(HttpConstants.HttpHeaders.SUB_STATUS, String.valueOf(HttpConstants.SubStatusCodes.PARTITION_KEY_RANGE_GONE)); BulkUpdateFailure bulkUpdateFailure = new BulkUpdateFailure(); bulkUpdateFailure.getFailedUpdateItems().addAll(documentsFailedToUpdateDueToSplits); bulkUpdateFailure.setBulkUpdateFailureException(new DocumentClientException(HttpStatus.SC_SERVICE_UNAVAILABLE, new Error("{ 'message': 'Max retries for BulkExecutor exhausted. Please re-initialize BulkExecutor and retry latest batch update.' 
}"), responseHeaders)); bulkUpdateFailures.add(bulkUpdateFailure); } BulkUpdateResponse bulkUpdateResponse = new BulkUpdateResponse(numberOfDocumentsUpdated, totalRequestUnitsConsumed, totalTimeTaken, errors, bulkUpdateFailures); return bulkUpdateResponse; } catch (ExecutionException e) { logger.error("Failed to update documents", e); Throwable cause = e.getCause(); if (cause instanceof Exception) { throw toDocumentClientException((Exception) cause); } else { throw toDocumentClientException(e); } } catch(Exception e) { logger.error("Failed to update documents", e); throw toDocumentClientException(e); } } private BulkUpdateResponse executeBulkUpdateWithPatchInternal(Collection patchDocuments, Integer maxConcurrencyPerPartitionRange) throws DocumentClientException { Preconditions.checkNotNull(patchDocuments, "patch documents cannot be null"); try { List documentsFailedToUpdateDueToSplits = new ArrayList<>(); int numRetriesDueToSplits = 0; int numberOfDocumentsUpdated = 0; double totalRequestUnitsConsumed = 0; Duration totalTimeTaken = Duration.ofMillis(0); List errors = new ArrayList<>(); List bulkUpdateFailures = new ArrayList<>(); BulkUpdateResponse eachUpdateOrRetryResponse = executeBulkUpdateWithPatchAsyncImpl(patchDocuments, documentsFailedToUpdateDueToSplits, maxConcurrencyPerPartitionRange).get(); numberOfDocumentsUpdated += eachUpdateOrRetryResponse.getNumberOfDocumentsUpdated(); totalRequestUnitsConsumed += eachUpdateOrRetryResponse.getTotalRequestUnitsConsumed(); totalTimeTaken = totalTimeTaken.plus(eachUpdateOrRetryResponse.getTotalTimeTaken()); errors.addAll(eachUpdateOrRetryResponse.getErrors()); bulkUpdateFailures.addAll(eachUpdateOrRetryResponse.getFailedUpdates()); while (documentsFailedToUpdateDueToSplits.size() > 0 && numRetriesDueToSplits <= MAX_RETRIES_ON_SPLIT_FAILURES) { numRetriesDueToSplits++; this.initialize(); List updatesToAttemptOrRetry = new ArrayList<>(documentsFailedToUpdateDueToSplits); documentsFailedToUpdateDueToSplits = new ArrayList<>(); eachUpdateOrRetryResponse = executeBulkUpdateAsyncImpl(updatesToAttemptOrRetry, documentsFailedToUpdateDueToSplits, maxConcurrencyPerPartitionRange).get(); numberOfDocumentsUpdated += eachUpdateOrRetryResponse.getNumberOfDocumentsUpdated(); totalRequestUnitsConsumed += eachUpdateOrRetryResponse.getTotalRequestUnitsConsumed(); totalTimeTaken = totalTimeTaken.plus(eachUpdateOrRetryResponse.getTotalTimeTaken()); errors.addAll(eachUpdateOrRetryResponse.getErrors()); bulkUpdateFailures.addAll(eachUpdateOrRetryResponse.getFailedUpdates()); } if(numRetriesDueToSplits > MAX_RETRIES_ON_SPLIT_FAILURES) { Map responseHeaders = new HashMap(); responseHeaders.put(HttpConstants.HttpHeaders.SUB_STATUS, String.valueOf(HttpConstants.SubStatusCodes.PARTITION_KEY_RANGE_GONE)); BulkUpdateFailure bulkUpdateFailure = new BulkUpdateFailure(); bulkUpdateFailure.getFailedUpdateItems().addAll(documentsFailedToUpdateDueToSplits); bulkUpdateFailure.setBulkUpdateFailureException(new DocumentClientException(HttpStatus.SC_SERVICE_UNAVAILABLE, new Error("{ 'message': 'Max retries for BulkExecutor exhausted. Please re-initialize BulkExecutor and retry latest batch update.' 
}"), responseHeaders)); bulkUpdateFailures.add(bulkUpdateFailure); } BulkUpdateResponse bulkUpdateResponse = new BulkUpdateResponse(numberOfDocumentsUpdated, totalRequestUnitsConsumed, totalTimeTaken, errors, bulkUpdateFailures); return bulkUpdateResponse; } catch (ExecutionException e) { logger.error("Failed to update documents", e); Throwable cause = e.getCause(); if (cause instanceof Exception) { throw toDocumentClientException((Exception) cause); } else { throw toDocumentClientException(e); } } catch(Exception e) { logger.error("Failed to update documents", e); throw toDocumentClientException(e); } } private BulkUpdateResponse executeUpdateDocumentInternal(String partitionKey, String id, List updateOperations) throws DocumentClientException { Preconditions.checkNotNull(partitionKey, "partitionKey cannot be null"); Preconditions.checkNotNull(id, "id cannot be null"); Preconditions.checkNotNull(updateOperations, "update operations cannot be null"); try { return executeUpdateDocumentAsyncImpl(partitionKey, id, updateOperations).get(); } catch (ExecutionException e) { logger.error("Failed to update document", e); Throwable cause = e.getCause(); if (cause instanceof Exception) { throw toDocumentClientException((Exception) cause); } else { throw toDocumentClientException(e); } } catch(Exception e) { logger.error("Failed to update document", e); throw toDocumentClientException(e); } } private BulkDeleteResponse executeBulkDeleteInternal(String query, RequestOptions requestOptions) throws DocumentClientException { Preconditions.checkNotNull(query, "query to fetch documents to delete cannot be null"); try { return executeBulkDeleteAsyncImpl(query, requestOptions).get(); } catch (ExecutionException e) { logger.error("Failed to delete document", e); Throwable cause = e.getCause(); if (cause instanceof Exception) { throw toDocumentClientException((Exception) cause); } else { throw toDocumentClientException(e); } } catch(Exception e) { logger.error("Failed to delete document", e); throw toDocumentClientException(e); } } private BulkDeleteResponse executeBulkDeleteInternalPkRowKeys(List> pkIdPairsToDelete) throws DocumentClientException { Preconditions.checkNotNull(pkIdPairsToDelete, "list of pairs of partition key and ids to delete cannot be null"); List nullPairs = new ArrayList(); for(int eachPkIdPairIndex = 0; eachPkIdPairIndex < pkIdPairsToDelete.size(); eachPkIdPairIndex++) { if(pkIdPairsToDelete.get(eachPkIdPairIndex) == null) { nullPairs.add(eachPkIdPairIndex); } } if(nullPairs.size() > 0) { throw new NullPointerException("Input list of pairs of partition keys and ids to delete contains null entries. 
Indices of null entires are: " + nullPairs.toString()); } try { return executeBulkDeletePkRowKeyPairsAsyncImpl(pkIdPairsToDelete).get(); } catch (ExecutionException e) { logger.debug("Failed to delete document", e); Throwable cause = e.getCause(); if (cause instanceof Exception) { throw toDocumentClientException((Exception) cause); } else { throw toDocumentClientException(e); } } catch (Exception e) { logger.error("Failed to delete documents", e); throw toDocumentClientException(e); } } private ListenableFuture executeBulkImportAsyncImpl(Collection documents, Collection documentsFailedToImportDueToSplits, List failedImports, boolean isUpsert, boolean disableAutomaticIdGeneration, Integer maxConcurrencyPerPartitionRange, Integer effectiveMaxMiniBatchImportSize, Integer numberOfParallelTasks, Integer writeThroughputBudgetPerCosmosPartition) throws Exception { Stopwatch watch = Stopwatch.createStarted(); BulkImportStoredProcedureOptions options = new BulkImportStoredProcedureOptions(disableAutomaticIdGeneration, false, null, false, isUpsert, true); logger.debug("Bucketing documents ..."); ConcurrentHashMap> documentsToImportByPartition = new ConcurrentHashMap>(); ConcurrentHashMap>> miniBatchesToImportByPartition = new ConcurrentHashMap>>(); for (String partitionKeyRangeId: partitionKeyRangeIds) { documentsToImportByPartition.put(partitionKeyRangeId, ConcurrentHashMap.newKeySet(documents.size() / partitionKeyRangeIds.size())); miniBatchesToImportByPartition.put(partitionKeyRangeId, new ArrayList>(1000)); } documents.parallelStream().forEach(documentAsString -> { PartitionKeyInternal partitionKeyValue = DocumentAnalyzer.extractPartitionKeyValue(documentAsString, partitionKeyDefinition); String effectivePartitionKey = partitionKeyValue.getEffectivePartitionKeyString(partitionKeyDefinition, true); String partitionRangeId = collectionRoutingMap.getRangeByEffectivePartitionKey(effectivePartitionKey).getId(); documentsToImportByPartition.get(partitionRangeId).add(documentAsString); }); if (effectiveMaxMiniBatchImportSize > 0) { maxMiniBatchSize = effectiveMaxMiniBatchImportSize; } logger.debug("Creating mini batches within each partition bucket"); documentsToImportByPartition.entrySet().parallelStream().forEach(entry -> { String partitionRangeId = entry.getKey(); Set documentsToImportInPartition = entry.getValue(); Iterator it = documentsToImportInPartition.iterator(); ArrayList currentMiniBatch = new ArrayList(500); int currentMiniBatchSize = 0; while (it.hasNext()) { String currentDocument = it.next(); int currentDocumentSize = getDocumentSize(currentDocument); if ((currentMiniBatchSize + currentDocumentSize <= maxMiniBatchSize)) { // add the document to current batch currentMiniBatch.add(currentDocument); currentMiniBatchSize += currentDocumentSize; } else { // this batch has reached its max size miniBatchesToImportByPartition.get(partitionRangeId).add(currentMiniBatch); currentMiniBatch = new ArrayList(500); currentMiniBatch.add(currentDocument); currentMiniBatchSize = currentDocumentSize; } } if (currentMiniBatch.size() > 0) { // add mini batch miniBatchesToImportByPartition.get(partitionRangeId).add(currentMiniBatch); } }); logger.debug("Beginning bulk import within each partition bucket"); Map batchInserters = new HashMap(); Map congestionControllers = new HashMap(); logger.debug("Preprocessing took: " + watch.elapsed().toMillis() + " millis"); List> futures = new ArrayList<>(); for (String partitionKeyRangeId: this.partitionKeyRangeIds) { BatchInserter batchInserter = new BatchInserter( 
partitionKeyRangeId, miniBatchesToImportByPartition.get(partitionKeyRangeId), this.client, bulkImportStoredProcLink, options, numberOfParallelTasks, writeThroughputBudgetPerCosmosPartition); batchInserters.put(partitionKeyRangeId, batchInserter); CongestionController cc = new CongestionController(listeningExecutorService, collectionThroughput / partitionKeyRangeIds.size(), partitionKeyRangeId, batchInserter, partitionKeyRangeIdToInferredDegreeOfParallelism.get(partitionKeyRangeId), maxConcurrencyPerPartitionRange); congestionControllers.put(partitionKeyRangeId,cc); // starting futures.add(cc.executeAllAsync()); } FutureCombiner futureContainer = Futures.whenAllComplete(futures); AsyncCallable completeAsyncCallback = new AsyncCallable() { @Override public ListenableFuture call() throws Exception { List failures = new ArrayList<>(); List badInputDocuments = new ArrayList(); for(String partitionKeyRangeId: partitionKeyRangeIds) { CongestionController cc = congestionControllers.get(partitionKeyRangeId); failures.addAll(cc.getFailures()); BatchInserter batchInserter = batchInserters.get(partitionKeyRangeId); badInputDocuments.addAll(batchInserter.getBadInputDocuments()); documentsFailedToImportDueToSplits.addAll(batchInserter.getDocumentsFailedToImportDueToSplits()); failedImports.addAll(batchInserters.get(partitionKeyRangeId).getDocumentsFailedToImport()); partitionKeyRangeIdToInferredDegreeOfParallelism.put(partitionKeyRangeId, cc.getDegreeOfConcurrency()); } int numberOfDocumentsImported = batchInserters.values().stream().mapToInt(b -> b.getNumberOfDocumentsImported()).sum(); double totalRequestUnitsConsumed = batchInserters.values().stream().mapToDouble(b -> b.getTotalRequestUnitsConsumed()).sum(); watch.stop(); BulkImportResponse bulkImportResponse = new BulkImportResponse( numberOfDocumentsImported, totalRequestUnitsConsumed, watch.elapsed(), failures, badInputDocuments, failedImports); return Futures.immediateFuture(bulkImportResponse); } }; return futureContainer.callAsync(completeAsyncCallback, listeningExecutorService); } private ListenableFuture executeBulkUpdateAsyncImpl(Collection updateItems, Collection documentsFailedToUpdateDueToSplits, Integer maxConcurrencyPerPartitionRange) { Stopwatch watch = Stopwatch.createStarted(); logger.debug("Bucketing update items ..."); ConcurrentHashMap> updateItemsByPartition = new ConcurrentHashMap>(); ConcurrentHashMap>> miniBatchesToUpdateByPartition = new ConcurrentHashMap>>(); for (String partitionKeyRangeId: partitionKeyRangeIds) { updateItemsByPartition.put(partitionKeyRangeId, ConcurrentHashMap.newKeySet(updateItems.size() / partitionKeyRangeIds.size())); miniBatchesToUpdateByPartition.put(partitionKeyRangeId, new ArrayList>(1000)); } updateItems.parallelStream().forEach(updateItem -> { PartitionKeyInternal partitionKeyValue = DocumentAnalyzer.fromPartitionKeyvalue(updateItem.getPartitionKeyValue()); String effectivePartitionKey = partitionKeyValue.getEffectivePartitionKeyString(partitionKeyDefinition, true); String partitionRangeId = collectionRoutingMap.getRangeByEffectivePartitionKey(effectivePartitionKey).getId(); updateItemsByPartition.get(partitionRangeId).add(updateItem); }); logger.debug("Creating mini batches within each partition bucket"); updateItemsByPartition.entrySet().parallelStream().forEach(entry -> { String partitionRangeId = entry.getKey(); Set updateItemsInPartition = entry.getValue(); Iterator it = updateItemsInPartition.iterator(); ArrayList currentMiniBatch = new ArrayList(500); int currentMiniBatchIndex = 0; while 
(it.hasNext()) { UpdateItem currentUpdateItem = it.next(); if ((currentMiniBatchIndex + 1 <= maxUpdateMiniBatchCount)) { // add the update item to current batch currentMiniBatch.add(currentUpdateItem); currentMiniBatchIndex++; } else { // this batch has reached its max size miniBatchesToUpdateByPartition.get(partitionRangeId).add(currentMiniBatch); currentMiniBatch = new ArrayList(500); currentMiniBatch.add(currentUpdateItem); currentMiniBatchIndex = 1; } } if (currentMiniBatch.size() > 0) { // add mini batch miniBatchesToUpdateByPartition.get(partitionRangeId).add(currentMiniBatch); } }); logger.debug("Beginning bulk update within each partition bucket"); List failedUpdates = new ArrayList<>(); Map batchUpdaters = new HashMap(); Map congestionControllers = new HashMap(); logger.debug("Preprocessing took: " + watch.elapsed().toMillis() + " millis"); List> futures = new ArrayList<>(); // Note: we handle only simple partition key path at the moment. Collection partitionKeyPath = partitionKeyDefinition.getPaths(); String partitionKeyProperty = partitionKeyPath.iterator().next().replaceFirst("^/", ""); for (String partitionKeyRangeId: this.partitionKeyRangeIds) { BatchUpdater batchUpdater = new BatchUpdater( partitionKeyRangeId, miniBatchesToUpdateByPartition.get(partitionKeyRangeId), this.client, bulkUpdateStoredProcLink, partitionKeyProperty); batchUpdaters.put(partitionKeyRangeId, batchUpdater); CongestionController cc = new CongestionController(listeningExecutorService, collectionThroughput / partitionKeyRangeIds.size(), partitionKeyRangeId, batchUpdater, partitionKeyRangeIdToInferredDegreeOfParallelism.get(partitionKeyRangeId), maxConcurrencyPerPartitionRange); congestionControllers.put(partitionKeyRangeId,cc); // starting futures.add(cc.executeAllAsync()); } FutureCombiner futureContainer = Futures.whenAllComplete(futures); AsyncCallable completeAsyncCallback = new AsyncCallable() { @Override public ListenableFuture call() throws Exception { List failures = new ArrayList<>(); for(String partitionKeyRangeId: partitionKeyRangeIds) { CongestionController cc = congestionControllers.get(partitionKeyRangeId); failures.addAll(cc.getFailures()); BatchUpdater batchUpdater = batchUpdaters.get(partitionKeyRangeId); documentsFailedToUpdateDueToSplits.addAll(batchUpdater.getDocumentsFailedToUpdateDueToSplits()); failedUpdates.addAll(batchUpdater.getBulkUpdateFailures()); partitionKeyRangeIdToInferredDegreeOfParallelism.put(partitionKeyRangeId, cc.getDegreeOfConcurrency()); } int numberOfDocumentsUpdated = batchUpdaters.values().stream().mapToInt(b -> b.getNumberOfDocumentsUpdated()).sum(); double totalRequestUnitsConsumed = batchUpdaters.values().stream().mapToDouble(b -> b.getTotalRequestUnitsConsumed()).sum(); watch.stop(); BulkUpdateResponse bulkUpdateResponse = new BulkUpdateResponse(numberOfDocumentsUpdated, totalRequestUnitsConsumed, watch.elapsed(), failures, failedUpdates); return Futures.immediateFuture(bulkUpdateResponse); } }; return futureContainer.callAsync(completeAsyncCallback, listeningExecutorService); } private ListenableFuture executeBulkUpdateWithPatchAsyncImpl(Collection patchDocuments, Collection documentsFailedToUpdateDueToSplits, Integer maxConcurrencyPerPartitionRange) { Stopwatch watch = Stopwatch.createStarted(); logger.debug("Bucketing patch documents ..."); ConcurrentHashMap> updateItemsByPartition = new ConcurrentHashMap>(); ConcurrentHashMap>> miniBatchesToUpdateByPartition = new ConcurrentHashMap>>(); for (String partitionKeyRangeId: partitionKeyRangeIds) { 
updateItemsByPartition.put(partitionKeyRangeId, ConcurrentHashMap.newKeySet(patchDocuments.size() / partitionKeyRangeIds.size())); miniBatchesToUpdateByPartition.put(partitionKeyRangeId, new ArrayList>(1000)); } // Note: we handle only simple partition key path at the moment. Collection partitionKeyPath = partitionKeyDefinition.getPaths(); String partitionKeyProperty = partitionKeyPath.iterator().next().replaceFirst("^/", ""); patchDocuments.parallelStream().forEach(patchDocument -> { UpdateItem updateItem = getUpdateItemFromPatchDocument(patchDocument, partitionKeyProperty); PartitionKeyInternal partitionKeyValue = DocumentAnalyzer.fromPartitionKeyvalue(updateItem.getPartitionKeyValue()); String effectivePartitionKey = partitionKeyValue.getEffectivePartitionKeyString(partitionKeyDefinition, true); String partitionRangeId = collectionRoutingMap.getRangeByEffectivePartitionKey(effectivePartitionKey).getId(); updateItemsByPartition.get(partitionRangeId).add(updateItem); }); logger.debug("Creating mini batches within each partition bucket"); updateItemsByPartition.entrySet().parallelStream().forEach(entry -> { String partitionRangeId = entry.getKey(); Set updateItemsInPartition = entry.getValue(); Iterator it = updateItemsInPartition.iterator(); ArrayList currentMiniBatch = new ArrayList(500); int currentMiniBatchIndex = 0; while (it.hasNext()) { UpdateItem currentUpdateItem = it.next(); if ((currentMiniBatchIndex + 1 <= maxUpdateMiniBatchCount)) { // add the update item to current batch currentMiniBatch.add(currentUpdateItem); currentMiniBatchIndex++; } else { // this batch has reached its max size miniBatchesToUpdateByPartition.get(partitionRangeId).add(currentMiniBatch); currentMiniBatch = new ArrayList(500); currentMiniBatch.add(currentUpdateItem); currentMiniBatchIndex = 1; } } if (currentMiniBatch.size() > 0) { // add mini batch miniBatchesToUpdateByPartition.get(partitionRangeId).add(currentMiniBatch); } }); logger.debug("Beginning bulk update within each partition bucket"); List failedUpdates = new ArrayList<>(); Map batchUpdaters = new HashMap(); Map congestionControllers = new HashMap(); logger.debug("Preprocessing took: " + watch.elapsed().toMillis() + " millis"); List> futures = new ArrayList<>(); for (String partitionKeyRangeId: this.partitionKeyRangeIds) { BatchUpdater batchUpdater = new BatchUpdater( partitionKeyRangeId, miniBatchesToUpdateByPartition.get(partitionKeyRangeId), this.client, bulkUpdateStoredProcLink, partitionKeyProperty); batchUpdaters.put(partitionKeyRangeId, batchUpdater); CongestionController cc = new CongestionController(listeningExecutorService, collectionThroughput / partitionKeyRangeIds.size(), partitionKeyRangeId, batchUpdater, partitionKeyRangeIdToInferredDegreeOfParallelism.get(partitionKeyRangeId), maxConcurrencyPerPartitionRange); congestionControllers.put(partitionKeyRangeId,cc); // starting futures.add(cc.executeAllAsync()); } FutureCombiner futureContainer = Futures.whenAllComplete(futures); AsyncCallable completeAsyncCallback = new AsyncCallable() { @Override public ListenableFuture call() throws Exception { List failures = new ArrayList<>(); for(String partitionKeyRangeId: partitionKeyRangeIds) { CongestionController cc = congestionControllers.get(partitionKeyRangeId); BatchUpdater batchUpdater = batchUpdaters.get(partitionKeyRangeId); documentsFailedToUpdateDueToSplits.addAll(batchUpdater.getDocumentsFailedToUpdateDueToSplits()); failedUpdates.addAll(batchUpdater.getBulkUpdateFailures()); failures.addAll(cc.getFailures()); 
partitionKeyRangeIdToInferredDegreeOfParallelism.put(partitionKeyRangeId, cc.getDegreeOfConcurrency()); } int numberOfDocumentsUpdated = batchUpdaters.values().stream().mapToInt(b -> b.getNumberOfDocumentsUpdated()).sum(); double totalRequestUnitsConsumed = batchUpdaters.values().stream().mapToDouble(b -> b.getTotalRequestUnitsConsumed()).sum(); watch.stop(); BulkUpdateResponse bulkUpdateResponse = new BulkUpdateResponse(numberOfDocumentsUpdated, totalRequestUnitsConsumed, watch.elapsed(), failures, failedUpdates); return Futures.immediateFuture(bulkUpdateResponse); } }; return futureContainer.callAsync(completeAsyncCallback, listeningExecutorService); } private UpdateItem getUpdateItemFromPatchDocument(Document patchDocument, String partitionKeyProperty) { String idValue = null; String pkValue = null; List updateOperations = new ArrayList(); HashMap patchDocumentMap = patchDocument.getHashMap(); for (Map.Entry entry : patchDocumentMap.entrySet()) { if (entry.getKey().equals("id")) { idValue = (String)entry.getValue(); if (partitionKeyProperty != null && partitionKeyProperty.equals("id")) { pkValue = (String)entry.getValue(); } continue; } if (entry.getKey().equals(partitionKeyProperty)) { pkValue = (String)entry.getValue(); continue; } updateOperations.addAll(getUpdateOperations("", entry.getKey(), entry.getValue())); } return new UpdateItem(idValue, pkValue, updateOperations); } @SuppressWarnings("unchecked") private List getUpdateOperations(String propertyKeyPrefix, String propertyKey, Object propertyValue) { List updateOperations = new ArrayList(); String propertyKeyToSet = propertyKeyPrefix.matches("") ? propertyKey : propertyKeyPrefix+"."+propertyKey; if (propertyValue instanceof String) { updateOperations.add(new SetUpdateOperation(propertyKeyToSet, (String)propertyValue)); } else if (propertyValue instanceof Integer) { updateOperations.add(new SetUpdateOperation(propertyKeyToSet, (Integer)propertyValue)); } else if (propertyValue instanceof Double) { updateOperations.add(new SetUpdateOperation(propertyKeyToSet, (Double)propertyValue)); } else if (propertyValue instanceof Boolean) { updateOperations.add(new SetUpdateOperation(propertyKeyToSet, (Boolean)propertyValue)); } else if (propertyValue instanceof List) { updateOperations.add(new SetUpdateOperation>(propertyKeyToSet, (List)propertyValue)); } else if (propertyValue instanceof Map) { HashMap propertyHashMap = (HashMap)propertyValue; for (Map.Entry entry : propertyHashMap.entrySet()) { updateOperations.addAll(getUpdateOperations(propertyKeyToSet, entry.getKey(), entry.getValue())); } } return updateOperations; } private ListenableFuture executeUpdateDocumentAsyncImpl(String partitionKey, String id, List updateOperations) { Stopwatch watch = Stopwatch.createStarted(); PartitionKeyInternal partitionKeyValue = DocumentAnalyzer.fromPartitionKeyvalue(partitionKey); String effectivePartitionKey = partitionKeyValue.getEffectivePartitionKeyString(partitionKeyDefinition, true); String partitionRangeId = collectionRoutingMap.getRangeByEffectivePartitionKey(effectivePartitionKey).getId(); List> miniBatchesToUpdate = new ArrayList>(1); List currentMiniBatch = new ArrayList(1); UpdateItem currentItem = new UpdateItem(id, partitionKey, updateOperations); currentMiniBatch.add(currentItem); miniBatchesToUpdate.add(currentMiniBatch); // Note: we handle only simple partition key path at the moment. 
Collection partitionKeyPath = partitionKeyDefinition.getPaths(); String partitionKeyProperty = partitionKeyPath.iterator().next().replaceFirst("^/", ""); BatchUpdater batchUpdater = new BatchUpdater( partitionRangeId, miniBatchesToUpdate, this.client, bulkUpdateStoredProcLink, partitionKeyProperty); CongestionController cc = new CongestionController(listeningExecutorService, collectionThroughput / partitionKeyRangeIds.size(), partitionRangeId, batchUpdater, null, null); List> futures = new ArrayList<>(); futures.add(cc.executeAllAsync()); FutureCombiner futureContainer = Futures.whenAllComplete(futures); AsyncCallable completeAsyncCallback = new AsyncCallable() { @Override public ListenableFuture call() throws Exception { List failures = new ArrayList<>(); failures.addAll(cc.getFailures()); int numberOfDocumentsUpdated = batchUpdater.getNumberOfDocumentsUpdated(); double totalRequestUnitsConsumed = batchUpdater.getTotalRequestUnitsConsumed(); watch.stop(); BulkUpdateResponse bulkUpdateResponse = new BulkUpdateResponse(numberOfDocumentsUpdated, totalRequestUnitsConsumed, watch.elapsed(), failures, null); return Futures.immediateFuture(bulkUpdateResponse); } }; return futureContainer.callAsync(completeAsyncCallback, listeningExecutorService); } private ListenableFuture executeBulkDeleteAsyncImpl(String query, RequestOptions requestOptions) throws Exception { List partitionKeyRangeIds = this.partitionKeyRangeIds; if(requestOptions != null && requestOptions.getPartitionKey() != null) { partitionKeyRangeIds = new ArrayList<>(); PartitionKeyInternal partitionKeyValue = requestOptions.getPartitionKey().getInternalPartitionKey(); String effectivePartitionKey = partitionKeyValue.getEffectivePartitionKeyString(partitionKeyDefinition, true); partitionKeyRangeIds.add(collectionRoutingMap.getRangeByEffectivePartitionKey(effectivePartitionKey).getId()); } List> futures = new ArrayList<>(); Stopwatch watch = Stopwatch.createStarted(); Matcher bulkDeleteQuerySpecMatcher = BULK_DELETE_QUERY_SPEC_PATTERN.matcher(query); if(!bulkDeleteQuerySpecMatcher.find()) { throw new IllegalArgumentException("Input SQL query is invalid: " + query + " Query must be of the form: select * from c where "); } String root = bulkDeleteQuerySpecMatcher.group("root").toString(); if(bulkDeleteQuerySpecMatcher.group("filter") == null || bulkDeleteQuerySpecMatcher.group("filter").isEmpty() || bulkDeleteQuerySpecMatcher.group("filter").trim().isEmpty()) { throw new IllegalArgumentException("Input SQL query is invalid: " + query + " Query must have filters in its where clause"); } String filterExpression = bulkDeleteQuerySpecMatcher.group("filter").toString(); BulkDeleteQuerySpec querySpec = new BulkDeleteQuerySpec(root, filterExpression, null, null, DEFAULT_BULK_DELETE_BATCH_SIZE); logger.debug("Beginning bulk delete within each partition range"); Map batchDeleters = new HashMap<>(); for (String partitionKeyRangeId: partitionKeyRangeIds) { BatchDeleter batchDeleter = new BatchDeleter( partitionKeyRangeId, this.client, bulkDeleteStoredProcLink, querySpec); batchDeleters.put(partitionKeyRangeId, batchDeleter); ListenableFuture batchDeleterFuture = listeningExecutorService.submit(batchDeleter.executeDelete()); futures.add(batchDeleterFuture); } FutureCombiner futureContainer = Futures.whenAllComplete(futures); AsyncCallable completeAsyncCallback = new AsyncCallable() { @Override public ListenableFuture call() throws Exception { List failures = new ArrayList<>(); int numberOfDocumentsDeleted = batchDeleters.values().stream().mapToInt(b 
        FutureCombiner<Object> futureContainer = Futures.whenAllComplete(futures);
        AsyncCallable<BulkDeleteResponse> completeAsyncCallback = new AsyncCallable<BulkDeleteResponse>() {
            @Override
            public ListenableFuture<BulkDeleteResponse> call() throws Exception {
                List<Exception> failures = new ArrayList<>();

                int numberOfDocumentsDeleted = batchDeleters.values().stream()
                        .mapToInt(b -> b.getNumberOfDocumentsDeleted()).sum();
                double totalRequestUnitsConsumed = batchDeleters.values().stream()
                        .mapToDouble(b -> b.getTotalRequestUnitsConsumed()).sum();

                watch.stop();

                BulkDeleteResponse bulkDeleteResponse = new BulkDeleteResponse(numberOfDocumentsDeleted,
                        totalRequestUnitsConsumed, watch.elapsed(), failures, new ArrayList<>());

                return Futures.immediateFuture(bulkDeleteResponse);
            }
        };

        return futureContainer.callAsync(completeAsyncCallback, listeningExecutorService);
    }

    private ListenableFuture<BulkDeleteResponse> executeBulkDeletePkRowKeyPairsAsyncImpl(
            List<Pair<String, String>> pkIdPairsToDelete) throws Exception {
        List<ListenableFuture<?>> futures = new ArrayList<>();
        Stopwatch watch = Stopwatch.createStarted();

        logger.debug("Bucketing documents ...");

        ConcurrentHashMap<String, Set<Pair<String, String>>> documentsToDeleteByPartition =
                new ConcurrentHashMap<>();
        ConcurrentHashMap<String, List<List<Pair<String, String>>>> miniBatchesToDeleteByPartition =
                new ConcurrentHashMap<>();

        for (String partitionKeyRangeId : partitionKeyRangeIds) {
            documentsToDeleteByPartition.put(partitionKeyRangeId,
                    ConcurrentHashMap.newKeySet(pkIdPairsToDelete.size() / partitionKeyRangeIds.size()));
            miniBatchesToDeleteByPartition.put(partitionKeyRangeId,
                    new ArrayList<List<Pair<String, String>>>(1000));
        }

        // Route each (partition key, id) pair to the partition key range that owns its effective partition key.
        pkIdPairsToDelete.parallelStream().forEach(documentAsString -> {
            PartitionKeyInternal partitionKeyValue = DocumentAnalyzer.fromPartitionKeyvalue(documentAsString.getKey());
            String effectivePartitionKey = partitionKeyValue.getEffectivePartitionKeyString(partitionKeyDefinition, true);
            String partitionRangeId = collectionRoutingMap.getRangeByEffectivePartitionKey(effectivePartitionKey).getId();
            documentsToDeleteByPartition.get(partitionRangeId).add(documentAsString);
        });

        logger.trace("Creating mini batches within each partition bucket");

        documentsToDeleteByPartition.entrySet().parallelStream().forEach(entry -> {
            String partitionRangeId = entry.getKey();
            Set<Pair<String, String>> documentsToDeleteInPartition = entry.getValue();

            Iterator<Pair<String, String>> it = documentsToDeleteInPartition.iterator();
            ArrayList<Pair<String, String>> currentMiniBatch = new ArrayList<>(100);
            int currentMiniBatchSize = 0;

            while (it.hasNext()) {
                Pair<String, String> currentDocument = it.next();
                if (currentMiniBatchSize <= maxMiniBatchSize) {
                    // add the document to current batch
                    currentMiniBatch.add(currentDocument);
                    currentMiniBatchSize++;
                } else {
                    // this batch has reached its max size
                    miniBatchesToDeleteByPartition.get(partitionRangeId).add(currentMiniBatch);
                    currentMiniBatch = new ArrayList<>(100);
                    currentMiniBatch.add(currentDocument);
                    currentMiniBatchSize = 1;
                }
            }

            if (currentMiniBatch.size() > 0) {
                // add mini batch
                miniBatchesToDeleteByPartition.get(partitionRangeId).add(currentMiniBatch);
            }
        });
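        // Each mini batch built above contains (partition key, id) pairs that all map to the same
        // partition key range; one BatchDeleter invocation is scheduled per mini batch below.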
        logger.debug("Beginning bulk delete within each partition range");

        Map<String, BatchDeleter> batchDeleters = new HashMap<>();
        for (String partitionKeyRangeId : this.partitionKeyRangeIds) {
            for (List<Pair<String, String>> eachMiniBatchForPkRangeId : miniBatchesToDeleteByPartition.get(partitionKeyRangeId)) {
                BatchDeleter batchDeleter = new BatchDeleter(
                        partitionKeyRangeId,
                        this.client,
                        bulkDeleteStoredProcLink,
                        partitionKeyDefinition,
                        eachMiniBatchForPkRangeId);
                // Note: only the most recently created BatchDeleter for a given partition key range id
                // is retained in this map and therefore contributes to the aggregation below.
                batchDeleters.put(partitionKeyRangeId, batchDeleter);

                ListenableFuture<?> batchDeleterFuture = listeningExecutorService.submit(batchDeleter.executeDelete());
                futures.add(batchDeleterFuture);
            }
        }

        FutureCombiner<Object> futureContainer = Futures.whenAllComplete(futures);
        AsyncCallable<BulkDeleteResponse> completeAsyncCallback = new AsyncCallable<BulkDeleteResponse>() {
            @Override
            public ListenableFuture<BulkDeleteResponse> call() throws Exception {
                List<Exception> failures = new ArrayList<>();
                int numberOfDocumentsDeleted = 0;
                double totalRequestUnitsConsumed = 0;
                List bulkDeleteFailures = new ArrayList<>();

                for (BatchDeleter eachBatchDeleter : batchDeleters.values()) {
                    numberOfDocumentsDeleted += eachBatchDeleter.getNumberOfDocumentsDeleted();
                    totalRequestUnitsConsumed += eachBatchDeleter.getTotalRequestUnitsConsumed();
                    bulkDeleteFailures.addAll(eachBatchDeleter.getBulkDeleteFailures());
                }

                watch.stop();

                BulkDeleteResponse bulkDeleteResponse = new BulkDeleteResponse(numberOfDocumentsDeleted,
                        totalRequestUnitsConsumed, watch.elapsed(), failures, bulkDeleteFailures);

                return Futures.immediateFuture(bulkDeleteResponse);
            }
        };

        return futureContainer.callAsync(completeAsyncCallback, listeningExecutorService);
    }

    /**
     * Builds and returns the collection routing map of a particular Azure Cosmos DB collection.
     *
     * @param client an instance of {@link DocumentClient}
     * @param collectionLink specifies the link to the collection whose routing map is requested
     * @return an instance of {@link CollectionRoutingMap}
     * @throws IllegalStateException if a complete routing map cannot be created or no partition key ranges are returned
     */
    private static CollectionRoutingMap getCollectionRoutingMap(DocumentClient client, String collectionLink) {
        List<ImmutablePair<PartitionKeyRange, Boolean>> ranges = new ArrayList<>();

        FeedResponse<PartitionKeyRange> feedResponse = client.readPartitionKeyRanges(collectionLink, (FeedOptions) null);
        List<PartitionKeyRange> partitionKeyRanges = feedResponse.getQueryIterable().toList();

        if (partitionKeyRanges.size() == 0) {
            logger.error("Partition key ranges read operation failed to return any ranges: "
                    + feedResponse.getResponseHeaders());
            throw new IllegalStateException("Could not read the partition key ranges; at least one range should be returned;"
                    + " response headers: " + feedResponse.getResponseHeaders());
        }

        for (PartitionKeyRange range : partitionKeyRanges) {
            ranges.add(new ImmutablePair<>(range, true));
        }

        CollectionRoutingMap routingMap = InMemoryCollectionRoutingMap.tryCreateCompleteRoutingMap(ranges, StringUtils.EMPTY);
        if (routingMap == null) {
            throw new IllegalStateException("Cannot create complete routing map");
        }

        return routingMap;
    }
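    /*
     * Note: the update and delete paths above resolve an effective partition key string to the id
     * of its owning partition key range via
     * collectionRoutingMap.getRangeByEffectivePartitionKey(effectivePartitionKey).getId().
     */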
    /**
     * Returns the target partition key range ids of given partition key values.
     *
     * @param collectionRoutingMap an instance of {@link CollectionRoutingMap}
     * @param partitionKeyDefinition an instance of {@link PartitionKeyDefinition}
     * @param partitionKeyValues specifies the list of partition key values whose partition key range ids are requested
     * @return a list of pairs of partition key value and corresponding partition key range id
     */
    @SuppressWarnings("unused")
    private static List<Pair<String, String>> getPartitionKeyRangeIdsFromValues(
            CollectionRoutingMap collectionRoutingMap,
            PartitionKeyDefinition partitionKeyDefinition,
            List<String> partitionKeyValues) {
        List<Pair<String, String>> partitionRangeIds = new ArrayList<>();

        for (String partitionKeyValue : partitionKeyValues) {
            PartitionKeyInternal pkInternal = PartitionKeyInternal.fromObjectArray(
                    Collections.singletonList(partitionKeyValue), true);
            String effectivePartitionKey = pkInternal.getEffectivePartitionKeyString(partitionKeyDefinition, true);
            String partitionRangeId = collectionRoutingMap.getRangeByEffectivePartitionKey(effectivePartitionKey).getId();
            partitionRangeIds.add(new ImmutablePair<>(partitionKeyValue, partitionRangeId));
        }

        return partitionRangeIds;
    }

    private DocumentClientException toDocumentClientException(Exception e) {
        if (e instanceof DocumentClientException) {
            return (DocumentClientException) e;
        } else {
            return new DocumentClientException(500, e);
        }
    }

    private int getDocumentSize(String document) {
        int documentSize = document.getBytes(Charset.forName("UTF-8")).length;
        if (documentSize > maxMiniBatchSize) {
            logger.error("Document size {} is larger than the script payload limit {}", documentSize, maxMiniBatchSize);
        }
        return documentSize;
    }
}
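/*
 * Usage sketch (not part of the library source): one way a caller might exercise the bulk delete
 * path implemented by executeBulkDeletePkRowKeyPairsAsyncImpl above, assuming the builder-based
 * construction and the public deleteAll entry point described in the library's documentation.
 * Database/collection names and the throughput value are placeholders, and exact signatures should
 * be checked against the published API.
 *
 *     DocumentBulkExecutor executor = DocumentBulkExecutor.builder()
 *             .from(client, "myDatabase", "myCollection",
 *                   collection.getPartitionKey(), 100000)
 *             .build();
 *
 *     BulkDeleteResponse response = executor.deleteAll(
 *             java.util.Collections.singletonList(
 *                     new org.apache.commons.lang3.tuple.ImmutablePair<>("myPartitionKeyValue", "myDocumentId")));
 *
 *     System.out.println("Deleted " + response.getNumberOfDocumentsDeleted()
 *             + " documents, consuming " + response.getTotalRequestUnitsConsumed() + " RUs");
 */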