/*
* The MIT License (MIT)
* Copyright (c) 2017 Microsoft Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package com.microsoft.azure.documentdb.bulkexecutor;
import java.nio.charset.Charset;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.microsoft.azure.documentdb.internal.routing.PartitionKeyInternalHelper;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.util.concurrent.AsyncCallable;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.Futures.FutureCombiner;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import com.microsoft.azure.documentdb.Document;
import com.microsoft.azure.documentdb.DocumentClient;
import com.microsoft.azure.documentdb.DocumentClientException;
import com.microsoft.azure.documentdb.DocumentCollection;
import com.microsoft.azure.documentdb.Error;
import com.microsoft.azure.documentdb.FeedOptions;
import com.microsoft.azure.documentdb.FeedResponse;
import com.microsoft.azure.documentdb.PartitionKeyDefinition;
import com.microsoft.azure.documentdb.PartitionKeyRange;
import com.microsoft.azure.documentdb.RequestOptions;
import com.microsoft.azure.documentdb.RetryOptions;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BatchDeleter;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BatchInserter;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BatchUpdater;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BulkDeleteQuerySpec;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BulkImportStoredProcedureOptions;
import com.microsoft.azure.documentdb.bulkexecutor.internal.CongestionController;
import com.microsoft.azure.documentdb.bulkexecutor.internal.DocumentAnalyzer;
import com.microsoft.azure.documentdb.bulkexecutor.internal.ExceptionUtils;
import com.microsoft.azure.documentdb.internal.HttpConstants;
import com.microsoft.azure.documentdb.internal.routing.CollectionRoutingMap;
import com.microsoft.azure.documentdb.internal.routing.InMemoryCollectionRoutingMap;
import com.microsoft.azure.documentdb.internal.routing.PartitionKeyInternal;
import com.microsoft.azure.documentdb.internal.routing.Range;
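/**
* Provides bulk import ({@link #importAll}), bulk update ({@link #updateAll} and {@link #mergeAll}) and
* bulk delete ({@link #deleteAll}) operations against a single Azure Cosmos DB collection by invoking the
* system bulk stored procedures across the collection's partition key ranges. Instances are created through
* {@link #builder()} and should be closed via {@link #close()} when no longer needed.
*/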
public class DocumentBulkExecutor implements AutoCloseable {
public static class Builder {
private DocumentClient client;
private String collectionLink;
private int maxMiniBatchSize = (int) Math.floor(MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE * FRACTION_OF_MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE_ALLOWED);
private int maxUpdateMiniBatchCount = 500;
private final static int DEFAULT_RETRY_ATTEMPT_ON_THROTTLING_FOR_INIT = 200;
private final static int DEFAULT_WAIT_TIME_ON_THROTTLING_FOR_INIT_IN_SECONDS = 60;
private PartitionKeyDefinition partitionKeyDef;
private int offerThroughput;
private static RetryOptions DEFAULT_INIT_RETRY_OPTIONS;
static {
DEFAULT_INIT_RETRY_OPTIONS = new RetryOptions();
DEFAULT_INIT_RETRY_OPTIONS.setMaxRetryAttemptsOnThrottledRequests(DEFAULT_RETRY_ATTEMPT_ON_THROTTLING_FOR_INIT);
DEFAULT_INIT_RETRY_OPTIONS.setMaxRetryWaitTimeInSeconds(DEFAULT_WAIT_TIME_ON_THROTTLING_FOR_INIT_IN_SECONDS);
}
private RetryOptions retryOptions = DEFAULT_INIT_RETRY_OPTIONS;
/**
* Configures the {@link DocumentClient} instance used to perform bulk operations against the target {@link DocumentCollection} at the specified allocated throughput.
* @param client an instance of {@link DocumentClient}
* @param databaseName name of the database
* @param collectionName name of the collection
* @param partitionKeyDef specifies the {@link PartitionKeyDefinition} of the collection
* @param offerThroughput specifies the throughput allocated for bulk operations out of the collection's total throughput
* @return an instance of {@link Builder}
*/
public Builder from(DocumentClient client,
String databaseName,
String collectionName,
PartitionKeyDefinition partitionKeyDef,
int offerThroughput) {
// TODO: validate the retry options for the client
this.client = client;
this.collectionLink = String.format("/dbs/%s/colls/%s", databaseName, collectionName);
this.partitionKeyDef = partitionKeyDef;
this.offerThroughput = offerThroughput;
return this;
}
/**
* Use the given size to configure max mini-batch size (specific to bulk import API).
*
* If not specified, the default value of 220200 bytes is used.
* @param size specifies the size of a mini-batch used in bulk import API.
* @return {@link Builder}
*/
public Builder withMaxMiniBatchSize(int size) {
Preconditions.checkArgument(size > 0, "maxMiniBatchSize must be positive");
Preconditions.checkArgument(size <= MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE, "maxMiniBatchSize cannot exceed the maximum sproc payload size");
this.maxMiniBatchSize = size;
return this;
}
/**
* Use the given count to configure max update mini-batch count (specific to bulk update API).
*
* If not specified, the default value of 500 is used.
* @param count specifies the maximum count of update items in a mini-batch used in the bulk update API.
* @return {@link Builder}
*/
public Builder withMaxUpdateMiniBatchCount(int count) {
Preconditions.checkArgument(count > 0, "maxUpdateMiniBatchCount must be positive");
this.maxUpdateMiniBatchCount = count;
return this;
}
/**
* Specifies the retry options used during initialization of {@link DocumentBulkExecutor}.
*
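* A minimal sketch (the retry limits below are illustrative, and {@code builder} stands for an already-configured {@link Builder}):
* <pre>
* {@code
* RetryOptions initRetryOptions = new RetryOptions();
* initRetryOptions.setMaxRetryAttemptsOnThrottledRequests(1000);
* initRetryOptions.setMaxRetryWaitTimeInSeconds(120);
* builder.withInitializationRetryOptions(initRetryOptions);
* }
* </pre>
*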
* @param options an instance of {@link RetryOptions}
* @return {@link Builder}
*/
public Builder withInitializationRetryOptions(RetryOptions options) {
this.retryOptions = options;
return this;
}
/**
* Instantiates {@link DocumentBulkExecutor} given the configured {@link Builder}.
*
* @return the newly instantiated instance of {@link DocumentBulkExecutor}
* @throws Exception if there is any failure
*/
public DocumentBulkExecutor build() throws Exception {
DocumentBulkExecutor executor = new DocumentBulkExecutor(client, collectionLink, partitionKeyDef, offerThroughput);
try {
executor.setInitializationRetryOptions(retryOptions);
executor.setMaxMiniBatchSize(maxMiniBatchSize);
executor.setMaxUpdateMiniBatchCount(maxUpdateMiniBatchCount);
executor.safeInit();
} catch (Exception e) {
executor.close();
throw e;
}
return executor;
}
private Builder() {}
}
/**
* Creates a new {@link DocumentBulkExecutor.Builder} instance
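* <p>
* A minimal usage sketch, assuming an existing {@link DocumentClient} and the collection's
* {@link PartitionKeyDefinition} (the database name, collection name and throughput value below are illustrative):
* <pre>
* {@code
* DocumentBulkExecutor executor = DocumentBulkExecutor.builder()
*     .from(client, "mydb", "mycol", partitionKeyDefinition, 20000)
*     .build();
* }
* </pre>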
* @return an instance of {@link DocumentBulkExecutor.Builder}
*/
public static DocumentBulkExecutor.Builder builder() {
return new DocumentBulkExecutor.Builder();
}
/**
* The name of the system stored procedure for bulk import.
*/
private final static String BULK_IMPORT_STORED_PROCECURE_NAME = "__.sys.commonBulkInsert";
/**
* The name of the stored procedure for bulk update.
*/
private final static String BULK_UPDATE_STORED_PROCECURE_NAME = "__.sys.bulkPatch";
/**
* The name of the stored procedure for bulk delete.
*/
private final static String BULK_DELETE_STORED_PROCECURE_NAME = "__.sys.commonDelete";
/**
* The maximum sproc payload size sent (a fraction of the ~2 MB request size limit).
*/
private final static int MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE = (2202010 * 5) / 10;
/**
* The fraction of the maximum sproc payload size up to which documents are allowed to fill a mini-batch.
*/
private final static double FRACTION_OF_MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE_ALLOWED = 0.20;
/**
* Additional sleep time in milliseconds applied when initialization is throttled.
*/
private final static int INITIALIZATION_SLEEP_TIME_ON_THROTTLING = 500;
/**
* The default max batch size for bulk delete operations
*/
private final static int DEFAULT_BULK_DELETE_BATCH_SIZE = 1000;
/**
* Default sleep time in milliseconds to wait prior to re-initializing the BulkExecutor and retrying previously failed batch(es).
*/
private final static int SLEEP_TIME_FOR_RETRY_POST_SPLIT_IN_MILLIS = 65 * 1000;
/**
* Maximum number of retries when split related failures are encountered
*/
private final static int MAX_RETRIES_ON_SPLIT_FAILURES = 10;
/**
* Logger
*/
private static final Logger logger = LoggerFactory.getLogger(DocumentBulkExecutor.class);
/**
* Degree of parallelism for each partition which was inferred from previous batch execution.
*/
private final Map<String, Integer> partitionKeyRangeIdToInferredDegreeOfParallelism = new ConcurrentHashMap<>();
/**
* Regex pattern for SQL query used to bulk delete documents
*/
private final static String SQL_QUERY_REGEX_PATTERN = "(?i)select\\s+\\*\\s+(?i)from\\s+(?<c>\\S+)\\s+(?i)where(?:\\s+(?<filter>.+))?";
private final static Pattern BULK_DELETE_QUERY_SPEC_PATTERN = Pattern.compile(SQL_QUERY_REGEX_PATTERN);
/**
* Executor Service
*/
private final ListeningExecutorService listeningExecutorService;
/**
* The DocumentDB client instance.
*/
private final DocumentClient client;
/**
* The document collection to which documents are to be bulk imported.
*/
private final String collectionLink;
/**
* Partition Key Definition of the underlying collection.
*/
private final PartitionKeyDefinition partitionKeyDefinition;
/**
* Partition Key Range Ids
*/
private List<String> partitionKeyRangeIds;
/**
* Collection routing map used to retrieve partition key range Ids of a given collection
*/
private CollectionRoutingMap collectionRoutingMap;
/**
* Bulk Import Stored Procedure Link relevant to the given collection
*/
private String bulkImportStoredProcLink;
/**
* Bulk Update Stored Procedure Link relevant to the given collection
*/
private String bulkUpdateStoredProcLink;
/**
* Bulk Delete Stored Procedure Link relevant to the given collection
*/
private String bulkDeleteStoredProcLink;
/**
* Collection offer throughput
*/
private int collectionThroughput;
/**
* Max Mini Batch Size
*/
private int maxMiniBatchSize;
/**
* Max Update Mini Batch Count
*/
private int maxUpdateMiniBatchCount;
private RetryOptions retryOptions;
private void setMaxMiniBatchSize(int size) {
this.maxMiniBatchSize = size;
}
private void setMaxUpdateMiniBatchCount(int count) {
this.maxUpdateMiniBatchCount = count;
}
private void setInitializationRetryOptions(RetryOptions options) {
this.retryOptions = options;
}
/**
* Initializes a new instance of {@link DocumentBulkExecutor}
*
* @param client {@link DocumentClient} instance to use
* @param collectionLink specifies the link to the target Azure Cosmos DB collection
* @param partitionKeyDefinition specifies the {@link PartitionKeyDefinition} of the collection
* @param collectionOfferThroughput specifies the throughput allocated for bulk operations out of the collection's total throughput
*/
private DocumentBulkExecutor(DocumentClient client,
String collectionLink,
PartitionKeyDefinition partitionKeyDefinition,
int collectionOfferThroughput) {
Preconditions.checkNotNull(client, "client cannot be null");
Preconditions.checkNotNull(partitionKeyDefinition, "partitionKeyDefinition cannot be null");
Preconditions.checkNotNull(collectionLink, "collectionLink cannot be null");
Preconditions.checkArgument(collectionOfferThroughput > 0, "collectionOfferThroughput must be positive");
this.client = client;
this.collectionLink = collectionLink;
this.collectionThroughput = collectionOfferThroughput;
this.partitionKeyDefinition = partitionKeyDefinition;
this.listeningExecutorService = MoreExecutors.listeningDecorator(Executors.newCachedThreadPool());
}
private void safeInit() throws Exception {
int count = 0;
long startTime = System.currentTimeMillis();
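// Retry initialize() while the service throttles with 429 (Too Many Requests), sleeping for the
// server-suggested retry-after delay scaled by the attempt count plus a fixed pad, until the retry
// attempt count or overall wait-time budget from the initialization RetryOptions is exhausted.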
while(true) {
try {
initialize();
break;
} catch (Exception e) {
count++;
DocumentClientException dce = ExceptionUtils.getThrottelingException(e);
long now = System.currentTimeMillis();
if (count < retryOptions.getMaxRetryAttemptsOnThrottledRequests()
&& now - startTime < (retryOptions.getMaxRetryWaitTimeInSeconds() * 1000)
&& dce != null
&& dce.getStatusCode() == HttpConstants.StatusCodes.TOO_MANY_REQUESTS ) {
Thread.sleep(count * dce.getRetryAfterInMilliseconds() + INITIALIZATION_SLEEP_TIME_ON_THROTTLING);
continue;
} else {
throw e;
}
}
}
}
/**
* Releases any internal resources.
* It is the responsibility of the caller to close the {@link DocumentClient}.
*/
@Override
public void close() {
// disable submission of new tasks
listeningExecutorService.shutdown();
try {
// wait for existing tasks to terminate
if (!listeningExecutorService.awaitTermination(60, TimeUnit.SECONDS)) {
// cancel any currently running executing tasks
listeningExecutorService.shutdownNow();
// wait for cancelled tasks to terminate
if (!listeningExecutorService.awaitTermination(60, TimeUnit.SECONDS)) {
logger.error("some tasks did not terminate");
}
}
} catch (InterruptedException e) {
listeningExecutorService.shutdownNow();
Thread.currentThread().interrupt();
}
}
/**
* Initializes {@link DocumentBulkExecutor}. This happens only once per executor instance.
* @throws DocumentClientException
*/
private void initialize() throws DocumentClientException {
logger.debug("Initializing ...");
this.bulkImportStoredProcLink = String.format("%s/sprocs/%s", collectionLink, BULK_IMPORT_STORED_PROCECURE_NAME);
this.bulkUpdateStoredProcLink = String.format("%s/sprocs/%s", collectionLink, BULK_UPDATE_STORED_PROCECURE_NAME);
this.bulkDeleteStoredProcLink = String.format("%s/sprocs/%s", collectionLink, BULK_DELETE_STORED_PROCECURE_NAME);
logger.debug("Fetching partition map of collection");
Range<String> fullRange = new Range<String>(
PartitionKeyInternalHelper.MinimumInclusiveEffectivePartitionKey,
PartitionKeyInternalHelper.MaximumExclusiveEffectivePartitionKey,
true,
false);
// this assumes database and collection already exists
try {
client.readCollection(collectionLink, null).getResource();
} catch (DocumentClientException ex) {
if (ex.getStatusCode() == 404) {
logger.error("Unable to read resource for collection link " + collectionLink);
}
throw ex;
}
this.collectionRoutingMap = getCollectionRoutingMap(client, this.collectionLink);
Collection<PartitionKeyRange> partitionKeyRanges = this.collectionRoutingMap.getOverlappingRanges(fullRange);
this.partitionKeyRangeIds = partitionKeyRanges.stream().map(partitionKeyRange -> partitionKeyRange.getId()).collect(Collectors.toList());
logger.debug("Initialization completed");
}
/**
* Executes a bulk import in the Azure Cosmos DB database service.
*
* The snippet below shows the overall usage pattern:
* <pre>
* {@code
* ConnectionPolicy connectionPolicy = new ConnectionPolicy();
* RetryOptions retryOptions = new RetryOptions();
*
* // Set client's retry options high for initialization
* retryOptions.setMaxRetryWaitTimeInSeconds(120);
* retryOptions.setMaxRetryAttemptsOnThrottledRequests(100);
* connectionPolicy.setRetryOptions(retryOptions);
* connectionPolicy.setMaxPoolSize(1000);
*
* DocumentClient client = new DocumentClient(HOST, MASTER_KEY, connectionPolicy, null);
*
* String collectionLink = String.format("/dbs/%s/colls/%s", "mydb", "mycol");
* DocumentCollection collection = client.readCollection(collectionLink, null).getResource();
*
* DocumentBulkExecutor executor = DocumentBulkExecutor.builder().from(client, "mydb", "mycol",
* collection.getPartitionKey(), collectionOfferThroughput).build();
*
* // Set retries to 0 to pass control to bulk executor
* client.getConnectionPolicy().getRetryOptions().setMaxRetryWaitTimeInSeconds(0);
* client.getConnectionPolicy().getRetryOptions().setMaxRetryAttemptsOnThrottledRequests(0);
*
* for(int i = 0; i < 10; i++) {
* List<String> documents = documentSource.getMoreDocuments();
*
* BulkImportResponse bulkImportResponse = executor.importAll(documents, false, true, 40);
*
* // Validate that all documents inserted to ensure no failure.
* if (bulkImportResponse.getNumberOfDocumentsImported() < documents.size()) {
* for(Exception e: bulkImportResponse.getErrors()) {
* // Validate why there were some failures.
* e.printStackTrace();
* }
* break;
* }
* }
*
* executor.close();
* client.close();
* }
* </pre>
*
* @param documents specifies the collection of JSON-serialized documents to import
* @param isUpsert indicates whether a document in the supplied collection needs to be overwritten if the id already exists
* @param disableAutomaticIdGeneration indicates whether the id has to be automatically generated for a document if absent in the supplied collection
* @param maxConcurrencyPerPartitionRange specifies the maximum degree of concurrency per partition key range (default value is 20 if set to null)
* @return an instance of {@link BulkImportResponse}
* @throws DocumentClientException if any failure happens
*/
public BulkImportResponse importAll(Collection<String> documents,
boolean isUpsert,
boolean disableAutomaticIdGeneration,
Integer maxConcurrencyPerPartitionRange) throws DocumentClientException {
return executeBulkImportInternal(documents,
isUpsert,
disableAutomaticIdGeneration,
maxConcurrencyPerPartitionRange,
0,
0,
0);
}
/**
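* Executes a bulk import in the Azure Cosmos DB database service, exposing additional tuning parameters
* for mini-batch size, task parallelism, and a per-partition RU write budget.
* <p>
* A minimal call sketch (the values below are illustrative; passing 0 for the three tuning parameters
* mirrors what the simpler {@link #importAll(Collection, boolean, boolean, Integer)} overload does):
* <pre>
* {@code
* BulkImportResponse response = executor.importAll(documents, false, true, 40, 0, 0, 0);
* }
* </pre>
*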
* @param documents specifies the collection of JSON-serialized documents to import
* @param isUpsert indicates whether a document in the supplied collection needs to be overwritten if the id already exists
* @param disableAutomaticIdGeneration indicates whether the id has to be automatically generated for a document if absent in the supplied collection
* @param maxConcurrencyPerPartitionRange specifies the maximum degree of concurrency per partition key range (default value is 20 if set to null)
* @param effectiveMaxMiniBatchImportSize specifies the maximum size of mini batch imports
* @param numberOfParallelTasks specifies the number of parallel ingestion tasks
* @param writeThroughputBudgetPerCosmosPartition specifies the user-provided RU budget that is split across each Cosmos DB physical partition
* @return an instance of {@link BulkImportResponse}
* @throws DocumentClientException if any failure happens
*/
public BulkImportResponse importAll(Collection<String> documents,
boolean isUpsert,
boolean disableAutomaticIdGeneration,
Integer maxConcurrencyPerPartitionRange,
Integer effectiveMaxMiniBatchImportSize,
Integer numberOfParallelTasks,
Integer writeThroughputBudgetPerCosmosPartition) throws DocumentClientException {
return executeBulkImportInternal(documents,
isUpsert,
disableAutomaticIdGeneration,
maxConcurrencyPerPartitionRange,
effectiveMaxMiniBatchImportSize,
numberOfParallelTasks,
writeThroughputBudgetPerCosmosPartition);
}
/**
* Executes a bulk update in the Azure Cosmos DB database service.
*
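* <p>
* A minimal sketch of preparing update items (the id, partition key value and field names below are
* illustrative; the UpdateItem and SetUpdateOperation shapes follow the bulk executor's public update types):
* <pre>
* {@code
* List<UpdateOperationBase> operations = new ArrayList<>();
* operations.add(new SetUpdateOperation<>("status", "shipped"));
* UpdateItem item = new UpdateItem("order-1", "customer-42", operations);
* BulkUpdateResponse response = executor.updateAll(Collections.singletonList(item), null);
* }
* </pre>
*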
* @param updateItems specifies the collection of update items each of which comprises the list of field update operations to be performed
* on a document identified by an id and partition key value.
* @param maxConcurrencyPerPartitionRange specifies the maximum degree of concurrency per partition key range (default value is 20 if set to null)
* @return an instance of {@link BulkUpdateResponse}
* @throws DocumentClientException if any failure happens
*/
public BulkUpdateResponse updateAll(Collection<UpdateItem> updateItems,
Integer maxConcurrencyPerPartitionRange) throws DocumentClientException {
return executeBulkUpdateInternal(updateItems, maxConcurrencyPerPartitionRange);
}
/**
* Executes a bulk update in the Azure Cosmos DB database service with given set of patch documents.
*
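* <p>
* A minimal sketch of a patch document (the id, partition key property and field values below are illustrative):
* <pre>
* {@code
* Document patch = new Document();
* patch.setId("order-1");
* patch.set("customerId", "customer-42"); // partition key value of the target document
* patch.set("status", "shipped");         // field to set on the target document
* BulkUpdateResponse response = executor.mergeAll(Collections.singletonList(patch), null);
* }
* </pre>
*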
* @param patchDocuments which are documents comprising id, partition key values and fields to set with the corresponding values
* @param maxConcurrencyPerPartitionRange specifies the maximum degree of concurrency per partition key range (default value is 20 if set to null)
* @return an instance of {@link BulkUpdateResponse}
* @throws DocumentClientException if any failure happens
*/
public BulkUpdateResponse mergeAll(Collection<Document> patchDocuments,
Integer maxConcurrencyPerPartitionRange) throws DocumentClientException {
return executeBulkUpdateWithPatchInternal(patchDocuments, maxConcurrencyPerPartitionRange);
}
/**
* Executes a bulk delete in the Azure Cosmos DB database service.
*
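* <p>
* A minimal sketch (the partition key and id values below are illustrative):
* <pre>
* {@code
* List<Pair<String, String>> pkIdPairs = new ArrayList<>();
* pkIdPairs.add(new ImmutablePair<>("customer-42", "order-1"));
* BulkDeleteResponse response = executor.deleteAll(pkIdPairs);
* }
* </pre>
*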
* @param pkIdPairsToDelete List of pairs of partition key and id values of documents to delete
* @return an instance of {@link BulkDeleteResponse}
* @throws DocumentClientException if any failure happens
*/
public BulkDeleteResponse deleteAll(List<Pair<String, String>> pkIdPairsToDelete) throws DocumentClientException {
return executeBulkDeleteInternalPkRowKeys(pkIdPairsToDelete);
}
@SuppressWarnings("unused")
private BulkUpdateResponse updateDocument(String partitionKey, String id, List<UpdateOperationBase> updateOperations) throws DocumentClientException {
return executeUpdateDocumentInternal(partitionKey, id, updateOperations);
}
private BulkImportResponse executeBulkImportInternal(Collection<String> input,
boolean isUpsert,
boolean disableAutomaticIdGeneration,
Integer maxConcurrencyPerPartitionRange,
Integer effectiveMaxMiniBatchImportSize,
Integer numberOfParallelTasks,
Integer writeThroughputBudgetPerCosmosPartition) throws DocumentClientException {
Preconditions.checkNotNull(input, "document collection cannot be null");
try {
Collection<String> documentsToInsertOrRetry = new ArrayList<>(input);
Collection<String> documentsFailedToImportDueToSplits;
List failedImports = new ArrayList<>();
int numRetriesDueToSplits = 0;
List failures = new ArrayList<>();
List