/*
* The MIT License (MIT)
* Copyright (c) 2017 Microsoft Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package com.microsoft.azure.documentdb.bulkexecutor.internal;
import static com.microsoft.azure.documentdb.bulkexecutor.internal.ExceptionUtils.isGone;
import static com.microsoft.azure.documentdb.bulkexecutor.internal.ExceptionUtils.isSplit;
import static com.microsoft.azure.documentdb.bulkexecutor.internal.ExceptionUtils.isThrottled;
import static com.microsoft.azure.documentdb.bulkexecutor.internal.ExceptionUtils.isTimedOut;
import static com.microsoft.azure.documentdb.bulkexecutor.internal.ExceptionUtils.isUnavailable;
import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.google.common.base.Stopwatch;
import com.google.common.util.concurrent.AtomicDouble;
import com.microsoft.azure.documentdb.DocumentClient;
import com.microsoft.azure.documentdb.DocumentClientException;
import com.microsoft.azure.documentdb.RequestOptions;
import com.microsoft.azure.documentdb.StoredProcedureResponse;
import com.microsoft.azure.documentdb.bulkexecutor.BulkImportFailure;
import com.microsoft.azure.documentdb.internal.HttpConstants;
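/**
* Bulk-inserts the mini-batches assigned to a single partition key range by invoking the system
* bulk import stored procedure, tracking the number of documents imported, the request units
* consumed, documents with bad input, and documents that must be retried after a partition split.
*
* A minimal usage sketch (illustrative only; the executor service and variable names below are
* assumptions, not part of this class):
*
*   BatchInserter inserter = new BatchInserter(pkRangeId, miniBatches, client, sprocLink,
*       options, numberOfParallelTasks, writeThroughputBudgetPerCosmosPartition);
*   ExecutorService pool = Executors.newFixedThreadPool(numberOfParallelTasks);
*   Iterator<Callable<OperationMetrics>> tasks = inserter.miniBatchExecutionCallableIterator();
*   while (tasks.hasNext()) {
*       pool.submit(tasks.next());
*   }
*/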
public class BatchInserter extends BatchOperator {
/**
* The count of documents bulk inserted by this batch inserter.
*/
public AtomicInteger numberOfDocumentsImported;
/**
* The total request units consumed by this batch inserter.
*/
public AtomicDouble totalRequestUnitsConsumed;
/**
* The list of documents which had a bad input format and caused a failure.
*/
public List<Object> badInputDocuments;
/**
* The list of mini-batches this batch inserter is responsible for importing.
*/
private final List<List<String>> batchesToInsert;
/**
* The link to the system bulk import stored procedure.
*/
private final String bulkImportSprocLink;
/**
* The list of inserts which failed and need to be retried due to physical partition(s) being split.
*/
private List<String> documentsFailedToImportDueToSplits;
/**
* The list of failures encountered during the bulk import execution of the mini-batches.
*/
private List<BulkImportFailure> documentsFailedToImport;
/**
* The options passed to the system bulk import stored procedure.
*/
private final BulkImportStoredProcedureOptions storedProcOptions;
/**
* The maximum number of retries when the bulk import times out.
*/
private final int maxRetryCountOnTimeouts = 5;
/**
* The number of parallel ingestion tasks.
*/
private final int numberOfParallelTasks;
/**
* The write throughput budget allocated per Cosmos DB partition.
*/
private final int writeThroughputBudgetPerCosmosPartition;
/**
* The logger instance.
*/
private final Logger logger = LoggerFactory.getLogger(BatchInserter.class);
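/**
* Creates a batch inserter bound to a single partition key range.
*
* @param partitionKeyRangeId the partition key range targeted by all mini-batches of this inserter
* @param batchesToInsert the mini-batches of serialized documents to import
* @param client the document client used to execute the bulk import stored procedure
* @param bulkImportSprocLink the link to the system bulk import stored procedure
* @param options the options passed to the stored procedure
* @param numberOfParallelTasks the number of parallel ingestion tasks
* @param writeThroughputBudgetPerCosmosPartition the write throughput budget allocated per Cosmos DB partition; pacing is applied only when this is greater than zero
*/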
public BatchInserter(String partitionKeyRangeId,
List<List<String>> batchesToInsert,
DocumentClient client,
String bulkImportSprocLink,
BulkImportStoredProcedureOptions options,
Integer numberOfParallelTasks,
Integer writeThroughputBudgetPerCosmosPartition) {
this.partitionKeyRangeId = partitionKeyRangeId;
this.batchesToInsert = batchesToInsert;
this.client = client;
this.bulkImportSprocLink = bulkImportSprocLink;
this.storedProcOptions = options;
this.numberOfDocumentsImported = new AtomicInteger();
this.totalRequestUnitsConsumed = new AtomicDouble();
this.badInputDocuments = Collections.synchronizedList(new ArrayList<>());
this.documentsFailedToImportDueToSplits = Collections.synchronizedList(new ArrayList<>());
this.documentsFailedToImport = Collections.synchronizedList(new ArrayList<>());
this.numberOfParallelTasks = numberOfParallelTasks;
this.writeThroughputBudgetPerCosmosPartition = writeThroughputBudgetPerCosmosPartition;
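// Local RequestOptions subclass whose only purpose is to pin stored procedure execution
// to this inserter's partition key range via the SDK's partition key range id setter.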
class RequestOptionsInternal extends RequestOptions {
RequestOptionsInternal(String partitionKeyRangeId) {
setPartitionKeyRengeId(partitionKeyRangeId);
}
}
this.requestOptions = new RequestOptionsInternal(partitionKeyRangeId);
}
public int getNumberOfDocumentsImported() {
return numberOfDocumentsImported.get();
}
public double getTotalRequestUnitsConsumed() {
return totalRequestUnitsConsumed.get();
}
public List<Object> getBadInputDocuments() {
return badInputDocuments;
}
public List<String> getDocumentsFailedToImportDueToSplits() {
return documentsFailedToImportDueToSplits;
}
public List<BulkImportFailure> getDocumentsFailedToImport() {
return documentsFailedToImport;
}
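/**
* Returns one {@link Callable} per mini-batch. Each callable repeatedly invokes the bulk import
* stored procedure until the whole mini-batch has been imported, a non-retriable failure has been
* recorded, or the import is cancelled (for example because of a partition split), and then
* returns the {@link OperationMetrics} observed for that mini-batch.
*/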
public Iterator<Callable<OperationMetrics>> miniBatchExecutionCallableIterator() {
Stream<Callable<OperationMetrics>> stream = batchesToInsert.stream().map(miniBatch -> {
return new Callable<OperationMetrics>() {
@Override
public OperationMetrics call() {
Stopwatch stopwatch = Stopwatch.createStarted();
double requestUnitsConsumed = 0;
int numberOfThrottles = 0;
int currentDocumentIndex = 0;
try {
logger.debug("pki {} importing mini batch started", partitionKeyRangeId);
int numberOfTimeouts = 0;
StoredProcedureResponse response = null;
boolean timedOut = false;
String[] docBatch = null;
if (cancel) {
// A previous micro batch has triggered a cancellation already
// So we need to include the documents of subsequent micro batches
// in the retries if no failure is bubbled up to the application layer
docBatch = miniBatch.subList(currentDocumentIndex, miniBatch.size()).toArray(new String[0]);
documentsFailedToImportDueToSplits.addAll(Arrays.asList(docBatch));
String msg = "Import has been cancelled for partition key range: " +
partitionKeyRangeId +
" - adding documents to the failedToImportDueToSplit collection to trigger retry.";
logger.warn(msg);
}
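// Keep invoking the stored procedure until the whole mini-batch is imported or a
// failure/cancellation stops this partition key range; each attempt resumes from the
// first document that has not been imported yet.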
while (currentDocumentIndex < miniBatch.size() && !cancel) {
logger.debug("pki {} inside for loop, currentDocumentIndex", partitionKeyRangeId, currentDocumentIndex);
docBatch = miniBatch.subList(currentDocumentIndex, miniBatch.size()).toArray(new String[0]);
boolean isThrottled = false;
Duration retryAfter = Duration.ZERO;
Stopwatch responseWatch = Stopwatch.createStarted();
try {
logger.debug("pki {}, Trying to import minibatch of {} documents", partitionKeyRangeId, docBatch.length);
if (!timedOut) {
response = client.executeStoredProcedure(bulkImportSprocLink, requestOptions, new Object[] { docBatch, storedProcOptions, null });
} else {
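// A prior attempt timed out, so some of these documents may already have been written;
// retry with adjusted stored procedure options (the extra flag presumably lets the sproc
// tolerate documents that were persisted before the timeout).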
BulkImportStoredProcedureOptions modifiedStoredProcOptions = new BulkImportStoredProcedureOptions(
storedProcOptions.disableAutomaticIdGeneration,
storedProcOptions.softStopOnConflict,
storedProcOptions.systemCollectionId,
storedProcOptions.enableBsonSchema,
true,
storedProcOptions.softStopOnBadRequest);
response = client.executeStoredProcedure(
bulkImportSprocLink, requestOptions,
new Object[] { docBatch, modifiedStoredProcOptions, null });
}
BulkImportStoredProcedureResponse bulkImportResponse = parseFrom(response);
if (bulkImportResponse != null) {
if (bulkImportResponse.errorCode != 0) {
logger.warn("pki {} Received response error code {}", partitionKeyRangeId, bulkImportResponse.errorCode);
if (bulkImportResponse.errorCode == HttpConstants.StatusCodes.BADREQUEST) {
badInputDocuments.add(bulkImportResponse.failedDoc);
currentDocumentIndex += 1; // cross the index of the failing document
}
else if (bulkImportResponse.count == 0) {
BulkImportFailure bulkImportFailure = new BulkImportFailure();
bulkImportFailure.getDocumentsFailedToImport().addAll(Arrays.asList(docBatch));
bulkImportFailure.setBulkImportFailureException(new RuntimeException(
String.format("Stored proc returned failure %s", bulkImportResponse.errorCode)));
documentsFailedToImport.add(bulkImportFailure);
cancel = true;
}
}
double requestCharge = response.getRequestCharge();
currentDocumentIndex += bulkImportResponse.count;
numberOfDocumentsImported.addAndGet(bulkImportResponse.count);
requestUnitsConsumed += requestCharge;
totalRequestUnitsConsumed.addAndGet(requestCharge);
}
else {
logger.warn("pki {} Failed to receive response", partitionKeyRangeId);
}
} catch (DocumentClientException e) {
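// Classify the failure: throttles back off and retry, timeouts retry a bounded number of
// times, service-unavailable and gone/split responses stash the remaining documents for a
// later retry, and anything else is recorded as a permanent import failure.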
logger.debug("pki {} Importing minibatch failed", partitionKeyRangeId, e);
if (isThrottled(e)) {
logger.debug("pki {} Throttled on partition range id", partitionKeyRangeId);
numberOfThrottles++;
isThrottled = true;
retryAfter = Duration.ofMillis(e.getRetryAfterInMilliseconds());
// will retry again
} else if (isTimedOut(e)) {
logger.debug("pki {} Request timed out", partitionKeyRangeId);
// will retry a finite number of times
if(numberOfTimeouts < maxRetryCountOnTimeouts) {
timedOut = true;
numberOfTimeouts++;
} else {
BulkImportFailure bulkImportFailure = new BulkImportFailure();
bulkImportFailure.getDocumentsFailedToImport().addAll(Arrays.asList(docBatch));
bulkImportFailure.setBulkImportFailureException(e);
documentsFailedToImport.add(bulkImportFailure);
cancel = true;
}
} else if (isUnavailable(e)) {
logger.debug("pki {} Service unavailable", partitionKeyRangeId);
// will retry again
documentsFailedToImportDueToSplits.addAll(Arrays.asList(docBatch));
logger.warn(
"Received Service unavailable exception when importing a mini-batch for partition key range: " +
partitionKeyRangeId +
". This mini-batch will be retried on the next invocation.");
logger.debug("ServiceUnavailable. Original exception message was: {} ", e.getMessage());
cancel = true;
}
else if (isGone(e)) {
// In the case of a gone exception for a partition, in particular due to splits, store the inserts to retry after re-initializing the Bulk Executor instance
if (isSplit(e)) {
documentsFailedToImportDueToSplits.addAll(Arrays.asList(docBatch));
logger.warn(
"Received a GoneException on Partition range id " +
partitionKeyRangeId +
" as the partition was completing a split | Storing the mini batch and retrying");
logger.debug("GoneException due to split. Original exception message was: {} ", e.getMessage());
} else {
documentsFailedToImportDueToSplits.addAll(Arrays.asList(docBatch));
logger.warn(
"Received a GoneException on Partition range id " +
partitionKeyRangeId +
" | Storing the mini batch and retrying");
logger.debug("GonException - general. Original exception message was: {} ", e.getMessage());
}
cancel = true;
} else {
// there is no value in retrying
String errorMessage = String.format("pki %s failed to import mini-batch. Exception was %s. Status code was %s",
partitionKeyRangeId,
e.getMessage(),
e.getStatusCode());
logger.error(errorMessage, e);
BulkImportFailure bulkImportFailure = new BulkImportFailure();
bulkImportFailure.getDocumentsFailedToImport().addAll(Arrays.asList(docBatch));
bulkImportFailure.setBulkImportFailureException(e);
documentsFailedToImport.add(bulkImportFailure);
cancel = true;
}
} catch (IllegalStateException e) {
documentsFailedToImportDueToSplits.addAll(Arrays.asList(docBatch));
logger.warn(
"Received IllegalStateException since partition key range: " +
partitionKeyRangeId +
" was split or Gone. | Storing the mini batch and retrying");
cancel = true;
} catch (Exception e) {
String errorMessage = String.format("pki %s Failed to import mini-batch. Exception was %s", partitionKeyRangeId,
e.getMessage());
logger.error(errorMessage, e);
BulkImportFailure bulkImportFailure = new BulkImportFailure();
bulkImportFailure.getDocumentsFailedToImport().addAll(Arrays.asList(docBatch));
bulkImportFailure.setBulkImportFailureException(new RuntimeException(errorMessage, e));
documentsFailedToImport.add(bulkImportFailure);
cancel = true;
}
responseWatch.stop();
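// Pace this task so that RU consumption across all parallel tasks stays within the
// configured per-partition write throughput budget: the observed latency is scaled by the
// budget-to-consumption ratio and the task sleeps out whatever remains of a one-second window.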
if (writeThroughputBudgetPerCosmosPartition > 0) {
double throughputBudgetAdjustmentFactor = (requestUnitsConsumed * numberOfParallelTasks) / writeThroughputBudgetPerCosmosPartition;
double adjustedElapsedTime = responseWatch.elapsed(TimeUnit.MILLISECONDS) / throughputBudgetAdjustmentFactor;
double sleepTime = 1000 - adjustedElapsedTime;
Duration sleepDuration = Duration.ofMillis((long)(sleepTime));
if (sleepTime > 0) {
logger.debug("Going to sleep for {} millis to limit the RU consumption as per the provided writeThroughputBudget", sleepDuration.toMillis());
Thread.sleep(sleepDuration.toMillis());
}
}
if (isThrottled) {
logger.debug("pki {}: Minibatch import of {} docs took {} ms - throttled with retry after of {} ms",
partitionKeyRangeId, docBatch.length, responseWatch.elapsed(TimeUnit.MILLISECONDS), retryAfter.toMillis());
try {
logger.debug("pki {} throttled going to sleep for {} millis ", partitionKeyRangeId, retryAfter.toMillis());
Thread.sleep(retryAfter.toMillis());
} catch (InterruptedException e) {
BulkImportFailure bulkImportFailure = new BulkImportFailure();
bulkImportFailure.getDocumentsFailedToImport().addAll(Arrays.asList(docBatch));
bulkImportFailure.setBulkImportFailureException(new RuntimeException(e));
documentsFailedToImport.add(bulkImportFailure);
cancel = true;
}
}
else {
logger.debug("pki {}: Minibatch import of {} docs took {} ms, consumed {} RU - not throttled",
partitionKeyRangeId, docBatch.length, responseWatch.elapsed(TimeUnit.MILLISECONDS), (response == null ? 0 : response.getRequestCharge()));
}
}
logger.debug("pki {} completed", partitionKeyRangeId);
} catch (Exception e) {
BulkImportFailure bulkImportFailure = new BulkImportFailure();
bulkImportFailure.getDocumentsFailedToImport().addAll(miniBatch);
bulkImportFailure.setBulkImportFailureException(e);
documentsFailedToImport.add(bulkImportFailure);
cancel = true;
}
stopwatch.stop();
OperationMetrics insertMetrics = new OperationMetrics(currentDocumentIndex, stopwatch.elapsed(), requestUnitsConsumed, numberOfThrottles);
return insertMetrics;
}
};
});
return stream.iterator();
}
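/**
* Deserializes the stored procedure response body into a {@link BulkImportStoredProcedureResponse},
* or returns null when the response body is empty.
*/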
private BulkImportStoredProcedureResponse parseFrom(StoredProcedureResponse storedProcResponse) throws JsonParseException, JsonMappingException, IOException {
String res = storedProcResponse.getResponseAsString();
logger.debug("MiniBatch Insertion for Partition Key Range Id {}: Stored Proc Response as String {}", partitionKeyRangeId, res);
if (StringUtils.isEmpty(res))
return null;
return objectMapper.readValue(res, BulkImportStoredProcedureResponse.class);
}
}