/*
* The MIT License (MIT)
* Copyright (c) 2017 Microsoft Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package com.microsoft.azure.documentdb.bulkexecutor;
import java.nio.charset.Charset;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.microsoft.azure.documentdb.internal.routing.PartitionKeyInternalHelper;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.util.concurrent.AsyncCallable;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.Futures.FutureCombiner;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import com.microsoft.azure.documentdb.Document;
import com.microsoft.azure.documentdb.DocumentClient;
import com.microsoft.azure.documentdb.DocumentClientException;
import com.microsoft.azure.documentdb.DocumentCollection;
import com.microsoft.azure.documentdb.Error;
import com.microsoft.azure.documentdb.FeedOptions;
import com.microsoft.azure.documentdb.FeedResponse;
import com.microsoft.azure.documentdb.PartitionKeyDefinition;
import com.microsoft.azure.documentdb.PartitionKeyRange;
import com.microsoft.azure.documentdb.RequestOptions;
import com.microsoft.azure.documentdb.RetryOptions;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BatchDeleter;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BatchInserter;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BatchUpdater;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BulkDeleteQuerySpec;
import com.microsoft.azure.documentdb.bulkexecutor.internal.BulkImportStoredProcedureOptions;
import com.microsoft.azure.documentdb.bulkexecutor.internal.CongestionController;
import com.microsoft.azure.documentdb.bulkexecutor.internal.DocumentAnalyzer;
import com.microsoft.azure.documentdb.bulkexecutor.internal.ExceptionUtils;
import com.microsoft.azure.documentdb.internal.HttpConstants;
import com.microsoft.azure.documentdb.internal.routing.CollectionRoutingMap;
import com.microsoft.azure.documentdb.internal.routing.InMemoryCollectionRoutingMap;
import com.microsoft.azure.documentdb.internal.routing.PartitionKeyInternal;
import com.microsoft.azure.documentdb.internal.routing.Range;
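/**
* Provides bulk import ({@link #importAll}), bulk update ({@link #updateAll} and {@link #mergeAll}) and
* bulk delete ({@link #deleteAll}) operations against a single Azure Cosmos DB collection by invoking the
* system bulk stored procedures across the collection's partition key ranges. Instances are created through
* {@link #builder()} and should be closed via {@link #close()} when no longer needed.
*/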
public class DocumentBulkExecutor implements AutoCloseable {
public static class Builder {
private DocumentClient client;
private String collectionLink;
private int maxMiniBatchSize = (int) Math.floor(MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE * FRACTION_OF_MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE_ALLOWED);
private int maxUpdateMiniBatchCount = 500;
private final static int DEFAULT_RETRY_ATTEMPT_ON_THROTTLING_FOR_INIT = 200;
private final static int DEFAULT_WAIT_TIME_ON_THROTTLING_FOR_INIT_IN_SECONDS = 60;
private PartitionKeyDefinition partitionKeyDef;
private int offerThroughput;
private static RetryOptions DEFAULT_INIT_RETRY_OPTIONS;
static {
DEFAULT_INIT_RETRY_OPTIONS = new RetryOptions();
DEFAULT_INIT_RETRY_OPTIONS.setMaxRetryAttemptsOnThrottledRequests(DEFAULT_RETRY_ATTEMPT_ON_THROTTLING_FOR_INIT);
DEFAULT_INIT_RETRY_OPTIONS.setMaxRetryWaitTimeInSeconds(DEFAULT_WAIT_TIME_ON_THROTTLING_FOR_INIT_IN_SECONDS);
}
private RetryOptions retryOptions = DEFAULT_INIT_RETRY_OPTIONS;
/**
* Configures the {@link DocumentClient} instance used to perform bulk operations against the target {@link DocumentCollection} at the specified allocated throughput.
* @param client an instance of {@link DocumentClient}
* @param databaseName name of the database
* @param collectionName name of the collection
* @param partitionKeyDef specifies the {@link PartitionKeyDefinition} of the collection
* @param offerThroughput specifies the throughput allocated for bulk operations out of the collection's total throughput
* @return an instance of {@link Builder}
*/
public Builder from(DocumentClient client,
String databaseName,
String collectionName,
PartitionKeyDefinition partitionKeyDef,
int offerThroughput) {
// TODO: validate the retry options for the client
this.client = client;
this.collectionLink = String.format("/dbs/%s/colls/%s", databaseName, collectionName);
this.partitionKeyDef = partitionKeyDef;
this.offerThroughput = offerThroughput;
return this;
}
/**
* Use the given size to configure max mini-batch size (specific to bulk import API).
*
* If not specified, the default value of 220200 bytes is used.
* @param size specifies the size of a mini-batch used in bulk import API.
* @return {@link Builder}
*/
public Builder withMaxMiniBatchSize(int size) {
Preconditions.checkArgument(size > 0, "maxMiniBatchSize must be positive");
Preconditions.checkArgument(size <= MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE, "maxMiniBatchSize cannot exceed the maximum sproc payload size");
this.maxMiniBatchSize = size;
return this;
}
/**
* Use the given count to configure max update mini-batch count (specific to bulk update API).
*
* If not specified, the default value of 500 is used.
* @param count specifies the maximum count of update items in a mini-batch used in the bulk update API.
* @return {@link Builder}
*/
public Builder withMaxUpdateMiniBatchCount(int count) {
Preconditions.checkArgument(count > 0, "maxUpdateMiniBatchCount must be positive");
this.maxUpdateMiniBatchCount = count;
return this;
}
/**
* Specifies the retry options used during initialization of {@link DocumentBulkExecutor}.
*
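* A minimal sketch (the retry limits below are illustrative, and {@code builder} stands for an already-configured {@link Builder}):
* <pre>
* {@code
* RetryOptions initRetryOptions = new RetryOptions();
* initRetryOptions.setMaxRetryAttemptsOnThrottledRequests(1000);
* initRetryOptions.setMaxRetryWaitTimeInSeconds(120);
* builder.withInitializationRetryOptions(initRetryOptions);
* }
* </pre>
*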
* @param options an instance of {@link RetryOptions}
* @return {@link Builder}
*/
public Builder withInitializationRetryOptions(RetryOptions options) {
this.retryOptions = options;
return this;
}
/**
* Instantiates {@link DocumentBulkExecutor} given the configured {@link Builder}.
*
* @return the newly instantiated instance of {@link DocumentBulkExecutor}
* @throws Exception if there is any failure
*/
public DocumentBulkExecutor build() throws Exception {
DocumentBulkExecutor executor = new DocumentBulkExecutor(client, collectionLink, partitionKeyDef, offerThroughput);
try {
executor.setInitializationRetryOptions(retryOptions);
executor.setMaxMiniBatchSize(maxMiniBatchSize);
executor.setMaxUpdateMiniBatchCount(maxUpdateMiniBatchCount);
executor.safeInit();
} catch (Exception e) {
executor.close();
throw e;
}
return executor;
}
private Builder() {}
}
/**
* Creates a new {@link DocumentBulkExecutor.Builder} instance
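* <p>
* A minimal usage sketch, assuming an existing {@link DocumentClient} and the collection's
* {@link PartitionKeyDefinition} (the database name, collection name and throughput value below are illustrative):
* <pre>
* {@code
* DocumentBulkExecutor executor = DocumentBulkExecutor.builder()
*     .from(client, "mydb", "mycol", partitionKeyDefinition, 20000)
*     .build();
* }
* </pre>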
* @return an instance of {@link DocumentBulkExecutor.Builder}
*/
public static DocumentBulkExecutor.Builder builder() {
return new DocumentBulkExecutor.Builder();
}
/**
* The name of the system stored procedure for bulk import.
*/
private final static String BULK_IMPORT_STORED_PROCECURE_NAME = "__.sys.commonBulkInsert";
/**
* The name of the stored procedure for bulk update.
*/
private final static String BULK_UPDATE_STORED_PROCECURE_NAME = "__.sys.bulkPatch";
/**
* The name of the stored procedure for bulk delete.
*/
private final static String BULK_DELETE_STORED_PROCECURE_NAME = "__.sys.commonDelete";
/**
* The maximum sproc payload size sent (a fraction of the ~2 MB request size limit).
*/
private final static int MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE = (2202010 * 5) / 10;
/**
* The fraction of the maximum sproc payload size up to which documents are allowed to fill a mini-batch.
*/
private final static double FRACTION_OF_MAX_BULK_IMPORT_SCRIPT_INPUT_SIZE_ALLOWED = 0.20;
/**
* Additional sleep time in milliseconds applied when initialization is throttled.
*/
private final static int INITIALIZATION_SLEEP_TIME_ON_THROTTLING = 500;
/**
* The default max batch size for bulk delete operations
*/
private final static int DEFAULT_BULK_DELETE_BATCH_SIZE = 1000;
/**
* Default sleep time in milliseconds to wait prior to re-initializing the BulkExecutor and retrying previously failed batch(es).
*/
private final static int SLEEP_TIME_FOR_RETRY_POST_SPLIT_IN_MILLIS = 65 * 1000;
/**
* Maximum number of retries when split related failures are encountered
*/
private final static int MAX_RETRIES_ON_SPLIT_FAILURES = 10;
/**
* Logger
*/
private static final Logger logger = LoggerFactory.getLogger(DocumentBulkExecutor.class);
/**
* Degree of parallelism for each partition which was inferred from previous batch execution.
*/
private final Map<String, Integer> partitionKeyRangeIdToInferredDegreeOfParallelism = new ConcurrentHashMap<>();
/**
* Regex pattern for SQL query used to bulk delete documents
*/
private final static String SQL_QUERY_REGEX_PATTERN = "(?i)select\\s+\\*\\s+(?i)from\\s+(?<c>\\S+)\\s+(?i)where(?:\\s+(?<filter>.+))?";
private final static Pattern BULK_DELETE_QUERY_SPEC_PATTERN = Pattern.compile(SQL_QUERY_REGEX_PATTERN);
/**
* Executor Service
*/
private final ListeningExecutorService listeningExecutorService;
/**
* The DocumentDB client instance.
*/
private final DocumentClient client;
/**
* The document collection to which documents are to be bulk imported.
*/
private final String collectionLink;
/**
* Partition Key Definition of the underlying collection.
*/
private final PartitionKeyDefinition partitionKeyDefinition;
/**
* Partition Key Range Ids
*/
private List<String> partitionKeyRangeIds;
/**
* Collection routing map used to retrieve partition key range Ids of a given collection
*/
private CollectionRoutingMap collectionRoutingMap;
/**
* Bulk Import Stored Procedure Link relevant to the given collection
*/
private String bulkImportStoredProcLink;
/**
* Bulk Update Stored Procedure Link relevant to the given collection
*/
private String bulkUpdateStoredProcLink;
/**
* Bulk Delete Stored Procedure Link relevant to the given collection
*/
private String bulkDeleteStoredProcLink;
/**
* Collection offer throughput
*/
private int collectionThroughput;
/**
* Max Mini Batch Size
*/
private int maxMiniBatchSize;
/**
* Max Update Mini Batch Count
*/
private int maxUpdateMiniBatchCount;
private RetryOptions retryOptions;
private void setMaxMiniBatchSize(int size) {
this.maxMiniBatchSize = size;
}
private void setMaxUpdateMiniBatchCount(int count) {
this.maxUpdateMiniBatchCount = count;
}
private void setInitializationRetryOptions(RetryOptions options) {
this.retryOptions = options;
}
/**
* Initializes a new instance of {@link DocumentBulkExecutor}
*
* @param client {@link DocumentClient} instance to use
* @param collectionLink specifies the link to the target Azure Cosmos DB collection
* @param partitionKeyDefinition specifies the {@link PartitionKeyDefinition} of the collection
* @param collectionOfferThroughput specifies the throughput allocated for bulk operations out of the collection's total throughput
*/
private DocumentBulkExecutor(DocumentClient client,
String collectionLink,
PartitionKeyDefinition partitionKeyDefinition,
int collectionOfferThroughput) {
Preconditions.checkNotNull(client, "client cannot be null");
Preconditions.checkNotNull(partitionKeyDefinition, "partitionKeyDefinition cannot be null");
Preconditions.checkNotNull(collectionLink, "collectionLink cannot be null");
Preconditions.checkArgument(collectionOfferThroughput > 0, "collectionOfferThroughput must be positive");
this.client = client;
this.collectionLink = collectionLink;
this.collectionThroughput = collectionOfferThroughput;
this.partitionKeyDefinition = partitionKeyDefinition;
this.listeningExecutorService = MoreExecutors.listeningDecorator(Executors.newCachedThreadPool());
}
private void safeInit() throws Exception {
int count = 0;
long startTime = System.currentTimeMillis();
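// Retry initialize() while the service throttles with 429 (Too Many Requests), sleeping for the
// server-suggested retry-after delay scaled by the attempt count plus a fixed pad, until the retry
// attempt count or overall wait-time budget from the initialization RetryOptions is exhausted.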
while(true) {
try {
initialize();
break;
} catch (Exception e) {
count++;
DocumentClientException dce = ExceptionUtils.getThrottelingException(e);
long now = System.currentTimeMillis();
if (count < retryOptions.getMaxRetryAttemptsOnThrottledRequests()
&& now - startTime < (retryOptions.getMaxRetryWaitTimeInSeconds() * 1000)
&& dce != null
&& dce.getStatusCode() == HttpConstants.StatusCodes.TOO_MANY_REQUESTS ) {
Thread.sleep(count * dce.getRetryAfterInMilliseconds() + INITIALIZATION_SLEEP_TIME_ON_THROTTLING);
continue;
} else {
throw e;
}
}
}
}
/**
* Releases any internal resources.
* It is the responsibility of the caller to close the {@link DocumentClient}.
*/
@Override
public void close() {
// disable submission of new tasks
listeningExecutorService.shutdown();
try {
// wait for existing tasks to terminate
if (!listeningExecutorService.awaitTermination(60, TimeUnit.SECONDS)) {
// cancel any currently running executing tasks
listeningExecutorService.shutdownNow();
// wait for cancelled tasks to terminate
if (!listeningExecutorService.awaitTermination(60, TimeUnit.SECONDS)) {
logger.error("some tasks did not terminate");
}
}
} catch (InterruptedException e) {
listeningExecutorService.shutdownNow();
Thread.currentThread().interrupt();
}
}
/**
* Initializes {@link DocumentBulkExecutor}. This happens only once per executor instance.
* @throws DocumentClientException
*/
private void initialize() throws DocumentClientException {
logger.debug("Initializing ...");
this.bulkImportStoredProcLink = String.format("%s/sprocs/%s", collectionLink, BULK_IMPORT_STORED_PROCECURE_NAME);
this.bulkUpdateStoredProcLink = String.format("%s/sprocs/%s", collectionLink, BULK_UPDATE_STORED_PROCECURE_NAME);
this.bulkDeleteStoredProcLink = String.format("%s/sprocs/%s", collectionLink, BULK_DELETE_STORED_PROCECURE_NAME);
logger.debug("Fetching partition map of collection");
Range<String> fullRange = new Range<String>(
PartitionKeyInternalHelper.MinimumInclusiveEffectivePartitionKey,
PartitionKeyInternalHelper.MaximumExclusiveEffectivePartitionKey,
true,
false);
// this assumes database and collection already exists
try {
client.readCollection(collectionLink, null).getResource();
} catch (DocumentClientException ex) {
if (ex.getStatusCode() == 404) {
logger.error("Unable to read resource for collection link " + collectionLink);
}
throw ex;
}
this.collectionRoutingMap = getCollectionRoutingMap(client, this.collectionLink);
Collection<PartitionKeyRange> partitionKeyRanges = this.collectionRoutingMap.getOverlappingRanges(fullRange);
this.partitionKeyRangeIds = partitionKeyRanges.stream().map(partitionKeyRange -> partitionKeyRange.getId()).collect(Collectors.toList());
logger.debug("Initialization completed");
}
/**
* Executes a bulk import in the Azure Cosmos DB database service.
*
* The snippet below shows the overall usage pattern:
* <pre>
* {@code
* ConnectionPolicy connectionPolicy = new ConnectionPolicy();
* RetryOptions retryOptions = new RetryOptions();
*
* // Set client's retry options high for initialization
* retryOptions.setMaxRetryWaitTimeInSeconds(120);
* retryOptions.setMaxRetryAttemptsOnThrottledRequests(100);
* connectionPolicy.setRetryOptions(retryOptions);
* connectionPolicy.setMaxPoolSize(1000);
*
* DocumentClient client = new DocumentClient(HOST, MASTER_KEY, connectionPolicy, null);
*
* String collectionLink = String.format("/dbs/%s/colls/%s", "mydb", "mycol");
* DocumentCollection collection = client.readCollection(collectionLink, null).getResource();
*
* DocumentBulkExecutor executor = DocumentBulkExecutor.builder().from(client, "mydb", "mycol",
* collection.getPartitionKey(), collectionOfferThroughput).build();
*
* // Set retries to 0 to pass control to bulk executor
* client.getConnectionPolicy().getRetryOptions().setMaxRetryWaitTimeInSeconds(0);
* client.getConnectionPolicy().getRetryOptions().setMaxRetryAttemptsOnThrottledRequests(0);
*
* for(int i = 0; i < 10; i++) {
* List<String> documents = documentSource.getMoreDocuments();
*
* BulkImportResponse bulkImportResponse = executor.importAll(documents, false, true, 40);
*
* // Validate that all documents inserted to ensure no failure.
* if (bulkImportResponse.getNumberOfDocumentsImported() < documents.size()) {
* for(Exception e: bulkImportResponse.getErrors()) {
* // Validate why there were some failures.
* e.printStackTrace();
* }
* break;
* }
* }
*
* executor.close();
* client.close();
* }
* </pre>
*
* @param documents specifies the collection of JSON-serialized documents to import
* @param isUpsert indicates whether a document in the supplied collection needs to be overwritten if the id already exists
* @param disableAutomaticIdGeneration indicates whether the id has to be automatically generated for a document if absent in the supplied collection
* @param maxConcurrencyPerPartitionRange specifies the maximum degree of concurrency per partition key range (default value is 20 if set to null)
* @return an instance of {@link BulkImportResponse}
* @throws DocumentClientException if any failure happens
*/
public BulkImportResponse importAll(Collection<String> documents,
boolean isUpsert,
boolean disableAutomaticIdGeneration,
Integer maxConcurrencyPerPartitionRange) throws DocumentClientException {
return executeBulkImportInternal(documents,
isUpsert,
disableAutomaticIdGeneration,
maxConcurrencyPerPartitionRange,
0,
0,
0);
}
/**
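* Executes a bulk import in the Azure Cosmos DB database service, exposing additional tuning parameters
* for mini-batch size, task parallelism, and a per-partition RU write budget.
* <p>
* A minimal call sketch (the values below are illustrative; passing 0 for the three tuning parameters
* mirrors what the simpler {@link #importAll(Collection, boolean, boolean, Integer)} overload does):
* <pre>
* {@code
* BulkImportResponse response = executor.importAll(documents, false, true, 40, 0, 0, 0);
* }
* </pre>
*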
* @param documents specifies the collection of JSON-serialized documents to import
* @param isUpsert indicates whether a document in the supplied collection needs to be overwritten if the id already exists
* @param disableAutomaticIdGeneration indicates whether the id has to be automatically generated for a document if absent in the supplied collection
* @param maxConcurrencyPerPartitionRange specifies the maximum degree of concurrency per partition key range (default value is 20 if set to null)
* @param effectiveMaxMiniBatchImportSize specifies the maximum size of mini batch imports
* @param numberOfParallelTasks specifies the number of parallel ingestion tasks
* @param writeThroughputBudgetPerCosmosPartition specifies the user-provided RU budget that is split across each Cosmos DB physical partition
* @return an instance of {@link BulkImportResponse}
* @throws DocumentClientException if any failure happens
*/
public BulkImportResponse importAll(Collection<String> documents,
boolean isUpsert,
boolean disableAutomaticIdGeneration,
Integer maxConcurrencyPerPartitionRange,
Integer effectiveMaxMiniBatchImportSize,
Integer numberOfParallelTasks,
Integer writeThroughputBudgetPerCosmosPartition) throws DocumentClientException {
return executeBulkImportInternal(documents,
isUpsert,
disableAutomaticIdGeneration,
maxConcurrencyPerPartitionRange,
effectiveMaxMiniBatchImportSize,
numberOfParallelTasks,
writeThroughputBudgetPerCosmosPartition);
}
/**
* Executes a bulk update in the Azure Cosmos DB database service.
*
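* <p>
* A minimal sketch of preparing update items (the id, partition key value and field names below are
* illustrative; the UpdateItem and SetUpdateOperation shapes follow the bulk executor's public update types):
* <pre>
* {@code
* List<UpdateOperationBase> operations = new ArrayList<>();
* operations.add(new SetUpdateOperation<>("status", "shipped"));
* UpdateItem item = new UpdateItem("order-1", "customer-42", operations);
* BulkUpdateResponse response = executor.updateAll(Collections.singletonList(item), null);
* }
* </pre>
*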
* @param updateItems specifies the collection of update items each of which comprises the list of field update operations to be performed
* on a document identified by an id and partition key value.
* @param maxConcurrencyPerPartitionRange specifies the maximum degree of concurrency per partition key range (default value is 20 if set to null)
* @return an instance of {@link BulkUpdateResponse}
* @throws DocumentClientException if any failure happens
*/
public BulkUpdateResponse updateAll(Collection<UpdateItem> updateItems,
Integer maxConcurrencyPerPartitionRange) throws DocumentClientException {
return executeBulkUpdateInternal(updateItems, maxConcurrencyPerPartitionRange);
}
/**
* Executes a bulk update in the Azure Cosmos DB database service with given set of patch documents.
*
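* <p>
* A minimal sketch of a patch document (the id, partition key property and field values below are illustrative):
* <pre>
* {@code
* Document patch = new Document();
* patch.setId("order-1");
* patch.set("customerId", "customer-42"); // partition key value of the target document
* patch.set("status", "shipped");         // field to set on the target document
* BulkUpdateResponse response = executor.mergeAll(Collections.singletonList(patch), null);
* }
* </pre>
*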
* @param patchDocuments which are documents comprising id, partition key values and fields to set with the corresponding values
* @param maxConcurrencyPerPartitionRange specifies the maximum degree of concurrency per partition key range (default value is 20 if set to null)
* @return an instance of {@link BulkUpdateResponse}
* @throws DocumentClientException if any failure happens
*/
public BulkUpdateResponse mergeAll(Collection<Document> patchDocuments,
Integer maxConcurrencyPerPartitionRange) throws DocumentClientException {
return executeBulkUpdateWithPatchInternal(patchDocuments, maxConcurrencyPerPartitionRange);
}
/**
* Executes a bulk delete in the Azure Cosmos DB database service.
*
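* <p>
* A minimal sketch (the partition key and id values below are illustrative):
* <pre>
* {@code
* List<Pair<String, String>> pkIdPairs = new ArrayList<>();
* pkIdPairs.add(new ImmutablePair<>("customer-42", "order-1"));
* BulkDeleteResponse response = executor.deleteAll(pkIdPairs);
* }
* </pre>
*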
* @param pkIdPairsToDelete List of pairs of partition key and id values of documents to delete
* @return an instance of {@link BulkDeleteResponse}
* @throws DocumentClientException if any failure happens
*/
public BulkDeleteResponse deleteAll(List<Pair<String, String>> pkIdPairsToDelete) throws DocumentClientException {
return executeBulkDeleteInternalPkRowKeys(pkIdPairsToDelete);
}
@SuppressWarnings("unused")
private BulkUpdateResponse updateDocument(String partitionKey, String id, List<UpdateOperationBase> updateOperations) throws DocumentClientException {
return executeUpdateDocumentInternal(partitionKey, id, updateOperations);
}
private BulkImportResponse executeBulkImportInternal(Collection<String> input,
boolean isUpsert,
boolean disableAutomaticIdGeneration,
Integer maxConcurrencyPerPartitionRange,
Integer effectiveMaxMiniBatchImportSize,
Integer numberOfParallelTasks,
Integer writeThroughputBudgetPerCosmosPartition) throws DocumentClientException {
Preconditions.checkNotNull(input, "document collection cannot be null");
try {
Collection<String> documentsToInsertOrRetry = new ArrayList<>(input);
Collection<String> documentsFailedToImportDueToSplits;
List failedImports = new ArrayList<>();
int numRetriesDueToSplits = 0;
List failures = new ArrayList<>();
List