/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.hadoop.rest.bulk;
import java.io.Closeable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.elasticsearch.hadoop.EsHadoopException;
import org.elasticsearch.hadoop.EsHadoopIllegalArgumentException;
import org.elasticsearch.hadoop.EsHadoopIllegalStateException;
import org.elasticsearch.hadoop.cfg.ConfigurationOptions;
import org.elasticsearch.hadoop.cfg.Settings;
import org.elasticsearch.hadoop.handler.EsHadoopAbortHandlerException;
import org.elasticsearch.hadoop.handler.HandlerResult;
import org.elasticsearch.hadoop.rest.ErrorExtractor;
import org.elasticsearch.hadoop.rest.Resource;
import org.elasticsearch.hadoop.rest.RestClient;
import org.elasticsearch.hadoop.rest.bulk.handler.BulkWriteErrorCollector;
import org.elasticsearch.hadoop.rest.bulk.handler.BulkWriteErrorHandler;
import org.elasticsearch.hadoop.rest.bulk.handler.BulkWriteFailure;
import org.elasticsearch.hadoop.rest.bulk.handler.IBulkWriteErrorHandler;
import org.elasticsearch.hadoop.rest.bulk.handler.impl.BulkWriteHandlerLoader;
import org.elasticsearch.hadoop.rest.bulk.handler.impl.HttpRetryHandler;
import org.elasticsearch.hadoop.rest.stats.Stats;
import org.elasticsearch.hadoop.rest.stats.StatsAware;
import org.elasticsearch.hadoop.util.ArrayUtils;
import org.elasticsearch.hadoop.util.Assert;
import org.elasticsearch.hadoop.util.BytesArray;
import org.elasticsearch.hadoop.util.BytesRef;
import org.elasticsearch.hadoop.util.TrackingBytesArray;
import org.elasticsearch.hadoop.util.unit.TimeValue;
/**
* Oversees the addition of bulk entries into an internal buffer, the flushing of documents to Elasticsearch,
* and the handling of failures in bulk operations.
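*
* <p>A minimal usage sketch (illustrative only; assumes {@code restClient}, {@code resource}, and
* {@code settings} have been configured elsewhere and {@code payload} is a pre-built bulk entry):
* <pre>{@code
* BulkProcessor processor = new BulkProcessor(restClient, resource, settings);
* processor.add(payload);                       // BytesRef holding one complete bulk entry
* BulkResponse response = processor.tryFlush(); // or rely on auto-flush as the buffer fills
* processor.close();
* }</pre>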
*/
public class BulkProcessor implements Closeable, StatsAware {
private static Log LOG = LogFactory.getLog(BulkProcessor.class);
private final RestClient restClient;
private final Resource resource;
private final Settings settings;
private final Stats stats = new Stats();
private final ErrorExtractor errorExtractor;
// Buffers and state of content
private BytesArray ba;
private TrackingBytesArray data;
private int dataEntries = 0;
// Configs
private int bufferEntriesThreshold;
private boolean autoFlush = true;
private int retryLimit;
// Processor writing state flags
private boolean executedBulkWrite = false;
private boolean hadWriteErrors = false;
private boolean requiresRefreshAfterBulk = false;
// Bulk write error handlers.
private List<IBulkWriteErrorHandler> documentBulkErrorHandlers;
public BulkProcessor(RestClient restClient, Resource resource, Settings settings) {
this.restClient = restClient;
this.resource = resource;
this.settings = settings;
// Flushing bounds
this.autoFlush = !settings.getBatchFlushManual();
this.bufferEntriesThreshold = settings.getBatchSizeInEntries();
this.requiresRefreshAfterBulk = settings.getBatchRefreshAfterWrite();
// Negative retry count means that we're going to retry forever in the retry handler.
int retryCount = settings.getBatchWriteRetryCount();
// Negative retry limit means that we'll let retry handlers retry forever if need be.
int limit = settings.getBatchWriteRetryLimit();
// Set the processor's retry limit based on both the configured limit and the configured retry count.
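// For illustration (hypothetical values): retryCount=3, limit=1 yields retryLimit=3, so the processor never
// gives up before the retry handler has had its configured attempts; retryCount=-1 yields retryLimit=-1 (unbounded).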
this.retryLimit = (limit < retryCount || retryCount < 0) ? retryCount : limit;
// Backing data array
this.ba = new BytesArray(new byte[settings.getBatchSizeInBytes()], 0);
this.data = new TrackingBytesArray(ba);
// Create error handlers
BulkWriteErrorHandler httpRetryHandler = new HttpRetryHandler(settings);
BulkWriteHandlerLoader handlerLoader = new BulkWriteHandlerLoader();
handlerLoader.setSettings(settings);
// Order up the handlers.
this.documentBulkErrorHandlers = new ArrayList<IBulkWriteErrorHandler>();
this.documentBulkErrorHandlers.add(httpRetryHandler);
this.documentBulkErrorHandlers.addAll(handlerLoader.loadHandlers());
// Error Extractor
this.errorExtractor = new ErrorExtractor(settings.getInternalVersionOrThrow());
}
/**
* Adds an entry to the bulk request, potentially flushing if the request reaches capacity.
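*
* <p>For illustration (a hypothetical index action relying on the default index settings), one entry is the
* action metadata line followed by the document source, each newline-terminated:
* <pre>
* {"index":{}}
* {"field":"value"}
* </pre>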
* @param payload the entire bulk entry in JSON format, including the header and payload.
*/
public void add(BytesRef payload) {
// check space first
// ba is the backing array for data
if (payload.length() > ba.available()) {
if (autoFlush) {
flush();
}
else {
throw new EsHadoopIllegalStateException(
String.format("Auto-flush disabled and bulk buffer full; disable manual flush or increase " +
"capacity [current size %s]; bailing out", ba.capacity()));
}
}
data.copyFrom(payload);
dataEntries++;
if (bufferEntriesThreshold > 0 && dataEntries >= bufferEntriesThreshold) {
if (autoFlush) {
flush();
}
else {
// handle the corner case of manual flush that occurs only after the buffer is completely full (think size of 1)
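// e.g. with a hypothetical threshold of 1: the first add() fills the buffer exactly, and a second add()
// issued before a manual flush() pushes dataEntries past the threshold and fails fast below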
if (dataEntries > bufferEntriesThreshold) {
throw new EsHadoopIllegalStateException(
String.format(
"Auto-flush disabled and maximum number of entries surpassed; disable manual " +
"flush or increase capacity [current size %s]; bailing out",
bufferEntriesThreshold));
}
}
}
}
/**
* Keeps track of a given document entry's position in the original bulk request, as well as how many
* attempts to write the entry have been performed.
*/
private class BulkAttempt {
public BulkAttempt(int attemptNumber, int originalPosition) {
this.attemptNumber = attemptNumber;
this.originalPosition = originalPosition;
}
private int attemptNumber;
private int originalPosition;
}
/**
* Attempts a flush operation, handling failed documents based on configured error listeners.
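*
* <p>A minimal sketch of manual-flush usage (hypothetical {@code firstPayload}/{@code secondPayload} entries;
* assumes auto-flush has been disabled via the batch flush settings):
* <pre>{@code
* processor.add(firstPayload);
* processor.add(secondPayload);
* BulkResponse result = processor.tryFlush(); // inspect the result for documents that were not accepted
* }</pre>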
* @return A result object detailing the success or failure of the request, including information about any
* failed documents.
* @throws EsHadoopException in the event that the bulk operation fails or is aborted.
*/
public BulkResponse tryFlush() {
BulkResponse bulkResult = null;
boolean trackingArrayExpanded = false;
String bulkLoggingID = createDebugTxnID();
try {
// double check data - it might be a false flush (called on clean-up)
if (data.length() > 0) {
int totalDocs = data.entries();
int docsSent = 0;
int docsSkipped = 0;
int docsAborted = 0;
long totalTime = 0L;
boolean retryOperation = false;
int totalAttempts = 0;
long waitTime = 0L;
List<BulkAttempt> retries = new ArrayList<BulkAttempt>();
List<BulkResponse.BulkError> abortErrors = new ArrayList<BulkResponse.BulkError>();
do {
// Throw to break out of a possible infinite loop, but only if the limit is non-negative
if (retryLimit >= 0 && totalAttempts > retryLimit) {
throw new EsHadoopException("Executed too many bulk requests without success. Attempted [" +
totalAttempts + "] write operations, which exceeds the bulk request retry limit specified " +
"by [" + ConfigurationOptions.ES_BATCH_WRITE_RETRY_LIMIT + "], and found data still " +
"not accepted. Perhaps there is an error handler that is not terminating? Bailing out..."
);
}
// Log messages, and if wait time is set, perform the thread sleep.
initFlushOperation(bulkLoggingID, retryOperation, retries.size(), waitTime);
// Exec bulk operation to ES, get response.
debugLog(bulkLoggingID, "Submitting request");
RestClient.BulkActionResponse bar = restClient.bulk(resource, data);
debugLog(bulkLoggingID, "Response received");
totalAttempts++;
totalTime += bar.getTimeSpent();
// Log retry stats if relevant
if (retryOperation) {
stats.docsRetried += data.entries();
stats.bytesRetried += data.length();
stats.bulkRetries++;
stats.bulkRetriesTotalTime += bar.getTimeSpent();
}
executedBulkWrite = true;
// Handle bulk write failures
if (!bar.getEntries().hasNext()) {
// Legacy Case:
// If no items on response, assume all documents made it in.
// Recorded bytes are ack'd here
stats.bytesAccepted += data.length();
stats.docsAccepted += data.entries();
retryOperation = false;
bulkResult = BulkResponse.complete(bar.getResponseCode(), totalTime, totalDocs, totalDocs, 0);
} else {
// Base Case:
// Iterate over the response and the data in the tracking bytes array at the same time, passing
// errors to error handlers for resolution.
// Keep track of which document we are on as well as where we are in the tracking bytes array.
int documentNumber = 0;
int trackingBytesPosition = 0;
// Hand off the previous list of retries so that we can track the next set of retries (if any).
List<BulkAttempt> previousRetries = retries;
retries = new ArrayList<BulkAttempt>();
// If a document is edited and retried then it is added at the end of the buffer. Keep a tail list of these new retry attempts.
List<BulkAttempt> newDocumentRetries = new ArrayList<BulkAttempt>();
BulkWriteErrorCollector errorCollector = new BulkWriteErrorCollector();
// Iterate over all entries, and for each error found, attempt to handle the problem.
for (Iterator