org.apache.hadoop.hbase.client.AsyncProcess Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of phoenix-client
Phoenix Client
The newest version!
/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.client;

import com.google.common.annotations.VisibleForTesting;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.AsyncProcess.RowChecker.ReturnCode;
import org.apache.hadoop.hbase.CallQueueTooBigException;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.RegionLocations;
import org.apache.hadoop.hbase.RetryImmediatelyException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.client.backoff.ServerStatistics;
import org.apache.hadoop.hbase.client.coprocessor.Batch;
import org.apache.hadoop.hbase.exceptions.ClientExceptionsUtil;
import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdge;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.htrace.Trace;

/**
 * This class  allows a continuous flow of requests. It's written to be compatible with a
 * synchronous caller such as HTable.
 * 
 * The caller sends a buffer of operation, by calling submit. This class extract from this list
 * the operations it can send, i.e. the operations that are on region that are not considered
 * as busy. The process is asynchronous, i.e. it returns immediately when if has finished to
 * iterate on the list. If, and only if, the maximum number of current task is reached, the call
 * to submit will block. Alternatively, the caller can call submitAll, in which case all the
 * operations will be sent. Each call to submit returns a future-like object that can be used
 * to track operation progress.
 * 
 * 
 * The class manages internally the retries.
 * 
 * 
 * The class can be constructed in regular mode, or "global error" mode. In global error mode,
 * AP tracks errors across all calls (each "future" also has global view of all errors). That
 * mode is necessary for backward compat with HTable behavior, where multiple submissions are
 * made and the errors can propagate using any put/flush call, from previous calls.
 * In "regular" mode, the errors are tracked inside the Future object that is returned.
 * The results are always tracked inside the Future object and can be retrieved when the call
 * has finished. Partial results can also be retrieved if some part of multi-request failed.
 * 
 * 
 * This class is thread safe in regular mode; in global error code, submitting operations and
 * retrieving errors from different threads may be not thread safe.
 * Internally, the class is thread safe enough to manage simultaneously new submission and results
 * arising from older operations.
 * 
 * 
 * Internally, this class works with {@link Row}, this mean it could be theoretically used for
 * gets as well.
 * 
 */
@InterfaceAudience.Private
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="JLM_JSR166_UTILCONCURRENT_MONITORENTER",
  justification="Synchronization on tasks in progress counter is intended")
class AsyncProcess {
  private static final Log LOG = LogFactory.getLog(AsyncProcess.class);
  protected static final AtomicLong COUNTER = new AtomicLong();

  public static final String PRIMARY_CALL_TIMEOUT_KEY = "hbase.client.primaryCallTimeout.multiget";

  /**
   * Configure the number of failures after which the client will start logging. A few failures
   * is fine: region moved, then is not opened, then is overloaded. We try to have an acceptable
   * heuristic for the number of errors we don't log. 9 was chosen because we wait for 1s at
   * this stage.
   */
  public static final String START_LOG_ERRORS_AFTER_COUNT_KEY =
      "hbase.client.start.log.errors.counter";
  public static final int DEFAULT_START_LOG_ERRORS_AFTER_COUNT = 9;

  /**
   * Configuration to decide whether to log details for batch error
   */
  public static final String LOG_DETAILS_FOR_BATCH_ERROR = "hbase.client.log.batcherrors.details";

  private final int thresholdToLogUndoneTaskDetails;
  private static final String THRESHOLD_TO_LOG_UNDONE_TASK_DETAILS =
      "hbase.client.threshold.log.details";
  private static final int DEFAULT_THRESHOLD_TO_LOG_UNDONE_TASK_DETAILS = 10;
  private final int THRESHOLD_TO_LOG_REGION_DETAILS = 2;

  /**
   * The maximum size of single RegionServer.
   */
  public static final String HBASE_CLIENT_MAX_PERREQUEST_HEAPSIZE = "hbase.client.max.perrequest.heapsize";

  /**
   * Default value of {@link #HBASE_CLIENT_MAX_PERREQUEST_HEAPSIZE}.
   */
  public static final long DEFAULT_HBASE_CLIENT_MAX_PERREQUEST_HEAPSIZE = 4194304;

  /**
   * The maximum size of submit.
   */
  public static final String HBASE_CLIENT_MAX_SUBMIT_HEAPSIZE = "hbase.client.max.submit.heapsize";
  /**
   * Default value of {@link #HBASE_CLIENT_MAX_SUBMIT_HEAPSIZE}.
   */
  public static final long DEFAULT_HBASE_CLIENT_MAX_SUBMIT_HEAPSIZE = DEFAULT_HBASE_CLIENT_MAX_PERREQUEST_HEAPSIZE;

  /**
   * The context used to wait for results from one submit call.
   * 1) If AsyncProcess is set to track errors globally, and not per call (for HTable puts),
   *    then errors and failed operations in this object will reflect global errors.
   * 2) If submit call is made with needResults false, results will not be saved.
   *  */
  public static interface AsyncRequestFuture {
    public boolean hasError();
    public RetriesExhaustedWithDetailsException getErrors();
    public List getFailedOperations();
    public Object[] getResults() throws InterruptedIOException;
    /** Wait until all tasks are executed, successfully or not. */
    public void waitUntilDone() throws InterruptedIOException;
  }

  /**
   * Return value from a submit that didn't contain any requests.
   */
  private static final AsyncRequestFuture NO_REQS_RESULT = new AsyncRequestFuture() {

    final Object[] result = new Object[0];

    @Override
    public boolean hasError() {
      return false;
    }

    @Override
    public RetriesExhaustedWithDetailsException getErrors() {
      return null;
    }

    @Override
    public List getFailedOperations() {
      return null;
    }

    @Override
    public Object[] getResults() {
      return result;
    }

    @Override
    public void waitUntilDone() throws InterruptedIOException {
    }
  };

  /** Sync point for calls to multiple replicas for the same user request (Get).
   * Created and put in the results array (we assume replica calls require results) when
   * the replica calls are launched. See results for details of this process.
   * POJO, all fields are public. To modify them, the object itself is locked. */
  private static class ReplicaResultState {
    public ReplicaResultState(int callCount) {
      this.callCount = callCount;
    }

    /** Number of calls outstanding, or 0 if a call succeeded (even with others outstanding). */
    int callCount;
    /** Errors for which it is not decided whether we will report them to user. If one of the
     * calls succeeds, we will discard the errors that may have happened in the other calls. */
    BatchErrors replicaErrors = null;

    @Override
    public String toString() {
      return "[call count " + callCount + "; errors " + replicaErrors + "]";
    }
  }


  // TODO: many of the fields should be made private
  protected final long id;

  protected final ClusterConnection connection;
  protected final RpcRetryingCallerFactory rpcCallerFactory;
  protected final RpcControllerFactory rpcFactory;
  protected final BatchErrors globalErrors;
  protected final ExecutorService pool;

  protected final AtomicLong tasksInProgress = new AtomicLong(0);
  protected final ConcurrentMap taskCounterPerRegion =
      new ConcurrentSkipListMap(Bytes.BYTES_COMPARATOR);
  protected final ConcurrentMap taskCounterPerServer =
      new ConcurrentHashMap();
  // Start configuration settings.
  private final int startLogErrorsCnt;

  /**
   * The number of tasks simultaneously executed on the cluster.
   */
  protected final int maxTotalConcurrentTasks;

  /**
   * The max heap size of all tasks simultaneously executed on a server.
   */
  protected final long maxHeapSizePerRequest;
  protected final long maxHeapSizeSubmit;
  /**
   * The number of tasks we run in parallel on a single region.
   * With 1 (the default) , we ensure that the ordering of the queries is respected: we don't start
   * a set of operations on a region before the previous one is done. As well, this limits
   * the pressure we put on the region server.
   */
  protected final int maxConcurrentTasksPerRegion;

  /**
   * The number of task simultaneously executed on a single region server.
   */
  protected final int maxConcurrentTasksPerServer;
  protected final long pause;
  protected final long pauseForCQTBE;// pause for CallQueueTooBigException, if specified
  protected int numTries;
  protected int serverTrackerTimeout;
  protected int rpcTimeout;
  protected int operationTimeout;
  protected long primaryCallTimeoutMicroseconds;
  /** Whether to log details for batch errors */
  private final boolean logBatchErrorDetails;
  // End configuration settings.

  protected static class BatchErrors {
    private final List throwables = new ArrayList();
    private final List actions = new ArrayList();
    private final List addresses = new ArrayList();

    public synchronized void add(Throwable ex, Row row, ServerName serverName) {
      if (row == null){
        throw new IllegalArgumentException("row cannot be null. location=" + serverName);
      }

      throwables.add(ex);
      actions.add(row);
      addresses.add(serverName != null ? serverName.toString() : "null");
    }

    public boolean hasErrors() {
      return !throwables.isEmpty();
    }

    private synchronized RetriesExhaustedWithDetailsException makeException(boolean logDetails) {
      if (logDetails) {
        LOG.error("Exception occurred! Exception details: " + throwables + ";\nActions: "
            + actions);
      }
      return new RetriesExhaustedWithDetailsException(new ArrayList(throwables),
          new ArrayList(actions), new ArrayList(addresses));
    }

    public synchronized void clear() {
      throwables.clear();
      actions.clear();
      addresses.clear();
    }

    public synchronized void merge(BatchErrors other) {
      throwables.addAll(other.throwables);
      actions.addAll(other.actions);
      addresses.addAll(other.addresses);
    }
  }
  public AsyncProcess(ClusterConnection hc, Configuration conf, ExecutorService pool,
      RpcRetryingCallerFactory rpcCaller, boolean useGlobalErrors, RpcControllerFactory rpcFactory,
      int rpcTimeout) {
    if (hc == null) {
      throw new IllegalArgumentException("HConnection cannot be null.");
    }

    this.connection = hc;
    this.pool = pool;
    this.globalErrors = useGlobalErrors ? new BatchErrors() : null;

    this.id = COUNTER.incrementAndGet();

    this.pause = conf.getLong(HConstants.HBASE_CLIENT_PAUSE,
        HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
    long configuredPauseForCQTBE = conf.getLong(HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE, pause);
    if (configuredPauseForCQTBE < pause) {
      LOG.warn("The " + HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE + " setting: "
          + configuredPauseForCQTBE + " is smaller than " + HConstants.HBASE_CLIENT_PAUSE
          + ", will use " + pause + " instead.");
      this.pauseForCQTBE = pause;
    } else {
      this.pauseForCQTBE = configuredPauseForCQTBE;
    }
    this.numTries = conf.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
        HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
    this.rpcTimeout = rpcTimeout;
    this.operationTimeout = conf.getInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT,
        HConstants.DEFAULT_HBASE_CLIENT_OPERATION_TIMEOUT);
    this.primaryCallTimeoutMicroseconds = conf.getInt(PRIMARY_CALL_TIMEOUT_KEY, 10000);

    this.maxTotalConcurrentTasks = conf.getInt(HConstants.HBASE_CLIENT_MAX_TOTAL_TASKS,
      HConstants.DEFAULT_HBASE_CLIENT_MAX_TOTAL_TASKS);
    this.maxConcurrentTasksPerServer = conf.getInt(HConstants.HBASE_CLIENT_MAX_PERSERVER_TASKS,
          HConstants.DEFAULT_HBASE_CLIENT_MAX_PERSERVER_TASKS);
    this.maxConcurrentTasksPerRegion = conf.getInt(HConstants.HBASE_CLIENT_MAX_PERREGION_TASKS,
          HConstants.DEFAULT_HBASE_CLIENT_MAX_PERREGION_TASKS);
    this.maxHeapSizePerRequest = conf.getLong(HBASE_CLIENT_MAX_PERREQUEST_HEAPSIZE,
          DEFAULT_HBASE_CLIENT_MAX_PERREQUEST_HEAPSIZE);
    this.maxHeapSizeSubmit = conf.getLong(HBASE_CLIENT_MAX_SUBMIT_HEAPSIZE, DEFAULT_HBASE_CLIENT_MAX_SUBMIT_HEAPSIZE);
    this.startLogErrorsCnt =
        conf.getInt(START_LOG_ERRORS_AFTER_COUNT_KEY, DEFAULT_START_LOG_ERRORS_AFTER_COUNT);

    if (this.maxTotalConcurrentTasks <= 0) {
      throw new IllegalArgumentException("maxTotalConcurrentTasks=" + maxTotalConcurrentTasks);
    }
    if (this.maxConcurrentTasksPerServer <= 0) {
      throw new IllegalArgumentException("maxConcurrentTasksPerServer=" +
          maxConcurrentTasksPerServer);
    }
    if (this.maxConcurrentTasksPerRegion <= 0) {
      throw new IllegalArgumentException("maxConcurrentTasksPerRegion=" +
          maxConcurrentTasksPerRegion);
    }
    if (this.maxHeapSizePerRequest <= 0) {
      throw new IllegalArgumentException("maxHeapSizePerServer=" +
          maxHeapSizePerRequest);
    }

    if (this.maxHeapSizeSubmit <= 0) {
      throw new IllegalArgumentException("maxHeapSizeSubmit=" +
          maxHeapSizeSubmit);
    }
    // Server tracker allows us to do faster, and yet useful (hopefully), retries.
    // However, if we are too useful, we might fail very quickly due to retry count limit.
    // To avoid this, we are going to cheat for now (see HBASE-7659), and calculate maximum
    // retry time if normal retries were used. Then we will retry until this time runs out.
    // If we keep hitting one server, the net effect will be the incremental backoff, and
    // essentially the same number of retries as planned. If we have to do faster retries,
    // we will do more retries in aggregate, but the user will be none the wiser.
    this.serverTrackerTimeout = 0;
    for (int i = 0; i < this.numTries; ++i) {
      serverTrackerTimeout = (int) (serverTrackerTimeout + ConnectionUtils.getPauseTime(this.pause, i));
    }

    this.rpcCallerFactory = rpcCaller;
    this.rpcFactory = rpcFactory;
    this.logBatchErrorDetails = conf.getBoolean(LOG_DETAILS_FOR_BATCH_ERROR, false);

    this.thresholdToLogUndoneTaskDetails =
        conf.getInt(THRESHOLD_TO_LOG_UNDONE_TASK_DETAILS,
          DEFAULT_THRESHOLD_TO_LOG_UNDONE_TASK_DETAILS);
  }

  public void setRpcTimeout(int rpcTimeout) {
    this.rpcTimeout = rpcTimeout;
  }

  public void setOperationTimeout(int operationTimeout) {
    this.operationTimeout = operationTimeout;
  }

  /**
   * @return pool if non null, otherwise returns this.pool if non null, otherwise throws
   *         RuntimeException
   */
  @VisibleForTesting
  ExecutorService getPool(ExecutorService pool) {
    if (pool != null) {
      return pool;
    }
    if (this.pool != null) {
      return this.pool;
    }
    throw new RuntimeException("Neither AsyncProcess nor request have ExecutorService");
  }

  /**
   * See {@link #submit(ExecutorService, TableName, List, boolean, Batch.Callback, boolean)}.
   * Uses default ExecutorService for this AP (must have been created with one).
   */
  public  AsyncRequestFuture submit(TableName tableName, final List rows,
      boolean atLeastOne, Batch.Callback callback, boolean needResults)
      throws InterruptedIOException {
    return submit(null, tableName, rows, atLeastOne, callback, needResults);
  }
  /**
   * See {@link #submit(ExecutorService, TableName, RowAccess, boolean, Batch.Callback, boolean)}.
   * Uses default ExecutorService for this AP (must have been created with one).
   */
  public  AsyncRequestFuture submit(TableName tableName,
      final RowAccess rows, boolean atLeastOne, Batch.Callback callback,
      boolean needResults) throws InterruptedIOException {
    return submit(null, tableName, rows, atLeastOne, callback, needResults);
  }
  /**
   * See {@link #submit(ExecutorService, TableName, RowAccess, boolean, Batch.Callback, boolean)}.
   * Uses the {@link ListRowAccess} to wrap the {@link List}.
   */
  public  AsyncRequestFuture submit(ExecutorService pool, TableName tableName,
      List rows, boolean atLeastOne, Batch.Callback callback,
      boolean needResults) throws InterruptedIOException {
    return submit(pool, tableName, new ListRowAccess(rows), atLeastOne,
      callback, needResults);
  }

  /**
   * Extract from the rows list what we can submit. The rows we can not submit are kept in the
   * list. Does not send requests to replicas (not currently used for anything other
   * than streaming puts anyway).
   *
   * @param pool ExecutorService to use.
   * @param tableName The table for which this request is needed.
   * @param callback Batch callback. Only called on success (94 behavior).
   * @param needResults Whether results are needed, or can be discarded.
   * @param rows - the submitted row. Modified by the method: we remove the rows we took.
   * @param atLeastOne true if we should submit at least a subset.
   */
  public  AsyncRequestFuture submit(ExecutorService pool, TableName tableName,
      RowAccess rows, boolean atLeastOne, Batch.Callback callback,
      boolean needResults) throws InterruptedIOException {
    if (rows.isEmpty()) {
      return NO_REQS_RESULT;
    }

    Map> actionsByServer =
        new HashMap>();
    List> retainedActions = new ArrayList>(rows.size());

    NonceGenerator ng = this.connection.getNonceGenerator();
    long nonceGroup = ng.getNonceGroup(); // Currently, nonce group is per entire client.

    // Location errors that happen before we decide what requests to take.
    List locationErrors = null;
    List locationErrorRows = null;
    RowCheckerHost checker = createRowCheckerHost();
    boolean firstIter = true;
    do {
      // Wait until there is at least one slot for a new task.
      waitForMaximumCurrentTasks(maxTotalConcurrentTasks - 1, tableName.getNameAsString());
      int posInList = -1;
      if (!firstIter) {
        checker.reset();
      }
      Iterator it = rows.iterator();
      while (it.hasNext()) {
        Row r = it.next();
        HRegionLocation loc;
        try {
          if (r == null) {
            throw new IllegalArgumentException("#" + id + ", row cannot be null");
          }
          // Make sure we get 0-s replica.
          RegionLocations locs = connection.locateRegion(
              tableName, r.getRow(), true, true, RegionReplicaUtil.DEFAULT_REPLICA_ID);
          if (locs == null || locs.isEmpty() || locs.getDefaultRegionLocation() == null) {
            throw new IOException("#" + id + ", no location found, aborting submit for"
                + " tableName=" + tableName + " rowkey=" + Bytes.toStringBinary(r.getRow()));
          }
          loc = locs.getDefaultRegionLocation();
        } catch (IOException ex) {
          locationErrors = new ArrayList();
          locationErrorRows = new ArrayList();
          LOG.error("Failed to get region location ", ex);
          // This action failed before creating ars. Retain it, but do not add to submit list.
          // We will then add it to ars in an already-failed state.
          int priority = HConstants.NORMAL_QOS;
          if (r instanceof Mutation) {
            priority = ((Mutation) r).getPriority();
          }
          retainedActions.add(new Action(r, ++posInList, priority));
          locationErrors.add(ex);
          locationErrorRows.add(posInList);
          it.remove();
          break; // Backward compat: we stop considering actions on location error.
        }
        long rowSize = (r instanceof Mutation) ? ((Mutation) r).heapSize() : 0;
        ReturnCode code = checker.canTakeOperation(loc, rowSize);
        if (code == ReturnCode.END) {
          break;
        }
        if (code == ReturnCode.INCLUDE) {
          int priority = HConstants.NORMAL_QOS;
          if (r instanceof Mutation) {
            priority = ((Mutation) r).getPriority();
          }
          Action action = new Action(r, ++posInList, priority);
          setNonce(ng, r, action);
          retainedActions.add(action);
          // TODO: replica-get is not supported on this path
          byte[] regionName = loc.getRegionInfo().getRegionName();
          addAction(loc.getServerName(), regionName, action, actionsByServer, nonceGroup);
          it.remove();
        }
      }
      firstIter = false;
    } while (retainedActions.isEmpty() && atLeastOne && (locationErrors == null));

    if (retainedActions.isEmpty()) return NO_REQS_RESULT;

    return submitMultiActions(tableName, retainedActions, nonceGroup, callback, null, needResults,
        locationErrors, locationErrorRows, actionsByServer, pool);
  }

  private RowCheckerHost createRowCheckerHost() {
    return new RowCheckerHost(Arrays.asList(
        new TaskCountChecker(maxTotalConcurrentTasks,
          maxConcurrentTasksPerServer,
          maxConcurrentTasksPerRegion,
          tasksInProgress,
          taskCounterPerServer,
          taskCounterPerRegion)
        , new RequestSizeChecker(maxHeapSizePerRequest)
        , new SubmittedSizeChecker(maxHeapSizeSubmit)
    ));
  }
   AsyncRequestFuture submitMultiActions(TableName tableName,
      List> retainedActions, long nonceGroup, Batch.Callback callback,
      Object[] results, boolean needResults, List locationErrors,
      List locationErrorRows, Map> actionsByServer,
      ExecutorService pool) {
    AsyncRequestFutureImpl ars = createAsyncRequestFuture(
      tableName, retainedActions, nonceGroup, pool, callback, results, needResults, null,
        operationTimeout, rpcTimeout);
    // Add location errors if any
    if (locationErrors != null) {
      for (int i = 0; i < locationErrors.size(); ++i) {
        int originalIndex = locationErrorRows.get(i);
        Row row = retainedActions.get(originalIndex).getAction();
        ars.manageError(originalIndex, row,
            Retry.NO_LOCATION_PROBLEM, locationErrors.get(i), null);
      }
    }
    ars.sendMultiAction(actionsByServer, 1, null, false);
    return ars;
  }

  /**
   * Helper that is used when grouping the actions per region server.
   *
   * @param loc - the destination. Must not be null.
   * @param action - the action to add to the multiaction
   * @param actionsByServer the multiaction per server
   * @param nonceGroup Nonce group.
   */
  private static void addAction(ServerName server, byte[] regionName, Action action,
      Map> actionsByServer, long nonceGroup) {
    MultiAction multiAction = actionsByServer.get(server);
    if (multiAction == null) {
      multiAction = new MultiAction();
      actionsByServer.put(server, multiAction);
    }
    if (action.hasNonce() && !multiAction.hasNonceGroup()) {
      multiAction.setNonceGroup(nonceGroup);
    }

    multiAction.add(regionName, action);
  }
  /**
   * See {@link #submitAll(ExecutorService, TableName, List, org.apache.hadoop.hbase.client.coprocessor.Batch.Callback, Object[])}.
   * Uses default ExecutorService for this AP (must have been created with one).
   */
  public  AsyncRequestFuture submitAll(TableName tableName,
      List rows, Batch.Callback callback, Object[] results) {
    return submitAll(null, tableName, rows, callback, results, null, operationTimeout, rpcTimeout);
  }

  public  AsyncRequestFuture submitAll(ExecutorService pool, TableName tableName,
      List rows, Batch.Callback callback, Object[] results) {
    return submitAll(pool, tableName, rows, callback, results, null, operationTimeout, rpcTimeout);
  }
  /**
   * Submit immediately the list of rows, whatever the server status. Kept for backward
   * compatibility: it allows to be used with the batch interface that return an array of objects.
   *
   * @param pool ExecutorService to use.
   * @param tableName name of the table for which the submission is made.
   * @param rows the list of rows.
   * @param callback the callback.
   * @param results Optional array to return the results thru; backward compat.
   */
  public  AsyncRequestFuture submitAll(ExecutorService pool, TableName tableName,
      List rows, Batch.Callback callback, Object[] results,
      PayloadCarryingServerCallable callable, int operationTimeout, int rpcTimeout) {
    List> actions = new ArrayList>(rows.size());

    // The position will be used by the processBatch to match the object array returned.
    int posInList = -1;
    NonceGenerator ng = this.connection.getNonceGenerator();
    int highestPriority = HConstants.PRIORITY_UNSET;
    for (Row r : rows) {
      posInList++;
      if (r instanceof Put) {
        Put put = (Put) r;
        if (put.isEmpty()) {
          throw new IllegalArgumentException("No columns to insert for #" + (posInList+1)+ " item");
        }
        highestPriority = Math.max(put.getPriority(), highestPriority);
      }
      Action action = new Action(r, posInList, highestPriority);
      setNonce(ng, r, action);
      actions.add(action);
    }
    AsyncRequestFutureImpl ars = createAsyncRequestFuture(
        tableName, actions, ng.getNonceGroup(), getPool(pool), callback, results, results != null,
        callable, operationTimeout, rpcTimeout);
    ars.groupAndSendMultiAction(actions, 1);
    return ars;
  }

  private static void setNonce(NonceGenerator ng, Row r, Action action) {
    if (!(r instanceof Append) && !(r instanceof Increment)) return;
    action.setNonce(ng.newNonce()); // Action handles NO_NONCE, so it's ok if ng is disabled.
  }

  /**
   * The context, and return value, for a single submit/submitAll call.
   * Note on how this class (one AP submit) works. Initially, all requests are split into groups
   * by server; request is sent to each server in parallel; the RPC calls are not async so a
   * thread per server is used. Every time some actions fail, regions/locations might have
   * changed, so we re-group them by server and region again and send these groups in parallel
   * too. The result, in case of retries, is a "tree" of threads, with parent exiting after
   * scheduling children. This is why lots of code doesn't require any synchronization.
   */
  protected class AsyncRequestFutureImpl implements AsyncRequestFuture {

    /**
     * Runnable (that can be submitted to thread pool) that waits for when it's time
     * to issue replica calls, finds region replicas, groups the requests by replica and
     * issues the calls (on separate threads, via sendMultiAction).
     * This is done on a separate thread because we don't want to wait on user thread for
     * our asynchronous call, and usually we have to wait before making replica calls.
     */
    private final class ReplicaCallIssuingRunnable implements Runnable {
      private final long startTime;
      private final List> initialActions;

      public ReplicaCallIssuingRunnable(List> initialActions, long startTime) {
        this.initialActions = initialActions;
        this.startTime = startTime;
      }

      @Override
      public void run() {
        boolean done = false;
        if (primaryCallTimeoutMicroseconds > 0) {
          try {
            done = waitUntilDone(startTime * 1000L + primaryCallTimeoutMicroseconds);
          } catch (InterruptedException ex) {
            LOG.error("Replica thread was interrupted - no replica calls: " + ex.getMessage());
            return;
          }
        }
        if (done) return; // Done within primary timeout
        Map> actionsByServer =
            new HashMap>();
        List> unknownLocActions = new ArrayList>();
        if (replicaGetIndices == null) {
          for (int i = 0; i < results.length; ++i) {
            addReplicaActions(i, actionsByServer, unknownLocActions);
          }
        } else {
          for (int replicaGetIndice : replicaGetIndices) {
            addReplicaActions(replicaGetIndice, actionsByServer, unknownLocActions);
          }
        }
        if (!actionsByServer.isEmpty()) {
          sendMultiAction(actionsByServer, 1, null, unknownLocActions.isEmpty());
        }
        if (!unknownLocActions.isEmpty()) {
          actionsByServer = new HashMap>();
          for (Action action : unknownLocActions) {
            addReplicaActionsAgain(action, actionsByServer);
          }
          // Some actions may have completely failed, they are handled inside addAgain.
          if (!actionsByServer.isEmpty()) {
            sendMultiAction(actionsByServer, 1, null, true);
          }
        }
      }

      /**
       * Add replica actions to action map by server.
       * @param index Index of the original action.
       * @param actionsByServer The map by server to add it to.
       */
      private void addReplicaActions(int index, Map> actionsByServer,
          List> unknownReplicaActions) {
        if (results[index] != null) return; // opportunistic. Never goes from non-null to null.
        Action action = initialActions.get(index);
        RegionLocations loc = findAllLocationsOrFail(action, true);
        if (loc == null) return;
        HRegionLocation[] locs = loc.getRegionLocations();
        if (locs.length == 1) {
          LOG.warn("No replicas found for " + action.getAction());
          return;
        }
        synchronized (replicaResultLock) {
          // Don't run replica calls if the original has finished. We could do it e.g. if
          // original has already failed before first replica call (unlikely given retries),
          // but that would require additional synchronization w.r.t. returning to caller.
          if (results[index] != null) return;
          // We set the number of calls here. After that any path must call setResult/setError.
          // True even for replicas that are not found - if we refuse to send we MUST set error.
          results[index] = new ReplicaResultState(locs.length);
        }
        for (int i = 1; i < locs.length; ++i) {
          Action replicaAction = new Action(action, i);
          if (locs[i] != null) {
            addAction(locs[i].getServerName(), locs[i].getRegionInfo().getRegionName(),
                replicaAction, actionsByServer, nonceGroup);
          } else {
            unknownReplicaActions.add(replicaAction);
          }
        }
      }

      private void addReplicaActionsAgain(
          Action action, Map> actionsByServer) {
        if (action.getReplicaId() == RegionReplicaUtil.DEFAULT_REPLICA_ID) {
          throw new AssertionError("Cannot have default replica here");
        }
        HRegionLocation loc = getReplicaLocationOrFail(action);
        if (loc == null) return;
        addAction(loc.getServerName(), loc.getRegionInfo().getRegionName(),
            action, actionsByServer, nonceGroup);
      }
    }

    /**
     * Runnable (that can be submitted to thread pool) that submits MultiAction to a
     * single server. The server call is synchronous, therefore we do it on a thread pool.
     */
    @VisibleForTesting
    class SingleServerRequestRunnable implements Runnable {
      private final MultiAction multiAction;
      private final int numAttempt;
      private final ServerName server;
      private final Set callsInProgress;
      @VisibleForTesting
      SingleServerRequestRunnable(
          MultiAction multiAction, int numAttempt, ServerName server,
          Set callsInProgress) {
        this.multiAction = multiAction;
        this.numAttempt = numAttempt;
        this.server = server;
        this.callsInProgress = callsInProgress;
      }

      @Override
      public void run() {
        MultiResponse res = null;
        PayloadCarryingServerCallable callable = currentCallable;
        try {
          // setup the callable based on the actions, if we don't have one already from the request
          if (callable == null) {
            callable = createCallable(server, tableName, multiAction);
          }
          RpcRetryingCaller caller = createCaller(callable, rpcTimeout);
          try {
            if (callsInProgress != null) {
              callsInProgress.add(callable);
            }
            res = caller.callWithoutRetries(callable, operationTimeout);
            if (res == null) {
              // Cancelled
              return;
            }
          } catch (IOException e) {
            // The service itself failed . It may be an error coming from the communication
            //   layer, but, as well, a functional error raised by the server.
            receiveGlobalFailure(multiAction, server, numAttempt, e);
            return;
          } catch (Throwable t) {
            // This should not happen. Let's log & retry anyway.
            LOG.error("#" + id + ", Caught throwable while calling. This is unexpected." +
                " Retrying. Server is " + server + ", tableName=" + tableName, t);
            receiveGlobalFailure(multiAction, server, numAttempt, t);
            return;
          }

          // Normal case: we received an answer from the server, and it's not an exception.
          receiveMultiAction(multiAction, server, res, numAttempt);
        } catch (Throwable t) {
          // Something really bad happened. We are on the send thread that will now die.
          LOG.error("Internal AsyncProcess #" + id + " error for "
              + tableName + " processing for " + server, t);
          throw new RuntimeException(t);
        } finally {
          decTaskCounters(multiAction.getRegions(), server);
          if (callsInProgress != null && callable != null && res != null) {
            callsInProgress.remove(callable);
          }
        }
      }
    }

    private final Batch.Callback callback;
    private final BatchErrors errors;
    private final ConnectionManager.ServerErrorTracker errorsByServer;
    private final ExecutorService pool;
    private final Set callsInProgress;


    private final TableName tableName;
    private final AtomicLong actionsInProgress = new AtomicLong(-1);
    /**
     * The lock controls access to results. It is only held when populating results where
     * there might be several callers (eventual consistency gets). For other requests,
     * there's one unique call going on per result index.
     */
    private final Object replicaResultLock = new Object();
    /**
     * Result array.  Null if results are not needed. Otherwise, each index corresponds to
     * the action index in initial actions submitted. For most request types, has null-s for
     * requests that are not done, and result/exception for those that are done.
     * For eventual-consistency gets, initially the same applies; at some point, replica calls
     * might be started, and ReplicaResultState is put at the corresponding indices. The
     * returning calls check the type to detect when this is the case. After all calls are done,
     * ReplicaResultState-s are replaced with results for the user.
     */
    private final Object[] results;
    /**
     * Indices of replica gets in results. If null, all or no actions are replica-gets.
     */
    private final int[] replicaGetIndices;
    private final boolean hasAnyReplicaGets;
    private final long nonceGroup;
    private PayloadCarryingServerCallable currentCallable;
    private int operationTimeout;
    private int rpcTimeout;
    private RetryingTimeTracker tracker;

    public AsyncRequestFutureImpl(TableName tableName, List> actions, long nonceGroup,
        ExecutorService pool, boolean needResults, Object[] results,
        Batch.Callback callback, PayloadCarryingServerCallable callable,
        int operationTimeout, int rpcTimeout) {
      this.pool = pool;
      this.callback = callback;
      this.nonceGroup = nonceGroup;
      this.tableName = tableName;
      this.actionsInProgress.set(actions.size());
      if (results != null) {
        assert needResults;
        if (results.length != actions.size()) {
          throw new AssertionError("results.length");
        }
        this.results = results;
        for (int i = 0; i != this.results.length; ++i) {
          results[i] = null;
        }
      } else {
        this.results = needResults ? new Object[actions.size()] : null;
      }
      List replicaGetIndices = null;
      boolean hasAnyReplicaGets = false;
      if (needResults) {
        // Check to see if any requests might require replica calls.
        // We expect that many requests will consist of all or no multi-replica gets; in such
        // cases we would just use a boolean (hasAnyReplicaGets). If there's a mix, we will
        // store the list of action indexes for which replica gets are possible, and set
        // hasAnyReplicaGets to true.
        boolean hasAnyNonReplicaReqs = false;
        int posInList = 0;
        for (Action action : actions) {
          boolean isReplicaGet = isReplicaGet(action.getAction());
          if (isReplicaGet) {
            hasAnyReplicaGets = true;
            if (hasAnyNonReplicaReqs) { // Mixed case
              if (replicaGetIndices == null) {
                replicaGetIndices = new ArrayList(actions.size() - 1);
              }
              replicaGetIndices.add(posInList);
            }
          } else if (!hasAnyNonReplicaReqs) {
            // The first non-multi-replica request in the action list.
            hasAnyNonReplicaReqs = true;
            if (posInList > 0) {
              // Add all the previous requests to the index lists. We know they are all
              // replica-gets because this is the first non-multi-replica request in the list.
              replicaGetIndices = new ArrayList(actions.size() - 1);
              for (int i = 0; i < posInList; ++i) {
                replicaGetIndices.add(i);
              }
            }
          }
          ++posInList;
        }
      }
      this.hasAnyReplicaGets = hasAnyReplicaGets;
      if (replicaGetIndices != null) {
        this.replicaGetIndices = new int[replicaGetIndices.size()];
        int i = 0;
        for (Integer el : replicaGetIndices) {
          this.replicaGetIndices[i++] = el;
        }
      } else {
        this.replicaGetIndices = null;
      }
      this.callsInProgress = !hasAnyReplicaGets ? null :
          Collections.newSetFromMap(
              new ConcurrentHashMap());

      this.errorsByServer = createServerErrorTracker();
      this.errors = (globalErrors != null) ? globalErrors : new BatchErrors();
      this.currentCallable = callable;
      this.operationTimeout = operationTimeout;
      this.rpcTimeout = rpcTimeout;
      if (callable == null) {
        tracker = new RetryingTimeTracker();
        tracker.start();
      }
    }

    public Set getCallsInProgress() {
      return callsInProgress;
    }

    @VisibleForTesting
    SingleServerRequestRunnable createSingleServerRequest(MultiAction multiAction, int numAttempt, ServerName server,
      Set callsInProgress) {
      return new SingleServerRequestRunnable(multiAction, numAttempt, server, callsInProgress);
    }

    /**
     * Group a list of actions per region servers, and send them.
     *
     * @param currentActions - the list of row to submit
     * @param numAttempt - the current numAttempt (first attempt is 1)
     */
    private void groupAndSendMultiAction(List> currentActions, int numAttempt) {
      Map> actionsByServer =
          new HashMap>();

      boolean isReplica = false;
      List> unknownReplicaActions = null;
      for (Action action : currentActions) {
        RegionLocations locs = findAllLocationsOrFail(action, true);
        if (locs == null) continue;
        boolean isReplicaAction = !RegionReplicaUtil.isDefaultReplica(action.getReplicaId());
        if (isReplica && !isReplicaAction) {
          // This is the property of the current implementation, not a requirement.
          throw new AssertionError("Replica and non-replica actions in the same retry");
        }
        isReplica = isReplicaAction;
        HRegionLocation loc = locs.getRegionLocation(action.getReplicaId());
        if (loc == null || loc.getServerName() == null) {
          if (isReplica) {
            if (unknownReplicaActions == null) {
              unknownReplicaActions = new ArrayList>();
            }
            unknownReplicaActions.add(action);
          } else {
            // TODO: relies on primary location always being fetched
            manageLocationError(action, null);
          }
        } else {
          byte[] regionName = loc.getRegionInfo().getRegionName();
          addAction(loc.getServerName(), regionName, action, actionsByServer, nonceGroup);
        }
      }
      boolean doStartReplica = (numAttempt == 1 && !isReplica && hasAnyReplicaGets);
      boolean hasUnknown = unknownReplicaActions != null && !unknownReplicaActions.isEmpty();

      if (!actionsByServer.isEmpty()) {
        // If this is a first attempt to group and send, no replicas, we need replica thread.
        sendMultiAction(actionsByServer, numAttempt, (doStartReplica && !hasUnknown)
            ? currentActions : null, numAttempt > 1 && !hasUnknown);
      }

      if (hasUnknown) {
        actionsByServer = new HashMap>();
        for (Action action : unknownReplicaActions) {
          HRegionLocation loc = getReplicaLocationOrFail(action);
          if (loc == null) continue;
          byte[] regionName = loc.getRegionInfo().getRegionName();
          addAction(loc.getServerName(), regionName, action, actionsByServer, nonceGroup);
        }
        if (!actionsByServer.isEmpty()) {
          sendMultiAction(
              actionsByServer, numAttempt, doStartReplica ? currentActions : null, true);
        }
      }
    }

    private HRegionLocation getReplicaLocationOrFail(Action action) {
      // We are going to try get location once again. For each action, we'll do it once
      // from cache, because the previous calls in the loop might populate it.
      int replicaId = action.getReplicaId();
      RegionLocations locs = findAllLocationsOrFail(action, true);
      if (locs == null) return null; // manageError already called
      HRegionLocation loc = locs.getRegionLocation(replicaId);
      if (loc == null || loc.getServerName() == null) {
        locs = findAllLocationsOrFail(action, false);
        if (locs == null) return null; // manageError already called
        loc = locs.getRegionLocation(replicaId);
      }
      if (loc == null || loc.getServerName() == null) {
        manageLocationError(action, null);
        return null;
      }
      return loc;
    }

    private void manageLocationError(Action action, Exception ex) {
      String msg = "Cannot get replica " + action.getReplicaId()
          + " location for " + action.getAction();
      LOG.error(msg);
      if (ex == null) {
        ex = new IOException(msg);
      }
      manageError(action.getOriginalIndex(), action.getAction(),
          Retry.NO_LOCATION_PROBLEM, ex, null);
    }

    private RegionLocations findAllLocationsOrFail(Action action, boolean useCache) {
      if (action.getAction() == null) throw new IllegalArgumentException("#" + id +
          ", row cannot be null");
      RegionLocations loc = null;
      try {
        loc = connection.locateRegion(
            tableName, action.getAction().getRow(), useCache, true, action.getReplicaId());
      } catch (IOException ex) {
        manageLocationError(action, ex);
      }
      return loc;
    }

    /**
     * Send a multi action structure to the servers, after a delay depending on the attempt
     * number. Asynchronous.
     *
     * @param actionsByServer the actions structured by regions
     * @param numAttempt the attempt number.
     * @param actionsForReplicaThread original actions for replica thread; null on non-first call.
     */
    private void sendMultiAction(Map> actionsByServer,
        int numAttempt, List> actionsForReplicaThread, boolean reuseThread) {
      // Run the last item on the same thread if we are already on a send thread.
      // We hope most of the time it will be the only item, so we can cut down on threads.
      int actionsRemaining = actionsByServer.size();
      // This iteration is by server (the HRegionLocation comparator is by server portion only).
      for (Map.Entry> e : actionsByServer.entrySet()) {
        ServerName server = e.getKey();
        MultiAction multiAction = e.getValue();
        Collection runnables = getNewMultiActionRunnable(server, multiAction,
            numAttempt);
        // make sure we correctly count the number of runnables before we try to reuse the send
        // thread, in case we had to split the request into different runnables because of backoff
        if (runnables.size() > actionsRemaining) {
          actionsRemaining = runnables.size();
        }

        // run all the runnables
        // HBASE-17475: Do not reuse the thread after stack reach a certain depth to prevent stack overflow
        // for now, we use HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER to control the depth
        for (Runnable runnable : runnables) {
          if ((--actionsRemaining == 0) && reuseThread
              && numAttempt % HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER != 0) {
            runnable.run();
          } else {
            try {
              pool.submit(runnable);
            } catch (Throwable t) {
              if (t instanceof RejectedExecutionException) {
                // This should never happen. But as the pool is provided by the end user,
               // let's secure this a little.
               LOG.warn("#" + id + ", the task was rejected by the pool. This is unexpected." +
                  " Server is " + server.getServerName(), t);
              } else {
                // see #HBASE-14359 for more details
                LOG.warn("Caught unexpected exception/error: ", t);
              }
              decTaskCounters(multiAction.getRegions(), server);
              // We're likely to fail again, but this will increment the attempt counter,
             // so it will finish.
              receiveGlobalFailure(multiAction, server, numAttempt, t);
            }
          }
        }
      }

      if (actionsForReplicaThread != null) {
        startWaitingForReplicaCalls(actionsForReplicaThread);
      }
    }

    private Collection getNewMultiActionRunnable(ServerName server,
        MultiAction multiAction,
        int numAttempt) {
      // no stats to manage, just do the standard action
      if (AsyncProcess.this.connection.getStatisticsTracker() == null) {
        if (connection.getConnectionMetrics() != null) {
          connection.getConnectionMetrics().incrNormalRunners();
        }
        incTaskCounters(multiAction.getRegions(), server);
        SingleServerRequestRunnable runnable = createSingleServerRequest(multiAction, numAttempt, server, callsInProgress);
        return Collections.singletonList(Trace.wrap("AsyncProcess.sendMultiAction", runnable));
      }

      // group the actions by the amount of delay
      Map actions = new HashMap(multiAction
          .size());

      // split up the actions
      for (Map.Entry>> e : multiAction.actions.entrySet()) {
        Long backoff = getBackoff(server, e.getKey());
        DelayingRunner runner = actions.get(backoff);
        if (runner == null) {
          actions.put(backoff, new DelayingRunner(backoff, e));
        } else {
          runner.add(e);
        }
      }

      List toReturn = new ArrayList(actions.size());
      for (DelayingRunner runner : actions.values()) {
        incTaskCounters(runner.getActions().getRegions(), server);
        String traceText = "AsyncProcess.sendMultiAction";
        Runnable runnable = createSingleServerRequest(runner.getActions(), numAttempt, server, callsInProgress);
        // use a delay runner only if we need to sleep for some time
        if (runner.getSleepTime() > 0) {
          runner.setRunner(runnable);
          traceText = "AsyncProcess.clientBackoff.sendMultiAction";
          runnable = runner;
          if (connection.getConnectionMetrics() != null) {
            connection.getConnectionMetrics().incrDelayRunners();
            connection.getConnectionMetrics().updateDelayInterval(runner.getSleepTime());
          }
        } else {
          if (connection.getConnectionMetrics() != null) {
            connection.getConnectionMetrics().incrNormalRunners();
          }
        }
        runnable = Trace.wrap(traceText, runnable);
        toReturn.add(runnable);

      }
      return toReturn;
    }

    /**
     * @param server server location where the target region is hosted
     * @param regionName name of the region which we are going to write some data
     * @return the amount of time the client should wait until it submit a request to the
     * specified server and region
     */
    private Long getBackoff(ServerName server, byte[] regionName) {
      ServerStatisticTracker tracker = AsyncProcess.this.connection.getStatisticsTracker();
      ServerStatistics stats = tracker.getStats(server);
      return AsyncProcess.this.connection.getBackoffPolicy()
          .getBackoffTime(server, regionName, stats);
    }

    /**
     * Starts waiting to issue replica calls on a different thread; or issues them immediately.
     */
    private void startWaitingForReplicaCalls(List> actionsForReplicaThread) {
      long startTime = EnvironmentEdgeManager.currentTime();
      ReplicaCallIssuingRunnable replicaRunnable = new ReplicaCallIssuingRunnable(
          actionsForReplicaThread, startTime);
      if (primaryCallTimeoutMicroseconds == 0) {
        // Start replica calls immediately.
        replicaRunnable.run();
      } else {
        // Start the thread that may kick off replica gets.
        // TODO: we could do it on the same thread, but it's a user thread, might be a bad idea.
        try {
          pool.submit(replicaRunnable);
        } catch (RejectedExecutionException ree) {
          LOG.warn("#" + id + ", replica task was rejected by the pool - no replica calls", ree);
        }
      }
    }

    /**
     * Check that we can retry acts accordingly: logs, set the error status.
     *
     * @param originalIndex the position in the list sent
     * @param row           the row
     * @param canRetry      if false, we won't retry whatever the settings.
     * @param throwable     the throwable, if any (can be null)
     * @param server        the location, if any (can be null)
     * @return true if the action can be retried, false otherwise.
     */
    public Retry manageError(int originalIndex, Row row, Retry canRetry,
                                Throwable throwable, ServerName server) {
      if (canRetry == Retry.YES
          && throwable != null && (throwable instanceof DoNotRetryIOException ||
          throwable instanceof NeedUnmanagedConnectionException)) {
        canRetry = Retry.NO_NOT_RETRIABLE;
      }

      if (canRetry != Retry.YES) {
        // Batch.Callback was not called on failure in 0.94. We keep this.
        setError(originalIndex, row, throwable, server);
      } else if (isActionComplete(originalIndex, row)) {
        canRetry = Retry.NO_OTHER_SUCCEEDED;
      }
      return canRetry;
    }

    /**
     * Resubmit all the actions from this multiaction after a failure.
     *
     * @param rsActions  the actions still to do from the initial list
     * @param server   the destination
     * @param numAttempt the number of attempts so far
     * @param t the throwable (if any) that caused the resubmit
     */
    private void receiveGlobalFailure(
        MultiAction rsActions, ServerName server, int numAttempt, Throwable t) {
      errorsByServer.reportServerError(server);
      Retry canRetry = errorsByServer.canRetryMore(numAttempt)
          ? Retry.YES : Retry.NO_RETRIES_EXHAUSTED;

      if (tableName == null && ClientExceptionsUtil.isMetaClearingException(t)) {
        // tableName is null when we made a cross-table RPC call.
        connection.clearCaches(server);
      }
      int failed = 0, stopped = 0;
      List> toReplay = new ArrayList>();
      for (Map.Entry>> e : rsActions.actions.entrySet()) {
        byte[] regionName = e.getKey();
        byte[] row = e.getValue().iterator().next().getAction().getRow();
        // Do not use the exception for updating cache because it might be coming from
        // any of the regions in the MultiAction.
        try {
          if (tableName != null) {
            connection.updateCachedLocations(tableName, regionName, row,
              ClientExceptionsUtil.isMetaClearingException(t) ? null : t, server);
          }
        } catch (Throwable ex) {
          // That should never happen, but if it did, we want to make sure
          // we still process errors
          LOG.error("Couldn't update cached region locations: " + ex);
        }
        for (Action action : e.getValue()) {
          Retry retry = manageError(
              action.getOriginalIndex(), action.getAction(), canRetry, t, server);
          if (retry == Retry.YES) {
            toReplay.add(action);
          } else if (retry == Retry.NO_OTHER_SUCCEEDED) {
            ++stopped;
          } else {
            ++failed;
          }
        }
      }

      if (toReplay.isEmpty()) {
        logNoResubmit(server, numAttempt, rsActions.size(), t, failed, stopped);
      } else {
        resubmit(server, toReplay, numAttempt, rsActions.size(), t);
      }
    }

    /**
     * Log as much info as possible, and, if there is something to replay,
     * submit it again after a back off sleep.
     */
    private void resubmit(ServerName oldServer, List> toReplay,
        int numAttempt, int failureCount, Throwable throwable) {
      // We have something to replay. We're going to sleep a little before.

      // We have two contradicting needs here:
      //  1) We want to get the new location after having slept, as it may change.
      //  2) We want to take into account the location when calculating the sleep time.
      //  3) If all this is just because the response needed to be chunked try again FAST.
      // It should be possible to have some heuristics to take the right decision. Short term,
      //  we go for one.
      boolean retryImmediately = throwable instanceof RetryImmediatelyException;
      int nextAttemptNumber = retryImmediately ? numAttempt : numAttempt + 1;
      long backOffTime;
      if (retryImmediately) {
        backOffTime = 0;
      } else if (throwable instanceof CallQueueTooBigException) {
        // Give a special check on CQTBE, see #HBASE-17114
        backOffTime = errorsByServer.calculateBackoffTime(oldServer, pauseForCQTBE);
      } else {
        backOffTime = errorsByServer.calculateBackoffTime(oldServer, pause);
      }
      if (numAttempt > startLogErrorsCnt) {
        // We use this value to have some logs when we have multiple failures, but not too many
        //  logs, as errors are to be expected when a region moves, splits and so on
        LOG.info(createLog(numAttempt, failureCount, toReplay.size(),
            oldServer, throwable, backOffTime, true, null, -1, -1));
      }

      try {
        if (backOffTime > 0) {
          Thread.sleep(backOffTime);
        }
      } catch (InterruptedException e) {
        LOG.warn("#" + id + ", not sent: " + toReplay.size() + " operations, " + oldServer, e);
        Thread.currentThread().interrupt();
        return;
      }

      groupAndSendMultiAction(toReplay, nextAttemptNumber);
    }

    private void logNoResubmit(ServerName oldServer, int numAttempt,
        int failureCount, Throwable throwable, int failed, int stopped) {
      if (failureCount != 0 || numAttempt > startLogErrorsCnt + 1) {
        String timeStr = new Date(errorsByServer.getStartTrackingTime()).toString();
        String logMessage = createLog(numAttempt, failureCount, 0, oldServer,
            throwable, -1, false, timeStr, failed, stopped);
        if (failed != 0) {
          // Only log final failures as warning
          LOG.warn(logMessage);
        } else {
          LOG.info(logMessage);
        }
      }
    }

    @VisibleForTesting
    long getActionsInProgress() {
      return actionsInProgress.get();
    }
    /**
     * Called when we receive the result of a server query.
     *
     * @param multiAction    - the multiAction we sent
     * @param server       - the location. It's used as a server name.
     * @param responses      - the response, if any
     * @param numAttempt     - the attempt
     */
    private void receiveMultiAction(MultiAction multiAction,
        ServerName server, MultiResponse responses, int numAttempt) {
       assert responses != null;

      // Success or partial success
      // Analyze detailed results. We can still have individual failures to be redo.
      // two specific throwables are managed:
      //  - DoNotRetryIOException: we continue to retry for other actions
      //  - RegionMovedException: we update the cache with the new region location

      List> toReplay = new ArrayList>();
      Throwable throwable = null;
      int failureCount = 0;
      Retry retry = null;

      Map results = responses.getResults();
      updateStats(server, results);

      int failed = 0;
      int stopped = 0;
      // Go by original action.
      for (Map.Entry>> regionEntry : multiAction.actions.entrySet()) {
        byte[] regionName = regionEntry.getKey();
        Throwable regionException = responses.getExceptions().get(regionName);
        if (tableName == null && regionException != null &&
              ClientExceptionsUtil.isMetaClearingException(regionException)) {
          // For multi-actions, we don't have a table name, but we want to make sure to clear the
          // cache in case there were location-related exceptions. We don't to clear the cache
          // for every possible exception that comes through, however.
          connection.clearCaches(server);
        }
        Map regionResults;
        if (results.containsKey(regionName)) {
          regionResults = results.get(regionName).result;
        } else {
          regionResults = Collections.emptyMap();
        }
        boolean regionFailureRegistered = false;
        for (Action sentAction : regionEntry.getValue()) {
          Object result = regionResults.get(sentAction.getOriginalIndex());
          if (result == null) {
            if (regionException == null) {
              LOG.error("Server sent us neither results nor exceptions for " + Bytes
                .toStringBinary(regionName) + ", numAttempt:" + numAttempt);
              regionException = new RuntimeException("Invalid response");
            }
            // If the row operation encounters the region-lever error, the exception of action
            // may be null.
            result = regionException;
          }
          // Failure: retry if it's make sense else update the errors lists
          if (result instanceof Throwable) {
            Row row = sentAction.getAction();
            throwable = regionException != null ? regionException
              : ClientExceptionsUtil.findException(result);
            // Register corresponding failures once per server/once per region.
            if (!regionFailureRegistered) {
              regionFailureRegistered = true;
              try {
                connection.updateCachedLocations(
                  tableName, regionName, row.getRow(), result, server);
              } catch (Throwable ex) {
                // That should never happen, but if it did, we want to make sure
                // we still process errors
                LOG.error("Couldn't update cached region locations: " + ex);
              }
            }
            if (retry == null) {
              errorsByServer.reportServerError(server);
              // We determine canRetry only once for all calls, after reporting server failure.
              retry =
                errorsByServer.canRetryMore(numAttempt) ? Retry.YES : Retry.NO_RETRIES_EXHAUSTED;
            }
            ++failureCount;
            switch (manageError(sentAction.getOriginalIndex(), row, retry, (Throwable) result,
              server)) {
              case YES:
                toReplay.add(sentAction);
                break;
              case NO_OTHER_SUCCEEDED:
                ++stopped;
                break;
              default:
                ++failed;
                break;
            }
          } else {
            if (callback != null) {
              try {
                //noinspection unchecked
                // TODO: would callback expect a replica region name if it gets one?
                this.callback.update(regionName, sentAction.getAction().getRow(), (CResult) result);
              } catch (Throwable t) {
                LOG.error("User callback threw an exception for "
                    + Bytes.toStringBinary(regionName) + ", ignoring", t);
              }
            }
            setResult(sentAction, result);
          }
        }
      }
      if (toReplay.isEmpty()) {
        logNoResubmit(server, numAttempt, failureCount, throwable, failed, stopped);
      } else {
        resubmit(server, toReplay, numAttempt, failureCount, throwable);
      }
    }

    private String createLog(int numAttempt, int failureCount, int replaySize, ServerName sn,
        Throwable error, long backOffTime, boolean willRetry, String startTime,
        int failed, int stopped) {
      StringBuilder sb = new StringBuilder();
      sb.append("#").append(id).append(", table=").append(tableName).append(", ")
        .append("attempt=").append(numAttempt)
        .append("/").append(numTries).append(" ");

      if (failureCount > 0 || error != null){
        sb.append("failed=").append(failureCount).append("ops").append(", last exception: ").
            append(error == null ? "null" : error);
      } else {
        sb.append("succeeded");
      }

      sb.append(" on ").append(sn).append(", tracking started ").append(startTime);

      if (willRetry) {
        sb.append(", retrying after=").append(backOffTime).append("ms").
            append(", replay=").append(replaySize).append("ops");
      } else if (failureCount > 0) {
        if (stopped > 0) {
          sb.append("; not retrying ").append(stopped).append(" due to success from other replica");
        }
        if (failed > 0) {
          sb.append("; not retrying ").append(failed).append(" - final failure");
        }

      }

      return sb.toString();
    }

    /**
     * Sets the non-error result from a particular action.
     * @param action Action (request) that the server responded to.
     * @param result The result.
     */
    private void setResult(Action action, Object result) {
      if (result == null) {
        throw new RuntimeException("Result cannot be null");
      }
      ReplicaResultState state = null;
      boolean isStale = !RegionReplicaUtil.isDefaultReplica(action.getReplicaId());
      int index = action.getOriginalIndex();
      if (results == null) {
         decActionCounter(index);
         return; // Simple case, no replica requests.
      } else if ((state = trySetResultSimple(
          index, action.getAction(), false, result, null, isStale)) == null) {
        return; // Simple case, no replica requests.
      }
      assert state != null;
      // At this point we know that state is set to replica tracking class.
      // It could be that someone else is also looking at it; however, we know there can
      // only be one state object, and only one thread can set callCount to 0. Other threads
      // will either see state with callCount 0 after locking it; or will not see state at all
      // we will replace it with the result.
      synchronized (state) {
        if (state.callCount == 0) {
          return; // someone already set the result
        }
        state.callCount = 0;
      }
      synchronized (replicaResultLock) {
        if (results[index] != state) {
          throw new AssertionError("We set the callCount but someone else replaced the result");
        }
        results[index] = result;
      }

      decActionCounter(index);
    }

    /**
     * Sets the error from a particular action.
     * @param index Original action index.
     * @param row Original request.
     * @param throwable The resulting error.
     * @param server The source server.
     */
    private void setError(int index, Row row, Throwable throwable, ServerName server) {
      ReplicaResultState state = null;
      if (results == null) {
        // Note that we currently cannot have replica requests with null results. So it shouldn't
        // happen that multiple replica calls will call dAC for same actions with results == null.
        // Only one call per action should be present in this case.
        errors.add(throwable, row, server);
        decActionCounter(index);
        return; // Simple case, no replica requests.
      } else if ((state = trySetResultSimple(
          index, row, true, throwable, server, false)) == null) {
        return; // Simple case, no replica requests.
      }
      assert state != null;
      BatchErrors target = null; // Error will be added to final errors, or temp replica errors.
      boolean isActionDone = false;
      synchronized (state) {
        switch (state.callCount) {
          case 0: return; // someone already set the result
          case 1: { // All calls failed, we are the last error.
            target = errors;
            isActionDone = true;
            break;
          }
          default: {
            assert state.callCount > 1;
            if (state.replicaErrors == null) {
              state.replicaErrors = new BatchErrors();
            }
            target = state.replicaErrors;
            break;
          }
        }
        --state.callCount;
      }
      target.add(throwable, row, server);
      if (isActionDone) {
        if (state.replicaErrors != null) { // last call, no need to lock
          errors.merge(state.replicaErrors);
        }
        // See setResult for explanations.
        synchronized (replicaResultLock) {
          if (results[index] != state) {
            throw new AssertionError("We set the callCount but someone else replaced the result");
          }
          results[index] = throwable;
        }
        decActionCounter(index);
      }
    }

    /**
     * Checks if the action is complete; used on error to prevent needless retries.
     * Does not synchronize, assuming element index/field accesses are atomic.
     * This is an opportunistic optimization check, doesn't have to be strict.
     * @param index Original action index.
     * @param row Original request.
     */
    private boolean isActionComplete(int index, Row row) {
      if (!isReplicaGet(row)) return false;
      Object resObj = results[index];
      return (resObj != null) && (!(resObj instanceof ReplicaResultState)
          || ((ReplicaResultState)resObj).callCount == 0);
    }

    /**
     * Tries to set the result or error for a particular action as if there were no replica calls.
     * @return null if successful; replica state if there were in fact replica calls.
     */
    private ReplicaResultState trySetResultSimple(int index, Row row, boolean isError,
        Object result, ServerName server, boolean isFromReplica) {
      Object resObj = null;
      if (!isReplicaGet(row)) {
        if (isFromReplica) {
          throw new AssertionError("Unexpected stale result for " + row);
        }
        results[index] = result;
      } else {
        synchronized (replicaResultLock) {
          if ((resObj = results[index]) == null) {
            if (isFromReplica) {
              throw new AssertionError("Unexpected stale result for " + row);
            }
            results[index] = result;
          }
        }
      }

      ReplicaResultState rrs =
          (resObj instanceof ReplicaResultState) ? (ReplicaResultState)resObj : null;
      if (rrs == null && isError) {
        // The resObj is not replica state (null or already set).
        errors.add((Throwable)result, row, server);
      }

      if (resObj == null) {
        // resObj is null - no replica calls were made.
        decActionCounter(index);
        return null;
      }
      return rrs;
    }

    private void decActionCounter(int index) {
      long actionsRemaining = actionsInProgress.decrementAndGet();
      if (actionsRemaining < 0) {
        String error = buildDetailedErrorMsg("Incorrect actions in progress", index);
        throw new AssertionError(error);
      } else if (actionsRemaining == 0) {
        synchronized (actionsInProgress) {
          actionsInProgress.notifyAll();
        }
      }
    }

    private String buildDetailedErrorMsg(String string, int index) {
      StringBuilder error = new StringBuilder(string);
      error.append("; called for ").
        append(index).
        append(", actionsInProgress ").
        append(actionsInProgress.get()).
        append("; replica gets: ");
      if (replicaGetIndices != null) {
        for (int i = 0; i < replicaGetIndices.length; ++i) {
          error.append(replicaGetIndices[i]).append(", ");
        }
      } else {
        error.append(hasAnyReplicaGets ? "all" : "none");
      }
      error.append("; results ");
      if (results != null) {
        for (int i = 0; i < results.length; ++i) {
          Object o = results[i];
          error.append(((o == null) ? "null" : o.toString())).append(", ");
        }
      }
      return error.toString();
    }

    @Override
    public void waitUntilDone() throws InterruptedIOException {
      try {
        waitUntilDone(Long.MAX_VALUE);
      } catch (InterruptedException iex) {
        throw new InterruptedIOException(iex.getMessage());
      } finally {
        if (callsInProgress != null) {
          for (PayloadCarryingServerCallable clb : callsInProgress) {
            clb.cancel();
          }
        }
      }
    }

    private boolean waitUntilDone(long cutoff) throws InterruptedException {
      boolean hasWait = cutoff != Long.MAX_VALUE;
      long lastLog = EnvironmentEdgeManager.currentTime();
      long currentInProgress;
      while (0 != (currentInProgress = actionsInProgress.get())) {
        long now = EnvironmentEdgeManager.currentTime();
        if (hasWait && (now * 1000L) > cutoff) {
          return false;
        }
        if (!hasWait) { // Only log if wait is infinite.
          if (now > lastLog + 10000) {
            lastLog = now;
            LOG.info("#" + id + ", waiting for " + currentInProgress
                + "  actions to finish on table: " + tableName);
            if (currentInProgress <= thresholdToLogUndoneTaskDetails) {
              logDetailsOfUndoneTasks(currentInProgress);
            }
          }
        }
        synchronized (actionsInProgress) {
          if (actionsInProgress.get() == 0) break;
          if (!hasWait) {
            actionsInProgress.wait(10);
          } else {
            long waitMicroSecond = Math.min(100000L, (cutoff - now * 1000L));
            TimeUnit.MICROSECONDS.timedWait(actionsInProgress, waitMicroSecond);
          }
        }
      }
      return true;
    }

    @Override
    public boolean hasError() {
      return errors.hasErrors();
    }

    @Override
    public List getFailedOperations() {
      return errors.actions;
    }

    @Override
    public RetriesExhaustedWithDetailsException getErrors() {
      return errors.makeException(logBatchErrorDetails);
    }

    @Override
    public Object[] getResults() throws InterruptedIOException {
      waitUntilDone();
      return results;
    }

    /**
     * Create a callable. Isolated to be easily overridden in the tests.
     */
    @VisibleForTesting
    protected MultiServerCallable createCallable(final ServerName server,
        TableName tableName, final MultiAction multi) {
      return new MultiServerCallable(connection, tableName, server,
          AsyncProcess.this.rpcFactory, multi, rpcTimeout, tracker, multi.getPriority());
    }
  }

  @VisibleForTesting
  protected void updateStats(ServerName server, Map results) {
    boolean metrics = AsyncProcess.this.connection.getConnectionMetrics() != null;
    boolean stats = AsyncProcess.this.connection.getStatisticsTracker() != null;
    if (!stats && !metrics) {
      return;
    }
    for (Map.Entry regionStats : results.entrySet()) {
      byte[] regionName = regionStats.getKey();
      ClientProtos.RegionLoadStats stat = regionStats.getValue().getStat();
      ResultStatsUtil.updateStats(AsyncProcess.this.connection.getStatisticsTracker(), server,
          regionName, stat);
      ResultStatsUtil.updateStats(AsyncProcess.this.connection.getConnectionMetrics(),
          server, regionName, stat);
    }
  }

  @VisibleForTesting
   AsyncRequestFutureImpl createAsyncRequestFuture(
      TableName tableName, List> actions, long nonceGroup, ExecutorService pool,
      Batch.Callback callback, Object[] results, boolean needResults,
      PayloadCarryingServerCallable callable, int operationTimeout, int rpcTimeout) {
    return new AsyncRequestFutureImpl(
        tableName, actions, nonceGroup, getPool(pool), needResults,
        results, callback, callable, operationTimeout, rpcTimeout);
  }

  /**
   * Create a caller. Isolated to be easily overridden in the tests.
   */
  @VisibleForTesting
  protected RpcRetryingCaller createCaller(PayloadCarryingServerCallable callable,
      int rpcTimeout) {
    return rpcCallerFactory. newCaller(rpcTimeout);
  }

  @VisibleForTesting
  /** Waits until all outstanding tasks are done. Used in tests. */
  void waitUntilDone() throws InterruptedIOException {
    waitForMaximumCurrentTasks(0, null);
  }

  /** Wait until the async does not have more than max tasks in progress. */
  private void waitForMaximumCurrentTasks(int max, String tableName)
      throws InterruptedIOException {
    waitForMaximumCurrentTasks(max, tasksInProgress, id, tableName);
  }

  // Break out this method so testable
  @VisibleForTesting
  void waitForMaximumCurrentTasks(int max, final AtomicLong tasksInProgress, final long id,
      String tableName) throws InterruptedIOException {
    long lastLog = EnvironmentEdgeManager.currentTime();
    long currentInProgress, oldInProgress = Long.MAX_VALUE;
    while ((currentInProgress = tasksInProgress.get()) > max) {
      if (oldInProgress != currentInProgress) { // Wait for in progress to change.
        long now = EnvironmentEdgeManager.currentTime();
        if (now > lastLog + 10000) {
          lastLog = now;
          LOG.info("#" + id + ", waiting for some tasks to finish. Expected max="
              + max + ", tasksInProgress=" + currentInProgress +
              " hasError=" + hasError() + (tableName == null ? "" : ", tableName=" + tableName));
          if (currentInProgress <= thresholdToLogUndoneTaskDetails) {
            logDetailsOfUndoneTasks(currentInProgress);
          }
        }
      }
      oldInProgress = currentInProgress;
      try {
        synchronized (tasksInProgress) {
          if (tasksInProgress.get() == oldInProgress) {
            tasksInProgress.wait(10);
          }
        }
      } catch (InterruptedException e) {
        throw new InterruptedIOException("#" + id + ", interrupted." +
            " currentNumberOfTask=" + currentInProgress);
      }
    }
  }

  private void logDetailsOfUndoneTasks(long taskInProgress) {
    ArrayList servers = new ArrayList();
    for (Map.Entry entry : taskCounterPerServer.entrySet()) {
      if (entry.getValue().get() > 0) {
        servers.add(entry.getKey());
      }
    }
    LOG.info("Left over " + taskInProgress + " task(s) are processed on server(s): " + servers);
    if (taskInProgress <= THRESHOLD_TO_LOG_REGION_DETAILS) {
      ArrayList regions = new ArrayList();
      for (Map.Entry entry : taskCounterPerRegion.entrySet()) {
        if (entry.getValue().get() > 0) {
          regions.add(Bytes.toString(entry.getKey()));
        }
      }
      LOG.info("Regions against which left over task(s) are processed: " + regions);
    }
  }

  /**
   * Only used w/useGlobalErrors ctor argument, for HTable backward compat.
   * @return Whether there were any errors in any request since the last time
   *          {@link #waitForAllPreviousOpsAndReset(List)} was called, or AP was created.
   */
  public boolean hasError() {
    return globalErrors.hasErrors();
  }

  /**
   * Only used w/useGlobalErrors ctor argument, for HTable backward compat.
   * Waits for all previous operations to finish, and returns errors and (optionally)
   * failed operations themselves.
   * @param failedRows an optional list into which the rows that failed since the last time
   *        {@link #waitForAllPreviousOpsAndReset(List)} was called, or AP was created, are saved.
   * @param tableName name of the table
   * @return all the errors since the last time {@link #waitForAllPreviousOpsAndReset(List)}
   *          was called, or AP was created.
   */
  public RetriesExhaustedWithDetailsException waitForAllPreviousOpsAndReset(
      List failedRows, String tableName) throws InterruptedIOException {
    waitForMaximumCurrentTasks(0, tableName);
    if (!globalErrors.hasErrors()) {
      return null;
    }
    if (failedRows != null) {
      failedRows.addAll(globalErrors.actions);
    }
    RetriesExhaustedWithDetailsException result = globalErrors.makeException(logBatchErrorDetails);
    globalErrors.clear();
    return result;
  }

  /**
   * increment the tasks counters for a given set of regions. MT safe.
   */
  protected void incTaskCounters(Collection regions, ServerName sn) {
    tasksInProgress.incrementAndGet();

    AtomicInteger serverCnt = taskCounterPerServer.get(sn);
    if (serverCnt == null) {
      taskCounterPerServer.putIfAbsent(sn, new AtomicInteger());
      serverCnt = taskCounterPerServer.get(sn);
    }
    serverCnt.incrementAndGet();

    for (byte[] regBytes : regions) {
      AtomicInteger regionCnt = taskCounterPerRegion.get(regBytes);
      if (regionCnt == null) {
        regionCnt = new AtomicInteger();
        AtomicInteger oldCnt = taskCounterPerRegion.putIfAbsent(regBytes, regionCnt);
        if (oldCnt != null) {
          regionCnt = oldCnt;
        }
      }
      regionCnt.incrementAndGet();
    }
  }

  /**
   * Decrements the counters for a given region and the region server. MT Safe.
   */
  protected void decTaskCounters(Collection regions, ServerName sn) {
    for (byte[] regBytes : regions) {
      AtomicInteger regionCnt = taskCounterPerRegion.get(regBytes);
      regionCnt.decrementAndGet();
    }

    taskCounterPerServer.get(sn).decrementAndGet();
    tasksInProgress.decrementAndGet();
    synchronized (tasksInProgress) {
      tasksInProgress.notifyAll();
    }
  }

  /**
   * Creates the server error tracker to use inside process.
   * Currently, to preserve the main assumption about current retries, and to work well with
   * the retry-limit-based calculation, the calculation is local per Process object.
   * We may benefit from connection-wide tracking of server errors.
   * @return ServerErrorTracker to use, null if there is no ServerErrorTracker on this connection
   */
  protected ConnectionManager.ServerErrorTracker createServerErrorTracker() {
    return new ConnectionManager.ServerErrorTracker(
        this.serverTrackerTimeout, this.numTries);
  }

  private static boolean isReplicaGet(Row row) {
    return (row instanceof Get) && (((Get)row).getConsistency() == Consistency.TIMELINE);
  }

  /**
   * For manageError. Only used to make logging more clear, we don't actually care why we don't retry.
   */
  private enum Retry {
    YES,
    NO_LOCATION_PROBLEM,
    NO_NOT_RETRIABLE,
    NO_RETRIES_EXHAUSTED,
    NO_OTHER_SUCCEEDED
  }

  /**
   * Collect all advices from checkers and make the final decision.
   */
  @VisibleForTesting
  static class RowCheckerHost {
    private final List checkers;
    private boolean isEnd = false;
    RowCheckerHost(final List checkers) {
      this.checkers = checkers;
    }
    void reset() throws InterruptedIOException {
      isEnd = false;
      InterruptedIOException e = null;
      for (RowChecker checker : checkers) {
        try {
          checker.reset();
        } catch (InterruptedIOException ex) {
          e = ex;
        }
      }
      if (e != null) {
        throw e;
      }
    }
    ReturnCode canTakeOperation(HRegionLocation loc, long rowSize) {
      if (isEnd) {
        return ReturnCode.END;
      }
      ReturnCode code = ReturnCode.INCLUDE;
      for (RowChecker checker : checkers) {
        switch (checker.canTakeOperation(loc, rowSize)) {
          case END:
            isEnd = true;
            code = ReturnCode.END;
            break;
          case SKIP:
            code = ReturnCode.SKIP;
            break;
          case INCLUDE:
          default:
            break;
        }
        if (code == ReturnCode.END) {
          break;
        }
      }
      for (RowChecker checker : checkers) {
        checker.notifyFinal(code, loc, rowSize);
      }
      return code;
    }
  }

  /**
   * Provide a way to control the flow of rows iteration.
   */
  @VisibleForTesting
  interface RowChecker {
    enum ReturnCode {
      /**
       * Accept current row.
       */
      INCLUDE,
      /**
       * Skip current row.
       */
      SKIP,
      /**
       * No more row can be included.
       */
      END
    };
    ReturnCode canTakeOperation(HRegionLocation loc, long rowSize);
    /**
     * Add the final ReturnCode to the checker.
     * The ReturnCode may be reversed, so the checker need the final decision to update
     * the inner state.
     */
    void notifyFinal(ReturnCode code, HRegionLocation loc, long rowSize);
    /**
     * Reset the inner state.
     */
    void reset() throws InterruptedIOException ;
  }

  /**
   * limit the heapsize of total submitted data.
   * Reduce the limit of heapsize for submitting quickly
   * if there is no running task.
   */
  @VisibleForTesting
  static class SubmittedSizeChecker implements RowChecker {
    private final long maxHeapSizeSubmit;
    private long heapSize = 0;
    SubmittedSizeChecker(final long maxHeapSizeSubmit) {
      this.maxHeapSizeSubmit = maxHeapSizeSubmit;
    }
    @Override
    public ReturnCode canTakeOperation(HRegionLocation loc, long rowSize) {
      if (heapSize >= maxHeapSizeSubmit) {
        return ReturnCode.END;
      }
      return ReturnCode.INCLUDE;
    }

    @Override
    public void notifyFinal(ReturnCode code, HRegionLocation loc, long rowSize) {
      if (code == ReturnCode.INCLUDE) {
        heapSize += rowSize;
      }
    }

    @Override
    public void reset() {
      heapSize = 0;
    }
  }
  /**
   * limit the max number of tasks in an AsyncProcess.
   */
  @VisibleForTesting
  static class TaskCountChecker implements RowChecker {
    private static final long MAX_WAITING_TIME = 1000; //ms
    private final Set regionsIncluded = new HashSet<>();
    private final Set serversIncluded = new HashSet<>();
    private final int maxConcurrentTasksPerRegion;
    private final int maxTotalConcurrentTasks;
    private final int maxConcurrentTasksPerServer;
    private final Map taskCounterPerRegion;
    private final Map taskCounterPerServer;
    private final Set busyRegions = new TreeSet<>(Bytes.BYTES_COMPARATOR);
    private final AtomicLong tasksInProgress;
    TaskCountChecker(final int maxTotalConcurrentTasks,
      final int maxConcurrentTasksPerServer,
      final int maxConcurrentTasksPerRegion,
      final AtomicLong tasksInProgress,
      final Map taskCounterPerServer,
      final Map taskCounterPerRegion) {
      this.maxTotalConcurrentTasks = maxTotalConcurrentTasks;
      this.maxConcurrentTasksPerRegion = maxConcurrentTasksPerRegion;
      this.maxConcurrentTasksPerServer = maxConcurrentTasksPerServer;
      this.taskCounterPerRegion = taskCounterPerRegion;
      this.taskCounterPerServer = taskCounterPerServer;
      this.tasksInProgress = tasksInProgress;
    }
    @Override
    public void reset() throws InterruptedIOException {
      // prevent the busy-waiting
      waitForRegion();
      regionsIncluded.clear();
      serversIncluded.clear();
      busyRegions.clear();
    }
    private void waitForRegion() throws InterruptedIOException {
      if (busyRegions.isEmpty()) {
        return;
      }
      EnvironmentEdge ee = EnvironmentEdgeManager.getDelegate();
      final long start = ee.currentTime();
      while ((ee.currentTime() - start) <= MAX_WAITING_TIME) {
        for (byte[] region : busyRegions) {
          AtomicInteger count = taskCounterPerRegion.get(region);
          if (count == null || count.get() < maxConcurrentTasksPerRegion) {
            return;
          }
        }
        try {
          synchronized (tasksInProgress) {
            tasksInProgress.wait(10);
          }
        } catch (InterruptedException e) {
          throw new InterruptedIOException("Interrupted." +
              " tasksInProgress=" + tasksInProgress);
        }
      }
    }
    /**
     * 1) check the regions is allowed.
     * 2) check the concurrent tasks for regions.
     * 3) check the total concurrent tasks.
     * 4) check the concurrent tasks for server.
     * @param loc
     * @param rowSize
     * @return
     */
    @Override
    public ReturnCode canTakeOperation(HRegionLocation loc, long rowSize) {

      HRegionInfo regionInfo = loc.getRegionInfo();
      if (regionsIncluded.contains(regionInfo)) {
        // We already know what to do with this region.
        return ReturnCode.INCLUDE;
      }
      AtomicInteger regionCnt = taskCounterPerRegion.get(loc.getRegionInfo().getRegionName());
      if (regionCnt != null && regionCnt.get() >= maxConcurrentTasksPerRegion) {
        // Too many tasks on this region already.
        return ReturnCode.SKIP;
      }
      int newServers = serversIncluded.size()
        + (serversIncluded.contains(loc.getServerName()) ? 0 : 1);
      if ((newServers + tasksInProgress.get()) > maxTotalConcurrentTasks) {
        // Too many tasks.
        return ReturnCode.SKIP;
      }
      AtomicInteger serverCnt = taskCounterPerServer.get(loc.getServerName());
      if (serverCnt != null && serverCnt.get() >= maxConcurrentTasksPerServer) {
        // Too many tasks for this individual server
        return ReturnCode.SKIP;
      }
      return ReturnCode.INCLUDE;
    }

    @Override
    public void notifyFinal(ReturnCode code, HRegionLocation loc, long rowSize) {
      if (code == ReturnCode.INCLUDE) {
        regionsIncluded.add(loc.getRegionInfo());
        serversIncluded.add(loc.getServerName());
      }
      busyRegions.add(loc.getRegionInfo().getRegionName());
    }
  }

  /**
   * limit the request size for each regionserver.
   */
  @VisibleForTesting
  static class RequestSizeChecker implements RowChecker {
    private final long maxHeapSizePerRequest;
    private final Map serverRequestSizes = new HashMap<>();
    RequestSizeChecker(final long maxHeapSizePerRequest) {
      this.maxHeapSizePerRequest = maxHeapSizePerRequest;
    }
    @Override
    public void reset() {
      serverRequestSizes.clear();
    }
    @Override
    public ReturnCode canTakeOperation(HRegionLocation loc, long rowSize) {
      // Is it ok for limit of request size?
      long currentRequestSize = serverRequestSizes.containsKey(loc.getServerName()) ?
        serverRequestSizes.get(loc.getServerName()) : 0L;
      // accept at least one request
      if (currentRequestSize == 0 || currentRequestSize + rowSize <= maxHeapSizePerRequest) {
        return ReturnCode.INCLUDE;
      }
      return ReturnCode.SKIP;
    }

    @Override
    public void notifyFinal(ReturnCode code, HRegionLocation loc, long rowSize) {
      if (code == ReturnCode.INCLUDE) {
        long currentRequestSize = serverRequestSizes.containsKey(loc.getServerName()) ?
          serverRequestSizes.get(loc.getServerName()) : 0L;
        serverRequestSizes.put(loc.getServerName(), currentRequestSize + rowSize);
      }
    }
  }

  public static class ListRowAccess implements RowAccess {
    private final List data;
    ListRowAccess(final List data) {
      this.data = data;
    }

    @Override
    public int size() {
      return data.size();
    }

    @Override
    public boolean isEmpty() {
      return data.isEmpty();
    }

    @Override
    public Iterator iterator() {
      return data.iterator();
    }
  }
}