All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.hadoop.hbase.client.ScannerCallableWithReplicas Maven / Gradle / Ivy

There is a newer version: 3.0.0-beta-1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.client;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.CancellationException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.RegionLocations;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ScannerCallable.MoreResults;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class has the logic for handling scanners for regions with and without replicas. 1. A scan
 * is attempted on the default (primary) region, or a specific region. 2. The scanner sends all the
 * RPCs to the default/specific region until it is done, or, there is a timeout on the
 * default/specific region (a timeout of zero is disallowed). 3. If there is a timeout in (2) above,
 * scanner(s) is opened on the non-default replica(s) only for Consistency.TIMELINE without specific
 * replica id specified. 4. The results from the first successful scanner are taken, and it is
 * stored which server returned the results. 5. The next RPCs are done on the above stored server
 * until it is done or there is a timeout, in which case, the other replicas are queried (as in (3)
 * above).
 */
@InterfaceAudience.Private
class ScannerCallableWithReplicas implements RetryingCallable {
  private static final Logger LOG = LoggerFactory.getLogger(ScannerCallableWithReplicas.class);

  /**
   * Callable for the replica that is currently being scanned. Set to null by {@code call(int)}
   * once the close RPC for it has been issued.
   */
  volatile ScannerCallable currentScannerCallable;
  /** Reset at the start of every {@code call(int)}; set when a different callable answers first. */
  AtomicBoolean replicaSwitched = new AtomicBoolean(false);
  private final ClusterConnection cConnection;
  /** Executor used both for the replica RPCs and for the asynchronous "close" of losing scanners. */
  protected final ExecutorService pool;
  private final boolean useScannerTimeoutForNextCalls;
  /** How long to wait on the current replica before fanning out; interpreted as microseconds. */
  protected final int timeBeforeReplicas;
  private final Scan scan;
  private final int retries;
  /** Last {@link Result} of the most recent non-empty batch; used to position replica scanners. */
  private Result lastResult;
  private final RpcRetryingCaller<Result[]> caller;
  private final TableName tableName;
  private final Configuration conf;
  private final int scannerTimeout;
  private final int readRpcTimeout;
  /** Callables that were submitted and have neither been picked as the winner nor closed yet. */
  private final Set<ScannerCallable> outstandingCallables = new HashSet<>();
  private boolean someRPCcancelled = false; // required for testing purposes only
  /** Number of region locations for the scanned row; looked up lazily on the first call. */
  private int regionReplication = 0;

  /**
   * Creates a replica-aware wrapper around {@code baseCallable}.
   * @param tableName                     table being scanned
   * @param cConnection                   connection used to look up region locations and
   *                                      configuration
   * @param baseCallable                  callable that becomes the initial current scanner
   * @param pool                          executor the replica RPCs are submitted to
   * @param timeBeforeReplicas            time to wait for the current replica before the other
   *                                      replicas are tried; must not be negative
   * @param scan                          the scan being executed
   * @param retries                       retry count reported when an enriched exception is thrown
   * @param readRpcTimeout                rpc timeout used when the scanner timeout does not apply
   * @param scannerTimeout                scanner timeout, also used for closing losing scanners
   * @param useScannerTimeoutForNextCalls whether "next" calls on an open scanner use
   *                                      {@code scannerTimeout} as their rpc timeout
   * @param caching                       not used by this constructor
   * @param conf                          configuration used to build retrying callers
   * @param caller                        caller reused for non-TIMELINE scans
   * @throws IllegalArgumentException if {@code timeBeforeReplicas} is negative
   */
  public ScannerCallableWithReplicas(TableName tableName, ClusterConnection cConnection,
    ScannerCallable baseCallable, ExecutorService pool, int timeBeforeReplicas, Scan scan,
    int retries, int readRpcTimeout, int scannerTimeout, boolean useScannerTimeoutForNextCalls,
    int caching, Configuration conf, RpcRetryingCaller<Result[]> caller) {
    // Validate before touching any state.
    if (timeBeforeReplicas < 0) {
      throw new IllegalArgumentException("Invalid value of operation timeout on the primary");
    }
    this.tableName = tableName;
    this.cConnection = cConnection;
    this.currentScannerCallable = baseCallable;
    this.pool = pool;
    this.timeBeforeReplicas = timeBeforeReplicas;
    this.scan = scan;
    this.retries = retries;
    this.readRpcTimeout = readRpcTimeout;
    this.scannerTimeout = scannerTimeout;
    this.useScannerTimeoutForNextCalls = useScannerTimeoutForNextCalls;
    this.conf = conf;
    this.caller = caller;
  }

  /**
   * Flags the current scanner callable for closing by delegating to its {@code setClose()}. Logs
   * a warning and does nothing when there is no current callable.
   */
  public void setClose() {
    // Read the volatile field once so the null check and the call act on the same reference.
    final ScannerCallable active = currentScannerCallable;
    if (active == null) {
      LOG.warn("Calling close on ScannerCallable reference that is already null, "
        + "which shouldn't happen.");
      return;
    }
    active.setClose();
  }

  /**
   * Forwards the renew flag to the current scanner callable.
   * @param val value handed to the current callable's {@code setRenew}
   */
  public void setRenew(boolean val) {
    final ScannerCallable active = currentScannerCallable;
    active.setRenew(val);
  }

  /**
   * Forwards the caching value to the current scanner callable.
   * @param caching value handed to the current callable's {@code setCaching}
   */
  public void setCaching(int caching) {
    final ScannerCallable active = currentScannerCallable;
    active.setCaching(caching);
  }

  /** Returns the caching value reported by the current scanner callable. */
  public int getCaching() {
    final ScannerCallable active = currentScannerCallable;
    return active.getCaching();
  }

  /** Returns the region info reported by the current scanner callable. */
  public HRegionInfo getHRegionInfo() {
    final ScannerCallable active = currentScannerCallable;
    return active.getHRegionInfo();
  }

  /** Returns the current scanner callable's {@code moreResultsInRegion()} answer. */
  public MoreResults moreResultsInRegion() {
    final ScannerCallable active = currentScannerCallable;
    return active.moreResultsInRegion();
  }

  /** Returns the current scanner callable's {@code moreResultsForScan()} answer. */
  public MoreResults moreResultsForScan() {
    final ScannerCallable active = currentScannerCallable;
    return active.moreResultsForScan();
  }

  /**
   * Runs one scan RPC, falling back to the other replicas when the current one is slow or fails.
   * <p>
   * If the current callable has been flagged as closed, only the close RPC is sent to that exact
   * scanner and the current callable is cleared. Otherwise the call is submitted to the current
   * replica; if it does not answer within {@code timeBeforeReplicas} (or fails) and the scan is
   * neither STRONG nor pinned to a replica id, calls are submitted to all other replicas and the
   * first successful answer wins.
   * @param timeout on the close path, passed to the current callable's {@code call}; otherwise
   *                the time in milliseconds to wait for the first successfully completed replica
   *                call
   * @return the results of the winning call; {@code null} when there is no current callable or
   *         the completed call produced no pair
   * @throws IOException            if no replica answers within {@code timeout}, or when the
   *                                failure of the replica call(s) is rethrown
   * @throws InterruptedIOException if waiting for a result is interrupted or cancelled
   */
  @Override
  public Result[] call(int timeout) throws IOException {
    // Read the volatile field once so the null/closed checks and the close RPC act on the same
    // reference.
    final ScannerCallable active = currentScannerCallable;
    if (active == null) {
      LOG.warn("Another call received, but our ScannerCallable is already null. "
        + "This shouldn't happen, but there's not much to do, so logging and returning null.");
      return null;
    }
    // If the active replica callable was closed somewhere, invoke the RPC to
    // really close it. In the case of regular scanners, this applies. We make couple
    // of RPCs to a RegionServer, and when that region is exhausted, we set
    // the closed flag. Then an RPC is required to actually close the scanner.
    if (active.closed) {
      // For closing we target that exact scanner (and not do replica fallback like in
      // the case of normal reads)
      if (LOG.isTraceEnabled()) {
        LOG.trace("Closing scanner id=" + active.scannerId);
      }
      Result[] r = active.call(timeout);
      currentScannerCallable = null;
      return r;
    }
    // We need to do the following:
    // 1. When a scan goes out to a certain replica (default or not), we need to
    // continue to hit that until there is a failure. So store the last successfully invoked
    // replica
    // 2. We should close the "losing" scanners (scanners other than the ones we hear back
    // from first)
    //
    // Since RegionReplication is a table attribute, it wont change as long as table is enabled,
    // it just needs to be set once.

    if (regionReplication <= 0) {
      RegionLocations rl = null;
      try {
        rl = RpcRetryingCallerWithReadReplicas.getRegionLocations(true,
          RegionReplicaUtil.DEFAULT_REPLICA_ID, cConnection, tableName, active.getRow());
      } catch (RetriesExhaustedException | DoNotRetryIOException e) {
        // We cannot get the primary replica region location, it is possible that the region server
        // hosting meta table is down, it needs to proceed to try cached replicas directly.
        if (cConnection instanceof ConnectionImplementation) {
          rl = ((ConnectionImplementation) cConnection).getCachedLocation(tableName,
            active.getRow());
          if (rl == null) {
            throw e;
          }
        } else {
          // For completeness
          throw e;
        }
      }
      regionReplication = rl.size();
    }
    // allocate a bounded-completion pool of some multiple of number of replicas.
    // We want to accommodate some RPCs for redundant replica scans (but are still in progress)
    final ConnectionConfiguration connectionConfig = cConnection != null
      ? cConnection.getConnectionConfiguration()
      : new ConnectionConfiguration(ScannerCallableWithReplicas.this.conf);
    ResultBoundedCompletionService<Pair<Result[], ScannerCallable>> cs =
      new ResultBoundedCompletionService<>(
        RpcRetryingCallerFactory.instantiate(ScannerCallableWithReplicas.this.conf,
          connectionConfig, cConnection == null ? null : cConnection.getConnectionMetrics()),
        pool, regionReplication * 5);

    AtomicBoolean done = new AtomicBoolean(false);
    // make sure we use the same rpcTimeout for current and other replicas
    int rpcTimeoutForCall = getRpcTimeout();

    replicaSwitched.set(false);
    // submit call for the primary replica or user specified replica
    addCallsForCurrentReplica(cs, rpcTimeoutForCall);
    int startIndex = 0;

    try {
      // wait for the timeout to see whether the primary responds back
      Future<Pair<Result[], ScannerCallable>> f =
        cs.poll(timeBeforeReplicas, TimeUnit.MICROSECONDS); // Yes, microseconds
      if (f != null) {
        // After poll, if f is not null, there must be a completed task
        Pair<Result[], ScannerCallable> r = f.get();
        if (r != null && r.getSecond() != null) {
          updateCurrentlyServingReplica(r.getSecond(), r.getFirst(), done, pool);
        }
        return r == null ? null : r.getFirst(); // great we got a response
      }
    } catch (ExecutionException e) {
      // We ignore the ExecutionException and continue with the replicas
      if (LOG.isDebugEnabled()) {
        LOG.debug("Scan with primary region returns " + e.getCause());
      }

      // If rl's size is 1 or scan's consistency is strong, or scan is over specific replica,
      // it needs to throw out the exception from the primary replica
      if (
        regionReplication == 1 || scan.getConsistency() == Consistency.STRONG
          || scan.getReplicaId() >= 0
      ) {
        // Rethrow the first exception
        RpcRetryingCallerWithReadReplicas.throwEnrichedException(e, retries);
      }
      startIndex = 1;
    } catch (CancellationException | InterruptedException e) {
      // Keep the original exception as the cause so the stack trace is not lost.
      throw (InterruptedIOException) new InterruptedIOException(e.getMessage()).initCause(e);
    }

    // submit call for the all of the secondaries at once
    int endIndex = regionReplication;
    if (scan.getConsistency() == Consistency.STRONG || scan.getReplicaId() >= 0) {
      // When scan's consistency is strong or scan is over specific replica region, do not send to
      // the secondaries
      endIndex = 1;
    } else {
      // TODO: this may be an overkill for large region replication
      addCallsForOtherReplicas(cs, 0, regionReplication - 1, rpcTimeoutForCall);
    }

    try {
      Future<Pair<Result[], ScannerCallable>> f = cs.pollForFirstSuccessfullyCompletedTask(timeout,
        TimeUnit.MILLISECONDS, startIndex, endIndex);

      if (f == null) {
        throw new IOException("Failed to get result within timeout, timeout=" + timeout + "ms");
      }
      Pair<Result[], ScannerCallable> r = f.get();

      if (r != null && r.getSecond() != null) {
        updateCurrentlyServingReplica(r.getSecond(), r.getFirst(), done, pool);
      }
      return r == null ? null : r.getFirst(); // great we got an answer

    } catch (ExecutionException e) {
      // NOTE(review): presumably always throws, which is why the code below is labelled
      // unreachable - confirm against RpcRetryingCallerWithReadReplicas.
      RpcRetryingCallerWithReadReplicas.throwEnrichedException(e, retries);
    } catch (CancellationException | InterruptedException e) {
      // Keep the original exception as the cause so the stack trace is not lost.
      throw (InterruptedIOException) new InterruptedIOException(e.getMessage()).initCause(e);
    } finally {
      // We get there because we were interrupted or because one or more of the
      // calls succeeded or failed. In all case, we stop all our tasks.
      cs.cancelAll();
    }
    LOG.error("Imposible? Arrive at an unreachable line..."); // unreachable
    throw new IOException("Imposible? Arrive at an unreachable line...");
  }

  /**
   * Records {@code scanner} as the replica now serving this scan and schedules a close for every
   * other outstanding scanner. Only the first invocation per {@code done} flag has any effect.
   * @param scanner the callable whose RPC completed first
   * @param result  the results it returned; the last element, if any, is remembered so replica
   *                scanners can later be positioned after it
   * @param done    guard that is flipped from false to true by the first caller
   * @param pool    executor the asynchronous close calls are submitted to
   */
  @SuppressWarnings("FutureReturnValueIgnored")
  private void updateCurrentlyServingReplica(ScannerCallable scanner, Result[] result,
    AtomicBoolean done, ExecutorService pool) {
    if (!done.compareAndSet(false, true)) {
      return;
    }
    if (currentScannerCallable != scanner) {
      replicaSwitched.set(true);
    }
    currentScannerCallable = scanner;
    // store where to start the replica scanner from if we need to.
    if (result != null && result.length != 0) {
      this.lastResult = result[result.length - 1];
    }
    if (LOG.isTraceEnabled()) {
      LOG.trace("Setting current scanner as id=" + currentScannerCallable.scannerId
        + " associated with replica=" + currentScannerCallable.getHRegionInfo().getReplicaId());
    }
    // close all outstanding replica scanners but the one we heard back from
    outstandingCallables.remove(scanner);
    for (ScannerCallable s : outstandingCallables) {
      if (LOG.isTraceEnabled()) {
        // The label says "replica", so report the replica id rather than the region id.
        LOG.trace("Closing scanner id=" + s.scannerId + ", replica="
          + s.getHRegionInfo().getReplicaId() + " because slow and replica="
          + this.currentScannerCallable.getHRegionInfo().getReplicaId() + " succeeded");
      }
      // Submit the "close" to the pool since this might take time, and we don't
      // want to wait for the "close" to happen yet. The "wait" will happen when
      // the table is closed (when the awaitTermination of the underlying pool is called)
      s.setClose();
      final RetryingRPC r = new RetryingRPC(s);
      pool.submit(new Callable<Void>() {
        @Override
        public Void call() throws Exception {
          r.call(scannerTimeout);
          return null;
        }
      });
    }
    // now clear outstandingCallables since we scheduled a close for all the contained scanners
    outstandingCallables.clear();
  }

  /**
   * Reports whether the scan moved to a different replica callable; the upper layer
   * {@link ClientScanner} needs to know when a scanner switches in the middle of scanning (for
   * example because the 'next' call failed).
   * @return the current value of the {@code replicaSwitched} flag
   */
  public boolean switchedToADifferentReplica() {
    final boolean switched = replicaSwitched.get();
    return switched;
  }

  /**
   * Returns true when the most recent RPC response indicated that the response was a heartbeat
   * message. Heartbeat messages are sent back from the server when the processing of the scan
   * request exceeds a certain time threshold. Heartbeats allow the server to avoid timeouts during
   * long running scan operations.
   * @return {@code false} when there is no current scanner callable
   */
  public boolean isHeartbeatMessage() {
    // Read the volatile field once so it cannot become null between the check and the call.
    final ScannerCallable active = currentScannerCallable;
    return active != null && active.isHeartbeatMessage();
  }

  /**
   * Returns the cursor reported by the current scanner callable.
   * @return the callable's cursor, or {@code null} when there is no current scanner callable
   */
  public Cursor getCursor() {
    // Read the volatile field once so it cannot become null between the check and the call.
    final ScannerCallable active = currentScannerCallable;
    return active != null ? active.getCursor() : null;
  }

  private void addCallsForCurrentReplica(
    ResultBoundedCompletionService> cs, int rpcTimeout) {
    RetryingRPC retryingOnReplica = new RetryingRPC(currentScannerCallable);
    outstandingCallables.add(currentScannerCallable);
    cs.submit(retryingOnReplica, rpcTimeout, scannerTimeout, currentScannerCallable.id);
  }

  /**
   * As we have a call sequence for scan, it is useless to have a different rpc timeout which is
   * less than the scan timeout. If the server does not respond in time(usually this will not happen
   * as we have heartbeat now), we will get an OutOfOrderScannerNextException when resending the
   * next request and the only way to fix this is to close the scanner and open a new one.
   * <p>
   * The legacy behavior of ScannerCallable has been to use readRpcTimeout despite the above. If
   * using legacy behavior, we always use that.
   * <p>
   * If new behavior is enabled, we determine the rpc timeout to use based on whether the scanner is
   * open. If scanner is open, use scannerTimeout otherwise use readRpcTimeout.
   */
  private int getRpcTimeout() {
    if (useScannerTimeoutForNextCalls) {
      return isNextCall() ? scannerTimeout : readRpcTimeout;
    } else {
      return readRpcTimeout;
    }
  }

  /**
   * Returns true when the current callable has a scanner id other than -1 and is neither renewing
   * nor closed.
   */
  private boolean isNextCall() {
    // Read the volatile field once so every condition is evaluated on the same callable.
    final ScannerCallable active = currentScannerCallable;
    return active != null && active.scannerId != -1 && !active.renew && !active.closed;
  }

  /**
   * Submits a call for every replica id in {@code [min, max]} except the one that the current
   * callable already covers.
   * @param cs         completion service the calls are submitted to
   * @param min        first replica id, inclusive
   * @param max        last replica id, inclusive
   * @param rpcTimeout rpc timeout handed to the completion service for each call
   */
  private void addCallsForOtherReplicas(
    ResultBoundedCompletionService<Pair<Result[], ScannerCallable>> cs, int min, int max,
    int rpcTimeout) {
    for (int id = min; id <= max; id++) {
      if (currentScannerCallable.id == id) {
        continue; // this was already scheduled earlier
      }
      ScannerCallable s = currentScannerCallable.getScannerCallableForReplica(id);
      setStartRowForReplicaCallable(s);
      outstandingCallables.add(s);

      RetryingRPC retryingOnReplica = new RetryingRPC(s);
      cs.submit(retryingOnReplica, rpcTimeout, scannerTimeout, id);
    }
  }

  /**
   * Set the start row for the replica callable based on the state of the last result received.
   * @param callable The callable to set the start row on
   */
  private void setStartRowForReplicaCallable(ScannerCallable callable) {
    if (this.lastResult == null || callable == null) {
      return;
    }
    // 1. The last result was a partial result which means we have not received all of the cells
    // for this row. Thus, use the last result's row as the start row. If a replica switch
    // occurs, the scanner will ensure that any accumulated partial results are cleared,
    // and the scan can resume from this row.
    // 2. The last result was not a partial result which means it contained all of the cells for
    // that row (we no longer need any information from it). Set the start row to the next
    // closest row that could be seen.
    callable.getScan().withStartRow(this.lastResult.getRow(),
      this.lastResult.mayHaveMoreCellsInRow());
  }

  /** Returns true once any {@link RetryingRPC} of this instance has been cancelled. */
  boolean isAnyRPCcancelled() {
    return someRPCcancelled;
  }

  /**
   * Wraps a single {@link ScannerCallable} so its result is returned together with the callable
   * that produced it, and so the in-flight RPC can be cancelled.
   */
  class RetryingRPC implements RetryingCallable<Pair<Result[], ScannerCallable>>, Cancellable {
    final ScannerCallable callable;
    RpcRetryingCaller<Result[]> caller;
    private volatile boolean cancelled = false;

    RetryingRPC(ScannerCallable callable) {
      this.callable = callable;
      // For the Consistency.STRONG (default case), we reuse the caller
      // to keep compatibility with what is done in the past
      // For the Consistency.TIMELINE case, we can't reuse the caller
      // since we could be making parallel RPCs (caller.callWithRetries is synchronized
      // and we can't invoke it multiple times at the same time)
      this.caller = ScannerCallableWithReplicas.this.caller;
      if (scan.getConsistency() == Consistency.TIMELINE) {
        final ConnectionConfiguration connectionConfig = cConnection != null
          ? cConnection.getConnectionConfiguration()
          : new ConnectionConfiguration(ScannerCallableWithReplicas.this.conf);
        this.caller = RpcRetryingCallerFactory
          .instantiate(ScannerCallableWithReplicas.this.conf, connectionConfig,
            cConnection == null ? null : cConnection.getConnectionMetrics())
          .<Result[]> newCaller();
      }
    }

    /**
     * Invokes the wrapped callable once, without retries.
     * @return the results paired with the wrapped callable, or {@code null} if already cancelled
     */
    @Override
    public Pair<Result[], ScannerCallable> call(int callTimeout) throws IOException {
      // since the retries is done within the ResultBoundedCompletionService,
      // we don't invoke callWithRetries here
      if (cancelled) {
        return null;
      }
      Result[] res = this.caller.callWithoutRetries(this.callable, callTimeout);
      return new Pair<>(res, this.callable);
    }

    @Override
    public void prepare(boolean reload) throws IOException {
      if (cancelled) {
        return;
      }

      // Thread.interrupted() also clears the interrupt flag before the exception is raised.
      if (Thread.interrupted()) {
        throw new InterruptedIOException();
      }

      callable.prepare(reload);
    }

    @Override
    public void throwable(Throwable t, boolean retrying) {
      callable.throwable(t, retrying);
    }

    @Override
    public String getExceptionMessageAdditionalDetail() {
      return callable.getExceptionMessageAdditionalDetail();
    }

    @Override
    public long sleep(long pause, int tries) {
      return callable.sleep(pause, tries);
    }

    /**
     * Marks this RPC as cancelled, cancels the caller and, if the callable has an rpc controller,
     * starts cancelling the in-flight call.
     */
    @Override
    public void cancel() {
      cancelled = true;
      caller.cancel();
      if (callable.getRpcController() != null) {
        callable.getRpcController().startCancel();
      }
      someRPCcancelled = true;
    }

    @Override
    public boolean isCancelled() {
      return cancelled;
    }
  }

  /** Intentionally empty; the wrapped callables are prepared through {@link RetryingRPC}. */
  @Override
  public void prepare(boolean reload) throws IOException {
  }

  @Override
  public void throwable(Throwable t, boolean retrying) {
    currentScannerCallable.throwable(t, retrying);
  }

  @Override
  public String getExceptionMessageAdditionalDetail() {
    return currentScannerCallable.getExceptionMessageAdditionalDetail();
  }

  @Override
  public long sleep(long pause, int tries) {
    return currentScannerCallable.sleep(pause, tries);
  }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy