org.apache.kudu.client.AsyncKuduScanner Maven / Gradle / Ivy
Show all versions of camel-quarkus-kudu-client
/*
* Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the StumbleUpon nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package org.apache.kudu.client;
import static com.google.common.base.Preconditions.checkArgument;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.kudu.tserver.Tserver.NewScanRequestPB;
import static org.apache.kudu.tserver.Tserver.ResourceMetricsPB;
import static org.apache.kudu.tserver.Tserver.ScanRequestPB;
import static org.apache.kudu.tserver.Tserver.ScanResponsePB;
import static org.apache.kudu.tserver.Tserver.TabletServerErrorPB;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import com.google.common.collect.ImmutableList;
import com.google.protobuf.Message;
import com.google.protobuf.UnsafeByteOperations;
import com.stumbleupon.async.Callback;
import com.stumbleupon.async.Deferred;
import org.apache.yetus.audience.InterfaceAudience;
import org.apache.yetus.audience.InterfaceStability;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.kudu.ColumnSchema;
import org.apache.kudu.Common;
import org.apache.kudu.Schema;
import org.apache.kudu.Type;
import org.apache.kudu.security.Token;
import org.apache.kudu.tserver.Tserver;
import org.apache.kudu.tserver.Tserver.ScannerKeepAliveRequestPB;
import org.apache.kudu.tserver.Tserver.ScannerKeepAliveResponsePB;
import org.apache.kudu.util.Pair;
/**
* Creates a scanner to read data from Kudu.
*
* This class is not synchronized as it's expected to be
* used from a single thread at a time. It's rarely (if ever?) useful to
* scan concurrently from a shared scanner using multiple threads. If you
* want to optimize large table scans using extra parallelism, create a few
* scanners through the {@link KuduScanToken} API. Or use MapReduce.
*
* There's no method in this class to explicitly open the scanner. It will open
* itself automatically when you start scanning by calling {@link #nextRows()}.
* Also, the scanner will automatically call {@link #close} when it reaches the
* end key. If, however, you would like to stop scanning before reaching the
* end key, you must call {@link #close} before disposing of the scanner.
* Note that it's always safe to call {@link #close} on a scanner.
*
* A {@code AsyncKuduScanner} is not re-usable. Should you want to scan the same rows
* or the same table again, you must create a new one.
*
*
* <h1>A note on passing {@code byte} arrays in argument</h1>
* None of the methods that receive a {@code byte[]} in argument will copy it.
* For more info, please refer to the documentation of {@link KuduRpc}.
* <h1>A note on passing {@code String}s in argument</h1>
* All strings are assumed to use the platform's default charset.
*/
@InterfaceAudience.Public
@InterfaceStability.Unstable
public final class AsyncKuduScanner {
// SLF4J logger shared by all instances of this class.
private static final Logger LOG = LoggerFactory.getLogger(AsyncKuduScanner.class);
/**
 * The read modes a scanner can be configured with. Each constant wraps the
 * matching protobuf {@code Common.ReadMode} value.
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public enum ReadMode {
  /**
   * With {@code READ_LATEST} the server always returns the writes that were
   * committed at the time the request was received. This type of read does not
   * return a snapshot timestamp and is not repeatable.
   *
   * In ACID terms this corresponds to Isolation mode: "Read Committed"
   *
   * This is the default mode.
   */
  READ_LATEST(Common.ReadMode.READ_LATEST),

  /**
   * With {@code READ_AT_SNAPSHOT} the server attempts to perform the read at the
   * provided timestamp. If no timestamp is provided, the server takes the current
   * time as the snapshot timestamp. Reads in this mode are repeatable, i.e. all
   * future reads at the same timestamp yield the same data. The price is waiting
   * for in-flight transactions whose timestamp is lower than the snapshot's
   * timestamp to complete, so a latency penalty may be incurred.
   *
   * In ACID terms this, by itself, corresponds to Isolation mode "Repeatable
   * Read". If all writes to the scanned tablet are made externally consistent,
   * then this corresponds to Isolation mode "Strict-Serializable".
   *
   * Note: there are currently "holes", which happen in rare edge conditions, by
   * which writes are sometimes not externally consistent even when action was
   * taken to make them so. In these cases Isolation may degenerate to mode
   * "Read Committed". See KUDU-430.
   */
  READ_AT_SNAPSHOT(Common.ReadMode.READ_AT_SNAPSHOT),

  /**
   * With {@code READ_YOUR_WRITES} the client performs a read that follows all
   * previously known writes and reads from this client. Specifically this mode:
   * (1) ensures read-your-writes and read-your-reads session guarantees,
   * (2) minimizes latency caused by waiting for outstanding write
   * transactions to complete.
   *
   * Reads in this mode are not repeatable: two READ_YOUR_WRITES reads, even if
   * they provide the same propagated timestamp bound, can execute at different
   * timestamps and thus may return different results.
   */
  READ_YOUR_WRITES(Common.ReadMode.READ_YOUR_WRITES);

  /** Protobuf counterpart of this read mode. */
  private final Common.ReadMode pbVersion;

  ReadMode(Common.ReadMode pbMode) {
    pbVersion = pbMode;
  }

  /**
   * @return the protobuf enum value wrapped by this constant
   */
  @InterfaceAudience.Private
  public Common.ReadMode pbVersion() {
    return pbVersion;
  }
}
/**
 * The row data layout the scanner expects in its result set.
 *
 * The server may or may not support the requested layout. The actual layout is
 * hidden internally behind the {@link RowResult} and {@link RowResultIterator}
 * interfaces, so it is transparent to application code.
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public enum RowDataFormat {
  /**
   * The server is expected to return scanner result data in row-wise layout.
   * This is currently the default layout.
   */
  ROWWISE,

  /**
   * The server is expected to return scanner result data in columnar layout.
   * This layout is more efficient in processing and bandwidth on both the server
   * and the client side. It requires server support (kudu-1.12.0 and later); a
   * server without that support still returns data in row-wise layout.
   */
  COLUMNAR
}
// This is private because it is not safe to use this column name as it may be
// different in the case of collisions. Instead the `IS_DELETED` column should
// be looked up by type.
static final String DEFAULT_IS_DELETED_COL_NAME = "is_deleted";
//////////////////////////
// Initial configurations.
//////////////////////////
private final AsyncKuduClient client;
private final KuduTable table;
private final Schema schema;
private final PartitionPruner pruner;
/**
* Map of column name to predicate.
*/
private final Map predicates;
/**
* Maximum number of bytes returned by the scanner, on each batch.
*/
private final int batchSizeBytes;
/**
* The maximum number of rows to scan.
*/
private final long limit;
/**
* Set in the builder. If it's not set by the user, it will default to EMPTY_ARRAY.
* It is then reset to the new start primary key of each tablet we open a scanner on as the scan
* moves from one tablet to the next.
*/
private final byte[] startPrimaryKey;
/**
* Set in the builder. If it's not set by the user, it will default to EMPTY_ARRAY.
* It's never modified after that.
*/
private final byte[] endPrimaryKey;
private byte[] lastPrimaryKey;
private final boolean prefetching;
private final boolean cacheBlocks;
private final ReadMode readMode;
private final Common.OrderMode orderMode;
private final boolean isFaultTolerant;
private final long startTimestamp;
private long htTimestamp;
private long lowerBoundPropagationTimestamp = AsyncKuduClient.NO_TIMESTAMP;
private final ReplicaSelection replicaSelection;
private final long keepAlivePeriodMs;
/////////////////////
// Runtime variables.
/////////////////////
private boolean reuseRowResult = false;
private final ResourceMetrics resourceMetrics = new ResourceMetrics();
private boolean closed = false;
private boolean canRequestMore = true;
private long numRowsReturned = 0;
private RowDataFormat rowDataFormat = RowDataFormat.ROWWISE;
/**
* The tabletSlice currently being scanned.
* If null, we haven't started scanning.
* If == DONE, then we're done scanning.
* Otherwise it contains a proper tabletSlice name, and we're currently scanning.
*/
private RemoteTablet tablet;
/**
* This is the scanner ID we got from the TabletServer.
* It's generated randomly so any value is possible.
*/
private byte[] scannerId;
/**
* The sequence ID of this call. The sequence ID should start at 0
* with the request for a new scanner, and after each successful request,
* the client should increment it by 1. When retrying a request, the client
* should _not_ increment this value. If the server detects that the client
* missed a chunk of rows from the middle of a scan, it will respond with an
* error.
*/
private int sequenceId;
final long scanRequestTimeout;
/**
* The prefetching result is cached in memory. This atomic reference is used to avoid
* two concurrent prefetchings occur and the latest one overrides the previous one.
*/
private AtomicReference> cachedPrefetcherDeferred =
new AtomicReference<>();
/**
 * Decides whether an error reported by the tserver is harmless.
 *
 * With prefetching enabled there can be up to two concurrent ScanRequests in
 * flight. When the scan data runs out, only one of them gets the final
 * hasMore=false answer, and the other one fails with "scanner not found (it may
 * have expired)". The same happens to a KeepAliveRequest.
 *
 * @param errorCode error code returned from tserver
 * @return true if this can be ignored, i.e. the code is SCANNER_EXPIRED while
 *         this scanner is prefetching and already closed
 */
boolean canBeIgnored(TabletServerErrorPB.Code errorCode) {
  if (errorCode != TabletServerErrorPB.Code.SCANNER_EXPIRED) {
    return false;
  }
  return prefetching && closed;
}
/**
 * Validates the scan configuration and builds the projection schema.
 *
 * The projection is taken from {@code projectedNames} when it is non-null,
 * otherwise from {@code projectedIndexes} when that is non-null, otherwise all
 * table columns are used. A diff scan (a start timestamp other than
 * NO_TIMESTAMP) additionally gets the virtual IS_DELETED column appended.
 *
 * If the pruner has no partition key ranges left, the scanner is marked closed
 * right away so that no tablet server is contacted.
 *
 * @throws IllegalArgumentException if the batch size is negative, the limit is
 *     not strictly positive, a snapshot timestamp is negative or given without
 *     READ_AT_SNAPSHOT, a diff scan lacks a valid end timestamp or starts after
 *     it, or fault tolerance is requested without READ_AT_SNAPSHOT
 */
AsyncKuduScanner(AsyncKuduClient client, KuduTable table, List<String> projectedNames,
                 List<Integer> projectedIndexes, ReadMode readMode, boolean isFaultTolerant,
                 long scanRequestTimeout,
                 Map<String, KuduPredicate> predicates, long limit,
                 boolean cacheBlocks, boolean prefetching,
                 byte[] startPrimaryKey, byte[] endPrimaryKey,
                 long startTimestamp, long htTimestamp,
                 int batchSizeBytes, PartitionPruner pruner,
                 ReplicaSelection replicaSelection, long keepAlivePeriodMs) {
  checkArgument(batchSizeBytes >= 0, "Need non-negative number of bytes, " +
      "got %s", batchSizeBytes);
  checkArgument(limit > 0, "Need a strictly positive number for the limit, " +
      "got %s", limit);
  if (htTimestamp != AsyncKuduClient.NO_TIMESTAMP) {
    checkArgument(htTimestamp >= 0, "Need non-negative number for the scan " +
        "timestamp, got %s", htTimestamp);
    checkArgument(readMode == ReadMode.READ_AT_SNAPSHOT, "When specifying a " +
        "HybridClock timestamp, the read mode needs to be set to READ_AT_SNAPSHOT");
  }
  if (startTimestamp != AsyncKuduClient.NO_TIMESTAMP) {
    // A diff scan needs a real end timestamp that is not before the start.
    checkArgument(htTimestamp >= 0, "Must have both start and end timestamps " +
        "for a diff scan");
    checkArgument(startTimestamp <= htTimestamp, "Start timestamp must be less " +
        "than or equal to end timestamp");
  }

  this.isFaultTolerant = isFaultTolerant;
  if (this.isFaultTolerant) {
    checkArgument(readMode == ReadMode.READ_AT_SNAPSHOT, "Use of fault tolerance scanner " +
        "requires the read mode to be set to READ_AT_SNAPSHOT");
    this.orderMode = Common.OrderMode.ORDERED;
  } else {
    this.orderMode = Common.OrderMode.UNORDERED;
  }

  this.client = client;
  this.table = table;
  this.pruner = pruner;
  this.readMode = readMode;
  this.scanRequestTimeout = scanRequestTimeout;
  this.predicates = predicates;
  this.limit = limit;
  this.cacheBlocks = cacheBlocks;
  this.prefetching = prefetching;
  this.startPrimaryKey = startPrimaryKey;
  this.endPrimaryKey = endPrimaryKey;
  this.startTimestamp = startTimestamp;
  this.htTimestamp = htTimestamp;
  this.batchSizeBytes = batchSizeBytes;
  this.lastPrimaryKey = AsyncKuduClient.EMPTY_ARRAY;

  // Map the column names to actual columns in the table schema.
  // If the user set this to 'null', we scan all columns.
  final Schema tableSchema = table.getSchema();
  List<ColumnSchema> columns = new ArrayList<>();
  if (projectedNames != null) {
    for (String columnName : projectedNames) {
      ColumnSchema originalColumn = tableSchema.getColumn(columnName);
      columns.add(getStrippedColumnSchema(originalColumn));
    }
  } else if (projectedIndexes != null) {
    for (Integer columnIndex : projectedIndexes) {
      ColumnSchema originalColumn = tableSchema.getColumnByIndex(columnIndex);
      columns.add(getStrippedColumnSchema(originalColumn));
    }
  } else {
    // By default, a scanner is created with all columns including auto-incrementing
    // column if projected columns are not specified.
    columns.addAll(tableSchema.getColumns());
  }
  // This is a diff scan so add the IS_DELETED column.
  if (startTimestamp != AsyncKuduClient.NO_TIMESTAMP) {
    columns.add(generateIsDeletedColumn(tableSchema));
  }
  this.schema = new Schema(columns);

  // If the partition pruner has pruned all partitions, then the scan can be
  // short circuited without contacting any tablet servers.
  if (!pruner.hasMorePartitionKeyRanges()) {
    LOG.debug("Short circuiting scan");
    this.canRequestMore = false;
    this.closed = true;
  }

  this.replicaSelection = replicaSelection;
  this.keepAlivePeriodMs = keepAlivePeriodMs;

  // For READ_YOUR_WRITES scan mode, get the latest observed timestamp
  // and store it. Always use this one as the propagated timestamp for
  // the duration of the scan to avoid unnecessary wait.
  if (readMode == ReadMode.READ_YOUR_WRITES) {
    this.lowerBoundPropagationTimestamp = this.client.getLastPropagatedTimestamp();
  }
}
/**
 * Builds the ColumnSchema of the virtual IS_DELETED column.
 *
 * The name starts as {@link #DEFAULT_IS_DELETED_COL_NAME}; an underscore is
 * appended for as long as the table schema already has a column of that name,
 * so the result never collides with an existing column.
 *
 * @param schema the table schema
 * @return a non-nullable, non-key BOOL ColumnSchema for the virtual IS_DELETED
 *         column, defaulting to false
 */
private static ColumnSchema generateIsDeletedColumn(Schema schema) {
  String candidate = DEFAULT_IS_DELETED_COL_NAME;
  // Keep extending the name until no existing column uses it.
  while (schema.hasColumn(candidate)) {
    candidate = candidate + "_";
  }
  return new ColumnSchema.ColumnSchemaBuilder(candidate, Type.BOOL)
      .wireType(Common.DataType.IS_DELETED)
      .defaultValue(false)
      .nullable(false)
      .key(false)
      .build();
}
/**
 * Copies the given ColumnSchema with its key flag cleared.
 * This allows out of order key columns in projections.
 *
 * TODO: Remove the need for this by handling server side.
 *
 * @param columnToClone the column schema to copy
 * @return a new column schema with isKey set to false
 */
private static ColumnSchema getStrippedColumnSchema(ColumnSchema columnToClone) {
  final ColumnSchema.ColumnSchemaBuilder copy =
      new ColumnSchema.ColumnSchemaBuilder(columnToClone);
  return copy.key(false).build();
}
/**
 * Returns the row limit this scanner was configured with.
 * @return the maximum number of rows that can be returned, as a long
 */
public long getLimit() {
  return limit;
}
/**
 * Tells whether there may be data left, either to be requested over RPC or
 * already sitting in the cached prefetch result.
 * @return true if there might be more data to scan, else false
 */
public boolean hasMoreRows() {
  if (canRequestMore) {
    return true;
  }
  return cachedPrefetcherDeferred.get() != null;
}
/**
 * Tells whether this scanner was configured to cache data blocks.
 * @return true if this scanner will cache blocks, else false
 */
public boolean getCacheBlocks() {
  return cacheBlocks;
}
/**
 * Returns the per-batch byte cap of this scanner.
 * @return the maximum number of bytes that the scanner can receive at once
 *         from a tablet server (the int configuration value, widened to long)
 */
public long getBatchSizeBytes() {
  return batchSizeBytes;
}
/**
 * Returns the read mode this scanner was configured with.
 * @return the ReadMode of this scanner
 */
public ReadMode getReadMode() {
  return readMode;
}
/**
 * @return the order mode chosen at construction: ORDERED for a fault-tolerant
 *         scanner, UNORDERED otherwise
 */
private Common.OrderMode getOrderMode() {
  return orderMode;
}
/**
 * Returns the timeout applied to the scan requests of this scanner.
 * @return the scan request timeout, in milliseconds
 */
public long getScanRequestTimeout() {
  return this.scanRequestTimeout;
}
/**
 * Returns the projection schema of this scanner. When no columns were
 * projected at creation time, the schema is built from all of the table's
 * columns.
 * @return the projection schema for this scanner
 */
public Schema getProjectionSchema() {
  return schema;
}
/**
 * Returns the keep-alive period this scanner was configured with.
 * @return the keep-alive period passed to the constructor
 */
public long getKeepAlivePeriodMs() {
  return this.keepAlivePeriodMs;
}
/**
 * @return the start timestamp given at construction; any value other than
 *         NO_TIMESTAMP means this is a diff scan
 */
long getStartSnapshotTimestamp() {
  return startTimestamp;
}
/**
 * Returns the {@code ResourceMetrics} of this scanner. They are updated
 * whenever a response from the server carries resource metrics.
 * @return the resource metrics for this scanner
 */
public ResourceMetrics getResourceMetrics() {
  return resourceMetrics;
}
/**
 * @return the current snapshot timestamp of the scan, which may still be
 *         NO_TIMESTAMP if none was configured or received yet
 */
long getSnapshotTimestamp() {
  return htTimestamp;
}
/**
 * Controls whether the {@link RowResultIterator} hands out one and the same
 * {@link RowResult} object on every call to {@link RowResultIterator#next()}.
 * Reusing the object can be a useful optimization to reduce the number of
 * objects created.
 *
 * Note: DO NOT enable this if RowResults are stored between calls to next().
 * With reuse enabled, each call to next() mutates the RowResult returned
 * earlier, so RowResults kept in a collection and read later would all show
 * the data of the last RowResult returned.
 *
 * @param reuseRowResult true to reuse the RowResult object, false otherwise
 */
public void setReuseRowResult(boolean reuseRowResult) {
  this.reuseRowResult = reuseRowResult;
}
/**
 * Optionally sets the row data format this scanner expects from the server.
 * The scanner uses {@link RowDataFormat#ROWWISE} unless told otherwise.
 *
 * @param rowDataFormat Row data format to be expected.
 */
public void setRowDataFormat(RowDataFormat rowDataFormat) {
  this.rowDataFormat = rowDataFormat;
}
/**
* Scans a number of rows.
*
* Once this method returns {@code null} once (which indicates that this
* {@code Scanner} is done scanning), calling it again leads to an undefined
* behavior.
* @return a deferred list of rows.
*/
public Deferred nextRows() {
if (closed) { // We're already done scanning.
if (prefetching && cachedPrefetcherDeferred.get() != null) {
// return the cached result and reset the cache.
return cachedPrefetcherDeferred.getAndUpdate((v) -> null);
}
return Deferred.fromResult(null);
} else if (tablet == null) {
Callback, AsyncKuduScanner.Response> cb =
new Callback, Response>() {
@Override
public Deferred call(Response resp) throws Exception {
if (htTimestamp == AsyncKuduClient.NO_TIMESTAMP &&
resp.scanTimestamp != AsyncKuduClient.NO_TIMESTAMP) {
// If the server-assigned timestamp is present in the tablet
// server's response, store it in the scanner. The stored value
// is used for read operations in READ_AT_SNAPSHOT mode at
// other tablet servers in the context of the same scan.
htTimestamp = resp.scanTimestamp;
}
long lastPropagatedTimestamp = AsyncKuduClient.NO_TIMESTAMP;
if (readMode == ReadMode.READ_YOUR_WRITES &&
resp.scanTimestamp != AsyncKuduClient.NO_TIMESTAMP) {
// For READ_YOUR_WRITES mode, update the latest propagated timestamp
// with the chosen snapshot timestamp sent back from the server, to
// avoid unnecessarily wait for subsequent reads. Since as long as
// the chosen snapshot timestamp of the next read is greater than
// the previous one, the scan does not violate READ_YOUR_WRITES
// session guarantees.
lastPropagatedTimestamp = resp.scanTimestamp;
} else if (resp.propagatedTimestamp != AsyncKuduClient.NO_TIMESTAMP) {
// Otherwise we just use the propagated timestamp returned from
// the server as the latest propagated timestamp.
lastPropagatedTimestamp = resp.propagatedTimestamp;
}
if (lastPropagatedTimestamp != AsyncKuduClient.NO_TIMESTAMP) {
client.updateLastPropagatedTimestamp(lastPropagatedTimestamp);
}
if (isFaultTolerant && resp.lastPrimaryKey != null) {
lastPrimaryKey = resp.lastPrimaryKey;
}
numRowsReturned += resp.data.getNumRows();
if (resp.resourceMetricsPb != null) {
resourceMetrics.update(resp.resourceMetricsPb);
}
if (!resp.more || resp.scannerId == null) {
scanFinished();
return Deferred.fromResult(resp.data); // there might be data to return
}
scannerId = resp.scannerId;
sequenceId++;
canRequestMore = resp.more;
if (LOG.isDebugEnabled()) {
LOG.debug("Scanner {} opened on {}", Bytes.pretty(scannerId), tablet);
}
return Deferred.fromResult(resp.data);
}
@Override
public String toString() {
return "scanner opened";
}
};
Callback, Exception> eb =
new Callback, Exception>() {
@Override
public Deferred call(Exception e) throws Exception {
invalidate();
if (e instanceof NonCoveredRangeException) {
NonCoveredRangeException ncre = (NonCoveredRangeException) e;
pruner.removePartitionKeyRange(ncre.getNonCoveredRangeEnd());
// Stop scanning if the non-covered range is past the end partition key.
if (!pruner.hasMorePartitionKeyRanges()) {
canRequestMore = false;
closed = true; // the scanner is closed on the other side at this point
return Deferred.fromResult(RowResultIterator.empty());
}
scannerId = null;
sequenceId = 0;
return nextRows();
} else {
LOG.debug("Can not open scanner", e);
// Don't let the scanner think it's opened on this tablet.
return Deferred.fromError(e); // Let the error propagate.
}
}
@Override
public String toString() {
return "open scanner errback";
}
};
// We need to open the scanner first.
return client.sendRpcToTablet(getOpenRequest()).addCallbackDeferring(cb).addErrback(eb);
} else if (prefetching && cachedPrefetcherDeferred.get() != null) {
Deferred prefetcherDeferred =
cachedPrefetcherDeferred.getAndUpdate((v) -> null);
prefetcherDeferred.chain(new Deferred().addCallback(prefetch));
return prefetcherDeferred;
}
final Deferred d =
client.scanNextRows(this).addCallbacks(gotNextRow, nextRowErrback());
if (prefetching) {
d.chain(new Deferred().addCallback(prefetch));
}
return d;
}
/**
 * Callback that starts fetching the next batch ahead of time.
 *
 * It only sends a request while more data can be requested and no prefetch
 * result is cached yet. The new deferred is stored with compare-and-set, so a
 * concurrently stored prefetch is never overwritten. The callback ignores its
 * argument and always returns {@code null}.
 */
private final Callback<RowResultIterator, RowResultIterator> prefetch =
    new Callback<RowResultIterator, RowResultIterator>() {
      @Override
      public RowResultIterator call(RowResultIterator arg) throws Exception {
        if (canRequestMore && cachedPrefetcherDeferred.get() == null) {
          Deferred<RowResultIterator> prefetcherDeferred =
              client.scanNextRows(AsyncKuduScanner.this)
                  .addCallbacks(gotNextRow, nextRowErrback());
          if (!cachedPrefetcherDeferred.compareAndSet(null, prefetcherDeferred)) {
            LOG.info("Skip one prefetching because two concurrent prefetching scan occurs");
          }
        }
        return null;
      }
    };
/**
 * Shared callback to handle responses of "next" RPCs.
 *
 * It propagates timestamps to the client, updates the row count, the last
 * primary key (fault-tolerant scans only) and the resource metrics, and then
 * either finishes the scan of the current tablet or advances the sequence ID.
 * This returns the {@link RowResultIterator} carried by the response.
 */
private final Callback<RowResultIterator, Response> gotNextRow =
    new Callback<RowResultIterator, Response>() {
      @Override
      public RowResultIterator call(final Response resp) {
        long lastPropagatedTimestamp = AsyncKuduClient.NO_TIMESTAMP;
        if (readMode == ReadMode.READ_YOUR_WRITES &&
            resp.scanTimestamp != AsyncKuduClient.NO_TIMESTAMP) {
          // For READ_YOUR_WRITES mode, update the latest propagated timestamp
          // with the chosen snapshot timestamp sent back from the server, to
          // avoid unnecessarily wait for subsequent reads. Since as long as
          // the chosen snapshot timestamp of the next read is greater than
          // the previous one, the scan does not violate READ_YOUR_WRITES
          // session guarantees.
          lastPropagatedTimestamp = resp.scanTimestamp;
        } else if (resp.propagatedTimestamp != AsyncKuduClient.NO_TIMESTAMP) {
          // Otherwise we just use the propagated timestamp returned from
          // the server as the latest propagated timestamp.
          lastPropagatedTimestamp = resp.propagatedTimestamp;
        }
        if (lastPropagatedTimestamp != AsyncKuduClient.NO_TIMESTAMP) {
          client.updateLastPropagatedTimestamp(lastPropagatedTimestamp);
        }

        numRowsReturned += resp.data.getNumRows();
        if (isFaultTolerant && resp.lastPrimaryKey != null) {
          lastPrimaryKey = resp.lastPrimaryKey;
        }
        if (resp.resourceMetricsPb != null) {
          resourceMetrics.update(resp.resourceMetricsPb);
        }

        if (!resp.more) {  // We're done scanning this tablet.
          scanFinished();
          return resp.data;
        }
        sequenceId++;
        canRequestMore = resp.more;
        return resp.data;
      }

      @Override
      public String toString() {
        return "get nextRows response";
      }
    };
/**
* Creates a new errback to handle errors while trying to get more rows.
*/
private final Callback, Exception> nextRowErrback() {
return new Callback, Exception>() {
@Override
public Deferred call(Exception e) throws Exception {
final RemoteTablet old_tablet = tablet; // Save before invalidate().
invalidate(); // If there was an error, don't assume we're still OK.
// If encountered FaultTolerantScannerExpiredException, it means the
// fault tolerant scanner on the server side expired. Therefore, open
// a new scanner.
if (e instanceof FaultTolerantScannerExpiredException) {
scannerId = null;
sequenceId = 0;
LOG.warn("Scanner expired, creating a new one {}", AsyncKuduScanner.this);
return nextRows();
} else {
LOG.warn("{} pretends to not know {}", old_tablet, AsyncKuduScanner.this, e);
return Deferred.fromError(e); // Let the error propagate.
}
}
@Override
public String toString() {
return "NextRow errback";
}
};
}
/**
 * Records that the current tablet has been fully scanned.
 *
 * The tablet's partition key range is removed from the pruner. When no ranges
 * remain, or the row limit has been reached, the scanner is marked closed.
 * Otherwise the per-tablet state (scanner ID, sequence ID, last primary key,
 * current tablet) is reset for the next tablet.
 */
void scanFinished() {
  final Partition finishedPartition = tablet.getPartition();
  pruner.removePartitionKeyRange(finishedPartition.getPartitionKeyEnd());

  // Stop scanning if we have scanned until or past the end partition key, or
  // if we have fulfilled the limit.
  final boolean rangesExhausted = !pruner.hasMorePartitionKeyRanges();
  if (rangesExhausted || numRowsReturned >= limit) {
    canRequestMore = false;
    closed = true; // the scanner is closed on the other side at this point
    return;
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug("Done scanning tablet {} for partition {} with scanner id {}",
        tablet.getTabletId(), tablet.getPartition(), Bytes.pretty(scannerId));
  }
  // Reset the per-tablet state.
  scannerId = null;
  sequenceId = 0;
  lastPrimaryKey = AsyncKuduClient.EMPTY_ARRAY;
  invalidate();
}
/**
 * Tells whether this scanner is closed.
 * @return true if the scanner has been closed.
 */
public boolean isClosed() {
  return this.closed;
}
/**
 * Closes this scanner (don't forget to call this when you're done with it!).
 *
 * Closing a scanner already closed has no effect. The deferred returned
 * will be called back immediately.
 * @return A deferred object that indicates the completion of the request.
 *     The result can be null, a RowResultIterator if there was data left
 *     in the scanner, or an Exception.
 */
public Deferred<RowResultIterator> close() {
  if (closed) {
    return Deferred.fromResult(null);
  }
  return client.closeScanner(this).addCallback(closedCallback()); // TODO errBack ?
}
/**
 * Creates the callback invoked when the TabletServer closed our scanner.
 *
 * The callback marks the scanner closed, invalidates it, and replaces the
 * scanner ID with a debug marker. It returns the data of the close response,
 * or {@code null} when there is no response.
 */
private Callback<RowResultIterator, Response> closedCallback() {
  return new Callback<RowResultIterator, Response>() {
    @Override
    public RowResultIterator call(Response response) {
      closed = true;
      if (LOG.isDebugEnabled()) {
        LOG.debug("Scanner {} closed on {}", Bytes.pretty(scannerId), tablet);
      }
      invalidate();
      scannerId = "client debug closed".getBytes(UTF_8);  // Make debugging easier.
      return response == null ? null : response.data;
    }

    @Override
    public String toString() {
      return "scanner closed";
    }
  };
}
/**
 * Describes this scanner: table name, current tablet ID (or "null"), scanner
 * ID, scan request timeout, and the start/end/last primary keys. A key is
 * rendered in hex, and left blank after its label when it is empty.
 */
@Override
public String toString() {
  final String tabletId = (tablet != null) ? tablet.getTabletId() : "null";
  final StringBuilder sb = new StringBuilder();
  sb.append("KuduScanner(table=").append(table.getName())
      .append(", tablet=").append(tabletId)
      .append(", scannerId=").append(Bytes.pretty(scannerId))
      .append(", scanRequestTimeout=").append(scanRequestTimeout);

  sb.append(", startPrimaryKey=");
  if (startPrimaryKey.length > 0) {
    sb.append(Bytes.hex(startPrimaryKey));
  }
  sb.append(", endPrimaryKey=");
  if (endPrimaryKey.length > 0) {
    sb.append(Bytes.hex(endPrimaryKey));
  }
  sb.append(", lastPrimaryKey=");
  if (lastPrimaryKey.length > 0) {
    sb.append(Bytes.hex(lastPrimaryKey));
  }
  return sb.append(')').toString();
}
// ---------------------- //
// Package private stuff. //
// ---------------------- //
/**
 * @return the table this scanner was created for
 */
KuduTable table() {
  return this.table;
}
/**
 * Makes this scanner assume it is no longer opened by clearing the current
 * tablet. Call this when a TabletServer goes away mid-scan, or when some other
 * kind of access problem occurs, so that the scanner has to re-locate the
 * TabletServer and re-open itself.
 */
void invalidate() {
  this.tablet = null;
}
/**
 * Returns the tablet currently being scanned, if any.
 * @return the current tablet, or null when no tablet is set
 */
RemoteTablet currentTablet() {
  return this.tablet;
}
/**
 * Returns the replica selection mechanism this scanner was configured with.
 *
 * @return the replica selection mechanism.
 */
ReplicaSelection getReplicaSelection() {
  return this.replicaSelection;
}
/**
 * Builds the RPC that opens this scanner.
 * @return a new scan request in the OPENING state
 * @throws IllegalStateException if scanning already started
 */
KuduRpc getOpenRequest() {
  checkScanningNotStarted();
  final ScanRequest openRpc = new ScanRequest(table, State.OPENING, tablet);
  return openRpc;
}
/**
 * Keeps the current remote scanner alive.
 *
 * The remote scanner on the Tablet server gets an additional time-to-live.
 * This is useful when the pauses between nextRows() calls are long enough for
 * the remote scanner to be garbage collected. The scanner time-to-live is
 * configured on the tablet server with the --scanner_ttl_ms flag and defaults
 * to 60 seconds.
 *
 * Previously fetched results are not invalidated by this call.
 *
 * An error returned by this method should not be taken as an indication that
 * the scan has failed. Subsequent calls to nextRows() might still be
 * successful, particularly if the scanner is configured to be fault tolerant.
 * @return A deferred object that indicates the completion of the request.
 * @throws IllegalStateException if the scanner is already closed, unless it is
 *     prefetching and still holds a cached prefetch result
 */
public Deferred keepAlive() {
  if (!closed) {
    return client.keepAlive(this);
  }
  if (prefetching && cachedPrefetcherDeferred.get() != null) {
    // skip sending keep alive if all of the data has been fetched in prefetching mode
    return Deferred.fromResult(null);
  }
  throw new IllegalStateException("Scanner has already been closed");
}
/**
 * Builds the RPC that fetches the next rows.
 * @return a new scan request in the NEXT state for the current tablet
 */
KuduRpc getNextRowsRequest() {
  final ScanRequest nextRpc = new ScanRequest(table, State.NEXT, tablet);
  return nextRpc;
}
/**
 * Builds the RPC that closes this scanner.
 * @return a new scan request in the CLOSING state for the current tablet
 */
KuduRpc getCloseRequest() {
  final ScanRequest closeRpc = new ScanRequest(table, State.CLOSING, tablet);
  return closeRpc;
}
/**
 * Builds the RPC that keeps this scanner alive on the tablet server.
 * @return a new {@link KeepAliveRequest} for the current tablet
 */
KuduRpc getKeepAliveRequest() {
  final KeepAliveRequest keepAliveRpc = new KeepAliveRequest(table, tablet);
  return keepAliveRpc;
}
/**
 * Guards operations that are only valid before scanning starts, i.e. while no
 * current tablet is set.
 * @throws IllegalStateException if scanning already started.
 */
private void checkScanningNotStarted() {
  if (tablet == null) {
    return;
  }
  throw new IllegalStateException("scanning already started");
}
/**
 * Immutable holder for everything a TS sends back after a Scan request.
 */
static final class Response {
  /** The ID associated with the scanner that issued the request. */
  private final byte[] scannerId;

  /** The actual payload of the response. */
  private final RowResultIterator data;

  /**
   * If false, the filter we use decided there was no more data to scan.
   * In this case, the server has automatically closed the scanner for us,
   * so we don't need to explicitly close it.
   */
  private final boolean more;

  /**
   * Server-assigned timestamp for the scan operation. It's used when
   * the scan operates in READ_AT_SNAPSHOT mode and the timestamp is not
   * specified explicitly. The field is set with the snapshot timestamp sent
   * in the response from the very first tablet server contacted while
   * fetching data from corresponding tablets. If the tablet server does not
   * send the snapshot timestamp in its response, this field is assigned
   * a special value AsyncKuduClient.NO_TIMESTAMP.
   */
  private final long scanTimestamp;

  /**
   * The server timestamp to propagate, if set. If the server response does
   * not contain propagated timestamp, this field is set to special value
   * AsyncKuduClient.NO_TIMESTAMP
   */
  private final long propagatedTimestamp;

  /** Last primary key carried by the response; may be null. */
  private final byte[] lastPrimaryKey;

  /** Resource metrics carried by the response; may be null. */
  private final ResourceMetricsPB resourceMetricsPb;

  /** Stores the given values as-is; no validation or copying is done. */
  Response(byte[] scannerId,
           RowResultIterator data,
           boolean more,
           long scanTimestamp,
           long propagatedTimestamp,
           byte[] lastPrimaryKey,
           ResourceMetricsPB resourceMetricsPb) {
    this.scannerId = scannerId;
    this.data = data;
    this.more = more;
    this.scanTimestamp = scanTimestamp;
    this.propagatedTimestamp = propagatedTimestamp;
    this.lastPrimaryKey = lastPrimaryKey;
    this.resourceMetricsPb = resourceMetricsPb;
  }

  /**
   * Renders the scanner ID, the data and the "more" flag; the scan timestamp
   * is only included when the server sent one.
   */
  @Override
  public String toString() {
    final StringBuilder text = new StringBuilder("AsyncKuduScanner$Response(scannerId = ");
    text.append(Bytes.pretty(scannerId))
        .append(", data = ").append(data)
        .append(", more = ").append(more);
    if (scanTimestamp != AsyncKuduClient.NO_TIMESTAMP) {
      text.append(", responseScanTimestamp = ").append(scanTimestamp);
    }
    return text.append(")").toString();
  }
}
/**
* The kind of Scan RPC that a ScanRequest serializes; ScanRequest.createRequestPB()
* switches on this value to decide which fields to send.
*/
private enum State {
/** Opens a new scanner: the request carries a NewScanRequestPB. */
OPENING,
/** Continues an open scanner; used by getNextRowsRequest(). */
NEXT,
/** Closes the scanner; used by getCloseRequest(). */
CLOSING
}
/**
 * RPC sent to a TabletServer so that it keeps this scanner alive.
 */
final class KeepAliveRequest extends KuduRpc {

  KeepAliveRequest(KuduTable table, RemoteTablet tablet) {
    super(table, client.getTimer(), scanRequestTimeout);
    setTablet(tablet);
  }

  @Override
  String serviceName() {
    return TABLET_SERVER_SERVICE_NAME;
  }

  @Override
  String method() {
    return "ScannerKeepAlive";
  }

  @Override
  ReplicaSelection getReplicaSelection() {
    return replicaSelection;
  }

  /** Serializes this request: the only payload is the scanner's ID. */
  @Override
  Message createRequestPB() {
    return ScannerKeepAliveRequestPB.newBuilder()
        .setScannerId(UnsafeByteOperations.unsafeWrap(scannerId))
        .build();
  }

  @Override
  public byte[] partitionKey() {
    // This key is used to lookup where the request needs to go
    return pruner.nextPartitionKey();
  }

  /**
   * Decodes the keep-alive response. The first element of the returned pair is
   * always null; the second is the server error, unless there was none or its
   * code can be ignored.
   */
  @Override
  Pair deserialize(final CallResponse callResponse,
                   String tsUUID) throws KuduException {
    final ScannerKeepAliveResponsePB.Builder respBuilder =
        ScannerKeepAliveResponsePB.newBuilder();
    readProtobuf(callResponse.getPBMessage(), respBuilder);
    final ScannerKeepAliveResponsePB keepAliveResp = respBuilder.build();

    TabletServerErrorPB error = null;
    if (keepAliveResp.hasError()) {
      final TabletServerErrorPB reported = keepAliveResp.getError();
      if (!canBeIgnored(reported.getCode())) {
        error = reported;
      } else {
        LOG.info("Ignore false alert of scanner not found for keep alive request");
      }
    }
    return new Pair<>(null, error);
  }
}
/**
* RPC sent to the "Scan" method of a TabletServer. Depending on the {@link State} it
* was created with, the same RPC class opens a new scanner, fetches the next rows of
* an already open scanner, or closes the scanner.
*/
final class ScanRequest extends KuduRpc {
/** Which kind of scan request this RPC serializes; see {@link #createRequestPB()}. */
private final State state;
/** The token with which to authorize this RPC. */
private Token.SignedTokenPB authzToken;
/**
* @param table the table handed to the KuduRpc constructor
* @param state which kind of request to serialize (open, next or close)
* @param tablet the tablet this RPC is bound to through setTablet()
*/
ScanRequest(KuduTable table, State state, RemoteTablet tablet) {
// The timer comes from the client; the timeout is the scanner's scanRequestTimeout.
super(table, client.getTimer(), scanRequestTimeout);
setTablet(tablet);
this.state = state;
}
@Override
String serviceName() {
return TABLET_SERVER_SERVICE_NAME;
}
@Override
String method() {
return "Scan";
}
/**
* Returns the server features this RPC requires: nothing when the scanner has no
* predicates, otherwise the column-predicates feature value.
*/
@Override
Collection getRequiredFeatures() {
if (predicates.isEmpty()) {
return ImmutableList.of();
} else {
return ImmutableList.of(Tserver.TabletServerFeatures.COLUMN_PREDICATES_VALUE);
}
}
@Override
ReplicaSelection getReplicaSelection() {
return replicaSelection;
}
/** Scan RPCs always ask for an authorization token. */
@Override
boolean needsAuthzToken() {
return true;
}
/** Stores the token; it is attached to the request only when opening a scanner. */
@Override
void bindAuthzToken(Token.SignedTokenPB token) {
authzToken = token;
}
/**
* Serializes this request into a ScanRequestPB whose content depends on
* {@code state}. In the OPENING case this also records the RPC's tablet in the
* enclosing scanner as a side effect.
* @return the built request message
* @throws RuntimeException if {@code state} is not one of the handled values
*/
@Override
Message createRequestPB() {
final ScanRequestPB.Builder builder = ScanRequestPB.newBuilder();
switch (state) {
case OPENING:
// Save the tablet in the AsyncKuduScanner. This is kind of a kludge, but it really
// is the easiest way.
AsyncKuduScanner.this.tablet = super.getTablet();
NewScanRequestPB.Builder newBuilder = NewScanRequestPB.newBuilder();
// Ask for the scanner's limit minus what is already counted in numRowsReturned.
newBuilder.setLimit(limit - AsyncKuduScanner.this.numRowsReturned);
newBuilder.addAllProjectedColumns(ProtobufHelper.schemaToListPb(schema));
newBuilder.setTabletId(UnsafeByteOperations.unsafeWrap(tablet.getTabletIdAsBytes()));
newBuilder.setOrderMode(AsyncKuduScanner.this.getOrderMode());
newBuilder.setCacheBlocks(cacheBlocks);
// Request the columnar layout only when the scanner's row data format is COLUMNAR.
long rowFormatFlags = Tserver.RowFormatFlags.NO_FLAGS_VALUE;
if (rowDataFormat == RowDataFormat.COLUMNAR) {
rowFormatFlags |= Tserver.RowFormatFlags.COLUMNAR_LAYOUT.getNumber();
}
newBuilder.setRowFormatFlags(rowFormatFlags);
// If the last propagated timestamp is set, send it with the scan.
// For a READ_YOUR_WRITES scan, use the propagated timestamp kept by
// the scanner; otherwise use the one kept by the client.
long timestamp;
if (readMode == ReadMode.READ_YOUR_WRITES) {
timestamp = lowerBoundPropagationTimestamp;
} else {
timestamp = table.getAsyncClient().getLastPropagatedTimestamp();
}
if (timestamp != AsyncKuduClient.NO_TIMESTAMP) {
newBuilder.setPropagatedTimestamp(timestamp);
}
newBuilder.setReadMode(AsyncKuduScanner.this.getReadMode().pbVersion());
// If the mode is READ_AT_SNAPSHOT, send whichever snapshot timestamps are set.
if (AsyncKuduScanner.this.getReadMode() == ReadMode.READ_AT_SNAPSHOT) {
if (AsyncKuduScanner.this.getSnapshotTimestamp() != AsyncKuduClient.NO_TIMESTAMP) {
newBuilder.setSnapTimestamp(AsyncKuduScanner.this.getSnapshotTimestamp());
}
if (AsyncKuduScanner.this.getStartSnapshotTimestamp() != AsyncKuduClient.NO_TIMESTAMP) {
newBuilder.setSnapStartTimestamp(AsyncKuduScanner.this.getStartSnapshotTimestamp());
}
}
// Fault tolerant scans send the scanner's last primary key when it is non-empty
// (NOTE(review): presumably so the server resumes after that key; confirm).
if (isFaultTolerant && AsyncKuduScanner.this.lastPrimaryKey.length > 0) {
newBuilder.setLastPrimaryKey(UnsafeByteOperations.unsafeWrap(lastPrimaryKey));
}
// The primary key bounds are sent only when they are non-empty.
if (AsyncKuduScanner.this.startPrimaryKey.length > 0) {
newBuilder.setStartPrimaryKey(UnsafeByteOperations.unsafeWrap(startPrimaryKey));
}
if (AsyncKuduScanner.this.endPrimaryKey.length > 0) {
newBuilder.setStopPrimaryKey(UnsafeByteOperations.unsafeWrap(endPrimaryKey));
}
for (KuduPredicate pred : predicates.values()) {
newBuilder.addColumnPredicates(pred.toPB());
}
// The token is present only if bindAuthzToken() was called with a non-null value.
if (authzToken != null) {
newBuilder.setAuthzToken(authzToken);
}
builder.setNewScanRequest(newBuilder.build())
.setBatchSizeBytes(batchSizeBytes);
break;
case NEXT:
// Continue an open scanner: identify it by ID and send the scanner's sequence ID.
builder.setScannerId(UnsafeByteOperations.unsafeWrap(scannerId))
.setCallSeqId(AsyncKuduScanner.this.sequenceId)
.setBatchSizeBytes(batchSizeBytes);
break;
case CLOSING:
// Close the scanner; a batch size of 0 is sent along with the close flag.
builder.setScannerId(UnsafeByteOperations.unsafeWrap(scannerId))
.setBatchSizeBytes(0)
.setCloseScanner(true);
break;
default:
throw new RuntimeException("unreachable!");
}
return builder.build();
}
/**
* Decodes a ScanResponsePB into a {@link Response}.
*
* A server error whose code passes canBeIgnored() is dropped. TABLET_NOT_FOUND and
* TABLET_NOT_RUNNING return early with a null response when opening, or when
* fetching next rows on a fault tolerant scan; otherwise they raise a
* NonRecoverableException. SCANNER_EXPIRED raises a
* FaultTolerantScannerExpiredException on a fault tolerant scan. Any other error is
* returned next to the decoded response. A NonRecoverableException is also raised
* when the response carries a scanner ID different from the one the scanner holds.
*
* @param callResponse the RPC response whose protobuf message is decoded
* @param tsUUID passed through to the row result iterator
* (NOTE(review): presumably the UUID of the responding tablet server; confirm)
* @return a pair holding the decoded response (null on the early-return path) and
* the server error that was kept, or null
*/
@Override
Pair deserialize(final CallResponse callResponse,
String tsUUID) throws KuduException {
ScanResponsePB.Builder builder = ScanResponsePB.newBuilder();
readProtobuf(callResponse.getPBMessage(), builder);
ScanResponsePB resp = builder.build();
final byte[] id = resp.getScannerId().toByteArray();
TabletServerErrorPB error = resp.hasError() ? resp.getError() : null;
// Error handling.
if (error != null) {
if (canBeIgnored(resp.getError().getCode())) {
LOG.info("Ignore false alert of scanner not found for scan request");
error = null;
} else {
switch (error.getCode()) {
case TABLET_NOT_FOUND:
case TABLET_NOT_RUNNING:
if (state == State.OPENING || (state == State.NEXT && isFaultTolerant)) {
// Doing this will trigger finding the new location.
return new Pair<>(null, error);
} else {
Status statusIncomplete = Status.Incomplete("Cannot continue scanning, " +
"the tablet has moved and this isn't a fault tolerant scan");
throw new NonRecoverableException(statusIncomplete);
}
case SCANNER_EXPIRED:
if (isFaultTolerant) {
Status status = Status.fromTabletServerErrorPB(error);
throw new FaultTolerantScannerExpiredException(status);
}
// Not fault tolerant: the error is kept and returned with the response below.
// fall through
default:
break;
}
}
}
// TODO: Find a clean way to plumb in reuseRowResult.
// Row-wise data is used when the response has it; otherwise the columnar data is read.
RowResultIterator iterator;
if (resp.hasData()) {
iterator = RowwiseRowResultIterator.makeRowResultIterator(
timeoutTracker.getElapsedMillis(), tsUUID, schema, resp.getData(),
callResponse, reuseRowResult);
} else {
iterator = ColumnarRowResultIterator.makeRowResultIterator(
timeoutTracker.getElapsedMillis(), tsUUID, schema, resp.getColumnarData(),
callResponse, reuseRowResult);
}
boolean hasMore = resp.getHasMoreResults();
// A non-empty scanner ID in the response must match the ID the scanner holds, if any.
if (id.length != 0 && scannerId != null && !Bytes.equals(scannerId, id)) {
Status statusIllegalState = Status.IllegalState("Scan RPC response was for scanner" +
" ID " + Bytes.pretty(id) + " but we expected " +
Bytes.pretty(scannerId));
throw new NonRecoverableException(statusIllegalState);
}
ResourceMetricsPB resourceMetricsPB = resp.hasResourceMetrics() ?
resp.getResourceMetrics() : null;
// Timestamps missing from the response are reported as AsyncKuduClient.NO_TIMESTAMP.
Response response = new Response(id, iterator, hasMore,
resp.hasSnapTimestamp() ? resp.getSnapTimestamp()
: AsyncKuduClient.NO_TIMESTAMP,
resp.hasPropagatedTimestamp() ? resp.getPropagatedTimestamp()
: AsyncKuduClient.NO_TIMESTAMP,
resp.getLastPrimaryKey().toByteArray(), resourceMetricsPB);
if (LOG.isDebugEnabled()) {
LOG.debug("{} for scanner {}", response, AsyncKuduScanner.this);
}
return new Pair<>(response, error);
}
/**
* Describes this RPC: scanner ID, state, the tablet ID when a tablet is known, the
* attempt counter and the superclass description.
*/
@Override
public String toString() {
return "ScanRequest(scannerId=" + Bytes.pretty(scannerId) +
", state=" + state +
(tablet != null ? ", tablet=" + tablet.getTabletId() : "") +
", attempt=" + attempt + ", " + super.toString() + ")";
}
@Override
public byte[] partitionKey() {
// This key is used to look up where the request needs to go.
return pruner.nextPartitionKey();
}
}
/**
 * Builder that assembles {@link AsyncKuduScanner} instances.
 * Obtain an instance through {@link AsyncKuduClient#newScannerBuilder}.
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public static class AsyncKuduScannerBuilder
    extends AbstractKuduScannerBuilder {

  AsyncKuduScannerBuilder(AsyncKuduClient client, KuduTable table) {
    super(client, table);
  }

  /**
   * Creates an {@link AsyncKuduScanner} from the configuration collected so far.
   * The partition pruner is derived from this builder at build time.
   * @return a new {@link AsyncKuduScanner}
   */
  @Override
  public AsyncKuduScanner build() {
    final AsyncKuduScanner scanner = new AsyncKuduScanner(
        client, table,
        projectedColumnNames, projectedColumnIndexes,
        readMode, isFaultTolerant, scanRequestTimeout,
        predicates, limit, cacheBlocks, prefetching,
        lowerBoundPrimaryKey, upperBoundPrimaryKey,
        startTimestamp, htTimestamp, batchSizeBytes,
        PartitionPruner.create(this),
        replicaSelection, keepAlivePeriodMs);
    return scanner;
  }
}
}