/*-
* Copyright (C) 2011, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This file was distributed by Oracle as part of a version of Oracle NoSQL
* Database made available at:
*
* http://www.oracle.com/technetwork/database/database-technologies/nosqldb/downloads/index.html
*
* Please see the LICENSE file included in the top-level directory of the
* appropriate version of Oracle NoSQL Database for a copy of the license and
* additional information.
*/
package oracle.kv.impl.query.runtime;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static oracle.kv.impl.util.SerialVersion.UNKNOWN;
import static oracle.kv.impl.util.SerializationUtil.writeNonNullByteArray;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import oracle.kv.Consistency;
import oracle.kv.Depth;
import oracle.kv.Direction;
import oracle.kv.Durability;
import oracle.kv.ResultHandler;
import oracle.kv.StoreIteratorException;
import oracle.kv.impl.api.KVStoreImpl;
import oracle.kv.impl.api.Request;
import oracle.kv.impl.api.StoreIteratorParams;
import oracle.kv.impl.api.ops.Result;
import oracle.kv.impl.api.ops.Result.QueryResult;
import oracle.kv.impl.api.ops.TableQuery;
import oracle.kv.impl.api.parallelscan.PartitionScanIterator;
import oracle.kv.impl.api.parallelscan.ShardScanIterator;
import oracle.kv.impl.api.query.PreparedStatementImpl.DistributionKind;
import oracle.kv.impl.api.table.BinaryValueImpl;
import oracle.kv.impl.api.table.BooleanValueImpl;
import oracle.kv.impl.api.table.FieldDefImpl;
import oracle.kv.impl.api.table.FieldValueImpl;
import oracle.kv.impl.api.table.NullValueImpl;
import oracle.kv.impl.api.table.NumberValueImpl;
import oracle.kv.impl.api.table.PrimaryKeyImpl;
import oracle.kv.impl.api.table.RecordDefImpl;
import oracle.kv.impl.api.table.RecordValueImpl;
import oracle.kv.impl.api.table.TableImpl;
import oracle.kv.impl.api.table.TimestampValueImpl;
import oracle.kv.impl.api.table.TupleValue;
import oracle.kv.impl.async.AsyncTableIterator;
import oracle.kv.impl.async.IterationHandleNotifier;
import oracle.kv.impl.query.QueryException;
import oracle.kv.impl.query.QueryStateException;
import oracle.kv.impl.query.compiler.Expr;
import oracle.kv.impl.query.compiler.FunctionLib.FuncCode;
import oracle.kv.impl.query.compiler.QueryFormatter;
import oracle.kv.impl.query.compiler.SortSpec;
import oracle.kv.impl.topo.PartitionId;
import oracle.kv.impl.topo.RepGroupId;
import oracle.kv.impl.util.SerialVersion;
import oracle.kv.impl.util.SerializationUtil;
import oracle.kv.query.ExecuteOptions;
import oracle.kv.stats.DetailedMetrics;
/**
* A ReceiveIter is placed at the boundary between parts of the query that
* execute on different "machines". Currently, there can be only one ReceiveIter
* in the whole query plan. It executes at a "client machine" and its child
* subplan executes at a "server machine". The child subplan may actually be
* replicated on several server machines (RNs), in which case the ReceiveIter
* acts as a UNION ALL expr, collecting and propagating the results it receives
* from its children. Furthermore, the ReceiveIter may perform a merge-sort over
* its inputs (if the inputs return sorted results).
*
* If the ReceiveIter is the root iter, it just propagates to its output the
* FieldValues (most likely RecordValues) it receives from the RNs. Otherwise,
* if its input iter produces tuples, the ReceiveIter will recreate these tuples
* at its output by unnesting into tuples the RecordValues arriving from the RNs.
*/
public class ReceiveIter extends PlanIter {
private static class ReceiveIterState extends PlanIterState {
final PartitionId thePartitionId;
AsyncTableIterator<FieldValueImpl> theRemoteResultsIter;
Throwable theRemoteResultsIterCloseException;
HashSet<BinaryValueImpl> thePrimKeysSet;
ReceiveIterState(PartitionId pid, boolean eliminateIndexDups) {
thePartitionId = pid;
if (eliminateIndexDups) {
thePrimKeysSet = new HashSet<BinaryValueImpl>(1000);
}
}
@Override
public void done() {
super.done();
clear();
}
@Override
public void reset(PlanIter iter) {
super.reset(iter);
clear();
}
@Override
public void close() {
super.close();
if (theRemoteResultsIter != null) {
theRemoteResultsIterCloseException =
theRemoteResultsIter.getCloseException();
}
clear();
}
void clear() {
if (theRemoteResultsIter != null) {
theRemoteResultsIter.close();
theRemoteResultsIter = null;
}
if (thePrimKeysSet != null) {
thePrimKeysSet.clear();
}
}
}
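/*
* Holds the serialized form of the server-side subplan together with the
* serial version it was generated for. See ensureSerializedIter().
*/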
private static class CachedBinaryPlan {
private byte[] thePlan = null;
private short theSerialVersion = UNKNOWN;
private CachedBinaryPlan(
byte[] plan,
short serialVersion) {
thePlan = plan;
theSerialVersion = serialVersion;
}
public static CachedBinaryPlan create(
byte[] plan,
short serialVersion) {
return new CachedBinaryPlan(plan, serialVersion);
}
byte[] getPlan() {
return thePlan;
}
short getSerialVersion() {
return theSerialVersion;
}
}
private final PlanIter theInputIter;
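/* Cached serialized form of theInputIter; see ensureSerializedIter(). */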
private transient volatile CachedBinaryPlan theSerializedInputIter = null;
private final FieldDefImpl theInputType;
/* added in QUERY_VERSION_2 */
private final boolean theMayReturnNULL;
private final int[] theSortFieldPositions;
private final SortSpec[] theSortSpecs;
private final int[] thePrimKeyPositions;
private final int[] theTupleRegs;
private final DistributionKind theDistributionKind;
private final RecordValueImpl thePrimaryKey;
private PartitionId thePartitionId;
private final long theTableId;
private final String theTableName;
private final String theNamespace;
private final PlanIter[] thePushedExternals;
private final int theNumRegs;
private final int theNumIters;
/* added in QUERY_VERSION_5 */
private final boolean theIsUpdate;
private transient volatile IterationHandleNotifier
theAsyncIterHandleNotifier;
public ReceiveIter(
Expr e,
int resultReg,
PlanIter input,
FieldDefImpl inputType,
boolean mayReturnNULL,
int[] sortFieldPositions,
SortSpec[] sortSpecs,
int[] primKeyPositions,
DistributionKind distrKind,
PrimaryKeyImpl primKey,
PlanIter[] pushedExternals,
int numRegs,
int numIters,
boolean isUpdate) {
super(e, resultReg);
theInputIter = input;
theInputType = inputType;
theMayReturnNULL = mayReturnNULL;
theSortFieldPositions = sortFieldPositions;
theSortSpecs = sortSpecs;
thePrimKeyPositions = primKeyPositions;
theDistributionKind = distrKind;
thePushedExternals = pushedExternals;
/*
* If the ReceiveIter is the root iter, it just propagates to its
* output the FieldValues (most likely RecordValues) it receives from
* the RNs. Otherwise, if its input iter produces tuples, the
* ReceiveIter will recreate these tuples at its output by unnesting
* into tuples the RecordValues arriving from the RNs.
*/
if (input.producesTuples() && e.getQCB().getRootExpr() != e) {
theTupleRegs = input.getTupleRegs();
} else {
theTupleRegs = null;
}
theTableId = e.getQCB().getTargetTableId();
if (primKey != null) {
thePrimaryKey = primKey;
theTableName = primKey.getTable().getFullName();
theNamespace = primKey.getTable().getInternalNamespace();
/*
* If it's a SINGLE_PARTITION query with no external vars, compute
* the partition id now. Otherwise, it is computed in the open().
*/
if (theDistributionKind == DistributionKind.SINGLE_PARTITION &&
(thePushedExternals == null ||
thePushedExternals.length == 0)) {
thePartitionId =
primKey.getPartitionId(e.getQCB().getStore());
}
} else {
thePrimaryKey = null;
theTableName = null;
theNamespace = null;
}
theNumRegs = numRegs;
theNumIters = numIters;
theIsUpdate = isUpdate;
assert(!theIsUpdate ||
theDistributionKind == DistributionKind.SINGLE_PARTITION);
}
/**
* FastExternalizable constructor.
*/
public ReceiveIter(DataInput in, short serialVersion) throws IOException {
super(in, serialVersion);
theNumRegs = readPositiveInt(in);
theNumIters = readPositiveInt(in);
theInputType = (FieldDefImpl) deserializeFieldDef(in, serialVersion);
theMayReturnNULL = in.readBoolean();
theSortFieldPositions = deserializeIntArray(in, serialVersion);
theSortSpecs = deserializeSortSpecs(in, serialVersion);
thePrimKeyPositions = deserializeIntArray(in, serialVersion);
theTupleRegs = deserializeIntArray(in, serialVersion);
short ordinal = in.readShort();
theDistributionKind = DistributionKind.values()[ordinal];
theTableId = in.readLong();
theTableName = SerializationUtil.readString(in, serialVersion);
if (theTableName != null) {
theNamespace = SerializationUtil.readString(in, serialVersion);
thePrimaryKey = deserializeKey(in, serialVersion);
} else {
thePrimaryKey = null;
theNamespace = null;
}
thePushedExternals = deserializeIters(in, serialVersion);
theIsUpdate = in.readBoolean();
if (theDistributionKind == DistributionKind.SINGLE_PARTITION &&
(thePushedExternals == null ||
thePushedExternals.length == 0)) {
thePartitionId = new PartitionId(in.readInt());
}
byte[] bytes = SerializationUtil.readNonNullByteArray(in);
setSerializedIter(bytes, serialVersion);
/* keeps compiler happy regarding final members */
theInputIter = null;
}
/**
* FastExternalizable writer. Must call superclass method first to
* write common elements.
*/
@Override
public void writeFastExternal(DataOutput out, short serialVersion)
throws IOException {
super.writeFastExternal(out, serialVersion);
out.writeInt(theNumRegs);
out.writeInt(theNumIters);
/*
* theInputIter is not serialized. It is the server side of the query
* plan and does not need to be used in this path.
*/
serializeFieldDef(theInputType, out, serialVersion);
out.writeBoolean(theMayReturnNULL);
serializeIntArray(theSortFieldPositions, out, serialVersion);
serializeSortSpecs(theSortSpecs, out, serialVersion);
serializeIntArray(thePrimKeyPositions, out, serialVersion);
serializeIntArray(theTupleRegs, out, serialVersion);
out.writeShort(theDistributionKind.ordinal());
out.writeLong(theTableId);
SerializationUtil.writeString(out, serialVersion, theTableName);
if (theTableName != null) {
SerializationUtil.writeString(out, serialVersion, theNamespace);
serializeKey(thePrimaryKey, out, serialVersion);
}
serializeIters(thePushedExternals, out, serialVersion);
out.writeBoolean(theIsUpdate);
if (theDistributionKind == DistributionKind.SINGLE_PARTITION &&
(thePushedExternals == null ||
thePushedExternals.length == 0)) {
out.writeInt(thePartitionId.getPartitionId());
}
byte[] bytes = ensureSerializedIter(serialVersion);
SerializationUtil.writeNonNullByteArray(out, bytes);
}
@Override
public PlanIterKind getKind() {
return PlanIterKind.RECV;
}
/*
* These are public so that PreparedStatementImpl can reconstruct itself from
* a serialized format.
*/
public int getNumRegisters() {
return theNumRegs;
}
public int getNumIterators() {
return theNumIters;
}
/*
* This should be compile-time only so it should be safe to *not* include
* theInputIter in serialization/deserialization.
*/
@Override
public int[] getTupleRegs() {
return theInputIter.getTupleRegs();
}
private boolean doesSort() {
return (theSortFieldPositions != null);
}
@Override
public void setIterationHandleNotifier(
IterationHandleNotifier iterHandleNotifier) {
theAsyncIterHandleNotifier = iterHandleNotifier;
}
/**
* Sets or updates the cached serialized version of the query plan under
* this ReceiveIter.
*
* The method is called from TableQuery.writeFastExternal(). This implies
* that it will be called by each parallel-scan stream that is created
* by "this", which further implies that it can be called concurrently by
* multiple threads.
*
* The method cannot be called earlier, because each stream must generate
* its own serialized plan. This is because the plan generated depends on
* the version of the RN that the stream connects with (and the RN may
* change every time the stream requests a new batch of results). Of course,
* unless a system upgrade is going on, all RNs will have the same version,
* and all streams will generate the same binary plan. So, to avoid
* generating the same plan again and again, by each stream and for each
* batch of results, whenever one stream generates a binary plan, that
* plan, together with the serial version used to generate it, is cached in the
* ReceiveIter. If another stream finds a cached plan whose version is the
* same as the current version used by the stream, the stream can use the
* cached plan, instead of generating it again.
*/
public byte[] ensureSerializedIter(short serialVersion) {
CachedBinaryPlan cachedPlan = theSerializedInputIter;
if (cachedPlan != null &&
cachedPlan.getPlan() != null &&
cachedPlan.getSerialVersion() == serialVersion) {
return cachedPlan.thePlan;
}
synchronized (this) {
try {
final ByteArrayOutputStream baos =
new ByteArrayOutputStream();
final DataOutput dataOut = new DataOutputStream(baos);
PlanIter.serializeIter(theInputIter, dataOut, serialVersion);
byte[] ba = baos.toByteArray();
cachedPlan = CachedBinaryPlan.create(ba, serialVersion);
theSerializedInputIter = cachedPlan;
return ba;
}
catch (IOException ioe) {
throw new QueryException(ioe);
}
}
}
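/*
* Called from the deserializing constructor to install the binary plan
* that was read off the wire, so it does not have to be regenerated.
*/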
public synchronized void setSerializedIter(byte[] bytes,
short serialVersion) {
assert theSerializedInputIter == null;
theSerializedInputIter = CachedBinaryPlan.create(bytes, serialVersion);
}
/**
* This method executes a query on the server side and stores in the
* iterator state a ParallelScanIterator over the results.
*
* At some point a refactor of how parallel scan and index scan work may
* be necessary to take into consideration these facts:
* o a query may be an update or read-only (this can probably be known
* ahead of time once the query is prepared). In any case the type of
* query and Durability specified will affect routing of the query.
* o some iterator params are not relevant (direction, keys, ranges, Depth)
*/
private void ensureIterator(
RuntimeControlBlock rcb,
ReceiveIterState state) {
if (state.theRemoteResultsIter != null) {
return;
}
switch (theDistributionKind) {
case SINGLE_PARTITION:
state.theRemoteResultsIter = runOnOnePartition(rcb);
break;
case ALL_PARTITIONS:
state.theRemoteResultsIter = runOnAllPartitions(rcb);
break;
case ALL_SHARDS:
state.theRemoteResultsIter = runOnAllShards(rcb);
break;
default:
throw new QueryStateException(
"Unknown distribution kind: " + theDistributionKind);
}
rcb.setTableIterator(state.theRemoteResultsIter);
}
/**
* Execute the child plan of this ReceiveIter on all partitions
*/
private AsyncTableIterator<FieldValueImpl> runOnAllPartitions(
final RuntimeControlBlock rcb) {
if (rcb.getMaxReadKB() > 0 || rcb.getUseBatchSizeAsLimit()) {
return new SequentialPartitionsIterator(rcb, null/*partitions*/);
}
ExecuteOptions options = rcb.getExecuteOptions();
/*
* Compute the direction to be stored in the BaseParallelScanIterator.
* Because the actual comparisons among the query results are done by
* the streams, the BaseParallelScanIterator just needs to know whether
* sorting is needed or not in order to invoke the comparison method or
* not. So, we just need to pass UNORDERED or FORWARD.
*/
Direction dir = (theSortFieldPositions != null ?
Direction.FORWARD :
Direction.UNORDERED);
StoreIteratorParams params =
new StoreIteratorParams(
dir,
rcb.getBatchSize(),
null, // key bytes
null, // key range
Depth.PARENT_AND_DESCENDANTS,
rcb.getConsistency(),
rcb.getTimeout(),
rcb.getTimeUnit(),
rcb.getPartitionSet());
return new PartitionScanIterator<FieldValueImpl>(
rcb.getStore(), options, params, theAsyncIterHandleNotifier) {
@Override
protected QueryPartitionStream createStream(
RepGroupId groupId,
int partitionId) {
return new QueryPartitionStream(groupId, partitionId);
}
@Override
protected TableQuery generateGetterOp(byte[] resumeKey) {
throw new QueryStateException("Unexpected call");
}
@Override
protected void convertResult(
Result result,
List<FieldValueImpl> elementList) {
List<FieldValueImpl> queryResults = result.getQueryResults();
// TODO: try to avoid this useless loop
for (FieldValueImpl res : queryResults) {
elementList.add(res);
}
}
@Override
protected int compare(FieldValueImpl one, FieldValueImpl two) {
throw new QueryStateException("Unexpected call");
}
class QueryPartitionStream extends PartitionStream {
private ResumeInfo theResumeInfo = new ResumeInfo(rcb);
QueryPartitionStream(RepGroupId groupId, int partitionId) {
super(groupId, partitionId, null);
}
@Override
protected Request makeReadRequest() {
TableQuery op = new TableQuery(
DistributionKind.ALL_PARTITIONS,
theInputType,
theMayReturnNULL,
ReceiveIter.this,
rcb.getExternalVars(),
theNumIters,
theNumRegs,
theTableId,
rcb.getMathContext(),
rcb.getTraceLevel(),
rcb.getBatchSize(),
0, /* maxCurrentReadKB */
0, /* maxReadKB*/
theResumeInfo,
1 /* emptyReadFactor */);
return storeImpl.makeReadRequest(
op,
new PartitionId(partitionId),
storeIteratorParams.getConsistency(),
storeIteratorParams.getTimeout(),
storeIteratorParams.getTimeoutUnit(),
null);
}
@Override
protected void setResumeKey(Result result) {
QueryResult res = (QueryResult)result;
theResumeInfo.refresh(res.getResumeInfo());
if (rcb.getTraceLevel() >= 1) {
rcb.trace("Received " + res.getNumRecords() +
" results from group : " + groupId +
" partition " + partitionId);
}
if (rcb.getTraceLevel() >= 4) {
rcb.trace(theResumeInfo.toString());
}
}
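/*
* Orders two streams by comparing their current results; ties are
* broken by partition id so that no two streams ever compare as equal.
*/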
@Override
protected int compareInternal(Stream o) {
QueryPartitionStream other = (QueryPartitionStream)o;
int cmp;
FieldValueImpl v1 =
currentResultSet.getQueryResults().
get(currentResultPos);
FieldValueImpl v2 =
other.currentResultSet.getQueryResults().
get(other.currentResultPos);
if (theInputType.isRecord()) {
RecordValueImpl rec1 = (RecordValueImpl)v1;
RecordValueImpl rec2 = (RecordValueImpl)v2;
cmp = compareRecords(rec1, rec2);
} else {
cmp = compareAtomics(v1, v2, 0);
}
if (cmp == 0) {
return (partitionId < other.partitionId ? -1 : 1);
}
return cmp;
}
}
};
}
/**
* Execute the child plan of this ReceiveIter on a single partition
*/
private AsyncTableIterator<FieldValueImpl> runOnOnePartition(
final RuntimeControlBlock rcb) {
ReceiveIterState state = (ReceiveIterState)rcb.getState(theStatePos);
final PartitionId pid = state.thePartitionId;
PartitionId[] partitions = new PartitionId[1];
partitions[0] = pid;
return new SequentialPartitionsIterator(rcb, partitions);
}
/**
* Execute the child plan of this ReceiveIter on all shards
* TODO: remove duplicates in result
*/
private AsyncTableIterator<FieldValueImpl> runOnAllShards(
final RuntimeControlBlock rcb) {
if (rcb.getMaxReadKB() > 0 || rcb.getUseBatchSizeAsLimit()) {
/* If size limit is specified, scan shards sequentially. */
return new SequentialShardsIterator(rcb);
}
ExecuteOptions options = rcb.getExecuteOptions();
/*
* Compute the direction to be stored in the BaseParallelScanIterator.
* Because the actual comparisons among the query results are done by
* the streams, the BaseParallelScanIterator just needs to know whether
* sorting is needed or not in order to invoke the comparison method or
* not. So, we just need to pass UNORDERED or FORWARD.
*/
Direction dir = (theSortFieldPositions != null ?
Direction.FORWARD :
Direction.UNORDERED);
return new ShardScanIterator<FieldValueImpl>(
rcb.getStore(), options, dir, rcb.getShardSet(),
theAsyncIterHandleNotifier) {
@Override
protected QueryShardStream createStream(RepGroupId groupId) {
return new QueryShardStream(groupId);
}
@Override
protected TableQuery createOp(
byte[] resumeSecondaryKey,
byte[] resumePrimaryKey) {
throw new QueryStateException("Unexpected call");
}
@Override
protected void convertResult(
Result result,
List<FieldValueImpl> elementList) {
List<FieldValueImpl> queryResults = result.getQueryResults();
for (FieldValueImpl res : queryResults) {
elementList.add(res);
}
}
@Override
protected int compare(FieldValueImpl one, FieldValueImpl two) {
throw new QueryStateException("Unexpected call");
}
class QueryShardStream extends ShardStream {
private ResumeInfo theResumeInfo = new ResumeInfo(rcb);
QueryShardStream(RepGroupId groupId) {
super(groupId, null, null);
}
@Override
protected Request makeReadRequest() {
TableQuery op = new TableQuery(
DistributionKind.ALL_SHARDS,
theInputType,
theMayReturnNULL,
ReceiveIter.this,
rcb.getExternalVars(),
theNumIters,
theNumRegs,
theTableId,
rcb.getMathContext(),
rcb.getTraceLevel(),
rcb.getBatchSize(),
0, /* maxCurrentReadKB */
0, /* maxReadKB */
theResumeInfo,
1 /* emptyReadFactor */);
return storeImpl.makeReadRequest(
op,
groupId,
consistency,
requestTimeoutMs,
MILLISECONDS,
null);
}
@Override
protected void setResumeKey(Result result) {
QueryResult res = (QueryResult)result;
theResumeInfo.refresh(res.getResumeInfo());
if (rcb.getTraceLevel() >= 1) {
rcb.trace("Received " + res.getNumRecords() +
" results from group : " + groupId +
" shard " + groupId);
}
if (rcb.getTraceLevel() >= 4) {
rcb.trace(theResumeInfo.toString());
}
}
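/*
* Orders two streams by comparing their current results; ties are
* broken by shard (rep group) id.
*/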
@Override
protected int compareInternal(Stream o) {
QueryShardStream other = (QueryShardStream)o;
int cmp;
FieldValueImpl v1 =
currentResultSet.getQueryResults().
get(currentResultPos);
FieldValueImpl v2 =
other.currentResultSet.getQueryResults().
get(other.currentResultPos);
if (theInputType.isRecord()) {
RecordValueImpl rec1 = (RecordValueImpl)v1;
RecordValueImpl rec2 = (RecordValueImpl)v2;
cmp = compareRecords(rec1, rec2);
} else {
cmp = compareAtomics(v1, v2, 0);
}
if (cmp == 0) {
return getGroupId().compareTo(other.getGroupId());
}
return cmp;
}
}
};
}
@Override
public void open(RuntimeControlBlock rcb) {
boolean alwaysFalse = false;
PartitionId pid = PartitionId.NULL_ID;
if (theDistributionKind == DistributionKind.SINGLE_PARTITION) {
if (thePushedExternals != null &&
thePushedExternals.length > 0) {
/*
* Make a copy of thePrimaryKey in order to replace its
* "dummy", placeholder values with the corresponding values
* of the external-variable expressions
*
* Optimize the local case where thePrimaryKey is an actual
* PrimaryKeyImpl, avoiding a potentially costly getTable()
* call.
*/
PrimaryKeyImpl primaryKey;
TableImpl table;
if (thePrimaryKey instanceof PrimaryKeyImpl) {
primaryKey = (PrimaryKeyImpl) thePrimaryKey.clone();
table = primaryKey.getTable();
} else {
table = rcb.getMetadataHelper().
getTable(theNamespace, theTableName);
primaryKey = table.createPrimaryKey(thePrimaryKey);
}
int size = thePushedExternals.length;
for (int i = 0; i < size; ++i) {
PlanIter iter = thePushedExternals[i];
if (iter == null) {
continue;
}
iter.open(rcb);
iter.next(rcb);
FieldValueImpl val = rcb.getRegVal(iter.getResultReg());
iter.close(rcb);
FieldValueImpl newVal = BaseTableIter.castValueToIndexKey(
table, null, i, val, FuncCode.OP_EQ);
if (newVal != val) {
if (newVal == BooleanValueImpl.falseValue) {
alwaysFalse = true;
break;
}
val = newVal;
}
String colName = table.getPrimaryKeyColumnName(i);
primaryKey.put(colName, val);
}
pid = primaryKey.getPartitionId(rcb.getStore());
} else {
pid = thePartitionId;
}
}
ReceiveIterState state =
new ReceiveIterState(pid, (thePrimKeyPositions != null));
rcb.setState(theStatePos, state);
if (theTupleRegs != null) {
TupleValue tuple = new TupleValue((RecordDefImpl)theInputType,
rcb.getRegisters(),
theTupleRegs);
rcb.setRegVal(theResultReg, tuple);
}
if (alwaysFalse) {
state.done();
}
}
@Override
public boolean next(RuntimeControlBlock rcb) {
return nextInternal(rcb, false /* localOnly */);
}
@Override
public boolean nextLocal(RuntimeControlBlock rcb) {
return nextInternal(rcb, true /* localOnly */);
}
private boolean nextInternal(RuntimeControlBlock rcb, boolean localOnly) {
/*
* Catch StoreIteratorException and, if the cause is a RuntimeException
* (such as a QueryException), throw that instead to provide more
* information to the caller.
*/
try {
ReceiveIterState state =
(ReceiveIterState)rcb.getState(theStatePos);
if (state.isDone()) {
return false;
}
ensureIterator(rcb, state);
FieldValueImpl res;
do {
if (localOnly) {
res = state.theRemoteResultsIter.nextLocal();
if (res == null) {
if (state.theRemoteResultsIter.isClosed() &&
!state.isClosed()) {
state.done();
}
return false;
}
} else {
boolean more = state.theRemoteResultsIter.hasNext();
if (!more) {
state.done();
return false;
}
res = state.theRemoteResultsIter.next();
}
/* Eliminate index duplicates */
if (thePrimKeyPositions != null) {
BinaryValueImpl binPrimKey = createBinaryPrimKey(res);
boolean added = state.thePrimKeysSet.add(binPrimKey);
if (!added) {
continue;
}
}
break;
} while (true);
if (theTupleRegs != null) {
TupleValue tuple = (TupleValue)rcb.getRegVal(theResultReg);
tuple.toTuple((RecordValueImpl)res, doesSort());
} else if (doesSort() && res.isRecord()) {
((RecordValueImpl)res).convertEmptyToNull();
rcb.setRegVal(theResultReg, res);
} else {
rcb.setRegVal(theResultReg,
res.isEMPTY() ? NullValueImpl.getInstance() : res);
}
return true;
} catch (StoreIteratorException sie) {
final Throwable cause = sie.getCause();
if (cause instanceof RuntimeException) {
throw (RuntimeException) cause;
}
if (cause instanceof Error) {
throw (Error) cause;
}
throw new IllegalStateException("Unexpected exception: " + cause,
cause);
}
}
@Override
public void reset(RuntimeControlBlock rcb) {
ReceiveIterState state = (ReceiveIterState)rcb.getState(theStatePos);
state.reset(this);
}
@Override
public void close(RuntimeControlBlock rcb) {
ReceiveIterState state = (ReceiveIterState)rcb.getState(theStatePos);
if (state == null) {
return;
}
state.close();
}
@Override
public Throwable getCloseException(RuntimeControlBlock rcb) {
final ReceiveIterState state =
(ReceiveIterState) rcb.getState(theStatePos);
if (state == null) {
return null;
}
if (state.theRemoteResultsIter != null) {
return state.theRemoteResultsIter.getCloseException();
}
return state.theRemoteResultsIterCloseException;
}
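/*
* Serializes the primary-key columns of the given result into a single
* binary value. These values are collected in thePrimKeysSet in order to
* detect and eliminate duplicate results produced by index scans.
*/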
private BinaryValueImpl createBinaryPrimKey(FieldValueImpl result) {
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
final DataOutput out = new DataOutputStream(baos);
try {
if (!result.isRecord()) {
assert(thePrimKeyPositions.length == 1);
writeValue(out, result, 0);
} else {
for (int i = 0; i < thePrimKeyPositions.length; ++i) {
FieldValueImpl fval =
((RecordValueImpl)result).get(thePrimKeyPositions[i]);
writeValue(out, fval, i);
}
}
} catch (IOException e) {
throw new QueryStateException(
"Failed to create binary prim key due to IOException:\n" +
e.getMessage());
}
byte[] bytes = baos.toByteArray();
return FieldDefImpl.binaryDef.createBinary(bytes);
}
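/*
* Writes a single primary-key column value in a canonical binary format,
* so that equal key values always produce equal byte arrays.
*/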
private void writeValue(DataOutput out, FieldValueImpl val, int i)
throws IOException {
switch (val.getType()) {
case INTEGER:
SerializationUtil.writePackedInt(out, val.getInt());
break;
case LONG:
SerializationUtil.writePackedLong(out, val.getLong());
break;
case DOUBLE:
out.writeDouble(val.getDouble());
break;
case FLOAT:
out.writeFloat(val.getFloat());
break;
case STRING:
/* Use the current format */
SerializationUtil.writeString(
out, SerialVersion.CURRENT, val.getString());
break;
case ENUM:
out.writeShort(val.asEnum().getIndex());
break;
case TIMESTAMP:
TimestampValueImpl ts = (TimestampValueImpl)val;
writeNonNullByteArray(out, ts.getBytes());
break;
case NUMBER:
NumberValueImpl num = (NumberValueImpl)val;
writeNonNullByteArray(out, num.getBytes());
break;
default:
throw new QueryStateException(
"Unexpected type for primary key column : " +
val.getType() + ", at result column " + i);
}
}
@Override
protected void displayContent(StringBuilder sb, QueryFormatter formatter) {
if (theSortFieldPositions != null) {
formatter.indent(sb);
sb.append("Sort Field Positions : ");
for (int i = 0; i < theSortFieldPositions.length; ++i) {
sb.append(theSortFieldPositions[i]);
if (i < theSortFieldPositions.length - 1) {
sb.append(", ");
}
}
sb.append(",\n");
}
if (thePrimKeyPositions != null) {
formatter.indent(sb);
sb.append("Primary Key Positions : ");
for (int i = 0; i < thePrimKeyPositions.length; ++i) {
sb.append(thePrimKeyPositions[i]);
if (i < thePrimKeyPositions.length - 1) {
sb.append(", ");
}
}
sb.append(",\n");
}
formatter.indent(sb);
sb.append("DistributionKind : ").append(theDistributionKind);
sb.append(",\n");
if (thePushedExternals != null) {
sb.append("\n");
formatter.indent(sb);
sb.append("EXTERNAL KEY EXPRS: ").append(thePushedExternals.length);
for (PlanIter iter : thePushedExternals) {
sb.append("\n");
if (iter != null) {
iter.display(sb, formatter);
} else {
formatter.indent(sb);
sb.append("null");
}
}
sb.append(",\n\n");
}
formatter.indent(sb);
sb.append("Number of Registers :").append(theNumRegs);
sb.append(",\n");
formatter.indent(sb);
sb.append("Number of Iterators :").append(theNumIters);
sb.append(",\n");
theInputIter.display(sb, formatter);
}
int compareRecords(RecordValueImpl rec1, RecordValueImpl rec2) {
for (int i = 0; i < theSortFieldPositions.length; ++i) {
int pos = theSortFieldPositions[i];
FieldValueImpl v1 = rec1.get(pos);
FieldValueImpl v2 = rec2.get(pos);
int comp = compareAtomics(v1, v2, i);
if (comp != 0) {
return comp;
}
}
/* they must be equal */
return 0;
}
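/*
* Compares two atomic values at sort position "sortPos", applying the
* NULLS FIRST/LAST and ASC/DESC specs for that position to SQL NULL,
* EMPTY, and json null values.
*/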
int compareAtomics(FieldValueImpl v1, FieldValueImpl v2, int sortPos) {
int comp;
if (v1.isNull()) {
if (v2.isNull()) {
comp = 0;
} else {
comp = (theSortSpecs[sortPos].theNullsFirst ? -1 : 1);
}
} else if (v2.isNull()) {
comp = (theSortSpecs[sortPos].theNullsFirst ? 1 : -1);
} else if (v1.isEMPTY()) {
if (v2.isEMPTY()) {
comp = 0;
} else if (v2.isJsonNull()) {
comp = (theSortSpecs[sortPos].theNullsFirst ? 1 : -1);
} else {
comp = (theSortSpecs[sortPos].theNullsFirst ? -1 : 1);
}
} else if (v2.isEMPTY()) {
if (v1.isJsonNull()) {
comp = (theSortSpecs[sortPos].theNullsFirst ? -1 : 1);
} else {
comp = (theSortSpecs[sortPos].theNullsFirst ? 1 : -1);
}
} else if (v1.isJsonNull()) {
if (v2.isJsonNull()) {
comp = 0;
} else {
comp = (theSortSpecs[sortPos].theNullsFirst ? -1 : 1);
}
} else if (v2.isJsonNull()) {
comp = (theSortSpecs[sortPos].theNullsFirst ? 1 : -1);
} else {
comp = v1.compareTo(v2);
}
return (theSortSpecs[sortPos].theIsDesc ? -comp : comp);
}
/**
* The partitions iterator that scans all partitions from 1 to N
* sequentially.
*/
private class SequentialPartitionsIterator
implements AsyncTableIterator<FieldValueImpl> {
private final RuntimeControlBlock theRCB;
private final PartitionId[] thePartitions;
/*
* A single-partition iterator used to scan the current partition.
*/
private AbstractScanIterator thePartitionIter;
SequentialPartitionsIterator(
RuntimeControlBlock rcb,
PartitionId[] partitions) {
theRCB = rcb;
if (partitions != null) {
thePartitions = partitions;
} else {
Set<PartitionId> pids = rcb.getStore().getTopology().
getPartitionMap().getAllIds();
thePartitions = pids.toArray(new PartitionId[pids.size()]);
}
int pidIdx = rcb.getPidIdx();
if (pidIdx < 0 || pidIdx >= thePartitions.length) {
throw new IllegalArgumentException(
"Invalid partition id in continuation key: " +
pidIdx);
}
/* Set emptyReadFactor to 1 for single partition scan. */
int emptyReadFactor = (thePartitions.length == 1) ? 1 : 0;
thePartitionIter =
new AbstractScanIterator(theRCB,
thePartitions[pidIdx],
null,/* group id */
theRCB.getBatchSize(),
theRCB.getMaxReadKB(),
emptyReadFactor);
}
@Override
public boolean hasNext() {
if (thePartitionIter == null) {
return false;
}
while (!thePartitionIter.hasNext()) {
if (theRCB.getReachedLimit()) {
theRCB.createContinuationKey();
return false;
}
int pidIdx = theRCB.incPidIdx();
if (pidIdx == thePartitions.length) {
/* Return false if there are no more partitions to scan */
thePartitionIter.close();
thePartitionIter = null;
theRCB.setContinuationKey(null);
return false;
}
/*
* Calculate the read size limit for scanning the next
* partition; return false if the limit has been reached.
*/
int maxReadKB = 0;
if (theRCB.getMaxReadKB() > 0) {
maxReadKB = theRCB.getMaxReadKB() - theRCB.getReadKB();
if (maxReadKB <= 0) {
theRCB.createContinuationKey();
return false;
}
}
/*
* Calculate the batch size limit for scanning the next
* partition; return false if the limit has been reached.
*/
int maxReadNum;
if (theRCB.getUseBatchSizeAsLimit()) {
maxReadNum = theRCB.getBatchSize() - theRCB.getResultSize();
if (maxReadNum <= 0) {
theRCB.createContinuationKey();
return false;
}
} else {
maxReadNum = theRCB.getBatchSize();
}
/* Open iterator on next partition */
/*
* Set emptyReadFactor to 1 if nothing has been read by the time
* the last partition is scanned.
*/
int emptyReadFactor =
(theRCB.getReadKB() == 0 &&
pidIdx == thePartitions.length - 1) ? 1 : 0;
thePartitionIter.initForNextPartition(thePartitions[pidIdx],
maxReadNum,
maxReadKB,
emptyReadFactor);
}
return true;
}
@Override
public FieldValueImpl nextLocal() {
if (thePartitionIter == null) {
return null;
}
/*
* Note that the various size limit checks performed by hasNext,
* which is called by next, are not needed here because this local
* version does not advance to another iterator.
*/
return thePartitionIter.nextLocal();
}
@Override
public FieldValueImpl next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
return thePartitionIter.next();
}
@Override
public void close() {
if (thePartitionIter != null) {
thePartitionIter.close();
thePartitionIter = null;
}
}
@Override
public boolean isClosed() {
return thePartitionIter == null || thePartitionIter.isClosed();
}
@Override
public Throwable getCloseException() {
return (thePartitionIter != null ?
thePartitionIter.getCloseException() :
null);
}
@Override
public List<DetailedMetrics> getPartitionMetrics() {
return Collections.emptyList();
}
@Override
public List<DetailedMetrics> getShardMetrics() {
return Collections.emptyList();
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
}
/**
* An iterator that scans all shards sequentially.
*/
private class SequentialShardsIterator
implements AsyncTableIterator<FieldValueImpl> {
private final RuntimeControlBlock theRCB;
private final RepGroupId[] theShards;
/* The current shard scan iterator */
private AbstractScanIterator theShardIter;
SequentialShardsIterator(RuntimeControlBlock rcb) {
theRCB = rcb;
Set<RepGroupId> gpIds =
theRCB.getStore().getTopology().getRepGroupIds();
theShards = gpIds.toArray(new RepGroupId[gpIds.size()]);
int shardIdx = theRCB.getShardIdx();
if (shardIdx < 0 || shardIdx >= theShards.length) {
throw new IllegalArgumentException(
"Invalid shard id in continuation key: " +
shardIdx);
}
/* Set emptyReadFactor to 1 for single shard scan. */
int emptyReadFactor = (theShards.length == 1) ? 1 : 0;
theShardIter = new AbstractScanIterator(theRCB,
null, /*partition id*/
theShards[shardIdx],
theRCB.getBatchSize(),
theRCB.getMaxReadKB(),
emptyReadFactor);
}
@Override
public boolean hasNext() {
if (theShardIter == null) {
return false;
}
while (!theShardIter.hasNext()) {
if (theRCB.getReachedLimit()) {
theRCB.createContinuationKey();
return false;
}
int shardIdx = theRCB.incShardIdx();
if (shardIdx == theShards.length) {
/* Return false if there are no more shards to scan */
theShardIter.close();
theShardIter = null;
theRCB.setContinuationKey(null);
return false;
}
/*
* Calculate the read size limit for scanning the next
* shard; return false if the limit has been reached.
*/
int maxReadKB = 0;
if (theRCB.getMaxReadKB() > 0) {
maxReadKB = theRCB.getMaxReadKB() - theRCB.getReadKB();
if (maxReadKB <= 0) {
theRCB.createContinuationKey();
return false;
}
}
/*
* Calculate the batch size limit for scanning the next
* shard; return false if the limit has been reached.
*/
int maxReadNum;
if (theRCB.getUseBatchSizeAsLimit()) {
maxReadNum = theRCB.getBatchSize() - theRCB.getResultSize();
if (maxReadNum <= 0) {
theRCB.createContinuationKey();
return false;
}
} else {
maxReadNum = theRCB.getBatchSize();
}
/* Open iterator on next shard */
/*
* Set emptyReadFactor to 1 if nothing has been read by the time
* the last shard is scanned.
*/
int emptyReadFactor =
(theRCB.getReadKB() == 0 &&
shardIdx == theShards.length - 1) ? 1 : 0;
theShardIter.initForNextShard(theShards[shardIdx],
maxReadNum,
maxReadKB,
emptyReadFactor);
}
return true;
}
@Override
public FieldValueImpl next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
return theShardIter.next();
}
@Override
public FieldValueImpl nextLocal() {
if (theShardIter == null) {
return null;
}
return theShardIter.nextLocal();
}
@Override
public void close() {
if (theShardIter != null) {
theShardIter.close();
theShardIter = null;
}
}
@Override
public boolean isClosed() {
return theShardIter == null || theShardIter.isClosed();
}
@Override
public Throwable getCloseException() {
return (theShardIter != null ?
theShardIter.getCloseException() :
null);
}
@Override
public List<DetailedMetrics> getPartitionMetrics() {
return Collections.emptyList();
}
@Override
public List<DetailedMetrics> getShardMetrics() {
return Collections.emptyList();
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
}
/**
* Implements iterative table scan in a single partition/shard. Used by
* the SequentialPartitionsIterator and SequentialShardsIterator.
*
* Note: No synchronization is needed for async mode, because there can
* be only a single pending remote request in the cases where an
* AbstractScanIterator is used.
*/
private class AbstractScanIterator implements
AsyncTableIterator<FieldValueImpl> {
private final RuntimeControlBlock theRCB;
private PartitionId thePid;
private RepGroupId theGroupId;
private int theMaxReadNum;
private int theMaxReadKB;
private int theEmptyReadFactor;
private Iterator<FieldValueImpl> theResultsIter;
private boolean theMoreRemoteResults;
private Throwable theAsyncCloseException;
private boolean theIsClosed;
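/*
* True while an async request issued from nextLocal() is still in
* flight; used to avoid issuing a second concurrent request.
*/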
private boolean theAsyncRequestExecuting;
public AbstractScanIterator(
RuntimeControlBlock rcb,
PartitionId pid,
RepGroupId gid,
int maxReadNum,
int maxReadKB,
int emptyReadFactor) {
theRCB = rcb;
thePid = pid;
theGroupId = gid;
theMaxReadNum = maxReadNum;
theMaxReadKB = maxReadKB;
theMoreRemoteResults = true;
theResultsIter = null;
theEmptyReadFactor = emptyReadFactor;
}
void initForNextPartition(
PartitionId pid,
int maxReadNum,
int maxReadKB,
int emptyReadFactor) {
initForNextScan(pid, null, maxReadNum, maxReadKB, emptyReadFactor);
}
void initForNextShard(
RepGroupId gid,
int maxReadNum,
int maxReadKB,
int emptyReadFactor) {
initForNextScan(null, gid, maxReadNum, maxReadKB, emptyReadFactor);
}
private void initForNextScan(
PartitionId pid,
RepGroupId gid,
int maxReadNum,
int maxReadKB,
int emptyReadFactor) {
thePid = pid;
theGroupId = gid;
theRCB.getResumeInfo().reset();
theMaxReadNum = maxReadNum;
theMaxReadKB = maxReadKB;
theMoreRemoteResults = true;
theResultsIter = null;
theEmptyReadFactor = emptyReadFactor;
}
/* Create request for TableQuery operation */
Request createRequest() {
TableQuery op = new TableQuery(
theDistributionKind,
theInputType,
theMayReturnNULL,
ReceiveIter.this,
theRCB.getExternalVars(),
theNumIters,
theNumRegs,
theTableId,
theRCB.getMathContext(),
theRCB.getTraceLevel(),
theMaxReadNum,
theMaxReadKB,
theRCB.getMaxReadKB(),
theRCB.getResumeInfo(),
theEmptyReadFactor);
final Consistency consistency = theRCB.getConsistency();
final Durability durability = theRCB.getDurability();
final long timeout = theRCB.getTimeout();
final TimeUnit timeUnit = theRCB.getTimeUnit();
final KVStoreImpl store = theRCB.getStore();
if (thePid != null) {
if (theIsUpdate) {
return store.makeWriteRequest(op, thePid, durability,
timeout, timeUnit, null);
}
return store.makeReadRequest(op, thePid, consistency,
timeout, timeUnit, null);
}
return store.makeReadRequest(op, theGroupId, consistency,
timeout, timeUnit, null);
}
@Override
public boolean hasNext() {
if (theResultsIter != null && theResultsIter.hasNext()) {
return true;
}
theResultsIter = null;
/*
* Stop fetching the next batch if there are no more elements to
* fetch or the size or number limit has been reached.
*/
if (!theMoreRemoteResults || theRCB.getReachedLimit()) {
return false;
}
Request req = createRequest();
KVStoreImpl store = theRCB.getStore();
QueryResult result = (QueryResult)store.executeRequest(req);
return processResults(result);
}
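/*
* Consumes one batch of results: refreshes the resume info, tallies the
* read/write cost in the RCB, records whether a size or number limit was
* reached, and exposes the batch through theResultsIter. Returns true if
* the batch contains at least one result.
*/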
private boolean processResults(QueryResult result) {
final List<FieldValueImpl> results = result.getQueryResults();
theMoreRemoteResults = result.hasMoreElements();
theRCB.getResumeInfo().refresh(result.getResumeInfo());
theRCB.tallyReadKB(result.getReadKB());
theRCB.tallyWriteKB(result.getWriteKB());
if (theRCB.getUseBatchSizeAsLimit()) {
theRCB.tallyResultSize(results.size());
}
if (results.isEmpty()) {
assert(result.getExceededSizeLimit() || !theMoreRemoteResults);
if (result.getExceededSizeLimit()) {
theRCB.setReachedLimit(true);
}
return false;
}
theResultsIter = results.iterator();
if (theMoreRemoteResults) {
/*
* There are still more elements to fetch.
*
* 1. If a size limit was specified and it has been reached,
* set the reached-limit flag in the RCB.
* 2. Otherwise, if a number limit was specified, set the flag
* as well, because "theMoreRemoteResults is true" implies
* that "batchSize" records have been returned and there
* are still more.
* 3. If neither 1 nor 2 applies, compute the remaining size
* limit for the next batch.
*/
boolean reachSizeLimit =
(result.getExceededSizeLimit() ||
(theMaxReadKB > 0 && theMaxReadKB == result.getReadKB()));
if (reachSizeLimit || theRCB.getUseBatchSizeAsLimit()) {
theRCB.setReachedLimit(true);
} else {
/* Compute the remaining maxReadKB for the next fetch */
if (theMaxReadKB > 0) {
theMaxReadKB -= result.getReadKB();
}
}
}
return true;
}
@Override
public FieldValueImpl nextLocal() {
/*
* This method must be called without the lock held to avoid lock
* problems that could arise if notifyNext were called in the
* current thread and then called back into this class.
*/
if (Thread.holdsLock(this)) {
throw new IllegalStateException(
"nextLocal called with lock held");
}
/* Return next */
if (theResultsIter != null && theResultsIter.hasNext()) {
return theResultsIter.next();
}
/* Throw any close exception */
if (theAsyncCloseException instanceof RuntimeException) {
throw (RuntimeException) theAsyncCloseException;
}
if (theAsyncCloseException instanceof Error) {
throw (Error) theAsyncCloseException;
}
if (theAsyncCloseException != null) {
throw new IllegalStateException(
"Unexpected exception from async iteration: " +
theAsyncCloseException,
theAsyncCloseException);
}
if (isClosed()) {
return null;
}
/* Initiate a request if one isn't already underway */
if (theAsyncRequestExecuting) {
return null;
}
Request request = createRequest();
theAsyncRequestExecuting = true;
theRCB.getStore().executeRequest(
request,
new ResultHandler<Result>() {
@Override
public void onResult(Result r, Throwable e) {
theAsyncRequestExecuting = false;
handleExecuteResult(r, e);
}
});
return null;
}
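/* Completion callback for the async request issued by nextLocal(). */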
private void handleExecuteResult(Result r, Throwable e) {
assert !Thread.holdsLock(this);
if (r != null) {
processResults((QueryResult) r);
} else {
theAsyncCloseException = e;
close();
}
theAsyncIterHandleNotifier.notifyNext();
}
@Override
public FieldValueImpl next() {
return theResultsIter.next();
}
@Override
public void close() {
theResultsIter = null;
theIsClosed = true;
}
@Override
public boolean isClosed() {
if (theIsClosed) {
return true;
}
if (theResultsIter != null && theResultsIter.hasNext()) {
return false;
}
if (!theMoreRemoteResults) {
close();
return true;
}
return false;
}
@Override
public synchronized Throwable getCloseException() {
return theAsyncCloseException;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
@Override
public List<DetailedMetrics> getPartitionMetrics() {
return Collections.emptyList();
}
@Override
public List<DetailedMetrics> getShardMetrics() {
return Collections.emptyList();
}
}
}