All Downloads are FREE. Search and download functionalities are using the official Maven repository.

oracle.nosql.driver.query.ReceiveIter Maven / Gradle / Ivy

There is a newer version: 5.4.16
Show newest version
/*-
 * Copyright (c) 2011, 2020 Oracle and/or its affiliates.  All rights reserved.
 *
 * Licensed under the Universal Permissive License v 1.0 as shown at
 *  https://oss.oracle.com/licenses/upl/
 */

package oracle.nosql.driver.query;

import java.io.ByteArrayOutputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;

import oracle.nosql.driver.RetryableException;
import oracle.nosql.driver.ops.QueryRequest;
import oracle.nosql.driver.ops.QueryResult;
import oracle.nosql.driver.util.ByteInputStream;
import oracle.nosql.driver.util.SerializationUtil;
import oracle.nosql.driver.util.SizeOf;
import oracle.nosql.driver.values.BinaryValue;
import oracle.nosql.driver.values.FieldValue;
import oracle.nosql.driver.values.MapValue;
import oracle.nosql.driver.values.NumberValue;

/**
 * ReceiveIter requests and receives results from the proxy. For sorting
 * queries, it performs a merge sort of the received results. It also
 * performs duplicate elimination for queries that require it (note:
 * a query can do both sorting and dup elimination).
 */
public class ReceiveIter extends PlanIter {

    /**
     * Describes where a query has to be sent in order to be executed.
     *
     * The declaration order of the constants is significant: the
     * ReceiveIter constructor reads a short from the serialized plan and
     * uses it as an index into values(), so constants must not be
     * reordered or inserted in the middle.
     */
    public enum DistributionKind {
        /*
         * The query predicates specify a complete shard key, and as a result,
         * the query goes to a single partition and uses the primary index for
         * its execution.
         */
        SINGLE_PARTITION,
        /*
         * The query uses the primary index for its execution, but does not
         * specify a complete shard key. As a result, it must be sent to all
         * partitions.
         */
        ALL_PARTITIONS,
        /*
         * The query uses a secondary index for its execution. As a result,
         * it must be sent to all shards.
         */
        ALL_SHARDS
    }

    /**
     * The runtime state of a ReceiveIter: the scanner(s) used to pull
     * results from the proxy, the bookkeeping needed for the two sort phases
     * of all-partition queries, the set used for duplicate elimination, and
     * the memory-consumption counters.
     */
    private static class ReceiveIterState extends PlanIterState {

        /*
         * It stores the set of shard ids. Needed for sorting all-shard
         * queries only.
         */
        TopologyInfo theTopoInfo;

        /*
         * The remote scanner used for non-sorting queries.
         */
        RemoteScanner theScanner;

        /*
         * The remote scanners used for sorting queries. For all-shard queries
         * there is one RemoteScanner per shard. For all-partition queries
         * a RemoteScanner is created for each partition that has at least one
         * result. See the javadoc of PartitionUnionIter in kvstore for a
         * description of how all-partition queries are executed in the 3-tier
         * architecture.
         */
        TreeSet<RemoteScanner> theSortedScanners;

        /*
         * Used for sorting all-partition queries. It specifies whether the
         * query execution is in sort phase 1 (see javadoc of PartitionUnionIter
         * in kvstore).
         */
        boolean theIsInSortPhase1 = true;

        /*
         * The continuation key to be used for the next batch request during
         * sort-phase-1 of a sorting, all-partition query.
         */
        byte[] theContinuationKey = null;

        /*
         * Hash set used for duplicate elimination. It stores the primary
         * keys (in binary format, as produced by createBinaryPrimKey()) of
         * all the results seen so far.
         */
        HashSet<Object> thePrimKeysSet;

        /*
         * The memory consumed by this ReceiveIter. Memory consumption is
         * counted for sorting all-partition queries and/or queries that do
         * duplicate elimination. We count the memory taken by results cached
         * in theSortedScanners and/or primary keys stored in thePrimKeysSet.
         */
        long theMemoryConsumption;

        /*
         * The memory consumed for duplicate elimination
         */
        long theDupElimMemory;

        /*
         * theTotalResultsSize and theTotalNumResults store the total size
         * and number of results fetched by this ReceiveIter so far. They
         * are used to compute the average result size, which is then used
         * to compute the max number of results to fetch from a partition
         * during a sort-phase-2 request for a sorting, all-partition query.
         */
        long theTotalResultsSize;

        long theTotalNumResults;

        /*
         * Creates the scanner(s) appropriate for the given iterator:
         * - sorting, all-partition query: an initially empty sorted set
         *   (scanners are added as phase 1 discovers partitions with results);
         * - sorting, all-shard query: one scanner per shard of the current
         *   topology;
         * - anything else: a single scanner that is not bound to a shard.
         */
        ReceiveIterState(RuntimeControlBlock rcb, ReceiveIter iter) {

            theTopoInfo = rcb.getTopologyInfo();

            if (iter.doesDupElim()) {
                thePrimKeysSet = new HashSet<Object>(1000);
            }

            if (iter.doesSort() &&
                iter.theDistributionKind == DistributionKind.ALL_PARTITIONS) {
                theSortedScanners = new TreeSet<RemoteScanner>();
            } else if (iter.doesSort() &&
                       iter.theDistributionKind == DistributionKind.ALL_SHARDS) {
                int numShards = theTopoInfo.numShards();
                theSortedScanners = new TreeSet<RemoteScanner>();
                for (int i = 0; i < numShards; ++i) {
                    theSortedScanners.add(
                        iter.new RemoteScanner(rcb, this, true,
                                               theTopoInfo.getShardId(i)));
                }
            } else {
                theScanner = iter.new RemoteScanner(rcb, this, false, -1);
            }
        }

        /* Marks the state as done and drops all cached keys and scanners. */
        @Override
        public void done() {
            super.done();
            clear();
        }

        /* Closes the state and releases the collections it owns. */
        @Override
        public void close() {
            super.close();
            thePrimKeysSet = null;
            theSortedScanners = null;
        }

        /* Empties (but keeps) the dup-elimination set and the scanner set. */
        void clear() {
            if (thePrimKeysSet != null) {
                thePrimKeysSet.clear();
            }

            if (theSortedScanners != null) {
                theSortedScanners.clear();
            }
        }
    }

    /*
     * The distribution kind of the query.
     */
    private final DistributionKind theDistributionKind;

    /*
     * Used for sorting queries. It specifies the names of the top-level
     * fields that contain the values on which to sort the received results.
     * It is null for non-sorting queries (see doesSort()).
     */
    private final String[] theSortFields;

    /*
     * The sort specs deserialized together with theSortFields. They are
     * passed, along with theSortFields, to Compare.sortResults() when two
     * cached results are compared.
     */
    private final SortSpec[] theSortSpecs;

    /*
     * Used for duplicate elimination. It specifies the names of the top-level
     * fields that contain the primary-key values within the received results.
     * It is null for queries that do not need duplicate elimination (see
     * doesDupElim()).
     */
    private final String[] thePrimKeyFields;

    /**
     * Deserializes a ReceiveIter from the given stream. After the state
     * handled by the superclass, the stream holds, in this order: the
     * ordinal of the distribution kind (a short), the sort field names, the
     * sort specs, and the primary-key field names.
     *
     * @param in the stream to read the serialized iterator from
     * @param serialVersion the serial version, forwarded to the superclass
     * @throws IOException if reading from the stream fails
     */
    public ReceiveIter(
        ByteInputStream in,
        short serialVersion) throws IOException {

        super(in, serialVersion);

        final DistributionKind[] allKinds = DistributionKind.values();
        theDistributionKind = allKinds[in.readShort()];

        theSortFields = SerializationUtil.readStringArray(in);
        theSortSpecs = readSortSpecs(in);
        thePrimKeyFields = SerializationUtil.readStringArray(in);
    }

    /**
     * @return the kind tag of this iterator, which is always
     * PlanIterKind.RECV
     */
    @Override
    public PlanIterKind getKind() {
        return PlanIterKind.RECV;
    }

    /*
     * Whether this iterator has to merge-sort the results it receives,
     * i.e. whether sort fields were present in the serialized plan.
     */
    boolean doesSort() {
        return theSortFields != null;
    }

    /*
     * Whether this iterator has to eliminate duplicate results, i.e.
     * whether primary-key fields were present in the serialized plan.
     */
    boolean doesDupElim() {
        return thePrimKeyFields != null;
    }

    /**
     * Creates the state of this iterator, registers it with the RCB under
     * theStatePos, and charges the state's initial memory consumption to
     * the RCB. The request is expected to be prepared and to have a driver.
     */
    @Override
    public void open(RuntimeControlBlock rcb) {

        final ReceiveIterState iterState = new ReceiveIterState(rcb, this);
        rcb.setState(theStatePos, iterState);
        rcb.incMemoryConsumption(iterState.theMemoryConsumption);

        final QueryRequest request = rcb.getRequest();
        assert request.isPrepared();
        assert request.hasDriver();
    }

    /**
     * Not supported for this iterator.
     *
     * @throws IllegalStateException always
     */
    @Override
    public void reset(RuntimeControlBlock rcb) {
        throw new IllegalStateException("Should never be called");
    }

    /**
     * Closes the state of this iterator, if there is one. It is a no-op
     * when no state is registered under theStatePos.
     */
    @Override
    public void close(RuntimeControlBlock rcb) {

        final ReceiveIterState iterState =
            (ReceiveIterState)rcb.getState(theStatePos);

        if (iterState != null) {
            iterState.close();
        }
    }

    /**
     * Produces the next result, if any, into theResultReg.
     *
     * @return true if a result was produced; false if the iterator is done
     * or no result could be produced during the current batch
     */
    @Override
    public boolean next(RuntimeControlBlock rcb) {

        final ReceiveIterState iterState =
            (ReceiveIterState)rcb.getState(theStatePos);

        if (iterState.isDone()) {
            if (rcb.getTraceLevel() >= 1) {
                rcb.trace("ReceiveIter.next() : done");
            }
            return false;
        }

        /* Sorting queries merge per-shard/partition streams. */
        return (doesSort() ?
                sortingNext(rcb, iterState) :
                simpleNext(rcb, iterState));
    }

    /*
     * Produces the next result of a non-sorting query. Results are pulled
     * from the single scanner of the state; results that checkDuplicate()
     * reports as duplicates are skipped.
     *
     * Returns true if a result was stored in theResultReg. Returns false
     * when the scanner has nothing to return; in that case the state is
     * marked done unless the batch limit was reached (more results may be
     * available in a subsequent batch).
     */
    private boolean simpleNext(
        RuntimeControlBlock rcb,
        ReceiveIterState state) {

        MapValue res;

        while ((res = state.theScanner.next()) != null) {

            if (rcb.getTraceLevel() >= 1) {
                rcb.trace("ReceiveIter.simpleNext() : got result :\n" + res);
            }

            if (checkDuplicate(rcb, state, res)) {
                continue;
            }

            rcb.setRegVal(theResultReg, res);
            return true;
        }

        if (rcb.getTraceLevel() >= 1) {
            rcb.trace("ReceiveIter.simpleNext() : no result. Reached limit = " +
                      rcb.reachedLimit());
        }

        if (!rcb.reachedLimit()) {
            state.done();
        }

        return false;
    }

    /*
     * Produces the next result of a sorting query by merging the cached
     * results of the scanners in theSortedScanners.
     *
     * While an all-partition query is still in sort phase 1, each call
     * executes one phase-1 request and returns false. Afterwards, the
     * first scanner of the sorted set is polled: if it has a cached result,
     * that result is returned (unless it is a duplicate); if it has none
     * but may have remote results, one remote fetch is executed and the
     * current batch is terminated.
     */
    private boolean sortingNext(
        RuntimeControlBlock rcb,
        ReceiveIterState state) {

        if (state.theIsInSortPhase1 &&
            theDistributionKind == DistributionKind.ALL_PARTITIONS) {

            initPartitionSort(rcb, state);
            return false;
        }

        for (;;) {
            final RemoteScanner scanner = state.theSortedScanners.pollFirst();

            /* No scanners left: the query is finished. */
            if (scanner == null) {
                state.done();
                return false;
            }

            final MapValue res = scanner.nextLocal();

            if (res == null) {

                /*
                 * Scanner had no cached results. If it cannot have remote
                 * results either, throw it away (by leaving it outside
                 * theSortedScanners) and continue with another scanner.
                 */
                if (scanner.isDone()) {
                    continue;
                }

                /*
                 * Fetch more results. On a retryable failure, put the
                 * scanner back so that the fetch can be attempted again.
                 */
                try {
                    scanner.fetch();
                } catch (RetryableException e) {
                    state.theSortedScanners.add(scanner);
                    throw e;
                }

                /*
                 * We executed a remote fetch. If we got any result or the
                 * scanner may have more remote results, put the scanner
                 * back into theSortedScanners. Otherwise, throw it away.
                 */
                if (scanner.isDone()) {
                    if (rcb.getTraceLevel() >= 1) {
                        rcb.trace("ReceiveIter.sortingNext() : done with " +
                                  "partition/shard " + scanner.theShardOrPartId);
                    }
                } else {
                    state.theSortedScanners.add(scanner);
                }

                handleTopologyChange(rcb, state);

                /*
                 * For simplicity, we don't want to allow the possibility of
                 * another remote fetch during the same batch, so whether or
                 * not the batch limit was reached during the above fetch, we
                 * set limit flag to true and return false, thus terminating
                 * the current batch.
                 */
                rcb.setReachedLimit(true);
                return false;
            }

            if (rcb.getTraceLevel() >= 1) {
                rcb.trace("ReceiveIter.sortingNext() : got result :\n" + res);
            }

            res.convertEmptyToNull();
            rcb.setRegVal(theResultReg, res);

            /* Re-insert the scanner so it is re-ordered by its new head. */
            if (scanner.isDone()) {
                if (rcb.getTraceLevel() >= 1) {
                    rcb.trace("ReceiveIter.sortingNext() : done with " +
                              "partition/shard " + scanner.theShardOrPartId);
                }
            } else {
                state.theSortedScanners.add(scanner);
            }

            if (!checkDuplicate(rcb, state, res)) {
                return true;
            }
        }
    }

    /*
     * Executes one request of sort phase 1 for a sorting, all-partition
     * query. Make sure we receive (and cache) at least one result per
     * partition (except from partitions that do not contain any results at
     * all).
     *
     * The results of the response are laid out partition by partition: the
     * first getNumPartitionResults(0) results belong to the first reported
     * partition, and so on. For every reported partition a RemoteScanner is
     * created, seeded with that partition's results and continuation key,
     * and added to theSortedScanners.
     *
     * The method always ends the current batch by setting the reached-limit
     * flag of the RCB.
     */
    private void initPartitionSort(
        RuntimeControlBlock rcb,
        ReceiveIterState state) {

        assert(state.theIsInSortPhase1);

        /*
         * Create and execute a request to get at least one result from
         * the partition whose id is specified in theContinuationKey and
         * from any other partition that is co-located with that partition.
         */
        QueryRequest req = rcb.getRequest().copyInternal();
        req.setContKey(state.theContinuationKey);

        if (rcb.getTraceLevel() >= 1) {
            rcb.trace("ReceiveIter : executing remote request for " +
                      "sorting phase 1");
        }

        QueryResult result = (QueryResult)rcb.getClient().execute(req);

        int numPids = result.getNumPids();
        List<MapValue> results = result.getResultsInternal();
        state.theIsInSortPhase1 = result.isInPhase1();
        state.theContinuationKey = result.getContinuationKey();

        rcb.tallyReadKB(result.getReadKB());
        rcb.tallyReadUnits(result.getReadUnits());
        rcb.tallyWriteKB(result.getWriteKB());

        if (rcb.getTraceLevel() >= 1) {
            rcb.trace("ReceiveIter.initPartitionSort() : got result.\n" +
                      "reached limit = " + result.reachedLimit() +
                      " in phase 1 = " + result.isInPhase1());
        }

        /*
         * For each partition P that was accessed during the execution of
         * the above QueryRequest, collect the results for P and create a
         * scanner that will be used during phase 2 to collect further
         * results from P only.
         */
        int resIdx = 0;

        for (int p = 0; p < numPids; ++p) {

            int pid = result.getPid(p);
            int numResults = result.getNumPartitionResults(p);
            byte[] contKey = result.getPartitionContKey(p);
            assert(numResults > 0);

            List<MapValue> partitionResults =
                new ArrayList<MapValue>(numResults);

            for (int j = 0; j < numResults; ++j) {

                MapValue res = results.get(resIdx);
                partitionResults.add(res);

                if (rcb.getTraceLevel() >= 1) {
                    rcb.trace("Added result for partition " + pid +
                              ":\n" + res);
                }
                ++resIdx;
            }

            RemoteScanner scanner =
                this.new RemoteScanner(rcb, state, false, pid);
            scanner.addResults(partitionResults, contKey);
            state.theSortedScanners.add(scanner);
        }

        if (rcb.getTraceLevel() >= 1) {
            rcb.trace("ReceiveIter.initPartitionSort() : " +
                      " memory consumption = " + state.theMemoryConsumption);
        }

        /*
         * For simplicity, if the size limit was not reached during this
         * batch of sort phase 1, we don't start a new batch. We let the
         * app do it. Furthermore, this means that each remote fetch will
         * be done with the max amount of read limit, which will reduce the
         * total number of fetches.
         */
        rcb.setReachedLimit(true);
    }

    /*
     * Reconciles the scanners of a sorting, all-shard query with the
     * current topology. It is a no-op for all-partition queries and when
     * the topology known to the RCB equals the one stored in the state.
     *
     * Otherwise, a new shard-bound scanner is added for every shard id that
     * appears in the new topology but not in the stored one, the scanner of
     * every shard that has disappeared is removed from theSortedScanners
     * (if it is still there), and the new topology is stored in the state.
     */
    private void handleTopologyChange(
        RuntimeControlBlock rcb,
        ReceiveIterState state) {

        TopologyInfo newTopoInfo = rcb.getTopologyInfo();

        if (theDistributionKind == DistributionKind.ALL_PARTITIONS ||
            newTopoInfo.equals(state.theTopoInfo)) {
            return;
        }

        int[] newShards = newTopoInfo.getShardIds();
        int[] currShards = state.theTopoInfo.getShardIds();

        /*
         * stillExists[j] records that currShards[j] is also present in the
         * new topology. A separate array is used so that the array returned
         * by getShardIds() is never modified; it may be owned by the
         * TopologyInfo.
         */
        boolean[] stillExists = new boolean[currShards.length];

        for (int i = 0; i < newShards.length; ++i) {

            boolean known = false;

            for (int j = 0; j < currShards.length; ++j) {
                if (!stillExists[j] && newShards[i] == currShards[j]) {
                    stillExists[j] = true;
                    known = true;
                    break;
                }
            }

            if (known) {
                continue;
            }

            /* We have a new shard */
            state.theSortedScanners.add(
                this.new RemoteScanner(rcb, state, true, newShards[i]));
        }

        for (int j = 0; j < currShards.length; ++j) {

            if (stillExists[j]) {
                continue;
            }

            /*
             * This shard does not exist any more. Remove its scanner through
             * the iterator: TreeSet.remove(Object) locates the element via
             * compareTo(), and RemoteScanner.compareTo() is not guaranteed
             * to return 0 for the scanner being removed.
             */
            Iterator<RemoteScanner> iter = state.theSortedScanners.iterator();

            while (iter.hasNext()) {
                if (iter.next().theShardOrPartId == currShards[j]) {
                    iter.remove();
                    break;
                }
            }
        }

        state.theTopoInfo = newTopoInfo;
    }

    /*
     * Checks whether the given result has already been seen. The check is
     * done on the binary form of the result's primary key, which is added
     * to thePrimKeysSet if it is not there already. When a new key is added,
     * its size plus the hash-set entry overhead is charged to the memory
     * counters of the state and of the RCB.
     *
     * Returns true if the result is a duplicate; false if it is not, or if
     * this iterator does no duplicate elimination.
     */
    private boolean checkDuplicate(
        RuntimeControlBlock rcb,
        ReceiveIterState state,
        MapValue res) {

        if (thePrimKeyFields == null) {
            return false;
        }

        final BinaryValue key = createBinaryPrimKey(res);

        if (state.thePrimKeysSet.add(key)) {
            /* First time we see this key: account for the memory it takes. */
            final long keyMemory =
                (key.sizeof() + SizeOf.HASHSET_ENTRY_OVERHEAD);
            state.theMemoryConsumption += keyMemory;
            state.theDupElimMemory += keyMemory;
            rcb.incMemoryConsumption(keyMemory);
            return false;
        }

        if (rcb.getTraceLevel() >= 1) {
            rcb.trace("ReceiveIter.checkDuplicate() : result was duplicate");
        }
        return true;
    }

    /*
     * Builds the binary form of the primary key of the given result, by
     * serializing, in the order given by thePrimKeyFields, the values of the
     * primary-key fields of the result.
     *
     * An IOException raised during serialization is reported as a
     * QueryStateException carrying the message of the IOException.
     * NOTE(review): the IOException itself is not attached as the cause;
     * consider chaining it if QueryStateException supports that.
     */
    private BinaryValue createBinaryPrimKey(MapValue result) {

        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        final DataOutput sink = new DataOutputStream(buffer);

        try {
            int col = 0;
            for (String fieldName : thePrimKeyFields) {
                writeValue(sink, result.get(fieldName), col);
                ++col;
            }
        } catch (IOException e) {
            throw new QueryStateException(
                "Failed to create binary prim key due to IOException:\n" +
                e.getMessage());
        }

        return new BinaryValue(buffer.toByteArray());
    }

    /*
     * Serializes a primary-key column value to the given output. Integers
     * and longs are written in packed form, doubles as raw doubles, numbers
     * as a byte array, and strings and timestamps via their string form.
     *
     * The column position i is used only in the error message of the
     * QueryStateException that is thrown for any other type.
     */
    private void writeValue(DataOutput out, FieldValue val, int i)
        throws IOException {

        switch (val.getType()) {
        case INTEGER:
            SerializationUtil.writePackedInt(out, val.getInt());
            return;
        case LONG:
            SerializationUtil.writePackedLong(out, val.getLong());
            return;
        case DOUBLE:
            out.writeDouble(val.getDouble());
            return;
        case NUMBER:
            SerializationUtil.writeByteArray(
                out, ((NumberValue)val).getBytes());
            return;
        case STRING:
        case TIMESTAMP:
            /* Both are serialized through their string representation. */
            SerializationUtil.writeString(out, val.getString());
            return;
        default:
            throw new QueryStateException(
                "Unexpected type for primary key column : " +
                val.getType() + ", at result column " + i);
        }
    }

    /**
     * For all-shard, ordering queries, there is one RemoteScanner per shard.
     * In this case, each RemoteScanner will fetch results only from the shard
     * specified by theShardOrPartId.
     *
     * For all-partition, ordering queries, there is one RemoteScanner for
     * each partition that has at least one query result. In this case, each
     * RemoteScanner will fetch results only from the partition specified by
     * theShardOrPartId.
     *
     * For non-ordering queries, there is a single RemoteScanner. It will
     * fetch as many as possible results starting from the shard or partition
     * specified in theContinuationKey (so it may fetch results from more than
     * one shard/partition).
     *
     * Scanners are ordered (see compareTo()) so that, inside a sorted set,
     * scanners without cached results come first, followed by the scanners
     * with cached results in the sort order of their next result.
     */
    private class RemoteScanner implements Comparable<RemoteScanner> {

        final RuntimeControlBlock theRCB;

        final ReceiveIterState theState;

        /* Whether theShardOrPartId is a shard id (sent with each request) */
        boolean theIsForShard;

        int theShardOrPartId = -1;

        /* The results of the last batch that have not been consumed yet */
        List<MapValue> theResults;

        /* The memory charged for theResults by addMemoryConsumption() */
        long theResultsSize;

        /* Position within theResults of the next result to return */
        int theNextResultPos;

        /* The continuation key to send with the next remote request */
        byte[] theContinuationKey;

        /* False once a response came back without a continuation key */
        boolean theMoreRemoteResults;

        public RemoteScanner(
            RuntimeControlBlock rcb,
            ReceiveIterState state,
            boolean isForShard,
            int spid) {

            theRCB = rcb;
            theState = state;
            theMoreRemoteResults = true;
            theIsForShard = isForShard;
            theShardOrPartId = spid;
        }

        /* True if there are neither cached nor remote results left. */
        boolean isDone() {
            return (!theMoreRemoteResults && !hasLocalResults());
        }

        /* True if at least one cached result has not been consumed yet. */
        boolean hasLocalResults() {
            return (theResults != null && theNextResultPos < theResults.size());
        }

        /*
         * Seeds the scanner with an already received batch of results and
         * the continuation key to use for the next fetch (null if there are
         * no more remote results), and accounts for the memory of the batch.
         */
        void addResults(List<MapValue> results, byte[] contKey) {
            theResults = results;
            theContinuationKey = contKey;
            theMoreRemoteResults = (contKey != null);
            addMemoryConsumption();
        }

        /*
         * Returns the next cached result, or null if there is none. It never
         * goes to the proxy. The slot of the returned result is cleared so
         * that the scanner does not keep the result reachable.
         */
        MapValue nextLocal() {

            if (hasLocalResults()) {
                MapValue res = theResults.get(theNextResultPos);
                theResults.set(theNextResultPos, null);
                ++theNextResultPos;
                return res;
            }

            return null;
        }

        /*
         * Returns the next result, fetching a new batch from the proxy if
         * the cached batch is exhausted. Returns null if there are no more
         * remote results, if the batch limit has been reached, or if the
         * fetched batch is empty.
         */
        MapValue next() {

            if (hasLocalResults()) {
                return theResults.get(theNextResultPos++);
            }

            theResults = null;
            theNextResultPos = 0;

            if (!theMoreRemoteResults || theRCB.reachedLimit()) {
                return null;
            }

            fetch();

            if (theResults.isEmpty()) {
                return null;
            }

            return theResults.get(theNextResultPos++);
        }

        /*
         * Executes a remote request and replaces the cached batch with the
         * results of the response. The consumption counters of the RCB are
         * updated, and the reached-limit flag of the RCB is set if the
         * response says so or if the query is a sorting one.
         */
        void fetch() {

            QueryRequest req = theRCB.getRequest().copyInternal();
            req.setContKey(theContinuationKey);
            req.setShardId(theIsForShard ? theShardOrPartId : -1);

            if (doesSort() && !theIsForShard) {
                /*
                 * Sorting, all-partition query. Release the memory of the
                 * previous batch, and limit the number of results to fetch
                 * so that the memory still available (after duplicate
                 * elimination) is shared among the scanners, using the
                 * average result size seen so far.
                 *
                 * NOTE(review): numResults becomes 0 or negative if the
                 * memory used for duplicate elimination reaches the max
                 * memory consumption; confirm how setLimit() treats such
                 * values.
                 */
                theState.theMemoryConsumption -= theResultsSize;
                theRCB.decMemoryConsumption(theResultsSize);
                long numResults =
                    ((req.getMaxMemoryConsumption() - theState.theDupElimMemory) /
                     ((theState.theSortedScanners.size() + 1) *
                      (theState.theTotalResultsSize /
                       theState.theTotalNumResults)));
                if (numResults > 2048) {
                    numResults = 2048;
                }
                req.setLimit((int)numResults);
            }

            if (theRCB.getTraceLevel() >= 1) {
                theRCB.trace("RemoteScanner : executing remote request. spid = " +
                             theShardOrPartId);
                assert(req.hasDriver());
            }

            QueryResult result = (QueryResult)theRCB.getClient().execute(req);

            theResults = result.getResultsInternal();
            theContinuationKey = result.getContinuationKey();
            theNextResultPos = 0;
            theMoreRemoteResults = (theContinuationKey != null);

            theRCB.tallyReadKB(result.getReadKB());
            theRCB.tallyReadUnits(result.getReadUnits());
            theRCB.tallyWriteKB(result.getWriteKB());

            assert(result.reachedLimit() || !theMoreRemoteResults);

            /*
             * For simplicity, if the query is a sorting one, we consider
             * the current batch done as soon as we get the response back
             * from the proxy, even if the batch limit was not reached there.
             */
            if (result.reachedLimit() || doesSort()) {
                theRCB.setReachedLimit(true);
            }

            if (doesSort() && !theIsForShard) {
                addMemoryConsumption();
            }

            if (theRCB.getTraceLevel() >= 1) {
                theRCB.trace("RemoteScanner : got " + theResults.size() +
                             " remote results. More remote results = " +
                             theMoreRemoteResults + " reached limit = " +
                             result.reachedLimit() + " read KB = " +
                             result.getReadKB() + " read Units = " +
                             result.getReadUnits() + " write KB = " +
                             result.getWriteKB() + " memory consumption = " +
                             theState.theMemoryConsumption);
            }
        }

        /*
         * Computes the size of the cached batch (the sizes of the results
         * plus one object reference per result), and adds it to the totals
         * kept in the state and to the memory consumption of the RCB.
         */
        private void addMemoryConsumption() {

            theResultsSize = 0;
            for (MapValue res : theResults) {
                theResultsSize += res.sizeof();
            }

            theResultsSize += theResults.size() * SizeOf.OBJECT_REF_OVERHEAD;
            theState.theTotalNumResults += theResults.size();
            theState.theTotalResultsSize += theResultsSize;
            theState.theMemoryConsumption += theResultsSize;
            theRCB.incMemoryConsumption(theResultsSize);
        }

        /*
         * Scanners without cached results sort before scanners that have
         * some, so that they are polled (and refilled) first. Two scanners
         * with cached results are ordered by their next results; ties, as
         * well as two scanners without cached results, are ordered by
         * shard/partition id.
         */
        @Override
        public int compareTo(RemoteScanner other) {

            /*
             * A scanner must compare equal to itself; otherwise sorted-set
             * operations that search by comparison (remove, contains) can
             * never find it.
             */
            if (this == other) {
                return 0;
            }

            if (!hasLocalResults()) {
                if (!other.hasLocalResults()) {
                    return (theShardOrPartId < other.theShardOrPartId ? -1 : 1);
                }
                return -1;
            }

            if (!other.hasLocalResults()) {
                return 1;
            }

            MapValue v1 = theResults.get(theNextResultPos);
            MapValue v2 = other.theResults.get(other.theNextResultPos);

            int comp =  Compare.sortResults(theRCB, v1, v2,
                                            theSortFields, theSortSpecs);

            if (comp == 0) {
                comp = (theShardOrPartId < other.theShardOrPartId ? -1 : 1);
            }

            return comp;
        }
    }

    /**
     * Appends the description of this iterator to the given buffer: the
     * distribution kind, followed by the comma-separated sort fields and
     * primary-key fields, when present. Each item is placed on its own
     * line, indented by the formatter, and terminated by a comma.
     */
    @Override
    protected void displayContent(StringBuilder sb, QueryFormatter formatter) {

        formatter.indent(sb);
        sb.append("DistributionKind : ");
        sb.append(theDistributionKind);
        sb.append(",\n");

        if (theSortFields != null) {
            formatter.indent(sb);
            sb.append("Sort Fields : ");
            String separator = "";
            for (String field : theSortFields) {
                sb.append(separator).append(field);
                separator = ", ";
            }
            sb.append(",\n");
        }

        if (thePrimKeyFields != null) {
            formatter.indent(sb);
            sb.append("Primary Key Fields : ");
            String separator = "";
            for (String field : thePrimKeyFields) {
                sb.append(separator).append(field);
                separator = ", ";
            }
            sb.append(",\n");
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy