
io.questdb.cairo.sql.async.PageFrameSequence

/*******************************************************************************
 *     ___                  _   ____  ____
 *    / _ \ _   _  ___  ___| |_|  _ \| __ )
 *   | | | | | | |/ _ \/ __| __| | | |  _ \
 *   | |_| | |_| |  __/\__ \ |_| |_| | |_) |
 *    \__\_\\__,_|\___||___/\__|____/|____/
 *
 *  Copyright (c) 2014-2019 Appsicle
 *  Copyright (c) 2019-2023 QuestDB
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

package io.questdb.cairo.sql.async;

import io.questdb.MessageBus;
import io.questdb.cairo.CairoConfiguration;
import io.questdb.cairo.sql.*;
import io.questdb.griffin.SqlException;
import io.questdb.griffin.SqlExecutionContext;
import io.questdb.log.Log;
import io.questdb.log.LogFactory;
import io.questdb.mp.MCSequence;
import io.questdb.mp.MPSequence;
import io.questdb.mp.RingQueue;
import io.questdb.mp.SCSequence;
import io.questdb.std.*;
import io.questdb.std.datetime.millitime.MillisecondClock;

import java.io.Closeable;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

public class PageFrameSequence<T extends StatefulAtom & Closeable> implements Closeable {

    private static final AtomicLong ID_SEQ = new AtomicLong();
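    // Reserved cursor value returned by next() when the frame is reduced via the local task rather than the queue.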
    private static final long LOCAL_TASK_CURSOR = Long.MAX_VALUE;
    private static final Log LOG = LogFactory.getLog(PageFrameSequence.class);
    private final MillisecondClock clock;
    private final LongList frameRowCounts = new LongList();
    private final WeakClosableObjectPool<PageFrameReduceTask> localTaskPool;
    private final MessageBus messageBus;
    private final PageAddressCache pageAddressCache;
    private final AtomicInteger reduceCounter = new AtomicInteger(0);
    private final PageFrameReducer reducer;
    private final AtomicBoolean valid = new AtomicBoolean(true);
    public volatile boolean done;
    private T atom;
    private SqlExecutionCircuitBreaker circuitBreaker;
    private int circuitBreakerFd;
    private SCSequence collectSubSeq;
    private int collectedFrameIndex = -1;
    private int dispatchStartFrameIndex;
    private int frameCount;
    private long id;
    // Local reduce task used when there are no slots in the queue to dispatch tasks.
    private PageFrameReduceTask localTask;
    private PageFrameCursor pageFrameCursor;
    private boolean readyToDispatch;
    private PageAddressCacheRecord record;
    private RingQueue<PageFrameReduceTask> reduceQueue;
    private int shard;
    private SqlExecutionContext sqlExecutionContext;
    private long startTime;

    public PageFrameSequence(
            CairoConfiguration configuration,
            MessageBus messageBus,
            PageFrameReducer reducer,
            WeakClosableObjectPool<PageFrameReduceTask> localTaskPool
    ) {
        this.pageAddressCache = new PageAddressCache(configuration);
        this.messageBus = messageBus;
        this.reducer = reducer;
        this.clock = configuration.getMillisecondClock();
        this.localTaskPool = localTaskPool;
    }

    /**
     * Waits for frame sequence completion and fetches the remaining pieces of the
     * frame sequence from the queues. This method is not thread-safe.
     */
    public void await() {
        LOG.debug()
                .$("awaiting completion [shard=").$(shard)
                .$(", id=").$(id)
                .$(", frameCount=").$(frameCount)
                .I$();

        final MCSequence pageFrameReduceSubSeq = messageBus.getPageFrameReduceSubSeq(shard);
        while (!done) {
            if (dispatchStartFrameIndex == collectedFrameIndex + 1) {
                // We know that all frames were collected. We're almost done.
                if (!done) {
                    // Looks like not all the frames were dispatched, so no one reached the very last frame and
                    // reset the sequence via calling PageFrameReduceTask#collected(). Let's do it ourselves.
                    reset();
                }
                break;
            }

            // We were asked to steal work from the reduce queue and beyond, as much as we can.
            boolean nothingProcessed = true;
            try {
                nothingProcessed = PageFrameReduceJob.consumeQueue(reduceQueue, pageFrameReduceSubSeq, record, circuitBreaker, this);
            } catch (Throwable e) {
                LOG.error()
                        .$("await error [id=").$(id)
                        .$(", ex=").$(e)
                        .I$();
            }

            if (nothingProcessed) {
                long cursor = collectSubSeq.next();
                if (cursor > -1) {
                    // Discard collected items.
                    final PageFrameReduceTask task = reduceQueue.get(cursor);
                    if (task.getFrameSequence() == this) {
                        assert id == task.getFrameSequenceId() : "ids mismatch: " + id + ", " + task.getFrameSequenceId();
                        collectedFrameIndex = task.getFrameIndex();
                        task.collected(true);
                    }
                    collectSubSeq.done(cursor);
                } else {
                    Os.pause();
                }
            }
        }

        // It could be the case that one of the workers reduced a page frame, then marked the task as done,
        // but hasn't incremented the reduce counter yet. In this case, we wait for the desired counter value.
        while (reduceCounter.get() != dispatchStartFrameIndex) {
            Os.pause();
        }
    }

    public void cancel() {
        this.valid.compareAndSet(true, false);
    }

    public void clear() {
        // prepare different frame sequence using the same object instance
        frameCount = 0;
        dispatchStartFrameIndex = 0;
        collectedFrameIndex = -1;
        readyToDispatch = false;
        pageAddressCache.clear();
        pageFrameCursor = Misc.freeIfCloseable(pageFrameCursor);
        // collect sequence may not be set here when
        // factory is closed without using cursor
        if (collectSubSeq != null) {
            messageBus.getPageFrameCollectFanOut(shard).remove(collectSubSeq);
            LOG.debug().$("removed [seq=").$(collectSubSeq).I$();
            collectSubSeq.clear();
        }
        if (localTask != null) {
            localTask.resetCapacities();
            localTaskPool.push(localTask);
            localTask = null;
        }
    }

    @Override
    public void close() {
        clear();
        Misc.freeIfCloseable(circuitBreaker);
        Misc.free(record);
    }

    public void collect(long cursor, boolean forceCollect) {
        assert cursor > -1;
        if (cursor == LOCAL_TASK_CURSOR) {
            collectedFrameIndex = localTask.getFrameIndex();
            localTask.collected();
            return;
        }
        PageFrameReduceTask task = reduceQueue.get(cursor);
        collectedFrameIndex = task.getFrameIndex();
        task.collected(forceCollect);
        collectSubSeq.done(cursor);
    }

    public T getAtom() {
        return atom;
    }

    public int getCircuitBreakerFd() {
        return circuitBreakerFd;
    }

    public int getFrameCount() {
        return frameCount;
    }

    public long getFrameRowCount(int frameIndex) {
        return frameRowCounts.getQuick(frameIndex);
    }

    public long getId() {
        return id;
    }

    public PageAddressCache getPageAddressCache() {
        return pageAddressCache;
    }

    public AtomicInteger getReduceCounter() {
        return reduceCounter;
    }

    public PageFrameReducer getReducer() {
        return reducer;
    }

    public int getShard() {
        return shard;
    }

    public SqlExecutionContext getSqlExecutionContext() {
        return sqlExecutionContext;
    }

    public long getStartTime() {
        return startTime;
    }

    public SymbolTableSource getSymbolTableSource() {
        return pageFrameCursor;
    }

    public PageFrameReduceTask getTask(long cursor) {
        assert cursor > -1;
        if (cursor == LOCAL_TASK_CURSOR) {
            assert localTask != null && localTask.getFrameSequence() != null;
            return localTask;
        }
        return reduceQueue.get(cursor);
    }

    public boolean isActive() {
        return valid.get();
    }

    /**
     * This method is not thread-safe. It's always invoked on a single "query owner" thread.
     *
     * Returns a cursor that points either to the reduce queue or to the local reduce task.
     * The caller of this method should avoid accessing the reduce queue directly and,
     * instead, should use getTask and collect methods. Long.MAX_VALUE is the
     * reserved cursor value for the local reduce task case.
     *
     * @return the next cursor value, or -1 value if the cursor failed and the caller
     *         should retry, or -2 if there are no frames to dispatch
     */
    public long next() {
        if (frameCount == 0) {
            return -2;
        }

        assert collectedFrameIndex < frameCount - 1;
        while (true) {
            long cursor = collectSubSeq.next();
            if (cursor > -1) {
                PageFrameReduceTask task = reduceQueue.get(cursor);
                PageFrameSequence<?> thatFrameSequence = task.getFrameSequence();
                if (thatFrameSequence == this) {
                    assert id == task.getFrameSequenceId() : "ids mismatch: " + id + ", " + task.getFrameSequenceId();
                    return cursor;
                } else {
                    // Not our task, nothing to collect. Go for another spin.
                    collectSubSeq.done(cursor);
                }
            } else if (cursor == -1) {
                if (dispatch()) {
                    // We have dispatched something, so let's try to collect it.
                    continue;
                }
                if (dispatchStartFrameIndex == collectedFrameIndex + 1) {
                    // We haven't dispatched anything, and we have collected everything
                    // that was dispatched previously in this loop iteration. Use the
                    // local task to avoid being blocked in case of a full reduce queue.
                    workLocally();
                    return LOCAL_TASK_CURSOR;
                }
                return -1;
            } else {
                Os.pause();
            }
        }
    }

    public PageFrameSequence<T> of(
            RecordCursorFactory base,
            SqlExecutionContext executionContext,
            SCSequence collectSubSeq,
            T atom,
            int order
    ) throws SqlException {
        sqlExecutionContext = executionContext;
        startTime = clock.getTicks();
        circuitBreakerFd = executionContext.getCircuitBreaker().getFd();

        initRecord(executionContext.getCircuitBreaker());

        final Rnd rnd = executionContext.getAsyncRandom();
        try {
            // pass one to cache page addresses
            // this has to be a separate pass to ensure there are no cache reads
            // while the cache might be resizing
            pageAddressCache.of(base.getMetadata());

            assert pageFrameCursor == null;
            pageFrameCursor = base.getPageFrameCursor(executionContext, order);
            this.atom = atom;
            this.collectSubSeq = collectSubSeq;
            id = ID_SEQ.incrementAndGet();
            done = false;
            valid.set(true);
            reduceCounter.set(0);
            shard = rnd.nextInt(messageBus.getPageFrameReduceShardCount());
            reduceQueue = messageBus.getPageFrameReduceQueue(shard);

            // It is essential to init the atom after we have prepared the sequence for dispatch.
            // If the atom is to fail, we will be releasing whatever we prepared.
            atom.init(pageFrameCursor, executionContext);
        } catch (Throwable e) {
            pageFrameCursor = Misc.freeIfCloseable(pageFrameCursor);
            throw e;
        }
        return this;
    }

    /**
     * Must be called before subsequent calls to {@link #next()} to count page frames and
     * initialize the page frame cache and filter functions.
     *
     * @throws io.questdb.cairo.DataUnavailableException when the queried partition is in cold storage
     */
    public void prepareForDispatch() {
        if (!readyToDispatch) {
            atom.initCursor();
            buildAddressCache();
            readyToDispatch = true;
        }
    }

    public void reset() {
        // prepare to resend the same sequence as it might be required by toTop()
        frameRowCounts.clear();
        assert !done;
        done = true;
    }

    /**
     * Prepares the page frame sequence for retrieving the same data set again. The method
     * is not thread-safe.
     */
    public void toTop() {
        if (frameCount > 0) {
            long newId = ID_SEQ.incrementAndGet();
            LOG.debug().$("toTop [shard=").$(shard)
                    .$(", id=").$(id)
                    .$(", newId=").$(newId)
                    .I$();

            await();

            // done is reset by the method call above
            done = false;
            id = newId;
            dispatchStartFrameIndex = 0;
            collectedFrameIndex = -1;
            reduceCounter.set(0);
            valid.set(true);
        }
    }

    private void buildAddressCache() {
        PageFrame frame;
        while ((frame = pageFrameCursor.next()) != null) {
            pageAddressCache.add(frameCount++, frame);
            frameRowCounts.add(frame.getPartitionHi() - frame.getPartitionLo());
        }

        // dispatch tasks only if there is anything to dispatch
        if (frameCount > 0) {
            // We need to subscribe the publisher sequence before we return
            // control to the caller of this method. However, this sequence
            // will be unsubscribed asynchronously.
            messageBus.getPageFrameCollectFanOut(shard).and(collectSubSeq);
            LOG.debug()
                    .$("added [shard=").$(shard)
                    .$(", id=").$(id)
                    .$(", seqCurrent=").$(collectSubSeq.current())
                    .$(", seq=").$(collectSubSeq)
                    .I$();
        }
    }

    /**
     * This method is re-enterable. It has to be, in case the queue capacity is smaller than the number
     * of frames to be dispatched. When that is the case, the frame count published so far is stored in
     * the frame sequence. While this method is not responsible for the "collect" stage, it does
     * everything needed to keep the collect stage unblocked.
     *
     * @return true if at least one task was dispatched or reduced; false otherwise
     */
    private boolean dispatch() {
        boolean idle = true;
        boolean dispatched = false;

        // the sequence used to steal worker jobs
        final MCSequence reduceSubSeq = messageBus.getPageFrameReduceSubSeq(shard);
        final MPSequence reducePubSeq = messageBus.getPageFrameReducePubSeq(shard);

        long cursor;
        int i = dispatchStartFrameIndex;
        dispatchStartFrameIndex = frameCount;
        OUT:
        for (; i < frameCount; i++) {
            // We cannot process work on this thread. If we do, the consumer will
            // never get the execution results. The consumer only picks ready-to-go
            // tasks from the queue.
            while (true) {
                cursor = reducePubSeq.next();
                if (cursor > -1) {
                    reduceQueue.get(cursor).of(this, i);
                    LOG.debug()
                            .$("dispatched [shard=").$(shard)
                            .$(", id=").$(getId())
                            .$(", frameIndex=").$(i)
                            .$(", frameCount=").$(frameCount)
                            .$(", cursor=").$(cursor)
                            .I$();
                    reducePubSeq.done(cursor);
                    dispatched = true;
                    break;
                } else if (cursor == -1) {
                    idle = false;
                    // start stealing work to unload the queue
                    if (stealWork(reduceQueue, reduceSubSeq, record, circuitBreaker)) {
                        continue;
                    }
                    dispatchStartFrameIndex = i;
                    break OUT;
                } else {
                    Os.pause();
                }
            }
        }

        // The reduce counter is here to provide a safe backoff point for the
        // job-stealing code. It is needed because the queue is shared, and there
        // is a possibility of never-ending stealing if we don't specifically
        // count only our own items.

        // join the gang to consume published tasks
        while (reduceCounter.get() < frameCount) {
            idle = false;
            if (stealWork(reduceQueue, reduceSubSeq, record, circuitBreaker)) {
                if (isActive()) {
                    continue;
                }
            }
            break;
        }

        if (idle) {
            stealWork(reduceQueue, reduceSubSeq, record, circuitBreaker);
        }

        return dispatched;
    }

    private void initRecord(SqlExecutionCircuitBreaker executionContextCircuitBreaker) {
        if (record == null) {
            final SqlExecutionCircuitBreakerConfiguration sqlExecutionCircuitBreakerConfiguration = executionContextCircuitBreaker.getConfiguration();
            record = new PageAddressCacheRecord();
            if (sqlExecutionCircuitBreakerConfiguration != null) {
                circuitBreaker = new NetworkSqlExecutionCircuitBreaker(sqlExecutionCircuitBreakerConfiguration, MemoryTag.NATIVE_CB2);
            } else {
                circuitBreaker = NetworkSqlExecutionCircuitBreaker.NOOP_CIRCUIT_BREAKER;
            }
        }
        circuitBreaker.setFd(executionContextCircuitBreaker.getFd());
    }

    private boolean stealWork(
            RingQueue<PageFrameReduceTask> queue,
            MCSequence reduceSubSeq,
            PageAddressCacheRecord record,
            SqlExecutionCircuitBreaker circuitBreaker
    ) {
        if (PageFrameReduceJob.consumeQueue(queue, reduceSubSeq, record, circuitBreaker, this)) {
            Os.pause();
            return false;
        }
        return true;
    }

    private void workLocally() {
        assert dispatchStartFrameIndex < frameCount;

        if (localTask == null) {
            localTask = localTaskPool.pop();
        }
        localTask.of(this, dispatchStartFrameIndex++);

        try {
            LOG.debug()
                    .$("reducing locally [shard=").$(shard)
                    .$(", id=").$(id)
                    .$(", frameIndex=").$(localTask.getFrameIndex())
                    .$(", frameCount=").$(frameCount)
                    .$(", active=").$(isActive())
                    .I$();
            if (isActive()) {
                PageFrameReduceJob.reduce(record, circuitBreaker, localTask, this, this);
            }
        } catch (Throwable e) {
            cancel();
            throw e;
        } finally {
            reduceCounter.incrementAndGet();
        }
    }
}
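
The sequence above is driven by a single "query owner" thread. Below is a minimal usage sketch, not QuestDB's actual driver code, illustrating the call order implied by this class: of(), prepareForDispatch(), a next()/getTask()/collect() loop, then await() and close(). The dependencies passed into run() (configuration, messageBus, reducer, taskPool, factory, executionContext, atom, order) are assumed to be supplied by the caller, and the class name PageFrameSequenceUsageSketch is hypothetical.

import io.questdb.MessageBus;
import io.questdb.cairo.CairoConfiguration;
import io.questdb.cairo.sql.*;
import io.questdb.cairo.sql.async.*;
import io.questdb.griffin.SqlException;
import io.questdb.griffin.SqlExecutionContext;
import io.questdb.mp.SCSequence;
import io.questdb.std.WeakClosableObjectPool;

import java.io.Closeable;

// Hypothetical driver; illustrates the call order only, all inputs come from the caller.
public class PageFrameSequenceUsageSketch {

    static <T extends StatefulAtom & Closeable> void run(
            CairoConfiguration configuration,
            MessageBus messageBus,
            PageFrameReducer reducer,
            WeakClosableObjectPool<PageFrameReduceTask> taskPool,
            RecordCursorFactory factory,
            SqlExecutionContext executionContext,
            T atom,
            int order
    ) throws SqlException {
        final SCSequence collectSubSeq = new SCSequence();
        // close() calls clear(), returning the local task to the pool and freeing the page frame cursor
        try (PageFrameSequence<T> frameSequence = new PageFrameSequence<>(configuration, messageBus, reducer, taskPool)) {
            frameSequence.of(factory, executionContext, collectSubSeq, atom, order);
            frameSequence.prepareForDispatch(); // counts frames and builds the page address cache

            int collected = 0;
            while (collected < frameSequence.getFrameCount()) {
                long cursor = frameSequence.next();
                if (cursor == -2) {
                    break;      // no frames to dispatch
                }
                if (cursor == -1) {
                    continue;   // transient failure, retry
                }
                PageFrameReduceTask task = frameSequence.getTask(cursor);
                // ... read the reduced result for task.getFrameIndex() here ...
                frameSequence.collect(cursor, false);
                collected++;
            }

            frameSequence.await(); // drain anything still in flight before reuse or close
        }
    }
}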




