All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.questdb.cairo.O3PartitionPurgeJob Maven / Gradle / Ivy

There is a newer version: 8.2.1
Show newest version
/*******************************************************************************
 *     ___                  _   ____  ____
 *    / _ \ _   _  ___  ___| |_|  _ \| __ )
 *   | | | | | | |/ _ \/ __| __| | | |  _ \
 *   | |_| | |_| |  __/\__ \ |_| |_| | |_) |
 *    \__\_\\__,_|\___||___/\__|____/|____/
 *
 *  Copyright (c) 2014-2019 Appsicle
 *  Copyright (c) 2019-2024 QuestDB
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

package io.questdb.cairo;

import io.questdb.cairo.sql.TableReferenceOutOfDateException;
import io.questdb.cairo.wal.WalUtils;
import io.questdb.log.Log;
import io.questdb.log.LogFactory;
import io.questdb.mp.AbstractQueueConsumerJob;
import io.questdb.std.*;
import io.questdb.std.datetime.DateFormat;
import io.questdb.std.datetime.millitime.DateFormatUtils;
import io.questdb.std.str.Path;
import io.questdb.std.str.Utf8StringSink;
import io.questdb.std.str.Utf8s;
import io.questdb.tasks.O3PartitionPurgeTask;

import java.io.Closeable;
import java.util.concurrent.atomic.AtomicBoolean;

import static io.questdb.cairo.TableUtils.TXN_FILE_NAME;

/**
 * Queue consumer job that scans a table directory for partition directories and removes
 * partition versions that are no longer referenced.
 * <p>
 * For every task taken from the O3 purge discovery queue the job:
 * <ol>
 *     <li>lists the sub-directories of the table directory and parses each name into a
 *     (partition version, partition timestamp) pair;</li>
 *     <li>sorts the pairs and groups them by partition timestamp;</li>
 *     <li>for groups that have more than one version on disk, or whose timestamp is not attached
 *     in the txn file, removes the directories whose txn range is reported as available by the
 *     txn scoreboard.</li>
 * </ol>
 * All scratch objects (file name sink, partition list, txn reader, txn scoreboard) are allocated
 * once per worker in the constructor and selected by {@code workerId} in {@link #doRun}.
 */
public class O3PartitionPurgeJob extends AbstractQueueConsumerJob<O3PartitionPurgeTask> implements Closeable {

    private final static Log LOG = LogFactory.getLog(O3PartitionPurgeJob.class);
    private final CairoConfiguration configuration;
    private final CairoEngine engine;
    // per-worker sink that receives the name of the directory entry currently being inspected
    private final Utf8StringSink[] fileNameSinks;
    // makes close() idempotent: only the first call frees the per-worker resources
    private final AtomicBoolean halted = new AtomicBoolean(false);
    // per-worker list of (partition version + 1, partition timestamp) pairs, flattened into longs
    private final ObjList<DirectLongList> partitionList;
    // per-worker txn file reader
    private final ObjList<TxReader> txnReaders;
    // per-worker txn scoreboard
    private final ObjList<TxnScoreboard> txnScoreboards;

    /**
     * Creates the job and pre-allocates one set of scratch objects per worker.
     * If any allocation fails, everything allocated so far is released via {@link #close()}
     * and the failure is rethrown.
     *
     * @param engine      engine that supplies the message bus, configuration and table locks
     * @param workerCount number of workers that may call {@link #doRun}; sizes the per-worker arrays
     */
    public O3PartitionPurgeJob(CairoEngine engine, int workerCount) {
        super(engine.getMessageBus().getO3PurgeDiscoveryQueue(), engine.getMessageBus().getO3PurgeDiscoverySubSeq());
        try {
            this.engine = engine;
            this.configuration = engine.getMessageBus().getConfiguration();
            this.fileNameSinks = new Utf8StringSink[workerCount];
            this.partitionList = new ObjList<>(workerCount);
            this.txnScoreboards = new ObjList<>(workerCount);
            this.txnReaders = new ObjList<>(workerCount);

            for (int i = 0; i < workerCount; i++) {
                fileNameSinks[i] = new Utf8StringSink();
                // two longs (version, timestamp) are stored per partition directory
                partitionList.add(new DirectLongList(configuration.getPartitionPurgeListCapacity() * 2L, MemoryTag.NATIVE_O3));
                txnScoreboards.add(new TxnScoreboard(configuration.getFilesFacade(), configuration.getTxnScoreboardEntryCount()));
                txnReaders.add(new TxReader(configuration.getFilesFacade()));
            }
        } catch (Throwable th) {
            close();
            throw th;
        }
    }

    /**
     * Frees the per-worker partition lists, txn readers and txn scoreboards.
     * Only the first invocation has an effect.
     */
    @Override
    public void close() {
        if (halted.compareAndSet(false, true)) {
            Misc.freeObjList(partitionList);
            Misc.freeObjList(txnReaders);
            Misc.freeObjList(txnScoreboards);
        }
    }

    /**
     * Parses a directory name of the form {@code <partition>[.<version>]} and, on success, appends
     * two longs to {@code partitionList}: the version (shifted by +1, see below) followed by the
     * partition timestamp. When the name cannot be parsed nothing is left in the list.
     *
     * @param fileNameSink      directory name to parse
     * @param partitionList     output list of (version + 1, timestamp) pairs
     * @param tableName         table directory name, used for logging only
     * @param partitionByFormat format used to parse the partition part of the name
     */
    private static void parsePartitionDateVersion(
            Utf8StringSink fileNameSink,
            DirectLongList partitionList,
            CharSequence tableName,
            DateFormat partitionByFormat
    ) {
        int index = Utf8s.lastIndexOfAscii(fileNameSink, '.');

        int len = fileNameSink.size();
        if (index < 0) {
            // no version suffix: the whole name is the partition part
            index = len;
        }
        try {
            if (index < len) {
                long partitionVersion = Numbers.parseLong(fileNameSink, index + 1, len);
                // When reader locks transaction 100 it opens partition version .99 or lower.
                // Also, when there is no transaction version in the name, it is counted as -1.
                // By adding +1 here we kill two birds with one stone: partition versions are aligned with
                // txn scoreboard reader locks and there is no need to store -1, which allows us to use
                // a 128bit sort to sort 2 x 64bit unsigned integers
                partitionList.add(partitionVersion + 1);
            } else {
                // This should be -1, but it is only possible to correctly sort 2 unsigned longs
                // as 128bit integer sort.
                // Set 0 instead of -1 and revert it later on. It should not be possible to have .0 in the partition name
                partitionList.add(0);
            }

            try {
                long partitionTs = partitionByFormat.parse(fileNameSink.asAsciiCharSequence(), 0, index, DateFormatUtils.EN_LOCALE);
                partitionList.add(partitionTs);
            } catch (NumericException e) {
                // WAL and sequencer directories are expected to live next to partitions, do not report them
                if (!Utf8s.startsWithAscii(fileNameSink, WalUtils.WAL_NAME_BASE) && !Utf8s.equalsAscii(WalUtils.SEQ_DIR, fileNameSink)
                        && !Utf8s.equalsAscii("seq", fileNameSink)) {
                    LOG.info().$("unknown directory [table=").utf8(tableName).$(", dir=").$(fileNameSink).I$();
                }
                partitionList.setPos(partitionList.size() - 1); // remove partition version record
            }
        } catch (NumericException e) {
            // the version suffix is not a number; nothing has been added to the list at this point
            LOG.error().$("unknown directory [table=").utf8(tableName).$(", dir=").$(fileNameSink).I$();
        }
    }

    /**
     * Lists the partition directories of a table, groups them by partition timestamp and hands
     * every group that is a purge candidate to {@link #processPartition}.
     * <p>
     * A group is a candidate when it contains more than one directory version or when its
     * timestamp is not found in the txn file. {@link TableReferenceOutOfDateException} and
     * {@link CairoException} are logged and end the processing of this table; the txn reader and
     * scoreboard are always cleared before returning from the purge phase.
     *
     * @param ff            files facade used for directory listing and removal
     * @param fileNameSink  scratch sink for directory entry names
     * @param partitionList scratch list, cleared and refilled by this call
     * @param root          database root directory
     * @param tableToken    table to process
     * @param txnScoreboard scratch scoreboard, opened read-only on the table directory
     * @param txReader      scratch txn reader, opened read-only on the table txn file
     * @param partitionBy   partitioning mode of the table
     */
    private void discoverPartitions(
            FilesFacade ff,
            Utf8StringSink fileNameSink,
            DirectLongList partitionList,
            CharSequence root,
            TableToken tableToken,
            TxnScoreboard txnScoreboard,
            TxReader txReader,
            int partitionBy
    ) {
        LOG.info().$("processing [table=").utf8(tableToken.getDirName()).I$();
        Path path = Path.getThreadLocal(root).concat(tableToken);
        int plimit = path.size();
        partitionList.clear();
        DateFormat partitionByFormat = PartitionBy.getPartitionDirFormatMethod(partitionBy);
        long p = ff.findFirst(path.$());
        if (p > 0) {
            try {
                do {
                    if (ff.isDirOrSoftLinkDirNoDots(path, plimit, ff.findName(p), ff.findType(p), fileNameSink)) {
                        parsePartitionDateVersion(fileNameSink, partitionList, tableToken.getDirName(), partitionByFormat);
                        path.trimTo(plimit).$();
                    }
                } while (ff.findNext(p) > 0);
            } finally {
                ff.findClose(p);
            }
        }

        // find duplicate partitions
        // the list holds (version, timestamp) pairs; the grouping loop below relies on the sort
        // placing entries with the same timestamp next to each other
        assert partitionList.size() % 2 == 0;
        Vect.sort128BitAscInPlace(partitionList.getAddress(), partitionList.size() / 2);

        long partitionTimestamp = Numbers.LONG_NULL;
        int lo = 0;
        int n = (int) partitionList.size();

        path.of(root).concat(tableToken);

        int tableRootLen = path.size();
        try {
            txnScoreboard.ofRO(path);
            txReader.ofRO(path.trimTo(tableRootLen).concat(TXN_FILE_NAME).$(), partitionBy);
            TableUtils.safeReadTxn(txReader, configuration.getMillisecondClock(), configuration.getSpinLockTimeout());

            for (int i = 0; i < n; i += 2) {
                long currentPartitionTs = partitionList.get(i + 1);
                if (currentPartitionTs != partitionTimestamp) {
                    // [lo, i) is a complete group for partitionTimestamp; process it when it has
                    // more than one version or when the timestamp is not attached in the txn file
                    if (i > lo + 2 ||
                            (i > 0 && txReader.findAttachedPartitionRawIndexByLoTimestamp(partitionTimestamp) < 0)) {
                        processPartition(
                                tableToken,
                                ff,
                                path,
                                tableRootLen,
                                txReader,
                                txnScoreboard,
                                partitionTimestamp,
                                partitionBy,
                                partitionList,
                                lo,
                                i
                        );
                    }
                    lo = i;
                    partitionTimestamp = currentPartitionTs;
                }
            }
            // Tail: the last group [lo, n) is not closed by the loop above.
            // When no partition directories were found (n == 0) there is nothing to process.
            // NOTE(review): the loop uses findAttachedPartitionRawIndexByLoTimestamp() for the
            // "not attached" check while the tail uses getPartitionRowCountByTimestamp() - confirm
            // both report a missing partition the same way.
            if (n > lo + 2 || (n > 0 && txReader.getPartitionRowCountByTimestamp(partitionTimestamp) < 0)) {
                processPartition(
                        tableToken,
                        ff,
                        path,
                        tableRootLen,
                        txReader,
                        txnScoreboard,
                        partitionTimestamp,
                        partitionBy,
                        partitionList,
                        lo,
                        n
                );
            }
        } catch (TableReferenceOutOfDateException e) {
            // table is dropped and recreated since we started processing it.
            // abort the table processing
            LOG.info().$("table reference out of date, aborting [table=").utf8(tableToken.getDirName()).I$();
        } catch (CairoException ex) {
            // It is possible that table is dropped while this async job was in the queue.
            // so it can be not too bad. Log error and continue work on the queue
            LOG.error()
                    .$("could not purge partition open [table=`").utf8(tableToken.getDirName())
                    .$("`, ex=").$(ex.getFlyweightMessage())
                    .$(", errno=").$(ex.getErrno())
                    .I$();
            LOG.error().$(ex.getFlyweightMessage()).$();
        } finally {
            txReader.clear();
            txnScoreboard.clear();
        }
        LOG.info().$("processed [table=").$(tableToken).I$();
    }

    /**
     * Purges directory versions of a partition whose timestamp is not attached in the txn file.
     * Versions in {@code [lo, hi)} are visited from the highest to the lowest; the walk stops at
     * the first version that cannot be removed.
     *
     * @param tableToken         table being processed
     * @param ff                 files facade used to remove directories
     * @param path               path positioned at the table directory; modified by this call
     * @param tableRootLen       length of {@code path} up to and including the table directory
     * @param txReader           txn reader holding the last read transaction
     * @param txnScoreboard      scoreboard used to check that a txn range is available
     * @param partitionTimestamp timestamp of the partition group
     * @param partitionBy        partitioning mode of the table
     * @param partitionList      sorted list of (version + 1, timestamp) pairs
     * @param lo                 index of the first element of the group in {@code partitionList}
     * @param hi                 index next after the last element of the group
     * @throws TableReferenceOutOfDateException propagated from {@link #purgePartition}
     */
    private void processDetachedPartition(
            TableToken tableToken,
            FilesFacade ff,
            Path path,
            int tableRootLen,
            TxReader txReader,
            TxnScoreboard txnScoreboard,
            long partitionTimestamp,
            int partitionBy,
            DirectLongList partitionList,
            int lo,
            int hi
    ) {
        // Partition is dropped or not fully committed.
        // It is only possible to delete when there are no readers
        long lastTxn = txReader.getTxn();
        for (int i = hi - 2, n = lo - 1; i > n; i -= 2) {
            long nameTxn = partitionList.get(i);

            // If last committed transaction number is 4, TableWriter can write partition with ending .4 and .3
            // If the version on disk is .2 (nameTxn == 3) can remove it if the lastTxn > 3, e.g. when nameTxn < lastTxn
            boolean rangeUnlocked = nameTxn < lastTxn && txnScoreboard.isRangeAvailable(nameTxn, lastTxn);

            // -1 here is to compensate +1 added when partition version parsed from folder name
            // See comments of why +1 added there in parsePartitionDateVersion()
            path.trimTo(tableRootLen);
            TableUtils.setPathForPartition(path, partitionBy, partitionTimestamp, nameTxn - 1);
            path.$();

            if (rangeUnlocked) {
                // nameTxn can be deleted
                purgePartition(tableToken, ff, path, tableRootLen - tableToken.getDirNameUtf8().size() - 1, "purging dropped partition directory [path=");
                // the next, lower version is checked against the range that ends at the version just removed
                lastTxn = nameTxn;
            } else {
                LOG.debug().$("cannot purge partition directory, locked for reading [path=").$substr(tableRootLen - tableToken.getDirNameUtf8().size() - 1, path).I$();
                break;
            }
        }
    }

    /**
     * Dispatches a partition group to {@link #processPartition0} when the partition timestamp is
     * found in the txn file, or to {@link #processDetachedPartition} otherwise.
     *
     * @param tableToken         table being processed
     * @param ff                 files facade used to remove directories
     * @param path               path positioned at the table directory; modified by the callee
     * @param tableRootLen       length of {@code path} up to and including the table directory
     * @param txReader           txn reader holding the last read transaction
     * @param txnScoreboard      scoreboard used to check that a txn range is available
     * @param partitionTimestamp timestamp of the partition group
     * @param partitionBy        partitioning mode of the table
     * @param partitionList      sorted list of (version + 1, timestamp) pairs
     * @param lo                 index of the first element of the group in {@code partitionList}
     * @param hi                 index next after the last element of the group
     */
    private void processPartition(
            TableToken tableToken,
            FilesFacade ff,
            Path path,
            int tableRootLen,
            TxReader txReader,
            TxnScoreboard txnScoreboard,
            long partitionTimestamp,
            int partitionBy,
            DirectLongList partitionList,
            int lo,
            int hi
    ) {
        boolean partitionInTxnFile = txReader.findAttachedPartitionRawIndexByLoTimestamp(partitionTimestamp) >= 0;
        if (partitionInTxnFile) {
            processPartition0(
                    tableToken,
                    ff,
                    path,
                    tableRootLen,
                    txReader,
                    txnScoreboard,
                    partitionTimestamp,
                    partitionBy,
                    partitionList,
                    lo,
                    hi
            );
        } else {
            processDetachedPartition(
                    tableToken,
                    ff,
                    path,
                    tableRootLen,
                    txReader,
                    txnScoreboard,
                    partitionTimestamp,
                    partitionBy,
                    partitionList,
                    lo,
                    hi
            );
        }
    }

    /**
     * Purges overwritten directory versions of a partition that is attached in the txn file.
     * Every version except the last one in {@code [lo, hi)} is considered; a version is removed
     * when the scoreboard reports the txn range between it and the next version (capped at the
     * committed partition name txn + 1) as available. Unlike {@link #processDetachedPartition},
     * a locked version does not stop the walk.
     *
     * @param tableToken         table being processed
     * @param ff                 files facade used to remove directories
     * @param path               path positioned at the table directory; modified by this call
     * @param tableRootLen       length of {@code path} up to and including the table directory
     * @param txReader           txn reader holding the last read transaction
     * @param txnScoreboard      scoreboard used to check that a txn range is available
     * @param partitionTimestamp timestamp of the partition group
     * @param partitionBy        partitioning mode of the table
     * @param partitionList      sorted list of (version + 1, timestamp) pairs
     * @param lo                 index of the first element of the group in {@code partitionList}
     * @param hi                 index next after the last element of the group
     * @throws TableReferenceOutOfDateException propagated from {@link #purgePartition}
     */
    private void processPartition0(
            TableToken tableToken,
            FilesFacade ff,
            Path path,
            int tableRootLen,
            TxReader txReader,
            TxnScoreboard txnScoreboard,
            long partitionTimestamp,
            int partitionBy,
            DirectLongList partitionList,
            int lo,
            int hi
    ) {
        long lastCommittedPartitionName = txReader.getPartitionNameTxnByPartitionTimestamp(partitionTimestamp);
        if (lastCommittedPartitionName > -1) {
            assert hi <= partitionList.size();
            // lo points to the beginning element in partitionList, hi next after last
            // each partition folder represented by a pair in the partitionList (partition version, partition timestamp)
            // Skip first pair, start from second and check if it can be deleted.
            for (int i = lo + 2; i < hi; i += 2) {
                long nextNameVersion = Math.min(lastCommittedPartitionName + 1, partitionList.get(i));
                long previousNameVersion = partitionList.get(i - 2);

                boolean rangeUnlocked = previousNameVersion < nextNameVersion
                        && txnScoreboard.isRangeAvailable(previousNameVersion, nextNameVersion);

                // -1 here is to compensate +1 added when partition version parsed from folder name
                // See comments of why +1 added there in parsePartitionDateVersion()
                path.trimTo(tableRootLen);
                TableUtils.setPathForPartition(path, partitionBy, partitionTimestamp, previousNameVersion - 1);
                path.$();

                if (rangeUnlocked) {
                    // previousNameVersion can be deleted
                    engine.getPartitionOverwriteControl().notifyPartitionMutates(tableToken, partitionTimestamp, previousNameVersion - 1, 0);
                    purgePartition(tableToken, ff, path, tableRootLen - tableToken.getDirNameUtf8().size() - 1, "purging overwritten partition directory [path=");
                } else {
                    LOG.info().$("cannot purge overwritten partition directory, locked for reading path=").$substr(tableRootLen - tableToken.getDirNameUtf8().size() - 1, path).I$();
                }
            }
        }
    }

    /**
     * Removes a partition directory while holding the table create lock, after confirming that
     * {@code tableToken} is still the current token for the table.
     *
     * @param tableToken table the partition belongs to
     * @param ff         files facade used to remove the directory
     * @param path       full path of the partition directory
     * @param pathFrom   offset in {@code path} from which the path is printed in the log
     * @param message    log message prefix
     * @throws TableReferenceOutOfDateException when the lock cannot be taken or the token is stale
     */
    private void purgePartition(TableToken tableToken, FilesFacade ff, Path path, int pathFrom, String message) {
        if (engine.lockTableCreate(tableToken)) {
            try {
                TableToken lastToken = engine.getUpdatedTableToken(tableToken);
                if (lastToken == tableToken) {
                    LOG.info().$(message).$substr(pathFrom, path).I$();
                    ff.unlinkOrRemove(path, LOG);
                } else {
                    // table is dropped and recreated since we started processing it.
                    // abort the table processing
                    throw new TableReferenceOutOfDateException();
                }
            } finally {
                engine.unlockTableCreate(tableToken);
            }
        } else {
            // table is dropped and recreated since we started processing it.
            // abort the table processing
            throw new TableReferenceOutOfDateException();
        }
    }

    /**
     * @return {@code false} while a database checkpoint is in progress, {@code true} otherwise
     */
    @Override
    protected boolean canRun() {
        // disable purge job while database checkpoint is in progress
        return !engine.getCheckpointStatus().isInProgress();
    }

    /**
     * Processes one purge task using the scratch objects that belong to {@code workerId}.
     * The queue cursor is released even when processing fails, so that an unexpected exception
     * does not leave the queue slot occupied.
     *
     * @param workerId  index of the calling worker, selects the per-worker scratch objects
     * @param cursor    queue cursor of the task to process
     * @param runStatus run status supplied by the job runner, not used here
     * @return always {@code true}
     */
    @Override
    protected boolean doRun(int workerId, long cursor, RunStatus runStatus) {
        final O3PartitionPurgeTask task = queue.get(cursor);
        try {
            discoverPartitions(
                    configuration.getFilesFacade(),
                    fileNameSinks[workerId],
                    partitionList.get(workerId),
                    configuration.getRoot(),
                    task.getTableToken(),
                    txnScoreboards.get(workerId),
                    txnReaders.get(workerId),
                    task.getPartitionBy()
            );
        } finally {
            subSeq.done(cursor);
        }
        return true;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy