All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.questdb.cairo.wal.ApplyWal2TableJob Maven / Gradle / Ivy

/*******************************************************************************
 *     ___                  _   ____  ____
 *    / _ \ _   _  ___  ___| |_|  _ \| __ )
 *   | | | | | | |/ _ \/ __| __| | | |  _ \
 *   | |_| | |_| |  __/\__ \ |_| |_| | |_) |
 *    \__\_\\__,_|\___||___/\__|____/|____/
 *
 *  Copyright (c) 2014-2019 Appsicle
 *  Copyright (c) 2019-2023 QuestDB
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

package io.questdb.cairo.wal;

import io.questdb.Telemetry;
import io.questdb.TelemetryOrigin;
import io.questdb.TelemetrySystemEvent;
import io.questdb.cairo.*;
import io.questdb.cairo.wal.seq.TableMetadataChangeLog;
import io.questdb.cairo.wal.seq.TableSequencerAPI;
import io.questdb.cairo.wal.seq.TransactionLogCursor;
import io.questdb.griffin.SqlException;
import io.questdb.log.Log;
import io.questdb.log.LogFactory;
import io.questdb.mp.AbstractQueueConsumerJob;
import io.questdb.mp.Job;
import io.questdb.std.*;
import io.questdb.std.datetime.microtime.MicrosecondClock;
import io.questdb.std.datetime.microtime.Timestamps;
import io.questdb.std.str.Path;
import io.questdb.tasks.TelemetryTask;
import io.questdb.tasks.TelemetryWalTask;
import io.questdb.tasks.WalTxnNotificationTask;

import java.io.Closeable;

import static io.questdb.TelemetrySystemEvent.*;
import static io.questdb.cairo.TableUtils.TABLE_EXISTS;
import static io.questdb.cairo.pool.AbstractMultiTenantPool.NO_LOCK_REASON;
import static io.questdb.cairo.wal.WalTxnType.*;
import static io.questdb.cairo.wal.WalUtils.*;
import static io.questdb.tasks.TableWriterTask.CMD_ALTER_TABLE;
import static io.questdb.tasks.TableWriterTask.CMD_UPDATE_TABLE;

public class ApplyWal2TableJob extends AbstractQueueConsumerJob implements Closeable {
    public static final String WAL_2_TABLE_RESUME_REASON = "Resume WAL Data Application";
    private static final Log LOG = LogFactory.getLog(ApplyWal2TableJob.class);
    private static final String WAL_2_TABLE_WRITE_REASON = "WAL Data Application";
    private static final int WAL_APPLY_FAILED = -2;
    private static final int WAL_APPLY_IGNORE_ERROR = -1;
    private final CairoEngine engine;
    private final int lookAheadTransactionCount;
    private final WalMetrics metrics;
    private final MicrosecondClock microClock;
    private final OperationExecutor operationExecutor;
    private final long tableTimeQuotaMicros;
    private final Telemetry telemetry;
    private final TelemetryFacade telemetryFacade;
    private final WalEventReader walEventReader;
    private final Telemetry walTelemetry;
    private final WalTelemetryFacade walTelemetryFacade;

    public ApplyWal2TableJob(CairoEngine engine, int workerCount, int sharedWorkerCount) {
        super(engine.getMessageBus().getWalTxnNotificationQueue(), engine.getMessageBus().getWalTxnNotificationSubSequence());
        this.engine = engine;
        walTelemetry = engine.getTelemetryWal();
        walTelemetryFacade = walTelemetry.isEnabled() ? this::doStoreWalTelemetry : this::storeWalTelemetryNoop;
        telemetry = engine.getTelemetry();
        telemetryFacade = telemetry.isEnabled() ? this::doStoreTelemetry : this::storeTelemetryNoop;
        operationExecutor = new OperationExecutor(engine, workerCount, sharedWorkerCount);
        CairoConfiguration configuration = engine.getConfiguration();
        microClock = configuration.getMicrosecondClock();
        walEventReader = new WalEventReader(configuration.getFilesFacade());
        metrics = engine.getMetrics().getWalMetrics();
        lookAheadTransactionCount = configuration.getWalApplyLookAheadTransactionCount();
        tableTimeQuotaMicros = configuration.getWalApplyTableTimeQuota() >= 0 ? configuration.getWalApplyTableTimeQuota() * 1000L : Timestamps.DAY_MICROS;
    }

    @Override
    public void close() {
        Misc.free(operationExecutor);
        Misc.free(walEventReader);
    }

    private static boolean cleanDroppedTableDirectory(CairoEngine engine, Path tempPath, TableToken tableToken) {
        // Clean all the files inside table folder name except WAL directories and SEQ_DIR directory
        boolean allClean = true;
        FilesFacade ff = engine.getConfiguration().getFilesFacade();
        tempPath.of(engine.getConfiguration().getRoot()).concat(tableToken);
        int rootLen = tempPath.length();

        long p = ff.findFirst(tempPath.$());
        if (p > 0) {
            try {
                do {
                    long pUtf8NameZ = ff.findName(p);
                    int type = ff.findType(p);
                    if (ff.isDirOrSoftLinkDirNoDots(tempPath, rootLen, pUtf8NameZ, type)) {
                        if (!CairoKeywords.isTxnSeq(pUtf8NameZ) && !CairoKeywords.isWal(pUtf8NameZ)) {
                            if (ff.unlinkOrRemove(tempPath, LOG) != 0) {
                                allClean = false;
                            }
                        }

                    } else if (type == Files.DT_FILE) {
                        tempPath.trimTo(rootLen);
                        tempPath.concat(pUtf8NameZ);

                        if (CairoKeywords.isTxn(pUtf8NameZ) || CairoKeywords.isMeta(pUtf8NameZ) || matchesWalLock(tempPath)) {
                            continue;
                        }

                        if (!ff.remove(tempPath.$())) {
                            allClean = false;
                            LOG.info().$("could not remove [tempPath=").utf8(tempPath).$(", errno=").$(ff.errno()).I$();
                        }
                    }
                } while (ff.findNext(p) > 0);

                if (allClean) {
                    // Remove _txn and _meta files when all other files are removed
                    ff.remove(tempPath.trimTo(rootLen).concat(TableUtils.TXN_FILE_NAME).$());
                    ff.remove(tempPath.trimTo(rootLen).concat(TableUtils.META_FILE_NAME).$());
                    return true;
                }
            } finally {
                ff.findClose(p);
            }
        }
        return false;
    }

    private static boolean matchesWalLock(CharSequence name) {
        if (Chars.endsWith(name, ".lock")) {
            for (int i = name.length() - ".lock".length() - 1; i > 0; i--) {
                char c = name.charAt(i);
                if (c < '0' || c > '9') {
                    return Chars.equals(name, i - WAL_NAME_BASE.length() + 1, i + 1, WAL_NAME_BASE, 0, WAL_NAME_BASE.length());
                }
            }
        }

        for (int i = 0, n = name.length(); i < n; i++) {
            char c = name.charAt(i);
            if (c < '0' || c > '9') {
                return false;
            }
        }
        return true;
    }

    private static boolean tryDestroyDroppedTable(TableToken tableToken, TableWriter writer, CairoEngine engine, Path tempPath) {
        if (engine.lockReadersByTableToken(tableToken)) {
            TableWriter writerToClose = null;
            try {
                final CairoConfiguration configuration = engine.getConfiguration();
                if (writer == null && TableUtils.exists(configuration.getFilesFacade(), tempPath, configuration.getRoot(), tableToken.getDirName()) == TABLE_EXISTS) {
                    try {
                        writer = writerToClose = engine.getWriterUnsafe(tableToken, WAL_2_TABLE_WRITE_REASON);
                    } catch (EntryUnavailableException ex) {
                        // Table is being written to, we cannot destroy it at the moment
                        return false;
                    } catch (CairoException ex) {
                        // Ignore it, table can be half deleted.
                    }
                }
                if (writer != null) {
                    // Force writer to close all the files.
                    writer.destroy();
                }
                return cleanDroppedTableDirectory(engine, tempPath, tableToken);
            } finally {
                if (writerToClose != null) {
                    writerToClose.close();
                }
                engine.releaseReadersByTableToken(tableToken);
            }
        } else {
            LOG.info().$("table '").utf8(tableToken.getDirName())
                    .$("' is dropped, waiting to acquire Table Readers lock to delete the table files").$();
        }
        return false;
    }

    // Returns true if the application is finished and false if it's early terminated
    private void applyOutstandingWalTransactions(
            TableToken tableToken,
            TableWriter writer,
            CairoEngine engine,
            OperationExecutor operationExecutor,
            Path tempPath,
            RunStatus runStatus
    ) {
        final TableSequencerAPI tableSequencerAPI = engine.getTableSequencerAPI();
        boolean isTerminating;
        boolean finishedAll = true;

        try (TransactionLogCursor transactionLogCursor = tableSequencerAPI.getCursor(tableToken, writer.getAppliedSeqTxn())) {
            TableMetadataChangeLog structuralChangeCursor = null;
            try {
                int iTransaction = 0;
                int totalTransactionCount = 0;
                long rowsAdded = 0;
                long physicalRowsAdded = 0;
                long insertTimespan = 0;

                tempPath.of(engine.getConfiguration().getRoot()).concat(tableToken).slash();

                // Populate transactionMeta with timestamps of future transactions
                // to avoid O3 commits by pre-calculating safe to commit timestamp for every commit.
                writer.readWalTxnDetails(transactionLogCursor);
                transactionLogCursor.toTop();

                isTerminating = runStatus.isTerminating();
                final long timeLimit = microClock.getTicks() + tableTimeQuotaMicros;
                boolean firstRun = true;
                WHILE_TRANSACTION_CURSOR:
                while (!isTerminating && ((finishedAll = microClock.getTicks() <= timeLimit) || firstRun) && transactionLogCursor.hasNext()) {
                    firstRun = false;
                    final int walId = transactionLogCursor.getWalId();
                    final int segmentId = transactionLogCursor.getSegmentId();
                    final long segmentTxn = transactionLogCursor.getSegmentTxn();
                    final long commitTimestamp = transactionLogCursor.getCommitTimestamp();
                    final long seqTxn = transactionLogCursor.getTxn();

                    if (seqTxn != writer.getAppliedSeqTxn() + 1) {
                        throw CairoException.critical(0)
                                .put("unexpected sequencer transaction, expected ").put(writer.getAppliedSeqTxn() + 1)
                                .put(" but was ").put(seqTxn);
                    }

                    switch (walId) {
                        case METADATA_WALID:
                            // This is metadata change
                            // to be taken from Sequencer directly
                            final long newStructureVersion = transactionLogCursor.getStructureVersion();
                            if (writer.getColumnStructureVersion() != newStructureVersion - 1) {
                                throw CairoException.critical(0)
                                        .put("unexpected new WAL structure version [walStructure=").put(newStructureVersion)
                                        .put(", tableStructureVersion=").put(writer.getColumnStructureVersion())
                                        .put(']');
                            }

                            boolean hasNext;
                            if (structuralChangeCursor == null || !(hasNext = structuralChangeCursor.hasNext())) {
                                Misc.free(structuralChangeCursor);
                                structuralChangeCursor = tableSequencerAPI.getMetadataChangeLog(tableToken, newStructureVersion - 1);
                                hasNext = structuralChangeCursor.hasNext();
                            }

                            if (hasNext) {
                                final long start = microClock.getTicks();
                                walTelemetryFacade.store(WAL_TXN_APPLY_START, tableToken, walId, seqTxn, -1L, -1L, start - commitTimestamp);
                                writer.setSeqTxn(seqTxn);
                                structuralChangeCursor.next().apply(writer, true);
                                walTelemetryFacade.store(WAL_TXN_STRUCTURE_CHANGE_APPLIED, tableToken, walId, seqTxn, -1L, -1L, microClock.getTicks() - start);
                            } else {
                                // Something messed up in sequencer.
                                // There is a transaction in WAL but no structure change record.
                                throw CairoException.critical(0)
                                        .put("could not apply structure change from WAL to table. WAL metadata change does not exist [structureVersion=")
                                        .put(newStructureVersion)
                                        .put(']');
                            }
                            break;

                        case DROP_TABLE_WALID:
                            engine.notifyDropped(tableToken);
                            tryDestroyDroppedTable(tableToken, writer, engine, tempPath);
                            return;

                        case 0:
                            throw CairoException.critical(0)
                                    .put("broken table transaction record in sequencer log, walId cannot be 0 [table=")
                                    .put(tableToken.getTableName()).put(", seqTxn=").put(seqTxn).put(']');

                        default:
                            // Always set full path when using thread static path
                            operationExecutor.setNowAndFixClock(commitTimestamp);
                            tempPath.of(engine.getConfiguration().getRoot()).concat(tableToken).slash().put(WAL_NAME_BASE).put(walId).slash().put(segmentId);
                            final long start = microClock.getTicks();

                            if (!writer.getWalTnxDetails().hasRecord(seqTxn + lookAheadTransactionCount)) {
                                // Last few transactions left to process from the list
                                // of observed transactions built upfront in the beginning of the loop.
                                // Check if more transaction exist, exit restart the loop to have better picture
                                // of the future transactions and optimise the application.
                                if (transactionLogCursor.setPosition()) {
                                    writer.readWalTxnDetails(transactionLogCursor);
                                    transactionLogCursor.toTop();
                                    totalTransactionCount += iTransaction;
                                    iTransaction = 0;
                                    continue;
                                }
                            }

                            isTerminating = runStatus.isTerminating();
                            final long added = processWalCommit(
                                    writer,
                                    walId,
                                    tempPath,
                                    segmentTxn,
                                    operationExecutor,
                                    seqTxn,
                                    commitTimestamp
                            );

                            if (added > -1L) {
                                insertTimespan += microClock.getTicks() - start;
                                rowsAdded += added;
                                iTransaction++;
                                physicalRowsAdded += writer.getPhysicallyWrittenRowsSinceLastCommit();
                            }
                            if (added == -2L || isTerminating) {
                                // transaction cursor goes beyond prepared transactionMeta or termination requested. Re-run the loop.
                                break WHILE_TRANSACTION_CURSOR;
                            }
                    }
                }
                totalTransactionCount += iTransaction;

                if (!finishedAll || isTerminating) {
                    writer.commitSeqTxn();
                }

                if (totalTransactionCount > 0) {
                    LOG.info().$("job ")
                            .$(finishedAll ? "finished" : "ejected")
                            .$(" [table=").$(writer.getTableToken().getDirName())
                            .$(", seqTxn=").$(writer.getAppliedSeqTxn())
                            .$(", transactions=").$(totalTransactionCount)
                            .$(", rows=").$(rowsAdded)
                            .$(", time=").$(insertTimespan / 1000)
                            .$("ms, rate=").$(rowsAdded * 1000000L / Math.max(1, insertTimespan))
                            .$("rows/s, physicalWrittenRowsMultiplier=").$(Math.round(100.0 * physicalRowsAdded / rowsAdded) / 100.0)
                            .I$();
                }
            } finally {
                Misc.free(structuralChangeCursor);
            }
        }
    }

    private void doStoreTelemetry(short event, short origin) {
        TelemetryTask.store(telemetry, origin, event);
    }

    private void doStoreWalTelemetry(short event, TableToken tableToken, int walId, long seqTxn, long rowCount, long physicalRowCount, long latencyUs) {
        TelemetryWalTask.store(walTelemetry, event, tableToken.getTableId(), walId, seqTxn, rowCount, physicalRowCount, latencyUs);
    }

    private long processWalCommit(
            TableWriter writer,
            int walId,
            @Transient Path walPath,
            long segmentTxn,
            OperationExecutor operationExecutor,
            long seqTxn,
            long commitTimestamp
    ) {
        try (WalEventReader eventReader = walEventReader) {
            final WalEventCursor walEventCursor = eventReader.of(walPath, WAL_FORMAT_VERSION, segmentTxn);
            final byte walTxnType = walEventCursor.getType();
            switch (walTxnType) {
                case DATA:
                    final WalEventCursor.DataInfo dataInfo = walEventCursor.getDataInfo();
                    if (writer.getWalTnxDetails().hasRecord(seqTxn)) {
                        long rowCount = dataInfo.getEndRowID() - dataInfo.getStartRowID();
                        final long start = microClock.getTicks();
                        walTelemetryFacade.store(WAL_TXN_APPLY_START, writer.getTableToken(), walId, seqTxn, -1L, -1L, start - commitTimestamp);
                        final long rowsAdded = writer.commitWalTransaction(
                                walPath,
                                !dataInfo.isOutOfOrder(),
                                dataInfo.getStartRowID(),
                                dataInfo.getEndRowID(),
                                dataInfo.getMinTimestamp(),
                                dataInfo.getMaxTimestamp(),
                                dataInfo,
                                seqTxn
                        );
                        final long latency = microClock.getTicks() - start;
                        long physicalRowCount = writer.getPhysicallyWrittenRowsSinceLastCommit();
                        metrics.addApplyRowsWritten(rowCount, physicalRowCount, latency);
                        walTelemetryFacade.store(WAL_TXN_DATA_APPLIED, writer.getTableToken(), walId, seqTxn, rowsAdded, physicalRowCount, latency);
                        return rowCount;
                    } else {
                        // re-build wal transaction details
                        return -2L;
                    }

                case SQL:
                    final WalEventCursor.SqlInfo sqlInfo = walEventCursor.getSqlInfo();
                    final long start = microClock.getTicks();
                    walTelemetryFacade.store(WAL_TXN_APPLY_START, writer.getTableToken(), walId, seqTxn, -1L, -1L, start - commitTimestamp);
                    processWalSql(writer, sqlInfo, operationExecutor, seqTxn);
                    walTelemetryFacade.store(WAL_TXN_SQL_APPLIED, writer.getTableToken(), walId, seqTxn, -1L, -1L, microClock.getTicks() - start);
                    return -1L;
                case TRUNCATE:
                    long txn = writer.getTxn();
                    writer.setSeqTxn(seqTxn);
                    writer.removeAllPartitions();
                    if (writer.getTxn() == txn) {
                        // force mark the transaction as applied
                        writer.markSeqTxnCommitted(seqTxn);
                    }
                    return -1L;
                default:
                    throw new UnsupportedOperationException("Unsupported WAL txn type: " + walTxnType);
            }
        }
    }

    private void processWalSql(TableWriter tableWriter, WalEventCursor.SqlInfo sqlInfo, OperationExecutor operationExecutor, long seqTxn) {
        final int cmdType = sqlInfo.getCmdType();
        final CharSequence sql = sqlInfo.getSql();
        operationExecutor.resetRnd(sqlInfo.getRndSeed0(), sqlInfo.getRndSeed1());
        sqlInfo.populateBindVariableService(operationExecutor.getBindVariableService());
        try {
            switch (cmdType) {
                case CMD_ALTER_TABLE:
                    operationExecutor.executeAlter(tableWriter, sql, seqTxn);
                    break;
                case CMD_UPDATE_TABLE:
                    operationExecutor.executeUpdate(tableWriter, sql, seqTxn);
                    break;
                default:
                    throw new UnsupportedOperationException("Unsupported command type: " + cmdType);
            }
        } catch (SqlException ex) {
            // This is fine, some syntax error, we should not block WAL processing if SQL is not valid
            LOG.error().$("error applying SQL to wal table [table=")
                    .utf8(tableWriter.getTableToken().getTableName()).$(", sql=").$(sql).$(", error=").$(ex.getFlyweightMessage()).I$();
        } catch (CairoException e) {
            if (e.isWALTolerable()) {
                // This is fine, some syntax error, we should not block WAL processing if SQL is not valid
                LOG.error().$("error applying SQL to wal table [table=")
                        .utf8(tableWriter.getTableToken().getTableName()).$(", sql=").$(sql).$(", error=").$(e.getFlyweightMessage()).I$();
            } else {
                throw e;
            }
        }
    }

    private void storeTelemetryNoop(short event, short origin) {
    }

    private void storeWalTelemetryNoop(short event, TableToken tableToken, int walId, long seqTxn, long rowCount, long physicalRowCount, long latencyUs) {
    }

    /**
     * Returns transaction number, which is always > -1. Negative values are used as status code.
     */
    long applyWal(
            TableToken tableToken,
            CairoEngine engine,
            OperationExecutor operationCompiler,
            Job.RunStatus runStatus
    ) {
        long lastWriterTxn = WAL_APPLY_IGNORE_ERROR;
        Path tempPath = Path.PATH.get();

        try {
            // security context is checked on writing to the WAL and can be ignored here
            TableToken updatedToken = engine.getUpdatedTableToken(tableToken);
            if (engine.isTableDropped(tableToken) || updatedToken == null) {
                if (engine.isTableDropped(tableToken)) {
                    return tryDestroyDroppedTable(tableToken, null, engine, tempPath) ? Long.MAX_VALUE : -1;
                }
                // else: table is dropped and fully cleaned, this is late notification.
                return Long.MAX_VALUE;
            }

            try (TableWriter writer = engine.getWriterUnsafe(updatedToken, WAL_2_TABLE_WRITE_REASON)) {
                assert writer.getMetadata().getTableId() == tableToken.getTableId();
                applyOutstandingWalTransactions(tableToken, writer, engine, operationCompiler, tempPath, runStatus);
                lastWriterTxn = writer.getSeqTxn();
            } catch (EntryUnavailableException tableBusy) {
                //noinspection StringEquality
                if (tableBusy.getReason() != NO_LOCK_REASON
                        && !WAL_2_TABLE_WRITE_REASON.equals(tableBusy.getReason())
                        && !WAL_2_TABLE_RESUME_REASON.equals(tableBusy.getReason())) {
                    LOG.critical().$("unsolicited table lock [table=").utf8(tableToken.getDirName()).$(", lock_reason=").$(tableBusy.getReason()).I$();
                    // This is abnormal termination but table is not set to suspended state.
                    // Reset state of SeqTxnTracker so that next CheckWalTransactionJob run will send job notification if necessary.
                    engine.notifyWalTxnRepublisher(tableToken);
                }
                // Don't suspend table. Perhaps writer will be unlocked with no transaction applied.
                // We don't suspend table by virtue of having initial value on lastWriterTxn. It will either be
                // "ignore" or last txn we applied.
                return lastWriterTxn;
            }

            if (engine.getTableSequencerAPI().notifyCommitReadable(tableToken, lastWriterTxn)) {
                engine.notifyWalTxnCommitted(tableToken);
            }
        } catch (CairoException ex) {
            if (ex.isTableDropped() || engine.isTableDropped(tableToken)) {
                engine.notifyDropped(tableToken);
                // Table is dropped, and we received cairo exception in the middle of apply
                return tryDestroyDroppedTable(tableToken, null, engine, tempPath) ? Long.MAX_VALUE : WAL_APPLY_IGNORE_ERROR;
            }
            telemetryFacade.store(TelemetrySystemEvent.WAL_APPLY_SUSPEND, TelemetryOrigin.WAL_APPLY);
            LOG.critical().$("job failed, table suspended [table=").utf8(tableToken.getDirName())
                    .$(", error=").$(ex.getFlyweightMessage())
                    .$(", errno=").$(ex.getErrno())
                    .I$();
            return WAL_APPLY_FAILED;
        }
        return lastWriterTxn;
    }

    @Override
    protected boolean doRun(int workerId, long cursor, RunStatus runStatus) {
        final TableToken tableToken;

        try {
            WalTxnNotificationTask task = queue.get(cursor);
            tableToken = task.getTableToken();
        } finally {
            // Don't hold the queue until the all the transactions applied to the table
            subSeq.done(cursor);
        }

        final long txn = applyWal(tableToken, engine, operationExecutor, runStatus);
        if (txn == WAL_APPLY_FAILED) {
            try {
                engine.getTableSequencerAPI().suspendTable(tableToken);
            } catch (CairoException e) {
                LOG.critical().$("could not suspend table [table=").$(tableToken.getTableName()).$(", error=").$(e.getFlyweightMessage()).I$();
            }
        }
        return true;
    }

    @FunctionalInterface
    private interface TelemetryFacade {
        void store(short event, short origin);
    }

    @FunctionalInterface
    private interface WalTelemetryFacade {
        void store(short event, TableToken tableToken, int walId, long seqTxn, long rowCount, long physicalRowCount, long latencyUs);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy