All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.questdb.cairo.TxWriter Maven / Gradle / Ivy

/*******************************************************************************
 *     ___                  _   ____  ____
 *    / _ \ _   _  ___  ___| |_|  _ \| __ )
 *   | | | | | | |/ _ \/ __| __| | | |  _ \
 *   | |_| | |_| |  __/\__ \ |_| |_| | |_) |
 *    \__\_\\__,_|\___||___/\__|____/|____/
 *
 *  Copyright (c) 2014-2019 Appsicle
 *  Copyright (c) 2019-2022 QuestDB
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

package io.questdb.cairo;

import io.questdb.cairo.vm.Vm;
import io.questdb.cairo.vm.api.MemoryCMARW;
import io.questdb.std.*;
import io.questdb.std.str.Path;

import java.io.Closeable;

import static io.questdb.cairo.TableUtils.*;

public final class TxWriter extends TxReader implements Closeable, Mutable, SymbolValueCountCollector {
    private final FilesFacade ff;
    private long prevTransientRowCount;
    private int txPartitionCount;
    private long prevMaxTimestamp;
    private long prevMinTimestamp;
    private MemoryCMARW txMemBase;
    private int readBaseOffset;
    private int writeBaseOffset;
    private int writeAreaSize;
    private long baseVersion;
    private long readRecordSize;

    private long recordStructureVersion = 0;
    private long lastRecordStructureVersion = -1;
    private long prevRecordStructureVersion = -2;
    private int lastRecordBaseOffset = -1;
    private int prevRecordBaseOffset = -2;
    private TableWriter.ExtensionListener extensionListener;

    public TxWriter(FilesFacade ff) {
        super(ff);
        this.ff = ff;
    }

    public void append() {
        transientRowCount++;
    }

    public void beginPartitionSizeUpdate() {
        if (maxTimestamp != Long.MIN_VALUE) {
            // Last partition size is usually not stored in attached partitions list
            // but in transientRowCount only.
            // To resolve transientRowCount after out of order partition update
            // let's store it in attached partitions list
            // before out of order partition update happens
            updatePartitionSizeByTimestamp(maxTimestamp, transientRowCount);
        }
        recordStructureVersion++;
    }

    public void bumpStructureVersion(ObjList denseSymbolMapWriters) {
        recordStructureVersion++;
        ++structureVersion;
        commit(CommitMode.NOSYNC, denseSymbolMapWriters);
    }

    public void bumpTruncateVersion() {
        truncateVersion++;
    }

    public void cancelRow() {
        if (transientRowCount == 1 && txPartitionCount > 1) {
            // we have to undo creation of partition
            txPartitionCount--;
            fixedRowCount -= prevTransientRowCount;
            transientRowCount = prevTransientRowCount + 1; // When row cancel finishes 1 is subtracted. Add 1 to compensate.
            attachedPartitions.setPos(attachedPartitions.size() - LONGS_PER_TX_ATTACHED_PARTITION);
            prevTransientRowCount = getLong(TX_OFFSET_TRANSIENT_ROW_COUNT_64);
        }

        maxTimestamp = prevMaxTimestamp;
        minTimestamp = prevMinTimestamp;
        recordStructureVersion++;
    }

    public long cancelToMaxTimestamp() {
        return prevMaxTimestamp;
    }

    public long cancelToTransientRowCount() {
        return prevTransientRowCount;
    }

    @Override
    public void clear() {
        if (txMemBase != null) {
            // Never trim _txn file to size. Size of the file can only grow up.
            txMemBase.close(false);
        }
        recordStructureVersion = 0;
        lastRecordStructureVersion = -1;
        prevRecordStructureVersion = -2;
        lastRecordBaseOffset = -1;
        prevRecordBaseOffset = -2;
    }

    @Override
    public void close() {
        try {
            clear();
            txMemBase = null;
        } finally {
            super.close();
        }
    }

    public boolean unsafeLoadAll() {
        super.unsafeLoadAll();
        this.baseVersion = getVersion();
        if (baseVersion >= 0) {
            this.readBaseOffset = getBaseOffset();
            this.readRecordSize = getRecordSize();
            this.prevTransientRowCount = this.transientRowCount;
            this.prevMaxTimestamp = maxTimestamp;
            this.prevMinTimestamp = minTimestamp;
            return true;
        }
        return false;
    }

    @Override
    public TxWriter ofRO(@Transient Path path, int partitionBy) {
        throw new IllegalStateException();
    }

    protected long unsafeGetRawMemorySize() {
        return Math.max(super.unsafeGetRawMemorySize(), writeAreaSize + writeBaseOffset);
    }

    @Override
    public void collectValueCount(int symbolIndexInTxWriter, int count) {
        writeTransientSymbolCount(symbolIndexInTxWriter, count);
    }

    public void commit(int commitMode, ObjList symbolCountProviders) {

        if (prevRecordStructureVersion == recordStructureVersion && prevRecordBaseOffset > 0) {

            // Optimisation for the case where commit appends rows to the last partition only
            // In this case all to be changed is TX_OFFSET_MAX_TIMESTAMP_64 and TX_OFFSET_TRANSIENT_ROW_COUNT_64
            writeBaseOffset = prevRecordBaseOffset;
            putLong(TX_OFFSET_TXN_64, ++txn);
            putLong(TX_OFFSET_MAX_TIMESTAMP_64, maxTimestamp);
            putLong(TX_OFFSET_TRANSIENT_ROW_COUNT_64, transientRowCount);

            // Store symbol counts. Unfortunately we cannot skip it in here
            storeSymbolCounts(symbolCountProviders);

            Unsafe.getUnsafe().storeFence();
            txMemBase.putLong(TX_BASE_OFFSET_VERSION_64, ++baseVersion);

            super.switchRecord(writeBaseOffset, writeAreaSize); // writeAreaSize should be between records
            this.readBaseOffset = writeBaseOffset;

            prevTransientRowCount = transientRowCount;
            prevMinTimestamp = minTimestamp;
            prevMaxTimestamp = maxTimestamp;

            prevRecordBaseOffset = lastRecordBaseOffset;
            lastRecordBaseOffset = writeBaseOffset;
        } else {
            // Slow path, record structure changed
            commitFullRecord(commitMode, symbolCountProviders);
        }
    }

    public void finishPartitionSizeUpdate(long minTimestamp, long maxTimestamp) {
        recordStructureVersion++;
        this.minTimestamp = minTimestamp;
        this.maxTimestamp = maxTimestamp;
        finishPartitionSizeUpdate();
    }

    public void finishPartitionSizeUpdate() {
        recordStructureVersion++;
        assert getPartitionCount() > 0;
        this.transientRowCount = getPartitionSize(getPartitionCount() - 1);
        this.fixedRowCount = 0;
        this.txPartitionCount = getPartitionCount();
        for (int i = 0, hi = txPartitionCount - 1; i < hi; i++) {
            this.fixedRowCount += getPartitionSize(i);
        }
    }

    public int getAppendedPartitionCount() {
        return txPartitionCount;
    }

    public long getLastTxSize() {
        return txPartitionCount == 1 ? transientRowCount - prevTransientRowCount : transientRowCount;
    }

    public boolean inTransaction() {
        return txPartitionCount > 1 || transientRowCount != prevTransientRowCount;
    }

    public boolean isActivePartition(long timestamp) {
        return getPartitionTimestampLo(maxTimestamp) == timestamp;
    }

    public TxWriter ofRW(@Transient Path path, int partitionBy) {
        clear();
        openTxnFile(ff, path);
        try {
            super.initRO(txMemBase, partitionBy);
            unsafeLoadAll();
        } catch (Throwable e) {
            if (txMemBase != null) {
                // Do not truncate in case the file cannot be read
                txMemBase.close(false);
                txMemBase = null;
            }
            super.close();
            throw e;
        }
        return this;
    }

    public void openFirstPartition(long timestamp) {
        txPartitionCount = 1;
        updateAttachedPartitionSizeByTimestamp(timestamp, 0);
    }

    public void removeAttachedPartitions(long timestamp) {
        recordStructureVersion++;
        final long partitionTimestampLo = getPartitionTimestampLo(timestamp);
        int index = findAttachedPartitionIndexByLoTimestamp(partitionTimestampLo);
        if (index > -1) {
            final int size = attachedPartitions.size();
            final int lim = size - LONGS_PER_TX_ATTACHED_PARTITION;
            if (index < lim) {
                attachedPartitions.arrayCopy(index + LONGS_PER_TX_ATTACHED_PARTITION, index, lim - index);
            }
            attachedPartitions.setPos(lim);
            partitionTableVersion++;
        } else {
            assert false;
        }
    }

    public void reset(long fixedRowCount, long transientRowCount, long maxTimestamp, int commitMode, ObjList symbolCountProviders) {
        recordStructureVersion++;
        this.fixedRowCount = fixedRowCount;
        this.maxTimestamp = maxTimestamp;
        this.transientRowCount = transientRowCount;
        commit(commitMode, symbolCountProviders);
    }

    public void resetTimestamp() {
        recordStructureVersion++;
        prevMaxTimestamp = Long.MIN_VALUE;
        prevMinTimestamp = Long.MAX_VALUE;
        maxTimestamp = prevMaxTimestamp;
        minTimestamp = prevMinTimestamp;
    }

    public void setColumnVersion(long newVersion) {
        if (columnVersion != newVersion) {
            recordStructureVersion++;
            this.columnVersion = newVersion;
        }
    }

    public void setExtensionListener(TableWriter.ExtensionListener extensionListener) {
        this.extensionListener = extensionListener;
    }

    public void setMinTimestamp(long timestamp) {
        recordStructureVersion++;
        minTimestamp = timestamp;
        if (prevMinTimestamp == Long.MAX_VALUE) {
            prevMinTimestamp = minTimestamp;
        }
    }

    public void switchPartitions(long timestamp) {
        recordStructureVersion++;
        fixedRowCount += transientRowCount;
        prevTransientRowCount = transientRowCount;
        long partitionTimestampLo = getPartitionTimestampLo(maxTimestamp);
        int index = findAttachedPartitionIndexByLoTimestamp(partitionTimestampLo);
        updatePartitionSizeByIndex(index, transientRowCount);

        index += LONGS_PER_TX_ATTACHED_PARTITION;

        attachedPartitions.setPos(index + LONGS_PER_TX_ATTACHED_PARTITION);
        long newTimestampLo = getPartitionTimestampLo(timestamp);
        initPartitionAt(index, newTimestampLo, 0, -1);
        transientRowCount = 0;
        txPartitionCount++;
        if (extensionListener != null) {
            extensionListener.onTableExtended(newTimestampLo);
        }
    }

    public void truncate(long columnVersion) {
        recordStructureVersion++;
        maxTimestamp = Long.MIN_VALUE;
        minTimestamp = Long.MAX_VALUE;
        prevTransientRowCount = 0;
        transientRowCount = 0;
        fixedRowCount = 0;
        txPartitionCount = 1;
        attachedPartitions.clear();

        writeAreaSize = calculateWriteSize();
        writeBaseOffset = calculateWriteOffset();
        resetTxn(txMemBase, writeBaseOffset, getSymbolColumnCount(), ++txn, ++dataVersion, ++partitionTableVersion, structureVersion, columnVersion, ++truncateVersion);
        finishABHeader(writeBaseOffset, symbolColumnCount * 8, 0, CommitMode.NOSYNC);
    }

    public void updateMaxTimestamp(long timestamp) {
        prevMaxTimestamp = maxTimestamp;
        assert timestamp >= maxTimestamp;
        maxTimestamp = timestamp;
    }

    public void updatePartitionSizeByIndex(int partitionIndex, long partitionTimestampLo, long rowCount) {
        recordStructureVersion++;
        updateAttachedPartitionSizeByIndex(partitionIndex, partitionTimestampLo, rowCount);
    }

    public void updatePartitionSizeByTimestamp(long timestamp, long rowCount) {
        recordStructureVersion++;
        updateAttachedPartitionSizeByTimestamp(timestamp, rowCount);
    }

    void bumpPartitionTableVersion() {
        recordStructureVersion++;
        partitionTableVersion++;
    }

    private int calculateWriteOffset() {
        int areaSize = calculateTxRecordSize(symbolColumnCount * 8, attachedPartitions.size() * 8);
        boolean currentIsA = (baseVersion & 1L) == 0L;
        int currentOffset = currentIsA ? txMemBase.getInt(TX_BASE_OFFSET_A_32) : txMemBase.getInt(TX_BASE_OFFSET_B_32);
        if (TX_BASE_HEADER_SIZE + areaSize <= currentOffset) {
            return TX_BASE_HEADER_SIZE;
        }
        int currentSizeSymbols = currentIsA ? txMemBase.getInt(TX_BASE_OFFSET_SYMBOLS_SIZE_A_32) : txMemBase.getInt(TX_BASE_OFFSET_SYMBOLS_SIZE_B_32);
        int currentSizePartitions = currentIsA ? txMemBase.getInt(TX_BASE_OFFSET_PARTITIONS_SIZE_A_32) : txMemBase.getInt(TX_BASE_OFFSET_PARTITIONS_SIZE_B_32);
        int currentSize = calculateTxRecordSize(currentSizeSymbols, currentSizePartitions);
        return currentOffset + currentSize;
    }

    private int calculateWriteSize() {
        // If by any action data is reset and table is partitioned, clear attachedPartitions
        if (maxTimestamp == Long.MIN_VALUE && PartitionBy.isPartitioned(partitionBy)) {
            attachedPartitions.clear();
        }
        return calculateTxRecordSize(symbolColumnCount * 8, attachedPartitions.size() * 8);
    }

    private void commitFullRecord(int commitMode, ObjList symbolCountProviders) {
        symbolColumnCount = symbolCountProviders.size();

        writeAreaSize = calculateWriteSize();
        writeBaseOffset = calculateWriteOffset();
        putLong(TX_OFFSET_TXN_64, ++txn);
        putLong(TX_OFFSET_TRANSIENT_ROW_COUNT_64, transientRowCount);
        putLong(TX_OFFSET_FIXED_ROW_COUNT_64, fixedRowCount);
        putLong(TX_OFFSET_MIN_TIMESTAMP_64, minTimestamp);
        putLong(TX_OFFSET_MAX_TIMESTAMP_64, maxTimestamp);
        putLong(TX_OFFSET_PARTITION_TABLE_VERSION_64, partitionTableVersion);
        putLong(TX_OFFSET_STRUCT_VERSION_64, structureVersion);
        putLong(TX_OFFSET_DATA_VERSION_64, dataVersion);
        putLong(TX_OFFSET_COLUMN_VERSION_64, columnVersion);
        putInt(TX_OFFSET_MAP_WRITER_COUNT_32, symbolColumnCount);
        putLong(TX_OFFSET_TRUNCATE_VERSION_64, truncateVersion);

        // store symbol counts
        storeSymbolCounts(symbolCountProviders);

        // store attached partitions
        txPartitionCount = 1;
        saveAttachedPartitionsToTx(symbolColumnCount);
        finishABHeader(writeBaseOffset, symbolColumnCount * 8, attachedPartitions.size() * 8, commitMode);

        prevTransientRowCount = transientRowCount;
        prevMinTimestamp = minTimestamp;
        prevMaxTimestamp = maxTimestamp;

        prevRecordStructureVersion = lastRecordStructureVersion;
        lastRecordStructureVersion = recordStructureVersion;
        prevRecordBaseOffset = lastRecordBaseOffset;
        lastRecordBaseOffset = writeBaseOffset;
    }

    private void finishABHeader(int areaOffset, int bytesSymbols, int bytesPartitions, int commitMode) {
        boolean currentIsA = (baseVersion & 1L) == 0L;

        // When current is A, write to B
        long offsetOffset = currentIsA ? TX_BASE_OFFSET_B_32 : TX_BASE_OFFSET_A_32;
        long symbolSizeOffset = currentIsA ? TX_BASE_OFFSET_SYMBOLS_SIZE_B_32 : TX_BASE_OFFSET_SYMBOLS_SIZE_A_32;
        long partitionsSizeOffset = currentIsA ? TX_BASE_OFFSET_PARTITIONS_SIZE_B_32 : TX_BASE_OFFSET_PARTITIONS_SIZE_A_32;

        txMemBase.putInt(offsetOffset, areaOffset);
        txMemBase.putInt(symbolSizeOffset, bytesSymbols);
        txMemBase.putInt(partitionsSizeOffset, bytesPartitions);

        Unsafe.getUnsafe().storeFence();
        txMemBase.putLong(TX_BASE_OFFSET_VERSION_64, ++baseVersion);

        this.readRecordSize = calculateTxRecordSize(bytesSymbols, bytesPartitions);
        this.readBaseOffset = areaOffset;
        super.switchRecord(readBaseOffset, readRecordSize);

        if (commitMode != CommitMode.NOSYNC) {
            txMemBase.sync(commitMode == CommitMode.ASYNC);
        }
    }

    private long getLong(long offset) {
        assert offset + 8 <= readRecordSize;
        return txMemBase.getLong(readBaseOffset + offset);
    }

    private void insertPartitionSizeByTimestamp(int index, long partitionTimestamp, long partitionSize) {
        int size = attachedPartitions.size();
        attachedPartitions.setPos(size + LONGS_PER_TX_ATTACHED_PARTITION);
        if (index < size) {
            // insert in the middle
            attachedPartitions.arrayCopy(index, index + LONGS_PER_TX_ATTACHED_PARTITION, size - index);
            partitionTableVersion++;
        } else if (extensionListener != null) {
            extensionListener.onTableExtended(partitionTimestamp);
        }
        initPartitionAt(index, partitionTimestamp, partitionSize, -1);
    }

    private void openTxnFile(FilesFacade ff, Path path) {
        int pathLen = path.length();
        try {
            if (ff.exists(path.concat(TXN_FILE_NAME).$())) {
                if (txMemBase == null) {
                    txMemBase = Vm.getSmallCMARWInstance(ff, path, MemoryTag.MMAP_DEFAULT, CairoConfiguration.O_NONE);
                } else {
                    txMemBase.of(ff, path, ff.getPageSize(), MemoryTag.MMAP_DEFAULT, CairoConfiguration.O_NONE);
                }
                return;
            }
            throw CairoException.instance(ff.errno()).put("Cannot append. File does not exist: ").put(path);
        } finally {
            path.trimTo(pathLen);
        }
    }

    private void putInt(long offset, int value) {
        assert offset + 4 <= writeAreaSize;
        txMemBase.putInt(writeBaseOffset + offset, value);
    }

    private void putLong(long offset, long value) {
        txMemBase.putLong(writeBaseOffset + offset, value);
    }

    void resetToLastPartition(long committedTransientRowCount) {
        resetToLastPartition(committedTransientRowCount, getLong(TX_OFFSET_MAX_TIMESTAMP_64));
    }

    void resetToLastPartition(long committedTransientRowCount, long newMaxTimestamp) {
        recordStructureVersion++;
        updatePartitionSizeByTimestamp(maxTimestamp, committedTransientRowCount);
        prevMaxTimestamp = newMaxTimestamp;
        maxTimestamp = prevMaxTimestamp;
        transientRowCount = committedTransientRowCount;
    }

    private void saveAttachedPartitionsToTx(int symbolColumnCount) {
        // change partition count only when we have something to save to the partition table
        if (maxTimestamp != Long.MIN_VALUE) {
            final int size = attachedPartitions.size();
            final long partitionTableOffset = getPartitionTableSizeOffset(symbolColumnCount);
            putInt(partitionTableOffset, size * Long.BYTES);
            for (int i = 0; i < size; i++) {
                putLong(getPartitionTableIndexOffset(partitionTableOffset, i), attachedPartitions.getQuick(i));
            }
        }
    }

    private void storeSymbolCounts(ObjList symbolCountProviders) {
        for (int i = 0, n = symbolCountProviders.size(); i < n; i++) {
            long offset = getSymbolWriterIndexOffset(i);
            int symCount = symbolCountProviders.getQuick(i).getSymbolCount();
            putInt(offset, symCount);
            offset += Integer.BYTES;
            putInt(offset, symCount);
        }
    }

    long unsafeCommittedFixedRowCount() {
        return getLong(TX_OFFSET_FIXED_ROW_COUNT_64);
    }

    long unsafeCommittedTransientRowCount() {
        return getLong(TX_OFFSET_TRANSIENT_ROW_COUNT_64);
    }

    private void updateAttachedPartitionSizeByIndex(int partitionIndex, long partitionTimestampLo, long partitionSize) {
        if (partitionIndex > -1) {
            updatePartitionSizeByIndex(partitionIndex, partitionSize);
        } else {
            insertPartitionSizeByTimestamp(-(partitionIndex + 1), partitionTimestampLo, partitionSize);
        }
    }

    private void updateAttachedPartitionSizeByTimestamp(long timestamp, long partitionSize) {
        final long partitionTimestampLo = getPartitionTimestampLo(timestamp);
        updateAttachedPartitionSizeByIndex(findAttachedPartitionIndexByLoTimestamp(partitionTimestampLo), partitionTimestampLo, partitionSize);
    }

    private void updatePartitionSizeByIndex(int index, long partitionSize) {
        if (attachedPartitions.getQuick(index + PARTITION_SIZE_OFFSET) != partitionSize) {
            recordStructureVersion++;
            attachedPartitions.set(index + PARTITION_SIZE_OFFSET, partitionSize);
        }
    }

    void updatePartitionSizeAndTxnByIndex(int index, long partitionSize) {
        recordStructureVersion++;
        attachedPartitions.set(index + PARTITION_SIZE_OFFSET, partitionSize);
        attachedPartitions.set(index + PARTITION_NAME_TX_OFFSET, txn);
    }

    void updatePartitionColumnVersion(long partitionTimestamp) {
        final int index = findAttachedPartitionIndexByLoTimestamp(partitionTimestamp);
        attachedPartitions.set(index + PARTITION_COLUMN_VERSION_OFFSET, columnVersion);
    }

    private void writeTransientSymbolCount(int symbolIndex, int symCount) {
        // This updates into current record
        long recordOffset = getSymbolWriterTransientIndexOffset(symbolIndex);
        assert recordOffset + 4 <= readRecordSize;
        txMemBase.putInt(readBaseOffset + recordOffset, symCount);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy