/*******************************************************************************
 *     ___                  _   ____  ____
 *    / _ \ _   _  ___  ___| |_|  _ \| __ )
 *   | | | | | | |/ _ \/ __| __| | | |  _ \
 *   | |_| | |_| |  __/\__ \ |_| |_| | |_) |
 *    \__\_\\__,_|\___||___/\__|____/|____/
 *
 *  Copyright (c) 2014-2019 Appsicle
 *  Copyright (c) 2019-2020 QuestDB
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

package io.questdb.cairo;

import io.questdb.MessageBus;
import io.questdb.MessageBusImpl;
import io.questdb.cairo.SymbolMapWriter.TransientSymbolCountChangeHandler;
import io.questdb.cairo.sql.Function;
import io.questdb.cairo.sql.Record;
import io.questdb.cairo.sql.RecordMetadata;
import io.questdb.cairo.sql.SymbolTable;
import io.questdb.cairo.vm.*;
import io.questdb.griffin.SqlException;
import io.questdb.griffin.model.IntervalUtils;
import io.questdb.log.Log;
import io.questdb.log.LogFactory;
import io.questdb.mp.*;
import io.questdb.std.*;
import io.questdb.std.datetime.DateFormat;
import io.questdb.std.datetime.microtime.Timestamps;
import io.questdb.std.str.LPSZ;
import io.questdb.std.str.NativeLPSZ;
import io.questdb.std.str.Path;
import io.questdb.std.str.StringSink;
import io.questdb.tasks.*;
import org.jetbrains.annotations.NotNull;

import java.io.Closeable;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.LongConsumer;

import static io.questdb.cairo.StatusCode.*;
import static io.questdb.cairo.TableUtils.*;
import static io.questdb.std.Files.isDots;

public class TableWriter implements Closeable {
    public static final int TIMESTAMP_MERGE_ENTRY_BYTES = Long.BYTES * 2;
    public static final int O3_BLOCK_NONE = -1;
    public static final int O3_BLOCK_O3 = 1;
    public static final int O3_BLOCK_DATA = 2;
    public static final int O3_BLOCK_MERGE = 3;
    private static final int MEM_PAGE_SIZE = 16 * Numbers.SIZE_1MB;
    private static final Log LOG = LogFactory.getLog(TableWriter.class);
    private static final CharSequenceHashSet IGNORED_FILES = new CharSequenceHashSet();
    private static final Runnable NOOP = () -> {
    };
    private final static RemoveFileLambda REMOVE_OR_LOG = TableWriter::removeFileAndOrLog;
    private final static RemoveFileLambda REMOVE_OR_EXCEPTION = TableWriter::removeOrException;
    final ObjList<AppendOnlyVirtualMemory> columns;
    private final ObjList<SymbolMapWriter> symbolMapWriters;
    private final ObjList<SymbolMapWriter> denseSymbolMapWriters;
    private final ObjList<WriterTransientSymbolCountChangeHandler> denseSymbolTransientCountHandlers;
    private final ObjList<ColumnIndexer> indexers;
    private final ObjList<ColumnIndexer> denseIndexers = new ObjList<>();
    private final Path path;
    private final Path other;
    private final LongList refs = new LongList();
    private final Row row = new Row();
    private final int rootLen;
    private final MappedReadOnlyMemory metaMem;
    private final int partitionBy;
    private final RowFunction switchPartitionFunction = new SwitchPartitionRowFunction();
    private final RowFunction openPartitionFunction = new OpenPartitionRowFunction();
    private final RowFunction noPartitionFunction = new NoPartitionFunction();
    private final RowFunction noTimestampFunction = new NoTimestampFunction();
    private final RowFunction o3RowFunction = new O3PartitionFunction();
    private final NativeLPSZ nativeLPSZ = new NativeLPSZ();
    private final LongList columnTops;
    private final FilesFacade ff;
    private final DateFormat partitionDirFmt;
    private final AppendOnlyVirtualMemory ddlMem;
    private final int mkDirMode;
    private final int fileOperationRetryCount;
    private final CharSequence tableName;
    private final TableWriterMetadata metadata;
    private final CairoConfiguration configuration;
    private final CharSequenceIntHashMap validationMap = new CharSequenceIntHashMap();
    private final FragileCode RECOVER_FROM_META_RENAME_FAILURE = this::recoverFromMetaRenameFailure;
    private final SOCountDownLatch indexLatch = new SOCountDownLatch();
    private final LongList indexSequences = new LongList();
    private final MessageBus messageBus;
    private final boolean parallelIndexerEnabled;
    private final Timestamps.TimestampFloorMethod timestampFloorMethod;
    private final Timestamps.TimestampCeilMethod timestampCeilMethod;
    private final Timestamps.TimestampAddMethod timestampAddMethod;
    private final int defaultCommitMode;
    private final FindVisitor removePartitionDirectories = this::removePartitionDirectories0;
    private final ObjList<Runnable> nullSetters;
    private final ObjList<Runnable> o3NullSetters;
    private final ObjList<ContiguousVirtualMemory> o3Columns;
    private final ObjList<ContiguousVirtualMemory> o3Columns2;
    private final TableBlockWriter blockWriter;
    private final TimestampValueRecord dropPartitionFunctionRec = new TimestampValueRecord();
    private final ObjList<O3CallbackTask> o3PendingCallbackTasks = new ObjList<>();
    private final O3ColumnUpdateMethod oooSortVarColumnRef = this::o3SortVarColumn;
    private final O3ColumnUpdateMethod oooSortFixColumnRef = this::o3SortFixColumn;
    private final SOUnboundedCountDownLatch o3DoneLatch = new SOUnboundedCountDownLatch();
    private final AtomicLong o3PartitionUpdRemaining = new AtomicLong();
    private final AtomicInteger o3ErrorCount = new AtomicInteger();
    private final MappedReadWriteMemory todoMem = new PagedMappedReadWriteMemory();
    private final TxWriter txFile;
    private final FindVisitor removePartitionDirsNotAttached = this::removePartitionDirsNotAttached;
    private final LongList o3PartitionRemoveCandidates = new LongList();
    private final ObjectPool<O3MutableAtomicInteger> o3ColumnCounters = new ObjectPool<>(O3MutableAtomicInteger::new, 64);
    private final ObjectPool<O3Basket> o3BasketPool = new ObjectPool<>(O3Basket::new, 64);
    private final TxnScoreboard txnScoreboard;
    private final StringSink o3Sink = new StringSink();
    private final NativeLPSZ o3NativeLPSZ = new NativeLPSZ();
    private final RingQueue<O3PartitionUpdateTask> o3PartitionUpdateQueue;
    private final MPSequence o3PartitionUpdatePubSeq;
    private final SCSequence o3PartitionUpdateSubSeq;
    private LongConsumer appendTimestampSetter;
    private long todoTxn;
    private ContiguousVirtualMemory o3TimestampMem;
    private ContiguousVirtualMemory o3TimestampMemCpy;
    private final O3ColumnUpdateMethod o3MoveHysteresisRef = this::o3MoveHysteresis0;
    private long lockFd = -1;
    private LongConsumer timestampSetter;
    private int columnCount;
    private RowFunction rowFunction = openPartitionFunction;
    private boolean avoidIndexOnCommit = false;
    private long partitionTimestampHi;
    private long masterRef = 0;
    private long o3MasterRef = -1;
    private boolean removeDirOnCancelRow = true;
    private long tempMem16b = Unsafe.malloc(16);
    private int metaSwapIndex;
    private int metaPrevIndex;
    private final FragileCode RECOVER_FROM_TODO_WRITE_FAILURE = this::recoverFrommTodoWriteFailure;
    private final FragileCode RECOVER_FROM_SYMBOL_MAP_WRITER_FAILURE = this::recoverFromSymbolMapWriterFailure;
    private final FragileCode RECOVER_FROM_SWAP_RENAME_FAILURE = this::recoverFromSwapRenameFailure;
    private final FragileCode RECOVER_FROM_COLUMN_OPEN_FAILURE = this::recoverOpenColumnFailure;
    private int indexCount;
    private boolean performRecovery;
    private boolean distressed = false;
    private LifecycleManager lifecycleManager;
    private String designatedTimestampColumnName;
    private long o3RowCount;
    private final O3ColumnUpdateMethod o3MoveUncommittedRef = this::o3MoveUncommitted0;
    private long lastPartitionTimestamp;
    private boolean o3InError = false;
    private final boolean o3QuickSortEnabled;

    public TableWriter(CairoConfiguration configuration, CharSequence tableName) {
        this(configuration, tableName, new MessageBusImpl(configuration));
    }

    public TableWriter(CairoConfiguration configuration, CharSequence tableName, @NotNull MessageBus messageBus) {
        this(configuration, tableName, messageBus, true, DefaultLifecycleManager.INSTANCE);
    }

    public TableWriter(
            CairoConfiguration configuration,
            CharSequence tableName,
            @NotNull MessageBus messageBus,
            boolean lock,
            LifecycleManager lifecycleManager
    ) {
        this(configuration, tableName, messageBus, lock, lifecycleManager, configuration.getRoot());
    }

    public TableWriter(
            CairoConfiguration configuration,
            CharSequence tableName,
            @NotNull MessageBus messageBus,
            boolean lock,
            LifecycleManager lifecycleManager,
            CharSequence root
    ) {
        LOG.info().$("open '").utf8(tableName).$('\'').$();
        this.configuration = configuration;
        this.messageBus = messageBus;
        this.defaultCommitMode = configuration.getCommitMode();
        this.lifecycleManager = lifecycleManager;
        this.parallelIndexerEnabled = configuration.isParallelIndexingEnabled();
        this.ff = configuration.getFilesFacade();
        this.mkDirMode = configuration.getMkDirMode();
        this.fileOperationRetryCount = configuration.getFileOperationRetryCount();
        this.tableName = Chars.toString(tableName);
        this.o3QuickSortEnabled = configuration.isO3QuickSortEnabled();
        this.o3PartitionUpdateQueue = new RingQueue<>(O3PartitionUpdateTask.CONSTRUCTOR, configuration.getO3PartitionUpdateQueueCapacity());
        this.o3PartitionUpdatePubSeq = new MPSequence(this.o3PartitionUpdateQueue.getCapacity());
        this.o3PartitionUpdateSubSeq = new SCSequence();
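        // wire the O3 partition update queue into a cycle: slots flow from the
        // multi-producer publisher sequence to the single consumer and back to
        // the publisher once consumed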
        o3PartitionUpdatePubSeq.then(o3PartitionUpdateSubSeq).then(o3PartitionUpdatePubSeq);

        this.path = new Path();
        this.path.of(root).concat(tableName);
        this.other = new Path().of(root).concat(tableName);
        this.rootLen = path.length();
        this.blockWriter = new TableBlockWriter(configuration, messageBus);
        try {
            if (lock) {
                lock();
            } else {
                this.lockFd = -1L;
            }
            long todoCount = openTodoMem();
            int todo;
            if (todoCount > 0) {
                todo = (int) todoMem.getLong(40);
            } else {
                todo = -1;
            }
            if (todo == TODO_RESTORE_META) {
                repairMetaRename((int) todoMem.getLong(48));
            }
            this.ddlMem = new AppendOnlyVirtualMemory();
            this.metaMem = new SinglePageMappedReadOnlyPageMemory();
            openMetaFile(ff, path, rootLen, metaMem);
            this.metadata = new TableWriterMetadata(ff, metaMem);
            this.partitionBy = metaMem.getInt(META_OFFSET_PARTITION_BY);
            this.txFile = new TxWriter(ff, path, partitionBy);
            this.txnScoreboard = new TxnScoreboard(ff, path.trimTo(rootLen), configuration.getTxnScoreboardEntryCount());
            path.trimTo(rootLen);
            // we have to do truncate repair at this stage of constructor
            // because this operation requires metadata
            switch (todo) {
                case TODO_TRUNCATE:
                    repairTruncate();
                    break;
                case TODO_RESTORE_META:
                case -1:
                    break;
                default:
                    LOG.error().$("ignoring unknown *todo* [code=").$(todo).$(']').$();
                    break;
            }
            this.columnCount = metadata.getColumnCount();
            if (metadata.getTimestampIndex() > -1) {
                this.designatedTimestampColumnName = metadata.getColumnName(metadata.getTimestampIndex());
            }
            this.refs.extendAndSet(columnCount, 0);
            this.columns = new ObjList<>(columnCount * 2);
            this.o3Columns = new ObjList<>(columnCount * 2);
            this.o3Columns2 = new ObjList<>(columnCount * 2);
            this.row.activeColumns = columns;
            this.symbolMapWriters = new ObjList<>(columnCount);
            this.indexers = new ObjList<>(columnCount);
            this.denseSymbolMapWriters = new ObjList<>(metadata.getSymbolMapCount());
            this.denseSymbolTransientCountHandlers = new ObjList<>(metadata.getSymbolMapCount());
            this.nullSetters = new ObjList<>(columnCount);
            this.o3NullSetters = new ObjList<>(columnCount);
            this.row.activeNullSetters = nullSetters;
            this.columnTops = new LongList(columnCount);
            if (partitionBy != PartitionBy.NONE) {
                timestampFloorMethod = getPartitionFloor(partitionBy);
                timestampCeilMethod = getPartitionCeil(partitionBy);
                timestampAddMethod = getPartitionAdd(partitionBy);
                partitionDirFmt = getPartitionDateFmt(partitionBy);
            } else {
                timestampFloorMethod = null;
                timestampCeilMethod = null;
                timestampAddMethod = null;
                partitionDirFmt = null;
            }

            executeDeferred();
        } catch (Throwable e) {
            doClose(false);
            throw e;
        }
    }

    public static int getPrimaryColumnIndex(int index) {
        return index * 2;
    }

    public static int getSecondaryColumnIndex(int index) {
        return getPrimaryColumnIndex(index) + 1;
    }
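    // Each column occupies two adjacent slots in the columns list: slot 2*i holds the
    // primary (data) memory and slot 2*i+1 holds the secondary memory, which is only
    // present for variable-length types such as STRING and BINARY.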

    public static long getTimestampIndexRow(long timestampIndex, long indexRow) {
        return Unsafe.getUnsafe().getLong(timestampIndex + indexRow * 16 + Long.BYTES);
    }

    public static long getTimestampIndexValue(long timestampIndex, long indexRow) {
        return Unsafe.getUnsafe().getLong(timestampIndex + indexRow * 16);
    }
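    // Timestamp merge index entries are 16 bytes wide (TIMESTAMP_MERGE_ENTRY_BYTES):
    // the timestamp value sits at offset 0 and the row index at offset 8, as the two
    // getters above read them.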

    public static DateFormat selectPartitionDirFmt(int partitionBy) {
        switch (partitionBy) {
            case PartitionBy.DAY:
                return fmtDay;
            case PartitionBy.MONTH:
                return fmtMonth;
            case PartitionBy.YEAR:
                return fmtYear;
            default:
                return null;
        }
    }
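    // Partition directories are named after the partition floor, e.g. PartitionBy.DAY
    // maps to "2020-01-01", MONTH to "2020-01" and YEAR to "2020"; see
    // partitionNameToTimestamp for the reverse mapping.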

    public void addColumn(CharSequence name, int type) {
        addColumn(name, type, configuration.getDefaultSymbolCapacity(), configuration.getDefaultSymbolCacheFlag(), false, 0, false);
    }
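    // Usage sketch (illustrative, not part of the original source; table and column
    // names are assumptions): opening a writer and adding a plain column using the
    // configuration defaults for symbol capacity and caching.
    //
    //     try (TableWriter writer = new TableWriter(configuration, "trades")) {
    //         writer.addColumn("price", ColumnType.DOUBLE);
    //     }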

    /**
     * Adds new column to table, which can be either empty or can have data already. When existing columns
     * already have data this function will create ".top" file in addition to column files. ".top" file contains
     * size of partition at the moment of column creation. It must be used to accurately position inside new
     * column when either appending or reading.
     *
     * <b>Failures</b>
     * <p>
     * Adding new column can fail in many different situations. None of the failures affect integrity of data that is already in
     * the table but can leave instance of TableWriter in inconsistent state. When this happens function will throw CairoError.
     * Calling code must close TableWriter instance and open another when problems are rectified. Those problems would be
     * either with disk or memory or both.
     * <p>
     * Whenever function throws CairoException application code can continue using TableWriter instance and may attempt to
     * add columns again.
     * <p>
     * <b>Transactions</b>
     * <p>
     * Pending transaction will be committed before function attempts to add column. Even when function is unsuccessful it may
     * still have committed transaction.
     *
     * @param name                    of column either ASCII or UTF8 encoded.
     * @param symbolCapacity          when column type is SYMBOL this parameter specifies approximate capacity for symbol map.
     *                                It should be equal to number of unique symbol values stored in the table and getting this
     *                                value badly wrong will cause performance degradation. Must be power of 2
     * @param symbolCacheFlag         when set to true, symbol values will be cached on Java heap.
     * @param type                    {@link ColumnType}
     * @param isIndexed               configures column to be indexed or not
     * @param indexValueBlockCapacity approximation of number of rows for single index key, must be power of 2
     * @param isSequential            for columns that contain sequential values query optimiser can make assumptions on range searches (future feature)
     */
    public void addColumn(
            CharSequence name,
            int type,
            int symbolCapacity,
            boolean symbolCacheFlag,
            boolean isIndexed,
            int indexValueBlockCapacity,
            boolean isSequential
    ) {
        assert indexValueBlockCapacity == Numbers.ceilPow2(indexValueBlockCapacity) : "power of 2 expected";
        assert symbolCapacity == Numbers.ceilPow2(symbolCapacity) : "power of 2 expected";
        assert TableUtils.isValidColumnName(name) : "invalid column name";

        checkDistressed();

        if (getColumnIndexQuiet(metaMem, name, columnCount) != -1) {
            throw CairoException.instance(0).put("Duplicate column name: ").put(name);
        }

        LOG.info().$("adding column '").utf8(name).$('[').$(ColumnType.nameOf(type)).$("]' to ").$(path).$();

        commit();

        removeColumnFiles(name, type, REMOVE_OR_EXCEPTION);

        // create new _meta.swp
        this.metaSwapIndex = addColumnToMeta(name, type, isIndexed, indexValueBlockCapacity, isSequential);

        // close _meta so we can rename it
        metaMem.close();

        // validate new meta
        validateSwapMeta(name);

        // rename _meta to _meta.prev
        renameMetaToMetaPrev(name);

        // after we moved _meta to _meta.prev
        // we have to have _todo to restore _meta should anything go wrong
        writeRestoreMetaTodo(name);

        // rename _meta.swp to _meta
        renameSwapMetaToMeta(name);

        if (type == ColumnType.SYMBOL) {
            try {
                createSymbolMapWriter(name, symbolCapacity, symbolCacheFlag);
            } catch (CairoException e) {
                runFragile(RECOVER_FROM_SYMBOL_MAP_WRITER_FAILURE, name, e);
            }
        } else {
            // maintain sparse list of symbol writers
            symbolMapWriters.extendAndSet(columnCount, null);
        }

        // add column objects
        configureColumn(type, isIndexed);

        // increment column count
        columnCount++;

        // extend columnTop list to make sure row cancel can work
        // need for setting correct top is hard to test without being able to read from table
        columnTops.extendAndSet(columnCount - 1, txFile.getTransientRowCount());

        // create column files
        if (txFile.getTransientRowCount() > 0 || partitionBy == PartitionBy.NONE) {
            try {
                openNewColumnFiles(name, isIndexed, indexValueBlockCapacity);
            } catch (CairoException e) {
                runFragile(RECOVER_FROM_COLUMN_OPEN_FAILURE, name, e);
            }
        }

        try {
            // open _meta file
            openMetaFile(ff, path, rootLen, metaMem);

            // remove _todo
            clearTodoLog();
        } catch (CairoException err) {
            throwDistressException(err);
        }

        txFile.bumpStructureVersion(this.denseSymbolMapWriters);

        metadata.addColumn(name, type, isIndexed, indexValueBlockCapacity);

        LOG.info().$("ADDED column '").utf8(name).$('[').$(ColumnType.nameOf(type)).$("]' to ").$(path).$();
    }
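    // Usage sketch (illustrative, names and capacities are assumptions): adding an
    // indexed, cached SYMBOL column. Capacities must be powers of 2, as the asserts
    // above require. Per the javadoc, CairoException leaves the writer usable, while
    // CairoError means the writer must be closed and reopened.
    //
    //     try {
    //         writer.addColumn("venue", ColumnType.SYMBOL, 128, true, true, 256, false);
    //     } catch (CairoException e) {
    //         // transient failure: writer instance remains usable, retry is allowed
    //     }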
    public void addIndex(CharSequence columnName, int indexValueBlockSize) {
        assert indexValueBlockSize == Numbers.ceilPow2(indexValueBlockSize) : "power of 2 expected";

        checkDistressed();

        final int columnIndex = getColumnIndexQuiet(metaMem, columnName, columnCount);

        if (columnIndex == -1) {
            throw CairoException.instance(0).put("Invalid column name: ").put(columnName);
        }

        commit();

        if (isColumnIndexed(metaMem, columnIndex)) {
            throw CairoException.instance(0).put("already indexed [column=").put(columnName).put(']');
        }

        final int existingType = getColumnType(metaMem, columnIndex);
        LOG.info().$("adding index to '").utf8(columnName).$('[').$(ColumnType.nameOf(existingType)).$(", path=").$(path).$(']').$();

        if (existingType != ColumnType.SYMBOL) {
            LOG.error().$("cannot create index for [column='").utf8(columnName).$(", type=").$(ColumnType.nameOf(existingType)).$(", path=").$(path).$(']').$();
            throw CairoException.instance(0).put("cannot create index for [column='").put(columnName).put(", type=").put(ColumnType.nameOf(existingType)).put(", path=").put(path).put(']');
        }

        // create indexer
        final SymbolColumnIndexer indexer = new SymbolColumnIndexer();

        try {
            try {
                // edge cases here are:
                // column spans only part of table - e.g. it was added after table was created and populated
                // column has top value, e.g. does not span entire partition
                // to this end, we have a super-edge case:

                if (partitionBy != PartitionBy.NONE) {
                    // run indexer for the whole table
                    final long timestamp = indexHistoricPartitions(indexer, columnName, indexValueBlockSize);
                    path.trimTo(rootLen);
                    setStateForTimestamp(path, timestamp, true);
                } else {
                    setStateForTimestamp(path, 0, false);
                }

                // create index in last partition
                indexLastPartition(indexer, columnName, columnIndex, indexValueBlockSize);
            } finally {
                path.trimTo(rootLen);
            }
        } catch (Throwable e) {
            LOG.error().$("rolling back index created so far [path=").$(path).$(']').$();
            removeIndexFiles(columnName);
            throw e;
        }

        // set index flag in metadata
        // create new _meta.swp
        metaSwapIndex = copyMetadataAndSetIndexed(columnIndex, indexValueBlockSize);

        // close _meta so we can rename it
        metaMem.close();

        // validate new meta
        validateSwapMeta(columnName);

        // rename _meta to _meta.prev
        renameMetaToMetaPrev(columnName);

        // after we moved _meta to _meta.prev
        // we have to have _todo to restore _meta should anything go wrong
        writeRestoreMetaTodo(columnName);

        // rename _meta.swp to _meta
        renameSwapMetaToMeta(columnName);

        try {
            // open _meta file
            openMetaFile(ff, path, rootLen, metaMem);

            // remove _todo
            clearTodoLog();
        } catch (CairoException err) {
            throwDistressException(err);
        }

        txFile.bumpStructureVersion(this.denseSymbolMapWriters);

        indexers.extendAndSet((columnIndex) / 2, indexer);
        populateDenseIndexerList();

        TableColumnMetadata columnMetadata = metadata.getColumnQuick(columnIndex);
        columnMetadata.setIndexed(true);
        columnMetadata.setIndexValueBlockCapacity(indexValueBlockSize);

        LOG.info().$("ADDED index to '").utf8(columnName).$('[').$(ColumnType.nameOf(existingType)).$("]' to ").$(path).$();
    }
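    // Usage sketch (illustrative; column name and block size are assumptions): indexing
    // an existing SYMBOL column. Only SYMBOL columns can be indexed, as enforced above,
    // and the block size must be a power of 2.
    //
    //     writer.addIndex("venue", 256);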
    public int attachPartition(long timestamp) {
        // Partitioned table must have a timestamp
        // SQL compiler will check that table is partitioned
        assert metadata.getTimestampIndex() > -1;

        CharSequence timestampCol = metadata.getColumnQuick(metadata.getTimestampIndex()).getName();
        if (txFile.attachedPartitionsContains(timestamp)) {
            LOG.info().$("partition is already attached [path=").$(path).$(']').$();
            return PARTITION_ALREADY_ATTACHED;
        }

        if (metadata.getSymbolMapCount() > 0) {
            LOG.error().$("attaching partitions on table with symbols not yet supported [table=").$(tableName)
                    .$(",partition=").$ts(timestamp).I$();
            return TABLE_HAS_SYMBOLS;
        }

        boolean rollbackRename = false;
        try {
            setPathForPartition(path, partitionBy, timestamp, false);
            if (!ff.exists(path.$())) {
                setPathForPartition(other, partitionBy, timestamp, false);
                other.put(DETACHED_DIR_MARKER);

                if (ff.exists(other.$())) {
                    if (ff.rename(other, path)) {
                        rollbackRename = true;
                        LOG.info().$("moved partition dir: ").$(other).$(" to ").$(path).$();
                    } else {
                        throw CairoException.instance(ff.errno()).put("File system error on trying to rename [from=")
                                .put(other).put(",to=").put(path).put(']');
                    }
                }
            }

            if (ff.exists(path.$())) {
                // find out lo, hi ranges of partition attached as well as size
                final long partitionSize = readPartitionSizeMinMax(ff, path, timestampCol, tempMem16b, timestamp);
                if (partitionSize > 0) {
                    if (inTransaction()) {
                        LOG.info().$("committing open transaction before applying attach partition command [table=").$(tableName)
                                .$(",partition=").$ts(timestamp).I$();
                        commit();
                    }

                    attachPartitionCheckFilesMatchMetadata(ff, path, getMetadata(), partitionSize);
                    long minPartitionTimestamp = Unsafe.getUnsafe().getLong(tempMem16b);
                    long maxPartitionTimestamp = Unsafe.getUnsafe().getLong(tempMem16b + 8);

                    assert timestamp <= minPartitionTimestamp && minPartitionTimestamp <= maxPartitionTimestamp;

                    long nextMinTimestamp = Math.min(minPartitionTimestamp, txFile.getMinTimestamp());
                    long nextMaxTimestamp = Math.max(maxPartitionTimestamp, txFile.getMaxTimestamp());
                    boolean appendPartitionAttached = size() == 0 || getPartitionLo(nextMaxTimestamp) > getPartitionLo(txFile.getMaxTimestamp());

                    txFile.beginPartitionSizeUpdate();
                    txFile.updatePartitionSizeByTimestamp(timestamp, partitionSize);
                    txFile.finishPartitionSizeUpdate(nextMinTimestamp, nextMaxTimestamp);
                    txFile.commit(defaultCommitMode, denseSymbolMapWriters);

                    if (appendPartitionAttached) {
                        freeColumns(true);
                        configureAppendPosition();
                    }

                    LOG.info().$("partition attached [path=").$(path).$(']').$();
                    rollbackRename = false;
                } else {
                    LOG.error().$("cannot detect partition size [path=").$(path).$(",timestampColumn=").$(timestampCol).$(']').$();
                    return PARTITION_EMPTY;
                }
            } else {
                LOG.error().$("cannot attach missing partition [path=").$(path).$(']').$();
                return CANNOT_ATTACH_MISSING_PARTITION;
            }
        } finally {
            if (rollbackRename) {
                // rename back to .detached
                // otherwise it can be deleted on writer re-open
                if (ff.rename(path.$(), other.$())) {
                    LOG.info().$("moved partition dir after failed attach attempt: ").$(path).$(" to ").$(other).$();
                } else {
                    LOG.info().$("file system error on trying to rename partition folder [errno=").$(ff.errno())
                            .$(",from=").$(path).$(",to=").$(other).I$();
                }
            }
            path.trimTo(rootLen);
            other.trimTo(rootLen);
        }

        return StatusCode.OK;
    }

    public void changeCacheFlag(int columnIndex, boolean cache) {
        checkDistressed();

        commit();

        SymbolMapWriter symbolMapWriter = getSymbolMapWriter(columnIndex);
        if (symbolMapWriter.isCached() != cache) {
            symbolMapWriter.updateCacheFlag(cache);
        } else {
            return;
        }

        txFile.bumpStructureVersion(this.denseSymbolMapWriters);
    }

    @Override
    public void close() {
        if (null != blockWriter) {
            blockWriter.clear();
        }
        if (isOpen() && lifecycleManager.close()) {
            doClose(true);
        }
    }

    public void commit() {
        commit(defaultCommitMode);
    }

    public void commit(int commitMode) {
        commit(commitMode, 0);
    }

    public boolean checkMaxAndCommitHysteresis() {
        if (getO3RowCount() < metadata.getO3MaxUncommittedRows()) {
            return false;
        }
        commitHysteresis();
        return true;
    }

    public void commitHysteresis() {
        commit(defaultCommitMode, metadata.getO3CommitHysteresisInMicros());
    }
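    // Usage sketch (illustrative): O3-aware ingestion can bound uncommitted rows by
    // calling checkMaxAndCommitHysteresis() periodically; it commits with the configured
    // hysteresis once metadata.getO3MaxUncommittedRows() is exceeded and reports whether
    // it did so.
    //
    //     if (!writer.checkMaxAndCommitHysteresis()) {
    //         // under the threshold; keep appending
    //     }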
    public void commitHysteresis(long lastTimestampHysteresisInMicros) {
        commit(defaultCommitMode, lastTimestampHysteresisInMicros);
    }

    public int getColumnIndex(CharSequence name) {
        int index = metadata.getColumnIndexQuiet(name);
        if (index > -1) {
            return index;
        }
        throw CairoException.instance(0).put("Invalid column name: ").put(name);
    }

    public String getDesignatedTimestampColumnName() {
        return designatedTimestampColumnName;
    }

    public FilesFacade getFilesFacade() {
        return ff;
    }

    public long getMaxTimestamp() {
        return txFile.getMaxTimestamp();
    }

    public TableWriterMetadata getMetadata() {
        return metadata;
    }

    public int getPartitionBy() {
        return partitionBy;
    }

    public int getPartitionCount() {
        return txFile.getPartitionCount();
    }

    public long getStructureVersion() {
        return txFile.getStructureVersion();
    }

    public int getSymbolIndex(int columnIndex, CharSequence symValue) {
        return symbolMapWriters.getQuick(columnIndex).put(symValue);
    }

    public CharSequence getTableName() {
        return tableName;
    }

    public long getTxn() {
        return txFile.getTxn();
    }

    public TxnScoreboard getTxnScoreboard() {
        return txnScoreboard;
    }

    public boolean inTransaction() {
        return txFile != null && (txFile.inTransaction() || hasO3());
    }

    public boolean isOpen() {
        return tempMem16b != 0;
    }

    public TableBlockWriter newBlock() {
        bumpMasterRef();
        txFile.newBlock();
        blockWriter.open(this);
        return blockWriter;
    }

    public Row newRow(long timestamp) {
        return rowFunction.newRow(timestamp);
    }

    public Row newRow() {
        return newRow(0L);
    }

    public void o3BumpErrorCount() {
        o3ErrorCount.incrementAndGet();
    }

    public long partitionNameToTimestamp(CharSequence partitionName) {
        if (partitionDirFmt == null) {
            throw CairoException.instance(0).put("table is not partitioned");
        }
        try {
            return partitionDirFmt.parse(partitionName, null);
        } catch (NumericException e) {
            final CairoException ee = CairoException.instance(0);
            switch (partitionBy) {
                case PartitionBy.DAY:
                    ee.put("'YYYY-MM-DD'");
                    break;
                case PartitionBy.MONTH:
                    ee.put("'YYYY-MM'");
                    break;
                default:
                    ee.put("'YYYY'");
                    break;
            }
            ee.put(" expected");
            throw ee;
        }
    }
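    // Usage sketch (illustrative; column indexes and values are assumptions, and the
    // putDouble/putSym/append calls are the standard TableWriter.Row API rather than
    // anything shown in this section): the canonical append loop via newRow() above.
    //
    //     TableWriter.Row r = writer.newRow(timestampMicros);
    //     r.putDouble(0, 1.25);
    //     r.putSym(1, "venue");
    //     r.append();
    //     writer.commit();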
    public void removeColumn(CharSequence name) {
        checkDistressed();

        final int index = getColumnIndex(name);
        final int type = metadata.getColumnType(index);

        LOG.info().$("removing column '").utf8(name).$("' from ").$(path).$();

        // check if we are moving timestamp from a partitioned table
        final int timestampIndex = metaMem.getInt(META_OFFSET_TIMESTAMP_INDEX);
        boolean timestamp = index == timestampIndex;

        if (timestamp && partitionBy != PartitionBy.NONE) {
            throw CairoException.instance(0).put("Cannot remove timestamp from partitioned table");
        }

        commit();

        final CharSequence timestampColumnName = timestampIndex != -1 ? metadata.getColumnName(timestampIndex) : null;

        this.metaSwapIndex = removeColumnFromMeta(index);

        // close _meta so we can rename it
        metaMem.close();

        // rename _meta to _meta.prev
        renameMetaToMetaPrev(name);

        // after we moved _meta to _meta.prev
        // we have to have _todo to restore _meta should anything go wrong
        writeRestoreMetaTodo(name);

        // rename _meta.swp to _meta
        renameSwapMetaToMeta(name);

        // remove column objects
        removeColumn(index);

        // remove symbol map writer or entry for such
        removeSymbolMapWriter(index);

        // decrement column count
        columnCount--;

        // reset timestamp limits
        if (timestamp) {
            txFile.resetTimestamp();
            timestampSetter = value -> {
            };
        }

        try {
            // open _meta file
            openMetaFile(ff, path, rootLen, metaMem);

            // remove _todo
            clearTodoLog();

            // remove column files has to be done after _todo is removed
            removeColumnFiles(name, type, REMOVE_OR_LOG);
        } catch (CairoException err) {
            throwDistressException(err);
        }

        txFile.bumpStructureVersion(this.denseSymbolMapWriters);

        metadata.removeColumn(name);
        if (timestamp) {
            metadata.setTimestampIndex(-1);
        } else if (timestampColumnName != null) {
            int timestampIndex2 = metadata.getColumnIndex(timestampColumnName);
            metadata.setTimestampIndex(timestampIndex2);
            o3TimestampMem = o3Columns.getQuick(getPrimaryColumnIndex(timestampIndex2));
        }

        LOG.info().$("REMOVED column '").utf8(name).$("' from ").$(path).$();
    }

    public boolean removePartition(long timestamp) {
        long minTimestamp = txFile.getMinTimestamp();
        long maxTimestamp = txFile.getMaxTimestamp();

        if (partitionBy == PartitionBy.NONE) {
            return false;
        }

        timestamp = getPartitionLo(timestamp);
        if (timestamp < getPartitionLo(minTimestamp) || timestamp > maxTimestamp) {
            return false;
        }

        if (timestamp == getPartitionLo(maxTimestamp)) {
            LOG.error()
                    .$("cannot remove active partition [path=").$(path)
                    .$(", maxTimestamp=").$ts(maxTimestamp)
                    .$(']').$();
            return false;
        }

        if (!txFile.attachedPartitionsContains(timestamp)) {
            LOG.error().$("partition is already detached [path=").$(path).$(']').$();
            return false;
        }

        try {
            // when we want to delete first partition we must find out
            // minTimestamp from next partition if it exists or next partition and so on
            //
            // when somebody removed data directories manually and then
            // attempts to tidy up metadata with logical partition delete
            // we have to uphold the effort and re-compute table size and its minTimestamp from
            // what remains on disk

            // find out if we are removing min partition
            setStateForTimestamp(path, timestamp, false);
            long nextMinTimestamp = minTimestamp;
            if (timestamp == txFile.getPartitionTimestamp(0)) {
                nextMinTimestamp = readMinTimestamp(txFile.getPartitionTimestamp(1));
            }

            txFile.beginPartitionSizeUpdate();
            txFile.removeAttachedPartitions(timestamp);
            txFile.setMinTimestamp(nextMinTimestamp);
            txFile.finishPartitionSizeUpdate(nextMinTimestamp, txFile.getMaxTimestamp());
            txFile.commit(defaultCommitMode, denseSymbolMapWriters);

            if (ff.exists(path.$())) {
                int errno;
                if ((errno = ff.rmdir(path.chop$().slash$())) != 0) {
                    LOG.info().$("partition directory delete is postponed [path=").$(path)
                            .$(", errno=").$(errno)
                            .$(']').$();
                } else {
                    LOG.info().$("partition marked for delete [path=").$(path).$(']').$();
                }
            } else {
                LOG.info().$("partition absent on disk now detached from table [path=").$(path).$(']').$();
            }
            return true;
        } finally {
            path.trimTo(rootLen);
        }
    }
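    // Usage sketch (illustrative): dropping a partition by any timestamp that falls
    // inside it; the value is floored to the partition lo above. Returns false for the
    // active (max) partition and for timestamps outside the table's range.
    //
    //     long someTimestampInPartition = ...; // any micros within the target partition
    //     boolean removed = writer.removePartition(someTimestampInPartition);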
    public void removePartition(Function function, int posForError) throws SqlException {
        if (partitionBy == PartitionBy.NONE) {
            throw SqlException.$(posForError, "table is not partitioned");
        }

        if (txFile.getPartitionCount() == 0) {
            throw SqlException.$(posForError, "table is empty");
        } else {
            // Drop partitions in descending order so if folders are missing on disk
            // removePartition does not fail to determine next minTimestamp
            for (int i = txFile.getPartitionCount() - 1; i > -1; i--) {
                long partitionTimestamp = txFile.getPartitionTimestamp(i);
                dropPartitionFunctionRec.setTimestamp(partitionTimestamp);
                if (function.getBool(dropPartitionFunctionRec)) {
                    removePartition(partitionTimestamp);
                }
            }
        }
    }

    public void renameColumn(CharSequence currentName, CharSequence newName) {
        checkDistressed();

        final int index = getColumnIndex(currentName);
        final int type = metadata.getColumnType(index);

        LOG.info().$("renaming column '").utf8(currentName).$("' to '").utf8(newName).$("' from ").$(path).$();

        commit();

        this.metaSwapIndex = renameColumnFromMeta(index, newName);

        // close _meta so we can rename it
        metaMem.close();

        // rename _meta to _meta.prev
        renameMetaToMetaPrev(currentName);

        // after we moved _meta to _meta.prev
        // we have to have _todo to restore _meta should anything go wrong
        writeRestoreMetaTodo(currentName);

        // rename _meta.swp to _meta
        renameSwapMetaToMeta(currentName);

        try {
            // open _meta file
            openMetaFile(ff, path, rootLen, metaMem);

            // remove _todo
            clearTodoLog();

            // rename column files has to be done after _todo is removed
            renameColumnFiles(currentName, newName, type);
        } catch (CairoException err) {
            throwDistressException(err);
        }

        txFile.bumpStructureVersion(this.denseSymbolMapWriters);

        metadata.renameColumn(currentName, newName);

        if (index == metadata.getTimestampIndex()) {
            designatedTimestampColumnName = Chars.toString(newName);
        }

        LOG.info().$("RENAMED column '").utf8(currentName).$("' to '").utf8(newName).$("' from ").$(path).$();
    }

    public void rollback() {
        checkDistressed();
        if (o3InError || inTransaction()) {
            try {
                LOG.info().$("tx rollback [name=").$(tableName).$(']').$();
                if ((masterRef & 1) != 0) {
                    masterRef++;
                }
                freeColumns(false);
                this.txFile.readUnchecked();
                rollbackIndexes();
                rollbackSymbolTables();
                purgeUnusedPartitions();
                configureAppendPosition();
                o3InError = false;
                LOG.info().$("tx rollback complete [name=").$(tableName).$(']').$();
            } catch (Throwable e) {
                LOG.error().$("could not perform rollback [name=").$(tableName).$(", msg=").$(e.getMessage()).$(']').$();
                distressed = true;
            }
        }
    }

    public void setLifecycleManager(LifecycleManager lifecycleManager) {
        this.lifecycleManager = lifecycleManager;
    }

    public long size() {
        return txFile.getRowCount();
    }

    @Override
    public String toString() {
        return "TableWriter{" +
                "name=" + tableName +
                '}';
    }

    public void transferLock(long lockFd) {
        assert lockFd != -1;
        this.lockFd = lockFd;
    }
    /**
     * Truncates table. When operation is unsuccessful it throws CairoException. With that truncate can be
     * retried or alternatively table can be closed. Outcome of any other operation with the table is undefined
     * and likely to cause segmentation fault. When table re-opens any partial truncate will be retried.
     */
    public final void truncate() {

        // we do this before size check so that "old" corrupt symbol tables are brought back in line
        for (int i = 0, n = denseSymbolMapWriters.size(); i < n; i++) {
            denseSymbolMapWriters.getQuick(i).truncate();
        }

        if (size() == 0) {
            return;
        }

        // this is a crude block to test things for now
        todoMem.putLong(0, ++todoTxn); // write txn, reader will first read txn at offset 24 and then at offset 0
        Unsafe.getUnsafe().storeFence(); // make sure we do not write hash before writing txn (view from another thread)
        todoMem.putLong(8, configuration.getDatabaseIdLo()); // write out our instance hashes
        todoMem.putLong(16, configuration.getDatabaseIdHi());
        Unsafe.getUnsafe().storeFence();
        todoMem.putLong(24, todoTxn);
        todoMem.putLong(32, 1);
        todoMem.putLong(40, TODO_TRUNCATE);
        todoMem.setSize(48);

        for (int i = 0; i < columnCount; i++) {
            getPrimaryColumn(i).truncate();
            AppendOnlyVirtualMemory mem = getSecondaryColumn(i);
            if (mem != null) {
                mem.truncate();
            }
        }

        if (partitionBy != PartitionBy.NONE) {
            freeColumns(false);
            if (indexers != null) {
                for (int i = 0, n = indexers.size(); i < n; i++) {
                    Misc.free(indexers.getQuick(i));
                }
            }
            removePartitionDirectories();
            rowFunction = openPartitionFunction;
        }

        txFile.resetTimestamp();
        txFile.truncate();

        try {
            clearTodoLog();
        } catch (CairoException err) {
            throwDistressException(err);
        }

        LOG.info().$("truncated [name=").$(tableName).$(']').$();
    }
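    // Usage sketch (illustrative): truncate keeps the schema but removes all data. Per
    // the javadoc above, a CairoException here may be retried, or the writer closed;
    // any other operation on the table after a failed truncate is undefined.
    //
    //     try {
    //         writer.truncate();
    //     } catch (CairoException e) {
    //         // retry truncate() or close the writer
    //     }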
    public void updateMetadataVersion() {
        checkDistressed();

        commit();
        // create new _meta.swp
        this.metaSwapIndex = copyMetadataAndUpdateVersion();

        // close _meta so we can rename it
        metaMem.close();

        // rename _meta to _meta.prev
        this.metaPrevIndex = rename(fileOperationRetryCount);

        // rename _meta.swp to _meta
        restoreMetaFrom(META_SWAP_FILE_NAME, metaSwapIndex);

        try {
            // open _meta file
            openMetaFile(ff, path, rootLen, metaMem);
        } catch (CairoException err) {
            throwDistressException(err);
        }

        txFile.bumpStructureVersion(this.denseSymbolMapWriters);
        metadata.setTableVersion();
    }

    public void updateSymbols(int columnIndex, SymbolMapReader symReader) {
        int nSourceSymbols = symReader.size();
        SymbolMapWriter symWriter = getSymbolMapWriter(columnIndex);
        int nDestinationSymbols = symWriter.getSymbolCount();

        if (nSourceSymbols > nDestinationSymbols) {
            long address = symReader.symbolCharsAddressOf(nDestinationSymbols);
            long addressHi = symReader.symbolCharsAddressOf(nSourceSymbols);
            symWriter.appendSymbolCharsBlock(addressHi - address, address);
        }
    }

    /**
     * Eagerly sets up writer instance. Otherwise writer will initialize lazily. Invoking this method could improve
     * performance of some applications. UDP receivers use this in order to avoid initial receive buffer contention.
     */
    public void warmUp() {
        Row r = newRow(txFile.getMaxTimestamp());
        try {
            for (int i = 0; i < columnCount; i++) {
                r.putByte(i, (byte) 0);
            }
        } finally {
            r.cancel();
        }
    }

    private static void removeFileAndOrLog(FilesFacade ff, LPSZ name) {
        if (ff.exists(name)) {
            if (ff.remove(name)) {
                LOG.info().$("removed: ").$(name).$();
            } else {
                LOG.error().$("cannot remove: ").utf8(name).$(" [errno=").$(ff.errno()).$(']').$();
            }
        }
    }

    private static void renameFileOrLog(FilesFacade ff, LPSZ name, LPSZ to) {
        if (ff.exists(name)) {
            if (ff.rename(name, to)) {
                LOG.info().$("renamed: ").$(name).$();
            } else {
                LOG.error().$("cannot rename: ").utf8(name).$(" [errno=").$(ff.errno()).$(']').$();
            }
        }
    }

    static void indexAndCountDown(ColumnIndexer indexer, long lo, long hi, SOCountDownLatch latch) {
        try {
            indexer.refreshSourceAndIndex(lo, hi);
        } catch (CairoException e) {
            indexer.distress();
            LOG.error().$("index error [fd=").$(indexer.getFd()).$(']').$('{').$((Sinkable) e).$('}').$();
        } finally {
            latch.countDown();
        }
    }

    private static void removeOrException(FilesFacade ff, LPSZ path) {
        if (ff.exists(path) && !ff.remove(path)) {
            throw CairoException.instance(ff.errno()).put("Cannot remove ").put(path);
        }
    }

    private static void setColumnSize(
            FilesFacade ff,
            AppendOnlyVirtualMemory mem1,
            AppendOnlyVirtualMemory mem2,
            int type,
            long actualPosition,
            long buf,
            boolean ensureFileSize
    ) {
        long offset;
        long len;
        long mem1Size;
        if (actualPosition > 0) {
            // subtract column top
            switch (type) {
                case ColumnType.BINARY:
                    assert mem2 != null;
                    readOffsetBytes(ff, mem2, actualPosition, buf);
                    offset = Unsafe.getUnsafe().getLong(buf);
                    readBytes(ff, mem1, buf, Long.BYTES, offset, "Cannot read length, fd=");
                    len = Unsafe.getUnsafe().getLong(buf);
                    mem1Size = len == -1 ? offset + Long.BYTES : offset + len + Long.BYTES;
                    if (ensureFileSize) {
                        mem1.ensureFileSize(mem1.pageIndex(mem1Size));
                        mem2.ensureFileSize(mem2.pageIndex(actualPosition * Long.BYTES));
                    }
                    mem1.setSize(mem1Size);
                    mem2.setSize(actualPosition * Long.BYTES);
                    break;
                case ColumnType.STRING:
                    assert mem2 != null;
                    readOffsetBytes(ff, mem2, actualPosition, buf);
                    offset = Unsafe.getUnsafe().getLong(buf);
                    readBytes(ff, mem1, buf, Integer.BYTES, offset, "Cannot read length, fd=");
                    len = Unsafe.getUnsafe().getInt(buf);
                    mem1Size = len == -1 ? offset + Integer.BYTES : offset + len * Character.BYTES + Integer.BYTES;
                    if (ensureFileSize) {
                        mem1.ensureFileSize(mem1.pageIndex(mem1Size));
                        mem2.ensureFileSize(mem2.pageIndex(actualPosition * Long.BYTES));
                    }
                    mem1.setSize(mem1Size);
                    mem2.setSize(actualPosition * Long.BYTES);
                    break;
                default:
                    mem1Size = actualPosition << ColumnType.pow2SizeOf(type);
                    if (ensureFileSize) {
                        mem1.ensureFileSize(mem1.pageIndex(mem1Size));
                    }
                    mem1.setSize(mem1Size);
                    break;
            }
        } else {
            mem1.setSize(0);
            if (mem2 != null) {
                mem2.setSize(0);
            }
        }
    }
    /**
     * This is an O(n) method to find if column by the same name already exists. The benefit of poor performance
     * is that we don't keep column name strings on heap. We only use this method when adding new column, where
     * high performance of name check does not matter much.
     *
     * @param name to check
     * @return 0 based column index.
     */
    private static int getColumnIndexQuiet(MappedReadOnlyMemory metaMem, CharSequence name, int columnCount) {
        long nameOffset = getColumnNameOffset(columnCount);
        for (int i = 0; i < columnCount; i++) {
            CharSequence col = metaMem.getStr(nameOffset);
            if (Chars.equalsIgnoreCase(col, name)) {
                return i;
            }
            nameOffset += VmUtils.getStorageLength(col);
        }
        return -1;
    }

    private static void readOffsetBytes(FilesFacade ff, AppendOnlyVirtualMemory mem, long position, long buf) {
        readBytes(ff, mem, buf, 8, (position - 1) * 8, "could not read offset, fd=");
    }

    private static void readBytes(FilesFacade ff, AppendOnlyVirtualMemory mem, long buf, int byteCount, long offset, CharSequence errorMsg) {
        if (ff.read(mem.getFd(), buf, byteCount, offset) != byteCount) {
            throw CairoException.instance(ff.errno()).put(errorMsg).put(mem.getFd()).put(", offset=").put(offset);
        }
    }

    private static void configureNullSetters(ObjList<Runnable> nullers, int type, WriteOnlyVirtualMemory mem1, WriteOnlyVirtualMemory mem2) {
        switch (type) {
            case ColumnType.BOOLEAN:
            case ColumnType.BYTE:
                nullers.add(() -> mem1.putByte((byte) 0));
                break;
            case ColumnType.DOUBLE:
                nullers.add(() -> mem1.putDouble(Double.NaN));
                break;
            case ColumnType.FLOAT:
                nullers.add(() -> mem1.putFloat(Float.NaN));
                break;
            case ColumnType.INT:
                nullers.add(() -> mem1.putInt(Numbers.INT_NaN));
                break;
            case ColumnType.LONG:
            case ColumnType.DATE:
            case ColumnType.TIMESTAMP:
                nullers.add(() -> mem1.putLong(Numbers.LONG_NaN));
                break;
            case ColumnType.LONG256:
                nullers.add(() -> mem1.putLong256(Numbers.LONG_NaN, Numbers.LONG_NaN, Numbers.LONG_NaN, Numbers.LONG_NaN));
                break;
            case ColumnType.SHORT:
                nullers.add(() -> mem1.putShort((short) 0));
                break;
            case ColumnType.CHAR:
                nullers.add(() -> mem1.putChar((char) 0));
                break;
            case ColumnType.STRING:
                nullers.add(() -> mem2.putLong(mem1.putNullStr()));
                break;
            case ColumnType.SYMBOL:
                nullers.add(() -> mem1.putInt(SymbolTable.VALUE_IS_NULL));
                break;
            case ColumnType.BINARY:
                nullers.add(() -> mem2.putLong(mem1.putNullBin()));
                break;
            default:
                break;
        }
    }

    private static void openMetaFile(FilesFacade ff, Path path, int rootLen, MappedReadOnlyMemory metaMem) {
        path.concat(META_FILE_NAME).$();
        try {
            metaMem.of(ff, path, ff.getPageSize(), ff.length(path));
        } finally {
            path.trimTo(rootLen);
        }
    }

    private static void attachPartitionCheckFilesMatchMetadata(FilesFacade ff, Path path, RecordMetadata metadata, long partitionSize) throws CairoException {
        // for each column, check that file exist in the partition folder
        int rootLen = path.length();
        for (int columnIndex = 0, size = metadata.getColumnCount(); columnIndex < size; columnIndex++) {
            try {
                int columnType = metadata.getColumnType(columnIndex);
                final CharSequence columnName = metadata.getColumnName(columnIndex);
                path.concat(columnName);

                switch (columnType) {
                    case ColumnType.INT:
                    case ColumnType.LONG:
                    case ColumnType.BOOLEAN:
                    case ColumnType.BYTE:
                    case ColumnType.TIMESTAMP:
                    case ColumnType.DATE:
                    case ColumnType.DOUBLE:
                    case ColumnType.CHAR:
                    case ColumnType.SHORT:
                    case ColumnType.FLOAT:
                    case ColumnType.LONG256:
                        // Consider Symbols as fixed, check data file size
                    case ColumnType.SYMBOL:
                        attachPartitionCheckFilesMatchFixedColumn(ff, path, columnType, partitionSize);
                        break;
                    case ColumnType.STRING:
                    case ColumnType.BINARY:
                        attachPartitionCheckFilesMatchVarLenColumn(ff, path, partitionSize);
                        break;
                }
            } finally {
                path.trimTo(rootLen);
            }
        }
    }
    private static void attachPartitionCheckFilesMatchVarLenColumn(FilesFacade ff, Path path, long partitionSize) {
        int pathLen = path.length();
        path.put(FILE_SUFFIX_I).$();

        if (ff.exists(path)) {
            int typeSize = 4;
            long fileSize = ff.length(path);
            if (fileSize < partitionSize * typeSize) {
                throw CairoException.instance(0).put("Column file row count does not match timestamp file row count. " +
                        "Partition files inconsistent [file=")
                        .put(path)
                        .put(",expectedSize=")
                        .put(partitionSize * typeSize)
                        .put(",actual=")
                        .put(fileSize)
                        .put(']');
            }

            path.trimTo(pathLen);
            path.put(FILE_SUFFIX_D).$();
            if (ff.exists(path)) {
                // good
                return;
            }
        }
        throw CairoException.instance(0).put("Column file does not exist [path=").put(path).put(']');
    }

    private static void attachPartitionCheckFilesMatchFixedColumn(FilesFacade ff, Path path, int columnType, long partitionSize) {
        path.put(FILE_SUFFIX_D).$();
        if (ff.exists(path)) {
            long fileSize = ff.length(path);
            if (fileSize < partitionSize << ColumnType.pow2SizeOf(columnType)) {
                throw CairoException.instance(0).put("Column file row count does not match timestamp file row count. " +
                        "Partition files inconsistent [file=")
                        .put(path)
                        .put(",expectedSize=")
                        .put(partitionSize << ColumnType.pow2SizeOf(columnType))
                        .put(",actual=")
                        .put(fileSize)
                        .put(']');
            }
            return;
        }
        throw CairoException.instance(0).put("Column file does not exist [path=").put(path).put(']');
    }

    private int addColumnToMeta(
            CharSequence name,
            int type,
            boolean indexFlag,
            int indexValueBlockCapacity,
            boolean sequentialFlag
    ) {
        int index;
        try {
            index = openMetaSwapFile(ff, ddlMem, path, rootLen, configuration.getMaxSwapFileCount());
            int columnCount = metaMem.getInt(META_OFFSET_COUNT);

            ddlMem.putInt(columnCount + 1);
            ddlMem.putInt(metaMem.getInt(META_OFFSET_PARTITION_BY));
            ddlMem.putInt(metaMem.getInt(META_OFFSET_TIMESTAMP_INDEX));
            ddlMem.putInt(ColumnType.VERSION);
            ddlMem.putInt(metaMem.getInt(META_OFFSET_TABLE_ID));
            ddlMem.jumpTo(META_OFFSET_COLUMN_TYPES);

            for (int i = 0; i < columnCount; i++) {
                writeColumnEntry(i);
            }

            // add new column metadata to bottom of list
            ddlMem.putByte((byte) type);
            long flags = 0;
            if (indexFlag) {
                flags |= META_FLAG_BIT_INDEXED;
            }

            if (sequentialFlag) {
                flags |= META_FLAG_BIT_SEQUENTIAL;
            }

            ddlMem.putLong(flags);
            ddlMem.putInt(indexValueBlockCapacity);
            ddlMem.skip(META_COLUMN_DATA_RESERVED);

            long nameOffset = getColumnNameOffset(columnCount);
            for (int i = 0; i < columnCount; i++) {
                CharSequence columnName = metaMem.getStr(nameOffset);
                ddlMem.putStr(columnName);
                nameOffset += VmUtils.getStorageLength(columnName);
            }
            ddlMem.putStr(name);
        } finally {
            ddlMem.close();
        }
        return index;
    }

    private void bumpMasterRef() {
        if ((masterRef & 1) == 0) {
            masterRef++;
        } else {
            cancelRowAndBump();
        }
    }
    void cancelRow() {

        if ((masterRef & 1) == 0) {
            return;
        }

        long dirtyMaxTimestamp = txFile.getMaxTimestamp();
        long dirtyTransientRowCount = txFile.getTransientRowCount();
        long rollbackToMaxTimestamp = txFile.cancelToMaxTimestamp();
        long rollbackToTransientRowCount = txFile.cancelToTransientRowCount();

        // dirty timestamp should be 1 because newRow() increments it
        if (dirtyTransientRowCount == 1) {
            if (partitionBy != PartitionBy.NONE) {
                // we have to undo creation of partition
                freeColumns(false);
                if (removeDirOnCancelRow) {
                    try {
                        setStateForTimestamp(path, dirtyMaxTimestamp, false);
                        int errno;
                        if ((errno = ff.rmdir(path.$())) != 0) {
                            throw CairoException.instance(errno).put("Cannot remove directory: ").put(path);
                        }
                        removeDirOnCancelRow = false;
                    } finally {
                        path.trimTo(rootLen);
                    }
                }

                // open old partition
                if (rollbackToMaxTimestamp > Long.MIN_VALUE) {
                    try {
                        openPartition(rollbackToMaxTimestamp);
                        setAppendPosition(rollbackToTransientRowCount, false);
                    } catch (Throwable e) {
                        freeColumns(false);
                        throw e;
                    }
                } else {
                    rowFunction = openPartitionFunction;
                }

                // undo counts
                removeDirOnCancelRow = true;
                txFile.cancelRow();
            } else {
                txFile.cancelRow();
                // we only have one partition, jump to start on every column
                for (int i = 0; i < columnCount; i++) {
                    getPrimaryColumn(i).setSize(0);
                    AppendOnlyVirtualMemory mem = getSecondaryColumn(i);
                    if (mem != null) {
                        mem.setSize(0);
                    }
                }
            }
        } else {
            txFile.cancelRow();
            // we are staying within same partition, prepare append positions for row count
            boolean rowChanged = false;
            // verify if any of the columns have been changed
            // if not - we don't have to do anything
            for (int i = 0; i < columnCount; i++) {
                if (refs.getQuick(i) == masterRef) {
                    rowChanged = true;
                    break;
                }
            }

            // if no column has been changed we take easy option and do nothing
            if (rowChanged) {
                setAppendPosition(dirtyTransientRowCount - 1, false);
            }
        }
        refs.fill(0, columnCount, --masterRef);
        txFile.transientRowCount--;
    }

    private void cancelRowAndBump() {
        cancelRow();
        masterRef++;
    }

    private void checkDistressed() {
        if (!distressed) {
            return;
        }
        throw new CairoError("Table '" + tableName.toString() + "' is distressed");
    }

    private void clearTodoLog() {
        try {
            todoMem.putLong(0, ++todoTxn); // write txn, reader will first read txn at offset 24 and then at offset 0
            Unsafe.getUnsafe().storeFence(); // make sure we do not write hash before writing txn (view from another thread)
            todoMem.putLong(8, 0); // write out our instance hashes
            todoMem.putLong(16, 0);
            Unsafe.getUnsafe().storeFence();
            todoMem.putLong(32, 0);
            Unsafe.getUnsafe().storeFence();
            todoMem.putLong(24, todoTxn);
            todoMem.setSize(40);
        } finally {
            path.trimTo(rootLen);
        }
    }

    void closeActivePartition() {
        LOG.info().$("closing last partition [table=").$(tableName).I$();
        closeAppendMemoryNoTruncate(true);
        Misc.freeObjList(denseIndexers);
        denseIndexers.clear();
    }

    void closeActivePartition(long size) {
        for (int i = 0; i < columnCount; i++) {
            // stop calculating oversize as soon as we find first over-sized column
            final AppendOnlyVirtualMemory mem1 = getPrimaryColumn(i);
            final AppendOnlyVirtualMemory mem2 = getSecondaryColumn(i);
            setColumnSize(
                    ff,
                    mem1,
                    mem2,
                    getColumnType(metaMem, i),
                    size - columnTops.getQuick(i),
                    tempMem16b,
                    false
            );
            Misc.free(mem1);
            Misc.free(mem2);
        }
        Misc.freeObjList(denseIndexers);
        denseIndexers.clear();
    }

    private void closeAppendMemoryNoTruncate(boolean truncate) {
        for (int i = 0, n = columns.size(); i < n; i++) {
            AppendOnlyVirtualMemory m = columns.getQuick(i);
            if (m != null) {
                m.close(truncate);
            }
        }
    }

    /**
     * Commits newly added rows of data. This method updates transaction file with pointers to end of appended data.
     * <p>
     * <b>Pending rows</b>
     * <p>
     * This method will cancel pending rows by calling {@link #cancelRow()}. Data in partially appended row will be lost.
     *
     * @param commitMode                      commit durability mode.
     * @param lastTimestampHysteresisInMicros if > 0 then do a partial commit, leaving the rows within the hysteresis in a new uncommitted transaction
     */
    private void commit(int commitMode, long lastTimestampHysteresisInMicros) {

        checkDistressed();

        if (o3InError) {
            rollback();
            return;
        }

        if ((masterRef & 1) != 0) {
            cancelRow();
        }

        if (inTransaction()) {
            if (hasO3() && o3Commit(lastTimestampHysteresisInMicros)) {
                return;
            }

            if (commitMode != CommitMode.NOSYNC) {
                syncColumns(commitMode);
            }

            updateIndexes();
            txFile.commit(commitMode, this.denseSymbolMapWriters);
            o3ProcessPartitionRemoveCandidates();
        }
    }

    void commitBlock(long firstTimestamp) {
        if (txFile.getMinTimestamp() == Long.MAX_VALUE) {
            txFile.setMinTimestamp(firstTimestamp);
        }

        for (int i = 0; i < columnCount; i++) {
            refs.setQuick(i, masterRef);
        }

        masterRef++;
        commit();

        setAppendPosition(txFile.getTransientRowCount(), true);
    }

    private void configureAppendPosition() {
        if (this.txFile.getMaxTimestamp() > Long.MIN_VALUE || partitionBy == PartitionBy.NONE) {
            openFirstPartition(this.txFile.getMaxTimestamp());
            if (partitionBy == PartitionBy.NONE) {
                if (metadata.getTimestampIndex() < 0) {
                    rowFunction = noTimestampFunction;
                } else {
                    rowFunction = noPartitionFunction;
                    timestampSetter = appendTimestampSetter;
                }
            } else {
                rowFunction = switchPartitionFunction;
                timestampSetter = appendTimestampSetter;
            }
        } else {
            rowFunction = openPartitionFunction;
            timestampSetter = appendTimestampSetter;
        }
        row.activeColumns = columns;
    }

    private void configureColumn(int type, boolean indexFlag) {
        final AppendOnlyVirtualMemory primary = new AppendOnlyVirtualMemory();
        final AppendOnlyVirtualMemory secondary;
        final ContiguousVirtualMemory oooPrimary = new ContiguousVirtualMemory(MEM_PAGE_SIZE, Integer.MAX_VALUE);
        final ContiguousVirtualMemory oooSecondary;
        final ContiguousVirtualMemory oooPrimary2 = new ContiguousVirtualMemory(MEM_PAGE_SIZE, Integer.MAX_VALUE);
        final ContiguousVirtualMemory oooSecondary2;

        switch (type) {
            case ColumnType.BINARY:
            case ColumnType.STRING:
                secondary = new AppendOnlyVirtualMemory();
                oooSecondary = new ContiguousVirtualMemory(MEM_PAGE_SIZE, Integer.MAX_VALUE);
                oooSecondary2 = new ContiguousVirtualMemory(MEM_PAGE_SIZE, Integer.MAX_VALUE);
                break;
            default:
                secondary = null;
                oooSecondary = null;
                oooSecondary2 = null;
                break;
        }
        columns.add(primary);
        columns.add(secondary);
        o3Columns.add(oooPrimary);
        o3Columns.add(oooSecondary);
        o3Columns2.add(oooPrimary2);
        o3Columns2.add(oooSecondary2);
        configureNullSetters(nullSetters, type, primary, secondary);
        configureNullSetters(o3NullSetters, type, oooPrimary, oooSecondary);
        if (indexFlag) {
            indexers.extendAndSet((columns.size() - 1) / 2, new SymbolColumnIndexer());
            populateDenseIndexerList();
        }
        refs.add(0);
    }
    private void configureColumnMemory() {
        this.symbolMapWriters.setPos(columnCount);
        for (int i = 0; i < columnCount; i++) {
            int type = metadata.getColumnType(i);
            configureColumn(type, metadata.isColumnIndexed(i));

            if (type == ColumnType.SYMBOL) {
                final int symbolIndex = denseSymbolMapWriters.size();
                WriterTransientSymbolCountChangeHandler transientSymbolCountChangeHandler = new WriterTransientSymbolCountChangeHandler(symbolIndex);
                denseSymbolTransientCountHandlers.add(transientSymbolCountChangeHandler);
                SymbolMapWriter symbolMapWriter = new SymbolMapWriter(configuration, path.trimTo(rootLen), metadata.getColumnName(i), txFile.readSymbolCount(symbolIndex), transientSymbolCountChangeHandler);
                symbolMapWriters.extendAndSet(i, symbolMapWriter);
                denseSymbolMapWriters.add(symbolMapWriter);
            }

            if (metadata.isColumnIndexed(i)) {
                indexers.extendAndSet(i, new SymbolColumnIndexer());
            }
        }
        final int timestampIndex = metadata.getTimestampIndex();
        if (timestampIndex != -1) {
            o3TimestampMem = o3Columns.getQuick(getPrimaryColumnIndex(timestampIndex));
            o3TimestampMemCpy = new ContiguousVirtualMemory(MEM_PAGE_SIZE, Integer.MAX_VALUE);
        }
        populateDenseIndexerList();
    }

    private LongConsumer configureTimestampSetter() {
        int index = metadata.getTimestampIndex();
        if (index == -1) {
            return value -> {
            };
        } else {
            nullSetters.setQuick(index, NOOP);
            o3NullSetters.setQuick(index, NOOP);
            return getPrimaryColumn(index)::putLong;
        }
    }

    private void consumeO3PartitionRemoveTasks() {
        // consume discovery jobs
        final RingQueue<O3PurgeDiscoveryTask> discoveryQueue = messageBus.getO3PurgeDiscoveryQueue();
        final Sequence discoverySubSeq = messageBus.getO3PurgeDiscoverySubSeq();
        final RingQueue<O3PurgeTask> purgeQueue = messageBus.getO3PurgeQueue();
        final Sequence purgePubSeq = messageBus.getO3PurgePubSeq();
        final Sequence purgeSubSeq = messageBus.getO3PurgeSubSeq();

        if (discoverySubSeq != null) {
            while (true) {
                long cursor = discoverySubSeq.next();
                if (cursor > -1) {
                    O3PurgeDiscoveryTask task = discoveryQueue.get(cursor);
                    O3PurgeDiscoveryJob.discoverPartitions(
                            ff,
                            o3Sink,
                            o3NativeLPSZ,
                            refs, // reuse, this is only called from writer close
                            purgeQueue,
                            purgePubSeq,
                            path,
                            tableName,
                            task.getPartitionBy(),
                            task.getTimestamp(),
                            txnScoreboard
                    );
                } else if (cursor == -1) {
                    break;
                }
            }
        }

        // consume purge jobs
        if (purgeSubSeq != null) {
            while (true) {
                long cursor = purgeSubSeq.next();
                if (cursor > -1) {
                    O3PurgeTask task = purgeQueue.get(cursor);
                    O3PurgeJob.purgePartitionDir(
                            ff,
                            other,
                            task.getPartitionBy(),
                            task.getTimestamp(),
                            txnScoreboard,
                            task.getNameTxnToRemove(),
                            task.getMinTxnToExpect()
                    );
                } else if (cursor == -1) {
                    break;
                }
            }
        }
    }

    private int copyMetadataAndSetIndexed(int columnIndex, int indexValueBlockSize) {
        try {
            int index = openMetaSwapFile(ff, ddlMem, path, rootLen, configuration.getMaxSwapFileCount());
            int columnCount = metaMem.getInt(META_OFFSET_COUNT);
            ddlMem.putInt(columnCount);
            ddlMem.putInt(metaMem.getInt(META_OFFSET_PARTITION_BY));
            ddlMem.putInt(metaMem.getInt(META_OFFSET_TIMESTAMP_INDEX));
            ddlMem.putInt(ColumnType.VERSION);
            ddlMem.putInt(metaMem.getInt(META_OFFSET_TABLE_ID));
            ddlMem.jumpTo(META_OFFSET_COLUMN_TYPES);

            for (int i = 0; i < columnCount; i++) {
                if (i != columnIndex) {
                    writeColumnEntry(i);
                } else {
                    ddlMem.putByte((byte) getColumnType(metaMem, i));
                    long flags = META_FLAG_BIT_INDEXED;
                    if (isSequential(metaMem, i)) {
                        flags |= META_FLAG_BIT_SEQUENTIAL;
                    }
                    ddlMem.putLong(flags);
                    ddlMem.putInt(indexValueBlockSize);
                    ddlMem.skip(META_COLUMN_DATA_RESERVED);
                }
            }

            long nameOffset = getColumnNameOffset(columnCount);
            for (int i = 0; i < columnCount; i++) {
                CharSequence columnName = metaMem.getStr(nameOffset);
                ddlMem.putStr(columnName);
                nameOffset += VmUtils.getStorageLength(columnName);
            }
            return index;
        } finally {
            ddlMem.close();
        }
    }
    getColumnNameOffset(columnCount);
            for (int i = 0; i < columnCount; i++) {
                CharSequence columnName = metaMem.getStr(nameOffset);
                ddlMem.putStr(columnName);
                nameOffset += VmUtils.getStorageLength(columnName);
            }
            return index;
        } finally {
            ddlMem.close();
        }
    }

    /**
     * Creates bitmap index files for a column. This method uses the primary column instance as a temporary tool to
     * append index data. Therefore, it must be called before the primary column is initialized.
     *
     * @param columnName              column name
     * @param indexValueBlockCapacity approximate number of values per index key
     * @param plen                    path length to trim the shared path object to
     * @param force                   when true, recreate the index files even if they already exist
     */
    private void createIndexFiles(CharSequence columnName, int indexValueBlockCapacity, int plen, boolean force) {
        try {
            BitmapIndexUtils.keyFileName(path.trimTo(plen), columnName);

            if (!force && ff.exists(path)) {
                return;
            }

            // reuse memory column object to create index and close it at the end
            try {
                ddlMem.of(ff, path, ff.getPageSize());
                BitmapIndexWriter.initKeyMemory(ddlMem, indexValueBlockCapacity);
            } catch (CairoException e) {
                // looks like we could not create the key file properly;
                // let's not leave a half-baked file sitting around
                LOG.error()
                        .$("could not create index [name=").utf8(path)
                        .$(", errno=").$(e.getErrno())
                        .$(']').$();
                if (!ff.remove(path)) {
                    LOG.error()
                            .$("could not remove '").utf8(path).$("'. Please remove MANUALLY.")
                            .$("[errno=").$(ff.errno())
                            .$(']').$();
                }
                throw e;
            } finally {
                ddlMem.close();
            }
            if (!ff.touch(BitmapIndexUtils.valueFileName(path.trimTo(plen), columnName))) {
                LOG.error().$("could not create index [name=").$(path).$(']').$();
                throw CairoException.instance(ff.errno()).put("could not create index [name=").put(path).put(']');
            }
        } finally {
            path.trimTo(plen);
        }
    }

    private void createSymbolMapWriter(CharSequence name, int symbolCapacity, boolean symbolCacheFlag) {
        SymbolMapWriter.createSymbolMapFiles(ff, ddlMem, path, name, symbolCapacity, symbolCacheFlag);
        WriterTransientSymbolCountChangeHandler transientSymbolCountChangeHandler = new WriterTransientSymbolCountChangeHandler(denseSymbolMapWriters.size());
        denseSymbolTransientCountHandlers.add(transientSymbolCountChangeHandler);
        SymbolMapWriter w = new SymbolMapWriter(configuration, path, name, 0, transientSymbolCountChangeHandler);
        denseSymbolMapWriters.add(w);
        symbolMapWriters.extendAndSet(columnCount, w);
    }

    private void doClose(boolean truncate) {
        consumeO3PartitionRemoveTasks();
        boolean tx = inTransaction();
        freeColumns(truncate & !distressed);
        freeSymbolMapWriters();
        freeIndexers();
        Misc.free(txFile);
        Misc.free(blockWriter);
        Misc.free(metaMem);
        Misc.free(ddlMem);
        Misc.free(other);
        Misc.free(todoMem);
        try {
            releaseLock(!truncate | tx | performRecovery | distressed);
        } finally {
            Misc.free(txnScoreboard);
            Misc.free(path);
            Misc.free(o3TimestampMemCpy);
            freeTempMem();
            LOG.info().$("closed '").utf8(tableName).$('\'').$();
        }
    }

    private void executeDeferred() {
        configureColumnMemory();
        timestampSetter = configureTimestampSetter();
        this.appendTimestampSetter = timestampSetter;
        this.txFile.readRowCounts();
        configureAppendPosition();
        purgeUnusedPartitions();
        clearTodoLog();
    }

    private void freeAndRemoveColumnPair(ObjList columns, int pi, int si) {
        Misc.free(columns.getQuick(pi));
        Misc.free(columns.getQuick(si));
        columns.remove(pi);
        columns.remove(pi);
    }

    private void freeColumns(boolean truncate) {
        // null check is because this method could be called from the constructor
        if (columns != null) {
            closeAppendMemoryNoTruncate(truncate);
        }
        Misc.freeObjListAndKeepObjects(o3Columns);
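        // o3Columns2 below holds the swap buffers used by the o3 sort; as with o3Columns,
        // only the native pages are released while the wrapper objects are kept for reuse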
Misc.freeObjListAndKeepObjects(o3Columns2); } private void freeIndexers() { if (indexers != null) { Misc.freeObjList(indexers); indexers.clear(); denseIndexers.clear(); } } private void freeSymbolMapWriters() { if (denseSymbolMapWriters != null) { for (int i = 0, n = denseSymbolMapWriters.size(); i < n; i++) { Misc.free(denseSymbolMapWriters.getQuick(i)); } symbolMapWriters.clear(); } if (symbolMapWriters != null) { symbolMapWriters.clear(); } } private void freeTempMem() { if (tempMem16b != 0) { Unsafe.free(tempMem16b, 16); tempMem16b = 0; } } BitmapIndexWriter getBitmapIndexWriter(int columnIndex) { return indexers.getQuick(columnIndex).getWriter(); } long getColumnTop(int columnIndex) { return columnTops.getQuick(columnIndex); } CairoConfiguration getConfiguration() { return configuration; } Sequence getO3CopyPubSeq() { return messageBus.getO3CopyPubSeq(); } RingQueue getO3CopyQueue() { return messageBus.getO3CopyQueue(); } Sequence getO3OpenColumnPubSeq() { return messageBus.getO3OpenColumnPubSeq(); } RingQueue getO3OpenColumnQueue() { return messageBus.getO3OpenColumnQueue(); } Sequence getO3PartitionUpdatePubSeq() { return o3PartitionUpdatePubSeq; } RingQueue getO3PartitionUpdateQueue() { return o3PartitionUpdateQueue; } public long getO3RowCount() { return (masterRef - o3MasterRef + 1) / 2; } private long getPartitionLo(long timestamp) { return timestampFloorMethod.floor(timestamp); } long getPartitionNameTxnByIndex(int index) { return txFile.getPartitionNameTxnByIndex(index); } long getPartitionSizeByIndex(int index) { return txFile.getPartitionSizeByIndex(index); } long getPrimaryAppendOffset(long timestamp, int columnIndex) { if (txFile.getAppendedPartitionCount() == 0) { openFirstPartition(timestamp); } if (timestamp > partitionTimestampHi) { return 0; } return columns.get(getPrimaryColumnIndex(columnIndex)).getAppendOffset(); } private AppendOnlyVirtualMemory getPrimaryColumn(int column) { assert column < columnCount : "Column index is out of bounds: " + column + " >= " + columnCount; return columns.getQuick(getPrimaryColumnIndex(column)); } long getSecondaryAppendOffset(long timestamp, int columnIndex) { if (txFile.getAppendedPartitionCount() == 0) { openFirstPartition(timestamp); } if (timestamp > partitionTimestampHi) { return 0; } return columns.get(getSecondaryColumnIndex(columnIndex)).getAppendOffset(); } private AppendOnlyVirtualMemory getSecondaryColumn(int column) { assert column < columnCount : "Column index is out of bounds: " + column + " >= " + columnCount; return columns.getQuick(getSecondaryColumnIndex(column)); } SymbolMapWriter getSymbolMapWriter(int columnIndex) { return symbolMapWriters.getQuick(columnIndex); } int getTxPartitionCount() { return txFile.getAppendedPartitionCount(); } private boolean hasO3() { return o3MasterRef > -1 && getO3RowCount() > 0; } private long indexHistoricPartitions(SymbolColumnIndexer indexer, CharSequence columnName, int indexValueBlockSize) { final long maxTimestamp = timestampFloorMethod.floor(this.txFile.getMaxTimestamp()); long timestamp = txFile.getMinTimestamp(); //noinspection TryFinallyCanBeTryWithResources try (final MappedReadOnlyMemory roMem = new SinglePageMappedReadOnlyPageMemory()) { while (timestamp < maxTimestamp) { path.trimTo(rootLen); setStateForTimestamp(path, timestamp, true); if (txFile.attachedPartitionsContains(timestamp) && ff.exists(path.$())) { final int plen = path.length(); TableUtils.dFile(path.trimTo(plen), columnName); if (ff.exists(path)) { path.trimTo(plen); LOG.info().$("indexing 
[path=").$(path).$(']').$(); createIndexFiles(columnName, indexValueBlockSize, plen, true); final long partitionSize = txFile.getPartitionSizeByPartitionTimestamp(timestamp); final long columnTop = TableUtils.readColumnTop(ff, path.trimTo(plen), columnName, plen, tempMem16b); if (partitionSize > columnTop) { TableUtils.dFile(path.trimTo(plen), columnName); roMem.of(ff, path, ff.getPageSize(), 0); roMem.grow((partitionSize - columnTop) << ColumnType.pow2SizeOf(ColumnType.INT)); indexer.configureWriter(configuration, path.trimTo(plen), columnName, columnTop); indexer.index(roMem, columnTop, partitionSize); } } } timestamp = timestampAddMethod.calculate(timestamp, 1); } } finally { indexer.close(); } return timestamp; } private void indexLastPartition(SymbolColumnIndexer indexer, CharSequence columnName, int columnIndex, int indexValueBlockSize) { final int plen = path.length(); createIndexFiles(columnName, indexValueBlockSize, plen, true); final long columnTop = TableUtils.readColumnTop(ff, path.trimTo(plen), columnName, plen, tempMem16b); // set indexer up to continue functioning as normal indexer.configureFollowerAndWriter(configuration, path.trimTo(plen), columnName, getPrimaryColumn(columnIndex), columnTop); indexer.refreshSourceAndIndex(0, txFile.getTransientRowCount()); } private boolean isAppendLastPartitionOnly(long sortedTimestampsAddr, long o3TimestampMax) { boolean yep; final long o3Min = getTimestampIndexValue(sortedTimestampsAddr, 0); long o3MinPartitionTimestamp = timestampFloorMethod.floor(o3Min); final boolean last = o3MinPartitionTimestamp == lastPartitionTimestamp; final int index = txFile.findAttachedPartitionIndexByLoTimestamp(o3MinPartitionTimestamp); if (timestampCeilMethod.ceil(o3Min) >= o3TimestampMax) { yep = last && (txFile.transientRowCount < 0 || o3Min >= txFile.getMaxTimestamp()); } else { yep = false; } return yep; } boolean isSymbolMapWriterCached(int columnIndex) { return symbolMapWriters.getQuick(columnIndex).isCached(); } private void lock() { try { path.trimTo(rootLen); lockName(path); performRecovery = ff.exists(path); this.lockFd = TableUtils.lock(ff, path); } finally { path.trimTo(rootLen); } if (this.lockFd == -1L) { throw CairoException.instance(ff.errno()).put("Cannot lock table: ").put(path.$()); } } private long o3CalculatedMoveUncommittedSize(long transientRowsAdded, long committedTransientRowCount) { // We want to move as much as possible of uncommitted rows to O3 dedicated memory from column files // but not all the column file data is mapped and mapping is relatively expensive. 
        // If the rows are not moved to O3 memory, the result is a merge of a bigger segment
        // from O3 into the column files.
        // If all the rows are moved, this becomes a sort shuffle in O3 memory and a copy back to the column files.
        for (int colIndex = 0; colIndex < columnCount; colIndex++) {
            int columnType = metadata.getColumnType(colIndex);
            AppendOnlyVirtualMemory primaryColumn = getPrimaryColumn(colIndex);
            AppendOnlyVirtualMemory secondaryColumn = getSecondaryColumn(colIndex);
            // Fixed size column
            //
            // Partition can be like this
            //
            //    Page 1 (16Mb) unmapped                    Page 2 (16Mb) mapped
            // | ================================ < === | ================================ > ----- |
            // |      committedTransientRowCount      I             transientRowsAdded            |
            //
            // We want to move separator I between committedTransientRowCount and transientRowsAdded
            // to start from the mapped page boundary, keeping the sum of
            // committedTransientRowCount + transientRowsAdded the same,
            // so that after the change in committedTransientRowCount and transientRowsAdded the picture looks like
            //
            //    Page 1 (16Mb) unmapped                    Page 2 (16Mb) mapped
            // | ================================ <==== I ================================ > ----- |
            // |      committedTransientRowCount        I          transientRowsAdded             |
            int shl = ColumnType.pow2SizeOf(columnType);
            if (secondaryColumn == null) {
                long srcMappedRows = primaryColumn.offsetInPage((committedTransientRowCount + transientRowsAdded) << shl) >> shl;
                if (srcMappedRows < transientRowsAdded) {
                    long delta = transientRowsAdded - srcMappedRows;
                    committedTransientRowCount += delta;
                    transientRowsAdded -= delta;
                }
                // Assert that fixed column is mapped for row committedTransientRowCount
                assert primaryColumn.addressOf(committedTransientRowCount << shl) > 0;
            } else {
                // Variable length record column. It has 2 files:
                // Primary is the variable record length file
                // Secondary is a fixed file with 64 bits per record
                shl = 3;
                // Here both files have to be mapped into memory
                //
                // Step 1: process the fixed record file in the same way as the fixed size column above
                //
                long fixLenMappedRows = secondaryColumn.offsetInPage(secondaryColumn.getAppendOffset()) >> shl;
                if (fixLenMappedRows < transientRowsAdded) {
                    committedTransientRowCount += transientRowsAdded - fixLenMappedRows;
                    transientRowsAdded = fixLenMappedRows;
                }
                // Assert that secondary file is mapped for row committedTransientRowCount
                assert secondaryColumn.addressOf(committedTransientRowCount << shl) > 0;

                //
                // Step 2: check that all rows in transientRowsAdded are mapped in the variable record file
                //
                // Fixed record file (secondary) has offset records of the variable file
                //
                // Primary file:
                //
                // | Page 1 (unmapped)          | Page 2 (mapped)          |
                // | ========== I ============  | =================== > -- |
                // | record one | record 2 | r3 | value of record 3        |
                //
                // Secondary file:
                //
                // | Page 1 (mapped)         |
                // | 0 | 14 | 25 | 30 | -----|
                //
                // here committedTransientRowCount == 0 and transientRowsAdded == 4
                // varFileCommittedOffset is the record with 0 offset in the example and is 0
                long varFileCommittedOffset = secondaryColumn.getLong(committedTransientRowCount << shl);
                // varFileMappedOffset is 28 in the above example (offset in bytes where Page 2 of the primary file starts)
                long varFileAppendOffset = primaryColumn.getAppendOffset();
                long varLenMappedBytes = primaryColumn.offsetInPage(primaryColumn.getAppendOffset());
                long varFileMappedOffset = varFileAppendOffset - varLenMappedBytes;
                // Check if all variable file records we want to move are mapped
                if (varFileCommittedOffset < varFileMappedOffset) {
                    // We need to binary search the fix file
                    // to find the record where the mapped page starts (Page 2 in the example, offset 28)
                    long firstMappedVarColRowOffset = Vect.binarySearch64Bit(
                            secondaryColumn.addressOf(committedTransientRowCount << shl), // address of fix file memory where to start the search
                            varFileMappedOffset, // the value we search for (28 in case of the example)
                            0, // start from index 0
                            fixLenMappedRows - 1, // and search in all mapped rows (inclusive of fixLenMappedRows - 1)
                            BinarySearch.SCAN_DOWN // doesn't matter
                    );
                    // In the example firstMappedVarColRowOffset is expected to be -3 - 1 = -4, i.e. no exact match found:
                    // value 28 is between index 2 and 3 in the secondary file
                    if (firstMappedVarColRowOffset < 0) {
                        // convert it to index 3, the first index of the value >= 28
                        firstMappedVarColRowOffset = -firstMappedVarColRowOffset - 1;
                    }
                    // move transientRowsAdded to 1 and committedTransientRowCount to 3
                    transientRowsAdded -= firstMappedVarColRowOffset;
                    assert transientRowsAdded >= 0;
                    committedTransientRowCount += firstMappedVarColRowOffset;
                    // assert that the secondary file is mapped for record committedTransientRowCount
                    assert primaryColumn.addressOf(secondaryColumn.getLong(committedTransientRowCount << shl)) > 0;
                }
            }
        }
        return transientRowsAdded;
    }

    void o3ClockDownPartitionUpdateCount() {
        o3PartitionUpdRemaining.decrementAndGet();
    }

    /**
     * Commits O3 data. Hysteresis is optional. When 0 is specified the entire O3 segment is committed.
     *
     * @param hysteresis interval in microseconds that determines the length of O3 segment that is not going to be
     *                   committed to disk. The interval starts at max timestamp of O3 segment and ends "hysteresis"
     *                   microseconds before this timestamp.
     * @return true when the commit is a NOOP, e.g. no data has been committed to disk; false otherwise.
     */
    private boolean o3Commit(long hysteresis) {
        o3RowCount = getO3RowCount();

        o3PartitionRemoveCandidates.clear();
        o3ErrorCount.set(0);
        o3ColumnCounters.clear();
        o3BasketPool.clear();

        long o3HysteresisRowCount = 0;
        long o3MaxUncommittedRows = metadata.getO3MaxUncommittedRows();
        final int timestampIndex = metadata.getTimestampIndex();
        this.lastPartitionTimestamp = timestampFloorMethod.floor(partitionTimestampHi);
        try {
            o3RowCount += o3MoveUncommitted(timestampIndex);

            // we may need to re-use file descriptors when this partition is the "current" one
            // we cannot open file again due to sharing violation
            //
            // to determine that 'ooTimestampLo' goes into current partition
            // we need to compare 'partitionTimestampHi', which is appropriately truncated to DAY/MONTH/YEAR
            // to this.maxTimestamp, which isn't truncated yet. So we need to truncate it first
            LOG.info().$("sorting o3 [table=").$(tableName).$(']').$();
            final long sortedTimestampsAddr = o3TimestampMem.addressOf(0);

            // ensure there is enough size
            if (o3RowCount > 600 || !o3QuickSortEnabled) {
                o3TimestampMemCpy.jumpTo(o3TimestampMem.getAppendOffset());
                Vect.radixSortLongIndexAscInPlace(sortedTimestampsAddr, o3RowCount, o3TimestampMemCpy.addressOf(0));
            } else {
                Vect.quickSortLongIndexAscInPlace(sortedTimestampsAddr, o3RowCount);
            }

            // we have three frames:
            // partition logical "lo" and "hi" - absolute bounds (partitionLo, partitionHi)
            // partition actual data "lo" and "hi" (dataLo, dataHi)
            // out of order "lo" and "hi" (indexLo, indexHi)

            final long srcOooMax;
            final long o3TimestampMin = getTimestampIndexValue(sortedTimestampsAddr, 0);
            if (hysteresis > 0) {
                final long o3max = getTimestampIndexValue(sortedTimestampsAddr, o3RowCount - 1);
                long hysteresisThresholdTimestamp = o3max - hysteresis;
                if (hysteresisThresholdTimestamp >= o3TimestampMin) {
                    long hysteresisThresholdRow = Vect.boundedBinarySearchIndexT(sortedTimestampsAddr, hysteresisThresholdTimestamp, 0, o3RowCount - 1, BinarySearch.SCAN_DOWN);
                    o3HysteresisRowCount = o3RowCount - hysteresisThresholdRow - 1;
                    if (o3HysteresisRowCount > o3MaxUncommittedRows) {
                        o3HysteresisRowCount = o3MaxUncommittedRows;
                        srcOooMax = o3RowCount - o3MaxUncommittedRows;
                    } else {
                        srcOooMax = hysteresisThresholdRow + 1;
                    }
                } else {
                    o3HysteresisRowCount = o3RowCount;
                    srcOooMax = 0;
                }
                LOG.debug().$("o3 commit hysteresis [table=").$(tableName)
                        .$(", hysteresis=").$(hysteresis)
                        .$(", o3MaxUncommittedRows=").$(o3MaxUncommittedRows)
                        .$(", o3max=").$ts(o3max)
                        .$(", hysteresisThresholdTimestamp=").$ts(hysteresisThresholdTimestamp)
                        .$(", o3HysteresisRowCount=").$(o3HysteresisRowCount)
                        .$(", srcOooMax=").$(srcOooMax)
                        .$(", o3RowCount=").$(o3RowCount)
                        .I$();
            } else {
                LOG.debug()
                        .$("o3 commit no hysteresis [table=").$(tableName)
                        .$(", o3RowCount=").$(o3RowCount)
                        .I$();
                srcOooMax = o3RowCount;
            }

            if (srcOooMax == 0) {
                return true;
            }

            final long o3TimestampMax = getTimestampIndexValue(sortedTimestampsAddr, srcOooMax - 1);
            // move uncommitted is liable to change max timestamp
            // however we need to identify last partition before max timestamp skips to NULL for example
            final long maxTimestamp = txFile.getMaxTimestamp();

            // we are going to use this soon to avoid double-copying hysteresis data
            // final boolean yep = isAppendLastPartitionOnly(sortedTimestampsAddr, o3TimestampMax);

            // reshuffle all columns according to timestamp index
            o3Sort(sortedTimestampsAddr, timestampIndex, o3RowCount);
            LOG.info().$("sorted [table=").utf8(tableName).I$();

            this.o3DoneLatch.reset();
            this.o3PartitionUpdRemaining.set(0);
            boolean success = true;
            int latchCount = 0;
            long srcOoo = 0;
            boolean flattenTimestamp = true;
            int pCount = 0;
            try {
                while (srcOoo < srcOooMax) {
                    try {
                        final long srcOooLo = srcOoo;
                        final long o3Timestamp = getTimestampIndexValue(sortedTimestampsAddr, srcOoo);
                        final long srcOooHi;
                        final long srcOooTimestampCeil = timestampCeilMethod.ceil(o3Timestamp);
                        if (srcOooTimestampCeil < o3TimestampMax) {
                            srcOooHi = Vect.boundedBinarySearchIndexT(
                                    sortedTimestampsAddr,
                                    srcOooTimestampCeil,
                                    srcOoo,
                                    srcOooMax - 1,
                                    BinarySearch.SCAN_DOWN
                            );
                        } else {
                            srcOooHi = srcOooMax - 1;
                        }

                        final long partitionTimestamp = timestampFloorMethod.floor(o3Timestamp);
                        final boolean last = partitionTimestamp == lastPartitionTimestamp;
                        srcOoo = srcOooHi + 1;

                        final long srcDataSize;
                        final long srcNameTxn;
                        final int partitionIndex =
txFile.findAttachedPartitionIndexByLoTimestamp(partitionTimestamp); if (partitionIndex > -1) { if (last) { srcDataSize = txFile.transientRowCount; } else { srcDataSize = getPartitionSizeByIndex(partitionIndex); } srcNameTxn = getPartitionNameTxnByIndex(partitionIndex); } else { srcDataSize = -1; srcNameTxn = -1; } final boolean append = last && (srcDataSize < 0 || o3Timestamp >= maxTimestamp); LOG.debug(). $("o3 partition task [table=").$(tableName) .$(", srcOooLo=").$(srcOooLo) .$(", srcOooHi=").$(srcOooHi) .$(", srcOooMax=").$(srcOooMax) .$(", o3TimestampMin=").$ts(o3TimestampMin) .$(", o3Timestamp=").$ts(o3Timestamp) .$(", o3TimestampMax=").$ts(o3TimestampMax) .$(", partitionTimestamp=").$ts(partitionTimestamp) .$(", partitionIndex=").$(partitionIndex) .$(", srcDataSize=").$(srcDataSize) .$(", maxTimestamp=").$ts(maxTimestamp) .$(", last=").$(last) .$(", append=").$(append) .$(", memUsed=").$(Unsafe.getMemUsed()) .I$(); pCount++; o3PartitionUpdRemaining.incrementAndGet(); final O3Basket o3Basket = o3BasketPool.next(); o3Basket.ensureCapacity(columnCount, indexCount); AtomicInteger columnCounter = o3ColumnCounters.next(); columnCounter.set(columnCount); latchCount++; if (append) { // assert srcOoo >= srcOooMax || !yep; Path pathToPartition = Path.getThreadLocal(this.path); TableUtils.setPathForPartition(pathToPartition, partitionBy, o3TimestampMin, false); TableUtils.txnPartitionConditionally(pathToPartition, srcNameTxn); final int plen = pathToPartition.length(); for (int i = 0; i < columnCount; i++) { final int colOffset = TableWriter.getPrimaryColumnIndex(i); final boolean notTheTimestamp = i != timestampIndex; final int columnType = metadata.getColumnType(i); final CharSequence columnName = metadata.getColumnName(i); final boolean isIndexed = metadata.isColumnIndexed(i); final BitmapIndexWriter indexWriter = isIndexed ? getBitmapIndexWriter(i) : null; final ContiguousVirtualMemory oooMem1 = o3Columns.getQuick(colOffset); final ContiguousVirtualMemory oooMem2 = o3Columns.getQuick(colOffset + 1); final AppendOnlyVirtualMemory mem1 = columns.getQuick(colOffset); final AppendOnlyVirtualMemory mem2 = columns.getQuick(colOffset + 1); final long srcDataTop = getColumnTop(i); final long srcOooFixAddr; final long srcOooFixSize; final long srcOooVarAddr; final long srcOooVarSize; final AppendOnlyVirtualMemory dstFixMem; final AppendOnlyVirtualMemory dstVarMem; if (columnType != ColumnType.STRING && columnType != ColumnType.BINARY) { srcOooFixAddr = oooMem1.addressOf(0); srcOooFixSize = oooMem1.getAppendOffset(); srcOooVarAddr = 0; srcOooVarSize = 0; dstFixMem = mem1; dstVarMem = null; } else { srcOooFixAddr = oooMem2.addressOf(0); srcOooFixSize = oooMem2.getAppendOffset(); srcOooVarAddr = oooMem1.addressOf(0); srcOooVarSize = oooMem1.getAppendOffset(); dstFixMem = mem2; dstVarMem = mem1; } O3OpenColumnJob.appendLastPartition( pathToPartition, plen, columnName, columnCounter, notTheTimestamp ? 
columnType : -columnType, srcOooFixAddr, srcOooFixSize, srcOooVarAddr, srcOooVarSize, srcOooLo, srcOooHi, srcOooMax, o3TimestampMin, o3TimestampMax, partitionTimestamp, srcDataTop, Math.max(0, srcDataSize), isIndexed, dstFixMem, dstVarMem, this, indexWriter, tempMem16b ); } } else { if (flattenTimestamp) { Vect.flattenIndex(sortedTimestampsAddr, o3RowCount); flattenTimestamp = false; } o3CommitPartitionAsync( columnCounter, maxTimestamp, sortedTimestampsAddr, srcOooMax, o3TimestampMin, o3TimestampMax, srcOooLo, srcOooHi, partitionTimestamp, last, srcDataSize, srcNameTxn, o3Basket ); } } catch (CairoException | CairoError e) { LOG.error().$((Sinkable) e).$(); success = false; throw e; } } } finally { // we are stealing work here it is possible we get exception from this method LOG.debug() .$("o3 expecting updates [table=").$(tableName) .$(", partitionsPublished=").$(pCount) .I$(); o3ConsumePartitionUpdates( srcOooMax, o3TimestampMin, o3TimestampMax ); o3DoneLatch.await(latchCount); o3InError = !success || o3ErrorCount.get() > 0; if (success && o3ErrorCount.get() > 0) { //noinspection ThrowFromFinallyBlock throw CairoException.instance(0).put("bulk update failed and will be rolled back"); } } if (o3HysteresisRowCount > 0) { o3ShiftHysteresisUp(timestampIndex, o3HysteresisRowCount, srcOooMax); } } finally { if (denseIndexers.size() == 0) { populateDenseIndexerList(); } path.trimTo(rootLen); // Alright, we finished updating partitions. Now we need to get this writer instance into // a consistent state. // // We start with ensuring append memory is in ready-to-use state. When max timestamp changes we need to // move append memory to new set of files. Otherwise we stay on the same set but advance the append position. avoidIndexOnCommit = o3ErrorCount.get() == 0; if (o3HysteresisRowCount == 0) { this.o3MasterRef = -1; rowFunction = switchPartitionFunction; row.activeColumns = columns; row.activeNullSetters = nullSetters; } else { // adjust O3 master ref so that virtual row count becomes equal to value of "o3HysteresisRowCount" this.o3MasterRef = this.masterRef - o3HysteresisRowCount * 2 + 1; } LOG.debug().$("adjusted [o3RowCount=").$(getO3RowCount()).I$(); } if (columns.getQuick(0).isClosed() || partitionTimestampHi < txFile.getMaxTimestamp()) { openPartition(txFile.getMaxTimestamp()); } setAppendPosition(txFile.getTransientRowCount(), true); return false; } private void o3CommitPartitionAsync( AtomicInteger columnCounter, long maxTimestamp, long sortedTimestampsAddr, long srcOooMax, long oooTimestampMin, long oooTimestampMax, long srcOooLo, long srcOooHi, long partitionTimestamp, boolean last, long srcDataSize, long srcNameTxn, O3Basket o3Basket ) { long cursor = messageBus.getO3PartitionPubSeq().next(); if (cursor > -1) { O3PartitionTask task = messageBus.getO3PartitionQueue().get(cursor); task.of( path, partitionBy, columns, o3Columns, srcOooLo, srcOooHi, srcOooMax, oooTimestampMin, oooTimestampMax, partitionTimestamp, maxTimestamp, srcDataSize, srcNameTxn, last, getTxn(), sortedTimestampsAddr, this, columnCounter, o3Basket ); messageBus.getO3PartitionPubSeq().done(cursor); } else { O3PartitionJob.processPartition( path, partitionBy, columns, o3Columns, srcOooLo, srcOooHi, srcOooMax, oooTimestampMin, oooTimestampMax, partitionTimestamp, maxTimestamp, srcDataSize, srcNameTxn, last, getTxn(), sortedTimestampsAddr, this, columnCounter, o3Basket, tempMem16b ); } } private void o3ConsumePartitionUpdates( long srcOooMax, long timestampMin, long timestampMax ) { final Sequence partitionSubSeq = 
messageBus.getO3PartitionSubSeq(); final RingQueue partitionQueue = messageBus.getO3PartitionQueue(); final Sequence openColumnSubSeq = messageBus.getO3OpenColumnSubSeq(); final RingQueue openColumnQueue = messageBus.getO3OpenColumnQueue(); final Sequence copySubSeq = messageBus.getO3CopySubSeq(); final RingQueue copyQueue = messageBus.getO3CopyQueue(); do { long cursor = o3PartitionUpdateSubSeq.next(); if (cursor > -1) { final O3PartitionUpdateTask task = o3PartitionUpdateQueue.get(cursor); final long partitionTimestamp = task.getPartitionTimestamp(); final long srcOooPartitionLo = task.getSrcOooPartitionLo(); final long srcOooPartitionHi = task.getSrcOooPartitionHi(); final long srcDataMax = task.getSrcDataMax(); final boolean partitionMutates = task.isPartitionMutates(); o3ClockDownPartitionUpdateCount(); o3PartitionUpdateSubSeq.done(cursor); if (o3ErrorCount.get() == 0) { o3PartitionUpdate( timestampMin, timestampMax, partitionTimestamp, srcOooPartitionLo, srcOooPartitionHi, srcOooMax, srcDataMax, partitionMutates ); } continue; } cursor = partitionSubSeq.next(); if (cursor > -1) { final O3PartitionTask partitionTask = partitionQueue.get(cursor); if (partitionTask.getTableWriter() == this && o3ErrorCount.get() > 0) { // do we need to free anything on the task? partitionSubSeq.done(cursor); o3ClockDownPartitionUpdateCount(); o3CountDownDoneLatch(); } else { o3ProcessPartitionSafe(partitionSubSeq, cursor, partitionTask); } continue; } cursor = openColumnSubSeq.next(); if (cursor > -1) { O3OpenColumnTask openColumnTask = openColumnQueue.get(cursor); if (openColumnTask.getTableWriter() == this && o3ErrorCount.get() > 0) { O3CopyJob.closeColumnIdle( openColumnTask.getColumnCounter(), openColumnTask.getTimestampMergeIndexAddr(), openColumnTask.getSrcTimestampFd(), openColumnTask.getSrcTimestampAddr(), openColumnTask.getSrcTimestampSize(), this ); openColumnSubSeq.done(cursor); } else { o3OpenColumnSafe(openColumnSubSeq, cursor, openColumnTask); } continue; } cursor = copySubSeq.next(); if (cursor > -1) { O3CopyTask copyTask = copyQueue.get(cursor); if (copyTask.getTableWriter() == this && o3ErrorCount.get() > 0) { O3CopyJob.copyIdle( copyTask.getColumnCounter(), copyTask.getPartCounter(), copyTask.getTimestampMergeIndexAddr(), copyTask.getSrcDataFixFd(), copyTask.getSrcDataFixAddr(), copyTask.getSrcDataFixSize(), copyTask.getSrcDataVarFd(), copyTask.getSrcDataVarAddr(), copyTask.getSrcDataVarSize(), copyTask.getDstFixFd(), copyTask.getDstFixAddr(), copyTask.getDstFixSize(), copyTask.getDstVarFd(), copyTask.getDstVarAddr(), copyTask.getDstVarSize(), copyTask.getSrcTimestampFd(), copyTask.getSrcTimestampAddr(), copyTask.getSrcTimestampSize(), copyTask.getDstKFd(), copyTask.getDstVFd(), this ); copySubSeq.done(cursor); } else { o3CopySafe(cursor); } } } while (this.o3PartitionUpdRemaining.get() > 0); } private void o3CopySafe( long cursor ) { final O3CopyTask task = messageBus.getO3CopyQueue().get(cursor); try { O3CopyJob.copy( task, cursor, messageBus.getO3CopySubSeq() ); } catch (CairoException | CairoError e) { LOG.error().$((Sinkable) e).$(); } catch (Throwable e) { LOG.error().$(e).$(); } } void o3CountDownDoneLatch() { o3DoneLatch.countDown(); } private void o3MoveHysteresis0( int columnIndex, final int columnType, long o3HysteresisRowCount, long o3RowCount ) { if (columnIndex > -1) { ContiguousVirtualMemory o3DataMem = o3Columns.get(getPrimaryColumnIndex(columnIndex)); ContiguousVirtualMemory o3IndexMem = o3Columns.get(getSecondaryColumnIndex(columnIndex)); long size; long sourceOffset; 
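            // A rough worked example, assuming a fixed-size 8-byte column, o3RowCount == 1000
            // and o3HysteresisRowCount == 10: the 10 uncommitted hysteresis rows start at byte
            // offset 1000 << 3 == 8000 and are shifted down to offset 0 to seed the next commit.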
final int shl = ColumnType.pow2SizeOf(columnType); if (null == o3IndexMem) { // Fixed size column sourceOffset = o3RowCount << shl; size = o3HysteresisRowCount << shl; } else { // Var size column sourceOffset = o3IndexMem.getLong(o3RowCount * 8); size = o3DataMem.getAppendOffset() - sourceOffset; O3Utils.shiftCopyFixedSizeColumnData( sourceOffset, o3IndexMem.addressOf(o3RowCount * 8), 0, o3HysteresisRowCount * 8, o3IndexMem.addressOf(0) ); o3IndexMem.jumpTo(o3HysteresisRowCount * 8); } o3DataMem.jumpTo(size); Vect.memmove(o3DataMem.addressOf(0), o3DataMem.addressOf(sourceOffset), size); } else { // Special case, designated timestamp column // Move values and set index to 0..o3HysteresisRowCount final long sourceOffset = o3RowCount * 16; final long mergeMemAddr = o3TimestampMem.addressOf(0); Vect.shiftTimestampIndex(mergeMemAddr + sourceOffset, o3HysteresisRowCount, mergeMemAddr); o3TimestampMem.jumpTo(o3HysteresisRowCount * 16); } } private long o3MoveUncommitted(final int timestampIndex) { final long committedRowCount = txFile.getCommittedFixedRowCount() + txFile.getCommittedTransientRowCount(); final long rowsAdded = txFile.getRowCount() - committedRowCount; final long committedTransientRowCount = txFile.getTransientRowCount() - Math.min(txFile.getTransientRowCount(), rowsAdded); if (Math.min(txFile.getTransientRowCount(), rowsAdded) > 0) { LOG.debug() .$("o3 move uncommitted [table=").$(tableName) .$(", transientRowsAdded=").$(Math.min(txFile.getTransientRowCount(), rowsAdded)) .I$(); return o3ScheduleMoveUncommitted0( timestampIndex, Math.min(txFile.getTransientRowCount(), rowsAdded), committedTransientRowCount ); } return 0; } private void o3MoveUncommitted0( int colIndex, int columnType, long committedTransientRowCount, long transientRowsAdded ) { if (colIndex > -1) { AppendOnlyVirtualMemory srcDataMem = getPrimaryColumn(colIndex); int shl = ColumnType.pow2SizeOf(columnType); long srcFixOffset; final ContiguousVirtualMemory o3DataMem = o3Columns.get(getPrimaryColumnIndex(colIndex)); final ContiguousVirtualMemory o3IndexMem = o3Columns.get(getSecondaryColumnIndex(colIndex)); long extendedSize; long dstVarOffset = o3DataMem.getAppendOffset(); if (null == o3IndexMem) { // Fixed size extendedSize = transientRowsAdded << shl; srcFixOffset = committedTransientRowCount << shl; } else { // Var size final AppendOnlyVirtualMemory srcFixMem = getSecondaryColumn(colIndex); long srcVarOffset = srcFixMem.getLong(committedTransientRowCount * Long.BYTES); // ensure memory is available long dstAppendOffset = o3IndexMem.getAppendOffset(); o3IndexMem.jumpTo(o3IndexMem.getAppendOffset() + transientRowsAdded * Long.BYTES); O3Utils.shiftCopyFixedSizeColumnData( srcVarOffset - dstVarOffset, srcFixMem.addressOf(committedTransientRowCount * Long.BYTES), 0, transientRowsAdded, o3IndexMem.addressOf(dstAppendOffset) ); long sourceEndOffset = srcDataMem.getAppendOffset(); extendedSize = sourceEndOffset - srcVarOffset; srcFixOffset = srcVarOffset; srcFixMem.jumpTo(committedTransientRowCount * Long.BYTES); } o3DataMem.jumpTo(dstVarOffset + extendedSize); long appendAddress = o3DataMem.addressOf(dstVarOffset); long sourceAddress = srcDataMem.addressOf(srcFixOffset); Vect.memcpy(sourceAddress, appendAddress, extendedSize); srcDataMem.jumpTo(srcFixOffset); } else { // Timestamp column colIndex = -colIndex - 1; int shl = ColumnType.pow2SizeOf(ColumnType.TIMESTAMP); AppendOnlyVirtualMemory srcDataMem = getPrimaryColumn(colIndex); long srcFixOffset = committedTransientRowCount << shl; for (long n = 0; n < 
transientRowsAdded; n++) { long ts = srcDataMem.getLong(srcFixOffset + (n << shl)); o3TimestampMem.putLong128(ts, o3RowCount + n); } srcDataMem.jumpTo(srcFixOffset); } } private void o3OpenColumnSafe(Sequence openColumnSubSeq, long cursor, O3OpenColumnTask openColumnTask) { try { O3OpenColumnJob.openColumn(openColumnTask, cursor, openColumnSubSeq, tempMem16b); } catch (CairoException | CairoError e) { LOG.error().$((Sinkable) e).$(); } catch (Throwable e) { LOG.error().$(e).$(); } } private void o3OpenColumns() { for (int i = 0; i < columnCount; i++) { ContiguousVirtualMemory mem1 = o3Columns.getQuick(getPrimaryColumnIndex(i)); mem1.jumpTo(0); ContiguousVirtualMemory mem2 = o3Columns.getQuick(getSecondaryColumnIndex(i)); if (mem2 != null) { mem2.jumpTo(0); } } row.activeColumns = o3Columns; row.activeNullSetters = o3NullSetters; LOG.debug().$("switched partition to memory").$(); } private void o3PartitionUpdate( long timestampMin, long timestampMax, long partitionTimestamp, long srcOooPartitionLo, long srcOooPartitionHi, long srcOooMax, long srcDataMax, boolean partitionMutates ) { this.txFile.minTimestamp = Math.min(timestampMin, this.txFile.minTimestamp); final long partitionSize = srcDataMax + srcOooPartitionHi - srcOooPartitionLo + 1; final long rowDelta = srcOooPartitionHi - srcOooMax; if (partitionTimestamp < lastPartitionTimestamp) { this.txFile.fixedRowCount += partitionSize - srcDataMax; // when we exit here we need to rollback transientRowCount we've been incrementing // while adding out-of-order data } else if (rowDelta < -1) { this.txFile.fixedRowCount += partitionSize; } else { // this is last partition this.txFile.transientRowCount = partitionSize; this.txFile.maxTimestamp = Math.max(this.txFile.maxTimestamp, timestampMax); } final int partitionIndex = txFile.findAttachedPartitionIndexByLoTimestamp(partitionTimestamp); if (partitionTimestamp == lastPartitionTimestamp) { if (partitionMutates) { closeActivePartition(); } else if (rowDelta < -1) { closeActivePartition(partitionSize); } else { setAppendPosition(partitionSize, false); } } if (partitionMutates) { final long srcDataTxn = txFile.getPartitionNameTxnByIndex(partitionIndex); LOG.info() .$("merged partition [table=`").utf8(tableName) .$("`, ts=").$ts(partitionTimestamp) .$(", txn=").$(txFile.txn).$(']').$(); txFile.updatePartitionSizeByIndexAndTxn(partitionIndex, partitionSize); o3PartitionRemoveCandidates.add(partitionTimestamp); o3PartitionRemoveCandidates.add(srcDataTxn); txFile.bumpPartitionTableVersion(); } else { if (partitionTimestamp != lastPartitionTimestamp) { txFile.bumpPartitionTableVersion(); } txFile.updatePartitionSizeByIndex(partitionIndex, partitionTimestamp, partitionSize); } } synchronized void o3PartitionUpdateSynchronized( long timestampMin, long timestampMax, long partitionTimestamp, long srcOooPartitionLo, long srcOooPartitionHi, boolean partitionMutates, long srcOooMax, long srcDataMax ) { o3ClockDownPartitionUpdateCount(); o3PartitionUpdate( timestampMin, timestampMax, partitionTimestamp, srcOooPartitionLo, srcOooPartitionHi, srcOooMax, srcDataMax, partitionMutates ); } private void o3ProcessPartitionRemoveCandidates() { try { final int n = o3PartitionRemoveCandidates.size(); if (n > 0) { o3ProcessPartitionRemoveCandidates0(n); } } finally { o3PartitionRemoveCandidates.clear(); } } private void o3ProcessPartitionRemoveCandidates0(int n) { final long readerTxn = txnScoreboard.getMin(); final long readerTxnCount = txnScoreboard.getActiveReaderCount(readerTxn); if 
(txnScoreboard.isTxnAvailable(txFile.getTxn() - 1)) { for (int i = 0; i < n; i += 2) { final long timestamp = o3PartitionRemoveCandidates.getQuick(i); final long txn = o3PartitionRemoveCandidates.getQuick(i + 1); try { setPathForPartition( other, partitionBy, timestamp, false ); TableUtils.txnPartitionConditionally(other, txn); other.slash$(); int errno; if ((errno = ff.rmdir(other)) == 0) { LOG.info().$( "purged [path=").$(other) .$(", readerTxn=").$(readerTxn) .$(", readerTxnCount=").$(readerTxnCount) .$(']').$(); } else { LOG.info() .$("queued to purge [errno=").$(errno) .$(", table=").$(tableName) .$(", ts=").$ts(timestamp) .$(", txn=").$(txn) .$(']').$(); o3QueuePartitionForPurge(timestamp, txn); } } finally { other.trimTo(rootLen); } } } else { // queue all updated partitions for (int i = 0; i < n; i += 2) { o3QueuePartitionForPurge( o3PartitionRemoveCandidates.getQuick(i), o3PartitionRemoveCandidates.getQuick(i + 1) ); } } } private void o3ProcessPartitionSafe(Sequence partitionSubSeq, long cursor, O3PartitionTask partitionTask) { try { O3PartitionJob.processPartition(tempMem16b, partitionTask, cursor, partitionSubSeq); } catch (CairoException | CairoError e) { LOG.error().$((Sinkable) e).$(); } catch (Throwable e) { LOG.error().$(e).$(); } } private void o3QueuePartitionForPurge(long timestamp, long txn) { final MPSequence seq = messageBus.getO3PurgeDiscoveryPubSeq(); long cursor = seq.next(); if (cursor > -1) { O3PurgeDiscoveryTask task = messageBus.getO3PurgeDiscoveryQueue().get(cursor); task.of( tableName, partitionBy, txnScoreboard, timestamp, txn ); seq.done(cursor); } else { LOG.error() .$("could not purge [errno=").$(ff.errno()) .$(", table=").$(tableName) .$(", ts=").$ts(timestamp) .$(", txn=").$(txn) .$(']').$(); } } private long o3ScheduleMoveUncommitted0(int timestampIndex, long transientRowsAdded, long committedTransientRowCount) { long transientRowsAddedNew = o3CalculatedMoveUncommittedSize(transientRowsAdded, committedTransientRowCount); long delta = transientRowsAdded - transientRowsAddedNew; assert delta >= 0; if (delta > 0) { // Not all uncommitted rows can be moved to O3 staging memory // because column files are not fully mapped to memory // reduce number of rows to move transientRowsAdded -= delta; committedTransientRowCount += delta; } if (transientRowsAdded > 0) { long maxCommittedTimestamp = 0; if (delta > 0) { // If there are rows to move // and we cannot move all uncommitted rows to o3 memory // we have to set maxCommittedTimestamp in tx file AppendOnlyVirtualMemory timestampColumn = getPrimaryColumn(timestampIndex); if (!timestampColumn.isMapped((committedTransientRowCount - 1) << 3, Long.BYTES)) { // Need to leave one more record in column files // to correctly get max timestamp transientRowsAdded--; committedTransientRowCount++; } maxCommittedTimestamp = timestampColumn.getLong((committedTransientRowCount - 1) << 3); } final Sequence pubSeq = this.messageBus.getO3CallbackPubSeq(); final RingQueue queue = this.messageBus.getO3CallbackQueue(); o3PendingCallbackTasks.clear(); o3DoneLatch.reset(); int queuedCount = 0; for (int colIndex = 0; colIndex < columnCount; colIndex++) { int columnType = metadata.getColumnType(colIndex); int columnIndex = colIndex != timestampIndex ? 
colIndex : -colIndex - 1; long cursor = pubSeq.next(); // Pass column index as -1 when it's designated timestamp column to o3 move method if (cursor > -1) { try { final O3CallbackTask task = queue.get(cursor); task.of( o3DoneLatch, columnIndex, columnType, committedTransientRowCount, transientRowsAdded, this.o3MoveUncommittedRef ); o3PendingCallbackTasks.add(task); } finally { queuedCount++; pubSeq.done(cursor); } } else { o3MoveUncommitted0(columnIndex, columnType, committedTransientRowCount, transientRowsAdded); } } for (int n = o3PendingCallbackTasks.size() - 1; n > -1; n--) { final O3CallbackTask task = o3PendingCallbackTasks.getQuick(n); if (task.tryLock()) { O3CallbackJob.runCallbackWithCol( task, -1, null ); } } o3DoneLatch.await(queuedCount); if (delta == 0) { txFile.resetToLastPartition(committedTransientRowCount); } else { // If transientRowsAdded is decreased because uncommitted area is not mapped // maxCommittedTimestamp the last value of the segment left in files txFile.resetToLastPartition(committedTransientRowCount, maxCommittedTimestamp); } } return transientRowsAdded; } private void o3ShiftHysteresisUp(int timestampIndex, long o3HysteresisRowCount, long o3RowCount) { o3PendingCallbackTasks.clear(); final Sequence pubSeq = this.messageBus.getO3CallbackPubSeq(); final RingQueue queue = this.messageBus.getO3CallbackQueue(); o3DoneLatch.reset(); int queuedCount = 0; for (int colIndex = 0; colIndex < columnCount; colIndex++) { int columnType = metadata.getColumnType(colIndex); long cursor = pubSeq.next(); // Pass column index as -1 when it's designated timestamp column to o3 move method int columnIndex = colIndex != timestampIndex ? colIndex : -colIndex - 1; if (cursor > -1) { try { final O3CallbackTask task = queue.get(cursor); task.of( o3DoneLatch, columnIndex, columnType, o3HysteresisRowCount, o3RowCount, this.o3MoveHysteresisRef ); o3PendingCallbackTasks.add(task); } finally { queuedCount++; pubSeq.done(cursor); } } else { o3MoveHysteresis0(columnIndex, columnType, o3HysteresisRowCount, o3RowCount); } } for (int n = o3PendingCallbackTasks.size() - 1; n > -1; n--) { final O3CallbackTask task = o3PendingCallbackTasks.getQuick(n); if (task.tryLock()) { O3CallbackJob.runCallbackWithCol( task, -1, null ); } } o3DoneLatch.await(queuedCount); } private void o3Sort(long mergedTimestamps, int timestampIndex, long rowCount) { o3PendingCallbackTasks.clear(); final Sequence pubSeq = this.messageBus.getO3CallbackPubSeq(); final RingQueue queue = this.messageBus.getO3CallbackQueue(); o3DoneLatch.reset(); int queuedCount = 0; for (int i = 0; i < columnCount; i++) { if (timestampIndex != i) { final int type = metadata.getColumnType(i); long cursor = pubSeq.next(); if (cursor > -1) { try { final O3CallbackTask task = queue.get(cursor); task.of( o3DoneLatch, i, type, mergedTimestamps, rowCount, type == ColumnType.STRING || type == ColumnType.BINARY ? 
oooSortVarColumnRef : oooSortFixColumnRef ); o3PendingCallbackTasks.add(task); } finally { queuedCount++; pubSeq.done(cursor); } } else { o3SortColumn(mergedTimestamps, i, type, rowCount); } } } for (int n = o3PendingCallbackTasks.size() - 1; n > -1; n--) { final O3CallbackTask task = o3PendingCallbackTasks.getQuick(n); if (task.tryLock()) { O3CallbackJob.runCallbackWithCol( task, -1, null ); } } o3DoneLatch.await(queuedCount); } private void o3SortColumn(long mergedTimestamps, int i, int type, long rowCount) { switch (type) { case ColumnType.BINARY: case ColumnType.STRING: o3SortVarColumn(i, type, mergedTimestamps, rowCount); break; default: o3SortFixColumn(i, type, mergedTimestamps, rowCount); break; } } private void o3SortFixColumn( int columnIndex, final int columnType, long mergedTimestampsAddr, long valueCount ) { final int columnOffset = getPrimaryColumnIndex(columnIndex); final ContiguousVirtualMemory mem = o3Columns.getQuick(columnOffset); final ContiguousVirtualMemory mem2 = o3Columns2.getQuick(columnOffset); final long src = mem.addressOf(0); final long srcSize = mem.size(); final int shl = ColumnType.pow2SizeOf(columnType); mem2.jumpTo(valueCount << shl); final long tgtDataAddr = mem2.addressOf(0); final long tgtDataSize = mem2.size(); switch (shl) { case 0: Vect.indexReshuffle8Bit(src, tgtDataAddr, mergedTimestampsAddr, valueCount); break; case 1: Vect.indexReshuffle16Bit(src, tgtDataAddr, mergedTimestampsAddr, valueCount); break; case 2: Vect.indexReshuffle32Bit(src, tgtDataAddr, mergedTimestampsAddr, valueCount); break; case 3: Vect.indexReshuffle64Bit(src, tgtDataAddr, mergedTimestampsAddr, valueCount); break; case 5: Vect.indexReshuffle256Bit(src, tgtDataAddr, mergedTimestampsAddr, valueCount); break; default: assert false : "col type is unsupported"; break; } mem.replacePage(tgtDataAddr, tgtDataSize); mem2.replacePage(src, srcSize); } private void o3SortVarColumn( int columnIndex, int columnType, long mergedTimestampsAddr, long valueCount ) { final int primaryIndex = getPrimaryColumnIndex(columnIndex); final int secondaryIndex = primaryIndex + 1; final ContiguousVirtualMemory dataMem = o3Columns.getQuick(primaryIndex); final ContiguousVirtualMemory indexMem = o3Columns.getQuick(secondaryIndex); final ContiguousVirtualMemory dataMem2 = o3Columns2.getQuick(primaryIndex); final ContiguousVirtualMemory indexMem2 = o3Columns2.getQuick(secondaryIndex); final long dataSize = dataMem.getAppendOffset(); // ensure we have enough memory allocated final long srcDataAddr = dataMem.addressOf(0); final long srcDataSize = dataMem.size(); final long srcIndxAddr = indexMem.addressOf(0); final long srcIndxSize = indexMem.size(); final long tgtDataAddr = dataMem2.resize(dataSize); final long tgtDataSize = dataMem2.size(); final long tgtIndxAddr = indexMem2.resize(valueCount * Long.BYTES); final long tgtIndxSize = indexMem2.size(); // add max offset so that we do not have conditionals inside loop indexMem.putLong(valueCount * Long.BYTES, dataSize); final long offset = Vect.sortVarColumn( mergedTimestampsAddr, valueCount, srcDataAddr, srcIndxAddr, tgtDataAddr, tgtIndxAddr ); dataMem.replacePage(tgtDataAddr, tgtDataSize); indexMem.replacePage(tgtIndxAddr, tgtIndxSize); dataMem2.replacePage(srcDataAddr, srcDataSize); indexMem2.replacePage(srcIndxAddr, srcIndxSize); dataMem.jumpTo(offset); indexMem.jumpTo(valueCount * Long.BYTES); } private void o3TimestampSetter(long timestamp) { o3TimestampMem.putLong128(timestamp, getO3RowCount()); } private void openColumnFiles(CharSequence name, int i, int 
plen) { AppendOnlyVirtualMemory mem1 = getPrimaryColumn(i); AppendOnlyVirtualMemory mem2 = getSecondaryColumn(i); try { mem1.of(ff, dFile(path.trimTo(plen), name), configuration.getAppendPageSize()); if (mem2 != null) { mem2.of(ff, iFile(path.trimTo(plen), name), configuration.getAppendPageSize()); } } finally { path.trimTo(plen); } } private void openFirstPartition(long timestamp) { openPartition(repairDataGaps(timestamp)); setAppendPosition(txFile.getTransientRowCount(), true); if (performRecovery) { performRecovery(); } txFile.openFirstPartition(timestamp); } private void openNewColumnFiles(CharSequence name, boolean indexFlag, int indexValueBlockCapacity) { try { // open column files setStateForTimestamp(path, txFile.getMaxTimestamp(), false); final int plen = path.length(); final int columnIndex = columnCount - 1; // index must be created before column is initialised because // it uses primary column object as temporary tool if (indexFlag) { createIndexFiles(name, indexValueBlockCapacity, plen, true); } openColumnFiles(name, columnIndex, plen); if (txFile.getTransientRowCount() > 0) { // write .top file writeColumnTop(name); } if (indexFlag) { ColumnIndexer indexer = indexers.getQuick(columnIndex); assert indexer != null; indexers.getQuick(columnIndex).configureFollowerAndWriter(configuration, path.trimTo(plen), name, getPrimaryColumn(columnIndex), txFile.getTransientRowCount()); } } finally { path.trimTo(rootLen); } } private void openPartition(long timestamp) { try { setStateForTimestamp(path, timestamp, true); int plen = path.length(); if (ff.mkdirs(path.slash$(), mkDirMode) != 0) { throw CairoException.instance(ff.errno()).put("Cannot create directory: ").put(path); } assert columnCount > 0; for (int i = 0; i < columnCount; i++) { final CharSequence name = metadata.getColumnName(i); final boolean indexed = metadata.isColumnIndexed(i); final long columnTop; // prepare index writer if column requires indexing if (indexed) { // we have to create files before columns are open // because we are reusing AppendOnlyVirtualMemory object from columns list createIndexFiles(name, metadata.getIndexValueBlockCapacity(i), plen, txFile.getTransientRowCount() < 1); } openColumnFiles(name, i, plen); columnTop = readColumnTop(ff, path, name, plen, tempMem16b); columnTops.extendAndSet(i, columnTop); if (indexed) { ColumnIndexer indexer = indexers.getQuick(i); assert indexer != null; indexer.configureFollowerAndWriter(configuration, path, name, getPrimaryColumn(i), columnTop); } } LOG.info().$("switched partition [path='").$(path).$("']").$(); } finally { path.trimTo(rootLen); } } private long openTodoMem() { path.concat(TODO_FILE_NAME).$(); try { if (ff.exists(path)) { long fileLen = ff.length(path); if (fileLen < 32) { throw CairoException.instance(0).put("corrupt ").put(path); } todoMem.of(ff, path, ff.getPageSize(), fileLen); this.todoTxn = todoMem.getLong(0); // check if _todo_ file is consistent, if not, we just ignore its contents and reset hash if (todoMem.getLong(24) != todoTxn) { todoMem.putLong(8, configuration.getDatabaseIdLo()); todoMem.putLong(16, configuration.getDatabaseIdHi()); Unsafe.getUnsafe().storeFence(); todoMem.putLong(24, todoTxn); return 0; } return todoMem.getLong(32); } else { TableUtils.resetTodoLog(ff, path, rootLen, todoMem); todoTxn = 0; return 0; } } finally { path.trimTo(rootLen); } } private void performRecovery() { rollbackIndexes(); rollbackSymbolTables(); performRecovery = false; } private void populateDenseIndexerList() { denseIndexers.clear(); for (int i = 0, n 
    = indexers.size(); i < n; i++) {
            ColumnIndexer indexer = indexers.getQuick(i);
            if (indexer != null) {
                denseIndexers.add(indexer);
            }
        }
        indexCount = denseIndexers.size();
    }

    void purgeUnusedPartitions() {
        if (partitionBy != PartitionBy.NONE) {
            removeNonAttachedPartitions();
        }
    }

    private long readMinTimestamp(long partitionTimestamp) {
        setStateForTimestamp(other, partitionTimestamp, false);
        try {
            dFile(other, metadata.getColumnName(metadata.getTimestampIndex()));
            if (ff.exists(other)) {
                // read min timestamp value
                final long fd = TableUtils.openRO(ff, other, LOG);
                try {
                    long n = ff.read(fd, tempMem16b, Long.BYTES, 0);
                    if (n != Long.BYTES) {
                        throw CairoException.instance(Os.errno()).put("could not read timestamp value");
                    }
                    return Unsafe.getUnsafe().getLong(tempMem16b);
                } finally {
                    ff.close(fd);
                }
            } else {
                throw CairoException.instance(0).put("Partition does not exist [path=").put(other).put(']');
            }
        } finally {
            other.trimTo(rootLen);
        }
    }

    private void recoverFromMetaRenameFailure(CharSequence columnName) {
        openMetaFile(ff, path, rootLen, metaMem);
    }

    private void recoverFromSwapRenameFailure(CharSequence columnName) {
        recoverFromTodoWriteFailure(columnName);
        clearTodoLog();
    }

    private void recoverFromSymbolMapWriterFailure(CharSequence columnName) {
        removeSymbolMapFilesQuiet(columnName);
        removeMetaFile();
        recoverFromSwapRenameFailure(columnName);
    }

    private void recoverFromTodoWriteFailure(CharSequence columnName) {
        restoreMetaFrom(META_PREV_FILE_NAME, metaPrevIndex);
        openMetaFile(ff, path, rootLen, metaMem);
    }

    private void recoverOpenColumnFailure(CharSequence columnName) {
        final int index = columnCount - 1;
        removeMetaFile();
        removeLastColumn();
        recoverFromSwapRenameFailure(columnName);
        removeSymbolMapWriter(index);
    }

    private void releaseLock(boolean distressed) {
        if (lockFd != -1L) {
            ff.close(lockFd);
            if (distressed) {
                return;
            }
            try {
                lockName(path);
                removeOrException(ff, path);
            } finally {
                path.trimTo(rootLen);
            }
        }
    }

    private void removeColumn(int columnIndex) {
        final int pi = getPrimaryColumnIndex(columnIndex);
        final int si = getSecondaryColumnIndex(columnIndex);
        freeAndRemoveColumnPair(columns, pi, si);
        freeAndRemoveColumnPair(o3Columns, pi, si);
        freeAndRemoveColumnPair(o3Columns2, pi, si);
        columnTops.removeIndex(columnIndex);
        nullSetters.remove(columnIndex);
        o3NullSetters.remove(columnIndex);
        if (columnIndex < indexers.size()) {
            Misc.free(indexers.getQuick(columnIndex));
            indexers.remove(columnIndex);
            populateDenseIndexerList();
        }
    }

    private void removeColumnFiles(CharSequence columnName, int columnType, RemoveFileLambda removeLambda) {
        try {
            ff.iterateDir(path.$(), (file, type) -> {
                nativeLPSZ.of(file);
                if (type == Files.DT_DIR && IGNORED_FILES.excludes(nativeLPSZ)) {
                    path.trimTo(rootLen);
                    path.concat(nativeLPSZ);
                    int plen = path.length();
                    removeLambda.remove(ff, dFile(path, columnName));
                    removeLambda.remove(ff, iFile(path.trimTo(plen), columnName));
                    removeLambda.remove(ff, topFile(path.trimTo(plen), columnName));
                    removeLambda.remove(ff, BitmapIndexUtils.keyFileName(path.trimTo(plen), columnName));
                    removeLambda.remove(ff, BitmapIndexUtils.valueFileName(path.trimTo(plen), columnName));
                }
            });

            if (columnType == ColumnType.SYMBOL) {
                removeLambda.remove(ff, SymbolMapWriter.offsetFileName(path.trimTo(rootLen), columnName));
                removeLambda.remove(ff, SymbolMapWriter.charFileName(path.trimTo(rootLen), columnName));
                removeLambda.remove(ff, BitmapIndexUtils.keyFileName(path.trimTo(rootLen), columnName));
                removeLambda.remove(ff, BitmapIndexUtils.valueFileName(path.trimTo(rootLen), columnName));
            }
        } finally
{ path.trimTo(rootLen); } } private int removeColumnFromMeta(int index) { try { int metaSwapIndex = openMetaSwapFile(ff, ddlMem, path, rootLen, fileOperationRetryCount); int timestampIndex = metaMem.getInt(META_OFFSET_TIMESTAMP_INDEX); ddlMem.putInt(columnCount - 1); ddlMem.putInt(partitionBy); if (timestampIndex == index) { ddlMem.putInt(-1); } else if (index < timestampIndex) { ddlMem.putInt(timestampIndex - 1); } else { ddlMem.putInt(timestampIndex); } ddlMem.putInt(ColumnType.VERSION); ddlMem.putInt(metaMem.getInt(META_OFFSET_TABLE_ID)); ddlMem.jumpTo(META_OFFSET_COLUMN_TYPES); for (int i = 0; i < columnCount; i++) { if (i != index) { writeColumnEntry(i); } } long nameOffset = getColumnNameOffset(columnCount); for (int i = 0; i < columnCount; i++) { CharSequence columnName = metaMem.getStr(nameOffset); if (i != index) { ddlMem.putStr(columnName); } nameOffset += VmUtils.getStorageLength(columnName); } return metaSwapIndex; } finally { ddlMem.close(); } } private void removeIndexFiles(CharSequence columnName) { try { ff.iterateDir(path.$(), (file, type) -> { nativeLPSZ.of(file); if (type == Files.DT_DIR && IGNORED_FILES.excludes(nativeLPSZ)) { path.trimTo(rootLen); path.concat(nativeLPSZ); int plen = path.length(); removeFileAndOrLog(ff, BitmapIndexUtils.keyFileName(path.trimTo(plen), columnName)); removeFileAndOrLog(ff, BitmapIndexUtils.valueFileName(path.trimTo(plen), columnName)); } }); } finally { path.trimTo(rootLen); } } private void removeLastColumn() { removeColumn(columnCount - 1); columnCount--; } private void removeMetaFile() { try { path.concat(META_FILE_NAME).$(); if (ff.exists(path) && !ff.remove(path)) { throw CairoException.instance(ff.errno()).put("Recovery failed. Cannot remove: ").put(path); } } finally { path.trimTo(rootLen); } } private void removeNonAttachedPartitions() { LOG.info().$("purging non attached partitions [path=").$(path.$()).$(']').$(); try { ff.iterateDir(path.$(), removePartitionDirsNotAttached); } finally { path.trimTo(rootLen); } } private void removePartitionDirectories() { try { ff.iterateDir(path.$(), removePartitionDirectories); } finally { path.trimTo(rootLen); } } private void removePartitionDirectories0(long name, int type) { path.trimTo(rootLen); path.concat(name).$(); nativeLPSZ.of(name); int errno; if (IGNORED_FILES.excludes(nativeLPSZ) && type == Files.DT_DIR && (errno = ff.rmdir(path)) != 0) { LOG.info().$("could not remove [path=").$(path).$(", errno=").$(errno).$(']').$(); } } private void removePartitionDirsNotAttached(long pName, int type) { nativeLPSZ.of(pName); if (!isDots(nativeLPSZ) && type == Files.DT_DIR) { if (Chars.endsWith(nativeLPSZ, DETACHED_DIR_MARKER)) { // Do not remove detached partitions // They are probably about to be attached. return; } try { long txn = 0; int txnSep = Chars.indexOf(nativeLPSZ, '.'); if (txnSep < 0) { txnSep = nativeLPSZ.length(); } else { txn = Numbers.parseLong(nativeLPSZ, txnSep + 1, nativeLPSZ.length()); } long dirTimestamp = partitionDirFmt.parse(nativeLPSZ, 0, txnSep, null); if (txn <= txFile.txn && (txFile.attachedPartitionsContains(dirTimestamp) || txFile.isActivePartition(dirTimestamp))) { return; } } catch (NumericException ignore) { // not a date? 
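            // (a directory whose name does not parse as a partition timestamp, e.g. a stray
            // leftover from interrupted OOO processing, lands here)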
// ignore exception and remove directory // we rely on this behaviour to remove leftover directories created by OOO processing } path.trimTo(rootLen); path.concat(pName).$(); int errno; if ((errno = ff.rmdir(path)) == 0) { LOG.info().$("removed partition dir: ").$(path).$(); } else { LOG.error().$("cannot remove: ").$(path).$(" [errno=").$(errno).$(']').$(); } } } private void removeSymbolMapFilesQuiet(CharSequence name) { try { removeFileAndOrLog(ff, SymbolMapWriter.offsetFileName(path.trimTo(rootLen), name)); removeFileAndOrLog(ff, SymbolMapWriter.charFileName(path.trimTo(rootLen), name)); removeFileAndOrLog(ff, BitmapIndexUtils.keyFileName(path.trimTo(rootLen), name)); removeFileAndOrLog(ff, BitmapIndexUtils.valueFileName(path.trimTo(rootLen), name)); } finally { path.trimTo(rootLen); } } private void removeSymbolMapWriter(int index) { SymbolMapWriter writer = symbolMapWriters.getQuick(index); symbolMapWriters.remove(index); if (writer != null) { int symColIndex = denseSymbolMapWriters.remove(writer); denseSymbolTransientCountHandlers.remove(symColIndex); // Shift all subsequent symbol indexes by 1 back while (symColIndex < denseSymbolTransientCountHandlers.size()) { WriterTransientSymbolCountChangeHandler transientCountHandler = denseSymbolTransientCountHandlers.getQuick(symColIndex); assert transientCountHandler.symColIndex - 1 == symColIndex; transientCountHandler.symColIndex = symColIndex; symColIndex++; } Misc.free(writer); } } private int rename(int retries) { try { int index = 0; other.concat(META_PREV_FILE_NAME).$(); path.concat(META_FILE_NAME).$(); int l = other.length(); do { if (index > 0) { other.trimTo(l); other.put('.').put(index); other.$(); } if (ff.exists(other) && !ff.remove(other)) { LOG.info().$("cannot remove target of rename '").$(path).$("' to '").$(other).$(" [errno=").$(ff.errno()).$(']').$(); index++; continue; } if (!ff.rename(path, other)) { LOG.info().$("cannot rename '").$(path).$("' to '").$(other).$(" [errno=").$(ff.errno()).$(']').$(); index++; continue; } return index; } while (index < retries); throw CairoException.instance(0).put("Cannot rename ").put(path).put(". Max number of attempts reached [").put(index).put("]. 
Last target was: ").put(other); } finally { path.trimTo(rootLen); other.trimTo(rootLen); } } private void renameColumnFiles(CharSequence columnName, CharSequence newName, int columnType) { try { ff.iterateDir(path.$(), (file, type) -> { nativeLPSZ.of(file); if (type == Files.DT_DIR && IGNORED_FILES.excludes(nativeLPSZ)) { path.trimTo(rootLen); path.concat(nativeLPSZ); other.trimTo(rootLen); other.concat(nativeLPSZ); int plen = path.length(); renameFileOrLog(ff, dFile(path.trimTo(plen), columnName), dFile(other.trimTo(plen), newName)); renameFileOrLog(ff, iFile(path.trimTo(plen), columnName), iFile(other.trimTo(plen), newName)); renameFileOrLog(ff, topFile(path.trimTo(plen), columnName), topFile(other.trimTo(plen), newName)); renameFileOrLog(ff, BitmapIndexUtils.keyFileName(path.trimTo(plen), columnName), BitmapIndexUtils.keyFileName(other.trimTo(plen), newName)); renameFileOrLog(ff, BitmapIndexUtils.valueFileName(path.trimTo(plen), columnName), BitmapIndexUtils.valueFileName(other.trimTo(plen), newName)); } }); if (columnType == ColumnType.SYMBOL) { renameFileOrLog(ff, SymbolMapWriter.offsetFileName(path.trimTo(rootLen), columnName), SymbolMapWriter.offsetFileName(other.trimTo(rootLen), newName)); renameFileOrLog(ff, SymbolMapWriter.charFileName(path.trimTo(rootLen), columnName), SymbolMapWriter.charFileName(other.trimTo(rootLen), newName)); renameFileOrLog(ff, BitmapIndexUtils.keyFileName(path.trimTo(rootLen), columnName), BitmapIndexUtils.keyFileName(other.trimTo(rootLen), newName)); renameFileOrLog(ff, BitmapIndexUtils.valueFileName(path.trimTo(rootLen), columnName), BitmapIndexUtils.valueFileName(other.trimTo(rootLen), newName)); } } finally { path.trimTo(rootLen); other.trimTo(rootLen); } } private int renameColumnFromMeta(int index, CharSequence newName) { try { int metaSwapIndex = openMetaSwapFile(ff, ddlMem, path, rootLen, fileOperationRetryCount); int timestampIndex = metaMem.getInt(META_OFFSET_TIMESTAMP_INDEX); ddlMem.putInt(columnCount); ddlMem.putInt(partitionBy); ddlMem.putInt(timestampIndex); ddlMem.putInt(ColumnType.VERSION); ddlMem.putInt(metaMem.getInt(META_OFFSET_TABLE_ID)); ddlMem.jumpTo(META_OFFSET_COLUMN_TYPES); for (int i = 0; i < columnCount; i++) { writeColumnEntry(i); } long nameOffset = getColumnNameOffset(columnCount); for (int i = 0; i < columnCount; i++) { CharSequence columnName = metaMem.getStr(nameOffset); nameOffset += VmUtils.getStorageLength(columnName); if (i == index) { columnName = newName; } ddlMem.putStr(columnName); } return metaSwapIndex; } finally { ddlMem.close(); } } private void renameMetaToMetaPrev(CharSequence columnName) { try { this.metaPrevIndex = rename(fileOperationRetryCount); } catch (CairoException e) { runFragile(RECOVER_FROM_META_RENAME_FAILURE, columnName, e); } } private void renameSwapMetaToMeta(CharSequence columnName) { // rename _meta.swp to _meta try { restoreMetaFrom(META_SWAP_FILE_NAME, metaSwapIndex); } catch (CairoException e) { runFragile(RECOVER_FROM_SWAP_RENAME_FAILURE, columnName, e); } } private long repairDataGaps(final long timestamp) { if (txFile.getMaxTimestamp() != Numbers.LONG_NaN && partitionBy != PartitionBy.NONE) { long actualSize = 0; long lastTimestamp = -1; long transientRowCount = this.txFile.getTransientRowCount(); long maxTimestamp = this.txFile.getMaxTimestamp(); try { final long tsLimit = timestampFloorMethod.floor(this.txFile.getMaxTimestamp()); for (long ts = getPartitionLo(txFile.getMinTimestamp()); ts < tsLimit; ts = timestampAddMethod.calculate(ts, 1)) { path.trimTo(rootLen); 

    private long repairDataGaps(final long timestamp) {
        if (txFile.getMaxTimestamp() != Numbers.LONG_NaN && partitionBy != PartitionBy.NONE) {
            long actualSize = 0;
            long lastTimestamp = -1;
            long transientRowCount = this.txFile.getTransientRowCount();
            long maxTimestamp = this.txFile.getMaxTimestamp();
            try {
                final long tsLimit = timestampFloorMethod.floor(this.txFile.getMaxTimestamp());
                for (long ts = getPartitionLo(txFile.getMinTimestamp()); ts < tsLimit; ts = timestampAddMethod.calculate(ts, 1)) {
                    path.trimTo(rootLen);
                    setStateForTimestamp(path, ts, false);
                    int p = path.length();
                    long partitionSize = txFile.getPartitionSizeByPartitionTimestamp(ts);
                    if (partitionSize >= 0 && ff.exists(path.$())) {
                        actualSize += partitionSize;
                        lastTimestamp = ts;
                    } else {
                        Path other = Path.getThreadLocal2(path.trimTo(p).$());
                        TableUtils.oldPartitionName(other, getTxn());
                        if (ff.exists(other.$())) {
                            if (!ff.rename(other, path)) {
                                LOG.error().$("could not rename [from=").$(other).$(", to=").$(path).$(']').$();
                                throw new CairoError("could not restore directory, see log for details");
                            } else {
                                LOG.info().$("restored [path=").$(path).$(']').$();
                            }
                        } else {
                            LOG.info().$("missing partition [name=").$(path.trimTo(p).$()).$(']').$();
                        }
                    }
                }

                if (lastTimestamp > -1) {
                    path.trimTo(rootLen);
                    setStateForTimestamp(path, tsLimit, false);
                    if (!ff.exists(path.$())) {
                        Path other = Path.getThreadLocal2(path);
                        TableUtils.oldPartitionName(other, getTxn());
                        if (ff.exists(other.$())) {
                            if (!ff.rename(other, path)) {
                                LOG.error().$("could not rename [from=").$(other).$(", to=").$(path).$(']').$();
                                throw new CairoError("could not restore directory, see log for details");
                            } else {
                                LOG.info().$("restored [path=").$(path).$(']').$();
                            }
                        } else {
                            LOG.error().$("last partition does not exist [name=").$(path).$(']').$();
                            // ok, fall back to the last partition we discovered and make it the active one
                            // 1. read its size
                            path.trimTo(rootLen);
                            setStateForTimestamp(path, lastTimestamp, false);
                            int p = path.length();
                            transientRowCount = txFile.getPartitionSizeByPartitionTimestamp(lastTimestamp);
                            // 2. read max timestamp
                            TableUtils.dFile(path.trimTo(p), metadata.getColumnName(metadata.getTimestampIndex()));
                            maxTimestamp = TableUtils.readLongAtOffset(ff, path, tempMem16b, (transientRowCount - 1) * Long.BYTES);
                            actualSize -= transientRowCount;
                            LOG.info()
                                    .$("updated active partition [name=").$(path.trimTo(p).$())
                                    .$(", maxTimestamp=").$ts(maxTimestamp)
                                    .$(", transientRowCount=").$(transientRowCount)
                                    .$(", fixedRowCount=").$(txFile.getFixedRowCount())
                                    .$(']').$();
                        }
                    }
                }
            } finally {
                path.trimTo(rootLen);
            }

            final long expectedSize = txFile.readFixedRowCount();
            if (expectedSize != actualSize || maxTimestamp != this.txFile.getMaxTimestamp()) {
                LOG.info()
                        .$("actual table size has been adjusted [name=`").utf8(tableName).$('`')
                        .$(", expectedFixedSize=").$(expectedSize)
                        .$(", actualFixedSize=").$(actualSize)
                        .$(']').$();
                txFile.reset(actualSize, transientRowCount, maxTimestamp);
                return maxTimestamp;
            }
        }
        return timestamp;
    }
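
    // Worked example for repairDataGaps(), assuming DAY partitioning and partitions
    // 2020-01-01 .. 2020-01-04 recorded in the tx file: the loop walks each day below the active
    // partition, summing sizes of directories that exist and renaming back any directory that only
    // survives under its old txn name. If the active partition directory itself is gone, the newest
    // directory that does exist becomes active: its row count becomes transientRowCount, its max
    // timestamp is re-read from the designated timestamp column, and the tx file is reset to the
    // adjusted totals.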
Cannot rename ").put(path).put(" -> ").put(other); } } } finally { path.trimTo(rootLen); other.trimTo(rootLen); } clearTodoLog(); } private void repairTruncate() { LOG.info().$("repairing abnormally terminated truncate on ").$(path).$(); if (partitionBy != PartitionBy.NONE) { removePartitionDirectories(); } txFile.reset(); clearTodoLog(); } private void restoreMetaFrom(CharSequence fromBase, int fromIndex) { try { path.concat(fromBase); if (fromIndex > 0) { path.put('.').put(fromIndex); } path.$(); TableUtils.renameOrFail(ff, path, other.concat(META_FILE_NAME).$()); } finally { path.trimTo(rootLen); other.trimTo(rootLen); } } private void rollbackIndexes() { final long maxRow = txFile.getTransientRowCount() - 1; for (int i = 0, n = denseIndexers.size(); i < n; i++) { ColumnIndexer indexer = denseIndexers.getQuick(i); long fd = indexer.getFd(); LOG.info().$("recovering index [fd=").$(fd).$(']').$(); if (fd > -1) { indexer.rollback(maxRow); } } } private void rollbackSymbolTables() { int expectedMapWriters = txFile.readWriterCount(); for (int i = 0; i < expectedMapWriters; i++) { denseSymbolMapWriters.getQuick(i).rollback(txFile.readSymbolWriterIndexOffset(i)); } } private void runFragile(FragileCode fragile, CharSequence columnName, CairoException e) { try { fragile.run(columnName); } catch (CairoException err) { LOG.error().$("DOUBLE ERROR: 1st: {").$((Sinkable) e).$('}').$(); throwDistressException(err); } throw e; } private void setAppendPosition(final long position, boolean ensureFileSize) { for (int i = 0; i < columnCount; i++) { // stop calculating oversize as soon as we find first over-sized column setColumnSize( ff, getPrimaryColumn(i), getSecondaryColumn(i), getColumnType(metaMem, i), position - columnTops.getQuick(i), tempMem16b, ensureFileSize ); } } /** * Sets path member variable to partition directory for the given timestamp and * partitionLo and partitionHi to partition interval in millis. These values are * determined based on input timestamp and value of partitionBy. For any given * timestamp this method will determine either day, month or year interval timestamp falls to. * Partition directory name is ISO string of interval start. *

    /**
     * Sets the path member variable to the partition directory for the given timestamp, and
     * partitionLo and partitionHi to the partition interval in micros. These values are
     * determined based on the input timestamp and the value of partitionBy. For any given
     * timestamp this method will determine the day, month or year interval the timestamp falls into.
     * The partition directory name is the ISO string of the interval start.
     * <p>
     * Because this method modifies the "path" member variable, be sure path is trimmed back to its
     * original state within a try..finally block.
     *
     * @param path                    path instance to modify
     * @param timestamp               timestamp to determine the interval for
     * @param updatePartitionInterval flag indicating that the partition interval fields, partitionLo and
     *                                partitionHi, should be updated as well
     */
    private void setStateForTimestamp(Path path, long timestamp, boolean updatePartitionInterval) {
        final long partitionTimestampHi = TableUtils.setPathForPartition(path, partitionBy, timestamp, true);
        TableUtils.txnPartitionConditionally(
                path,
                txFile.getPartitionNameTxnByPartitionTimestamp(partitionTimestampHi)
        );
        if (updatePartitionInterval) {
            this.partitionTimestampHi = partitionTimestampHi;
        }
    }

    void startAppendedBlock(long timestampLo, long timestampHi, long nRowsAdded, LongList blockColumnTops) {
        if (timestampLo < txFile.getMaxTimestamp()) {
            throw CairoException.instance(ff.errno()).put("Cannot insert rows out of order. Table=").put(path);
        }

        if (txFile.getAppendedPartitionCount() == 0) {
            openFirstPartition(timestampLo);
        }

        if (partitionBy != PartitionBy.NONE && timestampLo > partitionTimestampHi) {
            // need to close memory without truncating
            for (int columnIndex = 0; columnIndex < columnCount; columnIndex++) {
                AppendOnlyVirtualMemory mem1 = getPrimaryColumn(columnIndex);
                mem1.close(false);
                AppendOnlyVirtualMemory mem2 = getSecondaryColumn(columnIndex);
                if (null != mem2) {
                    mem2.close(false);
                }
            }
            switchPartition(timestampLo);
        }
        this.txFile.appendBlock(timestampLo, timestampHi, nRowsAdded);

        for (int columnIndex = 0; columnIndex < columnCount; columnIndex++) {
            // handle column tops
            long blockColumnTop = blockColumnTops.getQuick(columnIndex);
            if (blockColumnTop != -1) {
                long columnTop = columnTops.getQuick(columnIndex);
                if (blockColumnTop != columnTop) {
                    try {
                        assert columnTop == 0;
                        assert blockColumnTop > 0;
                        TableUtils.setPathForPartition(path, partitionBy, timestampLo, false);
                        columnTops.setQuick(columnIndex, blockColumnTop);
                        writeColumnTop(getMetadata().getColumnName(columnIndex), blockColumnTop);
                    } finally {
                        path.trimTo(rootLen);
                    }
                }
            }
        }
    }

    private void switchPartition(long timestamp) {
        // Before the partition can be switched we need to index records added so far.
        // Index writers will start pointing to different files after the switch.
        updateIndexes();
        txFile.switchPartitions(timestamp);
        openPartition(timestamp);
        setAppendPosition(0, false);
    }
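
    // Example for setStateForTimestamp(), assuming PartitionBy.DAY: for any timestamp that falls on
    // 2020-03-15, path is pointed at the "2020-03-15" partition directory (with a ".<txn>" suffix
    // when txnPartitionConditionally() reports a renamed partition version), and when
    // updatePartitionInterval is true, partitionTimestampHi becomes the last timestamp of that day.
    // SwitchPartitionRowFunction below compares appended timestamps against partitionTimestampHi to
    // detect that a row has crossed into a new partition and that switchPartition() must run.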

    private void syncColumns(int commitMode) {
        final boolean async = commitMode == CommitMode.ASYNC;
        for (int i = 0; i < columnCount; i++) {
            columns.getQuick(i * 2).sync(async);
            final AppendOnlyVirtualMemory m2 = columns.getQuick(i * 2 + 1);
            if (m2 != null) {
                m2.sync(false);
            }
        }
    }

    private void throwDistressException(Throwable cause) {
        this.distressed = true;
        throw new CairoError(cause);
    }

    private void updateIndexes() {
        if (indexCount == 0 || avoidIndexOnCommit) {
            avoidIndexOnCommit = false;
            return;
        }
        updateIndexesSlow();
    }

    private void updateIndexesParallel(long lo, long hi) {
        indexSequences.clear();
        indexLatch.setCount(indexCount);
        final int nParallelIndexes = indexCount - 1;
        final Sequence indexPubSequence = this.messageBus.getIndexerPubSequence();
        final RingQueue<ColumnIndexerTask> indexerQueue = this.messageBus.getIndexerQueue();

        LOG.info().$("parallel indexing [indexCount=").$(indexCount).$(']').$();
        int serialIndexCount = 0;

        // we are going to index the last column in this thread while other columns are on the queue
        OUT:
        for (int i = 0; i < nParallelIndexes; i++) {
            long cursor = indexPubSequence.next();
            if (cursor == -1) {
                // queue is full, process index in the current thread
                indexAndCountDown(denseIndexers.getQuick(i), lo, hi, indexLatch);
                serialIndexCount++;
                continue;
            }

            if (cursor == -2) {
                // CAS issue, retry
                do {
                    cursor = indexPubSequence.next();
                    if (cursor == -1) {
                        indexAndCountDown(denseIndexers.getQuick(i), lo, hi, indexLatch);
                        serialIndexCount++;
                        continue OUT;
                    }
                } while (cursor < 0);
            }

            final ColumnIndexerTask queueItem = indexerQueue.get(cursor);
            final ColumnIndexer indexer = denseIndexers.getQuick(i);
            final long sequence = indexer.getSequence();
            queueItem.indexer = indexer;
            queueItem.lo = lo;
            queueItem.hi = hi;
            queueItem.countDownLatch = indexLatch;
            queueItem.sequence = sequence;
            indexSequences.add(sequence);
            indexPubSequence.done(cursor);
        }

        // index the last column while other columns are brewing on the queue
        indexAndCountDown(denseIndexers.getQuick(indexCount - 1), lo, hi, indexLatch);
        serialIndexCount++;

        // At this point we have re-indexed our column and if things are flowing nicely
        // all other columns should have been done by other threads. Instead of actually
        // waiting we gracefully check latch count.
        if (!indexLatch.await(configuration.getWorkStealTimeoutNanos())) {
            // other columns are still in-flight, we must attempt to steal work from other threads
            for (int i = 0; i < nParallelIndexes; i++) {
                ColumnIndexer indexer = denseIndexers.getQuick(i);
                if (indexer.tryLock(indexSequences.getQuick(i))) {
                    indexAndCountDown(indexer, lo, hi, indexLatch);
                    serialIndexCount++;
                }
            }
            // wait for the ones we cannot steal
            indexLatch.await();
        }

        // check for distress on completed indexers
        boolean distressed = false;
        for (int i = 0; i < indexCount; i++) {
            ColumnIndexer indexer = denseIndexers.getQuick(i);
            distressed = distressed | indexer.isDistressed();
        }

        if (distressed) {
            throwDistressException(null);
        }

        LOG.info().$("parallel indexing done [serialCount=").$(serialIndexCount).$(']').$();
    }
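
    // Shape of updateIndexesParallel() with, say, indexCount = 3: indexers 0 and 1 are published to
    // the indexer queue for worker threads, indexer 2 runs on the calling thread, and the latch
    // starts at 3. If workers are slow, the caller attempts to steal each published indexer back via
    // tryLock(sequence) before finally awaiting the latch, so progress never depends on a worker
    // actually picking up a task.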

    private void updateIndexesSerially(long lo, long hi) {
        LOG.info().$("serial indexing [indexCount=").$(indexCount).$(']').$();
        for (int i = 0, n = indexCount; i < n; i++) {
            try {
                denseIndexers.getQuick(i).refreshSourceAndIndex(lo, hi);
            } catch (CairoException e) {
                // this is pretty severe, we hit some sort of a limit
                LOG.error().$("index error {").$((Sinkable) e).$('}').$();
                throwDistressException(e);
            }
        }
        LOG.info().$("serial indexing done [indexCount=").$(indexCount).$(']').$();
    }

    private void updateIndexesSlow() {
        final long hi = txFile.getTransientRowCount();
        final long lo = txFile.getAppendedPartitionCount() == 1 ? hi - txFile.getLastTxSize() : 0;
        if (indexCount > 1 && parallelIndexerEnabled && hi - lo > configuration.getParallelIndexThreshold()) {
            updateIndexesParallel(lo, hi);
        } else {
            updateIndexesSerially(lo, hi);
        }
    }

    private void updateMaxTimestamp(long timestamp) {
        txFile.updateMaxTimestamp(timestamp);
        this.timestampSetter.accept(timestamp);
    }

    private void validateSwapMeta(CharSequence columnName) {
        try {
            try {
                path.concat(META_SWAP_FILE_NAME);
                if (metaSwapIndex > 0) {
                    path.put('.').put(metaSwapIndex);
                }
                metaMem.of(ff, path.$(), ff.getPageSize(), ff.length(path));
                validationMap.clear();
                validate(ff, metaMem, validationMap);
            } finally {
                metaMem.close();
                path.trimTo(rootLen);
            }
        } catch (CairoException e) {
            runFragile(RECOVER_FROM_META_RENAME_FAILURE, columnName, e);
        }
    }

    private void writeColumnEntry(int i) {
        ddlMem.putByte((byte) getColumnType(metaMem, i));
        long flags = 0;
        if (isColumnIndexed(metaMem, i)) {
            flags |= META_FLAG_BIT_INDEXED;
        }

        if (isSequential(metaMem, i)) {
            flags |= META_FLAG_BIT_SEQUENTIAL;
        }

        ddlMem.putLong(flags);
        ddlMem.putInt(getIndexBlockCapacity(metaMem, i));
        ddlMem.skip(META_COLUMN_DATA_RESERVED);
    }

    private void writeColumnTop(CharSequence name) {
        writeColumnTop(name, txFile.getTransientRowCount());
    }

    private void writeColumnTop(CharSequence name, long columnTop) {
        TableUtils.writeColumnTop(
                ff,
                path,
                name,
                columnTop,
                tempMem16b
        );
    }

    private void writeRestoreMetaTodo(CharSequence columnName) {
        try {
            todoMem.putLong(0, ++todoTxn); // write txn, reader will first read txn at offset 24 and then at offset 0
            Unsafe.getUnsafe().storeFence(); // make sure we do not write hash before writing txn (view from another thread)
            todoMem.putLong(8, configuration.getDatabaseIdLo()); // write out our instance hashes
            todoMem.putLong(16, configuration.getDatabaseIdHi());
            Unsafe.getUnsafe().storeFence();
            todoMem.putLong(32, 1);
            todoMem.putLong(40, TODO_RESTORE_META);
            todoMem.putLong(48, metaPrevIndex);
            Unsafe.getUnsafe().storeFence();
            todoMem.putLong(24, todoTxn);
            todoMem.setSize(56);
        } catch (CairoException e) {
            runFragile(RECOVER_FROM_TODO_WRITE_FAILURE, columnName, e);
        }
    }
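
    // Byte layout of the todo record written by writeRestoreMetaTodo() above (offsets into todoMem):
    //   0  = todoTxn (written first)
    //   8  = databaseIdLo, 16 = databaseIdHi (instance hashes)
    //   32 = record count (1)
    //   40 = task code (TODO_RESTORE_META)
    //   48 = payload (metaPrevIndex)
    //   24 = todoTxn again, written last behind store fences; a reader compares offsets 24 and 0
    //        to detect a torn write, per the inline comments.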

    @FunctionalInterface
    private interface RemoveFileLambda {
        void remove(FilesFacade ff, LPSZ name);
    }

    @FunctionalInterface
    private interface FragileCode {
        void run(CharSequence columnName);
    }

    @FunctionalInterface
    public interface O3ColumnUpdateMethod {
        void run(
                int columnIndex,
                final int columnType,
                long mergedTimestampsAddr,
                long valueCount
        );
    }

    static class TimestampValueRecord implements Record {
        private long value;

        @Override
        public long getTimestamp(int col) {
            return value;
        }

        public void setTimestamp(long value) {
            this.value = value;
        }
    }

    private class OpenPartitionRowFunction implements RowFunction {
        @Override
        public Row newRow(long timestamp) {
            if (txFile.getMaxTimestamp() != Long.MIN_VALUE) {
                return (rowFunction = switchPartitionFunction).newRow(timestamp);
            }
            return getRowSlow(timestamp);
        }

        private Row getRowSlow(long timestamp) {
            txFile.setMinTimestamp(timestamp);
            openFirstPartition(timestamp);
            return (rowFunction = switchPartitionFunction).newRow(timestamp);
        }
    }

    private class NoPartitionFunction implements RowFunction {
        @Override
        public Row newRow(long timestamp) {
            bumpMasterRef();
            txFile.append();
            if (timestamp >= txFile.getMaxTimestamp()) {
                updateMaxTimestamp(timestamp);
                return row;
            }
            throw CairoException.instance(ff.errno()).put("Cannot insert rows out of order. Table=").put(path);
        }
    }

    private class NoTimestampFunction implements RowFunction {
        @Override
        public Row newRow(long timestamp) {
            bumpMasterRef();
            txFile.append();
            return row;
        }
    }

    private class O3PartitionFunction implements RowFunction {
        @Override
        public Row newRow(long timestamp) {
            bumpMasterRef();
            o3TimestampSetter(timestamp);
            return row;
        }
    }

    private class SwitchPartitionRowFunction implements RowFunction {
        @Override
        public Row newRow(long timestamp) {
            bumpMasterRef();
            if (timestamp > partitionTimestampHi || timestamp < txFile.getMaxTimestamp()) {
                return newRow0(timestamp);
            }
            updateMaxTimestamp(timestamp);
            txFile.append();
            return row;
        }

        @NotNull
        private Row newRow0(long timestamp) {
            if (timestamp < txFile.getMaxTimestamp()) {
                return newRowO3(timestamp);
            }

            if (timestamp > partitionTimestampHi && partitionBy != PartitionBy.NONE) {
                switchPartition(timestamp);
            }

            updateMaxTimestamp(timestamp);
            txFile.append();
            return row;
        }

        private Row newRowO3(long timestamp) {
            LOG.info().$("switched to o3 [table=").utf8(tableName).$(']').$();
            txFile.beginPartitionSizeUpdate();
            o3OpenColumns();
            o3InError = false;
            o3MasterRef = masterRef;
            o3TimestampSetter(timestamp);
            rowFunction = o3RowFunction;
            return row;
        }
    }

    public class Row {
        private ObjList<? extends WriteOnlyVirtualMemory> activeColumns;
        private ObjList<Runnable> activeNullSetters;

        public void append() {
            if ((masterRef & 1) != 0) {
                for (int i = 0; i < columnCount; i++) {
                    if (refs.getQuick(i) < masterRef) {
                        activeNullSetters.getQuick(i).run();
                    }
                }
                masterRef++;
            }
        }

        public void cancel() {
            cancelRow();
        }

        public void putBin(int index, long address, long len) {
            getSecondaryColumn(index).putLong(getPrimaryColumn(index).putBin(address, len));
            notNull(index);
        }

        public void putBin(int index, BinarySequence sequence) {
            getSecondaryColumn(index).putLong(getPrimaryColumn(index).putBin(sequence));
            notNull(index);
        }

        public void putBool(int index, boolean value) {
            getPrimaryColumn(index).putBool(value);
            notNull(index);
        }

        public void putByte(int index, byte value) {
            getPrimaryColumn(index).putByte(value);
            notNull(index);
        }

        public void putChar(int index, char value) {
            getPrimaryColumn(index).putChar(value);
            notNull(index);
        }

        public void putDate(int index, long value) {
            putLong(index, value);
        }

        public void putDouble(int index, double value) {
            getPrimaryColumn(index).putDouble(value);
            notNull(index);
        }

        public void putFloat(int index, float value) {
            getPrimaryColumn(index).putFloat(value);
            notNull(index);
        }

        public void putInt(int index, int value) {
            getPrimaryColumn(index).putInt(value);
            notNull(index);
        }

        public void putLong(int index, long value) {
            getPrimaryColumn(index).putLong(value);
            notNull(index);
        }

        public void putLong256(int index, long l0, long l1, long l2, long l3) {
            getPrimaryColumn(index).putLong256(l0, l1, l2, l3);
            notNull(index);
        }

        public void putLong256(int index, Long256 value) {
            getPrimaryColumn(index).putLong256(value.getLong0(), value.getLong1(), value.getLong2(), value.getLong3());
            notNull(index);
        }

        public void putLong256(int index, CharSequence hexString) {
            getPrimaryColumn(index).putLong256(hexString);
            notNull(index);
        }

        public void putLong256(int index, @NotNull CharSequence hexString, int start, int end) {
            getPrimaryColumn(index).putLong256(hexString, start, end);
            notNull(index);
        }

        public void putShort(int index, short value) {
            getPrimaryColumn(index).putShort(value);
            notNull(index);
        }

        public void putStr(int index, CharSequence value) {
            getSecondaryColumn(index).putLong(getPrimaryColumn(index).putStr(value));
            notNull(index);
        }

        public void putStr(int index, char value) {
            getSecondaryColumn(index).putLong(getPrimaryColumn(index).putStr(value));
            notNull(index);
        }

        public void putStr(int index, CharSequence value, int pos, int len) {
            getSecondaryColumn(index).putLong(getPrimaryColumn(index).putStr(value, pos, len));
            notNull(index);
        }

        public void putSym(int index, CharSequence value) {
            getPrimaryColumn(index).putInt(symbolMapWriters.getQuick(index).put(value));
            notNull(index);
        }

        public void putSym(int index, char value) {
            getPrimaryColumn(index).putInt(symbolMapWriters.getQuick(index).put(value));
            notNull(index);
        }

        public void putSymIndex(int index, int symIndex) {
            getPrimaryColumn(index).putInt(symIndex);
            notNull(index);
        }

        public void putTimestamp(int index, long value) {
            putLong(index, value);
        }

        public void putTimestamp(int index, CharSequence value) {
            // try UTC timestamp first (micro)
            long l;
            try {
                l = value != null ? IntervalUtils.parseFloorPartialDate(value) : Numbers.LONG_NaN;
            } catch (NumericException e) {
                throw CairoException.instance(0).put("Invalid timestamp: ").put(value);
            }
            putTimestamp(index, l);
        }

        private WriteOnlyVirtualMemory getPrimaryColumn(int columnIndex) {
            return activeColumns.getQuick(getPrimaryColumnIndex(columnIndex));
        }

        private WriteOnlyVirtualMemory getSecondaryColumn(int columnIndex) {
            return activeColumns.getQuick(getSecondaryColumnIndex(columnIndex));
        }

        private void notNull(int index) {
            refs.setQuick(index, masterRef);
        }
    }

    private class WriterTransientSymbolCountChangeHandler implements TransientSymbolCountChangeHandler {
        private int symColIndex;

        private WriterTransientSymbolCountChangeHandler(int symColIndex) {
            this.symColIndex = symColIndex;
        }

        @Override
        public void handleTransientSymbolCountChange(int symbolCount) {
            Unsafe.getUnsafe().storeFence();
            txFile.writeTransientSymbolCount(symColIndex, symbolCount);
        }
    }

    static {
        IGNORED_FILES.add("..");
        IGNORED_FILES.add(".");
        IGNORED_FILES.add(META_FILE_NAME);
        IGNORED_FILES.add(TXN_FILE_NAME);
        IGNORED_FILES.add(TODO_FILE_NAME);
    }
}
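
// Usage sketch for the Row API defined above (illustrative only; how the writer is obtained and the
// column indices are assumptions, not part of this file):
//
//     try (TableWriter writer = cairoEngine.getWriter(securityContext, "trades")) {
//         TableWriter.Row r = writer.newRow(timestampMicros); // routes through the active RowFunction
//         r.putInt(0, 42);            // fixed-size value goes to the primary column file
//         r.putStr(1, "abc");         // var-size value also records an offset in the secondary file
//         r.append();                 // seals the row; columns left unset get their null setters run
//         writer.commit();
//     }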




