
// io.questdb.cairo.ColumnVersionWriter (Maven / Gradle / Ivy)
/*******************************************************************************
*     ___                  _   ____  ____
*    / _ \ _   _  ___  ___| |_|  _ \| __ )
*   | | | | | | |/ _ \/ __| __| | | |  _ \
*   | |_| | |_| |  __/\__ \ |_| |_| | |_) |
*    \__\_\\__,_|\___||___/\__|____/|____/
*
* Copyright (c) 2014-2019 Appsicle
* Copyright (c) 2019-2023 QuestDB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package io.questdb.cairo;
import io.questdb.cairo.vm.Vm;
import io.questdb.cairo.vm.api.MemoryCMARW;
import io.questdb.std.FilesFacade;
import io.questdb.std.LongList;
import io.questdb.std.MemoryTag;
import io.questdb.std.Unsafe;
import io.questdb.std.str.LPSZ;
/**
 * Writable counterpart of {@link ColumnVersionReader}.
 * <p>
 * Changes are applied to the in-memory {@code cachedColumnVersionList} inherited from the reader
 * and are only written to the backing file when {@link #commit()} is called. The list is organised
 * in blocks of {@code BLOCK_SIZE} longs: partition timestamp, column index, column name txn and
 * column top.
 * <p>
 * The file header holds two (offset, size) pairs, "A" and "B". Which pair is treated as current
 * is derived from the parity of {@link #getVersion()}; a commit fills the other pair and then
 * bumps the version.
 */
public class ColumnVersionWriter extends ColumnVersionReader {
    private final CairoConfiguration configuration;
    private final MemoryCMARW mem;
    private final boolean partitioned;
    private boolean hasChanges;
    private long size;
    private long version;

    // size should be read from the transaction file
    // it can be zero when there are no columns deviating from the main
    // data branch

    /**
     * Opens the column version file and attaches it to the reader part of this object.
     * When the opened memory reports a non-zero size, the current on-disk state is loaded
     * straight away and its version becomes the writer's version.
     *
     * @param configuration source of the files facade and of the commit mode used by {@link #commit()}
     * @param fileName      path of the column version file
     * @param partitioned   whether the owning table is partitioned; affects {@link #truncate()} and
     *                      is asserted by {@link #upsertColumnTop(long, int, long)}
     */
    public ColumnVersionWriter(CairoConfiguration configuration, LPSZ fileName, boolean partitioned) {
        this.configuration = configuration;
        this.partitioned = partitioned;
        final FilesFacade ff = configuration.getFilesFacade();
        this.mem = Vm.getCMARWInstance(ff, fileName, ff.getPageSize(), 0, MemoryTag.MMAP_TABLE_READER, CairoConfiguration.O_NONE);
        this.size = this.mem.size();
        super.ofRO(mem);
        if (this.size > 0) {
            // deliberately the superclass read: only the version needs to be captured here
            this.version = super.readUnsafe();
        }
    }

    /**
     * Not supported by the writer.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void clear() {
        throw new UnsupportedOperationException();
    }

    /**
     * Closes the backing memory.
     */
    @Override
    public void close() {
        // NOTE(review): 'false' presumably means "do not truncate the file on close" - confirm against MemoryCMARW
        mem.close(false);
    }

    /**
     * Writes the cached column versions to the file if anything was modified since the last
     * commit or read; otherwise does nothing.
     */
    public void commit() {
        if (hasChanges) {
            doCommit();
            hasChanges = false;
        }
    }

    /**
     * Copies all column version records of one partition of this writer onto another partition
     * of the same writer. Existing records of the destination partition are replaced.
     * Nothing happens when the source partition has no records.
     *
     * @param srcTimestamp partition timestamp to copy the records from
     * @param dstTimestamp partition timestamp to copy the records to
     */
    public void copyColumnVersions(long srcTimestamp, long dstTimestamp) {
        int pos = copyColumnVersions(srcTimestamp, dstTimestamp, cachedColumnVersionList);
        if (pos < 0) {
            return;
        }
        // the copied blocks still carry the source timestamp, re-stamp them with the destination one
        final int limit = cachedColumnVersionList.size();
        while (pos < limit && cachedColumnVersionList.get(pos) == srcTimestamp) {
            cachedColumnVersionList.setQuick(pos, dstTimestamp);
            pos += BLOCK_SIZE;
        }
    }

    /**
     * @return offset of area "A" as currently stored in the file header
     */
    public long getOffsetA() {
        return mem.getLong(OFFSET_OFFSET_A_64);
    }

    /**
     * @return offset of area "B" as currently stored in the file header
     */
    public long getOffsetB() {
        return mem.getLong(OFFSET_OFFSET_B_64);
    }

    /**
     * @return the version tracked by this writer, i.e. the value last read from the file
     * or last written by a commit
     */
    @Override
    public long getVersion() {
        return version;
    }

    /**
     * @return true when the cached list has modifications that have not been committed yet
     */
    public boolean hasChanges() {
        return hasChanges;
    }

    /**
     * Replaces the records of the given partition with the records the supplied reader holds
     * for the same partition. Nothing happens when the reader has no records for it.
     *
     * @param partitionTimestamp partition timestamp to override
     * @param src                reader to take the records from
     */
    public void overrideColumnVersions(long partitionTimestamp, ColumnVersionReader src) {
        copyColumnVersions(partitionTimestamp, partitionTimestamp, src.cachedColumnVersionList);
    }

    /**
     * Re-reads the state through the superclass, drops the "has changes" flag and remembers
     * the version that was read.
     *
     * @return the version that was read
     */
    @Override
    public long readUnsafe() {
        hasChanges = false;
        final long freshVersion = super.readUnsafe();
        version = freshVersion;
        return freshVersion;
    }

    /**
     * Resets the column top to 0 for the given partition and column, if such a record exists.
     *
     * @param partitionTimestamp partition timestamp
     * @param columnIndex        column index
     */
    public void removeColumnTop(long partitionTimestamp, int columnIndex) {
        final int recordIndex = getRecordIndex(partitionTimestamp, columnIndex);
        if (recordIndex < 0) {
            return;
        }
        cachedColumnVersionList.setQuick(recordIndex + COLUMN_TOP_OFFSET, 0);
        hasChanges = true;
    }

    /**
     * Removes every record of the given partition from the cached list, if there are any.
     *
     * @param partitionTimestamp partition timestamp
     */
    public void removePartition(long partitionTimestamp) {
        final int first = cachedColumnVersionList.binarySearchBlock(BLOCK_SIZE_MSB, partitionTimestamp, BinarySearch.SCAN_UP);
        if (first < 0) {
            return;
        }
        final int last = cachedColumnVersionList.binarySearchBlock(first, BLOCK_SIZE_MSB, partitionTimestamp, BinarySearch.SCAN_DOWN);
        // 'last' points at the start of the final block of this partition, hence the extra block
        cachedColumnVersionList.removeIndexBlock(first, last - first + BLOCK_SIZE);
        hasChanges = true;
    }

    /**
     * Adjusts the cached list for a truncated table and commits the result immediately.
     * Does nothing when the cached list is empty.
     */
    public void truncate() {
        if (cachedColumnVersionList.size() <= 0) {
            return;
        }

        final long defaultPartitionTimestamp = COL_TOP_DEFAULT_PARTITION;
        // locate the first block that belongs to a partition after the default one
        final int found = cachedColumnVersionList.binarySearchBlock(BLOCK_SIZE_MSB, defaultPartitionTimestamp + 1, BinarySearch.SCAN_UP);
        final int from = found < 0 ? -found - 1 : found;

        if (partitioned) {
            // Drop everything that follows the COL_TOP_DEFAULT_PARTITION records
            if (from < cachedColumnVersionList.size()) {
                cachedColumnVersionList.setPos(from);
            }
            // The default column versions stay, but the "partition the column was added in"
            // timestamp is reset to the minimum
            final int remaining = cachedColumnVersionList.size();
            for (int i = 0; i < remaining; i += BLOCK_SIZE) {
                cachedColumnVersionList.setQuick(i + TIMESTAMP_ADDED_PARTITION_OFFSET, defaultPartitionTimestamp);
            }
        } else {
            // All column name txns must survive because the files are truncated, not re-created,
            // but every column top has to go.
            // A column name txn appears either when a column is added via ALTER TABLE or when
            // a column is updated.
            // ALTER TABLE ADD COLUMN creates a record in the NaN partition with the column name txn
            // and a record in the 0 (default) partition with the column top.
            // UPDATE SQL only sets the column name txn in the 0 (default) partition.
            // Both scenarios are covered in TruncateTest.
            // Net effect: clear all column tops, keep all column name txns.
            final int total = cachedColumnVersionList.size();
            for (int i = from; i < total; i += BLOCK_SIZE) {
                cachedColumnVersionList.setQuick(i + COLUMN_TOP_OFFSET, 0);
            }
        }

        hasChanges = true;
        commit();
    }

    /**
     * Adds or updates column version entry in the cached list. Entries from the cache are committed to disk via
     * commit() call. In cache and on disk entries are maintained in ascending chronological order of partition
     * timestamps and ascending column index order within each timestamp.
     * <p>
     * When the entry already exists, its column name txn is only overwritten if {@code txn} is
     * greater than -1; the column top is always overwritten.
     *
     * @param timestamp   partition timestamp
     * @param columnIndex column index
     * @param txn         column file txn name
     * @param columnTop   column top
     */
    public void upsert(long timestamp, int columnIndex, long txn, long columnTop) {
        final int listSize = cachedColumnVersionList.size();
        int pos = cachedColumnVersionList.binarySearchBlock(BLOCK_SIZE_MSB, timestamp, BinarySearch.SCAN_UP);

        if (pos < 0) {
            // timestamp is not present, convert the result into the insertion point
            pos = -pos - 1;
        } else {
            // walk the blocks of this timestamp looking for the column
            for (; pos < listSize && cachedColumnVersionList.getQuick(pos) == timestamp; pos += BLOCK_SIZE) {
                final long existingColumn = cachedColumnVersionList.getQuick(pos + COLUMN_INDEX_OFFSET);
                if (existingColumn == columnIndex) {
                    // update in place, nothing to insert
                    if (txn > -1) {
                        cachedColumnVersionList.setQuick(pos + COLUMN_NAME_TXN_OFFSET, txn);
                    }
                    cachedColumnVersionList.setQuick(pos + COLUMN_TOP_OFFSET, columnTop);
                    hasChanges = true;
                    return;
                }
                if (existingColumn > columnIndex) {
                    // passed the slot where the column belongs, insert right here
                    break;
                }
            }
        }

        // make room for one block at 'pos': shift the tail or grow the list
        if (pos < listSize) {
            cachedColumnVersionList.insert(pos, BLOCK_SIZE);
        } else {
            cachedColumnVersionList.setPos(Math.max(pos + BLOCK_SIZE, listSize + BLOCK_SIZE));
        }
        cachedColumnVersionList.setQuick(pos, timestamp);
        cachedColumnVersionList.setQuick(pos + COLUMN_INDEX_OFFSET, columnIndex);
        cachedColumnVersionList.setQuick(pos + COLUMN_NAME_TXN_OFFSET, txn);
        cachedColumnVersionList.setQuick(pos + COLUMN_TOP_OFFSET, columnTop);
        hasChanges = true;
    }

    /**
     * Sets the column top for the given partition and column. An existing record is updated in
     * place. Otherwise a new record is created only when it carries information: a non-zero
     * column top, or a zero column top for a partition older than the one recorded in the
     * column's default-partition record.
     *
     * @param partitionTimestamp partition timestamp
     * @param columnIndex        column index
     * @param colTop             column top value
     */
    public void upsertColumnTop(long partitionTimestamp, int columnIndex, long colTop) {
        assert partitioned;
        final int existingRecord = getRecordIndex(partitionTimestamp, columnIndex);
        if (existingRecord >= 0) {
            cachedColumnVersionList.setQuick(existingRecord + COLUMN_TOP_OFFSET, colTop);
            hasChanges = true;
            return;
        }

        final int defaultRecord = getRecordIndex(COL_TOP_DEFAULT_PARTITION, columnIndex);
        if (defaultRecord < 0) {
            // Only non-zero column tops are stored; zero is the default
            // for columns added on the table creation
            if (colTop > 0) {
                upsert(partitionTimestamp, columnIndex, -1L, colTop);
            }
            return;
        }

        // A 0 column top record may still have to be stored: it marks that the column
        // is written in O3 even before the partition the column was originally added in
        final long columnNameTxn = cachedColumnVersionList.getQuick(defaultRecord + COLUMN_NAME_TXN_OFFSET);
        final long addedPartitionTimestamp = cachedColumnVersionList.getQuick(defaultRecord + TIMESTAMP_ADDED_PARTITION_OFFSET);
        // Skip a 0 column top unless this partition precedes the one stored in the default record
        if (colTop > 0 || addedPartitionTimestamp > partitionTimestamp) {
            upsert(partitionTimestamp, columnIndex, columnNameTxn, colTop);
        }
    }

    /**
     * Stores the column name txn of a column in the default partition record.
     *
     * @param columnIndex        column index
     * @param columnNameTxn      column name txn
     * @param partitionTimestamp timestamp of the partition where the column was added
     */
    public void upsertDefaultTxnName(int columnIndex, long columnNameTxn, long partitionTimestamp) {
        // For the default partition the "column top" slot is reused to hold the timestamp
        // of the partition where the column was added
        upsert(COL_TOP_DEFAULT_PARTITION, columnIndex, columnNameTxn, partitionTimestamp);
    }

    /**
     * Resizes the backing memory and remembers the new size.
     */
    private void bumpFileSize(long size) {
        mem.setSize(size);
        this.size = size;
    }

    /**
     * @return number of bytes needed to store {@code entryCount} blocks
     */
    private long calculateSize(int entryCount) {
        // every entry is one block of 4 longs, both in 'columnVersions' and in the file;
        // widen before multiplying
        return BLOCK_SIZE_BYTES * (long) entryCount;
    }

    /**
     * Picks the file offset for the next area of {@code areaSize} bytes: right after the header
     * when the new area fits in front of the current one, otherwise right after the current area.
     */
    private long calculateWriteOffset(long areaSize) {
        final boolean aIsCurrent = isCurrentA();
        final long liveOffset = Math.max(aIsCurrent ? getOffsetA() : getOffsetB(), HEADER_SIZE);
        if (liveOffset >= HEADER_SIZE + areaSize) {
            return HEADER_SIZE;
        }
        final long liveSize = aIsCurrent ? getSizeA() : getSizeB();
        return liveOffset + liveSize;
    }

    /**
     * Replaces the destination partition's records in the cached list with the source
     * partition's records taken from {@code srcColumnVersionList}. Timestamps of the copied
     * blocks are left as they are in the source.
     *
     * @return index in the cached list where the blocks were inserted, or -1 when the source
     * has no records for {@code srcTimestamp}
     */
    private int copyColumnVersions(long srcTimestamp, long dstTimestamp, LongList srcColumnVersionList) {
        final int srcStart = srcColumnVersionList.binarySearchBlock(BLOCK_SIZE_MSB, srcTimestamp, BinarySearch.SCAN_UP);
        if (srcStart < 0) {
            // source does not have partition information
            return -1;
        }

        int dstPos = cachedColumnVersionList.binarySearchBlock(BLOCK_SIZE_MSB, dstTimestamp, BinarySearch.SCAN_UP);
        if (dstPos >= 0) {
            // Wipe out whatever is known about the destination partition, then look again
            removePartition(dstTimestamp);
            dstPos = cachedColumnVersionList.binarySearchBlock(BLOCK_SIZE_MSB, dstTimestamp, BinarySearch.SCAN_UP);
        }

        if (dstPos >= 0) {
            // the partition is still there after removal
            throw CairoException.critical(0)
                    .put("invalid Column Version state ")
                    .ts(dstTimestamp)
                    .put(" column version state, cannot update partition information");
        }

        // the cache does not contain this partition, turn the result into the insertion point
        dstPos = -dstPos - 1;
        final int srcLast = srcColumnVersionList.binarySearchBlock(srcStart, BLOCK_SIZE_MSB, srcTimestamp, BinarySearch.SCAN_DOWN);
        cachedColumnVersionList.insertFromSource(dstPos, srcColumnVersionList, srcStart, srcLast + BLOCK_SIZE);
        hasChanges = true;
        return dstPos;
    }

    /**
     * Writes the cached list to the file: grows the file, stores the blocks, records the new
     * area in the header pair that is not current, publishes the incremented version and
     * syncs according to the configured commit mode.
     */
    private void doCommit() {
        final int entryCount = cachedColumnVersionList.size() / BLOCK_SIZE;
        final long areaSize = calculateSize(entryCount);
        final long writeOffset = calculateWriteOffset(areaSize);
        bumpFileSize(writeOffset + areaSize);
        store(entryCount, writeOffset);

        // describe the freshly written area in the header pair that is NOT current
        if (isCurrentA()) {
            updateB(writeOffset, areaSize);
        } else {
            updateA(writeOffset, areaSize);
        }

        // data and header stores are issued before the fence, the version store after it
        Unsafe.getUnsafe().storeFence();
        storeNewVersion();

        final int commitMode = configuration.getCommitMode();
        if (commitMode != CommitMode.NOSYNC) {
            mem.sync(commitMode == CommitMode.ASYNC);
        }
    }

    /**
     * @return size of area "A" as currently stored in the file header
     */
    private long getSizeA() {
        return mem.getLong(OFFSET_SIZE_A_64);
    }

    /**
     * @return size of area "B" as currently stored in the file header
     */
    private long getSizeB() {
        return mem.getLong(OFFSET_SIZE_B_64);
    }

    /**
     * @return true when the version is even, which designates area "A" as current
     */
    private boolean isCurrentA() {
        return (version & 1L) == 0L;
    }

    /**
     * Writes the first {@code entryCount} blocks of the cached list to the file,
     * back to back, starting at {@code offset}.
     */
    private void store(int entryCount, long offset) {
        long writePos = offset;
        int entry = 0;
        while (entry < entryCount) {
            final int base = entry * BLOCK_SIZE;
            mem.putLong(writePos, cachedColumnVersionList.getQuick(base));
            mem.putLong(writePos + 8, cachedColumnVersionList.getQuick(base + COLUMN_INDEX_OFFSET));
            mem.putLong(writePos + 16, cachedColumnVersionList.getQuick(base + COLUMN_NAME_TXN_OFFSET));
            mem.putLong(writePos + 24, cachedColumnVersionList.getQuick(base + COLUMN_TOP_OFFSET));
            writePos += BLOCK_SIZE * 8;
            entry++;
        }
    }

    /**
     * Increments the version and writes it to the file header.
     */
    private void storeNewVersion() {
        version++;
        mem.putLong(OFFSET_VERSION_64, version);
    }

    /**
     * Writes the offset and size of area "A" to the file header.
     */
    private void updateA(long aOffset, long aSize) {
        mem.putLong(OFFSET_OFFSET_A_64, aOffset);
        mem.putLong(OFFSET_SIZE_A_64, aSize);
    }

    /**
     * Writes the offset and size of area "B" to the file header.
     */
    private void updateB(long bOffset, long bSize) {
        mem.putLong(OFFSET_OFFSET_B_64, bOffset);
        mem.putLong(OFFSET_SIZE_B_64, bSize);
    }

    static {
        // the header size known to the reader must agree with the one declared in TableUtils
        //noinspection ConstantValue
        assert HEADER_SIZE == TableUtils.COLUMN_VERSION_FILE_HEADER_SIZE;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy