
io.questdb.cairo.TableUtils Maven / Gradle / Ivy
/*******************************************************************************
* ___ _ ____ ____
* / _ \ _ _ ___ ___| |_| _ \| __ )
* | | | | | | |/ _ \/ __| __| | | | _ \
* | |_| | |_| | __/\__ \ |_| |_| | |_) |
* \__\_\\__,_|\___||___/\__|____/|____/
*
* Copyright (c) 2014-2019 Appsicle
* Copyright (c) 2019-2023 QuestDB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package io.questdb.cairo;
import io.questdb.MessageBus;
import io.questdb.cairo.sql.Function;
import io.questdb.cairo.sql.RecordMetadata;
import io.questdb.cairo.sql.SymbolTable;
import io.questdb.cairo.vm.Vm;
import io.questdb.cairo.vm.api.*;
import io.questdb.griffin.AnyRecordMetadata;
import io.questdb.griffin.FunctionParser;
import io.questdb.griffin.SqlException;
import io.questdb.griffin.SqlExecutionContext;
import io.questdb.griffin.model.ExpressionNode;
import io.questdb.griffin.model.QueryModel;
import io.questdb.log.Log;
import io.questdb.log.LogFactory;
import io.questdb.mp.MPSequence;
import io.questdb.std.*;
import io.questdb.std.datetime.millitime.MillisecondClock;
import io.questdb.std.str.CharSink;
import io.questdb.std.str.LPSZ;
import io.questdb.std.str.Path;
import io.questdb.tasks.O3PartitionPurgeTask;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import static io.questdb.cairo.MapWriter.createSymbolMapFiles;
import static io.questdb.cairo.wal.WalUtils.CONVERT_FILE_NAME;
public final class TableUtils {
public static final int ANY_TABLE_ID = -1;
public static final int ANY_TABLE_VERSION = -1;
public static final String ATTACHABLE_DIR_MARKER = ".attachable";
public static final long COLUMN_NAME_TXN_NONE = -1L;
public static final String COLUMN_VERSION_FILE_NAME = "_cv";
public static final String DEFAULT_PARTITION_NAME = "default";
public static final String DETACHED_DIR_MARKER = ".detached";
public static final String FILE_SUFFIX_D = ".d";
public static final String FILE_SUFFIX_I = ".i";
public static final int INITIAL_TXN = 0;
public static final int LONGS_PER_TX_ATTACHED_PARTITION = 4;
public static final int LONGS_PER_TX_ATTACHED_PARTITION_MSB = Numbers.msb(LONGS_PER_TX_ATTACHED_PARTITION);
public static final long META_COLUMN_DATA_SIZE = 32;
public static final String META_FILE_NAME = "_meta";
public static final long META_OFFSET_COLUMN_TYPES = 128;
public static final long META_OFFSET_COUNT = 0;
public static final long META_OFFSET_MAX_UNCOMMITTED_ROWS = 20; // LONG
public static final long META_OFFSET_METADATA_VERSION = 32; // LONG
public static final long META_OFFSET_O3_MAX_LAG = 24; // LONG
// INT - symbol map count, this is a variable part of transaction file
// below this offset we will have INT values for symbol map size
public static final long META_OFFSET_PARTITION_BY = 4;
public static final long META_OFFSET_TABLE_ID = 16;
public static final long META_OFFSET_TIMESTAMP_INDEX = 8;
public static final long META_OFFSET_VERSION = 12;
public static final long META_OFFSET_WAL_ENABLED = 40; // BOOLEAN
public static final String META_PREV_FILE_NAME = "_meta.prev";
/**
* TXN file structure
* struct {
* long txn;
* long transient_row_count; // rows count in last partition
* long fixed_row_count; // row count in table excluding count in last partition
* long max_timestamp; // last timestamp written to table
* long struct_version; // data structure version; whenever columns added or removed this version changes.
* long partition_version; // version that increments whenever non-current partitions are modified/added/removed
* long txn_check; // same as txn - sanity check for concurrent reads and writes
* int map_writer_count; // symbol writer count
* int map_writer_position[map_writer_count]; // position of each of map writers
* }
*
* TableUtils.resetTxn() writes to this file, it could be using different offsets, beware
*/
public static final String META_SWAP_FILE_NAME = "_meta.swp";
public static final int MIN_INDEX_VALUE_BLOCK_SIZE = Numbers.ceilPow2(4);
public static final int NULL_LEN = -1;
public static final String SNAPSHOT_META_FILE_NAME = "_snapshot";
public static final String SYMBOL_KEY_REMAP_FILE_SUFFIX = ".r";
public static final char SYSTEM_TABLE_NAME_SUFFIX = '~';
public static final int TABLE_DOES_NOT_EXIST = 1;
public static final int TABLE_EXISTS = 0;
public static final String TABLE_NAME_FILE = "_name";
public static final int TABLE_RESERVED = 2;
public static final int TABLE_TYPE_NON_WAL = 0;
public static final int TABLE_TYPE_WAL = 1;
public static final String TAB_INDEX_FILE_NAME = "_tab_index.d";
/**
* TXN file structure
* struct {
* long txn;
* long transient_row_count; // rows count in last partition
* long fixed_row_count; // row count in table excluding count in last partition
* long max_timestamp; // last timestamp written to table
* long struct_version; // data structure version; whenever columns added or removed this version changes.
* long partition_version; // version that increments whenever non-current partitions are modified/added/removed
* long txn_check; // same as txn - sanity check for concurrent reads and writes
* int map_writer_count; // symbol writer count
* int map_writer_position[map_writer_count]; // position of each of map writers
* }
*
* TableUtils.resetTxn() writes to this file, it could be using different offsets, beware
*/
public static final String TODO_FILE_NAME = "_todo_";
public static final String TXN_FILE_NAME = "_txn";
public static final String TXN_SCOREBOARD_FILE_NAME = "_txn_scoreboard";
// transaction file structure
// @formatter:off
public static final int TX_BASE_HEADER_SECTION_PADDING = 12; // Add some free space into header for future use
public static final long TX_BASE_OFFSET_VERSION_64 = 0;
public static final long TX_BASE_OFFSET_A_32 = TX_BASE_OFFSET_VERSION_64 + 8;
public static final long TX_BASE_OFFSET_SYMBOLS_SIZE_A_32 = TX_BASE_OFFSET_A_32 + 4;
public static final long TX_BASE_OFFSET_PARTITIONS_SIZE_A_32 = TX_BASE_OFFSET_SYMBOLS_SIZE_A_32 + 4;
public static final long TX_BASE_OFFSET_B_32 = TX_BASE_OFFSET_PARTITIONS_SIZE_A_32 + 4 + TX_BASE_HEADER_SECTION_PADDING;
public static final long TX_BASE_OFFSET_SYMBOLS_SIZE_B_32 = TX_BASE_OFFSET_B_32 + 4;
public static final long TX_BASE_OFFSET_PARTITIONS_SIZE_B_32 = TX_BASE_OFFSET_SYMBOLS_SIZE_B_32 + 4;
public static final int TX_BASE_HEADER_SIZE = (int) Math.max(TX_BASE_OFFSET_PARTITIONS_SIZE_B_32 + 4 + TX_BASE_HEADER_SECTION_PADDING, 64);
public static final long TX_OFFSET_MAP_WRITER_COUNT_32 = 128;
public static final long TX_OFFSET_TXN_64 = 0;
public static final long TX_OFFSET_TRANSIENT_ROW_COUNT_64 = TX_OFFSET_TXN_64 + 8;
public static final long TX_OFFSET_FIXED_ROW_COUNT_64 = TX_OFFSET_TRANSIENT_ROW_COUNT_64 + 8;
public static final long TX_OFFSET_MIN_TIMESTAMP_64 = TX_OFFSET_FIXED_ROW_COUNT_64 + 8;
public static final long TX_OFFSET_MAX_TIMESTAMP_64 = TX_OFFSET_MIN_TIMESTAMP_64 + 8;
public static final long TX_OFFSET_STRUCT_VERSION_64 = TX_OFFSET_MAX_TIMESTAMP_64 + 8;
public static final long TX_OFFSET_DATA_VERSION_64 = TX_OFFSET_STRUCT_VERSION_64 + 8;
public static final long TX_OFFSET_PARTITION_TABLE_VERSION_64 = TX_OFFSET_DATA_VERSION_64 + 8;
public static final long TX_OFFSET_COLUMN_VERSION_64 = TX_OFFSET_PARTITION_TABLE_VERSION_64 + 8;
public static final long TX_OFFSET_TRUNCATE_VERSION_64 = TX_OFFSET_COLUMN_VERSION_64 + 8;
public static final long TX_OFFSET_SEQ_TXN_64 = TX_OFFSET_TRUNCATE_VERSION_64 + 8;
public static final long TX_OFFSET_CHECKSUM_32 = TX_OFFSET_SEQ_TXN_64 + 8;
public static final long TX_OFFSET_LAG_TXN_COUNT_32 = TX_OFFSET_CHECKSUM_32 + 4;
public static final long TX_OFFSET_LAG_ROW_COUNT_32 = TX_OFFSET_LAG_TXN_COUNT_32 + 4;
public static final long TX_OFFSET_LAG_MIN_TIMESTAMP_64 = TX_OFFSET_LAG_ROW_COUNT_32 + 4;
public static final long TX_OFFSET_LAG_MAX_TIMESTAMP_64 = TX_OFFSET_LAG_MIN_TIMESTAMP_64 + 8;
// @formatter:on
public static final int TX_RECORD_HEADER_SIZE = (int) TX_OFFSET_MAP_WRITER_COUNT_32 + Integer.BYTES;
public static final String UPGRADE_FILE_NAME = "_upgrade.d";
static final int COLUMN_VERSION_FILE_HEADER_SIZE = 40;
static final int META_FLAG_BIT_INDEXED = 1;
static final int META_FLAG_BIT_NOT_INDEXED = 0;
static final int META_FLAG_BIT_SEQUENTIAL = 1 << 1;
static final byte TODO_RESTORE_META = 2;
static final byte TODO_TRUNCATE = 1;
private static final int EMPTY_TABLE_LAG_CHECKSUM = calculateTxnLagChecksum(0, 0, 0, Long.MAX_VALUE, Long.MIN_VALUE, 0);
private final static Log LOG = LogFactory.getLog(TableUtils.class);
private static final int MAX_INDEX_VALUE_BLOCK_SIZE = Numbers.ceilPow2(8 * 1024 * 1024);
private static final int MAX_SYMBOL_CAPACITY = Numbers.ceilPow2(Integer.MAX_VALUE);
private static final int MAX_SYMBOL_CAPACITY_CACHED = Numbers.ceilPow2(30_000_000);
private static final int MIN_SYMBOL_CAPACITY = 2;
private TableUtils() {
}
public static void allocateDiskSpace(FilesFacade ff, int fd, long size) {
if (ff.length(fd) < size && !ff.allocate(fd, size)) {
throw CairoException.critical(ff.errno()).put("No space left [size=").put(size).put(", fd=").put(fd).put(']');
}
}
public static void allocateDiskSpaceToPage(FilesFacade ff, int fd, long size) {
size = Files.ceilPageSize(size);
allocateDiskSpace(ff, fd, size);
}
public static int calculateTxRecordSize(int bytesSymbols, int bytesPartitions) {
return TX_RECORD_HEADER_SIZE + Integer.BYTES + bytesSymbols + Integer.BYTES + bytesPartitions;
}
public static int calculateTxnLagChecksum(long txn, long seqTxn, int lagRowCount, long lagMinTimestamp, long lagMaxTimestamp, int lagTxnCount) {
long checkSum = lagMinTimestamp;
checkSum = checkSum * 31 + lagMaxTimestamp;
checkSum = checkSum * 31 + txn;
checkSum = checkSum * 31 + seqTxn;
checkSum = checkSum * 31 + lagRowCount;
checkSum = checkSum * 31 + lagTxnCount;
return (int) (checkSum ^ (checkSum >>> 32));
}
public static Path charFileName(Path path, CharSequence columnName, long columnNameTxn) {
path.concat(columnName).put(".c");
if (columnNameTxn > COLUMN_NAME_TXN_NONE) {
path.put('.').put(columnNameTxn);
}
return path.$();
}
public static long checkMemSize(MemoryMR metaMem, long minSize) {
final long memSize = metaMem.size();
if (memSize < minSize) {
throw CairoException.critical(0).put("File is too small, size=").put(memSize).put(", required=").put(minSize);
}
return memSize;
}
public static int compressColumnCount(RecordMetadata metadata) {
int count = 0;
for (int i = 0, n = metadata.getColumnCount(); i < n; i++) {
if (metadata.getColumnType(i) > 0) {
count++;
}
}
return count;
}
public static void createColumnVersionFile(MemoryMARW mem) {
// Create page of 0s for Column Version file "_cv"
mem.extend(COLUMN_VERSION_FILE_HEADER_SIZE);
mem.jumpTo(COLUMN_VERSION_FILE_HEADER_SIZE);
mem.zero();
}
public static void createConvertFile(FilesFacade ff, Path path, byte walFlag) {
long addr = 0;
int fd = -1;
try {
fd = ff.openRW(path.concat(CONVERT_FILE_NAME).$(), CairoConfiguration.O_NONE);
if (fd < 1) {
throw CairoException.critical(ff.errno()).put("Could not open file [path=").put(path).put(']');
}
addr = Unsafe.malloc(Byte.BYTES, MemoryTag.MMAP_TABLE_WAL_WRITER);
if (addr < 1) {
throw CairoException.critical(ff.errno()).put("Could not allocate 1 byte");
}
Unsafe.getUnsafe().putByte(addr, walFlag);
ff.write(fd, addr, Byte.BYTES, 0);
} finally {
if (addr > 0) {
Unsafe.free(addr, Byte.BYTES, MemoryTag.MMAP_TABLE_WAL_WRITER);
}
ff.close(fd);
}
}
@NotNull
public static Function createCursorFunction(
FunctionParser functionParser,
@NotNull QueryModel model,
@NotNull SqlExecutionContext executionContext
) throws SqlException {
final ExpressionNode tableNameExpr = model.getTableNameExpr();
final Function function = functionParser.parseFunction(
tableNameExpr,
AnyRecordMetadata.INSTANCE,
executionContext
);
if (!ColumnType.isCursor(function.getType())) {
throw SqlException.$(tableNameExpr.position, "function must return CURSOR");
}
return function;
}
public static void createTable(
CairoConfiguration configuration,
MemoryMARW memory,
Path path,
TableStructure structure,
int tableId,
CharSequence dirName
) {
createTable(configuration, memory, path, structure, ColumnType.VERSION, tableId, dirName);
}
public static void createTable(
CairoConfiguration configuration,
MemoryMARW memory,
Path path,
TableStructure structure,
int tableVersion,
int tableId,
CharSequence dirName
) {
final FilesFacade ff = configuration.getFilesFacade();
final CharSequence root = configuration.getRoot();
final int mkDirMode = configuration.getMkDirMode();
createTable(ff, root, mkDirMode, memory, path, structure, tableVersion, tableId, dirName);
}
public static void createTable(
FilesFacade ff,
CharSequence root,
int mkDirMode,
MemoryMARW memory,
Path path,
TableStructure structure,
int tableVersion,
int tableId,
CharSequence dirName
) {
createTable(ff, root, mkDirMode, memory, path, dirName, structure, tableVersion, tableId);
}
public static void createTable(
FilesFacade ff,
CharSequence root,
int mkDirMode,
MemoryMARW memory,
Path path,
CharSequence tableDir,
TableStructure structure,
int tableVersion,
int tableId
) {
LOG.debug().$("create table [name=").utf8(tableDir).I$();
path.of(root).concat(tableDir).$();
if (ff.isDirOrSoftLinkDir(path)) {
throw CairoException.critical(ff.errno()).put("table directory already exists [path=").put(path).put(']');
}
int rootLen = path.length();
try {
if (ff.mkdirs(path.slash$(), mkDirMode) != 0) {
throw CairoException.critical(ff.errno()).put("could not create [dir=").put(path.trimTo(rootLen).$()).put(']');
}
createTableFiles(ff, memory, path, rootLen, tableDir, structure, tableVersion, tableId);
} finally {
path.trimTo(rootLen);
}
}
public static void createTableInVolume(
FilesFacade ff,
CharSequence root,
int mkDirMode,
MemoryMARW memory,
Path path,
CharSequence tableDir,
TableStructure structure,
int tableVersion,
int tableId
) {
LOG.info().$("create table in volume [path=").utf8(path).I$();
Path normalPath = Path.getThreadLocal2(root).concat(tableDir).$();
assert normalPath != path;
if (ff.isDirOrSoftLinkDir(normalPath)) {
throw CairoException.critical(ff.errno()).put("table directory already exists [path=").put(normalPath).put(']');
}
// path has been set by CREATE TABLE ... [IN VOLUME 'path'].
// it is a valid directory, or link to a directory, checked at bootstrap
if (ff.isDirOrSoftLinkDir(path)) {
throw CairoException.critical(ff.errno()).put("table directory already exists in volume [path=").put(path).put(']');
}
int rootLen = path.length();
try {
if (ff.mkdirs(path.slash$(), mkDirMode) != 0) {
throw CairoException.critical(ff.errno()).put("could not create [dir=").put(path).put(']');
}
if (ff.softLink(path.trimTo(rootLen).$(), normalPath) != 0) {
if (ff.rmdir(path.slash$()) != 0) {
LOG.error().$("cannot remove table directory in volume [errno=").$(ff.errno()).$(", path=").utf8(path.trimTo(rootLen).$()).I$();
}
throw CairoException.critical(ff.errno()).put("could not create soft link [src=").put(path.trimTo(rootLen).$()).put(", tableDir=").put(tableDir).put(']');
}
createTableFiles(ff, memory, path, rootLen, tableDir, structure, tableVersion, tableId);
} finally {
path.trimTo(rootLen);
}
}
public static void createTableNameFile(MemoryMARW mem, CharSequence charSequence) {
mem.putStr(charSequence);
mem.putByte((byte) 0);
mem.sync(false);
mem.close(true, Vm.TRUNCATE_TO_POINTER);
}
public static long createTransitionIndex(
MemoryR masterMeta,
AbstractRecordMetadata slaveMeta
) {
int slaveColumnCount = slaveMeta.columnCount;
int masterColumnCount = masterMeta.getInt(META_OFFSET_COUNT);
final long pTransitionIndex;
final int size = 8 + masterColumnCount * 8;
long index = pTransitionIndex = Unsafe.calloc(size, MemoryTag.NATIVE_TABLE_READER);
Unsafe.getUnsafe().putInt(index, size);
index += 8;
// index structure is
// [action: int, copy from:int]
// action: if -1 then current column in slave is deleted or renamed, else it's reused
// "copy from" >= 0 indicates that column is to be copied from slave position
// "copy from" < 0 indicates that column is new and should be taken from updated metadata position
// "copy from" == Integer.MIN_VALUE indicates that column is deleted for good and should not be re-added from any source
long offset = getColumnNameOffset(masterColumnCount);
int slaveIndex = 0;
int shiftLeft = 0;
for (int masterIndex = 0; masterIndex < masterColumnCount; masterIndex++) {
CharSequence name = masterMeta.getStr(offset);
offset += Vm.getStorageLength(name);
int masterColumnType = getColumnType(masterMeta, masterIndex);
if (slaveIndex < slaveColumnCount) {
int existingWriterIndex = slaveMeta.getWriterIndex(slaveIndex);
if (existingWriterIndex > masterIndex) {
// This column must be deleted so existing dense columns do not contain it
assert masterColumnType < 0;
continue;
}
assert existingWriterIndex == masterIndex;
}
int outIndex = slaveIndex - shiftLeft;
if (masterColumnType < 0) {
shiftLeft++; // Deleted in master
if (slaveIndex < slaveColumnCount) {
Unsafe.getUnsafe().putInt(index + slaveIndex * 8L, -1);
Unsafe.getUnsafe().putInt(index + slaveIndex * 8L + 4, Integer.MIN_VALUE);
}
} else {
if (
slaveIndex < slaveColumnCount
&& isColumnIndexed(masterMeta, masterIndex) == slaveMeta.isColumnIndexed(slaveIndex)
&& Chars.equals(name, slaveMeta.getColumnName(slaveIndex))
) {
// reuse
Unsafe.getUnsafe().putInt(index + outIndex * 8L + 4, slaveIndex);
if (slaveIndex > outIndex) {
// mark to do nothing with existing column, this may be overwritten later
Unsafe.getUnsafe().putInt(index + slaveIndex * 8L + 4, Integer.MIN_VALUE);
}
} else {
// new
if (slaveIndex < slaveColumnCount) {
// free
Unsafe.getUnsafe().putInt(index + slaveIndex * 8L, -1);
}
Unsafe.getUnsafe().putInt(index + outIndex * 8L + 4, -masterIndex - 1);
}
}
slaveIndex++;
}
Unsafe.getUnsafe().putInt(pTransitionIndex + 4, slaveIndex - shiftLeft);
return pTransitionIndex;
}
public static void createTxn(
MemoryMW txMem,
int symbolMapCount,
long txn,
long seqTxn,
long dataVersion,
long partitionTableVersion,
long structureVersion,
long columnVersion,
long truncateVersion
) {
txMem.putInt(TX_BASE_OFFSET_A_32, TX_BASE_HEADER_SIZE);
txMem.putInt(TX_BASE_OFFSET_SYMBOLS_SIZE_A_32, symbolMapCount * 8);
txMem.putInt(TX_BASE_OFFSET_PARTITIONS_SIZE_A_32, 0);
resetTxn(
txMem,
TX_BASE_HEADER_SIZE,
symbolMapCount,
txn,
seqTxn,
dataVersion,
partitionTableVersion,
structureVersion,
columnVersion,
truncateVersion
);
txMem.setTruncateSize(TX_BASE_HEADER_SIZE + TX_RECORD_HEADER_SIZE);
}
public static LPSZ dFile(Path path, CharSequence columnName, long columnTxn) {
path.concat(columnName).put(FILE_SUFFIX_D);
if (columnTxn > COLUMN_NAME_TXN_NONE) {
path.put('.').put(columnTxn);
}
return path.$();
}
public static LPSZ dFile(Path path, CharSequence columnName) {
return dFile(path, columnName, COLUMN_NAME_TXN_NONE);
}
public static int exists(FilesFacade ff, Path path, CharSequence root, CharSequence name) {
return exists(ff, path.of(root).concat(name).$());
}
public static int existsInVolume(FilesFacade ff, Path volumePath, CharSequence name) {
return exists(ff, volumePath.concat(name).$());
}
public static void freeTransitionIndex(long address) {
if (address == 0) {
return;
}
Unsafe.free(address, Unsafe.getUnsafe().getInt(address), MemoryTag.NATIVE_TABLE_READER);
}
public static int getColumnCount(MemoryMR metaMem, long offset) {
final int columnCount = metaMem.getInt(offset);
if (columnCount < 0) {
throw validationException(metaMem).put("Incorrect columnCount: ").put(columnCount);
}
return columnCount;
}
public static CharSequence getColumnName(MemoryMR metaMem, long memSize, long offset, int columnIndex) {
final int strLength = getInt(metaMem, memSize, offset);
if (strLength == TableUtils.NULL_LEN) {
throw validationException(metaMem).put("NULL column name at [").put(columnIndex).put(']');
}
return getCharSequence(metaMem, memSize, offset, strLength);
}
public static long getColumnNameOffset(int columnCount) {
return META_OFFSET_COLUMN_TYPES + columnCount * META_COLUMN_DATA_SIZE;
}
public static int getColumnType(MemoryR metaMem, int columnIndex) {
return metaMem.getInt(META_OFFSET_COLUMN_TYPES + columnIndex * META_COLUMN_DATA_SIZE);
}
public static int getColumnType(MemoryMR metaMem, long memSize, long offset, int columnIndex) {
final int type = getInt(metaMem, memSize, offset);
if (type >= 0 && ColumnType.sizeOf(type) == -1) {
throw validationException(metaMem).put("Invalid column type ").put(type).put(" at [").put(columnIndex).put(']');
}
return type;
}
public static long getPartitionTableIndexOffset(int symbolWriterCount, int index) {
return getPartitionTableIndexOffset(getPartitionTableSizeOffset(symbolWriterCount), index);
}
public static long getPartitionTableIndexOffset(long partitionTableOffset, int index) {
return partitionTableOffset + 4 + index * 8L;
}
public static long getPartitionTableSizeOffset(int symbolWriterCount) {
return getSymbolWriterIndexOffset(symbolWriterCount);
}
public static long getSymbolWriterIndexOffset(int index) {
return TX_OFFSET_MAP_WRITER_COUNT_32 + Integer.BYTES + (long) index * Long.BYTES;
}
public static long getSymbolWriterTransientIndexOffset(int index) {
return getSymbolWriterIndexOffset(index) + Integer.BYTES;
}
@NotNull
public static String getTableDir(boolean mangleDirNames, @NotNull String tableName, int tableId, boolean isWal) {
String dirName = tableName;
if (isWal) {
dirName += TableUtils.SYSTEM_TABLE_NAME_SUFFIX;
dirName += tableId;
} else if (mangleDirNames) {
dirName += TableUtils.SYSTEM_TABLE_NAME_SUFFIX;
}
return dirName;
}
public static CharSequence getTableNameFromDirName(CharSequence privateName) {
int suffixIndex = Chars.indexOf(privateName, SYSTEM_TABLE_NAME_SUFFIX);
if (suffixIndex == -1) {
return privateName;
}
return Chars.toString(privateName).substring(0, suffixIndex);
}
public static int getTimestampIndex(MemoryMR metaMem, long offset, int columnCount) {
final int timestampIndex = metaMem.getInt(offset);
if (timestampIndex < -1 || timestampIndex >= columnCount) {
throw validationException(metaMem).put("Timestamp index is outside of range, timestampIndex=").put(timestampIndex);
}
return timestampIndex;
}
public static void handleMetadataLoadException(CharSequence tableName, long deadline, CairoException ex, MillisecondClock millisecondClock, long spinLockTimeout) {
// This is temporary solution until we can get multiple version of metadata not overwriting each other
if (ex.errnoReadPathDoesNotExist()) {
if (millisecondClock.getTicks() < deadline) {
LOG.info().$("error reloading metadata [table=").utf8(tableName)
.$(", errno=").$(ex.getErrno())
.$(", error=").utf8(ex.getFlyweightMessage()).I$();
Os.pause();
} else {
LOG.error().$("metadata read timeout [timeout=").$(spinLockTimeout).utf8("μs]").$();
throw CairoException.critical(ex.getErrno()).put("Metadata read timeout. Last error: ").put(ex.getFlyweightMessage());
}
} else {
throw ex;
}
}
public static LPSZ iFile(Path path, CharSequence columnName, long columnTxn) {
path.concat(columnName).put(FILE_SUFFIX_I);
if (columnTxn > COLUMN_NAME_TXN_NONE) {
path.put('.').put(columnTxn);
}
return path.$();
}
public static LPSZ iFile(Path path, CharSequence columnName) {
return iFile(path, columnName, COLUMN_NAME_TXN_NONE);
}
public static boolean isValidColumnName(CharSequence seq, int fsFileNameLimit) {
int l = seq.length();
if (l > fsFileNameLimit) {
// Most file systems don't support files name longer than 255 bytes
return false;
}
for (int i = 0; i < l; i++) {
char c = seq.charAt(i);
switch (c) {
case '?':
case '.':
case ',':
case '\'':
case '\"':
case '\\':
case '/':
case ':':
case ')':
case '(':
case '+':
case '-':
case '*':
case '%':
case '~':
case '\u0000': // Control characters
case '\u0001':
case '\u0002':
case '\u0003':
case '\u0004':
case '\u0005':
case '\u0006':
case '\u0007':
case '\u0008':
case ' ':
case '\u000B':
case '\u000c':
case '\n':
case '\r':
case '\u000e':
case '\u000f':
case '\u007f':
case 0xfeff: // UTF-8 BOM (Byte Order Mark) can appear at the beginning of a character stream
return false;
default:
break;
}
}
return l > 0;
}
public static boolean isValidTableName(CharSequence tableName, int fsFileNameLimit) {
int l = tableName.length();
if (l > fsFileNameLimit) {
// Most file systems don't support files name longer than 255 bytes
return false;
}
for (int i = 0; i < l; i++) {
char c = tableName.charAt(i);
switch (c) {
case '.':
if (i == 0 || i == l - 1 || tableName.charAt(i - 1) == '.') {
// Single dot in the middle is allowed only
// Starting from . hides directory in Linux
// Ending . can be trimmed by some Windows versions / file systems
// Double, triple dot look suspicious
// Single dot allowed as compatibility,
// when someone uploads 'file_name.csv' the file name used as the table name
return false;
}
break;
case '?':
case ',':
case '\'':
case '\"':
case '\\':
case '/':
case ':':
case ')':
case '(':
case '+':
case '*':
case '%':
case '~':
case '\u0000': // Control characters
case '\u0001':
case '\u0002':
case '\u0003':
case '\u0004':
case '\u0005':
case '\u0006':
case '\u0007':
case '\u0008':
case ' ':
case '\u000B':
case '\u000c':
case '\r':
case '\n':
case '\u000e':
case '\u000f':
case '\u007f':
case 0xfeff: // UTF-8 BOM (Byte Order Mark) can appear at the beginning of a character stream
return false;
}
}
return tableName.length() > 0 && tableName.charAt(0) != ' ' && tableName.charAt(l - 1) != ' ';
}
public static int lock(FilesFacade ff, Path path, boolean verbose) {
final int fd = ff.openRW(path, CairoConfiguration.O_NONE);
if (fd == -1) {
if (verbose) {
LOG.error().$("cannot open '").utf8(path).$("' to lock [errno=").$(ff.errno()).I$();
}
return -1;
}
if (ff.lock(fd) != 0) {
if (verbose) {
LOG.error().$("cannot lock '").utf8(path).$("' [errno=").$(ff.errno()).$(", fd=").$(fd).I$();
}
ff.close(fd);
return -1;
}
if (verbose) {
LOG.info().$("locked '").utf8(path).$("' [fd=").$(fd).I$();
}
return fd;
}
public static int lock(FilesFacade ff, Path path) {
return lock(ff, path, true);
}
public static void lockName(Path path) {
path.put(".lock").$();
}
public static long mapAppendColumnBuffer(FilesFacade ff, int fd, long offset, long size, boolean rw, int memoryTag) {
// Linux requires the mmap offset to be page aligned
long alignedOffset = Files.floorPageSize(offset);
long alignedExtraLen = offset - alignedOffset;
long mapAddr = rw ?
mapRWNoAlloc(ff, fd, size + alignedExtraLen, alignedOffset, memoryTag) :
mapRO(ff, fd, size + alignedExtraLen, alignedOffset, memoryTag);
ff.madvise(mapAddr, size + alignedExtraLen, rw ? Files.POSIX_MADV_RANDOM : Files.POSIX_MADV_SEQUENTIAL);
return mapAddr + alignedExtraLen;
}
public static void mapAppendColumnBufferRelease(FilesFacade ff, long address, long offset, long size, int memoryTag) {
long alignedOffset = Files.floorPageSize(offset);
long alignedExtraLen = offset - alignedOffset;
ff.munmap(address - alignedExtraLen, size + alignedExtraLen, memoryTag);
}
public static long mapRO(FilesFacade ff, int fd, long size, int memoryTag) {
return mapRO(ff, fd, size, 0, memoryTag);
}
/**
* Maps a file in read-only mode.
*
* Important note. Linux requires the offset to be page aligned.
*
* @param ff files facade, - intermediary to allow intercepting calls to the OS.
* @param fd file descriptor, previously provided by one of openFile() functions
* @param size size of the mapped file region
* @param offset offset in file to begin mapping
* @param memoryTag bucket to trace memory allocation calls
* @return read-only memory address
*/
public static long mapRO(FilesFacade ff, int fd, long size, long offset, int memoryTag) {
assert fd != -1;
assert offset % ff.getPageSize() == 0;
final long address = ff.mmap(fd, size, offset, Files.MAP_RO, memoryTag);
if (address == FilesFacade.MAP_FAILED) {
throw CairoException.critical(ff.errno())
.put("could not mmap ")
.put(" [size=").put(size)
.put(", offset=").put(offset)
.put(", fd=").put(fd)
.put(", memUsed=").put(Unsafe.getMemUsed())
.put(", fileLen=").put(ff.length(fd))
.put(']');
}
return address;
}
public static long mapRW(FilesFacade ff, int fd, long size, int memoryTag) {
return mapRW(ff, fd, size, 0, memoryTag);
}
/**
* Maps a file in read-write mode.
*
* Important note. Linux requires the offset to be page aligned.
*
* @param ff files facade, - intermediary to allow intercepting calls to the OS.
* @param fd file descriptor, previously provided by one of openFile() functions. File has to be opened read-write
* @param size size of the mapped file region
* @param offset offset in file to begin mapping
* @param memoryTag bucket to trace memory allocation calls
* @return read-write memory address
*/
public static long mapRW(FilesFacade ff, int fd, long size, long offset, int memoryTag) {
assert fd != -1;
assert offset % ff.getPageSize() == 0;
allocateDiskSpace(ff, fd, size + offset);
return mapRWNoAlloc(ff, fd, size, offset, memoryTag);
}
/**
* Maps a file in read-write mode without allocating the disk space.
*
* Important note. Linux requires the offset to be page aligned.
*
* @param ff files facade, - intermediary to allow intercepting calls to the OS.
* @param fd file descriptor, previously provided by one of openFile() functions. File has to be opened read-write
* @param size size of the mapped file region
* @param offset offset in file to begin mapping
* @param memoryTag bucket to trace memory allocation calls
* @return read-write memory address
*/
public static long mapRWNoAlloc(FilesFacade ff, int fd, long size, long offset, int memoryTag) {
long addr = ff.mmap(fd, size, offset, Files.MAP_RW, memoryTag);
if (addr > -1) {
return addr;
}
int errno = ff.errno();
if (Os.type != Os.WINDOWS || errno != 112) {
throw CairoException.critical(ff.errno()).put("could not mmap column [fd=").put(fd).put(", size=").put(size).put(']');
}
throw CairoException.critical(ff.errno()).put("No space left [size=").put(size).put(", fd=").put(fd).put(']');
}
public static long mapRWOrClose(FilesFacade ff, int fd, long size, int memoryTag) {
try {
return TableUtils.mapRW(ff, fd, size, memoryTag);
} catch (CairoException e) {
ff.close(fd);
throw e;
}
}
public static long mremap(
FilesFacade ff,
int fd,
long prevAddress,
long prevSize,
long newSize,
int mapMode,
int memoryTag
) {
return mremap(ff, fd, prevAddress, prevSize, newSize, 0L, mapMode, memoryTag);
}
public static long mremap(
FilesFacade ff,
int fd,
long prevAddress,
long prevSize,
long newSize,
long offset,
int mapMode,
int memoryTag
) {
final long page = ff.mremap(fd, prevAddress, prevSize, newSize, offset, mapMode, memoryTag);
if (page == FilesFacade.MAP_FAILED) {
int errno = ff.errno();
// Closing memory will truncate size to current append offset.
// Since the failed resize can occur before append offset can be
// explicitly set, we must assume that file size should be
// equal to previous memory size
throw CairoException.critical(errno).put("could not remap file [previousSize=").put(prevSize)
.put(", newSize=").put(newSize)
.put(", offset=").put(offset)
.put(", fd=").put(fd)
.put(']');
}
return page;
}
public static Path offsetFileName(Path path, CharSequence columnName, long columnNameTxn) {
path.concat(columnName).put(".o");
if (columnNameTxn > COLUMN_NAME_TXN_NONE) {
path.put('.').put(columnNameTxn);
}
return path.$();
}
public static void oldPartitionName(Path path, long txn) {
path.put("-x-").put(txn);
}
public static int openFileRWOrFail(FilesFacade ff, LPSZ path, long opts) {
return openRW(ff, path, LOG, opts);
}
public static int openRO(FilesFacade ff, Path path, CharSequence fileName, Log log) {
final int rootLen = path.length();
path.concat(fileName).$();
try {
return TableUtils.openRO(ff, path, log);
} finally {
path.trimTo(rootLen);
}
}
public static int openRO(FilesFacade ff, LPSZ path, Log log) {
final int fd = ff.openRO(path);
if (fd > -1) {
log.debug().$("open [file=").$(path).$(", fd=").$(fd).I$();
return fd;
}
throw CairoException.critical(ff.errno()).put("could not open read-only [file=").put(path).put(']');
}
public static int openRW(FilesFacade ff, LPSZ path, Log log, long opts) {
final int fd = ff.openRW(path, opts);
if (fd > -1) {
log.debug().$("open [file=").$(path).$(", fd=").$(fd).I$();
return fd;
}
throw CairoException.critical(ff.errno()).put("could not open read-write [file=").put(path).put(']');
}
public static void openSmallFile(FilesFacade ff, Path path, int rootLen, MemoryMR metaMem, CharSequence fileName, int memoryTag) {
path.concat(fileName).$();
try {
metaMem.smallFile(ff, path, memoryTag);
} finally {
path.trimTo(rootLen);
}
}
public static void overwriteTableNameFile(Path tablePath, MemoryMARW memory, FilesFacade ff, TableToken newTableToken) {
// Update name in _name file.
// This is potentially racy but the file only read on startup when the tables.d file is missing
// so very limited circumstances.
Path nameFilePath = tablePath.concat(TABLE_NAME_FILE).$();
memory.smallFile(ff, nameFilePath, MemoryTag.MMAP_TABLE_WRITER);
memory.jumpTo(0);
createTableNameFile(memory, newTableToken.getTableName());
memory.close(true, Vm.TRUNCATE_TO_POINTER);
}
public static int readIntOrFail(FilesFacade ff, int fd, long offset, long tempMem8b, Path path) {
if (ff.read(fd, tempMem8b, Integer.BYTES, offset) != Integer.BYTES) {
throw CairoException.critical(ff.errno()).put("Cannot read: ").put(path);
}
return Unsafe.getUnsafe().getInt(tempMem8b);
}
public static long readLongAtOffset(FilesFacade ff, Path path, long tempMem8b, long offset) {
final int fd = TableUtils.openRO(ff, path, LOG);
try {
return readLongOrFail(ff, fd, offset, tempMem8b, path);
} finally {
ff.close(fd);
}
}
public static long readLongOrFail(FilesFacade ff, int fd, long offset, long tempMem8b, @Nullable Path path) {
if (ff.read(fd, tempMem8b, Long.BYTES, offset) != Long.BYTES) {
if (path != null) {
throw CairoException.critical(ff.errno()).put("could not read long [path=").put(path).put(", fd=").put(fd).put(", offset=").put(offset);
}
throw CairoException.critical(ff.errno()).put("could not read long [fd=").put(fd).put(", offset=").put(offset);
}
return Unsafe.getUnsafe().getLong(tempMem8b);
}
public static String readTableName(Path path, int rootLen, MemoryCMR mem, FilesFacade ff) {
int fd = -1;
try {
path.concat(TableUtils.TABLE_NAME_FILE).$();
fd = ff.openRO(path);
if (fd < 1) {
return null;
}
long fileLen = ff.length(fd);
if (fileLen > Integer.BYTES) {
int charLen = ff.readNonNegativeInt(fd, 0);
if (charLen * 2L + Integer.BYTES != fileLen - 1) {
LOG.error().$("invalid table name file [path=").$(path).$(", headerLen=").$(charLen).$(", fileLen=").$(fileLen).I$();
return null;
}
mem.of(ff, path, fileLen, fileLen, MemoryTag.MMAP_DEFAULT);
return Chars.toString(mem.getStr(0));
} else {
LOG.error().$("invalid table name file [path=").$(path).$(", fileLen=").$(fileLen).I$();
return null;
}
} finally {
path.trimTo(rootLen);
ff.close(fd);
}
}
public static void removeColumnFromMetadata(
CharSequence columnName,
LowerCaseCharSequenceIntHashMap columnNameIndexMap,
ObjList columnMetadata
) {
final int columnIndex = columnNameIndexMap.get(columnName);
if (columnIndex < 0) {
throw CairoException.critical(0).put("Column not found: ").put(columnName);
}
columnNameIndexMap.remove(columnName);
final TableColumnMetadata deletedMeta = columnMetadata.getQuick(columnIndex);
deletedMeta.markDeleted();
}
public static void removeOrException(FilesFacade ff, int fd, LPSZ path) {
if (ff.exists(path) && !ff.closeRemove(fd, path)) {
throw CairoException.critical(ff.errno()).put("Cannot remove ").put(path);
}
}
public static void renameColumnInMetadata(
CharSequence columnName,
CharSequence newName,
LowerCaseCharSequenceIntHashMap columnNameIndexMap,
ObjList columnMetadata
) {
final int columnIndex = columnNameIndexMap.get(columnName);
if (columnIndex < 0) {
throw CairoException.critical(0).put("Column not found: ").put(columnName);
}
final String newNameStr = newName.toString();
columnMetadata.getQuick(columnIndex).setName(newNameStr);
columnNameIndexMap.removeEntry(columnName);
columnNameIndexMap.put(newNameStr, columnIndex);
}
public static void renameOrFail(FilesFacade ff, Path src, Path dst) {
if (ff.rename(src, dst) != Files.FILES_RENAME_OK) {
throw CairoException.critical(ff.errno()).put("could not rename ").put(src).put(" -> ").put(dst);
}
}
public static void resetTodoLog(FilesFacade ff, Path path, int rootLen, MemoryMARW mem) {
mem.smallFile(ff, path.trimTo(rootLen).concat(TODO_FILE_NAME).$(), MemoryTag.MMAP_DEFAULT);
mem.jumpTo(0);
mem.putLong(24, 0); // txn check
Unsafe.getUnsafe().storeFence();
mem.putLong(8, 0); // hashLo
mem.putLong(16, 0); // hashHi
Unsafe.getUnsafe().storeFence();
mem.putLong(0, 0); // txn
mem.putLong(32, 0); // count
mem.jumpTo(40);
mem.sync(false);
}
public static void resetTxn(
MemoryMW txMem,
long baseOffset,
int symbolMapCount,
long txn,
long seqTxn,
long dataVersion,
long partitionTableVersion,
long structureVersion,
long columnVersion,
long truncateVersion
) {
// txn to let readers know table is being reset
txMem.putLong(baseOffset + TX_OFFSET_TXN_64, txn);
// transient row count
txMem.putLong(baseOffset + TX_OFFSET_TRANSIENT_ROW_COUNT_64, 0);
// fixed row count
txMem.putLong(baseOffset + TX_OFFSET_FIXED_ROW_COUNT_64, 0);
// min timestamp value in table
txMem.putLong(baseOffset + TX_OFFSET_MIN_TIMESTAMP_64, Long.MAX_VALUE);
// max timestamp value in table
txMem.putLong(baseOffset + TX_OFFSET_MAX_TIMESTAMP_64, Long.MIN_VALUE);
// structure version
txMem.putLong(baseOffset + TX_OFFSET_STRUCT_VERSION_64, structureVersion);
// data version
txMem.putLong(baseOffset + TX_OFFSET_DATA_VERSION_64, dataVersion);
// partition table version
txMem.putLong(baseOffset + TX_OFFSET_PARTITION_TABLE_VERSION_64, partitionTableVersion);
// column version
txMem.putLong(baseOffset + TX_OFFSET_COLUMN_VERSION_64, columnVersion);
// truncate version
txMem.putLong(baseOffset + TX_OFFSET_TRUNCATE_VERSION_64, truncateVersion);
// sequencer txn
txMem.putLong(baseOffset + TX_OFFSET_SEQ_TXN_64, seqTxn);
txMem.putInt(baseOffset + TX_OFFSET_MAP_WRITER_COUNT_32, symbolMapCount);
txMem.putLong(baseOffset + TX_OFFSET_LAG_MIN_TIMESTAMP_64, Long.MAX_VALUE);
txMem.putLong(baseOffset + TX_OFFSET_LAG_MAX_TIMESTAMP_64, Long.MIN_VALUE);
txMem.putInt(baseOffset + TX_OFFSET_LAG_ROW_COUNT_32, 0);
txMem.putInt(baseOffset + TX_OFFSET_LAG_TXN_COUNT_32, 0);
txMem.putInt(baseOffset + TX_OFFSET_CHECKSUM_32, EMPTY_TABLE_LAG_CHECKSUM);
for (int i = 0; i < symbolMapCount; i++) {
long offset = getSymbolWriterIndexOffset(i);
txMem.putInt(baseOffset + offset, 0);
offset += Integer.BYTES;
txMem.putInt(baseOffset + offset, 0);
}
// partition update count
txMem.putInt(baseOffset + getPartitionTableSizeOffset(symbolMapCount), 0);
}
public static void safeReadTxn(TxReader txReader, MillisecondClock clock, long spinLockTimeout) {
long deadline = clock.getTicks() + spinLockTimeout;
if (txReader.unsafeReadVersion() == txReader.getVersion()) {
LOG.debug().$("checked clean txn, version ").$(txReader.getVersion()).$(", txn=").$(txReader.getTxn()).$();
return;
}
while (true) {
if (txReader.unsafeLoadAll()) {
LOG.debug().$("loaded clean txn, version ").$(txReader.getVersion())
.$(", offset=").$(txReader.getBaseOffset())
.$(", size=").$(txReader.getRecordSize())
.$(", txn=").$(txReader.getTxn()).$();
// All good, snapshot read
return;
}
// This is unlucky, sequences have changed while we were reading transaction data
// We must discard and try again
if (clock.getTicks() > deadline) {
LOG.error().$("tx read timeout [timeout=").$(spinLockTimeout).utf8("ms]").$();
throw CairoException.critical(0).put("Transaction read timeout");
}
LOG.debug().$("loaded __dirty__ txn, version ").$(txReader.getVersion()).$();
Os.pause();
}
}
public static boolean schedulePurgeO3Partitions(MessageBus messageBus, TableToken tableName, int partitionBy) {
final MPSequence seq = messageBus.getO3PurgeDiscoveryPubSeq();
while (true) {
long cursor = seq.next();
if (cursor > -1) {
O3PartitionPurgeTask task = messageBus.getO3PurgeDiscoveryQueue().get(cursor);
task.of(tableName, partitionBy);
seq.done(cursor);
return true;
} else if (cursor == -1) {
return false;
}
Os.pause();
}
}
public static void setNull(int columnType, long addr, long count) {
switch (ColumnType.tagOf(columnType)) {
case ColumnType.BOOLEAN:
case ColumnType.BYTE:
Vect.memset(addr, count, 0);
break;
case ColumnType.GEOBYTE:
Vect.memset(addr, count, GeoHashes.BYTE_NULL);
break;
case ColumnType.CHAR:
case ColumnType.SHORT:
Vect.setMemoryShort(addr, (short) 0, count);
break;
case ColumnType.GEOSHORT:
Vect.setMemoryShort(addr, GeoHashes.SHORT_NULL, count);
break;
case ColumnType.INT:
Vect.setMemoryInt(addr, Numbers.INT_NaN, count);
break;
case ColumnType.GEOINT:
Vect.setMemoryInt(addr, GeoHashes.INT_NULL, count);
break;
case ColumnType.FLOAT:
Vect.setMemoryFloat(addr, Float.NaN, count);
break;
case ColumnType.SYMBOL:
Vect.setMemoryInt(addr, SymbolTable.VALUE_IS_NULL, count);
break;
case ColumnType.LONG:
case ColumnType.DATE:
case ColumnType.TIMESTAMP:
Vect.setMemoryLong(addr, Numbers.LONG_NaN, count);
break;
case ColumnType.GEOLONG:
Vect.setMemoryLong(addr, GeoHashes.NULL, count);
break;
case ColumnType.DOUBLE:
Vect.setMemoryDouble(addr, Double.NaN, count);
break;
case ColumnType.LONG256:
// Long256 is null when all 4 longs are NaNs
Vect.setMemoryLong(addr, Numbers.LONG_NaN, count * 4);
break;
case ColumnType.LONG128:
// fall through
case ColumnType.UUID:
// Long128 and UUID are null when all 2 longs are NaNs
Vect.setMemoryLong(addr, Numbers.LONG_NaN, count * 2);
break;
default:
break;
}
}
/**
* Sets the path to the directory of a partition taking into account the timestamp, the partitioning scheme
* and the partition version.
*
* @param path Set to the root directory for a table, this will be updated to the root directory of the partition
* @param partitionBy Partitioning scheme
* @param timestamp A timestamp in the partition
* @param nameTxn Partition txn suffix
*/
public static void setPathForPartition(Path path, int partitionBy, long timestamp, long nameTxn) {
setSinkForPartition(path.slash(), partitionBy, timestamp, nameTxn);
}
/**
* Sets the sink to the directory of a partition taking into account the timestamp, the partitioning scheme
* and the partition version.
*
* @param sink Set to the root directory for a table, this will be updated to the root directory of the partition
* @param partitionBy Partitioning scheme
* @param timestamp A timestamp in the partition
* @param nameTxn Partition txn suffix
*/
public static void setSinkForPartition(CharSink sink, int partitionBy, long timestamp, long nameTxn) {
PartitionBy.setSinkForPartition(sink, partitionBy, timestamp);
if (nameTxn > -1L) {
sink.put('.').put(nameTxn);
}
}
public static int toIndexKey(int symbolKey) {
return symbolKey == SymbolTable.VALUE_IS_NULL ? 0 : symbolKey + 1;
}
public static void validateIndexValueBlockSize(int position, int indexValueBlockSize) throws SqlException {
if (indexValueBlockSize < MIN_INDEX_VALUE_BLOCK_SIZE) {
throw SqlException.$(position, "min index block capacity is ").put(MIN_INDEX_VALUE_BLOCK_SIZE);
}
if (indexValueBlockSize > MAX_INDEX_VALUE_BLOCK_SIZE) {
throw SqlException.$(position, "max index block capacity is ").put(MAX_INDEX_VALUE_BLOCK_SIZE);
}
}
public static void validateMeta(
MemoryMR metaMem,
LowerCaseCharSequenceIntHashMap nameIndex,
int expectedVersion
) {
try {
final long memSize = checkMemSize(metaMem, META_OFFSET_COLUMN_TYPES);
validateMetaVersion(metaMem, META_OFFSET_VERSION, expectedVersion);
final int columnCount = getColumnCount(metaMem, META_OFFSET_COUNT);
long offset = getColumnNameOffset(columnCount);
if (memSize < offset) {
throw validationException(metaMem).put("File is too small, column types are missing ").put(memSize);
}
// validate designated timestamp column
final int timestampIndex = getTimestampIndex(metaMem, META_OFFSET_TIMESTAMP_INDEX, columnCount);
if (timestampIndex != -1) {
final int timestampType = getColumnType(metaMem, timestampIndex);
if (!ColumnType.isTimestamp(timestampType)) {
throw validationException(metaMem).put("Timestamp column must be TIMESTAMP, but found ").put(ColumnType.nameOf(timestampType));
}
}
// validate column types and index attributes
for (int i = 0; i < columnCount; i++) {
final int type = Math.abs(getColumnType(metaMem, i));
if (ColumnType.sizeOf(type) == -1) {
throw validationException(metaMem).put("Invalid column type ").put(type).put(" at [").put(i).put(']');
}
if (isColumnIndexed(metaMem, i)) {
if (!ColumnType.isSymbol(type)) {
throw validationException(metaMem).put("Index flag is only supported for SYMBOL").put(" at [").put(i).put(']');
}
if (getIndexBlockCapacity(metaMem, i) < 2) {
throw validationException(metaMem).put("Invalid index value block capacity ").put(getIndexBlockCapacity(metaMem, i)).put(" at [").put(i).put(']');
}
}
}
// validate column names
int denseCount = 0;
for (int i = 0; i < columnCount; i++) {
final CharSequence name = getColumnName(metaMem, memSize, offset, i);
if (getColumnType(metaMem, i) < 0 || nameIndex.put(name, denseCount++)) {
offset += Vm.getStorageLength(name);
} else {
throw validationException(metaMem).put("Duplicate column [name=").put(name).put("] at ").put(i);
}
}
} catch (Throwable e) {
nameIndex.clear();
throw e;
}
}
public static void validateMetaVersion(MemoryMR metaMem, long metaVersionOffset, int expectedVersion) {
final int metaVersion = metaMem.getInt(metaVersionOffset);
if (expectedVersion != metaVersion) {
throw validationException(metaMem)
.put("Metadata version does not match runtime version [expected=").put(expectedVersion)
.put(", actual=").put(metaVersion)
.put(']');
}
}
public static void validateSymbolCapacity(int position, int symbolCapacity) throws SqlException {
if (symbolCapacity < MIN_SYMBOL_CAPACITY) {
throw SqlException.$(position, "min symbol capacity is ").put(MIN_SYMBOL_CAPACITY);
}
if (symbolCapacity > MAX_SYMBOL_CAPACITY) {
throw SqlException.$(position, "max symbol capacity is ").put(MAX_SYMBOL_CAPACITY);
}
}
public static void validateSymbolCapacityCached(boolean cache, int symbolCapacity, int cacheKeywordPosition) throws SqlException {
if (cache && symbolCapacity > MAX_SYMBOL_CAPACITY_CACHED) {
throw SqlException.$(cacheKeywordPosition, "max cached symbol capacity is ").put(MAX_SYMBOL_CAPACITY_CACHED);
}
}
public static CairoException validationException(MemoryMR mem) {
return CairoException.critical(CairoException.METADATA_VALIDATION).put("Invalid metadata at fd=").put(mem.getFd()).put(". ");
}
public static void writeIntOrFail(FilesFacade ff, int fd, long offset, int value, long tempMem8b, Path path) {
Unsafe.getUnsafe().putInt(tempMem8b, value);
if (ff.write(fd, tempMem8b, Integer.BYTES, offset) != Integer.BYTES) {
throw CairoException.critical(ff.errno())
.put("could not write 8 bytes [path=").put(path)
.put(", fd=").put(fd)
.put(", offset=").put(offset)
.put(", value=").put(value)
.put(']');
}
}
public static void writeLongOrFail(FilesFacade ff, int fd, long offset, long value, long tempMem8b, Path path) {
Unsafe.getUnsafe().putLong(tempMem8b, value);
if (ff.write(fd, tempMem8b, Long.BYTES, offset) != Long.BYTES) {
throw CairoException.critical(ff.errno())
.put("could not write 8 bytes [path=").put(path)
.put(", fd=").put(fd)
.put(", offset=").put(offset)
.put(", value=").put(value)
.put(']');
}
}
private static void createTableFiles(
FilesFacade ff,
MemoryMARW memory,
Path path,
int rootLen,
CharSequence tableDir,
TableStructure structure,
int tableVersion,
int tableId
) {
final int dirFd = !ff.isRestrictedFileSystem() ? TableUtils.openRO(ff, path.trimTo(rootLen).$(), LOG) : 0;
try (MemoryMARW mem = memory) {
mem.smallFile(ff, path.trimTo(rootLen).concat(META_FILE_NAME).$(), MemoryTag.MMAP_DEFAULT);
mem.jumpTo(0);
final int count = structure.getColumnCount();
path.trimTo(rootLen);
mem.putInt(count);
mem.putInt(structure.getPartitionBy());
int timestampIndex = structure.getTimestampIndex();
assert timestampIndex == -1 ||
(timestampIndex >= 0 && timestampIndex < count && structure.getColumnType(timestampIndex) == ColumnType.TIMESTAMP);
mem.putInt(timestampIndex);
mem.putInt(tableVersion);
mem.putInt(tableId);
mem.putInt(structure.getMaxUncommittedRows());
mem.putLong(structure.getO3MaxLag());
mem.putLong(0); // Structure version.
mem.putInt(structure.isWalEnabled() ? 1 : 0);
mem.jumpTo(TableUtils.META_OFFSET_COLUMN_TYPES);
assert count > 0;
for (int i = 0; i < count; i++) {
mem.putInt(structure.getColumnType(i));
long flags = 0;
if (structure.isIndexed(i)) {
flags |= META_FLAG_BIT_INDEXED;
}
if (structure.isSequential(i)) {
flags |= META_FLAG_BIT_SEQUENTIAL;
}
mem.putLong(flags);
mem.putInt(structure.getIndexBlockCapacity(i));
// reserved
mem.skip(16);
}
for (int i = 0; i < count; i++) {
mem.putStr(structure.getColumnName(i));
}
mem.sync(false);
// create symbol maps
int symbolMapCount = 0;
for (int i = 0; i < count; i++) {
if (ColumnType.isSymbol(structure.getColumnType(i))) {
createSymbolMapFiles(
ff,
mem,
path.trimTo(rootLen),
structure.getColumnName(i),
COLUMN_NAME_TXN_NONE,
structure.getSymbolCapacity(i),
structure.getSymbolCacheFlag(i)
);
symbolMapCount++;
}
}
mem.smallFile(ff, path.trimTo(rootLen).concat(TXN_FILE_NAME).$(), MemoryTag.MMAP_DEFAULT);
createTxn(mem, symbolMapCount, 0L, 0L, INITIAL_TXN, 0L, 0L, 0L, 0L);
mem.sync(false);
mem.smallFile(ff, path.trimTo(rootLen).concat(COLUMN_VERSION_FILE_NAME).$(), MemoryTag.MMAP_DEFAULT);
createColumnVersionFile(mem);
mem.sync(false);
mem.close();
resetTodoLog(ff, path, rootLen, mem);
// allocate txn scoreboard
path.trimTo(rootLen).concat(TXN_SCOREBOARD_FILE_NAME).$();
mem.smallFile(ff, path.trimTo(rootLen).concat(TABLE_NAME_FILE).$(), MemoryTag.MMAP_DEFAULT);
createTableNameFile(mem, getTableNameFromDirName(tableDir));
} finally {
if (dirFd > 0) {
ff.fsyncAndClose(dirFd);
}
}
}
private static int exists(FilesFacade ff, Path path) {
if (ff.exists(path)) { // it can also be a file, for example created with touch
if (ff.exists(path.concat(TXN_FILE_NAME).$())) {
return TABLE_EXISTS;
} else {
return TABLE_RESERVED;
}
} else {
return TABLE_DOES_NOT_EXIST;
}
}
private static CharSequence getCharSequence(MemoryMR metaMem, long memSize, long offset, int strLength) {
if (strLength < 1 || strLength > 255) {
// EXT4 and many others do not allow file name length > 255 bytes
throw validationException(metaMem).put("String length of ").put(strLength).put(" is invalid at offset ").put(offset);
}
final long storageLength = Vm.getStorageLength(strLength);
if (offset + storageLength > memSize) {
throw CairoException.critical(0).put("File is too small, size=").put(memSize).put(", required=").put(offset + storageLength);
}
return metaMem.getStr(offset);
}
private static int getInt(MemoryMR metaMem, long memSize, long offset) {
if (memSize < offset + Integer.BYTES) {
throw CairoException.critical(0).put("File is too small, size=").put(memSize).put(", required=").put(offset + Integer.BYTES);
}
return metaMem.getInt(offset);
}
// Utility method for debugging. This method is not used in production.
@SuppressWarnings("unused")
static boolean assertTimestampInOrder(long srcTimestampAddr, long srcDataMax) {
long prev = Long.MIN_VALUE;
for (long i = 0; i < srcDataMax; i++) {
long newTs = Unsafe.getUnsafe().getLong(srcTimestampAddr + i * Long.BYTES);
if (newTs < prev) {
return false;
}
prev = newTs;
}
return true;
}
static void createDirsOrFail(FilesFacade ff, Path path, int mkDirMode) {
if (ff.mkdirs(path, mkDirMode) != 0) {
throw CairoException.critical(ff.errno()).put("could not create directories [file=").put(path).put(']');
}
}
static long getColumnFlags(MemoryR metaMem, int columnIndex) {
return metaMem.getLong(META_OFFSET_COLUMN_TYPES + columnIndex * META_COLUMN_DATA_SIZE + 4);
}
static int getIndexBlockCapacity(MemoryR metaMem, int columnIndex) {
return metaMem.getInt(META_OFFSET_COLUMN_TYPES + columnIndex * META_COLUMN_DATA_SIZE + 4 + 8);
}
static boolean isColumnIndexed(MemoryR metaMem, int columnIndex) {
return (getColumnFlags(metaMem, columnIndex) & META_FLAG_BIT_INDEXED) != 0;
}
static boolean isSequential(MemoryR metaMem, int columnIndex) {
return (getColumnFlags(metaMem, columnIndex) & META_FLAG_BIT_SEQUENTIAL) != 0;
}
static int openMetaSwapFile(FilesFacade ff, MemoryMA mem, Path path, int rootLen, int retryCount) {
try {
path.concat(META_SWAP_FILE_NAME).$();
int l = path.length();
int index = 0;
do {
if (index > 0) {
path.trimTo(l).put('.').put(index);
path.$();
}
if (!ff.exists(path) || ff.remove(path)) {
try {
mem.smallFile(ff, path, MemoryTag.MMAP_DEFAULT);
mem.jumpTo(0);
return index;
} catch (CairoException e) {
// right, cannot open file for some reason?
LOG.error()
.$("could not open swap [file=").$(path)
.$(", errno=").$(e.getErrno())
.I$();
}
} else {
LOG.error()
.$("could not remove swap [file=").$(path)
.$(", errno=").$(ff.errno())
.I$();
}
} while (++index < retryCount);
throw CairoException.critical(0).put("Cannot open indexed file. Max number of attempts reached [").put(index).put("]. Last file tried: ").put(path);
} finally {
path.trimTo(rootLen);
}
}
static void openMetaSwapFileByIndex(FilesFacade ff, MemoryMA mem, Path path, int rootLen, int swapIndex) {
try {
path.concat(META_SWAP_FILE_NAME);
if (swapIndex > 0) {
path.put('.').put(swapIndex);
}
path.$();
mem.smallFile(ff, path, MemoryTag.MMAP_DEFAULT);
} finally {
path.trimTo(rootLen);
}
}
public interface FailureCloseable {
void close(long prevSize);
}
static {
//noinspection ConstantValue
assert TX_OFFSET_LAG_MAX_TIMESTAMP_64 + 8 <= TX_OFFSET_MAP_WRITER_COUNT_32;
}
}