io.questdb.cairo.TableReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of core Show documentation
Show all versions of core Show documentation
QuestDB is a high-performance time series database
/*******************************************************************************
* ___ _ ____ ____
* / _ \ _ _ ___ ___| |_| _ \| __ )
* | | | | | | |/ _ \/ __| __| | | | _ \
* | |_| | |_| | __/\__ \ |_| |_| | |_) |
* \__\_\\__,_|\___||___/\__|____/|____/
*
* Copyright (c) 2014-2019 Appsicle
* Copyright (c) 2019-2020 QuestDB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package io.questdb.cairo;
import io.questdb.cairo.sql.RecordMetadata;
import io.questdb.log.Log;
import io.questdb.log.LogFactory;
import io.questdb.std.*;
import io.questdb.std.microtime.TimestampLocaleFactory;
import io.questdb.std.microtime.Timestamps;
import io.questdb.std.str.Path;
import java.io.Closeable;
import java.util.concurrent.locks.LockSupport;
import static io.questdb.cairo.TableUtils.TX_OFFSET_MIN_TIMESTAMP;
public class TableReader implements Closeable {
// Class-wide logger.
private static final Log LOG = LogFactory.getLog(TableReader.class);
// Partition path generators; the constructor picks one based on metadata.getPartitionBy().
private static final PartitionPathGenerator YEAR_GEN = TableReader::pathGenYear;
private static final PartitionPathGenerator MONTH_GEN = TableReader::pathGenMonth;
private static final PartitionPathGenerator DAY_GEN = TableReader::pathGenDay;
// Generator used for the default (non-partitioned) layout; the partition index is ignored.
private static final PartitionPathGenerator DEFAULT_GEN = (reader, partitionIndex) -> reader.pathGenDefault();
// Reload strategies. The constructor installs one of the FIRST_TIME_* variants; they are replaced by the
// regular variants once a partition has been opened or an initial reload has found partitions.
private static final ReloadMethod NON_PARTITIONED_RELOAD_METHOD = TableReader::reloadNonPartitioned;
private static final ReloadMethod FIRST_TIME_NON_PARTITIONED_RELOAD_METHOD = TableReader::reloadInitialNonPartitioned;
private static final ReloadMethod PARTITIONED_RELOAD_METHOD = TableReader::reloadPartitioned;
private static final ReloadMethod FIRST_TIME_PARTITIONED_RELOAD_METHOD = TableReader::reloadInitialPartitioned;
// Identity floor: returns the timestamp unchanged. Used by the constructor's default (non-partitioned) branch.
private static final TimestampFloorMethod ENTITY_FLOOR_METHOD = timestamp -> timestamp;
// Scratch holder used to carry one column's state from the old column lists into the new ones.
private final ColumnCopyStruct tempCopyStruct = new ColumnCopyStruct();
private final FilesFacade ff;
// Reusable path buffer rooted at the table directory; users append to it and trim it back to rootLen.
private final Path path;
// Length of the table directory path, i.e. the position `path` is trimmed back to.
private final int rootLen;
// Read-only view of the transaction file.
private final ReadOnlyMemory txMem;
private final TableReaderMetadata metadata;
// Row count per partition; -1 marks a partition that is not currently open.
private final LongList partitionRowCounts;
private final PartitionPathGenerator partitionPathGenerator;
private final TableReaderRecordCursor recordCursor = new TableReaderRecordCursor();
private final TimestampFloorMethod timestampFloorMethod;
// null when the table is not partitioned.
private final IntervalLengthMethod intervalLengthMethod;
// null when the table is not partitioned.
private final PartitionTimestampCalculatorMethod partitionTimestampCalculatorMethod;
private final String tableName;
// Indexed by column index; openSymbolMaps() only populates entries for SYMBOL columns.
private final ObjList symbolMapReaders = new ObjList<>();
private final CairoConfiguration configuration;
// Symbol counts captured by the most recent successful readTxn().
private final IntList symbolCountSnapshot = new IntList();
// Partition timestamps read from the transaction file's partition table by readTxn();
// openPartition0() refuses to open partitions whose timestamp is in this set.
private final LongHashSet removedPartitions = new LongHashSet();
// Column tops, addressed as columnBase / 2 + columnIndex.
private LongList columnTops;
// Column memory: two consecutive slots per column (primary at base + index * 2, secondary right after).
private ObjList columns;
// Bitmap index readers, addressed like `columns`: backward reader in the primary slot, forward reader in the next.
private ObjList bitmapIndexes;
private int columnCount;
// Shift applied to a partition index to obtain that partition's base offset in `columns` (see getColumnBase).
private int columnCountBits;
private long transientRowCount;
private long structVersion;
private long dataVersion;
private long prevStructVersion;
private long partitionTableVersion;
private long prevPartitionTableVersion;
// Sum of the fixed and transient row counts from the last transaction read.
private long rowCount;
private long txn = TableUtils.INITIAL_TXN;
private long maxTimestamp = Numbers.LONG_NaN;
private int partitionCount;
// Long.MAX_VALUE is treated as "no minimum timestamp" (see calculatePartitionCount).
private long minTimestamp = Long.MAX_VALUE;
private long prevMinTimestamp = Long.MAX_VALUE;
private ReloadMethod reloadMethod;
// 8 bytes of native scratch memory; a value of 0 means the reader has been closed (see isOpen).
private long tempMem8b = Unsafe.malloc(8);
/**
 * Opens a reader over the table directory {@code configuration.getRoot()/tableName}.
 * <p>
 * The constructor refuses to open a table that has a pending to-do file, opens the transaction
 * and metadata files, selects the partition path generator / reload strategy / timestamp helpers
 * matching the table's partitioning scheme, reads the current transaction, opens symbol maps and
 * sizes the per-partition column lists. Partitions themselves are not opened here.
 * <p>
 * If initialisation fails for any reason, resources acquired so far are released through
 * {@link #close()} before the failure is propagated.
 *
 * @param configuration engine configuration supplying the root directory and files facade
 * @param tableName     name of the table directory under the configured root
 */
public TableReader(CairoConfiguration configuration, CharSequence tableName) {
    LOG.info().$("open '").utf8(tableName).$('\'').$();
    this.configuration = configuration;
    this.ff = configuration.getFilesFacade();
    this.tableName = Chars.stringOf(tableName);
    this.path = new Path().of(configuration.getRoot()).concat(tableName);
    this.rootLen = path.length();
    try {
        failOnPendingTodo();
        this.txMem = openTxnFile();
        this.metadata = openMetaFile();
        this.columnCount = this.metadata.getColumnCount();
        this.columnCountBits = getColumnBits(columnCount);
        switch (this.metadata.getPartitionBy()) {
            case PartitionBy.DAY:
                partitionPathGenerator = DAY_GEN;
                reloadMethod = FIRST_TIME_PARTITIONED_RELOAD_METHOD;
                timestampFloorMethod = Timestamps::floorDD;
                intervalLengthMethod = Timestamps::getDaysBetween;
                partitionTimestampCalculatorMethod = Timestamps::addDays;
                break;
            case PartitionBy.MONTH:
                partitionPathGenerator = MONTH_GEN;
                reloadMethod = FIRST_TIME_PARTITIONED_RELOAD_METHOD;
                timestampFloorMethod = Timestamps::floorMM;
                intervalLengthMethod = Timestamps::getMonthsBetween;
                partitionTimestampCalculatorMethod = Timestamps::addMonths;
                break;
            case PartitionBy.YEAR:
                partitionPathGenerator = YEAR_GEN;
                reloadMethod = FIRST_TIME_PARTITIONED_RELOAD_METHOD;
                timestampFloorMethod = Timestamps::floorYYYY;
                intervalLengthMethod = Timestamps::getYearsBetween;
                partitionTimestampCalculatorMethod = Timestamps::addYear;
                break;
            default:
                // not partitioned: single default partition, no interval arithmetic available
                partitionPathGenerator = DEFAULT_GEN;
                reloadMethod = FIRST_TIME_NON_PARTITIONED_RELOAD_METHOD;
                timestampFloorMethod = ENTITY_FLOOR_METHOD;
                intervalLengthMethod = null;
                partitionTimestampCalculatorMethod = null;
                break;
        }
        // readTxn() relies on timestampFloorMethod, so it must run after the switch above
        readTxn();
        openSymbolMaps();
        this.prevStructVersion = structVersion;
        this.prevPartitionTableVersion = partitionTableVersion;
        if (metadata.getPartitionBy() == PartitionBy.NONE) {
            checkDefaultPartitionExistsAndUpdatePartitionCount();
        } else {
            partitionCount = calculatePartitionCount();
        }
        // two slots per column for every partition; column tops need one slot per column
        int capacity = getColumnBase(partitionCount);
        this.columns = new ObjList<>(capacity);
        this.columns.setPos(capacity);
        this.bitmapIndexes = new ObjList<>(capacity);
        this.bitmapIndexes.setPos(capacity);
        this.partitionRowCounts = new LongList(partitionCount);
        // -1 marks every partition as "not opened yet"
        this.partitionRowCounts.seed(partitionCount, -1);
        this.columnTops = new LongList(capacity / 2);
        this.columnTops.setPos(capacity / 2);
        this.recordCursor.of(this);
    } catch (Throwable e) {
        // Release native memory, the path buffer and any opened files on every kind of failure,
        // not only CairoException; otherwise an unexpected runtime error or Error leaks them.
        close();
        throw e;
    }
}
/**
 * Computes the shift used to turn a partition index into that partition's base offset in the
 * column list. The column count is rounded up to a power of two and doubled before its
 * most significant bit is taken; every column takes two slots (see
 * {@link #getPrimaryColumnIndex(int, int)}).
 *
 * @param columnCount number of columns in the table
 * @return number of bits to shift a partition index by
 */
private static int getColumnBits(int columnCount) {
    return Numbers.msb(2 * Numbers.ceilPow2(columnCount));
}
/**
 * Returns the absolute slot of a column's primary entry. Each column owns two consecutive
 * slots, so the column index is doubled before being added to the partition base.
 *
 * @param base  base offset of the partition in the column list
 * @param index column index within the partition
 * @return absolute index of the primary slot; the secondary slot is the next one
 */
static int getPrimaryColumnIndex(int base, int index) {
    return (index << 1) + base;
}
/**
 * Tests and sets the "processed" flag byte at {@code address + index}.
 *
 * @param address start of the flag byte area
 * @param index   offset of the flag to check
 * @return {@code false} if the flag was already -1; otherwise the flag is set to -1 and
 * {@code true} is returned
 */
private static boolean isEntryToBeProcessed(long address, int index) {
    final boolean alreadyProcessed = Unsafe.getUnsafe().getByte(address + index) == -1;
    if (!alreadyProcessed) {
        Unsafe.getUnsafe().putByte(address + index, (byte) -1);
    }
    return !alreadyProcessed;
}
/**
 * Extends the mapped size of a column so that {@code rowCount} rows are readable.
 * BINARY and STRING columns are delegated to {@link #growBin} / {@link #growStr}; every other
 * type grows {@code mem1} to {@code rowCount << ColumnType.pow2SizeOf(type)} bytes.
 * Nothing happens when {@code rowCount} is not positive.
 *
 * @param mem1     primary column memory
 * @param mem2     secondary column memory; only used for BINARY and STRING
 * @param type     column type
 * @param rowCount number of rows to expose; callers pass the partition row count minus the column top
 */
private static void growColumn(ReadOnlyColumn mem1, ReadOnlyColumn mem2, int type, long rowCount) {
    if (rowCount <= 0) {
        return;
    }
    if (type == ColumnType.BINARY) {
        growBin(mem1, mem2, rowCount);
    } else if (type == ColumnType.STRING) {
        growStr(mem1, mem2, rowCount);
    } else {
        mem1.grow(rowCount << ColumnType.pow2SizeOf(type));
    }
}
/**
 * Grows a STRING column. {@code mem2} is grown to hold one 8-byte offset per row; the offset of
 * the last row is then used to grow {@code mem1} far enough to read that value's 4-byte length
 * and, when the length is positive, a further {@code len * 2} bytes after it.
 *
 * @param mem1     data memory
 * @param mem2     offset memory, must not be null
 * @param rowCount number of rows to expose, expected to be positive
 */
private static void growStr(ReadOnlyColumn mem1, ReadOnlyColumn mem2, long rowCount) {
    assert mem2 != null;
    mem2.grow(rowCount * 8);
    final long lastValueOffset = mem2.getLong((rowCount - 1) * 8);
    // map the length prefix first so that it can be read
    mem1.grow(lastValueOffset + 4);
    final long lastValueLen = mem1.getInt(lastValueOffset);
    if (lastValueLen > 0) {
        mem1.grow(lastValueOffset + lastValueLen * 2 + 4);
    }
}
/**
 * Grows a BINARY column. {@code mem2} is grown to hold one 8-byte offset per row; the offset of
 * the last row is then used to grow {@code mem1} far enough to read that value's 8-byte length
 * and, when the length is positive, a further {@code len} bytes after it.
 *
 * @param mem1     data memory
 * @param mem2     offset memory, must not be null
 * @param rowCount number of rows to expose, expected to be positive
 */
private static void growBin(ReadOnlyColumn mem1, ReadOnlyColumn mem2, long rowCount) {
    assert mem2 != null;
    mem2.grow(rowCount * 8);
    final long lastValueOffset = mem2.getLong((rowCount - 1) * 8);
    // map the length prefix first so that it can be read
    mem1.grow(lastValueOffset + 8);
    final long lastValueLen = mem1.getLong(lastValueOffset);
    if (lastValueLen > 0) {
        mem1.grow(lastValueOffset + lastValueLen + 8);
    }
}
/**
 * Releases symbol map readers, bitmap index readers, the path buffer, metadata, the transaction
 * file, column memory and the native scratch buffer, then logs the closure. Does nothing when
 * the reader is already closed.
 */
@Override
public void close() {
    if (!isOpen()) {
        return;
    }
    freeSymbolMapReaders();
    freeBitmapIndexCache();
    Misc.free(path);
    Misc.free(metadata);
    Misc.free(txMem);
    freeColumns();
    // zeroes tempMem8b, which is what isOpen() checks
    freeTempMem();
    LOG.info().$("closed '").utf8(tableName).$('\'').$();
}
/**
 * Closes the files of one column in every partition so that the column can be removed.
 * Works like {@link #closeColumnForRemove(CharSequence)} but takes a column index, which is
 * usually resolved from the column name via the reader metadata.
 * Bounds checking is performed via assertion.
 *
 * @param columnIndex column index
 */
public void closeColumnForRemove(int columnIndex) {
    assert columnIndex > -1 && columnIndex < columnCount;
    // Force-closed columns are replaced with a special marker object, so that a later
    // reload can tell the column has to be re-read from disk.
    for (int p = 0, n = partitionCount; p < n; p++) {
        closeColumn(getColumnBase(p), columnIndex);
    }
    final boolean isSymbol = metadata.getColumnType(columnIndex) == ColumnType.SYMBOL;
    if (isSymbol) {
        // same for the symbol map reader: swap it for the marker instance
        Misc.free(symbolMapReaders.getAndSetQuick(columnIndex, EmptySymbolMapReader.INSTANCE));
    }
}
/**
 * Closes column files. This method should be used before a call to TableWriter.removeColumn() on
 * Windows OS.
 *
 * @param columnName name of the column to be closed
 */
public void closeColumnForRemove(CharSequence columnName) {
    final int columnIndex = metadata.getColumnIndex(columnName);
    closeColumnForRemove(columnIndex);
}
/**
 * Applies the floor method selected for this table's partitioning scheme to a timestamp.
 *
 * @param timestamp timestamp to floor
 * @return floored timestamp
 */
public long floorToPartitionTimestamp(long timestamp) {
    return this.timestampFloorMethod.floor(timestamp);
}
/**
 * Returns the bitmap index reader of a column for the requested direction, creating it on
 * first use. Backward readers are cached in the column's primary slot, forward readers in
 * the slot after it.
 *
 * @param columnBase  base offset of the partition
 * @param columnIndex column index
 * @param direction   {@code BitmapIndexReader.DIR_BACKWARD} selects the backward reader, any other value the forward one
 * @return cached or newly created reader
 */
public BitmapIndexReader getBitmapIndexReader(int columnBase, int columnIndex, int direction) {
    final int primary = getPrimaryColumnIndex(columnBase, columnIndex);
    final int slot = direction == BitmapIndexReader.DIR_BACKWARD ? primary : primary + 1;
    BitmapIndexReader cached = bitmapIndexes.getQuick(slot);
    if (cached != null) {
        return cached;
    }
    return createBitmapIndexReaderAt(primary, columnBase, columnIndex, direction);
}
/**
 * Returns this reader's single record cursor, rewound to the top.
 *
 * @return the shared cursor instance
 */
public TableReaderRecordCursor getCursor() {
    final TableReaderRecordCursor cursor = recordCursor;
    cursor.toTop();
    return cursor;
}
/**
 * @return data version read from the transaction file by the last successful transaction read
 */
public long getDataVersion() {
    return this.dataVersion;
}
/**
 * @return maximum timestamp read from the transaction file by the last successful transaction read
 */
public long getMaxTimestamp() {
    return this.maxTimestamp;
}
/**
 * @return metadata of the table this reader is attached to
 */
public RecordMetadata getMetadata() {
    return this.metadata;
}
/**
 * @return minimum timestamp as stored by the last transaction read, i.e. floored with the
 * partition floor method, or {@code Long.MAX_VALUE} when the transaction file held that value
 */
public long getMinTimestamp() {
    return this.minTimestamp;
}
/**
 * @return number of partitions currently tracked by this reader
 */
public int getPartitionCount() {
    return this.partitionCount;
}
/**
 * Computes the number of partition intervals between two partition timestamps using the
 * interval method selected for this table's partitioning scheme. That method is {@code null}
 * for tables that are not partitioned, so this call is only valid on partitioned tables.
 *
 * @param partitionTimestamp1 first partition timestamp
 * @param partitionTimestamp2 second partition timestamp
 * @return interval count, narrowed to {@code int}
 */
public int getPartitionCountBetweenTimestamps(long partitionTimestamp1, long partitionTimestamp2) {
    final long intervals = intervalLengthMethod.calculate(partitionTimestamp1, partitionTimestamp2);
    return (int) intervals;
}
/**
 * @return the table's partitioning scheme as reported by its metadata
 */
public int getPartitionedBy() {
    return this.metadata.getPartitionBy();
}
/**
 * Looks up the symbol map reader stored for a column. No bounds check is performed.
 *
 * @param columnIndex column index
 * @return entry stored at that index
 */
public SymbolMapReader getSymbolMapReader(int columnIndex) {
    return this.symbolMapReaders.getQuick(columnIndex);
}
/**
 * @return name of the table this reader was opened for
 */
public CharSequence getTableName() {
    return this.tableName;
}
/**
 * @return structure version read from the transaction file by the last successful transaction read
 */
public long getVersion() {
    return structVersion;
}
/**
 * Reports whether the reader still owns its native scratch buffer; that buffer is released,
 * and its address zeroed, when the reader is closed.
 *
 * @return {@code true} while the reader has not been closed
 */
public boolean isOpen() {
    final boolean closed = tempMem8b == 0;
    return !closed;
}
/**
 * Runs the currently installed reload strategy against this reader.
 *
 * @return whatever the strategy reports
 */
public boolean reload() {
    return this.reloadMethod.reload(this);
}
/**
 * Rearranges the symbol map reader list to match a new column layout described by a transition index.
 * <p>
 * Layout read from the index, as used by this method: the new column count is the int at
 * {@code pTransitionIndex + 4}; entries start at {@code pTransitionIndex + 8} and take 8 bytes per
 * column: an int "copy from" followed by an int "copy to", both 1-based (values of 0 or less mean "none").
 * The area right after the entries is used as one flag byte per column to track processed entries.
 *
 * @param pTransitionIndex address of the transition index
 */
public void reshuffleSymbolMapReaders(long pTransitionIndex) {
final int columnCount = Unsafe.getUnsafe().getInt(pTransitionIndex + 4);
final long index = pTransitionIndex + 8;
final long stateAddress = index + columnCount * 8;
// make room for additional columns before entries are moved around
if (columnCount > this.columnCount) {
symbolMapReaders.setPos(columnCount);
}
// reset the "processed" flags
Unsafe.getUnsafe().setMemory(stateAddress, columnCount, (byte) 0);
// this is a silly exercise in walking the index
for (int i = 0; i < columnCount; i++) {
// prevent writing same entry twice
if (Unsafe.getUnsafe().getByte(stateAddress + i) == -1) {
continue;
}
Unsafe.getUnsafe().putByte(stateAddress + i, (byte) -1);
int copyFrom = Unsafe.getUnsafe().getInt(index + i * 8);
// don't copy entries to themselves, unless symbol map was deleted
// NOTE(review): copyFrom is 1-based, yet the reader inspected and replaced below is the one at
// index copyFrom rather than copyFrom - 1 - confirm this is intended.
if (copyFrom == i + 1 && copyFrom < columnCount) {
SymbolMapReader reader = symbolMapReaders.getQuick(copyFrom);
if (reader != null && reader.isDeleted()) {
symbolMapReaders.setQuick(copyFrom, reloadSymbolMapReader(copyFrom, reader));
}
continue;
}
// check where we source entry:
// 1. from another entry
// 2. create new instance
SymbolMapReader tmp;
if (copyFrom > 0) {
// take the source reader out of its slot and put it into slot i; tmp receives what was in slot i
tmp = copyOrRenewSymbolMapReader(symbolMapReaders.getAndSetQuick(copyFrom - 1, null), i);
int copyTo = Unsafe.getUnsafe().getInt(index + i * 8 + 4);
// now we copied entry, what do we do with value that was already there?
// do we copy it somewhere else?
while (copyTo > 0) {
// Yeah, we do. This can get recursive!
// prevent writing same entry twice
if (Unsafe.getUnsafe().getByte(stateAddress + copyTo - 1) == -1) {
break;
}
Unsafe.getUnsafe().putByte(stateAddress + copyTo - 1, (byte) -1);
tmp = copyOrRenewSymbolMapReader(tmp, copyTo - 1);
copyTo = Unsafe.getUnsafe().getInt(index + (copyTo - 1) * 8 + 4);
}
// whatever is left at the end of the chain has no destination
Misc.free(tmp);
} else {
// new instance
Misc.free(symbolMapReaders.getAndSetQuick(i, reloadSymbolMapReader(i, null)));
}
}
// ended up with fewer columns than before?
// free resources for the "extra" symbol map readers and contract the list
if (columnCount < this.columnCount) {
for (int i = columnCount; i < this.columnCount; i++) {
Misc.free(symbolMapReaders.getQuick(i));
}
symbolMapReaders.setPos(columnCount);
}
}
/**
 * @return total row count (fixed plus transient) as of the last successful transaction read
 */
public long size() {
    return this.rowCount;
}
/**
 * Re-opens every known partition after a truncate was detected. Partitions that can no longer
 * be opened have their columns and index readers freed and are marked as not open. Symbol map
 * counts and the partition count are then recalculated.
 */
private void applyTruncate() {
    LOG.info().$("truncate detected").$();
    final int knownPartitions = partitionCount;
    for (int partitionIndex = 0; partitionIndex < knownPartitions; partitionIndex++) {
        if (openPartition0(partitionIndex) != -1) {
            continue;
        }
        // partition could not be opened: release everything held for it
        final int base = getColumnBase(partitionIndex);
        for (int col = 0; col < columnCount; col++) {
            final int primary = getPrimaryColumnIndex(base, col);
            final int secondary = primary + 1;
            Misc.free(columns.getAndSetQuick(primary, null));
            Misc.free(columns.getAndSetQuick(secondary, null));
            Misc.free(bitmapIndexes.getAndSetQuick(primary, null));
            Misc.free(bitmapIndexes.getAndSetQuick(secondary, null));
        }
        partitionRowCounts.setQuick(partitionIndex, -1);
    }
    reloadSymbolMapCounts();
    partitionCount = calculatePartitionCount();
    if (partitionCount > 0) {
        updateCapacities();
    }
}
/**
 * Derives the partition count from the current minimum and maximum timestamps.
 *
 * @return 0 when there is no minimum timestamp ({@code Long.MAX_VALUE}), 1 when the maximum
 * timestamp is {@code Long.MIN_VALUE}, otherwise the number of partition intervals between the
 * minimum timestamp and the floored maximum timestamp, plus one
 */
private int calculatePartitionCount() {
    if (minTimestamp == Long.MAX_VALUE) {
        return 0;
    }
    if (maxTimestamp == Long.MIN_VALUE) {
        return 1;
    }
    final long lastPartitionTimestamp = floorToPartitionTimestamp(maxTimestamp);
    return getPartitionCountBetweenTimestamps(minTimestamp, lastPartitionTimestamp) + 1;
}
/**
 * Sets the partition count for a non-partitioned table: 0 when the maximum timestamp is
 * {@code Numbers.LONG_NaN} or the default partition directory does not exist, 1 otherwise.
 * The shared path buffer is trimmed back to the table root after the existence check.
 */
private void checkDefaultPartitionExistsAndUpdatePartitionCount() {
    if (maxTimestamp == Numbers.LONG_NaN) {
        partitionCount = 0;
        return;
    }
    final Path defaultPartitionPath = pathGenDefault();
    if (ff.exists(defaultPartitionPath)) {
        partitionCount = 1;
    } else {
        partitionCount = 0;
    }
    defaultPartitionPath.trimTo(rootLen);
}
/**
 * Frees both memory slots of a column and replaces them with the {@code ForceNullColumn}
 * marker; the column's bitmap index readers are freed and cleared.
 *
 * @param columnBase  base offset of the partition
 * @param columnIndex column index
 */
private void closeColumn(int columnBase, int columnIndex) {
    final int primary = getPrimaryColumnIndex(columnBase, columnIndex);
    final int secondary = primary + 1;
    Misc.free(columns.getAndSetQuick(primary, ForceNullColumn.INSTANCE));
    Misc.free(columns.getAndSetQuick(secondary, ForceNullColumn.INSTANCE));
    Misc.free(bitmapIndexes.getAndSetQuick(primary, null));
    Misc.free(bitmapIndexes.getAndSetQuick(secondary, null));
}
/**
 * Closes the columns of every open partition whose timestamp is listed in {@code removedPartitions},
 * and shifts the column list when leading partitions have disappeared (minimum timestamp moved forward).
 * Partition indexes are computed relative to {@code prevMinTimestamp}.
 */
private void closeRemovedPartitions() {
for (int i = 0, n = removedPartitions.size(); i < n; i++) {
final long timestamp = removedPartitions.get(i);
// index of the removed partition relative to the previous minimum timestamp
int partitionIndex = getPartitionCountBetweenTimestamps(prevMinTimestamp, timestamp);
if (partitionIndex > -1) {
if (partitionIndex < partitionCount) {
if (getPartitionRowCount(partitionIndex) != -1) {
// this is an open partition
int base = getColumnBase(partitionIndex);
for (int k = 0; k < columnCount; k++) {
closeColumn(base, k);
}
// mark partition as not open
partitionRowCounts.setQuick(partitionIndex, -1);
}
// partition has not yet been opened
} else {
LOG.error()
.$("partition index is out of range [partitionIndex=").$(partitionIndex)
.$(", partitionCount=").$(partitionCount)
.$(", timestamp=").$ts(timestamp)
.$(']').$();
}
}
// adjust columns list when leading partitions have been removed
// NOTE(review): this adjustment sits inside the loop over removed partitions, so it only runs
// when removedPartitions is non-empty; after the first pass prevMinTimestamp equals minTimestamp.
// Only `columns` is shifted here - confirm the other per-partition lists are handled elsewhere.
if (prevMinTimestamp != minTimestamp) {
assert prevMinTimestamp < minTimestamp;
int delta = getPartitionCountBetweenTimestamps(prevMinTimestamp, minTimestamp);
columns.remove(0, getColumnBase(delta) - 1);
prevMinTimestamp = minTimestamp;
partitionCount -= delta;
}
}
}
/**
 * Moves the column state currently held in {@code tempCopyStruct} into the given destination lists.
 * Every field is swapped: the destination receives the value from {@code tempCopyStruct} and
 * {@code tempCopyStruct} receives whatever the destination slot held before.
 * If the column memory being placed reports itself as deleted, the column is reloaded from disk afterwards.
 *
 * @param columns           destination column list
 * @param columnTops        destination column top list
 * @param indexReaders      destination bitmap index reader list
 * @param columnBase        base offset of the partition in the destination lists
 * @param columnIndex       destination column index
 * @param partitionRowCount row count of the partition, used when the column has to be reloaded
 */
private void copyColumnsTo(ObjList columns, LongList columnTops, ObjList indexReaders, int columnBase, int columnIndex, long partitionRowCount) {
ReadOnlyColumn mem1 = tempCopyStruct.mem1;
// decide on reload before the swap below overwrites tempCopyStruct.mem1
final boolean reload = (mem1 instanceof ReadOnlyMemory || mem1 instanceof ForceNullColumn) && mem1.isDeleted();
final int index = getPrimaryColumnIndex(columnBase, columnIndex);
tempCopyStruct.mem1 = columns.getAndSetQuick(index, mem1);
tempCopyStruct.mem2 = columns.getAndSetQuick(index + 1, tempCopyStruct.mem2);
tempCopyStruct.top = columnTops.getAndSetQuick(columnBase / 2 + columnIndex, tempCopyStruct.top);
tempCopyStruct.backwardReader = indexReaders.getAndSetQuick(index, tempCopyStruct.backwardReader);
tempCopyStruct.forwardReader = indexReaders.getAndSetQuick(index + 1, tempCopyStruct.forwardReader);
if (reload) {
// uses the shared path buffer (this.path) as the partition path
reloadColumnAt(path, columns, columnTops, indexReaders, columnBase, columnIndex, partitionRowCount);
}
}
/**
 * Stores a symbol map reader at the given column index, reloading it first when it is non-null
 * and reports itself as deleted.
 *
 * @param reader      reader to store, may be null
 * @param columnIndex destination column index
 * @return the entry previously stored at {@code columnIndex}
 */
private SymbolMapReader copyOrRenewSymbolMapReader(SymbolMapReader reader, int columnIndex) {
    final boolean needsReload = reader != null && reader.isDeleted();
    if (needsReload) {
        return symbolMapReaders.getAndSetQuick(columnIndex, reloadSymbolMapReader(columnIndex, reader));
    }
    return symbolMapReaders.getAndSetQuick(columnIndex, reader);
}
/**
 * Creates and caches a bitmap index reader for a column. Backward readers are cached at
 * {@code globalIndex}, forward readers at {@code globalIndex + 1}. When the column is a
 * {@code NullColumn}, a null-reader is created instead of one backed by index files.
 *
 * @param globalIndex absolute primary slot of the column
 * @param columnBase  base offset of the partition
 * @param columnIndex column index
 * @param direction   {@code BitmapIndexReader.DIR_BACKWARD} for a backward reader, anything else for forward
 * @return the newly created reader
 * @throws CairoException when the column is not indexed
 */
private BitmapIndexReader createBitmapIndexReaderAt(int globalIndex, int columnBase, int columnIndex, int direction) {
    if (!metadata.isColumnIndexed(columnIndex)) {
        throw CairoException.instance(0).put("Not indexed: ").put(metadata.getColumnName(columnIndex));
    }
    final boolean backward = direction == BitmapIndexReader.DIR_BACKWARD;
    final int slot = backward ? globalIndex : globalIndex + 1;
    BitmapIndexReader reader;
    if (columns.getQuick(globalIndex) instanceof NullColumn) {
        if (backward) {
            reader = new BitmapIndexBwdNullReader();
        } else {
            reader = new BitmapIndexFwdNullReader();
        }
        bitmapIndexes.setQuick(slot, reader);
        return reader;
    }
    Path path = partitionPathGenerator.generate(this, getPartitionIndex(columnBase));
    try {
        if (backward) {
            reader = new BitmapIndexBwdReader(configuration, path.chopZ(), metadata.getColumnName(columnIndex), getColumnTop(columnBase, columnIndex));
        } else {
            reader = new BitmapIndexFwdReader(configuration, path.chopZ(), metadata.getColumnName(columnIndex), getColumnTop(columnBase, columnIndex));
        }
        bitmapIndexes.setQuick(slot, reader);
    } finally {
        // restore the path buffer to the table root
        path.trimTo(rootLen);
    }
    return reader;
}
/**
 * Builds fresh column, column top and index reader lists sized for a new (larger) column shift and
 * moves or reloads every column of every partition into them, guided by the transition index.
 * Columns of the old list that were not carried over are freed. The new lists and shift then
 * replace the current ones.
 *
 * @param columnCount      new column count
 * @param pTransitionIndex address of the transition index; entries start at offset 8, 8 bytes per column
 * @param columnBits       new partition shift (see getColumnBits)
 */
private void createNewColumnList(int columnCount, long pTransitionIndex, int columnBits) {
int capacity = partitionCount << columnBits;
final ObjList columns = new ObjList<>(capacity);
final LongList columnTops = new LongList(capacity / 2);
final ObjList indexReaders = new ObjList<>(capacity);
columns.setPos(capacity);
columnTops.setPos(capacity / 2);
indexReaders.setPos(capacity);
final long pIndexBase = pTransitionIndex + 8;
for (int partitionIndex = 0; partitionIndex < partitionCount; partitionIndex++) {
// base in the new lists vs. base in the lists being replaced
final int base = partitionIndex << columnBits;
final int oldBase = partitionIndex << columnCountBits;
try {
Path path = partitionPathGenerator.generate(this, partitionIndex);
final long partitionRowCount = partitionRowCounts.getQuick(partitionIndex);
for (int i = 0; i < columnCount; i++) {
// stored value is 1-based; after subtracting 1, -1 or less means "no source column"
final int copyFrom = Unsafe.getUnsafe().getInt(pIndexBase + i * 8) - 1;
if (copyFrom > -1) {
// lift the column out of the old lists, then drop it into the new ones
fetchColumnsFrom(this.columns, this.columnTops, this.bitmapIndexes, oldBase, copyFrom);
copyColumnsTo(columns, columnTops, indexReaders, base, i, partitionRowCount);
} else {
// new instance
reloadColumnAt(path, columns, columnTops, indexReaders, base, i, partitionRowCount);
}
}
// free remaining columns
for (int i = 0; i < this.columnCount; i++) {
final int index = getPrimaryColumnIndex(oldBase, i);
Misc.free(this.columns.getQuick(index));
Misc.free(this.columns.getQuick(index + 1));
}
} finally {
// the local `path` is out of scope here; this trims the shared path buffer field
path.trimTo(rootLen);
}
}
this.columns = columns;
this.columnTops = columnTops;
this.columnCountBits = columnBits;
this.bitmapIndexes = indexReaders;
}
/**
 * Fails when the table directory contains a to-do file. The shared path buffer is always
 * trimmed back to the table root.
 *
 * @throws CairoException when the to-do file exists
 */
private void failOnPendingTodo() {
    try {
        final boolean todoPending = ff.exists(path.concat(TableUtils.TODO_FILE_NAME).$());
        if (todoPending) {
            throw CairoException.instance(0).put("Table ").put(path.$()).put(" is pending recovery.");
        }
    } finally {
        path.trimTo(rootLen);
    }
}
/**
 * Lifts one column's state out of the given lists into {@code tempCopyStruct}. The memory and
 * index reader slots are cleared in the source lists; the column top is only read.
 *
 * @param columns      source column list
 * @param columnTops   source column top list
 * @param indexReaders source bitmap index reader list
 * @param columnBase   base offset of the partition in the source lists
 * @param columnIndex  source column index
 */
private void fetchColumnsFrom(ObjList columns, LongList columnTops, ObjList indexReaders, int columnBase, int columnIndex) {
    final int primary = getPrimaryColumnIndex(columnBase, columnIndex);
    final int secondary = primary + 1;
    tempCopyStruct.mem1 = columns.getAndSetQuick(primary, null);
    tempCopyStruct.mem2 = columns.getAndSetQuick(secondary, null);
    tempCopyStruct.top = columnTops.getQuick(columnBase / 2 + columnIndex);
    tempCopyStruct.backwardReader = indexReaders.getAndSetQuick(primary, null);
    tempCopyStruct.forwardReader = indexReaders.getAndSetQuick(secondary, null);
}
/**
 * Frees every cached bitmap index reader.
 */
private void freeBitmapIndexCache() {
    Misc.freeObjList(this.bitmapIndexes);
}
/**
 * Frees every entry of the column list.
 */
private void freeColumns() {
    Misc.freeObjList(this.columns);
}
/**
 * Frees every symbol map reader and empties the list.
 */
private void freeSymbolMapReaders() {
    final int readerCount = symbolMapReaders.size();
    int i = 0;
    while (i < readerCount) {
        Misc.free(symbolMapReaders.getQuick(i));
        i++;
    }
    symbolMapReaders.clear();
}
/**
 * Releases the 8-byte native scratch buffer and zeroes its address; does nothing when the
 * buffer has already been released.
 */
private void freeTempMem() {
    if (tempMem8b == 0) {
        return;
    }
    Unsafe.free(tempMem8b, 8);
    tempMem8b = 0;
}
/**
 * Returns the column memory stored at an absolute slot of the column list. No bounds check is performed.
 *
 * @param absoluteIndex absolute slot index
 * @return entry stored at that slot
 */
ReadOnlyColumn getColumn(int absoluteIndex) {
    return this.columns.getQuick(absoluteIndex);
}
/**
 * Converts a partition index into the base offset of that partition in the column list.
 *
 * @param partitionIndex partition index
 * @return partition index shifted left by the current column shift
 */
int getColumnBase(int partitionIndex) {
    return partitionIndex << this.columnCountBits;
}
/**
 * @return number of columns this reader currently tracks
 */
int getColumnCount() {
    return this.columnCount;
}
/**
 * Returns the column top stored for a column of a partition.
 *
 * @param base        base offset of the partition in the column list
 * @param columnIndex column index
 * @return value stored at {@code base / 2 + columnIndex} of the column top list
 */
long getColumnTop(int base, int columnIndex) {
    final int slot = base / 2 + columnIndex;
    return this.columnTops.getQuick(slot);
}
/**
 * Inverse of {@link #getColumnBase(int)}: converts a partition base offset back to the partition index.
 *
 * @param columnBase base offset of the partition in the column list
 * @return base shifted right (unsigned) by the current column shift
 */
int getPartitionIndex(int columnBase) {
    return columnBase >>> this.columnCountBits;
}
/**
 * Returns the row count recorded for a partition; -1 means the partition is not open.
 * Asserts that at least one partition is tracked.
 *
 * @param partitionIndex partition index
 * @return recorded row count
 */
long getPartitionRowCount(int partitionIndex) {
    assert partitionRowCounts.size() > 0;
    return this.partitionRowCounts.getQuick(partitionIndex);
}
/**
 * @return transient row count read from the transaction file by the last successful transaction read
 */
long getTransientRowCount() {
    return this.transientRowCount;
}
/**
 * @return transaction number of the last successful transaction read
 */
long getTxn() {
    return this.txn;
}
/**
 * Registers {@code delta} additional partitions: their row counts are seeded with -1 (not open),
 * the partition count is increased and list capacities are updated.
 *
 * @param delta number of partitions to add
 */
private void incrementPartitionCountBy(int delta) {
    final int firstNewPartition = partitionCount;
    partitionRowCounts.seed(firstNewPartition, delta, -1);
    partitionCount = firstNewPartition + delta;
    updateCapacities();
}
/**
 * Asks the symbol map reader stored for a column whether it is cached. The entry at
 * {@code columnIndex} must be non-null.
 *
 * @param columnIndex column index
 * @return the reader's cached flag
 */
boolean isColumnCached(int columnIndex) {
    return this.symbolMapReaders.getQuick(columnIndex).isCached();
}
/**
 * Opens the table's metadata file. The shared path buffer is trimmed back to the table root
 * whether or not opening succeeds.
 *
 * @return metadata reader
 */
private TableReaderMetadata openMetaFile() {
    try {
        final Path metaPath = path.concat(TableUtils.META_FILE_NAME).$();
        return new TableReaderMetadata(ff, metaPath);
    } finally {
        path.trimTo(rootLen);
    }
}
/**
 * Returns the row count of a partition, opening the partition when no row count is recorded for it.
 *
 * @param partitionIndex partition index
 * @return recorded row count, or the result of opening the partition
 */
long openPartition(int partitionIndex) {
    final long knownSize = getPartitionRowCount(partitionIndex);
    return knownSize != -1 ? knownSize : openPartition0(partitionIndex);
}
/**
 * Opens the columns of a partition.
 *
 * @param partitionIndex partition index
 * @return the partition's row count, or -1 when the partition is listed as removed, the maximum
 * timestamp is {@code Long.MIN_VALUE}, or the partition directory does not exist. A row count of 0
 * or less is returned as is, without opening columns or recording the size.
 */
private long openPartition0(int partitionIndex) {
// is this table is partitioned?
// (the calculator is null for non-partitioned tables); skip partitions listed as removed
if (partitionTimestampCalculatorMethod != null
&& removedPartitions.contains(partitionTimestampCalculatorMethod.calculate(
minTimestamp, partitionIndex
))) {
return -1;
}
// todo: this may not be the best place to check if partition is out of range
if (maxTimestamp == Long.MIN_VALUE) {
return -1;
}
try {
Path path = partitionPathGenerator.generate(this, partitionIndex);
if (ff.exists(path)) {
path.chopZ();
// the last partition takes its size from the transaction file, others from the partition itself
final long partitionSize = partitionIndex == partitionCount - 1 ? transientRowCount : TableUtils.readPartitionSize(ff, path, tempMem8b);
LOG.info()
.$("open partition ").utf8(path.$())
.$(" [rowCount=").$(partitionSize)
.$(", transientRowCount=").$(transientRowCount)
.$(", partitionIndex=").$(partitionIndex)
.$(", partitionCount=").$(partitionCount)
.$(']').$();
if (partitionSize > 0) {
openPartitionColumns(path, getColumnBase(partitionIndex), partitionSize);
partitionRowCounts.setQuick(partitionIndex, partitionSize);
// a partition is now open: switch from the first-time reload strategy to the regular one
if (maxTimestamp != Numbers.LONG_NaN) {
if (reloadMethod == FIRST_TIME_PARTITIONED_RELOAD_METHOD) {
reloadMethod = PARTITIONED_RELOAD_METHOD;
} else if (reloadMethod == FIRST_TIME_NON_PARTITIONED_RELOAD_METHOD) {
reloadMethod = NON_PARTITIONED_RELOAD_METHOD;
}
}
}
return partitionSize;
}
return -1;
} finally {
// the local `path` is out of scope here; this trims the shared path buffer field
path.trimTo(rootLen);
}
}
/**
 * Loads every column of a partition into this reader's own column, column top and index reader lists.
 *
 * @param path              partition path
 * @param columnBase        base offset of the partition
 * @param partitionRowCount row count of the partition
 */
private void openPartitionColumns(Path path, int columnBase, long partitionRowCount) {
    int col = 0;
    while (col < columnCount) {
        reloadColumnAt(path, this.columns, this.columnTops, this.bitmapIndexes, columnBase, col, partitionRowCount);
        col++;
    }
}
/**
 * Creates a symbol map reader for every SYMBOL column. Symbol counts are taken from the
 * snapshot in the order SYMBOL columns appear in the metadata; other columns keep an empty slot.
 */
private void openSymbolMaps() {
    final int n = metadata.getColumnCount();
    symbolMapReaders.setPos(n);
    int snapshotIndex = 0;
    for (int col = 0; col < n; col++) {
        if (metadata.getColumnType(col) != ColumnType.SYMBOL) {
            continue;
        }
        SymbolMapReaderImpl symbolMapReader = new SymbolMapReaderImpl(configuration, path, metadata.getColumnName(col), symbolCountSnapshot.getQuick(snapshotIndex++));
        symbolMapReaders.extendAndSet(col, symbolMapReader);
    }
}
/**
 * Maps the table's transaction file, initially sized up to the start of the symbol writer
 * index. The shared path buffer is trimmed back to the table root whether or not opening succeeds.
 *
 * @return read-only memory over the transaction file
 */
private ReadOnlyMemory openTxnFile() {
    try {
        final Path txnPath = path.concat(TableUtils.TXN_FILE_NAME).$();
        return new ReadOnlyMemory(ff, txnPath, ff.getPageSize(), TableUtils.getSymbolWriterIndexOffset(0));
    } finally {
        path.trimTo(rootLen);
    }
}
/**
 * Appends a separator and the day-formatted name of a partition to the shared path buffer.
 * The partition timestamp is the minimum timestamp advanced by {@code partitionIndex} days.
 * Callers are expected to trim the path afterwards.
 *
 * @param partitionIndex partition index
 * @return the shared path buffer, terminated
 */
private Path pathGenDay(int partitionIndex) {
    final long partitionTimestamp = Timestamps.addDays(minTimestamp, partitionIndex);
    TableUtils.fmtDay.format(
            partitionTimestamp,
            TimestampLocaleFactory.INSTANCE.getDefaultTimestampLocale(),
            null,
            path.put(Files.SEPARATOR)
    );
    return path.$();
}
/**
 * Appends the default partition name to the shared path buffer.
 * Callers are expected to trim the path afterwards.
 *
 * @return the shared path buffer, terminated
 */
private Path pathGenDefault() {
    final Path defaultPartition = path.concat(TableUtils.DEFAULT_PARTITION_NAME);
    return defaultPartition.$();
}
/**
 * Appends a separator and the month-formatted name of a partition to the shared path buffer.
 * The partition timestamp is the minimum timestamp advanced by {@code partitionIndex} months.
 * Callers are expected to trim the path afterwards.
 *
 * @param partitionIndex partition index
 * @return the shared path buffer, terminated
 */
private Path pathGenMonth(int partitionIndex) {
    final long partitionTimestamp = Timestamps.addMonths(minTimestamp, partitionIndex);
    TableUtils.fmtMonth.format(
            partitionTimestamp,
            TimestampLocaleFactory.INSTANCE.getDefaultTimestampLocale(),
            null,
            path.put(Files.SEPARATOR)
    );
    return path.$();
}
/**
 * Appends a separator and the year-formatted name of a partition to the shared path buffer.
 * The partition timestamp is the minimum timestamp advanced by {@code partitionIndex} years.
 * Callers are expected to trim the path afterwards.
 *
 * @param partitionIndex partition index
 * @return the shared path buffer, terminated
 */
private Path pathGenYear(int partitionIndex) {
    final long partitionTimestamp = Timestamps.addYear(minTimestamp, partitionIndex);
    TableUtils.fmtYear.format(
            partitionTimestamp,
            TimestampLocaleFactory.INSTANCE.getDefaultTimestampLocale(),
            null,
            path.put(Files.SEPARATOR)
    );
    return path.$();
}
/**
 * Reads a consistent snapshot of the transaction file into this reader's fields.
 * <p>
 * The transaction number is read, compared with the "check" copy, the payload is read, and the
 * transaction number is read again; the snapshot is accepted only when all three reads agree.
 * Otherwise the read is retried until the configured spin lock timeout expires.
 *
 * @return {@code false} when the transaction number equals the one already held (nothing is
 * updated), {@code true} when a new snapshot was applied
 * @throws CairoException when no stable snapshot could be read before the timeout
 */
private boolean readTxn() {
int count = 0;
final long deadline = configuration.getMicrosecondClock().getTicks() + configuration.getSpinLockTimeoutUs();
while (true) {
long txn = txMem.getLong(TableUtils.TX_OFFSET_TXN);
// exit if this is the same as we already have
if (txn == this.txn) {
return false;
}
// make sure this isn't re-ordered
Unsafe.getUnsafe().loadFence();
// do start and end sequences match? if so we have a chance at stable read
if (txn == txMem.getLong(TableUtils.TX_OFFSET_TXN_CHECK)) {
// great, we seem to have got stable read, lets do some reading
// and check later if it was worth it
Unsafe.getUnsafe().loadFence();
// values are read into locals first; fields are only updated once the snapshot is confirmed
final long transientRowCount = txMem.getLong(TableUtils.TX_OFFSET_TRANSIENT_ROW_COUNT);
final long fixedRowCount = txMem.getLong(TableUtils.TX_OFFSET_FIXED_ROW_COUNT);
final long minTimestamp = txMem.getLong(TX_OFFSET_MIN_TIMESTAMP);
final long maxTimestamp = txMem.getLong(TableUtils.TX_OFFSET_MAX_TIMESTAMP);
final long structVersion = txMem.getLong(TableUtils.TX_OFFSET_STRUCT_VERSION);
final long dataVersion = txMem.getLong(TableUtils.TX_OFFSET_DATA_VERSION);
final long partitionTableVersion = txMem.getLong(TableUtils.TX_OFFSET_PARTITION_TABLE_VERSION);
// note: the two snapshots below are written straight into fields, even if this attempt is later discarded
this.symbolCountSnapshot.clear();
int symbolMapCount = txMem.getInt(TableUtils.TX_OFFSET_MAP_WRITER_COUNT);
if (symbolMapCount > 0) {
// extend the mapping to cover the symbol count entries before reading them
txMem.grow(TableUtils.getSymbolWriterIndexOffset(symbolMapCount));
for (int i = 0; i < symbolMapCount; i++) {
symbolCountSnapshot.add(txMem.getInt(TableUtils.getSymbolWriterIndexOffset(i)));
}
}
// extend the mapping up to the start of the partition table entries
txMem.grow(TableUtils.getPartitionTableIndexOffset(symbolMapCount, 0));
this.removedPartitions.clear();
int partitionTableSize = txMem.getInt(TableUtils.getPartitionTableSizeOffset(symbolMapCount));
if (partitionTableSize > 0) {
txMem.grow(TableUtils.getPartitionTableIndexOffset(symbolMapCount, partitionTableSize));
for (int i = 0; i < partitionTableSize; i++) {
this.removedPartitions.add(txMem.getLong(TableUtils.getPartitionTableIndexOffset(symbolMapCount, i)));
}
}
Unsafe.getUnsafe().loadFence();
// ok, we have snapshot, check if our snapshot is stable
if (txn == txMem.getLong(TableUtils.TX_OFFSET_TXN)) {
// good, very stable, congrats
this.txn = txn;
this.transientRowCount = transientRowCount;
this.rowCount = fixedRowCount + transientRowCount;
// remember the previous minimum so that removal of leading partitions can be detected
this.prevMinTimestamp = this.minTimestamp;
if (minTimestamp == Long.MAX_VALUE) {
this.minTimestamp = Long.MAX_VALUE;
} else {
// minimum timestamp is kept floored to its partition boundary
this.minTimestamp = timestampFloorMethod.floor(minTimestamp);
}
this.maxTimestamp = maxTimestamp;
this.structVersion = structVersion;
this.dataVersion = dataVersion;
this.partitionTableVersion = partitionTableVersion;
LOG.info()
.$("new transaction [txn=").$(txn)
.$(", transientRowCount=").$(transientRowCount)
.$(", fixedRowCount=").$(fixedRowCount)
.$(", maxTimestamp=").$(maxTimestamp)
.$(", attempts=").$(count)
.$(']').$();
return true;
}
// This is unlucky, sequences have changed while we were reading transaction data
// We must discard and try again
}
count++;
if (configuration.getMicrosecondClock().getTicks() > deadline) {
LOG.error().$("tx read timeout [timeout=").$(configuration.getSpinLockTimeoutUs()).utf8("μs]").$();
throw CairoException.instance(0).put("Transaction read timeout");
}
// brief pause before the next attempt
LockSupport.parkNanos(1);
}
}
/**
 * (Re)opens one column of a partition into the given lists.
 * <p>
 * When the column's data file exists, the primary memory is opened (reusing an existing
 * {@code ReadOnlyMemory} instance when there is one), the column top is read, BINARY/STRING columns
 * additionally get their secondary memory opened, and the memory is grown to cover
 * {@code partitionRowCount - columnTop} rows. Already-created bitmap index readers of indexed columns
 * are re-pointed; readers of non-indexed columns are freed.
 * When the data file does not exist, both slots are replaced with {@code NullColumn} and index readers are freed.
 *
 * @param path              partition path; restored to its incoming length on exit
 * @param columns           column list to update
 * @param columnTops        column top list to update
 * @param indexReaders      bitmap index reader list to update
 * @param columnBase        base offset of the partition in the lists
 * @param columnIndex       column index
 * @param partitionRowCount row count of the partition
 */
private void reloadColumnAt(Path path, ObjList columns, LongList columnTops, ObjList indexReaders, int columnBase, int columnIndex, long partitionRowCount) {
int plen = path.length();
try {
final CharSequence name = metadata.getColumnName(columnIndex);
final int primaryIndex = getPrimaryColumnIndex(columnBase, columnIndex);
final int secondaryIndex = primaryIndex + 1;
ReadOnlyColumn mem1 = columns.getQuick(primaryIndex);
ReadOnlyColumn mem2 = columns.getQuick(secondaryIndex);
if (ff.exists(TableUtils.dFile(path.trimTo(plen), name))) {
// reuse the existing memory object when possible, otherwise create and store a new one
if (mem1 instanceof ReadOnlyMemory) {
((ReadOnlyMemory) mem1).of(ff, path, ff.getMapPageSize(), 0);
} else {
mem1 = new ReadOnlyMemory(ff, path, ff.getMapPageSize(), 0);
columns.setQuick(primaryIndex, mem1);
}
final long columnTop = TableUtils.readColumnTop(ff, path.trimTo(plen), name, plen, tempMem8b);
final int type = metadata.getColumnType(columnIndex);
switch (type) {
case ColumnType.BINARY:
case ColumnType.STRING:
// these types use a secondary file in addition to the data file
TableUtils.iFile(path.trimTo(plen), name);
if (mem2 instanceof ReadOnlyMemory) {
((ReadOnlyMemory) mem2).of(ff, path, ff.getMapPageSize(), 0);
} else {
mem2 = new ReadOnlyMemory(ff, path, ff.getMapPageSize(), 0);
columns.setQuick(secondaryIndex, mem2);
}
growColumn(mem1, mem2, type, partitionRowCount - columnTop);
break;
default:
// no secondary file for other types; drop whatever occupied the secondary slot
Misc.free(columns.getAndSetQuick(secondaryIndex, null));
growColumn(mem1, null, type, partitionRowCount - columnTop);
break;
}
columnTops.setQuick(columnBase / 2 + columnIndex, columnTop);
if (metadata.isColumnIndexed(columnIndex)) {
// only readers that already exist are refreshed; missing ones are created lazily elsewhere
BitmapIndexReader indexReader = indexReaders.getQuick(primaryIndex);
if (indexReader instanceof BitmapIndexBwdReader) {
((BitmapIndexBwdReader) indexReader).of(configuration, path.trimTo(plen), name, columnTop);
}
indexReader = indexReaders.getQuick(secondaryIndex);
if (indexReader instanceof BitmapIndexFwdReader) {
((BitmapIndexFwdReader) indexReader).of(configuration, path.trimTo(plen), name, columnTop);
}
} else {
Misc.free(indexReaders.getAndSetQuick(primaryIndex, null));
Misc.free(indexReaders.getAndSetQuick(secondaryIndex, null));
}
} else {
// data file is missing: represent the column with NullColumn in both slots
Misc.free(columns.getAndSetQuick(primaryIndex, NullColumn.INSTANCE));
Misc.free(columns.getAndSetQuick(secondaryIndex, NullColumn.INSTANCE));
// the appropriate index for NullColumn will be created lazily when requested
// these indexes have state and may not be always required
Misc.free(indexReaders.getAndSetQuick(primaryIndex, null));
Misc.free(indexReaders.getAndSetQuick(secondaryIndex, null));
}
} finally {
path.trimTo(plen);
}
}
/**
 * Applies a table structure change to this reader. A transition index is created from the
 * metadata and applied to it; columns are then either moved into freshly sized lists (when the
 * new column count needs a larger partition shift) or reshuffled in place. Symbol map readers
 * are rearranged next and the column count is updated. The transition index is always freed.
 */
private void reloadColumnChanges() {
    // the transition index lets already open resources be reused
    final long pTransitionIndex = metadata.createTransitionIndex();
    try {
        metadata.applyTransitionIndex(pTransitionIndex);
        final int newColumnCount = Unsafe.getUnsafe().getInt(pTransitionIndex + 4);
        final int newColumnBits = getColumnBits(newColumnCount);
        if (newColumnBits <= this.columnCountBits) {
            reshuffleColumns(newColumnCount, pTransitionIndex);
        } else {
            // Columns cannot easily be reshuffled in place when the shift grows: gaps would
            // have to be created in the column list between partitions.
            createNewColumnList(newColumnCount, pTransitionIndex, newColumnBits);
        }
        // rearrange symbol map reader list
        reshuffleSymbolMapReaders(pTransitionIndex);
        this.columnCount = newColumnCount;
    } finally {
        TableReaderMetadata.freeTransitionIndex(pTransitionIndex);
    }
}
/**
 * Reload routine for a non-partitioned table whose default partition has not been opened yet.
 * <p>
 * Once a new transaction is read and at least one partition is found, capacities are set up
 * and the reader switches to {@code NON_PARTITIONED_RELOAD_METHOD} for subsequent reloads.
 *
 * @return {@code true} when the switch described above happened; otherwise {@code true} only
 * if the data version observed after the attempt differs from the one seen on entry
 */
private boolean reloadInitialNonPartitioned() {
    final long versionOnEntry = this.dataVersion;

    if (!readTxn()) {
        return versionOnEntry != this.dataVersion;
    }

    reloadStruct();
    reloadSymbolMapCounts();
    checkDefaultPartitionExistsAndUpdatePartitionCount();

    if (partitionCount <= 0) {
        // still nothing to open; report only whether the data version moved
        return versionOnEntry != this.dataVersion;
    }

    updateCapacities();
    reloadMethod = NON_PARTITIONED_RELOAD_METHOD;
    return true;
}
/**
 * Reload routine for a partitioned table before any partition has been opened.
 *
 * @return {@code false} when no new transaction could be read, otherwise the result of
 * {@code reloadInitialPartitioned0()}
 */
private boolean reloadInitialPartitioned() {
    if (!readTxn()) {
        return false;
    }
    reloadStruct();
    return reloadInitialPartitioned0();
}
/**
 * Refreshes symbol counts and recalculates the partition count. When partitions exist,
 * capacities are set up and, provided the max timestamp is known (not
 * {@code Long.MIN_VALUE}), the reader switches to {@code PARTITIONED_RELOAD_METHOD}.
 *
 * @return always {@code true}
 */
private boolean reloadInitialPartitioned0() {
    reloadSymbolMapCounts();
    partitionCount = calculatePartitionCount();

    if (partitionCount <= 0) {
        // no partitions yet, keep using the initial reload method
        return true;
    }

    updateCapacities();
    if (maxTimestamp != Long.MIN_VALUE) {
        reloadMethod = PARTITIONED_RELOAD_METHOD;
    }
    return true;
}
/**
 * Reload routine for a non-partitioned table whose single partition is already open.
 *
 * @return {@code true} when a new transaction was read and partition 0 was refreshed,
 * {@code false} when there was nothing new to read
 */
private boolean reloadNonPartitioned() {
    // readTxn() refreshes the "rowCount" member that is passed on below
    if (!readTxn()) {
        return false;
    }
    reloadStruct();
    reloadPartition(0, rowCount);
    return true;
}
/**
 * Updates boundaries of all columns in a partition and records the partition's row count.
 * <p>
 * Every column pair of the partition is grown to {@code rowCount} minus the column top, and
 * each non-null symbol map reader receives the next value from the symbol count snapshot.
 *
 * @param partitionIndex index of partition
 * @param rowCount       number of rows in partition
 */
private void reloadPartition(int partitionIndex, long rowCount) {
    final int columnBase = getColumnBase(partitionIndex);
    // position in the symbol count snapshot; advances only for columns that have a reader
    int snapshotCursor = 0;

    for (int columnIndex = 0; columnIndex < columnCount; columnIndex++) {
        final int primaryIndex = getPrimaryColumnIndex(columnBase, columnIndex);
        final ReadOnlyColumn primary = columns.getQuick(primaryIndex);
        final ReadOnlyColumn secondary = columns.getQuick(primaryIndex + 1);
        growColumn(
                primary,
                secondary,
                metadata.getColumnType(columnIndex),
                rowCount - getColumnTop(columnBase, columnIndex)
        );

        // reload symbol map
        final SymbolMapReader symbolReader = symbolMapReaders.getQuick(columnIndex);
        if (symbolReader == null) {
            continue;
        }
        symbolReader.updateSymbolCount(symbolCountSnapshot.getQuick(snapshotCursor));
        snapshotCursor++;
    }

    partitionRowCounts.setQuick(partitionIndex, rowCount);
}
/**
 * Reload routine for a partitioned table that already has its reload state set up.
 * <p>
 * After a new transaction is read the method handles, in this order: a data version change
 * (truncate), a previously empty partition list, and finally the regular case where the last
 * known partition is refreshed and the partition count is bumped by the number of partitions
 * between the old and the new max timestamp.
 *
 * @return {@code true} when a new transaction was read and applied, {@code false} otherwise
 */
private boolean reloadPartitioned() {
    assert timestampFloorMethod != null;

    // partition timestamp as it was before the transaction is re-read
    final long previousPartitionTimestamp;
    if (maxTimestamp == Long.MIN_VALUE) {
        previousPartitionTimestamp = maxTimestamp;
    } else {
        previousPartitionTimestamp = floorToPartitionTimestamp(maxTimestamp);
    }
    final long versionOnEntry = this.dataVersion;

    if (!readTxn()) {
        return false;
    }

    reloadStruct();

    if (this.dataVersion != versionOnEntry) {
        applyTruncate();
        return true;
    }

    if (partitionCount == 0) {
        // old partition count was 0
        incrementPartitionCountBy(calculatePartitionCount());
        return true;
    }

    assert intervalLengthMethod != null;
    // number of partitions between the timestamps before and after reload
    final int partitionDelta = getPartitionCountBetweenTimestamps(
            previousPartitionTimestamp,
            floorToPartitionTimestamp(maxTimestamp)
    );
    final int lastPartitionIndex = partitionCount - 1;

    // do we have something to reload?
    if (getPartitionRowCount(lastPartitionIndex) <= -1) {
        // although we have nothing to reload we still have to bump partition count
        if (partitionDelta > 0) {
            incrementPartitionCountBy(partitionDelta);
        }
        return true;
    }

    if (partitionDelta <= 0) {
        // still the same last partition: its size is the transient row count
        reloadPartition(lastPartitionIndex, transientRowCount);
        return true;
    }

    // newer partitions exist, so the size of the formerly last partition is read from disk
    incrementPartitionCountBy(partitionDelta);
    final Path partitionPath = partitionPathGenerator.generate(this, lastPartitionIndex);
    try {
        reloadPartition(lastPartitionIndex, TableUtils.readPartitionSize(ff, partitionPath.chopZ(), tempMem8b));
    } finally {
        partitionPath.trimTo(rootLen);
    }
    return true;
}
/**
 * Applies pending structural changes: column changes when the structure version moved and
 * removal of closed-out partitions when the partition table version moved. The remembered
 * "previous" versions are updated after each step.
 */
private void reloadStruct() {
    // column structure changed since the last reload
    if (structVersion != prevStructVersion) {
        reloadColumnChanges();
        prevStructVersion = structVersion;
    }

    // partition table changed since the last reload
    if (partitionTableVersion != prevPartitionTableVersion) {
        closeRemovedPartitions();
        prevPartitionTableVersion = partitionTableVersion;
    }
}
/**
 * Pushes the symbol counts from the snapshot into the symbol map readers. Snapshot values
 * are consumed sequentially, one per SYMBOL column, in column order.
 */
private void reloadSymbolMapCounts() {
    // position in the symbol count snapshot; advances only for SYMBOL columns
    int snapshotCursor = 0;
    for (int columnIndex = 0; columnIndex < columnCount; columnIndex++) {
        if (metadata.getColumnType(columnIndex) != ColumnType.SYMBOL) {
            continue;
        }
        final SymbolMapReader symbolReader = symbolMapReaders.getQuick(columnIndex);
        symbolReader.updateSymbolCount(symbolCountSnapshot.getQuick(snapshotCursor));
        snapshotCursor++;
    }
}
/**
 * Provides a symbol map reader for the given column.
 *
 * @param columnIndex index of the column in the reader metadata
 * @param reader      reader currently associated with the column, may be of any implementation
 * @return for non-SYMBOL columns the {@code reader} argument unchanged; for SYMBOL columns
 * either the same {@link SymbolMapReaderImpl} re-pointed via {@code of(...)} or a newly
 * created {@link SymbolMapReaderImpl}
 */
private SymbolMapReader reloadSymbolMapReader(int columnIndex, SymbolMapReader reader) {
    if (metadata.getColumnType(columnIndex) != ColumnType.SYMBOL) {
        return reader;
    }

    if (!(reader instanceof SymbolMapReaderImpl)) {
        // nothing reusable, open a fresh reader
        return new SymbolMapReaderImpl(configuration, path, metadata.getColumnName(columnIndex), 0);
    }

    // reuse the existing instance
    final SymbolMapReaderImpl reusable = (SymbolMapReaderImpl) reader;
    reusable.of(configuration, path, metadata.getColumnName(columnIndex), 0);
    return reader;
}
/**
 * Rearranges the already open columns of every partition in place, following the
 * transition index, so that existing column resources are reused after a structure change.
 * <p>
 * For each partition and each column slot the transition index says where the column comes
 * from: the same slot (kept, or reloaded if it was deleted/forced out), another slot (moved),
 * or nowhere (new column, loaded from disk). Slots past the new column count are freed.
 *
 * @param columnCount      column count after the structure change; {@code this.columnCount}
 *                         still holds the previous count while this method runs
 * @param pTransitionIndex address of the transition index
 */
private void reshuffleColumns(int columnCount, long pTransitionIndex) {
// per-column entries start 8 bytes into the transition index; each entry is 8 bytes wide:
// the int at offset 0 is read as (copyFrom + 1), the int at offset 4 as (copyTo + 1)
final long pIndexBase = pTransitionIndex + 8;
// one state byte per column follows the entries
final long pState = pIndexBase + columnCount * 8L;
for (int partitionIndex = 0; partitionIndex < partitionCount; partitionIndex++) {
int base = getColumnBase(partitionIndex);
try {
// NOTE(review): this local is scoped to the try block; the finally below therefore
// trims the reader's "path" field - presumably the generator returns that same instance, confirm
Path path = partitionPathGenerator.generate(this, partitionIndex);
final long partitionRowCount = partitionRowCounts.getQuick(partitionIndex);
// reset the state bytes so that every entry is eligible for processing in this partition
Unsafe.getUnsafe().setMemory(pState, columnCount, (byte) 0);
for (int i = 0; i < columnCount; i++) {
// presumably isEntryToBeProcessed() also marks the entry as processed in pState - confirm
if (isEntryToBeProcessed(pState, i)) {
final int copyFrom = Unsafe.getUnsafe().getInt(pIndexBase + i * 8) - 1;
if (copyFrom == i) {
// It appears that column hasn't changed its position. There are three possibilities here:
// 1. Column has been deleted and re-added by the same name. We must check if file
// descriptor is still valid. If it isn't, reload the column from disk
// 2. Column has been forced out of the reader via closeColumnForRemove(). This is required
// on Windows before column can be deleted. In this case we must check for marker
// instance and reload the column from disk
// 3. Column hasn't been altered and we can skip to next column.
ReadOnlyColumn col = columns.getQuick(getPrimaryColumnIndex(base, i));
if ((col instanceof ReadOnlyMemory && col.isDeleted()) || col instanceof ForceNullColumn) {
reloadColumnAt(path, columns, columnTops, bitmapIndexes, base, i, partitionRowCount);
}
continue;
}
if (copyFrom > -1) {
// column moves here from another slot: fetch the source, then place it into slot i
fetchColumnsFrom(this.columns, this.columnTops, this.bitmapIndexes, base, copyFrom);
copyColumnsTo(this.columns, this.columnTops, this.bitmapIndexes, base, i, partitionRowCount);
// follow the chain of destination slots for as long as entries are unprocessed
int copyTo = Unsafe.getUnsafe().getInt(pIndexBase + i * 8 + 4) - 1;
while (copyTo > -1 && isEntryToBeProcessed(pState, copyTo)) {
copyColumnsTo(this.columns, this.columnTops, this.bitmapIndexes, base, copyTo, partitionRowCount);
// NOTE(review): unlike the read before the loop, this reads the entry at (copyTo - 1)
// and does not subtract 1 from the value - verify against the transition index layout
copyTo = Unsafe.getUnsafe().getInt(pIndexBase + (copyTo - 1) * 8 + 4);
}
// release whatever is left in the temporary copy structure
Misc.free(tempCopyStruct.mem1);
Misc.free(tempCopyStruct.mem2);
Misc.free(tempCopyStruct.backwardReader);
Misc.free(tempCopyStruct.forwardReader);
} else {
// new instance
reloadColumnAt(path, columns, columnTops, bitmapIndexes, base, i, partitionRowCount);
}
}
}
// free columns in slots beyond the new column count (up to the previous column count);
// the list entries themselves are left in place
for (int i = columnCount; i < this.columnCount; i++) {
int index = getPrimaryColumnIndex(base, i);
Misc.free(columns.getQuick(index));
Misc.free(columns.getQuick(index + 1));
}
} finally {
path.trimTo(rootLen);
}
}
}
/**
 * Sizes the per-partition bookkeeping lists for the current partition count: the column and
 * bitmap index lists get one slot per column base position, column tops get half of that,
 * and partition row counts are seeded with -1.
 */
private void updateCapacities() {
    final int slotCount = getColumnBase(partitionCount);
    columns.setPos(slotCount);
    bitmapIndexes.setPos(slotCount);
    partitionRowCounts.seed(partitionCount, -1);
    columnTops.setPos(slotCount / 2);
}
/**
 * Strategy that derives a length from a pair of timestamps.
 * NOTE(review): presumably the number of partition intervals between the two timestamps for
 * the table's partitioning scheme - confirm against the implementations.
 */
@FunctionalInterface
private interface IntervalLengthMethod {
long calculate(long minTimestamp, long maxTimestamp);
}
/**
 * Strategy that derives a timestamp from a minimum timestamp and a partition index.
 * NOTE(review): presumably the timestamp of the partition at the given index, counted from
 * the partition containing {@code minTimestamp} - confirm against the implementations.
 */
@FunctionalInterface
interface PartitionTimestampCalculatorMethod {
long calculate(long minTimestamp, int partitionIndex);
}
/**
 * Strategy that floors a timestamp.
 * NOTE(review): presumably rounds down to the start of the partition interval (year, month
 * or day) - confirm against the implementations.
 */
@FunctionalInterface
interface TimestampFloorMethod {
long floor(long timestamp);
}
/**
 * Strategy invoked to reload a reader; the reader swaps its current strategy as its state
 * changes (for example from an "initial" variant to the regular one).
 * The boolean result is that of the reload routine the strategy delegates to.
 */
@FunctionalInterface
private interface ReloadMethod {
boolean reload(TableReader reader);
}
/**
 * Strategy that produces the path of a partition for the given reader and partition index.
 * Callers in this class trim the returned path back to the root length once done with it.
 */
@FunctionalInterface
private interface PartitionPathGenerator {
Path generate(TableReader reader, int partitionIndex);
}
/**
 * Marker subclass of {@link NullColumn}. The column reshuffling code checks for instances of
 * this type to detect a column that was forced out of the reader and must be reloaded from disk.
 */
private static class ForceNullColumn extends NullColumn {
// shared marker instance
private static final ForceNullColumn INSTANCE = new ForceNullColumn();
}
/**
 * Temporary holder for the resources of one column while columns are being moved between
 * slots. The column reshuffling code frees the two column memories and the two index readers
 * after each move.
 */
private static class ColumnCopyStruct {
// the two column objects of the column being moved
// NOTE(review): presumably the primary (mem1) and secondary (mem2) column - confirm
ReadOnlyColumn mem1;
ReadOnlyColumn mem2;
// bitmap index readers of the column being moved
BitmapIndexReader backwardReader;
BitmapIndexReader forwardReader;
// NOTE(review): presumably the column top of the column being moved - confirm
long top;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy