All downloads are free. Search and download functionality uses the official Maven repository.

io.questdb.cairo.TableUtils Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 *     ___                  _   ____  ____
 *    / _ \ _   _  ___  ___| |_|  _ \| __ )
 *   | | | | | | |/ _ \/ __| __| | | |  _ \
 *   | |_| | |_| |  __/\__ \ |_| |_| | |_) |
 *    \__\_\\__,_|\___||___/\__|____/|____/
 *
 *  Copyright (c) 2014-2019 Appsicle
 *  Copyright (c) 2019-2020 QuestDB
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

package io.questdb.cairo;

import io.questdb.cairo.sql.SymbolTable;
import io.questdb.griffin.SqlException;
import io.questdb.log.Log;
import io.questdb.log.LogFactory;
import io.questdb.std.CharSequenceIntHashMap;
import io.questdb.std.Files;
import io.questdb.std.FilesFacade;
import io.questdb.std.Numbers;
import io.questdb.std.Os;
import io.questdb.std.Transient;
import io.questdb.std.Unsafe;
import io.questdb.std.microtime.DateFormatCompiler;
import io.questdb.std.microtime.TimestampFormat;
import io.questdb.std.microtime.Timestamps;
import io.questdb.std.str.LPSZ;
import io.questdb.std.str.Path;

public final class TableUtils {
    public static final int TABLE_EXISTS = 0;
    public static final int TABLE_DOES_NOT_EXIST = 1;
    public static final int TABLE_RESERVED = 2;
    public static final String META_FILE_NAME = "_meta";
    public static final String TXN_FILE_NAME = "_txn";
    public static final int INITIAL_TXN = 0;
    public static final int NULL_LEN = -1;
    public static final int ANY_TABLE_VERSION = -1;
    static final int MIN_INDEX_VALUE_BLOCK_SIZE = Numbers.ceilPow2(4);
    static final byte TODO_RESTORE_META = 2;
    static final byte TODO_TRUNCATE = 1;
    static final TimestampFormat fmtDay;
    static final TimestampFormat fmtMonth;
    static final TimestampFormat fmtYear;
    static final String ARCHIVE_FILE_NAME = "_archive";
    static final String DEFAULT_PARTITION_NAME = "default";
    // transaction file structure
    static final long TX_OFFSET_TXN = 0;
    static final long TX_OFFSET_TRANSIENT_ROW_COUNT = 8;
    static final long TX_OFFSET_FIXED_ROW_COUNT = 16;
    static final long TX_OFFSET_MIN_TIMESTAMP = 24;
    static final long TX_OFFSET_MAX_TIMESTAMP = 32;
    static final long TX_OFFSET_STRUCT_VERSION = 40;
    static final long TX_OFFSET_DATA_VERSION = 48;
    static final long TX_OFFSET_PARTITION_TABLE_VERSION = 56;
    static final long TX_OFFSET_TXN_CHECK = 64;
    static final long TX_OFFSET_MAP_WRITER_COUNT = 72;
    /**
     * TXN file structure
     * struct {
     * long txn;
     * long transient_row_count; // rows count in last partition
     * long fixed_row_count; // row count in table excluding count in last partition
     * long max_timestamp; // last timestamp written to table
     * long struct_version; // data structure version; whenever columns added or removed this version changes.
     * long partition_version; // version that increments whenever non-current partitions are modified/added/removed
     * long txn_check; // same as txn - sanity check for concurrent reads and writes
     * int  map_writer_count; // symbol writer count
     * int  map_writer_position[map_writer_count]; // position of each of map writers
     * }
     * 

* TableUtils.resetTxn() writes to this file, it could be using different offsets, beware */ static final String META_SWAP_FILE_NAME = "_meta.swp"; static final String META_PREV_FILE_NAME = "_meta.prev"; static final long META_OFFSET_COUNT = 0; // INT - symbol map count, this is a variable part of transaction file // below this offset we will have INT values for symbol map size static final long META_OFFSET_PARTITION_BY = 4; static final long META_OFFSET_TIMESTAMP_INDEX = 8; static final long META_OFFSET_VERSION = 12; static final long META_COLUMN_DATA_SIZE = 16; static final long META_COLUMN_DATA_RESERVED = 3; static final long META_OFFSET_COLUMN_TYPES = 128; static final int META_FLAG_BIT_INDEXED = 1; static final int META_FLAG_BIT_SEQUENTIAL = 1 << 1; static final String TODO_FILE_NAME = "_todo"; private static final int MIN_SYMBOL_CAPACITY = 2; private static final int MAX_SYMBOL_CAPACITY = Numbers.ceilPow2(Integer.MAX_VALUE); private static final int MAX_SYMBOL_CAPACITY_CACHED = Numbers.ceilPow2(1_000_000); private static final int MAX_INDEX_VALUE_BLOCK_SIZE = Numbers.ceilPow2(8 * 1024 * 1024); private final static Log LOG = LogFactory.getLog(TableUtils.class); public static void createTable( FilesFacade ff, AppendMemory memory, Path path, @Transient CharSequence root, TableStructure structure, int mkDirMode ) { createTable(ff, memory, path, root, structure, mkDirMode, ColumnType.VERSION); } public static void createTable( FilesFacade ff, AppendMemory memory, Path path, @Transient CharSequence root, TableStructure structure, int mkDirMode, int tableVersion ) { path.of(root).concat(structure.getTableName()); if (ff.mkdirs(path.put(Files.SEPARATOR).$(), mkDirMode) != 0) { throw CairoException.instance(ff.errno()).put("could not create [dir=").put(path).put(']'); } final int rootLen = path.length(); try (AppendMemory mem = memory) { mem.of(ff, path.trimTo(rootLen).concat(META_FILE_NAME).$(), ff.getPageSize()); final int count = structure.getColumnCount(); 
mem.putInt(count); mem.putInt(structure.getPartitionBy()); mem.putInt(structure.getTimestampIndex()); mem.putInt(tableVersion); mem.jumpTo(TableUtils.META_OFFSET_COLUMN_TYPES); for (int i = 0; i < count; i++) { mem.putByte((byte) structure.getColumnType(i)); long flags = 0; if (structure.isIndexed(i)) { flags |= META_FLAG_BIT_INDEXED; } if (structure.isSequential(i)) { flags |= META_FLAG_BIT_SEQUENTIAL; } mem.putLong(flags); mem.putInt(structure.getIndexBlockCapacity(i)); mem.skip(META_COLUMN_DATA_RESERVED); // reserved } for (int i = 0; i < count; i++) { mem.putStr(structure.getColumnName(i)); } // create symbol maps int symbolMapCount = 0; for (int i = 0; i < count; i++) { if (structure.getColumnType(i) == ColumnType.SYMBOL) { SymbolMapWriter.createSymbolMapFiles( ff, mem, path.trimTo(rootLen), structure.getColumnName(i), structure.getSymbolCapacity(i), structure.getSymbolCacheFlag(i) ); symbolMapCount++; } } mem.of(ff, path.trimTo(rootLen).concat(TXN_FILE_NAME).$(), ff.getPageSize()); TableUtils.resetTxn(mem, symbolMapCount, 0L, INITIAL_TXN); } } public static int exists(FilesFacade ff, Path path, CharSequence root, CharSequence name) { return exists(ff, path, root, name, 0, name.length()); } public static int exists(FilesFacade ff, Path path, CharSequence root, CharSequence name, int lo, int hi) { path.of(root).concat(name, lo, hi).$(); if (ff.exists(path)) { // prepare to replace trailing \0 if (ff.exists(path.chopZ().concat(TXN_FILE_NAME).$())) { return TABLE_EXISTS; } else { return TABLE_RESERVED; } } else { return TABLE_DOES_NOT_EXIST; } } public static long getColumnNameOffset(int columnCount) { return META_OFFSET_COLUMN_TYPES + columnCount * META_COLUMN_DATA_SIZE; } public static long getPartitionTableIndexOffset(int symbolWriterCount, int index) { return getPartitionTableSizeOffset(symbolWriterCount) + 4 + index * 8; } public static long getPartitionTableSizeOffset(int symbolWriterCount) { return getSymbolWriterIndexOffset(symbolWriterCount); } public 
static long getSymbolWriterIndexOffset(int index) { return TX_OFFSET_MAP_WRITER_COUNT + 4 + index * 4L; } public static long getTxMemSize(int symbolWriterCount, int removedPartitionsCount) { return getPartitionTableIndexOffset(symbolWriterCount, removedPartitionsCount); } public static long lock(FilesFacade ff, Path path) { long fd = ff.openRW(path); if (fd == -1) { LOG.error().$("cannot open '").$(path).$("' to lock [errno=").$(ff.errno()).$(']').$(); return -1L; } if (ff.lock(fd) != 0) { LOG.error().$("cannot lock '").$(path).$("' [errno=").$(ff.errno()).$(", fd=").$(fd).$(']').$(); ff.close(fd); return -1L; } return fd; } public static void lockName(Path path) { path.put(".lock").$(); } public static void resetTxn(VirtualMemory txMem, int symbolMapCount, long txn, long dataVersion) { // txn to let readers know table is being reset txMem.putLong(TX_OFFSET_TXN, txn); Unsafe.getUnsafe().storeFence(); // transient row count txMem.putLong(TX_OFFSET_TRANSIENT_ROW_COUNT, 0); // fixed row count txMem.putLong(TX_OFFSET_FIXED_ROW_COUNT, 0); // min timestamp value in table txMem.putLong(TX_OFFSET_MIN_TIMESTAMP, Long.MAX_VALUE); // max timestamp value in table txMem.putLong(TX_OFFSET_MAX_TIMESTAMP, Long.MIN_VALUE); // structure version txMem.putLong(TX_OFFSET_STRUCT_VERSION, 0); // data version txMem.putLong(TX_OFFSET_DATA_VERSION, dataVersion); txMem.putInt(TX_OFFSET_MAP_WRITER_COUNT, symbolMapCount); for (int i = 0; i < symbolMapCount; i++) { txMem.putInt(getSymbolWriterIndexOffset(i), 0); } Unsafe.getUnsafe().storeFence(); // txn check txMem.putLong(TX_OFFSET_TXN_CHECK, txn); // partition update count txMem.putInt(getPartitionTableSizeOffset(symbolMapCount), 0); // make sure we put append pointer behind our data so that // files does not get truncated when closing txMem.jumpTo(getPartitionTableIndexOffset(symbolMapCount, 0)); } public static int toIndexKey(int symbolKey) { return symbolKey == SymbolTable.VALUE_IS_NULL ? 
0 : symbolKey + 1; } public static void validate(FilesFacade ff, ReadOnlyColumn metaMem, CharSequenceIntHashMap nameIndex) { try { final int metaVersion = metaMem.getInt(TableUtils.META_OFFSET_VERSION); if (ColumnType.VERSION != metaVersion && metaVersion != 404) { throw validationException(metaMem).put("Metadata version does not match runtime version"); } final int columnCount = metaMem.getInt(META_OFFSET_COUNT); long offset = getColumnNameOffset(columnCount); if (offset < columnCount || ( columnCount > 0 && (offset < 0 || offset >= ff.length(metaMem.getFd())))) { throw validationException(metaMem).put("Incorrect columnCount: ").put(columnCount); } final int timestampIndex = metaMem.getInt(META_OFFSET_TIMESTAMP_INDEX); if (timestampIndex < -1 || timestampIndex >= columnCount) { throw validationException(metaMem).put("Timestamp index is outside of columnCount"); } if (timestampIndex != -1) { int timestampType = getColumnType(metaMem, timestampIndex); if (timestampType != ColumnType.TIMESTAMP) { throw validationException(metaMem).put("Timestamp column must be TIMESTAMP, but found ").put(ColumnType.nameOf(timestampType)); } } // validate column types and index attributes for (int i = 0; i < columnCount; i++) { int type = getColumnType(metaMem, i); if (ColumnType.sizeOf(type) == -1) { throw validationException(metaMem).put("Invalid column type ").put(type).put(" at [").put(i).put(']'); } if (isColumnIndexed(metaMem, i)) { if (type != ColumnType.SYMBOL) { throw validationException(metaMem).put("Index flag is only supported for SYMBOL").put(" at [").put(i).put(']'); } if (getIndexBlockCapacity(metaMem, i) < 2) { throw validationException(metaMem).put("Invalid index value block capacity ").put(getIndexBlockCapacity(metaMem, i)).put(" at [").put(i).put(']'); } } } // validate column names for (int i = 0; i < columnCount; i++) { CharSequence name = metaMem.getStr(offset); if (name == null || name.length() < 1) { throw validationException(metaMem).put("NULL column name at 
[").put(i).put(']'); } String s = name.toString(); if (!nameIndex.put(s, i)) { throw validationException(metaMem).put("Duplicate column: ").put(s).put(" at [").put(i).put(']'); } offset += ReadOnlyMemory.getStorageLength(name); } } catch (CairoException e) { nameIndex.clear(); throw e; } } public static void validateIndexValueBlockSize(int position, int indexValueBlockSize) throws SqlException { if (indexValueBlockSize < MIN_INDEX_VALUE_BLOCK_SIZE) { throw SqlException.$(position, "min index block capacity is ").put(MIN_INDEX_VALUE_BLOCK_SIZE); } if (indexValueBlockSize > MAX_INDEX_VALUE_BLOCK_SIZE) { throw SqlException.$(position, "max index block capacity is ").put(MAX_INDEX_VALUE_BLOCK_SIZE); } } public static void validateSymbolCapacity(int position, int symbolCapacity) throws SqlException { if (symbolCapacity < MIN_SYMBOL_CAPACITY) { throw SqlException.$(position, "min symbol capacity is ").put(MIN_SYMBOL_CAPACITY); } if (symbolCapacity > MAX_SYMBOL_CAPACITY) { throw SqlException.$(position, "max symbol capacity is ").put(MAX_SYMBOL_CAPACITY); } } public static void validateSymbolCapacityCached(boolean cache, int symbolCapacity, int cacheKeywordPosition) throws SqlException { if (cache && symbolCapacity > MAX_SYMBOL_CAPACITY_CACHED) { throw SqlException.$(cacheKeywordPosition, "max cached symbol capacity is ").put(MAX_SYMBOL_CAPACITY_CACHED); } } static long readLongAtOffset(FilesFacade ff, Path path, long tempMem8b, long offset) { long fd = ff.openRO(path); if (fd == -1) { throw CairoException.instance(ff.errno()).put("Cannot open: ").put(path); } try { if (ff.read(fd, tempMem8b, Long.BYTES, offset) != Long.BYTES) { throw CairoException.instance(ff.errno()).put("Cannot read: ").put(path); } return Unsafe.getUnsafe().getLong(tempMem8b); } finally { ff.close(fd); } } /** * path member variable has to be set to location of "top" file. * * @return number of rows column doesn't have when column was added to table that already had data. 
*/ static long readColumnTop(FilesFacade ff, Path path, CharSequence name, int plen, long buf) { try { if (ff.exists(topFile(path.chopZ(), name))) { long fd = ff.openRO(path); try { if (ff.read(fd, buf, 8, 0) != 8) { throw CairoException.instance(Os.errno()).put("Cannot read top of column ").put(path); } return Unsafe.getUnsafe().getLong(buf); } finally { ff.close(fd); } } return 0L; } finally { path.trimTo(plen); } } static LPSZ dFile(Path path, CharSequence columnName) { return path.concat(columnName).put(".d").$(); } static LPSZ topFile(Path path, CharSequence columnName) { return path.concat(columnName).put(".top").$(); } static LPSZ iFile(Path path, CharSequence columnName) { return path.concat(columnName).put(".i").$(); } static int getColumnType(ReadOnlyColumn metaMem, int columnIndex) { return metaMem.getByte(META_OFFSET_COLUMN_TYPES + columnIndex * META_COLUMN_DATA_SIZE); } static long getColumnFlags(ReadOnlyColumn metaMem, int columnIndex) { return metaMem.getLong(META_OFFSET_COLUMN_TYPES + columnIndex * META_COLUMN_DATA_SIZE + 1); } static boolean isColumnIndexed(ReadOnlyColumn metaMem, int columnIndex) { return (getColumnFlags(metaMem, columnIndex) & META_FLAG_BIT_INDEXED) != 0; } static boolean isSequential(ReadOnlyColumn metaMem, int columnIndex) { return (getColumnFlags(metaMem, columnIndex) & META_FLAG_BIT_SEQUENTIAL) != 0; } static int getIndexBlockCapacity(ReadOnlyColumn metaMem, int columnIndex) { return metaMem.getInt(META_OFFSET_COLUMN_TYPES + columnIndex * META_COLUMN_DATA_SIZE + 9); } static int openMetaSwapFile(FilesFacade ff, AppendMemory mem, Path path, int rootLen, int retryCount) { try { path.concat(META_SWAP_FILE_NAME).$(); int l = path.length(); int index = 0; do { if (index > 0) { path.trimTo(l).put('.').put(index); path.$(); } if (!ff.exists(path) || ff.remove(path)) { try { mem.of(ff, path, ff.getPageSize()); return index; } catch (CairoException e) { // right, cannot open file for some reason? 
LOG.error().$("Cannot open file: ").$(path).$('[').$(Os.errno()).$(']').$(); } } else { LOG.error().$("Cannot remove file: ").$(path).$('[').$(Os.errno()).$(']').$(); } } while (++index < retryCount); throw CairoException.instance(0).put("Cannot open indexed file. Max number of attempts reached [").put(index).put("]. Last file tried: ").put(path); } finally { path.trimTo(rootLen); } } static String getTodoText(long code) { switch ((int) (code & 0xff)) { case TODO_TRUNCATE: return "truncate"; case TODO_RESTORE_META: return "restore meta"; default: // really impossible to happen, but we keep this line to comply with Murphy's law. return "unknown"; } } private static CairoException validationException(ReadOnlyColumn mem) { return CairoException.instance(0).put("Invalid metadata at fd=").put(mem.getFd()).put(". "); } static boolean isSamePartition(long timestampA, long timestampB, int partitionBy) { switch (partitionBy) { case PartitionBy.NONE: return true; case PartitionBy.DAY: return Timestamps.floorDD(timestampA) == Timestamps.floorDD(timestampB); case PartitionBy.MONTH: return Timestamps.floorMM(timestampA) == Timestamps.floorMM(timestampB); case PartitionBy.YEAR: return Timestamps.floorYYYY(timestampA) == Timestamps.floorYYYY(timestampB); default: throw CairoException.instance(0).put("Cannot compare timestamps for unsupported partition type: [").put(partitionBy).put(']'); } } static long readPartitionSize(FilesFacade ff, Path path, long tempMem8b) { int plen = path.length(); try { if (ff.exists(path.concat(ARCHIVE_FILE_NAME).$())) { long fd = ff.openRO(path); if (fd == -1) { throw CairoException.instance(Os.errno()).put("Cannot open: ").put(path); } try { if (ff.read(fd, tempMem8b, 8, 0) != 8) { throw CairoException.instance(Os.errno()).put("Cannot read: ").put(path); } return Unsafe.getUnsafe().getLong(tempMem8b); } finally { ff.close(fd); } } else { throw CairoException.instance(0).put("Doesn't exist: ").put(path); } } finally { path.trimTo(plen); } } static { 
DateFormatCompiler compiler = new DateFormatCompiler(); fmtDay = compiler.compile("yyyy-MM-dd"); fmtMonth = compiler.compile("yyyy-MM"); fmtYear = compiler.compile("yyyy"); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy