All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.questdb.cairo.TableNameRegistryStore Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 *     ___                  _   ____  ____
 *    / _ \ _   _  ___  ___| |_|  _ \| __ )
 *   | | | | | | |/ _ \/ __| __| | | |  _ \
 *   | |_| | |_| |  __/\__ \ |_| |_| | |_) |
 *    \__\_\\__,_|\___||___/\__|____/|____/
 *
 *  Copyright (c) 2014-2019 Appsicle
 *  Copyright (c) 2019-2024 QuestDB
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

package io.questdb.cairo;

import io.questdb.cairo.vm.Vm;
import io.questdb.cairo.vm.api.MemoryCMR;
import io.questdb.cairo.vm.api.MemoryMR;
import io.questdb.log.Log;
import io.questdb.log.LogFactory;
import io.questdb.std.Chars;
import io.questdb.std.ConcurrentHashMap;
import io.questdb.std.Files;
import io.questdb.std.FilesFacade;
import io.questdb.std.MemoryTag;
import io.questdb.std.Misc;
import io.questdb.std.Numbers;
import io.questdb.std.NumericException;
import io.questdb.std.ObjList;
import io.questdb.std.Unsafe;
import io.questdb.std.str.LPSZ;
import io.questdb.std.str.Path;
import io.questdb.std.str.StringSink;
import io.questdb.std.str.Utf8StringSink;
import io.questdb.std.str.Utf8s;
import org.jetbrains.annotations.Nullable;
import org.jetbrains.annotations.TestOnly;

import java.util.Map;

import static io.questdb.cairo.TableUtils.META_FILE_NAME;
import static io.questdb.cairo.TableUtils.isMatViewDefinitionFileExists;
import static io.questdb.cairo.wal.WalUtils.*;
import static io.questdb.std.Files.DT_FILE;

/**
 * Table name registry store that refuses to write entries unless the registry
 * lock file is held by this instance (see {@link #lock()} and
 * {@link #writeEntry(TableToken, int)}), and that can rebuild the name-to-token
 * and dir-to-token maps from the versioned tables file and from the table
 * directories found under the database root (see {@code reload}).
 */
public class TableNameRegistryStore extends GrowOnlyTableNameRegistryStore {
    private static final Log LOG = LogFactory.getLog(TableNameRegistryStore.class);
    private final CairoConfiguration configuration;
    // Scratch sink that receives directory entry names while scanning for tables file versions.
    private final StringSink nameSink = new StringSink();
    // Supplies the protected/system/public flags used when table tokens are created.
    private final TableFlagResolver tableFlagResolver;
    // Read-only mapping used to read the tables file when the lock is not held;
    // it is also handed to TableUtils.readTableName() while scanning table directories.
    private final MemoryCMR tableNameRoMemory = Vm.getCMRInstance();
    // File descriptor of the registry lock file, -1 while the lock is not held.
    private long lockFd = -1;
    // Native scratch buffer of Long.BYTES; allocated lazily by checkWalTableInPendingDropState()
    // and released at the end of reloadFromRootDirectory(). 0 while not allocated.
    private long longBuffer;

    /**
     * Creates a store bound to the given configuration.
     *
     * @param configuration     source of the files facade, database root and compaction threshold
     * @param tableFlagResolver resolves the protected/system/public flags of tables by name
     */
    public TableNameRegistryStore(CairoConfiguration configuration, TableFlagResolver tableFlagResolver) {
        super(configuration.getFilesFacade());
        this.tableFlagResolver = tableFlagResolver;
        this.configuration = configuration;
    }

    /**
     * Scans the directory at {@code path} for regular files named
     * {@code TABLE_REGISTRY_NAME_FILE + '.' + <integer>} and returns the highest
     * integer suffix found.
     * <p>
     * Files whose suffix is not a valid integer (for example the lock or temporary
     * files that share the same prefix) are ignored, as are files where the
     * character following the base name is not the {@code '.'} separator.
     *
     * @param ff       files facade used to iterate the directory
     * @param path     directory to scan
     * @param nameSink scratch sink, cleared and overwritten for every file entry
     * @return the highest version suffix found, or 0 when no versioned file exists
     * @throws CairoException when the directory iterator cannot be opened
     */
    public static int findLastTablesFileVersion(FilesFacade ff, Path path, StringSink nameSink) {
        long findPtr = ff.findFirst(path.$());
        if (findPtr == 0) {
            throw CairoException.critical(0).put("database root directory does not exist at ").put(path);
        }
        // index of the '.' that separates the base file name from the version number
        final int separatorIndex = TABLE_REGISTRY_NAME_FILE.length();
        try {
            int lastVersion = 0;
            do {
                long pUtf8NameZ = ff.findName(findPtr);
                if (ff.findType(findPtr) == DT_FILE) {
                    nameSink.clear();
                    boolean validUtf8 = Utf8s.utf8ToUtf16Z(pUtf8NameZ, nameSink);
                    assert validUtf8 : "invalid UTF-8 in file name";
                    if (
                            nameSink.length() > separatorIndex + 1
                                    && Chars.startsWith(nameSink, TABLE_REGISTRY_NAME_FILE)
                                    // require the separator, otherwise e.g. "<base>X7" would be read as version 7
                                    && nameSink.charAt(separatorIndex) == '.'
                    ) {
                        try {
                            int version = Numbers.parseInt(nameSink, separatorIndex + 1, nameSink.length());
                            if (version > lastVersion) {
                                lastVersion = version;
                            }
                        } catch (NumericException ignore) {
                            // suffix is not a number, this is not a versioned tables file
                        }
                    }
                }
            } while (ff.findNext(findPtr) > 0);
            return lastVersion;
        } finally {
            ff.findClose(findPtr);
        }
    }

    /**
     * Closes the underlying store and then releases the registry lock file
     * descriptor when one is held.
     */
    @Override
    public void close() {
        super.close();
        if (lockFd == -1) {
            // lock is not held, nothing else to release
            return;
        }
        configuration.getFilesFacade().close(lockFd);
        lockFd = -1;
    }

    /**
     * @return true when this instance currently holds a lock file descriptor
     */
    public boolean isLocked() {
        return lockFd != -1;
    }

    /**
     * Attempts to take the registry lock by locking the
     * {@code TABLE_REGISTRY_NAME_FILE + ".lock"} file under the database root.
     *
     * @return true when the lock was obtained, false otherwise
     * @throws CairoException when this instance already holds the lock
     */
    public boolean lock() {
        if (lockFd != -1) {
            throw CairoException.critical(0).put("table registry already locked");
        }

        final FilesFacade ff = configuration.getFilesFacade();
        // A dedicated lock file is used because Windows does not allow locking directories.
        final Path lockFilePath = Path.getThreadLocal(configuration.getDbRoot());
        lockFilePath.concat(TABLE_REGISTRY_NAME_FILE).put(".lock");
        final LPSZ lockFile = lockFilePath.$();

        if (ff.exists(lockFile)) {
            ff.touch(lockFile);
        }

        lockFd = TableUtils.lock(ff, lockFile);
        return lockFd != -1;
    }

    /**
     * Rebuilds both token maps: first from the tables file, then from the table
     * directories under the database root that the tables file did not cover.
     *
     * @param tableNameToTokenMap map from table name to token, updated in place
     * @param dirNameToTokenMap   map from directory name to reverse map item, updated in place
     * @param convertedTables     optional list of converted table tokens to apply on top of the tables file
     * @return the result of loading the tables file; false when it was missing or inconsistent
     */
    public boolean reload(
            ConcurrentHashMap tableNameToTokenMap,
            ConcurrentHashMap dirNameToTokenMap,
            @Nullable ObjList convertedTables
    ) {
        final boolean tablesFileConsistent = reloadFromTablesFile(tableNameToTokenMap, dirNameToTokenMap, convertedTables);
        // the directory scan always runs, whatever the outcome of the tables file load
        reloadFromRootDirectory(tableNameToTokenMap, dirNameToTokenMap);
        return tablesFileConsistent;
    }

    /**
     * Test helper: takes the lock when it is not held yet, then removes the
     * {@code .0} version of the tables file and maps it anew.
     *
     * @throws CairoException when the lock is not held and cannot be obtained
     */
    @TestOnly
    public synchronized void resetMemory() {
        if (!isLocked() && !lock()) {
            throw CairoException.critical(0).put("table registry is not locked");
        }
        tableNameMemory.close();

        final FilesFacade ff = configuration.getFilesFacade();
        final LPSZ versionZeroFile = Path.getThreadLocal(configuration.getDbRoot())
                .concat(TABLE_REGISTRY_NAME_FILE)
                .put(".0")
                .$();
        ff.remove(versionZeroFile);
        tableNameMemory.smallFile(ff, versionZeroFile, MemoryTag.MMAP_DEFAULT);
    }

    /**
     * Appends an entry through the parent store, but only while the registry
     * lock is held.
     *
     * @param tableToken token the entry refers to
     * @param operation  operation code of the entry
     * @throws CairoException when the registry lock is not held
     */
    @Override
    public void writeEntry(TableToken tableToken, int operation) {
        if (isLocked()) {
            super.writeEntry(tableToken, operation);
            return;
        }
        throw CairoException.critical(0).put("table registry is not locked");
    }

    /**
     * Reads the structure version from the sequencer metadata file of the given
     * table and reports whether it equals {@code DROP_TABLE_STRUCTURE_VERSION}.
     * When the file cannot be opened or read, the table is assumed to be in the
     * process of being dropped.
     *
     * @param tableToken token of the table to inspect
     * @param ff         files facade
     * @param path       scratch path, trimmed to {@code plimit} and then extended
     * @param plimit     length of the database root prefix inside {@code path}
     * @return true when the table is, or is assumed to be, pending drop
     */
    private boolean checkWalTableInPendingDropState(TableToken tableToken, FilesFacade ff, Path path, int plimit) {
        if (longBuffer == 0) {
            // allocated on first use, released by the caller's directory scan
            longBuffer = Unsafe.malloc(Long.BYTES, MemoryTag.NATIVE_DEFAULT);
        }

        path.trimTo(plimit).concat(tableToken.getDirName()).concat(SEQ_DIR).concat(META_FILE_NAME);
        final long fd = ff.openRO(path.$());
        if (fd == -1) {
            LOG.error().$("cannot open seq meta file, assume table is being dropped [path=").$(path).I$();
            return true;
        }

        try {
            final long bytesRead = ff.read(fd, longBuffer, Long.BYTES, SEQ_META_OFFSET_STRUCTURE_VERSION);
            if (bytesRead != Long.BYTES) {
                // short read: the structure version is unknown
                LOG.error().$("cannot read structure version, assume table is being dropped [path=").$(path).I$();
                return true;
            }
            return Unsafe.getUnsafe().getLong(longBuffer) == DROP_TABLE_STRUCTURE_VERSION;
        } finally {
            ff.close(fd);
        }
    }

    /**
     * Handles a table name that maps to two different directories: logs the
     * conflict, dumps the current registry content to the log, resets the
     * writable registry to an empty state when the lock is held, and clears both
     * in-memory maps.
     *
     * @param tableNameToTableTokenMap map from table name to token, cleared
     * @param dirNameToTableTokenMap   map from directory name to reverse map item, cleared
     * @param lastFileVersion          version of the tables file, used in the dump log
     * @param errorTableName           table name that was found twice
     * @param errorDirName             directory name of the second mapping
     * @param conflictTableToken       token that already holds the table name
     */
    private void clearRegistryToReloadFromFileSystem(
            ConcurrentHashMap tableNameToTableTokenMap,
            ConcurrentHashMap dirNameToTableTokenMap,
            int lastFileVersion,
            String errorTableName,
            String errorDirName,
            TableToken conflictTableToken
    ) {
        LOG.critical().$("duplicate table dir to name mapping found [tableName=").utf8(errorTableName)
                .$(", dirName1=").utf8(conflictTableToken.getDirName())
                .$(", dirName2=").utf8(errorDirName)
                .I$();
        // dump before the registry is reset, while the entries are still readable
        dumpTableRegistry(lastFileVersion);
        if (isLocked()) {
            // Reset existing registry to empty state
            // The long at offset 0 is set to Long.BYTES, i.e. no entries follow it,
            // and the append position is moved right behind it.
            tableNameMemory.putLong(0, Long.BYTES);
            tableNameMemory.jumpTo(Long.BYTES);
        }
        tableNameToTableTokenMap.clear();
        dirNameToTableTokenMap.clear();
    }

    /**
     * Rewrites the tables file so that it holds only the entries that are still
     * needed: an add/remove pair for every table marked as dropped in
     * {@code reverseNameMap}, followed by an add entry for every token in
     * {@code nameTableTokenMap}. The content is written to a temporary file,
     * which is then renamed to the next version of the tables file. When the
     * rename fails, the existing file is mapped again and stays in use.
     *
     * @param nameTableTokenMap tokens of the tables to keep
     * @param reverseNameMap    reverse map items; only the dropped ones are written
     * @param lastFileVersion   version of the tables file currently in use
     * @param ff                files facade
     * @param path              path positioned at the database root
     */
    private void compactTableNameFile(
            Map nameTableTokenMap,
            Map reverseNameMap,
            int lastFileVersion,
            FilesFacade ff,
            Path path
    ) {
        final int rootLen = path.size();
        final int nextFileVersion = lastFileVersion + 1;

        // switch the writable mapping over to a temporary file
        path.concat(TABLE_REGISTRY_NAME_FILE).putAscii(".tmp");
        tableNameMemory.close(false);
        tableNameMemory.smallFile(ff, path.$(), MemoryTag.MMAP_DEFAULT);
        tableNameMemory.putLong(0L);

        // tables that are not fully deleted yet are kept so that the deletion can be completed
        for (ReverseTableMapItem item : reverseNameMap.values()) {
            if (!item.isDropped()) {
                continue;
            }
            writeEntry(item.getToken(), OPERATION_ADD);
            writeEntry(item.getToken(), OPERATION_REMOVE);
        }

        for (TableToken liveToken : nameTableTokenMap.values()) {
            writeEntry(liveToken, OPERATION_ADD);
        }

        tableNameMemory.sync(false);
        final long compactedAppendOffset = tableNameMemory.getAppendOffset();
        tableNameMemory.close();

        // renaming the temporary file to the next version makes everyone switch to the new file
        LPSZ path2 = Path.getThreadLocal2(configuration.getDbRoot())
                .concat(TABLE_REGISTRY_NAME_FILE).put('.').put(nextFileVersion).$();

        if (ff.rename(path.$(), path2) != Files.FILES_RENAME_OK) {
            // Not critical: compaction will be attempted again next time.
            // Map the existing, non-compacted file and continue appending to it.
            path2 = Path.getThreadLocal2(configuration.getDbRoot())
                    .concat(TABLE_REGISTRY_NAME_FILE).put('.').put(lastFileVersion).$();
            tableNameMemory.smallFile(ff, path2, MemoryTag.MMAP_DEFAULT);
            tableNameMemory.jumpTo(tableNameMemory.getLong(0));

            LOG.error().$("could not rename tables file, tables file will not be compacted [from=").$(path)
                    .$(", to=").$(path2).I$();
            return;
        }

        LOG.info().$("compacted tables file [path=").$(path2).I$();

        // best effort removal of the previous version, failure is ignored
        path.trimTo(rootLen).concat(TABLE_REGISTRY_NAME_FILE).putAscii('.').put(lastFileVersion);
        ff.removeQuiet(path.$());

        // map the new version and position the appender at the end of the compacted data
        path.trimTo(rootLen).concat(TABLE_REGISTRY_NAME_FILE).putAscii('.').put(nextFileVersion);
        tableNameMemory.smallFile(ff, path.$(), MemoryTag.MMAP_DEFAULT);
        tableNameMemory.jumpTo(compactedAppendOffset);
    }

    /**
     * Writes every entry of the currently mapped tables file to the log. The
     * writable mapping is read when the lock is held, the read-only mapping
     * otherwise.
     *
     * @param lastFileVersion version of the tables file, used for logging only
     */
    private void dumpTableRegistry(int lastFileVersion) {
        final MemoryMR registry = isLocked() ? tableNameMemory : tableNameRoMemory;
        // the first long of the file holds the offset at which the entries end
        final long entriesEnd = registry.getLong(0);

        LOG.advisoryW().$("dumping table registry [file=").$(TABLE_REGISTRY_NAME_FILE).$('.').$(lastFileVersion)
                .$(", size=").$(entriesEnd).I$();

        long offset = Long.BYTES;
        while (offset < entriesEnd) {
            // entry layout: operation, table name, dir name, table id, table type
            final int operation = registry.getInt(offset);
            offset += Integer.BYTES;

            final String tableName = Chars.toString(registry.getStrA(offset));
            offset += Vm.getStorageLength(tableName);

            final String dirName = Chars.toString(registry.getStrA(offset));
            offset += Vm.getStorageLength(dirName);

            final int tableId = registry.getInt(offset);
            offset += Integer.BYTES;

            final int tableType = registry.getInt(offset);
            offset += Integer.BYTES;

            final boolean isAdd = operation == OPERATION_ADD;
            final String operationLabel = isAdd ? "add (" : "remove (";
            LOG.advisoryW().$("operation=").$(operationLabel).$(operation)
                    .$("), tableName=").utf8(tableName)
                    .$(", dirName=").utf8(dirName)
                    .$(", tableId=").$(tableId)
                    .$(", tableType=").$(tableType)
                    .$(']').$();

            if (isAdd) {
                // add entries carry a block of reserved longs after the fixed fields
                offset += TABLE_NAME_ENTRY_RESERVED_LONGS * Long.BYTES;
            }
        }
        LOG.advisoryW().$("table registry dump complete").$();
    }

    /**
     * Instance convenience overload of the static
     * {@code findLastTablesFileVersion(FilesFacade, Path, StringSink)} that
     * passes this store's own {@code nameSink} as the scratch sink.
     *
     * @param ff   files facade used to iterate the directory
     * @param path directory to scan
     * @return the value returned by the static overload
     */
    private int findLastTablesFileVersion(FilesFacade ff, Path path) {
        return findLastTablesFileVersion(ff, path, nameSink);
    }

    /**
     * Reads the table id and the WAL flag from the metadata file of the table
     * stored in {@code dirName} and encodes both in a single int.
     *
     * @param path    scratch path, overwritten with the metadata file location
     * @param dirName table directory name, relative to the database root
     * @param ff      files facade
     * @return 0 when the metadata file cannot be opened or the id cannot be read;
     * otherwise the table id, negated when the low byte of the WAL flag is non-zero
     */
    private int readTableId(Path path, CharSequence dirName, FilesFacade ff) {
        path.of(configuration.getDbRoot()).concat(dirName).concat(META_FILE_NAME);
        final long metaFd = ff.openRO(path.$());
        if (metaFd < 1) {
            return 0;
        }

        try {
            final int tableId = ff.readNonNegativeInt(metaFd, TableUtils.META_OFFSET_TABLE_ID);
            if (tableId < 0) {
                LOG.error().$("cannot read table id from metadata file [path=").$(path).I$();
                return 0;
            }
            // NOTE(review): a negative result of this read also leaves a non-zero low byte,
            // so it is treated as "WAL enabled" - confirm this is intended.
            final int walFlagBits = ff.readNonNegativeInt(metaFd, TableUtils.META_OFFSET_WAL_ENABLED) & 0xFF;
            final boolean walEnabled = ((byte) walFlagBits) != 0;
            return walEnabled ? -tableId : tableId;
        } finally {
            ff.close(metaFd);
        }
    }

    /**
     * Scans the database root for table directories that are not yet present in
     * {@code dirNameToTableTokenMap} and registers a token for each of them. Table
     * id, WAL flag and table name are read from the files of the table directory;
     * when the name cannot be read it is derived from the directory name.
     * <p>
     * When a discovered table name is already taken, the conflict is resolved via
     * {@code resolveTableNameConflict}; when it cannot be resolved the discovered
     * table is left out of the maps and a critical log line is written.
     *
     * @param tableNameToTableTokenMap map from table name to token, updated in place
     * @param dirNameToTableTokenMap   map from directory name to reverse map item, updated in place
     * @throws CairoException when the database root cannot be listed, or when reading
     *                        table files fails with an error other than "file cannot be read"
     */
    private void reloadFromRootDirectory(
            ConcurrentHashMap tableNameToTableTokenMap,
            ConcurrentHashMap dirNameToTableTokenMap
    ) {
        Path path = Path.getThreadLocal(configuration.getDbRoot());
        int plimit = path.size();
        FilesFacade ff = configuration.getFilesFacade();
        long findPtr = ff.findFirst(path.$());
        if (findPtr < 1) {
            // Same validation as findLastTablesFileVersion(): never hand an invalid
            // iterator to findName()/findNext()/findClose().
            throw CairoException.critical(0).put("could not list database root directory [path=").put(path).put(']');
        }
        try {
            Utf8StringSink dirNameSink = Misc.getThreadLocalUtf8Sink();
            do {
                if (ff.isDirOrSoftLinkDirNoDots(path, plimit, ff.findName(findPtr), ff.findType(findPtr), dirNameSink)) {
                    String dirName = Utf8s.toString(dirNameSink);
                    if (
                            !dirNameToTableTokenMap.containsKey(dirName)
                                    && TableUtils.exists(ff, path, configuration.getDbRoot(), dirNameSink) == TableUtils.TABLE_EXISTS
                    ) {
                        int tableId;
                        boolean isWal;
                        String tableName;

                        try {
                            // readTableId() encodes the WAL flag in the sign of the id
                            tableId = readTableId(path, dirName, ff);
                            isWal = tableId < 0;
                            tableId = Math.abs(tableId);
                            tableName = TableUtils.readTableName(path.of(configuration.getDbRoot()).concat(dirNameSink), plimit, tableNameRoMemory, ff);
                        } catch (CairoException e) {
                            if (e.errnoFileCannotRead()) {
                                // table is being removed.
                                continue;
                            } else {
                                throw e;
                            }
                        } finally {
                            tableNameRoMemory.close();
                        }

                        if (tableName == null) {
                            LOG.info().$("could not read table name, table will use directory name [dirName=").$(dirNameSink).I$();
                            tableName = Chars.toString(TableUtils.getTableNameFromDirName(dirName));
                        }

                        // NOTE(review): after Math.abs() above this condition appears to always hold - confirm.
                        if (tableId > -1L) {
                            boolean isProtected = tableFlagResolver.isProtected(tableName);
                            boolean isSystem = tableFlagResolver.isSystem(tableName);
                            boolean isPublic = tableFlagResolver.isPublic(tableName);
                            boolean isMatView = isMatViewDefinitionFileExists(configuration, path, dirName);
                            TableToken token = new TableToken(tableName, dirName, tableId, isMatView, isWal, isSystem, isProtected, isPublic);
                            TableToken existingTableToken = tableNameToTableTokenMap.get(tableName);

                            if (existingTableToken != null) {
                                // One of the tables can be in pending drop state.
                                if (!resolveTableNameConflict(tableNameToTableTokenMap, dirNameToTableTokenMap, token, existingTableToken, ff, path, plimit)) {
                                    // The conflict was not resolved, so the maps were left untouched:
                                    // log the token already in hand instead of looking it up again
                                    // and dereferencing a possibly missing entry.
                                    LOG.critical().$("duplicate table name found, table will not be available [dirName=").$(dirNameSink)
                                            .$(", name=").utf8(tableName)
                                            .$(", existingTableDir=").utf8(existingTableToken.getDirName())
                                            .I$();
                                }
                                continue;
                            }

                            tableNameToTableTokenMap.put(tableName, token);
                            dirNameToTableTokenMap.put(dirName, ReverseTableMapItem.of(token));
                        }
                    }
                }
            } while (ff.findNext(findPtr) > 0);
        } finally {
            ff.findClose(findPtr);
            if (longBuffer != 0) {
                // release the scratch buffer that conflict resolution may have allocated
                longBuffer = Unsafe.free(longBuffer, Long.BYTES, MemoryTag.NATIVE_DEFAULT);
            }
        }
    }

    /**
     * Loads the token maps from the latest version of the tables file.
     * <p>
     * The file is read through the writable mapping when the lock is held and
     * through the read-only mapping otherwise. Every entry is replayed in order:
     * remove entries drop the table name from the name map and either forget the
     * directory (when it no longer exists) or mark it as dropped; add entries
     * register a token when the table directory exists. A table name that is added
     * twice resets the registry and makes this method return false.
     * <p>
     * When the lock is held, {@code convertedTables} is applied on top and the file
     * is compacted when enough obsolete entries were seen or a conversion took
     * place. When the lock is not held, the read-only mapping is closed before this
     * method returns, on every exit path taken after the file has been mapped.
     *
     * @param tableNameToTableTokenMap map from table name to token, updated in place
     * @param dirNameToTableTokenMap   map from directory name to reverse map item, updated in place
     * @param convertedTables          optional list of converted table tokens, only used when the lock is held
     * @return true when the file was loaded; false when it does not exist (lock not held)
     * or when a duplicate name mapping forced the registry to be cleared
     */
    private boolean reloadFromTablesFile(
            ConcurrentHashMap tableNameToTableTokenMap,
            ConcurrentHashMap dirNameToTableTokenMap,
            @Nullable ObjList convertedTables
    ) {
        int lastFileVersion;
        FilesFacade ff = configuration.getFilesFacade();
        Path path = Path.getThreadLocal(configuration.getDbRoot());
        int plimit = path.size();

        MemoryMR memory = isLocked() ? tableNameMemory : tableNameRoMemory;
        do {
            lastFileVersion = findLastTablesFileVersion(ff, path.trimTo(plimit));
            path.trimTo(plimit).concat(TABLE_REGISTRY_NAME_FILE).putAscii('.').put(lastFileVersion).$();
            try {
                memory.smallFile(ff, path.$(), MemoryTag.MMAP_DEFAULT);
                LOG.info()
                        .$("reloading tables file [path=").$(path)
                        .$(", threadId=").$(Thread.currentThread().getId())
                        .I$();
                if (memory.size() >= 2 * Long.BYTES) {
                    break;
                }
            } catch (CairoException e) {
                if (!isLocked()) {
                    if (e.errnoFileCannotRead()) {
                        if (lastFileVersion == 0) {
                            // This is RO mode and file and tables.d.0 does not exist.
                            return false;
                        } else {
                            // This is RO mode and file we want to read was just swapped to new one by the RW instance.
                            continue;
                        }
                    }
                }
                throw e;
            }
        } while (true);

        try {
            // the first long of the file holds the offset at which the entries end
            long mapMem = memory.getLong(0);
            long currentOffset = Long.BYTES;
            memory.extend(mapMem);
            // sentinel count, large enough to trigger compaction regardless of the threshold
            int forceCompact = Integer.MAX_VALUE / 2;

            int tableToCompact = 0;
            while (currentOffset < mapMem) {
                // entry layout: operation, table name, dir name, table id, table type
                int operation = memory.getInt(currentOffset);
                currentOffset += Integer.BYTES;
                String tableName = Chars.toString(memory.getStrA(currentOffset));
                currentOffset += Vm.getStorageLength(tableName);
                String dirName = Chars.toString(memory.getStrA(currentOffset));
                currentOffset += Vm.getStorageLength(dirName);
                int tableId = memory.getInt(currentOffset);
                currentOffset += Integer.BYTES;
                int tableType = memory.getInt(currentOffset);
                currentOffset += Integer.BYTES;

                if (operation == OPERATION_REMOVE) {
                    TableToken token = tableNameToTableTokenMap.remove(tableName);
                    if (!ff.exists(path.trimTo(plimit).concat(dirName).$())) {
                        // table already fully removed
                        tableToCompact++;
                        dirNameToTableTokenMap.remove(dirName);
                    } else {
                        if (token == null) {
                            token = newTokenFromRegistryEntry(tableName, dirName, tableId, tableType);
                        }
                        // directory still exists: remember the table as dropped
                        dirNameToTableTokenMap.put(dirName, ReverseTableMapItem.ofDropped(token));
                    }
                } else {
                    assert operation == OPERATION_ADD;
                    if (TableUtils.exists(ff, path, configuration.getDbRoot(), dirName) != TableUtils.TABLE_EXISTS) {
                        // This can be BAU, remove record will follow
                        tableToCompact++;
                    } else {
                        final TableToken existing = tableNameToTableTokenMap.get(tableName);
                        if (existing != null) {
                            clearRegistryToReloadFromFileSystem(
                                    tableNameToTableTokenMap,
                                    dirNameToTableTokenMap,
                                    lastFileVersion,
                                    tableName,
                                    dirName,
                                    existing
                            );
                            return false;
                        }

                        final TableToken token = newTokenFromRegistryEntry(tableName, dirName, tableId, tableType);
                        tableNameToTableTokenMap.put(tableName, token);
                        if (!Chars.startsWith(token.getDirName(), token.getTableName())) {
                            // This table is renamed, log system to real table name mapping
                            LOG.debug().$("table dir name does not match logical name [table=").utf8(tableName).$(", dirName=").utf8(dirName).I$();
                        }
                        dirNameToTableTokenMap.put(token.getDirName(), ReverseTableMapItem.of(token));
                    }
                    // add entries carry a block of reserved longs after the fixed fields
                    currentOffset += TABLE_NAME_ENTRY_RESERVED_LONGS * Long.BYTES;
                }
            }

            if (isLocked()) {
                tableNameMemory.jumpTo(currentOffset);
                if (convertedTables != null) {
                    for (int i = 0, n = convertedTables.size(); i < n; i++) {
                        final TableToken token = convertedTables.get(i);
                        final TableToken existing = tableNameToTableTokenMap.get(token.getTableName());

                        if (existing != null && !Chars.equals(existing.getDirName(), token.getDirName())) {
                            // Table with different directory already exists pointing to the same name
                            clearRegistryToReloadFromFileSystem(
                                    tableNameToTableTokenMap,
                                    dirNameToTableTokenMap,
                                    lastFileVersion,
                                    token.getTableName(),
                                    token.getDirName(),
                                    existing
                            );
                            return false;
                        }

                        if (token.isWal()) {
                            tableNameToTableTokenMap.put(token.getTableName(), token);
                            dirNameToTableTokenMap.put(token.getDirName(), ReverseTableMapItem.of(token));
                        } else {
                            tableNameToTableTokenMap.remove(token.getTableName());
                            dirNameToTableTokenMap.remove(token.getDirName());
                        }

                        // Force the compaction
                        tableToCompact = forceCompact;
                    }
                }

                final int tableRegistryCompactionThreshold = configuration.getTableRegistryCompactionThreshold();
                if ((tableRegistryCompactionThreshold > -1 && tableToCompact > tableRegistryCompactionThreshold) || tableToCompact >= forceCompact) {
                    path.trimTo(plimit);
                    LOG.info().$("compacting tables file").$();
                    compactTableNameFile(tableNameToTableTokenMap, dirNameToTableTokenMap, lastFileVersion, ff, path);
                } else {
                    tableNameMemory.jumpTo(currentOffset);
                }
            }
            return true;
        } finally {
            if (!isLocked()) {
                // Release the read-only mapping on every exit path, including the
                // duplicate-mapping early return and exceptions thrown while parsing.
                tableNameRoMemory.close();
            }
        }
    }

    /**
     * Builds a table token from the fields of a tables file entry. Materialized
     * view entries are treated as WAL tables as well.
     */
    private TableToken newTokenFromRegistryEntry(String tableName, String dirName, int tableId, int tableType) {
        boolean isProtected = tableFlagResolver.isProtected(tableName);
        boolean isSystem = tableFlagResolver.isSystem(tableName);
        boolean isPublic = tableFlagResolver.isPublic(tableName);
        boolean isMatView = tableType == TableUtils.TABLE_TYPE_MAT;
        boolean isWal = tableType == TableUtils.TABLE_TYPE_WAL || isMatView;
        return new TableToken(tableName, dirName, tableId, isMatView, isWal, isSystem, isProtected, isPublic);
    }

    /**
     * Decides what to do when a table discovered on disk carries a name that is
     * already registered for another directory.
     * <p>
     * When the registered table is a WAL table in pending drop state, it is
     * re-registered as dropped and the new table takes over the name. Otherwise
     * the maps are left untouched and the result tells whether the new table is
     * itself a WAL table in pending drop state.
     *
     * @param tableNameToTableTokenMap map from table name to token
     * @param dirNameToTableTokenMap   map from directory name to reverse map item
     * @param newToken                 token of the table discovered on disk
     * @param existingTableToken       token currently registered under the same name
     * @param ff                       files facade
     * @param path                     scratch path
     * @param plimit                   length of the database root prefix inside {@code path}
     * @return true when the conflict is resolved, false when both tables claim the name
     */
    private boolean resolveTableNameConflict(
            ConcurrentHashMap tableNameToTableTokenMap,
            ConcurrentHashMap dirNameToTableTokenMap,
            TableToken newToken,
            TableToken existingTableToken,
            FilesFacade ff,
            Path path,
            int plimit
    ) {
        final boolean existingDropped = existingTableToken.isWal()
                && checkWalTableInPendingDropState(existingTableToken, ff, path, plimit);

        if (existingDropped) {
            // the registered table is partially dropped: take it out of both maps
            tableNameToTableTokenMap.remove(existingTableToken.getTableName());
            dirNameToTableTokenMap.remove(existingTableToken.getDirName());

            // keep its directory on record, marked as pending drop
            dirNameToTableTokenMap.put(existingTableToken.getDirName(), ReverseTableMapItem.ofDropped(existingTableToken));

            // the new table takes over the name
            tableNameToTableTokenMap.put(newToken.getTableName(), newToken);
            dirNameToTableTokenMap.put(newToken.getDirName(), ReverseTableMapItem.of(newToken));
            return true;
        }

        // The registered table stays; the new table is not added. The conflict counts
        // as resolved only when the new table is a WAL table in pending drop state.
        return newToken.isWal() && checkWalTableInPendingDropState(newToken, ff, path, plimit);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy