
// io.questdb.cairo.O3PartitionPurgeJob (Maven / Gradle / Ivy artifact listing)
/*******************************************************************************
* ___ _ ____ ____
* / _ \ _ _ ___ ___| |_| _ \| __ )
* | | | | | | |/ _ \/ __| __| | | | _ \
* | |_| | |_| | __/\__ \ |_| |_| | |_) |
* \__\_\\__,_|\___||___/\__|____/|____/
*
* Copyright (c) 2014-2019 Appsicle
* Copyright (c) 2019-2024 QuestDB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package io.questdb.cairo;
import io.questdb.cairo.sql.TableReferenceOutOfDateException;
import io.questdb.cairo.wal.WalUtils;
import io.questdb.log.Log;
import io.questdb.log.LogFactory;
import io.questdb.mp.AbstractQueueConsumerJob;
import io.questdb.std.*;
import io.questdb.std.datetime.DateFormat;
import io.questdb.std.datetime.millitime.DateFormatUtils;
import io.questdb.std.str.Path;
import io.questdb.std.str.Utf8StringSink;
import io.questdb.std.str.Utf8s;
import io.questdb.tasks.O3PartitionPurgeTask;
import java.io.Closeable;
import java.util.concurrent.atomic.AtomicBoolean;
import static io.questdb.cairo.TableUtils.TXN_FILE_NAME;
public class O3PartitionPurgeJob extends AbstractQueueConsumerJob implements Closeable {
private final static Log LOG = LogFactory.getLog(O3PartitionPurgeJob.class);
private final CairoConfiguration configuration;
private final CairoEngine engine;
// One scratch sink per worker, indexed by worker id; receives directory entry names.
private final Utf8StringSink[] fileNameSinks;
// Flipped exactly once by close() so the per-worker resources are freed at most once.
private final AtomicBoolean halted = new AtomicBoolean(false);
// Per-worker resources, indexed by worker id. The element types are spelled out (instead of
// raw ObjList) so that get(workerId) yields the concrete type the purge methods expect.
private final ObjList<DirectLongList> partitionList;
private final ObjList<TxReader> txnReaders;
private final ObjList<TxnScoreboard> txnScoreboards;
public O3PartitionPurgeJob(CairoEngine engine, int workerCount) {
super(engine.getMessageBus().getO3PurgeDiscoveryQueue(), engine.getMessageBus().getO3PurgeDiscoverySubSeq());
try {
this.engine = engine;
this.configuration = engine.getMessageBus().getConfiguration();
this.fileNameSinks = new Utf8StringSink[workerCount];
this.partitionList = new ObjList<>(workerCount);
this.txnScoreboards = new ObjList<>(workerCount);
this.txnReaders = new ObjList<>(workerCount);
for (int i = 0; i < workerCount; i++) {
fileNameSinks[i] = new Utf8StringSink();
partitionList.add(new DirectLongList(configuration.getPartitionPurgeListCapacity() * 2L, MemoryTag.NATIVE_O3));
txnScoreboards.add(new TxnScoreboard(configuration.getFilesFacade(), configuration.getTxnScoreboardEntryCount()));
txnReaders.add(new TxReader(configuration.getFilesFacade()));
}
} catch (Throwable th) {
close();
throw th;
}
}
/**
 * Frees the per-worker partition lists, txn readers and txn scoreboards.
 * Only the first call does any work; later calls return immediately.
 */
@Override
public void close() {
    if (!halted.compareAndSet(false, true)) {
        // already closed
        return;
    }
    Misc.freeObjList(partitionList);
    Misc.freeObjList(txnReaders);
    Misc.freeObjList(txnScoreboards);
}
/**
 * Parses a directory name of the form {@code <partition date>[.<version>]} and, when both parts
 * are valid, appends the pair (version key, partition timestamp) to {@code partitionList}.
 * <p>
 * The version key is the numeric suffix plus one; a name without a suffix gets key 0. Shifting by
 * one keeps every key non-negative, so a pair can be sorted as two unsigned 64-bit values, and the
 * key lines up with the txn numbers used for scoreboard range checks by the callers.
 * <p>
 * Names that cannot be parsed add nothing to the list and are only logged.
 *
 * @param fileNameSink      directory name to parse
 * @param partitionList     receives two longs per recognised directory
 * @param tableName         used in log messages only
 * @param partitionByFormat format used to parse the date part of the name
 */
private static void parsePartitionDateVersion(
        Utf8StringSink fileNameSink,
        DirectLongList partitionList,
        CharSequence tableName,
        DateFormat partitionByFormat
) {
    final int lastDot = Utf8s.lastIndexOfAscii(fileNameSink, '.');
    final int nameLen = fileNameSink.size();
    // the date part ends at the last dot, or at the end of the name when there is no dot
    final int dateEnd = lastDot < 0 ? nameLen : lastDot;

    // Step 1: version key. 0 stands for "no version suffix" (logically -1).
    long versionKey = 0;
    if (dateEnd < nameLen) {
        try {
            versionKey = Numbers.parseLong(fileNameSink, dateEnd + 1, nameLen) + 1;
        } catch (NumericException e) {
            LOG.error().$("unknown directory [table=").utf8(tableName).$(", dir=").$(fileNameSink).I$();
            return;
        }
    }
    partitionList.add(versionKey);

    // Step 2: partition timestamp. On failure the version key added above is rolled back.
    try {
        final long partitionTs = partitionByFormat.parse(
                fileNameSink.asAsciiCharSequence(),
                0,
                dateEnd,
                DateFormatUtils.EN_LOCALE
        );
        partitionList.add(partitionTs);
    } catch (NumericException e) {
        // WAL and sequencer directories are expected here and not worth a log line
        final boolean knownNonPartitionDir = Utf8s.startsWithAscii(fileNameSink, WalUtils.WAL_NAME_BASE)
                || Utf8s.equalsAscii(WalUtils.SEQ_DIR, fileNameSink)
                || Utf8s.equalsAscii("seq", fileNameSink);
        if (!knownNonPartitionDir) {
            LOG.info().$("unknown directory [table=").utf8(tableName).$(", dir=").$(fileNameSink).I$();
        }
        partitionList.setPos(partitionList.size() - 1);
    }
}
/**
 * Scans one table directory for partition folders and purges the folder versions that are no
 * longer needed.
 * <p>
 * Each sub-directory name is handed to {@code parsePartitionDateVersion()}, which appends a
 * (name version + 1, partition timestamp) pair to {@code partitionList} for every name it can
 * parse. The pairs are then sorted and walked in runs that share a partition timestamp. A run is
 * passed to {@code processPartition()} when it contains more than one folder, or when
 * {@code txReader} does not report the partition.
 * <p>
 * {@code TableReferenceOutOfDateException} and {@code CairoException} raised while opening the
 * table state or purging are logged and not rethrown. Once that phase has started,
 * {@code txReader} and {@code txnScoreboard} are cleared in a finally block.
 *
 * @param ff            files facade used to enumerate the table directory
 * @param fileNameSink  scratch sink that receives each directory entry name
 * @param partitionList scratch list; cleared on entry, then filled with two longs per folder
 * @param root          root path under which the table directory is resolved
 * @param tableToken    table to process
 * @param txnScoreboard scoreboard instance, opened here with {@code ofRO()}
 * @param txReader      txn reader instance, opened here with {@code ofRO()}
 * @param partitionBy   partitioning mode; selects the directory name format
 */
private void discoverPartitions(
FilesFacade ff,
Utf8StringSink fileNameSink,
DirectLongList partitionList,
CharSequence root,
TableToken tableToken,
TxnScoreboard txnScoreboard,
TxReader txReader,
int partitionBy
) {
LOG.info().$("processing [table=").utf8(tableToken.getDirName()).I$();
Path path = Path.getThreadLocal(root).concat(tableToken);
// length of the table directory path; the path is trimmed back to it after each entry
int plimit = path.size();
partitionList.clear();
DateFormat partitionByFormat = PartitionBy.getPartitionDirFormatMethod(partitionBy);
long p = ff.findFirst(path.$());
// a non-positive value means there is nothing to iterate
if (p > 0) {
try {
do {
if (ff.isDirOrSoftLinkDirNoDots(path, plimit, ff.findName(p), ff.findType(p), fileNameSink)) {
parsePartitionDateVersion(fileNameSink, partitionList, tableToken.getDirName(), partitionByFormat);
path.trimTo(plimit).$();
}
} while (ff.findNext(p) > 0);
} finally {
ff.findClose(p);
}
}
// Find partitions that have more than one folder on disk.
// The list holds pairs, so its size must be even.
assert partitionList.size() % 2 == 0;
// NOTE(review): the grouping loop below relies on this sort ordering pairs by timestamp first
// and by version second - confirm against Vect.sort128BitAscInPlace
Vect.sort128BitAscInPlace(partitionList.getAddress(), partitionList.size() / 2);
// timestamp of the run currently being collected; LONG_NULL until the first pair is seen
long partitionTimestamp = Numbers.LONG_NULL;
// index of the first element of the current run
int lo = 0;
int n = (int) partitionList.size();
path.of(root).concat(tableToken);
int tableRootLen = path.size();
try {
txnScoreboard.ofRO(path);
txReader.ofRO(path.trimTo(tableRootLen).concat(TXN_FILE_NAME).$(), partitionBy);
TableUtils.safeReadTxn(txReader, configuration.getMillisecondClock(), configuration.getSpinLockTimeout());
for (int i = 0; i < n; i += 2) {
long currentPartitionTs = partitionList.get(i + 1);
if (currentPartitionTs != partitionTimestamp) {
// The run [lo, i) has ended. Process it when it has more than one pair (i > lo + 2),
// or when txReader has no attached partition for its timestamp.
// i > 0 skips the very first iteration, where no run has been collected yet.
if (i > lo + 2 ||
(i > 0 && txReader.findAttachedPartitionRawIndexByLoTimestamp(partitionTimestamp) < 0)) {
processPartition(
tableToken,
ff,
path,
tableRootLen,
txReader,
txnScoreboard,
partitionTimestamp,
partitionBy,
partitionList,
lo,
i
);
}
lo = i;
partitionTimestamp = currentPartitionTs;
}
}
// Tail: the last run [lo, n) is not closed by a timestamp change, handle it here
if (n > lo + 2 || txReader.getPartitionRowCountByTimestamp(partitionTimestamp) < 0) {
processPartition(
tableToken,
ff,
path,
tableRootLen,
txReader,
txnScoreboard,
partitionTimestamp,
partitionBy,
partitionList,
lo,
n
);
}
} catch (TableReferenceOutOfDateException e) {
// The table was dropped and recreated after processing started.
// Abort processing of this table.
LOG.info().$("table reference out of date, aborting [table=").$(tableToken.getDirName()).I$();
} catch (CairoException ex) {
// The table may have been dropped while this async job was waiting in the queue,
// so this is not necessarily serious. Log the error and continue with the queue.
LOG.error()
.$("could not purge partition open [table=`").utf8(tableToken.getDirName())
.$("`, ex=").$(ex.getFlyweightMessage())
.$(", errno=").$(ex.getErrno())
.I$();
LOG.error().$(ex.getFlyweightMessage()).$();
} finally {
txReader.clear();
txnScoreboard.clear();
}
LOG.info().$("processed [table=").$(tableToken).I$();
}
/**
 * Purges the folders of a partition that the txn file does not list (dropped or not fully
 * committed), newest version first.
 * <p>
 * A folder is removed only while the scoreboard reports no readers between the folder's version
 * key and the current upper bound. The bound starts at the last committed txn and drops to the
 * key of each folder that gets purged. The walk stops at the first folder that cannot be purged.
 *
 * @param lo index of the first element of this partition's run in {@code partitionList}
 * @param hi index one past the last element of the run
 */
private void processDetachedPartition(
        TableToken tableToken,
        FilesFacade ff,
        Path path,
        int tableRootLen,
        TxReader txReader,
        TxnScoreboard txnScoreboard,
        long partitionTimestamp,
        int partitionBy,
        DirectLongList partitionList,
        int lo,
        int hi
) {
    long upperTxn = txReader.getTxn();
    // the run holds (version key, timestamp) pairs; step over version keys from the top down
    for (int idx = hi - 2; idx >= lo; idx -= 2) {
        final long nameTxn = partitionList.get(idx);
        // If the last committed txn is 4, TableWriter can write folders ending in .4 and .3.
        // A folder .2 (nameTxn == 3) can go once upperTxn > 3, i.e. when nameTxn < upperTxn.
        final boolean purgeable = nameTxn < upperTxn && txnScoreboard.isRangeAvailable(nameTxn, upperTxn);

        // nameTxn - 1 undoes the +1 applied when the version was parsed from the folder name,
        // see parsePartitionDateVersion()
        path.trimTo(tableRootLen);
        TableUtils.setPathForPartition(path, partitionBy, partitionTimestamp, nameTxn - 1);
        path.$();

        // log the path starting from the table directory name
        final int logFrom = tableRootLen - tableToken.getDirNameUtf8().size() - 1;
        if (!purgeable) {
            LOG.debug().$("cannot purge partition directory, locked for reading [path=").$substr(logFrom, path).I$();
            return;
        }
        purgePartition(tableToken, ff, path, logFrom, "purging dropped partition directory [path=");
        upperTxn = nameTxn;
    }
}
/**
 * Routes one partition's run of folders to the right purge routine: partitions that
 * {@code txReader} reports as attached go to {@code processPartition0()}, all others go to
 * {@code processDetachedPartition()}.
 *
 * @param lo index of the first element of this partition's run in {@code partitionList}
 * @param hi index one past the last element of the run
 */
private void processPartition(
        TableToken tableToken,
        FilesFacade ff,
        Path path,
        int tableRootLen,
        TxReader txReader,
        TxnScoreboard txnScoreboard,
        long partitionTimestamp,
        int partitionBy,
        DirectLongList partitionList,
        int lo,
        int hi
) {
    final boolean missingFromTxnFile = txReader.findAttachedPartitionRawIndexByLoTimestamp(partitionTimestamp) < 0;
    if (missingFromTxnFile) {
        processDetachedPartition(
                tableToken, ff, path, tableRootLen, txReader, txnScoreboard,
                partitionTimestamp, partitionBy, partitionList, lo, hi
        );
        return;
    }
    processPartition0(
            tableToken, ff, path, tableRootLen, txReader, txnScoreboard,
            partitionTimestamp, partitionBy, partitionList, lo, hi
    );
}
/**
 * Purges superseded folder versions of a partition that is present in the txn file.
 * <p>
 * Folders are visited in ascending version order as neighbouring pairs (older, newer). The older
 * folder is removed when the scoreboard reports no readers between its version key and the newer
 * key, where the newer key is capped at the committed name txn + 1. Folders that are still
 * locked are skipped and the walk continues.
 *
 * @param lo index of the first element of this partition's run in {@code partitionList}
 * @param hi index one past the last element of the run
 */
private void processPartition0(
        TableToken tableToken,
        FilesFacade ff,
        Path path,
        int tableRootLen,
        TxReader txReader,
        TxnScoreboard txnScoreboard,
        long partitionTimestamp,
        int partitionBy,
        DirectLongList partitionList,
        int lo,
        int hi
) {
    final long lastCommittedPartitionName = txReader.getPartitionNameTxnByPartitionTimestamp(partitionTimestamp);
    if (lastCommittedPartitionName < 0) {
        return;
    }
    assert hi <= partitionList.size();

    // Every folder is a (version key, timestamp) pair. Start at the second pair and decide
    // whether the pair right before it can be deleted.
    for (int next = lo + 2; next < hi; next += 2) {
        final long newerVersion = Math.min(lastCommittedPartitionName + 1, partitionList.get(next));
        final long olderVersion = partitionList.get(next - 2);
        final boolean purgeable = olderVersion < newerVersion
                && txnScoreboard.isRangeAvailable(olderVersion, newerVersion);

        // olderVersion - 1 undoes the +1 applied when the version was parsed from the folder
        // name, see parsePartitionDateVersion()
        path.trimTo(tableRootLen);
        TableUtils.setPathForPartition(path, partitionBy, partitionTimestamp, olderVersion - 1);
        path.$();

        // log the path starting from the table directory name
        final int logFrom = tableRootLen - tableToken.getDirNameUtf8().size() - 1;
        if (!purgeable) {
            LOG.info().$("cannot purge overwritten partition directory, locked for reading path=").$substr(logFrom, path).I$();
            continue;
        }
        engine.getPartitionOverwriteControl().notifyPartitionMutates(tableToken, partitionTimestamp, olderVersion - 1, 0);
        purgePartition(tableToken, ff, path, logFrom, "purging overwritten partition directory [path=");
    }
}
/**
 * Removes the directory at {@code path} while holding the engine's table-create lock for the
 * table, after checking that {@code tableToken} is still the current token.
 *
 * @param tableToken token captured when processing of the table started
 * @param ff         files facade used to remove the directory
 * @param path       directory to remove
 * @param pathFrom   offset into {@code path} from which it is logged
 * @param message    log message prefix printed before the path
 * @throws TableReferenceOutOfDateException when the lock cannot be taken or the engine returns a
 *                                          different token for the table
 */
private void purgePartition(TableToken tableToken, FilesFacade ff, Path path, int pathFrom, String message) {
    if (!engine.lockTableCreate(tableToken)) {
        // table is dropped and recreated since we started processing it; abort
        throw new TableReferenceOutOfDateException();
    }
    try {
        // identity comparison: the token must be the very same instance
        if (engine.getUpdatedTableToken(tableToken) != tableToken) {
            // table is dropped and recreated since we started processing it; abort
            throw new TableReferenceOutOfDateException();
        }
        LOG.info().$(message).$substr(pathFrom, path).I$();
        ff.unlinkOrRemove(path, LOG);
    } finally {
        engine.unlockTableCreate(tableToken);
    }
}
/**
 * Tells the job runner whether purging may proceed right now.
 *
 * @return {@code false} while a database checkpoint is in progress, {@code true} otherwise
 */
@Override
protected boolean canRun() {
    final boolean checkpointInProgress = engine.getCheckpointStatus().isInProgress();
    return !checkpointInProgress;
}
/**
 * Processes one purge task from the queue using the resource set that belongs to
 * {@code workerId}.
 * <p>
 * The queue slot is released in a {@code finally} block, so a failure that escapes
 * {@code discoverPartitions()} still propagates to the caller but no longer leaves the cursor
 * unreleased.
 *
 * @param workerId  index of the calling worker; selects the per-worker resources
 * @param cursor    queue cursor of the task to process
 * @param runStatus not used by this job
 * @return always {@code true}
 */
@Override
protected boolean doRun(int workerId, long cursor, RunStatus runStatus) {
    try {
        final O3PartitionPurgeTask task = queue.get(cursor);
        discoverPartitions(
                configuration.getFilesFacade(),
                fileNameSinks[workerId],
                partitionList.get(workerId),
                configuration.getRoot(),
                task.getTableToken(),
                txnScoreboards.get(workerId),
                txnReaders.get(workerId),
                task.getPartitionBy()
        );
    } finally {
        // release the slot on success and on failure alike
        subSeq.done(cursor);
    }
    return true;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy