
// io.questdb.cairo.wal.WalPurgeJob Maven / Gradle / Ivy
/*******************************************************************************
* ___ _ ____ ____
* / _ \ _ _ ___ ___| |_| _ \| __ )
* | | | | | | |/ _ \/ __| __| | | | _ \
* | |_| | |_| | __/\__ \ |_| |_| | |_) |
* \__\_\\__,_|\___||___/\__|____/|____/
*
* Copyright (c) 2014-2019 Appsicle
* Copyright (c) 2019-2024 QuestDB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package io.questdb.cairo.wal;
import io.questdb.cairo.*;
import io.questdb.cairo.wal.seq.TableSequencerAPI;
import io.questdb.cairo.wal.seq.TransactionLogCursor;
import io.questdb.log.Log;
import io.questdb.log.LogFactory;
import io.questdb.log.LogRecord;
import io.questdb.mp.SimpleWaitingLock;
import io.questdb.mp.SynchronizedJob;
import io.questdb.std.*;
import io.questdb.std.datetime.microtime.MicrosecondClock;
import io.questdb.std.datetime.millitime.MillisecondClock;
import io.questdb.std.str.DirectUtf8StringZ;
import io.questdb.std.str.LPSZ;
import io.questdb.std.str.Path;
import io.questdb.std.str.Utf8Sequence;
import java.io.Closeable;
public class WalPurgeJob extends SynchronizedJob implements Closeable {
private static final Log LOG = LogFactory.getLog(WalPurgeJob.class);
// Cached reference to the per-table sweep method; handed to forAllWalTables() on every run.
private final TableSequencerAPI.TableSequencerCallback broadSweepRef;
// Minimum distance, in clock ticks, between two sweeps (configured WAL purge interval * 1000).
private final long checkInterval;
private final MicrosecondClock clock;
private final CairoConfiguration configuration;
private final CairoEngine engine;
private final FilesFacade ff;
// Decides what can be deleted; the actual deletion is delegated to a Deleter.
private final Logic logic;
private final MillisecondClock millisecondClock;
// WAL ids found on disk for the table currently being swept.
private final IntHashSet onDiskWalIDSet = new IntHashSet();
// Shared, reusable path; almost every helper in this class overwrites it.
private final Path path = new Path();
private final SimpleWaitingLock runLock = new SimpleWaitingLock();
private final long spinLockTimeout;
// Scratch set passed to forAllWalTables(); typed explicitly instead of using a raw ObjHashSet.
private final ObjHashSet<TableToken> tableTokenBucket = new ObjHashSet<>();
private final TxReader txReader;
private final WalDirectoryPolicy walDirectoryPolicy;
// Flyweight over the native, zero-terminated name of the directory entry being inspected.
private final DirectUtf8StringZ walName = new DirectUtf8StringZ();
// Clock reading taken at the start of the most recent sweep attempt.
private long last = 0;
// Table currently being swept; set at the start of each per-table sweep.
private TableToken tableToken;
public WalPurgeJob(CairoEngine engine, FilesFacade ff, MicrosecondClock clock) {
this.engine = engine;
this.ff = ff;
this.clock = clock;
this.checkInterval = engine.getConfiguration().getWalPurgeInterval() * 1000;
this.millisecondClock = engine.getConfiguration().getMillisecondClock();
this.spinLockTimeout = engine.getConfiguration().getSpinLockTimeout();
this.txReader = new TxReader(ff);
this.broadSweepRef = this::broadSweep;
this.walDirectoryPolicy = engine.getWalDirectoryPolicy();
// some code here assumes that WAL_NAME_BASE is "wal", this is to fail the tests if it is not
//noinspection ConstantConditions
assert WalUtils.WAL_NAME_BASE.equals("wal");
configuration = engine.getConfiguration();
logic = new Logic(new FsDeleter(), engine.getConfiguration().getWalPurgeWaitBeforeDelete());
}
/**
 * Creates a purge job that takes its files facade and microsecond clock
 * from the engine configuration.
 *
 * @param engine engine whose WAL tables are swept
 */
public WalPurgeJob(CairoEngine engine) {
    this(
            engine,
            engine.getConfiguration().getFilesFacade(),
            engine.getConfiguration().getMicrosecondClock()
    );
}
/**
 * Closes the transaction reader and the shared path owned by this job.
 */
@Override
public void close() {
    txReader.close();
    this.path.close();
}
/**
 * Postpones the first run of this job by half of the configured interval,
 * so that its work is spread more evenly across other timer-based jobs
 * that tick at a similar cadence.
 */
public void delayByHalfInterval() {
    final long now = clock.getTicks();
    final long halfInterval = checkInterval / 2;
    // Pretend the last run happened half an interval ago.
    last = now - halfInterval;
}
/**
 * Returns the lock this job tries to take before each sweep; while another
 * party holds it, the sweep is skipped.
 */
public SimpleWaitingLock getRunLock() {
    return this.runLock;
}
/**
 * Validate equivalent of "^\d+$" regex: the name must consist of at least
 * one byte and every byte must be an ASCII digit.
 *
 * @param name name to check
 * @return true when the name is a non-empty run of ASCII digits
 */
private static boolean matchesNumberPattern(Utf8Sequence name) {
    final int n = name.size();
    if (n == 0) {
        // "\d+" requires at least one digit; an empty name is not a number.
        return false;
    }
    for (int i = 0; i < n; i++) {
        final byte b = name.byteAt(i);
        if (b < '0' || b > '9') {
            return false;
        }
    }
    return true;
}
/**
 * Checks the name against the equivalent of the "^wal\d+$" regex.
 *
 * @param name name to check
 * @return true when the name is "wal" followed by one or more ASCII digits
 */
private static boolean matchesWalNamePattern(Utf8Sequence name) {
    final int size = name.size();
    // Need the prefix plus at least one digit.
    if (size <= WalUtils.WAL_NAME_BASE.length()) {
        return false;
    }

    final boolean hasWalPrefix = name.byteAt(0) == 'w'
            && name.byteAt(1) == 'a'
            && name.byteAt(2) == 'l';
    if (!hasWalPrefix) {
        return false;
    }

    int pos = 3;
    while (pos < size) {
        final byte digit = name.byteAt(pos++);
        if (digit < '0' || digit > '9') {
            // Suffix is not a number.
            return false;
        }
    }
    return true;
}
/**
 * Runs the per-table sweep for every WAL table known to the sequencer API
 * (dropped tables included), deleting closed WAL segments that are no
 * longer needed.
 */
private void broadSweep() {
    final TableSequencerAPI sequencerApi = engine.getTableSequencerAPI();
    sequencerApi.forAllWalTables(tableTokenBucket, true, broadSweepRef);
}
/**
 * Sweeps a single WAL table. Discovers the WAL/segment directories and
 * sequencer parts on disk, asks the sequencer which segments are still to be
 * applied, lets {@link Logic} delete what is no longer needed and, when the
 * table is dropped and nothing is pending, removes the table directory and
 * deregisters the table token.
 * <p>
 * A {@link CairoException} is logged and swallowed so that one failing table
 * does not stop the sweep of the others.
 *
 * @param tableId    table id supplied by the callback; not used here
 * @param tableToken table to sweep
 * @param lastTxn    value supplied by the callback; a negative value makes this
 *                   method check whether the table is dropped
 */
private void broadSweep(int tableId, final TableToken tableToken, long lastTxn) {
    try {
        this.tableToken = tableToken;
        this.logic.reset(tableToken);
        onDiskWalIDSet.clear();

        boolean tableDropped = false;
        try {
            // Discovery takes lock descriptors and hands them to `logic`. If any step
            // up to and including the sequencer lookup fails, release them here,
            // otherwise they would stay open until the process exits.
            discoverWalSegments();
            final int seqPartsCount = discoverSequencerParts();
            if (logic.hasOnDiskSegments()) {
                tableDropped = fetchSequencerPairs(seqPartsCount);
            }
        } catch (Throwable th) {
            logic.releaseLocks();
            throw th;
        }

        if (logic.hasOnDiskSegments()) {
            // Any of the calls above may leave outstanding `discoveredWalIds` that are still on the filesystem
            // and don't have any active segments. Any unlocked walNNN directories may be deleted if they don't have
            // pending segments that are yet to be applied to the table.
            // Note that this also handles cases where a wal directory was created shortly before a crash and thus
            // never recorded and tracked by the sequencer for that table.
            logic.run();
        }

        if (tableDropped || (lastTxn < 0 && engine.isTableDropped(tableToken))) {
            if (logic.hasPendingTasks()) {
                LOG.info().$("table is dropped, but has WALs containing segments with pending tasks ")
                        .$("[tableDir=").$(tableToken.getDirName()).I$();
            } else if (
                    TableUtils.exists(
                            ff,
                            Path.getThreadLocal(""),
                            configuration.getRoot(),
                            tableToken.getDirName()
                    ) != TableUtils.TABLE_EXISTS
            ) {
                // Fully deregister the table
                Path pathToDelete = Path.getThreadLocal(configuration.getRoot()).concat(tableToken);
                Path symLinkTarget = null;
                // Check the table directory itself. The shared `path` field holds whatever
                // the last helper left in it and must not be used for this test.
                if (ff.isSoftLink(pathToDelete.$())) {
                    symLinkTarget = Path.getThreadLocal2("");
                    if (!ff.readLink(pathToDelete, symLinkTarget)) {
                        symLinkTarget = null;
                    }
                }
                boolean fullyDeleted = ff.rmdir(pathToDelete, false);
                if (symLinkTarget != null) {
                    ff.rmdir(symLinkTarget, false);
                }

                // Sometimes on Windows sequencer files can be open at this point,
                // wait for them to be closed before fully removing the token from name registry
                // and marking table as fully deleted.
                if (fullyDeleted) {
                    engine.removeTableToken(tableToken);
                    LOG.info().$("table is fully dropped [tableDir=").$(pathToDelete).I$();
                    // Turn the table path into its lock file name and remove that file as well.
                    TableUtils.lockName(pathToDelete);
                    ff.removeQuiet(pathToDelete.$());
                } else {
                    LOG.info().$("could not fully remove table, some files left on the disk [tableDir=").$(pathToDelete).I$();
                }
            } else {
                LOG.info().$("table is not fully dropped, pinging WAL Apply job to delete table files [tableDir=").$(tableToken.getDirName()).I$();
                // Ping ApplyWal2TableJob to clean up the table files
                engine.notifyWalTxnCommitted(tableToken);
            }
        }
    } catch (CairoException ce) {
        LOG.error().$("broad sweep failed [table=").$(tableToken)
                .$(", msg=").$((Throwable) ce)
                .$(", errno=").$(ff.errno()).$(']').$();
    }
}
/**
 * Lists the sequencer parts directory of the current table and registers
 * every file with an all-digit name as a sequencer part with {@link Logic}.
 *
 * @return number of sequencer parts registered
 */
private int discoverSequencerParts() {
    final LPSZ partsDir = setSeqPartPath(tableToken).$();
    if (!ff.exists(partsDir)) {
        return 0;
    }

    final long findPtr = ff.findFirst(partsDir);
    if (findPtr <= 0) {
        return 0;
    }

    int registered = 0;
    try {
        do {
            final int entryType = ff.findType(findPtr);
            final long namePtr = ff.findName(findPtr);
            if (entryType != Files.DT_FILE || !matchesNumberPattern(walName.of(namePtr))) {
                continue;
            }
            try {
                logic.trackSeqPart(Numbers.parseInt(walName));
                registered++;
            } catch (NumericException ignored) {
                // Name does not parse as a part number, skip the entry.
            }
        } while (ff.findNext(findPtr) > 0);
    } finally {
        ff.findClose(findPtr);
    }
    return registered;
}
/**
 * Scans the directory of the current table for "walNNN" directories and, inside
 * each, for numeric segment directories. For every WAL and segment found it tries
 * to take the corresponding lock and registers the (walId, segmentId, lockFd)
 * triplet with {@link Logic}. A negative lockFd marks the entry as busy.
 * <p>
 * Also records with {@link Logic} whether the sequencer directory is in use.
 * <p>
 * Note: the local {@code path} is the shared path field; most helpers called in
 * the loops overwrite it, hence the repeated {@code trimTo()} calls.
 */
private void discoverWalSegments() {
    Path path = setTablePath(tableToken);
    long p = ff.findFirst(path.$());
    int rootPathLen = path.size();
    logic.sequencerHasPendingTasks(sequencerHasPendingTasks());
    if (p > 0) {
        try {
            do {
                int type = ff.findType(p);
                long pUtf8NameZ = ff.findName(p);
                if (type == Files.DT_DIR && matchesWalNamePattern(walName.of(pUtf8NameZ))) {
                    try {
                        // Skip the 3-byte "wal" prefix, the rest is the WAL id.
                        final int walId = Numbers.parseInt(walName, 3, walName.size());
                        onDiskWalIDSet.add(walId);
                        long walLockFd = TableUtils.lock(ff, setWalLockPath(tableToken, walId).$(), false);
                        boolean walHasPendingTasks = false;

                        // Search for segments.
                        path.trimTo(rootPathLen).concat(pUtf8NameZ);
                        final int walPathLen = path.size();
                        final long sp = ff.findFirst(path.$());
                        if (sp > 0) {
                            try {
                                do {
                                    type = ff.findType(sp);
                                    pUtf8NameZ = ff.findName(sp);
                                    if (type == Files.DT_DIR && matchesNumberPattern(walName.of(pUtf8NameZ))) {
                                        try {
                                            final int segmentId = Numbers.parseInt(walName);
                                            if ((segmentId < WalUtils.SEG_MIN_ID) || (segmentId > WalUtils.SEG_MAX_ID)) {
                                                throw NumericException.INSTANCE;
                                            }
                                            path.trimTo(walPathLen);
                                            final Path segmentPath = setSegmentLockPath(tableToken, walId, segmentId);
                                            long lockFd = TableUtils.lock(ff, segmentPath.$(), false);
                                            if (lockFd > -1) {
                                                final boolean pendingTasks = segmentHasPendingTasks(walId, segmentId);
                                                if (pendingTasks) {
                                                    // Treat is as being locked.
                                                    ff.close(lockFd);
                                                    lockFd = -1;
                                                }
                                            }
                                            walHasPendingTasks |= lockFd < 0;
                                            logic.trackDiscoveredSegment(walId, segmentId, lockFd);
                                        } catch (NumericException ne) {
                                            // Non-Segment directory, ignore.
                                        }
                                    }
                                } while (ff.findNext(sp) > 0);
                            } finally {
                                ff.findClose(sp);
                            }
                        } else {
                            // The WAL directory could not be listed, so the find handle is not
                            // usable. Its segments are unknown: be conservative, treat the WAL
                            // as busy and leave it for a later run.
                            walHasPendingTasks = true;
                        }

                        if (walLockFd > -1 && walHasPendingTasks) {
                            // WAL dir cannot be deleted, there are busy segments.
                            // Unlock it.
                            ff.close(walLockFd);
                            walLockFd = -1;
                        }
                        logic.trackDiscoveredWal(walId, walLockFd);
                    } catch (NumericException ne) {
                        // Non-WAL directory, ignore.
                    }
                }
            } while (ff.findNext(p) > 0);
        } finally {
            ff.findClose(p);
        }
    }
}
/**
 * Reads the last applied sequencer txn of the current table and walks the
 * transaction log from there, recording for each WAL found on disk the first
 * segment that is still to be applied. Also records the current sequencer part.
 *
 * @param partsFound number of sequencer parts discovered on disk
 * @return true when the transaction log cursor reported the table as dropped,
 * false otherwise
 */
private boolean fetchSequencerPairs(int partsFound) {
    setTxnPath(tableToken);
    if (engine.isTableDropped(tableToken)) {
        // If table is dropped, all wals can be deleted.
        // No need to do anything, all discovered segments / wals will be deleted
        return false;
    }

    try {
        try {
            txReader.ofRO(path.$(), PartitionBy.NONE);
            TableUtils.safeReadTxn(txReader, millisecondClock, spinLockTimeout);
        } catch (CairoException ex) {
            if (!engine.isTableDropped(tableToken)) {
                throw ex;
            }
            // This is ok, table dropped while we tried to read the txn
            return false;
        }

        final long seqTxn = txReader.getSeqTxn();
        final TableSequencerAPI sequencerApi = engine.getTableSequencerAPI();
        try (TransactionLogCursor cursor = sequencerApi.getCursor(tableToken, seqTxn)) {
            final int partSize = cursor.getPartitionSize();
            logic.trackCurrentSeqPart(getCurrentSeqPart(tableToken, seqTxn, partSize, partsFound));

            // Stop as soon as every WAL seen on disk has its next-to-apply segment.
            while (onDiskWalIDSet.size() > 0 && cursor.hasNext()) {
                final int walId = cursor.getWalId();
                if (onDiskWalIDSet.remove(walId) == -1) {
                    continue;
                }
                logic.trackNextToApplySegment(walId, cursor.getSegmentId());
            }
        } catch (CairoException e) {
            if (e.isTableDropped()) {
                // there was a race, we lost
                return true;
            }
            throw e;
        }
    } finally {
        txReader.close();
    }
    return false;
}
/**
 * Removes the directory at the given path. A directory that does not exist
 * counts as removed.
 *
 * @param path directory to remove
 * @return true when the directory is gone, false when removal failed
 */
private boolean recursiveDelete(Path path) {
    if (ff.rmdir(path, false) || CairoException.errnoPathDoesNotExist(ff.errno())) {
        return true;
    }
    LOG.debug()
            .$("could not delete directory [path=").$(path)
            .$(", errno=").$(ff.errno())
            .I$();
    return false;
}
/**
 * Asks the WAL directory policy whether the given segment directory is still in use.
 */
private boolean segmentHasPendingTasks(int walId, int segmentId) {
    final Path segmentDir = setSegmentPath(tableToken, walId, segmentId);
    return walDirectoryPolicy.isInUse(segmentDir);
}
/**
 * Asks the WAL directory policy whether the sequencer directory of the current
 * table is still in use. Overwrites the shared path.
 */
private boolean sequencerHasPendingTasks() {
    final Path seqDir = path.of(configuration.getRoot())
            .concat(tableToken)
            .concat(WalUtils.SEQ_DIR);
    return walDirectoryPolicy.isInUse(seqDir);
}
/**
 * Points the shared path at the lock file of the given segment and returns it.
 */
private Path setSegmentLockPath(TableToken tableName, int walId, int segmentId) {
    final Path segmentDir = setSegmentPath(tableName, walId, segmentId);
    TableUtils.lockName(segmentDir);
    return path;
}
/**
 * Points the shared path at the directory of the given segment:
 * root, table, "wal" + walId, segmentId.
 */
private Path setSegmentPath(TableToken tableName, int walId, int segmentId) {
    final Path walDir = path.of(configuration.getRoot())
            .concat(tableName)
            .concat(WalUtils.WAL_NAME_BASE)
            .put(walId);
    return walDir.slash().put(segmentId);
}
/**
 * Points the shared path at the sequencer transaction log parts directory of the table.
 */
private Path setSeqPartPath(TableToken tableName) {
    final Path seqDir = path.of(configuration.getRoot())
            .concat(tableName)
            .concat(WalUtils.SEQ_DIR);
    return seqDir.concat(WalUtils.TXNLOG_PARTS_DIR);
}
/**
 * Points the shared path at the directory of the table.
 */
private Path setTablePath(TableToken tableName) {
    final Path root = path.of(configuration.getRoot());
    return root.concat(tableName);
}
/**
 * Points the shared path at the txn file of the table.
 */
private void setTxnPath(TableToken tableName) {
    path.of(configuration.getRoot()).concat(tableName).concat(TableUtils.TXN_FILE_NAME);
}
/**
 * Points the shared path at the lock file of the given WAL directory and returns it.
 */
private Path setWalLockPath(TableToken tableName, int walId) {
    // Build "root/table/wal<walId>" in the shared path first ...
    path.of(configuration.getRoot())
            .concat(tableName)
            .concat(WalUtils.WAL_NAME_BASE)
            .put(walId);
    // ... then turn it into the lock file name.
    TableUtils.lockName(path);
    return path;
}
/**
 * Points the shared path at the given WAL directory: root, table, "wal" + walId.
 */
private Path setWalPath(TableToken tableName, int walId) {
    final Path tableDir = path.of(configuration.getRoot()).concat(tableName);
    return tableDir.concat(WalUtils.WAL_NAME_BASE).put(walId);
}
/**
 * Computes the sequencer part that holds the last applied transaction; parts
 * below it are the ones eligible for purging.
 * <p>
 * There can be more advanced override which uses more parameters than this implementation.
 *
 * @param tableToken     table being swept; not used by this implementation
 * @param lastAppliedTxn last sequencer txn applied to the table, 1-based
 * @param txnPartSize    number of transactions per sequencer part
 * @param partsFound     number of parts discovered on disk; not used by this implementation
 * @return index of the part containing {@code lastAppliedTxn}, or
 * {@link Long#MAX_VALUE} when {@code txnPartSize} is not positive
 */
protected long getCurrentSeqPart(TableToken tableToken, long lastAppliedTxn, int txnPartSize, int partsFound) {
    if (txnPartSize <= 0) {
        return Long.MAX_VALUE;
    }
    // we don't want to purge the part where the last txn is in. Txn is 1-based.
    final long zeroBasedTxn = lastAppliedTxn - 1;
    return zeroBasedTxn / txnPartSize;
}
/**
 * Runs a broad sweep when more than the check interval has passed since the
 * previous attempt and the run lock can be taken; otherwise does nothing.
 *
 * @return always false
 */
@Override
protected boolean runSerially() {
    final long now = clock.getTicks();
    if (last + checkInterval >= now) {
        // Not due yet.
        return false;
    }
    last = now;

    if (!runLock.tryLock()) {
        LOG.info().$("skipping, locked out").$();
        return false;
    }
    try {
        broadSweep();
    } finally {
        runLock.unlock();
    }
    return false;
}
/**
 * Destructive operations that {@link Logic} decides on. Keeping them behind
 * this interface separates the purge decisions from the filesystem calls.
 */
public interface Deleter {
    /**
     * Deletes the directory of one WAL segment.
     *
     * @param walId     id of the WAL the segment belongs to
     * @param segmentId id of the segment
     * @param lockFd    lock descriptor held for the segment; the filesystem
     *                  implementation in this file closes it whether or not removal succeeds
     */
    void deleteSegmentDirectory(int walId, int segmentId, long lockFd);

    /**
     * Deletes one sequencer part.
     *
     * @param seqPart number of the part to delete
     */
    void deleteSequencerPart(int seqPart);

    /**
     * Deletes a whole WAL directory.
     *
     * @param walId  id of the WAL
     * @param lockFd lock descriptor held for the WAL; the filesystem
     *               implementation in this file closes it whether or not removal succeeds
     */
    void deleteWalDirectory(int walId, long lockFd);

    /**
     * Releases a lock without deleting anything.
     *
     * @param lockFd lock descriptor; Logic may pass negative values, which the
     *               filesystem implementation in this file ignores
     */
    void unlock(long lockFd);
}
/**
 * Purge decision logic. Callers register what was discovered on disk (WAL
 * directories, segments, sequencer parts) together with the lock descriptors
 * they managed to take, plus what the sequencer says is still to be applied.
 * {@link #run()} then asks the {@link Deleter} to delete whatever is no longer
 * needed and to unlock the rest.
 * <p>
 * Discovered entries are kept as (walId, segmentId, lockFd) triplets:
 * <ul>
 * <li>segment: its walId, its segmentId, its lockFd (negative when busy);</li>
 * <li>WAL directory: its walId, {@code WalUtils.SEG_NONE_ID}, its lockFd (negative when busy);</li>
 * <li>sequencer part: {@code WalUtils.METADATA_WALID}, the part number, always -1.</li>
 * </ul>
 */
public static class Logic {
    private final Deleter deleter;
    // Flat list of (walId, segmentId, lockFd) triplets, see the class comment.
    private final LongList discovered = new LongList();
    // walId -> first segment id that is still to be applied; lookups of absent WALs yield -1.
    private final IntIntHashMap nextToApply = new IntIntHashMap();
    // Passed to Os.sleep() before deleting when positive.
    private final int waitBeforeDelete;
    private long currentSeqPart;
    // True once the discovered state has been logged for the current table.
    private boolean logged;
    private boolean sequencerPending;
    private TableToken tableToken;

    public Logic(Deleter deleter, int waitBeforeDelete) {
        this.deleter = deleter;
        this.waitBeforeDelete = waitBeforeDelete;
    }

    /**
     * Returns true when anything at all (WAL, segment or sequencer part) was discovered.
     */
    public boolean hasOnDiskSegments() {
        return discovered.size() != 0;
    }

    /**
     * Returns true when the sequencer was reported as pending, or when any
     * discovered WAL directory or segment is busy (negative lock descriptor).
     * <p>
     * Sequencer part entries are skipped: they are always stored with lockFd -1
     * because no lock is taken for them, so counting them would report every
     * table with at least one sequencer part as pending.
     */
    public boolean hasPendingTasks() {
        if (sequencerPending) {
            return true;
        }
        for (int i = 0, n = getStateSize(); i < n; ++i) {
            if (getWalId(i) != WalUtils.METADATA_WALID && getLockFd(i) < 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Passes the lock descriptor of every discovered entry to {@link Deleter#unlock(long)}.
     */
    public void releaseLocks() {
        for (int i = 0, n = getStateSize(); i < n; ++i) {
            deleter.unlock(getLockFd(i));
        }
    }

    /**
     * Clears all state and starts tracking for the given table.
     */
    public void reset(TableToken tableToken) {
        this.tableToken = tableToken;
        nextToApply.clear();
        discovered.clear();
        sequencerPending = false;
        currentSeqPart = -1;
        logged = false;
    }

    /**
     * Goes through the discovered entries, deleting those that are no longer
     * needed and unlocking those that must stay. If an exception escapes, the
     * entry being processed and all remaining ones are passed to
     * {@link Deleter#unlock(long)}.
     */
    public void run() {
        int i = 0, n = getStateSize();
        if (n > 0 && waitBeforeDelete > 0) {
            Os.sleep(waitBeforeDelete);
        }

        try {
            for (; i < n; i++) {
                long lockFd = getLockFd(i);
                final int walId = getWalId(i);
                final int segmentId = getSegmentId(i);
                final int nextToApplySegmentId = nextToApply.get(walId); // -1 if not found

                if (lockFd > -1) {
                    // Delete a wal or segment directory only if:
                    // * It has been fully applied to the table.
                    // * Is not locked.
                    // * None of its segments have pending tasks
                    if (isWalDir(segmentId, walId)) {
                        final boolean walAlreadyApplied = nextToApplySegmentId == -1;
                        if (walAlreadyApplied) {
                            logDebugInfo();
                            deleter.deleteWalDirectory(walId, lockFd);
                            continue;
                        }
                    } else {
                        final boolean segmentAlreadyApplied = (nextToApplySegmentId == -1) || (nextToApplySegmentId > segmentId);
                        if (segmentAlreadyApplied) {
                            logDebugInfo();
                            deleter.deleteSegmentDirectory(walId, segmentId, lockFd);
                            continue;
                        }
                    }
                    // Still needed: keep the directory, give the lock back.
                    deleter.unlock(lockFd);
                } else {
                    final int seqPart = getSeqPart(walId, segmentId); // -1 if not a seq part
                    if (seqPart > -1 && seqPart < currentSeqPart) {
                        logDebugInfo();
                        deleter.deleteSequencerPart(seqPart);
                    }
                }
            }
        } finally {
            // Only has work to do when the loop above was interrupted by an exception.
            for (; i < n; i++) {
                deleter.unlock(getLockFd(i));
            }
        }
    }

    public void sequencerHasPendingTasks(boolean isPending) {
        sequencerPending = isPending;
    }

    public void trackCurrentSeqPart(long partNo) {
        currentSeqPart = partNo;
    }

    /**
     * Registers a discovered segment.
     *
     * @param lockFd lock descriptor taken for the segment, negative when it is busy
     */
    public void trackDiscoveredSegment(int walId, int segmentId, long lockFd) {
        discovered.add(walId);
        discovered.add(segmentId);
        discovered.add(lockFd);
    }

    /**
     * Registers a discovered WAL directory, stored as a segment entry with
     * {@code WalUtils.SEG_NONE_ID} as its segment id.
     */
    public void trackDiscoveredWal(int walId, long lockFd) {
        trackDiscoveredSegment(walId, WalUtils.SEG_NONE_ID, lockFd);
    }

    /**
     * Records the next segment to apply for the WAL; only the first call per WAL has an effect.
     */
    public void trackNextToApplySegment(int walId, int segmentId) {
        final int index = nextToApply.keyIndex(walId);
        if (index > -1) { // not tracked yet
            nextToApply.putAt(index, walId, segmentId);
        }
    }

    /**
     * Registers a discovered sequencer part; no lock is associated with it.
     */
    public void trackSeqPart(int part) {
        discovered.add(WalUtils.METADATA_WALID);
        discovered.add(part);
        discovered.add(-1);
    }

    private static boolean isWalDir(int segmentId, int walId) {
        return segmentId == WalUtils.SEG_NONE_ID && walId != WalUtils.METADATA_WALID;
    }

    private long getLockFd(int index) {
        return discovered.get(index * 3 + 2);
    }

    private int getSegmentId(int index) {
        return (int) discovered.get(index * 3 + 1);
    }

    // Returns the part number when the entry is a sequencer part, -1 otherwise.
    private int getSeqPart(int walId, int segmentId) {
        return walId == WalUtils.METADATA_WALID ? segmentId : -1;
    }

    // Number of discovered entries (triplets).
    private int getStateSize() {
        return discovered.size() / 3;
    }

    private int getWalId(int index) {
        return (int) discovered.get(index * 3);
    }

    // Logs the discovered state once per table, right before the first deletion.
    private void logDebugInfo() {
        if (!logged) {
            printDebugState();
            logged = true;
        }
    }

    private void printDebugState() {
        LogRecord log = LOG.info();
        try {
            log.$("table=").put(tableToken.getDirName())
                    .put(", discovered=[");

            for (int i = 0, n = getStateSize(); i < n; i++) {
                final int walId = getWalId(i);
                final int segmentId = getSegmentId(i);
                final long lockId = getLockFd(i);
                final int partNo = getSeqPart(walId, segmentId);
                if (partNo > -1) {
                    log.$("seqPart=").$(partNo);
                    if (partNo == currentSeqPart) {
                        log.$(":current");
                    }
                } else {
                    if (isWalDir(segmentId, walId)) {
                        log.$("(wal").$(walId);
                    } else {
                        final int nextToApplyId = nextToApply.get(walId);
                        log.$('(').$(walId).$(',').$(segmentId);
                        if (segmentId == nextToApplyId) {
                            log.$(":next");
                        }
                    }
                    if (lockId < 0) {
                        log.$(":busy");
                    }
                    log.$(')');
                }
                if (i < n - 1) {
                    log.$(',');
                }
            }
            log.$(']');
        } finally {
            // Always finish the log record, even if building it failed.
            log.$();
        }
    }
}
/**
 * {@link Deleter} that works on the filesystem through the job's files facade.
 * Paths are built in the job's shared path for the table currently being swept.
 */
private class FsDeleter implements Deleter {

    /**
     * Removes the segment directory. When that succeeds the lock is closed and
     * its lock file removed; otherwise the lock is only closed.
     */
    @Override
    public void deleteSegmentDirectory(int walId, int segmentId, long lockFd) {
        LOG.debug().$("deleting WAL segment directory [table=").utf8(tableToken.getDirName())
                .$(", walId=").$(walId)
                .$(", segmentId=").$(segmentId).$(']').$();

        final boolean removed = recursiveDelete(setSegmentPath(tableToken, walId, segmentId));
        if (!removed) {
            ff.close(lockFd);
            return;
        }
        final Path lockPath = setSegmentLockPath(tableToken, walId, segmentId);
        ff.closeRemove(lockFd, lockPath.$());
    }

    /**
     * Removes the file of the given sequencer part, ignoring failures.
     */
    @Override
    public void deleteSequencerPart(int seqPart) {
        LOG.debug().$("deleting sequencer part [table=").utf8(tableToken.getDirName())
                .$(", part=").$(seqPart).$(']').$();

        final Path partFile = setSeqPartPath(tableToken).put(Files.SEPARATOR).put(seqPart);
        // If error removing, will be retried on next run.
        ff.removeQuiet(partFile.$());
    }

    /**
     * Removes the WAL directory. When that succeeds the lock is closed and its
     * lock file removed; otherwise the lock is only closed.
     */
    @Override
    public void deleteWalDirectory(int walId, long lockFd) {
        LOG.debug().$("deleting WAL directory [table=").utf8(tableToken.getDirName())
                .$(", walId=").$(walId).$(']').$();

        final boolean removed = recursiveDelete(setWalPath(tableToken, walId));
        if (!removed) {
            ff.close(lockFd);
            return;
        }
        final Path lockPath = setWalLockPath(tableToken, walId);
        ff.closeRemove(lockFd, lockPath.$());
    }

    /**
     * Closes the lock descriptor; negative descriptors are ignored.
     */
    @Override
    public void unlock(long lockFd) {
        if (lockFd < 0) {
            return;
        }
        ff.close(lockFd);
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy