
org.apache.cassandra.db.compaction.CompactionIterator (source listing from the cassandra-all artifact)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db.compaction;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.function.LongPredicate;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Ordering;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.AbstractCompactionController;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.Columns;
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.db.DeletionTime;
import org.apache.cassandra.db.EmptyIterators;
import org.apache.cassandra.db.Keyspace;
import org.apache.cassandra.db.RegularAndStaticColumns;
import org.apache.cassandra.db.SystemKeyspace;
import org.apache.cassandra.db.transform.DuplicateRowChecker;
import org.apache.cassandra.db.filter.ColumnFilter;
import org.apache.cassandra.db.partitions.PurgeFunction;
import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators;
import org.apache.cassandra.db.rows.RangeTombstoneBoundMarker;
import org.apache.cassandra.db.rows.RangeTombstoneMarker;
import org.apache.cassandra.db.rows.Row;
import org.apache.cassandra.db.rows.Rows;
import org.apache.cassandra.db.rows.Unfiltered;
import org.apache.cassandra.db.rows.UnfilteredRowIterator;
import org.apache.cassandra.db.rows.UnfilteredRowIterators;
import org.apache.cassandra.db.rows.WrappingUnfilteredRowIterator;
import org.apache.cassandra.db.transform.Transformation;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.index.transactions.CompactionTransaction;
import org.apache.cassandra.index.transactions.IndexTransaction;
import org.apache.cassandra.io.sstable.ISSTableScanner;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.metrics.TopPartitionTracker;
import org.apache.cassandra.schema.CompactionParams.TombstoneOption;
import org.apache.cassandra.schema.Schema;
import org.apache.cassandra.schema.SchemaConstants;
import org.apache.cassandra.schema.TableId;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.service.paxos.PaxosRepairHistory;
import org.apache.cassandra.service.paxos.uncommitted.PaxosRows;
import org.apache.cassandra.utils.TimeUUID;
import static java.util.concurrent.TimeUnit.MICROSECONDS;
import static org.apache.cassandra.config.Config.PaxosStatePurging.legacy;
import static org.apache.cassandra.config.DatabaseDescriptor.paxosStatePurging;
/**
* Merges multiple iterators over the content of sstables into a single "compacted" iterator.
*
* On top of the actual merging of the source iterators, this class:
*
* - purges gc-able tombstones when possible (see Purger below).
* - updates secondary indexes if necessary: as we don't read-before-write on index updates, index entries
*   are not deleted when the base table data is deleted, which is ok because we fix index inconsistencies
*   on reads. This however means that potentially obsolete index entries could be kept for a long time for
*   data that is not read often, so compaction "pro-actively" fixes such index entries. This is mainly
*   an optimization.
* - invalidates cached partitions that are empty post-compaction. This avoids keeping partitions with
*   only purgeable tombstones in the row cache.
* - keeps track of the compaction progress.
*
* (An illustrative usage sketch follows the main constructor below.)
*/
public class CompactionIterator extends CompactionInfo.Holder implements UnfilteredPartitionIterator
{
private static final long UNFILTERED_TO_UPDATE_PROGRESS = 100;
private final OperationType type;
private final AbstractCompactionController controller;
private final List<ISSTableScanner> scanners;
private final ImmutableSet<SSTableReader> sstables;
private final long nowInSec;
private final TimeUUID compactionId;
private final long totalBytes;
private long bytesRead;
private long totalSourceCQLRows;
// Keep targetDirectory for compactions, needed for `nodetool compactionstats`
private volatile String targetDirectory;
/*
* Counters for merged rows.
* The array index represents (number of merged rows - 1): index 0 is the counter for no merge (1 row),
* index 1 is the counter for 2 rows merged, and so on. For example, mergeCounters[2] == 5 means the
* merge produced five outputs that each combined exactly 3 source versions.
*/
private final long[] mergeCounters;
private final UnfilteredPartitionIterator compacted;
private final ActiveCompactionsTracker activeCompactions;
public CompactionIterator(OperationType type, List<ISSTableScanner> scanners, AbstractCompactionController controller, long nowInSec, TimeUUID compactionId)
{
this(type, scanners, controller, nowInSec, compactionId, ActiveCompactionsTracker.NOOP, null);
}
@SuppressWarnings("resource") // We make sure to close mergedIterator in close() and CompactionIterator is itself an AutoCloseable
public CompactionIterator(OperationType type,
List<ISSTableScanner> scanners,
AbstractCompactionController controller,
long nowInSec,
TimeUUID compactionId,
ActiveCompactionsTracker activeCompactions,
TopPartitionTracker.Collector topPartitionCollector)
{
this.controller = controller;
this.type = type;
this.scanners = scanners;
this.nowInSec = nowInSec;
this.compactionId = compactionId;
this.bytesRead = 0;
long bytes = 0;
for (ISSTableScanner scanner : scanners)
bytes += scanner.getLengthInBytes();
this.totalBytes = bytes;
this.mergeCounters = new long[scanners.size()];
// note that we leak `this` from the constructor when calling beginCompaction below; this means we have
// to get the sstables before calling it, to avoid an NPE.
sstables = scanners.stream().map(ISSTableScanner::getBackingSSTables).flatMap(Collection::stream).collect(ImmutableSet.toImmutableSet());
this.activeCompactions = activeCompactions == null ? ActiveCompactionsTracker.NOOP : activeCompactions;
this.activeCompactions.beginCompaction(this); // note that CompactionTask also calls this, but CT only creates CompactionIterator with a NOOP ActiveCompactions
UnfilteredPartitionIterator merged = scanners.isEmpty()
? EmptyIterators.unfilteredPartition(controller.cfs.metadata())
: UnfilteredPartitionIterators.merge(scanners, listener());
if (topPartitionCollector != null) // need to count tombstones before they are purged
merged = Transformation.apply(merged, new TopPartitionTracker.TombstoneCounter(topPartitionCollector, nowInSec));
merged = Transformation.apply(merged, new GarbageSkipper(controller));
Transformation<UnfilteredRowIterator> purger = isPaxos(controller.cfs) && paxosStatePurging() != legacy
? new PaxosPurger(nowInSec)
: new Purger(controller, nowInSec);
merged = Transformation.apply(merged, purger);
merged = DuplicateRowChecker.duringCompaction(merged, type);
compacted = Transformation.apply(merged, new AbortableUnfilteredPartitionTransformation(this));
}
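/*
* Illustrative usage sketch, not part of the original source: one way a caller might drive the
* pipeline assembled above. The controller, scanners, nowInSec, compactionId and writer are
* assumed to exist; `writer` is a hypothetical stand-in for whatever consumes compacted partitions.
*
*   try (CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, scanners,
*                                                       controller, nowInSec, compactionId))
*   {
*       while (ci.hasNext())
*       {
*           try (UnfilteredRowIterator partition = ci.next())
*           {
*               writer.append(partition); // hypothetical consumer, e.g. an sstable writer
*           }
*       }
*   }
*/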
public TableMetadata metadata()
{
return controller.cfs.metadata();
}
public CompactionInfo getCompactionInfo()
{
return new CompactionInfo(controller.cfs.metadata(),
type,
bytesRead,
totalBytes,
compactionId,
sstables,
targetDirectory);
}
public boolean isGlobal()
{
return false;
}
public void setTargetDirectory(final String targetDirectory)
{
this.targetDirectory = targetDirectory;
}
private void updateCounterFor(int rows)
{
assert rows > 0 && rows - 1 < mergeCounters.length;
mergeCounters[rows - 1] += 1;
}
public long[] getMergedRowCounts()
{
return mergeCounters;
}
public long getTotalSourceCQLRows()
{
return totalSourceCQLRows;
}
private UnfilteredPartitionIterators.MergeListener listener()
{
return new UnfilteredPartitionIterators.MergeListener()
{
public UnfilteredRowIterators.MergeListener getRowMergeListener(DecoratedKey partitionKey, List<UnfilteredRowIterator> versions)
{
int merged = 0;
for (int i = 0, isize = versions.size(); i < isize; i++)
merged += versions.get(i) == null ? 0 : 1;
assert merged > 0;
CompactionIterator.this.updateCounterFor(merged);
if ((type != OperationType.COMPACTION && type != OperationType.MAJOR_COMPACTION)
|| !controller.cfs.indexManager.handles(IndexTransaction.Type.COMPACTION))
{
return null;
}
Columns statics = Columns.NONE;
Columns regulars = Columns.NONE;
for (int i = 0, isize = versions.size(); i < isize; i++)
{
UnfilteredRowIterator iter = versions.get(i);
if (iter != null)
{
statics = statics.mergeTo(iter.columns().statics);
regulars = regulars.mergeTo(iter.columns().regulars);
}
}
/*
* [Extraction gap: the remainder of this merge listener, the rest of CompactionIterator's methods
* (including updateBytesRead(), hasNext(), next() and close()), the Purger class, and the opening of
* GarbageSkippingUnfilteredRowIterator (fields and constructor) were lost when this page was scraped.
* The listing resumes inside GarbageSkippingUnfilteredRowIterator.hasNext(); its opening lines are
* reconstructed below from the surviving fragments.]
*/
@Override
public boolean hasNext()
{
// Produce the next element, consuming from both inputs until we find something from the data
// source that is still live under the currently active deletion.
while (next == null && dataNext != null)
{
int cmp = tombNext == null ? -1 : metadata.comparator.compare(dataNext, tombNext);
if (cmp < 0)
{
if (dataNext.isRow())
next = ((Row) dataNext).filter(cf, activeDeletionTime, false, metadata);
else
next = processDataMarker();
}
else if (cmp == 0)
{
if (dataNext.isRow())
{
next = garbageFilterRow((Row) dataNext, (Row) tombNext);
}
else
{
tombOpenDeletionTime = updateOpenDeletionTime(tombOpenDeletionTime, tombNext);
activeDeletionTime = Ordering.natural().max(partitionDeletionTime, tombOpenDeletionTime);
next = processDataMarker();
}
}
else // (cmp > 0)
{
if (tombNext.isRangeTombstoneMarker())
{
tombOpenDeletionTime = updateOpenDeletionTime(tombOpenDeletionTime, tombNext);
activeDeletionTime = Ordering.natural().max(partitionDeletionTime,
tombOpenDeletionTime);
boolean supersededBefore = openDeletionTime.isLive();
boolean supersededAfter = !dataOpenDeletionTime.supersedes(activeDeletionTime);
// If a range open was not issued because it was superseded and the deletion isn't superseded any more, we need to open it now.
if (supersededBefore && !supersededAfter)
next = new RangeTombstoneBoundMarker(((RangeTombstoneMarker) tombNext).closeBound(false).invert(), dataOpenDeletionTime);
// If the deletion begins to be superseded, we don't close the range yet. This can save us a close/open pair if it ends after the superseding range.
}
}
if (next instanceof RangeTombstoneMarker)
openDeletionTime = updateOpenDeletionTime(openDeletionTime, next);
if (cmp <= 0)
dataNext = advance(wrapped);
if (cmp >= 0)
tombNext = advance(tombSource);
}
return next != null;
}
protected Row garbageFilterRow(Row dataRow, Row tombRow)
{
if (cellLevelGC)
{
return Rows.removeShadowedCells(dataRow, tombRow, activeDeletionTime);
}
else
{
DeletionTime deletion = Ordering.natural().max(tombRow.deletion().time(),
activeDeletionTime);
return dataRow.filter(cf, deletion, false, metadata);
}
}
/**
* Decides how to act on a tombstone marker from the input iterator. What we issue depends on
* whether the ranges before and after the marker are superseded or live: if neither is superseded,
* we can reuse the marker; if both are, the marker can be ignored; otherwise we issue a
* corresponding start/end marker (see the case summary after this method).
*/
private RangeTombstoneMarker processDataMarker()
{
dataOpenDeletionTime = updateOpenDeletionTime(dataOpenDeletionTime, dataNext);
boolean supersededBefore = openDeletionTime.isLive();
boolean supersededAfter = !dataOpenDeletionTime.supersedes(activeDeletionTime);
RangeTombstoneMarker marker = (RangeTombstoneMarker) dataNext;
if (!supersededBefore)
if (!supersededAfter)
return marker;
else
return new RangeTombstoneBoundMarker(marker.closeBound(false), marker.closeDeletionTime(false));
else
if (!supersededAfter)
return new RangeTombstoneBoundMarker(marker.openBound(false), marker.openDeletionTime(false));
else
return null;
}
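/*
* Illustrative summary of the four cases above, added for clarity:
*
*   supersededBefore | supersededAfter | emitted
*   -----------------+-----------------+-----------------------------------------------
*   false            | false           | the marker, unchanged
*   false            | true            | a close-only bound (deletion becomes shadowed)
*   true             | false           | an open-only bound (deletion visible again)
*   true             | true            | nothing (shadowed on both sides)
*/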
@Override
public Unfiltered next()
{
if (!hasNext())
throw new IllegalStateException();
Unfiltered v = next;
next = null;
return v;
}
private DeletionTime updateOpenDeletionTime(DeletionTime openDeletionTime, Unfiltered next)
{
RangeTombstoneMarker marker = (RangeTombstoneMarker) next;
assert openDeletionTime.isLive() == !marker.isClose(false);
assert openDeletionTime.isLive() || openDeletionTime.equals(marker.closeDeletionTime(false));
return marker.isOpen(false) ? marker.openDeletionTime(false) : DeletionTime.LIVE;
}
}
/**
* Partition transformation applying GarbageSkippingUnfilteredRowIterator, obtaining tombstone sources for each
* partition using the controller's shadowSources method.
*/
private static class GarbageSkipper extends Transformation<UnfilteredRowIterator>
{
final AbstractCompactionController controller;
final boolean cellLevelGC;
private GarbageSkipper(AbstractCompactionController controller)
{
this.controller = controller;
cellLevelGC = controller.tombstoneOption == TombstoneOption.CELL;
}
@Override
protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
{
Iterable<UnfilteredRowIterator> sources = controller.shadowSources(partition.partitionKey(), !cellLevelGC);
if (sources == null)
return partition;
List<UnfilteredRowIterator> iters = new ArrayList<>();
for (UnfilteredRowIterator iter : sources)
{
if (!iter.isEmpty())
iters.add(iter);
else
iter.close();
}
if (iters.isEmpty())
return partition;
return new GarbageSkippingUnfilteredRowIterator(partition, UnfilteredRowIterators.merge(iters), cellLevelGC);
}
}
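/*
* Illustrative example of the effect, added for clarity; assumes tombstoneOption == ROW and a
* partition present both in an sstable being compacted and in an overlapping one that is not:
*
*   data source (compacting):     row k=1 written at timestamp 10
*   shadow source (overlapping):  row deletion of k=1 at timestamp 20
*
* The k=1 row is dropped from the compaction output because the overlapping sstable's tombstone
* already shadows it, so obsolete data is not rewritten into the new sstable.
*/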
private class PaxosPurger extends Transformation<UnfilteredRowIterator>
{
private final long nowInSec;
private final long paxosPurgeGraceMicros = DatabaseDescriptor.getPaxosPurgeGrace(MICROSECONDS);
private final Map<TableId, PaxosRepairHistory.Searcher> tableIdToHistory = new HashMap<>();
private Token currentToken;
private int compactedUnfiltered;
private PaxosPurger(long nowInSec)
{
this.nowInSec = nowInSec;
}
protected void onEmptyPartitionPostPurge(DecoratedKey key)
{
if (type == OperationType.COMPACTION)
controller.cfs.invalidateCachedPartition(key);
}
protected void updateProgress()
{
if ((++compactedUnfiltered) % UNFILTERED_TO_UPDATE_PROGRESS == 0)
updateBytesRead();
}
@Override
@SuppressWarnings("resource")
protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
{
currentToken = partition.partitionKey().getToken();
UnfilteredRowIterator purged = Transformation.apply(partition, this);
if (purged.isEmpty())
{
onEmptyPartitionPostPurge(purged.partitionKey());
purged.close();
return null;
}
return purged;
}
@Override
protected Row applyToRow(Row row)
{
updateProgress();
TableId tableId = PaxosRows.getTableId(row);
switch (paxosStatePurging())
{
default: throw new AssertionError();
case legacy:
case gc_grace:
{
TableMetadata metadata = Schema.instance.getTableMetadata(tableId);
return row.purgeDataOlderThan(TimeUnit.SECONDS.toMicros(nowInSec - (metadata == null ? (3 * 3600) : metadata.params.gcGraceSeconds)), false);
}
case repaired:
{
PaxosRepairHistory.Searcher history = tableIdToHistory.computeIfAbsent(tableId, find -> {
TableMetadata metadata = Schema.instance.getTableMetadata(find);
if (metadata == null)
return null;
return Keyspace.openAndGetStore(metadata).getPaxosRepairHistory().searcher();
});
return history == null ? row :
row.purgeDataOlderThan(history.ballotForToken(currentToken).unixMicros() - paxosPurgeGraceMicros, false);
}
}
}
}
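/*
* Illustrative arithmetic for the gc_grace branch above, added for clarity: with
* nowInSec = 1_700_000_000 and gcGraceSeconds = 864_000 (10 days), the threshold is
* TimeUnit.SECONDS.toMicros(1_700_000_000 - 864_000) = 1_699_136_000_000_000L, so Paxos state
* whose write timestamp (in microseconds) is below that value is purged. If the table metadata
* is no longer known, a default grace of 3 hours (3 * 3600 seconds) applies.
*/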
private static class AbortableUnfilteredPartitionTransformation extends Transformation<UnfilteredRowIterator>
{
private final AbortableUnfilteredRowTransformation abortableIter;
private AbortableUnfilteredPartitionTransformation(CompactionIterator iter)
{
this.abortableIter = new AbortableUnfilteredRowTransformation(iter);
}
@Override
protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
{
if (abortableIter.iter.isStopRequested())
throw new CompactionInterruptedException(abortableIter.iter.getCompactionInfo());
return Transformation.apply(partition, abortableIter);
}
}
private static class AbortableUnfilteredRowTransformation extends Transformation
{
private final CompactionIterator iter;
private AbortableUnfilteredRowTransformation(CompactionIterator iter)
{
this.iter = iter;
}
public Row applyToRow(Row row)
{
if (iter.isStopRequested())
throw new CompactionInterruptedException(iter.getCompactionInfo());
return row;
}
}
private static boolean isPaxos(ColumnFamilyStore cfs)
{
return cfs.name.equals(SystemKeyspace.PAXOS) && cfs.getKeyspaceName().equals(SchemaConstants.SYSTEM_KEYSPACE_NAME);
}
}