
org.apache.cassandra.db.compaction.CompactionIterator Maven / Gradle / Ivy


The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.db.compaction;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.function.LongPredicate;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Ordering;

import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.AbstractCompactionController;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.Columns;
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.db.DeletionTime;
import org.apache.cassandra.db.EmptyIterators;
import org.apache.cassandra.db.Keyspace;
import org.apache.cassandra.db.RegularAndStaticColumns;
import org.apache.cassandra.db.SystemKeyspace;
import org.apache.cassandra.db.transform.DuplicateRowChecker;
import org.apache.cassandra.db.filter.ColumnFilter;
import org.apache.cassandra.db.partitions.PurgeFunction;
import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators;
import org.apache.cassandra.db.rows.RangeTombstoneBoundMarker;
import org.apache.cassandra.db.rows.RangeTombstoneMarker;
import org.apache.cassandra.db.rows.Row;
import org.apache.cassandra.db.rows.Rows;
import org.apache.cassandra.db.rows.Unfiltered;
import org.apache.cassandra.db.rows.UnfilteredRowIterator;
import org.apache.cassandra.db.rows.UnfilteredRowIterators;
import org.apache.cassandra.db.rows.WrappingUnfilteredRowIterator;
import org.apache.cassandra.db.transform.Transformation;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.index.transactions.CompactionTransaction;
import org.apache.cassandra.index.transactions.IndexTransaction;
import org.apache.cassandra.io.sstable.ISSTableScanner;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.metrics.TopPartitionTracker;
import org.apache.cassandra.schema.CompactionParams.TombstoneOption;
import org.apache.cassandra.schema.Schema;
import org.apache.cassandra.schema.SchemaConstants;
import org.apache.cassandra.schema.TableId;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.service.paxos.PaxosRepairHistory;
import org.apache.cassandra.service.paxos.uncommitted.PaxosRows;
import org.apache.cassandra.utils.TimeUUID;

import static java.util.concurrent.TimeUnit.MICROSECONDS;
import static org.apache.cassandra.config.Config.PaxosStatePurging.legacy;
import static org.apache.cassandra.config.DatabaseDescriptor.paxosStatePurging;

/**
 * Merges multiple iterators over the content of sstables into a "compacted" iterator.
 * <p>
 * On top of the actual merging of the source iterators, this class:
 * <ul>
 *   <li>purges gc-able tombstones if possible (see Purger below);</li>
 *   <li>updates secondary indexes if necessary: as we don't read-before-write on index updates, index entries are
 *       not deleted when the base table data is deleted, which is fine because we fix index inconsistencies
 *       on reads. It does mean, however, that potentially obsolete index entries can be kept for a long time for
 *       data that is rarely read, so compaction "pro-actively" fixes such index entries; this is mainly
 *       an optimization;</li>
 *   <li>invalidates cached partitions that are empty post-compaction, which avoids keeping partitions with
 *       only purgeable tombstones in the row cache;</li>
 *   <li>keeps track of the compaction progress.</li>
 * </ul>
 */
public class CompactionIterator extends CompactionInfo.Holder implements UnfilteredPartitionIterator
{
    private static final long UNFILTERED_TO_UPDATE_PROGRESS = 100;

    private final OperationType type;
    private final AbstractCompactionController controller;
    private final List<ISSTableScanner> scanners;
    private final ImmutableSet<SSTableReader> sstables;
    private final long nowInSec;
    private final TimeUUID compactionId;
    private final long totalBytes;
    private long bytesRead;
    private long totalSourceCQLRows;

    // Keep targetDirectory for compactions, needed for `nodetool compactionstats`
    private volatile String targetDirectory;

    /*
     * Counters for merged rows.
     * The array index represents (number of merged rows - 1), so index 0 is the counter for no merge (1 row),
     * index 1 is the counter for 2 rows merged, and so on.
     */
    private final long[] mergeCounters;

    private final UnfilteredPartitionIterator compacted;
    private final ActiveCompactionsTracker activeCompactions;

    public CompactionIterator(OperationType type, List<ISSTableScanner> scanners, AbstractCompactionController controller, long nowInSec, TimeUUID compactionId)
    {
        this(type, scanners, controller, nowInSec, compactionId, ActiveCompactionsTracker.NOOP, null);
    }

    public CompactionIterator(OperationType type,
                              List<ISSTableScanner> scanners,
                              AbstractCompactionController controller,
                              long nowInSec,
                              TimeUUID compactionId,
                              ActiveCompactionsTracker activeCompactions,
                              TopPartitionTracker.Collector topPartitionCollector)
    {
        this.controller = controller;
        this.type = type;
        this.scanners = scanners;
        this.nowInSec = nowInSec;
        this.compactionId = compactionId;
        this.bytesRead = 0;

        long bytes = 0;
        for (ISSTableScanner scanner : scanners)
            bytes += scanner.getLengthInBytes();
        this.totalBytes = bytes;
        this.mergeCounters = new long[scanners.size()];

        // note that we leak `this` from the constructor when calling beginCompaction below, this means we have
        // to get the sstables before calling that to avoid an NPE.
        sstables = scanners.stream().map(ISSTableScanner::getBackingSSTables).flatMap(Collection::stream).collect(ImmutableSet.toImmutableSet());
        this.activeCompactions = activeCompactions == null ? ActiveCompactionsTracker.NOOP : activeCompactions;
        this.activeCompactions.beginCompaction(this); // note that CompactionTask also calls this, but CT only creates CompactionIterator with a NOOP ActiveCompactions

        UnfilteredPartitionIterator merged = scanners.isEmpty()
                                           ? EmptyIterators.unfilteredPartition(controller.cfs.metadata())
                                           : UnfilteredPartitionIterators.merge(scanners, listener());
        if (topPartitionCollector != null) // need to count tombstones before they are purged
            merged = Transformation.apply(merged, new TopPartitionTracker.TombstoneCounter(topPartitionCollector, nowInSec));
        merged = Transformation.apply(merged, new GarbageSkipper(controller));
        Transformation<UnfilteredRowIterator> purger = isPaxos(controller.cfs) && paxosStatePurging() != legacy
                                                     ? new PaxosPurger(nowInSec)
                                                     : new Purger(controller, nowInSec);
        merged = Transformation.apply(merged, purger);
        merged = DuplicateRowChecker.duringCompaction(merged, type);
        compacted = Transformation.apply(merged, new AbortableUnfilteredPartitionTransformation(this));
    }

    public TableMetadata metadata()
    {
        return controller.cfs.metadata();
    }

    public CompactionInfo getCompactionInfo()
    {
        return new CompactionInfo(controller.cfs.metadata(),
                                  type,
                                  bytesRead,
                                  totalBytes,
                                  compactionId,
                                  sstables,
                                  targetDirectory);
    }

    public boolean isGlobal()
    {
        return false;
    }

    public void setTargetDirectory(final String targetDirectory)
    {
        this.targetDirectory = targetDirectory;
    }

    private void updateCounterFor(int rows)
    {
        assert rows > 0 && rows - 1 < mergeCounters.length;
        mergeCounters[rows - 1] += 1;
    }

    public long[] getMergedRowCounts()
    {
        return mergeCounters;
    }

    public long getTotalSourceCQLRows()
    {
        return totalSourceCQLRows;
    }

    private UnfilteredPartitionIterators.MergeListener listener()
    {
        return new UnfilteredPartitionIterators.MergeListener()
        {
            private boolean rowProcessingNeeded()
            {
                return (type == OperationType.COMPACTION || type == OperationType.MAJOR_COMPACTION)
                       && controller.cfs.indexManager.handles(IndexTransaction.Type.COMPACTION);
            }

            @Override
            public boolean preserveOrder()
            {
                return rowProcessingNeeded();
            }

            public UnfilteredRowIterators.MergeListener getRowMergeListener(DecoratedKey partitionKey, List<UnfilteredRowIterator> versions)
            {
                int merged = 0;
                for (int i = 0, isize = versions.size(); i < isize; i++)
                    merged += versions.get(i) == null ? 0 : 1;

                assert merged > 0;

                CompactionIterator.this.updateCounterFor(merged);

                if (!rowProcessingNeeded())
                    return null;

                Columns statics = Columns.NONE;
                Columns regulars = Columns.NONE;
                for (int i = 0, isize = versions.size(); i < isize; i++)
                {
                    UnfilteredRowIterator iter = versions.get(i);
                    if (iter != null)
                    {
                        statics = statics.mergeTo(iter.columns().statics);
                        regulars = regulars.mergeTo(iter.columns().regulars);
                    }
                }

                /* [...] the remainder of this merge listener (secondary index maintenance via a
                 * CompactionTransaction), the iteration methods of CompactionIterator, the Purger
                 * transformation, and the opening of GarbageSkippingUnfilteredRowIterator
                 * (declaration, fields, constructor) are missing from this listing; it resumes
                 * inside GarbageSkippingUnfilteredRowIterator below. */

        public boolean hasNext()
        {
            while (next == null && dataNext != null)
            {
                int cmp = tombNext == null ? -1 : metadata.comparator.compare(dataNext, tombNext);
                if (cmp < 0)
                {
                    if (dataNext.isRow())
                        next = ((Row) dataNext).filter(cf, activeDeletionTime, false, metadata);
                    else
                        next = processDataMarker();
                }
                else if (cmp == 0)
                {
                    if (dataNext.isRow())
                    {
                        next = garbageFilterRow((Row) dataNext, (Row) tombNext);
                    }
                    else
                    {
                        tombOpenDeletionTime = updateOpenDeletionTime(tombOpenDeletionTime, tombNext);
                        activeDeletionTime = Ordering.natural().max(partitionDeletionTime, tombOpenDeletionTime);
                        next = processDataMarker();
                    }
                }
                else // (cmp > 0)
                {
                    if (tombNext.isRangeTombstoneMarker())
                    {
                        tombOpenDeletionTime = updateOpenDeletionTime(tombOpenDeletionTime, tombNext);
                        activeDeletionTime = Ordering.natural().max(partitionDeletionTime, tombOpenDeletionTime);
                        boolean supersededBefore = openDeletionTime.isLive();
                        boolean supersededAfter = !dataOpenDeletionTime.supersedes(activeDeletionTime);
                        // If a range open was not issued because it was superseded and the deletion isn't superseded any more, we need to open it now.
                        if (supersededBefore && !supersededAfter)
                            next = new RangeTombstoneBoundMarker(((RangeTombstoneMarker) tombNext).closeBound(false).invert(), dataOpenDeletionTime);
                        // If the deletion begins to be superseded, we don't close the range yet. This can save us a close/open pair if it ends after the superseding range.
                    }
                }

                if (next instanceof RangeTombstoneMarker)
                    openDeletionTime = updateOpenDeletionTime(openDeletionTime, next);

                if (cmp <= 0)
                    dataNext = advance(wrapped);
                if (cmp >= 0)
                    tombNext = advance(tombSource);
            }
            return next != null;
        }

        protected Row garbageFilterRow(Row dataRow, Row tombRow)
        {
            if (cellLevelGC)
            {
                return Rows.removeShadowedCells(dataRow, tombRow, activeDeletionTime);
            }
            else
            {
                DeletionTime deletion = Ordering.natural().max(tombRow.deletion().time(), activeDeletionTime);
                return dataRow.filter(cf, deletion, false, metadata);
            }
        }

        /**
         * Decide how to act on a tombstone marker from the input iterator. We can decide what to issue depending on
         * whether or not the ranges before and after the marker are superseded/live -- if none are, we can reuse the
         * marker; if both are, the marker can be ignored; otherwise we issue a corresponding start/end marker.
         */
        private RangeTombstoneMarker processDataMarker()
        {
            dataOpenDeletionTime = updateOpenDeletionTime(dataOpenDeletionTime, dataNext);
            boolean supersededBefore = openDeletionTime.isLive();
            boolean supersededAfter = !dataOpenDeletionTime.supersedes(activeDeletionTime);
            RangeTombstoneMarker marker = (RangeTombstoneMarker) dataNext;
            if (!supersededBefore)
                if (!supersededAfter)
                    return marker;
                else
                    return new RangeTombstoneBoundMarker(marker.closeBound(false), marker.closeDeletionTime(false));
            else
                if (!supersededAfter)
                    return new RangeTombstoneBoundMarker(marker.openBound(false), marker.openDeletionTime(false));
                else
                    return null;
        }

        @Override
        public Unfiltered next()
        {
            if (!hasNext())
                throw new IllegalStateException();

            Unfiltered v = next;
            next = null;
            return v;
        }

        private DeletionTime updateOpenDeletionTime(DeletionTime openDeletionTime, Unfiltered next)
        {
            RangeTombstoneMarker marker = (RangeTombstoneMarker) next;
            assert openDeletionTime.isLive() == !marker.isClose(false);
            assert openDeletionTime.isLive() || openDeletionTime.equals(marker.closeDeletionTime(false));
            return marker.isOpen(false) ? marker.openDeletionTime(false) : DeletionTime.LIVE;
        }
    }

    /**
     * Partition transformation applying GarbageSkippingUnfilteredRowIterator, obtaining tombstone sources for each
     * partition using the controller's shadowSources method.
     */
    private static class GarbageSkipper extends Transformation<UnfilteredRowIterator>
    {
        final AbstractCompactionController controller;
        final boolean cellLevelGC;

        private GarbageSkipper(AbstractCompactionController controller)
        {
            this.controller = controller;
            cellLevelGC = controller.tombstoneOption == TombstoneOption.CELL;
        }

        @Override
        protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
        {
            Iterable<UnfilteredRowIterator> sources = controller.shadowSources(partition.partitionKey(), !cellLevelGC);
            if (sources == null)
                return partition;

            List<UnfilteredRowIterator> iters = new ArrayList<>();
            for (UnfilteredRowIterator iter : sources)
            {
                if (!iter.isEmpty())
                    iters.add(iter);
                else
                    iter.close();
            }
            if (iters.isEmpty())
                return partition;

            return new GarbageSkippingUnfilteredRowIterator(partition, UnfilteredRowIterators.merge(iters), cellLevelGC);
        }
    }

    private class PaxosPurger extends Transformation<UnfilteredRowIterator>
    {
        private final long nowInSec;
        private final long paxosPurgeGraceMicros = DatabaseDescriptor.getPaxosPurgeGrace(MICROSECONDS);
        private final Map<TableId, PaxosRepairHistory.Searcher> tableIdToHistory = new HashMap<>();

        private Token currentToken;
        private int compactedUnfiltered;

        private PaxosPurger(long nowInSec)
        {
            this.nowInSec = nowInSec;
        }

        protected void onEmptyPartitionPostPurge(DecoratedKey key)
        {
            if (type == OperationType.COMPACTION)
                controller.cfs.invalidateCachedPartition(key);
        }

        protected void updateProgress()
        {
            if ((++compactedUnfiltered) % UNFILTERED_TO_UPDATE_PROGRESS == 0)
                updateBytesRead();
        }

        @Override
        protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
        {
            currentToken = partition.partitionKey().getToken();
            UnfilteredRowIterator purged = Transformation.apply(partition, this);
            if (purged.isEmpty())
            {
                onEmptyPartitionPostPurge(purged.partitionKey());
                purged.close();
                return null;
            }
            return purged;
        }

        @Override
        protected Row applyToRow(Row row)
        {
            updateProgress();
            TableId tableId = PaxosRows.getTableId(row);

            switch (paxosStatePurging())
            {
                default: throw new AssertionError();
                case legacy:
                case gc_grace:
                {
                    TableMetadata metadata = Schema.instance.getTableMetadata(tableId);
                    return row.purgeDataOlderThan(TimeUnit.SECONDS.toMicros(nowInSec - (metadata == null
                                                                                        ? (3 * 3600)
                                                                                        : metadata.params.gcGraceSeconds)), false);
                }
                case repaired:
                {
                    PaxosRepairHistory.Searcher history = tableIdToHistory.computeIfAbsent(tableId, find -> {
                        TableMetadata metadata = Schema.instance.getTableMetadata(find);
                        if (metadata == null)
                            return null;
                        return Keyspace.openAndGetStore(metadata).getPaxosRepairHistory().searcher();
                    });

                    return history == null
                           ? row
                           : row.purgeDataOlderThan(history.ballotForToken(currentToken).unixMicros() - paxosPurgeGraceMicros, false);
                }
            }
        }
    }

    private static class AbortableUnfilteredPartitionTransformation extends Transformation<UnfilteredRowIterator>
    {
        private final AbortableUnfilteredRowTransformation abortableIter;

        private AbortableUnfilteredPartitionTransformation(CompactionIterator iter)
        {
            this.abortableIter = new AbortableUnfilteredRowTransformation(iter);
        }

        @Override
        protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
        {
            if (abortableIter.iter.isStopRequested())
                throw new CompactionInterruptedException(abortableIter.iter.getCompactionInfo());
            return Transformation.apply(partition, abortableIter);
        }
    }

    private static class AbortableUnfilteredRowTransformation extends Transformation<UnfilteredRowIterator>
    {
        private final CompactionIterator iter;

        private AbortableUnfilteredRowTransformation(CompactionIterator iter)
        {
            this.iter = iter;
        }

        public Row applyToRow(Row row)
        {
            if (iter.isStopRequested())
                throw new CompactionInterruptedException(iter.getCompactionInfo());
            return row;
        }
    }

    private static boolean isPaxos(ColumnFamilyStore cfs)
    {
        return cfs.name.equals(SystemKeyspace.PAXOS) && cfs.getKeyspaceName().equals(SchemaConstants.SYSTEM_KEYSPACE_NAME);
    }
}