org.apache.cassandra.db.commitlog.CommitLogReplayer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.
There is a newer version: 5.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.cassandra.db.commitlog;

import java.io.File;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Predicate;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;

import org.apache.commons.lang3.StringUtils;
import org.cliffc.high_scale_lib.NonBlockingHashSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.concurrent.Stage;
import org.apache.cassandra.config.Config;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.Keyspace;
import org.apache.cassandra.db.Mutation;
import org.apache.cassandra.db.SystemKeyspace;
import org.apache.cassandra.db.partitions.PartitionUpdate;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.io.util.RandomAccessReader;
import org.apache.cassandra.schema.Schema;
import org.apache.cassandra.schema.SchemaConstants;
import org.apache.cassandra.schema.TableId;
import org.apache.cassandra.schema.TableMetadataRef;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.WrappedRunnable;

public class CommitLogReplayer implements CommitLogReadHandler
{
    @VisibleForTesting
    public static long MAX_OUTSTANDING_REPLAY_BYTES = Long.getLong("cassandra.commitlog_max_outstanding_replay_bytes", 1024 * 1024 * 64);
    @VisibleForTesting
    public static MutationInitiator mutationInitiator = new MutationInitiator();
    static final String IGNORE_REPLAY_ERRORS_PROPERTY = Config.PROPERTY_PREFIX + "commitlog.ignorereplayerrors";
    private static final Logger logger = LoggerFactory.getLogger(CommitLogReplayer.class);
    private static final int MAX_OUTSTANDING_REPLAY_COUNT = Integer.getInteger(Config.PROPERTY_PREFIX + "commitlog_max_outstanding_replay_count", 1024);

    private final Set keyspacesReplayed;
    private final Queue> futures;

    private final AtomicInteger replayedCount;
    private final Map> cfPersisted;
    private final CommitLogPosition globalPosition;

    // Used to throttle speed of replay of mutations if we pass the max outstanding count
    private long pendingMutationBytes = 0;

    private final ReplayFilter replayFilter;
    private CommitLogArchiver archiver;

    @VisibleForTesting
    protected boolean sawCDCMutation;

    @VisibleForTesting
    protected CommitLogReader commitLogReader;

    CommitLogReplayer(CommitLog commitLog,
                      CommitLogPosition globalPosition,
                      Map> cfPersisted,
                      ReplayFilter replayFilter)
    {
        this.keyspacesReplayed = new NonBlockingHashSet<>();
        this.futures = new ArrayDeque<>();
        // count the number of replayed mutation. We don't really care about atomicity, but we need it to be a reference.
        this.replayedCount = new AtomicInteger();
        this.cfPersisted = cfPersisted;
        this.globalPosition = globalPosition;
        this.replayFilter = replayFilter;
        this.archiver = commitLog.archiver;
        this.commitLogReader = new CommitLogReader();
    }

    public static CommitLogReplayer construct(CommitLog commitLog, UUID localHostId)
    {
        // compute per-CF and global replay intervals
        Map> cfPersisted = new HashMap<>();
        ReplayFilter replayFilter = ReplayFilter.create();

        for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
        {
            // but, if we've truncated the cf in question, then we need to need to start replay after the truncation
            CommitLogPosition truncatedAt = SystemKeyspace.getTruncatedPosition(cfs.metadata.id);
            if (truncatedAt != null)
            {
                // Point in time restore is taken to mean that the tables need to be replayed even if they were
                // deleted at a later point in time. Any truncation record after that point must thus be cleared prior
                // to replay (CASSANDRA-9195).
                // truncatedTime is millseconds level but restoreTime is microlevel
                long restoreTime = commitLog.archiver.restorePointInTimeInMicroseconds == Long.MAX_VALUE ? Long.MAX_VALUE : commitLog.archiver.restorePointInTimeInMicroseconds / 1000;
                long truncatedTime = SystemKeyspace.getTruncatedAt(cfs.metadata.id);
                if (truncatedTime > restoreTime)
                {
                    if (replayFilter.includes(cfs.metadata))
                    {
                        logger.info("Restore point in time is before latest truncation of table {}.{}. Clearing truncation record.",
                                    cfs.metadata.keyspace,
                                    cfs.metadata.name);
                        SystemKeyspace.removeTruncationRecord(cfs.metadata.id);
                        truncatedAt = null;
                    }
                }
            }

            IntervalSet filter = persistedIntervals(cfs.getLiveSSTables(), truncatedAt, localHostId);
            cfPersisted.put(cfs.metadata.id, filter);
        }
        CommitLogPosition globalPosition = firstNotCovered(cfPersisted.values());
        logger.debug("Global replay position is {} from columnfamilies {}", globalPosition, FBUtilities.toString(cfPersisted));
        return new CommitLogReplayer(commitLog, globalPosition, cfPersisted, replayFilter);
    }

    public void replayPath(File file, boolean tolerateTruncation) throws IOException
    {
        sawCDCMutation = false;
        commitLogReader.readCommitLogSegment(this, file, globalPosition, CommitLogReader.ALL_MUTATIONS, tolerateTruncation);
        if (sawCDCMutation)
            handleCDCReplayCompletion(file);
    }

    public void replayFiles(File[] clogs) throws IOException
    {
        List filteredLogs = CommitLogReader.filterCommitLogFiles(clogs);
        int i = 0;
        for (File file: filteredLogs)
        {
            i++;
            sawCDCMutation = false;
            commitLogReader.readCommitLogSegment(this, file, globalPosition, i == filteredLogs.size());
            if (sawCDCMutation)
                handleCDCReplayCompletion(file);
        }
    }


    /**
     * Upon replay completion, CDC needs to hard-link files in the CDC folder and calculate index files so consumers can
     * begin their work.
     */
    private void handleCDCReplayCompletion(File f) throws IOException
    {
        // Can only reach this point if CDC is enabled, thus we have a CDCSegmentManager
        ((CommitLogSegmentManagerCDC)CommitLog.instance.segmentManager).addCDCSize(f.length());

        File dest = new File(DatabaseDescriptor.getCDCLogLocation(), f.getName());

        // If hard link already exists, assume it's from a previous node run. If people are mucking around in the cdc_raw
        // directory that's on them.
        if (!dest.exists())
            FileUtils.createHardLink(f, dest);

        // The reader has already verified we can deserialize the descriptor.
        CommitLogDescriptor desc;
        try(RandomAccessReader reader = RandomAccessReader.open(f))
        {
            desc = CommitLogDescriptor.readHeader(reader, DatabaseDescriptor.getEncryptionContext());
            assert desc != null;
            assert f.length() < Integer.MAX_VALUE;
            CommitLogSegment.writeCDCIndexFile(desc, (int)f.length(), true);
        }
    }


    /**
     * Flushes all keyspaces associated with this replayer in parallel, blocking until their flushes are complete.
     * @return the number of mutations replayed
     */
    public int blockForWrites()
    {
        for (Map.Entry entry : commitLogReader.getInvalidMutations())
            logger.warn("Skipped {} mutations from unknown (probably removed) CF with id {}", entry.getValue(), entry.getKey());

        // wait for all the writes to finish on the mutation stage
        FBUtilities.waitOnFutures(futures);
        logger.trace("Finished waiting on mutations from recovery");

        // flush replayed keyspaces
        futures.clear();
        boolean flushingSystem = false;

        List> futures = new ArrayList>();
        for (Keyspace keyspace : keyspacesReplayed)
        {
            if (keyspace.getName().equals(SchemaConstants.SYSTEM_KEYSPACE_NAME))
                flushingSystem = true;

            futures.addAll(keyspace.flush());
        }

        // also flush batchlog incase of any MV updates
        if (!flushingSystem)
            futures.add(Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).forceFlush());

        FBUtilities.waitOnFutures(futures);

        return replayedCount.get();
    }

    /*
     * Wrapper around initiating mutations read from the log to make it possible
     * to spy on initiated mutations for test
     */
    @VisibleForTesting
    public static class MutationInitiator
    {
        protected Future initiateMutation(final Mutation mutation,
                                                   final long segmentId,
                                                   final int serializedSize,
                                                   final int entryLocation,
                                                   final CommitLogReplayer commitLogReplayer)
        {
            Runnable runnable = new WrappedRunnable()
            {
                public void runMayThrow()
                {
                    if (Schema.instance.getKeyspaceMetadata(mutation.getKeyspaceName()) == null)
                        return;
                    if (commitLogReplayer.pointInTimeExceeded(mutation))
                        return;

                    final Keyspace keyspace = Keyspace.open(mutation.getKeyspaceName());

                    // Rebuild the mutation, omitting column families that
                    //    a) the user has requested that we ignore,
                    //    b) have already been flushed,
                    // or c) are part of a cf that was dropped.
                    // Keep in mind that the cf.name() is suspect. do every thing based on the cfid instead.
                    Mutation.PartitionUpdateCollector newPUCollector = null;
                    for (PartitionUpdate update : commitLogReplayer.replayFilter.filter(mutation))
                    {
                        if (Schema.instance.getTableMetadata(update.metadata().id) == null)
                            continue; // dropped

                        // replay if current segment is newer than last flushed one or,
                        // if it is the last known segment, if we are after the commit log segment position
                        if (commitLogReplayer.shouldReplay(update.metadata().id, new CommitLogPosition(segmentId, entryLocation)))
                        {
                            if (newPUCollector == null)
                                newPUCollector = new Mutation.PartitionUpdateCollector(mutation.getKeyspaceName(), mutation.key());
                            newPUCollector.add(update);
                            commitLogReplayer.replayedCount.incrementAndGet();
                        }
                    }
                    if (newPUCollector != null)
                    {
                        assert !newPUCollector.isEmpty();

                        Keyspace.open(newPUCollector.getKeyspaceName()).apply(newPUCollector.build(), false, true, false);
                        commitLogReplayer.keyspacesReplayed.add(keyspace);
                    }
                }
            };
            return Stage.MUTATION.submit(runnable, serializedSize);
        }
    }

    /**
     * A set of known safe-to-discard commit log replay positions, based on
     * the range covered by on disk sstables and those prior to the most recent truncation record
     */
    public static IntervalSet persistedIntervals(Iterable onDisk,
                                                                    CommitLogPosition truncatedAt,
                                                                    UUID localhostId)
    {
        IntervalSet.Builder builder = new IntervalSet.Builder<>();
        List skippedSSTables = new ArrayList<>();
        for (SSTableReader reader : onDisk)
        {
            UUID originatingHostId = reader.getSSTableMetadata().originatingHostId;
            if (originatingHostId != null && originatingHostId.equals(localhostId))
                builder.addAll(reader.getSSTableMetadata().commitLogIntervals);
            else
                skippedSSTables.add(reader.getFilename());
        }

        if (!skippedSSTables.isEmpty()) {
            logger.warn("Origin of {} sstables is unknown or doesn't match the local node; commitLogIntervals for them were ignored", skippedSSTables.size());
            logger.debug("Ignored commitLogIntervals from the following sstables: {}", skippedSSTables);
        }

        if (truncatedAt != null)
            builder.add(CommitLogPosition.NONE, truncatedAt);
        return builder.build();
    }

    /**
     * Find the earliest commit log position that is not covered by the known flushed ranges for some table.
     *
     * For efficiency this assumes that the first contiguously flushed interval we know of contains the moment that the
     * given table was constructed* and hence we can start replay from the end of that interval.
     *
     * If such an interval is not known, we must replay from the beginning.
     *
     * * This is not true only until if the very first flush of a table stalled or failed, while the second or latter
     *   succeeded. The chances of this happening are at most very low, and if the assumption does prove to be
     *   incorrect during replay there is little chance that the affected deployment is in production.
     */
    public static CommitLogPosition firstNotCovered(Collection> ranges)
    {
        return ranges.stream()
                .map(intervals -> Iterables.getFirst(intervals.ends(), CommitLogPosition.NONE))
                .min(Ordering.natural())
                .get(); // iteration is per known-CF, there must be at least one.
    }

    abstract static class ReplayFilter
    {
        public abstract Iterable filter(Mutation mutation);

        public abstract boolean includes(TableMetadataRef metadata);

        public static ReplayFilter create()
        {
            // If no replaylist is supplied an empty array of strings is used to replay everything.
            if (System.getProperty("cassandra.replayList") == null)
                return new AlwaysReplayFilter();

            Multimap toReplay = HashMultimap.create();
            for (String rawPair : System.getProperty("cassandra.replayList").split(","))
            {
                String[] pair = StringUtils.split(rawPair.trim(), '.');
                if (pair.length != 2)
                    throw new IllegalArgumentException("Each table to be replayed must be fully qualified with keyspace name, e.g., 'system.peers'");

                Keyspace ks = Schema.instance.getKeyspaceInstance(pair[0]);
                if (ks == null)
                    throw new IllegalArgumentException("Unknown keyspace " + pair[0]);
                ColumnFamilyStore cfs = ks.getColumnFamilyStore(pair[1]);
                if (cfs == null)
                    throw new IllegalArgumentException(String.format("Unknown table %s.%s", pair[0], pair[1]));

                toReplay.put(pair[0], pair[1]);
            }
            return new CustomReplayFilter(toReplay);
        }
    }

    private static class AlwaysReplayFilter extends ReplayFilter
    {
        public Iterable filter(Mutation mutation)
        {
            return mutation.getPartitionUpdates();
        }

        public boolean includes(TableMetadataRef metadata)
        {
            return true;
        }
    }

    private static class CustomReplayFilter extends ReplayFilter
    {
        private Multimap toReplay;

        public CustomReplayFilter(Multimap toReplay)
        {
            this.toReplay = toReplay;
        }

        public Iterable filter(Mutation mutation)
        {
            final Collection cfNames = toReplay.get(mutation.getKeyspaceName());
            if (cfNames == null)
                return Collections.emptySet();

            return Iterables.filter(mutation.getPartitionUpdates(), new Predicate()
            {
                public boolean apply(PartitionUpdate upd)
                {
                    return cfNames.contains(upd.metadata().name);
                }
            });
        }

        public boolean includes(TableMetadataRef metadata)
        {
            return toReplay.containsEntry(metadata.keyspace, metadata.name);
        }
    }

    /**
     * consult the known-persisted ranges for our sstables;
     * if the position is covered by one of them it does not need to be replayed
     *
     * @return true iff replay is necessary
     */
    private boolean shouldReplay(TableId tableId, CommitLogPosition position)
    {
        return !cfPersisted.get(tableId).contains(position);
    }

    protected boolean pointInTimeExceeded(Mutation fm)
    {
        for (PartitionUpdate upd : fm.getPartitionUpdates())
        {
            if (archiver.precision.toMicros(upd.maxTimestamp()) > archiver.restorePointInTimeInMicroseconds)
                return true;
        }
        return false;
    }

    public void handleMutation(Mutation m, int size, int entryLocation, CommitLogDescriptor desc)
    {
        if (DatabaseDescriptor.isCDCEnabled() && m.trackedByCDC())
            sawCDCMutation = true;

        pendingMutationBytes += size;
        futures.offer(mutationInitiator.initiateMutation(m,
                                                         desc.id,
                                                         size,
                                                         entryLocation,
                                                         this));
        // If there are finished mutations, or too many outstanding bytes/mutations
        // drain the futures in the queue
        while (futures.size() > MAX_OUTSTANDING_REPLAY_COUNT
               || pendingMutationBytes > MAX_OUTSTANDING_REPLAY_BYTES
               || (!futures.isEmpty() && futures.peek().isDone()))
        {
            pendingMutationBytes -= FBUtilities.waitOnFuture(futures.poll());
        }
    }

    public boolean shouldSkipSegmentOnError(CommitLogReadException exception) throws IOException
    {
        if (exception.permissible)
            logger.error("Ignoring commit log replay error likely due to incomplete flush to disk", exception);
        else if (Boolean.getBoolean(IGNORE_REPLAY_ERRORS_PROPERTY))
            logger.error("Ignoring commit log replay error", exception);
        else if (!CommitLog.handleCommitError("Failed commit log replay", exception))
        {
            logger.error("Replay stopped. If you wish to override this error and continue starting the node ignoring " +
                         "commit log replay problems, specify -D" + IGNORE_REPLAY_ERRORS_PROPERTY + "=true " +
                         "on the command line");
            throw new CommitLogReplayException(exception.getMessage(), exception);
        }
        return false;
    }

    /**
     * The logic for whether or not we throw on an error is identical for the replayer between recoverable or non.
     */
    public void handleUnrecoverableError(CommitLogReadException exception) throws IOException
    {
        // Don't care about return value, use this simply to throw exception as appropriate.
        shouldSkipSegmentOnError(exception);
    }

    @SuppressWarnings("serial")
    public static class CommitLogReplayException extends IOException
    {
        public CommitLogReplayException(String message, Throwable cause)
        {
            super(message, cause);
        }

        public CommitLogReplayException(String message)
        {
            super(message);
        }
    }
}