All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.cassandra.io.sstable.SSTableRewriter Maven / Gradle / Ivy

Go to download

The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.

There is a newer version: 2.1.07
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.io.sstable;

import java.util.*;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Functions;

import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.DataTracker;
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.db.RowIndexEntry;
import org.apache.cassandra.db.compaction.AbstractCompactedRow;
import org.apache.cassandra.utils.CLibrary;
import org.apache.cassandra.utils.FBUtilities;

/**
 * Wraps one or more writers as output for rewriting one or more readers: every sstable_preemptive_open_interval_in_mb
 * we look in the summary we're collecting for the latest writer for the penultimate key that we know to have been fully
 * flushed to the index file, and then double check that the key is fully present in the flushed data file.
 * Then we move the starts of each reader forwards to that point, replace them in the datatracker, and attach a runnable
 * for on-close (i.e. when all references expire) that drops the page cache prior to that key position
 *
 * hard-links are created for each partially written sstable so that readers opened against them continue to work past
 * the rename of the temporary file, which is deleted once all readers against the hard-link have been closed.
 * If for any reason the writer is rolled over, we immediately rename and fully expose the completed file in the DataTracker.
 *
 * On abort we restore the original lower bounds to the existing readers and delete any temporary files we had in progress,
 * but leave any hard-links in place for the readers we opened to cleanup when they're finished as we would had we finished
 * successfully.
 */
public class SSTableRewriter
{
    private static long preemptiveOpenInterval;
    static
    {
        long interval = DatabaseDescriptor.getSSTablePreempiveOpenIntervalInMB() * (1L << 20);
        if (interval < 0)
            interval = Long.MAX_VALUE;
        preemptiveOpenInterval = interval;
    }

    @VisibleForTesting
    static void overrideOpenInterval(long size)
    {
        preemptiveOpenInterval = size;
    }

    private final DataTracker dataTracker;
    private final ColumnFamilyStore cfs;

    private final long maxAge;
    private final List finished = new ArrayList<>();
    private final Set rewriting; // the readers we are rewriting (updated as they are replaced)
    private final Map originalStarts = new HashMap<>(); // the start key for each reader we are rewriting
    private final Map fileDescriptors = new HashMap<>(); // the file descriptors for each reader descriptor we are rewriting

    private SSTableReader currentlyOpenedEarly; // the reader for the most recent (re)opening of the target file
    private long currentlyOpenedEarlyAt; // the position (in MB) in the target file we last (re)opened at

    private final Queue finishedEarly = new ArrayDeque<>();
    // as writers are closed from finishedEarly, their last readers are moved
    // into discard, so that abort can cleanup after us safely
    private final List discard = new ArrayList<>();
    private final boolean isOffline; // true for operations that are performed without Cassandra running (prevents updates of DataTracker)

    private SSTableWriter writer;
    private Map cachedKeys = new HashMap<>();

    public SSTableRewriter(ColumnFamilyStore cfs, Set rewriting, long maxAge, boolean isOffline)
    {
        this.rewriting = rewriting;
        for (SSTableReader sstable : rewriting)
        {
            originalStarts.put(sstable.descriptor, sstable.first);
            fileDescriptors.put(sstable.descriptor, CLibrary.getfd(sstable.getFilename()));
        }
        this.dataTracker = cfs.getDataTracker();
        this.cfs = cfs;
        this.maxAge = maxAge;
        this.isOffline = isOffline;
    }

    public SSTableWriter currentWriter()
    {
        return writer;
    }

    public RowIndexEntry append(AbstractCompactedRow row)
    {
        // we do this before appending to ensure we can resetAndTruncate() safely if the append fails
        maybeReopenEarly(row.key);
        RowIndexEntry index = writer.append(row);
        if (!isOffline)
        {
            if (index == null)
            {
                cfs.invalidateCachedRow(row.key);
            }
            else
            {
                boolean save = false;
                for (SSTableReader reader : rewriting)
                {
                    if (reader.getCachedPosition(row.key, false) != null)
                    {
                        save = true;
                        break;
                    }
                }
                if (save)
                    cachedKeys.put(row.key, index);
            }
        }
        return index;
    }

    // attempts to append the row, if fails resets the writer position
    public RowIndexEntry tryAppend(AbstractCompactedRow row)
    {
        writer.mark();
        try
        {
            return append(row);
        }
        catch (Throwable t)
        {
            writer.resetAndTruncate();
            throw t;
        }
    }

    private void maybeReopenEarly(DecoratedKey key)
    {
        if (!FBUtilities.isWindows() && writer.getFilePointer() - currentlyOpenedEarlyAt > preemptiveOpenInterval)
        {
            if (isOffline)
            {
                for (SSTableReader reader : rewriting)
                {
                    RowIndexEntry index = reader.getPosition(key, SSTableReader.Operator.GE);
                    CLibrary.trySkipCache(fileDescriptors.get(reader.descriptor), 0, index == null ? 0 : index.position);
                }
            }
            else
            {
                SSTableReader reader = writer.openEarly(maxAge);
                if (reader != null)
                {
                    replaceEarlyOpenedFile(currentlyOpenedEarly, reader);
                    currentlyOpenedEarly = reader;
                    currentlyOpenedEarlyAt = writer.getFilePointer();
                    moveStarts(reader, Functions.constant(reader.last), false);
                }
            }
        }
    }

    public void abort()
    {
        switchWriter(null, true);
        moveStarts(null, Functions.forMap(originalStarts), true);

        // remove already completed SSTables
        for (SSTableReader sstable : finished)
        {
            sstable.markObsolete();
            sstable.sharedRef().release();
        }

        // abort the writers
        for (Finished finished : finishedEarly)
        {
            boolean opened = finished.reader != null;
            finished.writer.abort(!opened);
            if (opened)
            {
                // if we've already been opened, add ourselves to the discard pile
                discard.add(finished.reader);
                finished.reader.markObsolete();
            }
        }

        replaceWithFinishedReaders(Collections.emptyList());
    }

    /**
     * Replace the readers we are rewriting with cloneWithNewStart, reclaiming any page cache that is no longer
     * needed, and transferring any key cache entries over to the new reader, expiring them from the old. if reset
     * is true, we are instead restoring the starts of the readers from before the rewriting began
     *
     * note that we replace an existing sstable with a new *instance* of the same sstable, the replacement
     * sstable .equals() the old one, BUT, it is a new instance, so, for example, since we releaseReference() on the old
     * one, the old *instance* will have reference count == 0 and if we were to start a new compaction with that old
     * instance, we would get exceptions.
     *
     * @param newReader the rewritten reader that replaces them for this region
     * @param newStarts a function mapping a reader's descriptor to their new start value
     * @param reset true iff we are restoring earlier starts (increasing the range over which they are valid)
     */
    private void moveStarts(SSTableReader newReader, Function newStarts, boolean reset)
    {
        if (isOffline)
            return;
        List toReplace = new ArrayList<>();
        List replaceWith = new ArrayList<>();
        final List invalidateKeys = new ArrayList<>();
        if (!reset)
        {
            invalidateKeys.addAll(cachedKeys.keySet());
            for (Map.Entry cacheKey : cachedKeys.entrySet())
                newReader.cacheKey(cacheKey.getKey(), cacheKey.getValue());
        }
        cachedKeys = new HashMap<>();
        for (final SSTableReader sstable : rewriting)
        {
            DecoratedKey newStart = newStarts.apply(sstable.descriptor);
            assert newStart != null;
            if (sstable.first.compareTo(newStart) < 0 || (reset && newStart != sstable.first))
            {
                toReplace.add(sstable);
                // we call getCurrentReplacement() to support multiple rewriters operating over the same source readers at once.
                // note: only one such writer should be written to at any moment
                replaceWith.add(sstable.getCurrentReplacement().cloneWithNewStart(newStart, new Runnable()
                {
                    public void run()
                    {
                        // this is somewhat racey, in that we could theoretically be closing this old reader
                        // when an even older reader is still in use, but it's not likely to have any major impact
                        for (DecoratedKey key : invalidateKeys)
                            sstable.invalidateCacheKey(key);
                    }
                }));
            }
        }
        cfs.getDataTracker().replaceWithNewInstances(toReplace, replaceWith);
        rewriting.removeAll(toReplace);
        rewriting.addAll(replaceWith);
    }

    private void replaceEarlyOpenedFile(SSTableReader toReplace, SSTableReader replaceWith)
    {
        if (isOffline)
            return;
        Set toReplaceSet;
        if (toReplace != null)
        {
            toReplace.setReplacedBy(replaceWith);
            toReplaceSet = Collections.singleton(toReplace);
        }
        else
        {
            dataTracker.markCompacting(Collections.singleton(replaceWith));
            toReplaceSet = Collections.emptySet();
        }
        dataTracker.replaceEarlyOpenedFiles(toReplaceSet, Collections.singleton(replaceWith));
    }

    public void switchWriter(SSTableWriter newWriter)
    {
        switchWriter(newWriter, false);
    }

    private void switchWriter(SSTableWriter newWriter, boolean abort)
    {
        if (writer == null)
        {
            writer = newWriter;
            return;
        }

        // we leave it as a tmp file, but we open it and add it to the dataTracker
        if (writer.getFilePointer() != 0 && !abort)
        {
            SSTableReader reader = writer.finish(SSTableWriter.FinishType.EARLY, maxAge, -1);
            replaceEarlyOpenedFile(currentlyOpenedEarly, reader);
            moveStarts(reader, Functions.constant(reader.last), false);
            finishedEarly.add(new Finished(writer, reader));
        }
        else
        {
            writer.abort();
        }
        currentlyOpenedEarly = null;
        currentlyOpenedEarlyAt = 0;
        writer = newWriter;
    }

    public List finish()
    {
        return finish(-1);
    }

    /**
     * Finishes the new file(s)
     *
     * Creates final files, adds the new files to the dataTracker (via replaceReader).
     *
     * We add them to the tracker to be able to get rid of the tmpfiles
     *
     * It is up to the caller to do the compacted sstables replacement
     * gymnastics (ie, call DataTracker#markCompactedSSTablesReplaced(..))
     *
     *
     * @param repairedAt the repair time, -1 if we should use the time we supplied when we created
     *                   the SSTableWriter (and called rewriter.switchWriter(..)), actual time if we want to override the
     *                   repair time.
     */
    public List finish(long repairedAt)
    {
        return finishAndMaybeThrow(repairedAt, false, false);
    }

    @VisibleForTesting
    void finishAndThrow(boolean throwEarly)
    {
        finishAndMaybeThrow(-1, throwEarly, !throwEarly);
    }

    private List finishAndMaybeThrow(long repairedAt, boolean throwEarly, boolean throwLate)
    {
        List newReaders = new ArrayList<>();
        switchWriter(null, false);

        if (throwEarly)
            throw new RuntimeException("exception thrown early in finish, for testing");

        while (!finishedEarly.isEmpty())
        {
            Finished f = finishedEarly.poll();
            if (f.writer.getFilePointer() > 0)
            {
                if (f.reader != null)
                    discard.add(f.reader);

                SSTableReader newReader = f.writer.finish(SSTableWriter.FinishType.FINISH_EARLY, maxAge, repairedAt);

                if (f.reader != null)
                    f.reader.setReplacedBy(newReader);

                finished.add(newReader);
                newReaders.add(newReader);
            }
            else
            {
                f.writer.abort(true);
                assert f.reader == null;
            }
        }

        if (throwLate)
            throw new RuntimeException("exception thrown after all sstables finished, for testing");

        replaceWithFinishedReaders(newReaders);
        return finished;
    }

    // cleanup all our temporary readers and swap in our new ones
    private void replaceWithFinishedReaders(List finished)
    {
        if (isOffline)
        {
            for (SSTableReader reader : discard)
            {
                if (reader.getCurrentReplacement() == null)
                    reader.markObsolete();
                reader.sharedRef().release();
            }
        }
        else
        {
            dataTracker.replaceEarlyOpenedFiles(discard, finished);
            dataTracker.unmarkCompacting(discard);
        }
        discard.clear();
    }

    private static final class Finished
    {
        final SSTableWriter writer;
        final SSTableReader reader;

        private Finished(SSTableWriter writer, SSTableReader reader)
        {
            this.writer = writer;
            this.reader = reader;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy