All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.cassandra.db.LegacyLayout Maven / Gradle / Ivy

Go to download

The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.

There is a newer version: 5.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.db;

import java.io.DataInput;
import java.io.IOException;
import java.io.IOError;
import java.nio.ByteBuffer;
import java.security.MessageDigest;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

import org.apache.cassandra.cql3.ColumnIdentifier;
import org.apache.cassandra.cql3.SuperColumnCompatibility;
import org.apache.cassandra.config.SchemaConstants;
import org.apache.cassandra.utils.AbstractIterator;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.PeekingIterator;

import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.ColumnDefinition;
import org.apache.cassandra.db.filter.ColumnFilter;
import org.apache.cassandra.db.filter.DataLimits;
import org.apache.cassandra.db.rows.*;
import org.apache.cassandra.db.partitions.*;
import org.apache.cassandra.db.context.CounterContext;
import org.apache.cassandra.db.marshal.*;
import org.apache.cassandra.io.util.DataInputPlus;
import org.apache.cassandra.io.util.DataOutputPlus;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.utils.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.google.common.collect.Iterables.all;
import static org.apache.cassandra.utils.ByteBufferUtil.bytes;

/**
 * Functions to deal with the old format.
 */
public abstract class LegacyLayout
{
    private static final Logger logger = LoggerFactory.getLogger(LegacyLayout.class);
    private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 1L, TimeUnit.MINUTES);

    public final static int MAX_CELL_NAME_LENGTH = FBUtilities.MAX_UNSIGNED_SHORT;

    public final static int STATIC_PREFIX = 0xFFFF;

    public final static int DELETION_MASK        = 0x01;
    public final static int EXPIRATION_MASK      = 0x02;
    public final static int COUNTER_MASK         = 0x04;
    public final static int COUNTER_UPDATE_MASK  = 0x08;
    private final static int RANGE_TOMBSTONE_MASK = 0x10;

    // Used in decodeBound if the number of components in the legacy bound is greater than the clustering size,
    // indicating a complex column deletion (i.e. a collection tombstone), but the referenced column is either
    // not present in the current table metadata, or is not currently a complex column. In that case, we'll
    // check the dropped columns for the table which should contain the previous column definition. If that
    // previous definition is also not complex (indicating that the column may have been dropped and re-added
    // with different types multiple times), we use this fake definition to ensure that the complex deletion
    // can be safely processed. This resulting deletion should be filtered out of any row created by a
    // CellGrouper by the dropped column check, but this gives us an extra level of confidence as that check
    // is timestamp based and so is fallible in the face of clock drift.
    private static final ColumnDefinition INVALID_DROPPED_COMPLEX_SUBSTITUTE_COLUMN =
        new ColumnDefinition("",
                             "",
                             ColumnIdentifier.getInterned(ByteBufferUtil.EMPTY_BYTE_BUFFER, UTF8Type.instance),
                             SetType.getInstance(UTF8Type.instance, true),
                             ColumnDefinition.NO_POSITION,
                             ColumnDefinition.Kind.REGULAR);

    private LegacyLayout() {}

    public static AbstractType makeLegacyComparator(CFMetaData metadata)
    {
        ClusteringComparator comparator = metadata.comparator;
        if (!metadata.isCompound())
        {
            assert comparator.size() == 1;
            return comparator.subtype(0);
        }

        boolean hasCollections = metadata.hasCollectionColumns() || metadata.hasDroppedCollectionColumns();
        List> types = new ArrayList<>(comparator.size() + (metadata.isDense() ? 0 : 1) + (hasCollections ? 1 : 0));

        types.addAll(comparator.subtypes());

        if (!metadata.isDense())
        {
            types.add(UTF8Type.instance);

            if (hasCollections)
            {
                Map defined = new HashMap<>();

                for (CFMetaData.DroppedColumn def : metadata.getDroppedColumns().values())
                    if (def.type instanceof CollectionType && def.type.isMultiCell())
                        defined.put(bytes(def.name), (CollectionType) def.type);

                for (ColumnDefinition def : metadata.partitionColumns())
                    if (def.type instanceof CollectionType && def.type.isMultiCell())
                        defined.put(def.name.bytes, (CollectionType) def.type);

                types.add(ColumnToCollectionType.getInstance(defined));
            }
        }
        return CompositeType.getInstance(types);
    }

    public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer cellname)
    throws UnknownColumnException
    {
        assert cellname != null;
        if (metadata.isSuper())
        {
            assert superColumnName != null;
            return decodeForSuperColumn(metadata, Clustering.make(superColumnName), cellname);
        }

        assert superColumnName == null;
        return decodeCellName(metadata, cellname);
    }

    private static LegacyCellName decodeForSuperColumn(CFMetaData metadata, Clustering clustering, ByteBuffer subcol)
    {
        ColumnDefinition def = metadata.getColumnDefinition(subcol);
        if (def != null)
        {
            // it's a statically defined subcolumn
            return new LegacyCellName(clustering, def, null);
        }

        def = metadata.compactValueColumn();
        assert def != null && def.type instanceof MapType;
        return new LegacyCellName(clustering, def, subcol);
    }

    public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer cellname) throws UnknownColumnException
    {
        return decodeCellName(metadata, cellname, false);
    }

    public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer cellname, boolean readAllAsDynamic) throws UnknownColumnException
    {
        Clustering clustering = decodeClustering(metadata, cellname);

        if (metadata.isSuper())
            return decodeForSuperColumn(metadata, clustering, CompositeType.extractComponent(cellname, 1));

        if (metadata.isDense() || (metadata.isCompactTable() && readAllAsDynamic))
            return new LegacyCellName(clustering, metadata.compactValueColumn(), null);

        ByteBuffer column = metadata.isCompound() ? CompositeType.extractComponent(cellname, metadata.comparator.size()) : cellname;
        if (column == null)
        {
            // Tables for composite 2ndary indexes used to be compound but dense, but we've transformed them into regular tables
            // (non compact ones) but with no regular column (i.e. we only care about the clustering). So we'll get here
            // in that case, and what we want to return is basically a row marker.
            if (metadata.partitionColumns().isEmpty())
                return new LegacyCellName(clustering, null, null);

            // Otherwise, we shouldn't get there
            throw new IllegalArgumentException("No column name component found in cell name");
        }

        // Row marker, this is ok
        if (!column.hasRemaining())
            return new LegacyCellName(clustering, null, null);

        ColumnDefinition def = metadata.getColumnDefinition(column);

        if (metadata.isCompactTable())
        {
            if (def == null || def.isPrimaryKeyColumn())
                // If it's a compact table, it means the column is in fact a "dynamic" one
                return new LegacyCellName(Clustering.make(column), metadata.compactValueColumn(), null);
        }
        else if (def == null)
        {
            throw new UnknownColumnException(metadata, column);
        }

        ByteBuffer collectionElement = metadata.isCompound() ? CompositeType.extractComponent(cellname, metadata.comparator.size() + 1) : null;
        if (collectionElement != null && def.type instanceof CollectionType)
        {
            ((CollectionType)def.type).nameComparator().validateIfFixedSize(collectionElement);
        }

        // Note that because static compact columns are translated to static defs in the new world order, we need to force a static
        // clustering if the definition is static (as it might not be in this case).
        return new LegacyCellName(def.isStatic() ? Clustering.STATIC_CLUSTERING : clustering, def, collectionElement);
    }

    public static LegacyBound decodeSliceBound(CFMetaData metadata, ByteBuffer bound, boolean isStart)
    {
        return decodeBound(metadata, bound, isStart, false);
    }

    public static LegacyBound decodeTombstoneBound(CFMetaData metadata, ByteBuffer bound, boolean isStart)
    {
        return decodeBound(metadata, bound, isStart, true);
    }

    private static LegacyBound decodeBound(CFMetaData metadata, ByteBuffer bound, boolean isStart, boolean isDeletion)
    {
        if (!bound.hasRemaining())
            return isStart ? LegacyBound.BOTTOM : LegacyBound.TOP;

        if (!metadata.isCompound())
        {
            // The non compound case is a lot easier, in that there is no EOC nor collection to worry about, so dealing
            // with that first.
            metadata.comparator.subtype(0).validateIfFixedSize(bound);
            return new LegacyBound(isStart ? ClusteringBound.inclusiveStartOf(bound) : ClusteringBound.inclusiveEndOf(bound), false, null);
        }

        int clusteringSize = metadata.comparator.size();

        boolean isStatic = metadata.isCompound() && CompositeType.isStaticName(bound);
        List components = CompositeType.splitName(bound);
        byte eoc = CompositeType.lastEOC(bound);
        for (int i=0; i= clusteringSize
                        && all(components.subList(0, clusteringSize), ByteBufferUtil.EMPTY_BYTE_BUFFER::equals));

        ColumnDefinition collectionName = null;
        if (components.size() > clusteringSize)
        {
            // For a deletion, there can be more components than the clustering size only in the case this is the
            // bound of a collection range tombstone. In such a case, there is exactly one more component, and that
            // component is the name of the collection being deleted, since we do not support collection range deletions.
            // If the bound is not part of a deletion, it is from slice query filter. The column name may be:
            //   - a valid, non-collection column; in this case we expect a single extra component
            //   - an empty buffer, representing a row marker; in this case we also expect a single extra empty component
            //   - a valid collection column and the first part of a cell path; in this case we expect exactly two extra components
            // In any of these slice cases, these items are unnecessary for the bound we construct,
            // so we can simply remove them, after corroborating we have encountered one of these scenario.
            assert !metadata.isCompactTable() : toDebugHex(components);

            // In all cases, the element straight after the clusterings should contain the name of a column.
            if (components.size() > clusteringSize + 1)
            {
                // we accept bounds from paging state that occur inside a complex column - in this case, we expect
                // two excess components, the first of which is a column name, the second a key into the collection
                if (isDeletion)
                    throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": deletion can have at most one extra component");

                if (clusteringSize + 2 != components.size())
                    throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": complex slices require exactly two extra components");

                // decode simply to verify that we have (or may have had) a complex column; we assume the collection key is valid
                decodeBoundLookupComplexColumn(metadata, components, clusteringSize, isStatic);
                components.remove(clusteringSize + 1);
            }
            else if (isDeletion)
            {
                collectionName = decodeBoundLookupComplexColumn(metadata, components, clusteringSize, isStatic);
            }
            else if (components.get(clusteringSize).hasRemaining())
            {
                decodeBoundVerifySimpleColumn(metadata, components, clusteringSize, isStatic);
            }
            components.remove(clusteringSize);
        }

        boolean isInclusive;
        if (isStart)
        {
            isInclusive = eoc <= 0;
        }
        else
        {
            isInclusive = eoc >= 0;

            // for an end bound, if we only have a prefix of all the components and the final EOC is zero,
            // then it should only match up to the prefix but no further, that is, it is an inclusive bound
            // of the exact prefix but an exclusive bound of anything beyond it, so adding an empty
            // composite value ensures this behavior, see CASSANDRA-12423 for more details
            if (eoc == 0 && components.size() < clusteringSize)
            {
                components.add(ByteBufferUtil.EMPTY_BYTE_BUFFER);
                isInclusive = false;
            }
        }

        ClusteringPrefix.Kind boundKind = ClusteringBound.boundKind(isStart, isInclusive);
        ClusteringBound cb = ClusteringBound.create(boundKind, components.toArray(new ByteBuffer[components.size()]));
        return new LegacyBound(cb, isStatic, collectionName);
    }

    // finds the simple column definition associated with components.get(clusteringSize)
    // if no such columns exists, or ever existed, we throw an exception; if we do not know, we return a dummy column definition
    private static ColumnDefinition decodeBoundLookupComplexColumn(CFMetaData metadata, List components, int clusteringSize, boolean isStatic)
    {
        ByteBuffer columnNameBytes = components.get(clusteringSize);
        ColumnDefinition columnName = metadata.getColumnDefinition(columnNameBytes);
        if (columnName == null || !columnName.isComplex())
        {
            columnName = metadata.getDroppedColumnDefinition(columnNameBytes, isStatic);
            // if no record of the column having ever existed is found, something is badly wrong
            if (columnName == null)
                throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": expected complex column at position " + clusteringSize);

            // if we do have a record of dropping this column but it wasn't previously complex, use a fake
            // column definition for safety (see the comment on the constant declaration for details)
            if (!columnName.isComplex())
                columnName = INVALID_DROPPED_COMPLEX_SUBSTITUTE_COLUMN;
        }

        return columnName;
    }

    // finds the simple column definition associated with components.get(clusteringSize)
    // if no such columns exists, and definitely never existed, we throw an exception
    private static void decodeBoundVerifySimpleColumn(CFMetaData metadata, List components, int clusteringSize, boolean isStatic)
    {
        ByteBuffer columnNameBytes = components.get(clusteringSize);
        ColumnDefinition columnName = metadata.getColumnDefinition(columnNameBytes);
        if (columnName == null || !columnName.isSimple())
        {
            columnName = metadata.getDroppedColumnDefinition(columnNameBytes, isStatic);
            // if no record of the column having ever existed is found, something is badly wrong
            if (columnName == null)
                throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": expected simple column at position " + clusteringSize);
        }
    }

    private static String toDebugHex(Collection buffers)
    {
        return buffers.stream().map(ByteBufferUtil::bytesToHex).collect(Collectors.joining());
    }

    public static ByteBuffer encodeBound(CFMetaData metadata, ClusteringBound bound, boolean isStart)
    {
        if (bound == ClusteringBound.BOTTOM || bound == ClusteringBound.TOP || metadata.comparator.size() == 0)
            return ByteBufferUtil.EMPTY_BYTE_BUFFER;

        ClusteringPrefix clustering = bound.clustering();

        if (!metadata.isCompound())
        {
            assert clustering.size() == 1;
            return clustering.get(0);
        }

        CompositeType ctype = CompositeType.getInstance(metadata.comparator.subtypes());
        CompositeType.Builder builder = ctype.builder();
        for (int i = 0; i < clustering.size(); i++)
            builder.add(clustering.get(i));

        if (isStart)
            return bound.isInclusive() ? builder.build() : builder.buildAsEndOfRange();
        else
            return bound.isInclusive() ? builder.buildAsEndOfRange() : builder.build();
    }

    public static ByteBuffer encodeCellName(CFMetaData metadata, ClusteringPrefix clustering, ByteBuffer columnName, ByteBuffer collectionElement)
    {
        boolean isStatic = clustering == Clustering.STATIC_CLUSTERING;

        if (!metadata.isCompound())
        {
            if (isStatic)
                return columnName;

            assert clustering.size() == 1 : "Expected clustering size to be 1, but was " + clustering.size();
            return clustering.get(0);
        }

        // We use comparator.size() rather than clustering.size() because of static clusterings
        int clusteringSize = metadata.comparator.size();
        int size = clusteringSize + (metadata.isDense() ? 0 : 1) + (collectionElement == null ? 0 : 1);
        if (metadata.isSuper())
            size = clusteringSize + 1;
        ByteBuffer[] values = new ByteBuffer[size];
        for (int i = 0; i < clusteringSize; i++)
        {
            if (isStatic)
            {
                values[i] = ByteBufferUtil.EMPTY_BYTE_BUFFER;
                continue;
            }

            ByteBuffer v = clustering.get(i);
            // we can have null (only for dense compound tables for backward compatibility reasons) but that
            // means we're done and should stop there as far as building the composite is concerned.
            if (v == null)
                return CompositeType.build(Arrays.copyOfRange(values, 0, i));

            values[i] = v;
        }

        if (metadata.isSuper())
        {
            // We need to set the "column" (in thrift terms) name, i.e. the value corresponding to the subcomparator.
            // What it is depends if this a cell for a declared "static" column or a "dynamic" column part of the
            // super-column internal map.
            assert columnName != null; // This should never be null for supercolumns, see decodeForSuperColumn() above
            values[clusteringSize] = columnName.equals(SuperColumnCompatibility.SUPER_COLUMN_MAP_COLUMN)
                                   ? collectionElement
                                   : columnName;
        }
        else
        {
            if (!metadata.isDense())
                values[clusteringSize] = columnName;
            if (collectionElement != null)
                values[clusteringSize + 1] = collectionElement;
        }

        return CompositeType.build(isStatic, values);
    }

    public static Clustering decodeClustering(CFMetaData metadata, ByteBuffer value)
    {
        int csize = metadata.comparator.size();
        if (csize == 0)
            return Clustering.EMPTY;

        if (metadata.isCompound() && CompositeType.isStaticName(value))
            return Clustering.STATIC_CLUSTERING;

        List components = metadata.isCompound()
                                    ? CompositeType.splitName(value)
                                    : Collections.singletonList(value);

        for (int i=0; i type = metadata.comparator.subtype(i);
            type.validateIfFixedSize(components.get(i));
        }
        return Clustering.make(components.subList(0, Math.min(csize, components.size())).toArray(new ByteBuffer[csize]));
    }

    public static ByteBuffer encodeClustering(CFMetaData metadata, ClusteringPrefix clustering)
    {
        if (clustering.size() == 0)
            return ByteBufferUtil.EMPTY_BYTE_BUFFER;

        if (!metadata.isCompound())
        {
            assert clustering.size() == 1;
            return clustering.get(0);
        }

        ByteBuffer[] values = new ByteBuffer[clustering.size()];
        for (int i = 0; i < clustering.size(); i++)
            values[i] = clustering.get(i);
        return CompositeType.build(values);
    }

    /**
     * The maximum number of cells to include per partition when converting to the old format.
     * 

* We already apply the limit during the actual query, but for queries that counts cells and not rows (thrift queries * and distinct queries as far as old nodes are concerned), we may still include a little bit more than requested * because {@link DataLimits} always include full rows. So if the limit ends in the middle of a queried row, the * full row will be part of our result. This would confuse old nodes however so we make sure to truncate it to * what's expected before writting it on the wire. * * @param command the read commmand for which to determine the maximum cells per partition. This can be {@code null} * in which case {@code Integer.MAX_VALUE} is returned. * @return the maximum number of cells per partition that should be enforced according to the read command if * post-query limitation are in order (see above). This will be {@code Integer.MAX_VALUE} if no such limits are * necessary. */ private static int maxLiveCellsPerPartition(ReadCommand command) { if (command == null) return Integer.MAX_VALUE; DataLimits limits = command.limits(); // There is 2 types of DISTINCT queries: those that includes only the partition key, and those that include static columns. // On old nodes, the latter expects the first row in term of CQL count, which is what we already have and there is no additional // limit to apply. The former however expect only one cell per partition and rely on it (See CASSANDRA-10762). if (limits.isDistinct()) return command.columnFilter().fetchedColumns().statics.isEmpty() ? 1 : Integer.MAX_VALUE; switch (limits.kind()) { case THRIFT_LIMIT: case SUPER_COLUMN_COUNTING_LIMIT: return limits.perPartitionCount(); default: return Integer.MAX_VALUE; } } // For serializing to old wire format public static LegacyUnfilteredPartition fromUnfilteredRowIterator(ReadCommand command, UnfilteredRowIterator iterator) { // we need to extract the range tombstone so materialize the partition. Since this is // used for the on-wire format, this is not worst than it used to be. final ImmutableBTreePartition partition = ImmutableBTreePartition.create(iterator); DeletionInfo info = partition.deletionInfo(); Pair> pair = fromRowIterator(partition.metadata(), partition.iterator(), partition.staticRow()); LegacyLayout.LegacyRangeTombstoneList rtl = pair.left; // Processing the cell iterator results in the LegacyRangeTombstoneList being populated, so we do this // before we use the LegacyRangeTombstoneList at all List cells = Lists.newArrayList(pair.right); int maxCellsPerPartition = maxLiveCellsPerPartition(command); cells = maybeTrimLiveCells(cells, maxCellsPerPartition, command); // The LegacyRangeTombstoneList already has range tombstones for the single-row deletions and complex // deletions. Go through our normal range tombstones and add then to the LegacyRTL so that the range // tombstones all get merged and sorted properly. if (info.hasRanges()) { Iterator rangeTombstoneIterator = info.rangeIterator(false); while (rangeTombstoneIterator.hasNext()) { RangeTombstone rt = rangeTombstoneIterator.next(); Slice slice = rt.deletedSlice(); LegacyLayout.LegacyBound start = new LegacyLayout.LegacyBound(slice.start(), false, null); LegacyLayout.LegacyBound end = new LegacyLayout.LegacyBound(slice.end(), false, null); rtl.add(start, end, rt.deletionTime().markedForDeleteAt(), rt.deletionTime().localDeletionTime()); } } return new LegacyUnfilteredPartition(info.getPartitionDeletion(), rtl, cells); } private static List maybeTrimLiveCells(List cells, int maxLiveCells, ReadCommand command) { if (null == command || maxLiveCells >= cells.size()) return cells; int nowInSec = command.nowInSec(); int live = 0; int dead = 0; for (int i = 0; i < cells.size() && live < maxLiveCells; i++) { if (cells.get(i).isLive(nowInSec)) live++; else dead++; } return cells.subList(0, live + dead); } public static void serializeAsLegacyPartition(ReadCommand command, UnfilteredRowIterator partition, DataOutputPlus out, int version) throws IOException { assert version < MessagingService.VERSION_30; out.writeBoolean(true); LegacyLayout.LegacyUnfilteredPartition legacyPartition = LegacyLayout.fromUnfilteredRowIterator(command, partition); UUIDSerializer.serializer.serialize(partition.metadata().cfId, out, version); DeletionTime.serializer.serialize(legacyPartition.partitionDeletion, out); legacyPartition.rangeTombstones.serialize(out, partition.metadata()); // begin cell serialization out.writeInt(legacyPartition.cells.size()); for (LegacyLayout.LegacyCell cell : legacyPartition.cells) { ByteBufferUtil.writeWithShortLength(cell.name.encode(partition.metadata()), out); out.writeByte(cell.serializationFlags()); if (cell.isExpiring()) { out.writeInt(cell.ttl); out.writeInt(cell.localDeletionTime); } else if (cell.isTombstone()) { out.writeLong(cell.timestamp); out.writeInt(TypeSizes.sizeof(cell.localDeletionTime)); out.writeInt(cell.localDeletionTime); continue; } else if (cell.isCounterUpdate()) { out.writeLong(cell.timestamp); long count = CounterContext.instance().getUpdateCount(cell.value); ByteBufferUtil.writeWithLength(ByteBufferUtil.bytes(count), out); continue; } else if (cell.isCounter()) { out.writeLong(Long.MIN_VALUE); // timestampOfLastDelete (not used, and MIN_VALUE is the default) } out.writeLong(cell.timestamp); ByteBufferUtil.writeWithLength(cell.value, out); } } // For the old wire format // Note: this can return null if an empty partition is serialized! public static UnfilteredRowIterator deserializeLegacyPartition(DataInputPlus in, int version, SerializationHelper.Flag flag, ByteBuffer key) throws IOException { assert version < MessagingService.VERSION_30; // This is only used in mutation, and mutation have never allowed "null" column families boolean present = in.readBoolean(); if (!present) return null; CFMetaData metadata = CFMetaData.serializer.deserialize(in, version); LegacyDeletionInfo info = LegacyDeletionInfo.deserialize(metadata, in); int size = in.readInt(); Iterator cells = deserializeCells(metadata, in, flag, size); SerializationHelper helper = new SerializationHelper(metadata, version, flag); return onWireCellstoUnfilteredRowIterator(metadata, metadata.partitioner.decorateKey(key), info, cells, false, helper); } // For the old wire format public static long serializedSizeAsLegacyPartition(ReadCommand command, UnfilteredRowIterator partition, int version) { assert version < MessagingService.VERSION_30; if (partition.isEmpty()) return TypeSizes.sizeof(false); long size = TypeSizes.sizeof(true); LegacyLayout.LegacyUnfilteredPartition legacyPartition = LegacyLayout.fromUnfilteredRowIterator(command, partition); size += UUIDSerializer.serializer.serializedSize(partition.metadata().cfId, version); size += DeletionTime.serializer.serializedSize(legacyPartition.partitionDeletion); size += legacyPartition.rangeTombstones.serializedSize(partition.metadata()); // begin cell serialization size += TypeSizes.sizeof(legacyPartition.cells.size()); for (LegacyLayout.LegacyCell cell : legacyPartition.cells) { size += ByteBufferUtil.serializedSizeWithShortLength(cell.name.encode(partition.metadata())); size += 1; // serialization flags if (cell.isExpiring()) { size += TypeSizes.sizeof(cell.ttl); size += TypeSizes.sizeof(cell.localDeletionTime); } else if (cell.isTombstone()) { size += TypeSizes.sizeof(cell.timestamp); // localDeletionTime replaces cell.value as the body size += TypeSizes.sizeof(TypeSizes.sizeof(cell.localDeletionTime)); size += TypeSizes.sizeof(cell.localDeletionTime); continue; } else if (cell.isCounterUpdate()) { size += TypeSizes.sizeof(cell.timestamp); long count = CounterContext.instance().getUpdateCount(cell.value); size += ByteBufferUtil.serializedSizeWithLength(ByteBufferUtil.bytes(count)); continue; } else if (cell.isCounter()) { size += TypeSizes.sizeof(Long.MIN_VALUE); // timestampOfLastDelete } size += TypeSizes.sizeof(cell.timestamp); size += ByteBufferUtil.serializedSizeWithLength(cell.value); } return size; } // For thrift sake public static UnfilteredRowIterator toUnfilteredRowIterator(CFMetaData metadata, DecoratedKey key, LegacyDeletionInfo delInfo, Iterator cells) { SerializationHelper helper = new SerializationHelper(metadata, 0, SerializationHelper.Flag.LOCAL); return toUnfilteredRowIterator(metadata, key, delInfo, cells, false, helper); } // For deserializing old wire format public static UnfilteredRowIterator onWireCellstoUnfilteredRowIterator(CFMetaData metadata, DecoratedKey key, LegacyDeletionInfo delInfo, Iterator cells, boolean reversed, SerializationHelper helper) { // If the table is a static compact, the "column_metadata" are now internally encoded as // static. This has already been recognized by decodeCellName, but it means the cells // provided are not in the expected order (the "static" cells are not necessarily at the front). // So sort them to make sure toUnfilteredRowIterator works as expected. // Further, if the query is reversed, then the on-wire format still has cells in non-reversed // order, but we need to have them reverse in the final UnfilteredRowIterator. So reverse them. if (metadata.isStaticCompactTable() || reversed) { List l = new ArrayList<>(); Iterators.addAll(l, cells); Collections.sort(l, legacyCellComparator(metadata, reversed)); cells = l.iterator(); } return toUnfilteredRowIterator(metadata, key, delInfo, cells, reversed, helper); } private static UnfilteredRowIterator toUnfilteredRowIterator(CFMetaData metadata, DecoratedKey key, LegacyDeletionInfo delInfo, Iterator cells, boolean reversed, SerializationHelper helper) { // A reducer that basically does nothing, we know the 2 merged iterators can't have conflicting atoms (since we merge cells with range tombstones). MergeIterator.Reducer reducer = new MergeIterator.Reducer() { private LegacyAtom atom; public void reduce(int idx, LegacyAtom current) { // We're merging cell with range tombstones, so we should always only have a single atom to reduce. assert atom == null; atom = current; } protected LegacyAtom getReduced() { return atom; } protected void onKeyChange() { atom = null; } }; List> iterators = Arrays.asList(asLegacyAtomIterator(cells), asLegacyAtomIterator(delInfo.inRowRangeTombstones())); PeekingIterator atoms = Iterators.peekingIterator(MergeIterator.get(iterators, legacyAtomComparator(metadata), reducer)); // Check if we have some static Row staticRow = atoms.hasNext() && atoms.peek().isStatic() ? getNextRow(CellGrouper.staticGrouper(metadata, helper), atoms) : Rows.EMPTY_STATIC_ROW; Iterator rows = convertToRows(new CellGrouper(metadata, helper), atoms); Iterator ranges = delInfo.deletionInfo.rangeIterator(reversed); return new RowAndDeletionMergeIterator(metadata, key, delInfo.deletionInfo.getPartitionDeletion(), ColumnFilter.all(metadata), staticRow, reversed, EncodingStats.NO_STATS, rows, ranges, true); } public static Row extractStaticColumns(CFMetaData metadata, DataInputPlus in, Columns statics) throws IOException { assert !statics.isEmpty(); assert metadata.isCompactTable(); if (metadata.isSuper()) // TODO: there is in practice nothing to do here, but we need to handle the column_metadata for super columns somewhere else throw new UnsupportedOperationException(); Set columnsToFetch = new HashSet<>(statics.size()); for (ColumnDefinition column : statics) columnsToFetch.add(column.name.bytes); Row.Builder builder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds()); builder.newRow(Clustering.STATIC_CLUSTERING); boolean foundOne = false; LegacyAtom atom; while ((atom = readLegacyAtomSkippingUnknownColumn(metadata,in)) != null) { if (atom.isCell()) { LegacyCell cell = atom.asCell(); if (!columnsToFetch.contains(cell.name.encode(metadata))) continue; foundOne = true; cell.name.column.type.validateIfFixedSize(cell.value); builder.addCell(new BufferCell(cell.name.column, cell.timestamp, cell.ttl, cell.localDeletionTime, cell.value, null)); } else { LegacyRangeTombstone tombstone = atom.asRangeTombstone(); // TODO: we need to track tombstones and potentially ignore cells that are // shadowed (or even better, replace them by tombstones). throw new UnsupportedOperationException(); } } return foundOne ? builder.build() : Rows.EMPTY_STATIC_ROW; } private static LegacyAtom readLegacyAtomSkippingUnknownColumn(CFMetaData metadata, DataInputPlus in) throws IOException { while (true) { try { return readLegacyAtom(metadata, in, false); } catch (UnknownColumnException e) { // Simply skip, as the method name implies. } } } private static Row getNextRow(CellGrouper grouper, PeekingIterator cells) { if (!cells.hasNext()) return null; grouper.reset(); while (cells.hasNext() && grouper.addAtom(cells.peek())) { // We've added the cell already in the grouper, so just skip it cells.next(); } return grouper.getRow(); } @SuppressWarnings("unchecked") private static Iterator asLegacyAtomIterator(Iterator iter) { return (Iterator)iter; } private static Iterator convertToRows(final CellGrouper grouper, final PeekingIterator atoms) { return new AbstractIterator() { protected Row computeNext() { if (!atoms.hasNext()) return endOfData(); return getNextRow(grouper, atoms); } }; } public static Pair> fromRowIterator(final RowIterator iterator) { return fromRowIterator(iterator.metadata(), iterator, iterator.staticRow()); } private static Pair> fromRowIterator(final CFMetaData metadata, final Iterator iterator, final Row staticRow) { LegacyRangeTombstoneList deletions = new LegacyRangeTombstoneList(new LegacyBoundComparator(metadata.comparator), 10); Iterator cells = new AbstractIterator() { private Iterator currentRow = initializeRow(); private Iterator initializeRow() { if (staticRow == null || staticRow.isEmpty()) return Collections.emptyIterator(); Pair> row = fromRow(metadata, staticRow); deletions.addAll(row.left); return row.right; } protected LegacyCell computeNext() { while (true) { if (currentRow.hasNext()) return currentRow.next(); if (!iterator.hasNext()) return endOfData(); Pair> row = fromRow(metadata, iterator.next()); deletions.addAll(row.left); currentRow = row.right; } } }; return Pair.create(deletions, cells); } private static Pair> fromRow(final CFMetaData metadata, final Row row) { // convert any complex deletions or row deletion into normal range tombstones so that we can build and send a proper RangeTombstoneList // to legacy nodes LegacyRangeTombstoneList deletions = new LegacyRangeTombstoneList(new LegacyBoundComparator(metadata.comparator), 10); if (!row.deletion().isLive()) { Clustering clustering = row.clustering(); ClusteringBound startBound = ClusteringBound.inclusiveStartOf(clustering); ClusteringBound endBound = ClusteringBound.inclusiveEndOf(clustering); LegacyBound start = new LegacyLayout.LegacyBound(startBound, false, null); LegacyBound end = new LegacyLayout.LegacyBound(endBound, false, null); deletions.add(start, end, row.deletion().time().markedForDeleteAt(), row.deletion().time().localDeletionTime()); } for (ColumnData cd : row) { ColumnDefinition col = cd.column(); if (col.isSimple()) continue; DeletionTime delTime = ((ComplexColumnData)cd).complexDeletion(); if (!delTime.isLive()) { Clustering clustering = row.clustering(); boolean isStatic = clustering == Clustering.STATIC_CLUSTERING; assert isStatic == col.isStatic(); ClusteringBound startBound = isStatic ? LegacyDeletionInfo.staticBound(metadata, true) : ClusteringBound.inclusiveStartOf(clustering); ClusteringBound endBound = isStatic ? LegacyDeletionInfo.staticBound(metadata, false) : ClusteringBound.inclusiveEndOf(clustering); LegacyLayout.LegacyBound start = new LegacyLayout.LegacyBound(startBound, isStatic, col); LegacyLayout.LegacyBound end = new LegacyLayout.LegacyBound(endBound, isStatic, col); deletions.add(start, end, delTime.markedForDeleteAt(), delTime.localDeletionTime()); } } Iterator cells = new AbstractIterator() { private final Iterator cells = row.cellsInLegacyOrder(metadata, false).iterator(); // we don't have (and shouldn't have) row markers for compact tables. private boolean hasReturnedRowMarker = metadata.isCompactTable(); protected LegacyCell computeNext() { if (!hasReturnedRowMarker) { hasReturnedRowMarker = true; // don't include a row marker if there's no timestamp on the primary key; this is the 3.0+ equivalent // of a row marker if (!row.primaryKeyLivenessInfo().isEmpty()) { LegacyCellName cellName = new LegacyCellName(row.clustering(), null, null); LivenessInfo info = row.primaryKeyLivenessInfo(); return new LegacyCell(info.isExpiring() ? LegacyCell.Kind.EXPIRING : LegacyCell.Kind.REGULAR, cellName, ByteBufferUtil.EMPTY_BYTE_BUFFER, info.timestamp(), info.localExpirationTime(), info.ttl()); } } if (!cells.hasNext()) return endOfData(); return makeLegacyCell(row.clustering(), cells.next()); } }; return Pair.create(deletions, cells); } private static LegacyCell makeLegacyCell(Clustering clustering, Cell cell) { LegacyCell.Kind kind; if (cell.isCounterCell()) kind = LegacyCell.Kind.COUNTER; else if (cell.isTombstone()) kind = LegacyCell.Kind.DELETED; else if (cell.isExpiring()) kind = LegacyCell.Kind.EXPIRING; else kind = LegacyCell.Kind.REGULAR; CellPath path = cell.path(); assert path == null || path.size() == 1; LegacyCellName name = new LegacyCellName(clustering, cell.column(), path == null ? null : path.get(0)); return new LegacyCell(kind, name, cell.value(), cell.timestamp(), cell.localDeletionTime(), cell.ttl()); } public static RowIterator toRowIterator(final CFMetaData metadata, final DecoratedKey key, final Iterator cells, final int nowInSec) { SerializationHelper helper = new SerializationHelper(metadata, 0, SerializationHelper.Flag.LOCAL); return UnfilteredRowIterators.filter(toUnfilteredRowIterator(metadata, key, LegacyDeletionInfo.live(), cells, false, helper), nowInSec); } public static Comparator legacyCellComparator(CFMetaData metadata) { return legacyCellComparator(metadata, false); } public static Comparator legacyCellComparator(final CFMetaData metadata, final boolean reversed) { final Comparator cellNameComparator = legacyCellNameComparator(metadata, reversed); return new Comparator() { public int compare(LegacyCell cell1, LegacyCell cell2) { LegacyCellName c1 = cell1.name; LegacyCellName c2 = cell2.name; int c = cellNameComparator.compare(c1, c2); if (c != 0) return c; // The actual sorting when the cellname is equal doesn't matter, we just want to make // sure the cells are not considered equal. if (cell1.timestamp != cell2.timestamp) return cell1.timestamp < cell2.timestamp ? -1 : 1; if (cell1.localDeletionTime != cell2.localDeletionTime) return cell1.localDeletionTime < cell2.localDeletionTime ? -1 : 1; return cell1.value.compareTo(cell2.value); } }; } // Note that this doesn't exactly compare cells as they were pre-3.0 because within a row they sort columns like // in 3.0, that is, with simple columns before complex columns. In other words, this comparator makes sure cells // are in the proper order to convert them to actual 3.0 rows. public static Comparator legacyCellNameComparator(final CFMetaData metadata, final boolean reversed) { return new Comparator() { public int compare(LegacyCellName c1, LegacyCellName c2) { // Compare clustering first if (c1.clustering == Clustering.STATIC_CLUSTERING) { if (c2.clustering != Clustering.STATIC_CLUSTERING) return -1; } else if (c2.clustering == Clustering.STATIC_CLUSTERING) { return 1; } else { int c = metadata.comparator.compare(c1.clustering, c2.clustering); if (c != 0) return reversed ? -c : c; } // Note that when reversed, we only care about the clustering being reversed, so it's ok // not to take reversed into account below. // Then check the column name if (c1.column != c2.column) { // A null for the column means it's a row marker if (c1.column == null) return -1; if (c2.column == null) return 1; assert c1.column.isRegular() || c1.column.isStatic(); assert c2.column.isRegular() || c2.column.isStatic(); int cmp = c1.column.compareTo(c2.column); if (cmp != 0) return cmp; } assert (c1.collectionElement == null) == (c2.collectionElement == null); if (c1.collectionElement != null) { AbstractType colCmp = ((CollectionType)c1.column.type).nameComparator(); return colCmp.compare(c1.collectionElement, c2.collectionElement); } return 0; } }; } private static boolean equalValues(ClusteringPrefix c1, ClusteringPrefix c2, ClusteringComparator comparator) { assert c1.size() == c2.size(); for (int i = 0; i < c1.size(); i++) { if (comparator.compareComponent(i, c1.get(i), c2.get(i)) != 0) return false; } return true; } static Comparator legacyAtomComparator(CFMetaData metadata) { return (o1, o2) -> { // First we want to compare by clustering, but we have to be careful with range tombstone, because // we can have collection deletion and we want those to sort properly just before the column they // delete, not before the whole row. // We also want to special case static so they sort before any non-static. Note in particular that // this special casing is important in the case of one of the Atom being Bound.BOTTOM: we want // it to sort after the static as we deal with static first in toUnfilteredAtomIterator and having // Bound.BOTTOM first would mess that up (note that static deletion is handled through a specific // static tombstone, see LegacyDeletionInfo.add()). if (o1.isStatic() != o2.isStatic()) return o1.isStatic() ? -1 : 1; ClusteringPrefix c1 = o1.clustering(); ClusteringPrefix c2 = o2.clustering(); int clusteringComparison; if (c1.size() != c2.size() || (o1.isCell() == o2.isCell()) || !equalValues(c1, c2, metadata.comparator)) { clusteringComparison = metadata.comparator.compare(c1, c2); } else { // one is a cell and one is a range tombstone, and both have the same prefix size (that is, the // range tombstone is either a row deletion or a collection deletion). LegacyRangeTombstone rt = o1.isCell() ? o2.asRangeTombstone() : o1.asRangeTombstone(); clusteringComparison = rt.isCollectionTombstone() ? 0 : metadata.comparator.compare(c1, c2); } // Note that if both are range tombstones and have the same clustering, then they are equal. if (clusteringComparison != 0) return clusteringComparison; if (o1.isCell()) { LegacyCell cell1 = o1.asCell(); if (o2.isCell()) { LegacyCell cell2 = o2.asCell(); // Check for row marker cells if (cell1.name.column == null) return cell2.name.column == null ? 0 : -1; return cell2.name.column == null ? 1 : cell1.name.column.compareTo(cell2.name.column); } LegacyRangeTombstone rt2 = o2.asRangeTombstone(); assert rt2.isCollectionTombstone(); // otherwise, we shouldn't have got a clustering equality if (cell1.name.column == null) return -1; int cmp = cell1.name.column.compareTo(rt2.start.collectionName); // If both are for the same column, then the RT should come first return cmp == 0 ? 1 : cmp; } else { assert o2.isCell(); LegacyCell cell2 = o2.asCell(); LegacyRangeTombstone rt1 = o1.asRangeTombstone(); assert rt1.isCollectionTombstone(); // otherwise, we shouldn't have got a clustering equality if (cell2.name.column == null) return 1; int cmp = rt1.start.collectionName.compareTo(cell2.name.column); // If both are for the same column, then the RT should come first return cmp == 0 ? -1 : cmp; } }; } public static LegacyAtom readLegacyAtom(CFMetaData metadata, DataInputPlus in, boolean readAllAsDynamic) throws IOException, UnknownColumnException { ByteBuffer cellname = ByteBufferUtil.readWithShortLength(in); if (!cellname.hasRemaining()) return null; // END_OF_ROW try { int b = in.readUnsignedByte(); return (b & RANGE_TOMBSTONE_MASK) != 0 ? readLegacyRangeTombstoneBody(metadata, in, cellname) : readLegacyCellBody(metadata, in, cellname, b, SerializationHelper.Flag.LOCAL, readAllAsDynamic); } catch (UnknownColumnException e) { // We legitimately can get here in 2 cases: // 1) for system tables, because we've unceremoniously removed columns (without registering them as dropped) // 2) for dropped columns. // In any other case, there is a mismatch between the schema and the data, and we complain loudly in // that case. Note that if we are in a legit case of an unknown column, we want to simply skip that cell, // but we don't do this here and re-throw the exception because the calling code sometimes has to know // about this happening. This does mean code calling this method should handle this case properly. if (!metadata.ksName.equals(SchemaConstants.SYSTEM_KEYSPACE_NAME) && metadata.getDroppedColumnDefinition(e.columnName) == null) logger.warn(String.format("Got cell for unknown column %s in sstable of %s.%s: " + "This suggest a problem with the schema which doesn't list " + "this column. Even if that column was dropped, it should have " + "been listed as such", metadata.ksName, metadata.cfName, UTF8Type.instance.compose(e.columnName)), e); throw e; } } public static LegacyCell readLegacyCell(CFMetaData metadata, DataInput in, SerializationHelper.Flag flag) throws IOException, UnknownColumnException { ByteBuffer cellname = ByteBufferUtil.readWithShortLength(in); int b = in.readUnsignedByte(); return readLegacyCellBody(metadata, in, cellname, b, flag, false); } public static LegacyCell readLegacyCellBody(CFMetaData metadata, DataInput in, ByteBuffer cellname, int mask, SerializationHelper.Flag flag, boolean readAllAsDynamic) throws IOException, UnknownColumnException { // Note that we want to call decodeCellName only after we've deserialized other parts, since it can throw // and we want to throw only after having deserialized the full cell. if ((mask & COUNTER_MASK) != 0) { in.readLong(); // timestampOfLastDelete: this has been unused for a long time so we ignore it long ts = in.readLong(); ByteBuffer value = ByteBufferUtil.readWithLength(in); if (flag == SerializationHelper.Flag.FROM_REMOTE || (flag == SerializationHelper.Flag.LOCAL && CounterContext.instance().shouldClearLocal(value))) value = CounterContext.instance().clearAllLocal(value); return new LegacyCell(LegacyCell.Kind.COUNTER, decodeCellName(metadata, cellname, readAllAsDynamic), value, ts, Cell.NO_DELETION_TIME, Cell.NO_TTL); } else if ((mask & EXPIRATION_MASK) != 0) { int ttl = in.readInt(); int expiration = in.readInt(); long ts = in.readLong(); ByteBuffer value = ByteBufferUtil.readWithLength(in); return new LegacyCell(LegacyCell.Kind.EXPIRING, decodeCellName(metadata, cellname, readAllAsDynamic), value, ts, expiration, ttl); } else { long ts = in.readLong(); ByteBuffer value = ByteBufferUtil.readWithLength(in); LegacyCellName name = decodeCellName(metadata, cellname, readAllAsDynamic); return (mask & COUNTER_UPDATE_MASK) != 0 ? new LegacyCell(LegacyCell.Kind.COUNTER, name, CounterContext.instance().createUpdate(ByteBufferUtil.toLong(value)), ts, Cell.NO_DELETION_TIME, Cell.NO_TTL) : ((mask & DELETION_MASK) == 0 ? new LegacyCell(LegacyCell.Kind.REGULAR, name, value, ts, Cell.NO_DELETION_TIME, Cell.NO_TTL) : new LegacyCell(LegacyCell.Kind.DELETED, name, ByteBufferUtil.EMPTY_BYTE_BUFFER, ts, ByteBufferUtil.toInt(value), Cell.NO_TTL)); } } public static LegacyRangeTombstone readLegacyRangeTombstoneBody(CFMetaData metadata, DataInputPlus in, ByteBuffer boundname) throws IOException { LegacyBound min = decodeTombstoneBound(metadata, boundname, true); LegacyBound max = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), false); DeletionTime dt = DeletionTime.serializer.deserialize(in); return new LegacyRangeTombstone(min, max, dt); } public static Iterator deserializeCells(final CFMetaData metadata, final DataInput in, final SerializationHelper.Flag flag, final int size) { return new AbstractIterator() { private int i = 0; protected LegacyCell computeNext() { if (i >= size) return endOfData(); ++i; try { return readLegacyCell(metadata, in, flag); } catch (UnknownColumnException e) { // We can get there if we read a cell for a dropped column, and if that is the case, // then simply ignore the cell is fine. But also not that we ignore if it's the // system keyspace because for those table we actually remove columns without registering // them in the dropped columns if (metadata.ksName.equals(SchemaConstants.SYSTEM_KEYSPACE_NAME) || metadata.getDroppedColumnDefinition(e.columnName) != null) return computeNext(); else throw new IOError(e); } catch (IOException e) { throw new IOError(e); } } }; } public static class CellGrouper { /** * The fake TTL used for expired rows that have been compacted. */ private static final int FAKE_TTL = 1; public final CFMetaData metadata; private final boolean isStatic; private final SerializationHelper helper; private final Row.Builder builder; private Clustering clustering; private LegacyRangeTombstone rowDeletion; private LegacyRangeTombstone collectionDeletion; /** * Used to track if we need to add pk liveness info (row marker) when removing invalid legacy cells. * * In 2.1 these invalid cells existed but were not queryable, in this case specifically because they * represented values for clustering key columns that were written as data cells. * * However, the presence (or not) of such cells on an otherwise empty CQL row (or partition) would decide * if an empty result row were returned for the CQL row (or partition). To maintain this behaviour we * insert a row marker containing the liveness info of these invalid cells iff we have no other data * on the row. * * See also CASSANDRA-15365 */ private boolean hasValidCells = false; private LivenessInfo invalidLivenessInfo = null; public CellGrouper(CFMetaData metadata, SerializationHelper helper) { this(metadata, helper, false); } private CellGrouper(CFMetaData metadata, SerializationHelper helper, boolean isStatic) { this.metadata = metadata; this.isStatic = isStatic; this.helper = helper; // We cannot use a sorted builder because we don't have exactly the same ordering in 3.0 and pre-3.0. More precisely, within a row, we // store all simple columns before the complex ones in 3.0, which we use to sort everything sorted by the column name before. Note however // that the unsorted builder won't have to reconcile cells, so the exact value we pass for nowInSec doesn't matter. this.builder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds()); } public static CellGrouper staticGrouper(CFMetaData metadata, SerializationHelper helper) { return new CellGrouper(metadata, helper, true); } public void reset() { this.clustering = null; this.rowDeletion = null; this.collectionDeletion = null; this.invalidLivenessInfo = null; this.hasValidCells = false; } /** * Try adding the provided atom to the currently grouped row. * * @param atom the new atom to try to add. This must be a "row" atom, that is either a cell or a legacy * range tombstone that covers only one row (row deletion) or a subset of it (collection * deletion). Meaning that legacy range tombstone covering multiple rows (that should be handled as * legit range tombstone in the new storage engine) should be handled separately. Atoms should also * be provided in proper clustering order. * @return {@code true} if the provided atom has been "consumed" by this grouper (this does _not_ mean the * atom has been "used" by the grouper as the grouper will skip some shadowed atoms for instance, just * that {@link #getRow()} shouldn't be called just yet if there is more atom in the atom iterator we're * grouping). {@code false} otherwise, that is if the row currently built by this grouper is done * _without_ the provided atom being "consumed" (and so {@link #getRow()} should be called and the * grouper resetted, after which the provided atom should be provided again). */ public boolean addAtom(LegacyAtom atom) { assert atom.isRowAtom(metadata) : "Unexpected non in-row legacy range tombstone " + atom; return atom.isCell() ? addCell(atom.asCell()) : addRangeTombstone(atom.asRangeTombstone()); } private boolean addCell(LegacyCell cell) { if (clustering == null) { clustering = cell.name.clustering; assert !isStatic || clustering == Clustering.STATIC_CLUSTERING; builder.newRow(clustering); } else if (!clustering.equals(cell.name.clustering)) { return false; } // Ignore shadowed cells if (rowDeletion != null && rowDeletion.deletionTime.deletes(cell.timestamp)) return true; ColumnDefinition column = cell.name.column; if (column == null) { // It's the row marker assert !cell.value.hasRemaining(); // In 2.1, the row marker expired cell might have been converted into a deleted one by compaction. // If we do not set the primary key liveness info for this row and it does not contains any regular columns // the row will be empty. To avoid that, we reuse the localDeletionTime but use a fake TTL. // The only time in 2.x that we actually delete a row marker is in 2i tables, so in that case we do // want to actually propagate the row deletion. (CASSANDRA-13320) if (!cell.isTombstone()) builder.addPrimaryKeyLivenessInfo(LivenessInfo.withExpirationTime(cell.timestamp, cell.ttl, cell.localDeletionTime)); else if (metadata.isIndex()) builder.addRowDeletion(Row.Deletion.regular(new DeletionTime(cell.timestamp, cell.localDeletionTime))); else builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(cell.timestamp, FAKE_TTL, cell.localDeletionTime)); hasValidCells = true; } else if (column.isPrimaryKeyColumn() && metadata.isCQLTable()) { // SSTables generated offline and side-loaded may include invalid cells which have the column name // of a primary key column. So that we don't fail when encountering these cells, we treat them the // same way as 2.1 did, namely we include their clusterings in the new CQL row, but drop the invalid // column part of the cell noSpamLogger.warn("Illegal cell name for CQL3 table {}.{}. {} is defined as a primary key column", metadata.ksName, metadata.cfName, column.name); if (invalidLivenessInfo != null) { // when we have several invalid cells we follow the logic in LivenessInfo#supersedes when picking the PKLI to keep: LivenessInfo newInvalidLiveness = LivenessInfo.create(cell.timestamp, cell.isTombstone() ? FAKE_TTL : cell.ttl, cell.localDeletionTime); if (newInvalidLiveness.supersedes(invalidLivenessInfo)) invalidLivenessInfo = newInvalidLiveness; } else { invalidLivenessInfo = LivenessInfo.create(cell.timestamp, cell.isTombstone() ? FAKE_TTL : cell.ttl, cell.localDeletionTime); } return true; } else { if (collectionDeletion != null && collectionDeletion.start.collectionName.name.equals(column.name) && collectionDeletion.deletionTime.deletes(cell.timestamp)) return true; if (helper.includes(column)) { hasValidCells = true; CellPath path = null; if (column.isComplex()) { // Recalling startOfComplexColumn for every cell is a big inefficient, but it's ok in practice // and it's simpler. And since 1) this only matter for super column selection in thrift in // practice and 2) is only used during upgrade, it's probably worth keeping things simple. helper.startOfComplexColumn(column); path = cell.name.collectionElement == null ? null : CellPath.create(cell.name.collectionElement); if (!helper.includes(path)) return true; } column.type.validateIfFixedSize(cell.value); Cell c = new BufferCell(column, cell.timestamp, cell.ttl, cell.localDeletionTime, cell.value, path); if (!helper.isDropped(c, column.isComplex())) builder.addCell(c); if (column.isComplex()) { helper.endOfComplexColumn(); } } } return true; } private boolean addRangeTombstone(LegacyRangeTombstone tombstone) { if (tombstone.isRowDeletion(metadata)) { return addRowTombstone(tombstone); } else { // The isRowAtom() assertion back in addAtom would have already triggered otherwise, but spelling it // out nonetheless. assert tombstone.isCollectionTombstone(); return addCollectionTombstone(tombstone); } } private boolean addRowTombstone(LegacyRangeTombstone tombstone) { if (clustering != null) { // If we're already in the row, there might be a chance that there were two range tombstones // written, as 2.x storage format does not guarantee just one range tombstone, unlike 3.x. // We have to make sure that clustering matches, which would mean that tombstone is for the // same row. if (clustering.equals(tombstone.start.getAsClustering(metadata))) { // If the tombstone superceeds the previous delete, we discard the previous one. // This assumes that we are building the row from a sane source (ie, this row deletion // does not delete anything already added to the builder). See CASSANDRA-15789 for details if (rowDeletion == null || tombstone.deletionTime.supersedes(rowDeletion.deletionTime)) { builder.addRowDeletion(Row.Deletion.regular(tombstone.deletionTime)); rowDeletion = tombstone; hasValidCells = true; } return true; } // different clustering -> new row return false; } clustering = tombstone.start.getAsClustering(metadata); builder.newRow(clustering); builder.addRowDeletion(Row.Deletion.regular(tombstone.deletionTime)); rowDeletion = tombstone; hasValidCells = true; return true; } private boolean addCollectionTombstone(LegacyRangeTombstone tombstone) { // If the collection tombstone is not included in the query (which technically would only apply to thrift // queries since CQL one "fetch" everything), we can skip it (so return), but we're problably still within // the current row so we return `true`. Technically, it is possible that tombstone belongs to another row // that the row currently grouped, but as we ignore it, returning `true` is ok in that case too. if (!helper.includes(tombstone.start.collectionName)) return true; // see CASSANDRA-13109 // The helper needs to be informed about the current complex column identifier before // it can perform the comparison between the recorded drop time and the RT deletion time. // If the RT has been superceded by a drop, we still return true as we don't want the // grouper to terminate yet. helper.startOfComplexColumn(tombstone.start.collectionName); if (helper.isDroppedComplexDeletion(tombstone.deletionTime)) return true; if (clustering == null) { clustering = tombstone.start.getAsClustering(metadata); builder.newRow(clustering); } else if (!clustering.equals(tombstone.start.getAsClustering(metadata))) { return false; } builder.addComplexDeletion(tombstone.start.collectionName, tombstone.deletionTime); if (rowDeletion == null || tombstone.deletionTime.supersedes(rowDeletion.deletionTime)) collectionDeletion = tombstone; hasValidCells = true; return true; } /** * Whether the provided range tombstone starts strictly after the current row of the cell grouper (if no row is * currently started, this return false). */ public boolean startsAfterCurrentRow(LegacyRangeTombstone rangeTombstone) { return clustering != null && metadata.comparator.compare(rangeTombstone.start.bound, clustering) > 0; } /** * The clustering of the current row of the cell grouper, or {@code null} if no row is currently started. */ public Clustering currentRowClustering() { return clustering; } /** * Generates the row currently grouped by this grouper and reset it for the following row. *

* Note that the only correct way to call this is when either all the atom we're trying to group has been * consumed, or when {@link #addAtom(LegacyAtom)} returns {@code false}. * * @return the current row that has been grouped, or {@code null} in the rare case where all the atoms * "consumed" by {@link #addAtom(LegacyAtom)} for this row were skipped (we skip atoms under a few conditions). */ public Row getRow() { if (clustering == null) return null; if (!hasValidCells && invalidLivenessInfo != null) builder.addPrimaryKeyLivenessInfo(invalidLivenessInfo); return builder.build(); } } public static class LegacyUnfilteredPartition { public final DeletionTime partitionDeletion; public final LegacyRangeTombstoneList rangeTombstones; public final List cells; private LegacyUnfilteredPartition(DeletionTime partitionDeletion, LegacyRangeTombstoneList rangeTombstones, List cells) { this.partitionDeletion = partitionDeletion; this.rangeTombstones = rangeTombstones; this.cells = cells; } public void digest(CFMetaData metadata, MessageDigest digest) { for (LegacyCell cell : cells) { digest.update(cell.name.encode(metadata).duplicate()); if (cell.isCounter()) CounterContext.instance().updateDigest(digest, cell.value); else digest.update(cell.value.duplicate()); FBUtilities.updateWithLong(digest, cell.timestamp); FBUtilities.updateWithByte(digest, cell.serializationFlags()); if (cell.isExpiring()) FBUtilities.updateWithInt(digest, cell.ttl); if (cell.isCounter()) { // Counters used to have the timestampOfLastDelete field, which we stopped using long ago and has been hard-coded // to Long.MIN_VALUE but was still taken into account in 2.2 counter digests (to maintain backward compatibility // in the first place). FBUtilities.updateWithLong(digest, Long.MIN_VALUE); } } if (partitionDeletion.markedForDeleteAt() != Long.MIN_VALUE) digest.update(ByteBufferUtil.bytes(partitionDeletion.markedForDeleteAt())); if (!rangeTombstones.isEmpty()) rangeTombstones.updateDigest(digest); } } public static class LegacyCellName { public final Clustering clustering; public final ColumnDefinition column; public final ByteBuffer collectionElement; @VisibleForTesting public LegacyCellName(Clustering clustering, ColumnDefinition column, ByteBuffer collectionElement) { this.clustering = clustering; this.column = column; this.collectionElement = collectionElement; } public static LegacyCellName create(Clustering clustering, ColumnDefinition column) { return new LegacyCellName(clustering, column, null); } public ByteBuffer encode(CFMetaData metadata) { return encodeCellName(metadata, clustering, column == null ? ByteBufferUtil.EMPTY_BYTE_BUFFER : column.name.bytes, collectionElement); } public ByteBuffer superColumnSubName() { assert collectionElement != null; return collectionElement; } public ByteBuffer superColumnName() { return clustering.get(0); } @Override public String toString() { StringBuilder sb = new StringBuilder(); for (int i = 0; i < clustering.size(); i++) sb.append(i > 0 ? ":" : "").append(clustering.get(i) == null ? "null" : ByteBufferUtil.bytesToHex(clustering.get(i))); return String.format("Cellname(clustering=%s, column=%s, collElt=%s)", sb.toString(), column == null ? "null" : column.name, collectionElement == null ? "null" : ByteBufferUtil.bytesToHex(collectionElement)); } } public static class LegacyBound { public static final LegacyBound BOTTOM = new LegacyBound(ClusteringBound.BOTTOM, false, null); public static final LegacyBound TOP = new LegacyBound(ClusteringBound.TOP, false, null); public final ClusteringBound bound; public final boolean isStatic; public final ColumnDefinition collectionName; public LegacyBound(ClusteringBound bound, boolean isStatic, ColumnDefinition collectionName) { this.bound = bound; this.isStatic = isStatic; this.collectionName = collectionName; } public Clustering getAsClustering(CFMetaData metadata) { if (isStatic) return Clustering.STATIC_CLUSTERING; assert bound.size() == metadata.comparator.size(); ByteBuffer[] values = new ByteBuffer[bound.size()]; for (int i = 0; i < bound.size(); i++) values[i] = bound.get(i); return Clustering.make(values); } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append(bound.kind()).append('('); for (int i = 0; i < bound.size(); i++) sb.append(i > 0 ? ":" : "").append(bound.get(i) == null ? "null" : ByteBufferUtil.bytesToHex(bound.get(i))); sb.append(')'); return String.format("Bound(%s, collection=%s)", sb.toString(), collectionName == null ? "null" : collectionName.name); } } public interface LegacyAtom { public boolean isCell(); // note that for static atoms, LegacyCell and LegacyRangeTombstone behave differently here: // - LegacyCell returns the modern Clustering.STATIC_CLUSTERING // - LegacyRangeTombstone returns the 2.2 bound (i.e. N empty ByteBuffer, where N is number of clusterings) // in LegacyDeletionInfo.add(), we split any LRT with a static bound out into the inRowRangeTombstones collection // these are merged with regular row cells, in the CellGrouper, and their clustering is obtained via start.bound.getAsClustering // (also, it should be impossibly to issue raw static row deletions anyway) public ClusteringPrefix clustering(); public boolean isStatic(); public LegacyCell asCell(); public LegacyRangeTombstone asRangeTombstone(); /** * Whether the atom is one that becomes part of a {@link Row} in the new storage engine, meaning it is either * as cell or a legacy range tombstone that covers a single row, or parts of one. */ public boolean isRowAtom(CFMetaData metadata); } /** * A legacy cell. *

* This is used as a temporary object to facilitate dealing with the legacy format, this * is not meant to be optimal. */ public static class LegacyCell implements LegacyAtom { private final static int DELETION_MASK = 0x01; private final static int EXPIRATION_MASK = 0x02; private final static int COUNTER_MASK = 0x04; private final static int COUNTER_UPDATE_MASK = 0x08; private final static int RANGE_TOMBSTONE_MASK = 0x10; public enum Kind { REGULAR, EXPIRING, DELETED, COUNTER } public final Kind kind; public final LegacyCellName name; public final ByteBuffer value; public final long timestamp; public final int localDeletionTime; public final int ttl; @VisibleForTesting public LegacyCell(Kind kind, LegacyCellName name, ByteBuffer value, long timestamp, int localDeletionTime, int ttl) { this.kind = kind; this.name = name; this.value = value; this.timestamp = timestamp; this.localDeletionTime = localDeletionTime; this.ttl = ttl; } public static LegacyCell regular(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, ByteBuffer value, long timestamp) throws UnknownColumnException { return new LegacyCell(Kind.REGULAR, decodeCellName(metadata, superColumnName, name), value, timestamp, Cell.NO_DELETION_TIME, Cell.NO_TTL); } public static LegacyCell expiring(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, ByteBuffer value, long timestamp, int ttl, int nowInSec) throws UnknownColumnException { /* * CASSANDRA-14092: Max expiration date capping is maybe performed here, expiration overflow policy application * is done at {@link org.apache.cassandra.thrift.ThriftValidation#validateTtl(CFMetaData, Column)} */ return new LegacyCell(Kind.EXPIRING, decodeCellName(metadata, superColumnName, name), value, timestamp, ExpirationDateOverflowHandling.computeLocalExpirationTime(nowInSec, ttl), ttl); } public static LegacyCell tombstone(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, long timestamp, int nowInSec) throws UnknownColumnException { return new LegacyCell(Kind.DELETED, decodeCellName(metadata, superColumnName, name), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp, nowInSec, LivenessInfo.NO_TTL); } public static LegacyCell counterUpdate(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, long value) throws UnknownColumnException { // See UpdateParameters.addCounter() for more details on this ByteBuffer counterValue = CounterContext.instance().createUpdate(value); return counter(decodeCellName(metadata, superColumnName, name), counterValue); } public static LegacyCell counter(LegacyCellName name, ByteBuffer value) { return new LegacyCell(Kind.COUNTER, name, value, FBUtilities.timestampMicros(), Cell.NO_DELETION_TIME, Cell.NO_TTL); } public byte serializationFlags() { if (isExpiring()) return EXPIRATION_MASK; if (isTombstone()) return DELETION_MASK; if (isCounterUpdate()) return COUNTER_UPDATE_MASK; if (isCounter()) return COUNTER_MASK; return 0; } public boolean isCounterUpdate() { // See UpdateParameters.addCounter() for more details on this return isCounter() && CounterContext.instance().isUpdate(value); } public ClusteringPrefix clustering() { return name.clustering; } public boolean isStatic() { return name.clustering == Clustering.STATIC_CLUSTERING; } public boolean isCell() { return true; } public LegacyCell asCell() { return this; } public LegacyRangeTombstone asRangeTombstone() { throw new UnsupportedOperationException(); } @Override public boolean isRowAtom(CFMetaData metaData) { return true; } public boolean isCounter() { return kind == Kind.COUNTER; } public boolean isExpiring() { return kind == Kind.EXPIRING; } public boolean isTombstone() { return kind == Kind.DELETED; } public boolean isLive(int nowInSec) { if (isTombstone()) return false; return !isExpiring() || nowInSec < localDeletionTime; } @Override public String toString() { return String.format("LegacyCell(%s, name=%s, v=%s, ts=%s, ldt=%s, ttl=%s)", kind, name, ByteBufferUtil.bytesToHex(value), timestamp, localDeletionTime, ttl); } } /** * A legacy range tombstone. *

* This is used as a temporary object to facilitate dealing with the legacy format, this * is not meant to be optimal. */ public static class LegacyRangeTombstone implements LegacyAtom { public final LegacyBound start; public final LegacyBound stop; public final DeletionTime deletionTime; public LegacyRangeTombstone(LegacyBound start, LegacyBound stop, DeletionTime deletionTime) { // Because of the way RangeTombstoneList work, we can have a tombstone where only one of // the bound has a collectionName. That happens if we have a big tombstone A (spanning one // or multiple rows) and a collection tombstone B. In that case, RangeTombstoneList will // split this into 3 RTs: the first one from the beginning of A to the beginning of B, // then B, then a third one from the end of B to the end of A. To make this simpler, if // we detect that case we transform the 1st and 3rd tombstone so they don't end in the middle // of a row (which is still correct). if ((start.collectionName == null) != (stop.collectionName == null)) { if (start.collectionName == null) stop = new LegacyBound(ClusteringBound.inclusiveEndOf(stop.bound.values), stop.isStatic, null); else start = new LegacyBound(ClusteringBound.inclusiveStartOf(start.bound.values), start.isStatic, null); } else if (!Objects.equals(start.collectionName, stop.collectionName)) { // We're in the similar but slightly more complex case where on top of the big tombstone // A, we have 2 (or more) collection tombstones B and C within A. So we also end up with // a tombstone that goes between the end of B and the start of C. start = new LegacyBound(start.bound, start.isStatic, null); stop = new LegacyBound(stop.bound, stop.isStatic, null); } this.start = start; this.stop = stop; this.deletionTime = deletionTime; } /** @see LegacyAtom#clustering for static inconsistencies explained */ public ClusteringPrefix clustering() { return start.bound; } public LegacyRangeTombstone withNewStart(LegacyBound newStart) { return new LegacyRangeTombstone(newStart, stop, deletionTime); } public LegacyRangeTombstone withNewStart(ClusteringBound newStart) { return withNewStart(new LegacyBound(newStart, start.isStatic, null)); } public LegacyRangeTombstone withNewEnd(LegacyBound newStop) { return new LegacyRangeTombstone(start, newStop, deletionTime); } public LegacyRangeTombstone withNewEnd(ClusteringBound newEnd) { return withNewEnd(new LegacyBound(newEnd, stop.isStatic, null)); } public boolean isCell() { return false; } public boolean isStatic() { return start.isStatic || stop.isStatic; } public LegacyCell asCell() { throw new UnsupportedOperationException(); } public LegacyRangeTombstone asRangeTombstone() { return this; } @Override public boolean isRowAtom(CFMetaData metadata) { return isCollectionTombstone() || isRowDeletion(metadata); } public boolean isCollectionTombstone() { return start.collectionName != null; } public boolean isRowDeletion(CFMetaData metadata) { if (start.collectionName != null || stop.collectionName != null || start.bound.size() != metadata.comparator.size() || stop.bound.size() != metadata.comparator.size()) return false; for (int i = 0; i < start.bound.size(); i++) if (!Objects.equals(start.bound.get(i), stop.bound.get(i))) return false; return true; } @Override public String toString() { return String.format("RT(%s-%s, %s)", start, stop, deletionTime); } } public static class LegacyDeletionInfo { public final MutableDeletionInfo deletionInfo; public final List inRowTombstones = new ArrayList<>(); private LegacyDeletionInfo(MutableDeletionInfo deletionInfo) { this.deletionInfo = deletionInfo; } public static LegacyDeletionInfo live() { return new LegacyDeletionInfo(MutableDeletionInfo.live()); } public void add(DeletionTime topLevel) { deletionInfo.add(topLevel); } private static ClusteringBound staticBound(CFMetaData metadata, boolean isStart) { // In pre-3.0 nodes, static row started by a clustering with all empty values so we // preserve that here. Note that in practice, it doesn't really matter since the rest // of the code will ignore the bound for RT that have their static flag set. ByteBuffer[] values = new ByteBuffer[metadata.comparator.size()]; for (int i = 0; i < values.length; i++) values[i] = ByteBufferUtil.EMPTY_BYTE_BUFFER; return isStart ? ClusteringBound.inclusiveStartOf(values) : ClusteringBound.inclusiveEndOf(values); } public void add(CFMetaData metadata, LegacyRangeTombstone tombstone) { if (metadata.hasStaticColumns()) { /* * For table having static columns we have to deal with the following cases: * 1. the end of the tombstone is static (in which case either the start is static or is BOTTOM, which is the same * for our consideration). This mean that either the range only delete the static row, or that it's a collection * tombstone of a static collection. In both case, we just add the tombstone to the inRowTombstones. * 2. only the start is static. There is then 2 subcase: either the start is inclusive, and that mean we include the * static row and more (so we add an inRowTombstone for the static and deal with the rest normally). Or the start * is exclusive, and that means we explicitely exclude the static (in which case we can just add the tombstone * as if it started at BOTTOM). * 3. none of the bound are static but the start is BOTTOM. This means we intended to delete the static row so we * need to add it to the inRowTombstones (and otherwise handle the range normally). */ if (tombstone.stop.isStatic) { // If the start is BOTTOM, we replace it by the beginning of the starting row so as to not confuse the // RangeTombstone.isRowDeletion() method if (tombstone.start == LegacyBound.BOTTOM) tombstone = tombstone.withNewStart(new LegacyBound(staticBound(metadata, true), true, null)); inRowTombstones.add(tombstone); return; } if (tombstone.start.isStatic) { if (tombstone.start.bound.isInclusive()) inRowTombstones.add(tombstone.withNewEnd(new LegacyBound(staticBound(metadata, false), true, null))); tombstone = tombstone.withNewStart(LegacyBound.BOTTOM); } else if (tombstone.start == LegacyBound.BOTTOM) { inRowTombstones.add(new LegacyRangeTombstone(new LegacyBound(staticBound(metadata, true), true, null), new LegacyBound(staticBound(metadata, false), true, null), tombstone.deletionTime)); } } if (tombstone.isCollectionTombstone() || tombstone.isRowDeletion(metadata)) inRowTombstones.add(tombstone); else add(metadata, new RangeTombstone(Slice.make(tombstone.start.bound, tombstone.stop.bound), tombstone.deletionTime)); } public void add(CFMetaData metadata, RangeTombstone tombstone) { deletionInfo.add(tombstone, metadata.comparator); } public Iterator inRowRangeTombstones() { return inRowTombstones.iterator(); } public static LegacyDeletionInfo deserialize(CFMetaData metadata, DataInputPlus in) throws IOException { DeletionTime topLevel = DeletionTime.serializer.deserialize(in); int rangeCount = in.readInt(); if (rangeCount == 0) return new LegacyDeletionInfo(new MutableDeletionInfo(topLevel)); LegacyDeletionInfo delInfo = new LegacyDeletionInfo(new MutableDeletionInfo(topLevel)); for (int i = 0; i < rangeCount; i++) { LegacyBound start = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), true); LegacyBound end = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), false); int delTime = in.readInt(); long markedAt = in.readLong(); delInfo.add(metadata, new LegacyRangeTombstone(start, end, new DeletionTime(markedAt, delTime))); } return delInfo; } } /** * A helper class for LegacyRangeTombstoneList. This replaces the Comparator that RTL used before 3.0. */ private static class LegacyBoundComparator implements Comparator { ClusteringComparator clusteringComparator; public LegacyBoundComparator(ClusteringComparator clusteringComparator) { this.clusteringComparator = clusteringComparator; } public int compare(LegacyBound a, LegacyBound b) { // In the legacy sorting, BOTTOM comes before anything else if (a == LegacyBound.BOTTOM) return b == LegacyBound.BOTTOM ? 0 : -1; if (b == LegacyBound.BOTTOM) return 1; // Excluding BOTTOM, statics are always before anything else. if (a.isStatic != b.isStatic) return a.isStatic ? -1 : 1; // We have to be careful with bound comparison because of collections. Namely, if the 2 bounds represent the // same prefix, then we should take the collectionName into account before taking the bounds kind // (ClusteringPrefix.Kind). This means we can't really call ClusteringComparator.compare() directly. // For instance, if // a is (bound=INCL_START_BOUND('x'), collectionName='d') // b is (bound=INCL_END_BOUND('x'), collectionName='c') // Ten b < a since the element 'c' of collection 'x' comes before element 'd', but calling // clusteringComparator.compare(a.bound, b.bound) returns -1. // See CASSANDRA-13125 for details. int sa = a.bound.size(); int sb = b.bound.size(); for (int i = 0; i < Math.min(sa, sb); i++) { int cmp = clusteringComparator.compareComponent(i, a.bound.get(i), b.bound.get(i)); if (cmp != 0) return cmp; } if (sa != sb) return sa < sb ? a.bound.kind().comparedToClustering : -b.bound.kind().comparedToClustering; // Both bound represent the same prefix, compare the collection names // If one has a collection name and the other doesn't, the other comes before as it points to the beginning of the row. if ((a.collectionName == null) != (b.collectionName == null)) return a.collectionName == null ? -1 : 1; // If they both have a collection, compare that first if (a.collectionName != null) { int cmp = UTF8Type.instance.compare(a.collectionName.name.bytes, b.collectionName.name.bytes); if (cmp != 0) return cmp; } // Lastly, if everything so far is equal, compare their clustering kind return ClusteringPrefix.Kind.compare(a.bound.kind(), b.bound.kind()); } } /** * Almost an entire copy of RangeTombstoneList from C* 2.1. The main difference is that LegacyBoundComparator * is used in place of {@code Comparator} (because Composite doesn't exist any more). * * This class is needed to allow us to convert single-row deletions and complex deletions into range tombstones * and properly merge them into the normal set of range tombstones. */ public static class LegacyRangeTombstoneList { private final LegacyBoundComparator comparator; // Note: we don't want to use a List for the markedAts and delTimes to avoid boxing. We could // use a List for starts and ends, but having arrays everywhere is almost simpler. LegacyBound[] starts; LegacyBound[] ends; private long[] markedAts; private int[] delTimes; private int size; private LegacyRangeTombstoneList(LegacyBoundComparator comparator, LegacyBound[] starts, LegacyBound[] ends, long[] markedAts, int[] delTimes, int size) { assert starts.length == ends.length && starts.length == markedAts.length && starts.length == delTimes.length; this.comparator = comparator; this.starts = starts; this.ends = ends; this.markedAts = markedAts; this.delTimes = delTimes; this.size = size; } public LegacyRangeTombstoneList(LegacyBoundComparator comparator, int capacity) { this(comparator, new LegacyBound[capacity], new LegacyBound[capacity], new long[capacity], new int[capacity], 0); } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append('['); for (int i = 0; i < size; i++) { if (i > 0) sb.append(','); sb.append('(').append(starts[i]).append(", ").append(ends[i]).append(')'); } return sb.append(']').toString(); } public boolean isEmpty() { return size == 0; } public int size() { return size; } /** * Adds a new range tombstone. * * This method will be faster if the new tombstone sort after all the currently existing ones (this is a common use case), * but it doesn't assume it. */ public void add(LegacyBound start, LegacyBound end, long markedAt, int delTime) { if (isEmpty()) { addInternal(0, start, end, markedAt, delTime); return; } int c = comparator.compare(ends[size-1], start); // Fast path if we add in sorted order if (c <= 0) { addInternal(size, start, end, markedAt, delTime); } else { // Note: insertFrom expect i to be the insertion point in term of interval ends int pos = Arrays.binarySearch(ends, 0, size, start, comparator); insertFrom((pos >= 0 ? pos : -pos-1), start, end, markedAt, delTime); } } /* * Inserts a new element starting at index i. This method assumes that: * ends[i-1] <= start <= ends[i] * * A RangeTombstoneList is a list of range [s_0, e_0]...[s_n, e_n] such that: * - s_i <= e_i * - e_i <= s_i+1 * - if s_i == e_i and e_i == s_i+1 then s_i+1 < e_i+1 * Basically, range are non overlapping except for their bound and in order. And while * we allow ranges with the same value for the start and end, we don't allow repeating * such range (so we can't have [0, 0][0, 0] even though it would respect the first 2 * conditions). * */ /** * Adds all the range tombstones of {@code tombstones} to this RangeTombstoneList. */ public void addAll(LegacyRangeTombstoneList tombstones) { if (tombstones.isEmpty()) return; if (isEmpty()) { copyArrays(tombstones, this); return; } /* * We basically have 2 techniques we can use here: either we repeatedly call add() on tombstones values, * or we do a merge of both (sorted) lists. If this lists is bigger enough than the one we add, then * calling add() will be faster, otherwise it's merging that will be faster. * * Let's note that during memtables updates, it might not be uncommon that a new update has only a few range * tombstones, while the CF we're adding it to (the one in the memtable) has many. In that case, using add() is * likely going to be faster. * * In other cases however, like when diffing responses from multiple nodes, the tombstone lists we "merge" will * be likely sized, so using add() might be a bit inefficient. * * Roughly speaking (this ignore the fact that updating an element is not exactly constant but that's not a big * deal), if n is the size of this list and m is tombstones size, merging is O(n+m) while using add() is O(m*log(n)). * * But let's not crank up a logarithm computation for that. Long story short, merging will be a bad choice only * if this list size is lot bigger that the other one, so let's keep it simple. */ if (size > 10 * tombstones.size) { for (int i = 0; i < tombstones.size; i++) add(tombstones.starts[i], tombstones.ends[i], tombstones.markedAts[i], tombstones.delTimes[i]); } else { int i = 0; int j = 0; while (i < size && j < tombstones.size) { if (comparator.compare(tombstones.starts[j], ends[i]) <= 0) { insertFrom(i, tombstones.starts[j], tombstones.ends[j], tombstones.markedAts[j], tombstones.delTimes[j]); j++; } else { i++; } } // Addds the remaining ones from tombstones if any (note that addInternal will increment size if relevant). for (; j < tombstones.size; j++) addInternal(size, tombstones.starts[j], tombstones.ends[j], tombstones.markedAts[j], tombstones.delTimes[j]); } } private static void copyArrays(LegacyRangeTombstoneList src, LegacyRangeTombstoneList dst) { dst.grow(src.size); System.arraycopy(src.starts, 0, dst.starts, 0, src.size); System.arraycopy(src.ends, 0, dst.ends, 0, src.size); System.arraycopy(src.markedAts, 0, dst.markedAts, 0, src.size); System.arraycopy(src.delTimes, 0, dst.delTimes, 0, src.size); dst.size = src.size; } private void insertFrom(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime) { while (i < size) { assert i == 0 || comparator.compare(ends[i-1], start) <= 0; int c = comparator.compare(start, ends[i]); assert c <= 0; if (c == 0) { // If start == ends[i], then we can insert from the next one (basically the new element // really start at the next element), except for the case where starts[i] == ends[i]. // In this latter case, if we were to move to next element, we could end up with ...[x, x][x, x]... if (comparator.compare(starts[i], ends[i]) == 0) { // The current element cover a single value which is equal to the start of the inserted // element. If the inserted element overwrites the current one, just remove the current // (it's included in what we insert) and proceed with the insert. if (markedAt > markedAts[i]) { removeInternal(i); continue; } // Otherwise (the current singleton interval override the new one), we want to leave the // current element and move to the next, unless start == end since that means the new element // is in fact fully covered by the current one (so we're done) if (comparator.compare(start, end) == 0) return; } i++; continue; } // Do we overwrite the current element? if (markedAt > markedAts[i]) { // We do overwrite. // First deal with what might come before the newly added one. if (comparator.compare(starts[i], start) < 0) { addInternal(i, starts[i], start, markedAts[i], delTimes[i]); i++; // We don't need to do the following line, but in spirit that's what we want to do // setInternal(i, start, ends[i], markedAts, delTime]) } // now, start <= starts[i] // Does the new element stops before/at the current one, int endCmp = comparator.compare(end, starts[i]); if (endCmp <= 0) { // Here start <= starts[i] and end <= starts[i] // This means the current element is before the current one. However, one special // case is if end == starts[i] and starts[i] == ends[i]. In that case, // the new element entirely overwrite the current one and we can just overwrite if (endCmp == 0 && comparator.compare(starts[i], ends[i]) == 0) setInternal(i, start, end, markedAt, delTime); else addInternal(i, start, end, markedAt, delTime); return; } // Do we overwrite the current element fully? int cmp = comparator.compare(ends[i], end); if (cmp <= 0) { // We do overwrite fully: // update the current element until it's end and continue // on with the next element (with the new inserted start == current end). // If we're on the last element, we can optimize if (i == size-1) { setInternal(i, start, end, markedAt, delTime); return; } setInternal(i, start, ends[i], markedAt, delTime); if (cmp == 0) return; start = ends[i]; i++; } else { // We don't ovewrite fully. Insert the new interval, and then update the now next // one to reflect the not overwritten parts. We're then done. addInternal(i, start, end, markedAt, delTime); i++; setInternal(i, end, ends[i], markedAts[i], delTimes[i]); return; } } else { // we don't overwrite the current element // If the new interval starts before the current one, insert that new interval if (comparator.compare(start, starts[i]) < 0) { // If we stop before the start of the current element, just insert the new // interval and we're done; otherwise insert until the beginning of the // current element if (comparator.compare(end, starts[i]) <= 0) { addInternal(i, start, end, markedAt, delTime); return; } addInternal(i, start, starts[i], markedAt, delTime); i++; } // After that, we're overwritten on the current element but might have // some residual parts after ... // ... unless we don't extend beyond it. if (comparator.compare(end, ends[i]) <= 0) return; start = ends[i]; i++; } } // If we got there, then just insert the remainder at the end addInternal(i, start, end, markedAt, delTime); } private int capacity() { return starts.length; } private void addInternal(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime) { assert i >= 0; if (size == capacity()) growToFree(i); else if (i < size) moveElements(i); setInternal(i, start, end, markedAt, delTime); size++; } private void removeInternal(int i) { assert i >= 0; System.arraycopy(starts, i+1, starts, i, size - i - 1); System.arraycopy(ends, i+1, ends, i, size - i - 1); System.arraycopy(markedAts, i+1, markedAts, i, size - i - 1); System.arraycopy(delTimes, i+1, delTimes, i, size - i - 1); --size; starts[size] = null; ends[size] = null; } /* * Grow the arrays, leaving index i "free" in the process. */ private void growToFree(int i) { int newLength = (capacity() * 3) / 2 + 1; grow(i, newLength); } /* * Grow the arrays to match newLength capacity. */ private void grow(int newLength) { if (capacity() < newLength) grow(-1, newLength); } private void grow(int i, int newLength) { starts = grow(starts, size, newLength, i); ends = grow(ends, size, newLength, i); markedAts = grow(markedAts, size, newLength, i); delTimes = grow(delTimes, size, newLength, i); } private static LegacyBound[] grow(LegacyBound[] a, int size, int newLength, int i) { if (i < 0 || i >= size) return Arrays.copyOf(a, newLength); LegacyBound[] newA = new LegacyBound[newLength]; System.arraycopy(a, 0, newA, 0, i); System.arraycopy(a, i, newA, i+1, size - i); return newA; } private static long[] grow(long[] a, int size, int newLength, int i) { if (i < 0 || i >= size) return Arrays.copyOf(a, newLength); long[] newA = new long[newLength]; System.arraycopy(a, 0, newA, 0, i); System.arraycopy(a, i, newA, i+1, size - i); return newA; } private static int[] grow(int[] a, int size, int newLength, int i) { if (i < 0 || i >= size) return Arrays.copyOf(a, newLength); int[] newA = new int[newLength]; System.arraycopy(a, 0, newA, 0, i); System.arraycopy(a, i, newA, i+1, size - i); return newA; } /* * Move elements so that index i is "free", assuming the arrays have at least one free slot at the end. */ private void moveElements(int i) { if (i >= size) return; System.arraycopy(starts, i, starts, i+1, size - i); System.arraycopy(ends, i, ends, i+1, size - i); System.arraycopy(markedAts, i, markedAts, i+1, size - i); System.arraycopy(delTimes, i, delTimes, i+1, size - i); // we set starts[i] to null to indicate the position is now empty, so that we update boundaryHeapSize // when we set it starts[i] = null; } private void setInternal(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime) { starts[i] = start; ends[i] = end; markedAts[i] = markedAt; delTimes[i] = delTime; } public void updateDigest(MessageDigest digest) { ByteBuffer longBuffer = ByteBuffer.allocate(8); for (int i = 0; i < size; i++) { for (int j = 0; j < starts[i].bound.size(); j++) digest.update(starts[i].bound.get(j).duplicate()); if (starts[i].collectionName != null) digest.update(starts[i].collectionName.name.bytes.duplicate()); for (int j = 0; j < ends[i].bound.size(); j++) digest.update(ends[i].bound.get(j).duplicate()); if (ends[i].collectionName != null) digest.update(ends[i].collectionName.name.bytes.duplicate()); longBuffer.putLong(0, markedAts[i]); digest.update(longBuffer.array(), 0, 8); } } public void serialize(DataOutputPlus out, CFMetaData metadata) throws IOException { out.writeInt(size); if (size == 0) return; if (metadata.isCompound()) serializeCompound(out, metadata.isDense()); else serializeSimple(out); } private void serializeCompound(DataOutputPlus out, boolean isDense) throws IOException { List> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); if (!isDense) types.add(UTF8Type.instance); CompositeType type = CompositeType.getInstance(types); for (int i = 0; i < size; i++) { LegacyBound start = starts[i]; LegacyBound end = ends[i]; CompositeType.Builder startBuilder = type.builder(start.isStatic); CompositeType.Builder endBuilder = type.builder(end.isStatic); for (int j = 0; j < start.bound.clustering().size(); j++) { startBuilder.add(start.bound.get(j)); endBuilder.add(end.bound.get(j)); } if (start.collectionName != null) startBuilder.add(start.collectionName.name.bytes); if (end.collectionName != null) endBuilder.add(end.collectionName.name.bytes); ByteBufferUtil.writeWithShortLength(startBuilder.build(), out); ByteBufferUtil.writeWithShortLength(endBuilder.buildAsEndOfRange(), out); out.writeInt(delTimes[i]); out.writeLong(markedAts[i]); } } private void serializeSimple(DataOutputPlus out) throws IOException { List> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); assert types.size() == 1 : types; for (int i = 0; i < size; i++) { LegacyBound start = starts[i]; LegacyBound end = ends[i]; ClusteringPrefix startClustering = start.bound.clustering(); ClusteringPrefix endClustering = end.bound.clustering(); assert startClustering.size() == 1; assert endClustering.size() == 1; ByteBufferUtil.writeWithShortLength(startClustering.get(0), out); ByteBufferUtil.writeWithShortLength(endClustering.get(0), out); out.writeInt(delTimes[i]); out.writeLong(markedAts[i]); } } public long serializedSize(CFMetaData metadata) { long size = 0; size += TypeSizes.sizeof(this.size); if (this.size == 0) return size; if (metadata.isCompound()) return size + serializedSizeCompound(metadata.isDense()); else return size + serializedSizeSimple(); } private long serializedSizeCompound(boolean isDense) { long size = 0; List> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); if (!isDense) types.add(UTF8Type.instance); CompositeType type = CompositeType.getInstance(types); for (int i = 0; i < this.size; i++) { LegacyBound start = starts[i]; LegacyBound end = ends[i]; CompositeType.Builder startBuilder = type.builder(); CompositeType.Builder endBuilder = type.builder(); for (int j = 0; j < start.bound.size(); j++) startBuilder.add(start.bound.get(j)); for (int j = 0; j < end.bound.size(); j++) endBuilder.add(end.bound.get(j)); if (start.collectionName != null) startBuilder.add(start.collectionName.name.bytes); if (end.collectionName != null) endBuilder.add(end.collectionName.name.bytes); size += ByteBufferUtil.serializedSizeWithShortLength(startBuilder.build()); size += ByteBufferUtil.serializedSizeWithShortLength(endBuilder.buildAsEndOfRange()); size += TypeSizes.sizeof(delTimes[i]); size += TypeSizes.sizeof(markedAts[i]); } return size; } private long serializedSizeSimple() { long size = 0; List> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); assert types.size() == 1 : types; for (int i = 0; i < this.size; i++) { LegacyBound start = starts[i]; LegacyBound end = ends[i]; ClusteringPrefix startClustering = start.bound.clustering(); ClusteringPrefix endClustering = end.bound.clustering(); assert startClustering.size() == 1; assert endClustering.size() == 1; size += ByteBufferUtil.serializedSizeWithShortLength(startClustering.get(0)); size += ByteBufferUtil.serializedSizeWithShortLength(endClustering.get(0)); size += TypeSizes.sizeof(delTimes[i]); size += TypeSizes.sizeof(markedAts[i]); } return size; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy