All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.cassandra.service.pager.PagingState Maven / Gradle / Ivy

Go to download

The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.

There is a newer version: 5.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.service.pager;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.primitives.Ints;

import org.apache.cassandra.db.Clustering;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.db.marshal.ByteArrayAccessor;
import org.apache.cassandra.db.marshal.ByteBufferAccessor;
import org.apache.cassandra.db.marshal.BytesType;
import org.apache.cassandra.db.marshal.CompositeType;
import org.apache.cassandra.db.rows.Cell;
import org.apache.cassandra.db.rows.Row;
import org.apache.cassandra.io.util.DataInputBuffer;
import org.apache.cassandra.io.util.DataOutputBuffer;
import org.apache.cassandra.io.util.DataOutputBufferFixed;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.transport.ProtocolException;
import org.apache.cassandra.transport.ProtocolVersion;

import static org.apache.cassandra.db.TypeSizes.sizeof;
import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt;
import static org.apache.cassandra.utils.ByteBufferUtil.*;
import static org.apache.cassandra.utils.vint.VIntCoding.computeUnsignedVIntSize;
import static org.apache.cassandra.utils.vint.VIntCoding.getUnsignedVInt;

@SuppressWarnings("WeakerAccess")
public class PagingState
{
    public final ByteBuffer partitionKey;  // Can be null for single partition queries.
    public final RowMark rowMark;          // Can be null if not needed.
    public final int remaining;
    public final int remainingInPartition;

    public PagingState(ByteBuffer partitionKey, RowMark rowMark, int remaining, int remainingInPartition)
    {
        this.partitionKey = partitionKey;
        this.rowMark = rowMark;
        this.remaining = remaining;
        this.remainingInPartition = remainingInPartition;
    }

    public ByteBuffer serialize(ProtocolVersion protocolVersion)
    {
        assert rowMark == null || protocolVersion == rowMark.protocolVersion;
        try
        {
            return protocolVersion.isGreaterThan(ProtocolVersion.V3) ? modernSerialize() : legacySerialize(true);
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
    }

    public int serializedSize(ProtocolVersion protocolVersion)
    {
        assert rowMark == null || protocolVersion == rowMark.protocolVersion;

        return protocolVersion.isGreaterThan(ProtocolVersion.V3) ? modernSerializedSize() : legacySerializedSize(true);
    }

    /**
     * It's possible to receive a V3 paging state on a V4 client session, and vice versa - so we cannot
     * blindly rely on the protocol version provided. We must verify first that the buffer indeed contains
     * a paging state that adheres to the protocol version provided, or, if not - see if it is in a different
     * version, in which case we try the other format.
     */
    public static PagingState deserialize(ByteBuffer bytes, ProtocolVersion protocolVersion)
    {
        if (bytes == null)
            return null;

        try
        {
            /*
             * We can't just attempt to deser twice, as we risk to misinterpet short/vint
             * lengths and allocate huge byte arrays for readWithVIntLength() or,
             * to a lesser extent, readWithShortLength()
             */

            if (protocolVersion.isGreaterThan(ProtocolVersion.V3))
            {
                if (isModernSerialized(bytes)) return modernDeserialize(bytes, protocolVersion);
                if (isLegacySerialized(bytes)) return legacyDeserialize(bytes, ProtocolVersion.V3);
            }

            if (protocolVersion.isSmallerThan(ProtocolVersion.V4))
            {
                if (isLegacySerialized(bytes)) return legacyDeserialize(bytes, protocolVersion);
                if (isModernSerialized(bytes)) return modernDeserialize(bytes, ProtocolVersion.V4);
            }
        }
        catch (IOException e)
        {
            throw new ProtocolException("Invalid value for the paging state");
        }

        throw new ProtocolException("Invalid value for the paging state");
    }

    /*
     * Modern serde (> VERSION_3)
     */

    @SuppressWarnings({ "resource", "RedundantSuppression" })
    private ByteBuffer modernSerialize() throws IOException
    {
        DataOutputBuffer out = new DataOutputBufferFixed(modernSerializedSize());
        writeWithVIntLength(null == partitionKey ? EMPTY_BYTE_BUFFER : partitionKey, out);
        writeWithVIntLength(null == rowMark ? EMPTY_BYTE_BUFFER : rowMark.mark, out);
        out.writeUnsignedVInt(remaining);
        out.writeUnsignedVInt(remainingInPartition);
        return out.buffer(false);
    }

    @VisibleForTesting
    static boolean isModernSerialized(ByteBuffer bytes)
    {
        int index = bytes.position();
        int limit = bytes.limit();

        int partitionKeyLen = toIntExact(getUnsignedVInt(bytes, index, limit));
        if (partitionKeyLen < 0)
            return false;
        index = addNonNegative(index, computeUnsignedVIntSize(partitionKeyLen), partitionKeyLen);
        if (index >= limit || index < 0)
            return false;

        int rowMarkerLen = toIntExact(getUnsignedVInt(bytes, index, limit));
        if (rowMarkerLen < 0)
            return false;
        index = addNonNegative(index, computeUnsignedVIntSize(rowMarkerLen), rowMarkerLen);
        if (index >= limit || index < 0)
            return false;

        int remaining = toIntExact(getUnsignedVInt(bytes, index, limit));
        if (remaining < 0)
            return false;
        index = addNonNegative(index, computeUnsignedVIntSize(remaining));
        if (index >= limit || index < 0)
            return false;

        long remainingInPartition = getUnsignedVInt(bytes, index, limit);
        if (remainingInPartition < 0)
            return false;
        index = addNonNegative(index, computeUnsignedVIntSize(remainingInPartition));
        return index == limit;
    }

    // Following operations are similar to Math.{addExact/toIntExact}, but without using exceptions for control flow.
    // Since we're operating non-negative numbers, we can use -1 return value as an error code.
    private static int addNonNegative(int x, int y)
    {
        int sum = x + y;
        if (sum < 0)
            return -1;
        return sum;
    }

    private static int addNonNegative(int x, int y, int z)
    {
        int sum = x + y;
        if (sum < 0)
            return -1;
        sum += z;
        if (sum < 0)
            return -1;
        return sum;
    }

    private static int toIntExact(long value)
    {
        if ((int)value != value)
            return -1;
        return (int)value;
    }

    @SuppressWarnings({ "resource", "RedundantSuppression" })
    private static PagingState modernDeserialize(ByteBuffer bytes, ProtocolVersion protocolVersion) throws IOException
    {
        if (protocolVersion.isSmallerThan(ProtocolVersion.V4))
            throw new IllegalArgumentException();

        DataInputBuffer in = new DataInputBuffer(bytes, false);

        ByteBuffer partitionKey = readWithVIntLength(in);
        ByteBuffer rawMark = readWithVIntLength(in);
        int remaining = Ints.checkedCast(in.readUnsignedVInt());
        int remainingInPartition = Ints.checkedCast(in.readUnsignedVInt());

        return new PagingState(partitionKey.hasRemaining() ? partitionKey : null,
                               rawMark.hasRemaining() ? new RowMark(rawMark, protocolVersion) : null,
                               remaining,
                               remainingInPartition);
    }

    private int modernSerializedSize()
    {
        return serializedSizeWithVIntLength(null == partitionKey ? EMPTY_BYTE_BUFFER : partitionKey)
             + serializedSizeWithVIntLength(null == rowMark ? EMPTY_BYTE_BUFFER : rowMark.mark)
             + sizeofUnsignedVInt(remaining)
             + sizeofUnsignedVInt(remainingInPartition);
    }

    /*
     * Legacy serde (< VERSION_4)
     *
     * There are two versions of legacy PagingState format - one used by 2.1/2.2 and one used by 3.0+.
     * The latter includes remainingInPartition count, while the former doesn't.
     */

    @VisibleForTesting
    @SuppressWarnings({ "resource", "RedundantSuppression" })
    ByteBuffer legacySerialize(boolean withRemainingInPartition) throws IOException
    {
        DataOutputBuffer out = new DataOutputBufferFixed(legacySerializedSize(withRemainingInPartition));
        writeWithShortLength(null == partitionKey ? EMPTY_BYTE_BUFFER : partitionKey, out);
        writeWithShortLength(null == rowMark ? EMPTY_BYTE_BUFFER : rowMark.mark, out);
        out.writeInt(remaining);
        if (withRemainingInPartition)
            out.writeInt(remainingInPartition);
        return out.buffer(false);
    }

    @VisibleForTesting
    static boolean isLegacySerialized(ByteBuffer bytes)
    {
        int index = bytes.position();
        int limit = bytes.limit();

        if (limit - index < 2)
            return false;
        short partitionKeyLen = bytes.getShort(index);
        if (partitionKeyLen < 0)
            return false;
        index += 2 + partitionKeyLen;

        if (limit - index < 2)
            return false;
        short rowMarkerLen = bytes.getShort(index);
        if (rowMarkerLen < 0)
            return false;
        index += 2 + rowMarkerLen;

        if (limit - index < 4)
            return false;
        int remaining = bytes.getInt(index);
        if (remaining < 0)
            return false;
        index += 4;

        // V3 encoded by 2.1/2.2 - sans remainingInPartition
        if (index == limit)
            return true;

        if (limit - index == 4)
        {
            int remainingInPartition = bytes.getInt(index);
            return remainingInPartition >= 0; // the value must make sense
        }
        return false;
    }

    @SuppressWarnings({ "resource", "RedundantSuppression" })
    private static PagingState legacyDeserialize(ByteBuffer bytes, ProtocolVersion protocolVersion) throws IOException
    {
        if (protocolVersion.isGreaterThan(ProtocolVersion.V3))
            throw new IllegalArgumentException();

        DataInputBuffer in = new DataInputBuffer(bytes, false);

        ByteBuffer partitionKey = readWithShortLength(in);
        ByteBuffer rawMark = readWithShortLength(in);
        int remaining = in.readInt();
        /*
         * 2.1/2.2 implementations of V3 protocol did not write remainingInPartition, but C* 3.0+ does, so we need
         * to handle both variants of V3 serialization for compatibility.
         */
        int remainingInPartition = in.available() > 0 ? in.readInt() : Integer.MAX_VALUE;

        return new PagingState(partitionKey.hasRemaining() ? partitionKey : null,
                               rawMark.hasRemaining() ? new RowMark(rawMark, protocolVersion) : null,
                               remaining,
                               remainingInPartition);
    }

    @VisibleForTesting
    int legacySerializedSize(boolean withRemainingInPartition)
    {
        return serializedSizeWithShortLength(null == partitionKey ? EMPTY_BYTE_BUFFER : partitionKey)
             + serializedSizeWithShortLength(null == rowMark ? EMPTY_BYTE_BUFFER : rowMark.mark)
             + sizeof(remaining)
             + (withRemainingInPartition ? sizeof(remainingInPartition) : 0);
    }

    @Override
    public final int hashCode()
    {
        return Objects.hash(partitionKey, rowMark, remaining, remainingInPartition);
    }

    @Override
    public final boolean equals(Object o)
    {
        if(!(o instanceof PagingState))
            return false;
        PagingState that = (PagingState)o;
        return Objects.equals(this.partitionKey, that.partitionKey)
            && Objects.equals(this.rowMark, that.rowMark)
            && this.remaining == that.remaining
            && this.remainingInPartition == that.remainingInPartition;
    }

    @Override
    public String toString()
    {
        return String.format("PagingState(key=%s, cellname=%s, remaining=%d, remainingInPartition=%d",
                             partitionKey != null ? bytesToHex(partitionKey) : null,
                             rowMark,
                             remaining,
                             remainingInPartition);
    }

    /**
     * Marks the last row returned by paging, the one from which paging should continue.
     * This class essentially holds a row clustering, but due to backward compatibility reasons,
     * we need to actually store  the cell name for the last cell of the row we're marking when
     * the protocol v3 is in use, and this class abstract that complication.
     *
     * See CASSANDRA-10254 for more details.
     */
    public static class RowMark
    {
        // This can be null for convenience if no row is marked.
        private final ByteBuffer mark;
        private final ProtocolVersion protocolVersion;

        private RowMark(ByteBuffer mark, ProtocolVersion protocolVersion)
        {
            this.mark = mark;
            this.protocolVersion = protocolVersion;
        }

        private static List> makeClusteringTypes(TableMetadata metadata)
        {
            // This is the types that will be used when serializing the clustering in the paging state. We can't really use the actual clustering
            // types however because we can't guarantee that there won't be a schema change between when we send the paging state and get it back,
            // and said schema change could theoretically change one of the clustering types from a fixed width type to a non-fixed one
            // (say timestamp -> blob). So we simply use a list of BytesTypes (for both reading and writting), which may be slightly inefficient
            // for fixed-width types, but avoid any risk during schema changes.
            int size = metadata.clusteringColumns().size();
            List> l = new ArrayList<>(size);
            for (int i = 0; i < size; i++)
                l.add(BytesType.instance);
            return l;
        }

        public static RowMark create(TableMetadata metadata, Row row, ProtocolVersion protocolVersion)
        {
            ByteBuffer mark;
            if (protocolVersion.isSmallerOrEqualTo(ProtocolVersion.V3))
            {
                // We need to be backward compatible with 2.1/2.2 nodes paging states. Which means we have to send
                // the full cellname of the "last" cell in the row we get (since that's how 2.1/2.2 nodes will start after
                // that last row if they get that paging state).
                Iterator> cells = row.cellsInLegacyOrder(metadata, true).iterator();
                if (!cells.hasNext())
                {
                    // If the last returned row has no cell, this means in 2.1/2.2 terms that we stopped on the row
                    // marker.  Note that this shouldn't happen if the table is COMPACT STORAGE tables.
                    assert !metadata.isCompactTable();
                    mark = encodeCellName(metadata, row.clustering(), EMPTY_BYTE_BUFFER, null);
                }
                else
                {
                    Cell cell = cells.next();
                    mark = encodeCellName(metadata, row.clustering(), cell.column().name.bytes, cell.column().isComplex() ? cell.path().get(0) : null);
                }
            }
            else
            {
                // We froze the serialization version to 3.0 as we need to make this this doesn't change (that is, it has to be
                // fix for a given version of the protocol).
                mark = Clustering.serializer.serialize(row.clustering(), MessagingService.VERSION_30, makeClusteringTypes(metadata));
            }
            return new RowMark(mark, protocolVersion);
        }

        public Clustering clustering(TableMetadata metadata)
        {
            if (mark == null)
                return null;

            return protocolVersion.isSmallerOrEqualTo(ProtocolVersion.V3)
                 ? decodeClustering(metadata, mark)
                 : Clustering.serializer.deserialize(mark, MessagingService.VERSION_30, makeClusteringTypes(metadata));
        }

        // Old (pre-3.0) encoding of cells. We need that for the protocol v3 as that is how things where encoded
        private static ByteBuffer encodeCellName(TableMetadata metadata, Clustering clustering, ByteBuffer columnName, ByteBuffer collectionElement)
        {
            // v30 and v3X don't use composites for single-element clusterings in compact tables
            if (metadata.isCompactTable() && metadata.comparator.size() == 1)
                return clustering.bufferAt(0);

            boolean isStatic = clustering == Clustering.STATIC_CLUSTERING;

            // We use comparator.size() rather than clustering.size() because of static clusterings
            int clusteringSize = metadata.comparator.size();
            int size = clusteringSize + 1 + (collectionElement == null ? 0 : 1);
            ByteBuffer[] values = new ByteBuffer[size];
            for (int i = 0; i < clusteringSize; i++)
            {
                if (isStatic)
                {
                    values[i] = EMPTY_BYTE_BUFFER;
                    continue;
                }

                ByteBuffer v = clustering.bufferAt(i);
                // we can have null (only for dense compound tables for backward compatibility reasons) but that
                // means we're done and should stop there as far as building the composite is concerned.
                if (v == null)
                    return CompositeType.build(ByteBufferAccessor.instance, Arrays.copyOfRange(values, 0, i));

                values[i] = v;
            }

            values[clusteringSize] = columnName;
            if (collectionElement != null)
                values[clusteringSize + 1] = collectionElement;

            return CompositeType.build(ByteBufferAccessor.instance, isStatic, values);
        }

        private static Clustering decodeClustering(TableMetadata metadata, ByteBuffer value)
        {
            int csize = metadata.comparator.size();
            if (csize == 0)
                return Clustering.EMPTY;

            // v30 and v3X don't use composites for single-element clusterings in compact tables
            if (metadata.isCompactTable() && metadata.comparator.size() == 1)
                return Clustering.make(value);

            if (CompositeType.isStaticName(value, ByteBufferAccessor.instance))
                return Clustering.STATIC_CLUSTERING;

            List components = CompositeType.splitName(value, ByteBufferAccessor.instance);

            return Clustering.make(components.subList(0, Math.min(csize, components.size())).toArray(new ByteBuffer[csize]));
        }

        @Override
        public final int hashCode()
        {
            return Objects.hash(mark, protocolVersion);
        }

        @Override
        public final boolean equals(Object o)
        {
            if(!(o instanceof RowMark))
                return false;
            RowMark that = (RowMark)o;
            return Objects.equals(this.mark, that.mark) && this.protocolVersion == that.protocolVersion;
        }

        @Override
        public String toString()
        {
            return mark == null ? "null" : bytesToHex(mark);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy