com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServices Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of atlasdb-cassandra Show documentation
Palantir open source project
The newest version!
/*
 * (c) Copyright 2018 Palantir Technologies Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.palantir.atlasdb.keyvalue.cassandra;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.palantir.atlasdb.AtlasDbConstants;
import com.palantir.atlasdb.cassandra.CassandraKeyValueServiceConfig;
import com.palantir.atlasdb.encoding.PtBytes;
import com.palantir.atlasdb.keyvalue.api.Cell;
import com.palantir.atlasdb.keyvalue.api.ColumnSelection;
import com.palantir.atlasdb.keyvalue.api.RangeRequest;
import com.palantir.atlasdb.keyvalue.api.RangeRequests;
import com.palantir.atlasdb.keyvalue.api.TableReference;
import com.palantir.atlasdb.keyvalue.api.Value;
import com.palantir.atlasdb.table.description.TableMetadata;
import com.palantir.common.base.RunnableCheckedException;
import com.palantir.common.base.Throwables;
import com.palantir.common.visitor.Visitor;
import com.palantir.logsafe.SafeArg;
import com.palantir.logsafe.logger.SafeLogger;
import com.palantir.logsafe.logger.SafeLoggerFactory;
import com.palantir.util.Pair;
import com.palantir.util.io.AvailabilityRequirement;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.cassandra.thrift.CfDef;
import org.apache.cassandra.thrift.Column;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.thrift.TException;

public final class CassandraKeyValueServices {
    private static final SafeLogger log = SafeLoggerFactory.get(CassandraKeyValueServices.class);

    private static final long INITIAL_SLEEP_TIME = 100;
    private static final long MAX_SLEEP_TIME = 5000;
    public static final String VERSION_UNREACHABLE = "UNREACHABLE";
    public static final byte[] METADATA_COL = "m".getBytes(StandardCharsets.UTF_8);

    private CassandraKeyValueServices() {
        // Utility class
    }

    static void waitForSchemaVersions(
            int schemaMutationTimeMillis, CassandraClient client, String unsafeSchemaChangeDescription)
            throws TException {
        waitForSchemaVersions(
                schemaMutationTimeMillis, client, unsafeSchemaChangeDescription, AvailabilityRequirement.QUORUM);
    }

    /**
     * Attempt to wait until the specified number of nodes are available and there are no disagreements on schema
     * version.
     * 
     * The goals of this method include:
     * 

     *     
     *         Backoff during schema mutations if the cluster is known to currently be in disagreement, so that
     *         Cassandra can more efficiently come to agreement.
     *     
     *     
     *         Avoid performing schema mutations if our availability requirements are not met. This is to prevent a
     *         split-brain situation in respect to the schema. While this _should_ be a safe state to be in, there is
     *         little-to-no risk to availability for waiting for consensus and availability. If the specified
     *         availability requirements are not met, then it is very likely Cassandra is in an outage anyways, which
     *         means it is ok for a service to not start up. There is of course a check-then-act race condition, but
     *         Cassandra is able to eventually recover.
     *     
     *     
     *         Allowing schema mutations to take place in the presence of failures (the KVS needs to be able to
     *         tolerate a limited number of Cassandra node-level failures).
     *     
     * 
     *
     * @param schemaMutationTimeMillis      Time to wait for nodes' schema versions to match.
     * @param client                        Cassandra client.
     * @param unsafeSchemaChangeDescription Description of the schema change that was performed prior to this check.
     * @param availabilityRequirement       Number of Cassandra nodes that must be reachable for schema consensus.
     * @throws IllegalStateException if we wait for more than schemaMutationTimeoutMillis specified in config.
     */
    static void waitForSchemaVersions(
            int schemaMutationTimeMillis,
            CassandraClient client,
            String unsafeSchemaChangeDescription,
            AvailabilityRequirement availabilityRequirement)
            throws TException {
        long start = System.currentTimeMillis();
        long sleepTime = INITIAL_SLEEP_TIME;
        Map> versions;
        do {
            // This may only include some of the nodes if the coordinator hasn't shaken hands with someone; however,
            // this existed largely as a defense against performance issues with concurrent schema modifications.
            versions = client.describe_schema_versions();
            if (clusterSatisfiesAvailabilityRequirementAndNoDivergentSchemas(versions, availabilityRequirement)) {
                return;
            }
            sleepTime = sleepAndGetNextBackoffTime(sleepTime);
        } while (System.currentTimeMillis() < start + schemaMutationTimeMillis);

        log.warn(
                "Cassandra cluster failed to reach agreement on schema versions within the timeout",
                SafeArg.of("schemaMutationTimeMillis", schemaMutationTimeMillis));

        StringBuilder schemaVersions = new StringBuilder();
        for (Map.Entry> version : versions.entrySet()) {
            addNodeInformation(
                    schemaVersions, String.format("%nAt schema version %s:", version.getKey()), version.getValue());
        }

        String clusterNodes = addNodeInformation(
                        new StringBuilder(),
                        "Nodes believed to exist:",
                        versions.values().stream().flatMap(Collection::stream).collect(Collectors.toList()))
                .toString();

        String errorMessage = String.format(
                "Cassandra cluster cannot come to agreement on schema versions, %s. %s"
                        + " \nFind the nodes above that diverge from the majority schema and examine their logs to"
                        + " determine the issue. If nodes have schema 'UNKNOWN', they are likely down/unresponsive."
                        + " Fixing the underlying issue and restarting Cassandra should resolve the problem."
                        + " You can quick-check this with 'nodetool describecluster'."
                        + " \nIf nodes are specified in the config file, but do not have a schema version listed"
                        + " above, then they may have never joined the cluster. Verify your configuration is correct"
                        + " and that the nodes specified in the config are up and joined the cluster. %s",
                unsafeSchemaChangeDescription, schemaVersions.toString(), clusterNodes);
        throw new IllegalStateException(errorMessage);
    }

    static void runWithWaitingForSchemas(
            RunnableCheckedException task,
            CassandraKeyValueServiceConfig config,
            CassandraClient client,
            String unsafeSchemaChangeDescription)
            throws TException {
        waitForSchemaVersions(config.schemaMutationTimeoutMillis(), client, "before " + unsafeSchemaChangeDescription);
        task.run();
        waitForSchemaVersions(config.schemaMutationTimeoutMillis(), client, "after " + unsafeSchemaChangeDescription);
    }

    private static boolean clusterSatisfiesAvailabilityRequirementAndNoDivergentSchemas(
            Map> versions, AvailabilityRequirement availabilityRequirement) {
        if (getDistinctReachableSchemas(versions).size() != 1) {
            return false;
        }

        int totalNodes = versions.values().stream().mapToInt(List::size).sum();
        int numUnreachableNodes = Optional.ofNullable(versions.get(VERSION_UNREACHABLE))
                .map(List::size)
                .orElse(0);
        int availableNodes = totalNodes - numUnreachableNodes;
        return availabilityRequirement.satisfies(availableNodes, totalNodes);
    }

    private static List getDistinctReachableSchemas(Map> versions) {
        return versions.keySet().stream()
                .filter(schema -> !schema.equals(VERSION_UNREACHABLE))
                .collect(Collectors.toList());
    }

    private static long sleepAndGetNextBackoffTime(long sleepTime) {
        try {
            Thread.sleep(sleepTime);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw Throwables.throwUncheckedException(e);
        }
        return Math.min(sleepTime * 2, MAX_SLEEP_TIME);
    }

    private static StringBuilder addNodeInformation(StringBuilder builder, String message, List nodes) {
        builder.append(message);
        for (String node : nodes) {
            builder.append(String.format("%n\tNode: %s", node));
        }
        return builder;
    }

    /**
     * This is a request from pbrown / FDEs; basically it's a pain to do DB surgery to get out
     * of failed patch upgrades, the majority of which requires schema mutations; they would find
     * it preferable to stop before starting the actual patch upgrade / setting APPLYING state.
     */
    static void warnUserInInitializationIfClusterAlreadyInInconsistentState(
            CassandraClientPool clientPool, CassandraKeyValueServiceConfig config) {
        try {
            clientPool.run(client -> {
                waitForSchemaVersions(config.schemaMutationTimeoutMillis(), client, " during an initialization check");
                return null;
            });
        } catch (Exception e) {
            log.warn("Failed to retrieve current Cassandra cluster schema status.", e);
        }
    }

    static String encodeAsHex(byte[] array) {
        return "0x" + PtBytes.encodeHexString(array);
    }

    @SuppressWarnings("BadAssert") // performance sensitive asserts
    public static ByteBuffer makeCompositeBuffer(byte[] colName, long positiveTimestamp) {
        assert colName.length <= 1 << 16 : "Cannot use column names larger than 64KiB, was " + colName.length;

        ByteBuffer buffer = ByteBuffer.allocate(6 /* misc */ + 8 /* timestamp */ + colName.length)
                .order(ByteOrder.BIG_ENDIAN);

        buffer.put((byte) ((colName.length >> 8) & 0xFF));
        buffer.put((byte) (colName.length & 0xFF));
        buffer.put(colName);
        buffer.put((byte) 0);

        buffer.put((byte) 0);
        buffer.put((byte) (8 & 0xFF));
        buffer.putLong(~positiveTimestamp);
        buffer.put((byte) 0);

        buffer.flip();

        return buffer;
    }

    /**
     * @deprecated use {@link #decomposeColumn(ByteBuffer)}
     */
    @Deprecated
    static Pair decompose(ByteBuffer inputComposite) {
        ColumnAndTimestamp columnAndTimestamp = decomposeColumn(inputComposite);
        return Pair.create(columnAndTimestamp.columnName(), columnAndTimestamp.timestamp());
    }

    static ColumnAndTimestamp decomposeColumn(ByteBuffer inputComposite) {
        ByteBuffer composite = inputComposite.slice().order(ByteOrder.BIG_ENDIAN);

        short len = composite.getShort();
        byte[] colName = new byte[len];
        composite.get(colName);

        short shouldBeZero = composite.getShort();
        com.palantir.logsafe.Preconditions.checkArgument(shouldBeZero == 0);

        byte shouldBe8 = composite.get();
        com.palantir.logsafe.Preconditions.checkArgument(shouldBe8 == 8);
        long ts = composite.getLong();

        return new ColumnAndTimestamp(colName, ~ts);
    }

    /**
     * @deprecated use {@link #decomposeColumnName(Column)}
     */
    @Deprecated
    public static Pair decomposeName(Column column) {
        ColumnAndTimestamp columnAndTimestamp = decomposeColumnName(column);
        return Pair.create(columnAndTimestamp.columnName(), columnAndTimestamp.timestamp());
    }

    /**
     * Convenience method to get the name buffer for the specified column and
     * decompose it into the name and timestamp.
     */
    public static ColumnAndTimestamp decomposeColumnName(Column column) {
        ByteBuffer nameBuffer;
        if (column.isSetName()) {
            nameBuffer = column.bufferForName();
        } else {
            // the column buffer has not yet been set/cached
            // so we must fallback on the slowpath and force
            // the transform to bytes and wrap ourselves
            nameBuffer = ByteBuffer.wrap(column.getName());
        }
        return decomposeColumn(nameBuffer);
    }

    public static byte[] getBytesFromByteBuffer(ByteBuffer buffer) {
        // Be careful *NOT* to perform anything that will modify the buffer's position or limit
        byte[] bytes = new byte[buffer.limit() - buffer.position()];
        if (buffer.hasArray()) {
            System.arraycopy(buffer.array(), buffer.arrayOffset() + buffer.position(), bytes, 0, bytes.length);
        } else {
            buffer.duplicate().get(bytes, buffer.position(), bytes.length);
        }
        return bytes;
    }

    // /Obviously/ this is long (internal cassandra timestamp) + long (internal cassandra clock sequence and node id)
    static String convertCassandraByteBufferUuidToString(ByteBuffer uuid) {
        return new UUID(uuid.getLong(uuid.position()), uuid.getLong(uuid.position() + 8)).toString();
    }

    static String getFilteredStackTrace(String filter) {
        Exception ex = new Exception();
        StackTraceElement[] stackTrace = ex.getStackTrace();
        StringBuilder sb = new StringBuilder();
        for (StackTraceElement element : stackTrace) {
            if (element.getClassName().contains(filter)) {
                sb.append(element.toString()).append("\n");
            }
        }
        return sb.toString();
    }

    static Column createColumn(Cell cell, Value value) {
        return createColumnAtSpecificCassandraTimestamp(cell, value, value.getTimestamp());
    }

    /**
     * Creates a {@link Column} for an Atlas tombstone.
     * These columns have an Atlas timestamp of zero, but should not have a Cassandra timestamp of zero as that may
     * interfere with compactions. We want these to be at least reasonably consistent with Atlas's overall logical
     * time.
     * 
     * In practice, usage may involve obtaining a (reasonably) fresh timestamp and using that as the timestamp for the
     * deletion.
     */
    static Column createColumnForDelete(Cell cell, Value value, long cassandraTimestamp) {
        return createColumnAtSpecificCassandraTimestamp(cell, value, cassandraTimestamp);
    }

    private static Column createColumnAtSpecificCassandraTimestamp(Cell cell, Value value, long cassandraTimestamp) {
        byte[] contents = value.getContents();
        long atlasTimestamp = value.getTimestamp();
        ByteBuffer colName = makeCompositeBuffer(cell.getColumnName(), atlasTimestamp);
        Column col = new Column();
        col.setName(colName);
        col.setValue(contents);
        col.setTimestamp(cassandraTimestamp);
        return col;
    }

    static Cell getMetadataCell(TableReference tableRef) {
        // would have preferred an explicit charset, but thrift uses default internally
        return Cell.create(lowerCaseTableReferenceToBytes(tableRef), METADATA_COL);
    }

    @SuppressWarnings("checkstyle:RegexpSinglelineJava")
    static Cell getOldMetadataCell(TableReference tableRef) {
        return Cell.create(tableRef.getQualifiedName().getBytes(Charset.defaultCharset()), METADATA_COL);
    }

    static RangeRequest metadataRangeRequest() {
        return RangeRequest.builder()
                .retainColumns(ImmutableSet.of(METADATA_COL))
                .build();
    }

    static RangeRequest metadataRangeRequestForTable(TableReference tableRef) {
        byte[] startRow = upperCaseTableReferenceToBytes(tableRef);
        byte[] endRow = lowerCaseTableReferenceToBytes(tableRef);
        return RangeRequest.builder()
                .startRowInclusive(startRow)
                .endRowExclusive(RangeRequests.nextLexicographicName(endRow))
                .retainColumns(ImmutableSet.of(METADATA_COL))
                .build();
    }

    @SuppressWarnings("checkstyle:RegexpSinglelineJava")
    static byte[] lowerCaseTableReferenceToBytes(TableReference tableRef) {
        return tableRef.getQualifiedName().toLowerCase(Locale.ROOT).getBytes(Charset.defaultCharset());
    }

    @SuppressWarnings("checkstyle:RegexpSinglelineJava")
    static byte[] upperCaseTableReferenceToBytes(TableReference tableRef) {
        return tableRef.getQualifiedName().toUpperCase(Locale.ROOT).getBytes(Charset.defaultCharset());
    }

    @SuppressWarnings("checkstyle:RegexpSinglelineJava")
    static TableReference lowerCaseTableReferenceFromBytes(byte[] name) {
        return TableReference.createUnsafe(new String(name, Charset.defaultCharset()).toLowerCase(Locale.ROOT));
    }

    static TableReference tableReferenceFromCfDef(CfDef cf) {
        return TableReference.fromInternalTableName(cf.getName());
    }

    @SuppressWarnings("checkstyle:RegexpSinglelineJava")
    static TableReference tableReferenceFromBytes(byte[] name) {
        return TableReference.createUnsafe(new String(name, Charset.defaultCharset()));
    }

    interface ThreadSafeResultVisitor extends Visitor>> {
        // marker
    }

    static class StartTsResultsCollector implements ThreadSafeResultVisitor {
        private final Map collectedResults = new ConcurrentHashMap<>();
        private final ResultsExtractor extractor;
        private final long startTs;

        StartTsResultsCollector(long startTs, Function, ResultsExtractor> extractorFactory) {
            this.extractor = extractorFactory.apply(collectedResults);
            this.startTs = startTs;
        }

        public Map getCollectedResults() {
            return collectedResults;
        }

        @Override
        public void visit(Map> results) {
            extractor.extractResults(results, startTs, ColumnSelection.all());
        }
    }

    static class AllTimestampsCollector implements ThreadSafeResultVisitor {
        private final Multimap collectedResults = HashMultimap.create();

        public Multimap getCollectedResults() {
            return collectedResults;
        }

        @Override
        public synchronized void visit(Map> results) {
            for (Map.Entry> result : results.entrySet()) {
                byte[] row = CassandraKeyValueServices.getBytesFromByteBuffer(result.getKey());
                for (ColumnOrSuperColumn col : result.getValue()) {
                    ColumnAndTimestamp columnAndTimestamp = CassandraKeyValueServices.decomposeColumnName(col.column);
                    Cell cell = Cell.create(row, columnAndTimestamp.columnName());
                    collectedResults.put(cell, columnAndTimestamp.timestamp());
                }
            }
        }
    }

    static boolean isEmptyOrInvalidMetadata(byte[] metadata) {
        return metadata == null
                || Arrays.equals(metadata, AtlasDbConstants.EMPTY_TABLE_METADATA)
                || Arrays.equals(metadata, AtlasDbConstants.GENERIC_TABLE_METADATA);
    }

    static TableMetadata getMetadataOrDefaultToGeneric(byte[] metadata) {
        if (metadata == null || Arrays.equals(metadata, AtlasDbConstants.EMPTY_TABLE_METADATA)) {
            return TableMetadata.BYTES_HYDRATOR.hydrateFromBytes(AtlasDbConstants.GENERIC_TABLE_METADATA);
        }
        return TableMetadata.BYTES_HYDRATOR.hydrateFromBytes(metadata);
    }

    public static class ColumnAndTimestamp {
        private final long timestamp;
        private final byte[] columnName;

        ColumnAndTimestamp(byte[] columnName, long timestamp) {
            this.columnName = columnName;
            this.timestamp = timestamp;
        }

        public byte[] columnName() {
            return columnName;
        }

        public long timestamp() {
            return timestamp;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            ColumnAndTimestamp that = (ColumnAndTimestamp) obj;
            return timestamp == that.timestamp && Arrays.equals(columnName, that.columnName);
        }

        @Override
        public int hashCode() {
            return 31 * Long.hashCode(timestamp) + Arrays.hashCode(columnName);
        }

        @Override
        public String toString() {
            return "ColumnAndTimestamp{columnName=" + Arrays.toString(columnName) + ", timestamp=" + timestamp + '}';
        }
    }
}