All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServices Maven / Gradle / Ivy

The newest version!
/*
 * (c) Copyright 2018 Palantir Technologies Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.palantir.atlasdb.keyvalue.cassandra;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.palantir.atlasdb.AtlasDbConstants;
import com.palantir.atlasdb.cassandra.CassandraKeyValueServiceConfig;
import com.palantir.atlasdb.encoding.PtBytes;
import com.palantir.atlasdb.keyvalue.api.Cell;
import com.palantir.atlasdb.keyvalue.api.ColumnSelection;
import com.palantir.atlasdb.keyvalue.api.RangeRequest;
import com.palantir.atlasdb.keyvalue.api.RangeRequests;
import com.palantir.atlasdb.keyvalue.api.TableReference;
import com.palantir.atlasdb.keyvalue.api.Value;
import com.palantir.atlasdb.table.description.TableMetadata;
import com.palantir.common.base.RunnableCheckedException;
import com.palantir.common.base.Throwables;
import com.palantir.common.visitor.Visitor;
import com.palantir.logsafe.SafeArg;
import com.palantir.logsafe.logger.SafeLogger;
import com.palantir.logsafe.logger.SafeLoggerFactory;
import com.palantir.util.Pair;
import com.palantir.util.io.AvailabilityRequirement;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.cassandra.thrift.CfDef;
import org.apache.cassandra.thrift.Column;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.thrift.TException;

public final class CassandraKeyValueServices {
    private static final SafeLogger log = SafeLoggerFactory.get(CassandraKeyValueServices.class);

    private static final long INITIAL_SLEEP_TIME = 100;
    private static final long MAX_SLEEP_TIME = 5000;
    public static final String VERSION_UNREACHABLE = "UNREACHABLE";
    public static final byte[] METADATA_COL = "m".getBytes(StandardCharsets.UTF_8);

    private CassandraKeyValueServices() {
        // Utility class
    }

    static void waitForSchemaVersions(
            int schemaMutationTimeMillis, CassandraClient client, String unsafeSchemaChangeDescription)
            throws TException {
        waitForSchemaVersions(
                schemaMutationTimeMillis, client, unsafeSchemaChangeDescription, AvailabilityRequirement.QUORUM);
    }

    /**
     * Attempt to wait until the specified number of nodes are available and there are no disagreements on schema
     * version.
     * 

* The goals of this method include: *

    *
  1. * Backoff during schema mutations if the cluster is known to currently be in disagreement, so that * Cassandra can more efficiently come to agreement. *
  2. *
  3. * Avoid performing schema mutations if our availability requirements are not met. This is to prevent a * split-brain situation in respect to the schema. While this _should_ be a safe state to be in, there is * little-to-no risk to availability for waiting for consensus and availability. If the specified * availability requirements are not met, then it is very likely Cassandra is in an outage anyways, which * means it is ok for a service to not start up. There is of course a check-then-act race condition, but * Cassandra is able to eventually recover. *
  4. *
  5. * Allowing schema mutations to take place in the presence of failures (the KVS needs to be able to * tolerate a limited number of Cassandra node-level failures). *
  6. *
* * @param schemaMutationTimeMillis Time to wait for nodes' schema versions to match. * @param client Cassandra client. * @param unsafeSchemaChangeDescription Description of the schema change that was performed prior to this check. * @param availabilityRequirement Number of Cassandra nodes that must be reachable for schema consensus. * @throws IllegalStateException if we wait for more than schemaMutationTimeoutMillis specified in config. */ static void waitForSchemaVersions( int schemaMutationTimeMillis, CassandraClient client, String unsafeSchemaChangeDescription, AvailabilityRequirement availabilityRequirement) throws TException { long start = System.currentTimeMillis(); long sleepTime = INITIAL_SLEEP_TIME; Map> versions; do { // This may only include some of the nodes if the coordinator hasn't shaken hands with someone; however, // this existed largely as a defense against performance issues with concurrent schema modifications. versions = client.describe_schema_versions(); if (clusterSatisfiesAvailabilityRequirementAndNoDivergentSchemas(versions, availabilityRequirement)) { return; } sleepTime = sleepAndGetNextBackoffTime(sleepTime); } while (System.currentTimeMillis() < start + schemaMutationTimeMillis); log.warn( "Cassandra cluster failed to reach agreement on schema versions within the timeout", SafeArg.of("schemaMutationTimeMillis", schemaMutationTimeMillis)); StringBuilder schemaVersions = new StringBuilder(); for (Map.Entry> version : versions.entrySet()) { addNodeInformation( schemaVersions, String.format("%nAt schema version %s:", version.getKey()), version.getValue()); } String clusterNodes = addNodeInformation( new StringBuilder(), "Nodes believed to exist:", versions.values().stream().flatMap(Collection::stream).collect(Collectors.toList())) .toString(); String errorMessage = String.format( "Cassandra cluster cannot come to agreement on schema versions, %s. %s" + " \nFind the nodes above that diverge from the majority schema and examine their logs to" + " determine the issue. If nodes have schema 'UNKNOWN', they are likely down/unresponsive." + " Fixing the underlying issue and restarting Cassandra should resolve the problem." + " You can quick-check this with 'nodetool describecluster'." + " \nIf nodes are specified in the config file, but do not have a schema version listed" + " above, then they may have never joined the cluster. Verify your configuration is correct" + " and that the nodes specified in the config are up and joined the cluster. %s", unsafeSchemaChangeDescription, schemaVersions.toString(), clusterNodes); throw new IllegalStateException(errorMessage); } static void runWithWaitingForSchemas( RunnableCheckedException task, CassandraKeyValueServiceConfig config, CassandraClient client, String unsafeSchemaChangeDescription) throws TException { waitForSchemaVersions(config.schemaMutationTimeoutMillis(), client, "before " + unsafeSchemaChangeDescription); task.run(); waitForSchemaVersions(config.schemaMutationTimeoutMillis(), client, "after " + unsafeSchemaChangeDescription); } private static boolean clusterSatisfiesAvailabilityRequirementAndNoDivergentSchemas( Map> versions, AvailabilityRequirement availabilityRequirement) { if (getDistinctReachableSchemas(versions).size() != 1) { return false; } int totalNodes = versions.values().stream().mapToInt(List::size).sum(); int numUnreachableNodes = Optional.ofNullable(versions.get(VERSION_UNREACHABLE)) .map(List::size) .orElse(0); int availableNodes = totalNodes - numUnreachableNodes; return availabilityRequirement.satisfies(availableNodes, totalNodes); } private static List getDistinctReachableSchemas(Map> versions) { return versions.keySet().stream() .filter(schema -> !schema.equals(VERSION_UNREACHABLE)) .collect(Collectors.toList()); } private static long sleepAndGetNextBackoffTime(long sleepTime) { try { Thread.sleep(sleepTime); } catch (InterruptedException e) { Thread.currentThread().interrupt(); throw Throwables.throwUncheckedException(e); } return Math.min(sleepTime * 2, MAX_SLEEP_TIME); } private static StringBuilder addNodeInformation(StringBuilder builder, String message, List nodes) { builder.append(message); for (String node : nodes) { builder.append(String.format("%n\tNode: %s", node)); } return builder; } /** * This is a request from pbrown / FDEs; basically it's a pain to do DB surgery to get out * of failed patch upgrades, the majority of which requires schema mutations; they would find * it preferable to stop before starting the actual patch upgrade / setting APPLYING state. */ static void warnUserInInitializationIfClusterAlreadyInInconsistentState( CassandraClientPool clientPool, CassandraKeyValueServiceConfig config) { try { clientPool.run(client -> { waitForSchemaVersions(config.schemaMutationTimeoutMillis(), client, " during an initialization check"); return null; }); } catch (Exception e) { log.warn("Failed to retrieve current Cassandra cluster schema status.", e); } } static String encodeAsHex(byte[] array) { return "0x" + PtBytes.encodeHexString(array); } @SuppressWarnings("BadAssert") // performance sensitive asserts public static ByteBuffer makeCompositeBuffer(byte[] colName, long positiveTimestamp) { assert colName.length <= 1 << 16 : "Cannot use column names larger than 64KiB, was " + colName.length; ByteBuffer buffer = ByteBuffer.allocate(6 /* misc */ + 8 /* timestamp */ + colName.length) .order(ByteOrder.BIG_ENDIAN); buffer.put((byte) ((colName.length >> 8) & 0xFF)); buffer.put((byte) (colName.length & 0xFF)); buffer.put(colName); buffer.put((byte) 0); buffer.put((byte) 0); buffer.put((byte) (8 & 0xFF)); buffer.putLong(~positiveTimestamp); buffer.put((byte) 0); buffer.flip(); return buffer; } /** * @deprecated use {@link #decomposeColumn(ByteBuffer)} */ @Deprecated static Pair decompose(ByteBuffer inputComposite) { ColumnAndTimestamp columnAndTimestamp = decomposeColumn(inputComposite); return Pair.create(columnAndTimestamp.columnName(), columnAndTimestamp.timestamp()); } static ColumnAndTimestamp decomposeColumn(ByteBuffer inputComposite) { ByteBuffer composite = inputComposite.slice().order(ByteOrder.BIG_ENDIAN); short len = composite.getShort(); byte[] colName = new byte[len]; composite.get(colName); short shouldBeZero = composite.getShort(); com.palantir.logsafe.Preconditions.checkArgument(shouldBeZero == 0); byte shouldBe8 = composite.get(); com.palantir.logsafe.Preconditions.checkArgument(shouldBe8 == 8); long ts = composite.getLong(); return new ColumnAndTimestamp(colName, ~ts); } /** * @deprecated use {@link #decomposeColumnName(Column)} */ @Deprecated public static Pair decomposeName(Column column) { ColumnAndTimestamp columnAndTimestamp = decomposeColumnName(column); return Pair.create(columnAndTimestamp.columnName(), columnAndTimestamp.timestamp()); } /** * Convenience method to get the name buffer for the specified column and * decompose it into the name and timestamp. */ public static ColumnAndTimestamp decomposeColumnName(Column column) { ByteBuffer nameBuffer; if (column.isSetName()) { nameBuffer = column.bufferForName(); } else { // the column buffer has not yet been set/cached // so we must fallback on the slowpath and force // the transform to bytes and wrap ourselves nameBuffer = ByteBuffer.wrap(column.getName()); } return decomposeColumn(nameBuffer); } public static byte[] getBytesFromByteBuffer(ByteBuffer buffer) { // Be careful *NOT* to perform anything that will modify the buffer's position or limit byte[] bytes = new byte[buffer.limit() - buffer.position()]; if (buffer.hasArray()) { System.arraycopy(buffer.array(), buffer.arrayOffset() + buffer.position(), bytes, 0, bytes.length); } else { buffer.duplicate().get(bytes, buffer.position(), bytes.length); } return bytes; } // /Obviously/ this is long (internal cassandra timestamp) + long (internal cassandra clock sequence and node id) static String convertCassandraByteBufferUuidToString(ByteBuffer uuid) { return new UUID(uuid.getLong(uuid.position()), uuid.getLong(uuid.position() + 8)).toString(); } static String getFilteredStackTrace(String filter) { Exception ex = new Exception(); StackTraceElement[] stackTrace = ex.getStackTrace(); StringBuilder sb = new StringBuilder(); for (StackTraceElement element : stackTrace) { if (element.getClassName().contains(filter)) { sb.append(element.toString()).append("\n"); } } return sb.toString(); } static Column createColumn(Cell cell, Value value) { return createColumnAtSpecificCassandraTimestamp(cell, value, value.getTimestamp()); } /** * Creates a {@link Column} for an Atlas tombstone. * These columns have an Atlas timestamp of zero, but should not have a Cassandra timestamp of zero as that may * interfere with compactions. We want these to be at least reasonably consistent with Atlas's overall logical * time. *

* In practice, usage may involve obtaining a (reasonably) fresh timestamp and using that as the timestamp for the * deletion. */ static Column createColumnForDelete(Cell cell, Value value, long cassandraTimestamp) { return createColumnAtSpecificCassandraTimestamp(cell, value, cassandraTimestamp); } private static Column createColumnAtSpecificCassandraTimestamp(Cell cell, Value value, long cassandraTimestamp) { byte[] contents = value.getContents(); long atlasTimestamp = value.getTimestamp(); ByteBuffer colName = makeCompositeBuffer(cell.getColumnName(), atlasTimestamp); Column col = new Column(); col.setName(colName); col.setValue(contents); col.setTimestamp(cassandraTimestamp); return col; } static Cell getMetadataCell(TableReference tableRef) { // would have preferred an explicit charset, but thrift uses default internally return Cell.create(lowerCaseTableReferenceToBytes(tableRef), METADATA_COL); } @SuppressWarnings("checkstyle:RegexpSinglelineJava") static Cell getOldMetadataCell(TableReference tableRef) { return Cell.create(tableRef.getQualifiedName().getBytes(Charset.defaultCharset()), METADATA_COL); } static RangeRequest metadataRangeRequest() { return RangeRequest.builder() .retainColumns(ImmutableSet.of(METADATA_COL)) .build(); } static RangeRequest metadataRangeRequestForTable(TableReference tableRef) { byte[] startRow = upperCaseTableReferenceToBytes(tableRef); byte[] endRow = lowerCaseTableReferenceToBytes(tableRef); return RangeRequest.builder() .startRowInclusive(startRow) .endRowExclusive(RangeRequests.nextLexicographicName(endRow)) .retainColumns(ImmutableSet.of(METADATA_COL)) .build(); } @SuppressWarnings("checkstyle:RegexpSinglelineJava") static byte[] lowerCaseTableReferenceToBytes(TableReference tableRef) { return tableRef.getQualifiedName().toLowerCase(Locale.ROOT).getBytes(Charset.defaultCharset()); } @SuppressWarnings("checkstyle:RegexpSinglelineJava") static byte[] upperCaseTableReferenceToBytes(TableReference tableRef) { return tableRef.getQualifiedName().toUpperCase(Locale.ROOT).getBytes(Charset.defaultCharset()); } @SuppressWarnings("checkstyle:RegexpSinglelineJava") static TableReference lowerCaseTableReferenceFromBytes(byte[] name) { return TableReference.createUnsafe(new String(name, Charset.defaultCharset()).toLowerCase(Locale.ROOT)); } static TableReference tableReferenceFromCfDef(CfDef cf) { return TableReference.fromInternalTableName(cf.getName()); } @SuppressWarnings("checkstyle:RegexpSinglelineJava") static TableReference tableReferenceFromBytes(byte[] name) { return TableReference.createUnsafe(new String(name, Charset.defaultCharset())); } interface ThreadSafeResultVisitor extends Visitor>> { // marker } static class StartTsResultsCollector implements ThreadSafeResultVisitor { private final Map collectedResults = new ConcurrentHashMap<>(); private final ResultsExtractor extractor; private final long startTs; StartTsResultsCollector(long startTs, Function, ResultsExtractor> extractorFactory) { this.extractor = extractorFactory.apply(collectedResults); this.startTs = startTs; } public Map getCollectedResults() { return collectedResults; } @Override public void visit(Map> results) { extractor.extractResults(results, startTs, ColumnSelection.all()); } } static class AllTimestampsCollector implements ThreadSafeResultVisitor { private final Multimap collectedResults = HashMultimap.create(); public Multimap getCollectedResults() { return collectedResults; } @Override public synchronized void visit(Map> results) { for (Map.Entry> result : results.entrySet()) { byte[] row = CassandraKeyValueServices.getBytesFromByteBuffer(result.getKey()); for (ColumnOrSuperColumn col : result.getValue()) { ColumnAndTimestamp columnAndTimestamp = CassandraKeyValueServices.decomposeColumnName(col.column); Cell cell = Cell.create(row, columnAndTimestamp.columnName()); collectedResults.put(cell, columnAndTimestamp.timestamp()); } } } } static boolean isEmptyOrInvalidMetadata(byte[] metadata) { return metadata == null || Arrays.equals(metadata, AtlasDbConstants.EMPTY_TABLE_METADATA) || Arrays.equals(metadata, AtlasDbConstants.GENERIC_TABLE_METADATA); } static TableMetadata getMetadataOrDefaultToGeneric(byte[] metadata) { if (metadata == null || Arrays.equals(metadata, AtlasDbConstants.EMPTY_TABLE_METADATA)) { return TableMetadata.BYTES_HYDRATOR.hydrateFromBytes(AtlasDbConstants.GENERIC_TABLE_METADATA); } return TableMetadata.BYTES_HYDRATOR.hydrateFromBytes(metadata); } public static class ColumnAndTimestamp { private final long timestamp; private final byte[] columnName; ColumnAndTimestamp(byte[] columnName, long timestamp) { this.columnName = columnName; this.timestamp = timestamp; } public byte[] columnName() { return columnName; } public long timestamp() { return timestamp; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null || getClass() != obj.getClass()) { return false; } ColumnAndTimestamp that = (ColumnAndTimestamp) obj; return timestamp == that.timestamp && Arrays.equals(columnName, that.columnName); } @Override public int hashCode() { return 31 * Long.hashCode(timestamp) + Arrays.hashCode(columnName); } @Override public String toString() { return "ColumnAndTimestamp{columnName=" + Arrays.toString(columnName) + ", timestamp=" + timestamp + '}'; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy