org.apache.cassandra.service.StorageService Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
Show all versions of cassandra-all Show documentation
Palantir open source project
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.service;
import java.io.*;
import java.lang.management.ManagementFactory;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.time.Instant;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.stream.Collectors;
import javax.management.*;
import javax.management.openmbean.TabularData;
import javax.management.openmbean.TabularDataSupport;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.*;
import com.google.common.util.concurrent.*;
import com.palantir.cassandra.db.BootstrappingSafetyException;
import com.palantir.cassandra.settings.LocalQuorumReadForSerialCasSetting;
import com.palantir.logsafe.Safe;
import com.palantir.logsafe.SafeArg;
import org.apache.cassandra.schema.LegacySchemaTables;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ch.qos.logback.classic.LoggerContext;
import ch.qos.logback.classic.jmx.JMXConfiguratorMBean;
import ch.qos.logback.classic.spi.ILoggingEvent;
import ch.qos.logback.core.Appender;
import com.palantir.cassandra.cvim.CrossVpcIpMappingAckVerbHandler;
import com.palantir.cassandra.cvim.CrossVpcIpMappingSynVerbHandler;
import com.palantir.cassandra.dht.SingleRackFilter;
import com.palantir.cassandra.settings.DisableClientInterfaceSetting;
import com.palantir.cassandra.settings.LockKeyspaceCreationSetting;
import org.apache.cassandra.auth.AuthKeyspace;
import org.apache.cassandra.auth.AuthMigrationListener;
import org.apache.cassandra.concurrent.ScheduledExecutors;
import org.apache.cassandra.concurrent.Stage;
import org.apache.cassandra.concurrent.StageManager;
import org.apache.cassandra.config.*;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.commitlog.CommitLog;
import org.apache.cassandra.db.commitlog.CommitLogReplayer;
import org.apache.cassandra.db.compaction.CompactionManager;
import org.apache.cassandra.dht.*;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.exceptions.*;
import org.apache.cassandra.gms.*;
import org.apache.cassandra.io.sstable.SSTableDeletingTask;
import org.apache.cassandra.io.sstable.SSTableLoader;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.locator.*;
import org.apache.cassandra.metrics.NonTransientErrorMetrics;
import org.apache.cassandra.metrics.StorageMetrics;
import org.apache.cassandra.net.*;
import org.apache.cassandra.repair.*;
import org.apache.cassandra.repair.messages.RepairOption;
import org.apache.cassandra.service.opstate.CleanupStateTracker;
import org.apache.cassandra.service.paxos.CommitVerbHandler;
import org.apache.cassandra.service.paxos.PrepareVerbHandler;
import org.apache.cassandra.service.paxos.ProposeVerbHandler;
import org.apache.cassandra.streaming.*;
import org.apache.cassandra.thrift.EndpointDetails;
import org.apache.cassandra.thrift.TokenRange;
import org.apache.cassandra.thrift.cassandraConstants;
import org.apache.cassandra.tracing.TraceKeyspace;
import org.apache.cassandra.utils.*;
import org.apache.cassandra.utils.concurrent.SimpleCondition;
import org.apache.cassandra.utils.progress.ProgressListener;
import org.apache.cassandra.utils.progress.jmx.JMXProgressSupport;
import org.apache.cassandra.utils.progress.jmx.LegacyJMXProgressSupport;
import static java.util.concurrent.TimeUnit.MINUTES;
/**
* This abstraction contains the token/identifier of this node
* on the identifier space. This token gets gossiped around.
* This class will also maintain histograms of the load information
* of other nodes in the cluster.
*/
public class StorageService extends NotificationBroadcasterSupport implements IEndpointStateChangeSubscriber, StorageServiceMBean
{
// Class-wide logger.
private static final Logger logger = LoggerFactory.getLogger(StorageService.class);
// JVM system-property switches, read once at class initialisation. Boolean.getBoolean yields false when the property is unset.
private static final boolean DISABLE_WAIT_TO_BOOTSTRAP = Boolean.getBoolean("palantir_cassandra.disable_wait_to_bootstrap");
private static final boolean DISABLE_WAIT_TO_FINISH_BOOTSTRAP = Boolean.getBoolean("palantir_cassandra.disable_wait_to_finish_bootstrap");
// Integer.getInteger yields null when the property is unset or is not a parseable integer.
private static final Integer BOOTSTRAP_DISK_USAGE_THRESHOLD = Integer.getInteger("palantir_cassandra.bootstrap_disk_usage_threshold_percentage");
public static final int RING_DELAY = getRingDelay(); // delay (in ms) after which we assume the ring has stabilized
private final JMXProgressSupport progressSupport = new JMXProgressSupport(this);
private final CleanupStateTracker cleanupState = new CleanupStateTracker();
private int cleanupOpsInProgress = 0;
private final RepairTracker repairTracker = new RepairTracker();
// Listeners added through registerBootstrapListener(ProgressListener).
// NOTE(review): declared as a raw List here; the element type argument appears to have been lost - confirm against the original source.
private final List bootstrapListeners = new CopyOnWriteArrayList<>();
// joinTokenRing awaits startBootstrapCondition before bootstrapping. Both conditions are constructed from the
// DISABLE_WAIT_* flags above - presumably a `true` argument means "already signalled"; confirm in SimpleCondition.
private final Condition startBootstrapCondition = new SimpleCondition(DISABLE_WAIT_TO_BOOTSTRAP);
private final Condition finishBootstrapCondition = new SimpleCondition(DISABLE_WAIT_TO_FINISH_BOOTSTRAP);
/**
* @deprecated backward support to previous notification interface
* Will be removed on 4.0
*/
@Deprecated
private final LegacyJMXProgressSupport legacyProgressSupport;
/**
 * Resolves the value used for {@code RING_DELAY}, in milliseconds.
 * The system property {@code cassandra.ring_delay_ms} wins when present; otherwise 30 seconds is used.
 *
 * @return the ring delay in milliseconds
 * @throws NumberFormatException if the property is set but is not a valid integer
 */
private static int getRingDelay()
{
    final String override = System.getProperty("cassandra.ring_delay_ms");
    if (override == null)
        return 30 * 1000;

    logger.info("Overriding RING_DELAY to {}ms", override);
    return Integer.parseInt(override);
}
/* This abstraction maintains the token/endpoint metadata information */
private TokenMetadata tokenMetadata = new TokenMetadata(true);
// Factory for the versioned gossip values this service publishes; built from the configured partitioner.
public volatile VersionedValue.VersionedValueFactory valueFactory = new VersionedValue.VersionedValueFactory(getPartitioner());
// JVM shutdown hook thread; created and registered in initServer(int), deregistered by removeShutdownHook().
private Thread drainOnShutdown = null;
// Set to true by the shutdown hook thread as its first action.
private volatile boolean inShutdownHook = false;
// Process-wide singleton, created during class initialisation.
public static final StorageService instance = new StorageService();
/**
 * @return {@code true} once the shutdown hook thread has started running
 */
public boolean isInShutdownHook()
{
    return this.inShutdownHook;
}
/**
 * @return the partitioner reported by {@code DatabaseDescriptor}
 */
public static IPartitioner getPartitioner()
{
    final IPartitioner configured = DatabaseDescriptor.getPartitioner();
    return configured;
}
public Collection> getLocalRanges(String keyspaceName)
{
return getRangesForEndpoint(keyspaceName, FBUtilities.getBroadcastAddress());
}
public Collection> getPrimaryRanges(String keyspace)
{
return getPrimaryRangesForEndpoint(keyspace, FBUtilities.getBroadcastAddress());
}
public Collection> getPrimaryRangesWithinDC(String keyspace)
{
return getPrimaryRangeForEndpointWithinDC(keyspace, FBUtilities.getBroadcastAddress());
}
// NOTE(review): raw Set/HashSet - the element type argument appears to have been lost; confirm against the original source.
private final Set replicatingNodes = Collections.synchronizedSet(new HashSet());
// Daemon handle supplied via registerDaemon(); stays null until then, which is why the transport methods null-check it.
private CassandraDaemon daemon;
private InetAddress removingNode;
/* Are we starting this node in bootstrap mode? */
private volatile boolean isBootstrapMode;
/* we bootstrap but do NOT join the ring unless told to do so */
private boolean isSurveyMode = Boolean.parseBoolean(System.getProperty
("cassandra.write_survey", "false"));
/* true if node is rebuilding and receiving data */
private final AtomicBoolean isRebuilding = new AtomicBoolean();
// Set to true by initServer(int), startGossiping() and unsafeInitialize(); set back to false by stopGossiping().
private boolean initialized;
// Set to true at the start of joinTokenRing.
private volatile boolean joined = false;
/* the probability for tracing any particular request, 0 disables tracing and 1 enables for all */
private double traceProbability = 0.0;
/** Operation modes of this node; {@code operationMode} starts out as {@code STARTING}. */
@VisibleForTesting
static enum Mode
{
    STARTING,
    NORMAL,
    JOINING,
    LEAVING,
    DECOMMISSIONED,
    MOVING,
    DRAINING,
    DRAINED,
    ZOMBIE,
    NON_TRANSIENT_ERROR,
    TRANSIENT_ERROR,
    WAITING_TO_BOOTSTRAP,
    WAITING_TO_FINISH_BOOTSTRAP,
    DISABLED
}
// Current operation mode of this node; initialised to STARTING.
private volatile Mode operationMode = Mode.STARTING;
/* Used for tracking drain progress */
private volatile int totalCFs, remainingCFs;
private static final AtomicInteger nextRepairCommand = new AtomicInteger();
// Subscribers added/removed through register(...) / unregister(...).
// NOTE(review): raw List - the element type argument appears to have been lost; confirm against the original source.
private final List lifecycleSubscribers = new CopyOnWriteArrayList<>();
private static final BackgroundActivityMonitor bgMonitor = new BackgroundActivityMonitor();
// JMX object name this service is registered under; assigned in the constructor.
private final String jmxObjectName;
// Assigned from prepareReplacementInfo() in prepareToJoin() when replacing a node; null otherwise at this point.
private Collection bootstrapTokens = null;
// NOTE(review): `Set>` is not valid Java - the type arguments of the next two declarations were lost; restore from the original source.
private final Set> nonTransientErrors = Collections.synchronizedSet(new HashSet<>());
private final Set> transientErrors = Collections.synchronizedSet(new HashSet<>());
// true when keeping strict consistency while bootstrapping
private boolean useStrictConsistency = Boolean.parseBoolean(System.getProperty("cassandra.consistent.rangemovement", "true"));
// Only honoured when the node has exactly one token; see allowSimultaneousMoves().
private static final boolean allowSimultaneousMoves = Boolean.parseBoolean(System.getProperty("cassandra.consistent.simultaneousmoves.allow", "false"));
// When false, initServer(int) does not call joinTokenRing and puts the node into ZOMBIE mode instead.
public static final boolean joinRing = Boolean.parseBoolean(System.getProperty("cassandra.join_ring", "true"));
// Copied from DatabaseDescriptor.isReplacing() in initServer(int).
private boolean replacing;
// Host id of the node being replaced; assigned in prepareReplacementInfo().
private UUID replacingId;
private final StreamStateStore streamStateStore = new StreamStateStore();
private final AtomicBoolean doneAuthSetup = new AtomicBoolean(false);
/**
 * @return the current value of the survey-mode flag (initialised from {@code cassandra.write_survey})
 */
public boolean isSurveyMode()
{
    return this.isSurveyMode;
}
/**
 * @return the current value of the {@code joined} flag
 */
public boolean hasJoined()
{
    return this.joined;
}
/**
 * This method updates the local token on disk, then gossips the locally stored tokens, records the given
 * tokens as this node's normal tokens in the token metadata, and switches the node to {@code NORMAL} mode.
 *
 * @param tokens the tokens for this node; must be non-null and non-empty (checked by assertion)
 */
public void setTokens(Collection<Token> tokens)
{
    assert tokens != null && !tokens.isEmpty() : "Node needs at least one token.";
    // Parameterized logging already skips formatting when debug is disabled, so no explicit level guard is needed.
    logger.debug("Setting tokens to {}", tokens);

    SystemKeyspace.updateTokens(tokens);
    // Gossip what is now stored locally rather than the raw argument.
    Collection<Token> localTokens = getLocalTokens();
    setGossipTokens(localTokens);
    tokenMetadata.updateNormalTokens(tokens, FBUtilities.getBroadcastAddress());
    setMode(Mode.NORMAL, false);
}
public void setGossipTokens(Collection tokens)
{
if (DatabaseDescriptor.isAutoBootstrap() && !bootstrapComplete())
{
throw new BootstrappingSafetyException("Cannot set tokens for a non-bootstrapped node");
}
List> states = new ArrayList>();
states.add(Pair.create(ApplicationState.TOKENS, valueFactory.tokens(tokens)));
states.add(Pair.create(ApplicationState.STATUS, valueFactory.normal(tokens)));
Gossiper.instance.addLocalApplicationStates(states);
}
public StorageService()
{
// use dedicated executor for sending JMX notifications
super(Executors.newSingleThreadExecutor());
jmxObjectName = "org.apache.cassandra.db:type=StorageService";
MBeanWrapper.instance.registerMBean(this, jmxObjectName);
MBeanWrapper.instance.registerMBean(StreamManager.instance, StreamManager.OBJECT_NAME);
legacyProgressSupport = new LegacyJMXProgressSupport(this, jmxObjectName);
/* register the verb handlers */
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.MUTATION, new MutationVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.READ_REPAIR, new ReadRepairVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.READ, new ReadVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.RANGE_SLICE, new RangeSliceVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.PAGED_RANGE, new RangeSliceVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.COUNTER_MUTATION, new CounterMutationVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.TRUNCATE, new TruncateVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.PAXOS_PREPARE, new PrepareVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.PAXOS_PROPOSE, new ProposeVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.PAXOS_COMMIT, new CommitVerbHandler());
// see BootStrapper for a summary of how the bootstrap verbs interact
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.REPLICATION_FINISHED, new ReplicationFinishedVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.REQUEST_RESPONSE, new ResponseVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.INTERNAL_RESPONSE, new ResponseVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.REPAIR_MESSAGE, new RepairMessageVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.GOSSIP_SHUTDOWN, new GossipShutdownVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.GOSSIP_DIGEST_SYN, new GossipDigestSynVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.GOSSIP_DIGEST_ACK, new GossipDigestAckVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.GOSSIP_DIGEST_ACK2, new GossipDigestAck2VerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.DEFINITIONS_UPDATE, new DefinitionsUpdateVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.SCHEMA_CHECK, new SchemaCheckVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.MIGRATION_REQUEST, new MigrationRequestVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.SNAPSHOT, new SnapshotVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.ECHO, new EchoVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.CROSS_VPC_IP_MAPPING_SYN, new CrossVpcIpMappingSynVerbHandler());
MessagingService.instance().registerVerbHandlers(MessagingService.Verb.CROSS_VPC_IP_MAPPING_ACK, new CrossVpcIpMappingAckVerbHandler());
}
/**
 * Stores the daemon handle used by the transport start/stop methods.
 *
 * @param daemon the daemon to remember
 */
public void registerDaemon(CassandraDaemon daemon)
{
    this.daemon = daemon;
}
/**
 * Adds a lifecycle subscriber.
 *
 * @param subscriber subscriber to add to {@code lifecycleSubscribers}
 */
public void register(IEndpointLifecycleSubscriber subscriber)
{
    this.lifecycleSubscribers.add(subscriber);
}
/**
 * Removes a lifecycle subscriber.
 *
 * @param subscriber subscriber to remove from {@code lifecycleSubscribers}
 */
public void unregister(IEndpointLifecycleSubscriber subscriber)
{
    this.lifecycleSubscribers.remove(subscriber);
}
/**
 * Adds a bootstrap progress listener.
 *
 * @param bootstrapListener listener to add to {@code bootstrapListeners}
 */
public void registerBootstrapListener(ProgressListener bootstrapListener)
{
    this.bootstrapListeners.add(bootstrapListener);
}
/**
 * Stops the gossiper and clears the {@code initialized} flag. Does nothing when the flag is already clear.
 * Should only be called via JMX.
 */
public void stopGossiping()
{
    if (!initialized)
        return;

    logger.warn("Stopping gossip by operator request");
    Gossiper.instance.stop();
    initialized = false;
}
/**
 * Restarts gossip using the tokens saved in the system keyspace, then sets the {@code initialized} flag.
 * Does nothing when the service is already initialized. Should only be called via JMX.
 */
public void startGossiping()
{
    if (isInitialized())
        return;

    logger.warn("Starting gossip by operator request");
    Collection<Token> tokens = SystemKeyspace.getSavedTokens();

    boolean validTokens = tokens != null && !tokens.isEmpty();

    // shouldn't be called before these are set if we intend to join the ring/are in the process of doing so
    if (joined || joinRing)
        assert validTokens : "Cannot start gossiping for a node intended to join without valid tokens";

    if (validTokens)
        setGossipTokens(tokens);

    Gossiper.instance.forceNewerGeneration();
    // The generation handed to the gossiper is the current wall-clock time in seconds.
    Gossiper.instance.start((int) (System.currentTimeMillis() / 1000));
    initialized = true;
}
/**
 * Should only be called via JMX.
 *
 * @return whether the gossiper reports itself as enabled
 */
public boolean isGossipRunning()
{
    final boolean enabled = Gossiper.instance.isEnabled();
    return enabled;
}
/**
 * Starts {@code daemon.thriftServer} after checking that it is safe to expose the node to clients.
 * Should only be called via JMX.
 *
 * @throws IllegalStateException if no daemon is registered, or if the node has joined the ring but is
 *         still bootstrapping (or, in survey mode, bootstrapping or requiring authentication)
 */
public void startRPCServer()
{
    if (daemon == null)
    {
        throw new IllegalStateException("No configured daemon");
    }

    // We only start transports if bootstrap has completed and we're not in survey mode, OR if we are in
    // survey mode and streaming has completed but we're not using auth.
    // OR if we have not joined the ring yet.
    if (StorageService.instance.hasJoined())
    {
        final boolean surveying = StorageService.instance.isSurveyMode();
        if (surveying)
        {
            final boolean unsafeForSurvey = StorageService.instance.isBootstrapMode()
                                            || DatabaseDescriptor.getAuthenticator().requireAuthentication();
            if (unsafeForSurvey)
            {
                throw new IllegalStateException("Not starting RPC server in write_survey mode as " +
                                                "it's bootstrapping or auth is enabled");
            }
        }
        else if (!SystemKeyspace.bootstrapComplete())
        {
            throw new IllegalStateException("Node is not yet bootstrapped completely. Use nodetool to check bootstrap" +
                                            " state and resume. For more, see `nodetool help bootstrap`");
        }
    }

    daemon.thriftServer.start();
}
/**
 * Stops {@code daemon.thriftServer} when one exists.
 *
 * @throws IllegalStateException if no daemon is registered
 */
public void stopRPCServer()
{
    if (daemon == null)
    {
        throw new IllegalStateException("No configured daemon");
    }

    if (daemon.thriftServer == null)
        return;

    daemon.thriftServer.stop();
}
/**
 * @return {@code true} only when a daemon is registered, it has a thrift server, and that server reports running
 */
public boolean isRPCServerRunning()
{
    return daemon != null
           && daemon.thriftServer != null
           && daemon.thriftServer.isRunning();
}
/**
 * Starts {@code daemon.nativeServer} after checking that it is safe to expose the node to clients.
 *
 * @throws IllegalStateException if the node has joined the ring but is not fully bootstrapped, if it is in
 *         survey mode with authentication required and bootstrap incomplete, or if no daemon is registered
 * @throws RuntimeException if the native server fails to start; the original failure is attached as the cause
 */
public void startNativeTransport()
{
    // We only start transports if bootstrap has completed and we're not in survey mode, OR if we are in
    // survey mode and streaming has completed but we're not using auth.
    // OR if we have not joined the ring yet.
    if (hasJoined() &&
        ((!isSurveyMode() && !SystemKeyspace.bootstrapComplete()) ||
         (isSurveyMode() && isBootstrapMode())))
    {
        throw new IllegalStateException("Node is not yet bootstrapped completely. Use nodetool to check bootstrap" +
                                        " state and resume. For more, see `nodetool help bootstrap`");
    }
    if (hasJoined() && isSurveyMode() && !SystemKeyspace.bootstrapComplete() &&
        DatabaseDescriptor.getAuthenticator().requireAuthentication())
    {
        throw new IllegalStateException("Not starting client transports as write_survey mode is enabled");
    }

    if (daemon == null)
    {
        throw new IllegalStateException("No configured daemon");
    }

    try
    {
        daemon.nativeServer.start();
    }
    catch (Exception e)
    {
        // Keep the original exception as the cause so its type and stack trace are not lost.
        throw new RuntimeException("Error starting native transport: " + e.getMessage(), e);
    }
}
/**
 * Stops {@code daemon.nativeServer} when one exists.
 *
 * @throws IllegalStateException if no daemon is registered
 */
public void stopNativeTransport()
{
    if (daemon == null)
    {
        throw new IllegalStateException("No configured daemon");
    }

    if (daemon.nativeServer == null)
        return;

    daemon.nativeServer.stop();
}
/**
 * @return {@code true} only when a daemon is registered, it has a native server, and that server reports running
 */
public boolean isNativeTransportRunning()
{
    return daemon != null
           && daemon.nativeServer != null
           && daemon.nativeServer.isRunning();
}
/**
 * Stops whichever of the native transport, RPC server and gossiper are currently active, in that order.
 * Each stop is logged at error level.
 */
public void stopTransports()
{
    final boolean nativeUp = isNativeTransportRunning();
    if (nativeUp)
    {
        logger.error("Stopping native transport");
        stopNativeTransport();
    }

    final boolean rpcUp = isRPCServerRunning();
    if (rpcUp)
    {
        logger.error("Stopping RPC server");
        stopRPCServer();
    }

    final boolean gossipInitialized = isInitialized();
    if (gossipInitialized)
    {
        logger.error("Stopping gossiper");
        stopGossiping();
    }
}
/**
 * Starts, in order, the gossiper, the RPC server and the native transport, skipping any that are already up.
 *
 * @throws IllegalStateException if bootstrap has not completed
 */
private void startTransports()
{
    if (!bootstrapComplete())
    {
        throw new IllegalStateException("Node is not yet bootstrapped completely. Refusing operator request to "
                                        + "start transports.");
    }

    final boolean gossipDown = !isInitialized() && !Gossiper.instance.isEnabled();
    if (gossipDown)
    {
        logger.info("Starting gossiper");
        startGossiping();
    }

    final boolean rpcDown = !isRPCServerRunning();
    if (rpcDown)
    {
        logger.info("Starting RPC server");
        startRPCServer();
    }

    final boolean nativeDown = !isNativeTransportRunning();
    if (nativeDown)
    {
        logger.info("Starting native transport");
        startNativeTransport();
    }
}
/**
 * @return {@code true} when none of gossip, the RPC server and the native transport is running
 */
private boolean areAllTransportsStopped()
{
    final boolean anyRunning = isGossipRunning() || isRPCServerRunning() || isNativeTransportRunning();
    return !anyRunning;
}
/**
 * Checks every column family store (including index stores) of every keyspace known to the schema.
 *
 * @return {@code true} when every inspected store reports auto compaction disabled; also {@code true}
 *         when no store was inspected at all
 */
private static boolean isAutoCompactionDisabled()
{
    boolean allDisabled = true;
    for (String name : Schema.instance.getKeyspaces())
    {
        Keyspace ks = Schema.instance.getKeyspaceInstance(name);
        if (ks == null)
            continue; // no instance for this keyspace name, nothing to inspect

        for (ColumnFamilyStore base : ks.getColumnFamilyStores())
        {
            for (ColumnFamilyStore candidate : base.concatWithIndexes())
            {
                // non-short-circuit '&' so every store is still queried, as before
                allDisabled = allDisabled & candidate.isAutoCompactionDisabled();
            }
        }
    }
    return allDisabled;
}
/**
 * Stops the RPC server and then the native transport.
 */
private void shutdownClientServers()
{
    this.stopRPCServer();
    this.stopNativeTransport();
}
/**
 * Unregisters this service from the gossiper, stops gossip and messaging, waits one second, then shuts
 * down the stages.
 */
public void stopClient()
{
    final Gossiper gossiper = Gossiper.instance;
    gossiper.unregister(this);
    gossiper.stop();

    MessagingService.instance().shutdown();

    // give it a second so that task accepted before the MessagingService shutdown gets submitted to the stage (to avoid RejectedExecutionException)
    Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);

    StageManager.shutdownNow();
}
/**
 * @return the current value of the {@code initialized} flag
 */
public boolean isInitialized()
{
    return this.initialized;
}
/**
 * @return {@code false} when no daemon is registered, otherwise whatever {@code daemon.setupCompleted()} reports
 */
public boolean isSetupCompleted()
{
    if (daemon == null)
        return false;

    return daemon.setupCompleted();
}
/**
 * Deactivates the registered daemon.
 *
 * @throws IllegalStateException if no daemon is registered
 */
public void stopDaemon()
{
    if (daemon == null)
    {
        throw new IllegalStateException("No configured daemon");
    }

    daemon.deactivate();
}
/**
 * Runs a shadow gossip round and extracts the host id and tokens of the node at the configured replace
 * address. Stores the host id in {@code replacingId}; when replacing the same address, also adopts it as
 * the local host id.
 *
 * @return the tokens gossiped by the node being replaced
 * @throws ConfigurationException declared for callers; not thrown directly by this method body
 * @throws RuntimeException if the replace address is unknown to gossip, has no TOKENS state, or the
 *         tokens cannot be deserialized
 */
public synchronized Collection<Token> prepareReplacementInfo() throws ConfigurationException
{
    InetAddress replaceAddress = DatabaseDescriptor.getReplaceAddress();
    logger.info("Gathering node replacement information for {}", replaceAddress);
    if (!MessagingService.instance().isListening())
        MessagingService.instance().listen();

    // make magic happen
    Map<InetAddress, EndpointState> epStates = Gossiper.instance.doShadowRound();

    // now that we've gossiped at least once, we should be able to find the node we're replacing
    EndpointState replaceState = epStates.get(replaceAddress);
    if (replaceState == null)
        throw new RuntimeException("Cannot replace_address " + replaceAddress + " because it doesn't exist in gossip");

    replacingId = Gossiper.instance.getHostId(replaceAddress, epStates);
    try
    {
        VersionedValue tokensVersionedValue = replaceState.getApplicationState(ApplicationState.TOKENS);
        if (tokensVersionedValue == null)
            throw new RuntimeException("Could not find tokens for " + replaceAddress + " to replace");

        Collection<Token> tokens = TokenSerializer.deserialize(getPartitioner(), new DataInputStream(new ByteArrayInputStream(tokensVersionedValue.toBytes())));

        if (isReplacingSameAddress())
        {
            SystemKeyspace.setLocalHostId(replacingId); // use the replacee's host Id as our own so we receive hints, etc
        }
        return tokens;
    }
    catch (IOException e)
    {
        throw new RuntimeException(e);
    }
}
/**
 * Runs a shadow gossip round and refuses to continue when this node's address is already in use, or,
 * under strict consistency without simultaneous moves allowed, when another node is bootstrapping,
 * leaving or moving.
 *
 * @throws ConfigurationException declared for callers; not thrown directly by this method body
 * @throws RuntimeException if gossip reports it is not safe to bootstrap with this node's broadcast address
 * @throws UnsupportedOperationException if another node is bootstrapping, leaving or moving
 */
public synchronized void checkForEndpointCollision() throws ConfigurationException
{
    logger.debug("Starting shadow gossip round to check for endpoint collision");
    if (!MessagingService.instance().isListening())
        MessagingService.instance().listen();

    Map<InetAddress, EndpointState> epStates = Gossiper.instance.doShadowRound();
    if (!Gossiper.instance.isSafeForBootstrap(FBUtilities.getBroadcastAddress(), epStates))
    {
        throw new RuntimeException(String.format("A node with address %s already exists, cancelling join. " +
                                                 "Use cassandra.replace_address if you want to replace this node.",
                                                 FBUtilities.getBroadcastAddress()));
    }

    if (useStrictConsistency && !allowSimultaneousMoves())
    {
        for (Map.Entry<InetAddress, EndpointState> entry : epStates.entrySet())
        {
            // ignore local node or empty status
            if (entry.getKey().equals(FBUtilities.getBroadcastAddress()))
                continue;
            // Look the status up once so the null check and the split operate on the same value.
            VersionedValue status = entry.getValue().getApplicationState(ApplicationState.STATUS);
            if (status == null)
                continue;

            String[] pieces = splitValue(status);
            assert (pieces.length > 0);
            String state = pieces[0];
            if (state.equals(VersionedValue.STATUS_BOOTSTRAPPING) || state.equals(VersionedValue.STATUS_LEAVING) || state.equals(VersionedValue.STATUS_MOVING))
                throw new UnsupportedOperationException("Other bootstrapping/leaving/moving nodes detected, cannot bootstrap while cassandra.consistent.rangemovement is true");
        }
    }
}
/**
 * @return {@code true} only when the simultaneous-moves flag is set and the node is configured with exactly one token
 */
private boolean allowSimultaneousMoves()
{
    if (!allowSimultaneousMoves)
        return false;

    return DatabaseDescriptor.getNumTokens() == 1;
}
/**
 * For testing only. Marks the service initialized, registers with and starts the gossiper, publishes
 * the network version, and makes sure the messaging service is listening.
 *
 * @throws ConfigurationException declared for callers; not thrown directly by this method body
 */
public void unsafeInitialize() throws ConfigurationException
{
    initialized = true;

    final int generation = (int) (System.currentTimeMillis() / 1000);
    Gossiper.instance.register(this);
    Gossiper.instance.start(generation); // needed for node-ring gathering.
    Gossiper.instance.addLocalApplicationState(ApplicationState.NET_VERSION, valueFactory.networkVersion());

    if (!MessagingService.instance().isListening())
    {
        MessagingService.instance().listen();
    }
}
/**
 * Initializes the server using {@code RING_DELAY} as the delay.
 *
 * @throws ConfigurationException propagated from {@code initServer(int)}
 */
public synchronized void initServer() throws ConfigurationException
{
    final int delay = RING_DELAY;
    initServer(delay);
}
public synchronized void initServer(int delay) throws ConfigurationException
{
logger.info("Cassandra version: {}", FBUtilities.getReleaseVersionString());
logger.info("Thrift API version: {}", cassandraConstants.VERSION);
logger.info("CQL supported versions: {} (default: {})",
StringUtils.join(ClientState.getCQLSupportedVersion(), ","), ClientState.DEFAULT_CQL_VERSION);
isBootstrapMode = SystemKeyspace.bootstrapInProgress();
initialized = true;
try
{
// Ensure StorageProxy is initialized on start-up; see CASSANDRA-3797.
Class.forName("org.apache.cassandra.service.StorageProxy");
// also IndexSummaryManager, which is otherwise unreferenced
Class.forName("org.apache.cassandra.io.sstable.IndexSummaryManager");
}
catch (ClassNotFoundException e)
{
throw new AssertionError(e);
}
if (Boolean.parseBoolean(System.getProperty("cassandra.load_ring_state", "true")))
{
logger.info("Loading persisted ring state");
Multimap loadedTokens = SystemKeyspace.loadTokens();
Map loadedHostIds = SystemKeyspace.loadHostIds();
for (InetAddress ep : loadedTokens.keySet())
{
if (ep.equals(FBUtilities.getBroadcastAddress()))
{
// entry has been mistakenly added, delete it
SystemKeyspace.removeEndpoint(ep);
}
else
{
tokenMetadata.updateNormalTokens(loadedTokens.get(ep), ep);
if (loadedHostIds.containsKey(ep))
tokenMetadata.updateHostId(loadedHostIds.get(ep), ep);
Gossiper.instance.addSavedEndpoint(ep);
}
}
}
// daemon threads, like our executors', continue to run while shutdown hooks are invoked
drainOnShutdown = new Thread(new WrappedRunnable()
{
@Override
public void runMayThrow() throws InterruptedException
{
inShutdownHook = true;
ExecutorService counterMutationStage = StageManager.getStage(Stage.COUNTER_MUTATION);
ExecutorService mutationStage = StageManager.getStage(Stage.MUTATION);
if (mutationStage.isShutdown() && counterMutationStage.isShutdown())
return; // drained already
if (daemon != null)
shutdownClientServers();
ScheduledExecutors.optionalTasks.shutdown();
Gossiper.instance.stop();
// In-progress writes originating here could generate hints to be written, so shut down MessagingService
// before mutation stage, so we can get all the hints saved before shutting down
MessagingService.instance().shutdown();
counterMutationStage.shutdown();
mutationStage.shutdown();
counterMutationStage.awaitTermination(3600, TimeUnit.SECONDS);
mutationStage.awaitTermination(3600, TimeUnit.SECONDS);
StorageProxy.instance.verifyNoHintsInProgress();
List> flushes = new ArrayList<>();
for (Keyspace keyspace : Keyspace.all())
{
KSMetaData ksm = Schema.instance.getKSMetaData(keyspace.getName());
if (!ksm.durableWrites)
{
for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
flushes.add(cfs.forceFlush("Shutting down"));
}
}
try
{
FBUtilities.waitOnFutures(flushes);
}
catch (Throwable t)
{
JVMStabilityInspector.inspectThrowable(t);
// don't let this stop us from shutting down the commitlog and other thread pools
logger.warn("Caught exception while waiting for memtable flushes during shutdown hook", t);
}
CommitLog.instance.shutdownBlocking();
if (FBUtilities.isWindows())
WindowsTimer.endTimerPeriod(DatabaseDescriptor.getWindowsTimerInterval());
// wait for miscellaneous tasks like sstable and commitlog segment deletion
ScheduledExecutors.nonPeriodicTasks.shutdown();
if (!ScheduledExecutors.nonPeriodicTasks.awaitTermination(1, MINUTES))
logger.warn("Miscellaneous task executor still busy after one minute; proceeding with shutdown");
}
}, "StorageServiceShutdownHook");
Runtime.getRuntime().addShutdownHook(drainOnShutdown);
replacing = DatabaseDescriptor.isReplacing();
if (!Boolean.parseBoolean(System.getProperty("cassandra.start_gossip", "true")))
{
logger.info("Not starting gossip as requested.");
return;
}
prepareToJoin();
// Has to be called after the host id has potentially changed in prepareToJoin().
try
{
CacheService.instance.counterCache.loadSavedAsync().get();
}
catch (Throwable t)
{
JVMStabilityInspector.inspectThrowable(t);
logger.warn("Error loading counter cache", t);
}
if (joinRing)
{
joinTokenRing(delay);
}
else
{
Collection tokens = SystemKeyspace.getSavedTokens();
if (!tokens.isEmpty())
{
tokenMetadata.updateNormalTokens(tokens, FBUtilities.getBroadcastAddress());
// order is important here, the gossiper can fire in between adding these two states. It's ok to send TOKENS without STATUS, but *not* vice versa.
List> states = new ArrayList>();
states.add(Pair.create(ApplicationState.TOKENS, valueFactory.tokens(tokens)));
states.add(Pair.create(ApplicationState.STATUS, valueFactory.hibernate(true)));
Gossiper.instance.addLocalApplicationStates(states);
}
doAuthSetup();
setMode(Mode.ZOMBIE, true);
logger.info("Not joining ring as requested. Use JMX (StorageService->joinRing()) to initiate ring joining");
}
}
/**
 * In the event of forceful termination we need to remove the shutdown hook to prevent hanging (OOM for instance).
 * Deregisters the hook if one was installed and, on Windows, ends the timer period.
 */
public void removeShutdownHook()
{
    final Thread hook = drainOnShutdown;
    if (hook != null)
    {
        Runtime.getRuntime().removeShutdownHook(hook);
    }

    if (FBUtilities.isWindows())
    {
        WindowsTimer.endTimerPeriod(DatabaseDescriptor.getWindowsTimerInterval());
    }
}
/**
 * @return the bootstrap decision for the configured auto-bootstrap setting
 */
private boolean shouldBootstrap()
{
    final boolean autoBootstrap = DatabaseDescriptor.isAutoBootstrap();
    return shouldBootstrap(autoBootstrap);
}
/**
 * Decides whether this node should bootstrap.
 *
 * @param autoBootstrap whether auto-bootstrap is requested
 * @return {@code true} only when auto-bootstrap is requested, bootstrap has not completed, and this
 *         node's broadcast address is not among the configured seeds
 */
private boolean shouldBootstrap(boolean autoBootstrap)
{
    if (!autoBootstrap)
        return false;

    if (SystemKeyspace.bootstrapComplete())
        return false;

    final boolean isSeed = DatabaseDescriptor.getSeeds().contains(FBUtilities.getBroadcastAddress());
    return !isSeed;
}
/**
 * Validates the replace/bootstrap configuration and starts gossip, messaging, load broadcasting,
 * hinted handoff and the batchlog manager. Does nothing when the node has already joined.
 *
 * @throws ConfigurationException if replacing is requested together with join_ring=false
 * @throws RuntimeException if the removed replace-token/replace-node options are set, or if a
 *         replace is requested for a node that is already bootstrapped or has auto_bootstrap disabled
 */
private void prepareToJoin() throws ConfigurationException
{
    if (joined)
        return;

    Map<ApplicationState, VersionedValue> appStates = new EnumMap<>(ApplicationState.class);

    if (replacing && !joinRing)
        throw new ConfigurationException("Cannot set both join_ring=false and attempt to replace a node");
    if (!DatabaseDescriptor.getReplaceTokens().isEmpty() || DatabaseDescriptor.getReplaceNode() != null)
        throw new RuntimeException("Replace method removed; use cassandra.replace_address instead");
    if (replacing)
    {
        if (SystemKeyspace.bootstrapComplete())
            throw new RuntimeException("Cannot replace address with a node that is already bootstrapped");
        if (!DatabaseDescriptor.isAutoBootstrap())
            throw new RuntimeException("Trying to replace_address with auto_bootstrap disabled will not work, check your configuration");
        bootstrapTokens = prepareReplacementInfo();
        if (isReplacingSameAddress())
        {
            logger.warn("Writes will not be forwarded to this node during replacement because it has the same address as " +
                        "the node to be replaced ({}). If the previous node has been down for longer than max_hint_window_in_ms, " +
                        "repair must be run after the replacement process in order to make this node consistent.",
                        DatabaseDescriptor.getReplaceAddress());
            appStates.put(ApplicationState.TOKENS, valueFactory.tokens(bootstrapTokens));
            appStates.put(ApplicationState.STATUS, valueFactory.hibernate(true));
        }
    }
    else if (shouldBootstrap())
    {
        checkForEndpointCollision();
    }

    // have to start the gossip service before we can see any info on other nodes. this is necessary
    // for bootstrap to get the load info it needs.
    // (we won't be part of the storage ring though until we add a counterId to our state, below.)
    // Seed the host ID-to-endpoint map with our own ID.
    UUID localHostId = SystemKeyspace.getLocalHostId();
    getTokenMetadata().updateHostId(localHostId, FBUtilities.getBroadcastAddress());
    appStates.put(ApplicationState.NET_VERSION, valueFactory.networkVersion());
    appStates.put(ApplicationState.HOST_ID, valueFactory.hostId(localHostId));
    appStates.put(ApplicationState.RPC_ADDRESS, valueFactory.rpcaddress(FBUtilities.getBroadcastRpcAddress()));
    appStates.put(ApplicationState.RELEASE_VERSION, valueFactory.releaseVersion());
    logger.info("Starting up server gossip");
    Gossiper.instance.register(this);
    Gossiper.instance.start(SystemKeyspace.incrementAndGetGeneration(), appStates); // needed for node-ring gathering.
    // gossip snitch infos (local DC and rack)
    gossipSnitchInfo();
    // gossip Schema.emptyVersion forcing immediate check for schema updates (see MigrationManager#maybeScheduleSchemaPull)
    Schema.instance.updateVersionAndAnnounce(); // Ensure we know our own actual Schema UUID in preparation for updates

    if (!MessagingService.instance().isListening())
        MessagingService.instance().listen();
    LoadBroadcaster.instance.startBroadcasting();

    HintedHandOffManager.instance.start();
    BatchlogManager.instance.start();
}
/**
 * Joins this node to the token ring, streaming data from peers first when {@code shouldBootstrap(autoBootstrap)}
 * returns true.
 *
 * <p>Bootstrap path: waits for the start-bootstrap signal, refuses to continue if data from a previous bootstrap is
 * found, waits for schema and pending-range information, picks (or validates, when replacing) the bootstrap tokens,
 * streams the data, and finally waits up to 30 minutes for the finish-bootstrap signal.
 *
 * <p>Non-bootstrap path: reuses the saved tokens, or derives tokens from {@code initialTokens}, or generates
 * random ones when neither is available.
 *
 * <p>Unless write survey mode is active, the node then becomes an active ring member via {@code finishJoiningRing}.
 *
 * @param delay time budget in milliseconds; bounds the initial schema wait (polled once per second) and is the
 *              window within which a gossip update marks a node being replaced as still alive
 * @param autoBootstrap whether auto bootstrap is requested; forwarded to {@code shouldBootstrap}
 * @param initialTokens tokens in string form to use instead of generated ones; may be empty
 * @throws BootstrappingSafetyException if previous data is detected, streaming fails, or the finish signal does
 *                                      not arrive within 30 minutes
 * @throws UnsupportedOperationException if other range movements are in flight under strict consistency, if this
 *                                       node is already a ring member, or if a replacement target is alive or
 *                                       owns a token that does not exist
 * @throws ConfigurationException if the number of saved tokens differs from the configured number of tokens
 */
private void joinTokenRing(int delay, boolean autoBootstrap, Collection<String> initialTokens)
{
    joined = true;

    // We bootstrap if we haven't successfully bootstrapped before, as long as we are not a seed.
    // If we are a seed, or if the user manually sets auto_bootstrap to false,
    // we'll skip streaming data from other nodes and jump directly into the ring.
    //
    // The seed check allows us to skip the RING_DELAY sleep for the single-node cluster case,
    // which is useful for both new users and testing.
    //
    // We attempted to replace this with a schema-presence check, but you need a meaningful sleep
    // to get schema info from gossip which defeats the purpose. See CASSANDRA-4427 for the gory details.

    // Endpoints that currently own the tokens being taken over; only populated when replacing a different address.
    Set<InetAddress> current = new HashSet<>();
    if (logger.isDebugEnabled())
    {
        logger.debug("Bootstrap variables: {} {} {} {}",
                     autoBootstrap,
                     SystemKeyspace.bootstrapInProgress(),
                     SystemKeyspace.bootstrapComplete(),
                     DatabaseDescriptor.getSeeds().contains(FBUtilities.getBroadcastAddress()));
    }
    if (autoBootstrap && !SystemKeyspace.bootstrapComplete() && DatabaseDescriptor.getSeeds().contains(FBUtilities.getBroadcastAddress()))
    {
        logger.info("This node will not auto bootstrap because it is configured to be a seed node.");
    }

    boolean dataAvailable = true; // make this to false when bootstrap streaming failed
    if (shouldBootstrap(autoBootstrap))
    {
        // Bootstrapping is gated on an explicit operator signal (see startBootstrap()).
        setMode(Mode.WAITING_TO_BOOTSTRAP, "Awaiting start bootstrap call", true);
        try
        {
            startBootstrapCondition.await();
        }
        catch (InterruptedException e)
        {
            throw new AssertionError(e);
        }

        boolean noPreviousDataFound = isCommitlogEmptyForBootstrap() && areKeyspacesEmptyForBootstrap();
        if (!noPreviousDataFound)
        {
            recordNonTransientError(NonTransientError.BOOTSTRAP_ERROR,
                                    ImmutableMap.of("previousDataFound", "true"));
            unsafeDisableNode();
            // leave node in non-transient error state and prevent it from bootstrapping into the cluster
            throw new BootstrappingSafetyException("Detected data from previous bootstrap, failing.");
        }

        if (SystemKeyspace.bootstrapInProgress())
        {
            logger.warn("Detected previous bootstrap failure; retrying");
        }
        else
        {
            SystemKeyspace.setBootstrapState(SystemKeyspace.BootstrapState.IN_PROGRESS);
        }
        setMode(Mode.JOINING, "waiting for ring information", true);

        // first sleep the delay to make sure we see all our peers
        for (int i = 0; i < delay; i += 1000)
        {
            // if we see schema, we can proceed to the next check directly
            if (!Schema.instance.getVersion().equals(Schema.emptyVersion))
            {
                logger.debug("got schema: {}", Schema.instance.getVersion());
                break;
            }
            Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
        }

        // if our schema hasn't matched yet, keep sleeping until it does
        // (post CASSANDRA-1391 we don't expect this to be necessary very often, but it doesn't hurt to be careful)
        while (!MigrationManager.isReadyForBootstrap())
        {
            setMode(Mode.JOINING, "waiting for schema information to complete", true);
            Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
        }
        setMode(Mode.JOINING, "schema complete, ready to bootstrap", true);
        setMode(Mode.JOINING, "waiting for pending range calculation", true);
        PendingRangeCalculatorService.instance.blockUntilFinished();
        setMode(Mode.JOINING, "calculation complete, ready to bootstrap", true);
        logger.debug("... got ring + schema info");

        if (useStrictConsistency && !allowSimultaneousMoves() &&
            (
                tokenMetadata.getBootstrapTokens().valueSet().size() > 0 ||
                tokenMetadata.getLeavingEndpoints().size() > 0 ||
                tokenMetadata.getMovingEndpoints().size() > 0
            ))
        {
            throw new UnsupportedOperationException("Other bootstrapping/leaving/moving nodes detected, cannot bootstrap while cassandra.consistent.rangemovement is true");
        }

        // get bootstrap tokens
        if (!replacing)
        {
            if (tokenMetadata.isMember(FBUtilities.getBroadcastAddress()))
            {
                String s = "This node is already a member of the token ring; bootstrap aborted. (If replacing a dead node, remove the old one from the ring first.)";
                throw new UnsupportedOperationException(s);
            }
            setMode(Mode.JOINING, "getting bootstrap token", true);
            bootstrapTokens = BootStrapper.getBootstrapTokens(tokenMetadata, initialTokens);
        }
        else
        {
            if (!isReplacingSameAddress())
            {
                try
                {
                    // Sleep additionally to make sure that the server actually is not alive
                    // and giving it more time to gossip if alive.
                    Thread.sleep(LoadBroadcaster.BROADCAST_INTERVAL);
                }
                catch (InterruptedException e)
                {
                    throw new AssertionError(e);
                }

                // check for operator errors...
                for (Token token : bootstrapTokens)
                {
                    InetAddress existing = tokenMetadata.getEndpoint(token);
                    if (existing != null)
                    {
                        // delay is in milliseconds; the gossip update timestamp is compared against nanoTime
                        long nanoDelay = delay * 1000000L;
                        if (Gossiper.instance.getEndpointStateForEndpoint(existing).getUpdateTimestamp() > (System.nanoTime() - nanoDelay))
                            throw new UnsupportedOperationException("Cannot replace a live node... ");
                        current.add(existing);
                    }
                    else
                    {
                        throw new UnsupportedOperationException("Cannot replace token " + token + " which does not exist!");
                    }
                }
            }
            else
            {
                try
                {
                    Thread.sleep(RING_DELAY);
                }
                catch (InterruptedException e)
                {
                    throw new AssertionError(e);
                }
            }
            setMode(Mode.JOINING, "Replacing a node with token(s): " + bootstrapTokens, true);
        }

        dataAvailable = bootstrap(bootstrapTokens);
        if (!dataAvailable)
        {
            recordNonTransientError(NonTransientError.BOOTSTRAP_ERROR, ImmutableMap.of("streamingFailed", "true"));
            unsafeDisableNode();
            throw new BootstrappingSafetyException("Bootstrap streaming failed.");
        }

        logger.info("Bootstrap streaming complete. Waiting to finish bootstrap. Not becoming an active ring " +
                    "member. Use JMX (StorageService->finishBootstrap()) to finalize ring joining.");
        try
        {
            setMode(Mode.WAITING_TO_FINISH_BOOTSTRAP, "Awaiting finish bootstrap call", true);
            boolean timeoutExceeded = !finishBootstrapCondition.await(30, MINUTES);
            if (timeoutExceeded)
            {
                recordNonTransientError(NonTransientError.BOOTSTRAP_ERROR, ImmutableMap.of("bootstrapSafetyCheckFailed", "true"));
                unsafeDisableNode();
                String message = "Finish bootstrap was not called within 30 minutes. Bootstrap safety check failed.";
                logger.error(message);
                throw new BootstrappingSafetyException(message);
            }
        }
        catch (InterruptedException e)
        {
            throw new AssertionError(e);
        }
        logger.info("Received signal to finish bootstrap");
    }
    else
    {
        bootstrapTokens = SystemKeyspace.getSavedTokens();
        if (bootstrapTokens.isEmpty())
        {
            if (initialTokens.size() < 1)
            {
                bootstrapTokens = BootStrapper.getRandomTokens(tokenMetadata, DatabaseDescriptor.getNumTokens());
                if (DatabaseDescriptor.getNumTokens() == 1)
                    logger.warn("Generated random token {}. Random tokens will result in an unbalanced ring; see http://wiki.apache.org/cassandra/Operations", bootstrapTokens);
                else
                    logger.info("Generated random tokens. tokens are {}", bootstrapTokens);
            }
            else
            {
                bootstrapTokens = new ArrayList<>(initialTokens.size());
                for (String token : initialTokens)
                    bootstrapTokens.add(getPartitioner().getTokenFactory().fromString(token));
                logger.info("Saved tokens not found. Using configuration value: {}", bootstrapTokens);
            }
        }
        else
        {
            if (bootstrapTokens.size() != DatabaseDescriptor.getNumTokens())
                throw new ConfigurationException("Cannot change the number of tokens from " + bootstrapTokens.size() + " to " + DatabaseDescriptor.getNumTokens());
            else
                logger.info("Using saved tokens {}", bootstrapTokens);
        }
    }

    // if we don't have system_traces keyspace at this point, then create it manually
    ensureTraceKeyspace();
    maybeAddOrUpdateKeyspace(SystemDistributedKeyspace.definition());

    if (!isSurveyMode)
    {
        if (dataAvailable)
        {
            finishJoiningRing(bootstrapTokens);

            // remove the existing info about the replaced node.
            if (!current.isEmpty())
            {
                for (InetAddress existing : current)
                    Gossiper.instance.replacedEndpoint(existing);
            }
        }
        else
        {
            logger.warn("Some data streaming failed. Use nodetool to check bootstrap state and resume. For more, see `nodetool help bootstrap`. {}", SystemKeyspace.getBootstrapState());
        }
    }
    else
    {
        if (dataAvailable)
            logger.info("Startup complete, but write survey mode is active, not becoming an active ring member. Use JMX (StorageService->joinRing()) to finalize ring joining.");
        else
            logger.warn("Some data streaming failed. Use nodetool to check bootstrap state and resume. For more, see `nodetool help bootstrap`. {}", SystemKeyspace.getBootstrapState());
    }
}
/**
 * Joins the token ring using the auto-bootstrap flag and the initial tokens reported by
 * {@code DatabaseDescriptor}.
 *
 * @param delay passed through unchanged to the three-argument overload
 */
private void joinTokenRing(int delay) throws ConfigurationException
{
    final boolean configuredAutoBootstrap = DatabaseDescriptor.isAutoBootstrap();
    joinTokenRing(delay, configuredAutoBootstrap, DatabaseDescriptor.getInitialTokens());
}
/**
 * Checks that no non-admin keyspace holds data left over from an earlier bootstrap attempt: no range may be
 * recorded as already available and no column family store may have SSTables.
 *
 * <p>This method only reports; it does not remove anything. Every offending keyspace is logged before the
 * result is returned, so a single call lists all leftovers.
 *
 * @return true if every non-admin keyspace has neither available ranges nor SSTables, false otherwise
 */
private static boolean areKeyspacesEmptyForBootstrap()
{
    boolean empty = true;
    Set<String> userKeyspaces = ImmutableSet.copyOf(Schema.instance.getNonAdminKeyspaces());
    for (String keyspaceName : userKeyspaces)
    {
        Set<Range<Token>> availableRanges = SystemKeyspace.getAvailableRanges(keyspaceName, StorageService.getPartitioner());
        if (!availableRanges.isEmpty())
        {
            logger.error("Found previous ranges available {} for a non-system keyspace.", availableRanges);
            empty = false;
        }

        Keyspace keyspace = Keyspace.open(keyspaceName);
        for (ColumnFamilyStore store : keyspace.getColumnFamilyStores())
        {
            Collection<SSTableReader> tables = store.getSSTables();
            if (!tables.isEmpty())
            {
                logger.error("Found previous SSTables {} for keyspace {} and cf {}.", tables, keyspaceName, store.name);
                empty = false;
            }
        }
    }
    return empty;
}
/**
 * Runs the commitlog emptiness check against the column families reported by
 * {@code CommitLogReplayer.getSeenColumnFamilies()}.
 *
 * @return the result of the parameterized overload for that set
 */
private static boolean isCommitlogEmptyForBootstrap()
{
    final Set<UUID> replayedColumnFamilies = CommitLogReplayer.getSeenColumnFamilies();
    return isCommitlogEmptyForBootstrap(replayedColumnFamilies);
}
/**
 * Checks whether commitlog replay touched anything that should not exist on a node about to bootstrap.
 *
 * <p>Two conditions make the result false:
 * <ul>
 *   <li>a replayed column family id is unknown to the current schema;</li>
 *   <li>a replayed column family belongs to a keyspace that is not listed in {@code Schema.SYSTEM_KEYSPACES}.</li>
 * </ul>
 * Null ids are ignored by both checks.
 *
 * @param columnFamiliesWithReplayedMutations ids of the column families for which mutations were replayed;
 *                                            may contain null elements
 * @return true if no commitlog segments for unknown or non-system column families have been replayed,
 *         false otherwise
 */
static boolean isCommitlogEmptyForBootstrap(Set<UUID> columnFamiliesWithReplayedMutations)
{
    boolean empty = true;

    Set<UUID> unknownColumnFamilies = columnFamiliesWithReplayedMutations.stream()
            .filter(Objects::nonNull) // cfIds for commitlog can sometimes be null
            .filter(cfId -> Schema.instance.getCFMetaData(cfId) == null)
            .collect(Collectors.toSet());
    if (!unknownColumnFamilies.isEmpty())
    {
        logger.info("Tried to replay a commitlog segment with an unknown CF(s) {}, " +
                    "this indicates data from a previous bootstrap attempt still exists. Please delete before proceeding.", unknownColumnFamilies);
        empty = false;
    }

    Set<String> nonSystemKeyspaces = columnFamiliesWithReplayedMutations.stream()
            .filter(Objects::nonNull) // never hand a null cfId to the schema lookup
            .map(Schema.instance::getCFMetaData)
            .filter(Objects::nonNull)
            .map(cf -> cf.ksName)
            .filter(keyspace -> !Schema.SYSTEM_KEYSPACES.contains(keyspace))
            .collect(Collectors.toSet());
    if (!nonSystemKeyspaces.isEmpty())
    {
        // Report what was derived from the argument rather than the replayer's global state, so the
        // message matches the decision even when the caller supplies its own set.
        logger.error("Found previous commitlog entries for non-system keyspaces {}, indicating we have old commitlog files from a previous bootstrap. Please delete before proceeding.",
                     nonSystemKeyspaces);
        empty = false;
    }
    return empty;
}
/**
 * Creates the tracing keyspace if it is missing and brings its tables up to the expected definition.
 */
@VisibleForTesting
public void ensureTraceKeyspace()
{
    final KSMetaData traceDefinition = TraceKeyspace.definition();
    maybeAddOrUpdateKeyspace(traceDefinition);
}
/**
 * Tells whether the configured replace address is this node's own broadcast address.
 *
 * @return true only when a replace address is configured and equals the broadcast address; false when no
 *         replace address is configured at all
 */
public static boolean isReplacingSameAddress()
{
    // The replace address is absent on nodes that are not replacing anything; treat that as "not the same"
    // instead of failing with a NullPointerException.
    InetAddress replaceAddress = DatabaseDescriptor.getReplaceAddress();
    return replaceAddress != null && replaceAddress.equals(FBUtilities.getBroadcastAddress());
}
/**
 * Publishes this node's datacenter and rack, as reported by the configured endpoint snitch, as local
 * gossip application states.
 */
public void gossipSnitchInfo()
{
    final IEndpointSnitch endpointSnitch = DatabaseDescriptor.getEndpointSnitch();

    final String localDatacenter = endpointSnitch.getDatacenter(FBUtilities.getBroadcastAddress());
    final String localRack = endpointSnitch.getRack(FBUtilities.getBroadcastAddress());

    // Datacenter first, then rack.
    Gossiper.instance.addLocalApplicationState(
        ApplicationState.DC,
        StorageService.instance.valueFactory.datacenter(localDatacenter));
    Gossiper.instance.addLocalApplicationState(
        ApplicationState.RACK,
        StorageService.instance.valueFactory.rack(localRack));
}
/**
 * Joins the ring at operator request.
 *
 * <p>If the node has not joined yet, it joins either with its configured tokens (empty argument) or with the
 * supplied tokens, which are validated first and are only accepted in Zombie mode with no saved tokens.
 * If the node has already joined and is in write survey mode, survey mode is left and joining is finalized,
 * provided bootstrap has completed. Otherwise nothing happens beyond a warning while bootstrap is incomplete.
 *
 * @param initalTokens tokens to join with, or an empty collection to use the node's configuration
 * @throws IOException if joining fails with a {@code ConfigurationException}; only its message is carried over
 */
public synchronized void joinRing(Collection initalTokens) throws IOException
{
    if (joined)
    {
        if (isSurveyMode)
        {
            // if isSurveyMode is on then verify isBootstrapMode
            // node can join the ring even if isBootstrapMode is true which should not happen
            if (isBootstrapMode())
            {
                logger.warn("Can't join the ring because in write_survey mode and bootstrap hasn't completed");
            }
            else
            {
                isSurveyMode = false;
                logger.info("Leaving write survey mode and joining ring at operator request");
                finishJoiningRing(SystemKeyspace.getSavedTokens());
                daemon.start();
            }
        }
        else if (isBootstrapMode())
        {
            // bootstrap is not complete hence node cannot join the ring
            logger.warn("Can't join the ring because bootstrap hasn't completed.");
        }
        return;
    }

    logger.info("Joining ring by operator request");
    try
    {
        if (initalTokens.isEmpty())
        {
            joinTokenRing(0);
            return;
        }

        initalTokens.stream().forEach(getPartitioner().getTokenFactory()::validate);
        Preconditions.checkState(operationMode.equals(Mode.ZOMBIE), "Cannot join ring without being in Zombie mode.");
        Preconditions.checkState(SystemKeyspace.getSavedTokens().isEmpty(), "Cannot join ring with new tokens as SystemKeyspace already has tokens sets.");
        joinTokenRing(0, false, initalTokens);
    }
    catch (ConfigurationException configurationFailure)
    {
        throw new IOException(configurationFailure.getMessage());
    }
}
/**
 * Joins the ring without supplying explicit tokens.
 *
 * @throws IOException propagated from {@code joinRing(Collection)}
 */
public synchronized void joinRing() throws IOException
{
    // An empty collection selects the "use configured tokens" path of the overload.
    final Set<String> noExplicitTokens = ImmutableSet.of();
    joinRing(noExplicitTokens);
}
/**
 * Makes this node an active ring member: records the bootstrap state as COMPLETED, installs the given
 * tokens and performs the one-time auth setup.
 *
 * @param tokens the tokens this node takes ownership of
 */
private void finishJoiningRing(Collection tokens)
{
    logger.info("Attempting to set bootstrap state to COMPLETED and to join token ring");

    // start participating in the ring.
    final SystemKeyspace.BootstrapState completed = SystemKeyspace.BootstrapState.COMPLETED;
    SystemKeyspace.setBootstrapState(completed);
    setTokens(tokens);
    assert !tokenMetadata.sortedTokens().isEmpty();

    doAuthSetup();
    logger.info("Node has finished joining token ring. Bootstrap state is COMPLETED.");
}
/**
 * Performs the auth setup once: ensures the auth keyspace, sets up the role manager, authenticator and
 * authorizer, and registers an {@code AuthMigrationListener}. Later calls return immediately.
 */
private void doAuthSetup()
{
    final boolean alreadyDone = doneAuthSetup.getAndSet(true);
    if (alreadyDone)
        return;

    maybeAddOrUpdateKeyspace(AuthKeyspace.definition());

    DatabaseDescriptor.getRoleManager().setup();
    DatabaseDescriptor.getAuthenticator().setup();
    DatabaseDescriptor.getAuthorizer().setup();

    MigrationManager.instance.register(new AuthMigrationListener());
}
/**
 * Announces the given keyspace as new; an already existing keyspace is not an error and is only logged
 * at debug level.
 *
 * @param ksm definition of the keyspace to create
 */
private void maybeAddKeyspace(KSMetaData ksm)
{
    final long creationTimestamp = 0;
    final boolean announceLocally = false;
    try
    {
        MigrationManager.announceNewKeyspace(ksm, creationTimestamp, announceLocally);
    }
    catch (AlreadyExistsException alreadyThere)
    {
        // Expected when the keyspace was created earlier; nothing to do.
        logger.debug("Attempted to create new keyspace {}, but it already exists", ksm.name);
    }
}
/**
 * Brings a pseudo-system keyspace (a distributed system keyspace such as traces, auth or the distributed
 * keyspace) in line with the expected definition: the keyspace is created when absent, and every expected
 * table that is missing or differs from the expected definition is force-announced.
 *
 * @param expected the definition this node expects
 */
private void maybeAddOrUpdateKeyspace(KSMetaData expected)
{
    // The keyspace and its tables are handled differently on purpose. An existing keyspace definition is
    // left alone, because users may have altered it, e.g. to change the replication factor (#6016), and
    // that must not be overridden. Tables, on the other hand, can gain columns in newer versions (#8162
    // is an example), so the current table definition is forced even when the table already exists.
    KSMetaData existing = Schema.instance.getKSMetaData(expected.name);
    if (existing == null)
    {
        // Keyspace is absent: create it and re-read what the schema now holds.
        maybeAddKeyspace(expected);
        existing = Schema.instance.getKSMetaData(expected.name);
    }

    // An existing keyspace may still lack tables or carry outdated ones. There is also a potential race:
    // schema migrations add the bare keyspace to Schema.instance before adding its tables, so verify each
    // expected table individually.
    for (CFMetaData wanted : expected.cfMetaData().values())
    {
        CFMetaData present = existing.cfMetaData().get(wanted.cfName);
        boolean upToDate = present != null && present.equals(wanted);
        if (!upToDate)
            MigrationManager.forceAnnounceNewColumnFamily(wanted);
    }
}
/**
 * @return true when this node's broadcast address is a member of the token metadata and write survey mode
 *         is off
 */
public boolean isJoined()
{
    final boolean ringMember = tokenMetadata.isMember(FBUtilities.getBroadcastAddress());
    if (!ringMember)
        return false;
    return !isSurveyMode;
}
/**
 * Rebuilds all keyspaces from the given source datacenter.
 *
 * @param sourceDc datacenter to stream from, or null for any datacenter
 */
public void rebuild(String sourceDc)
{
    // A null keyspace means "all keyspaces" in the two-argument overload.
    final String allKeyspaces = null;
    rebuild(sourceDc, allKeyspaces);
}
/**
 * Rebuilds this node's local ranges by streaming them from other nodes, blocking until streaming ends.
 * Only one rebuild may run at a time.
 *
 * @param sourceDc datacenter to stream from, or null for any datacenter
 * @param keyspace keyspace to rebuild, or null for all keyspaces
 * @throws IllegalStateException if another rebuild is still running
 * @throws RuntimeException if the wait is interrupted or streaming fails; the exception deliberately carries
 *                          only a message, as this method is invoked through JMX
 */
public void rebuild(String sourceDc, String keyspace)
{
    // check on going rebuild
    if (!isRebuilding.compareAndSet(false, true))
    {
        throw new IllegalStateException("Node is still rebuilding. Check nodetool netstats.");
    }

    logger.info("Rebuild from DC: {}, {}, (All tokens)", sourceDc == null ? "(Any DC)" : sourceDc,
                keyspace == null ? "(All keyspaces)" : keyspace);

    try
    {
        RangeStreamer streamer = getRebuildStreamer(sourceDc, keyspace);
        StreamResultFuture resultFuture = streamer.fetchAsync();
        // wait for result
        resultFuture.get();
    }
    catch (InterruptedException e)
    {
        // Catching InterruptedException clears the interrupt status; restore it so code further up the
        // stack can still observe that the thread was asked to stop.
        Thread.currentThread().interrupt();
        throw new RuntimeException("Interrupted while waiting on rebuild streaming");
    }
    catch (ExecutionException e)
    {
        // This is used exclusively through JMX, so log the full trace but only throw a simple RTE
        logger.error("Error while rebuilding node", e.getCause());
        throw new RuntimeException("Error while rebuilding node: " + e.getCause().getMessage());
    }
    finally
    {
        // rebuild is done (successfully or not)
        isRebuilding.set(false);
    }
}
/**
 * Four-argument rebuild entry point. Token and source selection are not supported: both arguments must be
 * null, in which case the call is equivalent to {@code rebuild(sourceDc, keyspace)}.
 *
 * @param sourceDc datacenter to stream from, or null for any datacenter
 * @param keyspace keyspace to rebuild, or null for all keyspaces
 * @param tokens must be null
 * @param specificSources must be null
 * @throws UnsupportedOperationException if {@code tokens} or {@code specificSources} is not null
 */
public void rebuild(String sourceDc, String keyspace, String tokens, String specificSources)
{
    final boolean tokensRequested = tokens != null;
    if (tokensRequested)
        throw new UnsupportedOperationException("Rebuild with specific tokens is not supported");

    final boolean sourcesRequested = specificSources != null;
    if (sourcesRequested)
        throw new UnsupportedOperationException("Rebuild with specificSources is not supported");

    rebuild(sourceDc, keyspace);
}
/**
 * @return the current value of the rebuilding flag
 */
public boolean isRebuilding()
{
    final boolean rebuildInProgress = isRebuilding.get();
    return rebuildInProgress;
}
/**
 * Overwrites the flag reported by {@link #isRebuilding()}. Do not use outside of unit tests.
 *
 * @param rebuilding the value the flag is set to
 */
@VisibleForTesting
public void unsafeSetRebuilding(boolean rebuilding)
{
    final boolean forcedState = rebuilding;
    isRebuilding.set(forcedState);
}
public Set getKeyspacesWithAllRangesAvailable(String sourceDc)
{
Set keyspaces = Schema.instance.getNonSystemKeyspaces().stream()
.filter(keyspace -> {
Class extends AbstractReplicationStrategy> strategyClass =
Schema.instance.getKSMetaData(keyspace).strategyClass;
return strategyClass.equals(NetworkTopologyStrategy.class)
|| strategyClass.equals(OldNetworkTopologyStrategy.class);
})
.collect(Collectors.toSet());
Set unavailable = keyspaces.stream()
.filter(keyspace -> !verifyAllRangesAvailable(sourceDc, keyspace))
.collect(Collectors.toSet());
if (!unavailable.isEmpty())
logger.warn("Verified keyspaces are missing ranges (from source DC: {}): {}", sourceDc, unavailable);
return keyspaces.stream()
.filter(keyspace -> !unavailable.contains(keyspace))
.collect(Collectors.toSet());
}
/**
 * Asks a rebuild streamer whether all ranges of the keyspace are present when sourced from the given
 * datacenter. Any exception is logged and answered with false.
 *
 * @param sourceDc datacenter the ranges would be sourced from
 * @param keyspace keyspace to verify
 * @return the streamer's answer, or false when the check itself failed
 */
private boolean verifyAllRangesAvailable(String sourceDc, String keyspace)
{
    try
    {
        return getRebuildStreamer(sourceDc, keyspace).areAllRangesPresent();
    }
    catch (Exception verificationFailure)
    {
        logger.error("Failed to verify all ranges from source DC {} for keyspace {} were available on this node. " +
                     "Defaulting to false for safety", sourceDc, keyspace, verificationFailure);
    }
    return false;
}
/**
 * Builds a {@code RangeStreamer} for a rebuild of this node's local ranges.
 *
 * <p>Sources are always filtered by the failure detector. When a source datacenter is given, sources are
 * restricted to it; when a keyspace is given as well, a {@code SingleRackFilter} built from the topology
 * known to gossip is added on top.
 *
 * @param sourceDc datacenter to stream from, or null for no datacenter restriction
 * @param keyspace keyspace whose local ranges are added, or null for all non-system keyspaces
 * @return the configured streamer; nothing has been fetched yet
 */
private RangeStreamer getRebuildStreamer(String sourceDc, String keyspace)
{
    final IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();

    RangeStreamer streamer = new RangeStreamer(tokenMetadata,
                                               null,
                                               FBUtilities.getBroadcastAddress(),
                                               "Rebuild",
                                               !replacing && useStrictConsistency,
                                               snitch,
                                               streamStateStore);
    streamer.addSourceFilter(new RangeStreamer.FailureDetectorSourceFilter(FailureDetector.instance));

    if (sourceDc != null)
    {
        streamer.addSourceFilter(new RangeStreamer.SingleDatacenterFilter(snitch, sourceDc));

        if (keyspace != null)
        {
            // Given RF 3, with 3 arbitrary racks, this will result in a fully consistent rebuild.
            // This is due to the simple fact that, our topology will be mirrored identically across datacenters.
            // As a result, the only way for our destination datacenter to have inconsistent data, is for our
            // source datacenter to have inconsistent data.

            // datacenter -> racks, for every endpoint gossip knows about
            HashMultimap<String, String> topology = HashMultimap.create();
            Gossiper.instance.getEndpointStates().stream()
                    .map(Entry::getKey)
                    .forEach(address -> topology.put(snitch.getDatacenter(address), snitch.getRack(address)));

            String localDc = snitch.getDatacenter(FBUtilities.getBroadcastAddress());
            String localRack = snitch.getRack(FBUtilities.getBroadcastAddress());
            AbstractReplicationStrategy replicationStrategy = Keyspace.open(keyspace).getReplicationStrategy();
            streamer.addSourceFilter(SingleRackFilter.create(topology, sourceDc, localDc, localRack, replicationStrategy));
        }
    }

    if (keyspace == null)
    {
        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
            streamer.addRanges(keyspaceName, getLocalRanges(keyspaceName));
    }
    else
    {
        streamer.addRanges(keyspace, getLocalRanges(keyspace));
    }

    return streamer;
}
/**
 * Stores the outbound stream throughput throttle in {@code DatabaseDescriptor} and logs the new value.
 *
 * @param value the new throttle
 */
public void setStreamThroughputMbPerSec(int value)
{
    final int throttle = value;
    DatabaseDescriptor.setStreamThroughputOutboundMegabitsPerSec(throttle);
    logger.info("setstreamthroughput: throttle set to {}", throttle);
}
/**
 * @return the outbound stream throughput throttle currently held by {@code DatabaseDescriptor}
 */
public int getStreamThroughputMbPerSec()
{
    final int throttle = DatabaseDescriptor.getStreamThroughputOutboundMegabitsPerSec();
    return throttle;
}
/**
 * Stores the inter-datacenter outbound stream throughput throttle in {@code DatabaseDescriptor} and logs
 * the new value.
 *
 * @param value the new throttle
 */
public void setInterDCStreamThroughputMbPerSec(int value)
{
    final int throttle = value;
    DatabaseDescriptor.setInterDCStreamThroughputOutboundMegabitsPerSec(throttle);
    logger.info("setinterdcstreamthroughput: throttle set to {}", throttle);
}
/**
 * @return the inter-datacenter outbound stream throughput throttle currently held by
 *         {@code DatabaseDescriptor}
 */
public int getInterDCStreamThroughputMbPerSec()
{
    final int throttle = DatabaseDescriptor.getInterDCStreamThroughputOutboundMegabitsPerSec();
    return throttle;
}
/**
 * @return the compaction throughput currently held by {@code DatabaseDescriptor}
 */
public int getCompactionThroughputMbPerSec()
{
    final int throughput = DatabaseDescriptor.getCompactionThroughputMbPerSec();
    return throughput;
}
/**
 * Stores the compaction throughput in {@code DatabaseDescriptor} and then applies it to the
 * {@code CompactionManager} rate.
 *
 * @param value the new throughput
 */
public void setCompactionThroughputMbPerSec(int value)
{
    final int throughput = value;
    DatabaseDescriptor.setCompactionThroughputMbPerSec(throughput);
    CompactionManager.instance.setRate(throughput);
}
/**
 * @return the incremental backups flag currently held by {@code DatabaseDescriptor}
 */
public boolean isIncrementalBackupsEnabled()
{
    final boolean enabled = DatabaseDescriptor.isIncrementalBackupsEnabled();
    return enabled;
}
/**
 * Stores the incremental backups flag in {@code DatabaseDescriptor}.
 *
 * @param value the new flag value
 */
public void setIncrementalBackupsEnabled(boolean value)
{
    final boolean enabled = value;
    DatabaseDescriptor.setIncrementalBackupsEnabled(enabled);
}
/**
 * Switches the operation mode without an accompanying message.
 *
 * @param m the new mode
 * @param log forwarded to the three-argument overload
 */
private void setMode(Mode m, boolean log)
{
    final String noMessage = null;
    setMode(m, noMessage, log);
}
/**
 * Switches the operation mode and logs the mode name, passing the message as a safe argument named
 * {@code msg}.
 *
 * @param m the new mode
 * @param msg message attached to the log statement; may be null
 * @param log true to log at info level, false to log at debug level
 */
@VisibleForTesting
void setMode(Mode m, @Safe String msg, boolean log)
{
    operationMode = m;

    if (!log)
    {
        logger.debug(m.toString(), SafeArg.of("msg", msg));
        return;
    }
    logger.info(m.toString(), SafeArg.of("msg", msg));
}
/**
 * Bootstrap node by fetching data from other nodes.
 * If node is bootstrapping as a new node, then this also announces bootstrapping to the cluster.
 *
 * This blocks until streaming is done.
 *
 * @param tokens bootstrapping tokens
 * @return true if streaming completed and no {@code BOOTSTRAP_ERROR} non-transient error is recorded;
 *         false if waiting on the bootstrap stream threw anything
 * @throws IllegalStateException if gossip has not seen any seed
 */
private boolean bootstrap(final Collection<Token> tokens)
{
    isBootstrapMode = true;
    SystemKeyspace.updateTokens(tokens); // DON'T use setToken, that makes us part of the ring locally which is incorrect until we are done bootstrapping

    if (!replacing || !isReplacingSameAddress())
    {
        // if not an existing token then bootstrap
        List<Pair<ApplicationState, VersionedValue>> states = new ArrayList<>();
        states.add(Pair.create(ApplicationState.TOKENS, valueFactory.tokens(tokens)));
        states.add(Pair.create(ApplicationState.STATUS, replacing ?
                                                        valueFactory.bootReplacing(DatabaseDescriptor.getReplaceAddress()) :
                                                        valueFactory.bootstrapping(tokens)));
        Gossiper.instance.addLocalApplicationStates(states);
        setMode(Mode.JOINING, "sleeping " + RING_DELAY + " ms for pending range setup", true);
        Uninterruptibles.sleepUninterruptibly(RING_DELAY, TimeUnit.MILLISECONDS);
    }
    else
    {
        // Dont set any state for the node which is bootstrapping the existing token...
        tokenMetadata.updateNormalTokens(tokens, FBUtilities.getBroadcastAddress());
        SystemKeyspace.removeEndpoint(DatabaseDescriptor.getReplaceAddress());
    }

    if (!Gossiper.instance.seenAnySeed())
        throw new IllegalStateException("Unable to contact any seeds!");

    if (Boolean.getBoolean("cassandra.reset_bootstrap_progress"))
    {
        logger.info("Resetting bootstrap progress to start fresh");
        SystemKeyspace.resetAvailableRanges();
    }

    setMode(Mode.JOINING, "Starting to bootstrap...", true);
    BootStrapper bootstrapper = new BootStrapper(FBUtilities.getBroadcastAddress(), tokens, tokenMetadata);
    bootstrapper.addProgressListener(progressSupport);
    bootstrapListeners.forEach(bootstrapper::addProgressListener);
    ListenableFuture<StreamState> bootstrapStream = bootstrapper.bootstrap(streamStateStore, !replacing && useStrictConsistency); // handles token update
    try
    {
        bootstrapStream.get();
        isBootstrapMode = false;
        logger.info("Bootstrap streaming completed for tokens {}", tokens);
        return !StorageService.instance.hasNonTransientError(StorageServiceMBean.NonTransientError.BOOTSTRAP_ERROR);
    }
    catch (Throwable e)
    {
        // isBootstrapMode is left set on this path; the caller decides how to fail the node.
        logger.error("Error while waiting on bootstrap to complete. Bootstrap will have to be restarted.", e);
        return false;
    }
}
/**
 * Resuming a bootstrap is disabled; this method always throws.
 *
 * <p>An earlier implementation that re-ran the bootstrapper on the saved tokens and joined the ring from a
 * future callback used to live here as commented-out code.
 *
 * @return never returns normally
 * @throws RuntimeException always
 */
public boolean resumeBootstrap()
{
    final String reason = "Resuming bootstraps is currently not supported, as it can result in corruption.";
    throw new RuntimeException(reason);
}
/**
 * Signals everything waiting on the start-bootstrap condition.
 */
@Override
public void startBootstrap()
{
    // Wake every waiter on the start-bootstrap condition.
    startBootstrapCondition.signalAll();
}
/**
 * Signals everything waiting on the finish-bootstrap condition.
 */
@Override
public void finishBootstrap()
{
    // Wake every waiter on the finish-bootstrap condition.
    finishBootstrapCondition.signalAll();
}
/**
 * Removes every recorded non-transient error.
 */
public void clearNonTransientErrors()
{
    nonTransientErrors.clear();
}
/**
 * Removes every recorded transient error.
 */
public void clearTransientErrors()
{
    transientErrors.clear();
}
/**
 * Switches the operation mode to {@code Mode.NORMAL}.
 */
public void setOperationModeNormal()
{
    final Mode normal = Mode.NORMAL;
    setOperationMode(normal);
}
/**
 * Switches the operation mode, with no message and with the log flag set to false.
 *
 * @param mode the new mode
 */
@VisibleForTesting
void setOperationMode(Mode mode)
{
    final boolean log = false;
    setMode(mode, log);
}
@Override
public Set