// Downloading a project consumes a lot of resources. Please understand that we have to cover our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.service;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import javax.management.JMX;
import javax.management.MBeanServer;
import javax.management.Notification;
import javax.management.NotificationBroadcasterSupport;
import javax.management.ObjectName;
import javax.management.openmbean.TabularData;
import javax.management.openmbean.TabularDataSupport;
import ch.qos.logback.classic.LoggerContext;
import ch.qos.logback.classic.jmx.JMXConfiguratorMBean;
import ch.qos.logback.classic.spi.ILoggingEvent;
import ch.qos.logback.core.Appender;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Predicate;
import com.google.common.collect.*;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.Uninterruptibles;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.auth.Auth;
import org.apache.cassandra.concurrent.ScheduledExecutors;
import org.apache.cassandra.concurrent.Stage;
import org.apache.cassandra.concurrent.StageManager;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.config.KSMetaData;
import org.apache.cassandra.config.Schema;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.commitlog.CommitLog;
import org.apache.cassandra.db.compaction.CompactionManager;
import org.apache.cassandra.db.index.SecondaryIndex;
import org.apache.cassandra.dht.*;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.exceptions.ConfigurationException;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.exceptions.UnavailableException;
import org.apache.cassandra.gms.*;
import org.apache.cassandra.io.sstable.SSTableDeletingTask;
import org.apache.cassandra.io.sstable.SSTableLoader;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.locator.*;
import org.apache.cassandra.metrics.StorageMetrics;
import org.apache.cassandra.net.AsyncOneResponse;
import org.apache.cassandra.net.MessageOut;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.net.ResponseVerbHandler;
import org.apache.cassandra.repair.RepairFuture;
import org.apache.cassandra.repair.RepairMessageVerbHandler;
import org.apache.cassandra.repair.RepairParallelism;
import org.apache.cassandra.service.paxos.CommitVerbHandler;
import org.apache.cassandra.service.paxos.PrepareVerbHandler;
import org.apache.cassandra.service.paxos.ProposeVerbHandler;
import org.apache.cassandra.streaming.*;
import org.apache.cassandra.thrift.EndpointDetails;
import org.apache.cassandra.thrift.TokenRange;
import org.apache.cassandra.thrift.cassandraConstants;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.utils.*;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
/**
* This abstraction contains the token/identifier of this node
* on the identifier space. This token gets gossiped around.
* This class will also maintain histograms of the load information
* of other nodes in the cluster.
*/
public class StorageService extends NotificationBroadcasterSupport implements IEndpointStateChangeSubscriber, StorageServiceMBean
{
/* Class logger. It is declared before RING_DELAY because getRingDelay() logs through it while RING_DELAY is being initialised. */
private static final Logger logger = LoggerFactory.getLogger(StorageService.class);
/* Value returned by getRingDelay(): the "cassandra.ring_delay_ms" system property when set, 30 seconds otherwise. */
public static final int RING_DELAY = getRingDelay(); // delay after which we assume ring has stabilized
/* JMX notification serial number counter */
private final AtomicLong notificationSerialNumber = new AtomicLong();
/**
 * Resolves the ring delay in milliseconds.
 *
 * @return the integer value of the "cassandra.ring_delay_ms" system property when it is set,
 *         otherwise 30 seconds
 * @throws NumberFormatException if the property is set but is not a valid integer
 */
private static int getRingDelay()
{
    String override = System.getProperty("cassandra.ring_delay_ms");
    if (override == null)
        return 30 * 1000;

    logger.info("Overriding RING_DELAY to {}ms", override);
    return Integer.parseInt(override);
}
/* This abstraction maintains the token/endpoint metadata information */
private TokenMetadata tokenMetadata = new TokenMetadata();
/* Factory for the versioned values gossiped by this node; built from the configured partitioner. */
public volatile VersionedValue.VersionedValueFactory valueFactory = new VersionedValue.VersionedValueFactory(getPartitioner());
/* Shutdown-hook thread; created and registered in initServer(int), unregistered by removeShutdownHook(). */
private Thread drainOnShutdown = null;
/* The shared StorageService instance. */
public static final StorageService instance = new StorageService();
/**
 * @return the partitioner reported by {@link DatabaseDescriptor#getPartitioner()}
 */
public static IPartitioner getPartitioner()
{
    IPartitioner configured = DatabaseDescriptor.getPartitioner();
    return configured;
}
public Collection> getLocalRanges(String keyspaceName)
{
return getRangesForEndpoint(keyspaceName, FBUtilities.getBroadcastAddress());
}
public Collection> getPrimaryRanges(String keyspace)
{
return getPrimaryRangesForEndpoint(keyspace, FBUtilities.getBroadcastAddress());
}
public Collection> getPrimaryRangesWithinDC(String keyspace)
{
return getPrimaryRangeForEndpointWithinDC(keyspace, FBUtilities.getBroadcastAddress());
}
/* NOTE(review): element type restored as InetAddress (the generics were stripped from this file) - confirm against the usages of this set */
private final Set<InetAddress> replicatingNodes = Collections.synchronizedSet(new HashSet<InetAddress>());
/* The daemon registered through registerDaemon(); null until then. */
private CassandraDaemon daemon;
private InetAddress removingNode;
/* Are we starting this node in bootstrap mode? */
private boolean isBootstrapMode;
/* we bootstrap but do NOT join the ring unless told to do so */
private boolean isSurveyMode = Boolean.parseBoolean(System.getProperty("cassandra.write_survey", "false"));
/* when initialized as a client, we shouldn't write to the system keyspace. */
private boolean isClientMode;
/* set by initClient/initServer/startGossiping, cleared by stopGossiping */
private boolean initialized;
/* set as soon as joinTokenRing() starts */
private volatile boolean joined = false;
/* the probability for tracing any particular request, 0 disables tracing and 1 enables for all */
private double tracingProbability = 0.0;
private static enum Mode { STARTING, NORMAL, CLIENT, JOINING, LEAVING, DECOMMISSIONED, MOVING, DRAINING, DRAINED }
private Mode operationMode = Mode.STARTING;
/* Used for tracking drain progress */
private volatile int totalCFs, remainingCFs;
private static final AtomicInteger nextRepairCommand = new AtomicInteger();
/* subscribers added through register() and removed through unregister() */
private final List<IEndpointLifecycleSubscriber> lifecycleSubscribers = new CopyOnWriteArrayList<>();
private static final BackgroundActivityMonitor bgMonitor = new BackgroundActivityMonitor();
/* assigned in the constructor when this service is registered as an MBean */
private final ObjectName jmxObjectName;
/* tokens selected in prepareToJoin()/joinTokenRing(); null until one of them assigns it */
private Collection<Token> bootstrapTokens = null;
/** Clears the bootstrap-mode flag. */
public void finishBootstrapping()
{
    this.isBootstrapMode = false;
}
/** This method updates the local token on disk */
public void setTokens(Collection tokens)
{
if (logger.isDebugEnabled())
logger.debug("Setting tokens to {}", tokens);
SystemKeyspace.updateTokens(tokens);
tokenMetadata.updateNormalTokens(tokens, FBUtilities.getBroadcastAddress());
Collection localTokens = getLocalTokens();
List> states = new ArrayList>();
states.add(Pair.create(ApplicationState.TOKENS, valueFactory.tokens(localTokens)));
states.add(Pair.create(ApplicationState.STATUS, valueFactory.normal(localTokens)));
Gossiper.instance.addLocalApplicationStates(states);
setMode(Mode.NORMAL, false);
}
/**
 * Registers this service and the StreamManager with the platform MBean server, then
 * installs one verb handler per messaging verb handled by this node.
 *
 * @throws RuntimeException wrapping any failure of the MBean registration
 */
public StorageService()
{
    MBeanServer platformServer = ManagementFactory.getPlatformMBeanServer();
    try
    {
        jmxObjectName = new ObjectName("org.apache.cassandra.db:type=StorageService");
        platformServer.registerMBean(this, jmxObjectName);
        platformServer.registerMBean(StreamManager.instance, new ObjectName(StreamManager.OBJECT_NAME));
    }
    catch (Exception e)
    {
        throw new RuntimeException(e);
    }

    /* register the verb handlers */
    MessagingService messaging = MessagingService.instance();

    // reads, writes and paxos
    messaging.registerVerbHandlers(MessagingService.Verb.MUTATION, new MutationVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.READ_REPAIR, new ReadRepairVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.READ, new ReadVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.RANGE_SLICE, new RangeSliceVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.PAGED_RANGE, new RangeSliceVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.COUNTER_MUTATION, new CounterMutationVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.TRUNCATE, new TruncateVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.PAXOS_PREPARE, new PrepareVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.PAXOS_PROPOSE, new ProposeVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.PAXOS_COMMIT, new CommitVerbHandler());

    // see BootStrapper for a summary of how the bootstrap verbs interact
    messaging.registerVerbHandlers(MessagingService.Verb.REPLICATION_FINISHED, new ReplicationFinishedVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.REQUEST_RESPONSE, new ResponseVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.INTERNAL_RESPONSE, new ResponseVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.REPAIR_MESSAGE, new RepairMessageVerbHandler());

    // gossip
    messaging.registerVerbHandlers(MessagingService.Verb.GOSSIP_SHUTDOWN, new GossipShutdownVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.GOSSIP_DIGEST_SYN, new GossipDigestSynVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.GOSSIP_DIGEST_ACK, new GossipDigestAckVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.GOSSIP_DIGEST_ACK2, new GossipDigestAck2VerbHandler());

    // schema, snapshot and echo
    messaging.registerVerbHandlers(MessagingService.Verb.DEFINITIONS_UPDATE, new DefinitionsUpdateVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.SCHEMA_CHECK, new SchemaCheckVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.MIGRATION_REQUEST, new MigrationRequestVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.SNAPSHOT, new SnapshotVerbHandler());
    messaging.registerVerbHandlers(MessagingService.Verb.ECHO, new EchoVerbHandler());
}
/**
 * Remembers the daemon whose transports the RPC / native-transport / stopDaemon operations act on.
 *
 * @param daemon the daemon to register
 */
public void registerDaemon(CassandraDaemon daemon)
{
    this.daemon = daemon;
}
/**
 * Adds a subscriber to the list of endpoint lifecycle subscribers.
 *
 * @param subscriber the subscriber to add
 */
public void register(IEndpointLifecycleSubscriber subscriber)
{
    this.lifecycleSubscribers.add(subscriber);
}
/**
 * Removes a subscriber from the list of endpoint lifecycle subscribers.
 *
 * @param subscriber the subscriber to remove
 */
public void unregister(IEndpointLifecycleSubscriber subscriber)
{
    this.lifecycleSubscribers.remove(subscriber);
}
/**
 * Stops the gossiper and clears the initialized flag; does nothing when the service is not
 * initialized. Should only be called via JMX.
 */
public void stopGossiping()
{
    if (!initialized)
        return;

    logger.warn("Stopping gossip by operator request");
    Gossiper.instance.stop();
    initialized = false;
}
/**
 * Starts the gossiper, passing the current time in seconds as its start argument, and sets
 * the initialized flag; does nothing when the service is already initialized.
 * Should only be called via JMX.
 */
public void startGossiping()
{
    if (initialized)
        return;

    logger.warn("Starting gossip by operator request");
    int nowInSeconds = (int) (System.currentTimeMillis() / 1000);
    Gossiper.instance.start(nowInSeconds);
    initialized = true;
}
/**
 * Should only be called via JMX.
 *
 * @return whether the gossiper reports itself as enabled
 */
public boolean isGossipRunning()
{
    boolean enabled = Gossiper.instance.isEnabled();
    return enabled;
}
/**
 * Starts the Thrift server of the registered daemon. Should only be called via JMX.
 *
 * @throws IllegalStateException if no daemon has been registered
 */
public void startRPCServer()
{
    if (daemon == null)
        throw new IllegalStateException("No configured daemon");

    daemon.thriftServer.start();
}
/**
 * Stops the Thrift server of the registered daemon, if the daemon has one.
 *
 * @throws IllegalStateException if no daemon has been registered
 */
public void stopRPCServer()
{
    if (daemon == null)
        throw new IllegalStateException("No configured daemon");

    if (daemon.thriftServer == null)
        return;

    daemon.thriftServer.stop();
}
/**
 * @return true only when a daemon is registered, it has a Thrift server, and that server
 *         reports itself as running
 */
public boolean isRPCServerRunning()
{
    return daemon != null
        && daemon.thriftServer != null
        && daemon.thriftServer.isRunning();
}
/**
 * Starts the native transport server of the registered daemon.
 *
 * @throws IllegalStateException if no daemon has been registered
 * @throws RuntimeException if the server fails to start; the original exception is attached as the cause
 */
public void startNativeTransport()
{
    if (daemon == null)
    {
        throw new IllegalStateException("No configured daemon");
    }
    try
    {
        daemon.nativeServer.start();
    }
    catch (Exception e)
    {
        // keep the original exception as the cause so the stack trace of the real failure is not lost
        throw new RuntimeException("Error starting native transport: " + e.getMessage(), e);
    }
}
/**
 * Stops the native transport server of the registered daemon, if the daemon has one.
 *
 * @throws IllegalStateException if no daemon has been registered
 */
public void stopNativeTransport()
{
    if (daemon == null)
        throw new IllegalStateException("No configured daemon");

    if (daemon.nativeServer == null)
        return;

    daemon.nativeServer.stop();
}
/**
 * @return true only when a daemon is registered, it has a native transport server, and that
 *         server reports itself as running
 */
public boolean isNativeTransportRunning()
{
    return daemon != null
        && daemon.nativeServer != null
        && daemon.nativeServer.isRunning();
}
/**
 * Stops, in this order, gossip, the RPC server and the native transport - each one only if it
 * is currently initialized / running. Each stop is announced at error level.
 */
public void stopTransports()
{
    boolean gossipActive = isInitialized();
    if (gossipActive)
    {
        logger.error("Stopping gossiper");
        stopGossiping();
    }

    boolean rpcActive = isRPCServerRunning();
    if (rpcActive)
    {
        logger.error("Stopping RPC server");
        stopRPCServer();
    }

    boolean nativeActive = isNativeTransportRunning();
    if (nativeActive)
    {
        logger.error("Stopping native transport");
        stopNativeTransport();
    }
}
/** Stops the RPC server first, then the native transport. */
private void shutdownClientServers()
{
    this.stopRPCServer();
    this.stopNativeTransport();
}
/**
 * Unregisters this service from the gossiper, stops gossip, shuts down the messaging service
 * and, after a one-second pause, shuts down all stages.
 */
public void stopClient()
{
    Gossiper gossiper = Gossiper.instance;
    gossiper.unregister(this);
    gossiper.stop();

    MessagingService.instance().shutdown();

    // give it a second so that task accepted before the MessagingService shutdown gets submitted to the stage (to avoid RejectedExecutionException)
    Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);

    StageManager.shutdownNow();
}
/**
 * @return the current value of the initialized flag
 */
public boolean isInitialized()
{
    return this.initialized;
}
/**
 * Deactivates the registered daemon and then terminates the JVM with exit status 0.
 *
 * @throws IllegalStateException if no daemon has been registered
 */
public void stopDaemon()
{
    if (daemon == null)
    {
        throw new IllegalStateException("No configured daemon");
    }

    daemon.deactivate();

    // completely shut down cassandra
    System.exit(0);
}
/**
 * Gathers, through a shadow gossip round, the information needed to replace the node at
 * {@code DatabaseDescriptor.getReplaceAddress()}: its host id, which is stored as the local
 * host id, and its tokens, which are returned. The gossiper's endpoint state map is reset
 * before returning.
 *
 * @return the tokens gossiped by the node being replaced
 * @throws RuntimeException if the node is unknown to gossip, has gossiped no tokens, or its
 *         tokens cannot be deserialized
 * @throws ConfigurationException declared by the original signature
 */
public synchronized Collection<Token> prepareReplacementInfo() throws ConfigurationException
{
    InetAddress replaceAddress = DatabaseDescriptor.getReplaceAddress();
    logger.info("Gathering node replacement information for {}", replaceAddress);
    if (!MessagingService.instance().isListening())
        MessagingService.instance().listen(FBUtilities.getLocalAddress());

    // make magic happen
    Gossiper.instance.doShadowRound();

    // now that we've gossiped at least once, we should be able to find the node we're replacing
    EndpointState replacedState = Gossiper.instance.getEndpointStateForEndpoint(replaceAddress);
    if (replacedState == null)
        throw new RuntimeException("Cannot replace_address " + replaceAddress + " because it doesn't exist in gossip");
    UUID hostId = Gossiper.instance.getHostId(replaceAddress);

    // reuse the state fetched above rather than looking it up (and dereferencing it unchecked) a second time
    if (replacedState.getApplicationState(ApplicationState.TOKENS) == null)
        throw new RuntimeException("Could not find tokens for " + replaceAddress + " to replace");

    try
    {
        Collection<Token> tokens = TokenSerializer.deserialize(getPartitioner(), new DataInputStream(new ByteArrayInputStream(getApplicationStateValue(replaceAddress, ApplicationState.TOKENS))));
        SystemKeyspace.setLocalHostId(hostId); // use the replacee's host Id as our own so we receive hints, etc
        Gossiper.instance.resetEndpointStateMap(); // clean up since we have what we need
        return tokens;
    }
    catch (IOException e)
    {
        throw new RuntimeException(e);
    }
}
/**
 * Runs a shadow gossip round and refuses to continue when this node's broadcast address is
 * already in gossip with a state that is neither dead nor a fat client. When
 * {@code RangeStreamer.useStrictConsistency} is set, it also refuses when any gossiped
 * endpoint has a bootstrapping, leaving or moving STATUS. The gossiper's endpoint state map
 * is reset at the end.
 *
 * @throws RuntimeException if a node with this address already exists
 * @throws UnsupportedOperationException if another range movement is detected under strict consistency
 * @throws ConfigurationException declared by the original signature
 */
public synchronized void checkForEndpointCollision() throws ConfigurationException
{
    logger.debug("Starting shadow gossip round to check for endpoint collision");
    if (!MessagingService.instance().isListening())
        MessagingService.instance().listen(FBUtilities.getLocalAddress());
    Gossiper.instance.doShadowRound();
    EndpointState epState = Gossiper.instance.getEndpointStateForEndpoint(FBUtilities.getBroadcastAddress());
    if (epState != null && !Gossiper.instance.isDeadState(epState) && !Gossiper.instance.isFatClient(FBUtilities.getBroadcastAddress()))
    {
        throw new RuntimeException(String.format("A node with address %s already exists, cancelling join. " +
                                                 "Use cassandra.replace_address if you want to replace this node.",
                                                 FBUtilities.getBroadcastAddress()));
    }
    if (RangeStreamer.useStrictConsistency)
    {
        for (Map.Entry<InetAddress, EndpointState> entry : Gossiper.instance.getEndpointStates())
        {
            // endpoints that have not gossiped a STATUS are ignored
            if (entry.getValue().getApplicationState(ApplicationState.STATUS) == null)
                continue;
            // the first delimiter-separated piece of the STATUS value is the state name
            String[] pieces = entry.getValue().getApplicationState(ApplicationState.STATUS).value.split(VersionedValue.DELIMITER_STR, -1);
            assert (pieces.length > 0);
            String state = pieces[0];
            if (state.equals(VersionedValue.STATUS_BOOTSTRAPPING) || state.equals(VersionedValue.STATUS_LEAVING) || state.equals(VersionedValue.STATUS_MOVING))
                throw new UnsupportedOperationException("Other bootstrapping/leaving/moving nodes detected, cannot bootstrap while cassandra.consistent.rangemovement is true");
        }
    }
    Gossiper.instance.resetEndpointStateMap();
}
/**
 * Initializes this service in client mode with no ring delay, then blocks - polling once per
 * second - until gossip reports at least one live member that is not a fat client, and after
 * that until MigrationManager reports it is ready for bootstrap.
 *
 * @throws ConfigurationException propagated from {@code initClient(int)}
 */
public synchronized void initClient() throws ConfigurationException
{
    // no ring delay here: the loops below do the waiting explicitly
    initClient(0);

    // sleep a while to allow gossip to warm up (the other nodes need to know about this one before they can reply).
    boolean sawServerNode = false;
    while (!sawServerNode)
    {
        Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
        for (InetAddress member : Gossiper.instance.getLiveMembers())
        {
            if (!Gossiper.instance.isFatClient(member))
            {
                sawServerNode = true;
                break;
            }
        }
    }

    // sleep until any schema migrations have finished
    while (!MigrationManager.isReadyForBootstrap())
        Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
}
/**
 * Initializes this service in client mode: registers with and starts the gossiper, gossips
 * the network version, makes sure the messaging service is listening, then sleeps for
 * {@code ringDelay} milliseconds. A repeated call in client mode is a no-op.
 *
 * @param ringDelay time to sleep after start-up, in milliseconds
 * @throws UnsupportedOperationException if the service was already initialized in non-client mode
 * @throws ConfigurationException declared by the original signature
 */
public synchronized void initClient(int ringDelay) throws ConfigurationException
{
    if (initialized)
    {
        if (isClientMode)
            return;
        throw new UnsupportedOperationException("StorageService does not support switching modes.");
    }

    initialized = true;
    isClientMode = true;
    logger.info("Starting up client gossip");
    setMode(Mode.CLIENT, false);

    Gossiper.instance.register(this);
    int nowInSeconds = (int) (System.currentTimeMillis() / 1000);
    Gossiper.instance.start(nowInSeconds); // needed for node-ring gathering.
    Gossiper.instance.addLocalApplicationState(ApplicationState.NET_VERSION, valueFactory.networkVersion());

    if (!MessagingService.instance().isListening())
    {
        MessagingService.instance().listen(FBUtilities.getLocalAddress());
    }

    Uninterruptibles.sleepUninterruptibly(ringDelay, TimeUnit.MILLISECONDS);
}
/**
 * Initializes this service in server mode using {@code RING_DELAY} as the delay.
 *
 * @throws ConfigurationException propagated from {@code initServer(int)}
 */
public synchronized void initServer() throws ConfigurationException
{
    this.initServer(RING_DELAY);
}
public synchronized void initServer(int delay) throws ConfigurationException
{
logger.info("Cassandra version: {}", FBUtilities.getReleaseVersionString());
logger.info("Thrift API version: {}", cassandraConstants.VERSION);
logger.info("CQL supported versions: {} (default: {})", StringUtils.join(ClientState.getCQLSupportedVersion(), ","), ClientState.DEFAULT_CQL_VERSION);
if (initialized)
{
if (isClientMode)
throw new UnsupportedOperationException("StorageService does not support switching modes.");
return;
}
initialized = true;
isClientMode = false;
try
{
// Ensure StorageProxy is initialized on start-up; see CASSANDRA-3797.
Class.forName("org.apache.cassandra.service.StorageProxy");
// also IndexSummaryManager, which is otherwise unreferenced
Class.forName("org.apache.cassandra.io.sstable.IndexSummaryManager");
}
catch (ClassNotFoundException e)
{
throw new AssertionError(e);
}
if (Boolean.parseBoolean(System.getProperty("cassandra.load_ring_state", "true")))
{
logger.info("Loading persisted ring state");
Multimap loadedTokens = SystemKeyspace.loadTokens();
Map loadedHostIds = SystemKeyspace.loadHostIds();
for (InetAddress ep : loadedTokens.keySet())
{
if (ep.equals(FBUtilities.getBroadcastAddress()))
{
// entry has been mistakenly added, delete it
SystemKeyspace.removeEndpoint(ep);
}
else
{
tokenMetadata.updateNormalTokens(loadedTokens.get(ep), ep);
if (loadedHostIds.containsKey(ep))
tokenMetadata.updateHostId(loadedHostIds.get(ep), ep);
Gossiper.instance.addSavedEndpoint(ep);
}
}
}
// daemon threads, like our executors', continue to run while shutdown hooks are invoked
drainOnShutdown = new Thread(new WrappedRunnable()
{
@Override
public void runMayThrow() throws InterruptedException
{
ExecutorService counterMutationStage = StageManager.getStage(Stage.COUNTER_MUTATION);
ExecutorService mutationStage = StageManager.getStage(Stage.MUTATION);
if (mutationStage.isShutdown() && counterMutationStage.isShutdown())
return; // drained already
if (daemon != null)
shutdownClientServers();
ScheduledExecutors.optionalTasks.shutdown();
Gossiper.instance.stop();
// In-progress writes originating here could generate hints to be written, so shut down MessagingService
// before mutation stage, so we can get all the hints saved before shutting down
MessagingService.instance().shutdown();
counterMutationStage.shutdown();
mutationStage.shutdown();
counterMutationStage.awaitTermination(3600, TimeUnit.SECONDS);
mutationStage.awaitTermination(3600, TimeUnit.SECONDS);
StorageProxy.instance.verifyNoHintsInProgress();
List> flushes = new ArrayList<>();
for (Keyspace keyspace : Keyspace.all())
{
KSMetaData ksm = Schema.instance.getKSMetaData(keyspace.getName());
if (!ksm.durableWrites)
{
for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
flushes.add(cfs.forceFlush());
}
}
try
{
FBUtilities.waitOnFutures(flushes);
}
catch (Throwable t)
{
JVMStabilityInspector.inspectThrowable(t);
// don't let this stop us from shutting down the commitlog and other thread pools
logger.warn("Caught exception while waiting for memtable flushes during shutdown hook", t);
}
CommitLog.instance.shutdownBlocking();
// wait for miscellaneous tasks like sstable and commitlog segment deletion
ScheduledExecutors.nonPeriodicTasks.shutdown();
if (!ScheduledExecutors.nonPeriodicTasks.awaitTermination(1, TimeUnit.MINUTES))
logger.warn("Miscellaneous task executor still busy after one minute; proceeding with shutdown");
}
}, "StorageServiceShutdownHook");
Runtime.getRuntime().addShutdownHook(drainOnShutdown);
prepareToJoin();
// Has to be called after the host id has potentially changed in prepareToJoin().
for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
if (cfs.metadata.isCounter())
cfs.initCounterCache();
if (Boolean.parseBoolean(System.getProperty("cassandra.join_ring", "true")))
{
joinTokenRing(delay);
}
else
{
Collection tokens = SystemKeyspace.getSavedTokens();
if (!tokens.isEmpty())
{
tokenMetadata.updateNormalTokens(tokens, FBUtilities.getBroadcastAddress());
// order is important here, the gossiper can fire in between adding these two states. It's ok to send TOKENS without STATUS, but *not* vice versa.
List> states = new ArrayList>();
states.add(Pair.create(ApplicationState.TOKENS, valueFactory.tokens(tokens)));
states.add(Pair.create(ApplicationState.STATUS, valueFactory.hibernate(true)));
Gossiper.instance.addLocalApplicationStates(states);
}
logger.info("Not joining ring as requested. Use JMX (StorageService->joinRing()) to initiate ring joining");
}
}
/**
 * In the event of forceful termination we need to remove the shutdown hook to prevent hanging (OOM for instance).
 * Does nothing when no hook has been created yet.
 */
public void removeShutdownHook()
{
    if (drainOnShutdown == null)
        return;

    Runtime runtime = Runtime.getRuntime();
    runtime.removeShutdownHook(drainOnShutdown);
}
/**
 * @return true when auto-bootstrap is enabled, the system keyspace does not report a completed
 *         bootstrap, and this node's broadcast address is not one of the configured seeds
 */
private boolean shouldBootstrap()
{
    // the checks run in the same order as a short-circuiting && chain would evaluate them
    if (!DatabaseDescriptor.isAutoBootstrap())
        return false;
    if (SystemKeyspace.bootstrapComplete())
        return false;

    boolean isSeed = DatabaseDescriptor.getSeeds().contains(FBUtilities.getBroadcastAddress());
    return !isSeed;
}
/**
 * Prepares this node for joining the ring; does nothing once {@code joined} is set.
 * <p>
 * Validates the replace configuration, gathers replacement info or checks for an endpoint
 * collision, starts gossip with the initial application states, announces the schema version,
 * makes sure the messaging service is listening, and starts the load broadcaster, hinted
 * handoff manager and batchlog manager.
 *
 * @throws ConfigurationException if join_ring=false is combined with replacing a node
 * @throws RuntimeException if a removed replace option is configured, or replace_address is
 *         used on a node that is already bootstrapped or has auto_bootstrap disabled
 */
private void prepareToJoin() throws ConfigurationException
{
    if (joined)
        return;

    Map<ApplicationState, VersionedValue> appStates = new HashMap<>();
    if (DatabaseDescriptor.isReplacing() && !(Boolean.parseBoolean(System.getProperty("cassandra.join_ring", "true"))))
        throw new ConfigurationException("Cannot set both join_ring=false and attempt to replace a node");
    if (DatabaseDescriptor.getReplaceTokens().size() > 0 || DatabaseDescriptor.getReplaceNode() != null)
        throw new RuntimeException("Replace method removed; use cassandra.replace_address instead");
    if (DatabaseDescriptor.isReplacing())
    {
        if (SystemKeyspace.bootstrapComplete())
            throw new RuntimeException("Cannot replace address with a node that is already bootstrapped");
        if (!DatabaseDescriptor.isAutoBootstrap())
            throw new RuntimeException("Trying to replace_address with auto_bootstrap disabled will not work, check your configuration");
        bootstrapTokens = prepareReplacementInfo();
        appStates.put(ApplicationState.TOKENS, valueFactory.tokens(bootstrapTokens));
        appStates.put(ApplicationState.STATUS, valueFactory.hibernate(true));
    }
    else if (shouldBootstrap())
    {
        checkForEndpointCollision();
    }
    // have to start the gossip service before we can see any info on other nodes. this is necessary
    // for bootstrap to get the load info it needs.
    // (we won't be part of the storage ring though until we add a counterId to our state, below.)
    // Seed the host ID-to-endpoint map with our own ID.
    UUID localHostId = SystemKeyspace.getLocalHostId();
    getTokenMetadata().updateHostId(localHostId, FBUtilities.getBroadcastAddress());
    appStates.put(ApplicationState.NET_VERSION, valueFactory.networkVersion());
    appStates.put(ApplicationState.HOST_ID, valueFactory.hostId(localHostId));
    appStates.put(ApplicationState.RPC_ADDRESS, valueFactory.rpcaddress(DatabaseDescriptor.getBroadcastRpcAddress()));
    appStates.put(ApplicationState.RELEASE_VERSION, valueFactory.releaseVersion());
    logger.info("Starting up server gossip");
    Gossiper.instance.register(this);
    Gossiper.instance.start(SystemKeyspace.incrementAndGetGeneration(), appStates); // needed for node-ring gathering.
    // gossip snitch infos (local DC and rack)
    gossipSnitchInfo();
    // gossip Schema.emptyVersion forcing immediate check for schema updates (see MigrationManager#maybeScheduleSchemaPull)
    Schema.instance.updateVersionAndAnnounce(); // Ensure we know our own actual Schema UUID in preparation for updates
    if (!MessagingService.instance().isListening())
        MessagingService.instance().listen(FBUtilities.getLocalAddress());
    LoadBroadcaster.instance.startBroadcasting();
    HintedHandOffManager.instance.start();
    BatchlogManager.instance.start();
}
/**
 * Joins this node to the token ring, bootstrapping first when shouldBootstrap() says so.
 * <p>
 * When bootstrapping, waits for schema and pending-range information, picks the bootstrap
 * tokens (or validates the tokens of the node being replaced) and calls bootstrap(). Otherwise
 * the tokens come from the saved tokens, the configured initial tokens, or are generated
 * randomly. Unless write survey mode is active, the bootstrap state is then marked completed,
 * the tokens are published through setTokens() and Auth is set up.
 *
 * @param delay upper bound, in milliseconds, on the initial wait for schema information; also
 *              the window used to decide whether a node being replaced is still alive
 * @throws ConfigurationException if the number of saved tokens differs from the configured number
 * @throws UnsupportedOperationException if another range movement is in progress under
 *         consistent range movement, if this node is already a ring member, or if the
 *         replacement target is alive or owns no such token
 */
private void joinTokenRing(int delay) throws ConfigurationException
{
    joined = true;
    // We bootstrap if we haven't successfully bootstrapped before, as long as we are not a seed.
    // If we are a seed, or if the user manually sets auto_bootstrap to false,
    // we'll skip streaming data from other nodes and jump directly into the ring.
    //
    // The seed check allows us to skip the RING_DELAY sleep for the single-node cluster case,
    // which is useful for both new users and testing.
    //
    // We attempted to replace this with a schema-presence check, but you need a meaningful sleep
    // to get schema info from gossip which defeats the purpose. See CASSANDRA-4427 for the gory details.
    Set<InetAddress> current = new HashSet<>();
    logger.debug("Bootstrap variables: {} {} {} {}",
                 DatabaseDescriptor.isAutoBootstrap(),
                 SystemKeyspace.bootstrapInProgress(),
                 SystemKeyspace.bootstrapComplete(),
                 DatabaseDescriptor.getSeeds().contains(FBUtilities.getBroadcastAddress()));
    if (DatabaseDescriptor.isAutoBootstrap() && !SystemKeyspace.bootstrapComplete() && DatabaseDescriptor.getSeeds().contains(FBUtilities.getBroadcastAddress()))
        logger.info("This node will not auto bootstrap because it is configured to be a seed node.");
    if (shouldBootstrap())
    {
        if (SystemKeyspace.bootstrapInProgress())
            logger.warn("Detected previous bootstrap failure; retrying");
        else
            SystemKeyspace.setBootstrapState(SystemKeyspace.BootstrapState.IN_PROGRESS);
        setMode(Mode.JOINING, "waiting for ring information", true);
        // first sleep the delay to make sure we see all our peers
        for (int i = 0; i < delay; i += 1000)
        {
            // if we see schema, we can proceed to the next check directly
            if (!Schema.instance.getVersion().equals(Schema.emptyVersion))
            {
                logger.debug("got schema: {}", Schema.instance.getVersion());
                break;
            }
            Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
        }
        // if our schema hasn't matched yet, keep sleeping until it does
        // (post CASSANDRA-1391 we don't expect this to be necessary very often, but it doesn't hurt to be careful)
        while (!MigrationManager.isReadyForBootstrap())
        {
            setMode(Mode.JOINING, "waiting for schema information to complete", true);
            Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
        }
        setMode(Mode.JOINING, "schema complete, ready to bootstrap", true);
        setMode(Mode.JOINING, "waiting for pending range calculation", true);
        PendingRangeCalculatorService.instance.blockUntilFinished();
        setMode(Mode.JOINING, "calculation complete, ready to bootstrap", true);
        logger.debug("... got ring + schema info");
        if (Boolean.parseBoolean(System.getProperty("cassandra.consistent.rangemovement", "true")) &&
            (
                tokenMetadata.getBootstrapTokens().valueSet().size() > 0 ||
                tokenMetadata.getLeavingEndpoints().size() > 0 ||
                tokenMetadata.getMovingEndpoints().size() > 0
            ))
            throw new UnsupportedOperationException("Other bootstrapping/leaving/moving nodes detected, cannot bootstrap while cassandra.consistent.rangemovement is true");
        if (!DatabaseDescriptor.isReplacing())
        {
            if (tokenMetadata.isMember(FBUtilities.getBroadcastAddress()))
            {
                String s = "This node is already a member of the token ring; bootstrap aborted. (If replacing a dead node, remove the old one from the ring first.)";
                throw new UnsupportedOperationException(s);
            }
            setMode(Mode.JOINING, "getting bootstrap token", true);
            bootstrapTokens = BootStrapper.getBootstrapTokens(tokenMetadata);
        }
        else
        {
            if (!DatabaseDescriptor.getReplaceAddress().equals(FBUtilities.getBroadcastAddress()))
            {
                try
                {
                    // Sleep additionally to make sure that the server actually is not alive
                    // and giving it more time to gossip if alive.
                    Thread.sleep(LoadBroadcaster.BROADCAST_INTERVAL);
                }
                catch (InterruptedException e)
                {
                    throw new AssertionError(e);
                }
                // check for operator errors...
                for (Token token : bootstrapTokens)
                {
                    InetAddress existing = tokenMetadata.getEndpoint(token);
                    if (existing != null)
                    {
                        // an endpoint whose gossip state was updated within the last `delay` ms is treated as alive
                        long nanoDelay = delay * 1000000L;
                        if (Gossiper.instance.getEndpointStateForEndpoint(existing).getUpdateTimestamp() > (System.nanoTime() - nanoDelay))
                            throw new UnsupportedOperationException("Cannot replace a live node... ");
                        current.add(existing);
                    }
                    else
                    {
                        throw new UnsupportedOperationException("Cannot replace token " + token + " which does not exist!");
                    }
                }
            }
            else
            {
                try
                {
                    Thread.sleep(RING_DELAY);
                }
                catch (InterruptedException e)
                {
                    throw new AssertionError(e);
                }
            }
            setMode(Mode.JOINING, "Replacing a node with token(s): " + bootstrapTokens, true);
        }
        bootstrap(bootstrapTokens);
        assert !isBootstrapMode; // bootstrap will block until finished
    }
    else
    {
        bootstrapTokens = SystemKeyspace.getSavedTokens();
        if (bootstrapTokens.isEmpty())
        {
            Collection<String> initialTokens = DatabaseDescriptor.getInitialTokens();
            if (initialTokens.size() < 1)
            {
                bootstrapTokens = BootStrapper.getRandomTokens(tokenMetadata, DatabaseDescriptor.getNumTokens());
                if (DatabaseDescriptor.getNumTokens() == 1)
                    logger.warn("Generated random token {}. Random tokens will result in an unbalanced ring; see http://wiki.apache.org/cassandra/Operations", bootstrapTokens);
                else
                    logger.info("Generated random tokens. tokens are {}", bootstrapTokens);
            }
            else
            {
                bootstrapTokens = new ArrayList<Token>(initialTokens.size());
                for (String token : initialTokens)
                    bootstrapTokens.add(getPartitioner().getTokenFactory().fromString(token));
                logger.info("Saved tokens not found. Using configuration value: {}", bootstrapTokens);
            }
        }
        else
        {
            if (bootstrapTokens.size() != DatabaseDescriptor.getNumTokens())
                throw new ConfigurationException("Cannot change the number of tokens from " + bootstrapTokens.size() + " to " + DatabaseDescriptor.getNumTokens());
            else
                logger.info("Using saved tokens {}", bootstrapTokens);
        }
    }
    // if we don't have system_traces keyspace at this point, then create it manually
    if (Schema.instance.getKSMetaData(Tracing.TRACE_KS) == null)
    {
        KSMetaData tracingKeyspace = KSMetaData.traceKeyspace();
        MigrationManager.announceNewKeyspace(tracingKeyspace, 0, false);
    }
    if (!isSurveyMode)
    {
        // start participating in the ring.
        SystemKeyspace.setBootstrapState(SystemKeyspace.BootstrapState.COMPLETED);
        setTokens(bootstrapTokens);
        // remove the existing info about the replaced node.
        for (InetAddress existing : current)
            Gossiper.instance.replacedEndpoint(existing);
        assert tokenMetadata.sortedTokens().size() > 0;
        Auth.setup();
    }
    else
    {
        logger.info("Startup complete, but write survey mode is active, not becoming an active ring member. Use JMX (StorageService->joinRing()) to finalize ring joining.");
    }
}
/**
 * Publishes this node's datacenter and rack through gossip.
 *
 * Both values are looked up from the configured endpoint snitch for this node's broadcast
 * address and are then added as the local DC and RACK application states, in that order.
 */
public void gossipSnitchInfo()
{
    final InetAddress self = FBUtilities.getBroadcastAddress();
    final IEndpointSnitch endpointSnitch = DatabaseDescriptor.getEndpointSnitch();

    final String localDatacenter = endpointSnitch.getDatacenter(self);
    final String localRack = endpointSnitch.getRack(self);

    Gossiper.instance.addLocalApplicationState(
            ApplicationState.DC,
            StorageService.instance.valueFactory.datacenter(localDatacenter));
    Gossiper.instance.addLocalApplicationState(
            ApplicationState.RACK,
            StorageService.instance.valueFactory.rack(localRack));
}
/**
 * Joins the token ring at operator request.
 *
 * If the node has not joined yet, the regular join procedure is run with no delay. If it has
 * joined but is still in write survey mode, the saved tokens are activated, the bootstrap state
 * is marked completed and survey mode is switched off. Otherwise nothing happens.
 *
 * @throws IOException carrying only the message of a ConfigurationException raised while joining
 */
public synchronized void joinRing() throws IOException
{
    if (joined)
    {
        if (!isSurveyMode)
            return;

        // already joined, but only observing writes: promote to a full ring member
        setTokens(SystemKeyspace.getSavedTokens());
        SystemKeyspace.setBootstrapState(SystemKeyspace.BootstrapState.COMPLETED);
        isSurveyMode = false;
        logger.info("Leaving write survey mode and joining ring at operator request");
        assert tokenMetadata.sortedTokens().size() > 0;
        Auth.setup();
        return;
    }

    logger.info("Joining ring by operator request");
    try
    {
        joinTokenRing(0);
    }
    catch (ConfigurationException configurationProblem)
    {
        // only the message is propagated, not the project-specific exception type
        throw new IOException(configurationProblem.getMessage());
    }
}
/**
* @return the current value of the {@code joined} flag
*/
public boolean isJoined()
{
return joined;
}
/**
 * Rebuilds the local ranges of every non-system keyspace by streaming them from other nodes,
 * and blocks until the streaming has finished.
 *
 * Sources are filtered through the failure detector; when {@code sourceDc} is given, they are
 * additionally restricted to that datacenter.
 *
 * @param sourceDc name of the datacenter to stream from, or null to allow any datacenter
 * @throws RuntimeException if the wait is interrupted (the thread's interrupt status is restored
 *         first) or if streaming fails; only the message of the failure cause is carried along
 */
public void rebuild(String sourceDc)
{
    logger.info("rebuild from dc: {}", sourceDc == null ? "(any dc)" : sourceDc);

    RangeStreamer streamer = new RangeStreamer(tokenMetadata, FBUtilities.getBroadcastAddress(), "Rebuild");
    streamer.addSourceFilter(new RangeStreamer.FailureDetectorSourceFilter(FailureDetector.instance));
    if (sourceDc != null)
        streamer.addSourceFilter(new RangeStreamer.SingleDatacenterFilter(DatabaseDescriptor.getEndpointSnitch(), sourceDc));

    for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
        streamer.addRanges(keyspaceName, getLocalRanges(keyspaceName));

    try
    {
        streamer.fetchAsync().get();
    }
    catch (InterruptedException e)
    {
        // Catching InterruptedException clears the interrupt flag; set it again so that code
        // further up the stack can still observe that this thread was asked to stop.
        Thread.currentThread().interrupt();
        throw new RuntimeException("Interrupted while waiting on rebuild streaming");
    }
    catch (ExecutionException e)
    {
        // This is used exclusively through JMX, so log the full trace but only throw a simple RTE
        logger.error("Error while rebuilding node", e.getCause());
        throw new RuntimeException("Error while rebuilding node: " + e.getCause().getMessage());
    }
}
/**
* Sets the outbound streaming throttle by delegating to
* {@code DatabaseDescriptor.setStreamThroughputOutboundMegabitsPerSec} and logs the new value.
*
* @param value the new throttle value, passed through unchanged
*/
public void setStreamThroughputMbPerSec(int value)
{
DatabaseDescriptor.setStreamThroughputOutboundMegabitsPerSec(value);
logger.info("setstreamthroughput: throttle set to {}", value);
}
/**
* @return the value reported by {@code DatabaseDescriptor.getStreamThroughputOutboundMegabitsPerSec()}
*/
public int getStreamThroughputMbPerSec()
{
return DatabaseDescriptor.getStreamThroughputOutboundMegabitsPerSec();
}
/**
* @return the value reported by {@code DatabaseDescriptor.getCompactionThroughputMbPerSec()}
*/
public int getCompactionThroughputMbPerSec()
{
return DatabaseDescriptor.getCompactionThroughputMbPerSec();
}
/**
* Sets the compaction throughput by delegating to
* {@code DatabaseDescriptor.setCompactionThroughputMbPerSec}.
*
* @param value the new throughput value, passed through unchanged
*/
public void setCompactionThroughputMbPerSec(int value)
{
DatabaseDescriptor.setCompactionThroughputMbPerSec(value);
}
/**
* @return the value reported by {@code DatabaseDescriptor.isIncrementalBackupsEnabled()}
*/
public boolean isIncrementalBackupsEnabled()
{
return DatabaseDescriptor.isIncrementalBackupsEnabled();
}
/**
* Switches incremental backups on or off by delegating to
* {@code DatabaseDescriptor.setIncrementalBackupsEnabled}.
*
* @param value the new setting, passed through unchanged
*/
public void setIncrementalBackupsEnabled(boolean value)
{
DatabaseDescriptor.setIncrementalBackupsEnabled(value);
}
/**
* Sets the operation mode without an accompanying message.
*
* @param m the new operation mode
* @param log true to log the change at info level, false to log it at debug level
*/
private void setMode(Mode m, boolean log)
{
setMode(m, null, log);
}
/**
 * Records the new operation mode and logs it.
 *
 * @param m the new operation mode
 * @param msg optional detail appended to the mode as "mode: msg"; when null only the mode is logged
 * @param log true to log at info level, false to log at debug level
 */
private void setMode(Mode m, String msg, boolean log)
{
    operationMode = m;

    final String description;
    if (msg == null)
        description = m.toString();
    else
        description = String.format("%s: %s", m, msg);

    if (!log)
    {
        logger.debug(description);
        return;
    }
    logger.info(description);
}
/**
* Bootstraps this node with the given tokens: persists them, prepares gossip / token metadata
* depending on whether this node replaces another one, and then runs a {@code BootStrapper}.
* Sets {@code isBootstrapMode} to true on entry.
*
* @param tokens the tokens this node is bootstrapping with
* @throws IllegalStateException if the gossiper has not seen any seed
*/
private void bootstrap(Collection tokens)
{
isBootstrapMode = true;
SystemKeyspace.updateTokens(tokens); // DON'T use setToken, that makes us part of the ring locally which is incorrect until we are done bootstrapping
if (!DatabaseDescriptor.isReplacing())
{
// if not an existing token then bootstrap
// announce both the tokens and the bootstrapping status in a single gossip update
List> states = new ArrayList>();
states.add(Pair.create(ApplicationState.TOKENS, valueFactory.tokens(tokens)));
states.add(Pair.create(ApplicationState.STATUS, valueFactory.bootstrapping(tokens)));
Gossiper.instance.addLocalApplicationStates(states);
setMode(Mode.JOINING, "sleeping " + RING_DELAY + " ms for pending range setup", true);
Uninterruptibles.sleepUninterruptibly(RING_DELAY, TimeUnit.MILLISECONDS);
}
else
{
// Don't set any gossip state for a node that is bootstrapping an existing token: take over the
// tokens in the local token metadata and drop the replaced address from the system keyspace.
tokenMetadata.updateNormalTokens(tokens, FBUtilities.getBroadcastAddress());
SystemKeyspace.removeEndpoint(DatabaseDescriptor.getReplaceAddress());
}
if (!Gossiper.instance.seenAnySeed())
throw new IllegalStateException("Unable to contact any seeds!");
setMode(Mode.JOINING, "Starting to bootstrap...", true);
new BootStrapper(FBUtilities.getBroadcastAddress(), tokens, tokenMetadata).bootstrap(); // handles token update
logger.info("Bootstrap completed! for the tokens {}", tokens);
}
/**
* @return the current value of the {@code isBootstrapMode} flag
*/
public boolean isBootstrapMode()
{
return isBootstrapMode;
}
/**
* @return the {@code tokenMetadata} instance held by this service (the live object, not a copy)
*/
public TokenMetadata getTokenMetadata()
{
return tokenMetadata;
}
/**
* Adds the given increment to the compaction severity tracked for this node by the
* background monitor.
*
* @param incr amount to add; passed unchanged to {@code bgMonitor.incrCompactionSeverity}
*/
public void reportSeverity(double incr)
{
bgMonitor.incrCompactionSeverity(incr);
}
/**
* Adds the given increment to the manual severity tracked for this node by the background monitor.
*
* @param incr amount to add; passed unchanged to {@code bgMonitor.incrManualSeverity}
*/
public void reportManualSeverity(double incr)
{
bgMonitor.incrManualSeverity(incr);
}
/**
* @param endpoint the node to query
* @return the severity the background monitor reports for {@code endpoint}
*/
public double getSeverity(InetAddress endpoint)
{
return bgMonitor.getSeverity(endpoint);
}
/**
* For a keyspace, return the ranges and the corresponding endpoint addresses in string form.
* Each key is the list form of a range ({@code asList()}); each value is the result of
* {@code stringify} applied to that range's endpoints from {@code getRangeToAddressMap}.
*
* @param keyspace the keyspace to look up; forwarded to {@code getRangeToAddressMap}
* @return the endpoint map
*/
public Map, List> getRangeToEndpointMap(String keyspace)
{
/* All the ranges for the tokens */
Map, List> map = new HashMap<>();
for (Map.Entry,List> entry : getRangeToAddressMap(keyspace).entrySet())
{
map.put(entry.getKey().asList(), stringify(entry.getValue()));
}
return map;
}
/**
 * Return the rpc address associated with an endpoint as a string.
 *
 * For the local node this is the configured broadcast rpc address. For any other node it is the
 * RPC_ADDRESS value found in that node's gossip state. When gossip holds no state for the
 * endpoint, or the state carries no RPC_ADDRESS, the endpoint's own host address is returned.
 *
 * @param endpoint The endpoint to get rpc address for
 * @return the rpc address
 */
public String getRpcaddress(InetAddress endpoint)
{
    if (endpoint.equals(FBUtilities.getBroadcastAddress()))
        return DatabaseDescriptor.getBroadcastRpcAddress().getHostAddress();

    // Look the gossip state up once: a second lookup could observe a different (or removed) state,
    // and the gossiper may not know the endpoint at all.
    EndpointState epState = Gossiper.instance.getEndpointStateForEndpoint(endpoint);
    VersionedValue rpcAddress = epState == null ? null : epState.getApplicationState(ApplicationState.RPC_ADDRESS);
    if (rpcAddress == null)
        return endpoint.getHostAddress();
    return rpcAddress.value;
}
/**
* For a keyspace, return the ranges and the corresponding RPC addresses.
* Each key is the list form of a range ({@code asList()}); each value holds one
* {@code getRpcaddress} result per endpoint of that range, in the endpoints' order.
*
* @param keyspace the keyspace to look up; forwarded to {@code getRangeToAddressMap}
* @return the endpoint map
*/
public Map, List> getRangeToRpcaddressMap(String keyspace)
{
/* All the ranges for the tokens */
Map, List> map = new HashMap<>();
for (Map.Entry, List> entry : getRangeToAddressMap(keyspace).entrySet())
{
List rpcaddrs = new ArrayList<>(entry.getValue().size());
for (InetAddress endpoint: entry.getValue())
{
rpcaddrs.add(getRpcaddress(endpoint));
}
map.put(entry.getKey().asList(), rpcaddrs);
}
return map;
}
/**
* For a keyspace, return the pending ranges known to the token metadata together with the
* stringified endpoints of each range.
*
* @param keyspace the keyspace to look up; null selects the first non-system keyspace
* @return map from the list form of each pending range to the {@code stringify} result of its endpoints
*/
public Map, List> getPendingRangeToEndpointMap(String keyspace)
{
// some people just want to get a visual representation of things. Allow null and set it to the first
// non-system keyspace.
// NOTE(review): get(0) presumably fails when there is no non-system keyspace at all - confirm
if (keyspace == null)
keyspace = Schema.instance.getNonSystemKeyspaces().get(0);
Map, List> map = new HashMap<>();
for (Map.Entry, Collection> entry : tokenMetadata.getPendingRanges(keyspace).entrySet())
{
// copy the endpoint collection into a list before handing it to stringify
List l = new ArrayList<>(entry.getValue());
map.put(entry.getKey().asList(), stringify(l));
}
return map;
}
/**
* Returns the range-to-endpoints map of a keyspace, built from all sorted tokens in the token metadata.
*
* @param keyspace the keyspace to look up; null is handled by the two-argument overload
* @return map from each range to its endpoints
*/
public Map, List> getRangeToAddressMap(String keyspace)
{
return getRangeToAddressMap(keyspace, tokenMetadata.sortedTokens());
}
/**
* Returns the range-to-endpoints map of a keyspace restricted to the local datacenter: ranges are
* built only from tokens owned by local-DC nodes, and each endpoint list keeps only local-DC endpoints.
*
* @param keyspace the keyspace to look up; forwarded to the two-argument {@code getRangeToAddressMap}
* @return a new map from each range to its endpoints in the local datacenter
*/
public Map, List> getRangeToAddressMapInLocalDC(String keyspace)
{
// adapter so that isLocalDC(InetAddress) can be used as a filter predicate
Predicate isLocalDC = new Predicate()
{
public boolean apply(InetAddress address)
{
return isLocalDC(address);
}
};
Map, List> origMap = getRangeToAddressMap(keyspace, getTokensInLocalDC());
Map, List> filteredMap = Maps.newHashMap();
for (Map.Entry, List> entry : origMap.entrySet())
{
// materialize the filtered endpoints into a new list
List endpointsInLocalDC = Lists.newArrayList(Collections2.filter(entry.getValue(), isLocalDC));
filteredMap.put(entry.getKey(), endpointsInLocalDC);
}
return filteredMap;
}
/**
* Collects, in the order of {@code tokenMetadata.sortedTokens()}, the tokens whose owning
* endpoint is in the local datacenter.
*
* @return a new list with the local-DC tokens
*/
private List getTokensInLocalDC()
{
List filteredTokens = Lists.newArrayList();
for (Token token : tokenMetadata.sortedTokens())
{
InetAddress endpoint = tokenMetadata.getEndpoint(token);
if (isLocalDC(endpoint))
filteredTokens.add(token);
}
return filteredTokens;
}
/**
 * Tells whether a host is in the same datacenter as this node, according to the endpoint snitch.
 *
 * @param targetHost the host to check
 * @return true if the snitch reports the same datacenter for {@code targetHost} and for this
 *         node's broadcast address
 */
private boolean isLocalDC(InetAddress targetHost)
{
    final IEndpointSnitch endpointSnitch = DatabaseDescriptor.getEndpointSnitch();
    final String datacenterOfTarget = endpointSnitch.getDatacenter(targetHost);
    final String datacenterOfThisNode = endpointSnitch.getDatacenter(FBUtilities.getBroadcastAddress());
    return datacenterOfTarget.equals(datacenterOfThisNode);
}
/**
* Builds the range-to-endpoints map of a keyspace from the given tokens.
*
* @param keyspace the keyspace to look up; null selects the first non-system keyspace
* @param sortedTokens the tokens from which the ranges are derived via {@code getAllRanges}
* @return map from each range to its endpoints, as built by {@code constructRangeToEndpointMap}
*/
private Map, List> getRangeToAddressMap(String keyspace, List sortedTokens)
{
// some people just want to get a visual representation of things. Allow null and set it to the first
// non-system keyspace.
if (keyspace == null)
keyspace = Schema.instance.getNonSystemKeyspaces().get(0);
List> ranges = getAllRanges(sortedTokens);
return constructRangeToEndpointMap(keyspace, ranges);
}
/**
* The same as {@code describeRing(String)} but converts TokenRange to the String for JMX compatibility
*
* @param keyspace The keyspace to fetch information about
*
* @return a List of TokenRange(s) converted to String for the given keyspace
*
* @throws IOException carrying only the message of the InvalidRequestException raised by
* {@code describeRing(String)}
*/
public List describeRingJMX(String keyspace) throws IOException
{
List tokenRanges;
try
{
tokenRanges = describeRing(keyspace);
}
catch (InvalidRequestException e)
{
// rethrown as a plain IOException that carries only the message
throw new IOException(e.getMessage());
}
List result = new ArrayList<>(tokenRanges.size());
for (TokenRange tokenRange : tokenRanges)
result.add(tokenRange.toString());
return result;
}
/**
* The TokenRange for a given keyspace, considering the whole ring (not only the local DC).
*
* @param keyspace The keyspace to fetch information about
*
* @return a List of TokenRange(s) for the given keyspace
*
* @throws InvalidRequestException if there is no ring information available about keyspace
*/
public List describeRing(String keyspace) throws InvalidRequestException
{
return describeRing(keyspace, false);
}
/**
* The same as {@code describeRing(String)} but considers only the part of the ring formed by nodes in the local DC.
*
* @param keyspace The keyspace to fetch information about
*
* @return a List of TokenRange(s) for the given keyspace, restricted to the local DC
*
* @throws InvalidRequestException if there is no ring information available about keyspace
*/
public List describeLocalRing(String keyspace) throws InvalidRequestException
{
return describeRing(keyspace, true);
}
/**
* Builds one TokenRange per range of the keyspace, listing for every replica its host address,
* its rpc address and its datacenter / rack as reported by the endpoint snitch.
*
* @param keyspace The keyspace to fetch information about
* @param includeOnlyLocalDC true to use {@code getRangeToAddressMapInLocalDC}, false to use
* {@code getRangeToAddressMap}
*
* @return a List of TokenRange(s) for the given keyspace
*
* @throws InvalidRequestException if the keyspace is not known to the schema, or if its
* replication strategy is a LocalStrategy
*/
private List describeRing(String keyspace, boolean includeOnlyLocalDC) throws InvalidRequestException
{
if (!Schema.instance.getKeyspaces().contains(keyspace))
throw new InvalidRequestException("No such keyspace: " + keyspace);
// NOTE(review): the null check below runs only after the contains() check above; a null keyspace
// presumably never gets this far - confirm
if (keyspace == null || Keyspace.open(keyspace).getReplicationStrategy() instanceof LocalStrategy)
throw new InvalidRequestException("There is no ring for the keyspace: " + keyspace);
List ranges = new ArrayList<>();
Token.TokenFactory tf = getPartitioner().getTokenFactory();
Map, List> rangeToAddressMap =
includeOnlyLocalDC
? getRangeToAddressMapInLocalDC(keyspace)
: getRangeToAddressMap(keyspace);
for (Map.Entry, List> entry : rangeToAddressMap.entrySet())
{
Range range = entry.getKey();
List addresses = entry.getValue();
// three parallel lists, filled in the same order as the addresses
List endpoints = new ArrayList<>(addresses.size());
List rpc_endpoints = new ArrayList<>(addresses.size());
List epDetails = new ArrayList<>(addresses.size());
for (InetAddress endpoint : addresses)
{
EndpointDetails details = new EndpointDetails();
details.host = endpoint.getHostAddress();
details.datacenter = DatabaseDescriptor.getEndpointSnitch().getDatacenter(endpoint);
details.rack = DatabaseDescriptor.getEndpointSnitch().getRack(endpoint);
endpoints.add(details.host);
rpc_endpoints.add(getRpcaddress(endpoint));
epDetails.add(details);
}
// range bounds are rendered through the partitioner's token factory
TokenRange tr = new TokenRange(tf.toString(range.left.getToken()), tf.toString(range.right.getToken()), endpoints)
.setEndpoint_details(epDetails)
.setRpc_endpoints(rpc_endpoints);
ranges.add(tr);
}
return ranges;
}
/**
* Returns the normal and bootstrapping tokens known to the token metadata together with the host
* address of the endpoint mapped to each of them.
*
* @return map from token (as string) to host address, iterating in sorted token order
*/
public Map getTokenToEndpointMap()
{
Map mapInetAddress = tokenMetadata.getNormalAndBootstrappingTokenToEndpointMap();
// in order to preserve tokens in ascending order, we use LinkedHashMap here
Map mapString = new LinkedHashMap<>(mapInetAddress.size());
List tokens = new ArrayList<>(mapInetAddress.keySet());
Collections.sort(tokens);
for (Token token : tokens)
{
mapString.put(token.toString(), mapInetAddress.get(token).getHostAddress());
}
return mapString;
}
/**
 * @return the string form of the host ID that the token metadata holds for this node's
 *         broadcast address
 */
public String getLocalHostId()
{
    final TokenMetadata metadata = getTokenMetadata();
    return metadata.getHostId(FBUtilities.getBroadcastAddress()).toString();
}
/**
* Returns the endpoint-to-host-ID mapping of the token metadata in string form.
*
* @return a new map from each endpoint's host address to the string form of its host ID
*/
public Map getHostIdMap()
{
Map mapOut = new HashMap<>();
for (Map.Entry entry : getTokenMetadata().getEndpointToHostIdMapForReading().entrySet())
mapOut.put(entry.getKey().getHostAddress(), entry.getValue().toString());
return mapOut;
}
/**
* Construct the range to endpoint mapping based on the true view
* of the world.
* @param keyspace the keyspace whose replication strategy is asked for the endpoints
* @param ranges the ranges to map; each range is resolved through its right bound
* @return mapping of ranges to the replicas responsible for them.
*/
private Map, List> constructRangeToEndpointMap(String keyspace, List> ranges)
{
Map, List> rangeToEndpointMap = new HashMap<>();
for (Range range : ranges)
{
rangeToEndpointMap.put(range, Keyspace.open(keyspace).getReplicationStrategy().getNaturalEndpoints(range.right));
}
return rangeToEndpointMap;
}
/**
* Intentionally does nothing; all parameters are ignored.
*/
public void beforeChange(InetAddress endpoint, EndpointState currentState, ApplicationState newStateKey, VersionedValue newValue)
{
// no-op
}
/*
* Handle the reception of a new particular ApplicationState for a particular endpoint. Note that the value of the
* ApplicationState has not necessarily "changed" since the last known value, if we already received the same update
* from somewhere else.
*
* onChange only ever sees one ApplicationState piece change at a time (even if many ApplicationState updates were
* received at the same time), so we perform a kind of state machine here. We are concerned with two events: knowing
* the token associated with an endpoint, and knowing its operation mode. Nodes can start in either bootstrap or
* normal mode, and from bootstrap mode can change mode to normal. A node in bootstrap mode needs to have
* pendingranges set in TokenMetadata; a node in normal mode should instead be part of the token ring.
*
* Normal progression of ApplicationState.STATUS values for a node should be like this:
* STATUS_BOOTSTRAPPING,token
* if bootstrapping. stays this way until all files are received.
* STATUS_NORMAL,token
* ready to serve reads and writes.
* STATUS_LEAVING,token
* get ready to leave the cluster as part of a decommission
* STATUS_LEFT,token
* set after decommission is completed.
*
* Other STATUS values that may be seen (possibly anywhere in the normal progression):
* STATUS_MOVING,newtoken
* set if node is currently moving to a new token in the ring
* REMOVING_TOKEN,deadtoken
* set if the node is dead and is being removed by its REMOVAL_COORDINATOR
* REMOVED_TOKEN,deadtoken
* set if the node is dead and has been removed by its REMOVAL_COORDINATOR
*
* Note: Any time a node state changes from STATUS_NORMAL, it will not be visible to new nodes. So it follows that
* you should never bootstrap a new node during a removenode, decommission or move.
*/
// Dispatches a single application-state update: STATUS values go to the matching handleState*
// method, the other handled states are persisted as peer info in the system keyspace.
public void onChange(InetAddress endpoint, ApplicationState state, VersionedValue value)
{
if (state.equals(ApplicationState.STATUS))
{
String apStateValue = value.value;
// limit -1 keeps trailing empty pieces
String[] pieces = apStateValue.split(VersionedValue.DELIMITER_STR, -1);
assert (pieces.length > 0);
// the first piece names the status; any remaining pieces are its arguments
String moveName = pieces[0];
switch (moveName)
{
case VersionedValue.STATUS_BOOTSTRAPPING:
handleStateBootstrap(endpoint);
break;
case VersionedValue.STATUS_NORMAL:
handleStateNormal(endpoint);
break;
case VersionedValue.REMOVING_TOKEN:
case VersionedValue.REMOVED_TOKEN:
handleStateRemoving(endpoint, pieces);
break;
case VersionedValue.STATUS_LEAVING:
handleStateLeaving(endpoint);
break;
case VersionedValue.STATUS_LEFT:
handleStateLeft(endpoint, pieces);
break;
case VersionedValue.STATUS_MOVING:
handleStateMoving(endpoint, pieces);
break;
// any other status name is ignored
}
}
else
{
// non-STATUS updates are only recorded for endpoints the gossiper knows and does not consider dead
EndpointState epState = Gossiper.instance.getEndpointStateForEndpoint(endpoint);
if (epState == null || Gossiper.instance.isDeadState(epState))
{
logger.debug("Ignoring state change for dead or unknown endpoint: {}", endpoint);
return;
}
switch (state)
{
case RELEASE_VERSION:
SystemKeyspace.updatePeerInfo(endpoint, "release_version", value.value);
break;
case DC:
SystemKeyspace.updatePeerInfo(endpoint, "data_center", value.value);
break;
case RACK:
SystemKeyspace.updatePeerInfo(endpoint, "rack", value.value);
break;
case RPC_ADDRESS:
try
{
SystemKeyspace.updatePeerInfo(endpoint, "rpc_address", InetAddress.getByName(value.value));
}
catch (UnknownHostException e)
{
throw new RuntimeException(e);
}
break;
case SCHEMA:
// besides recording the version, schedule a schema pull from that endpoint
SystemKeyspace.updatePeerInfo(endpoint, "schema_version", UUID.fromString(value.value));
MigrationManager.instance.scheduleSchemaPull(endpoint, epState);
break;
case HOST_ID:
SystemKeyspace.updatePeerInfo(endpoint, "host_id", UUID.fromString(value.value));
break;
// other application states are not persisted here
}
}
}
/**
* Persists every handled application state currently held in the endpoint's gossip state as peer
* info in the system keyspace. Unlike {@code onChange}, the SCHEMA case does not schedule a schema pull.
*
* @param endpoint the peer whose gossip state is copied
*/
private void updatePeerInfo(InetAddress endpoint)
{
// NOTE(review): epState is dereferenced without a null check; presumably callers only pass
// endpoints the gossiper knows - confirm
EndpointState epState = Gossiper.instance.getEndpointStateForEndpoint(endpoint);
for (Map.Entry entry : epState.getApplicationStateMap().entrySet())
{
switch (entry.getKey())
{
case RELEASE_VERSION:
SystemKeyspace.updatePeerInfo(endpoint, "release_version", entry.getValue().value);
break;
case DC:
SystemKeyspace.updatePeerInfo(endpoint, "data_center", entry.getValue().value);
break;
case RACK:
SystemKeyspace.updatePeerInfo(endpoint, "rack", entry.getValue().value);
break;
case RPC_ADDRESS:
try
{
SystemKeyspace.updatePeerInfo(endpoint, "rpc_address", InetAddress.getByName(entry.getValue().value));
}
catch (UnknownHostException e)
{
throw new RuntimeException(e);
}
break;
case SCHEMA:
SystemKeyspace.updatePeerInfo(endpoint, "schema_version", UUID.fromString(entry.getValue().value));
break;
case HOST_ID:
SystemKeyspace.updatePeerInfo(endpoint, "host_id", UUID.fromString(entry.getValue().value));
break;
// other application states are not persisted
}
}
}
/**
 * Reads one application state of an endpoint from gossip and returns its value as bytes,
 * encoded with ISO_8859_1.
 *
 * @param endpoint the node whose gossip state is read
 * @param appstate the application state to fetch
 * @return the state's string value encoded as ISO_8859_1 bytes
 */
private byte[] getApplicationStateValue(InetAddress endpoint, ApplicationState appstate)
{
    final EndpointState endpointState = Gossiper.instance.getEndpointStateForEndpoint(endpoint);
    final String rawValue = endpointState.getApplicationState(appstate).value;
    return rawValue.getBytes(ISO_8859_1);
}
/**
* Deserializes the tokens an endpoint advertises in its TOKENS application state.
*
* @param endpoint the node whose tokens are read from gossip
* @return the tokens deserialized with the current partitioner
* @throws RuntimeException wrapping any IOException raised while deserializing
*/
private Collection getTokensFor(InetAddress endpoint)
{
try
{
return TokenSerializer.deserialize(getPartitioner(), new DataInputStream(new ByteArrayInputStream(getApplicationStateValue(endpoint, ApplicationState.TOKENS))));
}
catch (IOException e)
{
throw new RuntimeException(e);
}
}
/**
* Handle node bootstrap: registers the node's tokens as bootstrap tokens in the token metadata
* and triggers a pending range update.
*
* @param endpoint bootstrapping node
*/
private void handleStateBootstrap(InetAddress endpoint)
{
Collection tokens;
// explicitly check for TOKENS, because a bootstrapping node might be bootstrapping in legacy mode; that is, not using vnodes and no token specified
tokens = getTokensFor(endpoint);
if (logger.isDebugEnabled())
logger.debug("Node {} state bootstrapping, token {}", endpoint, tokens);
// if this node is present in token metadata, either we have missed intermediate states
// or the node had crashed. Print warning if needed, clear obsolete stuff and
// continue.
if (tokenMetadata.isMember(endpoint))
{
// If isLeaving is false, we have missed both LEAVING and LEFT. However, if
// isLeaving is true, we have only missed LEFT. Waiting time between completing
// leave operation and rebootstrapping is relatively short, so the latter is quite
// common (not enough time for gossip to spread). Therefore we report only the
// former in the log.
if (!tokenMetadata.isLeaving(endpoint))
logger.info("Node {} state jump to bootstrap", endpoint);
tokenMetadata.removeEndpoint(endpoint);
}
tokenMetadata.addBootstrapTokens(tokens, endpoint);
PendingRangeCalculatorService.instance.update();
// record the host ID as well when the endpoint uses one
if (Gossiper.instance.usesHostId(endpoint))
tokenMetadata.updateHostId(Gossiper.instance.getHostId(endpoint), endpoint);
}
/**
* Handle node move to normal state. That is, node is entering token ring and participating
* in reads.
*
* Resolves host ID collisions and token ownership conflicts (by comparing endpoint startup via
* the gossiper), updates the token metadata and the system keyspace, removes endpoints that lost
* out, and notifies the lifecycle subscribers.
*
* @param endpoint node
*/
private void handleStateNormal(final InetAddress endpoint)
{
Collection tokens;
tokens = getTokensFor(endpoint);
// changes are collected first and applied after the token loop below
Set tokensToUpdateInMetadata = new HashSet<>();
Set tokensToUpdateInSystemKeyspace = new HashSet<>();
// NOTE(review): nothing in this method ever adds to localTokensToRemove, so the
// updateLocalTokens call further down is never reached from here
Set localTokensToRemove = new HashSet<>();
Set endpointsToRemove = new HashSet<>();
if (logger.isDebugEnabled())
logger.debug("Node {} state normal, token {}", endpoint, tokens);
if (tokenMetadata.isMember(endpoint))
logger.info("Node {} state jump to normal", endpoint);
updatePeerInfo(endpoint);
// Order Matters, TM.updateHostID() should be called before TM.updateNormalToken(), (see CASSANDRA-4300).
if (Gossiper.instance.usesHostId(endpoint))
{
UUID hostId = Gossiper.instance.getHostId(endpoint);
InetAddress existing = tokenMetadata.getEndpointForHostId(hostId);
// the host ID belongs to the node this node is currently replacing: leave the metadata alone
if (DatabaseDescriptor.isReplacing() && Gossiper.instance.getEndpointStateForEndpoint(DatabaseDescriptor.getReplaceAddress()) != null && (hostId.equals(Gossiper.instance.getHostId(DatabaseDescriptor.getReplaceAddress()))))
logger.warn("Not updating token metadata for {} because I am replacing it", endpoint);
else
{
// the host ID is already mapped to a different endpoint: decide which of the two keeps it
if (existing != null && !existing.equals(endpoint))
{
if (existing.equals(FBUtilities.getBroadcastAddress()))
{
logger.warn("Not updating host ID {} for {} because it's mine", hostId, endpoint);
tokenMetadata.removeEndpoint(endpoint);
endpointsToRemove.add(endpoint);
}
else if (Gossiper.instance.compareEndpointStartup(endpoint, existing) > 0)
{
logger.warn("Host ID collision for {} between {} and {}; {} is the new owner", hostId, existing, endpoint, endpoint);
tokenMetadata.removeEndpoint(existing);
endpointsToRemove.add(existing);
tokenMetadata.updateHostId(hostId, endpoint);
}
else
{
logger.warn("Host ID collision for {} between {} and {}; ignored {}", hostId, existing, endpoint, endpoint);
tokenMetadata.removeEndpoint(endpoint);
endpointsToRemove.add(endpoint);
}
}
else
tokenMetadata.updateHostId(hostId, endpoint);
}
}
for (final Token token : tokens)
{
// we don't want to update if this node is responsible for the token and it has a later startup time than endpoint.
InetAddress currentOwner = tokenMetadata.getEndpoint(token);
if (currentOwner == null)
{
logger.debug("New node {} at token {}", endpoint, token);
tokensToUpdateInMetadata.add(token);
if (!isClientMode)
tokensToUpdateInSystemKeyspace.add(token);
}
else if (endpoint.equals(currentOwner))
{
// set state back to normal, since the node may have tried to leave, but failed and is now back up
tokensToUpdateInMetadata.add(token);
if (!isClientMode)
tokensToUpdateInSystemKeyspace.add(token);
}
else if (Gossiper.instance.compareEndpointStartup(endpoint, currentOwner) > 0)
{
tokensToUpdateInMetadata.add(token);
if (!isClientMode)
tokensToUpdateInSystemKeyspace.add(token);
// currentOwner is no longer current, endpoint is. Keep track of these moves, because when
// a host no longer has any tokens, we'll want to remove it.
Multimap epToTokenCopy = getTokenMetadata().getEndpointToTokenMapForReading();
epToTokenCopy.get(currentOwner).remove(token);
if (epToTokenCopy.get(currentOwner).size() < 1)
endpointsToRemove.add(currentOwner);
logger.info(String.format("Nodes %s and %s have the same token %s. %s is the new owner",
endpoint,
currentOwner,
token,
endpoint));
}
else
{
logger.info(String.format("Nodes %s and %s have the same token %s. Ignoring %s",
endpoint,
currentOwner,
token,
endpoint));
}
}
boolean isMoving = tokenMetadata.isMoving(endpoint); // capture because updateNormalTokens clears moving status
tokenMetadata.updateNormalTokens(tokensToUpdateInMetadata, endpoint);
for (InetAddress ep : endpointsToRemove)
{
removeEndpoint(ep);
if (DatabaseDescriptor.isReplacing() && DatabaseDescriptor.getReplaceAddress().equals(ep))
Gossiper.instance.replacementQuarantine(ep); // quarantine locally longer than normally; see CASSANDRA-8260
}
if (!tokensToUpdateInSystemKeyspace.isEmpty())
SystemKeyspace.updateTokens(endpoint, tokensToUpdateInSystemKeyspace);
if (!localTokensToRemove.isEmpty())
SystemKeyspace.updateLocalTokens(Collections.emptyList(), localTokensToRemove);
// a node that was moving gets onMove, any other node gets onJoinCluster (never in client mode)
if (isMoving)
{
tokenMetadata.removeFromMoving(endpoint);
if (!isClientMode)
{
for (IEndpointLifecycleSubscriber subscriber : lifecycleSubscribers)
subscriber.onMove(endpoint);
}
}
else
{
if (!isClientMode)
{
for (IEndpointLifecycleSubscriber subscriber : lifecycleSubscribers)
subscriber.onJoinCluster(endpoint);
}
}
PendingRangeCalculatorService.instance.update();
}
/**
* Handle node preparing to leave the ring: makes sure the token metadata knows the node with the
* tokens it advertises, marks it as leaving and triggers a pending range update.
*
* @param endpoint node
*/
private void handleStateLeaving(InetAddress endpoint)
{
Collection tokens;
tokens = getTokensFor(endpoint);
if (logger.isDebugEnabled())
logger.debug("Node {} state leaving, tokens {}", endpoint, tokens);
// If the node is previously unknown or tokens do not match, update tokenmetadata to
// have this node as 'normal' (it must have been using this token before the
// leave). This way we'll get pending ranges right.
if (!tokenMetadata.isMember(endpoint))
{
logger.info("Node {} state jump to leaving", endpoint);
tokenMetadata.updateNormalTokens(tokens, endpoint);
}
else if (!tokenMetadata.getTokens(endpoint).containsAll(tokens))
{
logger.warn("Node {} 'leaving' token mismatch. Long network partition?", endpoint);
tokenMetadata.updateNormalTokens(tokens, endpoint);
}
// at this point the endpoint is certainly a member with this token, so let's proceed
// normally
tokenMetadata.addLeavingEndpoint(endpoint);
PendingRangeCalculatorService.instance.update();
}
/**
* Handle node leaving the ring. This will happen when a node is decommissioned
*
* The tokens to excise are read from the endpoint's gossip state, not from {@code pieces};
* {@code pieces} is only used for the expire time.
*
* @param endpoint If reason for leaving is decommission, endpoint is the leaving node.
* @param pieces STATE_LEFT,token; the expire time is read from index 2 by {@code extractExpireTime}
*/
private void handleStateLeft(InetAddress endpoint, String[] pieces)
{
// NOTE(review): the assert admits two pieces, while extractExpireTime reads pieces[2] - confirm
// which message layouts are expected here
assert pieces.length >= 2;
Collection tokens;
tokens = getTokensFor(endpoint);
if (logger.isDebugEnabled())
logger.debug("Node {} state left, tokens {}", endpoint, tokens);
excise(tokens, endpoint, extractExpireTime(pieces));
}
/**
 * Handle node moving inside the ring.
 *
 * The target token is parsed from the second piece with the partitioner's token factory, the
 * endpoint is registered as moving to it, and a pending range update is triggered.
 *
 * @param endpoint moving endpoint address
 * @param pieces STATE_MOVING, token
 */
private void handleStateMoving(InetAddress endpoint, String[] pieces)
{
    assert pieces.length >= 2;

    final Token.TokenFactory tokenFactory = getPartitioner().getTokenFactory();
    final Token destination = tokenFactory.fromString(pieces[1]);

    if (logger.isDebugEnabled())
        logger.debug("Node {} state moving, new token {}", endpoint, destination);

    tokenMetadata.addMovingEndpoint(destination, endpoint);
    PendingRangeCalculatorService.instance.update();
}
/**
* Handle notification that a node is being actively removed from the ring via 'removenode'
*
* If the notification is about this node itself, the node drains and returns. For a known member,
* REMOVED_TOKEN excises it, while REMOVING_TOKEN marks it as leaving and starts restoring the
* replica count. An unknown endpoint is removed from gossip.
*
* @param endpoint node
* @param pieces either REMOVED_TOKEN (node is gone) or REMOVING_TOKEN (replicas need to be restored)
*/
private void handleStateRemoving(InetAddress endpoint, String[] pieces)
{
assert (pieces.length > 0);
if (endpoint.equals(FBUtilities.getBroadcastAddress()))
{
logger.info("Received removenode gossip about myself. Is this node rejoining after an explicit removenode?");
try
{
drain();
}
catch (Exception e)
{
throw new RuntimeException(e);
}
return;
}
if (tokenMetadata.isMember(endpoint))
{
String state = pieces[0];
// the tokens come from the local token metadata, not from the gossip message
Collection removeTokens = tokenMetadata.getTokens(endpoint);
if (VersionedValue.REMOVED_TOKEN.equals(state))
{
excise(removeTokens, endpoint, extractExpireTime(pieces));
}
else if (VersionedValue.REMOVING_TOKEN.equals(state))
{
if (logger.isDebugEnabled())
logger.debug("Tokens {} removed manually (endpoint was {})", removeTokens, endpoint);
// Note that the endpoint is being removed
tokenMetadata.addLeavingEndpoint(endpoint);
PendingRangeCalculatorService.instance.update();
// find the endpoint coordinating this removal that we need to notify when we're done
// the coordinator's host ID is the second piece of the REMOVAL_COORDINATOR value
String[] coordinator = Gossiper.instance.getEndpointStateForEndpoint(endpoint).getApplicationState(ApplicationState.REMOVAL_COORDINATOR).value.split(VersionedValue.DELIMITER_STR, -1);
UUID hostId = UUID.fromString(coordinator[1]);
// grab any data we are now responsible for and notify responsible node
restoreReplicaCount(endpoint, tokenMetadata.getEndpointForHostId(hostId));
}
}
else // now that the gossiper has told us about this nonexistent member, notify the gossiper to remove it
{
if (VersionedValue.REMOVED_TOKEN.equals(pieces[0]))
addExpireTimeIfFound(endpoint, extractExpireTime(pieces));
removeEndpoint(endpoint);
}
}
/**
* Removes an endpoint from the cluster's local view: deletes the hints stored for it, removes it
* from gossip (and, outside client mode, from the system keyspace), drops it and the given
* bootstrap tokens from the token metadata, notifies the lifecycle subscribers (outside client
* mode) and triggers a pending range update.
*
* @param tokens the tokens to remove from the bootstrap tokens
* @param endpoint the node to remove
*/
private void excise(Collection tokens, InetAddress endpoint)
{
logger.info("Removing tokens {} for {}", tokens, endpoint);
HintedHandOffManager.instance.deleteHintsForEndpoint(endpoint);
removeEndpoint(endpoint);
tokenMetadata.removeEndpoint(endpoint);
tokenMetadata.removeBootstrapTokens(tokens);
if (!isClientMode)
{
for (IEndpointLifecycleSubscriber subscriber : lifecycleSubscribers)
subscriber.onLeaveCluster(endpoint);
}
PendingRangeCalculatorService.instance.update();
}
/**
* Registers the expire time for the endpoint (when it is non-zero) and then excises the endpoint.
*
* @param tokens the tokens to remove from the bootstrap tokens
* @param endpoint the node to remove
* @param expireTime expire time to register with the gossiper; 0 means none
*/
private void excise(Collection tokens, InetAddress endpoint, long expireTime)
{
addExpireTimeIfFound(endpoint, expireTime);
excise(tokens, endpoint);
}
/**
 * Unlike excise, this only makes the endpoint disappear, without going through any
 * notifications: it is removed from the gossiper and, unless running in client mode, from the
 * system keyspace as well.
 *
 * @param endpoint the node to remove
 */
private void removeEndpoint(InetAddress endpoint)
{
    Gossiper.instance.removeEndpoint(endpoint);
    if (isClientMode)
        return;
    SystemKeyspace.removeEndpoint(endpoint);
}
/**
 * Registers an expire time for an endpoint with the gossiper, unless the expire time is 0,
 * which is treated as "none given".
 *
 * @param endpoint the node the expire time applies to
 * @param expireTime the expire time to register; 0 leaves the gossiper untouched
 */
protected void addExpireTimeIfFound(InetAddress endpoint, long expireTime)
{
    if (expireTime == 0L)
        return;
    Gossiper.instance.addExpireTimeForEndpoint(endpoint, expireTime);
}
/**
 * Extracts the expire time from the pieces of a status value.
 *
 * The expire time is the third piece (index 2). Callers only guarantee that one or two pieces
 * are present, so a value without a third piece yields 0, which addExpireTimeIfFound treats as
 * "no expire time" instead of failing with an ArrayIndexOutOfBoundsException.
 *
 * @param pieces the status value split at the delimiter
 * @return the expire time parsed from {@code pieces[2]}, or 0 if there is no such piece
 * @throws NumberFormatException if the third piece is present but is not a valid long
 */
protected long extractExpireTime(String[] pieces)
{
    if (pieces.length < 3)
        return 0L;
    return Long.parseLong(pieces[2]);
}
/**
* Finds living endpoints responsible for the given ranges
*
* For every range, at most one source is chosen: the first endpoint, in the snitch's proximity
* order relative to this node, that the failure detector reports alive. A range without any
* alive source gets no entry in the result.
*
* @param keyspaceName the keyspace ranges belong to
* @param ranges the ranges to find sources for
* @return multimap of addresses to ranges the address is responsible for
*/
private Multimap> getNewSourceRanges(String keyspaceName, Set> ranges)
{
InetAddress myAddress = FBUtilities.getBroadcastAddress();
// range ownership is computed on a clone of the token map
Multimap, InetAddress> rangeAddresses = Keyspace.open(keyspaceName).getReplicationStrategy().getRangeAddresses(tokenMetadata.cloneOnlyTokenMap());
Multimap> sourceRanges = HashMultimap.create();
IFailureDetector failureDetector = FailureDetector.instance;
// find alive sources for our new ranges
for (Range range : ranges)
{
Collection possibleRanges = rangeAddresses.get(range);
IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
List sources = snitch.getSortedListByProximity(myAddress, possibleRanges);
assert (!sources.contains(myAddress));
for (InetAddress source : sources)
{
if (failureDetector.isAlive(source))
{
sourceRanges.put(source, range);
// one alive source per range is enough
break;
}
}
}
return sourceRanges;
}
/**
* Sends a notification to a node indicating we have finished replicating data.
*
* The message is re-sent for as long as the failure detector considers the remote node alive and
* the response wait times out; the method returns after the first wait that completes, or once the
* remote node is no longer considered alive.
*
* @param remote node to send notification to
*/
private void sendReplicationNotification(InetAddress remote)
{
// notify the remote token
MessageOut msg = new MessageOut(MessagingService.Verb.REPLICATION_FINISHED);
IFailureDetector failureDetector = FailureDetector.instance;
if (logger.isDebugEnabled())
logger.debug("Notifying {} of replication completion\n", remote);
while (failureDetector.isAlive(remote))
{
AsyncOneResponse iar = MessagingService.instance().sendRR(msg, remote);
try
{
// wait for the response for at most the configured rpc timeout
iar.get(DatabaseDescriptor.getRpcTimeout(), TimeUnit.MILLISECONDS);
return; // done
}
catch(TimeoutException e)
{
// try again
}
}
}
/**
* Called when an endpoint is removed from the ring. This function checks
* whether this node becomes responsible for new ranges as a
* consequence and streams data if needed.
*
* This is rather inefficient, but it does not matter so much
* since this is called very seldom
*
* The stream plan is executed asynchronously; {@code notifyEndpoint} is notified from the
* completion callback whether streaming succeeded or failed.
*
* @param endpoint the node that left
* @param notifyEndpoint the node to notify once streaming has finished
*/
private void restoreReplicaCount(InetAddress endpoint, final InetAddress notifyEndpoint)
{
// keyspace name -> (source endpoint -> ranges to request from it)
Multimap>>> rangesToFetch = HashMultimap.create();
InetAddress myAddress = FBUtilities.getBroadcastAddress();
for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
{
Multimap, InetAddress> changedRanges = getChangedRangesForLeaving(keyspaceName, endpoint);
// keep only the changed ranges that are assigned to this node
Set> myNewRanges = new HashSet<>();
for (Map.Entry, InetAddress> entry : changedRanges.entries())
{
if (entry.getValue().equals(myAddress))
myNewRanges.add(entry.getKey());
}
Multimap> sourceRanges = getNewSourceRanges(keyspaceName, myNewRanges);
for (Map.Entry>> entry : sourceRanges.asMap().entrySet())
{
rangesToFetch.put(keyspaceName, entry);
}
}
StreamPlan stream = new StreamPlan("Restore replica count");
for (String keyspaceName : rangesToFetch.keySet())
{
for (Map.Entry>> entry : rangesToFetch.get(keyspaceName))
{
InetAddress source = entry.getKey();
// the preferred IP recorded for the source in the system keyspace is passed along as well
InetAddress preferred = SystemKeyspace.getPreferredIP(source);
Collection> ranges = entry.getValue();
if (logger.isDebugEnabled())
logger.debug("Requesting from {} ranges {}", source, StringUtils.join(ranges, ", "));
stream.requestRanges(source, preferred, keyspaceName, ranges);
}
}
StreamResultFuture future = stream.execute();
Futures.addCallback(future, new FutureCallback()
{
public void onSuccess(StreamState finalState)
{
sendReplicationNotification(notifyEndpoint);
}
public void onFailure(Throwable t)
{
logger.warn("Streaming to restore replica count failed", t);
// We still want to send the notification
sendReplicationNotification(notifyEndpoint);
}
});
}
// needs to be modified to accept either a keyspace or ARS.
private Multimap, InetAddress> getChangedRangesForLeaving(String keyspaceName, InetAddress endpoint)
{
// First get all ranges the leaving endpoint is responsible for
Collection> ranges = getRangesForEndpoint(keyspaceName, endpoint);
if (logger.isDebugEnabled())
logger.debug("Node {} ranges [{}]", endpoint, StringUtils.join(ranges, ", "));
Map, List> currentReplicaEndpoints = new HashMap<>();
// Find (for each range) all nodes that store replicas for these ranges as well
TokenMetadata metadata = tokenMetadata.cloneOnlyTokenMap(); // don't do this in the loop! #7758
for (Range range : ranges)
currentReplicaEndpoints.put(range, Keyspace.open(keyspaceName).getReplicationStrategy().calculateNaturalEndpoints(range.right, metadata));
TokenMetadata temp = tokenMetadata.cloneAfterAllLeft();
// endpoint might or might not be 'leaving'. If it was not leaving (that is, removenode
// command was used), it is still present in temp and must be removed.
if (temp.isMember(endpoint))
temp.removeEndpoint(endpoint);
Multimap, InetAddress> changedRanges = HashMultimap.create();
// Go through the ranges and for each range check who will be
// storing replicas for these ranges when the leaving endpoint
// is gone. Whoever is present in newReplicaEndpoints list, but
// not in the currentReplicaEndpoints list, will be needing the
// range.
for (Range range : ranges)
{
Collection newReplicaEndpoints = Keyspace.open(keyspaceName).getReplicationStrategy().calculateNaturalEndpoints(range.right, temp);
newReplicaEndpoints.removeAll(currentReplicaEndpoints.get(range));
if (logger.isDebugEnabled())
if (newReplicaEndpoints.isEmpty())
logger.debug("Range {} already in all replicas", range);
else
logger.debug("Range {} will be responsibility of {}", range, StringUtils.join(newReplicaEndpoints, ", "));
changedRanges.putAll(range, newReplicaEndpoints);
}
return changedRanges;
}
/**
 * Replays every application state of the joining endpoint through
 * {@code onChange} and then schedules a schema pull from that endpoint.
 *
 * @param endpoint the endpoint that joined
 * @param epState its endpoint state
 */
public void onJoin(InetAddress endpoint, EndpointState epState)
{
    for (Map.Entry<ApplicationState, VersionedValue> entry : epState.getApplicationStateMap().entrySet())
    {
        onChange(endpoint, entry.getKey(), entry.getValue());
    }
    MigrationManager.instance.scheduleSchemaPull(endpoint, epState);
}
/**
 * Schedules a schema pull from the endpoint. Unless this node runs in client
 * mode, and provided the endpoint is a ring member, it also schedules hint
 * delivery to it and tells every lifecycle subscriber that it is up.
 *
 * @param endpoint the endpoint reported alive
 * @param state its endpoint state
 */
public void onAlive(InetAddress endpoint, EndpointState state)
{
    MigrationManager.instance.scheduleSchemaPull(endpoint, state);

    // nothing more to do in client mode or for endpoints that are not ring members
    if (isClientMode || !tokenMetadata.isMember(endpoint))
    {
        return;
    }

    HintedHandOffManager.instance.scheduleHintDelivery(endpoint, true);
    for (IEndpointLifecycleSubscriber listener : lifecycleSubscribers)
    {
        listener.onUp(endpoint);
    }
}
/**
 * Drops the endpoint from the token metadata and then asks the pending range
 * calculator to update.
 *
 * @param endpoint the endpoint that was removed
 */
public void onRemove(InetAddress endpoint)
{
    // the token metadata must be changed before the pending ranges are recomputed
    tokenMetadata.removeEndpoint(endpoint);

    PendingRangeCalculatorService.instance.update();
}
/**
 * Convicts the endpoint with the messaging service and, unless this node runs
 * in client mode, tells every lifecycle subscriber that the endpoint is down.
 *
 * @param endpoint the endpoint reported dead
 * @param state its endpoint state (not used here)
 */
public void onDead(InetAddress endpoint, EndpointState state)
{
    MessagingService.instance().convict(endpoint);

    if (isClientMode)
    {
        return;
    }

    for (IEndpointLifecycleSubscriber listener : lifecycleSubscribers)
    {
        listener.onDown(endpoint);
    }
}
/**
 * Handles an endpoint restart by treating a still-alive endpoint as dead.
 *
 * @param endpoint the endpoint that restarted
 * @param state its endpoint state
 */
public void onRestart(InetAddress endpoint, EndpointState state)
{
    if (!state.isAlive())
    {
        return;
    }

    // If we have restarted before the node was even marked down, we need to reset the connection pool
    onDead(endpoint, state);
}
/**
 * Raw load value: the sum of the live disk space used by every column family
 * store of every keyspace known to the schema.
 *
 * @return the summed live disk space
 */
public double getLoad()
{
    double total = 0;
    for (String name : Schema.instance.getKeyspaces())
    {
        Keyspace ks = Schema.instance.getKeyspaceInstance(name);
        // keyspaces without an instance contribute nothing
        if (ks != null)
        {
            for (ColumnFamilyStore store : ks.getColumnFamilyStores())
            {
                total += store.getLiveDiskSpaceUsed();
            }
        }
    }
    return total;
}
/**
 * @return the value of {@link #getLoad()} formatted as a file size string
 */
public String getLoadString()
{
    double rawLoad = getLoad();
    return FileUtils.stringifyFileSize(rawLoad);
}
/**
 * Builds a map from host address to formatted load, using the load info held
 * by the load broadcaster plus this node's own load.
 *
 * @return map of host address to load string
 */
public Map<String, String> getLoadMap()
{
    Map<String, String> map = new HashMap<>();
    for (Map.Entry<InetAddress, Double> entry : LoadBroadcaster.instance.getLoadInfo().entrySet())
    {
        map.put(entry.getKey().getHostAddress(), FileUtils.stringifyFileSize(entry.getValue()));
    }
    // gossiper doesn't see its own updates, so we need to special-case the local node
    map.put(FBUtilities.getBroadcastAddress().getHostAddress(), getLoadString());
    return map;
}
/**
 * Asks the hinted handoff manager to schedule hint delivery to the given host.
 *
 * @param host the host to deliver hints to
 * @throws UnknownHostException declared by the scheduling call
 */
public final void deliverHints(String host) throws UnknownHostException
{
    HintedHandOffManager.instance.scheduleHintDelivery(host);
}
/**
 * Returns the tokens saved in the system keyspace for this node.
 *
 * @return the saved tokens; asserted to be non-null and non-empty
 */
public Collection<Token> getLocalTokens()
{
    Collection<Token> tokens = SystemKeyspace.getSavedTokens();
    assert tokens != null && !tokens.isEmpty(); // should not be called before initServer sets this
    return tokens;
}
/* These methods belong to the MBean interface */

/**
 * @return the tokens of this node (looked up by its broadcast address), as strings
 */
public List<String> getTokens()
{
    return getTokens(FBUtilities.getBroadcastAddress());
}
/**
 * @param endpoint host name or address, resolved with {@link InetAddress#getByName(String)}
 * @return the tokens of that endpoint, as strings
 * @throws UnknownHostException if the endpoint cannot be resolved
 */
public List<String> getTokens(String endpoint) throws UnknownHostException
{
    return getTokens(InetAddress.getByName(endpoint));
}
/**
 * @param endpoint the endpoint whose tokens are wanted
 * @return the string form of every token the token metadata holds for that endpoint
 */
private List<String> getTokens(InetAddress endpoint)
{
    List<String> strTokens = new ArrayList<>();
    for (Token tok : getTokenMetadata().getTokens(endpoint))
        strTokens.add(tok.toString());
    return strTokens;
}
/**
 * @return the release version string reported by {@code FBUtilities}
 */
public String getReleaseVersion()
{
    return FBUtilities.getReleaseVersionString();
}
/**
 * @return the string form of the current schema version
 */
public String getSchemaVersion()
{
    return Schema.instance.getVersion().toString();
}
/**
 * @return host addresses of the endpoints the token metadata lists as leaving
 */
public List<String> getLeavingNodes()
{
    return stringify(tokenMetadata.getLeavingEndpoints());
}
/**
 * @return host addresses of the endpoints the token metadata lists as moving
 */
public List<String> getMovingNodes()
{
    List<String> endpoints = new ArrayList<>();

    // each moving entry is a pair whose right-hand side is the endpoint address
    for (Pair<Token, InetAddress> node : tokenMetadata.getMovingEndpoints())
    {
        endpoints.add(node.right.getHostAddress());
    }

    return endpoints;
}
/**
 * @return host addresses of the endpoints that hold bootstrap tokens in the token metadata
 */
public List<String> getJoiningNodes()
{
    return stringify(tokenMetadata.getBootstrapTokens().valueSet());
}
/**
 * @return host addresses of the members the gossiper reports as live
 */
public List<String> getLiveNodes()
{
    return stringify(Gossiper.instance.getLiveMembers());
}
/**
 * @return host addresses of the members the gossiper reports as unreachable
 */
public List<String> getUnreachableNodes()
{
    return stringify(Gossiper.instance.getUnreachableMembers());
}
/**
 * Returns the canonical path of every configured data file location.
 * <p>
 * The result is a new array; the array handed out by {@code DatabaseDescriptor}
 * is only read, never overwritten.
 * NOTE(review): this assumes no caller relied on the configured array being
 * canonicalized in place as a side effect; confirm.
 *
 * @return canonical data file locations, in configured order
 */
public String[] getAllDataFileLocations()
{
    String[] configured = DatabaseDescriptor.getAllDataFileLocations();
    String[] canonical = new String[configured.length];
    for (int i = 0; i < configured.length; i++)
        canonical[i] = FileUtils.getCanonicalPath(configured[i]);
    return canonical;
}
/**
 * @return the canonical path of the configured commit log location
 */
public String getCommitLogLocation()
{
    String configured = DatabaseDescriptor.getCommitLogLocation();
    return FileUtils.getCanonicalPath(configured);
}
/**
 * @return the canonical path of the configured saved caches location
 */
public String getSavedCachesLocation()
{
    String configured = DatabaseDescriptor.getSavedCachesLocation();
    return FileUtils.getCanonicalPath(configured);
}
/**
 * Converts endpoints to their host address strings, preserving iteration order.
 *
 * @param endpoints the endpoints to convert
 * @return a new list holding the host address of each endpoint
 */
private List<String> stringify(Iterable<InetAddress> endpoints)
{
    List<String> stringEndpoints = new ArrayList<>();
    for (InetAddress ep : endpoints)
    {
        stringEndpoints.add(ep.getHostAddress());
    }
    return stringEndpoints;
}
/**
 * @return the gossiper's current generation number for this node's broadcast address
 */
public int getCurrentGenerationNumber()
{
    InetAddress self = FBUtilities.getBroadcastAddress();
    return Gossiper.instance.getCurrentGenerationNumber(self);
}
/**
 * Forces a cleanup on the selected column families of a keyspace.
 *
 * @param keyspaceName the keyspace to clean up; the system keyspace is rejected
 * @param columnFamilies the column families to clean up, passed on to {@code getValidColumnFamilies}
 * @return the status code of the last non-successful cleanup, or of SUCCESSFUL if all succeeded
 * @throws RuntimeException if {@code keyspaceName} is the system keyspace
 */
public int forceKeyspaceCleanup(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
    if (keyspaceName.equals(Keyspace.SYSTEM_KS))
    {
        throw new RuntimeException("Cleanup of the system keyspace is neither necessary nor wise");
    }

    final CompactionManager.AllSSTableOpStatus ok = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
    CompactionManager.AllSSTableOpStatus overall = ok;
    for (ColumnFamilyStore store : getValidColumnFamilies(false, false, keyspaceName, columnFamilies))
    {
        CompactionManager.AllSSTableOpStatus result = store.forceCleanup();
        // a later failure overrides an earlier one; a success never resets a failure
        overall = (result == ok) ? overall : result;
    }
    return overall.statusCode;
}
/**
 * Scrubs the selected column families of a keyspace.
 *
 * @param disableSnapshot passed through to each store's scrub
 * @param skipCorrupted passed through to each store's scrub
 * @param keyspaceName the keyspace to scrub
 * @param columnFamilies the column families to scrub, passed on to {@code getValidColumnFamilies}
 * @return the status code of the last non-successful scrub, or of SUCCESSFUL if all succeeded
 */
public int scrub(boolean disableSnapshot, boolean skipCorrupted, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
    final CompactionManager.AllSSTableOpStatus ok = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
    CompactionManager.AllSSTableOpStatus overall = ok;
    for (ColumnFamilyStore store : getValidColumnFamilies(false, false, keyspaceName, columnFamilies))
    {
        CompactionManager.AllSSTableOpStatus result = store.scrub(disableSnapshot, skipCorrupted);
        // a later failure overrides an earlier one; a success never resets a failure
        overall = (result == ok) ? overall : result;
    }
    return overall.statusCode;
}
/**
 * Rewrites the sstables of the selected column families of a keyspace; index
 * names are accepted and secondary indexes are added automatically.
 *
 * @param keyspaceName the keyspace to upgrade
 * @param excludeCurrentVersion passed through to each store's rewrite
 * @param columnFamilies the column families to upgrade, passed on to {@code getValidColumnFamilies}
 * @return the status code of the last non-successful rewrite, or of SUCCESSFUL if all succeeded
 */
public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
    final CompactionManager.AllSSTableOpStatus ok = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
    CompactionManager.AllSSTableOpStatus overall = ok;
    for (ColumnFamilyStore store : getValidColumnFamilies(true, true, keyspaceName, columnFamilies))
    {
        CompactionManager.AllSSTableOpStatus result = store.sstablesRewrite(excludeCurrentVersion);
        // a later failure overrides an earlier one; a success never resets a failure
        overall = (result == ok) ? overall : result;
    }
    return overall.statusCode;
}
/**
 * Forces a major compaction on the selected column families of a keyspace;
 * index names are accepted.
 *
 * @param keyspaceName the keyspace to compact
 * @param columnFamilies the column families to compact, passed on to {@code getValidColumnFamilies}
 */
public void forceKeyspaceCompaction(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
    for (ColumnFamilyStore store : getValidColumnFamilies(true, false, keyspaceName, columnFamilies))
        store.forceMajorCompaction();
}
/**
 * Takes the snapshot for the given keyspaces. A snapshot name must be specified.
 * <p>
 * All selected keyspaces are first checked for an existing snapshot with this
 * tag, so that nothing is snapshotted if any of them already has one.
 *
 * @param tag the tag given to the snapshot; may not be null or empty
 * @param keyspaceNames the names of the keyspaces to snapshot; empty means "all."
 * @throws IOException if the node is still joining, the tag is missing, a keyspace
 *                     does not exist, or the snapshot already exists
 */
public void takeSnapshot(String tag, String... keyspaceNames) throws IOException
{
    if (operationMode.equals(Mode.JOINING))
        throw new IOException("Cannot snapshot until bootstrap completes");
    if (tag == null || tag.isEmpty())
        throw new IOException("You must supply a snapshot name.");

    Iterable<Keyspace> keyspaces;
    if (keyspaceNames.length == 0)
    {
        keyspaces = Keyspace.all();
    }
    else
    {
        List<Keyspace> requested = new ArrayList<>(keyspaceNames.length);
        for (String keyspaceName : keyspaceNames)
            requested.add(getValidKeyspace(keyspaceName));
        keyspaces = requested;
    }

    // Do a check to see if this snapshot exists before we actually snapshot
    for (Keyspace keyspace : keyspaces)
        if (keyspace.snapshotExists(tag))
            throw new IOException("Snapshot " + tag + " already exists.");

    for (Keyspace keyspace : keyspaces)
        keyspace.snapshot(tag, null);
}
/**
 * Snapshots a single column family under the given snapshot name.
 *
 * @param keyspaceName the keyspace which holds the specified column family
 * @param columnFamilyName the column family to snapshot
 * @param tag the tag given to the snapshot; may not be null or empty
 * @throws IOException if an argument is missing, the node is still joining, the
 *                     keyspace does not exist, or the snapshot already exists
 * @throws IllegalArgumentException if the column family name contains a dot,
 *                                  i.e. names a secondary index
 */
public void takeColumnFamilySnapshot(String keyspaceName, String columnFamilyName, String tag) throws IOException
{
    // the order of these checks decides which error a caller sees first
    if (keyspaceName == null)
    {
        throw new IOException("You must supply a keyspace name");
    }
    if (operationMode.equals(Mode.JOINING))
    {
        throw new IOException("Cannot snapshot until bootstrap completes");
    }
    if (columnFamilyName == null)
    {
        throw new IOException("You must supply a column family name");
    }
    if (columnFamilyName.contains("."))
    {
        throw new IllegalArgumentException("Cannot take a snapshot of a secondary index by itself. Run snapshot on the column family that owns the index.");
    }
    if (tag == null || tag.isEmpty())
    {
        throw new IOException("You must supply a snapshot name.");
    }

    Keyspace target = getValidKeyspace(keyspaceName);
    if (target.snapshotExists(tag))
    {
        throw new IOException("Snapshot " + tag + " already exists.");
    }
    target.snapshot(tag, columnFamilyName);
}
/**
 * Opens a keyspace after checking that the schema knows it.
 *
 * @param keyspaceName name of the keyspace to open
 * @return the opened keyspace
 * @throws IOException if the schema has no keyspace with that name
 */
private Keyspace getValidKeyspace(String keyspaceName) throws IOException
{
    boolean known = Schema.instance.getKeyspaces().contains(keyspaceName);
    if (known)
        return Keyspace.open(keyspaceName);

    throw new IOException("Keyspace " + keyspaceName + " does not exist");
}
/**
 * Remove the snapshot with the given name from the given keyspaces.
 * If no tag is specified we will remove all snapshots.
 * <p>
 * Candidate keyspaces are the entries found directly under each data file
 * location. A location that cannot be listed is skipped.
 *
 * @param tag the snapshot name; {@code null} is treated as the empty string
 * @param keyspaceNames the keyspaces to clear; empty means every entry found
 */
public void clearSnapshot(String tag, String... keyspaceNames) throws IOException
{
    if (tag == null)
        tag = "";

    // built once, rather than re-wrapping the array for every directory entry
    Set<String> requested = new HashSet<>(Arrays.asList(keyspaceNames));

    Set<String> keyspaces = new HashSet<>();
    for (String dataDir : DatabaseDescriptor.getAllDataFileLocations())
    {
        // File.list() returns null when the path is not a directory or an I/O error occurs
        String[] keyspaceDirs = new File(dataDir).list();
        if (keyspaceDirs == null)
            continue;

        for (String keyspaceDir : keyspaceDirs)
        {
            // Only add a ks if it has been specified as a param, assuming params were actually provided.
            if (requested.isEmpty() || requested.contains(keyspaceDir))
                keyspaces.add(keyspaceDir);
        }
    }

    for (String keyspace : keyspaces)
        Keyspace.clearSnapshot(tag, keyspace);

    if (logger.isDebugEnabled())
        logger.debug("Cleared out snapshot directories");
}
public Map getSnapshotDetails()
{
Map snapshotMap = new HashMap<>();
for (Keyspace keyspace : Keyspace.all())
{
if (Keyspace.SYSTEM_KS.equals(keyspace.getName()))
continue;
for (ColumnFamilyStore cfStore : keyspace.getColumnFamilyStores())
{
for (Map.Entry> snapshotDetail : cfStore.getSnapshotDetails().entrySet())
{
TabularDataSupport data = (TabularDataSupport)snapshotMap.get(snapshotDetail.getKey());
if (data == null)
{
data = new TabularDataSupport(SnapshotDetailsTabularData.TABULAR_TYPE);
snapshotMap.put(snapshotDetail.getKey(), data);
}
SnapshotDetailsTabularData.from(snapshotDetail.getKey(), keyspace.getName(), cfStore.getColumnFamilyName(), snapshotDetail, data);
}
}
}
return snapshotMap;
}
/**
 * @return the sum of {@code trueSnapshotsSize()} over every column family store
 *         outside the system keyspace
 */
public long trueSnapshotsSize()
{
    long sum = 0;
    for (Keyspace ks : Keyspace.all())
    {
        boolean isSystem = Keyspace.SYSTEM_KS.equals(ks.getName());
        if (!isSystem)
        {
            for (ColumnFamilyStore store : ks.getColumnFamilyStores())
                sum += store.trueSnapshotsSize();
        }
    }
    return sum;
}
/**
 * Resolves column family names of a keyspace to their stores.
 * <p>
 * With no names given, every store of the keyspace is returned. A name that
 * contains a dot is treated as {@code baseCf.index}; such names are skipped
 * with a warning unless {@code allowIndexes} is set.
 *
 * @param allowIndexes Allow index CF names to be passed in
 * @param autoAddIndexes Automatically add secondary indexes if a CF has them
 * @param keyspaceName keyspace
 * @param cfNames CFs
 * @return the selected column family stores
 * @throws java.lang.IllegalArgumentException when given CF name does not exist
 * @throws IOException if the keyspace does not exist
 */
public Iterable<ColumnFamilyStore> getValidColumnFamilies(boolean allowIndexes, boolean autoAddIndexes, String keyspaceName, String... cfNames) throws IOException
{
    Keyspace keyspace = getValidKeyspace(keyspaceName);
    Set<ColumnFamilyStore> valid = new HashSet<>();

    if (cfNames.length == 0)
    {
        // all stores are interesting
        for (ColumnFamilyStore cfStore : keyspace.getColumnFamilyStores())
        {
            valid.add(cfStore);
            if (autoAddIndexes)
            {
                for (SecondaryIndex si : cfStore.indexManager.getIndexes())
                {
                    if (si.getIndexCfs() != null)
                    {
                        logger.info("adding secondary index {} to operation", si.getIndexName());
                        valid.add(si.getIndexCfs());
                    }
                }
            }
        }
        return valid;
    }

    // filter out interesting stores
    for (String cfName : cfNames)
    {
        // if the CF name is an index, look the index up through the CF that owns it
        String baseCfName = cfName;
        String idxName = null;
        if (cfName.contains(".")) // secondary index
        {
            if (!allowIndexes)
            {
                logger.warn("Operation not allowed on secondary Index column family ({})", cfName);
                continue;
            }

            String[] parts = cfName.split("\\.", 2);
            baseCfName = parts[0];
            idxName = parts[1];
        }

        ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore(baseCfName);
        if (idxName != null)
        {
            // indexes are looked up by the full "baseCf.index" name
            Collection<SecondaryIndex> indexes = cfStore.indexManager.getIndexesByNames(new HashSet<>(Arrays.asList(cfName)));
            if (indexes.isEmpty())
                logger.warn(String.format("Invalid column family index specified: %s/%s. Proceeding with others.", baseCfName, idxName));
            else
                valid.add(Iterables.get(indexes, 0).getIndexCfs());
        }
        else
        {
            valid.add(cfStore);
            if (autoAddIndexes)
            {
                for (SecondaryIndex si : cfStore.indexManager.getIndexes())
                {
                    if (si.getIndexCfs() != null)
                    {
                        logger.info("adding secondary index {} to operation", si.getIndexName());
                        valid.add(si.getIndexCfs());
                    }
                }
            }
        }
    }
    return valid;
}
/**
 * Flushes the memtables of the selected column families of a keyspace,
 * blocking on each flush in turn. Index names are accepted.
 *
 * @param keyspaceName the keyspace to flush
 * @param columnFamilies the column families to flush, passed on to {@code getValidColumnFamilies}
 * @throws IOException if the keyspace does not exist
 */
public void forceKeyspaceFlush(String keyspaceName, String... columnFamilies) throws IOException
{
    for (ColumnFamilyStore store : getValidColumnFamilies(true, false, keyspaceName, columnFamilies))
    {
        logger.debug("Forcing flush on keyspace {}, CF {}", keyspaceName, store.name);
        store.forceBlockingFlush();
    }
}
/**
 * Builds a JMX notification and hands it to subscribers.
 * <p>
 * The notification source is {@code jmxObjectName}; its sequence number is the
 * next value of the notification serial counter.
 *
 * @param type Message type
 * @param message Message itself
 * @param userObject Arbitrary object to attach to notification
 */
public void sendNotification(String type, String message, Object userObject)
{
    long sequence = notificationSerialNumber.incrementAndGet();
    Notification outgoing = new Notification(type, jmxObjectName, sequence, message);
    outgoing.setUserData(userObject);
    sendNotification(outgoing);
}
/**
 * Boolean-parallelism variant: maps {@code isSequential} to the ordinal of
 * SEQUENTIAL or PARALLEL and delegates to the int-degree overload.
 *
 * @return the repair command number, or 0 if nothing was started
 */
public int forceRepairAsync(String keyspace, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, boolean primaryRange, boolean fullRepair, String... columnFamilies) throws IOException
{
    return forceRepairAsync(keyspace, isSequential ? RepairParallelism.SEQUENTIAL.ordinal() : RepairParallelism.PARALLEL.ordinal(), dataCenters, hosts, primaryRange, fullRepair, columnFamilies);
}
public int forceRepairAsync(String keyspace, int parallelismDegree, Collection dataCenters, Collection hosts, boolean primaryRange, boolean fullRepair, String... columnFamilies)
{
if (parallelismDegree < 0 || parallelismDegree > RepairParallelism.values().length - 1)
{
throw new IllegalArgumentException("Invalid parallelism degree specified: " + parallelismDegree);
}
Collection> ranges;
if (primaryRange)
{
// when repairing only primary range, neither dataCenters nor hosts can be set
if (dataCenters == null && hosts == null)
ranges = getPrimaryRanges(keyspace);
// except dataCenters only contain local DC (i.e. -local)
else if (dataCenters != null && dataCenters.size() == 1 && dataCenters.contains(DatabaseDescriptor.getLocalDataCenter()))
ranges = getPrimaryRangesWithinDC(keyspace);
else
throw new IllegalArgumentException("You need to run primary range repair on all nodes in the cluster.");
}
else
{
ranges = getLocalRanges(keyspace);
}
return forceRepairAsync(keyspace, RepairParallelism.values()[parallelismDegree], dataCenters, hosts, ranges, fullRepair, columnFamilies);
}
public int forceRepairAsync(String keyspace, boolean isSequential, Collection dataCenters, Collection hosts, Collection> ranges, boolean fullRepair, String... columnFamilies)
{
return forceRepairAsync(keyspace, isSequential ? RepairParallelism.SEQUENTIAL : RepairParallelism.PARALLEL, dataCenters, hosts, ranges, fullRepair, columnFamilies);
}
public int forceRepairAsync(String keyspace, RepairParallelism parallelismDegree, Collection dataCenters, Collection hosts, Collection> ranges, boolean fullRepair, String... columnFamilies)
{
if (ranges.isEmpty() || Keyspace.open(keyspace).getReplicationStrategy().getReplicationFactor() < 2)
return 0;
int cmd = nextRepairCommand.incrementAndGet();
if (ranges.size() > 0)
{
if (FBUtilities.isWindows() && parallelismDegree != RepairParallelism.PARALLEL)
{
logger.warn("Snapshot-based repair is not yet supported on Windows. Reverting to parallel repair.");
parallelismDegree = RepairParallelism.PARALLEL;
}
new Thread(createRepairTask(cmd, keyspace, ranges, parallelismDegree, dataCenters, hosts, fullRepair, columnFamilies)).start();
}
return cmd;
}
public int forceRepairAsync(String keyspace, boolean isSequential, boolean isLocal, boolean primaryRange, boolean fullRepair, String... columnFamilies)
{
Collection> ranges;
if (primaryRange)
{
ranges = isLocal ? getPrimaryRangesWithinDC(keyspace) : getPrimaryRanges(keyspace);
}
else
{
ranges = getLocalRanges(keyspace);
}
return forceRepairAsync(keyspace, isSequential, isLocal, ranges, fullRepair, columnFamilies);
}
public int forceRepairAsync(String keyspace, boolean isSequential, boolean isLocal, Collection> ranges, boolean fullRepair, String... columnFamilies)
{
return forceRepairAsync(keyspace, isSequential ? RepairParallelism.SEQUENTIAL : RepairParallelism.PARALLEL, isLocal, ranges, fullRepair, columnFamilies);
}
public int forceRepairAsync(String keyspace, RepairParallelism parallelismDegree, boolean isLocal, Collection> ranges, boolean fullRepair, String... columnFamilies)
{
if (ranges.isEmpty() || Keyspace.open(keyspace).getReplicationStrategy().getReplicationFactor() < 2)
return 0;
int cmd = nextRepairCommand.incrementAndGet();
if (FBUtilities.isWindows() && parallelismDegree != RepairParallelism.PARALLEL)
{
logger.warn("Snapshot-based repair is not yet supported on Windows. Reverting to parallel repair.");
parallelismDegree = RepairParallelism.PARALLEL;
}
new Thread(createRepairTask(cmd, keyspace, ranges, parallelismDegree, isLocal, fullRepair, columnFamilies)).start();
return cmd;
}
/**
 * Boolean-parallelism variant: maps {@code isSequential} to the ordinal of
 * SEQUENTIAL or PARALLEL and delegates to the int-degree overload.
 *
 * @return the repair command number, or 0 if nothing was started
 */
public int forceRepairRangeAsync(String beginToken, String endToken, String keyspaceName, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, boolean fullRepair, String... columnFamilies) throws IOException
{
    return forceRepairRangeAsync(beginToken, endToken, keyspaceName, isSequential ? RepairParallelism.SEQUENTIAL.ordinal() : RepairParallelism.PARALLEL.ordinal(), dataCenters, hosts, fullRepair, columnFamilies);
}
/**
 * Repairs the token range between two user-supplied tokens, after splitting it
 * along the ring layout.
 *
 * @param beginToken beginning token of the range, in string form
 * @param endToken end token of the range, in string form
 * @param parallelismDegree ordinal of the {@code RepairParallelism} value to use
 * @return the repair command number, or 0 if nothing was started
 * @throws IllegalArgumentException if the degree is out of range
 */
public int forceRepairRangeAsync(String beginToken, String endToken, String keyspaceName, int parallelismDegree, Collection<String> dataCenters, Collection<String> hosts, boolean fullRepair, String... columnFamilies)
{
    if (parallelismDegree < 0 || parallelismDegree > RepairParallelism.values().length - 1)
    {
        throw new IllegalArgumentException("Invalid parallelism degree specified: " + parallelismDegree);
    }
    Collection<Range<Token>> repairingRange = createRepairRangeFrom(beginToken, endToken);

    logger.info("starting user-requested repair of range {} for keyspace {} and column families {}",
                repairingRange, keyspaceName, columnFamilies);

    RepairParallelism parallelism = RepairParallelism.values()[parallelismDegree];
    return forceRepairAsync(keyspaceName, parallelism, dataCenters, hosts, repairingRange, fullRepair, columnFamilies);
}
/**
 * Repairs the token range between two user-supplied tokens, after splitting it
 * along the ring layout; variant with a local-only flag.
 *
 * @param beginToken beginning token of the range, in string form
 * @param endToken end token of the range, in string form
 * @return the repair command number, or 0 if nothing was started
 */
public int forceRepairRangeAsync(String beginToken, String endToken, String keyspaceName, boolean isSequential, boolean isLocal, boolean fullRepair, String... columnFamilies)
{
    Collection<Range<Token>> repairingRange = createRepairRangeFrom(beginToken, endToken);

    logger.info("starting user-requested repair of range {} for keyspace {} and column families {}",
                repairingRange, keyspaceName, columnFamilies);
    return forceRepairAsync(keyspaceName, isSequential, isLocal, repairingRange, fullRepair, columnFamilies);
}
/**
* Create collection of ranges that match ring layout from given tokens.
*
* @param beginToken beginning token of the range
* @param endToken end token of the range
* @return collection of ranges that match ring layout in TokenMetadata
*/
@SuppressWarnings("unchecked")
@VisibleForTesting
Collection> createRepairRangeFrom(String beginToken, String endToken)
{
Token parsedBeginToken = getPartitioner().getTokenFactory().fromString(beginToken);
Token parsedEndToken = getPartitioner().getTokenFactory().fromString(endToken);
// Break up given range to match ring layout in TokenMetadata
ArrayList> repairingRange = new ArrayList<>();
ArrayList tokens = new ArrayList<>(tokenMetadata.sortedTokens());
if (!tokens.contains(parsedBeginToken))
{
tokens.add(parsedBeginToken);
}
if (!tokens.contains(parsedEndToken))
{
tokens.add(parsedEndToken);
}
// tokens now contain all tokens including our endpoints
Collections.sort(tokens);
int start = tokens.indexOf(parsedBeginToken), end = tokens.indexOf(parsedEndToken);
for (int i = start; i != end; i = (i+1) % tokens.size())
{
Range range = new Range<>(tokens.get(i), tokens.get((i+1) % tokens.size()));
repairingRange.add(range);
}
return repairingRange;
}
private FutureTask