// org.adejanovski.cassandra.policies.DCAwareFailoverRoundRobinPolicy (Maven / Gradle / Ivy)
/*
* Copyright (C) 2012-2015 DataStax Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.adejanovski.cassandra.policies;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.AbstractIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.datastax.driver.core.Cluster;
import com.datastax.driver.core.Configuration;
import com.datastax.driver.core.ConsistencyLevel;
import com.datastax.driver.core.Host;
import com.datastax.driver.core.HostDistance;
import com.datastax.driver.core.Statement;
import com.datastax.driver.core.policies.CloseableLoadBalancingPolicy;
import com.datastax.driver.core.policies.LoadBalancingPolicy;
/**
 * A data-center aware round-robin load balancing policy with DC failover
 * support.
 *
 * This policy provides round-robin queries over the nodes of the local data
 * center. Query plans only contain hosts of the currently active data center
 * (the local one or, once a switch has occurred, the backup one); hosts of
 * the other data center are never appended to a plan. In other words, this
 * policy guarantees that no host in the backup data center will be queried
 * until the policy has switched to it.
 *
 * If used with a single data center, this policy is equivalent to the
 * RoundRobinPolicy, but its DC awareness incurs a
 * slight overhead so the RoundRobinPolicy could be
 * preferred to this policy in that case.
 *
 * On top of the DCAwareRoundRobinPolicy, this policy switches to the backup
 * DC once a defined number of nodes are down in the local DC. Unless
 * switch-back parameters are provided ({@code switchBackDelayFactor} and
 * {@code noSwitchBackDowntimeDelay}), the policy never switches back to the
 * local DC, in order to prevent inconsistencies and give ops teams the
 * ability to repair the local DC before switching back manually.
 *
 * @author adejanovski
 */
public class DCAwareFailoverRoundRobinPolicy implements LoadBalancingPolicy,
CloseableLoadBalancingPolicy {
// Class-wide SLF4J logger; used to report DC selection, switch and switch-back events.
private static final Logger logger =
        LoggerFactory.getLogger(DCAwareFailoverRoundRobinPolicy.class);
/**
 * Creates a fresh {@link Builder} used to configure and instantiate this
 * policy.
 *
 * @return a new builder instance.
 */
public static Builder builder() {
    return new Builder();
}
private final String UNSET = "";
private final ConcurrentMap> perDcLiveHosts = new ConcurrentHashMap>();
private final AtomicInteger index = new AtomicInteger();
volatile String localDc;
volatile String backupDc;
/**
* Current value of the switch threshold. if {@code hostDownSwitchThreshold}
* is lower than 0 then we must switch.
*/
private AtomicInteger hostDownSwitchThreshold = new AtomicInteger();
/**
* Initial value of the switch threshold
*/
private final int initHostDownSwitchThreshold;
/**
* flag to test if the switch as occurred
*/
private AtomicBoolean switchedToBackupDc = new AtomicBoolean(false);
/**
* Time at which the switch occurred
*/
private Date switchedToBackupDcAt;
/**
* Automatically switching back to local DC is possible after : downtime*{@code switchBackDelayFactor}
*/
private Float switchBackDelayFactor=(float)1000;
/**
* Downtime delay after which switching back cannot be automated (usually
* when hinted handoff window is reached) In seconds.
*/
private int noSwitchBackDowntimeDelay=0;
private Date localDcCameBackUpAt;
private boolean switchBackCanNeverHappen=false;
private volatile Configuration configuration;
/**
 * Builds a failover policy that never switches back automatically.
 *
 * The policy uses a local data-center and a backup data-center. The switch
 * to the backup DC is triggered automatically once
 * {@code hostDownSwitchThreshold} nodes of the local DC are down. This
 * constructor delegates with a switch-back factor of {@code -1.0} and a
 * maximum downtime of {@code 0} seconds.
 *
 * @param localDc the local datacenter
 * @param backupDc the backup datacenter
 * @param hostDownSwitchThreshold how many nodes have to be down before switching
 */
public DCAwareFailoverRoundRobinPolicy(String localDc, String backupDc,
        int hostDownSwitchThreshold) {
    this(localDc, backupDc, hostDownSwitchThreshold, -1.0f, 0);
}
/**
 * Builds a failover policy that may switch back to the local DC.
 *
 * The policy uses a local data-center and a backup data-center. The switch
 * to the backup DC is triggered automatically once
 * {@code hostDownSwitchThreshold} nodes of the local DC are down.
 * The policy switches back to the local DC when both conditions hold:
 * - the downtime lasted less than {@code noSwitchBackDowntimeDelay} seconds
 *   (hint window);
 * - the uptime since the local DC came back is greater than
 *   downtime*{@code switchBackDelayFactor} (gives hints enough time to be
 *   replayed).
 *
 * @param localDc the local datacenter ({@code null} means auto-discover)
 * @param backupDc the backup datacenter
 * @param hostDownSwitchThreshold how many nodes have to be down before switching
 * @param switchBackDelayFactor multiplier applied to the downtime to get the
 *        minimum uptime required before switching back
 * @param noSwitchBackDowntimeDelay maximum downtime, in seconds, for which a
 *        switch back to the local DC is still authorized
 */
public DCAwareFailoverRoundRobinPolicy(String localDc, String backupDc,
        int hostDownSwitchThreshold, float switchBackDelayFactor,
        int noSwitchBackDowntimeDelay) {
    // Missing datacenter names are replaced by the UNSET sentinel.
    this.localDc = (localDc != null) ? localDc : UNSET;
    this.backupDc = (backupDc != null) ? backupDc : UNSET;

    // Keep the initial threshold next to the live counter.
    this.initHostDownSwitchThreshold = hostDownSwitchThreshold;
    this.hostDownSwitchThreshold = new AtomicInteger(hostDownSwitchThreshold);

    this.noSwitchBackDowntimeDelay = noSwitchBackDowntimeDelay;
    this.switchBackDelayFactor = switchBackDelayFactor;
}
/**
 * Initializes the policy with the hosts known when the cluster starts.
 *
 * Registers every host in the per-DC live host map. When no local DC was
 * provided, the first host reporting a datacenter defines it. Hosts that
 * belong to neither the local nor the backup DC are reported, once each, in
 * a single warning.
 *
 * @param cluster the cluster this policy is attached to.
 * @param hosts the initial hosts to use.
 */
public void init(Cluster cluster, Collection<Host> hosts) {
    if (localDc != UNSET) {
        logger.info(
                "Using provided data-center name '{}' for DCAwareFailoverRoundRobinPolicy",
                localDc);
    }
    this.configuration = cluster.getConfiguration();

    List<String> notInLocalDC = new ArrayList<String>();
    for (Host host : hosts) {
        String dc = dc(host);
        logger.trace("node {} is in dc {}", host.getAddress(), dc);

        // If the localDC was in "auto-discover" mode and it's the first
        // host for which we have a DC, use it.
        if (localDc == UNSET && dc != UNSET) {
            logger.info(
                    "Using data-center name '{}' for DCAwareFailoverRoundRobinPolicy (if this is incorrect, please provide the correct datacenter name with DCAwareFailoverRoundRobinPolicy constructor)",
                    dc);
            localDc = dc;
        } else if (!dc.equals(localDc) && !dc.equals(backupDc)) {
            // Record the host exactly once so the warning has no duplicates.
            notInLocalDC.add(String.format("%s (%s)", host.toString(), dc));
        }

        CopyOnWriteArrayList<Host> prev = perDcLiveHosts.get(dc);
        if (prev == null) {
            perDcLiveHosts.put(dc, new CopyOnWriteArrayList<Host>(
                    Collections.singletonList(host)));
        } else {
            prev.addIfAbsent(host);
        }
    }

    if (!notInLocalDC.isEmpty()) {
        String nonLocalHosts = Joiner.on(",").join(notInLocalDC);
        logger.warn(
                "Some contact points don't match local or backup data center. Local DC = {} - backup DC {}. Non-conforming contact points: {}",
                localDc, backupDc, nonLocalHosts);
    }
}
/**
 * Resolves the datacenter name of a host, falling back to the current local
 * datacenter name when the host reports none.
 *
 * @param host the host to inspect.
 * @return the host's datacenter, or {@code localDc} if it is {@code null}.
 */
private String dc(Host host) {
    String reported = host.getDatacenter();
    if (reported == null) {
        return localDc;
    }
    return reported;
}
/**
 * Returns a new list holding the same elements as {@code list}, so that the
 * caller can iterate over a stable snapshot.
 *
 * @param list the list to copy.
 * @return a shallow copy of {@code list}.
 */
private static <T> CopyOnWriteArrayList<T> cloneList(
        CopyOnWriteArrayList<T> list) {
    // The copy constructor gives the same shallow copy as clone() without
    // needing an unchecked cast.
    return new CopyOnWriteArrayList<T>(list);
}
/**
 * Return the HostDistance for the provided host.
 *
 * Hosts of the currently active datacenter (the local DC, or the backup DC
 * once the switch has occurred) are {@code LOCAL}. Hosts of the other one of
 * these two datacenters are {@code REMOTE}. Hosts of any other datacenter
 * are {@code IGNORED}.
 *
 * Unless switching back has been ruled out, calling this method also runs
 * the switch-back check, which may switch the policy back to the local DC.
 *
 * @param host
 *            the host of which to return the distance of.
 * @return the HostDistance to {@code host}.
 */
public HostDistance distance(Host host) {
    final String hostDc = dc(host);

    // Give the policy a chance to switch back before classifying the host.
    if (!switchBackCanNeverHappen) {
        triggerBackSwitchIfNecessary();
    }

    if (isLocal(hostDc)) {
        return HostDistance.LOCAL;
    }

    // Only the local and backup DCs are kept reachable; everything else is ignored.
    boolean inManagedDc = hostDc.equals(localDc) || hostDc.equals(backupDc);
    return inManagedDc ? HostDistance.REMOTE : HostDistance.IGNORED;
}
/**
 * Returns the hosts to use for a new query.
 *
 * The returned plan contains each live host of the currently active
 * datacenter (the local DC, or the backup DC once the switch has occurred),
 * each at most once, in round-robin order. Hosts of other datacenters are
 * never part of the plan. Unless switching back has been ruled out, the
 * switch-back check is run first.
 *
 * @param loggedKeyspace
 *            the keyspace currently logged in on for this query (unused).
 * @param statement
 *            the query for which to build the plan (unused).
 * @return a new query plan, i.e. an iterator indicating which host to try
 *         first for querying, which one to use as failover, etc...
 */
public Iterator<Host> newQueryPlan(String loggedKeyspace,
        final Statement statement) {
    if (!switchBackCanNeverHappen) {
        triggerBackSwitchIfNecessary();
    }
    String currentDc = switchedToBackupDc.get() ? backupDc : localDc;

    // Work on a snapshot so concurrent up/down events don't affect this plan.
    CopyOnWriteArrayList<Host> liveHosts = perDcLiveHosts.get(currentDc);
    final List<Host> hosts = liveHosts == null
            ? Collections.<Host>emptyList()
            : cloneList(liveHosts);
    final int startIdx = index.getAndIncrement();

    return new AbstractIterator<Host>() {
        private int idx = startIdx;
        private int remaining = hosts.size();

        @Override
        protected Host computeNext() {
            if (remaining <= 0) {
                return endOfData();
            }
            remaining--;
            int c = idx++ % hosts.size();
            // The shared counter may have wrapped to a negative value.
            if (c < 0) {
                c += hosts.size();
            }
            return hosts.get(c);
        }
    };
}
/**
 * Handles a host coming up: updates the local DC failover counter and
 * registers the host as live in its datacenter.
 *
 * @param host the host that is now up.
 */
public void onUp(Host host) {
    String dc = dc(host);

    if (dc.equals(localDc)
            && this.hostDownSwitchThreshold.get() < this.initHostDownSwitchThreshold) {
        // A local DC node came back and the counter is below its initial
        // value: give one unit back to the switch threshold. This also
        // happens after the switch, which is how the local DC gets
        // flagged as being back up.
        this.hostDownSwitchThreshold.incrementAndGet();
        updateLocalDcStatus();
    }

    // If the localDC was in "auto-discover" mode and it's the first host
    // for which we have a DC, use it.
    if (localDc == UNSET && dc != UNSET) {
        logger.info(
                "Using data-center name '{}' for DCAwareFailoverRoundRobinPolicy (if this is incorrect, please provide the correct datacenter name with DCAwareFailoverRoundRobinPolicy constructor)",
                dc);
        localDc = dc;
    }

    CopyOnWriteArrayList<Host> dcHosts = perDcLiveHosts.get(dc);
    if (dcHosts == null) {
        CopyOnWriteArrayList<Host> freshList =
                new CopyOnWriteArrayList<Host>(Collections.singletonList(host));
        dcHosts = perDcLiveHosts.putIfAbsent(dc, freshList);
        // If we've successfully put our new list, we're good; otherwise
        // another caller registered a list first, so add to that one.
        if (dcHosts == null) {
            return;
        }
    }
    dcHosts.addIfAbsent(host);
}
/**
 * Does nothing: a suspected host changes neither the live host lists nor
 * the failover counter.
 *
 * @param host the suspected host.
 */
public void onSuspected(Host host) {
    // intentionally empty
}
/**
 * Handles a host going down: updates the failover counter, removes the host
 * from its datacenter's live list and switches to the backup DC when the
 * counter has reached 0 or below.
 *
 * @param host the host that is now down.
 */
public void onDown(Host host) {
    final String hostDc = dc(host);

    // A local DC node going down before the switch consumes one unit of the
    // switch threshold.
    // NOTE(review): local hosts going down after the switch are not counted,
    // so the counter can overstate local DC health - confirm this is intended.
    if (hostDc.equals(localDc) && !switchedToBackupDc.get()) {
        this.hostDownSwitchThreshold.decrementAndGet();
    }

    CopyOnWriteArrayList<Host> liveInDc = perDcLiveHosts.get(hostDc);
    if (liveInDc != null) {
        liveInDc.remove(host);
    }

    if (this.hostDownSwitchThreshold.get() > 0) {
        return;
    }

    // Threshold exhausted: make sure localDc is not considered as being up.
    localDcCameBackUpAt = null;
    if (!switchedToBackupDc.get()) {
        // We lost as many local DC nodes as configured in the threshold.
        switchToBackup();
    }
}
/**
 * Treats a newly added host exactly like a host coming up.
 *
 * @param host the host that was added.
 */
public void onAdd(Host host) {
    this.onUp(host);
}
/**
 * Treats a removed host exactly like a host going down.
 *
 * @param host the host that was removed.
 */
public void onRemove(Host host) {
    this.onDown(host);
}
/**
 * Closes the policy. This implementation does nothing.
 */
public void close() {
    // no-op
}
/**
 * Performs the switch to the backup DC: raises the switched flag, records
 * the switch time and logs a warning.
 */
private void switchToBackup() {
// The flag is set before the timestamp; triggerBackSwitchIfNecessary()
// requires both the flag and a non-null timestamp before acting.
switchedToBackupDc.set(true);
switchedToBackupDcAt = new Date();
// The logged node count is the configured initial threshold.
logger.warn(
"Lost {} nodes in data-center '{}'. Switching to data-center '{}'",
this.initHostDownSwitchThreshold, this.localDc, this.backupDc);
}
/**
 * Performs the switch back to the local DC: clears the switched flag and
 * both outage timestamps, then logs a warning.
 */
private void switchBackToLocal() {
// Clearing the flag first makes query plans target the local DC again.
switchedToBackupDc.set(false);
// Reset the outage bookkeeping so that a later outage starts from scratch.
switchedToBackupDcAt = null;
localDcCameBackUpAt = null;
logger.warn(
"Recovered enough nodes in data-center '{}'. Switching back since conditions are fulfilled",
this.localDc);
}
/**
 * Checks whether the cluster state fulfills the requirements for switching
 * back to the local DC. All of the following must hold:
 * - both outage timestamps are known;
 * - the downtime (localDcCameBackUpAt - switchedToBackupDcAt) is shorter
 *   than {@code noSwitchBackDowntimeDelay} seconds;
 * - the connection has switched to the backup DC and the local DC is
 *   flagged as back up (threshold counter > 0);
 * - enough time has passed for hinted handoff:
 *   (currentTime - localDcCameBackUpAt) > downtime * switchBackDelayFactor.
 *
 * Side effect: when the downtime reaches or exceeds the maximum delay,
 * {@code switchBackCanNeverHappen} is set, disabling further checks.
 *
 * @return {@code true} if the policy may switch back to the local DC now.
 */
private boolean canSwitchBack() {
    // Work on local copies: onDown() and switchBackToLocal() reset these
    // fields to null, which would otherwise cause a NullPointerException
    // if that happens between the caller's check and the reads below.
    final Date cameBackUpAt = localDcCameBackUpAt;
    final Date switchedAt = switchedToBackupDcAt;
    if (cameBackUpAt == null || switchedAt == null) {
        return false;
    }

    final long downtimeMs = cameBackUpAt.getTime() - switchedAt.getTime();
    // Long arithmetic: an int product overflows for large delays.
    final long maxDowntimeMs = noSwitchBackDowntimeDelay * 1000L;

    if (downtimeMs >= maxDowntimeMs) {
        // Downtime lasted more than the hinted handoff window.
        // Switching back is now a manual operation.
        logger.warn(
                "Local DC has been down for too long. Switch back will never happen.");
        switchBackCanNeverHappen = true;
        return false;
    }

    if (!switchedToBackupDc.get() || !isLocalDcBackUp()) {
        return false;
    }

    final long uptimeMs = new Date().getTime() - cameBackUpAt.getTime();
    final float requiredUptimeMs = downtimeMs * switchBackDelayFactor;
    logger.debug(
            "Local DC {} is up and has been down for {}s. Switch back will happen after {}s. Uptime = {}s ",
            localDc,
            (int) (downtimeMs / 1000),
            (int) (requiredUptimeMs / 1000),
            uptimeMs / 1000);
    return uptimeMs > requiredUptimeMs;
}
/**
 * Computes how long the local DC outage lasted, i.e. the time between the
 * switch to the backup DC and the moment the local DC came back up.
 *
 * @return the downtime in milliseconds.
 */
private long getDowntimeDuration() {
    final long cameBackUpMs = localDcCameBackUpAt.getTime();
    final long switchedMs = switchedToBackupDcAt.getTime();
    return cameBackUpMs - switchedMs;
}
/**
 * Computes how long the local DC has been back up since the outage.
 *
 * @return the elapsed time, in milliseconds, since the local DC came back up.
 */
private long getUptimeDuration() {
    final long nowMs = System.currentTimeMillis();
    return nowMs - localDcCameBackUpAt.getTime();
}
/**
 * Records the moment the local DC came back up. The timestamp is only set
 * when the policy has switched to the backup DC, the threshold counter is
 * positive again and no timestamp has been recorded yet.
 */
private void updateLocalDcStatus() {
    if (!switchedToBackupDc.get()) {
        return;
    }
    if (hostDownSwitchThreshold.get() <= 0) {
        return;
    }
    if (localDcCameBackUpAt == null) {
        localDcCameBackUpAt = new Date();
    }
}
/**
 * Tests whether the local DC has enough nodes to be considered alive again:
 * the threshold counter is positive and a "came back up" time is recorded.
 *
 * @return {@code true} if the local DC is considered back up.
 */
private boolean isLocalDcBackUp() {
    if (hostDownSwitchThreshold.get() <= 0) {
        return false;
    }
    return localDcCameBackUpAt != null;
}
/**
 * Tests whether a datacenter is the one currently served as local: the
 * local DC before the switch, the backup DC after it. The {@code UNSET}
 * sentinel is always treated as local.
 *
 * @param dc the datacenter name to test.
 * @return {@code true} if {@code dc} is the currently active datacenter.
 */
private boolean isLocal(String dc) {
    if (dc == UNSET) {
        return true;
    }
    final String activeDc = switchedToBackupDc.get() ? backupDc : localDc;
    return dc.equals(activeDc);
}
/**
 * Switches back to the local DC when a switch has occurred, both outage
 * timestamps are known and the switch-back conditions are fulfilled.
 */
public void triggerBackSwitchIfNecessary() {
    final boolean outageTracked = switchedToBackupDc.get()
            && localDcCameBackUpAt != null
            && switchedToBackupDcAt != null;
    if (outageTracked && canSwitchBack()) {
        switchBackToLocal();
    }
}
/**
 * Helper class to build the policy.
 */
public static class Builder {
    private String localDc;
    private String backupDc;
    private int hostDownSwitchThreshold;
    private float switchBackDelayFactor = 1000f;
    private int noSwitchBackDowntimeDelay = 0;

    /**
     * Sets the name of the datacenter that will be considered "local" by the policy.
     *
     * This must be the name as known by Cassandra (in other words, the name that appears in
     * {@code system.peers}, or in the output of admin tools like nodetool).
     *
     * If this method isn't called, the policy will default to the datacenter of the first node
     * connected to. This will always be ok if all the contact points used at {@code Cluster}
     * creation are in the local data-center. Otherwise, you should provide the name yourself
     * with this method.
     *
     * @param localDc the name of the datacenter. It must not be {@code null} or empty.
     * @return this builder.
     * @throws IllegalArgumentException if {@code localDc} is {@code null} or empty.
     */
    public Builder withLocalDc(String localDc) {
        Preconditions.checkArgument(!Strings.isNullOrEmpty(localDc),
                "localDc name can't be null or empty. If you want to let the policy autodetect the datacenter, don't call Builder.withLocalDC");
        this.localDc = localDc;
        return this;
    }

    /**
     * Sets the name of the datacenter that will be considered as "backup" by the policy.
     *
     * This must be the name as known by Cassandra (in other words, the name that appears in
     * {@code system.peers}, or in the output of admin tools like nodetool).
     *
     * This method must be called, otherwise you should not use this policy.
     *
     * @param backupDc the name of the datacenter. It must not be {@code null} or empty.
     * @return this builder.
     * @throws IllegalArgumentException if {@code backupDc} is {@code null} or empty.
     */
    public Builder withBackupDc(String backupDc) {
        // Validate the argument itself, not the (possibly still unset) local DC.
        Preconditions.checkArgument(!Strings.isNullOrEmpty(backupDc),
                "backupDc name can't be null or empty.");
        this.backupDc = backupDc;
        return this;
    }

    /**
     * Sets how many nodes must be down in the local DC before switching to backup.
     *
     * @param hostDownSwitchThreshold the number of nodes down before switching to the backup DC.
     * @return this builder
     */
    public Builder withHostDownSwitchThreshold(int hostDownSwitchThreshold) {
        this.hostDownSwitchThreshold = hostDownSwitchThreshold;
        return this;
    }

    /**
     * Mandatory if you want to authorize switching back to local DC after downtime.
     * Allows enough time to pass so that hinted handoff can finish:
     * (currentTime - localDcCameBackUpAt) > (localDcCameBackUpAt - switchedToBackupDcAt)*switchBackDelayFactor
     *
     * @param switchBackDelayFactor factor applied to the downtime duration to obtain the
     *        minimum local DC uptime required before switching back.
     * @return this builder
     */
    public Builder withSwitchBackDelayFactor(float switchBackDelayFactor) {
        this.switchBackDelayFactor = switchBackDelayFactor;
        return this;
    }

    /**
     * Mandatory if you want to authorize switching back to local DC after downtime.
     * Prevents switching back to local DC if downtime was longer than the provided value.
     * Used to check that downtime didn't last more than the hinted handoff window
     * (beyond which a repair is required). With the default value of 0, automatic
     * switch back is effectively disabled.
     *
     * @param noSwitchBackDowntimeDelay max downtime, in seconds, beyond which switching back
     *        to local DC will be prevented.
     * @return this builder
     */
    public Builder withNoSwitchBackDowntimeDelay(int noSwitchBackDowntimeDelay) {
        this.noSwitchBackDowntimeDelay = noSwitchBackDowntimeDelay;
        return this;
    }

    /**
     * Builds the policy configured by this builder.
     *
     * @return the policy.
     */
    public DCAwareFailoverRoundRobinPolicy build() {
        return new DCAwareFailoverRoundRobinPolicy(localDc, backupDc,
                hostDownSwitchThreshold, switchBackDelayFactor,
                noSwitchBackDowntimeDelay);
    }
}
}