/* This file is part of VoltDB.
* Copyright (C) 2008-2018 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltdb.iv2;
import java.util.AbstractMap;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import org.apache.zookeeper_voltpatches.CreateMode;
import org.apache.zookeeper_voltpatches.KeeperException;
import org.apache.zookeeper_voltpatches.ZooKeeper;
import org.json_voltpatches.JSONException;
import org.json_voltpatches.JSONStringer;
import org.voltcore.logging.VoltLogger;
import org.voltcore.messaging.BinaryPayloadMessage;
import org.voltcore.messaging.ForeignHost;
import org.voltcore.messaging.HostMessenger;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.Pair;
import org.voltcore.zk.CoreZK;
import org.voltcore.zk.LeaderElector;
import org.voltcore.zk.ZKUtil;
import org.voltdb.AbstractTopology;
import org.voltdb.MailboxNodeContent;
import org.voltdb.RealVoltDB;
import org.voltdb.StatsSource;
import org.voltdb.VoltDB;
import org.voltdb.VoltTable;
import org.voltdb.VoltTable.ColumnInfo;
import org.voltdb.VoltType;
import org.voltdb.VoltZK;
import org.voltdb.VoltZK.MailboxType;
import org.voltdb.export.ExportManager;
import org.voltdb.iv2.LeaderCache.LeaderCallBackInfo;
import com.google_voltpatches.common.base.Preconditions;
import com.google_voltpatches.common.collect.ArrayListMultimap;
import com.google_voltpatches.common.collect.ImmutableMap;
import com.google_voltpatches.common.collect.Lists;
import com.google_voltpatches.common.collect.Maps;
import com.google_voltpatches.common.collect.Multimap;
import com.google_voltpatches.common.collect.Multimaps;
import com.google_voltpatches.common.collect.Sets;
/**
* Cartographer provides answers to queries about the components in a cluster.
* It provides the StatsSource interface for the TOPO statistics selector, but
* can be called directly as long as the caller is careful about not calling
* from a network thread (need to avoid ZK deadlocks).
*/
public class Cartographer extends StatsSource
{
private static final VoltLogger hostLog = new VoltLogger("HOST");
private final LeaderCacheReader m_iv2Masters;
private final LeaderCacheReader m_iv2Mpi;
private final Set<Long> m_currentSPMasters = new HashSet<Long>();
private final HostMessenger m_hostMessenger;
private final ZooKeeper m_zk;
private final Set<Integer> m_allMasters = new HashSet<Integer>();
public static final String JSON_PARTITION_ID = "partitionId";
public static final String JSON_INITIATOR_HSID = "initiatorHSId";
public static final String JSON_LEADER_MIGRATION = "leaderMigration";
private final int m_configuredReplicationFactor;
//partition masters by host
private final Map<Integer, Set<Long>> m_currentMastersByHost = Maps.newTreeMap();
private final ExecutorService m_es
= CoreUtils.getCachedSingleThreadExecutor("Cartographer", 15000);
// This message used to be sent by the SP or MP initiator when they accepted a promotion.
// For dev speed, we detect mastership changes here and construct and send this message to the
// local client interface so we can keep the CI's implementation.
private void sendLeaderChangeNotify(long hsId, int partitionId, boolean migratePartitionLeader)
{
hostLog.info("[Cartographer] Sending leader change notification with new leader:" +
CoreUtils.hsIdToString(hsId) + " for partition:" + partitionId);
try {
JSONStringer stringer = new JSONStringer();
stringer.object();
stringer.keySymbolValuePair(JSON_PARTITION_ID, partitionId);
stringer.keySymbolValuePair(JSON_INITIATOR_HSID, hsId);
stringer.keySymbolValuePair(JSON_LEADER_MIGRATION, migratePartitionLeader);
stringer.endObject();
BinaryPayloadMessage bpm = new BinaryPayloadMessage(new byte[0], stringer.toString().getBytes("UTF-8"));
int hostId = m_hostMessenger.getHostId();
m_hostMessenger.send(CoreUtils.getHSIdFromHostAndSite(hostId,
HostMessenger.CLIENT_INTERFACE_SITE_ID),
bpm);
}
catch (Exception e) {
VoltDB.crashLocalVoltDB("Unable to propagate leader promotion to client interface.", true, e);
}
}
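// Illustrative sketch (not part of the original source): the payload built above is a JSON
// object keyed by the JSON_* constants. For partition 7 whose new leader has HSID 12345
// (hypothetical values), the local client interface would receive roughly:
//
//   {"partitionId":7,"initiatorHSId":12345,"leaderMigration":false}
//
// wrapped in a BinaryPayloadMessage addressed to CLIENT_INTERFACE_SITE_ID on this host.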
LeaderCache.Callback m_MPICallback = new LeaderCache.Callback()
{
@Override
public void run(ImmutableMap<Integer, LeaderCallBackInfo> cache) {
// Every MPI change means a new single MPI. Just do the right thing here
int pid = MpInitiator.MP_INIT_PID;
// Can be zero-length at startup
if (cache.size() > 0) {
hostLog.info("[Cartographer MP] Sending leader change notification with new leader: " +
CoreUtils.hsIdToString(cache.get(pid).m_HSId));
sendLeaderChangeNotify(cache.get(pid).m_HSId, pid, false);
}
}
};
LeaderCache.Callback m_SPIMasterCallback = new LeaderCache.Callback()
{
@Override
public void run(ImmutableMap<Integer, LeaderCallBackInfo> cache) {
// We know there's a 1:1 mapping between partitions and HSIds in this map.
// let's flip it
// Map<Long, Integer> hsIdToPart = new HashMap<Long, Integer>();
Set<LeaderCallBackInfo> newMasters = new HashSet<LeaderCallBackInfo>();
Set<Long> newHSIDs = Sets.newHashSet();
Map<Integer, Set<Long>> newMastersByHost = Maps.newTreeMap();
for (Entry<Integer, LeaderCallBackInfo> e : cache.entrySet()) {
LeaderCallBackInfo newMasterInfo = e.getValue();
Long hsid = newMasterInfo.m_HSId;
int partitionId = e.getKey();
newHSIDs.add(hsid);
// hsIdToPart.put(hsid, partitionId);
int hostId = CoreUtils.getHostIdFromHSId(hsid);
Set<Long> masters = newMastersByHost.get(hostId);
if (masters == null) {
masters = Sets.newHashSet();
newMastersByHost.put(hostId, masters);
}
masters.add(hsid);
if (!m_currentSPMasters.contains(hsid)) {
// we want to see items which are present in the new map but not in the old,
// these are newly promoted SPIs
newMasters.add(newMasterInfo);
// send the messages indicating promotion from here for each new master
sendLeaderChangeNotify(hsid, partitionId, newMasterInfo.m_isMigratePartitionLeaderRequested);
// For the Export subsystem, demote the old leaders and promote the new leaders,
// but only target the current host.
// In rare cases the SP master callbacks triggered by a MigratePartitionLeader request
// could fire before this node has finished initializing the ExportManager.
// This can happen for a previous SP leader migration on a freshly rejoined node.
// Since this node would neither be the export master nor be promoted by the previous
// migration event, we can ignore this SP leader change for export.
if (newMasterInfo.m_isMigratePartitionLeaderRequested && ExportManager.instance() != null) {
if (isHostIdLocal(hostId)) {
// this host contains a newly promoted partition leader;
// inform the export manager to prepare the mastership promotion.
// Cartographer is initialized before ExportManager, so callbacks that arrive
// before export is initialized are ignored
ExportManager.instance().prepareAcceptMastership(partitionId);
} else {
// this host *could* contain the old master;
// inform the export manager to prepare mastership migration (drain existing PBD and notify new leader)
ExportManager.instance().prepareTransferMastership(partitionId, hostId);
}
}
}
}
if (hostLog.isDebugEnabled()) {
Set<String> masters = Sets.newHashSet();
m_currentSPMasters.forEach((k) -> {
masters.add(CoreUtils.hsIdToString(k));
});
hostLog.debug("[Cartographer] SP masters:" + masters);
masters.clear();
cache.values().forEach((k) -> {
masters.add(CoreUtils.hsIdToString(k.m_HSId));
});
hostLog.debug("[Cartographer] Updated SP masters:" + masters + ". New masters:" + newMasters);
}
m_currentSPMasters.clear();
m_currentSPMasters.addAll(newHSIDs);
m_currentMastersByHost.clear();
m_currentMastersByHost.putAll(newMastersByHost);
}
};
/**
* A dummy iterator that wraps an UnmodifiableIterator<Integer> and provides the
* Iterator<Object> interface expected by StatsSource
*/
private static class DummyIterator implements Iterator<Object> {
private final Iterator<Integer> i;
private DummyIterator(Iterator<Integer> i) {
this.i = i;
}
@Override
public boolean hasNext() {
return i.hasNext();
}
@Override
public Object next() {
return i.next();
}
@Override
public void remove() {
i.remove();
}
}
public Cartographer(HostMessenger hostMessenger, int configuredReplicationFactor, boolean partitionDetectionEnabled) {
super(false);
m_hostMessenger = hostMessenger;
m_zk = hostMessenger.getZK();
m_iv2Masters = new LeaderCache(m_zk, VoltZK.iv2masters, m_SPIMasterCallback);
m_iv2Mpi = new LeaderCache(m_zk, VoltZK.iv2mpi, m_MPICallback);
m_configuredReplicationFactor = configuredReplicationFactor;
try {
m_iv2Masters.start(true);
m_iv2Mpi.start(true);
} catch (Exception e) {
VoltDB.crashLocalVoltDB("Screwed", true, e);
}
}
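// Minimal usage sketch (assumed wiring, not from the original source): a Cartographer is
// constructed once per node during initialization, after the HostMessenger is running:
//
//   HostMessenger messenger = ...;  // this node's running HostMessenger
//   int kfactor = 1;                // the configured replication factor
//   Cartographer cart = new Cartographer(messenger, kfactor, true);
//
// Note that the constructor starts both LeaderCache watchers, so the ZK callbacks above
// may begin firing as soon as start(true) returns.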
@Override
protected void populateColumnSchema(ArrayList<ColumnInfo> columns)
{
columns.add(new ColumnInfo("Partition", VoltType.INTEGER));
columns.add(new ColumnInfo("Sites", VoltType.STRING));
columns.add(new ColumnInfo("Leader", VoltType.STRING));
}
@Override
protected Iterator<Object> getStatsRowKeyIterator(boolean interval)
{
m_allMasters.clear();
m_allMasters.addAll(m_iv2Masters.pointInTimeCache().keySet());
m_allMasters.add(MpInitiator.MP_INIT_PID);
//make a copy of the master list for the topology statistics to avoid any concurrent modification,
//since the master list may be updated while the topology statistics are being built.
Set<Integer> masters = new HashSet<>();
masters.addAll(m_allMasters);
return new DummyIterator(masters.iterator());
}
@Override
protected void updateStatsRow(Object rowKey, Object[] rowValues) {
long leader;
List<Long> sites = new ArrayList<Long>();
if (rowKey.equals(MpInitiator.MP_INIT_PID)) {
leader = getHSIdForMultiPartitionInitiator();
sites.add(leader);
}
else {
//sanity check. The master list may be updated while the statistics are calculated.
Long leaderInCache = m_iv2Masters.pointInTimeCache().get(rowKey);
if (leaderInCache == null) {
return;
}
leader = leaderInCache;
sites.addAll(getReplicasForPartition((Integer)rowKey));
}
rowValues[columnNameToIndex.get("Partition")] = rowKey;
rowValues[columnNameToIndex.get("Sites")] = CoreUtils.hsIdCollectionToString(sites);
rowValues[columnNameToIndex.get("Leader")] = CoreUtils.hsIdToString(leader);
}
/**
* Convenience method: Get the HSID of the master for the specified partition ID, SP or MP
*/
public long getHSIdForMaster(int partitionId)
{
if (partitionId == MpInitiator.MP_INIT_PID) {
return getHSIdForMultiPartitionInitiator();
}
else {
return getHSIdForSinglePartitionMaster(partitionId);
}
}
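// Usage sketch (hypothetical values): SP and MP leaders resolve through the same call.
//
//   long spLeader = cart.getHSIdForMaster(3);                       // leader of partition 3
//   long mpLeader = cart.getHSIdForMaster(MpInitiator.MP_INIT_PID); // the MPI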
/**
* Checks whether this host is partition 0 'zero' leader
*
* @return result of the test
*/
public boolean isPartitionZeroLeader()
{
return CoreUtils.getHostIdFromHSId(m_iv2Masters.get(0)) == m_hostMessenger.getHostId();
}
/**
* Checks whether the given host ID is the local host's ID
*
* @return result of the test
*/
public boolean isHostIdLocal(int hostId)
{
return hostId == m_hostMessenger.getHostId();
}
/**
* Get the HSID of the single partition master for the specified partition ID
*/
public long getHSIdForSinglePartitionMaster(int partitionId)
{
return m_iv2Masters.get(partitionId);
}
/**
* Validate a partition ID
* @param partitionId The partition ID
* @return true if the partition ID is valid
*/
public boolean hasPartition(int partitionId) {
return ((LeaderCache)m_iv2Masters).contain(partitionId);
}
// This used to be the method to get this on SiteTracker
public Long getHSIdForMultiPartitionInitiator()
{
return m_iv2Mpi.get(MpInitiator.MP_INIT_PID);
}
public long getBuddySiteForMPI(long hsid)
{
int host = CoreUtils.getHostIdFromHSId(hsid);
// We'll be lazy and get the map we'd feed to SiteTracker's
// constructor, then go looking for a matching host ID.
List<MailboxNodeContent> sitesList = getMailboxNodeContentList();
for (MailboxNodeContent site : sitesList) {
if (site.partitionId != MpInitiator.MP_INIT_PID && host == CoreUtils.getHostIdFromHSId(site.HSId)) {
return site.HSId;
}
}
throw new RuntimeException("Unable to find a buddy initiator for MPI with HSID: " +
CoreUtils.hsIdToString(hsid));
}
/**
* Returns the IDs of the partitions currently in the cluster.
* @return A list of partition IDs
*/
public static List<Integer> getPartitions(ZooKeeper zk) {
List<Integer> partitions = new ArrayList<Integer>();
try {
List<String> children = zk.getChildren(VoltZK.leaders_initiators, null);
for (String child : children) {
partitions.add(LeaderElector.getPartitionFromElectionDir(child));
}
} catch (KeeperException e) {
VoltDB.crashLocalVoltDB("Failed to get partition IDs from ZK", true, e);
} catch (InterruptedException e) {
VoltDB.crashLocalVoltDB("Failed to get partition IDs from ZK", true, e);
}
return partitions;
}
public List<Integer> getPartitions() {
return Cartographer.getPartitions(m_zk);
}
public int getPartitionCount()
{
// The list returned by getPartitions includes the MP PID. Need to remove that for the
// true partition count.
return Cartographer.getPartitions(m_zk).size() - 1;
}
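// Example: in a cluster with partitions {0, 1, 2}, getPartitions() returns those three IDs
// plus MpInitiator.MP_INIT_PID, i.e. four entries, so the true partition count is 3.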
private Multimap<Integer, Integer> getHostToPartitionMap() {
Multimap<Integer, Integer> hostToPartitions = ArrayListMultimap.create();
for (int pId : getPartitions()) {
if (pId == MpInitiator.MP_INIT_PID) {
continue;
}
List<Long> hsIDs = getReplicasForPartition(pId);
hsIDs.forEach(hsId -> hostToPartitions.put(CoreUtils.getHostIdFromHSId(hsId), pId));
}
return hostToPartitions;
}
/**
* Convenience method: given a host ID, return the host IDs of its buddies (including itself)
* that belong to the same partition group.
* @param hostId
* @return A set of host IDs (including the given hostId) that belong to the same partition group
*/
public Set<Integer> getHostIdsWithinPartitionGroup(int hostId) {
Set<Integer> hostIds = Sets.newHashSet();
Multimap<Integer, Integer> hostToPartitions = getHostToPartitionMap();
assert hostToPartitions.containsKey(hostId);
Multimap<Integer, Integer> partitionByIds = ArrayListMultimap.create();
Multimaps.invertFrom(hostToPartitions, partitionByIds);
for (int partition : hostToPartitions.asMap().get(hostId)) {
hostIds.addAll(partitionByIds.get(partition));
}
return hostIds;
}
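// Worked example (hypothetical topology): with host-to-partition assignments
// h0:{p0,p1}, h1:{p0,p1}, h2:{p2,p3}, h3:{p2,p3}, inverting the map yields
// p0:{h0,h1} and p1:{h0,h1}, so getHostIdsWithinPartitionGroup(0) returns {0, 1}.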
/**
* Convenience method: given a list of partition IDs, return the host IDs of their partition-group peers.
*
* @param partitions A list of partitions to be assigned to the newly rejoined host
* @return A set of host IDs that belong to the same partition group
*/
public Set<Integer> findPartitionGroupPeers(List<Integer> partitions) {
Set<Integer> hostIds = Sets.newHashSet();
Multimap<Integer, Integer> hostToPartitions = getHostToPartitionMap();
Multimap<Integer, Integer> partitionByIds = ArrayListMultimap.create();
Multimaps.invertFrom(hostToPartitions, partitionByIds);
for (int p : partitions) {
hostIds.addAll(partitionByIds.get(p));
}
return hostIds;
}
/**
* @return a multimap from host ID to (partition ID, HSID) pairs for all hosts
*/
public Multimap<Integer, Entry<Integer, Long>> getHostToPartition2HSIdMap() {
Multimap<Integer, Entry<Integer, Long>> hostToHSId = ArrayListMultimap.create();
for (int pId : getPartitions()) {
if (pId == MpInitiator.MP_INIT_PID) {
continue;
}
List<Long> hsIDs = getReplicasForPartition(pId);
hsIDs.forEach(hsId -> hostToHSId.put(CoreUtils.getHostIdFromHSId(hsId), new AbstractMap.SimpleEntry<>(pId, hsId)));
}
return hostToHSId;
}
/**
* Given a partition ID, return a list of HSIDs of all the sites with copies of that partition
*/
public List<Long> getReplicasForPartition(int partition) {
String zkpath = LeaderElector.electionDirForPartition(VoltZK.leaders_initiators, partition);
List<Long> retval = new ArrayList<Long>();
try {
List<String> children = m_zk.getChildren(zkpath, null);
for (String child : children) {
retval.add(Long.valueOf(child.split("_")[0]));
}
}
catch (KeeperException.NoNodeException e) {
//Can happen when partitions are being removed
} catch (KeeperException | InterruptedException e) {
org.voltdb.VoltDB.crashLocalVoltDB("Exception getting replicas for partition: " + partition,
true, e);
}
return retval;
}
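// Note on the parsing above: each child of the partition's election directory is assumed
// to be named "<HSID>_<sequence suffix>" (as created by LeaderElector), so splitting on
// '_' and taking the first token recovers the replica's HSID.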
/**
* Return the HSID of the site for a given host ID and partition ID
* @param hostId The host ID
* @param partition The partition ID
* @return an HSID, or null if there is no such site
*/
public Long getHSIDForPartitionHost(int hostId, int partition) {
List<Long> hsids = getReplicasForPartition(partition);
for (Long hsid : hsids) {
if (hostId == CoreUtils.getHostIdFromHSId(hsid)){
return hsid;
}
}
return null;
}
/**
* Given a set of partition IDs, return a map of partition to a list of HSIDs of all the sites with copies of each partition
*/
public Map<Integer, List<Long>> getReplicasForPartitions(Collection<Integer> partitions) {
Map<Integer, List<Long>> retval = new HashMap<Integer, List<Long>>();
List<Pair<Integer, ZKUtil.ChildrenCallback>> callbacks = new ArrayList<Pair<Integer, ZKUtil.ChildrenCallback>>();
for (Integer partition : partitions) {
String zkpath = LeaderElector.electionDirForPartition(VoltZK.leaders_initiators, partition);
ZKUtil.ChildrenCallback cb = new ZKUtil.ChildrenCallback();
callbacks.add(Pair.of(partition, cb));
m_zk.getChildren(zkpath, false, cb, null);
}
for (Pair<Integer, ZKUtil.ChildrenCallback> p : callbacks) {
final Integer partition = p.getFirst();
try {
List<String> children = p.getSecond().getChildren();
List<Long> sites = new ArrayList<Long>();
for (String child : children) {
sites.add(Long.valueOf(child.split("_")[0]));
}
retval.put(partition, sites);
} catch (KeeperException.NoNodeException e) {
//This can happen when a partition is being removed from the system
} catch (KeeperException ke) {
org.voltdb.VoltDB.crashLocalVoltDB("KeeperException getting replicas for partition: " + partition,
true, ke);
}
catch (InterruptedException ie) {
org.voltdb.VoltDB.crashLocalVoltDB("InterruptedException getting replicas for partition: " +
partition, true, ie);
}
}
return retval;
}
/**
* Convenience method to return the immediate count of replicas for the given partition
*/
public int getReplicaCountForPartition(int partition) {
return getReplicasForPartition(partition).size();
}
/**
* Utility method to sort the keys of a map by their value. public for testing.
*/
static public List<Integer> sortKeysByValue(Map<Integer, Integer> map)
{
List<Entry<Integer, Integer>> entries = CoreUtils.sortKeyValuePairByValue(map);
List<Integer> keys = new ArrayList<Integer>();
for (Entry<Integer, Integer> entry : entries) {
keys.add(entry.getKey());
}
return keys;
}
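// Worked example, assuming CoreUtils.sortKeyValuePairByValue() orders entries ascending
// by value (computeReplacementPartitions() below relies on least-replicated-first):
// for the map {0=2, 1=1, 2=3} the returned key order is [1, 0, 2].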
/**
* Given the current state of the cluster, compute the partitions which should be replicated on a single new host.
* Break this method out to be static and testable independent of ZK, JSON, other ugh.
*/
static public void computeReplacementPartitions(Map<Integer, Integer> repsPerPart, int kfactor,
int sitesPerHost, List<Integer> partitions)
{
List<Integer> partSortedByRep = sortKeysByValue(repsPerPart);
for (int i = 0; i < partSortedByRep.size(); i++) {
int leastReplicatedPart = partSortedByRep.get(i);
if (repsPerPart.get(leastReplicatedPart) < kfactor + 1) {
partitions.add(leastReplicatedPart);
if (partitions.size() == sitesPerHost) {
break;
}
}
}
}
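// Worked example (hypothetical numbers): with kfactor=1 (so k+1 = 2 copies expected),
// sitesPerHost=2 and repsPerPart={0=2, 1=1, 2=1}, the keys sorted by replica count are
// [1, 2, 0]; partitions 1 and 2 are under-replicated, so partitions becomes [1, 2] and
// the loop stops once sitesPerHost partitions have been selected.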
public List<Integer> getIv2PartitionsToReplace(int kfactor, int sitesPerHost, int localHostId, Map<Integer, String> hostGroups)
throws JSONException
{
Preconditions.checkArgument(sitesPerHost != VoltDB.UNDEFINED);
List<Integer> partitionsToReplace = new ArrayList<Integer>();
Map<Integer, Integer> repsPerPart = new HashMap<Integer, Integer>();
List<Collection<Integer>> sortedHosts = AbstractTopology.sortHostIdByHGDistance(localHostId, hostGroups);
// attach partitions to each host
Multimap<Integer, Integer> hostToPartitions = getHostToPartitionMap();
for (Collection<Integer> subgroup : sortedHosts) {
Set<Integer> partitions = new HashSet<Integer>();
for (Integer hid : subgroup) {
partitions.addAll(hostToPartitions.get(hid));
}
hostLog.info("Computing partitions to replace. Qualified partitions: " + partitions);
// sort the partitions by replica count
for (Integer pid : partitions) {
repsPerPart.put(pid, getReplicaCountForPartition(pid));
}
computeReplacementPartitions(repsPerPart, kfactor, sitesPerHost, partitionsToReplace);
if (partitionsToReplace.size() == sitesPerHost) {
hostLog.info("IV2 Sites will replicate the following partitions: " + partitionsToReplace);
break;
}
}
return partitionsToReplace;
}
/**
* Compute the new partition IDs to add to the cluster based on the new topology.
*
* @param zk Zookeeper client
* @param newPartitionTotalCount The new total partition count
* @return A list of partition IDs to add to the cluster.
* @throws JSONException
*/
public static List<Integer> getPartitionsToAdd(ZooKeeper zk, int newPartitionTotalCount)
throws JSONException
{
List<Integer> newPartitions = new ArrayList<Integer>();
Set<Integer> existingParts = new HashSet<Integer>(getPartitions(zk));
// Remove MPI
existingParts.remove(MpInitiator.MP_INIT_PID);
int partsToAdd = newPartitionTotalCount - existingParts.size();
hostLog.info("Computing " + partsToAdd + " new partitions to add. Total partitions: " + newPartitionTotalCount);
if (partsToAdd > 0) {
for (int i = 0; newPartitions.size() != partsToAdd; i++) {
if (!existingParts.contains(i)) {
newPartitions.add(i);
}
}
hostLog.info("Adding new partitions: " + newPartitions);
}
return newPartitions;
}
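// Worked example: if the existing partitions are {0, 1, 2} and newPartitionTotalCount is 5,
// then partsToAdd = 2 and the scan from zero picks the lowest unused IDs, returning [3, 4].
// Gaps left by previously removed partitions would be refilled first.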
private List<MailboxNodeContent> getMailboxNodeContentList()
{
List<MailboxNodeContent> sitesList = new ArrayList<MailboxNodeContent>();
final Set<Integer> iv2MastersKeySet = m_iv2Masters.pointInTimeCache().keySet();
Map<Integer, List<Long>> hsidsForPartMap = getReplicasForPartitions(iv2MastersKeySet);
for (Map.Entry<Integer, List<Long>> entry : hsidsForPartMap.entrySet()) {
Integer partId = entry.getKey();
List<Long> hsidsForPart = entry.getValue();
for (long hsid : hsidsForPart) {
MailboxNodeContent mnc = new MailboxNodeContent(hsid, partId);
sitesList.add(mnc);
}
}
return sitesList;
}
public Map<MailboxType, List<MailboxNodeContent>> getSiteTrackerMailboxMap()
{
HashMap<MailboxType, List<MailboxNodeContent>> result =
new HashMap<MailboxType, List<MailboxNodeContent>>();
List<MailboxNodeContent> sitesList = getMailboxNodeContentList();
result.put(MailboxType.ExecutionSite, sitesList);
return result;
}
public void shutdown() throws InterruptedException
{
m_iv2Masters.shutdown();
m_iv2Mpi.shutdown();
m_es.shutdown();
}
//Check partition replicas.
public synchronized String stopNodeIfClusterIsSafe(final Set<Integer> liveHids, final int ihid) {
try {
return m_es.submit(new Callable<String>() {
@Override
public String call() throws Exception {
if (m_configuredReplicationFactor == 0) {
//Don't die in a k=0 cluster
return "Stopping individual nodes is only allowed on a K-safe cluster";
}
// check if any node is still rejoining
try {
if (m_zk.exists(CoreZK.rejoin_node_blocker, false) != null) {
return "All rejoin nodes must be completed";
}
} catch (KeeperException.NoNodeException ignore) {} // shouldn't happen
//Otherwise we do check replicas for host
Set<Integer> otherStoppedHids = new HashSet<Integer>();
// Write the id of node to be stopped to ZK, partition detection will bypass this node.
ZKUtil.addIfMissing(m_zk, ZKUtil.joinZKPath(VoltZK.host_ids_be_stopped, Integer.toString(ihid)), CreateMode.PERSISTENT, null);
try {
List<String> children = m_zk.getChildren(VoltZK.host_ids_be_stopped, false);
for (String child : children) {
int hostId = Integer.parseInt(child);
otherStoppedHids.add(hostId);
}
otherStoppedHids.remove(ihid); /* don't count self */
} catch (KeeperException.NoNodeException ignore) {}
String reason = doPartitionsHaveReplicas(ihid, otherStoppedHids);
if (reason == null) {
// Safe to stop
m_hostMessenger.sendStopNodeNotice(ihid);
// Shutdown or send poison pill
int hid = m_hostMessenger.getHostId();
if (hid == ihid) {
//Killing myself no pill needs to be sent
VoltDB.instance().halt();
} else {
//Send poison pill with target to kill
m_hostMessenger.sendPoisonPill(ihid, "@StopNode", ForeignHost.CRASH_ME);
}
} else {
// unsafe, clear the indicator
ZKUtil.deleteRecursively(m_zk, ZKUtil.joinZKPath(VoltZK.host_ids_be_stopped, Integer.toString(ihid)));
}
return reason;
}
}).get();
} catch (InterruptedException | ExecutionException t) {
hostLog.debug("Cartographer: Error in stopNodeIfClusterIsSafe.", t);
return "Internal error: " + t.getMessage();
}
}
public String verifyPartitonLeaderMigrationForStopNode(final int ihid) {
if (m_configuredReplicationFactor == 0) {
return "Stopping individual nodes is only allowed on a K-safe cluster";
}
try {
return m_es.submit(new Callable<String>() {
@Override
public String call() throws Exception {
// Check rejoining status
try {
if (m_zk.exists(CoreZK.rejoin_node_blocker, false) != null) {
return "All rejoin nodes must be completed";
}
} catch (KeeperException.NoNodeException ignore) {}
// Check replicas
Set<Integer> otherStoppedHids = new HashSet<Integer>();
try {
List<String> children = m_zk.getChildren(VoltZK.host_ids_be_stopped, false);
for (String child : children) {
int hostId = Integer.parseInt(child);
otherStoppedHids.add(hostId);
}
} catch (KeeperException.NoNodeException ignore) {}
otherStoppedHids.remove(ihid);
if (!otherStoppedHids.isEmpty()) {
return "Can't move partition leaders while other nodes are being shut down.";
}
String message = doPartitionsHaveReplicas(ihid, otherStoppedHids);
if (message != null) {
return message;
}
// Partition leader distribution must be balanced; otherwise, the migrated partition leader will
// be moved back.
if (!isPartitionLeadersBalanced()) {
return "Can't move partition leaders since leader migration is in progress";
}
return null;
}}).get();
} catch (InterruptedException | ExecutionException t) {
return "Internal error: " + t.getMessage();
}
}
private boolean isPartitionLeadersBalanced() {
// Is partition leader migration on?
final boolean disableSpiTask = "true".equals(System.getProperty("DISABLE_MIGRATE_PARTITION_LEADER", "false"));
if (disableSpiTask) {
return true;
}
RealVoltDB db = (RealVoltDB) VoltDB.instance();
if (db.isClusterComplete()) {
Pair<Integer, Integer> pair = getPartitionLeaderMigrationTarget(db.getHostCount(), Integer.MIN_VALUE, true);
// Partition leader migration may be in progress; don't mess it up
return (pair == null || pair.getFirst() == -1);
}
return true;
}
// Check if partitions on host hid will have enough replicas after losing host hid.
// If nodesBeingStopped is set, there are concurrent @StopNode requests running;
// don't count replicas on those to-be-stopped nodes.
private String doPartitionsHaveReplicas(int hid, Set nodesBeingStopped) {
hostLog.debug("Cartographer: Reloading partition information.");
assert (!nodesBeingStopped.contains(hid));
List<String> partitionDirs = null;
try {
partitionDirs = m_zk.getChildren(VoltZK.leaders_initiators, null);
} catch (KeeperException | InterruptedException e) {
return "Failed to read ZooKeeper node " + VoltZK.leaders_initiators +": " + e.getMessage();
}
//Don't fetch the values serially; do it asynchronously
Queue<ZKUtil.ByteArrayCallback> dataCallbacks = new ArrayDeque<>();
Queue<ZKUtil.ChildrenCallback> childrenCallbacks = new ArrayDeque<>();
for (String partitionDir : partitionDirs) {
String dir = ZKUtil.joinZKPath(VoltZK.leaders_initiators, partitionDir);
try {
ZKUtil.ByteArrayCallback callback = new ZKUtil.ByteArrayCallback();
m_zk.getData(dir, false, callback, null);
dataCallbacks.offer(callback);
ZKUtil.ChildrenCallback childrenCallback = new ZKUtil.ChildrenCallback();
m_zk.getChildren(dir, false, childrenCallback, null);
childrenCallbacks.offer(childrenCallback);
} catch (Exception e) {
return "Failed to read ZooKeeper node " + dir + ": " + e.getMessage();
}
}
//Assume that we are ksafe
for (String partitionDir : partitionDirs) {
int pid = LeaderElector.getPartitionFromElectionDir(partitionDir);
try {
//Don't let anyone die if someone is in the INITIALIZING state
byte[] partitionState = dataCallbacks.poll().getData();
if (partitionState != null && partitionState.length == 1) {
if (partitionState[0] == LeaderElector.INITIALIZING) {
return "StopNode is disallowed in initialization phase";
}
}
List<String> replicas = childrenCallbacks.poll().getChildren();
//This is here just so callback is polled.
if (pid == MpInitiator.MP_INIT_PID) {
continue;
}
//Get Hosts for replicas
final List<Integer> replicaHost = new ArrayList<>();
final List<Integer> replicaOnStoppingHost = new ArrayList<>();
boolean hostHasReplicas = false;
for (String replica : replicas) {
final String[] split = replica.split("/");
final long hsId = Long.valueOf(split[split.length - 1].split("_")[0]);
final int hostId = CoreUtils.getHostIdFromHSId(hsId);
if (hostId == hid) {
hostHasReplicas = true;
}
if (nodesBeingStopped.contains(hostId)) {
replicaOnStoppingHost.add(hostId);
}
replicaHost.add(hostId);
}
hostLog.debug("Replica Host for Partition " + pid + " " + replicaHost);
if (hostHasReplicas && replicaHost.size() <= 1) {
return "Cluster doesn't have enough replicas";
}
if (hostHasReplicas && !replicaOnStoppingHost.isEmpty() && replicaHost.size() <= replicaOnStoppingHost.size() + 1) {
return "Cluster doesn't have enough replicas. There are concurrent stop node requests, retry the command later";
}
} catch (InterruptedException | KeeperException | NumberFormatException e) {
return "Failed to stop node:" + e.getMessage();
}
}
return null;
}
/**
* used to calculate the partition candidate for migration
*/
private static class Host implements Comparable<Host> {
final int m_hostId;
//the master partition ids on the host
List<Integer> m_masterPartitionIDs = Lists.newArrayList();
//the replica partition ids on the host
List<Integer> m_replicaPartitionIDs = Lists.newArrayList();
public Host(int hostId) {
m_hostId = hostId;
}
@Override
public int compareTo(Host other){
int diff = (other.m_masterPartitionIDs.size() - m_masterPartitionIDs.size());
if (diff != 0) {
return diff;
}
return (m_hostId - other.m_hostId);
}
@Override
public int hashCode() {
return m_hostId;
}
@Override
public boolean equals(Object obj) {
if (obj instanceof Host) {
Host info = (Host)obj;
return m_hostId == info.m_hostId;
}
return false;
}
public void addPartition(Integer partitionId, boolean master) {
if (master) {
m_masterPartitionIDs.add(partitionId);
} else {
m_replicaPartitionIDs.add(partitionId);
}
}
public int getPartitionLeaderCount() {
return m_masterPartitionIDs.size();
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append("Host:" + m_hostId);
builder.append(", master:" + m_masterPartitionIDs);
builder.append(", replica:" + m_replicaPartitionIDs);
return builder.toString();
}
}
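// Note: compareTo() above sorts hosts by descending partition-leader count, breaking ties
// by ascending host ID, so a LinkedList<Host> sorted with Collections.sort() begins with
// the most loaded host. Example (hypothetical): leader counts {h2:4, h0:2, h1:2} sort as
// [h2, h0, h1].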
/**
* Find a partition and its target host for MigratePartitionLeader.
* A partition leader is migrated from the host with the most partition leaders to a host that
* holds a replica of the partition and has the fewest partition leaders. If the host with
* @localHostId is not the host with the most partition leaders, return null; eventually the
* host with the most partition leaders will initiate MigratePartitionLeader.
* @param hostCount The number of hosts in the cluster
* @param localHostId the local host ID
* @param prepareStopNode true when the target is computed on behalf of a pending @StopNode
* @return a pair of partition ID and destination host ID
*/
public Pair<Integer, Integer> getPartitionLeaderMigrationTarget(int hostCount, int localHostId, boolean prepareStopNode) {
final int maxMastersPerHost = (int)Math.ceil(((double)getPartitionCount()) / hostCount);
final int minMastersPerHost = (getPartitionCount() / hostCount);
// Sort the hosts by partition leader count, descending
LinkedList<Host> hostList = getHostsByPartionMasterCount();
// getHostsByPartionMasterCount() returns null for a single-host cluster or while a
// leader promotion is in flight; there is nothing to migrate in that case.
if (hostList == null) {
return null;
}
// only move an SPI from the host with the most partition leaders
// The local ClientInterface will pick it up and start @MigratePartitionLeader
Iterator<Host> it = hostList.iterator();
Host srcHost = it.next();
// @MigratePartitionLeader is initiated on the host with the old leader to facilitate DR integration
// If the current host does not have the most partition leaders, give up and
// let the host with the most partition leaders do the migration.
if (srcHost.m_hostId != localHostId && !prepareStopNode) {
return null;
}
// The new host is the one with the fewest partition leaders that also holds a replica of the partition
for (Iterator<Host> reverseIt = hostList.descendingIterator(); reverseIt.hasNext();) {
Host targetHost = reverseIt.next();
int partitionCandidate = findNewHostForPartitionLeader(srcHost, targetHost, maxMastersPerHost, minMastersPerHost);
if (partitionCandidate > -1) {
return new Pair<Integer, Integer>(partitionCandidate, targetHost.m_hostId);
}
}
// indicate that the cluster is balanced.
return new Pair<Integer, Integer>(-1, -1);
}
private LinkedList<Host> getHostsByPartionMasterCount() {
Set<Integer> liveHosts = m_hostMessenger.getLiveHostIds();
if (liveHosts.size() == 1) {
return null;
}
//collect host and partition info
Map<Integer, Host> hostsMap = Maps.newHashMap();
Set<Integer> allMasters = new HashSet<Integer>();
allMasters.addAll(m_iv2Masters.pointInTimeCache().keySet());
for (Iterator<Integer> it = allMasters.iterator(); it.hasNext();) {
Integer partitionId = it.next();
int leaderHostId = CoreUtils.getHostIdFromHSId(m_iv2Masters.pointInTimeCache().get(partitionId));
//sanity check to make sure that the topology is not in the middle of leader promotion
if (!liveHosts.contains(leaderHostId)) {
return null;
}
Host leaderHost = hostsMap.get(leaderHostId);
if (leaderHost == null) {
leaderHost = new Host(leaderHostId);
hostsMap.put(leaderHostId, leaderHost);
}
List<Long> sites = getReplicasForPartition(partitionId);
for (long site : sites) {
int hostId = CoreUtils.getHostIdFromHSId(site);
if (!liveHosts.contains(hostId)) {
return null;
}
Host host = hostsMap.get(hostId);
if (host == null) {
host = new Host(hostId);
hostsMap.put(hostId, host);
}
host.addPartition(partitionId, (leaderHostId == hostId));
}
}
//Sort the hosts by partition leader count, descending
LinkedList<Host> hostList = new LinkedList<Host>(hostsMap.values());
Collections.sort(hostList);
return hostList;
}
private int findNewHostForPartitionLeader(Host src, Host target, int maxMastersPerHost, int minMastersPerHost) {
// can't move a leader onto its own host
if (src.equals(target)) {
return -1;
}
// still have more leaders than max?
if (src.getPartitionLeaderCount() > maxMastersPerHost) {
for (Integer partition : src.m_masterPartitionIDs) {
if (target.m_replicaPartitionIDs.contains(partition) && target.getPartitionLeaderCount() < maxMastersPerHost) {
return partition;
}
}
} else {
if (target.getPartitionLeaderCount() >= minMastersPerHost || src.getPartitionLeaderCount() <= minMastersPerHost) {
return -1;
} else {
for (Integer partition : src.m_masterPartitionIDs) {
if (target.m_replicaPartitionIDs.contains(partition)) {
return partition;
}
}
}
}
return -1;
}
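// Worked example (hypothetical cluster): 8 partitions on 3 hosts gives
// maxMastersPerHost = ceil(8/3) = 3 and minMastersPerHost = 8/3 = 2. A source host with
// 4 leaders is above max, so the first of its leader partitions that the target also
// replicates is chosen, provided the target holds fewer than 3 leaders. Otherwise a
// leader moves only when the source is above min and the target is below min.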
// find a new host for a partition leader on this host
public Pair<Integer, Integer> getPartitionLeaderMigrationTargetForStopNode(int localHostId) {
// Sort the hosts by partition leader count, descending
LinkedList<Host> hostList = getHostsByPartionMasterCount();
// getHostsByPartionMasterCount() returns null for a single-host cluster or while a
// leader promotion is in flight; report no movable leaders in that case.
if (hostList == null) {
return new Pair<Integer, Integer>(-1, -1);
}
Host srcHost = null;
for (Host host : hostList) {
if (host.m_hostId == localHostId && !host.m_masterPartitionIDs.isEmpty()) {
srcHost = host;
break;
}
}
if (srcHost == null) {
return new Pair<Integer, Integer>(-1, -1);
}
// The new host is the one with the fewest partition leaders that also holds a replica of the partition
for (Iterator<Host> reverseIt = hostList.descendingIterator(); reverseIt.hasNext();) {
Host targetHost = reverseIt.next();
if (!targetHost.equals(srcHost)) {
for (Integer partition : srcHost.m_masterPartitionIDs) {
if (targetHost.m_replicaPartitionIDs.contains(partition)) {
return new Pair<Integer, Integer>(partition, targetHost.m_hostId);
}
}
}
}
// Indicate that all the partition leaders have been relocated.
return new Pair<Integer, Integer>(-1, -1);
}
//return the number of masters on a host
public int getMasterCount(int hostId) {
Set<Long> masters = m_currentMastersByHost.get(hostId);
if (masters == null) {
return 0;
}
return masters.size();
}
//Utility method to peek at the topology
public static VoltTable peekTopology(Cartographer cart) {
ColumnInfo[] column = new ColumnInfo[3];
column[0] = new ColumnInfo("Partition", VoltType.BIGINT);
column[1] = new ColumnInfo("Sites", VoltType.STRING);
column[2] = new ColumnInfo("Leader", VoltType.STRING);
VoltTable t = new VoltTable(column);
Iterator<Object> i = cart.getStatsRowKeyIterator(false);
while (i.hasNext()) {
Object rowKey = i.next();
long leader;
List<Long> sites = new ArrayList<Long>();
if (rowKey.equals(MpInitiator.MP_INIT_PID)) {
leader = cart.getHSIdForMultiPartitionInitiator();
sites.add(leader);
} else {
leader = cart.m_iv2Masters.pointInTimeCache().get(rowKey);
sites.addAll(cart.getReplicasForPartition((Integer)rowKey));
}
t.addRow(rowKey, CoreUtils.hsIdCollectionToString(sites), CoreUtils.hsIdToString(leader));
}
return t;
}
public boolean hasPartitionMastersOnHosts(Set<Integer> hosts) {
for (Integer host : hosts) {
if (m_currentMastersByHost.containsKey(host)) {
return true;
}
}
return false;
}
}