org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer Maven / Gradle / Ivy
Show all versions of hbase-server Show documentation
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master.balancer;
import com.google.common.annotations.VisibleForTesting;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Collection;
import java.util.Deque;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.RegionLoad;
import org.apache.hadoop.hbase.ServerLoad;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.RegionPlan;
import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer.Cluster.Action;
import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer.Cluster.Action.Type;
import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer.Cluster.AssignRegionAction;
import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer.Cluster.MoveRegionAction;
import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer.Cluster.SwapRegionsAction;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
/**
* This is a best effort load balancer. Given a Cost function F(C) => x It will
* randomly try and mutate the cluster to Cprime. If F(Cprime) < F(C) then the
* new cluster state becomes the plan. It includes costs functions to compute the cost of:
*
* - Region Load
* - Table Load
* - Data Locality
* - Memstore Sizes
* - Storefile Sizes
*
*
*
* Every cost function returns a number between 0 and 1 inclusive; where 0 is the lowest cost
* best solution, and 1 is the highest possible cost and the worst solution. The computed costs are
* scaled by their respective multipliers:
*
*
* - hbase.master.balancer.stochastic.regionLoadCost
* - hbase.master.balancer.stochastic.moveCost
* - hbase.master.balancer.stochastic.tableLoadCost
* - hbase.master.balancer.stochastic.localityCost
* - hbase.master.balancer.stochastic.memstoreSizeCost
* - hbase.master.balancer.stochastic.storefileSizeCost
*
*
* In addition to the above configurations, the balancer can be tuned by the following
* configuration values:
*
* - hbase.master.balancer.stochastic.maxMoveRegions which
* controls what the max number of regions that can be moved in a single invocation of this
* balancer.
* - hbase.master.balancer.stochastic.stepsPerRegion is the coefficient by which the number of
* regions is multiplied to try and get the number of times the balancer will
* mutate all servers.
* - hbase.master.balancer.stochastic.maxSteps which controls the maximum number of times that
* the balancer will try and mutate all the servers. The balancer will use the minimum of this
* value and the above computation.
*
*
* This balancer is best used with hbase.master.loadbalance.bytable set to false
* so that the balancer gets the full picture of all loads on the cluster.
*/
@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.CONFIG)
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="IS2_INCONSISTENT_SYNC",
justification="Complaint is about costFunctions not being synchronized; not end of the world")
public class StochasticLoadBalancer extends BaseLoadBalancer {
protected static final String STEPS_PER_REGION_KEY =
"hbase.master.balancer.stochastic.stepsPerRegion";
protected static final String MAX_STEPS_KEY =
"hbase.master.balancer.stochastic.maxSteps";
protected static final String MAX_RUNNING_TIME_KEY =
"hbase.master.balancer.stochastic.maxRunningTime";
protected static final String KEEP_REGION_LOADS =
"hbase.master.balancer.stochastic.numRegionLoadsToRemember";
private static final Random RANDOM = new Random(System.currentTimeMillis());
private static final Log LOG = LogFactory.getLog(StochasticLoadBalancer.class);
Map> loads = new HashMap>();
// values are defaults
private int maxSteps = 1000000;
private int stepsPerRegion = 800;
private long maxRunningTime = 30 * 1000 * 1; // 30 seconds.
private int numRegionLoadsToRemember = 15;
private CandidateGenerator[] candidateGenerators;
private CostFromRegionLoadFunction[] regionLoadFunctions;
private CostFunction[] costFunctions; // FindBugs: Wants this protected; IS2_INCONSISTENT_SYNC
// Keep locality based picker and cost function to alert them
// when new services are offered
private LocalityBasedCandidateGenerator localityCandidateGenerator;
private LocalityCostFunction localityCost;
private RegionReplicaHostCostFunction regionReplicaHostCostFunction;
private RegionReplicaRackCostFunction regionReplicaRackCostFunction;
@Override
public void onConfigurationChange(Configuration conf) {
setConf(conf);
}
@Override
public synchronized void setConf(Configuration conf) {
super.setConf(conf);
LOG.info("loading config");
maxSteps = conf.getInt(MAX_STEPS_KEY, maxSteps);
stepsPerRegion = conf.getInt(STEPS_PER_REGION_KEY, stepsPerRegion);
maxRunningTime = conf.getLong(MAX_RUNNING_TIME_KEY, maxRunningTime);
numRegionLoadsToRemember = conf.getInt(KEEP_REGION_LOADS, numRegionLoadsToRemember);
if (localityCandidateGenerator == null) {
localityCandidateGenerator = new LocalityBasedCandidateGenerator(services);
}
localityCost = new LocalityCostFunction(conf, services);
if (candidateGenerators == null) {
candidateGenerators = new CandidateGenerator[] {
new RandomCandidateGenerator(),
new LoadCandidateGenerator(),
localityCandidateGenerator,
new RegionReplicaRackCandidateGenerator(),
};
}
regionLoadFunctions = new CostFromRegionLoadFunction[] {
new ReadRequestCostFunction(conf),
new WriteRequestCostFunction(conf),
new MemstoreSizeCostFunction(conf),
new StoreFileCostFunction(conf)
};
regionReplicaHostCostFunction = new RegionReplicaHostCostFunction(conf);
regionReplicaRackCostFunction = new RegionReplicaRackCostFunction(conf);
costFunctions = new CostFunction[]{
new RegionCountSkewCostFunction(conf),
new PrimaryRegionCountSkewCostFunction(conf),
new MoveCostFunction(conf),
localityCost,
new TableSkewCostFunction(conf),
regionReplicaHostCostFunction,
regionReplicaRackCostFunction,
regionLoadFunctions[0],
regionLoadFunctions[1],
regionLoadFunctions[2],
regionLoadFunctions[3],
};
}
@Override
protected void setSlop(Configuration conf) {
this.slop = conf.getFloat("hbase.regions.slop", 0.001F);
}
@Override
public synchronized void setClusterStatus(ClusterStatus st) {
super.setClusterStatus(st);
updateRegionLoad();
for(CostFromRegionLoadFunction cost : regionLoadFunctions) {
cost.setClusterStatus(st);
}
}
@Override
public synchronized void setMasterServices(MasterServices masterServices) {
super.setMasterServices(masterServices);
this.localityCost.setServices(masterServices);
this.localityCandidateGenerator.setServices(masterServices);
}
@Override
protected synchronized boolean areSomeRegionReplicasColocated(Cluster c) {
regionReplicaHostCostFunction.init(c);
if (regionReplicaHostCostFunction.cost() > 0) return true;
regionReplicaRackCostFunction.init(c);
if (regionReplicaRackCostFunction.cost() > 0) return true;
return false;
}
@VisibleForTesting
Cluster.Action nextAction(Cluster cluster) {
return candidateGenerators[(RANDOM.nextInt(candidateGenerators.length))]
.generate(cluster);
}
/**
* Given the cluster state this will try and approach an optimal balance. This
* should always approach the optimal state given enough steps.
*/
@Override
public synchronized List balanceCluster(Map> clusterState) {
List plans = balanceMasterRegions(clusterState);
if (plans != null || clusterState == null || clusterState.size() <= 1) {
return plans;
}
if (masterServerName != null && clusterState.containsKey(masterServerName)) {
if (clusterState.size() <= 2) {
return null;
}
clusterState = new HashMap>(clusterState);
clusterState.remove(masterServerName);
}
// On clusters with lots of HFileLinks or lots of reference files,
// instantiating the storefile infos can be quite expensive.
// Allow turning this feature off if the locality cost is not going to
// be used in any computations.
RegionLocationFinder finder = null;
if (this.localityCost != null && this.localityCost.getMultiplier() > 0) {
finder = this.regionFinder;
}
//The clusterState that is given to this method contains the state
//of all the regions in the table(s) (that's true today)
// Keep track of servers to iterate through them.
Cluster cluster = new Cluster(clusterState, loads, finder, rackManager);
if (!needsBalance(cluster)) {
return null;
}
long startTime = EnvironmentEdgeManager.currentTime();
initCosts(cluster);
double currentCost = computeCost(cluster, Double.MAX_VALUE);
double initCost = currentCost;
double newCost = currentCost;
long computedMaxSteps = Math.min(this.maxSteps,
((long)cluster.numRegions * (long)this.stepsPerRegion * (long)cluster.numServers));
// Perform a stochastic walk to see if we can get a good fit.
long step;
for (step = 0; step < computedMaxSteps; step++) {
Cluster.Action action = nextAction(cluster);
if (action.type == Type.NULL) {
continue;
}
cluster.doAction(action);
updateCostsWithAction(cluster, action);
newCost = computeCost(cluster, currentCost);
// Should this be kept?
if (newCost < currentCost) {
currentCost = newCost;
} else {
// Put things back the way they were before.
// TODO: undo by remembering old values
Action undoAction = action.undoAction();
cluster.doAction(undoAction);
updateCostsWithAction(cluster, undoAction);
}
if (EnvironmentEdgeManager.currentTime() - startTime >
maxRunningTime) {
break;
}
}
long endTime = EnvironmentEdgeManager.currentTime();
metricsBalancer.balanceCluster(endTime - startTime);
if (initCost > currentCost) {
plans = createRegionPlans(cluster);
if (LOG.isDebugEnabled()) {
LOG.debug("Finished computing new load balance plan. Computation took "
+ (endTime - startTime) + "ms to try " + step
+ " different iterations. Found a solution that moves "
+ plans.size() + " regions; Going from a computed cost of "
+ initCost + " to a new cost of " + currentCost);
}
return plans;
}
if (LOG.isDebugEnabled()) {
LOG.debug("Could not find a better load balance plan. Tried "
+ step + " different configurations in " + (endTime - startTime)
+ "ms, and did not find anything with a computed cost less than " + initCost);
}
return null;
}
/**
* Create all of the RegionPlan's needed to move from the initial cluster state to the desired
* state.
*
* @param cluster The state of the cluster
* @return List of RegionPlan's that represent the moves needed to get to desired final state.
*/
private List createRegionPlans(Cluster cluster) {
List plans = new LinkedList();
for (int regionIndex = 0;
regionIndex < cluster.regionIndexToServerIndex.length; regionIndex++) {
int initialServerIndex = cluster.initialRegionIndexToServerIndex[regionIndex];
int newServerIndex = cluster.regionIndexToServerIndex[regionIndex];
if (initialServerIndex != newServerIndex) {
HRegionInfo region = cluster.regions[regionIndex];
ServerName initialServer = cluster.servers[initialServerIndex];
ServerName newServer = cluster.servers[newServerIndex];
if (LOG.isTraceEnabled()) {
LOG.trace("Moving Region " + region.getEncodedName() + " from server "
+ initialServer.getHostname() + " to " + newServer.getHostname());
}
RegionPlan rp = new RegionPlan(region, initialServer, newServer);
plans.add(rp);
}
}
return plans;
}
/**
* Store the current region loads.
*/
private synchronized void updateRegionLoad() {
// We create a new hashmap so that regions that are no longer there are removed.
// However we temporarily need the old loads so we can use them to keep the rolling average.
Map> oldLoads = loads;
loads = new HashMap>();
for (ServerName sn : clusterStatus.getServers()) {
ServerLoad sl = clusterStatus.getLoad(sn);
if (sl == null) {
continue;
}
for (Entry entry : sl.getRegionsLoad().entrySet()) {
Deque rLoads = oldLoads.get(Bytes.toString(entry.getKey()));
if (rLoads == null) {
// There was nothing there
rLoads = new ArrayDeque();
} else if (rLoads.size() >= numRegionLoadsToRemember) {
rLoads.remove();
}
rLoads.add(entry.getValue());
loads.put(Bytes.toString(entry.getKey()), rLoads);
}
}
for(CostFromRegionLoadFunction cost : regionLoadFunctions) {
cost.setLoads(loads);
}
}
protected void initCosts(Cluster cluster) {
for (CostFunction c:costFunctions) {
c.init(cluster);
}
}
protected void updateCostsWithAction(Cluster cluster, Action action) {
for (CostFunction c : costFunctions) {
c.postAction(action);
}
}
/**
* This is the main cost function. It will compute a cost associated with a proposed cluster
* state. All different costs will be combined with their multipliers to produce a double cost.
*
* @param cluster The state of the cluster
* @param previousCost the previous cost. This is used as an early out.
* @return a double of a cost associated with the proposed cluster state. This cost is an
* aggregate of all individual cost functions.
*/
protected double computeCost(Cluster cluster, double previousCost) {
double total = 0;
for (CostFunction c:costFunctions) {
if (c.getMultiplier() <= 0) {
continue;
}
total += c.getMultiplier() * c.cost();
if (total > previousCost) {
return total;
}
}
return total;
}
/** Generates a candidate action to be applied to the cluster for cost function search */
abstract static class CandidateGenerator {
abstract Cluster.Action generate(Cluster cluster);
/**
* From a list of regions pick a random one. Null can be returned which
* {@link StochasticLoadBalancer#balanceCluster(Map)} recognize as signal to try a region move
* rather than swap.
*
* @param cluster The state of the cluster
* @param server index of the server
* @param chanceOfNoSwap Chance that this will decide to try a move rather
* than a swap.
* @return a random {@link HRegionInfo} or null if an asymmetrical move is
* suggested.
*/
protected int pickRandomRegion(Cluster cluster, int server, double chanceOfNoSwap) {
// Check to see if this is just a move.
if (cluster.regionsPerServer[server].length == 0 || RANDOM.nextFloat() < chanceOfNoSwap) {
// signal a move only.
return -1;
}
int rand = RANDOM.nextInt(cluster.regionsPerServer[server].length);
return cluster.regionsPerServer[server][rand];
}
protected int pickRandomServer(Cluster cluster) {
if (cluster.numServers < 1) {
return -1;
}
return RANDOM.nextInt(cluster.numServers);
}
protected int pickRandomRack(Cluster cluster) {
if (cluster.numRacks < 1) {
return -1;
}
return RANDOM.nextInt(cluster.numRacks);
}
protected int pickOtherRandomServer(Cluster cluster, int serverIndex) {
if (cluster.numServers < 2) {
return -1;
}
while (true) {
int otherServerIndex = pickRandomServer(cluster);
if (otherServerIndex != serverIndex) {
return otherServerIndex;
}
}
}
protected int pickOtherRandomRack(Cluster cluster, int rackIndex) {
if (cluster.numRacks < 2) {
return -1;
}
while (true) {
int otherRackIndex = pickRandomRack(cluster);
if (otherRackIndex != rackIndex) {
return otherRackIndex;
}
}
}
protected Cluster.Action pickRandomRegions(Cluster cluster,
int thisServer,
int otherServer) {
if (thisServer < 0 || otherServer < 0) {
return Cluster.NullAction;
}
// Decide who is most likely to need another region
int thisRegionCount = cluster.getNumRegions(thisServer);
int otherRegionCount = cluster.getNumRegions(otherServer);
// Assign the chance based upon the above
double thisChance = (thisRegionCount > otherRegionCount) ? 0 : 0.5;
double otherChance = (thisRegionCount <= otherRegionCount) ? 0 : 0.5;
int thisRegion = pickRandomRegion(cluster, thisServer, thisChance);
int otherRegion = pickRandomRegion(cluster, otherServer, otherChance);
return getAction(thisServer, thisRegion, otherServer, otherRegion);
}
protected Cluster.Action getAction (int fromServer, int fromRegion,
int toServer, int toRegion) {
if (fromServer < 0 || toServer < 0) {
return Cluster.NullAction;
}
if (fromRegion > 0 && toRegion > 0) {
return new Cluster.SwapRegionsAction(fromServer, fromRegion,
toServer, toRegion);
} else if (fromRegion > 0) {
return new Cluster.MoveRegionAction(fromRegion, fromServer, toServer);
} else if (toRegion > 0) {
return new Cluster.MoveRegionAction(toRegion, toServer, fromServer);
} else {
return Cluster.NullAction;
}
}
}
static class RandomCandidateGenerator extends CandidateGenerator {
@Override
Cluster.Action generate(Cluster cluster) {
int thisServer = pickRandomServer(cluster);
// Pick the other server
int otherServer = pickOtherRandomServer(cluster, thisServer);
return pickRandomRegions(cluster, thisServer, otherServer);
}
}
static class LoadCandidateGenerator extends CandidateGenerator {
@Override
Cluster.Action generate(Cluster cluster) {
cluster.sortServersByRegionCount();
int thisServer = pickMostLoadedServer(cluster, -1);
int otherServer = pickLeastLoadedServer(cluster, thisServer);
return pickRandomRegions(cluster, thisServer, otherServer);
}
private int pickLeastLoadedServer(final Cluster cluster, int thisServer) {
Integer[] servers = cluster.serverIndicesSortedByRegionCount;
int index = 0;
while (servers[index] == null || servers[index] == thisServer) {
index++;
if (index == servers.length) {
return -1;
}
}
return servers[index];
}
private int pickMostLoadedServer(final Cluster cluster, int thisServer) {
Integer[] servers = cluster.serverIndicesSortedByRegionCount;
int index = servers.length - 1;
while (servers[index] == null || servers[index] == thisServer) {
index--;
if (index < 0) {
return -1;
}
}
return servers[index];
}
}
static class LocalityBasedCandidateGenerator extends CandidateGenerator {
private MasterServices masterServices;
LocalityBasedCandidateGenerator(MasterServices masterServices) {
this.masterServices = masterServices;
}
@Override
Cluster.Action generate(Cluster cluster) {
if (this.masterServices == null) {
int thisServer = pickRandomServer(cluster);
// Pick the other server
int otherServer = pickOtherRandomServer(cluster, thisServer);
return pickRandomRegions(cluster, thisServer, otherServer);
}
cluster.calculateRegionServerLocalities();
// Pick server with lowest locality
int thisServer = pickLowestLocalityServer(cluster);
int thisRegion;
if (thisServer == -1) {
LOG.warn("Could not pick lowest locality region server");
return Cluster.NullAction;
} else {
// Pick lowest locality region on this server
thisRegion = pickLowestLocalityRegionOnServer(cluster, thisServer);
}
if (thisRegion == -1) {
return Cluster.NullAction;
}
// Pick the least loaded server with good locality for the region
int otherServer = cluster.getLeastLoadedTopServerForRegion(thisRegion);
if (otherServer == -1) {
return Cluster.NullAction;
}
// Let the candidate region be moved to its highest locality server.
int otherRegion = -1;
return getAction(thisServer, thisRegion, otherServer, otherRegion);
}
private int pickLowestLocalityServer(Cluster cluster) {
return cluster.getLowestLocalityRegionServer();
}
private int pickLowestLocalityRegionOnServer(Cluster cluster, int server) {
return cluster.getLowestLocalityRegionOnServer(server);
}
void setServices(MasterServices services) {
this.masterServices = services;
}
}
/**
* Generates candidates which moves the replicas out of the region server for
* co-hosted region replicas
*/
static class RegionReplicaCandidateGenerator extends CandidateGenerator {
RandomCandidateGenerator randomGenerator = new RandomCandidateGenerator();
/**
* Randomly select one regionIndex out of all region replicas co-hosted in the same group
* (a group is a server, host or rack)
* @param primariesOfRegionsPerGroup either Cluster.primariesOfRegionsPerServer,
* primariesOfRegionsPerHost or primariesOfRegionsPerRack
* @param regionsPerGroup either Cluster.regionsPerServer, regionsPerHost or regionsPerRack
* @param regionIndexToPrimaryIndex Cluster.regionsIndexToPrimaryIndex
* @return a regionIndex for the selected primary or -1 if there is no co-locating
*/
int selectCoHostedRegionPerGroup(int[] primariesOfRegionsPerGroup, int[] regionsPerGroup
, int[] regionIndexToPrimaryIndex) {
int currentPrimary = -1;
int currentPrimaryIndex = -1;
int selectedPrimaryIndex = -1;
double currentLargestRandom = -1;
// primariesOfRegionsPerGroup is a sorted array. Since it contains the primary region
// ids for the regions hosted in server, a consecutive repetition means that replicas
// are co-hosted
for (int j = 0; j <= primariesOfRegionsPerGroup.length; j++) {
int primary = j < primariesOfRegionsPerGroup.length
? primariesOfRegionsPerGroup[j] : -1;
if (primary != currentPrimary) { // check for whether we see a new primary
int numReplicas = j - currentPrimaryIndex;
if (numReplicas > 1) { // means consecutive primaries, indicating co-location
// decide to select this primary region id or not
double currentRandom = RANDOM.nextDouble();
// we don't know how many region replicas are co-hosted, we will randomly select one
// using reservoir sampling (http://gregable.com/2007/10/reservoir-sampling.html)
if (currentRandom > currentLargestRandom) {
selectedPrimaryIndex = currentPrimary;
currentLargestRandom = currentRandom;
}
}
currentPrimary = primary;
currentPrimaryIndex = j;
}
}
// we have found the primary id for the region to move. Now find the actual regionIndex
// with the given primary, prefer to move the secondary region.
for (int j = 0; j < regionsPerGroup.length; j++) {
int regionIndex = regionsPerGroup[j];
if (selectedPrimaryIndex == regionIndexToPrimaryIndex[regionIndex]) {
// always move the secondary, not the primary
if (selectedPrimaryIndex != regionIndex) {
return regionIndex;
}
}
}
return -1;
}
@Override
Cluster.Action generate(Cluster cluster) {
int serverIndex = pickRandomServer(cluster);
if (cluster.numServers <= 1 || serverIndex == -1) {
return Cluster.NullAction;
}
int regionIndex = selectCoHostedRegionPerGroup(
cluster.primariesOfRegionsPerServer[serverIndex],
cluster.regionsPerServer[serverIndex],
cluster.regionIndexToPrimaryIndex);
// if there are no pairs of region replicas co-hosted, default to random generator
if (regionIndex == -1) {
// default to randompicker
return randomGenerator.generate(cluster);
}
int toServerIndex = pickOtherRandomServer(cluster, serverIndex);
int toRegionIndex = pickRandomRegion(cluster, toServerIndex, 0.9f);
return getAction (serverIndex, regionIndex, toServerIndex, toRegionIndex);
}
}
/**
* Generates candidates which moves the replicas out of the rack for
* co-hosted region replicas in the same rack
*/
static class RegionReplicaRackCandidateGenerator extends RegionReplicaCandidateGenerator {
@Override
Cluster.Action generate(Cluster cluster) {
int rackIndex = pickRandomRack(cluster);
if (cluster.numRacks <= 1 || rackIndex == -1) {
return super.generate(cluster);
}
int regionIndex = selectCoHostedRegionPerGroup(
cluster.primariesOfRegionsPerRack[rackIndex],
cluster.regionsPerRack[rackIndex],
cluster.regionIndexToPrimaryIndex);
// if there are no pairs of region replicas co-hosted, default to random generator
if (regionIndex == -1) {
// default to randompicker
return randomGenerator.generate(cluster);
}
int serverIndex = cluster.regionIndexToServerIndex[regionIndex];
int toRackIndex = pickOtherRandomRack(cluster, rackIndex);
int rand = RANDOM.nextInt(cluster.serversPerRack[toRackIndex].length);
int toServerIndex = cluster.serversPerRack[toRackIndex][rand];
int toRegionIndex = pickRandomRegion(cluster, toServerIndex, 0.9f);
return getAction (serverIndex, regionIndex, toServerIndex, toRegionIndex);
}
}
/**
* Base class of StochasticLoadBalancer's Cost Functions.
*/
abstract static class CostFunction {
private float multiplier = 0;
protected Cluster cluster;
CostFunction(Configuration c) {
}
float getMultiplier() {
return multiplier;
}
void setMultiplier(float m) {
this.multiplier = m;
}
/** Called once per LB invocation to give the cost function
* to initialize it's state, and perform any costly calculation.
*/
void init(Cluster cluster) {
this.cluster = cluster;
}
/** Called once per cluster Action to give the cost function
* an opportunity to update it's state. postAction() is always
* called at least once before cost() is called with the cluster
* that this action is performed on. */
void postAction(Action action) {
switch (action.type) {
case NULL: break;
case ASSIGN_REGION:
AssignRegionAction ar = (AssignRegionAction) action;
regionMoved(ar.region, -1, ar.server);
break;
case MOVE_REGION:
MoveRegionAction mra = (MoveRegionAction) action;
regionMoved(mra.region, mra.fromServer, mra.toServer);
break;
case SWAP_REGIONS:
SwapRegionsAction a = (SwapRegionsAction) action;
regionMoved(a.fromRegion, a.fromServer, a.toServer);
regionMoved(a.toRegion, a.toServer, a.fromServer);
break;
default:
throw new RuntimeException("Uknown action:" + action.type);
}
}
protected void regionMoved(int region, int oldServer, int newServer) {
}
abstract double cost();
/**
* Function to compute a scaled cost using {@link DescriptiveStatistics}. It
* assumes that this is a zero sum set of costs. It assumes that the worst case
* possible is all of the elements in one region server and the rest having 0.
*
* @param stats the costs
* @return a scaled set of costs.
*/
protected double costFromArray(double[] stats) {
double totalCost = 0;
double total = getSum(stats);
double count = stats.length;
double mean = total/count;
// Compute max as if all region servers had 0 and one had the sum of all costs. This must be
// a zero sum cost for this to make sense.
double max = ((count - 1) * mean) + (total - mean);
// It's possible that there aren't enough regions to go around
double min;
if (count > total) {
min = ((count - total) * mean) + ((1 - mean) * total);
} else {
// Some will have 1 more than everything else.
int numHigh = (int) (total - (Math.floor(mean) * count));
int numLow = (int) (count - numHigh);
min = (numHigh * (Math.ceil(mean) - mean)) + (numLow * (mean - Math.floor(mean)));
}
min = Math.max(0, min);
for (int i=0; i maxMoves) {
return 1000000; // return a number much greater than any of the other cost
}
return scale(0, cluster.numRegions, moveCost);
}
}
/**
* Compute the cost of a potential cluster state from skew in number of
* regions on a cluster.
*/
static class RegionCountSkewCostFunction extends CostFunction {
private static final String REGION_COUNT_SKEW_COST_KEY =
"hbase.master.balancer.stochastic.regionCountCost";
private static final float DEFAULT_REGION_COUNT_SKEW_COST = 500;
private double[] stats = null;
RegionCountSkewCostFunction(Configuration conf) {
super(conf);
// Load multiplier should be the greatest as it is the most general way to balance data.
this.setMultiplier(conf.getFloat(REGION_COUNT_SKEW_COST_KEY, DEFAULT_REGION_COUNT_SKEW_COST));
}
@Override
double cost() {
if (stats == null || stats.length != cluster.numServers) {
stats = new double[cluster.numServers];
}
for (int i =0; i < cluster.numServers; i++) {
stats[i] = cluster.regionsPerServer[i].length;
}
return costFromArray(stats);
}
}
/**
* Compute the cost of a potential cluster state from skew in number of
* primary regions on a cluster.
*/
static class PrimaryRegionCountSkewCostFunction extends CostFunction {
private static final String PRIMARY_REGION_COUNT_SKEW_COST_KEY =
"hbase.master.balancer.stochastic.primaryRegionCountCost";
private static final float DEFAULT_PRIMARY_REGION_COUNT_SKEW_COST = 500;
private double[] stats = null;
PrimaryRegionCountSkewCostFunction(Configuration conf) {
super(conf);
// Load multiplier should be the greatest as primary regions serve majority of reads/writes.
this.setMultiplier(conf.getFloat(PRIMARY_REGION_COUNT_SKEW_COST_KEY,
DEFAULT_PRIMARY_REGION_COUNT_SKEW_COST));
}
@Override
double cost() {
if (!cluster.hasRegionReplicas) {
return 0;
}
if (stats == null || stats.length != cluster.numServers) {
stats = new double[cluster.numServers];
}
for (int i =0; i < cluster.numServers; i++) {
stats[i] = 0;
for (int regionIdx : cluster.regionsPerServer[i]) {
if (regionIdx == cluster.regionIndexToPrimaryIndex[regionIdx]) {
stats[i] ++;
}
}
}
return costFromArray(stats);
}
}
/**
* Compute the cost of a potential cluster configuration based upon how evenly
* distributed tables are.
*/
static class TableSkewCostFunction extends CostFunction {
private static final String TABLE_SKEW_COST_KEY =
"hbase.master.balancer.stochastic.tableSkewCost";
private static final float DEFAULT_TABLE_SKEW_COST = 35;
TableSkewCostFunction(Configuration conf) {
super(conf);
this.setMultiplier(conf.getFloat(TABLE_SKEW_COST_KEY, DEFAULT_TABLE_SKEW_COST));
}
@Override
double cost() {
double max = cluster.numRegions;
double min = ((double) cluster.numRegions) / cluster.numServers;
double value = 0;
for (int i = 0; i < cluster.numMaxRegionsPerTable.length; i++) {
value += cluster.numMaxRegionsPerTable[i];
}
return scale(min, max, value);
}
}
/**
* Compute a cost of a potential cluster configuration based upon where
* {@link org.apache.hadoop.hbase.regionserver.StoreFile}s are located.
*/
static class LocalityCostFunction extends CostFunction {
private static final String LOCALITY_COST_KEY = "hbase.master.balancer.stochastic.localityCost";
private static final float DEFAULT_LOCALITY_COST = 25;
private MasterServices services;
LocalityCostFunction(Configuration conf, MasterServices srv) {
super(conf);
this.setMultiplier(conf.getFloat(LOCALITY_COST_KEY, DEFAULT_LOCALITY_COST));
this.services = srv;
}
void setServices(MasterServices srvc) {
this.services = srvc;
}
@Override
double cost() {
double max = 0;
double cost = 0;
// If there's no master so there's no way anything else works.
if (this.services == null) {
return cost;
}
for (int i = 0; i < cluster.regionLocations.length; i++) {
max += 1;
int serverIndex = cluster.regionIndexToServerIndex[i];
int[] regionLocations = cluster.regionLocations[i];
// If we can't find where the data is getTopBlock returns null.
// so count that as being the best possible.
if (regionLocations == null) {
continue;
}
int index = -1;
for (int j = 0; j < regionLocations.length; j++) {
if (regionLocations[j] >= 0 && regionLocations[j] == serverIndex) {
index = j;
break;
}
}
if (index < 0) {
cost += 1;
} else {
cost += (1 - cluster.getLocalityOfRegion(i, index));
}
}
return scale(0, max, cost);
}
}
/**
* Base class the allows writing costs functions from rolling average of some
* number from RegionLoad.
*/
abstract static class CostFromRegionLoadFunction extends CostFunction {
private ClusterStatus clusterStatus = null;
private Map> loads = null;
private double[] stats = null;
CostFromRegionLoadFunction(Configuration conf) {
super(conf);
}
void setClusterStatus(ClusterStatus status) {
this.clusterStatus = status;
}
void setLoads(Map> l) {
this.loads = l;
}
@Override
double cost() {
if (clusterStatus == null || loads == null) {
return 0;
}
if (stats == null || stats.length != cluster.numServers) {
stats = new double[cluster.numServers];
}
for (int i =0; i < stats.length; i++) {
//Cost this server has from RegionLoad
long cost = 0;
// for every region on this server get the rl
for(int regionIndex:cluster.regionsPerServer[i]) {
Collection regionLoadList = cluster.regionLoads[regionIndex];
// Now if we found a region load get the type of cost that was requested.
if (regionLoadList != null) {
cost += getRegionLoadCost(regionLoadList);
}
}
// Add the total cost to the stats.
stats[i] = cost;
}
// Now return the scaled cost from data held in the stats object.
return costFromArray(stats);
}
protected double getRegionLoadCost(Collection regionLoadList) {
double cost = 0;
for (RegionLoad rl : regionLoadList) {
double toAdd = getCostFromRl(rl);
if (cost == 0) {
cost = toAdd;
} else {
cost = (.5 * cost) + (.5 * toAdd);
}
}
return cost;
}
protected abstract double getCostFromRl(RegionLoad rl);
}
/**
* Compute the cost of total number of read requests The more unbalanced the higher the
* computed cost will be. This uses a rolling average of regionload.
*/
static class ReadRequestCostFunction extends CostFromRegionLoadFunction {
private static final String READ_REQUEST_COST_KEY =
"hbase.master.balancer.stochastic.readRequestCost";
private static final float DEFAULT_READ_REQUEST_COST = 5;
ReadRequestCostFunction(Configuration conf) {
super(conf);
this.setMultiplier(conf.getFloat(READ_REQUEST_COST_KEY, DEFAULT_READ_REQUEST_COST));
}
@Override
protected double getCostFromRl(RegionLoad rl) {
return rl.getReadRequestsCount();
}
}
/**
* Compute the cost of total number of write requests. The more unbalanced the higher the
* computed cost will be. This uses a rolling average of regionload.
*/
static class WriteRequestCostFunction extends CostFromRegionLoadFunction {
private static final String WRITE_REQUEST_COST_KEY =
"hbase.master.balancer.stochastic.writeRequestCost";
private static final float DEFAULT_WRITE_REQUEST_COST = 5;
WriteRequestCostFunction(Configuration conf) {
super(conf);
this.setMultiplier(conf.getFloat(WRITE_REQUEST_COST_KEY, DEFAULT_WRITE_REQUEST_COST));
}
@Override
protected double getCostFromRl(RegionLoad rl) {
return rl.getWriteRequestsCount();
}
}
/**
* A cost function for region replicas. We give a very high cost to hosting
* replicas of the same region in the same host. We do not prevent the case
* though, since if numReplicas > numRegionServers, we still want to keep the
* replica open.
*/
static class RegionReplicaHostCostFunction extends CostFunction {
private static final String REGION_REPLICA_HOST_COST_KEY =
"hbase.master.balancer.stochastic.regionReplicaHostCostKey";
private static final float DEFAULT_REGION_REPLICA_HOST_COST_KEY = 100000;
long maxCost = 0;
long[] costsPerGroup; // group is either server, host or rack
int[][] primariesOfRegionsPerGroup;
public RegionReplicaHostCostFunction(Configuration conf) {
super(conf);
this.setMultiplier(conf.getFloat(REGION_REPLICA_HOST_COST_KEY,
DEFAULT_REGION_REPLICA_HOST_COST_KEY));
}
@Override
void init(Cluster cluster) {
super.init(cluster);
// max cost is the case where every region replica is hosted together regardless of host
maxCost = cluster.numHosts > 1 ? getMaxCost(cluster) : 0;
costsPerGroup = new long[cluster.numHosts];
primariesOfRegionsPerGroup = cluster.multiServersPerHost // either server based or host based
? cluster.primariesOfRegionsPerHost
: cluster.primariesOfRegionsPerServer;
for (int i = 0 ; i < primariesOfRegionsPerGroup.length; i++) {
costsPerGroup[i] = costPerGroup(primariesOfRegionsPerGroup[i]);
}
}
long getMaxCost(Cluster cluster) {
if (!cluster.hasRegionReplicas) {
return 0; // short circuit
}
// max cost is the case where every region replica is hosted together regardless of host
int[] primariesOfRegions = new int[cluster.numRegions];
System.arraycopy(cluster.regionIndexToPrimaryIndex, 0, primariesOfRegions, 0,
cluster.regions.length);
Arrays.sort(primariesOfRegions);
// compute numReplicas from the sorted array
return costPerGroup(primariesOfRegions);
}
@Override
double cost() {
if (maxCost <= 0) {
return 0;
}
long totalCost = 0;
for (int i = 0 ; i < costsPerGroup.length; i++) {
totalCost += costsPerGroup[i];
}
return scale(0, maxCost, totalCost);
}
/**
* For each primary region, it computes the total number of replicas in the array (numReplicas)
* and returns a sum of numReplicas-1 squared. For example, if the server hosts
* regions a, b, c, d, e, f where a and b are same replicas, and c,d,e are same replicas, it
* returns (2-1) * (2-1) + (3-1) * (3-1) + (1-1) * (1-1).
* @param primariesOfRegions a sorted array of primary regions ids for the regions hosted
* @return a sum of numReplicas-1 squared for each primary region in the group.
*/
protected long costPerGroup(int[] primariesOfRegions) {
long cost = 0;
int currentPrimary = -1;
int currentPrimaryIndex = -1;
// primariesOfRegions is a sorted array of primary ids of regions. Replicas of regions
// sharing the same primary will have consecutive numbers in the array.
for (int j = 0 ; j <= primariesOfRegions.length; j++) {
int primary = j < primariesOfRegions.length ? primariesOfRegions[j] : -1;
if (primary != currentPrimary) { // we see a new primary
int numReplicas = j - currentPrimaryIndex;
// square the cost
if (numReplicas > 1) { // means consecutive primaries, indicating co-location
cost += (numReplicas - 1) * (numReplicas - 1);
}
currentPrimary = primary;
currentPrimaryIndex = j;
}
}
return cost;
}
@Override
protected void regionMoved(int region, int oldServer, int newServer) {
if (maxCost <= 0) {
return; // no need to compute
}
if (cluster.multiServersPerHost) {
int oldHost = cluster.serverIndexToHostIndex[oldServer];
int newHost = cluster.serverIndexToHostIndex[newServer];
if (newHost != oldHost) {
costsPerGroup[oldHost] = costPerGroup(cluster.primariesOfRegionsPerHost[oldHost]);
costsPerGroup[newHost] = costPerGroup(cluster.primariesOfRegionsPerHost[newHost]);
}
} else {
costsPerGroup[oldServer] = costPerGroup(cluster.primariesOfRegionsPerServer[oldServer]);
costsPerGroup[newServer] = costPerGroup(cluster.primariesOfRegionsPerServer[newServer]);
}
}
}
/**
* A cost function for region replicas for the rack distribution. We give a relatively high
* cost to hosting replicas of the same region in the same rack. We do not prevent the case
* though.
*/
static class RegionReplicaRackCostFunction extends RegionReplicaHostCostFunction {
private static final String REGION_REPLICA_RACK_COST_KEY =
"hbase.master.balancer.stochastic.regionReplicaRackCostKey";
private static final float DEFAULT_REGION_REPLICA_RACK_COST_KEY = 10000;
public RegionReplicaRackCostFunction(Configuration conf) {
super(conf);
this.setMultiplier(conf.getFloat(REGION_REPLICA_RACK_COST_KEY, DEFAULT_REGION_REPLICA_RACK_COST_KEY));
}
@Override
void init(Cluster cluster) {
this.cluster = cluster;
if (cluster.numRacks <= 1) {
maxCost = 0;
return; // disabled for 1 rack
}
// max cost is the case where every region replica is hosted together regardless of rack
maxCost = getMaxCost(cluster);
costsPerGroup = new long[cluster.numRacks];
for (int i = 0 ; i < cluster.primariesOfRegionsPerRack.length; i++) {
costsPerGroup[i] = costPerGroup(cluster.primariesOfRegionsPerRack[i]);
}
}
@Override
protected void regionMoved(int region, int oldServer, int newServer) {
if (maxCost <= 0) {
return; // no need to compute
}
int oldRack = cluster.serverIndexToRackIndex[oldServer];
int newRack = cluster.serverIndexToRackIndex[newServer];
if (newRack != oldRack) {
costsPerGroup[oldRack] = costPerGroup(cluster.primariesOfRegionsPerRack[oldRack]);
costsPerGroup[newRack] = costPerGroup(cluster.primariesOfRegionsPerRack[newRack]);
}
}
}
/**
* Compute the cost of total memstore size. The more unbalanced the higher the
* computed cost will be. This uses a rolling average of regionload.
*/
static class MemstoreSizeCostFunction extends CostFromRegionLoadFunction {
private static final String MEMSTORE_SIZE_COST_KEY =
"hbase.master.balancer.stochastic.memstoreSizeCost";
private static final float DEFAULT_MEMSTORE_SIZE_COST = 5;
MemstoreSizeCostFunction(Configuration conf) {
super(conf);
this.setMultiplier(conf.getFloat(MEMSTORE_SIZE_COST_KEY, DEFAULT_MEMSTORE_SIZE_COST));
}
@Override
protected double getCostFromRl(RegionLoad rl) {
return rl.getMemStoreSizeMB();
}
}
/**
* Compute the cost of total open storefiles size. The more unbalanced the higher the
* computed cost will be. This uses a rolling average of regionload.
*/
static class StoreFileCostFunction extends CostFromRegionLoadFunction {
private static final String STOREFILE_SIZE_COST_KEY =
"hbase.master.balancer.stochastic.storefileSizeCost";
private static final float DEFAULT_STOREFILE_SIZE_COST = 5;
StoreFileCostFunction(Configuration conf) {
super(conf);
this.setMultiplier(conf.getFloat(STOREFILE_SIZE_COST_KEY, DEFAULT_STOREFILE_SIZE_COST));
}
@Override
protected double getCostFromRl(RegionLoad rl) {
return rl.getStorefileSizeMB();
}
}
}