/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.cluster.placement.plugins;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.solr.cluster.Cluster;
import org.apache.solr.cluster.Node;
import org.apache.solr.cluster.Replica;
import org.apache.solr.cluster.SolrCollection;
import org.apache.solr.cluster.placement.AttributeFetcher;
import org.apache.solr.cluster.placement.AttributeValues;
import org.apache.solr.cluster.placement.BalanceRequest;
import org.apache.solr.cluster.placement.DeleteCollectionRequest;
import org.apache.solr.cluster.placement.PlacementContext;
import org.apache.solr.cluster.placement.PlacementException;
import org.apache.solr.cluster.placement.PlacementModificationException;
import org.apache.solr.cluster.placement.PlacementPlugin;
import org.apache.solr.cluster.placement.PlacementPluginFactory;
import org.apache.solr.cluster.placement.ReplicaMetric;
import org.apache.solr.cluster.placement.ShardMetrics;
import org.apache.solr.cluster.placement.impl.NodeMetricImpl;
import org.apache.solr.cluster.placement.impl.ReplicaMetricImpl;
import org.apache.solr.common.util.CollectionUtil;
import org.apache.solr.common.util.StrUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This factory is instantiated by config from its class name. Using it is the only way to create
* instances of {@link AffinityPlacementPlugin}.
*
* <p>In order to configure this plugin to be used for placement decisions, the following {@code
* curl} command (or something equivalent) has to be executed once the cluster is already running,
* in order to set the appropriate Zookeeper stored configuration. Replace {@code localhost:8983}
* with the IP address and port of one of your servers.
*
* <pre>
* curl -X POST -H 'Content-type:application/json' -d '{
*   "add": {
*     "name": ".placement-plugin",
*     "class": "org.apache.solr.cluster.placement.plugins.AffinityPlacementFactory",
*     "config": {
*       "minimalFreeDiskGB": 10,
*       "prioritizedFreeDiskGB": 50
*     }
*   }
* }' http://localhost:8983/api/cluster/plugin
* </pre>
*
* <p>In order to delete the placement-plugin section (and to fall back to either Legacy or
* rule-based placement if configured for a collection), execute:
*
* <pre>
* curl -X POST -H 'Content-type:application/json' -d '{
*   "remove" : ".placement-plugin"
* }' http://localhost:8983/api/cluster/plugin
* </pre>
*
* <p>{@link AffinityPlacementPlugin} implements placing replicas in a way that replicates past
* Autoscaling configurations. That specification does the following:
*
* <ul>
*   <li>Spread replicas per shard as evenly as possible across multiple availability zones (given
*       by a sys prop),
*   <li>assign replicas based on replica type to specific kinds of nodes (another sys prop), and
*   <li>avoid having more than one replica per shard on the same node.
*   <li>Only after these constraints are satisfied does the plugin minimize cores per node or
*       disk usage.
* </ul>
*
* <p>This plugin achieves this by creating an {@link AffinityPlacementPlugin.AffinityNode} that
* weights nodes very high if they are unbalanced with respect to AvailabilityZone and
* SpreadDomain. See {@link AffinityPlacementPlugin.AffinityNode} for more information on how this
* weighting helps the plugin correctly place and balance replicas.
*
* <p>This code is a realistic placement computation, based on a few assumptions. The code is
* written in such a way as to make it relatively easy to adapt it to (somewhat) different
* assumptions. Additional configuration options could be introduced to allow configuration-based
* option selection as well.
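*
* <p>As an illustration (the system property names come from the {@code *_SYSPROP} constants in
* {@link AffinityPlacementConfig}; the values here are hypothetical), nodes advertise their
* availability zone, node type, and supported replica types at startup:
*
* <pre>
* bin/solr start -Davailability_zone=us-east-1a -Dnode_type=searcher -Dreplica_type=NRT,PULL
* </pre>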
*/
public class AffinityPlacementFactory implements PlacementPluginFactory<AffinityPlacementConfig> {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
AffinityPlacementConfig config = AffinityPlacementConfig.DEFAULT;
/**
* Empty public constructor, used to instantiate this factory. Using a factory pattern allows
* one-time costly operations to be done if needed, and only requires instantiating a
* default-constructor class by name, rather than calling a constructor with more parameters (as
* would be needed if the plugin class were instantiated directly without going through a
* factory).
*/
public AffinityPlacementFactory() {}
@Override
public PlacementPlugin createPluginInstance() {
config.validate();
return new AffinityPlacementPlugin(
config.minimalFreeDiskGB,
config.prioritizedFreeDiskGB,
config.withCollection,
config.withCollectionShards,
config.collectionNodeType,
config.spreadAcrossDomains);
}
@Override
public void configure(AffinityPlacementConfig cfg) {
Objects.requireNonNull(cfg, "configuration must never be null");
cfg.validate();
this.config = cfg;
}
@Override
public AffinityPlacementConfig getConfig() {
return config;
}
/**
* See {@link AffinityPlacementFactory} for instructions on how to configure a cluster to use this
* plugin and details on what the plugin does.
*/
public static class AffinityPlacementPlugin extends OrderedNodePlacementPlugin {
private final long minimalFreeDiskGB;
private final long prioritizedFreeDiskGB;
// primary to secondary (1:1)
private final Map<String, String> withCollections;
// same but shard-wise
private final Map<String, String> withCollectionShards;
// secondary to primary (1:N) + shard-wise primary (1:N)
private final Map<String, Set<String>> collocatedWith;
private final Map<String, Set<String>> nodeTypes;
private final boolean spreadAcrossDomains;
/**
* The factory has decoded the configuration for the plugin instance and passes it the
* parameters it needs.
*/
AffinityPlacementPlugin(
long minimalFreeDiskGB,
long prioritizedFreeDiskGB,
Map<String, String> withCollections,
Map<String, String> withCollectionShards,
Map<String, String> collectionNodeTypes,
boolean spreadAcrossDomains) {
this.minimalFreeDiskGB = minimalFreeDiskGB;
this.prioritizedFreeDiskGB = prioritizedFreeDiskGB;
Objects.requireNonNull(withCollections, "withCollections must not be null");
Objects.requireNonNull(collectionNodeTypes, "collectionNodeTypes must not be null");
Objects.requireNonNull(withCollectionShards, "withCollectionShards must not be null");
this.spreadAcrossDomains = spreadAcrossDomains;
this.withCollections = withCollections;
this.withCollectionShards = withCollectionShards;
Map<String, Set<String>> collocated = new HashMap<>();
// reverse both relations: shard-agnostic and shard-wise
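// e.g. (hypothetical names) withCollections = {books -> authors} and
// withCollectionShards = {reviews -> authors} invert to
// collocated = {authors -> [books, reviews]}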
List.of(this.withCollections, this.withCollectionShards)
.forEach(
direct ->
direct.forEach(
(primary, secondary) ->
collocated
.computeIfAbsent(secondary, s -> new HashSet<>())
.add(primary)));
this.collocatedWith = Collections.unmodifiableMap(collocated);
if (collectionNodeTypes.isEmpty()) {
nodeTypes = Map.of();
} else {
nodeTypes = new HashMap<>();
collectionNodeTypes.forEach(
(coll, typesString) -> {
List<String> types = StrUtils.splitSmart(typesString, ',', true);
if (!types.isEmpty()) {
nodeTypes.put(coll, new HashSet<>(types));
}
});
}
}
@Override
protected void verifyDeleteCollection(
DeleteCollectionRequest deleteCollectionRequest, PlacementContext placementContext)
throws PlacementModificationException {
Cluster cluster = placementContext.getCluster();
Set<String> collocatedCollections =
collocatedWith.getOrDefault(deleteCollectionRequest.getCollection().getName(), Set.of());
for (String primaryName : collocatedCollections) {
try {
if (cluster.getCollection(primaryName) != null) {
// still exists
throw new PlacementModificationException(
"collocated collection "
+ primaryName
+ " of "
+ deleteCollectionRequest.getCollection().getName()
+ " still present");
}
} catch (IOException e) {
throw new PlacementModificationException(
"failed to retrieve collocated collection information", e);
}
}
}
/**
* AffinityPlacementContext is used to share information across {@link AffinityNode} instances.
*
* <p>For instance, with SpreadDomains and AvailabilityZones, the weighting of a Node requires
* information on the contents of other Nodes. This class is how that information is shared.
*
* <p>One AffinityPlacementContext is used for each call to {@link
* #computePlacements(Collection, PlacementContext)} or {@link #computeBalancing(BalanceRequest,
* PlacementContext)}. The state of the context will be altered throughout the computation.
*/
private static final class AffinityPlacementContext {
private final Set<String> allSpreadDomains = new HashSet<>();
// collection -> shard -> spread of replicas over spread domains
private final Map<String, Map<String, ReplicaSpread>> spreadDomainUsage = new HashMap<>();
private final Set<String> allAvailabilityZones = new HashSet<>();
// collection -> shard -> replica type -> spread of replicas over availability zones
private final Map<String, Map<String, Map<Replica.ReplicaType, ReplicaSpread>>>
availabilityZoneUsage = new HashMap<>();
private boolean doSpreadAcrossDomains;
}
@Override
protected Map<Node, WeightedNode> getBaseWeightedNodes(
PlacementContext placementContext,
Set<Node> nodes,
Iterable<SolrCollection> relevantCollections,
boolean skipNodesWithErrors)
throws PlacementException {
// Fetch attributes for a superset of all nodes requested amongst the placementRequests
AttributeFetcher attributeFetcher = placementContext.getAttributeFetcher();
attributeFetcher
.requestNodeSystemProperty(AffinityPlacementConfig.AVAILABILITY_ZONE_SYSPROP)
.requestNodeSystemProperty(AffinityPlacementConfig.NODE_TYPE_SYSPROP)
.requestNodeSystemProperty(AffinityPlacementConfig.REPLICA_TYPE_SYSPROP)
.requestNodeSystemProperty(AffinityPlacementConfig.SPREAD_DOMAIN_SYSPROP);
attributeFetcher
.requestNodeMetric(NodeMetricImpl.NUM_CORES)
.requestNodeMetric(NodeMetricImpl.FREE_DISK_GB);
Set<ReplicaMetric<?>> replicaMetrics = Set.of(ReplicaMetricImpl.INDEX_SIZE_GB);
Set<String> requestedCollections = new HashSet<>();
for (SolrCollection collection : relevantCollections) {
if (requestedCollections.add(collection.getName())) {
attributeFetcher.requestCollectionMetrics(collection, replicaMetrics);
}
}
attributeFetcher.fetchFrom(nodes);
final AttributeValues attrValues = attributeFetcher.fetchAttributes();
AffinityPlacementContext affinityPlacementContext = new AffinityPlacementContext();
affinityPlacementContext.doSpreadAcrossDomains = spreadAcrossDomains;
Map<Node, WeightedNode> affinityNodeMap = CollectionUtil.newHashMap(nodes.size());
for (Node node : nodes) {
AffinityNode affinityNode =
newNodeFromMetrics(node, attrValues, affinityPlacementContext, skipNodesWithErrors);
if (affinityNode != null) {
affinityNodeMap.put(node, affinityNode);
}
}
// If there are not multiple spreadDomains, then there is nothing to spread across
if (affinityPlacementContext.allSpreadDomains.size() < 2) {
affinityPlacementContext.doSpreadAcrossDomains = false;
}
return affinityNodeMap;
}
AffinityNode newNodeFromMetrics(
Node node,
AttributeValues attrValues,
AffinityPlacementContext affinityPlacementContext,
boolean skipNodesWithErrors)
throws PlacementException {
// Parse the comma-separated replica_type sysprop (e.g. "NRT,PULL"); invalid values are
// logged as warnings and skipped
Set<Replica.ReplicaType> supportedReplicaTypes =
attrValues.getSystemProperty(node, AffinityPlacementConfig.REPLICA_TYPE_SYSPROP).stream()
.flatMap(s -> Arrays.stream(s.split(",")))
.map(String::trim)
.map(s -> s.toUpperCase(Locale.ROOT))
.map(
s -> {
try {
return Replica.ReplicaType.valueOf(s);
} catch (IllegalArgumentException e) {
log.warn(
"Node {} has an invalid value for the {} systemProperty: {}",
node.getName(),
AffinityPlacementConfig.REPLICA_TYPE_SYSPROP,
s);
return null;
}
})
.collect(Collectors.toSet());
if (supportedReplicaTypes.isEmpty()) {
// If the property is not defined on a node, or contains only whitespace, assume the node
// can accept any replica type
supportedReplicaTypes = Set.of(Replica.ReplicaType.values());
}
Set<String> nodeType;
Optional<String> nodePropOpt =
attrValues.getSystemProperty(node, AffinityPlacementConfig.NODE_TYPE_SYSPROP);
if (nodePropOpt.isEmpty()) {
nodeType = Collections.emptySet();
} else {
nodeType = new HashSet<>(StrUtils.splitSmart(nodePropOpt.get(), ','));
}
Optional<Double> nodeFreeDiskGB = attrValues.getNodeMetric(node, NodeMetricImpl.FREE_DISK_GB);
Optional<Integer> nodeNumCores = attrValues.getNodeMetric(node, NodeMetricImpl.NUM_CORES);
String az =
attrValues
.getSystemProperty(node, AffinityPlacementConfig.AVAILABILITY_ZONE_SYSPROP)
.orElse(AffinityPlacementConfig.UNDEFINED_AVAILABILITY_ZONE);
affinityPlacementContext.allAvailabilityZones.add(az);
String spreadDomain;
if (affinityPlacementContext.doSpreadAcrossDomains) {
spreadDomain =
attrValues
.getSystemProperty(node, AffinityPlacementConfig.SPREAD_DOMAIN_SYSPROP)
.orElse(null);
if (spreadDomain == null) {
if (log.isWarnEnabled()) {
log.warn(
"AffinityPlacementPlugin configured to spread across domains, but node {} does not have the {} system property. Ignoring spreadAcrossDomains.",
node.getName(),
AffinityPlacementConfig.SPREAD_DOMAIN_SYSPROP);
}
// Stop using spreadDomains for the rest of this computation, because we found a node
// without a spread domain.
affinityPlacementContext.doSpreadAcrossDomains = false;
affinityPlacementContext.allSpreadDomains.clear();
} else {
affinityPlacementContext.allSpreadDomains.add(spreadDomain);
}
} else {
spreadDomain = null;
}
if (nodeFreeDiskGB.isEmpty() && skipNodesWithErrors) {
if (log.isWarnEnabled()) {
log.warn(
"Unknown free disk on node {}, excluding it from placement decisions.",
node.getName());
}
return null;
} else if (nodeNumCores.isEmpty() && skipNodesWithErrors) {
if (log.isWarnEnabled()) {
log.warn(
"Unknown number of cores on node {}, excluding it from placement decisions.",
node.getName());
}
return null;
} else {
return new AffinityNode(
node,
attrValues,
affinityPlacementContext,
supportedReplicaTypes,
nodeType,
nodeNumCores.orElse(0),
nodeFreeDiskGB.orElse(0D),
az,
spreadDomain);
}
}
/**
* This implementation weights nodes in order to achieve balancing across AvailabilityZones and
* SpreadDomains, while trying to minimize the number of replicas on a node and to ensure a given
* amount of free disk space per node. This implementation also supports limiting the placement
* of certain replica types per node and co-locating collections.
*
* <p>The total weight of the AffinityNode is the sum of:
*
* <ul>
*   <li>The number of replicas on the node
*   <li>100 if the free disk space on the node &lt; prioritizedFreeDiskGB, otherwise 0
*   <li>If SpreadDomains are used:<br>
*       10,000 * the sum over each collection/shard of:
*       <ul>
*         <li>(# of replicas in this node's spread domain - the minimum spreadDomain's
*             replicaCount)^2
*         <li>These are individually squared to penalize higher values when summing up all values
*       </ul>
*   <li>If AvailabilityZones are used:<br>
*       1,000,000 * the sum over each collection/shard/replicaType of:
*       <ul>
*         <li>(# of replicas in this node's AZ - the minimum AZ's replicaCount)^2
*         <li>These are individually squared to penalize higher values when summing up all values
*       </ul>
* </ul>
*
* <p>The weighting here ensures that the order of importance for nodes is:
*
* <ol>
*   <li>Spread replicas of the same shard/replicaType across availabilityZones
*   <li>Spread replicas of the same shard across spreadDomains
*   <li>Make sure that replicas are not placed on nodes that have &lt; prioritizedFreeDiskGB disk
*       space available
*   <li>Minimize the number of replicas on the node
* </ol>
*
* <p>The "relevant" weight with a replica is the sum of:
*
* <ul>
*   <li>The number of replicas on the node
*   <li>100 if the projected free disk space on the node &lt; prioritizedFreeDiskGB, otherwise 0
*   <li>If SpreadDomains are used:<br>
*       10,000 * (# of replicas for the replica's shard in this node's spread domain - the
*       minimum spreadDomain's replicaCount)
*   <li>If AvailabilityZones are used:<br>
*       1,000,000 * (# of replicas for the replica's shard &amp; replicaType in this node's AZ -
*       the minimum AZ's replicaCount)
* </ul>
*
* <p>Multiple replicas of the same shard are not permitted to live on the same Node.
*
* <p>Users can specify withCollection to ensure that co-placement of replicas is maintained when
* computing new replica placements or balancing replicas.
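*
* <p>A worked example with hypothetical numbers: a node hosting 7 cores, with free disk below
* prioritizedFreeDiskGB, one replica over the minimum in its spread domain for a single
* collection/shard, and a balanced AZ layout weighs 7 + 100 + 10,000 * 1^2 + 1,000,000 * 0 =
* 10,107.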
*/
private class AffinityNode extends WeightedNode {
private final AttributeValues attrValues;
private final AffinityPlacementContext affinityPlacementContext;
private final Set<Replica.ReplicaType> supportedReplicaTypes;
private final Set<String> nodeType;
private int coresOnNode;
private double nodeFreeDiskGB;
private final String availabilityZone;
private final String spreadDomain;
AffinityNode(
Node node,
AttributeValues attrValues,
AffinityPlacementContext affinityPlacementContext,
Set<Replica.ReplicaType> supportedReplicaTypes,
Set<String> nodeType,
int coresOnNode,
double nodeFreeDiskGB,
String az,
String spreadDomain) {
super(node);
this.attrValues = attrValues;
this.affinityPlacementContext = affinityPlacementContext;
this.supportedReplicaTypes = supportedReplicaTypes;
this.nodeType = nodeType;
this.coresOnNode = coresOnNode;
this.nodeFreeDiskGB = nodeFreeDiskGB;
this.availabilityZone = az;
this.spreadDomain = spreadDomain;
}
@Override
public int calcWeight() {
return coresOnNode
// Only add 100 if prioritizedFreeDiskGB was provided and the node's freeDisk is lower
// than it
+ 100 * (prioritizedFreeDiskGB > 0 && nodeFreeDiskGB < prioritizedFreeDiskGB ? 1 : 0)
+ 10000 * getSpreadDomainWeight()
+ 1000000 * getAZWeight();
}
@Override
public int calcRelevantWeightWithReplica(Replica replica) {
return coresOnNode
// Only add 100 if prioritizedFreeDiskGB was provided and the node's projected freeDisk
// is lower than it
+ 100
* (prioritizedFreeDiskGB > 0
&& nodeFreeDiskGB - getProjectedSizeOfReplica(replica)
< prioritizedFreeDiskGB
? 1
: 0)
+ 10000 * projectReplicaSpreadWeight(replica)
+ 1000000 * projectAZWeight(replica);
}
@Override
public boolean canAddReplica(Replica replica) {
String collection = replica.getShard().getCollection().getName();
// By default, do not allow two replicas of the same shard on a node
return super.canAddReplica(replica)
// Filter out unsupported replica types
&& supportedReplicaTypes.contains(replica.getType())
// Filter out unsupported node types
&& Optional.ofNullable(nodeTypes.get(collection))
.map(s -> s.stream().anyMatch(nodeType::contains))
.orElse(true)
// Ensure any co-located collections already exist on the Node
&& Optional.ofNullable(withCollections.get(collection))
.map(this::hasCollectionOnNode)
.orElse(true)
// Ensure same shard is collocated if required
&& Optional.ofNullable(withCollectionShards.get(collection))
.map(
shardWiseOf ->
getShardsOnNode(shardWiseOf).contains(replica.getShard().getShardName()))
.orElse(true)
// Ensure the disk space will not go below the minimum if the replica is added
&& (minimalFreeDiskGB <= 0
|| nodeFreeDiskGB - getProjectedSizeOfReplica(replica) > minimalFreeDiskGB);
}
/**
* Return any replicas that cannot be removed because there are collocated collections that
* require the replica to exist.
*
* @param replicas the replicas to remove
* @return any errors for replicas that cannot be removed
*/
@Override
public Map<Replica, String> canRemoveReplicas(Collection<Replica> replicas) {
Map<Replica, String> replicaRemovalExceptions = new HashMap<>();
Map<String, Map<String, Set<Replica>>> removals = new HashMap<>();
for (Replica replica : replicas) {
SolrCollection collection = replica.getShard().getCollection();
Set<String> collocatedCollections = new HashSet<>();
Optional.ofNullable(collocatedWith.get(collection.getName()))
.ifPresent(collocatedCollections::addAll);
collocatedCollections.retainAll(getCollectionsOnNode());
if (collocatedCollections.isEmpty()) {
continue;
}
Stream<String> shardWiseCollocations =
collocatedCollections.stream()
.filter(
priColl -> collection.getName().equals(withCollectionShards.get(priColl)));
final Set<String> mandatoryShardsOrAll =
shardWiseCollocations
.flatMap(priColl -> getShardsOnNode(priColl).stream())
.collect(Collectors.toSet());
// There are collocatedCollections for this shard, so make sure at least one replica of
// this shard remains on the node after the removal
Set<Replica> replicasRemovedForShard =
removals
.computeIfAbsent(
replica.getShard().getCollection().getName(), k -> new HashMap<>())
.computeIfAbsent(replica.getShard().getShardName(), k -> new HashSet<>());
replicasRemovedForShard.add(replica);
// either if all shards are mandatory, or the current one is mandatory
boolean shardWise = false;
if (mandatoryShardsOrAll.isEmpty()
|| (shardWise = mandatoryShardsOrAll.contains(replica.getShard().getShardName()))) {
if (replicasRemovedForShard.size()
>= getReplicasForShardOnNode(replica.getShard()).size()) {
replicaRemovalExceptions.put(
replica,
"co-located with replicas of "
+ (shardWise ? replica.getShard().getShardName() + " of " : "")
+ collocatedCollections);
}
}
}
return replicaRemovalExceptions;
}
@Override
protected boolean addProjectedReplicaWeights(Replica replica) {
nodeFreeDiskGB -= getProjectedSizeOfReplica(replica);
coresOnNode += 1;
return addReplicaToAzAndSpread(replica);
}
@Override
protected void initReplicaWeights(Replica replica) {
addReplicaToAzAndSpread(replica);
}
private boolean addReplicaToAzAndSpread(Replica replica) {
boolean needsResort = false;
// Only use AvailabilityZones if there are more than 1
if (affinityPlacementContext.allAvailabilityZones.size() > 1) {
needsResort |=
affinityPlacementContext
.availabilityZoneUsage
.computeIfAbsent(
replica.getShard().getCollection().getName(), k -> new HashMap<>())
.computeIfAbsent(replica.getShard().getShardName(), k -> new HashMap<>())
.computeIfAbsent(
replica.getType(),
k -> new ReplicaSpread(affinityPlacementContext.allAvailabilityZones))
.addReplica(availabilityZone);
}
// Only use SpreadDomains if they have been provided to all nodes and there are more than 1
if (affinityPlacementContext.doSpreadAcrossDomains) {
needsResort |=
affinityPlacementContext
.spreadDomainUsage
.computeIfAbsent(
replica.getShard().getCollection().getName(), k -> new HashMap<>())
.computeIfAbsent(
replica.getShard().getShardName(),
k -> new ReplicaSpread(affinityPlacementContext.allSpreadDomains))
.addReplica(spreadDomain);
}
return needsResort;
}
@Override
protected void removeProjectedReplicaWeights(Replica replica) {
nodeFreeDiskGB += getProjectedSizeOfReplica(replica);
coresOnNode -= 1;
// Only use AvailabilityZones if there are more than 1
if (affinityPlacementContext.allAvailabilityZones.size() > 1) {
Optional.ofNullable(
affinityPlacementContext.availabilityZoneUsage.get(
replica.getShard().getCollection().getName()))
.map(m -> m.get(replica.getShard().getShardName()))
.map(m -> m.get(replica.getType()))
.ifPresent(m -> m.removeReplica(availabilityZone));
}
// Only use SpreadDomains if they have been provided to all nodes and there are more than 1
if (affinityPlacementContext.doSpreadAcrossDomains) {
Optional.ofNullable(
affinityPlacementContext.spreadDomainUsage.get(
replica.getShard().getCollection().getName()))
.map(m -> m.get(replica.getShard().getShardName()))
.ifPresent(m -> m.removeReplica(spreadDomain));
}
}
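/**
* Projected disk size of a replica, estimated from the INDEX_SIZE_GB metric of its shard
* leader; defaults to 0 when the metric is unavailable.
*/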
private double getProjectedSizeOfReplica(Replica replica) {
return attrValues
.getCollectionMetrics(replica.getShard().getCollection().getName())
.flatMap(colMetrics -> colMetrics.getShardMetrics(replica.getShard().getShardName()))
.flatMap(ShardMetrics::getLeaderMetrics)
.flatMap(lrm -> lrm.getReplicaMetric(ReplicaMetricImpl.INDEX_SIZE_GB))
.orElse(0D);
}
/**
* If more than one spread domain is given in the cluster, return a weight for this node based
* on the number of replicas in its spread domain.
*
* <p>For each Collection &amp; Shard, sum up the number of replicas this node's SpreadDomain
* has over the minimum SpreadDomain. Square each value before summing, so that a small number
* of large imbalances is penalized more than a large number of small ones.
*
* @return the weight
*/
private int getSpreadDomainWeight() {
if (affinityPlacementContext.doSpreadAcrossDomains) {
return affinityPlacementContext.spreadDomainUsage.values().stream()
.flatMap(m -> m.values().stream())
.mapToInt(rs -> rs.overMinimum(spreadDomain))
.map(i -> i * i)
.sum();
} else {
return 0;
}
}
/**
* If more than one spread domain is given in the cluster, return a projected SpreadDomain
* weight for this node and this replica.
*
* <p>For the new replica's Collection &amp; Shard, project the number of replicas this node's
* SpreadDomain would have over the minimum SpreadDomain.
*
* @return the weight
*/
private int projectReplicaSpreadWeight(Replica replica) {
if (replica != null && affinityPlacementContext.doSpreadAcrossDomains) {
return Optional.ofNullable(
affinityPlacementContext.spreadDomainUsage.get(
replica.getShard().getCollection().getName()))
.map(m -> m.get(replica.getShard().getShardName()))
.map(rs -> rs.projectOverMinimum(spreadDomain, 1))
.orElse(0);
} else {
return 0;
}
}
/**
* If more than one AvailabilityZone is given in the cluster, return a weight for this node
* based on the number of replicas in its availability zone.
*
* <p>For each Collection, Shard &amp; ReplicaType, sum up the number of replicas this node's
* AvailabilityZone has over the minimum AvailabilityZone. Square each value before summing, so
* that a small number of large imbalances is penalized more than a large number of small ones.
*
* @return the weight
*/
private int getAZWeight() {
if (affinityPlacementContext.allAvailabilityZones.size() < 2) {
return 0;
} else {
return affinityPlacementContext.availabilityZoneUsage.values().stream()
.flatMap(m -> m.values().stream())
.flatMap(m -> m.values().stream())
.mapToInt(rs -> rs.overMinimum(availabilityZone))
.map(i -> i * i)
.sum();
}
}
/**
* If more than one AvailabilityZone is given in the cluster, return a projected
* AvailabilityZone weight for this node and this replica.
*
* <p>For the new replica's Collection, Shard &amp; ReplicaType, project the number of replicas
* this node's AvailabilityZone would have over the minimum AvailabilityZone.
*
* @return the weight
*/
private int projectAZWeight(Replica replica) {
if (replica == null || affinityPlacementContext.allAvailabilityZones.size() < 2) {
return 0;
} else {
return Optional.ofNullable(
affinityPlacementContext.availabilityZoneUsage.get(
replica.getShard().getCollection().getName()))
.map(m -> m.get(replica.getShard().getShardName()))
.map(m -> m.get(replica.getType()))
.map(rs -> rs.projectOverMinimum(availabilityZone, 1))
.orElse(0);
}
}
}
private static class ReplicaSpread {
private final Set<String> allKeys;
private final Map<String, Integer> spread;
private int minReplicasLocated;
private ReplicaSpread(Set<String> allKeys) {
this.allKeys = allKeys;
this.spread = new HashMap<>();
this.minReplicasLocated = 0;
}
int overMinimum(String key) {
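// e.g. (hypothetical) spread = {zoneA=2, zoneB=1, zoneC=1} with minReplicasLocated = 1:
// overMinimum("zoneA") == 1 while overMinimum("zoneB") == 0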
return spread.getOrDefault(key, 0) - minReplicasLocated;
}
/**
* Project the {@link #overMinimum(String)} value as if {@code replicaDelta} replicas were
* added for the given spread key. Any replica added temporarily for the projection is removed
* again, so that the state is unchanged from when the method was called.
*/
int projectOverMinimum(String key, int replicaDelta) {
int overMinimum = overMinimum(key);
if (overMinimum == 0 && replicaDelta > 0) {
addReplica(key);
int projected = overMinimum(key);
removeReplica(key);
return projected;
} else {
return Integer.max(0, overMinimum + replicaDelta);
}
}
/**
* Add a replica for the given spread key, returning whether a full resorting is needed for
* AffinityNodes. Resorting is only needed if other nodes could possibly have a lower weight
* than before.
*
* @param key the spread key for the replica that should be added
* @return whether a re-sort is required
*/
boolean addReplica(String key) {
int previous = spread.getOrDefault(key, 0);
spread.put(key, previous + 1);
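// e.g. (hypothetical) allKeys = {A, B} and spread was {A=1}: adding a replica for B makes
// spread = {A=1, B=1}, raises minReplicasLocated from 0 to 1, and returns true, since nodes
// in A now sit less far over the minimum and may weigh less than before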
if (allKeys.size() > 0
&& spread.size() == allKeys.size()
&& previous == minReplicasLocated) {
minReplicasLocated = spread.values().stream().mapToInt(Integer::intValue).min().orElse(0);
return true;
}
return false;
}
void removeReplica(String key) {
Integer replicasLocated = spread.computeIfPresent(key, (k, v) -> v - 1 == 0 ? null : v - 1);
if (replicasLocated == null) {
replicasLocated = 0;
}
if (replicasLocated < minReplicasLocated) {
minReplicasLocated = replicasLocated;
}
}
}
}
}