/*-
* Copyright (C) 2011, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This file was distributed by Oracle as part of a version of Oracle NoSQL
* Database made available at:
*
* http://www.oracle.com/technetwork/database/database-technologies/nosqldb/downloads/index.html
*
* Please see the LICENSE file included in the top-level directory of the
* appropriate version of Oracle NoSQL Database for a copy of the license and
* additional information.
*/
package oracle.kv.impl.topo.split;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import oracle.kv.Consistency;
import oracle.kv.impl.topo.Datacenter;
import oracle.kv.impl.topo.PartitionId;
import oracle.kv.impl.topo.RepGroupId;
import oracle.kv.impl.topo.Topology;
/**
* Class for constructing groupings of partitions (splits) for use by components
* that want to optimally iterate over the store in parallel. A set of splits
* returned by one of the create methods of this class will contain entries for
* every partition in the store. A consumer of this set must iterate over the
* partitions contained in each split in order to iterate over the complete
* store. The goal is to provide a path for multiple processes, each operating
* on a single split from the same set, to iterate over the store with minimal
* contention on the rep nodes, whether from a single process using multiple
* threads or from other processes.
*
* The set of splits can be constructed in different ways to meet specific
* needs. In general, each split contains one or more partition sets in a list.
* The intent is that the user should iterate over one partition set at a time.
* A set may contain one or more partitions which can be accessed in
* parallel with limited contention.
*
* There are two basic ways in which a set of splits can be created: a
* "partition split" and a "shard split". These differ in how the partitions
* are allocated to each split.
*
* Partition splits, returned by createPartitionSplits(int, Consistency),
* assign partitions to each split sequentially, based on simple division.
* A partition split can be calculated by independent processes and will
* yield exactly the same set of splits and partition sets for a given store.
*
* Shard splits, returned by createShardSplits(int, Consistency) and
* createShardSplits(Consistency), assign partitions based on membership in
* a shard. A shard split is dependent on a specific topology version and may
* not produce the same results given different topology versions of the same
* store.
*
* In either case, split optimization is done at the time of creation.
* If the topology or replication node mastership changes between when the
* split is created and when it is used, some loss in efficiency may result.
*
* Split optimization is highly dependent on the topology. For example, more
* rep nodes will permit a higher level of concurrency without contention.
* Also, if the number of splits is specified in the create methods
* (vs. automatically generated) it may not be possible to create an
* optimal set.
*
* Note that an implicit assumption in all of this is that partitions of a
* given store are roughly the same size, and take the same amount of time
* to iterate over. Furthermore, the assignment of partitions to a set assumes
* that the request dispatcher will distribute accesses evenly across the
* nodes in a shard (as consistency requirements allow).
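*
* A minimal usage sketch. The getPartitionSets() accessor on TopoSplit is
* assumed here for illustration only and may not match the actual TopoSplit
* API:
* <pre>{@code
* // topology is the store's Topology
* SplitBuilder builder = new SplitBuilder(topology);
* List<TopoSplit> splits = builder.createShardSplits(Consistency.NONE_REQUIRED);
* for (TopoSplit split : splits) {
*     // hand each split to its own worker; within a worker, process one
*     // partition set at a time, in order
*     for (Set<Integer> partitionSet : split.getPartitionSets()) { // assumed
*         // the partitions within a set may be iterated in parallel
*     }
* }
* }</pre>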
*/
public class SplitBuilder {
/* The target number of concurrent streams accessing each RN */
private static final int TARGET_STREAMS_PER_RN = 2;
/*
* The target number of concurrent streams from each split. This value is
* a somewhat arbitrary minimum used unless the requested number of splits
* results in a higher value.
*/
private static final int TARGET_STREAMS_PER_SPLIT = 3;
private final Topology topo;
private final int storeRf;
/**
* Constructs a split builder with the specified topology. All subsequent
* calls to create methods will use that topology.
*
* @param topology the topology to use for this builder
*/
public SplitBuilder(Topology topology) {
this.topo = topology;
if (topo.getRepGroupMap().size() < 1) {
throw new IllegalArgumentException("Number of shards in store is 0");
}
int totalRf = 0;
for (Datacenter dc : topo.getDatacenterMap().getAll()) {
if (dc.getDatacenterType().isPrimary()) {
totalRf += dc.getRepFactor();
}
}
if (totalRf == 0) {
throw new IllegalArgumentException("Store replication factor is 0");
}
storeRf = totalRf;
}
/**
* Creates a list of splits based on partitions. This method creates the
* specified number of splits where the number of partitions in each split
* is the total number divided by nSplits. An attempt is made to order the
* partitions within the splits to minimize contention on the rep
* nodes. This method is safe to use in independent processes, since the
* same splits will be generated for a given store independent of changes
* in the topology.
*
* If the number of splits is greater than the number of partitions, some
* splits may be empty (i.e. contain no partition sets).
*
* Note: The above guarantee assumes that the number of partitions of
* a store cannot change.
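*
* A usage sketch (builder is a SplitBuilder for the store's topology; the
* numbers are illustrative only):
* <pre>{@code
* // Assume a store with 100 partitions.
* List<TopoSplit> splits = builder.createPartitionSplits(8, Consistency.NONE_REQUIRED);
* // splitSize = 100 / 8 = 12, so splits 1-7 cover partitions 1-12, 13-24,
* // ..., 73-84, and split 8 covers the remaining partitions 85-100.
* }</pre>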
*
* @param nSplits the number of splits to create
* @param consistency the consistency used to access the store
* @return a list of splits based on partitions
*/
public List<TopoSplit> createPartitionSplits(int nSplits, Consistency consistency) {
if (nSplits < 1) {
throw new IllegalArgumentException("nSplits must be > 0");
}
final List<TopoSplit> splits = new ArrayList<>(nSplits);
for (int i = 1; i <= nSplits; i++) {
splits.add(new TopoSplit(i));
}
final int nShards = topo.getRepGroupMap().size();
/*
* This assumes accesses to a shard are spread across the RNs
* housing the shard's partitions.
*/
final int streamsPerShard =
calcShardConcurrency(consistency) * TARGET_STREAMS_PER_RN;
/*
* totalStreams is the optimal number of streams hitting the store.
*/
final int totalStreams = streamsPerShard * nShards;
/*
* Calc the streams per split, rounding up if necessary so that
* totalStreams are created. Note that we don't go over totalStreams,
* which could result in the last split having fewer partitions in each
* set.
*/
int streamsPerSplit = totalStreams / nSplits;
if ((totalStreams % nSplits) != 0) {
streamsPerSplit += 1;
}
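/*
* Illustrative example (assumed numbers, not from the source): with 3
* shards, a store replication factor of 3 and Consistency.NONE_REQUIRED,
* streamsPerShard is 3 * 2 = 6 and totalStreams is 18; a request for
* nSplits = 4 then yields streamsPerSplit = ceil(18 / 4) = 5.
*/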
final int nPartitions = topo.getPartitionMap().getNPartitions();
final int splitSize = (nPartitions < nSplits) ? 1 : nPartitions/nSplits;
int start = 1;
int end = splitSize;
for (TopoSplit split : splits) {
/*
* The last split covers all of the remaining partitions. If nPartitions
* does not divide evenly by nSplits, the last split picks up the
* remainder and may contain more partitions than the others.
*/
if (split.getId() == nSplits) {
end = nPartitions;
}
final Map<RepGroupId, Set<Integer>> map = createPartitionMap(start, end);
while (!map.isEmpty()) {
final PartitionSelector selector =
new PartitionSelector(map, streamsPerShard);
final Set<Integer> pSet = selector.getNextSet(streamsPerSplit);
if (pSet == null) {
break;
}
split.add(pSet);
}
/*
* Adjust pointers. If start > nPartitions then we are done; any
* remaining splits will be empty.
*/
start = end+1;
end += splitSize;
if (start > nPartitions) {
break;
}
}
return splits;
}
/**
* Creates a list of topology splits for the store. This method creates
* the specified number of splits based on the specified consistency.
* The ordering of partition sets within each split will be optimized
* to reduce contention on the rep nodes assuming that each split
* is processed in parallel, and the partition sets are processed in
* order. This optimization assumes the size of each partition is
* roughly the same.
*
* TODO - the nSplits parameter has likely come from a Hadoop API, which
* says that the parameter is a hint. In that case it may be better to
* produce the optimized set, independent of the number requested.
*
* @param nSplits the target number of splits to generate
* @param consistency the consistency used to access the store
* @return a list of topology splits
*/
List<TopoSplit> createShardSplits(int nSplits, Consistency consistency) {
if (nSplits < 1) {
throw new IllegalArgumentException("nSplits must be > 0");
}
return createShardSplitsInternal(nSplits, consistency);
}
/**
* Creates a list of topology splits for the store. The number of splits
* returned is the maximum number of splits which can be processed in
* parallel without contention at the storage node given the specified
* consistency. The ordering of partition sets within each split will
* be optimized to reduce contention on the storage nodes assuming that
* each split is processed in parallel, and the partition sets are
* processed in order. This optimization assumes the size of each
* partition is roughly the same.
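*
* A usage sketch (builder is a SplitBuilder for the store's topology; the
* numbers are illustrative only):
* <pre>{@code
* List<TopoSplit> splits = builder.createShardSplits(Consistency.ABSOLUTE);
* // With ABSOLUTE consistency only the master of each shard can be read,
* // so the per-shard concurrency is 1 and streamsPerShard = 2. For a
* // 6-shard store, totalStreams = 12, giving 12 / 3 = 4 splits.
* }</pre>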
*
* @param consistency the consistency used to access the store
* @return a list of topology splits
*/
public List<TopoSplit> createShardSplits(Consistency consistency) {
return createShardSplitsInternal(Integer.MAX_VALUE, consistency);
}
private List<TopoSplit> createShardSplitsInternal(int targetSplits,
Consistency consistency) {
final int nShards = topo.getRepGroupMap().size();
assert nShards != 0;
/*
* This assumes accesses to a shard are spread across the RNs
* housing the shard's partitions.
*/
final int streamsPerShard =
calcShardConcurrency(consistency) * TARGET_STREAMS_PER_RN;
/*
* totalStreams is the optimal number of streams hitting the store.
*/
final int totalStreams = streamsPerShard * nShards;
/*
* Strawman calculation of the optimal number of splits. This assumes
* each split will handle multiple streams. Make sure there is at least
* one split.
*/
final int optimalNSplits = (totalStreams > TARGET_STREAMS_PER_SPLIT) ?
totalStreams / TARGET_STREAMS_PER_SPLIT :
1;
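/*
* Illustrative example (assumed numbers, not from the source): a 10-shard
* store with replication factor 3 and Consistency.NONE_REQUIRED gives
* streamsPerShard = 6 and totalStreams = 60, so the optimal number of
* splits is 60 / TARGET_STREAMS_PER_SPLIT = 20.
*/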
/*
* If the optimal number is at or under the target, go with that;
* otherwise use the target value.
*/
final int nSplits = (optimalNSplits <= targetSplits) ? optimalNSplits :
targetSplits;
final List<TopoSplit> splits = new ArrayList<>(nSplits);
for (int i = 1; i <= nSplits; i++) {
splits.add(new TopoSplit(i));
}
/*
* Calc the streams per split, rounding up if necessary so that
* totalStreams are created. Note that we don't go over totalStreams,
* which could result in the last split having fewer partitions in each
* set.
* TODO - Is it possible to spread the shorter sets around?
*/
int streamsPerSplit = totalStreams / nSplits;
if ((totalStreams % nSplits) != 0) {
streamsPerSplit += 1;
}
final Map<RepGroupId, Set<Integer>> map =
createPartitionMap(1, topo.getPartitionMap().getNPartitions());
/*
* Loop through the splits, picking off streamsPerShard partitions from
* each group as we go. This continues until all partitions are
* selected.
*/
while (!map.isEmpty()) {
final PartitionSelector selector =
new PartitionSelector(map, streamsPerShard);
/*
* Note that one pass through the splits will consume up to
* totalStreams partitions.
*/
for (TopoSplit split : splits) {
final Set<Integer> pSet = selector.getNextSet(streamsPerSplit);
if (pSet == null) {
break;
}
split.add(pSet);
}
assert selector.getNextSet(streamsPerSplit) == null;
}
return splits;
}
/**
* Creates a map of rep group => set of partitions in that group for
* the partitions between startPartition and endPartition, inclusive.
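*
* For example (hypothetical layout), partitions 1-6 spread across two
* shards rg1 and rg2 might produce { rg1 -> {1, 3, 5}, rg2 -> {2, 4, 6} }.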
*
* @return a map of rep group to the set of partitions in that group
*/
private Map<RepGroupId, Set<Integer>> createPartitionMap(int startPartition,
int endPartition) {
final int nPartitions = 1+(endPartition-startPartition);
assert startPartition > 0;
assert nPartitions > 0;
final Map<RepGroupId, Set<Integer>> map = new HashMap<>();
for (int i = startPartition; i <= endPartition; i++) {
final PartitionId partId = new PartitionId(i);
final RepGroupId rgid = topo.getRepGroupId(partId);
Set<Integer> parts = map.get(rgid);
if (parts == null) {
parts = new HashSet<>();
map.put(rgid, parts);
}
parts.add(i);
}
return map;
}
/**
* Calculates the number of accesses which can be made from a single split
* to a shard without contention. Basically this returns 1, RF-1, or RF,
* depending on the consistency requirements.
*/
@SuppressWarnings("deprecation")
private int calcShardConcurrency(Consistency consistency) {
if (consistency == Consistency.ABSOLUTE) {
return 1;
} else if (consistency == Consistency.NONE_REQUIRED_NO_MASTER) {
return storeRf - 1;
}
return storeRf;
}
/**
* A selector which will generate sets of partitions from a map of
* rep group => partitions in that group. As partitions are selected they
* are removed from the sets in the map. Likewise when a group no longer
* has any partitions it is removed from the map. Therefore, a way to check
* if the selection is complete is to call isEmpty() on the map.
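*
* A sketch of one pass (hash iteration order is unspecified, so the
* concrete elements shown are illustrative only):
* <pre>{@code
* // map = { rg1 -> {1, 2, 3}, rg2 -> {4, 5} }, maxSelectFromShard = 2
* PartitionSelector selector = new PartitionSelector(map, 2);
* selector.getNextSet(3);  // e.g. {1, 2, 4}: at most two from rg1, then rg2
* selector.getNextSet(3);  // e.g. {5}
* selector.getNextSet(3);  // null: end of this pass; {3} is left in the map
* }</pre>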
*/
private class PartitionSelector {
/*
* The maximum number of partitions selected from any given shard
* during a single pass through the partitions.
*/
final int maxSelectFromShard;
/* An iterator over the sets of partitions, one set per shard. */
private final Iterator<Set<Integer>> shardItr;
/* The current partition iterator. */
Iterator<Integer> partitionItr;
/*
* Running total of how many partitions have been selected from the
* current shard.
*/
int selected = 0;
PartitionSelector(Map<RepGroupId, Set<Integer>> map, int maxSelectFromShard) {
this.maxSelectFromShard = maxSelectFromShard;
shardItr = map.values().iterator();
nextPartitionItr();
}
private boolean nextPartitionItr() {
partitionItr = shardItr.hasNext() ? shardItr.next().iterator() :
null;
return partitionItr != null;
}
/*
* Gets the next partition. This will return the next partition in the
* current shard. Once maxSelectFromShard partitions have been taken,
* move on to the next shard. Returns null when all shards have been
* exhausted.
*/
private Integer getNextPartition() {
/* If the iterator is null, the selection is done. */
if (partitionItr == null) {
return null;
}
/* If we selected enough from this shard, move to the next. */
if (selected >= maxSelectFromShard) {
if (!nextPartitionItr()) {
return null;
}
selected = 0;
}
assert partitionItr != null;
/*
* If the current shard's set is exhausted, remove it from the map and move on.
*/
while (!partitionItr.hasNext()) {
shardItr.remove();
if (!nextPartitionItr()) {
return null;
}
assert partitionItr != null;
}
selected++;
final Integer ret = partitionItr.next();
partitionItr.remove();
return ret;
}
/*
* Gets the next set of setSize partitions. The set returned may
* contain fewer than setSize entries. Returns null if at the
* end of this pass.
*/
Set<Integer> getNextSet(int setSize) {
assert selected >= 0;
Set<Integer> pSet = null;
for (int i = 0; i < setSize; i++) {
Integer p = getNextPartition();
if (p == null) {
break;
}
if (pSet == null) {
pSet = new HashSet<>();
}
pSet.add(p);
}
return pSet;
}
}
}