org.apache.beam.sdk.io.astra.db.transforms.split.TokenRangeClusterer Maven / Gradle / Ivy
Show all versions of beam-sdks-java-io-astra Show documentation
/*
* Copyright DataStax, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io.astra.db.transforms.split;
/*-
* #%L
* Beam SDK for Astra
* --
* Copyright (C) 2023 DataStax
* --
* Licensed under the Apache License, Version 2.0
* You may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.datastax.oss.driver.api.core.metadata.token.Token;
import com.datastax.oss.driver.api.core.metadata.token.TokenRange;
import com.datastax.oss.driver.shaded.guava.common.collect.ComparisonChain;
import com.datastax.oss.driver.shaded.guava.common.collect.Lists;
import edu.umd.cs.findbugs.annotations.NonNull;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
/**
* Groups small, contiguous token ranges sharing the same replicas in order to reduce the total
* number of splits.
*
* Useful mostly with virtual nodes, which may create lots of small token range splits.
*/
public class TokenRangeClusterer {
/**
* Reference to the factory
*/
private final AstraTokenFactory tokenFactory;
/**
* Constructor for the clusterer.
*
* @param tokenFactory
* list of token
*/
public TokenRangeClusterer(@NonNull AstraTokenFactory tokenFactory) {
this.tokenFactory = tokenFactory;
}
/**
* Groups contiguous ranges together as long as they are contiguous and share the same replicas.
*
*
Note: the grouping algorithm used in DSBulk is different from the one used by the Spark
* connector: the former favors groups of contiguous ranges sharing the same replicas (in order to
* make it possible to route a range read to a coordinator that is also a replica), whereas the
* latter favors data locality (i.e. groups even non-continguous ranges as long as they share at
* least one common replica).
*
* @param ranges
* list of ranges
* @param groupCount
* number of gorups
* @param maxGroupSize
* maximum group size
* @return
* list of token
*/
@NonNull
public List group(List ranges, int groupCount, int maxGroupSize) {
double ringFractionPerGroup = 1.0d / groupCount;
LinkedList sorted = Lists.newLinkedList(ranges);
sorted.sort(
(tr1, tr2) ->
ComparisonChain.start()
.compare(tr1.getStart(), tr2.getStart())
.compare(tr1.getEnd(), tr2.getEnd())
.result());
if (sorted.isEmpty()) {
return sorted;
}
List grouped = new ArrayList<>();
while (!sorted.isEmpty()) {
AstraTokenRange head = sorted.peek();
assert head != null;
double ringFractionLimit =
Math.max(
ringFractionPerGroup,
head.fraction()); // make sure first element will be always included
double cumulativeRingFraction = 0;
Token end = head.getStart();
for (int i = 0; i < Math.max(1, maxGroupSize) && !sorted.isEmpty(); i++) {
AstraTokenRange current = sorted.peek();
assert current != null;
cumulativeRingFraction += current.fraction();
// keep grouping ranges as long as they share the same replicas and the resulting
// range is contiguous.
if (cumulativeRingFraction > ringFractionLimit
|| !head.replicas().equals(current.replicas())
|| !end.equals(current.getStart())) {
break;
}
sorted.pop();
end = current.getEnd();
}
grouped.add(tokenFactory.range(head.getStart(), end, head.replicas()));
}
List list = new ArrayList<>();
for (AstraTokenRange tr : grouped) {
for (TokenRange r : tr.unwrap()) {
list.add(tokenFactory.range(r.getStart(), r.getEnd(), tr.replicas()));
}
}
return list;
}
}