org.apache.cassandra.io.sstable.Downsampling Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
Show all versions of cassandra-all Show documentation
The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.cassandra.io.sstable;
import java.util.*;
public class Downsampling
{
/**
* The base (down)sampling level determines the granularity at which we can down/upsample.
*
* A higher number allows us to approximate more closely the ideal sampling. (It could also mean we do a lot of
* expensive almost-no-op resamplings from N to N-1, but the thresholds in IndexSummaryManager prevent that.)
*
* BSL must be a power of two in order to have good sampling patterns. This cannot be changed without rebuilding
* all index summaries at full sampling; for now we treat it as a constant.
*/
public static final int BASE_SAMPLING_LEVEL = 128;
private static final Map> samplePatternCache = new HashMap<>();
private static final Map> originalIndexCache = new HashMap<>();
/**
* Gets a list L of starting indices for downsampling rounds: the first round should start with the offset
* given by L[0], the second by the offset in L[1], etc.
*
* @param samplingLevel the base sampling level
*
* @return A list of `samplingLevel` unique indices between 0 and `samplingLevel`
*/
public static List getSamplingPattern(int samplingLevel)
{
List pattern = samplePatternCache.get(samplingLevel);
if (pattern != null)
return pattern;
if (samplingLevel <= 1)
return Arrays.asList(0);
int[] odds = new int[samplingLevel / 2];
int[] evens = new int[samplingLevel / 2];
for (int i = 1; i < samplingLevel; i += 2)
odds[i/2] = i;
for (int i = 0; i < samplingLevel; i += 2)
evens[i/2] = i;
// especially for latter rounds, it's important that we spread out the start points, so we'll
// make a recursive call to get an ordering for this list of start points
List ordering = getSamplingPattern(samplingLevel/2);
List startIndices = new ArrayList<>(samplingLevel);
for (Integer index : ordering)
startIndices.add(odds[index]);
for (Integer index : ordering)
startIndices.add(evens[index]);
samplePatternCache.put(samplingLevel, startIndices);
return startIndices;
}
/**
* Returns a list that can be used to translate current index summary indexes to their original index before
* downsampling. (This repeats every `samplingLevel`, so that's how many entries we return.)
*
* For example, if [0, 64] is returned, the current index summary entry at index 0 was originally
* at index 0, and the current index 1 was originally at index 64.
*
* @param samplingLevel the current sampling level for the index summary
*
* @return a list of original indexes for current summary entries
*/
public static List getOriginalIndexes(int samplingLevel)
{
List originalIndexes = originalIndexCache.get(samplingLevel);
if (originalIndexes != null)
return originalIndexes;
List pattern = getSamplingPattern(BASE_SAMPLING_LEVEL).subList(0, BASE_SAMPLING_LEVEL - samplingLevel);
originalIndexes = new ArrayList<>(samplingLevel);
for (int j = 0; j < BASE_SAMPLING_LEVEL; j++)
{
if (!pattern.contains(j))
originalIndexes.add(j);
}
originalIndexCache.put(samplingLevel, originalIndexes);
return originalIndexes;
}
/**
* Calculates the effective index interval after the entry at `index` in an IndexSummary. In other words, this
* returns the number of partitions in the primary on-disk index before the next partition that has an entry in
* the index summary. If samplingLevel == BASE_SAMPLING_LEVEL, this will be equal to the index interval.
* @param index an index into an IndexSummary
* @param samplingLevel the current sampling level for that IndexSummary
* @param minIndexInterval the min index interval (effective index interval at full sampling)
* @return the number of partitions before the next index summary entry, inclusive on one end
*/
public static int getEffectiveIndexIntervalAfterIndex(int index, int samplingLevel, int minIndexInterval)
{
assert index >= 0;
index %= samplingLevel;
List originalIndexes = getOriginalIndexes(samplingLevel);
int nextEntryOriginalIndex = (index == originalIndexes.size() - 1) ? BASE_SAMPLING_LEVEL : originalIndexes.get(index + 1);
return (nextEntryOriginalIndex - originalIndexes.get(index)) * minIndexInterval;
}
public static int[] getStartPoints(int currentSamplingLevel, int newSamplingLevel)
{
List allStartPoints = getSamplingPattern(BASE_SAMPLING_LEVEL);
// calculate starting indexes for sampling rounds
int initialRound = BASE_SAMPLING_LEVEL - currentSamplingLevel;
int numRounds = Math.abs(currentSamplingLevel - newSamplingLevel);
int[] startPoints = new int[numRounds];
for (int i = 0; i < numRounds; ++i)
{
int start = allStartPoints.get(initialRound + i);
// our "ideal" start points will be affected by the removal of items in earlier rounds, so go through all
// earlier rounds, and if we see an index that comes before our ideal start point, decrement the start point
int adjustment = 0;
for (int j = 0; j < initialRound; ++j)
{
if (allStartPoints.get(j) < start)
adjustment++;
}
startPoints[i] = start - adjustment;
}
return startPoints;
}
}