org.apache.cassandra.io.sstable.Downsampling Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
A fork of the Apache Cassandra Project that uses Lucene indexes for providing near real time search such as ElasticSearch or Solr, including full text search capabilities, multi-dimensional queries, and relevance scoring.
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.cassandra.io.sstable;

import java.util.*;

public class Downsampling
{
    /**
     * The base (down)sampling level determines the granularity at which we can down/upsample.
     *
     * A higher number allows us to approximate more closely the ideal sampling.  (It could also mean we do a lot of
     * expensive almost-no-op resamplings from N to N-1, but the thresholds in IndexSummaryManager prevent that.)
     *
     * BSL must be a power of two in order to have good sampling patterns. This cannot be changed without rebuilding
     * all index summaries at full sampling; for now we treat it as a constant.
     */
    public static final int BASE_SAMPLING_LEVEL = 128;

    private static final Map> samplePatternCache = new HashMap<>();

    private static final Map> originalIndexCache = new HashMap<>();

    /**
     * Gets a list L of starting indices for downsampling rounds: the first round should start with the offset
     * given by L[0], the second by the offset in L[1], etc.
     *
     * @param samplingLevel the base sampling level
     *
     * @return A list of `samplingLevel` unique indices between 0 and `samplingLevel`
     */
    public static List getSamplingPattern(int samplingLevel)
    {
        List pattern = samplePatternCache.get(samplingLevel);
        if (pattern != null)
            return pattern;

        if (samplingLevel <= 1)
            return Arrays.asList(0);

        int[] odds = new int[samplingLevel / 2];
        int[] evens = new int[samplingLevel / 2];
        for (int i = 1; i < samplingLevel; i += 2)
            odds[i/2] = i;
        for (int i = 0; i < samplingLevel; i += 2)
            evens[i/2] = i;

        // especially for latter rounds, it's important that we spread out the start points, so we'll
        // make a recursive call to get an ordering for this list of start points
        List ordering = getSamplingPattern(samplingLevel/2);
        List startIndices = new ArrayList<>(samplingLevel);

        for (Integer index : ordering)
            startIndices.add(odds[index]);
        for (Integer index : ordering)
            startIndices.add(evens[index]);

        samplePatternCache.put(samplingLevel, startIndices);
        return startIndices;
    }

    /**
     * Returns a list that can be used to translate current index summary indexes to their original index before
     * downsampling.  (This repeats every `samplingLevel`, so that's how many entries we return.)
     *
     * For example, if [7, 15] is returned, the current index summary entry at index 0 was originally
     * at index 7, and the current index 1 was originally at index 15.
     *
     * @param samplingLevel the current sampling level for the index summary
     *
     * @return a list of original indexes for current summary entries
     */
    public static List getOriginalIndexes(int samplingLevel)
    {
        List originalIndexes = originalIndexCache.get(samplingLevel);
        if (originalIndexes != null)
            return originalIndexes;

        List pattern = getSamplingPattern(BASE_SAMPLING_LEVEL).subList(0, BASE_SAMPLING_LEVEL - samplingLevel);
        originalIndexes = new ArrayList<>(samplingLevel);
        for (int j = 0; j < BASE_SAMPLING_LEVEL; j++)
        {
            if (!pattern.contains(j))
                originalIndexes.add(j);
        }

        originalIndexCache.put(samplingLevel, originalIndexes);
        return originalIndexes;
    }

    /**
     * Calculates the effective index interval after the entry at `index` in an IndexSummary.  In other words, this
     * returns the number of partitions in the primary on-disk index before the next partition that has an entry in
     * the index summary.  If samplingLevel == BASE_SAMPLING_LEVEL, this will be equal to the index interval.
     * @param index an index into an IndexSummary
     * @param samplingLevel the current sampling level for that IndexSummary
     * @param minIndexInterval the min index interval (effective index interval at full sampling)
     * @return the number of partitions before the next index summary entry, inclusive on one end
     */
    public static int getEffectiveIndexIntervalAfterIndex(int index, int samplingLevel, int minIndexInterval)
    {
        assert index >= -1;
        List originalIndexes = getOriginalIndexes(samplingLevel);
        if (index == -1)
            return originalIndexes.get(0) * minIndexInterval;

        index %= samplingLevel;
        if (index == originalIndexes.size() - 1)
        {
            // account for partitions after the "last" entry as well as partitions before the "first" entry
            return ((BASE_SAMPLING_LEVEL - originalIndexes.get(index)) + originalIndexes.get(0)) * minIndexInterval;
        }
        else
        {
            return (originalIndexes.get(index + 1) - originalIndexes.get(index)) * minIndexInterval;
        }
    }

    public static int[] getStartPoints(int currentSamplingLevel, int newSamplingLevel)
    {
        List allStartPoints = getSamplingPattern(BASE_SAMPLING_LEVEL);

        // calculate starting indexes for sampling rounds
        int initialRound = BASE_SAMPLING_LEVEL - currentSamplingLevel;
        int numRounds = Math.abs(currentSamplingLevel - newSamplingLevel);
        int[] startPoints = new int[numRounds];
        for (int i = 0; i < numRounds; ++i)
        {
            int start = allStartPoints.get(initialRound + i);

            // our "ideal" start points will be affected by the removal of items in earlier rounds, so go through all
            // earlier rounds, and if we see an index that comes before our ideal start point, decrement the start point
            int adjustment = 0;
            for (int j = 0; j < initialRound; ++j)
            {
                if (allStartPoints.get(j) < start)
                    adjustment++;
            }
            startPoints[i] = start - adjustment;
        }
        return startPoints;
    }
}