All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.api.java.sampling.DistributedRandomSampler Maven / Gradle / Ivy

There is a newer version: 1.20.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.api.java.sampling;

import org.apache.flink.annotation.Internal;

import java.util.Iterator;
import java.util.PriorityQueue;

/**
 * For sampling with fraction, the sample algorithms are natively distributed, while it's not true
 * for fixed size sample algorithms. The fixed size sample algorithms require two-phases sampling
 * (according to our current implementation). In the first phase, each distributed partition is
 * sampled independently. The partial sampling results are handled by a central coordinator. The
 * central coordinator combines the partial sampling results to form the final result.
 *
 * @param  The input data type.
 */
@Internal
public abstract class DistributedRandomSampler extends RandomSampler {

    protected final int numSamples;

    public DistributedRandomSampler(int numSamples) {
        this.numSamples = numSamples;
    }

    protected final Iterator> emptyIntermediateIterable =
            new SampledIterator>() {
                @Override
                public boolean hasNext() {
                    return false;
                }

                @Override
                public IntermediateSampleData next() {
                    return null;
                }
            };

    /**
     * Sample algorithm for the first phase. It operates on a single partition.
     *
     * @param input The DataSet input of each partition.
     * @return Intermediate sample output which will be used as the input of the second phase.
     */
    public abstract Iterator> sampleInPartition(Iterator input);

    /**
     * Sample algorithm for the second phase. This operation should be executed as the UDF of an all
     * reduce operation.
     *
     * @param input The intermediate sample output generated in the first phase.
     * @return The sampled output.
     */
    public Iterator sampleInCoordinator(Iterator> input) {
        if (numSamples == 0) {
            return emptyIterable;
        }

        // This queue holds fixed number elements with the top K weight for the coordinator.
        PriorityQueue> reservoir =
                new PriorityQueue>(numSamples);
        int index = 0;
        IntermediateSampleData smallest = null;
        while (input.hasNext()) {
            IntermediateSampleData element = input.next();
            if (index < numSamples) {
                // Fill the queue with first K elements from input.
                reservoir.add(element);
                smallest = reservoir.peek();
            } else {
                // If current element weight is larger than the smallest one in queue, remove the
                // element
                // with the smallest weight, and append current element into the queue.
                if (element.getWeight() > smallest.getWeight()) {
                    reservoir.remove();
                    reservoir.add(element);
                    smallest = reservoir.peek();
                }
            }
            index++;
        }
        final Iterator> itr = reservoir.iterator();

        return new Iterator() {
            @Override
            public boolean hasNext() {
                return itr.hasNext();
            }

            @Override
            public T next() {
                return itr.next().getElement();
            }

            @Override
            public void remove() {
                itr.remove();
            }
        };
    }

    /**
     * Combine the first phase and second phase in sequence, implemented for test purpose only.
     *
     * @param input Source data.
     * @return Sample result in sequence.
     */
    @Override
    public Iterator sample(Iterator input) {
        return sampleInCoordinator(sampleInPartition(input));
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy