org.deeplearning4j.spark.impl.repartitioner.EqualRepartitioner Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of dl4j-spark3_2.12 Show documentation
The newest version!
/*
 *  ******************************************************************************
 *  *
 *  *
 *  * This program and the accompanying materials are made available under the
 *  * terms of the Apache License, Version 2.0 which is available at
 *  * https://www.apache.org/licenses/LICENSE-2.0.
 *  *
 *  *  See the NOTICE file distributed with this work for additional
 *  *  information regarding copyright ownership.
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 *  * License for the specific language governing permissions and limitations
 *  * under the License.
 *  *
 *  * SPDX-License-Identifier: Apache-2.0
 *  *****************************************************************************
 */

package org.deeplearning4j.spark.impl.repartitioner;

import lombok.extern.slf4j.Slf4j;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.deeplearning4j.spark.api.Repartitioner;
import org.deeplearning4j.spark.impl.common.CountPartitionsFunction;
import org.deeplearning4j.spark.impl.common.repartition.EqualPartitioner;
import org.deeplearning4j.spark.util.SparkUtils;
import org.nd4j.common.util.MathUtils;
import scala.Tuple2;

import java.util.List;
import java.util.Random;

@Slf4j
public class EqualRepartitioner implements Repartitioner {
    @Override
    public  JavaRDD repartition(JavaRDD rdd, int minObjectsPerPartition, int numExecutors) {
        //minObjectsPerPartition: intentionally not used here

        //Repartition: either always, or origNumPartitions != numWorkers

        //First: count number of elements in each partition. Need to know this so we can work out how to properly index each example,
        // so we can in turn create properly balanced partitions after repartitioning
        //Because the objects (DataSets etc) should be small, this should be OK

        //Count each partition...
        List> partitionCounts =
                rdd.mapPartitionsWithIndex(new CountPartitionsFunction(), true).collect();
        return repartition(rdd, numExecutors, partitionCounts);
    }


    public static  JavaRDD repartition(JavaRDD rdd, int numPartitions, List> partitionCounts){
        int totalObjects = 0;
        int initialPartitions = partitionCounts.size();

        for (Tuple2 t2 : partitionCounts) {
            totalObjects += t2._2();
        }

        //Check if already correct
        int minAllowable = (int)Math.floor(totalObjects / (double) numPartitions);
        int maxAllowable = (int)Math.ceil(totalObjects / (double) numPartitions);

        boolean repartitionRequired = false;
        for (Tuple2 t2 : partitionCounts) {
            if(t2._2() < minAllowable || t2._2() > maxAllowable ){
                repartitionRequired = true;
                break;
            }
        }

        if (initialPartitions == numPartitions && !repartitionRequired) {
            //Don't need to do any repartitioning here - already in the format we want
            return rdd;
        }

        //Index each element for repartitioning (can only do manual repartitioning on a JavaPairRDD)
        JavaPairRDD pairIndexed = SparkUtils.indexedRDD(rdd);

        //Handle remainder.
        //We'll randomly allocate one of these to a single partition, with no partition getting more than 1 (otherwise, imbalanced)
        //Given that we don't know exactly how Spark will allocate partitions to workers, we are probably better off doing
        // this randomly rather than "first N get +1" or "every M get +1" as this could introduce poor load balancing
        int remainder = totalObjects % numPartitions;
        int[] remainderPartitions = null;
        if (remainder > 0) {
            remainderPartitions = new int[remainder];
            int[] temp = new int[numPartitions];
            for( int i=0; i< temp.length; i++ ){
                temp[i] = i;
            }
            MathUtils.shuffleArray(temp, new Random());
            System.arraycopy(temp, 0, remainderPartitions, 0, remainder);
        }

        int partitionSizeExRemainder = totalObjects / numPartitions;
        pairIndexed = pairIndexed.partitionBy(new EqualPartitioner(numPartitions, partitionSizeExRemainder, remainderPartitions));
        return pairIndexed.values();
    }
}