org.deeplearning4j.spark.datavec.RDDMiniBatches Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of dl4j-spark_2.11 Show documentation

There is a newer version: 1.0.0-beta_spark_2

/*
 *
 *  * Copyright 2015 Skymind,Inc.
 *  *
 *  *    Licensed under the Apache License, Version 2.0 (the "License");
 *  *    you may not use this file except in compliance with the License.
 *  *    You may obtain a copy of the License at
 *  *
 *  *        http://www.apache.org/licenses/LICENSE-2.0
 *  *
 *  *    Unless required by applicable law or agreed to in writing, software
 *  *    distributed under the License is distributed on an "AS IS" BASIS,
 *  *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  *    See the License for the specific language governing permissions and
 *  *    limitations under the License.
 *
 */

package org.deeplearning4j.spark.datavec;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.nd4j.linalg.dataset.DataSet;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * RDD mini batch partitioning
 * @author Adam Gibson
 */
public class RDDMiniBatches  implements Serializable {
    private int miniBatches = 10;
    private JavaRDD toSplitJava;

    public RDDMiniBatches(int miniBatches, JavaRDD toSplit) {
        this.miniBatches = miniBatches;
        this.toSplitJava = toSplit;
    }

    public JavaRDD miniBatchesJava() {
        //need a new mapping function, doesn't handle mini batches properly
        return toSplitJava.mapPartitions(new MiniBatchFunction(miniBatches));
    }


    public static class MiniBatchFunction implements FlatMapFunction, DataSet> {
        private int batchSize = 10;

        public MiniBatchFunction(int batchSize) {
            this.batchSize = batchSize;
        }

        @Override
        public Iterable call(Iterator dataSetIterator) throws Exception {
            List ret = new ArrayList<>();
            List temp = new ArrayList<>();
            while (dataSetIterator.hasNext()) {
                temp.add(dataSetIterator.next().copy());
                if (temp.size() == batchSize) {
                    ret.add(DataSet.merge(temp));
                    temp.clear();
                }
            }

            //edge cases with map partitions where one will be left over.
            //this is due to race conditions.
            if(temp.size() > 1)
                ret.add(DataSet.merge(temp));

            return ret;
        }

    }


}