All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer Maven / Gradle / Ivy

The newest version!
/*
 *  ******************************************************************************
 *  *
 *  *
 *  * This program and the accompanying materials are made available under the
 *  * terms of the Apache License, Version 2.0 which is available at
 *  * https://www.apache.org/licenses/LICENSE-2.0.
 *  *
 *  *  See the NOTICE file distributed with this work for additional
 *  *  information regarding copyright ownership.
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 *  * License for the specific language governing permissions and limitations
 *  * under the License.
 *  *
 *  * SPDX-License-Identifier: Apache-2.0
 *  *****************************************************************************
 */

package org.deeplearning4j.spark.impl.multilayer;

import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.rdd.RDD;
import org.datavec.spark.util.BroadcastHadoopConfigHolder;
import org.deeplearning4j.core.loader.DataSetLoader;
import org.deeplearning4j.core.loader.MultiDataSetLoader;
import org.deeplearning4j.core.loader.impl.SerializedDataSetLoader;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.layers.FeedForwardLayer;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.deeplearning4j.spark.api.TrainingMaster;
import org.deeplearning4j.spark.api.stats.SparkTrainingStats;
import org.deeplearning4j.spark.data.loader.RemoteFileSourceFactory;
import org.deeplearning4j.spark.impl.SparkListenable;
import org.deeplearning4j.spark.impl.common.LoadDataSetFunction;
import org.deeplearning4j.spark.impl.common.reduce.IntDoubleReduceFunction;
import org.deeplearning4j.spark.impl.graph.evaluation.IEvaluateMDSPathsFlatMapFunction;
import org.deeplearning4j.spark.impl.multilayer.evaluation.IEvaluateAggregateFunction;
import org.deeplearning4j.spark.impl.multilayer.evaluation.IEvaluateFlatMapFunction;
import org.deeplearning4j.spark.impl.multilayer.evaluation.IEvaluationReduceFunction;
import org.deeplearning4j.spark.impl.multilayer.scoring.*;
import org.deeplearning4j.spark.util.MLLibUtil;
import org.deeplearning4j.spark.util.SparkUtils;
import org.deeplearning4j.util.ModelSerializer;
import org.nd4j.common.base.Preconditions;
import org.nd4j.evaluation.IEvaluation;
import org.nd4j.evaluation.classification.Evaluation;
import org.nd4j.evaluation.classification.ROC;
import org.nd4j.evaluation.classification.ROCMultiClass;
import org.nd4j.evaluation.regression.RegressionEvaluation;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.executioner.GridExecutioner;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.heartbeat.Heartbeat;
import org.nd4j.linalg.heartbeat.reports.Environment;
import org.nd4j.linalg.heartbeat.reports.Event;
import org.nd4j.linalg.heartbeat.reports.Task;
import org.nd4j.linalg.heartbeat.utils.EnvironmentUtils;
import scala.Tuple2;

import java.io.IOException;
import java.io.OutputStream;
import java.util.List;

@Slf4j
public class SparkDl4jMultiLayer extends SparkListenable {
    // Default minibatch size used per worker when scoring/evaluating, if not specified explicitly
    public static final int DEFAULT_EVAL_SCORE_BATCH_SIZE = 64;
    // Default number of threshold steps for ROC/ROCMultiClass evaluation
    public static final int DEFAULT_ROC_THRESHOLD_STEPS = 32;
    // Default number of evaluation workers (network copies) per JVM during evaluation
    public static final int DEFAULT_EVAL_WORKERS = 4;
    // transient: the Spark context must not be serialized to the workers
    private transient JavaSparkContext sc;
    private MultiLayerConfiguration conf;
    private MultiLayerNetwork network;
    // Last (average) minibatch score from fit - see getScore()/setScore()
    private double lastScore;
    private int defaultEvaluationWorkers = DEFAULT_EVAL_WORKERS;

    /**
     * Instantiate a multi layer spark instance
     * with the given context and network.
     * This is the prediction constructor
     *
     * @param sparkContext the spark context to use
     * @param network      the network to use
     */
    public SparkDl4jMultiLayer(SparkContext sparkContext, MultiLayerNetwork network,
                    TrainingMaster trainingMaster) {
        // Delegate to the JavaSparkContext-based constructor
        this(new JavaSparkContext(sparkContext), network, trainingMaster);
    }

    /**
     * Training constructor. Instantiate with a configuration
     *
     * @param sparkContext the spark context to use
     * @param conf         the configuration of the network
     */
    public SparkDl4jMultiLayer(SparkContext sparkContext, MultiLayerConfiguration conf,
                    TrainingMaster trainingMaster) {
        // Build and initialize a fresh network from the configuration
        this(new JavaSparkContext(sparkContext), initNetwork(conf), trainingMaster);
    }

    /**
     * Training constructor. Instantiate with a configuration
     *
     * @param sc   the spark context to use
     * @param conf the configuration of the network
     */
    public SparkDl4jMultiLayer(JavaSparkContext sc, MultiLayerConfiguration conf, TrainingMaster trainingMaster) {
        this(sc.sc(), conf, trainingMaster);
    }

    /**
     * Primary constructor: all other constructors delegate here.
     *
     * @param javaSparkContext the spark context to use
     * @param network          the (possibly uninitialized) network to train/evaluate
     * @param trainingMaster   controls how distributed training is executed
     */
    public SparkDl4jMultiLayer(JavaSparkContext javaSparkContext, MultiLayerNetwork network,
                    TrainingMaster trainingMaster) {
        sc = javaSparkContext;
        // Keep a defensive copy of the configuration; it is serialized to workers as JSON later
        this.conf = network.getLayerWiseConfigurations().clone();
        this.network = network;
        if (!network.isInitCalled())
            network.init();
        this.trainingMaster = trainingMaster;

        //Check if kryo configuration is correct:
        SparkUtils.checkKryoConfiguration(javaSparkContext, log);
    }

    // Helper: construct and initialize a MultiLayerNetwork from a configuration
    private static MultiLayerNetwork initNetwork(MultiLayerConfiguration conf) {
        MultiLayerNetwork net = new MultiLayerNetwork(conf);
        net.init();
        return net;
    }

    /**
     * @return The JavaSparkContext this instance was created with
     */
    public JavaSparkContext getSparkContext() {
        return sc;
    }

    /**
     * @return The MultiLayerNetwork underlying the SparkDl4jMultiLayer
     */
    public MultiLayerNetwork getNetwork() {
        return network;
    }

    /**
     * @return The TrainingMaster for this network
     */
    public TrainingMaster getTrainingMaster() {
        return trainingMaster;
    }

    /**
     * Set the network that underlies this SparkDl4jMultiLayer instance
     *
     * @param network network to set
     */
    public void setNetwork(MultiLayerNetwork network) {
        this.network = network;
    }


    /**
     * Returns the currently set default number of evaluation workers/threads.
     * Note that when the number of workers is provided explicitly in an evaluation method, the default value
     * is not used.
* In many cases, we may want this to be smaller than the number of Spark threads, to reduce memory requirements. * For example, with 32 Spark threads and a large network, we don't want to spin up 32 instances of the network * to perform evaluation. Better (for memory requirements, and reduced cache thrashing) to use say 4 workers.
* If it is not set explicitly, {@link #DEFAULT_EVAL_WORKERS} will be used * * @return Default number of evaluation workers (threads). */ public int getDefaultEvaluationWorkers(){ return defaultEvaluationWorkers; } /** * Set the default number of evaluation workers/threads. * Note that when the number of workers is provided explicitly in an evaluation method, the default value * is not used.
* In many cases, we may want this to be smaller than the number of Spark threads, to reduce memory requirements. * For example, with 32 Spark threads and a large network, we don't want to spin up 32 instances of the network * to perform evaluation. Better (for memory requirements, and reduced cache thrashing) to use say 4 workers.
* If it is not set explicitly, {@link #DEFAULT_EVAL_WORKERS} will be used * * @return Default number of evaluation workers (threads). */ public void setDefaultEvaluationWorkers(int workers){ Preconditions.checkArgument(workers > 0, "Number of workers must be > 0: got %s", workers); this.defaultEvaluationWorkers = workers; } /** * Set whether training statistics should be collected for debugging purposes. Statistics collection is disabled by default * * @param collectTrainingStats If true: collect training statistics. If false: don't collect. */ public void setCollectTrainingStats(boolean collectTrainingStats) { trainingMaster.setCollectTrainingStats(collectTrainingStats); } /** * Get the training statistics, after collection of stats has been enabled using {@link #setCollectTrainingStats(boolean)} * * @return Training statistics */ public SparkTrainingStats getSparkTrainingStats() { return trainingMaster.getTrainingStats(); } /** * Predict the given feature matrix * * @param features the given feature matrix * @return the predictions */ public Matrix predict(Matrix features) { return MLLibUtil.toMatrix(network.output(MLLibUtil.toMatrix(features))); } /** * Predict the given vector * * @param point the vector to predict * @return the predicted vector */ public Vector predict(Vector point) { return MLLibUtil.toVector(network.output(MLLibUtil.toVector(point))); } /** * Fit the DataSet RDD. 
Equivalent to fit(trainingData.toJavaRDD()) * * @param trainingData the training data RDD to fitDataSet * @return the MultiLayerNetwork after training */ public MultiLayerNetwork fit(RDD trainingData) { return fit(trainingData.toJavaRDD()); } /** * Fit the DataSet RDD * * @param trainingData the training data RDD to fitDataSet * @return the MultiLayerNetwork after training */ public MultiLayerNetwork fit(JavaRDD trainingData) { if (Nd4j.getExecutioner() instanceof GridExecutioner) ((GridExecutioner) Nd4j.getExecutioner()).flushQueue(); trainingMaster.executeTraining(this, trainingData); network.incrementEpochCount(); return network; } /** * Fit the SparkDl4jMultiLayer network using a directory of serialized DataSet objects * The assumption here is that the directory contains a number of {@link DataSet} objects, each serialized using * {@link DataSet#save(OutputStream)} * * @param path Path to the directory containing the serialized DataSet objcets * @return The MultiLayerNetwork after training */ public MultiLayerNetwork fit(String path) { if (Nd4j.getExecutioner() instanceof GridExecutioner) ((GridExecutioner) Nd4j.getExecutioner()).flushQueue(); JavaRDD paths; try { paths = SparkUtils.listPaths(sc, path); } catch (IOException e) { throw new RuntimeException("Error listing paths in directory", e); } return fitPaths(paths); } /** * @deprecated Use {@link #fit(String)} */ @Deprecated public MultiLayerNetwork fit(String path, int minPartitions) { return fit(path); } /** * Fit the network using a list of paths for serialized DataSet objects. * * @param paths List of paths * @return trained network */ public MultiLayerNetwork fitPaths(JavaRDD paths) { return fitPaths(paths, new SerializedDataSetLoader()); } public MultiLayerNetwork fitPaths(JavaRDD paths, DataSetLoader loader) { trainingMaster.executeTrainingPaths(this, null, paths, loader, null); network.incrementEpochCount(); return network; } /** * Fit a MultiLayerNetwork using Spark MLLib LabeledPoint instances. 
* This will convert the labeled points to the internal DL4J data format and train the model on that * * @param rdd the rdd to fitDataSet * @return the multi layer network that was fitDataSet */ public MultiLayerNetwork fitLabeledPoint(JavaRDD rdd) { int nLayers = network.getLayerWiseConfigurations().getConfs().size(); FeedForwardLayer ffl = (FeedForwardLayer) network.getLayerWiseConfigurations().getConf(nLayers - 1).getLayer(); JavaRDD ds = MLLibUtil.fromLabeledPoint(sc, rdd, ffl.getNOut()); return fit(ds); } /** * Fits a MultiLayerNetwork using Spark MLLib LabeledPoint instances * This will convert labeled points that have continuous labels used for regression to the internal * DL4J data format and train the model on that * @param rdd the javaRDD containing the labeled points * @return a MultiLayerNetwork */ public MultiLayerNetwork fitContinuousLabeledPoint(JavaRDD rdd) { return fit(MLLibUtil.fromContinuousLabeledPoint(sc, rdd)); } /** * Gets the last (average) minibatch score from calling fit. This is the average score across all executors for the * last minibatch executed in each worker */ public double getScore() { return lastScore; } public void setScore(double lastScore) { this.lastScore = lastScore; } /** * Overload of {@link #calculateScore(JavaRDD, boolean)} for {@code RDD} instead of {@code JavaRDD} */ public double calculateScore(RDD data, boolean average) { return calculateScore(data.toJavaRDD(), average); } /** * Calculate the score for all examples in the provided {@code JavaRDD}, either by summing * or averaging over the entire data set. To calculate a score for each example individually, use {@link #scoreExamples(JavaPairRDD, boolean)} * or one of the similar methods. 
Uses default minibatch size in each worker, {@link SparkDl4jMultiLayer#DEFAULT_EVAL_SCORE_BATCH_SIZE} * * @param data Data to score * @param average Whether to sum the scores, or average them */ public double calculateScore(JavaRDD data, boolean average) { return calculateScore(data, average, DEFAULT_EVAL_SCORE_BATCH_SIZE); } /** * Calculate the score for all examples in the provided {@code JavaRDD}, either by summing * or averaging over the entire data set. To calculate a score for each example individually, use {@link #scoreExamples(JavaPairRDD, boolean)} * or one of the similar methods * * @param data Data to score * @param average Whether to sum the scores, or average them * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than * this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition * in one go) */ public double calculateScore(JavaRDD data, boolean average, int minibatchSize) { JavaRDD> rdd = data.mapPartitions( new ScoreFlatMapFunction(conf.toJson(), sc.broadcast(network.params(false)), minibatchSize)); //Reduce to a single tuple, with example count + sum of scores Tuple2 countAndSumScores = rdd.reduce(new IntDoubleReduceFunction()); if (average) { return countAndSumScores._2() / countAndSumScores._1(); } else { return countAndSumScores._2(); } } /** * {@code RDD} overload of {@link #scoreExamples(JavaPairRDD, boolean)} */ public JavaDoubleRDD scoreExamples(RDD data, boolean includeRegularizationTerms) { return scoreExamples(data.toJavaRDD(), includeRegularizationTerms); } /** * Score the examples individually, using the default batch size {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}. Unlike {@link #calculateScore(JavaRDD, boolean)}, * this method returns a score for each example separately. 
If scoring is needed for specific examples use either * {@link #scoreExamples(JavaPairRDD, boolean)} or {@link #scoreExamples(JavaPairRDD, boolean, int)} which can have * a key for each example. * * @param data Data to score * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any) * @return A JavaDoubleRDD containing the scores of each example * @see MultiLayerNetwork#scoreExamples(DataSet, boolean) */ public JavaDoubleRDD scoreExamples(JavaRDD data, boolean includeRegularizationTerms) { return scoreExamples(data, includeRegularizationTerms, DEFAULT_EVAL_SCORE_BATCH_SIZE); } /** * {@code RDD} * overload of {@link #scoreExamples(JavaRDD, boolean, int)} */ public JavaDoubleRDD scoreExamples(RDD data, boolean includeRegularizationTerms, int batchSize) { return scoreExamples(data.toJavaRDD(), includeRegularizationTerms, batchSize); } /** * Score the examples individually, using a specified batch size. Unlike {@link #calculateScore(JavaRDD, boolean)}, * this method returns a score for each example separately. If scoring is needed for specific examples use either * {@link #scoreExamples(JavaPairRDD, boolean)} or {@link #scoreExamples(JavaPairRDD, boolean, int)} which can have * a key for each example. * * @param data Data to score * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any) * @param batchSize Batch size to use when doing scoring * @return A JavaDoubleRDD containing the scores of each example * @see MultiLayerNetwork#scoreExamples(DataSet, boolean) */ public JavaDoubleRDD scoreExamples(JavaRDD data, boolean includeRegularizationTerms, int batchSize) { return data.mapPartitionsToDouble(new ScoreExamplesFunction(sc.broadcast(network.params()), sc.broadcast(conf.toJson()), includeRegularizationTerms, batchSize)); } /** * Score the examples individually, using the default batch size {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}. 
Unlike {@link #calculateScore(JavaRDD, boolean)}, * this method returns a score for each example separately
* Note: The provided JavaPairRDD has a key that is associated with each example and returned score.
* Note: The DataSet objects passed in must have exactly one example in them (otherwise: can't have a 1:1 association * between keys and data sets to score) * * @param data Data to score * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any) * @param Key type * @return A {@code JavaPairRDD} containing the scores of each example * @see MultiLayerNetwork#scoreExamples(DataSet, boolean) */ public JavaPairRDD scoreExamples(JavaPairRDD data, boolean includeRegularizationTerms) { return scoreExamples(data, includeRegularizationTerms, DEFAULT_EVAL_SCORE_BATCH_SIZE); } /** * Score the examples individually, using a specified batch size. Unlike {@link #calculateScore(JavaRDD, boolean)}, * this method returns a score for each example separately
* Note: The provided JavaPairRDD has a key that is associated with each example and returned score.
* Note: The DataSet objects passed in must have exactly one example in them (otherwise: can't have a 1:1 association * between keys and data sets to score) * * @param data Data to score * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any) * @param Key type * @return A {@code JavaPairRDD} containing the scores of each example * @see MultiLayerNetwork#scoreExamples(DataSet, boolean) */ public JavaPairRDD scoreExamples(JavaPairRDD data, boolean includeRegularizationTerms, int batchSize) { return data.mapPartitionsToPair(new ScoreExamplesWithKeyFunction(sc.broadcast(network.params()), sc.broadcast(conf.toJson()), includeRegularizationTerms, batchSize)); } /** * Feed-forward the specified data, with the given keys. i.e., get the network output/predictions for the specified data * * @param featuresData Features data to feed through the network * @param batchSize Batch size to use when doing feed forward operations * @param Type of data for key - may be anything * @return Network output given the input, by key */ public JavaPairRDD feedForwardWithKey(JavaPairRDD featuresData, int batchSize) { return feedForwardWithMaskAndKey(featuresData.mapToPair(new SingleToPairFunction()), batchSize); } /** * Feed-forward the specified data (and optionally mask array), with the given keys. i.e., get the network * output/predictions for the specified data * * @param featuresDataAndMask Features data to feed through the network. 
The Tuple2 is of the network input (features), * and optionally the feature mask arrays * @param batchSize Batch size to use when doing feed forward operations * @param Type of data for key - may be anything * @return Network output given the input (and optionally mask), by key */ public JavaPairRDD feedForwardWithMaskAndKey(JavaPairRDD> featuresDataAndMask, int batchSize) { return featuresDataAndMask .mapPartitionsToPair(new FeedForwardWithKeyFunction(sc.broadcast(network.params()), sc.broadcast(conf.toJson()), batchSize)); } /** * {@code RDD} overload of {@link #evaluate(JavaRDD)} */ public T evaluate(RDD data) { return evaluate(data.toJavaRDD()); } /** * Evaluate on a directory containing a set of DataSet objects serialized with {@link DataSet#save(OutputStream)} * @param path Path/URI to the directory containing the dataset objects * @return Evaluation */ public T evaluate(String path){ return evaluate(path, new SerializedDataSetLoader()); } /** * Evaluate on a directory containing a set of DataSet objects to be loaded with a {@link DataSetLoader}. * Uses default batch size of {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE} * @param path Path/URI to the directory containing the datasets to load * @return Evaluation */ public T evaluate(String path, DataSetLoader loader) { return evaluate(path, DEFAULT_EVAL_SCORE_BATCH_SIZE, loader); } /** * Evaluate on a directory containing a set of DataSet objects to be loaded with a {@link DataSetLoader}. 
* Uses default batch size of {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE} * @param path Path/URI to the directory containing the datasets to load * @return Evaluation */ public T evaluate(String path, int batchSize, DataSetLoader loader){ JavaRDD paths; try { paths = SparkUtils.listPaths(sc, path); } catch (IOException e) { throw new RuntimeException("Error listing paths in directory", e); } JavaRDD rdd = paths.map(new LoadDataSetFunction(loader, new RemoteFileSourceFactory(BroadcastHadoopConfigHolder.get(sc)))); return (T)doEvaluation(rdd, batchSize, new org.deeplearning4j.eval.Evaluation())[0]; } /** * Evaluate the network (classification performance) in a distributed manner on the provided data * * @param data Data to evaluate on * @return Evaluation object; results of evaluation on all examples in the data set */ public T evaluate(JavaRDD data) { return evaluate(data, null); } /** * {@code RDD} overload of {@link #evaluate(JavaRDD, List)} */ public T evaluate(RDD data, List labelsList) { return evaluate(data.toJavaRDD(), labelsList); } /** * Evaluate the network (regression performance) in a distributed manner on the provided data * * @param data Data to evaluate * @return {@link RegressionEvaluation} instance with regression performance */ public T evaluateRegression(JavaRDD data) { return evaluateRegression(data, DEFAULT_EVAL_SCORE_BATCH_SIZE); } /** * Evaluate the network (regression performance) in a distributed manner on the provided data * * @param data Data to evaluate * @param minibatchSize Minibatch size to use when doing performing evaluation * @return {@link RegressionEvaluation} instance with regression performance */ public T evaluateRegression(JavaRDD data, int minibatchSize) { long nOut = ((FeedForwardLayer) network.getOutputLayer().conf().getLayer()).getNOut(); return (T)doEvaluation(data, new org.deeplearning4j.eval.RegressionEvaluation(nOut), minibatchSize); } /** * Evaluate the network (classification performance) in a distributed manner, using 
default batch size and a provided * list of labels * * @param data Data to evaluate on * @param labelsList List of labels used for evaluation * @return Evaluation object; results of evaluation on all examples in the data set */ public T evaluate(JavaRDD data, List labelsList) { return evaluate(data, labelsList, DEFAULT_EVAL_SCORE_BATCH_SIZE); } /** * Perform ROC analysis/evaluation on the given DataSet in a distributed manner, using the default number of * threshold steps ({@link #DEFAULT_ROC_THRESHOLD_STEPS}) and the default minibatch size ({@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}) * * @param data Test set data (to evaluate on) * @return ROC for the entire data set */ public T evaluateROC(JavaRDD data) { return evaluateROC(data, DEFAULT_ROC_THRESHOLD_STEPS, DEFAULT_EVAL_SCORE_BATCH_SIZE); } /** * Perform ROC analysis/evaluation on the given DataSet in a distributed manner * * @param data Test set data (to evaluate on) * @param thresholdSteps Number of threshold steps for ROC - see {@link ROC} * @param evaluationMinibatchSize Minibatch size to use when performing ROC evaluation * @return ROC for the entire data set */ public T evaluateROC(JavaRDD data, int thresholdSteps, int evaluationMinibatchSize) { return (T)doEvaluation(data, new org.deeplearning4j.eval.ROC(thresholdSteps), evaluationMinibatchSize); } /** * Perform ROC analysis/evaluation (for the multi-class case, using {@link ROCMultiClass} on the given DataSet in a distributed manner * * @param data Test set data (to evaluate on) * @return ROC for the entire data set */ public T evaluateROCMultiClass(JavaRDD data) { return evaluateROCMultiClass(data, DEFAULT_ROC_THRESHOLD_STEPS, DEFAULT_EVAL_SCORE_BATCH_SIZE); } /** * Perform ROC analysis/evaluation (for the multi-class case, using {@link ROCMultiClass} on the given DataSet in a distributed manner * * @param data Test set data (to evaluate on) * @param thresholdSteps Number of threshold steps for ROC - see {@link ROC} * @param evaluationMinibatchSize 
Minibatch size to use when performing ROC evaluation * @return ROCMultiClass for the entire data set */ public T evaluateROCMultiClass(JavaRDD data, int thresholdSteps, int evaluationMinibatchSize) { return (T)doEvaluation(data, new org.deeplearning4j.eval.ROCMultiClass(thresholdSteps), evaluationMinibatchSize); } private void update(int mr, long mg) { Environment env = EnvironmentUtils.buildEnvironment(); env.setNumCores(mr); env.setAvailableMemory(mg); Task task = ModelSerializer.taskByModel(network); Heartbeat.getInstance().reportEvent(Event.SPARK, env, task); } /** * Evaluate the network (classification performance) in a distributed manner, using specified batch size and a provided * list of labels * * @param data Data to evaluate on * @param labelsList List of labels used for evaluation * @param evalBatchSize Batch size to use when conducting evaluations * @return Evaluation object; results of evaluation on all examples in the data set */ public T evaluate(JavaRDD data, List labelsList, int evalBatchSize) { Evaluation e = new org.deeplearning4j.eval.Evaluation(); e = doEvaluation(data, e, evalBatchSize); if (labelsList != null) { e.setLabelsList(labelsList); } return (T)e; } /** * Perform distributed evaluation of any type of {@link IEvaluation}. For example, {@link Evaluation}, {@link RegressionEvaluation}, * {@link ROC}, {@link ROCMultiClass} etc. * * @param data Data to evaluate on * @param emptyEvaluation Empty evaluation instance. This is the starting point (serialized/duplicated, then merged) * @param evalBatchSize Evaluation batch size * @param Type of evaluation instance to return * @return IEvaluation instance */ @SuppressWarnings("unchecked") public T doEvaluation(JavaRDD data, T emptyEvaluation, int evalBatchSize) { return doEvaluation(data, evalBatchSize, emptyEvaluation)[0]; } /** * Perform distributed evaluation of any type of {@link IEvaluation} - or multiple IEvaluation instances. 
* Distributed equivalent of {@link MultiLayerNetwork#doEvaluation(DataSetIterator, IEvaluation[])} * * @param data Data to evaluate on * @param emptyEvaluations Empty evaluation instances. Starting point (serialized/duplicated, then merged) * @param evalBatchSize Evaluation batch size * @param Type of evaluation instance to return * @return IEvaluation instances */ @SuppressWarnings("unchecked") public T[] doEvaluation(JavaRDD data, int evalBatchSize, T... emptyEvaluations) { return doEvaluation(data, getDefaultEvaluationWorkers(), evalBatchSize, emptyEvaluations ); } /** * Perform distributed evaluation of any type of {@link IEvaluation} - or multiple IEvaluation instances. * Distributed equivalent of {@link MultiLayerNetwork#doEvaluation(DataSetIterator, IEvaluation[])} * * @param data Data to evaluate on * @param emptyEvaluations Empty evaluation instances. Starting point (serialized/duplicated, then merged) * @param evalNumWorkers Number of workers (copies of the MultiLayerNetwork) model to use. Generally this should * be smaller than the number of threads - 2 to 4 is often good enough. If using CUDA GPUs, * this should ideally be set to the number of GPUs on each node (i.e., 1 for a single GPU node) * @param evalBatchSize Evaluation batch size * @param Type of evaluation instance to return * @return IEvaluation instances */ public T[] doEvaluation(JavaRDD data, int evalNumWorkers, int evalBatchSize, T... emptyEvaluations) { IEvaluateFlatMapFunction evalFn = new IEvaluateFlatMapFunction<>(false, sc.broadcast(conf.toJson()), SparkUtils.asByteArrayBroadcast(sc, network.params()), evalNumWorkers, evalBatchSize, emptyEvaluations); JavaRDD evaluations = data.mapPartitions(evalFn); return evaluations.treeAggregate(null, new IEvaluateAggregateFunction(), new IEvaluationReduceFunction()); } /** * Perform evaluation on serialized DataSet objects on disk, (potentially in any format), that are loaded using an {@link DataSetLoader}.
* Uses the default number of workers (model replicas per JVM) of {@link #DEFAULT_EVAL_WORKERS} with the default * minibatch size of {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE} * @param data List of paths to the data (that can be loaded as / converted to DataSets) * @param loader Used to load DataSets from their paths * @param emptyEvaluations Evaluations to perform * @return Evaluation */ public IEvaluation[] doEvaluation(JavaRDD data, DataSetLoader loader, IEvaluation... emptyEvaluations) { return doEvaluation(data, DEFAULT_EVAL_WORKERS, DEFAULT_EVAL_SCORE_BATCH_SIZE, loader, emptyEvaluations); } /** * Perform evaluation on serialized DataSet objects on disk, (potentially in any format), that are loaded using an {@link DataSetLoader}. * @param data List of paths to the data (that can be loaded as / converted to DataSets) * @param evalNumWorkers Number of workers to perform evaluation with. To reduce memory requirements and cache thrashing, * it is common to set this to a lower value than the number of spark threads per JVM/executor * @param evalBatchSize Batch size to use when performing evaluation * @param loader Used to load DataSets from their paths * @param emptyEvaluations Evaluations to perform * @return Evaluation */ public IEvaluation[] doEvaluation(JavaRDD data, int evalNumWorkers, int evalBatchSize, DataSetLoader loader, IEvaluation... emptyEvaluations) { return doEvaluation(data, evalNumWorkers, evalBatchSize, loader, null, emptyEvaluations); } /** * Perform evaluation on serialized MultiDataSet objects on disk, (potentially in any format), that are loaded using an {@link MultiDataSetLoader}.
* Uses the default number of workers (model replicas per JVM) of {@link #DEFAULT_EVAL_WORKERS} with the default * minibatch size of {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE} * @param data List of paths to the data (that can be loaded as / converted to DataSets) * @param loader Used to load MultiDataSets from their paths * @param emptyEvaluations Evaluations to perform * @return Evaluation */ public IEvaluation[] doEvaluation(JavaRDD data, MultiDataSetLoader loader, IEvaluation... emptyEvaluations) { return doEvaluation(data, DEFAULT_EVAL_WORKERS, DEFAULT_EVAL_SCORE_BATCH_SIZE, null, loader, emptyEvaluations); } /** * Perform evaluation on serialized MultiDataSet objects on disk, (potentially in any format), that are loaded using an {@link MultiDataSetLoader} * @param data List of paths to the data (that can be loaded as / converted to DataSets) * @param evalNumWorkers Number of workers to perform evaluation with. To reduce memory requirements and cache thrashing, * it is common to set this to a lower value than the number of spark threads per JVM/executor * @param evalBatchSize Batch size to use when performing evaluation * @param loader Used to load MultiDataSets from their paths * @param emptyEvaluations Evaluations to perform * @return Evaluation */ public IEvaluation[] doEvaluation(JavaRDD data, int evalNumWorkers, int evalBatchSize, MultiDataSetLoader loader, IEvaluation... emptyEvaluations) { return doEvaluation(data, evalNumWorkers, evalBatchSize, null, loader, emptyEvaluations); } protected IEvaluation[] doEvaluation(JavaRDD data, int evalNumWorkers, int evalBatchSize, DataSetLoader loader, MultiDataSetLoader mdsLoader, IEvaluation... 
emptyEvaluations){ Configuration config = sc.hadoopConfiguration(); IEvaluateMDSPathsFlatMapFunction evalFn = new IEvaluateMDSPathsFlatMapFunction(sc.broadcast(conf.toJson()), SparkUtils.asByteArrayBroadcast(sc, network.params()), evalNumWorkers, evalBatchSize, loader, mdsLoader, BroadcastHadoopConfigHolder.get(sc), emptyEvaluations); Preconditions.checkArgument(evalNumWorkers > 0, "Invalid number of evaulation workers: require at least 1 - got %s", evalNumWorkers); JavaRDD evaluations = data.mapPartitions(evalFn); return evaluations.treeAggregate(null, new IEvaluateAggregateFunction<>(), new IEvaluateAggregateFunction<>()); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy