org.deeplearning4j.streaming.pipeline.spark.SparkStreamingPipeline
package org.deeplearning4j.streaming.pipeline.spark;

import lombok.Builder;
import lombok.Data;
import org.apache.camel.CamelContext;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.deeplearning4j.streaming.conversion.dataset.RecordToDataSet;
import org.deeplearning4j.streaming.pipeline.kafka.BaseKafkaPipeline;
import org.nd4j.linalg.dataset.DataSet;

import java.util.Collections;

/**
 * Spark streaming pipeline: consumes records from a Kafka topic and
 * converts them into a JavaDStream of DataSet objects.
 *
 * @author Adam Gibson
 */
@Data
public class SparkStreamingPipeline extends BaseKafkaPipeline<JavaDStream<DataSet>, RecordToDataSet> {
    protected JavaStreamingContext jssc;
    protected SparkConf sparkConf;
    protected Function<JavaRDD<DataSet>, Void> streamProcessor;
    protected Duration streamingDuration = Durations.seconds(1);
    protected String sparkMaster;
    protected Function<JavaRDD<DataSet>, Void> datasetConsumption;

    @Builder
    public SparkStreamingPipeline(String kafkaTopic, String inputUri, String inputFormat,
                                  String kafkaBroker, String zkHost, CamelContext camelContext,
                                  String hadoopHome, String dataType, String sparkAppName,
                                  int kafkaPartitions, RecordToDataSet recordToDataSetFunction,
                                  int numLabels, JavaDStream<DataSet> dataset,
                                  JavaStreamingContext jssc, SparkConf sparkConf,
                                  Function<JavaRDD<DataSet>, Void> streamProcessor,
                                  Duration streamingDuration, String sparkMaster) {
        super(kafkaTopic, inputUri, inputFormat, kafkaBroker, zkHost, camelContext, hadoopHome, dataType, sparkAppName, kafkaPartitions, recordToDataSetFunction, numLabels, dataset);
        this.jssc = jssc;
        this.sparkConf = sparkConf;
        this.streamProcessor = streamProcessor;
        this.streamingDuration = streamingDuration;
        this.sparkMaster = sparkMaster;
    }

    @Override
    public void initComponents() {
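        // Build the SparkConf and streaming context from the configured app name,
        // master, and batch duration; any instances passed to the constructor are
        // replaced here.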
        sparkConf = new SparkConf().setAppName(sparkAppName).setMaster(sparkMaster);
        jssc = new JavaStreamingContext(sparkConf, streamingDuration);
    }

    /**
     * Create the streaming result: a DStream of DataSet objects
     * converted from the raw Kafka messages.
     *
     * @return the stream of converted data sets
     */
    @Override
    public JavaDStream<DataSet> createStream() {
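        // Receiver-based Kafka consumer: subscribe to the topic through
        // ZooKeeper under the fixed consumer group id "datavec".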
        JavaPairInputDStream<String, String> messages = KafkaUtils.createStream(
                jssc,
                zkHost,
                "datavec",
                Collections.singletonMap(kafkaTopic, kafkaPartitions));
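        // Convert each raw Kafka message into DataSet objects and cache the
        // stream so downstream consumers can reuse it without re-reading.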
        JavaDStream<DataSet> dataset = messages.flatMap(new DataSetFlatmap(numLabels, recordToDataSetFunction)).cache();
        return dataset;
    }

    /**
     * Starts the streaming consumption and blocks until termination.
     *
     * @param timeout maximum time to wait in milliseconds; a negative
     *                value blocks indefinitely
     */
    @Override
    public void startStreamingConsumption(long timeout) {
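        // Start the receivers, then block the calling thread.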
        jssc.start();
        if(timeout < 0)
            jssc.awaitTermination();
        else
            jssc.awaitTermination(timeout);
    }
}
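
A minimal usage sketch, not part of the original file: it drives only the methods visible in this class through the Lombok-generated builder. The topic name, ZooKeeper quorum, and converter below are placeholders, and a real deployment would also supply the Camel/Kafka ingestion settings (kafkaBroker, inputUri, camelContext, and so on) that BaseKafkaPipeline expects.

import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.deeplearning4j.streaming.conversion.dataset.RecordToDataSet;
import org.deeplearning4j.streaming.pipeline.spark.SparkStreamingPipeline;
import org.nd4j.linalg.dataset.DataSet;

public class SparkStreamingPipelineExample {
    public static void main(String[] args) {
        // Placeholder: substitute a real RecordToDataSet implementation.
        RecordToDataSet converter = null;

        SparkStreamingPipeline pipeline = SparkStreamingPipeline.builder()
                .kafkaTopic("datavec-input")             // placeholder topic name
                .zkHost("localhost:2181")                // placeholder ZooKeeper quorum
                .kafkaPartitions(1)
                .numLabels(2)
                .recordToDataSetFunction(converter)
                .sparkAppName("streaming-pipeline-example")
                .sparkMaster("local[*]")
                .streamingDuration(Durations.seconds(1))
                .build();

        pipeline.initComponents();                       // builds SparkConf and JavaStreamingContext
        JavaDStream<DataSet> stream = pipeline.createStream();
        stream.print();                                  // output action: show a few elements per batch
        pipeline.startStreamingConsumption(-1);          // negative timeout: block until stopped
    }
}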



