// co.cask.cdap.etl.api.batch.SparkSink Maven / Gradle / Ivy
/*
* Copyright © 2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.etl.api.batch;
import co.cask.cdap.api.annotation.Beta;
import org.apache.spark.api.java.JavaRDD;
import java.io.Serializable;
/**
 * SparkSink composes a final, optional stage of a Batch ETL Pipeline. In addition to configuring the Batch run, it
 * can also perform RDD operations on the key value pairs provided by the Batch run.
 *
 * <p>The {@link SparkSink#run} method is called inside the Batch run, while the {@link SparkSink#prepareRun} and
 * {@link SparkSink#onRunFinish} methods are called on the client side, which launches the Batch run, before the
 * Batch run starts and after it finishes respectively.
 *
 * @param <IN> The type of input record to the SparkSink.
 */
@Beta
public abstract class SparkSink<IN> extends BatchConfigurable implements Serializable {
  // NOTE(review): BatchConfigurable is referenced exactly as before; if it declares a type parameter,
  // supply the matching type argument here -- confirm against its declaration.

  /** Plugin type name used for Spark sink plugins. */
  public static final String PLUGIN_TYPE = "sparksink";

  private static final long serialVersionUID = -8600555200583639593L;

  /**
   * User Spark job which will be executed and is responsible for persisting any data.
   *
   * @param context {@link SparkExecutionPluginContext} for this job
   * @param input the input from previous stages of the Batch run.
   * @throws Exception if the job fails; implementations may propagate any exception
   */
  public abstract void run(SparkExecutionPluginContext context, JavaRDD<IN> input) throws Exception;
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy