co.cask.cdap.etl.api.batch.SparkSink Maven / Gradle / Ivy

Go to download
/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.etl.api.batch;

import co.cask.cdap.api.annotation.Beta;
import org.apache.spark.api.java.JavaRDD;

import java.io.Serializable;


/**
 * SparkSink composes a final, optional stage of a Batch ETL Pipeline. In addition to configuring the Batch run, it
 * can also perform RDD operations on the key value pairs provided by the Batch run.
 *
 * <p>The {@link SparkSink#run} method is called inside the Batch run, while the {@link SparkSink#prepareRun} and
 * {@link SparkSink#onRunFinish} methods are called on the client side (the side that launches the Batch run):
 * {@code prepareRun} before the Batch run starts and {@code onRunFinish} after it finishes.
 *
 * @param <IN> The type of input record to the SparkSink.
 */
@Beta
public abstract class SparkSink<IN> extends BatchConfigurable implements Serializable {
  // NOTE(review): BatchConfigurable is referenced without type arguments here. If it declares a type parameter,
  // supply the appropriate context type -- confirm against its declaration, which is not visible in this file.

  /** Plugin type identifier for Spark sink plugins. */
  public static final String PLUGIN_TYPE = "sparksink";

  private static final long serialVersionUID = -8600555200583639593L;

  /**
   * User Spark job which will be executed and is responsible for persisting any data.
   *
   * @param context {@link SparkExecutionPluginContext} for this job
   * @param input the input from previous stages of the Batch run.
   * @throws Exception if the Spark job fails; the declared type is deliberately broad so that implementations
   *                   may propagate any failure
   */
  public abstract void run(SparkExecutionPluginContext context, JavaRDD<IN> input) throws Exception;

}