/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.etl.api.batch;

import co.cask.cdap.api.annotation.Beta;
import co.cask.cdap.api.data.DatasetContext;
import co.cask.cdap.api.data.DatasetInstantiationException;
import co.cask.cdap.api.data.batch.Split;
import co.cask.cdap.api.dataset.Dataset;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.api.plugin.PluginContext;
import co.cask.cdap.api.stream.StreamEventDecoder;
import co.cask.cdap.etl.api.TransformContext;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.Partition;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.io.Serializable;
import java.util.Map;
import javax.annotation.Nullable;

/**
 * Context passed to Spark plugin types.
 */
@Beta
public interface SparkExecutionPluginContext extends DatasetContext, TransformContext {

  /**
   * Returns the logical start time of the Batch Job. The logical start time is the time this
   * Batch Job was scheduled to start, if the job was started by the scheduler; otherwise, it is
   * the current time when the job runs.
   *
   * @return Time in milliseconds since epoch time (00:00:00 January 1, 1970 UTC).
   */
  long getLogicalStartTime();

  /**
   * Returns runtime arguments of the Batch Job.
   *
   * @return runtime arguments of the Batch Job.
   */
  Map<String, String> getRuntimeArguments();

  /**
   * Creates a {@link JavaPairRDD} from the given {@link Dataset}.
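   *
   * <p>A minimal usage sketch; the Dataset name {@code "purchases"} and the byte-array key/value
   * types are hypothetical and assume a key-value Dataset:
   * <pre>{@code
   * JavaPairRDD<byte[], byte[]> purchases = context.fromDataset("purchases");
   * long count = purchases.count();
   * }</pre>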
   *
   * @param datasetName name of the Dataset
   * @param <K> key type
   * @param <V> value type
   * @return A new {@link JavaPairRDD} instance that reads from the given Dataset
   * @throws DatasetInstantiationException if the Dataset doesn't exist
   */
  <K, V> JavaPairRDD<K, V> fromDataset(String datasetName);

  /**
   * Creates a {@link JavaPairRDD} from the given {@link Dataset} with the given set of Dataset arguments.
   *
   * @param datasetName name of the Dataset
   * @param arguments arguments for the Dataset
   * @param <K> key type
   * @param <V> value type
   * @return A new {@link JavaPairRDD} instance that reads from the given Dataset
   * @throws DatasetInstantiationException if the Dataset doesn't exist
   */
  <K, V> JavaPairRDD<K, V> fromDataset(String datasetName, Map<String, String> arguments);

  /**
   * Creates a {@link JavaPairRDD} from the given {@link Dataset} with the given set of Dataset arguments
   * and custom list of {@link Split}s. Each {@link Split} will create a {@link Partition} in the {@link JavaPairRDD}.
   *
   * @param datasetName name of the Dataset
   * @param arguments arguments for the Dataset
   * @param splits list of {@link Split} or {@code null} to use the default splits provided by the Dataset
   * @param <K> key type
   * @param <V> value type
   * @return A new {@link JavaPairRDD} instance that reads from the given Dataset
   * @throws DatasetInstantiationException if the Dataset doesn't exist
   */
  <K, V> JavaPairRDD<K, V> fromDataset(String datasetName, Map<String, String> arguments,
                                       @Nullable Iterable<? extends Split> splits);

  /**
   * Creates a {@link JavaRDD} that represents all events from the given stream.
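   *
   * <p>A minimal usage sketch; the stream name {@code "logs"} is hypothetical:
   * <pre>{@code
   * JavaRDD<StreamEvent> events = context.fromStream("logs");
   * // decode each event body (a ByteBuffer) into a UTF-8 string
   * JavaRDD<String> bodies = events.map(
   *   event -> StandardCharsets.UTF_8.decode(event.getBody()).toString());
   * }</pre>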
   *
   * @param streamName name of the stream
   * @return A new {@link JavaRDD} instance that reads from the given stream
   * @throws DatasetInstantiationException if the Stream doesn't exist
   */
  JavaRDD<StreamEvent> fromStream(String streamName);

  /**
   * Creates a {@link JavaRDD} that represents events from the given stream in the given time range.
   *
   * @param streamName name of the stream
   * @param startTime the starting time of the stream to be read in milliseconds (inclusive)
   * @param endTime the ending time of the streams to be read in milliseconds (exclusive)
   * @return A new {@link JavaRDD} instance that reads from the given stream
   * @throws DatasetInstantiationException if the Stream doesn't exist
   */
  JavaRDD<StreamEvent> fromStream(String streamName, long startTime, long endTime);

  /**
   * Creates a {@link JavaPairRDD} that represents all events from the given stream. The key in the
   * resulting {@link JavaPairRDD} is the event timestamp. The stream body will
   * be decoded as the given value type. Currently it supports {@link Text}, {@link String} and {@link ByteWritable}.
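   *
   * <p>For example, to read event bodies as UTF-8 strings keyed by event timestamp; the stream
   * name {@code "logs"} is hypothetical:
   * <pre>{@code
   * JavaPairRDD<Long, String> events = context.fromStream("logs", String.class);
   * }</pre>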
   *
   * @param streamName name of the stream
   * @param valueType type of the stream body to decode to
   * @return A new {@link JavaPairRDD} instance that reads from the given stream
   * @throws DatasetInstantiationException if the Stream doesn't exist
   */
  <V> JavaPairRDD<Long, V> fromStream(String streamName, Class<V> valueType);

  /**
   * Creates a {@link JavaPairRDD} that represents events from the given stream in the given time range.
   * The key in the resulting {@link JavaPairRDD} is the event timestamp.
   * The stream body will be decoded as the given value type.
   * Currently it supports {@link Text}, {@link String} and {@link ByteWritable}.
   *
   * @param streamName name of the stream
   * @param startTime the starting time of the stream to be read in milliseconds (inclusive)
   * @param endTime the ending time of the streams to be read in milliseconds (exclusive)
   * @param valueType type of the stream body to decode to
   * @return A new {@link JavaPairRDD} instance that reads from the given stream
   * @throws DatasetInstantiationException if the Stream doesn't exist
   */
  <V> JavaPairRDD<Long, V> fromStream(String streamName, long startTime, long endTime, Class<V> valueType);

  /**
   * Creates a {@link JavaPairRDD} that represents events from the given stream in the given time range.
   * Each stream event will be decoded by an instance of the given {@link StreamEventDecoder} class.
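   *
   * <p>A sketch, assuming a hypothetical {@code MyDecoder} that implements
   * {@code StreamEventDecoder<LongWritable, Text>}:
   * <pre>{@code
   * JavaPairRDD<LongWritable, Text> events = context.fromStream(
   *   "logs", 0L, Long.MAX_VALUE, MyDecoder.class, LongWritable.class, Text.class);
   * }</pre>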
   *
   * @param streamName name of the stream
   * @param startTime the starting time of the stream to be read in milliseconds (inclusive)
   * @param endTime the ending time of the streams to be read in milliseconds (exclusive)
   * @param decoderClass the {@link StreamEventDecoder} for decoding {@link StreamEvent}
   * @param keyType the type of the decoded key
   * @param valueType the type of the decoded value
   * @return A new {@link JavaPairRDD} instance that reads from the given stream
   * @throws DatasetInstantiationException if the Stream doesn't exist
   */
  <K, V> JavaPairRDD<K, V> fromStream(String streamName, long startTime, long endTime,
                                      Class<? extends StreamEventDecoder<K, V>> decoderClass,
                                      Class<K> keyType, Class<V> valueType);

  /**
   * Saves the given {@link JavaPairRDD} to the given {@link Dataset}.
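   *
   * <p>A minimal usage sketch; the Dataset names {@code "source"} and {@code "sink"} and the
   * byte-array record types are hypothetical:
   * <pre>{@code
   * JavaPairRDD<byte[], byte[]> output = context.fromDataset("source");
   * context.saveAsDataset(output, "sink");
   * }</pre>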
   *
   * @param rdd the {@link JavaPairRDD} to be saved
   * @param datasetName name of the Dataset
   * @throws DatasetInstantiationException if the Dataset doesn't exist
   */
  <K, V> void saveAsDataset(JavaPairRDD<K, V> rdd, String datasetName);

  /**
   * Saves the given {@link JavaPairRDD} to the given {@link Dataset} with the given set of Dataset arguments.
   *
   * @param rdd the {@link JavaPairRDD} to be saved
   * @param datasetName name of the Dataset
   * @param arguments arguments for the Dataset
   * @throws DatasetInstantiationException if the Dataset doesn't exist
   */
  <K, V> void saveAsDataset(JavaPairRDD<K, V> rdd, String datasetName, Map<String, String> arguments);

  /**
   * Returns the {@link JavaSparkContext} used during the execution.
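   *
   * <p>For example, to broadcast a lookup table for use on executors; {@code lookupMap} is a
   * hypothetical local {@code Map<String, String>}:
   * <pre>{@code
   * JavaSparkContext jsc = context.getSparkContext();
   * Broadcast<Map<String, String>> lookup = jsc.broadcast(lookupMap);
   * }</pre>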
   *
   * @return the Spark Context
   */
  JavaSparkContext getSparkContext();

  /**
   * Returns a {@link Serializable} {@link PluginContext} which can be used to request plugin instances. The
   * returned instance can also be used in Spark program closures.
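   *
   * <p>A sketch of use inside a closure; the plugin id {@code "parser"} and the {@code Parser}
   * type are hypothetical and assumed to have been registered at configure time:
   * <pre>{@code
   * final PluginContext pluginContext = context.getPluginContext();
   * JavaRDD<String> parsed = lines.map(line -> {
   *   Parser parser = pluginContext.newPluginInstance("parser");
   *   return parser.parse(line);
   * });
   * }</pre>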
   *
   * @return A {@link Serializable} {@link PluginContext}.
   */
  PluginContext getPluginContext();
}