/**
	* Copyright 2017 Tristan Nixon
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	* Created by Tristan Nixon on 7/2/17.
	*/
package org.memeticlabs.spark.rdd.trycatch

import scala.language.implicitConversions
import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.{PartitionCoalescer, RDD}

import org.memeticlabs.spark.rdd.trycatch.TryCatchHelpers._
import org.memeticlabs.spark.rdd.trycatch.TryCatchRDDFunctions._

/**
	* RDD that injects an error handler around user functions
	*
	* @param inner the wrapped RDD
	* @param errorHandlerBuilder builder that, given an operation name, returns an error handler
	* @tparam T Data type of this RDD
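	*
	* @example Illustrative sketch only: `rawLines`, `parseLine`, and
	*          `myHandlerBuilder` are hypothetical stand-ins (a real
	*          errorHandlerBuilder[Any] would come from TryCatchHelpers).
	* {{{
	* val safe: RDD[String] = ErrorHandlingRDD.forRDD( rawLines, myHandlerBuilder )
	* val parsed = safe.map( parseLine ) // errors thrown by parseLine go to the "map" handler
	* }}}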
	*/
private[memeticlabs] class ErrorHandlingRDD[T]( var inner: RDD[T],
                                                val errorHandlerBuilder: errorHandlerBuilder[Any] )
                                              (implicit tt: ClassTag[T])
	extends RDD[T](inner)
{
	override def compute(split: Partition, context: TaskContext): Iterator[T] =
		firstParent[T].iterator( split, context )

	override protected def getPartitions: Array[Partition] =
		firstParent[T].partitions

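	// Drop the reference to the wrapped RDD once dependencies are cleared
	// (e.g. after checkpointing), so the parent lineage can be garbage-collected.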
	override protected def clearDependencies(): Unit = {
		super.clearDependencies()
		inner = null
	}

	/** Helper methods */

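	/** Applies the transformation `tx` to the inner RDD and re-wraps the
		* result, so that downstream operations keep the same error-handler builder. */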
	private def asEHRDD[U: ClassTag]( tx: (RDD[T]) => RDD[U] ): RDD[U] =
		new ErrorHandlingRDD[U]( tx(inner), errorHandlerBuilder )

	/** Transformations */

	override def map[U: ClassTag](f: T => U): RDD[U] =
		asEHRDD { _.tryMap( f, errorHandlerBuilder("map") ) }

	override def flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U] =
		asEHRDD { _.tryFlatMap( f, errorHandlerBuilder("flatMap") ) }

	override def filter(f: ( T ) => Boolean): RDD[T] =
		asEHRDD { _.tryFilter( f, errorHandlerBuilder("filter") ) }

	override def distinct(numPartitions: Int)(implicit ord: Ordering[T]): RDD[T] =
		asEHRDD { _.distinct( numPartitions )(ord) }

	override def coalesce(numPartitions: Int,
	                      shuffle: Boolean,
	                      partitionCoalescer: Option[PartitionCoalescer])
	                     (implicit ord: Ordering[T]): RDD[T] =
		asEHRDD { _.coalesce( numPartitions, shuffle, partitionCoalescer )(ord) }

	override def sample( withReplacement: Boolean,
	                     fraction: Double,
	                     seed: Long): RDD[T] =
		asEHRDD { _.sample( withReplacement, fraction, seed ) }

	override def randomSplit( weights: Array[Double],
	                          seed: Long): Array[RDD[T]] =
		inner.randomSplit( weights, seed ).map( rdd => new ErrorHandlingRDD[T]( rdd, errorHandlerBuilder ) )

	override def union(other: RDD[T]): RDD[T] = asEHRDD { _.union( other ) }

	override def glom(): RDD[Array[T]] = asEHRDD { _.glom() }

	override def cartesian[U: ClassTag](other: RDD[U]): RDD[(T, U)] = asEHRDD { _.cartesian( other ) }

	override def pipe( command: Seq[String],
	                   env: collection.Map[String, String],
	                   printPipeContext: ( ( String ) => Unit ) => Unit,
	                   printRDDElement: (T, ( String ) => Unit) => Unit,
	                   separateWorkingDir: Boolean,
	                   bufferSize: Int,
	                   encoding: String ): RDD[String] =
		asEHRDD { _.pipe( command, env, printPipeContext, printRDDElement, separateWorkingDir, bufferSize, encoding ) }

	override def mapPartitions[U: ClassTag]
	             (f: Iterator[T] => Iterator[U],
	              preservesPartitioning: Boolean = false): RDD[U] =
		asEHRDD { _.tryMapPartitions( f, errorHandlerBuilder("mapPartitions"), preservesPartitioning ) }

	override def mapPartitionsWithIndex[U: ClassTag]
	             (f: (Int, Iterator[T]) => Iterator[U],
	              preservesPartitioning: Boolean = false): RDD[U] =
		asEHRDD { _.tryMapPartitionsWithIndex( f, errorHandlerBuilder("mapPartitionsWithIndex"), preservesPartitioning ) }

	override def zipPartitions[B: ClassTag, V: ClassTag]
	             (rdd2: RDD[B], preservesPartitioning: Boolean)
	             (f: (Iterator[T], Iterator[B]) => Iterator[V]): RDD[V] =
		asEHRDD { _.tryZipPartitions( rdd2, preservesPartitioning )( f, errorHandlerBuilder("zipPartitions") ) }

	override def zipPartitions[B: ClassTag, C: ClassTag, V: ClassTag]
	             (rdd2: RDD[B], rdd3: RDD[C], preservesPartitioning: Boolean)
	             (f: (Iterator[T], Iterator[B], Iterator[C]) => Iterator[V]): RDD[V] =
		asEHRDD { _.tryZipPartitions( rdd2, rdd3, preservesPartitioning )( f, errorHandlerBuilder("zipPartitions") ) }

	override def zipPartitions[B: ClassTag, C: ClassTag, D: ClassTag, V: ClassTag]
	             (rdd2: RDD[B], rdd3: RDD[C], rdd4: RDD[D], preservesPartitioning: Boolean)
	             (f: (Iterator[T], Iterator[B], Iterator[C], Iterator[D]) => Iterator[V]): RDD[V] =
		asEHRDD { _.tryZipPartitions( rdd2, rdd3, rdd4, preservesPartitioning )( f, errorHandlerBuilder("zipPartitions") ) }

	override def zipWithIndex(): RDD[(T, Long)] = asEHRDD { _.zipWithIndex() }

	/** Actions */

	override def foreach(f: ( T ) => Unit): Unit =
		inner.tryForeach( f, errorHandlerBuilder("foreach") )

	override def foreachPartition(f: ( Iterator[T] ) => Unit): Unit =
		inner.tryForeachPartition( f, errorHandlerBuilder("foreachPartition") )

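	// The standard reduce/treeReduce are deprecated in favor of the zeroValue
	// variants below, presumably because a handled error can leave a partition
	// empty; the deprecated forms delegate with a null zero value.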
	@deprecated("Use reduce(zeroValue: T)(f: (T, T) => T) instead", "Spark-RDD-Trycatch 0.1")
	override def reduce(f: (T, T) => T): T =
		reduce(null.asInstanceOf[T])(f)

	@deprecated("Use treeReduce(zeroValue: T)(f: (T, T) => T, depth: Int) instead", "Spark-RDD-Trycatch 0.1")
	override def treeReduce(f: (T, T) => T, depth: Int): T =
		treeReduce(null.asInstanceOf[T])( f, depth )

	def reduce(zeroValue: T)(f: (T, T) => T): T =
		inner.tryReduce(zeroValue)( f, errorHandlerBuilder("reduce") )

	def treeReduce(zeroValue: T)(f: (T, T) => T, depth: Int): T =
		inner.tryTreeReduce(zeroValue)( f, depth, errorHandlerBuilder("treeReduce") )

	override def fold(zeroValue: T)(op: (T, T) => T): T =
		inner.tryFold(zeroValue)( op, errorHandlerBuilder("fold") )

	override def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U =
		inner.tryAggregate(zeroValue)( seqOp, errorHandlerBuilder("aggregate"),
		                               combOp, errorHandlerBuilder("aggregate") )

	override def treeAggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U, depth: Int): U =
		inner.tryTreeAggregate(zeroValue)( seqOp, errorHandlerBuilder("treeAggregate"),
		                                   combOp, errorHandlerBuilder("treeAggregate"),
		                                   depth )
}

object ErrorHandlingRDD
{
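	/** Wraps the given RDD in an ErrorHandlingRDD, injecting the handlers
		* produced by `withErrorHandlerBuilder` around user functions in
		* subsequent transformations and actions. */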
	def forRDD[T: ClassTag]( rdd: RDD[T], withErrorHandlerBuilder: errorHandlerBuilder[Any] ): RDD[T] =
		new ErrorHandlingRDD[T]( rdd, withErrorHandlerBuilder )
}
