
org.memeticlabs.spark.rdd.trycatch.ErrorHandlingRDD.scala (spark-rdd-trycatch, Maven / Gradle / Ivy)

Error trapping and handling functionality for Spark's RDD API.
/**
 * Copyright 2017 Tristan Nixon
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Created by Tristan Nixon on 7/2/17.
 */
package org.memeticlabs.spark.rdd.trycatch

import scala.language.implicitConversions
import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.{PartitionCoalescer, RDD}

import org.memeticlabs.spark.rdd.trycatch.TryCatchHelpers._
import org.memeticlabs.spark.rdd.trycatch.TryCatchRDDFunctions._
/**
 * RDD that injects an error handler around user functions.
 *
 * @param inner the wrapped RDD
 * @param errorHandlerBuilder builder function that returns an error handler, given an operation name
 * @tparam T the element type of this RDD
 */
private[memeticlabs] class ErrorHandlingRDD[T]( var inner: RDD[T],
                                                val errorHandlerBuilder: errorHandlerBuilder[Any] )
                                              (implicit tt: ClassTag[T])
  extends RDD[T](inner)
{
  // Pass-through computation: the wrapper adds no logic of its own here.
  // Error handling is injected per-operation in the overrides below.
  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    firstParent[T].iterator( split, context )

  override protected def getPartitions: Array[Partition] =
    firstParent[T].partitions

  override protected def clearDependencies(): Unit = {
    super.clearDependencies()
    // release the reference to the parent RDD so it can be garbage-collected,
    // the usual Spark pattern for wrapper RDDs
    inner = null
  }
  /** Helper: transform the inner RDD and re-wrap the result in an ErrorHandlingRDD */
  private def asEHRDD[U: ClassTag]( tx: (RDD[T]) => RDD[U] ) =
    new ErrorHandlingRDD[U]( tx(inner), errorHandlerBuilder )
  /** Transformations */

  override def map[U: ClassTag](f: T => U): RDD[U] =
    asEHRDD { _.tryMap( f, errorHandlerBuilder("map") ) }

  override def flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U] =
    asEHRDD { _.tryFlatMap( f, errorHandlerBuilder("flatMap") ) }

  override def filter(f: ( T ) => Boolean): RDD[T] =
    asEHRDD { _.tryFilter( f, errorHandlerBuilder("filter") ) }

  override def distinct(numPartitions: Int)(implicit ord: Ordering[T]): RDD[T] =
    asEHRDD { _.distinct( numPartitions )(ord) }

  override def coalesce(numPartitions: Int,
                        shuffle: Boolean,
                        partitionCoalescer: Option[PartitionCoalescer])
                       (implicit ord: Ordering[T]): RDD[T] =
    asEHRDD { _.coalesce( numPartitions, shuffle, partitionCoalescer )(ord) }

  override def sample( withReplacement: Boolean,
                       fraction: Double,
                       seed: Long): RDD[T] =
    asEHRDD { _.sample( withReplacement, fraction, seed ) }

  override def randomSplit( weights: Array[Double],
                            seed: Long): Array[RDD[T]] =
    inner.randomSplit( weights, seed ).map( rdd => new ErrorHandlingRDD[T]( rdd, errorHandlerBuilder ) )

  override def union(other: RDD[T]): RDD[T] = asEHRDD { _.union( other ) }

  override def glom(): RDD[Array[T]] = asEHRDD { _.glom() }

  override def cartesian[U: ClassTag](other: RDD[U]): RDD[(T, U)] = asEHRDD { _.cartesian( other ) }

  override def pipe( command: Seq[String],
                     env: collection.Map[String, String],
                     printPipeContext: ( ( String ) => Unit ) => Unit,
                     printRDDElement: (T, ( String ) => Unit) => Unit,
                     separateWorkingDir: Boolean,
                     bufferSize: Int,
                     encoding: String ): RDD[String] =
    asEHRDD { _.pipe( command, env, printPipeContext, printRDDElement, separateWorkingDir, bufferSize, encoding ) }

  override def mapPartitions[U: ClassTag]
      (f: Iterator[T] => Iterator[U],
       preservesPartitioning: Boolean = false): RDD[U] =
    asEHRDD { _.tryMapPartitions( f, errorHandlerBuilder("mapPartitions"), preservesPartitioning ) }

  override def mapPartitionsWithIndex[U: ClassTag]
      (f: (Int, Iterator[T]) => Iterator[U],
       preservesPartitioning: Boolean = false): RDD[U] =
    asEHRDD { _.tryMapPartitionsWithIndex( f, errorHandlerBuilder("mapPartitionsWithIndex"), preservesPartitioning ) }

  override def zipPartitions[B: ClassTag, V: ClassTag]
      (rdd2: RDD[B], preservesPartitioning: Boolean)
      (f: (Iterator[T], Iterator[B]) => Iterator[V]): RDD[V] =
    asEHRDD { _.tryZipPartitions( rdd2, preservesPartitioning )( f, errorHandlerBuilder("zipPartitions") ) }

  override def zipPartitions[B: ClassTag, C: ClassTag, V: ClassTag]
      (rdd2: RDD[B], rdd3: RDD[C], preservesPartitioning: Boolean)
      (f: (Iterator[T], Iterator[B], Iterator[C]) => Iterator[V]): RDD[V] =
    asEHRDD { _.tryZipPartitions( rdd2, rdd3, preservesPartitioning )( f, errorHandlerBuilder("zipPartitions") ) }

  override def zipPartitions[B: ClassTag, C: ClassTag, D: ClassTag, V: ClassTag]
      (rdd2: RDD[B], rdd3: RDD[C], rdd4: RDD[D], preservesPartitioning: Boolean)
      (f: (Iterator[T], Iterator[B], Iterator[C], Iterator[D]) => Iterator[V]): RDD[V] =
    asEHRDD { _.tryZipPartitions( rdd2, rdd3, rdd4, preservesPartitioning )( f, errorHandlerBuilder("zipPartitions") ) }

  override def zipWithIndex(): RDD[(T, Long)] = asEHRDD { _.zipWithIndex() }
  /** Actions */

  override def foreach(f: ( T ) => Unit): Unit =
    inner.tryForeach( f, errorHandlerBuilder("foreach") )

  override def foreachPartition(f: ( Iterator[T] ) => Unit): Unit =
    inner.tryForeachPartition( f, errorHandlerBuilder("foreachPartition") )
  // The zero-value overloads below replace Spark's reduce/treeReduce: once the
  // error handler can swallow failing elements, a partition may end up empty,
  // and reducing an empty partition requires an explicit zero value.
  @deprecated("Use reduce(zeroValue: T)(f: (T, T) => T) instead", "Spark-RDD-Trycatch 0.1")
  override def reduce(f: (T, T) => T): T =
    reduce(null.asInstanceOf[T])(f)

  @deprecated("Use treeReduce(zeroValue: T)(f: (T, T) => T, depth: Int) instead", "Spark-RDD-Trycatch 0.1")
  override def treeReduce(f: (T, T) => T, depth: Int): T =
    treeReduce(null.asInstanceOf[T])( f, depth )

  def reduce(zeroValue: T)(f: (T, T) => T): T =
    inner.tryReduce(zeroValue)( f, errorHandlerBuilder("reduce") )

  def treeReduce(zeroValue: T)(f: (T, T) => T, depth: Int): T =
    inner.tryTreeReduce(zeroValue)( f, depth, errorHandlerBuilder("treeReduce") )
  override def fold(zeroValue: T)(op: (T, T) => T): T =
    inner.tryFold(zeroValue)( op, errorHandlerBuilder("fold") )

  override def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U =
    inner.tryAggregate(zeroValue)( seqOp, errorHandlerBuilder("aggregate"),
                                   combOp, errorHandlerBuilder("aggregate") )

  override def treeAggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U, depth: Int): U =
    inner.tryTreeAggregate(zeroValue)( seqOp, errorHandlerBuilder("treeAggregate"),
                                       combOp, errorHandlerBuilder("treeAggregate"),
                                       depth )
}
/** Factory for wrapping an existing RDD with error-handling behavior */
object ErrorHandlingRDD
{
  def forRDD[T: ClassTag]( rdd: RDD[T], withErrorHandlerBuilder: errorHandlerBuilder[Any] ): RDD[T] =
    new ErrorHandlingRDD[T]( rdd, withErrorHandlerBuilder )
}
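
Usage sketch: the snippet below wraps a plain RDD with ErrorHandlingRDD.forRDD so that a failing element is routed to a handler instead of killing the task. It assumes the errorHandlerBuilder[Any] alias (defined in TryCatchHelpers, not shown on this page) resolves to a function from an operation name to a (value, exception) callback; the logErrors builder and that shape are hypothetical, so check TryCatchHelpers for the actual signature before relying on this.

import org.apache.spark.SparkContext
import org.memeticlabs.spark.rdd.trycatch.ErrorHandlingRDD

val sc = new SparkContext("local[*]", "trycatch-example")
val raw = sc.parallelize(Seq("1", "2", "oops", "4"))

// hypothetical handler builder (assumed shape, see lead-in): log the failing
// value and the exception rather than letting the exception fail the task
val logErrors = (op: String) => (value: Any, error: Throwable) =>
  println(s"[$op] failed on $value: ${error.getMessage}")

val safe = ErrorHandlingRDD.forRDD(raw, logErrors)
// "oops" throws NumberFormatException inside map; under the assumed handler
// shape it is logged and dropped instead of aborting the job
safe.map(_.toInt).collect()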