
org.memeticlabs.spark.rdd.trycatch.SourcePropagatingRDD.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-rdd-trycatch Show documentation
Show all versions of spark-rdd-trycatch Show documentation
Error trapping and handling functionality for Spark's RDD API
The newest version!
/**
* Copyright 2017 Tristan Nixon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Created by Tristan Nixon on 7/5/17.
*/
package org.memeticlabs.spark.rdd.trycatch
import scala.reflect.ClassTag
import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.{PartitionCoalescer, RDD}
import org.memeticlabs.spark.rdd.trycatch.TryCatchHelpers._
import org.memeticlabs.spark.rdd.trycatch.TryCatchRDDFunctions._
import org.memeticlabs.spark.rdd.trycatch.ValueWithSource._
/**
 * An `RDD[T]` view over an RDD of [[ValueWithSource]] records.
 *
 * Computing this RDD yields only the `value` of each record, while the
 * overridden transformations below operate on the wrapped `inner` RDD so the
 * `source` of every record is carried along into the resulting RDD.
 *
 * @param inner               the wrapped RDD holding each value with its source;
 *                            set to `null` by [[clearDependencies]]
 * @param errorHandlerBuilder given an operation name (e.g. `"map"`), produces the
 *                            handler passed to the `try*` transformations
 * @tparam T type of the values exposed by this RDD
 * @tparam S type of the source attached to each value
 */
private[memeticlabs] class SourcePropagatingRDD[T, S]
  ( var inner: RDD[ValueWithSource[T,S]],
    val errorHandlerBuilder: errorHandlerBuilder[ValueWithSource[_,S]] )
  (implicit tt: ClassTag[T], st: ClassTag[S])
  extends RDD[T]( inner )
{
  /** RDD computation */

  /** Partitions are exactly those of the wrapped parent RDD. */
  override protected def getPartitions: Array[Partition] =
    firstParent[ValueWithSource[T,S]].partitions

  /** Iterates the parent's partition and exposes only the `value` of each record. */
  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    firstParent[ValueWithSource[T,S]].iterator( split, context ).map( _.value )

  /** Drops the reference to the wrapped RDD in addition to the inherited cleanup. */
  override protected def clearDependencies(): Unit = {
    super.clearDependencies()
    inner = null
  }

  /** Source Access methods */

  // NOTE(review): both accessors read `inner`, which clearDependencies() sets to
  // null; calling them after that point would fail - confirm this is acceptable.

  /** The source of every record, in place of its value. */
  def source: RDD[S] = inner.map( _.source )

  /** Every record as a `(source, value)` pair. */
  def keyBySource: RDD[(S,T)] = inner.map( vws => ( vws.source, vws.value ) )

  /** Helper methods */

  /**
   * Builds the function that regroups a `(ValueWithSource[T,S], U)` pair into a
   * `ValueWithSource[(T,U),S]`, keeping the source of the left-hand element.
   *
   * This returns a standalone function value instead of being an instance
   * method that is eta-expanded at the call site. The function only touches its
   * argument and the `ValueWithSource` companion, so the closure handed to
   * `RDD.map` holds no reference to `this` and the enclosing RDD is not pulled
   * into the serialized task closure.
   */
  private def retuplify[U: ClassTag]: ( (ValueWithSource[T,S],U) ) => ValueWithSource[(T,U),S] =
    pair => ValueWithSource( ( pair._1.value, pair._2 ), pair._1.source )

  /**
   * Applies `tx` to the wrapped RDD and re-wraps the result in a new
   * [[SourcePropagatingRDD]] that shares this instance's error handler builder.
   */
  private def asSPEHRDD[U: ClassTag]( tx: (RDD[ValueWithSource[T,S]]) => RDD[ValueWithSource[U,S]] ) =
    new SourcePropagatingRDD[U, S]( tx( inner ), errorHandlerBuilder )

  /** Transformations */

  /** Maps values via `tryMap` on the wrapped RDD, using the handler built for `"map"`. */
  override def map[U: ClassTag](f: ( T ) => U): RDD[U] =
    asSPEHRDD { _.tryMap( vwsMapFn[T,U,S](f), errorHandlerBuilder("map") ) }

  /** Flat-maps values via `tryFlatMap` on the wrapped RDD, using the handler built for `"flatMap"`. */
  override def flatMap[U: ClassTag](f: ( T ) => TraversableOnce[U]): RDD[U] =
    asSPEHRDD { _.tryFlatMap( vwsFlatMapFn[T,U,S](f), errorHandlerBuilder("flatMap") ) }

  /** Filters on the value via `tryFilter` on the wrapped RDD, using the handler built for `"filter"`. */
  override def filter(f: ( T ) => Boolean): RDD[T] =
    asSPEHRDD { _.tryFilter( vws => f(vws.value), errorHandlerBuilder("filter") ) }

  /** Coalesces the wrapped RDD, adapting the value ordering through `vwsOrdering`. */
  override def coalesce( numPartitions: Int,
                         shuffle: Boolean,
                         partitionCoalescer: Option[PartitionCoalescer])
                       (implicit ord: Ordering[T] ): RDD[T] =
    asSPEHRDD { _.coalesce( numPartitions, shuffle, partitionCoalescer )(vwsOrdering(ord)) }

  /** Samples the wrapped RDD so the sampled records keep their sources. */
  override def sample( withReplacement: Boolean,
                       fraction: Double,
                       seed: Long ): RDD[T] =
    asSPEHRDD { _.sample( withReplacement, fraction, seed ) }

  /** Splits the wrapped RDD and re-wraps every split with the same error handler builder. */
  override def randomSplit( weights: Array[Double],
                            seed: Long ): Array[RDD[T]] =
    inner.randomSplit( weights, seed ).map( rdd => new SourcePropagatingRDD( rdd, errorHandlerBuilder ) )

  // override def union(other: RDD[T]): RDD[T] =
  // other match {
  // case o: SourcePropagatingRDD[T,S] => asSPEHRDD{_.union( o.inner ) }
  // case r: RDD[T] => this.union( fromSource[T]( r, errorHandlerBuilder ) )
  // }

  /** Cartesian product with `other`; each resulting pair keeps the source of its left element. */
  override def cartesian[U: ClassTag](other: RDD[U]): RDD[(T, U)] =
    asSPEHRDD { _.cartesian(other).map( retuplify[U] ) }

  /** Pairs each value with its index; the source of the original record is kept. */
  override def zipWithIndex(): RDD[(T, Long)] =
    asSPEHRDD { _.zipWithIndex().map( retuplify[Long] ) }
}
/** Factory methods for building a [[SourcePropagatingRDD]] from a plain RDD. */
object SourcePropagatingRDD
{
  /** Wraps every element of `source` so that it serves as both the value and its own source. */
  private[memeticlabs] def asVWS[S]( source: RDD[S] ): RDD[ValueWithSource[S,S]] =
    source.map { element => ValueWithSource( element, element ) }

  /**
   * Starts source tracking on `source`: each element becomes its own source.
   *
   * @param source              the RDD whose elements are the original sources
   * @param errorHandlerBuilder given an operation name, produces the error handler for it
   * @return a [[SourcePropagatingRDD]] exposed as a plain `RDD[S]`
   */
  def fromSource[S: ClassTag]( source: RDD[S], errorHandlerBuilder: errorHandlerBuilder[ValueWithSource[_,S]] ): RDD[S] =
  {
    val selfSourced = asVWS( source )
    new SourcePropagatingRDD[S,S]( selfSourced, errorHandlerBuilder )
  }

  /**
   * Same as the builder-based overload, but takes a single three-argument handler
   * `(operation, record, error) => Unit` and curries it on the operation name.
   *
   * @param source       the RDD whose elements are the original sources
   * @param errorHandler handler receiving the operation name, the record and the throwable
   * @return a [[SourcePropagatingRDD]] exposed as a plain `RDD[S]`
   */
  def fromSource[S: ClassTag]( source: RDD[S], errorHandler: ( String, ValueWithSource[_,S], Throwable ) => Unit ): RDD[S] =
  {
    // Fix the operation name first, leaving a two-argument handler for that operation.
    val handlerForOp: errorHandlerBuilder[ValueWithSource[_,S]] =
      ( op: String ) => ( record, error ) => errorHandler( op, record, error )
    fromSource[S]( source, handlerForOp )
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy