All downloads are free. Search and download functionality uses the official Maven repository.

org.memeticlabs.spark.rdd.trycatch.SourcePropagatingRDD.scala Maven / Gradle / Ivy

The newest version!
/**
	* Copyright 2017 Tristan Nixon
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	* Created by Tristan Nixon on 7/5/17.
	*/
package org.memeticlabs.spark.rdd.trycatch

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.{PartitionCoalescer, RDD}

import org.memeticlabs.spark.rdd.trycatch.TryCatchHelpers._
import org.memeticlabs.spark.rdd.trycatch.TryCatchRDDFunctions._
import org.memeticlabs.spark.rdd.trycatch.ValueWithSource._

/**
	* An `RDD[T]` view over an RDD of [[ValueWithSource]] records.
	*
	* Every element of `inner` carries a value of type `T` together with a source of type `S`.
	* This RDD exposes only the values, while the overridden transformations below operate on
	* `inner` so that the result is again a [[SourcePropagatingRDD]] sharing the same
	* `errorHandlerBuilder`.
	*
	* NOTE: `inner` is set to `null` by [[clearDependencies]]. Every method below that reads
	* `inner` (the source accessors and all transformations) dereferences it directly and will
	* therefore fail if called after that point.
	*
	* @param inner               the underlying RDD of value/source pairs
	* @param errorHandlerBuilder given an operation name (e.g. "map"), yields the handler passed
	*                            to the corresponding `try*` operation
	* @tparam T type of the values exposed by this RDD
	* @tparam S type of the source attached to each value
	*/
private[memeticlabs] class SourcePropagatingRDD[T, S]
	( var inner: RDD[ValueWithSource[T,S]],
	  val errorHandlerBuilder: errorHandlerBuilder[ValueWithSource[_,S]] )
	(implicit tt: ClassTag[T], st: ClassTag[S])
		extends RDD[T]( inner )
{

	/** RDD computation */

	/** Uses the parent's partitions unchanged. */
	override protected def getPartitions: Array[Partition] =
		firstParent[ValueWithSource[T,S]].partitions

	/** Iterates over the parent's partition and strips each record down to its value. */
	override def compute(split: Partition, context: TaskContext): Iterator[T] =
		firstParent[ValueWithSource[T,S]].iterator( split, context ).map( _.value )

	/**
		* Drops the reference to `inner` in addition to the base-class cleanup.
		* NOTE(review): Spark calls this after checkpointing; from then on the source
		* information is no longer reachable through this instance.
		*/
	override protected def clearDependencies(): Unit = {
		super.clearDependencies()
		inner = null
	}

	/** Source Access methods */

	/** The source of every element, in place of its value. */
	def source: RDD[S] = inner.map( _.source )

	/** Every element as a `(source, value)` pair. */
	def keyBySource: RDD[(S,T)] = inner.map( vws => ( vws.source, vws.value ) )

	/** Helper methods */

	/**
		* Builds a function that turns a `(ValueWithSource(value, source), u)` pair into
		* `ValueWithSource((value, u), source)`.
		*
		* This deliberately returns a function value instead of being a method that is
		* eta-expanded at the call site: the returned function reads only its argument, so
		* passing it to `RDD.map` does not make the task closure reference this RDD (and with
		* it `inner` and `errorHandlerBuilder`).
		*/
	private def retuplify[U]: ((ValueWithSource[T,S],U)) => ValueWithSource[(T,U),S] =
		pair => ValueWithSource( ( pair._1.value, pair._2 ), pair._1.source )

	/** Applies `tx` to `inner` and wraps the result with the same `errorHandlerBuilder`. */
	private def asSPEHRDD[U: ClassTag]( tx: (RDD[ValueWithSource[T,S]]) => RDD[ValueWithSource[U,S]] ) =
		new SourcePropagatingRDD[U, S]( tx( inner ), errorHandlerBuilder )

	/** Transformations */

	/** Maps values via `tryMap` on `inner`, using the handler built for "map". */
	override def map[U: ClassTag](f: ( T ) => U): RDD[U] =
		asSPEHRDD { _.tryMap( vwsMapFn[T,U,S](f), errorHandlerBuilder("map") ) }

	/** Flat-maps values via `tryFlatMap` on `inner`, using the handler built for "flatMap". */
	override def flatMap[U: ClassTag](f: ( T ) => TraversableOnce[U]): RDD[U] =
		asSPEHRDD { _.tryFlatMap( vwsFlatMapFn[T,U,S](f), errorHandlerBuilder("flatMap") ) }

	/** Filters on the value via `tryFilter` on `inner`, using the handler built for "filter". */
	override def filter(f: ( T ) => Boolean): RDD[T] =
		asSPEHRDD { _.tryFilter( vws => f(vws.value), errorHandlerBuilder("filter") ) }

	/**
		* Coalesces `inner`, adapting the value ordering with `vwsOrdering`.
		* NOTE(review): Spark's `RDD.coalesce` defaults `ord` to `null`; confirm that
		* `vwsOrdering` tolerates a null ordering.
		*/
	override def coalesce( numPartitions: Int,
	                       shuffle: Boolean,
	                       partitionCoalescer: Option[PartitionCoalescer])
	                     (implicit ord: Ordering[T] ): RDD[T] =
		asSPEHRDD { _.coalesce( numPartitions, shuffle, partitionCoalescer )(vwsOrdering(ord)) }

	/** Samples `inner`, so sampled values keep their sources. */
	override def sample( withReplacement: Boolean,
	                     fraction: Double,
	                     seed: Long ): RDD[T] =
		asSPEHRDD { _.sample( withReplacement, fraction, seed ) }

	/** Splits `inner` and wraps each resulting split with the same `errorHandlerBuilder`. */
	override def randomSplit( weights: Array[Double],
	                          seed: Long ): Array[RDD[T]] =
		inner.randomSplit( weights, seed ).map( rdd => new SourcePropagatingRDD( rdd, errorHandlerBuilder ) )

//	override def union(other: RDD[T]): RDD[T] =
//		other match {
//			case o: SourcePropagatingRDD[T,S] => asSPEHRDD{_.union( o.inner ) }
//			case r: RDD[T] => this.union( fromSource[T]( r, errorHandlerBuilder ) )
//		}

	/** Cartesian product with `other`; each resulting pair keeps the source of its left element. */
	override def cartesian[U: ClassTag](other: RDD[U]): RDD[(T, U)] =
		asSPEHRDD { _.cartesian(other).map( retuplify[U] ) }

	/** Zips `inner` with its element indices; each `(value, index)` pair keeps the value's source. */
	override def zipWithIndex(): RDD[(T, Long)] = asSPEHRDD { _.zipWithIndex().map( retuplify[Long] ) }
}

/** Factory methods for wrapping a plain RDD in a [[SourcePropagatingRDD]]. */
object SourcePropagatingRDD
{
	/** Pairs every element with itself, so each record starts out as its own source. */
	private[memeticlabs] def asVWS[S]( source: RDD[S] ): RDD[ValueWithSource[S,S]] =
		source.map { elem => ValueWithSource( elem, elem ) }

	/**
		* Wraps `source` so that each element is tracked as its own source.
		*
		* @param source              the RDD to wrap
		* @param errorHandlerBuilder given an operation name, yields the error handler for it
		*/
	def fromSource[S: ClassTag]( source: RDD[S], errorHandlerBuilder: errorHandlerBuilder[ValueWithSource[_,S]] ): RDD[S] =
	{
		val selfSourced = asVWS( source )
		new SourcePropagatingRDD[S,S]( selfSourced, errorHandlerBuilder )
	}

	/**
		* Same as the builder-based overload, but takes a single three-argument handler.
		* The handler is curried on its first (operation name) argument to form the builder.
		*
		* @param source       the RDD to wrap
		* @param errorHandler receives the operation name, a `ValueWithSource` and a `Throwable`
		*/
	def fromSource[S: ClassTag]( source: RDD[S], errorHandler: ( String, ValueWithSource[_,S], Throwable ) => Unit ): RDD[S] =
	{
		val builder: errorHandlerBuilder[ValueWithSource[_,S]] =
			( op: String ) => ( failed, cause ) => errorHandler( op, failed, cause )
		fromSource[S]( source, builder )
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy