All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.scalding.source.CodecSource.scala Maven / Gradle / Ivy

There is a newer version: 0.18.0-RC1
Show newest version
/*
Copyright 2012 Twitter, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package com.twitter.scalding.source

import cascading.pipe.Pipe
import cascading.scheme.Scheme
import cascading.scheme.hadoop.WritableSequenceFile
import cascading.tuple.Fields
import com.twitter.bijection.{ Bijection, Injection }
import com.twitter.chill.Externalizer
import com.twitter.scalding._

import java.util.Arrays
import org.apache.hadoop.io.BytesWritable
import scala.collection.JavaConverters._

/**
 * Source used to write some type T into a WritableSequenceFile using a codec on T
 * for serialization.
 */

object BytesWritableCodec {
  def get =
    Bijection.build[Array[Byte], BytesWritable] { arr =>
      new BytesWritable(arr)
    } { w =>
      Arrays.copyOfRange(w.getBytes, 0, w.getLength)
    }
}

object CodecSource {
  def apply[T](paths: String*)(implicit codec: Injection[T, Array[Byte]]) = new CodecSource[T](paths)
}

class CodecSource[T] private (val hdfsPaths: Seq[String], val maxFailures: Int = 0)(implicit @transient injection: Injection[T, Array[Byte]])
  extends FileSource
  with Mappable[T]
  with LocalTapSource {
  import Dsl._

  val fieldSym = 'encodedBytes
  lazy val field = new Fields(fieldSym.name)
  val injectionBox = Externalizer(injection andThen BytesWritableCodec.get)

  def localPaths = hdfsPaths

  override def converter[U >: T] = TupleConverter.asSuperConverter[T, U](TupleConverter.singleConverter[T])
  override def hdfsScheme =
    HadoopSchemeInstance(new WritableSequenceFile(field, classOf[BytesWritable]).asInstanceOf[Scheme[_, _, _, _, _]])

  protected lazy val checkedInversion = new MaxFailuresCheck[T, BytesWritable](maxFailures)(injectionBox.get)
  override def transformForRead(pipe: Pipe) =
    pipe.flatMap((fieldSym) -> (fieldSym)) { (bw: BytesWritable) => checkedInversion(bw) }

  override def transformForWrite(pipe: Pipe) =
    pipe.mapTo((0) -> (fieldSym)) { injectionBox.get.apply(_: T) }

  override def toIterator(implicit config: Config, mode: Mode): Iterator[T] = {
    val tap = createTap(Read)(mode)
    mode.openForRead(config, tap)
      .asScala
      .flatMap { te =>
        checkedInversion(te.selectTuple(sourceFields).getObject(0).asInstanceOf[BytesWritable])
      }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy