All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.scalding.avro.AvroSource.scala Maven / Gradle / Ivy

/*  Copyright 2013 eBay, inc.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package com.twitter.scalding.avro

import cascading.avro.AvroScheme
import cascading.avro.PackedAvroScheme
import cascading.avro.local.{ AvroScheme => LAvroScheme, PackedAvroScheme => LPackedAvroScheme }
import com.twitter.scalding._
import org.apache.avro.Schema
import cascading.scheme.Scheme
import java.io.InputStream
import java.io.OutputStream
import java.util.Properties
import cascading.tuple.Fields
import collection.JavaConverters._
import org.apache.hadoop.mapred.{ OutputCollector, RecordReader, JobConf }

trait UnpackedAvroFileScheme extends FileSource {
  def schema: Option[Schema]

  // HadoopSchemeInstance gives compile errors in 2.10 for some reason
  override def hdfsScheme = (new AvroScheme(schema.getOrElse(null))).asInstanceOf[Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _]]

  override def localScheme = (new LAvroScheme(schema.getOrElse(null))).asInstanceOf[Scheme[Properties, InputStream, OutputStream, _, _]]

}

trait PackedAvroFileScheme[T] extends FileSource {
  def schema: Schema

  // HadoopSchemeInstance gives compile errors for this in 2.10 for some reason
  override def hdfsScheme = (new PackedAvroScheme[T](schema)).asInstanceOf[Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _]]

  override def localScheme = (new LPackedAvroScheme[T](schema)).asInstanceOf[Scheme[Properties, InputStream, OutputStream, _, _]]
}

object UnpackedAvroSource {

  def apply[T](path: String, schema: Schema) =
    new UnpackedAvroSource[T](Seq(path), Some(schema))

  def apply[T](path: String) =
    new UnpackedAvroSource[T](Seq(path), None)

  def apply[T](paths: Seq[String], schema: Schema) =
    new UnpackedAvroSource[T](paths, Some(schema))

  def apply[T](paths: Seq[String]) =
    new UnpackedAvroSource[T](paths, None)

  def apply[T](path: String, schema: Option[Schema]) =
    new UnpackedAvroSource[T](Seq(path), schema)
}

case class UnpackedAvroSource[T](paths: Seq[String], schema: Option[Schema])(implicit val conv: TupleConverter[T], tset: TupleSetter[T])

  extends FixedPathSource(paths: _*)
  with UnpackedAvroFileScheme with Mappable[T] with TypedSink[T] {

  override def sinkFields: Fields = {
    val outFields = schema.map {
      schema =>
        val schemaFields = schema.getFields
        schemaFields.asScala.foldLeft(new Fields())((cFields, sField) => cFields.append(new Fields(sField.name())))
    }
    outFields.getOrElse(Dsl.intFields(0 until setter.arity))
  }

  override def converter[U >: T] = TupleConverter.asSuperConverter[T, U](conv)

  override def setter[U <: T] = TupleSetter.asSubSetter[T, U](tset)

}

object PackedAvroSource {
  def apply[T: AvroSchemaType: Manifest: TupleConverter](path: String) = new PackedAvroSource[T](Seq(path))
}

case class PackedAvroSource[T](paths: Seq[String])(implicit val mf: Manifest[T], conv: TupleConverter[T], tset: TupleSetter[T], avroType: AvroSchemaType[T])
  extends FixedPathSource(paths: _*) with PackedAvroFileScheme[T] with Mappable[T] with TypedSink[T] {
  override def converter[U >: T] = TupleConverter.asSuperConverter[T, U](conv)

  override def setter[U <: T] = TupleSetter.asSubSetter[T, U](tset)

  override def schema = avroType.schema
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy