All downloads are free. The search and download functionality uses the official Maven repository.

com.twitter.scalding.parquet.tuple.scheme.ParquetTupleConverter.scala Maven / Gradle / Ivy

There is a newer version: 0.16.1-RC2
Show newest version
package com.twitter.scalding.parquet.tuple.scheme

import org.apache.parquet.io.api.{ Binary, Converter, GroupConverter, PrimitiveConverter }
import scala.util.Try

/**
 * A parquet [[Converter]] that additionally exposes the value it has converted.
 *
 * Implementations are `Serializable`.
 *
 * @tparam T type of the value produced by this converter
 */
trait TupleFieldConverter[+T] extends Converter with Serializable {
  /**
   * Current value read from the parquet column.
   *
   * @return the value currently held by this converter
   */
  def currentValue: T

  /**
   * Resets the converter state, making it ready to read the next column value.
   */
  def reset(): Unit
}

/**
 * Base class for group converters that build a user defined tuple value out of parquet
 * column values.
 *
 * The converter state is cleared through `reset()` each time a group starts; nothing is done
 * when the group ends.
 *
 * @tparam T type of the tuple value produced
 */
abstract class ParquetTupleConverter[T] extends GroupConverter with TupleFieldConverter[T] {
  /** Nothing to finalize when a group ends. */
  override def end(): Unit = {}

  /** Clears whatever state is left from the previously read group. */
  override def start(): Unit = {
    reset()
  }
}

/**
 * Base trait for primitive field converters: keeps the last value stored in `value` and falls
 * back to `defaultValue` on reset.
 *
 * @tparam T primitive types (String, Double, Float, Long, Int, Short, Byte, Boolean)
 */
trait PrimitiveFieldConverter[T] extends PrimitiveConverter with TupleFieldConverter[T] {
  /** Value the converter is set back to by `reset()`. */
  val defaultValue: T
  // NOTE(review): this initializer runs as part of the trait's initialization, i.e. before the
  // body of a subclass that defines `defaultValue` as a plain `val`. `value` would then start
  // out as the JVM default (null / 0 / false) rather than the declared default until `reset()`
  // is called -- confirm before introducing a default that differs from those.
  var value: T = defaultValue

  /** The value currently stored in `value`. */
  override def currentValue: T = value

  /** Restores `value` to `defaultValue`. */
  override def reset(): Unit = value = defaultValue
}

/** Converter for string fields; the value is `null` until a binary value has been read. */
class StringConverter extends PrimitiveFieldConverter[String] {
  override val defaultValue: String = null

  /** Stores the UTF-8 string form of the given binary value. */
  override def addBinary(binary: Binary): Unit = {
    val decoded: String = binary.toStringUsingUTF8()
    value = decoded
  }
}

/** Converter for double fields; resets to `0.0`. */
class DoubleConverter extends PrimitiveFieldConverter[Double] {
  override val defaultValue: Double = 0.0

  /** Stores the double that was read. */
  override def addDouble(v: Double): Unit = {
    value = v
  }
}

/** Converter for float fields; resets to `0.0f`. */
class FloatConverter extends PrimitiveFieldConverter[Float] {
  override val defaultValue: Float = 0.0f

  /** Stores the float that was read. */
  override def addFloat(v: Float): Unit = {
    value = v
  }
}

/** Converter for long fields; resets to `0L`. */
class LongConverter extends PrimitiveFieldConverter[Long] {
  override val defaultValue: Long = 0L

  /** Stores the long that was read. */
  override def addLong(v: Long): Unit = {
    value = v
  }
}

/** Converter for int fields; resets to `0`. */
class IntConverter extends PrimitiveFieldConverter[Int] {
  override val defaultValue: Int = 0

  /** Stores the int that was read. */
  override def addInt(v: Int): Unit = {
    value = v
  }
}

/**
 * Converter for short fields, which arrive from parquet as ints; resets to `0`.
 */
class ShortConverter extends PrimitiveFieldConverter[Short] {
  override val defaultValue: Short = 0

  /**
   * Stores the int that was read, narrowed to a `Short`.
   *
   * `Int.toShort` never throws: a value outside the `Short` range is silently truncated to its
   * low-order 16 bits. The former `Try(...).getOrElse(0)` wrapper could therefore never take its
   * fallback and only added an allocation per value, so it has been dropped.
   *
   * @param v the int value read from the column
   */
  override def addInt(v: Int): Unit = {
    value = v.toShort
  }
}

/**
 * Converter for byte fields, which arrive from parquet as ints; resets to `0`.
 */
class ByteConverter extends PrimitiveFieldConverter[Byte] {
  override val defaultValue: Byte = 0

  /**
   * Stores the int that was read, narrowed to a `Byte`.
   *
   * `Int.toByte` never throws: a value outside the `Byte` range is silently truncated to its
   * low-order 8 bits. The former `Try(...).getOrElse(0)` wrapper could therefore never take its
   * fallback and only added an allocation per value, so it has been dropped.
   *
   * @param v the int value read from the column
   */
  override def addInt(v: Int): Unit = {
    value = v.toByte
  }
}

/** Converter for boolean fields; resets to `false`. */
class BooleanConverter extends PrimitiveFieldConverter[Boolean] {
  override val defaultValue: Boolean = false

  /** Stores the boolean that was read. */
  override def addBoolean(v: Boolean): Unit = {
    value = v
  }
}

/**
 * Collection field converter, such as a list (a Scala Option is also treated as a collection).
 *
 * @tparam T collection element type (can be primitive types or nested types)
 */
trait CollectionConverter[T] {
  /** Converter for the elements of the collection. */
  val child: TupleFieldConverter[T]

  /**
   * Adds one element to the collection being built.
   *
   * @param v the element to add
   */
  def appendValue(v: T): Unit
}

/**
 * Wraps a primitive converter so that it can model a primitive element of a collection: every
 * value read is first handed to the delegate, and the delegate's resulting value is then
 * appended to the parent collection.
 *
 * @param parent collection converter that receives each converted element
 * @tparam T primitive types (String, Double, Float, Long, Int, Short, Byte, Boolean)
 */
abstract class CollectionElementPrimitiveConverter[T](val parent: CollectionConverter[T]) extends PrimitiveConverter
  with TupleFieldConverter[T] {
  val delegate: PrimitiveFieldConverter[T]

  /** Hands the value currently held by the delegate over to the parent collection. */
  private def pushToParent(): Unit = parent.appendValue(delegate.currentValue)

  override def addBinary(v: Binary): Unit = {
    delegate.addBinary(v)
    pushToParent()
  }

  override def addBoolean(v: Boolean): Unit = {
    delegate.addBoolean(v)
    pushToParent()
  }

  override def addDouble(v: Double): Unit = {
    delegate.addDouble(v)
    pushToParent()
  }

  override def addFloat(v: Float): Unit = {
    delegate.addFloat(v)
    pushToParent()
  }

  override def addInt(v: Int): Unit = {
    delegate.addInt(v)
    pushToParent()
  }

  override def addLong(v: Long): Unit = {
    delegate.addLong(v)
    pushToParent()
  }

  /** The value currently held by the delegate. */
  override def currentValue: T = delegate.currentValue

  /** Resets the delegate. */
  override def reset(): Unit = {
    delegate.reset()
  }
}

/**
 * Wraps a group converter so that it can model a group-typed element of a collection: group
 * events are forwarded to the delegate and, when a group ends, the delegate's value is appended
 * to the parent collection.
 *
 * @param parent collection converter that receives each converted element
 * @tparam T group tuple type (can be a collection type, such as list)
 */
abstract class CollectionElementGroupConverter[T](val parent: CollectionConverter[T]) extends GroupConverter
  with TupleFieldConverter[T] {

  val delegate: TupleFieldConverter[T]

  /** The delegate viewed as a group converter. */
  private def groupDelegate: GroupConverter = delegate.asGroupConverter()

  override def start(): Unit = {
    groupDelegate.start()
  }

  // NOTE(review): the delegate's value is read before the delegate's own `end()` runs; a
  // delegate that completes its value in `end()` would be appended too early -- confirm that
  // this order is intended.
  override def end(): Unit = {
    val element: T = delegate.currentValue
    parent.appendValue(element)
    groupDelegate.end()
  }

  override def getConverter(i: Int): Converter = groupDelegate.getConverter(i)

  /** The value currently held by the delegate. */
  override def currentValue: T = delegate.currentValue

  /** Resets the delegate. */
  override def reset(): Unit = {
    delegate.reset()
  }
}

/**
 * Converter modeling an optional field as a collection holding at most one element. The
 * converter kind (primitive or group) is entirely that of the child converter.
 *
 * @tparam T option element type (can be primitive types or nested types)
 */
abstract class OptionConverter[T] extends TupleFieldConverter[Option[T]] with CollectionConverter[T] {
  var value: Option[T] = None

  override def isPrimitive: Boolean = child.isPrimitive()

  override def asPrimitiveConverter: PrimitiveConverter = child.asPrimitiveConverter()

  override def asGroupConverter: GroupConverter = child.asGroupConverter()

  /** Records the element that was read; a `null` element is recorded as `None`. */
  override def appendValue(v: T): Unit = {
    value = Option(v)
  }

  override def currentValue: Option[T] = value

  /** Goes back to `None` and resets the child converter. */
  override def reset(): Unit = {
    value = None
    child.reset()
  }
}

/**
 * List in parquet is represented by 3-level structure.
 * Check this https://github.com/apache/incubator-parquet-format/blob/master/LogicalTypes.md
 * Helper that wraps an element converter into the intermediate group converter of a list.
 */
object ListElement {
  /**
   * Builds a group converter whose single field (index 0) is converted by `child`.
   *
   * @param child converter for the list element
   * @return a group converter that throws `IllegalArgumentException` for any other field index
   */
  def wrapper(child: Converter): GroupConverter = new GroupConverter() {
    override def start(): Unit = {}

    override def end(): Unit = {}

    override def getConverter(i: Int): Converter = i match {
      case 0 => child
      case _ => throw new IllegalArgumentException("list have only one element field. can't reach " + i)
    }
  }
}
/**
 * List converter for modeling list field
 *
 * Elements are accumulated in a buffer, so appending an element no longer copies every element
 * read before it (the previous `value :+ v` on an immutable `List` made building a list of n
 * elements quadratic). `value` is still readable and assignable exactly as when it was a `var`.
 *
 * @tparam T list element type(can be primitive types or nested types)
 */
abstract class ListConverter[T] extends GroupConverter with TupleFieldConverter[List[T]] with CollectionConverter[T] {

  // Backing store for the elements of the list currently being read.
  private val elements = scala.collection.mutable.ListBuffer.empty[T]

  /** Elements appended since the last reset, in append order. */
  def value: List[T] = elements.toList

  /**
   * Replaces the accumulated elements; kept so that `value` can still be assigned.
   *
   * @param v the new content; `null` is treated as an empty list
   */
  def value_=(v: List[T]): Unit = {
    elements.clear()
    if (v != null) elements ++= v
    ()
  }

  /** Appends one element at the end of the list being built. */
  def appendValue(v: T): Unit = {
    elements += v
    ()
  }

  // Intermediate group of the 3-level parquet list structure; its only field is the element.
  lazy val listElement: GroupConverter = new GroupConverter() {
    override def getConverter(i: Int): Converter = {
      if (i != 0)
        throw new IllegalArgumentException("lists have only one element field. can't reach " + i)
      child
    }

    override def end(): Unit = ()

    override def start(): Unit = ()
  }

  /**
   * @param i field index, must be 0
   * @return the converter of the intermediate list-element group
   * @throws IllegalArgumentException if `i` is not 0
   */
  override def getConverter(i: Int): Converter = {
    if (i != 0)
      throw new IllegalArgumentException("lists have only one element field. can't reach " + i)
    listElement
  }

  override def end(): Unit = ()

  /** A new list starts: drop the elements of the previous one. */
  override def start(): Unit = reset()

  override def currentValue: List[T] = value

  /** Empties the list and resets the child converter. */
  override def reset(): Unit = {
    elements.clear()
    child.reset()
  }
}

/**
 * Set converter for modeling set field
 *
 * @tparam T set element type (can be primitive types or nested types)
 */
abstract class SetConverter[T] extends GroupConverter with TupleFieldConverter[Set[T]] with CollectionConverter[T] {

  var value: Set[T] = Set.empty[T]

  /** Adds one element to the set being built. */
  def appendValue(v: T): Unit = {
    value += v
  }

  // A set is stored the same way as a list, so the list element wrapper is reused.
  lazy val listElement: GroupConverter = ListElement.wrapper(child)

  override def getConverter(i: Int): Converter = i match {
    case 0 => listElement
    case _ => throw new IllegalArgumentException("sets have only one element field. can't reach " + i)
  }

  /** A new set starts: drop the elements of the previous one. */
  override def start(): Unit = {
    reset()
  }

  override def end(): Unit = {}

  override def currentValue: Set[T] = value

  /** Empties the set and resets the child converter. */
  override def reset(): Unit = {
    value = Set.empty[T]
    child.reset()
  }
}

/**
 * Converter modeling a map field as a collection of key/value pairs.
 *
 * @tparam K map key type
 * @tparam V map value type
 */
abstract class MapConverter[K, V] extends GroupConverter with TupleFieldConverter[Map[K, V]] with CollectionConverter[(K, V)] {

  var value: Map[K, V] = Map.empty[K, V]

  /** Adds one key/value pair to the map being built. */
  def appendValue(v: (K, V)): Unit = {
    value += v
  }

  override def getConverter(i: Int): Converter = i match {
    case 0 => child
    case _ => throw new IllegalArgumentException("maps have only one element type key_value(0). can't reach " + i)
  }

  /** A new map starts: drop the entries of the previous one. */
  override def start(): Unit = {
    reset()
  }

  override def end(): Unit = {}

  override def currentValue: Map[K, V] = value

  /** Empties the map and resets the child converter. */
  override def reset(): Unit = {
    value = Map.empty[K, V]
    child.reset()
  }
}

/**
 * Converter for the key_value group of a map: field 0 is the key, field 1 is the value, and each
 * completed pair is appended to the parent collection.
 *
 * @param parent collection converter that receives each key/value pair
 * @tparam K map key type
 * @tparam V map value type
 */
abstract class MapKeyValueConverter[K, V](parent: CollectionConverter[(K, V)])
  extends CollectionElementGroupConverter[(K, V)](parent) {

  val keyConverter: TupleFieldConverter[K]

  val valueConverter: TupleFieldConverter[V]

  override lazy val delegate: TupleFieldConverter[(K, V)] = new GroupConverter with TupleFieldConverter[(K, V)] {
    /** A new pair starts: reset both the key and the value converter. */
    override def start(): Unit = {
      reset()
    }

    override def end(): Unit = {}

    override def getConverter(i: Int): Converter = i match {
      case 0 => keyConverter
      case 1 => valueConverter
      case _ => throw new IllegalArgumentException("key_value has only the key (0) and value (1) fields expected: " + i)
    }

    /** The pair made of the current key and the current value. */
    override def currentValue: (K, V) = {
      val key: K = keyConverter.currentValue
      val mapped: V = valueConverter.currentValue
      (key, mapped)
    }

    override def reset(): Unit = {
      keyConverter.reset()
      valueConverter.reset()
    }
  }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy