Maven / Gradle / Ivy
The newest version!
/*
 * Copyright 2016 The BigDL Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import{NumericBoolean, NumericDouble, NumericFloat, NumericInt, NumericLong, NumericShort, NumericString}
import{NumericWildcard, TensorNumeric}
import{T, Table}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import scala.collection.mutable
import scala.reflect.ClassTag
/**
 * Transforms a Spark `Row` into a `Table` whose values are all `Tensor`s.
 *
 * This transformer is a container of [[RowTransformSchema]]s. For every incoming `Row`
 * it runs `transform` of each contained schema, in the order the schemas were supplied,
 * and stores each result in the output `Table` under the key `Tensor.scalar(schemaKey)`.
 *
 * Construction fails with an `IllegalArgumentException` (via `require`) when two schemas
 * share a `schemaKey`, or when an index-based schema has an index outside `[0, rowSize)`.
 *
 * @param schemas schemas of the transformer, whose keys must NOT be duplicated
 * @param rowSize size of the `Row`s transformed by this transformer; default is `None`,
 *                in which case indices are only checked to be non-negative
 */
class RowTransformer(
  @transient private val schemas: Seq[RowTransformSchema],
  protected val rowSize: Option[Int] = None
) extends Transformer[Row, Table] {

  /**
   * Validated schemas keyed by `schemaKey`.
   * A `LinkedHashMap` is used so that iteration follows the order of `schemas`.
   */
  protected val schemaMap: mutable.Map[String, RowTransformSchema] = {
    val map = mutable.LinkedHashMap[String, RowTransformSchema]()
    schemas.foreach { schema =>
      require(!map.contains(schema.schemaKey),
        s"Found replicated schemeKey: ${schema.schemaKey}"
      )

      // Indices are only used when no field names are given, so only validate them then.
      if (schema.fieldNames.isEmpty) {
        require(schema.indices.forall(i => i >= 0 && i < rowSize.getOrElse(Int.MaxValue)),
          s"At least one of indices are out of bound: ${schema.indices.mkString(",")}"
        )
      }

      map += schema.schemaKey -> schema
    }
    map
  }

  /**
   * Lazily converts each `Row` of `prev` into a `Table` holding one `Tensor` per schema.
   *
   * @param prev the rows to transform
   * @return an iterator producing one `Table` per input `Row`
   */
  override def apply(prev: Iterator[Row]): Iterator[Table] = {
    new Iterator[Table] {
      override def hasNext: Boolean = prev.hasNext

      override def next(): Table = {
        val row = prev.next()
        val table = T()
        schemaMap.foreach { case (key, schema) =>
          // `fieldNames` take precedence over `indices`; when both are empty,
          // every column of the row is selected.
          val selected: Seq[Int] =
            if (schema.fieldNames.nonEmpty) {
              schema.fieldNames.map(row.fieldIndex)
            } else if (schema.indices.nonEmpty) {
              schema.indices
            } else {
              0 until row.length
            }

          // Pair every selected value with the StructField that describes it.
          val (values, fields) = selected.map { i =>
            row.get(i) -> row.schema.fields(i)
          }.unzip

          val outputKey = Tensor.scalar[String](key)
          val output = schema.transform(values, fields)
          table.update(outputKey, output)
        }
        table
      }
    }
  }

}
object RowTransformer {

  /**
   * Builds a `RowTransformer` from the given schemas.
   *
   * @param schemas schemas of the transformer, whose keys must NOT be duplicated
   * @param rowSize size of the `Row`s to transform; a non-positive value (the default `0`)
   *                is treated as "unknown" and stored as `None`
   */
  def apply(
    schemas: Seq[RowTransformSchema],
    rowSize: Int = 0
  ): RowTransformer = {
    new RowTransformer(schemas, if (rowSize > 0) Some(rowSize) else None)
  }

  /**
   * A `RowTransformer` which transforms each of the `selected columns` to a size(1) `Tensor`.
   * The keys of the output `Table` are the `fieldNames` of the `selected columns`.
   *
   * @param fieldNames field names of `selected columns`
   */
  def atomic(fieldNames: Seq[String]): RowTransformer = {
    val transSchemas: Seq[RowTransformSchema] = fieldNames.map(f => ColToTensor(f, f))
    new RowTransformer(transSchemas)
  }

  /**
   * A `RowTransformer` which transforms each of the `selected columns` to a size(1) `Tensor`.
   * The keys of the output `Table` are the `indices` (as strings) of the `selected columns`.
   *
   * @param indices indices of `selected columns`
   * @param rowSize size of the `Row`s transformed by this transformer; it is always used
   *                as the upper bound when the indices are validated
   */
  def atomic(indices: Seq[Int], rowSize: Int): RowTransformer = {
    val transSchemas: Seq[RowTransformSchema] =
      indices.map(i => new ColToTensor(i.toString, i))
    new RowTransformer(transSchemas, Some(rowSize))
  }

  /**
   * A `RowTransformer` which concatenates the values of `all columns` into one `Tensor`.
   * The output `Table` holds a single key-value pair: the key is `schemaKey` and the
   * value is a `Tensor` whose size is the length of the `Row`.
   *
   * @param schemaKey key of the schema, default value is "all"
   * @tparam T the element type of the result `Tensor`
   */
  def numeric[T: ClassTag](schemaKey: String = "all"
  )(implicit ev: TensorNumeric[T]): RowTransformer = {
    new RowTransformer(Seq(ColsToNumeric[T](schemaKey)))
  }

  /**
   * A `RowTransformer` which concatenates the values of `selected columns` into `Tensor`s.
   * The output `Table` has one entry per key of `numericFields`; each value is the
   * `Tensor` built from the columns listed for that key.
   *
   * @param numericFields Map<`schemaKey`, `fieldNames of selected columns`> of numeric fields
   * @tparam T the element type of the result `Tensor`s
   */
  def numeric[T: ClassTag](numericFields: Map[String, Seq[String]]
  )(implicit ev: TensorNumeric[T]): RowTransformer = {
    val transSchemas: Seq[RowTransformSchema] =
      numericFields.toSeq.map { case (key, fields) => ColsToNumeric[T](key, fields) }
    new RowTransformer(transSchemas)
  }

  /**
   * A `RowTransformer` which contains both `atomic` schemas and `numeric` schemas.
   * Atomic schemas come first, followed by the numeric ones.
   *
   * @param atomicFields field names of `selected columns`, each becoming its own schema key
   * @param numericFields Map<`schemaKey`, `fieldNames of selected columns`> of numeric fields
   * @tparam T the element type of the numeric result `Tensor`s
   */
  def atomicWithNumeric[T: ClassTag](
    atomicFields: Seq[String],
    numericFields: Map[String, Seq[String]]
  )(implicit ev: TensorNumeric[T]): RowTransformer = {
    val atomicSchemas: Seq[RowTransformSchema] =
      atomicFields.map(f => ColToTensor(f, f))
    val numericSchemas: Seq[RowTransformSchema] =
      numericFields.toSeq.map { case (key, fields) => ColsToNumeric[T](key, fields) }
    // Build an immutable Seq instead of handing a mutable buffer to the transformer.
    new RowTransformer(atomicSchemas ++ numericSchemas)
  }

}
/**
 * A `schema` describes a transforming job which converts the selected columns of a `Row`
 * into one `Tensor` of the result `Table`.
 */
trait RowTransformSchema extends Serializable {

  /**
   * Key of the schema, which will be the key of the `Tensor` in the result `Table`.
   * It should therefore be `unique` within a single `RowTransformer`.
   */
  val schemaKey: String

  /**
   * `Indices` of the selected columns.
   *
   * They only take effect when `fieldNames` is empty; otherwise `RowTransformer`
   * selects columns according to `fieldNames`.
   * If both `indices` and `fieldNames` are empty, `RowTransformer` selects all columns.
   */
  val indices: Seq[Int] = Seq.empty

  /**
   * `FieldNames` of the selected columns.
   *
   * This property overrides `indices` when it is not empty.
   */
  val fieldNames: Seq[String] = Seq.empty

  /**
   * Transforming logic of the schema.
   *
   * @param values values of selected columns
   * @param fields StructFields of selected columns
   * @return a result `Tensor`
   */
  def transform(values: Seq[Any], fields: Seq[StructField]): Tensor[NumericWildcard]

}
/**
 * A schema which specializes in transforming multiple `numeric` columns to one `Tensor`.
 *
 * The type of each selected column is identified by its `DataType`, and its value is
 * converted from that `DataType` to `T`.
 * Currently, `DoubleType`, `FloatType`, `ShortType`, `IntegerType`, `LongType` are supported.
 *
 * @param schemaKey key of the schema
 * @param indices indices of `selected columns`
 * @param fieldNames field names of `selected columns`
 * @tparam T the type of result `Tensor`
 */
class ColsToNumeric[@specialized T: ClassTag](
  override val schemaKey: String,
  override val indices: Seq[Int] = Seq.empty,
  override val fieldNames: Seq[String] = Seq.empty
)(implicit ev: TensorNumeric[T]) extends RowTransformSchema {

  /**
   * Packs the given values into a one-dimensional `Tensor[T]` with one element per value.
   *
   * @param input values of selected columns
   * @param fields StructFields of selected columns, positionally aligned with `input`
   * @return a `Tensor` of size `input.length`
   * @throws IllegalArgumentException if a column has an unsupported `DataType`
   */
  override def transform(input: Seq[Any], fields: Seq[StructField]): Tensor[NumericWildcard] = {
    val size = input.length
    val tensor = Tensor[T](size)
    var i = 0
    while (i < size) {
      // NOTE(review): a null cell appears to be unboxed to zero by these casts —
      // confirm that this is the intended handling of missing values.
      val value = fields(i).dataType match {
        // TODO: support VectorUDT
        case _: DoubleType => ev.fromType(input(i).asInstanceOf[Double])
        case _: FloatType => ev.fromType(input(i).asInstanceOf[Float])
        case _: ShortType => ev.fromType(input(i).asInstanceOf[Short])
        case _: IntegerType => ev.fromType(input(i).asInstanceOf[Int])
        case _: LongType => ev.fromType(input(i).asInstanceOf[Long])
        case tpe => throw new IllegalArgumentException(s"Found unSupported DataType($tpe)!")
      }
      // `setValue` is called with a 1-based position, hence `i + 1`.
      tensor.setValue(i + 1, value)
      i += 1
    }
    tensor.asInstanceOf[Tensor[NumericWildcard]]
  }

}
object ColsToNumeric {

  /**
   * Builds a `ColsToNumeric` which transforms `all columns` of a Row
   * (neither indices nor field names are set).
   *
   * @param schemaKey key of the schema
   * @tparam T the type of result `Tensor`
   */
  def apply[@specialized(Float, Double) T: ClassTag](schemaKey: String
  )(implicit ev: TensorNumeric[T]): ColsToNumeric[T] = {
    new ColsToNumeric[T](schemaKey)
  }

  /**
   * Builds a `ColsToNumeric` which transforms the `selected columns` of a Row.
   *
   * @param schemaKey key of the schema
   * @param fieldNames field names of `selected columns`
   * @tparam T the type of result `Tensor`
   */
  def apply[@specialized(Float, Double) T: ClassTag](
    schemaKey: String,
    fieldNames: Seq[String]
  )(implicit ev: TensorNumeric[T]): ColsToNumeric[T] = {
    new ColsToNumeric[T](schemaKey, Seq.empty, fieldNames)
  }

}
/**
 * A schema which specializes in transforming a `single column` to a size(1) `Tensor`.
 *
 * The type of the selected column is identified by its `DataType`, and the element type
 * of the result `Tensor` is chosen accordingly. `BooleanType`, `DoubleType`, `FloatType`,
 * `StringType`, `ShortType`, `IntegerType` and `LongType` are supported.
 *
 * @param schemaKey key of the schema
 * @param index index of selected column, overridden by non empty `fieldName`
 * @param fieldName field name of selected column, default is empty
 */
class ColToTensor(
  override val schemaKey: String,
  index: Int,
  fieldName: String = ""
) extends RowTransformSchema {

  override val indices: Seq[Int] = Seq(index)

  // A null field name is treated like an empty one, so selection falls back to `index`.
  override val fieldNames: Seq[String] = Option(fieldName).filter(_.nonEmpty).toSeq

  /**
   * Wraps the first given value into a size(1) `Tensor` typed after the column's `DataType`.
   *
   * @param input values of selected columns; only the first one is used
   * @param fields StructFields of selected columns; only the first one is used
   * @return a size(1) `Tensor` holding the value
   * @throws IllegalArgumentException if the column has an unsupported `DataType`
   */
  override def transform(input: Seq[Any], fields: Seq[StructField]): Tensor[NumericWildcard] = {
    val value = input.head
    val tensor = fields.head.dataType match {
      // TODO: support VectorUDT
      case _: BooleanType => Tensor[Boolean](1).setValue(1, value.asInstanceOf[Boolean])
      case _: DoubleType => Tensor[Double](1).setValue(1, value.asInstanceOf[Double])
      case _: FloatType => Tensor[Float](1).setValue(1, value.asInstanceOf[Float])
      case _: StringType => Tensor[String](1).setValue(1, value.asInstanceOf[String])
      case _: ShortType => Tensor[Short](1).setValue(1, value.asInstanceOf[Short])
      case _: IntegerType => Tensor[Int](1).setValue(1, value.asInstanceOf[Int])
      case _: LongType => Tensor[Long](1).setValue(1, value.asInstanceOf[Long])
      case t => throw new IllegalArgumentException(s"Found unSupported DataType($t)!")
    }
    tensor.asInstanceOf[Tensor[NumericWildcard]]
  }

}
object ColToTensor {

  /**
   * Builds a `ColToTensor` which selects its column by `fieldName`.
   * The index is set to `-1`, so the schema relies on a non-empty `fieldName`.
   *
   * @param schemaKey key of the schema
   * @param fieldName field name of selected column
   */
  def apply(schemaKey: String, fieldName: String): ColToTensor = {
    new ColToTensor(schemaKey, -1, fieldName)
  }

}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy