// Many resources are needed to download a project. Please understand that we have to
// cover our server costs. Thank you in advance. The project price is only $1.
// You can buy this project and download/modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.execution.datasources.orc
import org.apache.hadoop.io._
import org.apache.orc.mapred.{OrcList, OrcMap, OrcStruct, OrcTimestamp}
import org.apache.orc.storage.serde2.io.{DateWritable, HiveDecimalWritable}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeArrayData}
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
/**
 * A deserializer to deserialize ORC structs to Spark rows.
 *
 * The row returned by [[deserialize]] is a single mutable instance that is overwritten on every
 * call, so callers that need to retain a row must copy it first.
 *
 * @param dataSchema the schema of the data; not referenced by the deserialization logic below
 * @param requiredSchema the schema of the rows produced by [[deserialize]]
 * @param requestedColIds for each field of `requiredSchema` (same position), the index of the
 *                        ORC struct field to read, or -1 if the column is missing
 */
class OrcDeserializer(
    dataSchema: StructType,
    requiredSchema: StructType,
    requestedColIds: Array[Int]) {

  // Reused for every call of `deserialize`.
  private val resultRow = new SpecificInternalRow(requiredSchema.map(_.dataType))

  // One writer per column that is present (requested id != -1), in `requiredSchema` order.
  // Each writer already knows the ordinal in `resultRow` that it has to update.
  private val fieldWriters: Array[WritableComparable[_] => Unit] = {
    requiredSchema.zipWithIndex
      // The values of missing columns are always null, so they do not need writers.
      .filterNot { case (_, index) => requestedColIds(index) == -1 }
      .map { case (f, index) =>
        val writer = newWriter(f.dataType, new RowUpdater(resultRow))
        (value: WritableComparable[_]) => writer(index, value)
      }.toArray
  }

  // `targetOrdinals(i)` is the ordinal in `resultRow` that `fieldWriters(i)` updates.
  private val targetOrdinals: Array[Int] =
    requiredSchema.indices.filter(index => requestedColIds(index) != -1).toArray

  // `validColIds(i)` is the ORC field index that feeds `fieldWriters(i)`. It is derived from
  // `targetOrdinals` so that the three arrays are guaranteed to stay aligned.
  private val validColIds: Array[Int] = targetOrdinals.map(ordinal => requestedColIds(ordinal))

  /**
   * Converts the given ORC struct into a Catalyst row laid out according to `requiredSchema`.
   *
   * Only the columns whose requested id is not -1 are touched; the slots of missing columns are
   * never written.
   *
   * @param orcStruct the ORC struct to read field values from
   * @return the shared, mutable result row (overwritten by the next call)
   */
  def deserialize(orcStruct: OrcStruct): InternalRow = {
    var i = 0
    while (i < validColIds.length) {
      val value = orcStruct.getFieldValue(validColIds(i))
      if (value == null) {
        // `i` only counts the present columns; the slot to null out is the column's ordinal in
        // `resultRow`. Using `i` here would null the wrong slot (and leave a stale value from
        // the previous row in the right one) whenever a missing column precedes this one.
        resultRow.setNullAt(targetOrdinals(i))
      } else {
        fieldWriters(i)(value)
      }
      i += 1
    }
    resultRow
  }

  /**
   * Creates a writer to write ORC values to Catalyst data structure at the given ordinal.
   *
   * The returned function casts the ORC value to the writable class expected for `dataType` and
   * stores the converted value through `updater`. Except for `NullType`, the writers dereference
   * the value, so every caller in this class checks for null before invoking them.
   *
   * @param dataType the Catalyst type of the value to write
   * @param updater the target that receives the converted value
   * @return a function taking the target ordinal and the non-null ORC value
   * @throws UnsupportedOperationException if `dataType` is not handled below
   */
  private def newWriter(
      dataType: DataType, updater: CatalystDataUpdater): (Int, WritableComparable[_]) => Unit =
    dataType match {
      case NullType => (ordinal, _) =>
        updater.setNullAt(ordinal)

      case BooleanType => (ordinal, value) =>
        updater.setBoolean(ordinal, value.asInstanceOf[BooleanWritable].get)

      case ByteType => (ordinal, value) =>
        updater.setByte(ordinal, value.asInstanceOf[ByteWritable].get)

      case ShortType => (ordinal, value) =>
        updater.setShort(ordinal, value.asInstanceOf[ShortWritable].get)

      case IntegerType => (ordinal, value) =>
        updater.setInt(ordinal, value.asInstanceOf[IntWritable].get)

      case LongType => (ordinal, value) =>
        updater.setLong(ordinal, value.asInstanceOf[LongWritable].get)

      case FloatType => (ordinal, value) =>
        updater.setFloat(ordinal, value.asInstanceOf[FloatWritable].get)

      case DoubleType => (ordinal, value) =>
        updater.setDouble(ordinal, value.asInstanceOf[DoubleWritable].get)

      case StringType => (ordinal, value) =>
        updater.set(ordinal, UTF8String.fromBytes(value.asInstanceOf[Text].copyBytes))

      case BinaryType => (ordinal, value) =>
        val binary = value.asInstanceOf[BytesWritable]
        // Copy only the first `getLength` bytes of the array returned by `getBytes`.
        val bytes = new Array[Byte](binary.getLength)
        System.arraycopy(binary.getBytes, 0, bytes, 0, binary.getLength)
        updater.set(ordinal, bytes)

      case DateType => (ordinal, value) =>
        updater.setInt(ordinal, DateTimeUtils.fromJavaDate(value.asInstanceOf[DateWritable].get))

      case TimestampType => (ordinal, value) =>
        updater.setLong(ordinal, DateTimeUtils.fromJavaTimestamp(value.asInstanceOf[OrcTimestamp]))

      case DecimalType.Fixed(precision, scale) => (ordinal, value) =>
        val decimal = value.asInstanceOf[HiveDecimalWritable].getHiveDecimal()
        val v = Decimal(decimal.bigDecimalValue, decimal.precision(), decimal.scale())
        // Adjust to the precision and scale declared by the Catalyst type.
        v.changePrecision(precision, scale)
        updater.set(ordinal, v)

      case st: StructType => (ordinal, value) =>
        // A fresh row is created for every value; the field writers are bound to that row.
        val result = new SpecificInternalRow(st)
        val fieldUpdater = new RowUpdater(result)
        val fieldConverters = st.map(_.dataType).map { dt =>
          newWriter(dt, fieldUpdater)
        }.toArray
        val orcStruct = value.asInstanceOf[OrcStruct]

        var i = 0
        while (i < st.length) {
          val value = orcStruct.getFieldValue(i)
          if (value == null) {
            result.setNullAt(i)
          } else {
            fieldConverters(i)(i, value)
          }
          i += 1
        }

        updater.set(ordinal, result)

      case ArrayType(elementType, _) => (ordinal, value) =>
        val orcArray = value.asInstanceOf[OrcList[WritableComparable[_]]]
        val length = orcArray.size()
        val result = createArrayData(elementType, length)
        val elementUpdater = new ArrayDataUpdater(result)
        val elementConverter = newWriter(elementType, elementUpdater)

        var i = 0
        while (i < length) {
          val value = orcArray.get(i)
          if (value == null) {
            result.setNullAt(i)
          } else {
            elementConverter(i, value)
          }
          i += 1
        }

        updater.set(ordinal, result)

      case MapType(keyType, valueType, _) => (ordinal, value) =>
        val orcMap = value.asInstanceOf[OrcMap[WritableComparable[_], WritableComparable[_]]]
        val length = orcMap.size()
        val keyArray = createArrayData(keyType, length)
        val keyUpdater = new ArrayDataUpdater(keyArray)
        val keyConverter = newWriter(keyType, keyUpdater)
        val valueArray = createArrayData(valueType, length)
        val valueUpdater = new ArrayDataUpdater(valueArray)
        val valueConverter = newWriter(valueType, valueUpdater)

        var i = 0
        val it = orcMap.entrySet().iterator()
        while (it.hasNext) {
          val entry = it.next()
          // Keys are written without a null check; only values are checked for null.
          keyConverter(i, entry.getKey)
          val value = entry.getValue
          if (value == null) {
            valueArray.setNullAt(i)
          } else {
            valueConverter(i, value)
          }
          i += 1
        }

        updater.set(ordinal, new ArrayBasedMapData(keyArray, valueArray))

      // A user-defined type is written using the writer of its underlying SQL type.
      case udt: UserDefinedType[_] => newWriter(udt.sqlType, updater)

      case _ =>
        throw new UnsupportedOperationException(s"$dataType is not supported yet.")
    }

  /**
   * Allocates an `ArrayData` with `length` slots for elements of the given type: an
   * `UnsafeArrayData` built from a primitive array for the primitive element types listed below,
   * and a `GenericArrayData` backed by an `Array[Any]` for everything else.
   */
  private def createArrayData(elementType: DataType, length: Int): ArrayData = elementType match {
    case BooleanType => UnsafeArrayData.fromPrimitiveArray(new Array[Boolean](length))
    case ByteType => UnsafeArrayData.fromPrimitiveArray(new Array[Byte](length))
    case ShortType => UnsafeArrayData.fromPrimitiveArray(new Array[Short](length))
    case IntegerType => UnsafeArrayData.fromPrimitiveArray(new Array[Int](length))
    case LongType => UnsafeArrayData.fromPrimitiveArray(new Array[Long](length))
    case FloatType => UnsafeArrayData.fromPrimitiveArray(new Array[Float](length))
    case DoubleType => UnsafeArrayData.fromPrimitiveArray(new Array[Double](length))
    case _ => new GenericArrayData(new Array[Any](length))
  }

  /**
   * A base interface for updating values inside catalyst data structure like `InternalRow` and
   * `ArrayData`.
   *
   * Only `set` is abstract; by default every typed setter (and `setNullAt`) delegates to it.
   */
  sealed trait CatalystDataUpdater {
    def set(ordinal: Int, value: Any): Unit

    def setNullAt(ordinal: Int): Unit = set(ordinal, null)
    def setBoolean(ordinal: Int, value: Boolean): Unit = set(ordinal, value)
    def setByte(ordinal: Int, value: Byte): Unit = set(ordinal, value)
    def setShort(ordinal: Int, value: Short): Unit = set(ordinal, value)
    def setInt(ordinal: Int, value: Int): Unit = set(ordinal, value)
    def setLong(ordinal: Int, value: Long): Unit = set(ordinal, value)
    def setDouble(ordinal: Int, value: Double): Unit = set(ordinal, value)
    def setFloat(ordinal: Int, value: Float): Unit = set(ordinal, value)
  }

  /** An updater that forwards every call to the corresponding method of the given row. */
  final class RowUpdater(row: InternalRow) extends CatalystDataUpdater {
    override def setNullAt(ordinal: Int): Unit = row.setNullAt(ordinal)
    override def set(ordinal: Int, value: Any): Unit = row.update(ordinal, value)

    override def setBoolean(ordinal: Int, value: Boolean): Unit = row.setBoolean(ordinal, value)
    override def setByte(ordinal: Int, value: Byte): Unit = row.setByte(ordinal, value)
    override def setShort(ordinal: Int, value: Short): Unit = row.setShort(ordinal, value)
    override def setInt(ordinal: Int, value: Int): Unit = row.setInt(ordinal, value)
    override def setLong(ordinal: Int, value: Long): Unit = row.setLong(ordinal, value)
    override def setDouble(ordinal: Int, value: Double): Unit = row.setDouble(ordinal, value)
    override def setFloat(ordinal: Int, value: Float): Unit = row.setFloat(ordinal, value)
  }

  /** An updater that forwards every call to the corresponding method of the given array. */
  final class ArrayDataUpdater(array: ArrayData) extends CatalystDataUpdater {
    override def setNullAt(ordinal: Int): Unit = array.setNullAt(ordinal)
    override def set(ordinal: Int, value: Any): Unit = array.update(ordinal, value)

    override def setBoolean(ordinal: Int, value: Boolean): Unit = array.setBoolean(ordinal, value)
    override def setByte(ordinal: Int, value: Byte): Unit = array.setByte(ordinal, value)
    override def setShort(ordinal: Int, value: Short): Unit = array.setShort(ordinal, value)
    override def setInt(ordinal: Int, value: Int): Unit = array.setInt(ordinal, value)
    override def setLong(ordinal: Int, value: Long): Unit = array.setLong(ordinal, value)
    override def setDouble(ordinal: Int, value: Double): Unit = array.setDouble(ordinal, value)
    override def setFloat(ordinal: Int, value: Float): Unit = array.setFloat(ordinal, value)
  }
}