// Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. The project price is only $1.
// You can buy this project and download or modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.hive

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.hive.ql.exec._
import org.apache.hadoop.hive.ql.udf.{UDFType => HiveUDFType}
import org.apache.hadoop.hive.ql.udf.generic._
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF._
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils.ConversionHelper
import org.apache.hadoop.hive.serde2.objectinspector.{ConstantObjectInspector, ObjectInspector,
  ObjectInspectorFactory}
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.hive.HiveShim._
import org.apache.spark.sql.types._

/**
 * Catalyst expression that evaluates a Hive `UDF` instance created by `funcWrapper`.
 *
 * The evaluation method is resolved once from the children's data types; each call to
 * `eval` wraps the child results, converts them through Hive's `ConversionHelper`, invokes
 * the resolved method reflectively, and unwraps the result.
 *
 * @param name        name reported by `prettyName` and used when rendering `sql`
 * @param funcWrapper wrapper used to instantiate the Hive function and to report its class name
 * @param children    argument expressions, evaluated in order for every input row
 */
private[hive] case class HiveSimpleUDF(
    name: String, funcWrapper: HiveFunctionWrapper, children: Seq[Expression])
  extends Expression with HiveInspectors with CodegenFallback with Logging {

  override def deterministic: Boolean = isUDFDeterministic

  override def nullable: Boolean = true

  @transient
  lazy val function = funcWrapper.createFunction[UDF]()

  // The Java method Hive's resolver selects for the children's types.
  @transient
  private lazy val method =
    function.getResolver.getEvalMethod(children.map(_.dataType.toTypeInfo).asJava)

  @transient
  private lazy val arguments = children.map(toInspector).toArray

  // A function without the Hive `UDFType` annotation is treated as non-deterministic.
  @transient
  private lazy val isUDFDeterministic =
    Option(function.getClass.getAnnotation(classOf[HiveUDFType])).exists(_.deterministic())

  override def foldable: Boolean = isUDFDeterministic && children.forall(_.foldable)

  // Create parameter converters
  @transient
  private lazy val conversionHelper = new ConversionHelper(method, arguments)

  override lazy val dataType: DataType = javaClassToDataType(method.getReturnType)

  @transient
  lazy val returnInspector = ObjectInspectorFactory.getReflectionObjectInspector(
    method.getGenericReturnType(), ObjectInspectorOptions.JAVA)

  // Scratch array handed to `wrap` on every call, one slot per child.
  @transient
  private lazy val cached: Array[AnyRef] = new Array[AnyRef](children.length)

  @transient
  private lazy val inputDataTypes: Array[DataType] = children.map(_.dataType).toArray

  // TODO: Finish input output types.
  override def eval(input: InternalRow): Any = {
    val inputs = wrap(children.map(_.eval(input)), arguments, cached, inputDataTypes)
    val ret = FunctionRegistry.invoke(
      method,
      function,
      conversionHelper.convertIfNecessary(inputs : _*): _*)
    unwrap(ret, returnInspector)
  }

  override def toString: String = {
    s"$nodeName#${funcWrapper.functionClassName}(${children.mkString(",")})"
  }

  override def prettyName: String = name

  override def sql: String = s"$name(${children.map(_.sql).mkString(", ")})"
}
/**
 * Adapter from Catalyst ExpressionResult to Hive DeferredObject.
 *
 * Holds a thunk that is installed with `set` and only invoked when Hive calls `get()`;
 * the thunk's result is passed through `wrap` with the given inspector and data type.
 *
 * @param oi       object inspector passed to `wrap` for the produced value
 * @param dataType Catalyst data type passed to `wrap` for the produced value
 */
private[hive] class DeferredObjectAdapter(oi: ObjectInspector, dataType: DataType)
  extends DeferredObject with HiveInspectors {

  // Must be installed via `set` before `get()` is called; it starts out as null.
  private var func: () => Any = _

  /** Installs the thunk that the next `get()` call will evaluate. */
  def set(func: () => Any): Unit = {
    this.func = func
  }

  /** No preparation is required by this adapter. */
  override def prepare(i: Int): Unit = {}

  /** Evaluates the installed thunk and wraps its result. */
  override def get(): AnyRef = wrap(func(), oi, dataType)
}
/**
 * Catalyst expression that evaluates a Hive `GenericUDF` instance created by `funcWrapper`.
 *
 * Arguments are handed to Hive as deferred objects, so a child expression is only evaluated
 * when the function actually asks for that argument.
 *
 * @param name        name reported by `prettyName`
 * @param funcWrapper wrapper used to instantiate the Hive function and to report its class name
 * @param children    argument expressions
 */
private[hive] case class HiveGenericUDF(
    name: String, funcWrapper: HiveFunctionWrapper, children: Seq[Expression])
  extends Expression with HiveInspectors with CodegenFallback with Logging {

  override def nullable: Boolean = true

  override def deterministic: Boolean = isUDFDeterministic

  override def foldable: Boolean =
    isUDFDeterministic && returnInspector.isInstanceOf[ConstantObjectInspector]

  @transient
  lazy val function = funcWrapper.createFunction[GenericUDF]()

  @transient
  private lazy val argumentInspectors = children.map(toInspector)

  // Forcing this value initializes the Hive function with the argument inspectors.
  @transient
  private lazy val returnInspector = {
    function.initializeAndFoldConstants(argumentInspectors.toArray)
  }

  // A function without the Hive `UDFType` annotation is treated as non-deterministic.
  @transient
  private lazy val isUDFDeterministic =
    Option(function.getClass.getAnnotation(classOf[HiveUDFType])).exists(_.deterministic())

  // One reusable adapter per child; typed as `DeferredObject` because that is the element
  // type passed to `function.evaluate`.
  @transient
  private lazy val deferredObjects = argumentInspectors.zip(children).map {
    case (inspect, child) => new DeferredObjectAdapter(inspect, child.dataType)
  }.toArray[DeferredObject]

  override lazy val dataType: DataType = inspectorToDataType(returnInspector)

  override def eval(input: InternalRow): Any = {
    returnInspector // Make sure initialized.

    var i = 0
    val length = children.length
    while (i < length) {
      // Capture the index in a val so the closure does not observe later updates of `i`.
      val idx = i
      deferredObjects(i).asInstanceOf[DeferredObjectAdapter]
        .set(() => children(idx).eval(input))
      i += 1
    }
    unwrap(function.evaluate(deferredObjects), returnInspector)
  }

  override def prettyName: String = name

  override def toString: String = {
    s"$nodeName#${funcWrapper.functionClassName}(${children.mkString(",")})"
  }
}
/**
 * Converts a Hive Generic User Defined Table Generating Function (UDTF) to a
 * [[Generator]].  Note that the semantics of Generators do not allow
 * Generators to maintain state in between input rows.  Thus UDTFs that rely on partitioning
 * dependent operations like calls to `close()` before producing output will not operate the same as
 * in Hive.  However, in practice this should not affect compatibility for most sane UDTFs
 * (e.g. explode or GenericUDTFParseUrlTuple).
 *
 * Operators that require maintaining state in between input rows should instead be implemented as
 * user defined aggregations, which have clean semantics even in a partitioned execution.
 *
 * @param name        name reported by `prettyName`
 * @param funcWrapper wrapper used to instantiate the Hive function and to report its class name
 * @param children    argument expressions projected into the UDTF's input for every row
 */
private[hive] case class HiveGenericUDTF(
    name: String,
    funcWrapper: HiveFunctionWrapper,
    children: Seq[Expression])
  extends Generator with HiveInspectors with CodegenFallback {

  // The Hive function with this generator's collector installed as its output sink.
  @transient
  protected lazy val function: GenericUDTF = {
    val fun: GenericUDTF = funcWrapper.createFunction()
    fun.setCollector(collector)
    fun
  }

  @transient
  protected lazy val inputInspectors = children.map(toInspector)

  // Forcing this value initializes the Hive function with the input inspectors.
  @transient
  protected lazy val outputInspector = function.initialize(inputInspectors.toArray)

  // Scratch array handed to `wrap` on every call, one slot per child.
  @transient
  protected lazy val udtInput = new Array[AnyRef](children.length)

  @transient
  protected lazy val collector = new UDTFCollector

  override lazy val elementSchema = StructType(outputInspector.getAllStructFieldRefs.asScala.map {
    field => StructField(field.getFieldName, inspectorToDataType(field.getFieldObjectInspector),
      nullable = true)
  })

  @transient
  private lazy val inputDataTypes: Array[DataType] = children.map(_.dataType).toArray

  // Built once and reused for every row instead of being re-created inside `eval`,
  // the same way HiveUDAFFunction holds its projection.
  @transient
  private lazy val inputProjection = new InterpretedProjection(children)

  /** Feeds one input row to the UDTF and returns the rows it emitted while processing it. */
  override def eval(input: InternalRow): TraversableOnce[InternalRow] = {
    outputInspector // Make sure initialized.

    function.process(wrap(inputProjection(input), inputInspectors, udtInput, inputDataTypes))
    collector.collectRows()
  }

  /** Buffers the rows the Hive function emits until `collectRows()` drains them. */
  protected class UDTFCollector extends Collector {
    var collected = new ArrayBuffer[InternalRow]

    override def collect(input: java.lang.Object): Unit = {
      // We need to clone the input here because implementations of
      // GenericUDTF reuse the same object. Luckily they are always an array, so
      // it is easy to clone.
      collected += unwrap(input, outputInspector).asInstanceOf[InternalRow]
    }

    /** Returns everything collected so far and starts a fresh, empty buffer. */
    def collectRows(): Seq[InternalRow] = {
      val toCollect = collected
      collected = new ArrayBuffer[InternalRow]
      toCollect
    }
  }

  /** Closes the Hive function and returns any rows it emitted while closing. */
  override def terminate(): TraversableOnce[InternalRow] = {
    outputInspector // Make sure initialized.
    function.close()
    collector.collectRows()
  }

  override def toString: String = {
    s"$nodeName#${funcWrapper.functionClassName}(${children.mkString(",")})"
  }

  override def prettyName: String = name
}
/**
 * Currently we don't support partial aggregation for queries using Hive UDAF, which may hurt
 * performance a lot.
 *
 * @param name                   name reported by `prettyName` and used when rendering `sql`
 * @param funcWrapper            wrapper used to instantiate the Hive function
 * @param children               argument expressions projected into the UDAF's input
 * @param isUDAFBridgeRequired   when true the function is created as a Hive `UDAF` and wrapped
 *                               in a `GenericUDAFBridge`; otherwise it is created directly as an
 *                               `AbstractGenericUDAFResolver`
 * @param mutableAggBufferOffset offset replaced by `withNewMutableAggBufferOffset`
 * @param inputAggBufferOffset   offset replaced by `withNewInputAggBufferOffset`
 */
private[hive] case class HiveUDAFFunction(
    name: String,
    funcWrapper: HiveFunctionWrapper,
    children: Seq[Expression],
    isUDAFBridgeRequired: Boolean = false,
    mutableAggBufferOffset: Int = 0,
    inputAggBufferOffset: Int = 0)
  extends ImperativeAggregate with HiveInspectors {

  override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ImperativeAggregate =
    copy(mutableAggBufferOffset = newMutableAggBufferOffset)

  override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate =
    copy(inputAggBufferOffset = newInputAggBufferOffset)

  @transient
  private lazy val resolver =
    if (isUDAFBridgeRequired) {
      new GenericUDAFBridge(funcWrapper.createFunction[UDAF]())
    } else {
      funcWrapper.createFunction[AbstractGenericUDAFResolver]()
    }

  @transient
  private lazy val inspectors = children.map(toInspector).toArray

  // The evaluator paired with the inspector returned by initializing it in COMPLETE mode.
  @transient
  private lazy val functionAndInspector = {
    // NOTE(review): the two boolean flags are presumably `distinct` and `allColumns` in the
    // Hive version this file targets -- confirm against the Hive API in use.
    val parameterInfo = new SimpleGenericUDAFParameterInfo(inspectors, false, false)
    val f = resolver.getEvaluator(parameterInfo)
    f -> f.init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors)
  }

  @transient
  private lazy val function = functionAndInspector._1

  @transient
  private lazy val returnInspector = functionAndInspector._2

  // Assigned by `initialize`; it is null until then.
  @transient
  private[this] var buffer: GenericUDAFEvaluator.AggregationBuffer = _

  override def eval(input: InternalRow): Any = unwrap(function.evaluate(buffer), returnInspector)

  @transient
  private lazy val inputProjection = new InterpretedProjection(children)

  // Scratch array handed to `wrap` on every call, one slot per child.
  @transient
  private lazy val cached = new Array[AnyRef](children.length)

  @transient
  private lazy val inputDataTypes: Array[DataType] = children.map(_.dataType).toArray

  // Hive UDAF has its own buffer, so we don't need to occupy a slot in the aggregation
  // buffer for it.
  override def aggBufferSchema: StructType = StructType(Nil)

  override def update(_buffer: MutableRow, input: InternalRow): Unit = {
    val inputs = inputProjection(input)
    function.iterate(buffer, wrap(inputs, inspectors, cached, inputDataTypes))
  }

  override def merge(buffer1: MutableRow, buffer2: InternalRow): Unit = {
    throw new UnsupportedOperationException(
      "Hive UDAF doesn't support partial aggregate")
  }

  override def initialize(_buffer: MutableRow): Unit = {
    buffer = function.getNewAggregationBuffer
  }

  override val aggBufferAttributes: Seq[AttributeReference] = Nil

  // Note: although this simply copies aggBufferAttributes, this common code can not be placed
  // in the superclass because that will lead to initialization ordering issues.
  override val inputAggBufferAttributes: Seq[AttributeReference] = Nil

  // We rely on Hive to check the input data types, so use `AnyDataType` here to bypass our
  // catalyst type checking framework.
  override def inputTypes: Seq[AbstractDataType] = children.map(_ => AnyDataType)

  override def nullable: Boolean = true

  override def supportsPartial: Boolean = false

  override lazy val dataType: DataType = inspectorToDataType(returnInspector)

  override def prettyName: String = name

  override def sql(isDistinct: Boolean): String = {
    // NOTE(review): the non-distinct branch yields a single space, so the rendered text is
    // `name( args)`. Left unchanged here to keep the generated SQL byte-identical.
    val distinct = if (isDistinct) "DISTINCT " else " "
    s"$name($distinct${children.map(_.sql).mkString(", ")})"
  }
}