org.apache.spark.sql.hive.hiveUDFs.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.hive

import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConverters._
import scala.util.Try

import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ConstantObjectInspector}
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory
import org.apache.hadoop.hive.ql.exec._
import org.apache.hadoop.hive.ql.udf.{UDFType => HiveUDFType}
import org.apache.hadoop.hive.ql.udf.generic._
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF._
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils.ConversionHelper

import org.apache.spark.Logging
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis
import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.catalyst.util.ArrayData
import org.apache.spark.sql.hive.HiveShim._
import org.apache.spark.sql.hive.client.ClientWrapper
import org.apache.spark.sql.types._


private[hive] class HiveFunctionRegistry(
    underlying: analysis.FunctionRegistry,
    executionHive: ClientWrapper)
  extends analysis.FunctionRegistry with HiveInspectors {

  def getFunctionInfo(name: String): FunctionInfo = {
    // The Hive registry needs the current database to look up a function.
    // TODO: the current database of executionHive should be consistent with metadataHive
    executionHive.withHiveState {
      FunctionRegistry.getFunctionInfo(name)
    }
  }

  override def lookupFunction(name: String, children: Seq[Expression]): Expression = {
    Try(underlying.lookupFunction(name, children)).getOrElse {
      // We look the function up only to check that it exists; the FunctionInfo itself is not
      // captured in the Hive UDF wrapper because it is not always serializable.
      val functionInfo: FunctionInfo =
        Option(getFunctionInfo(name.toLowerCase)).getOrElse(
          throw new AnalysisException(s"undefined function $name"))

      val functionClassName = functionInfo.getFunctionClass.getName

      // When we instantiate the Hive UDF wrapper class, an exception may be thrown if the input
      // expressions don't satisfy the Hive UDF (e.g. type mismatch or wrong number of inputs).
      // Here we catch the exception and rethrow it as an AnalysisException.
      try {
        if (classOf[GenericUDFMacro].isAssignableFrom(functionInfo.getFunctionClass)) {
          HiveGenericUDF(
            new HiveFunctionWrapper(functionClassName, functionInfo.getGenericUDF), children)
        } else if (classOf[UDF].isAssignableFrom(functionInfo.getFunctionClass)) {
          HiveSimpleUDF(new HiveFunctionWrapper(functionClassName), children)
        } else if (classOf[GenericUDF].isAssignableFrom(functionInfo.getFunctionClass)) {
          HiveGenericUDF(new HiveFunctionWrapper(functionClassName), children)
        } else if (
          classOf[AbstractGenericUDAFResolver].isAssignableFrom(functionInfo.getFunctionClass)) {
          HiveUDAFFunction(new HiveFunctionWrapper(functionClassName), children)
        } else if (classOf[UDAF].isAssignableFrom(functionInfo.getFunctionClass)) {
          HiveUDAFFunction(
            new HiveFunctionWrapper(functionClassName), children, isUDAFBridgeRequired = true)
        } else if (classOf[GenericUDTF].isAssignableFrom(functionInfo.getFunctionClass)) {
          val udtf = HiveGenericUDTF(new HiveFunctionWrapper(functionClassName), children)
          udtf.elementTypes // Force it to check input data types.
          udtf
        } else {
          throw new AnalysisException(s"No handler for udf ${functionInfo.getFunctionClass}")
        }
      } catch {
        case analysisException: AnalysisException =>
          // If the exception is an AnalysisException, just throw it.
          throw analysisException
        case throwable: Throwable =>
          // If there is any other error, we throw an AnalysisException.
          val errorMessage = s"No handler for Hive udf ${functionInfo.getFunctionClass} " +
            s"because: ${throwable.getMessage}."
          throw new AnalysisException(errorMessage)
      }
    }
  }

  override def registerFunction(name: String, info: ExpressionInfo, builder: FunctionBuilder)
  : Unit = underlying.registerFunction(name, info, builder)

  /* List all of the registered function names. */
  override def listFunction(): Seq[String] = {
    (FunctionRegistry.getFunctionNames.asScala ++ underlying.listFunction()).toList.sorted
  }

  /* Get the class of the registered function by specified name. */
  override def lookupFunction(name: String): Option[ExpressionInfo] = {
    underlying.lookupFunction(name).orElse(
    Try {
      val info = getFunctionInfo(name)
      val annotation = info.getFunctionClass.getAnnotation(classOf[Description])
      if (annotation != null) {
        Some(new ExpressionInfo(
          info.getFunctionClass.getCanonicalName,
          annotation.name(),
          annotation.value(),
          annotation.extended()))
      } else {
        Some(new ExpressionInfo(
          info.getFunctionClass.getCanonicalName,
          name,
          null,
          null))
      }
    }.getOrElse(None))
  }
}
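
// Resolution walk-through (illustrative): given `registry.lookupFunction("hex", Seq(Literal(255)))`,
// the call first tries `underlying`, then falls back to Hive's FunctionRegistry. In the Hive
// versions this targets, "hex" is implemented by a class extending UDF
// (org.apache.hadoop.hive.ql.udf.UDFHex), so the UDF branch above returns
// HiveSimpleUDF(new HiveFunctionWrapper("org.apache.hadoop.hive.ql.udf.UDFHex"), children).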

private[hive] case class HiveSimpleUDF(funcWrapper: HiveFunctionWrapper, children: Seq[Expression])
  extends Expression with HiveInspectors with CodegenFallback with Logging {

  override def deterministic: Boolean = isUDFDeterministic

  override def nullable: Boolean = true

  @transient
  lazy val function = funcWrapper.createFunction[UDF]()

  @transient
  private lazy val method =
    function.getResolver.getEvalMethod(children.map(_.dataType.toTypeInfo).asJava)

  @transient
  private lazy val arguments = children.map(toInspector).toArray

  @transient
  private lazy val isUDFDeterministic = {
    val udfType = function.getClass().getAnnotation(classOf[HiveUDFType])
    udfType != null && udfType.deterministic()
  }

  override def foldable: Boolean = isUDFDeterministic && children.forall(_.foldable)

  // Create parameter converters
  @transient
  private lazy val conversionHelper = new ConversionHelper(method, arguments)

  override val dataType = javaClassToDataType(method.getReturnType)

  @transient
  lazy val returnInspector = ObjectInspectorFactory.getReflectionObjectInspector(
    method.getGenericReturnType(), ObjectInspectorOptions.JAVA)

  @transient
  private lazy val cached: Array[AnyRef] = new Array[AnyRef](children.length)

  @transient
  private lazy val inputDataTypes: Array[DataType] = children.map(_.dataType).toArray

  // TODO: Finish input output types.
  override def eval(input: InternalRow): Any = {
    val inputs = wrap(children.map(c => c.eval(input)), arguments, cached, inputDataTypes)
    val ret = FunctionRegistry.invoke(
      method,
      function,
      conversionHelper.convertIfNecessary(inputs : _*): _*)
    unwrap(ret, returnInspector)
  }

  override def toString: String = {
    s"$nodeName#${funcWrapper.functionClassName}(${children.mkString(",")})"
  }
}
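
// A minimal "simple" Hive UDF, shown purely for illustration (this class is hypothetical and not
// part of Spark or Hive). HiveSimpleUDF above selects one of its `evaluate` overloads from the
// children's type info and invokes it reflectively via FunctionRegistry.invoke.
private[hive] class ExampleStringLengthUDF extends UDF {
  // Simple Hive UDFs expose `evaluate` overloads that are matched against the argument types.
  def evaluate(s: String): Int = if (s == null) 0 else s.length
}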

// Adapter from Catalyst ExpressionResult to Hive DeferredObject
private[hive] class DeferredObjectAdapter(oi: ObjectInspector, dataType: DataType)
  extends DeferredObject with HiveInspectors {

  private var func: () => Any = _
  def set(func: () => Any): Unit = {
    this.func = func
  }
  override def prepare(i: Int): Unit = {}
  override def get(): AnyRef = wrap(func(), oi, dataType)
}
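
// Usage sketch (illustrative): HiveGenericUDF below installs one adapter per child expression and
// re-points it at a fresh thunk for every input row, so the Hive side only pays for evaluating
// and wrapping an argument when it actually calls get():
//
//   adapter.set(() => children(idx).eval(input))  // cheap: just stores the closure
//   adapter.get()                                 // evaluates and wraps to Hive format on demand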

private[hive] case class HiveGenericUDF(funcWrapper: HiveFunctionWrapper, children: Seq[Expression])
  extends Expression with HiveInspectors with CodegenFallback with Logging {

  override def nullable: Boolean = true

  override def deterministic: Boolean = isUDFDeterministic

  override def foldable: Boolean =
    isUDFDeterministic && returnInspector.isInstanceOf[ConstantObjectInspector]

  @transient
  lazy val function = funcWrapper.createFunction[GenericUDF]()

  @transient
  private lazy val argumentInspectors = children.map(toInspector)

  @transient
  private lazy val returnInspector = {
    function.initializeAndFoldConstants(argumentInspectors.toArray)
  }

  @transient
  private lazy val isUDFDeterministic = {
    val udfType = function.getClass.getAnnotation(classOf[HiveUDFType])
    udfType != null && udfType.deterministic()
  }

  @transient
  private lazy val deferredObjects = argumentInspectors.zip(children).map { case (inspect, child) =>
    new DeferredObjectAdapter(inspect, child.dataType)
  }.toArray[DeferredObject]

  override val dataType: DataType = inspectorToDataType(returnInspector)

  override def eval(input: InternalRow): Any = {
    returnInspector // Make sure initialized.

    var i = 0
    while (i < children.length) {
      val idx = i
      deferredObjects(i).asInstanceOf[DeferredObjectAdapter].set(
        () => {
          children(idx).eval(input)
        })
      i += 1
    }
    unwrap(function.evaluate(deferredObjects), returnInspector)
  }

  override def toString: String = {
    s"$nodeName#${funcWrapper.functionClassName}(${children.mkString(",")})"
  }
}
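
// A minimal GenericUDF, shown purely for illustration (this class is hypothetical and not part of
// Spark or Hive). HiveGenericUDF above calls initialize() once to fix the return inspector, then
// feeds evaluate() with DeferredObjectAdapters, so an argument is only converted when get() is
// invoked.
private[hive] class ExampleNullSafeLengthUDF extends GenericUDF {
  import org.apache.hadoop.hive.serde2.objectinspector.primitive.{
    PrimitiveObjectInspectorFactory, StringObjectInspector}

  private var stringOI: StringObjectInspector = _

  override def initialize(args: Array[ObjectInspector]): ObjectInspector = {
    stringOI = args(0).asInstanceOf[StringObjectInspector]
    PrimitiveObjectInspectorFactory.javaIntObjectInspector
  }

  override def evaluate(args: Array[DeferredObject]): AnyRef = {
    val value = args(0).get() // forces the DeferredObjectAdapter's thunk
    if (value == null) Integer.valueOf(0)
    else Integer.valueOf(stringOI.getPrimitiveJavaObject(value).length)
  }

  override def getDisplayString(children: Array[String]): String =
    s"example_null_safe_length(${children.mkString(", ")})"
}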

/**
 * Resolves [[UnresolvedWindowFunction]] to [[HiveWindowFunction]].
 */
private[spark] object ResolveHiveWindowFunction extends Rule[LogicalPlan] {
  private def shouldResolveFunction(
      unresolvedWindowFunction: UnresolvedWindowFunction,
      windowSpec: WindowSpecDefinition): Boolean = {
    unresolvedWindowFunction.childrenResolved && windowSpec.childrenResolved
  }

  def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
    case p: LogicalPlan if !p.childrenResolved => p

    // We resolve WindowExpressions here. By the time we get here, we have already
    // replaced the WindowSpecReferences.
    case p: LogicalPlan =>
      p transformExpressions {
        // We will not start to resolve the function unless all arguments are resolved
        // and all expressions in the window spec are fixed.
        case WindowExpression(
          u @ UnresolvedWindowFunction(name, children),
          windowSpec: WindowSpecDefinition) if shouldResolveFunction(u, windowSpec) =>
          // First, let's find the window function info.
          val windowFunctionInfo: WindowFunctionInfo =
            Option(FunctionRegistry.getWindowFunctionInfo(name.toLowerCase)).getOrElse(
              throw new AnalysisException(s"Couldn't find window function $name"))

          // Get the class of this function. Note that Hive 0.12 has no
          // windowFunctionInfo.getFunctionClass, so this call requires a newer Hive version.
          val functionClass = windowFunctionInfo.getFunctionClass()
          val newChildren =
            // RANK(), DENSE_RANK(), CUME_DIST(), and PERCENT_RANK() do not take explicit
            // input parameters; they require implicit parameters, which are the expressions
            // in the ORDER BY clause.
            if (classOf[GenericUDAFRank].isAssignableFrom(functionClass)) {
              if (children.nonEmpty) {
                throw new AnalysisException(s"$name does not take input parameters.")
              }
              windowSpec.orderSpec.map(_.child)
            } else {
              children
            }

          // If the function class is a UDAF, we need to wrap it with GenericUDAFBridge.
          val isUDAFBridgeRequired = classOf[UDAF].isAssignableFrom(functionClass)

          // Create the HiveWindowFunction. For the meaning of isPivotResult, see the doc of
          // HiveWindowFunction.
          val windowFunction =
            HiveWindowFunction(
              new HiveFunctionWrapper(functionClass.getName),
              windowFunctionInfo.isPivotResult,
              isUDAFBridgeRequired,
              newChildren)

          // Second, check whether the specified window function can accept a window definition.
          windowSpec.frameSpecification match {
            case frame: SpecifiedWindowFrame if !windowFunctionInfo.isSupportsWindow =>
              // This Hive window function does not support a user-specified window frame.
              throw new AnalysisException(
                s"Window function $name does not take a frame specification.")
            case frame: SpecifiedWindowFrame if windowFunctionInfo.isSupportsWindow &&
                                                windowFunctionInfo.isPivotResult =>
              // These two should not be true at the same time when a window frame is defined.
              // If so, throw an exception.
              throw new AnalysisException(s"Could not handle Hive window function $name because " +
                s"it supports both a user specified window frame and pivot result.")
            case _ => // OK
          }
          // Resolve those UnspecifiedWindowFrame because the physical Window operator still needs
          // a window frame specification to work.
          val newWindowSpec = windowSpec.frameSpecification match {
            case UnspecifiedFrame =>
              val newWindowFrame =
                SpecifiedWindowFrame.defaultWindowFrame(
                  windowSpec.orderSpec.nonEmpty,
                  windowFunctionInfo.isSupportsWindow)
              WindowSpecDefinition(windowSpec.partitionSpec, windowSpec.orderSpec, newWindowFrame)
            case _ => windowSpec
          }

          // Finally, we create a WindowExpression with the resolved window function and
          // specified window spec.
          WindowExpression(windowFunction, newWindowSpec)
      }
  }
}
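
// Walk-through (illustrative): for a query such as
//
//   SELECT percent_rank() OVER (PARTITION BY a ORDER BY b) FROM t
//
// the rank branch above applies (percent_rank's resolver extends GenericUDAFRank in the Hive
// versions this targets), so the empty argument list is replaced by the ORDER BY expression `b`,
// and the UnspecifiedFrame is rewritten to the default frame before the resolved
// WindowExpression(HiveWindowFunction(...), newWindowSpec) is produced.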

/**
 * A [[WindowFunction]] implementation wrapping Hive's window function.
 * @param funcWrapper The wrapper for the Hive window function.
 * @param pivotResult If true, the Hive function returns a list of values representing the values
 *                    of the added columns; otherwise, a single value is returned for the
 *                    current row.
 * @param isUDAFBridgeRequired If true, the function returned by funcWrapper's createFunction is a
 *                             UDAF, and we need to use GenericUDAFBridge to wrap it as a
 *                             GenericUDAFResolver2.
 * @param children Input parameters.
 */
private[hive] case class HiveWindowFunction(
    funcWrapper: HiveFunctionWrapper,
    pivotResult: Boolean,
    isUDAFBridgeRequired: Boolean,
    children: Seq[Expression]) extends WindowFunction
  with HiveInspectors with Unevaluable {

  // Hive window functions are based on GenericUDAFResolver2.
  type UDFType = GenericUDAFResolver2

  @transient
  protected lazy val resolver: GenericUDAFResolver2 =
    if (isUDAFBridgeRequired) {
      new GenericUDAFBridge(funcWrapper.createFunction[UDAF]())
    } else {
      funcWrapper.createFunction[GenericUDAFResolver2]()
    }

  @transient
  protected lazy val inputInspectors = children.map(toInspector).toArray

  // The GenericUDAFEvaluator used to evaluate the window function.
  @transient
  protected lazy val evaluator: GenericUDAFEvaluator = {
    val parameterInfo = new SimpleGenericUDAFParameterInfo(inputInspectors, false, false)
    resolver.getEvaluator(parameterInfo)
  }

  // The object inspector of values returned from the Hive window function.
  @transient
  protected lazy val returnInspector = {
    evaluator.init(GenericUDAFEvaluator.Mode.COMPLETE, inputInspectors)
  }

  override val dataType: DataType =
    if (!pivotResult) {
      inspectorToDataType(returnInspector)
    } else {
      // If pivotResult is true, we take the array's element type as the data type of this
      // function.
      inspectorToDataType(returnInspector) match {
        case ArrayType(dt, _) => dt
        case _ =>
          sys.error(
            s"Cannot resolve the data type of window function ${funcWrapper.functionClassName}")
      }
    }

  override def nullable: Boolean = true

  @transient
  lazy val inputProjection = new InterpretedProjection(children)

  @transient
  private var hiveEvaluatorBuffer: AggregationBuffer = _
  // Output buffer.
  private var outputBuffer: Any = _

  @transient
  private lazy val inputDataTypes: Array[DataType] = children.map(_.dataType).toArray

  override def init(): Unit = {
    evaluator.init(GenericUDAFEvaluator.Mode.COMPLETE, inputInspectors)
  }

  // Reset the hiveEvaluatorBuffer.
  override def reset(): Unit = {
    // We create a new aggregation buffer to workaround the bug in GenericUDAFRowNumber.
    // Basically, GenericUDAFRowNumberEvaluator.reset calls RowNumberBuffer.init.
    // However, RowNumberBuffer.init does not really reset this buffer.
    hiveEvaluatorBuffer = evaluator.getNewAggregationBuffer
    evaluator.reset(hiveEvaluatorBuffer)
  }

  override def prepareInputParameters(input: InternalRow): AnyRef = {
    wrap(
      inputProjection(input),
      inputInspectors,
      new Array[AnyRef](children.length),
      inputDataTypes)
  }

  // Add input parameters for a single row.
  override def update(input: AnyRef): Unit = {
    evaluator.iterate(hiveEvaluatorBuffer, input.asInstanceOf[Array[AnyRef]])
  }

  override def batchUpdate(inputs: Array[AnyRef]): Unit = {
    var i = 0
    while (i < inputs.length) {
      evaluator.iterate(hiveEvaluatorBuffer, inputs(i).asInstanceOf[Array[AnyRef]])
      i += 1
    }
  }

  override def evaluate(): Unit = {
    outputBuffer = unwrap(evaluator.evaluate(hiveEvaluatorBuffer), returnInspector)
  }

  override def get(index: Int): Any = {
    if (!pivotResult) {
      // If pivotResult is false, we get a single value for all rows in the frame.
      outputBuffer
    } else {
      // If pivotResult is true, we get an ArrayData whose size equals the size of the window
      // frame, and we return the element at the given index of the output buffer.
      outputBuffer.asInstanceOf[ArrayData].get(index, dataType)
    }
  }

  override def toString: String = {
    s"$nodeName#${funcWrapper.functionClassName}(${children.mkString(",")})"
  }

  override def newInstance(): WindowFunction =
    new HiveWindowFunction(funcWrapper, pivotResult, isUDAFBridgeRequired, children)
}
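
// Lifecycle sketch (how the physical Window operator is expected to drive this wrapper, based on
// the overrides above; the actual driver lives in the Window physical operator):
//
//   f.init()                                                  // initialize the Hive evaluator
//   f.reset()                                                 // fresh AggregationBuffer per frame
//   frameRows.foreach(r => f.update(f.prepareInputParameters(r)))
//   f.evaluate()                                              // fills outputBuffer
//   val value = f.get(indexInFrame)                           // single value, or pivoted element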

/**
 * Converts a Hive Generic User Defined Table Generating Function (UDTF) to a
 * [[Generator]].  Note that the semantics of Generators do not allow
 * Generators to maintain state in between input rows.  Thus UDTFs that rely on partition-
 * dependent operations, like calls to `close()` before producing output, will not operate
 * the same way as in Hive.  However, in practice this should not affect compatibility for
 * most sane UDTFs (e.g. explode or GenericUDTFParseUrlTuple).
 *
 * Operators that require maintaining state in between input rows should instead be implemented as
 * user defined aggregations, which have clean semantics even in a partitioned execution.
 */
private[hive] case class HiveGenericUDTF(
    funcWrapper: HiveFunctionWrapper,
    children: Seq[Expression])
  extends Generator with HiveInspectors with CodegenFallback {

  @transient
  protected lazy val function: GenericUDTF = {
    val fun: GenericUDTF = funcWrapper.createFunction()
    fun.setCollector(collector)
    fun
  }

  @transient
  protected lazy val inputInspectors = children.map(toInspector)

  @transient
  protected lazy val outputInspector = function.initialize(inputInspectors.toArray)

  @transient
  protected lazy val udtInput = new Array[AnyRef](children.length)

  @transient
  protected lazy val collector = new UDTFCollector

  override lazy val elementTypes = outputInspector.getAllStructFieldRefs.asScala.map {
    field => (inspectorToDataType(field.getFieldObjectInspector), true, field.getFieldName)
  }

  @transient
  private lazy val inputDataTypes: Array[DataType] = children.map(_.dataType).toArray

  override def eval(input: InternalRow): TraversableOnce[InternalRow] = {
    outputInspector // Make sure initialized.

    val inputProjection = new InterpretedProjection(children)

    function.process(wrap(inputProjection(input), inputInspectors, udtInput, inputDataTypes))
    collector.collectRows()
  }

  protected class UDTFCollector extends Collector {
    var collected = new ArrayBuffer[InternalRow]

    override def collect(input: java.lang.Object): Unit = {
      // We need to clone the input here because implementations of
      // GenericUDTF reuse the same object. Luckily they are always an array, so
      // it is easy to clone.
      collected += unwrap(input, outputInspector).asInstanceOf[InternalRow]
    }

    def collectRows(): Seq[InternalRow] = {
      val toCollect = collected
      collected = new ArrayBuffer[InternalRow]
      toCollect
    }
  }

  override def terminate(): TraversableOnce[InternalRow] = {
    outputInspector // Make sure initialized.
    function.close()
    collector.collectRows()
  }

  override def toString: String = {
    s"$nodeName#${funcWrapper.functionClassName}(${children.mkString(",")})"
  }
}
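
// A minimal GenericUDTF, shown purely for illustration (this class is hypothetical and not part
// of Spark or Hive). HiveGenericUDTF above calls initialize() to learn the output schema, routes
// forward()-ed rows through the UDTFCollector installed via setCollector, and collects any rows
// emitted from close() in terminate().
private[hive] class ExampleDuplicateUDTF extends GenericUDTF {
  import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector

  override def initialize(argOIs: Array[ObjectInspector]): StructObjectInspector = {
    // One output column named "col", with the same inspector as the single input argument.
    ObjectInspectorFactory.getStandardStructObjectInspector(
      Seq("col").asJava, Seq(argOIs(0)).asJava)
  }

  override def process(args: Array[AnyRef]): Unit = {
    // Emit every input value twice; each forward() call reaches UDTFCollector.collect above.
    forward(Array(args(0)))
    forward(Array(args(0)))
  }

  override def close(): Unit = {} // nothing buffered, so terminate() collects no extra rows
}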

/**
 * Currently we don't support partial aggregation for queries using Hive UDAFs, which may hurt
 * performance significantly.
 */
private[hive] case class HiveUDAFFunction(
    funcWrapper: HiveFunctionWrapper,
    children: Seq[Expression],
    isUDAFBridgeRequired: Boolean = false,
    mutableAggBufferOffset: Int = 0,
    inputAggBufferOffset: Int = 0)
  extends ImperativeAggregate with HiveInspectors {

  override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ImperativeAggregate =
    copy(mutableAggBufferOffset = newMutableAggBufferOffset)

  override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate =
    copy(inputAggBufferOffset = newInputAggBufferOffset)

  @transient
  private lazy val resolver =
    if (isUDAFBridgeRequired) {
      new GenericUDAFBridge(funcWrapper.createFunction[UDAF]())
    } else {
      funcWrapper.createFunction[AbstractGenericUDAFResolver]()
    }

  @transient
  private lazy val inspectors = children.map(toInspector).toArray

  @transient
  private lazy val functionAndInspector = {
    val parameterInfo = new SimpleGenericUDAFParameterInfo(inspectors, false, false)
    val f = resolver.getEvaluator(parameterInfo)
    f -> f.init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors)
  }

  @transient
  private lazy val function = functionAndInspector._1

  @transient
  private lazy val returnInspector = functionAndInspector._2

  @transient
  private[this] var buffer: GenericUDAFEvaluator.AggregationBuffer = _

  override def eval(input: InternalRow): Any = unwrap(function.evaluate(buffer), returnInspector)

  @transient
  private lazy val inputProjection = new InterpretedProjection(children)

  @transient
  private lazy val cached = new Array[AnyRef](children.length)

  @transient
  private lazy val inputDataTypes: Array[DataType] = children.map(_.dataType).toArray

  // Hive UDAF has its own buffer, so we don't need to occupy a slot in the aggregation
  // buffer for it.
  override def aggBufferSchema: StructType = StructType(Nil)

  override def update(_buffer: MutableRow, input: InternalRow): Unit = {
    val inputs = inputProjection(input)
    function.iterate(buffer, wrap(inputs, inspectors, cached, inputDataTypes))
  }

  override def merge(buffer1: MutableRow, buffer2: InternalRow): Unit = {
    throw new UnsupportedOperationException(
      "Hive UDAF doesn't support partial aggregate")
  }

  override def initialize(_buffer: MutableRow): Unit = {
    buffer = function.getNewAggregationBuffer
  }

  override val aggBufferAttributes: Seq[AttributeReference] = Nil

  // Note: although this simply copies aggBufferAttributes, this common code cannot be placed
  // in the superclass because that would lead to initialization ordering issues.
  override val inputAggBufferAttributes: Seq[AttributeReference] = Nil

  // We rely on Hive to check the input data types, so use `AnyDataType` here to bypass our
  // catalyst type checking framework.
  override def inputTypes: Seq[AbstractDataType] = children.map(_ => AnyDataType)

  override def nullable: Boolean = true

  override def supportsPartial: Boolean = false

  override val dataType: DataType = inspectorToDataType(returnInspector)
}
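
// Lifecycle sketch: because supportsPartial is false, the aggregation operator runs this function
// in a single COMPLETE pass and merge() is never called:
//
//   f.initialize(mutableRow)                        // allocates the Hive AggregationBuffer
//   inputRows.foreach(r => f.update(mutableRow, r)) // wraps inputs and calls function.iterate
//   f.eval(anyRow)                                  // unwraps function.evaluate(buffer)
//
// The Hive buffer lives inside this object rather than in Spark's aggregation buffer, which is
// why aggBufferSchema is empty and the buffer-attribute lists are Nil.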




