/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.hive

import java.io.File
import java.net.URL
import java.util.Locale

import scala.collection.mutable.HashMap
import scala.jdk.CollectionConverters._
import scala.util.Try

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.common.FileUtils
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.conf.HiveConf.ConfVars
import org.apache.hadoop.hive.ql.session.SessionState
import org.apache.hadoop.util.VersionInfo
import org.apache.hive.common.util.HiveVersionInfo

import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.{Logging, MDC}
import org.apache.spark.internal.LogKeys
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.execution.command.DDLUtils
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.hive.client._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf._
import org.apache.spark.sql.internal.StaticSQLConf.WAREHOUSE_PATH
import org.apache.spark.sql.types._
import org.apache.spark.util.ArrayImplicits._
import org.apache.spark.util.Utils


private[spark] object HiveUtils extends Logging {
  private val PATTERN_FOR_KEY_EQ_VAL = "(.+)=(.+)".r

  /** The version of hive used internally by Spark SQL. */
  val builtinHiveVersion: String = HiveVersionInfo.getVersion

  val BUILTIN_HIVE_VERSION = buildStaticConf("spark.sql.hive.version")
    .doc("The compiled, a.k.a. builtin, Hive version that this Spark distribution is bundled" +
        " with. Note that this is a read-only conf, only used to report the built-in Hive" +
        " version. If you want a different metastore client for Spark to call, please refer to" +
        " spark.sql.hive.metastore.version.")
    .version("1.1.1")
    .stringConf
    .checkValue(_ == builtinHiveVersion,
      "The builtin Hive version is read-only, please use spark.sql.hive.metastore.version")
    .createWithDefault(builtinHiveVersion)

  private def isCompatibleHiveVersion(hiveVersionStr: String): Boolean = {
    Try { IsolatedClientLoader.hiveVersion(hiveVersionStr) }.isSuccess
  }

  val HIVE_METASTORE_VERSION = buildStaticConf("spark.sql.hive.metastore.version")
    .doc("Version of the Hive metastore. Available options are " +
        "2.0.0 through 2.3.10 and " +
        "3.0.0 through 3.1.3.")
    .version("1.4.0")
    .stringConf
    .checkValue(isCompatibleHiveVersion, "Unsupported Hive Metastore version")
    .createWithDefault(builtinHiveVersion)

  val HIVE_METASTORE_JARS = buildStaticConf("spark.sql.hive.metastore.jars")
    .doc(s"""
      | Location of the jars that should be used to instantiate the HiveMetastoreClient.
      | This property can be one of four options:
      | 1. "builtin"
      |   Use Hive ${builtinHiveVersion}, which is bundled with the Spark assembly when
      |   -Phive is enabled. When this option is chosen,
      |   spark.sql.hive.metastore.version must be either
      |   ${builtinHiveVersion} or not defined.
      | 2. "maven"
      |   Use Hive jars of the specified version downloaded from Maven repositories.
      | 3. "path"
      |   Use Hive jars configured by `spark.sql.hive.metastore.jars.path`
      |   in comma separated format. Supports both local and remote paths. The provided jars
      |   should be the same version as `${HIVE_METASTORE_VERSION.key}`.
      | 4. A classpath in the standard format for both Hive and Hadoop. The provided jars
      |   should be the same version as `${HIVE_METASTORE_VERSION.key}`.
      """.stripMargin)
    .version("1.4.0")
    .stringConf
    .createWithDefault("builtin")
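
  // Illustrative spark-defaults.conf entries for connecting to a Hive 3.1.3 metastore with client
  // jars resolved from Maven (the version shown is an example, not a requirement):
  //   spark.sql.hive.metastore.version  3.1.3
  //   spark.sql.hive.metastore.jars     maven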

  val HIVE_METASTORE_JARS_PATH = buildStaticConf("spark.sql.hive.metastore.jars.path")
    .doc(s"""
      | Comma-separated paths of the jars used to instantiate the HiveMetastoreClient.
      | This configuration is useful only when `${HIVE_METASTORE_JARS.key}` is set as `path`.
      | The paths can be in any of the following formats:
      | 1. file://path/to/jar/foo.jar
      | 2. hdfs://nameservice/path/to/jar/foo.jar
      | 3. /path/to/jar/ (a path without a URI scheme follows the scheme of conf `fs.defaultFS`)
      | 4. [http/https/ftp]://path/to/jar/foo.jar
      | Note that 1, 2, and 3 support wildcards. For example:
      | 1. file://path/to/jar/*,file://path2/to/jar/*/*.jar
      | 2. hdfs://nameservice/path/to/jar/*,hdfs://nameservice2/path/to/jar/*/*.jar
      """.stripMargin)
    .version("3.1.0")
    .stringConf
    .toSequence
    .createWithDefault(Nil)
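
  // Illustrative use of the "path" option together with this conf; the locations below are
  // placeholders and may mix local and remote paths, with wildcards where supported:
  //   spark.sql.hive.metastore.jars       path
  //   spark.sql.hive.metastore.jars.path  file:///opt/hive/lib/*.jar,hdfs://nn/hive/*.jar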

  val CONVERT_METASTORE_PARQUET = buildConf("spark.sql.hive.convertMetastoreParquet")
    .doc("When set to true, the built-in Parquet reader and writer are used to process " +
      "parquet tables created by using the HiveQL syntax, instead of Hive serde.")
    .version("1.1.1")
    .booleanConf
    .createWithDefault(true)

  val CONVERT_METASTORE_PARQUET_WITH_SCHEMA_MERGING =
    buildConf("spark.sql.hive.convertMetastoreParquet.mergeSchema")
      .doc("When true, also tries to merge possibly different but compatible Parquet schemas in " +
        "different Parquet data files. This configuration is only effective " +
        "when \"spark.sql.hive.convertMetastoreParquet\" is true.")
      .version("1.3.1")
      .booleanConf
      .createWithDefault(false)

  val CONVERT_METASTORE_ORC = buildConf("spark.sql.hive.convertMetastoreOrc")
    .doc("When set to true, the built-in ORC reader and writer are used to process " +
      "ORC tables created by using the HiveQL syntax, instead of Hive serde.")
    .version("2.0.0")
    .booleanConf
    .createWithDefault(true)
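
  // The conversion flags above are session confs, so the conversion can also be disabled at
  // runtime to fall back to the Hive SerDe, for example:
  //   spark.conf.set("spark.sql.hive.convertMetastoreParquet", "false")
  //   spark.conf.set("spark.sql.hive.convertMetastoreOrc", "false")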

  val CONVERT_INSERTING_PARTITIONED_TABLE =
    buildConf("spark.sql.hive.convertInsertingPartitionedTable")
      .doc("When set to true, and `spark.sql.hive.convertMetastoreParquet` or " +
        "`spark.sql.hive.convertMetastoreOrc` is true, the built-in ORC/Parquet writer is used " +
        "to process inserting into partitioned ORC/Parquet tables created by using the HiveQL " +
        "syntax.")
      .version("3.0.0")
      .booleanConf
      .createWithDefault(true)

  val CONVERT_INSERTING_UNPARTITIONED_TABLE =
    buildConf("spark.sql.hive.convertInsertingUnpartitionedTable")
      .doc("When set to true, and `spark.sql.hive.convertMetastoreParquet` or " +
        "`spark.sql.hive.convertMetastoreOrc` is true, the built-in ORC/Parquet writer is used " +
        "to process inserting into unpartitioned ORC/Parquet tables created by using the HiveQL " +
        "syntax.")
      .version("4.0.0")
      .booleanConf
      .createWithDefault(true)

  val CONVERT_METASTORE_CTAS = buildConf("spark.sql.hive.convertMetastoreCtas")
    .doc("When set to true, Spark will try to use the built-in data source writer " +
      "instead of Hive serde in CTAS. This flag is effective only if " +
      "`spark.sql.hive.convertMetastoreParquet` or `spark.sql.hive.convertMetastoreOrc` is " +
      "enabled respectively for Parquet and ORC formats.")
    .version("3.0.0")
    .booleanConf
    .createWithDefault(true)

  val CONVERT_METASTORE_INSERT_DIR = buildConf("spark.sql.hive.convertMetastoreInsertDir")
    .doc("When set to true, Spark will try to use the built-in data source writer " +
      "instead of Hive serde in INSERT OVERWRITE DIRECTORY. This flag is effective only if " +
      "`spark.sql.hive.convertMetastoreParquet` or `spark.sql.hive.convertMetastoreOrc` is " +
      "enabled respectively for Parquet and ORC formats.")
    .version("3.3.0")
    .booleanConf
    .createWithDefault(true)

  val HIVE_METASTORE_SHARED_PREFIXES = buildStaticConf("spark.sql.hive.metastore.sharedPrefixes")
    .doc("A comma separated list of class prefixes that should be loaded using the classloader " +
      "that is shared between Spark SQL and a specific version of Hive. An example of classes " +
      "that should be shared is JDBC drivers that are needed to talk to the metastore. Other " +
      "classes that need to be shared are those that interact with classes that are already " +
      "shared. For example, custom appenders that are used by log4j.")
    .version("1.4.0")
    .stringConf
    .toSequence
    .createWithDefault(jdbcPrefixes)
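
  // Illustrative example: a metastore backed by MariaDB, whose JDBC driver must be visible to
  // both Spark and the isolated Hive client classloader, could extend the default list like this
  // (the driver prefix is an example):
  //   spark.sql.hive.metastore.sharedPrefixes  com.mysql.jdbc,org.postgresql,org.mariadb.jdbc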

  private def jdbcPrefixes = Seq(
    "com.mysql.jdbc", "org.postgresql", "com.microsoft.sqlserver", "oracle.jdbc")

  val HIVE_METASTORE_BARRIER_PREFIXES = buildStaticConf("spark.sql.hive.metastore.barrierPrefixes")
    .doc("A comma separated list of class prefixes that should explicitly be reloaded for each " +
      "version of Hive that Spark SQL is communicating with. For example, Hive UDFs that are " +
      "declared in a prefix that typically would be shared (i.e. org.apache.spark.*).")
    .version("1.4.0")
    .stringConf
    .toSequence
    .createWithDefault(Nil)

  val HIVE_THRIFT_SERVER_ASYNC = buildConf("spark.sql.hive.thriftServer.async")
    .doc("When set to true, Hive Thrift server executes SQL queries in an asynchronous way.")
    .version("1.5.0")
    .booleanConf
    .createWithDefault(true)

  val USE_DELEGATE_FOR_SYMLINK_TEXT_INPUT_FORMAT =
    buildConf("spark.sql.hive.useDelegateForSymlinkTextInputFormat")
      .internal()
      .doc("When true, SymlinkTextInputFormat is replaced with a similar delegate class during " +
        "table scan in order to fix the issue of empty splits.")
      .version("3.4.0")
      .booleanConf
      .createWithDefault(true)

  /**
   * The version of the hive client that will be used to communicate with the metastore.  Note that
   * this does not necessarily need to be the same version of Hive that is used internally by
   * Spark SQL for execution.
   */
  private def hiveMetastoreVersion(conf: SQLConf): String = {
    conf.getConf(HIVE_METASTORE_VERSION)
  }

  /**
   * The location of the jars that should be used to instantiate the HiveMetastoreClient.  This
   * property can be one of four options:
   *  - a classpath in the standard format for both hive and hadoop.
   *  - path - attempt to discover the jars with paths configured by `HIVE_METASTORE_JARS_PATH`.
   *  - builtin - attempt to discover the jars that were used to load Spark SQL and use those. This
   *              option is only valid when using the execution version of Hive.
   *  - maven - download the correct version of hive on demand from maven.
   */
  private def hiveMetastoreJars(conf: SQLConf): String = {
    conf.getConf(HIVE_METASTORE_JARS)
  }

  /**
   * Hive jar paths, only used when `HIVE_METASTORE_JARS` is `path`.
   */
  private def hiveMetastoreJarsPath(conf: SQLConf): Seq[String] = {
    conf.getConf(HIVE_METASTORE_JARS_PATH)
  }

  /**
   * A comma separated list of class prefixes that should be loaded using the classloader that
   * is shared between Spark SQL and a specific version of Hive. An example of classes that should
   * be shared is JDBC drivers that are needed to talk to the metastore. Other classes that need
   * to be shared are those that interact with classes that are already shared.  For example,
   * custom appenders that are used by log4j.
   */
  private def hiveMetastoreSharedPrefixes(conf: SQLConf): Seq[String] = {
    conf.getConf(HIVE_METASTORE_SHARED_PREFIXES).filterNot(_ == "")
  }

  /**
   * A comma separated list of class prefixes that should explicitly be reloaded for each version
   * of Hive that Spark SQL is communicating with.  For example, Hive UDFs that are declared in a
   * prefix that typically would be shared (i.e. org.apache.spark.*).
   */
  private def hiveMetastoreBarrierPrefixes(conf: SQLConf): Seq[String] = {
    conf.getConf(HIVE_METASTORE_BARRIER_PREFIXES).filterNot(_ == "")
  }

  /**
   * Check current Thread's SessionState type
   * @return true when SessionState.get returns an instance of CliSessionState,
   *         false when it gets non-CliSessionState instance or null
   */
  def isCliSessionState(): Boolean = {
    val state = SessionState.get
    var temp: Class[_] = if (state != null) state.getClass else null
    var found = false
    while (temp != null && !found) {
      found = temp.getName == "org.apache.hadoop.hive.cli.CliSessionState"
      temp = temp.getSuperclass
    }
    found
  }
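
  // For example, this returns true when Spark SQL is driven by SparkSQLCLIDriver (bin/spark-sql),
  // which installs a CliSessionState on the current thread.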

  /**
   * Create a [[HiveClient]] used for execution.
   *
   * Currently this must always be the built-in Hive version that is packaged
   * with Spark SQL. This copy of the client is used for execution-related tasks like
   * registering temporary functions or ensuring that the ThreadLocal SessionState is
   * correctly populated.  This copy of Hive is *not* used for storing persistent metadata,
   * and only points to a dummy metastore in a temporary directory.
   */
  protected[hive] def newClientForExecution(
      conf: SparkConf,
      hadoopConf: Configuration): HiveClientImpl = {
    logInfo(log"Initializing execution hive, version " +
      log"${MDC(LogKeys.HIVE_METASTORE_VERSION, builtinHiveVersion)}")
    val loader = new IsolatedClientLoader(
      version = IsolatedClientLoader.hiveVersion(builtinHiveVersion),
      sparkConf = conf,
      execJars = Seq.empty,
      hadoopConf = hadoopConf,
      config = newTemporaryConfiguration(useInMemoryDerby = true),
      isolationOn = false,
      baseClassLoader = Utils.getContextOrSparkClassLoader)
    loader.createClient().asInstanceOf[HiveClientImpl]
  }

  /**
   * Create a [[HiveClient]] used to retrieve metadata from the Hive MetaStore.
   *
   * The version of the Hive client that is used here must match the metastore that is configured
   * in the hive-site.xml file.
   */
  protected[hive] def newClientForMetadata(
      conf: SparkConf,
      hadoopConf: Configuration,
      configurations: Map[String, String] = Map.empty): HiveClient = {
    val sqlConf = new SQLConf
    sqlConf.setConf(SQLContext.getSQLProperties(conf))
    val hiveMetastoreVersion = HiveUtils.hiveMetastoreVersion(sqlConf)
    val hiveMetastoreJars = HiveUtils.hiveMetastoreJars(sqlConf)
    val hiveMetastoreSharedPrefixes = HiveUtils.hiveMetastoreSharedPrefixes(sqlConf)
    val hiveMetastoreBarrierPrefixes = HiveUtils.hiveMetastoreBarrierPrefixes(sqlConf)
    val metaVersion = IsolatedClientLoader.hiveVersion(hiveMetastoreVersion)

    def addLocalHiveJars(file: File): Seq[URL] = {
      if (file.getName == "*") {
        val files = file.getParentFile.listFiles()
        if (files == null) {
          logWarning(log"Hive jar path '${MDC(LogKeys.PATH, file.getPath)}' does not exist.")
          Nil
        } else {
          files.filter(_.getName.toLowerCase(Locale.ROOT).endsWith(".jar")).map(_.toURI.toURL)
            .toImmutableArraySeq
        }
      } else {
        file.toURI.toURL :: Nil
      }
    }

    def logInitWithPath(jars: Seq[URL]): Unit = {
      logInfo(log"Initializing HiveMetastoreConnection version " +
        log"${MDC(LogKeys.HIVE_METASTORE_VERSION, hiveMetastoreVersion)} using paths: " +
        log"${MDC(LogKeys.PATH, jars.mkString(", "))}")
    }

    val isolatedLoader = if (hiveMetastoreJars == "builtin") {
      if (builtinHiveVersion != hiveMetastoreVersion) {
        throw new IllegalArgumentException(
          "Builtin jars can only be used when hive execution version == hive metastore version. " +
            s"Execution: $builtinHiveVersion != Metastore: $hiveMetastoreVersion. " +
            s"Specify a valid path to the correct hive jars using ${HIVE_METASTORE_JARS.key} " +
            s"or change ${HIVE_METASTORE_VERSION.key} to $builtinHiveVersion.")
      }

      logInfo(
        log"Initializing HiveMetastoreConnection version " +
          log"${MDC(LogKeys.HIVE_METASTORE_VERSION, hiveMetastoreVersion)} using Spark classes.")
      new IsolatedClientLoader(
        version = metaVersion,
        sparkConf = conf,
        hadoopConf = hadoopConf,
        config = configurations,
        isolationOn = false,
        sessionStateIsolationOverride = Some(!isCliSessionState()),
        barrierPrefixes = hiveMetastoreBarrierPrefixes,
        sharedPrefixes = hiveMetastoreSharedPrefixes)
    } else if (hiveMetastoreJars == "maven") {
      // TODO: Support for loading the jars from an already downloaded location.
      logInfo(
        log"Initializing HiveMetastoreConnection version " +
          log"${MDC(LogKeys.HIVE_METASTORE_VERSION, hiveMetastoreVersion)} using maven.")
      IsolatedClientLoader.forVersion(
        hiveMetastoreVersion = hiveMetastoreVersion,
        hadoopVersion = VersionInfo.getVersion,
        sparkConf = conf,
        hadoopConf = hadoopConf,
        config = configurations,
        barrierPrefixes = hiveMetastoreBarrierPrefixes,
        sharedPrefixes = hiveMetastoreSharedPrefixes)
    } else if (hiveMetastoreJars == "path") {
      // Convert to files and expand any directories.
      val jars =
        HiveUtils.hiveMetastoreJarsPath(sqlConf)
          .flatMap {
            case path if path.contains("\\") && Utils.isWindows =>
              addLocalHiveJars(new File(path))
            case path =>
              DataSource.checkAndGlobPathIfNecessary(
                pathStrings = Seq(path),
                hadoopConf = hadoopConf,
                checkEmptyGlobPath = true,
                checkFilesExist = false,
                enableGlobbing = true
              ).map(_.toUri.toURL)
          }

      logInitWithPath(jars)
      new IsolatedClientLoader(
        version = metaVersion,
        sparkConf = conf,
        hadoopConf = hadoopConf,
        execJars = jars,
        config = configurations,
        isolationOn = true,
        barrierPrefixes = hiveMetastoreBarrierPrefixes,
        sharedPrefixes = hiveMetastoreSharedPrefixes)
    } else {
      // Convert to files and expand any directories.
      val jars =
        hiveMetastoreJars
          .split(File.pathSeparator)
          .flatMap { path =>
            addLocalHiveJars(new File(path))
          }

      logInitWithPath(jars.toSeq)
      new IsolatedClientLoader(
        version = metaVersion,
        sparkConf = conf,
        hadoopConf = hadoopConf,
        execJars = jars.toImmutableArraySeq,
        config = configurations,
        isolationOn = true,
        barrierPrefixes = hiveMetastoreBarrierPrefixes,
        sharedPrefixes = hiveMetastoreSharedPrefixes)
    }
    isolatedLoader.createClient()
  }

  /** Constructs a configuration for hive, where the metastore is located in a temp directory. */
  def newTemporaryConfiguration(useInMemoryDerby: Boolean): Map[String, String] = {
    val withInMemoryMode = if (useInMemoryDerby) "memory:" else ""

    val tempDir = Utils.createTempDir()
    val localMetastore = new File(tempDir, "metastore")
    val propMap: HashMap[String, String] = HashMap()
    // We have to mask all properties in hive-site.xml that relate to the metastore data source,
    // as we use a local metastore here.
    HiveConf.ConfVars.values().foreach { confvar =>
      if (confvar.varname.contains("datanucleus") || confvar.varname.contains("jdo")
        || confvar.varname.contains("hive.metastore.rawstore.impl")) {
        propMap.put(confvar.varname, confvar.getDefaultExpr())
      }
    }
    propMap.put(WAREHOUSE_PATH.key, localMetastore.toURI.toString)
    propMap.put("javax.jdo.option.ConnectionURL",
      s"jdbc:derby:${withInMemoryMode};databaseName=${localMetastore.getAbsolutePath};create=true")
    propMap.put("datanucleus.rdbms.datastoreAdapterClassName",
      "org.datanucleus.store.rdbms.adapter.DerbyAdapter")

    // Disable schema verification and allow schema auto-creation in the
    // Derby database, in case the config for the metastore is set otherwise.
    // Without these settings, starting the client fails with
    // MetaException(message:Version information not found in metastore.)
    propMap.put("hive.metastore.schema.verification", "false")
    propMap.put("datanucleus.schema.autoCreateAll", "true")

    // SPARK-11783: When "hive.metastore.uris" is set, the metastore connection mode will be
    // remote (https://cwiki.apache.org/confluence/display/Hive/AdminManual+MetastoreAdmin
    // mentions that "If hive.metastore.uris is empty local mode is assumed, remote otherwise").
    // Remote means that the metastore server is running in its own process.
    // When the mode is remote, configurations like "javax.jdo.option.ConnectionURL" will not be
    // used (because they are used by remote metastore server that talks to the database).
    // Because the execution Hive should always connect to an embedded Derby metastore, we have
    // to remove the value of hive.metastore.uris so that the execution Hive client connects to
    // the actual embedded Derby metastore instead of the remote metastore.
    // You can search hive.metastore.uris in the code of HiveConf (in Hive's repo).
    // Then, you will find that the local metastore mode is only set to true when
    // hive.metastore.uris is not set.
    propMap.put("hive.metastore.uris", "")

    // The execution client will generate garbage events; therefore, the listeners that are
    // generated for the execution clients are useless. In order not to output garbage, we don't
    // generate these listeners.
    propMap.put(ConfVars.METASTORE_PRE_EVENT_LISTENERS.varname, "")
    propMap.put(ConfVars.METASTORE_EVENT_LISTENERS.varname, "")
    propMap.put(ConfVars.METASTORE_END_FUNCTION_LISTENERS.varname, "")

    // SPARK-21451: Spark will gather all `spark.hadoop.*` properties from a `SparkConf` into a
    // Hadoop Configuration internally, as long as it happens after the SparkContext is
    // initialized. Some instances, such as the `CliSessionState` used in `SparkSQLCliDriver`, may
    // also rely on these configurations, but they are created before the SparkContext is
    // initialized, so we need to take the properties from system properties in the form of
    // regular Hadoop configurations.
    SparkHadoopUtil.get.appendSparkHadoopConfigs(sys.props.toMap, propMap)
    SparkHadoopUtil.get.appendSparkHiveConfigs(sys.props.toMap, propMap)

    propMap.toMap
  }
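
  // For illustration, with useInMemoryDerby = true the returned map contains, among other entries
  // (the temporary path is a placeholder):
  //   "javax.jdo.option.ConnectionURL" ->
  //     "jdbc:derby:memory:;databaseName=/tmp/.../metastore;create=true"
  //   "hive.metastore.uris" -> ""
  //   "hive.metastore.schema.verification" -> "false"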

  /**
   * Infers the schema for Hive serde tables and returns the CatalogTable with the inferred schema.
   * When the table is a data source table or the schema already exists, returns the original
   * CatalogTable.
   */
  def inferSchema(table: CatalogTable): CatalogTable = {
    if (DDLUtils.isDatasourceTable(table) || table.dataSchema.nonEmpty) {
      table
    } else {
      val hiveTable = HiveClientImpl.toHiveTable(table)
      // Note: Hive separates partition columns and the schema, but for us the
      // partition columns are part of the schema
      val partCols = hiveTable.getPartCols.asScala.map(HiveClientImpl.fromHiveColumn)
      val dataCols = hiveTable.getCols.asScala.map(HiveClientImpl.fromHiveColumn)
      table.copy(schema = StructType((dataCols ++ partCols).toArray))
    }
  }

  /**
   * Extract the partition values from a partition name, e.g., if a partition name is
   * "region=US/dt=2023-02-18", then we will return an array of values ("US", "2023-02-18").
   */
  def partitionNameToValues(name: String): Array[String] = {
    name.split(Path.SEPARATOR).map {
      case PATTERN_FOR_KEY_EQ_VAL(_, v) => FileUtils.unescapePathName(v)
    }
  }
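
  // Illustrative behavior (characters escaped by Hive in the value are unescaped):
  //   partitionNameToValues("region=US/dt=2023-02-18")        // Array("US", "2023-02-18")
  //   partitionNameToValues("city=New%20York/dt=2023-02-18")  // Array("New York", "2023-02-18")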
}