/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.hive
import java.io.File
import java.net.{URL, URLClassLoader}
import java.nio.charset.StandardCharsets
import java.sql.Timestamp
import java.util.concurrent.TimeUnit
import scala.collection.JavaConverters._
import scala.collection.mutable.HashMap
import scala.language.implicitConversions
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.common.`type`.HiveDecimal
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.conf.HiveConf.ConfVars
import org.apache.hadoop.hive.serde2.io.{DateWritable, TimestampWritable}
import org.apache.hadoop.util.VersionInfo
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config.CATALOG_IMPLEMENTATION
import org.apache.spark.sql._
import org.apache.spark.sql.hive.client._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf._
import org.apache.spark.sql.types._
import org.apache.spark.util.Utils
private[spark] object HiveUtils extends Logging {
def withHiveExternalCatalog(sc: SparkContext): SparkContext = {
sc.conf.set(CATALOG_IMPLEMENTATION.key, "hive")
sc
}
/** The version of hive used internally by Spark SQL. */
val hiveExecutionVersion: String = "1.2.1"
val HIVE_METASTORE_VERSION = SQLConfigBuilder("spark.sql.hive.metastore.version")
.doc("Version of the Hive metastore. Available options are " +
s"0.12.0
through $hiveExecutionVersion
.")
.stringConf
.createWithDefault(hiveExecutionVersion)
val HIVE_EXECUTION_VERSION = SQLConfigBuilder("spark.sql.hive.version")
.doc("Version of Hive used internally by Spark SQL.")
.stringConf
.createWithDefault(hiveExecutionVersion)
val HIVE_METASTORE_JARS = SQLConfigBuilder("spark.sql.hive.metastore.jars")
.doc(s"""
| Location of the jars that should be used to instantiate the HiveMetastoreClient.
| This property can be one of three options: "
| 1. "builtin"
| Use Hive ${hiveExecutionVersion}, which is bundled with the Spark assembly when
| -Phive
is enabled. When this option is chosen,
| spark.sql.hive.metastore.version
must be either
| ${hiveExecutionVersion}
or not defined.
| 2. "maven"
| Use Hive jars of specified version downloaded from Maven repositories.
| 3. A classpath in the standard format for both Hive and Hadoop.
""".stripMargin)
.stringConf
.createWithDefault("builtin")
val CONVERT_METASTORE_PARQUET = SQLConfigBuilder("spark.sql.hive.convertMetastoreParquet")
.doc("When set to false, Spark SQL will use the Hive SerDe for parquet tables instead of " +
"the built in support.")
.booleanConf
.createWithDefault(true)
val CONVERT_METASTORE_PARQUET_WITH_SCHEMA_MERGING =
SQLConfigBuilder("spark.sql.hive.convertMetastoreParquet.mergeSchema")
.doc("When true, also tries to merge possibly different but compatible Parquet schemas in " +
"different Parquet data files. This configuration is only effective " +
"when \"spark.sql.hive.convertMetastoreParquet\" is true.")
.booleanConf
.createWithDefault(false)
val CONVERT_CTAS = SQLConfigBuilder("spark.sql.hive.convertCTAS")
.doc("When true, a table created by a Hive CTAS statement (no USING clause) will be " +
"converted to a data source table, using the data source set by spark.sql.sources.default.")
.booleanConf
.createWithDefault(false)
val CONVERT_METASTORE_ORC = SQLConfigBuilder("spark.sql.hive.convertMetastoreOrc")
.doc("When set to false, Spark SQL will use the Hive SerDe for ORC tables instead of " +
"the built in support.")
.booleanConf
.createWithDefault(true)
val HIVE_METASTORE_SHARED_PREFIXES = SQLConfigBuilder("spark.sql.hive.metastore.sharedPrefixes")
.doc("A comma separated list of class prefixes that should be loaded using the classloader " +
"that is shared between Spark SQL and a specific version of Hive. An example of classes " +
"that should be shared is JDBC drivers that are needed to talk to the metastore. Other " +
"classes that need to be shared are those that interact with classes that are already " +
"shared. For example, custom appenders that are used by log4j.")
.stringConf
.toSequence
.createWithDefault(jdbcPrefixes)
private def jdbcPrefixes = Seq(
"com.mysql.jdbc", "org.postgresql", "com.microsoft.sqlserver", "oracle.jdbc")
val HIVE_METASTORE_BARRIER_PREFIXES = SQLConfigBuilder("spark.sql.hive.metastore.barrierPrefixes")
.doc("A comma separated list of class prefixes that should explicitly be reloaded for each " +
"version of Hive that Spark SQL is communicating with. For example, Hive UDFs that are " +
"declared in a prefix that typically would be shared (i.e. org.apache.spark.*
).")
.stringConf
.toSequence
.createWithDefault(Nil)
val HIVE_THRIFT_SERVER_ASYNC = SQLConfigBuilder("spark.sql.hive.thriftServer.async")
.doc("When set to true, Hive Thrift server executes SQL queries in an asynchronous way.")
.booleanConf
.createWithDefault(true)
/**
* The version of the hive client that will be used to communicate with the metastore. Note that
* this does not necessarily need to be the same version of Hive that is used internally by
* Spark SQL for execution.
*/
private def hiveMetastoreVersion(conf: SQLConf): String = {
conf.getConf(HIVE_METASTORE_VERSION)
}
/**
* The location of the jars that should be used to instantiate the HiveMetastoreClient. This
* property can be one of three options:
* - a classpath in the standard format for both hive and hadoop.
* - builtin - attempt to discover the jars that were used to load Spark SQL and use those. This
* option is only valid when using the execution version of Hive.
* - maven - download the correct version of hive on demand from maven.
*/
private def hiveMetastoreJars(conf: SQLConf): String = {
conf.getConf(HIVE_METASTORE_JARS)
}
/**
* A comma separated list of class prefixes that should be loaded using the classloader that
* is shared between Spark SQL and a specific version of Hive. An example of classes that should
* be shared is JDBC drivers that are needed to talk to the metastore. Other classes that need
* to be shared are those that interact with classes that are already shared. For example,
* custom appenders that are used by log4j.
*/
private def hiveMetastoreSharedPrefixes(conf: SQLConf): Seq[String] = {
conf.getConf(HIVE_METASTORE_SHARED_PREFIXES).filterNot(_ == "")
}
/**
* A comma separated list of class prefixes that should explicitly be reloaded for each version
* of Hive that Spark SQL is communicating with. For example, Hive UDFs that are declared in a
* prefix that typically would be shared (i.e. org.apache.spark.*)
*/
private def hiveMetastoreBarrierPrefixes(conf: SQLConf): Seq[String] = {
conf.getConf(HIVE_METASTORE_BARRIER_PREFIXES).filterNot(_ == "")
}
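// Sketch of how the two prefix lists are typically set; com.example.jdbc and com.example.hive.udf
// are placeholder package names:
//
//   sparkConf
//     .set("spark.sql.hive.metastore.sharedPrefixes", "com.mysql.jdbc,com.example.jdbc")
//     .set("spark.sql.hive.metastore.barrierPrefixes", "com.example.hive.udf")
//
// Empty entries are filtered out by the two helpers above, so a trailing comma is harmless.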
/**
* Configurations needed to create a [[HiveClient]].
*/
private[hive] def hiveClientConfigurations(hadoopConf: Configuration): Map[String, String] = {
// Hive 0.14.0 introduces timeout operations in HiveConf, and changes default values of a bunch
// of time `ConfVar`s by adding time suffixes (`s`, `ms`, and `d` etc.). This breaks backwards-
// compatibility when users are trying to connect to a Hive metastore of a lower version,
// because these options are expected to be integral values in lower versions of Hive.
//
// Here we enumerate all time `ConfVar`s and convert their values to numeric strings according
// to their output time units.
Seq(
ConfVars.METASTORE_CLIENT_CONNECT_RETRY_DELAY -> TimeUnit.SECONDS,
ConfVars.METASTORE_CLIENT_SOCKET_TIMEOUT -> TimeUnit.SECONDS,
ConfVars.METASTORE_CLIENT_SOCKET_LIFETIME -> TimeUnit.SECONDS,
ConfVars.HMSHANDLERINTERVAL -> TimeUnit.MILLISECONDS,
ConfVars.METASTORE_EVENT_DB_LISTENER_TTL -> TimeUnit.SECONDS,
ConfVars.METASTORE_EVENT_CLEAN_FREQ -> TimeUnit.SECONDS,
ConfVars.METASTORE_EVENT_EXPIRY_DURATION -> TimeUnit.SECONDS,
ConfVars.METASTORE_AGGREGATE_STATS_CACHE_TTL -> TimeUnit.SECONDS,
ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_WRITER_WAIT -> TimeUnit.MILLISECONDS,
ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_READER_WAIT -> TimeUnit.MILLISECONDS,
ConfVars.HIVES_AUTO_PROGRESS_TIMEOUT -> TimeUnit.SECONDS,
ConfVars.HIVE_LOG_INCREMENTAL_PLAN_PROGRESS_INTERVAL -> TimeUnit.MILLISECONDS,
ConfVars.HIVE_STATS_JDBC_TIMEOUT -> TimeUnit.SECONDS,
ConfVars.HIVE_STATS_RETRIES_WAIT -> TimeUnit.MILLISECONDS,
ConfVars.HIVE_LOCK_SLEEP_BETWEEN_RETRIES -> TimeUnit.SECONDS,
ConfVars.HIVE_ZOOKEEPER_SESSION_TIMEOUT -> TimeUnit.MILLISECONDS,
ConfVars.HIVE_ZOOKEEPER_CONNECTION_BASESLEEPTIME -> TimeUnit.MILLISECONDS,
ConfVars.HIVE_TXN_TIMEOUT -> TimeUnit.SECONDS,
ConfVars.HIVE_COMPACTOR_WORKER_TIMEOUT -> TimeUnit.SECONDS,
ConfVars.HIVE_COMPACTOR_CHECK_INTERVAL -> TimeUnit.SECONDS,
ConfVars.HIVE_COMPACTOR_CLEANER_RUN_INTERVAL -> TimeUnit.MILLISECONDS,
ConfVars.HIVE_SERVER2_THRIFT_HTTP_MAX_IDLE_TIME -> TimeUnit.MILLISECONDS,
ConfVars.HIVE_SERVER2_THRIFT_HTTP_WORKER_KEEPALIVE_TIME -> TimeUnit.SECONDS,
ConfVars.HIVE_SERVER2_THRIFT_HTTP_COOKIE_MAX_AGE -> TimeUnit.SECONDS,
ConfVars.HIVE_SERVER2_THRIFT_LOGIN_BEBACKOFF_SLOT_LENGTH -> TimeUnit.MILLISECONDS,
ConfVars.HIVE_SERVER2_THRIFT_LOGIN_TIMEOUT -> TimeUnit.SECONDS,
ConfVars.HIVE_SERVER2_THRIFT_WORKER_KEEPALIVE_TIME -> TimeUnit.SECONDS,
ConfVars.HIVE_SERVER2_ASYNC_EXEC_SHUTDOWN_TIMEOUT -> TimeUnit.SECONDS,
ConfVars.HIVE_SERVER2_ASYNC_EXEC_KEEPALIVE_TIME -> TimeUnit.SECONDS,
ConfVars.HIVE_SERVER2_LONG_POLLING_TIMEOUT -> TimeUnit.MILLISECONDS,
ConfVars.HIVE_SERVER2_SESSION_CHECK_INTERVAL -> TimeUnit.MILLISECONDS,
ConfVars.HIVE_SERVER2_IDLE_SESSION_TIMEOUT -> TimeUnit.MILLISECONDS,
ConfVars.HIVE_SERVER2_IDLE_OPERATION_TIMEOUT -> TimeUnit.MILLISECONDS,
ConfVars.SERVER_READ_SOCKET_TIMEOUT -> TimeUnit.SECONDS,
ConfVars.HIVE_LOCALIZE_RESOURCE_WAIT_INTERVAL -> TimeUnit.MILLISECONDS,
ConfVars.SPARK_CLIENT_FUTURE_TIMEOUT -> TimeUnit.SECONDS,
ConfVars.SPARK_JOB_MONITOR_TIMEOUT -> TimeUnit.SECONDS,
ConfVars.SPARK_RPC_CLIENT_CONNECT_TIMEOUT -> TimeUnit.MILLISECONDS,
ConfVars.SPARK_RPC_CLIENT_HANDSHAKE_TIMEOUT -> TimeUnit.MILLISECONDS
).map { case (confVar, unit) =>
confVar.varname -> HiveConf.getTimeVar(hadoopConf, confVar, unit).toString
}.toMap
}
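// Worked example of the conversion above, assuming Hive 1.2.x defaults and no overrides in
// hadoopConf: METASTORE_CLIENT_CONNECT_RETRY_DELAY defaults to "1s", getTimeVar with
// TimeUnit.SECONDS reads it back as 1, and the returned map therefore contains
// "hive.metastore.client.connect.retry.delay" -> "1", a plain integer that pre-0.14 metastores
// can still parse.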
/**
* Create a [[HiveClient]] used for execution.
*
* Currently this must always be the built-in Hive version (hiveExecutionVersion, i.e. 1.2.1), as
* this is the version of Hive that is packaged with Spark SQL. This copy of the client is used
* for execution-related tasks like registering temporary functions or ensuring that the
* ThreadLocal SessionState is correctly populated. This copy of Hive is *not* used for storing
* persistent metadata, and only points to a dummy metastore in a temporary directory.
*/
protected[hive] def newClientForExecution(
conf: SparkConf,
hadoopConf: Configuration): HiveClientImpl = {
logInfo(s"Initializing execution hive, version $hiveExecutionVersion")
val loader = new IsolatedClientLoader(
version = IsolatedClientLoader.hiveVersion(hiveExecutionVersion),
sparkConf = conf,
execJars = Seq(),
hadoopConf = hadoopConf,
config = newTemporaryConfiguration(useInMemoryDerby = true),
isolationOn = false,
baseClassLoader = Utils.getContextOrSparkClassLoader)
loader.createClient().asInstanceOf[HiveClientImpl]
}
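// Illustrative usage, assuming an existing SparkContext `sc`; the SET statement is only an
// example command:
//
//   val execHive = HiveUtils.newClientForExecution(sc.conf, sc.hadoopConfiguration)
//   execHive.runSqlHive("SET hive.exec.dynamic.partition = true")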
/**
* Create a [[HiveClient]] used to retrieve metadata from the Hive MetaStore.
*
* The version of the Hive client that is used here must match the metastore that is configured
* in the hive-site.xml file.
*/
protected[hive] def newClientForMetadata(
conf: SparkConf,
hadoopConf: Configuration): HiveClient = {
val configurations = hiveClientConfigurations(hadoopConf)
newClientForMetadata(conf, hadoopConf, configurations)
}
protected[hive] def newClientForMetadata(
conf: SparkConf,
hadoopConf: Configuration,
configurations: Map[String, String]): HiveClient = {
val sqlConf = new SQLConf
sqlConf.setConf(SQLContext.getSQLProperties(conf))
val hiveMetastoreVersion = HiveUtils.hiveMetastoreVersion(sqlConf)
val hiveMetastoreJars = HiveUtils.hiveMetastoreJars(sqlConf)
val hiveMetastoreSharedPrefixes = HiveUtils.hiveMetastoreSharedPrefixes(sqlConf)
val hiveMetastoreBarrierPrefixes = HiveUtils.hiveMetastoreBarrierPrefixes(sqlConf)
val metaVersion = IsolatedClientLoader.hiveVersion(hiveMetastoreVersion)
val isolatedLoader = if (hiveMetastoreJars == "builtin") {
if (hiveExecutionVersion != hiveMetastoreVersion) {
throw new IllegalArgumentException(
"Builtin jars can only be used when hive execution version == hive metastore version. " +
s"Execution: $hiveExecutionVersion != Metastore: $hiveMetastoreVersion. " +
"Specify a vaild path to the correct hive jars using $HIVE_METASTORE_JARS " +
s"or change ${HIVE_METASTORE_VERSION.key} to $hiveExecutionVersion.")
}
// We recursively find all jars in the class loader chain,
// starting from the given classLoader.
def allJars(classLoader: ClassLoader): Array[URL] = classLoader match {
case null => Array.empty[URL]
case urlClassLoader: URLClassLoader =>
urlClassLoader.getURLs ++ allJars(urlClassLoader.getParent)
case other => allJars(other.getParent)
}
val classLoader = Utils.getContextOrSparkClassLoader
val jars = allJars(classLoader)
if (jars.length == 0) {
throw new IllegalArgumentException(
"Unable to locate hive jars to connect to metastore. " +
"Please set spark.sql.hive.metastore.jars.")
}
logInfo(
s"Initializing HiveMetastoreConnection version $hiveMetastoreVersion using Spark classes.")
new IsolatedClientLoader(
version = metaVersion,
sparkConf = conf,
hadoopConf = hadoopConf,
execJars = jars.toSeq,
config = configurations,
isolationOn = true,
barrierPrefixes = hiveMetastoreBarrierPrefixes,
sharedPrefixes = hiveMetastoreSharedPrefixes)
} else if (hiveMetastoreJars == "maven") {
// TODO: Support for loading the jars from an already downloaded location.
logInfo(
s"Initializing HiveMetastoreConnection version $hiveMetastoreVersion using maven.")
IsolatedClientLoader.forVersion(
hiveMetastoreVersion = hiveMetastoreVersion,
hadoopVersion = VersionInfo.getVersion,
sparkConf = conf,
hadoopConf = hadoopConf,
config = configurations,
barrierPrefixes = hiveMetastoreBarrierPrefixes,
sharedPrefixes = hiveMetastoreSharedPrefixes)
} else {
// Convert to files and expand any directories.
val jars =
hiveMetastoreJars
.split(File.pathSeparator)
.flatMap {
case path if new File(path).getName == "*" =>
val files = new File(path).getParentFile.listFiles()
if (files == null) {
logWarning(s"Hive jar path '$path' does not exist.")
Nil
} else {
files.filter(_.getName.toLowerCase.endsWith(".jar"))
}
case path =>
new File(path) :: Nil
}
.map(_.toURI.toURL)
logInfo(
s"Initializing HiveMetastoreConnection version $hiveMetastoreVersion " +
s"using ${jars.mkString(":")}")
new IsolatedClientLoader(
version = metaVersion,
sparkConf = conf,
hadoopConf = hadoopConf,
execJars = jars.toSeq,
config = configurations,
isolationOn = true,
barrierPrefixes = hiveMetastoreBarrierPrefixes,
sharedPrefixes = hiveMetastoreSharedPrefixes)
}
isolatedLoader.createClient()
}
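// Sketch of the third (classpath) option handled above, with placeholder paths. Entries are
// separated by File.pathSeparator (':' on Unix) and an entry ending in "*" expands to the .jar
// files in that directory:
//
//   spark.sql.hive.metastore.jars=/opt/hive/lib/*:/opt/hadoop/client/hadoop-common.jar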
/** Constructs a configuration for hive, where the metastore is located in a temp directory. */
def newTemporaryConfiguration(useInMemoryDerby: Boolean): Map[String, String] = {
val withInMemoryMode = if (useInMemoryDerby) "memory:" else ""
val tempDir = Utils.createTempDir()
val localMetastore = new File(tempDir, "metastore")
val propMap: HashMap[String, String] = HashMap()
// We have to mask all properties in hive-site.xml that relate to the metastore data source,
// as we use a local metastore here.
HiveConf.ConfVars.values().foreach { confvar =>
if (confvar.varname.contains("datanucleus") || confvar.varname.contains("jdo")
|| confvar.varname.contains("hive.metastore.rawstore.impl")) {
propMap.put(confvar.varname, confvar.getDefaultExpr())
}
}
propMap.put(SQLConf.WAREHOUSE_PATH.key, localMetastore.toURI.toString)
propMap.put(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname,
s"jdbc:derby:${withInMemoryMode};databaseName=${localMetastore.getAbsolutePath};create=true")
propMap.put("datanucleus.rdbms.datastoreAdapterClassName",
"org.datanucleus.store.rdbms.adapter.DerbyAdapter")
// SPARK-11783: When "hive.metastore.uris" is set, the metastore connection mode will be
// remote (https://cwiki.apache.org/confluence/display/Hive/AdminManual+MetastoreAdmin
// mentions that "If hive.metastore.uris is empty local mode is assumed, remote otherwise").
// Remote means that the metastore server is running in its own process.
// When the mode is remote, configurations like "javax.jdo.option.ConnectionURL" will not be
// used (they are only read by the remote metastore server that talks to the database).
// Because the execution Hive client should always connect to an embedded Derby metastore,
// we have to clear hive.metastore.uris so that the client connects to the local embedded
// metastore instead of a remote one. If you search for HiveConf.ConfVars.METASTOREURIS in
// Hive's HiveConf, you will find that local metastore mode is only assumed when
// hive.metastore.uris is not set.
propMap.put(ConfVars.METASTOREURIS.varname, "")
propMap.toMap
}
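// For example, with a temporary directory of /tmp/spark-01234 (placeholder path) and
// useInMemoryDerby = true, the connection URL set above is
// "jdbc:derby:memory:;databaseName=/tmp/spark-01234/metastore;create=true"; with
// useInMemoryDerby = false the "memory:" prefix is simply omitted.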
protected val primitiveTypes =
Seq(StringType, IntegerType, LongType, DoubleType, FloatType, BooleanType, ByteType,
ShortType, DateType, TimestampType, BinaryType)
protected[sql] def toHiveString(a: (Any, DataType)): String = a match {
case (struct: Row, StructType(fields)) =>
struct.toSeq.zip(fields).map {
case (v, t) => s""""${t.name}":${toHiveStructString(v, t.dataType)}"""
}.mkString("{", ",", "}")
case (seq: Seq[_], ArrayType(typ, _)) =>
seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]")
case (map: Map[_, _], MapType(kType, vType, _)) =>
map.map {
case (key, value) =>
toHiveStructString((key, kType)) + ":" + toHiveStructString((value, vType))
}.toSeq.sorted.mkString("{", ",", "}")
case (null, _) => "NULL"
case (d: Int, DateType) => new DateWritable(d).toString
case (t: Timestamp, TimestampType) => new TimestampWritable(t).toString
case (bin: Array[Byte], BinaryType) => new String(bin, StandardCharsets.UTF_8)
case (decimal: java.math.BigDecimal, DecimalType()) =>
// Hive strips trailing zeros so use its toString
HiveDecimal.create(decimal).toString
case (other, tpe) if primitiveTypes contains tpe => other.toString
}
/** Hive outputs fields of structs slightly differently than top level attributes. */
protected def toHiveStructString(a: (Any, DataType)): String = a match {
case (struct: Row, StructType(fields)) =>
struct.toSeq.zip(fields).map {
case (v, t) => s""""${t.name}":${toHiveStructString(v, t.dataType)}"""
}.mkString("{", ",", "}")
case (seq: Seq[_], ArrayType(typ, _)) =>
seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]")
case (map: Map[_, _], MapType(kType, vType, _)) =>
map.map {
case (key, value) =>
toHiveStructString((key, kType)) + ":" + toHiveStructString((value, vType))
}.toSeq.sorted.mkString("{", ",", "}")
case (null, _) => "null"
case (s: String, StringType) => "\"" + s + "\""
case (decimal, DecimalType()) => decimal.toString
case (other, tpe) if primitiveTypes contains tpe => other.toString
}
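// Illustrative behaviour of the two formatters above (input values chosen arbitrarily):
//
//   toHiveString((Row(1, "a"),
//     StructType(Seq(StructField("i", IntegerType), StructField("s", StringType)))))
//   // => {"i":1,"s":"a"}
//
//   toHiveString((null, StringType))        // => NULL
//   toHiveStructString((null, StringType))  // => null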
}