
/*
 * Copyright (2020-present) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.delta.hive

import java.util.{ArrayList => JArrayList}

import scala.collection.JavaConverters._
import scala.collection.mutable

import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.metastore.HiveMetaHook
import org.apache.hadoop.hive.metastore.api.MetaException
import org.apache.hadoop.hive.metastore.api.Table
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_LOCATION
import org.apache.hadoop.hive.ql.exec.FunctionRegistry
import org.apache.hadoop.hive.ql.index.IndexSearchCondition
import org.apache.hadoop.hive.ql.io.IOConstants
import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler
import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler
import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler.DecomposedPredicate
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc
import org.apache.hadoop.hive.ql.plan.TableDesc
import org.apache.hadoop.hive.serde2.AbstractSerDe
import org.apache.hadoop.hive.serde2.Deserializer
import org.apache.hadoop.hive.serde2.typeinfo.{StructTypeInfo, TypeInfo, TypeInfoFactory, TypeInfoUtils}
import org.apache.hadoop.mapred.{InputFormat, JobConf, OutputFormat}
import org.slf4j.LoggerFactory

class DeltaStorageHandler extends DefaultStorageHandler with HiveMetaHook
  with HiveStoragePredicateHandler {

  import DeltaStorageHandler._

  private val LOG = LoggerFactory.getLogger(classOf[DeltaStorageHandler])

  override def getInputFormatClass: Class[_ <: InputFormat[_, _]] = classOf[DeltaInputFormat]

  /**
   * Returns a special [[OutputFormat]] to prevent writing to a Delta table in Hive before we
   * support it. We have to give Hive some class when creating a table, hence we have to implement
   * an [[OutputFormat]] which throws an exception when Hive uses it.
   */
  override def getOutputFormatClass: Class[_ <: OutputFormat[_, _]] = classOf[DeltaOutputFormat]
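
  // Delta stores a table's data as Parquet files, so the stock ParquetHiveSerDe can be reused to
  // deserialize the rows produced by the input format.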
  override def getSerDeClass(): Class[_ <: AbstractSerDe] = classOf[ParquetHiveSerDe]

  /**
   * DataWritableReadSupport.getColumnNames is private before Hive 2.3, so we use reflection to
   * stay compatible with older Hive versions.
   */
  private def getColumnNames(columns: String): java.util.List[String] = {
    val getColumnNamesMethod =
      classOf[DataWritableReadSupport].getDeclaredMethod("getColumnNames", classOf[String])
    getColumnNamesMethod.setAccessible(true)
    getColumnNamesMethod.invoke(null, columns).asInstanceOf[java.util.List[String]]
  }
  override def configureInputJobProperties(
      tableDesc: TableDesc,
      jobProperties: java.util.Map[String, String]): Unit = {
    super.configureInputJobProperties(tableDesc, jobProperties)
    val tableProps = tableDesc.getProperties()
    val columnNames = getColumnNames(tableProps.getProperty(IOConstants.COLUMNS))
    val columnTypes =
      DataWritableReadSupport.getColumnTypes(tableProps.getProperty(IOConstants.COLUMNS_TYPES))
    val hiveSchema = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes)
      .asInstanceOf[StructTypeInfo]
    val rootPath = tableProps.getProperty(META_TABLE_LOCATION)
    val snapshot = DeltaHelper.loadDeltaLatestSnapshot(getConf, new Path(rootPath))
    DeltaHelper.checkTableSchema(snapshot.getMetadata.getSchema, hiveSchema)
    jobProperties.put(DELTA_TABLE_PATH, rootPath)
    jobProperties.put(DELTA_TABLE_SCHEMA, hiveSchema.toString)
  }
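
  // Note: `hiveSchema.toString` above serializes the schema as a Hive type string (for example
  // `struct<col1:int,col2:string>`, column names illustrative); together with the table path it
  // is expected to be looked up again from the JobConf on the read path, using the keys defined
  // in the companion object below.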
  override def decomposePredicate(
      jobConf: JobConf,
      deserializer: Deserializer,
      predicate: ExprNodeDesc): DecomposedPredicate = {
    // Get the Delta root path
    val deltaRootPath = jobConf.get(META_TABLE_LOCATION)
    // Get the partition columns of the Delta table
    val partitionColumns = DeltaHelper.getPartitionCols(jobConf, new Path(deltaRootPath))
    if (LOG.isInfoEnabled) {
      LOG.info("delta partitionColumns is " + partitionColumns.mkString(", "))
    }

    val analyzer = newIndexPredicateAnalyzer(partitionColumns)
    val conditions = new JArrayList[IndexSearchCondition]()
    var pushedPredicate: ExprNodeGenericFuncDesc = null
    var residualPredicate =
      analyzer.analyzePredicate(predicate, conditions).asInstanceOf[ExprNodeGenericFuncDesc]
    for (searchConditions <- decompose(conditions).values) {
      // still push back the pushedPredicate to residualPredicate
      residualPredicate =
        extractResidualCondition(analyzer, searchConditions, residualPredicate)
      pushedPredicate =
        extractStorageHandlerCondition(analyzer, searchConditions, pushedPredicate)
    }

    if (LOG.isInfoEnabled) {
      LOG.info("pushedPredicate: " +
        (if (pushedPredicate == null) "null" else pushedPredicate.getExprString()) +
        ", residualPredicate: " + residualPredicate)
    }

    val decomposedPredicate = new DecomposedPredicate()
    decomposedPredicate.pushedPredicate = pushedPredicate
    decomposedPredicate.residualPredicate = residualPredicate
    decomposedPredicate
  }
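
  // An illustrative walk-through of the decomposition above (column names are hypothetical): for
  // a table partitioned by `date`, a predicate such as `date = '2021-01-01' AND value > 10` would
  // be split so that the `date` comparison becomes the pushed predicate (evaluated against Delta
  // partition metadata), while `value > 10` stays in the residual predicate that Hive still
  // evaluates per row.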
  private def newIndexPredicateAnalyzer(partitionColumns: Seq[String]): IndexPredicateAnalyzer = {
    val analyzer = new IndexPredicateAnalyzer()
    for (col <- partitionColumns) {
      // Supported filter expressions on partition columns can be pushed down to Delta
      analyzer.addComparisonOp(col, SUPPORTED_PUSH_DOWN_UDFS: _*)
    }
    analyzer
  }
  private def decompose(searchConditions: JArrayList[IndexSearchCondition]):
      Map[String, JArrayList[IndexSearchCondition]] = {
    val result = mutable.Map[String, JArrayList[IndexSearchCondition]]()
    for (condition <- searchConditions.asScala) {
      val conditions = result.getOrElseUpdate(
        condition.getColumnDesc().getColumn(),
        new JArrayList[IndexSearchCondition]())
      conditions.add(condition)
    }
    result.toMap
  }
  private def extractResidualCondition(
      analyzer: IndexPredicateAnalyzer,
      searchConditions: JArrayList[IndexSearchCondition],
      inputExpr: ExprNodeGenericFuncDesc): ExprNodeGenericFuncDesc = {
    if (inputExpr == null) {
      analyzer.translateOriginalConditions(searchConditions)
    } else {
      val children = new JArrayList[ExprNodeDesc]
      children.add(analyzer.translateOriginalConditions(searchConditions))
      children.add(inputExpr)
      new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
        FunctionRegistry.getGenericUDFForAnd(), children)
    }
  }
  private def extractStorageHandlerCondition(
      analyzer: IndexPredicateAnalyzer,
      searchConditions: JArrayList[IndexSearchCondition],
      inputExpr: ExprNodeGenericFuncDesc): ExprNodeGenericFuncDesc = {
    if (inputExpr == null) {
      analyzer.translateSearchConditions(searchConditions)
    } else {
      val children = new JArrayList[ExprNodeDesc]
      children.add(analyzer.translateSearchConditions(searchConditions))
      children.add(inputExpr)
      new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
        FunctionRegistry.getGenericUDFForAnd(), children)
    }
  }
  override def getMetaHook: HiveMetaHook = this

  /**
   * We inline the logic of `MetaStoreUtils.isExternalTable` to keep our code compatible with both
   * Hive 2 and 3, because `MetaStoreUtils` lives in different packages in Hive 2 and 3.
   */
  private def isExternalTable(table: Table): Boolean = {
    if (table == null) {
      return false
    }
    val params = table.getParameters()
    if (params == null) {
      return false
    }
    "TRUE".equalsIgnoreCase(params.get("EXTERNAL"))
  }
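
  // For reference, the connector is typically used with DDL along these lines (table name,
  // columns, and path are illustrative):
  //
  //   CREATE EXTERNAL TABLE my_delta_table (col1 INT, col2 STRING)
  //   STORED BY 'io.delta.hive.DeltaStorageHandler'
  //   LOCATION '/path/to/delta/table';
  //
  // The hook below validates such a table before the metastore entry is created.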
  override def preCreateTable(tbl: Table): Unit = {
    if (!isExternalTable(tbl)) {
      throw new UnsupportedOperationException(
        s"The type of table ${tbl.getDbName}:${tbl.getTableName} is ${tbl.getTableType}." +
          " Only external Delta tables can be read in Hive right now")
    }

    if (tbl.getPartitionKeysSize > 0) {
      throw new MetaException(
        s"Found partition columns " +
          s"(${tbl.getPartitionKeys.asScala.map(_.getName).mkString(",")}) in table " +
          s"${tbl.getDbName}:${tbl.getTableName}. The partition columns in a Delta table " +
          s"will be read from its own metadata and should not be set manually.")
    }

    val deltaRootString = tbl.getSd.getLocation
    if (deltaRootString == null || deltaRootString.trim.isEmpty) {
      throw new MetaException("table location should be set when creating a Delta table")
    }

    val snapshot = DeltaHelper.loadDeltaLatestSnapshot(getConf, new Path(deltaRootString))
    // Extract the table schema in Hive to compare it with the latest table schema in the Delta
    // log, and fail the query if they don't match.
    val cols = tbl.getSd.getCols
    val columnNames = new JArrayList[String](cols.size)
    val columnTypes = new JArrayList[TypeInfo](cols.size)
    cols.asScala.foreach { col =>
      columnNames.add(col.getName)
      columnTypes.add(TypeInfoUtils.getTypeInfoFromTypeString(col.getType))
    }
    val hiveSchema = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes)
      .asInstanceOf[StructTypeInfo]
    DeltaHelper.checkTableSchema(snapshot.getMetadata.getSchema, hiveSchema)
    tbl.getParameters.put("spark.sql.sources.provider", "DELTA")
    tbl.getSd.getSerdeInfo.getParameters.put("path", deltaRootString)
  }
  override def rollbackCreateTable(table: Table): Unit = {
    // We don't change the Delta table on the file system. Nothing to do
  }

  override def commitCreateTable(table: Table): Unit = {
    // Nothing to do
  }

  override def preDropTable(table: Table): Unit = {
    // Nothing to do
  }

  override def rollbackDropTable(table: Table): Unit = {
    // Nothing to do
  }

  override def commitDropTable(table: Table, b: Boolean): Unit = {
    // Nothing to do
  }
}

object DeltaStorageHandler {

  /**
   * The key used to pass the Delta table path into the `JobConf` so that `DeltaLog` can be
   * accessed everywhere.
   */
  val DELTA_TABLE_PATH = "delta.table.path"

  /**
   * The key used to pass the Hive table schema into the `JobConf` so that it can be accessed
   * everywhere.
   */
  val DELTA_TABLE_SCHEMA = "delta.table.schema"
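
  // Comparison UDFs accepted by the predicate analyzer for partition columns. They correspond to
  // the Hive operators =, >=, <=, <, >, <>, <=> (null-safe equals), LIKE and IN.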
  val SUPPORTED_PUSH_DOWN_UDFS = Array(
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotEqual",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualNS",
    "org.apache.hadoop.hive.ql.udf.UDFLike",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn"
  )
}