/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.execution

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.catalog.{HiveTableRelation, SessionCatalog}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
import org.apache.spark.sql.internal.SQLConf

/**
 * This rule optimizes the execution of queries that can be answered by looking only at
 * partition-level metadata. It applies when all the scanned columns are partition columns and
 * the query has an aggregate operator in which every aggregate expression satisfies one of the
 * following conditions:
 * 1. the aggregate expression consists only of partition columns,
 *    e.g. SELECT col FROM tbl GROUP BY col.
 * 2. the aggregate function is applied to partition columns with the DISTINCT keyword,
 *    e.g. SELECT col1, count(DISTINCT col2) FROM tbl GROUP BY col1.
 * 3. the aggregate function is applied to partition columns and returns the same result with or
 *    without the DISTINCT keyword, e.g. SELECT col1, Max(col2) FROM tbl GROUP BY col1.
 */
case class OptimizeMetadataOnlyQuery(catalog: SessionCatalog) extends Rule[LogicalPlan] {

def apply(plan: LogicalPlan): LogicalPlan = {
if (!SQLConf.get.optimizerMetadataOnly) {
return plan
}
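
    // Look for an aggregate whose child is (possibly beneath deterministic projects and filters)
    // a partitioned table scan that reads only partition columns.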
plan.transform {
case a @ Aggregate(_, aggExprs, child @ PartitionedRelation(partAttrs, relation)) =>
// We only apply this optimization when only partitioned attributes are scanned.
if (a.references.subsetOf(partAttrs)) {
val aggFunctions = aggExprs.flatMap(_.collect {
case agg: AggregateExpression => agg
})
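          // The metadata-only replacement contains one row per partition, not one row per data
          // row, so only aggregates that ignore duplicate values can be answered from it.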
val isAllDistinctAgg = aggFunctions.forall { agg =>
agg.isDistinct || (agg.aggregateFunction match {
              // `Max`, `Min`, `First` and `Last` are always distinct aggregate functions, whether
              // or not they are written with the DISTINCT keyword, as the result is the same.
case _: Max => true
case _: Min => true
case _: First => true
case _: Last => true
case _ => false
})
}
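          // Only rewrite when every aggregate is duplicate-insensitive; otherwise the
          // de-duplicated partition values would give a wrong answer.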
if (isAllDistinctAgg) {
a.withNewChildren(Seq(replaceTableScanWithPartitionMetadata(child, relation)))
} else {
a
}
} else {
a
}
}
}

  /**
   * Returns the partition attributes of the table relation plan.
   */
private def getPartitionAttrs(
partitionColumnNames: Seq[String],
relation: LogicalPlan): Seq[Attribute] = {
val partColumns = partitionColumnNames.map(_.toLowerCase).toSet
relation.output.filter(a => partColumns.contains(a.name.toLowerCase))
}

  /**
   * Transforms the given plan, finds the table scan node that matches the given relation, and
   * replaces that node with a local relation holding the corresponding partition values.
   */
private def replaceTableScanWithPartitionMetadata(
child: LogicalPlan,
relation: LogicalPlan): LogicalPlan = {
child transform {
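      // Use reference equality so that only the exact relation node found by
      // `PartitionedRelation` is replaced.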
case plan if plan eq relation =>
relation match {
case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _, isStreaming) =>
val partAttrs = getPartitionAttrs(fsRelation.partitionSchema.map(_.name), l)
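            // With no partition or data filters, `listFiles` returns one entry per partition;
            // each entry's `values` row holds that partition's column values.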
val partitionData = fsRelation.location.listFiles(Nil, Nil)
LocalRelation(partAttrs, partitionData.map(_.values), isStreaming)
case relation: HiveTableRelation =>
val partAttrs = getPartitionAttrs(relation.tableMeta.partitionColumnNames, relation)
val caseInsensitiveProperties =
CaseInsensitiveMap(relation.tableMeta.storage.properties)
val timeZoneId = caseInsensitiveProperties.get(DateTimeUtils.TIMEZONE_OPTION)
.getOrElse(SQLConf.get.sessionLocalTimeZone)
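            // The metastore returns partition values as strings, so cast each value to the
            // attribute's data type, honoring the table's time zone for temporal types.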
val partitionData = catalog.listPartitions(relation.tableMeta.identifier).map { p =>
InternalRow.fromSeq(partAttrs.map { attr =>
Cast(Literal(p.spec(attr.name)), attr.dataType, Option(timeZoneId)).eval()
})
}
LocalRelation(partAttrs, partitionData)
case _ =>
throw new IllegalStateException(s"unrecognized table scan node: $relation, " +
s"please turn off ${SQLConf.OPTIMIZER_METADATA_ONLY.key} and try again.")
}
}
}

  /**
   * A pattern that finds the partitioned table relation node inside the given plan, and returns
   * a pair of the partition attributes and the table relation node.
   *
   * It keeps traversing down the given plan tree as long as it encounters a [[Project]] or
   * [[Filter]] with deterministic expressions, and returns the pair once it reaches the
   * partitioned table relation node.
   */
object PartitionedRelation {
def unapply(plan: LogicalPlan): Option[(AttributeSet, LogicalPlan)] = plan match {
case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _, _)
if fsRelation.partitionSchema.nonEmpty =>
val partAttrs = getPartitionAttrs(fsRelation.partitionSchema.map(_.name), l)
Some((AttributeSet(partAttrs), l))
case relation: HiveTableRelation if relation.tableMeta.partitionColumnNames.nonEmpty =>
val partAttrs = getPartitionAttrs(relation.tableMeta.partitionColumnNames, relation)
Some((AttributeSet(partAttrs), relation))
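      // Keep descending through deterministic projections that reference only partition
      // attributes; the projection's output becomes the new partition attribute set.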
case p @ Project(projectList, child) if projectList.forall(_.deterministic) =>
unapply(child).flatMap { case (partAttrs, relation) =>
if (p.references.subsetOf(partAttrs)) Some((p.outputSet, relation)) else None
}
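      // A deterministic filter over partition attributes does not change the attribute set.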
case f @ Filter(condition, child) if condition.deterministic =>
unapply(child).flatMap { case (partAttrs, relation) =>
if (f.references.subsetOf(partAttrs)) Some((partAttrs, relation)) else None
}
case _ => None
}
}
}
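
// A minimal usage sketch (illustrative only; the table and column names are hypothetical).
// With the metadata-only optimization enabled, a duplicate-insensitive aggregate over a
// partition column can be answered from catalog metadata without scanning any data files:
//
//   spark.conf.set(SQLConf.OPTIMIZER_METADATA_ONLY.key, "true")
//   spark.sql("SELECT MAX(part_col) FROM partitioned_tbl").show()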