Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.catalyst.plans.logical.statsEstimation
import scala.collection.immutable.HashSet
import scala.collection.mutable
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral}
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._
import org.apache.spark.sql.types._
case class FilterEstimation(plan: Filter) extends Logging {
private val childStats = plan.child.stats
private val colStatsMap = ColumnStatsMap(childStats.attributeStats)
/**
* Returns an option of Statistics for a Filter logical plan node.
* For a given compound expression condition, this method computes filter selectivity
* (or the percentage of rows meeting the filter condition), which
* is used to compute row count, size in bytes, and the updated statistics after a given
* predicated is applied.
*
* @return Option[Statistics] When there is no statistics collected, it returns None.
*/
def estimate: Option[Statistics] = {
if (childStats.rowCount.isEmpty) return None
// Estimate selectivity of this filter predicate, and update column stats if needed.
// For not-supported condition, set filter selectivity to a conservative estimate 100%
val filterSelectivity = calculateFilterSelectivity(plan.condition).getOrElse(1.0)
val filteredRowCount: BigInt = ceil(BigDecimal(childStats.rowCount.get) * filterSelectivity)
val newColStats = if (filteredRowCount == 0) {
// The output is empty, we don't need to keep column stats.
AttributeMap[ColumnStat](Nil)
} else {
colStatsMap.outputColumnStats(rowsBeforeFilter = childStats.rowCount.get,
rowsAfterFilter = filteredRowCount)
}
val filteredSizeInBytes: BigInt = getOutputSize(plan.output, filteredRowCount, newColStats)
Some(childStats.copy(sizeInBytes = filteredSizeInBytes, rowCount = Some(filteredRowCount),
attributeStats = newColStats))
}
/**
* Returns a percentage of rows meeting a condition in Filter node.
* If it's a single condition, we calculate the percentage directly.
* If it's a compound condition, it is decomposed into multiple single conditions linked with
* AND, OR, NOT.
* For logical AND conditions, we need to update stats after a condition estimation
* so that the stats will be more accurate for subsequent estimation. This is needed for
* range condition such as (c > 40 AND c <= 50)
* For logical OR and NOT conditions, we do not update stats after a condition estimation.
*
* @param condition the compound logical expression
* @param update a boolean flag to specify if we need to update ColumnStat of a column
* for subsequent conditions
* @return an optional double value to show the percentage of rows meeting a given condition.
* It returns None if the condition is not supported.
*/
def calculateFilterSelectivity(condition: Expression, update: Boolean = true): Option[Double] = {
condition match {
case And(cond1, cond2) =>
val percent1 = calculateFilterSelectivity(cond1, update).getOrElse(1.0)
val percent2 = calculateFilterSelectivity(cond2, update).getOrElse(1.0)
Some(percent1 * percent2)
case Or(cond1, cond2) =>
val percent1 = calculateFilterSelectivity(cond1, update = false).getOrElse(1.0)
val percent2 = calculateFilterSelectivity(cond2, update = false).getOrElse(1.0)
Some(percent1 + percent2 - (percent1 * percent2))
// Not-operator pushdown
case Not(And(cond1, cond2)) =>
calculateFilterSelectivity(Or(Not(cond1), Not(cond2)), update = false)
// Not-operator pushdown
case Not(Or(cond1, cond2)) =>
calculateFilterSelectivity(And(Not(cond1), Not(cond2)), update = false)
// Collapse two consecutive Not operators which could be generated after Not-operator pushdown
case Not(Not(cond)) =>
calculateFilterSelectivity(cond, update = false)
// The foldable Not has been processed in the ConstantFolding rule
// This is a top-down traversal. The Not could be pushed down by the above two cases.
case Not(l @ Literal(null, _)) =>
calculateSingleCondition(l, update = false)
case Not(cond) =>
calculateFilterSelectivity(cond, update = false) match {
case Some(percent) => Some(1.0 - percent)
case None => None
}
case _ =>
calculateSingleCondition(condition, update)
}
}
/**
* Returns a percentage of rows meeting a single condition in Filter node.
* Currently we only support binary predicates where one side is a column,
* and the other is a literal.
*
* @param condition a single logical expression
* @param update a boolean flag to specify if we need to update ColumnStat of a column
* for subsequent conditions
* @return an optional double value to show the percentage of rows meeting a given condition.
* It returns None if the condition is not supported.
*/
def calculateSingleCondition(condition: Expression, update: Boolean): Option[Double] = {
condition match {
case l: Literal =>
evaluateLiteral(l)
// For evaluateBinary method, we assume the literal on the right side of an operator.
// So we will change the order if not.
// EqualTo/EqualNullSafe does not care about the order
case Equality(ar: Attribute, l: Literal) =>
evaluateEquality(ar, l, update)
case Equality(l: Literal, ar: Attribute) =>
evaluateEquality(ar, l, update)
case op @ LessThan(ar: Attribute, l: Literal) =>
evaluateBinary(op, ar, l, update)
case op @ LessThan(l: Literal, ar: Attribute) =>
evaluateBinary(GreaterThan(ar, l), ar, l, update)
case op @ LessThanOrEqual(ar: Attribute, l: Literal) =>
evaluateBinary(op, ar, l, update)
case op @ LessThanOrEqual(l: Literal, ar: Attribute) =>
evaluateBinary(GreaterThanOrEqual(ar, l), ar, l, update)
case op @ GreaterThan(ar: Attribute, l: Literal) =>
evaluateBinary(op, ar, l, update)
case op @ GreaterThan(l: Literal, ar: Attribute) =>
evaluateBinary(LessThan(ar, l), ar, l, update)
case op @ GreaterThanOrEqual(ar: Attribute, l: Literal) =>
evaluateBinary(op, ar, l, update)
case op @ GreaterThanOrEqual(l: Literal, ar: Attribute) =>
evaluateBinary(LessThanOrEqual(ar, l), ar, l, update)
case In(ar: Attribute, expList)
if expList.forall(e => e.isInstanceOf[Literal]) =>
// Expression [In (value, seq[Literal])] will be replaced with optimized version
// [InSet (value, HashSet[Literal])] in Optimizer, but only for list.size > 10.
// Here we convert In into InSet anyway, because they share the same processing logic.
val hSet = expList.map(e => e.eval())
evaluateInSet(ar, HashSet() ++ hSet, update)
case InSet(ar: Attribute, set) =>
evaluateInSet(ar, set, update)
// In current stage, we don't have advanced statistics such as sketches or histograms.
// As a result, some operator can't estimate `nullCount` accurately. E.g. left outer join
// estimation does not accurately update `nullCount` currently.
// So for IsNull and IsNotNull predicates, we only estimate them when the child is a leaf
// node, whose `nullCount` is accurate.
// This is a limitation due to lack of advanced stats. We should remove it in the future.
case IsNull(ar: Attribute) if plan.child.isInstanceOf[LeafNode] =>
evaluateNullCheck(ar, isNull = true, update)
case IsNotNull(ar: Attribute) if plan.child.isInstanceOf[LeafNode] =>
evaluateNullCheck(ar, isNull = false, update)
case op @ Equality(attrLeft: Attribute, attrRight: Attribute) =>
evaluateBinaryForTwoColumns(op, attrLeft, attrRight, update)
case op @ LessThan(attrLeft: Attribute, attrRight: Attribute) =>
evaluateBinaryForTwoColumns(op, attrLeft, attrRight, update)
case op @ LessThanOrEqual(attrLeft: Attribute, attrRight: Attribute) =>
evaluateBinaryForTwoColumns(op, attrLeft, attrRight, update)
case op @ GreaterThan(attrLeft: Attribute, attrRight: Attribute) =>
evaluateBinaryForTwoColumns(op, attrLeft, attrRight, update)
case op @ GreaterThanOrEqual(attrLeft: Attribute, attrRight: Attribute) =>
evaluateBinaryForTwoColumns(op, attrLeft, attrRight, update)
case _ =>
// TODO: it's difficult to support string operators without advanced statistics.
// Hence, these string operators Like(_, _) | Contains(_, _) | StartsWith(_, _)
// | EndsWith(_, _) are not supported yet
logDebug("[CBO] Unsupported filter condition: " + condition)
None
}
}
/**
* Returns a percentage of rows meeting "IS NULL" or "IS NOT NULL" condition.
*
* @param attr an Attribute (or a column)
* @param isNull set to true for "IS NULL" condition. set to false for "IS NOT NULL" condition
* @param update a boolean flag to specify if we need to update ColumnStat of a given column
* for subsequent conditions
* @return an optional double value to show the percentage of rows meeting a given condition
* It returns None if no statistics collected for a given column.
*/
def evaluateNullCheck(
attr: Attribute,
isNull: Boolean,
update: Boolean): Option[Double] = {
if (!colStatsMap.contains(attr) || !colStatsMap(attr).hasCountStats) {
logDebug("[CBO] No statistics for " + attr)
return None
}
val colStat = colStatsMap(attr)
val rowCountValue = childStats.rowCount.get
val nullPercent: Double = if (rowCountValue == 0) {
0
} else {
(BigDecimal(colStat.nullCount.get) / BigDecimal(rowCountValue)).toDouble
}
if (update) {
val newStats = if (isNull) {
colStat.copy(distinctCount = Some(0), min = None, max = None)
} else {
colStat.copy(nullCount = Some(0))
}
colStatsMap.update(attr, newStats)
}
val percent = if (isNull) {
nullPercent
} else {
1.0 - nullPercent
}
Some(percent)
}
/**
* Returns a percentage of rows meeting a binary comparison expression.
*
* @param op a binary comparison operator such as =, <, <=, >, >=
* @param attr an Attribute (or a column)
* @param literal a literal value (or constant)
* @param update a boolean flag to specify if we need to update ColumnStat of a given column
* for subsequent conditions
* @return an optional double value to show the percentage of rows meeting a given condition
* It returns None if no statistics exists for a given column or wrong value.
*/
def evaluateBinary(
op: BinaryComparison,
attr: Attribute,
literal: Literal,
update: Boolean): Option[Double] = {
if (!colStatsMap.contains(attr)) {
logDebug("[CBO] No statistics for " + attr)
return None
}
attr.dataType match {
case _: NumericType | DateType | TimestampType | BooleanType =>
evaluateBinaryForNumeric(op, attr, literal, update)
case StringType | BinaryType =>
// TODO: It is difficult to support other binary comparisons for String/Binary
// type without min/max and advanced statistics like histogram.
logDebug("[CBO] No range comparison statistics for String/Binary type " + attr)
None
}
}
/**
* Returns a percentage of rows meeting an equality (=) expression.
* This method evaluates the equality predicate for all data types.
*
* For EqualNullSafe (<=>), if the literal is not null, result will be the same as EqualTo;
* if the literal is null, the condition will be changed to IsNull after optimization.
* So we don't need specific logic for EqualNullSafe here.
*
* @param attr an Attribute (or a column)
* @param literal a literal value (or constant)
* @param update a boolean flag to specify if we need to update ColumnStat of a given column
* for subsequent conditions
* @return an optional double value to show the percentage of rows meeting a given condition
*/
def evaluateEquality(
attr: Attribute,
literal: Literal,
update: Boolean): Option[Double] = {
if (!colStatsMap.contains(attr)) {
logDebug("[CBO] No statistics for " + attr)
return None
}
val colStat = colStatsMap(attr)
// decide if the value is in [min, max] of the column.
// We currently don't store min/max for binary/string type.
// Hence, we assume it is in boundary for binary/string type.
val statsInterval = ValueInterval(colStat.min, colStat.max, attr.dataType)
if (statsInterval.contains(literal)) {
if (update) {
// We update ColumnStat structure after apply this equality predicate:
// Set distinctCount to 1, nullCount to 0, and min/max values (if exist) to the literal
// value.
val newStats = attr.dataType match {
case StringType | BinaryType =>
colStat.copy(distinctCount = Some(1), nullCount = Some(0))
case _ =>
colStat.copy(distinctCount = Some(1), min = Some(literal.value),
max = Some(literal.value), nullCount = Some(0))
}
colStatsMap.update(attr, newStats)
}
if (colStat.histogram.isEmpty) {
if (!colStat.distinctCount.isEmpty) {
// returns 1/ndv if there is no histogram
Some(1.0 / colStat.distinctCount.get.toDouble)
} else {
None
}
} else {
Some(computeEqualityPossibilityByHistogram(literal, colStat))
}
} else { // not in interval
Some(0.0)
}
}
/**
* Returns a percentage of rows meeting a Literal expression.
* This method evaluates all the possible literal cases in Filter.
*
* FalseLiteral and TrueLiteral should be eliminated by optimizer, but null literal might be added
* by optimizer rule NullPropagation. For safety, we handle all the cases here.
*
* @param literal a literal value (or constant)
* @return an optional double value to show the percentage of rows meeting a given condition
*/
def evaluateLiteral(literal: Literal): Option[Double] = {
literal match {
case Literal(null, _) => Some(0.0)
case FalseLiteral => Some(0.0)
case TrueLiteral => Some(1.0)
// Ideally, we should not hit the following branch
case _ => None
}
}
/**
* Returns a percentage of rows meeting "IN" operator expression.
* This method evaluates the equality predicate for all data types.
*
* @param attr an Attribute (or a column)
* @param hSet a set of literal values
* @param update a boolean flag to specify if we need to update ColumnStat of a given column
* for subsequent conditions
* @return an optional double value to show the percentage of rows meeting a given condition
* It returns None if no statistics exists for a given column.
*/
def evaluateInSet(
attr: Attribute,
hSet: Set[Any],
update: Boolean): Option[Double] = {
if (!colStatsMap.hasDistinctCount(attr)) {
logDebug("[CBO] No statistics for " + attr)
return None
}
val colStat = colStatsMap(attr)
val ndv = colStat.distinctCount.get
val dataType = attr.dataType
var newNdv = ndv
// use [min, max] to filter the original hSet
dataType match {
case _: NumericType | BooleanType | DateType | TimestampType =>
if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty) {
return Some(0.0)
}
val statsInterval =
ValueInterval(colStat.min, colStat.max, dataType).asInstanceOf[NumericValueInterval]
val validQuerySet = hSet.filter { v =>
v != null && statsInterval.contains(Literal(v, dataType))
}
if (validQuerySet.isEmpty) {
return Some(0.0)
}
val newMax = validQuerySet.maxBy(EstimationUtils.toDouble(_, dataType))
val newMin = validQuerySet.minBy(EstimationUtils.toDouble(_, dataType))
// newNdv should not be greater than the old ndv. For example, column has only 2 values
// 1 and 6. The predicate column IN (1, 2, 3, 4, 5). validQuerySet.size is 5.
newNdv = ndv.min(BigInt(validQuerySet.size))
if (update) {
val newStats = colStat.copy(distinctCount = Some(newNdv), min = Some(newMin),
max = Some(newMax), nullCount = Some(0))
colStatsMap.update(attr, newStats)
}
// We assume the whole set since there is no min/max information for String/Binary type
case StringType | BinaryType =>
if (ndv.toDouble == 0) {
return Some(0.0)
}
newNdv = ndv.min(BigInt(hSet.size))
if (update) {
val newStats = colStat.copy(distinctCount = Some(newNdv), nullCount = Some(0))
colStatsMap.update(attr, newStats)
}
}
// return the filter selectivity. Without advanced statistics such as histograms,
// we have to assume uniform distribution.
Some(math.min(newNdv.toDouble / ndv.toDouble, 1.0))
}
/**
* Returns a percentage of rows meeting a binary comparison expression.
* This method evaluate expression for Numeric/Date/Timestamp/Boolean columns.
*
* @param op a binary comparison operator such as =, <, <=, >, >=
* @param attr an Attribute (or a column)
* @param literal a literal value (or constant)
* @param update a boolean flag to specify if we need to update ColumnStat of a given column
* for subsequent conditions
* @return an optional double value to show the percentage of rows meeting a given condition
*/
def evaluateBinaryForNumeric(
op: BinaryComparison,
attr: Attribute,
literal: Literal,
update: Boolean): Option[Double] = {
if (!colStatsMap.hasMinMaxStats(attr) || !colStatsMap.hasDistinctCount(attr)) {
logDebug("[CBO] No statistics for " + attr)
return None
}
val colStat = colStatsMap(attr)
val statsInterval =
ValueInterval(colStat.min, colStat.max, attr.dataType).asInstanceOf[NumericValueInterval]
val max = statsInterval.max
val min = statsInterval.min
val ndv = colStat.distinctCount.get.toDouble
// determine the overlapping degree between predicate interval and column's interval
val numericLiteral = EstimationUtils.toDouble(literal.value, literal.dataType)
val (noOverlap: Boolean, completeOverlap: Boolean) = op match {
case _: LessThan =>
(numericLiteral <= min, numericLiteral > max)
case _: LessThanOrEqual =>
(numericLiteral < min, numericLiteral >= max)
case _: GreaterThan =>
(numericLiteral >= max, numericLiteral < min)
case _: GreaterThanOrEqual =>
(numericLiteral > max, numericLiteral <= min)
}
var percent = 1.0
if (noOverlap) {
percent = 0.0
} else if (completeOverlap) {
percent = 1.0
} else {
// This is the partial overlap case:
if (colStat.histogram.isEmpty) {
// Without advanced statistics like histogram, we assume uniform data distribution.
// We just prorate the adjusted range over the initial range to compute filter selectivity.
assert(max > min)
percent = op match {
case _: LessThan =>
if (numericLiteral == max) {
// If the literal value is right on the boundary, we can minus the part of the
// boundary value (1/ndv).
1.0 - 1.0 / ndv
} else {
(numericLiteral - min) / (max - min)
}
case _: LessThanOrEqual =>
if (numericLiteral == min) {
// The boundary value is the only satisfying value.
1.0 / ndv
} else {
(numericLiteral - min) / (max - min)
}
case _: GreaterThan =>
if (numericLiteral == min) {
1.0 - 1.0 / ndv
} else {
(max - numericLiteral) / (max - min)
}
case _: GreaterThanOrEqual =>
if (numericLiteral == max) {
1.0 / ndv
} else {
(max - numericLiteral) / (max - min)
}
}
} else {
percent = computeComparisonPossibilityByHistogram(op, literal, colStat)
}
if (update) {
val newValue = Some(literal.value)
var newMax = colStat.max
var newMin = colStat.min
op match {
case _: GreaterThan | _: GreaterThanOrEqual =>
newMin = newValue
case _: LessThan | _: LessThanOrEqual =>
newMax = newValue
}
val newStats = colStat.copy(distinctCount = Some(ceil(ndv * percent)),
min = newMin, max = newMax, nullCount = Some(0))
colStatsMap.update(attr, newStats)
}
}
Some(percent)
}
/**
* Computes the possibility of an equality predicate using histogram.
*/
private def computeEqualityPossibilityByHistogram(
literal: Literal, colStat: ColumnStat): Double = {
val datum = EstimationUtils.toDouble(literal.value, literal.dataType)
val histogram = colStat.histogram.get
// find bins where column's current min and max locate. Note that a column's [min, max]
// range may change due to another condition applied earlier.
val min = EstimationUtils.toDouble(colStat.min.get, literal.dataType)
val max = EstimationUtils.toDouble(colStat.max.get, literal.dataType)
// compute how many bins the column's current valid range [min, max] occupies.
val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
upperBound = max,
upperBoundInclusive = true,
lowerBound = min,
lowerBoundInclusive = true,
histogram.bins)
val numBinsHoldingDatum = EstimationUtils.numBinsHoldingRange(
upperBound = datum,
upperBoundInclusive = true,
lowerBound = datum,
lowerBoundInclusive = true,
histogram.bins)
numBinsHoldingDatum / numBinsHoldingEntireRange
}
/**
* Computes the possibility of a comparison predicate using histogram.
*/
private def computeComparisonPossibilityByHistogram(
op: BinaryComparison, literal: Literal, colStat: ColumnStat): Double = {
val datum = EstimationUtils.toDouble(literal.value, literal.dataType)
val histogram = colStat.histogram.get
// find bins where column's current min and max locate. Note that a column's [min, max]
// range may change due to another condition applied earlier.
val min = EstimationUtils.toDouble(colStat.min.get, literal.dataType)
val max = EstimationUtils.toDouble(colStat.max.get, literal.dataType)
// compute how many bins the column's current valid range [min, max] occupies.
val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
max, upperBoundInclusive = true, min, lowerBoundInclusive = true, histogram.bins)
val numBinsHoldingRange = op match {
// LessThan and LessThanOrEqual share the same logic, the only difference is whether to
// include the upperBound in the range.
case _: LessThan =>
EstimationUtils.numBinsHoldingRange(
upperBound = datum,
upperBoundInclusive = false,
lowerBound = min,
lowerBoundInclusive = true,
histogram.bins)
case _: LessThanOrEqual =>
EstimationUtils.numBinsHoldingRange(
upperBound = datum,
upperBoundInclusive = true,
lowerBound = min,
lowerBoundInclusive = true,
histogram.bins)
// GreaterThan and GreaterThanOrEqual share the same logic, the only difference is whether to
// include the lowerBound in the range.
case _: GreaterThan =>
EstimationUtils.numBinsHoldingRange(
upperBound = max,
upperBoundInclusive = true,
lowerBound = datum,
lowerBoundInclusive = false,
histogram.bins)
case _: GreaterThanOrEqual =>
EstimationUtils.numBinsHoldingRange(
upperBound = max,
upperBoundInclusive = true,
lowerBound = datum,
lowerBoundInclusive = true,
histogram.bins)
}
numBinsHoldingRange / numBinsHoldingEntireRange
}
/**
* Returns a percentage of rows meeting a binary comparison expression containing two columns.
* In SQL queries, we also see predicate expressions involving two columns
* such as "column-1 (op) column-2" where column-1 and column-2 belong to same table.
* Note that, if column-1 and column-2 belong to different tables, then it is a join
* operator's work, NOT a filter operator's work.
*
* @param op a binary comparison operator, including =, <=>, <, <=, >, >=
* @param attrLeft the left Attribute (or a column)
* @param attrRight the right Attribute (or a column)
* @param update a boolean flag to specify if we need to update ColumnStat of the given columns
* for subsequent conditions
* @return an optional double value to show the percentage of rows meeting a given condition
*/
def evaluateBinaryForTwoColumns(
op: BinaryComparison,
attrLeft: Attribute,
attrRight: Attribute,
update: Boolean): Option[Double] = {
if (!colStatsMap.hasCountStats(attrLeft)) {
logDebug("[CBO] No statistics for " + attrLeft)
return None
}
if (!colStatsMap.hasCountStats(attrRight)) {
logDebug("[CBO] No statistics for " + attrRight)
return None
}
attrLeft.dataType match {
case StringType | BinaryType =>
// TODO: It is difficult to support other binary comparisons for String/Binary
// type without min/max and advanced statistics like histogram.
logDebug("[CBO] No range comparison statistics for String/Binary type " + attrLeft)
return None
case _ =>
if (!colStatsMap.hasMinMaxStats(attrLeft)) {
logDebug("[CBO] No min/max statistics for " + attrLeft)
return None
}
if (!colStatsMap.hasMinMaxStats(attrRight)) {
logDebug("[CBO] No min/max statistics for " + attrRight)
return None
}
}
val colStatLeft = colStatsMap(attrLeft)
val statsIntervalLeft = ValueInterval(colStatLeft.min, colStatLeft.max, attrLeft.dataType)
.asInstanceOf[NumericValueInterval]
val maxLeft = statsIntervalLeft.max
val minLeft = statsIntervalLeft.min
val colStatRight = colStatsMap(attrRight)
val statsIntervalRight = ValueInterval(colStatRight.min, colStatRight.max, attrRight.dataType)
.asInstanceOf[NumericValueInterval]
val maxRight = statsIntervalRight.max
val minRight = statsIntervalRight.min
// determine the overlapping degree between predicate interval and column's interval
val allNotNull = (colStatLeft.nullCount.get == 0) && (colStatRight.nullCount.get == 0)
val (noOverlap: Boolean, completeOverlap: Boolean) = op match {
// Left < Right or Left <= Right
// - no overlap:
// minRight maxRight minLeft maxLeft
// --------+------------------+------------+-------------+------->
// - complete overlap: (If null values exists, we set it to partial overlap.)
// minLeft maxLeft minRight maxRight
// --------+------------------+------------+-------------+------->
case _: LessThan =>
(minLeft >= maxRight, (maxLeft < minRight) && allNotNull)
case _: LessThanOrEqual =>
(minLeft > maxRight, (maxLeft <= minRight) && allNotNull)
// Left > Right or Left >= Right
// - no overlap:
// minLeft maxLeft minRight maxRight
// --------+------------------+------------+-------------+------->
// - complete overlap: (If null values exists, we set it to partial overlap.)
// minRight maxRight minLeft maxLeft
// --------+------------------+------------+-------------+------->
case _: GreaterThan =>
(maxLeft <= minRight, (minLeft > maxRight) && allNotNull)
case _: GreaterThanOrEqual =>
(maxLeft < minRight, (minLeft >= maxRight) && allNotNull)
// Left = Right or Left <=> Right
// - no overlap:
// minLeft maxLeft minRight maxRight
// --------+------------------+------------+-------------+------->
// minRight maxRight minLeft maxLeft
// --------+------------------+------------+-------------+------->
// - complete overlap:
// minLeft maxLeft
// minRight maxRight
// --------+------------------+------->
case _: EqualTo =>
((maxLeft < minRight) || (maxRight < minLeft),
(minLeft == minRight) && (maxLeft == maxRight) && allNotNull
&& (colStatLeft.distinctCount.get == colStatRight.distinctCount.get)
)
case _: EqualNullSafe =>
// For null-safe equality, we use a very restrictive condition to evaluate its overlap.
// If null values exists, we set it to partial overlap.
(((maxLeft < minRight) || (maxRight < minLeft)) && allNotNull,
(minLeft == minRight) && (maxLeft == maxRight) && allNotNull
&& (colStatLeft.distinctCount.get == colStatRight.distinctCount.get)
)
}
var percent = 1.0
if (noOverlap) {
percent = 0.0
} else if (completeOverlap) {
percent = 1.0
} else {
// For partial overlap, we use an empirical value 1/3 as suggested by the book
// "Database Systems, the complete book".
percent = 1.0 / 3.0
if (update) {
// Need to adjust new min/max after the filter condition is applied
val ndvLeft = BigDecimal(colStatLeft.distinctCount.get)
val newNdvLeft = ceil(ndvLeft * percent)
val ndvRight = BigDecimal(colStatRight.distinctCount.get)
val newNdvRight = ceil(ndvRight * percent)
var newMaxLeft = colStatLeft.max
var newMinLeft = colStatLeft.min
var newMaxRight = colStatRight.max
var newMinRight = colStatRight.min
op match {
case _: LessThan | _: LessThanOrEqual =>
// the left side should be less than the right side.
// If not, we need to adjust it to narrow the range.
// Left < Right or Left <= Right
// minRight < minLeft
// --------+******************+------->
// filtered ^
// |
// newMinRight
//
// maxRight < maxLeft
// --------+******************+------->
// ^ filtered
// |
// newMaxLeft
if (minLeft > minRight) newMinRight = colStatLeft.min
if (maxLeft > maxRight) newMaxLeft = colStatRight.max
case _: GreaterThan | _: GreaterThanOrEqual =>
// the left side should be greater than the right side.
// If not, we need to adjust it to narrow the range.
// Left > Right or Left >= Right
// minLeft < minRight
// --------+******************+------->
// filtered ^
// |
// newMinLeft
//
// maxLeft < maxRight
// --------+******************+------->
// ^ filtered
// |
// newMaxRight
if (minLeft < minRight) newMinLeft = colStatRight.min
if (maxLeft < maxRight) newMaxRight = colStatLeft.max
case _: EqualTo | _: EqualNullSafe =>
// need to set new min to the larger min value, and
// set the new max to the smaller max value.
// Left = Right or Left <=> Right
// minLeft < minRight
// --------+******************+------->
// filtered ^
// |
// newMinLeft
//
// minRight <= minLeft
// --------+******************+------->
// filtered ^
// |
// newMinRight
//
// maxLeft < maxRight
// --------+******************+------->
// ^ filtered
// |
// newMaxRight
//
// maxRight <= maxLeft
// --------+******************+------->
// ^ filtered
// |
// newMaxLeft
if (minLeft < minRight) {
newMinLeft = colStatRight.min
} else {
newMinRight = colStatLeft.min
}
if (maxLeft < maxRight) {
newMaxRight = colStatLeft.max
} else {
newMaxLeft = colStatRight.max
}
}
val newStatsLeft = colStatLeft.copy(distinctCount = Some(newNdvLeft), min = newMinLeft,
max = newMaxLeft)
colStatsMap(attrLeft) = newStatsLeft
val newStatsRight = colStatRight.copy(distinctCount = Some(newNdvRight), min = newMinRight,
max = newMaxRight)
colStatsMap(attrRight) = newStatsRight
}
}
Some(percent)
}
}
/**
* This class contains the original column stats from child, and maintains the updated column stats.
* We will update the corresponding ColumnStats for a column after we apply a predicate condition.
* For example, column c has [min, max] value as [0, 100]. In a range condition such as
* (c > 40 AND c <= 50), we need to set the column's [min, max] value to [40, 100] after we
* evaluate the first condition c > 40. We also need to set the column's [min, max] value to
* [40, 50] after we evaluate the second condition c <= 50.
*
* @param originalMap Original column stats from child.
*/
case class ColumnStatsMap(originalMap: AttributeMap[ColumnStat]) {
/** This map maintains the latest column stats. */
private val updatedMap: mutable.Map[ExprId, (Attribute, ColumnStat)] = mutable.HashMap.empty
def contains(a: Attribute): Boolean = updatedMap.contains(a.exprId) || originalMap.contains(a)
/**
* Gets an Option of column stat for the given attribute.
* Prefer the column stat in updatedMap than that in originalMap,
* because updatedMap has the latest (updated) column stats.
*/
def get(a: Attribute): Option[ColumnStat] = {
if (updatedMap.contains(a.exprId)) {
updatedMap.get(a.exprId).map(_._2)
} else {
originalMap.get(a)
}
}
def hasCountStats(a: Attribute): Boolean =
get(a).exists(_.hasCountStats)
def hasDistinctCount(a: Attribute): Boolean =
get(a).exists(_.distinctCount.isDefined)
def hasMinMaxStats(a: Attribute): Boolean =
get(a).exists(_.hasMinMaxStats)
/**
* Gets column stat for the given attribute. Prefer the column stat in updatedMap than that in
* originalMap, because updatedMap has the latest (updated) column stats.
*/
def apply(a: Attribute): ColumnStat = {
get(a).get
}
/** Updates column stats in updatedMap. */
def update(a: Attribute, stats: ColumnStat): Unit = updatedMap.update(a.exprId, a -> stats)
/**
* Collects updated column stats, and scales down ndv for other column stats if the number of rows
* decreases after this Filter operator.
*/
def outputColumnStats(rowsBeforeFilter: BigInt, rowsAfterFilter: BigInt)
: AttributeMap[ColumnStat] = {
val newColumnStats = originalMap.map { case (attr, oriColStat) =>
val colStat = updatedMap.get(attr.exprId).map(_._2).getOrElse(oriColStat)
val newNdv = if (colStat.distinctCount.isEmpty) {
// No NDV in the original stats.
None
} else if (colStat.distinctCount.get > 1) {
// Update ndv based on the overall filter selectivity: scale down ndv if the number of rows
// decreases; otherwise keep it unchanged.
Some(EstimationUtils.updateNdv(oldNumRows = rowsBeforeFilter,
newNumRows = rowsAfterFilter, oldNdv = oriColStat.distinctCount.get))
} else {
// no need to scale down since it is already down to 1 (for skewed distribution case)
colStat.distinctCount
}
attr -> colStat.copy(distinctCount = newNdv)
}
AttributeMap(newColumnStats.toSeq)
}
}