All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.spark.sql.catalyst.plans.logical.statsEstimation.ValueInterval.scala Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.catalyst.plans.logical.statsEstimation

import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types._


/** Value range of a column. */
trait ValueInterval {
  def contains(l: Literal): Boolean
}

/** For simplicity we use double to unify operations of numeric intervals. */
case class NumericValueInterval(min: Double, max: Double) extends ValueInterval {
  override def contains(l: Literal): Boolean = {
    val lit = EstimationUtils.toDouble(l.value, l.dataType)
    min <= lit && max >= lit
  }
}

/**
 * This version of Spark does not have min/max for binary/string types, we define their default
 * behaviors by this class.
 */
class DefaultValueInterval extends ValueInterval {
  override def contains(l: Literal): Boolean = true
}

/** This is for columns with only null values. */
class NullValueInterval extends ValueInterval {
  override def contains(l: Literal): Boolean = false
}

object ValueInterval {
  def apply(
      min: Option[Any],
      max: Option[Any],
      dataType: DataType): ValueInterval = dataType match {
    case StringType | BinaryType => new DefaultValueInterval()
    case _ if min.isEmpty || max.isEmpty => new NullValueInterval()
    case _ =>
      NumericValueInterval(
        min = EstimationUtils.toDouble(min.get, dataType),
        max = EstimationUtils.toDouble(max.get, dataType))
  }

  def isIntersected(r1: ValueInterval, r2: ValueInterval): Boolean = (r1, r2) match {
    case (_, _: DefaultValueInterval) | (_: DefaultValueInterval, _) =>
      // The DefaultValueInterval represents string/binary types which do not have max/min stats,
      // we assume they are intersected to be conservative on estimation
      true
    case (_, _: NullValueInterval) | (_: NullValueInterval, _) =>
      false
    case (n1: NumericValueInterval, n2: NumericValueInterval) =>
      n1.min.compareTo(n2.max) <= 0 && n1.max.compareTo(n2.min) >= 0
    case _ =>
      throw new UnsupportedOperationException(s"Not supported pair: $r1, $r2 at isIntersected()")
  }

  /**
   * Intersected results of two intervals. This is only for two overlapped intervals.
   * The outputs are the intersected min/max values.
   */
  def intersect(r1: ValueInterval, r2: ValueInterval, dt: DataType): (Option[Any], Option[Any]) = {
    (r1, r2) match {
      case (_, _: DefaultValueInterval) | (_: DefaultValueInterval, _) =>
        // binary/string types don't support intersecting.
        (None, None)
      case (n1: NumericValueInterval, n2: NumericValueInterval) =>
        // Choose the maximum of two min values, and the minimum of two max values.
        val newMin = if (n1.min <= n2.min) n2.min else n1.min
        val newMax = if (n1.max <= n2.max) n1.max else n2.max
        (Some(EstimationUtils.fromDouble(newMin, dt)),
          Some(EstimationUtils.fromDouble(newMax, dt)))
      case _ =>
        throw new UnsupportedOperationException(s"Not supported pair: $r1, $r2 at intersect()")
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy