
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.sources.druid

import org.apache.spark.sql.SQLConf.SQLConfEntry
import org.apache.spark.sql.SQLConf.SQLConfEntry._
import org.apache.spark.sql.execution.{PhysicalRDD, SparkPlan}
import org.apache.spark.sql.{CachedTablePattern, SQLContext}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.sparklinedata.druid._
import org.sparklinedata.druid.client.ConnectionManager
/**
 * Attempts to rewrite a Spark [[LogicalPlan]] over Druid-backed relations into one or
 * more [[DruidQueryBuilder]]s by applying the registered [[DruidTransform]]s in sequence.
 */
class DruidPlanner private[druid](val sqlContext : SQLContext) extends DruidTransforms {

  val cacheTablePatternMatch = new CachedTablePattern(sqlContext)

  val joinGraphTransform =
    druidRelationTransformForJoin.debug("druidRelationTransformForJoin") or
      joinTransform.debug("join")

  val joinTransformWithDebug = joinTransform.debug("join")

  val transforms : Seq[DruidTransform] = Seq(
    druidRelationTransform.debug("druidRelationTransform") or
      joinTransformWithDebug,
    aggregateTransform.debug("aggregate"),
    limitTransform.debug("limit")
  )

  def plan(db : Seq[DruidQueryBuilder], plan: LogicalPlan): Seq[DruidQueryBuilder] = {
    transforms.view.flatMap(_(db, plan))
  }

}
object DruidPlanner {

  def apply(sqlContext : SQLContext) : DruidPlanner = {
    val dP = new DruidPlanner(sqlContext)
    ConnectionManager.init(sqlContext)
    dP
  }
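  // A minimal usage sketch (not part of the original source): build the planner through
  // the companion object so that the Druid ConnectionManager is initialized; `sqlContext`
  // and `logicalPlan` are assumed to be in scope.
  //
  //   val planner = DruidPlanner(sqlContext)
  //   val rewrites = planner.plan(Nil, logicalPlan)   // empty => no Druid rewrite found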
  val SPARKLINEDATA_CACHE_TABLES_TOCHECK = stringSeqConf(
    "spark.sparklinedata.druid.cache.tables.tocheck",
    defaultValue = Some(List()),
    doc = "A comma separated list of tableNames that should be checked if they are cached. " +
      "For Star-Schemas with associated Druid Indexes, even if tables are cached, we " +
      "attempt to rewrite the Query to Druid. In order to do this we need to replace an " +
      "[[InMemoryRelation]] operator with its underlying Logical Plan. This value tells " +
      "us to restrict our check to certain tables; otherwise, by default, we check " +
      "all tables.")
  val DEBUG_TRANSFORMATIONS = booleanConf("spark.sparklinedata.druid.debug.transformations",
    defaultValue = Some(false),
    doc = "When set to true each transformation is logged.")

  val TZ_ID = stringConf("spark.sparklinedata.tz.id",
    defaultValue = Some(org.joda.time.DateTimeZone.getDefault.getID),
    doc = "Specifies the TimeZone ID of the Spark session; " +
      "used by Druid for Date/TimeStamp transformations.")

  val DRUID_SELECT_QUERY_PAGESIZE = intConf("spark.sparklinedata.druid.selectquery.pagesize",
    defaultValue = Some(10000),
    doc = "Num. of rows fetched on each invocation of a Druid Select Query."
  )

  val DRUID_CONN_POOL_MAX_CONNECTIONS = intConf("spark.sparklinedata.druid.max.connections",
    defaultValue = Some(100),
    doc = "Max. number of Http Connections to the Druid Cluster."
  )

  val DRUID_CONN_POOL_MAX_CONNECTIONS_PER_ROUTE = intConf(
    "spark.sparklinedata.druid.max.connections.per.route",
    defaultValue = Some(20),
    doc = "Max. number of Http Connections to each server in the Druid Cluster."
  )

  val DRUID_QUERY_COST_MODEL_ENABLED = booleanConf(
    "spark.sparklinedata.druid.querycostmodel.enabled",
    defaultValue = Some(true),
    doc = "Flag that controls whether the decision to execute a druid query on the broker " +
      "or on historicals is based on the cost model; default is true. If false, " +
      "the decision is based on the 'queryHistoricalServers' and " +
      "'numSegmentsPerHistoricalQuery' datasource options."
  )
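  // A hypothetical sketch (not from the original source) of how the
  // 'queryHistoricalServers' / 'numSegmentsPerHistoricalQuery' datasource options might be
  // supplied when the cost model is disabled; the table name is made up and the remaining
  // OPTIONS of the Druid DataSource are elided:
  //
  //   sqlContext.sql(
  //     """CREATE TEMPORARY TABLE orderLineItemPartSupplier
  //       |USING org.sparklinedata.druid
  //       |OPTIONS (..., queryHistoricalServers "true", numSegmentsPerHistoricalQuery "2")
  //     """.stripMargin)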
  val DRUID_QUERY_COST_MODEL_HIST_MERGE_COST = doubleConf(
    "spark.sparklinedata.druid.querycostmodel.histMergeCostFactor",
    defaultValue = Some(0.07),
    doc = "Cost of performing a segment agg. merge in druid " +
      "historicals relative to spark shuffle cost."
  )

  val DRUID_QUERY_COST_MODEL_HIST_SEGMENTS_PERQUERY_LIMIT = intConf(
    "spark.sparklinedata.druid.querycostmodel.histSegsPerQueryLimit",
    defaultValue = Some(5),
    doc = "The max. number of segments processed per historical query."
  )

  val DRUID_QUERY_COST_MODEL_INTERVAL_SCALING_NDV = doubleConf(
    "spark.sparklinedata.druid.querycostmodel.queryintervalScalingForDistinctValues",
    defaultValue = Some(3),
    doc = "The ndv estimate for a query interval uses this number. The ratio of " +
      "the (query interval/index interval) is multiplied by this number. " +
      "The ndv for a query is estimated as: " +
      "'log(this_value * min(10*interval_ratio, 10)) * orig_ndv'. The reduction is " +
      "logarithmic and this value applies a further dampening factor on the reduction. " +
      "At the default value of '3', any interval ratio >= 0.33 will have no reduction in ndvs."
  )
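  // A worked example of the estimate described above, assuming a base-10 log and that the
  // scaling factor is capped at 1.0 (which is how "no reduction for ratio >= 0.33" reads):
  // with the default value 3 and interval_ratio = 0.1,
  //   log10(3 * min(10 * 0.1, 10)) = log10(3) ~= 0.48,
  // so the query-level ndv is estimated at roughly half of orig_ndv; at ratio = 0.33,
  //   log10(3 * 3.3) ~= 1.0, i.e. effectively no reduction.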
  val DRUID_QUERY_COST_MODEL_HISTORICAL_PROCESSING_COST = doubleConf(
    "spark.sparklinedata.druid.querycostmodel.historicalProcessingCost",
    defaultValue = Some(0.1),
    doc = "The cost per row of groupBy processing in historical servers " +
      "relative to spark shuffle cost."
  )

  val DRUID_QUERY_COST_MODEL_HISTORICAL_TIMESERIES_PROCESSING_COST = doubleConf(
    "spark.sparklinedata.druid.querycostmodel.historicalTimeSeriesProcessingCost",
    defaultValue = Some(0.07),
    doc = "The cost per row of timeseries processing in historical servers " +
      "relative to spark shuffle cost."
  )

  val DRUID_QUERY_COST_MODEL_SPARK_SCHEDULING_COST = doubleConf(
    "spark.sparklinedata.druid.querycostmodel.sparkSchedulingCost",
    defaultValue = Some(1.0),
    doc = "The cost of scheduling tasks in spark relative to the shuffle cost of 1 row."
  )

  val DRUID_QUERY_COST_MODEL_SPARK_AGGREGATING_COST = doubleConf(
    "spark.sparklinedata.druid.querycostmodel.sparkAggregatingCost",
    defaultValue = Some(0.15),
    doc = "The cost per row to do aggregation in spark relative to the shuffle cost."
  )

  val DRUID_QUERY_COST_MODEL_OUTPUT_TRANSPORT_COST = doubleConf(
    "spark.sparklinedata.druid.querycostmodel.druidOutputTransportCost",
    defaultValue = Some(0.4),
    doc = "The cost per row to transport druid output relative to the shuffle cost."
  )

  val DRUID_USE_SMILE = booleanConf(
    "spark.sparklinedata.druid.option.useSmile",
    defaultValue = Some(true),
    doc = "For druid queries use the smile binary protocol."
  )

  val DRUID_ALLOW_TOPN_QUERIES = booleanConf(
    "spark.sparklinedata.druid.allowTopN",
    defaultValue = Some(DefaultSource.DEFAULT_ALLOW_TOPN),
    doc = "Druid TopN queries are approximate in their aggregation and ranking; this " +
      "flag controls whether TopN query rewrites should happen."
  )

  val DRUID_TOPN_QUERIES_MAX_THRESHOLD = intConf(
    "spark.sparklinedata.druid.topNMaxThreshold",
    defaultValue = Some(DefaultSource.DEFAULT_TOPN_MAX_THRESHOLD),
    doc = "If druid TopN queries are enabled, this property controls the maximum " +
      "limit for which such rewrites are done. For limits beyond this value the GroupBy " +
      "query is executed."
  )

  val DRUID_USE_V2_GBY_ENGINE = booleanConf(
    "spark.sparklinedata.druid.option.use.v2.groupByEngine",
    defaultValue = Some(false),
    doc = "For druid groupBy queries use the v2 groupBy engine."
  )

  val DRUID_RECORD_QUERY_EXECUTION = booleanConf(
    "spark.sparklinedata.enable.druid.query.history",
    defaultValue = Some(false),
    doc = "Track each druid query executed from every Spark Executor."
  )
  def getDruidQuerySpecs(plan : SparkPlan) : Seq[DruidQuery] = {
    plan.collect {
      case PhysicalRDD(_, r : DruidRDD, _, _, _) => r.dQuery
    }
  }

  def getDruidRDDs(plan : SparkPlan) : Seq[DruidRDD] = {
    plan.collect {
      case PhysicalRDD(_, r : DruidRDD, _, _, _) => r
    }
  }
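  // A hedged usage sketch (not part of the original source): given a DataFrame `df` whose
  // physical plan scans a DruidRDD, the generated Druid query specs can be inspected with:
  //
  //   val specs = DruidPlanner.getDruidQuerySpecs(df.queryExecution.sparkPlan)
  //   specs.foreach(println)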
  def getConfValue[T](sqlContext : SQLContext,
                      entry : SQLConfEntry[T]) : T = {
    sqlContext.getConf(entry)
  }
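  // For example (a sketch, assuming an in-scope `sqlContext`):
  //
  //   val pageSize =
  //     DruidPlanner.getConfValue(sqlContext, DruidPlanner.DRUID_SELECT_QUERY_PAGESIZE)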
}