/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sparklinedata.druid

import org.apache.spark.Logging
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.sparklinedata.SparklineDataContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider}
import org.json4s._
import org.json4s.jackson.JsonMethods._
import org.sparklinedata.druid.metadata._

class DefaultSource extends RelationProvider with Logging {
  import DefaultSource._

  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String]): BaseRelation = {

    import Utils.jsonFormat

    var sourceDFName = parameters.getOrElse(SOURCE_DF_PARAM,
      throw new DruidDataSourceException(
        s"'$SOURCE_DF_PARAM' must be specified for Druid DataSource")
    )
    sourceDFName = SparklineDataContext.qualifiedName(sqlContext, sourceDFName)

    val sourceDF = sqlContext.table(sourceDFName)

    val dsName: String = parameters.getOrElse(DRUID_DS_PARAM,
      throw new DruidDataSourceException(
        s"'$DRUID_DS_PARAM' must be specified for Druid DataSource")
    )

    val timeDimensionCol: String = parameters.getOrElse(TIME_DIMENSION_COLUMN_PARAM,
      throw new DruidDataSourceException(
        s"'$TIME_DIMENSION_COLUMN_PARAM' must be specified for Druid DataSource")
    )
    // Validate that the time dimension column exists in the source DataFrame's
    // schema (StructType.apply throws for an unknown field).
    sourceDF.schema(timeDimensionCol)
    val maxCardinality =
      parameters.getOrElse(MAX_CARDINALITY_PARAM, DEFAULT_MAX_CARDINALITY).toLong
    val cardinalityPerDruidQuery =
      parameters.getOrElse(MAX_CARDINALITY_PER_DRUID_QUERY_PARAM,
        DEFAULT_CARDINALITY_PER_DRUID_QUERY).toLong

    val columnMapping: Map[String, String] =
      parameters.get(SOURCE_TO_DRUID_NAME_MAP_PARAM).map { s =>
        val o = parse(s).asInstanceOf[JObject]
        o.obj.map(t => (t._1, t._2.values.toString)).toMap
      }.getOrElse(Map())

    val columnInfos: List[DruidRelationColumnInfo] =
      parameters.get(SOURCE_TO_DRUID_INFO).map { s =>
        parse(s).extract[List[DruidRelationColumnInfo]]
      }.getOrElse(List())

    val fds: List[FunctionalDependency] = parameters.get(FUNCTIONAL_DEPENDENCIES_PARAM).map { s =>
      parse(s).extract[List[FunctionalDependency]]
    }.getOrElse(List())

    val druidHost = parameters.get(DRUID_HOST_PARAM).getOrElse(DEFAULT_DRUID_HOST)

    var starSchemaInfo =
      parameters.get(STAR_SCHEMA_INFO_PARAM).map(parse(_).extract[StarSchemaInfo]).
        getOrElse(StarSchemaInfo(sourceDFName))
    starSchemaInfo = StarSchemaInfo.qualifyTableNames(sqlContext, starSchemaInfo)

    val ss = StarSchema(sourceDFName, starSchemaInfo)(sqlContext)
    if (ss.isLeft) {
      throw new DruidDataSourceException(
        s"Failed to parse StarSchemaInfo: ${ss.left.get}")
    }

    val pushHLLTODruid =
      parameters.get(PUSH_HYPERLOGLOG_TODRUID).
        getOrElse(DEFAULT_PUSH_HYPERLOGLOG_TODRUID).toBoolean

    val streamDruidQueryResults =
      parameters.get(STREAM_DRUID_QUERY_RESULTS).
        getOrElse(DEFAULT_STREAM_DRUID_QUERY_RESULTS).toBoolean

    val loadMetadataFromAllSegments =
      parameters.get(LOAD_METADATA_FROM_ALL_SEGMENTS).
        getOrElse(DEFAULT_LOAD_METADATA_FROM_ALL_SEGMENTS).toBoolean

    val zkSessionTimeoutMs: Int =
      parameters.get(ZK_SESSION_TIMEOUT).
        getOrElse(DEFAULT_ZK_SESSION_TIMEOUT).toInt

    val zkEnableCompression: Boolean =
      parameters.get(ZK_ENABLE_COMPRESSION).
        getOrElse(DEFAULT_ZK_ENABLE_COMPRESSION).toBoolean

    val zkDruidPath: String =
      parameters.get(ZK_DRUID_PATH).
        getOrElse(DEFAULT_ZK_DRUID_PATH)

    val queryHistorical: Boolean =
      parameters.get(QUERY_HISTORICAL).
        getOrElse(DEFAULT_QUERY_HISTORICAL).toBoolean

    val zkQualifyDiscoveryNames: Boolean =
      parameters.get(ZK_QUALIFY_DISCOVERY_NAMES).
        getOrElse(DEFAULT_ZK_QUALIFY_DISCOVERY_NAMES).toBoolean

    val numSegmentsPerHistoricalQuery: Int =
      parameters.get(NUM_SEGMENTS_PER_HISTORICAL_QUERY).
        getOrElse(DEFAULT_NUM_SEGMENTS_PER_HISTORICAL_QUERY).toInt

    val useSmile: Boolean =
      parameters.get(USE_SMILE).
        getOrElse(DEFAULT_USE_SMILE).toBoolean

    val allowTopN: Boolean =
      parameters.get(ALLOW_TOPN).
        getOrElse(DEFAULT_ALLOW_TOPN.toString).toBoolean

    val topNMaxThreshold: Int =
      parameters.get(TOPN_MAX_THRESHOLD).
        getOrElse(DEFAULT_TOPN_MAX_THRESHOLD.toString).toInt

    val numProcessingThreadsPerHistorical =
      parameters.get(NUM_PROCESSING_THREADS_PER_HISTORICAL).map(_.toInt)

    val nonAggregateQueryHandling: NonAggregateQueryHandling.Value =
      NonAggregateQueryHandling.withName(
        parameters.get(NON_AGG_QUERY_HANDLING).
          getOrElse(DEFAULT_NON_AGG_QUERY_HANDLING)
      )

    val queryGranularity: DruidQueryGranularity =
      DruidQueryGranularity(parameters.get(QUERY_GRANULARITY).getOrElse(DEFAULT_QUERY_GRANULARITY))

    val options = DruidRelationOptions(
      maxCardinality,
      cardinalityPerDruidQuery,
      pushHLLTODruid,
      streamDruidQueryResults,
      loadMetadataFromAllSegments,
      zkSessionTimeoutMs,
      zkEnableCompression,
      zkDruidPath,
      queryHistorical,
      zkQualifyDiscoveryNames,
      numSegmentsPerHistoricalQuery,
      useSmile,
      nonAggregateQueryHandling,
      queryGranularity,
      allowTopN,
      topNMaxThreshold,
      numProcessingThreadsPerHistorical
    )

    val drI = DruidMetadataCache.druidRelation(sqlContext,
      sourceDFName, sourceDF,
      dsName,
      timeDimensionCol,
      druidHost,
      columnMapping,
      columnInfos,
      fds,
      ss.right.get,
      options)

    logInfo(drI.fd.depGraph.debugString(drI.druidDS))

    val dQuery = parameters.get(DRUID_QUERY).map { s =>
      parse(s).extract[DruidQuery]
    }

    new DruidRelation(drI, dQuery)(sqlContext)
  }
}
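
// A minimal usage sketch: a DruidRelation is normally created through Spark SQL
// DDL rather than by invoking createRelation directly. The table, column, and
// datasource names below are hypothetical:
//
//   sqlContext.sql(
//     """CREATE TEMPORARY TABLE orderLineItemPartSupplier
//       |USING org.sparklinedata.druid
//       |OPTIONS (sourceDataframe "orderLineItemPartSupplierBase",
//       |  timeDimensionColumn "l_shipdate",
//       |  druidDatasource "tpch",
//       |  druidHost "localhost",
//       |  columnMapping '{"l_quantity" : "sum_l_quantity"}',
//       |  starSchema '{"factTable" : "orderLineItemPartSupplierBase", "relations" : []}')"""
//       .stripMargin)
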
object DefaultSource {

  val SOURCE_DF_PARAM = "sourceDataframe"

  /**
   * DataSource name in Druid.
   */
  val DRUID_DS_PARAM = "druidDatasource"

  val TIME_DIMENSION_COLUMN_PARAM = "timeDimensionColumn"

  /**
   * If the result cardinality of a query exceeds this value, the query is not
   * converted to a Druid query.
   */
  val MAX_CARDINALITY_PARAM = "maxResultCardinality"
  val DEFAULT_MAX_CARDINALITY: String = (1 * 1000 * 1000).toString
  /**
   * If the result size estimate exceeds this number, an attempt is made to run 'n'
   * Druid queries, each of which spans a sub-interval of the total time interval.
   * 'n' is computed as `result.size % thisParam + 1`.
   */
  val MAX_CARDINALITY_PER_DRUID_QUERY_PARAM = "maxCardinalityPerQuery"
  val DEFAULT_CARDINALITY_PER_DRUID_QUERY = (100 * 1000).toString
  /**
   * Map source column names to Druid field names.
   * Specified as a JSON string.
   */
  val SOURCE_TO_DRUID_NAME_MAP_PARAM = "columnMapping"
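  // A hypothetical illustration (the column names are made up): map the source
  // column "l_quantity" to the Druid metric "sum_l_quantity". The JSON is parsed
  // as a flat object of string-to-string entries:
  //   columnMapping '{ "l_quantity" : "sum_l_quantity" }'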
  /**
   * List of [[DruidRelationColumnInfo]] that provide details about the source column
   * to Druid linkages.
   */
  val SOURCE_TO_DRUID_INFO = "columnInfos"

  /**
   * Specify how columns are related, see
   * [[org.sparklinedata.druid.metadata.FunctionalDependency]]. Specified as a list of
   * functional dependency objects.
   */
  val FUNCTIONAL_DEPENDENCIES_PARAM = "functionalDependencies"
  val DRUID_HOST_PARAM = "druidHost"
  val DEFAULT_DRUID_HOST = "localhost"

  // this is only for test purposes
  val DRUID_QUERY = "druidQuery"

  val STAR_SCHEMA_INFO_PARAM = "starSchema"
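  // A minimal sketch for a single-table (denormalized) source, with a
  // hypothetical table name; here the fact table is the source DataFrame itself
  // and there are no join relations:
  //   starSchema '{ "factTable" : "orderLineItemPartSupplierBase", "relations" : [] }'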
  /**
   * Controls whether HyperLogLog (approximate count-distinct) aggregations
   * are pushed down to Druid. Default is true.
   */
  val PUSH_HYPERLOGLOG_TODRUID = "pushHLLTODruid"
  val DEFAULT_PUSH_HYPERLOGLOG_TODRUID = "true"

  /**
   * Controls whether query results from Druid are streamed into the
   * Spark operator pipeline. Default is true.
   */
  val STREAM_DRUID_QUERY_RESULTS = "streamDruidQueryResults"
  val DEFAULT_STREAM_DRUID_QUERY_RESULTS = "true"
  /**
   * When loading Druid DataSource metadata, controls whether the metadata query
   * spans the entire dataSource interval or only the latest segment.
   * Default is to load from all segments; since the query specifies
   * ("analysisTypes" -> []), it is cheap.
   */
  val LOAD_METADATA_FROM_ALL_SEGMENTS = "loadMetadataFromAllSegments"
  val DEFAULT_LOAD_METADATA_FROM_ALL_SEGMENTS = "true"
  val ZK_SESSION_TIMEOUT = "zkSessionTimeoutMilliSecs"
  val DEFAULT_ZK_SESSION_TIMEOUT = "30000"

  val ZK_ENABLE_COMPRESSION = "zkEnableCompression"
  val DEFAULT_ZK_ENABLE_COMPRESSION = "true"

  val ZK_DRUID_PATH = "zkDruidPath"
  val DEFAULT_ZK_DRUID_PATH = "/druid"

  val QUERY_HISTORICAL = "queryHistoricalServers"
  val DEFAULT_QUERY_HISTORICAL = "false"

  val ZK_QUALIFY_DISCOVERY_NAMES = "zkQualifyDiscoveryNames"
  val DEFAULT_ZK_QUALIFY_DISCOVERY_NAMES = "false"

  val NUM_SEGMENTS_PER_HISTORICAL_QUERY = "numSegmentsPerHistoricalQuery"
  val DEFAULT_NUM_SEGMENTS_PER_HISTORICAL_QUERY = Int.MaxValue.toString

  val USE_SMILE = "useSmile"
  val DEFAULT_USE_SMILE = "true"

  val NUM_PROCESSING_THREADS_PER_HISTORICAL = "numProcessingThreadsPerHistorical"

  val NON_AGG_QUERY_HANDLING = "nonAggregateQueryHandling"
  val DEFAULT_NON_AGG_QUERY_HANDLING = NonAggregateQueryHandling.PUSH_NONE.toString

  val QUERY_GRANULARITY = "queryGranularity"
  val DEFAULT_QUERY_GRANULARITY = "none"
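  // A hedged example (assumption: the exact set of accepted strings is defined
  // by DruidQueryGranularity.apply, which this sketch assumes accepts Druid-style
  // granularity names such as "day" in addition to the default "none"):
  //   queryGranularity "day"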
  val ALLOW_TOPN = "allowTopNRewrite"
  val DEFAULT_ALLOW_TOPN = false

  val TOPN_MAX_THRESHOLD = "topNMaxThreshold"
  val DEFAULT_TOPN_MAX_THRESHOLD: Int = 100000
}