All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.sparklinedata.druid.metadata.StarSchemaInfo.scala Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.sparklinedata.druid.metadata

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression}
import org.apache.spark.sql.hive.sparklinedata.SparklineDataContext

import scala.annotation.tailrec
import scala.collection.mutable.ArrayBuffer

/**
 * Declarative description of a Star Schema: the name of its fact table plus the list of
 * pairwise table relations. A [[StarSchema]] model is built from this description.
 * This will be part of the [[org.sparklinedata.druid.DefaultSource DruidDataSource]] definition.
 *
 * @param factTable name of the fact table of the Star Schema.
 * @param relations the relations (one per pair of related tables) that make up the Star Schema.
 */
case class StarSchemaInfo(
    factTable : String,
    relations : StarRelationInfo*
)

object StarSchemaInfo {

  /**
   * Build a copy of the given [[StarSchemaInfo]] in which the fact table name and the table
   * names of every relation have been passed through `SparklineDataContext.qualifiedName`.
   *
   * @param sqlContext the context handed to `SparklineDataContext.qualifiedName`.
   * @param sSI the description whose table names are to be qualified.
   * @return a new [[StarSchemaInfo]]; the input is not modified.
   */
  def qualifyTableNames(sqlContext : SQLContext,
                        sSI : StarSchemaInfo) : StarSchemaInfo = {
    val qualifiedFactTable = SparklineDataContext.qualifiedName(sqlContext, sSI.factTable)
    val qualifiedRelations =
      sSI.relations.map(rel => StarRelationInfo.qualifyTableNames(sqlContext, rel))

    StarSchemaInfo(qualifiedFactTable, qualifiedRelations : _*)
  }
}

/**
 * Describes how 2 tables in a StarSchema are related.
 *
 * @param leftTable name of the table on the left side of the relation; for an n-1 relation
 *                  this is the 'n' side.
 * @param rightTable name of the table on the right side of the relation.
 * @param relationType is it a 1-1 or n-1 relation. As the relation Graph is traversed outward from
 *                     the fact table, only these relations are supported. The implication is we
 *                     don't support fact-to-fact relations or relation between dimensions.
 * @param joinCondition the column equalities that join the 2 tables; in each
 *                      [[EqualityCondition]] the left attribute is a column of `leftTable` and
 *                      the right attribute a column of `rightTable`.
 */
case class StarRelationInfo(
    leftTable : String,
    rightTable : String,
    relationType : FunctionalDependencyType.Value,
    joinCondition : Seq[EqualityCondition]
)

object StarRelationInfo {

  /*
   * Shared builder for the factory methods: turns the (left column, right column) pairs into
   * EqualityConditions and wraps everything in a StarRelationInfo of the requested type.
   */
  private def relation(leftTable : String,
                       rightTable : String,
                       relationType : FunctionalDependencyType.Value,
                       columnPairs : Seq[(String, String)]) : StarRelationInfo = {
    val conditions = columnPairs.map {
      case (leftColumn, rightColumn) => EqualityCondition(leftColumn, rightColumn)
    }
    new StarRelationInfo(leftTable, rightTable, relationType, conditions)
  }

  /**
   * Create a 1-1 relation between 2 tables.
   *
   * @param leftTable name of the left table.
   * @param rightTable name of the right table.
   * @param joinCondition pairs of (left table column, right table column) that are equated.
   */
  def oneToone(leftTable : String,
               rightTable : String,
               joinCondition : (String,String)* ) : StarRelationInfo =
    relation(leftTable, rightTable, FunctionalDependencyType.OneToOne, joinCondition)

  /**
   * Create a n-1 relation from the left table to the right table.
   *
   * @param leftTable name of the left ('n' side) table.
   * @param rightTable name of the right ('1' side) table.
   * @param joinCondition pairs of (left table column, right table column) that are equated.
   */
  def manyToone(leftTable : String,
                rightTable : String,
                joinCondition : (String,String)* ) : StarRelationInfo =
    relation(leftTable, rightTable, FunctionalDependencyType.ManyToOne, joinCondition)

  /**
   * Build a copy of the relation in which both table names have been passed through
   * `SparklineDataContext.qualifiedName`; relation type and join condition are kept as is.
   *
   * @param sqlContext the context handed to `SparklineDataContext.qualifiedName`.
   * @param sRI the relation whose table names are to be qualified.
   */
  def qualifyTableNames(sqlContext : SQLContext,
                        sRI : StarRelationInfo) : StarRelationInfo = {
    val qualifiedLeft = SparklineDataContext.qualifiedName(sqlContext, sRI.leftTable)
    val qualifiedRight = SparklineDataContext.qualifiedName(sqlContext, sRI.rightTable)

    sRI.copy(leftTable = qualifiedLeft, rightTable = qualifiedRight)
  }

}

/**
 * One column equality of a join condition: `leftAttribute = rightAttribute`.
 *
 * @param leftAttribute name of the column from the left table of the relation.
 * @param rightAttribute name of the column from the right table of the relation.
 */
case class EqualityCondition(
    leftAttribute : String,
    rightAttribute : String
)

/**
 * The relation of a [[StarTable]] to another table of the Star Schema.
 *
 * @param tableName name of the related table.
 * @param relationType the kind of functional dependency between the 2 tables.
 * @param joiningKeys the pairs of column names that are equated when joining the 2 tables.
 */
case class StarRelation(
    tableName : String,
    relationType : FunctionalDependencyType.Value,
    joiningKeys : Set[(String, String)]
)

/**
 * A node of the StarSchema graph, i.e. one Table of the Star Schema.
 *
 * @param name the table's name.
 * @param parent the relation to its parent table, i.e. the next table along the join path
 *               from this Table to the Fact table of the Star Schema; `None` when the table
 *               has no parent.
 */
case class StarTable(
    name : String,
    parent : Option[StarRelation]
)

/**
 * Represents a StarSchema. The '''Star Schema'''s we support have the following __constraints__:
 *  - We only support '''one-one''' or '''many-one''' relations between entities.
 *  - A table can be related to the '''Fact Table''' via only 1 unique Path.
 *  - The ''column names'' across the Star Schema must be unique. So 2 tables in the Star Schema
 *  cannot have columns with the same name.
 *
 *  The first 2 points become an issue only in the most involved star schema models; for example,
 *  we show how tpch can be modeled below. The 3rd restriction is an implementation issue:
 *  when performing QueryPlan rewrites we don't have access to the table an Attribute belongs to,
 *  for now we get around this issue by forcing column names to be unique across the Star Schema.
 *
 *  '''Tpch Model:'''
 *  {{{
 *    FactTable = LineItem
 *    StarRelations: [
 *      LineItem - n:1 - Order => [[li_orderkey],[o_orderkey]]
 *      LineItem - n:1 - PartSupp => [[li_partkey, li_suppkey],[ps_partkey, ps_suppkey]]
 *      Order - n: 1 - Customer => [[o_custkey], [c_custkey]]
 *      PartSupp - n:1 - Part => [[ps_partkey], [p_partkey]]
 *      PartSupp - n:1 - Supplier => [[ps_suppkey], [s_suppkey]]
 *      Customer - n:1 - CustNation => [[c_nationkey], [cn_nationkey]]
 *      CustNation - n:1 - CustRegion => [[cn_regionkey], [cr_regionkey]]
 *      Supplier - n:1 - SuppNation => [[s_nationkey], [sn_nationkey]]
 *      SuppNation - n:1 - SuppRegion => [[sn_regionkey], [sr_regionkey]]
 *    ]
 *  }}}
 *
 *  Because of our restrictions we have had to model the ''Nation'' table as separate
 *  ''CustNation'' and ''SuppNation'' tables. Similar separation has to be done for ''CustRegion''
 *  and ''SuppRegion''. Having to setup separate entities for Supplier and Customer Nation is
 *  not atypical when directly writing SQLs; these would be views on the same Nation Dimension
 *  table. Currently we are being more restrictive than this, we require the 2 views to be
 *  tables in the Metastore (this is because during Plan rewrite we lose the Table association
 *  in [[AttributeReference AttributeReferences]]). But note, this doesn't require the data to be
 *  copied, both tables can point to the same underlying data in the storage layer.
 *
 *  We have to rename the column names in the 2 Nation(and region) tables. This is so that we
 *  can infer the Attribute to Tables(in the Star Schema) associations in a Query Plan.
 *
 *
 * @param info the [[StarSchemaInfo]] used to build this StarSchema Graph.
 * @param factTable the node that represents the '''Fact Table'''
 * @param tableMap maps a tableName to the [[StarTable]] node in the StarSchema Graph.
 * @param attrMap provides a mapping from a columnName to its table.
 */
case class StarSchema(val info : StarSchemaInfo,
                 val factTable : StarTable,
                 val tableMap : Map[String, StarTable],
                 val attrMap : Map[String, StarTable]) {
  import StarSchema._

  /**
   * The seq of expressions representing one side of a join must only reference attributes
   * that belong to one and the same table of this StarSchema. If this condition is met, the
   * table's name is returned.
   *
   * Every attribute referenced by the expressions is looked up by name in `attrMap`;
   * attributes that are not found are mapped to `UNKNOWN_TABLE_NAME`.
   *
   * @param joinKeys the expressions forming one side of a join condition.
   * @return the table's name if all referenced attributes resolve to exactly one known table;
   *         `None` if nothing is referenced, an unknown attribute is referenced or more than
   *         one table is involved.
   */
  def getUniqueTable(joinKeys : Seq[Expression]) : Option[String] = {

    val tables : Set[String] = joinKeys.flatMap { e =>
      e.references.map(a => attrMap.get(a.name).map(_.name).getOrElse(UNKNOWN_TABLE_NAME))
    }.toSet

    tables.toList match {
      case name :: Nil if name != UNKNOWN_TABLE_NAME => Some(name)
      case _ => None
    }
  }

  /**
   * @param e the expression to test.
   * @return true if the expression is an [[AttributeReference]].
   */
  def isAttributeReference(e : Expression) : Boolean = e match {
    case _ : AttributeReference => true
    case _ => false
  }

  /*
   * The joining keys the StarSchema defines between the 2 tables, oriented as
   * (column of lT, column of rT). None if neither table is the parent of the other.
   * A table's parent relation stores its keys as (column of the table, column of the parent),
   * so the keys have to be flipped when rT is the child.
   */
  private def schemaJoinKeys(lT : StarTable, rT : StarTable) : Option[Set[(String, String)]] =
    (lT.parent, rT.parent) match {
      case (Some(rel), _) if rel.tableName == rT.name => Some(rel.joiningKeys)
      case (_, Some(rel)) if rel.tableName == lT.name => Some(flipJoinCondition(rel.joiningKeys))
      case _ => None
    }

  /**
   * Does the join predicate represented by the left and right join keys match a join in the
   * StarSchema. So a join like {{{lineitem li join part p on li.l_partkey = p.p_partkey}}} is
   * represented as {{{Seq(AttributeReference("l_partkey")), Seq(AttributeReference("p_partkey"))}}}
   *
   * The following constraints must be met for the joining condition to be a join from this
   * StarSchema:
   *  - Every joining expression can only be an AttributeReference
   *  - leftJoinKeys and rightJoinKeys must have the same number of expressions.
   *  - each set of joining conditions(leftJoinKeys, rightJoinKeys) must be on 1 table.
   *  - the 2 tables must be related in the StarSchema, in either direction.
   *  - the matching Attributes in the input(leftJoinKeys, rightJoinKeys) must exactly
   *  match the joining key defined in the StarSchema for the 2 tables involved.
   *
   * @param leftJoinKeys the expressions from the left side of the join condition.
   * @param rightJoinKeys the expressions from the right side of the join condition; the i-th
   *                      expression is equated with the i-th expression of leftJoinKeys.
   * @return the (left table name, right table name) pair if the join is a join of this
   *         StarSchema, `None` otherwise.
   */
  def isStarJoin(leftJoinKeys : Seq[Expression], rightJoinKeys : Seq[Expression]) :
  Option[(String, String)] = {

    /*
     * joining expressions must be AttributeReferences
     */
    val onlyAttributeReferences =
      leftJoinKeys.forall(isAttributeReference) && rightJoinKeys.forall(isAttributeReference)

    /*
     * zip silently drops the surplus keys of the longer side, which could turn a partial
     * match into an 'exact' match; so key lists of different length are rejected upfront.
     */
    if ( !onlyAttributeReferences || leftJoinKeys.size != rightJoinKeys.size ) {
      None
    } else {

      /*
       * the set of (left column, right column) pairs joined in the input.
       */
      val queryKeys : Set[(String, String)] = leftJoinKeys.zip(rightJoinKeys).collect {
        case (l : AttributeReference, r : AttributeReference) => (l.name, r.name)
      }.toSet

      for {
        // each set of joining conditions must be on 1 table.
        leftTableName <- getUniqueTable(leftJoinKeys)
        rightTableName <- getUniqueTable(rightJoinKeys)
        // a table known to attrMap but missing from tableMap means 'not a star join'
        leftTable <- tableMap.get(leftTableName)
        rightTable <- tableMap.get(rightTableName)
        // the 2 tables must be related in the StarSchema.
        schemaKeys <- schemaJoinKeys(leftTable, rightTable)
        // the joined columns must match the joiningKeys set from the StarSchema.
        if schemaKeys == queryKeys
      } yield (leftTableName, rightTableName)
    }

  }

  /**
   * @return a multi-line description listing the fact table's name followed by the entries
   *         of `tableMap`, one per line.
   */
  def prettyString : String = {
    s"""
       |FactTable=${factTable.name}
       |${tableMap.mkString("\n")}
     """.stripMargin
  }

}

object StarSchema {

  /** Placeholder table name used for attributes that don't belong to any table. */
  val UNKNOWN_TABLE_NAME = ""

  /** The type of a relation plus the set of (column, column) pairs that are equated. */
  type JoinRelationInfo = (FunctionalDependencyType.Value, Set[(String, String)])

  /** TableName -> RelatedTable -> [[JoinRelationInfo]] */
  type StarJoinGraph = collection.mutable.Map[String,
    collection.mutable.Map[String, JoinRelationInfo]]

  type ErrorInfo = String

  /**
   * @param jCond a set of (column, column) pairs.
   * @return the same pairs with the 2 columns of every pair swapped.
   */
  def flipJoinCondition(jCond : Set[(String, String)]) : Set[(String, String)] =
    jCond.map { case (l, r) => (r, l) }

  /*
   * Convert the join condition of a relation into a JoinRelationInfo. The pairs are
   * (left column, right column), or (right column, left column) if reverse is set.
   */
  private def joinConditionToRelationInfo(jc : StarRelationInfo, reverse : Boolean = false) :
  JoinRelationInfo = {
    val columnPairs = jc.joinCondition.map { c =>
      if (reverse) (c.rightAttribute, c.leftAttribute) else (c.leftAttribute, c.rightAttribute)
    }
    (jc.relationType, columnPairs.toSet)
  }

  /*
   * The outgoing edges of a table in the join graph; an empty entry is registered for
   * tables that are not in the graph yet.
   */
  private def relations(joinGraph : StarJoinGraph, tableName : String) :
  collection.mutable.Map[String, JoinRelationInfo] =
    joinGraph.getOrElseUpdate(tableName, collection.mutable.Map[String, JoinRelationInfo]())

  /*
   * Add the edge(s) for a relation to the join graph:
   *  - a n-1 relation adds the edge left -> right
   *  - a 1-1 relation adds the edges left -> right and right -> left
   * It is an error if there already is an edge left -> right, or if the relation is of
   * any other type.
   */
  private def addToJoinGraph(joinGraph : StarJoinGraph, jC : StarRelationInfo) :
  Either[ErrorInfo, Unit] = {
    val lrs = relations(joinGraph, jC.leftTable)

    if (lrs.contains(jC.rightTable)) {
      Left(s"multiple join conditions for '${jC.leftTable}' and '${jC.rightTable}'")
    } else {
      jC.relationType match {
        case FunctionalDependencyType.ManyToOne =>
          lrs(jC.rightTable) = joinConditionToRelationInfo(jC, reverse = false)
          Right(())
        case FunctionalDependencyType.OneToOne =>
          lrs(jC.rightTable) = joinConditionToRelationInfo(jC, reverse = false)
          relations(joinGraph, jC.rightTable)(jC.leftTable) =
            joinConditionToRelationInfo(jC, reverse = true)
          Right(())
        case other =>
          // report instead of failing with a MatchError
          Left(s"unsupported relation type '$other' " +
            s"for '${jC.leftTable}' and '${jC.rightTable}'")
      }
    }
  }

  /**
   * Build the [[StarSchema]] graph described by a [[StarSchemaInfo]].
   *
   * Starting from the '''Fact Table''' the related tables are walked level by level, building
   * out the StarSchema with a [[StarTable]] node for each table. The following constraints
   * are enforced:
   *  - there is at most 1 join condition for a pair of tables.
   *  - there is a unique Path to any Table.
   *  - the columnNames across the StarSchema are unique.
   *  - every table mentioned in a relation is reachable from the Fact Table.
   *
   * @param sourceDFName name of the table whose schema provides the columns of the Fact Table.
   * @param info the description of the Star Schema.
   * @param sqlContext used to look up the schema(column names) of the tables.
   * @return the StarSchema, or a description of the constraint(s) that are violated.
   */
  def apply(sourceDFName : String, info : StarSchemaInfo)(implicit sqlContext : SQLContext) :
  Either[ErrorInfo, StarSchema] = {

    val joinGraph : StarJoinGraph = collection.mutable.Map()

    /*
     * Go over the StarRelationInfos from the info and form a mapping:
     * TableName -> RelatedTable -> JoinRelationInfo
     * All relations are processed, so that all errors can be reported together.
     */
    val graphErrors = info.relations.map(addToJoinGraph(joinGraph, _)).collect {
      case Left(err) => err
    }

    if (graphErrors.nonEmpty) {
      return Left(graphErrors.mkString("\n"))
    }

    // (parent, child) pairs of the edges followed so far
    val traversedEdges = collection.mutable.Set[(String, String)]()
    val tableMap = collection.mutable.Map[String, StarTable]()
    val attrMap = collection.mutable.Map[String, StarTable]()

    /*
     * Register the columns of table tabNm as belonging to tbl; stops at the first column
     * name that is already registered.
     */
    def addColumns(tabNm : String, tbl : StarTable) : Either[ErrorInfo, Unit] = {
      val fieldNames = sqlContext.table(tabNm).schema.fieldNames

      fieldNames.foldLeft[Either[ErrorInfo, Unit]](Right(())) {
        case (failed @ Left(_), _) => failed
        case (_, aName) if attrMap.contains(aName) =>
          Left(s"Column $aName is not unique across Star Schema; " +
            s"in tables ${attrMap(aName).name}, $tabNm")
        case (_, aName) =>
          attrMap(aName) = tbl
          Right(())
      }
    }

    /*
     * Add the tables related to the given tables to the StarSchema, then continue with the
     * tables added in this round; stops at the first error.
     */
    @tailrec
    def addTables(tables : Seq[String]) : Either[ErrorInfo, Unit] = {

      if (tables.isEmpty) {
        Right(())
      } else {

        val descendantTables = ArrayBuffer[String]()

        val edges = for {
          tName <- tables
          childMap <- joinGraph.get(tName).toSeq
          (childTable, jRInfo) <- childMap.toSeq
        } yield (tName, childTable, jRInfo)

        val outcome = edges.foldLeft[Either[ErrorInfo, Unit]](Right(())) {
          case (failed @ Left(_), _) => failed
          case (_, (tName, childTable, jRInfo)) =>
            if ( tableMap.contains(childTable) ) {
              /*
               * A 1-1 relation is in the graph in both directions; reaching the parent again
               * over the reverse of the edge we came in on is not a 2nd path to the parent.
               */
              val isOneToOneBackEdge = jRInfo._1 == FunctionalDependencyType.OneToOne &&
                traversedEdges.contains((childTable, tName))
              if (isOneToOneBackEdge) {
                Right(())
              } else {
                Left(s"multiple join paths to table '$childTable'")
              }
            } else {
              traversedEdges += ((tName, childTable))
              // the graph has (parent column, child column) pairs, the child's relation
              // to its parent is kept as (child column, parent column) pairs
              val childStarTable = StarTable(childTable,
                Some(StarRelation(tName, jRInfo._1, flipJoinCondition(jRInfo._2))))
              tableMap(childTable) = childStarTable
              val added = addColumns(childTable, childStarTable)
              if (added.isRight) {
                descendantTables += childTable
              }
              added
            }
        }

        outcome match {
          case Left(err) => Left(err)
          case Right(_) => addTables(descendantTables.toSeq)
        }
      }
    }

    val factStarTable = StarTable(info.factTable, None)
    tableMap(info.factTable) = factStarTable

    val traversal = addColumns(sourceDFName, factStarTable) match {
      case Right(_) => addTables(Seq(info.factTable))
      case failed => failed
    }

    traversal match {
      case Left(err) => Left(err)
      case Right(_) =>
        /*
         * every table mentioned in a relation must have been reached from the Fact Table.
         */
        val referencedTables =
          info.relations.flatMap(r => Seq(r.leftTable, r.rightTable)).distinct
        val unreachableTables = referencedTables.filterNot(t => tableMap.contains(t))

        if (unreachableTables.nonEmpty) {
          Left(
            unreachableTables.map(t => s"Table '${t}' is not part of the join Graph")
              .mkString("\n")
          )
        } else {
          Right(new StarSchema(info,
            factStarTable,
            tableMap.toMap,
            attrMap.toMap)
          )
        }
    }

  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy