/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.enceladus.conformance.datasource

import org.apache.spark.sql._
import org.slf4j.LoggerFactory
import za.co.absa.enceladus.model.dataFrameFilter.DataFrameFilter

import scala.util.control.NonFatal

/**
 * Utility object providing access to data in HDFS, including partitioning-aware
 * path resolution and caching of loaded data frames.
 */
object DataSource {
  private val log = LoggerFactory.getLogger("enceladus.conformance.DataSource")

  // A cached DataFrame is identified by its fully resolved path and the optional filter applied to it
  private type DataFrameId = (String, Option[DataFrameFilter])

  // Cache of already loaded data frames so repeated requests for the same data do not re-read HDFS
  private var dataFrameCache = Map.empty[DataFrameId, DataFrame]

  /**
   * Returns the cached DataFrame or loads the data for the given report date.
   * The default mapping table partitioning pattern (`PartitioningUtils.mappingTablePattern`) is used.
   *
   * @param path       The base path of the data in HDFS
   * @param reportDate A string representing a report date in `yyyy-MM-dd` format
   * @return DataFrame with the required data (cached if requested more than once)
   */
  def getDataFrame(path: String,
                   reportDate: String)
                  (implicit spark: SparkSession): Dataset[Row] = {
    getDataFrame(path, reportDate, PartitioningUtils.mappingTablePattern, None)
  }
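
  // A minimal usage sketch (illustrative, not part of this file): it assumes an
  // implicit SparkSession in scope and Parquet data stored under a date-partitioned
  // HDFS path; the path below is a hypothetical example.
  //
  //   implicit val spark: SparkSession = SparkSession.builder().getOrCreate()
  //   val countryMt = DataSource.getDataFrame("/bigdata/mapping_tables/country", "2020-01-31")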

  /**
   * Returns the cached DataFrame or loads the data for the given report date.
   * The default mapping table partitioning pattern (`PartitioningUtils.mappingTablePattern`) is used.
   *
   * @param path       The base path of the data in HDFS
   * @param reportDate A string representing a report date in `yyyy-MM-dd` format
   * @param filter     An optional expression to filter the DataFrame with
   * @return DataFrame with the required data (cached if requested more than once)
   */
  def getDataFrame(path: String,
                   reportDate: String,
                   filter: Option[DataFrameFilter])
                  (implicit spark: SparkSession): Dataset[Row] = {
    getDataFrame(path, reportDate, PartitioningUtils.mappingTablePattern, filter)
  }
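
  // A sketch of the filtering overload (the concrete DataFrameFilter instance is
  // assumed to exist; `someFilter` below is a hypothetical value whose construction
  // depends on the enceladus model classes):
  //
  //   val filter: Option[DataFrameFilter] = Some(someFilter) // hypothetical filter value
  //   val filtered = DataSource.getDataFrame("/bigdata/mapping_tables/country", "2020-01-31", filter)
  //
  // Note that the filter is part of the cache key (see DataFrameId above), so the
  // same path requested with a different filter is loaded and cached separately.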

  /**
   * Returns the cached DataFrame or loads the data for the given report date and partitioning pattern.
   *
   * @param path                The base path of the data in HDFS
   * @param reportDate          A string representing a report date in `yyyy-MM-dd` format
   * @param partitioningPattern A pattern representing the date partitioning, where {0} stands for year, {1} for month and {2} for day
   * @param filter              An optional expression to filter the DataFrame with
   * @return DataFrame with the required data (cached if requested more than once)
   */
  def getDataFrame(path: String,
                   reportDate: String,
                   partitioningPattern: String,
                   filter: Option[DataFrameFilter] = None)
                  (implicit spark: SparkSession): DataFrame = {
    val fullPath = PartitioningUtils.getPartitionedPathName(path, reportDate, partitioningPattern)
    val dataFrameId = (fullPath, filter)
    dataFrameCache.get(dataFrameId) match {
      case Some(df) =>
        // Cache hit: the data has already been loaded for the same path and filter
        df
      case None =>
        log.info(s"Using partitioned path for '$path' -> '$fullPath'")
        val df = loadDataFrame(fullPath, filter)
        dataFrameCache = dataFrameCache + (dataFrameId -> df)
        df
    }
  }
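
  // An illustrative sketch of how the partitioning pattern is resolved (the exact
  // substitution is performed by PartitioningUtils.getPartitionedPathName; the
  // pattern below is an assumed example, not necessarily the configured default):
  //
  //   // For reportDate = "2020-01-31" and pattern "{0}/{1}/{2}":
  //   //   "/bigdata/mt/country" -> "/bigdata/mt/country/2020/01/31"
  //   val df = DataSource.getDataFrame("/bigdata/mt/country", "2020-01-31", "{0}/{1}/{2}")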

  /** Puts a DataFrame into the cache for the given path and filter, bypassing the load from HDFS. */
  private[conformance] def setData(path: String, data: DataFrame, filter: Option[DataFrameFilter] = None): Unit = {
    dataFrameCache = dataFrameCache + ((path, filter) -> data)
  }

  private def loadDataFrame(path: String, filter: Option[DataFrameFilter])
                           (implicit spark: SparkSession): DataFrame = {
    try {
      val dfLoaded = spark.read.parquet(path)
      // Apply the filter expression, if any, to the loaded data
      filter.foldLeft(dfLoaded) { case (df, filterDef) => df.filter(filterDef.filter) }
    } catch {
      case ex: AnalysisException if ex.getMessage.contains("Unable to infer schema for Parquet") =>
        throw new RuntimeException(s"Unable to read the mapping table from '$path'. " +
          "Possibly, the directory does not have valid Parquet files.", ex)
      case NonFatal(e) =>
        // Rethrow any other non-fatal error unchanged
        throw e
    }
  }
}
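
// A sketch of the caching behaviour implemented above (illustrative; assumes the
// same implicit SparkSession as in the examples above): two calls with the same
// resolved path and filter return the very same DataFrame instance, so the
// underlying Parquet source is read only once.
//
//   val first  = DataSource.getDataFrame("/bigdata/mt/country", "2020-01-31")
//   val second = DataSource.getDataFrame("/bigdata/mt/country", "2020-01-31")
//   assert(first eq second) // the second call is served from the cache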