package com.coxautodata.waimak.metastore

import java.sql.ResultSet

import com.coxautodata.waimak.dataflow.DataFlowException
import com.coxautodata.waimak.dataflow.spark.SparkFlowContext
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.SparkConf

/**
* Impala trait that implements the Impala-specific HadoopDBConnector functions
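 *
 * As an illustration (the table name, path and partition column below are hypothetical),
 * [[createTableFromParquetDDL]] for an external table with a single partition column
 * produces DDLs of roughly this shape:
 * {{{
 * create external table if not exists my_table like parquet '<first parquet part file>'
 *   partitioned by (dt string) stored as parquet location '<qualified table path>'
 * alter table my_table recover partitions
 * }}}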
*/
trait ImpalaDBConnector extends HadoopDBConnector {
  private[metastore] override def createTableFromParquetDDL(tableName: String, path: String, external: Boolean, partitionColumns: Seq[String], ifNotExists: Boolean = true): Seq[String] = {
    val qualifiedPath = new Path(path).makeQualified(context.fileSystem.getUri, context.fileSystem.getWorkingDirectory)

    // Find glob paths catering for partitions
    val globPath = ("part-*.parquet" +: partitionColumns.map(_ + "=*")).foldRight(qualifiedPath)((c, p) => new Path(p, c))
    logInfo("Get paths for ddls " + globPath.toString)

    // Pick the first matching parquet part file (sorted by path) to infer the table schema from
    val parquetFile = context.fileSystem.globStatus(globPath).sortBy(_.getPath.toUri.getPath).headOption.map(_.getPath).getOrElse(throw new DataFlowException(s"Could not find parquet file at " +
      s"'$qualifiedPath' to infer schema for table '$tableName'"))

    // Create ddl
    val ifNotExistsString = if (ifNotExists) "if not exists " else ""
    val externalString = if (external) "external " else ""
    if (partitionColumns.isEmpty) {
      Seq(s"create ${externalString}table $ifNotExistsString$tableName like parquet '${parquetFile.toString}' stored as parquet location '$qualifiedPath'")
    } else {
      // All partition columns are declared with string type in the generated DDL
      val partitionDef = partitionColumns.map(_ + " string").mkString(", ")
      Seq(
        s"create ${externalString}table $ifNotExistsString$tableName like parquet '${parquetFile.toString}' partitioned by ($partitionDef) stored as parquet location '$qualifiedPath'",
        s"alter table $tableName recover partitions"
      )
    }
  }
}
/**
* Impala Database connector that is constructed using the Waimak JDBC template in spark conf
*
* @param context The flow context object containing the SparkSession and FileSystem
* @param database name of the database to connect to
* @param cluster the cluster label in the JDBC template string
* @param properties Key value pairs passed as connection arguments to the DriverManager during connection
 * @param secureProperties Map used to resolve secure parameter values for the JDBC properties
 *                         from a jceks file at CredentialProviderFactory.CREDENTIAL_PROVIDER_PATH.
 *                         The map key is the key of the parameter in the jceks file and the map
 *                         value is the key under which the parameter is set in the JDBC properties
*
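 * An illustrative construction, assuming an existing [[SparkFlowContext]] named flowContext and a
 * hypothetical database name (the JDBC template itself is taken from the Spark configuration):
 * {{{
 * val connector = ImpalaWaimakJDBCConnector(flowContext, database = "my_database")
 * }}}
 *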
*/
case class ImpalaWaimakJDBCConnector(context: SparkFlowContext,
                                     database: String,
                                     cluster: String = "default",
                                     properties: java.util.Properties = new java.util.Properties(),
                                     secureProperties: Map[String, String] = Map.empty) extends ImpalaDBConnector with WaimakJDBCConnector {

  override val driverName: String = "org.apache.hive.jdbc.HiveDriver"
  override val sparkConf: SparkConf = context.spark.sparkContext.getConf
  override val service: String = "impala"

  override def hadoopConfiguration: Configuration = context.spark.sparkContext.hadoopConfiguration
}
/**
* Impala Database connector that is constructed from a JDBC connection string
*
* @param context The flow context object containing the SparkSession and FileSystem
* @param jdbcString the JDBC connection string
* @param properties Key value pairs passed as connection arguments to the DriverManager during connection
 * @param secureProperties Map used to resolve secure parameter values for the JDBC properties
 *                         from a jceks file at CredentialProviderFactory.CREDENTIAL_PROVIDER_PATH.
 *                         The map key is the key of the parameter in the jceks file and the map
 *                         value is the key under which the parameter is set in the JDBC properties
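 *
 * An illustrative construction, assuming an existing [[SparkFlowContext]] named flowContext;
 * the connection string shown is hypothetical and depends on your Impala endpoint:
 * {{{
 * val connector = ImpalaJDBCConnector(flowContext, "jdbc:hive2://impala-host:21050/my_database")
 * }}}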
*/
case class ImpalaJDBCConnector(context: SparkFlowContext,
                               jdbcString: String,
                               properties: java.util.Properties = new java.util.Properties(),
                               secureProperties: Map[String, String] = Map.empty) extends ImpalaDBConnector with JDBCConnector {

  override val driverName: String = "org.apache.hive.jdbc.HiveDriver"

  override def hadoopConfiguration: Configuration = context.spark.sparkContext.hadoopConfiguration
}
/**
 * A dummy Impala database connector that does not submit the DDLs but instead
 * collects every batch of DDLs submitted to it in a List.
 * This is useful for testing, or for flows where you wish to collect
 * the DDLs and run them manually.
*
* @param context The flow context object containing the SparkSession and FileSystem
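 *
 * An illustrative use, assuming an existing [[SparkFlowContext]] named flowContext; how the
 * connector is attached to a flow depends on your flow definition:
 * {{{
 * val connector = ImpalaDummyConnector(flowContext)
 * // ... run a flow that generates DDLs through this connector ...
 * connector.ranDDLs.flatten.foreach(println)
 * }}}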
*/
case class ImpalaDummyConnector(context: SparkFlowContext) extends ImpalaDBConnector {

  /** Every batch of DDLs submitted through this connector, in submission order */
  var ranDDLs: List[List[String]] = List.empty

  override private[metastore] def runQueries(ddls: Seq[String]): Seq[Option[ResultSet]] = {
    ranDDLs = ranDDLs :+ ddls.toList
    Seq(None)
  }
}