
package ai.starlake.job.sink.jdbc

import ai.starlake.config.Settings
import ai.starlake.job.Cmd
import ai.starlake.schema.handlers.SchemaHandler
import ai.starlake.schema.model.{ConnectionType, WriteStrategy, WriteStrategyType}
import ai.starlake.utils.Formatter.RichFormatter
import ai.starlake.utils.JobResult
import org.apache.spark.sql.DataFrame
import scopt.OParser

import java.sql.{DriverManager, SQLException}
import scala.util.{Failure, Success, Try}
object JdbcConnectionLoadCmd extends Cmd[JdbcConnectionLoadConfig] {

  val command = "cnxload"
  def checkTablePresent(
    jdbcOptions: Settings.Connection,
    jdbcEngine: Settings.JdbcEngine,
    outputDomainAndTablename: String
  )(implicit settings: Settings): Unit = {
    assert(
      jdbcOptions.`type` == ConnectionType.JDBC,
      s"Only JDBC connections are supported, got ${jdbcOptions.`type`}"
    )
    // Engine table definitions are keyed by the table name without its domain prefix.
    val table = jdbcEngine.tables.get(outputDomainAndTablename.split('.').last)
    table.foreach { table =>
      val conn = DriverManager.getConnection(
        jdbcOptions.options("url"),
        jdbcOptions.options("user"),
        jdbcOptions.options("password")
      )
      Try {
        val stmt = conn.createStatement
        Try {
          val pingSql = table.effectivePingSql(outputDomainAndTablename)
          val rs = stmt.executeQuery(pingSql)
          rs.close() // we don't need to fetch the result, it should be empty anyway.
        } match {
          case Failure(e) if e.isInstanceOf[SQLException] =>
            // The ping failed: assume the table is missing and create it from the template.
            stmt.executeUpdate(
              table.createSql.richFormat(
                Map(
                  "table" -> outputDomainAndTablename,
                  "writeFormat" -> settings.appConfig.defaultWriteFormat
                ),
                Map.empty
              )
            )
            conn.commit() // some databases are transactional wrt schema updates
          case Failure(e) => throw e
          case Success(_) => // table already exists, nothing to do
        }
        stmt.close()
      }
      conn.close()
    }
  }
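
  /** Builds a [[JdbcConnectionLoadConfig]] from the application configuration, optionally
    * creating the target table first. A minimal usage sketch from a hypothetical sink job,
    * where `df` is an already-loaded DataFrame and "postgres" names a configured connection:
    *
    * {{{
    * val config = JdbcConnectionLoadCmd.fromComet(
    *   connectionRef = "postgres",
    *   comet = settings.appConfig,
    *   sourceFile = Right(df),
    *   outputTable = "sales.orders",
    *   strategy = WriteStrategy(Some(WriteStrategyType("APPEND"))),
    *   createTableIfAbsent = true
    * )
    * }}}
    */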
  def fromComet(
    connectionRef: String,
    comet: Settings.AppConfig,
    sourceFile: Either[String, DataFrame],
    outputTable: String,
    strategy: WriteStrategy,
    createTableIfAbsent: Boolean
  )(implicit settings: Settings): JdbcConnectionLoadConfig = {
    // TODO: wanted to just call this "apply" but we need to get rid of the defaults in the ctor above
    val starlakeConnection = comet.connections(connectionRef)
    val jdbcEngineName = starlakeConnection.getJdbcEngineName()
    if (createTableIfAbsent)
      comet.jdbcEngines.get(jdbcEngineName.toString).foreach { jdbcEngine =>
        checkTablePresent(starlakeConnection, jdbcEngine, outputTable)
      }

    // Make sure the column names are uppercase on JDBC databases.
    // TODO: Once Spark 3.3 is no longer supported, switch to withColumnsRenamed(colsMap: Map[String, String])
    val dfWithUppercaseColumns = sourceFile.map { df =>
      df.columns.foldLeft(df) { case (df, colName) =>
        df.withColumnRenamed(colName, colName.toUpperCase())
      }
    }

    JdbcConnectionLoadConfig(
      sourceFile = dfWithUppercaseColumns,
      outputDomainAndTableName = outputTable.toUpperCase(),
      strategy = strategy,
      starlakeConnection.sparkDatasource().getOrElse("jdbc"),
      starlakeConnection.options
    )
  }
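
  /** scopt parser for the `cnxload` command line: source file, output table, raw JDBC
    * connection options and the write strategy, as described in the option texts below.
    */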
  val parser: OParser[Unit, JdbcConnectionLoadConfig] = {
    val builder = OParser.builder[JdbcConnectionLoadConfig]
    OParser.sequence(
      builder.programName(s"$shell $command"),
      builder.head(shell, command, "[options]"),
      builder.note("""
        |Load a Parquet file into a JDBC table.
        |""".stripMargin),
      builder
        .opt[String]("source_file")
        .action((x, c) => c.copy(sourceFile = Left(x)))
        .text("Full path to the source file")
        .required(),
      builder
        .opt[String]("output_table")
        .action((x, c) => c.copy(outputDomainAndTableName = x))
        .text("JDBC output table")
        .required(),
      builder
        .opt[Map[String, String]]("options")
        .action((x, c) => c.copy(options = x))
        .text(
          "Connection options, e.g. for JDBC: driver, user, password, url, partitions, batchSize"
        ),
      builder
        .opt[String]("write_strategy")
        .action((x, c) => c.copy(strategy = WriteStrategy(Some(WriteStrategyType(x)))))
        .text(
          "One of the write strategies: APPEND, OVERWRITE (see strategy types)"
        )
    )
  }
  // Example invocation:
  // comet cnxload --source_file /path/to/file.parquet --output_table domain.table
  //   --options url=jdbcUrl,user=username,password=pwd,driver=driverClass,partitions=1,batchSize=1000
  //   --write_strategy APPEND
  def parse(args: Seq[String]): Option[JdbcConnectionLoadConfig] =
    OParser.parse(parser, args, JdbcConnectionLoadConfig(), setup)
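
  /** Runs the JDBC load through the workflow and discards the detailed job output,
    * returning an empty [[JobResult]] on success.
    */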
  override def run(config: JdbcConnectionLoadConfig, schemaHandler: SchemaHandler)(implicit
    settings: Settings
  ): Try[JobResult] =
    workflow(schemaHandler).jdbcload(config).map(_ => JobResult.empty)
}
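
// A minimal sketch of driving the command programmatically; `args`, `schemaHandler` and the
// implicit `settings` are assumed to be supplied by the caller:
//
//   JdbcConnectionLoadCmd.parse(args).foreach { config =>
//     JdbcConnectionLoadCmd.run(config, schemaHandler)
//   }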