
com.landoop.streamreactor.connect.hive.sink.HiveSink.scala Maven / Gradle / Ivy
The newest version!
package com.landoop.streamreactor.connect.hive.sink
import com.landoop.streamreactor.connect.hive
import com.landoop.streamreactor.connect.hive._
import com.landoop.streamreactor.connect.hive.sink.config.HiveSinkConfig
import com.landoop.streamreactor.connect.hive.formats.HiveWriter
import com.landoop.streamreactor.connect.hive.sink.mapper.{DropPartitionValuesMapper, MetastoreSchemaAlignMapper, ProjectionMapper}
import com.landoop.streamreactor.connect.hive.sink.partitioning.CachedPartitionHandler
import com.landoop.streamreactor.connect.hive.sink.staging.{CommitPolicy, StageManager}
import com.typesafe.scalalogging.slf4j.StrictLogging
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.hadoop.hive.metastore.api.Table
import org.apache.kafka.connect.data.{Schema, Struct}
import scala.util.{Failure, Success}
/**
* A [[HiveSink]] handles writing records to a single hive table.
*
* It will manage a set of [[HiveWriter]] instances, each of which will handle records in a partition.
* is not partitioned, then this will be a single file.
*
* The hive sink uses a [[CommitPolicy]] to determine when to close a file and start
* a new file. When the file is due to be commited, the [[StageManager]]
* will be invoked to commit the file.
*
* The lifecycle of a sink is managed independently from other sinks.
*
* @param tableName the name of the table to write out to
*/
class HiveSink(tableName: TableName,
config: HiveSinkConfig)
(implicit client: IMetaStoreClient, fs: FileSystem) extends StrictLogging {
private val tableConfig = config.tableOptions.find(_.tableName == tableName)
.getOrElse(sys.error(s"No table config for ${tableName.value}"))
private val writerManager = new HiveWriterManager(tableConfig.format, config.stageManager)
private val partitioner = new CachedPartitionHandler(tableConfig.partitioner)
private val offsets = scala.collection.mutable.Map.empty[TopicPartition, Offset]
private var table: Table = _
private var tableLocation: Path = _
private var plan: Option[PartitionPlan] = _
private var metastoreSchema: Schema = _
private var mapper: Struct => Struct = _
private var lastSchema: Schema = _
private def init(schema: Schema): Unit = {
logger.info(s"Init sink for schema $schema")
def getOrCreateTable(): Table = {
def create = {
val partstring = if (tableConfig.partitions.isEmpty) "" else tableConfig.partitions.mkString(",")
logger.info(s"Creating table in hive [${config.dbName.value}.${tableName.value}, partitions=$partstring]")
hive.createTable(config.dbName, tableName, schema, tableConfig.partitions, tableConfig.location, tableConfig.format)
}
logger.debug(s"Fetching or creating table ${config.dbName.value}.${tableName.value}")
client.tableExists(config.dbName.value, tableName.value) match {
case true if tableConfig.overwriteTable =>
hive.dropTable(config.dbName, tableName, true)
create
case true => client.getTable(config.dbName.value, tableName.value)
case false if tableConfig.createTable => create
case false => throw new RuntimeException(s"Table ${config.dbName.value}.${tableName.value} does not exist")
}
}
table = getOrCreateTable()
tableLocation = new Path(table.getSd.getLocation)
plan = hive.partitionPlan(table)
metastoreSchema = tableConfig.evolutionPolicy.evolve(config.dbName, tableName, HiveSchemas.toKafka(table), schema)
.getOrElse(sys.error(s"Unable to retrieve or evolve schema for $schema"))
val mapperFns: Seq[Struct => Struct] = Seq(
tableConfig.projection.map(new ProjectionMapper(_)),
Some(new MetastoreSchemaAlignMapper(metastoreSchema)),
plan.map(new DropPartitionValuesMapper(_))
).flatten.map(mapper => mapper.map _)
mapper = Function.chain(mapperFns)
}
/**
* Returns the appropriate output directory for the given struct.
*
* If the table is not partitioned, then the directory returned
* will be the location of the table itself, otherwise it will delegate to
* the partitioning policy.
*/
private def outputDir(struct: Struct, plan: Option[PartitionPlan], table: Table): Path = {
plan.fold(tableLocation) { plan =>
val part = hive.partition(struct, plan)
partitioner.path(part, config.dbName, tableName)(client, fs) match {
case Failure(t) => throw t
case Success(path) => path
}
}
}
def write(struct: Struct, tpo: TopicPartitionOffset): Unit = {
// if the schema has changed, or if we haven't yet had a struct, we need
// to make sure the table exists, and that our table metadata is up to date
// given that the table may have evolved.
if (struct.schema != lastSchema) {
init(struct.schema)
lastSchema = struct.schema
}
val dir = outputDir(struct, plan, table)
val mapped = mapper(struct)
val (path, writer) = writerManager.writer(dir, tpo.toTopicPartition, mapped.schema)
val count = writer.write(mapped)
if (fs.exists(path) && tableConfig.commitPolicy.shouldFlush(struct, tpo, path, count)) {
logger.debug(s"Flushing offsets for $dir")
writerManager.flush(tpo, dir)
config.stageManager.commit(path, tpo)
}
offsets.put(tpo.toTopicPartition, tpo.offset)
lastSchema = struct.schema()
}
def close(): Unit = {
writerManager.flush(offsets.toMap)
offsets.clear()
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy