// com.datamountaineer.streamreactor.connect.kudu.sink.KuduWriter.scala Maven / Gradle / Ivy
/*
* Copyright 2017 Datamountaineer.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datamountaineer.streamreactor.connect.kudu.sink
import com.datamountaineer.streamreactor.connect.errors.ErrorHandler
import com.datamountaineer.streamreactor.connect.kudu.KuduConverter
import com.datamountaineer.streamreactor.connect.kudu.config.{KuduConfig, KuduConfigConstants, KuduSettings, WriteFlushMode}
import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import com.typesafe.scalalogging.slf4j.StrictLogging
import org.apache.kafka.connect.data.Schema
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.kudu.client.SessionConfiguration.FlushMode
import org.apache.kudu.client._
import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.util.{Failure, Success, Try}
/**
  * A Connect schema paired with its version number.
  *
  * @param version The schema version
  * @param schema  The Connect schema itself
  */
case class SchemaMap(
  version: Int,
  schema: Schema
)
/**
* Created by [email protected] on 22/02/16.
* stream-reactor
*/
object KuduWriter extends StrictLogging {

  /**
    * Builds a KuduWriter connected to the Kudu master named in the connector configuration.
    *
    * @param config   Connector configuration; the master address is read from
    *                 KuduConfigConstants.KUDU_MASTER
    * @param settings Sink settings handed to the new writer
    * @return A KuduWriter backed by a newly built KuduClient
    */
  def apply(config: KuduConfig, settings: KuduSettings): KuduWriter = {
    val kuduMaster = config.getString(KuduConfigConstants.KUDU_MASTER)
    logger.info(s"Connecting to Kudu Master at $kuduMaster")

    // The client is needed right away as a constructor argument, so build it eagerly.
    val clientBuilder = new KuduClient.KuduClientBuilder(kuduMaster)
    val kuduClient = clientBuilder.build()
    new KuduWriter(kuduClient, settings)
  }
}
/**
  * Sink writer that converts Kafka Connect SinkRecords into Kudu upserts and applies them
  * through a single KuduSession.
  *
  * On construction it attempts to create the target tables (a failure is only logged), opens a
  * session and configures its flush mode and mutation buffer space from the supplied settings.
  *
  * @param client  Kudu client used to open the session and handed to DbHandler for DDL work
  * @param setting Sink settings (flush mode, buffer space, KCQL mappings, error policy, ...)
  */
class KuduWriter(client: KuduClient, setting: KuduSettings) extends StrictLogging with KuduConverter
  with ErrorHandler with ConverterUtil {

  logger.info("Initialising Kudu writer")

  // Best effort: a failure here is not fatal because applyDDLs creates missing tables
  // when the first record for a topic arrives.
  Try(DbHandler.createTables(setting, client)) match {
    case Success(_) =>
    case Failure(f) =>
      logger.warn("Unable to create tables at startup! Tables will be created on delivery of the first record", f)
  }

  private val MUTATION_BUFFER_SPACE = setting.mutationBufferSpace

  // Kept lazy so the table cache is only built on first use, after the create-tables attempt above.
  private lazy val kuduTablesCache = collection.mutable.Map(DbHandler.buildTableCache(setting, client).toSeq: _*)

  // The session is configured immediately below, so there is nothing to gain from laziness.
  private val session = client.newSession()

  // Translate the connector's flush mode into the Kudu session flush mode.
  session.setFlushMode(setting.writeFlushMode match {
    case WriteFlushMode.SYNC =>
      FlushMode.AUTO_FLUSH_SYNC
    case WriteFlushMode.BATCH_SYNC =>
      FlushMode.MANUAL_FLUSH
    case WriteFlushMode.BATCH_BACKGROUND =>
      FlushMode.AUTO_FLUSH_BACKGROUND
  })
  session.setMutationBufferSpace(MUTATION_BUFFER_SPACE)

  // Ignore duplicates in case of redelivery. The getter `isIgnoreAllDuplicateRows` only reads
  // the flag; the setter is required to actually enable the behaviour.
  session.setIgnoreAllDuplicateRows(true)

  // Initialise the error tracker with the configured retry count and error policy.
  initialize(setting.maxRetries, setting.errorPolicy)

  // Last seen schema (and its version) per topic, used to detect schema evolution.
  val schemaCache: mutable.Map[String, SchemaMap] = mutable.Map.empty[String, SchemaMap]

  /**
    * Write SinkRecords to Kudu.
    *
    * If a previous error was recorded (`errored()`), the topic -> table cache is rebuilt first
    * in case the target tables changed.
    *
    * @param records The SinkRecords to write; an empty set is a no-op
    **/
  def write(records: Set[SinkRecord]): Unit = {
    if (records.isEmpty) {
      logger.debug("No records received.")
    } else {
      logger.debug(s"Received ${records.size} records.")
      // If an error occurred, rebuild the cache in case of a change on the target tables.
      if (errored()) {
        // Build the fresh view first so a failure here leaves the existing cache untouched,
        // then really clear the cache (`empty` would only return a new, unused map).
        val rebuilt = DbHandler.buildTableCache(setting, client)
        kuduTablesCache.clear()
        rebuilt.foreach { case (topic, table) => kuduTablesCache.put(topic, table) }
      }
      applyInsert(records, session)
    }
  }

  /**
    * Convert each SinkRecord to a Kudu upsert, apply it on the session and flush in batches.
    *
    * The iterator pipeline is lazy: `grouped` pulls one batch of records through
    * convert -> applyDDLs -> upsert -> session.apply, after which the batch is flushed.
    * Any failure is captured in a Try and passed to `handleTry`
    * (NOTE(review): presumably supplied by ErrorHandler and applying the configured error policy - confirm).
    *
    * @param records The SinkRecords to write
    * @param session The Kudu session the operations are applied on
    **/
  private def applyInsert(records: Set[SinkRecord], session: KuduSession): Unit = {
    // `grouped` requires a positive size; guard against a buffer space of 1 (or less).
    val batchSize = math.max(1, MUTATION_BUFFER_SPACE - 1)
    val t = Try({
      records.iterator
        .map(r => convert(r, setting.fieldsMap(r.topic), setting.ignoreFields(r.topic)))
        .map(r => applyDDLs(r))
        .map(r => convertToKuduUpsert(r, kuduTablesCache(r.topic)))
        .map(i => session.apply(i))
        .grouped(batchSize)
        .foreach(_ => flush())
    })
    handleTry(t)
    logger.debug(s"Written ${records.size}")
  }

  /**
    * Create the Kudu table for the record's topic if it is not cached yet, otherwise check
    * whether the table needs altering.
    *
    * @param record The sink record to create or alter a table for
    * @return The same record, unchanged, so the call can be chained
    **/
  private def applyDDLs(record: SinkRecord): SinkRecord = {
    if (!kuduTablesCache.contains(record.topic())) {
      val mapping = setting.kcql
        .find(f => f.getSource.equals(record.topic()))
        .getOrElse(throw new NoSuchElementException(s"No KCQL mapping found for topic ${record.topic()}"))
      // `.get` rethrows the creation failure so the caller's error handling sees it.
      val table = DbHandler.createTableFromSinkRecord(mapping, record.valueSchema(), client).get
      logger.info(s"Adding table ${mapping.getTarget} to the table cache")
      kuduTablesCache.put(mapping.getSource, table)
    } else {
      handleAlterTable(record)
    }
    record
  }

  /**
    * Alter the target table if the record's schema version is newer than the cached one.
    *
    * Nothing happens unless auto-evolution is enabled for the record's topic. The schema cache
    * is refreshed with the record's schema once any required alter has succeeded.
    *
    * @param record The SinkRecord to check the schema for
    * @return The same record, unchanged
    **/
  def handleAlterTable(record: SinkRecord): SinkRecord = {
    val topic = record.topic()
    val allowEvo = setting.allowAutoEvolve.getOrElse(topic, false)
    if (allowEvo) {
      val schema = record.valueSchema()
      val version = schema.version()
      val table = setting.topicTables(topic)
      // On first sight of a topic the record's own schema is the baseline, so no evolution is detected.
      val cachedSchema = schemaCache.getOrElse(topic, SchemaMap(version, schema))
      val evolving = cachedSchema.version < version
      if (evolving) {
        logger.info(s"Schema change detected for $topic mapped to table $table. Old schema version " +
          s"${cachedSchema.version} new version $version")
        val kuduTable = DbHandler.alterTable(table, cachedSchema.schema, schema, client)
        kuduTablesCache.update(topic, kuduTable)
      }
      // Reached only when no alter was needed or the alter succeeded.
      schemaCache.update(topic, SchemaMap(version, schema))
    }
    record
  }

  /**
    * Flush outstanding operations, then close the Kudu session and shut down the client.
    *
    * The session and client are released even if the flush (or the session close) throws;
    * the exception still propagates to the caller afterwards.
    **/
  def close(): Unit = {
    logger.info("Closing Kudu Session and Client")
    try {
      flush()
    } finally {
      try {
        if (!session.isClosed) session.close()
      } finally {
        client.shutdown()
      }
    }
  }

  /**
    * Force the session to flush its buffers and fail if any row errors were reported.
    *
    * In AUTO_FLUSH_SYNC and MANUAL_FLUSH modes the session is flushed and the returned responses
    * are inspected; in AUTO_FLUSH_BACKGROUND mode the session's pending errors are collected.
    * Does nothing when the session is already closed.
    *
    * @throws RuntimeException if one or more row errors were found
    **/
  def flush(): Unit = {
    if (!session.isClosed) {
      // Throw and let the error policy handle it; we don't want to throw RetriableException.
      // May want to die if the error policy is Throw.
      val errors: String = session.getFlushMode match {
        case FlushMode.AUTO_FLUSH_SYNC | FlushMode.MANUAL_FLUSH =>
          // The response list and its elements are both treated as nullable.
          Option(session.flush()) match {
            case Some(responses) =>
              responses.asScala
                .flatMap(r => Option(r))
                .withFilter(_.hasRowError)
                .map(_.getRowError.toString)
                .mkString(";")
            case None => ""
          }
        case FlushMode.AUTO_FLUSH_BACKGROUND =>
          session.getPendingErrors.getRowErrors
            .map(_.toString)
            .mkString(";")
      }
      if (errors.nonEmpty) {
        throw new RuntimeException(s"Failed to flush one or more changes:$errors")
      }
    }
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy