/*
* Copyright 2017 Datamountaineer.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datamountaineer.streamreactor.connect.kudu.sink
import java.util
import com.datamountaineer.kcql.Kcql
import com.datamountaineer.streamreactor.connect.kudu.KuduConverter
import com.datamountaineer.streamreactor.connect.kudu.config.KuduSettings
import com.datamountaineer.streamreactor.connect.kudu.sink.DbHandler.kuduSchema
import com.datamountaineer.streamreactor.connect.schemas.SchemaRegistry
import com.typesafe.scalalogging.slf4j.StrictLogging
import org.apache.avro.{JsonProperties, Schema}
import org.apache.kafka.connect.errors.ConnectException
import org.apache.kudu.ColumnSchema
import org.apache.kudu.client._
import scala.collection.JavaConversions._
import scala.util.{Failure, Success, Try}
/**
* Created by [email protected] on 06/06/16.
* stream-reactor-maven
*/
case class CreateTableProps(name: String, schema: kuduSchema, cto: CreateTableOptions)
object DbHandler extends StrictLogging with KuduConverter {
type kuduSchema = org.apache.kudu.Schema
type avroSchema = org.apache.avro.Schema
type avroField = org.apache.avro.Schema.Field
type connectSchema = org.apache.kafka.connect.data.Schema
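/**
* Check that every KCQL target table either exists in Kudu or is flagged for auto creation
*
* @param client A Kudu client used to list the existing tables
* @param settings Settings containing the KCQL mappings to verify
**/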
def checkTables(client: KuduClient, settings: KuduSettings): Unit = {
val kuduTables = client.getTablesList.getTablesList
logger.info(s"Found the following tables in Kudu, ${kuduTables.mkString(",")}")
val tables = settings.kcql.map(s => s.getTarget.trim).toSet
val missing = tables diff kuduTables.toSet
//filter out the auto-create mappings: their schema may not exist in the registry yet and the
//tables will be created on arrival of the first message
missing
.flatMap(m => settings.kcql.filter(f => f.getTarget.trim.equals(m) && f.isAutoCreate))
.foreach(a => logger.warn(s"Kudu table ${a.getTarget} does not exist and is marked for auto-create; it will be created on arrival of the first record."))
val finalList = missing.flatMap(m => settings.kcql.filter(f => f.getTarget.trim.equals(m) && !f.isAutoCreate))
if (finalList.nonEmpty) {
throw new ConnectException(s"The following tables are not found and not set for autocreate" +
s" ${finalList.map(f => f.getTarget).mkString(",")}. Check you aren't missing the namespace (impala::database.table) " +
s"for Impala managed tables!")
}
}
/**
* Build a cache of Kudu tables per topic and check that the target tables exist for those topics
*
* @param settings Settings containing the mapping of topic to table
* @param client A Kudu client used to open the tables
* @return A Map of topic -> KuduTable
**/
def buildTableCache(settings: KuduSettings, client: KuduClient): Map[String, KuduTable] = {
checkTables(client, settings)
settings.kcql.map(s => (s.getSource, client.openTable(s.getTarget))).toMap
}
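// Usage sketch (illustrative only; `settings` and `client` are assumed to be built by the sink
// task from its configuration and are not defined in this file):
//   val cache: Map[String, KuduTable] = DbHandler.buildTableCache(settings, client)
//   val table: KuduTable = cache("my-topic") // keyed by the KCQL source topic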
/**
* Creates tables in Kudu for the auto-create mappings whose target does not exist yet
*
* @param setting A KuduSettings with the list of tables to create
* @param client A Kudu client to execute the DDL
* @return The set of created Kudu tables
**/
def createTables(setting: KuduSettings,
client: KuduClient): Set[KuduTable] = {
checkTables(client, setting)
//check the schema registry for a schema for this topic
val url = setting.schemaRegistryUrl
val subjects = SchemaRegistry.getSubjects(url).toSet
subjects
.flatMap(_ => {
setting
.kcql
.filter(r => r.isAutoCreate && !client.tableExists(r.getTarget)) //don't try to create existing tables
.map(m => {
var lkTopic = m.getSource
if (!subjects.contains(lkTopic)) {
if (subjects.contains(lkTopic + "-value")) {
lkTopic = lkTopic + "-value"
}
}
createTableProps(SchemaRegistry.getSchema(url, lkTopic), m, url, client)
})
}).flatten
.map(ctp => executeCreateTable(ctp, client))
}
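// Usage sketch (illustrative only; assumes `settings.schemaRegistryUrl` points at a reachable
// schema registry and `client` is a connected KuduClient):
//   val created: Set[KuduTable] = DbHandler.createTables(settings, client)
//   created.foreach(t => logger.info(s"Auto-created Kudu table ${t.getName}"))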
/**
* Build the create table properties for a Kudu table from its Avro schema
*
* @param schema The Avro schema as a JSON string, empty if the subject has no schema yet
* @param kcql The mapping configuration for the target table
* @param url The schema registry url
* @param client The Kudu client (not used when only building the props)
* @return A set containing the CreateTableProps, empty if no schema was found
**/
def createTableProps(schema: String,
kcql: Kcql,
url: String,
client: KuduClient): Set[CreateTableProps] = {
if (schema.nonEmpty) {
val kuduSchema = getKuduSchema(kcql, schema)
val cto = getCreateTableOptions(kcql)
val createTableProps = CreateTableProps(kcql.getTarget, kuduSchema, cto)
Set(createTableProps)
} else {
Set.empty[CreateTableProps]
}
}
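// Sketch of building the props for a single mapping (illustrative; the KCQL string and the
// schema registry subject are hypothetical):
//   val kcql = Kcql.parse("INSERT INTO impala::default.person SELECT * FROM person_topic " +
//     "AUTOCREATE DISTRIBUTEBY id INTO 10 BUCKETS")
//   val avro = SchemaRegistry.getSchema(url, "person_topic-value")
//   val props: Set[CreateTableProps] = DbHandler.createTableProps(avro, kcql, url, client)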
/**
* Convert an Avro schema string to a Kudu schema
*
* @param config The KCQL config containing the fields and mappings set for the sink
* @param schema The topic's Avro schema as a JSON string
* @return The Kudu schema
**/
def getKuduSchema(config: Kcql, schema: String): kuduSchema = {
//parse the Avro schema retrieved from the schema registry
val avroFields = new Schema.Parser().parse(schema)
//build the columns
val kuduCols = getKuduCols(config, avroFields)
new kuduSchema(kuduCols)
}
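// Sketch: the Avro schema JSON returned by the registry is parsed and its fields become Kudu
// columns (the schema and KCQL below are purely illustrative; the KCQL must carry a
// DISTRIBUTEBY ... INTO n BUCKETS clause so the primary keys can be derived):
//   val avroJson =
//     """{"type":"record","name":"person","fields":[{"name":"id","type":"int"},
//       |{"name":"name","type":["null","string"],"default":null}]}""".stripMargin
//   val schema: kuduSchema = DbHandler.getKuduSchema(kcql, avroJson)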
/**
* Convert Avro fields to Kudu columns
*
* @param kcql The config containing the fields and mappings set for the sink
* @param avroFields The Avro schema whose fields will be converted
* @return A list of Kudu columns
**/
private def getKuduCols(kcql: Kcql, avroFields: avroSchema): util.List[ColumnSchema] = {
if (kcql.getFields.head.getName.equals("*")) {
logger.info(s"All fields from the topic will be used to create Kudu table ${kcql.getTarget}.")
} else {
logger.info(s"Using fields ${kcql.getFields.map(f => f.getName).mkString(",")} to create ${kcql.getTarget}")
}
val mappingFields = kcql.getFields.map(f => (f.getName, f.getAlias)).toMap
val ignored = kcql.getIgnoredFields.toSet
val fields = avroFields.getFields.filterNot(f => ignored.contains(f.name()))
//only allow auto creation if distribute by and bucketing are specified
val pks = Try(kcql.getBucketing.getBucketNames.toSet) match {
case Success(s) => s
case Failure(_) => throw new ConnectException("DISTRIBUTEBY columns INTO BUCKETS n must be specified for table " +
"auto creation!")
}
val cols = fields.map(f => {
val fieldName = f.name()
val alias = if (mappingFields.contains(fieldName)) mappingFields(fieldName) else fieldName
val col = fromAvro(f.schema(), alias)
val default = if (f.defaultValue() != JsonProperties.NULL_VALUE) f.defaultValue() else null
if (pks.contains(alias)) {
logger.info(s"Setting PK on ${f.name()} for ${kcql.getTarget}")
col.key(true)
} else {
col.nullable(true)
if (default != null) col.defaultValue(default)
}
col.build()
}).toList
logger.info(s"Setting columns as ${cols.map(c => c.getName).mkString(",")} for ${kcql.getTarget}")
cols
}
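// Illustrative mapping: with a KCQL such as
//   INSERT INTO impala::default.person SELECT * FROM person_topic AUTOCREATE DISTRIBUTEBY id INTO 10 BUCKETS
// the Avro field `id` becomes the (non nullable) Kudu primary key and every remaining field is
// added as a nullable column, carrying over its Avro default value where one is set.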
/**
* Alter a Kudu table, new columns only
*
* @param table The table to alter
* @param old The old schema
* @param current The current schema
* @param client A Kudu client to execute the DDL
**/
def alterTable(table: String,
old: connectSchema,
current: connectSchema,
client: KuduClient): KuduTable = {
val ato = compare(old, current)
ato.foreach(a => executeAlterTable(a, table, client))
client.openTable(table)
}
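// Usage sketch (illustrative; `oldSchema` and `newSchema` would be the previous and latest
// Connect value schemas observed for the topic):
//   val evolved: KuduTable = DbHandler.alterTable("impala::default.person", oldSchema, newSchema, client)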
/**
* Compare two connect schemas and return a Kudu AlterTableOptions list
*
* @param old The old schema
* @param current The current schema
* @return A list of AlterTableOptions
**/
def compare(old: connectSchema, current: connectSchema): List[AlterTableOptions] = {
//look for new fields only
logger.info("Comparing schemas for new fields.")
val diff = current.fields().toSet.diff(old.fields().toSet)
diff.map(d => {
val schema = convertConnectField(d)
val ato = new AlterTableOptions()
if (null == schema.getDefaultValue) {
logger.info(s"Adding nullable column ${schema.getName}, type ${schema.getType}")
ato.addNullableColumn(schema.getName, schema.getType)
} else {
logger.info(s"Adding column ${schema.getName}, type ${schema.getType}, default ${schema.getDefaultValue}")
ato.addColumn(schema.getName, schema.getType, schema.getDefaultValue)
}
}).toList
}
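// Sketch with Kafka Connect's SchemaBuilder (hypothetical schemas; only fields present in
// `current` but missing from `old` produce an alter option):
//   import org.apache.kafka.connect.data.{Schema => ConnectSchema, SchemaBuilder}
//   val oldS = SchemaBuilder.struct().field("id", ConnectSchema.INT32_SCHEMA).build()
//   val newS = SchemaBuilder.struct().field("id", ConnectSchema.INT32_SCHEMA)
//     .field("email", ConnectSchema.OPTIONAL_STRING_SCHEMA).build()
//   val alters: List[AlterTableOptions] = DbHandler.compare(oldS, newS)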
/**
* Execute an alter table DDL
*
* @param ato The Kudu alter table options
* @param target The name of the table to alter
* @param client A client to use to execute the DDL
**/
private def executeAlterTable(ato: AlterTableOptions, target: String, client: KuduClient): Unit = {
logger.info(s"Executing alter table on $target with ${ato.toString}")
client.alterTable(target, ato)
//wait for alter table
while (!client.isAlterTableDone(target)) {
logger.info(s"Waiting to alter table to complete for table $target")
}
logger.info(s"Altered table $target. Added ${ato.toString}")
}
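/**
* Create the target Kudu table for a mapping from the Connect schema of an incoming record
*
* @param kcql The mapping configuration for the target table
* @param schema The Connect schema of the sink record
* @param client A Kudu client to execute the DDL
* @return The created KuduTable, or a Failure if the mapping is not set for auto creation
**/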
def createTableFromSinkRecord(kcql: Kcql, schema: connectSchema, client: KuduClient): Try[KuduTable] = {
if (kcql.isAutoCreate) {
val cto = getCreateTableOptions(kcql)
val kuduSchema = convertToKuduSchema(schema)
val ctp = CreateTableProps(kcql.getTarget, kuduSchema, cto)
Success(executeCreateTable(ctp, client))
} else {
Failure(new ConnectException(s"Mapping ${kcql.toString} not configured for Auto table creation"))
}
}
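// Usage sketch (illustrative; `record` is a SinkRecord received by the sink task):
//   DbHandler.createTableFromSinkRecord(kcql, record.valueSchema(), client) match {
//     case Success(table) => logger.info(s"Created Kudu table ${table.getName}")
//     case Failure(e) => logger.warn(e.getMessage)
//   }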
/**
* Execute a create table DDL
*
* @param ctp The create table properties containing the table name, schema and create table options
* @param client A client to use to execute the DDL
* @return The created KuduTable
**/
private[kudu] def executeCreateTable(ctp: CreateTableProps, client: KuduClient): KuduTable = {
logger.info(s"Executing create table on ${ctp.name} with ${ctp.schema.toString} and props ${ctp.cto.toString}")
val table = client.createTable(ctp.name, ctp.schema, ctp.cto)
logger.info(s"Table ${ctp.name} created.")
table
}
/**
* Create Kudu CreateTableOptions, defaulting to hash partitioning for now
*
* @param config The mapping config
* @return A CreateTableOptions with hash partitions on the DISTRIBUTEBY columns
**/
private def getCreateTableOptions(config: Kcql): CreateTableOptions = {
new CreateTableOptions()
.addHashPartitions(config.getBucketing.getBucketNames.toList, config.getBucketing.getBucketsNumber)
}
}