
/*
* Copyright (c) 2020 Adform
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

package com.adform.streamloader.vertica

import com.adform.streamloader.model.{StreamPosition, Timestamp}
import com.adform.streamloader.sink.batch.storage.InDataOffsetBatchStorage
import com.adform.streamloader.util.Logging
import org.apache.kafka.common.TopicPartition

import java.sql.{Connection, SQLDataException, Timestamp => SqlTimestamp}
import javax.sql.DataSource
import scala.collection.mutable
import scala.util.Using

/**
 * A Vertica file storage implementation that loads data to some table and commits offsets to a separate dedicated
 * offset table. The commit happens in a single transaction; offsets and data can be joined using a file ID that is
 * stored in both the data table and the offset table. The offset table contains only ranges of offsets from each
 * topic partition contained in the file. Its structure should look as follows (all names can be customized):
 *
 * {{{
 *   CREATE TABLE file_offsets (
 *       _file_id INT NOT NULL,
 *       _consumer_group VARCHAR(128) NOT NULL,
 *       _topic VARCHAR(128) NOT NULL,
 *       _partition INT NOT NULL,
 *       _start_offset INT NOT NULL,
 *       _start_watermark TIMESTAMP NOT NULL,
 *       _end_offset INT NOT NULL,
 *       _end_watermark TIMESTAMP NOT NULL
 *   );
 * }}}
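 *
 * Loaded rows and their offset ranges can then be related by joining on the file ID. A sketch, assuming the data
 * table is named `events` and also stores the file ID in a `_file_id` column:
 *
 * {{{
 *   SELECT o._topic, o._partition, o._start_offset, o._end_offset, COUNT(*)
 *   FROM events e
 *   JOIN file_offsets o ON e._file_id = o._file_id
 *   GROUP BY o._topic, o._partition, o._start_offset, o._end_offset;
 * }}}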
 *
 * Compared to the [[InRowOffsetVerticaFileStorage]] this implementation does not preserve individual row offsets.
 * In return it is much less expensive in terms of licensed data usage, as the license is calculated from the size
 * the data occupies when converted to strings, ignoring compression and encoding. Thus, while physically storing
 * the topic name, partition and offset next to each row costs little (this data compresses very well), it can be
 * significant when auditing data usage for licensing.
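 *
 * A minimal construction sketch using the companion builder; `verticaDataSource` and the table names below are
 * placeholders for your own data source and tables:
 *
 * {{{
 *   val storage = ExternalOffsetVerticaFileStorage
 *     .builder()
 *     .dbDataSource(verticaDataSource)
 *     .table("events")
 *     .offsetTable("file_offsets")
 *     .build()
 * }}}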
 */
class ExternalOffsetVerticaFileStorage(
    dbDataSource: DataSource,
    table: String,
    offsetTable: String,
    fileIdColumnName: String,
    consumerGroupColumnName: String,
    topicColumnName: String,
    partitionColumnName: String,
    startOffsetColumnName: String,
    startWatermarkColumnName: String,
    endOffsetColumnName: String,
    endWatermarkColumnName: String
) extends InDataOffsetBatchStorage[ExternalOffsetVerticaFileRecordBatch]
    with Logging {
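
  /**
   * Looks up the committed stream positions of the current consumer group by querying the offset table,
   * taking for each topic partition the maximum committed end offset plus one and the maximum end watermark.
   */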
  def committedPositions(connection: Connection): Map[TopicPartition, StreamPosition] = {
    val query =
      s"SELECT $topicColumnName, $partitionColumnName, MAX($endOffsetColumnName) + 1, MAX($endWatermarkColumnName) " +
        s"FROM $offsetTable " +
        s"WHERE $consumerGroupColumnName = ? " +
        s"GROUP BY $topicColumnName, $partitionColumnName"

    log.info(s"Running stream position query: $query")

    Using.resource(connection.prepareStatement(query)) { statement =>
      statement.setString(1, kafkaContext.consumerGroup)
      Using.resource(statement.executeQuery()) { result =>
        val positions: mutable.HashMap[TopicPartition, StreamPosition] = mutable.HashMap.empty
        while (result.next()) {
          if (!result.wasNull()) {
            val topicPartition = new TopicPartition(result.getString(1), result.getInt(2))
            val position = StreamPosition(result.getLong(3), Timestamp(result.getTimestamp(4).getTime))
            positions.put(topicPartition, position)
          }
        }
        positions.toMap
      }
    }
  }
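
  /**
   * Retrieves the committed positions for the given topic partitions using a new connection from the data source.
   */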
  override def committedPositions(topicPartitions: Set[TopicPartition]): Map[TopicPartition, Option[StreamPosition]] = {
    Using.resource(dbDataSource.getConnection) { connection =>
      val positions = committedPositions(connection)
      topicPartitions.map(tp => (tp, positions.get(tp))).toMap
    }
  }
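
  /**
   * Commits a batch by inserting its offset ranges to the offset table and copying the batch file to the data
   * table, all in a single transaction that is rolled back if loading fails.
   */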
  override def commitBatchWithOffsets(batch: ExternalOffsetVerticaFileRecordBatch): Unit = {
    Using.resource(dbDataSource.getConnection) { connection =>
      connection.setAutoCommit(false)
      val copyQuery = batch.copyStatement(table)
      try {
        val inserts = batch.recordRanges.map { range =>
          val batchInsertQuery =
            s"INSERT INTO $offsetTable " +
              s"($fileIdColumnName, $consumerGroupColumnName, $topicColumnName, $partitionColumnName, $startOffsetColumnName, $startWatermarkColumnName, $endOffsetColumnName, $endWatermarkColumnName) " +
              s"VALUES " +
              s"(?, ?, ?, ?, ?, ?, ?, ?)"
          // Wrap the statement in Using.resource so it is closed once executed.
          Using.resource(connection.prepareStatement(batchInsertQuery)) { batchInsertStatement =>
            batchInsertStatement.setLong(1, batch.fileId)
            batchInsertStatement.setString(2, kafkaContext.consumerGroup)
            batchInsertStatement.setString(3, range.topic)
            batchInsertStatement.setInt(4, range.partition)
            batchInsertStatement.setLong(5, range.start.offset)
            batchInsertStatement.setTimestamp(6, new SqlTimestamp(range.start.watermark.millis))
            batchInsertStatement.setLong(7, range.end.offset)
            batchInsertStatement.setTimestamp(8, new SqlTimestamp(range.end.watermark.millis))
            log.debug(s"Running statement: $batchInsertQuery")
            val insertResult = batchInsertStatement.executeUpdate()
            log.debug(s"Successfully inserted $insertResult batch metadata record(s)")
            insertResult
          }
        }
        Using.resource(connection.prepareStatement(copyQuery)) { copyStatement =>
          log.info(s"Running statement: $copyQuery")
          val copyResult = copyStatement.executeUpdate()
          log.info(s"Successfully committed $copyResult record(s) and updated ${inserts.sum} batch metadata record(s)")
        }
        connection.commit()
      } catch {
        case e: SQLDataException =>
          log.error(e)("Failed inserting data, rolling back the transaction")
          connection.rollback()
          throw e
      }
    }
  }
}

object ExternalOffsetVerticaFileStorage {
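
  /**
   * Builder for [[ExternalOffsetVerticaFileStorage]] instances, with default offset table column names matching
   * the example schema in the class documentation.
   */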
  case class Builder(
      private val _dbDataSource: DataSource,
      private val _table: String,
      private val _offsetTable: String,
      private val _fileIdColumnName: String = "_file_id",
      private val _consumerGroupColumnName: String = "_consumer_group",
      private val _topicColumnName: String = "_topic",
      private val _partitionColumnName: String = "_partition",
      private val _startOffsetColumnName: String = "_start_offset",
      private val _startWatermarkColumnName: String = "_start_watermark",
      private val _endOffsetColumnName: String = "_end_offset",
      private val _endWatermarkColumnName: String = "_end_watermark"
  ) {

    /**
     * Sets a data source for Vertica JDBC connections.
     */
    def dbDataSource(source: DataSource): Builder = copy(_dbDataSource = source)

    /**
     * Sets the table to load data to.
     */
    def table(name: String): Builder = copy(_table = name)

    /**
     * Sets the name of the table used for storing offsets.
     */
    def offsetTable(name: String): Builder = copy(_offsetTable = name)

    /**
     * Sets the names of the columns in the offset table.
     * Used when querying the offset table during initialization to look up committed stream positions.
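     *
     * For example, to match an offsets table whose topic and partition columns are named differently
     * (the names here are illustrative):
     * {{{
     *   builder.offsetTableColumnNames(topic = "kafka_topic", partition = "kafka_partition")
     * }}}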
     */
    def offsetTableColumnNames(
        fileId: String = "_file_id",
        consumerGroup: String = "_consumer_group",
        topic: String = "_topic",
        partition: String = "_partition",
        startOffset: String = "_start_offset",
        startWatermark: String = "_start_watermark",
        endOffset: String = "_end_offset",
        endWatermark: String = "_end_watermark"
    ): Builder =
      copy(
        _fileIdColumnName = fileId,
        _consumerGroupColumnName = consumerGroup,
        _topicColumnName = topic,
        _partitionColumnName = partition,
        _startOffsetColumnName = startOffset,
        _startWatermarkColumnName = startWatermark,
        _endOffsetColumnName = endOffset,
        _endWatermarkColumnName = endWatermark
      )

    def build(): ExternalOffsetVerticaFileStorage = {
      if (_dbDataSource == null) throw new IllegalStateException("Must provide a Vertica data source")
      if (_table == null) throw new IllegalStateException("Must provide a valid table name")
      if (_offsetTable == null) throw new IllegalStateException("Must provide a valid offset table name")

      new ExternalOffsetVerticaFileStorage(
        _dbDataSource,
        _table,
        _offsetTable,
        _fileIdColumnName,
        _consumerGroupColumnName,
        _topicColumnName,
        _partitionColumnName,
        _startOffsetColumnName,
        _startWatermarkColumnName,
        _endOffsetColumnName,
        _endWatermarkColumnName
      )
    }
  }
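
  /**
   * Creates a new [[Builder]] instance.
   */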
  def builder(): Builder = Builder(null, null, null)
}