All Downloads are FREE. Search and download functionalities are using the official Maven repository.

jp.co.bizreach.kinesis.spark.KinesisRDDWriter.scala Maven / Gradle / Ivy

There is a newer version: 0.0.3
Show newest version
package jp.co.bizreach.kinesis.spark

import com.amazonaws.auth.AWSCredentialsProvider
import com.amazonaws.regions.Regions
import jp.co.bizreach.kinesis._
import org.apache.commons.codec.digest.DigestUtils
import org.apache.spark.TaskContext
import org.json4s.jackson.JsonMethods
import org.json4s.{Extraction, Formats, DefaultFormats}
import org.slf4j.LoggerFactory

class KinesisRDDWriter[A <: AnyRef](streamName: String, region: Regions,
                                    credentials: Class[_ <: AWSCredentialsProvider],
                                    chunk: Int) extends Serializable {
  private val logger = LoggerFactory.getLogger(getClass)

  val write = (task: TaskContext, data: Iterator[A]) => {
    // send data, including retry
    def put(a: Seq[PutRecordsEntry]) = KinesisRDDWriter.client(credentials)(region)
      .putRecordsWithRetry(PutRecordsRequest(streamName, a))
      .zipWithIndex.collect { case (Left(e), i) => a(i) -> s"${e.errorCode}: ${e.errorMessage}" }

    val errors = data.foldLeft(
      (Nil: Seq[PutRecordsEntry], Nil: Seq[(PutRecordsEntry, String)])
    ){ (z, x) =>
      val (records, failed) = z
      val payload = serialize(x)
      val entry   = PutRecordsEntry(DigestUtils.sha256Hex(payload), payload)

      // record exceeds max size
      if (entry.recordSize > recordMaxDataSize)
        records -> ((entry -> "per-record size limit") +: failed)

      // execute
      else if (records.size >= chunk || (records.map(_.recordSize).sum + entry.recordSize) >= recordsMaxDataSize)
        (entry +: Nil) -> (put(records) ++ failed)

      // buffering
      else
        (entry +: records) -> failed
    } match {
      case (Nil, e)  => e
      case (rest, e) => put(rest) ++ e
    }

    // failed records
    if (errors.nonEmpty) dump(errors)
  }

  protected def dump(errors: Seq[(PutRecordsEntry, String)]): Unit =
    logger.error(
      s"""Could not put record, count: ${errors.size}, following details:
         |${errors map { case (entry, message) => message + "\n" + new String(entry.data, "UTF-8") } mkString "\n"}
       """.stripMargin)

  protected def serialize(a: A)(implicit formats: Formats = DefaultFormats): Array[Byte] =
    JsonMethods.mapper.writeValueAsBytes(Extraction.decompose(a)(formats))

}

object KinesisRDDWriter {
  private val cache = collection.concurrent.TrieMap.empty[Regions, AmazonKinesisClient]

  private val client: Class[_ <: AWSCredentialsProvider] => Regions => AmazonKinesisClient = {
    credentials => implicit region =>
      cache.getOrElseUpdate(region, AmazonKinesisClient(credentials.getConstructor().newInstance()))
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy