package com.landoop.streamreactor.connect.hive.sink

import com.landoop.streamreactor.connect.hive.formats.{HiveFormat, HiveWriter}
import com.landoop.streamreactor.connect.hive.sink.staging.StageManager
import com.landoop.streamreactor.connect.hive.{Offset, TopicPartition, TopicPartitionOffset}
import com.typesafe.scalalogging.slf4j.StrictLogging
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.Schema

/**
 * Manages the lifecycle of [[HiveWriter]] instances.
 *
 * A given sink may be writing to multiple locations (one per partition), and
 * it is therefore convenient to extract writer management into its own class.
 *
 * This class is not thread safe. It is not designed to be shared between
 * concurrent sinks, since file handles cannot be safely shared without
 * considerable overhead.
 *
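 * A minimal usage sketch (illustrative only: the `format`, `stageManager`,
 * `fs` and `schema` values are placeholders, and the `Topic`/`Offset`
 * constructors are assumed shapes from the surrounding package):
 *
 * {{{
 * implicit val fs: FileSystem = ???                // hypothetical Hadoop filesystem
 * val manager = new HiveWriterManager(format = ???, stageManager = ???)
 *
 * val tp     = TopicPartition(Topic("logs"), 0)
 * val dir    = new Path("db/table/partition=0")    // hypothetical staging directory
 * val schema: Schema = ???                         // Kafka Connect record schema
 *
 * val (stagedPath, writer) = manager.writer(dir, tp, schema)
 * // ... write records through `writer`, then commit the staged file:
 * manager.flush(tp.withOffset(Offset(100L)), dir)
 * }}}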
 */
class HiveWriterManager(format: HiveFormat,
                        stageManager: StageManager)
                       (implicit fs: FileSystem) extends StrictLogging {

  case class WriterKey(tp: TopicPartition, dir: Path)

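  // Open writers keyed by (topic partition, destination directory); each value
  // pairs the staged file path with the writer currently appending to it.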
  private val writers = scala.collection.mutable.Map.empty[WriterKey, (Path, HiveWriter)]

  private def createWriter(dir: Path,
                           tp: TopicPartition,
                           schema: Schema): (Path, HiveWriter) = {
    val path = stageManager.stage(dir, tp)
    logger.debug(s"Staging new writer at path [$path]")
    val writer = format.writer(path, schema)
    (path, writer)
  }

  /**
   * Returns a writer that can write records for a particular topic and partition.
   * The writer will create a file inside the given directory if there is no open writer.
   *
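   * A minimal usage sketch (`manager` and `schema` are assumed in scope; the
   * directory, topic name and `Topic` shape are illustrative assumptions):
   *
   * {{{
   * val (stagedPath, hiveWriter) =
   *   manager.writer(new Path("db/table/partition=0"), TopicPartition(Topic("logs"), 0), schema)
   * // a repeated call with the same (topic partition, directory) returns the same writer
   * }}}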
   */
  def writer(dir: Path, tp: TopicPartition, schema: Schema): (Path, HiveWriter) = {
    writers.getOrElseUpdate(WriterKey(tp, dir), createWriter(dir, tp, schema))
  }

  /**
   * Flushes the open writer for the given topic partition and directory.
   *
   * Next time a writer is requested for the given (topic, partition, directory),
   * a new writer will be created.
   *
   * The directory is required, as there may be multiple writers, one per partition.
   * The offset is required as part of the commit filename.
   *
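   * A minimal usage sketch (`manager` is assumed in scope; the topic name,
   * offset and directory are illustrative assumptions):
   *
   * {{{
   * manager.flush(TopicPartition(Topic("logs"), 0).withOffset(Offset(100L)),
   *               new Path("db/table/partition=0"))
   * }}}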
   */
  def flush(tpo: TopicPartitionOffset, dir: Path): Unit = {
    logger.info(s"Flushing writer for $tpo")
    val key = WriterKey(tpo.toTopicPartition, dir)
    writers.get(key).foreach { case (path, writer) =>
      writer.close()
      stageManager.commit(path, tpo)
      writers.remove(key)
    }
  }

  /**
   * Flushes all open writers.
   *
   * @param offsets the offset for each [[TopicPartition]], which is required
   *                by the commit process
   *
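   * A minimal usage sketch (`manager` is assumed in scope; the topic name and
   * offset are illustrative assumptions):
   *
   * {{{
   * manager.flush(Map(TopicPartition(Topic("logs"), 0) -> Offset(100L)))
   * }}}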
   */
  def flush(offsets: Map[TopicPartition, Offset]): Unit = {
    logger.info(s"Flushing offsets $offsets")
    // we may not have an offset for a given topic/partition if no data was written to that TP
    writers.foreach { case (key, (path, writer)) =>
      writer.close()
      offsets.get(key.tp).foreach { offset =>
        stageManager.commit(path, key.tp.withOffset(offset))
      }
    }
    // every writer has now been closed, so evict them all; the next request for
    // a writer will stage a fresh file rather than reuse a closed handle
    writers.clear()
  }
}