io.eels.component.hive.HiveSinkWriter.scala

package io.eels.component.hive

import com.sksamuel.exts.Logging
import com.sksamuel.exts.OptionImplicits._
import com.typesafe.config.ConfigFactory
import io.eels.component.hive.partition.PartitionStrategy
import io.eels.schema.{Partition, PartitionEntry, StructType}
import io.eels.util.HdfsIterator
import io.eels.{Row, SinkWriter}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.metastore.IMetaStoreClient

import scala.collection.concurrent.TrieMap
import scala.math.BigDecimal.RoundingMode
import scala.math.BigDecimal.RoundingMode.RoundingMode
import scala.util.control.NonFatal

class HiveSinkWriter(sourceSchema: StructType,
                     metastoreSchema: StructType,
                     dbName: String,
                     tableName: String,
                     partitionKeys: Seq[String],
                     // a discriminator for the file names, needed when we are writing to the same table
                     // with multiple threads
                     discriminator: Option[String],
                     dialect: HiveDialect,
                     partitionStrategy: PartitionStrategy,
                     filenameStrategy: FilenameStrategy,
                     stagingStrategy: StagingStrategy,
                     evolutionStrategy: MetastoreSchemaHandler,
                     alignStrategy: AlignmentStrategy,
                     outputSchemaStrategy: OutputSchemaStrategy,
                     inheritPermissions: Option[Boolean],
                     permission: Option[FsPermission],
                     fileListener: FileListener,
                     callbacks: Seq[CommitCallback],
                     roundingMode: RoundingMode = RoundingMode.UNNECESSARY,
                     metadata: Map[String, String])
                    (implicit fs: FileSystem,
                     conf: Configuration,
                     client: IMetaStoreClient) extends SinkWriter with Logging {

  private val hiveOps = new HiveOps(client)
  private val config = ConfigFactory.load()
  private val inheritPermissionsDefault = config.getBoolean("eel.hive.sink.inheritPermissions")

  // Since the data can arrive unordered, we keep the stream for each partition open;
  // otherwise we would be opening and closing streams frequently.
  private val streams = TrieMap.empty[Path, HiveOutputStream]
  private val tablePath = hiveOps.tablePath(dbName, tableName)
  private val writeSchema = outputSchemaStrategy.resolve(metastoreSchema, partitionKeys, client)
  private val aligner = alignStrategy.create(writeSchema)

  // the extractor should use the same schema as the incoming rows
  private val partitionExtractor = new HivePartitionExtractor(sourceSchema, partitionKeys)

  case class WriteStatus(path: Path, fileSizeInBytes: Long, records: Int)

  // returns a WriteStatus for each file written: the path, the size of the file in bytes,
  // and the number of records written to that file
  def writeStats(): Seq[WriteStatus] = {
    streams.values.map { writer =>
      WriteStatus(writer.path, fs.getFileStatus(writer.path).getLen, writer.records)
    }.toVector
  }

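  // writes a single row to the output stream for its partition (or the table-level stream when unpartitioned)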
  override def write(row: Row): Unit = {
    val stream = getOrCreateHiveWriter(row)
    // need to ensure the row is compatible with the write schema
    stream.write(aligner.align(row))
  }

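  // closes all open output streams and, if staging was enabled, commits the staged files
  // into their final locations and notifies any registered commit callbacks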
  override def close(): Unit = {
    logger.debug("Closing hive sink writer")

    logger.debug(s"Closing ${streams.size} hive output stream(s)")
    streams.values.foreach(_.close)

    if (stagingStrategy.staging) {
      logger.info("Staging was enabled, committing staging files to public")

      // move files from the staging area into the public area
      streams.foreach { case (location, writer) =>
        val stagingPath = writer.path
        val finalPath = new Path(location, stagingPath.getName)
        logger.debug(s"Committing file $stagingPath => $finalPath")
        fs.rename(stagingPath, finalPath)
        callbacks.foreach(_.onCommit(stagingPath, finalPath))
      }

      // check if the staging directories are empty and if so, nuke'em
      streams.values.foreach { writer =>
        val stagingDir = writer.path.getParent
        if (HdfsIterator.remote(fs.listFiles(stagingDir, false)).isEmpty) {
          logger.debug(s"Deleting staging directory $stagingDir")
          fs.delete(stagingDir, false)
        }
      }

      logger.info("Commit completed")
      callbacks.foreach(_.onCommitComplete)
    }

    logger.info("Hive write completed")
  }

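  // opens a new hive output stream for the given partition (or table) location,
  // notifying the file listener of the file that will be written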
  private def createWriter(location: Path): HiveOutputStream = try {
    logger.debug(s"Requesting new hive output stream for location $location")

    val filePath = outputPath(location)
    logger.debug(s"Hive output stream will write to file $filePath")
    assert(filePath.isAbsolute, s"Output stream path must be absolute (was $filePath)")
    fileListener.onOutputFile(filePath)

    dialect.output(writeSchema, filePath, permission, roundingMode, metadata)

  } catch {
    case NonFatal(e) =>
      logger.error(s"Error getting or creating the hive output stream for $location", e)
      throw e
  }

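  // resolves the file path to write to: either directly under the partition/table directory,
  // or under a staging directory when the staging strategy is enabled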
  private def outputPath(partitionPath: Path): Path = {
    val filename = filenameStrategy.filename + discriminator.getOrElse("")
    if (stagingStrategy.staging) {
      val stagingDirectory = stagingStrategy.stagingDirectory(partitionPath, fs)
        .getOrError("Staging strategy returned None, but staging was enabled. This is a bug in the staging strategy.")
      logger.debug(s"Staging strategy has returned staging directory $stagingDirectory")
      new Path(stagingDirectory, filename)
    } else {
      new Path(partitionPath, filename)
    }
  }

  // if partitioning is used, inspects the row to see which partition it should live in
  // and returns an output stream for that partition;
  // if partitioning is not used then the same table-level stream is returned for all rows.
  // We cache the writers so that we don't keep opening and closing them.
  private def getOrCreateHiveWriter(row: Row): HiveOutputStream = {
    // we need a writer per partition (as each partition is written to a different directory);
    // if we don't have partitions then we only need a single writer for the table
    if (partitionKeys.isEmpty) {
      streams.getOrElseUpdate(tablePath, createWriter(tablePath))
    } else {
      val partition = partitionExtractor(row)
      val partitionPath = partitionStrategy.ensurePartition(
        partition,
        dbName,
        tableName,
        inheritPermissions.getOrElse(inheritPermissionsDefault),
        client
      )
      streams.getOrElseUpdate(partitionPath, createWriter(partitionPath))
    }
  }
}

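// Builds the Partition a row belongs to by reading the values of the partition key fields from that row.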
class HivePartitionExtractor(schema: StructType, partitionKeys: Seq[String]) {

  require(
    partitionKeys.forall { key => schema.fieldNames().contains(key) },
    s"The row schema must include a field for every partition key; schema fields=${schema.fieldNames()}; expected partition keys=$partitionKeys"
  )

  private val indexes = partitionKeys.map(schema.indexOf)

  def apply(row: Row): Partition = {
    val values = indexes.map(row.get)
    val entries = partitionKeys.zip(values).map { case (name, value) =>
      require(value != null, s"Partition value cannot be null for key $name")
      require(!value.toString.contains(" "), s"Partition values cannot contain spaces: $name=$value")
      PartitionEntry(name, value.toString)
    }
    Partition(entries)
  }
}
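
// A minimal usage sketch for HivePartitionExtractor, illustrative only: it assumes the Field, StringType,
// StructType and Row constructors from io.eels follow the signatures used below.
object HivePartitionExtractorExample extends App {

  import io.eels.schema.{Field, StringType}

  // a schema with one ordinary field and one partition key field (assumed Field/StructType signatures)
  val schema = StructType(Field("name", StringType), Field("city", StringType))

  // the extractor needs only the row schema and the names of the partition keys
  val extractor = new HivePartitionExtractor(schema, Seq("city"))

  // a row whose values line up with the schema fields (assumed Row constructor)
  val row = Row(schema, Vector("sam", "london"))

  // yields a Partition containing a single entry, city=london
  val partition = extractor(row)
  println(partition)
}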



