io.eels.component.hive.HiveDialect.scala

package io.eels.component.hive

import com.sksamuel.exts.Logging
import io.eels.component.hive.dialect.{OrcHiveDialect, ParquetHiveDialect}
import io.eels.datastream.Publisher
import io.eels.schema.StructType
import io.eels.{Predicate, Row}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.metastore.api.Table

import scala.math.BigDecimal.RoundingMode.RoundingMode

trait HiveDialect extends Logging {

  def serde: String
  def inputFormat: String
  def outputFormat: String

  /**
    * @param path where to load the data from
    * @param metastoreSchema the schema as present in the metastore, used to match up with the raw data in dialects
    * where the schema is not stored alongside the data. For example, with a CSV format in Hive, the metastoreSchema
    * is required in order to know what each column represents. We can't use the projection schema for this because
    * the projection schema might be in a different order.
    * @param projectionSchema the schema required by the reader. This might not be the full schema present in the
    * data, but is required here because some file formats (eg Parquet) can read data more efficiently
    * if they know they can omit some fields.
    * @param predicate used by some implementations to filter data at the file-read level (eg Parquet)
    *
    * The metastoreSchema represents the schema the data files were written with. This won't necessarily be the
    * same as the full table schema in the hive metastore, because partition values are not written to the data
    * files. It must be passed in here because some hive formats, eg delimited files, don't store schema
    * information with the data.
    *
    * The projectionSchema is the schema required by the caller; it may be the same as the written data, or
    * a subset of it if a projection pushdown is being used.
    */
  def input(path: Path,
            metastoreSchema: StructType,
            projectionSchema: StructType,
            predicate: Option[Predicate])
           (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]]

  /**
    * Creates a new writer ready to do the bidding of the hive sink.
    *
    * @param schema       the schema that will be written to the underlying file.
    *                     Since this is Hive, the caller should have stripped out any partition values.
    * @param path         the location to write the file to
    * @param permission   optional permission to set on the file once completed
    * @param roundingMode the rounding mode to apply when decimal values must be scaled to fit the target schema
    * @param metadata     any metadata we wish to include in the file; this might not be supported by all file types
    */
  def output(schema: StructType, // schema without partition information
             path: Path,
             permission: Option[FsPermission],
             roundingMode: RoundingMode,
             metadata: Map[String, String])
            (implicit fs: FileSystem, conf: Configuration): HiveOutputStream

  def stats(path: Path)(implicit fs: FileSystem): Long = throw new UnsupportedOperationException
}
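
// A hedged usage sketch (not part of the original source): reading rows through a
// dialect with a projection pushdown. Only the signatures declared above are
// assumed; the object and method names here are hypothetical.
object HiveDialectReadExample {

  def read(path: Path,
           table: Table,
           metastoreSchema: StructType,
           projectionSchema: StructType)
          (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]] = {
    // Resolve the dialect from the table's input format, eg Parquet or ORC.
    val dialect = HiveDialect(table)
    // Passing a subset of the metastore schema as the projection lets formats such
    // as Parquet skip unneeded columns; predicate = None disables row-level filtering.
    dialect.input(path, metastoreSchema, projectionSchema, predicate = None)
  }
}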

object HiveDialect extends Logging {

  def apply(format: String): HiveDialect = format match {
    case input if input.contains("ParquetInputFormat") => ParquetHiveDialect()
    case input if input.contains("OrcInputFormat") => OrcHiveDialect()
    //case input if input.contains("AvroHiveDialect") || input.contains("AvroContainerInputFormat") => AvroHiveDialect
    //      "org.apache.hadoop.mapred.TextInputFormat" -> TextHiveDialect
    case _ => throw new UnsupportedOperationException(s"Unknown hive input format $format")
  }
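
  // For illustration (hedged; these are the standard Hive input format class names):
  //   HiveDialect("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat") // => ParquetHiveDialect()
  //   HiveDialect("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")               // => OrcHiveDialect()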

  def apply(table: Table): HiveDialect = {
    val format = table.getSd.getInputFormat
    logger.debug(s"Table format is $format")
    val dialect = HiveDialect(format)
    logger.debug(s"HiveDialect is $dialect")
    dialect
  }
}
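
// A hedged write-side sketch (not part of the original source): obtaining an output
// stream from a dialect and writing rows. It assumes HiveOutputStream exposes
// write(Row) and close(), which is not shown in this file; the object and method
// names here are hypothetical.
object HiveDialectWriteExample {
  import scala.math.BigDecimal.RoundingMode

  def write(schema: StructType, path: Path, rows: Seq[Row])
           (implicit fs: FileSystem, conf: Configuration): Unit = {
    // The Parquet dialect is matched because the class name contains "ParquetInputFormat".
    val dialect = HiveDialect("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")
    // Per the output() contract, schema should already have partition fields stripped.
    val out = dialect.output(schema, path, permission = None, roundingMode = RoundingMode.UP, metadata = Map.empty)
    rows.foreach(out.write) // assumed API: write(Row)
    out.close()             // assumed API: close() finalizes the file
  }
}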
