All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.landoop.streamreactor.connect.hive.source.TableFileScanner.scala Maven / Gradle / Ivy

The newest version!
package com.landoop.streamreactor.connect.hive.source

import com.landoop.streamreactor.connect.hive
import com.landoop.streamreactor.connect.hive.HdfsUtils._
import com.landoop.streamreactor.connect.hive.{DatabaseName, Partition, TableName}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.metastore.IMetaStoreClient

// For a table that is not partitioned, all files reside directly in the table's location directory.
// Otherwise, each file lives in its partition's folder (which, technically, could be anywhere).
/**
  * Locates the files that make up a Hive table.
  *
  * When the table has a partition plan, files are looked up in the location recorded
  * for each partition; otherwise they are looked up in the table's own storage location.
  */
object TableFileScanner {

  /**
    * Lists the files belonging to the given table.
    *
    * @param db        the database that contains the table
    * @param tableName the table whose files should be listed
    * @param fs        the file system used to list directory contents
    * @param client    the metastore client used to look up the table and its partitions
    * @return each file path paired with the partition it was found under, or with `None`
    *         when the table has no partition plan
    */
  def scan(db: DatabaseName, tableName: TableName)
          (implicit fs: FileSystem, client: IMetaStoreClient): Seq[(Path, Option[Partition])] =
    hive.partitionPlan(db, tableName) match {
      case Some(_) =>
        // the partitions from the metastore each carry a pointer to the partition location
        hive.partitions(db, tableName).flatMap {
          case partition@Partition(_, Some(location)) =>
            filesIn(location).map(_ -> Some(partition))
          // A partition with no recorded location has no directory to list. Without this
          // case the pattern above is partial and would throw a MatchError for the whole scan.
          case _ =>
            Vector.empty
        }
      case None =>
        val table = client.getTable(db.value, tableName.value)
        filesIn(new Path(table.getSd.getLocation)).map(_ -> None)
    }

  // Paths of the files directly inside `dir`; the `false` flag disables recursion into sub-directories.
  private def filesIn(dir: Path)(implicit fs: FileSystem): Vector[Path] =
    fs.listFiles(dir, false).map(_.getPath).toVector
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy