// SPDX-FileCopyrightText: 2023 LakeSoul Contributors
//
// SPDX-License-Identifier: Apache-2.0

package com.dmetasoul.lakesoul.meta

import com.dmetasoul.lakesoul.meta.entity.DataCommitInfo
import com.facebook.presto.lakesoul.pojo.Path

import java.util.{Objects, UUID}
import com.google.common.collect.Lists
import scala.collection.JavaConverters.{asJavaIterableConverter, asScalaBufferConverter}
import scala.collection.mutable.ArrayBuffer
import scala.collection.{JavaConverters, mutable}
import scala.util.control.Breaks

object BucketingUtils {
  // The file name of bucketed data should have 3 parts:
  //   1. some other information in the head of file name
  //   2. bucket id part, some numbers, starts with "_"
  //      * The other-information part may use `-` as separator and may have numbers at the end,
  //        e.g. a normal parquet file without bucketing may have name:
  //        part-r-00000-2dd664f9-d2c4-4ffe-878f-431234567891.gz.parquet, and we will mistakenly
  //        treat `431234567891` as bucket id. So here we pick `_` as separator.
  //   3. optional file extension part, in the tail of file name, starts with `.`
  // An example of bucketed parquet file name with bucket id 3:
  //   part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet
  private val bucketedFileName = """.*_(\d+)(?:\..*)?$""".r

  def getBucketId(fileName: String): Option[Int] = fileName match {
    case bucketedFileName(bucketId) => Some(bucketId.toInt)
    case _ => None
  }
}
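
// Illustrative sketch (hypothetical demo object, not part of the library API): how
// getBucketId resolves the file-name patterns described in the comments above.
object BucketingUtilsExample {
  def main(args: Array[String]): Unit = {
    // Bucketed name: the digits after the last "_" and before the extension are the bucket id.
    println(BucketingUtils.getBucketId("part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet")) // Some(3)
    // Name without a "_<digits>" segment: no bucket id can be extracted.
    println(BucketingUtils.getBucketId("part-r-00000-2dd664f9-d2c4-4ffe-878f-431234567891.gz.parquet")) // None
  }
}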

case class DataFileInfo(range_partitions: String, path: String, file_op: String, size: Long,
                        modification_time: Long = -1L, file_exist_cols: String = "") {

  lazy val file_bucket_id: Int = BucketingUtils.getBucketId(new Path(path).getName)
    .getOrElse(sys.error(s"Invalid bucket file $path"))

  override def hashCode(): Int = {
    Objects.hash(range_partitions, path, file_op)
  }
}

case class PartitionInfo(table_id: String, range_value: String, version: Int = -1,
                         read_files: Array[UUID] = Array.empty[UUID], expression: String = "", commit_op: String = "") {
  override def toString: String = {
    s"partition info: {\ntable_id: $table_id,\nrange_value: $range_value}"
  }
}

object DataOperation {

  val dbManager = new DBManager

  def getTableDataInfo(tableId: String): Array[DataFileInfo] = {
    getTableDataInfo(MetaVersion.getAllPartitionInfo(tableId))
  }

  def getTableDataInfo(partition_info_arr: Array[PartitionInfo]): Array[DataFileInfo] = {

    val file_info_buf = new ArrayBuffer[DataFileInfo]()

    for (partition_info <- partition_info_arr) {
      file_info_buf ++= getSinglePartitionDataInfo(partition_info)
    }

    file_info_buf.toArray
  }


  def getTableDataInfo(tableId: String, partitions: List[String]): Array[DataFileInfo] = {
    val allPartitions = MetaVersion.getAllPartitionInfo(tableId)
    val partitionInfos = new ArrayBuffer[PartitionInfo]()
    // Keep a partition only if its range_value contains every requested partition filter.
    for (partition_info <- allPartitions) {
      var contained = true
      for (item <- partitions) {
        if (partitions.nonEmpty && !partition_info.range_value.contains(item)) {
          contained = false
        }
      }
      if (contained) {
        partitionInfos += partition_info
      }
    }
    getTableDataInfo(partitionInfos.toArray)
  }

  private def filterFiles(file_arr_buf: ArrayBuffer[DataFileInfo]): ArrayBuffer[DataFileInfo] = {
    val dupCheck = new mutable.HashSet[String]()
    val file_res_arr_buf = new ArrayBuffer[DataFileInfo]()
    if (file_arr_buf.length > 1) {
      for (i <- Range(file_arr_buf.size - 1, -1, -1)) {
        if (file_arr_buf(i).file_op.equals("del")) {
          dupCheck.add(file_arr_buf(i).path)
        } else {
          if (dupCheck.isEmpty || !dupCheck.contains(file_arr_buf(i).path)) {
            file_res_arr_buf += file_arr_buf(i)
          }
        }
      }
      file_res_arr_buf.reverse
    } else {
      file_arr_buf.filter(_.file_op.equals("add"))
    }
  }
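
  // Worked example of filterFiles (illustrative): the buffer is replayed from newest to
  // oldest, a "del" op shadows any older entry with the same path, and the survivors are
  // restored to their original order.
  //   input  (oldest -> newest): add(a), add(b), del(a), add(c)
  //   output                   : add(b), add(c)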

  private def fillFiles(file_arr_buf: ArrayBuffer[DataFileInfo],
                        dataCommitInfoList: Array[DataCommitInfo]): ArrayBuffer[DataFileInfo] = {
    dataCommitInfoList.foreach(data_commit_info => {
      val fileOps = data_commit_info.getFileOpsList.asScala.toArray
      fileOps.foreach(file => {
        file_arr_buf += DataFileInfo(data_commit_info.getPartitionDesc, file.getPath, file.getFileOp.name, file.getSize,
          data_commit_info.getTimestamp, file.getFileExistCols)
      })
    })
    filterFiles(file_arr_buf)
  }

  // get files info in this partition that match the current read version
  private def getSinglePartitionDataInfo(partition_info: PartitionInfo): ArrayBuffer[DataFileInfo] = {
    val file_arr_buf = new ArrayBuffer[DataFileInfo]()

    val metaPartitionInfo = entity.PartitionInfo.newBuilder
    metaPartitionInfo.setTableId(partition_info.table_id)
    metaPartitionInfo.setPartitionDesc(partition_info.range_value)
    metaPartitionInfo.addAllSnapshot(JavaConverters.bufferAsJavaList(partition_info.read_files.map(DBUtil.toProtoUuid).toBuffer))
    val dataCommitInfoList = dbManager.getTableSinglePartitionDataInfo(metaPartitionInfo.build).asScala.toArray
    for (metaDataCommitInfo <- dataCommitInfoList) {
      val fileOps = metaDataCommitInfo.getFileOpsList.asScala.toArray
      for (file <- fileOps) {
        file_arr_buf += DataFileInfo(partition_info.range_value, file.getPath, file.getFileOp.name, file.getSize,
          metaDataCommitInfo.getTimestamp, file.getFileExistCols)
      }
    }
    filterFiles(file_arr_buf)
  }

  private def getSinglePartitionDataInfo(table_id: String, partition_desc: String,
                                         version: Int): ArrayBuffer[DataFileInfo] = {
    val file_arr_buf = new ArrayBuffer[DataFileInfo]()
    val dataCommitInfoList = dbManager.getPartitionSnapshot(table_id, partition_desc, version).asScala.toArray
    fillFiles(file_arr_buf, dataCommitInfoList)
  }

  def getIncrementalPartitionDataInfo(table_id: String, partition_desc: String, startTimestamp: Long,
                                      endTimestamp: Long, readType: String): Array[DataFileInfo] = {
    val endTime = if (endTimestamp == 0) Long.MaxValue else endTimestamp
    getSinglePartitionDataInfo(table_id, partition_desc, startTimestamp, endTime, readType).toArray
  }
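
  // Descriptive note on the overload below: for INCREMENTAL_READ and SNAPSHOT_READ the range
  // is resolved by version timestamps (per partition, or across every partition when
  // partition_desc is empty); any other read type falls back to the latest version number
  // up to endTimestamp.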

  private def getSinglePartitionDataInfo(table_id: String, partition_desc: String, startTimestamp: Long,
                                         endTimestamp: Long, readType: String): ArrayBuffer[DataFileInfo] = {
    if (readType.equals(LakeSoulOptions.ReadType.INCREMENTAL_READ) || readType
      .equals(LakeSoulOptions.ReadType.SNAPSHOT_READ)) {
      if (null == partition_desc || "".equals(partition_desc)) {
        val partitions = dbManager.getAllPartitionInfo(table_id)
        val files_all_partitions_buf = new ArrayBuffer[DataFileInfo]()
        partitions.forEach(partition => {
          val preVersionTimestamp = dbManager
            .getLastedVersionTimestampUptoTime(table_id, partition.getPartitionDesc, startTimestamp)
          files_all_partitions_buf ++= getSinglePartitionIncrementalDataInfos(table_id, partition.getPartitionDesc,
            preVersionTimestamp, endTimestamp)
        })
        files_all_partitions_buf
      } else {
        val preVersionTimestamp = dbManager.getLastedVersionTimestampUptoTime(table_id, partition_desc, startTimestamp)
        getSinglePartitionIncrementalDataInfos(table_id, partition_desc, preVersionTimestamp, endTimestamp)
      }
    } else {
      val version = dbManager.getLastedVersionUptoTime(table_id, partition_desc, endTimestamp)
      getSinglePartitionDataInfo(table_id, partition_desc, version)
    }
  }

  private def getSinglePartitionIncrementalDataInfos(table_id: String, partition_desc: String,
                                                     startVersionTimestamp: Long,
                                                     endVersionTimestamp: Long): ArrayBuffer[DataFileInfo] = {
    val preVersionUUIDs = new mutable.LinkedHashSet[UUID]()
    val compactionUUIDs = new mutable.LinkedHashSet[UUID]()
    val incrementalAllUUIDs = new mutable.LinkedHashSet[UUID]()
    var updated: Boolean = false
    val dataCommitInfoList = dbManager
      .getIncrementalPartitionsFromTimestamp(table_id, partition_desc, startVersionTimestamp, endVersionTimestamp)
      .asScala.toArray
    var count: Int = 0
    val loop = new Breaks()
    loop.breakable {
      for (dataItem <- dataCommitInfoList) {
        count += 1
        if ("UpdateCommit".equals(dataItem.getCommitOp) && startVersionTimestamp != dataItem
          .getTimestamp && count != 1) {
          updated = true
          loop.break()
        }
        if (startVersionTimestamp == dataItem.getTimestamp) {
          preVersionUUIDs ++= dataItem.getSnapshotList.asScala.map(DBUtil.toJavaUUID)
        } else {
          if ("CompactionCommit".equals(dataItem.getCommitOp)) {
            val compactShotList = dataItem.getSnapshotList.asScala.map(DBUtil.toJavaUUID).toArray
            compactionUUIDs += compactShotList(0)
            if (compactShotList.length > 1) {
              incrementalAllUUIDs ++= compactShotList.slice(1, compactShotList.length)
            }
          } else {
            incrementalAllUUIDs ++= dataItem.getSnapshotList.asScala.map(DBUtil.toJavaUUID)
          }
        }
      }
    }
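    // Descriptive note on the result below: if an UpdateCommit other than the start version
    // was seen, the range cannot be expressed as a pure file delta, so the result is empty.
    // Otherwise the delta is incrementalAllUUIDs minus the snapshots already visible at the
    // start version (preVersionUUIDs) and minus the compaction merge targets (compactionUUIDs).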
    if (updated) {
      new ArrayBuffer[DataFileInfo]()
    } else {
      val tmpUUIDs = incrementalAllUUIDs -- preVersionUUIDs
      val resultUUID = tmpUUIDs -- compactionUUIDs
      val file_arr_buf = new ArrayBuffer[DataFileInfo]()
      val dataCommitInfoList = dbManager
        .getDataCommitInfosFromUUIDs(table_id, partition_desc, Lists.newArrayList(resultUUID.map(DBUtil.toProtoUuid).asJava)).asScala.toArray
      fillFiles(file_arr_buf, dataCommitInfoList)
    }
  }
}
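
// Usage sketch (hypothetical table id, partition, and timestamps; assumes a configured
// LakeSoul meta database is reachable through DBManager):
object DataOperationUsageExample {
  def main(args: Array[String]): Unit = {
    val tableId = "table_00000000-0000-0000-0000-000000000000" // hypothetical

    // All live data files across every partition of the table.
    val allFiles = DataOperation.getTableDataInfo(tableId)
    allFiles.foreach(f => println(s"${f.range_partitions} ${f.path} ${f.size}"))

    // Incremental files of one partition; endTimestamp = 0 is treated as "up to now".
    val delta = DataOperation.getIncrementalPartitionDataInfo(
      tableId, "date=2023-01-01", startTimestamp = 0L, endTimestamp = 0L,
      readType = LakeSoulOptions.ReadType.INCREMENTAL_READ)
    println(s"incremental files: ${delta.length}")
  }
}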



