All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.spark.sql.delta.hudi.HudiTransactionUtils.scala Maven / Gradle / Ivy

/*
 * Copyright (2021) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta.hudi

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hudi.client.WriteStatus
import org.apache.hudi.common.model.{HoodieAvroPayload, HoodieTableType, HoodieTimelineTimeZone, HoodieDeltaWriteStat}
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.common.util.ExternalFilePathUtil
import org.apache.hudi.exception.TableNotFoundException
import org.apache.spark.sql.delta.actions.AddFile
import org.apache.spark.sql.delta.metering.DeltaLogging

object HudiTransactionUtils extends DeltaLogging {

  /////////////////
  // Public APIs //
  /////////////////
  def convertAddFile(addFile: AddFile,
                     tablePath: Path,
                     commitTime: String): WriteStatus = {

    val writeStatus = new WriteStatus
    val path = addFile.toPath
    val partitionPath = getPartitionPath(tablePath, path)
    val fileName = path.getName
    val fileId = fileName
    val filePath = if (partitionPath.isEmpty) fileName else partitionPath + "/" + fileName
    writeStatus.setFileId(fileId)
    writeStatus.setPartitionPath(partitionPath)
    val writeStat = new HoodieDeltaWriteStat
    writeStat.setFileId(fileId)
    writeStat.setPath(
      ExternalFilePathUtil.appendCommitTimeAndExternalFileMarker(filePath, commitTime))
    writeStat.setPartitionPath(partitionPath)
    writeStat.setNumWrites(addFile.numLogicalRecords.getOrElse(0L))
    writeStat.setTotalWriteBytes(addFile.getFileSize)
    writeStat.setFileSizeInBytes(addFile.getFileSize)
    writeStatus.setStat(writeStat)

    writeStatus
  }

  def getPartitionPath(tableBasePath: Path, filePath: Path): String = {
    val fileName = filePath.getName
    val pathStr = filePath.toUri.getPath
    val tableBasePathStr = tableBasePath.toUri.getPath
    if (pathStr.contains(tableBasePathStr)) {
      // input file path is absolute
      val startIndex = tableBasePath.toUri.getPath.length + 1
      val endIndex = pathStr.length - fileName.length - 1
      if (endIndex <= startIndex) ""
      else pathStr.substring(startIndex, endIndex)
    } else {
      val lastSlash = pathStr.lastIndexOf("/")
      if (lastSlash <= 0) ""
      else pathStr.substring(0, pathStr.lastIndexOf("/"))
    }
  }

  /**
   * Loads the meta client for the table at the base path if it exists.
   * If it does not exist, initializes the Hudi table and returns the meta client.
   *
   * @param tableDataPath the path for the table
   * @param tableName the name of the table
   * @param partitionFields the fields used for partitioning
   * @param conf the hadoop configuration
   * @return {@link HoodieTableMetaClient} for the existing table or that was created
   */
  def loadTableMetaClient(tableDataPath: String,
                          tableName: Option[String],
                          partitionFields: Seq[String],
                          conf: Configuration): HoodieTableMetaClient = {
    try HoodieTableMetaClient.builder
      .setBasePath(tableDataPath).setConf(conf)
      .setLoadActiveTimelineOnLoad(false)
      .build
    catch {
      case ex: TableNotFoundException =>
        log.debug("Hudi table does not exist, creating now.")
        if (tableName.isEmpty) {
          log.warn("No name is specified for the table. "
            + "Creating a new Hudi table with a default name: 'table'.")
        }
        initializeHudiTable(tableDataPath, tableName.getOrElse("table"), partitionFields, conf)
    }
  }

    /**
     * Initializes a Hudi table with the provided properties
     *
     * @param tableDataPath the base path for the data files in the table
     * @param tableName the name of the table
     * @param partitionFields the fields used for partitioning
     * @param conf the hadoop configuration
     * @return {@link HoodieTableMetaClient} for the table that was created
     */
    private def initializeHudiTable(tableDataPath: String,
                                    tableName: String,
                                    partitionFields: Seq[String],
                                    conf: Configuration): HoodieTableMetaClient = {
      val keyGeneratorClass = getKeyGeneratorClass(partitionFields)
      HoodieTableMetaClient
        .withPropertyBuilder
        .setCommitTimezone(HoodieTimelineTimeZone.UTC)
        .setHiveStylePartitioningEnable(true)
        .setTableType(HoodieTableType.COPY_ON_WRITE)
        .setTableName(tableName)
        .setPayloadClass(classOf[HoodieAvroPayload])
        .setKeyGeneratorClassProp(keyGeneratorClass)
        .setPopulateMetaFields(false)
        .setPartitionFields(partitionFields.mkString(","))
        .initTable(conf, tableDataPath)
    }

    private def getKeyGeneratorClass(partitionFields: Seq[String]): String = {
      if (partitionFields.isEmpty) {
        "org.apache.hudi.keygen.NonpartitionedKeyGenerator"
      } else if (partitionFields.size > 1) {
        "org.apache.hudi.keygen.CustomKeyGenerator"
      } else {
        "org.apache.hudi.keygen.SimpleKeyGenerator"
      }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy