org.apache.hudi.HoodieIncrementalFileIndex.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hudi.common.model.{FileSlice, HoodieLogFile}
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.storage.StoragePathInfo
import org.apache.hudi.util.JFunction
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.execution.datasources.{FileIndex, FileStatusCache, NoopCache, PartitionDirectory}
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType

import java.util.stream.Collectors

import scala.collection.JavaConverters._

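/**
 * A [[FileIndex]] for Hudi incremental queries. File listing is delegated to a
 * [[MergeOnReadIncrementalRelation]], so only the file slices that fall inside the
 * queried commit range are exposed to Spark's scan planning.
 *
 * @param includeLogFiles       whether log files are listed alongside base files
 * @param shouldEmbedFileSlices whether listed partitions should carry a
 *                              [[HoodiePartitionFileSliceMapping]] for slices that
 *                              require merge-on-read handling
 */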
class HoodieIncrementalFileIndex(override val spark: SparkSession,
                                 override val metaClient: HoodieTableMetaClient,
                                 override val schemaSpec: Option[StructType],
                                 override val options: Map[String, String],
                                 @transient override val fileStatusCache: FileStatusCache = NoopCache,
                                 override val includeLogFiles: Boolean,
                                 override val shouldEmbedFileSlices: Boolean)
  extends HoodieFileIndex(
    spark, metaClient, schemaSpec, options, fileStatusCache, includeLogFiles, shouldEmbedFileSlices
  ) with FileIndex {
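  // The relation resolves the incremental commit range and prunes file slices;
  // this index adapts its results to Spark's FileIndex contract.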
  val mergeOnReadIncrementalRelation: MergeOnReadIncrementalRelation = MergeOnReadIncrementalRelation(
    spark.sqlContext, options, metaClient, schemaSpec, schemaSpec)

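  /**
   * Lists the pruned partitions and their candidate files for the incremental scan.
   * With `shouldEmbedFileSlices` enabled, partitions whose slices need log-file merging
   * or bootstrap handling carry a [[HoodiePartitionFileSliceMapping]]; otherwise a flat
   * list of base and log files is returned per partition.
   */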
  override def listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
    hasPushedDownPartitionPredicates = true

    val fileSlices = mergeOnReadIncrementalRelation.listFileSplits(partitionFilters, dataFilters)
    if (fileSlices.isEmpty) {
      Seq.empty
    } else {
      val prunedPartitionsAndFilteredFileSlices = fileSlices.map {
        case (partitionValues, slices) =>
          if (shouldEmbedFileSlices) {
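            // Represent each slice by its base file, or by one of its log files when no
            // base file exists; slices with neither are dropped.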
            val baseFileStatusesAndLogFileOnly: Seq[FileStatus] = slices.map(slice => {
              if (slice.getBaseFile.isPresent) {
                slice.getBaseFile.get().getPathInfo
              } else if (includeLogFiles && slice.getLogFiles.findAny().isPresent) {
                slice.getLogFiles.findAny().get().getPathInfo
              } else {
                null
              }
            }).filter(pathInfo => pathInfo != null)
              .map(pathInfo => new FileStatus(pathInfo.getLength, pathInfo.isDirectory, pathInfo.getBlockReplication, pathInfo.getBlockSize,
                pathInfo.getModificationTime, new Path(pathInfo.getPath.toUri)))
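            // Slices that carry log files or a bootstrap base file cannot be read as plain
            // files; key them by file id so the reader can reassemble the full slice.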
            val embeddedFileSlices = slices.filter(f => (includeLogFiles && f.getLogFiles.findAny().isPresent)
              || (f.getBaseFile.isPresent && f.getBaseFile.get().getBootstrapBaseFile.isPresent))
              .foldLeft(Map[String, FileSlice]()) { (m, f) => m + (f.getFileId -> f) }
            if (embeddedFileSlices.nonEmpty) {
              sparkAdapter.getSparkPartitionedFileUtils.newPartitionDirectory(
                new HoodiePartitionFileSliceMapping(partitionValues, embeddedFileSlices), baseFileStatusesAndLogFileOnly)
            } else {
              sparkAdapter.getSparkPartitionedFileUtils.newPartitionDirectory(
                partitionValues, baseFileStatusesAndLogFileOnly)
            }
          } else {
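            // Without embedded slices, flatten each slice into its base file plus,
            // when requested, its log files.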
            val allCandidateFiles: Seq[FileStatus] = slices.flatMap(fs => {
              val baseFileStatusOpt = getBaseFileInfo(Option.apply(fs.getBaseFile.orElse(null)))
              val logFilesStatus = if (includeLogFiles) {
                fs.getLogFiles.map[StoragePathInfo](JFunction.toJavaFunction[HoodieLogFile, StoragePathInfo](lf => lf.getPathInfo))
              } else {
                java.util.stream.Stream.empty()
              }
              val files = logFilesStatus.collect(Collectors.toList[StoragePathInfo]).asScala
              baseFileStatusOpt.foreach(f => files.append(f))
              files
            })
              .map(fileInfo => new FileStatus(fileInfo.getLength, fileInfo.isDirectory, 0, fileInfo.getBlockSize,
                fileInfo.getModificationTime, new Path(fileInfo.getPath.toUri)))
            sparkAdapter.getSparkPartitionedFileUtils.newPartitionDirectory(
              partitionValues, allCandidateFiles)
          }
      }.toSeq

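      // Keep per-partition directories for partitioned reads (and for embedded slices,
      // where the partition schema must be empty); otherwise collapse everything into
      // a single directory.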
      if (shouldReadAsPartitionedTable()) {
        prunedPartitionsAndFilteredFileSlices
      } else if (shouldEmbedFileSlices) {
        assert(partitionSchema.isEmpty)
        prunedPartitionsAndFilteredFileSlices
      } else {
        Seq(PartitionDirectory(InternalRow.empty, prunedPartitionsAndFilteredFileSlices.flatMap(_.files)))
      }
    }
  }

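  /**
   * All files that may serve the incremental query, listed without partition or data
   * filters.
   */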
  override def inputFiles: Array[String] = {
    val fileSlices = mergeOnReadIncrementalRelation.listFileSplits(Seq.empty, Seq.empty)
    if (fileSlices.isEmpty) {
      Array.empty
    } else {
      fileSlices.values.flatten.flatMap(fs => {
        val baseFileStatusOpt = getBaseFileInfo(Option.apply(fs.getBaseFile.orElse(null)))
        val logFilesStatus = if (includeLogFiles) {
          fs.getLogFiles.map[StoragePathInfo](JFunction.toJavaFunction[HoodieLogFile, StoragePathInfo](lf => lf.getPathInfo))
        } else {
          java.util.stream.Stream.empty()
        }
        val files = logFilesStatus.collect(Collectors.toList[StoragePathInfo]).asScala
        baseFileStatusOpt.foreach(f => files.append(f))
        files
      }).map(pathInfo => pathInfo.getPath.toString).toArray
    }
  }

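  /**
   * Filters that the reader must always apply for incremental-query correctness,
   * as supplied by the underlying incremental relation.
   */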
  def getRequiredFilters: Seq[Filter] = {
    mergeOnReadIncrementalRelation.getRequiredFilters
  }
}
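
For context, a minimal sketch of how an incremental read reaches this index through the
Hudi Spark datasource (the option keys are from Hudi's DataSourceReadOptions; the table
path and instant time are hypothetical placeholders):

import org.apache.spark.sql.SparkSession

object IncrementalReadExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("hudi-incremental-read")
      .master("local[*]")
      .getOrCreate()

    // An incremental query of this kind is what HoodieIncrementalFileIndex serves:
    // only file slices written inside the commit range are listed for the scan.
    val df = spark.read.format("hudi")
      .option("hoodie.datasource.query.type", "incremental")
      .option("hoodie.datasource.read.begin.instanttime", "20240101000000") // hypothetical instant
      .load("/tmp/hudi/trips") // hypothetical table base path

    df.show(false)
    spark.stop()
  }
}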



