
com.sap.hana.datalake.files.HdlfsOutputCommitter Maven / Gradle / Ivy


An implementation of org.apache.hadoop.fs.FileSystem targeting SAP HANA Data Lake Files.

// © 2022-2024 SAP SE or an SAP affiliate company. All rights reserved.
package com.sap.hana.datalake.files;

import com.sap.hana.datalake.files.shaded.com.fasterxml.jackson.databind.JsonNode;
import com.sap.hana.datalake.files.shaded.com.fasterxml.jackson.databind.ObjectMapper;
import com.sap.hana.datalake.files.utils.HdlfsRetryUtils;
import com.sap.hana.datalake.files.utils.threads.ThreadUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitter;
import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ListenableFuture;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ListeningExecutorService;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.MoreExecutors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;


public class HdlfsOutputCommitter extends PathOutputCommitter {

  public static final String FS_HDLFS_JOBUUID = "fs.hdlfs.commit.jobuuid";

  private static final String COMMITTER_THREADS_PREFIX = "HdlfsOutputCommitter-thread";
  private static final String EMPTY_STRING = "";
  private static final Logger LOG = LoggerFactory.getLogger(HdlfsOutputCommitter.class);

  private static volatile ExecutorService globalThreadPool;

  private final Path outputPath;
  private final Path workPath;
  private final HdlfsFileSystem destFS;
  private final Path relativeOutputPath; // relative to the destFS
  private final String jobId;
  private final ListeningExecutorService executorService;
  private final boolean fsCacheEnabled;
  private final ObjectMapper mapper;
  private final RetryPolicy retryPolicy;
  private final RetryPolicy mergeFileNotFoundRetryPolicy;
  private final boolean isDeleteBatchAsyncEnabled;
  private final boolean shouldCompleteAsyncDeleteBatch;
  private final int deleteBatchCompleteWaitTimeSeconds;
  private final int deleteBatchSize;
  private final int maxDeleteRetries;
  private final int deleteRetryTimeoutMs;

  public HdlfsOutputCommitter(final Path outputPath, final TaskAttemptContext context) throws IOException {
    super(outputPath, context);

    final Configuration conf = context.getConfiguration();
    final FileSystem fs = outputPath.getFileSystem(conf);

    if (!(fs instanceof HdlfsFileSystem)) {
      throw new IllegalArgumentException("Destination FileSystem for HdlfsOutputCommitter must be an instance of HdlfsFileSystem");
    }

    this.jobId = this.getOrCreateJobId(conf, context.getJobID());
    this.destFS = (HdlfsFileSystem) fs;
    this.workPath = this.getTaskAttemptPath(context, this.jobId, outputPath);
    this.outputPath = fs.makeQualified(outputPath);
    this.relativeOutputPath = new Path(this.getRelativePathFromSchemaPath(outputPath));

    LOG.debug("Initializing HdlfsOutputCommitter(jobId={}, outputPath={}, workPath={})", this.jobId, this.outputPath, this.workPath);

    this.mapper = new ObjectMapper();
    this.retryPolicy = this.destFS.getRetryPolicy();

    final ExecutorService threadPool = getOrCreateGlobalThreadPool(conf);
    this.executorService = MoreExecutors.listeningDecorator(threadPool);

    this.mergeFileNotFoundRetryPolicy = HdlfsRetryPolicies.createMergeFileNotFoundRetryPolicy(conf);

    this.fsCacheEnabled = conf.getBoolean(HdlfsConstants.FSCACHE_ENABLED, HdlfsConstants.FSCACHE_ENABLED_DEFAULT);

    this.isDeleteBatchAsyncEnabled = conf.getBoolean(HdlfsConstants.FS_OPERATION_DELETE_BATCH_ASYNC_ENABLED_KEY, HdlfsConstants.FS_OPERATION_DELETE_BATCH_ASYNC_ENABLED_DEFAULT);
    this.shouldCompleteAsyncDeleteBatch = conf.getBoolean(HdlfsConstants.FS_OPERATION_DELETE_BATCH_ASYNC_COMPLETE_ENABLED_KEY, HdlfsConstants.FS_OPERATION_DELETE_BATCH_ASYNC_COMPLETE_ENABLED_DEFAULT);
    this.deleteBatchCompleteWaitTimeSeconds = conf.getInt(HdlfsConstants.FS_OPERATION_DELETE_BATCH_ASYNC_COMPLETE_WAIT_TIME_SECS_KEY, HdlfsConstants.FS_OPERATION_DELETE_BATCH_ASYNC_COMPLETE_WAIT_TIME_SECS_DEFAULT);

    this.deleteBatchSize = conf.getInt(HdlfsConstants.HDLFS_OUTPUT_COMMITTER_DELETE_BATCH_SIZE_KEY, HdlfsConstants.HDLFS_OUTPUT_COMMITTER_DELETE_BATCH_SIZE_DEFAULT);
    this.maxDeleteRetries = conf.getInt(HdlfsConstants.HDLFS_OUTPUT_COMMITTER_DELETE_BATCH_MAX_RETRIES_KEY, HdlfsConstants.HDLFS_OUTPUT_COMMITTER_DELETE_BATCH_MAX_RETRIES_DEFAULT);
    this.deleteRetryTimeoutMs = conf.getInt(HdlfsConstants.HDLFS_OUTPUT_COMMITTER_DELETE_BATCH_RETRY_TIMEOUT_MS_KEY, HdlfsConstants.HDLFS_OUTPUT_COMMITTER_DELETE_BATCH_RETRY_TIMEOUT_MS_DEFAULT);

    LOG.debug("HdlfsOutputCommitter was initialized successfully");
  }

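  // Lazily creates the process-wide daemon thread pool shared by every committer instance,
  // using double-checked locking; the pool size defaults to a multiple of the available cores
  // unless overridden in the configuration.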
  private static ExecutorService getOrCreateGlobalThreadPool(final Configuration conf) {
    if (globalThreadPool == null) {
      synchronized (HdlfsOutputCommitter.class) {
        if (globalThreadPool == null) {
          // The first instance of this class on a given Spark node creates the thread pool, sized from the node configuration or the client configuration
          final int availableCores = Runtime.getRuntime().availableProcessors();
          final int threadPoolMaxActiveTasks = conf.getInt(HdlfsConstants.HDLFS_OUTPUT_COMMITTER_THREADPOOL_MAX_ACTIVE_TASKS_KEY, 4 * availableCores);
          final int threadPoolMaxWaitingTasks = conf.getInt(HdlfsConstants.HDLFS_OUTPUT_COMMITTER_THREADPOOL_MAX_WAITING_TASKS_KEY, 4 * threadPoolMaxActiveTasks);
          final int threadPoolKeepAliveSeconds = conf.getInt(HdlfsConstants.HDLFS_OUTPUT_COMMITTER_THREADPOOL_KEEP_ALIVE_SECS_KEY, HdlfsConstants.HDLFS_OUTPUT_COMMITTER_THREADPOOL_KEEP_ALIVE_SECS_DEFAULT);

          globalThreadPool = ThreadUtils.newDaemonThreadBlockingExecutor(
                  threadPoolMaxActiveTasks,
                  threadPoolMaxWaitingTasks,
                  /* allowCoreThreadTimeOut */ true,
                  threadPoolKeepAliveSeconds, TimeUnit.SECONDS,
                  /* fairSemaphore */ false,
                  COMMITTER_THREADS_PREFIX,
                  conf);
        }
      }
    }

    return globalThreadPool;
  }

  @Override
  public Path getOutputPath() {
    return this.outputPath;
  }

  @Override
  public Path getWorkPath() {
    return this.workPath;
  }

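  // Publishes the job UUID in the job configuration so that all task attempts resolve the same job attempt directory.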
  @Override
  public void setupJob(final JobContext context) {
    LOG.info("Setting up job with id = [{}]", this.jobId);
    final Configuration conf = context.getConfiguration();
    conf.set(FS_HDLFS_JOBUUID, this.jobId);
  }

  @Override
  public void setupTask(final TaskAttemptContext context) {
    // no op
    LOG.info("Setting up task with id = [{}]", context.getTaskAttemptID());
  }

  // we only need to commit tasks that actually write output to the filesystem
  @Override
  public boolean needsTaskCommit(final TaskAttemptContext taskAttemptContext) throws IOException {
    final Path taskAttemptDir = this.getTaskAttemptPath(taskAttemptContext, this.jobId, this.relativeOutputPath);

    try {
      this.destFS.getFileStatus(taskAttemptDir);
      LOG.debug("Task attempt {} contains data and must be committed", taskAttemptContext.getTaskAttemptID());
      return true;
    } catch (final FileNotFoundException ex) {
      LOG.debug("Task attempt {} does not contain data and will not be committed", taskAttemptContext.getTaskAttemptID());
      return false;
    }
  }

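  // Collects the task attempt's .pending files into a single .pendingset file in the job attempt
  // directory (written with retries), updates the FsCache when enabled, and schedules the consumed
  // .pending files for deletion.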
  @Override
  public void commitTask(final TaskAttemptContext taskAttemptContext) throws IOException {
    LOG.debug("Committing task with id = [{}]", taskAttemptContext.getTaskAttemptID());

    final Path taskAttemptDir = this.getTaskAttemptPath(taskAttemptContext, this.jobId, this.relativeOutputPath);
    final FileStatus[] taskAttemptFiles = this.destFS.listStatus(taskAttemptDir);

    Preconditions.checkNotNull(taskAttemptFiles, "Task Attempt Directory can not be empty");

    final Path pendingSetPath = this.getTaskPendingSetPath(taskAttemptContext);
    final HdlfsTaskCommitInfo taskCommitInfo = new HdlfsTaskCommitInfo(this.getTaskAttemptName(taskAttemptContext));
    final List<String> pendingFilesToBeDeleted = new ArrayList<>();

    for (final FileStatus fs : taskAttemptFiles) {
      final String relativePath = this.getRelativePathFromSchemaPath(fs.getPath());

      if (fs.isFile() && relativePath.endsWith(HdlfsConstants.PENDING_SUFFIX)) {
        LOG.debug("Reading pending file " + relativePath);

        final HdlfsTaskOutputInfo outputInfo = this.readTaskOutputInfo(this.destFS, new Path(relativePath));

        taskCommitInfo.addTaskOutputInfo(outputInfo);
        pendingFilesToBeDeleted.add(relativePath);

        LOG.debug("Pending file [{}] was added to pendingset file [{}]", relativePath, pendingSetPath);
      }
    }

    LOG.debug("Creating pendingset file [{}]", pendingSetPath);

    final String operationName = String.format("Creation of PendingSet %s", pendingSetPath);

    HdlfsRetryUtils.execWithRetry(operationName, this.retryPolicy, true, () -> {
      try (final FSDataOutputStream out = this.destFS.create(pendingSetPath, true)) {
        out.write(this.mapper.writeValueAsBytes(taskCommitInfo));
      } catch (final Exception ex) {
        final String message = String.format("Exception when writing pendingset file %s output.", pendingSetPath);
        throw new IOException(message, ex);
      }

      return pendingSetPath;
    });

    LOG.debug("Pendingset [{}] created", pendingSetPath);

    // The pendingset file has been created; if the FsCache is enabled, the cache must now be updated
    if (this.fsCacheEnabled) {
      final List<FsCacheEntry> entries = Collections.singletonList(new FsCacheEntry(pendingSetPath.getParent(), pendingSetPath.getName(), false, false));
      this.addEntriesToFsCache(entries);
    }

    this.performDeleteBatch(pendingFilesToBeDeleted, false);
    this.performFsCacheCleanup(taskAttemptDir);

    LOG.debug("Task with id = [{}] committed successfully", taskAttemptContext.getTaskAttemptID());
  }

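  // Schedules the task's pendingset file and everything under its task attempt directory for
  // deletion, marking the pendingset as deleted in the FsCache when enabled; deletion failures
  // here are propagated as IOExceptions.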
  @Override
  public void abortTask(final TaskAttemptContext taskAttemptContext) throws IOException {
    final Path taskAttPath = this.getTaskAttemptPath(taskAttemptContext, this.jobId, this.relativeOutputPath);
    final List<String> toDelete = new ArrayList<>();

    // we need to delete the task's pendingset file if it exists, and mark it as deleted in the FsCache
    final Path taskPendingSet = this.getTaskPendingSetPath(taskAttemptContext);
    toDelete.add(taskPendingSet.toString());

    if (this.fsCacheEnabled) {
      final List<FsCacheEntry> entries = Collections.singletonList(new FsCacheEntry(taskPendingSet.getParent(), taskPendingSet.getName(), false, true));
      this.addEntriesToFsCache(entries);
    }

    FileStatus[] taskAttFiles = null;

    try {
      taskAttFiles = this.destFS.listStatus(taskAttPath);
    } catch (final FileNotFoundException ex) {
      LOG.info("Task {} has not written any data under its task attempt directory, there is nothing to cleanup", taskAttemptContext.getTaskAttemptID());
    }

    if (taskAttFiles != null) {
      for (final FileStatus fileStatus : taskAttFiles) {
        this.innerAbortTask(fileStatus, toDelete);
      }
    }

    try {
      this.performDeleteBatch(toDelete, true);
    } catch (final IOException ex) {
      final String message = String.format("Failed to abort task attempt %s. Could not delete task prefix files", taskAttemptContext.getTaskAttemptID());
      throw new IOException(message, ex);
    }
  }

  private void innerAbortTask(final FileStatus taskAttPathFs, final List<String> toDelete) throws IOException {
    final Path relativePath = new Path(this.getRelativePathFromSchemaPath(taskAttPathFs.getPath()));

    if (taskAttPathFs.isFile()) {
      toDelete.add(relativePath.toString());
      return;
    }

    final FileStatus[] prefixFiles = this.destFS.listStatus(relativePath);
    Preconditions.checkNotNull(prefixFiles, String.format("Path %s is marked as a directory and can not be empty", relativePath));

    for (final FileStatus nextFs : prefixFiles) {
      this.innerAbortTask(nextFs, toDelete);
    }
  }

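  // Commits the job by submitting one PendingSetCommitter per .pendingset file in the job attempt
  // directory to the shared thread pool, waiting for all of them, then pushing the collected
  // FsCache entries and removing the job attempt prefix from the cache.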
  @Override
  public void commitJob(final JobContext context) throws IOException {
    LOG.debug("Initiating job commit for Job(id={}) with output path = [{}]", this.jobId, this.relativeOutputPath);
    final Path jobAttemptPath = this.getJobAttemptPath(this.jobId, this.relativeOutputPath);

    // List all files under the job attempt path, we are looking for .pendingset files to commit
    final FileStatus[] jobAttemptFiles = this.destFS.listStatus(jobAttemptPath);
    Preconditions.checkNotNull(jobAttemptFiles, "Job attempt directory can not be empty");

    final List<ListenableFuture<Void>> taskOutputCommitFutures = new ArrayList<>();
    final Set<FsCacheEntry> fsCacheFiles = Collections.synchronizedSet(new TreeSet<>());

    for (final FileStatus fs : jobAttemptFiles) {
      final String relativePath = this.getRelativePathFromSchemaPath(fs.getPath());

      if (fs.isFile() && relativePath.endsWith(HdlfsConstants.PENDINGSET_SUFFIX)) {
        final PendingSetCommitter pendingSetCommitter = new PendingSetCommitter(relativePath, this.destFS, fsCacheFiles);
        taskOutputCommitFutures.add(this.executorService.submit(pendingSetCommitter));
      }
    }

    try {
      for (final Future<Void> taskOutputCommitFuture : taskOutputCommitFutures) {
        taskOutputCommitFuture.get();
      }
    } catch (final Exception ex) {
      throw new IOException("Failed to commit the output of the job.", ex);
    }

    this.addEntriesToFsCache(fsCacheFiles);
    this.performFsCacheCleanup(jobAttemptPath);

    LOG.debug("Job(id={}) with output path = [{}] committed successfully", this.jobId, this.relativeOutputPath);
  }

  // Collection must be sorted by prefix
  void addEntriesToFsCache(final Collection<FsCacheEntry> entries) throws IOException {
    if (entries == null) {
      return;
    }

    final List<FsCacheEntry> currentBatch = new ArrayList<>();

    for (final FsCacheEntry entry : entries) {
      if (!currentBatch.isEmpty() && !currentBatch.get(currentBatch.size() - 1).getPrefix().equals(entry.getPrefix())) {
        // dispatch current batch
        final Path currentPrefix = currentBatch.get(0).getPrefix();
        this.executeFsCachePatchOperation(currentPrefix, currentBatch);

        // init next batch
        currentBatch.clear();
      }

      currentBatch.add(entry);
    }

    if (!currentBatch.isEmpty()) {
      final Path currentPrefix = currentBatch.get(0).getPrefix();
      this.executeFsCachePatchOperation(currentPrefix, currentBatch);
    }
  }

  private void executeFsCachePatchOperation(final Path path, final List<FsCacheEntry> entries) throws IOException {
    final JsonNode patch = this.createFsCachePatchJsonNode(entries);
    final FsCache fsCache = this.destFS.getFsCache();

    try {
      fsCache.applyPatch(path, patch);
    } catch (final IOException ex) {
      LOG.error("Failed to apply patch {} to path {} in FsCache", patch.toString(), path);
      throw ex;
    }
  }

  private JsonNode createFsCachePatchJsonNode(final List<FsCacheEntry> entries) {
    final List<Map<String, Object>> patch = new ArrayList<>();

    for (final FsCacheEntry entry : entries) {
      final Map<String, Object> entryMap = new HashMap<>();
      entryMap.put(HdlfsConstants.MARK_AS_DELETED_FIELD_NAME, entry.getMarkAsDeleted());
      entryMap.put(HdlfsConstants.IS_PREFIX_FIELD_NAME, entry.getIsPrefix());

      final Map<String, Object> patchOp = new HashMap<>();
      patchOp.put(HdlfsConstants.JSON_PATCH_OP_KEY, HdlfsConstants.JSON_PATCH_ADD_OP);
      patchOp.put(HdlfsConstants.JSON_PATCH_PATH_KEY, String.format("/%s", entry.getPath()));
      patchOp.put(HdlfsConstants.JSON_PATCH_VALUE_KEY, entryMap);

      patch.add(patchOp);
    }

    return this.mapper.valueToTree(patch);
  }

  private void performFsCacheCleanup(final Path path) {
    if (this.fsCacheEnabled) {
      try {
        final FsCache fsCache = this.destFS.getFsCache();
        fsCache.deleteEntry(path);
      } catch (final IOException ex) {
        LOG.warn("Failed to delete cache entry", ex);
      }
    }
  }

  private String getOrCreateJobId(final Configuration configuration, final JobID jobID) {
    final String configuredJobId = configuration.get(FS_HDLFS_JOBUUID, EMPTY_STRING);

    if (!configuredJobId.isEmpty()) {
      return configuredJobId;
    } else {
      return jobID.toString();
    }
  }

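  // Deletes the given files in batches of deleteBatchSize, each batch with its own retry loop.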
  private void performDeleteBatch(final List<String> files, final boolean shouldThrowException) throws IOException {
    for (int i = 0; i < files.size(); i += this.deleteBatchSize) {
      final int startIndex = i;
      final int endIndex = Math.min(i + this.deleteBatchSize, files.size());
      deleteFilesWithRetry(files.subList(startIndex, endIndex), shouldThrowException);
    }
  }

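  // Issues a deleteBatch request (synchronous or asynchronous depending on the configuration) and
  // retries up to maxDeleteRetries times, waiting deleteRetryTimeoutMs between attempts; when a
  // completed result reports paths back, only those paths are retried.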
  private void deleteFilesWithRetry(final List<String> files, final boolean shouldThrowException) throws IOException {
    List<String> currentList = files;

    for (int i = 0; i < this.maxDeleteRetries; i++) {
      HdlfsFileSystemCapabilities.DeleteBatchResult result;

      try {
        LOG.debug("Performing delete batch operation, isDeleteBatchAsync is = [{}]", this.isDeleteBatchAsyncEnabled);
        result = this.destFS.deleteBatch(currentList.stream().map(Path::new).collect(Collectors.toList()), !this.isDeleteBatchAsyncEnabled);
      } catch (final IOException ex) {
        LOG.warn("Failed to perform delete batch operation on files {}. Need to retry the operation with all the files again.", currentList, ex);
        // Continue the loop here without modifying the current list, so we retry with all the files again since we do not know which ones failed or succeeded

        try {
          TimeUnit.MILLISECONDS.sleep(this.deleteRetryTimeoutMs);
        } catch (final InterruptedException e) {
          LOG.warn("Thread interrupted", e);
          Thread.currentThread().interrupt();
        }

        continue;
      }

      if (this.isDeleteBatchAsyncEnabled && this.shouldCompleteAsyncDeleteBatch) {
        LOG.debug("DeleteBatch operation was asynchronous and configured to be completed. Maximum wait time is {}s", this.deleteBatchCompleteWaitTimeSeconds);
        result = this.destFS.completeDeleteBatch(((HdlfsFileSystemCapabilities.AsyncDeleteBatchResult) result).getToken(), this.deleteBatchCompleteWaitTimeSeconds);
      }

      if (result instanceof HdlfsFileSystemCapabilities.AsyncDeleteBatchResult) {
        LOG.debug("DeleteBatch operation was configured to not wait for result, or exceeded the maximum completion time configure. Token for polling the result is [{}]", ((HdlfsFileSystemCapabilities.AsyncDeleteBatchResult) result).getToken());
        return;
      }

      if (result.isSuccessful()) {
        return;
      }

      // Get all files that were not deleted successfully from the response
      final Set<String> deletedFiles = Arrays.stream(result.getFiles())
          .map(HdlfsFileSystemCapabilities.DeletedFile::getPath)
          .collect(Collectors.toSet());

      // update the deletion list with all files from the initial request that failed to delete
      currentList = currentList.stream().filter(deletedFiles::contains).collect(Collectors.toList());

      try {
        TimeUnit.MILLISECONDS.sleep(this.deleteRetryTimeoutMs);
      } catch (final InterruptedException ex) {
        LOG.warn("Thread interrupted", ex);
        Thread.currentThread().interrupt();
      }
    }

    final String message = String.format("Could not delete the files %s even after the maximum number of retries.", currentList);

    if (shouldThrowException) {
      throw new IOException(message);
    } else {
      LOG.warn(message);
    }
  }

  // Commits temporary files from a single pending set file and returns the list of chunk paths that have to be deleted.
  private List<String> commitTaskOutput(final HdlfsTaskCommitInfo commitInfo, final Set<FsCacheEntry> filesToBeUploadedToFsCache) throws IOException {
    final List<String> chunksToBeDeleted = new ArrayList<>();

    for (final HdlfsTaskOutputInfo singleOutputInfo : commitInfo.taskOutputsInfo) {
      final Path finalTaskOutputPath = new Path(singleOutputInfo.finalTaskOutputLocation);
      LOG.debug("Committing task attempt {} to final path {}", singleOutputInfo.taskAttemptId, finalTaskOutputPath);

      try {
        // If there are no pending files, we should just create an empty file in the task output path
        if (singleOutputInfo.taskPendingFiles.isEmpty()) {
          this.destFS.create(finalTaskOutputPath, true).close();
        } else {
          final String operationTitle = "Committing task attempt into output path " + finalTaskOutputPath;
          HdlfsRetryUtils.execWithRetry(operationTitle, this.mergeFileNotFoundRetryPolicy, true,
              () -> ((HdlfsFileSystemCapabilities) this.destFS).merge(finalTaskOutputPath, singleOutputInfo.taskPendingFiles.stream().map(Path::new).collect(Collectors.toList())));
        }
      } catch (final IOException ex) {
        final String message = String.format("Failed to commit task attempt %s during final job commit", singleOutputInfo.taskAttemptId);
        throw new IOException(message, ex);
      }

      // Update FsCache with the resulting file
      if (this.fsCacheEnabled) {
        Path currentPath = finalTaskOutputPath;
        boolean isPrefix = false;

        do {
          final FsCacheEntry entry = new FsCacheEntry(currentPath.getParent(), currentPath.getName(), isPrefix, false);
          filesToBeUploadedToFsCache.add(entry);
          currentPath = currentPath.getParent();
          isPrefix = true; // after the first iteration the files will be "prefixes", i.e., directories
        } while (!currentPath.equals(this.relativeOutputPath));
      }

      LOG.debug("Commit for task attempt {} was successful, marking pending files to be deleted in the future", singleOutputInfo.taskAttemptId);

      chunksToBeDeleted.addAll(singleOutputInfo.taskPendingFiles);
    }

    return chunksToBeDeleted;
  }

  private HdlfsTaskOutputInfo readTaskOutputInfo(final FileSystem fs, final Path path) throws IOException {
    LOG.debug("Reading task output info from path = [{}]", path.toString());

    try (final FSDataInputStream in = fs.open(path)) {
      return this.mapper.readValue((InputStream) in, HdlfsTaskOutputInfo.class);
    } catch (final Exception ex) {
      throw new IOException("Could not read task output info", ex);
    }
  }

  private HdlfsTaskCommitInfo readTaskCommitInfo(final FileSystem fs, final Path path) throws IOException {
    LOG.debug("Reading task commit info from path {}", path.toString());

    try (final FSDataInputStream in = fs.open(path)) {
      return this.mapper.readValue((InputStream) in, HdlfsTaskCommitInfo.class);
    } catch (final Exception ex) {
      throw new IOException("Could not read task commit info", ex);
    }
  }

  private String getTaskAttemptName(final TaskAttemptContext context) {
    return String.format("task-%s", context.getTaskAttemptID().toString());
  }

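  // Builds the task attempt work path: <dest>/<PENDING_PREFIX_NAME>/job-<jobUUID>/task-<taskAttemptId>/<BASE_PREFIX_NAME>.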
  private Path getTaskAttemptPath(final TaskAttemptContext context,
                                  final String jobUUID,
                                  final Path dest) {
    final String taskAttemptPath = this.getTaskAttemptName(context);
    final Path fullPathWithTaskAttempt = new Path(this.getJobAttemptPath(jobUUID, dest), taskAttemptPath);
    final Path fullPathWithBaseMarker = new Path(fullPathWithTaskAttempt, HdlfsConstants.BASE_PREFIX_NAME);

    return fullPathWithBaseMarker;
  }

  private String getRelativePathFromSchemaPath(final Path outputPath) {
    return outputPath.toUri().getPath();
  }

  private Path getJobAttemptPath(final String jobUUID, final Path dest) {
    final Path destWithMagicSuffix = new Path(dest, new Path(HdlfsConstants.PENDING_PREFIX_NAME));

    return new Path(destWithMagicSuffix, new Path(String.format("job-%s", jobUUID)));
  }

  private Path getTaskPendingSetPath(final TaskAttemptContext context) {
    final Path jobAttemptDir = this.getJobAttemptPath(this.jobId, this.relativeOutputPath);
    final String pendingSetFileName = this.getTaskAttemptName(context) + HdlfsConstants.PENDINGSET_SUFFIX;

    return new Path(jobAttemptDir, pendingSetFileName);
  }

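  // Callable executed on the shared thread pool during commitJob: reads one .pendingset file,
  // merges its task outputs into their final locations, and schedules the consumed chunks and the
  // .pendingset file itself for (non-fatal) batch deletion.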
  private final class PendingSetCommitter implements Callable<Void> {
    private final String pendingSetPath;
    private final FileSystem destinationFs;
    private final Set<FsCacheEntry> filesToBeUploadedToFsCache;

    public PendingSetCommitter(
        final String pendingSetPath,
        final FileSystem destinationFs,
        final Set<FsCacheEntry> filesToBeUploadedToFsCache) {
      this.pendingSetPath = pendingSetPath;
      this.destinationFs = destinationFs;
      this.filesToBeUploadedToFsCache = filesToBeUploadedToFsCache;
    }

    @Override
    public Void call() throws IOException {
      final Path pendingSetFilePath = new Path(this.pendingSetPath);
      final HdlfsTaskCommitInfo commitInfo = readTaskCommitInfo(this.destinationFs, pendingSetFilePath);

      LOG.debug("Committing output for task attempt {}", commitInfo.taskAttemptId);

      final List<String> filesToBeDeleted;

      try {
        final List<String> chunksToBeDeleted = HdlfsOutputCommitter.this.commitTaskOutput(commitInfo, this.filesToBeUploadedToFsCache);

        filesToBeDeleted = new ArrayList<>(chunksToBeDeleted);
      } catch (final IOException ex) {
        final String message = String.format("Failed to commit multipart upload for pendingset %s", pendingSetFilePath);
        throw new IOException(message, ex);
      }

      LOG.debug("Output of task attempt {} was committed successfully", commitInfo.taskAttemptId);

      filesToBeDeleted.add(pendingSetFilePath.toString());

      HdlfsOutputCommitter.this.performDeleteBatch(filesToBeDeleted, false);

      return null;
    }
  }


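  // Immutable description of a single FsCache update (parent prefix, entry name, prefix/deleted flags);
  // the natural ordering sorts by prefix first so that addEntriesToFsCache can batch consecutive
  // entries into one patch operation per prefix.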
  static final class FsCacheEntry implements Comparable<FsCacheEntry> {

    private final Path prefix;
    private final String path;
    private final boolean isPrefix;
    private final boolean markAsDeleted;

    public FsCacheEntry(final Path prefix, final String path, final boolean isPrefix, final boolean markAsDeleted) {
      this.prefix = prefix;
      this.path = path;
      this.isPrefix = isPrefix;
      this.markAsDeleted = markAsDeleted;
    }

    public Path getPrefix() {
      return this.prefix;
    }

    public String getPath() {
      return this.path;
    }

    public boolean getIsPrefix() {
      return this.isPrefix;
    }

    public boolean getMarkAsDeleted() {
      return this.markAsDeleted;
    }

    @Override
    public int compareTo(@Nonnull final FsCacheEntry other) {
      final int prefixCompare = this.prefix.compareTo(other.getPrefix());

      if (prefixCompare != 0) {
        return prefixCompare;
      }

      final int pathCompare = this.path.compareTo(other.getPath());

      if (pathCompare != 0) {
        return pathCompare;
      }

      final int isPrefixCompare = Boolean.compare(this.isPrefix, other.getIsPrefix());

      if (isPrefixCompare != 0) {
        return isPrefixCompare;
      }

      return Boolean.compare(this.markAsDeleted, other.getMarkAsDeleted());
    }
  }
}

// © 2022-2024 SAP SE or an SAP affiliate company. All rights reserved.
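
To make the commit protocol above concrete, here is a minimal, hypothetical sketch of the committer lifecycle driven by hand for a single task attempt. It is not part of the library: in a real MapReduce or Spark job the framework creates the contexts and invokes these callbacks itself, and the package name, output URI, and task attempt IDs below are illustrative assumptions. The destination path must resolve to an HdlfsFileSystem for the constructor to accept it.

// Minimal lifecycle sketch (hypothetical, not shipped with the artifact).
package com.sap.hana.datalake.files.examples;

import com.sap.hana.datalake.files.HdlfsOutputCommitter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class HdlfsOutputCommitterLifecycleSketch {

  public static void main(final String[] args) throws Exception {
    final Configuration conf = new Configuration();
    // Hypothetical destination; it must resolve to an HdlfsFileSystem instance.
    final Path output = new Path("hdlfs://my-container.example/warehouse/out");

    final TaskAttemptID attemptId = new TaskAttemptID("local", 1, TaskType.MAP, 0, 0);
    final TaskAttemptContextImpl taskContext = new TaskAttemptContextImpl(conf, attemptId);

    final HdlfsOutputCommitter committer = new HdlfsOutputCommitter(output, taskContext);

    committer.setupJob(taskContext);   // publishes fs.hdlfs.commit.jobuuid in the configuration
    committer.setupTask(taskContext);  // no-op besides logging

    // ... the task writes its output files under committer.getWorkPath() ...

    if (committer.needsTaskCommit(taskContext)) {
      committer.commitTask(taskContext); // aggregates the task's .pending files into a .pendingset
    }

    committer.commitJob(taskContext);    // merges every .pendingset into the final output path
  }
}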



