All downloads are free. The search and download functionality uses the official Maven repository.

com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConfig Maven / Gradle / Ivy

Go to download

An implementation of org.apache.hadoop.fs.FileSystem targeting SAP HANA Data Lake Files.

There is a newer version: 3.0.27
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.sap.hana.datalake.files.committers.manifest;

import com.sap.hana.datalake.files.utils.threads.ThreadUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.util.Progressable;

import java.io.IOException;
import java.util.Objects;
import java.util.concurrent.ExecutorService;

/**
 * The configuration for the committer as built up from the job configuration
 * and data passed down from the committer factory.
 * Isolated for ease of dev/test.
 *
 * <p>All fields are final and assigned once in the constructor; the instance
 * is built either from a plain {@link JobContext} (job-level view) or from a
 * {@link TaskAttemptContext} (task-level view), which determines whether the
 * task-specific fields carry real values or their "empty" placeholders.
 */
final class ManifestCommitterConfig {

    /**
     * Final destination of work.
     * This is unqualified.
     */
    private final Path destinationDir;

    /**
     * Role: used in log/text messages.
     */
    private final String role;

    /**
     * This is the directory for all intermediate work: where the output
     * format will write data.
     * Will be null if built from a job context.
     */
    private final Path taskAttemptDir;

    /** Configuration of the job. */
    private final Configuration conf;

    /** The job context. For a task, this can be cast to a TaskContext. */
    private final JobContext jobContext;

    /** Should a job marker be created? */
    private final boolean createJobMarker;

    /**
     * Job ID Or UUID -without any attempt suffix.
     * This is expected/required to be unique, though
     * Spark has had "issues" there until recently
     * with lack of uniqueness of generated MR Job IDs.
     */
    private final String jobUniqueId;

    /**
     * Where did the job Unique ID come from?
     */
    private final String jobUniqueIdSource;

    /**
     * Number of this attempt; starts at zero.
     */
    private final int jobAttemptNumber;

    /**
     * Job ID + AttemptID, joined as {@code jobUniqueId + "_" + jobAttemptNumber}.
     */
    private final String jobAttemptId;

    /**
     * Task ID: used as the filename of the manifest.
     * Will be "" if built from a job context.
     */
    private final String taskId;

    /**
     * Task attempt ID. Determines the working
     * directory for task attempts to write data into,
     * and for the task committer to scan.
     * Will be "" if built from a job context.
     */
    private final String taskAttemptId;

    /**
     * Any progressable for progress callbacks.
     * Will be null if built from a job context.
     */
    private final Progressable progressable;

    /** Should the output be validated after the commit? */
    private final boolean validateOutput;

    /**
     * Attempt directory management.
     */
    private final ManifestCommitterSupport.AttemptDirectories dirs;

    /**
     * Callback when a stage is entered.
     */
    private final StageEventCallbacks stageEventCallbacks;

    /**
     * Name for logging.
     */
    private final String name;

    /**
     * Delete target paths on commit? Stricter, but
     * higher IO cost.
     */
    private final boolean deleteTargetPaths;

    /**
     * Constructor.
     * @param outputPath destination path of the job.
     * @param role role for log messages.
     * @param context job/task context; must not be null. If it is a
     *                {@link TaskAttemptContext} the task-level fields are populated,
     *                otherwise they are set to their job-level placeholders.
     * @param stageEventCallbacks stage event callbacks.
     * @throws NullPointerException if {@code context} is null, or if a task
     *                              attempt context returns a null task attempt ID.
     */
    ManifestCommitterConfig(
            final Path outputPath,
            final String role,
            final JobContext context,
            final StageEventCallbacks stageEventCallbacks) {
        // Fail with a named NPE rather than an anonymous one on the first dereference.
        Objects.requireNonNull(context, "context");

        this.role = role;
        this.jobContext = context;
        this.conf = context.getConfiguration();
        this.destinationDir = outputPath;
        this.stageEventCallbacks = stageEventCallbacks;

        // The pair must be parameterized: with a raw Pair the accessors return
        // Object and cannot be assigned to the String fields below.
        final Pair<String, String> jobUuid =
                ManifestCommitterSupport.buildJobUUID(this.conf, context.getJobID());
        this.jobUniqueId = jobUuid.getLeft();
        this.jobUniqueIdSource = jobUuid.getRight();
        this.jobAttemptNumber = ManifestCommitterSupport.getAppAttemptId(context);
        this.jobAttemptId = this.jobUniqueId + "_" + this.jobAttemptNumber;

        // build directories
        this.dirs = new ManifestCommitterSupport.AttemptDirectories(outputPath,
                this.jobUniqueId, this.jobAttemptNumber);

        // read in configuration options
        this.createJobMarker = this.conf.getBoolean(
                FileOutputCommitter.SUCCESSFUL_JOB_OUTPUT_DIR_MARKER,
                ManifestCommitterConstants.DEFAULT_CREATE_SUCCESSFUL_JOB_DIR_MARKER);
        this.validateOutput = this.conf.getBoolean(
                ManifestCommitterConstants.OPT_VALIDATE_OUTPUT,
                ManifestCommitterConstants.OPT_VALIDATE_OUTPUT_DEFAULT);
        this.deleteTargetPaths = this.conf.getBoolean(
                ManifestCommitterConstants.OPT_DELETE_TARGET_FILES,
                ManifestCommitterConstants.OPT_DELETE_TARGET_FILES_DEFAULT);

        // if constructed with a task attempt, build the task ID and path.
        if (context instanceof TaskAttemptContext) {
            // it's a task
            final TaskAttemptContext tac = (TaskAttemptContext) context;
            final TaskAttemptID taskAttempt = Objects.requireNonNull(
                    tac.getTaskAttemptID());
            this.taskAttemptId = taskAttempt.toString();
            this.taskId = taskAttempt.getTaskID().toString();
            // Task attempt dir; must be different across instances
            this.taskAttemptDir = this.dirs.getTaskAttemptPath(this.taskAttemptId);
            // the context is also the progress callback.
            this.progressable = tac;
            this.name = String.format(InternalConstants.NAME_FORMAT_TASK_ATTEMPT, this.taskAttemptId);
        } else {
            // it's a job: task-level fields get their placeholder values
            this.taskId = "";
            this.taskAttemptId = "";
            this.taskAttemptDir = null;
            this.progressable = null;
            this.name = String.format(InternalConstants.NAME_FORMAT_JOB_ATTEMPT, this.jobAttemptId);
        }
    }

    /**
     * Render the identifying fields of this configuration for log messages.
     * @return a single-line string listing name, directories and job/task IDs.
     */
    @Override
    public String toString() {
        return "ManifestCommitterConfig{" +
                "name=" + this.name +
                ", destinationDir=" + this.destinationDir +
                ", role='" + this.role + '\'' +
                ", taskAttemptDir=" + this.taskAttemptDir +
                ", createJobMarker=" + this.createJobMarker +
                ", jobUniqueId='" + this.jobUniqueId + '\'' +
                ", jobUniqueIdSource='" + this.jobUniqueIdSource + '\'' +
                ", jobAttemptNumber=" + this.jobAttemptNumber +
                ", jobAttemptId='" + this.jobAttemptId + '\'' +
                ", taskId='" + this.taskId + '\'' +
                ", taskAttemptId='" + this.taskAttemptId + '\'' +
                '}';
    }

    /**
     * Get the destination filesystem.
     * Looked up via {@link FileSystem#get(java.net.URI, Configuration)} using the
     * destination directory's URI and the job configuration.
     * @return destination FS.
     * @throws IOException Problems binding to the destination FS.
     */
    FileSystem getDestinationFileSystem() throws IOException {
        return FileSystem.get(this.destinationDir.toUri(), this.conf);
    }

    /**
     * Create the stage config from the committer
     * configuration.
     * This does not bind the store operations
     * or processors.
     * @return a stage config with configuration options passed in.
     */
    StageConfig createStageConfig() {
        final StageConfig stageConfig = new StageConfig();
        stageConfig
                .withJobAttemptNumber(this.jobAttemptNumber)
                .withJobDirectories(this.dirs)
                .withJobId(this.jobUniqueId)
                .withJobIdSource(this.jobUniqueIdSource)
                .withName(this.name)
                .withProgressable(this.progressable)
                .withStageEventCallbacks(this.stageEventCallbacks)
                .withTaskAttemptDir(this.taskAttemptDir)
                .withTaskAttemptId(this.taskAttemptId)
                .withTaskId(this.taskId)
                .withDeleteTargetPaths(this.deleteTargetPaths);

        return stageConfig;
    }

    /** @return the (unqualified) final destination directory of the job. */
    public Path getDestinationDir() {
        return this.destinationDir;
    }

    /** @return the role string used in log/text messages. */
    public String getRole() {
        return this.role;
    }

    /** @return the task attempt directory; null if built from a job context. */
    public Path getTaskAttemptDir() {
        return this.taskAttemptDir;
    }

    /** @return the job attempt directory, as reported by the attempt directories. */
    public Path getJobAttemptDir() {
        return this.dirs.getJobAttemptDir();
    }

    /** @return the task manifest directory, as reported by the attempt directories. */
    public Path getTaskManifestDir() {
        return this.dirs.getTaskManifestDir();
    }

    /** @return the job configuration taken from the context at construction time. */
    public Configuration getConf() {
        return this.conf;
    }

    /** @return the job (or task attempt) context this config was built from. */
    public JobContext getJobContext() {
        return this.jobContext;
    }

    /** @return true if a job success marker should be created. */
    public boolean getCreateJobMarker() {
        return this.createJobMarker;
    }

    /** @return the job attempt ID: unique job ID, underscore, attempt number. */
    public String getJobAttemptId() {
        return this.jobAttemptId;
    }

    /** @return the task attempt ID; "" if built from a job context. */
    public String getTaskAttemptId() {
        return this.taskAttemptId;
    }

    /** @return the task ID; "" if built from a job context. */
    public String getTaskId() {
        return this.taskId;
    }

    /** @return the unique job ID, without any attempt suffix. */
    public String getJobUniqueId() {
        return this.jobUniqueId;
    }

    /** @return true if the output should be validated after the commit. */
    public boolean getValidateOutput() {
        return this.validateOutput;
    }

    /** @return the name used for logging. */
    public String getName() {
        return this.name;
    }

    /**
     * Create a new submitter task pool from the
     * {@link ManifestCommitterConstants#OPT_IO_PROCESSORS}
     * settings.
     * @return a new thread pool.
     */
    public CloseableTaskPoolSubmitter createSubmitter() {
        return this.createSubmitter(
                ManifestCommitterConstants.OPT_IO_PROCESSORS,
                ManifestCommitterConstants.OPT_IO_PROCESSORS_DEFAULT);
    }

    /**
     * Create a new submitter task pool.
     * A configured value of zero or less is ignored and {@code defVal} is used instead.
     * @param key config key with pool size.
     * @param defVal default value.
     * @return a new task pool.
     */
    public CloseableTaskPoolSubmitter createSubmitter(final String key, final int defVal) {
        int numThreads = this.conf.getInt(key, defVal);
        if (numThreads <= 0) {
            // a non-positive pool size is not usable: fall back to the default.
            numThreads = defVal;
        }

        return createCloseableTaskSubmitter(numThreads, this.getJobAttemptId(), this.conf);
    }

    /**
     * Create a new submitter task pool.
     * The pool's executor is obtained from
     * {@code ThreadUtils.newDaemonThreadFixedExecutor} with a name
     * of the form {@code ManifestCommitter-<jobAttemptId>-thread}.
     *
     * @param numThreads thread count.
     * @param jobAttemptId job ID
     * @param conf configuration passed through to the executor factory.
     * @return a new task pool.
     */
    public static CloseableTaskPoolSubmitter createCloseableTaskSubmitter(
            final int numThreads,
            final String jobAttemptId,
            final Configuration conf) {
        final ExecutorService executorService = ThreadUtils.newDaemonThreadFixedExecutor(
                numThreads,
                String.format("ManifestCommitter-%s-thread", jobAttemptId),
                conf);

        return new CloseableTaskPoolSubmitter(executorService);
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy