All downloads are free. Search and download functionality uses the official Maven repository.

com.sap.hana.datalake.files.committers.manifest.ManifestCommitterSupport Maven / Gradle / Ivy

Go to download

An implementation of org.apache.hadoop.fs.FileSystem targeting SAP HANA Data Lake Files.

There is a newer version: 3.0.27
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.sap.hana.datalake.files.committers.manifest;

import java.io.IOException;
import java.time.ZonedDateTime;

import com.sap.hana.datalake.files.HasETag;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathIOException;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UserGroupInformation;

import static java.util.Objects.requireNonNull;
import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.PENDING_DIR_NAME;
import static com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConstants.INITIAL_APP_ATTEMPT_ID;
import static com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConstants.JOB_ATTEMPT_DIR_FORMAT_STR;
import static com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConstants.JOB_DIR_FORMAT_STR;
import static com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConstants.JOB_ID_SOURCE_MAPREDUCE;
import static com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConstants.JOB_TASK_ATTEMPT_SUBDIR;
import static com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConstants.JOB_TASK_MANIFEST_SUBDIR;
import static com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConstants.MANIFEST_COMMITTER_CLASSNAME;
import static com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConstants.MANIFEST_SUFFIX;
import static com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConstants.OPT_STORE_OPERATIONS_CLASS;
import static com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConstants.SPARK_WRITE_UUID;
import static com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConstants.SUMMARY_FILENAME_FORMAT;
import static com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConstants.TMP_SUFFIX;
import static com.sap.hana.datalake.files.committers.manifest.DiagnosticKeys.PRINCIPAL;
import static com.sap.hana.datalake.files.committers.manifest.DiagnosticKeys.STAGE;

/**
 * Utility methods supporting the manifest committer.
 */

@InterfaceAudience.Private
final class ManifestCommitterSupport {

    /**
     * Private constructor: this class only declares static members
     * and is not meant to be instantiated.
     */
    private ManifestCommitterSupport() {
    }

    /**
     * Build a job UUID from the job configuration: the value of
     * {@link ManifestCommitterConstants#SPARK_WRITE_UUID} if that option
     * is set to a non-blank value, otherwise the MR job ID.
     * @param conf job/task configuration
     * @param jobId job ID from YARN or spark; only consulted when the
     * configuration option is unset or blank.
     * @return (a job ID, source) where the source is either
     * {@code SPARK_WRITE_UUID} or {@code JOB_ID_SOURCE_MAPREDUCE}.
     */
    public static Pair<String, String> buildJobUUID(Configuration conf,
                                                    JobID jobId) {
        final String sparkUUID = conf.getTrimmed(SPARK_WRITE_UUID, "");
        if (!sparkUUID.isEmpty()) {
            // the configuration supplied the UUID: report the option as source
            return Pair.of(sparkUUID, SPARK_WRITE_UUID);
        }
        // no UUID in the configuration: fall back to the job ID
        return Pair.of(jobId.toString(), JOB_ID_SOURCE_MAPREDUCE);
    }

    /**
     * Resolve the directory holding pending job attempts, which is the
     * {@code PENDING_DIR_NAME} child of the output directory.
     * @param out the base output directory.
     * @return the location of pending job attempts.
     */
    public static Path getPendingJobAttemptsPath(Path out) {
        final Path pendingJobAttempts = new Path(out, PENDING_DIR_NAME);
        return pendingJobAttempts;
    }

    /**
     * Look up the Application Attempt Id of a job through the
     * configuration of its context.
     * @param context the context to look in
     * @return the Application Attempt Id for a given job.
     */
    public static int getAppAttemptId(JobContext context) {
        final Configuration jobConf = context.getConfiguration();
        return getAppAttemptId(jobConf);
    }

    /**
     * Read the Application Attempt Id from the option
     * {@link MRJobConfig#APPLICATION_ATTEMPT_ID}, using
     * {@code INITIAL_APP_ATTEMPT_ID} as the default when it is unset.
     * For spark it will always be the default; for MR it will be set in
     * the AM to the {@code ApplicationAttemptId} the AM is launched with.
     * @param conf job configuration.
     * @return the Application Attempt Id for the job.
     */
    public static int getAppAttemptId(Configuration conf) {
        final int attemptId = conf.getInt(
                MRJobConfig.APPLICATION_ATTEMPT_ID, INITIAL_APP_ATTEMPT_ID);
        return attemptId;
    }

    /**
     * Build the path of a task's manifest: the task ID followed by
     * {@code MANIFEST_SUFFIX}, under the manifest directory.
     * @param manifestDir manifest directory
     * @param taskId taskID.
     * @return the final path to rename the manifest file to
     */
    public static Path manifestPathForTask(Path manifestDir, String taskId) {
        final String manifestFilename = taskId + MANIFEST_SUFFIX;
        return new Path(manifestDir, manifestFilename);
    }

    /**
     * Build the temporary path under the manifest directory where a
     * task attempt's manifest is saved before being renamed to the
     * path defined by {@link #manifestPathForTask(Path, String)}.
     * The filename is the task attempt ID followed by
     * {@code MANIFEST_SUFFIX} and {@code TMP_SUFFIX}.
     * @param manifestDir manifest directory
     * @param taskAttemptId task attempt ID.
     * @return the path to save/load the manifest.
     */
    public static Path manifestTempPathForTaskAttempt(Path manifestDir,
                                                      String taskAttemptId) {
        final String tempFilename = taskAttemptId + MANIFEST_SUFFIX + TMP_SUFFIX;
        return new Path(manifestDir, tempFilename);
    }

    /**
     * Create a task manifest populated from the stage configuration,
     * which must be for a task attempt.
     * The task attempt ID, task ID, job ID, job attempt number and
     * the task attempt directory (as a URI string) are copied over.
     * @param stageConfig stage config.
     * @return a manifest with job and task attempt info set up.
     */
    public static TaskManifest createTaskManifest(StageConfig stageConfig) {
        final TaskManifest taskManifest = new TaskManifest();
        taskManifest.setTaskAttemptID(stageConfig.getTaskAttemptId());
        taskManifest.setTaskID(stageConfig.getTaskId());
        taskManifest.setJobId(stageConfig.getJobId());
        taskManifest.setJobAttemptNumber(stageConfig.getJobAttemptNumber());
        // the directory is stored in its URI string form
        final String attemptDirUri =
                stageConfig.getTaskAttemptDir().toUri().toString();
        taskManifest.setTaskAttemptDir(attemptDirUri);
        return taskManifest;
    }

    /**
     * Create the success/outcome data for a job, filled in with the job
     * identity, the committer name, the current time, the local hostname
     * and some diagnostics.
     * @param stageConfig configuration.
     * @param stage value stored under the {@code STAGE} diagnostic key.
     * @return a _SUCCESS object with some diagnostics.
     */
    public static ManifestSuccessData createManifestOutcome(
            StageConfig stageConfig, String stage) {
        final ManifestSuccessData successData = new ManifestSuccessData();
        successData.setJobId(stageConfig.getJobId());
        successData.setJobIdSource(stageConfig.getJobIdSource());
        successData.setCommitter(MANIFEST_COMMITTER_CLASSNAME);

        // real timestamp, followed by its human-readable counterpart
        final long timestampMillis = System.currentTimeMillis();
        successData.setTimestamp(timestampMillis);
        successData.setDate(ZonedDateTime.now().toString());

        final String hostname = NetUtils.getLocalHostname();
        successData.setHostname(hostname);

        // add some extra diagnostics which can still be parsed by older
        // builds of test applications.
        // Audit Span information can go in here too, in future.
        try {
            final String principal =
                    UserGroupInformation.getCurrentUser().getShortUserName();
            successData.putDiagnostic(PRINCIPAL, principal);
        } catch (IOException ignored) {
            // don't know who we are? exclude from the diagnostics.
        }
        successData.putDiagnostic(STAGE, stage);
        return successData;
    }

    /**
     * Create the filename for a report by formatting the job ID
     * with {@code SUMMARY_FILENAME_FORMAT}.
     * @param jobId jobId
     * @return filename for a report.
     */
    public static String createJobSummaryFilename(String jobId) {
        final String summaryFilename =
                String.format(SUMMARY_FILENAME_FORMAT, jobId);
        return summaryFilename;
    }

    /**
     * Get the etag of a FileStatus, if it provides one.
     * An etag is only available when the status is an implementation
     * of {@link HasETag}; for any other status, including null,
     * there is no etag to return.
     * @param status the status; may be null.
     * @return the etag reported by the status, or null if the status
     * is null or does not implement {@link HasETag}.
     */
    public static String getEtag(FileStatus status) {
        // Use com.sap.hana.datalake.files.HasETag interface
        // instead of org.apache.hadoop.fs.EtagSource which is only available from Hadoop 3.3.2
        if (status instanceof HasETag) {
            // the accessor is reached through the HasETag type,
            // hence the explicit cast after the instanceof check.
            return ((HasETag) status).getETag();
        }
        return null;
    }

    /**
     * Create the manifest store operations for the given FS.
     * The implementation class is read from the option
     * {@code OPT_STORE_OPERATIONS_CLASS}, defaulting to
     * {@link ManifestStoreOperationsThroughFileSystem}; it is instantiated
     * through its no-argument constructor and then bound to the filesystem.
     * This supports binding to custom filesystem handlers.
     * @param conf configuration.
     * @param filesystem fs.
     * @param path path under FS.
     * @return a bonded store operations.
     * @throws IOException on binding/init problems; any failure is
     * reported as a {@link PathIOException} for {@code path}.
     */
    public static ManifestStoreOperations createManifestStoreOperations(
            final Configuration conf,
            final FileSystem filesystem,
            final Path path) throws IOException {
        try {
            // typed class reference, so that newInstance() yields a
            // ManifestStoreOperations rather than a plain Object.
            final Class<? extends ManifestStoreOperations> storeClass = conf.getClass(
                    OPT_STORE_OPERATIONS_CLASS,
                    ManifestStoreOperationsThroughFileSystem.class,
                    ManifestStoreOperations.class);
            final ManifestStoreOperations operations = storeClass.
                    getDeclaredConstructor().newInstance();
            operations.bindToFileSystem(filesystem, path);
            return operations;
        } catch (Exception e) {
            // chain the original exception so its stack trace is not lost
            final IOException cause = new IOException("Failed to create Store Operations from configuration option "
                    + OPT_STORE_OPERATIONS_CLASS
                    + ":" + e, e);
            throw new PathIOException(path.toString(), cause);
        }
    }

    /**
     * Logic to create directory names from job and attempt.
     * This is self-contained so that it can be used in tests
     * as well as in the committer.
     * All paths are computed once, in the constructor.
     */
    public static class AttemptDirectories {

        /** Job output path. */
        private final Path outputPath;

        /** Temp directory under the job output path. */
        private final Path outputTempSubDir;

        /** Path for the job, under the temp directory. */
        private final Path jobPath;

        /** Path for the job attempt, under the job path. */
        private final Path jobAttemptDir;

        /**
         * Subdir under the job attempt dir where task
         * attempts will have subdirectories.
         */
        private final Path jobAttemptTaskSubDir;

        /** Directory under the job attempt dir to save manifests into. */
        private final Path taskManifestDir;

        /**
         * Build the attempt directories.
         * @param outputPath output path; must not be null.
         * @param jobUniqueId job ID/UUID
         * @param jobAttemptNumber job attempt number
         */
        public AttemptDirectories(
                Path outputPath,
                String jobUniqueId,
                int jobAttemptNumber) {
            this.outputPath = requireNonNull(outputPath, "Output path");

            // work down the tree: temp dir, then job, then job attempt.
            final Path tempDir = new Path(outputPath, PENDING_DIR_NAME);
            final Path jobDir = new Path(tempDir,
                    String.format(JOB_DIR_FORMAT_STR, jobUniqueId));
            final Path attemptDir = new Path(jobDir,
                    String.format(JOB_ATTEMPT_DIR_FORMAT_STR, jobAttemptNumber));

            this.outputTempSubDir = tempDir;
            this.jobPath = jobDir;
            this.jobAttemptDir = attemptDir;

            // the two subdirectories of the job attempt dir:
            // one for task attempts, one for task manifests.
            this.jobAttemptTaskSubDir = new Path(attemptDir, JOB_TASK_ATTEMPT_SUBDIR);
            this.taskManifestDir = new Path(attemptDir, JOB_TASK_MANIFEST_SUBDIR);
        }

        /**
         * @return the job output path.
         */
        public Path getOutputPath() {
            return this.outputPath;
        }

        /**
         * @return the path for the job attempt.
         */
        public Path getJobAttemptDir() {
            return this.jobAttemptDir;
        }

        /**
         * @return the path for the job.
         */
        public Path getJobPath() {
            return this.jobPath;
        }

        /**
         * @return the subdir of the job attempt dir holding task attempts.
         */
        public Path getJobAttemptTaskSubDir() {
            return this.jobAttemptTaskSubDir;
        }

        /**
         * Get the path of a task attempt: a child of the task attempt
         * subdir named after the task attempt ID.
         * @param taskAttemptId task attempt ID.
         * @return the path for that task attempt.
         */
        public Path getTaskAttemptPath(String taskAttemptId) {
            final Path taskAttemptPath =
                    new Path(this.jobAttemptTaskSubDir, taskAttemptId);
            return taskAttemptPath;
        }

        /**
         * @return the temp directory under the job output path.
         */
        public Path getOutputTempSubDir() {
            return this.outputTempSubDir;
        }

        /**
         * @return the directory to save manifests into.
         */
        public Path getTaskManifestDir() {
            return this.taskManifestDir;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy