com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConstants Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sap-hdlfs Show documentation
An implementation of org.apache.hadoop.fs.FileSystem targeting SAP HANA Data Lake Files.
There is a newer version: 3.0.27
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.sap.hana.datalake.files.committers.manifest;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;

/**
 * Public constants for the manifest committer.
 * This includes all configuration options and their default values.
 */
@InterfaceAudience.Public
@InterfaceStability.Unstable
final class ManifestCommitterConstants {

    /**
     * Suffix to use in manifest files in the manifest subdir.
     * Value: {@value}.
     */
    public static final String MANIFEST_SUFFIX = "-manifest.json";

    /**
     * Prefix for summary files in the report dir. Call
     */
    public static final String SUMMARY_FILENAME_PREFIX = "summary-";

    /**
     * Format string used to build a summary file from a Job ID.
     */
    public static final String SUMMARY_FILENAME_FORMAT =
            SUMMARY_FILENAME_PREFIX + "%s.json";

    /**
     * Suffix to use for temp files before renaming them.
     * Value: {@value}.
     */
    public static final String TMP_SUFFIX = ".tmp";

    /**
     * Initial number of all app attempts.
     * This is fixed in YARN; for Spark jobs the
     * same number "0" is used.
     */
    public static final int INITIAL_APP_ATTEMPT_ID = 0;

    /**
     * Format string for building a job dir.
     * Value: {@value}.
     */
    public static final String JOB_DIR_FORMAT_STR = "%s";

    /**
     * Format string for building a job attempt dir.
     * This uses the job attempt number so previous versions
     * can be found trivially.
     * Value: {@value}.
     */
    public static final String JOB_ATTEMPT_DIR_FORMAT_STR = "%02d";

    /**
     * Name of directory under job attempt dir for manifests.
     */
    public static final String JOB_TASK_MANIFEST_SUBDIR = "manifests";

    /**
     * Name of directory under job attempt dir for task attempts.
     */
    public static final String JOB_TASK_ATTEMPT_SUBDIR = "tasks";


    /**
     * Committer classname as recorded in the committer _SUCCESS file.
     */
    public static final String MANIFEST_COMMITTER_CLASSNAME =
            ManifestCommitter.class.getName();

    /**
     * Marker file to create on success: {@value}.
     */
    public static final String SUCCESS_MARKER = "_SUCCESS";

    /** Default job marker option: {@value}. */
    public static final boolean DEFAULT_CREATE_SUCCESSFUL_JOB_DIR_MARKER = true;

    /**
     * The limit to the number of committed objects tracked during
     * job commits and saved to the _SUCCESS file.
     * Value: {@value}.
     */
    public static final int SUCCESS_MARKER_FILE_LIMIT = 100;

    /**
     * The UUID for jobs: {@value}.
     * This was historically created in Spark 1.x's SQL queries,
     * but "went away".
     * It has been restored in recent spark releases.
     * If found: it is used instead of the MR job attempt ID.
     */
    public static final String SPARK_WRITE_UUID = "spark.sql.sources.writeJobUUID";

    /**
     * String to use as source of the job ID.
     * This SHOULD be kept in sync with that of
     * {@code AbstractS3ACommitter.JobUUIDSource}.
     * Value: {@value}.
     */
    public static final String JOB_ID_SOURCE_MAPREDUCE = "JobID";

    /**
     * Prefix to use for config options: {@value}.
     */
    public static final String OPT_PREFIX = "mapreduce.manifest.committer.";

    /**
     * Should dir cleanup do parallel deletion of task attempt dirs
     * before trying to delete the toplevel dirs.
     * For GCS this may deliver speedup, while on ABFS it may avoid
     * timeouts in certain deployments.
     * Value: {@value}.
     */
    public static final String OPT_CLEANUP_PARALLEL_DELETE =
            OPT_PREFIX + "cleanup.parallel.delete";

    /**
     * Default value:  {@value}.
     */
    public static final boolean OPT_CLEANUP_PARALLEL_DELETE_DIRS_DEFAULT = true;

    /**
     * Threads to use for IO.
     */
    public static final String OPT_IO_PROCESSORS = OPT_PREFIX + "io.threads";

    /**
     * Default value:  {@value}.
     */
    public static final int OPT_IO_PROCESSORS_DEFAULT = 64;

    /**
     * Directory for saving job summary reports.
     * These are the _SUCCESS files, but are saved even on
     * job failures.
     * Value: {@value}.
     */
    public static final String OPT_SUMMARY_REPORT_DIR =
            OPT_PREFIX + "summary.report.directory";

    /**
     * Directory for moving manifests under for diagnostics.
     * Value: {@value}.
     */
    public static final String OPT_DIAGNOSTICS_MANIFEST_DIR =
            OPT_PREFIX + "diagnostics.manifest.directory";

    /**
     * Should the output be validated?
     * This will check expected vs actual file lengths, and,
     * if etags can be obtained, etags.
     * Value: {@value}.
     */
    public static final String OPT_VALIDATE_OUTPUT = OPT_PREFIX + "validate.output";

    /**
     * Default value: {@value}.
     */
    public static final boolean OPT_VALIDATE_OUTPUT_DEFAULT = false;

    /**
     * Should job commit delete for files/directories at the targets
     * of renames, and, if found, deleting them?
     *
     * This is part of the effective behavior of the FileOutputCommitter,
     * however it adds an extra delete call per file being committed.
     *
     * If a job is writing to a directory which has only just been created
     * or were unique filenames are being used, there is no need to perform
     * this preparation.
     * The recognition of newly created dirs is automatic.
     *
     * Value: {@value}.
     */
    public static final String OPT_DELETE_TARGET_FILES =
            OPT_PREFIX + "delete.target.files";

    /**
     * Default value: {@value}.
     */
    public static final boolean OPT_DELETE_TARGET_FILES_DEFAULT = false;

    /**
     * Classname of the store operations; filesystems and tests
     * may override.
     * Value: {@value}.
     */
    public static final String OPT_STORE_OPERATIONS_CLASS = OPT_PREFIX + "store.operations.classname";

    /**
     * Default classname of the store operations.
     * Value: {@value}.
     */
    public static final String STORE_OPERATIONS_CLASS_DEFAULT =
            ManifestStoreOperationsThroughFileSystem.class.getName();

    /**
     * Stage attribute in audit context: {@value}.
     */
    public static final String CONTEXT_ATTR_STAGE = "st";

    /**
     * Task ID attribute in audit context: {@value}.
     */
    public static final String CONTEXT_ATTR_TASK_ATTEMPT_ID = "ta";

    /**
     * Stream Capabilities probe for spark dynamic partitioning compatibility.
     */
    public static final String CAPABILITY_DYNAMIC_PARTITIONING =
            "mapreduce.job.committer.dynamic.partitioning";

    private ManifestCommitterConstants() {
    }

}