com.sap.hana.datalake.files.committers.manifest.ManifestCommitterConfig Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of sap-hdlfs Show documentation
Show all versions of sap-hdlfs Show documentation
An implementation of org.apache.hadoop.fs.FileSystem targeting SAP HANA Data Lake Files.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.sap.hana.datalake.files.committers.manifest;
import com.sap.hana.datalake.files.utils.threads.ThreadUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.util.Progressable;
import java.io.IOException;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
/**
 * The configuration for the committer as built up from the job configuration
 * and data passed down from the committer factory.
 * Isolated for ease of dev/test.
 *
 * <p>All fields are final and assigned in the constructor. The task-related
 * fields are only populated when the instance is constructed from a
 * {@link TaskAttemptContext}; otherwise they are empty strings or null.
 */
final class ManifestCommitterConfig {

  /**
   * Final destination of work.
   * This is unqualified.
   */
  private final Path destinationDir;

  /**
   * Role: used in log/text messages.
   */
  private final String role;

  /**
   * This is the directory for all intermediate work: where the output
   * format will write data.
   * Will be null if built from a job context.
   */
  private final Path taskAttemptDir;

  /** Configuration of the job. */
  private final Configuration conf;

  /** The job context. For a task, this can be cast to a TaskContext. */
  private final JobContext jobContext;

  /** Should a job marker be created? */
  private final boolean createJobMarker;

  /**
   * Job ID Or UUID -without any attempt suffix.
   * This is expected/required to be unique, though
   * Spark has had "issues" there until recently
   * with lack of uniqueness of generated MR Job IDs.
   */
  private final String jobUniqueId;

  /**
   * Where did the job Unique ID come from?
   */
  private final String jobUniqueIdSource;

  /**
   * Number of this attempt; starts at zero.
   */
  private final int jobAttemptNumber;

  /**
   * Job ID + AttemptID, joined with an underscore.
   */
  private final String jobAttemptId;

  /**
   * Task ID: used as the filename of the manifest.
   * Will be "" if built from a job context.
   */
  private final String taskId;

  /**
   * Task attempt ID. Determines the working
   * directory for task attempts to write data into,
   * and for the task committer to scan.
   * Will be "" if built from a job context.
   */
  private final String taskAttemptId;

  /**
   * Any progressable for progress callbacks.
   * Will be null if built from a job context.
   */
  private final Progressable progressable;

  /** Should the output be validated after the commit? */
  private final boolean validateOutput;

  /**
   * Attempt directory management.
   */
  private final ManifestCommitterSupport.AttemptDirectories dirs;

  /**
   * Callback when a stage is entered.
   */
  private final StageEventCallbacks stageEventCallbacks;

  /**
   * Name for logging.
   */
  private final String name;

  /**
   * Delete target paths on commit? Stricter, but
   * higher IO cost.
   */
  private final boolean deleteTargetPaths;

  /**
   * Constructor.
   * Reads the committer options from the configuration of the context
   * and, when the context is a {@link TaskAttemptContext}, also derives
   * the task ID, task attempt ID and task attempt directory.
   *
   * @param outputPath destination path of the job.
   * @param role role for log messages.
   * @param context job/task context; must not be null.
   * @param stageEventCallbacks stage event callbacks.
   * @throws NullPointerException if the context is null, or if a task
   *         attempt context returns a null task attempt ID.
   */
  ManifestCommitterConfig(
      final Path outputPath,
      final String role,
      final JobContext context,
      final StageEventCallbacks stageEventCallbacks) {
    // fail with a meaningful message rather than an anonymous NPE below.
    Objects.requireNonNull(context, "context");
    this.role = role;
    this.jobContext = context;
    this.conf = context.getConfiguration();
    this.destinationDir = outputPath;
    this.stageEventCallbacks = stageEventCallbacks;

    // left = unique job ID, right = where that ID came from.
    // The pair must be parameterized: a raw Pair would yield Object values
    // which cannot be assigned to the String fields.
    final Pair<String, String> pair =
        ManifestCommitterSupport.buildJobUUID(this.conf, context.getJobID());
    this.jobUniqueId = pair.getLeft();
    this.jobUniqueIdSource = pair.getRight();
    this.jobAttemptNumber = ManifestCommitterSupport.getAppAttemptId(context);
    this.jobAttemptId = this.jobUniqueId + "_" + this.jobAttemptNumber;

    // build directories
    this.dirs = new ManifestCommitterSupport.AttemptDirectories(outputPath,
        this.jobUniqueId, this.jobAttemptNumber);

    // read in configuration options
    this.createJobMarker = this.conf.getBoolean(
        FileOutputCommitter.SUCCESSFUL_JOB_OUTPUT_DIR_MARKER,
        ManifestCommitterConstants.DEFAULT_CREATE_SUCCESSFUL_JOB_DIR_MARKER);
    this.validateOutput = this.conf.getBoolean(
        ManifestCommitterConstants.OPT_VALIDATE_OUTPUT,
        ManifestCommitterConstants.OPT_VALIDATE_OUTPUT_DEFAULT);
    this.deleteTargetPaths = this.conf.getBoolean(
        ManifestCommitterConstants.OPT_DELETE_TARGET_FILES,
        ManifestCommitterConstants.OPT_DELETE_TARGET_FILES_DEFAULT);

    // if constructed with a task attempt, build the task ID and path.
    if (context instanceof TaskAttemptContext) {
      // it's a task
      final TaskAttemptContext tac = (TaskAttemptContext) context;
      final TaskAttemptID taskAttempt = Objects.requireNonNull(
          tac.getTaskAttemptID());
      this.taskAttemptId = taskAttempt.toString();
      this.taskId = taskAttempt.getTaskID().toString();
      // Task attempt dir; must be different across instances
      this.taskAttemptDir = this.dirs.getTaskAttemptPath(this.taskAttemptId);
      // the context is also the progress callback.
      this.progressable = tac;
      this.name = String.format(InternalConstants.NAME_FORMAT_TASK_ATTEMPT, this.taskAttemptId);
    } else {
      // it's a job: no task-level state.
      this.taskId = "";
      this.taskAttemptId = "";
      this.taskAttemptDir = null;
      this.progressable = null;
      this.name = String.format(InternalConstants.NAME_FORMAT_JOB_ATTEMPT, this.jobAttemptId);
    }
  }

  /**
   * Summarize the identifying fields of this configuration for logs.
   * @return a string listing the name, paths and job/task identifiers.
   */
  @Override
  public String toString() {
    return "ManifestCommitterConfig{" +
        "name=" + this.name +
        ", destinationDir=" + this.destinationDir +
        ", role='" + this.role + '\'' +
        ", taskAttemptDir=" + this.taskAttemptDir +
        ", createJobMarker=" + this.createJobMarker +
        ", jobUniqueId='" + this.jobUniqueId + '\'' +
        ", jobUniqueIdSource='" + this.jobUniqueIdSource + '\'' +
        ", jobAttemptNumber=" + this.jobAttemptNumber +
        ", jobAttemptId='" + this.jobAttemptId + '\'' +
        ", taskId='" + this.taskId + '\'' +
        ", taskAttemptId='" + this.taskAttemptId + '\'' +
        '}';
  }

  /**
   * Get the destination filesystem.
   * @return destination FS.
   * @throws IOException Problems binding to the destination FS.
   */
  FileSystem getDestinationFileSystem() throws IOException {
    return FileSystem.get(this.destinationDir.toUri(), this.conf);
  }

  /**
   * Create the stage config from the committer
   * configuration.
   * This does not bind the store operations
   * or processors.
   * @return a stage config with configuration options passed in.
   */
  StageConfig createStageConfig() {
    final StageConfig stageConfig = new StageConfig();
    stageConfig
        .withJobAttemptNumber(this.jobAttemptNumber)
        .withJobDirectories(this.dirs)
        .withJobId(this.jobUniqueId)
        .withJobIdSource(this.jobUniqueIdSource)
        .withName(this.name)
        .withProgressable(this.progressable)
        .withStageEventCallbacks(this.stageEventCallbacks)
        .withTaskAttemptDir(this.taskAttemptDir)
        .withTaskAttemptId(this.taskAttemptId)
        .withTaskId(this.taskId)
        .withDeleteTargetPaths(this.deleteTargetPaths);
    return stageConfig;
  }

  /** @return the (unqualified) final destination of the work. */
  public Path getDestinationDir() {
    return this.destinationDir;
  }

  /** @return the role used in log/text messages. */
  public String getRole() {
    return this.role;
  }

  /** @return the task attempt directory; null if built from a job context. */
  public Path getTaskAttemptDir() {
    return this.taskAttemptDir;
  }

  /** @return the job attempt directory, as reported by the attempt directories. */
  public Path getJobAttemptDir() {
    return this.dirs.getJobAttemptDir();
  }

  /** @return the task manifest directory, as reported by the attempt directories. */
  public Path getTaskManifestDir() {
    return this.dirs.getTaskManifestDir();
  }

  /** @return the job configuration. */
  public Configuration getConf() {
    return this.conf;
  }

  /** @return the job context this configuration was built from. */
  public JobContext getJobContext() {
    return this.jobContext;
  }

  /** @return true if a job marker should be created. */
  public boolean getCreateJobMarker() {
    return this.createJobMarker;
  }

  /** @return the job attempt ID: unique job ID, underscore, attempt number. */
  public String getJobAttemptId() {
    return this.jobAttemptId;
  }

  /** @return the task attempt ID; "" if built from a job context. */
  public String getTaskAttemptId() {
    return this.taskAttemptId;
  }

  /** @return the task ID; "" if built from a job context. */
  public String getTaskId() {
    return this.taskId;
  }

  /** @return the unique job ID, without any attempt suffix. */
  public String getJobUniqueId() {
    return this.jobUniqueId;
  }

  /** @return true if the output should be validated after the commit. */
  public boolean getValidateOutput() {
    return this.validateOutput;
  }

  /** @return the name used for logging. */
  public String getName() {
    return this.name;
  }

  /**
   * Create a new submitter task pool from the
   * {@link ManifestCommitterConstants#OPT_IO_PROCESSORS}
   * settings.
   * @return a new thread pool.
   */
  public CloseableTaskPoolSubmitter createSubmitter() {
    return this.createSubmitter(
        ManifestCommitterConstants.OPT_IO_PROCESSORS,
        ManifestCommitterConstants.OPT_IO_PROCESSORS_DEFAULT);
  }

  /**
   * Create a new submitter task pool.
   * @param key config key with pool size.
   * @param defVal default value; also used when the configured value
   *               is zero or negative.
   * @return a new task pool.
   */
  public CloseableTaskPoolSubmitter createSubmitter(final String key, final int defVal) {
    int numThreads = this.conf.getInt(key, defVal);
    if (numThreads <= 0) {
      // a non-positive pool size is not usable: fall back to the default.
      numThreads = defVal;
    }
    return createCloseableTaskSubmitter(numThreads, this.getJobAttemptId(), this.conf);
  }

  /**
   * Create a new submitter task pool.
   * The executor is obtained from
   * {@code ThreadUtils.newDaemonThreadFixedExecutor} and wrapped in a
   * {@link CloseableTaskPoolSubmitter}.
   *
   * @param numThreads thread count.
   * @param jobAttemptId job attempt ID; embedded in the
   *                     {@code "ManifestCommitter-%s-thread"} string
   *                     handed to the executor factory.
   * @param conf configuration handed to the executor factory.
   * @return a new task pool.
   */
  public static CloseableTaskPoolSubmitter createCloseableTaskSubmitter(
      final int numThreads,
      final String jobAttemptId,
      final Configuration conf) {
    final ExecutorService executorService = ThreadUtils.newDaemonThreadFixedExecutor(
        numThreads,
        String.format("ManifestCommitter-%s-thread", jobAttemptId),
        conf);
    return new CloseableTaskPoolSubmitter(executorService);
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy