
/*
* The Alluxio Open Foundation licenses this work under the Apache License, version 2.0
* (the "License"). You may not use this work except in compliance with the License, which is
* available at www.apache.org/licenses/LICENSE-2.0
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied, as more fully set forth in the License.
*
* See the NOTICE file distributed with this work for information regarding copyright ownership.
*/
package alluxio.master.job;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import alluxio.client.block.stream.BlockWorkerClient;
import alluxio.conf.Configuration;
import alluxio.conf.PropertyKey;
import alluxio.exception.runtime.AlluxioRuntimeException;
import alluxio.exception.runtime.InternalRuntimeException;
import alluxio.exception.runtime.InvalidArgumentRuntimeException;
import alluxio.grpc.Block;
import alluxio.grpc.BlockStatus;
import alluxio.grpc.JobProgressReportFormat;
import alluxio.grpc.LoadRequest;
import alluxio.grpc.LoadResponse;
import alluxio.grpc.TaskStatus;
import alluxio.grpc.UfsReadOptions;
import alluxio.job.JobDescription;
import alluxio.metrics.MetricKey;
import alluxio.metrics.MetricsSystem;
import alluxio.proto.journal.Journal;
import alluxio.scheduler.job.JobState;
import alluxio.scheduler.job.Task;
import alluxio.util.FormatUtils;
import alluxio.wire.BlockInfo;
import alluxio.wire.FileInfo;
import alluxio.wire.WorkerInfo;
import com.codahale.metrics.Counter;
import com.codahale.metrics.Meter;
import com.fasterxml.jackson.annotation.JsonAutoDetect;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.PropertyAccessor;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.MoreObjects;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.ListenableFuture;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.OptionalLong;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.CancellationException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Predicate;
import javax.annotation.concurrent.NotThreadSafe;
/**
* Load job that loads a file or a directory into Alluxio.
* This class should only be manipulated from the scheduler thread in the Scheduler,
* so the state-changing functions are not thread safe.
* This class is deprecated; refer to {@link DoraLoadJob} instead.
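* <p>
* A minimal construction sketch for illustration only. The {@code FileIterable} is assumed to be
* created by the caller (e.g. the scheduler), and the path, user, bandwidth and
* {@code availableWorkers} values below are made-up examples:
* <pre>{@code
* FileIterable files = ...; // file listing supplied by the caller
* LoadJob job = new LoadJob("/data/input", "alice", OptionalLong.of(64 * 1024 * 1024), files);
* List<LoadTask> tasks = job.getNextTasks(availableWorkers);
* }</pre>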
*/
@NotThreadSafe
public class LoadJob extends AbstractJob<LoadJob.LoadTask> {
private static final Logger LOG = LoggerFactory.getLogger(LoadJob.class);
public static final String TYPE = "load";
private static final double FAILURE_RATIO_THRESHOLD = 0.05;
private static final int FAILURE_COUNT_THRESHOLD = 100;
private static final int RETRY_BLOCK_CAPACITY = 1000;
private static final double RETRY_THRESHOLD = 0.8 * RETRY_BLOCK_CAPACITY;
private static final int BATCH_SIZE = Configuration.getInt(PropertyKey.JOB_BATCH_SIZE);
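// A file qualifies for loading when it is a completed, persisted, non-directory file that is
// not already fully loaded into Alluxio (in-Alluxio percentage below 100).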
public static final Predicate<FileInfo> QUALIFIED_FILE_FILTER =
(fileInfo) -> !fileInfo.isFolder() && fileInfo.isCompleted() && fileInfo.isPersisted()
&& fileInfo.getInAlluxioPercentage() != 100;
// Job configurations
private final String mPath;
private OptionalLong mBandwidth;
private boolean mUsePartialListing;
private boolean mVerificationEnabled;
// Job states
private final LinkedList<Block> mRetryBlocks = new LinkedList<>();
private final Map<String, String> mFailedFiles = new HashMap<>();
private final AtomicLong mProcessedFileCount = new AtomicLong();
private final AtomicLong mLoadedByteCount = new AtomicLong();
private final AtomicLong mTotalByteCount = new AtomicLong();
private final AtomicLong mTotalBlockCount = new AtomicLong();
private final AtomicLong mCurrentBlockCount = new AtomicLong();
private final AtomicLong mTotalFailureCount = new AtomicLong();
private final AtomicLong mCurrentFailureCount = new AtomicLong();
private Optional<AlluxioRuntimeException> mFailedReason = Optional.empty();
private final Iterable<FileInfo> mFileIterable;
private Optional<Iterator<FileInfo>> mFileIterator = Optional.empty();
private FileInfo mCurrentFile;
private Iterator<Long> mBlockIterator = Collections.emptyIterator();
/**
* Constructor.
* @param path file path
* @param user user for authentication
* @param bandwidth bandwidth
* @param fileIterator file iterator
*/
@VisibleForTesting
public LoadJob(String path, String user, OptionalLong bandwidth,
FileIterable fileIterator) {
this(path, Optional.of(user), UUID.randomUUID().toString(), bandwidth, false, false,
fileIterator);
}
/**
* Constructor.
*
* @param path file path
* @param user user for authentication
* @param jobId job identifier
* @param bandwidth bandwidth
* @param usePartialListing whether to use partial listing
* @param verificationEnabled whether to verify the job after it is loaded
* @param fileIterable file iterable
*/
public LoadJob(
String path,
Optional<String> user, String jobId, OptionalLong bandwidth,
boolean usePartialListing,
boolean verificationEnabled, Iterable<FileInfo> fileIterable) {
super(user, jobId);
mPath = requireNonNull(path, "path is null");
Preconditions.checkArgument(
!bandwidth.isPresent() || bandwidth.getAsLong() > 0,
format("bandwidth should be greater than 0 if provided, get %s", bandwidth));
mBandwidth = bandwidth;
mUsePartialListing = usePartialListing;
mVerificationEnabled = verificationEnabled;
mFileIterable = fileIterable;
}
/**
* Get load file path.
* @return file path
*/
public String getPath() {
return mPath;
}
@Override
public JobDescription getDescription() {
return JobDescription.newBuilder().setPath(mPath).setType(TYPE).build();
}
/**
* Get bandwidth.
* @return the allocated bandwidth
*/
public OptionalLong getBandwidth() {
return mBandwidth;
}
/**
* Update bandwidth.
* @param bandwidth new bandwidth
*/
public void updateBandwidth(OptionalLong bandwidth) {
mBandwidth = bandwidth;
}
/**
* Is verification enabled.
* @return whether verification is enabled
*/
public boolean isVerificationEnabled() {
return mVerificationEnabled;
}
/**
* Enable verification.
* @param enableVerification whether to enable verification
*/
public void setVerificationEnabled(boolean enableVerification) {
mVerificationEnabled = enableVerification;
}
/**
* Set load state to FAILED with given reason.
* @param reason failure exception
*/
@Override
public void failJob(AlluxioRuntimeException reason) {
setJobState(JobState.FAILED, true);
mFailedReason = Optional.of(reason);
JOB_LOAD_FAIL.inc();
}
@Override
public void setJobSuccess() {
setJobState(JobState.SUCCEEDED, true);
JOB_LOAD_SUCCESS.inc();
}
/**
* Add bytes to total loaded bytes.
* @param bytes bytes to be added to total
*/
@VisibleForTesting
public void addLoadedBytes(long bytes) {
mLoadedByteCount.addAndGet(bytes);
}
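/**
* Renders a progress report in the requested format. For illustration, a verbose text report has
* roughly the following shape (all values below are made-up examples):
* <pre>
*     Settings:    bandwidth: unlimited    verify: false
*     Job State: SUCCEEDED
*     Files Processed: 1024
*     Bytes Loaded: 128.00MB out of 256.00MB
*     Throughput: 64.00MB/s
*     Block load failure rate: 0.00%
*     Files Failed: 0
* </pre>
* @param format report format
* @param verbose whether to include failure details
* @return the formatted report
*/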
@Override
public String getProgress(JobProgressReportFormat format, boolean verbose) {
return (new LoadProgressReport(this, verbose)).getReport(format);
}
/**
* Get the processed block count in the current loading pass.
* @return current block count
*/
public long getCurrentBlockCount() {
return mCurrentBlockCount.get();
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
LoadJob that = (LoadJob) o;
return Objects.equal(getDescription(), that.getDescription());
}
@Override
public int hashCode() {
return Objects.hashCode(getDescription());
}
@Override
public boolean isHealthy() {
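// The job stays healthy unless it has already failed, or the failures in the current pass
// exceed both the absolute count threshold and the failure-ratio threshold.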
long currentFailureCount = mCurrentFailureCount.get();
return mState != JobState.FAILED
&& (currentFailureCount <= FAILURE_COUNT_THRESHOLD
|| (double) currentFailureCount / mCurrentBlockCount.get() <= FAILURE_RATIO_THRESHOLD);
}
@Override
public boolean isCurrentPassDone() {
return mFileIterator.isPresent() && !mFileIterator.get().hasNext() && !mBlockIterator.hasNext()
&& mRetryBlocks.isEmpty();
}
@Override
public void initiateVerification() {
Preconditions.checkState(isCurrentPassDone(), "Previous pass is not finished");
mFileIterator = Optional.empty();
mTotalBlockCount.addAndGet(mCurrentBlockCount.get());
mTotalFailureCount.addAndGet(mCurrentFailureCount.get());
mCurrentBlockCount.set(0);
mCurrentFailureCount.set(0);
mState = JobState.VERIFYING;
}
/**
* Get the next load tasks.
*
* @param workers list of available workers to schedule task on
* @return the next tasks to run; an empty list if there is nothing to run
*/
@Override
public List<LoadTask> getNextTasks(Set<WorkerInfo> workers) {
List<LoadTask> tasks = new ArrayList<>();
List<Block> blocks = getNextBatchBlocks(BATCH_SIZE);
if (blocks.isEmpty()) {
return Collections.unmodifiableList(tasks);
}
LoadTask task = new LoadTask(blocks);
tasks.add(task);
return Collections.unmodifiableList(tasks);
}
@Override
public void onTaskSubmitFailure(Task<?> task) {
// NOOP
}
/**
* Get next batch of blocks.
* @param count number of blocks
* @return list of blocks
*/
@VisibleForTesting
public List<Block> getNextBatchBlocks(int count) {
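// Lazily initialize the file iterator on the first call, then emit up to `count` blocks per
// batch, draining the retry queue first when it is close to capacity or when no new blocks
// remain.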
if (!mFileIterator.isPresent()) {
mFileIterator = Optional.of(mFileIterable.iterator());
if (!mFileIterator
.get()
.hasNext()) {
return ImmutableList.of();
}
mCurrentFile = mFileIterator.get().next();
if (!mFailedFiles.containsKey(mCurrentFile.getPath())) {
mProcessedFileCount.incrementAndGet();
}
mBlockIterator = mCurrentFile.getBlockIds().listIterator();
}
ImmutableList.Builder<Block> batchBuilder = ImmutableList.builder();
int i = 0;
// Drain the retry queue when too many failed blocks have accumulated, or once there are no new blocks left to load
if (mRetryBlocks.size() > RETRY_THRESHOLD
|| (!mFileIterator.get().hasNext() && !mBlockIterator.hasNext())) {
while (i < count && !mRetryBlocks.isEmpty()) {
batchBuilder.add(requireNonNull(mRetryBlocks.removeFirst()));
i++;
}
}
for (; i < count; i++) {
if (!mBlockIterator.hasNext()) {
if (!mFileIterator.get().hasNext()) {
return batchBuilder.build();
}
mCurrentFile = mFileIterator.get().next();
if (!mFailedFiles.containsKey(mCurrentFile.getPath())) {
mProcessedFileCount.incrementAndGet();
}
mBlockIterator = mCurrentFile.getBlockIds().listIterator();
}
long blockId = mBlockIterator.next();
BlockInfo blockInfo = mCurrentFile.getFileBlockInfo(blockId).getBlockInfo();
if (blockInfo.getLocations().isEmpty()) {
batchBuilder.add(buildBlock(mCurrentFile, blockId));
mCurrentBlockCount.incrementAndGet();
// This total can be inaccurate once verification is initiated or when un-retryable blocks are retried
mTotalByteCount.addAndGet(blockInfo.getLength());
}
}
return batchBuilder.build();
}
/**
* Add a block to retry later.
* @param block the block that failed to load thus needing retry
* @return whether the block is successfully added
*/
@VisibleForTesting
public boolean addBlockToRetry(Block block) {
if (mRetryBlocks.size() >= RETRY_BLOCK_CAPACITY) {
return false;
}
LOG.debug("Retry block {}", block);
mRetryBlocks.add(block);
mCurrentFailureCount.incrementAndGet();
JOB_LOAD_BLOCK_FAIL.inc();
return true;
}
/**
* Add a block to failure summary.
*
* @param block the block that failed to load and cannot be retried
* @param message failure message
* @param code status code for exception
*/
@VisibleForTesting
public void addBlockFailure(Block block, String message, int code) {
// When multiple blocks of the same file fail to load, the reason for each individual failure
// is rarely important to the user, so only one failure message is recorded per file.
mFailedFiles.put(block.getUfsPath(),
format("Status code: %s, message: %s", code, message));
mCurrentFailureCount.incrementAndGet();
JOB_LOAD_BLOCK_FAIL.inc();
}
private static Block buildBlock(FileInfo fileInfo, long blockId) {
return Block.newBuilder().setBlockId(blockId)
.setLength(fileInfo.getFileBlockInfo(blockId).getBlockInfo().getLength())
.setUfsPath(fileInfo.getUfsPath())
.setMountId(fileInfo.getMountId())
.setOffsetInFile(fileInfo.getFileBlockInfo(blockId).getOffset())
.build();
}
@Override
public String toString() {
return MoreObjects.toStringHelper(this)
.add("Path", mPath)
.add("User", mUser)
.add("Bandwidth", mBandwidth)
.add("UsePartialListing", mUsePartialListing)
.add("VerificationEnabled", mVerificationEnabled)
.add("RetryBlocks", mRetryBlocks)
.add("FailedFiles", mFailedFiles)
.add("StartTime", mStartTime)
.add("ProcessedFileCount", mProcessedFileCount)
.add("LoadedByteCount", mLoadedByteCount)
.add("TotalBlockCount", mTotalBlockCount)
.add("CurrentBlockCount", mCurrentBlockCount)
.add("TotalFailureCount", mTotalFailureCount)
.add("CurrentFailureCount", mCurrentFailureCount)
.add("State", mState)
.add("BatchSize", BATCH_SIZE)
.add("FailedReason", mFailedReason)
.add("FileIterator", mFileIterator)
.add("CurrentFile", mCurrentFile)
.add("BlockIterator", mBlockIterator)
.add("EndTime", mEndTime)
.toString();
}
@Override
public Journal.JournalEntry toJournalEntry() {
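// Journal only what is needed to reconstruct the job on restart; optional fields
// (user, bandwidth, end time) are written only when present.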
alluxio.proto.journal.Job.LoadJobEntry.Builder jobEntry = alluxio.proto.journal.Job.LoadJobEntry
.newBuilder()
.setLoadPath(mPath)
.setState(JobState.toProto(mState))
.setPartialListing(mUsePartialListing)
.setVerify(mVerificationEnabled)
.setJobId(mJobId);
mUser.ifPresent(jobEntry::setUser);
mBandwidth.ifPresent(jobEntry::setBandwidth);
mEndTime.ifPresent(jobEntry::setEndTime);
return Journal.JournalEntry
.newBuilder()
.setLoadJob(jobEntry.build())
.build();
}
/**
* Get duration in seconds.
* @return job duration in seconds
*/
@VisibleForTesting
public long getDurationInSec() {
return (mEndTime.orElse(System.currentTimeMillis()) - mStartTime) / 1000;
}
@Override
public boolean processResponse(LoadTask loadTask) {
try {
long totalBytes = loadTask.getBlocks().stream()
.map(Block::getLength)
.reduce(Long::sum)
.orElse(0L);
LoadResponse response = loadTask.getResponseFuture().get();
if (response.getStatus() != TaskStatus.SUCCESS) {
LOG.debug(format("Get failure from worker: %s", response.getBlockStatusList()));
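// Exclude failed blocks from the loaded-byte total; retry each failed block when possible,
// otherwise record it as a permanent failure for its file.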
for (BlockStatus status : response.getBlockStatusList()) {
totalBytes -= status.getBlock().getLength();
if (!isHealthy() || !status.getRetryable() || !addBlockToRetry(
status.getBlock())) {
addBlockFailure(status.getBlock(), status.getMessage(), status.getCode());
}
}
}
addLoadedBytes(totalBytes);
JOB_LOAD_BLOCK_COUNT.inc(
loadTask.getBlocks().size() - response.getBlockStatusCount());
JOB_LOAD_BLOCK_SIZE.inc(totalBytes);
JOB_LOAD_RATE.mark(totalBytes);
return response.getStatus() != TaskStatus.FAILURE;
}
catch (ExecutionException e) {
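// The whole task failed (e.g. the worker RPC threw); retry all of its blocks while the job is
// still healthy, otherwise record them as failures.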
LOG.warn("exception when trying to get load response.", e.getCause());
for (Block block : loadTask.getBlocks()) {
if (isHealthy()) {
addBlockToRetry(block);
}
else {
AlluxioRuntimeException exception = AlluxioRuntimeException.from(e.getCause());
addBlockFailure(block, exception.getMessage(), exception.getStatus().getCode()
.value());
}
}
return false;
}
catch (CancellationException e) {
LOG.warn("Task get canceled and will retry.", e);
loadTask.getBlocks().forEach(this::addBlockToRetry);
return true;
}
catch (InterruptedException e) {
loadTask.getBlocks().forEach(this::addBlockToRetry);
Thread.currentThread().interrupt();
// We don't count InterruptedException as task failure
return true;
}
}
@Override
public boolean hasFailure() {
return !mFailedFiles.isEmpty();
}
/**
* Whether a verification pass is needed.
*
* @return whether verification is enabled and there are blocks to verify
*/
@Override
public boolean needVerification() {
return mVerificationEnabled && mCurrentBlockCount.get() > 0;
}
/**
* Loads blocks in a UFS through an Alluxio worker.
*/
public class LoadTask extends Task<LoadResponse> {
/**
* @return blocks to load
*/
public List<Block> getBlocks() {
return mBlocks;
}
private final List<Block> mBlocks;
/**
* Creates a new instance of {@link LoadTask}.
*
* @param blocks blocks to load
*/
public LoadTask(List<Block> blocks) {
super(LoadJob.this, LoadJob.this.mTaskIdGenerator.incrementAndGet());
mBlocks = blocks;
}
@Override
public ListenableFuture<LoadResponse> run(BlockWorkerClient workerClient) {
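// Build a LoadRequest carrying the blocks to load plus UFS read options: the job id is used
// as the tag, and the bandwidth limit and user are set only when present.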
LoadRequest.Builder request1 = LoadRequest
.newBuilder()
.addAllBlocks(mBlocks);
UfsReadOptions.Builder options = UfsReadOptions
.newBuilder()
.setTag(mJobId)
.setPositionShort(false);
if (mBandwidth.isPresent()) {
options.setBandwidth(mBandwidth.getAsLong());
}
mUser.ifPresent(options::setUser);
LoadRequest request = request1
.setOptions(options.build())
.build();
return workerClient.load(request);
}
}
private static class LoadProgressReport {
private final boolean mVerbose;
private final JobState mJobState;
private final Long mBandwidth;
private final boolean mVerificationEnabled;
private final long mProcessedFileCount;
private final long mLoadedByteCount;
private final Long mTotalByteCount;
private final Long mThroughput;
private final double mFailurePercentage;
private final AlluxioRuntimeException mFailureReason;
private final long mFailedFileCount;
private final Map<String, String> mFailedFilesWithReasons;
public LoadProgressReport(LoadJob job, boolean verbose)
{
mVerbose = verbose;
mJobState = job.mState;
mBandwidth = job.mBandwidth.isPresent() ? job.mBandwidth.getAsLong() : null;
mVerificationEnabled = job.mVerificationEnabled;
mProcessedFileCount = job.mProcessedFileCount.get();
mLoadedByteCount = job.mLoadedByteCount.get();
if (!job.mUsePartialListing && job.mFileIterator.isPresent()) {
mTotalByteCount = job.mTotalByteCount.get();
}
else {
mTotalByteCount = null;
}
long duration = job.getDurationInSec();
if (duration > 0) {
mThroughput = job.mLoadedByteCount.get() / duration;
}
else {
mThroughput = null;
}
long blockCount = job.mTotalBlockCount.get() + job.mCurrentBlockCount.get();
if (blockCount > 0) {
mFailurePercentage =
((double) (job.mTotalFailureCount.get() + job.mCurrentFailureCount.get()) / blockCount)
* 100;
}
else {
mFailurePercentage = 0;
}
mFailureReason = job.mFailedReason.orElse(null);
mFailedFileCount = job.mFailedFiles.size();
if (verbose && mFailedFileCount > 0) {
mFailedFilesWithReasons = job.mFailedFiles;
} else {
mFailedFilesWithReasons = Collections.emptyMap();
}
}
public String getReport(JobProgressReportFormat format)
{
switch (format) {
case TEXT:
return getTextReport();
case JSON:
return getJsonReport();
default:
throw new InvalidArgumentRuntimeException(
format("Unknown load progress report format: %s", format));
}
}
private String getTextReport() {
StringBuilder progress = new StringBuilder();
progress.append(
format("\tSettings:\tbandwidth: %s\tverify: %s%n",
mBandwidth == null ? "unlimited" : mBandwidth,
mVerificationEnabled));
progress.append(format("\tJob State: %s%s%n", mJobState,
mFailureReason == null
? "" : format(
" (%s: %s)",
mFailureReason.getClass().getName(),
mFailureReason.getMessage())));
if (mVerbose && mFailureReason != null) {
for (StackTraceElement stack : mFailureReason.getStackTrace()) {
progress.append(format("\t\t%s%n", stack.toString()));
}
}
progress.append(format("\tFiles Processed: %d%n", mProcessedFileCount));
progress.append(format("\tBytes Loaded: %s%s%n",
FormatUtils.getSizeFromBytes(mLoadedByteCount),
mTotalByteCount == null
? "" : format(" out of %s", FormatUtils.getSizeFromBytes(mTotalByteCount))));
if (mThroughput != null) {
progress.append(format("\tThroughput: %s/s%n",
FormatUtils.getSizeFromBytes(mThroughput)));
}
progress.append(format("\tBlock load failure rate: %.2f%%%n", mFailurePercentage));
progress.append(format("\tFiles Failed: %s%n", mFailedFileCount));
if (mVerbose && !mFailedFilesWithReasons.isEmpty()) {
mFailedFilesWithReasons.forEach((fileName, reason) ->
progress.append(format("\t\t%s: %s%n", fileName, reason)));
}
return progress.toString();
}
private String getJsonReport() {
try {
return new ObjectMapper()
.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY)
.setSerializationInclusion(JsonInclude.Include.NON_NULL)
.writeValueAsString(this);
} catch (JsonProcessingException e) {
throw new InternalRuntimeException("Failed to convert LoadProgressReport to JSON", e);
}
}
}
// metrics
public static final Counter JOB_LOAD_SUCCESS =
MetricsSystem.counter(MetricKey.MASTER_JOB_LOAD_SUCCESS.getName());
public static final Counter JOB_LOAD_FAIL =
MetricsSystem.counter(MetricKey.MASTER_JOB_LOAD_FAIL.getName());
public static final Counter JOB_LOAD_BLOCK_COUNT =
MetricsSystem.counter(MetricKey.MASTER_JOB_LOAD_BLOCK_COUNT.getName());
public static final Counter JOB_LOAD_BLOCK_FAIL =
MetricsSystem.counter(MetricKey.MASTER_JOB_LOAD_BLOCK_FAIL.getName());
public static final Counter JOB_LOAD_BLOCK_SIZE =
MetricsSystem.counter(MetricKey.MASTER_JOB_LOAD_BLOCK_SIZE.getName());
public static final Meter JOB_LOAD_RATE =
MetricsSystem.meter(MetricKey.MASTER_JOB_LOAD_RATE.getName());
}