org.apache.hadoop.mapred.LocatedFileStatusFetcher Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of hadoop-apache Show documentation
Show all versions of hadoop-apache Show documentation
Shaded version of Apache Hadoop for Presto
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import io.prestosql.hadoop.$internal.com.google.common.collect.Iterables;
import io.prestosql.hadoop.$internal.com.google.common.util.concurrent.FutureCallback;
import io.prestosql.hadoop.$internal.com.google.common.util.concurrent.Futures;
import io.prestosql.hadoop.$internal.com.google.common.util.concurrent.ListenableFuture;
import io.prestosql.hadoop.$internal.com.google.common.util.concurrent.ListeningExecutorService;
import io.prestosql.hadoop.$internal.com.google.common.util.concurrent.MoreExecutors;
import io.prestosql.hadoop.$internal.com.google.common.util.concurrent.ThreadFactoryBuilder;
/**
* Utility class to fetch block locations for specified Input paths using a
* configured number of threads.
*/
@Private
public class LocatedFileStatusFetcher {
private final Path[] inputDirs;
private final PathFilter inputFilter;
private final Configuration conf;
private final boolean recursive;
private final boolean newApi;
private final ExecutorService rawExec;
private final ListeningExecutorService exec;
private final BlockingQueue> resultQueue;
private final List invalidInputErrors = new LinkedList();
private final ProcessInitialInputPathCallback processInitialInputPathCallback =
new ProcessInitialInputPathCallback();
private final ProcessInputDirCallback processInputDirCallback =
new ProcessInputDirCallback();
private final AtomicInteger runningTasks = new AtomicInteger(0);
private final ReentrantLock lock = new ReentrantLock();
private final Condition condition = lock.newCondition();
private volatile Throwable unknownError;
/**
 * Creates a fetcher together with the thread pool it uses to list the
 * input paths. No work is started until {@link #getFileStatuses()} is
 * called.
 *
 * @param conf configuration for the job; also supplies the pool size via
 *          {@link FileInputFormat#LIST_STATUS_NUM_THREADS}
 * @param dirs the initial list of paths
 * @param recursive whether to traverse the paths recursively
 * @param inputFilter inputFilter to apply to the resulting paths
 * @param newApi whether using the mapred or mapreduce API; decides which
 *          InvalidInputException class is thrown for invalid inputs
 * @throws InterruptedException declared for callers
 * @throws IOException declared for callers
 */
public LocatedFileStatusFetcher(Configuration conf, Path[] dirs,
    boolean recursive, PathFilter inputFilter, boolean newApi) throws InterruptedException,
    IOException {
  int numThreads = conf.getInt(FileInputFormat.LIST_STATUS_NUM_THREADS,
      FileInputFormat.DEFAULT_LIST_STATUS_NUM_THREADS);
  // Daemon threads so that an abandoned fetcher cannot keep the JVM alive.
  rawExec = Executors.newFixedThreadPool(
      numThreads,
      new ThreadFactoryBuilder().setDaemon(true)
          .setNameFormat("GetFileInfo #%d").build());
  exec = MoreExecutors.listeningDecorator(rawExec);
  resultQueue = new LinkedBlockingQueue<List<FileStatus>>();
  this.conf = conf;
  this.inputDirs = dirs;
  this.recursive = recursive;
  this.inputFilter = inputFilter;
  this.newApi = newApi;
}
/**
 * Starts the listing tasks, blocks until all of them have completed or a
 * fatal error has been recorded, and returns the collected file statuses.
 *
 * @return fetched file statuses, as the concatenation of the per-task
 *         result lists
 * @throws InterruptedException if the calling thread is interrupted while
 *           waiting, or if the recorded fatal error is an
 *           InterruptedException
 * @throws IOException if the recorded fatal error is an IOException or any
 *           other checked throwable (wrapped), or if an input path was
 *           reported as invalid
 */
public Iterable<FileStatus> getFileStatuses() throws InterruptedException,
    IOException {
  try {
    // Increment to make sure a race between the first thread completing and
    // the rest being scheduled does not lead to a termination.
    runningTasks.incrementAndGet();
    for (Path p : inputDirs) {
      runningTasks.incrementAndGet();
      ListenableFuture<ProcessInitialInputPathCallable.Result> future = exec
          .submit(new ProcessInitialInputPathCallable(p, conf, inputFilter));
      Futures.addCallback(future, processInitialInputPathCallback);
    }
    runningTasks.decrementAndGet();

    lock.lock();
    try {
      while (runningTasks.get() != 0 && unknownError == null) {
        condition.await();
      }
    } finally {
      lock.unlock();
    }
  } finally {
    // Always release the pool, including when await() is interrupted or a
    // submission fails; otherwise the worker threads would be leaked.
    this.exec.shutdownNow();
  }

  // Fatal errors take precedence and are rethrown with their original type
  // wherever the method signature allows it.
  final Throwable fatal = this.unknownError;
  if (fatal != null) {
    if (fatal instanceof Error) {
      throw (Error) fatal;
    } else if (fatal instanceof RuntimeException) {
      throw (RuntimeException) fatal;
    } else if (fatal instanceof IOException) {
      throw (IOException) fatal;
    } else if (fatal instanceof InterruptedException) {
      throw (InterruptedException) fatal;
    } else {
      throw new IOException(fatal);
    }
  }

  // Invalid inputs are reported together, using the exception class of the
  // API flavour the caller asked for.
  if (!this.invalidInputErrors.isEmpty()) {
    if (this.newApi) {
      throw new org.apache.hadoop.mapreduce.lib.input.InvalidInputException(
          invalidInputErrors);
    } else {
      throw new InvalidInputException(invalidInputErrors);
    }
  }
  return Iterables.concat(resultQueue);
}
/**
 * Collects errors about misconfigured inputs so that they can be reported
 * together once all tasks have finished. Errors raised while actually
 * reading file info are not collected here; they go through
 * {@code registerError} instead.
 *
 * @param errors the invalid-input errors produced for one input path
 */
private void registerInvalidInputError(List<IOException> errors) {
  // Callbacks may run on different pool threads, so appends are serialized
  // on this instance's monitor.
  synchronized (this) {
    this.invalidInputErrors.addAll(errors);
  }
}
/**
 * Registers a fatal error - for example an IOException while accessing a
 * file or a full execution queue - and wakes the thread waiting on
 * {@code condition}. Only the first error is kept; later ones are ignored.
 *
 * @param t the failure reported by a task or raised inside a callback
 */
private void registerError(Throwable t) {
  lock.lock();
  try {
    // Record only when nothing has been recorded yet. The previous check
    // was inverted (!= null), so no error was ever stored or signalled and
    // a failed task left the waiting caller blocked forever.
    if (unknownError == null) {
      unknownError = t;
      condition.signal();
    }
  } finally {
    lock.unlock();
  }
}
/**
 * Marks one task as finished and, when it was the last one outstanding,
 * signals {@code condition} so the waiting thread can re-check its state.
 */
private void decrementRunningAndCheckCompletion() {
  lock.lock();
  try {
    final int stillRunning = runningTasks.decrementAndGet();
    final boolean allDone = (stillRunning == 0);
    if (allDone) {
      condition.signal();
    }
  } finally {
    lock.unlock();
  }
}
/**
* Retrieves block locations for the given @link {@link FileStatus}, and adds
* additional paths to the process queue if required.
*/
private static class ProcessInputDirCallable implements
Callable {
private final FileSystem fs;
private final FileStatus fileStatus;
private final boolean recursive;
private final PathFilter inputFilter;
ProcessInputDirCallable(FileSystem fs, FileStatus fileStatus,
boolean recursive, PathFilter inputFilter) {
this.fs = fs;
this.fileStatus = fileStatus;
this.recursive = recursive;
this.inputFilter = inputFilter;
}
@Override
public Result call() throws Exception {
Result result = new Result();
result.fs = fs;
if (fileStatus.isDirectory()) {
RemoteIterator iter = fs
.listLocatedStatus(fileStatus.getPath());
while (iter.hasNext()) {
LocatedFileStatus stat = iter.next();
if (inputFilter.accept(stat.getPath())) {
if (recursive && stat.isDirectory()) {
result.dirsNeedingRecursiveCalls.add(stat);
} else {
result.locatedFileStatuses.add(stat);
}
}
}
} else {
result.locatedFileStatuses.add(fileStatus);
}
return result;
}
private static class Result {
private List locatedFileStatuses = new LinkedList();
private List dirsNeedingRecursiveCalls = new LinkedList();
private FileSystem fs;
}
}
/**
 * The callback handler to handle results generated by
 * {@link ProcessInputDirCallable}. This populates the final result set and
 * schedules a new task for every sub-directory that still needs listing.
 */
private class ProcessInputDirCallback implements
    FutureCallback<ProcessInputDirCallable.Result> {

  /**
   * Publishes the statuses found by one task, submits follow-up tasks for
   * its sub-directories and then marks the task as finished.
   */
  @Override
  public void onSuccess(ProcessInputDirCallable.Result result) {
    try {
      if (!result.locatedFileStatuses.isEmpty()) {
        resultQueue.add(result.locatedFileStatuses);
      }
      for (FileStatus fileStatus : result.dirsNeedingRecursiveCalls) {
        // Count the child before this task is marked done below, so the
        // running total cannot reach zero while work is still pending.
        runningTasks.incrementAndGet();
        ListenableFuture<ProcessInputDirCallable.Result> future = exec
            .submit(new ProcessInputDirCallable(result.fs, fileStatus,
                recursive, inputFilter));
        Futures.addCallback(future, processInputDirCallback);
      }
      decrementRunningAndCheckCompletion();
    } catch (Throwable t) { // Error within the callback itself.
      registerError(t);
    }
  }

  @Override
  public void onFailure(Throwable t) {
    // Any generated exceptions. Leads to immediate termination.
    registerError(t);
  }
}
/**
* Processes an initial Input Path pattern through the globber and PathFilter
* to generate a list of files which need further processing.
*/
private static class ProcessInitialInputPathCallable implements
Callable {
private final Path path;
private final Configuration conf;
private final PathFilter inputFilter;
public ProcessInitialInputPathCallable(Path path, Configuration conf,
PathFilter pathFilter) {
this.path = path;
this.conf = conf;
this.inputFilter = pathFilter;
}
@Override
public Result call() throws Exception {
Result result = new Result();
FileSystem fs = path.getFileSystem(conf);
result.fs = fs;
FileStatus[] matches = fs.globStatus(path, inputFilter);
if (matches == null) {
result.addError(new IOException("Input path does not exist: " + path));
} else if (matches.length == 0) {
result.addError(new IOException("Input Pattern " + path
+ " matches 0 files"));
} else {
result.matchedFileStatuses = matches;
}
return result;
}
private static class Result {
private List errors;
private FileStatus[] matchedFileStatuses;
private FileSystem fs;
void addError(IOException ioe) {
if (errors == null) {
errors = new LinkedList();
}
errors.add(ioe);
}
}
}
/**
* The callback handler to handle results generated by
* {@link ProcessInitialInputPathCallable}
*
*/
private class ProcessInitialInputPathCallback implements
FutureCallback {
@Override
public void onSuccess(ProcessInitialInputPathCallable.Result result) {
try {
if (result.errors != null) {
registerInvalidInputError(result.errors);
}
if (result.matchedFileStatuses != null) {
for (FileStatus matched : result.matchedFileStatuses) {
runningTasks.incrementAndGet();
ListenableFuture future = exec
.submit(new ProcessInputDirCallable(result.fs, matched,
recursive, inputFilter));
Futures.addCallback(future, processInputDirCallback);
}
}
decrementRunningAndCheckCompletion();
} catch (Throwable t) { // Exception within the callback
registerError(t);
}
}
@Override
public void onFailure(Throwable t) {
// Any generated exceptions. Leads to immediate termination.
registerError(t);
}
}
}