All downloads are FREE. The search and download functionality uses the official Maven repository.

org.apache.hadoop.mapred.LocatedFileStatusFetcher Maven / Gradle / Ivy

There is a newer version: 3.2.0-9
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import io.prestosql.hadoop.$internal.com.google.common.collect.Iterables;
import io.prestosql.hadoop.$internal.com.google.common.util.concurrent.FutureCallback;
import io.prestosql.hadoop.$internal.com.google.common.util.concurrent.Futures;
import io.prestosql.hadoop.$internal.com.google.common.util.concurrent.ListenableFuture;
import io.prestosql.hadoop.$internal.com.google.common.util.concurrent.ListeningExecutorService;
import io.prestosql.hadoop.$internal.com.google.common.util.concurrent.MoreExecutors;
import io.prestosql.hadoop.$internal.com.google.common.util.concurrent.ThreadFactoryBuilder;

/**
 * Utility class to fetch block locations for specified Input paths using a
 * configured number of threads.
 */
@Private
public class LocatedFileStatusFetcher {

  private final Path[] inputDirs;
  private final PathFilter inputFilter;
  private final Configuration conf;
  private final boolean recursive;
  private final boolean newApi;
  
  private final ExecutorService rawExec;
  private final ListeningExecutorService exec;
  private final BlockingQueue> resultQueue;
  private final List invalidInputErrors = new LinkedList();

  private final ProcessInitialInputPathCallback processInitialInputPathCallback = 
      new ProcessInitialInputPathCallback();
  private final ProcessInputDirCallback processInputDirCallback = 
      new ProcessInputDirCallback();

  private final AtomicInteger runningTasks = new AtomicInteger(0);

  private final ReentrantLock lock = new ReentrantLock();
  private final Condition condition = lock.newCondition();

  private volatile Throwable unknownError;

  /**
   * @param conf configuration for the job
   * @param dirs the initial list of paths
   * @param recursive whether to traverse the patchs recursively
   * @param inputFilter inputFilter to apply to the resulting paths
   * @param newApi whether using the mapred or mapreduce API
   * @throws InterruptedException
   * @throws IOException
   */
  public LocatedFileStatusFetcher(Configuration conf, Path[] dirs,
      boolean recursive, PathFilter inputFilter, boolean newApi) throws InterruptedException,
      IOException {
    int numThreads = conf.getInt(FileInputFormat.LIST_STATUS_NUM_THREADS,
        FileInputFormat.DEFAULT_LIST_STATUS_NUM_THREADS);
    rawExec = Executors.newFixedThreadPool(
        numThreads,
        new ThreadFactoryBuilder().setDaemon(true)
            .setNameFormat("GetFileInfo #%d").build());
    exec = MoreExecutors.listeningDecorator(rawExec);
    resultQueue = new LinkedBlockingQueue>();
    this.conf = conf;
    this.inputDirs = dirs;
    this.recursive = recursive;
    this.inputFilter = inputFilter;
    this.newApi = newApi;
  }

  /**
   * Start executing and return FileStatuses based on the parameters specified
   * @return fetched file statuses
   * @throws InterruptedException
   * @throws IOException
   */
  public Iterable getFileStatuses() throws InterruptedException,
      IOException {
    // Increment to make sure a race between the first thread completing and the
    // rest being scheduled does not lead to a termination.
    runningTasks.incrementAndGet();
    for (Path p : inputDirs) {
      runningTasks.incrementAndGet();
      ListenableFuture future = exec
          .submit(new ProcessInitialInputPathCallable(p, conf, inputFilter));
      Futures.addCallback(future, processInitialInputPathCallback);
    }

    runningTasks.decrementAndGet();

    lock.lock();
    try {
      while (runningTasks.get() != 0 && unknownError == null) {
        condition.await();
      }
    } finally {
      lock.unlock();
    }
    this.exec.shutdownNow();
    if (this.unknownError != null) {
      if (this.unknownError instanceof Error) {
        throw (Error) this.unknownError;
      } else if (this.unknownError instanceof RuntimeException) {
        throw (RuntimeException) this.unknownError;
      } else if (this.unknownError instanceof IOException) {
        throw (IOException) this.unknownError;
      } else if (this.unknownError instanceof InterruptedException) {
        throw (InterruptedException) this.unknownError;
      } else {
        throw new IOException(this.unknownError);
      }
    }
    if (this.invalidInputErrors.size() != 0) {
      if (this.newApi) {
        throw new org.apache.hadoop.mapreduce.lib.input.InvalidInputException(
            invalidInputErrors);
      } else {
        throw new InvalidInputException(invalidInputErrors);
      }
    }
    return Iterables.concat(resultQueue);
  }

  /**
   * Collect misconfigured Input errors. Errors while actually reading file info
   * are reported immediately
   */
  private void registerInvalidInputError(List errors) {
    synchronized (this) {
      this.invalidInputErrors.addAll(errors);
    }
  }

  /**
   * Register fatal errors - example an IOException while accessing a file or a
   * full exection queue
   */
  private void registerError(Throwable t) {
    lock.lock();
    try {
      if (unknownError != null) {
        unknownError = t;
        condition.signal();
      }

    } finally {
      lock.unlock();
    }
  }

  private void decrementRunningAndCheckCompletion() {
    lock.lock();
    try {
      if (runningTasks.decrementAndGet() == 0) {
        condition.signal();
      }
    } finally {
      lock.unlock();
    }
  }
  
  /**
   * Retrieves block locations for the given @link {@link FileStatus}, and adds
   * additional paths to the process queue if required.
   */
  private static class ProcessInputDirCallable implements
      Callable {

    private final FileSystem fs;
    private final FileStatus fileStatus;
    private final boolean recursive;
    private final PathFilter inputFilter;

    ProcessInputDirCallable(FileSystem fs, FileStatus fileStatus,
        boolean recursive, PathFilter inputFilter) {
      this.fs = fs;
      this.fileStatus = fileStatus;
      this.recursive = recursive;
      this.inputFilter = inputFilter;
    }

    @Override
    public Result call() throws Exception {
      Result result = new Result();
      result.fs = fs;

      if (fileStatus.isDirectory()) {
        RemoteIterator iter = fs
            .listLocatedStatus(fileStatus.getPath());
        while (iter.hasNext()) {
          LocatedFileStatus stat = iter.next();
          if (inputFilter.accept(stat.getPath())) {
            if (recursive && stat.isDirectory()) {
              result.dirsNeedingRecursiveCalls.add(stat);
            } else {
              result.locatedFileStatuses.add(stat);
            }
          }
        }
      } else {
        result.locatedFileStatuses.add(fileStatus);
      }
      return result;
    }

    private static class Result {
      private List locatedFileStatuses = new LinkedList();
      private List dirsNeedingRecursiveCalls = new LinkedList();
      private FileSystem fs;
    }
  }

  /**
   * The callback handler to handle results generated by
   * {@link ProcessInputDirCallable}. This populates the final result set.
   * 
   */
  private class ProcessInputDirCallback implements
      FutureCallback {

    @Override
    public void onSuccess(ProcessInputDirCallable.Result result) {
      try {
        if (result.locatedFileStatuses.size() != 0) {
          resultQueue.add(result.locatedFileStatuses);
        }
        if (result.dirsNeedingRecursiveCalls.size() != 0) {
          for (FileStatus fileStatus : result.dirsNeedingRecursiveCalls) {
            runningTasks.incrementAndGet();
            ListenableFuture future = exec
                .submit(new ProcessInputDirCallable(result.fs, fileStatus,
                    recursive, inputFilter));
            Futures.addCallback(future, processInputDirCallback);
          }
        }
        decrementRunningAndCheckCompletion();
      } catch (Throwable t) { // Error within the callback itself.
        registerError(t);
      }
    }

    @Override
    public void onFailure(Throwable t) {
      // Any generated exceptions. Leads to immediate termination.
      registerError(t);
    }
  }

  
  /**
   * Processes an initial Input Path pattern through the globber and PathFilter
   * to generate a list of files which need further processing.
   */
  private static class ProcessInitialInputPathCallable implements
      Callable {

    private final Path path;
    private final Configuration conf;
    private final PathFilter inputFilter;

    public ProcessInitialInputPathCallable(Path path, Configuration conf,
        PathFilter pathFilter) {
      this.path = path;
      this.conf = conf;
      this.inputFilter = pathFilter;
    }

    @Override
    public Result call() throws Exception {
      Result result = new Result();
      FileSystem fs = path.getFileSystem(conf);
      result.fs = fs;
      FileStatus[] matches = fs.globStatus(path, inputFilter);
      if (matches == null) {
        result.addError(new IOException("Input path does not exist: " + path));
      } else if (matches.length == 0) {
        result.addError(new IOException("Input Pattern " + path
            + " matches 0 files"));
      } else {
        result.matchedFileStatuses = matches;
      }
      return result;
    }

    private static class Result {
      private List errors;
      private FileStatus[] matchedFileStatuses;
      private FileSystem fs;

      void addError(IOException ioe) {
        if (errors == null) {
          errors = new LinkedList();
        }
        errors.add(ioe);
      }
    }
  }

  /**
   * The callback handler to handle results generated by
   * {@link ProcessInitialInputPathCallable}
   * 
   */
  private class ProcessInitialInputPathCallback implements
      FutureCallback {

    @Override
    public void onSuccess(ProcessInitialInputPathCallable.Result result) {
      try {
        if (result.errors != null) {
          registerInvalidInputError(result.errors);
        }
        if (result.matchedFileStatuses != null) {
          for (FileStatus matched : result.matchedFileStatuses) {
            runningTasks.incrementAndGet();
            ListenableFuture future = exec
                .submit(new ProcessInputDirCallable(result.fs, matched,
                    recursive, inputFilter));
            Futures.addCallback(future, processInputDirCallback);
          }
        }
        decrementRunningAndCheckCompletion();
      } catch (Throwable t) { // Exception within the callback
        registerError(t);
      }
    }

    @Override
    public void onFailure(Throwable t) {
      // Any generated exceptions. Leads to immediate termination.
      registerError(t);
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy