org.apache.hadoop.mapred.LocatedFileStatusFetcher Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in org.apache.hadoop.shaded.com.liance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.StringJoiner;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.statistics.IOStatistics;
import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
import org.apache.hadoop.fs.statistics.IOStatisticsSource;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.thirdparty.com.google.common.collect.Iterables;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.FutureCallback;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.Futures;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ListenableFuture;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ListeningExecutorService;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.MoreExecutors;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.util.concurrent.HadoopExecutors;

import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics;
import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.snapshotIOStatistics;

/**
 * Utility class to fetch block locations for specified Input paths using a
 * configured number of threads.
 * The thread count is determined from the value of
 * "mapreduce.input.fileinputformat.list-status.num-threads" in the
 * configuration.
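 * <p>
 * A minimal usage sketch of constructing the fetcher and iterating the result;
 * the input path, filter and thread count below are illustrative values, not
 * defaults taken from this class:
 * <pre>{@code
 * Configuration conf = new Configuration();
 * conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, 4);
 * PathFilter hiddenFileFilter =
 *     p -> !p.getName().startsWith("_") && !p.getName().startsWith(".");
 * LocatedFileStatusFetcher fetcher = new LocatedFileStatusFetcher(
 *     conf, new Path[] {new Path("/data/input")}, true, hiddenFileFilter, true);
 * for (FileStatus status : fetcher.getFileStatuses()) {
 *   // each status is a file accepted by the filter, with block locations
 *   // where the filesystem could supply them
 * }
 * }</pre>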
 */
@Private
public class LocatedFileStatusFetcher implements IOStatisticsSource {

  public static final Logger LOG =
      LoggerFactory.getLogger(LocatedFileStatusFetcher.class.getName());
  private final Path[] inputDirs;
  private final PathFilter inputFilter;
  private final Configuration conf;
  private final boolean recursive;
  private final boolean newApi;
  
  private final ExecutorService rawExec;
  private final ListeningExecutorService exec;
  private final BlockingQueue<List<FileStatus>> resultQueue;
  private final List<IOException> invalidInputErrors = new LinkedList<>();

  private final ProcessInitialInputPathCallback processInitialInputPathCallback = 
      new ProcessInitialInputPathCallback();
  private final ProcessInputDirCallback processInputDirCallback = 
      new ProcessInputDirCallback();

  private final AtomicInteger runningTasks = new AtomicInteger(0);

  private final ReentrantLock lock = new ReentrantLock();
  private final Condition condition = lock.newCondition();

  private volatile Throwable unknownError;

  /**
   * Demand created IO Statistics: only if the filesystem
   * returns statistics does this fetch collect them.
   */
  private IOStatisticsSnapshot iostats;

  /**
   * Instantiate.
   * The newApi switch is only used to configure what exception is raised
   * on failure of {@link #getFileStatuses()}; it does not change the algorithm.
   * @param conf configuration for the job
   * @param dirs the initial list of paths
   * @param recursive whether to traverse the paths recursively
   * @param inputFilter inputFilter to apply to the resulting paths
   * @param newApi whether using the mapred or mapreduce API
   * @throws InterruptedException on interruption.
   * @throws IOException on an IO failure.
   */
  public LocatedFileStatusFetcher(Configuration conf, Path[] dirs,
      boolean recursive, PathFilter inputFilter, boolean newApi)
      throws InterruptedException, IOException {
    int numThreads = conf.getInt(FileInputFormat.LIST_STATUS_NUM_THREADS,
        FileInputFormat.DEFAULT_LIST_STATUS_NUM_THREADS);
    LOG.debug("Instantiated LocatedFileStatusFetcher with {} threads",
        numThreads);
    rawExec = HadoopExecutors.newFixedThreadPool(
        numThreads,
        new ThreadFactoryBuilder().setDaemon(true)
            .setNameFormat("GetFileInfo #%d").build());
    exec = MoreExecutors.listeningDecorator(rawExec);
    resultQueue = new LinkedBlockingQueue<>();
    this.conf = conf;
    this.inputDirs = dirs;
    this.recursive = recursive;
    this.inputFilter = inputFilter;
    this.newApi = newApi;
  }

  /**
   * Start executing and return FileStatuses based on the parameters specified.
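   * <p>
   * A sketch of handling the failure mode of the new API; the try/catch is
   * illustrative and {@code fetcher} is assumed to have been constructed with
   * {@code newApi = true}:
   * <pre>{@code
   * try {
   *   Iterable<FileStatus> statuses = fetcher.getFileStatuses();
   * } catch (org.apache.hadoop.mapreduce.lib.input.InvalidInputException e) {
   *   // at least one input path did not exist or matched no files
   * }
   * }</pre>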
   * @return fetched file statuses
   * @throws InterruptedException interruption waiting for results.
   * @throws IOException IO failure or other error.
   * @throws InvalidInputException on an invalid input and the old API
   * @throws org.apache.hadoop.mapreduce.lib.input.InvalidInputException on an
   *         invalid input and the new API.
   */
  public Iterable<FileStatus> getFileStatuses() throws InterruptedException,
      IOException {
    // Increment to make sure a race between the first thread completing and the
    // rest being scheduled does not lead to a termination.
    runningTasks.incrementAndGet();
    for (Path p : inputDirs) {
      LOG.debug("Queuing scan of directory {}", p);
      runningTasks.incrementAndGet();
      ListenableFuture<ProcessInitialInputPathCallable.Result> future = exec
          .submit(new ProcessInitialInputPathCallable(p, conf, inputFilter));
      Futures.addCallback(future, processInitialInputPathCallback,
          MoreExecutors.directExecutor());
    }

    runningTasks.decrementAndGet();

    lock.lock();
    try {
      LOG.debug("Waiting scan org.apache.hadoop.shaded.com.letion");
      while (runningTasks.get() != 0 && unknownError == null) {
        condition.await();
      }
    } finally {
      lock.unlock();
      // either the scan completed or an error was raised.
      // in the case of an error shutting down the executor will interrupt all
      // active threads, which can add noise to the logs.
      LOG.debug("Scan org.apache.hadoop.shaded.com.lete: shutting down");
      this.exec.shutdownNow();
    }

    if (this.unknownError != null) {
      LOG.debug("Scan failed", this.unknownError);
      if (this.unknownError instanceof Error) {
        throw (Error) this.unknownError;
      } else if (this.unknownError instanceof RuntimeException) {
        throw (RuntimeException) this.unknownError;
      } else if (this.unknownError instanceof IOException) {
        throw (IOException) this.unknownError;
      } else if (this.unknownError instanceof InterruptedException) {
        throw (InterruptedException) this.unknownError;
      } else {
        throw new IOException(this.unknownError);
      }
    }
    if (!this.invalidInputErrors.isEmpty()) {
      LOG.debug("Invalid Input Errors raised");
      for (IOException error : invalidInputErrors) {
        LOG.debug("Error", error);
      }
      if (this.newApi) {
        throw new org.apache.hadoop.mapreduce.lib.input.InvalidInputException(
            invalidInputErrors);
      } else {
        throw new InvalidInputException(invalidInputErrors);
      }
    }
    return Iterables.concat(resultQueue);
  }

  /**
   * Collect misconfigured Input errors. Errors while actually reading file info
   * are reported immediately.
   */
  private void registerInvalidInputError(List<IOException> errors) {
    synchronized (this) {
      this.invalidInputErrors.addAll(errors);
    }
  }

  /**
   * Register fatal errors - for example, an IOException while accessing a file
   * or a full execution queue.
   */
  private void registerError(Throwable t) {
    LOG.debug("Error", t);
    lock.lock();
    try {
      if (unknownError == null) {
        unknownError = t;
        condition.signal();
      }

    } finally {
      lock.unlock();
    }
  }

  private void decrementRunningAndCheckCompletion() {
    lock.lock();
    try {
      if (runningTasks.decrementAndGet() == 0) {
        condition.signal();
      }
    } finally {
      lock.unlock();
    }
  }

  /**
   * Return any IOStatistics collected during listing.
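   * <p>
   * A small sketch of reading the statistics after a scan; the snapshot is
   * only present if at least one filesystem listing supplied statistics:
   * <pre>{@code
   * IOStatistics stats = fetcher.getIOStatistics();
   * if (stats != null) {
   *   LOG.info("Listing statistics: {}", stats);
   * }
   * }</pre>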
   * @return IO stats accrued.
   */
  @Override
  public synchronized IOStatistics getIOStatistics() {
    return iostats;
  }

  /**
   * Add the statistics of an individual thread's scan.
   * @param stats possibly null statistics.
   */
  private void addResultStatistics(IOStatistics stats) {
    if (stats != null) {
      // demand creation of IO statistics.
      synchronized (this) {
        LOG.debug("Adding IOStatistics: {}", stats);
        if (iostats == null) {
          // demand create the statistics
          iostats = snapshotIOStatistics(stats);
        } else {
          iostats.aggregate(stats);
        }
      }
    }
  }

  @Override
  public String toString() {
    final IOStatistics iostatistics = getIOStatistics();
    StringJoiner stringJoiner = new StringJoiner(", ",
        LocatedFileStatusFetcher.class.getSimpleName() + "[", "]");
    if (iostatistics != null) {
      stringJoiner.add("IOStatistics=" + iostatistics);
    }
    return stringJoiner.toString();
  }

  /**
   * Retrieves block locations for the given {@link FileStatus}, and adds
   * additional paths to the process queue if required.
   */
  private static class ProcessInputDirCallable implements
      Callable<ProcessInputDirCallable.Result> {

    private final FileSystem fs;
    private final FileStatus fileStatus;
    private final boolean recursive;
    private final PathFilter inputFilter;

    ProcessInputDirCallable(FileSystem fs, FileStatus fileStatus,
        boolean recursive, PathFilter inputFilter) {
      this.fs = fs;
      this.fileStatus = fileStatus;
      this.recursive = recursive;
      this.inputFilter = inputFilter;
    }

    @Override
    public Result call() throws Exception {
      Result result = new Result();
      result.fs = fs;
      LOG.debug("ProcessInputDirCallable {}", fileStatus);
      if (fileStatus.isDirectory()) {
        RemoteIterator<LocatedFileStatus> iter = fs
            .listLocatedStatus(fileStatus.getPath());
        while (iter.hasNext()) {
          LocatedFileStatus stat = iter.next();
          if (inputFilter.accept(stat.getPath())) {
            if (recursive && stat.isDirectory()) {
              result.dirsNeedingRecursiveCalls.add(stat);
            } else {
              result.locatedFileStatuses.add(stat);
            }
          }
        }
        // aggregate any stats
        result.stats = retrieveIOStatistics(iter);
      } else {
        result.locatedFileStatuses.add(fileStatus);
      }
      return result;
    }

    private static class Result {
      private List<FileStatus> locatedFileStatuses = new LinkedList<>();
      private List<FileStatus> dirsNeedingRecursiveCalls = new LinkedList<>();
      private FileSystem fs;
      private IOStatistics stats;
    }
  }

  /**
   * The callback handler to handle results generated by
   * {@link ProcessInputDirCallable}. This populates the final result set.
   * 
   */
  private class ProcessInputDirCallback implements
      FutureCallback<ProcessInputDirCallable.Result> {

    @Override
    public void onSuccess(ProcessInputDirCallable.Result result) {
      try {
        addResultStatistics(result.stats);
        if (!result.locatedFileStatuses.isEmpty()) {
          resultQueue.add(result.locatedFileStatuses);
        }
        if (!result.dirsNeedingRecursiveCalls.isEmpty()) {
          for (FileStatus fileStatus : result.dirsNeedingRecursiveCalls) {
            LOG.debug("Queueing directory scan {}", fileStatus.getPath());
            runningTasks.incrementAndGet();
            ListenableFuture<ProcessInputDirCallable.Result> future = exec
                .submit(new ProcessInputDirCallable(result.fs, fileStatus,
                    recursive, inputFilter));
            Futures.addCallback(future, processInputDirCallback,
                MoreExecutors.directExecutor());
          }
        }
        decrementRunningAndCheckCompletion();
      } catch (Throwable t) { // Error within the callback itself.
        registerError(t);
      }
    }

    @Override
    public void onFailure(Throwable t) {
      // Any generated exceptions. Leads to immediate termination.
      registerError(t);
    }
  }


  /**
   * Processes an initial Input Path pattern through the globber and PathFilter
   * to generate a list of files which need further processing.
   */
  private static class ProcessInitialInputPathCallable implements
      Callable<ProcessInitialInputPathCallable.Result> {

    private final Path path;
    private final Configuration conf;
    private final PathFilter inputFilter;

    public ProcessInitialInputPathCallable(Path path, Configuration conf,
        PathFilter pathFilter) {
      this.path = path;
      this.conf = conf;
      this.inputFilter = pathFilter;
    }

    @Override
    public Result call() throws Exception {
      Result result = new Result();
      FileSystem fs = path.getFileSystem(conf);
      result.fs = fs;
      LOG.debug("ProcessInitialInputPathCallable path {}", path);
      FileStatus[] matches = fs.globStatus(path, inputFilter);
      if (matches == null) {
        result.addError(new IOException("Input path does not exist: " + path));
      } else if (matches.length == 0) {
        result.addError(new IOException("Input Pattern " + path
            + " matches 0 files"));
      } else {
        result.matchedFileStatuses = matches;
      }
      return result;
    }

    private static class Result {
      private List<IOException> errors;
      private FileStatus[] matchedFileStatuses;
      private FileSystem fs;

      void addError(IOException ioe) {
        if (errors == null) {
          errors = new LinkedList<>();
        }
        errors.add(ioe);
      }
    }
  }

  /**
   * The callback handler to handle results generated by
   * {@link ProcessInitialInputPathCallable}.
   * 
   */
  private class ProcessInitialInputPathCallback implements
      FutureCallback<ProcessInitialInputPathCallable.Result> {

    @Override
    public void onSuccess(ProcessInitialInputPathCallable.Result result) {
      try {
        if (result.errors != null) {
          registerInvalidInputError(result.errors);
        }
        if (result.matchedFileStatuses != null) {
          for (FileStatus matched : result.matchedFileStatuses) {
            runningTasks.incrementAndGet();
            ListenableFuture<ProcessInputDirCallable.Result> future = exec
                .submit(new ProcessInputDirCallable(result.fs, matched,
                    recursive, inputFilter));
            Futures.addCallback(future, processInputDirCallback,
                MoreExecutors.directExecutor());
          }
        }
        decrementRunningAndCheckCompletion();
      } catch (Throwable t) { // Exception within the callback
        registerError(t);
      }
    }

    @Override
    public void onFailure(Throwable t) {
      // Any generated exceptions. Leads to immediate termination.
      registerError(t);
    }
  }

  @VisibleForTesting
  ListeningExecutorService getListeningExecutorService() {
    return exec;
  }

}



