All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.fs.s3a.Listing Maven / Gradle / Ivy

Go to download

This module contains code to support integration with Amazon Web Services. It also declares the dependencies needed to work with AWS services.

There is a newer version: 3.4.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.fs.s3a;

import com.amazonaws.AmazonClientException;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;

import com.google.common.base.Preconditions;
import org.slf4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;
import java.util.Set;

import static org.apache.hadoop.fs.s3a.Constants.S3N_FOLDER_SUFFIX;
import static org.apache.hadoop.fs.s3a.S3AUtils.createFileStatus;
import static org.apache.hadoop.fs.s3a.S3AUtils.objectRepresentsDirectory;
import static org.apache.hadoop.fs.s3a.S3AUtils.stringify;
import static org.apache.hadoop.fs.s3a.S3AUtils.translateException;

/**
 * Place for the S3A listing classes; keeps all the small classes under control.
 */
public class Listing {

  private final S3AFileSystem owner;
  private static final Logger LOG = S3AFileSystem.LOG;

  public Listing(S3AFileSystem owner) {
    this.owner = owner;
  }

  /**
   * Create a FileStatus iterator against a provided list of file status, with
   * a given status filter.
   *
   * @param fileStatuses the provided list of file status. NO remote calls.
   * @param filter file path filter on which paths to accept
   * @param acceptor the file status acceptor
   * @return the file status iterator
   */
  ProvidedFileStatusIterator createProvidedFileStatusIterator(
      FileStatus[] fileStatuses,
      PathFilter filter,
      FileStatusAcceptor acceptor) {
    return new ProvidedFileStatusIterator(fileStatuses, filter, acceptor);
  }

  /**
   * Create a FileStatus iterator against a path, with a given list object
   * request.
   *
   * @param listPath path of the listing
   * @param request initial request to make
   * @param filter the filter on which paths to accept
   * @param acceptor the class/predicate to decide which entries to accept
   * in the listing based on the full file status.
   * @return the iterator
   * @throws IOException IO Problems
   */
  FileStatusListingIterator createFileStatusListingIterator(
      Path listPath,
      S3ListRequest request,
      PathFilter filter,
      Listing.FileStatusAcceptor acceptor) throws IOException {
    return createFileStatusListingIterator(listPath, request, filter, acceptor,
        null);
  }

  /**
   * Create a FileStatus iterator against a path, with a given
   * list object request.
   * @param listPath path of the listing
   * @param request initial request to make
   * @param filter the filter on which paths to accept
   * @param acceptor the class/predicate to decide which entries to accept
   * in the listing based on the full file status.
   * @param providedStatus the provided list of file status, which may contain
   *                       items that are not listed from source.
   * @return the iterator
   * @throws IOException IO Problems
   */
  @Retries.RetryRaw
  FileStatusListingIterator createFileStatusListingIterator(
      Path listPath,
      S3ListRequest request,
      PathFilter filter,
      Listing.FileStatusAcceptor acceptor,
      RemoteIterator providedStatus) throws IOException {
    return new FileStatusListingIterator(
        new ObjectListingIterator(listPath, request),
        filter,
        acceptor,
        providedStatus);
  }

  /**
   * Create a located status iterator over a file status iterator.
   * @param statusIterator an iterator over the remote status entries
   * @return a new remote iterator
   */
  @VisibleForTesting
  LocatedFileStatusIterator createLocatedFileStatusIterator(
      RemoteIterator statusIterator) {
    return new LocatedFileStatusIterator(statusIterator);
  }

  /**
   * Create an located status iterator that wraps another to filter out a set
   * of recently deleted items.
   * @param iterator an iterator over the remote located status entries.
   * @param tombstones set of paths that are recently deleted and should be
   *                   filtered.
   * @return a new remote iterator.
   */
  @VisibleForTesting
  TombstoneReconcilingIterator createTombstoneReconcilingIterator(
      RemoteIterator iterator, Set tombstones) {
    return new TombstoneReconcilingIterator(iterator, tombstones);
  }

  /**
   * Interface to implement by the logic deciding whether to accept a summary
   * entry or path as a valid file or directory.
   */
  interface FileStatusAcceptor {

    /**
     * Predicate to decide whether or not to accept a summary entry.
     * @param keyPath qualified path to the entry
     * @param summary summary entry
     * @return true if the entry is accepted (i.e. that a status entry
     * should be generated.
     */
    boolean accept(Path keyPath, S3ObjectSummary summary);

    /**
     * Predicate to decide whether or not to accept a prefix.
     * @param keyPath qualified path to the entry
     * @param commonPrefix the prefix
     * @return true if the entry is accepted (i.e. that a status entry
     * should be generated.)
     */
    boolean accept(Path keyPath, String commonPrefix);

    /**
     * Predicate to decide whether or not to accept a file status.
     * @param status file status containing file path information
     * @return true if the status is accepted else false
     */
    boolean accept(FileStatus status);
  }

  /**
   * A remote iterator which only iterates over a single `LocatedFileStatus`
   * value.
   *
   * If the status value is null, the iterator declares that it has no data.
   * This iterator is used to handle {@link S3AFileSystem#listStatus} calls
   * where the path handed in refers to a file, not a directory: this is the
   * iterator returned.
   */
  static final class SingleStatusRemoteIterator
      implements RemoteIterator {

    /**
     * The status to return; set to null after the first iteration.
     */
    private LocatedFileStatus status;

    /**
     * Constructor.
     * @param status status value: may be null, in which case
     * the iterator is empty.
     */
    public SingleStatusRemoteIterator(LocatedFileStatus status) {
      this.status = status;
    }

    /**
     * {@inheritDoc}
     * @return true if there is a file status to return: this is always false
     * for the second iteration, and may be false for the first.
     * @throws IOException never
     */
    @Override
    public boolean hasNext() throws IOException {
      return status != null;
    }

    /**
     * {@inheritDoc}
     * @return the non-null status element passed in when the instance was
     * constructed, if it ha not already been retrieved.
     * @throws IOException never
     * @throws NoSuchElementException if this is the second call, or it is
     * the first call and a null {@link LocatedFileStatus} entry was passed
     * to the constructor.
     */
    @Override
    public LocatedFileStatus next() throws IOException {
      if (hasNext()) {
        LocatedFileStatus s = this.status;
        status = null;
        return s;
      } else {
        throw new NoSuchElementException();
      }
    }
  }

  /**
   * This wraps up a provided non-null list of file status as a remote iterator.
   *
   * It firstly filters the provided list and later {@link #next} call will get
   * from the filtered list. This suffers from scalability issues if the
   * provided list is too large.
   *
   * There is no remote data to fetch.
   */
  static class ProvidedFileStatusIterator
      implements RemoteIterator {
    private final ArrayList filteredStatusList;
    private int index = 0;

    ProvidedFileStatusIterator(FileStatus[] fileStatuses, PathFilter filter,
        FileStatusAcceptor acceptor) {
      Preconditions.checkArgument(fileStatuses != null, "Null status list!");

      filteredStatusList = new ArrayList<>(fileStatuses.length);
      for (FileStatus status : fileStatuses) {
        if (filter.accept(status.getPath()) && acceptor.accept(status)) {
          filteredStatusList.add(status);
        }
      }
      filteredStatusList.trimToSize();
    }

    @Override
    public boolean hasNext() throws IOException {
      return index < filteredStatusList.size();
    }

    @Override
    public FileStatus next() throws IOException {
      if (!hasNext()) {
        throw new NoSuchElementException();
      }
      return filteredStatusList.get(index++);
    }
  }

  /**
   * Wraps up object listing into a remote iterator which will ask for more
   * listing data if needed.
   *
   * This is a complex operation, especially the process to determine
   * if there are more entries remaining. If there are no more results
   * remaining in the (filtered) results of the current listing request, then
   * another request is made and those results filtered before the
   * iterator can declare that there is more data available.
   *
   * The need to filter the results precludes the iterator from simply
   * declaring that if the {@link ObjectListingIterator#hasNext()}
   * is true then there are more results. Instead the next batch of results must
   * be retrieved and filtered.
   *
   * What does this mean? It means that remote requests to retrieve new
   * batches of object listings are made in the {@link #hasNext()} call;
   * the {@link #next()} call simply returns the filtered results of the last
   * listing processed. However, do note that {@link #next()} calls
   * {@link #hasNext()} during its operation. This is critical to ensure
   * that a listing obtained through a sequence of {@link #next()} will
   * complete with the same set of results as a classic
   * {@code while(it.hasNext()} loop.
   *
   * Thread safety: None.
   */
  class FileStatusListingIterator
      implements RemoteIterator {

    /** Source of objects. */
    private final ObjectListingIterator source;
    /** Filter of paths from API call. */
    private final PathFilter filter;
    /** Filter of entries from file status. */
    private final FileStatusAcceptor acceptor;
    /** request batch size. */
    private int batchSize;
    /** Iterator over the current set of results. */
    private ListIterator statusBatchIterator;

    private final Set providedStatus;
    private Iterator providedStatusIterator;

    /**
     * Create an iterator over file status entries.
     * @param source the listing iterator from a listObjects call.
     * @param filter the filter on which paths to accept
     * @param acceptor the class/predicate to decide which entries to accept
     * in the listing based on the full file status.
     * @param providedStatus the provided list of file status, which may contain
     *                       items that are not listed from source.
     * @throws IOException IO Problems
     */
    @Retries.RetryTranslated
    FileStatusListingIterator(ObjectListingIterator source,
        PathFilter filter,
        FileStatusAcceptor acceptor,
        RemoteIterator providedStatus) throws IOException {
      this.source = source;
      this.filter = filter;
      this.acceptor = acceptor;
      this.providedStatus = new HashSet<>();
      for (; providedStatus != null && providedStatus.hasNext();) {
        final FileStatus status = providedStatus.next();
        if (filter.accept(status.getPath()) && acceptor.accept(status)) {
          this.providedStatus.add(status);
        }
      }
      // build the first set of results. This will not trigger any
      // remote IO, assuming the source iterator is in its initial
      // iteration
      requestNextBatch();
    }

    /**
     * Report whether or not there is new data available.
     * If there is data in the local filtered list, return true.
     * Else: request more data util that condition is met, or there
     * is no more remote listing data.
     * Lastly, return true if the {@code providedStatusIterator}
     * has left items.
     * @return true if a call to {@link #next()} will succeed.
     * @throws IOException
     */
    @Override
    @Retries.RetryTranslated
    public boolean hasNext() throws IOException {
      return sourceHasNext() || providedStatusIterator.hasNext();
    }

    @Retries.RetryTranslated
    private boolean sourceHasNext() throws IOException {
      if (statusBatchIterator.hasNext() || requestNextBatch()) {
        return true;
      } else {
        // turn to file status that are only in provided list
        if (providedStatusIterator == null) {
          LOG.debug("Start iterating the provided status.");
          providedStatusIterator = providedStatus.iterator();
        }
        return false;
      }
    }

    @Override
    @Retries.RetryTranslated
    public FileStatus next() throws IOException {
      final FileStatus status;
      if (sourceHasNext()) {
        status = statusBatchIterator.next();
        // We remove from provided list the file status listed by S3 so that
        // this does not return duplicate items.
        if (providedStatus.remove(status)) {
          LOG.debug("Removed the status from provided file status {}", status);
        }
      } else {
        if (providedStatusIterator.hasNext()) {
          status = providedStatusIterator.next();
          LOG.debug("Returning provided file status {}", status);
        } else {
          throw new NoSuchElementException();
        }
      }
      return status;
    }

    /**
     * Try to retrieve another batch.
     * Note that for the initial batch,
     * {@link ObjectListingIterator} does not generate a request;
     * it simply returns the initial set.
     *
     * @return true if a new batch was created.
     * @throws IOException IO problems
     */
    @Retries.RetryTranslated
    private boolean requestNextBatch() throws IOException {
      // look for more object listing batches being available
      while (source.hasNext()) {
        // if available, retrieve it and build the next status
        if (buildNextStatusBatch(source.next())) {
          // this batch successfully generated entries matching the filters/
          // acceptors; declare that the request was successful
          return true;
        } else {
          LOG.debug("All entries in batch were filtered...continuing");
        }
      }
      // if this code is reached, it means that all remaining
      // object lists have been retrieved, and there are no new entries
      // to return.
      return false;
    }

    /**
     * Build the next status batch from a listing.
     * @param objects the next object listing
     * @return true if this added any entries after filtering
     */
    private boolean buildNextStatusBatch(S3ListResult objects) {
      // counters for debug logs
      int added = 0, ignored = 0;
      // list to fill in with results. Initial size will be list maximum.
      List stats = new ArrayList<>(
          objects.getObjectSummaries().size() +
              objects.getCommonPrefixes().size());
      // objects
      for (S3ObjectSummary summary : objects.getObjectSummaries()) {
        String key = summary.getKey();
        Path keyPath = owner.keyToQualifiedPath(key);
        if (LOG.isDebugEnabled()) {
          LOG.debug("{}: {}", keyPath, stringify(summary));
        }
        // Skip over keys that are ourselves and old S3N _$folder$ files
        if (acceptor.accept(keyPath, summary) && filter.accept(keyPath)) {
          FileStatus status = createFileStatus(keyPath, summary,
              owner.getDefaultBlockSize(keyPath), owner.getUsername());
          LOG.debug("Adding: {}", status);
          stats.add(status);
          added++;
        } else {
          LOG.debug("Ignoring: {}", keyPath);
          ignored++;
        }
      }

      // prefixes: always directories
      for (String prefix : objects.getCommonPrefixes()) {
        Path keyPath = owner.keyToQualifiedPath(prefix);
        if (acceptor.accept(keyPath, prefix) && filter.accept(keyPath)) {
          FileStatus status = new S3AFileStatus(Tristate.FALSE, keyPath,
              owner.getUsername());
          LOG.debug("Adding directory: {}", status);
          added++;
          stats.add(status);
        } else {
          LOG.debug("Ignoring directory: {}", keyPath);
          ignored++;
        }
      }

      // finish up
      batchSize = stats.size();
      statusBatchIterator = stats.listIterator();
      boolean hasNext = statusBatchIterator.hasNext();
      LOG.debug("Added {} entries; ignored {}; hasNext={}; hasMoreObjects={}",
          added, ignored, hasNext, objects.isTruncated());
      return hasNext;
    }

    /**
     * Get the number of entries in the current batch.
     * @return a number, possibly zero.
     */
    public int getBatchSize() {
      return batchSize;
    }
  }

  /**
   * Wraps up AWS `ListObjects` requests in a remote iterator
   * which will ask for more listing data if needed.
   *
   * That is:
   *
   * 1. The first invocation of the {@link #next()} call will return the results
   * of the first request, the one created during the construction of the
   * instance.
   *
   * 2. Second and later invocations will continue the ongoing listing,
   * calling {@link S3AFileSystem#continueListObjects} to request the next
   * batch of results.
   *
   * 3. The {@link #hasNext()} predicate returns true for the initial call,
   * where {@link #next()} will return the initial results. It declares
   * that it has future results iff the last executed request was truncated.
   *
   * Thread safety: none.
   */
  class ObjectListingIterator implements RemoteIterator {

    /** The path listed. */
    private final Path listPath;

    /** The most recent listing results. */
    private S3ListResult objects;

    /** The most recent listing request. */
    private S3ListRequest request;

    /** Indicator that this is the first listing. */
    private boolean firstListing = true;

    /**
     * Count of how many listings have been requested
     * (including initial result).
     */
    private int listingCount = 1;

    /**
     * Maximum keys in a request.
     */
    private int maxKeys;

    /**
     * Constructor -calls `listObjects()` on the request to populate the
     * initial set of results/fail if there was a problem talking to the bucket.
     * @param listPath path of the listing
     * @param request initial request to make
     * @throws IOException if listObjects raises one.
     */
    @Retries.RetryRaw
    ObjectListingIterator(
        Path listPath,
        S3ListRequest request) throws IOException {
      this.listPath = listPath;
      this.maxKeys = owner.getMaxKeys();
      this.objects = owner.listObjects(request);
      this.request = request;
    }

    /**
     * Declare that the iterator has data if it is either is the initial
     * iteration or it is a later one and the last listing obtained was
     * incomplete.
     * @throws IOException never: there is no IO in this operation.
     */
    @Override
    public boolean hasNext() throws IOException {
      return firstListing || objects.isTruncated();
    }

    /**
     * Ask for the next listing.
     * For the first invocation, this returns the initial set, with no
     * remote IO. For later requests, S3 will be queried, hence the calls
     * may block or fail.
     * @return the next object listing.
     * @throws IOException if a query made of S3 fails.
     * @throws NoSuchElementException if there is no more data to list.
     */
    @Override
    @Retries.RetryTranslated
    public S3ListResult next() throws IOException {
      if (firstListing) {
        // on the first listing, don't request more data.
        // Instead just clear the firstListing flag so that it future calls
        // will request new data.
        firstListing = false;
      } else {
        try {
          if (!objects.isTruncated()) {
            // nothing more to request: fail.
            throw new NoSuchElementException("No more results in listing of "
                + listPath);
          }
          // need to request a new set of objects.
          LOG.debug("[{}], Requesting next {} objects under {}",
              listingCount, maxKeys, listPath);
          objects = owner.continueListObjects(request, objects);
          listingCount++;
          LOG.debug("New listing status: {}", this);
        } catch (AmazonClientException e) {
          throw translateException("listObjects()", listPath, e);
        }
      }
      return objects;
    }

    @Override
    public String toString() {
      return "Object listing iterator against " + listPath
          + "; listing count "+ listingCount
          + "; isTruncated=" + objects.isTruncated();
    }

    /**
     * Get the path listed.
     * @return the path used in this listing.
     */
    public Path getListPath() {
      return listPath;
    }

    /**
     * Get the count of listing requests.
     * @return the counter of requests made (including the initial lookup).
     */
    public int getListingCount() {
      return listingCount;
    }
  }

  /**
   * Accept all entries except the base path and those which map to S3N
   * pseudo directory markers.
   */
  static class AcceptFilesOnly implements FileStatusAcceptor {
    private final Path qualifiedPath;

    public AcceptFilesOnly(Path qualifiedPath) {
      this.qualifiedPath = qualifiedPath;
    }

    /**
     * Reject a summary entry if the key path is the qualified Path, or
     * it ends with {@code "_$folder$"}.
     * @param keyPath key path of the entry
     * @param summary summary entry
     * @return true if the entry is accepted (i.e. that a status entry
     * should be generated.
     */
    @Override
    public boolean accept(Path keyPath, S3ObjectSummary summary) {
      return !keyPath.equals(qualifiedPath)
          && !summary.getKey().endsWith(S3N_FOLDER_SUFFIX)
          && !objectRepresentsDirectory(summary.getKey(), summary.getSize());
    }

    /**
     * Accept no directory paths.
     * @param keyPath qualified path to the entry
     * @param prefix common prefix in listing.
     * @return false, always.
     */
    @Override
    public boolean accept(Path keyPath, String prefix) {
      return false;
    }

    @Override
    public boolean accept(FileStatus status) {
      return (status != null) && status.isFile();
    }
  }

  /**
   * Take a remote iterator over a set of {@link FileStatus} instances and
   * return a remote iterator of {@link LocatedFileStatus} instances.
   */
  class LocatedFileStatusIterator
      implements RemoteIterator {
    private final RemoteIterator statusIterator;

    /**
     * Constructor.
     * @param statusIterator an iterator over the remote status entries
     */
    LocatedFileStatusIterator(RemoteIterator statusIterator) {
      this.statusIterator = statusIterator;
    }

    @Override
    public boolean hasNext() throws IOException {
      return statusIterator.hasNext();
    }

    @Override
    public LocatedFileStatus next() throws IOException {
      return owner.toLocatedFileStatus(statusIterator.next());
    }
  }

  /**
   * Wraps another iterator and filters out files that appear in the provided
   * set of tombstones.  Will read ahead in the iterator when necessary to
   * ensure that emptiness is detected early enough if only deleted objects
   * remain in the source iterator.
   */
  static class TombstoneReconcilingIterator implements
      RemoteIterator {
    private LocatedFileStatus next = null;
    private final RemoteIterator iterator;
    private final Set tombstones;

    /**
     * @param iterator Source iterator to filter
     * @param tombstones set of tombstone markers to filter out of results
     */
    TombstoneReconcilingIterator(RemoteIterator
        iterator, Set tombstones) {
      this.iterator = iterator;
      if (tombstones != null) {
        this.tombstones = tombstones;
      } else {
        this.tombstones = Collections.emptySet();
      }
    }

    private boolean fetch() throws IOException {
      while (next == null && iterator.hasNext()) {
        LocatedFileStatus candidate = iterator.next();
        if (!tombstones.contains(candidate.getPath())) {
          next = candidate;
          return true;
        }
      }
      return false;
    }

    public boolean hasNext() throws IOException {
      if (next != null) {
        return true;
      }
      return fetch();
    }

    public LocatedFileStatus next() throws IOException {
      if (hasNext()) {
        LocatedFileStatus result = next;
        next = null;
        fetch();
        return result;
      }
      throw new NoSuchElementException();
    }
  }

  /**
   * Accept all entries except those which map to S3N pseudo directory markers.
   */
  static class AcceptAllButS3nDirs implements FileStatusAcceptor {

    public boolean accept(Path keyPath, S3ObjectSummary summary) {
      return !summary.getKey().endsWith(S3N_FOLDER_SUFFIX);
    }

    public boolean accept(Path keyPath, String prefix) {
      return !keyPath.toString().endsWith(S3N_FOLDER_SUFFIX);
    }

    public boolean accept(FileStatus status) {
      return !status.getPath().toString().endsWith(S3N_FOLDER_SUFFIX);
    }

  }

  /**
   * Accept all entries except the base path and those which map to S3N
   * pseudo directory markers.
   */
  static class AcceptAllButSelfAndS3nDirs implements FileStatusAcceptor {

    /** Base path. */
    private final Path qualifiedPath;

    /**
     * Constructor.
     * @param qualifiedPath an already-qualified path.
     */
    public AcceptAllButSelfAndS3nDirs(Path qualifiedPath) {
      this.qualifiedPath = qualifiedPath;
    }

    /**
     * Reject a summary entry if the key path is the qualified Path, or
     * it ends with {@code "_$folder$"}.
     * @param keyPath key path of the entry
     * @param summary summary entry
     * @return true if the entry is accepted (i.e. that a status entry
     * should be generated.)
     */
    @Override
    public boolean accept(Path keyPath, S3ObjectSummary summary) {
      return !keyPath.equals(qualifiedPath) &&
          !summary.getKey().endsWith(S3N_FOLDER_SUFFIX);
    }

    /**
     * Accept all prefixes except the one for the base path, "self".
     * @param keyPath qualified path to the entry
     * @param prefix common prefix in listing.
     * @return true if the entry is accepted (i.e. that a status entry
     * should be generated.
     */
    @Override
    public boolean accept(Path keyPath, String prefix) {
      return !keyPath.equals(qualifiedPath);
    }

    @Override
    public boolean accept(FileStatus status) {
      return (status != null) && !status.getPath().equals(qualifiedPath);
    }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy