All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.mapred.gridmix.FilePool Maven / Gradle / Ivy

There is a newer version: 3.4.1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred.gridmix;

import java.io.IOException;

import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.gridmix.RandomAlgorithms.Selector;

/**
 * Class for caching a pool of input data to be used by synthetic jobs for
 * simulating read traffic.
 */
class FilePool {

  public static final Log LOG = LogFactory.getLog(FilePool.class);

  /**
   * The minimum file size added to the pool. Default 128MiB.
   */
  public static final String GRIDMIX_MIN_FILE = "gridmix.min.file.size";

  /**
   * The maximum size for files added to the pool. Defualts to 100TiB.
   */
  public static final String GRIDMIX_MAX_TOTAL = "gridmix.max.total.scan";

  private Node root;
  private final Path path;
  private final FileSystem fs;
  private final Configuration conf;
  private final ReadWriteLock updateLock;

  /**
   * Initialize a filepool under the path provided, but do not populate the
   * cache.
   */
  public FilePool(Configuration conf, Path input) throws IOException {
    root = null;
    this.conf = conf;
    this.path = input;
    this.fs = path.getFileSystem(conf);
    updateLock = new ReentrantReadWriteLock();
  }

  /**
   * Gather a collection of files at least as large as minSize.
   * @return The total size of files returned.
   */
  public long getInputFiles(long minSize, Collection files)
      throws IOException {
    updateLock.readLock().lock();
    try {
      return root.selectFiles(minSize, files);
    } finally {
      updateLock.readLock().unlock();
    }
  }

  /**
   * (Re)generate cache of input FileStatus objects.
   */
  public void refresh() throws IOException {
    updateLock.writeLock().lock();
    try {
      root = new InnerDesc(fs, fs.getFileStatus(path),
        new MinFileFilter(conf.getLong(GRIDMIX_MIN_FILE, 128 * 1024 * 1024),
                          conf.getLong(GRIDMIX_MAX_TOTAL, 100L * (1L << 40))));
      if (0 == root.getSize()) {
        throw new IOException("Found no satisfactory file in " + path);
      }
    } finally {
      updateLock.writeLock().unlock();
    }
  }

  /**
   * Get a set of locations for the given file.
   */
  public BlockLocation[] locationsFor(FileStatus stat, long start, long len)
      throws IOException {
    // TODO cache
    return fs.getFileBlockLocations(stat, start, len);
  }

  static abstract class Node {

    protected final static Random rand = new Random();

    /**
     * Total size of files and directories under the current node.
     */
    abstract long getSize();

    /**
     * Return a set of files whose cumulative size is at least
     * targetSize.
     * TODO Clearly size is not the only criterion, e.g. refresh from
     * generated data without including running task output, tolerance
     * for permission issues, etc.
     */
    abstract long selectFiles(long targetSize, Collection files)
        throws IOException;
  }

  /**
   * Files in current directory of this Node.
   */
  static class LeafDesc extends Node {
    final long size;
    final ArrayList curdir;

    LeafDesc(ArrayList curdir, long size) {
      this.size = size;
      this.curdir = curdir;
    }

    @Override
    public long getSize() {
      return size;
    }

    @Override
    public long selectFiles(long targetSize, Collection files)
        throws IOException {
      if (targetSize >= getSize()) {
        files.addAll(curdir);
        return getSize();
      }

      Selector selector = new Selector(curdir.size(), (double) targetSize
          / getSize(), rand);
      
      ArrayList selected = new ArrayList();
      long ret = 0L;
      do {
        int index = selector.next();
        selected.add(index);
        ret += curdir.get(index).getLen();
      } while (ret < targetSize);

      for (Integer i : selected) {
        files.add(curdir.get(i));
      }

      return ret;
    }
  }

  /**
   * A subdirectory of the current Node.
   */
  static class InnerDesc extends Node {
    final long size;
    final double[] dist;
    final Node[] subdir;

    private static final Comparator nodeComparator =
      new Comparator() {
          public int compare(Node n1, Node n2) {
            return n1.getSize() < n2.getSize() ? -1
                 : n1.getSize() > n2.getSize() ? 1 : 0;
          }
    };

    InnerDesc(final FileSystem fs, FileStatus thisDir, MinFileFilter filter)
        throws IOException {
      long fileSum = 0L;
      final ArrayList curFiles = new ArrayList();
      final ArrayList curDirs = new ArrayList();
      for (FileStatus stat : fs.listStatus(thisDir.getPath())) {
        if (stat.isDirectory()) {
          curDirs.add(stat);
        } else if (filter.accept(stat)) {
          curFiles.add(stat);
          fileSum += stat.getLen();
        }
      }
      ArrayList subdirList = new ArrayList();
      if (!curFiles.isEmpty()) {
        subdirList.add(new LeafDesc(curFiles, fileSum));
      }
      for (Iterator i = curDirs.iterator();
          !filter.done() && i.hasNext();) {
        // add subdirectories
        final Node d = new InnerDesc(fs, i.next(), filter);
        final long dSize = d.getSize();
        if (dSize > 0) {
          fileSum += dSize;
          subdirList.add(d);
        }
      }
      size = fileSum;
      LOG.debug(size + " bytes in " + thisDir.getPath());
      subdir = subdirList.toArray(new Node[subdirList.size()]);
      Arrays.sort(subdir, nodeComparator);
      dist = new double[subdir.length];
      for (int i = dist.length - 1; i > 0; --i) {
        fileSum -= subdir[i].getSize();
        dist[i] = fileSum / (1.0 * size);
      }
    }

    @Override
    public long getSize() {
      return size;
    }

    @Override
    public long selectFiles(long targetSize, Collection files)
        throws IOException {
      long ret = 0L;
      if (targetSize >= getSize()) {
        // request larger than all subdirs; add everything
        for (Node n : subdir) {
          long added = n.selectFiles(targetSize, files);
          ret += added;
          targetSize -= added;
        }
        return ret;
      }

      // can satisfy request in proper subset of contents
      // select random set, weighted by size
      final HashSet sub = new HashSet();
      do {
        assert sub.size() < subdir.length;
        final double r = rand.nextDouble();
        int pos = Math.abs(Arrays.binarySearch(dist, r) + 1) - 1;
        while (sub.contains(subdir[pos])) {
          pos = (pos + 1) % subdir.length;
        }
        long added = subdir[pos].selectFiles(targetSize, files);
        ret += added;
        targetSize -= added;
        sub.add(subdir[pos]);
      } while (targetSize > 0);
      return ret;
    }
  }

  /**
   * Filter enforcing the minFile/maxTotal parameters of the scan.
   */
  private static class MinFileFilter {

    private long totalScan;
    private final long minFileSize;

    public MinFileFilter(long minFileSize, long totalScan) {
      this.minFileSize = minFileSize;
      this.totalScan = totalScan;
    }
    public boolean done() {
      return totalScan <= 0;
    }
    public boolean accept(FileStatus stat) {
      final boolean done = done();
      if (!done && stat.getLen() >= minFileSize) {
        totalScan -= stat.getLen();
        return true;
      }
      return false;
    }
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy