All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.tools.util.DistCpUtils Maven / Gradle / Ivy

There is a newer version: 3.4.1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.tools.util;

import com.google.common.collect.Maps;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.XAttr;
import org.apache.hadoop.fs.permission.AclEntry;
import org.apache.hadoop.fs.permission.AclUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.tools.DistCpConstants;
import org.apache.hadoop.tools.CopyListing.AclsNotSupportedException;
import org.apache.hadoop.tools.CopyListing.XAttrsNotSupportedException;
import org.apache.hadoop.tools.CopyListingFileStatus;
import org.apache.hadoop.tools.DistCpContext;
import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
import org.apache.hadoop.tools.mapred.UniformSizeInputFormat;
import org.apache.hadoop.util.StringUtils;

import java.io.IOException;
import java.text.DecimalFormat;
import java.util.EnumSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

/**
 * Utility functions used in DistCp.
 */
public class DistCpUtils {

  private static final Logger LOG = LoggerFactory.getLogger(DistCpUtils.class);

  /**
   * Retrieves size of the file at the specified path.
   * @param path The path of the file whose size is sought.
   * @param configuration Configuration, to retrieve the appropriate FileSystem.
   * @return The file-size, in number of bytes.
   * @throws IOException
   */
  public static long getFileSize(Path path, Configuration configuration)
                                            throws IOException {
    if (LOG.isDebugEnabled())
      LOG.debug("Retrieving file size for: " + path);
    return path.getFileSystem(configuration).getFileStatus(path).getLen();
  }

  /**
   * Utility to publish a value to a configuration.
   * @param configuration The Configuration to which the value must be written.
   * @param label The label for the value being published.
   * @param value The value being published.
   * @param  The type of the value.
   */
  public static  void publish(Configuration configuration,
                                 String label, T value) {
    configuration.set(label, String.valueOf(value));
  }

  /**
   * Utility to retrieve a specified key from a Configuration. Throw exception
   * if not found.
   * @param configuration The Configuration in which the key is sought.
   * @param label The key being sought.
   * @return Integer value of the key.
   */
  public static int getInt(Configuration configuration, String label) {
    int value = configuration.getInt(label, -1);
    assert value >= 0 : "Couldn't find " + label;
    return value;
  }

  /**
   * Utility to retrieve a specified key from a Configuration. Throw exception
   * if not found.
   * @param configuration The Configuration in which the key is sought.
   * @param label The key being sought.
   * @return Long value of the key.
   */
  public static long getLong(Configuration configuration, String label) {
    long value = configuration.getLong(label, -1);
    assert value >= 0 : "Couldn't find " + label;
    return value;
  }

  /**
   * Returns the class that implements a copy strategy. Looks up the implementation for
   * a particular strategy from distcp-default.xml
   *
   * @param conf - Configuration object
   * @param context - Distcp context with associated input options
   * @return Class implementing the strategy specified in options.
   */
  public static Class getStrategy(Configuration conf,
      DistCpContext context) {
    String confLabel = "distcp."
        + StringUtils.toLowerCase(context.getCopyStrategy())
        + ".strategy" + ".impl";
    return conf.getClass(confLabel, UniformSizeInputFormat.class, InputFormat.class);
  }

  /**
   * Gets relative path of child path with respect to a root path
   * For ex. If childPath = /tmp/abc/xyz/file and
   *            sourceRootPath = /tmp/abc
   * Relative path would be /xyz/file
   *         If childPath = /file and
   *            sourceRootPath = /
   * Relative path would be /file
   * @param sourceRootPath - Source root path
   * @param childPath - Path for which relative path is required
   * @return - Relative portion of the child path (always prefixed with /
   *           unless it is empty
   */
  public static String getRelativePath(Path sourceRootPath, Path childPath) {
    String childPathString = childPath.toUri().getPath();
    String sourceRootPathString = sourceRootPath.toUri().getPath();
    return sourceRootPathString.equals("/") ? childPathString :
        childPathString.substring(sourceRootPathString.length());
  }

  /**
   * Pack file preservation attributes into a string, containing
   * just the first character of each preservation attribute
   * @param attributes - Attribute set to preserve
   * @return - String containing first letters of each attribute to preserve
   */
  public static String packAttributes(EnumSet attributes) {
    StringBuffer buffer = new StringBuffer(FileAttribute.values().length);
    int len = 0;
    for (FileAttribute attribute : attributes) {
      buffer.append(attribute.name().charAt(0));
      len++;
    }
    return buffer.substring(0, len);
  }

  /**
   * Unpacks preservation attribute string containing the first character of
   * each preservation attribute back to a set of attributes to preserve
   * @param attributes - Attribute string
   * @return - Attribute set
   */
  public static EnumSet unpackAttributes(String attributes) {
    EnumSet retValue = EnumSet.noneOf(FileAttribute.class);

    if (attributes != null) {
      for (int index = 0; index < attributes.length(); index++) {
        retValue.add(FileAttribute.getAttribute(attributes.charAt(index)));
      }
    }

    return retValue;
  }

  /**
   * Preserve attribute on file matching that of the file status being sent
   * as argument. Barring the block size, all the other attributes are preserved
   * by this function
   *
   * @param targetFS - File system
   * @param path - Path that needs to preserve original file status
   * @param srcFileStatus - Original file status
   * @param attributes - Attribute set that needs to be preserved
   * @param preserveRawXattrs if true, raw.* xattrs should be preserved
   * @throws IOException - Exception if any (particularly relating to group/owner
   *                       change or any transient error)
   */
  public static void preserve(FileSystem targetFS, Path path,
                              CopyListingFileStatus srcFileStatus,
                              EnumSet attributes,
                              boolean preserveRawXattrs) throws IOException {

    // strip out those attributes we don't need any more
    attributes.remove(FileAttribute.BLOCKSIZE);
    attributes.remove(FileAttribute.CHECKSUMTYPE);
    // If not preserving anything from FileStatus, don't bother fetching it.
    FileStatus targetFileStatus = attributes.isEmpty() ? null :
        targetFS.getFileStatus(path);
    String group = targetFileStatus == null ? null :
        targetFileStatus.getGroup();
    String user = targetFileStatus == null ? null :
        targetFileStatus.getOwner();
    boolean chown = false;

    if (attributes.contains(FileAttribute.ACL)) {
      List srcAcl = srcFileStatus.getAclEntries();
      List targetAcl = getAcl(targetFS, targetFileStatus);
      if (!srcAcl.equals(targetAcl)) {
        targetFS.removeAcl(path);
        targetFS.setAcl(path, srcAcl);
      }
      // setAcl doesn't preserve sticky bit, so also call setPermission if needed.
      if (srcFileStatus.getPermission().getStickyBit() !=
          targetFileStatus.getPermission().getStickyBit()) {
        targetFS.setPermission(path, srcFileStatus.getPermission());
      }
    } else if (attributes.contains(FileAttribute.PERMISSION) &&
      !srcFileStatus.getPermission().equals(targetFileStatus.getPermission())) {
      targetFS.setPermission(path, srcFileStatus.getPermission());
    }

    final boolean preserveXAttrs = attributes.contains(FileAttribute.XATTR);
    if (preserveXAttrs || preserveRawXattrs) {
      final String rawNS =
          StringUtils.toLowerCase(XAttr.NameSpace.RAW.name());
      Map srcXAttrs = srcFileStatus.getXAttrs();
      Map targetXAttrs = getXAttrs(targetFS, path);
      if (srcXAttrs != null && !srcXAttrs.equals(targetXAttrs)) {
        for (Entry entry : srcXAttrs.entrySet()) {
          String xattrName = entry.getKey();
          if (xattrName.startsWith(rawNS) || preserveXAttrs) {
            targetFS.setXAttr(path, xattrName, entry.getValue());
          }
        }
      }
    }

    // The replication factor can only be preserved for replicated files.
    // It is ignored when either the source or target file are erasure coded.
    if (attributes.contains(FileAttribute.REPLICATION) &&
        !targetFileStatus.isDirectory() &&
        !targetFileStatus.isErasureCoded() &&
        !srcFileStatus.isErasureCoded() &&
        srcFileStatus.getReplication() != targetFileStatus.getReplication()) {
      targetFS.setReplication(path, srcFileStatus.getReplication());
    }

    if (attributes.contains(FileAttribute.GROUP) &&
        !group.equals(srcFileStatus.getGroup())) {
      group = srcFileStatus.getGroup();
      chown = true;
    }

    if (attributes.contains(FileAttribute.USER) &&
        !user.equals(srcFileStatus.getOwner())) {
      user = srcFileStatus.getOwner();
      chown = true;
    }

    if (chown) {
      targetFS.setOwner(path, user, group);
    }
    
    if (attributes.contains(FileAttribute.TIMES)) {
      targetFS.setTimes(path, 
          srcFileStatus.getModificationTime(), 
          srcFileStatus.getAccessTime());
    }
  }

  /**
   * Returns a file's full logical ACL.
   *
   * @param fileSystem FileSystem containing the file
   * @param fileStatus FileStatus of file
   * @return List containing full logical ACL
   * @throws IOException if there is an I/O error
   */
  public static List getAcl(FileSystem fileSystem,
      FileStatus fileStatus) throws IOException {
    List entries = fileSystem.getAclStatus(fileStatus.getPath())
      .getEntries();
    return AclUtil.getAclFromPermAndEntries(fileStatus.getPermission(), entries);
  }
  
  /**
   * Returns a file's all xAttrs.
   * 
   * @param fileSystem FileSystem containing the file
   * @param path file path
   * @return Map containing all xAttrs
   * @throws IOException if there is an I/O error
   */
  public static Map getXAttrs(FileSystem fileSystem,
      Path path) throws IOException {
    return fileSystem.getXAttrs(path);
  }

  /**
   * Converts FileStatus to a list of CopyListingFileStatus.
   * The resulted list contains either one CopyListingFileStatus per chunk of
   * file-blocks (if file-size exceeds blockSize * blocksPerChunk, and there
   * are more blocks in the file than blocksperChunk), or a single
   * CopyListingFileStatus for the entire file (if file-size is too small to
   * split).
   * If preserving ACLs, populates the CopyListingFileStatus with the ACLs.
   * If preserving XAttrs, populates the CopyListingFileStatus with the XAttrs.
   *
   * @param fileSystem FileSystem containing the file
   * @param fileStatus FileStatus of file
   * @param preserveAcls boolean true if preserving ACLs
   * @param preserveXAttrs boolean true if preserving XAttrs
   * @param preserveRawXAttrs boolean true if preserving raw.* XAttrs
   * @param blocksPerChunk size of chunks when copying chunks in parallel
   * @return list of CopyListingFileStatus
   * @throws IOException if there is an I/O error
   */
  public static LinkedList toCopyListingFileStatus(
      FileSystem fileSystem, FileStatus fileStatus, boolean preserveAcls,
      boolean preserveXAttrs, boolean preserveRawXAttrs, int blocksPerChunk)
          throws IOException {
    LinkedList copyListingFileStatus =
        new LinkedList();

    final CopyListingFileStatus clfs = toCopyListingFileStatusHelper(
        fileSystem, fileStatus, preserveAcls,
        preserveXAttrs, preserveRawXAttrs,
        0, fileStatus.getLen());
    final long blockSize = fileStatus.getBlockSize();
    if (LOG.isDebugEnabled()) {
      LOG.debug("toCopyListing: " + fileStatus + " chunkSize: "
          + blocksPerChunk + " isDFS: " +
          (fileSystem instanceof DistributedFileSystem));
    }
    if ((blocksPerChunk > 0) &&
        !fileStatus.isDirectory() &&
        (fileStatus.getLen() > blockSize * blocksPerChunk)) {
      // split only when the file size is larger than the intended chunk size
      final BlockLocation[] blockLocations;
      blockLocations = fileSystem.getFileBlockLocations(fileStatus, 0,
            fileStatus.getLen());

      int numBlocks = blockLocations.length;
      long curPos = 0;
      if (numBlocks <= blocksPerChunk) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("  add file " + clfs);
        }
        copyListingFileStatus.add(clfs);
      } else {
        int i = 0;
        while (i < numBlocks) {
          long curLength = 0;
          for (int j = 0; j < blocksPerChunk && i < numBlocks; ++j, ++i) {
            curLength += blockLocations[i].getLength();
          }
          if (curLength > 0) {
            CopyListingFileStatus clfs1 = new CopyListingFileStatus(clfs);
            clfs1.setChunkOffset(curPos);
            clfs1.setChunkLength(curLength);
            if (LOG.isDebugEnabled()) {
              LOG.debug("  add file chunk " + clfs1);
            }
            copyListingFileStatus.add(clfs1);
            curPos += curLength;
          }
        }
      }
    } else {
      if (LOG.isDebugEnabled()) {
        LOG.debug("  add file/dir " + clfs);
      }
      copyListingFileStatus.add(clfs);
    }

    return copyListingFileStatus;
  }

  /**
   * Converts a FileStatus to a CopyListingFileStatus.  If preserving ACLs,
   * populates the CopyListingFileStatus with the ACLs. If preserving XAttrs,
   * populates the CopyListingFileStatus with the XAttrs.
   *
   * @param fileSystem FileSystem containing the file
   * @param fileStatus FileStatus of file
   * @param preserveAcls boolean true if preserving ACLs
   * @param preserveXAttrs boolean true if preserving XAttrs
   * @param preserveRawXAttrs boolean true if preserving raw.* XAttrs
   * @param chunkOffset chunk offset in bytes
   * @param chunkLength chunk length in bytes
   * @return CopyListingFileStatus
   * @throws IOException if there is an I/O error
   */
  public static CopyListingFileStatus toCopyListingFileStatusHelper(
      FileSystem fileSystem, FileStatus fileStatus, boolean preserveAcls, 
      boolean preserveXAttrs, boolean preserveRawXAttrs,
      long chunkOffset, long chunkLength) throws IOException {
    CopyListingFileStatus copyListingFileStatus =
        new CopyListingFileStatus(fileStatus, chunkOffset, chunkLength);
    if (preserveAcls) {
      if (fileStatus.hasAcl()) {
        List aclEntries = fileSystem.getAclStatus(
          fileStatus.getPath()).getEntries();
        copyListingFileStatus.setAclEntries(aclEntries);
      }
    }
    if (preserveXAttrs || preserveRawXAttrs) {
      Map srcXAttrs = fileSystem.getXAttrs(fileStatus.getPath());
      if (preserveXAttrs && preserveRawXAttrs) {
         copyListingFileStatus.setXAttrs(srcXAttrs);
      } else {
        Map trgXAttrs = Maps.newHashMap();
        final String rawNS =
            StringUtils.toLowerCase(XAttr.NameSpace.RAW.name());
        for (Map.Entry ent : srcXAttrs.entrySet()) {
          final String xattrName = ent.getKey();
          if (xattrName.startsWith(rawNS)) {
            if (preserveRawXAttrs) {
              trgXAttrs.put(xattrName, ent.getValue());
            }
          } else if (preserveXAttrs) {
            trgXAttrs.put(xattrName, ent.getValue());
          }
        }
        copyListingFileStatus.setXAttrs(trgXAttrs);
      }
    }
    return copyListingFileStatus;
  }

  /**
   * Sort sequence file containing FileStatus and Text as key and value
   * respectively.
   *
   * @param conf - Configuration
   * @param sourceListing - Source listing file
   * @return Path of the sorted file. Is source file with _sorted appended to the name
   * @throws IOException - Any exception during sort.
   */
  public static Path sortListing(Configuration conf,
      Path sourceListing)
      throws IOException {
    Path output = new Path(sourceListing.toString() +  "_sorted");
    sortListing(conf, sourceListing, output);
    return output;
  }

  /**
   * Sort sequence file containing FileStatus and Text as key and value
   * respectively, saving the result to the {@code output} path, which
   * will be deleted first.
   *
   * @param conf - Configuration
   * @param sourceListing - Source listing file
   * @param output output path
   * @throws IOException - Any exception during sort.
   */

  public static void sortListing(final Configuration conf,
      final Path sourceListing,
      final Path output) throws IOException {
    FileSystem fs = sourceListing.getFileSystem(conf);
    // force verify that the destination FS matches the input
    fs.makeQualified(output);
    SequenceFile.Sorter sorter = new SequenceFile.Sorter(fs, Text.class,
      CopyListingFileStatus.class, conf);

    fs.delete(output, false);

    sorter.sort(sourceListing, output);
  }

  /**
   * Determines if a file system supports ACLs by running a canary getAclStatus
   * request on the file system root.  This method is used before distcp job
   * submission to fail fast if the user requested preserving ACLs, but the file
   * system cannot support ACLs.
   *
   * @param fs FileSystem to check
   * @throws AclsNotSupportedException if fs does not support ACLs
   */
  public static void checkFileSystemAclSupport(FileSystem fs)
      throws AclsNotSupportedException {
    try {
      fs.getAclStatus(new Path(Path.SEPARATOR));
    } catch (Exception e) {
      throw new AclsNotSupportedException("ACLs not supported for file system: "
        + fs.getUri());
    }
  }
  
  /**
   * Determines if a file system supports XAttrs by running a getXAttrs request
   * on the file system root. This method is used before distcp job submission
   * to fail fast if the user requested preserving XAttrs, but the file system
   * cannot support XAttrs.
   * 
   * @param fs FileSystem to check
   * @throws XAttrsNotSupportedException if fs does not support XAttrs
   */
  public static void checkFileSystemXAttrSupport(FileSystem fs)
      throws XAttrsNotSupportedException {
    try {
      fs.getXAttrs(new Path(Path.SEPARATOR));
    } catch (Exception e) {
      throw new XAttrsNotSupportedException("XAttrs not supported for file system: "
        + fs.getUri());
    }
  }

  /**
   * String utility to convert a number-of-bytes to human readable format.
   */
  private static final ThreadLocal FORMATTER
                        = new ThreadLocal() {
    @Override
    protected DecimalFormat initialValue() {
      return new DecimalFormat("0.0");
    }
  };

  public static DecimalFormat getFormatter() {
    return FORMATTER.get();
  }

  public static String getStringDescriptionFor(long nBytes) {

    char units [] = {'B', 'K', 'M', 'G', 'T', 'P'};

    double current = nBytes;
    double prev    = current;
    int index = 0;

    while ((current = current/1024) >= 1) {
      prev = current;
      ++index;
    }

    assert index < units.length : "Too large a number.";

    return getFormatter().format(prev) + units[index];
  }

  /**
   * Utility to compare checksums for the paths specified.
   *
   * If checksums can't be retrieved, it doesn't fail the test
   * Only time the comparison would fail is when checksums are
   * available and they don't match
   *
   * @param sourceFS FileSystem for the source path.
   * @param source The source path.
   * @param sourceChecksum The checksum of the source file. If it is null we
   * still need to retrieve it through sourceFS.
   * @param targetFS FileSystem for the target path.
   * @param target The target path.
   * @return If either checksum couldn't be retrieved, the function returns
   * false. If checksums are retrieved, the function returns true if they match,
   * and false otherwise.
   * @throws IOException if there's an exception while retrieving checksums.
   */
  public static boolean checksumsAreEqual(FileSystem sourceFS, Path source,
                                          FileChecksum sourceChecksum,
                                          FileSystem targetFS,
                                          Path target, long sourceLen)
      throws IOException {
    FileChecksum targetChecksum = null;
    try {
      sourceChecksum = sourceChecksum != null
          ? sourceChecksum
          : sourceFS.getFileChecksum(source, sourceLen);
      if (sourceChecksum != null) {
        // iff there's a source checksum, look for one at the destination.
        targetChecksum = targetFS.getFileChecksum(target);
      }
    } catch (IOException e) {
      LOG.error("Unable to retrieve checksum for " + source + " or " + target, e);
    }
    return (sourceChecksum == null || targetChecksum == null ||
            sourceChecksum.equals(targetChecksum));
  }

  /**
   * Utility to compare file lengths and checksums for source and target.
   *
   * @param sourceFS FileSystem for the source path.
   * @param source The source path.
   * @param sourceChecksum The checksum of the source file. If it is null we
   * still need to retrieve it through sourceFS.
   * @param targetFS FileSystem for the target path.
   * @param target The target path.
   * @param skipCrc The flag to indicate whether to skip checksums.
   * @throws IOException if there's a mismatch in file lengths or checksums.
   */
  public static void compareFileLengthsAndChecksums(long srcLen,
             FileSystem sourceFS, Path source, FileChecksum sourceChecksum,
             FileSystem targetFS, Path target, boolean skipCrc,
             long targetLen) throws IOException {
    if (srcLen != targetLen) {
      throw new IOException(
          DistCpConstants.LENGTH_MISMATCH_ERROR_MSG + source + " (" + srcLen
              + ") and target:" + target + " (" + targetLen + ")");
    }

    //At this point, src & dest lengths are same. if length==0, we skip checksum
    if ((srcLen != 0) && (!skipCrc)) {
      if (!checksumsAreEqual(sourceFS, source, sourceChecksum,
          targetFS, target, srcLen)) {
        StringBuilder errorMessage =
            new StringBuilder(DistCpConstants.CHECKSUM_MISMATCH_ERROR_MSG)
                .append(source).append(" and ").append(target).append(".");
        boolean addSkipHint = false;
        String srcScheme = sourceFS.getScheme();
        String targetScheme = targetFS.getScheme();
        if (!srcScheme.equals(targetScheme)) {
          // the filesystems are different and they aren't both hdfs connectors
          errorMessage.append("Source and destination filesystems are of"
              + " different types\n")
              .append("Their checksum algorithms may be incompatible");
          addSkipHint = true;
        } else if (sourceFS.getFileStatus(source).getBlockSize() !=
            targetFS.getFileStatus(target).getBlockSize()) {
          errorMessage.append(" Source and target differ in block-size.\n")
              .append(" Use -pb to preserve block-sizes during copy.");
          addSkipHint = true;
        }
        if (addSkipHint) {
          errorMessage
              .append(" You can choose file-level checksum validation via "
                  + "-Ddfs.checksum.combine.mode=COMPOSITE_CRC when block-sizes"
                  + " or filesystems are different.")
              .append(" Or you can skip checksum-checks altogether "
                  + " with -skipcrccheck.\n")
              .append(" (NOTE: By skipping checksums, one runs the risk of " +
                  "masking data-corruption during file-transfer.)\n");
        }
        throw new IOException(errorMessage.toString());
      }
    }
  }

  /*
   * Return the Path for a given chunk.
   * Used when splitting large file into chunks to copy in parallel.
   * @param targetFile path to target file
   * @param srcFileStatus source file status in copy listing
   * @return path to the chunk specified by the parameters to store
   * in target cluster temporarily
   */
  public static Path getSplitChunkPath(Path targetFile,
      CopyListingFileStatus srcFileStatus) {
    return new Path(targetFile.toString()
        + ".____distcpSplit____" + srcFileStatus.getChunkOffset()
        + "." + srcFileStatus.getChunkLength());
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy