All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.tools.DistCpOptions Maven / Gradle / Ivy

There is a newer version: 3.4.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.tools;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.util.DistCpUtils;

import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;

/**
 * The Options class encapsulates all DistCp options.
 *
 * When you add a new option, please:
 *  - Add the field along with javadoc in DistCpOptions and its Builder
 *  - Add setter method in the {@link Builder} class
 *
 * This class is immutable.
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public final class DistCpOptions {
  // The logger is registered under the Builder class name.
  private static final Logger LOG = LoggerFactory.getLogger(Builder.class);

  /** Upper bound on the number of listStatus threads a Builder accepts. */
  public static final int MAX_NUM_LISTSTATUS_THREADS = 40;

  /** File path (hdfs:// or file://) that contains the list of actual files to
   * copy.
   */
  private final Path sourceFileListing;

  /** List of source-paths (including wildcards) to be copied to target. */
  private final List<Path> sourcePaths;

  /** Destination path for the dist-copy. */
  private final Path targetPath;

  /** Whether the copy should be committed atomically. */
  private final boolean atomicCommit;

  /** The work path for atomic commit. If null, the work
   * path would be parentOf(targetPath) + "/._WIP_" + nameOf(targetPath). */
  private final Path atomicWorkPath;

  /** Whether source and target folder contents should be sync'ed up. */
  private final boolean syncFolder;

  /** Path to save source/dest sequence files to, if non-null. */
  private final Path trackPath;

  /** Whether files only present in target should be deleted. */
  private final boolean deleteMissing;

  /** Whether failures during copy should be ignored. */
  private final boolean ignoreFailures;

  /** Whether files should always be overwritten on target. */
  private final boolean overwrite;

  /** Whether we want to append new data to target files. This is valid only
   * with update option and CRC is not skipped. */
  private final boolean append;

  /** Whether checksum comparison should be skipped while determining if source
   * and destination files are identical. */
  private final boolean skipCRC;

  /** Whether to run blocking or non-blocking. */
  private final boolean blocking;

  // When "-diff s1 s2 src tgt" is passed, apply forward snapshot diff (from s1
  // to s2) of source cluster to the target cluster to sync target cluster with
  // the source cluster. Referred to as "Fdiff" in the code.
  // It's required that s2 is newer than s1.
  private final boolean useDiff;

  // When "-rdiff s2 s1 src tgt" is passed, apply reversed snapshot diff (from
  // s2 to s1) of target cluster to the target cluster, so as to make target
  // cluster go back to s1. Referred to as "Rdiff" in the code.
  // It's required that s2 is newer than s1, and src and tgt have exact same
  // content at their s1, if src is not the same as tgt.
  private final boolean useRdiff;

  /** Whether to log additional info (path, size) in the SKIP/COPY log. */
  private final boolean verboseLog;

  // For both -diff and -rdiff, given the example command line switches, two
  // steps are taken:
  //   1. Sync Step. This step does renaming/deletion ops in the snapshot diff,
  //      so as to avoid copying files copied already but renamed later
  //      (HDFS-7535)
  //   2. Copy Step. This step copies the necessary files from src to tgt
  //      2.1 For -diff, it copies from snapshot s2 of src (HDFS-8828)
  //      2.2 For -rdiff, it copies from snapshot s1 of src, where the src
  //          could be the tgt itself (HDFS-9820).
  //

  /** First snapshot name given to -diff/-rdiff. */
  private final String fromSnapshot;

  /** Second snapshot name given to -diff/-rdiff. */
  private final String toSnapshot;

  /** The path to a file containing a list of paths to filter out of copy. */
  private final String filtersFile;

  /** Path where output logs are stored. If not specified, it will use the
   * default value JobStagingDir/_logs and delete upon job completion. */
  private final Path logPath;

  /** Set the copy strategy to use. Should map to a strategy implementation
   * in distcp-default.xml. */
  private final String copyStrategy;

  /** Per map bandwidth in MB. */
  private final float mapBandwidth;

  /** The number of threads to use for listStatus. We allow max
   * {@link #MAX_NUM_LISTSTATUS_THREADS} threads. Setting numThreads to zero
   * signifies we should use the value from conf properties. */
  private final int numListstatusThreads;

  /** The max number of maps to use for copy. */
  private final int maxMaps;

  /** File attributes that need to be preserved. */
  private final EnumSet<FileAttribute> preserveStatus;

  // Size of chunk in number of blocks when splitting large files into chunks
  // to copy in parallel. Default is 0 and files are not split.
  private final int blocksPerChunk;

  /** Size of the copy buffer; the Builder substitutes the default for
   * non-positive values. */
  private final int copyBufferSize;

  /**
   * Attributes of a file that a copy can be asked to preserve.
   *
   * The symbol of each constant is the first character of its name.
   */
  public enum FileAttribute {
    REPLICATION,    // R
    BLOCKSIZE,      // B
    USER,           // U
    GROUP,          // G
    PERMISSION,     // P
    CHECKSUMTYPE,   // C
    ACL,            // A
    XATTR,          // X
    TIMES;          // T

    /**
     * Looks up the attribute whose name starts with the given symbol. The
     * symbol is upper-cased before comparison, so lookup is case-insensitive.
     *
     * @param symbol single-character symbol of the wanted attribute
     * @return the first constant, in declaration order, matching the symbol
     * @throws NoSuchElementException if no constant starts with the symbol
     */
    public static FileAttribute getAttribute(char symbol) {
      final char wanted = Character.toUpperCase(symbol);
      final FileAttribute[] candidates = values();
      for (int i = 0; i < candidates.length; i++) {
        final FileAttribute candidate = candidates[i];
        if (candidate.name().charAt(0) == wanted) {
          return candidate;
        }
      }
      throw new NoSuchElementException("No attribute for " + symbol);
    }
  }

  /**
   * Creates the options from the current state of the given builder.
   *
   * The mutable collections held by the builder (the source path list and the
   * set of preserved attributes) are copied instead of shared. Without the
   * copies, later calls on the builder, or changes to the list originally
   * handed to it, would alter this instance after construction.
   *
   * @param builder the builder holding the option values
   */
  private DistCpOptions(Builder builder) {
    this.sourceFileListing = builder.sourceFileListing;
    // A builder created from a source file listing has no source path list.
    if (builder.sourcePaths == null) {
      this.sourcePaths = null;
    } else {
      this.sourcePaths = new ArrayList<Path>(builder.sourcePaths);
    }
    this.targetPath = builder.targetPath;

    this.atomicCommit = builder.atomicCommit;
    this.atomicWorkPath = builder.atomicWorkPath;
    this.syncFolder = builder.syncFolder;
    this.deleteMissing = builder.deleteMissing;
    this.ignoreFailures = builder.ignoreFailures;
    this.overwrite = builder.overwrite;
    this.append = builder.append;
    this.skipCRC = builder.skipCRC;
    this.blocking = builder.blocking;

    this.useDiff = builder.useDiff;
    this.useRdiff = builder.useRdiff;
    this.fromSnapshot = builder.fromSnapshot;
    this.toSnapshot = builder.toSnapshot;

    this.filtersFile = builder.filtersFile;
    this.logPath = builder.logPath;
    this.copyStrategy = builder.copyStrategy;

    this.mapBandwidth = builder.mapBandwidth;
    this.numListstatusThreads = builder.numListstatusThreads;
    this.maxMaps = builder.maxMaps;

    // Snapshot the attribute set so builder.preserve(...) calls made after
    // build() do not leak into this instance.
    this.preserveStatus = builder.preserveStatus == null
        ? null : builder.preserveStatus.clone();

    this.blocksPerChunk = builder.blocksPerChunk;

    this.copyBufferSize = builder.copyBufferSize;
    this.verboseLog = builder.verboseLog;
    this.trackPath = builder.trackPath;
  }

  /** @return the path of the file listing the sources, or null if unset. */
  public Path getSourceFileListing() {
    return sourceFileListing;
  }

  /**
   * @return an unmodifiable view of the source paths, or null if no source
   *         path list was supplied
   */
  public List<Path> getSourcePaths() {
    if (sourcePaths == null) {
      return null;
    }
    return Collections.unmodifiableList(sourcePaths);
  }

  /** @return the destination path of the copy. */
  public Path getTargetPath() {
    return targetPath;
  }

  /** @return true if atomic commit was requested. */
  public boolean shouldAtomicCommit() {
    return atomicCommit;
  }

  /** @return the work path for atomic commit, or null if unset. */
  public Path getAtomicWorkPath() {
    return atomicWorkPath;
  }

  /** @return true if source and target folders should be synced. */
  public boolean shouldSyncFolder() {
    return syncFolder;
  }

  /** @return true if files only present in target should be deleted. */
  public boolean shouldDeleteMissing() {
    return deleteMissing;
  }

  /** @return true if failures during copy should be ignored. */
  public boolean shouldIgnoreFailures() {
    return ignoreFailures;
  }

  /** @return true if files should always be overwritten on target. */
  public boolean shouldOverwrite() {
    return overwrite;
  }

  /** @return true if new data should be appended to target files. */
  public boolean shouldAppend() {
    return append;
  }

  /** @return true if checksum comparison should be skipped. */
  public boolean shouldSkipCRC() {
    return skipCRC;
  }

  /** @return true to run blocking, false to run non-blocking. */
  public boolean shouldBlock() {
    return blocking;
  }

  /** @return true if the forward snapshot diff (-diff) is in use. */
  public boolean shouldUseDiff() {
    return useDiff;
  }

  /** @return true if the reversed snapshot diff (-rdiff) is in use. */
  public boolean shouldUseRdiff() {
    return useRdiff;
  }

  /** @return true if either -diff or -rdiff is in use. */
  public boolean shouldUseSnapshotDiff() {
    return shouldUseDiff() || shouldUseRdiff();
  }

  /** @return the first snapshot name given to -diff/-rdiff, or null. */
  public String getFromSnapshot() {
    return fromSnapshot;
  }

  /** @return the second snapshot name given to -diff/-rdiff, or null. */
  public String getToSnapshot() {
    return toSnapshot;
  }

  /** @return the path of the filters file, or null if unset. */
  public String getFiltersFile() {
    return filtersFile;
  }

  /** @return the path where output logs are stored, or null if unset. */
  public Path getLogPath() {
    return logPath;
  }

  /** @return the name of the copy strategy to use. */
  public String getCopyStrategy() {
    return copyStrategy;
  }

  /**
   * @return the number of listStatus threads; zero means the flag was not set
   */
  public int getNumListstatusThreads() {
    return numListstatusThreads;
  }

  /** @return the max number of maps to use for copy. */
  public int getMaxMaps() {
    return maxMaps;
  }

  /**
   * @return the per map bandwidth in MB; zero means the default should be used
   */
  public float getMapBandwidth() {
    return mapBandwidth;
  }

  /**
   * @return an unmodifiable view of the attributes to preserve, or null if
   *         the underlying set is null
   */
  public Set<FileAttribute> getPreserveAttributes() {
    if (preserveStatus == null) {
      return null;
    }
    return Collections.unmodifiableSet(preserveStatus);
  }

  /**
   * Checks if the input attribute should be preserved or not.
   *
   * @param attribute - Attribute to check
   * @return True if attribute should be preserved, false otherwise
   */
  public boolean shouldPreserve(FileAttribute attribute) {
    return preserveStatus.contains(attribute);
  }

  /**
   * @return the chunk size in number of blocks used when splitting large
   *         files; zero means files are not split
   */
  public int getBlocksPerChunk() {
    return blocksPerChunk;
  }

  /** @return the copy buffer size. */
  public int getCopyBufferSize() {
    return copyBufferSize;
  }

  /** @return true if additional info should be logged in the SKIP/COPY log. */
  public boolean shouldVerboseLog() {
    return verboseLog;
  }

  /** @return the path to save source/dest sequence files to, or null. */
  public Path getTrackPath() {
    return trackPath;
  }

  /**
   * Copies the options consumed by the Mapper/committer into a configuration.
   *
   * Boolean switches, the preserved attributes, the chunk size and the copy
   * buffer size are always written. Bandwidth and listStatus thread count are
   * written only when positive; the filters file and the track path only when
   * non-null.
   *
   * @param conf - Configuration object to which the options need to be added
   */
  public void appendToConf(Configuration conf) {
    // Switches that are written unconditionally.
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.ATOMIC_COMMIT,
        Boolean.toString(atomicCommit));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.IGNORE_FAILURES,
        Boolean.toString(ignoreFailures));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SYNC_FOLDERS,
        Boolean.toString(syncFolder));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.DELETE_MISSING,
        Boolean.toString(deleteMissing));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.OVERWRITE,
        Boolean.toString(overwrite));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.APPEND,
        Boolean.toString(append));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.DIFF,
        Boolean.toString(useDiff));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.RDIFF,
        Boolean.toString(useRdiff));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SKIP_CRC,
        Boolean.toString(skipCRC));

    // A bandwidth of zero (or less) is not written.
    if (mapBandwidth > 0) {
      DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.BANDWIDTH,
          Float.toString(mapBandwidth));
    }

    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.PRESERVE_STATUS,
        DistCpUtils.packAttributes(preserveStatus));

    if (filtersFile != null) {
      DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.FILTERS,
          filtersFile);
    }

    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.BLOCKS_PER_CHUNK,
        Integer.toString(blocksPerChunk));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.COPY_BUFFER_SIZE,
        Integer.toString(copyBufferSize));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.VERBOSE_LOG,
        Boolean.toString(verboseLog));

    if (trackPath != null) {
      DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.TRACK_MISSING,
          trackPath.toString());
    }

    // A thread count of zero is not written.
    if (numListstatusThreads > 0) {
      DistCpOptionSwitch.addToConf(conf,
          DistCpOptionSwitch.NUM_LISTSTATUS_THREADS,
          Integer.toString(numListstatusThreads));
    }
  }

  /**
   * Renders the options as a single line of text, for logging.
   *
   * The track path is not part of the rendered text.
   *
   * @return String representation of the Options.
   */
  @Override
  public String toString() {
    final StringBuilder text = new StringBuilder("DistCpOptions{");
    text.append("atomicCommit=").append(atomicCommit);
    text.append(", syncFolder=").append(syncFolder);
    text.append(", deleteMissing=").append(deleteMissing);
    text.append(", ignoreFailures=").append(ignoreFailures);
    text.append(", overwrite=").append(overwrite);
    text.append(", append=").append(append);
    text.append(", useDiff=").append(useDiff);
    text.append(", useRdiff=").append(useRdiff);
    text.append(", fromSnapshot=").append(fromSnapshot);
    text.append(", toSnapshot=").append(toSnapshot);
    text.append(", skipCRC=").append(skipCRC);
    text.append(", blocking=").append(blocking);
    text.append(", numListstatusThreads=").append(numListstatusThreads);
    text.append(", maxMaps=").append(maxMaps);
    text.append(", mapBandwidth=").append(mapBandwidth);
    // String-valued options are wrapped in single quotes.
    text.append(", copyStrategy='").append(copyStrategy).append('\'');
    text.append(", preserveStatus=").append(preserveStatus);
    text.append(", atomicWorkPath=").append(atomicWorkPath);
    text.append(", logPath=").append(logPath);
    text.append(", sourceFileListing=").append(sourceFileListing);
    text.append(", sourcePaths=").append(sourcePaths);
    text.append(", targetPath=").append(targetPath);
    text.append(", filtersFile='").append(filtersFile).append('\'');
    text.append(", blocksPerChunk=").append(blocksPerChunk);
    text.append(", copyBufferSize=").append(copyBufferSize);
    text.append(", verboseLog=").append(verboseLog);
    text.append('}');
    return text.toString();
  }

  /**
   * The builder of the {@link DistCpOptions}.
   *
   * This is designed to be the only public interface to create a
   * {@link DistCpOptions} object for users. It follows a simple Builder design
   * pattern.
   */
  public static class Builder {
    private Path sourceFileListing;
    private List<Path> sourcePaths;
    private Path targetPath;

    private boolean atomicCommit = false;
    private Path atomicWorkPath;
    private boolean syncFolder = false;
    private boolean deleteMissing = false;
    private boolean ignoreFailures = false;
    private boolean overwrite = false;
    private boolean append = false;
    private boolean skipCRC = false;
    private boolean blocking = true;
    private boolean verboseLog = false;

    private boolean useDiff = false;
    private boolean useRdiff = false;
    private String fromSnapshot;
    private String toSnapshot;

    private String filtersFile;

    private Path logPath;
    private Path trackPath;
    private String copyStrategy = DistCpConstants.UNIFORMSIZE;

    private int numListstatusThreads = 0;  // 0 indicates that flag is not set.
    private int maxMaps = DistCpConstants.DEFAULT_MAPS;
    private float mapBandwidth = 0; // 0 indicates we should use the default

    private EnumSet<FileAttribute> preserveStatus =
        EnumSet.noneOf(FileAttribute.class);

    private int blocksPerChunk = 0;

    private int copyBufferSize =
            DistCpConstants.COPY_BUFFER_SIZE_DEFAULT;

    /**
     * Creates a builder that copies from an explicit list of source paths.
     *
     * @param sourcePaths source paths; must be neither null nor empty
     * @param targetPath destination path; must not be null
     * @throws IllegalArgumentException if an argument violates the above
     */
    public Builder(List<Path> sourcePaths, Path targetPath) {
      Preconditions.checkArgument(sourcePaths != null && !sourcePaths.isEmpty(),
          "Source paths should not be null or empty!");
      Preconditions.checkArgument(targetPath != null,
          "Target path should not be null!");
      this.sourcePaths = sourcePaths;
      this.targetPath = targetPath;
    }

    /**
     * Creates a builder that copies the files named in a listing file.
     *
     * @param sourceFileListing path of the listing file; must not be null
     * @param targetPath destination path; must not be null
     * @throws IllegalArgumentException if an argument is null
     */
    public Builder(Path sourceFileListing, Path targetPath) {
      Preconditions.checkArgument(sourceFileListing != null,
          "Source file listing should not be null!");
      Preconditions.checkArgument(targetPath != null,
          "Target path should not be null!");

      this.sourceFileListing = sourceFileListing;
      this.targetPath = targetPath;
    }

    /**
     * This is the single entry point for constructing DistCpOptions objects.
     *
     * Before a new DistCpOptions object is returned, it will set the dependent
     * options, validate the option combinations. After constructing, the
     * DistCpOptions instance is immutable.
     *
     * @return the options built from the current state of this builder
     * @throws IllegalArgumentException if the option combination is invalid
     */
    public DistCpOptions build() {
      setOptionsForSplitLargeFile();

      validate();

      return new DistCpOptions(this);
    }

    /**
     * Override options for split large files: when a positive blocks-per-chunk
     * value is set, block size preservation is turned on and append is turned
     * off.
     */
    private void setOptionsForSplitLargeFile() {
      if (blocksPerChunk <= 0) {
        return;
      }

      final String chunkSwitch =
          DistCpOptionSwitch.BLOCKS_PER_CHUNK.getSwitch();

      LOG.info("Enabling preserving blocksize since {} is passed.",
          chunkSwitch);
      preserve(FileAttribute.BLOCKSIZE);

      LOG.info("Set {} to false since {} is passed.",
          DistCpOptionSwitch.APPEND.getSwitch(), chunkSwitch);
      this.append = false;
    }

    /**
     * Rejects option combinations that cannot be used together.
     *
     * @throws IllegalArgumentException on the first invalid combination found
     */
    private void validate() {
      if ((useDiff || useRdiff) && deleteMissing) {
        // -delete and -diff/-rdiff are mutually exclusive.
        throw new IllegalArgumentException("-delete and -diff/-rdiff are "
            + "mutually exclusive. The -delete option will be ignored.");
      }

      if (!atomicCommit && atomicWorkPath != null) {
        throw new IllegalArgumentException(
            "-tmp work-path can only be specified along with -atomic");
      }

      // NOTE(review): the message also mentions overwrite, but only the
      // combination with sync folder is rejected here.
      if (syncFolder && atomicCommit) {
        throw new IllegalArgumentException("Atomic commit can't be used with "
            + "sync folder or overwrite options");
      }

      if (deleteMissing && !(overwrite || syncFolder)) {
        throw new IllegalArgumentException("Delete missing is applicable "
            + "only with update or overwrite options");
      }

      if (overwrite && syncFolder) {
        throw new IllegalArgumentException("Overwrite and update options are "
            + "mutually exclusive");
      }

      if (!syncFolder && append) {
        throw new IllegalArgumentException(
            "Append is valid only with update options");
      }
      if (skipCRC && append) {
        throw new IllegalArgumentException(
            "Append is disallowed when skipping CRC");
      }
      if (!syncFolder && (useDiff || useRdiff)) {
        throw new IllegalArgumentException(
            "-diff/-rdiff is valid only with -update option");
      }

      if (useDiff || useRdiff) {
        if (StringUtils.isBlank(fromSnapshot) ||
            StringUtils.isBlank(toSnapshot)) {
          throw new IllegalArgumentException(
              "Must provide both the starting and ending " +
                  "snapshot names for -diff/-rdiff");
        }
      }
      if (useDiff && useRdiff) {
        throw new IllegalArgumentException(
            "-diff and -rdiff are mutually exclusive");
      }

      if (verboseLog && logPath == null) {
        throw new IllegalArgumentException(
            "-v is valid only with -log option");
      }
    }

    /**
     * Replaces the source paths without the checks done by the constructor.
     *
     * @param newSourcePaths the new source paths
     * @return this builder
     */
    @VisibleForTesting
    Builder withSourcePaths(List<Path> newSourcePaths) {
      this.sourcePaths = newSourcePaths;
      return this;
    }

    /**
     * @param newAtomicCommit whether to commit atomically
     * @return this builder
     */
    public Builder withAtomicCommit(boolean newAtomicCommit) {
      this.atomicCommit = newAtomicCommit;
      return this;
    }

    /**
     * @param newAtomicWorkPath work path for atomic commit; only accepted by
     *                          {@link #build()} together with atomic commit
     * @return this builder
     */
    public Builder withAtomicWorkPath(Path newAtomicWorkPath) {
      this.atomicWorkPath = newAtomicWorkPath;
      return this;
    }

    /**
     * @param newSyncFolder whether to sync source and target folder contents
     * @return this builder
     */
    public Builder withSyncFolder(boolean newSyncFolder) {
      this.syncFolder = newSyncFolder;
      return this;
    }

    /**
     * @param newDeleteMissing whether to delete files only present in target
     * @return this builder
     */
    public Builder withDeleteMissing(boolean newDeleteMissing) {
      this.deleteMissing = newDeleteMissing;
      return this;
    }

    /**
     * @param newIgnoreFailures whether to ignore failures during copy
     * @return this builder
     */
    public Builder withIgnoreFailures(boolean newIgnoreFailures) {
      this.ignoreFailures = newIgnoreFailures;
      return this;
    }

    /**
     * @param newOverwrite whether to always overwrite files on target
     * @return this builder
     */
    public Builder withOverwrite(boolean newOverwrite) {
      this.overwrite = newOverwrite;
      return this;
    }

    /**
     * @param newAppend whether to append new data to target files
     * @return this builder
     */
    public Builder withAppend(boolean newAppend) {
      this.append = newAppend;
      return this;
    }

    /**
     * Sets the skip-CRC flag. Despite the method name, passing true means
     * the checksum comparison is skipped.
     *
     * @param newSkipCRC whether to skip the checksum comparison
     * @return this builder
     */
    public Builder withCRC(boolean newSkipCRC) {
      this.skipCRC = newSkipCRC;
      return this;
    }

    /**
     * @param newBlocking true to run blocking, false to run non-blocking
     * @return this builder
     */
    public Builder withBlocking(boolean newBlocking) {
      this.blocking = newBlocking;
      return this;
    }

    /**
     * Turns on the forward snapshot diff (-diff).
     *
     * @param newFromSnapshot first snapshot name
     * @param newToSnapshot second snapshot name
     * @return this builder
     */
    public Builder withUseDiff(String newFromSnapshot, String newToSnapshot) {
      this.useDiff = true;
      this.fromSnapshot = newFromSnapshot;
      this.toSnapshot = newToSnapshot;
      return this;
    }

    /**
     * Turns on the reversed snapshot diff (-rdiff).
     *
     * @param newFromSnapshot first snapshot name
     * @param newToSnapshot second snapshot name
     * @return this builder
     */
    public Builder withUseRdiff(String newFromSnapshot, String newToSnapshot) {
      this.useRdiff = true;
      this.fromSnapshot = newFromSnapshot;
      this.toSnapshot = newToSnapshot;
      return this;
    }

    /**
     * @param newFiltersFile path to a file listing paths to filter out
     * @return this builder
     */
    public Builder withFiltersFile(String newFiltersFile) {
      this.filtersFile = newFiltersFile;
      return this;
    }

    /**
     * @param newLogPath path where output logs are stored
     * @return this builder
     */
    public Builder withLogPath(Path newLogPath) {
      this.logPath = newLogPath;
      return this;
    }

    /**
     * @param path path to save source/dest sequence files to
     * @return this builder
     */
    public Builder withTrackMissing(Path path) {
      this.trackPath = path;
      return this;
    }

    /**
     * @param newCopyStrategy name of the copy strategy to use
     * @return this builder
     */
    public Builder withCopyStrategy(String newCopyStrategy) {
      this.copyStrategy = newCopyStrategy;
      return this;
    }

    /**
     * @param newMapBandwidth per map bandwidth; must be greater than zero
     * @return this builder
     * @throws IllegalArgumentException if the bandwidth is not positive
     */
    public Builder withMapBandwidth(float newMapBandwidth) {
      Preconditions.checkArgument(newMapBandwidth > 0,
          "Bandwidth " + newMapBandwidth + " is invalid (should be > 0)");
      this.mapBandwidth = newMapBandwidth;
      return this;
    }

    /**
     * Sets the listStatus thread count, clamped to the range
     * [0, {@link #MAX_NUM_LISTSTATUS_THREADS}].
     *
     * @param newNumListstatusThreads requested number of threads
     * @return this builder
     */
    public Builder withNumListstatusThreads(int newNumListstatusThreads) {
      this.numListstatusThreads = Math.max(0,
          Math.min(newNumListstatusThreads, MAX_NUM_LISTSTATUS_THREADS));
      return this;
    }

    /**
     * @param newMaxMaps max number of maps; values below one are raised to one
     * @return this builder
     */
    public Builder maxMaps(int newMaxMaps) {
      this.maxMaps = Math.max(newMaxMaps, 1);
      return this;
    }

    /**
     * Adds the attributes named by the given symbols to the set to preserve.
     * A null or empty string selects every attribute.
     *
     * @param attributes one symbol character per attribute
     * @return this builder
     * @throws java.util.NoSuchElementException if a symbol matches no attribute
     */
    public Builder preserve(String attributes) {
      if (attributes == null || attributes.isEmpty()) {
        preserveStatus = EnumSet.allOf(FileAttribute.class);
      } else {
        for (int index = 0; index < attributes.length(); index++) {
          preserveStatus.add(FileAttribute.
              getAttribute(attributes.charAt(index)));
        }
      }
      return this;
    }

    /**
     * @param attribute attribute to add to the set to preserve
     * @return this builder
     */
    public Builder preserve(FileAttribute attribute) {
      preserveStatus.add(attribute);
      return this;
    }

    /**
     * @param newBlocksPerChunk chunk size in number of blocks; values of zero
     *                          or less leave large-file splitting off
     * @return this builder
     */
    public Builder withBlocksPerChunk(int newBlocksPerChunk) {
      this.blocksPerChunk = newBlocksPerChunk;
      return this;
    }

    /**
     * @param newCopyBufferSize copy buffer size; non-positive values select
     *                          the default size
     * @return this builder
     */
    public Builder withCopyBufferSize(int newCopyBufferSize) {
      this.copyBufferSize =
          newCopyBufferSize > 0 ? newCopyBufferSize
              : DistCpConstants.COPY_BUFFER_SIZE_DEFAULT;
      return this;
    }

    /**
     * @param newVerboseLog whether to log additional info; only accepted by
     *                      {@link #build()} when a log path is set
     * @return this builder
     */
    public Builder withVerboseLog(boolean newVerboseLog) {
      this.verboseLog = newVerboseLog;
      return this;
    }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy