All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.tools.DistCpOptions Maven / Gradle / Ivy

There is a newer version: 3.4.1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.tools;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.util.DistCpUtils;

import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

/**
 * The Options class encapsulates all DistCp options.
 * These may be set from command-line (via the OptionsParser)
 * or may be set manually.
 *
 * <p>Instances are mutable and perform no synchronization. Use the copy
 * constructor (or {@link #clone()}) to obtain an independent snapshot.
 */
public class DistCpOptions implements Cloneable {

  private boolean atomicCommit = false;
  private boolean syncFolder = false;
  private boolean deleteMissing = false;
  private boolean ignoreFailures = false;
  private boolean overwrite = false;
  private boolean append = false;
  private boolean skipCRC = false;
  private boolean blocking = true;
  // When "-diff s1 s2 src tgt" is passed, apply forward snapshot diff (from s1
  // to s2) of source cluster to the target cluster to sync target cluster with
  // the source cluster. Referred to as "Fdiff" in the code.
  // It's required that s2 is newer than s1.
  private boolean useDiff = false;

  // When "-rdiff s2 s1 src tgt" is passed, apply reversed snapshot diff (from
  // s2 to s1) of target cluster to the target cluster, so to make target
  // cluster go back to s1. Referred to as "Rdiff" in the code.
  // It's required that s2 is newer than s1, and src and tgt have exact same
  // content at their s1, if src is not the same as tgt.
  private boolean useRdiff = false;

  // For both -diff and -rdiff, given the example command line switches, two
  // steps are taken:
  //   1. Sync Step. This step does renaming/deletion ops in the snapshot diff,
  //      so to avoid copying files copied already but renamed later(HDFS-7535)
  //   2. Copy Step. This step copy the necessary files from src to tgt
  //      2.1 For -diff, it copies from snapshot s2 of src (HDFS-8828)
  //      2.2 For -rdiff, it copies from snapshot s1 of src, where the src
  //          could be the tgt itself (HDFS-9820).
  //

  /** Upper bound applied by {@link #setNumListstatusThreads(int)}. */
  public static final int maxNumListstatusThreads = 40;
  private int numListstatusThreads = 0;  // Indicates that flag is not set.
  private int maxMaps = DistCpConstants.DEFAULT_MAPS;
  private float mapBandwidth = 0;  // Indicates that we should use the default.

  private String copyStrategy = DistCpConstants.UNIFORMSIZE;

  private EnumSet<FileAttribute> preserveStatus =
      EnumSet.noneOf(FileAttribute.class);

  private boolean preserveRawXattrs;

  private Path atomicWorkPath;

  private Path logPath;

  private Path sourceFileListing;
  private List<Path> sourcePaths;

  private String fromSnapshot;
  private String toSnapshot;

  private Path targetPath;

  /**
   * The path to a file containing a list of paths to filter out of the copy.
   */
  private String filtersFile;

  // targetPathExist is a derived field, it's initialized in the
  // beginning of distcp.
  private boolean targetPathExists = true;

  /**
   * File attributes that can be preserved during the copy. Each attribute is
   * addressed on the command line by the first letter of its name.
   */
  public enum FileAttribute {
    REPLICATION, BLOCKSIZE, USER, GROUP, PERMISSION, CHECKSUMTYPE, ACL, XATTR, TIMES;

    /**
     * Looks up the attribute whose name starts with the given character.
     * The comparison is case-insensitive (the symbol is upper-cased first).
     *
     * @param symbol first letter of the attribute name
     * @return the first attribute, in declaration order, matching the symbol
     * @throws NoSuchElementException if no attribute starts with the symbol
     */
    public static FileAttribute getAttribute(char symbol) {
      final char wanted = Character.toUpperCase(symbol);
      for (FileAttribute attribute : values()) {
        if (attribute.name().charAt(0) == wanted) {
          return attribute;
        }
      }
      throw new NoSuchElementException("No attribute for " + symbol);
    }
  }

  /**
   * Constructor, to initialize source/target paths.
   * The arguments are only checked via {@code assert}, i.e. when assertions
   * are enabled. The list is stored by reference, not copied.
   *
   * @param sourcePaths List of source-paths (including wildcards)
   *                     to be copied to target.
   * @param targetPath Destination path for the dist-copy.
   */
  public DistCpOptions(List<Path> sourcePaths, Path targetPath) {
    assert sourcePaths != null && !sourcePaths.isEmpty() : "Invalid source paths";
    assert targetPath != null : "Invalid Target path";

    this.sourcePaths = sourcePaths;
    this.targetPath = targetPath;
  }

  /**
   * Constructor, to initialize source/target paths.
   * The arguments are only checked via {@code assert}, i.e. when assertions
   * are enabled.
   *
   * @param sourceFileListing File containing list of source paths
   * @param targetPath Destination path for the dist-copy.
   */
  public DistCpOptions(Path sourceFileListing, Path targetPath) {
    assert sourceFileListing != null : "Invalid source paths";
    assert targetPath != null : "Invalid Target path";

    this.sourceFileListing = sourceFileListing;
    this.targetPath = targetPath;
  }

  /**
   * Copy constructor. Every option of {@code that} is copied, including the
   * append flag and the snapshot names used by -diff/-rdiff. The set of
   * preserved attributes is duplicated so that later calls to
   * {@link #preserve(FileAttribute)} on either object do not affect the
   * other. The source-path list is shared by reference.
   *
   * @param that DistCpOptions being copied from. If null, the new object
   *             keeps the field defaults and has no source or target path.
   */
  public DistCpOptions(DistCpOptions that) {
    if (that == null) {
      return;
    }
    this.atomicCommit = that.atomicCommit;
    this.syncFolder = that.syncFolder;
    this.deleteMissing = that.deleteMissing;
    this.ignoreFailures = that.ignoreFailures;
    this.overwrite = that.overwrite;
    this.append = that.append;
    this.skipCRC = that.skipCRC;
    this.blocking = that.blocking;
    this.useDiff = that.useDiff;
    this.useRdiff = that.useRdiff;
    // Without the snapshot names a copied -diff/-rdiff configuration would
    // be rejected by validate().
    this.fromSnapshot = that.fromSnapshot;
    this.toSnapshot = that.toSnapshot;
    this.numListstatusThreads = that.numListstatusThreads;
    this.maxMaps = that.maxMaps;
    this.mapBandwidth = that.mapBandwidth;
    this.copyStrategy = that.copyStrategy;
    // Own copy: the set is mutable and must not be aliased between instances.
    this.preserveStatus = EnumSet.copyOf(that.preserveStatus);
    this.preserveRawXattrs = that.preserveRawXattrs;
    this.atomicWorkPath = that.getAtomicWorkPath();
    this.logPath = that.getLogPath();
    this.sourceFileListing = that.getSourceFileListing();
    this.sourcePaths = that.getSourcePaths();
    this.targetPath = that.getTargetPath();
    this.targetPathExists = that.getTargetPathExists();
    this.filtersFile = that.getFiltersFile();
  }

  /**
   * Should the data be committed atomically?
   *
   * @return true if data should be committed atomically. false otherwise
   */
  public boolean shouldAtomicCommit() {
    return atomicCommit;
  }

  /**
   * Set if data need to be committed atomically
   *
   * @param atomicCommit - boolean switch
   */
  public void setAtomicCommit(boolean atomicCommit) {
    this.atomicCommit = atomicCommit;
  }

  /**
   * Should the data be sync'ed between source and target paths?
   *
   * @return true if data should be sync'ed up. false otherwise
   */
  public boolean shouldSyncFolder() {
    return syncFolder;
  }

  /**
   * Set if source and target folder contents be sync'ed up
   *
   * @param syncFolder - boolean switch
   */
  public void setSyncFolder(boolean syncFolder) {
    this.syncFolder = syncFolder;
  }

  /**
   * Should target files missing in source be deleted?
   *
   * @return true if zombie target files are to be removed. false otherwise
   */
  public boolean shouldDeleteMissing() {
    return deleteMissing;
  }

  /**
   * Set if files only present in target should be deleted
   *
   * @param deleteMissing - boolean switch
   */
  public void setDeleteMissing(boolean deleteMissing) {
    this.deleteMissing = deleteMissing;
  }

  /**
   * Should failures be logged and ignored during copy?
   *
   * @return true if failures are to be logged and ignored. false otherwise
   */
  public boolean shouldIgnoreFailures() {
    return ignoreFailures;
  }

  /**
   * Set if failures during copy be ignored
   *
   * @param ignoreFailures - boolean switch
   */
  public void setIgnoreFailures(boolean ignoreFailures) {
    this.ignoreFailures = ignoreFailures;
  }

  /**
   * Should DistCp be running in blocking mode
   *
   * @return true if should run in blocking, false otherwise
   */
  public boolean shouldBlock() {
    return blocking;
  }

  /**
   * Set if DistCp should run blocking or non-blocking
   *
   * @param blocking - boolean switch
   */
  public void setBlocking(boolean blocking) {
    this.blocking = blocking;
  }

  /**
   * Should files be overwritten always?
   *
   * @return true if files in target that may exist before distcp, should always
   *         be overwritten. false otherwise
   */
  public boolean shouldOverwrite() {
    return overwrite;
  }

  /**
   * Set if files should always be overwritten on target
   *
   * @param overwrite - boolean switch
   */
  public void setOverwrite(boolean overwrite) {
    this.overwrite = overwrite;
  }

  /**
   * @return whether we can append new data to target files
   */
  public boolean shouldAppend() {
    return append;
  }

  /**
   * Set if we want to append new data to target files. This is valid only with
   * update option and CRC is not skipped.
   *
   * @param append - boolean switch
   */
  public void setAppend(boolean append) {
    this.append = append;
  }

  /**
   * @return true if a forward snapshot diff (-diff) was requested
   */
  public boolean shouldUseDiff() {
    return this.useDiff;
  }

  /**
   * @return true if a reversed snapshot diff (-rdiff) was requested
   */
  public boolean shouldUseRdiff() {
    return this.useRdiff;
  }

  /**
   * @return true if either -diff or -rdiff was requested
   */
  public boolean shouldUseSnapshotDiff() {
    return shouldUseDiff() || shouldUseRdiff();
  }

  /**
   * @return the starting snapshot name given to -diff/-rdiff, or null if
   *         neither was set
   */
  public String getFromSnapshot() {
    return this.fromSnapshot;
  }

  /**
   * @return the ending snapshot name given to -diff/-rdiff, or null if
   *         neither was set
   */
  public String getToSnapshot() {
    return this.toSnapshot;
  }

  /**
   * Enable the forward snapshot diff (-diff) and record its snapshot names.
   *
   * @param fromSS starting snapshot name
   * @param toSS ending snapshot name
   */
  public void setUseDiff(String fromSS, String toSS) {
    this.useDiff = true;
    this.fromSnapshot = fromSS;
    this.toSnapshot = toSS;
  }

  /**
   * Enable the reversed snapshot diff (-rdiff) and record its snapshot names.
   *
   * @param fromSS starting snapshot name
   * @param toSS ending snapshot name
   */
  public void setUseRdiff(String fromSS, String toSS) {
    this.useRdiff = true;
    this.fromSnapshot = fromSS;
    this.toSnapshot = toSS;
  }

  /**
   * Should CRC/checksum check be skipped while checking files are identical
   *
   * @return true if checksum check should be skipped while checking files are
   *         identical. false otherwise
   */
  public boolean shouldSkipCRC() {
    return skipCRC;
  }

  /**
   * Set if checksum comparison should be skipped while determining if
   * source and destination files are identical
   *
   * @param skipCRC - boolean switch
   */
  public void setSkipCRC(boolean skipCRC) {
    this.skipCRC = skipCRC;
  }

  /** Get the number of threads to use for listStatus
   *
   * @return Number of threads to do listStatus
   */
  public int getNumListstatusThreads() {
    return numListstatusThreads;
  }

  /** Set the number of threads to use for listStatus. We allow max 40
   *  threads. Setting numThreads to zero signify we should use the value
   *  from conf properties. Negative values are treated like zero.
   *
   * @param numThreads - Number of threads
   */
  public void setNumListstatusThreads(int numThreads) {
    // Clamp into [0, maxNumListstatusThreads].
    this.numListstatusThreads =
        Math.min(Math.max(numThreads, 0), maxNumListstatusThreads);
  }

  /** Get the max number of maps to use for this copy
   *
   * @return Max number of maps
   */
  public int getMaxMaps() {
    return maxMaps;
  }

  /**
   * Set the max number of maps to use for copy. Values below 1 are
   * raised to 1.
   *
   * @param maxMaps - Number of maps
   */
  public void setMaxMaps(int maxMaps) {
    this.maxMaps = Math.max(maxMaps, 1);
  }

  /** Get the map bandwidth in MB
   *
   * @return Bandwidth in MB
   */
  public float getMapBandwidth() {
    return mapBandwidth;
  }

  /**
   * Set per map bandwidth. The value is only checked via {@code assert},
   * i.e. when assertions are enabled.
   *
   * @param mapBandwidth - per map bandwidth
   */
  public void setMapBandwidth(float mapBandwidth) {
    assert mapBandwidth > 0 : "Bandwidth " + mapBandwidth + " is invalid (should be > 0)";
    this.mapBandwidth = mapBandwidth;
  }

  /**
   * Returns an iterator with the list of file attributes to preserve
   *
   * @return iterator of file attributes to preserve
   */
  public Iterator<FileAttribute> preserveAttributes() {
    return preserveStatus.iterator();
  }

  /**
   * Checks if the input attribute should be preserved or not
   *
   * @param attribute - Attribute to check
   * @return True if attribute should be preserved, false otherwise
   */
  public boolean shouldPreserve(FileAttribute attribute) {
    return preserveStatus.contains(attribute);
  }

  /**
   * Add file attributes that need to be preserved. This method may be
   * called multiple times to add attributes; adding an attribute that is
   * already present has no effect.
   *
   * @param fileAttribute - Attribute to add, one at a time
   */
  public void preserve(FileAttribute fileAttribute) {
    // The set already ignores duplicates, so no explicit membership scan.
    preserveStatus.add(fileAttribute);
  }

  /**
   * Return true if raw.* xattrs should be preserved.
   * @return true if raw.* xattrs should be preserved.
   */
  public boolean shouldPreserveRawXattrs() {
    return preserveRawXattrs;
  }

  /**
   * Indicate that raw.* xattrs should be preserved
   */
  public void preserveRawXattrs() {
    preserveRawXattrs = true;
  }

  /** Get work path for atomic commit. If null, the work
   * path would be parentOf(targetPath) + "/._WIP_" + nameOf(targetPath)
   *
   * @return Atomic work path on the target cluster. Null if not set
   */
  public Path getAtomicWorkPath() {
    return atomicWorkPath;
  }

  /**
   * Set the work path for atomic commit
   *
   * @param atomicWorkPath - Path on the target cluster
   */
  public void setAtomicWorkPath(Path atomicWorkPath) {
    this.atomicWorkPath = atomicWorkPath;
  }

  /** Get output directory for writing distcp logs. Otherwise logs
   * are temporarily written to JobStagingDir/_logs and deleted
   * upon job completion
   *
   * @return Log output path on the cluster where distcp job is run
   */
  public Path getLogPath() {
    return logPath;
  }

  /**
   * Set the log path where distcp output logs are stored
   * Uses JobStagingDir/_logs by default
   *
   * @param logPath - Path where logs will be saved
   */
  public void setLogPath(Path logPath) {
    this.logPath = logPath;
  }

  /**
   * Get the copy strategy to use. Uses appropriate input format
   *
   * @return copy strategy to use
   */
  public String getCopyStrategy() {
    return copyStrategy;
  }

  /**
   * Set the copy strategy to use. Should map to a strategy implementation
   * in distcp-default.xml
   *
   * @param copyStrategy - copy Strategy to use
   */
  public void setCopyStrategy(String copyStrategy) {
    this.copyStrategy = copyStrategy;
  }

  /**
   * File path (hdfs:// or file://) that contains the list of actual
   * files to copy
   *
   * @return - Source listing file path
   */
  public Path getSourceFileListing() {
    return sourceFileListing;
  }

  /**
   * Getter for sourcePaths.
   * @return List of source-paths (the stored list itself, not a copy).
   */
  public List<Path> getSourcePaths() {
    return sourcePaths;
  }

  /**
   * Setter for sourcePaths. The argument is only checked via {@code assert},
   * i.e. when assertions are enabled.
   *
   * @param sourcePaths The new list of source-paths.
   */
  public void setSourcePaths(List<Path> sourcePaths) {
    assert sourcePaths != null && sourcePaths.size() != 0;
    this.sourcePaths = sourcePaths;
  }

  /**
   * Getter for the targetPath.
   * @return The target-path.
   */
  public Path getTargetPath() {
    return targetPath;
  }

  /**
   * Getter for the targetPathExists.
   * @return Whether the target path is recorded as existing.
   */
  public boolean getTargetPathExists() {
    return targetPathExists;
  }

  /**
   * Set targetPathExists.
   * @param targetPathExists Whether the target path of distcp exists.
   * @return the value that was just stored.
   */
  public boolean setTargetPathExists(boolean targetPathExists) {
    this.targetPathExists = targetPathExists;
    return targetPathExists;
  }

  /**
   * File path that contains the list of patterns
   * for paths to be filtered from the file copy.
   * @return - Filter  file path.
   */
  public final String getFiltersFile() {
    return filtersFile;
  }

  /**
   * Set filtersFile.
   * @param filtersFilename The path to a list of patterns to exclude from copy.
   */
  public final void setFiltersFile(String filtersFilename) {
    this.filtersFile = filtersFilename;
  }

  /**
   * Checks that the currently set options form a legal combination.
   * If -delete is combined with -diff/-rdiff, the delete flag is cleared
   * with a warning instead of failing; every other conflict is rejected.
   *
   * @throws IllegalArgumentException if the options conflict with each other
   *         or the snapshot names for -diff/-rdiff are blank
   */
  void validate() {
    if ((useDiff || useRdiff) && deleteMissing) {
      // -delete and -diff/-rdiff are mutually exclusive. For backward
      // compatibility, we ignore the -delete option here, instead of throwing
      // an IllegalArgumentException. See HDFS-10397 for more discussion.
      OptionsParser.LOG.warn(
          "-delete and -diff/-rdiff are mutually exclusive. " +
          "The -delete option will be ignored.");
      setDeleteMissing(false);
    }

    if (syncFolder && atomicCommit) {
      throw new IllegalArgumentException("Atomic commit can't be used with " +
          "sync folder or overwrite options");
    }

    if (deleteMissing && !(overwrite || syncFolder)) {
      throw new IllegalArgumentException("Delete missing is applicable " +
          "only with update or overwrite options");
    }

    if (overwrite && syncFolder) {
      throw new IllegalArgumentException("Overwrite and update options are " +
          "mutually exclusive");
    }

    if (!syncFolder && skipCRC) {
      throw new IllegalArgumentException("Skip CRC is valid only with update options");
    }

    if (!syncFolder && append) {
      throw new IllegalArgumentException(
          "Append is valid only with update options");
    }
    if (skipCRC && append) {
      throw new IllegalArgumentException(
          "Append is disallowed when skipping CRC");
    }
    if (!syncFolder && (useDiff || useRdiff)) {
      throw new IllegalArgumentException(
          "-diff/-rdiff is valid only with -update option");
    }

    if (useDiff || useRdiff) {
      if (StringUtils.isBlank(fromSnapshot) ||
          StringUtils.isBlank(toSnapshot)) {
        throw new IllegalArgumentException(
            "Must provide both the starting and ending " +
            "snapshot names for -diff/-rdiff");
      }
    }
    if (useDiff && useRdiff) {
      throw new IllegalArgumentException(
          "-diff and -rdiff are mutually exclusive");
    }
  }

  /**
   * Add options to configuration. These will be used in the Mapper/committer.
   * The bandwidth is written only when it is positive, and the filters file
   * only when it is set.
   *
   * @param conf - Configuration object to which the options need to be added
   */
  public void appendToConf(Configuration conf) {
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.ATOMIC_COMMIT,
        String.valueOf(atomicCommit));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.IGNORE_FAILURES,
        String.valueOf(ignoreFailures));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SYNC_FOLDERS,
        String.valueOf(syncFolder));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.DELETE_MISSING,
        String.valueOf(deleteMissing));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.OVERWRITE,
        String.valueOf(overwrite));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.APPEND,
        String.valueOf(append));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.DIFF,
        String.valueOf(useDiff));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.RDIFF,
        String.valueOf(useRdiff));
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SKIP_CRC,
        String.valueOf(skipCRC));
    if (mapBandwidth > 0) {
      DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.BANDWIDTH,
          String.valueOf(mapBandwidth));
    }
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.PRESERVE_STATUS,
        DistCpUtils.packAttributes(preserveStatus));
    if (filtersFile != null) {
      DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.FILTERS,
          filtersFile);
    }
  }

  /**
   * Utility to easily string-ify Options, for logging.
   *
   * @return String representation of the Options.
   */
  @Override
  public String toString() {
    return "DistCpOptions{" +
        "atomicCommit=" + atomicCommit +
        ", syncFolder=" + syncFolder +
        ", deleteMissing=" + deleteMissing +
        ", ignoreFailures=" + ignoreFailures +
        ", overwrite=" + overwrite +
        ", append=" + append +
        ", useDiff=" + useDiff +
        ", useRdiff=" + useRdiff +
        ", fromSnapshot=" + fromSnapshot +
        ", toSnapshot=" + toSnapshot +
        ", skipCRC=" + skipCRC +
        ", blocking=" + blocking +
        ", numListstatusThreads=" + numListstatusThreads +
        ", maxMaps=" + maxMaps +
        ", mapBandwidth=" + mapBandwidth +
        ", copyStrategy='" + copyStrategy + '\'' +
        ", preserveStatus=" + preserveStatus +
        ", preserveRawXattrs=" + preserveRawXattrs +
        ", atomicWorkPath=" + atomicWorkPath +
        ", logPath=" + logPath +
        ", sourceFileListing=" + sourceFileListing +
        ", sourcePaths=" + sourcePaths +
        ", targetPath=" + targetPath +
        ", targetPathExists=" + targetPathExists +
        ", filtersFile='" + filtersFile + '\'' +
        '}';
  }

  /**
   * Creates a field-by-field copy of this object. The set of preserved
   * attributes is duplicated so the clone can be modified independently;
   * the source-path list is shared by reference.
   *
   * @return a copy of this object, of the same runtime class
   * @throws CloneNotSupportedException kept in the signature for
   *         compatibility; propagated from {@code Object.clone()}
   */
  @Override
  protected DistCpOptions clone() throws CloneNotSupportedException {
    DistCpOptions copy = (DistCpOptions) super.clone();
    // Object.clone() is shallow; give the clone its own mutable set.
    copy.preserveStatus = EnumSet.copyOf(preserveStatus);
    return copy;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy