All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.tools.DistCpSync Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.tools;

import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

/**
 * This class provides the basic functionality to sync two FileSystems based on
 * the snapshot diff report. More specifically, we have the following settings:
 * 1. Both the source and target FileSystem must be DistributedFileSystem
 * 2. Two snapshots (e.g., s1 and s2) have been created on the source FS.
 * The diff between these two snapshots will be copied to the target FS.
 * 3. The target has the same snapshot s1. No changes have been made on the
 * target since s1. All the files/directories in the target are the same as
 * those in source.s1
 */
class DistCpSync {

  /**
   * Try to apply the deletions and renames recorded in the source snapshot
   * diff to the target, so that a following copy only has to deal with the
   * remaining differences.
   *
   * Whenever the source and target file systems pass validation, the source
   * path stored in {@code inputOptions} is replaced by the path of the "to"
   * snapshot before returning, no matter whether the sync itself succeeded.
   *
   * @param inputOptions the distcp options; must contain exactly one source
   *                     path. Its source paths are rewritten by this method.
   * @param conf the configuration used to resolve the file systems
   * @return true if the diff was applied to the target; false if the target
   *         changed since the "from" snapshot, the diff could not be
   *         computed, or applying it failed
   * @throws IllegalArgumentException if the number of source paths is not one
   *         or either file system is not a DistributedFileSystem
   * @throws IOException if a file system cannot be resolved
   */
  static boolean sync(DistCpOptions inputOptions, Configuration conf)
      throws IOException {
    // The list must be typed: its only element is used as a Path below.
    List<Path> sourcePaths = inputOptions.getSourcePaths();
    if (sourcePaths.size() != 1) {
      // we only support one source dir which must be a snapshottable directory
      throw new IllegalArgumentException(sourcePaths.size()
          + " source paths are provided");
    }
    final Path sourceDir = sourcePaths.get(0);
    final Path targetDir = inputOptions.getTargetPath();

    final FileSystem sfs = sourceDir.getFileSystem(conf);
    final FileSystem tfs = targetDir.getFileSystem(conf);
    // currently we require both the source and the target file system are
    // DistributedFileSystem.
    if (!(sfs instanceof DistributedFileSystem) ||
        !(tfs instanceof DistributedFileSystem)) {
      throw new IllegalArgumentException("The FileSystems needs to" +
          " be DistributedFileSystem for using snapshot-diff-based distcp");
    }
    final DistributedFileSystem sourceFs = (DistributedFileSystem) sfs;
    final DistributedFileSystem targetFs= (DistributedFileSystem) tfs;

    // make sure targetFS has no change between from and the current states
    if (!checkNoChange(inputOptions, targetFs, targetDir)) {
      useToSnapshotAsSource(inputOptions, sourceDir);
      return false;
    }

    Path tmpDir = null;
    try {
      // Compute the diff before creating the tmp directory, so the target is
      // left untouched when the diff cannot be obtained.
      DiffInfo[] diffs = getDiffs(inputOptions, sourceFs, sourceDir, targetDir);
      if (diffs == null) {
        return false;
      }
      tmpDir = createTargetTmpDir(targetFs, targetDir);
      // do the real sync work: deletion and rename
      syncDiff(diffs, targetFs, tmpDir);
      return true;
    } catch (Exception e) {
      DistCp.LOG.warn("Failed to use snapshot diff for distcp", e);
      return false;
    } finally {
      deleteTargetTmpDir(targetFs, tmpDir);
      // TODO: since we have tmp directory, we can support "undo" with failures
      useToSnapshotAsSource(inputOptions, sourceDir);
    }
  }

  /**
   * Replace the source paths in the options with the single path that
   * addresses the "to" snapshot of the given source directory.
   */
  private static void useToSnapshotAsSource(DistCpOptions inputOptions,
      Path sourceDir) {
    inputOptions.setSourcePaths(Arrays.asList(getSourceSnapshotPath(sourceDir,
        inputOptions.getToSnapshot())));
  }

  /**
   * Map the {@link Path#CUR_DIR} placeholder to the empty string and return
   * any other snapshot name unchanged.
   */
  private static String getSnapshotName(String name) {
    return Path.CUR_DIR.equals(name) ? "" : name;
  }

  /**
   * Build the path to read the source from: the source directory itself when
   * the snapshot name is {@link Path#CUR_DIR}, otherwise the named snapshot
   * below the directory's snapshot folder.
   */
  private static Path getSourceSnapshotPath(Path sourceDir, String snapshotName) {
    if (Path.CUR_DIR.equals(snapshotName)) {
      return sourceDir;
    } else {
      return new Path(sourceDir,
          HdfsConstants.DOT_SNAPSHOT_DIR + Path.SEPARATOR + snapshotName);
    }
  }

  /**
   * Create a randomly named temporary directory under the target directory.
   *
   * @return the path of the new temporary directory
   * @throws IOException if mkdirs reports failure or throws
   */
  private static Path createTargetTmpDir(DistributedFileSystem targetFs,
      Path targetDir) throws IOException {
    final Path tmp = new Path(targetDir,
        DistCpConstants.HDFS_DISTCP_DIFF_DIRECTORY_NAME + DistCp.rand.nextInt());
    if (!targetFs.mkdirs(tmp)) {
      throw new IOException("The tmp directory " + tmp + " already exists");
    }
    return tmp;
  }

  /**
   * Recursively delete the temporary directory, if one was created. This is
   * best effort: an IOException is logged and not propagated.
   */
  private static void deleteTargetTmpDir(DistributedFileSystem targetFs,
      Path tmpDir) {
    try {
      if (tmpDir != null) {
        targetFs.delete(tmpDir, true);
      }
    } catch (IOException e) {
      DistCp.LOG.error("Unable to cleanup tmp dir: " + tmpDir, e);
    }
  }

  /**
   * Compute the snapshot diff on the given file system. Return true if the diff
   * is empty, i.e., no changes have happened in the FS. Return false if the
   * diff is not empty or cannot be computed.
   */
  private static boolean checkNoChange(DistCpOptions inputOptions,
      DistributedFileSystem fs, Path path) {
    try {
      SnapshotDiffReport targetDiff =
          fs.getSnapshotDiffReport(path, inputOptions.getFromSnapshot(), "");
      if (!targetDiff.getDiffList().isEmpty()) {
        DistCp.LOG.warn("The target has been modified since snapshot "
            + inputOptions.getFromSnapshot());
        return false;
      } else {
        return true;
      }
    } catch (IOException e) {
      DistCp.LOG.warn("Failed to compute snapshot diff on " + path, e);
    }
    return false;
  }

  /**
   * Compute the snapshot diff of the source directory between the "from" and
   * "to" snapshots and convert it into DiffInfo entries for the target
   * directory.
   *
   * @return the diff entries, or null if the snapshot diff report could not be
   *         computed because of an IOException
   */
  @VisibleForTesting
  static DiffInfo[] getDiffs(DistCpOptions inputOptions,
      DistributedFileSystem fs, Path sourceDir, Path targetDir) {
    try {
      final String from = getSnapshotName(inputOptions.getFromSnapshot());
      final String to = getSnapshotName(inputOptions.getToSnapshot());
      SnapshotDiffReport sourceDiff = fs.getSnapshotDiffReport(sourceDir,
          from, to);
      return DiffInfo.getDiffs(sourceDiff, targetDir);
    } catch (IOException e) {
      DistCp.LOG.warn("Failed to compute snapshot diff on " + sourceDir, e);
    }
    return null;
  }

  /**
   * Apply the diff in two phases: first move every affected path into the tmp
   * directory, then move the entries that have a target to their final place.
   * Entries without a target stay in the tmp directory.
   */
  private static void syncDiff(DiffInfo[] diffs,
      DistributedFileSystem targetFs, Path tmpDir) throws IOException {
    moveToTmpDir(diffs, targetFs, tmpDir);
    moveToTarget(diffs, targetFs);
  }

  /**
   * Move all the source files that should be renamed or deleted to the tmp
   * directory.
   *
   * @throws IOException if a rename fails or reports failure
   */
  private static void moveToTmpDir(DiffInfo[] diffs,
      DistributedFileSystem targetFs, Path tmpDir) throws IOException {
    // sort the diffs based on their source paths to make sure the files and
    // subdirs are moved before moving their parents/ancestors.
    Arrays.sort(diffs, DiffInfo.sourceComparator);
    Random random = new Random();
    for (DiffInfo diff : diffs) {
      Path tmpTarget = new Path(tmpDir, diff.source.getName());
      // entries from different directories may share a name; add a random
      // suffix until the name is free inside the tmp directory
      while (targetFs.exists(tmpTarget)) {
        tmpTarget = new Path(tmpDir, diff.source.getName() + random.nextInt());
      }
      diff.setTmp(tmpTarget);
      renameOrFail(targetFs, diff.source, tmpTarget);
    }
  }

  /**
   * Finish the rename operations: move all the intermediate files/directories
   * from the tmp dir to the final targets.
   *
   * @throws IOException if a parent directory cannot be created or a rename
   *         fails or reports failure
   */
  private static void moveToTarget(DiffInfo[] diffs,
      DistributedFileSystem targetFs) throws IOException {
    // sort the diffs based on their target paths to make sure the parent
    // directories are created first.
    Arrays.sort(diffs, DiffInfo.targetComparator);
    for (DiffInfo diff : diffs) {
      if (diff.target != null) {
        final Path parent = diff.target.getParent();
        if (!targetFs.exists(parent) && !targetFs.mkdirs(parent)) {
          throw new IOException("Failed to create directory " + parent);
        }
        renameOrFail(targetFs, diff.getTmp(), diff.target);
      }
    }
  }

  /**
   * Rename a path and turn a false result into an IOException, so that a
   * move that did not happen is not silently treated as a successful sync.
   */
  private static void renameOrFail(DistributedFileSystem fs, Path from,
      Path to) throws IOException {
    if (!fs.rename(from, to)) {
      throw new IOException("Failed to rename " + from + " to " + to);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy