org.apache.hadoop.tools.DistCpSync Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools;
import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSUtilClient;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
import org.apache.hadoop.tools.CopyListing.InvalidInputException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.EnumMap;
import java.util.ArrayList;
import java.util.HashSet;
/**
* This class provides the basic functionality to sync two FileSystems based on
* the snapshot diff report. More specifically, we have the following settings:
* 1. Both the source and target FileSystem must be DistributedFileSystem
* 2. Two snapshots (e.g., s1 and s2) have been created on the source FS.
* The diff between these two snapshots will be copied to the target FS.
* 3. The target has the same snapshot s1. No changes have been made on the
* target since s1. All the files/directories in the target are the same with
* source.s1
*/
class DistCpSync {
private DistCpOptions inputOptions;
private Configuration conf;
private EnumMap> diffMap;
private DiffInfo[] renameDiffs;
DistCpSync(DistCpOptions options, Configuration conf) {
this.inputOptions = options;
this.conf = conf;
}
/**
* Check if three conditions are met before sync.
* 1. Only one source directory.
* 2. Both source and target file system are DFS.
* 3. There is no change between from and the current status in target
* file system.
* Throw exceptions if first two aren't met, and return false to fallback to
* default distcp if the third condition isn't met.
*/
private boolean preSyncCheck() throws IOException {
List sourcePaths = inputOptions.getSourcePaths();
if (sourcePaths.size() != 1) {
// we only support one source dir which must be a snapshottable directory
throw new IllegalArgumentException(sourcePaths.size()
+ " source paths are provided");
}
final Path sourceDir = sourcePaths.get(0);
final Path targetDir = inputOptions.getTargetPath();
final FileSystem sfs = sourceDir.getFileSystem(conf);
final FileSystem tfs = targetDir.getFileSystem(conf);
// currently we require both the source and the target file system are
// DistributedFileSystem.
if (!(sfs instanceof DistributedFileSystem) ||
!(tfs instanceof DistributedFileSystem)) {
throw new IllegalArgumentException("The FileSystems needs to" +
" be DistributedFileSystem for using snapshot-diff-based distcp");
}
final DistributedFileSystem targetFs = (DistributedFileSystem) tfs;
// make sure targetFS has no change between from and the current states
if (!checkNoChange(targetFs, targetDir)) {
// set the source path using the snapshot path
inputOptions.setSourcePaths(Arrays.asList(getSourceSnapshotPath(sourceDir,
inputOptions.getToSnapshot())));
return false;
}
final String from = getSnapshotName(inputOptions.getFromSnapshot());
final String to = getSnapshotName(inputOptions.getToSnapshot());
try {
final FileStatus fromSnapshotStat =
sfs.getFileStatus(getSourceSnapshotPath(sourceDir, from));
final FileStatus toSnapshotStat =
sfs.getFileStatus(getSourceSnapshotPath(sourceDir, to));
// If toSnapshot isn't current dir then do a time check
if (!to.equals("")
&& fromSnapshotStat.getModificationTime() > toSnapshotStat
.getModificationTime()) {
throw new HadoopIllegalArgumentException("Snapshot " + to
+ " should be newer than " + from);
}
} catch (FileNotFoundException nfe) {
throw new InvalidInputException("Input snapshot is not found", nfe);
}
return true;
}
public boolean sync() throws IOException {
if (!preSyncCheck()) {
return false;
}
if (!getAllDiffs()) {
return false;
}
List sourcePaths = inputOptions.getSourcePaths();
final Path sourceDir = sourcePaths.get(0);
final Path targetDir = inputOptions.getTargetPath();
final FileSystem tfs = targetDir.getFileSystem(conf);
final DistributedFileSystem targetFs = (DistributedFileSystem) tfs;
Path tmpDir = null;
try {
tmpDir = createTargetTmpDir(targetFs, targetDir);
DiffInfo[] renameAndDeleteDiffs = getRenameAndDeleteDiffs(targetDir);
if (renameAndDeleteDiffs.length > 0) {
// do the real sync work: deletion and rename
syncDiff(renameAndDeleteDiffs, targetFs, tmpDir);
}
return true;
} catch (Exception e) {
DistCp.LOG.warn("Failed to use snapshot diff for distcp", e);
return false;
} finally {
deleteTargetTmpDir(targetFs, tmpDir);
// TODO: since we have tmp directory, we can support "undo" with failures
// set the source path using the snapshot path
inputOptions.setSourcePaths(Arrays.asList(getSourceSnapshotPath(sourceDir,
inputOptions.getToSnapshot())));
}
}
/**
* Get all diffs from source directory snapshot diff report, put them into an
* EnumMap whose key is DiffType, and value is a DiffInfo list. If there is
* no entry for a given DiffType, the associated value will be an empty list.
*/
private boolean getAllDiffs() throws IOException {
List sourcePaths = inputOptions.getSourcePaths();
final Path sourceDir = sourcePaths.get(0);
try {
DistributedFileSystem fs =
(DistributedFileSystem) sourceDir.getFileSystem(conf);
final String from = getSnapshotName(inputOptions.getFromSnapshot());
final String to = getSnapshotName(inputOptions.getToSnapshot());
SnapshotDiffReport report = fs.getSnapshotDiffReport(sourceDir,
from, to);
this.diffMap = new EnumMap<>(SnapshotDiffReport.DiffType.class);
for (SnapshotDiffReport.DiffType type :
SnapshotDiffReport.DiffType.values()) {
diffMap.put(type, new ArrayList());
}
for (SnapshotDiffReport.DiffReportEntry entry : report.getDiffList()) {
// If the entry is the snapshot root, usually a item like "M\t."
// in the diff report. We don't need to handle it and cannot handle it,
// since its sourcepath is empty.
if (entry.getSourcePath().length <= 0) {
continue;
}
List list = diffMap.get(entry.getType());
if (entry.getType() == SnapshotDiffReport.DiffType.MODIFY ||
entry.getType() == SnapshotDiffReport.DiffType.CREATE ||
entry.getType() == SnapshotDiffReport.DiffType.DELETE) {
final Path source =
new Path(DFSUtilClient.bytes2String(entry.getSourcePath()));
list.add(new DiffInfo(source, null, entry.getType()));
} else if (entry.getType() == SnapshotDiffReport.DiffType.RENAME) {
final Path source =
new Path(DFSUtilClient.bytes2String(entry.getSourcePath()));
final Path target =
new Path(DFSUtilClient.bytes2String(entry.getTargetPath()));
list.add(new DiffInfo(source, target, entry.getType()));
}
}
return true;
} catch (IOException e) {
DistCp.LOG.warn("Failed to compute snapshot diff on " + sourceDir, e);
}
this.diffMap = null;
return false;
}
private String getSnapshotName(String name) {
return Path.CUR_DIR.equals(name) ? "" : name;
}
private Path getSourceSnapshotPath(Path sourceDir, String snapshotName) {
if (Path.CUR_DIR.equals(snapshotName)) {
return sourceDir;
} else {
return new Path(sourceDir,
HdfsConstants.DOT_SNAPSHOT_DIR + Path.SEPARATOR + snapshotName);
}
}
private Path createTargetTmpDir(DistributedFileSystem targetFs,
Path targetDir) throws IOException {
final Path tmp = new Path(targetDir,
DistCpConstants.HDFS_DISTCP_DIFF_DIRECTORY_NAME + DistCp.rand.nextInt());
if (!targetFs.mkdirs(tmp)) {
throw new IOException("The tmp directory " + tmp + " already exists");
}
return tmp;
}
private void deleteTargetTmpDir(DistributedFileSystem targetFs,
Path tmpDir) {
try {
if (tmpDir != null) {
targetFs.delete(tmpDir, true);
}
} catch (IOException e) {
DistCp.LOG.error("Unable to cleanup tmp dir: " + tmpDir, e);
}
}
/**
* Compute the snapshot diff on the given file system. Return true if the diff
* is empty, i.e., no changes have happened in the FS.
*/
private boolean checkNoChange(DistributedFileSystem fs, Path path) {
try {
SnapshotDiffReport targetDiff =
fs.getSnapshotDiffReport(path, inputOptions.getFromSnapshot(), "");
if (!targetDiff.getDiffList().isEmpty()) {
DistCp.LOG.warn("The target has been modified since snapshot "
+ inputOptions.getFromSnapshot());
return false;
} else {
return true;
}
} catch (IOException e) {
DistCp.LOG.warn("Failed to compute snapshot diff on " + path, e);
}
return false;
}
private void syncDiff(DiffInfo[] diffs,
DistributedFileSystem targetFs, Path tmpDir) throws IOException {
moveToTmpDir(diffs, targetFs, tmpDir);
moveToTarget(diffs, targetFs);
}
/**
* Move all the source files that should be renamed or deleted to the tmp
* directory.
*/
private void moveToTmpDir(DiffInfo[] diffs,
DistributedFileSystem targetFs, Path tmpDir) throws IOException {
// sort the diffs based on their source paths to make sure the files and
// subdirs are moved before moving their parents/ancestors.
Arrays.sort(diffs, DiffInfo.sourceComparator);
Random random = new Random();
for (DiffInfo diff : diffs) {
Path tmpTarget = new Path(tmpDir, diff.source.getName());
while (targetFs.exists(tmpTarget)) {
tmpTarget = new Path(tmpDir, diff.source.getName() + random.nextInt());
}
diff.setTmp(tmpTarget);
targetFs.rename(diff.source, tmpTarget);
}
}
/**
* Finish the rename operations: move all the intermediate files/directories
* from the tmp dir to the final targets.
*/
private void moveToTarget(DiffInfo[] diffs,
DistributedFileSystem targetFs) throws IOException {
// sort the diffs based on their target paths to make sure the parent
// directories are created first.
Arrays.sort(diffs, DiffInfo.targetComparator);
for (DiffInfo diff : diffs) {
if (diff.target != null) {
if (!targetFs.exists(diff.target.getParent())) {
targetFs.mkdirs(diff.target.getParent());
}
targetFs.rename(diff.getTmp(), diff.target);
}
}
}
/**
* Get rename and delete diffs and add the targetDir as the prefix of their
* source and target paths.
*/
private DiffInfo[] getRenameAndDeleteDiffs(Path targetDir) {
List renameAndDeleteDiff = new ArrayList<>();
for (DiffInfo diff : diffMap.get(SnapshotDiffReport.DiffType.DELETE)) {
Path source = new Path(targetDir, diff.source);
renameAndDeleteDiff.add(new DiffInfo(source, diff.target,
diff.getType()));
}
for (DiffInfo diff : diffMap.get(SnapshotDiffReport.DiffType.RENAME)) {
Path source = new Path(targetDir, diff.source);
Path target = new Path(targetDir, diff.target);
renameAndDeleteDiff.add(new DiffInfo(source, target, diff.getType()));
}
return renameAndDeleteDiff.toArray(
new DiffInfo[renameAndDeleteDiff.size()]);
}
private DiffInfo[] getCreateAndModifyDiffs() {
List createDiff =
diffMap.get(SnapshotDiffReport.DiffType.CREATE);
List modifyDiff =
diffMap.get(SnapshotDiffReport.DiffType.MODIFY);
List diffs =
new ArrayList<>(createDiff.size() + modifyDiff.size());
diffs.addAll(createDiff);
diffs.addAll(modifyDiff);
return diffs.toArray(new DiffInfo[diffs.size()]);
}
/**
* Probe for a path being a parent of another.
* @return true if the parent's path matches the start of the child's
*/
private boolean isParentOf(Path parent, Path child) {
String parentPath = parent.toString();
String childPath = child.toString();
if (!parentPath.endsWith(Path.SEPARATOR)) {
parentPath += Path.SEPARATOR;
}
return childPath.length() > parentPath.length() &&
childPath.startsWith(parentPath);
}
/**
* Find the possible rename item which equals to the parent or self of
* a created/modified file/directory.
* @param diff a modify/create diff item
* @param renameDiffArray all rename diffs
* @return possible rename item
*/
private DiffInfo getRenameItem(DiffInfo diff, DiffInfo[] renameDiffArray) {
for (DiffInfo renameItem : renameDiffArray) {
if (diff.source.equals(renameItem.source)) {
// The same path string may appear in:
// 1. both renamed and modified snapshot diff entries.
// 2. both renamed and created snapshot diff entries.
// Case 1 is the about same file/directory, whereas case 2
// is about two different files/directories.
// We are finding case 1 here, thus we check against DiffType.MODIFY.
if (diff.getType() == SnapshotDiffReport.DiffType.MODIFY) {
return renameItem;
}
} else if (isParentOf(renameItem.source, diff.source)) {
// If rename entry is the parent of diff entry, then both MODIFY and
// CREATE diff entries should be handled.
return renameItem;
}
}
return null;
}
/**
* For a given source path, get its target path based on the rename item.
* @return target path
*/
private Path getTargetPath(Path sourcePath, DiffInfo renameItem) {
if (sourcePath.equals(renameItem.source)) {
return renameItem.target;
}
StringBuffer sb = new StringBuffer(sourcePath.toString());
String remain = sb.substring(renameItem.source.toString().length() + 1);
return new Path(renameItem.target, remain);
}
/**
* Prepare the diff list.
* This diff list only includes created or modified files/directories, since
* delete and rename items are synchronized already.
*
* If the parent or self of a source path is renamed, we need to change its
* target path according the correspondent rename item.
* @return a diff list
*/
public ArrayList prepareDiffList() {
DiffInfo[] modifyAndCreateDiffs = getCreateAndModifyDiffs();
List renameDiffsList =
diffMap.get(SnapshotDiffReport.DiffType.RENAME);
DiffInfo[] renameDiffArray =
renameDiffsList.toArray(new DiffInfo[renameDiffsList.size()]);
Arrays.sort(renameDiffArray, DiffInfo.sourceComparator);
ArrayList finalListWithTarget = new ArrayList<>();
for (DiffInfo diff : modifyAndCreateDiffs) {
DiffInfo renameItem = getRenameItem(diff, renameDiffArray);
if (renameItem == null) {
diff.target = diff.source;
} else {
diff.target = getTargetPath(diff.source, renameItem);
}
finalListWithTarget.add(diff);
}
return finalListWithTarget;
}
/**
* This method returns a list of items to be excluded when recursively
* traversing newDir to build the copy list.
*
* Specifically, given a newly created directory newDir (a CREATE entry in
* the snapshot diff), if a previously copied file/directory itemX is moved
* (a RENAME entry in the snapshot diff) into newDir, itemX should be
* excluded when recursively traversing newDir in caller method so that it
* will not to be copied again.
* If the same itemX also has a MODIFY entry in the snapshot diff report,
* meaning it was modified after it was previously copied, it will still
* be added to the copy list in caller method.
* @return the exclude list
*/
public HashSet getTraverseExcludeList(Path newDir, Path prefix) {
if (renameDiffs == null) {
List renameList =
diffMap.get(SnapshotDiffReport.DiffType.RENAME);
renameDiffs = renameList.toArray(new DiffInfo[renameList.size()]);
Arrays.sort(renameDiffs, DiffInfo.targetComparator);
}
if (renameDiffs.length <= 0) {
return null;
}
boolean foundChild = false;
HashSet excludeList = new HashSet<>();
for (DiffInfo diff : renameDiffs) {
if (isParentOf(newDir, diff.target)) {
foundChild = true;
excludeList.add(new Path(prefix, diff.target).toUri().getPath());
} else if (foundChild) {
// The renameDiffs was sorted, the matching section should be
// contiguous.
break;
}
}
return excludeList;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy