/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.hadoop.fs;
import org.apache.hudi.avro.model.HoodieFSPermission;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.avro.model.HoodiePath;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.InvalidHoodiePathException;
import org.apache.hudi.storage.StorageConfiguration;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.storage.StoragePathInfo;
import org.apache.hudi.storage.StorageSchemes;
import org.apache.hudi.storage.hadoop.HadoopStorageConfiguration;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BufferedFSInputStream;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.stream.Collectors;
import static org.apache.hudi.common.fs.FSUtils.LOG_FILE_PATTERN;
/**
* Utility functions related to accessing the file storage on Hadoop.
*/
public class HadoopFSUtils {
private static final Logger LOG = LoggerFactory.getLogger(HadoopFSUtils.class);
private static final String HOODIE_ENV_PROPS_PREFIX = "HOODIE_ENV_";
private static final int MAX_ATTEMPTS_RECOVER_LEASE = 10;
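/**
* Copies any environment variables prefixed with {@code HOODIE_ENV_} into the given Hadoop
* configuration, stripping the prefix and translating {@code _DOT_} into {@code .} in the key name.
*
* <p>A minimal sketch of the expected mapping (the variable name and value below are a hypothetical example):
* <pre>{@code
* // export HOODIE_ENV_fs_DOT_s3a_DOT_endpoint=http://localhost:9000
* Configuration conf = HadoopFSUtils.prepareHadoopConf(new Configuration());
* // conf.get("fs.s3a.endpoint") is expected to be "http://localhost:9000"
* }</pre>
*/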
public static Configuration prepareHadoopConf(Configuration conf) {
// pick up all environment variables prefixed with HOODIE_ENV_
for (Map.Entry<String, String> prop : System.getenv().entrySet()) {
if (prop.getKey().startsWith(HOODIE_ENV_PROPS_PREFIX)) {
LOG.info("Picking up value for hoodie env var : {}", prop.getKey());
conf.set(prop.getKey().replace(HOODIE_ENV_PROPS_PREFIX, "").replaceAll("_DOT_", "."), prop.getValue());
}
}
return conf;
}
public static StorageConfiguration<Configuration> getStorageConf(Configuration conf) {
return getStorageConf(conf, false);
}
public static StorageConfiguration<Configuration> getStorageConf() {
return getStorageConf(prepareHadoopConf(new Configuration()), false);
}
public static StorageConfiguration<Configuration> getStorageConfWithCopy(Configuration conf) {
return getStorageConf(conf, true);
}
public static FileSystem getFs(String pathStr, StorageConfiguration<?> storageConf) {
return getFs(new Path(pathStr), storageConf);
}
public static FileSystem getFs(String pathStr, StorageConfiguration<?> storageConf, boolean newCopy) {
return getFs(new Path(pathStr), storageConf, newCopy);
}
public static FileSystem getFs(Path path, StorageConfiguration<?> storageConf) {
return getFs(path, storageConf, false);
}
public static FileSystem getFs(Path path, StorageConfiguration<?> storageConf, boolean newCopy) {
Configuration conf = newCopy ? storageConf.unwrapCopyAs(Configuration.class) : storageConf.unwrapAs(Configuration.class);
return getFs(path, conf);
}
public static FileSystem getFs(String pathStr, Configuration conf) {
return getFs(new Path(pathStr), conf);
}
public static FileSystem getFs(StoragePath path, Configuration conf) {
return getFs(convertToHadoopPath(path), conf);
}
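/**
* Returns the {@link FileSystem} for the given path after applying {@code HOODIE_ENV_}-prefixed
* environment variables to the configuration.
*
* <p>A minimal usage sketch (the bucket and table path below are hypothetical):
* <pre>{@code
* Configuration conf = new Configuration();
* FileSystem fs = HadoopFSUtils.getFs(new Path("s3a://my-bucket/hudi/my_table"), conf);
* }</pre>
*/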
public static FileSystem getFs(Path path, Configuration conf) {
FileSystem fs;
prepareHadoopConf(conf);
try {
fs = path.getFileSystem(conf);
} catch (IOException e) {
throw new HoodieIOException(String.format("Failed to get instance of %s", FileSystem.class.getName()), e);
}
return fs;
}
public static FileSystem getFs(String pathStr, Configuration conf, boolean localByDefault) {
if (localByDefault) {
return getFs(addSchemeIfLocalPath(pathStr), conf);
}
return getFs(pathStr, conf);
}
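/**
* Prefixes a relative path that exists on the local file system with the {@code file://} scheme
* and resolves it to an absolute path; otherwise the provided path is returned unchanged.
*
* <p>A minimal sketch of the expected behavior (the relative path below is a hypothetical example):
* <pre>{@code
* // If "data/local_table" exists relative to the working directory, the result is
* // "file://" followed by its absolute path; otherwise the path is treated as remote.
* Path resolved = HadoopFSUtils.addSchemeIfLocalPath("data/local_table");
* }</pre>
*/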
public static Path addSchemeIfLocalPath(String path) {
Path providedPath = new Path(path);
File localFile = new File(path);
if (!providedPath.isAbsolute() && localFile.exists()) {
Path resolvedPath = new Path("file://" + localFile.getAbsolutePath());
LOG.info("Resolving file {} to be a local file.", path);
return resolvedPath;
}
LOG.info("Resolving file {} to be a remote file.", path);
return providedPath;
}
/**
* @param path {@link StoragePath} instance.
* @return the Hadoop {@link Path} instance after conversion.
*/
public static Path convertToHadoopPath(StoragePath path) {
return new Path(path.toUri());
}
/**
* @param path Hadoop {@link Path} instance.
* @return the {@link StoragePath} instance after conversion.
*/
public static StoragePath convertToStoragePath(Path path) {
return new StoragePath(path.toUri());
}
/**
* @param fileStatus Hadoop {@link FileStatus} instance.
* @return the {@link StoragePathInfo} instance after conversion.
*/
public static StoragePathInfo convertToStoragePathInfo(FileStatus fileStatus) {
return new StoragePathInfo(
convertToStoragePath(fileStatus.getPath()),
fileStatus.getLen(),
fileStatus.isDirectory(),
fileStatus.getReplication(),
fileStatus.getBlockSize(),
fileStatus.getModificationTime());
}
public static StoragePathInfo convertToStoragePathInfo(FileStatus fileStatus, String[] locations) {
return new StoragePathInfo(
convertToStoragePath(fileStatus.getPath()),
fileStatus.getLen(),
fileStatus.isDirectory(),
fileStatus.getReplication(),
fileStatus.getBlockSize(),
fileStatus.getModificationTime(),
locations);
}
/**
* @param pathInfo {@link StoragePathInfo} instance.
* @return the {@link FileStatus} instance after conversion.
*/
public static FileStatus convertToHadoopFileStatus(StoragePathInfo pathInfo) {
return new FileStatus(
pathInfo.getLength(),
pathInfo.isDirectory(),
pathInfo.getBlockReplication(),
pathInfo.getBlockSize(),
pathInfo.getModificationTime(),
convertToHadoopPath(pathInfo.getPath()));
}
/**
* Fetch the right {@link FSDataInputStream} to be used by wrapping with required input streams.
*
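* <p>A minimal usage sketch (the file path below is a hypothetical example):
* <pre>{@code
* FSDataInputStream in = HadoopFSUtils.getFSDataInputStream(
*     fs, new StoragePath("/tmp/hudi_table/2023/08/01/data.parquet"), 4096, true);
* }</pre>
*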
* @param fs instance of {@link FileSystem} in use.
* @param filePath path of the file.
* @param bufferSize buffer size to be used.
* @param wrapStream if false, don't attempt to wrap the stream
* @return the right {@link FSDataInputStream} as required.
*/
public static FSDataInputStream getFSDataInputStream(FileSystem fs,
StoragePath filePath,
int bufferSize,
boolean wrapStream) {
FSDataInputStream fsDataInputStream = null;
try {
fsDataInputStream = fs.open(convertToHadoopPath(filePath), bufferSize);
} catch (IOException e) {
throw new HoodieIOException(String.format("Exception creating input stream from file: %s", filePath), e);
}
if (!wrapStream) {
return fsDataInputStream;
}
if (isGCSFileSystem(fs)) {
// in GCS FS, we might need to intercept seek offsets, as we might otherwise get an EOF exception
return new SchemeAwareFSDataInputStream(getFSDataInputStreamForGCS(fsDataInputStream, filePath, bufferSize), true);
}
if (isCHDFileSystem(fs)) {
return new BoundedFsDataInputStream(fs, convertToHadoopPath(filePath), fsDataInputStream);
}
if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) {
return new TimedFSDataInputStream(convertToHadoopPath(filePath), new FSDataInputStream(
new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize)));
}
// fsDataInputStream.getWrappedStream() may be a BufferedFSInputStream;
// need to wrap in another BufferedFSInputStream to make bufferSize work?
return fsDataInputStream;
}
/**
* GCS FileSystem needs special handling for seek, so this method fetches the right {@link FSDataInputStream}
* to be used by wrapping it with the required input streams.
*
* @param fsDataInputStream original instance of {@link FSDataInputStream}.
* @param filePath path of the file.
* @param bufferSize buffer size to be used.
* @return the right {@link FSDataInputStream} as required.
*/
private static FSDataInputStream getFSDataInputStreamForGCS(FSDataInputStream fsDataInputStream,
StoragePath filePath,
int bufferSize) {
// in the case of GCS FS, there are two flows:
// a. fsDataInputStream.getWrappedStream() is an instance of FSInputStream
// b. fsDataInputStream.getWrappedStream() is not an instance of FSInputStream, but an instance of FSDataInputStream.
// (a) is handled in the first if block and (b) is handled in the second if block. Otherwise, we fall back to the original fsDataInputStream
if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) {
return new TimedFSDataInputStream(convertToHadoopPath(filePath), new FSDataInputStream(
new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize)));
}
if (fsDataInputStream.getWrappedStream() instanceof FSDataInputStream
&& ((FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream() instanceof FSInputStream) {
FSInputStream inputStream = (FSInputStream) ((FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream();
return new TimedFSDataInputStream(convertToHadoopPath(filePath),
new FSDataInputStream(new BufferedFSInputStream(inputStream, bufferSize)));
}
return fsDataInputStream;
}
/**
* This is due to HUDI-140: GCS has different behavior for detecting EOF during seek().
*
* @param fs fileSystem instance.
* @return true if the {@link FileSystem} uses the GCS scheme
*/
public static boolean isGCSFileSystem(FileSystem fs) {
return fs.getScheme().equals(StorageSchemes.GCS.getScheme());
}
/**
* CHDFS throws {@code IOException} instead of {@code EOFException}, which causes errors in isBlockCorrupted().
* Streams are wrapped in {@code BoundedFsDataInputStream} to check in advance whether the desired offset is beyond the file size.
*/
public static boolean isCHDFileSystem(FileSystem fs) {
return StorageSchemes.CHDFS.getScheme().equals(fs.getScheme());
}
private static StorageConfiguration<Configuration> getStorageConf(Configuration conf, boolean copy) {
return new HadoopStorageConfiguration(conf, copy);
}
public static Configuration registerFileSystem(StoragePath file, Configuration conf) {
Configuration returnConf = new Configuration(conf);
String scheme = HadoopFSUtils.getFs(file.toString(), conf).getScheme();
returnConf.set("fs." + HoodieWrapperFileSystem.getHoodieScheme(scheme) + ".impl",
HoodieWrapperFileSystem.class.getName());
return returnConf;
}
public static Path toPath(HoodiePath path) {
if (null == path) {
return null;
}
return new Path(path.getUri());
}
public static HoodiePath fromPath(Path path) {
if (null == path) {
return null;
}
return HoodiePath.newBuilder().setUri(path.toString()).build();
}
public static FsPermission toFSPermission(HoodieFSPermission fsPermission) {
if (null == fsPermission) {
return null;
}
FsAction userAction = fsPermission.getUserAction() != null ? FsAction.valueOf(fsPermission.getUserAction()) : null;
FsAction grpAction = fsPermission.getGroupAction() != null ? FsAction.valueOf(fsPermission.getGroupAction()) : null;
FsAction otherAction =
fsPermission.getOtherAction() != null ? FsAction.valueOf(fsPermission.getOtherAction()) : null;
boolean stickyBit = fsPermission.getStickyBit() != null ? fsPermission.getStickyBit() : false;
return new FsPermission(userAction, grpAction, otherAction, stickyBit);
}
public static HoodieFSPermission fromFSPermission(FsPermission fsPermission) {
if (null == fsPermission) {
return null;
}
String userAction = fsPermission.getUserAction() != null ? fsPermission.getUserAction().name() : null;
String grpAction = fsPermission.getGroupAction() != null ? fsPermission.getGroupAction().name() : null;
String otherAction = fsPermission.getOtherAction() != null ? fsPermission.getOtherAction().name() : null;
return HoodieFSPermission.newBuilder().setUserAction(userAction).setGroupAction(grpAction)
.setOtherAction(otherAction).setStickyBit(fsPermission.getStickyBit()).build();
}
public static HoodieFileStatus fromFileStatus(FileStatus fileStatus) {
if (null == fileStatus) {
return null;
}
HoodieFileStatus fStatus = new HoodieFileStatus();
try {
fStatus.setPath(fromPath(fileStatus.getPath()));
fStatus.setLength(fileStatus.getLen());
fStatus.setIsDir(fileStatus.isDirectory());
fStatus.setBlockReplication((int) fileStatus.getReplication());
fStatus.setBlockSize(fileStatus.getBlockSize());
fStatus.setModificationTime(fileStatus.getModificationTime());
fStatus.setAccessTime(fileStatus.getModificationTime());
fStatus.setSymlink(fileStatus.isSymlink() ? fromPath(fileStatus.getSymlink()) : null);
safeReadAndSetMetadata(fStatus, fileStatus);
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
return fStatus;
}
/**
* Used to safely handle FileStatus calls that might fail on some FileSystem implementations
* (e.g., DeprecatedLocalFileSystem).
*/
private static void safeReadAndSetMetadata(HoodieFileStatus fStatus, FileStatus fileStatus) {
try {
fStatus.setOwner(fileStatus.getOwner());
fStatus.setGroup(fileStatus.getGroup());
fStatus.setPermission(fromFSPermission(fileStatus.getPermission()));
} catch (IllegalArgumentException ie) {
// Deprecated File System (testing) does not work well with this call
// skipping
}
}
public static long getFileSize(FileSystem fs, Path path) throws IOException {
return fs.getFileStatus(path).getLen();
}
/**
* Given a base path and a partition path, return the relative path of the partition path to the base path.
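*
* <p>A minimal sketch (the paths below are hypothetical):
* <pre>{@code
* // expected to return "2023/08/01"
* String relativePath = HadoopFSUtils.getRelativePartitionPath(
*     new Path("/tmp/hudi_table"), new Path("/tmp/hudi_table/2023/08/01"));
* }</pre>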
*/
public static String getRelativePartitionPath(Path basePath, Path fullPartitionPath) {
return FSUtils.getRelativePartitionPath(new StoragePath(basePath.toUri()), new StoragePath(fullPartitionPath.toUri()));
}
/**
* Get the first part of the file name of the log file; that is the fileId. Log files do not have the instantTime in
* the file name.
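*
* <p>A minimal sketch (the log file name below is a hypothetical example of the naming
* convention {@code .<fileId>_<deltaCommitTime>.log.<version>_<writeToken>}):
* <pre>{@code
* // expected to return "fileId1"
* String fileId = HadoopFSUtils.getFileIdFromLogPath(
*     new Path("/tmp/hudi_table/2023/08/01/.fileId1_20230801120000.log.1_1-0-1"));
* }</pre>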
*/
public static String getFileIdFromLogPath(Path path) {
Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
if (!matcher.find()) {
throw new InvalidHoodiePathException(path.toString(), "LogFile");
}
return matcher.group(1);
}
/**
* Get the second part of the file name in the log file. That will be the delta commit time.
*/
public static String getDeltaCommitTimeFromLogPath(Path path) {
Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
if (!matcher.find()) {
throw new InvalidHoodiePathException(path.toString(), "LogFile");
}
return matcher.group(2);
}
/**
* Check if the file is a base file or a log file, then get the fileId appropriately.
*/
public static String getFileIdFromFilePath(Path filePath) {
if (isLogFile(filePath)) {
return getFileIdFromLogPath(filePath);
}
return FSUtils.getFileId(filePath.getName());
}
public static boolean isBaseFile(Path path) {
String extension = FSUtils.getFileExtension(path.getName());
return HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(extension);
}
public static boolean isLogFile(Path logPath) {
return FSUtils.isLogFile(new StoragePath(logPath.getName()));
}
/**
* Returns true if the given path is a Base file or a Log file.
*/
public static boolean isDataFile(Path path) {
return isBaseFile(path) || isLogFile(path);
}
/**
* Get the {@link FileStatus} of all the base and log files in the given partition path.
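*
* <p>A minimal usage sketch (the partition path below is hypothetical):
* <pre>{@code
* FileStatus[] dataFiles = HadoopFSUtils.getAllDataFilesInPartition(
*     fs, new Path("/tmp/hudi_table/2023/08/01"));
* }</pre>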
*/
public static FileStatus[] getAllDataFilesInPartition(FileSystem fs, Path partitionPath) throws IOException {
final Set<String> validFileExtensions = Arrays.stream(HoodieFileFormat.values())
.map(HoodieFileFormat::getFileExtension).collect(Collectors.toCollection(HashSet::new));
final String logFileExtension = HoodieFileFormat.HOODIE_LOG.getFileExtension();
try {
return Arrays.stream(fs.listStatus(partitionPath, path -> {
String extension = FSUtils.getFileExtension(path.getName());
return validFileExtensions.contains(extension) || path.getName().contains(logFileExtension);
})).filter(FileStatus::isFile).toArray(FileStatus[]::new);
} catch (IOException e) {
// return an empty FileStatus array if the partition does not exist
if (!fs.exists(partitionPath)) {
return new FileStatus[0];
} else {
throw e;
}
}
}
/**
* When a file is opened and the task dies without closing the stream, another task executor cannot open it because the
* existing lease is still active. We try to recover the lease from HDFS. If a data node went down, it takes
* about 10 minutes for the lease to be recovered; but if only the client died, this should be nearly instant.
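*
* <p>A minimal usage sketch (the path below is hypothetical; checked exceptions are omitted):
* <pre>{@code
* FileSystem fs = HadoopFSUtils.getFs("hdfs://namenode:8020/hudi_table/file.log", conf);
* if (fs instanceof DistributedFileSystem) {
*   boolean recovered = HadoopFSUtils.recoverDFSFileLease(
*       (DistributedFileSystem) fs, new Path("hdfs://namenode:8020/hudi_table/file.log"));
* }
* }</pre>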
*/
public static boolean recoverDFSFileLease(final DistributedFileSystem dfs, final Path p)
throws IOException, InterruptedException {
LOG.info("Recover lease on dfs file {}", p);
// initiate the recovery
boolean recovered = false;
for (int nbAttempt = 0; nbAttempt < MAX_ATTEMPTS_RECOVER_LEASE; nbAttempt++) {
LOG.info("Attempt {} to recover lease on dfs file {}", nbAttempt, p);
recovered = dfs.recoverLease(p);
if (recovered) {
break;
}
// Sleep for 1 second before trying again. Typically it takes about 2-3 seconds to recover
// under default settings
Thread.sleep(1000);
}
return recovered;
}
public static Path constructAbsolutePathInHadoopPath(String basePath, String relativePartitionPath) {
return new Path(FSUtils.constructAbsolutePath(basePath, relativePartitionPath).toUri());
}
/**
* Get the full DFS partition path (e.g. {@code hdfs://ip-address:8020/} followed by the partition path).
*/
public static String getDFSFullPartitionPath(FileSystem fs, Path fullPartitionPath) {
return fs.getUri() + fullPartitionPath.toUri().getRawPath();
}
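/**
* Applies the given function to each sub-path in parallel using the engine context and returns
* a map from sub-path to the function result.
*
* <p>A minimal usage sketch mirroring the listing logic in {@link #getFileStatusAtLevel}
* (the engine context, parallelism, and sub-paths are placeholders):
* <pre>{@code
* Map<String, List<FileStatus>> statusesPerPath = HadoopFSUtils.parallelizeFilesProcess(
*     engineContext, fs, 10,
*     pairOfSubPathAndConf -> {
*       Path subPath = new Path(pairOfSubPathAndConf.getKey());
*       try {
*         FileSystem fileSystem = subPath.getFileSystem(pairOfSubPathAndConf.getValue().unwrap());
*         return Arrays.asList(fileSystem.listStatus(subPath));
*       } catch (IOException e) {
*         throw new HoodieIOException("Failed to list " + subPath, e);
*       }
*     },
*     subPaths);
* }</pre>
*/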
public static <T> Map<String, T> parallelizeFilesProcess(
HoodieEngineContext hoodieEngineContext,
FileSystem fs,
int parallelism,
FSUtils.SerializableFunction<Pair<String, StorageConfiguration<Configuration>>, T> pairFunction,
List<String> subPaths) {
Map<String, T> result = new HashMap<>();
if (subPaths.size() > 0) {
StorageConfiguration<Configuration> conf = new HadoopStorageConfiguration(fs.getConf(), true);
int actualParallelism = Math.min(subPaths.size(), parallelism);
hoodieEngineContext.setJobStatus(FSUtils.class.getSimpleName(),
"Parallel listing paths " + String.join(",", subPaths));
result = hoodieEngineContext.mapToPair(subPaths,
subPath -> new ImmutablePair<>(subPath, pairFunction.apply(new ImmutablePair<>(subPath, conf))),
actualParallelism);
}
return result;
}
/**
* Lists file status at a certain level in the directory hierarchy.
*
* E.g., given "/tmp/hoodie_table" as the rootPath, and 3 as the expected level,
* this method gives back the {@link FileStatus} of all files under
* "/tmp/hoodie_table/[*]/[*]/[*]/" folders.
*
* @param hoodieEngineContext {@link HoodieEngineContext} instance.
* @param fs {@link FileSystem} instance.
* @param rootPath Root path for the file listing.
* @param expectLevel Expected level of directory hierarchy for files to be added.
* @param parallelism Parallelism for the file listing.
* @return A list of file status of files at the level.
*/
public static List<FileStatus> getFileStatusAtLevel(
HoodieEngineContext hoodieEngineContext, FileSystem fs, Path rootPath,
int expectLevel, int parallelism) {
List<String> levelPaths = new ArrayList<>();
List<FileStatus> result = new ArrayList<>();
levelPaths.add(rootPath.toString());
for (int i = 0; i <= expectLevel; i++) {
result = parallelizeFilesProcess(hoodieEngineContext, fs, parallelism,
pairOfSubPathAndConf -> {
Path path = new Path(pairOfSubPathAndConf.getKey());
try {
FileSystem fileSystem = path.getFileSystem(pairOfSubPathAndConf.getValue().unwrap());
return Arrays.stream(fileSystem.listStatus(path))
.collect(Collectors.toList());
} catch (IOException e) {
throw new HoodieIOException("Failed to list " + path, e);
}
},
levelPaths)
.values().stream()
.flatMap(list -> list.stream()).collect(Collectors.toList());
if (i < expectLevel) {
levelPaths = result.stream()
.filter(FileStatus::isDirectory)
.map(fileStatus -> fileStatus.getPath().toString())
.collect(Collectors.toList());
}
}
return result;
}
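/**
* Deletes the given paths in parallel and returns a map from path to whether the deletion
* succeeded (or was skipped); individual failures are rethrown or ignored based on {@code ignoreFailed}.
*
* <p>A minimal usage sketch (the meta client, engine context, and file paths are placeholders):
* <pre>{@code
* Map<String, Boolean> deleted = HadoopFSUtils.deleteFilesParallelize(
*     metaClient,
*     Arrays.asList("/tmp/hudi_table/.hoodie/old_file_1", "/tmp/hudi_table/.hoodie/old_file_2"),
*     engineContext, 2, true);
* }</pre>
*/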
public static Map<String, Boolean> deleteFilesParallelize(
HoodieTableMetaClient metaClient,
List<String> paths,
HoodieEngineContext context,
int parallelism,
boolean ignoreFailed) {
return HadoopFSUtils.parallelizeFilesProcess(context,
(FileSystem) metaClient.getStorage().getFileSystem(),
parallelism,
pairOfSubPathAndConf -> {
Path file = new Path(pairOfSubPathAndConf.getKey());
try {
FileSystem fs = (FileSystem) metaClient.getStorage().getFileSystem();
if (fs.exists(file)) {
return fs.delete(file, false);
}
return true;
} catch (IOException e) {
if (!ignoreFailed) {
throw new HoodieIOException("Failed to delete : " + file, e);
} else {
LOG.warn("Ignore failed deleting : " + file);
return true;
}
}
},
paths);
}
}