All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.common.fs.FSUtils Maven / Gradle / Ivy

There is a newer version: 1.0.0-beta2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.common.fs;

import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.inline.InLineFileSystem;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.exception.InvalidHoodiePathException;
import org.apache.hudi.hadoop.CachingPath;
import org.apache.hudi.metadata.HoodieTableMetadata;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.UUID;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.apache.hudi.hadoop.CachingPath.getPathWithoutSchemeAndAuthority;

/**
 * Utility functions related to accessing the file storage.
 */
public class FSUtils {

  private static final Logger LOG = LoggerFactory.getLogger(FSUtils.class);
  // Log files are of this pattern - .b5068208-e1a4-11e6-bf01-fe55135034f3_20170101134598.log.1_1-0-1
  // Archive log files are of this pattern - .commits_.archive.1_1-0-1
  public static final Pattern LOG_FILE_PATTERN =
      Pattern.compile("^\\.(.+)_(.*)\\.(log|archive)\\.(\\d+)(_((\\d+)-(\\d+)-(\\d+))(.cdc)?)?");
  public static final Pattern PREFIX_BY_FILE_ID_PATTERN = Pattern.compile("^(.+)-(\\d+)");
  private static final int MAX_ATTEMPTS_RECOVER_LEASE = 10;
  private static final String HOODIE_ENV_PROPS_PREFIX = "HOODIE_ENV_";

  private static final PathFilter ALLOW_ALL_FILTER = file -> true;

  public static Configuration prepareHadoopConf(Configuration conf) {
    // look for all properties, prefixed to be picked up
    for (Entry prop : System.getenv().entrySet()) {
      if (prop.getKey().startsWith(HOODIE_ENV_PROPS_PREFIX)) {
        LOG.info("Picking up value for hoodie env var :" + prop.getKey());
        conf.set(prop.getKey().replace(HOODIE_ENV_PROPS_PREFIX, "").replaceAll("_DOT_", "."), prop.getValue());
      }
    }
    return conf;
  }

  public static Configuration buildInlineConf(Configuration conf) {
    Configuration inlineConf = new Configuration(conf);
    inlineConf.set("fs." + InLineFileSystem.SCHEME + ".impl", InLineFileSystem.class.getName());
    inlineConf.setClassLoader(InLineFileSystem.class.getClassLoader());
    return inlineConf;
  }

  public static FileSystem getFs(String pathStr, Configuration conf) {
    return getFs(new Path(pathStr), conf);
  }

  public static FileSystem getFs(Path path, Configuration conf) {
    FileSystem fs;
    prepareHadoopConf(conf);
    try {
      fs = path.getFileSystem(conf);
    } catch (IOException e) {
      throw new HoodieIOException("Failed to get instance of " + FileSystem.class.getName(), e);
    }
    return fs;
  }

  public static FileSystem getFs(String pathStr, Configuration conf, boolean localByDefault) {
    if (localByDefault) {
      return getFs(addSchemeIfLocalPath(pathStr), conf);
    }
    return getFs(pathStr, conf);
  }

  /**
   * Check if table already exists in the given path.
   * @param path base path of the table.
   * @param fs instance of {@link FileSystem}.
   * @return {@code true} if table exists. {@code false} otherwise.
   */
  public static boolean isTableExists(String path, FileSystem fs) throws IOException {
    return fs.exists(new Path(path + "/" + HoodieTableMetaClient.METAFOLDER_NAME));
  }

  public static Path addSchemeIfLocalPath(String path) {
    Path providedPath = new Path(path);
    File localFile = new File(path);
    if (!providedPath.isAbsolute() && localFile.exists()) {
      Path resolvedPath = new Path("file://" + localFile.getAbsolutePath());
      LOG.info("Resolving file " + path + " to be a local file.");
      return resolvedPath;
    }
    LOG.info("Resolving file " + path + "to be a remote file.");
    return providedPath;
  }

  /**
   * Makes path qualified w/ {@link FileSystem}'s URI
   *
   * @param fs instance of {@link FileSystem} path belongs to
   * @param path path to be qualified
   * @return qualified path, prefixed w/ the URI of the target FS object provided
   */
  public static Path makeQualified(FileSystem fs, Path path) {
    return path.makeQualified(fs.getUri(), fs.getWorkingDirectory());
  }

  /**
   * A write token uniquely identifies an attempt at one of the IOHandle operations (Merge/Create/Append).
   */
  public static String makeWriteToken(int taskPartitionId, int stageId, long taskAttemptId) {
    return String.format("%d-%d-%d", taskPartitionId, stageId, taskAttemptId);
  }

  // TODO: this should be removed
  public static String makeBaseFileName(String instantTime, String writeToken, String fileId) {
    return String.format("%s_%s_%s%s", fileId, writeToken, instantTime,
        HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension());
  }

  public static String makeBaseFileName(String instantTime, String writeToken, String fileId, String fileExtension) {
    return String.format("%s_%s_%s%s", fileId, writeToken, instantTime, fileExtension);
  }

  public static String makeBootstrapIndexFileName(String instantTime, String fileId, String ext) {
    return String.format("%s_%s_%s%s", fileId, "1-0-1", instantTime, ext);
  }

  public static String maskWithoutFileId(String instantTime, int taskPartitionId) {
    return String.format("*_%s_%s%s", taskPartitionId, instantTime, HoodieTableConfig.BASE_FILE_FORMAT
        .defaultValue().getFileExtension());
  }

  public static String getCommitFromCommitFile(String commitFileName) {
    return commitFileName.split("\\.")[0];
  }

  public static String getCommitTime(String fullFileName) {
    try {
      if (isLogFile(fullFileName)) {
        return fullFileName.split("_")[1].split("\\.", 2)[0];
      }
      return fullFileName.split("_")[2].split("\\.", 2)[0];
    } catch (ArrayIndexOutOfBoundsException e) {
      throw new HoodieException("Failed to get commit time from filename: " + fullFileName, e);
    }
  }

  public static long getFileSize(FileSystem fs, Path path) throws IOException {
    return fs.getFileStatus(path).getLen();
  }

  public static String getFileId(String fullFileName) {
    return fullFileName.split("_", 2)[0];
  }

  /**
   * Gets all partition paths assuming date partitioning (year, month, day) three levels down.
   */
  public static List getAllPartitionFoldersThreeLevelsDown(FileSystem fs, String basePath) throws IOException {
    List datePartitions = new ArrayList<>();
    // Avoid listing and including any folders under the metafolder
    PathFilter filter = getExcludeMetaPathFilter();
    FileStatus[] folders = fs.globStatus(new Path(basePath + "/*/*/*"), filter);
    for (FileStatus status : folders) {
      Path path = status.getPath();
      datePartitions.add(String.format("%s/%s/%s", path.getParent().getParent().getName(), path.getParent().getName(),
          path.getName()));
    }
    return datePartitions;
  }

  /**
   * Given a base partition and a partition path, return relative path of partition path to the base path.
   */
  public static String getRelativePartitionPath(Path basePath, Path fullPartitionPath) {
    basePath = getPathWithoutSchemeAndAuthority(basePath);
    fullPartitionPath = getPathWithoutSchemeAndAuthority(fullPartitionPath);

    String fullPartitionPathStr = fullPartitionPath.toString();

    if (!fullPartitionPathStr.startsWith(basePath.toString())) {
      throw new IllegalArgumentException("Partition path \"" + fullPartitionPathStr
          + "\" does not belong to base-path \"" + basePath + "\"");
    }

    int partitionStartIndex = fullPartitionPathStr.indexOf(basePath.getName(),
        basePath.getParent() == null ? 0 : basePath.getParent().toString().length());
    // Partition-Path could be empty for non-partitioned tables
    return partitionStartIndex + basePath.getName().length() == fullPartitionPathStr.length() ? ""
        : fullPartitionPathStr.substring(partitionStartIndex + basePath.getName().length() + 1);
  }

  /**
   * Recursively processes all files in the base-path. If excludeMetaFolder is set, the meta-folder and all its subdirs
   * are skipped
   *
   * @param fs File System
   * @param basePathStr Base-Path
   * @param consumer Callback for processing
   * @param excludeMetaFolder Exclude .hoodie folder
   * @throws IOException -
   */
  public static void processFiles(FileSystem fs, String basePathStr, Function consumer,
                                  boolean excludeMetaFolder) throws IOException {
    PathFilter pathFilter = excludeMetaFolder ? getExcludeMetaPathFilter() : ALLOW_ALL_FILTER;
    FileStatus[] topLevelStatuses = fs.listStatus(new Path(basePathStr));
    for (FileStatus child : topLevelStatuses) {
      if (child.isFile()) {
        boolean success = consumer.apply(child);
        if (!success) {
          throw new HoodieException("Failed to process file-status=" + child);
        }
      } else if (pathFilter.accept(child.getPath())) {
        RemoteIterator itr = fs.listFiles(child.getPath(), true);
        while (itr.hasNext()) {
          FileStatus status = itr.next();
          boolean success = consumer.apply(status);
          if (!success) {
            throw new HoodieException("Failed to process file-status=" + status);
          }
        }
      }
    }
  }

  public static List getAllPartitionPaths(HoodieEngineContext engineContext, String basePathStr,
                                                  boolean useFileListingFromMetadata,
                                                  boolean assumeDatePartitioning) {
    HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder()
        .enable(useFileListingFromMetadata)
        .withAssumeDatePartitioning(assumeDatePartitioning)
        .build();
    try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext, metadataConfig, basePathStr)) {
      return tableMetadata.getAllPartitionPaths();
    } catch (Exception e) {
      throw new HoodieException("Error fetching partition paths from metadata table", e);
    }
  }

  public static List getAllPartitionPaths(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig,
                                                  String basePathStr) {
    try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext, metadataConfig, basePathStr)) {
      return tableMetadata.getAllPartitionPaths();
    } catch (Exception e) {
      throw new HoodieException("Error fetching partition paths from metadata table", e);
    }
  }

  public static Map getFilesInPartitions(HoodieEngineContext engineContext,
                                                               HoodieMetadataConfig metadataConfig,
                                                               String basePathStr,
                                                               String[] partitionPaths) {
    try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext, metadataConfig, basePathStr)) {
      return tableMetadata.getAllFilesInPartitions(Arrays.asList(partitionPaths));
    } catch (Exception ex) {
      throw new HoodieException("Error get files in partitions: " + String.join(",", partitionPaths), ex);
    }
  }

  public static String getFileExtension(String fullName) {
    Objects.requireNonNull(fullName);
    String fileName = new File(fullName).getName();
    int dotIndex = fileName.lastIndexOf('.');
    return dotIndex == -1 ? "" : fileName.substring(dotIndex);
  }

  private static PathFilter getExcludeMetaPathFilter() {
    // Avoid listing and including any folders under the metafolder
    return (path) -> !path.toString().contains(HoodieTableMetaClient.METAFOLDER_NAME);
  }

  /**
   * Returns a new unique prefix for creating a file group.
   */
  public static String createNewFileIdPfx() {
    return UUID.randomUUID().toString();
  }

  /**
   * Returns prefix for a file group from fileId.
   */
  public static String getFileIdPfxFromFileId(String fileId) {
    Matcher matcher = PREFIX_BY_FILE_ID_PATTERN.matcher(fileId);
    if (!matcher.find()) {
      throw new HoodieValidationException("Failed to get prefix from " + fileId);
    }
    return matcher.group(1);
  }

  public static String createNewFileId(String idPfx, int id) {
    return String.format("%s-%d", idPfx, id);
  }

  /**
   * Get the file extension from the log file.
   */
  public static String getFileExtensionFromLog(Path logPath) {
    Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName());
    if (!matcher.find()) {
      throw new InvalidHoodiePathException(logPath, "LogFile");
    }
    return matcher.group(3);
  }

  /**
   * Get the first part of the file name in the log file. That will be the fileId. Log file do not have instantTime in
   * the file name.
   */
  public static String getFileIdFromLogPath(Path path) {
    Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
    if (!matcher.find()) {
      throw new InvalidHoodiePathException(path, "LogFile");
    }
    return matcher.group(1);
  }

  /**
   * Check if the file is a base file of a log file. Then get the fileId appropriately.
   */
  public static String getFileIdFromFilePath(Path filePath) {
    if (FSUtils.isLogFile(filePath)) {
      return FSUtils.getFileIdFromLogPath(filePath);
    }
    return FSUtils.getFileId(filePath.getName());
  }

  /**
   * Get the first part of the file name in the log file. That will be the fileId. Log file do not have instantTime in
   * the file name.
   */
  public static String getBaseCommitTimeFromLogPath(Path path) {
    Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
    if (!matcher.find()) {
      throw new InvalidHoodiePathException(path, "LogFile");
    }
    return matcher.group(2);
  }

  /**
   * Get TaskPartitionId used in log-path.
   */
  public static Integer getTaskPartitionIdFromLogPath(Path path) {
    Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
    if (!matcher.find()) {
      throw new InvalidHoodiePathException(path, "LogFile");
    }
    String val = matcher.group(7);
    return val == null ? null : Integer.parseInt(val);
  }

  /**
   * Get Write-Token used in log-path.
   */
  public static String getWriteTokenFromLogPath(Path path) {
    Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
    if (!matcher.find()) {
      throw new InvalidHoodiePathException(path, "LogFile");
    }
    return matcher.group(6);
  }

  /**
   * Get StageId used in log-path.
   */
  public static Integer getStageIdFromLogPath(Path path) {
    Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
    if (!matcher.find()) {
      throw new InvalidHoodiePathException(path, "LogFile");
    }
    String val = matcher.group(8);
    return val == null ? null : Integer.parseInt(val);
  }

  /**
   * Get Task Attempt Id used in log-path.
   */
  public static Integer getTaskAttemptIdFromLogPath(Path path) {
    Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
    if (!matcher.find()) {
      throw new InvalidHoodiePathException(path, "LogFile");
    }
    String val = matcher.group(9);
    return val == null ? null : Integer.parseInt(val);
  }

  /**
   * Get the last part of the file name in the log file and convert to int.
   */
  public static int getFileVersionFromLog(Path logPath) {
    return getFileVersionFromLog(logPath.getName());
  }

  public static int getFileVersionFromLog(String logFileName) {
    Matcher matcher = LOG_FILE_PATTERN.matcher(logFileName);
    if (!matcher.find()) {
      throw new HoodieIOException("Invalid log file name: " + logFileName);
    }
    return Integer.parseInt(matcher.group(4));
  }

  public static String makeLogFileName(String fileId, String logFileExtension, String baseCommitTime, int version,
      String writeToken) {
    String suffix = (writeToken == null)
        ? String.format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version)
        : String.format("%s_%s%s.%d_%s", fileId, baseCommitTime, logFileExtension, version, writeToken);
    return HoodieLogFile.LOG_FILE_PREFIX + suffix;
  }

  public static boolean isBaseFile(Path path) {
    String extension = getFileExtension(path.getName());
    return HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(extension);
  }

  public static boolean isLogFile(Path logPath) {
    return isLogFile(logPath.getName());
  }

  public static boolean isLogFile(String fileName) {
    Matcher matcher = LOG_FILE_PATTERN.matcher(fileName);
    return fileName.contains(".log") && matcher.find();
  }

  /**
   * Returns true if the given path is a Base file or a Log file.
   */
  public static boolean isDataFile(Path path) {
    return isBaseFile(path) || isLogFile(path);
  }

  /**
   * Get the names of all the base and log files in the given partition path.
   */
  public static FileStatus[] getAllDataFilesInPartition(FileSystem fs, Path partitionPath) throws IOException {
    final Set validFileExtensions = Arrays.stream(HoodieFileFormat.values())
        .map(HoodieFileFormat::getFileExtension).collect(Collectors.toCollection(HashSet::new));
    final String logFileExtension = HoodieFileFormat.HOODIE_LOG.getFileExtension();

    try {
      return Arrays.stream(fs.listStatus(partitionPath, path -> {
        String extension = FSUtils.getFileExtension(path.getName());
        return validFileExtensions.contains(extension) || path.getName().contains(logFileExtension);
      })).filter(FileStatus::isFile).toArray(FileStatus[]::new);
    } catch (IOException e) {
      // return empty FileStatus if partition does not exist already
      if (!fs.exists(partitionPath)) {
        return new FileStatus[0];
      } else {
        throw e;
      }
    }
  }

  /**
   * Get the latest log file for the passed in file-id in the partition path
   */
  public static Option getLatestLogFile(FileSystem fs, Path partitionPath, String fileId,
                                                       String logFileExtension, String baseCommitTime) throws IOException {
    return getLatestLogFile(getAllLogFiles(fs, partitionPath, fileId, logFileExtension, baseCommitTime));
  }

  /**
   * Get all the log files for the passed in file-id in the partition path.
   */
  public static Stream getAllLogFiles(FileSystem fs, Path partitionPath, final String fileId,
      final String logFileExtension, final String baseCommitTime) throws IOException {
    try {
      PathFilter pathFilter = path -> path.getName().startsWith("." + fileId) && path.getName().contains(logFileExtension);
      return Arrays.stream(fs.listStatus(partitionPath, pathFilter))
          .map(HoodieLogFile::new)
          .filter(s -> s.getBaseCommitTime().equals(baseCommitTime));
    } catch (FileNotFoundException e) {
      return Stream.of();
    }
  }

  /**
   * Get the latest log version for the fileId in the partition path.
   */
  public static Option> getLatestLogVersion(FileSystem fs, Path partitionPath,
      final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException {
    Option latestLogFile =
        getLatestLogFile(getAllLogFiles(fs, partitionPath, fileId, logFileExtension, baseCommitTime));
    if (latestLogFile.isPresent()) {
      return Option
          .of(Pair.of(latestLogFile.get().getLogVersion(), latestLogFile.get().getLogWriteToken()));
    }
    return Option.empty();
  }

  /**
   * computes the next log version for the specified fileId in the partition path.
   */
  public static int computeNextLogVersion(FileSystem fs, Path partitionPath, final String fileId,
      final String logFileExtension, final String baseCommitTime) throws IOException {
    Option> currentVersionWithWriteToken =
        getLatestLogVersion(fs, partitionPath, fileId, logFileExtension, baseCommitTime);
    // handle potential overflow
    return (currentVersionWithWriteToken.isPresent()) ? currentVersionWithWriteToken.get().getKey() + 1
        : HoodieLogFile.LOGFILE_BASE_VERSION;
  }

  public static int getDefaultBufferSize(final FileSystem fs) {
    return fs.getConf().getInt("io.file.buffer.size", 4096);
  }

  public static Short getDefaultReplication(FileSystem fs, Path path) {
    return fs.getDefaultReplication(path);
  }

  /**
   * When a file was opened and the task died without closing the stream, another task executor cannot open because the
   * existing lease will be active. We will try to recover the lease, from HDFS. If a data node went down, it takes
   * about 10 minutes for the lease to be recovered. But if the client dies, this should be instant.
   */
  public static boolean recoverDFSFileLease(final DistributedFileSystem dfs, final Path p)
      throws IOException, InterruptedException {
    LOG.info("Recover lease on dfs file " + p);
    // initiate the recovery
    boolean recovered = false;
    for (int nbAttempt = 0; nbAttempt < MAX_ATTEMPTS_RECOVER_LEASE; nbAttempt++) {
      LOG.info("Attempt " + nbAttempt + " to recover lease on dfs file " + p);
      recovered = dfs.recoverLease(p);
      if (recovered) {
        break;
      }
      // Sleep for 1 second before trying again. Typically it takes about 2-3 seconds to recover
      // under default settings
      Thread.sleep(1000);
    }
    return recovered;
  }

  public static void createPathIfNotExists(FileSystem fs, Path partitionPath) throws IOException {
    if (!fs.exists(partitionPath)) {
      fs.mkdirs(partitionPath);
    }
  }

  public static Long getSizeInMB(long sizeInBytes) {
    return sizeInBytes / (1024 * 1024);
  }

  public static Path getPartitionPath(String basePath, String partitionPath) {
    if (StringUtils.isNullOrEmpty(partitionPath)) {
      return new Path(basePath);
    }

    // NOTE: We have to chop leading "/" to make sure Hadoop does not treat it like
    //       absolute path
    String properPartitionPath = partitionPath.startsWith("/")
        ? partitionPath.substring(1)
        : partitionPath;
    return getPartitionPath(new CachingPath(basePath), properPartitionPath);
  }

  public static Path getPartitionPath(Path basePath, String partitionPath) {
    // For non-partitioned table, return only base-path
    return StringUtils.isNullOrEmpty(partitionPath) ? basePath : new CachingPath(basePath, partitionPath);
  }

  /**
   * Extracts the file name from the relative path based on the table base path.  For example:
   * "/2022/07/29/file1.parquet", "/2022/07/29" -> "file1.parquet"
   * "2022/07/29/file2.parquet", "2022/07/29" -> "file2.parquet"
   * "/file3.parquet", "" -> "file3.parquet"
   * "file4.parquet", "" -> "file4.parquet"
   *
   * @param filePathWithPartition the relative file path based on the table base path.
   * @param partition             the relative partition path.  For partitioned table, `partition` contains the relative partition path;
   *                              for non-partitioned table, `partition` is empty
   * @return Extracted file name in String.
   */
  public static String getFileName(String filePathWithPartition, String partition) {
    int offset = StringUtils.isNullOrEmpty(partition)
        ? (filePathWithPartition.startsWith("/") ? 1 : 0) : partition.length() + 1;
    return filePathWithPartition.substring(offset);
  }

  /**
   * Get DFS full partition path (e.g. hdfs://ip-address:8020:/)
   */
  public static String getDFSFullPartitionPath(FileSystem fs, Path fullPartitionPath) {
    return fs.getUri() + fullPartitionPath.toUri().getRawPath();
  }

  /**
   * This is due to HUDI-140 GCS has a different behavior for detecting EOF during seek().
   *
   * @param fs fileSystem instance.
   * @return true if the inputstream or the wrapped one is of type GoogleHadoopFSInputStream
   */
  public static boolean isGCSFileSystem(FileSystem fs) {
    return fs.getScheme().equals(StorageSchemes.GCS.getScheme());
  }

  /**
   * Chdfs will throw {@code IOException} instead of {@code EOFException}. It will cause error in isBlockCorrupted().
   * Wrapped by {@code BoundedFsDataInputStream}, to check whether the desired offset is out of the file size in advance.
   */
  public static boolean isCHDFileSystem(FileSystem fs) {
    return StorageSchemes.CHDFS.getScheme().equals(fs.getScheme());
  }

  public static Configuration registerFileSystem(Path file, Configuration conf) {
    Configuration returnConf = new Configuration(conf);
    String scheme = FSUtils.getFs(file.toString(), conf).getScheme();
    returnConf.set("fs." + HoodieWrapperFileSystem.getHoodieScheme(scheme) + ".impl",
        HoodieWrapperFileSystem.class.getName());
    return returnConf;
  }

  /**
   * Get the FS implementation for this table.
   * @param path  Path String
   * @param hadoopConf  Serializable Hadoop Configuration
   * @param consistencyGuardConfig Consistency Guard Config
   * @return HoodieWrapperFileSystem
   */
  public static HoodieWrapperFileSystem getFs(String path, SerializableConfiguration hadoopConf,
      ConsistencyGuardConfig consistencyGuardConfig) {
    FileSystem fileSystem = FSUtils.getFs(path, hadoopConf.newCopy());
    return new HoodieWrapperFileSystem(fileSystem,
        consistencyGuardConfig.isConsistencyCheckEnabled()
            ? new FailSafeConsistencyGuard(fileSystem, consistencyGuardConfig)
            : new NoOpConsistencyGuard());
  }

  /**
   * Helper to filter out paths under metadata folder when running fs.globStatus.
   * @param fs  File System
   * @param globPath Glob Path
   * @return the file status list of globPath exclude the meta folder
   * @throws IOException when having trouble listing the path
   */
  public static List getGlobStatusExcludingMetaFolder(FileSystem fs, Path globPath) throws IOException {
    FileStatus[] statuses = fs.globStatus(globPath);
    return Arrays.stream(statuses)
        .filter(fileStatus -> !fileStatus.getPath().toString().contains(HoodieTableMetaClient.METAFOLDER_NAME))
        .collect(Collectors.toList());
  }

  /**
   * Deletes a directory by deleting sub-paths in parallel on the file system.
   *
   * @param hoodieEngineContext {@code HoodieEngineContext} instance
   * @param fs file system
   * @param dirPath directory path
   * @param parallelism parallelism to use for sub-paths
   * @return {@code true} if the directory is delete; {@code false} otherwise.
   */
  public static boolean deleteDir(
      HoodieEngineContext hoodieEngineContext, FileSystem fs, Path dirPath, int parallelism) {
    try {
      if (fs.exists(dirPath)) {
        FSUtils.parallelizeSubPathProcess(hoodieEngineContext, fs, dirPath, parallelism, e -> true,
            pairOfSubPathAndConf -> deleteSubPath(
                pairOfSubPathAndConf.getKey(), pairOfSubPathAndConf.getValue(), true)
        );
        boolean result = fs.delete(dirPath, false);
        LOG.info("Removed directory at " + dirPath);
        return result;
      }
    } catch (IOException ioe) {
      throw new HoodieIOException(ioe.getMessage(), ioe);
    }
    return false;
  }

  /**
   * Processes sub-path in parallel.
   *
   * @param hoodieEngineContext {@code HoodieEngineContext} instance
   * @param fs file system
   * @param dirPath directory path
   * @param parallelism parallelism to use for sub-paths
   * @param subPathPredicate predicate to use to filter sub-paths for processing
   * @param pairFunction actual processing logic for each sub-path
   * @param  type of result to return for each sub-path
   * @return a map of sub-path to result of the processing
   */
  public static  Map parallelizeSubPathProcess(
      HoodieEngineContext hoodieEngineContext, FileSystem fs, Path dirPath, int parallelism,
      Predicate subPathPredicate, SerializableFunction, T> pairFunction) {
    Map result = new HashMap<>();
    try {
      FileStatus[] fileStatuses = fs.listStatus(dirPath);
      List subPaths = Arrays.stream(fileStatuses)
          .filter(subPathPredicate)
          .map(fileStatus -> fileStatus.getPath().toString())
          .collect(Collectors.toList());
      result = parallelizeFilesProcess(hoodieEngineContext, fs, parallelism, pairFunction, subPaths);
    } catch (IOException ioe) {
      throw new HoodieIOException(ioe.getMessage(), ioe);
    }
    return result;
  }

  public static  Map parallelizeFilesProcess(
      HoodieEngineContext hoodieEngineContext,
      FileSystem fs,
      int parallelism,
      SerializableFunction, T> pairFunction,
      List subPaths) {
    Map result = new HashMap<>();
    if (subPaths.size() > 0) {
      SerializableConfiguration conf = new SerializableConfiguration(fs.getConf());
      int actualParallelism = Math.min(subPaths.size(), parallelism);

      hoodieEngineContext.setJobStatus(FSUtils.class.getSimpleName(),
          "Parallel listing paths " + String.join(",", subPaths));

      result = hoodieEngineContext.mapToPair(subPaths,
          subPath -> new ImmutablePair<>(subPath, pairFunction.apply(new ImmutablePair<>(subPath, conf))),
          actualParallelism);
    }
    return result;
  }

  /**
   * Deletes a sub-path.
   *
   * @param subPathStr sub-path String
   * @param conf       serializable config
   * @param recursive  is recursive or not
   * @return {@code true} if the sub-path is deleted; {@code false} otherwise.
   */
  public static boolean deleteSubPath(String subPathStr, SerializableConfiguration conf, boolean recursive) {
    try {
      Path subPath = new Path(subPathStr);
      FileSystem fileSystem = subPath.getFileSystem(conf.get());
      return fileSystem.delete(subPath, recursive);
    } catch (IOException e) {
      throw new HoodieIOException(e.getMessage(), e);
    }
  }

  /**
   * Lists file status at a certain level in the directory hierarchy.
   * 

* E.g., given "/tmp/hoodie_table" as the rootPath, and 3 as the expected level, * this method gives back the {@link FileStatus} of all files under * "/tmp/hoodie_table/[*]/[*]/[*]/" folders. * * @param hoodieEngineContext {@link HoodieEngineContext} instance. * @param fs {@link FileSystem} instance. * @param rootPath Root path for the file listing. * @param expectLevel Expected level of directory hierarchy for files to be added. * @param parallelism Parallelism for the file listing. * @return A list of file status of files at the level. */ public static List getFileStatusAtLevel( HoodieEngineContext hoodieEngineContext, FileSystem fs, Path rootPath, int expectLevel, int parallelism) { List levelPaths = new ArrayList<>(); List result = new ArrayList<>(); levelPaths.add(rootPath.toString()); for (int i = 0; i <= expectLevel; i++) { result = FSUtils.parallelizeFilesProcess(hoodieEngineContext, fs, parallelism, pairOfSubPathAndConf -> { Path path = new Path(pairOfSubPathAndConf.getKey()); try { FileSystem fileSystem = path.getFileSystem(pairOfSubPathAndConf.getValue().get()); return Arrays.stream(fileSystem.listStatus(path)) .collect(Collectors.toList()); } catch (IOException e) { throw new HoodieIOException("Failed to list " + path, e); } }, levelPaths) .values().stream() .flatMap(list -> list.stream()).collect(Collectors.toList()); if (i < expectLevel) { levelPaths = result.stream() .filter(FileStatus::isDirectory) .map(fileStatus -> fileStatus.getPath().toString()) .collect(Collectors.toList()); } } return result; } /** * Serializable function interface. * * @param Input value type. * @param Output value type. */ public interface SerializableFunction extends Function, Serializable { } private static Option getLatestLogFile(Stream logFiles) { return Option.fromJavaOptional(logFiles.min(HoodieLogFile.getReverseLogFileComparator())); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy