All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.hudi.table.action.bootstrap.BootstrapUtils Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.table.action.bootstrap;

import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.common.bootstrap.FileStatusUtils;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.collection.Pair;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.OptionalInt;
import java.util.stream.Collectors;

public class BootstrapUtils {

  /**
   * Returns leaf folders with files under a path.
   * @param metaClient Hoodie table metadata client
   * @param fs  File System
   * @param context JHoodieEngineContext
   * @return list of partition paths with files under them.
   * @throws IOException
   */
  public static List>> getAllLeafFoldersWithFiles(HoodieTableMetaClient metaClient,
      FileSystem fs, String basePathStr, HoodieEngineContext context) throws IOException {
    final Path basePath = new Path(basePathStr);
    final String baseFileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
    final Map> levelToPartitions = new HashMap<>();
    final Map> partitionToFiles = new HashMap<>();
    PathFilter filePathFilter = getFilePathFilter(baseFileExtension);
    PathFilter metaPathFilter = getExcludeMetaPathFilter();

    FileStatus[] topLevelStatuses = fs.listStatus(basePath);
    List subDirectories = new ArrayList<>();

    List>> result = new ArrayList<>();

    for (FileStatus topLevelStatus: topLevelStatuses) {
      if (topLevelStatus.isFile() && filePathFilter.accept(topLevelStatus.getPath())) {
        String relativePath = FSUtils.getRelativePartitionPath(basePath, topLevelStatus.getPath().getParent());
        Integer level = (int) relativePath.chars().filter(ch -> ch == '/').count();
        HoodieFileStatus hoodieFileStatus = FileStatusUtils.fromFileStatus(topLevelStatus);
        result.add(Pair.of(hoodieFileStatus, Pair.of(level, relativePath)));
      } else if (topLevelStatus.isDirectory() && metaPathFilter.accept(topLevelStatus.getPath())) {
        subDirectories.add(topLevelStatus.getPath().toString());
      }
    }

    if (subDirectories.size() > 0) {
      result.addAll(context.flatMap(subDirectories, directory -> {
        PathFilter pathFilter = getFilePathFilter(baseFileExtension);
        Path path = new Path(directory);
        FileSystem fileSystem = path.getFileSystem(new Configuration());
        RemoteIterator itr = fileSystem.listFiles(path, true);
        List>> res = new ArrayList<>();
        while (itr.hasNext()) {
          FileStatus status = itr.next();
          if (pathFilter.accept(status.getPath())) {
            String relativePath = FSUtils.getRelativePartitionPath(new Path(basePathStr), status.getPath().getParent());
            Integer level = (int) relativePath.chars().filter(ch -> ch == '/').count();
            HoodieFileStatus hoodieFileStatus = FileStatusUtils.fromFileStatus(status);
            res.add(Pair.of(hoodieFileStatus, Pair.of(level, relativePath)));
          }
        }
        return res.stream();
      }, subDirectories.size()));
    }

    result.forEach(val -> {
      String relativePath = val.getRight().getRight();
      List statusList = partitionToFiles.get(relativePath);
      if (null == statusList) {
        Integer level = val.getRight().getLeft();
        List dirs = levelToPartitions.get(level);
        if (null == dirs) {
          dirs = new ArrayList<>();
          levelToPartitions.put(level, dirs);
        }
        dirs.add(relativePath);
        statusList = new ArrayList<>();
        partitionToFiles.put(relativePath, statusList);
      }
      statusList.add(val.getLeft());
    });

    OptionalInt maxLevelOpt = levelToPartitions.keySet().stream().mapToInt(x -> x).max();
    int maxLevel = maxLevelOpt.orElse(-1);
    return maxLevel >= 0 ? levelToPartitions.get(maxLevel).stream()
            .map(d -> Pair.of(d, partitionToFiles.get(d))).collect(Collectors.toList()) : new ArrayList<>();
  }

  private static PathFilter getFilePathFilter(String baseFileExtension) {
    return (path) -> {
      return path.getName().endsWith(baseFileExtension);
    };
  }

  private static PathFilter getExcludeMetaPathFilter() {
    // Avoid listing and including any folders under the meta folder
    return (path) -> !path.toString().contains(HoodieTableMetaClient.METAFOLDER_NAME);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy