org.apache.iceberg.data.TableMigrationUtil
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.data;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.Metrics;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.avro.Avro;
import org.apache.iceberg.hadoop.HadoopInputFile;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.orc.OrcMetrics;
import org.apache.iceberg.parquet.ParquetUtil;
import org.apache.iceberg.relocated.com.google.common.util.concurrent.MoreExecutors;
import org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.iceberg.util.Tasks;

public class TableMigrationUtil {

  private static final PathFilter HIDDEN_PATH_FILTER =
      p -> !p.getName().startsWith("_") && !p.getName().startsWith(".");

  private TableMigrationUtil() {
  }

  /**
   * Returns the data files in a partition by listing the partition location.
   *
   * <p>For Parquet and ORC partitions, this will read metrics from the file footer. For Avro
   * partitions, only the row count is populated; all other metrics are left null.
   *
   * <p>Note: certain metrics, like NaN counts, that are only supported by Iceberg file writers but
   * not by file footers, will not be populated.
   *
   * @param partition partition key, e.g., "a=1/b=2"
   * @param uri partition location URI
   * @param format partition format: avro, parquet, or orc
   * @param spec a partition spec
   * @param conf a Hadoop conf
   * @param metricsConfig a metrics conf
   * @param mapping a name mapping
   * @return a List of DataFile
   */
  public static List<DataFile> listPartition(Map<String, String> partition, String uri, String format,
                                             PartitionSpec spec, Configuration conf, MetricsConfig metricsConfig,
                                             NameMapping mapping) {
    return listPartition(partition, uri, format, spec, conf, metricsConfig, mapping, 1);
  }

  public static List<DataFile> listPartition(Map<String, String> partitionPath, String partitionUri, String format,
                                             PartitionSpec spec, Configuration conf, MetricsConfig metricsSpec,
                                             NameMapping mapping, int parallelism) {
    try {
      // Rebuild the Hive-style partition key, e.g. "a=1/b=2", from the spec's partition field names
      String partitionKey = spec.fields().stream()
          .map(PartitionField::name)
          .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
          .collect(Collectors.joining("/"));

      Path partition = new Path(partitionUri);
      FileSystem fs = partition.getFileSystem(conf);

      // List the partition location, keeping only visible data files
      List<FileStatus> fileStatus = Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
          .filter(FileStatus::isFile)
          .collect(Collectors.toList());

      DataFile[] datafiles = new DataFile[fileStatus.size()];
      Tasks.Builder<Integer> task = Tasks.range(fileStatus.size())
          .stopOnFailure()
          .throwFailureWhenFinished();

      // Read file footers on a thread pool when a parallelism greater than 1 is requested
      if (parallelism > 1) {
        task.executeWith(migrationService(parallelism));
      }

      if (format.contains("avro")) {
        task.run(index -> {
          Metrics metrics = getAvroMetrics(fileStatus.get(index).getPath(), conf);
          datafiles[index] = buildDataFile(fileStatus.get(index), partitionKey, spec, metrics, "avro");
        });
      } else if (format.contains("parquet")) {
        task.run(index -> {
          Metrics metrics = getParquetMetrics(fileStatus.get(index).getPath(), conf, metricsSpec, mapping);
          datafiles[index] = buildDataFile(fileStatus.get(index), partitionKey, spec, metrics, "parquet");
        });
      } else if (format.contains("orc")) {
        task.run(index -> {
          Metrics metrics = getOrcMetrics(fileStatus.get(index).getPath(), conf, metricsSpec, mapping);
          datafiles[index] = buildDataFile(fileStatus.get(index), partitionKey, spec, metrics, "orc");
        });
      } else {
        throw new UnsupportedOperationException("Unknown partition format: " + format);
      }

      return Arrays.asList(datafiles);
    } catch (IOException e) {
      throw new RuntimeException("Unable to list files in partition: " + partitionUri, e);
    }
  }

  private static Metrics getAvroMetrics(Path path, Configuration conf) {
    try {
      InputFile file = HadoopInputFile.fromPath(path, conf);
      long rowCount = Avro.rowCount(file);
      // Avro files carry no column-level stats in their footers, so only the row count is recorded
      return new Metrics(rowCount, null, null, null, null);
    } catch (UncheckedIOException e) {
      throw new RuntimeException("Unable to read Avro file: " + path, e);
    }
  }

  private static Metrics getParquetMetrics(Path path, Configuration conf,
                                           MetricsConfig metricsSpec, NameMapping mapping) {
    try {
      InputFile file = HadoopInputFile.fromPath(path, conf);
      return ParquetUtil.fileMetrics(file, metricsSpec, mapping);
    } catch (UncheckedIOException e) {
      throw new RuntimeException("Unable to read the metrics of the Parquet file: " + path, e);
    }
  }

  private static Metrics getOrcMetrics(Path path, Configuration conf,
                                       MetricsConfig metricsSpec, NameMapping mapping) {
    try {
      return OrcMetrics.fromInputFile(HadoopInputFile.fromPath(path, conf), metricsSpec, mapping);
    } catch (UncheckedIOException e) {
      throw new RuntimeException("Unable to read the metrics of the ORC file: " + path, e);
    }
  }

  private static DataFile buildDataFile(FileStatus stat, String partitionKey,
                                        PartitionSpec spec, Metrics metrics, String format) {
    return DataFiles.builder(spec)
        .withPath(stat.getPath().toString())
        .withFormat(format)
        .withFileSizeInBytes(stat.getLen())
        .withMetrics(metrics)
        .withPartitionPath(partitionKey)
        .build();
  }

  private static ExecutorService migrationService(int parallelism) {
    return MoreExecutors.getExitingExecutorService(
        (ThreadPoolExecutor) Executors.newFixedThreadPool(
            parallelism,
            new ThreadFactoryBuilder()
                .setNameFormat("table-migration-%d")
                .build()));
  }
}
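
As a rough usage sketch (not part of the class above), the example below lists one Hive-style Parquet partition and appends the resulting DataFile entries to an Iceberg table. The table location, the partition path "/warehouse/hive/db/tbl/a=1/b=2", the partition values, and the choice of HadoopTables as the catalog are all placeholder assumptions for illustration.

import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;
import org.apache.iceberg.data.TableMigrationUtil;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

public class ListPartitionExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();

    // Hypothetical Iceberg table managed by HadoopTables; any catalog that returns a Table works
    Table table = new HadoopTables(conf).load("/warehouse/iceberg/db/tbl");
    PartitionSpec spec = table.spec();

    // Partition values keyed by partition field name, matching the "a=1/b=2" layout on disk
    Map<String, String> partition = ImmutableMap.of("a", "1", "b", "2");

    List<DataFile> files = TableMigrationUtil.listPartition(
        partition,
        "/warehouse/hive/db/tbl/a=1/b=2",   // hypothetical partition location being imported
        "parquet",                          // footer metrics are read for parquet and orc
        spec,
        conf,
        MetricsConfig.getDefault(),         // default column metrics settings
        null,                               // no name mapping
        4);                                 // read file footers with 4 threads

    // Register the discovered files with the Iceberg table
    AppendFiles append = table.newAppend();
    files.forEach(append::appendFile);
    append.commit();
  }
}

Passing a parallelism of 4 runs the per-file footer reads on the pooled executor created by migrationService; a value of 1 keeps everything on the calling thread.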