org.apache.iceberg.data.TableMigrationUtil

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.data;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.Metrics;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.avro.Avro;
import org.apache.iceberg.hadoop.HadoopInputFile;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.orc.OrcMetrics;
import org.apache.iceberg.parquet.ParquetUtil;
import org.apache.iceberg.relocated.com.google.common.util.concurrent.MoreExecutors;
import org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.iceberg.util.Tasks;

public class TableMigrationUtil {
  private static final PathFilter HIDDEN_PATH_FILTER =
      p -> !p.getName().startsWith("_") && !p.getName().startsWith(".");

  private TableMigrationUtil() {
  }

  /**
   * Returns the data files in a partition by listing the partition location.
   * <p>
   * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions,
   * metrics are set to null.
   * <p>
   * Note: certain metrics, like NaN counts, that are only supported by Iceberg file writers but not file footers,
   * will not be populated.
   *
   * @param partition partition key, e.g., "a=1/b=2"
   * @param uri partition location URI
   * @param format partition format, avro, parquet or orc
   * @param spec a partition spec
   * @param conf a Hadoop conf
   * @param metricsConfig a metrics conf
   * @param mapping a name mapping
   * @return a List of DataFile
   */
  public static List<DataFile> listPartition(Map<String, String> partition, String uri, String format,
                                             PartitionSpec spec, Configuration conf, MetricsConfig metricsConfig,
                                             NameMapping mapping) {
    return listPartition(partition, uri, format, spec, conf, metricsConfig, mapping, 1);
  }

  public static List<DataFile> listPartition(Map<String, String> partitionPath, String partitionUri, String format,
                                             PartitionSpec spec, Configuration conf, MetricsConfig metricsSpec,
                                             NameMapping mapping, int parallelism) {
    try {
      // Build the Hive-style partition path, e.g. "a=1/b=2", from the spec's field names.
      String partitionKey = spec.fields().stream()
          .map(PartitionField::name)
          .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
          .collect(Collectors.joining("/"));

      Path partition = new Path(partitionUri);
      FileSystem fs = partition.getFileSystem(conf);
      List<FileStatus> fileStatus = Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
          .filter(FileStatus::isFile)
          .collect(Collectors.toList());
      DataFile[] datafiles = new DataFile[fileStatus.size()];
      Tasks.Builder<Integer> task = Tasks.range(fileStatus.size())
          .stopOnFailure()
          .throwFailureWhenFinished();

      // Read file footers in parallel when more than one thread is requested.
      if (parallelism > 1) {
        task.executeWith(migrationService(parallelism));
      }

      if (format.contains("avro")) {
        task.run(index -> {
          Metrics metrics = getAvroMetrics(fileStatus.get(index).getPath(), conf);
          datafiles[index] = buildDataFile(fileStatus.get(index), partitionKey, spec, metrics, "avro");
        });
      } else if (format.contains("parquet")) {
        task.run(index -> {
          Metrics metrics = getParquetMetrics(fileStatus.get(index).getPath(), conf, metricsSpec, mapping);
          datafiles[index] = buildDataFile(fileStatus.get(index), partitionKey, spec, metrics, "parquet");
        });
      } else if (format.contains("orc")) {
        task.run(index -> {
          Metrics metrics = getOrcMetrics(fileStatus.get(index).getPath(), conf, metricsSpec, mapping);
          datafiles[index] = buildDataFile(fileStatus.get(index), partitionKey, spec, metrics, "orc");
        });
      } else {
        throw new UnsupportedOperationException("Unknown partition format: " + format);
      }

      return Arrays.asList(datafiles);
    } catch (IOException e) {
      throw new RuntimeException("Unable to list files in partition: " + partitionUri, e);
    }
  }

  private static Metrics getAvroMetrics(Path path, Configuration conf) {
    try {
      InputFile file = HadoopInputFile.fromPath(path, conf);
      long rowCount = Avro.rowCount(file);
      return new Metrics(rowCount, null, null, null, null);
    } catch (UncheckedIOException e) {
      throw new RuntimeException("Unable to read Avro file: " + path, e);
    }
  }

  private static Metrics getParquetMetrics(Path path, Configuration conf,
                                           MetricsConfig metricsSpec, NameMapping mapping) {
    try {
      InputFile file = HadoopInputFile.fromPath(path, conf);
      return ParquetUtil.fileMetrics(file, metricsSpec, mapping);
    } catch (UncheckedIOException e) {
      throw new RuntimeException("Unable to read the metrics of the Parquet file: " + path, e);
    }
  }

  private static Metrics getOrcMetrics(Path path, Configuration conf,
                                       MetricsConfig metricsSpec, NameMapping mapping) {
    try {
      return OrcMetrics.fromInputFile(HadoopInputFile.fromPath(path, conf), metricsSpec, mapping);
    } catch (UncheckedIOException e) {
      throw new RuntimeException("Unable to read the metrics of the Orc file: " + path, e);
    }
  }

  private static DataFile buildDataFile(FileStatus stat, String partitionKey,
                                        PartitionSpec spec, Metrics metrics, String format) {
    return DataFiles.builder(spec)
        .withPath(stat.getPath().toString())
        .withFormat(format)
        .withFileSizeInBytes(stat.getLen())
        .withMetrics(metrics)
        .withPartitionPath(partitionKey)
        .build();
  }

  private static ExecutorService migrationService(int concurrentDeletes) {
    return MoreExecutors.getExitingExecutorService(
        (ThreadPoolExecutor) Executors.newFixedThreadPool(
            concurrentDeletes,
            new ThreadFactoryBuilder()
                .setNameFormat("table-migration-%d")
                .build()));
  }
}
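
For context, a minimal usage sketch follows. It is not part of the listed source: the table handle, warehouse path, and partition values ("a=1/b=2") are hypothetical placeholders, and it assumes an existing Iceberg Table partitioned by identity columns a and b whose data lives in a Hive-style directory of Parquet files. Only TableMigrationUtil.listPartition and the standard Iceberg and Hadoop classes are real APIs here.

// Minimal sketch: list one Hive-style Parquet partition and append its files to an Iceberg table.
// The path "hdfs://namenode/warehouse/db/tbl/a=1/b=2" and the partition values are hypothetical.
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.Table;
import org.apache.iceberg.data.TableMigrationUtil;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

public class PartitionImportExample {
  public static void importPartition(Table table, Configuration conf) {
    // Partition key values for a directory laid out as .../a=1/b=2.
    Map<String, String> partition = ImmutableMap.of("a", "1", "b", "2");

    List<DataFile> files = TableMigrationUtil.listPartition(
        partition,
        "hdfs://namenode/warehouse/db/tbl/a=1/b=2",   // partition location (placeholder)
        "parquet",                                    // file format within the partition
        table.spec(),
        conf,
        MetricsConfig.getDefault(),
        null,                                         // no name mapping
        4);                                           // read file footers with up to 4 threads

    // Register the discovered data files with the table in a single append commit.
    AppendFiles append = table.newAppend();
    files.forEach(append::appendFile);
    append.commit();
  }
}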