com.netease.arctic.hive.op.ReplaceHivePartitions
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.netease.arctic.hive.op;

import com.netease.arctic.hive.HMSClientPool;
import com.netease.arctic.hive.HiveTableProperties;
import com.netease.arctic.hive.exceptions.CannotAlterHiveLocationException;
import com.netease.arctic.hive.table.UnkeyedHiveTable;
import com.netease.arctic.hive.utils.HivePartitionUtil;
import com.netease.arctic.hive.utils.HiveTableUtil;
import com.netease.arctic.op.UpdatePartitionProperties;
import com.netease.arctic.utils.TableFileUtils;
import com.netease.arctic.utils.TablePropertyUtil;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.Table;
import com.netease.arctic.shade.org.apache.iceberg.DataFile;
import com.netease.arctic.shade.org.apache.iceberg.ReplacePartitions;
import com.netease.arctic.shade.org.apache.iceberg.Snapshot;
import com.netease.arctic.shade.org.apache.iceberg.Transaction;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.base.Joiner;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Lists;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Maps;
import com.netease.arctic.shade.org.apache.iceberg.types.Types;
import com.netease.arctic.shade.org.apache.iceberg.util.StructLikeMap;
import com.netease.arctic.shade.org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;
import java.util.stream.Collectors;

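/**
 * A {@link ReplacePartitions} implementation for Hive-compatible tables: it delegates the
 * Iceberg replace-partitions operation and, for files written under the table's hive location,
 * also rewrites or creates the matching Hive partitions (or updates the table location for
 * unpartitioned tables) through the Hive metastore client after the Iceberg commit.
 */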
public class ReplaceHivePartitions implements ReplacePartitions {

  private static final Logger LOG = LoggerFactory.getLogger(ReplaceHivePartitions.class);

  private final Transaction transaction;
  private final boolean insideTransaction;
  private final ReplacePartitions delegate;

  private final HMSClientPool hmsClient;
  private final HMSClientPool transactionalHMSClient;

  private final UnkeyedHiveTable table;
  private final List<DataFile> addFiles = Lists.newArrayList();
  private final String db;
  private final String tableName;
  private final Table hiveTable;

  private final StructLikeMap<Partition> rewritePartitions;
  private final StructLikeMap<Partition> newPartitions;
  private String unpartitionTableLocation;
  private int commitTimestamp; // in seconds

  public ReplaceHivePartitions(
      Transaction transaction,
      boolean insideTransaction,
      UnkeyedHiveTable table,
      HMSClientPool client,
      HMSClientPool transactionalClient) {
    this.transaction = transaction;
    this.insideTransaction = insideTransaction;
    this.delegate = transaction.newReplacePartitions();
    this.hmsClient = client;
    this.transactionalHMSClient = transactionalClient;
    this.table = table;
    this.db = table.id().getDatabase();
    this.tableName = table.id().getTableName();
    try {
      this.hiveTable = client.run(c -> c.getTable(db, tableName));
    } catch (TException | InterruptedException e) {
      throw new RuntimeException(e);
    }
    this.rewritePartitions = StructLikeMap.create(table.spec().partitionType());
    this.newPartitions = StructLikeMap.create(table.spec().partitionType());
  }

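  /**
   * Delegates to the wrapped Iceberg operation and additionally tracks files that live under
   * the table's hive location, so the corresponding Hive metadata can be updated on commit.
   */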
  @Override
  public ReplacePartitions addFile(DataFile file) {
    delegate.addFile(file);
    String tableLocation = table.hiveLocation();
    String dataFileLocation = file.path().toString();
    if (dataFileLocation.toLowerCase().contains(tableLocation.toLowerCase())) {
      // only track files located under the hive location
      this.addFiles.add(file);
    }
    return this;
  }

  @Override
  public ReplacePartitions validateAppendOnly() {
    delegate.validateAppendOnly();
    return this;
  }

  @Override
  public ReplacePartitions set(String property, String value) {
    delegate.set(property, value);
    return this;
  }

  @Override
  public ReplacePartitions deleteWith(Consumer<String> deleteFunc) {
    delegate.deleteWith(deleteFunc);
    return this;
  }

  @Override
  public ReplacePartitions stageOnly() {
    delegate.stageOnly();
    return this;
  }

  @Override
  public Snapshot apply() {
    return delegate.apply();
  }

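  /**
   * Prepares the Hive changes (new table location or rewritten/new partitions), commits the
   * Iceberg operation and partition properties, and finally synchronizes the Hive metastore.
   * Nothing is committed if no file under the hive location was added.
   */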
  @Override
  public void commit() {
    if (!addFiles.isEmpty()) {
      commitTimestamp = (int) (System.currentTimeMillis() / 1000);
      if (table.spec().isUnpartitioned()) {
        generateUnpartitionTableLocation();
      } else {
        applyHivePartitions();
      }

      delegate.commit();
      commitPartitionProperties();
      if (!insideTransaction) {
        transaction.commitTransaction();
      }

      if (table.spec().isUnpartitioned()) {
        commitUnPartitionedTable();
      } else {
        commitPartitionedTable();
      }
    }
  }

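  /**
   * Stores the hive location and the commit time as Iceberg partition properties, either for
   * the single unpartitioned entry or for every rewritten and newly created partition.
   */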
  private void commitPartitionProperties() {
    UpdatePartitionProperties updatePartitionProperties = table.updatePartitionProperties(transaction);
    if (table.spec().isUnpartitioned() && unpartitionTableLocation != null) {
      updatePartitionProperties.set(TablePropertyUtil.EMPTY_STRUCT,
          HiveTableProperties.PARTITION_PROPERTIES_KEY_HIVE_LOCATION, unpartitionTableLocation);
      updatePartitionProperties.set(TablePropertyUtil.EMPTY_STRUCT,
          HiveTableProperties.PARTITION_PROPERTIES_KEY_TRANSIENT_TIME, commitTimestamp + "");
    } else {
      rewritePartitions.forEach((partitionData, partition) -> {
        updatePartitionProperties.set(partitionData, HiveTableProperties.PARTITION_PROPERTIES_KEY_HIVE_LOCATION,
            partition.getSd().getLocation());
        updatePartitionProperties.set(partitionData, HiveTableProperties.PARTITION_PROPERTIES_KEY_TRANSIENT_TIME,
            commitTimestamp + "");
      });
      newPartitions.forEach((partitionData, partition) -> {
        updatePartitionProperties.set(partitionData, HiveTableProperties.PARTITION_PROPERTIES_KEY_HIVE_LOCATION,
            partition.getSd().getLocation());
        updatePartitionProperties.set(partitionData, HiveTableProperties.PARTITION_PROPERTIES_KEY_TRANSIENT_TIME,
            commitTimestamp + "");
      });
    }
    updatePartitionProperties.commit();
  }

  @Override
  public Object updateEvent() {
    return delegate.updateEvent();
  }

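  /**
   * Groups the added files by Hive partition value, deletes orphan files under each partition
   * location, validates that each group shares one location, and prepares the Hive
   * {@link Partition} objects to rewrite (existing partitions) or create (new partitions).
   */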
  private void applyHivePartitions() {
    Types.StructType partitionSchema = table.spec().partitionType();

    // partitionValue string -> partition location / data files / partition values
    Map<String, String> partitionLocationMap = Maps.newHashMap();
    Map<String, List<DataFile>> partitionDataFileMap = Maps.newHashMap();
    Map<String, List<String>> partitionValueMap = Maps.newHashMap();

    for (DataFile d : addFiles) {
      List<String> partitionValues = HivePartitionUtil.partitionValuesAsList(d.partition(), partitionSchema);
      String value = Joiner.on("/").join(partitionValues);
      String location = TableFileUtils.getFileDir(d.path().toString());
      partitionLocationMap.put(value, location);
      partitionDataFileMap.computeIfAbsent(value, ignored -> Lists.newArrayList()).add(d);
      partitionValueMap.put(value, partitionValues);
    }

    partitionLocationMap.forEach((k, v) -> checkOrphanFilesAndDelete(v, partitionDataFileMap.get(k)));
    partitionLocationMap.forEach((k, v) -> checkDataFileInSameLocation(v, partitionDataFileMap.get(k)));

    for (String val : partitionValueMap.keySet()) {
      List<String> values = partitionValueMap.get(val);
      String location = partitionLocationMap.get(val);
      List<DataFile> dataFiles = partitionDataFileMap.get(val);

      try {
        Partition partition = hmsClient.run(c -> c.getPartition(db, tableName, values));
        HivePartitionUtil.rewriteHivePartitions(partition, location, dataFiles, commitTimestamp);
        rewritePartitions.put(dataFiles.get(0).partition(), partition);
      } catch (NoSuchObjectException e) {
        Partition p = HivePartitionUtil.newPartition(hiveTable, values, location, dataFiles, commitTimestamp);
        newPartitions.put(dataFiles.get(0).partition(), p);
      } catch (TException | InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
  }

  /**
   * Check the files under the given partition location and delete any orphan files
   * that are not part of the data files being committed.
   *
   * @param partitionLocation location of the hive partition
   * @param dataFiles         data files expected to remain under the partition location
   */
  private void checkOrphanFilesAndDelete(String partitionLocation, List<DataFile> dataFiles) {
    List<String> filePathCollect = dataFiles.stream()
        .map(dataFile -> dataFile.path().toString())
        .collect(Collectors.toList());
    List<FileStatus> existingFiles = table.io().list(partitionLocation);
    for (FileStatus fileStatus : existingFiles) {
      if (!filePathCollect.contains(fileStatus.getPath().toString())) {
        table.io().deleteFile(fileStatus.getPath().toString());
        LOG.warn("Delete orphan file path: {}", fileStatus.getPath().toString());
      }
    }
  }

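  /**
   * Points the unpartitioned Hive table at the new data location and refreshes its table
   * properties through the transactional HMS client.
   */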
  private void commitUnPartitionedTable() {
    if (!addFiles.isEmpty()) {
      final String newDataLocation = TableFileUtils.getFileDir(addFiles.get(0).path().toString());
      try {
        transactionalHMSClient.run(c -> {
          Table tbl = c.getTable(db, tableName);
          tbl.getSd().setLocation(newDataLocation);
          // apply the generated properties to the table object that is passed to alterTable
          HiveTableUtil.generateTableProperties(commitTimestamp, addFiles)
              .forEach((key, value) -> tbl.getParameters().put(key, value));
          c.alterTable(db, tableName, tbl);
          return 0;
        });
      } catch (TException | InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
  }

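  /**
   * Pushes the prepared changes to the Hive metastore: alters the rewritten partitions and
   * adds the newly created ones.
   */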
  private void commitPartitionedTable() {
    try {
      transactionalHMSClient.run(c -> {
        if (!rewritePartitions.isEmpty()) {
          try {
            c.alterPartitions(db, tableName, Lists.newArrayList(rewritePartitions.values()), null);
          } catch (InstantiationException | NoSuchMethodException |
                   InvocationTargetException | IllegalAccessException |
                   ClassNotFoundException e) {
            throw new RuntimeException(e);
          }
        }
        if (!newPartitions.isEmpty()) {
          c.addPartitions(Lists.newArrayList(newPartitions.values()));
        }
        return 0;
      });
    } catch (TException | InterruptedException e) {
      throw new RuntimeException(e);
    }
  }

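  /**
   * Validates that every data file of a partition sits directly under the partition location,
   * since the Hive partition location cannot be switched otherwise.
   */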
  private void checkDataFileInSameLocation(String partitionLocation, List<DataFile> files) {
    Path partitionPath = new Path(partitionLocation);
    for (DataFile df : files) {
      String fileDir = TableFileUtils.getFileDir(df.path().toString());
      Path dirPath = new Path(fileDir);
      if (!partitionPath.equals(dirPath)) {
        throw new CannotAlterHiveLocationException(
            "cannot create new hive location: " + partitionLocation + ", because data file: " +
                df.path().toString() + " is not under the partition location");
      }
    }
  }

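  /**
   * Derives the new table location of an unpartitioned table from the first added file and
   * deletes orphan files already present under that location.
   */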
  private void generateUnpartitionTableLocation() {
    unpartitionTableLocation = TableFileUtils.getFileDir(this.addFiles.get(0).path().toString());
    checkOrphanFilesAndDelete(unpartitionTableLocation, this.addFiles);
  }
}