package com.netease.arctic.hive.op;

import com.netease.arctic.hive.HMSClientPool;
import com.netease.arctic.hive.HiveTableProperties;
import com.netease.arctic.hive.exceptions.CannotAlterHiveLocationException;
import com.netease.arctic.hive.table.UnkeyedHiveTable;
import com.netease.arctic.hive.utils.HivePartitionUtil;
import com.netease.arctic.hive.utils.HiveTableUtil;
import com.netease.arctic.op.UpdatePartitionProperties;
import com.netease.arctic.utils.TableFileUtils;
import com.netease.arctic.utils.TablePropertyUtil;
import org.apache.commons.collections.CollectionUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.PartitionDropOptions;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.Table;
import com.netease.arctic.shade.org.apache.iceberg.DataFile;
import com.netease.arctic.shade.org.apache.iceberg.FileScanTask;
import com.netease.arctic.shade.org.apache.iceberg.SnapshotUpdate;
import com.netease.arctic.shade.org.apache.iceberg.StructLike;
import com.netease.arctic.shade.org.apache.iceberg.Transaction;
import com.netease.arctic.shade.org.apache.iceberg.expressions.Expression;
import com.netease.arctic.shade.org.apache.iceberg.expressions.Expressions;
import com.netease.arctic.shade.org.apache.iceberg.io.CloseableIterable;
import com.netease.arctic.shade.org.apache.iceberg.io.OutputFile;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.base.Joiner;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Lists;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Maps;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Sets;
import com.netease.arctic.shade.org.apache.iceberg.types.Types;
import com.netease.arctic.shade.org.apache.iceberg.util.StructLikeMap;
import com.netease.arctic.shade.org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

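/**
 * Base class for snapshot operations that keep a Hive table in sync with its Arctic/Iceberg
 * counterpart: concrete subclasses collect the data files they add and delete, and this class
 * derives the Hive partitions to create, drop, or alter and applies those changes to the Hive
 * Metastore after the Iceberg commit succeeds.
 */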
public abstract class UpdateHiveFiles<T extends SnapshotUpdate<T>> implements SnapshotUpdate<T> {

  private static final Logger LOG = LoggerFactory.getLogger(UpdateHiveFiles.class);

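  // Property keys controlling commit behavior: whether Hive partition locations are validated
  // before being dropped or re-created, and whether untracked files found in a Hive location are
  // deleted as orphans (see validateLocation and checkOrphanFiles below).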
  public static final String PROPERTIES_VALIDATE_LOCATION = "validate-location";
  public static final String DELETE_UNTRACKED_HIVE_FILE = "delete-untracked-hive-file";

  protected final Transaction transaction;
  protected final boolean insideTransaction;
  protected final UnkeyedHiveTable table;
  protected final HMSClientPool hmsClient;
  protected final HMSClientPool transactionClient;
  protected final String db;
  protected final String tableName;

  protected final Table hiveTable;

  protected Expression expr;
  protected final List<DataFile> addFiles = Lists.newArrayList();
  protected final List<DataFile> deleteFiles = Lists.newArrayList();

  protected StructLikeMap<Partition> partitionToDelete;
  protected StructLikeMap<Partition> partitionToCreate;
  protected final StructLikeMap<Partition> partitionToAlter;
  protected String unpartitionTableLocation;
  protected Long txId = null;
  protected boolean validateLocation = true;
  protected boolean checkOrphanFiles = false;
  protected int commitTimestamp; // in seconds

  public UpdateHiveFiles(Transaction transaction, boolean insideTransaction, UnkeyedHiveTable table,
                         HMSClientPool hmsClient, HMSClientPool transactionClient) {
    this.transaction = transaction;
    this.insideTransaction = insideTransaction;
    this.table = table;
    this.hmsClient = hmsClient;
    this.transactionClient = transactionClient;

    this.db = table.id().getDatabase();
    this.tableName = table.id().getTableName();
    try {
      this.hiveTable = hmsClient.run(c -> c.getTable(db, tableName));
    } catch (TException | InterruptedException e) {
      throw new RuntimeException(e);
    }
    this.partitionToAlter = StructLikeMap.create(table.spec().partitionType());
    this.partitionToCreate = StructLikeMap.create(table.spec().partitionType());
    this.partitionToDelete = StructLikeMap.create(table.spec().partitionType());
  }

  abstract SnapshotUpdate<T> getSnapshotUpdateDelegate();

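  /**
   * Commits the operation: resolves the Hive partitions to create, delete, and alter, optionally
   * removes untracked files from the target locations, commits the delegated Iceberg snapshot
   * update (and the enclosing transaction when not running inside an outer transaction), and
   * finally synchronizes the Hive Metastore. A failure while updating the HMS is only logged,
   * because the Iceberg commit has already completed.
   */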
  @Override
  public void commit() {
    commitTimestamp = (int) (System.currentTimeMillis() / 1000);
    if (table.spec().isUnpartitioned()) {
      generateUnpartitionTableLocation();
    } else {
      this.partitionToDelete = getDeletePartition();
      this.partitionToCreate = getCreatePartition(this.partitionToDelete);
    }

    if (checkOrphanFiles) {
      checkPartitionedOrphanFilesAndDelete(table.spec().isUnpartitioned());
    }
    // if no DataFiles to add or delete in Hive location, only commit to iceberg
    boolean noHiveDataFilesChanged = CollectionUtils.isEmpty(addFiles) && CollectionUtils.isEmpty(deleteFiles) &&
        expr != Expressions.alwaysTrue();

    getSnapshotUpdateDelegate().commit();
    if (!noHiveDataFilesChanged) {
      commitPartitionProperties();
    }
    if (!insideTransaction) {
      transaction.commitTransaction();
    }
    if (noHiveDataFilesChanged) {
      return;
    }

    try {
      if (table.spec().isUnpartitioned()) {
        commitNonPartitionedTable();
      } else {
        commitPartitionedTable();
      }
    } catch (Exception e) {
      LOG.warn("Commit operation to HMS failed.", e);
    }
  }

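  /**
   * Writes the Hive location and commit timestamp of each created or altered partition (or of the
   * whole table when unpartitioned) into the Iceberg partition properties, and removes the entries
   * of partitions that are dropped without being re-created.
   */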
  private void commitPartitionProperties() {
    UpdatePartitionProperties updatePartitionProperties = table.updatePartitionProperties(transaction);
    if (table.spec().isUnpartitioned()) {
      updatePartitionProperties.set(TablePropertyUtil.EMPTY_STRUCT,
          HiveTableProperties.PARTITION_PROPERTIES_KEY_HIVE_LOCATION, unpartitionTableLocation);
      updatePartitionProperties.set(TablePropertyUtil.EMPTY_STRUCT,
          HiveTableProperties.PARTITION_PROPERTIES_KEY_TRANSIENT_TIME, commitTimestamp + "");
    } else {
      partitionToDelete.forEach((partitionData, partition) -> {
        if (!partitionToCreate.containsKey(partitionData)) {
          updatePartitionProperties.remove(partitionData, HiveTableProperties.PARTITION_PROPERTIES_KEY_HIVE_LOCATION);
          updatePartitionProperties.remove(partitionData, HiveTableProperties.PARTITION_PROPERTIES_KEY_TRANSIENT_TIME);
        }
      });
      partitionToCreate.forEach((partitionData, partition) -> {
        updatePartitionProperties.set(partitionData, HiveTableProperties.PARTITION_PROPERTIES_KEY_HIVE_LOCATION,
            partition.getSd().getLocation());
        updatePartitionProperties.set(partitionData, HiveTableProperties.PARTITION_PROPERTIES_KEY_TRANSIENT_TIME,
            commitTimestamp + "");
      });
      partitionToAlter.forEach((partitionData, partition) -> {
        updatePartitionProperties.set(partitionData, HiveTableProperties.PARTITION_PROPERTIES_KEY_HIVE_LOCATION,
            partition.getSd().getLocation());
        updatePartitionProperties.set(partitionData, HiveTableProperties.PARTITION_PROPERTIES_KEY_TRANSIENT_TIME,
            commitTimestamp + "");
      });
    }
    updatePartitionProperties.commit();
  }

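  /**
   * Derives the Hive partitions to create from the added data files: files are grouped by
   * partition value, their parent directory becomes the partition location, and the result is
   * filtered against partitions that already exist in Hive or are about to be dropped.
   */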
  protected StructLikeMap<Partition> getCreatePartition(StructLikeMap<Partition> partitionToDelete) {
    if (this.addFiles.isEmpty()) {
      return StructLikeMap.create(table.spec().partitionType());
    }

    Map<String, String> partitionLocationMap = Maps.newHashMap();
    Map<String, List<DataFile>> partitionDataFileMap = Maps.newHashMap();
    Map<String, List<String>> partitionValueMap = Maps.newHashMap();

    Types.StructType partitionSchema = table.spec().partitionType();
    for (DataFile d : addFiles) {
      List<String> partitionValues = HivePartitionUtil.partitionValuesAsList(d.partition(), partitionSchema);
      String value = Joiner.on("/").join(partitionValues);
      String location = TableFileUtils.getFileDir(d.path().toString());
      partitionLocationMap.put(value, location);
      if (!partitionDataFileMap.containsKey(value)) {
        partitionDataFileMap.put(value, Lists.newArrayList());
      }
      partitionDataFileMap.get(value).add(d);
      partitionValueMap.put(value, partitionValues);
    }

    StructLikeMap<Partition> createPartitions = StructLikeMap.create(table.spec().partitionType());
    for (String val : partitionValueMap.keySet()) {
      List<String> values = partitionValueMap.get(val);
      String location = partitionLocationMap.get(val);
      List<DataFile> dataFiles = partitionDataFileMap.get(val);

      checkCreatePartitionDataFiles(dataFiles, location);
      Partition p = HivePartitionUtil.newPartition(hiveTable, values, location, dataFiles, commitTimestamp);
      createPartitions.put(dataFiles.get(0).partition(), p);
    }

    createPartitions = filterNewPartitionNonExists(createPartitions, partitionToDelete);
    return createPartitions;
  }

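  /**
   * Resolves the Hive partitions affected by the deleted data files (including files matched by
   * the delete expression, if one is set). When location validation is enabled, also verifies that
   * every file left under those partitions belongs to the delete set.
   */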
  protected StructLikeMap<Partition> getDeletePartition() {
    if (expr != null) {
      List<DataFile> deleteFilesByExpr = applyDeleteExpr();
      this.deleteFiles.addAll(deleteFilesByExpr);
    }

    StructLikeMap<Partition> deletePartitions = StructLikeMap.create(table.spec().partitionType());
    if (deleteFiles.isEmpty()) {
      return deletePartitions;
    }

    Types.StructType partitionSchema = table.spec().partitionType();

    Set<String> checkedPartitionValues = Sets.newHashSet();
    Set<Path> deleteFileLocations = Sets.newHashSet();

    for (DataFile dataFile : deleteFiles) {
      List<String> values = HivePartitionUtil.partitionValuesAsList(dataFile.partition(), partitionSchema);
      String pathValue = Joiner.on("/").join(values);
      deleteFileLocations.add(new Path(dataFile.path().toString()));
      if (checkedPartitionValues.contains(pathValue)) {
        continue;
      }
      try {
        Partition partition = hmsClient.run(c -> c.getPartition(db, tableName, values));
        deletePartitions.put(dataFile.partition(), partition);
      } catch (NoSuchObjectException e) {
        // partition does not exist in Hive, nothing to delete
      } catch (TException | InterruptedException e) {
        throw new RuntimeException(e);
      }
      checkedPartitionValues.add(pathValue);
    }

    if (validateLocation) {
      deletePartitions.values().forEach(p -> checkPartitionDelete(deleteFileLocations, p));
    }
    return deletePartitions;
  }

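  /**
   * Ensures that every file currently under the partition location is part of the delete set;
   * otherwise the Hive partition cannot be dropped safely.
   */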
  private void checkPartitionDelete(Set<Path> deleteFiles, Partition partition) {
    String partitionLocation = partition.getSd().getLocation();
    List<FileStatus> files = table.io().list(partitionLocation);
    for (FileStatus f : files) {
      Path filePath = f.getPath();
      if (!deleteFiles.contains(filePath)) {
        throw new CannotAlterHiveLocationException(
            "can't delete hive partition: " + partitionToString(partition) +
                ", a file under the partition is not deleted: " + filePath.toString());
      }
    }
  }

  /**
   * Check that all data files sharing the same partition key are located under the same directory.
   */
  private void checkCreatePartitionDataFiles(List<DataFile> addFiles, String partitionLocation) {
    Path partitionPath = new Path(partitionLocation);
    for (DataFile df : addFiles) {
      String fileDir = TableFileUtils.getFileDir(df.path().toString());
      Path dirPath = new Path(fileDir);
      if (!partitionPath.equals(dirPath)) {
        throw new CannotAlterHiveLocationException(
            "can't create new hive location: " + partitionLocation + " for data file: " + df.path().toString() +
                " because it is not under the partition location path"
        );
      }
    }
  }

  /**
   * Check the files under each target location and delete orphan files that belong to neither
   * the added nor the deleted data files of this commit.
   */
  private void checkPartitionedOrphanFilesAndDelete(boolean isUnPartitioned) {
    List<String> partitionsToCheck = new ArrayList<>();
    if (isUnPartitioned) {
      partitionsToCheck.add(this.unpartitionTableLocation);
    } else {
      partitionsToCheck = this.partitionToCreate.values()
          .stream().map(partition -> partition.getSd().getLocation()).collect(Collectors.toList());
    }
    for (String partitionLocation: partitionsToCheck) {
      List<String> addFilesPathCollect = addFiles.stream()
          .map(dataFile -> dataFile.path().toString()).collect(Collectors.toList());
      List<String> deleteFilesPathCollect = deleteFiles.stream()
          .map(deleteFile -> deleteFile.path().toString()).collect(Collectors.toList());
      List<FileStatus> existedFiles = table.io().list(partitionLocation);
      for (FileStatus filePath : existedFiles) {
        if (!addFilesPathCollect.contains(filePath.getPath().toString()) &&
            !deleteFilesPathCollect.contains(filePath.getPath().toString())) {
          table.io().deleteFile(filePath.getPath().toString());
          LOG.warn("Delete orphan file path: {}", filePath.getPath().toString());
        }
        }
      }
    }
  }

  /**
   * Filter partitionToCreate and make sure every new partition either does not exist in Hive, or:
   * 0. the partition is about to be dropped:
   *    0.1 - different location: the partition can be re-created (handled as an alter)
   *    0.2 - same location: it cannot be re-created (dropping the partition does not delete its files)
   * 1. the partition exists in Hive with the same location: skip creation and alter it instead
   * 2. the partition exists in Hive with a different location: throw {@link CannotAlterHiveLocationException}
   */
  private StructLikeMap<Partition> filterNewPartitionNonExists(
      StructLikeMap<Partition> partitionToCreate,
      StructLikeMap<Partition> partitionToDelete) {
    StructLikeMap<Partition> partitions = StructLikeMap.create(table.spec().partitionType());
    Map<String, Partition> deletePartitionValueMap = Maps.newHashMap();
    for (Partition p : partitionToDelete.values()) {
      String partValue = Joiner.on("/").join(p.getValues());
      deletePartitionValueMap.put(partValue, p);
    }

    for (Map.Entry<StructLike, Partition> entry : partitionToCreate.entrySet()) {
      String partValue = Joiner.on("/").join(entry.getValue().getValues());
      String location = entry.getValue().getSd().getLocation();
      Partition toDelete = deletePartitionValueMap.get(partValue);
      if (toDelete != null) {
        String deleteLocation = toDelete.getSd().getLocation();
        // if exists partition to delete with same value
        // make sure location is different
        if (isPathEquals(location, deleteLocation) && validateLocation) {
          throw new CannotAlterHiveLocationException("can't create new partition: " +
              partitionToString(entry.getValue()) +
              ", this partition would be deleted and re-created with the same location");
        } else {
          // this partition needs to be altered rather than deleted
          partitionToAlter.put(entry.getKey(), entry.getValue());
          partitionToDelete.remove(entry.getKey());
          continue;
        }
      }

      try {
        Partition partitionInHive = hmsClient.run(c -> c.getPartition(db, tableName, entry.getValue().getValues()));
        String locationInHive = partitionInHive.getSd().getLocation();
        if (isPathEquals(location, locationInHive)) {
          partitionToAlter.put(entry.getKey(), entry.getValue());
          continue;
        }
        throw new CannotAlterHiveLocationException("can't create new partition: " +
            partitionToString(entry.getValue()) +
            ", this partition exists in hive with different location: " + locationInHive);
      } catch (NoSuchObjectException e) {
        partitions.put(entry.getKey(), entry.getValue());
      } catch (TException | InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
    return partitions;
  }

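  /**
   * Applies the resolved partition changes to the Hive Metastore: drops partitions without
   * deleting or purging their data, adds the newly created partitions, and alters the existing
   * partitions so they reflect their latest location and commit metadata.
   */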
  private void commitPartitionedTable() {
    if (!partitionToDelete.isEmpty()) {
      for (Partition p : partitionToDelete.values()) {
        try {
          transactionClient.run(c -> {
            PartitionDropOptions options = PartitionDropOptions.instance()
                .deleteData(false)
                .ifExists(true)
                .purgeData(false)
                .returnResults(false);
            c.dropPartition(db, tableName, p.getValues(), options);
            return 0;
          });
        } catch (NoSuchObjectException e) {
          LOG.warn("tried to delete hive partition {} but the partition does not exist.", p);
        } catch (TException | InterruptedException e) {
          throw new RuntimeException(e);
        }
      }
    }

    if (!partitionToCreate.isEmpty()) {
      try {
        transactionClient.run(c -> c.addPartitions(Lists.newArrayList(partitionToCreate.values())));
      } catch (TException | InterruptedException e) {
        throw new RuntimeException(e);
      }
    }

    if (!partitionToAlter.isEmpty()) {
      try {
        transactionClient.run(c ->  {
          try {
            c.alterPartitions(db, tableName, Lists.newArrayList(partitionToAlter.values()), null);
          } catch (InvocationTargetException | InstantiationException |
              IllegalAccessException | NoSuchMethodException |
              ClassNotFoundException e) {
            throw new RuntimeException(e);
          }
          return null;
        });
      } catch (TException | InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
  }

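  /**
   * Determines the new Hive location of an unpartitioned table: the directory of the first added
   * data file, or a freshly created empty directory when the commit adds no files.
   */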
  private void generateUnpartitionTableLocation() {
    if (this.addFiles.isEmpty()) {
      unpartitionTableLocation = createUnpartitionEmptyLocationForHive();
    } else {
      unpartitionTableLocation = TableFileUtils.getFileDir(this.addFiles.get(0).path().toString());
    }
  }

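  /**
   * Points the unpartitioned Hive table at the new location and merges the table parameters
   * generated for this commit into the Hive table properties.
   */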
  private void commitNonPartitionedTable() {

    final String finalLocation = unpartitionTableLocation;
    try {
      transactionClient.run(c -> {
        Table hiveTable = c.getTable(db, tableName);
        hiveTable.getSd().setLocation(finalLocation);
        HiveTableUtil.generateTableProperties(commitTimestamp, addFiles)
            .forEach((key, value) -> hiveTable.getParameters().put(key, value));
        c.alterTable(db, tableName, hiveTable);
        return 0;
      });
    } catch (TException | InterruptedException e) {
      throw new RuntimeException(e);
    }
  }

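  /**
   * Creates a new, empty Hive data directory (marked with a ".keep" file) so that an
   * unpartitioned table can be re-pointed even when no data files are added.
   */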
  private String createUnpartitionEmptyLocationForHive() {
    // create a new empty location for hive
    String newLocation;
    newLocation = HiveTableUtil.newHiveDataLocation(table.hiveLocation(), table.spec(), null,
        txId != null ? HiveTableUtil.newHiveSubdirectory(txId) : HiveTableUtil.newHiveSubdirectory());
    OutputFile file = table.io().newOutputFile(newLocation + "/.keep");
    try {
      file.createOrOverwrite().close();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return newLocation;
  }

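  /**
   * Plans a table scan with the delete expression and returns the data files it matches.
   */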
  protected List<DataFile> applyDeleteExpr() {
    try (CloseableIterable<FileScanTask> tasks = table.newScan().filter(expr).planFiles()) {
      return Lists.newArrayList(tasks).stream().map(FileScanTask::file).collect(Collectors.toList());
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  private boolean isPathEquals(String pathA, String pathB) {
    Path path1 = new Path(pathA);
    Path path2 = new Path(pathB);
    return path1.equals(path2);
  }

  private String partitionToString(Partition p) {
    return "Partition(values: [" + Joiner.on("/").join(p.getValues()) +
        "], location: " + p.getSd().getLocation() + ")";
  }
}