com.netease.arctic.hive.op.ReplaceHivePartitions
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.netease.arctic.hive.op;
import com.netease.arctic.hive.HMSClientPool;
import com.netease.arctic.hive.HiveTableProperties;
import com.netease.arctic.hive.exceptions.CannotAlterHiveLocationException;
import com.netease.arctic.hive.table.UnkeyedHiveTable;
import com.netease.arctic.hive.utils.HivePartitionUtil;
import com.netease.arctic.hive.utils.HiveTableUtil;
import com.netease.arctic.op.UpdatePartitionProperties;
import com.netease.arctic.utils.TableFileUtils;
import com.netease.arctic.utils.TablePropertyUtil;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.Table;
import com.netease.arctic.shade.org.apache.iceberg.DataFile;
import com.netease.arctic.shade.org.apache.iceberg.ReplacePartitions;
import com.netease.arctic.shade.org.apache.iceberg.Snapshot;
import com.netease.arctic.shade.org.apache.iceberg.Transaction;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.base.Joiner;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Lists;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Maps;
import com.netease.arctic.shade.org.apache.iceberg.types.Types;
import com.netease.arctic.shade.org.apache.iceberg.util.StructLikeMap;
import com.netease.arctic.shade.org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;
import java.util.stream.Collectors;
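/**
 * A {@link ReplacePartitions} implementation for Arctic Hive tables that keeps the Hive
 * Metastore in sync with the Iceberg commit: it delegates the snapshot operation to a wrapped
 * {@link ReplacePartitions}, records the hive location and commit time as partition properties,
 * and then rewrites or creates the corresponding Hive partitions (or, for unpartitioned tables,
 * updates the table location) through the HMS client.
 *
 * <p>A minimal usage sketch. It assumes the instance is obtained through the table API rather
 * than constructed directly (for example via {@code UnkeyedHiveTable#newReplacePartitions()},
 * which is an assumption about the surrounding code); the {@code addFile}/{@code commit} calls
 * follow the Iceberg {@link ReplacePartitions} contract implemented here:
 *
 * <pre>{@code
 * ReplacePartitions replacePartitions = hiveTable.newReplacePartitions();
 * for (DataFile dataFile : newDataFiles) {
 *   replacePartitions.addFile(dataFile);
 * }
 * replacePartitions.commit();
 * }</pre>
 */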
public class ReplaceHivePartitions implements ReplacePartitions {
private static final Logger LOG = LoggerFactory.getLogger(ReplaceHivePartitions.class);
private final Transaction transaction;
private final boolean insideTransaction;
private final ReplacePartitions delegate;
private final HMSClientPool hmsClient;
private final HMSClientPool transactionalHMSClient;
private final UnkeyedHiveTable table;
private final List<DataFile> addFiles = Lists.newArrayList();
private final String db;
private final String tableName;
private final Table hiveTable;
private final StructLikeMap<Partition> rewritePartitions;
private final StructLikeMap<Partition> newPartitions;
private String unpartitionTableLocation;
private int commitTimestamp; // in seconds
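/**
 * Wraps an Iceberg {@link ReplacePartitions} created from the given transaction and loads the
 * current Hive table definition from the metastore, so the Hive side can be updated after the
 * Iceberg commit.
 */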
public ReplaceHivePartitions(
Transaction transaction,
boolean insideTransaction,
UnkeyedHiveTable table,
HMSClientPool client,
HMSClientPool transactionalClient) {
this.transaction = transaction;
this.insideTransaction = insideTransaction;
this.delegate = transaction.newReplacePartitions();
this.hmsClient = client;
this.transactionalHMSClient = transactionalClient;
this.table = table;
this.db = table.id().getDatabase();
this.tableName = table.id().getTableName();
try {
this.hiveTable = client.run(c -> c.getTable(db, tableName));
} catch (TException | InterruptedException e) {
throw new RuntimeException(e);
}
this.rewritePartitions = StructLikeMap.create(table.spec().partitionType());
this.newPartitions = StructLikeMap.create(table.spec().partitionType());
}
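/**
 * Registers a data file with the delegate and additionally tracks it for the Hive metastore
 * sync when the file is stored under the table's hive location.
 */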
@Override
public ReplacePartitions addFile(DataFile file) {
delegate.addFile(file);
String tableLocation = table.hiveLocation();
String dataFileLocation = file.path().toString();
if (dataFileLocation.toLowerCase().contains(tableLocation.toLowerCase())) {
// only handle files located under the hive data location
this.addFiles.add(file);
}
return this;
}
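// The following overrides simply delegate to the wrapped Iceberg ReplacePartitions.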
@Override
public ReplacePartitions validateAppendOnly() {
delegate.validateAppendOnly();
return this;
}
@Override
public ReplacePartitions set(String property, String value) {
delegate.set(property, value);
return this;
}
@Override
public ReplacePartitions deleteWith(Consumer<String> deleteFunc) {
delegate.deleteWith(deleteFunc);
return this;
}
@Override
public ReplacePartitions stageOnly() {
delegate.stageOnly();
return this;
}
@Override
public Snapshot apply() {
return delegate.apply();
}
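/**
 * Commits the replacement: applies the Hive partition changes in memory, commits the Iceberg
 * delegate and the partition properties, commits the outer transaction when this operation is
 * not running inside one, and finally alters the Hive table or its partitions in the metastore.
 */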
@Override
public void commit() {
if (!addFiles.isEmpty()) {
commitTimestamp = (int) (System.currentTimeMillis() / 1000);
if (table.spec().isUnpartitioned()) {
generateUnpartitionTableLocation();
} else {
applyHivePartitions();
}
delegate.commit();
commitPartitionProperties();
if (!insideTransaction) {
transaction.commitTransaction();
}
if (table.spec().isUnpartitioned()) {
commitUnPartitionedTable();
} else {
commitPartitionedTable();
}
}
}
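/**
 * Records the hive location and the commit time (transient time) of each affected partition,
 * or of the whole table for unpartitioned tables, as Arctic partition properties within the
 * same transaction.
 */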
private void commitPartitionProperties() {
UpdatePartitionProperties updatePartitionProperties = table.updatePartitionProperties(transaction);
if (table.spec().isUnpartitioned() && unpartitionTableLocation != null) {
updatePartitionProperties.set(TablePropertyUtil.EMPTY_STRUCT,
HiveTableProperties.PARTITION_PROPERTIES_KEY_HIVE_LOCATION, unpartitionTableLocation);
updatePartitionProperties.set(TablePropertyUtil.EMPTY_STRUCT,
HiveTableProperties.PARTITION_PROPERTIES_KEY_TRANSIENT_TIME, commitTimestamp + "");
} else {
rewritePartitions.forEach((partitionData, partition) -> {
updatePartitionProperties.set(partitionData, HiveTableProperties.PARTITION_PROPERTIES_KEY_HIVE_LOCATION,
partition.getSd().getLocation());
updatePartitionProperties.set(partitionData, HiveTableProperties.PARTITION_PROPERTIES_KEY_TRANSIENT_TIME,
commitTimestamp + "");
});
newPartitions.forEach((partitionData, partition) -> {
updatePartitionProperties.set(partitionData, HiveTableProperties.PARTITION_PROPERTIES_KEY_HIVE_LOCATION,
partition.getSd().getLocation());
updatePartitionProperties.set(partitionData, HiveTableProperties.PARTITION_PROPERTIES_KEY_TRANSIENT_TIME,
commitTimestamp + "");
});
}
updatePartitionProperties.commit();
}
@Override
public Object updateEvent() {
return delegate.updateEvent();
}
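/**
 * Groups the added files by partition, removes orphan files from the target locations, verifies
 * that all files of a partition share one directory, and prepares the Hive partition objects to
 * rewrite (existing partitions) or create (new partitions) at commit time.
 */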
private void applyHivePartitions() {
Types.StructType partitionSchema = table.spec().partitionType();
// partitionValue -> partitionLocation.
Map<String, String> partitionLocationMap = Maps.newHashMap();
Map<String, List<DataFile>> partitionDataFileMap = Maps.newHashMap();
Map<String, List<String>> partitionValueMap = Maps.newHashMap();
for (DataFile d : addFiles) {
List<String> partitionValues = HivePartitionUtil.partitionValuesAsList(d.partition(), partitionSchema);
String value = Joiner.on("/").join(partitionValues);
String location = TableFileUtils.getFileDir(d.path().toString());
partitionLocationMap.put(value, location);
if (!partitionDataFileMap.containsKey(value)) {
partitionDataFileMap.put(value, Lists.newArrayList());
}
partitionDataFileMap.get(value).add(d);
partitionValueMap.put(value, partitionValues);
}
partitionLocationMap.forEach((k, v) -> checkOrphanFilesAndDelete(v, partitionDataFileMap.get(k)));
partitionLocationMap.forEach((k, v) -> checkDataFileInSameLocation(v, partitionDataFileMap.get(k)));
for (String val : partitionValueMap.keySet()) {
List<String> values = partitionValueMap.get(val);
String location = partitionLocationMap.get(val);
List<DataFile> dataFiles = partitionDataFileMap.get(val);
try {
Partition partition = hmsClient.run(c -> c.getPartition(db, tableName, values));
HivePartitionUtil.rewriteHivePartitions(partition, location, dataFiles, commitTimestamp);
rewritePartitions.put(dataFiles.get(0).partition(), partition);
} catch (NoSuchObjectException e) {
Partition p = HivePartitionUtil.newPartition(hiveTable, values, location, dataFiles, commitTimestamp);
newPartitions.put(dataFiles.get(0).partition(), p);
} catch (TException | InterruptedException e) {
throw new RuntimeException(e);
}
}
}
/**
* Checks the files under the given partition location and deletes any that are not part of the
* data files being committed (orphan files).
*
* @param partitionLocation the partition (or table) directory to scan
* @param dataFiles the data files that should remain under that directory
*/
private void checkOrphanFilesAndDelete(String partitionLocation, List<DataFile> dataFiles) {
List<String> filePathCollect = dataFiles.stream()
.map(dataFile -> dataFile.path().toString()).collect(Collectors.toList());
List<FileStatus> existedFiles = table.io().list(partitionLocation);
for (FileStatus fileStatus : existedFiles) {
if (!filePathCollect.contains(fileStatus.getPath().toString())) {
table.io().deleteFile(fileStatus.getPath().toString());
LOG.warn("Delete orphan file path: {}", fileStatus.getPath().toString());
}
}
}
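/**
 * For unpartitioned tables, points the Hive table location at the directory of the newly added
 * files and refreshes the generated table properties through the transactional HMS client.
 */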
private void commitUnPartitionedTable() {
if (!addFiles.isEmpty()) {
final String newDataLocation = TableFileUtils.getFileDir(addFiles.get(0).path().toString());
try {
transactionalHMSClient.run(c -> {
Table tbl = c.getTable(db, tableName);
tbl.getSd().setLocation(newDataLocation);
HiveTableUtil.generateTableProperties(commitTimestamp, addFiles)
.forEach((key, value) -> tbl.getParameters().put(key, value));
c.alterTable(db, tableName, tbl);
return 0;
});
} catch (TException | InterruptedException e) {
throw new RuntimeException(e);
}
}
}
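/**
 * Alters the rewritten partitions and adds the newly created partitions in the Hive metastore
 * through the transactional HMS client.
 */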
private void commitPartitionedTable() {
try {
transactionalHMSClient.run(c -> {
if (!rewritePartitions.isEmpty()) {
try {
c.alterPartitions(db, tableName, Lists.newArrayList(rewritePartitions.values()), null);
} catch (InstantiationException | NoSuchMethodException |
InvocationTargetException | IllegalAccessException |
ClassNotFoundException e) {
throw new RuntimeException(e);
}
}
if (!newPartitions.isEmpty()) {
c.addPartitions(Lists.newArrayList(newPartitions.values()));
}
return 0;
});
} catch (TException | InterruptedException e) {
throw new RuntimeException(e);
}
}
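/**
 * Verifies that every data file of a partition sits directly under the partition location;
 * otherwise the partition location cannot be switched safely.
 */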
private void checkDataFileInSameLocation(String partitionLocation, List<DataFile> files) {
Path partitionPath = new Path(partitionLocation);
for (DataFile df : files) {
String fileDir = TableFileUtils.getFileDir(df.path().toString());
Path dirPath = new Path(fileDir);
if (!partitionPath.equals(dirPath)) {
throw new CannotAlterHiveLocationException(
"can't create new hive location: " + partitionLocation + " for data file: " + df.path().toString() +
" is not under partition location path"
);
}
}
}
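/**
 * Derives the new table location for an unpartitioned table from the first added file and
 * removes orphan files already present in that directory.
 */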
private void generateUnpartitionTableLocation() {
unpartitionTableLocation = TableFileUtils.getFileDir(this.addFiles.get(0).path().toString());
checkOrphanFilesAndDelete(unpartitionTableLocation, this.addFiles);
}
}