com.netease.arctic.hive.utils.HiveMetaSynchronizer
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.netease.arctic.hive.utils;
import com.netease.arctic.hive.HMSClientPool;
import com.netease.arctic.hive.HiveTableProperties;
import com.netease.arctic.hive.op.OverwriteHiveFiles;
import com.netease.arctic.op.OverwriteBaseFiles;
import com.netease.arctic.table.ArcticTable;
import com.netease.arctic.table.TableIdentifier;
import com.netease.arctic.table.TableProperties;
import com.netease.arctic.table.UnkeyedTable;
import com.netease.arctic.utils.TablePropertyUtil;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.Table;
import com.netease.arctic.shade.org.apache.iceberg.DataFile;
import com.netease.arctic.shade.org.apache.iceberg.FileScanTask;
import com.netease.arctic.shade.org.apache.iceberg.MetricsConfig;
import com.netease.arctic.shade.org.apache.iceberg.OverwriteFiles;
import com.netease.arctic.shade.org.apache.iceberg.PartitionSpec;
import com.netease.arctic.shade.org.apache.iceberg.Schema;
import com.netease.arctic.shade.org.apache.iceberg.StructLike;
import com.netease.arctic.shade.org.apache.iceberg.TableScan;
import com.netease.arctic.shade.org.apache.iceberg.UpdateSchema;
import com.netease.arctic.shade.org.apache.iceberg.data.TableMigrationUtil;
import com.netease.arctic.shade.org.apache.iceberg.io.CloseableIterable;
import com.netease.arctic.shade.org.apache.iceberg.mapping.NameMappingParser;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.ListMultimap;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Lists;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Maps;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Multimaps;
import com.netease.arctic.shade.org.apache.iceberg.types.TypeUtil;
import com.netease.arctic.shade.org.apache.iceberg.types.Types;
import com.netease.arctic.shade.org.apache.iceberg.util.StructLikeMap;
import com.netease.arctic.shade.org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;
/**
* Synchronizes schema and data changes of a Hive table to the corresponding Arctic table.
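* <p>A minimal usage sketch (illustrative only; obtaining the table handle and the metastore client pool
* is elided, and the variable names below are not part of this class):
* <pre>{@code
* ArcticTable table = ...;        // a hive-backed arctic table loaded from the catalog
* HMSClientPool hiveClient = ...; // client pool for the hive metastore backing the table
* HiveMetaSynchronizer.syncHiveSchemaToArctic(table, hiveClient); // pull hive schema changes into arctic
* HiveMetaSynchronizer.syncHiveDataToArctic(table, hiveClient);   // pull hive data file changes into arctic
* }</pre>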
*/
public class HiveMetaSynchronizer {
private static final Logger LOG = LoggerFactory.getLogger(HiveMetaSynchronizer.class);
/**
* Synchronizes schema changes of the Hive table to the Arctic table.
* @param table arctic table to receive the schema changes
* @param hiveClient hive metastore client pool
*/
public static void syncHiveSchemaToArctic(ArcticTable table, HMSClientPool hiveClient) {
try {
Table hiveTable = hiveClient.run(client -> client.getTable(table.id().getDatabase(), table.id().getTableName()));
Schema hiveSchema = HiveSchemaUtil.convertHiveSchemaToIcebergSchema(hiveTable, table.isKeyedTable() ?
table.asKeyedTable().primaryKeySpec().fieldNames() : new ArrayList<>());
UpdateSchema updateSchema = table.updateSchema();
boolean update = updateStructSchema(table.id(), updateSchema, null,
table.schema().asStruct(), hiveSchema.asStruct());
if (update) {
updateSchema.commit();
}
} catch (TException | InterruptedException e) {
throw new RuntimeException("Failed to get hive table:" + table.id(), e);
}
}
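/**
* Stages hive struct fields onto the arctic schema: adds columns missing from the arctic struct,
* promotes primitive types when promotion is allowed, and recurses into nested structs.
* @return true if any change was staged on the given update
*/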
private static boolean updateStructSchema(TableIdentifier tableIdentifier, UpdateSchema updateSchema,
String parentName, Types.StructType icebergStruct, Types.StructType hiveStruct) {
boolean update = false;
for (int i = 0; i < hiveStruct.fields().size(); i++) {
Types.NestedField hiveField = hiveStruct.fields().get(i);
Types.NestedField icebergField = icebergStruct.field(hiveField.name());
if (icebergField == null) {
updateSchema.addColumn(parentName, hiveField.name(), hiveField.type(), hiveField.doc());
update = true;
LOG.info("Table {} sync new hive column {} to arctic", tableIdentifier, hiveField);
} else if (!icebergField.type().equals(hiveField.type()) ||
!Objects.equals(icebergField.doc(), (hiveField.doc()))) {
if (hiveField.type().isPrimitiveType() && icebergField.type().isPrimitiveType()) {
if (TypeUtil.isPromotionAllowed(icebergField.type().asPrimitiveType(), hiveField.type().asPrimitiveType())) {
String columnName = parentName == null ? hiveField.name() : parentName + "." + hiveField.name();
updateSchema.updateColumn(columnName, hiveField.type().asPrimitiveType(), hiveField.doc());
update = true;
LOG.info("Table {} sync hive column {} to arctic", tableIdentifier, hiveField);
} else {
LOG.warn("Table {} sync hive column {} to arctic failed, because of type mismatch",
tableIdentifier, hiveField);
}
} else if (hiveField.type().isStructType() && icebergField.type().isStructType()) {
String columnName = parentName == null ? hiveField.name() : parentName + "." + hiveField.name();
// evaluate the nested struct update first so it is not skipped by short-circuit evaluation
update = updateStructSchema(tableIdentifier, updateSchema,
columnName, icebergField.type().asStructType(), hiveField.type().asStructType()) || update;
} else {
LOG.warn("Table {} sync hive column {} to arctic failed, because of type mismatch",
tableIdentifier, hiveField);
}
}
}
return update;
}
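/**
* Synchronizes data changes of the hive table to the arctic table without forcing unchanged partitions,
* equivalent to {@code syncHiveDataToArctic(table, hiveClient, false)}.
*/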
public static void syncHiveDataToArctic(ArcticTable table, HMSClientPool hiveClient) {
syncHiveDataToArctic(table, hiveClient, false);
}
/**
* Synchronizes data (file) changes of the Hive table to the Arctic table.
* @param table arctic table to receive the data changes
* @param hiveClient hive metastore client pool
* @param force if true, synchronize all partitions even when they appear unchanged
*/
public static void syncHiveDataToArctic(ArcticTable table, HMSClientPool hiveClient, boolean force) {
UnkeyedTable baseStore;
if (table.isKeyedTable()) {
baseStore = table.asKeyedTable().baseTable();
} else {
baseStore = table.asUnkeyedTable();
}
try {
if (table.spec().isUnpartitioned()) {
Table hiveTable =
hiveClient.run(client -> client.getTable(table.id().getDatabase(), table.id().getTableName()));
if (force || tableHasModified(baseStore, hiveTable)) {
List<DataFile> hiveDataFiles = listHivePartitionFiles(table, Maps.newHashMap(),
hiveTable.getSd().getLocation());
List<DataFile> deleteFiles = Lists.newArrayList();
try (CloseableIterable<FileScanTask> fileScanTasks = baseStore.newScan().planFiles()) {
fileScanTasks.forEach(fileScanTask -> deleteFiles.add(fileScanTask.file()));
} catch (IOException e) {
throw new UncheckedIOException("Failed to close table scan of " + table.name(), e);
}
overwriteTable(table, deleteFiles, hiveDataFiles);
}
} else {
// list all hive partitions.
List<Partition> hivePartitions = hiveClient.run(client -> client.listPartitions(table.id().getDatabase(),
table.id().getTableName(), Short.MAX_VALUE));
// group arctic files by partition.
ListMultimap<StructLike, DataFile> filesGroupedByPartition
= Multimaps.newListMultimap(Maps.newHashMap(), Lists::newArrayList);
TableScan tableScan = baseStore.newScan();
try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
for (FileScanTask fileScanTask : fileScanTasks) {
filesGroupedByPartition.put(fileScanTask.file().partition(), fileScanTask.file());
}
} catch (IOException e) {
throw new UncheckedIOException("Failed to close table scan of " + table.name(), e);
}
Map<StructLike, Collection<DataFile>> filesMap = filesGroupedByPartition.asMap();
List<DataFile> filesToDelete = Lists.newArrayList();
List<DataFile> filesToAdd = Lists.newArrayList();
List<StructLike> icebergPartitions = Lists.newArrayList(filesMap.keySet());
for (Partition hivePartition : hivePartitions) {
StructLike partitionData = HivePartitionUtil.buildPartitionData(hivePartition.getValues(), table.spec());
icebergPartitions.remove(partitionData);
if (force || partitionHasModified(baseStore, hivePartition, partitionData)) {
List<DataFile> hiveDataFiles = listHivePartitionFiles(table,
buildPartitionValueMap(hivePartition.getValues(), table.spec()),
hivePartition.getSd().getLocation());
if (filesMap.get(partitionData) != null) {
filesToDelete.addAll(filesMap.get(partitionData));
filesToAdd.addAll(hiveDataFiles);
// only add files for new partitions that were not created by arctic itself
} else if (hivePartition.getParameters().get(HiveTableProperties.ARCTIC_TABLE_FLAG) == null &&
hivePartition.getParameters().get(HiveTableProperties.ARCTIC_TABLE_FLAG_LEGACY) == null) {
filesToAdd.addAll(hiveDataFiles);
}
}
}
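// partitions left in icebergPartitions exist in arctic but were not returned by hive,
// i.e. they were dropped on the hive side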
icebergPartitions.forEach(partition -> {
List<DataFile> dataFiles = Lists.newArrayList(filesMap.get(partition));
if (dataFiles.size() > 0) {
// only remove these files from arctic if the underlying data files were actually deleted from the file system
if (!table.io().exists(dataFiles.get(0).path().toString())) {
filesToDelete.addAll(filesMap.get(partition));
}
}
});
overwriteTable(table, filesToDelete, filesToAdd);
}
} catch (TException | InterruptedException e) {
throw new RuntimeException("Failed to get hive table:" + table.id(), e);
}
}
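/**
* Checks whether a hive partition has changed since the last synchronization by comparing its
* transient_lastDdlTime and location with the values recorded in the arctic partition properties.
*/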
private static boolean partitionHasModified(UnkeyedTable arcticTable, Partition hivePartition,
StructLike partitionData) {
String hiveTransientTime = hivePartition.getParameters().get("transient_lastDdlTime");
String arcticTransientTime = arcticTable.partitionProperty().containsKey(partitionData) ?
arcticTable.partitionProperty().get(partitionData)
.get(HiveTableProperties.PARTITION_PROPERTIES_KEY_TRANSIENT_TIME) : null;
String hiveLocation = hivePartition.getSd().getLocation();
String arcticPartitionLocation = arcticTable.partitionProperty().containsKey(partitionData) ?
arcticTable.partitionProperty().get(partitionData)
.get(HiveTableProperties.PARTITION_PROPERTIES_KEY_HIVE_LOCATION) : null;
// the hive partition location is only modified by arctic full optimize, so if the hive partition location
// differs from the arctic partition location it is not necessary to trigger synchronization from the hive
// side to arctic
if (arcticPartitionLocation != null && !arcticPartitionLocation.equals(hiveLocation)) {
return false;
}
// compare hive partition parameter transient_lastDdlTime with arctic partition properties to
// find out if the partition is changed.
if (arcticTransientTime == null || !arcticTransientTime.equals(hiveTransientTime)) {
return true;
}
return false;
}
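/**
* Checks whether an unpartitioned hive table has changed since the last synchronization based on its
* transient_lastDdlTime parameter.
*/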
private static boolean tableHasModified(UnkeyedTable arcticTable, Table table) {
String hiveTransientTime = table.getParameters().get("transient_lastDdlTime");
StructLikeMap<Map<String, String>> partitionProperties = arcticTable.partitionProperty();
// assumption: mirrors the transient_lastDdlTime comparison in partitionHasModified, keyed on
// TablePropertyUtil.EMPTY_STRUCT for the unpartitioned table
String arcticTransientTime = partitionProperties.containsKey(TablePropertyUtil.EMPTY_STRUCT) ?
partitionProperties.get(TablePropertyUtil.EMPTY_STRUCT)
.get(HiveTableProperties.PARTITION_PROPERTIES_KEY_TRANSIENT_TIME) : null;
return arcticTransientTime == null || !arcticTransientTime.equals(hiveTransientTime);
}