![JAR search and dependency download from the Maven repository](/logo.png)
org.apache.iceberg.hive.HiveSchemaUtil Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.hive;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.Pair;
public final class HiveSchemaUtil {
// Static-only utility class: the private constructor keeps it from being instantiated.
private HiveSchemaUtil() {
}
/**
 * Converts the Iceberg schema to a Hive schema (list of FieldSchema objects).
 * <p>
 * Every top-level column of the Iceberg schema yields one FieldSchema built from the column name, the Hive type
 * string derived from the column type, and the column doc.
 * @param schema The original Iceberg schema to convert
 * @return The Hive column list generated from the Iceberg schema
 * @throws UnsupportedOperationException if a column type has no Hive type string equivalent
 */
public static List<FieldSchema> convert(Schema schema) {
  return schema.columns().stream()
      .map(col -> new FieldSchema(col.name(), convertToTypeString(col.type()), col.doc()))
      .collect(Collectors.toList());
}
/**
 * Converts a Hive schema (list of FieldSchema objects) to an Iceberg schema. If some of the types are not convertible
 * then exception is thrown.
 * <p>
 * Equivalent to calling {@code convert(fieldSchemas, false)}, i.e. with automatic type conversion disabled.
 * @param fieldSchemas The list of the columns
 * @return An equivalent Iceberg Schema
 */
public static Schema convert(List<FieldSchema> fieldSchemas) {
  return convert(fieldSchemas, false);
}
/**
 * Converts a Hive schema (list of FieldSchema objects) to an Iceberg schema.
 * <p>
 * Column names are lower-cased before conversion, and each Hive type string is parsed into a TypeInfo.
 * @param fieldSchemas The list of the columns
 * @param autoConvert If {@code true} then TINYINT and SMALLINT is converted to INTEGER and VARCHAR and CHAR is
 *                    converted to STRING. Otherwise if these types are used in the Hive schema then exception is
 *                    thrown.
 * @return An equivalent Iceberg Schema
 */
public static Schema convert(List<FieldSchema> fieldSchemas, boolean autoConvert) {
  List<String> names = Lists.newArrayListWithExpectedSize(fieldSchemas.size());
  List<TypeInfo> typeInfos = Lists.newArrayListWithExpectedSize(fieldSchemas.size());
  List<String> comments = Lists.newArrayListWithExpectedSize(fieldSchemas.size());

  for (FieldSchema col : fieldSchemas) {
    // Locale.ROOT keeps the lower-casing independent of the JVM default locale (e.g. Turkish dotless i).
    names.add(col.getName().toLowerCase(Locale.ROOT));
    typeInfos.add(TypeInfoUtils.getTypeInfoFromTypeString(col.getType()));
    comments.add(col.getComment());
  }

  return HiveSchemaConverter.convert(names, typeInfos, comments, autoConvert);
}
/**
 * Converts the Hive partition columns to Iceberg identity partition specification.
 * <p>
 * One identity partition field is added per Hive partition column, in the order given, using the lower-cased
 * column name as the source column.
 * @param schema The Iceberg schema
 * @param fieldSchemas The partition column specification
 * @return The Iceberg partition specification
 */
public static PartitionSpec spec(Schema schema, List<FieldSchema> fieldSchemas) {
  PartitionSpec.Builder builder = PartitionSpec.builderFor(schema);
  for (FieldSchema fieldSchema : fieldSchemas) {
    // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
    builder.identity(fieldSchema.getName().toLowerCase(Locale.ROOT));
  }
  return builder.build();
}
/**
 * Converts the Hive list of column names and column types to an Iceberg schema. If some of the types are not
 * convertible then exception is thrown.
 * <p>
 * Delegates to the schema converter with automatic type conversion disabled.
 * @param names The list of the Hive column names
 * @param types The list of the Hive column types
 * @param comments The list of the Hive column comments
 * @return The Iceberg schema
 */
public static Schema convert(List<String> names, List<TypeInfo> types, List<String> comments) {
  return HiveSchemaConverter.convert(names, types, comments, false);
}
/**
 * Converts the Hive list of column names and column types to an Iceberg schema.
 * @param names The list of the Hive column names
 * @param types The list of the Hive column types
 * @param comments The list of the Hive column comments, can be null
 * @param autoConvert If {@code true} then TINYINT and SMALLINT is converted to INTEGER and VARCHAR and CHAR is
 *                    converted to STRING. Otherwise if these types are used in the Hive schema then exception is
 *                    thrown.
 * @return The Iceberg schema
 */
public static Schema convert(List<String> names, List<TypeInfo> types, List<String> comments,
    boolean autoConvert) {
  return HiveSchemaConverter.convert(names, types, comments, autoConvert);
}
/**
 * Translates an Iceberg type into the corresponding Hive TypeInfo.
 * <p>
 * The Iceberg type is first rendered as a Hive type string, which is then parsed by Hive's TypeInfoUtils.
 * @param type The Iceberg type
 * @return The Hive type
 */
public static TypeInfo convert(Type type) {
  String hiveTypeString = convertToTypeString(type);
  return TypeInfoUtils.getTypeInfoFromTypeString(hiveTypeString);
}
/**
 * Translates a Hive TypeInfo into the corresponding Iceberg type, with automatic type conversion switched off.
 * @param typeInfo The Hive type
 * @return The Iceberg type
 */
public static Type convert(TypeInfo typeInfo) {
  boolean autoConvert = false;
  return HiveSchemaConverter.convert(typeInfo, autoConvert);
}
/**
 * Returns a SchemaDifference containing those fields which are present in only one of the collections, as well as
 * those fields which are present in both (in terms of the name) but their type or comment has changed.
 * <p>
 * Fields are matched by name only. For a matched pair the field from the minuendCollection is the one recorded as
 * type-changed and/or comment-changed. Fields of the minuendCollection without a match are recorded as missing from
 * the second collection; when bothDirections is set, fields of the subtrahendCollection without a match are
 * recorded as missing from the first collection.
 * @param minuendCollection Collection of fields to subtract from
 * @param subtrahendCollection Collection of fields to subtract
 * @param bothDirections Whether or not to compute the missing fields from the minuendCollection as well
 * @return the difference between the two schemas
 */
public static SchemaDifference getSchemaDiff(Collection<FieldSchema> minuendCollection,
    Collection<FieldSchema> subtrahendCollection, boolean bothDirections) {
  SchemaDifference difference = new SchemaDifference();

  for (FieldSchema first : minuendCollection) {
    boolean found = false;
    // No early exit on purpose: every same-named field of the other collection is compared.
    for (FieldSchema second : subtrahendCollection) {
      if (Objects.equals(first.getName(), second.getName())) {
        found = true;
        if (!Objects.equals(first.getType(), second.getType())) {
          difference.addTypeChanged(first);
        }
        if (!Objects.equals(first.getComment(), second.getComment())) {
          difference.addCommentChanged(first);
        }
      }
    }
    if (!found) {
      difference.addMissingFromSecond(first);
    }
  }

  if (bothDirections) {
    // Only name presence matters for the reverse direction, so there is no need to build a second full diff.
    for (FieldSchema candidate : subtrahendCollection) {
      if (!containsFieldNamed(minuendCollection, candidate.getName())) {
        difference.addMissingFromFirst(candidate);
      }
    }
  }

  return difference;
}

/** Returns true if any field in the collection has the given name (null-safe comparison). */
private static boolean containsFieldNamed(Collection<FieldSchema> fields, String name) {
  for (FieldSchema field : fields) {
    if (Objects.equals(field.getName(), name)) {
      return true;
    }
  }
  return false;
}
/**
* Compares two lists of columns to each other to find the (singular) column that was moved. This works ideally for
* identifying the column that was moved by an ALTER TABLE ... CHANGE COLUMN command.
*
* Note: This method is only suitable for finding a single reordered column.
* Consequently, this method is NOT suitable for handling scenarios where multiple column reorders are possible at the
* same time, such as ALTER TABLE ... REPLACE COLUMNS commands.
*
* @param updated The list of the columns after some updates have taken place (if any)
* @param old The list of the original columns
* @param renameMapping A map of name aliases for the updated columns (e.g. if a column rename occurred)
* @return A pair consisting of the reordered column's name, and its preceding column's name (if any).
* Returns a null in case there are no out of order columns.
*/
public static Pair> getReorderedColumn(List updated,
List old,
Map renameMapping) {
// first collect the updated index for each column
Map nameToNewIndex = Maps.newHashMap();
for (int i = 0; i < updated.size(); ++i) {
String updatedCol = renameMapping.getOrDefault(updated.get(i).getName(), updated.get(i).getName());
nameToNewIndex.put(updatedCol, i);
}
// find the column which has the highest index difference between its position in the old vs the updated list
String reorderedColName = null;
int maxIndexDiff = 0;
for (int oldIndex = 0; oldIndex < old.size(); ++oldIndex) {
String oldName = old.get(oldIndex).getName();
Integer newIndex = nameToNewIndex.get(oldName);
if (newIndex != null) {
int indexDiff = Math.abs(newIndex - oldIndex);
if (maxIndexDiff < indexDiff) {
maxIndexDiff = indexDiff;
reorderedColName = oldName;
}
}
}
if (maxIndexDiff == 0) {
// if there are no changes in index, there were no reorders
return null;
} else {
int newIndex = nameToNewIndex.get(reorderedColName);
if (newIndex > 0) {
// if the newIndex > 0, that means the column was moved after another column:
// ALTER TABLE tbl CHANGE COLUMN reorderedColName reorderedColName type AFTER previousColName;
String previousColName = renameMapping.getOrDefault(
updated.get(newIndex - 1).getName(), updated.get(newIndex - 1).getName());
return Pair.of(reorderedColName, Optional.of(previousColName));
} else {
// if the newIndex is 0, that means the column was moved to the first position:
// ALTER TABLE tbl CHANGE COLUMN reorderedColName reorderedColName type FIRST;
return Pair.of(reorderedColName, Optional.empty());
}
}
}
/**
 * Holds the result of comparing two collections of Hive columns: the fields missing from either side, and the
 * fields whose type or comment differs. The getters return the internal lists directly, not copies.
 */
public static class SchemaDifference {
  private final List<FieldSchema> missingFromFirst = Lists.newArrayList();
  private final List<FieldSchema> missingFromSecond = Lists.newArrayList();
  private final List<FieldSchema> typeChanged = Lists.newArrayList();
  private final List<FieldSchema> commentChanged = Lists.newArrayList();

  /**
   * @return the fields recorded as missing from the first collection
   */
  public List<FieldSchema> getMissingFromFirst() {
    return missingFromFirst;
  }

  /**
   * @return the fields recorded as missing from the second collection
   */
  public List<FieldSchema> getMissingFromSecond() {
    return missingFromSecond;
  }

  /**
   * @return the fields recorded as having a changed type
   */
  public List<FieldSchema> getTypeChanged() {
    return typeChanged;
  }

  /**
   * @return the fields recorded as having a changed comment
   */
  public List<FieldSchema> getCommentChanged() {
    return commentChanged;
  }

  /**
   * @return true if no difference of any kind has been recorded
   */
  public boolean isEmpty() {
    return missingFromFirst.isEmpty() && missingFromSecond.isEmpty() && typeChanged.isEmpty() &&
        commentChanged.isEmpty();
  }

  void addMissingFromFirst(FieldSchema field) {
    missingFromFirst.add(field);
  }

  void addMissingFromSecond(FieldSchema field) {
    missingFromSecond.add(field);
  }

  void addTypeChanged(FieldSchema field) {
    typeChanged.add(field);
  }

  void addCommentChanged(FieldSchema field) {
    commentChanged.add(field);
  }
}
/**
 * Renders an Iceberg type as a Hive type string.
 * <p>
 * TIME, STRING and UUID all map to "string"; FIXED and BINARY both map to "binary". A TIMESTAMP maps to
 * "timestamp with local time zone" only when the Hive version check for HIVE_3 passes and the type adjusts to UTC,
 * otherwise to "timestamp". Nested types (struct, list, map) are rendered recursively through {@code convert(Type)}.
 * @param type The Iceberg type
 * @return The Hive type string
 * @throws UnsupportedOperationException if the type id is not handled by any case
 */
private static String convertToTypeString(Type type) {
  switch (type.typeId()) {
    case BOOLEAN:
      return "boolean";
    case INTEGER:
      return "int";
    case LONG:
      return "bigint";
    case FLOAT:
      return "float";
    case DOUBLE:
      return "double";
    case DATE:
      return "date";
    case TIME:
    case STRING:
    case UUID:
      return "string";
    case TIMESTAMP: {
      Types.TimestampType tsType = (Types.TimestampType) type;
      // The version check is evaluated first and short-circuits, as in the combined condition it replaces.
      boolean withLocalZone = HiveVersion.min(HiveVersion.HIVE_3) && tsType.shouldAdjustToUTC();
      return withLocalZone ? "timestamp with local time zone" : "timestamp";
    }
    case FIXED:
    case BINARY:
      return "binary";
    case DECIMAL: {
      Types.DecimalType decimal = (Types.DecimalType) type;
      return String.format("decimal(%s,%s)", decimal.precision(), decimal.scale());
    }
    case STRUCT: {
      String fieldList = type.asStructType().fields().stream()
          .map(field -> String.format("%s:%s", field.name(), convert(field.type())))
          .collect(Collectors.joining(","));
      return String.format("struct<%s>", fieldList);
    }
    case LIST: {
      TypeInfo elementInfo = convert(type.asListType().elementType());
      return String.format("array<%s>", elementInfo);
    }
    case MAP: {
      Types.MapType map = type.asMapType();
      TypeInfo keyInfo = convert(map.keyType());
      TypeInfo valueInfo = convert(map.valueType());
      return String.format("map<%s,%s>", keyInfo, valueInfo);
    }
    default:
      throw new UnsupportedOperationException(type + " is not supported");
  }
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy