com.nvidia.spark.rapids.iceberg.parquet.ParquetSchemaUtil Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of rapids-4-spark_2.12 Show documentation
Show all versions of rapids-4-spark_2.12 Show documentation
Creates the distribution package of the RAPIDS plugin for Apache Spark
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.nvidia.spark.rapids.iceberg.parquet;
import java.util.List;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types.MessageTypeBuilder;
/**
 * Static helpers for inspecting and rewriting Parquet file schemas.
 *
 * <p>Derived from Apache Iceberg's ParquetSchemaUtil class.
 */
public class ParquetSchemaUtil {

  /** Not instantiable: this class only exposes static helpers. */
  private ParquetSchemaUtil() {
  }

  /**
   * Reports whether any type visited in the given file schema carries a field id.
   *
   * @param fileSchema the Parquet file schema to inspect
   * @return the result of running the {@link HasIds} visitor over {@code fileSchema}
   */
  public static boolean hasIds(MessageType fileSchema) {
    return ParquetTypeVisitor.visit(fileSchema, new HasIds());
  }

  /**
   * Builds a copy of the message type in which every top-level field is given a positional id.
   *
   * <p>Ids are assigned in field order starting at 1. Only the fields returned by
   * {@code fileSchema.getFields()} are re-identified here; nested fields are not visited by this
   * method.
   *
   * @param fileSchema the Parquet file schema to copy
   * @return a new message type with the same name as {@code fileSchema}
   */
  public static MessageType addFallbackIds(MessageType fileSchema) {
    MessageTypeBuilder builder = org.apache.parquet.schema.Types.buildMessage();

    int ordinal = 1; // ids are assigned starting at 1
    for (Type type : fileSchema.getFields()) {
      builder.addField(type.withId(ordinal));
      ordinal += 1;
    }

    return builder.named(fileSchema.getName());
  }

  /**
   * Runs the {@code ApplyNameMapping} visitor over the file schema.
   *
   * @param fileSchema the Parquet file schema to visit
   * @param nameMapping the name mapping handed to the visitor
   * @return the visitor result, cast to {@link MessageType}
   */
  public static MessageType applyNameMapping(MessageType fileSchema, NameMapping nameMapping) {
    return (MessageType) ParquetTypeVisitor.visit(fileSchema, new ApplyNameMapping(nameMapping));
  }

  /**
   * Visitor that evaluates to {@code true} when a type, or any of the child results passed to it,
   * has a non-null id.
   */
  public static class HasIds extends ParquetTypeVisitor<Boolean> {
    /** A message is evaluated exactly like a struct. */
    @Override
    public Boolean message(MessageType message, List<Boolean> fields) {
      return struct(message, fields);
    }

    /** True if any child result is true, otherwise whether the struct itself has an id. */
    @Override
    public Boolean struct(GroupType struct, List<Boolean> hasIds) {
      for (Boolean hasId : hasIds) {
        if (hasId) {
          return true;
        }
      }
      return struct.getId() != null;
    }

    /** True if the element result is true or the list group itself has an id. */
    @Override
    public Boolean list(GroupType array, Boolean hasId) {
      return hasId || array.getId() != null;
    }

    /** True if the key or value result is true or the map group itself has an id. */
    @Override
    public Boolean map(GroupType map, Boolean keyHasId, Boolean valueHasId) {
      return keyHasId || valueHasId || map.getId() != null;
    }

    /** True if the primitive has an id. */
    @Override
    public Boolean primitive(PrimitiveType primitive) {
      return primitive.getId() != null;
    }
  }

  /**
   * Returns the element type of a Parquet list group.
   *
   * <p>For a legacy 2-level list (see {@link #isOldListElementType(GroupType)}) the repeated
   * field is itself the element; otherwise the element is the first field of the repeated group.
   *
   * @param array the list group; its first field is taken as the repeated field
   * @return the element type
   */
  public static Type determineListElementType(GroupType array) {
    Type repeated = array.getFields().get(0);
    boolean isOldListElementType = isOldListElementType(array);

    return isOldListElementType ? repeated : repeated.asGroupType().getType(0);
  }

  /**
   * Decides whether the repeated field of a list group is the element itself (legacy 2-level
   * encoding) rather than a wrapper group around the element.
   *
   * <p>The checks are evaluated in order and short-circuit, so {@code asGroupType()} is only
   * reached once the repeated field is known not to be primitive.
   *
   * @param list the list group; its first field is taken as the repeated field
   * @return {@code true} if any of the backward-compatibility rules below matches
   */
  // Parquet LIST backwards-compatibility rules.
  // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules
  static boolean isOldListElementType(GroupType list) {
    Type repeatedType = list.getFields().get(0);
    String parentName = list.getName();

    return
        // For legacy 2-level list types with primitive element type, e.g.:
        //
        //    // ARRAY<INT> (nullable list, non-null elements)
        //    optional group my_list (LIST) {
        //      repeated int32 element;
        //    }
        //
        repeatedType.isPrimitive() ||
        // For legacy 2-level list types whose element type is a group type with 2 or more
        // fields, e.g.:
        //
        //    // ARRAY<STRUCT<str: STRING, num: INT>> (nullable list, non-null elements)
        //    optional group my_list (LIST) {
        //      repeated group element {
        //        required binary str (UTF8);
        //        required int32 num;
        //      };
        //    }
        //
        repeatedType.asGroupType().getFieldCount() > 1 ||
        // For legacy 2-level list types generated by parquet-avro (Parquet version < 1.6.0),
        // e.g.:
        //
        //    // ARRAY<STRUCT<str: STRING>> (nullable list, non-null elements)
        //    optional group my_list (LIST) {
        //      repeated group array {
        //        required binary str (UTF8);
        //      };
        //    }
        //
        repeatedType.getName().equals("array") ||
        // For Parquet data generated by parquet-thrift, e.g.:
        //
        //    // ARRAY<STRUCT<str: STRING>> (nullable list, non-null elements)
        //    optional group my_list (LIST) {
        //      repeated group my_list_tuple {
        //        required binary str (UTF8);
        //      };
        //    }
        //
        repeatedType.getName().equals(parentName + "_tuple");
  }
}