All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.parquet.avro.AvroSchemaConverter Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.avro;

import org.apache.avro.LogicalType;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.schema.ConversionPatterns;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;
import org.apache.parquet.schema.LogicalTypeAnnotation.UUIDLogicalTypeAnnotation;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;

import static java.util.Optional.empty;
import static java.util.Optional.of;
import static org.apache.avro.JsonProperties.NULL_VALUE;
import static org.apache.parquet.avro.AvroReadSupport.READ_INT96_AS_FIXED;
import static org.apache.parquet.avro.AvroReadSupport.READ_INT96_AS_FIXED_DEFAULT;
import static org.apache.parquet.avro.AvroWriteSupport.WRITE_FIXED_AS_INT96;
import static org.apache.parquet.avro.AvroWriteSupport.WRITE_OLD_LIST_STRUCTURE;
import static org.apache.parquet.avro.AvroWriteSupport.WRITE_OLD_LIST_STRUCTURE_DEFAULT;
import static org.apache.parquet.avro.AvroWriteSupport.WRITE_PARQUET_UUID;
import static org.apache.parquet.avro.AvroWriteSupport.WRITE_PARQUET_UUID_DEFAULT;
import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS;
import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS;
import static org.apache.parquet.schema.LogicalTypeAnnotation.dateType;
import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType;
import static org.apache.parquet.schema.LogicalTypeAnnotation.enumType;
import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType;
import static org.apache.parquet.schema.LogicalTypeAnnotation.timeType;
import static org.apache.parquet.schema.LogicalTypeAnnotation.timestampType;
import static org.apache.parquet.schema.LogicalTypeAnnotation.uuidType;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*;
import static org.apache.parquet.schema.Type.Repetition.REPEATED;

/**
 * 

* Converts an Avro schema into a Parquet schema, or vice versa. See package * documentation for details of the mapping. *

*/ public class AvroSchemaConverter { public static final String ADD_LIST_ELEMENT_RECORDS = "parquet.avro.add-list-element-records"; private static final boolean ADD_LIST_ELEMENT_RECORDS_DEFAULT = true; private final boolean assumeRepeatedIsListElement; private final boolean writeOldListStructure; private final boolean writeParquetUUID; private final boolean readInt96AsFixed; private final Set pathsToInt96; public AvroSchemaConverter() { this(ADD_LIST_ELEMENT_RECORDS_DEFAULT); } /** * Constructor used by {@link AvroRecordConverter#isElementType}, which always * uses the 2-level list conversion. * * @param assumeRepeatedIsListElement whether to assume 2-level lists */ AvroSchemaConverter(boolean assumeRepeatedIsListElement) { this.assumeRepeatedIsListElement = assumeRepeatedIsListElement; this.writeOldListStructure = WRITE_OLD_LIST_STRUCTURE_DEFAULT; this.writeParquetUUID = WRITE_PARQUET_UUID_DEFAULT; this.readInt96AsFixed = READ_INT96_AS_FIXED_DEFAULT; this.pathsToInt96 = Collections.emptySet(); } public AvroSchemaConverter(Configuration conf) { this.assumeRepeatedIsListElement = conf.getBoolean( ADD_LIST_ELEMENT_RECORDS, ADD_LIST_ELEMENT_RECORDS_DEFAULT); this.writeOldListStructure = conf.getBoolean( WRITE_OLD_LIST_STRUCTURE, WRITE_OLD_LIST_STRUCTURE_DEFAULT); this.writeParquetUUID = conf.getBoolean(WRITE_PARQUET_UUID, WRITE_PARQUET_UUID_DEFAULT); this.readInt96AsFixed = conf.getBoolean(READ_INT96_AS_FIXED, READ_INT96_AS_FIXED_DEFAULT); this.pathsToInt96 = new HashSet<>(Arrays.asList(conf.getStrings(WRITE_FIXED_AS_INT96, new String[0]))); } /** * Given a schema, check to see if it is a union of a null type and a regular schema, * and then return the non-null sub-schema. Otherwise, return the given schema. * * @param schema The schema to check * @return The non-null portion of a union schema, or the given schema */ public static Schema getNonNull(Schema schema) { if (schema.getType().equals(Schema.Type.UNION)) { List schemas = schema.getTypes(); if (schemas.size() == 2) { if (schemas.get(0).getType().equals(Schema.Type.NULL)) { return schemas.get(1); } else if (schemas.get(1).getType().equals(Schema.Type.NULL)) { return schemas.get(0); } else { return schema; } } else { return schema; } } else { return schema; } } public MessageType convert(Schema avroSchema) { if (!avroSchema.getType().equals(Schema.Type.RECORD)) { throw new IllegalArgumentException("Avro schema must be a record."); } return new MessageType(avroSchema.getFullName(), convertFields(avroSchema.getFields(), "")); } private List convertFields(List fields, String schemaPath) { List types = new ArrayList(); for (Schema.Field field : fields) { if (field.schema().getType().equals(Schema.Type.NULL)) { continue; // Avro nulls are not encoded, unless they are null unions } types.add(convertField(field, appendPath(schemaPath, field.name()))); } return types; } private Type convertField(String fieldName, Schema schema, String schemaPath) { return convertField(fieldName, schema, Type.Repetition.REQUIRED, schemaPath); } @SuppressWarnings("deprecation") private Type convertField(String fieldName, Schema schema, Type.Repetition repetition, String schemaPath) { Types.PrimitiveBuilder builder; Schema.Type type = schema.getType(); LogicalType logicalType = schema.getLogicalType(); if (type.equals(Schema.Type.BOOLEAN)) { builder = Types.primitive(BOOLEAN, repetition); } else if (type.equals(Schema.Type.INT)) { builder = Types.primitive(INT32, repetition); } else if (type.equals(Schema.Type.LONG)) { builder = Types.primitive(INT64, repetition); } else if (type.equals(Schema.Type.FLOAT)) { builder = Types.primitive(FLOAT, repetition); } else if (type.equals(Schema.Type.DOUBLE)) { builder = Types.primitive(DOUBLE, repetition); } else if (type.equals(Schema.Type.BYTES)) { builder = Types.primitive(BINARY, repetition); } else if (type.equals(Schema.Type.STRING)) { if (logicalType != null && logicalType.getName().equals(LogicalTypes.uuid().getName()) && writeParquetUUID) { builder = Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition) .length(LogicalTypeAnnotation.UUIDLogicalTypeAnnotation.BYTES); } else { builder = Types.primitive(BINARY, repetition).as(stringType()); } } else if (type.equals(Schema.Type.RECORD)) { return new GroupType(repetition, fieldName, convertFields(schema.getFields(), schemaPath)); } else if (type.equals(Schema.Type.ENUM)) { builder = Types.primitive(BINARY, repetition).as(enumType()); } else if (type.equals(Schema.Type.ARRAY)) { if (writeOldListStructure) { return ConversionPatterns.listType(repetition, fieldName, convertField("array", schema.getElementType(), REPEATED, schemaPath)); } else { return ConversionPatterns.listOfElements(repetition, fieldName, convertField(AvroWriteSupport.LIST_ELEMENT_NAME, schema.getElementType(), schemaPath)); } } else if (type.equals(Schema.Type.MAP)) { Type valType = convertField("value", schema.getValueType(), schemaPath); // avro map key type is always string return ConversionPatterns.stringKeyMapType(repetition, fieldName, valType); } else if (type.equals(Schema.Type.FIXED)) { if (pathsToInt96.contains(schemaPath)) { if (schema.getFixedSize() != 12) { throw new IllegalArgumentException( "The size of the fixed type field " + schemaPath + " must be 12 bytes for INT96 conversion"); } builder = Types.primitive(PrimitiveTypeName.INT96, repetition); } else { builder = Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition).length(schema.getFixedSize()); } } else if (type.equals(Schema.Type.UNION)) { return convertUnion(fieldName, schema, repetition, schemaPath); } else { throw new UnsupportedOperationException("Cannot convert Avro type " + type); } // schema translation can only be done for known logical types because this // creates an equivalence if (logicalType != null) { if (logicalType instanceof LogicalTypes.Decimal) { LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; builder = builder.as(decimalType(decimal.getScale(), decimal.getPrecision())); } else { LogicalTypeAnnotation annotation = convertLogicalType(logicalType); if (annotation != null) { builder.as(annotation); } } } return builder.named(fieldName); } private Type convertUnion(String fieldName, Schema schema, Type.Repetition repetition, String schemaPath) { List nonNullSchemas = new ArrayList(schema.getTypes().size()); // Found any schemas in the union? Required for the edge case, where the union contains only a single type. boolean foundNullSchema = false; for (Schema childSchema : schema.getTypes()) { if (childSchema.getType().equals(Schema.Type.NULL)) { foundNullSchema = true; if (Type.Repetition.REQUIRED == repetition) { repetition = Type.Repetition.OPTIONAL; } } else { nonNullSchemas.add(childSchema); } } // If we only get a null and one other type then its a simple optional field // otherwise construct a union container switch (nonNullSchemas.size()) { case 0: throw new UnsupportedOperationException("Cannot convert Avro union of only nulls"); case 1: return foundNullSchema ? convertField(fieldName, nonNullSchemas.get(0), repetition, schemaPath) : convertUnionToGroupType(fieldName, repetition, nonNullSchemas, schemaPath); default: // complex union type return convertUnionToGroupType(fieldName, repetition, nonNullSchemas, schemaPath); } } private Type convertUnionToGroupType(String fieldName, Type.Repetition repetition, List nonNullSchemas, String schemaPath) { List unionTypes = new ArrayList(nonNullSchemas.size()); int index = 0; for (Schema childSchema : nonNullSchemas) { unionTypes.add( convertField("member" + index++, childSchema, Type.Repetition.OPTIONAL, schemaPath)); } return new GroupType(repetition, fieldName, unionTypes); } private Type convertField(Schema.Field field, String schemaPath) { return convertField(field.name(), field.schema(), schemaPath); } public Schema convert(MessageType parquetSchema) { return convertFields(parquetSchema.getName(), parquetSchema.getFields(), new HashMap<>()); } Schema convert(GroupType parquetSchema) { return convertFields(parquetSchema.getName(), parquetSchema.getFields(), new HashMap<>()); } private Schema convertFields(String name, List parquetFields, Map names) { List fields = new ArrayList(); Integer nameCount = names.merge(name, 1, (oldValue, value) -> oldValue + 1); for (Type parquetType : parquetFields) { Schema fieldSchema = convertField(parquetType, names); if (parquetType.isRepetition(REPEATED)) { throw new UnsupportedOperationException("REPEATED not supported outside LIST or MAP. Type: " + parquetType); } else if (parquetType.isRepetition(Type.Repetition.OPTIONAL)) { fields.add(new Schema.Field( parquetType.getName(), optional(fieldSchema), null, NULL_VALUE)); } else { // REQUIRED fields.add(new Schema.Field( parquetType.getName(), fieldSchema, null, (Object) null)); } } Schema schema = Schema.createRecord(name, null, nameCount > 1 ? name + nameCount : null, false); schema.setFields(fields); return schema; } private Schema convertField(final Type parquetType, Map names) { if (parquetType.isPrimitive()) { final PrimitiveType asPrimitive = parquetType.asPrimitiveType(); final PrimitiveTypeName parquetPrimitiveTypeName = asPrimitive.getPrimitiveTypeName(); final LogicalTypeAnnotation annotation = parquetType.getLogicalTypeAnnotation(); Schema schema = parquetPrimitiveTypeName.convert( new PrimitiveType.PrimitiveTypeNameConverter() { @Override public Schema convertBOOLEAN(PrimitiveTypeName primitiveTypeName) { return Schema.create(Schema.Type.BOOLEAN); } @Override public Schema convertINT32(PrimitiveTypeName primitiveTypeName) { return Schema.create(Schema.Type.INT); } @Override public Schema convertINT64(PrimitiveTypeName primitiveTypeName) { return Schema.create(Schema.Type.LONG); } @Override public Schema convertINT96(PrimitiveTypeName primitiveTypeName) { if (readInt96AsFixed) { return Schema.createFixed("INT96", "INT96 represented as byte[12]", null, 12); } throw new IllegalArgumentException( "INT96 is deprecated. As interim enable READ_INT96_AS_FIXED flag to read as byte array."); } @Override public Schema convertFLOAT(PrimitiveTypeName primitiveTypeName) { return Schema.create(Schema.Type.FLOAT); } @Override public Schema convertDOUBLE(PrimitiveTypeName primitiveTypeName) { return Schema.create(Schema.Type.DOUBLE); } @Override public Schema convertFIXED_LEN_BYTE_ARRAY(PrimitiveTypeName primitiveTypeName) { if (annotation instanceof LogicalTypeAnnotation.UUIDLogicalTypeAnnotation) { return Schema.create(Schema.Type.STRING); } else { int size = parquetType.asPrimitiveType().getTypeLength(); return Schema.createFixed(parquetType.getName(), null, null, size); } } @Override public Schema convertBINARY(PrimitiveTypeName primitiveTypeName) { if (annotation instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation || annotation instanceof LogicalTypeAnnotation.EnumLogicalTypeAnnotation) { return Schema.create(Schema.Type.STRING); } else { return Schema.create(Schema.Type.BYTES); } } }); LogicalType logicalType = convertLogicalType(annotation); if (logicalType != null && (!(annotation instanceof LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) || parquetPrimitiveTypeName == BINARY || parquetPrimitiveTypeName == FIXED_LEN_BYTE_ARRAY)) { schema = logicalType.addToSchema(schema); } return schema; } else { GroupType parquetGroupType = parquetType.asGroupType(); LogicalTypeAnnotation logicalTypeAnnotation = parquetGroupType.getLogicalTypeAnnotation(); if (logicalTypeAnnotation != null) { return logicalTypeAnnotation.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { @Override public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { if (parquetGroupType.getFieldCount()!= 1) { throw new UnsupportedOperationException("Invalid list type " + parquetGroupType); } Type repeatedType = parquetGroupType.getType(0); if (!repeatedType.isRepetition(REPEATED)) { throw new UnsupportedOperationException("Invalid list type " + parquetGroupType); } if (isElementType(repeatedType, parquetGroupType.getName())) { // repeated element types are always required return of(Schema.createArray(convertField(repeatedType, names))); } else { Type elementType = repeatedType.asGroupType().getType(0); if (elementType.isRepetition(Type.Repetition.OPTIONAL)) { return of(Schema.createArray(optional(convertField(elementType, names)))); } else { return of(Schema.createArray(convertField(elementType, names))); } } } @Override // for backward-compatibility public Optional visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) { return visitMapOrMapKeyValue(); } @Override public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { return visitMapOrMapKeyValue(); } private Optional visitMapOrMapKeyValue() { if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0).isPrimitive()) { throw new UnsupportedOperationException("Invalid map type " + parquetGroupType); } GroupType mapKeyValType = parquetGroupType.getType(0).asGroupType(); if (!mapKeyValType.isRepetition(REPEATED) || mapKeyValType.getFieldCount()!=2) { throw new UnsupportedOperationException("Invalid map type " + parquetGroupType); } Type keyType = mapKeyValType.getType(0); if (!keyType.isPrimitive() || !keyType.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveTypeName.BINARY) || !keyType.getLogicalTypeAnnotation().equals(stringType())) { throw new IllegalArgumentException("Map key type must be binary (UTF8): " + keyType); } Type valueType = mapKeyValType.getType(1); if (valueType.isRepetition(Type.Repetition.OPTIONAL)) { return of(Schema.createMap(optional(convertField(valueType, names)))); } else { return of(Schema.createMap(convertField(valueType, names))); } } @Override public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { return of(Schema.create(Schema.Type.STRING)); } }).orElseThrow(() -> new UnsupportedOperationException("Cannot convert Parquet type " + parquetType)); } else { // if no original type then it's a record return convertFields(parquetGroupType.getName(), parquetGroupType.getFields(), names); } } } private LogicalTypeAnnotation convertLogicalType(LogicalType logicalType) { if (logicalType == null) { return null; } else if (logicalType instanceof LogicalTypes.Decimal) { LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; return decimalType(decimal.getScale(), decimal.getPrecision()); } else if (logicalType instanceof LogicalTypes.Date) { return dateType(); } else if (logicalType instanceof LogicalTypes.TimeMillis) { return timeType(true, MILLIS); } else if (logicalType instanceof LogicalTypes.TimeMicros) { return timeType(true, MICROS); } else if (logicalType instanceof LogicalTypes.TimestampMillis) { return timestampType(true, MILLIS); } else if (logicalType instanceof LogicalTypes.TimestampMicros) { return timestampType(true, MICROS); } else if (logicalType.getName().equals(LogicalTypes.uuid().getName()) && writeParquetUUID) { return uuidType(); } return null; } private LogicalType convertLogicalType(LogicalTypeAnnotation annotation) { if (annotation == null) { return null; } return annotation.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { @Override public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { return of(LogicalTypes.decimal(decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); } @Override public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { return of(LogicalTypes.date()); } @Override public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { LogicalTypeAnnotation.TimeUnit unit = timeLogicalType.getUnit(); switch (unit) { case MILLIS: return of(LogicalTypes.timeMillis()); case MICROS: return of(LogicalTypes.timeMicros()); } return empty(); } @Override public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { LogicalTypeAnnotation.TimeUnit unit = timestampLogicalType.getUnit(); switch (unit) { case MILLIS: return of(LogicalTypes.timestampMillis()); case MICROS: return of(LogicalTypes.timestampMicros()); } return empty(); } @Override public Optional visit(UUIDLogicalTypeAnnotation uuidLogicalType) { return of(LogicalTypes.uuid()); } }).orElse(null); } /** * Implements the rules for interpreting existing data from the logical type * spec for the LIST annotation. This is used to produce the expected schema. *

* The AvroArrayConverter will decide whether the repeated type is the array * element type by testing whether the element schema and repeated type are * the same. This ensures that the LIST rules are followed when there is no * schema and that a schema can be provided to override the default behavior. */ private boolean isElementType(Type repeatedType, String parentName) { return ( // can't be a synthetic layer because it would be invalid repeatedType.isPrimitive() || repeatedType.asGroupType().getFieldCount() > 1 || repeatedType.asGroupType().getType(0).isRepetition(REPEATED) || // known patterns without the synthetic layer repeatedType.getName().equals("array") || repeatedType.getName().equals(parentName + "_tuple") || // default assumption assumeRepeatedIsListElement ); } private static Schema optional(Schema original) { // null is first in the union because Parquet's default is always null return Schema.createUnion(Arrays.asList( Schema.create(Schema.Type.NULL), original)); } private static String appendPath(String path, String fieldName) { if (path == null || path.isEmpty()) { return fieldName; } return path + '.' + fieldName; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy