/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package parquet.avro;

import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ReflectionUtils;

import parquet.hadoop.api.ReadSupport;
import parquet.io.api.RecordMaterializer;
import parquet.schema.MessageType;

/**
 * Avro implementation of {@link ReadSupport} for Avro {@link IndexedRecord}s, which covers both
 * Avro Specific and Generic records. Users should use {@link AvroParquetReader} or
 * {@link AvroParquetInputFormat} rather than using this class directly.
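 * <p>
 * A minimal read sketch; the file path is illustrative, not part of this API:
 * <pre>
 * AvroParquetReader&lt;IndexedRecord&gt; reader =
 *     new AvroParquetReader&lt;IndexedRecord&gt;(new Path("/tmp/users.parquet"));
 * IndexedRecord record;
 * while ((record = reader.read()) != null) {
 *   // process record
 * }
 * reader.close();
 * </pre>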
 */
public class AvroReadSupport extends ReadSupport<IndexedRecord> {

  public static String AVRO_REQUESTED_PROJECTION = "parquet.avro.projection";
  private static final String AVRO_READ_SCHEMA = "parquet.avro.read.schema";

  static final String AVRO_SCHEMA_METADATA_KEY = "parquet.avro.schema";
  private static final String AVRO_READ_SCHEMA_METADATA_KEY = "avro.read.schema";

  public static String AVRO_DATA_SUPPLIER = "parquet.avro.data.supplier";

  /**
   * @see parquet.avro.AvroParquetInputFormat#setRequestedProjection(org.apache.hadoop.mapreduce.Job, org.apache.avro.Schema)
   */
  public static void setRequestedProjection(Configuration configuration, Schema requestedProjection) {
    configuration.set(AVRO_REQUESTED_PROJECTION, requestedProjection.toString());
  }

  /**
   * @see parquet.avro.AvroParquetInputFormat#setAvroReadSchema(org.apache.hadoop.mapreduce.Job, org.apache.avro.Schema)
   */
  public static void setAvroReadSchema(Configuration configuration, Schema avroReadSchema) {
    configuration.set(AVRO_READ_SCHEMA, avroReadSchema.toString());
  }
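
  /**
   * Registers the {@link AvroDataSupplier} whose {@code AvroData} is used to create records at
   * read time. The supplier class in this sketch is hypothetical; pass any implementation of
   * {@link AvroDataSupplier}:
   * <pre>
   * AvroReadSupport.setAvroDataSupplier(conf, MyReflectDataSupplier.class);
   * </pre>
   */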
  public static void setAvroDataSupplier(Configuration configuration,
      Class<? extends AvroDataSupplier> clazz) {
    // store the class name, which Configuration.getClass can resolve back to a Class
    configuration.set(AVRO_DATA_SUPPLIER, clazz.getName());
  }
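
  /**
   * Builds the {@link ReadContext} for this read: narrows the file schema to the requested
   * projection, if one was configured, and forwards any user-supplied Avro read schema as
   * read-support metadata for {@code prepareForRead}.
   */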
  @Override
  public ReadContext init(Configuration configuration,
                          Map<String, String> keyValueMetaData,
                          MessageType fileSchema) {
    MessageType projection = fileSchema;
    Map<String, String> metadata = null;

    String requestedProjectionString = configuration.get(AVRO_REQUESTED_PROJECTION);
    if (requestedProjectionString != null) {
      Schema avroRequestedProjection = new Schema.Parser().parse(requestedProjectionString);
      projection = new AvroSchemaConverter(configuration).convert(avroRequestedProjection);
    }

    String avroReadSchema = configuration.get(AVRO_READ_SCHEMA);
    if (avroReadSchema != null) {
      metadata = new LinkedHashMap<String, String>();
      metadata.put(AVRO_READ_SCHEMA_METADATA_KEY, avroReadSchema);
    }

    // use getSchemaForRead because it checks that the requested schema is a
    // subset of the columns in the file schema
    return new ReadContext(getSchemaForRead(fileSchema, projection), metadata);
  }
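
  /**
   * Resolves the Avro schema used to materialize records, in order of precedence: an explicit
   * read schema set via {@code setAvroReadSchema}, then the writer's Avro schema stored in the
   * file metadata, then a schema converted from the Parquet file schema.
   */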
  @Override
  public RecordMaterializer<IndexedRecord> prepareForRead(
      Configuration configuration, Map<String, String> keyValueMetaData,
      MessageType fileSchema, ReadContext readContext) {
    MessageType parquetSchema = readContext.getRequestedSchema();

    Schema avroSchema;
    if (readContext.getReadSupportMetadata() != null &&
        readContext.getReadSupportMetadata().get(AVRO_READ_SCHEMA_METADATA_KEY) != null) {
      // use the Avro read schema provided by the user
      avroSchema = new Schema.Parser().parse(readContext.getReadSupportMetadata().get(AVRO_READ_SCHEMA_METADATA_KEY));
    } else if (keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY) != null) {
      // use the Avro schema from the file metadata if present
      avroSchema = new Schema.Parser().parse(keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY));
    } else {
      // default to converting the Parquet schema into an Avro schema
      avroSchema = new AvroSchemaConverter(configuration).convert(parquetSchema);
    }

    Class<? extends AvroDataSupplier> suppClass = configuration.getClass(AVRO_DATA_SUPPLIER,
        SpecificDataSupplier.class,
        AvroDataSupplier.class);
    AvroDataSupplier supplier = ReflectionUtils.newInstance(suppClass, configuration);
    return new AvroRecordMaterializer(parquetSchema, avroSchema, supplier.get());
  }
}