org.apache.beam.sdk.extensions.smb.ParquetAvroFileOperations

/*
 * Copyright 2021 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.beam.sdk.extensions.smb;

import java.io.IOException;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.util.NoSuchElementException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.reflect.ReflectData;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.extensions.avro.coders.AvroCoder;
import org.apache.beam.sdk.extensions.smb.AvroFileOperations.SerializableSchemaSupplier;
import org.apache.beam.sdk.io.Compression;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.hadoop.SerializableConfiguration;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.util.MimeTypes;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.parquet.avro.*;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

/**
 * {@link org.apache.beam.sdk.extensions.smb.FileOperations} implementation for Parquet files with
 * Avro records.
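 *
 * <p>A minimal construction sketch, assuming a hypothetical {@code User} record with a
 * {@code name} and an {@code age} field; the resulting instance is typically handed to an
 * SMB read or write transform rather than used directly:
 *
 * <pre>{@code
 * Schema schema =
 *     SchemaBuilder.record("User").fields()
 *         .requiredString("name")
 *         .requiredInt("age")
 *         .endRecord();
 *
 * ParquetAvroFileOperations<GenericRecord> ops =
 *     ParquetAvroFileOperations.of(schema)
 *         .withCompression(CompressionCodecName.SNAPPY)
 *         .withFilterPredicate(FilterApi.gt(FilterApi.intColumn("age"), 21));
 * }</pre>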
 */
public class ParquetAvroFileOperations<ValueT extends IndexedRecord>
    extends FileOperations<ValueT> {
  static final CompressionCodecName DEFAULT_COMPRESSION = CompressionCodecName.ZSTD;

  private final Class<ValueT> recordClass;
  private final SerializableSchemaSupplier schemaSupplier;

  private SerializableSchemaSupplier projectionSupplier;
  private CompressionCodecName compression;
  private SerializableConfiguration conf;
  private FilterPredicate predicate;

  private ParquetAvroFileOperations(Schema schema, Class<ValueT> recordClass) {
    super(Compression.UNCOMPRESSED, MimeTypes.BINARY);
    this.schemaSupplier = new SerializableSchemaSupplier(schema);
    this.recordClass = recordClass;
    this.compression = DEFAULT_COMPRESSION;
    this.conf = new SerializableConfiguration(new Configuration());
  }

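  /** Returns file operations for {@link GenericRecord}s conforming to the given schema. */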
  public static ParquetAvroFileOperations<GenericRecord> of(Schema schema) {
    return new ParquetAvroFileOperations<>(schema, null);
  }

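  /** Returns file operations for the given record class, deriving its schema via {@link ReflectData}. */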
  public static <V extends IndexedRecord> ParquetAvroFileOperations<V> of(Class<V> recordClass) {
    return new ParquetAvroFileOperations<>(
        new ReflectData(recordClass.getClassLoader()).getSchema(recordClass), recordClass);
  }

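  /** Sets the Hadoop {@link Configuration} used when reading and writing Parquet files. */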
  public ParquetAvroFileOperations<ValueT> withConfiguration(Configuration configuration) {
    this.conf = new SerializableConfiguration(configuration);
    return this;
  }

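  /** Sets the Parquet compression codec for written files; the default is {@code ZSTD}. */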
  public ParquetAvroFileOperations<ValueT> withCompression(CompressionCodecName compression) {
    this.compression = compression;
    return this;
  }

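  /** Sets a {@link FilterPredicate} that is pushed down to readers created by this instance. */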
  public ParquetAvroFileOperations<ValueT> withFilterPredicate(FilterPredicate filterPredicate) {
    this.predicate = filterPredicate;
    return this;
  }

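  /** Sets a projection schema so that readers request only the projected columns. */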
  public ParquetAvroFileOperations<ValueT> withProjection(Schema projection) {
    this.projectionSupplier =
        (projection != null) ? new SerializableSchemaSupplier(projection) : null;
    return this;
  }

  @Override
  public void populateDisplayData(DisplayData.Builder builder) {
    super.populateDisplayData(builder);
    builder.add(DisplayData.item("compressionCodecName", compression.name()));
    builder.add(DisplayData.item("schema", schemaSupplier.get().getFullName()));
  }

  @Override
  protected Reader<ValueT> createReader() {
    return new ParquetAvroReader<>(
        schemaSupplier, projectionSupplier, conf, predicate, recordClass);
  }

  @Override
  protected FileIO.Sink<ValueT> createSink() {
    return new ParquetAvroSink<>(schemaSupplier, compression, conf);
  }

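  /** Returns a generic-record {@link AvroCoder} when no record class is set, otherwise a reflect-based one. */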
  @SuppressWarnings("unchecked")
  @Override
  public Coder<ValueT> getCoder() {
    return recordClass == null
        ? (AvroCoder<ValueT>) AvroCoder.of(getSchema())
        : AvroCoder.reflect(recordClass);
  }

  Schema getSchema() {
    return schemaSupplier.get();
  }

  ////////////////////////////////////////
  // Reader
  ////////////////////////////////////////

  private static class ParquetAvroReader<ValueT extends IndexedRecord>
      extends FileOperations.Reader<ValueT> {
    private final SerializableSchemaSupplier readSchemaSupplier;

    private final SerializableSchemaSupplier projectionSchemaSupplier;
    private final SerializableConfiguration conf;
    private final FilterPredicate predicate;
    private final Class<ValueT> recordClass;
    private transient ParquetReader<ValueT> reader;
    private transient ValueT current;

    private ParquetAvroReader(
        SerializableSchemaSupplier readSchemaSupplier,
        SerializableSchemaSupplier projectionSchemaSupplier,
        SerializableConfiguration conf,
        FilterPredicate predicate,
        Class<ValueT> recordClass) {
      this.readSchemaSupplier = readSchemaSupplier;
      this.projectionSchemaSupplier = projectionSchemaSupplier;
      this.conf = conf;
      this.predicate = predicate;
      this.recordClass = recordClass;
    }

    @Override
    public void prepareRead(ReadableByteChannel channel) throws IOException {
      final Schema readSchema = readSchemaSupplier.get();
      final Configuration configuration = conf.get();
      AvroReadSupport.setAvroReadSchema(configuration, readSchema);

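      // Fall back to projecting the full read schema when no explicit projection was supplied.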
      if (projectionSchemaSupplier != null) {
        AvroReadSupport.setRequestedProjection(configuration, projectionSchemaSupplier.get());
      } else {
        AvroReadSupport.setRequestedProjection(configuration, readSchema);
      }

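      // Generic records: default the Avro data supplier to GenericDataSupplier unless the
      // caller has already configured one.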
      if (recordClass == null && configuration.get(AvroReadSupport.AVRO_DATA_SUPPLIER) == null) {
        configuration.setClass(
            AvroReadSupport.AVRO_DATA_SUPPLIER, GenericDataSupplier.class, AvroDataSupplier.class);
      }

      ParquetReader.Builder<ValueT> builder =
          AvroParquetReader.<ValueT>builder(new ParquetInputFile(channel)).withConf(configuration);
      if (predicate != null) {
        builder = builder.withFilter(FilterCompat.get(predicate));
      }
      reader = builder.build();
      current = reader.read();
    }

    @Override
    public ValueT readNext() throws IOException, NoSuchElementException {
      ValueT r = current;
      current = reader.read();
      return r;
    }

    @Override
    public boolean hasNextElement() throws IOException {
      return current != null;
    }

    @Override
    public void finishRead() throws IOException {
      reader.close();
    }
  }

  ////////////////////////////////////////
  // Sink
  ////////////////////////////////////////

  private static class ParquetAvroSink<ValueT extends IndexedRecord>
      implements FileIO.Sink<ValueT> {
    private final SerializableSchemaSupplier schemaSupplier;
    private final CompressionCodecName compression;
    private final SerializableConfiguration conf;
    private transient ParquetWriter<ValueT> writer;

    private ParquetAvroSink(
        SerializableSchemaSupplier schemaSupplier,
        CompressionCodecName compression,
        SerializableConfiguration conf) {
      this.schemaSupplier = schemaSupplier;
      this.compression = compression;
      this.conf = conf;
    }

    @Override
    public void open(WritableByteChannel channel) throws IOException {
      // https://github.com/apache/parquet-mr/tree/master/parquet-hadoop#class-parquetoutputformat
      final Configuration configuration = conf.get();

      AvroParquetWriter.Builder<ValueT> builder =
          AvroParquetWriter.<ValueT>builder(new ParquetOutputFile(channel))
              .withSchema(schemaSupplier.get());

      // Workaround for PARQUET-2265
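      // (the writer builder does not apply AVRO_DATA_SUPPLIER from the Configuration on
      // its own, so the configured supplier's data model is passed explicitly).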
      if (configuration.getClass(AvroWriteSupport.AVRO_DATA_SUPPLIER, null) != null) {
        Class<? extends AvroDataSupplier> dataModelSupplier =
            configuration.getClass(
                AvroWriteSupport.AVRO_DATA_SUPPLIER,
                SpecificDataSupplier.class,
                AvroDataSupplier.class);
        builder =
            builder.withDataModel(
                ReflectionUtils.newInstance(dataModelSupplier, configuration).get());
      }

      writer = ParquetUtils.buildWriter(builder, configuration, compression);
    }

    @Override
    public void write(ValueT element) throws IOException {
      writer.write(element);
    }

    @Override
    public void flush() throws IOException {
      writer.close();
    }
  }
}



