org.apache.beam.sdk.extensions.smb.AvroFileOperations Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of scio-smb_2.13 Show documentation
Show all versions of scio-smb_2.13 Show documentation
Sort Merge Bucket source/sink implementations for Apache Beam
The newest version!
/*
* Copyright 2019 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.beam.sdk.extensions.smb;
import java.io.IOException;
import java.io.Serializable;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.io.DatumReader;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.extensions.avro.coders.AvroCoder;
import org.apache.beam.sdk.extensions.avro.io.AvroDatumFactory;
import org.apache.beam.sdk.extensions.avro.io.AvroIO;
import org.apache.beam.sdk.io.Compression;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.PatchedSerializableAvroCodecFactory;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.transforms.display.DisplayData.Builder;
import org.apache.beam.sdk.util.MimeTypes;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier;
/**
 * {@link org.apache.beam.sdk.extensions.smb.FileOperations} implementation for Avro files.
 *
 * <p>Files are declared to the superclass as {@link Compression#UNCOMPRESSED} binary files;
 * compression is applied by the Avro codec configured on the sink instead.
 *
 * @param <ValueT> type of the records read from and written to the Avro files
 */
public class AvroFileOperations<ValueT> extends FileOperations<ValueT> {

  private final AvroDatumFactory<ValueT> datumFactory;
  private final SerializableSchemaSupplier schemaSupplier;
  private PatchedSerializableAvroCodecFactory codec;
  private Map<String, Object> metadata;

  /** Returns the codec used when none is set through {@link #withCodec}: deflate, level 6. */
  static CodecFactory defaultCodec() {
    return CodecFactory.deflateCodec(6);
  }

  private AvroFileOperations(AvroDatumFactory<ValueT> datumFactory, Schema schema) {
    super(Compression.UNCOMPRESSED, MimeTypes.BINARY); // Avro has its own compression via codec
    this.schemaSupplier = new SerializableSchemaSupplier(schema);
    this.datumFactory = datumFactory;
    this.codec = new PatchedSerializableAvroCodecFactory(defaultCodec());
  }

  /**
   * Creates file operations for the given datum factory and schema, using {@link #defaultCodec()}
   * and no file metadata.
   *
   * @param datumFactory factory used to build the datum readers and writers
   * @param schema Avro schema of the records
   * @param <V> record type
   * @return a new {@code AvroFileOperations} instance
   */
  public static <V extends IndexedRecord> AvroFileOperations<V> of(
      AvroDatumFactory<V> datumFactory, Schema schema) {
    return new AvroFileOperations<>(datumFactory, schema);
  }

  /**
   * Sets the Avro codec passed to the sink.
   *
   * <p>This modifies the current instance in place; it does not create a copy.
   *
   * @param codec codec factory to use in place of the current one
   * @return this instance
   */
  public AvroFileOperations<ValueT> withCodec(CodecFactory codec) {
    this.codec = new PatchedSerializableAvroCodecFactory(codec);
    return this;
  }

  /**
   * Sets the metadata passed to the sink. A {@code null} map means no metadata is configured on
   * the sink.
   *
   * <p>This modifies the current instance in place; it does not create a copy.
   *
   * @param metadata metadata map handed to {@link AvroIO.Sink#withMetadata}
   * @return this instance
   */
  public AvroFileOperations<ValueT> withMetadata(Map<String, Object> metadata) {
    this.metadata = metadata;
    return this;
  }

  /** Adds the codec class and the schema's full name to the superclass display data. */
  @Override
  public void populateDisplayData(Builder builder) {
    super.populateDisplayData(builder);
    builder.add(DisplayData.item("codecFactory", codec.getCodec().getClass()));
    builder.add(DisplayData.item("schema", getSchema().getFullName()));
  }

  /** Creates a reader backed by this instance's datum factory and schema. */
  @Override
  protected Reader<ValueT> createReader() {
    return new AvroReader<>(datumFactory, schemaSupplier);
  }

  /**
   * Creates an {@link AvroIO.Sink} for the schema, configured with the datum factory, the codec
   * and, when set, the metadata.
   */
  @SuppressWarnings("unchecked")
  @Override
  protected FileIO.Sink<ValueT> createSink() {
    // AvroIO.sink(Schema) is not typed to ValueT; the cast re-types it before the datum factory
    // for ValueT is installed as the writer factory.
    final AvroIO.Sink<ValueT> sink =
        ((AvroIO.Sink<ValueT>) AvroIO.sink(getSchema()))
            .withDatumWriterFactory(datumFactory)
            .withCodec(codec.getCodec());

    if (metadata != null) {
      return sink.withMetadata(metadata);
    } else {
      return sink;
    }
  }

  /** Returns an {@link AvroCoder} built from the datum factory and schema. */
  @SuppressWarnings("unchecked")
  @Override
  public Coder<ValueT> getCoder() {
    return AvroCoder.of(datumFactory, getSchema());
  }

  /** Returns the Avro schema held by this instance. */
  Schema getSchema() {
    return schemaSupplier.get();
  }

  /**
   * Serialized stand-in for {@link SerializableSchemaSupplier}: carries the schema as a string
   * and is swapped back for a supplier when deserialized.
   */
  private static class SerializableSchemaString implements Serializable {
    private final String schema;

    private SerializableSchemaString(String schema) {
      this.schema = schema;
    }

    /** Parses the stored schema string and returns a supplier wrapping the parsed schema. */
    private Object readResolve() throws IOException, ClassNotFoundException {
      return new SerializableSchemaSupplier(new Schema.Parser().parse(schema));
    }
  }

  /**
   * Serializable supplier of a {@link Schema}. The schema field is transient; on serialization
   * the supplier is replaced by a {@link SerializableSchemaString} holding {@code
   * schema.toString()}.
   */
  static class SerializableSchemaSupplier implements Serializable, Supplier<Schema> {
    private transient Schema schema;

    SerializableSchemaSupplier(Schema schema) {
      this.schema = schema;
    }

    /** Substitutes the string-based representation when this object is serialized. */
    private Object writeReplace() {
      return new SerializableSchemaString(schema.toString());
    }

    @Override
    public Schema get() {
      return schema;
    }
  }

  ////////////////////////////////////////
  // Reader
  ////////////////////////////////////////

  /**
   * Reader that iterates the records of an Avro container file through a {@link DataFileStream}.
   *
   * @param <ValueT> type of the records produced
   */
  private static class AvroReader<ValueT> extends FileOperations.Reader<ValueT> {
    private final AvroDatumFactory<ValueT> datumFactory;
    private final SerializableSchemaSupplier schemaSupplier;
    // Created in prepareRead and closed in finishRead.
    private transient DataFileStream<ValueT> reader;

    AvroReader(AvroDatumFactory<ValueT> datumFactory, SerializableSchemaSupplier schemaSupplier) {
      this.datumFactory = datumFactory;
      this.schemaSupplier = schemaSupplier;
    }

    /**
     * Opens an Avro stream over the channel. The datum reader is obtained from the factory with
     * the supplied schema passed for both of its schema arguments.
     *
     * @param channel channel positioned at the Avro file contents
     * @throws IOException if the Avro stream cannot be opened
     */
    @Override
    public void prepareRead(ReadableByteChannel channel) throws IOException {
      final Schema schema = schemaSupplier.get();
      final DatumReader<ValueT> datumReader = datumFactory.apply(schema, schema);
      reader = new DataFileStream<>(Channels.newInputStream(channel), datumReader);
    }

    /** Returns the next record from the underlying Avro stream. */
    @Override
    public ValueT readNext() {
      return reader.next();
    }

    /** Returns whether the underlying Avro stream has another record. */
    @Override
    public boolean hasNextElement() {
      return reader.hasNext();
    }

    /**
     * Closes the underlying Avro stream.
     *
     * @throws IOException if closing the stream fails
     */
    @Override
    public void finishRead() throws IOException {
      reader.close();
    }
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy