// com.spotify.scio.parquet.avro.ParquetAvroFileBasedSink
/*
* Copyright 2017 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.spotify.scio.parquet.avro;
import com.spotify.scio.parquet.BeamOutputFile;
import com.spotify.scio.parquet.WriterUtils;
import java.nio.channels.WritableByteChannel;
import org.apache.avro.Schema;
import org.apache.beam.sdk.io.FileBasedSink;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.io.hadoop.SerializableConfiguration;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.util.MimeTypes;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
public class ParquetAvroFileBasedSink extends FileBasedSink {
private final String schemaString;
private final SerializableConfiguration conf;
private final CompressionCodecName compression;
public ParquetAvroFileBasedSink(
ValueProvider baseOutputFileName,
FileBasedSink.DynamicDestinations dynamicDestinations,
Schema schema,
Configuration conf,
CompressionCodecName compression) {
super(baseOutputFileName, dynamicDestinations);
this.schemaString = schema.toString();
this.conf = new SerializableConfiguration(conf);
this.compression = compression;
}
@Override
public FileBasedSink.WriteOperation createWriteOperation() {
return new ParquetAvroWriteOperation(this, schemaString, conf, compression);
}
// =======================================================================
// WriteOperation
// =======================================================================
static class ParquetAvroWriteOperation extends WriteOperation {
private final String schemaString;
private final SerializableConfiguration conf;
private final CompressionCodecName compression;
public ParquetAvroWriteOperation(
FileBasedSink sink,
String schemaString,
SerializableConfiguration conf,
CompressionCodecName compression) {
super(sink);
this.schemaString = schemaString;
this.conf = conf;
this.compression = compression;
}
@Override
public Writer createWriter() throws Exception {
return new ParquetAvroWriter<>(
this, new Schema.Parser().parse(schemaString), conf, compression);
}
}
// =======================================================================
// Writer
// =======================================================================
static class ParquetAvroWriter extends FileBasedSink.Writer {
private final Schema schema;
private final SerializableConfiguration conf;
private final CompressionCodecName compression;
private ParquetWriter writer;
public ParquetAvroWriter(
WriteOperation writeOperation,
Schema schema,
SerializableConfiguration conf,
CompressionCodecName compression) {
super(writeOperation, MimeTypes.BINARY);
this.schema = schema;
this.conf = conf;
this.compression = compression;
}
@Override
protected void prepareWrite(WritableByteChannel channel) throws Exception {
BeamOutputFile outputFile = BeamOutputFile.of(channel);
Configuration configuration = conf.get();
AvroParquetWriter.Builder builder =
AvroParquetWriter.builder(outputFile).withSchema(schema);
writer = WriterUtils.build(builder, configuration, compression);
}
@Override
public void write(T value) throws Exception {
writer.write(value);
}
@Override
protected void finishWrite() throws Exception {
writer.close();
}
}
}