/*
* Copyright 2020 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.spotify.scio.parquet.tensorflow;
import com.spotify.parquet.tensorflow.TensorflowExampleParquetWriter;
import com.spotify.scio.parquet.BeamOutputFile;
import com.spotify.scio.parquet.WriterUtils;
import java.nio.channels.WritableByteChannel;
import org.apache.beam.sdk.io.FileBasedSink;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.io.hadoop.SerializableConfiguration;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.util.MimeTypes;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.tensorflow.metadata.v0.Schema;
import org.tensorflow.proto.example.Example;
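/**
 * A Beam {@link FileBasedSink} that writes TensorFlow {@link Example} records as Parquet files.
 * The TensorFlow Metadata {@link Schema} drives the Parquet schema, and the Hadoop
 * {@link Configuration} is wrapped in a {@link SerializableConfiguration} so it can be
 * shipped to workers.
 */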
public class ParquetExampleFileBasedSink extends FileBasedSink<Example, Void, Example> {
private final Schema schema;
private final SerializableConfiguration conf;
private final CompressionCodecName compression;
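/**
 * @param baseOutputFileName base location for output files
 * @param dynamicDestinations routing of records to destination files
 * @param schema TensorFlow Metadata schema used to derive the Parquet schema
 * @param conf Hadoop configuration, wrapped for serialization across workers
 * @param compression Parquet compression codec to apply to output files
 */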
public ParquetExampleFileBasedSink(
ValueProvider<ResourceId> baseOutputFileName,
FileBasedSink.DynamicDestinations<Example, Void, Example> dynamicDestinations,
Schema schema,
Configuration conf,
CompressionCodecName compression) {
super(baseOutputFileName, dynamicDestinations);
this.schema = schema;
this.conf = new SerializableConfiguration(conf);
this.compression = compression;
}
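/** Creates the write operation that manages the lifecycle of the sink's output files. */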
@Override
public FileBasedSink.WriteOperation<Void, Example> createWriteOperation() {
return new ParquetExampleWriteOperation(this, schema, conf, compression);
}
// =======================================================================
// WriteOperation
// =======================================================================
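/**
 * Write operation that hands each output file a {@link ParquetExampleWriter}, forwarding the
 * schema, Hadoop configuration, and compression codec.
 */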
static class ParquetExampleWriteOperation extends FileBasedSink.WriteOperation<Void, Example> {
private final Schema schema;
private final SerializableConfiguration conf;
private final CompressionCodecName compression;
ParquetExampleWriteOperation(
FileBasedSink<Example, Void, Example> sink,
Schema schema,
SerializableConfiguration conf,
CompressionCodecName compression) {
super(sink);
this.schema = schema;
this.conf = conf;
this.compression = compression;
}
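/** Creates a writer for a single output file. */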
@Override
public Writer<Void, Example> createWriter() throws Exception {
return new ParquetExampleWriter(this, schema, conf, compression);
}
}
// =======================================================================
// Writer
// =======================================================================
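/**
 * Writer for one output file. Adapts Beam's {@link WritableByteChannel} into a
 * {@link BeamOutputFile} and delegates record writes to a {@link TensorflowExampleParquetWriter}.
 */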
static class ParquetExampleWriter extends FileBasedSink.Writer<Void, Example> {
private final Schema schema;
private final SerializableConfiguration conf;
private final CompressionCodecName compression;
private ParquetWriter<Example> writer;
public ParquetExampleWriter(
FileBasedSink.WriteOperation<Void, Example> writeOperation,
Schema schema,
SerializableConfiguration conf,
CompressionCodecName compression) {
super(writeOperation, MimeTypes.BINARY);
this.schema = schema;
this.conf = conf;
this.compression = compression;
}
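/** Opens a Parquet writer over the channel that Beam provides for this output file. */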
@Override
protected void prepareWrite(WritableByteChannel channel) throws Exception {
BeamOutputFile outputFile = BeamOutputFile.of(channel);
TensorflowExampleParquetWriter.Builder builder =
TensorflowExampleParquetWriter.builder(outputFile).withSchema(schema);
writer = WriterUtils.build(builder, conf.get(), compression);
}
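/** Writes a single {@link Example} record through the underlying Parquet writer. */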
@Override
public void write(Example value) throws Exception {
writer.write(value);
}
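/** Closes the Parquet writer, flushing buffered pages and the Parquet footer. */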
@Override
protected void finishWrite() throws Exception {
writer.close();
}
}
}
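// ---------------------------------------------------------------------------
// Usage sketch (illustrative only): this sink is typically wired up by Scio's
// higher-level Parquet Example IO rather than constructed by hand. A manual
// setup might look roughly like the following, assuming a `destinations`
// implementation of FileBasedSink.DynamicDestinations<Example, Void, Example>,
// a TensorFlow Metadata `schema`, and a PCollection<Example> named `examples`:
//
//   ParquetExampleFileBasedSink sink =
//       new ParquetExampleFileBasedSink(
//           ValueProvider.StaticValueProvider.of(
//               FileSystems.matchNewResource("gs://bucket/output", true)),
//           destinations,
//           schema,
//           new Configuration(),
//           CompressionCodecName.SNAPPY);
//   examples.apply(org.apache.beam.sdk.io.WriteFiles.to(sink));
// ---------------------------------------------------------------------------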