com.spotify.scio.parquet.tensorflow.ParquetExampleFileBasedSink

/*
 * Copyright 2020 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.scio.parquet.tensorflow;

import com.spotify.parquet.tensorflow.TensorflowExampleParquetWriter;
import com.spotify.scio.parquet.BeamOutputFile;
import com.spotify.scio.parquet.WriterUtils;
import java.nio.channels.WritableByteChannel;
import org.apache.beam.sdk.io.FileBasedSink;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.io.hadoop.SerializableConfiguration;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.util.MimeTypes;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.tensorflow.metadata.v0.Schema;
import org.tensorflow.proto.example.Example;

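/**
 * A Beam {@link FileBasedSink} that writes TensorFlow {@link Example} records to Parquet files,
 * using the configured TensorFlow Metadata {@link Schema}, Hadoop {@link Configuration}, and
 * Parquet {@link CompressionCodecName}.
 */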
public class ParquetExampleFileBasedSink extends FileBasedSink<Example, Void, Example> {

  private final Schema schema;
  private final SerializableConfiguration conf;
  private final CompressionCodecName compression;

  public ParquetExampleFileBasedSink(
      ValueProvider<ResourceId> baseOutputFileName,
      FileBasedSink.DynamicDestinations<Example, Void, Example> dynamicDestinations,
      Schema schema,
      Configuration conf,
      CompressionCodecName compression) {
    super(baseOutputFileName, dynamicDestinations);
    this.schema = schema;
    this.conf = new SerializableConfiguration(conf);
    this.compression = compression;
  }

  @Override
  public FileBasedSink.WriteOperation<Void, Example> createWriteOperation() {
    return new ParquetExampleWriteOperation(this, schema, conf, compression);
  }

  // =======================================================================
  // WriteOperation
  // =======================================================================

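  /** Write operation that creates a {@link ParquetExampleWriter} for each output bundle. */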
  static class ParquetExampleWriteOperation extends FileBasedSink.WriteOperation<Void, Example> {
    private final Schema schema;
    private final SerializableConfiguration conf;
    private final CompressionCodecName compression;

    ParquetExampleWriteOperation(
        FileBasedSink<Example, Void, Example> sink,
        Schema schema,
        SerializableConfiguration conf,
        CompressionCodecName compression) {
      super(sink);
      this.schema = schema;
      this.conf = conf;
      this.compression = compression;
    }

    @Override
    public Writer<Void, Example> createWriter() throws Exception {
      return new ParquetExampleWriter(this, schema, conf, compression);
    }
  }

  // =======================================================================
  // Writer
  // =======================================================================

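  /**
   * Writer for a single output file: {@code prepareWrite} wraps the destination channel in a
   * {@link BeamOutputFile} and builds the Parquet writer, {@code write} appends one
   * {@link Example} per call, and {@code finishWrite} closes the writer, flushing the Parquet
   * footer.
   */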
  static class ParquetExampleWriter extends FileBasedSink.Writer<Void, Example> {

    private final Schema schema;
    private final SerializableConfiguration conf;
    private final CompressionCodecName compression;
    private ParquetWriter<Example> writer;

    public ParquetExampleWriter(
        FileBasedSink.WriteOperation<Void, Example> writeOperation,
        Schema schema,
        SerializableConfiguration conf,
        CompressionCodecName compression) {
      super(writeOperation, MimeTypes.BINARY);
      this.schema = schema;
      this.conf = conf;
      this.compression = compression;
    }

    @Override
    protected void prepareWrite(WritableByteChannel channel) throws Exception {
      BeamOutputFile outputFile = BeamOutputFile.of(channel);
      TensorflowExampleParquetWriter.Builder builder =
          TensorflowExampleParquetWriter.builder(outputFile).withSchema(schema);
      writer = WriterUtils.build(builder, conf.get(), compression);
    }

    @Override
    public void write(Example value) throws Exception {
      writer.write(value);
    }

    @Override
    protected void finishWrite() throws Exception {
      writer.close();
    }
  }
}
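
For context, a minimal usage sketch, not part of the library source above: it shows how this sink could be handed to Beam's WriteFiles transform using standard Beam helpers (StaticValueProvider, DefaultFilenamePolicy, DynamicFileDestinations). The wrapper class ParquetExampleSinkUsage, the ".parquet" suffix, and the Snappy/4-shard choices are illustrative assumptions; within scio, ParquetExampleIO builds an equivalent sink internally.

// Hypothetical helper, not part of scio: writes a PCollection of TensorFlow Examples
// as Snappy-compressed Parquet files under the given output prefix.
import org.apache.beam.sdk.io.DefaultFilenamePolicy;
import org.apache.beam.sdk.io.DynamicFileDestinations;
import org.apache.beam.sdk.io.FileBasedSink;
import org.apache.beam.sdk.io.WriteFiles;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
import org.apache.beam.sdk.transforms.SerializableFunctions;
import org.apache.beam.sdk.values.PCollection;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.tensorflow.metadata.v0.Schema;
import org.tensorflow.proto.example.Example;

import com.spotify.scio.parquet.tensorflow.ParquetExampleFileBasedSink;

public class ParquetExampleSinkUsage {

  public static void writeExamples(
      PCollection<Example> examples, Schema schema, String outputPrefix) {
    // Resolve the output prefix and derive a shard-numbered ".parquet" filename policy.
    ValueProvider<ResourceId> prefix =
        StaticValueProvider.of(FileBasedSink.convertToFileResourceIfPossible(outputPrefix));
    FileBasedSink.FilenamePolicy policy =
        DefaultFilenamePolicy.fromStandardParameters(prefix, null, ".parquet", false);

    // One constant destination; Examples are written as-is (identity format function).
    FileBasedSink.DynamicDestinations<Example, Void, Example> destinations =
        DynamicFileDestinations.constant(policy, SerializableFunctions.<Example>identity());

    ParquetExampleFileBasedSink sink =
        new ParquetExampleFileBasedSink(
            prefix, destinations, schema, new Configuration(), CompressionCodecName.SNAPPY);

    // WriteFiles drives the sink's WriteOperation/Writer lifecycle defined above.
    examples.apply("WriteParquetExamples", WriteFiles.to(sink).withNumShards(4));
  }
}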