io.kestra.plugin.serdes.parquet.ParquetToIon Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of plugin-serdes Show documentation
Serialize and deserialize data formats in Kestra workflows.
The newest version!
package io.kestra.plugin.serdes.parquet;
import io.kestra.core.models.annotations.Example;
import io.kestra.core.models.annotations.Plugin;
import io.kestra.core.models.annotations.PluginProperty;
import io.kestra.core.models.executions.metrics.Counter;
import io.kestra.core.models.tasks.RunnableTask;
import io.kestra.core.models.tasks.Task;
import io.kestra.core.runners.RunContext;
import io.kestra.core.serializers.FileSerde;
import io.kestra.plugin.serdes.avro.AvroConverter;
import io.kestra.plugin.serdes.avro.AvroDeserializer;
import lombok.*;
import lombok.experimental.SuperBuilder;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import java.io.*;
import java.net.URI;
import java.util.Map;
import java.util.function.Consumer;
import jakarta.validation.constraints.NotNull;
import reactor.core.publisher.Flux;
import reactor.core.publisher.FluxSink;
import reactor.core.publisher.Mono;
import static io.kestra.core.utils.Rethrow.throwConsumer;
@SuperBuilder
@ToString
@EqualsAndHashCode
@Getter
@NoArgsConstructor
@io.swagger.v3.oas.annotations.media.Schema(
title = "Read a provided parquet file and convert it to ion serialized data file."
)
@Plugin(
examples = {
@Example(
full = true,
title = "Convert a parquet file to the Amazon Ion format.",
code = """
id: parquet_to_ion
namespace: company.team
tasks:
- id: http_download
type: io.kestra.plugin.core.http.Download
uri: https://huggingface.co/datasets/kestra/datasets/raw/main/parquet/products.parquet
- id: to_ion
type: io.kestra.plugin.serdes.parquet.ParquetToIon
from: "{{ outputs.http_download.uri }}"
"""
)
},
aliases = "io.kestra.plugin.serdes.parquet.ParquetReader"
)
public class ParquetToIon extends Task implements RunnableTask {
// URI of the source parquet file to convert. Declared dynamic, so the value is
// rendered (templated) via runContext.render(...) before it is resolved — see run().
@NotNull
@io.swagger.v3.oas.annotations.media.Schema(
title = "Source file URI"
)
@PluginProperty(dynamic = true)
private String from;
// One-time setup performed when this class is loaded by the plugin registry.
static {
// Presumably configures Parquet/Hadoop logging; exact behavior is defined in ParquetTools.
ParquetTools.handleLogger();
// We initialize Snappy in a static initializer block, so it is done when the plugin is
// loaded by the plugin registry, and not when it is executed by the Worker, to prevent
// issues with Java Security that prevent writing on /tmp.
ParquetTools.initSnappy();
}
public Output run(RunContext runContext) throws Exception {
// reader
URI from = new URI(runContext.render(this.from));
// New ion file
File tempFile = runContext.workingDir().createTempFile(".ion").toFile();
// Parquet file
File parquetFile = runContext.workingDir().createTempFile(".parquet").toFile();
try (OutputStream outputStream = new BufferedOutputStream(new FileOutputStream(parquetFile), FileSerde.BUFFER_SIZE)) {
IOUtils.copyLarge(runContext.storage().getFile(from), outputStream);
}
Path parquetHadoopPath = new Path(parquetFile.getPath());
HadoopInputFile parquetOutputFile = HadoopInputFile.fromPath(parquetHadoopPath, new Configuration());
AvroParquetReader.Builder parquetReaderBuilder = AvroParquetReader.builder(parquetOutputFile)
.disableCompatibility()
.withDataModel(AvroConverter.genericData());
try (
org.apache.parquet.hadoop.ParquetReader parquetReader = parquetReaderBuilder.build();
Writer output = new BufferedWriter(new FileWriter(tempFile), FileSerde.BUFFER_SIZE)
) {
Flux