io.kestra.plugin.serdes.excel.ExcelToIon Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of plugin-serdes Show documentation
Show all versions of plugin-serdes Show documentation
Serialize and deserialize data formats in Kestra workflows.
The newest version!
package io.kestra.plugin.serdes.excel;
import com.github.pjfanning.xlsx.StreamingReader;
import io.kestra.core.models.annotations.Example;
import io.kestra.core.models.annotations.Plugin;
import io.kestra.core.models.annotations.PluginProperty;
import io.kestra.core.models.executions.metrics.Counter;
import io.kestra.core.models.tasks.RunnableTask;
import io.kestra.core.models.tasks.Task;
import io.kestra.core.runners.RunContext;
import io.kestra.core.serializers.FileSerde;
import io.kestra.core.utils.ListUtils;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.*;
import lombok.experimental.SuperBuilder;
import org.apache.poi.ss.usermodel.*;
import jakarta.validation.constraints.NotBlank;
import jakarta.validation.constraints.PositiveOrZero;
import reactor.core.publisher.Flux;
import java.io.*;
import java.net.URI;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import static io.kestra.core.utils.Rethrow.throwFunction;
@SuperBuilder
@ToString
@EqualsAndHashCode
@Getter
@NoArgsConstructor
@Schema(
title = "Read data from Excel into a row-wise ION-serialized format"
)
@Plugin(
examples = {
@Example(
full = true,
title = "Convert an Excel file to the Ion format.",
code = """
id: excel_to_ion
namespace: company.team
tasks:
- id: http_download
type: io.kestra.plugin.core.http.Download
uri: https://huggingface.co/datasets/kestra/datasets/raw/main/excel/Products.xlsx
- id: to_ion
type: io.kestra.plugin.serdes.excel.ExcelToIon
from: "{{ outputs.http_download.uri }}"
"""
)
}
)
public class ExcelToIon extends Task implements RunnableTask {
@NotBlank
@Schema(
title = "Source file URI"
)
@PluginProperty(dynamic = true)
private String from;
@Schema(
title = "The sheets title to be included"
)
@PluginProperty
private List sheetsTitle;
@Schema(
title = "The name of a supported character set"
)
@PluginProperty
@Builder.Default
private String charset = "UTF-8";
@Schema(
title = "Determines how values should be rendered in the output",
description = "Possible values: FORMATTED_VALUE, UNFORMATTED_VALUE, FORMULA"
)
@PluginProperty
@Builder.Default
private ValueRender valueRender = ValueRender.UNFORMATTED_VALUE;
@Schema(
title = "How dates, times, and durations should be represented in the output",
description = "Possible values: SERIAL_NUMBER, FORMATTED_STRING"
)
@Builder.Default
@PluginProperty
private DateTimeRender dateTimeRender = DateTimeRender.UNFORMATTED_VALUE;
@Schema(
title = "Whether the first row should be treated as the header"
)
@PluginProperty
@Builder.Default
private boolean header = true;
@Schema(
title = "Specifies if empty rows should be skipped"
)
@PluginProperty
@Builder.Default
private boolean skipEmptyRows = false;
@Schema(
title = "Number of lines to skip at the start of the file. Useful if a table has a title and explanation in the first few rows"
)
@PositiveOrZero
@PluginProperty
@Builder.Default
private int skipRows = 0;
@Override
public Output run(RunContext runContext) throws Exception {
URI from = new URI(runContext.render(this.from));
try(Workbook workbook = StreamingReader.builder().rowCacheSize(1).open(runContext.storage().getFile(from))) {
List sheets = new ArrayList<>();
workbook.sheetIterator().forEachRemaining(sheets::add);
List includedSheetsTitle = ListUtils.emptyOnNull(this.sheetsTitle)
.stream()
.map(throwFunction(runContext::render))
.toList();
List selectedSheets = sheets.stream()
.filter(sheet -> includedSheetsTitle.isEmpty() || includedSheetsTitle.contains(sheet.getSheetName()))
.toList();
runContext.metric(Counter.of("sheets", sheets.size()));
// read values
Map uris = new HashMap<>();
AtomicInteger rowsCount = new AtomicInteger();
AtomicInteger skipped = new AtomicInteger();
for (Sheet sheet : selectedSheets) {
List