All downloads are FREE. Search and download functionalities use the official Maven repository.

io.delta.flink.source.internal.builder.RowDataFormatBuilder Maven / Gradle / Ivy

There is a newer version: 3.2.1
Show newest version
package io.delta.flink.source.internal.builder;

import java.util.Collections;
import java.util.List;

import io.delta.flink.source.internal.DeltaPartitionFieldExtractor;
import io.delta.flink.source.internal.DeltaSourceOptions;
import io.delta.flink.source.internal.state.DeltaSourceSplit;
import org.apache.flink.formats.parquet.ParquetColumnarRowInputFormat;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.runtime.typeutils.InternalTypeInfo;
import org.apache.flink.table.types.logical.RowType;
import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Builder for {@link RowData} implementation io {@link FormatBuilder}
 */
/**
 * Builder for the {@link RowData} implementation of {@link FormatBuilder}.
 *
 * <p>Produces a {@link RowDataFormat} that reads Parquet files backing a Delta table,
 * optionally extracting partition-column values from split metadata via
 * {@link DeltaPartitionFieldExtractor}.
 */
public class RowDataFormatBuilder implements FormatBuilder {

    private static final Logger LOG = LoggerFactory.getLogger(RowDataFormatBuilder.class);

    // -------------- Hardcoded Non Public Options ----------
    /**
     * Hardcoded option for {@link RowDataFormat} to treat timestamps as UTC timestamps.
     */
    private static final boolean PARQUET_UTC_TIMESTAMP = true;

    /**
     * Hardcoded option for {@link RowDataFormat} to use case-sensitive column name
     * processing for Parquet files.
     */
    private static final boolean PARQUET_CASE_SENSITIVE = true;
    // ------------------------------------------------------

    /**
     * The produced row type describing the columns read from the Delta table.
     */
    private final RowType rowType;

    /**
     * An instance of Hadoop configuration used to read Parquet files.
     */
    private final Configuration hadoopConfiguration;

    /**
     * A list of Delta table partition column names.
     * partitionColumns are validated in DeltaSourceBuilder.
     */
    private List<String> partitionColumns;

    /**
     * Number of rows read per Parquet batch; defaults to
     * {@link DeltaSourceOptions#PARQUET_BATCH_SIZE}.
     */
    private int batchSize = DeltaSourceOptions.PARQUET_BATCH_SIZE.defaultValue();

    RowDataFormatBuilder(RowType rowType, Configuration hadoopConfiguration) {
        this.rowType = rowType;
        this.hadoopConfiguration = hadoopConfiguration;
        this.partitionColumns = Collections.emptyList();
    }

    /**
     * Sets the Delta table partition columns to resolve from split metadata
     * rather than from the Parquet files themselves.
     *
     * @param partitionColumns names of the table's partition columns.
     * @return this builder instance for chaining.
     */
    @Override
    public RowDataFormatBuilder partitionColumns(List<String> partitionColumns) {
        this.partitionColumns = partitionColumns;
        return this;
    }

    /**
     * Sets the number of rows read per Parquet batch.
     *
     * @param size the batch size to use.
     * @return this builder instance for chaining.
     */
    @Override
    public FormatBuilder parquetBatchSize(int size) {
        this.batchSize = size;
        return this;
    }

    /**
     * Creates an instance of {@link RowDataFormat}.
     *
     * @throws io.delta.flink.internal.options.DeltaOptionValidationException if invalid
     *                arguments were passed to {@link RowDataFormatBuilder}. For example null
     *                arguments.
     */
    @Override
    public RowDataFormat build() {

        if (partitionColumns.isEmpty()) {
            LOG.info("Building format data for non-partitioned Delta table.");
            return buildFormatWithoutPartitions();
        } else {
            LOG.info("Building format data for partitioned Delta table.");
            return
                buildFormatWithPartitionColumns(
                    rowType,
                    hadoopConfiguration,
                    partitionColumns
                );
        }
    }

    /**
     * Builds a format for a non-partitioned table by delegating with an empty
     * partition-column list.
     */
    private RowDataFormat buildFormatWithoutPartitions() {
        return buildFormatWithPartitionColumns(
            rowType,
            hadoopConfiguration,
            Collections.emptyList()
        );
    }

    /**
     * Builds the underlying partitioned Parquet input format and wraps it in a
     * {@link RowDataFormat}.
     *
     * @param producedRowType  row type produced by the format.
     * @param hadoopConfig     Hadoop configuration used to read Parquet files.
     * @param partitionColumns partition columns whose values come from split metadata;
     *                         may be empty for non-partitioned tables.
     */
    private RowDataFormat buildFormatWithPartitionColumns(
            RowType producedRowType,
            Configuration hadoopConfig,
            List<String> partitionColumns) {

        ParquetColumnarRowInputFormat<DeltaSourceSplit> rowInputFormat =
            ParquetColumnarRowInputFormat.createPartitionedFormat(
                hadoopConfig,
                producedRowType,
                InternalTypeInfo.of(producedRowType),
                partitionColumns,
                new DeltaPartitionFieldExtractor<>(),
                batchSize,
                PARQUET_UTC_TIMESTAMP,
                PARQUET_CASE_SENSITIVE
            );

        return new RowDataFormat(rowInputFormat);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy