/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.deltalake;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Streams;
import com.google.common.primitives.Ints;
import com.google.common.util.concurrent.Futures;
import io.airlift.concurrent.MoreFutures;
import io.airlift.json.JsonCodec;
import io.airlift.log.Logger;
import io.airlift.slice.Slice;
import io.trino.filesystem.Location;
import io.trino.filesystem.TrinoFileSystem;
import io.trino.filesystem.TrinoFileSystemFactory;
import io.trino.parquet.writer.ParquetWriterOptions;
import io.trino.plugin.deltalake.DataFileInfo.DataFileType;
import io.trino.plugin.deltalake.util.DeltaLakeWriteUtils;
import io.trino.plugin.hive.HivePartitionKey;
import io.trino.plugin.hive.parquet.ParquetFileWriter;
import io.trino.plugin.hive.util.HiveUtil;
import io.trino.spi.Page;
import io.trino.spi.PageIndexer;
import io.trino.spi.PageIndexerFactory;
import io.trino.spi.TrinoException;
import io.trino.spi.block.Block;
import io.trino.spi.connector.ConnectorPageSink;
import io.trino.spi.connector.ConnectorSession;
import io.trino.spi.type.Type;
import io.trino.spi.type.TypeOperators;
import org.apache.parquet.format.CompressionCodec;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import static com.google.common.base.Verify.verify;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.airlift.slice.Slices.wrappedBuffer;
import static io.trino.filesystem.Locations.appendPath;
import static io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_BAD_WRITE;
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.getCompressionCodec;
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetWriterBlockSize;
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetWriterPageSize;
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetWriterPageValueCount;
import static io.trino.plugin.deltalake.DeltaLakeTypes.toParquetType;
import static io.trino.plugin.hive.util.HiveUtil.escapePathName;
import static java.lang.Math.min;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static java.util.UUID.randomUUID;
import static java.util.stream.Collectors.toList;
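/**
 * Base page sink for Delta Lake writes. Incoming pages are routed to one {@link DeltaLakeWriter}
 * per combination of partition values, each backed by a Parquet file under the output directory,
 * and the resulting {@link DataFileInfo} entries are serialized and returned from {@link #finish()}.
 */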
public abstract class AbstractDeltaLakePageSink
implements ConnectorPageSink
{
private static final Logger LOG = Logger.get(AbstractDeltaLakePageSink.class);
private static final int MAX_PAGE_POSITIONS = 4096;
private final TypeOperators typeOperators;
private final List<DeltaLakeColumnHandle> dataColumnHandles;
private final int[] dataColumnInputIndex;
private final List<String> dataColumnNames;
private final List<Type> dataColumnTypes;
private final int[] partitionColumnsInputIndex;
private final List<String> originalPartitionColumnNames;
private final List<Type> partitionColumnTypes;
private final PageIndexer pageIndexer;
private final TrinoFileSystem fileSystem;
private final int maxOpenWriters;
protected final JsonCodec<DataFileInfo> dataFileInfoCodec;
private final List<DeltaLakeWriter> writers = new ArrayList<>();
private final Location tableLocation;
protected final Location outputPathDirectory;
private final ConnectorSession session;
private final DeltaLakeWriterStats stats;
private final String trinoVersion;
private final long targetMaxFileSize;
private final long idleWriterMinFileSize;
private long writtenBytes;
private long memoryUsage;
private final List<Closeable> closedWriterRollbackActions = new ArrayList<>();
private final List<Boolean> activeWriters = new ArrayList<>();
protected final ImmutableList.Builder<DataFileInfo> dataFileInfos = ImmutableList.builder();
private final DeltaLakeParquetSchemaMapping parquetSchemaMapping;
private long currentOpenWriters;
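/**
 * Splits the input columns into partition keys, regular data columns and synthesized columns
 * (delegated to {@link #processSynthesizedColumn}), recording the input index, name and type
 * of each group for later page projection.
 */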
public AbstractDeltaLakePageSink(
TypeOperators typeOperators,
List<DeltaLakeColumnHandle> inputColumns,
List<String> originalPartitionColumns,
PageIndexerFactory pageIndexerFactory,
TrinoFileSystemFactory fileSystemFactory,
int maxOpenWriters,
JsonCodec<DataFileInfo> dataFileInfoCodec,
Location tableLocation,
Location outputPathDirectory,
ConnectorSession session,
DeltaLakeWriterStats stats,
String trinoVersion,
DeltaLakeParquetSchemaMapping parquetSchemaMapping)
{
this.typeOperators = requireNonNull(typeOperators, "typeOperators is null");
requireNonNull(inputColumns, "inputColumns is null");
requireNonNull(pageIndexerFactory, "pageIndexerFactory is null");
this.fileSystem = requireNonNull(fileSystemFactory, "fileSystemFactory is null").create(session);
this.maxOpenWriters = maxOpenWriters;
this.dataFileInfoCodec = requireNonNull(dataFileInfoCodec, "dataFileInfoCodec is null");
this.parquetSchemaMapping = requireNonNull(parquetSchemaMapping, "parquetSchemaMapping is null");
// determine the input index of the partition columns and data columns
int[] partitionColumnInputIndex = new int[originalPartitionColumns.size()];
ImmutableList.Builder<Integer> dataColumnsInputIndex = ImmutableList.builder();
Type[] partitionColumnTypes = new Type[originalPartitionColumns.size()];
String[] originalPartitionColumnNames = new String[originalPartitionColumns.size()];
ImmutableList.Builder<DeltaLakeColumnHandle> dataColumnHandles = ImmutableList.builder();
ImmutableList.Builder<Type> dataColumnTypes = ImmutableList.builder();
ImmutableList.Builder<String> dataColumnNames = ImmutableList.builder();
Map<String, Integer> toOriginalPartitionPositions = new HashMap<>();
int partitionColumnPosition = 0;
for (String partitionColumnName : originalPartitionColumns) {
toOriginalPartitionPositions.put(partitionColumnName, partitionColumnPosition++);
}
for (int inputIndex = 0; inputIndex < inputColumns.size(); inputIndex++) {
DeltaLakeColumnHandle column = inputColumns.get(inputIndex);
switch (column.getColumnType()) {
case PARTITION_KEY:
int partitionPosition = toOriginalPartitionPositions.get(column.getColumnName());
partitionColumnInputIndex[partitionPosition] = inputIndex;
originalPartitionColumnNames[partitionPosition] = column.getColumnName();
partitionColumnTypes[partitionPosition] = column.getBaseType();
break;
case REGULAR:
verify(column.isBaseColumn(), "Unexpected dereference: %s", column);
dataColumnHandles.add(column);
dataColumnsInputIndex.add(inputIndex);
dataColumnNames.add(column.getBasePhysicalColumnName());
dataColumnTypes.add(column.getBasePhysicalType());
break;
case SYNTHESIZED:
processSynthesizedColumn(column);
break;
default:
throw new IllegalStateException("Unexpected column type: " + column.getColumnType());
}
}
this.partitionColumnsInputIndex = partitionColumnInputIndex;
this.dataColumnInputIndex = Ints.toArray(dataColumnsInputIndex.build());
this.originalPartitionColumnNames = ImmutableList.copyOf(originalPartitionColumnNames);
this.dataColumnHandles = dataColumnHandles.build();
this.partitionColumnTypes = ImmutableList.copyOf(partitionColumnTypes);
this.dataColumnNames = dataColumnNames.build();
this.dataColumnTypes = dataColumnTypes.build();
this.pageIndexer = pageIndexerFactory.createPageIndexer(this.partitionColumnTypes);
this.tableLocation = tableLocation;
this.outputPathDirectory = outputPathDirectory;
this.session = requireNonNull(session, "session is null");
this.stats = stats;
this.trinoVersion = requireNonNull(trinoVersion, "trinoVersion is null");
this.targetMaxFileSize = DeltaLakeSessionProperties.getTargetMaxFileSize(session);
this.idleWriterMinFileSize = DeltaLakeSessionProperties.getIdleWriterMinFileSize(session);
}
protected abstract void processSynthesizedColumn(DeltaLakeColumnHandle column);
protected abstract String getPathPrefix();
protected abstract DataFileType getDataFileType();
@Override
public long getCompletedBytes()
{
return writtenBytes;
}
@Override
public long getMemoryUsage()
{
return memoryUsage;
}
@Override
public long getValidationCpuNanos()
{
return 0;
}
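/**
 * Closes any writers that are still open and returns the collected {@link DataFileInfo} entries,
 * each serialized to JSON and wrapped in a {@link Slice}, as the fragments of this page sink.
 */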
@Override
public CompletableFuture<Collection<Slice>> finish()
{
for (int writerIndex = 0; writerIndex < writers.size(); writerIndex++) {
closeWriter(writerIndex);
}
writers.clear();
List<DataFileInfo> dataFilesInfo = dataFileInfos.build();
Collection<Slice> result = dataFilesInfo.stream()
.map(dataFileInfo -> wrappedBuffer(dataFileInfoCodec.toJsonBytes(dataFileInfo)))
.collect(toImmutableList());
return MoreFutures.toCompletableFuture(Futures.immediateFuture(result));
}
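/**
 * Rolls back both the writers that are still open and the files already committed by
 * {@link #closeWriter}, collecting any failures as suppressed exceptions.
 */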
@Override
public void abort()
{
List<Closeable> rollbackActions = Streams.concat(
writers.stream()
// writers can contain nulls if an exception is thrown while getWriterIndexes expands the writer list
.filter(Objects::nonNull)
.map(writer -> writer::rollback),
closedWriterRollbackActions.stream())
.collect(toImmutableList());
RuntimeException rollbackException = null;
for (Closeable rollbackAction : rollbackActions) {
try {
rollbackAction.close();
}
catch (Throwable t) {
if (rollbackException == null) {
rollbackException = new TrinoException(DELTA_LAKE_BAD_WRITE, "Error rolling back write to Delta Lake");
}
rollbackException.addSuppressed(t);
}
}
if (rollbackException != null) {
throw rollbackException;
}
}
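/**
 * Splits large pages into chunks of at most {@link #MAX_PAGE_POSITIONS} positions and writes
 * each chunk via {@link #writePage}.
 */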
@Override
public CompletableFuture<?> appendPage(Page page)
{
int writeOffset = 0;
while (writeOffset < page.getPositionCount()) {
Page chunk = page.getRegion(writeOffset, min(page.getPositionCount() - writeOffset, MAX_PAGE_POSITIONS));
writeOffset += chunk.getPositionCount();
writePage(chunk);
}
return NOT_BLOCKED;
}
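/**
 * Groups the positions of the page by target writer, projects the page down to the data columns
 * and appends each group to its writer, updating the written-byte and memory accounting and
 * marking the writer as active.
 */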
private void writePage(Page page)
{
int[] writerIndexes = getWriterIndexes(page);
// position count for each writer
int[] sizes = new int[writers.size()];
for (int index : writerIndexes) {
sizes[index]++;
}
// record which positions are used by which writer
int[][] writerPositions = new int[writers.size()][];
int[] counts = new int[writers.size()];
for (int position = 0; position < page.getPositionCount(); position++) {
int index = writerIndexes[position];
int count = counts[index];
if (count == 0) {
writerPositions[index] = new int[sizes[index]];
}
writerPositions[index][count] = position;
counts[index] = count + 1;
}
// invoke the writers
Page dataPage = getDataPage(page);
for (int index = 0; index < writerPositions.length; index++) {
int[] positions = writerPositions[index];
if (positions == null) {
continue;
}
// If write is partitioned across multiple writers, filter page using dictionary blocks
Page pageForWriter = dataPage;
if (positions.length != dataPage.getPositionCount()) {
verify(positions.length == counts[index]);
pageForWriter = pageForWriter.getPositions(positions, 0, positions.length);
}
DeltaLakeWriter writer = writers.get(index);
verify(writer != null, "Expected writer at index %s", index);
long currentWritten = writer.getWrittenBytes();
long currentMemory = writer.getMemoryUsage();
writer.appendRows(pageForWriter);
writtenBytes += writer.getWrittenBytes() - currentWritten;
memoryUsage += writer.getMemoryUsage() - currentMemory;
// Mark this writer as active (i.e. not idle)
activeWriters.set(index, true);
}
}
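/**
 * Closes writers that have not been written to since the last invocation and whose current file
 * has already grown past the idle-writer minimum file size; writers kept open have their active
 * flag reset so the next invocation can observe new activity.
 */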
@Override
public void closeIdleWriters()
{
for (int writerIndex = 0; writerIndex < writers.size(); writerIndex++) {
DeltaLakeWriter writer = writers.get(writerIndex);
if (activeWriters.get(writerIndex) || writer == null || writer.getWrittenBytes() <= idleWriterMinFileSize) {
activeWriters.set(writerIndex, false);
continue;
}
LOG.debug("Closing writer %s with %s bytes written", writerIndex, writer.getWrittenBytes());
closeWriter(writerIndex);
}
}
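/**
 * Maps each position of the page to a writer index using the page indexer over the partition
 * columns, creating a writer (and its partitioned file path) for any index that has none yet and
 * rolling over writers whose current file exceeds the target maximum file size. Throws if the
 * number of open writers exceeds the configured limit.
 */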
private int[] getWriterIndexes(Page page)
{
Page partitionColumns = extractColumns(page, partitionColumnsInputIndex);
int[] writerIndexes = pageIndexer.indexPage(partitionColumns);
// expand writers list to new size
while (writers.size() <= pageIndexer.getMaxIndex()) {
writers.add(null);
activeWriters.add(false);
}
// create missing writers
for (int position = 0; position < page.getPositionCount(); position++) {
int writerIndex = writerIndexes[position];
DeltaLakeWriter deltaLakeWriter = writers.get(writerIndex);
if (deltaLakeWriter != null) {
if (deltaLakeWriter.getWrittenBytes() <= targetMaxFileSize) {
continue;
}
closeWriter(writerIndex);
}
Location filePath = outputPathDirectory;
List<String> partitionValues = createPartitionValues(partitionColumnTypes, partitionColumns, position);
Optional<String> partitionName = Optional.empty();
if (!originalPartitionColumnNames.isEmpty()) {
String partName = makePartName(originalPartitionColumnNames, partitionValues);
filePath = filePath.appendPath(partName);
partitionName = Optional.of(partName);
}
String fileName = session.getQueryId() + "_" + randomUUID();
filePath = filePath.appendPath(fileName);
ParquetFileWriter fileWriter = createParquetFileWriter(filePath);
DeltaLakeWriter writer = new DeltaLakeWriter(
fileWriter,
tableLocation,
getRelativeFilePath(partitionName, fileName),
partitionValues,
stats,
dataColumnHandles,
getDataFileType());
writers.set(writerIndex, writer);
currentOpenWriters++;
memoryUsage += writer.getMemoryUsage();
}
verify(writers.size() == pageIndexer.getMaxIndex() + 1);
if (currentOpenWriters > maxOpenWriters) {
throw new TrinoException(DELTA_LAKE_BAD_WRITE, format("Exceeded limit of %s open writers for partitions", maxOpenWriters));
}
return writerIndexes;
}
private String getRelativeFilePath(Optional<String> partitionName, String fileName)
{
return getPathPrefix() + partitionName.map(partition -> appendPath(partition, fileName)).orElse(fileName);
}
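/**
 * Commits the writer at the given index, keeping its rollback action in case of a later abort,
 * updates the byte and memory accounting, and records the {@link DataFileInfo} of the written file.
 */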
protected void closeWriter(int writerIndex)
{
DeltaLakeWriter writer = writers.get(writerIndex);
if (writer == null) {
return;
}
long currentWritten = writer.getWrittenBytes();
long currentMemory = writer.getMemoryUsage();
closedWriterRollbackActions.add(writer.commit());
writtenBytes += writer.getWrittenBytes() - currentWritten;
memoryUsage -= currentMemory;
writers.set(writerIndex, null);
currentOpenWriters--;
try {
DataFileInfo dataFileInfo = writer.getDataFileInfo();
dataFileInfos.add(dataFileInfo);
}
catch (IOException e) {
LOG.warn("exception '%s' while finishing write on %s", e, writer);
throw new TrinoException(DELTA_LAKE_BAD_WRITE, "Error committing Parquet file to Delta Lake", e);
}
}
/**
* Copy of {@link HiveUtil#makePartName} modified to preserve case of partition columns.
*/
private static String makePartName(List<String> partitionColumns, List<String> partitionValues)
{
StringBuilder name = new StringBuilder();
for (int i = 0; i < partitionColumns.size(); ++i) {
if (i > 0) {
name.append("/");
}
name.append(escapePathName(partitionColumns.get(i)));
name.append('=');
name.append(escapePathName(partitionValues.get(i)));
}
return name.toString();
}
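/**
 * Computes the partition values for the given position, mapping Hive's default dynamic-partition
 * marker to {@code null}.
 */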
public static List<String> createPartitionValues(List<Type> partitionColumnTypes, Page partitionColumns, int position)
{
return DeltaLakeWriteUtils.createPartitionValues(partitionColumnTypes, partitionColumns, position).stream()
.map(value -> value.equals(HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION) ? null : value)
.collect(toList());
}
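/**
 * Creates a {@link ParquetFileWriter} for the given location, configured from the session's
 * Parquet writer properties and compression codec, with an identity column mapping and a
 * rollback action that deletes the file.
 */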
private ParquetFileWriter createParquetFileWriter(Location path)
{
ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
.setMaxBlockSize(getParquetWriterBlockSize(session))
.setMaxPageSize(getParquetWriterPageSize(session))
.setMaxPageValueCount(getParquetWriterPageValueCount(session))
.build();
CompressionCodec compressionCodec = getCompressionCodec(session).getParquetCompressionCodec()
.orElseThrow(); // validated on the session property level
try {
Closeable rollbackAction = () -> fileSystem.deleteFile(path);
List<Type> parquetTypes = dataColumnTypes.stream()
.map(type -> toParquetType(typeOperators, type))
.collect(toImmutableList());
// we use identity column mapping; the input page already contains only data columns,
// see getDataPage()
int[] identityMapping = new int[dataColumnTypes.size()];
for (int i = 0; i < identityMapping.length; ++i) {
identityMapping[i] = i;
}
return new ParquetFileWriter(
fileSystem.newOutputFile(path),
rollbackAction,
parquetTypes,
dataColumnNames,
parquetSchemaMapping.messageType(),
parquetSchemaMapping.primitiveTypes(),
parquetWriterOptions,
identityMapping,
compressionCodec,
trinoVersion,
Optional.empty(),
Optional.empty());
}
catch (IOException e) {
throw new TrinoException(DELTA_LAKE_BAD_WRITE, "Error creating Parquet file", e);
}
}
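/**
 * Projects the page down to the data columns, dropping partition and synthesized columns.
 */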
private Page getDataPage(Page page)
{
Block[] blocks = new Block[dataColumnInputIndex.length];
for (int i = 0; i < dataColumnInputIndex.length; i++) {
int dataColumn = dataColumnInputIndex[i];
blocks[i] = page.getBlock(dataColumn);
}
return new Page(page.getPositionCount(), blocks);
}
private static Page extractColumns(Page page, int[] columns)
{
Block[] blocks = new Block[columns.length];
for (int i = 0; i < columns.length; i++) {
int dataColumn = columns[i];
blocks[i] = page.getBlock(dataColumn);
}
return new Page(page.getPositionCount(), blocks);
}
}