All downloads are free. The search and download functionality uses the official Maven repository.

io.trino.plugin.hive.avro.AvroPageSourceFactory Maven / Gradle / Ivy

There is a newer version: 468
Show newest version
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.plugin.hive.avro;

import com.google.inject.Inject;
import io.airlift.slice.Slices;
import io.airlift.units.DataSize;
import io.trino.filesystem.Location;
import io.trino.filesystem.TrinoFileSystem;
import io.trino.filesystem.TrinoFileSystemFactory;
import io.trino.filesystem.TrinoInputFile;
import io.trino.filesystem.TrinoInputStream;
import io.trino.filesystem.memory.MemoryInputFile;
import io.trino.hive.formats.avro.AvroTypeException;
import io.trino.hive.formats.avro.HiveAvroTypeBlockHandler;
import io.trino.plugin.hive.AcidInfo;
import io.trino.plugin.hive.HiveColumnHandle;
import io.trino.plugin.hive.HivePageSourceFactory;
import io.trino.plugin.hive.HiveTimestampPrecision;
import io.trino.plugin.hive.ReaderColumns;
import io.trino.plugin.hive.ReaderPageSource;
import io.trino.plugin.hive.acid.AcidTransaction;
import io.trino.spi.TrinoException;
import io.trino.spi.connector.ConnectorSession;
import io.trino.spi.connector.EmptyPageSource;
import io.trino.spi.predicate.TupleDomain;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.util.internal.Accessor;

import java.io.IOException;
import java.util.AbstractCollection;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.Set;
import java.util.UUID;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Verify.verify;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.trino.hive.formats.HiveClassNames.AVRO_SERDE_CLASS;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT;
import static io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns;
import static io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision;
import static io.trino.plugin.hive.ReaderPageSource.noProjectionAdaptation;
import static io.trino.plugin.hive.avro.AvroHiveFileUtils.getCanonicalToGivenFieldName;
import static io.trino.plugin.hive.avro.AvroHiveFileUtils.wrapInUnionWithNull;
import static io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName;
import static io.trino.plugin.hive.util.HiveUtil.splitError;
import static io.trino.spi.type.TimestampType.createTimestampType;
import static java.lang.Math.min;
import static java.util.Objects.requireNonNull;

public class AvroPageSourceFactory
        implements HivePageSourceFactory
{
    private static final DataSize BUFFER_SIZE = DataSize.of(8, DataSize.Unit.MEGABYTE);

    private final TrinoFileSystemFactory trinoFileSystemFactory;

    @Inject
    public AvroPageSourceFactory(TrinoFileSystemFactory trinoFileSystemFactory)
    {
        this.trinoFileSystemFactory = requireNonNull(trinoFileSystemFactory, "trinoFileSystemFactory is null");
    }

    @Override
    public Optional createPageSource(
            ConnectorSession session,
            Location path,
            long start,
            long length,
            long estimatedFileSize,
            long fileModifiedTime,
            Map schema,
            List columns,
            TupleDomain effectivePredicate,
            Optional acidInfo,
            OptionalInt bucketNumber,
            boolean originalFile,
            AcidTransaction transaction)
    {
        if (!AVRO_SERDE_CLASS.equals(getDeserializerClassName(schema))) {
            return Optional.empty();
        }
        checkArgument(acidInfo.isEmpty(), "Acid is not supported");

        List projectedReaderColumns = columns;
        Optional readerProjections = projectBaseColumns(columns);

        if (readerProjections.isPresent()) {
            projectedReaderColumns = readerProjections.get().get().stream()
                    .map(HiveColumnHandle.class::cast)
                    .collect(toImmutableList());
        }

        TrinoFileSystem trinoFileSystem = trinoFileSystemFactory.create(session);
        TrinoInputFile inputFile = trinoFileSystem.newInputFile(path);
        HiveTimestampPrecision hiveTimestampPrecision = getTimestampPrecision(session);

        Schema tableSchema;
        try {
            tableSchema = AvroHiveFileUtils.determineSchemaOrThrowException(trinoFileSystem, schema);
        }
        catch (IOException | org.apache.avro.AvroTypeException e) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, "Unable to load or parse schema", e);
        }

        try {
            length = min(inputFile.length() - start, length);
            if (estimatedFileSize < BUFFER_SIZE.toBytes()) {
                try (TrinoInputStream input = inputFile.newStream()) {
                    byte[] data = input.readAllBytes();
                    inputFile = new MemoryInputFile(path, Slices.wrappedBuffer(data));
                }
            }
        }
        catch (TrinoException e) {
            throw e;
        }
        catch (Exception e) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
        }

        // Split may be empty now that the correct file size is known
        if (length <= 0) {
            return Optional.of(noProjectionAdaptation(new EmptyPageSource()));
        }

        Schema maskedSchema;
        try {
            maskedSchema = maskColumnsFromTableSchema(projectedReaderColumns, tableSchema);
        }
        catch (org.apache.avro.AvroTypeException e) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, "Avro type resolution error when initializing split from %s".formatted(path), e);
        }

        if (maskedSchema.getFields().isEmpty()) {
            // no non-masked columns to select from partition schema
            // hack to return null rows with same total count as underlying data file
            // will error if UUID is same name as base column for underlying storage table but should never
            // return false data. If file data has f+uuid column in schema then resolution of read null from not null will fail.
            SchemaBuilder.FieldAssembler nullSchema = SchemaBuilder.record("null_only").fields();
            for (int i = 0; i < Math.max(projectedReaderColumns.size(), 1); i++) {
                String notAColumnName = null;
                while (Objects.isNull(notAColumnName) || Objects.nonNull(tableSchema.getField(notAColumnName))) {
                    notAColumnName = "f" + UUID.randomUUID().toString().replace('-', '_');
                }
                nullSchema = nullSchema.name(notAColumnName).type(Schema.create(Schema.Type.NULL)).withDefault(null);
            }
            try {
                return Optional.of(noProjectionAdaptation(new AvroPageSource(inputFile, nullSchema.endRecord(), new HiveAvroTypeBlockHandler(createTimestampType(hiveTimestampPrecision.getPrecision())), start, length)));
            }
            catch (IOException e) {
                throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
            }
            catch (AvroTypeException e) {
                throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, "Avro type resolution error when initializing split from %s".formatted(path), e);
            }
        }

        try {
            return Optional.of(new ReaderPageSource(new AvroPageSource(inputFile, maskedSchema, new HiveAvroTypeBlockHandler(createTimestampType(hiveTimestampPrecision.getPrecision())), start, length), readerProjections));
        }
        catch (IOException e) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        catch (AvroTypeException e) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, "Avro type resolution error when initializing split from %s".formatted(path), e);
        }
    }

    private Schema maskColumnsFromTableSchema(List columns, Schema tableSchema)
    {
        verify(tableSchema.getType() == Schema.Type.RECORD);
        Set maskedColumns = columns.stream().map(HiveColumnHandle::getBaseColumnName).collect(LinkedHashSet::new, HashSet::add, AbstractCollection::addAll);

        SchemaBuilder.FieldAssembler maskedSchema = SchemaBuilder.builder()
                .record(tableSchema.getName())
                .namespace(tableSchema.getNamespace())
                .fields();
        Map lowerToGivenName = getCanonicalToGivenFieldName(tableSchema);

        for (String columnName : maskedColumns) {
            Schema.Field field = tableSchema.getField(columnName);
            if (Objects.isNull(field)) {
                if (!lowerToGivenName.containsKey(columnName)) {
                    throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, "Unable to find column %s in table Avro schema %s".formatted(columnName, tableSchema.getFullName()));
                }
                field = tableSchema.getField(lowerToGivenName.get(columnName));
            }
            if (field.hasDefaultValue()) {
                try {
                    Object defaultObj = Accessor.defaultValue(field);
                    maskedSchema = maskedSchema
                            .name(field.name())
                            .aliases(field.aliases().toArray(String[]::new))
                            .doc(field.doc())
                            .type(field.schema())
                            .withDefault(defaultObj);
                }
                catch (org.apache.avro.AvroTypeException e) {
                    // in order to maintain backwards compatibility invalid defaults are mapped to null
                    // behavior defined by io.trino.tests.product.hive.TestAvroSchemaStrictness.testInvalidUnionDefaults
                    // solution is to make the field nullable and default-able to null. Any place default would be used, null will be
                    if (e.getMessage().contains("Invalid default")) {
                        maskedSchema = maskedSchema
                                .name(field.name())
                                .aliases(field.aliases().toArray(String[]::new))
                                .doc(field.doc())
                                .type(wrapInUnionWithNull(field.schema()))
                                .withDefault(null);
                    }
                    else {
                        throw e;
                    }
                }
            }
            else {
                maskedSchema = maskedSchema
                        .name(field.name())
                        .aliases(field.aliases().toArray(String[]::new))
                        .doc(field.doc())
                        .type(field.schema())
                        .noDefault();
            }
        }
        return maskedSchema.endRecord();
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy