/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.plugin.hive;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Streams;
import com.google.common.io.CharStreams;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import io.airlift.units.Duration;
import io.trino.filesystem.FileEntry;
import io.trino.filesystem.FileIterator;
import io.trino.filesystem.Location;
import io.trino.filesystem.TrinoFileSystem;
import io.trino.filesystem.TrinoFileSystemFactory;
import io.trino.plugin.hive.HiveSplit.BucketConversion;
import io.trino.plugin.hive.HiveSplit.BucketValidation;
import io.trino.plugin.hive.fs.DirectoryLister;
import io.trino.plugin.hive.fs.HiveFileIterator;
import io.trino.plugin.hive.fs.TrinoFileStatus;
import io.trino.plugin.hive.metastore.Column;
import io.trino.plugin.hive.metastore.Partition;
import io.trino.plugin.hive.metastore.StorageFormat;
import io.trino.plugin.hive.metastore.Table;
import io.trino.plugin.hive.util.AcidTables.AcidState;
import io.trino.plugin.hive.util.AcidTables.ParsedDelta;
import io.trino.plugin.hive.util.HiveBucketing.BucketingVersion;
import io.trino.plugin.hive.util.HiveBucketing.HiveBucketFilter;
import io.trino.plugin.hive.util.InternalHiveSplitFactory;
import io.trino.plugin.hive.util.ResumableTask;
import io.trino.plugin.hive.util.ResumableTasks;
import io.trino.plugin.hive.util.ValidWriteIdList;
import io.trino.spi.TrinoException;
import io.trino.spi.connector.ColumnHandle;
import io.trino.spi.connector.ConnectorSession;
import io.trino.spi.connector.DynamicFilter;
import io.trino.spi.predicate.TupleDomain;
import io.trino.spi.type.TypeManager;

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.Executor;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.BooleanSupplier;
import java.util.function.Function;
import java.util.function.IntPredicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.util.concurrent.Futures.immediateVoidFuture;
import static com.google.common.util.concurrent.MoreExecutors.directExecutor;
import static io.airlift.concurrent.MoreFutures.addExceptionCallback;
import static io.airlift.concurrent.MoreFutures.toListenableFuture;
import static io.trino.hive.formats.HiveClassNames.SYMLINK_TEXT_INPUT_FORMAT_CLASS;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_EXCEEDED_PARTITION_LIMIT;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_FILE_NOT_FOUND;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_BUCKET_FILES;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_PARTITION_VALUE;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_UNKNOWN_ERROR;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT;
import static io.trino.plugin.hive.HiveSessionProperties.getMaxInitialSplitSize;
import static io.trino.plugin.hive.HiveSessionProperties.isForceLocalScheduling;
import static io.trino.plugin.hive.HiveSessionProperties.isValidateBucketing;
import static io.trino.plugin.hive.HiveStorageFormat.TEXTFILE;
import static io.trino.plugin.hive.HiveStorageFormat.getHiveStorageFormat;
import static io.trino.plugin.hive.fs.HiveFileIterator.NestedDirectoryPolicy.FAIL;
import static io.trino.plugin.hive.fs.HiveFileIterator.NestedDirectoryPolicy.IGNORED;
import static io.trino.plugin.hive.fs.HiveFileIterator.NestedDirectoryPolicy.RECURSE;
import static io.trino.plugin.hive.metastore.MetastoreUtil.getHiveSchema;
import static io.trino.plugin.hive.metastore.MetastoreUtil.getPartitionLocation;
import static io.trino.plugin.hive.util.AcidTables.getAcidState;
import static io.trino.plugin.hive.util.AcidTables.isFullAcidTable;
import static io.trino.plugin.hive.util.AcidTables.isTransactionalTable;
import static io.trino.plugin.hive.util.AcidTables.readAcidVersionFile;
import static io.trino.plugin.hive.util.HiveBucketing.getBucketingVersion;
import static io.trino.plugin.hive.util.HiveUtil.checkCondition;
import static io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName;
import static io.trino.plugin.hive.util.HiveUtil.getFooterCount;
import static io.trino.plugin.hive.util.HiveUtil.getHeaderCount;
import static io.trino.plugin.hive.util.HiveUtil.getInputFormatName;
import static io.trino.plugin.hive.util.HiveUtil.getPartitionKeyColumnHandles;
import static io.trino.plugin.hive.util.PartitionMatchSupplier.createPartitionMatchSupplier;
import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED;
import static java.lang.Integer.parseInt;
import static java.lang.Math.max;
import static java.lang.String.format;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Collections.max;
import static java.util.Objects.requireNonNull;
import static java.util.concurrent.TimeUnit.MILLISECONDS;

public class BackgroundHiveSplitLoader
        implements HiveSplitLoader
{
    // See https://github.com/apache/hive/commit/ffee30e6267e85f00a22767262192abb9681cfb7#diff-5fe26c36b4e029dcd344fc5d484e7347R165
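    // matches e.g. "bucket_00042" and "bucket_00042_3"; the optional trailing "_<n>" is the attempt id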
    private static final Pattern BUCKET_WITH_OPTIONAL_ATTEMPT_ID_PATTERN = Pattern.compile("bucket_(\\d+)(_\\d+)?$");

    private static final Iterable<Pattern> BUCKET_PATTERNS = ImmutableList.of(
            // legacy Presto naming pattern (current version matches Hive)
            Pattern.compile("\\d{8}_\\d{6}_\\d{5}_[a-z0-9]{5}_bucket-(\\d+)(?:[-_.].*)?"),
            // Hive naming pattern per `org.apache.hadoop.hive.ql.exec.Utilities#getBucketIdFromFile()`
            Pattern.compile("(\\d+)_\\d+.*"),
            // Hive ACID with optional direct insert attempt id
            BUCKET_WITH_OPTIONAL_ATTEMPT_ID_PATTERN);

    private static final ListenableFuture<Void> COMPLETED_FUTURE = immediateVoidFuture();

    private final Table table;
    private final TupleDomain<? extends ColumnHandle> compactEffectivePredicate;
    private final DynamicFilter dynamicFilter;
    private final long dynamicFilteringWaitTimeoutMillis;
    private final TypeManager typeManager;
    private final Optional<BucketSplitInfo> tableBucketInfo;
    private final DirectoryLister directoryLister;
    private final TrinoFileSystemFactory fileSystemFactory;
    private final int loaderConcurrency;
    private final boolean recursiveDirWalkerEnabled;
    private final boolean ignoreAbsentPartitions;
    private final Executor executor;
    private final ConnectorSession session;
    private final ConcurrentLazyQueue<HivePartitionMetadata> partitions;
    private final Deque<Iterator<InternalHiveSplit>> fileIterators = new ConcurrentLinkedDeque<>();
    private final Optional<ValidWriteIdList> validWriteIds;
    private final Optional<Long> maxSplitFileSize;
    private final int maxPartitions;

    // Purpose of this lock:
    // * Write lock: when you need a consistent view across partitions, fileIterators, and hiveSplitSource.
    // * Read lock: when you need to modify any of the above.
    //   Make sure the lock is held throughout the period during which they may not be consistent with each other.
    // Details:
    // * When write lock is acquired, except the holder, no one can do any of the following:
    // ** poll from (or check empty) partitions
    // ** poll from (or check empty) or push to fileIterators
    // ** push to hiveSplitSource
    // * When any of the above three operations is carried out, either a read lock or a write lock must be held.
    // * When a series of operations involving two or more of the above three operations are carried out, the lock
    //   must be continuously held throughout the series of operations.
    // Implications:
    // * if you hold a read lock but not a write lock, you can do any of the above three operations, but you may
    //   see a series of operations involving two or more of the operations carried out half way.
    private final ReadWriteLock taskExecutionLock = new ReentrantReadWriteLock();

    private HiveSplitSource hiveSplitSource;
    private Stopwatch stopwatch;
    private volatile boolean stopped;
    private final AtomicInteger activeLoaderCount = new AtomicInteger();
    private final AtomicInteger partitionCount = new AtomicInteger();

    public BackgroundHiveSplitLoader(
            Table table,
            Iterator<HivePartitionMetadata> partitions,
            TupleDomain<? extends ColumnHandle> compactEffectivePredicate,
            DynamicFilter dynamicFilter,
            Duration dynamicFilteringWaitTimeout,
            TypeManager typeManager,
            Optional<BucketSplitInfo> tableBucketInfo,
            ConnectorSession session,
            TrinoFileSystemFactory fileSystemFactory,
            DirectoryLister directoryLister,
            Executor executor,
            int loaderConcurrency,
            boolean recursiveDirWalkerEnabled,
            boolean ignoreAbsentPartitions,
            Optional<ValidWriteIdList> validWriteIds,
            Optional<Long> maxSplitFileSize,
            int maxPartitions)
    {
        this.table = table;
        this.compactEffectivePredicate = compactEffectivePredicate;
        this.dynamicFilter = dynamicFilter;
        this.dynamicFilteringWaitTimeoutMillis = dynamicFilteringWaitTimeout.toMillis();
        this.typeManager = typeManager;
        this.tableBucketInfo = tableBucketInfo;
        this.loaderConcurrency = loaderConcurrency;
        checkArgument(loaderConcurrency > 0, "loaderConcurrency must be > 0, found: %s", loaderConcurrency);
        this.session = session;
        this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null");
        this.directoryLister = directoryLister;
        this.recursiveDirWalkerEnabled = recursiveDirWalkerEnabled;
        this.ignoreAbsentPartitions = ignoreAbsentPartitions;
        requireNonNull(executor, "executor is null");
        // direct executor is not supported in this implementation due to locking specifics
        checkExecutorIsNotDirectExecutor(executor);
        this.executor = executor;
        this.partitions = new ConcurrentLazyQueue<>(partitions);
        this.validWriteIds = requireNonNull(validWriteIds, "validWriteIds is null");
        this.maxSplitFileSize = requireNonNull(maxSplitFileSize, "maxSplitFileSize is null");
        this.maxPartitions = maxPartitions;
    }

    @Override
    public void start(HiveSplitSource splitSource)
    {
        this.hiveSplitSource = splitSource;
        this.stopwatch = Stopwatch.createStarted();
        addLoaderIfNecessary();
    }

    private void addLoaderIfNecessary()
    {
        // opportunistic check to avoid incrementing indefinitely
        if (activeLoaderCount.get() >= loaderConcurrency) {
            return;
        }
        if (activeLoaderCount.incrementAndGet() > loaderConcurrency) {
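            // another caller won the race; the count may briefly exceed loaderConcurrency, but no extra loader task is started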
            return;
        }
        ListenableFuture<Void> future = ResumableTasks.submit(executor, new HiveSplitLoaderTask());
        // best effort; hiveSplitSource could be already completed
        addExceptionCallback(future, hiveSplitSource::fail);
    }

    @Override
    public void stop()
    {
        stopped = true;
    }

    private class HiveSplitLoaderTask
            implements ResumableTask
    {
        @Override
        public TaskStatus process()
        {
            while (true) {
                if (stopped) {
                    return TaskStatus.finished();
                }
                ListenableFuture<Void> future;
                // Block until one of below conditions is met:
                // 1. Completion of DynamicFilter
                // 2. Timeout after waiting for the configured time
                long timeLeft = dynamicFilteringWaitTimeoutMillis - stopwatch.elapsed(MILLISECONDS);
                if (timeLeft > 0 && dynamicFilter.isAwaitable()) {
                    future = asVoid(toListenableFuture(dynamicFilter.isBlocked()
                            // As isBlocked() returns unmodifiableFuture, we need to create new future for correct propagation of the timeout
                            .thenApply(Function.identity())
                            .orTimeout(timeLeft, MILLISECONDS)));
                    return TaskStatus.continueOn(future);
                }
                taskExecutionLock.readLock().lock();
                try {
                    future = loadSplits();
                }
                catch (Throwable e) {
                    if (e instanceof IOException) {
                        e = new TrinoException(HIVE_FILESYSTEM_ERROR, e);
                    }
                    else if (!(e instanceof TrinoException)) {
                        e = new TrinoException(HIVE_UNKNOWN_ERROR, e);
                    }
                    // Fail the split source before releasing the execution lock
                    // Otherwise, a race could occur where the split source is completed before we fail it.
                    hiveSplitSource.fail(e);
                    checkState(stopped);
                    return TaskStatus.finished();
                }
                finally {
                    taskExecutionLock.readLock().unlock();
                }
                invokeNoMoreSplitsIfNecessary();
                if (!future.isDone()) {
                    return TaskStatus.continueOn(future);
                }
            }
        }
    }

    private void invokeNoMoreSplitsIfNecessary()
    {
        taskExecutionLock.readLock().lock();
        try {
            // This is an opportunistic check to avoid getting the write lock unnecessarily
            if (!partitions.isEmpty() || !fileIterators.isEmpty()) {
                return;
            }
        }
        catch (Exception e) {
            hiveSplitSource.fail(e);
            checkState(stopped, "Task is not marked as stopped even though it failed");
            return;
        }
        finally {
            taskExecutionLock.readLock().unlock();
        }

        taskExecutionLock.writeLock().lock();
        try {
            // the write lock guarantees that no one is operating on the partitions, fileIterators, or hiveSplitSource, or half way through doing so.
            if (partitions.isEmpty() && fileIterators.isEmpty()) {
                // It is legal to call `noMoreSplits` multiple times or after `stop` was called.
                // Nothing bad will happen if `noMoreSplits` implementation calls methods that will try to obtain a read lock because the lock is re-entrant.
                hiveSplitSource.noMoreSplits();
            }
        }
        catch (Exception e) {
            hiveSplitSource.fail(e);
            checkState(stopped, "Task is not marked as stopped even though it failed");
        }
        finally {
            taskExecutionLock.writeLock().unlock();
        }
    }

    private static <T> ListenableFuture<Void> asVoid(ListenableFuture<T> future)
    {
        return Futures.transform(future, v -> null, directExecutor());
    }

    private ListenableFuture<Void> loadSplits()
            throws IOException
    {
        Iterator<InternalHiveSplit> splits = fileIterators.poll();
        if (splits == null) {
            HivePartitionMetadata partition = partitions.poll();
            if (partition == null) {
                return COMPLETED_FUTURE;
            }
            if (partitionCount.incrementAndGet() > maxPartitions) {
                throw new TrinoException(HIVE_EXCEEDED_PARTITION_LIMIT, format(
                        "Query over table '%s' can potentially read more than %s partitions",
                        partition.getHivePartition().getTableName(),
                        maxPartitions));
            }
            // this is racy and sometimes more loaders can be added than necessary, but this is fine
            if (!partitions.isEmpty()) {
                addLoaderIfNecessary();
            }
            return loadPartition(partition);
        }

        // this is racy and sometimes more loaders can be added than necessary, but this is fine
        if (!fileIterators.isEmpty()) {
            addLoaderIfNecessary();
        }

        while (splits.hasNext() && !stopped) {
            ListenableFuture<Void> future = hiveSplitSource.addToQueue(splits.next());
            if (!future.isDone()) {
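                // the split queue is full: park the iterator at the head of the deque so it is resumed first once the queue drains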
                fileIterators.addFirst(splits);
                return future;
            }
        }

        // No need to put the iterator back, since it's either empty or we've stopped
        return COMPLETED_FUTURE;
    }

    private ListenableFuture<Void> loadPartition(HivePartitionMetadata partition)
            throws IOException
    {
        HivePartition hivePartition = partition.getHivePartition();
        String partitionName = hivePartition.getPartitionId();
        Map<String, String> schema = partition.getPartition()
                .map(value -> getHiveSchema(value, table))
                .orElseGet(() -> getHiveSchema(table));
        List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
        TupleDomain<HiveColumnHandle> effectivePredicate = compactEffectivePredicate.transformKeys(HiveColumnHandle.class::cast);

        BooleanSupplier partitionMatchSupplier = createPartitionMatchSupplier(dynamicFilter, hivePartition, getPartitionKeyColumnHandles(table, typeManager));
        if (!partitionMatchSupplier.getAsBoolean()) {
            // Avoid listing files and creating splits from a partition if it has been pruned due to dynamic filters
            return COMPLETED_FUTURE;
        }

        Location location = Location.of(getPartitionLocation(table, partition.getPartition()));

        // Files with skipped header / footer lines are not splittable, except for the special case skip.header.line.count=1
        boolean splittable = getFooterCount(schema) == 0 && getHeaderCount(schema) <= 1;

        if (SYMLINK_TEXT_INPUT_FORMAT_CLASS.equals(getInputFormatName(schema).orElse(null))) {
            if (tableBucketInfo.isPresent()) {
                throw new TrinoException(NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
            }
            HiveStorageFormat targetStorageFormat = getSymlinkStorageFormat(getDeserializerClassName(schema));
            ListMultimap<Location, Location> targets = getTargetLocationsByParentFromSymlink(location);

            InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(
                    partitionName,
                    targetStorageFormat,
                    schema,
                    partitionKeys,
                    effectivePredicate,
                    partitionMatchSupplier,
                    partition.getHiveColumnCoercions(),
                    Optional.empty(),
                    Optional.empty(),
                    getMaxInitialSplitSize(session),
                    isForceLocalScheduling(session),
                    maxSplitFileSize);

            for (Entry<Location, List<Location>> entry : Multimaps.asMap(targets).entrySet()) {
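                // each entry is one parent directory and its manifest targets; the directory is listed once and all targets verified against that listing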
                fileIterators.addLast(buildManifestFileIterator(splitFactory, entry.getKey(), entry.getValue(), splittable));
            }

            return COMPLETED_FUTURE;
        }

        StorageFormat rawStorageFormat = partition.getPartition()
                .map(Partition::getStorage).orElseGet(table::getStorage).getStorageFormat();
        HiveStorageFormat storageFormat = getHiveStorageFormat(rawStorageFormat)
                .orElseThrow(() -> new TrinoException(HIVE_INVALID_METADATA, "Unsupported storage format: %s %s".formatted(hivePartition, rawStorageFormat)));

        Optional<BucketConversion> bucketConversion = Optional.empty();
        boolean bucketConversionRequiresWorkerParticipation = false;
        if (partition.getPartition().isPresent()) {
            Optional<HiveBucketProperty> partitionBucketProperty = partition.getPartition().get().getStorage().getBucketProperty();
            if (tableBucketInfo.isPresent() && partitionBucketProperty.isPresent()) {
                int tableBucketCount = tableBucketInfo.get().getTableBucketCount();
                // Partition bucketing_version cannot be different from table
                BucketingVersion bucketingVersion = getBucketingVersion(table.getParameters());
                int partitionBucketCount = partitionBucketProperty.get().bucketCount();
                // Validation was done in HiveSplitManager#getPartitionMetadata.
                // Here, it's just trying to see if it needs the BucketConversion.
                if (tableBucketCount != partitionBucketCount) {
                    bucketConversion = Optional.of(new BucketConversion(bucketingVersion, tableBucketCount, partitionBucketCount, tableBucketInfo.get().getBucketColumns()));
                    if (tableBucketCount > partitionBucketCount) {
                        bucketConversionRequiresWorkerParticipation = true;
                    }
                }
            }
        }

        Optional<BucketValidation> bucketValidation = Optional.empty();
        if (isValidateBucketing(session) && tableBucketInfo.isPresent()) {
            BucketSplitInfo info = tableBucketInfo.get();
            bucketValidation = Optional.of(new BucketValidation(info.getBucketingVersion(), info.getTableBucketCount(), info.getBucketColumns()));
        }

        InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(
                partitionName,
                storageFormat,
                schema,
                partitionKeys,
                effectivePredicate,
                partitionMatchSupplier,
                partition.getHiveColumnCoercions(),
                bucketConversionRequiresWorkerParticipation ? bucketConversion : Optional.empty(),
                bucketValidation,
                getMaxInitialSplitSize(session),
                isForceLocalScheduling(session),
                maxSplitFileSize);

        if (isTransactionalTable(table.getParameters())) {
            return getTransactionalSplits(location, splittable, bucketConversion, splitFactory);
        }

        TrinoFileSystem trinoFileSystem = fileSystemFactory.create(session);
        // Bucketed partitions are fully loaded immediately since all files must be loaded to determine the file to bucket mapping
        if (tableBucketInfo.isPresent()) {
            List<TrinoFileStatus> files = listBucketFiles(trinoFileSystem, location, splitFactory.getPartitionName());
            return hiveSplitSource.addToQueue(getBucketedSplits(files, splitFactory, tableBucketInfo.get(), bucketConversion, splittable, Optional.empty()));
        }

        fileIterators.addLast(createInternalHiveSplitIterator(trinoFileSystem, location, splitFactory, splittable, Optional.empty()));

        return COMPLETED_FUTURE;
    }

    private List<TrinoFileStatus> listBucketFiles(TrinoFileSystem fs, Location location, String partitionName)
    {
        try {
            HiveFileIterator fileIterator = new HiveFileIterator(table, location, fs, directoryLister, FAIL);
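            // an empty listing may mean the partition directory itself is missing; verify existence unless absent partitions are tolerated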
            if (!fileIterator.hasNext() && !ignoreAbsentPartitions) {
                checkPartitionLocationExists(fs, location);
            }
            return ImmutableList.copyOf(fileIterator);
        }
        catch (HiveFileIterator.NestedDirectoryNotAllowedException e) {
            // Fail here to be on the safe side. This seems to be the same as what Hive does
            throw new TrinoException(HIVE_INVALID_BUCKET_FILES, "Hive table '%s' is corrupt. Found sub-directory '%s' in bucket directory for partition: %s"
                    .formatted(table.getSchemaTableName(), e.getNestedDirectoryPath(), partitionName));
        }
    }

    @VisibleForTesting
    Iterator<InternalHiveSplit> buildManifestFileIterator(InternalHiveSplitFactory splitFactory, Location location, List<Location> paths, boolean splittable)
    {
        return createInternalHiveSplitIterator(splitFactory, splittable, Optional.empty(), verifiedFileStatusesStream(location, paths));
    }

    private Stream<TrinoFileStatus> verifiedFileStatusesStream(Location location, List<Location> paths)
    {
        TrinoFileSystem trinoFileSystem = fileSystemFactory.create(session);
        // Check if location is cached BEFORE using the directoryLister
        boolean isCached = directoryLister.isCached(location);

        Map<String, TrinoFileStatus> fileStatuses = new HashMap<>();
        Iterator<TrinoFileStatus> fileStatusIterator = new HiveFileIterator(table, location, trinoFileSystem, directoryLister, RECURSE);
        if (!fileStatusIterator.hasNext()) {
            checkPartitionLocationExists(trinoFileSystem, location);
        }
        fileStatusIterator.forEachRemaining(status -> fileStatuses.put(Location.of(status.getPath()).path(), status));

        // If file statuses came from cache verify that all are present
        if (isCached) {
            boolean missing = paths.stream()
                    .anyMatch(path -> !fileStatuses.containsKey(path.path()));
            // Invalidate the cache and reload
            if (missing) {
                directoryLister.invalidate(location);

                fileStatuses.clear();
                fileStatusIterator = new HiveFileIterator(table, location, trinoFileSystem, directoryLister, RECURSE);
                fileStatusIterator.forEachRemaining(status -> fileStatuses.put(Location.of(status.getPath()).path(), status));
            }
        }

        return paths.stream()
                .map(path -> {
                    TrinoFileStatus status = fileStatuses.get(path.path());
                    if (status == null) {
                        throw new TrinoException(HIVE_FILE_NOT_FOUND, "Manifest file from the location [%s] contains non-existent path: %s".formatted(location, path));
                    }
                    return status;
                });
    }

    private ListenableFuture<Void> getTransactionalSplits(Location path, boolean splittable, Optional<BucketConversion> bucketConversion, InternalHiveSplitFactory splitFactory)
            throws IOException
    {
        TrinoFileSystem fileSystem = fileSystemFactory.create(session);
        ValidWriteIdList writeIds = validWriteIds.orElseThrow(() -> new IllegalStateException("No validWriteIds present"));
        AcidState acidState = getAcidState(fileSystem, path, writeIds);

        boolean fullAcid = isFullAcidTable(table.getParameters());
        AcidInfo.Builder acidInfoBuilder = AcidInfo.builder(path);

        if (fullAcid) {
            // From Hive version >= 3.0, delta/base files will always have file '_orc_acid_version' with value >= '2'.
            Optional<Location> baseOrDeltaPath = acidState.baseDirectory()
                    .or(() -> acidState.deltas().stream().findFirst()
                            .map(delta -> Location.of(delta.path())));

            if (baseOrDeltaPath.isPresent() && readAcidVersionFile(fileSystem, baseOrDeltaPath.get()) >= 2) {
                // Trino cannot read ORC ACID tables with version < 2 (written by Hive older than 3.0)
                // See https://github.com/trinodb/trino/issues/2790#issuecomment-591901728 for more context

                // We perform initial version check based on _orc_acid_version file here.
                // If we cannot verify the version (the _orc_acid_version file may not exist),
                // we will do extra check based on ORC datafile metadata in OrcPageSourceFactory.
                acidInfoBuilder.setOrcAcidVersionValidated(true);
            }
        }

        // Collect base files, delta files, and delete delta paths
        List<TrinoFileStatus> acidFiles = new ArrayList<>();
        for (FileEntry file : acidState.baseFiles()) {
            acidFiles.add(new TrinoFileStatus(file));
        }

        for (ParsedDelta delta : acidState.deltas()) {
            if (delta.deleteDelta()) {
                if (!fullAcid) {
                    throw new TrinoException(HIVE_BAD_DATA, "Unexpected delete delta for a non full ACID table '%s'. Would be ignored by the reader: %s"
                            .formatted(table.getSchemaTableName(), delta.path()));
                }
                acidInfoBuilder.addDeleteDelta(Location.of(delta.path()));
            }
            else {
                for (FileEntry file : delta.files()) {
                    acidFiles.add(new TrinoFileStatus(file));
                }
            }
        }

        for (FileEntry entry : acidState.originalFiles()) {
            // Hive requires "original" files of transactional tables to conform to the bucketed tables naming pattern, to match them with delete deltas.
            acidInfoBuilder.addOriginalFile(entry.location(), entry.length(), getRequiredBucketNumber(entry.location()));
        }

        if (tableBucketInfo.isPresent()) {
            BucketSplitInfo bucketInfo = tableBucketInfo.get();

            for (FileEntry entry : acidState.originalFiles()) {
                List<TrinoFileStatus> fileStatuses = ImmutableList.of(new TrinoFileStatus(entry));
                Optional<AcidInfo> acidInfo = acidInfoForOriginalFiles(fullAcid, acidInfoBuilder, entry.location());
                hiveSplitSource.addToQueue(getBucketedSplits(fileStatuses, splitFactory, bucketInfo, bucketConversion, splittable, acidInfo));
            }

            Optional<AcidInfo> acidInfo = acidInfo(fullAcid, acidInfoBuilder);
            return hiveSplitSource.addToQueue(getBucketedSplits(acidFiles, splitFactory, bucketInfo, bucketConversion, splittable, acidInfo));
        }

        Optional<AcidInfo> acidInfo = acidInfo(fullAcid, acidInfoBuilder);
        fileIterators.addLast(createInternalHiveSplitIterator(splitFactory, splittable, acidInfo, acidFiles.stream()));

        fileIterators.addLast(generateOriginalFilesSplits(splitFactory, acidState.originalFiles(), splittable, acidInfoBuilder, fullAcid));

        return COMPLETED_FUTURE;
    }

    private static Iterator<InternalHiveSplit> generateOriginalFilesSplits(
            InternalHiveSplitFactory splitFactory,
            List<FileEntry> originalFileLocations,
            boolean splittable,
            AcidInfo.Builder acidInfoBuilder,
            boolean fullAcid)
    {
        return originalFileLocations.stream()
                .map(entry -> createInternalHiveSplit(
                        splitFactory,
                        splittable,
                        acidInfoForOriginalFiles(fullAcid, acidInfoBuilder, entry.location()),
                        new TrinoFileStatus(entry)))
                .flatMap(Optional::stream)
                .iterator();
    }

    private static Optional<AcidInfo> acidInfo(boolean fullAcid, AcidInfo.Builder builder)
    {
        return fullAcid ? builder.build() : Optional.empty();
    }

    private static Optional<AcidInfo> acidInfoForOriginalFiles(boolean fullAcid, AcidInfo.Builder builder, Location location)
    {
        return fullAcid ? Optional.of(builder.buildWithRequiredOriginalFiles(getRequiredBucketNumber(location))) : Optional.empty();
    }

    private Iterator<InternalHiveSplit> createInternalHiveSplitIterator(TrinoFileSystem fileSystem, Location location, InternalHiveSplitFactory splitFactory, boolean splittable, Optional<AcidInfo> acidInfo)
    {
        Iterator<TrinoFileStatus> iterator = new HiveFileIterator(table, location, fileSystem, directoryLister, recursiveDirWalkerEnabled ? RECURSE : IGNORED);
        if (!iterator.hasNext() && !ignoreAbsentPartitions) {
            checkPartitionLocationExists(fileSystem, location);
        }
        return createInternalHiveSplitIterator(splitFactory, splittable, acidInfo, Streams.stream(iterator));
    }

    private static void checkPartitionLocationExists(TrinoFileSystem fileSystem, Location location)
    {
        try {
            if (!fileSystem.directoryExists(location).orElse(true)) {
                throw new TrinoException(HIVE_FILE_NOT_FOUND, "Partition location does not exist: " + location);
            }
        }
        catch (IOException e) {
            throw new TrinoException(HIVE_FILESYSTEM_ERROR, "Failed checking directory path: " + location, e);
        }
    }

    private static Iterator<InternalHiveSplit> createInternalHiveSplitIterator(InternalHiveSplitFactory splitFactory, boolean splittable, Optional<AcidInfo> acidInfo, Stream<TrinoFileStatus> fileStream)
    {
        return fileStream
                .map(file -> createInternalHiveSplit(splitFactory, splittable, acidInfo, file))
                .flatMap(Optional::stream)
                .iterator();
    }

    private static Optional<InternalHiveSplit> createInternalHiveSplit(InternalHiveSplitFactory splitFactory, boolean splittable, Optional<AcidInfo> acidInfo, TrinoFileStatus file)
    {
        return splitFactory.createInternalHiveSplit(file, OptionalInt.empty(), OptionalInt.empty(), splittable, acidInfo);
    }

    private List<InternalHiveSplit> getBucketedSplits(
            List<TrinoFileStatus> files,
            InternalHiveSplitFactory splitFactory,
            BucketSplitInfo bucketSplitInfo,
            Optional<BucketConversion> bucketConversion,
            boolean splittable,
            Optional<AcidInfo> acidInfo)
    {
        int readBucketCount = bucketSplitInfo.getReadBucketCount();
        int tableBucketCount = bucketSplitInfo.getTableBucketCount();
        int partitionBucketCount = bucketConversion.map(BucketConversion::partitionBucketCount).orElse(tableBucketCount);
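        // iterate over the larger of the read and partition bucket counts; each index is mapped below to a physical (partition) bucket and a logical (read) bucket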
        int bucketCount = max(readBucketCount, partitionBucketCount);

        checkState(readBucketCount <= tableBucketCount, "readBucketCount(%s) should be less than or equal to tableBucketCount(%s)", readBucketCount, tableBucketCount);

        // build mapping of file name to bucket
        ListMultimap<Integer, TrinoFileStatus> bucketFiles = ArrayListMultimap.create();
        for (TrinoFileStatus file : files) {
            String fileName = Location.of(file.getPath()).fileName();
            OptionalInt bucket = getBucketNumber(fileName);
            if (bucket.isPresent()) {
                bucketFiles.put(bucket.getAsInt(), file);
                continue;
            }

            // legacy mode requires exactly one file per bucket
            if (files.size() != partitionBucketCount) {
                throw new TrinoException(HIVE_INVALID_BUCKET_FILES, format(
                        "Hive table '%s' is corrupt. File '%s' does not match the standard naming pattern, and the number " +
                                "of files in the directory (%s) does not match the declared bucket count (%s) for partition: %s",
                        table.getSchemaTableName(),
                        fileName,
                        files.size(),
                        partitionBucketCount,
                        splitFactory.getPartitionName()));
            }

            // sort FileStatus objects per `org.apache.hadoop.hive.ql.metadata.Table#getSortedPaths()`
            files = files.stream().sorted().toList();

            // use position in sorted list as the bucket number
            bucketFiles.clear();
            for (int i = 0; i < files.size(); i++) {
                bucketFiles.put(i, files.get(i));
            }
            break;
        }

        validateFileBuckets(bucketFiles, partitionBucketCount, table.getSchemaTableName().toString(), splitFactory.getPartitionName());

        // convert files to internal splits
        List<InternalHiveSplit> splitList = new ArrayList<>();
        for (int bucketNumber = 0; bucketNumber < bucketCount; bucketNumber++) {
            // Physical bucket #. This determines the file name. It also determines the order of splits in the result.
            int partitionBucketNumber = bucketNumber % partitionBucketCount;
            // Logical bucket #. Each logical bucket corresponds to a "bucket" from the engine's perspective.
            int readBucketNumber = bucketNumber % readBucketCount;

            boolean containsIneligibleTableBucket = false;
            List<Integer> eligibleTableBucketNumbers = new ArrayList<>();
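            // walk every table bucket that maps onto this physical/logical bucket (stride of bucketCount)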
            for (int tableBucketNumber = bucketNumber % tableBucketCount; tableBucketNumber < tableBucketCount; tableBucketNumber += bucketCount) {
                // table bucket number: this is used for evaluating "$bucket" filters.
                if (bucketSplitInfo.isTableBucketEnabled(tableBucketNumber)) {
                    eligibleTableBucketNumbers.add(tableBucketNumber);
                }
                else {
                    containsIneligibleTableBucket = true;
                }
            }

            if (!eligibleTableBucketNumbers.isEmpty() && containsIneligibleTableBucket) {
                throw new TrinoException(
                        NOT_SUPPORTED,
                        "The bucket filter cannot be satisfied. There are restrictions on the bucket filter when all of the following are true: " +
                                "1. a table has a different bucket count from at least one of its partitions that is read in this query; " +
                                "2. the table has a different but compatible bucket count with another table in the query; " +
                                "3. some buckets of the table are filtered out from the query, most likely using a filter on \"$bucket\". " +
                                "(table name: " + table.getTableName() + ", table bucket count: " + tableBucketCount + ", " +
                                "partition bucket count: " + partitionBucketCount + ", effective reading bucket count: " + readBucketCount + ")");
            }
            if (!eligibleTableBucketNumbers.isEmpty()) {
                for (TrinoFileStatus file : bucketFiles.get(partitionBucketNumber)) {
                    // OrcDeletedRows will load only delete delta files matching current bucket id,
                    // so we can pass all delete delta locations here, without filtering.
                    eligibleTableBucketNumbers.stream()
                            .map(tableBucketNumber -> splitFactory.createInternalHiveSplit(file, OptionalInt.of(readBucketNumber), OptionalInt.of(tableBucketNumber), splittable, acidInfo))
                            .flatMap(Optional::stream)
                            .forEach(splitList::add);
                }
            }
        }
        return splitList;
    }

    @VisibleForTesting
    static void validateFileBuckets(ListMultimap<Integer, TrinoFileStatus> bucketFiles, int partitionBucketCount, String tableName, String partitionName)
    {
        if (bucketFiles.isEmpty()) {
            return;
        }

        int highestBucketNumber = max(bucketFiles.keySet());
        // validate the bucket number detected from files, fail the query if the highest bucket number detected from file
        // exceeds the allowed highest number
        if (highestBucketNumber >= partitionBucketCount) {
            throw new TrinoException(HIVE_INVALID_BUCKET_FILES, format(
                    "Hive table '%s' is corrupt. The highest bucket number in the directory (%s) exceeds the bucket number range " +
                            "defined by the declared bucket count (%s) for partition: %s",
                    tableName,
                    highestBucketNumber,
                    partitionBucketCount,
                    partitionName));
        }
    }

    private static int getRequiredBucketNumber(Location location)
    {
        return getBucketNumber(location.fileName())
                .orElseThrow(() -> new IllegalStateException("Cannot get bucket number from location: " + location));
    }

    @VisibleForTesting
    static OptionalInt getBucketNumber(String name)
    {
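        // Examples: "000123_0" and "000123_0_copy_1" resolve to bucket 123; "bucket_00042_3" resolves to bucket 42;
        // a legacy Presto name like "20180102_030405_00054_x1y2z_bucket-00077" resolves to bucket 77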
        for (Pattern pattern : BUCKET_PATTERNS) {
            Matcher matcher = pattern.matcher(name);
            if (matcher.matches()) {
                return OptionalInt.of(parseInt(matcher.group(1)));
            }
        }
        return OptionalInt.empty();
    }

    public static boolean hasAttemptId(String bucketFilename)
    {
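        // e.g. "bucket_00001_2" carries an attempt id, while "bucket_00001" does not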
        Matcher matcher = BUCKET_WITH_OPTIONAL_ATTEMPT_ID_PATTERN.matcher(bucketFilename);
        return matcher.matches() && matcher.group(2) != null;
    }

    private static HiveStorageFormat getSymlinkStorageFormat(String serde)
    {
        // LazySimpleSerDe is used by TEXTFILE and SEQUENCEFILE. Use TEXTFILE per Hive behavior.
        if (serde.equals(TEXTFILE.getSerde())) {
            return TEXTFILE;
        }
        return Arrays.stream(HiveStorageFormat.values())
                .filter(format -> serde.equals(format.getSerde()))
                .findFirst()
                .orElseThrow(() -> new TrinoException(HIVE_UNSUPPORTED_FORMAT, "Unknown SerDe for SymlinkTextInputFormat: " + serde));
    }

    private ListMultimap<Location, Location> getTargetLocationsByParentFromSymlink(Location symlinkDir)
    {
        TrinoFileSystem fileSystem = fileSystemFactory.create(session);
        try {
            ListMultimap<Location, Location> targets = ArrayListMultimap.create();
            FileIterator iterator = fileSystem.listFiles(symlinkDir);
            while (iterator.hasNext()) {
                Location location = iterator.next().location();
                String name = location.fileName();
                if (name.startsWith("_") || name.startsWith(".")) {
                    continue;
                }

                try (Reader reader = new InputStreamReader(fileSystem.newInputFile(location).newStream(), UTF_8)) {
                    CharStreams.readLines(reader).stream()
                            .map(Location::of)
                            .forEach(target -> targets.put(target.parentDirectory(), target));
                }
            }
            return targets;
        }
        catch (IOException | IllegalArgumentException e) {
            throw new TrinoException(HIVE_BAD_DATA, "Error parsing symlinks from: " + symlinkDir, e);
        }
    }

    private static List<HivePartitionKey> getPartitionKeys(Table table, Optional<Partition> partition)
    {
        if (partition.isEmpty()) {
            return ImmutableList.of();
        }
        ImmutableList.Builder<HivePartitionKey> partitionKeys = ImmutableList.builder();
        List<Column> keys = table.getPartitionColumns();
        List<String> values = partition.get().getValues();
        checkCondition(keys.size() == values.size(), HIVE_INVALID_METADATA, "Expected %s partition key values, but got %s", keys.size(), values.size());
        for (int i = 0; i < keys.size(); i++) {
            String name = keys.get(i).getName();
            HiveType hiveType = keys.get(i).getType();
            if (!hiveType.isSupportedType(table.getStorage().getStorageFormat())) {
                throw new TrinoException(NOT_SUPPORTED, format("Unsupported Hive type %s found in partition keys of table %s.%s", hiveType, table.getDatabaseName(), table.getTableName()));
            }
            String value = values.get(i);
            checkCondition(value != null, HIVE_INVALID_PARTITION_VALUE, "partition key value cannot be null for field: %s", name);
            partitionKeys.add(new HivePartitionKey(name, value));
        }
        return partitionKeys.build();
    }

    public static class BucketSplitInfo
    {
        private final BucketingVersion bucketingVersion;
        private final List<HiveColumnHandle> bucketColumns;
        private final int tableBucketCount;
        private final int readBucketCount;
        private final IntPredicate bucketFilter;

        public static Optional<BucketSplitInfo> createBucketSplitInfo(Optional<HiveBucketHandle> bucketHandle, Optional<HiveBucketFilter> bucketFilter)
        {
            requireNonNull(bucketHandle, "bucketHandle is null");
            requireNonNull(bucketFilter, "bucketFilter is null");

            if (bucketHandle.isEmpty()) {
                checkArgument(bucketFilter.isEmpty(), "bucketHandle must be present if bucketFilter is present");
                return Optional.empty();
            }

            BucketingVersion bucketingVersion = bucketHandle.get().bucketingVersion();
            int tableBucketCount = bucketHandle.get().tableBucketCount();
            int readBucketCount = bucketHandle.get().readBucketCount();

            List<HiveColumnHandle> bucketColumns = bucketHandle.get().columns();
            IntPredicate predicate = bucketFilter
                    .map(filter -> filter.getBucketsToKeep()::contains)
                    .orElse(bucket -> true);
            return Optional.of(new BucketSplitInfo(bucketingVersion, bucketColumns, tableBucketCount, readBucketCount, predicate));
        }

        private BucketSplitInfo(BucketingVersion bucketingVersion, List<HiveColumnHandle> bucketColumns, int tableBucketCount, int readBucketCount, IntPredicate bucketFilter)
        {
            this.bucketingVersion = requireNonNull(bucketingVersion, "bucketingVersion is null");
            this.bucketColumns = ImmutableList.copyOf(requireNonNull(bucketColumns, "bucketColumns is null"));
            this.tableBucketCount = tableBucketCount;
            this.readBucketCount = readBucketCount;
            this.bucketFilter = requireNonNull(bucketFilter, "bucketFilter is null");
        }

        public BucketingVersion getBucketingVersion()
        {
            return bucketingVersion;
        }

        public List<HiveColumnHandle> getBucketColumns()
        {
            return bucketColumns;
        }

        public int getTableBucketCount()
        {
            return tableBucketCount;
        }

        public int getReadBucketCount()
        {
            return readBucketCount;
        }

        /**
         * Evaluates whether the provided table bucket number passes the bucket predicate.
         * A bucket predicate can be present in two cases:
         * <ul>
         * <li>Filter on "$bucket" column, e.g. {@code "$bucket" between 0 and 100}
         * <li>Single-value equality filter on all bucket columns, e.g. for a table with two bucketing columns,
         *     {@code bucketCol1 = 'a' AND bucketCol2 = 123}
         * </ul>
         */
        public boolean isTableBucketEnabled(int tableBucketNumber)
        {
            return bucketFilter.test(tableBucketNumber);
        }
    }

    private static void checkExecutorIsNotDirectExecutor(Executor executor)
    {
        ReentrantLock lock = new ReentrantLock();
        lock.lock();
        try {
            executor.execute(() -> checkState(!lock.isHeldByCurrentThread(), "executor is a direct executor"));
        }
        finally {
            lock.unlock();
        }
    }
}



