io.trino.plugin.hive.BackgroundHiveSplitLoader Maven / Gradle / Ivy
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.hive;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Streams;
import com.google.common.io.CharStreams;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import io.airlift.units.Duration;
import io.trino.plugin.hive.HdfsEnvironment.HdfsContext;
import io.trino.plugin.hive.HiveSplit.BucketConversion;
import io.trino.plugin.hive.HiveSplit.BucketValidation;
import io.trino.plugin.hive.acid.AcidTransaction;
import io.trino.plugin.hive.metastore.Column;
import io.trino.plugin.hive.metastore.Partition;
import io.trino.plugin.hive.metastore.Table;
import io.trino.plugin.hive.util.HiveBucketing.BucketingVersion;
import io.trino.plugin.hive.util.HiveBucketing.HiveBucketFilter;
import io.trino.plugin.hive.util.HiveFileIterator;
import io.trino.plugin.hive.util.InternalHiveSplitFactory;
import io.trino.plugin.hive.util.ResumableTask;
import io.trino.plugin.hive.util.ResumableTasks;
import io.trino.spi.TrinoException;
import io.trino.spi.connector.ColumnHandle;
import io.trino.spi.connector.ConnectorSession;
import io.trino.spi.connector.DynamicFilter;
import io.trino.spi.predicate.TupleDomain;
import io.trino.spi.type.TypeManager;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidWriteIdList;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat;
import org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapreduce.MRConfig;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.lang.annotation.Annotation;
import java.nio.charset.StandardCharsets;
import java.security.Principal;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.Executor;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.BooleanSupplier;
import java.util.function.Function;
import java.util.function.IntPredicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static com.google.common.collect.Iterables.getOnlyElement;
import static com.google.common.collect.Maps.fromProperties;
import static com.google.common.util.concurrent.Futures.immediateVoidFuture;
import static com.google.common.util.concurrent.MoreExecutors.directExecutor;
import static io.airlift.concurrent.MoreFutures.addExceptionCallback;
import static io.airlift.concurrent.MoreFutures.toListenableFuture;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_BUCKET_FILES;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_PARTITION_VALUE;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_UNKNOWN_ERROR;
import static io.trino.plugin.hive.HivePartitionManager.partitionMatches;
import static io.trino.plugin.hive.HiveSessionProperties.getMaxInitialSplitSize;
import static io.trino.plugin.hive.HiveSessionProperties.isForceLocalScheduling;
import static io.trino.plugin.hive.HiveSessionProperties.isValidateBucketing;
import static io.trino.plugin.hive.metastore.MetastoreUtil.getHiveSchema;
import static io.trino.plugin.hive.metastore.MetastoreUtil.getPartitionLocation;
import static io.trino.plugin.hive.s3select.S3SelectPushdown.shouldEnablePushdownForTable;
import static io.trino.plugin.hive.util.ConfigurationUtils.toJobConf;
import static io.trino.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.FAIL;
import static io.trino.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.IGNORED;
import static io.trino.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.RECURSE;
import static io.trino.plugin.hive.util.HiveUtil.checkCondition;
import static io.trino.plugin.hive.util.HiveUtil.getFooterCount;
import static io.trino.plugin.hive.util.HiveUtil.getHeaderCount;
import static io.trino.plugin.hive.util.HiveUtil.getInputFormat;
import static io.trino.plugin.hive.util.HiveUtil.getPartitionKeyColumnHandles;
import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED;
import static java.lang.Integer.parseInt;
import static java.lang.Math.max;
import static java.lang.String.format;
import static java.util.Collections.max;
import static java.util.Objects.requireNonNull;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static org.apache.hadoop.fs.Path.getPathWithoutSchemeAndAuthority;
import static org.apache.hadoop.hive.common.FileUtils.HIDDEN_FILES_PATH_FILTER;
public class BackgroundHiveSplitLoader
implements HiveSplitLoader
{
// See https://github.com/apache/hive/commit/ffee30e6267e85f00a22767262192abb9681cfb7#diff-5fe26c36b4e029dcd344fc5d484e7347R165
private static final Pattern BUCKET_WITH_OPTIONAL_ATTEMPT_ID_PATTERN = Pattern.compile("bucket_(\\d+)(_\\d+)?$");
private static final Iterable BUCKET_PATTERNS = ImmutableList.of(
// legacy Presto naming pattern (current version matches Hive)
Pattern.compile("\\d{8}_\\d{6}_\\d{5}_[a-z0-9]{5}_bucket-(\\d+)(?:[-_.].*)?"),
// Hive naming pattern per `org.apache.hadoop.hive.ql.exec.Utilities#getBucketIdFromFile()`
Pattern.compile("(\\d+)_\\d+.*"),
// Hive ACID with optional direct insert attempt id
BUCKET_WITH_OPTIONAL_ATTEMPT_ID_PATTERN);
private static final ListenableFuture COMPLETED_FUTURE = immediateVoidFuture();
private final Table table;
private final AcidTransaction transaction;
private final TupleDomain extends ColumnHandle> compactEffectivePredicate;
private final DynamicFilter dynamicFilter;
private final long dynamicFilteringWaitTimeoutMillis;
private final TypeManager typeManager;
private final Optional tableBucketInfo;
private final HdfsEnvironment hdfsEnvironment;
private final HdfsContext hdfsContext;
private final NamenodeStats namenodeStats;
private final DirectoryLister directoryLister;
private final int loaderConcurrency;
private final boolean recursiveDirWalkerEnabled;
private final boolean ignoreAbsentPartitions;
private final boolean optimizeSymlinkListing;
private final Executor executor;
private final ConnectorSession session;
private final ConcurrentLazyQueue partitions;
private final Deque> fileIterators = new ConcurrentLinkedDeque<>();
private final Optional validWriteIds;
private final Optional maxSplitFileSize;
// Purpose of this lock:
// * Write lock: when you need a consistent view across partitions, fileIterators, and hiveSplitSource.
// * Read lock: when you need to modify any of the above.
// Make sure the lock is held throughout the period during which they may not be consistent with each other.
// Details:
// * When write lock is acquired, except the holder, no one can do any of the following:
// ** poll from (or check empty) partitions
// ** poll from (or check empty) or push to fileIterators
// ** push to hiveSplitSource
// * When any of the above three operations is carried out, either a read lock or a write lock must be held.
// * When a series of operations involving two or more of the above three operations are carried out, the lock
// must be continuously held throughout the series of operations.
// Implications:
// * if you hold a read lock but not a write lock, you can do any of the above three operations, but you may
// see a series of operations involving two or more of the operations carried out half way.
private final ReadWriteLock taskExecutionLock = new ReentrantReadWriteLock();
private HiveSplitSource hiveSplitSource;
private Stopwatch stopwatch;
private volatile boolean stopped;
public BackgroundHiveSplitLoader(
Table table,
AcidTransaction transaction,
Iterable partitions,
TupleDomain extends ColumnHandle> compactEffectivePredicate,
DynamicFilter dynamicFilter,
Duration dynamicFilteringWaitTimeout,
TypeManager typeManager,
Optional tableBucketInfo,
ConnectorSession session,
HdfsEnvironment hdfsEnvironment,
NamenodeStats namenodeStats,
DirectoryLister directoryLister,
Executor executor,
int loaderConcurrency,
boolean recursiveDirWalkerEnabled,
boolean ignoreAbsentPartitions,
boolean optimizeSymlinkListing,
Optional validWriteIds,
Optional maxSplitFileSize)
{
this.table = table;
this.transaction = requireNonNull(transaction, "transaction is null");
this.compactEffectivePredicate = compactEffectivePredicate;
this.dynamicFilter = dynamicFilter;
this.dynamicFilteringWaitTimeoutMillis = dynamicFilteringWaitTimeout.toMillis();
this.typeManager = typeManager;
this.tableBucketInfo = tableBucketInfo;
this.loaderConcurrency = loaderConcurrency;
checkArgument(loaderConcurrency > 0, "loaderConcurrency must be > 0, found: %s", loaderConcurrency);
this.session = session;
this.hdfsEnvironment = hdfsEnvironment;
this.namenodeStats = namenodeStats;
this.directoryLister = directoryLister;
this.recursiveDirWalkerEnabled = recursiveDirWalkerEnabled;
this.ignoreAbsentPartitions = ignoreAbsentPartitions;
this.optimizeSymlinkListing = optimizeSymlinkListing;
this.executor = executor;
this.partitions = new ConcurrentLazyQueue<>(partitions);
this.hdfsContext = new HdfsContext(session);
this.validWriteIds = requireNonNull(validWriteIds, "validWriteIds is null");
this.maxSplitFileSize = requireNonNull(maxSplitFileSize, "maxSplitFileSize is null");
}
@Override
public void start(HiveSplitSource splitSource)
{
this.hiveSplitSource = splitSource;
this.stopwatch = Stopwatch.createStarted();
for (int i = 0; i < loaderConcurrency; i++) {
ListenableFuture future = ResumableTasks.submit(executor, new HiveSplitLoaderTask());
addExceptionCallback(future, hiveSplitSource::fail); // best effort; hiveSplitSource could be already completed
}
}
@Override
public void stop()
{
stopped = true;
}
private class HiveSplitLoaderTask
implements ResumableTask
{
@Override
public TaskStatus process()
{
while (true) {
if (stopped) {
return TaskStatus.finished();
}
ListenableFuture future;
// Block until one of below conditions is met:
// 1. Completion of DynamicFilter
// 2. Timeout after waiting for the configured time
long timeLeft = dynamicFilteringWaitTimeoutMillis - stopwatch.elapsed(MILLISECONDS);
if (timeLeft > 0 && dynamicFilter.isAwaitable()) {
future = asVoid(toListenableFuture(dynamicFilter.isBlocked()
// As isBlocked() returns unmodifiableFuture, we need to create new future for correct propagation of the timeout
.thenApply(Function.identity())
.orTimeout(timeLeft, MILLISECONDS)));
return TaskStatus.continueOn(future);
}
taskExecutionLock.readLock().lock();
try {
future = loadSplits();
}
catch (Throwable e) {
if (e instanceof IOException) {
e = new TrinoException(HIVE_FILESYSTEM_ERROR, e);
}
else if (!(e instanceof TrinoException)) {
e = new TrinoException(HIVE_UNKNOWN_ERROR, e);
}
// Fail the split source before releasing the execution lock
// Otherwise, a race could occur where the split source is completed before we fail it.
hiveSplitSource.fail(e);
checkState(stopped);
return TaskStatus.finished();
}
finally {
taskExecutionLock.readLock().unlock();
}
invokeNoMoreSplitsIfNecessary();
if (!future.isDone()) {
return TaskStatus.continueOn(future);
}
}
}
}
private void invokeNoMoreSplitsIfNecessary()
{
taskExecutionLock.readLock().lock();
try {
// This is an opportunistic check to avoid getting the write lock unnecessarily
if (!partitions.isEmpty() || !fileIterators.isEmpty()) {
return;
}
}
catch (Exception e) {
hiveSplitSource.fail(e);
checkState(stopped, "Task is not marked as stopped even though it failed");
return;
}
finally {
taskExecutionLock.readLock().unlock();
}
taskExecutionLock.writeLock().lock();
try {
// the write lock guarantees that no one is operating on the partitions, fileIterators, or hiveSplitSource, or half way through doing so.
if (partitions.isEmpty() && fileIterators.isEmpty()) {
// It is legal to call `noMoreSplits` multiple times or after `stop` was called.
// Nothing bad will happen if `noMoreSplits` implementation calls methods that will try to obtain a read lock because the lock is re-entrant.
hiveSplitSource.noMoreSplits();
}
}
catch (Exception e) {
hiveSplitSource.fail(e);
checkState(stopped, "Task is not marked as stopped even though it failed");
}
finally {
taskExecutionLock.writeLock().unlock();
}
}
private static ListenableFuture asVoid(ListenableFuture future)
{
return Futures.transform(future, v -> null, directExecutor());
}
private ListenableFuture loadSplits()
throws IOException
{
Iterator splits = fileIterators.poll();
if (splits == null) {
HivePartitionMetadata partition = partitions.poll();
if (partition == null) {
return COMPLETED_FUTURE;
}
return loadPartition(partition);
}
while (splits.hasNext() && !stopped) {
ListenableFuture future = hiveSplitSource.addToQueue(splits.next());
if (!future.isDone()) {
fileIterators.addFirst(splits);
return future;
}
}
// No need to put the iterator back, since it's either empty or we've stopped
return COMPLETED_FUTURE;
}
private ListenableFuture loadPartition(HivePartitionMetadata partition)
throws IOException
{
HivePartition hivePartition = partition.getHivePartition();
String partitionName = hivePartition.getPartitionId();
Properties schema = getPartitionSchema(table, partition.getPartition());
List partitionKeys = getPartitionKeys(table, partition.getPartition());
TupleDomain effectivePredicate = compactEffectivePredicate.transformKeys(HiveColumnHandle.class::cast);
List partitionColumns = getPartitionKeyColumnHandles(table, typeManager);
BooleanSupplier partitionMatchSupplier =
partitionColumns.stream().noneMatch(dynamicFilter.getColumnsCovered()::contains)
? () -> true
: () -> partitionMatches(partitionColumns, dynamicFilter.getCurrentPredicate(), hivePartition);
if (!partitionMatchSupplier.getAsBoolean()) {
// Avoid listing files and creating splits from a partition if it has been pruned due to dynamic filters
return COMPLETED_FUTURE;
}
Path path = new Path(getPartitionLocation(table, partition.getPartition()));
Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path);
InputFormat, ?> inputFormat = getInputFormat(configuration, schema, false);
FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path);
boolean s3SelectPushdownEnabled = shouldEnablePushdownForTable(session, table, path.toString(), partition.getPartition());
// S3 Select pushdown works at the granularity of individual S3 objects,
// therefore we must not split files when it is enabled.
// Skip header / footer lines are not splittable except for a special case when skip.header.line.count=1
boolean splittable = !s3SelectPushdownEnabled && getFooterCount(schema) == 0 && getHeaderCount(schema) <= 1;
if (inputFormat instanceof SymlinkTextInputFormat) {
if (tableBucketInfo.isPresent()) {
throw new TrinoException(NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
}
InputFormat, ?> targetInputFormat = getInputFormat(configuration, schema, true);
List targetPaths = hdfsEnvironment.doAs(
hdfsContext.getIdentity(),
() -> getTargetPathsFromSymlink(fs, path));
Set parents = targetPaths.stream()
.map(Path::getParent)
.distinct()
.collect(toImmutableSet());
if (optimizeSymlinkListing && parents.size() == 1 && !recursiveDirWalkerEnabled) {
Optional> manifestFileIterator = buildManifestFileIterator(
targetInputFormat,
partitionName,
schema,
partitionKeys,
effectivePredicate,
partitionMatchSupplier,
s3SelectPushdownEnabled,
partition.getTableToPartitionMapping(),
getOnlyElement(parents),
targetPaths,
splittable);
if (manifestFileIterator.isPresent()) {
fileIterators.addLast(manifestFileIterator.get());
return COMPLETED_FUTURE;
}
}
return createHiveSymlinkSplits(
partitionName,
targetInputFormat,
schema,
partitionKeys,
effectivePredicate,
partitionMatchSupplier,
s3SelectPushdownEnabled,
partition.getTableToPartitionMapping(),
targetPaths);
}
Optional bucketConversion = Optional.empty();
boolean bucketConversionRequiresWorkerParticipation = false;
if (partition.getPartition().isPresent()) {
Optional partitionBucketProperty = partition.getPartition().get().getStorage().getBucketProperty();
if (tableBucketInfo.isPresent() && partitionBucketProperty.isPresent()) {
int readBucketCount = tableBucketInfo.get().getReadBucketCount();
BucketingVersion bucketingVersion = partitionBucketProperty.get().getBucketingVersion(); // TODO can partition's bucketing_version be different from table's?
int partitionBucketCount = partitionBucketProperty.get().getBucketCount();
// Validation was done in HiveSplitManager#getPartitionMetadata.
// Here, it's just trying to see if its needs the BucketConversion.
if (readBucketCount != partitionBucketCount) {
bucketConversion = Optional.of(new BucketConversion(bucketingVersion, readBucketCount, partitionBucketCount, tableBucketInfo.get().getBucketColumns()));
if (readBucketCount > partitionBucketCount) {
bucketConversionRequiresWorkerParticipation = true;
}
}
}
}
Optional bucketValidation = Optional.empty();
if (isValidateBucketing(session) && tableBucketInfo.isPresent()) {
BucketSplitInfo info = tableBucketInfo.get();
bucketValidation = Optional.of(new BucketValidation(info.getBucketingVersion(), info.getTableBucketCount(), info.getBucketColumns()));
}
InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(
fs,
partitionName,
inputFormat,
schema,
partitionKeys,
effectivePredicate,
partitionMatchSupplier,
partition.getTableToPartitionMapping(),
bucketConversionRequiresWorkerParticipation ? bucketConversion : Optional.empty(),
bucketValidation,
getMaxInitialSplitSize(session),
isForceLocalScheduling(session),
s3SelectPushdownEnabled,
transaction,
maxSplitFileSize);
// To support custom input formats, we want to call getSplits()
// on the input format to obtain file splits.
if (shouldUseFileSplitsFromInputFormat(inputFormat)) {
if (tableBucketInfo.isPresent()) {
throw new TrinoException(NOT_SUPPORTED, "Trino cannot read bucketed partition in an input format with UseFileSplitsFromInputFormat annotation: " + inputFormat.getClass().getSimpleName());
}
if (AcidUtils.isTransactionalTable(table.getParameters())) {
throw new TrinoException(NOT_SUPPORTED, "Hive transactional tables in an input format with UseFileSplitsFromInputFormat annotation are not supported: " + inputFormat.getClass().getSimpleName());
}
JobConf jobConf = toJobConf(configuration);
FileInputFormat.setInputPaths(jobConf, path);
// Pass SerDes and Table parameters into input format configuration
fromProperties(schema).forEach(jobConf::set);
InputSplit[] splits = hdfsEnvironment.doAs(hdfsContext.getIdentity(), () -> inputFormat.getSplits(jobConf, 0));
return addSplitsToSource(splits, splitFactory);
}
List readPaths;
List fileStatusOriginalFiles = ImmutableList.of();
AcidInfo.Builder acidInfoBuilder = AcidInfo.builder(path);
boolean isFullAcid = AcidUtils.isFullAcidTable(table.getParameters());
if (AcidUtils.isTransactionalTable(table.getParameters())) {
AcidUtils.Directory directory = hdfsEnvironment.doAs(hdfsContext.getIdentity(), () -> AcidUtils.getAcidState(
path,
configuration,
validWriteIds.orElseThrow(() -> new IllegalStateException("No validWriteIds present")),
false,
true));
if (isFullAcid) {
// From Hive version >= 3.0, delta/base files will always have file '_orc_acid_version' with value >= '2'.
Path baseOrDeltaPath = directory.getBaseDirectory() != null
? directory.getBaseDirectory()
: (directory.getCurrentDirectories().size() > 0 ? directory.getCurrentDirectories().get(0).getPath() : null);
if (baseOrDeltaPath != null && AcidUtils.OrcAcidVersion.getAcidVersionFromMetaFile(baseOrDeltaPath, fs) >= 2) {
// Trino cannot read ORC ACID tables with version < 2 (written by Hive older than 3.0)
// See https://github.com/trinodb/trino/issues/2790#issuecomment-591901728 for more context
// We perform initial version check based on _orc_acid_version file here.
// If we cannot verify the version (the _orc_acid_version file may not exist),
// we will do extra check based on ORC datafile metadata in OrcPageSourceFactory.
acidInfoBuilder.setOrcAcidVersionValidated(true);
}
}
readPaths = new ArrayList<>();
// base
if (directory.getBaseDirectory() != null) {
readPaths.add(directory.getBaseDirectory());
}
// delta directories
for (AcidUtils.ParsedDelta delta : directory.getCurrentDirectories()) {
if (!delta.isDeleteDelta()) {
readPaths.add(delta.getPath());
}
}
// Create a registry of delete_delta directories for the partition
for (AcidUtils.ParsedDelta delta : directory.getCurrentDirectories()) {
if (delta.isDeleteDelta()) {
if (!isFullAcid) {
throw new TrinoException(HIVE_BAD_DATA, format(
"Unexpected delete delta for a non full ACID table '%s'. Would be ignored by the reader: %s",
table.getSchemaTableName(),
delta.getPath()));
}
acidInfoBuilder.addDeleteDelta(delta.getPath());
}
}
// initialize original files status list if present
fileStatusOriginalFiles = directory.getOriginalFiles();
for (HdfsFileStatusWithId hdfsFileStatusWithId : fileStatusOriginalFiles) {
Path originalFilePath = hdfsFileStatusWithId.getFileStatus().getPath();
long originalFileLength = hdfsFileStatusWithId.getFileStatus().getLen();
if (originalFileLength == 0) {
continue;
}
// Hive requires "original" files of transactional tables to conform to the bucketed tables naming pattern, to match them with delete deltas.
int bucketId = getRequiredBucketNumber(originalFilePath);
acidInfoBuilder.addOriginalFile(originalFilePath, originalFileLength, bucketId);
}
}
else {
// TODO https://github.com/trinodb/trino/issues/7603 - we should not referece acidInfoBuilder at allwhen we are not reading from non-ACID table
acidInfoBuilder.setOrcAcidVersionValidated(true); // no ACID; no further validation needed
readPaths = ImmutableList.of(path);
}
// Bucketed partitions are fully loaded immediately since all files must be loaded to determine the file to bucket mapping
if (tableBucketInfo.isPresent()) {
ListenableFuture lastResult = immediateVoidFuture(); // TODO document in addToQueue() that it is sufficient to hold on to last returned future
for (Path readPath : readPaths) {
// list all files in the partition
List files = new ArrayList<>();
try {
Iterators.addAll(files, new HiveFileIterator(table, readPath, fs, directoryLister, namenodeStats, FAIL, ignoreAbsentPartitions));
}
catch (HiveFileIterator.NestedDirectoryNotAllowedException e) {
// Fail here to be on the safe side. This seems to be the same as what Hive does
throw new TrinoException(
HIVE_INVALID_BUCKET_FILES,
format("Hive table '%s' is corrupt. Found sub-directory '%s' in bucket directory for partition: %s",
table.getSchemaTableName(),
e.getNestedDirectoryPath(),
splitFactory.getPartitionName()));
}
Optional acidInfo = isFullAcid ? acidInfoBuilder.build() : Optional.empty();
lastResult = hiveSplitSource.addToQueue(getBucketedSplits(files, splitFactory, tableBucketInfo.get(), bucketConversion, splittable, acidInfo));
}
for (HdfsFileStatusWithId hdfsFileStatusWithId : fileStatusOriginalFiles) {
List locatedFileStatuses = ImmutableList.of((LocatedFileStatus) hdfsFileStatusWithId.getFileStatus());
Optional acidInfo = isFullAcid
? Optional.of(acidInfoBuilder.buildWithRequiredOriginalFiles(getRequiredBucketNumber(hdfsFileStatusWithId.getFileStatus().getPath())))
: Optional.empty();
lastResult = hiveSplitSource.addToQueue(getBucketedSplits(locatedFileStatuses, splitFactory, tableBucketInfo.get(), bucketConversion, splittable, acidInfo));
}
return lastResult;
}
for (Path readPath : readPaths) {
Optional acidInfo = isFullAcid ? acidInfoBuilder.build() : Optional.empty();
fileIterators.addLast(createInternalHiveSplitIterator(readPath, fs, splitFactory, splittable, acidInfo));
}
if (!fileStatusOriginalFiles.isEmpty()) {
fileIterators.addLast(generateOriginalFilesSplits(splitFactory, fileStatusOriginalFiles, splittable, acidInfoBuilder, isFullAcid));
}
return COMPLETED_FUTURE;
}
private ListenableFuture createHiveSymlinkSplits(
String partitionName,
InputFormat, ?> targetInputFormat,
Properties schema,
List partitionKeys,
TupleDomain effectivePredicate,
BooleanSupplier partitionMatchSupplier,
boolean s3SelectPushdownEnabled,
TableToPartitionMapping tableToPartitionMapping,
List targetPaths)
throws IOException
{
ListenableFuture lastResult = COMPLETED_FUTURE;
for (Path targetPath : targetPaths) {
// the splits must be generated using the file system for the target path
// get the configuration for the target path -- it may be a different hdfs instance
FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(hdfsContext, targetPath);
JobConf targetJob = toJobConf(targetFilesystem.getConf());
targetJob.setInputFormat(TextInputFormat.class);
Optional principal = hdfsContext.getIdentity().getPrincipal();
if (principal.isPresent()) {
targetJob.set(MRConfig.FRAMEWORK_NAME, MRConfig.CLASSIC_FRAMEWORK_NAME);
targetJob.set(MRConfig.MASTER_USER_NAME, principal.get().getName());
}
if (targetInputFormat instanceof JobConfigurable) {
((JobConfigurable) targetInputFormat).configure(targetJob);
}
FileInputFormat.setInputPaths(targetJob, targetPath);
InputSplit[] targetSplits = hdfsEnvironment.doAs(
hdfsContext.getIdentity(),
() -> targetInputFormat.getSplits(targetJob, 0));
InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(
targetFilesystem,
partitionName,
targetInputFormat,
schema,
partitionKeys,
effectivePredicate,
partitionMatchSupplier,
tableToPartitionMapping,
Optional.empty(),
Optional.empty(),
getMaxInitialSplitSize(session),
isForceLocalScheduling(session),
s3SelectPushdownEnabled,
transaction,
maxSplitFileSize);
lastResult = addSplitsToSource(targetSplits, splitFactory);
if (stopped) {
return COMPLETED_FUTURE;
}
}
return lastResult;
}
@VisibleForTesting
Optional> buildManifestFileIterator(
InputFormat, ?> targetInputFormat,
String partitionName,
Properties schema,
List partitionKeys,
TupleDomain effectivePredicate,
BooleanSupplier partitionMatchSupplier,
boolean s3SelectPushdownEnabled,
TableToPartitionMapping tableToPartitionMapping,
Path parent,
List paths,
boolean splittable)
throws IOException
{
FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(hdfsContext, parent);
Map fileStatuses = new HashMap<>();
HiveFileIterator fileStatusIterator = new HiveFileIterator(table, parent, targetFilesystem, directoryLister, namenodeStats, IGNORED, false);
fileStatusIterator.forEachRemaining(status -> fileStatuses.put(getPathWithoutSchemeAndAuthority(status.getPath()), status));
List locatedFileStatuses = new ArrayList<>();
for (Path path : paths) {
LocatedFileStatus status = fileStatuses.get(getPathWithoutSchemeAndAuthority(path));
// This check will catch all directories in the manifest since HiveFileIterator will not return any directories.
// Some files may not be listed by HiveFileIterator - if those are included in the manifest this check will fail as well.
if (status == null) {
return Optional.empty();
}
locatedFileStatuses.add(status);
}
InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(
targetFilesystem,
partitionName,
targetInputFormat,
schema,
partitionKeys,
effectivePredicate,
partitionMatchSupplier,
tableToPartitionMapping,
Optional.empty(),
Optional.empty(),
getMaxInitialSplitSize(session),
isForceLocalScheduling(session),
s3SelectPushdownEnabled,
transaction,
maxSplitFileSize);
return Optional.of(locatedFileStatuses.stream()
.map(locatedFileStatus -> splitFactory.createInternalHiveSplit(locatedFileStatus, OptionalInt.empty(), splittable, Optional.empty()))
.filter(Optional::isPresent)
.map(Optional::get)
.iterator());
}
private Iterator generateOriginalFilesSplits(
InternalHiveSplitFactory splitFactory,
List originalFileLocations,
boolean splittable,
AcidInfo.Builder acidInfoBuilder,
boolean isFullAcid)
{
return originalFileLocations.stream()
.map(HdfsFileStatusWithId::getFileStatus)
.map(fileStatus -> {
Optional acidInfo = isFullAcid
? Optional.of(acidInfoBuilder.buildWithRequiredOriginalFiles(getRequiredBucketNumber(fileStatus.getPath())))
: Optional.empty();
return splitFactory.createInternalHiveSplit(
(LocatedFileStatus) fileStatus,
OptionalInt.empty(),
splittable,
acidInfo);
})
.filter(Optional::isPresent)
.map(Optional::get)
.iterator();
}
private ListenableFuture addSplitsToSource(InputSplit[] targetSplits, InternalHiveSplitFactory splitFactory)
throws IOException
{
ListenableFuture lastResult = COMPLETED_FUTURE;
for (InputSplit inputSplit : targetSplits) {
Optional internalHiveSplit = splitFactory.createInternalHiveSplit((FileSplit) inputSplit);
if (internalHiveSplit.isPresent()) {
lastResult = hiveSplitSource.addToQueue(internalHiveSplit.get());
}
if (stopped) {
return COMPLETED_FUTURE;
}
}
return lastResult;
}
private static boolean shouldUseFileSplitsFromInputFormat(InputFormat, ?> inputFormat)
{
return Arrays.stream(inputFormat.getClass().getAnnotations())
.map(Annotation::annotationType)
.map(Class::getSimpleName)
.anyMatch(name -> name.equals("UseFileSplitsFromInputFormat"));
}
private Iterator createInternalHiveSplitIterator(Path path, FileSystem fileSystem, InternalHiveSplitFactory splitFactory, boolean splittable, Optional acidInfo)
{
return Streams.stream(new HiveFileIterator(table, path, fileSystem, directoryLister, namenodeStats, recursiveDirWalkerEnabled ? RECURSE : IGNORED, ignoreAbsentPartitions))
.map(status -> splitFactory.createInternalHiveSplit(status, OptionalInt.empty(), splittable, acidInfo))
.filter(Optional::isPresent)
.map(Optional::get)
.iterator();
}
private List getBucketedSplits(
List files,
InternalHiveSplitFactory splitFactory,
BucketSplitInfo bucketSplitInfo,
Optional bucketConversion,
boolean splittable,
Optional acidInfo)
{
int readBucketCount = bucketSplitInfo.getReadBucketCount();
int tableBucketCount = bucketSplitInfo.getTableBucketCount();
int partitionBucketCount = bucketConversion.map(BucketConversion::getPartitionBucketCount).orElse(tableBucketCount);
int bucketCount = max(readBucketCount, partitionBucketCount);
// build mapping of file name to bucket
ListMultimap bucketFiles = ArrayListMultimap.create();
for (LocatedFileStatus file : files) {
String fileName = file.getPath().getName();
OptionalInt bucket = getBucketNumber(fileName);
if (bucket.isPresent()) {
bucketFiles.put(bucket.getAsInt(), file);
continue;
}
// legacy mode requires exactly one file per bucket
if (files.size() != partitionBucketCount) {
throw new TrinoException(HIVE_INVALID_BUCKET_FILES, format(
"Hive table '%s' is corrupt. File '%s' does not match the standard naming pattern, and the number " +
"of files in the directory (%s) does not match the declared bucket count (%s) for partition: %s",
table.getSchemaTableName(),
fileName,
files.size(),
partitionBucketCount,
splitFactory.getPartitionName()));
}
// sort FileStatus objects per `org.apache.hadoop.hive.ql.metadata.Table#getSortedPaths()`
files.sort(null);
// use position in sorted list as the bucket number
bucketFiles.clear();
for (int i = 0; i < files.size(); i++) {
bucketFiles.put(i, files.get(i));
}
break;
}
validateFileBuckets(bucketFiles, partitionBucketCount, table.getSchemaTableName().toString(), splitFactory.getPartitionName());
// convert files internal splits
List splitList = new ArrayList<>();
for (int bucketNumber = 0; bucketNumber < bucketCount; bucketNumber++) {
// Physical bucket #. This determine file name. It also determines the order of splits in the result.
int partitionBucketNumber = bucketNumber % partitionBucketCount;
// Logical bucket #. Each logical bucket corresponds to a "bucket" from engine's perspective.
int readBucketNumber = bucketNumber % readBucketCount;
boolean containsEligibleTableBucket = false;
boolean containsIneligibleTableBucket = false;
for (int tableBucketNumber = bucketNumber % tableBucketCount; tableBucketNumber < tableBucketCount; tableBucketNumber += bucketCount) {
// table bucket number: this is used for evaluating "$bucket" filters.
if (bucketSplitInfo.isTableBucketEnabled(tableBucketNumber)) {
containsEligibleTableBucket = true;
}
else {
containsIneligibleTableBucket = true;
}
}
if (containsEligibleTableBucket && containsIneligibleTableBucket) {
throw new TrinoException(
NOT_SUPPORTED,
"The bucket filter cannot be satisfied. There are restrictions on the bucket filter when all the following is true: " +
"1. a table has a different buckets count as at least one of its partitions that is read in this query; " +
"2. the table has a different but compatible bucket number with another table in the query; " +
"3. some buckets of the table is filtered out from the query, most likely using a filter on \"$bucket\". " +
"(table name: " + table.getTableName() + ", table bucket count: " + tableBucketCount + ", " +
"partition bucket count: " + partitionBucketCount + ", effective reading bucket count: " + readBucketCount + ")");
}
if (containsEligibleTableBucket) {
for (LocatedFileStatus file : bucketFiles.get(partitionBucketNumber)) {
// OrcDeletedRows will load only delete delta files matching current bucket id,
// so we can pass all delete delta locations here, without filtering.
splitFactory.createInternalHiveSplit(file, OptionalInt.of(readBucketNumber), splittable, acidInfo)
.ifPresent(splitList::add);
}
}
}
return splitList;
}
@VisibleForTesting
static void validateFileBuckets(ListMultimap bucketFiles, int partitionBucketCount, String tableName, String partitionName)
{
if (bucketFiles.isEmpty()) {
return;
}
int highestBucketNumber = max(bucketFiles.keySet());
// validate the bucket number detected from files, fail the query if the highest bucket number detected from file
// exceeds the allowed highest number
if (highestBucketNumber >= partitionBucketCount) {
throw new TrinoException(HIVE_INVALID_BUCKET_FILES, format(
"Hive table '%s' is corrupt. The highest bucket number in the directory (%s) exceeds the bucket number range " +
"defined by the declared bucket count (%s) for partition: %s",
tableName,
highestBucketNumber,
partitionBucketCount,
partitionName));
}
}
private static int getRequiredBucketNumber(Path path)
{
return getBucketNumber(path.getName())
.orElseThrow(() -> new IllegalStateException("Cannot get bucket number from path: " + path));
}
@VisibleForTesting
static OptionalInt getBucketNumber(String name)
{
for (Pattern pattern : BUCKET_PATTERNS) {
Matcher matcher = pattern.matcher(name);
if (matcher.matches()) {
return OptionalInt.of(parseInt(matcher.group(1)));
}
}
return OptionalInt.empty();
}
public static boolean hasAttemptId(String bucketFilename)
{
Matcher matcher = BUCKET_WITH_OPTIONAL_ATTEMPT_ID_PATTERN.matcher(bucketFilename);
return matcher.matches() && matcher.group(2) != null;
}
private static List getTargetPathsFromSymlink(FileSystem fileSystem, Path symlinkDir)
{
try {
FileStatus[] symlinks = fileSystem.listStatus(symlinkDir, HIDDEN_FILES_PATH_FILTER);
List targets = new ArrayList<>();
for (FileStatus symlink : symlinks) {
try (BufferedReader reader = new BufferedReader(new InputStreamReader(fileSystem.open(symlink.getPath()), StandardCharsets.UTF_8))) {
CharStreams.readLines(reader).stream()
.map(Path::new)
.forEach(targets::add);
}
}
return targets;
}
catch (IOException e) {
throw new TrinoException(HIVE_BAD_DATA, "Error parsing symlinks from: " + symlinkDir, e);
}
}
private static List getPartitionKeys(Table table, Optional partition)
{
if (partition.isEmpty()) {
return ImmutableList.of();
}
ImmutableList.Builder partitionKeys = ImmutableList.builder();
List keys = table.getPartitionColumns();
List values = partition.get().getValues();
checkCondition(keys.size() == values.size(), HIVE_INVALID_METADATA, "Expected %s partition key values, but got %s", keys.size(), values.size());
for (int i = 0; i < keys.size(); i++) {
String name = keys.get(i).getName();
HiveType hiveType = keys.get(i).getType();
if (!hiveType.isSupportedType(table.getStorage().getStorageFormat())) {
throw new TrinoException(NOT_SUPPORTED, format("Unsupported Hive type %s found in partition keys of table %s.%s", hiveType, table.getDatabaseName(), table.getTableName()));
}
String value = values.get(i);
checkCondition(value != null, HIVE_INVALID_PARTITION_VALUE, "partition key value cannot be null for field: %s", name);
partitionKeys.add(new HivePartitionKey(name, value));
}
return partitionKeys.build();
}
private static Properties getPartitionSchema(Table table, Optional partition)
{
if (partition.isEmpty()) {
return getHiveSchema(table);
}
return getHiveSchema(partition.get(), table);
}
public static class BucketSplitInfo
{
private final BucketingVersion bucketingVersion;
private final List bucketColumns;
private final int tableBucketCount;
private final int readBucketCount;
private final IntPredicate bucketFilter;
public static Optional createBucketSplitInfo(Optional bucketHandle, Optional bucketFilter)
{
requireNonNull(bucketHandle, "bucketHandle is null");
requireNonNull(bucketFilter, "bucketFilter is null");
if (bucketHandle.isEmpty()) {
checkArgument(bucketFilter.isEmpty(), "bucketHandle must be present if bucketFilter is present");
return Optional.empty();
}
BucketingVersion bucketingVersion = bucketHandle.get().getBucketingVersion();
int tableBucketCount = bucketHandle.get().getTableBucketCount();
int readBucketCount = bucketHandle.get().getReadBucketCount();
if (tableBucketCount != readBucketCount && bucketFilter.isPresent()) {
// TODO: remove when supported
throw new TrinoException(NOT_SUPPORTED, "Filter on \"$bucket\" is not supported when the table has partitions with different bucket counts");
}
List bucketColumns = bucketHandle.get().getColumns();
IntPredicate predicate = bucketFilter
.map(filter -> filter.getBucketsToKeep()::contains)
.orElse(bucket -> true);
return Optional.of(new BucketSplitInfo(bucketingVersion, bucketColumns, tableBucketCount, readBucketCount, predicate));
}
private BucketSplitInfo(BucketingVersion bucketingVersion, List bucketColumns, int tableBucketCount, int readBucketCount, IntPredicate bucketFilter)
{
this.bucketingVersion = requireNonNull(bucketingVersion, "bucketingVersion is null");
this.bucketColumns = ImmutableList.copyOf(requireNonNull(bucketColumns, "bucketColumns is null"));
this.tableBucketCount = tableBucketCount;
this.readBucketCount = readBucketCount;
this.bucketFilter = requireNonNull(bucketFilter, "bucketFilter is null");
}
public BucketingVersion getBucketingVersion()
{
return bucketingVersion;
}
public List getBucketColumns()
{
return bucketColumns;
}
public int getTableBucketCount()
{
return tableBucketCount;
}
public int getReadBucketCount()
{
return readBucketCount;
}
/**
* Evaluates whether the provided table bucket number passes the bucket predicate.
* A bucket predicate can be present in two cases:
*
* - Filter on "$bucket" column. e.g. {@code "$bucket" between 0 and 100}
*
- Single-value equality filter on all bucket columns. e.g. for a table with two bucketing columns,
* {@code bucketCol1 = 'a' AND bucketCol2 = 123}
*
*/
public boolean isTableBucketEnabled(int tableBucketNumber)
{
return bucketFilter.test(tableBucketNumber);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy