/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.hive;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import io.airlift.log.Logger;
import io.airlift.stats.CounterStat;
import io.airlift.units.DataSize;
import io.trino.plugin.hive.InternalHiveSplit.InternalHiveBlock;
import io.trino.plugin.hive.util.AsyncQueue;
import io.trino.plugin.hive.util.AsyncQueue.BorrowResult;
import io.trino.plugin.hive.util.ThrottledAsyncQueue;
import io.trino.spi.TrinoException;
import io.trino.spi.connector.ConnectorPartitionHandle;
import io.trino.spi.connector.ConnectorSession;
import io.trino.spi.connector.ConnectorSplit;
import io.trino.spi.connector.ConnectorSplitSource;
import java.io.FileNotFoundException;
import java.util.List;
import java.util.Map;
import java.util.OptionalInt;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executor;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;
import java.util.function.Predicate;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.util.concurrent.Futures.immediateFuture;
import static com.google.common.util.concurrent.MoreExecutors.directExecutor;
import static io.airlift.concurrent.MoreFutures.failedFuture;
import static io.airlift.concurrent.MoreFutures.toCompletableFuture;
import static io.airlift.units.DataSize.succinctBytes;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_EXCEEDED_SPLIT_BUFFERING_LIMIT;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_FILE_NOT_FOUND;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_UNKNOWN_ERROR;
import static io.trino.plugin.hive.HiveSessionProperties.getMaxInitialSplitSize;
import static io.trino.plugin.hive.HiveSessionProperties.getMaxSplitSize;
import static io.trino.plugin.hive.HiveSplitSource.StateKind.CLOSED;
import static io.trino.plugin.hive.HiveSplitSource.StateKind.FAILED;
import static io.trino.plugin.hive.HiveSplitSource.StateKind.INITIAL;
import static io.trino.plugin.hive.HiveSplitSource.StateKind.NO_MORE_SPLITS;
import static io.trino.spi.connector.NotPartitionedPartitionHandle.NOT_PARTITIONED;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
class HiveSplitSource
implements ConnectorSplitSource
{
private static final Logger log = Logger.get(HiveSplitSource.class);
private final String queryId;
private final String databaseName;
private final String tableName;
private final PerBucket queues;
private final AtomicInteger bufferedInternalSplitCount = new AtomicInteger();
private final long maxOutstandingSplitsBytes;
private final DataSize maxSplitSize;
private final DataSize maxInitialSplitSize;
private final AtomicInteger remainingInitialSplits;
private final HiveSplitLoader splitLoader;
private final AtomicReference<State> stateReference;
private final AtomicLong estimatedSplitSizeInBytes = new AtomicLong();
private final CounterStat highMemorySplitSourceCounter;
private final AtomicBoolean loggedHighMemoryWarning = new AtomicBoolean();
private HiveSplitSource(
ConnectorSession session,
String databaseName,
String tableName,
PerBucket queues,
int maxInitialSplits,
DataSize maxOutstandingSplitsSize,
HiveSplitLoader splitLoader,
AtomicReference<State> stateReference,
CounterStat highMemorySplitSourceCounter)
{
requireNonNull(session, "session is null");
this.queryId = session.getQueryId();
this.databaseName = requireNonNull(databaseName, "databaseName is null");
this.tableName = requireNonNull(tableName, "tableName is null");
this.queues = requireNonNull(queues, "queues is null");
this.maxOutstandingSplitsBytes = requireNonNull(maxOutstandingSplitsSize, "maxOutstandingSplitsSize is null").toBytes();
this.splitLoader = requireNonNull(splitLoader, "splitLoader is null");
this.stateReference = requireNonNull(stateReference, "stateReference is null");
this.highMemorySplitSourceCounter = requireNonNull(highMemorySplitSourceCounter, "highMemorySplitSourceCounter is null");
this.maxSplitSize = getMaxSplitSize(session);
this.maxInitialSplitSize = getMaxInitialSplitSize(session);
this.remainingInitialSplits = new AtomicInteger(maxInitialSplits);
}
public static HiveSplitSource allAtOnce(
ConnectorSession session,
String databaseName,
String tableName,
int maxInitialSplits,
int maxOutstandingSplits,
DataSize maxOutstandingSplitsSize,
int maxSplitsPerSecond,
HiveSplitLoader splitLoader,
Executor executor,
CounterStat highMemorySplitSourceCounter)
{
AtomicReference<State> stateReference = new AtomicReference<>(State.initial());
return new HiveSplitSource(
session,
databaseName,
tableName,
new PerBucket()
{
private final AsyncQueue<InternalHiveSplit> queue = new ThrottledAsyncQueue<>(maxSplitsPerSecond, maxOutstandingSplits, executor);
@Override
public ListenableFuture<?> offer(OptionalInt bucketNumber, InternalHiveSplit connectorSplit)
{
// bucketNumber can be non-empty because BackgroundHiveSplitLoader does not have knowledge of the execution plan
return queue.offer(connectorSplit);
}
@Override
public <O> ListenableFuture<O> borrowBatchAsync(OptionalInt bucketNumber, int maxSize, Function<List<InternalHiveSplit>, BorrowResult<InternalHiveSplit, O>> function)
{
checkArgument(bucketNumber.isEmpty());
return queue.borrowBatchAsync(maxSize, function);
}
@Override
public void finish()
{
queue.finish();
}
@Override
public boolean isFinished(OptionalInt bucketNumber)
{
checkArgument(bucketNumber.isEmpty());
return queue.isFinished();
}
},
maxInitialSplits,
maxOutstandingSplitsSize,
splitLoader,
stateReference,
highMemorySplitSourceCounter);
}
public static HiveSplitSource bucketed(
ConnectorSession session,
String databaseName,
String tableName,
int estimatedOutstandingSplitsPerBucket,
int maxInitialSplits,
DataSize maxOutstandingSplitsSize,
int maxSplitsPerSecond,
HiveSplitLoader splitLoader,
Executor executor,
CounterStat highMemorySplitSourceCounter)
{
AtomicReference<State> stateReference = new AtomicReference<>(State.initial());
return new HiveSplitSource(
session,
databaseName,
tableName,
new PerBucket()
{
private final Map<Integer, AsyncQueue<InternalHiveSplit>> queues = new ConcurrentHashMap<>();
private final AtomicBoolean finished = new AtomicBoolean();
@Override
public ListenableFuture<?> offer(OptionalInt bucketNumber, InternalHiveSplit connectorSplit)
{
AsyncQueue<InternalHiveSplit> queue = queueFor(bucketNumber);
queue.offer(connectorSplit);
// Do not block "offer" when running split discovery in bucketed mode.
// A limit is enforced on estimatedSplitSizeInBytes.
return immediateFuture(null);
}
@Override
public <O> ListenableFuture<O> borrowBatchAsync(OptionalInt bucketNumber, int maxSize, Function<List<InternalHiveSplit>, BorrowResult<InternalHiveSplit, O>> function)
{
return queueFor(bucketNumber).borrowBatchAsync(maxSize, function);
}
@Override
public void finish()
{
if (finished.compareAndSet(false, true)) {
queues.values().forEach(AsyncQueue::finish);
}
}
@Override
public boolean isFinished(OptionalInt bucketNumber)
{
return queueFor(bucketNumber).isFinished();
}
public AsyncQueue<InternalHiveSplit> queueFor(OptionalInt bucketNumber)
{
checkArgument(bucketNumber.isPresent());
AtomicBoolean isNew = new AtomicBoolean();
AsyncQueue<InternalHiveSplit> queue = queues.computeIfAbsent(bucketNumber.getAsInt(), ignored -> {
isNew.set(true);
return new ThrottledAsyncQueue<>(maxSplitsPerSecond, estimatedOutstandingSplitsPerBucket, executor);
});
if (isNew.get() && finished.get()) {
// Check `finished` and invoke `queue.finish` after the `queue` is added to the map.
// Otherwise, `queue.finish` may not be invoked if `finished` is set while the lambda above is being evaluated.
queue.finish();
}
return queue;
}
},
maxInitialSplits,
maxOutstandingSplitsSize,
splitLoader,
stateReference,
highMemorySplitSourceCounter);
}
/**
* The upper bound of outstanding split count.
* It might be larger than the actual number when called concurrently with other methods.
*/
@VisibleForTesting
int getBufferedInternalSplitCount()
{
return bufferedInternalSplitCount.get();
}
ListenableFuture<?> addToQueue(List<? extends InternalHiveSplit> splits)
{
ListenableFuture<?> lastResult = immediateFuture(null);
for (InternalHiveSplit split : splits) {
lastResult = addToQueue(split);
}
return lastResult;
}
ListenableFuture<?> addToQueue(InternalHiveSplit split)
{
if (stateReference.get().getKind() != INITIAL) {
return immediateFuture(null);
}
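// Memory accounting: the estimated in-memory size of every buffered split is summed up, and the
// query is failed if the buffered splits collectively exceed the configured maxOutstandingSplitsSize.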
if (estimatedSplitSizeInBytes.addAndGet(split.getEstimatedSizeInBytes()) > maxOutstandingSplitsBytes) {
// TODO: investigate alternative split discovery strategies when this error is hit.
// This limit should never be hit given there is a limit of maxOutstandingSplits.
// If it's hit, it means individual splits are huge.
if (loggedHighMemoryWarning.compareAndSet(false, true)) {
highMemorySplitSourceCounter.update(1);
log.warn("Split buffering for %s.%s in query %s exceeded memory limit (%s). %s splits are buffered.",
databaseName, tableName, queryId, succinctBytes(maxOutstandingSplitsBytes), getBufferedInternalSplitCount());
}
throw new TrinoException(HIVE_EXCEEDED_SPLIT_BUFFERING_LIMIT, format(
"Split buffering for %s.%s exceeded memory limit (%s). %s splits are buffered.",
databaseName, tableName, succinctBytes(maxOutstandingSplitsBytes), getBufferedInternalSplitCount()));
}
bufferedInternalSplitCount.incrementAndGet();
OptionalInt bucketNumber = split.getBucketNumber();
return queues.offer(bucketNumber, split);
}
void noMoreSplits()
{
if (setIf(stateReference, State.noMoreSplits(), state -> state.getKind() == INITIAL)) {
// Stop the split loader before finishing the queue.
// Once the queue is finished, it will always return a completed future to avoid blocking any caller.
// This could lead to a short period of busy loop in splitLoader (although unlikely in general setup).
splitLoader.stop();
queues.finish();
}
}
void fail(Throwable e)
{
// The error must be recorded before setting the finish marker to make sure
// isFinished will observe failure instead of successful completion.
// Only record the first error message.
if (setIf(stateReference, State.failed(e), state -> state.getKind() == INITIAL)) {
// Stop the split loader before finishing the queue.
// Once the queue is finished, it will always return a completed future to avoid blocking any caller.
// This could lead to a short period of busy loop in splitLoader (although unlikely in general setup).
splitLoader.stop();
queues.finish();
}
}
@Override
public CompletableFuture<ConnectorSplitBatch> getNextBatch(ConnectorPartitionHandle partitionHandle, int maxSize)
{
boolean noMoreSplits;
State state = stateReference.get();
switch (state.getKind()) {
case INITIAL:
noMoreSplits = false;
break;
case NO_MORE_SPLITS:
noMoreSplits = true;
break;
case FAILED:
return failedFuture(state.getThrowable());
case CLOSED:
throw new IllegalStateException("HiveSplitSource is already closed");
default:
throw new UnsupportedOperationException();
}
OptionalInt bucketNumber = toBucketNumber(partitionHandle);
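// Borrow up to maxSize internal splits from the queue. The lambda below converts each borrowed split
// into a ConnectorSplit to hand out, and returns any partially consumed splits so they are re-inserted
// into the queue for subsequent batches.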
ListenableFuture<List<ConnectorSplit>> future = queues.borrowBatchAsync(bucketNumber, maxSize, internalSplits -> {
ImmutableList.Builder<InternalHiveSplit> splitsToInsertBuilder = ImmutableList.builder();
ImmutableList.Builder<ConnectorSplit> resultBuilder = ImmutableList.builder();
int removedEstimatedSizeInBytes = 0;
for (InternalHiveSplit internalSplit : internalSplits) {
long maxSplitBytes = maxSplitSize.toBytes();
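// The first maxInitialSplits splits handed out are capped at the smaller initial split size,
// which ramps up parallelism quickly at the start of a query.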
if (remainingInitialSplits.get() > 0) {
if (remainingInitialSplits.getAndDecrement() > 0) {
maxSplitBytes = maxInitialSplitSize.toBytes();
}
}
InternalHiveBlock block = internalSplit.currentBlock();
long splitBytes;
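// A splittable split is carved out of the current block in chunks of at most maxSplitBytes.
// When between one and two chunks remain, the remainder is cut into two even halves instead:
// for example, with a 64MB max split size and 100MB left, this yields two 50MB splits rather than 64MB + 36MB.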
if (internalSplit.isSplittable()) {
long remainingBlockBytes = block.getEnd() - internalSplit.getStart();
if (remainingBlockBytes <= maxSplitBytes) {
splitBytes = remainingBlockBytes;
}
else if (maxSplitBytes * 2 >= remainingBlockBytes) {
// Second to last split in this block, generate two evenly sized splits
splitBytes = remainingBlockBytes / 2;
}
else {
splitBytes = maxSplitBytes;
}
}
else {
splitBytes = internalSplit.getEnd() - internalSplit.getStart();
}
resultBuilder.add(new HiveSplit(
databaseName,
tableName,
internalSplit.getPartitionName(),
internalSplit.getPath(),
internalSplit.getStart(),
splitBytes,
internalSplit.getEstimatedFileSize(),
internalSplit.getFileModifiedTime(),
internalSplit.getSchema(),
internalSplit.getPartitionKeys(),
block.getAddresses(),
internalSplit.getBucketNumber(),
internalSplit.getStatementId(),
internalSplit.isForceLocalScheduling(),
internalSplit.getTableToPartitionMapping(),
internalSplit.getBucketConversion(),
internalSplit.getBucketValidation(),
internalSplit.isS3SelectPushdownEnabled(),
internalSplit.getAcidInfo()));
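// Advance past the portion just handed out. A fully consumed split releases its memory estimate;
// an unfinished one is returned to the queue for a later batch.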
internalSplit.increaseStart(splitBytes);
if (internalSplit.isDone()) {
removedEstimatedSizeInBytes += internalSplit.getEstimatedSizeInBytes();
}
else {
splitsToInsertBuilder.add(internalSplit);
}
}
estimatedSplitSizeInBytes.addAndGet(-removedEstimatedSizeInBytes);
List<InternalHiveSplit> splitsToInsert = splitsToInsertBuilder.build();
List<ConnectorSplit> result = resultBuilder.build();
bufferedInternalSplitCount.addAndGet(splitsToInsert.size() - result.size());
return new AsyncQueue.BorrowResult<>(splitsToInsert, result);
});
ListenableFuture<ConnectorSplitBatch> transform = Futures.transform(future, splits -> {
requireNonNull(splits, "splits is null");
if (noMoreSplits) {
// Checking splits.isEmpty() here is required for thread safety.
// Let's say there are 10 splits left, and max number of splits per batch is 5.
// The futures constructed in two getNextBatch calls could each fetch 5, resulting in zero splits left.
// After fetching the splits, both futures reach this line at the same time.
// Without the isEmpty check, both will claim they are the last.
// Side note 1: In such a case, it doesn't actually matter which one gets to claim it's the last.
// But having both claim they are the last would be a surprising behavior.
// Side note 2: One could argue that the isEmpty check is overly conservative.
// The caller of getNextBatch will likely need to make an extra invocation.
// But an extra invocation likely doesn't matter.
return new ConnectorSplitBatch(splits, splits.isEmpty() && queues.isFinished(bucketNumber));
}
else {
return new ConnectorSplitBatch(splits, false);
}
}, directExecutor());
return toCompletableFuture(transform);
}
@Override
public boolean isFinished()
{
State state = stateReference.get();
switch (state.getKind()) {
case INITIAL:
return false;
case NO_MORE_SPLITS:
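// All splits have been loaded; the source is finished once the buffer has been fully drained.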
return bufferedInternalSplitCount.get() == 0;
case FAILED:
throw propagateTrinoException(state.getThrowable());
case CLOSED:
throw new IllegalStateException("HiveSplitSource is already closed");
default:
throw new UnsupportedOperationException();
}
}
@Override
public void close()
{
if (setIf(stateReference, State.closed(), state -> state.getKind() == INITIAL || state.getKind() == NO_MORE_SPLITS)) {
// Stop the split loader before finishing the queue.
// Once the queue is finished, it will always return a completed future to avoid blocking any caller.
// This could lead to a short period of busy loop in splitLoader (although unlikely in general setup).
splitLoader.stop();
queues.finish();
}
}
private static OptionalInt toBucketNumber(ConnectorPartitionHandle partitionHandle)
{
if (partitionHandle == NOT_PARTITIONED) {
return OptionalInt.empty();
}
return OptionalInt.of(((HivePartitionHandle) partitionHandle).getBucket());
}
private static <T> boolean setIf(AtomicReference<T> atomicReference, T newValue, Predicate<T> predicate)
{
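// Compare-and-set retry loop: replace the current value only while it still satisfies the predicate.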
while (true) {
T current = atomicReference.get();
if (!predicate.test(current)) {
return false;
}
if (atomicReference.compareAndSet(current, newValue)) {
return true;
}
}
}
private static RuntimeException propagateTrinoException(Throwable throwable)
{
if (throwable instanceof TrinoException) {
throw (TrinoException) throwable;
}
if (throwable instanceof FileNotFoundException) {
throw new TrinoException(HIVE_FILE_NOT_FOUND, throwable);
}
throw new TrinoException(HIVE_UNKNOWN_ERROR, throwable);
}
interface PerBucket
{
ListenableFuture<?> offer(OptionalInt bucketNumber, InternalHiveSplit split);
<O> ListenableFuture<O> borrowBatchAsync(OptionalInt bucketNumber, int maxSize, Function<List<InternalHiveSplit>, BorrowResult<InternalHiveSplit, O>> function);
void finish();
boolean isFinished(OptionalInt bucketNumber);
}
static class State
{
private final StateKind kind;
private final Throwable throwable;
private State(StateKind kind, Throwable throwable)
{
this.kind = kind;
this.throwable = throwable;
}
public StateKind getKind()
{
return kind;
}
public Throwable getThrowable()
{
checkState(throwable != null);
return throwable;
}
public static State initial()
{
return new State(INITIAL, null);
}
public static State noMoreSplits()
{
return new State(NO_MORE_SPLITS, null);
}
public static State failed(Throwable throwable)
{
return new State(FAILED, throwable);
}
public static State closed()
{
return new State(CLOSED, null);
}
}
enum StateKind
{
INITIAL,
NO_MORE_SPLITS,
FAILED,
CLOSED,
}
}