/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.hive;
import com.facebook.presto.common.Page;
import com.facebook.presto.common.block.Block;
import com.facebook.presto.common.predicate.Domain;
import com.facebook.presto.common.predicate.NullableValue;
import com.facebook.presto.common.predicate.TupleDomain;
import com.facebook.presto.common.predicate.ValueSet;
import com.facebook.presto.common.type.Type;
import com.facebook.presto.hive.metastore.Column;
import com.facebook.presto.hive.metastore.Table;
import com.facebook.presto.spi.ColumnHandle;
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.PrestoException;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.primitives.Shorts;
import com.google.common.primitives.SignedBytes;
import io.airlift.slice.Slice;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import static com.facebook.presto.common.type.TypeUtils.hashPosition;
import static com.facebook.presto.hive.BucketFunctionType.HIVE_COMPATIBLE;
import static com.facebook.presto.hive.HiveColumnHandle.BUCKET_COLUMN_NAME;
import static com.facebook.presto.hive.HiveErrorCode.HIVE_INVALID_METADATA;
import static com.facebook.presto.hive.HiveSessionProperties.getCteVirtualBucketCount;
import static com.facebook.presto.hive.HiveUtil.getRegularColumnHandles;
import static com.facebook.presto.hive.metastore.PrestoTableType.TEMPORARY_TABLE;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static io.airlift.slice.Slices.utf8Slice;
import static java.lang.Double.doubleToLongBits;
import static java.lang.Math.toIntExact;
import static java.lang.String.format;
import static java.util.Map.Entry;
import static java.util.Objects.requireNonNull;
import static java.util.function.Function.identity;
import static org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
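/**
 * Static utilities for computing Hive-compatible bucket numbers for rows and for
 * deriving bucket pruning filters from predicates. The hashing below deliberately
 * mirrors Hive's ObjectInspectorUtils.hashCode so that Presto and Hive agree on
 * bucket assignment.
 */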
public final class HiveBucketing
{
    private static final Set<HiveType> SUPPORTED_TYPES_FOR_BUCKET_FILTER = ImmutableSet.of(
HiveType.HIVE_BYTE,
HiveType.HIVE_SHORT,
HiveType.HIVE_INT,
HiveType.HIVE_LONG,
HiveType.HIVE_BOOLEAN,
HiveType.HIVE_STRING);
private HiveBucketing() {}
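    /**
     * Assigns a synthetic bucket number to a file by hashing its path; as noted
     * in the comment below, this is equivalent to bucketing the table on a
     * VARCHAR column containing $path.
     */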
public static int getVirtualBucketNumber(int bucketCount, Path path)
{
// this is equivalent to bucketing the table on a VARCHAR column containing $path
return (hashBytes(0, utf8Slice(path.toString())) & Integer.MAX_VALUE) % bucketCount;
}
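    /**
     * Computes a bucket number using Presto's native type hashing (hashPosition),
     * not the Hive-compatible hash; use getHiveBucket when compatibility with
     * Hive's bucket assignment is required.
     */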
    public static int getBucket(int bucketCount, List<Type> types, Page page, int position)
{
return (getHashCode(types, page, position) & Integer.MAX_VALUE) % bucketCount;
}
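    /**
     * Computes the Hive-compatible bucket for the row at {@code position}. The page
     * is expected to carry exactly the bucketing columns. A sketch of a call, with
     * hypothetical values:
     *
     *   int bucket = getHiveBucket(32, bucketTypeInfos, page, 0, false);
     */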
    public static int getHiveBucket(int bucketCount, List<TypeInfo> types, Page page, int position, boolean useLegacyTimestampBucketing)
{
return (getBucketHashCode(types, page, position, useLegacyTimestampBucketing) & Integer.MAX_VALUE) % bucketCount;
}
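    /**
     * Row-values variant of getHiveBucket: each values[i] must already use the Java
     * representation the hash function expects (e.g. Long for integral types, Slice
     * for STRING/VARCHAR), as the casts in hash(TypeInfo, Object, boolean) show.
     */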
    public static int getHiveBucket(int bucketCount, List<TypeInfo> types, Object[] values, boolean useLegacyTimestampBucketing)
{
return (getBucketHashCode(types, values, useLegacyTimestampBucketing) & Integer.MAX_VALUE) % bucketCount;
}
    private static int getHashCode(List<Type> types, Page page, int position)
{
checkArgument(types.size() == page.getChannelCount());
int result = 0;
for (int i = 0; i < page.getChannelCount(); i++) {
int fieldHash = (int) hashPosition(types.get(i), page.getBlock(i), position);
result = result * 31 + fieldHash;
}
return result;
}
    private static int getBucketHashCode(List<TypeInfo> types, Page page, int position, boolean useLegacyTimestampBucketing)
{
checkArgument(types.size() == page.getChannelCount());
int result = 0;
for (int i = 0; i < page.getChannelCount(); i++) {
int fieldHash = hash(types.get(i), page.getBlock(i), position, useLegacyTimestampBucketing);
result = result * 31 + fieldHash;
}
return result;
}
    private static int getBucketHashCode(List<TypeInfo> types, Object[] values, boolean useLegacyTimestampBucketing)
{
checkArgument(types.size() == values.length);
int result = 0;
for (int i = 0; i < values.length; i++) {
int fieldHash = hash(types.get(i), values[i], useLegacyTimestampBucketing);
result = result * 31 + fieldHash;
}
return result;
}
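    /**
     * Hive-compatible hash of one value read from a Block; NULL hashes to 0,
     * matching Hive's behavior.
     */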
private static int hash(TypeInfo type, Block block, int position, boolean useLegacyTimestampBucketing)
{
// This function mirrors the behavior of function hashCode in
// HIVE-12025 ba83fd7bff serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java
// https://github.com/apache/hive/blob/ba83fd7bff/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java
// HIVE-7148 proposed change to bucketing hash algorithms. If that gets implemented, this function will need to change significantly.
if (block.isNull(position)) {
return 0;
}
switch (type.getCategory()) {
case PRIMITIVE: {
PrimitiveTypeInfo typeInfo = (PrimitiveTypeInfo) type;
PrimitiveCategory primitiveCategory = typeInfo.getPrimitiveCategory();
Type prestoType = requireNonNull(HiveType.getPrimitiveType(typeInfo));
switch (primitiveCategory) {
case BOOLEAN:
return prestoType.getBoolean(block, position) ? 1 : 0;
case BYTE:
return SignedBytes.checkedCast(prestoType.getLong(block, position));
case SHORT:
return Shorts.checkedCast(prestoType.getLong(block, position));
case INT:
return toIntExact(prestoType.getLong(block, position));
case LONG:
long bigintValue = prestoType.getLong(block, position);
return (int) ((bigintValue >>> 32) ^ bigintValue);
case FLOAT:
return (int) prestoType.getLong(block, position);
case DOUBLE:
long doubleValue = doubleToLongBits(prestoType.getDouble(block, position));
return (int) ((doubleValue >>> 32) ^ doubleValue);
case STRING:
return hashBytes(0, prestoType.getSlice(block, position));
case VARCHAR:
return hashBytes(1, prestoType.getSlice(block, position));
case DATE:
// day offset from 1970-01-01
long days = prestoType.getLong(block, position);
return toIntExact(days);
case TIMESTAMP:
long millisSinceEpoch = prestoType.getLong(block, position);
return getHashForTimestamp(millisSinceEpoch, useLegacyTimestampBucketing);
default:
throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive primitive category: " + primitiveCategory.toString() + ".");
}
}
case LIST: {
Block elementsBlock = block.getBlock(position);
return hashOfList((ListTypeInfo) type, elementsBlock, useLegacyTimestampBucketing);
}
case MAP: {
Block elementsBlock = block.getBlock(position);
return hashOfMap((MapTypeInfo) type, elementsBlock, useLegacyTimestampBucketing);
}
default:
// TODO: support more types, e.g. ROW
throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive category: " + type.getCategory().toString() + ".");
}
}
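    /**
     * Object-based twin of hash(TypeInfo, Block, int, boolean); the two are kept in
     * sync so that Page-based and value-based bucketing produce identical hashes.
     */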
private static int hash(TypeInfo type, Object value, boolean useLegacyTimestampBucketing)
{
if (value == null) {
return 0;
}
switch (type.getCategory()) {
case PRIMITIVE: {
PrimitiveTypeInfo typeInfo = (PrimitiveTypeInfo) type;
PrimitiveCategory primitiveCategory = typeInfo.getPrimitiveCategory();
Type prestoType = requireNonNull(HiveType.getPrimitiveType(typeInfo));
switch (primitiveCategory) {
case BOOLEAN:
return (boolean) value ? 1 : 0;
case BYTE:
return SignedBytes.checkedCast((long) value);
case SHORT:
return Shorts.checkedCast((long) value);
case INT:
return toIntExact((long) value);
case LONG:
long bigintValue = (long) value;
return (int) ((bigintValue >>> 32) ^ bigintValue);
case FLOAT:
return (int) (long) value;
case DOUBLE:
long doubleValue = doubleToLongBits((double) value);
return (int) ((doubleValue >>> 32) ^ doubleValue);
case STRING:
return hashBytes(0, (Slice) value);
case VARCHAR:
return hashBytes(1, (Slice) value);
case DATE:
// day offset from 1970-01-01
long days = (long) value;
return toIntExact(days);
case TIMESTAMP:
long millisSinceEpoch = (long) value;
return getHashForTimestamp(millisSinceEpoch, useLegacyTimestampBucketing);
default:
throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive primitive category: " + primitiveCategory.toString() + ".");
}
}
case LIST: {
return hashOfList((ListTypeInfo) type, (Block) value, useLegacyTimestampBucketing);
}
case MAP: {
return hashOfMap((MapTypeInfo) type, (Block) value, useLegacyTimestampBucketing);
}
default:
// TODO: support more types, e.g. ROW
throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive category: " + type.getCategory().toString() + ".");
}
}
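    /**
     * Mirrors Hive's timestamp hashing: seconds and the sub-second component are
     * packed into one long (seconds << 30 plus sub-seconds) and the two 32-bit
     * halves are XOR-folded. The legacy path keeps the sub-second part in
     * milliseconds; the current path scales it to nanoseconds.
     */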
private static int getHashForTimestamp(long millisSinceEpoch, boolean useLegacyTimestampBucketing)
{
if (useLegacyTimestampBucketing) {
// seconds << 30 + milliseconds
long secondsAndMillis = (Math.floorDiv(millisSinceEpoch, 1000L) << 30) + Math.floorMod(millisSinceEpoch, 1000L);
return (int) ((secondsAndMillis >>> 32) ^ secondsAndMillis);
}
// seconds << 30 + nanoseconds
long secondsAndNanos = (Math.floorDiv(millisSinceEpoch, 1000L) << 30) + Math.floorMod(millisSinceEpoch, 1000L) * 1000L * 1000L;
return (int) ((secondsAndNanos >>> 32) ^ secondsAndNanos);
}
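    // The single-map block lays out entries as alternating key/value positions; entry
    // hashes (key hash XOR value hash) are summed, so entry order does not affect the result.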
private static int hashOfMap(MapTypeInfo type, Block singleMapBlock, boolean useLegacyTimestampBucketing)
{
TypeInfo keyTypeInfo = type.getMapKeyTypeInfo();
TypeInfo valueTypeInfo = type.getMapValueTypeInfo();
int result = 0;
for (int i = 0; i < singleMapBlock.getPositionCount(); i += 2) {
result += hash(keyTypeInfo, singleMapBlock, i, useLegacyTimestampBucketing) ^ hash(valueTypeInfo, singleMapBlock, i + 1, useLegacyTimestampBucketing);
}
return result;
}
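    // 31-based polynomial hash over the list elements, so element order affects the result.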
private static int hashOfList(ListTypeInfo type, Block singleListBlock, boolean useLegacyTimestampBucketing)
{
TypeInfo elementTypeInfo = type.getListElementTypeInfo();
int result = 0;
for (int i = 0; i < singleListBlock.getPositionCount(); i++) {
result = result * 31 + hash(elementTypeInfo, singleListBlock, i, useLegacyTimestampBucketing);
}
return result;
}
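    // String.hashCode-style rollup over the raw bytes. Callers pass initialValue 0 for
    // Hive STRING and 1 for VARCHAR (see the primitive cases above), so the two types
    // hash differently.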
private static int hashBytes(int initialValue, Slice bytes)
{
int result = initialValue;
for (int i = 0; i < bytes.length(); i++) {
result = result * 31 + bytes.getByte(i);
}
return result;
}
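    /**
     * Resolves a table's declared bucketing into a HiveBucketHandle. Temporary tables
     * without an explicit bucket property get a virtual bucket handle sized by the
     * session's CTE virtual bucket count.
     */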
    public static Optional<HiveBucketHandle> getHiveBucketHandle(ConnectorSession session, Table table)
{
        Optional<HiveBucketProperty> hiveBucketProperty = table.getStorage().getBucketProperty();
if (!hiveBucketProperty.isPresent()) {
if (table.getTableType().equals(TEMPORARY_TABLE)) {
return Optional.of(HiveBucketHandle.createVirtualBucketHandle(getCteVirtualBucketCount(session)));
}
return Optional.empty();
}
        Map<String, HiveColumnHandle> map = getRegularColumnHandles(table).stream()
.collect(Collectors.toMap(HiveColumnHandle::getName, identity()));
        ImmutableList.Builder<HiveColumnHandle> bucketColumns = ImmutableList.builder();
for (String bucketColumnName : hiveBucketProperty.get().getBucketedBy()) {
HiveColumnHandle bucketColumnHandle = map.get(bucketColumnName);
if (bucketColumnHandle == null) {
throw new PrestoException(
HIVE_INVALID_METADATA,
format("Table '%s.%s' is bucketed on non-existent column '%s'", table.getDatabaseName(), table.getTableName(), bucketColumnName));
}
bucketColumns.add(bucketColumnHandle);
}
int bucketCount = hiveBucketProperty.get().getBucketCount();
return Optional.of(new HiveBucketHandle(bucketColumns.build(), bucketCount, bucketCount));
}
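    /**
     * Attempts to derive a bucket filter from the effective predicate; only tables
     * bucketed with the HIVE_COMPATIBLE bucket function are eligible.
     */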
    public static Optional<HiveBucketFilter> getHiveBucketFilter(Table table, TupleDomain<ColumnHandle> effectivePredicate, boolean useLegacyTimestampBucketing)
{
return getHiveBucketFilter(table.getStorage().getBucketProperty(), table.getDataColumns(), effectivePredicate, useLegacyTimestampBucketing);
}
    public static Optional<HiveBucketFilter> getHiveBucketFilter(
            Optional<HiveBucketProperty> hiveBucketProperty,
            List<Column> dataColumns,
            TupleDomain<ColumnHandle> effectivePredicate,
            boolean useLegacyTimestampBucketing)
{
if (!hiveBucketProperty.isPresent()) {
return Optional.empty();
}
if (!hiveBucketProperty.get().getBucketFunctionType().equals(HIVE_COMPATIBLE)) {
// bucket filtering is only supported for tables bucketed with HIVE_COMPATIBLE hash function
return Optional.empty();
}
        // ... (remainder of getHiveBucketFilter and the rest of the file are truncated in the source listing)