org.apache.iceberg.transforms.Bucket Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of iceberg-api Show documentation
Show all versions of iceberg-api Show documentation
A table format for huge analytic datasets
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.transforms;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import java.util.UUID;
import org.apache.iceberg.expressions.BoundPredicate;
import org.apache.iceberg.expressions.BoundTransform;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.UnboundPredicate;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Objects;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.relocated.com.google.common.hash.HashFunction;
import org.apache.iceberg.relocated.com.google.common.hash.Hashing;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import static org.apache.iceberg.types.Type.TypeID;
abstract class Bucket implements Transform {
private static final HashFunction MURMUR3 = Hashing.murmur3_32();
@SuppressWarnings("unchecked")
static Bucket get(Type type, int numBuckets) {
Preconditions.checkArgument(numBuckets > 0,
"Invalid number of buckets: %s (must be > 0)", numBuckets);
switch (type.typeId()) {
case DATE:
case INTEGER:
return (Bucket) new BucketInteger(numBuckets);
case TIME:
case TIMESTAMP:
case LONG:
return (Bucket) new BucketLong(numBuckets);
case DECIMAL:
return (Bucket) new BucketDecimal(numBuckets);
case STRING:
return (Bucket) new BucketString(numBuckets);
case FIXED:
case BINARY:
return (Bucket) new BucketByteBuffer(numBuckets);
case UUID:
return (Bucket) new BucketUUID(numBuckets);
default:
throw new IllegalArgumentException("Cannot bucket by type: " + type);
}
}
private final int numBuckets;
private Bucket(int numBuckets) {
this.numBuckets = numBuckets;
}
public Integer numBuckets() {
return numBuckets;
}
@VisibleForTesting
abstract int hash(T value);
@Override
public Integer apply(T value) {
if (value == null) {
return null;
}
return (hash(value) & Integer.MAX_VALUE) % numBuckets;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
} else if (!(o instanceof Bucket)) {
return false;
}
Bucket> bucket = (Bucket>) o;
return numBuckets == bucket.numBuckets;
}
@Override
public int hashCode() {
return Objects.hashCode(numBuckets);
}
@Override
public String toString() {
return "bucket[" + numBuckets + "]";
}
@Override
public UnboundPredicate project(String name, BoundPredicate predicate) {
if (predicate.term() instanceof BoundTransform) {
return ProjectionUtil.projectTransformPredicate(this, name, predicate);
}
if (predicate.isUnaryPredicate()) {
return Expressions.predicate(predicate.op(), name);
} else if (predicate.isLiteralPredicate() && predicate.op() == Expression.Operation.EQ) {
return Expressions.predicate(
predicate.op(), name, apply(predicate.asLiteralPredicate().literal().value()));
} else if (predicate.isSetPredicate() && predicate.op() == Expression.Operation.IN) { // notIn can't be projected
return ProjectionUtil.transformSet(name, predicate.asSetPredicate(), this);
}
// comparison predicates can't be projected, notEq can't be projected
// TODO: small ranges can be projected.
// for example, (x > 0) and (x < 3) can be turned into in({1, 2}) and projected.
return null;
}
@Override
public UnboundPredicate projectStrict(String name, BoundPredicate predicate) {
if (predicate.term() instanceof BoundTransform) {
return ProjectionUtil.projectTransformPredicate(this, name, predicate);
}
if (predicate.isUnaryPredicate()) {
return Expressions.predicate(predicate.op(), name);
} else if (predicate.isLiteralPredicate() && predicate.op() == Expression.Operation.NOT_EQ) {
// TODO: need to translate not(eq(...)) into notEq in expressions
return Expressions.predicate(predicate.op(), name, apply(predicate.asLiteralPredicate().literal().value()));
} else if (predicate.isSetPredicate() && predicate.op() == Expression.Operation.NOT_IN) {
return ProjectionUtil.transformSet(name, predicate.asSetPredicate(), this);
}
// no strict projection for comparison or equality
return null;
}
@Override
public Type getResultType(Type sourceType) {
return Types.IntegerType.get();
}
private static class BucketInteger extends Bucket {
private BucketInteger(int numBuckets) {
super(numBuckets);
}
@Override
public int hash(Integer value) {
return MURMUR3.hashLong(value.longValue()).asInt();
}
@Override
public boolean canTransform(Type type) {
return type.typeId() == TypeID.INTEGER || type.typeId() == TypeID.DATE;
}
}
private static class BucketLong extends Bucket {
private BucketLong(int numBuckets) {
super(numBuckets);
}
@Override
public int hash(Long value) {
return MURMUR3.hashLong(value).asInt();
}
@Override
public boolean canTransform(Type type) {
return type.typeId() == TypeID.LONG ||
type.typeId() == TypeID.TIME ||
type.typeId() == TypeID.TIMESTAMP;
}
}
// bucketing by Double is not allowed by the spec, but this has the float hash implementation
static class BucketFloat extends Bucket {
// used by tests because the factory method will not instantiate a bucket function for floats
BucketFloat(int numBuckets) {
super(numBuckets);
}
@Override
public int hash(Float value) {
return MURMUR3.hashLong(Double.doubleToLongBits((double) value)).asInt();
}
@Override
public boolean canTransform(Type type) {
return type.typeId() == TypeID.FLOAT;
}
}
// bucketing by Double is not allowed by the spec, but this has the double hash implementation
static class BucketDouble extends Bucket {
// used by tests because the factory method will not instantiate a bucket function for doubles
BucketDouble(int numBuckets) {
super(numBuckets);
}
@Override
public int hash(Double value) {
return MURMUR3.hashLong(Double.doubleToLongBits(value)).asInt();
}
@Override
public boolean canTransform(Type type) {
return type.typeId() == TypeID.DOUBLE;
}
}
private static class BucketString extends Bucket {
private BucketString(int numBuckets) {
super(numBuckets);
}
@Override
public int hash(CharSequence value) {
for (int i = 0; i < value.length(); i++) {
if (Character.isSurrogate(value.charAt(i))) {
// TODO remove the fallback to this (slower) code path once https://github.com/google/guava/issues/5648 is fixed
return MURMUR3.hashBytes(value.toString().getBytes(StandardCharsets.UTF_8)).asInt();
}
}
return MURMUR3.hashString(value, StandardCharsets.UTF_8).asInt();
}
@Override
public boolean canTransform(Type type) {
return type.typeId() == TypeID.STRING;
}
}
private static class BucketByteBuffer extends Bucket {
private static final Set SUPPORTED_TYPES = Sets.newHashSet(
TypeID.BINARY, TypeID.FIXED);
private BucketByteBuffer(int numBuckets) {
super(numBuckets);
}
@Override
public int hash(ByteBuffer value) {
if (value.hasArray()) {
return MURMUR3.hashBytes(value.array(),
value.arrayOffset() + value.position(),
value.arrayOffset() + value.remaining()).asInt();
} else {
int position = value.position();
byte[] copy = new byte[value.remaining()];
try {
value.get(copy);
} finally {
// make sure the buffer position is unchanged
value.position(position);
}
return MURMUR3.hashBytes(copy).asInt();
}
}
@Override
public boolean canTransform(Type type) {
return SUPPORTED_TYPES.contains(type.typeId());
}
}
private static class BucketUUID extends Bucket {
private BucketUUID(int numBuckets) {
super(numBuckets);
}
@Override
public int hash(UUID value) {
return MURMUR3.newHasher(16)
.putLong(Long.reverseBytes(value.getMostSignificantBits()))
.putLong(Long.reverseBytes(value.getLeastSignificantBits()))
.hash().asInt();
}
@Override
public boolean canTransform(Type type) {
return type.typeId() == TypeID.UUID;
}
}
private static class BucketDecimal extends Bucket {
private BucketDecimal(int numBuckets) {
super(numBuckets);
}
@Override
public int hash(BigDecimal value) {
return MURMUR3.hashBytes(value.unscaledValue().toByteArray()).asInt();
}
@Override
public boolean canTransform(Type type) {
return type.typeId() == TypeID.DECIMAL;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy