org.apache.iceberg.orc.ExpressionToSearchArgument Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of iceberg-orc Show documentation
Show all versions of iceberg-orc Show documentation
A table format for huge analytic datasets
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.orc;
import java.math.BigDecimal;
import java.sql.Date;
import java.sql.Timestamp;
import java.time.Instant;
import java.time.LocalDate;
import java.util.Map;
import java.util.Set;
import org.apache.iceberg.expressions.Bound;
import org.apache.iceberg.expressions.BoundPredicate;
import org.apache.iceberg.expressions.BoundReference;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.ExpressionVisitors;
import org.apache.iceberg.expressions.Literal;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Type.TypeID;
import org.apache.orc.TypeDescription;
import org.apache.orc.storage.common.type.HiveDecimal;
import org.apache.orc.storage.ql.io.sarg.PredicateLeaf;
import org.apache.orc.storage.ql.io.sarg.SearchArgument;
import org.apache.orc.storage.ql.io.sarg.SearchArgument.TruthValue;
import org.apache.orc.storage.ql.io.sarg.SearchArgumentFactory;
import org.apache.orc.storage.serde2.io.HiveDecimalWritable;
class ExpressionToSearchArgument
extends ExpressionVisitors.BoundVisitor {
static SearchArgument convert(Expression expr, TypeDescription readSchema) {
Map idToColumnName =
ORCSchemaUtil.idToOrcName(ORCSchemaUtil.convert(readSchema));
SearchArgument.Builder builder = SearchArgumentFactory.newBuilder();
ExpressionVisitors.visit(expr, new ExpressionToSearchArgument(builder, idToColumnName))
.invoke();
return builder.build();
}
// Currently every predicate in ORC requires a PredicateLeaf.Type field which is not available for
// these Iceberg types
private static final Set UNSUPPORTED_TYPES =
ImmutableSet.of(
TypeID.BINARY, TypeID.FIXED, TypeID.UUID, TypeID.STRUCT, TypeID.MAP, TypeID.LIST);
private SearchArgument.Builder builder;
private Map idToColumnName;
private ExpressionToSearchArgument(
SearchArgument.Builder builder, Map idToColumnName) {
this.builder = builder;
this.idToColumnName = idToColumnName;
}
@Override
public Action alwaysTrue() {
return () -> this.builder.literal(TruthValue.YES);
}
@Override
public Action alwaysFalse() {
return () -> this.builder.literal(TruthValue.NO);
}
@Override
public Action not(Action child) {
return () -> {
this.builder.startNot();
child.invoke();
this.builder.end();
};
}
@Override
public Action and(Action leftChild, Action rightChild) {
return () -> {
this.builder.startAnd();
leftChild.invoke();
rightChild.invoke();
this.builder.end();
};
}
@Override
public Action or(Action leftChild, Action rightChild) {
return () -> {
this.builder.startOr();
leftChild.invoke();
rightChild.invoke();
this.builder.end();
};
}
@Override
public Action isNull(Bound expr) {
return () ->
this.builder.isNull(idToColumnName.get(expr.ref().fieldId()), type(expr.ref().type()));
}
@Override
public Action notNull(Bound expr) {
return () ->
this.builder
.startNot()
.isNull(idToColumnName.get(expr.ref().fieldId()), type(expr.ref().type()))
.end();
}
@Override
public Action isNaN(Bound expr) {
return () ->
this.builder.equals(
idToColumnName.get(expr.ref().fieldId()),
type(expr.ref().type()),
literal(expr.ref().type(), getNaNForType(expr.ref().type())));
}
private Object getNaNForType(Type type) {
switch (type.typeId()) {
case FLOAT:
return Float.NaN;
case DOUBLE:
return Double.NaN;
default:
throw new IllegalArgumentException("Cannot get NaN value for type " + type.typeId());
}
}
@Override
public Action notNaN(Bound expr) {
return () -> {
this.builder.startOr();
isNull(expr).invoke();
this.builder.startNot();
isNaN(expr).invoke();
this.builder.end(); // end NOT
this.builder.end(); // end OR
};
}
@Override
public Action lt(Bound expr, Literal lit) {
return () ->
this.builder.lessThan(
idToColumnName.get(expr.ref().fieldId()),
type(expr.ref().type()),
literal(expr.ref().type(), lit.value()));
}
@Override
public Action ltEq(Bound expr, Literal lit) {
return () ->
this.builder.lessThanEquals(
idToColumnName.get(expr.ref().fieldId()),
type(expr.ref().type()),
literal(expr.ref().type(), lit.value()));
}
@Override
public Action gt(Bound expr, Literal lit) {
// ORC SearchArguments do not have a greaterThan predicate, so we use not(lessThanOrEquals)
// e.g. x > 5 => not(x <= 5)
return () ->
this.builder
.startNot()
.lessThanEquals(
idToColumnName.get(expr.ref().fieldId()),
type(expr.ref().type()),
literal(expr.ref().type(), lit.value()))
.end();
}
@Override
public Action gtEq(Bound expr, Literal lit) {
// ORC SearchArguments do not have a greaterThanOrEquals predicate, so we use not(lessThan)
// e.g. x >= 5 => not(x < 5)
return () ->
this.builder
.startNot()
.lessThan(
idToColumnName.get(expr.ref().fieldId()),
type(expr.ref().type()),
literal(expr.ref().type(), lit.value()))
.end();
}
@Override
public Action eq(Bound expr, Literal lit) {
return () ->
this.builder.equals(
idToColumnName.get(expr.ref().fieldId()),
type(expr.ref().type()),
literal(expr.ref().type(), lit.value()));
}
@Override
public Action notEq(Bound expr, Literal lit) {
// NOTE: ORC uses SQL semantics for Search Arguments, so an expression like
// `col != 1` will exclude rows where col is NULL along with rows where col = 1
// In contrast, Iceberg's Expressions will keep rows with NULL values
// So the equivalent ORC Search Argument for an Iceberg Expression `col != x`
// is `col IS NULL OR col != x`
return () -> {
this.builder.startOr();
isNull(expr).invoke();
this.builder.startNot();
eq(expr, lit).invoke();
this.builder.end(); // end NOT
this.builder.end(); // end OR
};
}
@Override
public Action in(Bound expr, Set literalSet) {
return () ->
this.builder.in(
idToColumnName.get(expr.ref().fieldId()),
type(expr.ref().type()),
literalSet.stream().map(lit -> literal(expr.ref().type(), lit)).toArray(Object[]::new));
}
@Override
public Action notIn(Bound expr, Set literalSet) {
// NOTE: ORC uses SQL semantics for Search Arguments, so an expression like
// `col NOT IN {1}` will exclude rows where col is NULL along with rows where col = 1
// In contrast, Iceberg's Expressions will keep rows with NULL values
// So the equivalent ORC Search Argument for an Iceberg Expression `col NOT IN {x}`
// is `col IS NULL OR col NOT IN {x}`
return () -> {
this.builder.startOr();
isNull(expr).invoke();
this.builder.startNot();
in(expr, literalSet).invoke();
this.builder.end(); // end NOT
this.builder.end(); // end OR
};
}
@Override
public Action startsWith(Bound expr, Literal lit) {
// Cannot push down STARTS_WITH operator to ORC, so return TruthValue.YES_NO_NULL which
// signifies
// that this predicate cannot help with filtering
return () -> this.builder.literal(TruthValue.YES_NO_NULL);
}
@Override
public Action notStartsWith(Bound expr, Literal lit) {
// Cannot push down NOT_STARTS_WITH operator to ORC, so return TruthValue.YES_NO_NULL which
// signifies
// that this predicate cannot help with filtering
return () -> this.builder.literal(TruthValue.YES_NO_NULL);
}
@Override
public Action predicate(BoundPredicate pred) {
if (UNSUPPORTED_TYPES.contains(pred.ref().type().typeId())
|| !(pred.term() instanceof BoundReference)) {
// Cannot push down predicates for types which cannot be represented in PredicateLeaf.Type, so
// return
// TruthValue.YES_NO_NULL which signifies that this predicate cannot help with filtering
return () -> this.builder.literal(TruthValue.YES_NO_NULL);
} else {
return super.predicate(pred);
}
}
@FunctionalInterface
interface Action {
void invoke();
}
private PredicateLeaf.Type type(Type icebergType) {
switch (icebergType.typeId()) {
case BOOLEAN:
return PredicateLeaf.Type.BOOLEAN;
case INTEGER:
case LONG:
case TIME:
return PredicateLeaf.Type.LONG;
case FLOAT:
case DOUBLE:
return PredicateLeaf.Type.FLOAT;
case DATE:
return PredicateLeaf.Type.DATE;
case TIMESTAMP:
return PredicateLeaf.Type.TIMESTAMP;
case STRING:
return PredicateLeaf.Type.STRING;
case DECIMAL:
return PredicateLeaf.Type.DECIMAL;
default:
throw new UnsupportedOperationException(
"Type " + icebergType + " not supported in ORC SearchArguments");
}
}
private Object literal(Type icebergType, T icebergLiteral) {
switch (icebergType.typeId()) {
case BOOLEAN:
case LONG:
case TIME:
case DOUBLE:
return icebergLiteral;
case INTEGER:
return ((Integer) icebergLiteral).longValue();
case FLOAT:
return ((Float) icebergLiteral).doubleValue();
case STRING:
return icebergLiteral.toString();
case DATE:
return Date.valueOf(LocalDate.ofEpochDay((Integer) icebergLiteral));
case TIMESTAMP:
long microsFromEpoch = (Long) icebergLiteral;
return Timestamp.from(
Instant.ofEpochSecond(
Math.floorDiv(microsFromEpoch, 1_000_000),
Math.floorMod(microsFromEpoch, 1_000_000) * 1_000));
case DECIMAL:
return new HiveDecimalWritable(HiveDecimal.create((BigDecimal) icebergLiteral, false));
default:
throw new UnsupportedOperationException(
"Type " + icebergType + " not supported in ORC SearchArguments");
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy