// org.apache.iceberg.expressions.StrictMetricsEvaluator Maven / Gradle / Ivy
// Show all versions of iceberg-api Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.expressions;
import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.types.Types.StructType;
import org.apache.iceberg.util.NaNUtil;
import static org.apache.iceberg.expressions.Expressions.rewriteNot;
/**
* Evaluates an {@link Expression} on a {@link DataFile} to test whether all rows in the file match.
*
* This evaluation is strict: it returns true if all rows in a file must match the expression. For
* example, if a file's ts column has min X and max Y, this evaluator will return true for ts < Y+1
* but not for ts < Y-1.
*
* Files are passed to {@link #eval(ContentFile)}, which returns true if all rows in the file must
* contain matching rows and false if the file may contain rows that do not match.
*
* Due to the comparison implementation of ORC stats, for float/double columns in ORC files, if the first
* value in a file is NaN, metrics of this file will report NaN for both upper and lower bound despite
* that the column could contain non-NaN data. Thus in some scenarios explicitly checks for NaN is necessary
* in order to not include files that may contain rows that don't match.
*/
public class StrictMetricsEvaluator {
private final Schema schema;
private final StructType struct;
private final Expression expr;
public StrictMetricsEvaluator(Schema schema, Expression unbound) {
this(schema, unbound, true);
}
public StrictMetricsEvaluator(Schema schema, Expression unbound, boolean caseSensitive) {
this.schema = schema;
this.struct = schema.asStruct();
this.expr = Binder.bind(struct, rewriteNot(unbound), caseSensitive);
}
/**
* Test whether all records within the file match the expression.
*
* @param file a data file
* @return false if the file may contain any row that doesn't match the expression, true otherwise.
*/
public boolean eval(ContentFile> file) {
// TODO: detect the case where a column is missing from the file using file's max field id.
return new MetricsEvalVisitor().eval(file);
}
private static final boolean ROWS_MUST_MATCH = true;
private static final boolean ROWS_MIGHT_NOT_MATCH = false;
private class MetricsEvalVisitor extends BoundExpressionVisitor {
private Map valueCounts = null;
private Map nullCounts = null;
private Map nanCounts = null;
private Map lowerBounds = null;
private Map upperBounds = null;
private boolean eval(ContentFile> file) {
if (file.recordCount() <= 0) {
return ROWS_MUST_MATCH;
}
this.valueCounts = file.valueCounts();
this.nullCounts = file.nullValueCounts();
this.nanCounts = file.nanValueCounts();
this.lowerBounds = file.lowerBounds();
this.upperBounds = file.upperBounds();
return ExpressionVisitors.visitEvaluator(expr, this);
}
@Override
public Boolean handleNonReference(Bound term) {
// If the term in any expression is not a direct reference, assume that rows may not match. This happens when
// transforms or other expressions are passed to this evaluator. For example, bucket16(x) = 0 can't be determined
// because this visitor operates on data metrics and not partition values. It may be possible to un-transform
// expressions for order preserving transforms in the future, but this is not currently supported.
return ROWS_MIGHT_NOT_MATCH;
}
@Override
public Boolean alwaysTrue() {
return ROWS_MUST_MATCH; // all rows match
}
@Override
public Boolean alwaysFalse() {
return ROWS_MIGHT_NOT_MATCH; // no rows match
}
@Override
public Boolean not(Boolean result) {
return !result;
}
@Override
public Boolean and(Boolean leftResult, Boolean rightResult) {
return leftResult && rightResult;
}
@Override
public Boolean or(Boolean leftResult, Boolean rightResult) {
return leftResult || rightResult;
}
@Override
public Boolean isNull(BoundReference ref) {
// no need to check whether the field is required because binding evaluates that case
// if the column has any non-null values, the expression does not match
int id = ref.fieldId();
Preconditions.checkNotNull(struct.field(id),
"Cannot filter by nested column: %s", schema.findField(id));
if (containsNullsOnly(id)) {
return ROWS_MUST_MATCH;
}
return ROWS_MIGHT_NOT_MATCH;
}
@Override
public Boolean notNull(BoundReference ref) {
// no need to check whether the field is required because binding evaluates that case
// if the column has any null values, the expression does not match
int id = ref.fieldId();
Preconditions.checkNotNull(struct.field(id),
"Cannot filter by nested column: %s", schema.findField(id));
if (nullCounts != null && nullCounts.containsKey(id) && nullCounts.get(id) == 0) {
return ROWS_MUST_MATCH;
}
return ROWS_MIGHT_NOT_MATCH;
}
@Override
public Boolean isNaN(BoundReference ref) {
int id = ref.fieldId();
if (containsNaNsOnly(id)) {
return ROWS_MUST_MATCH;
}
return ROWS_MIGHT_NOT_MATCH;
}
@Override
public Boolean notNaN(BoundReference ref) {
int id = ref.fieldId();
if (nanCounts != null && nanCounts.containsKey(id) && nanCounts.get(id) == 0) {
return ROWS_MUST_MATCH;
}
if (containsNullsOnly(id)) {
return ROWS_MUST_MATCH;
}
return ROWS_MIGHT_NOT_MATCH;
}
@Override
public Boolean lt(BoundReference ref, Literal lit) {
// Rows must match when: <----------Min----Max---X------->
Integer id = ref.fieldId();
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
if (canContainNulls(id) || canContainNaNs(id)) {
return ROWS_MIGHT_NOT_MATCH;
}
if (upperBounds != null && upperBounds.containsKey(id)) {
T upper = Conversions.fromByteBuffer(field.type(), upperBounds.get(id));
int cmp = lit.comparator().compare(upper, lit.value());
if (cmp < 0) {
return ROWS_MUST_MATCH;
}
}
return ROWS_MIGHT_NOT_MATCH;
}
@Override
public Boolean ltEq(BoundReference ref, Literal lit) {
// Rows must match when: <----------Min----Max---X------->
Integer id = ref.fieldId();
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
if (canContainNulls(id) || canContainNaNs(id)) {
return ROWS_MIGHT_NOT_MATCH;
}
if (upperBounds != null && upperBounds.containsKey(id)) {
T upper = Conversions.fromByteBuffer(field.type(), upperBounds.get(id));
int cmp = lit.comparator().compare(upper, lit.value());
if (cmp <= 0) {
return ROWS_MUST_MATCH;
}
}
return ROWS_MIGHT_NOT_MATCH;
}
@Override
public Boolean gt(BoundReference ref, Literal lit) {
// Rows must match when: <-------X---Min----Max---------->
Integer id = ref.fieldId();
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
if (canContainNulls(id) || canContainNaNs(id)) {
return ROWS_MIGHT_NOT_MATCH;
}
if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id));
if (NaNUtil.isNaN(lower)) {
// NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more.
return ROWS_MIGHT_NOT_MATCH;
}
int cmp = lit.comparator().compare(lower, lit.value());
if (cmp > 0) {
return ROWS_MUST_MATCH;
}
}
return ROWS_MIGHT_NOT_MATCH;
}
@Override
public Boolean gtEq(BoundReference ref, Literal lit) {
// Rows must match when: <-------X---Min----Max---------->
Integer id = ref.fieldId();
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
if (canContainNulls(id) || canContainNaNs(id)) {
return ROWS_MIGHT_NOT_MATCH;
}
if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id));
if (NaNUtil.isNaN(lower)) {
// NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more.
return ROWS_MIGHT_NOT_MATCH;
}
int cmp = lit.comparator().compare(lower, lit.value());
if (cmp >= 0) {
return ROWS_MUST_MATCH;
}
}
return ROWS_MIGHT_NOT_MATCH;
}
@Override
public Boolean eq(BoundReference ref, Literal lit) {
// Rows must match when Min == X == Max
Integer id = ref.fieldId();
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
if (canContainNulls(id) || canContainNaNs(id)) {
return ROWS_MIGHT_NOT_MATCH;
}
if (lowerBounds != null && lowerBounds.containsKey(id) &&
upperBounds != null && upperBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id));
int cmp = lit.comparator().compare(lower, lit.value());
if (cmp != 0) {
return ROWS_MIGHT_NOT_MATCH;
}
T upper = Conversions.fromByteBuffer(field.type(), upperBounds.get(id));
cmp = lit.comparator().compare(upper, lit.value());
if (cmp != 0) {
return ROWS_MIGHT_NOT_MATCH;
}
return ROWS_MUST_MATCH;
}
return ROWS_MIGHT_NOT_MATCH;
}
@Override
public Boolean notEq(BoundReference ref, Literal lit) {
// Rows must match when X < Min or Max < X because it is not in the range
Integer id = ref.fieldId();
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
if (containsNullsOnly(id) || containsNaNsOnly(id)) {
return ROWS_MUST_MATCH;
}
if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id));
if (NaNUtil.isNaN(lower)) {
// NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more.
return ROWS_MIGHT_NOT_MATCH;
}
int cmp = lit.comparator().compare(lower, lit.value());
if (cmp > 0) {
return ROWS_MUST_MATCH;
}
}
if (upperBounds != null && upperBounds.containsKey(id)) {
T upper = Conversions.fromByteBuffer(field.type(), upperBounds.get(id));
int cmp = lit.comparator().compare(upper, lit.value());
if (cmp < 0) {
return ROWS_MUST_MATCH;
}
}
return ROWS_MIGHT_NOT_MATCH;
}
@Override
public Boolean in(BoundReference ref, Set literalSet) {
Integer id = ref.fieldId();
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
if (canContainNulls(id) || canContainNaNs(id)) {
return ROWS_MIGHT_NOT_MATCH;
}
if (lowerBounds != null && lowerBounds.containsKey(id) &&
upperBounds != null && upperBounds.containsKey(id)) {
// similar to the implementation in eq, first check if the lower bound is in the set
T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id));
if (!literalSet.contains(lower)) {
return ROWS_MIGHT_NOT_MATCH;
}
// check if the upper bound is in the set
T upper = Conversions.fromByteBuffer(field.type(), upperBounds.get(id));
if (!literalSet.contains(upper)) {
return ROWS_MIGHT_NOT_MATCH;
}
// finally check if the lower bound and the upper bound are equal
if (ref.comparator().compare(lower, upper) != 0) {
return ROWS_MIGHT_NOT_MATCH;
}
// All values must be in the set if the lower bound and the upper bound are in the set and are equal.
return ROWS_MUST_MATCH;
}
return ROWS_MIGHT_NOT_MATCH;
}
@Override
public Boolean notIn(BoundReference ref, Set literalSet) {
Integer id = ref.fieldId();
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
if (containsNullsOnly(id) || containsNaNsOnly(id)) {
return ROWS_MUST_MATCH;
}
Collection literals = literalSet;
if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id));
if (NaNUtil.isNaN(lower)) {
// NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more.
return ROWS_MIGHT_NOT_MATCH;
}
literals = literals.stream().filter(v -> ref.comparator().compare(lower, v) <= 0).collect(Collectors.toList());
if (literals.isEmpty()) { // if all values are less than lower bound, rows must match (notIn).
return ROWS_MUST_MATCH;
}
}
if (upperBounds != null && upperBounds.containsKey(id)) {
T upper = Conversions.fromByteBuffer(field.type(), upperBounds.get(id));
literals = literals.stream().filter(v -> ref.comparator().compare(upper, v) >= 0).collect(Collectors.toList());
if (literals.isEmpty()) { // if all remaining values are greater than upper bound, rows must match (notIn).
return ROWS_MUST_MATCH;
}
}
return ROWS_MIGHT_NOT_MATCH;
}
@Override
public Boolean startsWith(BoundReference ref, Literal lit) {
return ROWS_MIGHT_NOT_MATCH;
}
@Override
public Boolean notStartsWith(BoundReference ref, Literal lit) {
// TODO: Handle cases that definitely cannot match, such as notStartsWith("x") when the bounds are ["a", "b"].
return ROWS_MIGHT_NOT_MATCH;
}
private boolean canContainNulls(Integer id) {
return nullCounts == null || (nullCounts.containsKey(id) && nullCounts.get(id) > 0);
}
private boolean canContainNaNs(Integer id) {
// nan counts might be null for early version writers when nan counters are not populated.
return nanCounts != null && nanCounts.containsKey(id) && nanCounts.get(id) > 0;
}
private boolean containsNullsOnly(Integer id) {
return valueCounts != null && valueCounts.containsKey(id) &&
nullCounts != null && nullCounts.containsKey(id) &&
valueCounts.get(id) - nullCounts.get(id) == 0;
}
private boolean containsNaNsOnly(Integer id) {
return nanCounts != null && nanCounts.containsKey(id) &&
valueCounts != null && nanCounts.get(id).equals(valueCounts.get(id));
}
}
}