All downloads are free. The search and download features use the official Maven repository.

org.apache.iceberg.orc.ExpressionToSearchArgument Maven / Gradle / Ivy

There is a newer version: 1.0.0.5
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.orc;

import java.math.BigDecimal;
import java.sql.Date;
import java.sql.Timestamp;
import java.time.Instant;
import java.time.LocalDate;
import java.util.Map;
import java.util.Set;
import org.apache.iceberg.expressions.Bound;
import org.apache.iceberg.expressions.BoundPredicate;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.ExpressionVisitors;
import org.apache.iceberg.expressions.Literal;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Type.TypeID;
import org.apache.orc.TypeDescription;
import org.apache.orc.storage.common.type.HiveDecimal;
import org.apache.orc.storage.ql.io.sarg.PredicateLeaf;
import org.apache.orc.storage.ql.io.sarg.SearchArgument;
import org.apache.orc.storage.ql.io.sarg.SearchArgument.TruthValue;
import org.apache.orc.storage.ql.io.sarg.SearchArgumentFactory;
import org.apache.orc.storage.serde2.io.HiveDecimalWritable;

/**
 * Translates a bound Iceberg {@link Expression} into an ORC {@link SearchArgument}.
 *
 * <p>Each visitor callback returns an {@link Action}, a deferred step that appends the matching
 * clause to the shared {@link SearchArgument.Builder} when invoked. Deferring the work lets
 * compound expressions (NOT / AND / OR) emit their {@code start...()} call, then their children,
 * then the closing {@code end()} in the order the builder expects.
 */
class ExpressionToSearchArgument extends ExpressionVisitors.BoundVisitor<ExpressionToSearchArgument.Action> {

  /**
   * Converts {@code expr} into a {@link SearchArgument} using column names derived from
   * {@code readSchema}.
   *
   * @param expr the Iceberg expression to translate
   * @param readSchema the ORC schema used to map Iceberg field ids to ORC column names
   * @return the search argument produced by the builder
   */
  static SearchArgument convert(Expression expr, TypeDescription readSchema) {
    Map<Integer, String> idToColumnName = ORCSchemaUtil.idToOrcName(ORCSchemaUtil.convert(readSchema));
    SearchArgument.Builder builder = SearchArgumentFactory.newBuilder();
    ExpressionVisitors.visit(expr, new ExpressionToSearchArgument(builder, idToColumnName)).invoke();
    return builder.build();
  }

  // Currently every predicate in ORC requires a PredicateLeaf.Type field which is not available for these Iceberg types
  private static final Set<TypeID> UNSUPPORTED_TYPES = ImmutableSet.of(
      TypeID.BINARY,
      TypeID.FIXED,
      TypeID.UUID,
      TypeID.STRUCT,
      TypeID.MAP,
      TypeID.LIST
  );

  private final SearchArgument.Builder builder;
  private final Map<Integer, String> idToColumnName;

  private ExpressionToSearchArgument(SearchArgument.Builder builder, Map<Integer, String> idToColumnName) {
    this.builder = builder;
    this.idToColumnName = idToColumnName;
  }

  @Override
  public Action alwaysTrue() {
    return () -> this.builder.literal(TruthValue.YES);
  }

  @Override
  public Action alwaysFalse() {
    return () -> this.builder.literal(TruthValue.NO);
  }

  @Override
  public Action not(Action child) {
    return () -> {
      this.builder.startNot();
      child.invoke();
      this.builder.end();
    };
  }

  @Override
  public Action and(Action leftChild, Action rightChild) {
    return () -> {
      this.builder.startAnd();
      leftChild.invoke();
      rightChild.invoke();
      this.builder.end();
    };
  }

  @Override
  public Action or(Action leftChild, Action rightChild) {
    return () -> {
      this.builder.startOr();
      leftChild.invoke();
      rightChild.invoke();
      this.builder.end();
    };
  }

  @Override
  public <T> Action isNull(Bound<T> expr) {
    return () -> this.builder.isNull(columnName(expr), leafType(expr));
  }

  @Override
  public <T> Action notNull(Bound<T> expr) {
    // Expressed as NOT(col IS NULL)
    return () -> this.builder.startNot()
        .isNull(columnName(expr), leafType(expr))
        .end();
  }

  @Override
  public <T> Action isNaN(Bound<T> expr) {
    // Expressed as an equality comparison against the NaN constant of the column's type
    return () -> this.builder.equals(
        columnName(expr),
        leafType(expr),
        literal(expr.ref().type(), getNaNForType(expr.ref().type())));
  }

  /**
   * Returns the boxed NaN constant for a floating point Iceberg type.
   *
   * @throws IllegalArgumentException if {@code type} is neither FLOAT nor DOUBLE
   */
  private Object getNaNForType(Type type) {
    switch (type.typeId()) {
      case FLOAT:
        return Float.NaN;
      case DOUBLE:
        return Double.NaN;
      default:
        throw new IllegalArgumentException("Cannot get NaN value for type " + type.typeId());
    }
  }

  @Override
  public <T> Action notNaN(Bound<T> expr) {
    // Emitted as `col IS NULL OR NOT(col = NaN)`
    return () -> {
      this.builder.startOr();
      isNull(expr).invoke();
      this.builder.startNot();
      isNaN(expr).invoke();
      this.builder.end(); // end NOT
      this.builder.end(); // end OR
    };
  }

  @Override
  public <T> Action lt(Bound<T> expr, Literal<T> lit) {
    return () -> this.builder.lessThan(
        columnName(expr),
        leafType(expr),
        literal(expr.ref().type(), lit.value()));
  }

  @Override
  public <T> Action ltEq(Bound<T> expr, Literal<T> lit) {
    return () -> this.builder.lessThanEquals(
        columnName(expr),
        leafType(expr),
        literal(expr.ref().type(), lit.value()));
  }

  @Override
  public <T> Action gt(Bound<T> expr, Literal<T> lit) {
    // ORC SearchArguments do not have a greaterThan predicate, so we use not(lessThanOrEquals)
    // e.g. x > 5 => not(x <= 5)
    return () -> this.builder.startNot()
        .lessThanEquals(
            columnName(expr),
            leafType(expr),
            literal(expr.ref().type(), lit.value()))
        .end();
  }

  @Override
  public <T> Action gtEq(Bound<T> expr, Literal<T> lit) {
    // ORC SearchArguments do not have a greaterThanOrEquals predicate, so we use not(lessThan)
    // e.g. x >= 5 => not(x < 5)
    return () -> this.builder.startNot()
        .lessThan(
            columnName(expr),
            leafType(expr),
            literal(expr.ref().type(), lit.value()))
        .end();
  }

  @Override
  public <T> Action eq(Bound<T> expr, Literal<T> lit) {
    return () -> this.builder.equals(
        columnName(expr),
        leafType(expr),
        literal(expr.ref().type(), lit.value()));
  }

  @Override
  public <T> Action notEq(Bound<T> expr, Literal<T> lit) {
    // NOTE: ORC uses SQL semantics for Search Arguments, so an expression like
    // `col != 1` will exclude rows where col is NULL along with rows where col = 1
    // In contrast, Iceberg's Expressions will keep rows with NULL values
    // So the equivalent ORC Search Argument for an Iceberg Expression `col != x`
    // is `col IS NULL OR col != x`
    return () -> {
      this.builder.startOr();
      isNull(expr).invoke();
      this.builder.startNot();
      eq(expr, lit).invoke();
      this.builder.end(); // end NOT
      this.builder.end(); // end OR
    };
  }

  @Override
  public <T> Action in(Bound<T> expr, Set<T> literalSet) {
    return () -> this.builder.in(
        columnName(expr),
        leafType(expr),
        literalSet.stream().map(lit -> literal(expr.ref().type(), lit)).toArray(Object[]::new));
  }

  @Override
  public <T> Action notIn(Bound<T> expr, Set<T> literalSet) {
    // NOTE: ORC uses SQL semantics for Search Arguments, so an expression like
    // `col NOT IN {1}` will exclude rows where col is NULL along with rows where col = 1
    // In contrast, Iceberg's Expressions will keep rows with NULL values
    // So the equivalent ORC Search Argument for an Iceberg Expression `col NOT IN {x}`
    // is `col IS NULL OR col NOT IN {x}`
    return () -> {
      this.builder.startOr();
      isNull(expr).invoke();
      this.builder.startNot();
      in(expr, literalSet).invoke();
      this.builder.end(); // end NOT
      this.builder.end(); // end OR
    };
  }

  @Override
  public <T> Action startsWith(Bound<T> expr, Literal<T> lit) {
    // Cannot push down STARTS_WITH operator to ORC, so return TruthValue.YES_NO_NULL which signifies
    // that this predicate cannot help with filtering
    return () -> this.builder.literal(TruthValue.YES_NO_NULL);
  }

  @Override
  public <T> Action predicate(BoundPredicate<T> pred) {
    if (UNSUPPORTED_TYPES.contains(pred.ref().type().typeId())) {
      // Cannot push down predicates for types which cannot be represented in PredicateLeaf.Type, so return
      // TruthValue.YES_NO_NULL which signifies that this predicate cannot help with filtering
      return () -> this.builder.literal(TruthValue.YES_NO_NULL);
    } else {
      return super.predicate(pred);
    }
  }

  /** A deferred step that writes one clause into the shared search argument builder. */
  @FunctionalInterface
  interface Action {
    void invoke();
  }

  /** Looks up the ORC column name registered for the field id referenced by {@code expr}. */
  private String columnName(Bound<?> expr) {
    return idToColumnName.get(expr.ref().fieldId());
  }

  /** Resolves the ORC predicate leaf type for the field referenced by {@code expr}. */
  private PredicateLeaf.Type leafType(Bound<?> expr) {
    return type(expr.ref().type());
  }

  /**
   * Maps an Iceberg type to the ORC {@link PredicateLeaf.Type} used for its predicates.
   *
   * <p>INTEGER, LONG and TIME all map to {@code LONG}; FLOAT and DOUBLE both map to {@code FLOAT}.
   *
   * @throws UnsupportedOperationException if the Iceberg type has no mapping here
   */
  private PredicateLeaf.Type type(Type icebergType) {
    switch (icebergType.typeId()) {
      case BOOLEAN:
        return PredicateLeaf.Type.BOOLEAN;
      case INTEGER:
      case LONG:
      case TIME:
        return PredicateLeaf.Type.LONG;
      case FLOAT:
      case DOUBLE:
        return PredicateLeaf.Type.FLOAT;
      case DATE:
        return PredicateLeaf.Type.DATE;
      case TIMESTAMP:
        return PredicateLeaf.Type.TIMESTAMP;
      case STRING:
        return PredicateLeaf.Type.STRING;
      case DECIMAL:
        return PredicateLeaf.Type.DECIMAL;
      default:
        throw new UnsupportedOperationException("Type " + icebergType + " not supported in ORC SearchArguments");
    }
  }

  /**
   * Converts an Iceberg literal value into the Java object handed to the ORC builder for the
   * given type.
   *
   * <p>INTEGER values are widened to {@code Long} and FLOAT values to {@code Double}, in line with
   * the leaf types chosen by {@link #type(Type)}.
   *
   * @param icebergType the Iceberg type of the referenced field
   * @param icebergLiteral the literal value; it is cast according to {@code icebergType}
   * @return the converted literal
   * @throws UnsupportedOperationException if the Iceberg type has no conversion here
   */
  private <T> Object literal(Type icebergType, T icebergLiteral) {
    switch (icebergType.typeId()) {
      case BOOLEAN:
      case LONG:
      case TIME:
      case DOUBLE:
        return icebergLiteral;
      case INTEGER:
        return ((Integer) icebergLiteral).longValue();
      case FLOAT:
        return ((Float) icebergLiteral).doubleValue();
      case STRING:
        return icebergLiteral.toString();
      case DATE:
        // The literal is a day count relative to the epoch
        return Date.valueOf(LocalDate.ofEpochDay((Integer) icebergLiteral));
      case TIMESTAMP:
        // Split microseconds into whole seconds plus a nanosecond adjustment; the floor variants
        // keep the adjustment non-negative for values before the epoch
        long microsFromEpoch = (Long) icebergLiteral;
        return Timestamp.from(Instant.ofEpochSecond(
            Math.floorDiv(microsFromEpoch, 1_000_000),
            Math.floorMod(microsFromEpoch, 1_000_000) * 1_000
        ));
      case DECIMAL:
        return new HiveDecimalWritable(HiveDecimal.create((BigDecimal) icebergLiteral, false));
      default:
        throw new UnsupportedOperationException("Type " + icebergType + " not supported in ORC SearchArguments");
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy