/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.orc;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.sql.Timestamp;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.FieldMetrics;
import org.apache.iceberg.Metrics;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.MetricsModes;
import org.apache.iceberg.MetricsModes.MetricsMode;
import org.apache.iceberg.MetricsUtil;
import org.apache.iceberg.Schema;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.expressions.Literal;
import org.apache.iceberg.hadoop.HadoopInputFile;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.DateTimeUtil;
import org.apache.iceberg.util.UnicodeUtil;
import org.apache.orc.BooleanColumnStatistics;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.DateColumnStatistics;
import org.apache.orc.DecimalColumnStatistics;
import org.apache.orc.DoubleColumnStatistics;
import org.apache.orc.IntegerColumnStatistics;
import org.apache.orc.Reader;
import org.apache.orc.StringColumnStatistics;
import org.apache.orc.TimestampColumnStatistics;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

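/**
 * Utility for building Iceberg {@link Metrics} from ORC file and writer statistics.
 *
 * <p>A minimal usage sketch (the file location below is hypothetical):
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * InputFile file = HadoopInputFile.fromLocation("hdfs://warehouse/db/table/data/file.orc", conf);
 * Metrics metrics = OrcMetrics.fromInputFile(file);
 * }</pre>
 */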
public class OrcMetrics {

  private enum Bound {
    LOWER,
    UPPER
  }

  private OrcMetrics() {}

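  /**
   * Returns the {@link Metrics} for an ORC file, using the default {@link MetricsConfig}.
   */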
  public static Metrics fromInputFile(InputFile file) {
    return fromInputFile(file, MetricsConfig.getDefault());
  }

  public static Metrics fromInputFile(InputFile file, MetricsConfig metricsConfig) {
    return fromInputFile(file, metricsConfig, null);
  }

  public static Metrics fromInputFile(
      InputFile file, MetricsConfig metricsConfig, NameMapping mapping) {
    final Configuration config =
        (file instanceof HadoopInputFile)
            ? ((HadoopInputFile) file).getConf()
            : new Configuration();
    return fromInputFile(file, config, metricsConfig, mapping);
  }

  static Metrics fromInputFile(
      InputFile file, Configuration config, MetricsConfig metricsConfig, NameMapping mapping) {
    try (Reader orcReader = ORC.newFileReader(file, config)) {
      return buildOrcMetrics(
          orcReader.getNumberOfRows(),
          orcReader.getSchema(),
          orcReader.getStatistics(),
          Stream.empty(),
          metricsConfig,
          mapping);
    } catch (IOException ioe) {
      throw new RuntimeIOException(ioe, "Failed to open file: %s", file.location());
    }
  }

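  /**
   * Returns the {@link Metrics} for the file produced by the given ORC {@link Writer}, combining
   * the writer's column statistics with field metrics tracked by Iceberg during the write (used,
   * for example, for NaN value counts of floating-point columns).
   */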
  static Metrics fromWriter(
      Writer writer, Stream<FieldMetrics<?>> fieldMetricsStream, MetricsConfig metricsConfig) {
    try {
      return buildOrcMetrics(
          writer.getNumberOfRows(),
          writer.getSchema(),
          writer.getStatistics(),
          fieldMetricsStream,
          metricsConfig,
          null);
    } catch (IOException ioe) {
      throw new RuntimeIOException(ioe, "Failed to get statistics from writer");
    }
  }

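  /**
   * Builds Iceberg metrics from ORC column statistics: applies the name mapping when the ORC
   * schema carries no Iceberg IDs, then records per-column sizes, value and null counts, and,
   * depending on each column's {@link MetricsMode}, lower and upper bounds.
   */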
  private static Metrics buildOrcMetrics(
      final long numOfRows,
      final TypeDescription orcSchema,
      final ColumnStatistics[] colStats,
      final Stream<FieldMetrics<?>> fieldMetricsStream,
      final MetricsConfig metricsConfig,
      final NameMapping mapping) {
    final TypeDescription orcSchemaWithIds =
        (!ORCSchemaUtil.hasIds(orcSchema) && mapping != null)
            ? ORCSchemaUtil.applyNameMapping(orcSchema, mapping)
            : orcSchema;
    final Set<Integer> statsColumns = statsColumns(orcSchemaWithIds);
    final MetricsConfig effectiveMetricsConfig =
        Optional.ofNullable(metricsConfig).orElseGet(MetricsConfig::getDefault);
    Map<Integer, Long> columnSizes = Maps.newHashMapWithExpectedSize(colStats.length);
    Map<Integer, Long> valueCounts = Maps.newHashMapWithExpectedSize(colStats.length);
    Map<Integer, Long> nullCounts = Maps.newHashMapWithExpectedSize(colStats.length);

    if (!ORCSchemaUtil.hasIds(orcSchemaWithIds)) {
      return new Metrics(numOfRows, columnSizes, valueCounts, nullCounts, null, null, null);
    }

    final Schema schema = ORCSchemaUtil.convert(orcSchemaWithIds);
    Map<Integer, ByteBuffer> lowerBounds = Maps.newHashMap();
    Map<Integer, ByteBuffer> upperBounds = Maps.newHashMap();

    Map<Integer, FieldMetrics<?>> fieldMetricsMap =
        Optional.ofNullable(fieldMetricsStream)
            .map(stream -> stream.collect(Collectors.toMap(FieldMetrics::id, Function.identity())))
            .orElseGet(Maps::newHashMap);

    for (int i = 0; i < colStats.length; i++) {
      final ColumnStatistics colStat = colStats[i];
      final TypeDescription orcCol = orcSchemaWithIds.findSubtype(i);
      final Optional<Types.NestedField> icebergColOpt =
          ORCSchemaUtil.icebergID(orcCol).map(schema::findField);

      if (icebergColOpt.isPresent()) {
        final Types.NestedField icebergCol = icebergColOpt.get();
        final int fieldId = icebergCol.fieldId();

        final MetricsMode metricsMode =
            MetricsUtil.metricsMode(schema, effectiveMetricsConfig, icebergCol.fieldId());
        columnSizes.put(fieldId, colStat.getBytesOnDisk());

        if (metricsMode == MetricsModes.None.get()) {
          continue;
        }

        if (statsColumns.contains(fieldId)) {
          // Since ORC does not track null values nor repeated ones, the value count for columns in
          // containers (maps, lists) may be larger than it actually is; however, these are not
          // used in expressions right now. For such cases, we use the number of values directly
          // stored in ORC.
          if (colStat.hasNull()) {
            nullCounts.put(fieldId, numOfRows - colStat.getNumberOfValues());
          } else {
            nullCounts.put(fieldId, 0L);
          }
          valueCounts.put(fieldId, colStat.getNumberOfValues() + nullCounts.get(fieldId));

          if (metricsMode != MetricsModes.Counts.get()) {
            Optional<ByteBuffer> orcMin =
                (colStat.getNumberOfValues() > 0)
                    ? fromOrcMin(
                        icebergCol.type(), colStat, metricsMode, fieldMetricsMap.get(fieldId))
                    : Optional.empty();
            orcMin.ifPresent(byteBuffer -> lowerBounds.put(icebergCol.fieldId(), byteBuffer));
            Optional<ByteBuffer> orcMax =
                (colStat.getNumberOfValues() > 0)
                    ? fromOrcMax(
                        icebergCol.type(), colStat, metricsMode, fieldMetricsMap.get(fieldId))
                    : Optional.empty();
            orcMax.ifPresent(byteBuffer -> upperBounds.put(icebergCol.fieldId(), byteBuffer));
          }
        }
      }
    }

    return new Metrics(
        numOfRows,
        columnSizes,
        valueCounts,
        nullCounts,
        MetricsUtil.createNanValueCounts(
            fieldMetricsMap.values().stream(), effectiveMetricsConfig, schema),
        lowerBounds,
        upperBounds);
  }

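  /**
   * Converts the minimum from an ORC {@link ColumnStatistics} into an Iceberg lower bound,
   * truncated per the metrics mode and serialized with
   * {@link Conversions#toByteBuffer(Type, Object)}.
   */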
  private static Optional<ByteBuffer> fromOrcMin(
      Type type,
      ColumnStatistics columnStats,
      MetricsMode metricsMode,
      FieldMetrics<?> fieldMetrics) {
    Object min = null;
    if (columnStats instanceof IntegerColumnStatistics) {
      min = ((IntegerColumnStatistics) columnStats).getMinimum();
      if (type.typeId() == Type.TypeID.INTEGER) {
        min = Math.toIntExact((long) min);
      }
    } else if (columnStats instanceof DoubleColumnStatistics) {
      if (fieldMetrics != null) {
        // ORC includes NaN in the upper/lower bounds of floating-point columns. Since we don't
        // want this behavior, we have tracked metrics for such columns ourselves and thus do not
        // need to rely on ORC's column statistics.
        min = fieldMetrics.lowerBound();
      } else {
        // imported files will not have metrics that were tracked by Iceberg, so fall back to the
        // file's metrics.
        min =
            replaceNaN(
                ((DoubleColumnStatistics) columnStats).getMinimum(), Double.NEGATIVE_INFINITY);
        if (type.typeId() == Type.TypeID.FLOAT) {
          min = ((Double) min).floatValue();
        }
      }
    } else if (columnStats instanceof StringColumnStatistics) {
      min = ((StringColumnStatistics) columnStats).getMinimum();
    } else if (columnStats instanceof DecimalColumnStatistics) {
      min =
          Optional.ofNullable(((DecimalColumnStatistics) columnStats).getMinimum())
              .map(
                  minStats ->
                      minStats.bigDecimalValue().setScale(((Types.DecimalType) type).scale()))
              .orElse(null);
    } else if (columnStats instanceof DateColumnStatistics) {
      min = (int) ((DateColumnStatistics) columnStats).getMinimumDayOfEpoch();
    } else if (columnStats instanceof TimestampColumnStatistics) {
      TimestampColumnStatistics tColStats = (TimestampColumnStatistics) columnStats;
      Timestamp minValue = tColStats.getMinimumUTC();
      min =
          Optional.ofNullable(minValue)
              .map(v -> DateTimeUtil.microsFromInstant(v.toInstant()))
              .orElse(null);
    } else if (columnStats instanceof BooleanColumnStatistics) {
      BooleanColumnStatistics booleanStats = (BooleanColumnStatistics) columnStats;
      min = booleanStats.getFalseCount() <= 0;
    }

    return Optional.ofNullable(
        Conversions.toByteBuffer(type, truncateIfNeeded(Bound.LOWER, type, min, metricsMode)));
  }

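  /**
   * Converts the maximum from an ORC {@link ColumnStatistics} into an Iceberg upper bound; the
   * mirror of {@link #fromOrcMin}.
   */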
  private static Optional<ByteBuffer> fromOrcMax(
      Type type,
      ColumnStatistics columnStats,
      MetricsMode metricsMode,
      FieldMetrics<?> fieldMetrics) {
    Object max = null;
    if (columnStats instanceof IntegerColumnStatistics) {
      max = ((IntegerColumnStatistics) columnStats).getMaximum();
      if (type.typeId() == Type.TypeID.INTEGER) {
        max = Math.toIntExact((long) max);
      }
    } else if (columnStats instanceof DoubleColumnStatistics) {
      if (fieldMetrics != null) {
        // ORC includes NaN in the upper/lower bounds of floating-point columns. Since we don't
        // want this behavior, we have tracked metrics for such columns ourselves and thus do not
        // need to rely on ORC's column statistics.
        max = fieldMetrics.upperBound();
      } else {
        // imported files will not have metrics that were tracked by Iceberg, so fall back to
        // the file's metrics.
        max =
            replaceNaN(
                ((DoubleColumnStatistics) columnStats).getMaximum(), Double.POSITIVE_INFINITY);
        if (type.typeId() == Type.TypeID.FLOAT) {
          max = ((Double) max).floatValue();
        }
      }
    } else if (columnStats instanceof StringColumnStatistics) {
      max = ((StringColumnStatistics) columnStats).getMaximum();
    } else if (columnStats instanceof DecimalColumnStatistics) {
      max =
          Optional.ofNullable(((DecimalColumnStatistics) columnStats).getMaximum())
              .map(
                  maxStats ->
                      maxStats.bigDecimalValue().setScale(((Types.DecimalType) type).scale()))
              .orElse(null);
    } else if (columnStats instanceof DateColumnStatistics) {
      max = (int) ((DateColumnStatistics) columnStats).getMaximumDayOfEpoch();
    } else if (columnStats instanceof TimestampColumnStatistics) {
      TimestampColumnStatistics tColStats = (TimestampColumnStatistics) columnStats;
      Timestamp maxValue = tColStats.getMaximumUTC();
      max =
          Optional.ofNullable(maxValue)
              .map(v -> DateTimeUtil.microsFromInstant(v.toInstant()))
              .orElse(null);
    } else if (columnStats instanceof BooleanColumnStatistics) {
      BooleanColumnStatistics booleanStats = (BooleanColumnStatistics) columnStats;
      max = booleanStats.getTrueCount() > 0;
    }
    return Optional.ofNullable(
        Conversions.toByteBuffer(type, truncateIfNeeded(Bound.UPPER, type, max, metricsMode)));
  }

  private static Object replaceNaN(double value, double replacement) {
    return Double.isNaN(value) ? replacement : value;
  }

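  /**
   * Applies {@link MetricsModes.Truncate} to string bounds. Lower bounds are truncated directly;
   * upper bounds use {@link UnicodeUtil#truncateStringMax(Literal, int)}, which may yield no valid
   * truncated bound, in which case the original value is kept.
   */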
  private static Object truncateIfNeeded(
      Bound bound, Type type, Object value, MetricsMode metricsMode) {
    // Of the two types that could be truncated, string and binary, ORC only supports string
    // bounds. Therefore, truncation is applied, if needed, only to string columns.
    if (!(metricsMode instanceof MetricsModes.Truncate)
        || type.typeId() != Type.TypeID.STRING
        || value == null) {
      return value;
    }

    CharSequence charSequence = (CharSequence) value;
    MetricsModes.Truncate truncateMode = (MetricsModes.Truncate) metricsMode;
    int truncateLength = truncateMode.length();

    switch (bound) {
      case UPPER:
        return Optional.ofNullable(
                UnicodeUtil.truncateStringMax(Literal.of(charSequence), truncateLength))
            .map(Literal::value)
            .orElse(charSequence);
      case LOWER:
        return UnicodeUtil.truncateStringMin(Literal.of(charSequence), truncateLength).value();
      default:
        throw new RuntimeException("No other bound is defined.");
    }
  }

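  /**
   * Returns the Iceberg field IDs of columns whose ORC value counts correspond to row-level
   * counts, i.e. fields that are direct children of structs rather than nested inside maps or
   * lists.
   */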
  private static Set<Integer> statsColumns(TypeDescription schema) {
    return OrcSchemaVisitor.visit(schema, new StatsColumnsVisitor());
  }

  private static class StatsColumnsVisitor extends OrcSchemaVisitor<Set<Integer>> {
    @Override
    public Set<Integer> record(
        TypeDescription record, List<String> names, List<Set<Integer>> fields) {
      ImmutableSet.Builder<Integer> result = ImmutableSet.builder();
      fields.stream().filter(Objects::nonNull).forEach(result::addAll);
      record.getChildren().stream()
          .map(ORCSchemaUtil::icebergID)
          .filter(Optional::isPresent)
          .map(Optional::get)
          .forEach(result::add);
      return result.build();
    }
  }
}