All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.parquet.column.statistics.Statistics Maven / Gradle / Ivy

There is a newer version: 1.14.4
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.column.statistics;

import java.util.Arrays;
import org.apache.parquet.column.UnknownColumnTypeException;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.Float16;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.PrimitiveComparator;
import org.apache.parquet.schema.PrimitiveStringifier;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;

/**
 * Statistics class to keep track of statistics in parquet pages and column chunks
 *
 * @param <T> the Java type described by this Statistics instance
 */
public abstract class Statistics> {

  /**
   * Builder class to build Statistics objects. Used to read the statistics from the Parquet file.
   */
  public static class Builder {
    private final PrimitiveType type;
    private byte[] min;
    private byte[] max;
    private long numNulls = -1;

    private Builder(PrimitiveType type) {
      this.type = type;
    }

    public Builder withMin(byte[] min) {
      this.min = min;
      return this;
    }

    public Builder withMax(byte[] max) {
      this.max = max;
      return this;
    }

    public Builder withNumNulls(long numNulls) {
      this.numNulls = numNulls;
      return this;
    }

    public Statistics build() {
      Statistics stats = createStats(type);
      if (min != null && max != null) {
        stats.setMinMaxFromBytes(min, max);
      }
      stats.num_nulls = this.numNulls;
      return stats;
    }
  }

  // Builder for FLOAT type to handle special cases of min/max values like NaN, -0.0, and 0.0
  private static class FloatBuilder extends Builder {
    public FloatBuilder(PrimitiveType type) {
      super(type);
      assert type.getPrimitiveTypeName() == PrimitiveTypeName.FLOAT;
    }

    @Override
    public Statistics build() {
      FloatStatistics stats = (FloatStatistics) super.build();
      if (stats.hasNonNullValue()) {
        Float min = stats.genericGetMin();
        Float max = stats.genericGetMax();
        // Drop min/max values in case of NaN as the sorting order of values is undefined for this case
        if (min.isNaN() || max.isNaN()) {
          stats.setMinMax(0.0f, 0.0f);
          ((Statistics) stats).hasNonNullValue = false;
        } else {
          // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
          if (Float.compare(min, 0.0f) == 0) {
            min = -0.0f;
            stats.setMinMax(min, max);
          }
          if (Float.compare(max, -0.0f) == 0) {
            max = 0.0f;
            stats.setMinMax(min, max);
          }
        }
      }
      return stats;
    }
  }

  // Builder for DOUBLE type to handle special cases of min/max values like NaN, -0.0, and 0.0
  private static class DoubleBuilder extends Builder {
    public DoubleBuilder(PrimitiveType type) {
      super(type);
      assert type.getPrimitiveTypeName() == PrimitiveTypeName.DOUBLE;
    }

    @Override
    public Statistics build() {
      DoubleStatistics stats = (DoubleStatistics) super.build();
      if (stats.hasNonNullValue()) {
        Double min = stats.genericGetMin();
        Double max = stats.genericGetMax();
        // Drop min/max values in case of NaN as the sorting order of values is undefined for this case
        if (min.isNaN() || max.isNaN()) {
          stats.setMinMax(0.0, 0.0);
          ((Statistics) stats).hasNonNullValue = false;
        } else {
          // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
          if (Double.compare(min, 0.0) == 0) {
            min = -0.0;
            stats.setMinMax(min, max);
          }
          if (Double.compare(max, -0.0) == 0) {
            max = 0.0;
            stats.setMinMax(min, max);
          }
        }
      }
      return stats;
    }
  }

  // Builder for FLOAT16 type to handle special cases of min/max values like NaN, -0.0, and 0.0
  private static class Float16Builder extends Builder {
    private static final Binary POSITIVE_ZERO_LITTLE_ENDIAN = Binary.fromConstantByteArray(new byte[] {0x00, 0x00});
    private static final Binary NEGATIVE_ZERO_LITTLE_ENDIAN =
        Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0x80});

    public Float16Builder(PrimitiveType type) {
      super(type);
      assert type.getPrimitiveTypeName() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
      assert type.getTypeLength() == 2;
    }

    @Override
    public Statistics build() {
      BinaryStatistics stats = (BinaryStatistics) super.build();
      if (stats.hasNonNullValue()) {
        Binary bMin = stats.genericGetMin();
        Binary bMax = stats.genericGetMax();
        short min = bMin.get2BytesLittleEndian();
        short max = bMax.get2BytesLittleEndian();
        // Drop min/max values in case of NaN as the sorting order of values is undefined for this case
        if (Float16.isNaN(min) || Float16.isNaN(max)) {
          stats.setMinMax(POSITIVE_ZERO_LITTLE_ENDIAN, NEGATIVE_ZERO_LITTLE_ENDIAN);
          ((Statistics) stats).hasNonNullValue = false;
        } else {
          // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
          if (min == (short) 0x0000) {
            stats.setMinMax(NEGATIVE_ZERO_LITTLE_ENDIAN, bMax);
          }
          if (max == (short) 0x8000) {
            stats.setMinMax(bMin, POSITIVE_ZERO_LITTLE_ENDIAN);
          }
        }
      }
      return stats;
    }
  }

  private final PrimitiveType type;
  private final PrimitiveComparator comparator;
  private boolean hasNonNullValue;
  private long num_nulls;
  final PrimitiveStringifier stringifier;

  Statistics(PrimitiveType type) {
    this.type = type;
    this.comparator = type.comparator();
    this.stringifier = type.stringifier();
    hasNonNullValue = false;
    num_nulls = 0;
  }

  /**
   * Returns the typed statistics object based on the passed type parameter
   *
   * @param type PrimitiveTypeName type of the column
   * @return instance of a typed statistics class
   * @deprecated Use {@link #createStats(Type)} instead
   */
  @Deprecated
  public static Statistics getStatsBasedOnType(PrimitiveTypeName type) {
    switch (type) {
      case INT32:
        return new IntStatistics();
      case INT64:
        return new LongStatistics();
      case FLOAT:
        return new FloatStatistics();
      case DOUBLE:
        return new DoubleStatistics();
      case BOOLEAN:
        return new BooleanStatistics();
      case BINARY:
        return new BinaryStatistics();
      case INT96:
        return new BinaryStatistics();
      case FIXED_LEN_BYTE_ARRAY:
        return new BinaryStatistics();
      default:
        throw new UnknownColumnTypeException(type);
    }
  }

  /**
   * Creates an empty {@code Statistics} instance for the specified type to be
   * used for reading/writing the new min/max statistics used in the V2 format.
   *
   * @param type type of the column
   * @return instance of a typed statistics class
   */
  public static Statistics createStats(Type type) {
    PrimitiveType primitive = type.asPrimitiveType();
    switch (primitive.getPrimitiveTypeName()) {
      case INT32:
        return new IntStatistics(primitive);
      case INT64:
        return new LongStatistics(primitive);
      case FLOAT:
        return new FloatStatistics(primitive);
      case DOUBLE:
        return new DoubleStatistics(primitive);
      case BOOLEAN:
        return new BooleanStatistics(primitive);
      case BINARY:
      case INT96:
      case FIXED_LEN_BYTE_ARRAY:
        return new BinaryStatistics(primitive);
      default:
        throw new UnknownColumnTypeException(primitive.getPrimitiveTypeName());
    }
  }

  /**
   * Returns a builder to create new statistics object. Used to read the statistics from the parquet file.
   *
   * @param type type of the column
   * @return builder to create new statistics object
   */
  public static Builder getBuilderForReading(PrimitiveType type) {
    switch (type.getPrimitiveTypeName()) {
      case FLOAT:
        return new FloatBuilder(type);
      case DOUBLE:
        return new DoubleBuilder(type);
      case FIXED_LEN_BYTE_ARRAY:
        LogicalTypeAnnotation logicalTypeAnnotation = type.getLogicalTypeAnnotation();
        if (logicalTypeAnnotation instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation) {
          return new Float16Builder(type);
        }
      default:
        return new Builder(type);
    }
  }

  /**
   * updates statistics min and max using the passed value
   *
   * @param value value to use to update min and max
   */
  public void updateStats(int value) {
    throw new UnsupportedOperationException();
  }

  /**
   * updates statistics min and max using the passed value
   *
   * @param value value to use to update min and max
   */
  public void updateStats(long value) {
    throw new UnsupportedOperationException();
  }

  /**
   * updates statistics min and max using the passed value
   *
   * @param value value to use to update min and max
   */
  public void updateStats(float value) {
    throw new UnsupportedOperationException();
  }

  /**
   * updates statistics min and max using the passed value
   *
   * @param value value to use to update min and max
   */
  public void updateStats(double value) {
    throw new UnsupportedOperationException();
  }

  /**
   * updates statistics min and max using the passed value
   *
   * @param value value to use to update min and max
   */
  public void updateStats(boolean value) {
    throw new UnsupportedOperationException();
  }

  /**
   * updates statistics min and max using the passed value
   *
   * @param value value to use to update min and max
   */
  public void updateStats(Binary value) {
    throw new UnsupportedOperationException();
  }

  /**
   * Equality comparison method to compare two statistics objects.
   *
   * @param other Object to compare against
   * @return true if objects are equal, false otherwise
   */
  @Override
  public boolean equals(Object other) {
    if (other == this) return true;
    if (!(other instanceof Statistics)) return false;
    Statistics stats = (Statistics) other;
    return type.equals(stats.type)
        && Arrays.equals(stats.getMaxBytes(), this.getMaxBytes())
        && Arrays.equals(stats.getMinBytes(), this.getMinBytes())
        && stats.getNumNulls() == this.getNumNulls();
  }

  /**
   * Hash code for the statistics object
   *
   * @return hash code int
   */
  @Override
  public int hashCode() {
    return 31 * type.hashCode()
        + 31 * Arrays.hashCode(getMaxBytes())
        + 17 * Arrays.hashCode(getMinBytes())
        + Long.valueOf(this.getNumNulls()).hashCode();
  }

  /**
   * Method to merge this statistics object with the object passed
   * as parameter. Merging keeps the smallest of min values, largest of max
   * values and combines the number of null counts.
   *
   * @param stats Statistics object to merge with
   */
  public void mergeStatistics(Statistics stats) {
    if (stats.isEmpty()) return;

    // Merge stats only if they have the same type
    if (type.equals(stats.type)) {
      incrementNumNulls(stats.getNumNulls());
      if (stats.hasNonNullValue()) {
        mergeStatisticsMinMax(stats);
        markAsNotEmpty();
      }
    } else {
      throw StatisticsClassException.create(this, stats);
    }
  }

  /**
   * Abstract method to merge this statistics min and max with the values
   * of the parameter object. Does not do any checks, only called internally.
   *
   * @param stats Statistics object to merge with
   */
  protected abstract void mergeStatisticsMinMax(Statistics stats);

  /**
   * Abstract method to set min and max values from byte arrays.
   *
   * @param minBytes byte array to set the min value to
   * @param maxBytes byte array to set the max value to
   * @deprecated will be removed in 2.0.0. Use {@link #getBuilderForReading(PrimitiveType)} instead.
   */
  @Deprecated
  public abstract void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes);

  /**
   * Returns the min value in the statistics. The java natural order of the returned type defined by {@link
   * Comparable#compareTo(Object)} might not be the proper one. For example, UINT_32 requires unsigned comparison instead of the
   * natural signed one. Use {@link #compareMinToValue(Comparable)} or the comparator returned by {@link #comparator()} to
   * always get the proper ordering.
   *
   * @return the min value
   */
  public abstract T genericGetMin();

  /**
   * Returns the max value in the statistics. The java natural order of the returned type defined by {@link
   * Comparable#compareTo(Object)} might not be the proper one. For example, UINT_32 requires unsigned comparison instead of the
   * natural signed one. Use {@link #compareMaxToValue(Comparable)} or the comparator returned by {@link #comparator()} to
   * always get the proper ordering.
   *
   * @return the max value
   */
  public abstract T genericGetMax();

  /**
   * Returns the {@link PrimitiveComparator} implementation to be used to compare two generic values in the proper way
   * (for example, unsigned comparison for UINT_32).
   *
   * @return the comparator for data described by this Statistics instance
   */
  public final PrimitiveComparator comparator() {
    return comparator;
  }

  /**
   * Compares min to the specified value in the proper way. It does the same as invoking
   * {@code comparator().compare(genericGetMin(), value)}. The corresponding statistics implementations overload this
   * method so the one with the primitive argument shall be used to avoid boxing/unboxing.
   *
   * @param value the value which {@code min} is to be compared to
   * @return a negative integer, zero, or a positive integer as {@code min} is less than, equal to, or greater than
   * {@code value}.
   */
  public final int compareMinToValue(T value) {
    return comparator.compare(genericGetMin(), value);
  }

  /**
   * Compares max to the specified value in the proper way. It does the same as invoking
   * {@code comparator().compare(genericGetMax(), value)}. The corresponding statistics implementations overload this
   * method so the one with the primitive argument shall be used to avoid boxing/unboxing.
   *
   * @param value the value which {@code max} is to be compared to
   * @return a negative integer, zero, or a positive integer as {@code max} is less than, equal to, or greater than
   * {@code value}.
   */
  public final int compareMaxToValue(T value) {
    return comparator.compare(genericGetMax(), value);
  }

  /**
   * Abstract method to return the max value as a byte array
   *
   * @return byte array corresponding to the max value
   */
  public abstract byte[] getMaxBytes();

  /**
   * Abstract method to return the min value as a byte array
   *
   * @return byte array corresponding to the min value
   */
  public abstract byte[] getMinBytes();

  /**
   * Returns the string representation of min for debugging/logging purposes.
   *
   * @return the min value as a string
   */
  public String minAsString() {
    return stringify(genericGetMin());
  }

  /**
   * Returns the string representation of max for debugging/logging purposes.
   *
   * @return the max value as a string
   */
  public String maxAsString() {
    return stringify(genericGetMax());
  }

  abstract String stringify(T value);

  /**
   * Abstract method to return whether the min and max values fit in the given
   * size.
   *
   * @param size a size in bytes
   * @return true iff the min and max values are less than size bytes
   */
  public abstract boolean isSmallerThan(long size);

  @Override
  public String toString() {
    if (this.hasNonNullValue()) {
      if (isNumNullsSet()) {
        return String.format(
            "min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls());
      } else {
        return String.format("min: %s, max: %s, num_nulls not defined", minAsString(), maxAsString());
      }
    } else if (!this.isEmpty()) return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
    else return "no stats for this column";
  }

  /**
   * Increments the null count by one
   */
  public void incrementNumNulls() {
    num_nulls++;
  }

  /**
   * Increments the null count by the parameter value
   *
   * @param increment value to increment the null count by
   */
  public void incrementNumNulls(long increment) {
    num_nulls += increment;
  }

  /**
   * Returns the null count
   *
   * @return null count or {@code -1} if the null count is not set
   */
  public long getNumNulls() {
    return num_nulls;
  }

  /**
   * Sets the number of nulls to the parameter value
   *
   * @param nulls null count to set the count to
   * @deprecated will be removed in 2.0.0. Use {@link #getBuilderForReading(PrimitiveType)} instead.
   */
  @Deprecated
  public void setNumNulls(long nulls) {
    num_nulls = nulls;
  }

  /**
   * Returns a boolean specifying if the Statistics object is empty,
   * i.e does not contain valid statistics for the page/column yet
   *
   * @return true if object is empty, false otherwise
   */
  public boolean isEmpty() {
    return !hasNonNullValue && !isNumNullsSet();
  }

  /**
   * Returns whether there have been non-null values added to this statistics
   *
   * @return true if the values contained at least one non-null value
   */
  public boolean hasNonNullValue() {
    return hasNonNullValue;
  }

  /**
   * @return whether numNulls is set and can be used
   */
  public boolean isNumNullsSet() {
    return num_nulls >= 0;
  }

  /**
   * Sets the page/column as having a valid non-null value
   * kind of misnomer here
   */
  protected void markAsNotEmpty() {
    hasNonNullValue = true;
  }

  /**
   * @return a new independent statistics instance of this class.
   */
  public abstract Statistics copy();

  /**
   * @return the primitive type object which this statistics is created for
   */
  public PrimitiveType type() {
    return type;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy