All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.parquet.hadoop.ColumnIndexValidator Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.parquet.hadoop;

import static org.apache.parquet.hadoop.ColumnIndexValidator.Contract.MAX_ASCENDING;
import static org.apache.parquet.hadoop.ColumnIndexValidator.Contract.MAX_DESCENDING;
import static org.apache.parquet.hadoop.ColumnIndexValidator.Contract.MAX_GTEQ_VALUE;
import static org.apache.parquet.hadoop.ColumnIndexValidator.Contract.MIN_ASCENDING;
import static org.apache.parquet.hadoop.ColumnIndexValidator.Contract.MIN_DESCENDING;
import static org.apache.parquet.hadoop.ColumnIndexValidator.Contract.MIN_LTEQ_VALUE;
import static org.apache.parquet.hadoop.ColumnIndexValidator.Contract.NULL_COUNT_CORRECT;
import static org.apache.parquet.hadoop.ColumnIndexValidator.Contract.NULL_PAGE_HAS_NO_MAX;
import static org.apache.parquet.hadoop.ColumnIndexValidator.Contract.NULL_PAGE_HAS_NO_MIN;
import static org.apache.parquet.hadoop.ColumnIndexValidator.Contract.NULL_PAGE_HAS_NO_VALUES;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;
import java.util.function.Supplier;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReadStore;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.impl.ColumnReadStoreImpl;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.DummyRecordConverter;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.internal.column.columnindex.BoundaryOrder;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveComparator;
import org.apache.parquet.schema.PrimitiveStringifier;
import org.apache.parquet.schema.PrimitiveType;

public class ColumnIndexValidator {

  public enum Contract {
    MIN_LTEQ_VALUE(
        "The min value stored in the index for the page must be less than or equal to all values in the page.\n"
            + "Actual value in the page: %s\n"
            + "Min value in the index: %s\n"),
    MAX_GTEQ_VALUE(
        "The max value stored in the index for the page must be greater than or equal to all values in the page.\n"
            + "Actual value in the page: %s\n"
            + "Max value in the index: %s\n"),
    NULL_COUNT_CORRECT(
        "The null count stored in the index for the page must be equal to the number of nulls in the page.\n"
            + "Actual null count: %s\n"
            + "Null count in the index: %s\n"),
    NULL_PAGE_HAS_NO_VALUES("Only pages consisting entirely of NULL-s can be marked as a null page in the index.\n"
        + "Actual non-null value in the page: %s"),
    NULL_PAGE_HAS_NO_MIN("A null page shall not have a min value in the index\n" + "Min value in the index: %s\n"),
    NULL_PAGE_HAS_NO_MAX("A null page shall not have a max value in the index\n" + "Max value in the index: %s\n"),
    MIN_ASCENDING(
        "According to the ASCENDING boundary order, the min value for a page must be greater than or equal to the min value of the previous page.\n"
            + "Min value for the page: %s\n"
            + "Min value for the previous page: %s\n"),
    MAX_ASCENDING(
        "According to the ASCENDING boundary order, the max value for a page must be greater than or equal to the max value of the previous page.\n"
            + "Max value for the page: %s\n"
            + "Max value for the previous page: %s\n"),
    MIN_DESCENDING(
        "According to the DESCENDING boundary order, the min value for a page must be less than or equal to the min value of the previous page.\n"
            + "Min value for the page: %s\n"
            + "Min value for the previous page: %s\n"),
    MAX_DESCENDING(
        "According to the DESCENDING boundary order, the max value for a page must be less than or equal to the max value of the previous page.\n"
            + "Max value for the page: %s\n"
            + "Max value for the previous page: %s\n");

    public final String description;

    Contract(String description) {
      this.description = description;
    }
  }

  public static class ContractViolation {
    public ContractViolation(
        Contract violatedContract,
        String referenceValue,
        String offendingValue,
        int rowGroupNumber,
        int columnNumber,
        ColumnPath columnPath,
        int pageNumber) {
      this.violatedContract = violatedContract;
      this.referenceValue = referenceValue;
      this.offendingValue = offendingValue;
      this.rowGroupNumber = rowGroupNumber;
      this.columnNumber = columnNumber;
      this.columnPath = columnPath;
      this.pageNumber = pageNumber;
    }

    private final Contract violatedContract;
    private final String referenceValue;
    private final String offendingValue;
    private final int rowGroupNumber;
    private final int columnNumber;
    private final ColumnPath columnPath;
    private final int pageNumber;

    @Override
    public String toString() {
      return String.format(
          "Contract violation\nLocation: row group %d, column %d (\"%s\"), page %d\nViolated contract: "
              + violatedContract.description,
          rowGroupNumber,
          columnNumber,
          columnPath.toDotString(),
          pageNumber,
          referenceValue,
          offendingValue);
    }
  }

  static interface StatValue extends Comparable {
    int compareToValue(ColumnReader reader);

    abstract class Builder {
      final PrimitiveComparator comparator;
      final PrimitiveStringifier stringifier;

      Builder(PrimitiveType type) {
        comparator = type.comparator();
        stringifier = type.stringifier();
      }

      abstract StatValue build(ByteBuffer value);

      abstract String stringifyValue(ColumnReader reader);
    }
  }

  static StatValue.Builder getBuilder(PrimitiveType type) {
    switch (type.getPrimitiveTypeName()) {
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
      case INT96:
        return new BinaryStatValueBuilder(type);
      case BOOLEAN:
        return new BooleanStatValueBuilder(type);
      case DOUBLE:
        return new DoubleStatValueBuilder(type);
      case FLOAT:
        return new FloatStatValueBuilder(type);
      case INT32:
        return new IntStatValueBuilder(type);
      case INT64:
        return new LongStatValueBuilder(type);
      default:
        throw new IllegalArgumentException("Unsupported type: " + type);
    }
  }

  private static class BinaryStatValueBuilder extends StatValue.Builder {
    private class Value implements StatValue {
      final Binary value;

      private Value(Binary value) {
        this.value = value;
      }

      @Override
      public int compareTo(StatValue o) {
        return comparator.compare(value, ((Value) o).value);
      }

      @Override
      public int compareToValue(ColumnReader reader) {
        return comparator.compare(value, reader.getBinary());
      }

      @Override
      public String toString() {
        return stringifier.stringify(value);
      }
    }

    private BinaryStatValueBuilder(PrimitiveType type) {
      super(type);
    }

    @Override
    StatValue build(ByteBuffer value) {
      return new Value(Binary.fromConstantByteBuffer(value));
    }

    @Override
    String stringifyValue(ColumnReader reader) {
      return stringifier.stringify(reader.getBinary());
    }
  }

  private static class BooleanStatValueBuilder extends StatValue.Builder {
    private class Value implements StatValue {
      final boolean value;

      private Value(boolean value) {
        this.value = value;
      }

      @Override
      public int compareTo(StatValue o) {
        return comparator.compare(value, ((Value) o).value);
      }

      @Override
      public int compareToValue(ColumnReader reader) {
        return comparator.compare(value, reader.getBoolean());
      }

      @Override
      public String toString() {
        return stringifier.stringify(value);
      }
    }

    private BooleanStatValueBuilder(PrimitiveType type) {
      super(type);
    }

    @Override
    StatValue build(ByteBuffer value) {
      return new Value(value.get(0) != 0);
    }

    @Override
    String stringifyValue(ColumnReader reader) {
      return stringifier.stringify(reader.getBoolean());
    }
  }

  private static class DoubleStatValueBuilder extends StatValue.Builder {
    private class Value implements StatValue {
      final double value;

      private Value(double value) {
        this.value = value;
      }

      @Override
      public int compareTo(StatValue o) {
        return comparator.compare(value, ((Value) o).value);
      }

      @Override
      public int compareToValue(ColumnReader reader) {
        return comparator.compare(value, reader.getDouble());
      }

      @Override
      public String toString() {
        return stringifier.stringify(value);
      }
    }

    private DoubleStatValueBuilder(PrimitiveType type) {
      super(type);
    }

    @Override
    StatValue build(ByteBuffer value) {
      return new Value(value.getDouble(0));
    }

    @Override
    String stringifyValue(ColumnReader reader) {
      return stringifier.stringify(reader.getDouble());
    }
  }

  private static class FloatStatValueBuilder extends StatValue.Builder {
    private class Value implements StatValue {
      final float value;

      private Value(float value) {
        this.value = value;
      }

      @Override
      public int compareTo(StatValue o) {
        return comparator.compare(value, ((Value) o).value);
      }

      @Override
      public int compareToValue(ColumnReader reader) {
        return comparator.compare(value, reader.getFloat());
      }

      @Override
      public String toString() {
        return stringifier.stringify(value);
      }
    }

    private FloatStatValueBuilder(PrimitiveType type) {
      super(type);
    }

    @Override
    StatValue build(ByteBuffer value) {
      return new Value(value.getFloat(0));
    }

    @Override
    String stringifyValue(ColumnReader reader) {
      return stringifier.stringify(reader.getFloat());
    }
  }

  private static class IntStatValueBuilder extends StatValue.Builder {
    private class Value implements StatValue {
      final int value;

      private Value(int value) {
        this.value = value;
      }

      @Override
      public int compareTo(StatValue o) {
        return comparator.compare(value, ((Value) o).value);
      }

      @Override
      public int compareToValue(ColumnReader reader) {
        return comparator.compare(value, reader.getInteger());
      }

      @Override
      public String toString() {
        return stringifier.stringify(value);
      }
    }

    private IntStatValueBuilder(PrimitiveType type) {
      super(type);
    }

    @Override
    StatValue build(ByteBuffer value) {
      return new Value(value.getInt(0));
    }

    @Override
    String stringifyValue(ColumnReader reader) {
      return stringifier.stringify(reader.getInteger());
    }
  }

  private static class LongStatValueBuilder extends StatValue.Builder {
    private class Value implements StatValue {
      final long value;

      private Value(long value) {
        this.value = value;
      }

      @Override
      public int compareTo(StatValue o) {
        return comparator.compare(value, ((Value) o).value);
      }

      @Override
      public int compareToValue(ColumnReader reader) {
        return comparator.compare(value, reader.getLong());
      }

      @Override
      public String toString() {
        return stringifier.stringify(value);
      }
    }

    private LongStatValueBuilder(PrimitiveType type) {
      super(type);
    }

    @Override
    StatValue build(ByteBuffer value) {
      return new Value(value.getLong(0));
    }

    @Override
    String stringifyValue(ColumnReader reader) {
      return stringifier.stringify(reader.getLong());
    }
  }

  private static class PageValidator {
    private final int rowGroupNumber;
    private final int columnNumber;
    private final ColumnPath columnPath;
    private final int pageNumber;
    private final int maxDefinitionLevel;
    private final long nullCountInIndex;
    private long nullCountActual;
    private final boolean isNullPage;
    private final ColumnReader columnReader;
    private final List violations;
    private final Set pageViolations = EnumSet.noneOf(Contract.class);
    private final StatValue minValue;
    private final StatValue maxValue;
    private final StatValue.Builder statValueBuilder;

    PageValidator(
        PrimitiveType type,
        int rowGroupNumber,
        int columnNumber,
        ColumnPath columnPath,
        int pageNumber,
        List violations,
        ColumnReader columnReader,
        ByteBuffer minValue,
        ByteBuffer maxValue,
        ByteBuffer prevMinValue,
        ByteBuffer prevMaxValue,
        BoundaryOrder boundaryOrder,
        long nullCount,
        boolean isNullPage) {
      this.columnReader = columnReader;
      this.rowGroupNumber = rowGroupNumber;
      this.columnNumber = columnNumber;
      this.columnPath = columnPath;
      this.pageNumber = pageNumber;
      this.nullCountInIndex = nullCount;
      this.nullCountActual = 0;
      this.isNullPage = isNullPage;
      this.maxDefinitionLevel = columnReader.getDescriptor().getMaxDefinitionLevel();
      this.violations = violations;
      this.statValueBuilder = getBuilder(type);
      this.minValue = isNullPage ? null : statValueBuilder.build(minValue);
      this.maxValue = isNullPage ? null : statValueBuilder.build(maxValue);

      if (isNullPage) {
        // By specification null pages have empty byte arrays as min/max values
        validateContract(!minValue.hasRemaining(), NULL_PAGE_HAS_NO_MIN, () -> statValueBuilder
            .build(minValue)
            .toString());
        validateContract(!maxValue.hasRemaining(), NULL_PAGE_HAS_NO_MAX, () -> statValueBuilder
            .build(maxValue)
            .toString());
      } else if (prevMinValue != null) {
        validateBoundaryOrder(
            statValueBuilder.build(prevMinValue), statValueBuilder.build(prevMaxValue), boundaryOrder);
      }
    }

    void validateValuesBelongingToRow() {
      do {
        if (columnReader.getCurrentDefinitionLevel() == maxDefinitionLevel) {
          validateValue();
        } else {
          ++nullCountActual;
        }
        columnReader.consume();
      } while (columnReader.getCurrentRepetitionLevel() != 0);
    }

    void finishPage() {
      validateContract(
          nullCountInIndex == nullCountActual,
          NULL_COUNT_CORRECT,
          () -> Long.toString(nullCountActual),
          () -> Long.toString(nullCountInIndex));
    }

    void validateContract(boolean contractCondition, Contract type, Supplier value1) {
      validateContract(contractCondition, type, value1, () -> "N/A");
    }

    void validateContract(
        boolean contractCondition, Contract type, Supplier value1, Supplier value2) {
      if (!contractCondition && !pageViolations.contains(type)) {
        violations.add(new ContractViolation(
            type, value1.get(), value2.get(), rowGroupNumber, columnNumber, columnPath, pageNumber));
        pageViolations.add(type);
      }
    }

    private void validateValue() {
      validateContract(!isNullPage, NULL_PAGE_HAS_NO_VALUES, () -> statValueBuilder.stringifyValue(columnReader));
      validateContract(
          minValue.compareToValue(columnReader) <= 0,
          MIN_LTEQ_VALUE,
          () -> statValueBuilder.stringifyValue(columnReader),
          minValue::toString);
      validateContract(
          maxValue.compareToValue(columnReader) >= 0,
          MAX_GTEQ_VALUE,
          () -> statValueBuilder.stringifyValue(columnReader),
          maxValue::toString);
    }

    private void validateBoundaryOrder(
        StatValue prevMinValue, StatValue prevMaxValue, BoundaryOrder boundaryOrder) {
      switch (boundaryOrder) {
        case ASCENDING:
          validateContract(
              minValue.compareTo(prevMinValue) >= 0,
              MIN_ASCENDING,
              minValue::toString,
              prevMinValue::toString);
          validateContract(
              maxValue.compareTo(prevMaxValue) >= 0,
              MAX_ASCENDING,
              maxValue::toString,
              prevMaxValue::toString);
          break;
        case DESCENDING:
          validateContract(
              minValue.compareTo(prevMinValue) <= 0,
              MIN_DESCENDING,
              minValue::toString,
              prevMinValue::toString);
          validateContract(
              maxValue.compareTo(prevMaxValue) <= 0,
              MAX_DESCENDING,
              maxValue::toString,
              prevMaxValue::toString);
          break;
        case UNORDERED:
          // No checks necessary.
      }
    }
  }

  public static List checkContractViolations(InputFile file) throws IOException {
    List violations = new ArrayList<>();
    try (ParquetFileReader reader = ParquetFileReader.open(file)) {
      FileMetaData meta = reader.getFooter().getFileMetaData();
      MessageType schema = meta.getSchema();
      List columns = schema.getColumns();

      List blocks = reader.getFooter().getBlocks();
      int rowGroupNumber = 0;
      PageReadStore rowGroup = reader.readNextRowGroup();
      while (rowGroup != null) {
        ColumnReadStore columnReadStore = new ColumnReadStoreImpl(
            rowGroup, new DummyRecordConverter(schema).getRootConverter(), schema, null);
        List columnChunks =
            blocks.get(rowGroupNumber).getColumns();
        assert (columnChunks.size() == columns.size());
        for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
          ColumnDescriptor column = columns.get(columnNumber);
          ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
          ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
          if (columnIndex == null) {
            continue;
          }
          ColumnPath columnPath = columnChunk.getPath();
          OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
          List minValues = columnIndex.getMinValues();
          List maxValues = columnIndex.getMaxValues();
          BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
          List nullCounts = columnIndex.getNullCounts();
          List nullPages = columnIndex.getNullPages();
          long rowNumber = 0;
          ColumnReader columnReader = columnReadStore.getColumnReader(column);
          ByteBuffer prevMinValue = null;
          ByteBuffer prevMaxValue = null;
          for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
            boolean isNullPage = nullPages.get(pageNumber);
            ByteBuffer minValue = minValues.get(pageNumber);
            ByteBuffer maxValue = maxValues.get(pageNumber);
            PageValidator pageValidator = new PageValidator(
                column.getPrimitiveType(),
                rowGroupNumber,
                columnNumber,
                columnPath,
                pageNumber,
                violations,
                columnReader,
                minValue,
                maxValue,
                prevMinValue,
                prevMaxValue,
                boundaryOrder,
                nullCounts.get(pageNumber),
                isNullPage);
            if (!isNullPage) {
              prevMinValue = minValue;
              prevMaxValue = maxValue;
            }
            long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
            while (rowNumber <= lastRowNumberInPage) {
              pageValidator.validateValuesBelongingToRow();
              ++rowNumber;
            }
            pageValidator.finishPage();
          }
        }
        rowGroup.close();
        rowGroup = reader.readNextRowGroup();
        rowGroupNumber++;
      }
    }
    return violations;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy