/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.parquet.cli.commands;

import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;

import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import com.beust.jcommander.internal.Lists;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import javax.annotation.Nullable;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.CorruptStatistics;
import org.apache.parquet.Version;
import org.apache.parquet.VersionParser;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.cli.BaseCommand;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.DataPageV1;
import org.apache.parquet.column.page.DataPageV2;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeNameConverter;
import org.apache.parquet.util.DynConstructors;
import org.slf4j.Logger;

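/**
 * Checks Parquet files for the corrupt binary statistics tracked by PARQUET-251:
 * parquet-mr releases before 1.8.0 could write binary min/max statistics from reused
 * byte buffers, so the recorded values may not match the data. Readers normally discard
 * such statistics; this command reads them back anyway and verifies every page against
 * its recorded min/max and null count. (In parquet-cli this command is typically
 * registered as "check-stats", e.g. "parquet check-stats file.parquet", though the
 * registered name can vary by release.)
 */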
@Parameters(commandDescription = "Check Parquet files for corrupt page and column stats (PARQUET-251)")
public class CheckParquet251Command extends BaseCommand {

  public CheckParquet251Command(Logger console) {
    super(console);
  }

  @Parameter(description = "<files>", required = true)
  List<String> files;

  @Override
  public int run() throws IOException {
    boolean badFiles = false;
    for (String file : files) {
      String problem = check(file);
      if (problem != null) {
        badFiles = true;
        console.info("{} has corrupt stats: {}", file, problem);
      } else {
        console.info("{} has no corrupt stats", file);
      }
    }

    return badFiles ? 1 : 0;
  }

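  /*
   * Returns a description of the first corrupt-stats problem found in the file, or null
   * if its statistics check out. When the footer's createdBy version means binary stats
   * would normally be ignored, the file metadata is rebuilt with the current writer
   * version (Version.FULL_VERSION) so the possibly-corrupt stats are read back instead
   * of discarded, and every row group is replayed to compare values against them.
   */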
  private String check(String file) throws IOException {
    Path path = qualifiedPath(file);
    ParquetMetadata footer = ParquetFileReader.readFooter(getConf(), path, ParquetMetadataConverter.NO_FILTER);

    FileMetaData meta = footer.getFileMetaData();
    String createdBy = meta.getCreatedBy();
    if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
      // create fake metadata that will read corrupt stats and return them
      FileMetaData fakeMeta =
          new FileMetaData(meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);

      // get just the binary columns
      List<ColumnDescriptor> columns = Lists.newArrayList();
      Iterables.addAll(
          columns, Iterables.filter(meta.getSchema().getColumns(), new Predicate<ColumnDescriptor>() {
            @Override
            public boolean apply(@Nullable ColumnDescriptor input) {
              return input != null && input.getType() == BINARY;
            }
          }));

      // now check to see if the data is actually corrupt
      try (ParquetFileReader reader =
          new ParquetFileReader(getConf(), fakeMeta, path, footer.getBlocks(), columns)) {
        PageStatsValidator validator = new PageStatsValidator();
        for (PageReadStore pages = reader.readNextRowGroup();
            pages != null;
            pages = reader.readNextRowGroup()) {
          validator.validate(columns, pages);
          pages.close();
        }
      } catch (BadStatsException e) {
        return e.getMessage();
      }
    }

    return null;
  }

  @Override
  public List<String> getExamples() {
    return Arrays.asList("# Check file1.parquet for corrupt page and column stats", "file1.parquet");
  }

  public static class BadStatsException extends RuntimeException {
    public BadStatsException(String message) {
      super(message);
    }
  }

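  /**
   * Adapts a single, already-read data page (plus its optional dictionary page) to the
   * PageReader interface so that a column reader can be built to replay just that page.
   */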
  public class SingletonPageReader implements PageReader {
    private final DictionaryPage dict;
    private final DataPage data;

    public SingletonPageReader(DictionaryPage dict, DataPage data) {
      this.dict = dict;
      this.data = data;
    }

    @Override
    public DictionaryPage readDictionaryPage() {
      return dict;
    }

    @Override
    public long getTotalValueCount() {
      return data.getValueCount();
    }

    @Override
    public DataPage readPage() {
      return data;
    }
  }

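  /** Extracts the statistics carried in a page header, handling both v1 and v2 data pages. */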
  private static <T extends Comparable<T>> Statistics<T> getStatisticsFromPageHeader(DataPage page) {
    return page.accept(new DataPage.Visitor<Statistics<T>>() {
      @Override
      @SuppressWarnings("unchecked")
      public Statistics<T> visit(DataPageV1 dataPageV1) {
        return (Statistics<T>) dataPageV1.getStatistics();
      }

      @Override
      @SuppressWarnings("unchecked")
      public Statistics<T> visit(DataPageV2 dataPageV2) {
        return (Statistics<T>) dataPageV2.getStatistics();
      }
    });
  }

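  /*
   * Captures the min/max recorded in a page's header and throws BadStatsException when
   * any value replayed from that page falls outside the recorded range.
   */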
  private class StatsValidator<T extends Comparable<T>> {
    private final boolean hasNonNull;
    private final T min;
    private final T max;
    private final Comparator<T> comparator;

    public StatsValidator(DataPage page) {
      Statistics<T> stats = getStatisticsFromPageHeader(page);
      this.comparator = stats.comparator();
      this.hasNonNull = stats.hasNonNullValue();
      if (hasNonNull) {
        this.min = stats.genericGetMin();
        this.max = stats.genericGetMax();
      } else {
        this.min = null;
        this.max = null;
      }
    }

    public void validate(T value) {
      if (hasNonNull) {
        if (comparator.compare(min, value) > 0) {
          throw new BadStatsException("Min should be <= all values.");
        }
        if (comparator.compare(max, value) < 0) {
          throw new BadStatsException("Max should be >= all values.");
        }
      }
    }
  }

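  /*
   * Builds a PrimitiveConverter whose add* callbacks check each materialized value
   * against the page's recorded min/max. INT96 and FIXED_LEN_BYTE_ARRAY are validated
   * through the BINARY path, since both are materialized as Binary values.
   */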
  private PrimitiveConverter getValidatingConverter(final DataPage page, PrimitiveTypeName type) {
    return type.convert(new PrimitiveTypeNameConverter<PrimitiveConverter, RuntimeException>() {
      @Override
      public PrimitiveConverter convertFLOAT(PrimitiveTypeName primitiveTypeName) {
        final StatsValidator<Float> validator = new StatsValidator<Float>(page);
        return new PrimitiveConverter() {
          @Override
          public void addFloat(float value) {
            validator.validate(value);
          }
        };
      }

      @Override
      public PrimitiveConverter convertDOUBLE(PrimitiveTypeName primitiveTypeName) {
        final StatsValidator<Double> validator = new StatsValidator<Double>(page);
        return new PrimitiveConverter() {
          @Override
          public void addDouble(double value) {
            validator.validate(value);
          }
        };
      }

      @Override
      public PrimitiveConverter convertINT32(PrimitiveTypeName primitiveTypeName) {
        final StatsValidator<Integer> validator = new StatsValidator<Integer>(page);
        return new PrimitiveConverter() {
          @Override
          public void addInt(int value) {
            validator.validate(value);
          }
        };
      }

      @Override
      public PrimitiveConverter convertINT64(PrimitiveTypeName primitiveTypeName) {
        final StatsValidator<Long> validator = new StatsValidator<Long>(page);
        return new PrimitiveConverter() {
          @Override
          public void addLong(long value) {
            validator.validate(value);
          }
        };
      }

      @Override
      public PrimitiveConverter convertBOOLEAN(PrimitiveTypeName primitiveTypeName) {
        final StatsValidator<Boolean> validator = new StatsValidator<Boolean>(page);
        return new PrimitiveConverter() {
          @Override
          public void addBoolean(boolean value) {
            validator.validate(value);
          }
        };
      }

      @Override
      public PrimitiveConverter convertINT96(PrimitiveTypeName primitiveTypeName) {
        return convertBINARY(primitiveTypeName);
      }

      @Override
      public PrimitiveConverter convertFIXED_LEN_BYTE_ARRAY(PrimitiveTypeName primitiveTypeName) {
        return convertBINARY(primitiveTypeName);
      }

      @Override
      public PrimitiveConverter convertBINARY(PrimitiveTypeName primitiveTypeName) {
        final StatsValidator<Binary> validator = new StatsValidator<Binary>(page);
        return new PrimitiveConverter() {
          @Override
          public void addBinary(Binary value) {
            validator.validate(value);
          }
        };
      }
    });
  }

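  // ColumnReaderImpl lives in an internal package and its constructor is not part of
  // the public API, so the reader is constructed reflectively through DynConstructors.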
  private static final DynConstructors.Ctor<ColumnReader> COL_READER_CTOR = new DynConstructors.Builder<ColumnReader>(
          ColumnReader.class)
      .hiddenImpl(
          "org.apache.parquet.column.impl.ColumnReaderImpl",
          ColumnDescriptor.class,
          PageReader.class,
          PrimitiveConverter.class,
          VersionParser.ParsedVersion.class)
      .build();

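  /*
   * Replays every page of the requested columns. The dictionary page's bytes are copied
   * into a fresh DictionaryPage up front so the same dictionary can be handed to each
   * page's reader (a BytesInput is not guaranteed to be readable more than once).
   */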
  public class PageStatsValidator {
    public void validate(List<ColumnDescriptor> columns, PageReadStore store) {
      for (ColumnDescriptor desc : columns) {
        PageReader reader = store.getPageReader(desc);
        DictionaryPage dict = reader.readDictionaryPage();
        DictionaryPage reusableDict = null;
        if (dict != null) {
          try {
            reusableDict = new DictionaryPage(
                BytesInput.from(dict.getBytes().toByteArray()),
                dict.getDictionarySize(),
                dict.getEncoding());
          } catch (IOException e) {
            throw new ParquetDecodingException("Cannot read dictionary", e);
          }
        }
        DataPage page;
        while ((page = reader.readPage()) != null) {
          validateStatsForPage(page, reusableDict, desc);
        }
      }
    }

    private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDescriptor desc) {
      SingletonPageReader reader = new SingletonPageReader(dict, page);
      PrimitiveConverter converter = getValidatingConverter(page, desc.getType());
      Statistics<?> stats = getStatisticsFromPageHeader(page);

      long numNulls = 0;

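      // Replay every value in the page through the validating converter. A definition
      // level below the column's maximum marks a null, which is counted here and
      // checked below against the null count recorded in the page stats.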
      ColumnReader column = COL_READER_CTOR.newInstance(desc, reader, converter, null);
      for (int i = 0; i < reader.getTotalValueCount(); i += 1) {
        if (column.getCurrentDefinitionLevel() >= desc.getMaxDefinitionLevel()) {
          column.writeCurrentValueToConverter();
        } else {
          numNulls += 1;
        }
        column.consume();
      }

      if (numNulls != stats.getNumNulls()) {
        throw new BadStatsException("Number of nulls doesn't match.");
      }

      console.debug(String.format(
          "Validated stats min=%s max=%s nulls=%d for page=%s col=%s",
          stats.minAsString(),
          stats.maxAsString(),
          stats.getNumNulls(),
          page,
          Arrays.toString(desc.getPath())));
    }
  }
}