
org.apache.parquet.cli.commands.ParquetMetadataCommand

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.parquet.cli.commands;

import static org.apache.parquet.cli.Util.encodingStatsAsString;
import static org.apache.parquet.cli.Util.encodingsAsString;
import static org.apache.parquet.cli.Util.humanReadable;
import static org.apache.parquet.cli.Util.minMaxAsString;
import static org.apache.parquet.cli.Util.primitive;
import static org.apache.parquet.cli.Util.shortCodec;

import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
import org.apache.commons.text.TextStringBuilder;
import org.apache.parquet.cli.BaseCommand;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.slf4j.Logger;

@Parameters(commandDescription = "Print a Parquet file's metadata")
public class ParquetMetadataCommand extends BaseCommand {

  public ParquetMetadataCommand(Logger console) {
    super(console);
  }

  @Parameter(description = "<parquet path>")
  List<String> targets;

  @Override
  @SuppressWarnings("unchecked")
  public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");

    String source = targets.get(0);
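    // Read only the file footer; NO_FILTER keeps the metadata for every row group.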
    ParquetMetadata footer =
        ParquetFileReader.readFooter(getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER);

    console.info("\nFile path:  {}", source);
    console.info("Created by: {}", footer.getFileMetaData().getCreatedBy());

    Map<String, String> kv = footer.getFileMetaData().getKeyValueMetaData();
    if (kv != null && !kv.isEmpty()) {
      console.info("Properties:");
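      // Right-align each key to the longest key so the values line up.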
      String format = "  %" + maxSize(kv.keySet()) + "s: %s";
      for (Map.Entry<String, String> entry : kv.entrySet()) {
        console.info(String.format(format, entry.getKey(), entry.getValue()));
      }
    } else {
      console.info("Properties: (none)");
    }

    MessageType schema = footer.getFileMetaData().getSchema();
    console.info("Schema:\n{}", schema);

    List<BlockMetaData> rowGroups = footer.getBlocks();
    for (int index = 0, n = rowGroups.size(); index < n; index += 1) {
      printRowGroup(console, index, rowGroups.get(index), schema);
    }

    console.info("");

    return 0;
  }

  @Override
  public List<String> getExamples() {
    return Lists.newArrayList();
  }

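  /** Returns the length of the longest string, used to size padded output columns. */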
  private int maxSize(Iterable<String> strings) {
    int size = 0;
    for (String s : strings) {
      size = Math.max(size, s.length());
    }
    return size;
  }

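  /**
   * Prints a one-line summary of a row group, then a header row and one line
   * per column chunk.
   */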
  private void printRowGroup(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
    long start = rowGroup.getStartingPos();
    long rowCount = rowGroup.getRowCount();
    long compressedSize = rowGroup.getCompressedSize();
    long uncompressedSize = rowGroup.getTotalByteSize();
    String filePath = rowGroup.getPath();

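    // The "%s records" field below is the average compressed size per record.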
    console.info(String.format(
        "\nRow group %d:  count: %d  %s records  start: %d  total(compressed): %s  total(uncompressed): %s  %s\n%s",
        index,
        rowCount,
        humanReadable(((float) compressedSize) / rowCount),
        start,
        humanReadable(compressedSize),
        humanReadable(uncompressedSize),
        filePath != null ? "path: " + filePath : "",
        new TextStringBuilder(80).appendPadding(80, '-')));

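    // Size the name column to the longest dotted column path in this row group.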
    int size = maxSize(Iterables.transform(rowGroup.getColumns(), new Function<ColumnChunkMetaData, String>() {
      @Override
      public String apply(@Nullable ColumnChunkMetaData input) {
        return input == null ? "" : input.getPath().toDotString();
      }
    }));

    console.info(String.format(
        "%-" + size + "s  %-9s %-9s %-9s %-10s %-7s %s",
        "",
        "type",
        "encodings",
        "count",
        "avg size",
        "nulls",
        "min / max"));
    for (ColumnChunkMetaData column : rowGroup.getColumns()) {
      printColumnChunk(console, size, column, schema);
    }
  }

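  /**
   * Prints one line for a column chunk: type, codec, encodings, value count,
   * average value size, null count (when recorded), and min/max values.
   */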
  private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
    String[] path = column.getPath().toArray();
    PrimitiveType type = primitive(schema, path);
    Preconditions.checkNotNull(type);

    ColumnDescriptor desc = schema.getColumnDescription(path);
    long size = column.getTotalSize();
    long count = column.getValueCount();
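    // Average compressed bytes per value in this chunk.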
    float perValue = ((float) size) / count;
    CompressionCodecName codec = column.getCodec();
    Set<Encoding> encodings = column.getEncodings();
    EncodingStats encodingStats = column.getEncodingStats();
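    // Prefer per-page encoding stats when the writer recorded them; otherwise
    // fall back to the chunk's flat set of encodings.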
    String encodingSummary =
        encodingStats == null ? encodingsAsString(encodings, desc) : encodingStatsAsString(encodingStats);
    Statistics stats = column.getStatistics();

    String name = column.getPath().toDotString();

    PrimitiveType.PrimitiveTypeName typeName = type.getPrimitiveTypeName();
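    // Fixed-length binary shows its length in the type column; other types
    // print the primitive type name directly.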
    if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
      console.info(String.format(
          "%-" + width + "s  FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
          name,
          type.getTypeLength(),
          shortCodec(codec),
          encodingSummary,
          count,
          humanReadable(perValue),
          stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
          minMaxAsString(stats)));
    } else {
      console.info(String.format(
          "%-" + width + "s  %-9s %s %-7s %-9d %-10s %-7s %s",
          name,
          typeName,
          shortCodec(codec),
          encodingSummary,
          count,
          humanReadable(perValue),
          stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
          minMaxAsString(stats)));
    }
  }
}
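
For context, here is a minimal standalone sketch (not part of the file above) of the same footer-only read that run() performs. The file path and class name are hypothetical; it uses the same ParquetFileReader.readFooter overload the command itself calls (deprecated in newer releases but still available):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical file path; only the footer is read, no row-group data is decoded.
    ParquetMetadata footer = ParquetFileReader.readFooter(
        new Configuration(), new Path("file:/tmp/example.parquet"), ParquetMetadataConverter.NO_FILTER);
    System.out.println("Created by: " + footer.getFileMetaData().getCreatedBy());
    System.out.println(footer.getFileMetaData().getSchema());
  }
}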