All Downloads are FREE. The search and download functionality uses the official Maven repository.

org.apache.parquet.cli.Util Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.parquet.cli;

import static org.apache.parquet.column.Encoding.BIT_PACKED;
import static org.apache.parquet.column.Encoding.DELTA_BINARY_PACKED;
import static org.apache.parquet.column.Encoding.DELTA_BYTE_ARRAY;
import static org.apache.parquet.column.Encoding.PLAIN;
import static org.apache.parquet.column.Encoding.PLAIN_DICTIONARY;
import static org.apache.parquet.column.Encoding.RLE;
import static org.apache.parquet.column.Encoding.RLE_DICTIONARY;
import static org.apache.parquet.format.Encoding.DELTA_LENGTH_BYTE_ARRAY;

import com.google.common.base.Ascii;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.hash.HashCode;
import java.util.Set;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;

/**
 * Static helpers for the {@code org.apache.parquet.cli} package: rendering byte
 * counts, column statistics, compression codecs and encodings as short strings,
 * and resolving dotted column names against a {@link MessageType}.
 */
public class Util {

  private static final long KB = 1024;
  private static final long MB = 1024 * KB;
  private static final long GB = 1024 * MB;
  private static final long TB = 1024 * GB;

  /** Display width (quotes included) used when rendering min/max statistics. */
  private static final int STAT_DISPLAY_WIDTH = 30;

  /** Marker appended to a string that was cut short. */
  private static final String ELLIPSIS = "...";

  private static final Splitter DOT = Splitter.on('.');
  private static final Joiner DOT_JOINER = Joiner.on('.');

  /**
   * Formats a byte count with the largest unit (kB, MB, GB, TB; multiples of
   * 1024) that the value strictly exceeds.
   *
   * @param bytes the byte count
   * @return the scaled value with three decimals and a unit suffix, or the raw
   *     value with two decimals and {@code " B"} when it is 1024 or less
   */
  public static String humanReadable(float bytes) {
    if (bytes > KB) {
      return scaledUnits(bytes);
    }
    return String.format("%.02f B", bytes);
  }

  /**
   * Formats a byte count with the largest unit (kB, MB, GB, TB; multiples of
   * 1024) that the value strictly exceeds.
   *
   * @param bytes the byte count
   * @return the scaled value with three decimals and a unit suffix, or the exact
   *     integer followed by {@code " B"} when it is 1024 or less
   */
  public static String humanReadable(long bytes) {
    if (bytes > KB) {
      // Scale in double precision: a float cast keeps only 24 significant bits,
      // which can perturb the printed decimals for large counts.
      return scaledUnits((double) bytes);
    }
    return String.format("%d B", bytes);
  }

  /**
   * Scales a byte count that is known to exceed one kB. The divisors are powers
   * of two, so the division itself introduces no rounding.
   */
  private static String scaledUnits(double bytes) {
    if (bytes > TB) {
      return String.format("%.03f TB", bytes / TB);
    } else if (bytes > GB) {
      return String.format("%.03f GB", bytes / GB);
    } else if (bytes > MB) {
      return String.format("%.03f MB", bytes / MB);
    } else {
      return String.format("%.03f kB", bytes / KB);
    }
  }

  /**
   * Renders the min and max of {@code stats}; the annotation is ignored.
   *
   * @param stats column statistics, may be {@code null}
   * @param annotation unused
   * @return same as {@link #minMaxAsString(Statistics)}
   * @deprecated use {@link #minMaxAsString(Statistics)} instead
   */
  @Deprecated
  public static String minMaxAsString(Statistics<?> stats, OriginalType annotation) {
    return minMaxAsString(stats);
  }

  /**
   * Renders the min and max of {@code stats} as {@code "min / max"}.
   *
   * @param stats column statistics, may be {@code null}
   * @return {@code "no stats"} when {@code stats} is {@code null}, an empty
   *     string when it reports no non-null value, otherwise both values quoted
   *     and shortened by {@link #humanReadable(String, int)}
   */
  public static String minMaxAsString(Statistics<?> stats) {
    if (stats == null) {
      return "no stats";
    }
    if (!stats.hasNonNullValue()) {
      return "";
    }
    return String.format(
        "%s / %s",
        humanReadable(stats.minAsString(), STAT_DISPLAY_WIDTH),
        humanReadable(stats.maxAsString(), STAT_DISPLAY_WIDTH));
  }

  /**
   * Renders {@code stats} as a one-line summary; the annotation is ignored.
   *
   * @param stats column statistics, may be {@code null}
   * @param count value printed as the denominator of the null ratio
   * @param annotation unused
   * @return same as {@link #toString(Statistics, long)}
   * @deprecated use {@link #toString(Statistics, long)} instead
   */
  @Deprecated
  public static String toString(Statistics<?> stats, long count, OriginalType annotation) {
    return toString(stats, count);
  }

  /**
   * Renders {@code stats} as {@code "min: <min> max: <max> nulls: <nulls>/<count>"}.
   *
   * @param stats column statistics, may be {@code null}
   * @param count value printed as the denominator of the null ratio
   * @return {@code "no stats"} when {@code stats} is {@code null}, otherwise the
   *     summary line with min and max quoted and shortened by
   *     {@link #humanReadable(String, int)}
   */
  public static String toString(Statistics<?> stats, long count) {
    if (stats == null) {
      return "no stats";
    }
    return String.format(
        "min: %s max: %s nulls: %d/%d",
        humanReadable(stats.minAsString(), STAT_DISPLAY_WIDTH),
        humanReadable(stats.maxAsString(), STAT_DISPLAY_WIDTH),
        stats.getNumNulls(),
        count);
  }

  /**
   * Wraps {@code str} in double quotes, shortening it so that the quoted result
   * is at most {@code len} characters whenever {@code len >= 5}.
   *
   * <p>A string longer than {@code len - 2} is cut to its first {@code len - 5}
   * characters followed by {@code "..."}. For {@code len < 5} no prefix is kept
   * and the result is {@code "\"...\""} (longer than {@code len}).
   *
   * @param str the text to render, may be {@code null}
   * @param len target width of the result, quotes included
   * @return the literal {@code null} (unquoted) for a {@code null} input,
   *     otherwise the quoted and possibly shortened text
   */
  public static String humanReadable(String str, int len) {
    if (str == null) {
      return "null";
    }

    StringBuilder sb = new StringBuilder();
    sb.append("\"");
    if (str.length() > len - 2) {
      // Clamp at zero so a width below 5 yields a bare ellipsis instead of a
      // StringIndexOutOfBoundsException from a negative end index.
      int keep = Math.max(0, len - 5);
      sb.append(str, 0, keep).append(ELLIPSIS);
    } else {
      sb.append(str);
    }
    sb.append("\"");

    return sb.toString();
  }

  /**
   * Renders {@code bytes} as {@code "0x"} followed by the string form of
   * {@link HashCode#fromBytes(byte[])}, shortened with {@code "..."} by
   * {@link Ascii#truncate} to at most {@code len - 2} characters after the prefix.
   *
   * @param bytes the bytes to render, may be {@code null}
   * @param len target width of the result, {@code "0x"} prefix included
   * @return the literal {@code null} for a {@code null} or empty array,
   *     otherwise the prefixed rendering
   * @throws IllegalArgumentException if {@code len} is less than 5
   */
  public static String humanReadable(byte[] bytes, int len) {
    Preconditions.checkArgument(len >= 5, "Display length must be minimum 5");
    if (bytes == null || bytes.length == 0) {
      return "null";
    }

    final String asString = HashCode.fromBytes(bytes).toString();
    return "0x" + Ascii.truncate(asString, len - 2, ELLIPSIS);
  }

  /**
   * Maps a compression codec to a one-character flag.
   *
   * @param codec the codec to abbreviate
   * @return the flag for the codec, or {@code "?"} for a codec not listed here
   */
  public static String shortCodec(CompressionCodecName codec) {
    switch (codec) {
      case UNCOMPRESSED:
        return "_";
      case SNAPPY:
        return "S";
      case GZIP:
        return "G";
      case LZO:
        return "L";
      case BROTLI:
        return "B";
      case LZ4:
        return "4";
      case LZ4_RAW:
        return "F";
      case ZSTD:
        return "Z";
      default:
        return "?";
    }
  }

  /**
   * Maps an encoding to a one-character flag: {@code "_"} for plain,
   * {@code "R"} for dictionary/RLE, {@code "D"} for the delta encodings.
   *
   * @param encoding the encoding to abbreviate
   * @param isDict whether the encoding belongs to a dictionary page; only
   *     affects {@code PLAIN_DICTIONARY}
   * @return the flag, or {@code "?"} for an encoding not listed here
   */
  public static String encodingAsString(Encoding encoding, boolean isDict) {
    switch (encoding) {
      case PLAIN:
        return "_";
      case PLAIN_DICTIONARY:
        // data pages use RLE, dictionary pages use plain
        return isDict ? "_" : "R";
      case RLE_DICTIONARY:
        return "R";
      case DELTA_BINARY_PACKED:
      case DELTA_LENGTH_BYTE_ARRAY:
      case DELTA_BYTE_ARRAY:
        return "D";
      default:
        return "?";
    }
  }

  /**
   * Summarizes encoding statistics as a short flag string.
   *
   * <p>The result starts with one flag per dictionary encoding followed by a
   * space (or two spaces when there are no dictionary pages), then the data
   * page flags {@code "R"}, {@code "_"} and {@code "D"} in that order for those
   * present, and finally {@code " F"} when both dictionary-encoded and
   * non-dictionary-encoded pages exist.
   *
   * @param encodingStats the statistics to summarize
   * @return the flag string
   */
  public static String encodingStatsAsString(EncodingStats encodingStats) {
    StringBuilder sb = new StringBuilder();
    if (encodingStats.hasDictionaryPages()) {
      for (Encoding encoding : encodingStats.getDictionaryEncodings()) {
        sb.append(encodingAsString(encoding, true));
      }
      sb.append(" ");
    } else {
      sb.append("  ");
    }

    Set<Encoding> encodings = encodingStats.getDataEncodings();
    if (encodings.contains(RLE_DICTIONARY) || encodings.contains(PLAIN_DICTIONARY)) {
      sb.append("R");
    }
    if (encodings.contains(PLAIN)) {
      sb.append("_");
    }
    if (containsDeltaEncoding(encodings)) {
      sb.append("D");
    }

    // Check for fallback and add a flag
    if (encodingStats.hasDictionaryEncodedPages() && encodingStats.hasNonDictionaryEncodedPages()) {
      sb.append(" F");
    }

    return sb.toString();
  }

  /**
   * Summarizes a set of encodings for a column as a short flag string.
   *
   * <p>When the set contains {@code RLE} or {@code BIT_PACKED}, the result
   * starts with two characters derived from the column's max definition and
   * max repetition levels ({@code "B"} when the level is 0, {@code "R"}
   * otherwise), followed by {@code "R"} for {@code PLAIN_DICTIONARY} and
   * {@code "_"} for {@code PLAIN}. Otherwise it starts with {@code "RR"},
   * followed by {@code "R"} for {@code RLE_DICTIONARY}, {@code "_"} for
   * {@code PLAIN} and {@code "D"} for any delta encoding.
   *
   * @param encodings the encodings to summarize; elements are compared against
   *     {@link Encoding} constants
   * @param desc the column whose max levels select the leading flags
   * @return the flag string
   */
  public static String encodingsAsString(Set<?> encodings, ColumnDescriptor desc) {
    StringBuilder sb = new StringBuilder();
    if (encodings.contains(RLE) || encodings.contains(BIT_PACKED)) {
      sb.append(desc.getMaxDefinitionLevel() == 0 ? "B" : "R");
      sb.append(desc.getMaxRepetitionLevel() == 0 ? "B" : "R");
      if (encodings.contains(PLAIN_DICTIONARY)) {
        sb.append("R");
      }
      if (encodings.contains(PLAIN)) {
        sb.append("_");
      }
    } else {
      sb.append("RR");
      if (encodings.contains(RLE_DICTIONARY)) {
        sb.append("R");
      }
      if (encodings.contains(PLAIN)) {
        sb.append("_");
      }
      if (containsDeltaEncoding(encodings)) {
        sb.append("D");
      }
    }
    return sb.toString();
  }

  /** Returns whether the set holds any of the three delta encodings. */
  private static boolean containsDeltaEncoding(Set<?> encodings) {
    // DELTA_LENGTH_BYTE_ARRAY is qualified on purpose: the bare name is
    // statically imported from org.apache.parquet.format.Encoding, a different
    // enum whose constant never equals an element of a column-Encoding set.
    return encodings.contains(DELTA_BYTE_ARRAY)
        || encodings.contains(DELTA_BINARY_PACKED)
        || encodings.contains(Encoding.DELTA_LENGTH_BYTE_ARRAY);
  }

  /**
   * Looks up the column descriptor for a dot-separated column name.
   *
   * @param column the column path with parts separated by {@code '.'}
   * @param schema the schema to search
   * @return the descriptor of the column
   * @throws IllegalArgumentException if the schema does not contain the path
   */
  public static ColumnDescriptor descriptor(String column, MessageType schema) {
    return schema.getColumnDescription(checkedPath(column, schema));
  }

  /**
   * Joins the path of a column descriptor with {@code '.'}.
   *
   * @param desc the column descriptor
   * @return the dot-separated column name
   */
  public static String columnName(ColumnDescriptor desc) {
    return DOT_JOINER.join(desc.getPath());
  }

  /**
   * Walks {@code path} down from the schema root and returns the first
   * primitive type encountered.
   *
   * @param schema the schema to walk
   * @param path the field names to follow, outermost first
   * @return the first primitive type reached, or {@code null} if the path is
   *     exhausted without reaching one
   */
  public static PrimitiveType primitive(MessageType schema, String[] path) {
    Type current = schema;
    for (String part : path) {
      current = current.asGroupType().getType(part);
      if (current.isPrimitive()) {
        return current.asPrimitiveType();
      }
    }
    return null;
  }

  /**
   * Looks up the primitive type for a dot-separated column name.
   *
   * @param column the column path with parts separated by {@code '.'}
   * @param schema the schema to search
   * @return the result of {@link #primitive(MessageType, String[])} for the path
   * @throws IllegalArgumentException if the schema does not contain the path
   */
  public static PrimitiveType primitive(String column, MessageType schema) {
    return primitive(schema, checkedPath(column, schema));
  }

  /** Splits a dotted column name and verifies the schema contains that path. */
  private static String[] checkedPath(String column, MessageType schema) {
    String[] path = Iterables.toArray(DOT.split(column), String.class);
    Preconditions.checkArgument(schema.containsPath(path), "Schema doesn't have column: %s", column);
    return path;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy