All Downloads are FREE. Search and download functionalities are using the official Maven repository.

a.clickzetta-java.1.3.15.source-code.statistics.proto Maven / Gradle / Ivy

There is a newer version: 2.0.0
Show newest version
syntax = "proto3";

package cz.proto;
option java_multiple_files = true;

import "expression.proto";
import "data_type.proto";

// A list of sort keys, stored as unsigned 32-bit integers.
// NOTE(review): the values are presumably field ids of the sort columns,
// listed in sort order — confirm against the producer.
message SortKeys {
  repeated uint32 keys = 1;
}

// A set of bound values keyed by field id (field id -> value), used for the
// file-level sort key lower/upper bounds in StatsData.
message FieldBounds {
  // Maps a field id to its bound value. A map field must declare its key and
  // value types; field ids are uint32 elsewhere in this file and bound values
  // are expressed as Constant (see FieldStatsValue.lower_bounds).
  // NOTE(review): the <uint32, Constant> type parameters are inferred from
  // the surrounding schema — confirm against the upstream definition.
  map<uint32, Constant> bounds = 1;
}

// Additional information that a file may carry when it is a delta file.
message DeltaUpdatedInfo {
  // Field ids of the updated columns.
  repeated uint32 updated_columns = 1;
}

// Statistics recorded for a file/table.
//
// NOTE(review): field numbers 4 and 6-11 are not used in this message. If they
// belonged to fields that were removed, they should be declared `reserved` so
// they cannot be reused — confirm with the schema history.
message StatsData {
  // NOTE(review): declared without `optional`, so an unset snapshot_id cannot
  // be distinguished from 0 — confirm this is intended.
  int64 snapshot_id = 1;
  // Size in bytes; explicit presence, so "unset" differs from 0.
  optional int64 size_in_bytes = 2;
  // Record count; explicit presence, so "unset" differs from 0.
  optional int64 record_count = 3;

  // Estimated number of records that exist in the file/table.
  optional int64 estimated_record_count = 16;

  // Net number of records added or deleted overall; the sign gives the
  // direction. It is computed from the operation type of the delta file:
  // - REINSERT: +1
  // - DELETE:   -1
  // - UPDATE:    0
  optional int64 delta_row_count_change = 14;
  // If the file is a delta file, it may contain the following information.
  //
  // If the file does not contain updated_info, even though it is a delta
  // file, we cannot know which columns were updated, so the update stats
  // cannot be used.
  //
  // Once updated_info exists, it must contain all the updated columns.
  optional DeltaUpdatedInfo updated_info = 15;

  // Per-field statistics (see FieldsStats).
  FieldsStats fields_stats = 5;

  // File-level sort key lower bound, field id -> value.
  FieldBounds sort_key_lower_bounds = 12;
  // File-level sort key upper bound, field id -> value.
  FieldBounds sort_key_upper_bounds = 13;
}

// Wrapper around a list of FieldStats entries; referenced from
// StatsData.fields_stats.
message FieldsStats {
  // The individual FieldStats entries.
  repeated FieldStats field_stats = 1;
}

// A group of statistic values associated with a field.
message FieldStats {
  // NOTE(review): this field is repeated although its name is singular;
  // presumably it is a path of field ids identifying a (possibly nested)
  // field — confirm against the producer.
  repeated uint32 field_id = 1;
  // The statistic values; each entry carries exactly one kind of statistic
  // (see the oneof in FieldStatsValue).
  repeated FieldStatsValue stats_value = 2;
}

// A single statistic value. Exactly one member of the oneof is set, so the
// set member also identifies which kind of statistic this entry holds.
message FieldStatsValue {
  oneof value {
    int64 nan_count = 1;
    // For storage, value_count does not include null data. For complex types,
    // only non-null leaves are counted.
    int64 value_count = 2;
    int64 null_count = 3;
    // Lower/upper bound values, expressed as Constant (not defined in this
    // file; presumably imported from expression.proto).
    Constant lower_bounds = 4;
    Constant upper_bounds = 5;
    double avg_size = 6;
    int64 max_size = 7;
    int64 compressed_size = 8;
    int64 distinct_number = 9;
    TopK top_k = 10;
    Histogram histogram = 11;
    // For storage: size in bytes of the columnar data before any encoding or
    // compression.
    // * For fixed-size types, it is always
    //   `value_count * parquet::GetTypeByteSize(T)`, where value_count
    //   includes null data.
    // * For variable-length types, it is
    //   `value_count * parquet::GetTypeByteSize(T) + variable-length`.
    // For complex types, leaves are counted, and a non-leaf type's raw size is
    // the sum of the raw sizes of all its leaf types.
    // NOTE(review): the value_count used in this formula includes nulls,
    // unlike the value_count field above — confirm this distinction.
    int64 raw_size_in_bytes = 12;
  }
}

// A list of Constant values used as the top-k statistic
// (see FieldStatsValue.top_k).
// NOTE(review): ordering and the value of k are not specified here — confirm.
message TopK {
  repeated Constant top_k = 1;
}

// One bucket of a Histogram: a lower bound, an upper bound and a value count.
// NOTE(review): whether each bound is inclusive or exclusive is not specified
// here — confirm.
message HistogramBucket {
  Constant lower_bound = 1;
  Constant upper_bound = 2;
  // Value count for this bucket.
  int64 value_count = 3;
}

// A histogram, represented as a list of buckets
// (see FieldStatsValue.histogram).
message Histogram {
  repeated HistogramBucket buckets = 1;
}

/*
 * Messages describing range distribution boundaries.
 */

// A concrete point made up of one or more Constant values.
// NOTE(review): presumably one value per boundary column, matching
// RangeBoundary.types by position — confirm.
message ValuePoint {
  repeated Constant values = 1;
}

// One end point of a Boundary.
message BoundaryPoint {
  // NOTE(review): presumably whether the point itself belongs to the range
  // (inclusive vs. exclusive end) — confirm.
  bool included = 1;
  // The point is either flagged as unbounded or given as a concrete value
  // point; setting one member clears the other.
  oneof value {
    bool unbounded = 2;
    ValuePoint value_point = 3;
  }
}

// A range described by a lower and an upper BoundaryPoint.
message Boundary {
  BoundaryPoint lower = 1;
  BoundaryPoint upper = 2;
}

// A set of ranges together with an id and the data types involved.
message RangeBoundary {
  uint64 id = 1;
  // DataType is not defined in this file; presumably imported from
  // data_type.proto.
  // NOTE(review): presumably the types of the values in each boundary point,
  // by position — confirm.
  repeated DataType types = 2;
  repeated Boundary ranges = 3;
}

// Associates a table name with a RangeBoundary.
message EnforceBoundary {
  string table_name = 1;
  // NOTE: singular despite the plural name; it holds one RangeBoundary, which
  // in turn contains the list of ranges.
  RangeBoundary boundaries = 2;
}

// A list of per-table boundary entries (see EnforceBoundary).
message TableBoundary {
  repeated EnforceBoundary table_boundaries = 1;
}

/*
 * End of the range distribution boundary messages.
 */

// The min and max of a column, given as input for a file.
// NOTE(review): the original wording ("file input the min and max of
// columns") is ambiguous about who supplies this — confirm.
message FieldRange {
  // Id of the field the range applies to.
  uint32 field_id = 1;
  // Data type of the field (DataType is not defined in this file; presumably
  // imported from data_type.proto).
  DataType type = 2;
  // The range itself; its lower/upper points carry the min and max.
  Boundary range = 3;
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy