// tensorflow_metadata.proto.v0.schema.proto Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of scio-tensorflow_2.12 Show documentation
// Scio add-on for TensorFlow
// There is a newer version: 0.6.0
// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================

syntax = "proto2";

package tensorflow.metadata.v0;

import "google/protobuf/any.proto";
import "tensorflow_metadata/proto/v0/derived_feature.proto";
import "tensorflow_metadata/proto/v0/path.proto";

// GOOGLE-LEGACY option jspb_use_correct_proto2_semantics = false;  
option cc_enable_arenas = true;
option java_package = "org.tensorflow.metadata.v0";
option java_multiple_files = true;

// LifecycleStage. Only UNKNOWN_STAGE, BETA, PRODUCTION, and VALIDATION_DERIVED
// features are actually validated.
// PLANNED, ALPHA, DISABLED, and DEBUG_ONLY are treated as DEPRECATED.
enum LifecycleStage {
  // Unknown stage. This is the first (zero) value of the enum.
  UNKNOWN_STAGE = 0;

  // Planned feature, may not be created yet.
  PLANNED = 1;

  // Prototype feature, not used in experiments yet.
  ALPHA = 2;

  // Used in user-facing experiments.
  BETA = 3;

  // Used in a significant fraction of user traffic.
  PRODUCTION = 4;

  // No longer supported: do not use in new models.
  DEPRECATED = 5;

  // Only exists for debugging purposes.
  DEBUG_ONLY = 6;

  // Generic indication that feature is disabled / excluded
  // from models, regardless of specific reason.
  DISABLED = 7;

  // Indicates that this feature was derived from ordinary
  // features for the purposes of statistics generation or
  // validation. Consumers should expect that this feature
  // may be present in DatasetFeatureStatistics, but not in
  // input data.
  // Experimental and subject to change.
  VALIDATION_DERIVED = 9;

  // Value 8 is reserved and must not be assigned to a new stage.
  reserved 8;
}

//
// Message to represent schema information.
// NextID: 15
message Schema {
  // Features described in this schema.
  repeated Feature feature = 1;

  // Sparse features described in this schema.
  repeated SparseFeature sparse_feature = 6;

  // Weighted features described in this schema.
  repeated WeightedFeature weighted_feature = 12;

  // Use StructDomain instead.
  // Sequences described in this schema. A sequence may be described in terms of
  // several features. Any features appearing within a sequence must *not* be
  // declared as top-level features in `feature`.
// GOOGLE-LEGACY   repeated Sequence sequence = 2;

  // String domains referenced in the features.
  repeated StringDomain string_domain = 4;

  // TOP LEVEL FLOAT AND INT DOMAINS ARE UNSUPPORTED IN TFDV.
  // TODO(b/63664182): Support this.
  // top level float domains that can be reused by features
  repeated FloatDomain float_domain = 9;

  // top level int domains that can be reused by features
  repeated IntDomain int_domain = 10;

  // Default environments for each feature.
  // An environment represents both a type of location (e.g. a server or phone)
  // and a time (e.g. right before model X is run). In the standard scenario,
  // 99% of the features should be in the default environments TRAINING,
  // SERVING, and the LABEL (or labels) AND WEIGHT is only available at TRAINING
  // (not at serving).
  // Other possible variations:
  // 1. There may be TRAINING_MOBILE, SERVING_MOBILE, TRAINING_SERVICE,
  //    and SERVING_SERVICE.
  // 2. If one is ensembling three models, where the predictions of the first
  //    three models are available for the ensemble model, there may be
  //    TRAINING, SERVING_INITIAL, SERVING_ENSEMBLE.
  // See Feature::not_in_environment and Feature::in_environment.
  repeated string default_environment = 5;

  /* BEGIN GOOGLE-LEGACY
  // TODO(b/73109633): Change default to false, before removing this field.
  optional bool generate_legacy_feature_spec = 7 [default = true];
  END GOOGLE-LEGACY */

  // Whether to represent variable length features as RaggedTensors. By default
  // they are represented as ragged left-aligned SparseTensors. RaggedTensor
  // representation is more memory efficient. Therefore, turning this on will
  // likely yield data processing performance improvement.
  // Experimental and may be subject to change.
  optional bool represent_variable_length_as_ragged = 14;

  // Additional information about the schema as a whole. Features may also
  // be annotated individually.
  optional Annotation annotation = 8;

  // Dataset-level constraints. This is currently used for specifying
  // information about changes in num_examples.
  optional DatasetConstraints dataset_constraints = 11;

  // TensorRepresentation groups. The keys are the names of the groups.
  // Key "" (empty string) denotes the "default" group, which is what should
  // be used when a group name is not provided.
  // See the documentation at TensorRepresentationGroup for more info.
  // Under development.
  map<string, TensorRepresentationGroup> tensor_representation_group = 13;
}

// A list of ValueCount constraints. Used by Feature.value_counts, where one
// ValueCount is provided for each nest level of a feature with nested values.
message ValueCountList {
  // The per-level value count constraints.
  repeated ValueCount value_count = 1;
}

// Describes schema-level information about a specific feature.
// NextID: 36
message Feature {
  // The name of the feature.
  optional string name = 1;  // required

  // This field is no longer supported. Instead, use:
  // lifecycle_stage: DEPRECATED
  // TODO(b/111450258): remove this.
  optional bool deprecated = 2 [deprecated = true];

  // Comment field for a human readable description of the field.
  // TODO(b/123518108): remove this.
// GOOGLE-LEGACY   optional string comment = 3 [deprecated = true];

  // Presence constraints for this feature. At most one member of the oneof
  // can be set.
  oneof presence_constraints {
    // Constraints on the presence of this feature in the examples.
    FeaturePresence presence = 14;
    // Only used in the context of a "group" context, e.g., inside a sequence.
    FeaturePresenceWithinGroup group_presence = 17;
  }

  // The shape of the feature which governs the number of values that appear in
  // each example.
  oneof shape_type {
    // The feature has a fixed shape corresponding to a multi-dimensional
    // tensor.
    FixedShape shape = 23;
    // The feature doesn't have a well defined shape. All we know are limits on
    // the minimum and maximum number of values.
    ValueCount value_count = 5;
    // Captures the same information as value_count but for features with
    // nested values. A ValueCount is provided for each nest level.
    ValueCountList value_counts = 32;
  }

  // Physical type of the feature's values.
  // Note that you can have:
  // type: BYTES
  // int_domain: {
  //   min: 0
  //   max: 3
  // }
  // This would be a field that is syntactically BYTES (i.e. strings), but
  // semantically an int, i.e. it would be "0", "1", "2", or "3".
  optional FeatureType type = 6;

  // Domain for the values of the feature. At most one member of the oneof
  // can be set.
  oneof domain_info {
    // Reference to a domain defined at the schema level.
    // NOTE THAT TFDV ONLY SUPPORTS STRING DOMAINS AT THE TOP LEVEL.
    // TODO(b/63664182): Support this.
    string domain = 7;
    // Inline definitions of domains.
    IntDomain int_domain = 9;
    FloatDomain float_domain = 10;
    StringDomain string_domain = 11;
    BoolDomain bool_domain = 13;
    StructDomain struct_domain = 29;
    // Supported semantic domains.
    NaturalLanguageDomain natural_language_domain = 24;
    ImageDomain image_domain = 25;
    MIDDomain mid_domain = 26;
    URLDomain url_domain = 27;
    TimeDomain time_domain = 28;
    TimeOfDayDomain time_of_day_domain = 30;
  }

  // Constraints on the distribution of the feature values.
  // Only supported for StringDomains.
  optional DistributionConstraints distribution_constraints = 15;

  // Additional information about the feature for documentation purpose.
  optional Annotation annotation = 16;

  // Tests comparing the distribution to the associated serving data.
  optional FeatureComparator skew_comparator = 18;

  // Tests comparing the distribution between two consecutive spans (e.g. days).
  optional FeatureComparator drift_comparator = 21;

  // List of environments this feature is present in.
  // Should be disjoint from not_in_environment.
  // This feature is in environment "foo" if:
  // ("foo" is in in_environment or default_environment) AND
  // "foo" is not in not_in_environment.
  // See Schema::default_environment.
  repeated string in_environment = 20;

  // List of environments this feature is not present in.
  // Should be disjoint from in_environment.
  // See Schema::default_environment and in_environment.
  repeated string not_in_environment = 19;

  // The lifecycle stage of a feature. It can also apply to its descendants.
  // i.e., if a struct is DEPRECATED, its children are implicitly deprecated.
  optional LifecycleStage lifecycle_stage = 22;

  // Constraints on the number of unique values for a given feature.
  // This is supported for string and categorical features only.
  optional UniqueConstraints unique_constraints = 31;

  // If set, indicates that this feature is derived, and stores metadata
  // about its source. If this field is set, this feature should have a
  // disabled stage (PLANNED, ALPHA, DEPRECATED, DISABLED, DEBUG_ONLY), or
  // lifecycle_stage VALIDATION_DERIVED.
  // Experimental and subject to change.
  optional DerivedFeatureSource validation_derived_source = 34;
  // Field number 33 is reserved and must not be reused.
  reserved 33;

  // This field specifies if this feature could be treated as a sequence
  // feature which has meaningful element order.
  optional SequenceMetadata sequence_metadata = 35;
}

// Additional information about the schema or about a feature.
// Used by Schema.annotation and Feature.annotation.
message Annotation {
  // Tags can be used to mark features. For example, tag on user_age feature can
  // be `user_feature`, tag on user_country feature can be `location_feature`,
  // `user_feature`.
  repeated string tag = 1;
  // Free-text comments. This can be used as a description of the feature,
  // developer notes etc.
  repeated string comment = 2;
  // Application-specific metadata may be attached here. Each entry is a
  // google.protobuf.Any, so it can wrap an arbitrary message type.
  repeated .google.protobuf.Any extra_metadata = 3;
}

// Checks that the ratio of the current value to the previous value is not below
// the min_fraction_threshold or above the max_fraction_threshold. That is,
// previous value * min_fraction_threshold <= current value <=
// previous value * max_fraction_threshold.
// To specify that the value cannot change, set both min_fraction_threshold and
// max_fraction_threshold to 1.0.
message NumericValueComparator {
  // Lower bound on the ratio (current value / previous value).
  optional double min_fraction_threshold = 1;
  // Upper bound on the ratio (current value / previous value).
  optional double max_fraction_threshold = 2;
}

// Constraints on the entire dataset. Referenced from
// Schema.dataset_constraints.
message DatasetConstraints {
  // Tests differences in number of examples between the current data and the
  // previous span.
  optional NumericValueComparator num_examples_drift_comparator = 1;
  // Tests comparisons in number of examples between the current data and the
  // previous version of that data.
  optional NumericValueComparator num_examples_version_comparator = 2;
  // Minimum number of examples in the dataset.
  optional int64 min_examples_count = 3;
  // Maximum number of examples in the dataset.
  optional int64 max_examples_count = 4;
}

// Specifies a fixed shape for the feature's values. The immediate implication
// is that each feature has a fixed number of values. Moreover, these values
// can be parsed in a multi-dimensional tensor using the specified axis sizes.
// The FixedShape defines a lexicographical ordering of the data. For instance,
// if there is a FixedShape {
//   dim {size:3} dim {size:2}
// }
// then tensor[0][0]=field[0]
// then tensor[0][1]=field[1]
// then tensor[1][0]=field[2]
// then tensor[1][1]=field[3]
// then tensor[2][0]=field[4]
// then tensor[2][1]=field[5]
// (i.e. the last dimension varies fastest).
//
// The FixedShape message is identical with the tensorflow.TensorShape proto
// message for fully defined shapes. The FixedShape message cannot represent
// unknown dimensions or an unknown rank.
message FixedShape {
  // The dimensions that define the shape. The total number of values in each
  // example is the product of sizes of each dimension.
  repeated Dim dim = 2;

  // An axis in a multi-dimensional feature representation.
  message Dim {
    // Number of entries along this axis (e.g. 3 and 2 in the example above).
    optional int64 size = 1;

    // Optional name of the tensor dimension.
    optional string name = 2;
  }
}

// Limits on maximum and minimum number of values in a
// single example (when the feature is present). Use this when the minimum
// value count can be different than the maximum value count. Otherwise prefer
// FixedShape.
message ValueCount {
  // Minimum number of values in a single example.
  optional int64 min = 1;
  // Maximum number of values in a single example.
  optional int64 max = 2;
}

/* BEGIN GOOGLE-LEGACY
// Constraint on the number of elements in a sequence.
message LengthConstraint {
  optional int64 min = 1;
  optional int64 max = 2;
}

// A sequence is a logical feature that comprises several "raw" features that
// encode values at different "steps" within the sequence.
// TODO(b/110490010): Delete this. This is a special case of StructDomain.
message Sequence {
  // An optional name for this sequence. Used mostly for debugging and
  // presentation.
  optional string name = 1;

  // Features that comprise the sequence. These features are "zipped" together
  // to form the values for the sequence at different steps.
  // - Use group_presence within each feature to encode presence constraints
  //   within the sequence.
  // - If all features have the same value-count constraints then
  //   declare this once using the shape_constraint below.
  repeated Feature feature = 2;

  // Constraints on the presence of the sequence across all examples in the
  // dataset. The sequence is assumed to be present if at least one of its
  // features is present.
  optional FeaturePresence presence = 3;

  // Shape constraints that apply on all the features that comprise the
  // sequence. If this is set then the value_count in 'feature' is
  // ignored.
  // TODO(martinz): delete: there is no reason to believe the shape of the
  // fields in a sequence will be the same. Use the fields in Feature instead.
  oneof shape_constraint {
    ValueCount value_count = 4;
    FixedShape fixed_shape = 5;
  }

  // Constraint on the number of elements in a sequence.
  optional LengthConstraint length_constraint = 6;
}
END GOOGLE-LEGACY */

// Represents a weighted feature that is encoded as a combination of raw base
// features. The `weight_feature` should be a float feature with identical
// shape as the `feature`. This is useful for representing weights associated
// with categorical tokens (e.g. a TFIDF weight associated with each token).
// Listed at the schema level in Schema.weighted_feature.
// TODO(b/142122960): Handle WeightedCategorical end to end in TFX (validation,
// TFX Unit Testing, etc)
message WeightedFeature {
  // Name for the weighted feature. This should not clash with other features in
  // the same schema.
  optional string name = 1;  // required
  // Path of a base feature to be weighted. Required.
  optional Path feature = 2;
  // Path of weight feature to associate with the base feature. Must be same
  // shape as feature. Required.
  optional Path weight_feature = 3;
  // The lifecycle_stage determines where a feature is expected to be used,
  // and therefore how important issues with it are.
  optional LifecycleStage lifecycle_stage = 4;
}

// A sparse feature represents a sparse tensor that is encoded with a
// combination of raw features, namely index features and a value feature. Each
// index feature defines a list of indices in a different dimension.
message SparseFeature {
  // Field number 11 is reserved and must not be reused.
  reserved 11;
  // Name for the sparse feature. This should not clash with other features in
  // the same schema.
  optional string name = 1;  // required

  // This field is no longer supported. Instead, use:
  // lifecycle_stage: DEPRECATED
  // TODO(b/111450258): remove this.
  optional bool deprecated = 2 [deprecated = true];

  // The lifecycle_stage determines where a feature is expected to be used,
  // and therefore how important issues with it are.
  optional LifecycleStage lifecycle_stage = 7;

  // Comment field for a human readable description of the field.
  // TODO(martinz): delete, convert to annotation.
// GOOGLE-LEGACY   optional string comment = 3 [deprecated = true];

  // Constraints on the presence of this feature in examples.
  // Deprecated, this is inferred by the referred features.
  optional FeaturePresence presence = 4 [deprecated = true];

  // Shape of the sparse tensor that this SparseFeature represents.
  // Currently not supported.
  // TODO(b/109669962): Consider deriving this from the referred features.
  optional FixedShape dense_shape = 5;

  // Features that represent indexes. Should be integers >= 0.
  repeated IndexFeature index_feature = 6;  // at least one
  // Reference to a feature that supplies the indices for one dimension.
  message IndexFeature {
    // Name of the index-feature. This should be a reference to an existing
    // feature in the schema.
    optional string name = 1;
  }

  // If true then the index values are already sorted lexicographically.
  optional bool is_sorted = 8;

  // The feature that supplies the values of the sparse tensor.
  optional ValueFeature value_feature = 9;  // required
  // Reference to the feature that supplies the values.
  message ValueFeature {
    // Name of the value-feature. This should be a reference to an existing
    // feature in the schema.
    optional string name = 1;
  }

  // Type of value feature.
  // Deprecated, this is inferred by the referred features.
  optional FeatureType type = 10 [deprecated = true];
}

// Models constraints on the distribution of a feature's values.
// Referenced from Feature.distribution_constraints.
// TODO(martinz): replace min_domain_mass with max_off_domain (but slowly).
message DistributionConstraints {
  // The minimum fraction (in [0,1]) of values across all examples that
  // should come from the feature's domain, e.g.:
  //   1.0  => All values must come from the domain.
  //    .9  => At least 90% of the values must come from the domain.
  // Defaults to 1.0 when unset.
  optional double min_domain_mass = 1 [default = 1.0];
}

// Encodes vocabulary coverage constraints.
// Referenced from NaturalLanguageDomain.coverage.
message FeatureCoverageConstraints {
  // Lower bound on the fraction of feature values that map to a vocab entry
  // (i.e. are not oov).
  optional float min_coverage = 1;
  // Lower bound on the average length of tokens. Used for cases such as
  // wordpiece that fallback to character-level tokenization.
  optional float min_avg_token_length = 2;

  // String tokens to exclude when calculating min_coverage and
  // min_avg_token_length. Useful for tokens such as [PAD].
  repeated string excluded_string_tokens = 3;

  // Integer tokens to exclude when calculating min_coverage and
  // min_avg_token_length.
  repeated int64 excluded_int_tokens = 4 [packed = true];

  // String tokens to treat as oov tokens (e.g. [UNK]). These tokens are also
  // excluded when calculating avg token length.
  repeated string oov_string_tokens = 5;
}

// Encodes constraints on specific values in sequences.
// Referenced from NaturalLanguageDomain.token_constraints.
message SequenceValueConstraints {
  // The value for which to express constraints. Can be either an integer or
  // a string (at most one member of the oneof can be set).
  oneof value {
    int64 int_value = 1;
    string string_value = 2;
  }

  // Min / max number of times the value can occur in a sequence.
  optional int64 min_per_sequence = 3;
  optional int64 max_per_sequence = 4;

  // Min / max fraction of sequences that must contain the value.
  optional float min_fraction_of_sequences = 5;
  optional float max_fraction_of_sequences = 6;
}

// Encodes constraints on sequence lengths.
// Referenced from NaturalLanguageDomain.sequence_length_constraints.
message SequenceLengthConstraints {
  // Token values (int and string) that are excluded when calculating sequence
  // length.
  repeated int64 excluded_int_value = 1;
  repeated string excluded_string_value = 2;

  // Min / max sequence length.
  optional int64 min_sequence_length = 3;
  optional int64 max_sequence_length = 4;
}

// Encodes information for domains of integer values.
// Note that FeatureType could be either INT or BYTES (see the example on
// Feature.type, where a BYTES feature carries an int_domain).
message IntDomain {
  // Id of the domain. Required if the domain is defined at the schema level. If
  // so, then the name must be unique within the schema.
  optional string name = 1;

  // Min and max values for the domain.
  optional int64 min = 3;
  optional int64 max = 4;

  // If true then the domain encodes categorical values (i.e., ids) rather than
  // ordinal values.
  optional bool is_categorical = 5;
}

// Encodes information for domains of float values.
// Note that FeatureType could be either INT or BYTES.
// NOTE(review): the line above uses the same wording as IntDomain; for a float
// domain "FLOAT or BYTES" was presumably intended -- confirm.
message FloatDomain {
  // Id of the domain. Required if the domain is defined at the schema level. If
  // so, then the name must be unique within the schema.
  optional string name = 1;

  // Min and max values of the domain.
  optional float min = 3;
  optional float max = 4;

  // If true, feature should not contain NaNs.
  optional bool disallow_nan = 5;
  // If true, feature should not contain Inf or -Inf.
  optional bool disallow_inf = 6;
  // If true, this indicates that the feature is semantically an embedding. This
  // can be useful for distinguishing fixed dimensional numeric features that
  // should be fed to a model unmodified.
  optional bool is_embedding = 7;

  // If true then the domain encodes categorical values (i.e., ids) rather than
  // continuous values.
  optional bool is_categorical = 8;

  // This field specifies the embedding dimension and is only applicable if
  // is_embedding is true. It is useful for use cases such as restoring shapes
  // for flattened sequence of embeddings.
  optional int64 embedding_dim = 9;

  // Specifies the semantic type of the embedding e.g. sbv4_semantic or pulsar.
  optional string embedding_type = 10;
}

// Domain for a recursive struct.
// NOTE: If a feature with a StructDomain is deprecated, then all the
// child features (features and sparse_features of the StructDomain) are also
// considered to be deprecated.  Similarly child features can only be in
// environments of the parent feature.
message StructDomain {
  // Child features of the struct. Because Feature can itself carry a
  // struct_domain, this nesting is recursive.
  repeated Feature feature = 1;

  // Child sparse features of the struct.
  repeated SparseFeature sparse_feature = 2;
}

// Encodes information for domains of string values.
message StringDomain {
  // Id of the domain. Required if the domain is defined at the schema level. If
  // so, then the name must be unique within the schema.
  optional string name = 1;

  // The values appearing in the domain.
  repeated string value = 2;

  // Currently unused
  // StringDomain consists of Categorical. This enum allows the user to
  // specify whether to treat the feature as categorical.
  enum Categorical {
    CATEGORICAL_UNSPECIFIED = 0;
    CATEGORICAL_YES = 1;
    CATEGORICAL_NO = 2;
  }
  // See the Categorical enum above (documented there as currently unused).
  optional Categorical is_categorical = 3;
}

// Encodes information about the domain of a boolean attribute that encodes its
// TRUE/FALSE values as strings, or 0=false, 1=true.
// Note that FeatureType could be either INT or BYTES.
message BoolDomain {
  // Id of the domain. Required if the domain is defined at the schema level. If
  // so, then the name must be unique within the schema.
  optional string name = 1;

  // String values for TRUE/FALSE.
  optional string true_value = 2;
  optional string false_value = 3;
}

// BEGIN SEMANTIC-TYPES-PROTOS
// Semantic domains are specialized feature domains. For example a string
// Feature might represent a Time of a specific format.
// Semantic domains are defined as protocol buffers to allow further sub-types /
// specialization, e.g: NaturalLanguageDomain can provide information on the
// language of the text.

// Natural language text.
message NaturalLanguageDomain {
  // Name of the vocabulary associated with the NaturalLanguageDomain.
  // When computing and validating stats using TFDV,
  // tfdv.StatsOptions.vocab_paths should map this name to a vocabulary file.
  optional string vocabulary = 1;
  // Vocabulary coverage constraints; see FeatureCoverageConstraints.
  optional FeatureCoverageConstraints coverage = 2;
  // Constraints on specific token values; see SequenceValueConstraints.
  repeated SequenceValueConstraints token_constraints = 3;
  // Constraints on sequence lengths; see SequenceLengthConstraints.
  optional SequenceLengthConstraints sequence_length_constraints = 5;

  // Field number 4 is reserved and must not be reused.
  reserved 4;
}

// Image data. Both constraints below are optional and only apply when set.
message ImageDomain {
  // If set, at least this fraction of values should be TensorFlow supported
  // images.
  optional float minimum_supported_image_fraction = 1;

  // If set, image should have less than this value of undecoded byte size.
  optional int64 max_image_byte_size = 2;
}

// Knowledge graph ID, see: https://www.wikidata.org/wiki/Property:P646
// This message declares no fields; it is selected via Feature.mid_domain.
message MIDDomain {}

// A URL, see: https://en.wikipedia.org/wiki/URL
// This message declares no fields; it is selected via Feature.url_domain.
message URLDomain {}

// Time or date representation.
message TimeDomain {
  // Formats for times that are encoded as integers.
  // Note that UNIX_DAYS is declared first among the UNIX_* values but carries
  // the number 5.
  enum IntegerTimeFormat {
    FORMAT_UNKNOWN = 0;
    UNIX_DAYS = 5;  // Number of days since 1970-01-01.
    UNIX_SECONDS = 1;
    UNIX_MILLISECONDS = 2;
    UNIX_MICROSECONDS = 3;
    UNIX_NANOSECONDS = 4;
  }

  // The expected encoding of the time value: either a string format or an
  // integer format (at most one can be set).
  oneof format {
    // Expected format that contains a combination of regular characters and
    // special format specifiers. Format specifiers are a subset of the
    // strptime standard.
    string string_format = 1;

    // Expected format of integer times.
    IntegerTimeFormat integer_format = 2;
  }
}

// Time of day, without a particular date.
message TimeOfDayDomain {
  // Formats for times of day that are encoded as integers.
  enum IntegerTimeOfDayFormat {
    FORMAT_UNKNOWN = 0;
    // Time values, containing hour/minute/second/nanos, encoded into 8-byte
    // bit fields following the ZetaSQL convention:
    //        6         5         4         3         2         1
    // MSB 3210987654321098765432109876543210987654321098765432109876543210 LSB
    //                      | H ||  M ||  S ||---------- nanos -----------|
    PACKED_64_NANOS = 1;
  }

  // The expected encoding of the time-of-day value: either a string format or
  // an integer format (at most one can be set).
  oneof format {
    // Expected format that contains a combination of regular characters and
    // special format specifiers. Format specifiers are a subset of the
    // strptime standard.
    string string_format = 1;

    // Expected format of integer times.
    IntegerTimeOfDayFormat integer_format = 2;
  }
}
// END SEMANTIC-TYPES-PROTOS

// Describes the physical representation of a feature.
// It may be different than the logical representation, which
// is represented as a Domain. For example, Feature.type documents a BYTES
// feature whose domain is an IntDomain.
enum FeatureType {
  // First (zero) value of the enum.
  TYPE_UNKNOWN = 0;
  BYTES = 1;
  INT = 2;
  FLOAT = 3;
  STRUCT = 4;
}

// Describes constraints on the presence of the feature in the data.
// Referenced from Feature.presence.
message FeaturePresence {
  // Minimum fraction of examples that have this feature.
  optional double min_fraction = 1;
  // Minimum number of examples that have this feature.
  optional int64 min_count = 2;
}

// Records constraints on the presence of a feature inside a "group" context
// (e.g., .presence inside a group of features that define a sequence).
// Referenced from Feature.group_presence.
message FeaturePresenceWithinGroup {
  // NOTE(review): presumably indicates that the feature must be present
  // within the group -- confirm against the consumers of this field.
  optional bool required = 1;
}

// Checks that the L-infinity norm is below a certain threshold between the
// two discrete distributions. Since this is applied to a FeatureNameStatistics,
// it only considers the top k.
// L_infty(p,q) = max_i |p_i-q_i|
message InfinityNorm {
  // Threshold that the norm is checked against.
  // The InfinityNorm is in the interval [0.0, 1.0] so sensible bounds should
  // be in the interval [0.0, 1.0).
  optional double threshold = 1;
}

// Selects which histogram is used for numeric drift and skew calculations.
// Referenced from JensenShannonDivergence.source.
message HistogramSelection {
  // Type controls the source of the histogram used for numeric drift and
  // skew calculations. Currently the default is STANDARD. Calculations
  // based on QUANTILES are more robust to outliers.
  enum Type {
    DEFAULT = 0;
    QUANTILES = 1;
    STANDARD = 2;
  }
  // The selected histogram source; see Type above.
  optional Type type = 1;
}

// Checks that the approximate Jensen-Shannon Divergence is below a certain
// threshold between the two distributions.
message JensenShannonDivergence {
  // Threshold that the divergence is checked against.
  // The JensenShannonDivergence will be in the interval [0.0, 1.0] so sensible
  // bounds should be in the interval [0.0, 1.0).
  optional double threshold = 1;
  // Which histogram the calculation is based on; see HistogramSelection.
  optional HistogramSelection source = 2;
}

// Checks that the absolute count difference relative to the total count of both
// datasets is small. This metric is appropriate for comparing datasets that
// are expected to have similar absolute counts, and not necessarily just
// similar distributions.
// Computed as max_i | x_i - y_i |  / sum_i(x_i + y_i) for aligned datasets
// x and y. Results will be in the interval [0.0, 1.0] so sensible bounds should
// be in the interval [0.0, 1.0).
message NormalizedAbsoluteDifference {
  // Threshold that the computed value is checked against.
  optional double threshold = 1;
}

// Groups the distribution comparison checks that can be configured for a
// feature. Used by Feature.skew_comparator and Feature.drift_comparator.
message FeatureComparator {
  // L-infinity norm check; see InfinityNorm.
  optional InfinityNorm infinity_norm = 1;
  // Approximate Jensen-Shannon divergence check; see JensenShannonDivergence.
  optional JensenShannonDivergence jensen_shannon_divergence = 2;
  // Normalized absolute count difference check; see
  // NormalizedAbsoluteDifference.
  optional NormalizedAbsoluteDifference normalized_abs_difference = 3;
}

// Checks that the number of unique values is greater than or equal to the min,
// and less than or equal to the max.
// Referenced from Feature.unique_constraints.
message UniqueConstraints {
  // Smallest allowed number of unique values.
  optional int64 min = 1;
  // Largest allowed number of unique values.
  optional int64 max = 2;
}

// A TensorRepresentation captures the intent for converting columns in a
// dataset to TensorFlow Tensors (or more generally, tf.CompositeTensors).
// Note that one tf.CompositeTensor may consist of data from multiple columns,
// for example, a N-dimensional tf.SparseTensor may need N + 1 columns to
// provide the sparse indices and values.
// Note that the "column name" that a TensorRepresentation needs is a
// string, not a Path -- it means that the column name identifies a top-level
// Feature in the schema (i.e. you cannot specify a Feature nested in a STRUCT
// Feature).
message TensorRepresentation {
  // Value used to fill in missing data; see DenseTensor.default_value.
  message DefaultValue {
    oneof kind {
      double float_value = 1;
      // Note that the data column might be of a shorter integral type. It's the
      // user's responsibility to make sure the default value fits that type.
      int64 int_value = 2;
      bytes bytes_value = 3;
      // uint_value should only be used if the default value can't fit in a
      // int64 (`int_value`).
      uint64 uint_value = 4;
    }
  }

  // A tf.Tensor
  message DenseTensor {
    // Identifies the column in the dataset that provides the values of this
    // Tensor.
    optional string column_name = 1;
    // The shape of each row of the data (i.e. does not include the batch
    // dimension)
    optional FixedShape shape = 2;
    // If this column is missing values in a row, the default_value will be
    // used to fill that row.
    optional DefaultValue default_value = 3;
  }

  // A ragged tf.SparseTensor that models nested lists.
  message VarLenSparseTensor {
    // Identifies the column in the dataset that should be converted to the
    // VarLenSparseTensor.
    optional string column_name = 1;
  }

  // A tf.SparseTensor whose indices and values come from separate data columns.
  // This will replace Schema.sparse_feature eventually.
  // The index columns must be of INT type, and all the columns must co-occur
  // and have the same valency at the same row.
  message SparseTensor {
    // The dense shape of the resulting SparseTensor (does not include the batch
    // dimension).
    optional FixedShape dense_shape = 1;
    // The columns constitute the coordinates of the values.
    // indices_column[i][j] contains the coordinate of the i-th dimension of the
    // j-th value.
    repeated string index_column_names = 2;
    // The column that contains the values.
    optional string value_column_name = 3;
    // Specify whether the values are already sorted by their index position.
    optional bool already_sorted = 4;
  }

  // A tf.RaggedTensor that models nested lists.
  // Currently there is no way for the user to specify the shape of the leaf
  // value (the innermost value tensor of the RaggedTensor). The leaf value will
  // always be a 1-D tensor.
  message RaggedTensor {
    // Identifies the leaf feature that provides values of the RaggedTensor.
    // NOTE(review): the original comment continued with the fragment
    // "struct type sub fields.", which appears truncated -- confirm the
    // intended wording.
    // The first step of the path refers to a top-level feature in the data. The
    // remaining steps refer to STRUCT features under the top-level feature,
    // recursively.
    // If the feature has N outer ragged lists, they will become the first
    // N dimensions of the resulting RaggedTensor and the contents will become
    // the flat_values.
    optional Path feature_path = 1;  // required.

    // Further partition of the feature values at the leaf level.
    message Partition {
      oneof kind {
        // If the final element(s) of partition are uniform_row_lengths [U0, U1,
        // ...] , then the result RaggedTensor will have  their flat values (a
        // dense tensor) being of shape [U0, U1, ...]. Otherwise, a
        // uniform_row_length simply means a ragged dimension with row_lengths
        // [uniform_row_length]*nrows.
        int64 uniform_row_length = 1;
        // Identifies a leaf feature who share the same parent of
        // value_feature_path that contains the partition row lengths.
        // NOTE(review): no field named value_feature_path is visible in this
        // message; presumably this refers to feature_path -- confirm.
        string row_length = 2;
      }
    }
    // The result RaggedTensor would be of shape:
    // [B, D_0, D_1, ..., D_N, P_0, P_1, ..., P_M, U_0, U_1, ..., U_P]
    //
    // Where the dimensions belong to different categories:
    // * B: Batch size dimension
    // * D_n: Dimensions specified by the nested structure specified by the
    // value path until the leaf node. n>=1.
    // * P_m: Dimensions specified by the partitions that do not define any
    // fixed dimension size. m>=0.
    // * U_p: Dimensions specified by the latest partitions of type
    // uniform_row_length that can define the fixed inner shape of the tensor.
    // If iterating the partitions from the end to the beginning, these
    // dimensions are defined by all the continuous uniform_row_length
    // partitions present. p>=0.
    repeated Partition partition = 3;

    // The data type of the ragged tensor's row partitions. This will
    // default to INT64 if it is not specified.
    optional RowPartitionDType row_partition_dtype = 2;
  }

  // RaggedTensor consists of RowPartitions. This enum allows the user to
  // specify the dtype of those RowPartitions. If it is UNSPECIFIED, then we
  // default to INT64.
  enum RowPartitionDType {
    UNSPECIFIED = 0;
    INT64 = 1;
    INT32 = 2;
  }

  // The kind of tensor this representation produces. At most one member of
  // the oneof can be set.
  oneof kind {
    DenseTensor dense_tensor = 1;
    VarLenSparseTensor varlen_sparse_tensor = 2;
    SparseTensor sparse_tensor = 3;
    RaggedTensor ragged_tensor = 4;
  }
}

// A TensorRepresentationGroup is a collection of TensorRepresentations with
// names. These names may serve as identifiers when converting the dataset
// to a collection of Tensors or tf.CompositeTensors.
// For example, given the following group:
// {
//   key: "dense_tensor"
//   tensor_representation {
//     dense_tensor {
//       column_name: "univalent_feature"
//       shape {
//         dim {
//           size: 1
//         }
//       }
//       default_value {
//         float_value: 0
//       }
//     }
//   }
// }
// {
//   key: "varlen_sparse_tensor"
//   tensor_representation {
//     varlen_sparse_tensor {
//       column_name: "multivalent_feature"
//     }
//   }
// }
//
// Then the schema is expected to have feature "univalent_feature" and
// "multivalent_feature", and when a batch of data is converted to Tensors using
// this TensorRepresentationGroup, the result may be the following dict:
// {
//   "dense_tensor": tf.Tensor(...),
//   "varlen_sparse_tensor": tf.SparseTensor(...),
// }
message TensorRepresentationGroup {
  // Maps each name (the key under which the converted tensor is exposed, as
  // in the example above) to the TensorRepresentation describing it.
  map<string, TensorRepresentation> tensor_representation = 1;
}

// Describes whether and how a feature should be treated as a sequence.
message SequenceMetadata {
  // This enum specifies whether to treat the feature as a sequence which has
  // meaningful element order.
  enum SequentialStatus {
    // No statement is made about whether element order is meaningful.
    SEQUENTIAL_UNSPECIFIED = 0;
    // Treat the feature as a sequence with meaningful element order.
    SEQUENTIAL_YES = 1;
    // Do not treat the feature as a sequence with meaningful element order.
    SEQUENTIAL_NO = 2;
  }
  // Whether the feature is sequential; see SequentialStatus.
  // NOTE(review): field numbering in this message starts at 3. If 1 and 2
  // belonged to removed fields, they should be declared `reserved` -- confirm
  // against the file history.
  optional SequentialStatus sequential_status = 3;
  // An arbitrary string defining a "group" of features that could be modeled as
  // a single joint sequence. For example, consider a dataset that contains
  // three sequential features "purchase_time", "product_id", "purchase_price".
  // These belong to the same sequence of purchases and could be modeled
  // jointly. Specifying joint_group = "purchase" on all three sequences would
  // communicate that the features can be considered part of a single conceptual
  // sequence.
  optional string joint_group = 4;
  // Specifies the maximum sequence length that should be processed. Sequences
  // may exceed this limit but are expected to be truncated by modeling layers.
  optional int64 sequence_truncation_limit = 5;
}