tensorflow_metadata.proto.v0.anomalies.proto Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of scio-tensorflow_2.12 Show documentation
Scio add-on for TensorFlow
The newest version!
// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================

syntax = "proto2";

package tensorflow.metadata.v0;

// GOOGLE-LEGACY option jspb_use_correct_proto2_semantics = false;  
option cc_enable_arenas = true;
option java_package = "org.tensorflow.metadata.v0";
option java_multiple_files = true;

// TODO(b/123519907): Remove this.
// GOOGLE-LEGACY import "net/proto2/bridge/proto/message_set.proto"; 
import "tensorflow_metadata/proto/v0/path.proto";
import "tensorflow_metadata/proto/v0/schema.proto";

// Message to represent information about an individual anomaly.
message AnomalyInfo {
  // Deleted fields.
  reserved 1, 3;

  // A path indicating where the anomaly occurred.
  // Dataset-level anomalies do not have a path.
  optional Path path = 8;

  // Severity levels that can be assigned to an anomaly.
  enum Severity {
    UNKNOWN = 0;
    WARNING = 1;
    ERROR = 2;
  }
  // The severity of this anomaly.
  optional Severity severity = 5;
  // A description of the entire anomaly.
  optional string description = 2;
  // A shorter description, suitable for UI presentation.
  // If there is a single reason for the anomaly, this is identical to
  // reason[0].short_description. Otherwise, it summarizes all the reasons.
  optional string short_description = 6;
  // The comparison between the existing schema and the fixed schema.
  repeated DiffRegion diff_regions = 4;

  // The kinds of anomaly that can be reported in a Reason.
  // Next ID: 89
  // LINT.IfChange
  enum Type {
    UNKNOWN_TYPE = 0;
    // Multiple reasons for anomaly.
    MULTIPLE_REASONS = 82;
    // Integer larger than 1
    BOOL_TYPE_BIG_INT = 1;
    // BYTES type when expected INT type
    BOOL_TYPE_BYTES_NOT_INT = 2;
    // BYTES type when expected STRING type
    BOOL_TYPE_BYTES_NOT_STRING = 3;
    // FLOAT type when expected INT type
    BOOL_TYPE_FLOAT_NOT_INT = 4;
    // FLOAT type when expected STRING type
    BOOL_TYPE_FLOAT_NOT_STRING = 5;
    // INT type when expected STRING type
    BOOL_TYPE_INT_NOT_STRING = 6;
    // Integer smaller than 0
    BOOL_TYPE_SMALL_INT = 7;
    // STRING type when expected INT type
    BOOL_TYPE_STRING_NOT_INT = 8;
    // Expected a string, but not the string seen
    BOOL_TYPE_UNEXPECTED_STRING = 9;
    // Boolean had float values other than 0 and 1.
    BOOL_TYPE_UNEXPECTED_FLOAT = 52;
    // BoolDomain has invalid configuration.
    BOOL_TYPE_INVALID_CONFIG = 88;
    // BYTES type when expected STRING type
    ENUM_TYPE_BYTES_NOT_STRING = 10;
    // FLOAT type when expected STRING type
    ENUM_TYPE_FLOAT_NOT_STRING = 11;
    // INT type when expected STRING type
    ENUM_TYPE_INT_NOT_STRING = 12;
    // Invalid UTF8 string observed
    ENUM_TYPE_INVALID_UTF8 = 13;
    // Unexpected string values
    ENUM_TYPE_UNEXPECTED_STRING_VALUES = 14;
    // The number of values in a given example is too large
    FEATURE_TYPE_HIGH_NUMBER_VALUES = 15;
    // The fraction of examples containing a feature is too small
    FEATURE_TYPE_LOW_FRACTION_PRESENT = 16;
    // The number of examples containing a feature is too small
    FEATURE_TYPE_LOW_NUMBER_PRESENT = 17;
    // The number of values in a given example is too small
    FEATURE_TYPE_LOW_NUMBER_VALUES = 18;
    // No examples contain the value
    FEATURE_TYPE_NOT_PRESENT = 19;
    // The feature is present as an empty list
    FEATURE_TYPE_NO_VALUES = 20;
    // The feature is repeated in an example, but was expected to be a singleton
    FEATURE_TYPE_UNEXPECTED_REPEATED = 21;
    // The feature had too many unique values (string and categorical features
    // only).
    FEATURE_TYPE_HIGH_UNIQUE = 59;
    // The feature had too few unique values (string and categorical features
    // only).
    FEATURE_TYPE_LOW_UNIQUE = 60;
    // The feature has a constraint on the number of unique values but is not of
    // a type that has the number of unique values counted (i.e., is not string
    // or categorical).
    FEATURE_TYPE_NO_UNIQUE = 61;
    // There is a float value that is too high
    FLOAT_TYPE_BIG_FLOAT = 22;
    // The type is not FLOAT
    FLOAT_TYPE_NOT_FLOAT = 23;
    // There is a float value that is too low
    FLOAT_TYPE_SMALL_FLOAT = 24;
    // The feature is supposed to be floats encoded as strings, but there is
    // a string that is not a float
    FLOAT_TYPE_STRING_NOT_FLOAT = 25;
    // The feature is supposed to be floats encoded as strings, but it was
    // some other type (INT, BYTES, FLOAT)
    FLOAT_TYPE_NON_STRING = 26;
    // The type is completely unknown
    FLOAT_TYPE_UNKNOWN_TYPE_NUMBER = 27;
    // Float feature includes NaN values.
    FLOAT_TYPE_HAS_NAN = 53;
    // Float feature includes Inf or -Inf values.
    FLOAT_TYPE_HAS_INF = 62;
    // There is an unexpectedly large integer
    INT_TYPE_BIG_INT = 28;
    // The type was supposed to be INT, but it was not.
    INT_TYPE_INT_EXPECTED = 29;
    // The feature is supposed to be ints encoded as strings, but some string
    // was not an int.
    INT_TYPE_NOT_INT_STRING = 30;
    // The type was supposed to be STRING, but it was not.
    INT_TYPE_NOT_STRING = 31;
    // There is an unexpectedly small integer
    INT_TYPE_SMALL_INT = 32;
    // The feature is supposed to be ints encoded as strings, but it was
    // some other type (INT, BYTES, FLOAT)
    INT_TYPE_STRING_EXPECTED = 33;
    // Unknown type in stats proto
    INT_TYPE_UNKNOWN_TYPE_NUMBER = 34;
    // The fraction of examples containing TensorFlow supported images is lower
    // than the threshold set in the Schema.
    LOW_SUPPORTED_IMAGE_FRACTION = 64;
    // There are no stats for a column at all
    SCHEMA_MISSING_COLUMN = 35;
    // There is a new column that is not in the schema.
    SCHEMA_NEW_COLUMN = 36;
    // Training serving skew issue
    SCHEMA_TRAINING_SERVING_SKEW = 37;
    // Expected STRING type, but it was FLOAT.
    STRING_TYPE_NOW_FLOAT = 38;
    // Expected STRING type, but it was INT.
    STRING_TYPE_NOW_INT = 39;
    // Control data is missing (either scoring data or previous day).
    COMPARATOR_CONTROL_DATA_MISSING = 40;
    // Treatment data is missing (either treatment data or current day).
    COMPARATOR_TREATMENT_DATA_MISSING = 41;
    // L infinity between treatment and control is high.
    COMPARATOR_L_INFTY_HIGH = 42;
    // Approximate Jensen-Shannon divergence between treatment and control is
    // high.
    COMPARATOR_JENSEN_SHANNON_DIVERGENCE_HIGH = 63;
    // The normalized absolute difference between treatment and control is high.
    COMPARATOR_NORMALIZED_ABSOLUTE_DIFFERENCE_HIGH = 87;
    // No examples in the span.
    NO_DATA_IN_SPAN = 43;
    // The value feature of a sparse feature is missing and at least one
    // feature defining the sparse feature is present.
    SPARSE_FEATURE_MISSING_VALUE = 44;
    // An index feature of a sparse feature is missing and at least one
    // feature defining the sparse feature is present.
    SPARSE_FEATURE_MISSING_INDEX = 45;
    // The length of the features representing a sparse feature does not match.
    SPARSE_FEATURE_LENGTH_MISMATCH = 46;
    // Name collision between a sparse feature and raw feature.
    SPARSE_FEATURE_NAME_COLLISION = 47;
    // Invalid custom semantic domain.
    SEMANTIC_DOMAIN_UPDATE = 48;
    // There are not enough examples in the current data as compared to a
    // control dataset.
    COMPARATOR_LOW_NUM_EXAMPLES = 49;
    // There are too many examples in the current data as compared to a control
    // dataset.
    COMPARATOR_HIGH_NUM_EXAMPLES = 50;
    // There are not enough examples in the dataset.
    DATASET_LOW_NUM_EXAMPLES = 51;
    // There are too many examples in the dataset.
    DATASET_HIGH_NUM_EXAMPLES = 58;
    // Name collision between a weighted feature and a raw feature.
    WEIGHTED_FEATURE_NAME_COLLISION = 54;
    // The value feature of a weighted feature is missing on examples where the
    // weight feature is present.
    WEIGHTED_FEATURE_MISSING_VALUE = 55;
    // The weight feature of a weighted feature is missing on examples where the
    // value feature is present.
    WEIGHTED_FEATURE_MISSING_WEIGHT = 56;
    // The length of the features representing a weighted feature does not
    // match.
    WEIGHTED_FEATURE_LENGTH_MISMATCH = 57;
    // The nesting level of the feature values does not match.
    VALUE_NESTEDNESS_MISMATCH = 65;
    // The domain specified is not compatible with the physical type.
    DOMAIN_INVALID_FOR_TYPE = 66;
    // Feature on schema has no name.
    FEATURE_MISSING_NAME = 67;
    // Feature on schema has no type.
    FEATURE_MISSING_TYPE = 68;
    // Triggered for invalid schema specifications, e.g. min_fraction < 0.
    INVALID_SCHEMA_SPECIFICATION = 69;
    // Triggered for invalid domain specifications in schema.
    INVALID_DOMAIN_SPECIFICATION = 81;
    // The type of the data is inconsistent with the specified type.
    UNEXPECTED_DATA_TYPE = 70;
    // A value did not show up the min number of times within a sequence.
    SEQUENCE_VALUE_TOO_FEW_OCCURRENCES = 71;
    // A value showed up more than the max number of times within a sequence.
    SEQUENCE_VALUE_TOO_MANY_OCCURRENCES = 72;
    // A value did not show up in at least the min fraction of sequences.
    SEQUENCE_VALUE_TOO_SMALL_FRACTION = 73;
    // A value showed up in greater than the max fraction of sequences.
    SEQUENCE_VALUE_TOO_LARGE_FRACTION = 74;
    // Too small a fraction of feature values matched vocab entries.
    FEATURE_COVERAGE_TOO_LOW = 75;
    // The average token length was too short.
    FEATURE_COVERAGE_TOO_SHORT_AVG_TOKEN_LENGTH = 76;
    // A sequence violated the location constraint.
    NLP_WRONG_LOCATION = 77;
    // A feature was specified as an embedding but was not a fixed dimension.
    EMBEDDING_SHAPE_INVALID = 78;
    // A feature contains an image that has more bytes than the max byte size.
    MAX_IMAGE_BYTE_SIZE_EXCEEDED = 79;
    // A feature is supposed to be of a fixed shape but its valency stats
    // do not agree.
    INVALID_FEATURE_SHAPE = 80;
    // Constraints are specified within the schema but cannot be verified
    // because the corresponding stats are not available.
    STATS_NOT_AVAILABLE = 83;

    // The following two types are experimental and subject to change.
    // A derived feature had a schema lifecycle other than VALIDATION_DERIVED
    // or DISABLED.
    DERIVED_FEATURE_BAD_LIFECYCLE = 84;
    // A derived feature is represented in the schema with an invalid or missing
    // validation_derived_source.
    DERIVED_FEATURE_INVALID_SOURCE = 85;

    // The following type is experimental and subject to change.
    // The statistics did not specify a custom validation condition.
    // NOTE(review): "specify" may have been intended as "satisfy" - confirm.
    CUSTOM_VALIDATION = 86;
  }
  // LINT.ThenChange(//tensorflow_data_validation/g3doc/anomalies.md)
  // Reason for the anomaly. There may be more than one reason,
  // e.g. the field might be missing sometimes AND a new value is
  // present.
  message Reason {
    // The kind of anomaly this reason describes. Defaults to UNKNOWN_TYPE.
    optional Type type = 1 [default = UNKNOWN_TYPE];
    // A short description of an anomaly, suitable for UI presentation.
    optional string short_description = 2;
    // A longer description of an anomaly.
    optional string description = 3;
  }
  // The individual reasons for this anomaly.
  repeated Reason reason = 7;
}

// Message to contain the result of the drift/skew measurements for a feature.
message DriftSkewInfo {
  // A single drift/skew measurement, together with the threshold used to
  // decide whether it results in an anomaly.
  message Measurement {
    // The kinds of measurement that can be reported.
    enum Type {
      UNKNOWN = 0;
      L_INFTY = 1;
      JENSEN_SHANNON_DIVERGENCE = 2;
      NORMALIZED_ABSOLUTE_DIFFERENCE = 3;
    }
    // Type of the measurement.
    optional Type type = 1;
    // Value of the measurement.
    optional double value = 2;
    // Threshold used to determine whether the measurement results in an
    // anomaly.
    optional double threshold = 3;
  }

  // Identifies the feature.
  optional Path path = 1;

  // The drift/skew may be measured in the same invocation of TFDV, in which
  // case both of the following fields are populated.
  // Also the drift/skew may be quantified by different measurements, thus
  // repeated.
  repeated Measurement drift_measurements = 2;
  repeated Measurement skew_measurements = 3;
}

// Message to represent the anomalies, which describe the mismatches (if any)
// between the stats and the schema.
message Anomalies {
  // Deleted fields.
  reserved 4;

  // The baseline schema that is used.
  oneof baseline_schema {
    tensorflow.metadata.v0.Schema baseline = 1;
    // Deprecated member of this oneof; carries the same Schema type as
    // `baseline`.
    tensorflow.metadata.v0.Schema baseline_v1 = 6 [deprecated = true];
  }

  // Describes how the keys of anomaly_info are formatted.
  enum AnomalyNameFormat {
    // At present, this indicates that the keys in anomaly_info
    // refer to the raw field name in the Schema.
    UNKNOWN = 0;
    // The serialized path to a struct.
    SERIALIZED_PATH = 1;
  }

  // The format of the keys in anomaly_info.
  // If absent, default is UNKNOWN.
  optional AnomalyNameFormat anomaly_name_format = 7;
  // Information about feature-level anomalies: a map from a column to the
  // difference that it represents. Keys are formatted as described by
  // anomaly_name_format.
  map<string, AnomalyInfo> anomaly_info = 2;
  // Information about dataset-level anomalies.
  optional AnomalyInfo dataset_anomaly_info = 8;
  // True if numExamples == 0.
  optional bool data_missing = 3;

  // If drift / skew detection was conducted, this field will hold the
  // comparison results for all the features compared, regardless whether a
  // related anomaly was reported.
  repeated DriftSkewInfo drift_skew_info = 9;
  // TODO(b/123519907): Remove this.
  // The hook to attach any usage and tool specific metadata. Example:
  // message SchemaStamp {
  //   // extension ID is any CL number that has not been used in an extension.
  //   extend proto2.bridge.MessageSet {
  //     optional StampedSchemaDiff message_set_extension = 123445554;
  //   }
  //   optional string schema_stamp = 1;
  // }
  //
  // then, the following proto msg encodes an Anomalies with an embedded
  // SchemaStamp:
  //
  // Anomalies {
  //   metadata {
  //     [SchemaStamp]: {
  //        schema_stamp: "stamp"
  //     }
  //   }
  // }
// GOOGLE-LEGACY   optional proto2.bridge.MessageSet metadata = 5;
}

// Describes a region in the comparison between two text artifacts. Note that
// a region also contains the contents of the two artifacts that correspond to
// the region.
message DiffRegion {
  // Details for the chunk. Exactly one kind of region can be set, since the
  // alternatives are members of a oneof.
  oneof details {
    // An unchanged region of lines.
    UnchangedRegion unchanged = 1;
    // A region of lines removed from the left.
    OneSideRegion removed = 2;
    // A region of lines added to the right.
    OneSideRegion added = 3;
    // A region of lines that are different in the two artifacts.
    ChangedRegion changed = 4;
    // An unchanged region of lines whose contents are just hidden.
    HiddenRegion hidden = 5;
  }
}

// Describes a chunk that is the same in the two artifacts.
message UnchangedRegion {
  // The starting line of the chunk in the left artifact.
  optional int32 left_start = 1;
  // The starting line of the chunk in the right artifact.
  optional int32 right_start = 2;
  // The contents of the chunk. These are the same in both artifacts.
  repeated string contents = 3;
}

// Describes a chunk that applies to only one of the two artifacts.
message OneSideRegion {
  // Starting line of the chunk in the artifact it applies to.
  optional int32 start = 1;
  // Contents of the chunk.
  repeated string contents = 2;
}

// Describes a chunk that represents changes in both artifacts over the same
// number of lines.
message ChangedRegion {
  // Changed region in the left artifact, in terms of starting line number and
  // contents.
  optional int32 left_start = 1;
  repeated string left_contents = 2;
  // Changed region in the right artifact, in terms of starting line number
  // and contents.
  optional int32 right_start = 3;
  repeated string right_contents = 4;
}

// A chunk that represents identical lines, whose contents are hidden.
message HiddenRegion {
  // Starting line of the region in the left artifact.
  optional int32 left_start = 1;
  // Starting line of the region in the right artifact.
  optional int32 right_start = 2;
  // Size of the region in terms of lines.
  optional int32 size = 3;
}