// tensorflow_metadata.proto.v0.anomalies.proto Maven / Gradle / Ivy
// The newest version!
// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// File-level declarations: syntax version, proto package, code-generation
// options, and the imports used by the messages below.
syntax = "proto2";
package tensorflow.metadata.v0;
// GOOGLE-LEGACY option jspb_use_correct_proto2_semantics = false;
option cc_enable_arenas = true;
// Java code is generated into this package, with java_multiple_files enabled.
option java_package = "org.tensorflow.metadata.v0";
option java_multiple_files = true;
// TODO(b/123519907): Remove this.
// GOOGLE-LEGACY import "net/proto2/bridge/proto/message_set.proto";
// Presumably provide the Path and Schema types referenced below; verify
// against the imported files.
import "tensorflow_metadata/proto/v0/path.proto";
import "tensorflow_metadata/proto/v0/schema.proto";
// Describes one individual anomaly.
message AnomalyInfo {
  // Numbers of deleted fields.
  reserved 1, 3;

  // Path pointing at where the anomaly occurred.
  // Not set for dataset-level anomalies, which have no path.
  optional Path path = 8;

  enum Severity {
    UNKNOWN = 0;
    WARNING = 1;
    ERROR = 2;
  }
  optional Severity severity = 5;

  // Description covering the anomaly as a whole.
  optional string description = 2;

  // Shorter description, meant for presentation in a UI.
  // With a single reason for the anomaly this is identical to
  // reason[0].short_description; otherwise it summarizes all the reasons.
  optional string short_description = 6;

  // Comparison of the existing schema with the fixed schema.
  repeated DiffRegion diff_regions = 4;

  // Next ID: 89
  // LINT.IfChange
  enum Type {
    UNKNOWN_TYPE = 0;

    // The anomaly has multiple reasons.
    MULTIPLE_REASONS = 82;

    // An integer larger than 1.
    BOOL_TYPE_BIG_INT = 1;
    // Got BYTES type where INT type was expected.
    BOOL_TYPE_BYTES_NOT_INT = 2;
    // Got BYTES type where STRING type was expected.
    BOOL_TYPE_BYTES_NOT_STRING = 3;
    // Got FLOAT type where INT type was expected.
    BOOL_TYPE_FLOAT_NOT_INT = 4;
    // Got FLOAT type where STRING type was expected.
    BOOL_TYPE_FLOAT_NOT_STRING = 5;
    // Got INT type where STRING type was expected.
    BOOL_TYPE_INT_NOT_STRING = 6;
    // An integer smaller than 0.
    BOOL_TYPE_SMALL_INT = 7;
    // Got STRING type where INT type was expected.
    BOOL_TYPE_STRING_NOT_INT = 8;
    // A string was expected, but not the string that was seen.
    BOOL_TYPE_UNEXPECTED_STRING = 9;
    // The boolean had float values other than 0 and 1.
    BOOL_TYPE_UNEXPECTED_FLOAT = 52;
    // The BoolDomain has an invalid configuration.
    BOOL_TYPE_INVALID_CONFIG = 88;

    // Got BYTES type where STRING type was expected.
    ENUM_TYPE_BYTES_NOT_STRING = 10;
    // Got FLOAT type where STRING type was expected.
    ENUM_TYPE_FLOAT_NOT_STRING = 11;
    // Got INT type where STRING type was expected.
    ENUM_TYPE_INT_NOT_STRING = 12;
    // An invalid UTF8 string was observed.
    ENUM_TYPE_INVALID_UTF8 = 13;
    // Unexpected string values.
    ENUM_TYPE_UNEXPECTED_STRING_VALUES = 14;

    // Too large a number of values in a given example.
    FEATURE_TYPE_HIGH_NUMBER_VALUES = 15;
    // Too small a fraction of examples contains the feature.
    FEATURE_TYPE_LOW_FRACTION_PRESENT = 16;
    // Too small a number of examples contains the feature.
    FEATURE_TYPE_LOW_NUMBER_PRESENT = 17;
    // Too small a number of values in a given example.
    FEATURE_TYPE_LOW_NUMBER_VALUES = 18;
    // No examples contain the value.
    FEATURE_TYPE_NOT_PRESENT = 19;
    // The feature is present as an empty list.
    FEATURE_TYPE_NO_VALUES = 20;
    // The feature was expected to be a singleton, but is repeated in an
    // example.
    FEATURE_TYPE_UNEXPECTED_REPEATED = 21;
    // Too many unique values for the feature (string and categorical features
    // only).
    FEATURE_TYPE_HIGH_UNIQUE = 59;
    // Too few unique values for the feature (string and categorical features
    // only).
    FEATURE_TYPE_LOW_UNIQUE = 60;
    // A constraint on the number of unique values is set on a feature whose
    // type does not have its unique values counted (i.e., it is not string or
    // categorical).
    FEATURE_TYPE_NO_UNIQUE = 61;

    // A float value is too high.
    FLOAT_TYPE_BIG_FLOAT = 22;
    // The type is not FLOAT.
    FLOAT_TYPE_NOT_FLOAT = 23;
    // A float value is too low.
    FLOAT_TYPE_SMALL_FLOAT = 24;
    // The feature should be floats encoded as strings, but some string is not
    // a float.
    FLOAT_TYPE_STRING_NOT_FLOAT = 25;
    // The feature should be floats encoded as strings, but it was some other
    // type (INT, BYTES, FLOAT).
    FLOAT_TYPE_NON_STRING = 26;
    // The type is completely unknown.
    FLOAT_TYPE_UNKNOWN_TYPE_NUMBER = 27;
    // The float feature includes NaN values.
    FLOAT_TYPE_HAS_NAN = 53;
    // The float feature includes Inf or -Inf values.
    FLOAT_TYPE_HAS_INF = 62;

    // An unexpectedly large integer was found.
    INT_TYPE_BIG_INT = 28;
    // The type should have been INT, but it was not.
    INT_TYPE_INT_EXPECTED = 29;
    // The feature should be ints encoded as strings, but some string was not
    // an int.
    INT_TYPE_NOT_INT_STRING = 30;
    // The type should have been STRING, but it was not.
    INT_TYPE_NOT_STRING = 31;
    // An unexpectedly small integer was found.
    INT_TYPE_SMALL_INT = 32;
    // The feature should be ints encoded as strings, but it was some other
    // type (INT, BYTES, FLOAT).
    INT_TYPE_STRING_EXPECTED = 33;
    // Unknown type in the stats proto.
    INT_TYPE_UNKNOWN_TYPE_NUMBER = 34;

    // The fraction of examples containing TensorFlow supported images is below
    // the threshold set in the Schema.
    LOW_SUPPORTED_IMAGE_FRACTION = 64;

    // A column has no stats at all.
    SCHEMA_MISSING_COLUMN = 35;
    // A new column that is not in the schema was found.
    SCHEMA_NEW_COLUMN = 36;
    // Training serving skew issue.
    SCHEMA_TRAINING_SERVING_SKEW = 37;

    // STRING type was expected, but it was FLOAT.
    STRING_TYPE_NOW_FLOAT = 38;
    // STRING type was expected, but it was INT.
    STRING_TYPE_NOW_INT = 39;

    // Control data is missing (either scoring data or previous day).
    COMPARATOR_CONTROL_DATA_MISSING = 40;
    // Treatment data is missing (either treatment data or current day).
    COMPARATOR_TREATMENT_DATA_MISSING = 41;
    // The L infinity between treatment and control is high.
    COMPARATOR_L_INFTY_HIGH = 42;
    // The approximate Jensen-Shannon divergence between treatment and control
    // is high.
    COMPARATOR_JENSEN_SHANNON_DIVERGENCE_HIGH = 63;
    // The normalized absolute difference between treatment and control is
    // high.
    COMPARATOR_NORMALIZED_ABSOLUTE_DIFFERENCE_HIGH = 87;

    // The span contains no examples.
    NO_DATA_IN_SPAN = 43;

    // The value feature of a sparse feature is missing while at least one
    // feature defining the sparse feature is present.
    SPARSE_FEATURE_MISSING_VALUE = 44;
    // An index feature of a sparse feature is missing while at least one
    // feature defining the sparse feature is present.
    SPARSE_FEATURE_MISSING_INDEX = 45;
    // The lengths of the features representing a sparse feature do not match.
    SPARSE_FEATURE_LENGTH_MISMATCH = 46;
    // A sparse feature and a raw feature have colliding names.
    SPARSE_FEATURE_NAME_COLLISION = 47;

    // Invalid custom semantic domain.
    SEMANTIC_DOMAIN_UPDATE = 48;

    // The current data has too few examples compared to a control dataset.
    COMPARATOR_LOW_NUM_EXAMPLES = 49;
    // The current data has too many examples compared to a control dataset.
    COMPARATOR_HIGH_NUM_EXAMPLES = 50;
    // The dataset has too few examples.
    DATASET_LOW_NUM_EXAMPLES = 51;
    // The dataset has too many examples.
    DATASET_HIGH_NUM_EXAMPLES = 58;

    // A weighted feature and a raw feature have colliding names.
    WEIGHTED_FEATURE_NAME_COLLISION = 54;
    // The value feature of a weighted feature is missing on examples where
    // the weight feature is present.
    WEIGHTED_FEATURE_MISSING_VALUE = 55;
    // The weight feature of a weighted feature is missing on examples where
    // the value feature is present.
    WEIGHTED_FEATURE_MISSING_WEIGHT = 56;
    // The lengths of the features representing a weighted feature do not
    // match.
    WEIGHTED_FEATURE_LENGTH_MISMATCH = 57;

    // The nesting level of the feature values does not match.
    VALUE_NESTEDNESS_MISMATCH = 65;
    // The specified domain is not compatible with the physical type.
    DOMAIN_INVALID_FOR_TYPE = 66;
    // A feature on the schema has no name.
    FEATURE_MISSING_NAME = 67;
    // A feature on the schema has no type.
    FEATURE_MISSING_TYPE = 68;
    // Raised for invalid schema specifications, e.g. min_fraction < 0.
    INVALID_SCHEMA_SPECIFICATION = 69;
    // Raised for invalid domain specifications in the schema.
    INVALID_DOMAIN_SPECIFICATION = 81;
    // The data's type is inconsistent with the specified type.
    UNEXPECTED_DATA_TYPE = 70;

    // A value appeared fewer than the min number of times within a sequence.
    SEQUENCE_VALUE_TOO_FEW_OCCURRENCES = 71;
    // A value appeared more than the max number of times within a sequence.
    SEQUENCE_VALUE_TOO_MANY_OCCURRENCES = 72;
    // A value appeared in less than the min fraction of sequences.
    SEQUENCE_VALUE_TOO_SMALL_FRACTION = 73;
    // A value appeared in more than the max fraction of sequences.
    SEQUENCE_VALUE_TOO_LARGE_FRACTION = 74;

    // The fraction of feature values that matched vocab entries was too
    // small.
    FEATURE_COVERAGE_TOO_LOW = 75;
    // The average token length was too short.
    FEATURE_COVERAGE_TOO_SHORT_AVG_TOKEN_LENGTH = 76;
    // A sequence violated the location constraint.
    NLP_WRONG_LOCATION = 77;
    // A feature specified as an embedding was not of a fixed dimension.
    EMBEDDING_SHAPE_INVALID = 78;
    // A feature contains an image with more bytes than the max byte size.
    MAX_IMAGE_BYTE_SIZE_EXCEEDED = 79;
    // A feature should have a fixed shape, but its valency stats do not
    // agree.
    INVALID_FEATURE_SHAPE = 80;
    // Constraints are specified within the schema but cannot be verified
    // because the corresponding stats are not available.
    STATS_NOT_AVAILABLE = 83;

    // The following are experimental and subject to change.
    // A derived feature had a schema lifecycle other than VALIDATION_DERIVED
    // or DISABLED.
    DERIVED_FEATURE_BAD_LIFECYCLE = 84;
    // A derived feature is represented in the schema with an invalid or
    // missing validation_derived_source.
    DERIVED_FEATURE_INVALID_SOURCE = 85;

    // The following type is experimental and subject to change.
    // The statistics did not specify a custom validation condition.
    CUSTOM_VALIDATION = 86;
  }
  // LINT.ThenChange(//tensorflow_data_validation/g3doc/anomalies.md)

  // A reason for the anomaly. An anomaly may have more than one reason,
  // e.g. the field might be missing sometimes AND a new value is present.
  message Reason {
    optional Type type = 1 [default = UNKNOWN_TYPE];
    // Short description of the anomaly, meant for presentation in a UI.
    optional string short_description = 2;
    // Longer description of the anomaly.
    optional string description = 3;
  }
  repeated Reason reason = 7;
}
// Holds the result of the drift/skew measurements for one feature.
message DriftSkewInfo {
  message Measurement {
    enum Type {
      UNKNOWN = 0;
      L_INFTY = 1;
      JENSEN_SHANNON_DIVERGENCE = 2;
      NORMALIZED_ABSOLUTE_DIFFERENCE = 3;
    }

    // The measurement's type.
    optional Type type = 1;

    // The measurement's value.
    optional double value = 2;

    // The threshold used to decide whether the measurement results in an
    // anomaly.
    optional double threshold = 3;
  }

  // Identifies the feature.
  optional Path path = 1;

  // Drift and skew may both be measured in the same invocation of TFDV, in
  // which case both of the fields below are populated. Each field is repeated
  // because the drift/skew may be quantified by different measurements.
  repeated Measurement drift_measurements = 2;
  repeated Measurement skew_measurements = 3;
}
// Message to represent the anomalies, which describe the mismatches (if any)
// between the stats and the schema.
message Anomalies {
  // Deleted fields.
  reserved 4;

  // The baseline schema that is used.
  oneof baseline_schema {
    tensorflow.metadata.v0.Schema baseline = 1;
    tensorflow.metadata.v0.Schema baseline_v1 = 6 [deprecated = true];
  }

  // How the keys of anomaly_info are formatted.
  enum AnomalyNameFormat {
    // At present, this indicates that the keys in anomaly_info
    // refer to the raw field name in the Schema.
    UNKNOWN = 0;
    // The serialized path to a struct.
    SERIALIZED_PATH = 1;
  }

  // The format of the keys in anomaly_info.
  // If absent, the default is UNKNOWN (the zero value of AnomalyNameFormat).
  optional AnomalyNameFormat anomaly_name_format = 7;

  // Information about feature-level anomalies: a map from a column to the
  // difference that it represents. The key format is given by
  // anomaly_name_format.
  map<string, AnomalyInfo> anomaly_info = 2;

  // Information about dataset-level anomalies.
  optional AnomalyInfo dataset_anomaly_info = 8;

  // True if numExamples == 0.
  optional bool data_missing = 3;

  // If drift / skew detection was conducted, this field will hold the
  // comparison results for all the features compared, regardless of whether a
  // related anomaly was reported.
  repeated DriftSkewInfo drift_skew_info = 9;

  // TODO(b/123519907): Remove this.
  // The hook to attach any usage and tool specific metadata. Example:
  //   message SchemaStamp {
  //     // extension ID is any CL number that has not been used in an
  //     // extension.
  //     extend proto2.bridge.MessageSet {
  //       optional StampedSchemaDiff message_set_extension = 123445554;
  //     }
  //     optional string schema_stamp = 1;
  //   }
  //
  // then, the following proto msg encodes an Anomalies with an embedded
  // SchemaStamp:
  //
  //   Anomalies {
  //     metadata {
  //       [SchemaStamp]: {
  //         schema_stamp: "stamp"
  //       }
  //     }
  //   }
  // GOOGLE-LEGACY optional proto2.bridge.MessageSet metadata = 5;
}
// One region of the comparison between two text artifacts. A region also
// carries the contents of the two artifacts that correspond to it.
message DiffRegion {
  // The details of the chunk.
  oneof details {
    // A region of lines that is unchanged.
    UnchangedRegion unchanged = 1;

    // A region of lines that was removed from the left.
    OneSideRegion removed = 2;

    // A region of lines that was added to the right.
    OneSideRegion added = 3;

    // A region of lines that differ between the two artifacts.
    ChangedRegion changed = 4;

    // A region of unchanged lines whose contents are just hidden.
    HiddenRegion hidden = 5;
  }
}
// A chunk that is identical in the two artifacts.
message UnchangedRegion {
  // Starting lines of the chunk in the left and right artifacts.
  optional int32 left_start = 1;
  optional int32 right_start = 2;

  // Contents of the chunk, which are the same in both artifacts.
  repeated string contents = 3;
}
// A chunk that applies to only one of the two artifacts.
message OneSideRegion {
  // The starting line.
  optional int32 start = 1;

  // The contents.
  repeated string contents = 2;
}
// A chunk representing changes in both artifacts over the same number of
// lines.
message ChangedRegion {
  // The changed region in the left artifact, given as its starting line
  // number and its contents.
  optional int32 left_start = 1;
  repeated string left_contents = 2;

  // The same for the right artifact.
  optional int32 right_start = 3;
  repeated string right_contents = 4;
}
// A chunk of identical lines whose contents are hidden.
message HiddenRegion {
  // Starting lines in the left and right artifacts.
  optional int32 left_start = 1;
  optional int32 right_start = 2;

  // The size of the region, in lines.
  optional int32 size = 3;
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy