// tensorflow_metadata.proto.v0.problem_statement.proto
// (as listed for the scio-tensorflow_2.12 artifact: "Scio add-on for
// TensorFlow")
// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================

syntax = "proto3";

package tensorflow.metadata.v0;

import "google/protobuf/descriptor.proto";
import "tensorflow_metadata/proto/v0/metric.proto";
import "tensorflow_metadata/proto/v0/path.proto";

option cc_enable_arenas = true;
option java_package = "org.tensorflow.metadata.v0";
option java_multiple_files = true;

// Enumerates the kinds of prediction task configured by the messages in this
// file. Each task configuration message declares its kind through the
// `(task_type)` message option.
enum TaskType {
  // Zero value; the proto3 default when no task type has been set.
  UNKNOWN_TYPE = 0;
  // Declared by BinaryClassification.
  BINARY_CLASSIFICATION = 1;
  // Declared by MultiClassClassification.
  MULTI_CLASS_CLASSIFICATION = 2;
  // Declared by TopKClassification.
  TOP_K_CLASSIFICATION = 3;
  // Declared by OneDimensionalRegression.
  ONE_DIMENSIONAL_REGRESSION = 4;
  // Declared by MultiLabelClassification.
  MULTI_LABEL_CLASSIFICATION = 5;
}

// Custom message option that tags a message with the TaskType it configures,
// e.g. `option (task_type) = BINARY_CLASSIFICATION;`.
extend google.protobuf.MessageOptions {
  // The task type of the annotated message.
  TaskType task_type = 241943395;
}

// Configuration for a binary classification task.
// The output is one of two possible class labels, encoded as the same type
// as the label column.
// BinaryClassification is the same as MultiClassClassification with
// n_classes = 2.
message BinaryClassification {
  // Tags this message as the BINARY_CLASSIFICATION task type.
  option (task_type) = BINARY_CLASSIFICATION;

  // The label column, identified either by flat name or by path. As members
  // of a oneof, at most one of `label` and `label_path` is set at a time.
  oneof label_id {
    // The name of the label. Assumes the label is a flat, top-level field.
    string label = 1;
    // A path can be used instead of a flat string if the label is nested.
    Path label_path = 3;
  }
  // (optional) The weight column.
  string example_weight = 2;

  // Defines which label value is the positive and/or negative class.
  message PositiveNegativeSpec {
    // Specifies a label's value which can be used for positive/negative class
    // specification.
    message LabelValue {
      // The label value. Only a string representation is currently defined.
      oneof value_type {
        string string_value = 1;
      }
    }
    // This value is the positive class.
    LabelValue positive_class_value = 1;
    // This value is the negative class.
    LabelValue negative_class_value = 2;
  }

  // (optional) Specification of the positive and/or negative class value.
  PositiveNegativeSpec positive_negative_spec = 4;
}

// Specifies a dynamic multiclass/multi-label problem where the number of label
// classes is inferred from the data.
message DynamicClassSpec {
  // Criteria for grouping labels into a single Out-Of-Vocabulary (OOV) class.
  // Note: it is up to a solution provider to implement support for OOV labels.
  // Note: both a frequency_threshold and a top_k may be set. A class is grouped
  // into the OOV class if it fails to meet either of the criteria below.
  message OovClassSpec {
    // If set, labels are grouped into the "OOV" class if they occur less than
    // frequency_threshold times in the training dataset. If 0, labels
    // that appear in test / validation splits but not in training would
    // still be classified as the "OOV" class.
    int64 frequency_threshold = 1;
    // If set, only the top_k labels in the training set are used and all others
    // are grouped into an "OOV" class.
    int64 top_k = 2;
  }
  // Optional. If specified, an Out-Of-Vocabulary (OOV) class is created and
  // populated based on frequencies in the training set. If no OOV class is
  // specified, the model's label vocabulary should consist of all labels that
  // appear in the training set.
  OovClassSpec oov_class_spec = 1;
}
// Configuration for a multi-class classification task.
// In this problem type, there are n_classes possible label values, and the
// model predicts a single label.
// The output is one of the class labels, out of n_classes possible classes.
// The output type will correspond to the label column type.
message MultiClassClassification {
  // Tags this message as the MULTI_CLASS_CLASSIFICATION task type.
  option (task_type) = MULTI_CLASS_CLASSIFICATION;

  // The label column.  There's only a single label per example.
  // If the label column is a BoolDomain, use the BinaryClassification Type
  // instead.
  // As members of a oneof, at most one of `label` and `label_path` is set at
  // a time.
  oneof label_id {
    // The name of the label. Assumes the label is a flat, top-level field.
    string label = 1;
    // A path can be used instead of a flat string if the label is nested.
    Path label_path = 5;
  }
  // The weight column.
  string example_weight = 2;
  // How the number of label classes is determined: either a fixed count or a
  // specification for inferring the classes from the data. At most one of
  // the two is set at a time.
  oneof class_spec {
    // The exact number of label classes.
    uint64 n_classes = 3;
    // The number of label classes that should be inferred dynamically from the
    // data.
    DynamicClassSpec dynamic_class_spec = 4;
  }
}
// Configuration for a multi-label classification task.
// In this problem type there are n_classes unique possible label values
// overall. There can be from zero up to n_classes unique labels per example.
// The output, which is of type real number, is class probabilities associated
// with each class. It will be of n_classes dimension for each example, if
// n_classes is specified. Otherwise, the dimension will be set to the number
// of unique class labels that are dynamically inferred from the data based on
// dynamic_class_spec.
message MultiLabelClassification {
  // Tags this message as the MULTI_LABEL_CLASSIFICATION task type.
  option (task_type) = MULTI_LABEL_CLASSIFICATION;

  // The label column. There can be one or more labels per example.
  // As members of a oneof, at most one of `label` and `label_path` is set at
  // a time.
  oneof label_id {
    // The name of the label. Assumes the label is a flat, top-level field.
    string label = 1;
    // A path can be used instead of a flat string if the label is nested.
    Path label_path = 5;
  }
  // The weight column.
  string example_weight = 2;
  // How the number of label classes is determined: either a fixed count or a
  // specification for inferring the classes from the data. At most one of
  // the two is set at a time.
  oneof class_spec {
    // The exact number of unique class labels.
    uint64 n_classes = 3;
    // The maximal number of label classes that should be inferred dynamically
    // from the data.
    DynamicClassSpec dynamic_class_spec = 4;
  }
}

// Configuration for a top-K classification task.
// In this problem type, there are n_classes possible label values, and the
// model predicts n_predicted_labels labels.
// The output is a sequence of n_predicted_labels labels, out of n_classes
// possible classes. The order of the predicted output labels is determined
// by the predictions_order field.
// (*) MultiClassClassification is the same as TopKClassification with
//     n_predicted_labels = 1.
// (*) TopKClassification does NOT mean multi-class multi-label classification:
//     e.g., the output contains a sequence of labels, all coming from the same
//     label column in the data.
message TopKClassification {
  // Tags this message as the TOP_K_CLASSIFICATION task type.
  option (task_type) = TOP_K_CLASSIFICATION;

  // The label column, identified either by flat name or by path. As members
  // of a oneof, at most one of `label` and `label_path` is set at a time.
  oneof label_id {
    // The name of the label. Assumes the label is a flat, top-level field.
    string label = 1;
    // A path can be used instead of a flat string if the label is nested.
    Path label_path = 6;
  }
  // (optional) The weight column.
  string example_weight = 2;
  // (optional) The number of label classes. If unset, the solution provider
  // is expected to infer the number of classes from the data.
  uint64 n_classes = 3;
  // (optional) The number of class labels to predict. If unset, we assume 1.
  uint64 n_predicted_labels = 4;
  // Possible orderings of the predicted labels in the output sequence.
  enum Order {
    // Zero value; the proto3 default when no ordering has been set.
    UNSPECIFIED = 0;
    // Predictions are ordered from the most likely to least likely.
    SCORE_DESC = 1;
    // Predictions are ordered from the least likely to most likely.
    SCORE_ASC = 2;
  }
  // The order of the predicted output labels.
  Order predictions_order = 5;
}

// A one-dimensional regression task.
// The output is a single real number, whose range is dependent upon the
// objective.
message OneDimensionalRegression {
  // Tags this message as the ONE_DIMENSIONAL_REGRESSION task type.
  option (task_type) = ONE_DIMENSIONAL_REGRESSION;

  // The label column, identified either by flat name or by path. As members
  // of a oneof, at most one of `label` and `label_path` is set at a time.
  oneof label_id {  // oneof label_id is required.
    // The name of the label. Assumes the label is a flat, top-level field.
    string label = 1;
    // A path can be used instead of a flat string if the label is nested.
    Path label_path = 3;
  }
  // (optional) The weight column.
  // Note: this field is named `weight`, whereas the classification task
  // messages in this file name their weight column field `example_weight`.
  string weight = 2;

  // Defines a regression problem where labels are in [0, 1] and represent a
  // probability (e.g., probability of click).
  message Probability {}

  // Defines a regression problem where the labels are counts, i.e. integers
  // >= 0.
  message Counts {}

  // Further characterizes the label values. At most one of the members is set
  // at a time.
  oneof label_type {
    // When set means the label is a probability in range [0..1].
    Probability probability = 4;
    // When set the label corresponds to counts from a Poisson distribution.
    // E.g., number of googlers contributing to memegen each year.
    Counts counts = 5;
  }
}

// The type of a head or meta-objective. Specifies the label, weight,
// and output type of the head.
// TODO(martinz): add logistic regression.
message Type {
  // The task configuration. As members of a oneof, at most one of the task
  // messages below is set at a time.
  oneof task_type {
    BinaryClassification binary_classification = 1;
    OneDimensionalRegression one_dimensional_regression = 2;
    MultiClassClassification multi_class_classification = 3;
    TopKClassification top_k_classification = 4;
    MultiLabelClassification multi_label_classification = 5;
  }
}


// Describes a single task in a model and all its properties.
// A task corresponds to a single output of the model.
// Multiple tasks in the same problem statement correspond to different outputs
// of the model.
message Task {
  // Field number 3 is reserved and must not be reused.
  reserved 3;
  // Specification of the label and weight columns, and the type of the
  // prediction or classification.
  Type type = 1;

  // The task name. Tasks within the same ProblemStatement should have unique
  // names. This is a REQUIRED field in case of multi-task learning problems.
  string name = 5;

  // Deprecated.
  // If a Problem is composed of multiple sub-tasks, the weight of each task
  // determines the importance of solving each sub-task. It is used to
  // rank and select the best solution for multi-task problems.
  // Not meaningful for a problem with one task.
  // If the problem has multiple tasks and all task_weight=0 (unset) then all
  // tasks are weighted equally.
  double task_weight = 2 [deprecated = true];

  // This field includes performance metrics of this head that are important to
  // the problem owner and need to be monitored and reported. However, unlike
  // fields such as "meta_optimization_target", these metrics are not
  // automatically used in meta-optimization.
  repeated PerformanceMetric performance_metric = 4;

  // True to indicate the task is an auxiliary task in a multi-task setting.
  // Auxiliary tasks are of minor relevance for the application and they are
  // added only to improve the performance on a primary task (by providing
  // additional regularization or data augmentation), and thus are not
  // considered in the meta optimization process (but may be utilized in the
  // learner optimization).
  bool is_auxiliary = 6;

}

// The high-level objectives described by this problem statement. These
// objectives provide a basis for ranking models and can be optimized by a meta
// optimizer (e.g. a grid search over hyperparameters). A solution provider may
// also directly use the meta optimization targets to heuristically select
// losses, etc. without any meta-optimization process. If not specified, the
// high-level meta optimization target is inferred from the task. These
// objectives do not need to be differentiable, as the solution provider may use
// a proxy function to optimize model weights. Target definitions include tasks,
// metrics, and any weighted combination of them.
message MetaOptimizationTarget {
  // Field number 2 is reserved and must not be reused.
  reserved 2;
  // The name of a task in this problem statement producing the
  // prediction or classification for the metric.
  string task_name = 1;

  // The performance metric to be evaluated.
  // The prediction or classification is based upon the task.
  // The label is from the type of the task, or from the override_task.
  // NOTE(review): this message declares no `override_task` field; the
  // reference may be to a removed field (possibly reserved number 2) --
  // confirm.
  PerformanceMetric performance_metric = 3;

  // Configuration for thresholded meta-optimization targets.
  message ThresholdConfig {
    // The kind of threshold. Only `threshold` is currently defined.
    oneof type {
      // If specified, indicates a threshold that the user wishes the metric to
      // stay under (for MINIMIZE type), or above (for MAXIMIZE type). The
      // optimization process need not prefer models that are higher (or lower)
      // on the thresholded metric so long as the threshold is respected.
      // E.g., if `threshold` for a MAXIMIZE type metric X is .9, the
      // optimization process will prefer a solution with X = .92 over a
      // solution with X = .88, but may not prefer a solution with X = .95 over
      // a solution with X = .92. Unless otherwise specified by the
      // PerformanceMetric, threshold is best effort. It does not provide a hard
      // guarantee about the properties of the final model, but rather serves as
      // a "target" to guide the optimization process. The user is responsible
      // for validating that final model metrics are in an acceptable range for
      // the application. A problem statement may, however, be rejected if the
      // specified target is impossible to achieve. Keep this in mind if running
      // the optimization on a recurring basis, as shifts in the data could push
      // a previously achievable target to being unachievable (and thus yield no
      // solution). The units and range for the threshold will be the same as
      // the valid output range of the associated performance_metric.
      double threshold = 1;

    }
  }

  // Describes how to combine with other objectives. As members of a oneof, at
  // most one of `weight` and `threshold_config` is set at a time.
  oneof objective_combination {
    // Deprecated.
    // If a model spec has multiple meta optimization targets, the weight
    // of each can be specified. The final objective is then a weighted
    // combination of the multiple objectives. If not specified, value is 1.
    double weight = 4 [deprecated = true];

    // Secondary meta optimization targets can be thresholded, meaning that the
    // optimization process prefers solutions above (or below) the threshold,
    // but need not prefer solutions higher (or lower) on the metric if the
    // threshold is met.
    ThresholdConfig threshold_config = 5;
  }
}

// Top-level description of a learning problem: the tasks (model outputs) to
// be solved, the targets used to compare candidate solutions, and descriptive
// metadata such as description, owners, and environment.
// NOTE(review): field numbers 1 and 6 are neither used nor reserved here;
// confirm whether they belonged to removed fields and should be reserved.
message ProblemStatement {
  // Description of the problem statement. For example, should describe how
  // the problem statement was arrived at: what experiments were run, what
  // side-by-sides were considered.
  string description = 2;
  // Owners of this problem statement. NOTE(review): the expected format of
  // each entry (e.g. username vs. email) is not specified here -- confirm.
  repeated string owner = 3;

  // The environment of the ProblemStatement (optional). Specifies an
  // environment string in the SchemaProto.
  string environment = 4;

  // The target used for meta-optimization. This is used to compare multiple
  // solutions for this problem. For example, if two solutions have different
  // candidates, a tuning tool can use meta_optimization_target to decide which
  // candidate performs the best.
  // A repeated meta-optimization target implies the weighted sum of the
  // meta_optimization targets of any non-thresholded metrics.
  repeated MetaOptimizationTarget meta_optimization_target = 7;
  // Deprecated. NOTE(review): the intended semantics of this flag are not
  // documented in this file.
  bool multi_objective = 8 [deprecated = true];

  // Field number 5 is reserved and must not be reused.
  reserved 5;

  // Tasks for heads of the generated model. This field is repeated because some
  // models are multi-task models. Each task should have a unique name.
  // If you wish to directly optimize this problem statement, you need
  // to specify the objective in the task.
  repeated Task tasks = 9;
}