// All Downloads are FREE. Search and download functionalities are using the
// official Maven repository.
//
// google.cloud.documentai.v1beta3.dataset.proto Maven / Gradle / Ivy
//
// There is a newer version: 0.71.0
// Show newest version

// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.documentai.v1beta3;

// Annotation definitions: `google.api.field_behavior` and the
// `google.api.resource*` options used throughout this file.
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
// Sibling Document AI protos. Types used below that are not defined in this
// file (`GcsPrefix`, `RevisionRef`, `DocumentSchema`) presumably come from
// these imports.
import "google/cloud/documentai/v1beta3/document.proto";
import "google/cloud/documentai/v1beta3/document_io.proto";
import "google/cloud/documentai/v1beta3/document_schema.proto";

// Per-language code generation settings (namespaces / packages of the
// generated classes).
option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
option go_package = "cloud.google.com/go/documentai/apiv1beta3/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DatasetProto";
option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";
// Declares the Document AI Warehouse `Schema` resource type at file level.
// No message in this file represents that resource; the definition exists so
// that `Dataset.DocumentWarehouseConfig.schema` can refer to it through
// `google.api.resource_reference`.
option (google.api.resource_definition) = {
  type: "contentwarehouse.googleapis.com/Schema"
  pattern: "projects/{project}/locations/{location}/schemas/{schema}"
};

// A singleton resource under a
// [Processor][google.cloud.documentai.v1beta3.Processor] which configures a
// collection of documents.
//
// The resource pattern ends in the literal segment `dataset` (there is no
// `{dataset}` id), so each processor has at most one dataset.
message Dataset {
  option (google.api.resource) = {
    type: "documentai.googleapis.com/Dataset"
    pattern: "projects/{project}/locations/{location}/processors/{processor}/dataset"
  };

  // Configuration specific to the Cloud Storage-based implementation.
  message GCSManagedConfig {
    // Required. The Cloud Storage URI (a directory) where the documents
    // belonging to the dataset must be stored.
    GcsPrefix gcs_prefix = 1 [(google.api.field_behavior) = REQUIRED];
  }

  // Configuration specific to the Document AI Warehouse-based implementation.
  //
  // Only used by the deprecated `document_warehouse_config` field; both
  // fields of this message are output only.
  message DocumentWarehouseConfig {
    // Output only. The collection in Document AI Warehouse associated with the
    // dataset.
    string collection = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. The schema in Document AI Warehouse associated with the
    // dataset. Holds the resource name of a
    // `contentwarehouse.googleapis.com/Schema`.
    string schema = 2 [
      (google.api.field_behavior) = OUTPUT_ONLY,
      (google.api.resource_reference) = {
        type: "contentwarehouse.googleapis.com/Schema"
      }
    ];
  }

  // Configuration specific to an unmanaged dataset.
  //
  // Intentionally empty: setting `unmanaged_dataset_config` to an empty
  // message is what selects this storage option.
  message UnmanagedDatasetConfig {}

  // Configuration specific to spanner-based indexing.
  //
  // Intentionally empty: setting `spanner_indexing_config` to an empty message
  // is what selects this indexing option.
  message SpannerIndexingConfig {}

  // Initialization states of a dataset.
  enum State {
    // Default unspecified enum, should not be used.
    STATE_UNSPECIFIED = 0;

    // Dataset has not been initialized.
    UNINITIALIZED = 1;

    // Dataset is being initialized.
    INITIALIZING = 2;

    // Dataset has been initialized.
    INITIALIZED = 3;
  }

  // Where the documents of the dataset are stored. At most one of the
  // following configurations can be set.
  oneof storage_source {
    // Optional. User-managed Cloud Storage dataset configuration. Use this
    // configuration if the dataset documents are stored under a user-managed
    // Cloud Storage location.
    GCSManagedConfig gcs_managed_config = 3
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. Deprecated. Warehouse-based dataset configuration is not
    // supported.
    DocumentWarehouseConfig document_warehouse_config = 5
        [deprecated = true, (google.api.field_behavior) = OPTIONAL];

    // Optional. Unmanaged dataset configuration. Use this configuration if the
    // dataset documents are managed by the document service internally (not
    // user-managed).
    UnmanagedDatasetConfig unmanaged_dataset_config = 6
        [(google.api.field_behavior) = OPTIONAL];
  }

  // How the documents of the dataset are indexed. Spanner-based indexing is
  // the only option defined here.
  oneof indexing_source {
    // Optional. A lightweight indexing source with low latency and high
    // reliability, but lacking advanced features like CMEK and content-based
    // search.
    SpannerIndexingConfig spanner_indexing_config = 4
        [(google.api.field_behavior) = OPTIONAL];
  }

  // Dataset resource name.
  // Format:
  // `projects/{project}/locations/{location}/processors/{processor}/dataset`
  string name = 1;

  // NOTE(review): `state` is annotated REQUIRED but is documented as ignored
  // on update - confirm whether OUTPUT_ONLY is the intended behavior.

  // Required. State of the dataset. Ignored when updating dataset.
  State state = 2 [(google.api.field_behavior) = REQUIRED];
}

// Document Identifier.
//
// Identifies one document of a dataset, and optionally one revision of that
// document. Which `type` variant applies depends on the storage option of the
// dataset (user-managed Cloud Storage or unmanaged).
message DocumentId {
  // Identifies a document uniquely within the scope of a dataset in the
  // user-managed Cloud Storage option.
  message GCSManagedDocumentId {
    // Required. The Cloud Storage URI where the actual document is stored.
    string gcs_uri = 1 [(google.api.field_behavior) = REQUIRED];

    // Deprecated. Id of the document (indexed) managed by Content Warehouse.
    string cw_doc_id = 2 [deprecated = true];
  }

  // Identifies a document uniquely within the scope of a dataset in unmanaged
  // option.
  message UnmanagedDocumentId {
    // Required. The id of the document.
    string doc_id = 1 [(google.api.field_behavior) = REQUIRED];
  }

  // The kind of document id. At most one of the following can be set.
  oneof type {
    // A document id within user-managed Cloud Storage.
    GCSManagedDocumentId gcs_managed_doc_id = 1;

    // A document id within unmanaged dataset.
    UnmanagedDocumentId unmanaged_doc_id = 4;
  }

  // Points to a specific revision of the document if set.
  RevisionRef revision_ref = 3;
}

// Dataset Schema.
//
// A singleton resource nested under a dataset: the resource pattern ends in
// the literal segment `datasetSchema`, so a dataset has at most one schema
// resource.
message DatasetSchema {
  option (google.api.resource) = {
    type: "documentai.googleapis.com/DatasetSchema"
    pattern: "projects/{project}/locations/{location}/processors/{processor}/dataset/datasetSchema"
  };

  // Dataset schema resource name.
  // Format:
  // `projects/{project}/locations/{location}/processors/{processor}/dataset/datasetSchema`
  string name = 1;

  // Optional. Schema of the dataset.
  DocumentSchema document_schema = 3 [(google.api.field_behavior) = OPTIONAL];
}

// Dataset documents that the batch operation will be applied to.
//
// The documents are selected either by listing their ids explicitly or by a
// filter expression, never both.
message BatchDatasetDocuments {
  // List of individual DocumentIds.
  message IndividualDocumentIds {
    // Required. List of Document IDs indicating where the actual documents are
    // stored.
    repeated DocumentId document_ids = 1
        [(google.api.field_behavior) = REQUIRED];
  }

  // How the target documents are selected. At most one of the following can
  // be set.
  oneof criteria {
    // Document identifiers.
    IndividualDocumentIds individual_document_ids = 1;

    // A filter matching the documents.
    // Follows the same format and restriction as
    // [ListDocumentsRequest.filter][google.cloud.documentai.v1beta3.ListDocumentsRequest.filter].
    string filter = 2;
  }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy