// google.cloud.documentai.v1beta3.document_io.proto
// PROTO library for proto-google-cloud-document-ai-v1beta3
//
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";

package google.cloud.documentai.v1beta3;

import "google/protobuf/field_mask.proto";

// Per-language code generation settings.
option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
option go_package = "cloud.google.com/go/documentai/apiv1beta3/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DocumentIoProto";
option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";

// Payload message carrying raw document content (bytes).
message RawDocument {
  // The document content, supplied inline.
  bytes content = 1;

  // IANA MIME type (RFC6838) describing the nature and format of the
  // [content][google.cloud.documentai.v1beta3.RawDocument.content].
  string mime_type = 2;

  // Display name of the document. All Unicode characters are supported
  // except the following, which are reserved:
  // `*`, `?`, `[`, `]`, `%`, `{`, `}`, `'`, `\"`, `,`, `~`, `=` and `:`.
  // If not specified, a default ID is generated.
  string display_name = 3;
}

// Identifies a single document stored on Cloud Storage.
message GcsDocument {
  // URI of the Cloud Storage object.
  string gcs_uri = 1;

  // IANA MIME type (RFC6838) of the content.
  string mime_type = 2;
}

// Identifies a set of documents stored on Cloud Storage.
message GcsDocuments {
  // The documents that make up the set.
  repeated GcsDocument documents = 1;
}

// Identifies every document on Cloud Storage that shares a common prefix.
message GcsPrefix {
  // The URI prefix.
  string gcs_uri_prefix = 1;
}

// Common config for specifying the set of documents used as input.
message BatchDocumentsInputConfig {
  // Where the input documents come from; exactly one member may be set.
  oneof source {
    // Documents matching the specified Cloud Storage `gcs_prefix`.
    GcsPrefix gcs_prefix = 1;

    // Documents listed individually on Cloud Storage.
    GcsDocuments gcs_documents = 2;
  }
}

// Controls how documents are output. Every document is written as a JSON
// file.
message DocumentOutputConfig {
  // Configuration applied when documents are output.
  message GcsOutputConfig {
    // Sharding settings for the output document.
    message ShardingConfig {
      // Number of pages per shard.
      int32 pages_per_shard = 1;

      // Number of pages that overlap between consecutive shards.
      int32 pages_overlap = 2;
    }

    // Cloud Storage uri (a directory) that receives the output.
    string gcs_uri = 1;

    // Selects the fields to include in the output documents.
    // Only top level document and pages fields are supported, so each path
    // must have the form `{document_field_name}` or
    // `pages.{page_field_name}`.
    google.protobuf.FieldMask field_mask = 2;

    // Sharding settings for the output document.
    ShardingConfig sharding_config = 3;
  }

  // Destination of the results.
  oneof destination {
    // Output config for writing the results to Cloud Storage.
    GcsOutputConfig gcs_output_config = 1;
  }
}

// Configuration for Document OCR.
message OcrConfig {
  // Hints passed to the OCR engine.
  message Hints {
    // BCP-47 language codes to use for OCR. Leaving this unset usually gives
    // the best results because it enables automatic language detection, and
    // hints are not needed for languages based on the Latin alphabet. In rare
    // cases where the language of the text in the image is known, a hint
    // helps produce better results (although a wrong hint is a significant
    // hindrance).
    repeated string language_hints = 1;
  }

  // Settings for premium OCR features.
  message PremiumFeatures {
    // Turns on the selection mark detector in the OCR engine. Only available
    // in OCR 2.0 (and later) processors.
    bool enable_selection_mark_detection = 3;

    // Turns on the font identification model and returns font style
    // information.
    bool compute_style_info = 4;

    // Turns on the model that can extract LaTeX math formulas.
    bool enable_math_ocr = 5;
  }

  // Hints for the OCR model.
  Hints hints = 2;

  // Enables special handling for PDFs that already contain text information,
  // which results in better text extraction quality for such PDF inputs.
  bool enable_native_pdf_parsing = 3;

  // Enables intelligent document quality scores after OCR. These can help
  // diagnose why OCR responses are of poor quality for a given input.
  // Adds latency comparable to regular OCR to the process call.
  bool enable_image_quality_scores = 4;

  // Advanced OCR options for further fine-tuning OCR behavior. Current valid
  // values are:
  //
  // - `legacy_layout`: a heuristics layout detection algorithm, which serves
  // as an alternative to the current ML-based layout detection algorithm.
  // Customers can choose the best suitable layout algorithm based on their
  // situation.
  repeated string advanced_ocr_options = 5;

  // When set to true, symbol level OCR information is included.
  bool enable_symbol = 6;

  // Turns on the font identification model and returns font style
  // information.
  // Deprecated, use
  // [PremiumFeatures.compute_style_info][google.cloud.documentai.v1beta3.OcrConfig.PremiumFeatures.compute_style_info]
  // instead.
  bool compute_style_info = 8 [deprecated = true];

  // Turns off the character box detector in the OCR engine. Character box
  // detection is enabled by default in OCR 2.0 (and later) processors.
  bool disable_character_boxes_detection = 10;

  // Settings for premium OCR features.
  PremiumFeatures premium_features = 11;
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy