diff --git a/automl/google/cloud/automl_v1beta1/gapic/enums.py b/automl/google/cloud/automl_v1beta1/gapic/enums.py index a34ed876b1ca..9d817f8ec807 100644 --- a/automl/google/cloud/automl_v1beta1/gapic/enums.py +++ b/automl/google/cloud/automl_v1beta1/gapic/enums.py @@ -83,6 +83,65 @@ class TypeCode(enum.IntEnum): CATEGORY = 10 +class Document(object): + class Layout(object): + class TextSegmentType(enum.IntEnum): + """ + The type of TextSegment in the context of the original document. + + Attributes: + TEXT_SEGMENT_TYPE_UNSPECIFIED (int): Should not be used. + TOKEN (int): The text segment is a token. e.g. word. + PARAGRAPH (int): The text segment is a paragraph. + FORM_FIELD (int): The text segment is a form field. + FORM_FIELD_NAME (int): The text segment is the name part of a form field. It will be treated as + child of another FORM\_FIELD TextSegment if its span is subspan of + another TextSegment with type FORM\_FIELD. + FORM_FIELD_CONTENTS (int): The text segment is the text content part of a form field. It will be + treated as child of another FORM\_FIELD TextSegment if its span is + subspan of another TextSegment with type FORM\_FIELD. + TABLE (int): The text segment is a whole table, including headers, and all rows. + TABLE_HEADER (int): The text segment is a table's headers. It will be treated as child of + another TABLE TextSegment if its span is subspan of another TextSegment + with type TABLE. + TABLE_ROW (int): The text segment is a row in table. It will be treated as child of + another TABLE TextSegment if its span is subspan of another TextSegment + with type TABLE. + TABLE_CELL (int): The text segment is a cell in table. It will be treated as child of + another TABLE\_ROW TextSegment if its span is subspan of another + TextSegment with type TABLE\_ROW. 
+ """ + + TEXT_SEGMENT_TYPE_UNSPECIFIED = 0 + TOKEN = 1 + PARAGRAPH = 2 + FORM_FIELD = 3 + FORM_FIELD_NAME = 4 + FORM_FIELD_CONTENTS = 5 + TABLE = 6 + TABLE_HEADER = 7 + TABLE_ROW = 8 + TABLE_CELL = 9 + + +class DocumentDimensions(object): + class DocumentDimensionUnit(enum.IntEnum): + """ + Unit of the document dimension. + + Attributes: + DOCUMENT_DIMENSION_UNIT_UNSPECIFIED (int): Should not be used. + INCH (int): Document dimension is measured in inches. + CENTIMETER (int): Document dimension is measured in centimeters. + POINT (int): Document dimension is measured in points. 72 points = 1 inch. + """ + + DOCUMENT_DIMENSION_UNIT_UNSPECIFIED = 0 + INCH = 1 + CENTIMETER = 2 + POINT = 3 + + class Model(object): class DeploymentState(enum.IntEnum): """ diff --git a/automl/google/cloud/automl_v1beta1/proto/data_items.proto b/automl/google/cloud/automl_v1beta1/proto/data_items.proto index fec75d3bbd73..fdbea7042d84 100644 --- a/automl/google/cloud/automl_v1beta1/proto/data_items.proto +++ b/automl/google/cloud/automl_v1beta1/proto/data_items.proto @@ -17,11 +17,13 @@ syntax = "proto3"; package google.cloud.automl.v1beta1; +import "google/api/annotations.proto"; +import "google/cloud/automl/v1beta1/geometry.proto"; import "google/cloud/automl/v1beta1/io.proto"; +import "google/cloud/automl/v1beta1/text_segment.proto"; import "google/protobuf/any.proto"; import "google/protobuf/duration.proto"; import "google/protobuf/struct.proto"; -import "google/api/annotations.proto"; option go_package = "google.golang.org/genproto/googleapis/cloud/automl/v1beta1;automl"; option java_multiple_files = true; @@ -56,19 +58,135 @@ message TextSnippet { // characters long. string content = 1; - // Optional. The format of [content][google.cloud.automl.v1beta1.TextSnippet.content]. Currently the only two allowed - // values are "text/html" and "text/plain". 
If left blank, the format is - // automatically determined from the type of the uploaded [content][google.cloud.automl.v1beta1.TextSnippet.content]. + // Optional. The format of + // [content][google.cloud.automl.v1beta1.TextSnippet.content]. Currently the + // only two allowed values are "text/html" and "text/plain". If left blank, + // the format is automatically determined from the type of the uploaded + // [content][google.cloud.automl.v1beta1.TextSnippet.content]. string mime_type = 2; // Output only. HTTP URI where you can download the content. string content_uri = 4; } +// Message that describes dimension of a document. +message DocumentDimensions { + // Unit of the document dimension. + enum DocumentDimensionUnit { + // Should not be used. + DOCUMENT_DIMENSION_UNIT_UNSPECIFIED = 0; + + // Document dimension is measured in inches. + INCH = 1; + + // Document dimension is measured in centimeters. + CENTIMETER = 2; + + // Document dimension is measured in points. 72 points = 1 inch. + POINT = 3; + } + + // Unit of the dimension. + DocumentDimensionUnit unit = 1; + + // Width value of the document, works together with the unit. + float width = 2; + + // Height value of the document, works together with the unit. + float height = 3; +} + // A structured text document e.g. a PDF. message Document { + // Describes the layout information of a + // [text_segment][google.cloud.automl.v1beta1.Document.Layout.text_segment] in + // the document. + message Layout { + // The type of TextSegment in the context of the original document. + enum TextSegmentType { + // Should not be used. + TEXT_SEGMENT_TYPE_UNSPECIFIED = 0; + + // The text segment is a token. e.g. word. + TOKEN = 1; + + // The text segment is a paragraph. + PARAGRAPH = 2; + + // The text segment is a form field. + FORM_FIELD = 3; + + // The text segment is the name part of a form field. 
It will be treated + // as child of another FORM_FIELD TextSegment if its span is subspan of + // another TextSegment with type FORM_FIELD. + FORM_FIELD_NAME = 4; + + // The text segment is the text content part of a form field. It will be + // treated as child of another FORM_FIELD TextSegment if its span is + // subspan of another TextSegment with type FORM_FIELD. + FORM_FIELD_CONTENTS = 5; + + // The text segment is a whole table, including headers, and all rows. + TABLE = 6; + + // The text segment is a table's headers. It will be treated as child of + // another TABLE TextSegment if its span is subspan of another TextSegment + // with type TABLE. + TABLE_HEADER = 7; + + // The text segment is a row in table. It will be treated as child of + // another TABLE TextSegment if its span is subspan of another TextSegment + // with type TABLE. + TABLE_ROW = 8; + + // The text segment is a cell in table. It will be treated as child of + // another TABLE_ROW TextSegment if its span is subspan of another + // TextSegment with type TABLE_ROW. + TABLE_CELL = 9; + } + + // Text Segment that represents a segment in + // [document_text][google.cloud.automl.v1beta1.Document.document_text]. + TextSegment text_segment = 1; + + // Page number of the + // [text_segment][google.cloud.automl.v1beta1.Document.Layout.text_segment] + // in the original document, starts from 1. + int32 page_number = 2; + + // The position of the + // [text_segment][google.cloud.automl.v1beta1.Document.Layout.text_segment] + // in the page. Contains exactly 4 + // + // [normalized_vertices][google.cloud.automl.v1beta1.BoundingPoly.normalized_vertices] + // and they are connected by edges in the order provided, which will + // represent a rectangle parallel to the frame. The + // [NormalizedVertex-s][google.cloud.automl.v1beta1.NormalizedVertex] are + // relative to the page. + // Coordinates are based on top-left as point (0,0). 
+ BoundingPoly bounding_poly = 3; + + // The type of the + // [text_segment][google.cloud.automl.v1beta1.Document.Layout.text_segment] + // in document. + TextSegmentType text_segment_type = 4; + } + // An input config specifying the content of the document. DocumentInputConfig input_config = 1; + + // The plain text version of this document. + TextSnippet document_text = 2; + + // Describes the layout of the document. + // Sorted by [page_number][]. + repeated Layout layout = 3; + + // The dimensions of the page in the document. + DocumentDimensions document_dimensions = 4; + + // Number of pages in the document. + int32 page_count = 5; } // A representation of a row in a relational table. diff --git a/automl/google/cloud/automl_v1beta1/proto/data_items_pb2.py b/automl/google/cloud/automl_v1beta1/proto/data_items_pb2.py index 75d030ce87e6..ee388d632364 100644 --- a/automl/google/cloud/automl_v1beta1/proto/data_items_pb2.py +++ b/automl/google/cloud/automl_v1beta1/proto/data_items_pb2.py @@ -15,13 +15,19 @@ _sym_db = _symbol_database.Default() +from google.api import annotations_pb2 as google_dot_api_dot_annotations__pb2 +from google.cloud.automl_v1beta1.proto import ( + geometry_pb2 as google_dot_cloud_dot_automl__v1beta1_dot_proto_dot_geometry__pb2, +) from google.cloud.automl_v1beta1.proto import ( io_pb2 as google_dot_cloud_dot_automl__v1beta1_dot_proto_dot_io__pb2, ) +from google.cloud.automl_v1beta1.proto import ( + text_segment_pb2 as google_dot_cloud_dot_automl__v1beta1_dot_proto_dot_text__segment__pb2, +) from google.protobuf import any_pb2 as google_dot_protobuf_dot_any__pb2 from google.protobuf import duration_pb2 as google_dot_protobuf_dot_duration__pb2 from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 -from google.api import annotations_pb2 as google_dot_api_dot_annotations__pb2 DESCRIPTOR = _descriptor.FileDescriptor( @@ -32,18 +38,107 @@ 
"\n\037com.google.cloud.automl.v1beta1P\001ZAgoogle.golang.org/genproto/googleapis/cloud/automl/v1beta1;automl\312\002\033Google\\Cloud\\AutoMl\\V1beta1\352\002\036Google::Cloud::AutoML::V1beta1" ), serialized_pb=_b( - '\n2google/cloud/automl_v1beta1/proto/data_items.proto\x12\x1bgoogle.cloud.automl.v1beta1\x1a*google/cloud/automl_v1beta1/proto/io.proto\x1a\x19google/protobuf/any.proto\x1a\x1egoogle/protobuf/duration.proto\x1a\x1cgoogle/protobuf/struct.proto\x1a\x1cgoogle/api/annotations.proto"\x7f\n\x05Image\x12\x15\n\x0bimage_bytes\x18\x01 \x01(\x0cH\x00\x12@\n\x0cinput_config\x18\x06 \x01(\x0b\x32(.google.cloud.automl.v1beta1.InputConfigH\x00\x12\x15\n\rthumbnail_uri\x18\x04 \x01(\tB\x06\n\x04\x64\x61ta"F\n\x0bTextSnippet\x12\x0f\n\x07\x63ontent\x18\x01 \x01(\t\x12\x11\n\tmime_type\x18\x02 \x01(\t\x12\x13\n\x0b\x63ontent_uri\x18\x04 \x01(\t"R\n\x08\x44ocument\x12\x46\n\x0cinput_config\x18\x01 \x01(\x0b\x32\x30.google.cloud.automl.v1beta1.DocumentInputConfig"F\n\x03Row\x12\x17\n\x0f\x63olumn_spec_ids\x18\x02 \x03(\t\x12&\n\x06values\x18\x03 \x03(\x0b\x32\x16.google.protobuf.Value"\xfe\x01\n\x0e\x45xamplePayload\x12\x33\n\x05image\x18\x01 \x01(\x0b\x32".google.cloud.automl.v1beta1.ImageH\x00\x12@\n\x0ctext_snippet\x18\x02 \x01(\x0b\x32(.google.cloud.automl.v1beta1.TextSnippetH\x00\x12\x39\n\x08\x64ocument\x18\x04 \x01(\x0b\x32%.google.cloud.automl.v1beta1.DocumentH\x00\x12/\n\x03row\x18\x03 \x01(\x0b\x32 .google.cloud.automl.v1beta1.RowH\x00\x42\t\n\x07payloadB\xa5\x01\n\x1f\x63om.google.cloud.automl.v1beta1P\x01ZAgoogle.golang.org/genproto/googleapis/cloud/automl/v1beta1;automl\xca\x02\x1bGoogle\\Cloud\\AutoMl\\V1beta1\xea\x02\x1eGoogle::Cloud::AutoML::V1beta1b\x06proto3' + 
'\n2google/cloud/automl_v1beta1/proto/data_items.proto\x12\x1bgoogle.cloud.automl.v1beta1\x1a\x1cgoogle/api/annotations.proto\x1a\x30google/cloud/automl_v1beta1/proto/geometry.proto\x1a*google/cloud/automl_v1beta1/proto/io.proto\x1a\x34google/cloud/automl_v1beta1/proto/text_segment.proto\x1a\x19google/protobuf/any.proto\x1a\x1egoogle/protobuf/duration.proto\x1a\x1cgoogle/protobuf/struct.proto"\x7f\n\x05Image\x12\x15\n\x0bimage_bytes\x18\x01 \x01(\x0cH\x00\x12@\n\x0cinput_config\x18\x06 \x01(\x0b\x32(.google.cloud.automl.v1beta1.InputConfigH\x00\x12\x15\n\rthumbnail_uri\x18\x04 \x01(\tB\x06\n\x04\x64\x61ta"F\n\x0bTextSnippet\x12\x0f\n\x07\x63ontent\x18\x01 \x01(\t\x12\x11\n\tmime_type\x18\x02 \x01(\t\x12\x13\n\x0b\x63ontent_uri\x18\x04 \x01(\t"\xef\x01\n\x12\x44ocumentDimensions\x12S\n\x04unit\x18\x01 \x01(\x0e\x32\x45.google.cloud.automl.v1beta1.DocumentDimensions.DocumentDimensionUnit\x12\r\n\x05width\x18\x02 \x01(\x02\x12\x0e\n\x06height\x18\x03 \x01(\x02"e\n\x15\x44ocumentDimensionUnit\x12\'\n#DOCUMENT_DIMENSION_UNIT_UNSPECIFIED\x10\x00\x12\x08\n\x04INCH\x10\x01\x12\x0e\n\nCENTIMETER\x10\x02\x12\t\n\x05POINT\x10\x03"\xf9\x05\n\x08\x44ocument\x12\x46\n\x0cinput_config\x18\x01 \x01(\x0b\x32\x30.google.cloud.automl.v1beta1.DocumentInputConfig\x12?\n\rdocument_text\x18\x02 \x01(\x0b\x32(.google.cloud.automl.v1beta1.TextSnippet\x12<\n\x06layout\x18\x03 \x03(\x0b\x32,.google.cloud.automl.v1beta1.Document.Layout\x12L\n\x13\x64ocument_dimensions\x18\x04 \x01(\x0b\x32/.google.cloud.automl.v1beta1.DocumentDimensions\x12\x12\n\npage_count\x18\x05 \x01(\x05\x1a\xc3\x03\n\x06Layout\x12>\n\x0ctext_segment\x18\x01 \x01(\x0b\x32(.google.cloud.automl.v1beta1.TextSegment\x12\x13\n\x0bpage_number\x18\x02 \x01(\x05\x12@\n\rbounding_poly\x18\x03 \x01(\x0b\x32).google.cloud.automl.v1beta1.BoundingPoly\x12W\n\x11text_segment_type\x18\x04 
\x01(\x0e\x32<.google.cloud.automl.v1beta1.Document.Layout.TextSegmentType"\xc8\x01\n\x0fTextSegmentType\x12!\n\x1dTEXT_SEGMENT_TYPE_UNSPECIFIED\x10\x00\x12\t\n\x05TOKEN\x10\x01\x12\r\n\tPARAGRAPH\x10\x02\x12\x0e\n\nFORM_FIELD\x10\x03\x12\x13\n\x0f\x46ORM_FIELD_NAME\x10\x04\x12\x17\n\x13\x46ORM_FIELD_CONTENTS\x10\x05\x12\t\n\x05TABLE\x10\x06\x12\x10\n\x0cTABLE_HEADER\x10\x07\x12\r\n\tTABLE_ROW\x10\x08\x12\x0e\n\nTABLE_CELL\x10\t"F\n\x03Row\x12\x17\n\x0f\x63olumn_spec_ids\x18\x02 \x03(\t\x12&\n\x06values\x18\x03 \x03(\x0b\x32\x16.google.protobuf.Value"\xfe\x01\n\x0e\x45xamplePayload\x12\x33\n\x05image\x18\x01 \x01(\x0b\x32".google.cloud.automl.v1beta1.ImageH\x00\x12@\n\x0ctext_snippet\x18\x02 \x01(\x0b\x32(.google.cloud.automl.v1beta1.TextSnippetH\x00\x12\x39\n\x08\x64ocument\x18\x04 \x01(\x0b\x32%.google.cloud.automl.v1beta1.DocumentH\x00\x12/\n\x03row\x18\x03 \x01(\x0b\x32 .google.cloud.automl.v1beta1.RowH\x00\x42\t\n\x07payloadB\xa5\x01\n\x1f\x63om.google.cloud.automl.v1beta1P\x01ZAgoogle.golang.org/genproto/googleapis/cloud/automl/v1beta1;automl\xca\x02\x1bGoogle\\Cloud\\AutoMl\\V1beta1\xea\x02\x1eGoogle::Cloud::AutoML::V1beta1b\x06proto3' ), dependencies=[ + google_dot_api_dot_annotations__pb2.DESCRIPTOR, + google_dot_cloud_dot_automl__v1beta1_dot_proto_dot_geometry__pb2.DESCRIPTOR, google_dot_cloud_dot_automl__v1beta1_dot_proto_dot_io__pb2.DESCRIPTOR, + google_dot_cloud_dot_automl__v1beta1_dot_proto_dot_text__segment__pb2.DESCRIPTOR, google_dot_protobuf_dot_any__pb2.DESCRIPTOR, google_dot_protobuf_dot_duration__pb2.DESCRIPTOR, google_dot_protobuf_dot_struct__pb2.DESCRIPTOR, - google_dot_api_dot_annotations__pb2.DESCRIPTOR, ], ) +_DOCUMENTDIMENSIONS_DOCUMENTDIMENSIONUNIT = _descriptor.EnumDescriptor( + name="DocumentDimensionUnit", + full_name="google.cloud.automl.v1beta1.DocumentDimensions.DocumentDimensionUnit", + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name="DOCUMENT_DIMENSION_UNIT_UNSPECIFIED", + index=0, + 
number=0, + serialized_options=None, + type=None, + ), + _descriptor.EnumValueDescriptor( + name="INCH", index=1, number=1, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="CENTIMETER", index=2, number=2, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="POINT", index=3, number=3, serialized_options=None, type=None + ), + ], + containing_type=None, + serialized_options=None, + serialized_start=690, + serialized_end=791, +) +_sym_db.RegisterEnumDescriptor(_DOCUMENTDIMENSIONS_DOCUMENTDIMENSIONUNIT) + +_DOCUMENT_LAYOUT_TEXTSEGMENTTYPE = _descriptor.EnumDescriptor( + name="TextSegmentType", + full_name="google.cloud.automl.v1beta1.Document.Layout.TextSegmentType", + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name="TEXT_SEGMENT_TYPE_UNSPECIFIED", + index=0, + number=0, + serialized_options=None, + type=None, + ), + _descriptor.EnumValueDescriptor( + name="TOKEN", index=1, number=1, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="PARAGRAPH", index=2, number=2, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="FORM_FIELD", index=3, number=3, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="FORM_FIELD_NAME", + index=4, + number=4, + serialized_options=None, + type=None, + ), + _descriptor.EnumValueDescriptor( + name="FORM_FIELD_CONTENTS", + index=5, + number=5, + serialized_options=None, + type=None, + ), + _descriptor.EnumValueDescriptor( + name="TABLE", index=6, number=6, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TABLE_HEADER", index=7, number=7, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TABLE_ROW", index=8, number=8, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TABLE_CELL", index=9, number=9, serialized_options=None, type=None + ), + ], + 
containing_type=None, + serialized_options=None, + serialized_start=1355, + serialized_end=1555, +) +_sym_db.RegisterEnumDescriptor(_DOCUMENT_LAYOUT_TEXTSEGMENTTYPE) + + _IMAGE = _descriptor.Descriptor( name="Image", full_name="google.cloud.automl.v1beta1.Image", @@ -122,8 +217,8 @@ fields=[], ) ], - serialized_start=246, - serialized_end=373, + serialized_start=350, + serialized_end=477, ) @@ -197,11 +292,178 @@ syntax="proto3", extension_ranges=[], oneofs=[], - serialized_start=375, - serialized_end=445, + serialized_start=479, + serialized_end=549, +) + + +_DOCUMENTDIMENSIONS = _descriptor.Descriptor( + name="DocumentDimensions", + full_name="google.cloud.automl.v1beta1.DocumentDimensions", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="unit", + full_name="google.cloud.automl.v1beta1.DocumentDimensions.unit", + index=0, + number=1, + type=14, + cpp_type=8, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="width", + full_name="google.cloud.automl.v1beta1.DocumentDimensions.width", + index=1, + number=2, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="height", + full_name="google.cloud.automl.v1beta1.DocumentDimensions.height", + index=2, + number=3, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + 
enum_types=[_DOCUMENTDIMENSIONS_DOCUMENTDIMENSIONUNIT], + serialized_options=None, + is_extendable=False, + syntax="proto3", + extension_ranges=[], + oneofs=[], + serialized_start=552, + serialized_end=791, ) +_DOCUMENT_LAYOUT = _descriptor.Descriptor( + name="Layout", + full_name="google.cloud.automl.v1beta1.Document.Layout", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="text_segment", + full_name="google.cloud.automl.v1beta1.Document.Layout.text_segment", + index=0, + number=1, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="page_number", + full_name="google.cloud.automl.v1beta1.Document.Layout.page_number", + index=1, + number=2, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="bounding_poly", + full_name="google.cloud.automl.v1beta1.Document.Layout.bounding_poly", + index=2, + number=3, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="text_segment_type", + full_name="google.cloud.automl.v1beta1.Document.Layout.text_segment_type", + index=3, + number=4, + type=14, + cpp_type=8, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + 
extensions=[], + nested_types=[], + enum_types=[_DOCUMENT_LAYOUT_TEXTSEGMENTTYPE], + serialized_options=None, + is_extendable=False, + syntax="proto3", + extension_ranges=[], + oneofs=[], + serialized_start=1104, + serialized_end=1555, +) + _DOCUMENT = _descriptor.Descriptor( name="Document", full_name="google.cloud.automl.v1beta1.Document", @@ -226,18 +488,90 @@ extension_scope=None, serialized_options=None, file=DESCRIPTOR, - ) + ), + _descriptor.FieldDescriptor( + name="document_text", + full_name="google.cloud.automl.v1beta1.Document.document_text", + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="layout", + full_name="google.cloud.automl.v1beta1.Document.layout", + index=2, + number=3, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="document_dimensions", + full_name="google.cloud.automl.v1beta1.Document.document_dimensions", + index=3, + number=4, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="page_count", + full_name="google.cloud.automl.v1beta1.Document.page_count", + index=4, + number=5, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), ], 
extensions=[], - nested_types=[], + nested_types=[_DOCUMENT_LAYOUT], enum_types=[], serialized_options=None, is_extendable=False, syntax="proto3", extension_ranges=[], oneofs=[], - serialized_start=447, - serialized_end=529, + serialized_start=794, + serialized_end=1555, ) @@ -293,8 +627,8 @@ syntax="proto3", extension_ranges=[], oneofs=[], - serialized_start=531, - serialized_end=601, + serialized_start=1557, + serialized_end=1627, ) @@ -394,8 +728,8 @@ fields=[], ) ], - serialized_start=604, - serialized_end=858, + serialized_start=1630, + serialized_end=1884, ) _IMAGE.fields_by_name[ @@ -405,11 +739,33 @@ _IMAGE.fields_by_name["image_bytes"].containing_oneof = _IMAGE.oneofs_by_name["data"] _IMAGE.oneofs_by_name["data"].fields.append(_IMAGE.fields_by_name["input_config"]) _IMAGE.fields_by_name["input_config"].containing_oneof = _IMAGE.oneofs_by_name["data"] +_DOCUMENTDIMENSIONS.fields_by_name[ + "unit" +].enum_type = _DOCUMENTDIMENSIONS_DOCUMENTDIMENSIONUNIT +_DOCUMENTDIMENSIONS_DOCUMENTDIMENSIONUNIT.containing_type = _DOCUMENTDIMENSIONS +_DOCUMENT_LAYOUT.fields_by_name[ + "text_segment" +].message_type = ( + google_dot_cloud_dot_automl__v1beta1_dot_proto_dot_text__segment__pb2._TEXTSEGMENT +) +_DOCUMENT_LAYOUT.fields_by_name[ + "bounding_poly" +].message_type = ( + google_dot_cloud_dot_automl__v1beta1_dot_proto_dot_geometry__pb2._BOUNDINGPOLY +) +_DOCUMENT_LAYOUT.fields_by_name[ + "text_segment_type" +].enum_type = _DOCUMENT_LAYOUT_TEXTSEGMENTTYPE +_DOCUMENT_LAYOUT.containing_type = _DOCUMENT +_DOCUMENT_LAYOUT_TEXTSEGMENTTYPE.containing_type = _DOCUMENT_LAYOUT _DOCUMENT.fields_by_name[ "input_config" ].message_type = ( google_dot_cloud_dot_automl__v1beta1_dot_proto_dot_io__pb2._DOCUMENTINPUTCONFIG ) +_DOCUMENT.fields_by_name["document_text"].message_type = _TEXTSNIPPET +_DOCUMENT.fields_by_name["layout"].message_type = _DOCUMENT_LAYOUT +_DOCUMENT.fields_by_name["document_dimensions"].message_type = _DOCUMENTDIMENSIONS _ROW.fields_by_name["values"].message_type = 
google_dot_protobuf_dot_struct__pb2._VALUE _EXAMPLEPAYLOAD.fields_by_name["image"].message_type = _IMAGE _EXAMPLEPAYLOAD.fields_by_name["text_snippet"].message_type = _TEXTSNIPPET @@ -441,6 +797,7 @@ ] DESCRIPTOR.message_types_by_name["Image"] = _IMAGE DESCRIPTOR.message_types_by_name["TextSnippet"] = _TEXTSNIPPET +DESCRIPTOR.message_types_by_name["DocumentDimensions"] = _DOCUMENTDIMENSIONS DESCRIPTOR.message_types_by_name["Document"] = _DOCUMENT DESCRIPTOR.message_types_by_name["Row"] = _ROW DESCRIPTOR.message_types_by_name["ExamplePayload"] = _EXAMPLEPAYLOAD @@ -505,10 +862,67 @@ ) _sym_db.RegisterMessage(TextSnippet) +DocumentDimensions = _reflection.GeneratedProtocolMessageType( + "DocumentDimensions", + (_message.Message,), + dict( + DESCRIPTOR=_DOCUMENTDIMENSIONS, + __module__="google.cloud.automl_v1beta1.proto.data_items_pb2", + __doc__="""Message that describes dimension of a document. + + + Attributes: + unit: + Unit of the dimension. + width: + Width value of the document, works together with the unit. + height: + Height value of the document, works together with the unit. + """, + # @@protoc_insertion_point(class_scope:google.cloud.automl.v1beta1.DocumentDimensions) + ), +) +_sym_db.RegisterMessage(DocumentDimensions) + Document = _reflection.GeneratedProtocolMessageType( "Document", (_message.Message,), dict( + Layout=_reflection.GeneratedProtocolMessageType( + "Layout", + (_message.Message,), + dict( + DESCRIPTOR=_DOCUMENT_LAYOUT, + __module__="google.cloud.automl_v1beta1.proto.data_items_pb2", + __doc__="""Describes the layout information of a + [text\_segment][google.cloud.automl.v1beta1.Document.Layout.text\_segment] + in the document. + + + Attributes: + text_segment: + Text Segment that represents a segment in [document\_text][goo + gle.cloud.automl.v1beta1.Document.document\_text]. + page_number: + Page number of the [text\_segment][google.cloud.automl.v1beta1 + .Document.Layout.text\_segment] in the original document, + starts from 1. 
+ bounding_poly: + The position of the [text\_segment][google.cloud.automl.v1beta + 1.Document.Layout.text\_segment] in the page. Contains exactly + 4 [normalized\_vertices][google.cloud.automl.v1beta1.Bounding + Poly.normalized\_vertices] and they are connected by edges in + the order provided, which will represent a rectangle parallel + to the frame. The [NormalizedVertex-s][google.cloud.automl.v1b + eta1.NormalizedVertex] are relative to the page. Coordinates + are based on top-left as point (0,0). + text_segment_type: + The type of the [text\_segment][google.cloud.automl.v1beta1.Do + cument.Layout.text\_segment] in document. + """, + # @@protoc_insertion_point(class_scope:google.cloud.automl.v1beta1.Document.Layout) + ), + ), DESCRIPTOR=_DOCUMENT, __module__="google.cloud.automl_v1beta1.proto.data_items_pb2", __doc__="""A structured text document e.g. a PDF. @@ -517,11 +931,21 @@ Attributes: input_config: An input config specifying the content of the document. + document_text: + The plain text version of this document. + layout: + Describes the layout of the document. Sorted by + [page\_number][]. + document_dimensions: + The dimensions of the page in the document. + page_count: + Number of pages in the document. """, # @@protoc_insertion_point(class_scope:google.cloud.automl.v1beta1.Document) ), ) _sym_db.RegisterMessage(Document) +_sym_db.RegisterMessage(Document.Layout) Row = _reflection.GeneratedProtocolMessageType( "Row", diff --git a/automl/google/cloud/automl_v1beta1/proto/io.proto b/automl/google/cloud/automl_v1beta1/proto/io.proto index c08edc277474..6f007f02a10d 100644 --- a/automl/google/cloud/automl_v1beta1/proto/io.proto +++ b/automl/google/cloud/automl_v1beta1/proto/io.proto @@ -140,7 +140,8 @@ option ruby_package = "Google::Cloud::AutoML::V1beta1"; // CSV file(s) with each line in format: // ML_USE,GCS_FILE_PATH // GCS_FILE_PATH leads to a .JSONL (that is, JSON Lines) file which -// either imports text in-line or as documents. 
+// either imports text in-line or as documents. Any given +// .JSONL file must be 100MB or smaller. // The in-line .JSONL file contains, per line, a proto that wraps a // TextSnippet proto (in json representation) followed by one or more // AnnotationPayload protos (called annotations), which have @@ -148,12 +149,16 @@ option ruby_package = "Google::Cloud::AutoML::V1beta1"; // is expected to be annotated exhaustively, for example, if you look // for animals and text contains "dolphin" that is not labeled, then // "dolphin" is assumed to not be an animal. Any given text snippet -// content must have 30,000 characters or less, and also be UTF-8 NFC -// encoded (ASCII already is). The document .JSONL file contains, per line, a proto that wraps a -// Document proto with input_config set. Only PDF documents are -// supported now, and each document may be up to 2MB large. Currently -// annotations on documents cannot be specified at import. Any given -// .JSONL file must be 100MB or smaller. +// content must be 10KB or smaller, and also be UTF-8 NFC encoded +// (ASCII already is). +// The document .JSONL file contains, per line, a proto that wraps a +// Document proto. The Document proto must have either document_text +// or input_config set. In document_text case, the Document proto may +// also contain the spatial information of the document, including +// layout, document dimension and page number. In input_config case, +// only PDF documents are supported now, and each document may be up +// to 2MB large. Currently, annotations on documents cannot be +// specified at import. 
// Three sample CSV rows: // TRAIN,gs://folder/file1.jsonl // VALIDATE,gs://folder/file2.jsonl @@ -162,27 +167,61 @@ option ruby_package = "Google::Cloud::AutoML::V1beta1"; // with artificial line breaks, but the only actual line break is // denoted by \n).: // { -// "text_snippet": { -// "content": "dog car cat" -// } "annotations": [ -// { -// "display_name": "animal", -// "text_extraction": { -// "text_segment": {"start_offset": 0, "end_offset": 3} +// "document": { +// "document_text": {"content": "dog cat"}, +// "layout": [ +// { +// "text_segment": { +// "start_offset": 0, +// "end_offset": 3, +// }, +// "page_number": 1, +// "bounding_poly": { +// "normalized_vertices": [ +// {"x": 0.1, "y": 0.1}, +// {"x": 0.1, "y": 0.3}, +// {"x": 0.3, "y": 0.3}, +// {"x": 0.3, "y": 0.1}, +// ], +// }, +// "text_segment_type": TOKEN, +// }, +// { +// "text_segment": { +// "start_offset": 4, +// "end_offset": 7, +// }, +// "page_number": 1, +// "bounding_poly": { +// "normalized_vertices": [ +// {"x": 0.4, "y": 0.1}, +// {"x": 0.4, "y": 0.3}, +// {"x": 0.8, "y": 0.3}, +// {"x": 0.8, "y": 0.1}, +// ], +// }, +// "text_segment_type": TOKEN, // } -// }, +// +// ], +// "document_dimensions": { +// "width": 8.27, +// "height": 11.69, +// "unit": INCH, +// }, +// "page_count": 1, +// }, +// "annotations": [ // { -// "display_name": "vehicle", -// "text_extraction": { -// "text_segment": {"start_offset": 4, "end_offset": 7} -// } +// "display_name": "animal", +// "text_extraction": {"text_segment": {"start_offset": 0, +// "end_offset": 3}} // }, // { // "display_name": "animal", -// "text_extraction": { -// "text_segment": {"start_offset": 8, "end_offset": 11} -// } -// }, +// "text_extraction": {"text_segment": {"start_offset": 4, +// "end_offset": 7}} +// } // ], // }\n // { diff --git a/automl/google/cloud/automl_v1beta1/proto/io_pb2.py b/automl/google/cloud/automl_v1beta1/proto/io_pb2.py index 161b9d25ad62..62cd25fdd121 100644 --- 
a/automl/google/cloud/automl_v1beta1/proto/io_pb2.py +++ b/automl/google/cloud/automl_v1beta1/proto/io_pb2.py @@ -1029,20 +1029,23 @@ - For Text Extraction: CSV file(s) with each line in format: ML\_USE,GCS\_FILE\_PATH GCS\_FILE\_PATH leads to a .JSONL (that is, JSON Lines) file which either imports text in-line or as documents. - The in-line .JSONL file contains, per line, a proto that wraps a - TextSnippet proto (in json representation) followed by one or more - AnnotationPayload protos (called annotations), which have - display\_name and text\_extraction detail populated. The given text - is expected to be annotated exhaustively, for example, if you look - for animals and text contains "dolphin" that is not labeled, then - "dolphin" is assumed to not be an animal. Any given text snippet - content must have 30,000 characters or less, and also be UTF-8 NFC - encoded (ASCII already is). The document .JSONL file contains, per - line, a proto that wraps a Document proto with input\_config set. - Only PDF documents are supported now, and each document may be up to - 2MB large. Currently annotations on documents cannot be specified at - import. Any given .JSONL file must be 100MB or smaller. Three sample - CSV rows: TRAIN,gs://folder/file1.jsonl + Any given .JSONL file must be 100MB or smaller. The in-line .JSONL + file contains, per line, a proto that wraps a TextSnippet proto (in + json representation) followed by one or more AnnotationPayload protos + (called annotations), which have display\_name and text\_extraction + detail populated. The given text is expected to be annotated + exhaustively, for example, if you look for animals and text contains + "dolphin" that is not labeled, then "dolphin" is assumed to not be an + animal. Any given text snippet content must be 10KB or smaller, and + also be UTF-8 NFC encoded (ASCII already is). The document .JSONL + file contains, per line, a proto that wraps a Document proto. 
The + Document proto must have either document\_text or input\_config set. + In the document\_text case, the Document proto may also contain the + spatial information of the document, including layout, document + dimensions and page number. In the input\_config case, only PDF documents + are supported now, and each document may be up to 2MB in size. + Currently, annotations on documents cannot be specified at import. + Three sample CSV rows: TRAIN,gs://folder/file1.jsonl VALIDATE,gs://folder/file2.jsonl TEST,gs://folder/file3.jsonl - For Text Classification: CSV file(s) with each line in format: diff --git a/automl/google/cloud/automl_v1beta1/proto/prediction_service.proto b/automl/google/cloud/automl_v1beta1/proto/prediction_service.proto index 243849213e38..57f1b794e716 100644 --- a/automl/google/cloud/automl_v1beta1/proto/prediction_service.proto +++ b/automl/google/cloud/automl_v1beta1/proto/prediction_service.proto @@ -18,12 +18,12 @@ syntax = "proto3"; package google.cloud.automl.v1beta1; import "google/api/annotations.proto"; +import "google/api/client.proto"; import "google/cloud/automl/v1beta1/annotation_payload.proto"; import "google/cloud/automl/v1beta1/data_items.proto"; import "google/cloud/automl/v1beta1/io.proto"; import "google/cloud/automl/v1beta1/operations.proto"; import "google/longrunning/operations.proto"; -import "google/api/client.proto"; option go_package = "google.golang.org/genproto/googleapis/cloud/automl/v1beta1;automl"; option java_multiple_files = true; @@ -38,7 +38,8 @@ option ruby_package = "Google::Cloud::AutoML::V1beta1"; // snake_case or kebab-case, either of those cases is accepted. service PredictionService { option (google.api.default_host) = "automl.googleapis.com"; - option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform"; + option (google.api.oauth_scopes) = + "https://www.googleapis.com/auth/cloud-platform"; // Perform an online prediction. 
The prediction result will be directly // returned in the response. @@ -66,12 +67,14 @@ service PredictionService { }; } - // Perform a batch prediction. Unlike the online [Predict][google.cloud.automl.v1beta1.PredictionService.Predict], batch + // Perform a batch prediction. Unlike the online + // [Predict][google.cloud.automl.v1beta1.PredictionService.Predict], batch // prediction result won't be immediately available in the response. Instead, // a long running operation object is returned. User can poll the operation // result via [GetOperation][google.longrunning.Operations.GetOperation] - // method. Once the operation is done, [BatchPredictResult][google.cloud.automl.v1beta1.BatchPredictResult] is returned in - // the [response][google.longrunning.Operation.response] field. + // method. Once the operation is done, + // [BatchPredictResult][google.cloud.automl.v1beta1.BatchPredictResult] is + // returned in the [response][google.longrunning.Operation.response] field. // Available for following ML problems: // * Image Classification // * Image Object Detection @@ -86,7 +89,8 @@ service PredictionService { } } -// Request message for [PredictionService.Predict][google.cloud.automl.v1beta1.PredictionService.Predict]. +// Request message for +// [PredictionService.Predict][google.cloud.automl.v1beta1.PredictionService.Predict]. message PredictRequest { // Name of the model requested to serve the prediction. string name = 1; @@ -122,12 +126,20 @@ message PredictRequest { map params = 3; } -// Response message for [PredictionService.Predict][google.cloud.automl.v1beta1.PredictionService.Predict]. +// Response message for +// [PredictionService.Predict][google.cloud.automl.v1beta1.PredictionService.Predict]. message PredictResponse { // Prediction result. // Translation and Text Sentiment will return precisely one payload. repeated AnnotationPayload payload = 1; + // The preprocessed example that AutoML actually makes predictions on. 
+ // Empty if AutoML does not preprocess the input example. + // * For Text Extraction: + // If the input is a .pdf file, the OCR'ed text will be provided in + // [document_text][google.cloud.automl.v1beta1.Document.document_text]. + ExamplePayload preprocessed_input = 3; + // Additional domain-specific prediction response metadata. // // * For Image Object Detection: @@ -146,7 +158,8 @@ message PredictResponse { map metadata = 2; } -// Request message for [PredictionService.BatchPredict][google.cloud.automl.v1beta1.PredictionService.BatchPredict]. +// Request message for +// [PredictionService.BatchPredict][google.cloud.automl.v1beta1.PredictionService.BatchPredict]. message BatchPredictRequest { // Name of the model requested to serve the batch prediction. string name = 1; @@ -226,7 +239,8 @@ message BatchPredictRequest { // Result of the Batch Predict. This message is returned in // [response][google.longrunning.Operation.response] of the operation returned -// by the [PredictionService.BatchPredict][google.cloud.automl.v1beta1.PredictionService.BatchPredict]. +// by the +// [PredictionService.BatchPredict][google.cloud.automl.v1beta1.PredictionService.BatchPredict]. message BatchPredictResult { // Additional domain-specific prediction response metadata. 
// diff --git a/automl/google/cloud/automl_v1beta1/proto/prediction_service_pb2.py b/automl/google/cloud/automl_v1beta1/proto/prediction_service_pb2.py index 589a74ba7fe4..751f16ef8f5b 100644 --- a/automl/google/cloud/automl_v1beta1/proto/prediction_service_pb2.py +++ b/automl/google/cloud/automl_v1beta1/proto/prediction_service_pb2.py @@ -16,6 +16,7 @@ from google.api import annotations_pb2 as google_dot_api_dot_annotations__pb2 +from google.api import client_pb2 as google_dot_api_dot_client__pb2 from google.cloud.automl_v1beta1.proto import ( annotation_payload_pb2 as google_dot_cloud_dot_automl__v1beta1_dot_proto_dot_annotation__payload__pb2, ) @@ -31,7 +32,6 @@ from google.longrunning import ( operations_pb2 as google_dot_longrunning_dot_operations__pb2, ) -from google.api import client_pb2 as google_dot_api_dot_client__pb2 DESCRIPTOR = _descriptor.FileDescriptor( @@ -42,16 +42,16 @@ "\n\037com.google.cloud.automl.v1beta1B\026PredictionServiceProtoP\001ZAgoogle.golang.org/genproto/googleapis/cloud/automl/v1beta1;automl\312\002\033Google\\Cloud\\AutoMl\\V1beta1\352\002\036Google::Cloud::AutoML::V1beta1" ), serialized_pb=_b( - '\n:google/cloud/automl_v1beta1/proto/prediction_service.proto\x12\x1bgoogle.cloud.automl.v1beta1\x1a\x1cgoogle/api/annotations.proto\x1a:google/cloud/automl_v1beta1/proto/annotation_payload.proto\x1a\x32google/cloud/automl_v1beta1/proto/data_items.proto\x1a*google/cloud/automl_v1beta1/proto/io.proto\x1a\x32google/cloud/automl_v1beta1/proto/operations.proto\x1a#google/longrunning/operations.proto\x1a\x17google/api/client.proto"\xd4\x01\n\x0ePredictRequest\x12\x0c\n\x04name\x18\x01 \x01(\t\x12<\n\x07payload\x18\x02 \x01(\x0b\x32+.google.cloud.automl.v1beta1.ExamplePayload\x12G\n\x06params\x18\x03 \x03(\x0b\x32\x37.google.cloud.automl.v1beta1.PredictRequest.ParamsEntry\x1a-\n\x0bParamsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01"\xd1\x01\n\x0fPredictResponse\x12?\n\x07payload\x18\x01 
\x03(\x0b\x32..google.cloud.automl.v1beta1.AnnotationPayload\x12L\n\x08metadata\x18\x02 \x03(\x0b\x32:.google.cloud.automl.v1beta1.PredictResponse.MetadataEntry\x1a/\n\rMetadataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01"\xba\x02\n\x13\x42\x61tchPredictRequest\x12\x0c\n\x04name\x18\x01 \x01(\t\x12J\n\x0cinput_config\x18\x03 \x01(\x0b\x32\x34.google.cloud.automl.v1beta1.BatchPredictInputConfig\x12L\n\routput_config\x18\x04 \x01(\x0b\x32\x35.google.cloud.automl.v1beta1.BatchPredictOutputConfig\x12L\n\x06params\x18\x05 \x03(\x0b\x32<.google.cloud.automl.v1beta1.BatchPredictRequest.ParamsEntry\x1a-\n\x0bParamsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01"\x96\x01\n\x12\x42\x61tchPredictResult\x12O\n\x08metadata\x18\x01 \x03(\x0b\x32=.google.cloud.automl.v1beta1.BatchPredictResult.MetadataEntry\x1a/\n\rMetadataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x32\xb4\x03\n\x11PredictionService\x12\xa8\x01\n\x07Predict\x12+.google.cloud.automl.v1beta1.PredictRequest\x1a,.google.cloud.automl.v1beta1.PredictResponse"B\x82\xd3\xe4\x93\x02<"7/v1beta1/{name=projects/*/locations/*/models/*}:predict:\x01*\x12\xa8\x01\n\x0c\x42\x61tchPredict\x12\x30.google.cloud.automl.v1beta1.BatchPredictRequest\x1a\x1d.google.longrunning.Operation"G\x82\xd3\xe4\x93\x02\x41"", + flags=re.DOTALL ) # Replace docstring with no summary line