samples: migrate v1beta2 doc AI samples (#79)

* samples: migrate v1beta2 doc AI samples * added noxfile * reformatted code * organized imports in right order * lint * finally fixed lint * reorganized folders * imports * added from prefix imports * renamed files * renamed package on tests files * nit
GoogleCloudPlatform · Jan 12, 2021 · deeba8e · deeba8e
1 parent 984627e
commit deeba8e
Show file tree

Hide file tree

Showing 15 changed files with 825 additions and 18 deletions.
diff --git a/document_ai/snippets/batch_parse_form_v1beta2.py b/document_ai/snippets/batch_parse_form_v1beta2.py
@@ -0,0 +1,99 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# [START documentai_batch_parse_form_beta]
+import re
+
+from google.cloud import documentai_v1beta2 as documentai
+from google.cloud import storage
+
+
+def batch_parse_form(
+    project_id="YOUR_PROJECT_ID",
+    input_uri="gs://cloud-samples-data/documentai/form.pdf",
+    destination_uri="gs://your-bucket-id/path/to/save/results/",
+):
+    """Batch-parse a form stored in GCS and print the output file names."""
+
+    client = documentai.DocumentUnderstandingServiceClient()
+
+    gcs_source = documentai.types.GcsSource(uri=input_uri)
+
+    # mime_type can be application/pdf, image/tiff,
+    # image/gif, or application/json
+    input_config = documentai.types.InputConfig(
+        gcs_source=gcs_source, mime_type="application/pdf"
+    )
+
+    # Where to write results
+    output_config = documentai.types.OutputConfig(
+        gcs_destination=documentai.types.GcsDestination(uri=destination_uri),
+        pages_per_shard=1,  # Map one doc page to one output page
+    )
+
+    # Improve form parsing results by providing key-value pair hints.
+    # For each key hint, key is text that is likely to appear in the
+    # document as a form field name (e.g. "DOB").
+    # Value types are optional, but can be one or more of:
+    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
+    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
+    key_value_pair_hints = [
+        documentai.types.KeyValuePairHint(
+            key="Emergency Contact", value_types=["NAME"]
+        ),
+        documentai.types.KeyValuePairHint(key="Referred By"),
+    ]
+
+    # Setting enabled=True enables form extraction
+    form_extraction_params = documentai.types.FormExtractionParams(
+        enabled=True, key_value_pair_hints=key_value_pair_hints
+    )
+
+    # Location can be 'us' or 'eu'
+    parent = "projects/{}/locations/us".format(project_id)
+    request = documentai.types.ProcessDocumentRequest(
+        input_config=input_config,
+        output_config=output_config,
+        form_extraction_params=form_extraction_params,
+    )
+
+    # Add each ProcessDocumentRequest to the batch request
+    requests = []
+    requests.append(request)
+
+    batch_request = documentai.types.BatchProcessDocumentsRequest(
+        parent=parent, requests=requests
+    )
+
+    operation = client.batch_process_documents(batch_request)
+
+    # Wait for the operation to finish
+    operation.result()
+
+    # Results are written to GCS. Use a regex to split the destination
+    # URI into the bucket name and the object prefix of the output files
+    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
+    output_bucket = match.group(1)
+    prefix = match.group(2)
+
+    storage_client = storage.client.Client()
+    bucket = storage_client.get_bucket(output_bucket)
+    blob_list = list(bucket.list_blobs(prefix=prefix))
+    print("Output files:")
+    for blob in blob_list:
+        print(blob.name)
+
+
+# [END documentai_batch_parse_form_beta]
diff --git a/document_ai/snippets/batch_parse_form_v1beta2_test.py b/document_ai/snippets/batch_parse_form_v1beta2_test.py
@@ -0,0 +1,46 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific ladnguage governing permissions and
+# limitations under the License.
+
+import os
+import uuid
+
+from google.cloud import storage
+
+import pytest
+
+from samples.snippets import batch_parse_form_v1beta2
+
+
+BUCKET = "document-ai-{}".format(uuid.uuid4())
+OUTPUT_PREFIX = "TEST_OUTPUT_{}".format(uuid.uuid4())
+PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
+INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf"
+BATCH_OUTPUT_URI = "gs://{}/{}/".format(BUCKET, OUTPUT_PREFIX)
+
+
+@pytest.fixture(autouse=True)
+def setup_teardown():
+    """Create a temporary bucket for the sample output; delete it afterwards."""
+    storage_client = storage.Client()
+    bucket = storage_client.create_bucket(BUCKET)
+
+    yield
+
+    bucket.delete(force=True)
+
+
+def test_batch_parse_form(capsys):
+    batch_parse_form_v1beta2.batch_parse_form(PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI)
+    out, _ = capsys.readouterr()  # stdout captured while the sample ran
+    assert "Output files" in out  # header printed by the sample
diff --git a/document_ai/snippets/batch_parse_table_v1beta2.py b/document_ai/snippets/batch_parse_table_v1beta2.py
@@ -0,0 +1,107 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# [START documentai_batch_parse_table_beta]
+import re
+
+from google.cloud import documentai_v1beta2 as documentai
+from google.cloud import storage
+
+
+def batch_parse_table(
+    project_id="YOUR_PROJECT_ID",
+    input_uri="gs://cloud-samples-data/documentai/form.pdf",
+    destination_uri="gs://your-bucket-id/path/to/save/results/",
+):
+    """Batch-parse tables in a GCS document and print the output file names."""
+
+    client = documentai.DocumentUnderstandingServiceClient()
+
+    gcs_source = documentai.types.GcsSource(uri=input_uri)
+
+    # mime_type can be application/pdf, image/tiff,
+    # image/gif, or application/json
+    input_config = documentai.types.InputConfig(
+        gcs_source=gcs_source, mime_type="application/pdf"
+    )
+
+    # Where to write results
+    output_config = documentai.types.OutputConfig(
+        gcs_destination=documentai.types.GcsDestination(uri=destination_uri),
+        pages_per_shard=1,  # Map one doc page to one output page
+    )
+
+    # Improve table parsing results by providing bounding boxes
+    # specifying where the table appears in the document (optional)
+    table_bound_hints = [
+        documentai.types.TableBoundHint(
+            page_number=1,
+            bounding_box=documentai.types.BoundingPoly(
+                # Define a polygon around tables to detect
+                # Each vertex coordinate must be a number between 0 and 1
+                normalized_vertices=[
+                    # Top left
+                    documentai.types.geometry.NormalizedVertex(x=0, y=0),
+                    # Top right
+                    documentai.types.geometry.NormalizedVertex(x=1, y=0),
+                    # Bottom right
+                    documentai.types.geometry.NormalizedVertex(x=1, y=1),
+                    # Bottom left
+                    documentai.types.geometry.NormalizedVertex(x=0, y=1),
+                ]
+            ),
+        )
+    ]
+
+    # Setting enabled=True enables table extraction
+    table_extraction_params = documentai.types.TableExtractionParams(
+        enabled=True, table_bound_hints=table_bound_hints
+    )
+
+    # Location can be 'us' or 'eu'
+    parent = "projects/{}/locations/us".format(project_id)
+    request = documentai.types.ProcessDocumentRequest(
+        input_config=input_config,
+        output_config=output_config,
+        table_extraction_params=table_extraction_params,
+    )
+
+    requests = []
+    requests.append(request)
+
+    batch_request = documentai.types.BatchProcessDocumentsRequest(
+        parent=parent, requests=requests
+    )
+
+    operation = client.batch_process_documents(batch_request)
+
+    # Wait for the operation to finish
+    operation.result()
+
+    # Results are written to GCS. Use a regex to split the destination
+    # URI into the bucket name and the object prefix of the output files
+    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
+    output_bucket = match.group(1)
+    prefix = match.group(2)
+
+    storage_client = storage.client.Client()
+    bucket = storage_client.get_bucket(output_bucket)
+    blob_list = list(bucket.list_blobs(prefix=prefix))
+    print("Output files:")
+    for blob in blob_list:
+        print(blob.name)
+
+
+# [END documentai_batch_parse_table_beta]
diff --git a/document_ai/snippets/batch_parse_table_v1beta2_test.py b/document_ai/snippets/batch_parse_table_v1beta2_test.py
@@ -0,0 +1,46 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific ladnguage governing permissions and
+# limitations under the License.
+
+import os
+import uuid
+
+from google.cloud import storage
+
+import pytest
+
+from samples.snippets import batch_parse_table_v1beta2
+
+
+BUCKET = "document-ai-{}".format(uuid.uuid4())
+OUTPUT_PREFIX = "TEST_OUTPUT_{}".format(uuid.uuid4())
+PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
+INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf"
+BATCH_OUTPUT_URI = "gs://{}/{}/".format(BUCKET, OUTPUT_PREFIX)
+
+
+@pytest.fixture(autouse=True)
+def setup_teardown():
+    """Create a temporary bucket for the sample output; delete it afterwards."""
+    storage_client = storage.Client()
+    bucket = storage_client.create_bucket(BUCKET)
+
+    yield
+
+    bucket.delete(force=True)
+
+
+def test_batch_parse_table(capsys):
+    batch_parse_table_v1beta2.batch_parse_table(PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI)
+    out, _ = capsys.readouterr()  # stdout captured while the sample ran
+    assert "Output files:" in out  # header printed by the sample