diff --git a/document_ai/snippets/batch_parse_form_v1beta2.py b/document_ai/snippets/batch_parse_form_v1beta2.py new file mode 100644 index 000000000000..01c19e1e716c --- /dev/null +++ b/document_ai/snippets/batch_parse_form_v1beta2.py @@ -0,0 +1,99 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# [START documentai_batch_parse_form_beta] +import re + +from google.cloud import documentai_v1beta2 as documentai +from google.cloud import storage + + +def batch_parse_form( + project_id="YOUR_PROJECT_ID", + input_uri="gs://cloud-samples-data/documentai/form.pdf", + destination_uri="gs://your-bucket-id/path/to/save/results/", +): + """Parse a form""" + + client = documentai.DocumentUnderstandingServiceClient() + + gcs_source = documentai.types.GcsSource(uri=input_uri) + + # mime_type can be application/pdf, image/tiff, + # and image/gif, or application/json + input_config = documentai.types.InputConfig( + gcs_source=gcs_source, mime_type="application/pdf" + ) + + # where to write results + output_config = documentai.types.OutputConfig( + gcs_destination=documentai.types.GcsDestination(uri=destination_uri), + pages_per_shard=1, # Map one doc page to one output page + ) + + # Improve form parsing results by providing key-value pair hints. + # For each key hint, key is text that is likely to appear in the + # document as a form field name (e.g. "DOB").
+ # Value types are optional, but can be one or more of: + # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID, + # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME + key_value_pair_hints = [ + documentai.types.KeyValuePairHint( + key="Emergency Contact", value_types=["NAME"] + ), + documentai.types.KeyValuePairHint(key="Referred By"), + ] + + # Setting enabled=True enables form extraction + form_extraction_params = documentai.types.FormExtractionParams( + enabled=True, key_value_pair_hints=key_value_pair_hints + ) + + # Location can be 'us' or 'eu' + parent = "projects/{}/locations/us".format(project_id) + request = documentai.types.ProcessDocumentRequest( + input_config=input_config, + output_config=output_config, + form_extraction_params=form_extraction_params, + ) + + # Add each ProcessDocumentRequest to the batch request + requests = [] + requests.append(request) + + batch_request = documentai.types.BatchProcessDocumentsRequest( + parent=parent, requests=requests + ) + + operation = client.batch_process_documents(batch_request) + + # Wait for the operation to finish + operation.result() + + # Results are written to GCS. Use a regex to find + # output files + match = re.match(r"gs://([^/]+)/(.+)", destination_uri) + output_bucket = match.group(1) + prefix = match.group(2) + + storage_client = storage.client.Client() + bucket = storage_client.get_bucket(output_bucket) + blob_list = list(bucket.list_blobs(prefix=prefix)) + print("Output files:") + for blob in blob_list: + print(blob.name) + + +# [END documentai_batch_parse_form_beta] diff --git a/document_ai/snippets/batch_parse_form_v1beta2_test.py b/document_ai/snippets/batch_parse_form_v1beta2_test.py new file mode 100644 index 000000000000..50dc845d4ea6 --- /dev/null +++ b/document_ai/snippets/batch_parse_form_v1beta2_test.py @@ -0,0 +1,46 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid + +from google.cloud import storage + +import pytest + +from samples.snippets import batch_parse_form_v1beta2 + + +BUCKET = "document-ai-{}".format(uuid.uuid4()) +OUTPUT_PREFIX = "TEST_OUTPUT_{}".format(uuid.uuid4()) +PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"] +INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf" +BATCH_OUTPUT_URI = "gs://{}/{}/".format(BUCKET, OUTPUT_PREFIX) + + +@pytest.fixture(autouse=True) +def setup_teardown(): + """Create a temporary bucket to store annotation output.""" + storage_client = storage.Client() + bucket = storage_client.create_bucket(BUCKET) + + yield + + bucket.delete(force=True) + + +def test_batch_parse_form(capsys): + batch_parse_form_v1beta2.batch_parse_form(PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI) + out, _ = capsys.readouterr() + assert "Output files" in out diff --git a/document_ai/snippets/batch_parse_table_v1beta2.py b/document_ai/snippets/batch_parse_table_v1beta2.py new file mode 100644 index 000000000000..08942080684a --- /dev/null +++ b/document_ai/snippets/batch_parse_table_v1beta2.py @@ -0,0 +1,107 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# [START documentai_batch_parse_table_beta] +import re + +from google.cloud import documentai_v1beta2 as documentai +from google.cloud import storage + + +def batch_parse_table( + project_id="YOUR_PROJECT_ID", + input_uri="gs://cloud-samples-data/documentai/form.pdf", + destination_uri="gs://your-bucket-id/path/to/save/results/", +): + """Parse a table""" + + client = documentai.DocumentUnderstandingServiceClient() + + gcs_source = documentai.types.GcsSource(uri=input_uri) + + # mime_type can be application/pdf, image/tiff, + # and image/gif, or application/json + input_config = documentai.types.InputConfig( + gcs_source=gcs_source, mime_type="application/pdf" + ) + + # where to write results + output_config = documentai.types.OutputConfig( + gcs_destination=documentai.types.GcsDestination(uri=destination_uri), + pages_per_shard=1, # Map one doc page to one output page + ) + + # Improve table parsing results by providing bounding boxes + # specifying where the box appears in the document (optional) + table_bound_hints = [ + documentai.types.TableBoundHint( + page_number=1, + bounding_box=documentai.types.BoundingPoly( + # Define a polygon around tables to detect + # Each vertex coordinate must be a number between 0 and 1 + normalized_vertices=[ + # Top left + documentai.types.geometry.NormalizedVertex(x=0, y=0), + # Top right + documentai.types.geometry.NormalizedVertex(x=1, y=0), + # Bottom right + documentai.types.geometry.NormalizedVertex(x=1, y=1), + # Bottom left + documentai.types.geometry.NormalizedVertex(x=0, y=1), + ] + ), + ) + ] + + # 
Setting enabled=True enables table extraction + table_extraction_params = documentai.types.TableExtractionParams( + enabled=True, table_bound_hints=table_bound_hints + ) + + # Location can be 'us' or 'eu' + parent = "projects/{}/locations/us".format(project_id) + request = documentai.types.ProcessDocumentRequest( + input_config=input_config, + output_config=output_config, + table_extraction_params=table_extraction_params, + ) + + requests = [] + requests.append(request) + + batch_request = documentai.types.BatchProcessDocumentsRequest( + parent=parent, requests=requests + ) + + operation = client.batch_process_documents(batch_request) + + # Wait for the operation to finish + operation.result() + + # Results are written to GCS. Use a regex to find + # output files + match = re.match(r"gs://([^/]+)/(.+)", destination_uri) + output_bucket = match.group(1) + prefix = match.group(2) + + storage_client = storage.client.Client() + bucket = storage_client.get_bucket(output_bucket) + blob_list = list(bucket.list_blobs(prefix=prefix)) + print("Output files:") + for blob in blob_list: + print(blob.name) + + +# [END documentai_batch_parse_table_beta] diff --git a/document_ai/snippets/batch_parse_table_v1beta2_test.py b/document_ai/snippets/batch_parse_table_v1beta2_test.py new file mode 100644 index 000000000000..ed1be2ee070f --- /dev/null +++ b/document_ai/snippets/batch_parse_table_v1beta2_test.py @@ -0,0 +1,46 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid + +from google.cloud import storage + +import pytest + +from samples.snippets import batch_parse_table_v1beta2 + + +BUCKET = "document-ai-{}".format(uuid.uuid4()) +OUTPUT_PREFIX = "TEST_OUTPUT_{}".format(uuid.uuid4()) +PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"] +INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf" +BATCH_OUTPUT_URI = "gs://{}/{}/".format(BUCKET, OUTPUT_PREFIX) + + +@pytest.fixture(autouse=True) +def setup_teardown(): + """Create a temporary bucket to store annotation output.""" + storage_client = storage.Client() + bucket = storage_client.create_bucket(BUCKET) + + yield + + bucket.delete(force=True) + + +def test_batch_parse_table(capsys): + batch_parse_table_v1beta2.batch_parse_table(PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI) + out, _ = capsys.readouterr() + assert "Output files:" in out diff --git a/document_ai/snippets/noxfile.py b/document_ai/snippets/noxfile.py index bca0522ec4d9..bbd25fcdb5e7 100644 --- a/document_ai/snippets/noxfile.py +++ b/document_ai/snippets/noxfile.py @@ -38,28 +38,25 @@ TEST_CONFIG = { # You can opt out from the test for specific Python versions. - 'ignored_versions': ["2.7"], - + "ignored_versions": ["2.7"], # Old samples are opted out of enforcing Python type hints # All new samples should feature them - 'enforce_type_hints': False, - + "enforce_type_hints": False, # An envvar key for determining the project id to use. Change it # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a # build specific Cloud project. You can also use your own string # to use your own Cloud project. - 'gcloud_project_env': 'GOOGLE_CLOUD_PROJECT', + "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', - # A dictionary you want to inject into your test. Don't put any # secrets here. These values will override predefined values.
- 'envs': {}, + "envs": {}, } try: # Ensure we can import noxfile_config in the project's directory. - sys.path.append('.') + sys.path.append(".") from noxfile_config import TEST_CONFIG_OVERRIDE except ImportError as e: print("No user noxfile_config found: detail: {}".format(e)) @@ -74,12 +71,12 @@ def get_pytest_env_vars() -> Dict[str, str]: ret = {} # Override the GCLOUD_PROJECT and the alias. - env_key = TEST_CONFIG['gcloud_project_env'] + env_key = TEST_CONFIG["gcloud_project_env"] # This should error out if not set. - ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key] + ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key] # Apply user supplied envs. - ret.update(TEST_CONFIG['envs']) + ret.update(TEST_CONFIG["envs"]) return ret @@ -88,7 +85,7 @@ def get_pytest_env_vars() -> Dict[str, str]: ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"] # Any default versions that should be ignored. -IGNORED_VERSIONS = TEST_CONFIG['ignored_versions'] +IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) @@ -137,7 +134,7 @@ def _determine_local_import_names(start_dir: str) -> List[str]: @nox.session def lint(session: nox.sessions.Session) -> None: - if not TEST_CONFIG['enforce_type_hints']: + if not TEST_CONFIG["enforce_type_hints"]: session.install("flake8", "flake8-import-order") else: session.install("flake8", "flake8-import-order", "flake8-annotations") @@ -146,9 +143,11 @@ def lint(session: nox.sessions.Session) -> None: args = FLAKE8_COMMON_ARGS + [ "--application-import-names", ",".join(local_names), - "." 
+ ".", ] session.run("flake8", *args) + + # # Black # @@ -161,6 +160,7 @@ def blacken(session: nox.sessions.Session) -> None: session.run("black", *python_files) + # # Sample Tests # @@ -169,7 +169,9 @@ def blacken(session: nox.sessions.Session) -> None: PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] -def _session_tests(session: nox.sessions.Session, post_install: Callable = None) -> None: +def _session_tests( + session: nox.sessions.Session, post_install: Callable = None +) -> None: """Runs py.test for a particular project.""" if os.path.exists("requirements.txt"): session.install("-r", "requirements.txt") @@ -200,9 +202,9 @@ def py(session: nox.sessions.Session) -> None: if session.python in TESTED_VERSIONS: _session_tests(session) else: - session.skip("SKIPPED: {} tests are disabled for this sample.".format( - session.python - )) + session.skip( + "SKIPPED: {} tests are disabled for this sample.".format(session.python) + ) # diff --git a/document_ai/snippets/parse_form_v1beta2.py b/document_ai/snippets/parse_form_v1beta2.py new file mode 100644 index 000000000000..27c99811cbbc --- /dev/null +++ b/document_ai/snippets/parse_form_v1beta2.py @@ -0,0 +1,92 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +# [START documentai_parse_form_beta] +from google.cloud import documentai_v1beta2 as documentai + + +def parse_form( + project_id="YOUR_PROJECT_ID", + input_uri="gs://cloud-samples-data/documentai/form.pdf", +): + """Parse a form""" + + client = documentai.DocumentUnderstandingServiceClient() + + gcs_source = documentai.types.GcsSource(uri=input_uri) + + # mime_type can be application/pdf, image/tiff, + # and image/gif, or application/json + input_config = documentai.types.InputConfig( + gcs_source=gcs_source, mime_type="application/pdf" + ) + + # Improve form parsing results by providing key-value pair hints. + # For each key hint, key is text that is likely to appear in the + # document as a form field name (e.g. "DOB"). + # Value types are optional, but can be one or more of: + # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID, + # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME + key_value_pair_hints = [ + documentai.types.KeyValuePairHint( + key="Emergency Contact", value_types=["NAME"] + ), + documentai.types.KeyValuePairHint(key="Referred By"), + ] + + # Setting enabled=True enables form extraction + form_extraction_params = documentai.types.FormExtractionParams( + enabled=True, key_value_pair_hints=key_value_pair_hints + ) + + # Location can be 'us' or 'eu' + parent = "projects/{}/locations/us".format(project_id) + request = documentai.types.ProcessDocumentRequest( + parent=parent, + input_config=input_config, + form_extraction_params=form_extraction_params, + ) + + document = client.process_document(request=request) + + def _get_text(el): + """Doc AI identifies form fields by their offsets + in document text. This function converts offsets + to text snippets. + """ + response = "" + # If a text segment spans several lines, it will + # be stored in different text segments.
+ for segment in el.text_anchor.text_segments: + start_index = segment.start_index + end_index = segment.end_index + response += document.text[start_index:end_index] + return response + + for page in document.pages: + print("Page number: {}".format(page.page_number)) + for form_field in page.form_fields: + print( + "Field Name: {}\tConfidence: {}".format( + _get_text(form_field.field_name), form_field.field_name.confidence + ) + ) + print( + "Field Value: {}\tConfidence: {}".format( + _get_text(form_field.field_value), form_field.field_value.confidence + ) + ) + + +# [END documentai_parse_form_beta] diff --git a/document_ai/snippets/parse_form_v1beta2_test.py b/document_ai/snippets/parse_form_v1beta2_test.py new file mode 100644 index 000000000000..6987612aef9f --- /dev/null +++ b/document_ai/snippets/parse_form_v1beta2_test.py @@ -0,0 +1,28 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import os + +from samples.snippets import parse_form_v1beta2 + + +PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"] +INPUT_URI = "gs://cloud-samples-data/documentai/form.pdf" + + +def test_parse_form(capsys): + parse_form_v1beta2.parse_form(PROJECT_ID, INPUT_URI) + out, _ = capsys.readouterr() + assert "Field Name" in out + assert "Field Value" in out diff --git a/document_ai/snippets/parse_table_v1beta2.py b/document_ai/snippets/parse_table_v1beta2.py new file mode 100644 index 000000000000..ac8f5d11dd41 --- /dev/null +++ b/document_ai/snippets/parse_table_v1beta2.py @@ -0,0 +1,95 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START documentai_parse_table_beta] +from google.cloud import documentai_v1beta2 as documentai + + +def parse_table( + project_id="YOUR_PROJECT_ID", + input_uri="gs://cloud-samples-data/documentai/invoice.pdf", +): + """Parse a table""" + + client = documentai.DocumentUnderstandingServiceClient() + + gcs_source = documentai.types.GcsSource(uri=input_uri) + + # mime_type can be application/pdf, image/tiff, + # and image/gif, or application/json + input_config = documentai.types.InputConfig( + gcs_source=gcs_source, mime_type="application/pdf" + ) + + # Improve table parsing results by providing bounding boxes + # specifying where the box appears in the document (optional) + table_bound_hints = [ + documentai.types.TableBoundHint( + page_number=1, + bounding_box=documentai.types.BoundingPoly( + # Define a polygon around tables to detect + # Each vertex coordinate must be a number between 0 and 1 + normalized_vertices=[ + # Top left + documentai.types.geometry.NormalizedVertex(x=0, y=0), + # Top right + documentai.types.geometry.NormalizedVertex(x=1, y=0), + # Bottom right + documentai.types.geometry.NormalizedVertex(x=1, y=1), + # Bottom left + documentai.types.geometry.NormalizedVertex(x=0, y=1), + ] + ), + ) + ] + + # Setting enabled=True enables table extraction + table_extraction_params = documentai.types.TableExtractionParams( + enabled=True, table_bound_hints=table_bound_hints + ) + + # Location can be 'us' or 'eu' + parent = "projects/{}/locations/us".format(project_id) + request = documentai.types.ProcessDocumentRequest( + parent=parent, + input_config=input_config, + table_extraction_params=table_extraction_params, + ) + + document = client.process_document(request=request) + + def _get_text(el): + """Convert text offset indexes into text snippets.""" + response = "" + # If a text segment spans several lines, it will + # be stored in different text segments.
+ for segment in el.text_anchor.text_segments: + start_index = segment.start_index + end_index = segment.end_index + response += document.text[start_index:end_index] + return response + + for page in document.pages: + print("Page number: {}".format(page.page_number)) + for table_num, table in enumerate(page.tables): + print("Table {}: ".format(table_num)) + for row_num, row in enumerate(table.header_rows): + cells = "\t".join([_get_text(cell.layout) for cell in row.cells]) + print("Header Row {}: {}".format(row_num, cells)) + for row_num, row in enumerate(table.body_rows): + cells = "\t".join([_get_text(cell.layout) for cell in row.cells]) + print("Row {}: {}".format(row_num, cells)) + + +# [END documentai_parse_table_beta] diff --git a/document_ai/snippets/parse_table_v1beta2_test.py b/document_ai/snippets/parse_table_v1beta2_test.py new file mode 100644 index 000000000000..4102c926b487 --- /dev/null +++ b/document_ai/snippets/parse_table_v1beta2_test.py @@ -0,0 +1,28 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import os + +from samples.snippets import parse_table_v1beta2 + + +PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"] +INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf" + + +def test_parse_table(capsys): + parse_table_v1beta2.parse_table(PROJECT_ID, INPUT_URI) + out, _ = capsys.readouterr() + assert "Table" in out + assert "Header Row" in out diff --git a/document_ai/snippets/parse_with_model_v1beta2.py b/document_ai/snippets/parse_with_model_v1beta2.py new file mode 100644 index 000000000000..59265c4f2038 --- /dev/null +++ b/document_ai/snippets/parse_with_model_v1beta2.py @@ -0,0 +1,60 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# [START documentai_parse_with_model_beta] +from google.cloud import documentai_v1beta2 as documentai + + +def parse_with_model( + project_id="YOUR_PROJECT_ID", + input_uri="gs://cloud-samples-data/documentai/invoice.pdf", + automl_model_name="YOUR_AUTOML_MODEL_NAME", +): + """Process a single document with the Document AI API. + + Args: + project_id: your Google Cloud project id + input_uri: the Cloud Storage URI of your input PDF + automl_model_name: the AutoML model name formatted as: + `projects/[PROJECT_ID]/locations/[LOCATION]/models/[MODEL_ID]` + where LOCATION is a Compute Engine region, e.g.
`us-central1` + """ + + client = documentai.DocumentUnderstandingServiceClient() + + gcs_source = documentai.types.GcsSource(uri=input_uri) + + # mime_type can be application/pdf, image/tiff, + # and image/gif, or application/json + input_config = documentai.types.InputConfig( + gcs_source=gcs_source, mime_type="application/pdf" + ) + + automl_params = documentai.types.AutoMlParams(model=automl_model_name) + + # Location can be 'us' or 'eu' + parent = "projects/{}/locations/us".format(project_id) + request = documentai.types.ProcessDocumentRequest( + parent=parent, input_config=input_config, automl_params=automl_params + ) + + document = client.process_document(request=request) + + for label in document.labels: + print("Label detected: {}".format(label.name)) + print("Confidence: {}".format(label.confidence)) + + +# [END documentai_parse_with_model_beta] diff --git a/document_ai/snippets/parse_with_model_v1beta2_test.py b/document_ai/snippets/parse_with_model_v1beta2_test.py new file mode 100644 index 000000000000..4b5d3ca52b31 --- /dev/null +++ b/document_ai/snippets/parse_with_model_v1beta2_test.py @@ -0,0 +1,36 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import os + +from samples.snippets import parse_with_model_v1beta2 + + +PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"] +INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf" +AUTOML_NL_MODEL_ID = "TCN3472481026502981088" + +if "AUTOML_NL_MODEL_ID" in os.environ: + AUTOML_NL_MODEL_ID = os.environ["AUTOML_NL_MODEL_ID"] + +MODEL_NAME = "projects/{}/locations/us-central1/models/{}".format( + PROJECT_ID, AUTOML_NL_MODEL_ID +) + + +def test_parse_with_model(capsys): + parse_with_model_v1beta2.parse_with_model(PROJECT_ID, INPUT_URI, MODEL_NAME) + out, _ = capsys.readouterr() + assert "Label detected" in out + assert "Confidence" in out diff --git a/document_ai/snippets/quickstart_v1beta2.py b/document_ai/snippets/quickstart_v1beta2.py new file mode 100644 index 000000000000..34f588201164 --- /dev/null +++ b/document_ai/snippets/quickstart_v1beta2.py @@ -0,0 +1,65 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# [START documentai_quickstart_beta] +from google.cloud import documentai_v1beta2 as documentai + + +def main( + project_id="YOUR_PROJECT_ID", + input_uri="gs://cloud-samples-data/documentai/invoice.pdf", +): + """Process a single document with the Document AI API, including + text extraction and entity extraction.""" + + client = documentai.DocumentUnderstandingServiceClient() + + gcs_source = documentai.types.GcsSource(uri=input_uri) + + # mime_type can be application/pdf, image/tiff, + # and image/gif, or application/json + input_config = documentai.types.InputConfig( + gcs_source=gcs_source, mime_type="application/pdf" + ) + + # Location can be 'us' or 'eu' + parent = "projects/{}/locations/us".format(project_id) + request = documentai.types.ProcessDocumentRequest( + parent=parent, input_config=input_config + ) + + document = client.process_document(request=request) + + # All text extracted from the document + print("Document Text: {}".format(document.text)) + + def _get_text(el): + """Convert text offset indexes into text snippets.""" + response = "" + # If a text segment spans several lines, it will + # be stored in different text segments. + for segment in el.text_anchor.text_segments: + start_index = segment.start_index + end_index = segment.end_index + response += document.text[start_index:end_index] + return response + + for entity in document.entities: + print("Entity type: {}".format(entity.type_)) + print("Text: {}".format(_get_text(entity))) + print("Mention text: {}\n".format(entity.mention_text)) + + +# [END documentai_quickstart_beta] diff --git a/document_ai/snippets/quickstart_v1beta2_test.py b/document_ai/snippets/quickstart_v1beta2_test.py new file mode 100644 index 000000000000..1868788d7cea --- /dev/null +++ b/document_ai/snippets/quickstart_v1beta2_test.py @@ -0,0 +1,28 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from samples.snippets import quickstart_v1beta2 + + +PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"] +INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf" + + +def test_quickstart(capsys): + quickstart_v1beta2.main(PROJECT_ID, INPUT_URI) + out, _ = capsys.readouterr() + assert "Entity type" in out + assert "Mention text" in out diff --git a/document_ai/snippets/set_endpoint_v1beta2.py b/document_ai/snippets/set_endpoint_v1beta2.py new file mode 100644 index 000000000000..0fa9921bb84a --- /dev/null +++ b/document_ai/snippets/set_endpoint_v1beta2.py @@ -0,0 +1,48 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +def set_endpoint( + project_id="YOUR_PROJECT_ID", + input_uri="gs://cloud-samples-data/documentai/invoice.pdf", +): + """Process a single document with the Document AI API, including + text extraction and entity extraction.""" + + # [START documentai_set_endpoint_beta] + from google.cloud import documentai_v1beta2 as documentai + + client = documentai.DocumentUnderstandingServiceClient( + client_options={"api_endpoint": "eu-documentai.googleapis.com"} + ) + # [END documentai_set_endpoint_beta] + + gcs_source = documentai.types.GcsSource(uri=input_uri) + + # mime_type can be application/pdf, image/tiff, + # and image/gif, or application/json + input_config = documentai.types.InputConfig( + gcs_source=gcs_source, mime_type="application/pdf" + ) + + # Location can be 'us' or 'eu' + parent = "projects/{}/locations/eu".format(project_id) + request = documentai.types.ProcessDocumentRequest( + parent=parent, input_config=input_config + ) + + document = client.process_document(request=request) + + # All text extracted from the document + print("Document Text: {}".format(document.text)) diff --git a/document_ai/snippets/set_endpoint_v1beta2_test.py b/document_ai/snippets/set_endpoint_v1beta2_test.py new file mode 100644 index 000000000000..be535a28d29c --- /dev/null +++ b/document_ai/snippets/set_endpoint_v1beta2_test.py @@ -0,0 +1,27 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import os + +from samples.snippets import set_endpoint_v1beta2 + + +PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"] +INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf" + + +def test_set_endpoint(capsys): + set_endpoint_v1beta2.set_endpoint(PROJECT_ID, INPUT_URI) + out, _ = capsys.readouterr() + assert "Document Text" in out