Skip to content

Commit

Permalink
docs(samples): add OCR, form, quality, splitter and specialized proce…
Browse files Browse the repository at this point in the history
…ssing samples (#239)

* docs(samples): add processing samples for OCR, quality, splitter and specialized

* Update quality, specialized and splitter samples

* Fix lint issues

* Fix snippet tags

* update library from v1 to v1beta3

* restore previous processing sample to avoid sample tag breakage
  • Loading branch information
Matt Carroll authored Nov 10, 2021
1 parent d5e0d84 commit 102553b
Show file tree
Hide file tree
Showing 14 changed files with 700 additions and 0 deletions.
115 changes: 115 additions & 0 deletions documentai/snippets/process_document_form_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# [START documentai_process_form_document]

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
# file_path = '/path/to/local/pdf'

def process_document_form_sample(
    project_id: str, location: str, processor_id: str, file_path: str
) -> None:
    """
    Send a local PDF to a Document AI form processor and print what it found.

    Prints the full document text and the page count, then for every page the
    detected tables (column/row counts plus a short preview) and every form
    field as a ``name: value`` pair.

    Args:
        project_id: Google Cloud project ID that owns the processor.
        location: Processor location; the sample handles 'us' and 'eu'.
        processor_id: ID of a processor already created in the Cloud Console.
        file_path: Path to a local PDF file.
    """
    from google.cloud import documentai_v1beta3 as documentai

    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {}
    if location == "eu":
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "raw_document": document}

    # Recognizes text entities in the PDF document
    result = client.process_document(request=request)

    print("Document processing complete.")

    # Read the table and form fields output from the processor
    # The form processor also contains OCR data. For more information
    # on how to parse OCR data please see the OCR sample.
    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
    document = result.document
    text = document.text
    print(f"Full document text: {repr(text)}\n")
    print(f"There are {len(document.pages)} page(s) in this document.")

    # Read the text recognition output from the processor
    for page in document.pages:
        print(f"\n\n**** Page {page.page_number} ****")

        print(f"Found {len(page.tables)} table(s):")
        for table in page.tables:
            # Indexing header_rows[0] unconditionally raises IndexError for a
            # table that has no header row, so fall back to 0 columns.
            num_columns = (
                len(table.header_rows[0].cells) if table.header_rows else 0
            )
            num_rows = len(table.body_rows)
            print(f'Table with {num_columns} columns and {num_rows} rows:')
            # print_table_info previews the first header row and the first
            # body row, so only call it when both exist.
            if table.header_rows and table.body_rows:
                print_table_info(table, text)
        print(f'Found {len(page.form_fields)} form fields:')
        for field in page.form_fields:
            # Use dedicated names so the processor resource `name` above is
            # not overwritten inside the loop.
            field_name = layout_to_text(field.field_name, text)
            field_value = layout_to_text(field.field_value, text)
            print(f" * {repr(field_name.strip())}: {repr(field_value.strip())}")


def print_table_info(table: dict, text: str) -> None:
    """
    Print a short preview of a table: its header cells and its first body row.

    Args:
        table: Table object read via attribute access (``header_rows`` and
            ``body_rows``, each row exposing ``cells`` with a ``layout``).
        text: Full document text that the cell layouts index into.

    A table with no header rows or no body rows prints an empty preview for
    the missing part instead of raising IndexError.
    """
    # Print header row
    header_cells = table.header_rows[0].cells if table.header_rows else []
    header_row_text = ' | '.join(
        repr(layout_to_text(header_cell.layout, text).strip())
        for header_cell in header_cells
    )
    print(f'Columns: {header_row_text}')
    # Print first body row
    body_cells = table.body_rows[0].cells if table.body_rows else []
    body_row_text = ' | '.join(
        repr(layout_to_text(body_cell.layout, text).strip())
        for body_cell in body_cells
    )
    print(f'First row data: {body_row_text}\n')


def layout_to_text(layout: dict, text: str) -> str:
    """
    Document AI identifies form fields by their offsets in the entirety of the
    document's text. This function converts offsets to a string.

    Args:
        layout: Object whose ``text_anchor.text_segments`` each carry a
            ``start_index`` and an ``end_index`` into ``text``.
        text: Full document text.

    Returns:
        The concatenation of every referenced slice of ``text``; an empty
        string when there are no segments.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    # The previous `segment in text_segments` test was always true for a
    # segment taken from that same collection, so its `else 0` fallback never
    # ran; int() on the index is all that is needed.
    # NOTE(review): an unset start_index presumably reads as 0 (proto default)
    # - confirm against the client library.
    return "".join(
        text[int(segment.start_index):int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )


# [END documentai_process_form_document]
43 changes: 43 additions & 0 deletions documentai/snippets/process_document_form_sample_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

from samples.snippets import process_document_form_sample


location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
file_path = "resources/invoice.pdf"


def test_process_documents(capsys):
    """Run the form sample on the test invoice and check its printed output."""
    process_document_form_sample.process_document_form_sample(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
    )
    captured_stdout, _ = capsys.readouterr()

    # Every fragment below must appear somewhere in what the sample printed.
    for fragment in (
        "There are 1 page(s) in this document.",
        "Table with 4 columns and 6 rows",
        "Found 13 form fields",
        "'BALANCE DUE': '$2140.00'",
    ):
        assert fragment in captured_stdout
141 changes: 141 additions & 0 deletions documentai/snippets/process_document_ocr_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# [START documentai_process_ocr_document]

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
# file_path = '/path/to/local/pdf'

def process_document_ocr_sample(
    project_id: str, location: str, processor_id: str, file_path: str
) -> None:
    """
    Send a local PDF to a Document AI OCR processor and print what it found.

    Prints the full document text and the page count, then for each page its
    dimensions, detected languages, and the first/last paragraph, block, line
    and token.
    """
    from google.cloud import documentai_v1beta3 as documentai

    # Only a non-'us' location needs an explicit api_endpoint.
    client_options = (
        {"api_endpoint": "eu-documentai.googleapis.com"} if location == "eu" else {}
    )
    client = documentai.DocumentProcessorServiceClient(
        client_options=client_options
    )

    # Full processor resource name, shaped like:
    # projects/project-id/locations/location/processors/processor-id
    # The processor itself has to be created in the Cloud Console beforehand.
    processor_name = (
        f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    )

    # Load the whole PDF into memory.
    with open(file_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # Build the request and hand the raw PDF to the processor.
    raw_document = {"content": pdf_bytes, "mime_type": "application/pdf"}
    response = client.process_document(
        request={"name": processor_name, "raw_document": raw_document}
    )

    print("Document processing complete.")

    # Walk the OCR output. The full list of Document attributes is at:
    # https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
    processed = response.document
    full_text = processed.text
    print(f"Full document text: {repr(full_text)}\n")
    print(f"There are {len(processed.pages)} page(s) in this document.\n")

    for current_page in processed.pages:
        print(f"Page {current_page.page_number}:")
        print_page_dimensions(current_page.dimension)
        print_detected_langauges(current_page.detected_languages)
        print_paragraphs(current_page.paragraphs, full_text)
        print_blocks(current_page.blocks, full_text)
        print_lines(current_page.lines, full_text)
        print_tokens(current_page.tokens, full_text)


def print_page_dimensions(dimension: dict) -> None:
    """Print the width and then the height read from ``dimension``."""
    width_text = str(dimension.width)
    print(f" Width: {width_text}")
    height_text = str(dimension.height)
    print(f" Height: {height_text}")


def print_detected_langauges(detected_languages: dict) -> None:
    """
    Print one line per detected language: its code and its confidence
    formatted as a percentage with one decimal place.
    """
    print(" Detected languages:")
    for language in detected_languages:
        language_code = language.language_code
        confidence_text = format(language.confidence, ".1%")
        print(f" {language_code} ({confidence_text} confidence)")


def print_paragraphs(paragraphs: dict, text: str) -> None:
    """
    Print how many paragraphs were detected, plus the first and last one.

    Args:
        paragraphs: Sequence of paragraph objects, each exposing ``layout``.
        text: Full document text that the layouts index into.
    """
    print(f" {len(paragraphs)} paragraphs detected:")
    # With no paragraphs there is no first/last element; indexing [0] would
    # raise IndexError.
    if not paragraphs:
        return
    first_paragraph_text = layout_to_text(paragraphs[0].layout, text)
    print(f" First paragraph text: {repr(first_paragraph_text)}")
    last_paragraph_text = layout_to_text(paragraphs[-1].layout, text)
    print(f" Last paragraph text: {repr(last_paragraph_text)}")


def print_blocks(blocks: dict, text: str) -> None:
    """
    Print how many blocks were detected, plus the first and last one.

    Args:
        blocks: Sequence of block objects, each exposing ``layout``.
        text: Full document text that the layouts index into.
    """
    print(f" {len(blocks)} blocks detected:")
    # With no blocks there is no first/last element; indexing [0] would
    # raise IndexError.
    if not blocks:
        return
    first_block_text = layout_to_text(blocks[0].layout, text)
    print(f" First text block: {repr(first_block_text)}")
    last_block_text = layout_to_text(blocks[-1].layout, text)
    print(f" Last text block: {repr(last_block_text)}")


def print_lines(lines: dict, text: str) -> None:
    """
    Print how many lines were detected, plus the first and last one.

    Args:
        lines: Sequence of line objects, each exposing ``layout``.
        text: Full document text that the layouts index into.
    """
    print(f" {len(lines)} lines detected:")
    # With no lines there is no first/last element; indexing [0] would
    # raise IndexError.
    if not lines:
        return
    first_line_text = layout_to_text(lines[0].layout, text)
    print(f" First line text: {repr(first_line_text)}")
    last_line_text = layout_to_text(lines[-1].layout, text)
    print(f" Last line text: {repr(last_line_text)}")


def print_tokens(tokens: dict, text: str) -> None:
    """
    Print how many tokens were detected, plus the text and detected break
    type of the first and last one.

    Args:
        tokens: Sequence of token objects, each exposing ``layout`` and
            ``detected_break.type_``.
        text: Full document text that the layouts index into.
    """
    print(f" {len(tokens)} tokens detected:")
    # With no tokens there is no first/last element; indexing [0] would
    # raise IndexError.
    if not tokens:
        return
    first_token_text = layout_to_text(tokens[0].layout, text)
    first_token_break_type = tokens[0].detected_break.type_.name
    print(f" First token text: {repr(first_token_text)}")
    print(f" First token break type: {repr(first_token_break_type)}")
    last_token_text = layout_to_text(tokens[-1].layout, text)
    last_token_break_type = tokens[-1].detected_break.type_.name
    print(f" Last token text: {repr(last_token_text)}")
    print(f" Last token break type: {repr(last_token_break_type)}")


def layout_to_text(layout: dict, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document's text. This function converts
    offsets to a string.

    Args:
        layout: Object whose ``text_anchor.text_segments`` each carry a
            ``start_index`` and an ``end_index`` into ``text``.
        text: Full document text.

    Returns:
        The concatenation of every referenced slice of ``text``; an empty
        string when there are no segments.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    # The previous `segment in text_segments` test was always true for a
    # segment taken from that same collection, so its `else 0` fallback never
    # ran; int() on the index is all that is needed.
    # NOTE(review): an unset start_index presumably reads as 0 (proto default)
    # - confirm against the client library.
    return "".join(
        text[int(segment.start_index):int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )


# [END documentai_process_ocr_document]
37 changes: 37 additions & 0 deletions documentai/snippets/process_document_ocr_sample_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

from samples.snippets import process_document_ocr_sample

location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "91e072f8626a76b7"
file_path = "resources/handwritten_form.pdf"


def test_process_documents(capsys):
    """Run the OCR sample on the handwritten form and check its printed output."""
    process_document_ocr_sample.process_document_ocr_sample(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
    )
    captured_stdout, _ = capsys.readouterr()

    # NOTE(review): "en" is a very loose check - it also matches inside the
    # sample's own "Document processing complete." line. Consider asserting
    # the full language line once its exact text is confirmed.
    for fragment in ("Page 1", "en", "FakeDoc"):
        assert fragment in captured_stdout
Loading

0 comments on commit 102553b

Please sign in to comment.