-
Notifications
You must be signed in to change notification settings - Fork 2.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[formrecognizer] Add prebuilt-document samples and tests (#20894)
* add prebuilt-document samples * fix from_generated methods * add prebuilt-document tests * update samples * fix spelling error
- Loading branch information
1 parent
831d3c6
commit be52288
Showing
18 changed files
with
34,073 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
173 changes: 173 additions & 0 deletions
173
.../azure-ai-formrecognizer/samples/v3.2-beta/async_samples/sample_analyze_document_async.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,173 @@ | ||
# coding: utf-8 | ||
|
||
# ------------------------------------------------------------------------- | ||
# Copyright (c) Microsoft Corporation. All rights reserved. | ||
# Licensed under the MIT License. See License.txt in the project root for | ||
# license information. | ||
# -------------------------------------------------------------------------- | ||
|
||
""" | ||
FILE: sample_analyze_document_async.py | ||
DESCRIPTION: | ||
This sample demonstrates how to extract general document information from a document | ||
given through a file. | ||
Note that selection marks returned from begin_analyze_document() do not return the text associated with | ||
the checkbox. For the API to return this information, build a custom model to analyze the checkbox and its text. | ||
See sample_build_model_async.py for more information. | ||
USAGE: | ||
python sample_analyze_document_async.py | ||
Set the environment variables with your own values before running the sample: | ||
1) AZURE_FORM_RECOGNIZER_ENDPOINT - the endpoint to your Cognitive Services resource. | ||
2) AZURE_FORM_RECOGNIZER_KEY - your Form Recognizer API key | ||
""" | ||
|
||
import os | ||
import asyncio | ||
|
||
def format_bounding_region(bounding_regions):
    """Render a collection of bounding regions as one printable string.

    :param bounding_regions: Iterable of objects exposing ``page_number`` and
        ``bounding_box`` attributes. May be ``None`` or empty.
    :return: ``"N/A"`` when no regions are given, otherwise a comma-separated
        list of ``"Page #<page_number>: <formatted box>"`` entries.
    """
    if not bounding_regions:
        return "N/A"
    return ", ".join(
        "Page #{}: {}".format(
            region.page_number, format_bounding_box(region.bounding_box)
        )
        for region in bounding_regions
    )
|
||
def format_bounding_box(bounding_box):
    """Render the points of a bounding box as a printable string.

    :param bounding_box: Iterable of point objects exposing ``x`` and ``y``
        attributes. May be ``None`` or empty.
    :return: ``"N/A"`` when no points are given, otherwise the points joined
        as ``"[x, y], [x, y], ..."``.
    """
    if not bounding_box:
        return "N/A"
    return ", ".join("[{}, {}]".format(p.x, p.y) for p in bounding_box)
|
||
|
||
async def analyze_document():
    """Analyze a sample form with the ``prebuilt-document`` model and print the result.

    Reads ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and ``AZURE_FORM_RECOGNIZER_KEY``
    from the environment (a missing variable raises ``KeyError``), submits
    ``form_selection_mark.png`` from the sample forms directory, awaits the
    result, and prints styles, pages (lines, words, selection marks), tables,
    entities and key-value pairs to stdout.
    """
    # Resolve the sample image relative to this file: three ".." segments
    # climb from the file itself up to the directory holding sample_forms.
    path_to_sample_documents = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "..",
            "./sample_forms/forms/form_selection_mark.png",
        )
    )
    # [START analyze_document]
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer.aio import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    # The async context manager closes the client once the analysis is done.
    async with document_analysis_client:
        with open(path_to_sample_documents, "rb") as f:
            poller = await document_analysis_client.begin_analyze_document(
                "prebuilt-document", document=f
            )
        result = await poller.result()

    for style in result.styles:
        print(
            "Document contains {} content".format(
                "handwritten" if style.is_handwritten else "no handwritten"
            )
        )

    for idx, page in enumerate(result.pages):
        print("----Analyzing document from page #{}----".format(idx + 1))
        print(
            "Page has width: {} and height: {}, measured with unit: {}".format(
                page.width, page.height, page.unit
            )
        )

        for line_idx, line in enumerate(page.lines):
            print(
                "Line # {} has text content '{}' within bounding box '{}'".format(
                    line_idx,
                    line.content,
                    format_bounding_box(line.bounding_box),
                )
            )

        for word in page.words:
            print(
                "...Word '{}' has a confidence of {}".format(
                    word.content, word.confidence
                )
            )

        for selection_mark in page.selection_marks:
            print(
                "Selection mark is '{}' within bounding box '{}' and has a confidence of {}".format(
                    selection_mark.state,
                    format_bounding_box(selection_mark.bounding_box),
                    selection_mark.confidence,
                )
            )

    for table_idx, table in enumerate(result.tables):
        print(
            "Table # {} has {} rows and {} columns".format(
                table_idx, table.row_count, table.column_count
            )
        )
        for region in table.bounding_regions:
            print(
                "Table # {} location on page: {} is {}".format(
                    table_idx,
                    region.page_number,
                    format_bounding_box(region.bounding_box),
                )
            )
        for cell in table.cells:
            print(
                "...Cell[{}][{}] has text '{}'".format(
                    cell.row_index,
                    cell.column_index,
                    cell.content,
                )
            )
            for region in cell.bounding_regions:
                print(
                    "...content on page {} is within bounding box '{}'".format(
                        region.page_number,
                        format_bounding_box(region.bounding_box),
                    )
                )

    print("----Entities found in document----")
    for entity in result.entities:
        print(
            "Entity of category '{}' with sub-category '{}'".format(
                entity.category, entity.sub_category
            )
        )
        print("...has content '{}'".format(entity.content))
        print(
            "...within '{}' bounding regions".format(
                format_bounding_region(entity.bounding_regions)
            )
        )
        print("...with confidence {}".format(entity.confidence))

    print("----Key-value pairs found in document----")
    for kv_pair in result.key_value_pairs:
        # Either side of a pair may be absent, so each is checked separately.
        if kv_pair.key:
            print(
                "Key '{}' found within '{}' bounding regions".format(
                    kv_pair.key.content,
                    format_bounding_region(kv_pair.key.bounding_regions),
                )
            )
        if kv_pair.value:
            print(
                "Value '{}' found within '{}' bounding regions".format(
                    kv_pair.value.content,
                    format_bounding_region(kv_pair.value.bounding_regions),
                )
            )
    print("----------------------------------------")

    # [END analyze_document]
|
||
|
||
async def main():
    """Run the document analysis sample."""
    await analyze_document()
|
||
if __name__ == "__main__":
    # asyncio.run creates a fresh event loop, runs main() to completion and
    # closes the loop afterwards. Calling asyncio.get_event_loop() with no
    # running loop is deprecated on recent Python versions and leaves the
    # loop open.
    asyncio.run(main())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
165 changes: 165 additions & 0 deletions
165
sdk/formrecognizer/azure-ai-formrecognizer/samples/v3.2-beta/sample_analyze_document.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
# coding: utf-8 | ||
|
||
# ------------------------------------------------------------------------- | ||
# Copyright (c) Microsoft Corporation. All rights reserved. | ||
# Licensed under the MIT License. See License.txt in the project root for | ||
# license information. | ||
# -------------------------------------------------------------------------- | ||
|
||
""" | ||
FILE: sample_analyze_document.py | ||
DESCRIPTION: | ||
This sample demonstrates how to extract general document information from a document | ||
given through a file. | ||
Note that selection marks returned from begin_analyze_document() do not return the text associated with | ||
the checkbox. For the API to return this information, build a custom model to analyze the checkbox and its text. | ||
See sample_build_model.py for more information. | ||
USAGE: | ||
python sample_analyze_document.py | ||
Set the environment variables with your own values before running the sample: | ||
1) AZURE_FORM_RECOGNIZER_ENDPOINT - the endpoint to your Cognitive Services resource. | ||
2) AZURE_FORM_RECOGNIZER_KEY - your Form Recognizer API key | ||
""" | ||
|
||
import os | ||
|
||
def format_bounding_region(bounding_regions):
    """Render a collection of bounding regions as one printable string.

    :param bounding_regions: Iterable of objects exposing ``page_number`` and
        ``bounding_box`` attributes. May be ``None`` or empty.
    :return: ``"N/A"`` when no regions are given, otherwise a comma-separated
        list of ``"Page #<page_number>: <formatted box>"`` entries.
    """
    if not bounding_regions:
        return "N/A"
    return ", ".join(
        "Page #{}: {}".format(
            region.page_number, format_bounding_box(region.bounding_box)
        )
        for region in bounding_regions
    )
|
||
def format_bounding_box(bounding_box):
    """Render the points of a bounding box as a printable string.

    :param bounding_box: Iterable of point objects exposing ``x`` and ``y``
        attributes. May be ``None`` or empty.
    :return: ``"N/A"`` when no points are given, otherwise the points joined
        as ``"[x, y], [x, y], ..."``.
    """
    if not bounding_box:
        return "N/A"
    return ", ".join("[{}, {}]".format(p.x, p.y) for p in bounding_box)
|
||
|
||
def analyze_document():
    """Analyze a sample form with the ``prebuilt-document`` model and print the result.

    Reads ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and ``AZURE_FORM_RECOGNIZER_KEY``
    from the environment (a missing variable raises ``KeyError``), submits
    ``form_selection_mark.png`` from the sample forms directory, waits for the
    result, and prints styles, pages (lines, words, selection marks), tables,
    entities and key-value pairs to stdout.
    """
    # Resolve the sample image relative to this file: two ".." segments
    # climb from the file itself up to the directory holding sample_forms.
    path_to_sample_documents = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/forms/form_selection_mark.png",
        )
    )
    # [START analyze_document]
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )
    with open(path_to_sample_documents, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-document", document=f
        )
    result = poller.result()

    for style in result.styles:
        print(
            "Document contains {} content".format(
                "handwritten" if style.is_handwritten else "no handwritten"
            )
        )

    for idx, page in enumerate(result.pages):
        print("----Analyzing document from page #{}----".format(idx + 1))
        print(
            "Page has width: {} and height: {}, measured with unit: {}".format(
                page.width, page.height, page.unit
            )
        )

        for line_idx, line in enumerate(page.lines):
            print(
                "Line # {} has text content '{}' within bounding box '{}'".format(
                    line_idx,
                    line.content,
                    format_bounding_box(line.bounding_box),
                )
            )

        for word in page.words:
            print(
                "...Word '{}' has a confidence of {}".format(
                    word.content, word.confidence
                )
            )

        for selection_mark in page.selection_marks:
            print(
                "Selection mark is '{}' within bounding box '{}' and has a confidence of {}".format(
                    selection_mark.state,
                    format_bounding_box(selection_mark.bounding_box),
                    selection_mark.confidence,
                )
            )

    for table_idx, table in enumerate(result.tables):
        print(
            "Table # {} has {} rows and {} columns".format(
                table_idx, table.row_count, table.column_count
            )
        )
        for region in table.bounding_regions:
            print(
                "Table # {} location on page: {} is {}".format(
                    table_idx,
                    region.page_number,
                    format_bounding_box(region.bounding_box),
                )
            )
        for cell in table.cells:
            print(
                "...Cell[{}][{}] has text '{}'".format(
                    cell.row_index,
                    cell.column_index,
                    cell.content,
                )
            )
            for region in cell.bounding_regions:
                print(
                    "...content on page {} is within bounding box '{}'".format(
                        region.page_number,
                        format_bounding_box(region.bounding_box),
                    )
                )

    print("----Entities found in document----")
    for entity in result.entities:
        print(
            "Entity of category '{}' with sub-category '{}'".format(
                entity.category, entity.sub_category
            )
        )
        print("...has content '{}'".format(entity.content))
        print(
            "...within '{}' bounding regions".format(
                format_bounding_region(entity.bounding_regions)
            )
        )
        print("...with confidence {}".format(entity.confidence))

    print("----Key-value pairs found in document----")
    for kv_pair in result.key_value_pairs:
        # Either side of a pair may be absent, so each is checked separately.
        if kv_pair.key:
            print(
                "Key '{}' found within '{}' bounding regions".format(
                    kv_pair.key.content,
                    format_bounding_region(kv_pair.key.bounding_regions),
                )
            )
        if kv_pair.value:
            print(
                "Value '{}' found within '{}' bounding regions".format(
                    kv_pair.value.content,
                    format_bounding_region(kv_pair.value.bounding_regions),
                )
            )
    print("----------------------------------------")

    # [END analyze_document]
|
||
|
||
if __name__ == "__main__":
    # Run the sample only when executed as a script, not when imported.
    analyze_document()
Oops, something went wrong.