-
Notifications
You must be signed in to change notification settings - Fork 2
/
google_vision_ocr.py
157 lines (122 loc) · 4.96 KB
/
google_vision_ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
google_vision_ocr.py: goes through a directory of png files and outputs text
and json files in an output directory with the same file name as input file.
So, for instance, abc_1_15.png produces abc_1_15.txt and abc_1_15.json.
Modified from: python-docs-samples/vision/cloud-client/document_text/doctext.py
"""
# [START vision_document_text_tutorial]
# [START vision_document_text_tutorial_imports]
import os
import argparse
from enum import Enum
import io
import time
from glob import glob
from google.cloud import vision
from google.cloud.vision import types
from PIL import Image, ImageDraw
# [END vision_document_text_tutorial_imports]
class FeatureType(Enum):
PAGE = 1
BLOCK = 2
PARA = 3
WORD = 4
SYMBOL = 5
def draw_boxes(image, bounds, color):
"""Draw a border around the image using the hints in the vector list."""
draw = ImageDraw.Draw(image)
for bound in bounds:
draw.polygon([
bound.vertices[0].x, bound.vertices[0].y,
bound.vertices[1].x, bound.vertices[1].y,
bound.vertices[2].x, bound.vertices[2].y,
bound.vertices[3].x, bound.vertices[3].y], None, color)
return image
def detect_document_text(image_file, textfile, jsonfile):
"""Returns document text given an image."""
client = vision.ImageAnnotatorClient()
with io.open(image_file, 'rb') as image_file:
content = image_file.read()
image = types.Image(content=content)
image_context = types.ImageContext(language_hints=['en'])
response = client.document_text_detection(image=image, timeout=300, image_context=image_context)
document = response.full_text_annotation
if textfile is not 0:
with io.open(textfile, 'wb') as f:
f.write(document.text.encode('utf-8'))
if jsonfile is not 0:
with io.open(jsonfile, 'wb') as f:
f.write(str(document))
return document
def get_document_bounds(document, feature):
# [START vision_document_text_tutorial_detect_bounds]
bounds = []
# Collect specified feature bounds by enumerating all document features
for page in document.pages:
for block in page.blocks:
for paragraph in block.paragraphs:
for word in paragraph.words:
for symbol in word.symbols:
if (feature == FeatureType.SYMBOL):
bounds.append(symbol.bounding_box)
if (feature == FeatureType.WORD):
bounds.append(word.bounding_box)
if (feature == FeatureType.PARA):
bounds.append(paragraph.bounding_box)
if (feature == FeatureType.BLOCK):
bounds.append(block.bounding_box)
if (feature == FeatureType.PAGE):
bounds.append(block.bounding_box)
# The list `bounds` contains the coordinates of the bounding boxes.
# [END vision_document_text_tutorial_detect_bounds]
return bounds
def render_doc_text(filein, fileout, textfile, jsonfile):
doc = detect_document_text(filein, textfile, jsonfile)
image = Image.open(filein)
bounds = get_document_bounds(doc, FeatureType.BLOCK)
draw_boxes(image, bounds, 'blue')
bounds = get_document_bounds(doc, FeatureType.PARA)
draw_boxes(image, bounds, 'red')
bounds = get_document_bounds(doc, FeatureType.WORD)
draw_boxes(image, bounds, 'green')
if fileout is not 0:
image.save(fileout)
else:
image.show()
n = 0
sum = 0
for p in doc.pages:
for b in p.blocks:
sum += b.confidence
n += 1.0
print("Confidence: %0.4f" % (sum / n))
if __name__ == "__main__":
title = 'OCR PNG files in the directory using Google Vision API'
parser = argparse.ArgumentParser(description=title)
parser.add_argument('directory', default=None,
help='Directory contains PNG files')
parser.add_argument('--overwritten',
help='Overwrite if output file exists',
action='store_true')
parser.add_argument('-o', '--output', default='output',
help='Directory for output files')
args = parser.parse_args()
print(args)
if not os.path.exists(args.output):
os.makedirs(args.output)
for filein in glob(os.path.join(args.directory, '*.png')):
print('Processing...{:s}'.format(filein))
base_fn = os.path.basename(filein)
fn = os.path.splitext(base_fn)[0]
fileout = os.path.join(args.output, fn + '.png')
if os.path.exists(fileout) and not args.overwritten:
print("- Output exists, skip...")
continue
textfile = os.path.join(args.output, fn + '.txt')
jsonfile = os.path.join(args.output, fn + '.json')
start = time.time()
render_doc_text(filein, fileout, textfile, jsonfile)
duration = time.time() - start
print("- Duration: %0.1f" % (duration))