-
Notifications
You must be signed in to change notification settings - Fork 1
/
ocr.py
51 lines (40 loc) · 1.22 KB
/
ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
"""Optical Character Recognition using Tesseract."""
from __future__ import annotations
import logging
import subprocess
import typing
if typing.TYPE_CHECKING:
import pathlib
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class OcrHelper:
"""OCT helper for Tesseract."""
def __init__(self, exe_path: pathlib.Path, data_dir: pathlib.Path) -> None:
"""Create a new instance.
Args:
exe_path: The path to the tesseract executable.
data_dir: The path to the tesseract data directory.
"""
self._exe_path = exe_path
self._data_dir = data_dir
def run(self, image_file: pathlib.Path) -> str | None:
"""Run tesseract over an image file.
Args:
image_file: The path to the image file.
Returns:
The text from the image.
"""
cmds = [
str(self._exe_path),
"--tessdata-dir",
str(self._data_dir),
str(image_file),
"stdout",
]
result = subprocess.run(
cmds,
check=True,
capture_output=True,
shell=False,
)
raw_value = result.stdout.decode(encoding="UTF-8")
return raw_value