-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
pdf_processor.py
246 lines (194 loc) · 10.1 KB
/
pdf_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
import os
import sys
import logging
from typing import Dict, Tuple, Optional
from jellyfish import jaro_winkler_similarity
import datetime
import ocrmypdf
from ocrmypdf import exceptions as ocrmypdf_exceptions
from openai import OpenAI
import json
import dateparser
import re
import fitz # PyMuPDF
import tempfile
import traceback
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import ctypes
# Constants
# File extension for PDF documents. NOTE(review): not referenced in the visible
# part of this file (rename_invoice hard-codes ".pdf") - confirm whether it is
# used elsewhere.
PDF_EXTENSION = ".pdf"
# Fallback used for company name / document type when the API response lacks
# the value or the value is rejected by is_valid_filename.
UNKNOWN_VALUE = "Unknown"
# Fallback date string used when no usable document date is available.
DEFAULT_DATE = "00000000"
# harmonize_company_name only accepts a mapping whose best Jaro-Winkler
# similarity is strictly greater than this value.
CONFIDENCE_THRESHOLD = 0.85
# Module-wide OpenAI client; stays None until initialize_openai_client() is
# called and is read by process_text_with_openai().
client = None
def set_env_vars(env_vars):
    """Export every ``key -> value`` pair of ``env_vars`` into ``os.environ``.

    Existing variables with the same name are overwritten.
    """
    # Feeding the (key, value) pairs to update() assigns them one by one,
    # exactly like an explicit loop over items().
    os.environ.update(env_vars.items())
def initialize_openai_client(api_key):
    """Create an ``OpenAI`` client for ``api_key`` and store it in the module-level ``client``."""
    global client
    new_client = OpenAI(api_key=api_key)
    # Publish the client so process_text_with_openai() can use it.
    client = new_client
# Pydantic model describing the three fields extracted from a document.
# All fields are required strings (Field(...) marks them as mandatory).
# NOTE(review): the model is not referenced in the visible part of this file;
# process_text_with_openai() parses the JSON response manually - confirm usage.
class DocumentResponse(BaseModel):
    # Counterparty company named in the document.
    company_name: str = Field(..., description="Name of the company in the document")
    # Document date as text in dd.mm.yyyy form.
    document_date: str = Field(..., description="Date of the document in format dd.mm.yyyy")
    # Short document type code or description (e.g. ER / AR).
    document_type: str = Field(..., description="Type of the document (ER, AR, etc.)")
# Characters that must not appear in a file name component.
_FORBIDDEN_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*]')
# Longest file name (in characters) that is accepted.
_MAX_FILENAME_LENGTH = 255


def is_valid_filename(filename: str) -> bool:
    """Return True if ``filename`` is usable as part of a file name.

    A value is rejected when it is not a string, is empty or consists only of
    whitespace, is longer than 255 characters, or contains any of the
    characters ``< > : " / \\ | ? *``.

    Args:
        filename: Candidate text (e.g. a company name or document type).

    Returns:
        True when the text passes all checks, otherwise False.
    """
    # Values decoded from JSON may be None or a number; treat those as invalid
    # instead of letting the regex search raise TypeError.
    if not isinstance(filename, str):
        return False
    if not filename or filename.isspace():
        return False
    if len(filename) > _MAX_FILENAME_LENGTH:
        return False
    return _FORBIDDEN_FILENAME_CHARS.search(filename) is None
def pdf_to_text(pdf_path: str, start_page: int = 1, end_page: int = 3) -> str:
    """Run OCR on a PDF and return the text of the selected pages.

    The PDF is first OCR-processed with ocrmypdf into a temporary file, then
    the text is read back with PyMuPDF. If ocrmypdf reports that the file
    already contains OCR text, the original PDF is read instead.

    Args:
        pdf_path: Path of the PDF to read.
        start_page: First page to process (1-based, inclusive).
        end_page: Last page to process (1-based, inclusive).

    Returns:
        The extracted text with a ``Page N:`` header per page, stripped of
        surrounding whitespace. Errors are logged rather than raised, so the
        result is an empty string (or the pages read so far) on failure.
    """
    page_texts = []
    ocr_output_path = None
    try:
        # Step 1: Always run OCR.
        # delete=False + leaving the ``with`` block closes our handle so that
        # ocrmypdf can write to the path; the file is removed in ``finally``.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            ocr_output_path = temp_file.name
        ocr_languages = os.getenv('OCR_LANGUAGES', 'eng,deu').split(',')
        # Path the text is read from. Kept separate from ocr_output_path so the
        # cleanup below can never delete the caller's original PDF.
        text_source_path = ocr_output_path
        try:
            ocrmypdf.ocr(
                pdf_path,
                ocr_output_path,
                force_ocr=True,
                pages=f"{start_page}-{end_page}",
                language=ocr_languages,
                invalidate_digital_signatures=True
            )
        except ocrmypdf_exceptions.PriorOcrFoundError:
            logging.warning(f"Prior OCR found in {pdf_path}. Skipping OCR step.")
            text_source_path = pdf_path

        # Step 2: Extract text using PyMuPDF (fitz)
        doc = fitz.open(text_source_path)
        try:
            # Clamp to the real page count so short documents do not fail.
            for page_num in range(start_page - 1, min(end_page, len(doc))):
                page_text = doc[page_num].get_text()
                page_texts.append(f"Page {page_num + 1}:\n{page_text}\n\n")
        finally:
            # Close before the temp file is unlinked (required on Windows).
            doc.close()
    except Exception as e:
        logging.error(f"Error processing PDF {pdf_path}: {str(e)}")
    finally:
        # Clean up the temporary OCR output, whether or not processing failed.
        if ocr_output_path is not None:
            try:
                os.unlink(ocr_output_path)
            except OSError as cleanup_error:
                logging.warning(f"Could not remove temporary file {ocr_output_path}: {cleanup_error}")

    all_text = "".join(page_texts)
    logging.info(f"Extracted text (first 500 characters): {all_text[:500]}...")
    if not all_text.strip():
        logging.warning(f"No text extracted from {pdf_path}")
    return all_text.strip()
def _usable_field(value, fallback: str) -> str:
    """Return ``value`` if it is a string accepted by is_valid_filename, else ``fallback``."""
    # JSON null / numbers are not strings; check the type first so one odd
    # field cannot raise and wipe out the other extracted fields.
    if isinstance(value, str) and is_valid_filename(value):
        return value
    return fallback


def process_text_with_openai(text: str) -> Dict[str, str]:
    """Ask the OpenAI chat API to extract company name, date and type from OCR text.

    The model name is read from ``OPENAI_MODEL`` (default ``gpt-4``); the prompt
    is parameterised by ``MY_COMPANY_NAME``, ``PDF_INCOMING_INVOICE``,
    ``PDF_OUTGOING_INVOICE``, ``OUTPUT_LANGUAGE`` and ``PROMPT_EXTENSION``.

    Args:
        text: Text extracted from the PDF.

    Returns:
        A dict with the keys ``company_name``, ``document_date`` and
        ``document_type``. Each value that is missing, not a string, or not
        usable in a file name is replaced individually by ``UNKNOWN_VALUE``
        (``DEFAULT_DATE`` for the date). If the API call or JSON decoding
        fails, all three fallbacks are returned.

    Raises:
        ValueError: If initialize_openai_client() has not been called.
    """
    global client
    if client is None:
        raise ValueError("OpenAI client not initialized. Call initialize_openai_client first.")
    try:
        response = client.chat.completions.create(
            model=os.getenv("OPENAI_MODEL", "gpt-4"),
            messages=[
                {
                    "role": "system",
                    "content":
                        "You will extract the company name, document date, and document type from the following PDF text. Due to the nature of OCR Text detection, the text will be very noisy and might contain spelling and detection errors, handle those as good as possible." + "\n\n" +
                        f"document_date: Find the most appropriate date (e.g. the invoice date) and assume the correct Date Format according to the language and location of the document. Return format must be: dd.mm.YYYY" + "\n\n" +
                        f"company_name: Find the name of the company that is the corresponding party of the document. My company name is: \"{os.getenv('MY_COMPANY_NAME')}\", avoid using my company name as company_name in the response. For the company_name you always strip the legal form (e.U., SARL, GmbH, AG, Lmt, Limited etc.)" + "\n\n" +
                        f"document_type: Find the best matching type of the document. Valid document types are: For incoming invoices (invoices my company receives) use the term '{os.getenv('PDF_INCOMING_INVOICE', 'ER')}' only, nothing more. For outgoing invoices (invoices my company sends) use the term '{os.getenv('PDF_OUTGOING_INVOICE', 'AR')}', nothing more. For all other documents, find a short descriptive summary/subject in {os.getenv('OUTPUT_LANGUAGE', 'German')} language." + "\n\n" +
                        "If a value is not found, leave it empty." + "\n\n" +
                        "Output - adhere to the following JSON format: company_name, document_date, document_type. No additional text and no formatting. Only the JSON object." +
                        os.getenv("PROMPT_EXTENSION", "")
                },
                {"role": "user", "content": f"Extract the information from the text:\n\n{text}"}
            ],
            response_format={ "type": "json_object" }
        )
        content = response.choices[0].message.content
        parsed_response = json.loads(content)
        logging.info(f'API Extract Response: {parsed_response}')

        # Validate each field on its own; an unusable field falls back to its
        # default without affecting the others.
        company_name = _usable_field(parsed_response.get('company_name', UNKNOWN_VALUE), UNKNOWN_VALUE)
        document_date = _usable_field(parsed_response.get('document_date', DEFAULT_DATE), DEFAULT_DATE)
        document_type = _usable_field(parsed_response.get('document_type', UNKNOWN_VALUE), UNKNOWN_VALUE)

        return {"company_name": company_name, "document_date": document_date, "document_type": document_type}
    except Exception as e:
        # Best effort: any API / decoding problem yields the fallback values.
        logging.error(f"Error during OpenAI API call: {e}")
        return {"company_name": UNKNOWN_VALUE, "document_date": DEFAULT_DATE, "document_type": UNKNOWN_VALUE}
def harmonize_company_name(company_name: str, json_path: str) -> str:
    """Map a company name to its canonical spelling using a JSON synonym file.

    The JSON file maps each canonical name to a list of synonyms. The stripped
    input is compared case-insensitively against every synonym with the
    Jaro-Winkler similarity; the canonical name with the highest score is
    returned when that score is strictly greater than ``CONFIDENCE_THRESHOLD``.

    Args:
        company_name: Name extracted from the document.
        json_path: Path to the JSON mapping file.

    Returns:
        The canonical name on a confident match; otherwise the stripped input.
        The stripped input is also returned when the mapping file is missing,
        empty, or contains no synonyms at all.
    """
    company_name = company_name.strip()
    if not os.path.exists(json_path):
        logging.warning(f'{json_path} not found, using original name: {company_name}')
        return company_name

    with open(json_path, "r", encoding='utf-8') as file:
        harmonized_names = json.load(file)

    needle = company_name.lower()
    # Entries without synonyms are skipped: max() over an empty sequence would
    # raise ValueError, and such an entry cannot match anything anyway.
    scored_candidates = [
        (
            harmonized_name,
            max(jaro_winkler_similarity(needle, synonym.lower()) for synonym in synonyms),
        )
        for harmonized_name, synonyms in harmonized_names.items()
        if synonyms
    ]

    if scored_candidates:
        best_name, best_score = max(scored_candidates, key=lambda candidate: candidate[1])
        if best_score > CONFIDENCE_THRESHOLD:
            logging.info(f'Using harmonized company name: {best_name}')
            return best_name

    logging.info(f'No harmonized company name found, using original name: {company_name}')
    return company_name
def parse_openai_response(response: Dict[str, str]) -> Tuple[str, Optional[datetime.date], str]:
    """Unpack the extraction result into ``(company_name, parsed_date, document_type)``.

    Missing keys fall back to ``UNKNOWN_VALUE`` / ``DEFAULT_DATE``. The date
    text is parsed with dateparser in day-month-year order; if that yields
    nothing, ``DEFAULT_DATE`` is parsed instead and whatever dateparser
    returns for it (possibly None) becomes the date.
    """
    def _parse_dmy(raw_date):
        # Build a fresh settings dict per call, as the two original calls did.
        return dateparser.parse(raw_date, settings={'DATE_ORDER': 'DMY'})

    name = response.get('company_name', UNKNOWN_VALUE)
    kind = response.get('document_type', UNKNOWN_VALUE)

    when = _parse_dmy(response.get('document_date', DEFAULT_DATE))
    if when is not None:
        return name, when, kind

    # Primary parse failed: retry with the default date string.
    return name, _parse_dmy(DEFAULT_DATE), kind
def attempt_to_close_file(file_path: str) -> None:
    """Best-effort release of lingering handles before a file is renamed (Windows only).

    The previous implementation passed a pointer to the path string to
    ``kernel32.CloseHandle``. That API expects a handle to an open object, not
    a path, so the call could not close the file. A file cannot be closed by
    path; the only thing this process can do is drop its own stale references.
    This function therefore runs a garbage-collection pass so that
    unreferenced file objects created by this process are finalized.

    Args:
        file_path: Path of the file about to be renamed (used for logging).

    Returns:
        None. Failures are logged as warnings and never raised.
    """
    if sys.platform != "win32":
        return
    try:
        # Local import keeps this block self-contained.
        import gc

        gc.collect()
    except Exception as e:
        logging.warning(f"Failed to close file handle for {file_path}: {str(e)}")
def rename_invoice(pdf_path: str, company_name: str, document_date: Optional[datetime.date], document_type: str) -> None:
    """Rename a PDF to ``<date> <company> <type>.pdf`` inside its current folder.

    The date prefix is formatted with ``OUTPUT_DATE_FORMAT`` (default
    ``%Y%m%d``) and omitted when no date is given. If the target name is
    taken, `` (1)``, `` (2)``, ... is appended until a free name is found.
    Rename failures are logged, not raised.
    """
    folder = os.path.dirname(pdf_path)
    label = f'{company_name} {document_type}'
    if document_date:
        date_prefix = document_date.strftime(os.getenv("OUTPUT_DATE_FORMAT", "%Y%m%d"))
        base_name = f'{date_prefix} {label}'
    else:
        base_name = label

    target_name = f"{base_name}.pdf"
    target_path = os.path.join(folder, target_name)

    # Nothing to do when the file already carries the desired name.
    if target_path == pdf_path:
        logging.info(f'File "{target_name}" is already correctly named.')
        return

    # Probe numbered variants until an unused path is found.
    attempt = 1
    while os.path.exists(target_path):
        target_name = f'{base_name} ({attempt}).pdf'
        target_path = os.path.join(folder, target_name)
        attempt += 1

    try:
        # Attempt to close the file before renaming
        attempt_to_close_file(pdf_path)
        os.rename(pdf_path, target_path)
        logging.info(f'Document renamed to: {target_name}')
    except Exception as e:
        logging.error(f'Error renaming {pdf_path}: {str(e)}')
def process_pdf(pdf_path: str, json_path: str) -> None:
    """Run the full pipeline for one PDF: OCR, extraction, harmonization, rename.

    Processing stops with a warning when no text could be extracted. Any
    exception is logged (traceback at debug level) instead of being raised.
    """
    logging.info("---")
    logging.info(f"Processing {pdf_path}")

    try:
        text = pdf_to_text(pdf_path)
        if text:
            # Extract the fields, then normalise the company name before renaming.
            fields = parse_openai_response(process_text_with_openai(text))
            raw_company, doc_date, doc_type = fields
            canonical_company = harmonize_company_name(raw_company, json_path)
            rename_invoice(pdf_path, canonical_company, doc_date, doc_type)
        else:
            logging.warning(f"No text extracted from {pdf_path}. Skipping further processing.")
    except Exception as e:
        logging.error(f"Error processing {pdf_path}: {str(e)}")
        logging.debug(traceback.format_exc())