-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_to_excel.py
59 lines (49 loc) · 1.81 KB
/
pdf_to_excel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import tabula
import fitz
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def extract_tables_with_tabula(pdf_path):
try:
tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)
return tables
except Exception as e:
print(f"Error using tabula for {pdf_path}: {e}")
return None
def extract_text_with_ocr(pdf_path):
images = convert_from_path(pdf_path)
text = ""
for img in images:
text += pytesseract.image_to_string(img) + "\n"
return text
def extract_tables_from_text(text):
lines = text.split('\n')
tables = []
table = []
for line in lines:
if line.strip() == "":
if table:
tables.append(table)
table = []
else:
table.append(line.split())
if table:
tables.append(table)
return tables
def save_tables_to_excel(tables, writer, sheet_name):
if tables:
combined_df = pd.concat([pd.DataFrame(table) for table in tables], ignore_index=True)
combined_df.to_excel(writer, sheet_name=sheet_name, index=False, header=False)
def convert_pdf_to_excel(pdf_path, writer, sheet_name):
tables = extract_tables_with_tabula(pdf_path)
if not tables:
text = extract_text_with_ocr(pdf_path)
tables = extract_tables_from_text(text)
save_tables_to_excel(tables, writer, sheet_name)
pdf_paths = ["12-DEZEMBRO (2).pdf", "Balanco socio ostensivo 2022 (2) (2).pdf"]
excel_path = "combined_output4.xlsx"
with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
for i, pdf_path in enumerate(pdf_paths):
sheet_name = f'Arquivo_{i+1}'
convert_pdf_to_excel(pdf_path, writer, sheet_name)