Skip to content

Commit

Permalink
Fix coordinate extraction problems
Browse files (browse the repository at this point in the history)
  • Loading branch information
VikParuchuri committed Oct 18, 2024
1 parent 1263a1e commit 5aa2033
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 26 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ temp.json
notebooks
results
.DS_Store
profile_output*

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
25 changes: 12 additions & 13 deletions pdftext/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@
from pdftext.pdf.utils import LINE_BREAKS, TABS, SPACES
from pdftext.settings import settings


def update_current(current, new_char):
bbox = new_char["bbox"]
if "bbox" not in current:
current["bbox"] = bbox.copy()
current_bbox = bbox.copy()
current["bbox"] = current_bbox
else:
current_bbox = current["bbox"]
current_bbox[0] = min(bbox[0], current_bbox[0])
current_bbox[1] = min(bbox[1], current_bbox[1])
current_bbox[2] = max(bbox[2], current_bbox[2])
current_bbox[3] = max(bbox[3], current_bbox[3])
current_bbox = current["bbox"]

current["center_x"] = (current_bbox[0] + current_bbox[2]) / 2
current["center_y"] = (current_bbox[1] + current_bbox[3]) / 2

Expand Down Expand Up @@ -71,9 +71,6 @@ def create_training_row(char_info, prev_char, currblock, currline):
def update_span(line, span):
if span["chars"]:
first_char = span["chars"][0]
span["font"] = first_char["font"]
span["rotation"] = first_char["rotation"]

char_bboxes = [char["bbox"] for char in span["chars"]]
min_x, min_y, max_x, max_y = char_bboxes[0]

Expand All @@ -83,14 +80,17 @@ def update_span(line, span):
max_x = max(max_x, bbox[2])
max_y = max(max_y, bbox[3])

span["bbox"] = [min_x, min_y, max_x, max_y]
span["text"] = "".join(char["char"] for char in span["chars"])
span["char_start_idx"] = first_char["char_idx"]
span["char_end_idx"] = span["chars"][-1]["char_idx"]
span.update({
"font": first_char["font"],
"rotation": first_char["rotation"],
"bbox": [min_x, min_y, max_x, max_y],
"text": "".join(char["char"] for char in span["chars"])
})

# Remove unneeded keys from the characters
char_keys = list(first_char.keys())
for char in span["chars"]:
for key in list(char.keys()):
for key in char_keys:
if key not in ["char", "bbox"]:
del char[key]

Expand Down Expand Up @@ -148,7 +148,6 @@ def normalized_diff(a, b, mult=1, use_abs=True):
char_center_y = (char_bbox[1] + char_bbox[3]) / 2
return normalized_diff(char_center_y, line_center_y)


def infer_single_page(text_chars, block_threshold=settings.BLOCK_THRESHOLD):
prev_char = None
prev_font_info = None
Expand Down Expand Up @@ -248,4 +247,4 @@ def inference(text_chars, model):
sorted_keys = sorted(page_blocks.keys())
page_blocks = [page_blocks[key] for key in sorted_keys]
assert len(page_blocks) == len(text_chars)
return page_blocks
return page_blocks
4 changes: 4 additions & 0 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
"height": page_height,
}

# For pypdfium bbox function later
page_width = math.ceil(page_width)
page_height = math.ceil(page_height)

fontname = None
fontflags = None
total_chars = text_page.count_chars()
Expand Down
19 changes: 7 additions & 12 deletions pdftext/pdf/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import pypdfium2.raw as pdfium_c
import ctypes
import math

LINE_BREAKS = ["\n", "\u000D", "\u000A"]
TABS = ["\t", "\u0009", "\x09"]
Expand Down Expand Up @@ -32,7 +31,7 @@ def get_fontname(textpage, char_index):
return decoded, flag_buffer.value


def page_to_device(page, x, y, page_width, page_height, page_rotation: int):
def page_to_device(page, x, y, page_width, page_height, page_rotation: int, device_x, device_y):
if page_rotation == 90:
page_rotation = 1
elif page_rotation == 180:
Expand All @@ -41,19 +40,15 @@ def page_to_device(page, x, y, page_width, page_height, page_rotation: int):
page_rotation = 3
else:
page_rotation = 0
width = math.ceil(page_width)
height = math.ceil(page_height)
device_x = ctypes.c_int()
device_y = ctypes.c_int()
pdfium_c.FPDF_PageToDevice(page, 0, 0, width, height, page_rotation, x, y, device_x, device_y)
x = device_x.value
y = device_y.value
return x, y
pdfium_c.FPDF_PageToDevice(page, 0, 0, page_width, page_height, page_rotation, x, y, device_x, device_y)
return device_x.value, device_y.value


def pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, page_rotation):
left_bottom = page_to_device(page, *bbox[:2], page_width, page_height, page_rotation)
top_right = page_to_device(page, *bbox[2:], page_width, page_height, page_rotation)
device_x = ctypes.c_int()
device_y = ctypes.c_int()
left_bottom = page_to_device(page, *bbox[:2], page_width, page_height, page_rotation, device_x, device_y)
top_right = page_to_device(page, *bbox[2:], page_width, page_height, page_rotation, device_x, device_y)

dev_bbox = [left_bottom[0], top_right[1], top_right[0], left_bottom[1]]
return dev_bbox
Expand Down
2 changes: 1 addition & 1 deletion pdftext/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class Settings(BaseSettings):
MODEL_PATH: str = os.path.join(BASE_PATH, "models", "dt.onnx")

# Fonts
FONTNAME_SAMPLE_FREQ: int = 4
FONTNAME_SAMPLE_FREQ: int = 6

# Inference
BLOCK_THRESHOLD: float = 0.8 # Confidence threshold for block detection
Expand Down

0 comments on commit 5aa2033

Please sign in to comment.