Skip to content

Commit

Permalink
More changes for table feature
Browse files Browse the repository at this point in the history
Ensure equal sources for `table.py` classic and rebased architecture.
  • Loading branch information
JorjMcKie authored and julian-smith-artifex-com committed Aug 22, 2023
1 parent aad6a76 commit 1ec7d35
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 87 deletions.
6 changes: 3 additions & 3 deletions fitz/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def v_tuple_to_string(t):

# copy functions in 'utils' to their respective fitz classes
import fitz.utils
from fitz.table import find_tables
from .table import find_tables

# ------------------------------------------------------------------------------
# General
Expand Down Expand Up @@ -165,8 +165,7 @@ def v_tuple_to_string(t):
fitz.Page.get_textpage_ocr = fitz.utils.get_textpage_ocr
fitz.Page.delete_image = fitz.utils.delete_image
fitz.Page.replace_image = fitz.utils.replace_image
fitz.Page.find_tables = fitz.table.find_tables

fitz.Page.find_tables = find_tables
# ------------------------------------------------------------------------
# Annot
# ------------------------------------------------------------------------
Expand Down Expand Up @@ -509,3 +508,4 @@ def deprecated_function(*args, **kw):
for k, (r, g, b) in fitz.utils.getColorInfoDict().items()
]
)
__version__ = fitz.VersionBind
97 changes: 55 additions & 42 deletions fitz/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,15 @@
# -------------------------------------------------------------------
# Start of PyMuPDF interface code
# -------------------------------------------------------------------
import fitz
from . import (
Rect,
Matrix,
TEXTFLAGS_TEXT,
TOOLS,
EMPTY_RECT,
sRGB_to_pdf,
Point,
)

EDGES = [] # vector graphics from PyMuPDF
CHARS = [] # text characters from PyMuPDF
Expand Down Expand Up @@ -1345,16 +1353,19 @@ def _get_header(self, y_tolerance=3):
*** PyMuPDF extension. ***
Check if text above the table qualifies as column headers.
Starting from the first line above the table upwards, check if it
qualifies to be part of the table header.
Criteria include:
* Column borders must not intersect any word. If this happens, the
respective line and all lines above of it are ignored.
* No excess inter-line distance. If a previous line has a distance
* A one-line table never has an extra header.
* Column borders must not intersect any word. If this happens, all
text of this line and above of it is ignored.
* No excess inter-line distance: If a line further up has a distance
of more than 1.5 times of its font size, it will be ignored and
all lines above of it.
* Must have same text properties (color, font etc.).
* If the top-left cell of the table body is bold, then the first line
above must ba bold too.
* Must have same text properties.
* Starting with the top table line, a bold text property cannot change
back to non-bold.
If not all criteria are met (or there is no text above the table),
the first table row is assumed to be the header.
Expand All @@ -1372,9 +1383,7 @@ def top_row_is_bold(bbox):
Returns True or False
"""
for b in page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT, clip=bbox)[
"blocks"
]:
for b in page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]:
for l in b["lines"]:
for s in l["spans"]:
if s["flags"] & 16:
Expand All @@ -1387,8 +1396,8 @@ def recover_top_row_cells(table):
We need all column x-coordinates even when the top table row
contains None cells.
"""
bbox = fitz.Rect(table.rows[0].bbox) # top row bbox
tbbox = fitz.Rect(table.bbox) # table bbox
bbox = Rect(table.rows[0].bbox) # top row bbox
tbbox = Rect(table.bbox) # table bbox
y0, y1 = bbox.y0, bbox.y1 # top row upper / lower coordinates

# make sure row0 bbox has the full table width
Expand All @@ -1399,7 +1408,7 @@ def recover_top_row_cells(table):
for cell in table.cells:
if cell == None: # skip non-existing cells
continue
cellbb = fitz.Rect(cell)
cellbb = Rect(cell)

# only accept cells wider than a character
if 10 < cellbb.width < tbbox.width:
Expand All @@ -1417,24 +1426,28 @@ def recover_top_row_cells(table):
return cells, bbox

# we depend on small glyph heights!
old_small = fitz.TOOLS.set_small_glyph_heights()
fitz.TOOLS.set_small_glyph_heights(True)
old_small = TOOLS.set_small_glyph_heights()
TOOLS.set_small_glyph_heights(True)
try:
row = self.rows[0]
cells = row.cells
bbox = fitz.Rect(row.bbox)
bbox = Rect(row.bbox)
except IndexError: # this table has no rows
return None

if None in cells: # if row 0 has empty cells, repair it
cells, bbox = recover_top_row_cells(self)

# return the following if we think that TableRow 0 is the header
header_first_row = TableHeader(bbox, cells, self.extract()[0], False)
# return this if we determine that the top row is the header
header_top_row = TableHeader(bbox, cells, self.extract()[0], False)

# one-line tables have no extra header
if len(self.rows) < 2:
return header_top_row

# x-ccordinates of columns between x0 and x1 of the table
if len(cells) < 2:
return header_first_row
return header_top_row

col_x = [c[2] for c in cells[:-1]] # column (x) coordinates

Expand All @@ -1450,7 +1463,7 @@ def recover_top_row_cells(table):
clip.y1 = bbox.y0 # end at top of table

spans = [] # the text spans inside clip
for b in page.get_text("dict", clip=clip, flags=fitz.TEXTFLAGS_TEXT)["blocks"]:
for b in page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]:
for l in b["lines"]:
for s in l["spans"]:
if (
Expand Down Expand Up @@ -1500,23 +1513,23 @@ def recover_top_row_cells(table):
line_bolds.append(bold)

if select == []: # nothing above the table?
return header_first_row
return header_top_row

select = select[:5] # only accept up to 5 lines in any header

# take top row as header if text above table is too far apart
if bbox.y0 - select[0] >= line_heights[0]:
return header_first_row
return header_top_row

# if top table row is bold, but line above is not:
if top_row_bold and not line_bolds[0]:
return header_first_row
return header_top_row

if spans == []: # nothing left above the table, return top row
return header_first_row
return header_top_row

# re-compute clip above table
nclip = fitz.EMPTY_RECT()
nclip = EMPTY_RECT()
for s in [s for s in spans if s["bbox"][3] >= select[-1]]:
nclip |= s["bbox"]
if not nclip.is_empty:
Expand All @@ -1525,7 +1538,7 @@ def recover_top_row_cells(table):
clip.y1 = bbox.y0 # make sure we still include every word above

# Confirm that no word in clip is intersecting a column separator
word_rects = [fitz.Rect(w[:4]) for w in page.get_text("words", clip=clip)]
word_rects = [Rect(w[:4]) for w in page.get_text("words", clip=clip)]
word_tops = sorted(list(set([r[1] for r in word_rects])), reverse=True)

select = []
Expand All @@ -1544,7 +1557,7 @@ def recover_top_row_cells(table):
break

if select == []: # nothing left over: return first row
return header_first_row
return header_top_row

hdr_bbox = +clip # compute the header cells
hdr_bbox.y0 = select[-1] # hdr_bbox top is smallest top coord of words
Expand All @@ -1559,7 +1572,7 @@ def recover_top_row_cells(table):
page.get_textbox(c).replace("\n", " ").replace(" ", " ").strip()
for c in hdr_cells
]
fitz.TOOLS.set_small_glyph_heights(old_small)
TOOLS.set_small_glyph_heights(old_small)
return TableHeader(tuple(hdr_bbox), hdr_cells, hdr_names, True)


Expand Down Expand Up @@ -1807,29 +1820,29 @@ def __getitem__(self, i):
def make_chars(page, clip=None):
"""Extract text as "rawdict" to fill CHARS."""
global CHARS
old_small = fitz.TOOLS.set_small_glyph_heights()
fitz.TOOLS.set_small_glyph_heights(True)
old_small = TOOLS.set_small_glyph_heights()
TOOLS.set_small_glyph_heights(True)
page_number = page.number + 1
page_height = page.rect.height
ctm = page.transformation_matrix
blocks = page.get_text("rawdict", clip=clip, flags=fitz.TEXTFLAGS_TEXT)["blocks"]
blocks = page.get_text("rawdict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
doctop_base = page_height * page.number
for block in blocks:
for line in block["lines"]:
ldir = line["dir"] # = (cosine, sine) of angle
matrix = fitz.Matrix(ldir[0], -ldir[1], ldir[1], ldir[0], 0, 0)
matrix = Matrix(ldir[0], -ldir[1], ldir[1], ldir[0], 0, 0)
if ldir[1] == 0:
upright = True
else:
upright = False
for span in sorted(line["spans"], key=lambda s: s["bbox"][0]):
fontname = span["font"]
fontsize = span["size"]
color = fitz.sRGB_to_pdf(span["color"])
color = sRGB_to_pdf(span["color"])
for char in sorted(span["chars"], key=lambda c: c["bbox"][0]):
bbox = fitz.Rect(char["bbox"])
bbox = Rect(char["bbox"])
bbox_ctm = bbox * ctm
origin = fitz.Point(char["origin"]) * ctm
origin = Point(char["origin"]) * ctm
matrix.e = origin.x
matrix.f = origin.y
text = char["c"]
Expand Down Expand Up @@ -1859,7 +1872,7 @@ def make_chars(page, clip=None):
}
CHARS.append(char_dict)

fitz.TOOLS.set_small_glyph_heights(old_small)
TOOLS.set_small_glyph_heights(old_small)


# -----------------------------------------------------------------------------
Expand All @@ -1874,7 +1887,7 @@ def make_edges(page, clip=None, tset=None):
x_tolerance = tset.snap_x_tolerance
y_tolerance = tset.snap_y_tolerance
if clip != None:
clip = fitz.Rect(clip)
clip = Rect(clip)
else:
clip = page.rect
clip = page.rect
Expand Down Expand Up @@ -1948,16 +1961,16 @@ def make_line(p, p1, p2, clip):
continue
if rect.width <= x_tolerance:
x = abs(rect.x1 + rect.x0) / 2
p1 = fitz.Point(x, rect.y0)
p2 = fitz.Point(x, rect.y1)
p1 = Point(x, rect.y0)
p2 = Point(x, rect.y1)
line_dict = make_line(p, p1, p2, clip)
if line_dict != {}:
EDGES.append(line_to_edge(line_dict))
continue
elif rect.height <= y_tolerance:
y = abs(rect.y1 + rect.y0) / 2
p1 = fitz.Point(rect.x0, y)
p2 = fitz.Point(rect.x1, y)
p1 = Point(rect.x0, y)
p2 = Point(rect.x1, y)
line_dict = make_line(p, p1, p2, clip)
if line_dict != {}:
EDGES.append(line_to_edge(line_dict))
Expand Down
5 changes: 5 additions & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21220,6 +21220,9 @@ def _atexit():
Page.show_pdf_page = utils.show_pdf_page
Page.update_link = utils.update_link
Page.write_text = utils.write_text
from .table import find_tables

Page.find_tables = find_tables

Rect.get_area = utils.get_area

Expand Down Expand Up @@ -21520,3 +21523,5 @@ def deprecated_function( *args, **kwargs):

if 0:
restore_aliases()

__version__ = VersionBind
Loading

0 comments on commit 1ec7d35

Please sign in to comment.