Skip to content

Commit

Permalink
pageobjects: descend into form objects
Browse files Browse the repository at this point in the history
CC #137
  • Loading branch information
mara004 committed Sep 15, 2022
1 parent adb4294 commit dac4d6c
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 31 deletions.
20 changes: 10 additions & 10 deletions src/pypdfium2/_cli/find_pageobjects.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,30 +11,31 @@


def attach_parser(subparsers):
    """
    Register the ``find-pageobjects`` subcommand and its arguments.

    Parameters:
        subparsers:
            Sub-parser collection to attach the command to
            (its ``add_parser()`` method is called).
    """
    # Computed once so that ``choices`` and ``default`` of --types always agree.
    obj_types = list(ObjtypeToConst.keys())
    parser = subparsers.add_parser(
        "find-pageobjects",
        help = "Locate page objects of given types.",
    )
    parser.add_argument(
        "input",
        type = os.path.abspath,
        help = "Path to the PDF document to work with.",
    )
    parser.add_argument(
        "--password",
        help = "Password to unlock the PDF, if encrypted.",
    )
    parser.add_argument(
        "--pages",
        type = pagetext_type,
        help = "The pages to search (defaults to all).",
    )
    # Not marked as required: omitting --types selects every known object type.
    parser.add_argument(
        "--types",
        nargs = "+",
        choices = obj_types,
        default = obj_types,
        help = "Object types to consider (defaults to all).",
    )


Expand All @@ -48,9 +49,8 @@ def main(args):
for index in args.pages:
page = doc.get_page(index)
for obj in page.get_objects():
type = obj.get_type()
if type in args.types:
print(ObjtypeToName[type], obj.get_pos())
if obj.type in args.types:
print(" "*obj.level + ObjtypeToName[obj.type], obj.get_pos())
page.close()

doc.close()
65 changes: 45 additions & 20 deletions src/pypdfium2/_helpers/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,23 +225,52 @@ def insert_text(
pdfium.FPDFPage_GenerateContent(self.raw)


def count_objects(self):
    """
    Query the number of page objects on this page.

    Returns:
        int: The number of page objects on this page.
    """
    n_objects = pdfium.FPDFPage_CountObjects(self.raw)
    return n_objects

def get_objects(self, max_depth=3, form=None, level=0):
    """
    Iterate through the page objects on this page, descending into Form XObjects.

    Parameters:
        max_depth (int):
            Maximum recursion depth to consider when descending into Form XObjects.
        form (FPDF_PAGEOBJECT | None):
            Raw form object whose children shall be iterated, or None to iterate
            the top-level objects of the page. Set by the recursive calls.
        level (int):
            Nesting level assigned to the yielded objects. Set by the recursive calls.
    Yields:
        :class:`.PdfPageObject`: Page object helper. The nesting level is available
        through its ``level`` attribute.
    Raises:
        PdfiumError: If counting or retrieving a page object fails.
    """

    # Top-level objects are queried from the page, nested ones from their form object.
    if form is None:
        count_objects = pdfium.FPDFPage_CountObjects
        get_object = pdfium.FPDFPage_GetObject
        parent = self.raw
    else:
        count_objects = pdfium.FPDFFormObj_CountObjects
        get_object = pdfium.FPDFFormObj_GetObject
        parent = form

    n_objects = count_objects(parent)
    if n_objects < 0:
        raise PdfiumError("Failed to get number of page objects.")

    # Objects on the deepest permitted level are yielded, but not descended into.
    may_descend = level < max_depth - 1

    for i in range(n_objects):

        raw_obj = get_object(parent, i)
        # A failed lookup may surface as a NULL ctypes pointer, which is falsy
        # but is not None, so test truthiness rather than identity.
        if not raw_obj:
            raise PdfiumError("Failed to get page object.")

        helper_obj = PdfPageObject(
            raw = raw_obj,
            page = self,
            level = level,
        )
        yield helper_obj

        if may_descend and helper_obj.type == pdfium.FPDF_PAGEOBJ_FORM:
            yield from self.get_objects(
                max_depth = max_depth,
                form = raw_obj,
                level = level + 1,
            )


def render_base(
Expand Down Expand Up @@ -522,12 +551,15 @@ class PdfPageObject:
Note:
* The :attr:`.raw` attribute stores the underlying :class:`FPDF_PAGEOBJECT`.
* The :attr:`.type` attribute stores the type of the object (:data:`FPDF_PAGEOBJ_...`)
* The :attr:`.page` attribute holds a reference to the :class:`.PdfPage` this page object belongs to.
"""

def __init__(self, raw, page, level=0):
    """
    Parameters:
        raw (FPDF_PAGEOBJECT):
            The underlying page object handle.
        page (PdfPage):
            The page this object belongs to.
        level (int):
            Nesting level of the object (defaults to 0).
    """
    self.raw = raw
    self.page = page
    self.level = level
    # Queried once at construction, so callers can read the type as a plain attribute.
    self.type = pdfium.FPDFPageObj_GetType(self.raw)

def get_pos(self):
"""
Expand All @@ -541,10 +573,3 @@ def get_pos(self):
if not ret_code:
raise PdfiumError("Locating the page object failed")
return (left.value, bottom.value, right.value, top.value)

def get_type(self):
    """
    Query the type of this page object.

    Returns:
        int: The type of the object (:data:`FPDF_PAGEOBJ_...`).
    """
    obj_type = pdfium.FPDFPageObj_GetType(self.raw)
    return obj_type
2 changes: 1 addition & 1 deletion tests/helpers/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_pageobjects():
pdf = pdfium.PdfDocument(TestFiles.images)
page = pdf.get_page(0)

images = [obj for obj in page.get_objects() if obj.get_type() == pdfium.FPDF_PAGEOBJ_IMAGE]
images = [obj for obj in page.get_objects() if obj.type == pdfium.FPDF_PAGEOBJ_IMAGE]
assert len(images) == 3

positions = [img.get_pos() for img in images]
Expand Down

0 comments on commit dac4d6c

Please sign in to comment.