From dac4d6c5b68e8c28401c95326c7e47207d340aa9 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 15 Sep 2022 16:48:42 +0200 Subject: [PATCH] pageobjects: descend into form objects CC #137 --- src/pypdfium2/_cli/find_pageobjects.py | 20 ++++---- src/pypdfium2/_helpers/page.py | 65 ++++++++++++++++++-------- tests/helpers/test_page.py | 2 +- 3 files changed, 56 insertions(+), 31 deletions(-) diff --git a/src/pypdfium2/_cli/find_pageobjects.py b/src/pypdfium2/_cli/find_pageobjects.py index 30bf2d668..b7cee9692 100644 --- a/src/pypdfium2/_cli/find_pageobjects.py +++ b/src/pypdfium2/_cli/find_pageobjects.py @@ -11,30 +11,31 @@ def attach_parser(subparsers): + obj_types = list(ObjtypeToConst.keys()) parser = subparsers.add_parser( "find-pageobjects", - help = "Locate page objects of a certain type", + help = "Locate page objects of given types.", ) parser.add_argument( "input", type = os.path.abspath, - help = "Path to the PDF document to work with", + help = "Path to the PDF document to work with.", ) parser.add_argument( "--password", - help = "Password to unlock the PDF, if encrypted" + help = "Password to unlock the PDF, if encrypted." 
) parser.add_argument( "--pages", type = pagetext_type, - help = "The pages to search (defaults to all)", + help = "The pages to search (defaults to all).", ) parser.add_argument( "--types", nargs = "+", - required = True, - choices = list(ObjtypeToConst.keys()), - help = "Object types to consider", + choices = obj_types, + default = obj_types, + help = "Object types to consider (defaults to all).", ) @@ -48,9 +49,8 @@ def main(args): for index in args.pages: page = doc.get_page(index) for obj in page.get_objects(): - type = obj.get_type() - if type in args.types: - print(ObjtypeToName[type], obj.get_pos()) + if obj.type in args.types: + print(" "*obj.level + ObjtypeToName[obj.type], obj.get_pos()) page.close() doc.close() diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index 53c47aa2d..02df5e2a4 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -225,23 +225,52 @@ def insert_text( pdfium.FPDFPage_GenerateContent(self.raw) - def count_objects(self): - """ - Returns: - int: The number of page objects on this page. - """ - return pdfium.FPDFPage_CountObjects(self.raw) - - def get_objects(self): + def get_objects(self, max_depth=3, form=None, level=0): """ Iterate through the page objects on this page. + Parameters: + max_depth (int): + Maximum recursion depth to consider when descending into Form XObjects. + Yields: - :class:`.PdfPageObject`: Page object helper. + :class:`.PdfPageObject`: Page object helper; its nesting level is stored in the ``level`` attribute. 
""" - for i in range( self.count_objects() ): - raw_obj = pdfium.FPDFPage_GetObject(self.raw, i) - yield PdfPageObject(raw_obj, self) + + if form is None: + count_objects = pdfium.FPDFPage_CountObjects + get_object = pdfium.FPDFPage_GetObject + parent = self.raw + else: + count_objects = pdfium.FPDFFormObj_CountObjects + get_object = pdfium.FPDFFormObj_GetObject + parent = form + + n_objects = count_objects(parent) + if n_objects == 0: + return + elif n_objects < 0: + raise PdfiumError("Failed to get number of page objects.") + + for i in range(n_objects): + + raw_obj = get_object(parent, i) + if raw_obj is None: + raise PdfiumError("Failed to get page object.") + + helper_obj = PdfPageObject( + raw = raw_obj, + page = self, + level = level, + ) + yield helper_obj + + if level < max_depth-1 and helper_obj.type == pdfium.FPDF_PAGEOBJ_FORM: + yield from self.get_objects( + max_depth = max_depth, + form = raw_obj, + level = level + 1, + ) def render_base( @@ -522,12 +551,15 @@ class PdfPageObject: Note: * The :attr:`.raw` attribute stores the underlying :class:`FPDF_PAGEOBJECT`. + * The :attr:`.type` attribute stores the type of the object (:data:`FPDF_PAGEOBJ_...`). * The :attr:`.page` attribute holds a reference to the :class:`.PdfPage` this page object belongs to. """ - def __init__(self, raw, page): + def __init__(self, raw, page, level=0): self.raw = raw self.page = page + self.level = level + self.type = pdfium.FPDFPageObj_GetType(self.raw) def get_pos(self): """ @@ -541,10 +573,3 @@ def get_pos(self): if not ret_code: raise PdfiumError("Locating the page object failed") return (left.value, bottom.value, right.value, top.value) - - def get_type(self): - """ - Returns: - int: The type of the object (:data:`FPDF_PAGEOBJ_...`). 
- """ - return pdfium.FPDFPageObj_GetType(self.raw) diff --git a/tests/helpers/test_page.py b/tests/helpers/test_page.py index e0cb09a17..e0371b3b4 100644 --- a/tests/helpers/test_page.py +++ b/tests/helpers/test_page.py @@ -52,7 +52,7 @@ def test_pageobjects(): pdf = pdfium.PdfDocument(TestFiles.images) page = pdf.get_page(0) - images = [obj for obj in page.get_objects() if obj.get_type() == pdfium.FPDF_PAGEOBJ_IMAGE] + images = [obj for obj in page.get_objects() if obj.type == pdfium.FPDF_PAGEOBJ_IMAGE] assert len(images) == 3 positions = [img.get_pos() for img in images]