textpage: replace count_chars() with char_count attribute

pypdfium2-team · Sep 22, 2022 · 26bf175 · 26bf175
1 parent 038feb2
commit 26bf175
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 20 deletions.
diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
@@ -10,6 +10,7 @@
     *API-breaking changes*
     - If the target page of a bookmark cannot be identified, `OutlineItem.page_index` is now `None` rather than `-1`, to avoid accidental reverse list indexing and to enforce that callers properly handle this case. Moreover, `OutlineItem.is_closed` is now `None` rather than `False` if the bookmark has no kids.
     - `PdfPageObject.get_type()` was replaced with a `type` attribute.
+    - `PdfTextPage.count_chars()` was replaced with a `char_count` attribute.
     - `PdfPage.count_objects()` was removed. Use `PdfPage.get_objects()` or the raw PDFium API instead.
     - If a negative index is passed to `PdfDocument.new_page()`, it is now interpreted in reversed direction, rather than inserting at the beginning.
     - PDFium is now provided with an external, python-allocated buffer for rendering. This has numerous advantages, most notably that callers don't need to free resources anymore. `PdfPage.render_base()` now directly returns a ctypes ubyte array; `BitmapDataHolder` has been removed.

diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py
@@ -14,11 +14,13 @@ class PdfTextPage:
     Attributes:
         raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle.
         page (PdfPage): Reference to the page this textpage belongs to.
+        char_count (int): Number of characters on the page.
     """
 
     def __init__(self, raw, page):
         self.raw = raw
         self.page = page
+        self.char_count = pdfium.FPDFText_CountChars(self.raw)
 
     def close(self):
         """
@@ -61,28 +63,19 @@ def get_text(self, left=0, bottom=0, right=0, top=0):
         return text
 
 
-    def count_chars(self):
-        """
-        Returns:
-            int: The number of characters on the page.
-        """
-        return pdfium.FPDFText_CountChars(self.raw)
-
-
     def count_rects(self, index=0, count=0):
         """
         Parameters:
             index (int): Character index at which to start.
-            count (int): Character count to consider (defaults to :meth:`.count_chars`).
+            count (int): Character count to consider (defaults to :attr:`.char_count`).
         Returns:
             int: The number of text rectangles on the page.
         """
-        n_chars = self.count_chars()
-        if n_chars == 0:
+        if self.char_count == 0:
             return 0
         if count == 0:
-            count = n_chars
-        if not (0 <= index < index+count <= n_chars):
+            count = self.char_count
+        if not (0 <= index < index+count <= self.char_count):
             raise ValueError("Character span is out of bounds.")
         return pdfium.FPDFText_CountRects(self.raw, index, count)
 
@@ -121,9 +114,8 @@ def get_charbox(self, index, loose=False):
             Values for left, bottom, right and top in PDF canvas units.
         """
 
-        n_chars = self.count_chars()
-        if not 0 <= index < n_chars:
-            raise ValueError("Character index %s is out of bounds. The maximum index is %d." % (index, n_chars-1))
+        if not 0 <= index < self.char_count:
+            raise ValueError("Character index %s is out of bounds. The maximum index is %d." % (index, self.char_count-1))
 
         if loose:
             rect = pdfium.FS_RECTF()

diff --git a/tests/helpers/test_text.py b/tests/helpers/test_text.py
@@ -42,7 +42,7 @@ def test_gettext(textpage):
 
 @pytest.mark.parametrize("loose", [False, True])
 def test_getcharbox(textpage, loose):
-    for index in range( textpage.count_chars() ):
+    for index in range(textpage.char_count):
         box = textpage.get_charbox(index, loose=loose)
         assert all( isinstance(val, (int, float)) for val in box )
         assert box[0] <= box[2] and box[1] <= box[3]
@@ -117,9 +117,8 @@ def test_get_index(textpage):
 
     x, y = (60, textpage.page.get_height()-66)
 
-    n_chars = textpage.count_chars()
     index = textpage.get_index(x, y, 5, 5)
-    assert index < n_chars and index == 0
+    assert index < textpage.char_count and index == 0
 
     charbox = textpage.get_charbox(index)
     char = textpage.get_text(*charbox)
@@ -131,8 +130,8 @@ def test_textpage_empty():
     page = pdf.get_page(0)
     textpage = page.get_textpage()
 
+    assert textpage.char_count == 0
     assert textpage.get_text() == ""
-    assert textpage.count_chars() == 0
     assert textpage.count_rects() == 0
     assert textpage.get_index(0, 0, 0, 0) is None
     assert [r for r in textpage.get_rectboxes()] == []