diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 3422d4c99..fb166d1ca 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -10,6 +10,7 @@ *API-breaking changes* - If the target page of a bookmark cannot be identified, `OutlineItem.page_index` is now `None` rather than `-1`, to avoid accidental reverse list indexing and to enforce that callers properly handle this case. Moreover, `OutlineItem.is_closed` is now `None` rather than `False` if the bookmark has no kids. - `PdfPageObject.get_type()` was replaced with a `type` attribute. + - `PdfTextPage.count_chars()` was replaced with a `char_count` attribute. - `PdfPage.count_objects()` was removed. Use `PdfPage.get_objects()` or the raw PDFium API instead. - If a negative index is passed to `PdfDocument.new_page()`, it is now interpreted in reversed direction, rather than inserting at the beginning. - PDFium is now provided with an external, python-allocated buffer for rendering. This has numerous advantages, most notably that callers don't need to free resources anymore. `PdfPage.render_base()` now directly returns a ctypes ubyte array; `BitmapDataHolder` has been removed. diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py index c5d29ee91..b0e0aacf9 100644 --- a/src/pypdfium2/_helpers/textpage.py +++ b/src/pypdfium2/_helpers/textpage.py @@ -14,11 +14,13 @@ class PdfTextPage: Attributes: raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle. page (PdfPage): Reference to the page this textpage belongs to. + char_count (int): Number of characters on the page. """ def __init__(self, raw, page): self.raw = raw self.page = page + self.char_count = pdfium.FPDFText_CountChars(self.raw) def close(self): """ @@ -61,28 +63,19 @@ def get_text(self, left=0, bottom=0, right=0, top=0): return text - def count_chars(self): - """ - Returns: - int: The number of characters on the page. - """ - return pdfium.FPDFText_CountChars(self.raw) - - def count_rects(self, index=0, count=0): """ Parameters: index (int): Character index at which to start. - count (int): Character count to consider (defaults to :meth:`.count_chars`). + count (int): Character count to consider (defaults to :attr:`.char_count`). Returns: int: The number of text rectangles on the page. """ - n_chars = self.count_chars() - if n_chars == 0: + if self.char_count == 0: return 0 if count == 0: - count = n_chars - if not (0 <= index < index+count <= n_chars): + count = self.char_count + if not (0 <= index < index+count <= self.char_count): raise ValueError("Character span is out of bounds.") return pdfium.FPDFText_CountRects(self.raw, index, count) @@ -121,9 +114,8 @@ def get_charbox(self, index, loose=False): Values for left, bottom, right and top in PDF canvas units. """ - n_chars = self.count_chars() - if not 0 <= index < n_chars: - raise ValueError("Character index %s is out of bounds. The maximum index is %d." % (index, n_chars-1)) + if not 0 <= index < self.char_count: + raise ValueError("Character index %s is out of bounds. The maximum index is %d." % (index, self.char_count-1)) if loose: rect = pdfium.FS_RECTF() diff --git a/tests/helpers/test_text.py b/tests/helpers/test_text.py index fc5ad13ec..489c96733 100644 --- a/tests/helpers/test_text.py +++ b/tests/helpers/test_text.py @@ -42,7 +42,7 @@ def test_gettext(textpage): @pytest.mark.parametrize("loose", [False, True]) def test_getcharbox(textpage, loose): - for index in range( textpage.count_chars() ): + for index in range(textpage.char_count): box = textpage.get_charbox(index, loose=loose) assert all( isinstance(val, (int, float)) for val in box ) assert box[0] <= box[2] and box[1] <= box[3] @@ -117,9 +117,8 @@ def test_get_index(textpage): x, y = (60, textpage.page.get_height()-66) - n_chars = textpage.count_chars() index = textpage.get_index(x, y, 5, 5) - assert index < n_chars and index == 0 + assert index < textpage.char_count and index == 0 charbox = textpage.get_charbox(index) char = textpage.get_text(*charbox) @@ -131,8 +130,8 @@ def test_textpage_empty(): page = pdf.get_page(0) textpage = page.get_textpage() + assert textpage.char_count == 0 assert textpage.get_text() == "" - assert textpage.count_chars() == 0 assert textpage.count_rects() == 0 assert textpage.get_index(0, 0, 0, 0) is None assert [r for r in textpage.get_rectboxes()] == []