Skip to content

Commit

Permalink
textpage: replace count_chars() with char_count attribute
Browse files Browse the repository at this point in the history
  • Loading branch information
mara004 committed Sep 22, 2022
1 parent 038feb2 commit 26bf175
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 20 deletions.
1 change: 1 addition & 0 deletions docs/devel/changelog_staging.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
*API-breaking changes*
- If the target page of a bookmark cannot be identified, `OutlineItem.page_index` is now `None` rather than `-1`, to avoid accidental reverse list indexing and to enforce that callers properly handle this case. Moreover, `OutlineItem.is_closed` is now `None` rather than `False` if the bookmark has no kids.
- `PdfPageObject.get_type()` was replaced with a `type` attribute.
- `PdfTextPage.count_chars()` was replaced with a `char_count` attribute.
- `PdfPage.count_objects()` was removed. Use `PdfPage.get_objects()` or the raw PDFium API instead.
- If a negative index is passed to `PdfDocument.new_page()`, it is now interpreted in reversed direction (i.e. counted from the end of the document), rather than inserting at the beginning.
- PDFium is now provided with an external, Python-allocated buffer for rendering. This has numerous advantages, most notably that callers don't need to free resources anymore. `PdfPage.render_base()` now directly returns a ctypes ubyte array; `BitmapDataHolder` has been removed.
Expand Down
24 changes: 8 additions & 16 deletions src/pypdfium2/_helpers/textpage.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@ class PdfTextPage:
Attributes:
raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle.
page (PdfPage): Reference to the page this textpage belongs to.
char_count (int): Number of characters on the page.
"""

def __init__(self, raw, page):
self.raw = raw
self.page = page
self.char_count = pdfium.FPDFText_CountChars(self.raw)

def close(self):
"""
Expand Down Expand Up @@ -61,28 +63,19 @@ def get_text(self, left=0, bottom=0, right=0, top=0):
return text


def count_chars(self):
"""
Returns:
int: The number of characters on the page.
"""
return pdfium.FPDFText_CountChars(self.raw)


def count_rects(self, index=0, count=0):
"""
Parameters:
index (int): Character index at which to start.
count (int): Character count to consider (defaults to :meth:`.count_chars`).
count (int): Character count to consider (defaults to :attr:`.char_count`).
Returns:
int: The number of text rectangles on the page.
"""
n_chars = self.count_chars()
if n_chars == 0:
if self.char_count == 0:
return 0
if count == 0:
count = n_chars
if not (0 <= index < index+count <= n_chars):
count = self.char_count
if not (0 <= index < index+count <= self.char_count):
raise ValueError("Character span is out of bounds.")
return pdfium.FPDFText_CountRects(self.raw, index, count)

Expand Down Expand Up @@ -121,9 +114,8 @@ def get_charbox(self, index, loose=False):
Values for left, bottom, right and top in PDF canvas units.
"""

n_chars = self.count_chars()
if not 0 <= index < n_chars:
raise ValueError("Character index %s is out of bounds. The maximum index is %d." % (index, n_chars-1))
if not 0 <= index < self.char_count:
raise ValueError("Character index %s is out of bounds. The maximum index is %d." % (index, self.char_count-1))

if loose:
rect = pdfium.FS_RECTF()
Expand Down
7 changes: 3 additions & 4 deletions tests/helpers/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def test_gettext(textpage):

@pytest.mark.parametrize("loose", [False, True])
def test_getcharbox(textpage, loose):
for index in range( textpage.count_chars() ):
for index in range(textpage.char_count):
box = textpage.get_charbox(index, loose=loose)
assert all( isinstance(val, (int, float)) for val in box )
assert box[0] <= box[2] and box[1] <= box[3]
Expand Down Expand Up @@ -117,9 +117,8 @@ def test_get_index(textpage):

x, y = (60, textpage.page.get_height()-66)

n_chars = textpage.count_chars()
index = textpage.get_index(x, y, 5, 5)
assert index < n_chars and index == 0
assert index < textpage.char_count and index == 0

charbox = textpage.get_charbox(index)
char = textpage.get_text(*charbox)
Expand All @@ -131,8 +130,8 @@ def test_textpage_empty():
page = pdf.get_page(0)
textpage = page.get_textpage()

assert textpage.char_count == 0
assert textpage.get_text() == ""
assert textpage.count_chars() == 0
assert textpage.count_rects() == 0
assert textpage.get_index(0, 0, 0, 0) is None
assert [r for r in textpage.get_rectboxes()] == []
Expand Down

0 comments on commit 26bf175

Please sign in to comment.