Skip to content

Commit

Permalink
Partially fix #261
Browse files Browse the repository at this point in the history
  • Loading branch information
mara004 committed Sep 21, 2023
1 parent 339e0b1 commit a9c6485
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions src/pypdfium2/_helpers/textpage.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,17 @@ def get_text_range(self, index=0, count=-1, errors="ignore"):
str: The text in the range in question, or an empty string if no text was found.
"""

# NOTE As of this writing, the GetText() API can't be called with a NULL buffer to get the required amount of memory first, thus calculate from char count.
# BUG(261) In some corner cases, the number of chars actually written to the buffer can be smaller, though, so use the char count returned by GetText() to slice the buffer.

if count == -1:
count = self.count_chars() - index

# NOTE Apparently, pdfium counts a surrogate pair as two separate chars, so sizing the buffer from the char count allocates enough memory and yields the right result
# (this is poor API design, though - pdfium should handle surrogate pairs properly, and the API should report the exact amount of memory needed)
n_bytes = count * 2
buffer = ctypes.create_string_buffer(n_bytes+2)
buffer = ctypes.create_string_buffer((count+1)*2)
buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort))
out_size = pdfium_c.FPDFText_GetText(self, index, count, buffer_ptr)

pdfium_c.FPDFText_GetText(self, index, count, buffer_ptr)
return buffer.raw[:n_bytes].decode("utf-16-le", errors=errors)
return buffer.raw[:(out_size-1)*2].decode("utf-16-le", errors=errors)


def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore"):
Expand Down

0 comments on commit a9c6485

Please sign in to comment.