From a9c64851eeec36ba0c352ecf776bbf1a92ab0a85 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 21 Sep 2023 14:47:31 +0200 Subject: [PATCH] Partially fix #261 --- src/pypdfium2/_helpers/textpage.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py index 27741fa8f..41888e118 100644 --- a/src/pypdfium2/_helpers/textpage.py +++ b/src/pypdfium2/_helpers/textpage.py @@ -47,17 +47,17 @@ def get_text_range(self, index=0, count=-1, errors="ignore"): str: The text in the range in question, or an empty string if no text was found. """ + # NOTE As of this writing, the GetText() API can't be called with a NULL buffer to get the required amount of memory first, thus calculate from char count. + # BUG(261) In some corner cases, the number of chars actually written to the buffer can be smaller, though, so use the char count returned by GetText() to slice the buffer. + if count == -1: count = self.count_chars() - index - # NOTE apparently, pdfium treats a surrogate pair like two separate chars, so this allocates enough memory and we end up with the right result - # (which however is no good API design - pdfium should properly handle surrogation and the API should tell the exact amount of memory needed) - n_bytes = count * 2 - buffer = ctypes.create_string_buffer(n_bytes+2) + buffer = ctypes.create_string_buffer((count+1)*2) buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort)) + out_size = pdfium_c.FPDFText_GetText(self, index, count, buffer_ptr) - pdfium_c.FPDFText_GetText(self, index, count, buffer_ptr) - return buffer.raw[:n_bytes].decode("utf-16-le", errors=errors) + return buffer.raw[:(out_size-1)*2].decode("utf-16-le", errors=errors) def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore"):