From f4c1946df6b4f22a2dc2f903c29b0dd0448c06a7 Mon Sep 17 00:00:00 2001
From: Amit Dovev
Date: Mon, 7 Nov 2022 15:10:18 +0200
Subject: [PATCH] pdfrenderer.cpp: Ignore non-text blocks

Fix #3957.

Non-text blocks (images, separator lines, ...) are skipped when emitting
the invisible text layer. The iterator is advanced to the next block
before skipping; a bare `continue` would re-test the same block forever.
---
 src/api/pdfrenderer.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/api/pdfrenderer.cpp b/src/api/pdfrenderer.cpp
index 774558aed7..d5f020a7ca 100644
--- a/src/api/pdfrenderer.cpp
+++ b/src/api/pdfrenderer.cpp
@@ -25,6 +25,7 @@
 
 #include
 #include
+#include <tesseract/publictypes.h> // for PTIsTextType()
 #include
 #include
 #include
@@ -354,6 +355,14 @@ char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double
   const std::unique_ptr res_it(api->GetIterator());
   while (!res_it->Empty(RIL_BLOCK)) {
     if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
+      auto block_type = res_it->BlockType();
+      if (!PTIsTextType(block_type)) {
+        // Ignore non-text blocks. Step past the block first: `continue`
+        // jumps straight back to the loop test, so without advancing the
+        // iterator the same block would be examined forever.
+        res_it->Next(RIL_BLOCK);
+        continue;
+      }
       pdf_str << "BT\n3 Tr"; // Begin text object, use invisible ink
       old_fontsize = 0;      // Every block will declare its fontsize
       new_block = true;      // Every block will declare its affine matrix