Skip to content

Commit

Permalink
workaround for UnicodeDecodeError issue: #144, #155
Browse files: browse the repository at this point in the history
  • Loading branch information
dothinking committed Jul 30, 2022
1 parent 6186db9 commit 7208ce3
Showing 1 changed file with 12 additions and 2 deletions.
14 changes: 12 additions & 2 deletions pdf2docx/page/RawPageFitz.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
A wrapper of PyMuPDF Page as page engine.
'''

import logging
from .RawPage import RawPage
from ..image.ImagesExtractor import ImagesExtractor
from ..shape.Paths import Paths
Expand Down Expand Up @@ -61,13 +62,22 @@ def _preprocess_text(self, **settings):
raw = self.page_engine.get_text('rawdict', flags=64)
text_blocks = raw.get('blocks', [])

# potential UnicodeDecodeError issue when trying to filter hidden text:
# https://github.com/dothinking/pdf2docx/issues/144
# https://github.com/dothinking/pdf2docx/issues/155
try:
spans = self.page_engine.get_texttrace()
except SystemError:
logging.warning('Ignore hidden text checking due to UnicodeDecodeError in upstream library.')
spans = []

if not spans: return text_blocks

# ignore hidden text if ocr=0; extract only hidden text if ocr=2
if ocr==2:
f = lambda span: span['type']!=3 # find displayed text and ignore it
else:
f = lambda span: span['type']==3 # find hidden text and ignore it

spans = self.page_engine.get_texttrace()
filtered_spans = list(filter(f, spans))

def span_area(bbox):
Expand Down

0 comments on commit 7208ce3

Please sign in to comment.