From 9e2572bf5d100a5a7521576908eb5713e0dd24c8 Mon Sep 17 00:00:00 2001 From: Lukas Blecher Date: Wed, 20 Sep 2023 10:31:24 +0200 Subject: [PATCH] dep replacement: pdf2image -> pypdfium2 --- nougat/_version.py | 2 +- nougat/dataset/rasterize.py | 35 ++++++++++++++--------------------- setup.py | 4 ++-- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/nougat/_version.py b/nougat/_version.py index 2f1b961..81beee3 100644 --- a/nougat/_version.py +++ b/nougat/_version.py @@ -5,4 +5,4 @@ LICENSE file in the root directory of this source tree. """ -__version__ = "0.1.9" +__version__ = "0.1.10" diff --git a/nougat/dataset/rasterize.py b/nougat/dataset/rasterize.py index 0241bb4..c5af924 100644 --- a/nougat/dataset/rasterize.py +++ b/nougat/dataset/rasterize.py @@ -5,8 +5,7 @@ LICENSE file in the root directory of this source tree. """ import argparse -import pdf2image -import pypdf +import pypdfium2 from pathlib import Path from tqdm import tqdm import io @@ -38,27 +37,21 @@ def rasterize_paper( return_pil = True try: if isinstance(pdf, (str, Path)): - pdf = pypdf.PdfReader(pdf) + pdf = pypdfium2.PdfDocument(pdf) if pages is None: - pages = range(len(pdf.pages)) - for i in pages: - page_bytes = io.BytesIO() - writer = pypdf.PdfWriter() - writer.add_page(pdf.pages[i]) - writer.write(page_bytes) - page_bytes = page_bytes.getvalue() - img = pdf2image.convert_from_bytes( - page_bytes, - dpi=dpi, - fmt="ppm" if outpath is None else "png", - output_folder=None if outpath is None else outpath, - single_file=True, - output_file="%02d" % (i + 1), - )[0] + pages = range(len(pdf)) + renderer = pdf.render( + pypdfium2.PdfBitmap.to_pil, + page_indices=pages, + scale=dpi / 72, + ) + for i, image in zip(pages, renderer): if return_pil: - img_bytes = io.BytesIO() - img.save(img_bytes, format=img.format) - pils.append(img_bytes) + page_bytes = io.BytesIO() + image.save(page_bytes, "bmp") + pils.append(page_bytes) + else: + image.save((outpath / ("%02d.png" % (i + 1))), "png") except Exception: pass if return_pil: diff --git a/setup.py b/setup.py index b6608de..3af94a3 100644 --- a/setup.py +++ b/setup.py @@ -53,9 +53,9 @@ def read_long_description(): "python-Levenshtein", "sentencepiece", "sconf>=0.2.3", - "albumentations", + "albumentations>=1.0.0", "pypdf>=3.1.0", - "pdf2image", + "pypdfium2", ], extras_require={ "api": [