Skip to content

Commit

Permalink
Dependency replacement: pdf2image -> pypdfium2
Browse files: browse the repository at this point in the history
  • Loading branch information
lukas-blecher committed Sep 20, 2023
1 parent 84b3ae1 commit 9e2572b
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 24 deletions.
2 changes: 1 addition & 1 deletion nougat/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@
LICENSE file in the root directory of this source tree.
"""

__version__ = "0.1.9"
__version__ = "0.1.10"
35 changes: 14 additions & 21 deletions nougat/dataset/rasterize.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
LICENSE file in the root directory of this source tree.
"""
import argparse
import pdf2image
import pypdf
import pypdfium2
from pathlib import Path
from tqdm import tqdm
import io
Expand Down Expand Up @@ -38,27 +37,21 @@ def rasterize_paper(
return_pil = True
try:
if isinstance(pdf, (str, Path)):
pdf = pypdf.PdfReader(pdf)
pdf = pypdfium2.PdfDocument(pdf)
if pages is None:
pages = range(len(pdf.pages))
for i in pages:
page_bytes = io.BytesIO()
writer = pypdf.PdfWriter()
writer.add_page(pdf.pages[i])
writer.write(page_bytes)
page_bytes = page_bytes.getvalue()
img = pdf2image.convert_from_bytes(
page_bytes,
dpi=dpi,
fmt="ppm" if outpath is None else "png",
output_folder=None if outpath is None else outpath,
single_file=True,
output_file="%02d" % (i + 1),
)[0]
pages = range(len(pdf))
renderer = pdf.render(
pypdfium2.PdfBitmap.to_pil,
page_indices=pages,
scale=dpi / 72,
)
for i, image in zip(pages, renderer):
if return_pil:
img_bytes = io.BytesIO()
img.save(img_bytes, format=img.format)
pils.append(img_bytes)
page_bytes = io.BytesIO()
image.save(page_bytes, "bmp")
pils.append(page_bytes)
else:
image.save((outpath / ("%02d.png" % (i + 1))), "png")
except Exception:
pass
if return_pil:
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ def read_long_description():
"python-Levenshtein",
"sentencepiece",
"sconf>=0.2.3",
"albumentations",
"albumentations>=1.0.0",
"pypdf>=3.1.0",
"pdf2image",
"pypdfium2",
],
extras_require={
"api": [
Expand Down

0 comments on commit 9e2572b

Please sign in to comment.