From aa6aee8bcd9da84a4f5b5dc1614fb6e200228d0d Mon Sep 17 00:00:00 2001 From: Edouard Belval Date: Mon, 5 Sep 2022 16:01:37 -0400 Subject: [PATCH] Version 1.16.1 --- README.md | 6 +- docs/conf.py | 19 +- docs/index.rst | 1 + docs/installation.md | 23 +-- docs/known_issues.md | 17 ++ docs/reference.md | 189 ------------------- docs/reference.rst | 20 ++ pdf2image/exceptions.py | 10 +- pdf2image/parsers.py | 41 +++- pdf2image/pdf2image.py | 408 ++++++++++++++++++++++++++-------------- setup.py | 6 +- tests.py | 53 ++++-- 12 files changed, 403 insertions(+), 390 deletions(-) create mode 100644 docs/known_issues.md delete mode 100644 docs/reference.md create mode 100644 docs/reference.rst diff --git a/README.md b/README.md index 9d7ab07..f40cdad 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # pdf2image [![CircleCI](https://circleci.com/gh/Belval/pdf2image/tree/master.svg?style=svg)](https://circleci.com/gh/Belval/pdf2image/tree/master) [![PyPI version](https://badge.fury.io/py/pdf2image.svg)](https://badge.fury.io/py/pdf2image) [![codecov](https://codecov.io/gh/Belval/pdf2image/branch/master/graph/badge.svg)](https://codecov.io/gh/Belval/pdf2image) [![Downloads](https://pepy.tech/badge/pdf2image/month)](https://pepy.tech/project/pdf2image) [![Documentation Status](https://readthedocs.org/projects/pdf2image/badge/?version=latest)](https://pdf2image.readthedocs.io/en/latest/?badge=latest) -A python (3.6+) module that wraps pdftoppm and pdftocairo to convert PDF to a PIL Image object +A python (3.7+) module that wraps pdftoppm and pdftocairo to convert PDF to a PIL Image object ## How to install @@ -68,9 +68,9 @@ with tempfile.TemporaryDirectory() as path: Here are the definitions: -`convert_from_path(pdf_path, dpi=200, output_folder=None, first_page=None, last_page=None, fmt='ppm', jpegopt=None, thread_count=1, userpw=None, use_cropbox=False, strict=False, transparent=False, single_file=False, output_file=str(uuid.uuid4()), poppler_path=None, grayscale=False, 
size=None, paths_only=False, use_pdftocairo=False, timeout=600)` +`convert_from_path(pdf_path, dpi=200, output_folder=None, first_page=None, last_page=None, fmt='ppm', jpegopt=None, thread_count=1, userpw=None, use_cropbox=False, strict=False, transparent=False, single_file=False, output_file=str(uuid.uuid4()), poppler_path=None, grayscale=False, size=None, paths_only=False, use_pdftocairo=False, timeout=600, hide_annotations=False)` -`convert_from_bytes(pdf_file, dpi=200, output_folder=None, first_page=None, last_page=None, fmt='ppm', jpegopt=None, thread_count=1, userpw=None, use_cropbox=False, strict=False, transparent=False, single_file=False, output_file=str(uuid.uuid4()), poppler_path=None, grayscale=False, size=None, paths_only=False, use_pdftocairo=False, timeout=600)` +`convert_from_bytes(pdf_file, dpi=200, output_folder=None, first_page=None, last_page=None, fmt='ppm', jpegopt=None, thread_count=1, userpw=None, use_cropbox=False, strict=False, transparent=False, single_file=False, output_file=str(uuid.uuid4()), poppler_path=None, grayscale=False, size=None, paths_only=False, use_pdftocairo=False, timeout=600, hide_annotations=False)` ## What's new? diff --git a/docs/conf.py b/docs/conf.py index 11a0361..c4291c5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,19 +11,20 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) +import os +import sys + +sys.path.insert(0, os.path.abspath("..")) # -- Project information ----------------------------------------------------- project = "pdf2image" -copyright = "2019, Edouard Belval" +copyright = "2022, Edouard Belval" author = "Edouard Belval" # The short X.Y version -version = "" +version = "1.16.1" # The full version, including alpha/beta/rc tags release = "latest" @@ -40,6 +41,10 @@ extensions = [ "sphinx.ext.mathjax", "sphinx.ext.viewcode", + "sphinx.ext.autodoc", + "sphinx.ext.coverage", + "recommonmark", + "sphinx_rtd_theme", ] # Add any paths that contain templates here, relative to this directory. @@ -59,7 +64,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -75,7 +80,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = "alabaster" +html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the diff --git a/docs/index.rst b/docs/index.rst index 8c86058..a020d4f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -15,4 +15,5 @@ If you are new to the project, start with the installation section! installation overview + known_issues reference diff --git a/docs/installation.md b/docs/installation.md index d237d5f..cddab8a 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -33,22 +33,7 @@ Poppler is the underlying project that does the magic in pdf2image. You can chec ### Windows -1. Download the latest package from http://blog.alivate.com.au/poppler-windows/ -2. Extract the package -3. 
Move the extracted directory to the desired place on your system -4. Add the `bin/` directory to your [PATH](https://www.architectryan.com/2018/03/17/add-to-the-path-on-windows-10/) -5. Test that all went well by opening `cmd` and making sure that you can call `pdftoppm -h` - -## Solution for DocuSign issue -If you have this [error](https://stackoverflow.com/questions/66636441/pdf2image-library-failing-to-read-pdf-signed-using-docusign): -```bash -pdf2image.exceptions.PDFPageCountError: Unable to get page count. -Syntax Error: Gen inside xref table too large (bigger than INT_MAX) -Syntax Error: Invalid XRef entry 3 -Syntax Error: Top-level pages object is wrong type (null) -Command Line Error: Wrong page range given: the first page (1) can not be after the last page (0). -``` - -You are possibly using an old version of poppler. The solution is to update to the latest version. Similarly, if you are working with Docker (Debian 11 Image), maybe you can not update poppler because is not available. So, you have to use an image in ubuntu, install Python and then what you need. - -More details [here](https://github.com/Belval/pdf2image/issues/234). \ No newline at end of file +1. Download the latest poppler package from [@oschwartz10612 version](https://github.com/oschwartz10612/poppler-windows/releases/) which is the most up-to-date. +2. Move the extracted directory to the desired place on your system +3. Add the `bin/` directory to your [PATH](https://www.architectryan.com/2018/03/17/add-to-the-path-on-windows-10/) +4. 
Test that all went well by opening `cmd` and making sure that you can call `pdftoppm -h` diff --git a/docs/known_issues.md b/docs/known_issues.md new file mode 100644 index 0000000..ff03ba3 --- /dev/null +++ b/docs/known_issues.md @@ -0,0 +1,17 @@ +# Limitations / Known Issues + +## DocuSign PDFs + +If you have this [error](https://stackoverflow.com/questions/66636441/pdf2image-library-failing-to-read-pdf-signed-using-docusign): + +```bash +pdf2image.exceptions.PDFPageCountError: Unable to get page count. +Syntax Error: Gen inside xref table too large (bigger than INT_MAX) +Syntax Error: Invalid XRef entry 3 +Syntax Error: Top-level pages object is wrong type (null) +Command Line Error: Wrong page range given: the first page (1) can not be after the last page (0). +``` + +You are possibly using an old version of poppler. The solution is to update to the latest version. If you are working with Docker (Debian 11 image), you may not be able to update poppler because a newer version is not available there. In that case, use an Ubuntu-based image, then install Python and whatever else you need. + +More details [here](https://github.com/Belval/pdf2image/issues/234). 
\ No newline at end of file diff --git a/docs/reference.md b/docs/reference.md deleted file mode 100644 index d26b95b..0000000 --- a/docs/reference.md +++ /dev/null @@ -1,189 +0,0 @@ -# Reference - -## Functions - -### convert_from_path & convert_from_bytes - -Converts a PDF into image(s) - -```py -convert_from_path( - pdf_path, - dpi=200, - output_folder=None, - first_page=None, - last_page=None, - fmt="ppm", - jpegopt=None, - thread_count=1, - userpw=None, - use_cropbox=False, - strict=False, - transparent=False, - single_file=False, - output_file=uuid_generator(), - poppler_path=None, - grayscale=False, - size=None, - paths_only=False, - hide_annotations=False, -) - -convert_from_bytes( - pdf_bytes, - dpi=200, - output_folder=None, - first_page=None, - last_page=None, - fmt="ppm", - jpegopt=None, - thread_count=1, - userpw=None, - use_cropbox=False, - strict=False, - transparent=False, - single_file=False, - output_file=uuid_generator(), - poppler_path=None, - grayscale=False, - size=None, - paths_only=False, - hide_annotations=False, -) -``` - -**pdf_path** - -Path to the PDF file. Can be a string or a `pathlib.Path` object - -**pdf_bytes** - -Bytes of the PDF file. - -**dpi** - -Dots per inch, can be seen as the relative resolution of the output PDF, higher is better but anything above 300 is usually not discernable to the naked eye. Keep in mind that this is directly related to the ouput images size when using file formats without compression (like PPM) - -**output_folder** - -Output directory for the generated files, should be seen more as a "working directory" than an output folder. The converted images will be written there to save system memory. - -**first_page** - -First page that will be converted. `first_page=2` will skip page 1. - -**last_page** - -Last page that will be converted. `last_page=2` will skip all pages after page 2. - -**fmt** - -File format or the output images. Supported values are `ppm`, `jpeg`, `png` and `tiff`. 
- -**jpegopt** - -Configuration for the jpeg output format. As such, only used with `fmt='jpeg'`. - -``` -jpegopt={ - "quality": 100, - "progressive": True, - "optimize": True -} -``` - -- `quality`: Selects the JPEG quality value. The value must be an integer between 0 and 100. -- `progressive`: Select progressive JPEG output. The possible values are `True`, `False`, indicating progressive (yes) or non-progressive (no), respectively. -- `optimize`: Sets whether to compute optimal Huffman coding tables for the JPEG output, which will create smaller files but make an extra pass over the data. The value must be `True` or `False`, with `True` performing optimization, otherwise the default Huffman tables are used. - -**thread_count** - -Number of threads to use when converting the PDF. Limited to the actual number of pages. - -**userpw** - -Password for the PDF if it is password-protected. - -**use_cropbox** - -Uses the PDF cropbox instead of the default mediabox. This is a rather dark feature that should be set to true when the module does not seem to work with your data. - -**strict** - -Raises PDFSyntaxError when the PDF is partially malformed. Most PDF are partially malformed and that parameter should be kept to `False`, unless standard compliance is paramount to your use case. - -**transparent** - -Instead of returning a white background, make the PDF background transparent. Only compatible with file formats that support transparency. - -**single_file** - -Only convert the PDF first page and does not append an index to the output file name. - -**output_file** - -Output filename, normally string, but can take a string generator. - -**poppler_path** - -Path to the poppler directory containing librairies and executable files. - -**grayscale** - -Returns grayscale images - -**size** - -Size of output images, using `None` as any of the dimension will resize and preserve aspect ratio. 
- -Examples of valid sizes are: - -- `size=400` will fit the image to a 400x400 box, preserving aspect ratio -- `size=(400, None)` will make the image 400 pixels wide, preserving aspect ratio -- `size=(500, 500)` will resize the image to 500x500 pixels, not preserving aspect ratio - -This behavior is derived directly from the `-scale-to`, `-scale-to-x`, and `-scale-to-y` parameters. - -**paths_only** - -A list of image paths rather than preloaded images are returned. - -**jpegopt** - -Provide additional options for jpeg format conversions. Requires `fmt="jpeg"` and is provided as dict, with all -optinal keywords: -`jpegopt={"quality": 100, "optimize": True, "progressive": False}` - -**hide_annotations** - -Hide link bounding boxes and other PDF annotations. This is only implemented in pdftoppm at the moment so it -cannot be combined with pdftocairo flags. - -## Exceptions - -```py -from pdf2image.exceptions import ( - PDFInfoNotInstalledError, - PDFPageCountError, - PDFSyntaxError -) -``` - -### PDFInfoNotInstalledError - -Exception raised when `pdfinfo`, which is part of poppler-utils, was not found on your system. This can be tested by trying to call it from your command line. - -When this error is raised, the error is almost always installation related. - -### PDFPageCountError - -Exception raised when `pdfinfo`, which is part of poppler-utils, was unable to get the page count from the PDF file. This is usually due to: - -- An invalid PDF file path -- A malformed or invalid PDF - -### PDFSyntaxError - -Exception raised when `convert_from_path` or `convert_from_bytes` is called using `strict=True` and the input PDF contained a syntax error. Simply use `strict=False` will usually solve this issue. - -Note that most PDF contain syntax errors and you can safely ignore strict mode. 
diff --git a/docs/reference.rst b/docs/reference.rst new file mode 100644 index 0000000..3424381 --- /dev/null +++ b/docs/reference.rst @@ -0,0 +1,20 @@ +Reference +********** + +Main functions +-------------- + +.. automodule:: pdf2image.pdf2image + :members: + +Exceptions +---------- + +.. automodule:: pdf2image.exceptions + :members: + +Parsers +------- + +.. automodule:: pdf2image.parsers + :members: \ No newline at end of file diff --git a/pdf2image/exceptions.py b/pdf2image/exceptions.py index 64d65a0..bf20108 100644 --- a/pdf2image/exceptions.py +++ b/pdf2image/exceptions.py @@ -4,30 +4,30 @@ class PopplerNotInstalledError(Exception): - """Happens when poppler is not installed""" + """Raised when poppler is not installed""" pass class PDFInfoNotInstalledError(PopplerNotInstalledError): - """Happens when pdfinfo is not installed""" + """Raised when pdfinfo is not installed""" pass class PDFPageCountError(Exception): - """Happens when the pdfinfo was unable to retrieve the page count""" + """Raised when the pdfinfo was unable to retrieve the page count""" pass class PDFSyntaxError(Exception): - """Syntax error was thrown during rendering""" + """Raised when a syntax error was thrown during rendering""" pass class PDFPopplerTimeoutError(Exception): - """Timeout when pdf convert image.""" + """Raised when the timeout is exceeded while converting a PDF""" pass diff --git a/pdf2image/parsers.py b/pdf2image/parsers.py index 7191774..72f5125 100644 --- a/pdf2image/parsers.py +++ b/pdf2image/parsers.py @@ -3,12 +3,19 @@ """ from io import BytesIO +from typing import List from PIL import Image -def parse_buffer_to_ppm(data): - """Parse PPM file bytes to Pillow Image""" +def parse_buffer_to_ppm(data: bytes) -> List[Image.Image]: + """Parse PPM file bytes to Pillow Image + + :param data: pdftoppm/pdftocairo output bytes + :type data: bytes + :return: List of PPM images parsed from the output + :rtype: List[Image.Image] + """ images = [] @@ -24,8 +31,14 @@ def 
parse_buffer_to_ppm(data): return images -def parse_buffer_to_pgm(data): - """Parse PGM file bytes to Pillow Image""" +def parse_buffer_to_pgm(data: bytes) -> List[Image.Image]: + """Parse PGM file bytes to Pillow Image + + :param data: pdftoppm/pdftocairo output bytes + :type data: bytes + :return: List of PGM images parsed from the output + :rtype: List[Image.Image] + """ images = [] @@ -41,8 +54,14 @@ def parse_buffer_to_pgm(data): return images -def parse_buffer_to_jpeg(data): - """Parse JPEG file bytes to Pillow Image""" +def parse_buffer_to_jpeg(data: bytes) -> List[Image.Image]: + """Parse JPEG file bytes to Pillow Image + + :param data: pdftoppm/pdftocairo output bytes + :type data: bytes + :return: List of JPEG images parsed from the output + :rtype: List[Image.Image] + """ return [ Image.open(BytesIO(image_data + b"\xff\xd9")) @@ -52,8 +71,14 @@ def parse_buffer_to_jpeg(data): ] -def parse_buffer_to_png(data): - """Parse PNG file bytes to Pillow Image""" +def parse_buffer_to_png(data: bytes) -> List[Image.Image]: + """Parse PNG file bytes to Pillow Image + + :param data: pdftoppm/pdftocairo output bytes + :type data: bytes + :return: List of PNG images parsed from the output + :rtype: List[Image.Image] + """ images = [] diff --git a/pdf2image/pdf2image.py b/pdf2image/pdf2image.py index 0cf8867..6a35549 100644 --- a/pdf2image/pdf2image.py +++ b/pdf2image/pdf2image.py @@ -5,25 +5,26 @@ import os import platform +import posix import tempfile import types import shutil -import pathlib import subprocess from subprocess import Popen, PIPE, TimeoutExpired +from typing import Any, Union, Tuple, List, Dict, Callable +from pathlib import PurePath from PIL import Image -from .generators import uuid_generator, counter_generator, ThreadSafeGenerator +from pdf2image.generators import uuid_generator, counter_generator, ThreadSafeGenerator -from .parsers import ( +from pdf2image.parsers import ( parse_buffer_to_pgm, parse_buffer_to_ppm, parse_buffer_to_jpeg, 
parse_buffer_to_png, ) -from .exceptions import ( - PopplerNotInstalledError, +from pdf2image.exceptions import ( PDFInfoNotInstalledError, PDFPageCountError, PDFSyntaxError, @@ -35,69 +36,98 @@ def convert_from_path( - pdf_path, - dpi=200, - output_folder=None, - first_page=None, - last_page=None, - fmt="ppm", - jpegopt=None, - thread_count=1, - userpw=None, - ownerpw=None, - use_cropbox=False, - strict=False, - transparent=False, - single_file=False, - output_file=uuid_generator(), - poppler_path=None, - grayscale=False, - size=None, - paths_only=False, - use_pdftocairo=False, - timeout=None, - hide_annotations=False, -): - """ - Description: Convert PDF to Image will throw whenever one of the condition is reached - Parameters: - pdf_path -> Path to the PDF that you want to convert - dpi -> Image quality in DPI (default 200) - output_folder -> Write the resulting images to a folder (instead of directly in memory) - first_page -> First page to process - last_page -> Last page to process before stopping - fmt -> Output image format - jpegopt -> jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format) - thread_count -> How many threads we are allowed to spawn for processing - userpw -> PDF's password - ownerpw -> PDF's owner password - use_cropbox -> Use cropbox instead of mediabox - strict -> When a Syntax Error is thrown, it will be raised as an Exception - transparent -> Output with a transparent background instead of a white one. 
- single_file -> Uses the -singlefile option from pdftoppm/pdftocairo - output_file -> What is the output filename or generator - poppler_path -> Path to look for poppler binaries - grayscale -> Output grayscale image(s) - size -> Size of the resulting image(s), uses the Pillow (width, height) standard - paths_only -> Don't load image(s), return paths instead (requires output_folder) - use_pdftocairo -> Use pdftocairo instead of pdftoppm, may help performance - timeout -> Raise PDFPopplerTimeoutError after the given time + pdf_path: Union[str, PurePath], + dpi: int = 200, + output_folder: Union[str, PurePath] = None, + first_page: int = None, + last_page: int = None, + fmt: str = "ppm", + jpegopt: Dict = None, + thread_count: int = 1, + userpw: str = None, + ownerpw: str = None, + use_cropbox: bool = False, + strict: bool = False, + transparent: bool = False, + single_file: bool = False, + output_file: Any = uuid_generator(), + poppler_path: Union[str, PurePath] = None, + grayscale: bool = False, + size: Union[Tuple, int] = None, + paths_only: bool = False, + use_pdftocairo: bool = False, + timeout: int = None, + hide_annotations: bool = False, +) -> List[Image.Image]: + """Function wrapping pdftoppm and pdftocairo + + :param pdf_path: Path to the PDF that you want to convert + :type pdf_path: Union[str, PurePath] + :param dpi: Image quality in DPI (default 200), defaults to 200 + :type dpi: int, optional + :param output_folder: Write the resulting images to a folder (instead of directly in memory), defaults to None + :type output_folder: Union[str, PurePath], optional + :param first_page: First page to process, defaults to None + :type first_page: int, optional + :param last_page: Last page to process before stopping, defaults to None + :type last_page: int, optional + :param fmt: Output image format, defaults to "ppm" + :type fmt: str, optional + :param jpegopt: jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format), defaults to None + :type 
jpegopt: Dict, optional + :param thread_count: How many threads we are allowed to spawn for processing, defaults to 1 + :type thread_count: int, optional + :param userpw: PDF's password, defaults to None + :type userpw: str, optional + :param ownerpw: PDF's owner password, defaults to None + :type ownerpw: str, optional + :param use_cropbox: Use cropbox instead of mediabox, defaults to False + :type use_cropbox: bool, optional + :param strict: When a Syntax Error is thrown, it will be raised as an Exception, defaults to False + :type strict: bool, optional + :param transparent: Output with a transparent background instead of a white one, defaults to False + :type transparent: bool, optional + :param single_file: Uses the -singlefile option from pdftoppm/pdftocairo, defaults to False + :type single_file: bool, optional + :param output_file: What is the output filename or generator, defaults to uuid_generator() + :type output_file: Any, optional + :param poppler_path: Path to look for poppler binaries, defaults to None + :type poppler_path: Union[str, PurePath], optional + :param grayscale: Output grayscale image(s), defaults to False + :type grayscale: bool, optional + :param size: Size of the resulting image(s), uses the Pillow (width, height) standard, defaults to None + :type size: Union[Tuple, int], optional + :param paths_only: Don't load image(s), return paths instead (requires output_folder), defaults to False + :type paths_only: bool, optional + :param use_pdftocairo: Use pdftocairo instead of pdftoppm, may help performance, defaults to False + :type use_pdftocairo: bool, optional + :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None + :type timeout: int, optional + :param hide_annotations: Hide PDF annotations in the output, defaults to False + :type hide_annotations: bool, optional + :raises NotImplementedError: Raised when conflicting parameters are given (hide_annotations for pdftocairo) + :raises PDFPopplerTimeoutError: 
Raised after the timeout for the image processing is exceeded + :raises PDFSyntaxError: Raised if there is a syntax error in the PDF and strict=True + :return: A list of Pillow images, one for each page between first_page and last_page + :rtype: List[Image.Image] """ if use_pdftocairo and fmt == "ppm": fmt = "png" # We make sure that if passed arguments are Path objects, they're converted to strings - if isinstance(pdf_path, pathlib.PurePath): + if isinstance(pdf_path, PurePath): pdf_path = pdf_path.as_posix() - if isinstance(output_folder, pathlib.PurePath): + if isinstance(output_folder, PurePath): output_folder = output_folder.as_posix() - if isinstance(poppler_path, pathlib.PurePath): + if isinstance(poppler_path, PurePath): poppler_path = poppler_path.as_posix() - page_count = pdfinfo_from_path(pdf_path, userpw, ownerpw, poppler_path=poppler_path)["Pages"] + page_count = pdfinfo_from_path( + pdf_path, userpw, ownerpw, poppler_path=poppler_path + )["Pages"] # We start by getting the output format, the buffer processing function and if we need pdftocairo parsed_fmt, final_extension, parse_buffer_func, use_pdfcairo_format = _parse_format( @@ -183,7 +213,9 @@ def convert_from_path( if use_pdfcairo: if hide_annotations: - raise NotImplementedError("Hide annotations flag not implemented in pdftocairo.") + raise NotImplementedError( + "Hide annotations flag not implemented in pdftocairo." 
+ ) args = [_get_command_path("pdftocairo", poppler_path)] + args else: args = [_get_command_path("pdftoppm", poppler_path)] + args @@ -194,15 +226,22 @@ def convert_from_path( # Add poppler path to LD_LIBRARY_PATH env = os.environ.copy() if poppler_path is not None: - env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "") + env["LD_LIBRARY_PATH"] = ( + poppler_path + ":" + env.get("LD_LIBRARY_PATH", "") + ) # Spawn the process and save its uuid - startupinfo=None - if platform.system() == 'Windows': + startupinfo = None + if platform.system() == "Windows": # this startupinfo structure prevents a console window from popping up on Windows startupinfo = subprocess.STARTUPINFO() startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW processes.append( - (thread_output_file, Popen(args, env=env, stdout=PIPE, stderr=PIPE, startupinfo=startupinfo)) + ( + thread_output_file, + Popen( + args, env=env, stdout=PIPE, stderr=PIPE, startupinfo=startupinfo + ), + ) ) images = [] @@ -220,7 +259,11 @@ def convert_from_path( if output_folder is not None: images += _load_from_output_folder( - output_folder, uid, final_extension, paths_only, in_memory=auto_temp_dir + output_folder, + uid, + final_extension, + paths_only, + in_memory=auto_temp_dir, ) else: images += parse_buffer_func(data) @@ -232,53 +275,80 @@ def convert_from_path( def convert_from_bytes( - pdf_file, - dpi=200, - output_folder=None, - first_page=None, - last_page=None, - fmt="ppm", - jpegopt=None, - thread_count=1, - userpw=None, - ownerpw=None, - use_cropbox=False, - strict=False, - transparent=False, - single_file=False, - output_file=uuid_generator(), - poppler_path=None, - grayscale=False, - size=None, - paths_only=False, - use_pdftocairo=False, - timeout=None, - hide_annotations=False, -): - """ - Description: Convert PDF to Image will throw whenever one of the condition is reached - Parameters: - pdf_file -> Bytes representing the PDF file - dpi -> Image quality in DPI - output_folder -> 
Write the resulting images to a folder (instead of directly in memory) - first_page -> First page to process - last_page -> Last page to process before stopping - fmt -> Output image format - jpegopt -> jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format) - thread_count -> How many threads we are allowed to spawn for processing - userpw -> PDF's password - ownerpw -> PDF's owner password - use_cropbox -> Use cropbox instead of mediabox - strict -> When a Syntax Error is thrown, it will be raised as an Exception - transparent -> Output with a transparent background instead of a white one. - single_file -> Uses the -singlefile option from pdftoppm/pdftocairo - output_file -> What is the output filename or generator - poppler_path -> Path to look for poppler binaries - grayscale -> Output grayscale image(s) - size -> Size of the resulting image(s), uses the Pillow (width, height) standard - paths_only -> Don't load image(s), return paths instead (requires output_folder) - use_pdftocairo -> Use pdftocairo instead of pdftoppm, may help performance - timeout -> Raise PDFPopplerTimeoutError after the given time + pdf_file: bytes, + dpi: int = 200, + output_folder: Union[str, PurePath] = None, + first_page: int = None, + last_page: int = None, + fmt: str = "ppm", + jpegopt: Dict = None, + thread_count: int = 1, + userpw: str = None, + ownerpw: str = None, + use_cropbox: bool = False, + strict: bool = False, + transparent: bool = False, + single_file: bool = False, + output_file: Union[str, PurePath] = uuid_generator(), + poppler_path: Union[str, PurePath] = None, + grayscale: bool = False, + size: Union[Tuple, int] = None, + paths_only: bool = False, + use_pdftocairo: bool = False, + timeout: int = None, + hide_annotations: bool = False, +) -> List[Image.Image]: + """Function wrapping pdftoppm and pdftocairo. 
+ + :param pdf_file: Bytes of the PDF that you want to convert + :type pdf_file: bytes + :param dpi: Image quality in DPI, defaults to 200 + :type dpi: int, optional + :param output_folder: Write the resulting images to a folder (instead of directly in memory), defaults to None + :type output_folder: Union[str, PurePath], optional + :param first_page: First page to process, defaults to None + :type first_page: int, optional + :param last_page: Last page to process before stopping, defaults to None + :type last_page: int, optional + :param fmt: Output image format, defaults to "ppm" + :type fmt: str, optional + :param jpegopt: jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format), defaults to None + :type jpegopt: Dict, optional + :param thread_count: How many threads we are allowed to spawn for processing, defaults to 1 + :type thread_count: int, optional + :param userpw: PDF's password, defaults to None + :type userpw: str, optional + :param ownerpw: PDF's owner password, defaults to None + :type ownerpw: str, optional + :param use_cropbox: Use cropbox instead of mediabox, defaults to False + :type use_cropbox: bool, optional + :param strict: When a Syntax Error is thrown, it will be raised as an Exception, defaults to False + :type strict: bool, optional + :param transparent: Output with a transparent background instead of a white one, defaults to False + :type transparent: bool, optional + :param single_file: Uses the -singlefile option from pdftoppm/pdftocairo, defaults to False + :type single_file: bool, optional + :param output_file: What is the output filename or generator, defaults to uuid_generator() + :type output_file: Any, optional + :param poppler_path: Path to look for poppler binaries, defaults to None + :type poppler_path: Union[str, PurePath], optional + :param grayscale: Output grayscale image(s), defaults to False + :type grayscale: bool, optional + :param size: Size of the resulting image(s), uses the 
Pillow (width, height) standard, defaults to None + :type size: Union[Tuple, int], optional + :param paths_only: Don't load image(s), return paths instead (requires output_folder), defaults to False + :type paths_only: bool, optional + :param use_pdftocairo: Use pdftocairo instead of pdftoppm, may help performance, defaults to False + :type use_pdftocairo: bool, optional + :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None + :type timeout: int, optional + :param hide_annotations: Hide PDF annotations in the output, defaults to False + :type hide_annotations: bool, optional + :raises NotImplementedError: Raised when conflicting parameters are given (hide_annotations for pdftocairo) + :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded + :raises PDFSyntaxError: Raised if there is a syntax error in the PDF and strict=True + :return: A list of Pillow images, one for each page between first_page and last_page + :rtype: List[Image.Image] """ fh, temp_filename = tempfile.mkstemp() @@ -316,22 +386,22 @@ def convert_from_bytes( def _build_command( - args, - output_folder, - first_page, - last_page, - fmt, - jpegopt, - output_file, - userpw, - ownerpw, - use_cropbox, - transparent, - single_file, - grayscale, - size, - hide_annotations, -): + args: List, + output_folder: str, + first_page: int, + last_page: int, + fmt: str, + jpegopt: Dict, + output_file: str, + userpw: str, + ownerpw: str, + use_cropbox: bool, + transparent: bool, + single_file: bool, + grayscale: bool, + size: Union[int, Tuple[int, int]], + hide_annotations: bool, +) -> List[str]: if use_cropbox: args.append("-cropbox") @@ -384,12 +454,12 @@ def _build_command( elif isinstance(size, int) or isinstance(size, float): args.extend(["-scale-to", str(int(size))]) else: - raise ValueError("Size {} is not a tuple or an integer") + raise ValueError(f"Size {size} is not a tuple or an integer") return args -def _parse_format(fmt, 
grayscale=False): +def _parse_format(fmt: str, grayscale: bool = False) -> Tuple[str, str, Callable, bool]: fmt = fmt.lower() if fmt[0] == ".": fmt = fmt[1:] @@ -405,7 +475,7 @@ def _parse_format(fmt, grayscale=False): return "ppm", "ppm", parse_buffer_to_ppm, False -def _parse_jpegopt(jpegopt): +def _parse_jpegopt(jpegopt: Dict) -> str: parts = [] for k, v in jpegopt.items(): if v is True: @@ -416,7 +486,7 @@ def _parse_jpegopt(jpegopt): return ",".join(parts) -def _get_command_path(command, poppler_path=None): +def _get_command_path(command: str, poppler_path: str = None) -> str: if platform.system() == "Windows": command = command + ".exe" @@ -426,7 +496,9 @@ def _get_command_path(command, poppler_path=None): return command -def _get_poppler_version(command, poppler_path=None, timeout=None): +def _get_poppler_version( + command: str, poppler_path: str = None, timeout: int = None +) -> Tuple[int, int]: command = [_get_command_path(command, poppler_path), "-v"] env = os.environ.copy() @@ -451,8 +523,33 @@ def _get_poppler_version(command, poppler_path=None, timeout=None): def pdfinfo_from_path( - pdf_path, userpw=None, ownerpw=None, poppler_path=None, rawdates=False, timeout=None -): + pdf_path: str, + userpw: str = None, + ownerpw: str = None, + poppler_path: str = None, + rawdates: bool = False, + timeout: int = None, +) -> Dict: + """Function wrapping poppler's pdfinfo utility and returns the result as a dictionary. 
+ + :param pdf_path: Path to the PDF that you want to inspect + :type pdf_path: str + :param userpw: PDF's password, defaults to None + :type userpw: str, optional + :param ownerpw: PDF's owner password, defaults to None + :type ownerpw: str, optional + :param poppler_path: Path to look for poppler binaries, defaults to None + :type poppler_path: Union[str, PurePath], optional + :param rawdates: Return the undecoded date strings, defaults to False + :type rawdates: bool, optional + :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None + :type timeout: int, optional + :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded + :raises PDFInfoNotInstalledError: Raised if pdfinfo is not installed + :raises PDFPageCountError: Raised if the output could not be parsed + :return: Dictionary containing various information on the PDF + :rtype: Dict + """ try: command = [_get_command_path("pdfinfo", poppler_path), pdf_path] @@ -500,28 +597,59 @@ def pdfinfo_from_path( ) except ValueError: raise PDFPageCountError( - "Unable to get page count.\n%s" % err.decode("utf8", "ignore") + f"Unable to get page count.\n{err.decode('utf8', 'ignore')}" ) def pdfinfo_from_bytes( - pdf_file, userpw=None, ownerpw=None, poppler_path=None, rawdates=False, timeout=None -): + pdf_bytes: bytes, + userpw: str = None, + ownerpw: str = None, + poppler_path: str = None, + rawdates: bool = False, + timeout: int = None, +) -> Dict: + """Function wrapping poppler's pdfinfo utility and returns the result as a dictionary. 
+ + :param pdf_bytes: Bytes of the PDF that you want to inspect + :type pdf_bytes: bytes + :param userpw: PDF's password, defaults to None + :type userpw: str, optional + :param ownerpw: PDF's owner password, defaults to None + :type ownerpw: str, optional + :param poppler_path: Path to look for poppler binaries, defaults to None + :type poppler_path: Union[str, PurePath], optional + :param rawdates: Return the undecoded date strings, defaults to False + :type rawdates: bool, optional + :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None + :type timeout: int, optional + :return: Dictionary containing various information on the PDF + :rtype: Dict + """ fh, temp_filename = tempfile.mkstemp() try: with open(temp_filename, "wb") as f: - f.write(pdf_file) + f.write(pdf_bytes) f.flush() - return pdfinfo_from_path(temp_filename, userpw=userpw, ownerpw=ownerpw, - rawdates=rawdates, poppler_path=poppler_path) + return pdfinfo_from_path( + temp_filename, + userpw=userpw, + ownerpw=ownerpw, + rawdates=rawdates, + poppler_path=poppler_path, + ) finally: os.close(fh) os.remove(temp_filename) def _load_from_output_folder( - output_folder, output_file, ext, paths_only, in_memory=False -): + output_folder: str, + output_file: str, + ext: str, + paths_only: bool, + in_memory: bool = False, +) -> List[Image.Image]: images = [] for f in sorted(os.listdir(output_folder)): if f.startswith(output_file) and f.split(".")[-1] == ext: diff --git a/setup.py b/setup.py index f61a21c..6f25c47 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name="pdf2image", - version="1.16.0", + version="1.16.1", description="A wrapper around the pdftoppm and pdftocairo command line tools to convert PDF to a PIL Image list.", long_description=long_description, long_description_content_type="text/markdown", @@ -30,10 +30,10 @@ "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", - "Programming Language 
:: Python :: 3.5", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", ], keywords="pdf image png jpeg jpg convert", packages=find_packages(exclude=["contrib", "docs", "tests"]), diff --git a/tests.py b/tests.py index 591e8e9..ae421f6 100644 --- a/tests.py +++ b/tests.py @@ -30,7 +30,7 @@ from functools import wraps -PROFILE_MEMORY = os.environ.get('PROFILE_MEMORY', False) +PROFILE_MEMORY = os.environ.get("PROFILE_MEMORY", False) try: subprocess.call( @@ -1514,7 +1514,9 @@ def test_conversion_from_path_with_2d_tuple_size_with_None_height(self): @profile @unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed!") def test_conversion_from_path_with_hide_annotations(self): - images_from_path = convert_from_path("./tests/test_annotations.pdf", hide_annotations=True) + images_from_path = convert_from_path( + "./tests/test_annotations.pdf", hide_annotations=True + ) start_time = time.time() self.assertTrue(len(images_from_path) == 1) print( @@ -1541,7 +1543,9 @@ def test_conversion_from_bytes_with_hide_annotations(self): @profile @unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed!") - def test_conversion_from_path_with_hide_annotations_with_invalid_arg_combination(self): + def test_conversion_from_path_with_hide_annotations_with_invalid_arg_combination( + self, + ): start_time = time.time() try: images_from_path = convert_from_path( @@ -1640,7 +1644,9 @@ def test_conversion_from_path_using_dir_paths_only(self): @unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed") def test_multithread_conversion(self): start_time = time.time() - files = ["./tests/test.pdf",] * 50 + files = [ + "./tests/test.pdf", + ] * 50 with Pool(10) as p: res = p.map(convert_from_path, files) self.assertTrue(len(res) == 50) @@ -1677,9 +1683,7 @@ def test_pdfinfo_rawdates(self): start_time = time.time() info 
= pdfinfo_from_path("./tests/test.pdf", rawdates=True) self.assertTrue("D:" in info["CreationDate"]) - print( - "test_pdfinfo_rawdates: {} sec".format(time.time() - start_time) - ) + print("test_pdfinfo_rawdates: {} sec".format(time.time() - start_time)) @profile @unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed!") @@ -1687,12 +1691,12 @@ def test_pdfinfo_locked_pdf_with_userpw_only(self): start_time = time.time() with TemporaryDirectory() as path: with open("./tests/test_locked_user_only.pdf", "rb") as pdf_file: - info = pdfinfo_from_bytes( - pdf_file.read(), userpw="pdf2image" - ) + info = pdfinfo_from_bytes(pdf_file.read(), userpw="pdf2image") self.assertTrue("CreationDate" in info) print( - "test_pdfinfo_locked_pdf_with_userpw_only: {} sec".format(time.time() - start_time) + "test_pdfinfo_locked_pdf_with_userpw_only: {} sec".format( + time.time() - start_time + ) ) @profile @@ -1702,7 +1706,11 @@ def test_convert_from_functions_same_number_of_parameters(self): len(signature(convert_from_path).parameters), len(signature(convert_from_bytes).parameters), ) - print("test_convert_from_functions_same_number_of_parameters: {} sec".format(time.time() - start_time)) + print( + "test_convert_from_functions_same_number_of_parameters: {} sec".format( + time.time() - start_time + ) + ) @profile def test_pdfinfo_functions_same_number_of_parameters(self): @@ -1711,14 +1719,22 @@ def test_pdfinfo_functions_same_number_of_parameters(self): len(signature(pdfinfo_from_path).parameters), len(signature(pdfinfo_from_bytes).parameters), ) - print("test_pdfinfo_functions_same_number_of_parameters: {} sec".format(time.time() - start_time)) - + print( + "test_pdfinfo_functions_same_number_of_parameters: {} sec".format( + time.time() - start_time + ) + ) + @unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed!") def test_timeout_pdfinfo_from_path_241(self): start_time = time.time() with self.assertRaises(PDFPopplerTimeoutError): info = 
pdfinfo_from_path("./tests/test_241.pdf", timeout=0.00001) - print("test_timeout_pdfinfo_from_path_241: {} sec".format(time.time() - start_time)) + print( + "test_timeout_pdfinfo_from_path_241: {} sec".format( + time.time() - start_time + ) + ) @profile @unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed!") @@ -1726,7 +1742,12 @@ def test_timeout_convert_from_path_241(self): start_time = time.time() with self.assertRaises(PDFPopplerTimeoutError): imgs = convert_from_path("./tests/test_241.pdf", timeout=1) - print("test_timeout_convert_from_path_241: {} sec".format(time.time() - start_time)) + print( + "test_timeout_convert_from_path_241: {} sec".format( + time.time() - start_time + ) + ) + if __name__ == "__main__": unittest.main()