Skip to content

Commit

Permalink
drop to pdfminer
Browse files: browse the repository at this point in the history
  • Loading branch information
iodabasi committed Feb 7, 2024
1 parent 698913e commit cd0c03e
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 12 deletions.
35 changes: 25 additions & 10 deletions hotpdf/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@
from pathlib import PurePath
from typing import Optional, Union

from pdfminer.high_level import extract_pages
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

from hotpdf.memory_map import MemoryMap

Expand All @@ -21,16 +25,27 @@ def __process(
pages: list[MemoryMap] = []
page_numbers = sorted(page_numbers) if page_numbers else []

laparams_obj = LAParams(**laparams if laparams else {}) # type: ignore
file = open(source, "rb") if not isinstance(source, IOBase) else source # noqa: SIM115

hl_page_layouts = extract_pages(
source, password=password, page_numbers=page_numbers, caching=False, laparams=laparams_obj
)
for page_layout in hl_page_layouts:
parsed_page: MemoryMap = MemoryMap()
parsed_page.build_memory_map()
parsed_page.load_memory_map(page=page_layout, include_annotation_spaces=include_annotation_spaces)
pages.append(parsed_page)
laparams_obj = LAParams(**laparams if laparams else {})
parser = PDFParser(file)
encoded_password = password.encode() if password else b""
doc = PDFDocument(parser, password=encoded_password)
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=laparams_obj)
interpreter = PDFPageInterpreter(rsrcmgr, device)

for page_number, page in enumerate(PDFPage.create_pages(doc), start=0):
if not page_numbers or page_number in page_numbers:
interpreter.process_page(page)
page_layout = device.get_result()
parsed_page: MemoryMap = MemoryMap()
parsed_page.build_memory_map()
parsed_page.load_memory_map(page=page_layout, include_annotation_spaces=include_annotation_spaces)
pages.append(parsed_page)

if not isinstance(source, IOBase):
file.close()
return pages


Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta:__legacy__"

[project]
name="hotpdf"
version="0.4.5.2"
version="0.4.5.2.dev"
authors = [
{name = "Krishnasis Mandal", email = "krishnasis.mandal@prestatech.com"}]
maintainers = [
Expand Down Expand Up @@ -40,7 +40,7 @@ keywords = [
]

dependencies = [
"pdfminer.six==20231228",
"pdfminer==20191125",
]

[tool.setuptools]
Expand Down

0 comments on commit cd0c03e

Please sign in to comment.