Skip to content

Commit

Permalink
feat!: adding load/unload from key (#9)
Browse files Browse the repository at this point in the history
* adding load/unload from key

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated tests

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* all fixed, still need to clean all commented out code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* ran pre-commit hooks

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* allow more tabulate versions

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* renamed some key functions

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* ran pre-commit hooks (2)

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
  • Loading branch information
PeterStaar-IBM and dolfim-ibm authored Aug 22, 2024
1 parent 89211eb commit dd122d0
Show file tree
Hide file tree
Showing 18 changed files with 32,295 additions and 209 deletions.
80 changes: 75 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,85 @@ pip install docling-parse

Convert a PDF

```sh
```python
from docling_parse.docling_parse import pdf_parser

# Do this only once to load fonts (avoid initialising it many times)
parser = pdf_parser()
doc = parser.find_cells("mydoc.pdf")

for i, page in enumerate(doc["pages"]):
for j, cell in enumerate(page["cells"]):
print(i, "\t", j, "\t", cell["content"]["rnormalized"])
# parser.set_loglevel(1) # 1=error, 2=warning, 3=success, 4=info

doc_file = "my-doc.pdf" # filename
doc_key = f"key={doc_file}" # unique document key (eg hash, UUID, etc)

# Load the document from file using filename doc_file. This only loads
# the QPDF document, but no extracted data
success = parser.load_document(doc_key, doc_file)

# Open the file in binary mode and read its contents
# with open(doc_file, "rb") as file:
# file_content = file.read()

# Create a BytesIO object and write the file contents to it
# bytes_io = io.BytesIO(file_content)
# success = parser.load_document_from_bytesio(doc_key, bytes_io)

# Parse the entire document in one go; this is easier, but could require
# a lot more memory than parsing page-by-page
# json_doc = parser.parse_pdf_from_key(doc_key)

# Get number of pages
num_pages = parser.number_of_pages(doc_key)

# Parse page by page to minimize memory footprint
for page in range(0, num_pages):

# Internal memory for page is auto-deleted after this call.
# No need to unload a specific page
json_doc = parser.parse_pdf_from_key_on_page(doc_key, page)

# parsed page is the first one!
json_page = json_doc["pages"][0]

page_dimensions = [json_page["dimensions"]["width"], json_page["dimensions"]["height"]]

# find text cells
cells=[]
for cell_id,cell in enumerate(json_page["cells"]):
cells.append([page,
cell_id,
cell["content"]["rnormalized"], # text
cell["box"]["device"][0], # x0 (lower left x)
cell["box"]["device"][1], # y0 (lower left y)
cell["box"]["device"][2], # x1 (upper right x)
cell["box"]["device"][3], # y1 (upper right y)
])

# find bitmap images
images=[]
for image_id,image in enumerate(json_page["images"]):
images.append([page,
image_id,
image["box"][0], # x0 (lower left x)
image["box"][1], # y0 (lower left y)
image["box"][2], # x1 (upper right x)
image["box"][3], # y1 (upper right y)
])

# find paths
paths=[]
for path_id,path in enumerate(json_page["paths"]):
paths.append([page,
path_id,
path["x-values"], # array of x values
path["y-values"], # array of y values
])

# Unload the (QPDF) document and buffers
parser.unload_document(doc_key)

# Unloads everything at once
# parser.unload_documents()
```

Use the CLI
Expand Down
20 changes: 19 additions & 1 deletion app/pybind_parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,25 @@ PYBIND11_MODULE(docling_parse, m) {

.def("set_loglevel", &docling::docling_parser::set_loglevel)

.def("is_loaded", &docling::docling_parser::is_loaded)
.def("list_loaded_keys", &docling::docling_parser::list_loaded_keys)

.def("load_document", &docling::docling_parser::load_document)
.def("load_document_from_bytesio", &docling::docling_parser::load_document_from_bytesio)

.def("unload_document", &docling::docling_parser::unload_document)
.def("unload_documents", &docling::docling_parser::unload_documents)

.def("number_of_pages", &docling::docling_parser::number_of_pages)

.def("parse_pdf_from_key",
pybind11::overload_cast<std::string>(&docling::docling_parser::parse_pdf_from_key),
"parse pdf-document using doc-key into json")

.def("parse_pdf_from_key_on_page",
&docling::docling_parser::parse_pdf_from_key_on_page,
"parse specific page in pdf-document using doc-key from path into json")
/*
.def("find_cells",
pybind11::overload_cast<std::string>(&docling::docling_parser::find_cells),
"parse pdf-document from path into json")
Expand All @@ -32,5 +49,6 @@ PYBIND11_MODULE(docling_parse, m) {
.def("find_cells_from_bytesio_on_page",
&docling::docling_parser::find_cells_from_bytesio_on_page,
"parse pdf-document from a BytesIO object for a specific page");
"parse pdf-document from a BytesIO object for a specific page")*/
;
}
74 changes: 74 additions & 0 deletions docling_parse/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import io
import os

from tabulate import tabulate

# from docling_parse.docling_parse import pdf_parser
import docling_parse
from docling_parse import pdf_parser
Expand All @@ -28,6 +30,76 @@ def main():
# Print the path to the PDF file (or add your processing logic here)

parser = docling_parse.pdf_parser()

doc_file = args.pdf # filename
doc_key = f"key={args.pdf}" # unique document key (eg hash, UUID, etc)

# Load the document
success = parser.load_document(doc_key, doc_file)

# Get number of pages
num_pages = parser.number_of_pages(doc_key)

# Parse page by page to minimize memory footprint
for page in range(0, num_pages):
json_doc = parser.parse_pdf_from_key_on_page(doc_key, page)
json_page = json_doc["pages"][0]

page_dimensions = [
json_page["dimensions"]["width"],
json_page["dimensions"]["height"],
]

# find text cells
cells = []
for cell_id, cell in enumerate(json_page["cells"]):
cells.append(
[
page,
cell_id,
cell["content"]["rnormalized"], # text
cell["box"]["device"][0], # x0 (lower left x)
cell["box"]["device"][1], # y0 (lower left y)
cell["box"]["device"][2], # x1 (upper right x)
cell["box"]["device"][3], # y1 (upper right y)
]
)

print(f"cells of page: {page}")
print(
tabulate(cells, headers=["page", "cell-id", "text", "x0", "y0", "x1", "y1"])
)

# find bitmap images
images = []
for image_id, image in enumerate(json_page["images"]):
images.append(
[
page,
image_id,
image["box"][0], # x0 (lower left x)
image["box"][1], # y0 (lower left y)
image["box"][2], # x1 (upper right x)
image["box"][3], # y1 (upper right y)
]
)

# find paths
paths = []
for path_id, path in enumerate(json_page["paths"]):
paths.append(
[
page,
path_id,
path["x-values"], # array of x values
path["y-values"], # array of y values
]
)

# Unload the document
parser.unload_document(doc_key)

"""
doc = parser.find_cells(args.pdf)
# print(json.dumps(data, indent=2))
Expand Down Expand Up @@ -55,6 +127,8 @@ def main():
for j, cell in enumerate(page["cells"]):
print(i, "\t", j, "\t", cell["content"]["rnormalized"])
"""


if __name__ == "__main__":
main()
16 changes: 15 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ build = "build.py"

[tool.poetry.dependencies]
python = "^3.9"
tabulate = ">=0.9.0,<1.0.0"

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.2"
Expand Down
Loading

0 comments on commit dd122d0

Please sign in to comment.