feat!: adding load/unload from key (#9)

* adding load/unload from key Signed-off-by: Peter Staar <taa@zurich.ibm.com> * updated tests Signed-off-by: Peter Staar <taa@zurich.ibm.com> * all fixed, still need to clean all commented out code Signed-off-by: Peter Staar <taa@zurich.ibm.com> * ran pre-commit hooks Signed-off-by: Peter Staar <taa@zurich.ibm.com> * allow more tabulate versions Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * renamed some key functions Signed-off-by: Peter Staar <taa@zurich.ibm.com> * ran pre-commit hooks (2) Signed-off-by: Peter Staar <taa@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
DS4SD · Aug 22, 2024 · dd122d0 · dd122d0
1 parent 89211eb
commit dd122d0
Show file tree

Hide file tree

Showing 18 changed files with 32,295 additions and 209 deletions.
diff --git a/README.md b/README.md
@@ -21,15 +21,85 @@ pip install docling-parse
 
 Convert a PDF
 
-```sh
+```python
 from docling_parse.docling_parse import pdf_parser
 
+# Do this only once to load fonts (avoid initialising it many times)
 parser = pdf_parser()
-doc = parser.find_cells("mydoc.pdf")
 
-for i, page in enumerate(doc["pages"]):
-    for j, cell in enumerate(page["cells"]):
-        print(i, "\t", j, "\t", cell["content"]["rnormalized"])
+# parser.set_loglevel(1) # 1=error, 2=warning, 3=success, 4=info
+
+doc_file = "my-doc.pdf" # filename
+doc_key = f"key={doc_file}" # unique document key (eg hash, UUID, etc)
+
+# Load the document from file using filename doc_file. This only loads
+# the QPDF document, but no extracted data
+success = parser.load_document(doc_key, doc_file)
+
+# Open the file in binary mode and read its contents
+# with open(doc_file, "rb") as file:
+#      file_content = file.read()
+
+# Create a BytesIO object and write the file contents to it
+# bytes_io = io.BytesIO(file_content)
+# success = parser.load_document_from_bytesio(doc_key, bytes_io)
+
+# Parse the entire document in one go. This is easier, but could require
+# a lot more memory than parsing page-by-page
+# json_doc = parser.parse_pdf_from_key(doc_key)	
+
+# Get number of pages
+num_pages = parser.number_of_pages(doc_key)
+
+# Parse page by page to minimize memory footprint
+for page in range(0, num_pages):
+
+    # Internal memory for page is auto-deleted after this call.
+    # No need to unload a specific page
+    json_doc = parser.parse_pdf_from_key_on_page(doc_key, page)
+
+    # The parsed page is returned as the first entry of "pages" (index 0)
+    json_page = json_doc["pages"][0] 
+
+    page_dimensions = [json_page["dimensions"]["width"], json_page["dimensions"]["height"]]
+
+    # find text cells
+    cells=[]
+    for cell_id,cell in enumerate(json_page["cells"]):
+    	cells.append([page,
+	              cell_id,
+		      cell["content"]["rnormalized"], # text
+	              cell["box"]["device"][0], # x0 (lower left x)
+		      cell["box"]["device"][1], # y0 (lower left y)
+		      cell["box"]["device"][2], # x1 (upper right x)
+		      cell["box"]["device"][3], # y1 (upper right y)	
+		      ])
+
+    # find bitmap images
+    images=[]
+    for image_id,image in enumerate(json_page["images"]):
+    	images.append([page,
+	               image_id,
+	               image["box"][0], # x0 (lower left x)
+		       image["box"][1], # y0 (lower left y)
+		       image["box"][2], # x1 (upper right x)
+		       image["box"][3], # y1 (upper right y)
+		       ])
+
+    # find paths
+    paths=[]
+    for path_id,path in enumerate(json_page["paths"]):
+    	paths.append([page,
+	              path_id,
+	              path["x-values"], # array of x values
+	              path["y-values"], # array of y values
+		      ])
+
+# Unload the (QPDF) document and buffers
+parser.unload_document(doc_key)
+
+# Unloads everything at once
+# parser.unload_documents()
 ```
 
 Use the CLI

diff --git a/app/pybind_parse.cpp b/app/pybind_parse.cpp
@@ -16,8 +16,25 @@ PYBIND11_MODULE(docling_parse, m) {
 
     .def("set_loglevel", &docling::docling_parser::set_loglevel)
 
+    .def("is_loaded", &docling::docling_parser::is_loaded)
+    .def("list_loaded_keys", &docling::docling_parser::list_loaded_keys)
+
+    .def("load_document", &docling::docling_parser::load_document)
+    .def("load_document_from_bytesio", &docling::docling_parser::load_document_from_bytesio)
+
+    .def("unload_document", &docling::docling_parser::unload_document)    
     .def("unload_documents", &docling::docling_parser::unload_documents)
 
+    .def("number_of_pages", &docling::docling_parser::number_of_pages)
+
+    .def("parse_pdf_from_key",
+	 pybind11::overload_cast<std::string>(&docling::docling_parser::parse_pdf_from_key),
+	 "parse pdf-document using doc-key into json")    
+
+    .def("parse_pdf_from_key_on_page",
+	 &docling::docling_parser::parse_pdf_from_key_on_page,
+	 "parse specific page in pdf-document using doc-key from path into json")
+    /*
     .def("find_cells",
 	 pybind11::overload_cast<std::string>(&docling::docling_parser::find_cells),
 	 "parse pdf-document from path into json")    
@@ -32,5 +49,6 @@ PYBIND11_MODULE(docling_parse, m) {
 
     .def("find_cells_from_bytesio_on_page",
 	 &docling::docling_parser::find_cells_from_bytesio_on_page,
-	 "parse pdf-document from a BytesIO object for a specific page");
+	 "parse pdf-document from a BytesIO object for a specific page")*/
+    ;
 }
diff --git a/docling_parse/run.py b/docling_parse/run.py
@@ -2,6 +2,8 @@
 import io
 import os
 
+from tabulate import tabulate
+
 # from docling_parse.docling_parse import pdf_parser
 import docling_parse
 from docling_parse import pdf_parser
@@ -28,6 +30,76 @@ def main():
     # Print the path to the PDF file (or add your processing logic here)
 
     parser = docling_parse.pdf_parser()
+
+    doc_file = args.pdf  # filename
+    doc_key = f"key={args.pdf}"  # unique document key (eg hash, UUID, etc)
+
+    # Load the document
+    success = parser.load_document(doc_key, doc_file)
+
+    # Get number of pages
+    num_pages = parser.number_of_pages(doc_key)
+
+    # Parse page by page to minimize memory footprint
+    for page in range(0, num_pages):
+        json_doc = parser.parse_pdf_from_key_on_page(doc_key, page)
+        json_page = json_doc["pages"][0]
+
+        page_dimensions = [
+            json_page["dimensions"]["width"],
+            json_page["dimensions"]["height"],
+        ]
+
+        # find text cells
+        cells = []
+        for cell_id, cell in enumerate(json_page["cells"]):
+            cells.append(
+                [
+                    page,
+                    cell_id,
+                    cell["content"]["rnormalized"],  # text
+                    cell["box"]["device"][0],  # x0 (lower left x)
+                    cell["box"]["device"][1],  # y0 (lower left y)
+                    cell["box"]["device"][2],  # x1 (upper right x)
+                    cell["box"]["device"][3],  # y1 (upper right y)
+                ]
+            )
+
+        print(f"cells of page: {page}")
+        print(
+            tabulate(cells, headers=["page", "cell-id", "text", "x0", "y0", "x1", "y1"])
+        )
+
+        # find bitmap images
+        images = []
+        for image_id, image in enumerate(json_page["images"]):
+            images.append(
+                [
+                    page,
+                    image_id,
+                    image["box"][0],  # x0 (lower left x)
+                    image["box"][1],  # y0 (lower left y)
+                    image["box"][2],  # x1 (upper right x)
+                    image["box"][3],  # y1 (upper right y)
+                ]
+            )
+
+        # find paths
+        paths = []
+        for path_id, path in enumerate(json_page["paths"]):
+            paths.append(
+                [
+                    page,
+                    path_id,
+                    path["x-values"],  # array of x values
+                    path["y-values"],  # array of y values
+                ]
+            )
+
+    # Unload the document
+    parser.unload_document(doc_key)
+
+    """
     doc = parser.find_cells(args.pdf)
 
     # print(json.dumps(data, indent=2))
@@ -55,6 +127,8 @@ def main():
         for j, cell in enumerate(page["cells"]):
             print(i, "\t", j, "\t", cell["content"]["rnormalized"])
 
+    """
+
 
 if __name__ == "__main__":
     main()
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -34,6 +34,7 @@ build = "build.py"
 
 [tool.poetry.dependencies]
 python = "^3.9"
+tabulate = ">=0.9.0,<1.0.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.2"