fix: resolve more assert errors (#16)

* added eval scripts Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fix errors in PMC Signed-off-by: Peter Staar <taa@zurich.ibm.com> * reformat code Signed-off-by: Peter Staar <taa@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com>
DS4SD · Aug 30, 2024 · c3a6b03 · c3a6b03
1 parent e842657
commit c3a6b03
Show file tree

Hide file tree

Showing 3 changed files with 179 additions and 235 deletions.
diff --git a/docling_parse/eval.py b/docling_parse/eval.py
@@ -0,0 +1,104 @@
+import argparse
+import glob
+import io
+import os
+
+from tabulate import tabulate
+
+# from docling_parse.docling_parse import pdf_parser
+import docling_parse
+from docling_parse import pdf_parser
+
+
+def main():
+    # Create the argument parser
+    parser = argparse.ArgumentParser(description="Process a PDF file.")
+
+    # Add an argument for the path to the PDF file
+    parser.add_argument(
+        "-l",
+        "--log-level",
+        type=int,
+        required=False,
+        default=2,
+        help="log-level 1,2,3,4",
+    )
+
+    # Add an argument for the path to the PDF file
+    parser.add_argument(
+        "-d",
+        "--pdfdir",
+        type=str,
+        help="Path to the directory with PDF files",
+        required=True,
+    )
+
+    # Add an argument for the path to the PDF file
+    parser.add_argument(
+        "-m",
+        "--max-docs",
+        type=int,
+        required=False,
+        default=None,
+        help="max number of documents to run on",
+    )
+
+    # Parse the command-line arguments
+    args = parser.parse_args()
+    print(f"The provided PDF path is: {args.pdfdir}")
+
+    # Check if the provided path is valid
+    if not os.path.exists(args.pdfdir):
+        print(f"Error: The directory {args.pdfdir} does not exist.")
+        return
+
+    # Print the path to the PDF file (or add your processing logic here)
+
+    parser = docling_parse.pdf_parser()
+    parser.set_loglevel(args.log_level)
+
+    overview = []
+
+    doc_files = sorted(glob.glob(os.path.join(args.pdfdir, "*.pdf")))
+    if args.max_docs != None:
+        doc_files = doc_files[0 : args.max_docs]
+
+    for doc_id, doc_file in enumerate(doc_files):
+        print(doc_file)
+
+        doc_key = f"key={doc_file}"  # unique document key (eg hash, UUID, etc)
+
+        # Load the document
+        success = parser.load_document(doc_key, doc_file)
+        # parser.set_loglevel(args.log_level)
+
+        # Get number of pages
+        num_pages = parser.number_of_pages(doc_key)
+        print("#-pages: ", num_pages)
+
+        failed = False
+
+        try:
+            # Parse page by page to minimize memory footprint
+            for page in range(0, num_pages):
+                json_doc = parser.parse_pdf_from_key_on_page(doc_key, page)
+
+                if "pages" not in json_doc:  # page could not get parsed
+                    print(f"ERROR: page {page} is not parsed ... ")
+                    failed = True
+                else:
+                    print(f"SUCCESS: page {page} is parsed ... ")
+        except Exception as e:
+            print(f"ERROR: page {page} is not parsed: {e}")
+            failed = True
+
+        # Unload the document
+        parser.unload_document(doc_key)
+
+        overview.append([doc_file, (not failed), num_pages])
+
+    print(tabulate(overview, headers=["filename", "success", "#-pages"]))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/proj_folders/pdf_library/core/object/cmap.h b/src/proj_folders/pdf_library/core/object/cmap.h
@@ -200,8 +200,15 @@ namespace pdf_lib
 	auto itr = src.begin();
 	c = utf8::next(itr, src.end());	  
       }
-      assert(_range.first<=c and c<=_range.second);
+      //assert(_range.first<=c and c<=_range.second);
 
+      if(c<_range.first or _range.second<c)
+	{
+	  logging_lib::warn("pdf-parser") << __FILE__ << ":" << __LINE__ 
+					  << "\tchar-index " << c << " for " << tgt
+					  << " is out of range [" << _range.first << ", " << _range.second << "]";
+	}
+
       _map[c] = tgt;
 
       return *this;
@@ -239,8 +246,15 @@ namespace pdf_lib
 	{
 	  for(uint32_t i = 0; i < end - begin + 1; i++)
 	    {
-	      assert(_range.first<=begin+i and begin+i<=_range.second);
+	      //assert(_range.first<=begin+i and begin+i<=_range.second);
 
+	      if(begin+i<_range.first or _range.second<begin+i)
+		{
+		  logging_lib::warn("pdf-parser") << __FILE__ << ":" << __LINE__ 
+						  << "\tchar-index " << begin+i //<< " for " << tgt.at(i)
+						  << " is out of range [" << _range.first << ", " << _range.second << "]";	      		  
+		}
+
 	      try
 		{
 		  std::string tmp(16, 0);
@@ -254,6 +268,9 @@ namespace pdf_lib
 		}
 	      catch(...)
 		{
+		  logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__  << "\t"
+						   << "could not determine char-value for cmap at index " << (begin+i);
+
 		  _map[begin + i] = "UNICODE<"+std::to_string(begin+i)+">";
 		}
 	    }
@@ -262,8 +279,15 @@ namespace pdf_lib
 	{
 	  for(uint32_t i = 0; i < end - begin + 1; i++)
 	    {
-	      assert(_range.first<=begin+i and begin+i<=_range.second);
+	      //assert(_range.first<=begin+i and begin+i<=_range.second);
 
+	      if(begin+i<_range.first or _range.second<begin+i)
+		{
+		  logging_lib::warn("pdf-parser") << __FILE__ << ":" << __LINE__ 
+						  << "\tchar-index " << begin+i //<< " for " << tgt.at(i)
+						  << " is out of range [" << _range.first << ", " << _range.second << "]";	      		  
+		}
+
 	      try
 		{
 		  std::string tmp(128, 0);
@@ -280,6 +304,8 @@ namespace pdf_lib
 		}
 	      catch(...)
 		{
+		  logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__  << "\t"
+						   << "could not determine char-value for cmap at index " << (begin+i);
 		  _map[begin + i] = "UNICODE<"+std::to_string(begin+i)+">";		  
 		}
 
@@ -336,8 +362,15 @@ namespace pdf_lib
 	  //if(begin + i>255)
 	  //std::cout << src_begin << "\t" << tgt.at(i) << "\t" << __FUNCTION__ << "\n" ;
 
-	  assert(_range.first<=begin+i and begin+i<=_range.second);
+	  //assert(_range.first<=begin+i and begin+i<=_range.second);
 
+	  if(begin+i<_range.first or _range.second<begin+i)
+	    {
+	      logging_lib::warn("pdf-parser") << __FILE__ << ":" << __LINE__ 
+					      << "\tchar-index " << begin+i << " for " << tgt.at(i)
+					      << " is out of range [" << _range.first << ", " << _range.second << "]";	      
+	    }
+
 	  _map[begin + i] = tgt.at(i);
 
 	  //std::cout << __FUNCTION__ << "-2\t[" << src_begin << ":" << src_end