Skip to content

Commit

Permalink
fix: resolve more assert errors (#16)
Browse files Browse the repository at this point in the history
* added eval scripts

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fix errors in PMC

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* reformat code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
  • Loading branch information
PeterStaar-IBM authored Aug 30, 2024
1 parent e842657 commit c3a6b03
Show file tree
Hide file tree
Showing 3 changed files with 179 additions and 235 deletions.
104 changes: 104 additions & 0 deletions docling_parse/eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import argparse
import glob
import io
import os

from tabulate import tabulate

# from docling_parse.docling_parse import pdf_parser
import docling_parse
from docling_parse import pdf_parser


def main():
    """Run the docling-parse PDF parser over every PDF in a directory.

    Command-line arguments:
        -l / --log-level: integer log level handed to the parser (default 2).
        -d / --pdfdir:    directory searched (non-recursively) for ``*.pdf`` files.
        -m / --max-docs:  optional cap on the number of documents to process.

    Every document is loaded, parsed page by page and unloaded again. A
    document is reported as failed when it cannot be loaded, when a page
    result has no "pages" entry, or when parsing a page raises. A summary
    table (filename, success, #-pages) is printed at the end.

    Returns None; an invalid ``--pdfdir`` prints an error and returns early.
    """
    arg_parser = argparse.ArgumentParser(description="Process a PDF file.")

    # Verbosity of the underlying parser
    arg_parser.add_argument(
        "-l",
        "--log-level",
        type=int,
        required=False,
        default=2,
        help="log-level 1,2,3,4",
    )

    # Directory that holds the PDF files to evaluate
    arg_parser.add_argument(
        "-d",
        "--pdfdir",
        type=str,
        help="Path to the directory with PDF files",
        required=True,
    )

    # Optional upper bound on the number of processed documents
    arg_parser.add_argument(
        "-m",
        "--max-docs",
        type=int,
        required=False,
        default=None,
        help="max number of documents to run on",
    )

    args = arg_parser.parse_args()
    print(f"The provided PDF path is: {args.pdfdir}")

    # A regular file would pass an `exists` check but can never match the
    # "*.pdf" glob below, so insist on a directory.
    if not os.path.isdir(args.pdfdir):
        print(
            f"Error: The directory {args.pdfdir} does not exist or is not a directory."
        )
        return

    # Separate name so the argparse parser above is not shadowed.
    pdf = docling_parse.pdf_parser()
    pdf.set_loglevel(args.log_level)

    overview = []

    doc_files = sorted(glob.glob(os.path.join(args.pdfdir, "*.pdf")))
    if args.max_docs is not None:
        doc_files = doc_files[: args.max_docs]

    for doc_file in doc_files:
        print(doc_file)

        doc_key = f"key={doc_file}"  # unique document key (eg hash, UUID, etc)

        # Load the document.
        # NOTE(review): assumes load_document returns a truthy value on
        # success, as the original variable name `success` suggests.
        success = pdf.load_document(doc_key, doc_file)
        if not success:
            print(f"ERROR: document {doc_file} could not be loaded ... ")
            overview.append([doc_file, False, 0])
            continue

        failed = False
        num_pages = 0

        try:
            # Get number of pages
            num_pages = pdf.number_of_pages(doc_key)
            print("#-pages: ", num_pages)

            # Parse page by page to minimize memory footprint
            for page in range(num_pages):
                try:
                    json_doc = pdf.parse_pdf_from_key_on_page(doc_key, page)
                except Exception as e:
                    # As before, an exception aborts the remaining pages.
                    print(f"ERROR: page {page} is not parsed: {e}")
                    failed = True
                    break

                if "pages" not in json_doc:  # page could not get parsed
                    print(f"ERROR: page {page} is not parsed ... ")
                    failed = True
                else:
                    print(f"SUCCESS: page {page} is parsed ... ")
        finally:
            # Always release the document, even if something above raised.
            pdf.unload_document(doc_key)

        overview.append([doc_file, (not failed), num_pages])

    print(tabulate(overview, headers=["filename", "success", "#-pages"]))


# Run the evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
41 changes: 37 additions & 4 deletions src/proj_folders/pdf_library/core/object/cmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,15 @@ namespace pdf_lib
auto itr = src.begin();
c = utf8::next(itr, src.end());
}
assert(_range.first<=c and c<=_range.second);
//assert(_range.first<=c and c<=_range.second);

if(c<_range.first or _range.second<c)
{
logging_lib::warn("pdf-parser") << __FILE__ << ":" << __LINE__
<< "\tchar-index " << c << " for " << tgt
<< " is out of range [" << _range.first << ", " << _range.second << "]";
}

_map[c] = tgt;

return *this;
Expand Down Expand Up @@ -239,8 +246,15 @@ namespace pdf_lib
{
for(uint32_t i = 0; i < end - begin + 1; i++)
{
assert(_range.first<=begin+i and begin+i<=_range.second);
//assert(_range.first<=begin+i and begin+i<=_range.second);

if(begin+i<_range.first or _range.second<begin+i)
{
logging_lib::warn("pdf-parser") << __FILE__ << ":" << __LINE__
<< "\tchar-index " << begin+i //<< " for " << tgt.at(i)
<< " is out of range [" << _range.first << ", " << _range.second << "]";
}

try
{
std::string tmp(16, 0);
Expand All @@ -254,6 +268,9 @@ namespace pdf_lib
}
catch(...)
{
logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t"
<< "could not determine char-value for cmap at index " << (begin+i);

_map[begin + i] = "UNICODE<"+std::to_string(begin+i)+">";
}
}
Expand All @@ -262,8 +279,15 @@ namespace pdf_lib
{
for(uint32_t i = 0; i < end - begin + 1; i++)
{
assert(_range.first<=begin+i and begin+i<=_range.second);
//assert(_range.first<=begin+i and begin+i<=_range.second);

if(begin+i<_range.first or _range.second<begin+i)
{
logging_lib::warn("pdf-parser") << __FILE__ << ":" << __LINE__
<< "\tchar-index " << begin+i //<< " for " << tgt.at(i)
<< " is out of range [" << _range.first << ", " << _range.second << "]";
}

try
{
std::string tmp(128, 0);
Expand All @@ -280,6 +304,8 @@ namespace pdf_lib
}
catch(...)
{
logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t"
<< "could not determine char-value for cmap at index " << (begin+i);
_map[begin + i] = "UNICODE<"+std::to_string(begin+i)+">";
}

Expand Down Expand Up @@ -336,8 +362,15 @@ namespace pdf_lib
//if(begin + i>255)
//std::cout << src_begin << "\t" << tgt.at(i) << "\t" << __FUNCTION__ << "\n" ;

assert(_range.first<=begin+i and begin+i<=_range.second);
//assert(_range.first<=begin+i and begin+i<=_range.second);

if(begin+i<_range.first or _range.second<begin+i)
{
logging_lib::warn("pdf-parser") << __FILE__ << ":" << __LINE__
<< "\tchar-index " << begin+i << " for " << tgt.at(i)
<< " is out of range [" << _range.first << ", " << _range.second << "]";
}

_map[begin + i] = tgt.at(i);

//std::cout << __FUNCTION__ << "-2\t[" << src_begin << ":" << src_end
Expand Down
Loading

0 comments on commit c3a6b03

Please sign in to comment.