Skip to content

Commit

Permalink
Gracefully exit if a document hasn't been OCR'd
Browse files: browse the repository at this point in the history
  • Loading branch information
duckduckgrayduck committed Aug 12, 2024
1 parent dcbaac6 commit 97d975c
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
""" Requires eyecite to find legal citations """
import sys
import csv
from documentcloud.addon import AddOn
from documentcloud.exceptions import DoesNotExistError
from eyecite import get_citations


Expand All @@ -12,7 +14,11 @@ def main(self):

for document in self.get_documents():
for page_number in range(1, document.page_count + 1):
page_text = document.get_page_text(page_number)
try:
page_text = document.get_page_text(page_number)
except DoesNotExistError:
self.set_message(f"Could not match regular expressions on document with id {document.id}, please OCR this document and run Regex Extractor again.")
sys.exit(0)
citation_list = get_citations(page_text)
tagged_citation_list = [
(document.title, document.id, page_number, citation)
Expand Down

0 comments on commit 97d975c

Please sign in to comment.