issue alexgand#19 - check for corrupted/invalid files

- check whether PDFs are valid using PyPDF2
- check whether ePubs (= Zip archives) are valid using zipfile
- try 3 times, then give up and continue
- print error information
- can be recovered/retried by running the downloader again
pjungermann · Apr 16, 2020 · 40af2ca · 40af2ca
1 parent 807e8cc
commit 40af2ca
Show file tree

Hide file tree

Showing 3 changed files with 53 additions and 8 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1 @@
+/downloads
diff --git a/main.py b/main.py
@@ -3,8 +3,37 @@
 import os
 import requests
 import pandas as pd
+import PyPDF2
+import zipfile
 from tqdm import tqdm
 
+
+def is_valid_pdf(filename):
+    """Return True if PyPDF2 can open `filename` as a PDF, otherwise False.
+
+    A message naming the file is printed when PyPDF2 raises PdfReadError.
+    """
+    try:
+        # Close the handle before returning: the download loop removes
+        # rejected files, which can fail while the file is still open.
+        with open(filename, 'rb') as pdf_file:
+            PyPDF2.PdfFileReader(pdf_file, strict=False)
+        return True
+
+    except PyPDF2.utils.PdfReadError:
+        print(f'PDF corrupted or not a PDF: {filename}')
+        return False
+
+def is_valid_epub(filename):
+    """Return True if `filename` is a Zip archive whose members all pass testzip().
+
+    An ePub is a Zip container, so it is checked with the zipfile module.
+    A message naming the file is printed whenever the check fails.
+    """
+    intact = False
+    if zipfile.is_zipfile(filename):
+        try:
+            with zipfile.ZipFile(filename, 'r') as archive:
+                # testzip() returns the name of the first bad member, or None.
+                intact = archive.testzip() is None
+        except zipfile.BadZipFile:
+            intact = False
+
+    if not intact:
+        print(f'ePub corrupted or not an ePub: {filename}')
+    return intact
+
+
 # insert here the folder you want the books to be downloaded:
 folder = os.path.join(os.getcwd(), 'downloads')
 
@@ -42,11 +71,17 @@
     output_file = os.path.join(new_folder, final)
 
     if not os.path.exists(output_file):
-        myfile = requests.get(new_url, allow_redirects=True)
-        try:
-            open(output_file, 'wb').write(myfile.content)
-        except OSError: 
-            print("Error: PDF filename is appears incorrect.")
+        # Download the PDF, retrying a corrupted result at most 3 times
+        # before giving up and continuing with the next step.
+        tries = 0
+        while tries < 3:
+            # Count every attempt; without this the loop never terminates
+            # when the server keeps returning an invalid file.
+            tries += 1
+            myfile = requests.get(new_url, allow_redirects=True)
+            try:
+                # Close the file before it is validated and possibly removed.
+                with open(output_file, 'wb') as out:
+                    out.write(myfile.content)
+                if is_valid_pdf(output_file):
+                    break
+                # Discard the invalid file so a later run downloads it again.
+                os.remove(output_file)
+            except OSError:
+                print("Error: PDF filename appears incorrect.")
+                break
 
         #download epub version too if exists
         new_url = r.url
@@ -59,12 +94,20 @@
         final = title.replace(',','-').replace('.','').replace('/',' ').replace(':',' ') + ' - ' + author.replace(',','-').replace('.','').replace('/',' ').replace(':',' ') + ' - ' + final
         output_file = os.path.join(new_folder, final)
 
-        request = requests.get(new_url)
-        if request.status_code == 200:
+        # Download the ePub, retrying a corrupted result at most 3 times
+        # before giving up; a non-200 response means there is no ePub.
+        tries = 0
+        while tries < 3:
+            # Count every attempt; without this the loop never terminates
+            # when the server keeps returning an invalid file.
+            tries += 1
+            request = requests.get(new_url)
+            if request.status_code != 200:
+                break
+
             myfile = requests.get(new_url, allow_redirects=True)
             try:
                 open(output_file, 'wb').write(myfile.content)
+                if is_valid_epub(output_file):
+                    break
+                os.remove(output_file)
             except OSError: 
-                print("Error: EPUB filename is appears incorrect.")
+                print("Error: EPUB filename appears incorrect.")
+                break
 
 print('Download finished.')
diff --git a/requirements.txt b/requirements.txt
@@ -13,3 +13,4 @@ six==1.14.0
 tqdm==4.45.0
 urllib3==1.25.8
 xlrd==1.2.0
+PyPDF2==1.26.0