Skip to content

Commit

Permalink
issue alexgand#19 - check for corrupted/invalid files
Browse files Browse the repository at this point in the history
- check whether PDFs are valid using PyPDF2
- check whether ePubs (=Zips) are valid using zipfile
- try 3 times and then give up and continue
    - print error information
    - can be recovered/tried again by running the downloader again
  • Loading branch information
pjungermann committed Apr 16, 2020
1 parent 807e8cc commit 40af2ca
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 8 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/downloads
59 changes: 51 additions & 8 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,37 @@
import os
import requests
import pandas as pd
import PyPDF2
import zipfile
from tqdm import tqdm


def is_valid_pdf(filename):
    """Return True if ``filename`` can be parsed as a PDF by PyPDF2.

    The file is opened in binary mode and handed to
    ``PyPDF2.PdfFileReader`` in non-strict mode. A ``PdfReadError`` is
    reported on stdout and converted into a ``False`` result.

    Args:
        filename: Path of the file to check.

    Returns:
        True when the reader is constructed without a ``PdfReadError``,
        False otherwise.

    Raises:
        OSError: If the file cannot be opened; this is not handled here.
    """
    try:
        # Close the handle deterministically: the caller deletes rejected
        # files right after this check, which fails on platforms that
        # refuse to remove a file that is still open.
        with open(filename, 'rb') as pdf_file:
            PyPDF2.PdfFileReader(pdf_file, strict=False)
        return True

    except PyPDF2.utils.PdfReadError:
        print(f'PDF corrupted or not a PDF: {filename}')
        return False

def is_valid_epub(filename):
    """Return True if ``filename`` is a readable, intact ZIP archive.

    An ePub is a ZIP container, so the check is done with ``zipfile``:
    the file must be recognised by ``zipfile.is_zipfile`` and every
    member must pass ``ZipFile.testzip()``. Any failure is reported on
    stdout and yields ``False``.

    Args:
        filename: Path of the file to check.

    Returns:
        True when the archive opens and all members read back cleanly,
        False otherwise.
    """
    # Local import keeps this function self-contained; zlib is only
    # needed to name the decompression error caught below.
    import zlib

    error_message = f'ePub corrupted or not an ePub: {filename}'

    if not zipfile.is_zipfile(filename):
        print(error_message)
        return False

    try:
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            all_valid = zip_ref.testzip() is None
    except (zipfile.BadZipFile, zlib.error, EOFError):
        # testzip() itself only traps BadZipFile (e.g. CRC mismatches).
        # A damaged deflate stream raises zlib.error and truncated member
        # data raises EOFError; both mean the download is unusable.
        print(error_message)
        return False

    if not all_valid:
        print(error_message)
    return all_valid


# Destination folder for the downloaded books; edit this path to store them
# somewhere else. Defaults to a "downloads" directory under the current
# working directory.
folder = os.path.join(os.getcwd(), 'downloads')

Expand Down Expand Up @@ -42,11 +71,17 @@
output_file = os.path.join(new_folder, final)

if not os.path.exists(output_file):
myfile = requests.get(new_url, allow_redirects=True)
try:
open(output_file, 'wb').write(myfile.content)
except OSError:
print("Error: PDF filename is appears incorrect.")
# Download the PDF and keep it only if it validates. At most three attempts
# are made; the loop counter advances on every pass so a persistently
# corrupted download cannot retry forever.
for tries in range(3):
    myfile = requests.get(new_url, allow_redirects=True)
    try:
        # Close the file before validating or deleting it.
        with open(output_file, 'wb') as pdf_out:
            pdf_out.write(myfile.content)
        if is_valid_pdf(output_file):
            break
        # Remove the rejected file so the os.path.exists() guard does not
        # treat it as an already-downloaded book on the next run.
        os.remove(output_file)
    except OSError:
        print("Error: PDF filename appears incorrect.")
        break

#download epub version too if exists
new_url = r.url
Expand All @@ -59,12 +94,20 @@
final = title.replace(',','-').replace('.','').replace('/',' ').replace(':',' ') + ' - ' + author.replace(',','-').replace('.','').replace('/',' ').replace(':',' ') + ' - ' + final
output_file = os.path.join(new_folder, final)

request = requests.get(new_url)
if request.status_code == 200:
# Download the ePub (if the server offers one) and keep it only if it
# validates. At most three attempts are made; the loop counter advances on
# every pass so a persistently corrupted download cannot retry forever.
for tries in range(3):
    # One request serves as both the availability check and the download;
    # requests follows redirects for GET either way.
    request = requests.get(new_url, allow_redirects=True)
    if request.status_code != 200:
        # No ePub edition available for this book.
        break

    try:
        # Close the file before validating or deleting it.
        with open(output_file, 'wb') as epub_out:
            epub_out.write(request.content)
        if is_valid_epub(output_file):
            break
        # Remove the rejected file so a later run can try again.
        os.remove(output_file)
    except OSError:
        print("Error: EPUB filename appears incorrect.")
        break

print('Download finished.')
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ six==1.14.0
tqdm==4.45.0
urllib3==1.25.8
xlrd==1.2.0
PyPDF2==1.26.0

0 comments on commit 40af2ca

Please sign in to comment.