From 40af2cae187fe5a114b7777760a24adc78e9ace0 Mon Sep 17 00:00:00 2001
From: Patrick Jungermann
Date: Thu, 16 Apr 2020 02:54:30 +0200
Subject: [PATCH] issue #19 - check for corrupted/invalid files

- check whether PDFs are valid using PyPDF2
- check whether ePubs (=Zips) are valid using zipfile
- try 3 times and then give up and continue
- print error information
- can be recovered/tried again by running the downloader again
---
 .dockerignore    |  1 +
 main.py          | 71 +++++++++++++++++++++++++++++++++++++++++------
 requirements.txt |  1 +
 3 files changed, 65 insertions(+), 8 deletions(-)
 create mode 100644 .dockerignore

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..ddb0e92
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+/downloads
diff --git a/main.py b/main.py
index a25e101..a2362e2 100644
--- a/main.py
+++ b/main.py
@@ -3,8 +3,43 @@
 import os
 import requests
 import pandas as pd
+import PyPDF2
+import zipfile
 from tqdm import tqdm
 
+
+def is_valid_pdf(filename):
+    """Return True if PyPDF2 can parse `filename`; print a notice and return False if not."""
+    try:
+        # Close the handle before returning so the caller can delete a
+        # corrupted file right away (an open handle blocks os.remove on Windows).
+        with open(filename, 'rb') as pdf_file:
+            PyPDF2.PdfFileReader(pdf_file, strict=False)
+        return True
+
+    except PyPDF2.utils.PdfReadError:
+        print(f'PDF corrupted or not a PDF: {filename}')
+        return False
+
+def is_valid_epub(filename):
+    """Return True if `filename` is an intact zip archive; print a notice and return False if not."""
+    if not zipfile.is_zipfile(filename):
+        print(f'ePub corrupted or not an ePub: {filename}')
+        return False
+
+    try:
+        with zipfile.ZipFile(filename, 'r') as zip_ref:
+            # testzip() returns the name of the first corrupt member, or None.
+            all_valid = zip_ref.testzip() is None
+            if not all_valid:
+                print(f'ePub corrupted or not an ePub: {filename}')
+            return all_valid
+
+    except zipfile.BadZipFile:
+        print(f'ePub corrupted or not an ePub: {filename}')
+        return False
+
+
 # insert here the folder you want the books to be downloaded:
 folder = os.path.join(os.getcwd(), 'downloads')
 
@@ -42,11 +77,20 @@
     output_file = os.path.join(new_folder, final)
 
     if not os.path.exists(output_file):
-        myfile = requests.get(new_url, allow_redirects=True)
-        try:
-            open(output_file, 'wb').write(myfile.content)
-        except OSError:
-            print("Error: PDF filename is appears incorrect.")
+        tries = 0
+        while tries < 3:
+            # Count every attempt; without this a persistently corrupted
+            # download would be retried forever.
+            tries += 1
+            myfile = requests.get(new_url, allow_redirects=True)
+            try:
+                open(output_file, 'wb').write(myfile.content)
+                if is_valid_pdf(output_file):
+                    break
+                os.remove(output_file)
+            except OSError:
+                print("Error: PDF filename appears incorrect.")
+                break
 
         #download epub version too if exists
         new_url = r.url
@@ -59,12 +103,23 @@
         final = title.replace(',','-').replace('.','').replace('/',' ').replace(':',' ') + ' - ' + author.replace(',','-').replace('.','').replace('/',' ').replace(':',' ') + ' - ' + final
         output_file = os.path.join(new_folder, final)
 
-        request = requests.get(new_url)
-        if request.status_code == 200:
+        tries = 0
+        while tries < 3:
+            # Count every attempt; without this a persistently corrupted
+            # download would be retried forever.
+            tries += 1
+            request = requests.get(new_url)
+            if request.status_code != 200:
+                break
+
             myfile = requests.get(new_url, allow_redirects=True)
             try:
                 open(output_file, 'wb').write(myfile.content)
+                if is_valid_epub(output_file):
+                    break
+                os.remove(output_file)
             except OSError:
-                print("Error: EPUB filename is appears incorrect.")
+                print("Error: EPUB filename appears incorrect.")
+                break
 
 print('Download finished.')
diff --git a/requirements.txt b/requirements.txt
index 26bb32c..6d91eed 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ six==1.14.0
 tqdm==4.45.0
 urllib3==1.25.8
 xlrd==1.2.0
+PyPDF2==1.26.0