From 97ba691c04c59fb84c5ef21eb8ba718b05259045 Mon Sep 17 00:00:00 2001
From: Patrick Jungermann <Patrick.Jungermann@gmail.com>
Date: Thu, 16 Apr 2020 02:54:30 +0200
Subject: [PATCH] issue #19 - check for corrupted/invalid files

- check whether PDFs are valid using PyPDF2
- check whether ePubs (=Zips) are valid using zipfile
- retry a download up to 3 times when the file is invalid, then give up and continue
    - print error information
    - a failed download can be retried by running the downloader again
---
 .dockerignore    |  1 +
 main.py          | 61 +++++++++++++++++++++++++++++++++++++++++-------
 requirements.txt |  1 +
 3 files changed, 55 insertions(+), 8 deletions(-)
 create mode 100644 .dockerignore

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..ddb0e92
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+/downloads
diff --git a/main.py b/main.py
index a25e101..d5eadb4 100644
--- a/main.py
+++ b/main.py
@@ -3,8 +3,37 @@
 import os
 import requests
 import pandas as pd
+import PyPDF2
+import zipfile
 from tqdm import tqdm
 
+
+def is_valid_pdf(filename):  # True if PyPDF2 can parse the file; else prints a notice and returns False
+    try:
+        with open(filename, 'rb') as pdf_file:  # close the handle so the caller can delete a bad file
+            PyPDF2.PdfFileReader(pdf_file, strict=False)  # only constructed to trigger parsing
+        return True
+    except PyPDF2.utils.PdfReadError:
+        print(f'PDF corrupted or not a PDF: {filename}')
+        return False
+
+def is_valid_epub(filename):
+    """Return True if `filename` is a zip archive whose members all pass testzip().
+
+    Otherwise print a notice and return False.
+    """
+    archive_ok = False
+    if zipfile.is_zipfile(filename):
+        try:
+            with zipfile.ZipFile(filename, 'r') as epub_archive:
+                archive_ok = epub_archive.testzip() is None  # None means no bad member
+        except zipfile.BadZipFile:
+            pass  # leave archive_ok False
+    if not archive_ok:
+        print(f'ePub corrupted or not an ePub: {filename}')
+    return archive_ok
+
+
 # insert here the folder you want the books to be downloaded:
 folder = os.path.join(os.getcwd(), 'downloads')
 
@@ -42,11 +71,18 @@
     output_file = os.path.join(new_folder, final)
 
     if not os.path.exists(output_file):
-        myfile = requests.get(new_url, allow_redirects=True)
-        try:
-            open(output_file, 'wb').write(myfile.content)
-        except OSError: 
-            print("Error: PDF filename is appears incorrect.")
+        tries = 0
+        while tries < 3:
+            myfile = requests.get(new_url, allow_redirects=True)
+            try:
+                open(output_file, 'wb').write(myfile.content)
+                if is_valid_pdf(output_file):
+                    break
+                os.remove(output_file)
+                tries += 1
+            except OSError: 
+                print("Error: PDF filename appears incorrect.")
+                break
         
         #download epub version too if exists
         new_url = r.url
@@ -59,12 +95,21 @@
         final = title.replace(',','-').replace('.','').replace('/',' ').replace(':',' ') + ' - ' + author.replace(',','-').replace('.','').replace('/',' ').replace(':',' ') + ' - ' + final
         output_file = os.path.join(new_folder, final)
         
-        request = requests.get(new_url)
-        if request.status_code == 200:
+        tries = 0
+        while tries < 3:
+            request = requests.get(new_url)
+            if request.status_code != 200:
+                break
+
             myfile = requests.get(new_url, allow_redirects=True)
             try:
                 open(output_file, 'wb').write(myfile.content)
+                if is_valid_epub(output_file):
+                    break
+                os.remove(output_file)
+                tries += 1
             except OSError: 
-                print("Error: EPUB filename is appears incorrect.")
+                print("Error: EPUB filename appears incorrect.")
+                break
             
 print('Download finished.')
diff --git a/requirements.txt b/requirements.txt
index 26bb32c..6d91eed 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ six==1.14.0
 tqdm==4.45.0
 urllib3==1.25.8
 xlrd==1.2.0
+PyPDF2==1.26.0