-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathepub.py
107 lines (85 loc) · 3.32 KB
/
epub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
def flatten(item, result=None):
    """Recursively flatten nested lists/tuples into a single flat list.

    Parameters:
        item: an iterable that may contain nested lists/tuples (e.g. an
            epub table of contents).
        result: optional accumulator list; a new one is created when omitted.

    Returns:
        The flat list of all non-list/tuple leaves, in traversal order.

    Fix: the original used a mutable default (``result=[]``), so the
    accumulator was shared across calls and results leaked from one call
    into the next. A ``None`` sentinel restores per-call isolation.
    """
    if result is None:
        result = []
    for element in item:
        if isinstance(element, (list, tuple)):
            flatten(element, result)  # recurse, sharing the same accumulator
        else:
            result.append(element)
    return result
UID = 0
def get_chapter_name(doc, chapters):
    """Look up the TOC title whose href points at *doc*.

    Parameters:
        doc: an epub document item exposing ``file_name``.
        chapters: flat list of TOC link objects with ``href`` and ``title``.

    Returns:
        The matching TOC entry's title, or a generated ``"untitled-N"``
        placeholder (using the module-level UID counter) when no entry's
        href starts with the document's file name.
    """
    global UID
    target = doc.file_name
    for link in chapters:
        # hrefs may carry fragments ("ch1.xhtml#s1"), so prefix-match only
        if link.href.startswith(target):
            return link.title
    UID += 1
    return "untitled-" + str(UID)
def get_texts(doc, class_filter=None, remove_links=False):
    """Extract visible text from an epub document item.

    Parameters:
        doc: an epub document item exposing ``get_body_content()`` (bytes).
        class_filter: str of space-separated substrings, or an iterable of
            substrings, matched case-insensitively against element class
            names; matching elements (and parents left empty) are removed.
        remove_links: when True, ``<a>`` elements are removed as well.

    Returns:
        list[str] of non-empty text lines, roughly one per block element.

    Fix: the original declared ``class_filter=[]`` (a mutable default) but
    then called ``class_filter.split(' ')`` — a ``str`` method — so the
    default value crashed on any document containing classed tags. Both a
    space-separated string and an iterable of substrings are now accepted.
    """
    # Normalize class_filter into a list of filter substrings.
    if class_filter is None:
        filters = []
    elif isinstance(class_filter, str):
        filters = class_filter.split(' ')
    else:
        filters = list(class_filter)

    # Delete element e, then walk upward deleting any parents that are left
    # empty or whitespace-only by the removal.
    def recurs_del(e):
        p = e.parent
        e.decompose()
        if p and (p.text.isspace() or not p.text):
            recurs_del(p)

    # bs4 predicate: select <a> tags (when remove_links) and tags whose
    # class names contain any of the filter substrings.
    def custom_selector(tag):
        if remove_links and tag.name == 'a':
            return True
        if tag.has_attr('class'):
            for c in tag.get('class'):
                for f in filters:
                    if f in c.lower():
                        return True
        return False

    body = doc.get_body_content().decode('utf-8')
    body = body.replace('\n', '')
    # Re-insert newlines after block-level closing tags so get_text()
    # preserves paragraph boundaries after markup is stripped.
    splitters = ['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                 'blockquote', 'li', 'dt', 'dd']
    for s in splitters:
        tag = '</' + s + '>'
        body = body.replace(tag, tag + '\n')
    soup = BeautifulSoup(body, 'html.parser')
    for d in soup.find_all(custom_selector):
        recurs_del(d)  # remove filtered elements and now-empty parents
    texts = soup.get_text().strip()
    texts = texts.replace('\r', '').split('\n')  # drop \r, one line per block
    return list(filter(str.strip, texts))  # drop empty/whitespace-only lines
def extract(file, ignore_chapters=None, class_filter=None, start_chapter=None, end_chapter=None, remove_links=False):
    """Extract author/title/year/cover and chapter texts from an epub file.

    Parameters:
        file: path to the .epub file.
        ignore_chapters: chapter names (lowercase) to skip entirely.
        class_filter: passed through to get_texts() — substrings of CSS
            class names whose elements should be removed.
        start_chapter: chapter title at which extraction starts
            (case-insensitive); defaults to the first TOC entry.
        end_chapter: chapter title after which extraction stops (exact match).
        remove_links: passed through to get_texts() — strip <a> elements.

    Returns:
        (author, book_name, year, cover, [(chapter_name, [paragraph, ...]), ...])
        author/year/cover may be None when missing from the book.

    Fixes: mutable default arguments replaced with None sentinels; bare
    ``except:`` narrowed to ``except Exception:`` (still best-effort — a
    missing metadata field yields None rather than a crash); an explicit
    fresh accumulator is passed to flatten() so repeated extract() calls do
    not share TOC state; an empty TOC no longer crashes on ``chapters[0]``.
    """
    ignore_chapters = [] if ignore_chapters is None else ignore_chapters
    book = epub.read_epub(file, {"ignore_ncx": True})
    docs = book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    # pass a fresh accumulator explicitly — flatten's default is shared
    chapters = flatten(book.toc, [])
    if start_chapter is None and chapters:
        start_chapter = chapters[0].title
    images = book.get_items_of_type(ebooklib.ITEM_IMAGE)
    cover = next((x for x in images if 'cover' in x.file_name), None)
    # best-effort metadata lookups: absent/malformed entries become None
    try:
        author = book.get_metadata('DC', 'creator')[0][0]
    except Exception:
        author = None
    try:
        year = int(book.get_metadata('DC', 'date')[0][0][:4])
    except Exception:
        year = None
    try:
        title = book.get_metadata('DC', 'title')[0][0]
    except Exception:
        title = book.title
    # with no TOC at all (start_chapter still None), start at the first doc
    started = start_chapter is None
    content = []
    for doc in docs:
        chapter_name = get_chapter_name(doc, chapters)
        if started and 'titlepage' not in doc.file_name:
            pass  # already inside the wanted range; keep this doc
        elif start_chapter is not None and chapter_name.lower() == start_chapter.lower():
            started = True  # reached the requested starting chapter
        else:
            continue  # before the start, or a titlepage — skip
        if chapter_name.lower() in ignore_chapters:
            continue
        texts = get_texts(doc, class_filter=class_filter, remove_links=remove_links)
        if not texts:
            continue  # drop chapters with no extractable text
        content.append((chapter_name, texts))
        if chapter_name == end_chapter:
            break
    # reset the untitled-chapter counter for the next extraction
    global UID
    UID = 0
    return (author, title, year, cover, content)