# pdf2textlib.py
# Forked from FSOCIETY06/pdf2textlib.
import PIL.Image
import pdf2image
import pytesseract
from googletrans import Translator
# Rendering options that pdftopil() forwards to pdf2image.convert_from_path.
DPI = 200
FORMAT = "jpg"
THREAD_COUNT = 1

# Page range passed as first_page / last_page.
FIRST_PAGE = None
LAST_PAGE = None

# Remaining convert_from_path keyword values (output_folder, userpw,
# use_cropbox, strict).
OUTPUT_FOLDER = None
USERPWD = None
USE_CROPBOX = False
STRICT = False

# Module-level counter. The functions in this file that use the name `index`
# assign their own local variable, so this value is not read by them.
index = 0
def pdftopil(PDF_PATH):
    """Render the PDF at ``PDF_PATH`` into images via pdf2image.

    Every rendering option is taken from the module-level settings
    (``DPI``, ``OUTPUT_FOLDER``, ``FIRST_PAGE``, ``LAST_PAGE``, ``FORMAT``,
    ``THREAD_COUNT``, ``USERPWD``, ``USE_CROPBOX``, ``STRICT``).

    Args:
        PDF_PATH: Path of the PDF file, passed straight to
            ``pdf2image.convert_from_path``.

    Returns:
        The result of ``pdf2image.convert_from_path`` (the rendered page
        images).
    """
    render_options = {
        "dpi": DPI,
        "output_folder": OUTPUT_FOLDER,
        "first_page": FIRST_PAGE,
        "last_page": LAST_PAGE,
        "fmt": FORMAT,
        "thread_count": THREAD_COUNT,
        "userpw": USERPWD,
        "use_cropbox": USE_CROPBOX,
        "strict": STRICT,
    }
    return pdf2image.convert_from_path(PDF_PATH, **render_options)
def save_images(pil_images):
    r"""Save every image as ``pages\page_<n>.jpg`` and print the page count.

    Pages are numbered from 1 in iteration order. The file-name string is
    kept exactly as it was (including the backslash) because the text
    extraction code in this module re-opens the files by that same string.

    Args:
        pil_images: Iterable of objects exposing ``save(path)``.

    Returns:
        The number of saved pages plus one, i.e. the exclusive upper bound
        for ``range(1, result)`` over the saved page numbers.
    """
    import os

    prefix = "pages\\page_"

    # Where the backslash is a path separator (Windows) the images go into a
    # "pages" folder, which must exist before saving. Elsewhere the backslash
    # is an ordinary file-name character, dirname() is empty, and nothing
    # needs creating.
    folder = os.path.dirname(prefix)
    if folder:
        os.makedirs(folder, exist_ok=True)

    count = 0
    for count, image in enumerate(pil_images, start=1):
        image.save(prefix + str(count) + ".jpg")

    print("Number of pages :", count)
    return count + 1
def search(output, s):
    """Report whether every word of ``s`` occurs as a word in ``output``.

    Both strings are split on whitespace and words are compared exactly
    (case-sensitive, no punctuation stripping). Prints ``"Found"`` or
    ``"Not Found"`` accordingly.

    Args:
        output: Text to search in.
        s: Query; one or more whitespace-separated words.

    Returns:
        1 if all query words are present, otherwise 0. An empty query
        yields 0.
    """
    wanted = s.split()
    # Split the text once and use a set so each membership test is O(1).
    available = set(output.split())

    # Every query word must match; previously only the last word decided the
    # result. bool(wanted) keeps an empty query as "Not Found".
    found = bool(wanted) and all(word in available for word in wanted)

    if found:
        print("Found")
        return 1
    print("Not Found")
    return 0
def translate(s, dest):
    """Translate ``s`` into the language ``dest`` using googletrans.

    A new ``Translator`` is created on every call.

    Args:
        s: Text handed to ``Translator.translate``.
        dest: Destination language, passed as the ``dest`` keyword.

    Returns:
        Whatever ``Translator.translate`` returns, unmodified.
    """
    return Translator().translate(s, dest=dest)
def getText(PDF_PATH, lang='eng'):
    """OCR every page of a PDF and print the recognised text.

    The PDF is rendered to images, the images are written to disk by
    ``save_images``, and each saved page is then read back and passed to
    Tesseract. A line of underscores is appended after each page's text.

    Python has no overloading: two ``def getText`` statements leave only the
    last one callable. A single definition with a default ``lang`` serves
    both ``getText(path)`` and ``getText(path, "urd+tel+eng")``.

    Args:
        PDF_PATH: Path of the PDF file to process.
        lang: Language value passed to ``pytesseract.image_to_string``
            (for example ``"eng"`` or ``"urd+tel+eng"``). Defaults to
            ``'eng'``.

    Returns:
        The concatenated text of all pages, which is also printed.
    """
    index = save_images(pdftopil(PDF_PATH))
    separator = "\n______________________________________________________________________\n"

    chunks = []
    for i in range(1, index):
        # Close the page file as soon as it has been converted rather than
        # leaving the handle open until garbage collection.
        with PIL.Image.open('pages\\page_' + str(i) + '.jpg') as page:
            chunks.append(
                pytesseract.image_to_string(page.convert("RGB"), lang=lang)
            )
        chunks.append(separator)

    output = "".join(chunks)
    print(output)
    return output
if __name__ == "__main__":
    import sys

    # Take the PDF path from the command line when one is given. With no
    # argument, fall back to the original demo document so running the script
    # bare behaves as before.
    default_pdf = "C:\\Users\\Kingsmanvk\\PycharmProjects\\selfPRO\\sih\\demo.pdf"
    getText(sys.argv[1] if len(sys.argv) > 1 else default_pdf)