forked from anjesh/pdf-processor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPdfProcessor.py
98 lines (83 loc) · 3.94 KB
/
PdfProcessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from os import listdir
import os.path
import json
from pdftools.PdfInfo import *
from pdftools.PdfToText import *
from pdftools.PdfTkSeparate import *
from pdftools.PdfSeparate import *
from abbyy.AbbyyPdfTextExtractor import *
import ProcessLogger
class PDFProcessor:
    """
    Drives the processing of a single pdf file: reads its metadata, splits it
    into pages, measures how much text it contains and extracts that text
    either directly (structured pdf) or through Abbyy OCR (scanned pdf).

    Constructing an instance immediately runs process() and
    processToCheckStructured(), so the constructor raises if the pdf is
    encrypted.
    """

    logger = ProcessLogger.getLogger('PDFProcessor')

    # Average number of text bytes per page above which the pdf is
    # considered structured (i.e. it carries real text, not just images).
    MIN_TEXT_BYTES_PER_PAGE = 500

    def __init__(self, filePath, outputDir, language):
        """
        filePath  -- path of the pdf to process
        outputDir -- directory receiving stats.json, the text dump and the
                     'pages' / 'text' sub-directories
        language  -- passed through to AbbyyPdfTextExtractor for scanned docs
        """
        self.filePath = filePath
        self.outputDir = outputDir
        self.language = language
        self.isEncrypted = False
        self.textContentSize = 0
        self.totalPages = 0
        self.fileSize = 0
        # Only needed for extractTextFromScannedDoc(); set it through
        # setConfigParser(). Initialised here so the attribute always exists.
        self.configParser = None
        self.process()
        self.processToCheckStructured()

    def setConfigParser(self, configParser):
        """
        Stores the config parser that provides the 'abbyy' section
        ('appid' and 'password' options) used for OCR.
        """
        self.configParser = configParser

    def process(self):
        """
        Reads page count, file size and encryption flag through PdfInfo and
        then separates the pdf into pages.

        Raises Exception if the pdf is encrypted; stats.json is written
        (with status "Encrypted") before raising.
        """
        self.logger.info('Processing %s', self.filePath)
        self.logger.info('Calling Pdfinfo')
        pdfInfo = PdfInfo(self.filePath)
        self.totalPages = pdfInfo.getPages()
        self.fileSize = pdfInfo.getFileSizeInBytes()
        self.logger.info('Total Pages: %d, File Size: %d bytes', self.totalPages, self.fileSize)
        self.isEncrypted = pdfInfo.isEncrypted()
        if self.isEncrypted:
            # Record the status first so that it survives the exception.
            self.writeStats()
            raise Exception("Pdf is encrypted. Can't do processing.")
        self.separatePdfPages()

    def processToCheckStructured(self):
        """
        dumps the entire pdf to text to get the size of the content
        """
        pdfToText = PdfToText(self.filePath, self.totalPages, self.outputDir)
        pdfToText.dumpPages()
        # Plain assignment rather than accumulation: this method already runs
        # once from __init__, so adding to the previous value would double
        # the size on any later call and could wrongly report "Structured".
        self.textContentSize = os.path.getsize(pdfToText.dumpedTextFilepath)
        self.logger.info('Text content size: %d bytes', self.textContentSize)
        self.logger.info('Structured? %s', self.isStructured())

    def isStructured(self):
        """
        assuming that text content should be at least MIN_TEXT_BYTES_PER_PAGE
        (500) bytes in average in each page to say that the pdf is structured
        """
        return self.textContentSize > (self.totalPages * self.MIN_TEXT_BYTES_PER_PAGE)

    def getStatus(self):
        """
        Returns "Encrypted", "Structured" or "Scanned".
        """
        if self.isEncrypted:
            return "Encrypted"
        return "Structured" if self.isStructured() else "Scanned"

    def writeStats(self):
        """
        Writes the page count and the status to stats.json in outputDir.
        """
        stats = {"pages": self.totalPages, "status": self.getStatus()}
        with open(os.path.join(self.outputDir, 'stats.json'), 'w') as outfile:
            json.dump(stats, outfile)
        self.logger.info('Writing %s to %s', json.dumps(stats), 'stats.json')

    def separatePdfPages(self):
        """
        Splits the pdf into pages under outputDir/pages using PdfTkSeparate,
        falling back to PdfSeparate when PdfTkSeparate reports a non-zero
        status.
        """
        pagesDir = os.path.join(self.outputDir, 'pages')
        self.logger.info('Calling PdfTkseparate: Separating pdf to pages at %s', pagesDir)
        pdfTkSeparate = PdfTkSeparate(self.filePath, pagesDir)
        pdfTkProcessStatus = pdfTkSeparate.extractPages()
        self.logger.info('PdfTkseparate Status: %s', pdfTkProcessStatus)
        if pdfTkProcessStatus != 0:
            self.logger.info('Calling Pdfseparate: Separating pdf to pages at %s', pagesDir)
            pdfSeparate = PdfSeparate(self.filePath, pagesDir)
            pdfSeparate.extractPages()

    def extractTextFromStructuredDoc(self):
        """
        extracts the text of each page into outputDir/text using PdfToText
        (PdfToText is expected to create the "text" dir -- see that class)
        """
        textDir = os.path.join(self.outputDir, 'text')
        self.logger.info('Calling Pdftotext: Dumping text pages at %s', textDir)
        pdfToText = PdfToText(self.filePath, self.totalPages, textDir)
        pdfToText.extractPages()

    def extractTextFromScannedDoc(self):
        """
        OCRs the separated pages through AbbyyPdfTextExtractor (makes api
        calls), reading outputDir/pages and writing to outputDir/text.

        Raises AttributeError if setConfigParser() has not been called,
        since the Abbyy credentials come from the config parser.
        """
        configParser = getattr(self, 'configParser', None)
        if configParser is None:
            # Fail before any extractor is built, with a message that names
            # the missing step instead of an opaque attribute lookup error.
            raise AttributeError('configParser is not set; call setConfigParser() before extractTextFromScannedDoc()')
        pagesDir = os.path.join(self.outputDir, 'pages')
        textDir = os.path.join(self.outputDir, 'text')
        self.logger.info('Calling Abbyy: OCR-ing %d pages at %s', self.totalPages, textDir)
        abbyyPdf = AbbyyPdfTextExtractor(pagesDir, textDir, self.totalPages, self.language)
        abbyyPdf.setApplicationCredentials(configParser.get('abbyy', 'appid'), configParser.get('abbyy', 'password'))
        abbyyPdf.extractPages()