forked from anjesh/pdf-processor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun.py
63 lines (51 loc) · 2.32 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import argparse
import os
import sys
from datetime import datetime
import configparser
import ProcessLogger
import traceback
from urllib.request import URLError, HTTPError
from PdfProcessor import PDFProcessor
# Languages accepted by -l/--language. Defined before the parser so the help
# text can be generated from it and cannot drift from the validated set.
allowed_languages = ["english", "french", "spanish", "portuguese", "arabic"]

# Command-line interface: -i and -o are mandatory, -l defaults to "english".
parser = argparse.ArgumentParser(description='Processes the pdf and extracts the text')
parser.add_argument(
    '-l', '--language',
    help='Language of input pdf file for transcription (%s).' % ', '.join(allowed_languages),
    required=False,
    default="english",
)
parser.add_argument('-i', '--infile', help='File path of the input pdf file.', required=True)
# NOTE(review): the help text below talks about a csv file name while the
# option is called --outdir; the wording is left unchanged -- confirm intent.
parser.add_argument('-o', '--outdir', help='File name of the output csv file.', required=True)

# Parses sys.argv immediately; argparse exits the process on invalid or
# missing arguments.
results = parser.parse_args()

# Placeholder so the name is bound at module level; it is rebound to a
# PDFProcessor instance once processing starts.
pdfProcessor = ""
# Main processing pipeline: validate the requested language, load
# settings.config from the script's directory, run the PDF processor, and log
# (rather than propagate) any failure.
#
# The logger is created before the ``try`` so that every ``except`` handler
# and the ``finally`` clause can rely on ``logger`` being bound; otherwise a
# failure in getLogger would surface as a NameError that hides the real error.
logger = ProcessLogger.getLogger('run')
try:
    logger.info("Processing started at %s ", str(datetime.now()))
    logger.info("input: %s", results.infile)
    logger.info("outdir: %s", results.outdir)

    # Language matching is case-insensitive.
    language = results.language.lower()
    if language not in allowed_languages:
        raise Exception("language should be one of english, french, spanish, portuguese or arabic")
    # The processor is handed "portuguesestandard" instead of "portuguese"
    # (presumably the name the underlying engine expects -- TODO confirm).
    if language == "portuguese":
        language = "portuguesestandard"
    # Every non-English selection gets ",english" appended to the language string.
    if language != "english":
        language += ",english"

    # settings.config is looked up next to this script, not in the current
    # working directory. RawConfigParser.read() silently skips files it
    # cannot open.
    configParser = configparser.RawConfigParser()
    configParser.read(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'settings.config'))

    pdfProcessor = PDFProcessor(results.infile, results.outdir, language)
    pdfProcessor.setConfigParser(configParser)
    pdfProcessor.writeStats()
    # Choose the extraction path based on what the processor reports.
    if pdfProcessor.isStructured():
        pdfProcessor.extractTextFromStructuredDoc()
    else:
        pdfProcessor.extractTextFromScannedDoc()
# Handlers are ordered most-specific first: HTTPError is a subclass of
# URLError, which in turn is a subclass of OSError. With URLError listed
# first the HTTPError handler could never run.
except HTTPError as e:
    logger.error("HTTPError: [%s] %s", e.code, e.reason)
    logger.debug(traceback.format_exception(*sys.exc_info()))
except URLError as e:
    logger.error("URLError: %s", e.reason)
    logger.debug(traceback.format_exception(*sys.exc_info()))
except OSError as e:
    logger.error("OSError: %s [%s] in %s", e.strerror, e.errno, e.filename)
    logger.debug(traceback.format_exception(*sys.exc_info()))
except Exception as e:
    # Top-level boundary: log anything else instead of letting it propagate.
    logger.error("Exception: %s ", e)
    logger.debug(traceback.format_exception(*sys.exc_info()))
finally:
    logger.info("Processing ended at %s ", str(datetime.now()))