-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathIAM_preprocessing.py
62 lines (49 loc) · 1.96 KB
/
IAM_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pickle
from shutil import copyfile, rmtree
import re
import xml.etree.ElementTree as ET
import os
import argparse
import cv2
import numpy as np
from tqdm import tqdm
from utils import get_lexicon
def normalize_label(text):
    """Turn a raw transcription into the label embedded in an output file name.

    The text is lower-cased, every run of non-word characters (and every
    underscore) is replaced by a space, and every run of whitespace is then
    replaced by a single '-'.

    Args:
        text: transcription string (the 'text' attribute of a word element).

    Returns:
        The normalised label. It can start or end with '-' when the input
        starts or ends with punctuation, e.g. "Hello, World!" gives
        "hello-world-".
    """
    # Raw strings: "\W" / "\s" in a plain literal are invalid escape sequences.
    text = re.sub(r"\W+|_", " ", text.lower())
    return re.sub(r"\s+", "-", text)


def word_image_relpath(word_id):
    """Return the image path, relative to the words directory, for a word id.

    The first id component and the first two id components name the two
    nested directories, e.g. 'a01-000u-00-00' gives
    'a01/a01-000u/a01-000u-00-00.png'.

    Args:
        word_id: the 'id' attribute of a word element, '-' separated.

    Returns:
        The '/'-separated relative path of the word image.
    """
    parts = word_id.split('-')
    return parts[0] + '/' + '-'.join(parts[:2]) + '/' + word_id + '.png'


# Copies every usable word image into one flat directory, encoding the
# normalised transcription in the file name, and pickles the class mapping.
# python3 IAM_preprocessing.py -p ./data -np ./data/IAM_processed
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Process IAM dataset')
    # in path folder should be located /words dir and /xml_data dir
    # from original IAM dataset archive
    parser.add_argument('-p', '--path', required=True,
                        help='folder containing the words/ and xml_data/ dirs')
    parser.add_argument('-np', '--new_path', required=True,
                        help='output folder (removed and recreated on every run)')
    args = parser.parse_args()
    path, new_path = args.path, args.new_path

    # Maps each lexicon entry to its index in get_lexicon()'s iteration order.
    classes = {j: i for i, j in enumerate(get_lexicon())}

    # declare constants
    counters = [2, 30]  # minimum and maximum word lengths

    # Start from an empty output directory. Only "nothing to delete" is
    # ignored; any other failure to clear it is reported.
    try:
        rmtree(new_path)
    except FileNotFoundError:
        pass
    os.makedirs(new_path)

    xml_dir = os.path.join(path, 'xml_data')
    words_dir = os.path.join(path, 'words')

    c = 0  # number of images copied
    for xml in tqdm(os.listdir(xml_dir), desc='xml files'):
        root = ET.parse(os.path.join(xml_dir, xml)).getroot()
        # NOTE(review): root[1] is assumed to be the element whose children
        # are the text lines -- confirm against the IAM XML layout.
        for line in root[1]:
            for word in line:
                # Children without attributes carry no 'text'/'id'; skip them.
                if not word.attrib:
                    continue
                text = normalize_label(word.attrib['text'])
                # The length limits apply to the label without its separators.
                if not counters[0] <= len(text.replace('-', '')) <= counters[1]:
                    continue
                img_path = os.path.join(
                    words_dir, word_image_relpath(word.attrib['id']))
                # cv2.imread returns None for an image it cannot read, so
                # such images are skipped rather than copied.
                if cv2.imread(img_path) is None:
                    print("can't load file!")
                    continue
                copyfile(img_path, os.path.join(
                    new_path, "_" + text + "_" + word.attrib['id'] + '.png'))
                c += 1
    print(" [INFO] number of instances: %s" % c)

    # Context manager so the pickle file is flushed and closed.
    with open(os.path.join(new_path, 'classes.pickle.dat'), 'wb') as f:
        pickle.dump(classes, f)