-
Notifications
You must be signed in to change notification settings - Fork 1.7k
/
Copy pathnotmnist.py
52 lines (43 loc) · 1.74 KB
/
notmnist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import os
from glob import glob
import numpy as np
from imageio import imread
from skimage.transform import resize
from sklearn.model_selection import train_test_split
def load_notmnist(path='./notMNIST_small', letters='ABCDEFGHIJ',
img_shape=(28, 28), test_size=0.25, one_hot=False):
# download data if it's missing. If you have any problems, go to the urls
# and load it manually.
if not os.path.exists(path):
print("Downloading data...")
assert os.system(
'wget http://yaroslavvb.com/upload/notMNIST/notMNIST_small.tar.gz') == 0
print("Extracting ...")
assert os.system(
'tar -zxvf notMNIST_small.tar.gz > untar_notmnist.log') == 0
data, labels = [], []
print("Parsing...")
for img_path in glob(os.path.join(path, '*/*')):
class_i = img_path.split(os.sep)[-2]
if class_i not in letters:
continue
try:
data.append(resize(imread(img_path), img_shape))
labels.append(class_i,)
except BaseException:
print(
"found broken img: %s [it's ok if <10 images are broken]" %
img_path)
data = np.stack(data)[:, None].astype('float32')
data = (data - np.mean(data)) / np.std(data)
# convert classes to ints
letter_to_i = {l: i for i, l in enumerate(letters)}
labels = np.array(list(map(letter_to_i.get, labels)))
if one_hot:
labels = (np.arange(np.max(labels) + 1)
[None, :] == labels[:, None]).astype('float32')
# split into train/test
X_train, X_test, y_train, y_test = train_test_split(
data, labels, test_size=test_size, random_state=42)
print("Done")
return X_train, y_train, X_test, y_test