Skip to content

Commit

Permalink
Neural network improved [v0.1.0] (#1)
Browse files Browse the repository at this point in the history
* Artifact
 - Best parameters for training the neural network (.json)
 - Face representation 'pickled' to avoid repeating the face recognition process (.dat)
 - Tuned neural network model

* v0.1.0

BugFix
- predict_image: Ensure that the classifier is loaded before applying face recognition
- load_classifier_from_file: Avoid raising an exception when no valid classifier is written in the configuration; in that case only the training/tuning process is handled
- init_dataset: Ensure that the image is processable

Enhancements
- dump_model: Handle more use cases
  - Set the model path as default if not provided
  - Set 'model' as file name if not provided
  - Dump model parameters + model name
- init_peoples_list: Initialization of new people is now parallelized
  - init_peoples_list_core is delegated to execute the core work
- init_dataset: Recognition and parsing of new faces is now parallelized
  - init_dataset_core is delegated to execute the core work

New Features
- unzip_data: Create method to unzip an archive in order to simplify the upload/storage phase
- remove_dir: Create method to remove the data after it has been processed
- dump_dataset: Create method to pickle the images that have been processed and recognized
- verify_extension: Create a base for verifying the type of the uploaded file
- tune_network: Create method to tune the hyperparameters of the neural network
  - Handle zip file (apply the face recognition process)
  - Handle pre-processed data (pickled dataset)
- tuning: Search among every possible hyperparameter combination and train the neural network
- verify_performance: Print a lot of useful information about the trained model
  • Loading branch information
Alessio Savi authored and GitHub Enterprise committed May 19, 2019
1 parent 16997c7 commit e3faf10
Show file tree
Hide file tree
Showing 10 changed files with 316 additions and 50 deletions.
68 changes: 65 additions & 3 deletions api/Api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
"""
Custom function that will be wrapped for be HTTP compliant
"""

import os
import pickle
import time
import zipfile
from logging import getLogger
from os.path import join as path_join

from datastructure.Response import Response
from utils.util import print_prediction_on_image, random_string, remove_dir
from utils.util import print_prediction_on_image, random_string, remove_dir, unzip_data

log = getLogger()

Expand All @@ -23,7 +25,10 @@ def predict_image(img_path, clf, PREDICTION_PATH):
"""
response = Response()
log.debug("predict_image | Predicting {}".format(img_path))
prediction = clf.predict(img_path)
if clf is None:
prediction = None
else:
prediction = clf.predict(img_path)
log.debug("predict_image | Image analyzed!")
# Manage success
if prediction is not None and isinstance(prediction, list) and len(prediction) == 1:
Expand Down Expand Up @@ -91,3 +96,60 @@ def train_network(folder_uncompress, zip_file, clf):
response.description = "Model succesfully trained!"

return response.__dict__


def tune_network(folder_uncompress, zip_file, clf):
    """
    Tune the hyperparameters of the neural model with the file provided.

    The upload is handled according to its extension: a zip archive is
    extracted and passed through the classifier's people/dataset
    initialization, a .dat file is loaded as an already pickled dataset.
    :param folder_uncompress: folder handed to unzip_data for a zip upload
    :param zip_file: uploaded file object; its `filename` is inspected and,
        for .dat uploads, the object itself is read by pickle.load
    :param clf: classifier exposing init_peoples_list, init_dataset and tuning
    :return: the Response instance exported as dictionary (`__dict__`)
    """
    log.debug("tune_network | uncompressing zip file ...")
    check = verify_extension(zip_file.filename)
    folder_name = None
    dataset = None
    try:
        if check == "zip":  # Image provided
            folder_name = unzip_data(folder_uncompress, zip_file)
            log.debug("tune_network | zip file uncompressed!")
            clf.init_peoples_list(peoples_path=folder_name)
            dataset = clf.init_dataset()
        elif check == "dat":
            # NOTE(security): pickle.load executes arbitrary code embedded in
            # the payload. Only accept .dat files from trusted users, or move
            # to a safe serialization format.
            dataset = pickle.load(zip_file)

        if dataset is None:
            response = Response()
            response.error = "ERROR DURING LOADING DAT"
            return response.__dict__

        start_time = time.time()
        neural_model_file = clf.tuning(dataset["X"], dataset["Y"])
        elapsed_time = time.time() - start_time
    finally:
        # Always clean the extracted pictures, even when tuning fails or the
        # dataset is empty, otherwise the folder is left behind on disk.
        if folder_name is not None:
            log.debug("tune_network | Removing unzipped files")
            remove_dir(folder_name)

    # time.strftime has no "%f" directive (it belongs to datetime), so the
    # elapsed time is formatted by hand as HH:MM:SS.ffffff.
    minutes, seconds = divmod(elapsed_time, 60)
    hours, minutes = divmod(int(minutes), 60)
    elapsed = "{:02d}:{:02d}:{:09.6f}".format(hours, minutes, seconds)

    response = Response()
    response.status = "OK"
    response.data = neural_model_file
    response.description = "Model succesfully trained! | {}".format(elapsed)
    return response.__dict__


def verify_extension(file):
    """
    Classify an uploaded file by its extension.

    The comparison is case-insensitive, so "photos.ZIP" is handled like
    "photos.zip".
    :param file: name (or path) of the uploaded file; may be None or empty
    :return: "zip" when the pictures still have to be analyzed, "dat" when the
        dataset has already been processed, None for anything else
    """
    if not file:
        # Nothing to inspect: os.path.splitext would raise on None
        log.debug("verify_extension | No file name provided")
        return None

    extension = os.path.splitext(file)[1]
    log.debug("verify_extension | File: {} | Ext: {}".format(file, extension))
    extension = extension.lower()
    if extension == ".zip":
        # In this case we have to analyze the photos
        return "zip"
    if extension == ".dat":
        # Photos have been already analyzed, dataset is ready!
        return "dat"
    return None
2 changes: 1 addition & 1 deletion conf/test.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"classifier": {
"trainin_dir": "dataset/images/",
"model_path": "dataset/model/",
"model": "model-20190519_125204.clf",
"model": "model-20190518_191827.clf",
"n_neighbors": "",
"knn_algo": ""
},
Expand Down
15 changes: 15 additions & 0 deletions dataset/model/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
### Neural Network model folder

This directory contains the models generated by training the neural network on the given images

#### image_dataset-DATE_TIME.**dat**

Contains the dataset parsed from the image

#### model-DATE_TIME.**clf**

Is the neural model able to classify a given face

#### model-DATE_TIME.**json**

Contains the json configuration for training the neural network with the best parameters
Binary file added dataset/model/image_dataset-20190519_210331.dat
Binary file not shown.
Binary file added dataset/model/model-20190519_210950.clf
Binary file not shown.
10 changes: 10 additions & 0 deletions dataset/model/model-20190519_210950.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"classifier_file": "dataset/model/model-20190519_210950",
"params": {
"algorithm": "ball_tree",
"metric": "minkowski",
"n_neighbors": 78,
"p": 2,
"weights": "distance"
}
}
138 changes: 118 additions & 20 deletions datastructure/Classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,23 @@
"""
Core utils for manage face recognition process
"""

import json
import logging
import os
import pickle
from datetime import datetime
from math import sqrt
from multiprocessing.pool import ThreadPool
from pprint import pformat

import face_recognition
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, \
precision_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

from datastructure.Person import Person
from utils.util import dump_dataset

log = logging.getLogger()

Expand Down Expand Up @@ -104,7 +108,9 @@ def load_classifier_from_file(self, classifier_file):
err = "load_classifier_from_file | FATAL | Path {} DOES NOT EXIST ...".format(self.model_path)
if err is not None:
log.error(err)
raise Exception(err)
log.error("load_classifier_from_file | Seems that the model is gone :/ | Loading an empty classifier for "
"training purpouse ...")
self.classifier = None
return

def train(self, X, Y):
Expand All @@ -116,23 +122,103 @@ def train(self, X, Y):
log.debug("train | START")
if self.classifier is not None:
log.debug("train | Training ...")
self.classifier.fit(X, Y)
X_train, x_test, Y_train, y_test = train_test_split(X, Y, test_size=0.25)
self.classifier.fit(X_train, Y_train)
log.debug("train | Model Trained!")
log.debug("train | Checking performance ...")
y_pred = self.classifier.predict(x_test)
# Static method
self.verify_performance(y_test, y_pred)
return self.dump_model(self.model_path, "model")

def dump_model(self, path, file):
def tuning(self, X, Y):
    """
    Tune the hyperparameters of a new KNN model with the given data [X] related to the given target [Y]

    A quarter of the data is held out as test set; the grid search runs on the
    remaining part and the best estimator found becomes `self.classifier`.
    :param X: samples used for the search (presumably face encodings - verify against caller)
    :param Y: target label of every sample in X
    :return: the value returned by dump_model for the best parameters found
    """
    X_train, x_test, Y_train, y_test = train_test_split(X, Y, test_size=0.25)
    self.classifier = KNeighborsClassifier()

    # Hyperparameters of the neural network (KNN)
    weights_range = ['uniform', 'distance']
    metrics_range = ['minkowski', 'euclidean', 'manhattan']
    # 'auto' will automagically choose an algorithm by the given value
    algorithm_range = ['ball_tree', 'kd_tree', 'brute']
    power_range = [1, 2]
    # Only the square root of the training size is searched as n_neighbors
    # (n_neighbors must be <= n_samples)
    nn_root = int(round(sqrt(len(X_train))))
    parameter_space = {
        'n_neighbors': [nn_root],
        'metric': metrics_range,
        'weights': weights_range,
        'algorithm': algorithm_range,
        'p': power_range,
    }
    log.debug("tuning | Parameter -> {}".format(pformat(parameter_space)))
    grid = GridSearchCV(self.classifier, parameter_space, cv=3, scoring='accuracy', verbose=10, n_jobs=3)
    grid.fit(X_train, Y_train)
    # GridSearchCV fits clones of the estimator it receives: without this
    # assignment self.classifier stays unfitted and dump_model would pickle
    # an untrained model. best_estimator_ is refitted on the whole train set.
    self.classifier = grid.best_estimator_
    log.info("TUNING COMPLETE | DUMPING DATA!")
    log.info('Best parameters found: {}'.format(grid.best_params_))

    y_pred = grid.predict(x_test)

    log.info('Results on the test set: {}'.format(pformat(grid.score(x_test, y_test))))

    self.verify_performance(y_test, y_pred)

    return self.dump_model(params=grid.best_params_)

@staticmethod
def verify_performance(y_test, y_pred):
    """
    Log the quality metrics obtained by comparing the known targets with the predicted ones
    :param y_test: expected targets
    :param y_pred: targets predicted by the model
    :return: None, the metrics are only written to the log
    """
    log.debug("verify_performance | Analyzing performance ...")

    # Every metric is computed and logged before moving to the next one
    report = classification_report(y_test, y_pred)
    log.info("Classification Report: {}".format(pformat(report)))

    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    log.info("balanced_accuracy_score: {}".format(pformat(balanced_accuracy)))

    accuracy = accuracy_score(y_test, y_pred)
    log.info("accuracy_score: {}".format(pformat(accuracy)))

    precision = precision_score(y_test, y_pred, average='weighted')
    log.info("precision_score: {}".format(pformat(precision)))

def dump_model(self, params, path=None, file=None):
    """
    Dump the model and its parameters to the given path, file

    Two files sharing the same timestamped base name are written: `.clf` with
    the pickled classifier and `.json` with the configuration returned.
    :param params: parameters of the model, stored in the json configuration
    :param path: destination folder; `self.model_path` is used when omitted
    :param file: prefix of the file name; "model" is used when omitted
    :return: dictionary with `classifier_file` (base path, no extension) and
        `params`, or None when no valid destination folder is available
    """
    if path is None:
        path = self.model_path
    if file is None:
        file = "model"

    # Guard both the missing and the invalid folder: os.path.isdir(None)
    # raises, and skipping the dump must not leave the result undefined.
    if path is None or not os.path.isdir(path):
        log.error("dump_model | Unable to dump the model, {} is not a valid folder".format(path))
        return None

    time_parsed = datetime.now().strftime('%Y%m%d_%H%M%S')
    classifier_file = os.path.join(path, "{}-{}".format(file, time_parsed))
    config = {'classifier_file': classifier_file,
              'params': params
              }

    log.debug("dump_model | Dumping model ... | Path: {} | File: {}".format(path, classifier_file))
    # TODO: Save every model in a different folder
    with open(classifier_file + ".clf", 'wb') as f:
        pickle.dump(self.classifier, f)
    with open(classifier_file + ".json", 'w') as f:
        json.dump(config, f)
    log.info('dump_model | Configuration saved to {0}'.format(classifier_file))

    return config

def init_peoples_list(self, peoples_path=None):
"""
Expand All @@ -144,18 +230,29 @@ def init_peoples_list(self, peoples_path=None):
log.debug("init_peoples_list | Initalizing people ...")
if peoples_path is not None and os.path.isdir(peoples_path):
self.training_dir = peoples_path
# freq_list = pool.map(partial(get_frequency, nlp=nlp_en, client=mongo_client), fileList)
pool = ThreadPool(3)
self.peoples_list = pool.map(self.init_peoples_list_core, os.listdir(self.training_dir))
self.peoples_list = list(filter(None.__ne__, self.peoples_list)) # Remove None

for people_name in tqdm(os.listdir(self.training_dir),
total=len(os.listdir(self.training_dir)), desc="Init people list ..."):
log.debug("init_peoples_list | Initalizing [{}]".format(people_name))
# Filter only folder
if os.path.isdir(os.path.join(self.training_dir, people_name)):
log.debug("{0}".format(os.path.join(self.training_dir, people_name)))
person = Person()
person.name = people_name
person.path = os.path.join(self.training_dir, people_name)
person.init_dataset()
self.peoples_list.append(person)
# TODO: Add method for dump datastructure in order to don't wait to load same data for test

def init_peoples_list_core(self, people_name):
    """
    Worker executed for every entry of the training folder when the people list is built in parallel
    :param people_name: name of the entry found inside `self.training_dir`
    :return: the initialized Person when the entry is a folder, None otherwise
    """
    person_folder = os.path.join(self.training_dir, people_name)

    # Only folders describe a person, everything else is discarded
    if not os.path.isdir(person_folder):
        log.debug("People {0} invalid folder!".format(person_folder))
        return None

    log.debug("Initalizing people {0}".format(person_folder))
    person = Person()
    person.name = people_name
    person.path = person_folder
    person.init_dataset()
    return person

def init_dataset(self):
"""
Expand All @@ -175,9 +272,10 @@ def init_dataset(self):
DATASET["X"].append(item)
for item in people.dataset["Y"]:
DATASET["Y"].append(item)

dump_dataset(DATASET, self.model_path)
return DATASET

# TODO: Add configuration parameter for choose the distance_threshold
def predict(self, X_img_path, distance_threshold=0.45):
"""
Recognizes faces in given image using a trained KNN classifier
Expand Down
Loading

0 comments on commit e3faf10

Please sign in to comment.