Neural network improved [v0.1.0] (#1)

* Artifact - Best parameters for training the neural network (.json) - Face representation 'pickled' to avoid repeating the face recognition process (.dat) - Neural network model tuned * v0.1.0 BugFix - predict_image: Ensure that the classifier is loaded before applying face recognition - load_classifier_from_file: Avoid raising an exception when no valid classifier is written in the configuration. Instead handle only the training/tuning process - init_dataset: Ensure that the image is processable Enhancements - dump_model: Handle more use cases - Set model path as default if not provided - Set 'model' as file name if not provided - Dump model parameters + model name - init_peoples_list: Initialization of new people is now parallelized - init_peoples_list_core is delegated to execute the core work - init_dataset: Recognition and parsing of new faces is now parallelized - init_dataset_core is delegated to execute the core work New Features - unzip_data: Create method to unzip an archive in order to simplify the upload/storage phase - remove_dir: Create method to remove the data after it has been processed - dump_dataset: Create method to pickle the images processed and recognized - verify_extension: Create a code base to verify the type of the uploaded file - tune_network: Create method to tune the hyperparameters of the neural network - Handle zip file (apply face recognition process) - Handle pre-processed data (pickled dataset) - tuning: Search among every possible hyperparameter combination and train the neural network - verify_performance: Print a lot of useful information about the trained model
alessiosavi · May 19, 2019 · c936636 · c936636
1 parent 09ad753
commit c936636
Show file tree

Hide file tree

Showing 10 changed files with 316 additions and 50 deletions.
diff --git a/api/Api.py b/api/Api.py
@@ -2,13 +2,15 @@
 """
 Custom functions that will be wrapped to be HTTP compliant
 """
-
+import os
+import pickle
+import time
 import zipfile
 from logging import getLogger
 from os.path import join as path_join
 
 from datastructure.Response import Response
-from utils.util import print_prediction_on_image, random_string, remove_dir
+from utils.util import print_prediction_on_image, random_string, remove_dir, unzip_data
 
 log = getLogger()
 
@@ -23,7 +25,10 @@ def predict_image(img_path, clf, PREDICTION_PATH):
 	"""
 	response = Response()
 	log.debug("predict_image | Predicting {}".format(img_path))
-	prediction = clf.predict(img_path)
+	if clf is None:
+		prediction = None
+	else:
+		prediction = clf.predict(img_path)
 	log.debug("predict_image | Image analyzed!")
 	# Manage success
 	if prediction is not None and isinstance(prediction, list) and len(prediction) == 1:
@@ -91,3 +96,60 @@ def train_network(folder_uncompress, zip_file, clf):
 	response.description = "Model succesfully trained!"
 
 	return response.__dict__
+
+
+def tune_network(folder_uncompress, zip_file, clf):
+	"""
+	Train a new neural model with the zip file provided
+	:param folder_uncompress:
+	:param zip_file:
+	:param clf:
+	:return:
+	"""
+	log.debug("tune_network | uncompressing zip file ...")
+	check = verify_extension(zip_file.filename)
+	if check == "zip":  # Image provided
+		folder_name = unzip_data(folder_uncompress, zip_file)
+		log.debug("tune_network | zip file uncompressed!")
+		clf.init_peoples_list(peoples_path=folder_name)
+		dataset = clf.init_dataset()
+	elif check == "dat":
+		dataset = pickle.load(zip_file)
+	else:
+		dataset = None
+
+	if dataset is not None:
+		start_time = time.time()
+		neural_model_file = clf.tuning(dataset["X"], dataset["Y"])
+		elapsed_time = time.time() - start_time
+
+		log.debug("tune_network | Removing unzipped files")
+		if check == "zip":
+			# TODO: Refactor this method :/
+			remove_dir(folder_name)
+		response = Response()
+		response.status = "OK"
+		response.data = neural_model_file
+		response.description = "Model succesfully trained! | {}".format(
+			time.strftime("%H:%M:%S.%f", time.gmtime(elapsed_time)))
+	else:
+		response = Response()
+		response.error = "ERROR DURING LOADING DAT"
+	return response.__dict__
+
+
+def verify_extension(file):
+	"""
+	Wrapper for validate file
+	:param file:
+	:return:
+	"""
+	extension = os.path.splitext(file)[1]
+	log.debug("verify_extension | File: {} | Ext: {}".format(file, extension))
+	if extension == ".zip":
+		# In this case we have to analyze the photos
+		return "zip"
+	elif extension == ".dat":
+		# Photos have been alredy analyzed, dataset is ready!
+		return "dat"
+	return None
diff --git a/conf/test.json b/conf/test.json
@@ -23,7 +23,7 @@
 	"classifier": {
 		"trainin_dir": "dataset/images/",
 		"model_path": "dataset/model/",
-		"model": "model-20190519_125204.clf",
+		"model": "model-20190518_191827.clf",
 		"n_neighbors": "",
 		"knn_algo": ""
 	},

diff --git a/dataset/model/README.md b/dataset/model/README.md
@@ -0,0 +1,15 @@
+### Neural Network model folder
+
+This directory contains the models generated by training the neural network on the given images
+
+#### image_dataset-DATE_TIME.**dat**
+
+Contains the dataset parsed from the image
+
+#### model-DATE_TIME.**clf**
+
+The trained neural model, able to classify a given face
+
+#### model-DATE_TIME.**json**
+
+Contains the json configuration for training the neural network with the best parameters
diff --git a/dataset/model/model-20190518_191827.clf b/dataset/model/model-20190518_191827.clf
diff --git a/dataset/model/model-20190519_210950.clf b/dataset/model/model-20190519_210950.clf
diff --git a/dataset/model/model-20190519_210950.json b/dataset/model/model-20190519_210950.json
@@ -0,0 +1,10 @@
+{
+	"classifier_file": "dataset/model/model-20190519_210950",
+	"params": {
+		"algorithm": "ball_tree",
+		"metric": "minkowski",
+		"n_neighbors": 78,
+		"p": 2,
+		"weights": "distance"
+	}
+}
diff --git a/datastructure/Classifier.py b/datastructure/Classifier.py
@@ -2,19 +2,23 @@
 """
 Core utils for manage face recognition process
 """
-
+import json
 import logging
 import os
 import pickle
 from datetime import datetime
 from math import sqrt
+from multiprocessing.pool import ThreadPool
 from pprint import pformat
 
 import face_recognition
+from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, \
+	precision_score
+from sklearn.model_selection import GridSearchCV, train_test_split
 from sklearn.neighbors import KNeighborsClassifier
-from tqdm import tqdm
 
 from datastructure.Person import Person
+from utils.util import dump_dataset
 
 log = logging.getLogger()
 
@@ -104,7 +108,9 @@ def load_classifier_from_file(self, classifier_file):
 			err = "load_classifier_from_file | FATAL | Path {} DOES NOT EXIST ...".format(self.model_path)
 		if err is not None:
 			log.error(err)
-			raise Exception(err)
+			log.error("load_classifier_from_file | Seems that the model is gone :/ | Loading an empty classifier for "
+			          "training purpouse ...")
+			self.classifier = None
 		return
 
 	def train(self, X, Y):
@@ -116,23 +122,103 @@ def train(self, X, Y):
 		log.debug("train | START")
 		if self.classifier is not None:
 			log.debug("train | Training ...")
-			self.classifier.fit(X, Y)
+			X_train, x_test, Y_train, y_test = train_test_split(X, Y, test_size=0.25)
+			self.classifier.fit(X_train, Y_train)
 			log.debug("train | Model Trained!")
+			log.debug("train | Checking performance ...")
+			y_pred = self.classifier.predict(x_test)
+			# Static method
+			self.verify_performance(y_test, y_pred)
 			return self.dump_model(self.model_path, "model")
 
-	def dump_model(self, path, file):
+	def tuning(self, X, Y):
+		"""
+		Tune the hyperparameter of a new model by the given data [X] related to the given target [Y]
+
+		:param X:
+		:param Y:
+		:return:
+		"""
+		X_train, x_test, Y_train, y_test = train_test_split(X, Y, test_size=0.25)
+		self.classifier = KNeighborsClassifier()
+		# Hyperparameter of the neural network (KKN)
+
+		# n_neighbors_range = list(range(1, round(sqrt(len(X_train)))))  # n_neighbors <= n_samples
+		weights_range = ['uniform', 'distance']
+		metrics_range = ['minkowski', 'euclidean', 'manhattan']
+		# 'auto' will automagically choose an algorithm by the given value
+		algorithm_range = ['ball_tree', 'kd_tree', 'brute']
+		power_range = [1, 2]
+		nn_root = int(round(sqrt(len(X_train))))
+		parameter_space = {
+			# 'n_neighbors': list(range(1,nn_root)),
+			'n_neighbors': [nn_root],
+			'metric': metrics_range,
+			'weights': weights_range,
+			'algorithm': algorithm_range,
+			'p': power_range,
+		}
+		log.debug("tuning | Parameter -> {}".format(pformat(parameter_space)))
+		grid = GridSearchCV(self.classifier, parameter_space, cv=3, scoring='accuracy', verbose=10, n_jobs=3)
+		grid.fit(X_train, Y_train)
+		log.info("TUNING COMPLETE | DUMPING DATA!")
+		# log.info("tuning | Grid Scores: {}".format(pformat(grid.grid_scores_)))
+		log.info('Best parameters found: {}'.format(grid.best_params_))
+
+		y_pred = grid.predict(x_test)
+
+		log.info('Results on the test set: {}'.format(pformat(grid.score(x_test, y_test))))
+
+		self.verify_performance(y_test, y_pred)
+
+		return self.dump_model(params=grid.best_params_)
+
+	@staticmethod
+	def verify_performance(y_test, y_pred):
+		"""
+		Verify the performance of the result analyzing the known-predict result
+		:param y_test:
+		:param y_pred:
+		:return:
+		"""
+
+		log.debug("verify_performance | Analyzing performance ...")
+		# log.info("Computing classifier score --> {}".format(pformat(clf.score(y_test,y_pred))))
+		log.info("Classification Report: {}".format(pformat(classification_report(y_test, y_pred))))
+		log.info("balanced_accuracy_score: {}".format(pformat(balanced_accuracy_score(y_test, y_pred))))
+		log.info("accuracy_score: {}".format(pformat(accuracy_score(y_test, y_pred))))
+		log.info("precision_score: {}".format(pformat(precision_score(y_test, y_pred, average='weighted'))))
+
+	def dump_model(self, params, path=None, file=None):
 		"""
 		Dump the model to the given path, file
+
+		The pickled classifier is written to '<file>-<timestamp>.clf' and its configuration
+		(file name without extension + params) to '<file>-<timestamp>.json'.
+
+		:param params: parameters of the model, stored in the json configuration
 		:param path:
 		:param file:
+		:return: the configuration dict when the model has been dumped, None otherwise
 		"""
+		# Fall back to self.model_path only when it is an existing directory
+		if path is None:
+			if self.model_path is not None:
+				if os.path.exists(self.model_path) and os.path.isdir(self.model_path):
+					path = self.model_path
+		if file is None:
+			file = "model"
+
+		# NOTE(review): path is still None when no usable default has been found above,
+		# and os.path.isdir(None) raises TypeError instead of returning False
+		# NOTE(review): train() still calls dump_model(self.model_path, "model"), which now
+		# binds the model path to `params` and "model" to `path` - confirm and fix the caller
 		if os.path.isdir(path):
 			time_parsed = datetime.now().strftime('%Y%m%d_%H%M%S')
-			classifier_file = os.path.join(path, "{}-{}.clf".format(file, time_parsed))
+			classifier_file = os.path.join(path, "{}-{}".format(file, time_parsed))
+			config = {'classifier_file': classifier_file,
+			          'params': params
+			          }
+
 			log.debug("dump_model | Dumping model ... | Path: {} | File: {}".format(path, classifier_file))
-			with open(classifier_file, 'wb') as f:
+			# TODO: Save every model in a different folder
+			with open(classifier_file + ".clf", 'wb') as f:
 				pickle.dump(self.classifier, f)
-			return classifier_file
+			with open(classifier_file + ".json", 'w') as f:
+				json.dump(config, f)
+				log.info('dump_model | Configuration saved to {0}'.format(classifier_file))
+
+			return config
 
 	def init_peoples_list(self, peoples_path=None):
 		"""
@@ -144,18 +230,29 @@ def init_peoples_list(self, peoples_path=None):
 		log.debug("init_peoples_list | Initalizing people ...")
 		if peoples_path is not None and os.path.isdir(peoples_path):
 			self.training_dir = peoples_path
+		# freq_list = pool.map(partial(get_frequency, nlp=nlp_en, client=mongo_client), fileList)
+		pool = ThreadPool(3)
+		self.peoples_list = pool.map(self.init_peoples_list_core, os.listdir(self.training_dir))
+		self.peoples_list = list(filter(None.__ne__, self.peoples_list))  # Remove None
 
-		for people_name in tqdm(os.listdir(self.training_dir),
-		                        total=len(os.listdir(self.training_dir)), desc="Init people list ..."):
-			log.debug("init_peoples_list | Initalizing [{}]".format(people_name))
-			# Filter only folder
-			if os.path.isdir(os.path.join(self.training_dir, people_name)):
-				log.debug("{0}".format(os.path.join(self.training_dir, people_name)))
-				person = Person()
-				person.name = people_name
-				person.path = os.path.join(self.training_dir, people_name)
-				person.init_dataset()
-				self.peoples_list.append(person)
+	# TODO: Add a method to dump the data structure, so the same data does not have to be reloaded for every test
+
+	def init_peoples_list_core(self, people_name):
+		"""
+		Delegated core method for parallelize operation
+		:param people_name:
+		:return:
+		"""
+		if os.path.isdir(os.path.join(self.training_dir, people_name)):
+			log.debug("Initalizing people {0}".format(os.path.join(self.training_dir, people_name)))
+			person = Person()
+			person.name = people_name
+			person.path = os.path.join(self.training_dir, people_name)
+			person.init_dataset()
+			return person
+		else:
+			log.debug("People {0} invalid folder!".format(os.path.join(self.training_dir, people_name)))
+			return None
 
 	def init_dataset(self):
 		"""
@@ -175,9 +272,10 @@ def init_dataset(self):
 				DATASET["X"].append(item)
 			for item in people.dataset["Y"]:
 				DATASET["Y"].append(item)
-
+		dump_dataset(DATASET, self.model_path)
 		return DATASET
 
+	# TODO: Add a configuration parameter to choose the distance_threshold
 	def predict(self, X_img_path, distance_threshold=0.45):
 		"""
 		Recognizes faces in given image using a trained KNN classifier