From b212901c1a15a7c516749eee61de6c58655e7f54 Mon Sep 17 00:00:00 2001
From: AlessioSavi
Date: Tue, 21 May 2019 23:31:56 +0200
Subject: [PATCH] v0.1.1

Enhancements

Issue #2
- Model saving mechanism rewritten from scratch (using a timestamp as name)
- Every model will now be saved in a different directory
- Every piece of data related to the model (dataset + configuration) will be
  saved in the same folder
- Configuration file changed due to the new layout of the model folder
- dump_model (dataset) rewritten and migrated to utils
- dump_model (classifier) rewritten in order to be compliant with the new
  folder architecture
- Moved the parallelism from "different person" to "different image of the
  same person"
- Enabled a progress bar during face analysis
- The Response constructor will now accept parameters

Issue #4
- Created a function that retrieves the dataset from the input HTML form and
  returns it to the tune/train function
- Standardized and refactored the train/tune logic

BugFix
- Dump the real classifier (grid.best_estimator_)
---
 api/Api.py                  | 101 ++++++++--------------
 conf/test.json              |  13 ++-
 datastructure/Classifier.py | 162 ++++++++++++++++++++++--------------
 datastructure/Person.py     |   2 +-
 datastructure/Response.py   |  10 +--
 main.py                     |   2 +-
 utils/util.py               |  74 +++++++++++++---
 7 files changed, 210 insertions(+), 154 deletions(-)
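
Note: after this patch every train/tune run saves its artifacts in a dedicated
timestamped folder under the configured model_path. A minimal sketch (stdlib
only) of loading such a run by hand, assuming the model_path and timestamp
values shipped in conf/test.json below, and that the run exists on disk:

    import json
    import os
    import pickle

    model_path = "dataset/model/"  # CFG["classifier"]["model_path"]
    timestamp = "20190521_131449"  # CFG["classifier"]["timestamp"]
    model_dir = os.path.join(model_path, timestamp)

    # model.json -> hyperparameters + relative path of the classifier
    with open(os.path.join(model_dir, "model.json")) as f:
        config = json.load(f)
    print(config["classifier_file"], config["params"])

    # model.clf -> the pickled KNeighborsClassifier (grid.best_estimator_ after tuning)
    with open(os.path.join(model_dir, "model.clf"), "rb") as f:
        classifier = pickle.load(f)

    # model.dat -> the pickled dataset: a dict {'X': face encodings, 'Y': labels}
    with open(os.path.join(model_dir, "model.dat"), "rb") as f:
        dataset = pickle.load(f)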
diff --git a/api/Api.py b/api/Api.py
index ad782af..4c08b30 100644
--- a/api/Api.py
+++ b/api/Api.py
@@ -2,15 +2,13 @@
 """
 Custom function that will be wrapped for be HTTP compliant
 """
-import os
-import pickle
+
 import time
-import zipfile
+from datetime import datetime
 from logging import getLogger
-from os.path import join as path_join
 
 from datastructure.Response import Response
-from utils.util import print_prediction_on_image, random_string, remove_dir, unzip_data
+from utils.util import print_prediction_on_image, random_string, retrieve_dataset
 
 log = getLogger()
 
@@ -24,12 +22,12 @@ def predict_image(img_path, clf, PREDICTION_PATH):
     :return: Response dictionary jsonizable
     """
     response = Response()
-    log.debug("predict_image | Predicting {}".format(img_path))
     if clf is None:
+        log.error("predict_image | FATAL | Classifier is None!")
         prediction = None
     else:
-        prediction = clf.predict(img_path)
-        log.debug("predict_image | Image analyzed!")
+        log.debug("predict_image | Predicting {}".format(img_path))
+        prediction = clf.predict(img_path, distance_threshold=0.45)
     # Manage success
     if prediction is not None and isinstance(prediction, list) and len(prediction) == 1:
         img_name = random_string() + ".png"
@@ -79,23 +77,23 @@ def train_network(folder_uncompress, zip_file, clf):
     :param clf:
     :return:
     """
-    log.debug("train_network | uncompressing zip file ...")
-    folder_name = path_join(folder_uncompress, random_string())
-    zip_ref = zipfile.ZipFile(zip_file)
-    zip_ref.extractall(folder_name)
-    zip_ref.close()
-    log.debug("train_network | zip file uncompressed!")
-    clf.init_peoples_list(peoples_path=folder_name)
-    dataset = clf.init_dataset()
-    neural_model_file = clf.train(dataset["X"], dataset["Y"])
-    log.debug("train_network | Removing unzipped files")
-    remove_dir(folder_name)
-    response = Response()
-    response.status = "OK"
-    response.data = neural_model_file
-    response.description = "Model succesfully trained!"
-    return response.__dict__
+    log.debug("train_network | Starting training phase ...")
+    dataset = retrieve_dataset(folder_uncompress, zip_file, clf)
+
+    if dataset is None:
+        return Response(error="ERROR DURING LOADING DAT", description="Seems that the dataset is not valid").__dict__
+
+    else:
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        neural_model_file, elapsed_time = clf.train(dataset["X"], dataset["Y"], timestamp)
+
+        response = Response(status="OK", data=neural_model_file)
+        response.description = "Model successfully trained! | {}".format(
+            time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
+        log.debug("train_network | Training phase finished! | {}".format(response.description))
+
+        return response.__dict__
 
 
 def tune_network(folder_uncompress, zip_file, clf):
@@ -106,50 +104,19 @@ def tune_network(folder_uncompress, zip_file, clf):
     :param clf:
     :return:
     """
-    log.debug("tune_network | uncompressing zip file ...")
-    check = verify_extension(zip_file.filename)
-    if check == "zip":  # Image provided
-        folder_name = unzip_data(folder_uncompress, zip_file)
-        log.debug("tune_network | zip file uncompressed!")
-        clf.init_peoples_list(peoples_path=folder_name)
-        dataset = clf.init_dataset()
-    elif check == "dat":
-        dataset = pickle.load(zip_file)
+    log.debug("tune_network | Starting tuning phase ...")
+    dataset = retrieve_dataset(folder_uncompress, zip_file, clf)
+
+    if dataset is None:
+        return Response(error="ERROR DURING LOADING DAT", description="Seems that the dataset is not valid").__dict__
+
     else:
-        dataset = None
-
-    if dataset is not None:
-        start_time = time.time()
-        neural_model_file = clf.tuning(dataset["X"], dataset["Y"])
-        elapsed_time = time.time() - start_time
-
-        log.debug("tune_network | Removing unzipped files")
-        if check == "zip":
-            # TODO: Refactor this method :/
-            remove_dir(folder_name)
-        response = Response()
-        response.status = "OK"
-        response.data = neural_model_file
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        neural_model_file, elapsed_time = clf.tuning(dataset["X"], dataset["Y"], timestamp)
+
+        response = Response(status="OK", data=neural_model_file)
         response.description = "Model succesfully trained! | {}".format(
             time.strftime("%H:%M:%S.%f", time.gmtime(elapsed_time)))
-    else:
-        response = Response()
-        response.error = "ERROR DURING LOADING DAT"
-        return response.__dict__
-
+        log.debug("tune_network | Tuning phase finished! | {}".format(response.description))
 
-def verify_extension(file):
-    """
-    Wrapper for validate file
-    :param file:
-    :return:
-    """
-    extension = os.path.splitext(file)[1]
-    log.debug("verify_extension | File: {} | Ext: {}".format(file, extension))
-    if extension == ".zip":
-        # In this case we have to analyze the photos
-        return "zip"
-    elif extension == ".dat":
-        # Photos have been alredy analyzed, dataset is ready!
-        return "dat"
-    return None
+        return response.__dict__
diff --git a/conf/test.json b/conf/test.json
index 5fcf24f..de4e3f6 100644
--- a/conf/test.json
+++ b/conf/test.json
@@ -1,6 +1,6 @@
 {
   "PyRecognizer": {
-    "Version": "0.0.1",
+    "Version": "0.1.1",
     "temp_upload_training": "uploads/training/",
     "temp_upload_predict": "uploads/predict/",
     "temp_upload": "uploads/upload"
@@ -23,9 +23,14 @@
   "classifier": {
     "trainin_dir": "dataset/images/",
     "model_path": "dataset/model/",
-    "model": "model-20190518_191827.clf",
-    "n_neighbors": "",
-    "knn_algo": ""
+    "timestamp": "20190521_131449",
+    "params": {
+      "algorithm": "ball_tree",
+      "metric": "minkowski",
+      "n_neighbors": 78,
+      "p": 2,
+      "weights": "distance"
+    }
   },
   "data": {
     "test_data": "/tmp/test_data/"
diff --git a/datastructure/Classifier.py b/datastructure/Classifier.py
index 5841615..fa6077e 100644
--- a/datastructure/Classifier.py
+++ b/datastructure/Classifier.py
@@ -6,9 +6,11 @@
 import logging
 import os
 import pickle
-from datetime import datetime
+import time
 from math import sqrt
-from multiprocessing.pool import ThreadPool
+from tqdm import tqdm
+
+
 from pprint import pformat
 
 import face_recognition
@@ -32,19 +34,22 @@ def __init__(self):
         self.training_dir = None
         self.model_path = None
         self.n_neighbors = None
-        self.knn_algo = None
+        self.algorithm = None
+        self.metric = None
+        self.p = None
+        self.weights = None
         self.peoples_list = []
         self.classifier = None
 
-    def init_knn_algo(self, knn_algo):
+    def init_algorithm(self, algorithm):
         """
-        Initialize the knn_algorithm for the neural network. If not provided the 'ball_tree' will
+        Initialize the algorithm for the neural network. If not provided, the 'ball_tree' will
         be used as default
-        :param knn_algo: 'ball_tree' as default
+        :param algorithm: 'ball_tree' as default
         """
-        log.debug("init_knn_algo | Initializing knn algorithm ...")
+        log.debug("init_algorithm | Initializing knn algorithm ...")
-        if self.knn_algo is None:
-            self.knn_algo = knn_algo
+        if self.algorithm is None:
+            self.algorithm = algorithm
 
     def init_n_neighbors(self, X_len=10):
         """
@@ -63,45 +68,51 @@ def init_classifier(self):
         """
         if self.classifier is None:
             log.debug("init_classifier | START!")
-            if self.knn_algo is not None and self.n_neighbors is not None:
+            if self.algorithm is not None and self.n_neighbors is not None:
                 log.debug("init_classifier | Initializing a new classifier ... | {0}".format(pformat(self.__dict__)))
                 self.classifier = KNeighborsClassifier(
-                    n_neighbors=self.n_neighbors, algorithm=self.knn_algo, weights='distance')
+                    n_neighbors=self.n_neighbors, algorithm=self.algorithm, weights='distance')
             else:
-                log.error("init_classifier | Mandatory parameter not provided :/")
-                self.classifier = None
-
-    def init_specs(self, X_len, knn_algo='ball_tree'):
-        """
-        Initalize the classifier
-        :param knn_algo:
-        :param X_len:
-        """
-        log.debug("init_specs | Init knn algorithm ...")
-        self.init_knn_algo(knn_algo)
-        self.init_n_neighbors(X_len)
-        self.init_classifier()
+                log.error("init_classifier | Mandatory parameter not provided | Init a new KNN Classifier")
+                self.classifier = KNeighborsClassifier()
 
-    def load_classifier_from_file(self, classifier_file):
+    def load_classifier_from_file(self, timestamp):
         """
-        Initalize the classifier from file
-        :param classifier_file:
+        Initialize the classifier from file.
+        The classifier file represents the name of the directory related to the classifier that we want to load.
+
+        The tree structure of the model folder will be something like this:
+
+        model/
+        ├── <20190520_095119>/  --> Timestamp in which the model was created
+        │   ├── model.dat       --> Dataset generated by encoding the faces and pickling them
+        │   ├── model.clf       --> Classifier delegated to recognize a given face
+        │   ├── model.json      --> Hyperparameters related to the current classifier
+        ├── <20190519_210950>/
+        │   ├── model.dat
+        │   ├── model.clf
+        │   ├── model.json
+        └── ...
+
+        :param timestamp:
         :return:
         """
-        log.debug("load_classifier_from_file | Loading classifier from file ... | File: {}".format(classifier_file))
+        log.debug("load_classifier_from_file | Loading classifier from file ... | File: {}".format(timestamp))
         # Load a trained KNN model (if one was passed in)
         err = None
         if self.classifier is None:
             if self.model_path is None or not os.path.isdir(self.model_path):
                 raise Exception("Model folder not provided!")
-            log.debug("load_classifier_from_file | Loading classifier from file ...")
-            log.debug("load_classifier_from_file | Path {} exist ...".format(self.model_path))
-            filename = os.path.join(self.model_path, classifier_file)
+            # Adding the conventional name used for the classifier -> 'model.clf'
+            filename = os.path.join(self.model_path, timestamp, "model.clf")
+            log.debug("load_classifier_from_file | Loading classifier from file: {}".format(filename))
             if os.path.isfile(filename):
                 log.debug("load_classifier_from_file | File {} exist ...".format(filename))
                 with open(filename, 'rb') as f:
                     self.classifier = pickle.load(f)
+                log.debug("load_classifier_from_file | Classifier loaded!")
             else:
                 err = "load_classifier_from_file | FATAL | File {} DOES NOT EXIST ...".format(filename)
         else:
@@ -113,32 +124,44 @@ def load_classifier_from_file(self, classifier_file):
             self.classifier = None
         return
 
-    def train(self, X, Y):
+    def train(self, X, Y, timestamp):
         """
         Train a new model by the given data [X] related to the given target [Y]
         :param X:
         :param Y:
+        :param timestamp:
         """
         log.debug("train | START")
-        if self.classifier is not None:
-            log.debug("train | Training ...")
-            X_train, x_test, Y_train, y_test = train_test_split(X, Y, test_size=0.25)
-            self.classifier.fit(X_train, Y_train)
-            log.debug("train | Model Trained!")
-            log.debug("train | Checking performance ...")
-            y_pred = self.classifier.predict(x_test)
-            # Static method
-            self.verify_performance(y_test, y_pred)
-            return self.dump_model(self.model_path, "model")
-
-    def tuning(self, X, Y):
+        if self.classifier is None:
+            self.init_classifier()
+
+        dump_dataset(X, Y, os.path.join(self.model_path, timestamp))
+
+        start_time = time.time()
+
+        X_train, x_test, Y_train, y_test = train_test_split(X, Y, test_size=0.25)
+        log.debug("train | Training ...")
+        self.classifier.fit(X_train, Y_train)
+        log.debug("train | Model Trained!")
+        log.debug("train | Checking performance ...")
+        y_pred = self.classifier.predict(x_test)
+        # Static method
+        self.verify_performance(y_test, y_pred)
+
+        return self.dump_model(timestamp=timestamp, classifier=self.classifier), time.time() - start_time
+
+    def tuning(self, X, Y, timestamp):
         """
         Tune the hyperparameter of a new model by the given data [X] related to the given target [Y]
         :param X:
         :param Y:
+        :param timestamp:
         :return:
         """
+        start_time = time.time()
+        dump_dataset(X, Y, os.path.join(self.model_path, timestamp))
+
         X_train, x_test, Y_train, y_test = train_test_split(X, Y, test_size=0.25)
         self.classifier = KNeighborsClassifier()
         # Hyperparameter of the neural network (KKN)
@@ -159,7 +182,7 @@ def tuning(self, X, Y):
             'p': power_range,
         }
         log.debug("tuning | Parameter -> {}".format(pformat(parameter_space)))
-        grid = GridSearchCV(self.classifier, parameter_space, cv=3, scoring='accuracy', verbose=10, n_jobs=3)
+        grid = GridSearchCV(self.classifier, parameter_space, cv=3, scoring='accuracy', verbose=10, n_jobs=1)
         grid.fit(X_train, Y_train)
         log.info("TUNING COMPLETE | DUMPING DATA!")
         # log.info("tuning | Grid Scores: {}".format(pformat(grid.grid_scores_)))
@@ -171,7 +194,8 @@ def tuning(self, X, Y):
 
         self.verify_performance(y_test, y_pred)
 
-        return self.dump_model(params=grid.best_params_)
+        return self.dump_model(timestamp=timestamp, params=grid.best_params_,
+                               classifier=grid.best_estimator_), time.time() - start_time
 
     @staticmethod
     def verify_performance(y_test, y_pred):
@@ -183,42 +207,47 @@ def verify_performance(y_test, y_pred):
         """
         log.debug("verify_performance | Analyzing performance ...")
-        # log.info("Computing classifier score --> {}".format(pformat(clf.score(y_test,y_pred))))
         log.info("Classification Report: {}".format(pformat(classification_report(y_test, y_pred))))
         log.info("balanced_accuracy_score: {}".format(pformat(balanced_accuracy_score(y_test, y_pred))))
         log.info("accuracy_score: {}".format(pformat(accuracy_score(y_test, y_pred))))
         log.info("precision_score: {}".format(pformat(precision_score(y_test, y_pred, average='weighted'))))
 
-    def dump_model(self, params, path=None, file=None):
+    def dump_model(self, timestamp, classifier, params=None, path=None):
         """
         Dump the model to the given path, file
         :param params:
+        :param timestamp:
+        :param classifier:
         :param path:
-        :param file:
+
         """
+        log.debug("dump_model | Dumping model ...")
         if path is None:
             if self.model_path is not None:
                 if os.path.exists(self.model_path) and os.path.isdir(self.model_path):
                     path = self.model_path
-        if file is None:
-            file = "model"
-
-        if os.path.isdir(path):
-            time_parsed = datetime.now().strftime('%Y%m%d_%H%M%S')
-            classifier_file = os.path.join(path, "{}-{}".format(file, time_parsed))
-            config = {'classifier_file': classifier_file,
-                      'params': params
-                      }
+        config = {'classifier_file': os.path.join(timestamp, "model.clf"),
+                  'params': params
+                  }
+        if not os.path.isdir(path):
+            os.makedirs(path)
+        classifier_folder = os.path.join(path, timestamp)
+        classifier_file = os.path.join(classifier_folder, "model")
 
         log.debug("dump_model | Dumping model ... | Path: {} | File: {}".format(path, classifier_file))
         # TODO: Save every model in a different folder
+        if not os.path.exists(classifier_folder):
+            os.makedirs(classifier_folder)
+
         with open(classifier_file + ".clf", 'wb') as f:
-            pickle.dump(self.classifier, f)
+            pickle.dump(classifier, f)
+
         with open(classifier_file + ".json", 'w') as f:
             json.dump(config, f)
         log.info('dump_model | Configuration saved to {0}'.format(classifier_file))
 
-            return config
+        return config
 
     def init_peoples_list(self, peoples_path=None):
         """
@@ -230,11 +259,17 @@ def init_peoples_list(self, peoples_path=None):
         log.debug("init_peoples_list | Initalizing people ...")
         if peoples_path is not None and os.path.isdir(peoples_path):
             self.training_dir = peoples_path
-            # freq_list = pool.map(partial(get_frequency, nlp=nlp_en, client=mongo_client), fileList)
-            pool = ThreadPool(3)
-            self.peoples_list = pool.map(self.init_peoples_list_core, os.listdir(self.training_dir))
+            # pool = ThreadPool(3)
+            # self.peoples_list = pool.map(self.init_peoples_list_core, os.listdir(self.training_dir))
+
+            for people_name in tqdm(os.listdir(self.training_dir),
+                                    total=len(os.listdir(self.training_dir)), desc="Init people list ..."):
+                self.peoples_list.append(self.init_peoples_list_core(people_name))
+
             self.peoples_list = list(filter(None.__ne__, self.peoples_list))  # Remove None
 
+    # TODO: Add a method to dump the datastructure, so the same data does not have to be reloaded for every test
+
     def init_peoples_list_core(self, people_name):
 
@@ -272,7 +307,6 @@ def init_dataset(self):
                 DATASET["X"].append(item)
             for item in people.dataset["Y"]:
                 DATASET["Y"].append(item)
-        dump_dataset(DATASET, self.model_path)
         return DATASET
 
     # TODO: Add configuration parameter for choose the distance_threshold
diff --git a/datastructure/Person.py b/datastructure/Person.py
index 0df3ae1..2d8a9d8 100644
--- a/datastructure/Person.py
+++ b/datastructure/Person.py
@@ -42,7 +42,7 @@ def init_dataset(self):
         if self.path != "" and isdir(self.path):
             log.debug("initDataset | Paramater provided, iterating images ..")
             # Iterating the images in parallel
-            pool = ThreadPool(1)
+            pool = ThreadPool(2)
             self.dataset["X"] = pool.map(self.init_dataset_core, image_files_in_folder(self.path))
             self.dataset["X"] = list(filter(None.__ne__, self.dataset["X"]))  # Remove None
             # Loading the Y [target]
diff --git a/datastructure/Response.py b/datastructure/Response.py
index 82cbaa8..c028edf 100644
--- a/datastructure/Response.py
+++ b/datastructure/Response.py
@@ -12,9 +12,9 @@ class Response(object):
     external tools
     """
 
-    def __init__(self):
-        self.status = "KO"
-        self.description = None
-        self.error = None
-        self.data = None
+    def __init__(self, status="KO", description=None, error=None, data=None):
+        self.status = status
+        self.description = description
+        self.error = error
+        self.data = data
         self.date = str(datetime.now())
diff --git a/main.py b/main.py
index 79617a2..e0202bb 100644
--- a/main.py
+++ b/main.py
@@ -39,7 +39,7 @@
 log.debug("Init classifier ...")
 clf = Classifier()
 clf.model_path = CFG["classifier"]["model_path"]
-clf.load_classifier_from_file(CFG["classifier"]["model"])
+clf.load_classifier_from_file(CFG["classifier"]["timestamp"])
 
 # TODO Add check on extension
 allowed_ext = ["jpg", "jpeg", "png"]
diff --git a/utils/util.py b/utils/util.py
index 1db6639..a34cf36 100644
--- a/utils/util.py
+++ b/utils/util.py
@@ -10,7 +10,6 @@
 import shutil
 import string
 import zipfile
-from datetime import datetime
 from logging.handlers import TimedRotatingFileHandler
 
 from PIL import Image, ImageDraw
 
@@ -132,7 +131,7 @@ def unzip_data(unzipped_folder, zip_file):
     Unzip the zip file in input in the given 'unzipped_folder'
     :param unzipped_folder:
     :param zip_file:
-    :return:
+    :return: The name of the folder in which to find the unzipped data
     """
     log = logging.getLogger()
     folder_name = os.path.join(unzipped_folder, random_string())
@@ -144,23 +143,28 @@ def unzip_data(unzipped_folder, zip_file):
     return folder_name
 
 
-def dump_dataset(dataset, path, dataset_name=None):
+def dump_dataset(X, Y, path):
     """
 
-    :param dataset:
+    :param X:
+    :param Y:
     :param path:
-    :param dataset_name:
     :return:
     """
     log = logging.getLogger()
-    log.debug("dump_dataset | Dumping {} {}".format(path, dataset_name))
-    if os.path.exists(path) and os.path.isdir(path):
-        if dataset_name is None:
-            dataset_name = "image_dataset"
-        time_parsed = datetime.now().strftime('%Y%m%d_%H%M%S')
-        dataset_name = os.path.join(path, "{}-{}".format(dataset_name, time_parsed))
-        with open(dataset_name + ".dat", 'wb') as f:
+    dataset = {
+        'X': X,
+        'Y': Y
+    }
+    log.debug("dump_dataset | Dumping dataset into {}".format(path))
+    if not os.path.exists(path):
+        os.makedirs(path)
+        log.debug("dump_dataset | Path {} created".format(path))
+        dataset_name = os.path.join(path, "model.dat")
+        with open(dataset_name, 'wb') as f:
             pickle.dump(dataset, f)
+    else:
+        log.error("dump_dataset | Path {} ALREADY EXISTS".format(path))
 
 
 def remove_dir(directory):
     """
@@ -173,3 +177,49 @@ def remove_dir(directory):
     log.debug("remove_dir | Removing directory {}".format(directory))
     if os.path.isdir(directory):
         shutil.rmtree(directory)
+
+
+def verify_extension(file):
+    """
+    Wrapper to validate the file
+    :param file:
+    :return:
+    """
+    log = logging.getLogger()
+    extension = os.path.splitext(file)[1]
+    log.debug("verify_extension | File: {} | Ext: {}".format(file, extension))
+    if extension == ".zip":
+        # In this case we have to analyze the photos
+        return "zip"
+    elif extension == ".dat":
+        # Photos have already been analyzed, the dataset is ready!
+        return "dat"
+    return None
+
+
+def retrieve_dataset(folder_uncompress, zip_file, clf):
+    """
+
+    :param folder_uncompress:
+    :param zip_file:
+    :param clf:
+    :return:
+    """
+    log = logging.getLogger()
+    log.debug("retrieve_dataset | Parsing dataset ...")
+    check = verify_extension(zip_file.filename)
+    if check == "zip":  # Image provided
+        log.debug("retrieve_dataset | Zip file uploaded")
+        folder_name = unzip_data(folder_uncompress, zip_file)
+        log.debug("retrieve_dataset | Zip file uncompressed!")
+        clf.init_peoples_list(peoples_path=folder_name)
+        dataset = clf.init_dataset()
+        log.debug("retrieve_dataset | Removing [{}]".format(folder_name))
+        remove_dir(folder_name)
+    elif check == "dat":
+        log.debug("retrieve_dataset | Pickle data uploaded")
+        dataset = pickle.load(zip_file)
+    else:
+        dataset = None
+    log.debug("retrieve_dataset | Dataset parsed!")
+    return dataset
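
The Response constructor change above is backward compatible: every field
keeps its old default, so existing Response() call sites keep working. A
minimal sketch of the two call styles (the class body is copied from the
datastructure/Response.py hunk above, so the snippet is self-contained):

    from datetime import datetime


    class Response(object):
        """Response produced by the HTTP API, jsonizable via __dict__"""

        def __init__(self, status="KO", description=None, error=None, data=None):
            self.status = status
            self.description = description
            self.error = error
            self.data = data
            self.date = str(datetime.now())


    # Old style: build the default "KO" response, then mutate the fields
    ko = Response()
    ko.error = "ERROR DURING LOADING DAT"

    # New style: build the whole response in a single call
    ok = Response(status="OK", data="dataset/model/20190521_131449/model.clf")
    print(ok.__dict__)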