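"""eval_aux.py

Evaluate the datasets under one preprocessing variant (missing_values, outliers,
scaling, or feature_selection): for each dataset, repeatedly balance the classes,
grid-search a classifier on a train split, and report the mean balanced accuracy
on a held-out sample.
"""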
import os
import sys

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    AdaBoostClassifier,
)
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import balanced_accuracy_score
# Parse terminal input
FLAG = ''
valid_flags = ('missing_values', 'outliers', 'scaling', 'feature_selection')
if len(sys.argv) == 2 and sys.argv[1] in valid_flags:
    FLAG = sys.argv[1]
else:
    print("Invalid format, try: python eval_aux.py [missing_values|outliers|scaling|feature_selection]")
    sys.exit(1)
# Folder path
dir_path = f'datasets/{FLAG}/'

# Lists to store dataset names and paths (without extension)
file_names = []
file_paths = []

# Iterate over the directory
for file in os.listdir(dir_path):
    # Check whether the current path is a file
    if os.path.isfile(os.path.join(dir_path, file)):
        file_name = os.path.splitext(file)[0]
        file_names.append(file_name)
        file_paths.append(f'datasets/{FLAG}/{file_name}')

print("Evaluating datasets: ", file_names)
print("-" * 20)
# ---------------------------------------------------------------------------
# Target variable
target = 'Like'

NUMBER_ITERATIONS = 10

for i in range(len(file_names)):
    mean_balanced_accuracy = 0
    for j in range(NUMBER_ITERATIONS):
        file_path = file_paths[i]

        # Read data
        df = pd.read_csv(f'{file_path}.csv', low_memory=False)

        # Remove the index column
        index_column = df.columns[0]
        df = df.drop([index_column], axis=1)

        # Drop the TestSetId column and rows where Like is NaN
        # (no way to know whether those were liked or not)
        df.drop('TestSetId', axis=1, inplace=True)
        df.dropna(subset=[target], inplace=True)

        # Hold out a random 10% sample of the rows as a test set
        df_test = df.sample(n=int(df.shape[0] / 10)).copy(deep=True)
        df_test.to_csv('df_test_aux.csv', index=False)
        df.drop(df_test.index, axis=0, inplace=True)
        # ----------------------------- #
        #           BALANCING           #
        # ----------------------------- #
        like_zero = df[df[target] == 0.0]
        like_one = df[df[target] == 1.0]

        # Resample both classes to the size of the Like == 0 class,
        # drawing with replacement for the Like == 1 class if needed
        df_one_sample = like_one.sample(len(like_zero), replace=True)
        df_zero_sample = like_zero.sample(len(like_zero))
        df = pd.concat([df_zero_sample, df_one_sample], axis=0)
        # ----------------------------- #
        y = df.pop(target).values
        X = df.values

        X_train, X_test, y_train, y_test = \
            train_test_split(X, y,
                             test_size=0.3,
                             shuffle=True,
                             random_state=3)
        # Here, we want to find the best classifier. As candidates, we consider:
        # 1. LogisticRegression
        # 2. RandomForestClassifier
        # 3. GradientBoostingClassifier
        # 4. HistGradientBoostingClassifier
        # 5. AdaBoostClassifier
        # 6. MLPClassifier
        model_logistic_regression = LogisticRegression(max_iter=300)
        model_random_forest = RandomForestClassifier()
        model_gradient_boosting = GradientBoostingClassifier()
        model_adaboost = AdaBoostClassifier()
        model_hist_gradient_boosting = HistGradientBoostingClassifier()
        model_neural_network = MLPClassifier(max_iter=200)

        # Single-step pipeline; GridSearchCV fills the "model" step from the grids below
        pipeline = Pipeline(steps=[("model", None)])
        parameter_grid_preprocessing = {
            # Empty on purpose (no preprocessing)
        }

        # NOTE: Logistic Regression does not perform as well as the other models
        parameter_grid_logistic_regression = {
            "model": [model_logistic_regression],
            "model__C": [0.1, 1, 10],  # inverse regularization strength
        }

        parameter_grid_gradient_boosting = {
            "model": [model_gradient_boosting],
            "model__n_estimators": [150],
            "model__max_depth": list(range(df.shape[1] - 12, df.shape[1] + 1)),  # 4
            # "model__learning_rate": [0.2],
        }

        # This estimator is much faster than GradientBoostingClassifier
        # for big datasets (n_samples >= 10 000).
        parameter_grid_hist_gradient_boosting = {
            "model": [model_hist_gradient_boosting],
            # "model__learning_rate": [0.2],
            "model__max_depth": list(range(df.shape[1] - 15, df.shape[1] + 1)),  # 4
        }

        parameter_grid_adaboost = {
            "model": [model_adaboost],
            "model__n_estimators": [150],
        }

        parameter_grid_random_forest = {
            "model": [model_random_forest],
            "model__n_estimators": [10, 20, 50],  # max number of trees in the forest
            "model__max_depth": [5, 10, 15],
        }

        # NOTE: the NN does not perform well on this dataset and takes a long time to train
        parameter_grid_neural_network = {
            "model": [model_neural_network],
            "model__hidden_layer_sizes": [(30, 30), (40, 30)],  # example hidden layer configurations
            "model__alpha": [0.0001],  # regularization parameter
        }
        # Only HistGradientBoosting is searched here; uncomment grids to add candidates
        meta_parameter_grid = [
            # parameter_grid_logistic_regression,
            # parameter_grid_random_forest,
            # parameter_grid_gradient_boosting,
            parameter_grid_hist_gradient_boosting,
            # parameter_grid_adaboost,
            # parameter_grid_neural_network,
        ]

        # Merge the (empty) preprocessing grid into each model grid
        meta_parameter_grid = [{**parameter_grid_preprocessing, **model_grid}
                               for model_grid in meta_parameter_grid]
        search = GridSearchCV(pipeline,
                              meta_parameter_grid,
                              scoring="balanced_accuracy",
                              n_jobs=-1,
                              cv=5,  # number of folds for cross-validation
                              error_score="raise")

        # Here, the actual training and grid search happen
        search.fit(X_train, y_train.ravel())
        # print("best parameter:", search.best_params_, "(CV score=%0.3f)" % search.best_score_)
        # print("-" * 20)

        # Evaluate the performance of the model on the held-out split
        # print("Score on test set:", search.score(X_test, y_test.ravel()))

        # Contingency table
        ct = pd.crosstab(search.best_estimator_.predict(X_test), y_test.ravel(),
                         rownames=["pred"], colnames=["true"])
        # print(ct)
        # print("-" * 20)
        # ----------------------------------------------------------------- #
        #  TEST with the random 10% sample held out from the original data  #
        # ----------------------------------------------------------------- #
        # Read the held-out data back in
        df_test = pd.read_csv('df_test_aux.csv', low_memory=False)

        def micro_service_classify_review_test(datapoint):
            # Make sure the provided datapoints adhere to the correct format for model input
            # Fetch the trained model
            model = search.best_estimator_
            # Make predictions with the model
            prediction = model.predict(datapoint)
            return prediction

        # Save the true Like values in a vector
        like_values = df_test['Like'].values

        # Drop the 'Like' column so it is not fed to the model as a feature
        df_test.drop('Like', axis=1, inplace=True)

        # Fill in the missing predictions for the Like column
        df_test['Like'] = micro_service_classify_review_test(df_test.values)

        # Calculate the balanced accuracy
        balanced_acc = balanced_accuracy_score(like_values, df_test['Like'])
        # print(f"Balanced Accuracy: {balanced_acc}")

        mean_balanced_accuracy += balanced_acc
        print(f"\t{file_names[i]} - Iteration {j+1} - Balanced Accuracy: {balanced_acc}")

    print(f"Mean Balanced Accuracy {file_names[i]}: {mean_balanced_accuracy/NUMBER_ITERATIONS}")
    print("-" * 20)