CO2_example.py
from tune_hyperparms_regression import overlap
from scipy.stats import norm
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_mldata

np.set_printoptions(precision=3, suppress=True)


def kernel_1(sqdist, theta_1, theta_2):
    """
    the first kernel -- the RBF kernel
    :param sqdist: matrix of squared distances between inputs
    :param theta_1: the magnitude
    :param theta_2: the lengthscale
    :return: kernel matrix (covariance matrix)
    """
    return (theta_1 ** 2) * np.exp(-.5 * sqdist / theta_2 ** 2)


def kernel_2(l2_norm, sqdist, theta_3, theta_4, theta_5):
    """
    the second kernel -- the modified periodic kernel
    :param l2_norm: matrix of distances between inputs
    :param sqdist: matrix of squared distances between inputs
    :param theta_3: the magnitude
    :param theta_4: the decay-time for the periodic component
    :param theta_5: the smoothness of the periodic component
    :return: covariance matrix
    """
    first_item = -.5 * sqdist / theta_4 ** 2
    second_item = -2 * ((np.sin(np.pi * l2_norm)) / theta_5) ** 2
    return theta_3 ** 2 * np.exp(first_item + second_item)


def kernel_3(sqdist, theta_6, theta_7, theta_8):
    """
    the third kernel -- the rational quadratic kernel,
    used to model the (small) medium-term irregularities
    :param sqdist: matrix of squared distances between inputs
    :param theta_6: the magnitude
    :param theta_7: the typical lengthscale
    :param theta_8: the shape parameter
    :return: covariance matrix
    """
    item = 1 + .5 * sqdist / (theta_8 * theta_7 ** 2)
    part_kernel = 1.0 / np.power(item, theta_8)
    return theta_6 ** 2 * part_kernel


def kernel_4(sqdist, theta_9, theta_10, theta_11):
    """
    the fourth kernel -- a noise model
    :param sqdist: matrix of squared distances between inputs
    :param theta_9: the magnitude of the correlated noise component
    :param theta_10: the lengthscale
    :param theta_11: the magnitude of the independent noise component
    :return: covariance noise matrix
    """
    n = len(sqdist)
    # delta is the indicator function: identity on square matrices, zero otherwise
    if sqdist.shape[0] == sqdist.shape[1]:
        delta = np.eye(n)
    else:
        delta = 0
    item = np.exp(-.5 * sqdist / theta_10 ** 2)
    return theta_9 ** 2 * item + theta_11 ** 2 * delta


def covariance_function(a, b, hyperparms):
    """
    covariance function: the sum of the four kernels above
    :param a: the first input matrix
    :param b: the second input matrix
    :param hyperparms: hyperparameters for the covariance function
    :return: kernel matrix
    """
    # for 1-dimensional data
    if a.shape[1] == 1 and b.shape[1] == 1:
        sqdist = ((a[:, :, None] - b[:, :, None].T) ** 2).sum(1)
        l2_norm = np.sqrt(sqdist)
    # for multi-dimensional data
    else:
        sqdist = ((a[:, None, :] - b[None, :, :]) ** 2).sum(axis=2)
        l2_norm = np.sqrt(sqdist)
    kernel = kernel_1(sqdist, hyperparms[0], hyperparms[1]) \
        + kernel_2(l2_norm, sqdist, hyperparms[2], hyperparms[3], hyperparms[4]) \
        + kernel_3(sqdist, hyperparms[5], hyperparms[6], hyperparms[7]) \
        + kernel_4(sqdist, hyperparms[8], hyperparms[9], hyperparms[10])
    return kernel


def get_pos_data(info):
    """
    remove outliers and keep only the positive data
    :param info: all information
    :return: positive data
    """
    data = info[:, 1:13].flatten()
    neg_indices = np.where(data <= 0)
    pos_data = np.delete(data, neg_indices)
    return pos_data


def random_sample_test_parms(n_test_hyperparms, train_parms):
    """
    randomly sample hyperparameters for testing
    :param n_test_hyperparms: number of test hyperparameters
    :param train_parms: training hyperparameters, in other words, the points already evaluated
    :return: test hyperparameter matrix
    """
    dim_parms = 11
    hyperms_book = np.array([66, 67, 2.4, 90, 1.3, .66, 1.2, .78, .18, 1.6, .19])
    lower = hyperms_book - hyperms_book * .7
    upper = hyperms_book + hyperms_book * .5
    test_parms_matrix = np.zeros(shape=(n_test_hyperparms, dim_parms))
    for i in range(dim_parms):
        num_gen = n_test_hyperparms + len(train_parms) + 10
        test_parms = np.linspace(lower[i], upper[i], num_gen)
        # drop candidates that coincide with points already evaluated
        ind_done, ind_sample = overlap(train_parms[:, i], test_parms)
        test_parms = np.delete(test_parms, ind_sample)
        test_parms_sampled = random.sample(list(test_parms), n_test_hyperparms)
        test_parms_matrix[:, i] = np.asarray(test_parms_sampled)
    return test_parms_matrix


def compute_mar_likelihood(X_train, y_train, hyperparms):
    """
    compute the log marginal likelihood
    :param X_train: training data
    :param y_train: training targets
    :param hyperparms: hyperparameters
    :return: value of the log marginal likelihood
    """
    s = 0.0005    # noise variance, zero-mean noise
    n = len(X_train)
    K_train = covariance_function(X_train, X_train, hyperparms)
    L = np.linalg.cholesky(K_train + s * np.eye(n))
    L_inv = np.linalg.inv(L)
    alpha = np.dot(L_inv.T, np.dot(L_inv, y_train))
    # compute log marginal likelihood
    log_marg_likelihood = -.5 * np.dot(y_train.T, alpha) \
        - np.log(np.diagonal(L)).sum(0) - n / 2.0 * np.log(2 * np.pi)
    return log_marg_likelihood


def bayesian_opt(hyperparms_train, hyperparms_test, y_train):
    """
    Bayesian optimization: compute the GP posterior over the hyperparameter space
    :param hyperparms_train: training hyperparameters
    :param hyperparms_test: test hyperparameters
    :param y_train: training targets
    :return: posterior means, standard deviation, value of sampled posterior function
    """
    s = 0.0001    # noise variance, zero-mean noise
    n = len(hyperparms_train)    # number of training points
    N = len(hyperparms_test)     # number of test points
    hyperparms = hyperparms_train[0]    # can be changed from [0] to [n-1]
    K = covariance_function(hyperparms_train, hyperparms_train, hyperparms)
    K_s = covariance_function(hyperparms_train, hyperparms_test, hyperparms)
    K_ss = covariance_function(hyperparms_test, hyperparms_test, hyperparms)
    L = np.linalg.cholesky(K + s * np.eye(n))
    L_inv = np.linalg.inv(L)
    alpha = np.dot(L_inv.T, np.dot(L_inv, y_train))
    # compute mean of test points for posterior
    mu_post = np.dot(K_s.T, alpha)
    v = np.dot(L_inv, K_s)
    # compute variance for test points
    var_test = np.diag(K_ss) - np.sum(v ** 2, axis=0)
    stand_devi = np.sqrt(var_test)
    # sample one function from the posterior (used by the Thompson Sampling acquisition)
    num_fun = 1
    L_ = np.linalg.cholesky(K_ss + 1e-6 * np.eye(N) - np.dot(v.T, v))
    f_post_fun = mu_post.reshape(-1, 1) + np.dot(L_, np.random.normal(size=(N, num_fun)))
    return mu_post, stand_devi, f_post_fun


def make_prediction(X_train, X_test, y_train, hyperparms):
    """
    make a prediction for the test inputs
    :param X_train: training data
    :param X_test: test data
    :param y_train: training targets
    :param hyperparms: hyperparameters
    :return: posterior means, standard deviation, sampled posterior function values
    """
    s = 0.0005    # noise variance, zero-mean noise
    n = len(X_train)
    N = len(X_test)
    K_train = covariance_function(X_train, X_train, hyperparms)
    K_s = covariance_function(X_train, X_test, hyperparms)
    K_ss = covariance_function(X_test, X_test, hyperparms)
    L = np.linalg.cholesky(K_train + s * np.eye(n))
    L_inv = np.linalg.inv(L)
    alpha = np.dot(L_inv.T, np.dot(L_inv, y_train))
    # compute mean of test points for posterior
    mu_post = np.dot(K_s.T, alpha)
    v = np.dot(L_inv, K_s)
    # compute variance for test points
    var_test = np.diag(K_ss) - np.sum(v ** 2, axis=0)
    stand_devi = np.sqrt(var_test)
    # sample from the posterior at the test points, in other words, make a prediction
    num_fun = 1
    L_ = np.linalg.cholesky(K_ss + 1e-6 * np.eye(N) - np.dot(v.T, v))
    f_post_fun = mu_post.reshape(-1, 1) + np.dot(L_, np.random.normal(size=(N, num_fun)))
    return mu_post, stand_devi, f_post_fun


def UBC(hyperparms_train, hyperparms_test, mu_post, stand_devi):
    """
    Upper Confidence Bound (UCB) acquisition function
    :param hyperparms_train: training hyperparameters
    :param hyperparms_test: test hyperparameters
    :param mu_post: posterior means
    :param stand_devi: standard deviation
    :return: next point to pick, or True if the most recent point is picked again
    """
    num_parms = len(hyperparms_train)
    kappa = 7
    objective = mu_post + kappa * stand_devi
    indices = np.where(objective == np.max(objective))[0]
    indices = np.asarray(indices)
    next_point = hyperparms_test[indices[0]]
    # signal convergence if the maximiser equals the most recently added point
    if np.array_equal(hyperparms_train[num_parms - 1], next_point):
        return True
    return next_point


def TS(hyperparms_train, hyperparms_test, y_train):
    """
    Thompson Sampling acquisition function
    :param hyperparms_train: training hyperparameters
    :param hyperparms_test: test hyperparameters
    :param y_train: training targets
    :return: next point to pick
    """
    mu_post, stand_devi, f_post_fun = bayesian_opt(hyperparms_train, hyperparms_test, y_train)
    max_index = np.where(f_post_fun == np.max(f_post_fun))
    next_point = hyperparms_test[max_index[0]]
    return next_point.flatten()


def EI(hyperparms_test, mu_post, stand_devi, y):
    """
    Expected Improvement acquisition function
    :param hyperparms_test: test hyperparameters
    :param mu_post: GP posterior mean
    :param stand_devi: standard deviation
    :param y: training targets
    :return: next point to pick
    """
    s = 0.0005    # small value
    max_mean = np.max(y)
    f_max = max_mean + s
    z = (mu_post - f_max) / stand_devi
    EI_vector = (mu_post - f_max) * norm.cdf(z) + stand_devi * norm.pdf(z)
    max_index = np.where(EI_vector == np.max(EI_vector))
    next_point = hyperparms_test[max_index]
    return next_point.flatten()


def PI(hyperparms_test, mu_post, stand_devi, y):
    """
    Probability of Improvement acquisition function
    :param hyperparms_test: test hyperparameters
    :param mu_post: GP posterior mean
    :param stand_devi: standard deviation
    :param y: training targets
    :return: next point to pick
    """
    s = 0.0005    # small value
    max_mean = np.max(y)
    f_max = max_mean + s
    z = (mu_post - f_max) / stand_devi
    cumu_gaussian = norm.cdf(z)
    indices = np.where(cumu_gaussian == np.max(cumu_gaussian))[0]
    indices = np.asarray(indices)
    # several entries can equal 1, so randomly pick one of them as the next point
    rand_index = random.randint(0, len(indices) - 1)
    next_point = hyperparms_test[indices[rand_index]]
    return next_point


def acquisition_fun(choice, hyperparms_train, hyperparms_test, mu_post, stand_devi, y):
    """
    dispatch to the chosen acquisition function
    :param choice: one of 'UCB', 'TS', 'EI', 'PI'
    :param hyperparms_train: training hyperparameters
    :param hyperparms_test: test hyperparameters
    :param mu_post: posterior means
    :param stand_devi: standard deviation
    :param y: training targets
    :return: next point to pick
    """
    if choice == 'UCB':
        next_point = UBC(hyperparms_train, hyperparms_test, mu_post, stand_devi)
    elif choice == 'TS':
        next_point = TS(hyperparms_train, hyperparms_test, y)
    elif choice == 'EI':
        next_point = EI(hyperparms_test, mu_post, stand_devi, y)
    else:
        next_point = PI(hyperparms_test, mu_post, stand_devi, y)
    return next_point


def init_hyperms(n_hyperms, dim_parms):
    """
    initialize hyperparameters
    :param n_hyperms: number of hyperparameter vectors
    :param dim_parms: dimensionality of each hyperparameter vector
    :return: initialized hyperparameters
    """
    hyperparms = np.zeros(shape=(n_hyperms, dim_parms))
    # define initial hyperparms around the values given in the book
    hyperms_book = np.array([66, 67, 2.4, 90, 1.3, .66, 1.2, .78, .18, 1.6, .19])
    for i in range(n_hyperms):
        hyperparms[i] = hyperms_book + 0.5 * (i + 5)
    return hyperparms


def tune_hyperparameters_BO(X_train, X_test, y_train):
    """
    use Bayesian optimization to tune the hyperparameters
    :param X_train: training data
    :param X_test: test data
    :param y_train: training targets
    :return: hyperparameters with the largest log marginal likelihood found
    """
    dim_parms = 11
    n_train_hyperparms = 5
    n_hyperparms_test = 500    # number of test hyperparameters
    choice = ['UCB', 'TS', 'EI', 'PI']
    # run the loop once for each acquisition function
    for j in choice:
        print(j)
        # initialize hyperparameters
        hyperparms_train = init_hyperms(n_train_hyperparms, dim_parms)
        num_iterations = 10
        y_axis = np.zeros(num_iterations)
        for k in range(num_iterations):
            n_hyperparms_train = len(hyperparms_train)
            hyperparms_test = random_sample_test_parms(n_hyperparms_test, hyperparms_train)
            log_marg_likelihood = np.zeros(n_hyperparms_train)
            # compute the likelihood for the training points
            for i in range(n_hyperparms_train):
                log_marg_likelihood[i] = compute_mar_likelihood(X_train, y_train, hyperparms_train[i])
            # Bayesian optimization for the hyperparameters
            mu_post, stand_devi, _ = bayesian_opt(hyperparms_train, hyperparms_test, log_marg_likelihood)
            next_point = acquisition_fun(j, hyperparms_train, hyperparms_test,
                                         mu_post, stand_devi, log_marg_likelihood)
            max_index = np.where(log_marg_likelihood == np.max(log_marg_likelihood))[0]
            y_axis[k] = np.max(log_marg_likelihood)
            print("**")
            print("the {}th iteration!".format(k + 1))
            print(y_axis[k])
            print(hyperparms_train[max_index][0])
            if next_point is True:
                # UCB picked the most recent point again: stop early
                break
            hyperparms_train = np.append(hyperparms_train, [next_point], axis=0)
        # hyperparameters in the book
        hyperms_book = np.array([66, 67, 2.4, 90, 1.3, .66, 1.2, .78, .18, 1.6, .19])
        print("hyperms in the book is:")
        print(hyperms_book)
        print("marginal log likelihood in the book is:")
        print(compute_mar_likelihood(X_train, y_train, hyperms_book))
        plt.plot(np.arange(num_iterations).reshape(-1, 1), np.abs(y_axis), label=j)
    plt.xlabel("iterations")
    plt.ylabel("absolute log marginal likelihood")
    plt.legend(loc='best')
    plt.title("different acquisition functions in the CO2 example")
    plt.show()
    return hyperparms_train[max_index][0]


def plot_prediction(X_train, X_test, y_train, y_test, stand_devi):
    """
    plot the prediction figure
    :param X_train: training data
    :param X_test: test data
    :param y_train: training targets
    :param y_test: predicted test targets (posterior mean plus the empirical mean)
    :param stand_devi: standard deviation
    :return:
    """
    plt.clf()
    plt.plot(X_train.reshape(-1, 1), y_train, label='training data')
    plt.plot(X_test.reshape(-1, 1), y_test, label='test data')
    # y_test is the posterior predictive mean, so use it as the centre of the band
    plt.gca().fill_between(X_test.flat, y_test - 3 * stand_devi, y_test + 3 * stand_devi, color="#dddddd")
    plt.xlim(X_train.min(), X_test.max())
    plt.xlabel("Year")
    plt.ylabel(r"CO$_2$ in ppm")
    plt.legend(loc=4)
    plt.title(r"Atmospheric CO$_2$ concentration at Mauna Loa")
    plt.show()


if __name__ == "__main__":
    # note: fetch_mldata is removed in recent scikit-learn releases;
    # see the alternative loader sketched at the end of this file
    data = fetch_mldata('mauna-loa-atmospheric-co2').data
    X_train = data[:, [1]]    # dates as decimal years
    y_train = data[:, 0]      # CO2 concentration in ppm
    X_train = np.asarray(X_train)
    # predict 20 years beyond the last training year, one point per month
    X_test = np.arange(X_train.max() // 1 + 1, X_train.max() // 1 + 21, 1. / 12)[:, np.newaxis]
    empirical_mean = np.mean(y_train)
    y_train = y_train - empirical_mean

    # tune hyperparameters
    hyperms_iterns = tune_hyperparameters_BO(X_train.reshape(-1, 1), X_test, y_train)

    # make prediction
    hyperms_book = np.array([66, 67, 2.4, 90, 1.3, .66, 1.2, .78, .18, 1.6, .19])
    mu_post, stand_devi, f_post_fun = make_prediction(X_train, X_test, y_train, hyperms_iterns)
    y_test = mu_post
    y_train += empirical_mean
    y_test += empirical_mean
    plot_prediction(X_train, X_test, y_train, y_test, stand_devi)
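
# ---------------------------------------------------------------------------
# Optional: loading the data without fetch_mldata.
# fetch_mldata and the mldata.org service it relied on are no longer available
# in current scikit-learn releases. The sketch below (kept commented out so the
# original script is unchanged) is one possible replacement that pulls the
# Mauna Loa series from OpenML via fetch_openml; the dataset id (41187) and the
# "year"/"month"/"co2" column names are assumptions about that OpenML copy,
# not part of the original script.
#
#   from sklearn.datasets import fetch_openml
#
#   co2 = fetch_openml(data_id=41187, as_frame=True)   # Mauna Loa CO2 (OpenML)
#   frame = co2.frame
#   # approximate decimal-year inputs and ppm targets, mirroring
#   # data[:, [1]] and data[:, 0] above
#   X_train = (frame["year"] + (frame["month"] - 1) / 12.0).to_numpy()[:, None]
#   y_train = frame["co2"].to_numpy()
# ---------------------------------------------------------------------------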