CO2_example.py
from tune_hyperparms_regression import overlap
from scipy.stats import norm
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_mldata

np.set_printoptions(precision=3, suppress=True)


def kernel_1(sqdist, theta_1, theta_2):
    """
    the first kernel -- the RBF kernel
    :param sqdist: matrix of squared distances between inputs
    :param theta_1: the magnitude
    :param theta_2: the lengthscale
    :return: kernel matrix (covariance matrix)
    """
    return (theta_1 ** 2) * np.exp(-.5 * sqdist / theta_2 ** 2)


def kernel_2(l2_norm, sqdist, theta_3, theta_4, theta_5):
    """
    the second kernel -- the modified periodic kernel
    :param l2_norm: matrix of distances between inputs
    :param sqdist: matrix of squared distances between inputs
    :param theta_3: the magnitude
    :param theta_4: the decay-time for the periodic component
    :param theta_5: the smoothness of the periodic component
    :return: covariance matrix
    """
    first_item = -.5 * sqdist / theta_4 ** 2
    second_item = -2 * ((np.sin(np.pi * l2_norm)) / theta_5) ** 2
    return theta_3 ** 2 * np.exp(first_item + second_item)


def kernel_3(sqdist, theta_6, theta_7, theta_8):
    """
    the third kernel -- the rational quadratic kernel,
    used to model the (small) medium-term irregularities
    :param sqdist: matrix of squared distances between inputs
    :param theta_6: the magnitude
    :param theta_7: the typical lengthscale
    :param theta_8: the shape parameter
    :return: covariance matrix
    """
    item = 1 + .5 * sqdist / (theta_8 * theta_7 ** 2)
    part_kernel = 1.0 / np.power(item, theta_8)
    return theta_6 ** 2 * part_kernel


def kernel_4(sqdist, theta_9, theta_10, theta_11):
    """
    the fourth kernel -- a noise model
    :param sqdist: matrix of squared distances between inputs
    :param theta_9: the magnitude of the correlated noise component
    :param theta_10: the lengthscale
    :param theta_11: the magnitude of the independent noise component
    :return: covariance noise matrix
    """
    n = len(sqdist)
    # delta is the indicator function: identity on square matrices, zero otherwise
    if sqdist.shape[0] == sqdist.shape[1]:
        delta = np.eye(n)
    else:
        delta = 0
    item = np.exp(-.5 * sqdist / theta_10 ** 2)
    return theta_9 ** 2 * item + theta_11 ** 2 * delta


def covariance_function(a, b, hyperparms):
    """
    covariance function: the sum of the four kernels above
    :param a: the first input matrix
    :param b: the second input matrix
    :param hyperparms: hyperparameters for the covariance function
    :return: kernel matrix
    """
    # for 1-dimensional data
    if a.shape[1] == 1 and b.shape[1] == 1:
        sqdist = ((a[:, :, None] - b[:, :, None].T) ** 2).sum(1)
        l2_norm = np.sqrt(sqdist)
    # for multi-dimensional data
    else:
        sqdist = ((a[:, None, :] - b[None, :, :]) ** 2).sum(axis=2)
        l2_norm = np.sqrt(sqdist)
    kernel = kernel_1(sqdist, hyperparms[0], hyperparms[1]) \
        + kernel_2(l2_norm, sqdist, hyperparms[2], hyperparms[3], hyperparms[4]) \
        + kernel_3(sqdist, hyperparms[5], hyperparms[6], hyperparms[7]) \
        + kernel_4(sqdist, hyperparms[8], hyperparms[9], hyperparms[10])
    return kernel


def get_pos_data(info):
    """
    remove outliers and keep only the positive data
    :param info: all information
    :return: positive data
    """
    data = info[:, 1:13].flatten()
    neg_indices = np.where(data <= 0)
    pos_data = np.delete(data, neg_indices)
    return pos_data


def random_sample_test_parms(n_test_hyperparms, train_parms):
    """
    randomly sample hyperparameters for testing
    :param n_test_hyperparms: number of test hyperparameters
    :param train_parms: training hyperparameters, in other words, the points already evaluated
    :return: test hyperparameter matrix
    """
    dim_parms = 11
    hyperms_book = np.array([66, 67, 2.4, 90, 1.3, .66, 1.2, .78, .18, 1.6, .19])
    lower = hyperms_book - hyperms_book * .7
    upper = hyperms_book + hyperms_book * .5
    test_parms_matrix = np.zeros(shape=(n_test_hyperparms, dim_parms))
    for i in range(dim_parms):
        num_gen = n_test_hyperparms + len(train_parms) + 10
        test_parms = np.linspace(lower[i], upper[i], num_gen)
        # drop candidates that coincide with points already evaluated
        ind_done, ind_sample = overlap(train_parms[:, i], test_parms)
        test_parms = np.delete(test_parms, ind_sample)
        test_parms_sampled = random.sample(list(test_parms), n_test_hyperparms)
        test_parms_matrix[:, i] = np.asarray(test_parms_sampled)
    return test_parms_matrix


def compute_mar_likelihood(X_train, y_train, hyperparms):
    """
    compute the log marginal likelihood
    :param X_train: training data
    :param y_train: training targets
    :param hyperparms: hyperparameters
    :return: value of the log marginal likelihood
    """
    s = 0.0005    # noise variance, zero-mean noise
    n = len(X_train)
    K_train = covariance_function(X_train, X_train, hyperparms)
    L = np.linalg.cholesky(K_train + s * np.eye(n))
    L_inv = np.linalg.inv(L)
    alpha = np.dot(L_inv.T, np.dot(L_inv, y_train))
    # compute log marginal likelihood
    log_marg_likelihood = -.5 * np.dot(y_train.T, alpha) \
        - np.log(np.diagonal(L)).sum(0) - n / 2.0 * np.log(2 * np.pi)
    return log_marg_likelihood


def bayesian_opt(hyperparms_train, hyperparms_test, y_train):
    """
    Bayesian optimization: compute the GP posterior over the hyperparameter space
    :param hyperparms_train: training hyperparameters
    :param hyperparms_test: test hyperparameters
    :param y_train: training targets
    :return: posterior means, standard deviation, value of sampled posterior function
    """
    s = 0.0001    # noise variance, zero-mean noise
    n = len(hyperparms_train)    # number of training points
    N = len(hyperparms_test)     # number of test points
    hyperparms = hyperparms_train[0]    # can be changed from [0] to [n-1]
    K = covariance_function(hyperparms_train, hyperparms_train, hyperparms)
    K_s = covariance_function(hyperparms_train, hyperparms_test, hyperparms)
    K_ss = covariance_function(hyperparms_test, hyperparms_test, hyperparms)
    L = np.linalg.cholesky(K + s * np.eye(n))
    L_inv = np.linalg.inv(L)
    alpha = np.dot(L_inv.T, np.dot(L_inv, y_train))
    # compute mean of test points for posterior
    mu_post = np.dot(K_s.T, alpha)
    v = np.dot(L_inv, K_s)
    # compute variance for test points
    var_test = np.diag(K_ss) - np.sum(v ** 2, axis=0)
    stand_devi = np.sqrt(var_test)
    # sample one function from the posterior (used by the Thompson Sampling acquisition)
    num_fun = 1
    L_ = np.linalg.cholesky(K_ss + 1e-6 * np.eye(N) - np.dot(v.T, v))
    f_post_fun = mu_post.reshape(-1, 1) + np.dot(L_, np.random.normal(size=(N, num_fun)))
    return mu_post, stand_devi, f_post_fun


def make_prediction(X_train, X_test, y_train, hyperparms):
    """
    make a prediction for the test inputs
    :param X_train: training data
    :param X_test: test data
    :param y_train: training targets
    :param hyperparms: hyperparameters
    :return: posterior means, standard deviation, sampled posterior function values
    """
    s = 0.0005    # noise variance, zero-mean noise
    n = len(X_train)
    N = len(X_test)
    K_train = covariance_function(X_train, X_train, hyperparms)
    K_s = covariance_function(X_train, X_test, hyperparms)
    K_ss = covariance_function(X_test, X_test, hyperparms)
    L = np.linalg.cholesky(K_train + s * np.eye(n))
    L_inv = np.linalg.inv(L)
    alpha = np.dot(L_inv.T, np.dot(L_inv, y_train))
    # compute mean of test points for posterior
    mu_post = np.dot(K_s.T, alpha)
    v = np.dot(L_inv, K_s)
    # compute variance for test points
    var_test = np.diag(K_ss) - np.sum(v ** 2, axis=0)
    stand_devi = np.sqrt(var_test)
    # sample from the posterior at the test points, in other words, make a prediction
    num_fun = 1
    L_ = np.linalg.cholesky(K_ss + 1e-6 * np.eye(N) - np.dot(v.T, v))
    f_post_fun = mu_post.reshape(-1, 1) + np.dot(L_, np.random.normal(size=(N, num_fun)))
    return mu_post, stand_devi, f_post_fun


def UBC(hyperparms_train, hyperparms_test, mu_post, stand_devi):
    """
    Upper Confidence Bound (UCB) acquisition function
    :param hyperparms_train: training hyperparameters
    :param hyperparms_test: test hyperparameters
    :param mu_post: posterior means
    :param stand_devi: standard deviation
    :return: next point to pick, or True if the most recent point is picked again
    """
    num_parms = len(hyperparms_train)
    kappa = 7
    objective = mu_post + kappa * stand_devi
    indices = np.where(objective == np.max(objective))[0]
    indices = np.asarray(indices)
    next_point = hyperparms_test[indices[0]]
    # signal convergence if the maximiser equals the most recently added point
    if np.array_equal(hyperparms_train[num_parms - 1], next_point):
        return True
    return next_point


def TS(hyperparms_train, hyperparms_test, y_train):
    """
    Thompson Sampling acquisition function
    :param hyperparms_train: training hyperparameters
    :param hyperparms_test: test hyperparameters
    :param y_train: training targets
    :return: next point to pick
    """
    mu_post, stand_devi, f_post_fun = bayesian_opt(hyperparms_train, hyperparms_test, y_train)
    max_index = np.where(f_post_fun == np.max(f_post_fun))
    next_point = hyperparms_test[max_index[0]]
    return next_point.flatten()


def EI(hyperparms_test, mu_post, stand_devi, y):
    """
    Expected Improvement acquisition function
    :param hyperparms_test: test hyperparameters
    :param mu_post: GP posterior mean
    :param stand_devi: standard deviation
    :param y: training targets
    :return: next point to pick
    """
    s = 0.0005    # small value
    max_mean = np.max(y)
    f_max = max_mean + s
    z = (mu_post - f_max) / stand_devi
    EI_vector = (mu_post - f_max) * norm.cdf(z) + stand_devi * norm.pdf(z)
    max_index = np.where(EI_vector == np.max(EI_vector))
    next_point = hyperparms_test[max_index]
    return next_point.flatten()


def PI(hyperparms_test, mu_post, stand_devi, y):
    """
    Probability of Improvement acquisition function
    :param hyperparms_test: test hyperparameters
    :param mu_post: GP posterior mean
    :param stand_devi: standard deviation
    :param y: training targets
    :return: next point to pick
    """
    s = 0.0005    # small value
    max_mean = np.max(y)
    f_max = max_mean + s
    z = (mu_post - f_max) / stand_devi
    cumu_gaussian = norm.cdf(z)
    indices = np.where(cumu_gaussian == np.max(cumu_gaussian))[0]
    indices = np.asarray(indices)
    # several entries can equal 1, so randomly pick one of them as the next point
    rand_index = random.randint(0, len(indices) - 1)
    next_point = hyperparms_test[indices[rand_index]]
    return next_point


def acquisition_fun(choice, hyperparms_train, hyperparms_test, mu_post, stand_devi, y):
    """
    dispatch to the chosen acquisition function
    :param choice: one of 'UCB', 'TS', 'EI', 'PI'
    :param hyperparms_train: training hyperparameters
    :param hyperparms_test: test hyperparameters
    :param mu_post: posterior means
    :param stand_devi: standard deviation
    :param y: training targets
    :return: next point to pick
    """
    if choice == 'UCB':
        next_point = UBC(hyperparms_train, hyperparms_test, mu_post, stand_devi)
    elif choice == 'TS':
        next_point = TS(hyperparms_train, hyperparms_test, y)
    elif choice == 'EI':
        next_point = EI(hyperparms_test, mu_post, stand_devi, y)
    else:
        next_point = PI(hyperparms_test, mu_post, stand_devi, y)
    return next_point


def init_hyperms(n_hyperms, dim_parms):
    """
    initialize hyperparameters
    :param n_hyperms: number of hyperparameter vectors
    :param dim_parms: dimensionality of each hyperparameter vector
    :return: initialized hyperparameters
    """
    hyperparms = np.zeros(shape=(n_hyperms, dim_parms))
    # define initial hyperparms around the values given in the book
    hyperms_book = np.array([66, 67, 2.4, 90, 1.3, .66, 1.2, .78, .18, 1.6, .19])
    for i in range(n_hyperms):
        hyperparms[i] = hyperms_book + 0.5 * (i + 5)
    return hyperparms


def tune_hyperparameters_BO(X_train, X_test, y_train):
    """
    use Bayesian optimization to tune the hyperparameters
    :param X_train: training data
    :param X_test: test data
    :param y_train: training targets
    :return: hyperparameters with the largest log marginal likelihood found
    """
    dim_parms = 11
    n_train_hyperparms = 5
    n_hyperparms_test = 500    # number of test hyperparameters
    choice = ['UCB', 'TS', 'EI', 'PI']
    # run the loop once for each acquisition function
    for j in choice:
        print(j)
        # initialize hyperparameters
        hyperparms_train = init_hyperms(n_train_hyperparms, dim_parms)
        num_iterations = 10
        y_axis = np.zeros(num_iterations)
        for k in range(num_iterations):
            n_hyperparms_train = len(hyperparms_train)
            hyperparms_test = random_sample_test_parms(n_hyperparms_test, hyperparms_train)
            log_marg_likelihood = np.zeros(n_hyperparms_train)
            # compute the likelihood for the training points
            for i in range(n_hyperparms_train):
                log_marg_likelihood[i] = compute_mar_likelihood(X_train, y_train, hyperparms_train[i])
            # Bayesian optimization for the hyperparameters
            mu_post, stand_devi, _ = bayesian_opt(hyperparms_train, hyperparms_test, log_marg_likelihood)
            next_point = acquisition_fun(j, hyperparms_train, hyperparms_test,
                                         mu_post, stand_devi, log_marg_likelihood)
            max_index = np.where(log_marg_likelihood == np.max(log_marg_likelihood))[0]
            y_axis[k] = np.max(log_marg_likelihood)
            print("**")
            print("the {}th iteration!".format(k + 1))
            print(y_axis[k])
            print(hyperparms_train[max_index][0])
            if next_point is True:
                # UCB picked the most recent point again: stop early
                break
            hyperparms_train = np.append(hyperparms_train, [next_point], axis=0)
        # hyperparameters in the book
        hyperms_book = np.array([66, 67, 2.4, 90, 1.3, .66, 1.2, .78, .18, 1.6, .19])
        print("hyperms in the book is:")
        print(hyperms_book)
        print("marginal log likelihood in the book is:")
        print(compute_mar_likelihood(X_train, y_train, hyperms_book))
        plt.plot(np.arange(num_iterations).reshape(-1, 1), np.abs(y_axis), label=j)
    plt.xlabel("iterations")
    plt.ylabel("absolute log marginal likelihood")
    plt.legend(loc='best')
    plt.title("different acquisition functions in the CO2 example")
    plt.show()
    return hyperparms_train[max_index][0]


def plot_prediction(X_train, X_test, y_train, y_test, stand_devi):
    """
    plot the prediction figure
    :param X_train: training data
    :param X_test: test data
    :param y_train: training targets
    :param y_test: predicted test targets (posterior mean plus the empirical mean)
    :param stand_devi: standard deviation
    :return:
    """
    plt.clf()
    plt.plot(X_train.reshape(-1, 1), y_train, label='training data')
    plt.plot(X_test.reshape(-1, 1), y_test, label='test data')
    # y_test is the posterior predictive mean, so use it as the centre of the band
    plt.gca().fill_between(X_test.flat, y_test - 3 * stand_devi, y_test + 3 * stand_devi, color="#dddddd")
    plt.xlim(X_train.min(), X_test.max())
    plt.xlabel("Year")
    plt.ylabel(r"CO$_2$ in ppm")
    plt.legend(loc=4)
    plt.title(r"Atmospheric CO$_2$ concentration at Mauna Loa")
    plt.show()


if __name__ == "__main__":
    # note: fetch_mldata is removed in recent scikit-learn releases;
    # see the alternative loader sketched at the end of this file
    data = fetch_mldata('mauna-loa-atmospheric-co2').data
    X_train = data[:, [1]]    # dates as decimal years
    y_train = data[:, 0]      # CO2 concentration in ppm
    X_train = np.asarray(X_train)
    # predict 20 years beyond the last training year, one point per month
    X_test = np.arange(X_train.max() // 1 + 1, X_train.max() // 1 + 21, 1. / 12)[:, np.newaxis]
    empirical_mean = np.mean(y_train)
    y_train = y_train - empirical_mean

    # tune hyperparameters
    hyperms_iterns = tune_hyperparameters_BO(X_train.reshape(-1, 1), X_test, y_train)

    # make prediction
    hyperms_book = np.array([66, 67, 2.4, 90, 1.3, .66, 1.2, .78, .18, 1.6, .19])
    mu_post, stand_devi, f_post_fun = make_prediction(X_train, X_test, y_train, hyperms_iterns)
    y_test = mu_post
    y_train += empirical_mean
    y_test += empirical_mean
    plot_prediction(X_train, X_test, y_train, y_test, stand_devi)
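
# ---------------------------------------------------------------------------
# Optional: loading the data without fetch_mldata.
# fetch_mldata and the mldata.org service it relied on are no longer available
# in current scikit-learn releases. The sketch below (kept commented out so the
# original script is unchanged) is one possible replacement that pulls the
# Mauna Loa series from OpenML via fetch_openml; the dataset id (41187) and the
# "year"/"month"/"co2" column names are assumptions about that OpenML copy,
# not part of the original script.
#
#   from sklearn.datasets import fetch_openml
#
#   co2 = fetch_openml(data_id=41187, as_frame=True)   # Mauna Loa CO2 (OpenML)
#   frame = co2.frame
#   # approximate decimal-year inputs and ppm targets, mirroring
#   # data[:, [1]] and data[:, 0] above
#   X_train = (frame["year"] + (frame["month"] - 1) / 12.0).to_numpy()[:, None]
#   y_train = frame["co2"].to_numpy()
# ---------------------------------------------------------------------------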