-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMountainCarHillClimbing.py
77 lines (66 loc) · 2.18 KB
/
MountainCarHillClimbing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
'''
This code tests a policy in the MountainCar-v0 environment.
The format for the weight-vector update and its implementation come from:
https://github.com/kvfrans/openai-cartpole
'''
import math
import gym
from gym import spaces, logger
from gym.utils import seeding
import numpy as np
import pandas as pd
import random
import timeit
def run_episode(env, parameters):
    """Run one episode with a linear threshold policy and return its total reward.

    At every step the dot product of ``parameters`` and the current
    observation is taken: a negative value selects action 0, anything else
    selects action 2.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``. ``step``
            may return either ``(obs, reward, done, info)`` or
            ``(obs, reward, terminated, truncated, info)``; ``reset`` may
            return either the observation or an ``(obs, info)`` pair.
        parameters: Weight vector with one entry per observation component.

    Returns:
        The sum of the rewards collected until the environment signals the end
        of the episode or ``max_steps`` steps have been taken, whichever comes
        first.
    """
    reset_result = env.reset()
    # Newer Gym releases return (observation, info) from reset().
    if (isinstance(reset_result, tuple) and len(reset_result) == 2
            and isinstance(reset_result[1], dict)):
        observation = reset_result[0]
    else:
        observation = reset_result

    cumulative_reward = 0
    max_steps = 200
    for _ in range(max_steps):
        action = 0 if np.matmul(parameters, observation) < 0 else 2

        step_result = env.step(action)
        if len(step_result) == 5:
            observation, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            observation, reward, done, _ = step_result

        cumulative_reward += reward
        if done:
            break

    # Also reached when the step budget runs out before the environment
    # reports completion; previously that path returned None.
    return cumulative_reward
##########################################
###############BEGIN PROGRAM##############
##########################################
# Hill-climbing search over the two policy weights: perturb the current best
# weights with uniform noise, score the candidate by its average episode
# reward, and keep it only if that average beats the best seen so far.
env = gym.make('MountainCar-v0') #The mountaincar source code can be edited to perform different tests
#env._max_episode_steps = 500
visualization = False  # not read anywhere in this script
exportFilename = "Hill Climbing MountainCar.txt"  # not read anywhere in this script
trials = 50  # episodes averaged to score one parameter vector
iters = 150  # maximum number of candidate parameter vectors to try
start_time = timeit.default_timer()

###SIMULATE###
noise_scaling = 0.2  # scale applied to the uniform [-1, 1) perturbation
# Random starting weights in [-1, 1); overridden by the fixed start below.
parameters = np.random.rand(2) * 2 - 1
parameters = [-1.87949625e-04, 5.72014565e-01] #New

# Baseline: average reward of the starting weights over `trials` episodes.
oldR = [run_episode(env, parameters) for _ in range(trials)]
bestreward = np.mean(oldR)
print("Best Reward: " + str(bestreward))
print("Best Parameters: " + str(parameters))

for _ in range(iters):
    newparams = parameters + (np.random.rand(2) * 2 - 1)*noise_scaling
    #newparams = parameters
    reward = [run_episode(env, newparams) for _ in range(trials)]
    # Score the candidate once and reuse the value for every comparison below.
    mean_reward = np.mean(reward)
    #print(mean_reward)

    # NOTE(review): the nesting of the progress prints and of the early exit
    # inside this branch is reconstructed -- confirm against the intended
    # control flow.
    if mean_reward > bestreward:
        bestreward = mean_reward
        parameters = newparams
        print("\nCurrent Best: " + str(bestreward))
        print("Best Parameters: " + str(parameters))
        # Presumably an average of 0 is the best attainable score, so the
        # search can stop early -- verify against the environment's rewards.
        if mean_reward == 0:
            break

print("\nBest Reward: " + str(bestreward))
print(parameters)
elapsed = timeit.default_timer() - start_time
print("\nTIME: " + str(elapsed))

# Release the environment's resources now that the search is finished.
env.close()