mc_control_epsilon_greedy.py
import numpy as np
from collections import defaultdict
from utils.policy import make_epsilon_greedy_policy
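
# The epsilon-greedy policy helper is imported from utils.policy, which is not
# shown in this file. For reference, a minimal sketch of the assumed behavior
# (an assumption, not necessarily the repo's actual implementation), kept
# commented out so it does not shadow the import:
#
#     def make_epsilon_greedy_policy(Q, epsilon, nA):
#         def policy_fn(state):
#             # Give every action a baseline probability of epsilon / nA...
#             probs = np.ones(nA, dtype=float) * epsilon / nA
#             # ...and add the remaining 1 - epsilon mass to the greedy action.
#             best_action = np.argmax(Q[state])
#             probs[best_action] += 1.0 - epsilon
#             return probs
#         return policy_fn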
def mc_control_epsilon_greedy(env, num_episodes, discount_factor=1.0, epsilon=0.1):
    """
    Monte Carlo Control using Epsilon-Greedy policies.
    Finds an optimal epsilon-greedy policy.

    Args:
        env: OpenAI gym environment.
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state -> action values.
        policy is a function that takes a state as an argument and returns
        action probabilities.
    """
    # Keeps track of the sum and count of returns for each (state, action)
    # pair to calculate an average. We could use an array to save all
    # returns (like in the book) but that's memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
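    # (An equivalent constant-memory alternative, not used here, is the
    # incremental mean from the book: Q[s][a] += (G - Q[s][a]) / N(s, a).)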
    # The final action-value function.
    # A nested dictionary that maps state -> array of action values
    # (the array index corresponds to the action).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    # Initialize the epsilon-greedy policy
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
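    # With the standard construction sketched above, epsilon=0.1, and two
    # actions where action 0 is currently greedy, policy(state) would return
    # [0.95, 0.05]: 1 - epsilon + epsilon/2 for the greedy action and
    # epsilon/2 for the other.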
    for episode in range(num_episodes):
        # Keep track of the rewards, states, and actions visited in the current episode
        visited_rewards = []
        visited_state_actions = []
        current_state = env.reset()
        while True:
            # Choose an action based on the current policy
            probs = policy(current_state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, done, _ = env.step(action)
            visited_state_actions.append((current_state, action))
            visited_rewards.append(reward)
            if done:
                # This loop implements every-visit Monte Carlo: every occurrence
                # of a (state, action) pair in the episode contributes a return.
                was_state_action_visited = {}
                for i, state_action in enumerate(visited_state_actions):
                    # Uncomment this part to use first-visit Monte Carlo instead,
                    # which only counts the first occurrence in the episode:
                    # if state_action in was_state_action_visited:
                    #     continue
                    # was_state_action_visited[state_action] = True
                    returns_count[state_action] += 1.0
                    # Calculate the discounted return from this (state, action)
                    # onwards: G = r_i + gamma * r_{i+1} + gamma^2 * r_{i+2} + ...
                    return_ = 0.0
                    for j, reward in enumerate(visited_rewards[i:]):
                        return_ += (discount_factor ** j) * reward
                    returns_sum[state_action] += return_
                # Re-evaluate Q after every episode; the policy improves
                # as more experience accumulates.
                for state_action, count in returns_count.items():
                    state, action = state_action
                    Q[state][action] = returns_sum[state_action] / count
                # Improve the policy based on the newly evaluated Q
                policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
                break
            else:
                current_state = next_state
    return Q, policy
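
# Usage sketch (illustrative, not part of the original file). It assumes the
# classic gym API used by the function above, where env.reset() returns the
# state and env.step() returns a 4-tuple, e.g. gym's Blackjack-v0.
if __name__ == "__main__":
    import gym

    env = gym.make("Blackjack-v0")
    Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)
    # Greedy action for one example state: (player sum, dealer showing, usable ace)
    sample_state = (18, 7, False)
    print("Best action for", sample_state, "->", np.argmax(Q[sample_state]))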