Use gym and atari wrappers instead of chainerrl.envs.ale #253

Merged: 40 commits, merged on May 29, 2018

Commits (40)
b4be931
Copy atari_wrappers.py and comment on the license
muupan Mar 20, 2018
1241675
Fix style
muupan Mar 20, 2018
e41d760
Support old gym
muupan Mar 21, 2018
6ffeec9
Use atari_wrappers
muupan Mar 21, 2018
59abb04
Merge branch 'c51' into atari-wrappers
muupan Mar 22, 2018
85e0356
Use make_atari as well as wrap_deepmind
muupan Mar 22, 2018
cda70e8
Turn FireResetEnv off by default
muupan Mar 22, 2018
8c448f9
Add eval_max_episode_len as 5 minutes
muupan Mar 22, 2018
5b409ad
Merge branch 'c51' into atari-wrappers
muupan Mar 28, 2018
6b83f18
Merge branch 'c51' into atari-wrappers
muupan Mar 29, 2018
4287965
Merge branch 'c51' into atari-wrappers
muupan Mar 30, 2018
eb60d7a
Merge branch 'c51' into atari-wrappers
muupan Apr 5, 2018
d7e57b3
Merge branch 'master' into atari-wrappers
muupan Apr 16, 2018
5b98f3f
Add --logging-level to train_a3c_ale.py
muupan Apr 16, 2018
10911c7
Merge branch 'logger-level-for-a3c' into atari-wrappers-a3c
muupan Apr 16, 2018
caec04f
Use atari_wrappers in train_a3c_ale.py
muupan Apr 16, 2018
a8dac37
Cast to int since np.int64 doesn't work
muupan Apr 16, 2018
1362086
Set --max-episode-len to 30 min by default
muupan Apr 16, 2018
1bcffb6
Merge branch 'master' into atari-wrappers
muupan Apr 18, 2018
883ec7e
Merge branch 'atari-wrappers' into atari-wrappers-a3c
muupan Apr 18, 2018
739b401
Add --render and --monitor
muupan Apr 20, 2018
162b8dd
Merge branch 'atari-wrappers-a3c' into atari-wrappers
muupan Apr 20, 2018
f5e8fce
Support both hwc and chw layout
muupan Apr 20, 2018
9502232
Add --render --monitor
muupan Apr 20, 2018
cf12f9f
Merge branch 'master' into atari-wrappers
muupan Apr 20, 2018
23d7c3a
Use save_best_so_far_agent=False
muupan Apr 20, 2018
534757a
Use atari_wrappers for PPO
muupan Apr 20, 2018
d616d93
Set save_best_so_far_agent=False
muupan Apr 20, 2018
fba1e33
Remove --use-sdl
muupan Apr 20, 2018
457b3fa
Use atari_wrappers for ACER
muupan Apr 20, 2018
6fb96e7
Use atari_wrappers for NSQ
muupan Apr 20, 2018
7cc2fb4
Use 10 ** 7 steps
muupan Apr 21, 2018
068bdf0
Add opencv-python as an optional dependency
muupan Apr 24, 2018
fa1e359
Use 8 * 10 ** 8 steps again for A3C and NSQ
muupan Apr 24, 2018
0dd1ae7
Merge branch 'master' into atari-wrappers
muupan Apr 24, 2018
9e25356
Replace pong with --env option
muupan Apr 24, 2018
c9b3ae1
Limit each episode 5min everywhere
muupan Apr 25, 2018
3d07043
Merge branch 'master' into atari-wrappers
toslunar May 29, 2018
493f7c7
Add noqa as 731d6dd327f8d869ac2a8b68d219c1540e6d769c
toslunar May 29, 2018
943869e
Merge pull request #1 from toslunar/pr253
muupan May 29, 2018
1 change: 1 addition & 0 deletions .travis.yml
@@ -22,6 +22,7 @@ install:
- pip install autopep8
- pip install hacking
- pip install coveralls
- pip install opencv-python
- python setup.py develop
- python -c "import numpy; numpy.show_config()"
before_script:
277 changes: 277 additions & 0 deletions examples/ale/atari_wrappers.py
@@ -0,0 +1,277 @@
"""This file is a fork from a MIT-licensed project named OpenAI Baselines:
https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py
"""

from collections import deque

import cv2
import gym
import numpy as np

from chainerrl import spaces

cv2.ocl.setUseOpenCL(False)


class NoopResetEnv(gym.Wrapper):
def __init__(self, env, noop_max=30):
"""Sample initial states by taking random number of no-ops on reset.

No-op is assumed to be action 0.
"""
gym.Wrapper.__init__(self, env)
self.noop_max = noop_max
self.override_num_noops = None
self.noop_action = 0
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

def _reset(self, **kwargs):
"""Do no-op action for a number of steps in [1, noop_max]."""
self.env.reset(**kwargs)
if self.override_num_noops is not None:
noops = self.override_num_noops
else:
noops = self.unwrapped.np_random.randint(
1, self.noop_max + 1) # pylint: disable=E1101
assert noops > 0
obs = None
for _ in range(noops):
obs, _, done, _ = self.env.step(self.noop_action)
if done:
obs = self.env.reset(**kwargs)
return obs

def _step(self, ac):
return self.env.step(ac)


class FireResetEnv(gym.Wrapper):
def __init__(self, env):
"""Take action on reset for envs that are fixed until firing."""
gym.Wrapper.__init__(self, env)
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
assert len(env.unwrapped.get_action_meanings()) >= 3

def _reset(self, **kwargs):
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(1)
if done:
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(2)
if done:
self.env.reset(**kwargs)
return obs

def _step(self, ac):
return self.env.step(ac)


class EpisodicLifeEnv(gym.Wrapper):
def __init__(self, env):
"""Make end-of-life == end-of-episode, but only reset on true game over.

Done by DeepMind for the DQN and co. since it helps value estimation.
"""
gym.Wrapper.__init__(self, env)
self.lives = 0
self.was_real_done = True

def _step(self, action):
obs, reward, done, info = self.env.step(action)
self.was_real_done = done
# check current lives, make loss of life terminal,
# then update lives to handle bonus lives
lives = self.env.unwrapped.ale.lives()
if lives < self.lives and lives > 0:
                # For Qbert, we sometimes stay in the lives == 0 condition
                # for a few frames, so it's important to keep lives > 0 so
                # that we only reset once the environment advertises done.
done = True
self.lives = lives
return obs, reward, done, info

def _reset(self, **kwargs):
"""Reset only when lives are exhausted.

This way all states are still reachable even though lives are episodic,
and the learner need not know about any of this behind-the-scenes.
"""
if self.was_real_done:
obs = self.env.reset(**kwargs)
else:
# no-op step to advance from terminal/lost life state
obs, _, _, _ = self.env.step(0)
self.lives = self.env.unwrapped.ale.lives()
return obs


class MaxAndSkipEnv(gym.Wrapper):
def __init__(self, env, skip=4):
"""Return only every `skip`-th frame"""
gym.Wrapper.__init__(self, env)
# most recent raw observations (for max pooling across time steps)
self._obs_buffer = np.zeros(
(2,) + env.observation_space.shape, dtype=np.uint8)
self._skip = skip

def _step(self, action):
"""Repeat action, sum reward, and max over last observations."""
total_reward = 0.0
done = None
for i in range(self._skip):
obs, reward, done, info = self.env.step(action)
if i == self._skip - 2:
self._obs_buffer[0] = obs
if i == self._skip - 1:
self._obs_buffer[1] = obs
total_reward += reward
if done:
break
# Note that the observation on the done=True frame
# doesn't matter
max_frame = self._obs_buffer.max(axis=0)

return max_frame, total_reward, done, info

def _reset(self, **kwargs):
return self.env.reset(**kwargs)


class ClipRewardEnv(gym.RewardWrapper):
def __init__(self, env):
gym.RewardWrapper.__init__(self, env)

def _reward(self, reward):
"""Bin reward to {+1, 0, -1} by its sign."""
return np.sign(reward)


class WarpFrame(gym.ObservationWrapper):
def __init__(self, env, channel_order='hwc'):
"""Warp frames to 84x84 as done in the Nature paper and later work."""
gym.ObservationWrapper.__init__(self, env)
self.width = 84
self.height = 84
shape = {
'hwc': (self.height, self.width, 1),
'chw': (1, self.height, self.width),
}
self.observation_space = spaces.Box(
low=0, high=255,
shape=shape[channel_order], dtype=np.uint8)

def _observation(self, frame):
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
frame = cv2.resize(frame, (self.width, self.height),
interpolation=cv2.INTER_AREA)
return frame.reshape(self.observation_space.low.shape)


class FrameStack(gym.Wrapper):
def __init__(self, env, k, channel_order='hwc'):
"""Stack k last frames.

        Returns a lazy array, which is much more memory efficient.

See Also
--------
baselines.common.atari_wrappers.LazyFrames
"""
gym.Wrapper.__init__(self, env)
self.k = k
self.frames = deque([], maxlen=k)
orig_shape = env.observation_space.shape
self.stack_axis = {'hwc': 2, 'chw': 0}[channel_order]
shape = list(orig_shape)
shape[self.stack_axis] *= k
self.observation_space = spaces.Box(
low=0, high=255, shape=shape, dtype=np.uint8)

def _reset(self):
ob = self.env.reset()
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()

def _step(self, action):
ob, reward, done, info = self.env.step(action)
self.frames.append(ob)
return self._get_ob(), reward, done, info

def _get_ob(self):
assert len(self.frames) == self.k
return LazyFrames(list(self.frames), stack_axis=self.stack_axis)


class ScaledFloatFrame(gym.ObservationWrapper):
def __init__(self, env):
gym.ObservationWrapper.__init__(self, env)

def _observation(self, observation):
# careful! This undoes the memory optimization, use
# with smaller replay buffers only.
return np.array(observation).astype(np.float32) / 255.0


class LazyFrames(object):
"""Array-like object that lazily concat multiple frames.

This object ensures that common frames between the observations are only
stored once. It exists purely to optimize memory usage which can be huge
for DQN's 1M frames replay buffers.

    This object should only be converted to a numpy array before being passed
    to the model.

You'd not believe how complex the previous solution was.
"""

def __init__(self, frames, stack_axis=2):
self.stack_axis = stack_axis
self._frames = frames
self._out = None

def _force(self):
if self._out is None:
self._out = np.concatenate(self._frames, axis=self.stack_axis)
self._frames = None
return self._out

def __array__(self, dtype=None):
out = self._force()
if dtype is not None:
out = out.astype(dtype)
return out

def __len__(self):
return len(self._force())

def __getitem__(self, i):
return self._force()[i]


def make_atari(env_id):
env = gym.make(env_id)
assert 'NoFrameskip' in env.spec.id
env = NoopResetEnv(env, noop_max=30)
env = MaxAndSkipEnv(env, skip=4)
return env


def wrap_deepmind(env, episode_life=True, clip_rewards=True,
frame_stack=True, scale=False, fire_reset=False,
channel_order='chw'):
"""Configure environment for DeepMind-style Atari."""
if episode_life:
env = EpisodicLifeEnv(env)
if fire_reset and 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = WarpFrame(env, channel_order=channel_order)
if scale:
env = ScaledFloatFrame(env)
if clip_rewards:
env = ClipRewardEnv(env)
if frame_stack:
env = FrameStack(env, 4, channel_order=channel_order)
return env
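
For reference, a minimal sketch of how these wrappers are intended to be combined, in the spirit of the example scripts this PR updates. This is an illustrative assumption rather than code from the diff: the import path, the PongNoFrameskip-v4 env id, and the explicit LazyFrames-to-ndarray conversion are placeholders.

# Hypothetical usage sketch (not part of this PR's diff).
import numpy as np

from atari_wrappers import make_atari, wrap_deepmind  # assumed import path

# Build a NoFrameskip ALE env with no-op resets and frame skipping, then
# apply DeepMind-style preprocessing with CHW layout and frame stacking.
env = make_atari('PongNoFrameskip-v4')  # illustrative env id
env = wrap_deepmind(env, episode_life=True, clip_rewards=True,
                    frame_stack=True, channel_order='chw')

obs = env.reset()
print(env.observation_space.shape)  # (4, 84, 84) with channel_order='chw'

# Observations are LazyFrames; convert to an ndarray only when feeding the
# model, so the replay-buffer memory savings are preserved.
action = env.action_space.sample()
obs, reward, done, info = env.step(action)
x = np.asarray(obs, dtype=np.float32) / 255.0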