diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000000..b3937f6ef3 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,15 @@ +[run] +branch = False +omit = + # Mujoco requires a licence + stable_baselines/*/run_mujoco.py + stable_baselines/ppo1/run_humanoid.py + stable_baselines/ppo1/run_robotics.py + # HER requires mpi and Mujoco + stable_baselines/her/experiment/ + +[report] +exclude_lines = + pragma: no cover + raise NotImplementedError() + if KFAC_DEBUG: diff --git a/.gitignore b/.gitignore index 722e942b29..99acdd0537 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,14 @@ *.pyc *.pkl *.py~ +*.bak .pytest_cache .DS_Store .idea +.coverage +.coverage.* +__pycache__/ +_build/ # Setuptools distribution and build folders. /dist/ @@ -34,5 +39,3 @@ src .cache MUJOCO_LOG.TXT - - diff --git a/.travis.yml b/.travis.yml index 5ba3eadd97..a09c537b88 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,9 @@ language: python python: - "3.6" +notifications: + email: false + services: - docker @@ -10,5 +13,5 @@ install: - docker build . -t baselines-test script: - - flake8 --select=F baselines/common - - docker run baselines-test pytest + - flake8 --select=F stable_baselines/common + - docker run --env CODACY_PROJECT_TOKEN=$CODACY_PROJECT_TOKEN baselines-test sh -c 'pytest --cov-config .coveragerc --cov-report term --cov-report xml --cov=. && python-codacy-coverage -r coverage.xml --token=$CODACY_PROJECT_TOKEN' diff --git a/Dockerfile b/Dockerfile index eeac22ad2f..a1e18598a4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,20 +1,45 @@ FROM ubuntu:16.04 -RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake +RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake libglib2.0-0 libsm6 libxext6 libfontconfig1 libxrender1 ENV CODE_DIR /root/code ENV VENV /root/venv -COPY . 
$CODE_DIR/baselines RUN \ pip install virtualenv && \ virtualenv $VENV --python=python3 && \ . $VENV/bin/activate && \ + mkdir $CODE_DIR && \ cd $CODE_DIR && \ pip install --upgrade pip && \ - pip install -e baselines && \ - pip install pytest + pip install pytest && \ + pip install pytest-cov && \ + pip install codacy-coverage && \ + pip install scipy && \ + pip install tqdm && \ + pip install joblib && \ + pip install zmq && \ + pip install dill && \ + pip install progressbar2 && \ + pip install mpi4py && \ + pip install cloudpickle && \ + pip install 'tensorflow>=1.5.0' && \ + pip install click && \ + pip install opencv-python && \ + pip install numpy && \ + pip install pandas && \ + pip install pytest && \ + pip install matplotlib && \ + pip install seaborn && \ + pip install glob2 && \ + pip install gym[mujoco,atari,classic_control,robotics] + +COPY . $CODE_DIR/stable_baselines +RUN \ + . $VENV/bin/activate && \ + cd $CODE_DIR && \ + pip install -e stable_baselines ENV PATH=$VENV/bin:$PATH -WORKDIR $CODE_DIR/baselines +WORKDIR $CODE_DIR/stable_baselines CMD /bin/bash diff --git a/README.md b/README.md index 197f01af97..a36d3f8554 100644 --- a/README.md +++ b/README.md @@ -1,87 +1,163 @@ - [![Build status](https://travis-ci.org/openai/baselines.svg?branch=master)](https://travis-ci.org/openai/baselines) +[![Build Status](https://travis-ci.com/hill-a/stable-baselines.svg?branch=stable)](https://travis-ci.com/hill-a/stable-baselines) [![Documentation Status](https://readthedocs.org/projects/stable-baselines/badge/?version=docs)](https://stable-baselines.readthedocs.io/en/docs/?badge=master) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/3bcb4cd6d76a4270acb16b5fe6dd9efa)](https://www.codacy.com/app/baselines_janitors/stable-baselines?utm_source=github.com&utm_medium=referral&utm_content=hill-a/stable-baselines&utm_campaign=Badge_Grade) [![Codacy
Badge](https://api.codacy.com/project/badge/Coverage/3bcb4cd6d76a4270acb16b5fe6dd9efa)](https://www.codacy.com/app/baselines_janitors/stable-baselines?utm_source=github.com&utm_medium=referral&utm_content=hill-a/stable-baselines&utm_campaign=Badge_Coverage) -# Baselines +# Stable Baselines -OpenAI Baselines is a set of high-quality implementations of reinforcement learning algorithms. +Stable Baselines is a set of improved implementations of reinforcement learning algorithms based on OpenAI [Baselines](https://github.com/openai/baselines/). -These algorithms will make it easier for the research community to replicate, refine, and identify new ideas, and will create good baselines to build research on top of. Our DQN implementation and its variants are roughly on par with the scores in published papers. We expect they will be used as a base around which new ideas can be added, and as a tool for comparing a new approach against existing ones. +You can read a detailed presentation of Stable Baselines in the [Medium article](https://medium.com/@araffin/stable-baselines-a-fork-of-openai-baselines-reinforcement-learning-made-easy-df87c4b2fc82). -## Prerequisites + +These algorithms will make it easier for the research community and industry to replicate, refine, and identify new ideas, and will create good baselines to build projects on top of. We expect these tools will be used as a base around which new ideas can be added, and as a tool for comparing a new approach against existing ones. We also hope that the simplicity of these tools will allow beginners to experiment with a more advanced toolset, without being buried in implementation details. 
+ +## Main differences with OpenAI Baselines + +This toolset is a fork of OpenAI Baselines, with a major structural refactoring, and code cleanups: +- Unified structure for all algorithms +- PEP8 compliant (unified code style) +- Documented functions and classes +- More tests & more code coverage + +## Documentation + +Documentation is available online: [http://stable-baselines.readthedocs.io/](http://stable-baselines.readthedocs.io/) + +## Installation + +### Prerequisites Baselines requires python3 (>=3.5) with the development headers. You'll also need system packages CMake, OpenMPI and zlib. Those can be installed as follows -### Ubuntu - + +#### Ubuntu + ```bash sudo apt-get update && sudo apt-get install cmake libopenmpi-dev python3-dev zlib1g-dev ``` - -### Mac OS X + +#### Mac OS X Installation of system packages on Mac requires [Homebrew](https://brew.sh). With Homebrew installed, run the follwing: ```bash brew install cmake openmpi ``` - -## Virtual environment -From the general python package sanity perspective, it is a good idea to use virtual environments (virtualenvs) to make sure packages from different projects do not interfere with each other. You can install virtualenv (which is itself a pip package) via -```bash -pip install virtualenv -``` -Virtualenvs are essentially folders that have copies of python executable and all python packages. -To create a virtualenv called venv with python3, one runs -```bash -virtualenv /path/to/venv --python=python3 -``` -To activate a virtualenv: + +### Install using pip +Install the Stable Baselines package + +Using pip from pypi: ``` -. /path/to/venv/bin/activate +pip install stable-baselines ``` -More thorough tutorial on virtualenvs and options can be found [here](https://virtualenv.pypa.io/en/stable/) +Please read the [documentation](http://stable-baselines.readthedocs.io/) for more details and alternatives. 
-## Installation -Clone the repo and cd into it: -```bash -git clone https://github.com/openai/baselines.git -cd baselines -``` -If using virtualenv, create a new virtualenv and activate it -```bash - virtualenv env --python=python3 - . env/bin/activate + +## Example + +Most of the library tries to follow a sklearn-like syntax for the Reinforcement Learning algorithms. + +Here is a quick example of how to train and run PPO2 on a cartpole environment: +```python +import gym + +from stable_baselines.common.policies import MlpPolicy +from stable_baselines.common.vec_env import DummyVecEnv +from stable_baselines import PPO2 + +env = gym.make('CartPole-v1') +env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized environment to run + +model = PPO2(MlpPolicy, env, verbose=1) +model.learn(total_timesteps=10000) + +obs = env.reset() +for i in range(1000): + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() ``` -Install baselines package -```bash -pip install -e . + +Or just train a model with a one liner if [the environment is registered in Gym](https://github.com/openai/gym/wiki/Environments): + +```python + +from stable_baselines.common.policies import MlpPolicy +from stable_baselines import PPO2 + +model = PPO2(MlpPolicy, 'CartPole-v1').learn(10000) + ``` -### MuJoCo + +Please read the [documentation](http://stable-baselines.readthedocs.io/) for more examples. + + +## Try it online with Colab Notebooks ! 
+ +All the following examples can be executed online using Google colab notebooks: + +- [Getting Started](https://colab.research.google.com/drive/1_1H5bjWKYBVKbbs-Kj83dsfuZieDNcFU) +- [Training, Saving, Loading](https://colab.research.google.com/drive/1KoAQ1C_BNtGV3sVvZCnNZaER9rstmy0s) +- [Multiprocessing](https://colab.research.google.com/drive/1ZzNFMUUi923foaVsYb4YjPy4mjKtnOxb) +- [Monitor Training and Plotting](https://colab.research.google.com/drive/1L_IMo6v0a0ALK8nefZm6PqPSy0vZIWBT) +- [Atari Games](https://colab.research.google.com/drive/1iYK11yDzOOqnrXi1Sfjm1iekZr4cxLaN) + + +## Implemented Algorithms + +| **Name** | **Refactored**(1) | **Recurrent** | ```Box``` | ```Discrete``` | ```MultiDiscrete``` | ```MultiBinary``` | **Multi Processing** | +| ------------------- | ---------------------------- | ------------------ | ------------------ | ------------------ | ------------------- | ------------------ | --------------------------------- | +| A2C | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | +| ACER | :heavy_check_mark: | :heavy_check_mark: | :x: (5) | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | +| ACKTR | :heavy_check_mark: | :heavy_check_mark: | :x: (5) | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | +| DDPG | :heavy_check_mark: | :x: | :heavy_check_mark: | :x: | :x: | :x: | :x: | +| DeepQ | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | :x: | :x: | :x: | +| GAIL (2) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: | :x: | :heavy_check_mark: (4) | +| HER (3) | :x: (5) | :x: | :heavy_check_mark: | :x: | :x: | :x: | :x: | +| PPO1 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: (4) | +| PPO2 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | 
:heavy_check_mark: | :heavy_check_mark: | +| TRPO | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: (4) | + +(1): Whether or not the algorithm has been refactored to fit the ```BaseRLModel``` class.
+(2): Only implemented for TRPO.
+(3): Only implemented for DDPG.
+(4): Multi Processing with [MPI](https://mpi4py.readthedocs.io/en/stable/).
+(5): TODO, in project scope. + +Actions ```gym.spaces```: + * ```Box```: An N-dimensional box that contains every point in the action space. + * ```Discrete```: A list of possible actions, where each timestep only one of the actions can be used. + * ```MultiDiscrete```: A list of possible actions, where each timestep only one action of each discrete set can be used. + * ```MultiBinary```: A list of possible actions, where each timestep any of the actions can be used in any combination. + + +## MuJoCo Some of the baselines examples use [MuJoCo](http://www.mujoco.org) (multi-joint dynamics in contact) physics simulator, which is proprietary and requires binaries and a license (temporary 30-day license can be obtained from [www.mujoco.org](http://www.mujoco.org)). Instructions on setting up MuJoCo can be found [here](https://github.com/openai/mujoco-py) ## Testing the installation All unit tests in baselines can be run using pytest runner: ``` -pip install pytest -pytest +pip install pytest pytest-cov +pytest --cov-config .coveragerc --cov-report html --cov-report term --cov=. 
``` -## Subpackages - -- [A2C](baselines/a2c) -- [ACER](baselines/acer) -- [ACKTR](baselines/acktr) -- [DDPG](baselines/ddpg) -- [DQN](baselines/deepq) -- [GAIL](baselines/gail) -- [HER](baselines/her) -- [PPO1](baselines/ppo1) (Multi-CPU using MPI) -- [PPO2](baselines/ppo2) (Optimized for GPU) -- [TRPO](baselines/trpo_mpi) +## Citing the Project To cite this repository in publications: - @misc{baselines, - author = {Dhariwal, Prafulla and Hesse, Christopher and Klimov, Oleg and Nichol, Alex and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai}, - title = {OpenAI Baselines}, - year = {2017}, +``` + @misc{stable-baselines, + author = {Hill, Ashley and Raffin, Antonin and Traore, Rene and Dhariwal, Prafulla and Hesse, Christopher and Klimov, Oleg and Nichol, Alex and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai}, + title = {Stable Baselines}, + year = {2018}, publisher = {GitHub}, journal = {GitHub repository}, - howpublished = {\url{https://github.com/openai/baselines}}, + howpublished = {\url{https://github.com/hill-a/stable-baselines}}, } +``` + +## How To Contribute + +To any interested in making the baselines better, there is still some documentation that needs to be done. +If you want to contribute, please open an issue first and then propose your pull request. + +Nice to have (for the future): +- [ ] Continuous actions support for ACER +- [ ] Continuous actions support for ACKTR +- [ ] Tensorboard integration (see branch `Tensorboard`) diff --git a/baselines/a2c/README.md b/baselines/a2c/README.md deleted file mode 100644 index 2df6eb2ee2..0000000000 --- a/baselines/a2c/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# A2C - -- Original paper: https://arxiv.org/abs/1602.01783 -- Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ -- `python -m baselines.a2c.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. 
\ No newline at end of file diff --git a/baselines/a2c/a2c.py b/baselines/a2c/a2c.py deleted file mode 100644 index f1de88a37e..0000000000 --- a/baselines/a2c/a2c.py +++ /dev/null @@ -1,160 +0,0 @@ -import os.path as osp -import time -import joblib -import numpy as np -import tensorflow as tf -from baselines import logger - -from baselines.common import set_global_seeds, explained_variance -from baselines.common.runners import AbstractEnvRunner -from baselines.common import tf_util - -from baselines.a2c.utils import discount_with_dones -from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables -from baselines.a2c.utils import cat_entropy, mse - -class Model(object): - - def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, - ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, - alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): - - sess = tf_util.make_session() - nbatch = nenvs*nsteps - - A = tf.placeholder(tf.int32, [nbatch]) - ADV = tf.placeholder(tf.float32, [nbatch]) - R = tf.placeholder(tf.float32, [nbatch]) - LR = tf.placeholder(tf.float32, []) - - step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) - train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) - - neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) - pg_loss = tf.reduce_mean(ADV * neglogpac) - vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) - entropy = tf.reduce_mean(cat_entropy(train_model.pi)) - loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef - - params = find_trainable_variables("model") - grads = tf.gradients(loss, params) - if max_grad_norm is not None: - grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) - grads = list(zip(grads, params)) - trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) - _train = trainer.apply_gradients(grads) - - lr = Scheduler(v=lr, nvalues=total_timesteps, 
schedule=lrschedule) - - def train(obs, states, rewards, masks, actions, values): - advs = rewards - values - for step in range(len(obs)): - cur_lr = lr.value() - td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} - if states is not None: - td_map[train_model.S] = states - td_map[train_model.M] = masks - policy_loss, value_loss, policy_entropy, _ = sess.run( - [pg_loss, vf_loss, entropy, _train], - td_map - ) - return policy_loss, value_loss, policy_entropy - - def save(save_path): - ps = sess.run(params) - make_path(osp.dirname(save_path)) - joblib.dump(ps, save_path) - - def load(load_path): - loaded_params = joblib.load(load_path) - restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) - sess.run(restores) - - self.train = train - self.train_model = train_model - self.step_model = step_model - self.step = step_model.step - self.value = step_model.value - self.initial_state = step_model.initial_state - self.save = save - self.load = load - tf.global_variables_initializer().run(session=sess) - -class Runner(AbstractEnvRunner): - - def __init__(self, env, model, nsteps=5, gamma=0.99): - super().__init__(env=env, model=model, nsteps=nsteps) - self.gamma = gamma - - def run(self): - mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] - mb_states = self.states - for n in range(self.nsteps): - actions, values, states, _ = self.model.step(self.obs, self.states, self.dones) - mb_obs.append(np.copy(self.obs)) - mb_actions.append(actions) - mb_values.append(values) - mb_dones.append(self.dones) - obs, rewards, dones, _ = self.env.step(actions) - self.states = states - self.dones = dones - for n, done in enumerate(dones): - if done: - self.obs[n] = self.obs[n]*0 - self.obs = obs - mb_rewards.append(rewards) - mb_dones.append(self.dones) - #batch of steps to batch of rollouts - mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape) - mb_rewards = 
np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) - mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) - mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) - mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) - mb_masks = mb_dones[:, :-1] - mb_dones = mb_dones[:, 1:] - last_values = self.model.value(self.obs, self.states, self.dones).tolist() - #discount/bootstrap off value fn - for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): - rewards = rewards.tolist() - dones = dones.tolist() - if dones[-1] == 0: - rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1] - else: - rewards = discount_with_dones(rewards, dones, self.gamma) - mb_rewards[n] = rewards - mb_rewards = mb_rewards.flatten() - mb_actions = mb_actions.flatten() - mb_values = mb_values.flatten() - mb_masks = mb_masks.flatten() - return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values - -def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100): - set_global_seeds(seed) - - nenvs = env.num_envs - ob_space = env.observation_space - ac_space = env.action_space - model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, - max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule) - runner = Runner(env, model, nsteps=nsteps, gamma=gamma) - - nbatch = nenvs*nsteps - tstart = time.time() - for update in range(1, total_timesteps//nbatch+1): - obs, states, rewards, masks, actions, values = runner.run() - policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) - nseconds = time.time()-tstart - fps = int((update*nbatch)/nseconds) - if update % log_interval == 0 or update == 1: - ev = 
explained_variance(values, rewards) - logger.record_tabular("nupdates", update) - logger.record_tabular("total_timesteps", update*nbatch) - logger.record_tabular("fps", fps) - logger.record_tabular("policy_entropy", float(policy_entropy)) - logger.record_tabular("value_loss", float(value_loss)) - logger.record_tabular("explained_variance", float(ev)) - logger.dump_tabular() - env.close() - return model diff --git a/baselines/a2c/policies.py b/baselines/a2c/policies.py deleted file mode 100644 index 6fbbb14ac8..0000000000 --- a/baselines/a2c/policies.py +++ /dev/null @@ -1,146 +0,0 @@ -import numpy as np -import tensorflow as tf -from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm -from baselines.common.distributions import make_pdtype -from baselines.common.input import observation_input - -def nature_cnn(unscaled_images, **conv_kwargs): - """ - CNN from Nature paper. - """ - scaled_images = tf.cast(unscaled_images, tf.float32) / 255. - activ = tf.nn.relu - h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), - **conv_kwargs)) - h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = conv_to_fc(h3) - return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) - -class LnLstmPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - X, processed_x = observation_input(ob_space, nbatch) - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi 
= self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class LstmPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class CnnPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x, **conv_kwargs) - vf = fc(h, 'v', 1)[:,0] - self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - 
self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value - -class MlpPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - X, processed_x = observation_input(ob_space, nbatch) - activ = tf.tanh - processed_x = tf.layers.flatten(processed_x) - pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2))) - pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2))) - vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2))) - vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2))) - vf = fc(vf_h2, 'vf', 1)[:,0] - - self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01) - - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value diff --git a/baselines/a2c/run_atari.py b/baselines/a2c/run_atari.py deleted file mode 100644 index b09d9bbffe..0000000000 --- a/baselines/a2c/run_atari.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python3 - -from baselines import logger -from baselines.common.cmd_util import make_atari_env, atari_arg_parser -from baselines.common.vec_env.vec_frame_stack import VecFrameStack -from baselines.a2c.a2c import learn -from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy - -def train(env_id, num_timesteps, seed, policy, lrschedule, num_env): - if policy == 'cnn': - policy_fn = CnnPolicy - elif 
policy == 'lstm': - policy_fn = LstmPolicy - elif policy == 'lnlstm': - policy_fn = LnLstmPolicy - env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4) - learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule) - env.close() - -def main(): - parser = atari_arg_parser() - parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn') - parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant') - args = parser.parse_args() - logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy, lrschedule=args.lrschedule, num_env=16) - -if __name__ == '__main__': - main() diff --git a/baselines/a2c/utils.py b/baselines/a2c/utils.py deleted file mode 100644 index a7610ebcdc..0000000000 --- a/baselines/a2c/utils.py +++ /dev/null @@ -1,290 +0,0 @@ -import os -import gym -import numpy as np -import tensorflow as tf -from gym import spaces -from collections import deque - -def sample(logits): - noise = tf.random_uniform(tf.shape(logits)) - return tf.argmax(logits - tf.log(-tf.log(noise)), 1) - -def cat_entropy(logits): - a0 = logits - tf.reduce_max(logits, 1, keep_dims=True) - ea0 = tf.exp(a0) - z0 = tf.reduce_sum(ea0, 1, keep_dims=True) - p0 = ea0 / z0 - return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1) - -def cat_entropy_softmax(p0): - return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1) - -def mse(pred, target): - return tf.square(pred-target)/2. 
- -def ortho_init(scale=1.0): - def _ortho_init(shape, dtype, partition_info=None): - #lasagne ortho init for tf - shape = tuple(shape) - if len(shape) == 2: - flat_shape = shape - elif len(shape) == 4: # assumes NHWC - flat_shape = (np.prod(shape[:-1]), shape[-1]) - else: - raise NotImplementedError - a = np.random.normal(0.0, 1.0, flat_shape) - u, _, v = np.linalg.svd(a, full_matrices=False) - q = u if u.shape == flat_shape else v # pick the one with the correct shape - q = q.reshape(shape) - return (scale * q[:shape[0], :shape[1]]).astype(np.float32) - return _ortho_init - -def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0, data_format='NHWC', one_dim_bias=False): - if data_format == 'NHWC': - channel_ax = 3 - strides = [1, stride, stride, 1] - bshape = [1, 1, 1, nf] - elif data_format == 'NCHW': - channel_ax = 1 - strides = [1, 1, stride, stride] - bshape = [1, nf, 1, 1] - else: - raise NotImplementedError - bias_var_shape = [nf] if one_dim_bias else [1, nf, 1, 1] - nin = x.get_shape()[channel_ax].value - wshape = [rf, rf, nin, nf] - with tf.variable_scope(scope): - w = tf.get_variable("w", wshape, initializer=ortho_init(init_scale)) - b = tf.get_variable("b", bias_var_shape, initializer=tf.constant_initializer(0.0)) - if not one_dim_bias and data_format == 'NHWC': - b = tf.reshape(b, bshape) - return b + tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format) - -def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0): - with tf.variable_scope(scope): - nin = x.get_shape()[1].value - w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale)) - b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias)) - return tf.matmul(x, w)+b - -def batch_to_seq(h, nbatch, nsteps, flat=False): - if flat: - h = tf.reshape(h, [nbatch, nsteps]) - else: - h = tf.reshape(h, [nbatch, nsteps, -1]) - return [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps, value=h)] - -def seq_to_batch(h, flat = 
False): - shape = h[0].get_shape().as_list() - if not flat: - assert(len(shape) > 1) - nh = h[0].get_shape()[-1].value - return tf.reshape(tf.concat(axis=1, values=h), [-1, nh]) - else: - return tf.reshape(tf.stack(values=h, axis=1), [-1]) - -def lstm(xs, ms, s, scope, nh, init_scale=1.0): - nbatch, nin = [v.value for v in xs[0].get_shape()] - nsteps = len(xs) - with tf.variable_scope(scope): - wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale)) - wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale)) - b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0)) - - c, h = tf.split(axis=1, num_or_size_splits=2, value=s) - for idx, (x, m) in enumerate(zip(xs, ms)): - c = c*(1-m) - h = h*(1-m) - z = tf.matmul(x, wx) + tf.matmul(h, wh) + b - i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z) - i = tf.nn.sigmoid(i) - f = tf.nn.sigmoid(f) - o = tf.nn.sigmoid(o) - u = tf.tanh(u) - c = f*c + i*u - h = o*tf.tanh(c) - xs[idx] = h - s = tf.concat(axis=1, values=[c, h]) - return xs, s - -def _ln(x, g, b, e=1e-5, axes=[1]): - u, s = tf.nn.moments(x, axes=axes, keep_dims=True) - x = (x-u)/tf.sqrt(s+e) - x = x*g+b - return x - -def lnlstm(xs, ms, s, scope, nh, init_scale=1.0): - nbatch, nin = [v.value for v in xs[0].get_shape()] - nsteps = len(xs) - with tf.variable_scope(scope): - wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale)) - gx = tf.get_variable("gx", [nh*4], initializer=tf.constant_initializer(1.0)) - bx = tf.get_variable("bx", [nh*4], initializer=tf.constant_initializer(0.0)) - - wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale)) - gh = tf.get_variable("gh", [nh*4], initializer=tf.constant_initializer(1.0)) - bh = tf.get_variable("bh", [nh*4], initializer=tf.constant_initializer(0.0)) - - b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0)) - - gc = tf.get_variable("gc", [nh], initializer=tf.constant_initializer(1.0)) - bc = 
tf.get_variable("bc", [nh], initializer=tf.constant_initializer(0.0)) - - c, h = tf.split(axis=1, num_or_size_splits=2, value=s) - for idx, (x, m) in enumerate(zip(xs, ms)): - c = c*(1-m) - h = h*(1-m) - z = _ln(tf.matmul(x, wx), gx, bx) + _ln(tf.matmul(h, wh), gh, bh) + b - i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z) - i = tf.nn.sigmoid(i) - f = tf.nn.sigmoid(f) - o = tf.nn.sigmoid(o) - u = tf.tanh(u) - c = f*c + i*u - h = o*tf.tanh(_ln(c, gc, bc)) - xs[idx] = h - s = tf.concat(axis=1, values=[c, h]) - return xs, s - -def conv_to_fc(x): - nh = np.prod([v.value for v in x.get_shape()[1:]]) - x = tf.reshape(x, [-1, nh]) - return x - -def discount_with_dones(rewards, dones, gamma): - discounted = [] - r = 0 - for reward, done in zip(rewards[::-1], dones[::-1]): - r = reward + gamma*r*(1.-done) # fixed off by one bug - discounted.append(r) - return discounted[::-1] - -def find_trainable_variables(key): - with tf.variable_scope(key): - return tf.trainable_variables() - -def make_path(f): - return os.makedirs(f, exist_ok=True) - -def constant(p): - return 1 - -def linear(p): - return 1-p - -def middle_drop(p): - eps = 0.75 - if 1-p 0: - buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size) - else: - buffer = None - nbatch = nenvs*nsteps - acer = Acer(runner, model, buffer, log_interval) - acer.tstart = time.time() - for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls - acer.call(on_policy=True) - if replay_ratio > 0 and buffer.has_atleast(replay_start): - n = np.random.poisson(replay_ratio) - for _ in range(n): - acer.call(on_policy=False) # no simulation steps in this - - env.close() diff --git a/baselines/acer/buffer.py b/baselines/acer/buffer.py deleted file mode 100644 index 2dcfa1098a..0000000000 --- a/baselines/acer/buffer.py +++ /dev/null @@ -1,103 +0,0 @@ -import numpy as np - -class Buffer(object): - # gets obs, actions, rewards, mu's, (states, masks), dones - 
def __init__(self, env, nsteps, nstack, size=50000): - self.nenv = env.num_envs - self.nsteps = nsteps - self.nh, self.nw, self.nc = env.observation_space.shape - self.nstack = nstack - self.nbatch = self.nenv * self.nsteps - self.size = size // (self.nsteps) # Each loc contains nenv * nsteps frames, thus total buffer is nenv * size frames - - # Memory - self.enc_obs = None - self.actions = None - self.rewards = None - self.mus = None - self.dones = None - self.masks = None - - # Size indexes - self.next_idx = 0 - self.num_in_buffer = 0 - - def has_atleast(self, frames): - # Frames per env, so total (nenv * frames) Frames needed - # Each buffer loc has nenv * nsteps frames - return self.num_in_buffer >= (frames // self.nsteps) - - def can_sample(self): - return self.num_in_buffer > 0 - - # Generate stacked frames - def decode(self, enc_obs, dones): - # enc_obs has shape [nenvs, nsteps + nstack, nh, nw, nc] - # dones has shape [nenvs, nsteps, nh, nw, nc] - # returns stacked obs of shape [nenv, (nsteps + 1), nh, nw, nstack*nc] - nstack, nenv, nsteps, nh, nw, nc = self.nstack, self.nenv, self.nsteps, self.nh, self.nw, self.nc - y = np.empty([nsteps + nstack - 1, nenv, 1, 1, 1], dtype=np.float32) - obs = np.zeros([nstack, nsteps + nstack, nenv, nh, nw, nc], dtype=np.uint8) - x = np.reshape(enc_obs, [nenv, nsteps + nstack, nh, nw, nc]).swapaxes(1, - 0) # [nsteps + nstack, nenv, nh, nw, nc] - y[3:] = np.reshape(1.0 - dones, [nenv, nsteps, 1, 1, 1]).swapaxes(1, 0) # keep - y[:3] = 1.0 - # y = np.reshape(1 - dones, [nenvs, nsteps, 1, 1, 1]) - for i in range(nstack): - obs[-(i + 1), i:] = x - # obs[:,i:,:,:,-(i+1),:] = x - x = x[:-1] * y - y = y[1:] - return np.reshape(obs[:, 3:].transpose((2, 1, 3, 4, 0, 5)), [nenv, (nsteps + 1), nh, nw, nstack * nc]) - - def put(self, enc_obs, actions, rewards, mus, dones, masks): - # enc_obs [nenv, (nsteps + nstack), nh, nw, nc] - # actions, rewards, dones [nenv, nsteps] - # mus [nenv, nsteps, nact] - - if self.enc_obs is None: - 
self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=np.uint8) - self.actions = np.empty([self.size] + list(actions.shape), dtype=np.int32) - self.rewards = np.empty([self.size] + list(rewards.shape), dtype=np.float32) - self.mus = np.empty([self.size] + list(mus.shape), dtype=np.float32) - self.dones = np.empty([self.size] + list(dones.shape), dtype=np.bool) - self.masks = np.empty([self.size] + list(masks.shape), dtype=np.bool) - - self.enc_obs[self.next_idx] = enc_obs - self.actions[self.next_idx] = actions - self.rewards[self.next_idx] = rewards - self.mus[self.next_idx] = mus - self.dones[self.next_idx] = dones - self.masks[self.next_idx] = masks - - self.next_idx = (self.next_idx + 1) % self.size - self.num_in_buffer = min(self.size, self.num_in_buffer + 1) - - def take(self, x, idx, envx): - nenv = self.nenv - out = np.empty([nenv] + list(x.shape[2:]), dtype=x.dtype) - for i in range(nenv): - out[i] = x[idx[i], envx[i]] - return out - - def get(self): - # returns - # obs [nenv, (nsteps + 1), nh, nw, nstack*nc] - # actions, rewards, dones [nenv, nsteps] - # mus [nenv, nsteps, nact] - nenv = self.nenv - assert self.can_sample() - - # Sample exactly one id per env. If you sample across envs, then higher correlation in samples from same env. 
- idx = np.random.randint(0, self.num_in_buffer, nenv) - envx = np.arange(nenv) - - take = lambda x: self.take(x, idx, envx) # for i in range(nenv)], axis = 0) - dones = take(self.dones) - enc_obs = take(self.enc_obs) - obs = self.decode(enc_obs, dones) - actions = take(self.actions) - rewards = take(self.rewards) - mus = take(self.mus) - masks = take(self.masks) - return obs, actions, rewards, mus, dones, masks diff --git a/baselines/acer/policies.py b/baselines/acer/policies.py deleted file mode 100644 index 627c40016c..0000000000 --- a/baselines/acer/policies.py +++ /dev/null @@ -1,79 +0,0 @@ -import numpy as np -import tensorflow as tf -from baselines.ppo2.policies import nature_cnn -from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample - - -class AcerCnnPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False): - nbatch = nenv * nsteps - nh, nw, nc = ob_space.shape - ob_shape = (nbatch, nh, nw, nc * nstack) - nact = ac_space.n - X = tf.placeholder(tf.uint8, ob_shape) # obs - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) - pi_logits = fc(h, 'pi', nact, init_scale=0.01) - pi = tf.nn.softmax(pi_logits) - q = fc(h, 'q', nact) - - a = sample(pi_logits) # could change this to use self.pi instead - self.initial_state = [] # not stateful - self.X = X - self.pi = pi # actual policy params now - self.q = q - - def step(ob, *args, **kwargs): - # returns actions, mus, states - a0, pi0 = sess.run([a, pi], {X: ob}) - return a0, pi0, [] # dummy state - - def out(ob, *args, **kwargs): - pi0, q0 = sess.run([pi, q], {X: ob}) - return pi0, q0 - - def act(ob, *args, **kwargs): - return sess.run(a, {X: ob}) - - self.step = step - self.out = out - self.act = act - -class AcerLstmPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256): - nbatch = nenv * nsteps - nh, nw, nc = ob_space.shape - ob_shape = (nbatch, nh, nw, nc * nstack) - nact = 
ac_space.n - X = tf.placeholder(tf.uint8, ob_shape) # obs - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) - - # lstm - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - - pi_logits = fc(h5, 'pi', nact, init_scale=0.01) - pi = tf.nn.softmax(pi_logits) - q = fc(h5, 'q', nact) - - a = sample(pi_logits) # could change this to use self.pi instead - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - self.X = X - self.M = M - self.S = S - self.pi = pi # actual policy params now - self.q = q - - def step(ob, state, mask, *args, **kwargs): - # returns actions, mus, states - a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask}) - return a0, pi0, s - - self.step = step diff --git a/baselines/acer/run_atari.py b/baselines/acer/run_atari.py deleted file mode 100644 index cce979eddd..0000000000 --- a/baselines/acer/run_atari.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python3 -from baselines import logger -from baselines.acer.acer_simple import learn -from baselines.acer.policies import AcerCnnPolicy, AcerLstmPolicy -from baselines.common.cmd_util import make_atari_env, atari_arg_parser - -def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu): - env = make_atari_env(env_id, num_cpu, seed) - if policy == 'cnn': - policy_fn = AcerCnnPolicy - elif policy == 'lstm': - policy_fn = AcerLstmPolicy - else: - print("Policy {} not implemented".format(policy)) - return - learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule) - env.close() - -def main(): - parser = atari_arg_parser() - parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn') - parser.add_argument('--lrschedule', help='Learning rate schedule', 
choices=['constant', 'linear'], default='constant') - parser.add_argument('--logdir', help ='Directory for logging') - args = parser.parse_args() - logger.configure(args.logdir) - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy, lrschedule=args.lrschedule, num_cpu=16) - -if __name__ == '__main__': - main() diff --git a/baselines/acktr/README.md b/baselines/acktr/README.md deleted file mode 100644 index e8a806d273..0000000000 --- a/baselines/acktr/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# ACKTR - -- Original paper: https://arxiv.org/abs/1708.05144 -- Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ -- `python -m baselines.acktr.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. \ No newline at end of file diff --git a/baselines/acktr/acktr_cont.py b/baselines/acktr/acktr_cont.py deleted file mode 100644 index 45f2fa29fa..0000000000 --- a/baselines/acktr/acktr_cont.py +++ /dev/null @@ -1,142 +0,0 @@ -import numpy as np -import tensorflow as tf -from baselines import logger -import baselines.common as common -from baselines.common import tf_util as U -from baselines.acktr import kfac -from baselines.common.filters import ZFilter - -def pathlength(path): - return path["reward"].shape[0]# Loss function that we'll differentiate to get the policy gradient - -def rollout(env, policy, max_pathlength, animate=False, obfilter=None): - """ - Simulate the env and policy for max_pathlength steps - """ - ob = env.reset() - prev_ob = np.float32(np.zeros(ob.shape)) - if obfilter: ob = obfilter(ob) - terminated = False - - obs = [] - acs = [] - ac_dists = [] - logps = [] - rewards = [] - for _ in range(max_pathlength): - if animate: - env.render() - state = np.concatenate([ob, prev_ob], -1) - obs.append(state) - ac, ac_dist, logp = policy.act(state) - acs.append(ac) - ac_dists.append(ac_dist) - logps.append(logp) - prev_ob = np.copy(ob) - scaled_ac = 
env.action_space.low + (ac + 1.) * 0.5 * (env.action_space.high - env.action_space.low) - scaled_ac = np.clip(scaled_ac, env.action_space.low, env.action_space.high) - ob, rew, done, _ = env.step(scaled_ac) - if obfilter: ob = obfilter(ob) - rewards.append(rew) - if done: - terminated = True - break - return {"observation" : np.array(obs), "terminated" : terminated, - "reward" : np.array(rewards), "action" : np.array(acs), - "action_dist": np.array(ac_dists), "logp" : np.array(logps)} - -def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, - animate=False, callback=None, desired_kl=0.002): - - obfilter = ZFilter(env.observation_space.shape) - - max_pathlength = env.spec.timestep_limit - stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') - inputs, loss, loss_sampled = policy.update_info - optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ - epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, - weight_decay_dict=policy.wd_dict, max_grad_norm=None) - pi_var_list = [] - for var in tf.trainable_variables(): - if "pi" in var.name: - pi_var_list.append(var) - - update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) - do_update = U.function(inputs, update_op) - U.initialize() - - # start queue runners - enqueue_threads = [] - coord = tf.train.Coordinator() - for qr in [q_runner, vf.q_runner]: - assert (qr != None) - enqueue_threads.extend(qr.create_threads(tf.get_default_session(), coord=coord, start=True)) - - i = 0 - timesteps_so_far = 0 - while True: - if timesteps_so_far > num_timesteps: - break - logger.log("********** Iteration %i ************"%i) - - # Collect paths until we have enough timesteps - timesteps_this_batch = 0 - paths = [] - while True: - path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter) - paths.append(path) - n = pathlength(path) - timesteps_this_batch 
+= n - timesteps_so_far += n - if timesteps_this_batch > timesteps_per_batch: - break - - # Estimate advantage function - vtargs = [] - advs = [] - for path in paths: - rew_t = path["reward"] - return_t = common.discount(rew_t, gamma) - vtargs.append(return_t) - vpred_t = vf.predict(path) - vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) - delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1] - adv_t = common.discount(delta_t, gamma * lam) - advs.append(adv_t) - # Update value function - vf.fit(paths, vtargs) - - # Build arrays for policy update - ob_no = np.concatenate([path["observation"] for path in paths]) - action_na = np.concatenate([path["action"] for path in paths]) - oldac_dist = np.concatenate([path["action_dist"] for path in paths]) - adv_n = np.concatenate(advs) - standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) - - # Policy update - do_update(ob_no, action_na, standardized_adv_n) - - min_stepsize = np.float32(1e-8) - max_stepsize = np.float32(1e0) - # Adjust stepsize - kl = policy.compute_kl(ob_no, oldac_dist) - if kl > desired_kl * 2: - logger.log("kl too high") - tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() - elif kl < desired_kl / 2: - logger.log("kl too low") - tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() - else: - logger.log("kl just right!") - - logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) - logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths])) - logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) - logger.record_tabular("KL", kl) - if callback: - callback() - logger.dump_tabular() - i += 1 - - coord.request_stop() - coord.join(enqueue_threads) diff --git a/baselines/acktr/acktr_disc.py b/baselines/acktr/acktr_disc.py deleted file mode 100644 index a8b77b6fd5..0000000000 --- a/baselines/acktr/acktr_disc.py +++ /dev/null @@ -1,155 +0,0 @@ 
-import os.path as osp -import time -import joblib -import numpy as np -import tensorflow as tf -from baselines import logger - -from baselines.common import set_global_seeds, explained_variance - -from baselines.a2c.a2c import Runner -from baselines.a2c.utils import discount_with_dones -from baselines.a2c.utils import Scheduler, find_trainable_variables -from baselines.a2c.utils import cat_entropy, mse -from baselines.acktr import kfac - - -class Model(object): - - def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20, - ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, - kfac_clip=0.001, lrschedule='linear'): - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=nprocs, - inter_op_parallelism_threads=nprocs) - config.gpu_options.allow_growth = True - self.sess = sess = tf.Session(config=config) - nact = ac_space.n - nbatch = nenvs * nsteps - A = tf.placeholder(tf.int32, [nbatch]) - ADV = tf.placeholder(tf.float32, [nbatch]) - R = tf.placeholder(tf.float32, [nbatch]) - PG_LR = tf.placeholder(tf.float32, []) - VF_LR = tf.placeholder(tf.float32, []) - - self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) - self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) - - logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) - self.logits = logits = train_model.pi - - ##training loss - pg_loss = tf.reduce_mean(ADV*logpac) - entropy = tf.reduce_mean(cat_entropy(train_model.pi)) - pg_loss = pg_loss - ent_coef * entropy - vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) - train_loss = pg_loss + vf_coef * vf_loss - - - ##Fisher loss construction - self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) - sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) - self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - 
tf.stop_gradient(sample_net), 2)) - self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss - - self.params=params = find_trainable_variables("model") - - self.grads_check = grads = tf.gradients(train_loss,params) - - with tf.device('/gpu:0'): - self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\ - momentum=0.9, kfac_update=1, epsilon=0.01,\ - stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm) - - update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) - train_op, q_runner = optim.apply_gradients(list(zip(grads,params))) - self.q_runner = q_runner - self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) - - def train(obs, states, rewards, masks, actions, values): - advs = rewards - values - for step in range(len(obs)): - cur_lr = self.lr.value() - - td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr} - if states is not None: - td_map[train_model.S] = states - td_map[train_model.M] = masks - - policy_loss, value_loss, policy_entropy, _ = sess.run( - [pg_loss, vf_loss, entropy, train_op], - td_map - ) - return policy_loss, value_loss, policy_entropy - - def save(save_path): - ps = sess.run(params) - joblib.dump(ps, save_path) - - def load(load_path): - loaded_params = joblib.load(load_path) - restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) - sess.run(restores) - - - - self.train = train - self.save = save - self.load = load - self.train_model = train_model - self.step_model = step_model - self.step = step_model.step - self.value = step_model.value - self.initial_state = step_model.initial_state - tf.global_variables_initializer().run(session=sess) - -def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20, - ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, - kfac_clip=0.001, save_interval=None, 
lrschedule='linear'): - tf.reset_default_graph() - set_global_seeds(seed) - - nenvs = env.num_envs - ob_space = env.observation_space - ac_space = env.action_space - make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps - =nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef= - vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, - lrschedule=lrschedule) - if save_interval and logger.get_dir(): - import cloudpickle - with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: - fh.write(cloudpickle.dumps(make_model)) - model = make_model() - - runner = Runner(env, model, nsteps=nsteps, gamma=gamma) - nbatch = nenvs*nsteps - tstart = time.time() - coord = tf.train.Coordinator() - enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True) - for update in range(1, total_timesteps//nbatch+1): - obs, states, rewards, masks, actions, values = runner.run() - policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) - model.old_obs = obs - nseconds = time.time()-tstart - fps = int((update*nbatch)/nseconds) - if update % log_interval == 0 or update == 1: - ev = explained_variance(values, rewards) - logger.record_tabular("nupdates", update) - logger.record_tabular("total_timesteps", update*nbatch) - logger.record_tabular("fps", fps) - logger.record_tabular("policy_entropy", float(policy_entropy)) - logger.record_tabular("policy_loss", float(policy_loss)) - logger.record_tabular("value_loss", float(value_loss)) - logger.record_tabular("explained_variance", float(ev)) - logger.dump_tabular() - - if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): - savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update) - print('Saving to', savepath) - model.save(savepath) - coord.request_stop() - coord.join(enqueue_threads) - env.close() diff --git a/baselines/acktr/kfac.py b/baselines/acktr/kfac.py 
deleted file mode 100644 index b4208199dc..0000000000 --- a/baselines/acktr/kfac.py +++ /dev/null @@ -1,926 +0,0 @@ -import tensorflow as tf -import numpy as np -import re -from baselines.acktr.kfac_utils import * -from functools import reduce - -KFAC_OPS = ['MatMul', 'Conv2D', 'BiasAdd'] -KFAC_DEBUG = False - - -class KfacOptimizer(): - - def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None, async=False, async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approxT2=False, use_float64=False, weight_decay_dict={},max_grad_norm=0.5): - self.max_grad_norm = max_grad_norm - self._lr = learning_rate - self._momentum = momentum - self._clip_kl = clip_kl - self._channel_fac = channel_fac - self._kfac_update = kfac_update - self._async = async - self._async_stats = async_stats - self._epsilon = epsilon - self._stats_decay = stats_decay - self._blockdiag_bias = blockdiag_bias - self._approxT2 = approxT2 - self._use_float64 = use_float64 - self._factored_damping = factored_damping - self._cold_iter = cold_iter - if cold_lr == None: - # good heuristics - self._cold_lr = self._lr# * 3. - else: - self._cold_lr = cold_lr - self._stats_accum_iter = stats_accum_iter - self._weight_decay_dict = weight_decay_dict - self._diag_init_coeff = 0. 
- self._full_stats_init = full_stats_init - if not self._full_stats_init: - self._stats_accum_iter = self._cold_iter - - self.sgd_step = tf.Variable(0, name='KFAC/sgd_step', trainable=False) - self.global_step = tf.Variable( - 0, name='KFAC/global_step', trainable=False) - self.cold_step = tf.Variable(0, name='KFAC/cold_step', trainable=False) - self.factor_step = tf.Variable( - 0, name='KFAC/factor_step', trainable=False) - self.stats_step = tf.Variable( - 0, name='KFAC/stats_step', trainable=False) - self.vFv = tf.Variable(0., name='KFAC/vFv', trainable=False) - - self.factors = {} - self.param_vars = [] - self.stats = {} - self.stats_eigen = {} - - def getFactors(self, g, varlist): - graph = tf.get_default_graph() - factorTensors = {} - fpropTensors = [] - bpropTensors = [] - opTypes = [] - fops = [] - - def searchFactors(gradient, graph): - # hard coded search stratergy - bpropOp = gradient.op - bpropOp_name = bpropOp.name - - bTensors = [] - fTensors = [] - - # combining additive gradient, assume they are the same op type and - # indepedent - if 'AddN' in bpropOp_name: - factors = [] - for g in gradient.op.inputs: - factors.append(searchFactors(g, graph)) - op_names = [item['opName'] for item in factors] - # TO-DO: need to check all the attribute of the ops as well - print (gradient.name) - print (op_names) - print (len(np.unique(op_names))) - assert len(np.unique(op_names)) == 1, gradient.name + \ - ' is shared among different computation OPs' - - bTensors = reduce(lambda x, y: x + y, - [item['bpropFactors'] for item in factors]) - if len(factors[0]['fpropFactors']) > 0: - fTensors = reduce( - lambda x, y: x + y, [item['fpropFactors'] for item in factors]) - fpropOp_name = op_names[0] - fpropOp = factors[0]['op'] - else: - fpropOp_name = re.search( - 'gradientsSampled(_[0-9]+|)/(.+?)_grad', bpropOp_name).group(2) - fpropOp = graph.get_operation_by_name(fpropOp_name) - if fpropOp.op_def.name in KFAC_OPS: - # Known OPs - ### - bTensor = [ - i for i in 
bpropOp.inputs if 'gradientsSampled' in i.name][-1] - bTensorShape = fpropOp.outputs[0].get_shape() - if bTensor.get_shape()[0].value == None: - bTensor.set_shape(bTensorShape) - bTensors.append(bTensor) - ### - if fpropOp.op_def.name == 'BiasAdd': - fTensors = [] - else: - fTensors.append( - [i for i in fpropOp.inputs if param.op.name not in i.name][0]) - fpropOp_name = fpropOp.op_def.name - else: - # unknown OPs, block approximation used - bInputsList = [i for i in bpropOp.inputs[ - 0].op.inputs if 'gradientsSampled' in i.name if 'Shape' not in i.name] - if len(bInputsList) > 0: - bTensor = bInputsList[0] - bTensorShape = fpropOp.outputs[0].get_shape() - if len(bTensor.get_shape()) > 0 and bTensor.get_shape()[0].value == None: - bTensor.set_shape(bTensorShape) - bTensors.append(bTensor) - fpropOp_name = opTypes.append('UNK-' + fpropOp.op_def.name) - - return {'opName': fpropOp_name, 'op': fpropOp, 'fpropFactors': fTensors, 'bpropFactors': bTensors} - - for t, param in zip(g, varlist): - if KFAC_DEBUG: - print(('get factor for '+param.name)) - factors = searchFactors(t, graph) - factorTensors[param] = factors - - ######## - # check associated weights and bias for homogeneous coordinate representation - # and check redundent factors - # TO-DO: there may be a bug to detect associate bias and weights for - # forking layer, e.g. in inception models. 
- for param in varlist: - factorTensors[param]['assnWeights'] = None - factorTensors[param]['assnBias'] = None - for param in varlist: - if factorTensors[param]['opName'] == 'BiasAdd': - factorTensors[param]['assnWeights'] = None - for item in varlist: - if len(factorTensors[item]['bpropFactors']) > 0: - if (set(factorTensors[item]['bpropFactors']) == set(factorTensors[param]['bpropFactors'])) and (len(factorTensors[item]['fpropFactors']) > 0): - factorTensors[param]['assnWeights'] = item - factorTensors[item]['assnBias'] = param - factorTensors[param]['bpropFactors'] = factorTensors[ - item]['bpropFactors'] - - ######## - - ######## - # concatenate the additive gradients along the batch dimension, i.e. - # assuming independence structure - for key in ['fpropFactors', 'bpropFactors']: - for i, param in enumerate(varlist): - if len(factorTensors[param][key]) > 0: - if (key + '_concat') not in factorTensors[param]: - name_scope = factorTensors[param][key][0].name.split(':')[ - 0] - with tf.name_scope(name_scope): - factorTensors[param][ - key + '_concat'] = tf.concat(factorTensors[param][key], 0) - else: - factorTensors[param][key + '_concat'] = None - for j, param2 in enumerate(varlist[(i + 1):]): - if (len(factorTensors[param][key]) > 0) and (set(factorTensors[param2][key]) == set(factorTensors[param][key])): - factorTensors[param2][key] = factorTensors[param][key] - factorTensors[param2][ - key + '_concat'] = factorTensors[param][key + '_concat'] - ######## - - if KFAC_DEBUG: - for items in zip(varlist, fpropTensors, bpropTensors, opTypes): - print((items[0].name, factorTensors[item])) - self.factors = factorTensors - return factorTensors - - def getStats(self, factors, varlist): - if len(self.stats) == 0: - # initialize stats variables on CPU because eigen decomp is - # computed on CPU - with tf.device('/cpu'): - tmpStatsCache = {} - - # search for tensor factors and - # use block diag approx for the bias units - for var in varlist: - fpropFactor = 
factors[var]['fpropFactors_concat'] - bpropFactor = factors[var]['bpropFactors_concat'] - opType = factors[var]['opName'] - if opType == 'Conv2D': - Kh = var.get_shape()[0] - Kw = var.get_shape()[1] - C = fpropFactor.get_shape()[-1] - - Oh = bpropFactor.get_shape()[1] - Ow = bpropFactor.get_shape()[2] - if Oh == 1 and Ow == 1 and self._channel_fac: - # factorization along the channels do not support - # homogeneous coordinate - var_assnBias = factors[var]['assnBias'] - if var_assnBias: - factors[var]['assnBias'] = None - factors[var_assnBias]['assnWeights'] = None - ## - - for var in varlist: - fpropFactor = factors[var]['fpropFactors_concat'] - bpropFactor = factors[var]['bpropFactors_concat'] - opType = factors[var]['opName'] - self.stats[var] = {'opName': opType, - 'fprop_concat_stats': [], - 'bprop_concat_stats': [], - 'assnWeights': factors[var]['assnWeights'], - 'assnBias': factors[var]['assnBias'], - } - if fpropFactor is not None: - if fpropFactor not in tmpStatsCache: - if opType == 'Conv2D': - Kh = var.get_shape()[0] - Kw = var.get_shape()[1] - C = fpropFactor.get_shape()[-1] - - Oh = bpropFactor.get_shape()[1] - Ow = bpropFactor.get_shape()[2] - if Oh == 1 and Ow == 1 and self._channel_fac: - # factorization along the channels - # assume independence between input channels and spatial - # 2K-1 x 2K-1 covariance matrix and C x C covariance matrix - # factorization along the channels do not - # support homogeneous coordinate, assnBias - # is always None - fpropFactor2_size = Kh * Kw - slot_fpropFactor_stats2 = tf.Variable(tf.diag(tf.ones( - [fpropFactor2_size])) * self._diag_init_coeff, name='KFAC_STATS/' + fpropFactor.op.name, trainable=False) - self.stats[var]['fprop_concat_stats'].append( - slot_fpropFactor_stats2) - - fpropFactor_size = C - else: - # 2K-1 x 2K-1 x C x C covariance matrix - # assume BHWC - fpropFactor_size = Kh * Kw * C - else: - # D x D covariance matrix - fpropFactor_size = fpropFactor.get_shape()[-1] - - # use homogeneous coordinate 
- if not self._blockdiag_bias and self.stats[var]['assnBias']: - fpropFactor_size += 1 - - slot_fpropFactor_stats = tf.Variable(tf.diag(tf.ones( - [fpropFactor_size])) * self._diag_init_coeff, name='KFAC_STATS/' + fpropFactor.op.name, trainable=False) - self.stats[var]['fprop_concat_stats'].append( - slot_fpropFactor_stats) - if opType != 'Conv2D': - tmpStatsCache[fpropFactor] = self.stats[ - var]['fprop_concat_stats'] - else: - self.stats[var][ - 'fprop_concat_stats'] = tmpStatsCache[fpropFactor] - - if bpropFactor is not None: - # no need to collect backward stats for bias vectors if - # using homogeneous coordinates - if not((not self._blockdiag_bias) and self.stats[var]['assnWeights']): - if bpropFactor not in tmpStatsCache: - slot_bpropFactor_stats = tf.Variable(tf.diag(tf.ones([bpropFactor.get_shape( - )[-1]])) * self._diag_init_coeff, name='KFAC_STATS/' + bpropFactor.op.name, trainable=False) - self.stats[var]['bprop_concat_stats'].append( - slot_bpropFactor_stats) - tmpStatsCache[bpropFactor] = self.stats[ - var]['bprop_concat_stats'] - else: - self.stats[var][ - 'bprop_concat_stats'] = tmpStatsCache[bpropFactor] - - return self.stats - - def compute_and_apply_stats(self, loss_sampled, var_list=None): - varlist = var_list - if varlist is None: - varlist = tf.trainable_variables() - - stats = self.compute_stats(loss_sampled, var_list=varlist) - return self.apply_stats(stats) - - def compute_stats(self, loss_sampled, var_list=None): - varlist = var_list - if varlist is None: - varlist = tf.trainable_variables() - - gs = tf.gradients(loss_sampled, varlist, name='gradientsSampled') - self.gs = gs - factors = self.getFactors(gs, varlist) - stats = self.getStats(factors, varlist) - - updateOps = [] - statsUpdates = {} - statsUpdates_cache = {} - for var in varlist: - opType = factors[var]['opName'] - fops = factors[var]['op'] - fpropFactor = factors[var]['fpropFactors_concat'] - fpropStats_vars = stats[var]['fprop_concat_stats'] - bpropFactor = 
factors[var]['bpropFactors_concat'] - bpropStats_vars = stats[var]['bprop_concat_stats'] - SVD_factors = {} - for stats_var in fpropStats_vars: - stats_var_dim = int(stats_var.get_shape()[0]) - if stats_var not in statsUpdates_cache: - old_fpropFactor = fpropFactor - B = (tf.shape(fpropFactor)[0]) # batch size - if opType == 'Conv2D': - strides = fops.get_attr("strides") - padding = fops.get_attr("padding") - convkernel_size = var.get_shape()[0:3] - - KH = int(convkernel_size[0]) - KW = int(convkernel_size[1]) - C = int(convkernel_size[2]) - flatten_size = int(KH * KW * C) - - Oh = int(bpropFactor.get_shape()[1]) - Ow = int(bpropFactor.get_shape()[2]) - - if Oh == 1 and Ow == 1 and self._channel_fac: - # factorization along the channels - # assume independence among input channels - # factor = B x 1 x 1 x (KH xKW x C) - # patches = B x Oh x Ow x (KH xKW x C) - if len(SVD_factors) == 0: - if KFAC_DEBUG: - print(('approx %s act factor with rank-1 SVD factors' % (var.name))) - # find closest rank-1 approx to the feature map - S, U, V = tf.batch_svd(tf.reshape( - fpropFactor, [-1, KH * KW, C])) - # get rank-1 approx slides - sqrtS1 = tf.expand_dims(tf.sqrt(S[:, 0, 0]), 1) - patches_k = U[:, :, 0] * sqrtS1 # B x KH*KW - full_factor_shape = fpropFactor.get_shape() - patches_k.set_shape( - [full_factor_shape[0], KH * KW]) - patches_c = V[:, :, 0] * sqrtS1 # B x C - patches_c.set_shape([full_factor_shape[0], C]) - SVD_factors[C] = patches_c - SVD_factors[KH * KW] = patches_k - fpropFactor = SVD_factors[stats_var_dim] - - else: - # poor mem usage implementation - patches = tf.extract_image_patches(fpropFactor, ksizes=[1, convkernel_size[ - 0], convkernel_size[1], 1], strides=strides, rates=[1, 1, 1, 1], padding=padding) - - if self._approxT2: - if KFAC_DEBUG: - print(('approxT2 act fisher for %s' % (var.name))) - # T^2 terms * 1/T^2, size: B x C - fpropFactor = tf.reduce_mean(patches, [1, 2]) - else: - # size: (B x Oh x Ow) x C - fpropFactor = tf.reshape( - patches, [-1, 
flatten_size]) / Oh / Ow - fpropFactor_size = int(fpropFactor.get_shape()[-1]) - if stats_var_dim == (fpropFactor_size + 1) and not self._blockdiag_bias: - if opType == 'Conv2D' and not self._approxT2: - # correct padding for numerical stability (we - # divided out OhxOw from activations for T1 approx) - fpropFactor = tf.concat([fpropFactor, tf.ones( - [tf.shape(fpropFactor)[0], 1]) / Oh / Ow], 1) - else: - # use homogeneous coordinates - fpropFactor = tf.concat( - [fpropFactor, tf.ones([tf.shape(fpropFactor)[0], 1])], 1) - - # average over the number of data points in a batch - # divided by B - cov = tf.matmul(fpropFactor, fpropFactor, - transpose_a=True) / tf.cast(B, tf.float32) - updateOps.append(cov) - statsUpdates[stats_var] = cov - if opType != 'Conv2D': - # HACK: for convolution we recompute fprop stats for - # every layer including forking layers - statsUpdates_cache[stats_var] = cov - - for stats_var in bpropStats_vars: - stats_var_dim = int(stats_var.get_shape()[0]) - if stats_var not in statsUpdates_cache: - old_bpropFactor = bpropFactor - bpropFactor_shape = bpropFactor.get_shape() - B = tf.shape(bpropFactor)[0] # batch size - C = int(bpropFactor_shape[-1]) # num channels - if opType == 'Conv2D' or len(bpropFactor_shape) == 4: - if fpropFactor is not None: - if self._approxT2: - if KFAC_DEBUG: - print(('approxT2 grad fisher for %s' % (var.name))) - bpropFactor = tf.reduce_sum( - bpropFactor, [1, 2]) # T^2 terms * 1/T^2 - else: - bpropFactor = tf.reshape( - bpropFactor, [-1, C]) * Oh * Ow # T * 1/T terms - else: - # just doing block diag approx. spatial independent - # structure does not apply here. summing over - # spatial locations - if KFAC_DEBUG: - print(('block diag approx fisher for %s' % (var.name))) - bpropFactor = tf.reduce_sum(bpropFactor, [1, 2]) - - # assume sampled loss is averaged. 
TO-DO:figure out better - # way to handle this - bpropFactor *= tf.to_float(B) - ## - - cov_b = tf.matmul( - bpropFactor, bpropFactor, transpose_a=True) / tf.to_float(tf.shape(bpropFactor)[0]) - - updateOps.append(cov_b) - statsUpdates[stats_var] = cov_b - statsUpdates_cache[stats_var] = cov_b - - if KFAC_DEBUG: - aKey = list(statsUpdates.keys())[0] - statsUpdates[aKey] = tf.Print(statsUpdates[aKey], - [tf.convert_to_tensor('step:'), - self.global_step, - tf.convert_to_tensor( - 'computing stats'), - ]) - self.statsUpdates = statsUpdates - return statsUpdates - - def apply_stats(self, statsUpdates): - """ compute stats and update/apply the new stats to the running average - """ - - def updateAccumStats(): - if self._full_stats_init: - return tf.cond(tf.greater(self.sgd_step, self._cold_iter), lambda: tf.group(*self._apply_stats(statsUpdates, accumulate=True, accumulateCoeff=1. / self._stats_accum_iter)), tf.no_op) - else: - return tf.group(*self._apply_stats(statsUpdates, accumulate=True, accumulateCoeff=1. 
/ self._stats_accum_iter)) - - def updateRunningAvgStats(statsUpdates, fac_iter=1): - # return tf.cond(tf.greater_equal(self.factor_step, - # tf.convert_to_tensor(fac_iter)), lambda: - # tf.group(*self._apply_stats(stats_list, varlist)), tf.no_op) - return tf.group(*self._apply_stats(statsUpdates)) - - if self._async_stats: - # asynchronous stats update - update_stats = self._apply_stats(statsUpdates) - - queue = tf.FIFOQueue(1, [item.dtype for item in update_stats], shapes=[ - item.get_shape() for item in update_stats]) - enqueue_op = queue.enqueue(update_stats) - - def dequeue_stats_op(): - return queue.dequeue() - self.qr_stats = tf.train.QueueRunner(queue, [enqueue_op]) - update_stats_op = tf.cond(tf.equal(queue.size(), tf.convert_to_tensor( - 0)), tf.no_op, lambda: tf.group(*[dequeue_stats_op(), ])) - else: - # synchronous stats update - update_stats_op = tf.cond(tf.greater_equal( - self.stats_step, self._stats_accum_iter), lambda: updateRunningAvgStats(statsUpdates), updateAccumStats) - self._update_stats_op = update_stats_op - return update_stats_op - - def _apply_stats(self, statsUpdates, accumulate=False, accumulateCoeff=0.): - updateOps = [] - # obtain the stats var list - for stats_var in statsUpdates: - stats_new = statsUpdates[stats_var] - if accumulate: - # simple superbatch averaging - update_op = tf.assign_add( - stats_var, accumulateCoeff * stats_new, use_locking=True) - else: - # exponential running averaging - update_op = tf.assign( - stats_var, stats_var * self._stats_decay, use_locking=True) - update_op = tf.assign_add( - update_op, (1. 
- self._stats_decay) * stats_new, use_locking=True) - updateOps.append(update_op) - - with tf.control_dependencies(updateOps): - stats_step_op = tf.assign_add(self.stats_step, 1) - - if KFAC_DEBUG: - stats_step_op = (tf.Print(stats_step_op, - [tf.convert_to_tensor('step:'), - self.global_step, - tf.convert_to_tensor('fac step:'), - self.factor_step, - tf.convert_to_tensor('sgd step:'), - self.sgd_step, - tf.convert_to_tensor('Accum:'), - tf.convert_to_tensor(accumulate), - tf.convert_to_tensor('Accum coeff:'), - tf.convert_to_tensor(accumulateCoeff), - tf.convert_to_tensor('stat step:'), - self.stats_step, updateOps[0], updateOps[1]])) - return [stats_step_op, ] - - def getStatsEigen(self, stats=None): - if len(self.stats_eigen) == 0: - stats_eigen = {} - if stats is None: - stats = self.stats - - tmpEigenCache = {} - with tf.device('/cpu:0'): - for var in stats: - for key in ['fprop_concat_stats', 'bprop_concat_stats']: - for stats_var in stats[var][key]: - if stats_var not in tmpEigenCache: - stats_dim = stats_var.get_shape()[1].value - e = tf.Variable(tf.ones( - [stats_dim]), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/e', trainable=False) - Q = tf.Variable(tf.diag(tf.ones( - [stats_dim])), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/Q', trainable=False) - stats_eigen[stats_var] = {'e': e, 'Q': Q} - tmpEigenCache[ - stats_var] = stats_eigen[stats_var] - else: - stats_eigen[stats_var] = tmpEigenCache[ - stats_var] - self.stats_eigen = stats_eigen - return self.stats_eigen - - def computeStatsEigen(self): - """ compute the eigen decomp using copied var stats to avoid concurrent read/write from other queue """ - # TO-DO: figure out why this op has delays (possibly moving - # eigenvectors around?) 
- with tf.device('/cpu:0'): - def removeNone(tensor_list): - local_list = [] - for item in tensor_list: - if item is not None: - local_list.append(item) - return local_list - - def copyStats(var_list): - print("copying stats to buffer tensors before eigen decomp") - redundant_stats = {} - copied_list = [] - for item in var_list: - if item is not None: - if item not in redundant_stats: - if self._use_float64: - redundant_stats[item] = tf.cast( - tf.identity(item), tf.float64) - else: - redundant_stats[item] = tf.identity(item) - copied_list.append(redundant_stats[item]) - else: - copied_list.append(None) - return copied_list - #stats = [copyStats(self.fStats), copyStats(self.bStats)] - #stats = [self.fStats, self.bStats] - - stats_eigen = self.stats_eigen - computedEigen = {} - eigen_reverse_lookup = {} - updateOps = [] - # sync copied stats - # with tf.control_dependencies(removeNone(stats[0]) + - # removeNone(stats[1])): - with tf.control_dependencies([]): - for stats_var in stats_eigen: - if stats_var not in computedEigen: - eigens = tf.self_adjoint_eig(stats_var) - e = eigens[0] - Q = eigens[1] - if self._use_float64: - e = tf.cast(e, tf.float32) - Q = tf.cast(Q, tf.float32) - updateOps.append(e) - updateOps.append(Q) - computedEigen[stats_var] = {'e': e, 'Q': Q} - eigen_reverse_lookup[e] = stats_eigen[stats_var]['e'] - eigen_reverse_lookup[Q] = stats_eigen[stats_var]['Q'] - - self.eigen_reverse_lookup = eigen_reverse_lookup - self.eigen_update_list = updateOps - - if KFAC_DEBUG: - self.eigen_update_list = [item for item in updateOps] - with tf.control_dependencies(updateOps): - updateOps.append(tf.Print(tf.constant( - 0.), [tf.convert_to_tensor('computed factor eigen')])) - - return updateOps - - def applyStatsEigen(self, eigen_list): - updateOps = [] - print(('updating %d eigenvalue/vectors' % len(eigen_list))) - for i, (tensor, mark) in enumerate(zip(eigen_list, self.eigen_update_list)): - stats_eigen_var = self.eigen_reverse_lookup[mark] - updateOps.append( 
- tf.assign(stats_eigen_var, tensor, use_locking=True)) - - with tf.control_dependencies(updateOps): - factor_step_op = tf.assign_add(self.factor_step, 1) - updateOps.append(factor_step_op) - if KFAC_DEBUG: - updateOps.append(tf.Print(tf.constant( - 0.), [tf.convert_to_tensor('updated kfac factors')])) - return updateOps - - def getKfacPrecondUpdates(self, gradlist, varlist): - updatelist = [] - vg = 0. - - assert len(self.stats) > 0 - assert len(self.stats_eigen) > 0 - assert len(self.factors) > 0 - counter = 0 - - grad_dict = {var: grad for grad, var in zip(gradlist, varlist)} - - for grad, var in zip(gradlist, varlist): - GRAD_RESHAPE = False - GRAD_TRANSPOSE = False - - fpropFactoredFishers = self.stats[var]['fprop_concat_stats'] - bpropFactoredFishers = self.stats[var]['bprop_concat_stats'] - - if (len(fpropFactoredFishers) + len(bpropFactoredFishers)) > 0: - counter += 1 - GRAD_SHAPE = grad.get_shape() - if len(grad.get_shape()) > 2: - # reshape conv kernel parameters - KW = int(grad.get_shape()[0]) - KH = int(grad.get_shape()[1]) - C = int(grad.get_shape()[2]) - D = int(grad.get_shape()[3]) - - if len(fpropFactoredFishers) > 1 and self._channel_fac: - # reshape conv kernel parameters into tensor - grad = tf.reshape(grad, [KW * KH, C, D]) - else: - # reshape conv kernel parameters into 2D grad - grad = tf.reshape(grad, [-1, D]) - GRAD_RESHAPE = True - elif len(grad.get_shape()) == 1: - # reshape bias or 1D parameters - D = int(grad.get_shape()[0]) - - grad = tf.expand_dims(grad, 0) - GRAD_RESHAPE = True - else: - # 2D parameters - C = int(grad.get_shape()[0]) - D = int(grad.get_shape()[1]) - - if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias: - # use homogeneous coordinates only works for 2D grad. 
- # TO-DO: figure out how to factorize bias grad - # stack bias grad - var_assnBias = self.stats[var]['assnBias'] - grad = tf.concat( - [grad, tf.expand_dims(grad_dict[var_assnBias], 0)], 0) - - # project gradient to eigen space and reshape the eigenvalues - # for broadcasting - eigVals = [] - - for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']): - Q = self.stats_eigen[stats]['Q'] - e = detectMinVal(self.stats_eigen[stats][ - 'e'], var, name='act', debug=KFAC_DEBUG) - - Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='act') - eigVals.append(e) - grad = gmatmul(Q, grad, transpose_a=True, reduce_dim=idx) - - for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']): - Q = self.stats_eigen[stats]['Q'] - e = detectMinVal(self.stats_eigen[stats][ - 'e'], var, name='grad', debug=KFAC_DEBUG) - - Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='grad') - eigVals.append(e) - grad = gmatmul(grad, Q, transpose_b=False, reduce_dim=idx) - ## - - ##### - # whiten using eigenvalues - weightDecayCoeff = 0. - if var in self._weight_decay_dict: - weightDecayCoeff = self._weight_decay_dict[var] - if KFAC_DEBUG: - print(('weight decay coeff for %s is %f' % (var.name, weightDecayCoeff))) - - if self._factored_damping: - if KFAC_DEBUG: - print(('use factored damping for %s' % (var.name))) - coeffs = 1. - num_factors = len(eigVals) - # compute the ratio of two trace norm of the left and right - # KFac matrices, and their generalization - if len(eigVals) == 1: - damping = self._epsilon + weightDecayCoeff - else: - damping = tf.pow( - self._epsilon + weightDecayCoeff, 1. / num_factors) - eigVals_tnorm_avg = [tf.reduce_mean( - tf.abs(e)) for e in eigVals] - for e, e_tnorm in zip(eigVals, eigVals_tnorm_avg): - eig_tnorm_negList = [ - item for item in eigVals_tnorm_avg if item != e_tnorm] - if len(eigVals) == 1: - adjustment = 1. 
- elif len(eigVals) == 2: - adjustment = tf.sqrt( - e_tnorm / eig_tnorm_negList[0]) - else: - eig_tnorm_negList_prod = reduce( - lambda x, y: x * y, eig_tnorm_negList) - adjustment = tf.pow( - tf.pow(e_tnorm, num_factors - 1.) / eig_tnorm_negList_prod, 1. / num_factors) - coeffs *= (e + adjustment * damping) - else: - coeffs = 1. - damping = (self._epsilon + weightDecayCoeff) - for e in eigVals: - coeffs *= e - coeffs += damping - - #grad = tf.Print(grad, [tf.convert_to_tensor('1'), tf.convert_to_tensor(var.name), grad.get_shape()]) - - grad /= coeffs - - #grad = tf.Print(grad, [tf.convert_to_tensor('2'), tf.convert_to_tensor(var.name), grad.get_shape()]) - ##### - # project gradient back to euclidean space - for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']): - Q = self.stats_eigen[stats]['Q'] - grad = gmatmul(Q, grad, transpose_a=False, reduce_dim=idx) - - for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']): - Q = self.stats_eigen[stats]['Q'] - grad = gmatmul(grad, Q, transpose_b=True, reduce_dim=idx) - ## - - #grad = tf.Print(grad, [tf.convert_to_tensor('3'), tf.convert_to_tensor(var.name), grad.get_shape()]) - if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias: - # use homogeneous coordinates only works for 2D grad. 
- # TO-DO: figure out how to factorize bias grad - # un-stack bias grad - var_assnBias = self.stats[var]['assnBias'] - C_plus_one = int(grad.get_shape()[0]) - grad_assnBias = tf.reshape(tf.slice(grad, - begin=[ - C_plus_one - 1, 0], - size=[1, -1]), var_assnBias.get_shape()) - grad_assnWeights = tf.slice(grad, - begin=[0, 0], - size=[C_plus_one - 1, -1]) - grad_dict[var_assnBias] = grad_assnBias - grad = grad_assnWeights - - #grad = tf.Print(grad, [tf.convert_to_tensor('4'), tf.convert_to_tensor(var.name), grad.get_shape()]) - if GRAD_RESHAPE: - grad = tf.reshape(grad, GRAD_SHAPE) - - grad_dict[var] = grad - - print(('projecting %d gradient matrices' % counter)) - - for g, var in zip(gradlist, varlist): - grad = grad_dict[var] - ### clipping ### - if KFAC_DEBUG: - print(('apply clipping to %s' % (var.name))) - tf.Print(grad, [tf.sqrt(tf.reduce_sum(tf.pow(grad, 2)))], "Euclidean norm of new grad") - local_vg = tf.reduce_sum(grad * g * (self._lr * self._lr)) - vg += local_vg - - # recale everything - if KFAC_DEBUG: - print('apply vFv clipping') - - scaling = tf.minimum(1., tf.sqrt(self._clip_kl / vg)) - if KFAC_DEBUG: - scaling = tf.Print(scaling, [tf.convert_to_tensor( - 'clip: '), scaling, tf.convert_to_tensor(' vFv: '), vg]) - with tf.control_dependencies([tf.assign(self.vFv, vg)]): - updatelist = [grad_dict[var] for var in varlist] - for i, item in enumerate(updatelist): - updatelist[i] = scaling * item - - return updatelist - - def compute_gradients(self, loss, var_list=None): - varlist = var_list - if varlist is None: - varlist = tf.trainable_variables() - g = tf.gradients(loss, varlist) - - return [(a, b) for a, b in zip(g, varlist)] - - def apply_gradients_kfac(self, grads): - g, varlist = list(zip(*grads)) - - if len(self.stats_eigen) == 0: - self.getStatsEigen() - - qr = None - # launch eigen-decomp on a queue thread - if self._async: - print('Use async eigen decomp') - # get a list of factor loading tensors - factorOps_dummy = self.computeStatsEigen() - - 
# define a queue for the list of factor loading tensors - queue = tf.FIFOQueue(1, [item.dtype for item in factorOps_dummy], shapes=[ - item.get_shape() for item in factorOps_dummy]) - enqueue_op = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), tf.convert_to_tensor( - 0)), tf.greater_equal(self.stats_step, self._stats_accum_iter)), lambda: queue.enqueue(self.computeStatsEigen()), tf.no_op) - - def dequeue_op(): - return queue.dequeue() - - qr = tf.train.QueueRunner(queue, [enqueue_op]) - - updateOps = [] - global_step_op = tf.assign_add(self.global_step, 1) - updateOps.append(global_step_op) - - with tf.control_dependencies([global_step_op]): - - # compute updates - assert self._update_stats_op != None - updateOps.append(self._update_stats_op) - dependency_list = [] - if not self._async: - dependency_list.append(self._update_stats_op) - - with tf.control_dependencies(dependency_list): - def no_op_wrapper(): - return tf.group(*[tf.assign_add(self.cold_step, 1)]) - - if not self._async: - # synchronous eigen-decomp updates - updateFactorOps = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), - tf.convert_to_tensor(0)), - tf.greater_equal(self.stats_step, self._stats_accum_iter)), lambda: tf.group(*self.applyStatsEigen(self.computeStatsEigen())), no_op_wrapper) - else: - # asynchronous eigen-decomp updates using queue - updateFactorOps = tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), - lambda: tf.cond(tf.equal(queue.size(), tf.convert_to_tensor(0)), - tf.no_op, - - lambda: tf.group( - *self.applyStatsEigen(dequeue_op())), - ), - no_op_wrapper) - - updateOps.append(updateFactorOps) - - with tf.control_dependencies([updateFactorOps]): - def gradOp(): - return list(g) - - def getKfacGradOp(): - return self.getKfacPrecondUpdates(g, varlist) - u = tf.cond(tf.greater(self.factor_step, - tf.convert_to_tensor(0)), getKfacGradOp, gradOp) - - optim = tf.train.MomentumOptimizer( - self._lr * (1. 
- self._momentum), self._momentum) - #optim = tf.train.AdamOptimizer(self._lr, epsilon=0.01) - - def optimOp(): - def updateOptimOp(): - if self._full_stats_init: - return tf.cond(tf.greater(self.factor_step, tf.convert_to_tensor(0)), lambda: optim.apply_gradients(list(zip(u, varlist))), tf.no_op) - else: - return optim.apply_gradients(list(zip(u, varlist))) - if self._full_stats_init: - return tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), updateOptimOp, tf.no_op) - else: - return tf.cond(tf.greater_equal(self.sgd_step, self._cold_iter), updateOptimOp, tf.no_op) - updateOps.append(optimOp()) - - return tf.group(*updateOps), qr - - def apply_gradients(self, grads): - coldOptim = tf.train.MomentumOptimizer( - self._cold_lr, self._momentum) - - def coldSGDstart(): - sgd_grads, sgd_var = zip(*grads) - - if self.max_grad_norm != None: - sgd_grads, sgd_grad_norm = tf.clip_by_global_norm(sgd_grads,self.max_grad_norm) - - sgd_grads = list(zip(sgd_grads,sgd_var)) - - sgd_step_op = tf.assign_add(self.sgd_step, 1) - coldOptim_op = coldOptim.apply_gradients(sgd_grads) - if KFAC_DEBUG: - with tf.control_dependencies([sgd_step_op, coldOptim_op]): - sgd_step_op = tf.Print( - sgd_step_op, [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')]) - return tf.group(*[sgd_step_op, coldOptim_op]) - - kfacOptim_op, qr = self.apply_gradients_kfac(grads) - - def warmKFACstart(): - return kfacOptim_op - - return tf.cond(tf.greater(self.sgd_step, self._cold_iter), warmKFACstart, coldSGDstart), qr - - def minimize(self, loss, loss_sampled, var_list=None): - grads = self.compute_gradients(loss, var_list=var_list) - update_stats_op = self.compute_and_apply_stats( - loss_sampled, var_list=var_list) - return self.apply_gradients(grads) diff --git a/baselines/acktr/kfac_utils.py b/baselines/acktr/kfac_utils.py deleted file mode 100644 index edc623d737..0000000000 --- a/baselines/acktr/kfac_utils.py +++ /dev/null @@ -1,86 +0,0 @@ -import tensorflow as tf - -def gmatmul(a, 
b, transpose_a=False, transpose_b=False, reduce_dim=None): - assert reduce_dim is not None - - # weird batch matmul - if len(a.get_shape()) == 2 and len(b.get_shape()) > 2: - # reshape reduce_dim to the left most dim in b - b_shape = b.get_shape() - if reduce_dim != 0: - b_dims = list(range(len(b_shape))) - b_dims.remove(reduce_dim) - b_dims.insert(0, reduce_dim) - b = tf.transpose(b, b_dims) - b_t_shape = b.get_shape() - b = tf.reshape(b, [int(b_shape[reduce_dim]), -1]) - result = tf.matmul(a, b, transpose_a=transpose_a, - transpose_b=transpose_b) - result = tf.reshape(result, b_t_shape) - if reduce_dim != 0: - b_dims = list(range(len(b_shape))) - b_dims.remove(0) - b_dims.insert(reduce_dim, 0) - result = tf.transpose(result, b_dims) - return result - - elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2: - # reshape reduce_dim to the right most dim in a - a_shape = a.get_shape() - outter_dim = len(a_shape) - 1 - reduce_dim = len(a_shape) - reduce_dim - 1 - if reduce_dim != outter_dim: - a_dims = list(range(len(a_shape))) - a_dims.remove(reduce_dim) - a_dims.insert(outter_dim, reduce_dim) - a = tf.transpose(a, a_dims) - a_t_shape = a.get_shape() - a = tf.reshape(a, [-1, int(a_shape[reduce_dim])]) - result = tf.matmul(a, b, transpose_a=transpose_a, - transpose_b=transpose_b) - result = tf.reshape(result, a_t_shape) - if reduce_dim != outter_dim: - a_dims = list(range(len(a_shape))) - a_dims.remove(outter_dim) - a_dims.insert(reduce_dim, outter_dim) - result = tf.transpose(result, a_dims) - return result - - elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2: - return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) - - assert False, 'something went wrong' - - -def clipoutNeg(vec, threshold=1e-6): - mask = tf.cast(vec > threshold, tf.float32) - return mask * vec - - -def detectMinVal(input_mat, var, threshold=1e-6, name='', debug=False): - eigen_min = tf.reduce_min(input_mat) - eigen_max = tf.reduce_max(input_mat) - eigen_ratio = eigen_max 
/ eigen_min - input_mat_clipped = clipoutNeg(input_mat, threshold) - - if debug: - input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), lambda: input_mat_clipped, lambda: tf.Print( - input_mat_clipped, [tf.convert_to_tensor('screwed ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), eigen_min, eigen_max, eigen_ratio])) - - return input_mat_clipped - - -def factorReshape(Q, e, grad, facIndx=0, ftype='act'): - grad_shape = grad.get_shape() - if ftype == 'act': - assert e.get_shape()[0] == grad_shape[facIndx] - expanded_shape = [1, ] * len(grad_shape) - expanded_shape[facIndx] = -1 - e = tf.reshape(e, expanded_shape) - if ftype == 'grad': - assert e.get_shape()[0] == grad_shape[len(grad_shape) - facIndx - 1] - expanded_shape = [1, ] * len(grad_shape) - expanded_shape[len(grad_shape) - facIndx - 1] = -1 - e = tf.reshape(e, expanded_shape) - - return Q, e diff --git a/baselines/acktr/policies.py b/baselines/acktr/policies.py deleted file mode 100644 index 39bb6cbe6d..0000000000 --- a/baselines/acktr/policies.py +++ /dev/null @@ -1,42 +0,0 @@ -import numpy as np -import tensorflow as tf -from baselines.acktr.utils import dense, kl_div -import baselines.common.tf_util as U - -class GaussianMlpPolicy(object): - def __init__(self, ob_dim, ac_dim): - # Here we'll construct a bunch of expressions, which will be used in two places: - # (1) When sampling actions - # (2) When computing loss functions, for the policy update - # Variables specific to (1) have the word "sampled" in them, - # whereas variables specific to (2) have the word "old" in them - ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations - oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions - oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of actions previous action distributions - adv_n = 
tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate - wd_dict = {} - h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) - h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) - mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output - self.wd_dict = wd_dict - self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs - logstd_1a = tf.expand_dims(logstd_1a, 0) - std_1a = tf.exp(logstd_1a) - std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1]) - ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1) - sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform. - logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action - logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy) - kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim)) - #kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n - surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient - surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy - self._act = 
U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob - #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy - self.compute_kl = U.function([ob_no, oldac_dist], kl) - self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss - U.initialize() # Initialize uninitialized TF variables - - def act(self, ob): - ac, ac_dist, logp = self._act(ob[None]) - return ac[0], ac_dist[0], logp[0] diff --git a/baselines/acktr/run_atari.py b/baselines/acktr/run_atari.py deleted file mode 100644 index 6e398ce25d..0000000000 --- a/baselines/acktr/run_atari.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 - -from functools import partial - -from baselines import logger -from baselines.acktr.acktr_disc import learn -from baselines.common.cmd_util import make_atari_env, atari_arg_parser -from baselines.common.vec_env.vec_frame_stack import VecFrameStack -from baselines.ppo2.policies import CnnPolicy - -def train(env_id, num_timesteps, seed, num_cpu): - env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4) - policy_fn = partial(CnnPolicy, one_dim_bias=True) - learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu) - env.close() - -def main(): - args = atari_arg_parser().parse_args() - logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32) - -if __name__ == '__main__': - main() diff --git a/baselines/acktr/run_mujoco.py b/baselines/acktr/run_mujoco.py deleted file mode 100644 index 9065d58807..0000000000 --- a/baselines/acktr/run_mujoco.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env python3 - -import tensorflow as tf -from baselines import logger -from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser -from baselines.acktr.acktr_cont import learn -from baselines.acktr.policies import 
GaussianMlpPolicy -from baselines.acktr.value_functions import NeuralNetValueFunction - -def train(env_id, num_timesteps, seed): - env = make_mujoco_env(env_id, seed) - - with tf.Session(config=tf.ConfigProto()): - ob_dim = env.observation_space.shape[0] - ac_dim = env.action_space.shape[0] - with tf.variable_scope("vf"): - vf = NeuralNetValueFunction(ob_dim, ac_dim) - with tf.variable_scope("pi"): - policy = GaussianMlpPolicy(ob_dim, ac_dim) - - learn(env, policy=policy, vf=vf, - gamma=0.99, lam=0.97, timesteps_per_batch=2500, - desired_kl=0.002, - num_timesteps=num_timesteps, animate=False) - - env.close() - -def main(): - args = mujoco_arg_parser().parse_args() - logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - -if __name__ == "__main__": - main() diff --git a/baselines/acktr/utils.py b/baselines/acktr/utils.py deleted file mode 100644 index 227350fe5e..0000000000 --- a/baselines/acktr/utils.py +++ /dev/null @@ -1,28 +0,0 @@ -import tensorflow as tf - -def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None): - with tf.variable_scope(name, reuse=reuse): - assert (len(tf.get_variable_scope().name.split('/')) == 2) - - w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init) - b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init)) - weight_decay_fc = 3e-4 - - if weight_loss_dict is not None: - weight_decay = tf.multiply(tf.nn.l2_loss(w), weight_decay_fc, name='weight_decay_loss') - if weight_loss_dict is not None: - weight_loss_dict[w] = weight_decay_fc - weight_loss_dict[b] = 0.0 - - tf.add_to_collection(tf.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay) - - return tf.nn.bias_add(tf.matmul(x, w), b) - -def kl_div(action_dist1, action_dist2, action_size): - mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:] - mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:] - - numerator = 
tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2) - denominator = 2 * tf.square(std2) + 1e-8 - return tf.reduce_sum( - numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1) diff --git a/baselines/acktr/value_functions.py b/baselines/acktr/value_functions.py deleted file mode 100644 index d1e9e1a361..0000000000 --- a/baselines/acktr/value_functions.py +++ /dev/null @@ -1,50 +0,0 @@ -from baselines import logger -import numpy as np -import baselines.common as common -from baselines.common import tf_util as U -import tensorflow as tf -from baselines.acktr import kfac -from baselines.acktr.utils import dense - -class NeuralNetValueFunction(object): - def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613 - X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations - vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg') - wd_dict = {} - h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) - h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) - vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0] - sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n)) - wd_loss = tf.get_collection("vf_losses", None) - loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss) - loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n))) - self._predict = U.function([X], vpred_n) - optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \ - clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \ - async=1, kfac_update=2, cold_iter=50, \ - weight_decay_dict=wd_dict, max_grad_norm=None) - vf_var_list = [] - for var in tf.trainable_variables(): - if "vf" in var.name: - vf_var_list.append(var) - - update_op, self.q_runner = optim.minimize(loss, loss_sampled, 
var_list=vf_var_list) - self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101 - U.initialize() # Initialize uninitialized TF variables - def _preproc(self, path): - l = pathlength(path) - al = np.arange(l).reshape(-1,1)/10.0 - act = path["action_dist"].astype('float32') - X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1) - return X - def predict(self, path): - return self._predict(self._preproc(path)) - def fit(self, paths, targvals): - X = np.concatenate([self._preproc(p) for p in paths]) - y = np.concatenate(targvals) - logger.record_tabular("EVBefore", common.explained_variance(self._predict(X), y)) - for _ in range(25): self.do_update(X, y) - logger.record_tabular("EVAfter", common.explained_variance(self._predict(X), y)) - -def pathlength(path): - return path["reward"].shape[0] diff --git a/baselines/bench/__init__.py b/baselines/bench/__init__.py deleted file mode 100644 index 4fd3874b39..0000000000 --- a/baselines/bench/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from baselines.bench.benchmarks import * -from baselines.bench.monitor import * \ No newline at end of file diff --git a/baselines/bench/monitor.py b/baselines/bench/monitor.py deleted file mode 100644 index 0da1b4f878..0000000000 --- a/baselines/bench/monitor.py +++ /dev/null @@ -1,161 +0,0 @@ -__all__ = ['Monitor', 'get_monitor_files', 'load_results'] - -import gym -from gym.core import Wrapper -import time -from glob import glob -import csv -import os.path as osp -import json -import numpy as np - -class Monitor(Wrapper): - EXT = "monitor.csv" - f = None - - def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()): - Wrapper.__init__(self, env=env) - self.tstart = time.time() - if filename is None: - self.f = None - self.logger = None - else: - if not filename.endswith(Monitor.EXT): - if osp.isdir(filename): - filename = osp.join(filename, Monitor.EXT) - else: - filename = filename + "." 
+ Monitor.EXT - self.f = open(filename, "wt") - self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, 'env_id' : env.spec and env.spec.id})) - self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords+info_keywords) - self.logger.writeheader() - self.f.flush() - - self.reset_keywords = reset_keywords - self.info_keywords = info_keywords - self.allow_early_resets = allow_early_resets - self.rewards = None - self.needs_reset = True - self.episode_rewards = [] - self.episode_lengths = [] - self.episode_times = [] - self.total_steps = 0 - self.current_reset_info = {} # extra info about the current episode, that was passed in during reset() - - def reset(self, **kwargs): - if not self.allow_early_resets and not self.needs_reset: - raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)") - self.rewards = [] - self.needs_reset = False - for k in self.reset_keywords: - v = kwargs.get(k) - if v is None: - raise ValueError('Expected you to pass kwarg %s into reset'%k) - self.current_reset_info[k] = v - return self.env.reset(**kwargs) - - def step(self, action): - if self.needs_reset: - raise RuntimeError("Tried to step environment that needs reset") - ob, rew, done, info = self.env.step(action) - self.rewards.append(rew) - if done: - self.needs_reset = True - eprew = sum(self.rewards) - eplen = len(self.rewards) - epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)} - for k in self.info_keywords: - epinfo[k] = info[k] - self.episode_rewards.append(eprew) - self.episode_lengths.append(eplen) - self.episode_times.append(time.time() - self.tstart) - epinfo.update(self.current_reset_info) - if self.logger: - self.logger.writerow(epinfo) - self.f.flush() - info['episode'] = epinfo - self.total_steps += 1 - return (ob, rew, done, info) - - def close(self): - if self.f is not None: - self.f.close() - - def 
get_total_steps(self): - return self.total_steps - - def get_episode_rewards(self): - return self.episode_rewards - - def get_episode_lengths(self): - return self.episode_lengths - - def get_episode_times(self): - return self.episode_times - -class LoadMonitorResultsError(Exception): - pass - -def get_monitor_files(dir): - return glob(osp.join(dir, "*" + Monitor.EXT)) - -def load_results(dir): - import pandas - monitor_files = ( - glob(osp.join(dir, "*monitor.json")) + - glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files - if not monitor_files: - raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir)) - dfs = [] - headers = [] - for fname in monitor_files: - with open(fname, 'rt') as fh: - if fname.endswith('csv'): - firstline = fh.readline() - assert firstline[0] == '#' - header = json.loads(firstline[1:]) - df = pandas.read_csv(fh, index_col=None) - headers.append(header) - elif fname.endswith('json'): # Deprecated json format - episodes = [] - lines = fh.readlines() - header = json.loads(lines[0]) - headers.append(header) - for line in lines[1:]: - episode = json.loads(line) - episodes.append(episode) - df = pandas.DataFrame(episodes) - else: - assert 0, 'unreachable' - df['t'] += header['t_start'] - dfs.append(df) - df = pandas.concat(dfs) - df.sort_values('t', inplace=True) - df.reset_index(inplace=True) - df['t'] -= min(header['t_start'] for header in headers) - df.headers = headers # HACK to preserve backwards compatibility - return df - -def test_monitor(): - env = gym.make("CartPole-v1") - env.seed(0) - mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4() - menv = Monitor(env, mon_file) - menv.reset() - for _ in range(1000): - _, _, done, _ = menv.step(0) - if done: - menv.reset() - - f = open(mon_file, 'rt') - - firstline = f.readline() - assert firstline.startswith('#') - metadata = json.loads(firstline[1:]) - assert metadata['env_id'] == "CartPole-v1" - assert 
set(metadata.keys()) == {'env_id', 'gym_version', 't_start'}, "Incorrect keys in monitor metadata" - - last_logline = pandas.read_csv(f, index_col=None) - assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline" - f.close() - os.remove(mon_file) \ No newline at end of file diff --git a/baselines/common/__init__.py b/baselines/common/__init__.py deleted file mode 100644 index 0834b36492..0000000000 --- a/baselines/common/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# flake8: noqa F403 -from baselines.common.console_util import * -from baselines.common.dataset import Dataset -from baselines.common.math_util import * -from baselines.common.misc_util import * diff --git a/baselines/common/cg.py b/baselines/common/cg.py deleted file mode 100644 index a913186666..0000000000 --- a/baselines/common/cg.py +++ /dev/null @@ -1,34 +0,0 @@ -import numpy as np -def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): - """ - Demmel p 312 - """ - p = b.copy() - r = b.copy() - x = np.zeros_like(b) - rdotr = r.dot(r) - - fmtstr = "%10i %10.3g %10.3g" - titlestr = "%10s %10s %10s" - if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) - - for i in range(cg_iters): - if callback is not None: - callback(x) - if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) - z = f_Ax(p) - v = rdotr / p.dot(z) - x += v*p - r -= v*z - newrdotr = r.dot(r) - mu = newrdotr/rdotr - p = r + mu*p - - rdotr = newrdotr - if rdotr < residual_tol: - break - - if callback is not None: - callback(x) - if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 - return x \ No newline at end of file diff --git a/baselines/common/console_util.py b/baselines/common/console_util.py deleted file mode 100644 index 8adc3f83ad..0000000000 --- a/baselines/common/console_util.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import print_function -from contextlib import contextmanager -import numpy as np -import time - -# 
================================================================ -# Misc -# ================================================================ - -def fmt_row(width, row, header=False): - out = " | ".join(fmt_item(x, width) for x in row) - if header: out = out + "\n" + "-"*len(out) - return out - -def fmt_item(x, l): - if isinstance(x, np.ndarray): - assert x.ndim==0 - x = x.item() - if isinstance(x, (float, np.float32, np.float64)): - v = abs(x) - if (v < 1e-4 or v > 1e+4) and v > 0: - rep = "%7.2e" % x - else: - rep = "%7.5f" % x - else: rep = str(x) - return " "*(l - len(rep)) + rep - -color2num = dict( - gray=30, - red=31, - green=32, - yellow=33, - blue=34, - magenta=35, - cyan=36, - white=37, - crimson=38 -) - -def colorize(string, color, bold=False, highlight=False): - attr = [] - num = color2num[color] - if highlight: num += 10 - attr.append(str(num)) - if bold: attr.append('1') - return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) - - -MESSAGE_DEPTH = 0 - -@contextmanager -def timed(msg): - global MESSAGE_DEPTH #pylint: disable=W0603 - print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) - tstart = time.time() - MESSAGE_DEPTH += 1 - yield - MESSAGE_DEPTH -= 1 - print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) diff --git a/baselines/common/dataset.py b/baselines/common/dataset.py deleted file mode 100644 index 41a38c8af6..0000000000 --- a/baselines/common/dataset.py +++ /dev/null @@ -1,60 +0,0 @@ -import numpy as np - -class Dataset(object): - def __init__(self, data_map, deterministic=False, shuffle=True): - self.data_map = data_map - self.deterministic = deterministic - self.enable_shuffle = shuffle - self.n = next(iter(data_map.values())).shape[0] - self._next_id = 0 - self.shuffle() - - def shuffle(self): - if self.deterministic: - return - perm = np.arange(self.n) - np.random.shuffle(perm) - - for key in self.data_map: - self.data_map[key] = self.data_map[key][perm] - - self._next_id = 0 
- - def next_batch(self, batch_size): - if self._next_id >= self.n and self.enable_shuffle: - self.shuffle() - - cur_id = self._next_id - cur_batch_size = min(batch_size, self.n - self._next_id) - self._next_id += cur_batch_size - - data_map = dict() - for key in self.data_map: - data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] - return data_map - - def iterate_once(self, batch_size): - if self.enable_shuffle: self.shuffle() - - while self._next_id <= self.n - batch_size: - yield self.next_batch(batch_size) - self._next_id = 0 - - def subset(self, num_elements, deterministic=True): - data_map = dict() - for key in self.data_map: - data_map[key] = self.data_map[key][:num_elements] - return Dataset(data_map, deterministic) - - -def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): - assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' - arrays = tuple(map(np.asarray, arrays)) - n = arrays[0].shape[0] - assert all(a.shape[0] == n for a in arrays[1:]) - inds = np.arange(n) - if shuffle: np.random.shuffle(inds) - sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches - for batch_inds in np.array_split(inds, sections): - if include_final_partial_batch or len(batch_inds) == batch_size: - yield tuple(a[batch_inds] for a in arrays) diff --git a/baselines/common/distributions.py b/baselines/common/distributions.py deleted file mode 100644 index 8a57c37605..0000000000 --- a/baselines/common/distributions.py +++ /dev/null @@ -1,309 +0,0 @@ -import tensorflow as tf -import numpy as np -import baselines.common.tf_util as U -from baselines.a2c.utils import fc -from tensorflow.python.ops import math_ops - -class Pd(object): - """ - A particular probability distribution - """ - def flatparam(self): - raise NotImplementedError - def mode(self): - raise NotImplementedError - def neglogp(self, x): - # Usually it's easier to define the 
negative logprob - raise NotImplementedError - def kl(self, other): - raise NotImplementedError - def entropy(self): - raise NotImplementedError - def sample(self): - raise NotImplementedError - def logp(self, x): - return - self.neglogp(x) - -class PdType(object): - """ - Parametrized family of probability distributions - """ - def pdclass(self): - raise NotImplementedError - def pdfromflat(self, flat): - return self.pdclass()(flat) - def pdfromlatent(self, latent_vector): - raise NotImplementedError - def param_shape(self): - raise NotImplementedError - def sample_shape(self): - raise NotImplementedError - def sample_dtype(self): - raise NotImplementedError - - def param_placeholder(self, prepend_shape, name=None): - return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) - def sample_placeholder(self, prepend_shape, name=None): - return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) - -class CategoricalPdType(PdType): - def __init__(self, ncat): - self.ncat = ncat - def pdclass(self): - return CategoricalPd - def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): - pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias) - return self.pdfromflat(pdparam), pdparam - - def param_shape(self): - return [self.ncat] - def sample_shape(self): - return [] - def sample_dtype(self): - return tf.int32 - - -class MultiCategoricalPdType(PdType): - def __init__(self, nvec): - self.ncats = nvec - def pdclass(self): - return MultiCategoricalPd - def pdfromflat(self, flat): - return MultiCategoricalPd(self.ncats, flat) - def param_shape(self): - return [sum(self.ncats)] - def sample_shape(self): - return [len(self.ncats)] - def sample_dtype(self): - return tf.int32 - -class DiagGaussianPdType(PdType): - def __init__(self, size): - self.size = size - def pdclass(self): - return DiagGaussianPd - - def pdfromlatent(self, latent_vector, init_scale=1.0, 
init_bias=0.0): - mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) - logstd = tf.get_variable(name='logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) - pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) - return self.pdfromflat(pdparam), mean - - def param_shape(self): - return [2*self.size] - def sample_shape(self): - return [self.size] - def sample_dtype(self): - return tf.float32 - -class BernoulliPdType(PdType): - def __init__(self, size): - self.size = size - def pdclass(self): - return BernoulliPd - def param_shape(self): - return [self.size] - def sample_shape(self): - return [self.size] - def sample_dtype(self): - return tf.int32 - -# WRONG SECOND DERIVATIVES -# class CategoricalPd(Pd): -# def __init__(self, logits): -# self.logits = logits -# self.ps = tf.nn.softmax(logits) -# @classmethod -# def fromflat(cls, flat): -# return cls(flat) -# def flatparam(self): -# return self.logits -# def mode(self): -# return U.argmax(self.logits, axis=-1) -# def logp(self, x): -# return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) -# def kl(self, other): -# return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ -# - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) -# def entropy(self): -# return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) -# def sample(self): -# u = tf.random_uniform(tf.shape(self.logits)) -# return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) - -class CategoricalPd(Pd): - def __init__(self, logits): - self.logits = logits - def flatparam(self): - return self.logits - def mode(self): - return tf.argmax(self.logits, axis=-1) - def neglogp(self, x): - # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) - # Note: we can't use sparse_softmax_cross_entropy_with_logits because - # the implementation does not allow second-order derivatives... 
- one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) - return tf.nn.softmax_cross_entropy_with_logits( - logits=self.logits, - labels=one_hot_actions) - def kl(self, other): - a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) - a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True) - ea0 = tf.exp(a0) - ea1 = tf.exp(a1) - z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) - z1 = tf.reduce_sum(ea1, axis=-1, keep_dims=True) - p0 = ea0 / z0 - return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) - def entropy(self): - a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) - ea0 = tf.exp(a0) - z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) - p0 = ea0 / z0 - return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1) - def sample(self): - u = tf.random_uniform(tf.shape(self.logits)) - return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) - @classmethod - def fromflat(cls, flat): - return cls(flat) - -class MultiCategoricalPd(Pd): - def __init__(self, nvec, flat): - self.flat = flat - self.categoricals = list(map(CategoricalPd, tf.split(flat, nvec, axis=-1))) - def flatparam(self): - return self.flat - def mode(self): - return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) - def neglogp(self, x): - return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))]) - def kl(self, other): - return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)]) - def entropy(self): - return tf.add_n([p.entropy() for p in self.categoricals]) - def sample(self): - return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) - @classmethod - def fromflat(cls, flat): - raise NotImplementedError - -class DiagGaussianPd(Pd): - def __init__(self, flat): - self.flat = flat - mean, logstd = tf.split(axis=len(flat.shape)-1, num_or_size_splits=2, value=flat) - self.mean = mean - self.logstd = logstd - 
self.std = tf.exp(logstd) - def flatparam(self): - return self.flat - def mode(self): - return self.mean - def neglogp(self, x): - return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ - + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ - + tf.reduce_sum(self.logstd, axis=-1) - def kl(self, other): - assert isinstance(other, DiagGaussianPd) - return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1) - def entropy(self): - return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) - def sample(self): - return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) - @classmethod - def fromflat(cls, flat): - return cls(flat) - -class BernoulliPd(Pd): - def __init__(self, logits): - self.logits = logits - self.ps = tf.sigmoid(logits) - def flatparam(self): - return self.logits - def mode(self): - return tf.round(self.ps) - def neglogp(self, x): - return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1) - def kl(self, other): - return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) - def entropy(self): - return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) - def sample(self): - u = tf.random_uniform(tf.shape(self.ps)) - return tf.to_float(math_ops.less(u, self.ps)) - @classmethod - def fromflat(cls, flat): - return cls(flat) - -def make_pdtype(ac_space): - from gym import spaces - if isinstance(ac_space, spaces.Box): - assert len(ac_space.shape) == 1 - return DiagGaussianPdType(ac_space.shape[0]) - elif isinstance(ac_space, spaces.Discrete): - return CategoricalPdType(ac_space.n) - elif isinstance(ac_space, spaces.MultiDiscrete): - return 
MultiCategoricalPdType(ac_space.nvec) - elif isinstance(ac_space, spaces.MultiBinary): - return BernoulliPdType(ac_space.n) - else: - raise NotImplementedError - -def shape_el(v, i): - maybe = v.get_shape()[i] - if maybe is not None: - return maybe - else: - return tf.shape(v)[i] - -@U.in_session -def test_probtypes(): - np.random.seed(0) - - pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) - diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2) #pylint: disable=E1101 - validate_probtype(diag_gauss, pdparam_diag_gauss) - - pdparam_categorical = np.array([-.2, .3, .5]) - categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101 - validate_probtype(categorical, pdparam_categorical) - - nvec = [1,2,3] - pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1]) - multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101 - validate_probtype(multicategorical, pdparam_multicategorical) - - pdparam_bernoulli = np.array([-.2, .3, .5]) - bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101 - validate_probtype(bernoulli, pdparam_bernoulli) - - -def validate_probtype(probtype, pdparam): - N = 100000 - # Check to see if mean negative log likelihood == differential entropy - Mval = np.repeat(pdparam[None, :], N, axis=0) - M = probtype.param_placeholder([N]) - X = probtype.sample_placeholder([N]) - pd = probtype.pdfromflat(M) - calcloglik = U.function([X, M], pd.logp(X)) - calcent = U.function([M], pd.entropy()) - Xval = tf.get_default_session().run(pd.sample(), feed_dict={M:Mval}) - logliks = calcloglik(Xval, Mval) - entval_ll = - logliks.mean() #pylint: disable=E1101 - entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 - entval = calcent(Mval).mean() #pylint: disable=E1101 - assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas - - # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] - M2 = probtype.param_placeholder([N]) - pd2 = 
probtype.pdfromflat(M2) - q = pdparam + np.random.randn(pdparam.size) * 0.1 - Mval2 = np.repeat(q[None, :], N, axis=0) - calckl = U.function([M, M2], pd.kl(pd2)) - klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101 - logliks = calcloglik(Xval, Mval2) - klval_ll = - entval - logliks.mean() #pylint: disable=E1101 - klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 - assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas - print('ok on', probtype, pdparam) - diff --git a/baselines/common/filters.py b/baselines/common/filters.py deleted file mode 100644 index 5ce019cd22..0000000000 --- a/baselines/common/filters.py +++ /dev/null @@ -1,98 +0,0 @@ -from .running_stat import RunningStat -from collections import deque -import numpy as np - -class Filter(object): - def __call__(self, x, update=True): - raise NotImplementedError - def reset(self): - pass - -class IdentityFilter(Filter): - def __call__(self, x, update=True): - return x - -class CompositionFilter(Filter): - def __init__(self, fs): - self.fs = fs - def __call__(self, x, update=True): - for f in self.fs: - x = f(x) - return x - def output_shape(self, input_space): - out = input_space.shape - for f in self.fs: - out = f.output_shape(out) - return out - -class ZFilter(Filter): - """ - y = (x-mean)/std - using running estimates of mean,std - """ - - def __init__(self, shape, demean=True, destd=True, clip=10.0): - self.demean = demean - self.destd = destd - self.clip = clip - - self.rs = RunningStat(shape) - - def __call__(self, x, update=True): - if update: self.rs.push(x) - if self.demean: - x = x - self.rs.mean - if self.destd: - x = x / (self.rs.std+1e-8) - if self.clip: - x = np.clip(x, -self.clip, self.clip) - return x - def output_shape(self, input_space): - return input_space.shape - -class AddClock(Filter): - def __init__(self): - self.count = 0 - def reset(self): - self.count = 0 - def __call__(self, x, update=True): - return np.append(x, self.count/100.0) - def 
output_shape(self, input_space): - return (input_space.shape[0]+1,) - -class FlattenFilter(Filter): - def __call__(self, x, update=True): - return x.ravel() - def output_shape(self, input_space): - return (int(np.prod(input_space.shape)),) - -class Ind2OneHotFilter(Filter): - def __init__(self, n): - self.n = n - def __call__(self, x, update=True): - out = np.zeros(self.n) - out[x] = 1 - return out - def output_shape(self, input_space): - return (input_space.n,) - -class DivFilter(Filter): - def __init__(self, divisor): - self.divisor = divisor - def __call__(self, x, update=True): - return x / self.divisor - def output_shape(self, input_space): - return input_space.shape - -class StackFilter(Filter): - def __init__(self, length): - self.stack = deque(maxlen=length) - def reset(self): - self.stack.clear() - def __call__(self, x, update=True): - self.stack.append(x) - while len(self.stack) < self.stack.maxlen: - self.stack.append(x) - return np.concatenate(self.stack, axis=-1) - def output_shape(self, input_space): - return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,) diff --git a/baselines/common/identity_env.py b/baselines/common/identity_env.py deleted file mode 100644 index f07cd5b8d4..0000000000 --- a/baselines/common/identity_env.py +++ /dev/null @@ -1,30 +0,0 @@ -from gym import Env -from gym.spaces import Discrete - - -class IdentityEnv(Env): - def __init__( - self, - dim, - ep_length=100, - ): - - self.action_space = Discrete(dim) - self.reset() - - def reset(self): - self._choose_next_state() - self.observation_space = self.action_space - - return self.state - - def step(self, actions): - rew = self._get_reward(actions) - self._choose_next_state() - return self.state, rew, False, {} - - def _choose_next_state(self): - self.state = self.action_space.sample() - - def _get_reward(self, actions): - return 1 if self.state == actions else 0 diff --git a/baselines/common/input.py b/baselines/common/input.py deleted file mode 100644 index 
7fbf9fc00b..0000000000 --- a/baselines/common/input.py +++ /dev/null @@ -1,30 +0,0 @@ -import tensorflow as tf -from gym.spaces import Discrete, Box - -def observation_input(ob_space, batch_size=None, name='Ob'): - ''' - Build observation input with encoding depending on the - observation space type - Params: - - ob_space: observation space (should be one of gym.spaces) - batch_size: batch size for input (default is None, so that resulting input placeholder can take tensors with any batch size) - name: tensorflow variable name for input placeholder - - returns: tuple (input_placeholder, processed_input_tensor) - ''' - if isinstance(ob_space, Discrete): - input_x = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name) - processed_x = tf.to_float(tf.one_hot(input_x, ob_space.n)) - return input_x, processed_x - - elif isinstance(ob_space, Box): - input_shape = (batch_size,) + ob_space.shape - input_x = tf.placeholder(shape=input_shape, dtype=ob_space.dtype, name=name) - processed_x = tf.to_float(input_x) - return input_x, processed_x - - else: - raise NotImplementedError - - diff --git a/baselines/common/math_util.py b/baselines/common/math_util.py deleted file mode 100644 index 36b8927781..0000000000 --- a/baselines/common/math_util.py +++ /dev/null @@ -1,85 +0,0 @@ -import numpy as np -import scipy.signal - - -def discount(x, gamma): - """ - computes discounted sums along 0th dimension of x. - - inputs - ------ - x: ndarray - gamma: float - - outputs - ------- - y: ndarray with same shape as x, satisfying - - y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], - where k = len(x) - t - 1 - - """ - assert x.ndim >= 1 - return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] - -def explained_variance(ypred,y): - """ - Computes fraction of variance that ypred explains about y. 
- Returns 1 - Var[y-ypred] / Var[y] - - interpretation: - ev=0 => might as well have predicted zero - ev=1 => perfect prediction - ev<0 => worse than just predicting zero - - """ - assert y.ndim == 1 and ypred.ndim == 1 - vary = np.var(y) - return np.nan if vary==0 else 1 - np.var(y-ypred)/vary - -def explained_variance_2d(ypred, y): - assert y.ndim == 2 and ypred.ndim == 2 - vary = np.var(y, axis=0) - out = 1 - np.var(y-ypred)/vary - out[vary < 1e-10] = 0 - return out - -def ncc(ypred, y): - return np.corrcoef(ypred, y)[1,0] - -def flatten_arrays(arrs): - return np.concatenate([arr.flat for arr in arrs]) - -def unflatten_vector(vec, shapes): - i=0 - arrs = [] - for shape in shapes: - size = np.prod(shape) - arr = vec[i:i+size].reshape(shape) - arrs.append(arr) - i += size - return arrs - -def discount_with_boundaries(X, New, gamma): - """ - X: 2d array of floats, time x features - New: 2d array of bools, indicating when a new episode has started - """ - Y = np.zeros_like(X) - T = X.shape[0] - Y[T-1] = X[T-1] - for t in range(T-2, -1, -1): - Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) - return Y - -def test_discount_with_boundaries(): - gamma=0.9 - x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') - starts = [1.0, 0.0, 0.0, 1.0] - y = discount_with_boundaries(x, starts, gamma) - assert np.allclose(y, [ - 1 + gamma * 2 + gamma**2 * 3, - 2 + gamma * 3, - 3, - 4 - ]) \ No newline at end of file diff --git a/baselines/common/mpi_adam.py b/baselines/common/mpi_adam.py deleted file mode 100644 index 4902caf629..0000000000 --- a/baselines/common/mpi_adam.py +++ /dev/null @@ -1,79 +0,0 @@ -from mpi4py import MPI -import baselines.common.tf_util as U -import tensorflow as tf -import numpy as np - -class MpiAdam(object): - def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): - self.var_list = var_list - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.scale_grad_by_procs = scale_grad_by_procs - size 
= sum(U.numel(v) for v in var_list) - self.m = np.zeros(size, 'float32') - self.v = np.zeros(size, 'float32') - self.t = 0 - self.setfromflat = U.SetFromFlat(var_list) - self.getflat = U.GetFlat(var_list) - self.comm = MPI.COMM_WORLD if comm is None else comm - - def update(self, localg, stepsize): - if self.t % 100 == 0: - self.check_synced() - localg = localg.astype('float32') - globalg = np.zeros_like(localg) - self.comm.Allreduce(localg, globalg, op=MPI.SUM) - if self.scale_grad_by_procs: - globalg /= self.comm.Get_size() - - self.t += 1 - a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) - self.m = self.beta1 * self.m + (1 - self.beta1) * globalg - self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) - step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) - self.setfromflat(self.getflat() + step) - - def sync(self): - theta = self.getflat() - self.comm.Bcast(theta, root=0) - self.setfromflat(theta) - - def check_synced(self): - if self.comm.Get_rank() == 0: # this is root - theta = self.getflat() - self.comm.Bcast(theta, root=0) - else: - thetalocal = self.getflat() - thetaroot = np.empty_like(thetalocal) - self.comm.Bcast(thetaroot, root=0) - assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) - -@U.in_session -def test_MpiAdam(): - np.random.seed(0) - tf.set_random_seed(0) - - a = tf.Variable(np.random.randn(3).astype('float32')) - b = tf.Variable(np.random.randn(2,5).astype('float32')) - loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) - - stepsize = 1e-2 - update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) - do_update = U.function([], loss, updates=[update_op]) - - tf.get_default_session().run(tf.global_variables_initializer()) - for i in range(10): - print(i,do_update()) - - tf.set_random_seed(0) - tf.get_default_session().run(tf.global_variables_initializer()) - - var_list = [a,b] - lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op]) - adam = 
MpiAdam(var_list) - - for i in range(10): - l,g = lossandgrad() - adam.update(g, stepsize) - print(i,l) \ No newline at end of file diff --git a/baselines/common/mpi_moments.py b/baselines/common/mpi_moments.py deleted file mode 100644 index 7fcc6cd828..0000000000 --- a/baselines/common/mpi_moments.py +++ /dev/null @@ -1,60 +0,0 @@ -from mpi4py import MPI -import numpy as np -from baselines.common import zipsame - - -def mpi_mean(x, axis=0, comm=None, keepdims=False): - x = np.asarray(x) - assert x.ndim > 0 - if comm is None: comm = MPI.COMM_WORLD - xsum = x.sum(axis=axis, keepdims=keepdims) - n = xsum.size - localsum = np.zeros(n+1, x.dtype) - localsum[:n] = xsum.ravel() - localsum[n] = x.shape[axis] - globalsum = np.zeros_like(localsum) - comm.Allreduce(localsum, globalsum, op=MPI.SUM) - return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] - -def mpi_moments(x, axis=0, comm=None, keepdims=False): - x = np.asarray(x) - assert x.ndim > 0 - mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True) - sqdiffs = np.square(x - mean) - meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) - assert count1 == count - std = np.sqrt(meansqdiff) - if not keepdims: - newshape = mean.shape[:axis] + mean.shape[axis+1:] - mean = mean.reshape(newshape) - std = std.reshape(newshape) - return mean, std, count - - -def test_runningmeanstd(): - import subprocess - subprocess.check_call(['mpirun', '-np', '3', - 'python','-c', - 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']) - -def _helper_runningmeanstd(): - comm = MPI.COMM_WORLD - np.random.seed(0) - for (triple,axis) in [ - ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), - ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), - ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), - ]: - - - x = np.concatenate(triple, axis=axis) - ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] - 
- - ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) - - for (a1,a2) in zipsame(ms1, ms2): - print(a1, a2) - assert np.allclose(a1, a2) - print("ok!") - diff --git a/baselines/common/mpi_running_mean_std.py b/baselines/common/mpi_running_mean_std.py deleted file mode 100644 index 408f8a22b8..0000000000 --- a/baselines/common/mpi_running_mean_std.py +++ /dev/null @@ -1,107 +0,0 @@ -from mpi4py import MPI -import tensorflow as tf, baselines.common.tf_util as U, numpy as np - -class RunningMeanStd(object): - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm - def __init__(self, epsilon=1e-2, shape=()): - - self._sum = tf.get_variable( - dtype=tf.float64, - shape=shape, - initializer=tf.constant_initializer(0.0), - name="runningsum", trainable=False) - self._sumsq = tf.get_variable( - dtype=tf.float64, - shape=shape, - initializer=tf.constant_initializer(epsilon), - name="runningsumsq", trainable=False) - self._count = tf.get_variable( - dtype=tf.float64, - shape=(), - initializer=tf.constant_initializer(epsilon), - name="count", trainable=False) - self.shape = shape - - self.mean = tf.to_float(self._sum / self._count) - self.std = tf.sqrt( tf.maximum( tf.to_float(self._sumsq / self._count) - tf.square(self.mean) , 1e-2 )) - - newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') - newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') - newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') - self.incfiltparams = U.function([newsum, newsumsq, newcount], [], - updates=[tf.assign_add(self._sum, newsum), - tf.assign_add(self._sumsq, newsumsq), - tf.assign_add(self._count, newcount)]) - - - def update(self, x): - x = x.astype('float64') - n = int(np.prod(self.shape)) - totalvec = np.zeros(n*2+1, 'float64') - addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')]) - MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) - 
self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n]) - -@U.in_session -def test_runningmeanstd(): - for (x1, x2, x3) in [ - (np.random.randn(3), np.random.randn(4), np.random.randn(5)), - (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), - ]: - - rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) - U.initialize() - - x = np.concatenate([x1, x2, x3], axis=0) - ms1 = [x.mean(axis=0), x.std(axis=0)] - rms.update(x1) - rms.update(x2) - rms.update(x3) - ms2 = [rms.mean.eval(), rms.std.eval()] - - assert np.allclose(ms1, ms2) - -@U.in_session -def test_dist(): - np.random.seed(0) - p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1)) - q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1)) - - # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5)) - # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8)) - - comm = MPI.COMM_WORLD - assert comm.Get_size()==2 - if comm.Get_rank()==0: - x1,x2,x3 = p1,p2,p3 - elif comm.Get_rank()==1: - x1,x2,x3 = q1,q2,q3 - else: - assert False - - rms = RunningMeanStd(epsilon=0.0, shape=(1,)) - U.initialize() - - rms.update(x1) - rms.update(x2) - rms.update(x3) - - bigvec = np.concatenate([p1,p2,p3,q1,q2,q3]) - - def checkallclose(x,y): - print(x,y) - return np.allclose(x,y) - - assert checkallclose( - bigvec.mean(axis=0), - rms.mean.eval(), - ) - assert checkallclose( - bigvec.std(axis=0), - rms.std.eval(), - ) - - -if __name__ == "__main__": - # Run with mpirun -np 2 python - test_dist() diff --git a/baselines/common/runners.py b/baselines/common/runners.py deleted file mode 100644 index 0a4b2214f7..0000000000 --- a/baselines/common/runners.py +++ /dev/null @@ -1,18 +0,0 @@ -import numpy as np -from abc import ABC, abstractmethod - -class AbstractEnvRunner(ABC): - def __init__(self, *, env, model, nsteps): - self.env = env - self.model = model - nenv = env.num_envs - self.batch_ob_shape = 
(nenv*nsteps,) + env.observation_space.shape - self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) - self.obs[:] = env.reset() - self.nsteps = nsteps - self.states = model.initial_state - self.dones = [False for _ in range(nenv)] - - @abstractmethod - def run(self): - raise NotImplementedError diff --git a/baselines/common/running_mean_std.py b/baselines/common/running_mean_std.py deleted file mode 100644 index 06ba8d8f11..0000000000 --- a/baselines/common/running_mean_std.py +++ /dev/null @@ -1,46 +0,0 @@ -import numpy as np -class RunningMeanStd(object): - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm - def __init__(self, epsilon=1e-4, shape=()): - self.mean = np.zeros(shape, 'float64') - self.var = np.ones(shape, 'float64') - self.count = epsilon - - def update(self, x): - batch_mean = np.mean(x, axis=0) - batch_var = np.var(x, axis=0) - batch_count = x.shape[0] - self.update_from_moments(batch_mean, batch_var, batch_count) - - def update_from_moments(self, batch_mean, batch_var, batch_count): - delta = batch_mean - self.mean - tot_count = self.count + batch_count - - new_mean = self.mean + delta * batch_count / tot_count - m_a = self.var * (self.count) - m_b = batch_var * (batch_count) - M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) - new_var = M2 / (self.count + batch_count) - - new_count = batch_count + self.count - - self.mean = new_mean - self.var = new_var - self.count = new_count - -def test_runningmeanstd(): - for (x1, x2, x3) in [ - (np.random.randn(3), np.random.randn(4), np.random.randn(5)), - (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), - ]: - - rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) - - x = np.concatenate([x1, x2, x3], axis=0) - ms1 = [x.mean(axis=0), x.var(axis=0)] - rms.update(x1) - rms.update(x2) - rms.update(x3) - ms2 = [rms.mean, rms.var] - - assert np.allclose(ms1, ms2) diff 
--git a/baselines/common/running_stat.py b/baselines/common/running_stat.py deleted file mode 100644 index b9aa86c2ff..0000000000 --- a/baselines/common/running_stat.py +++ /dev/null @@ -1,46 +0,0 @@ -import numpy as np - -# http://www.johndcook.com/blog/standard_deviation/ -class RunningStat(object): - def __init__(self, shape): - self._n = 0 - self._M = np.zeros(shape) - self._S = np.zeros(shape) - def push(self, x): - x = np.asarray(x) - assert x.shape == self._M.shape - self._n += 1 - if self._n == 1: - self._M[...] = x - else: - oldM = self._M.copy() - self._M[...] = oldM + (x - oldM)/self._n - self._S[...] = self._S + (x - oldM)*(x - self._M) - @property - def n(self): - return self._n - @property - def mean(self): - return self._M - @property - def var(self): - return self._S/(self._n - 1) if self._n > 1 else np.square(self._M) - @property - def std(self): - return np.sqrt(self.var) - @property - def shape(self): - return self._M.shape - -def test_running_stat(): - for shp in ((), (3,), (3,4)): - li = [] - rs = RunningStat(shp) - for _ in range(5): - val = np.random.randn(*shp) - rs.push(val) - li.append(val) - m = np.mean(li, axis=0) - assert np.allclose(rs.mean, m) - v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) - assert np.allclose(rs.var, v) diff --git a/baselines/common/test_identity.py b/baselines/common/test_identity.py deleted file mode 100644 index a429e0c27b..0000000000 --- a/baselines/common/test_identity.py +++ /dev/null @@ -1,44 +0,0 @@ -import pytest -import tensorflow as tf -import random -import numpy as np -from gym.spaces import np_random - -from baselines.a2c import a2c -from baselines.ppo2 import ppo2 -from baselines.common.identity_env import IdentityEnv -from baselines.common.vec_env.dummy_vec_env import DummyVecEnv -from baselines.ppo2.policies import MlpPolicy - - -learn_func_list = [ - lambda e: a2c.learn(policy=MlpPolicy, env=e, seed=0, total_timesteps=50000), - lambda e: ppo2.learn(policy=MlpPolicy, env=e, 
total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.01) -] - - -@pytest.mark.slow -@pytest.mark.parametrize("learn_func", learn_func_list) -def test_identity(learn_func): - ''' - Test if the algorithm (with a given policy) - can learn an identity transformation (i.e. return observation as an action) - ''' - np.random.seed(0) - np_random.seed(0) - random.seed(0) - - env = DummyVecEnv([lambda: IdentityEnv(10)]) - - with tf.Graph().as_default(), tf.Session().as_default(): - tf.set_random_seed(0) - model = learn_func(env) - - N_TRIALS = 1000 - sum_rew = 0 - obs = env.reset() - for i in range(N_TRIALS): - obs, rew, done, _ = env.step(model.step(obs)[0]) - sum_rew += rew - - assert sum_rew > 0.9 * N_TRIALS diff --git a/baselines/common/tests/test_schedules.py b/baselines/common/tests/test_schedules.py deleted file mode 100644 index 4e8d02d291..0000000000 --- a/baselines/common/tests/test_schedules.py +++ /dev/null @@ -1,26 +0,0 @@ -import numpy as np - -from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule - - -def test_piecewise_schedule(): - ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) - - assert np.isclose(ps.value(-10), 500) - assert np.isclose(ps.value(0), 150) - assert np.isclose(ps.value(5), 200) - assert np.isclose(ps.value(9), 80) - assert np.isclose(ps.value(50), 50) - assert np.isclose(ps.value(80), 50) - assert np.isclose(ps.value(150), 0) - assert np.isclose(ps.value(175), -25) - assert np.isclose(ps.value(201), 500) - assert np.isclose(ps.value(500), 500) - - assert np.isclose(ps.value(200 - 1e-10), -50) - - -def test_constant_schedule(): - cs = ConstantSchedule(5) - for i in range(-100, 100): - assert np.isclose(cs.value(i), 5) diff --git a/baselines/common/tests/test_tf_util.py b/baselines/common/tests/test_tf_util.py deleted file mode 100644 index daad9d0210..0000000000 --- a/baselines/common/tests/test_tf_util.py +++ /dev/null @@ -1,40 +0,0 @@ -# tests for tf_util -import 
tensorflow as tf -from baselines.common.tf_util import ( - function, - initialize, - single_threaded_session -) - - -def test_function(): - with tf.Graph().as_default(): - x = tf.placeholder(tf.int32, (), name="x") - y = tf.placeholder(tf.int32, (), name="y") - z = 3 * x + 2 * y - lin = function([x, y], z, givens={y: 0}) - - with single_threaded_session(): - initialize() - - assert lin(2) == 6 - assert lin(2, 2) == 10 - - -def test_multikwargs(): - with tf.Graph().as_default(): - x = tf.placeholder(tf.int32, (), name="x") - with tf.variable_scope("other"): - x2 = tf.placeholder(tf.int32, (), name="x") - z = 3 * x + 2 * x2 - - lin = function([x, x2], z, givens={x2: 0}) - with single_threaded_session(): - initialize() - assert lin(2) == 6 - assert lin(2, 2) == 10 - - -if __name__ == '__main__': - test_function() - test_multikwargs() diff --git a/baselines/common/tf_util.py b/baselines/common/tf_util.py deleted file mode 100644 index afcd593e85..0000000000 --- a/baselines/common/tf_util.py +++ /dev/null @@ -1,304 +0,0 @@ -import numpy as np -import tensorflow as tf # pylint: ignore-module -import copy -import os -import functools -import collections -import multiprocessing - -def switch(condition, then_expression, else_expression): - """Switches between two operations depending on a scalar value (int or bool). - Note that both `then_expression` and `else_expression` - should be symbolic tensors of the *same shape*. - - # Arguments - condition: scalar tensor. - then_expression: TensorFlow operation. - else_expression: TensorFlow operation. 
- """ - x_shape = copy.copy(then_expression.get_shape()) - x = tf.cond(tf.cast(condition, 'bool'), - lambda: then_expression, - lambda: else_expression) - x.set_shape(x_shape) - return x - -# ================================================================ -# Extras -# ================================================================ - -def lrelu(x, leak=0.2): - f1 = 0.5 * (1 + leak) - f2 = 0.5 * (1 - leak) - return f1 * x + f2 * abs(x) - -# ================================================================ -# Mathematical utils -# ================================================================ - -def huber_loss(x, delta=1.0): - """Reference: https://en.wikipedia.org/wiki/Huber_loss""" - return tf.where( - tf.abs(x) < delta, - tf.square(x) * 0.5, - delta * (tf.abs(x) - 0.5 * delta) - ) - -# ================================================================ -# Global session -# ================================================================ - -def make_session(num_cpu=None, make_default=False, graph=None): - """Returns a session that will use CPU's only""" - if num_cpu is None: - num_cpu = int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count())) - tf_config = tf.ConfigProto( - inter_op_parallelism_threads=num_cpu, - intra_op_parallelism_threads=num_cpu) - if make_default: - return tf.InteractiveSession(config=tf_config, graph=graph) - else: - return tf.Session(config=tf_config, graph=graph) - -def single_threaded_session(): - """Returns a session which will only use a single CPU""" - return make_session(num_cpu=1) - -def in_session(f): - @functools.wraps(f) - def newfunc(*args, **kwargs): - with tf.Session(): - f(*args, **kwargs) - return newfunc - -ALREADY_INITIALIZED = set() - -def initialize(): - """Initialize all the uninitialized variables in the global scope.""" - new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED - tf.get_default_session().run(tf.variables_initializer(new_variables)) - ALREADY_INITIALIZED.update(new_variables) - -# 
================================================================ -# Model components -# ================================================================ - -def normc_initializer(std=1.0, axis=0): - def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613 - out = np.random.randn(*shape).astype(np.float32) - out *= std / np.sqrt(np.square(out).sum(axis=axis, keepdims=True)) - return tf.constant(out) - return _initializer - -def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, - summary_tag=None): - with tf.variable_scope(name): - stride_shape = [1, stride[0], stride[1], 1] - filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters] - - # there are "num input feature maps * filter height * filter width" - # inputs to each hidden unit - fan_in = intprod(filter_shape[:3]) - # each unit in the lower layer receives a gradient from: - # "num output feature maps * filter height * filter width" / - # pooling size - fan_out = intprod(filter_shape[:2]) * num_filters - # initialize weights with random weights - w_bound = np.sqrt(6. / (fan_in + fan_out)) - - w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), - collections=collections) - b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer(), - collections=collections) - - if summary_tag is not None: - tf.summary.image(summary_tag, - tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]), - [2, 0, 1, 3]), - max_images=10) - - return tf.nn.conv2d(x, w, stride_shape, pad) + b - -# ================================================================ -# Theano-like Function -# ================================================================ - -def function(inputs, outputs, updates=None, givens=None): - """Just like Theano function. 
Take a bunch of tensorflow placeholders and expressions - computed based on those placeholders and produces f(inputs) -> outputs. Function f takes - values to be fed to the input's placeholders and produces the values of the expressions - in outputs. - - Input values can be passed in the same order as inputs or can be provided as kwargs based - on placeholder name (passed to constructor or accessible via placeholder.op.name). - - Example: - x = tf.placeholder(tf.int32, (), name="x") - y = tf.placeholder(tf.int32, (), name="y") - z = 3 * x + 2 * y - lin = function([x, y], z, givens={y: 0}) - - with single_threaded_session(): - initialize() - - assert lin(2) == 6 - assert lin(x=3) == 9 - assert lin(2, 2) == 10 - assert lin(x=2, y=3) == 12 - - Parameters - ---------- - inputs: [tf.placeholder, tf.constant, or object with make_feed_dict method] - list of input arguments - outputs: [tf.Variable] or tf.Variable - list of outputs or a single output to be returned from function. Returned - value will also have the same shape. 
- """ - if isinstance(outputs, list): - return _Function(inputs, outputs, updates, givens=givens) - elif isinstance(outputs, (dict, collections.OrderedDict)): - f = _Function(inputs, outputs.values(), updates, givens=givens) - return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) - else: - f = _Function(inputs, [outputs], updates, givens=givens) - return lambda *args, **kwargs: f(*args, **kwargs)[0] - - -class _Function(object): - def __init__(self, inputs, outputs, updates, givens): - for inpt in inputs: - if not hasattr(inpt, 'make_feed_dict') and not (type(inpt) is tf.Tensor and len(inpt.op.inputs) == 0): - assert False, "inputs should all be placeholders, constants, or have a make_feed_dict method" - self.inputs = inputs - updates = updates or [] - self.update_group = tf.group(*updates) - self.outputs_update = list(outputs) + [self.update_group] - self.givens = {} if givens is None else givens - - def _feed_input(self, feed_dict, inpt, value): - if hasattr(inpt, 'make_feed_dict'): - feed_dict.update(inpt.make_feed_dict(value)) - else: - feed_dict[inpt] = value - - def __call__(self, *args): - assert len(args) <= len(self.inputs), "Too many arguments provided" - feed_dict = {} - # Update the args - for inpt, value in zip(self.inputs, args): - self._feed_input(feed_dict, inpt, value) - # Update feed dict with givens. 
- for inpt in self.givens: - feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) - results = tf.get_default_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] - return results - -# ================================================================ -# Flat vectors -# ================================================================ - -def var_shape(x): - out = x.get_shape().as_list() - assert all(isinstance(a, int) for a in out), \ - "shape function assumes that shape is fully known" - return out - -def numel(x): - return intprod(var_shape(x)) - -def intprod(x): - return int(np.prod(x)) - -def flatgrad(loss, var_list, clip_norm=None): - grads = tf.gradients(loss, var_list) - if clip_norm is not None: - grads = [tf.clip_by_norm(grad, clip_norm=clip_norm) for grad in grads] - return tf.concat(axis=0, values=[ - tf.reshape(grad if grad is not None else tf.zeros_like(v), [numel(v)]) - for (v, grad) in zip(var_list, grads) - ]) - -class SetFromFlat(object): - def __init__(self, var_list, dtype=tf.float32): - assigns = [] - shapes = list(map(var_shape, var_list)) - total_size = np.sum([intprod(shape) for shape in shapes]) - - self.theta = theta = tf.placeholder(dtype, [total_size]) - start = 0 - assigns = [] - for (shape, v) in zip(shapes, var_list): - size = intprod(shape) - assigns.append(tf.assign(v, tf.reshape(theta[start:start + size], shape))) - start += size - self.op = tf.group(*assigns) - - def __call__(self, theta): - tf.get_default_session().run(self.op, feed_dict={self.theta: theta}) - -class GetFlat(object): - def __init__(self, var_list): - self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list]) - - def __call__(self): - return tf.get_default_session().run(self.op) - -_PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) - -def get_placeholder(name, dtype, shape): - if name in _PLACEHOLDER_CACHE: - out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] - assert dtype1 == dtype and shape1 == shape - return out - else: - 
out = tf.placeholder(dtype=dtype, shape=shape, name=name) - _PLACEHOLDER_CACHE[name] = (out, dtype, shape) - return out - -def get_placeholder_cached(name): - return _PLACEHOLDER_CACHE[name][0] - -def flattenallbut0(x): - return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) - - -# ================================================================ -# Diagnostics -# ================================================================ - -def display_var_info(vars): - from baselines import logger - count_params = 0 - for v in vars: - name = v.name - if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: continue - v_params = np.prod(v.shape.as_list()) - count_params += v_params - if "/b:" in name or "/biases" in name: continue # Wx+b, bias is not interesting to look at => count params, but not print - logger.info(" %s%s %i params %s" % (name, " "*(55-len(name)), v_params, str(v.shape))) - - logger.info("Total model parameters: %0.2f million" % (count_params*1e-6)) - - -def get_available_gpus(): - # recipe from here: - # https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa - - from tensorflow.python.client import device_lib - local_device_protos = device_lib.list_local_devices() - return [x.name for x in local_device_protos if x.device_type == 'GPU'] - -# ================================================================ -# Saving variables -# ================================================================ - -def load_state(fname): - saver = tf.train.Saver() - saver.restore(tf.get_default_session(), fname) - -def save_state(fname): - os.makedirs(os.path.dirname(fname), exist_ok=True) - saver = tf.train.Saver() - saver.save(tf.get_default_session(), fname) - - diff --git a/baselines/common/tile_images.py b/baselines/common/tile_images.py deleted file mode 100644 index 929da8994a..0000000000 --- a/baselines/common/tile_images.py +++ 
/dev/null @@ -1,23 +0,0 @@ -import numpy as np - -def tile_images(img_nhwc): - """ - Tile N images into one big PxQ image - (P,Q) are chosen to be as close as possible, and if N - is square, then P=Q. - - input: img_nhwc, list or array of images, ndim=4 once turned into array - n = batch index, h = height, w = width, c = channel - returns: - bigim_HWc, ndarray with ndim=3 - """ - img_nhwc = np.asarray(img_nhwc) - N, h, w, c = img_nhwc.shape - H = int(np.ceil(np.sqrt(N))) - W = int(np.ceil(float(N)/H)) - img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)]) - img_HWhwc = img_nhwc.reshape(H, W, h, w, c) - img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4) - img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c) - return img_Hh_Ww_c - diff --git a/baselines/common/vec_env/subproc_vec_env.py b/baselines/common/vec_env/subproc_vec_env.py deleted file mode 100644 index fb55df45d3..0000000000 --- a/baselines/common/vec_env/subproc_vec_env.py +++ /dev/null @@ -1,97 +0,0 @@ -import numpy as np -from multiprocessing import Process, Pipe -from baselines.common.vec_env import VecEnv, CloudpickleWrapper -from baselines.common.tile_images import tile_images - - -def worker(remote, parent_remote, env_fn_wrapper): - parent_remote.close() - env = env_fn_wrapper.x() - while True: - cmd, data = remote.recv() - if cmd == 'step': - ob, reward, done, info = env.step(data) - if done: - ob = env.reset() - remote.send((ob, reward, done, info)) - elif cmd == 'reset': - ob = env.reset() - remote.send(ob) - elif cmd == 'render': - remote.send(env.render(mode='rgb_array')) - elif cmd == 'close': - remote.close() - break - elif cmd == 'get_spaces': - remote.send((env.observation_space, env.action_space)) - else: - raise NotImplementedError - - -class SubprocVecEnv(VecEnv): - def __init__(self, env_fns, spaces=None): - """ - envs: list of gym environments to run in subprocesses - """ - self.waiting = False - self.closed = False - nenvs = len(env_fns) - self.remotes, self.work_remotes = 
zip(*[Pipe() for _ in range(nenvs)]) - self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) - for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] - for p in self.ps: - p.daemon = True # if the main process crashes, we should not cause things to hang - p.start() - for remote in self.work_remotes: - remote.close() - - self.remotes[0].send(('get_spaces', None)) - observation_space, action_space = self.remotes[0].recv() - VecEnv.__init__(self, len(env_fns), observation_space, action_space) - - def step_async(self, actions): - for remote, action in zip(self.remotes, actions): - remote.send(('step', action)) - self.waiting = True - - def step_wait(self): - results = [remote.recv() for remote in self.remotes] - self.waiting = False - obs, rews, dones, infos = zip(*results) - return np.stack(obs), np.stack(rews), np.stack(dones), infos - - def reset(self): - for remote in self.remotes: - remote.send(('reset', None)) - return np.stack([remote.recv() for remote in self.remotes]) - - def reset_task(self): - for remote in self.remotes: - remote.send(('reset_task', None)) - return np.stack([remote.recv() for remote in self.remotes]) - - def close(self): - if self.closed: - return - if self.waiting: - for remote in self.remotes: - remote.recv() - for remote in self.remotes: - remote.send(('close', None)) - for p in self.ps: - p.join() - self.closed = True - - def render(self, mode='human'): - for pipe in self.remotes: - pipe.send(('render', None)) - imgs = [pipe.recv() for pipe in self.remotes] - bigimg = tile_images(imgs) - if mode == 'human': - import cv2 - cv2.imshow('vecenv', bigimg[:,:,::-1]) - cv2.waitKey(1) - elif mode == 'rgb_array': - return bigimg - else: - raise NotImplementedError \ No newline at end of file diff --git a/baselines/common/vec_env/vec_frame_stack.py b/baselines/common/vec_env/vec_frame_stack.py deleted file mode 100644 index 0bbcbdbb58..0000000000 --- 
a/baselines/common/vec_env/vec_frame_stack.py +++ /dev/null @@ -1,38 +0,0 @@ -from baselines.common.vec_env import VecEnvWrapper -import numpy as np -from gym import spaces - -class VecFrameStack(VecEnvWrapper): - """ - Vectorized environment base class - """ - def __init__(self, venv, nstack): - self.venv = venv - self.nstack = nstack - wos = venv.observation_space # wrapped ob space - low = np.repeat(wos.low, self.nstack, axis=-1) - high = np.repeat(wos.high, self.nstack, axis=-1) - self.stackedobs = np.zeros((venv.num_envs,)+low.shape, low.dtype) - observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) - VecEnvWrapper.__init__(self, venv, observation_space=observation_space) - - def step_wait(self): - obs, rews, news, infos = self.venv.step_wait() - self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) - for (i, new) in enumerate(news): - if new: - self.stackedobs[i] = 0 - self.stackedobs[..., -obs.shape[-1]:] = obs - return self.stackedobs, rews, news, infos - - def reset(self): - """ - Reset all environments - """ - obs = self.venv.reset() - self.stackedobs[...] 
= 0 - self.stackedobs[..., -obs.shape[-1]:] = obs - return self.stackedobs - - def close(self): - self.venv.close() diff --git a/baselines/common/vec_env/vec_normalize.py b/baselines/common/vec_env/vec_normalize.py deleted file mode 100644 index dda767da15..0000000000 --- a/baselines/common/vec_env/vec_normalize.py +++ /dev/null @@ -1,47 +0,0 @@ -from baselines.common.vec_env import VecEnvWrapper -from baselines.common.running_mean_std import RunningMeanStd -import numpy as np - -class VecNormalize(VecEnvWrapper): - """ - Vectorized environment base class - """ - def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): - VecEnvWrapper.__init__(self, venv) - self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None - self.ret_rms = RunningMeanStd(shape=()) if ret else None - self.clipob = clipob - self.cliprew = cliprew - self.ret = np.zeros(self.num_envs) - self.gamma = gamma - self.epsilon = epsilon - - def step_wait(self): - """ - Apply sequence of actions to sequence of environments - actions -> (observations, rewards, news) - - where 'news' is a boolean vector indicating whether each element is new. 
- """ - obs, rews, news, infos = self.venv.step_wait() - self.ret = self.ret * self.gamma + rews - obs = self._obfilt(obs) - if self.ret_rms: - self.ret_rms.update(self.ret) - rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) - return obs, rews, news, infos - - def _obfilt(self, obs): - if self.ob_rms: - self.ob_rms.update(obs) - obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) - return obs - else: - return obs - - def reset(self): - """ - Reset all environments - """ - obs = self.venv.reset() - return self._obfilt(obs) diff --git a/baselines/ddpg/README.md b/baselines/ddpg/README.md deleted file mode 100644 index 6e936dd334..0000000000 --- a/baselines/ddpg/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# DDPG - -- Original paper: https://arxiv.org/abs/1509.02971 -- Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/ -- `python -m baselines.ddpg.main` runs the algorithm for 1M frames = 10M timesteps on a Mujoco environment. See help (`-h`) for more options. 
\ No newline at end of file diff --git a/baselines/ddpg/__init__.py b/baselines/ddpg/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/baselines/ddpg/ddpg.py b/baselines/ddpg/ddpg.py deleted file mode 100644 index e2d49501c7..0000000000 --- a/baselines/ddpg/ddpg.py +++ /dev/null @@ -1,378 +0,0 @@ -from copy import copy -from functools import reduce - -import numpy as np -import tensorflow as tf -import tensorflow.contrib as tc - -from baselines import logger -from baselines.common.mpi_adam import MpiAdam -import baselines.common.tf_util as U -from baselines.common.mpi_running_mean_std import RunningMeanStd -from mpi4py import MPI - -def normalize(x, stats): - if stats is None: - return x - return (x - stats.mean) / stats.std - - -def denormalize(x, stats): - if stats is None: - return x - return x * stats.std + stats.mean - -def reduce_std(x, axis=None, keepdims=False): - return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims)) - -def reduce_var(x, axis=None, keepdims=False): - m = tf.reduce_mean(x, axis=axis, keep_dims=True) - devs_squared = tf.square(x - m) - return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims) - -def get_target_updates(vars, target_vars, tau): - logger.info('setting up target updates ...') - soft_updates = [] - init_updates = [] - assert len(vars) == len(target_vars) - for var, target_var in zip(vars, target_vars): - logger.info(' {} <- {}'.format(target_var.name, var.name)) - init_updates.append(tf.assign(target_var, var)) - soft_updates.append(tf.assign(target_var, (1. 
- tau) * target_var + tau * var)) - assert len(init_updates) == len(vars) - assert len(soft_updates) == len(vars) - return tf.group(*init_updates), tf.group(*soft_updates) - - -def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev): - assert len(actor.vars) == len(perturbed_actor.vars) - assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars) - - updates = [] - for var, perturbed_var in zip(actor.vars, perturbed_actor.vars): - if var in actor.perturbable_vars: - logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name)) - updates.append(tf.assign(perturbed_var, var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev))) - else: - logger.info(' {} <- {}'.format(perturbed_var.name, var.name)) - updates.append(tf.assign(perturbed_var, var)) - assert len(updates) == len(actor.vars) - return tf.group(*updates) - - -class DDPG(object): - def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, - gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, - batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), - adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, - critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): - # Inputs. 
- self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') - self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') - self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') - self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') - self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions') - self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') - self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') - - # Parameters. - self.gamma = gamma - self.tau = tau - self.memory = memory - self.normalize_observations = normalize_observations - self.normalize_returns = normalize_returns - self.action_noise = action_noise - self.param_noise = param_noise - self.action_range = action_range - self.return_range = return_range - self.observation_range = observation_range - self.critic = critic - self.actor = actor - self.actor_lr = actor_lr - self.critic_lr = critic_lr - self.clip_norm = clip_norm - self.enable_popart = enable_popart - self.reward_scale = reward_scale - self.batch_size = batch_size - self.stats_sample = None - self.critic_l2_reg = critic_l2_reg - - # Observation normalization. - if self.normalize_observations: - with tf.variable_scope('obs_rms'): - self.obs_rms = RunningMeanStd(shape=observation_shape) - else: - self.obs_rms = None - normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), - self.observation_range[0], self.observation_range[1]) - normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), - self.observation_range[0], self.observation_range[1]) - - # Return normalization. - if self.normalize_returns: - with tf.variable_scope('ret_rms'): - self.ret_rms = RunningMeanStd() - else: - self.ret_rms = None - - # Create target networks. 
- target_actor = copy(actor) - target_actor.name = 'target_actor' - self.target_actor = target_actor - target_critic = copy(critic) - target_critic.name = 'target_critic' - self.target_critic = target_critic - - # Create networks and core TF parts that are shared across setup parts. - self.actor_tf = actor(normalized_obs0) - self.normalized_critic_tf = critic(normalized_obs0, self.actions) - self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) - self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) - self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) - Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) - self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 - - # Set up parts. - if self.param_noise is not None: - self.setup_param_noise(normalized_obs0) - self.setup_actor_optimizer() - self.setup_critic_optimizer() - if self.normalize_returns and self.enable_popart: - self.setup_popart() - self.setup_stats() - self.setup_target_network_updates() - - def setup_target_network_updates(self): - actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau) - critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau) - self.target_init_updates = [actor_init_updates, critic_init_updates] - self.target_soft_updates = [actor_soft_updates, critic_soft_updates] - - def setup_param_noise(self, normalized_obs0): - assert self.param_noise is not None - - # Configure perturbed actor. 
- param_noise_actor = copy(self.actor) - param_noise_actor.name = 'param_noise_actor' - self.perturbed_actor_tf = param_noise_actor(normalized_obs0) - logger.info('setting up param noise') - self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev) - - # Configure separate copy for stddev adoption. - adaptive_param_noise_actor = copy(self.actor) - adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' - adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) - self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev) - self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) - - def setup_actor_optimizer(self): - logger.info('setting up actor optimizer') - self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) - actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] - actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) - logger.info(' actor shapes: {}'.format(actor_shapes)) - logger.info(' actor params: {}'.format(actor_nb_params)) - self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) - self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, - beta1=0.9, beta2=0.999, epsilon=1e-08) - - def setup_critic_optimizer(self): - logger.info('setting up critic optimizer') - normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) - self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) - if self.critic_l2_reg > 0.: - critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name] - for var in critic_reg_vars: - logger.info(' regularizing: {}'.format(var.name)) - logger.info(' applying l2 
regularization with {}'.format(self.critic_l2_reg)) - critic_reg = tc.layers.apply_regularization( - tc.layers.l2_regularizer(self.critic_l2_reg), - weights_list=critic_reg_vars - ) - self.critic_loss += critic_reg - critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars] - critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) - logger.info(' critic shapes: {}'.format(critic_shapes)) - logger.info(' critic params: {}'.format(critic_nb_params)) - self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) - self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, - beta1=0.9, beta2=0.999, epsilon=1e-08) - - def setup_popart(self): - # See https://arxiv.org/pdf/1602.07714.pdf for details. - self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') - new_std = self.ret_rms.std - self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') - new_mean = self.ret_rms.mean - - self.renormalize_Q_outputs_op = [] - for vs in [self.critic.output_vars, self.target_critic.output_vars]: - assert len(vs) == 2 - M, b = vs - assert 'kernel' in M.name - assert 'bias' in b.name - assert M.get_shape()[-1] == 1 - assert b.get_shape()[-1] == 1 - self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] - self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)] - - def setup_stats(self): - ops = [] - names = [] - - if self.normalize_returns: - ops += [self.ret_rms.mean, self.ret_rms.std] - names += ['ret_rms_mean', 'ret_rms_std'] - - if self.normalize_observations: - ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] - names += ['obs_rms_mean', 'obs_rms_std'] - - ops += [tf.reduce_mean(self.critic_tf)] - names += ['reference_Q_mean'] - ops += [reduce_std(self.critic_tf)] - names += ['reference_Q_std'] - - ops += [tf.reduce_mean(self.critic_with_actor_tf)] - names += 
['reference_actor_Q_mean'] - ops += [reduce_std(self.critic_with_actor_tf)] - names += ['reference_actor_Q_std'] - - ops += [tf.reduce_mean(self.actor_tf)] - names += ['reference_action_mean'] - ops += [reduce_std(self.actor_tf)] - names += ['reference_action_std'] - - if self.param_noise: - ops += [tf.reduce_mean(self.perturbed_actor_tf)] - names += ['reference_perturbed_action_mean'] - ops += [reduce_std(self.perturbed_actor_tf)] - names += ['reference_perturbed_action_std'] - - self.stats_ops = ops - self.stats_names = names - - def pi(self, obs, apply_noise=True, compute_Q=True): - if self.param_noise is not None and apply_noise: - actor_tf = self.perturbed_actor_tf - else: - actor_tf = self.actor_tf - feed_dict = {self.obs0: [obs]} - if compute_Q: - action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) - else: - action = self.sess.run(actor_tf, feed_dict=feed_dict) - q = None - action = action.flatten() - if self.action_noise is not None and apply_noise: - noise = self.action_noise() - assert noise.shape == action.shape - action += noise - action = np.clip(action, self.action_range[0], self.action_range[1]) - return action, q - - def store_transition(self, obs0, action, reward, obs1, terminal1): - reward *= self.reward_scale - self.memory.append(obs0, action, reward, obs1, terminal1) - if self.normalize_observations: - self.obs_rms.update(np.array([obs0])) - - def train(self): - # Get a batch. 
- batch = self.memory.sample(batch_size=self.batch_size) - - if self.normalize_returns and self.enable_popart: - old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ - self.obs1: batch['obs1'], - self.rewards: batch['rewards'], - self.terminals1: batch['terminals1'].astype('float32'), - }) - self.ret_rms.update(target_Q.flatten()) - self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ - self.old_std : np.array([old_std]), - self.old_mean : np.array([old_mean]), - }) - - # Run sanity check. Disabled by default since it slows down things considerably. - # print('running sanity check') - # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ - # self.obs1: batch['obs1'], - # self.rewards: batch['rewards'], - # self.terminals1: batch['terminals1'].astype('float32'), - # }) - # print(target_Q_new, target_Q, new_mean, new_std) - # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() - else: - target_Q = self.sess.run(self.target_Q, feed_dict={ - self.obs1: batch['obs1'], - self.rewards: batch['rewards'], - self.terminals1: batch['terminals1'].astype('float32'), - }) - - # Get all gradients and perform a synced update. 
- ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] - actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ - self.obs0: batch['obs0'], - self.actions: batch['actions'], - self.critic_target: target_Q, - }) - self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) - self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) - - return critic_loss, actor_loss - - def initialize(self, sess): - self.sess = sess - self.sess.run(tf.global_variables_initializer()) - self.actor_optimizer.sync() - self.critic_optimizer.sync() - self.sess.run(self.target_init_updates) - - def update_target_net(self): - self.sess.run(self.target_soft_updates) - - def get_stats(self): - if self.stats_sample is None: - # Get a sample and keep that fixed for all further computations. - # This allows us to estimate the change in value for the same set of inputs. - self.stats_sample = self.memory.sample(batch_size=self.batch_size) - values = self.sess.run(self.stats_ops, feed_dict={ - self.obs0: self.stats_sample['obs0'], - self.actions: self.stats_sample['actions'], - }) - - names = self.stats_names[:] - assert len(names) == len(values) - stats = dict(zip(names, values)) - - if self.param_noise is not None: - stats = {**stats, **self.param_noise.get_stats()} - - return stats - - def adapt_param_noise(self): - if self.param_noise is None: - return 0. - - # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. 
- batch = self.memory.sample(batch_size=self.batch_size) - self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ - self.param_noise_stddev: self.param_noise.current_stddev, - }) - distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ - self.obs0: batch['obs0'], - self.param_noise_stddev: self.param_noise.current_stddev, - }) - - mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() - self.param_noise.adapt(mean_distance) - return mean_distance - - def reset(self): - # Reset internal state after an episode is complete. - if self.action_noise is not None: - self.action_noise.reset() - if self.param_noise is not None: - self.sess.run(self.perturb_policy_ops, feed_dict={ - self.param_noise_stddev: self.param_noise.current_stddev, - }) diff --git a/baselines/ddpg/models.py b/baselines/ddpg/models.py deleted file mode 100644 index dc5803a035..0000000000 --- a/baselines/ddpg/models.py +++ /dev/null @@ -1,77 +0,0 @@ -import tensorflow as tf -import tensorflow.contrib as tc - - -class Model(object): - def __init__(self, name): - self.name = name - - @property - def vars(self): - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) - - @property - def trainable_vars(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name) - - @property - def perturbable_vars(self): - return [var for var in self.trainable_vars if 'LayerNorm' not in var.name] - - -class Actor(Model): - def __init__(self, nb_actions, name='actor', layer_norm=True): - super(Actor, self).__init__(name=name) - self.nb_actions = nb_actions - self.layer_norm = layer_norm - - def __call__(self, obs, reuse=False): - with tf.variable_scope(self.name) as scope: - if reuse: - scope.reuse_variables() - - x = obs - x = tf.layers.dense(x, 64) - if self.layer_norm: - x = tc.layers.layer_norm(x, center=True, scale=True) - x = tf.nn.relu(x) - - x = tf.layers.dense(x, 64) - if self.layer_norm: - x = tc.layers.layer_norm(x, 
center=True, scale=True) - x = tf.nn.relu(x) - - x = tf.layers.dense(x, self.nb_actions, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) - x = tf.nn.tanh(x) - return x - - -class Critic(Model): - def __init__(self, name='critic', layer_norm=True): - super(Critic, self).__init__(name=name) - self.layer_norm = layer_norm - - def __call__(self, obs, action, reuse=False): - with tf.variable_scope(self.name) as scope: - if reuse: - scope.reuse_variables() - - x = obs - x = tf.layers.dense(x, 64) - if self.layer_norm: - x = tc.layers.layer_norm(x, center=True, scale=True) - x = tf.nn.relu(x) - - x = tf.concat([x, action], axis=-1) - x = tf.layers.dense(x, 64) - if self.layer_norm: - x = tc.layers.layer_norm(x, center=True, scale=True) - x = tf.nn.relu(x) - - x = tf.layers.dense(x, 1, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) - return x - - @property - def output_vars(self): - output_vars = [var for var in self.trainable_vars if 'output' in var.name] - return output_vars diff --git a/baselines/ddpg/noise.py b/baselines/ddpg/noise.py deleted file mode 100644 index c48d0d6a22..0000000000 --- a/baselines/ddpg/noise.py +++ /dev/null @@ -1,67 +0,0 @@ -import numpy as np - - -class AdaptiveParamNoiseSpec(object): - def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01): - self.initial_stddev = initial_stddev - self.desired_action_stddev = desired_action_stddev - self.adoption_coefficient = adoption_coefficient - - self.current_stddev = initial_stddev - - def adapt(self, distance): - if distance > self.desired_action_stddev: - # Decrease stddev. - self.current_stddev /= self.adoption_coefficient - else: - # Increase stddev. 
- self.current_stddev *= self.adoption_coefficient - - def get_stats(self): - stats = { - 'param_noise_stddev': self.current_stddev, - } - return stats - - def __repr__(self): - fmt = 'AdaptiveParamNoiseSpec(initial_stddev={}, desired_action_stddev={}, adoption_coefficient={})' - return fmt.format(self.initial_stddev, self.desired_action_stddev, self.adoption_coefficient) - - -class ActionNoise(object): - def reset(self): - pass - - -class NormalActionNoise(ActionNoise): - def __init__(self, mu, sigma): - self.mu = mu - self.sigma = sigma - - def __call__(self): - return np.random.normal(self.mu, self.sigma) - - def __repr__(self): - return 'NormalActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma) - - -# Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab -class OrnsteinUhlenbeckActionNoise(ActionNoise): - def __init__(self, mu, sigma, theta=.15, dt=1e-2, x0=None): - self.theta = theta - self.mu = mu - self.sigma = sigma - self.dt = dt - self.x0 = x0 - self.reset() - - def __call__(self): - x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape) - self.x_prev = x - return x - - def reset(self): - self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu) - - def __repr__(self): - return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma) diff --git a/baselines/ddpg/training.py b/baselines/ddpg/training.py deleted file mode 100644 index 74a9b8fd1c..0000000000 --- a/baselines/ddpg/training.py +++ /dev/null @@ -1,191 +0,0 @@ -import os -import time -from collections import deque -import pickle - -from baselines.ddpg.ddpg import DDPG -import baselines.common.tf_util as U - -from baselines import logger -import numpy as np -import tensorflow as tf -from mpi4py import MPI - - -def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, - normalize_returns, 
normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, - popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, - tau=0.01, eval_env=None, param_noise_adaption_interval=50): - rank = MPI.COMM_WORLD.Get_rank() - - assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. - max_action = env.action_space.high - logger.info('scaling actions by {} before executing in env'.format(max_action)) - agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, - gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, - batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, - actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, - reward_scale=reward_scale) - logger.info('Using agent with the following configuration:') - logger.info(str(agent.__dict__.items())) - - # Set up logging stuff only for a single worker. - if rank == 0: - saver = tf.train.Saver() - else: - saver = None - - step = 0 - episode = 0 - eval_episode_rewards_history = deque(maxlen=100) - episode_rewards_history = deque(maxlen=100) - with U.single_threaded_session() as sess: - # Prepare everything. - agent.initialize(sess) - sess.graph.finalize() - - agent.reset() - obs = env.reset() - if eval_env is not None: - eval_obs = eval_env.reset() - done = False - episode_reward = 0. - episode_step = 0 - episodes = 0 - t = 0 - - epoch = 0 - start_time = time.time() - - epoch_episode_rewards = [] - epoch_episode_steps = [] - epoch_episode_eval_rewards = [] - epoch_episode_eval_steps = [] - epoch_start_time = time.time() - epoch_actions = [] - epoch_qs = [] - epoch_episodes = 0 - for epoch in range(nb_epochs): - for cycle in range(nb_epoch_cycles): - # Perform rollouts. - for t_rollout in range(nb_rollout_steps): - # Predict next action. 
- action, q = agent.pi(obs, apply_noise=True, compute_Q=True) - assert action.shape == env.action_space.shape - - # Execute next action. - if rank == 0 and render: - env.render() - assert max_action.shape == action.shape - new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) - t += 1 - if rank == 0 and render: - env.render() - episode_reward += r - episode_step += 1 - - # Book-keeping. - epoch_actions.append(action) - epoch_qs.append(q) - agent.store_transition(obs, action, r, new_obs, done) - obs = new_obs - - if done: - # Episode done. - epoch_episode_rewards.append(episode_reward) - episode_rewards_history.append(episode_reward) - epoch_episode_steps.append(episode_step) - episode_reward = 0. - episode_step = 0 - epoch_episodes += 1 - episodes += 1 - - agent.reset() - obs = env.reset() - - # Train. - epoch_actor_losses = [] - epoch_critic_losses = [] - epoch_adaptive_distances = [] - for t_train in range(nb_train_steps): - # Adapt param noise, if necessary. - if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: - distance = agent.adapt_param_noise() - epoch_adaptive_distances.append(distance) - - cl, al = agent.train() - epoch_critic_losses.append(cl) - epoch_actor_losses.append(al) - agent.update_target_net() - - # Evaluate. - eval_episode_rewards = [] - eval_qs = [] - if eval_env is not None: - eval_episode_reward = 0. 
- for t_rollout in range(nb_eval_steps): - eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) - eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) - if render_eval: - eval_env.render() - eval_episode_reward += eval_r - - eval_qs.append(eval_q) - if eval_done: - eval_obs = eval_env.reset() - eval_episode_rewards.append(eval_episode_reward) - eval_episode_rewards_history.append(eval_episode_reward) - eval_episode_reward = 0. - - mpi_size = MPI.COMM_WORLD.Get_size() - # Log stats. - # XXX shouldn't call np.mean on variable length lists - duration = time.time() - start_time - stats = agent.get_stats() - combined_stats = stats.copy() - combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) - combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) - combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) - combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) - combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) - combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) - combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) - combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) - combined_stats['total/duration'] = duration - combined_stats['total/steps_per_second'] = float(t) / float(duration) - combined_stats['total/episodes'] = episodes - combined_stats['rollout/episodes'] = epoch_episodes - combined_stats['rollout/actions_std'] = np.std(epoch_actions) - # Evaluation statistics. 
- if eval_env is not None: - combined_stats['eval/return'] = eval_episode_rewards - combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) - combined_stats['eval/Q'] = eval_qs - combined_stats['eval/episodes'] = len(eval_episode_rewards) - def as_scalar(x): - if isinstance(x, np.ndarray): - assert x.size == 1 - return x[0] - elif np.isscalar(x): - return x - else: - raise ValueError('expected scalar, got %s'%x) - combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) - combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} - - # Total statistics. - combined_stats['total/epochs'] = epoch + 1 - combined_stats['total/steps'] = t - - for key in sorted(combined_stats.keys()): - logger.record_tabular(key, combined_stats[key]) - logger.dump_tabular() - logger.info('') - logdir = logger.get_dir() - if rank == 0 and logdir: - if hasattr(env, 'get_state'): - with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: - pickle.dump(env.get_state(), f) - if eval_env and hasattr(eval_env, 'get_state'): - with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: - pickle.dump(eval_env.get_state(), f) diff --git a/baselines/deepq/README.md b/baselines/deepq/README.md deleted file mode 100644 index 4ea19d5ff2..0000000000 --- a/baselines/deepq/README.md +++ /dev/null @@ -1,52 +0,0 @@ -## If you are curious. - -##### Train a Cartpole agent and watch it play once it converges! - -Here's a list of commands to run to quickly get a working example: - - - - -```bash -# Train model and save the results to cartpole_model.pkl -python -m baselines.deepq.experiments.train_cartpole -# Load the model saved in cartpole_model.pkl and visualize the learned policy -python -m baselines.deepq.experiments.enjoy_cartpole -``` - - -Be sure to check out the source code of [both](experiments/train_cartpole.py) [files](experiments/enjoy_cartpole.py)! 
- -## If you wish to apply DQN to solve a problem. - -Check out our simple agent trained with one stop shop `deepq.learn` function. - -- [baselines/deepq/experiments/train_cartpole.py](experiments/train_cartpole.py) - train a Cartpole agent. -- [baselines/deepq/experiments/train_pong.py](experiments/train_pong.py) - train a Pong agent using convolutional neural networks. - -In particular notice that once `deepq.learn` finishes training it returns `act` function which can be used to select actions in the environment. Once trained you can easily save it and load at later time. For both of the files listed above there are complimentary files `enjoy_cartpole.py` and `enjoy_pong.py` respectively, that load and visualize the learned policy. - -## If you wish to experiment with the algorithm - -##### Check out the examples - - -- [baselines/deepq/experiments/custom_cartpole.py](experiments/custom_cartpole.py) - Cartpole training with more fine grained control over the internals of DQN algorithm. -- [baselines/deepq/experiments/atari/train.py](experiments/atari/train.py) - more robust setup for training at scale. - - -##### Download a pretrained Atari agent - -For some research projects it is sometimes useful to have an already trained agent handy. There's a variety of models to choose from. You can list them all by running: - -```bash -python -m baselines.deepq.experiments.atari.download_model -``` - -Once you pick a model, you can download it and visualize the learned policy. Be sure to pass `--dueling` flag to visualization script when using dueling models. 
- -```bash -python -m baselines.deepq.experiments.atari.download_model --blob model-atari-duel-pong-1 --model-dir /tmp/models -python -m baselines.deepq.experiments.atari.enjoy --model-dir /tmp/models/model-atari-duel-pong-1 --env Pong --dueling - -``` diff --git a/baselines/deepq/__init__.py b/baselines/deepq/__init__.py deleted file mode 100644 index 4472399a51..0000000000 --- a/baselines/deepq/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from baselines.deepq import models # noqa -from baselines.deepq.build_graph import build_act, build_train # noqa -from baselines.deepq.simple import learn, load # noqa -from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa - -def wrap_atari_dqn(env): - from baselines.common.atari_wrappers import wrap_deepmind - return wrap_deepmind(env, frame_stack=True, scale=True) \ No newline at end of file diff --git a/baselines/deepq/build_graph.py b/baselines/deepq/build_graph.py deleted file mode 100644 index e9ff1a41a3..0000000000 --- a/baselines/deepq/build_graph.py +++ /dev/null @@ -1,449 +0,0 @@ -"""Deep Q learning graph - -The functions in this file can are used to create the following functions: - -======= act ======== - - Function to chose an action given an observation - - Parameters - ---------- - observation: object - Observation that can be feed into the output of make_obs_ph - stochastic: bool - if set to False all the actions are always deterministic (default False) - update_eps_ph: float - update epsilon a new value, if negative not update happens - (default: no update) - - Returns - ------- - Tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for - every element of the batch. 
- - -======= act (in case of parameter noise) ======== - - Function to chose an action given an observation - - Parameters - ---------- - observation: object - Observation that can be feed into the output of make_obs_ph - stochastic: bool - if set to False all the actions are always deterministic (default False) - update_eps_ph: float - update epsilon a new value, if negative not update happens - (default: no update) - reset_ph: bool - reset the perturbed policy by sampling a new perturbation - update_param_noise_threshold_ph: float - the desired threshold for the difference between non-perturbed and perturbed policy - update_param_noise_scale_ph: bool - whether or not to update the scale of the noise for the next time it is re-perturbed - - Returns - ------- - Tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for - every element of the batch. - - -======= train ======= - - Function that takes a transition (s,a,r,s') and optimizes Bellman equation's error: - - td_error = Q(s,a) - (r + gamma * max_a' Q(s', a')) - loss = huber_loss[td_error] - - Parameters - ---------- - obs_t: object - a batch of observations - action: np.array - actions that were selected upon seeing obs_t. - dtype must be int32 and shape must be (batch_size,) - reward: np.array - immediate reward attained after executing those actions - dtype must be float32 and shape must be (batch_size,) - obs_tp1: object - observations that followed obs_t - done: np.array - 1 if obs_t was the last observation in the episode and 0 otherwise - obs_tp1 gets ignored, but must be of the valid shape. - dtype must be float32 and shape must be (batch_size,) - weight: np.array - imporance weights for every element of the batch (gradient is multiplied - by the importance weight) dtype must be float32 and shape must be (batch_size,) - - Returns - ------- - td_error: np.array - a list of differences between Q(s,a) and the target in Bellman's equation. 
- dtype is float32 and shape is (batch_size,) - -======= update_target ======== - - copy the parameters from optimized Q function to the target Q function. - In Q learning we actually optimize the following error: - - Q(s,a) - (r + gamma * max_a' Q'(s', a')) - - Where Q' is lagging behind Q to stablize the learning. For example for Atari - - Q' is set to Q once every 10000 updates training steps. - -""" -import tensorflow as tf -import baselines.common.tf_util as U - - -def scope_vars(scope, trainable_only=False): - """ - Get variables inside a scope - The scope can be specified as a string - Parameters - ---------- - scope: str or VariableScope - scope in which the variables reside. - trainable_only: bool - whether or not to return only the variables that were marked as trainable. - Returns - ------- - vars: [tf.Variable] - list of variables in `scope`. - """ - return tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES, - scope=scope if isinstance(scope, str) else scope.name - ) - - -def scope_name(): - """Returns the name of current scope as a string, e.g. deepq/q_func""" - return tf.get_variable_scope().name - - -def absolute_scope_name(relative_scope_name): - """Appends parent scope name to `relative_scope_name`""" - return scope_name() + "/" + relative_scope_name - - -def default_param_noise_filter(var): - if var not in tf.trainable_variables(): - # We never perturb non-trainable vars. - return False - if "fully_connected" in var.name: - # We perturb fully-connected layers. - return True - - # The remaining layers are likely conv or layer norm layers, which we do not wish to - # perturb (in the former case because they only extract features, in the latter case because - # we use them for normalization purposes). If you change your network, you will likely want - # to re-consider which layers to perturb and which to keep untouched. 
- return False - - -def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None): - """Creates the act function: - - Parameters - ---------- - make_obs_ph: str -> tf.placeholder or TfInput - a function that take a name and creates a placeholder of input with that name - q_func: (tf.Variable, int, str, bool) -> tf.Variable - the model that takes the following inputs: - observation_in: object - the output of observation placeholder - num_actions: int - number of actions - scope: str - reuse: bool - should be passed to outer variable scope - and returns a tensor of shape (batch_size, num_actions) with values of every action. - num_actions: int - number of actions. - scope: str or VariableScope - optional scope for variable_scope. - reuse: bool or None - whether or not the variables should be reused. To be able to reuse the scope must be given. - - Returns - ------- - act: (tf.Variable, bool, float) -> tf.Variable - function to select and action given observation. -` See the top of the file for details. 
- """ - with tf.variable_scope(scope, reuse=reuse): - observations_ph = make_obs_ph("observation") - stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") - update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") - - eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) - - q_values = q_func(observations_ph.get(), num_actions, scope="q_func") - deterministic_actions = tf.argmax(q_values, axis=1) - - batch_size = tf.shape(observations_ph.get())[0] - random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64) - chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps - stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) - - output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) - update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) - _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph], - outputs=output_actions, - givens={update_eps_ph: -1.0, stochastic_ph: True}, - updates=[update_eps_expr]) - def act(ob, stochastic=True, update_eps=-1): - return _act(ob, stochastic, update_eps) - return act - - -def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, param_noise_filter_func=None): - """Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905): - - Parameters - ---------- - make_obs_ph: str -> tf.placeholder or TfInput - a function that take a name and creates a placeholder of input with that name - q_func: (tf.Variable, int, str, bool) -> tf.Variable - the model that takes the following inputs: - observation_in: object - the output of observation placeholder - num_actions: int - number of actions - scope: str - reuse: bool - should be passed to outer variable scope - and returns a tensor of shape (batch_size, 
num_actions) with values of every action. - num_actions: int - number of actions. - scope: str or VariableScope - optional scope for variable_scope. - reuse: bool or None - whether or not the variables should be reused. To be able to reuse the scope must be given. - param_noise_filter_func: tf.Variable -> bool - function that decides whether or not a variable should be perturbed. Only applicable - if param_noise is True. If set to None, default_param_noise_filter is used by default. - - Returns - ------- - act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable - function to select and action given observation. -` See the top of the file for details. - """ - if param_noise_filter_func is None: - param_noise_filter_func = default_param_noise_filter - - with tf.variable_scope(scope, reuse=reuse): - observations_ph = make_obs_ph("observation") - stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") - update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") - update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold") - update_param_noise_scale_ph = tf.placeholder(tf.bool, (), name="update_param_noise_scale") - reset_ph = tf.placeholder(tf.bool, (), name="reset") - - eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) - param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01), trainable=False) - param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05), trainable=False) - - # Unmodified Q. - q_values = q_func(observations_ph.get(), num_actions, scope="q_func") - - # Perturbable Q used for the actual rollout. - q_values_perturbed = q_func(observations_ph.get(), num_actions, scope="perturbed_q_func") - # We have to wrap this code into a function due to the way tf.cond() works. 
See - # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for - # a more detailed discussion. - def perturb_vars(original_scope, perturbed_scope): - all_vars = scope_vars(absolute_scope_name(original_scope)) - all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope)) - assert len(all_vars) == len(all_perturbed_vars) - perturb_ops = [] - for var, perturbed_var in zip(all_vars, all_perturbed_vars): - if param_noise_filter_func(perturbed_var): - # Perturb this variable. - op = tf.assign(perturbed_var, var + tf.random_normal(shape=tf.shape(var), mean=0., stddev=param_noise_scale)) - else: - # Do not perturb, just assign. - op = tf.assign(perturbed_var, var) - perturb_ops.append(op) - assert len(perturb_ops) == len(all_vars) - return tf.group(*perturb_ops) - - # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy - # of the network and measures the effect of that perturbation in action space. If the perturbation - # is too big, reduce scale of perturbation, otherwise increase. - q_values_adaptive = q_func(observations_ph.get(), num_actions, scope="adaptive_q_func") - perturb_for_adaption = perturb_vars(original_scope="q_func", perturbed_scope="adaptive_q_func") - kl = tf.reduce_sum(tf.nn.softmax(q_values) * (tf.log(tf.nn.softmax(q_values)) - tf.log(tf.nn.softmax(q_values_adaptive))), axis=-1) - mean_kl = tf.reduce_mean(kl) - def update_scale(): - with tf.control_dependencies([perturb_for_adaption]): - update_scale_expr = tf.cond(mean_kl < param_noise_threshold, - lambda: param_noise_scale.assign(param_noise_scale * 1.01), - lambda: param_noise_scale.assign(param_noise_scale / 1.01), - ) - return update_scale_expr - - # Functionality to update the threshold for parameter space noise. 
- update_param_noise_threshold_expr = param_noise_threshold.assign(tf.cond(update_param_noise_threshold_ph >= 0, - lambda: update_param_noise_threshold_ph, lambda: param_noise_threshold)) - - # Put everything together. - deterministic_actions = tf.argmax(q_values_perturbed, axis=1) - batch_size = tf.shape(observations_ph.get())[0] - random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64) - chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps - stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) - - output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) - update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) - updates = [ - update_eps_expr, - tf.cond(reset_ph, lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), lambda: tf.group(*[])), - tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)), - update_param_noise_threshold_expr, - ] - _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph], - outputs=output_actions, - givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, update_param_noise_scale_ph: False}, - updates=updates) - def act(ob, reset, update_param_noise_threshold, update_param_noise_scale, stochastic=True, update_eps=-1): - return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale) - return act - - -def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, - double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): - """Creates the train function: - - Parameters - ---------- - make_obs_ph: str -> tf.placeholder or 
TfInput - a function that takes a name and creates a placeholder of input with that name - q_func: (tf.Variable, int, str, bool) -> tf.Variable - the model that takes the following inputs: - observation_in: object - the output of observation placeholder - num_actions: int - number of actions - scope: str - reuse: bool - should be passed to outer variable scope - and returns a tensor of shape (batch_size, num_actions) with values of every action. - num_actions: int - number of actions - reuse: bool - whether or not to reuse the graph variables - optimizer: tf.train.Optimizer - optimizer to use for the Q-learning objective. - grad_norm_clipping: float or None - clip gradient norms to this value. If None no clipping is performed. - gamma: float - discount rate. - double_q: bool - if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). - In general it is a good idea to keep it enabled. - scope: str or VariableScope - optional scope for variable_scope. - reuse: bool or None - whether or not the variables should be reused. To be able to reuse the scope must be given. - param_noise: bool - whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) - param_noise_filter_func: tf.Variable -> bool - function that decides whether or not a variable should be perturbed. Only applicable - if param_noise is True. If set to None, default_param_noise_filter is used by default. - - Returns - ------- - act: (tf.Variable, bool, float) -> tf.Variable - function to select and action given observation. -` See the top of the file for details. - train: (object, np.array, np.array, object, np.array, np.array) -> np.array - optimize the error in Bellman's equation. -` See the top of the file for details. - update_target: () -> () - copy the parameters from optimized Q function to the target Q function. -` See the top of the file for details. - debug: {str: function} - a bunch of functions to print debug data like q_values. 
- """ - if param_noise: - act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, - param_noise_filter_func=param_noise_filter_func) - else: - act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) - - with tf.variable_scope(scope, reuse=reuse): - # set up placeholders - obs_t_input = make_obs_ph("obs_t") - act_t_ph = tf.placeholder(tf.int32, [None], name="action") - rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") - obs_tp1_input = make_obs_ph("obs_tp1") - done_mask_ph = tf.placeholder(tf.float32, [None], name="done") - importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") - - # q network evaluation - q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act - q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") - - # target q network evalution - q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") - target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") - - # q scores for actions which we know were selected in the given state. 
- q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) - - # compute estimate of best possible value starting from state at t + 1 - if double_q: - q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) - q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) - q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) - else: - q_tp1_best = tf.reduce_max(q_tp1, 1) - q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best - - # compute RHS of bellman equation - q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked - - # compute the error (potentially clipped) - td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) - errors = U.huber_loss(td_error) - weighted_error = tf.reduce_mean(importance_weights_ph * errors) - - # compute optimization op (potentially with gradient clipping) - if grad_norm_clipping is not None: - gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) - for i, (grad, var) in enumerate(gradients): - if grad is not None: - gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) - optimize_expr = optimizer.apply_gradients(gradients) - else: - optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) - - # update_target_fn will be called periodically to copy Q network to target Q network - update_target_expr = [] - for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), - sorted(target_q_func_vars, key=lambda v: v.name)): - update_target_expr.append(var_target.assign(var)) - update_target_expr = tf.group(*update_target_expr) - - # Create callable functions - train = U.function( - inputs=[ - obs_t_input, - act_t_ph, - rew_t_ph, - obs_tp1_input, - done_mask_ph, - importance_weights_ph - ], - outputs=td_error, - updates=[optimize_expr] - ) - update_target = U.function([], [], updates=[update_target_expr]) - - q_values = U.function([obs_t_input], q_t) - - return act_f, train, 
update_target, {'q_values': q_values} diff --git a/baselines/deepq/experiments/__init__.py b/baselines/deepq/experiments/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/baselines/deepq/experiments/enjoy_cartpole.py b/baselines/deepq/experiments/enjoy_cartpole.py deleted file mode 100644 index 1c6176bac3..0000000000 --- a/baselines/deepq/experiments/enjoy_cartpole.py +++ /dev/null @@ -1,21 +0,0 @@ -import gym - -from baselines import deepq - - -def main(): - env = gym.make("CartPole-v0") - act = deepq.load("cartpole_model.pkl") - - while True: - obs, done = env.reset(), False - episode_rew = 0 - while not done: - env.render() - obs, rew, done, _ = env.step(act(obs[None])[0]) - episode_rew += rew - print("Episode reward", episode_rew) - - -if __name__ == '__main__': - main() diff --git a/baselines/deepq/experiments/enjoy_mountaincar.py b/baselines/deepq/experiments/enjoy_mountaincar.py deleted file mode 100644 index 8bced8c0f8..0000000000 --- a/baselines/deepq/experiments/enjoy_mountaincar.py +++ /dev/null @@ -1,21 +0,0 @@ -import gym - -from baselines import deepq - - -def main(): - env = gym.make("MountainCar-v0") - act = deepq.load("mountaincar_model.pkl") - - while True: - obs, done = env.reset(), False - episode_rew = 0 - while not done: - env.render() - obs, rew, done, _ = env.step(act(obs[None])[0]) - episode_rew += rew - print("Episode reward", episode_rew) - - -if __name__ == '__main__': - main() diff --git a/baselines/deepq/experiments/train_cartpole.py b/baselines/deepq/experiments/train_cartpole.py deleted file mode 100644 index a50c2428f9..0000000000 --- a/baselines/deepq/experiments/train_cartpole.py +++ /dev/null @@ -1,31 +0,0 @@ -import gym - -from baselines import deepq - - -def callback(lcl, _glb): - # stop training if reward exceeds 199 - is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199 - return is_solved - - -def main(): - env = gym.make("CartPole-v0") - model = deepq.models.mlp([64]) 
- act = deepq.learn( - env, - q_func=model, - lr=1e-3, - max_timesteps=100000, - buffer_size=50000, - exploration_fraction=0.1, - exploration_final_eps=0.02, - print_freq=10, - callback=callback - ) - print("Saving model to cartpole_model.pkl") - act.save("cartpole_model.pkl") - - -if __name__ == '__main__': - main() diff --git a/baselines/deepq/experiments/train_mountaincar.py b/baselines/deepq/experiments/train_mountaincar.py deleted file mode 100644 index 061967d760..0000000000 --- a/baselines/deepq/experiments/train_mountaincar.py +++ /dev/null @@ -1,26 +0,0 @@ -import gym - -from baselines import deepq - - -def main(): - env = gym.make("MountainCar-v0") - # Enabling layer_norm here is import for parameter space noise! - model = deepq.models.mlp([64], layer_norm=True) - act = deepq.learn( - env, - q_func=model, - lr=1e-3, - max_timesteps=100000, - buffer_size=50000, - exploration_fraction=0.1, - exploration_final_eps=0.1, - print_freq=10, - param_noise=True - ) - print("Saving model to mountaincar_model.pkl") - act.save("mountaincar_model.pkl") - - -if __name__ == '__main__': - main() diff --git a/baselines/deepq/simple.py b/baselines/deepq/simple.py deleted file mode 100644 index 4bad145503..0000000000 --- a/baselines/deepq/simple.py +++ /dev/null @@ -1,306 +0,0 @@ -import os -import tempfile - -import tensorflow as tf -import zipfile -import cloudpickle -import numpy as np - -import baselines.common.tf_util as U -from baselines.common.tf_util import load_state, save_state -from baselines import logger -from baselines.common.schedules import LinearSchedule -from baselines.common.input import observation_input - -from baselines import deepq -from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer -from baselines.deepq.utils import ObservationInput - - -class ActWrapper(object): - def __init__(self, act, act_params): - self._act = act - self._act_params = act_params - - @staticmethod - def load(path): - with open(path, "rb") as f: - 
model_data, act_params = cloudpickle.load(f) - act = deepq.build_act(**act_params) - sess = tf.Session() - sess.__enter__() - with tempfile.TemporaryDirectory() as td: - arc_path = os.path.join(td, "packed.zip") - with open(arc_path, "wb") as f: - f.write(model_data) - - zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td) - load_state(os.path.join(td, "model")) - - return ActWrapper(act, act_params) - - def __call__(self, *args, **kwargs): - return self._act(*args, **kwargs) - - def save(self, path=None): - """Save model to a pickle located at `path`""" - if path is None: - path = os.path.join(logger.get_dir(), "model.pkl") - - with tempfile.TemporaryDirectory() as td: - save_state(os.path.join(td, "model")) - arc_name = os.path.join(td, "packed.zip") - with zipfile.ZipFile(arc_name, 'w') as zipf: - for root, dirs, files in os.walk(td): - for fname in files: - file_path = os.path.join(root, fname) - if file_path != arc_name: - zipf.write(file_path, os.path.relpath(file_path, td)) - with open(arc_name, "rb") as f: - model_data = f.read() - with open(path, "wb") as f: - cloudpickle.dump((model_data, self._act_params), f) - - -def load(path): - """Load act function that was returned by learn function. - - Parameters - ---------- - path: str - path to the act function pickle - - Returns - ------- - act: ActWrapper - function that takes a batch of observations - and returns actions. - """ - return ActWrapper.load(path) - - -def learn(env, - q_func, - lr=5e-4, - max_timesteps=100000, - buffer_size=50000, - exploration_fraction=0.1, - exploration_final_eps=0.02, - train_freq=1, - batch_size=32, - print_freq=100, - checkpoint_freq=10000, - checkpoint_path=None, - learning_starts=1000, - gamma=1.0, - target_network_update_freq=500, - prioritized_replay=False, - prioritized_replay_alpha=0.6, - prioritized_replay_beta0=0.4, - prioritized_replay_beta_iters=None, - prioritized_replay_eps=1e-6, - param_noise=False, - callback=None): - """Train a deepq model. 
- - Parameters - ------- - env: gym.Env - environment to train on - q_func: (tf.Variable, int, str, bool) -> tf.Variable - the model that takes the following inputs: - observation_in: object - the output of observation placeholder - num_actions: int - number of actions - scope: str - reuse: bool - should be passed to outer variable scope - and returns a tensor of shape (batch_size, num_actions) with values of every action. - lr: float - learning rate for adam optimizer - max_timesteps: int - number of env steps to optimizer for - buffer_size: int - size of the replay buffer - exploration_fraction: float - fraction of entire training period over which the exploration rate is annealed - exploration_final_eps: float - final value of random action probability - train_freq: int - update the model every `train_freq` steps. - set to None to disable printing - batch_size: int - size of a batched sampled from replay buffer for training - print_freq: int - how often to print out training progress - set to None to disable printing - checkpoint_freq: int - how often to save the model. This is so that the best version is restored - at the end of the training. If you do not wish to restore the best version at - the end of the training set this variable to None. - learning_starts: int - how many steps of the model to collect transitions for before learning starts - gamma: float - discount factor - target_network_update_freq: int - update the target network every `target_network_update_freq` steps. - prioritized_replay: True - if True prioritized replay buffer will be used. - prioritized_replay_alpha: float - alpha parameter for prioritized replay buffer - prioritized_replay_beta0: float - initial value of beta for prioritized replay buffer - prioritized_replay_beta_iters: int - number of iterations over which beta will be annealed from initial value - to 1.0. If set to None equals to max_timesteps. 
- prioritized_replay_eps: float - epsilon to add to the TD errors when updating priorities. - callback: (locals, globals) -> None - function called at every steps with state of the algorithm. - If callback returns true training stops. - - Returns - ------- - act: ActWrapper - Wrapper over act function. Adds ability to save it and load it. - See header of baselines/deepq/categorical.py for details on the act function. - """ - # Create all the functions necessary to train the model - - sess = tf.Session() - sess.__enter__() - - # capture the shape outside the closure so that the env object is not serialized - # by cloudpickle when serializing make_obs_ph - - def make_obs_ph(name): - return ObservationInput(env.observation_space, name=name) - - act, train, update_target, debug = deepq.build_train( - make_obs_ph=make_obs_ph, - q_func=q_func, - num_actions=env.action_space.n, - optimizer=tf.train.AdamOptimizer(learning_rate=lr), - gamma=gamma, - grad_norm_clipping=10, - param_noise=param_noise - ) - - act_params = { - 'make_obs_ph': make_obs_ph, - 'q_func': q_func, - 'num_actions': env.action_space.n, - } - - act = ActWrapper(act, act_params) - - # Create the replay buffer - if prioritized_replay: - replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) - if prioritized_replay_beta_iters is None: - prioritized_replay_beta_iters = max_timesteps - beta_schedule = LinearSchedule(prioritized_replay_beta_iters, - initial_p=prioritized_replay_beta0, - final_p=1.0) - else: - replay_buffer = ReplayBuffer(buffer_size) - beta_schedule = None - # Create the schedule for exploration starting from 1. - exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), - initial_p=1.0, - final_p=exploration_final_eps) - - # Initialize the parameters and copy them to the target network. 
- U.initialize() - update_target() - - episode_rewards = [0.0] - saved_mean_reward = None - obs = env.reset() - reset = True - - with tempfile.TemporaryDirectory() as td: - td = checkpoint_path or td - - model_file = os.path.join(td, "model") - model_saved = False - if tf.train.latest_checkpoint(td) is not None: - load_state(model_file) - logger.log('Loaded model from {}'.format(model_file)) - model_saved = True - - for t in range(max_timesteps): - if callback is not None: - if callback(locals(), globals()): - break - # Take action and update exploration to the newest value - kwargs = {} - if not param_noise: - update_eps = exploration.value(t) - update_param_noise_threshold = 0. - else: - update_eps = 0. - # Compute the threshold such that the KL divergence between perturbed and non-perturbed - # policy is comparable to eps-greedy exploration with eps = exploration.value(t). - # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 - # for detailed explanation. - update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) - kwargs['reset'] = reset - kwargs['update_param_noise_threshold'] = update_param_noise_threshold - kwargs['update_param_noise_scale'] = True - action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] - env_action = action - reset = False - new_obs, rew, done, _ = env.step(env_action) - # Store transition in the replay buffer. - replay_buffer.add(obs, action, rew, new_obs, float(done)) - obs = new_obs - - episode_rewards[-1] += rew - if done: - obs = env.reset() - episode_rewards.append(0.0) - reset = True - - if t > learning_starts and t % train_freq == 0: - # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
- if prioritized_replay: - experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) - (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience - else: - obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) - weights, batch_idxes = np.ones_like(rewards), None - td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) - if prioritized_replay: - new_priorities = np.abs(td_errors) + prioritized_replay_eps - replay_buffer.update_priorities(batch_idxes, new_priorities) - - if t > learning_starts and t % target_network_update_freq == 0: - # Update target network periodically. - update_target() - - mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) - num_episodes = len(episode_rewards) - if done and print_freq is not None and len(episode_rewards) % print_freq == 0: - logger.record_tabular("steps", t) - logger.record_tabular("episodes", num_episodes) - logger.record_tabular("mean 100 episode reward", mean_100ep_reward) - logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) - logger.dump_tabular() - - if (checkpoint_freq is not None and t > learning_starts and - num_episodes > 100 and t % checkpoint_freq == 0): - if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: - if print_freq is not None: - logger.log("Saving model due to mean reward increase: {} -> {}".format( - saved_mean_reward, mean_100ep_reward)) - save_state(model_file) - model_saved = True - saved_mean_reward = mean_100ep_reward - if model_saved: - if print_freq is not None: - logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) - load_state(model_file) - - return act diff --git a/baselines/deepq/test_identity.py b/baselines/deepq/test_identity.py deleted file mode 100644 index ef57e70b45..0000000000 --- a/baselines/deepq/test_identity.py +++ /dev/null @@ -1,43 +0,0 @@ -import tensorflow as tf -import random - -from baselines import deepq -from 
baselines.common.identity_env import IdentityEnv - - -def test_identity(): - - with tf.Graph().as_default(): - env = IdentityEnv(10) - random.seed(0) - - tf.set_random_seed(0) - - param_noise = False - model = deepq.models.mlp([32]) - act = deepq.learn( - env, - q_func=model, - lr=1e-3, - max_timesteps=10000, - buffer_size=50000, - exploration_fraction=0.1, - exploration_final_eps=0.02, - print_freq=10, - param_noise=param_noise, - ) - - tf.set_random_seed(0) - - N_TRIALS = 1000 - sum_rew = 0 - obs = env.reset() - for i in range(N_TRIALS): - obs, rew, done, _ = env.step(act([obs])) - sum_rew += rew - - assert sum_rew > 0.9 * N_TRIALS - - -if __name__ == '__main__': - test_identity() diff --git a/baselines/gail/README.md b/baselines/gail/README.md deleted file mode 100644 index 2a8941fdac..0000000000 --- a/baselines/gail/README.md +++ /dev/null @@ -1,52 +0,0 @@ -# Generative Adversarial Imitation Learning (GAIL) - -- Original paper: https://arxiv.org/abs/1606.03476 - -For results benchmarking on MuJoCo, please navigate to [here](result/gail-result.md) - -## If you want to train an imitation learning agent - -### Step 1: Download expert data - -Download the expert data into `./data`, [download link](https://drive.google.com/drive/folders/1h3H4AY_ZBx08hz-Ct0Nxxus-V1melu1U?usp=sharing) - -### Step 2: Run GAIL - -Run with single thread: - -```bash -python -m baselines.gail.run_mujoco -``` - -Run with multiple threads: - -```bash -mpirun -np 16 python -m baselines.gail.run_mujoco -``` - -See help (`-h`) for more options. - -#### In case you want to run Behavior Cloning (BC) - -```bash -python -m baselines.gail.behavior_clone -``` - -See help (`-h`) for more options. - - -## Contributing - -Bug reports and pull requests are welcome on GitHub at https://github.com/openai/baselines/pulls. 
- -## Maintainers - -- Yuan-Hong Liao, andrewliao11_at_gmail_dot_com -- Ryan Julian, ryanjulian_at_gmail_dot_com - -## Others - -Thanks to the open source: - -- @openai/imitation -- @carpedm20/deep-rl-tensorflow diff --git a/baselines/gail/__init__.py b/baselines/gail/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/baselines/gail/behavior_clone.py b/baselines/gail/behavior_clone.py deleted file mode 100644 index 82f65ecf19..0000000000 --- a/baselines/gail/behavior_clone.py +++ /dev/null @@ -1,124 +0,0 @@ -''' -The code is used to train BC imitator, or pretrained GAIL imitator -''' - -import argparse -import tempfile -import os.path as osp -import gym -import logging -from tqdm import tqdm - -import tensorflow as tf - -from baselines.gail import mlp_policy -from baselines import bench -from baselines import logger -from baselines.common import set_global_seeds, tf_util as U -from baselines.common.misc_util import boolean_flag -from baselines.common.mpi_adam import MpiAdam -from baselines.gail.run_mujoco import runner -from baselines.gail.dataset.mujoco_dset import Mujoco_Dset - - -def argsparser(): - parser = argparse.ArgumentParser("Tensorflow Implementation of Behavior Cloning") - parser.add_argument('--env_id', help='environment ID', default='Hopper-v1') - parser.add_argument('--seed', help='RNG seed', type=int, default=0) - parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz') - parser.add_argument('--checkpoint_dir', help='the directory to save model', default='checkpoint') - parser.add_argument('--log_dir', help='the directory to save log file', default='log') - # Mujoco Dataset Configuration - parser.add_argument('--traj_limitation', type=int, default=-1) - # Network Configuration (Using MLP Policy) - parser.add_argument('--policy_hidden_size', type=int, default=100) - # for evaluatation - boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic 
policy to evaluate') - boolean_flag(parser, 'save_sample', default=False, help='save the trajectories or not') - parser.add_argument('--BC_max_iter', help='Max iteration for training BC', type=int, default=1e5) - return parser.parse_args() - - -def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4, - adam_epsilon=1e-5, optim_stepsize=3e-4, - ckpt_dir=None, log_dir=None, task_name=None, - verbose=False): - - val_per_iter = int(max_iters/10) - ob_space = env.observation_space - ac_space = env.action_space - pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy - # placeholder - ob = U.get_placeholder_cached(name="ob") - ac = pi.pdtype.sample_placeholder([None]) - stochastic = U.get_placeholder_cached(name="stochastic") - loss = tf.reduce_mean(tf.square(ac-pi.ac)) - var_list = pi.get_trainable_variables() - adam = MpiAdam(var_list, epsilon=adam_epsilon) - lossandgrad = U.function([ob, ac, stochastic], [loss]+[U.flatgrad(loss, var_list)]) - - U.initialize() - adam.sync() - logger.log("Pretraining with Behavior Cloning...") - for iter_so_far in tqdm(range(int(max_iters))): - ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train') - train_loss, g = lossandgrad(ob_expert, ac_expert, True) - adam.update(g, optim_stepsize) - if verbose and iter_so_far % val_per_iter == 0: - ob_expert, ac_expert = dataset.get_next_batch(-1, 'val') - val_loss, _ = lossandgrad(ob_expert, ac_expert, True) - logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss)) - - if ckpt_dir is None: - savedir_fname = tempfile.TemporaryDirectory().name - else: - savedir_fname = osp.join(ckpt_dir, task_name) - U.save_state(savedir_fname, var_list=pi.get_variables()) - return savedir_fname - - -def get_task_name(args): - task_name = 'BC' - task_name += '.{}'.format(args.env_id.split("-")[0]) - task_name += '.traj_limitation_{}'.format(args.traj_limitation) - task_name += ".seed_{}".format(args.seed) - return task_name - - 
-def main(args): - U.make_session(num_cpu=1).__enter__() - set_global_seeds(args.seed) - env = gym.make(args.env_id) - - def policy_fn(name, ob_space, ac_space, reuse=False): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) - env = bench.Monitor(env, logger.get_dir() and - osp.join(logger.get_dir(), "monitor.json")) - env.seed(args.seed) - gym.logger.setLevel(logging.WARN) - task_name = get_task_name(args) - args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name) - args.log_dir = osp.join(args.log_dir, task_name) - dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) - savedir_fname = learn(env, - policy_fn, - dataset, - max_iters=args.BC_max_iter, - ckpt_dir=args.checkpoint_dir, - log_dir=args.log_dir, - task_name=task_name, - verbose=True) - avg_len, avg_ret = runner(env, - policy_fn, - savedir_fname, - timesteps_per_batch=1024, - number_trajs=10, - stochastic_policy=args.stochastic_policy, - save=args.save_sample, - reuse=True) - - -if __name__ == '__main__': - args = argsparser() - main(args) diff --git a/baselines/gail/dataset/__init__.py b/baselines/gail/dataset/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/baselines/gail/mlp_policy.py b/baselines/gail/mlp_policy.py deleted file mode 100644 index d8df120719..0000000000 --- a/baselines/gail/mlp_policy.py +++ /dev/null @@ -1,75 +0,0 @@ -''' -from baselines/ppo1/mlp_policy.py and add simple modification -(1) add reuse argument -(2) cache the `stochastic` placeholder -''' -import tensorflow as tf -import gym - -import baselines.common.tf_util as U -from baselines.common.mpi_running_mean_std import RunningMeanStd -from baselines.common.distributions import make_pdtype -from baselines.acktr.utils import dense - - -class MlpPolicy(object): - recurrent = False - - def __init__(self, name, reuse=False, *args, **kwargs): - with 
tf.variable_scope(name): - if reuse: - tf.get_variable_scope().reuse_variables() - self._init(*args, **kwargs) - self.scope = tf.get_variable_scope().name - - def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): - assert isinstance(ob_space, gym.spaces.Box) - - self.pdtype = pdtype = make_pdtype(ac_space) - sequence_length = None - - ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) - - with tf.variable_scope("obfilter"): - self.ob_rms = RunningMeanStd(shape=ob_space.shape) - - obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) - last_out = obz - for i in range(num_hid_layers): - last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0))) - self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0] - - last_out = obz - for i in range(num_hid_layers): - last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0))) - - if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): - mean = dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01)) - logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) - pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) - else: - pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) - - self.pd = pdtype.pdfromflat(pdparam) - - self.state_in = [] - self.state_out = [] - - # change for BC - stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) - ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) - self.ac = ac - self._act = U.function([stochastic, ob], [ac, self.vpred]) - - def act(self, stochastic, ob): - ac1, vpred1 = self._act(stochastic, ob[None]) - return ac1[0], vpred1[0] - - def get_variables(self): - return 
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - - def get_trainable_variables(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - - def get_initial_state(self): - return [] diff --git a/baselines/gail/run_mujoco.py b/baselines/gail/run_mujoco.py deleted file mode 100644 index 379f7f8cb8..0000000000 --- a/baselines/gail/run_mujoco.py +++ /dev/null @@ -1,239 +0,0 @@ -''' -Disclaimer: this code is highly based on trpo_mpi at @openai/baselines and @openai/imitation -''' - -import argparse -import os.path as osp -import logging -from mpi4py import MPI -from tqdm import tqdm - -import numpy as np -import gym - -from baselines.gail import mlp_policy -from baselines.common import set_global_seeds, tf_util as U -from baselines.common.misc_util import boolean_flag -from baselines import bench -from baselines import logger -from baselines.gail.dataset.mujoco_dset import Mujoco_Dset -from baselines.gail.adversary import TransitionClassifier - - -def argsparser(): - parser = argparse.ArgumentParser("Tensorflow Implementation of GAIL") - parser.add_argument('--env_id', help='environment ID', default='Hopper-v2') - parser.add_argument('--seed', help='RNG seed', type=int, default=0) - parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz') - parser.add_argument('--checkpoint_dir', help='the directory to save model', default='checkpoint') - parser.add_argument('--log_dir', help='the directory to save log file', default='log') - parser.add_argument('--load_model_path', help='if provided, load the model', type=str, default=None) - # Task - parser.add_argument('--task', type=str, choices=['train', 'evaluate', 'sample'], default='train') - # for evaluatation - boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate') - boolean_flag(parser, 'save_sample', default=False, help='save the trajectories or not') - # Mujoco Dataset Configuration - 
parser.add_argument('--traj_limitation', type=int, default=-1) - # Optimization Configuration - parser.add_argument('--g_step', help='number of steps to train policy in each epoch', type=int, default=3) - parser.add_argument('--d_step', help='number of steps to train discriminator in each epoch', type=int, default=1) - # Network Configuration (Using MLP Policy) - parser.add_argument('--policy_hidden_size', type=int, default=100) - parser.add_argument('--adversary_hidden_size', type=int, default=100) - # Algorithms Configuration - parser.add_argument('--algo', type=str, choices=['trpo', 'ppo'], default='trpo') - parser.add_argument('--max_kl', type=float, default=0.01) - parser.add_argument('--policy_entcoeff', help='entropy coefficiency of policy', type=float, default=0) - parser.add_argument('--adversary_entcoeff', help='entropy coefficiency of discriminator', type=float, default=1e-3) - # Traing Configuration - parser.add_argument('--save_per_iter', help='save model every xx iterations', type=int, default=100) - parser.add_argument('--num_timesteps', help='number of timesteps per episode', type=int, default=5e6) - # Behavior Cloning - boolean_flag(parser, 'pretrained', default=False, help='Use BC to pretrain') - parser.add_argument('--BC_max_iter', help='Max iteration for training BC', type=int, default=1e4) - return parser.parse_args() - - -def get_task_name(args): - task_name = args.algo + "_gail." - if args.pretrained: - task_name += "with_pretrained." - if args.traj_limitation != np.inf: - task_name += "transition_limitation_%d." 
% args.traj_limitation - task_name += args.env_id.split("-")[0] - task_name = task_name + ".g_step_" + str(args.g_step) + ".d_step_" + str(args.d_step) + \ - ".policy_entcoeff_" + str(args.policy_entcoeff) + ".adversary_entcoeff_" + str(args.adversary_entcoeff) - task_name += ".seed_" + str(args.seed) - return task_name - - -def main(args): - U.make_session(num_cpu=1).__enter__() - set_global_seeds(args.seed) - env = gym.make(args.env_id) - - def policy_fn(name, ob_space, ac_space, reuse=False): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) - env = bench.Monitor(env, logger.get_dir() and - osp.join(logger.get_dir(), "monitor.json")) - env.seed(args.seed) - gym.logger.setLevel(logging.WARN) - task_name = get_task_name(args) - args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name) - args.log_dir = osp.join(args.log_dir, task_name) - - if args.task == 'train': - dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) - reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff) - train(env, - args.seed, - policy_fn, - reward_giver, - dataset, - args.algo, - args.g_step, - args.d_step, - args.policy_entcoeff, - args.num_timesteps, - args.save_per_iter, - args.checkpoint_dir, - args.log_dir, - args.pretrained, - args.BC_max_iter, - task_name - ) - elif args.task == 'evaluate': - runner(env, - policy_fn, - args.load_model_path, - timesteps_per_batch=1024, - number_trajs=10, - stochastic_policy=args.stochastic_policy, - save=args.save_sample - ) - else: - raise NotImplementedError - env.close() - - -def train(env, seed, policy_fn, reward_giver, dataset, algo, - g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter, - checkpoint_dir, log_dir, pretrained, BC_max_iter, task_name=None): - - pretrained_weight = None - if pretrained and (BC_max_iter > 0): - # Pretrain with behavior 
cloning - from baselines.gail import behavior_clone - pretrained_weight = behavior_clone.learn(env, policy_fn, dataset, - max_iters=BC_max_iter) - - if algo == 'trpo': - from baselines.gail import trpo_mpi - # Set up for MPI seed - rank = MPI.COMM_WORLD.Get_rank() - if rank != 0: - logger.set_level(logger.DISABLED) - workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() - set_global_seeds(workerseed) - env.seed(workerseed) - trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank, - pretrained=pretrained, pretrained_weight=pretrained_weight, - g_step=g_step, d_step=d_step, - entcoeff=policy_entcoeff, - max_timesteps=num_timesteps, - ckpt_dir=checkpoint_dir, log_dir=log_dir, - save_per_iter=save_per_iter, - timesteps_per_batch=1024, - max_kl=0.01, cg_iters=10, cg_damping=0.1, - gamma=0.995, lam=0.97, - vf_iters=5, vf_stepsize=1e-3, - task_name=task_name) - else: - raise NotImplementedError - - -def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs, - stochastic_policy, save=False, reuse=False): - - # Setup network - # ---------------------------------------- - ob_space = env.observation_space - ac_space = env.action_space - pi = policy_func("pi", ob_space, ac_space, reuse=reuse) - U.initialize() - # Prepare for rollouts - # ---------------------------------------- - U.load_state(load_model_path) - - obs_list = [] - acs_list = [] - len_list = [] - ret_list = [] - for _ in tqdm(range(number_trajs)): - traj = traj_1_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy) - obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret'] - obs_list.append(obs) - acs_list.append(acs) - len_list.append(ep_len) - ret_list.append(ep_ret) - if stochastic_policy: - print('stochastic policy:') - else: - print('deterministic policy:') - if save: - filename = load_model_path.split('/')[-1] + '.' 
+ env.spec.id - np.savez(filename, obs=np.array(obs_list), acs=np.array(acs_list), - lens=np.array(len_list), rets=np.array(ret_list)) - avg_len = sum(len_list)/len(len_list) - avg_ret = sum(ret_list)/len(ret_list) - print("Average length:", avg_len) - print("Average return:", avg_ret) - return avg_len, avg_ret - - -# Sample one trajectory (until trajectory end) -def traj_1_generator(pi, env, horizon, stochastic): - - t = 0 - ac = env.action_space.sample() # not used, just so we have the datatype - new = True # marks if we're on first timestep of an episode - - ob = env.reset() - cur_ep_ret = 0 # return in current episode - cur_ep_len = 0 # len of current episode - - # Initialize history arrays - obs = [] - rews = [] - news = [] - acs = [] - - while True: - ac, vpred = pi.act(stochastic, ob) - obs.append(ob) - news.append(new) - acs.append(ac) - - ob, rew, new, _ = env.step(ac) - rews.append(rew) - - cur_ep_ret += rew - cur_ep_len += 1 - if new or t >= horizon: - break - t += 1 - - obs = np.array(obs) - rews = np.array(rews) - news = np.array(news) - acs = np.array(acs) - traj = {"ob": obs, "rew": rews, "new": news, "ac": acs, - "ep_ret": cur_ep_ret, "ep_len": cur_ep_len} - return traj - - -if __name__ == '__main__': - args = argsparser() - main(args) diff --git a/baselines/gail/statistics.py b/baselines/gail/statistics.py deleted file mode 100644 index 5f7c57e449..0000000000 --- a/baselines/gail/statistics.py +++ /dev/null @@ -1,45 +0,0 @@ -''' -This code is highly based on https://github.com/carpedm20/deep-rl-tensorflow/blob/master/agents/statistic.py -''' - -import tensorflow as tf -import numpy as np - -import baselines.common.tf_util as U - - -class stats(): - - def __init__(self, scalar_keys=[], histogram_keys=[]): - self.scalar_keys = scalar_keys - self.histogram_keys = histogram_keys - self.scalar_summaries = [] - self.scalar_summaries_ph = [] - self.histogram_summaries_ph = [] - self.histogram_summaries = [] - with tf.variable_scope('summary'): - for k in 
scalar_keys: - ph = tf.placeholder('float32', None, name=k+'.scalar.summary') - sm = tf.summary.scalar(k+'.scalar.summary', ph) - self.scalar_summaries_ph.append(ph) - self.scalar_summaries.append(sm) - for k in histogram_keys: - ph = tf.placeholder('float32', None, name=k+'.histogram.summary') - sm = tf.summary.scalar(k+'.histogram.summary', ph) - self.histogram_summaries_ph.append(ph) - self.histogram_summaries.append(sm) - - self.summaries = tf.summary.merge(self.scalar_summaries+self.histogram_summaries) - - def add_all_summary(self, writer, values, iter): - # Note that the order of the incoming ```values``` should be the same as the that of the - # ```scalar_keys``` given in ```__init__``` - if np.sum(np.isnan(values)+0) != 0: - return - sess = U.get_session() - keys = self.scalar_summaries_ph + self.histogram_summaries_ph - feed_dict = {} - for k, v in zip(keys, values): - feed_dict.update({k: v}) - summaries_str = sess.run(self.summaries, feed_dict) - writer.add_summary(summaries_str, iter) diff --git a/baselines/gail/trpo_mpi.py b/baselines/gail/trpo_mpi.py deleted file mode 100644 index 615a4326a7..0000000000 --- a/baselines/gail/trpo_mpi.py +++ /dev/null @@ -1,354 +0,0 @@ -''' -Disclaimer: The trpo part highly rely on trpo_mpi at @openai/baselines -''' - -import time -import os -from contextlib import contextmanager -from mpi4py import MPI -from collections import deque - -import tensorflow as tf -import numpy as np - -import baselines.common.tf_util as U -from baselines.common import explained_variance, zipsame, dataset, fmt_row -from baselines import logger -from baselines.common import colorize -from baselines.common.mpi_adam import MpiAdam -from baselines.common.cg import cg -from baselines.gail.statistics import stats - - -def traj_segment_generator(pi, env, reward_giver, horizon, stochastic): - - # Initialize state variables - t = 0 - ac = env.action_space.sample() - new = True - rew = 0.0 - true_rew = 0.0 - ob = env.reset() - - cur_ep_ret = 0 - 
cur_ep_len = 0 - cur_ep_true_ret = 0 - ep_true_rets = [] - ep_rets = [] - ep_lens = [] - - # Initialize history arrays - obs = np.array([ob for _ in range(horizon)]) - true_rews = np.zeros(horizon, 'float32') - rews = np.zeros(horizon, 'float32') - vpreds = np.zeros(horizon, 'float32') - news = np.zeros(horizon, 'int32') - acs = np.array([ac for _ in range(horizon)]) - prevacs = acs.copy() - - while True: - prevac = ac - ac, vpred = pi.act(stochastic, ob) - # Slight weirdness here because we need value function at time T - # before returning segment [0, T-1] so we get the correct - # terminal value - if t > 0 and t % horizon == 0: - yield {"ob": obs, "rew": rews, "vpred": vpreds, "new": news, - "ac": acs, "prevac": prevacs, "nextvpred": vpred * (1 - new), - "ep_rets": ep_rets, "ep_lens": ep_lens, "ep_true_rets": ep_true_rets} - _, vpred = pi.act(stochastic, ob) - # Be careful!!! if you change the downstream algorithm to aggregate - # several of these batches, then be sure to do a deepcopy - ep_rets = [] - ep_true_rets = [] - ep_lens = [] - i = t % horizon - obs[i] = ob - vpreds[i] = vpred - news[i] = new - acs[i] = ac - prevacs[i] = prevac - - rew = reward_giver.get_reward(ob, ac) - ob, true_rew, new, _ = env.step(ac) - rews[i] = rew - true_rews[i] = true_rew - - cur_ep_ret += rew - cur_ep_true_ret += true_rew - cur_ep_len += 1 - if new: - ep_rets.append(cur_ep_ret) - ep_true_rets.append(cur_ep_true_ret) - ep_lens.append(cur_ep_len) - cur_ep_ret = 0 - cur_ep_true_ret = 0 - cur_ep_len = 0 - ob = env.reset() - t += 1 - - -def add_vtarg_and_adv(seg, gamma, lam): - new = np.append(seg["new"], 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1 - vpred = np.append(seg["vpred"], seg["nextvpred"]) - T = len(seg["rew"]) - seg["adv"] = gaelam = np.empty(T, 'float32') - rew = seg["rew"] - lastgaelam = 0 - for t in reversed(range(T)): - nonterminal = 1-new[t+1] - delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t] - gaelam[t] = 
lastgaelam = delta + gamma * lam * nonterminal * lastgaelam - seg["tdlamret"] = seg["adv"] + seg["vpred"] - - -def learn(env, policy_func, reward_giver, expert_dataset, rank, - pretrained, pretrained_weight, *, - g_step, d_step, entcoeff, save_per_iter, - ckpt_dir, log_dir, timesteps_per_batch, task_name, - gamma, lam, - max_kl, cg_iters, cg_damping=1e-2, - vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3, - max_timesteps=0, max_episodes=0, max_iters=0, - callback=None - ): - - nworkers = MPI.COMM_WORLD.Get_size() - rank = MPI.COMM_WORLD.Get_rank() - np.set_printoptions(precision=3) - # Setup losses and stuff - # ---------------------------------------- - ob_space = env.observation_space - ac_space = env.action_space - pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None)) - oldpi = policy_func("oldpi", ob_space, ac_space) - atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) - ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return - - ob = U.get_placeholder_cached(name="ob") - ac = pi.pdtype.sample_placeholder([None]) - - kloldnew = oldpi.pd.kl(pi.pd) - ent = pi.pd.entropy() - meankl = tf.reduce_mean(kloldnew) - meanent = tf.reduce_mean(ent) - entbonus = entcoeff * meanent - - vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) - - ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold - surrgain = tf.reduce_mean(ratio * atarg) - - optimgain = surrgain + entbonus - losses = [optimgain, meankl, entbonus, surrgain, meanent] - loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] - - dist = meankl - - all_var_list = pi.get_trainable_variables() - var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")] - vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] - assert len(var_list) == len(vf_var_list) + 1 - d_adam = MpiAdam(reward_giver.get_trainable_variables()) - vfadam = 
MpiAdam(vf_var_list) - - get_flat = U.GetFlat(var_list) - set_from_flat = U.SetFromFlat(var_list) - klgrads = tf.gradients(dist, var_list) - flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") - shapes = [var.get_shape().as_list() for var in var_list] - start = 0 - tangents = [] - for shape in shapes: - sz = U.intprod(shape) - tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) - start += sz - gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 - fvp = U.flatgrad(gvp, var_list) - - assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) - for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) - compute_losses = U.function([ob, ac, atarg], losses) - compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) - compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) - compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) - - @contextmanager - def timed(msg): - if rank == 0: - print(colorize(msg, color='magenta')) - tstart = time.time() - yield - print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) - else: - yield - - def allmean(x): - assert isinstance(x, np.ndarray) - out = np.empty_like(x) - MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) - out /= nworkers - return out - - U.initialize() - th_init = get_flat() - MPI.COMM_WORLD.Bcast(th_init, root=0) - set_from_flat(th_init) - d_adam.sync() - vfadam.sync() - if rank == 0: - print("Init param sum", th_init.sum(), flush=True) - - # Prepare for rollouts - # ---------------------------------------- - seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True) - - episodes_so_far = 0 - timesteps_so_far = 0 - iters_so_far = 0 - tstart = time.time() - lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths - rewbuffer = deque(maxlen=40) # rolling buffer for 
episode rewards - true_rewbuffer = deque(maxlen=40) - - assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 - - g_loss_stats = stats(loss_names) - d_loss_stats = stats(reward_giver.loss_name) - ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) - # if provide pretrained weight - if pretrained_weight is not None: - U.load_state(pretrained_weight, var_list=pi.get_variables()) - - while True: - if callback: callback(locals(), globals()) - if max_timesteps and timesteps_so_far >= max_timesteps: - break - elif max_episodes and episodes_so_far >= max_episodes: - break - elif max_iters and iters_so_far >= max_iters: - break - - # Save model - if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None: - fname = os.path.join(ckpt_dir, task_name) - os.makedirs(os.path.dirname(fname), exist_ok=True) - saver = tf.train.Saver() - saver.save(tf.get_default_session(), fname) - - logger.log("********** Iteration %i ************" % iters_so_far) - - def fisher_vector_product(p): - return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p - # ------------------ Update G ------------------ - logger.log("Optimizing Policy...") - for _ in range(g_step): - with timed("sampling"): - seg = seg_gen.__next__() - add_vtarg_and_adv(seg, gamma, lam) - # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) - ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] - vpredbefore = seg["vpred"] # predicted value function before udpate - atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate - - if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy - - args = seg["ob"], seg["ac"], atarg - fvpargs = [arr[::5] for arr in args] - - assign_old_eq_new() # set old parameter values to new parameter values - with timed("computegrad"): - *lossbefore, g = compute_lossandgrad(*args) - lossbefore = allmean(np.array(lossbefore)) - g = allmean(g) - if 
np.allclose(g, 0): - logger.log("Got zero gradient. not updating") - else: - with timed("cg"): - stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) - assert np.isfinite(stepdir).all() - shs = .5*stepdir.dot(fisher_vector_product(stepdir)) - lm = np.sqrt(shs / max_kl) - # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) - fullstep = stepdir / lm - expectedimprove = g.dot(fullstep) - surrbefore = lossbefore[0] - stepsize = 1.0 - thbefore = get_flat() - for _ in range(10): - thnew = thbefore + fullstep * stepsize - set_from_flat(thnew) - meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) - improve = surr - surrbefore - logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) - if not np.isfinite(meanlosses).all(): - logger.log("Got non-finite value of losses -- bad!") - elif kl > max_kl * 1.5: - logger.log("violated KL constraint. shrinking step.") - elif improve < 0: - logger.log("surrogate didn't improve. shrinking step.") - else: - logger.log("Stepsize OK!") - break - stepsize *= .5 - else: - logger.log("couldn't compute a good step") - set_from_flat(thbefore) - if nworkers > 1 and iters_so_far % 20 == 0: - paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples - assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) - with timed("vf"): - for _ in range(vf_iters): - for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), - include_final_partial_batch=False, batch_size=128): - if hasattr(pi, "ob_rms"): - pi.ob_rms.update(mbob) # update running mean/std for policy - g = allmean(compute_vflossandgrad(mbob, mbret)) - vfadam.update(g, vf_stepsize) - - g_losses = meanlosses - for (lossname, lossval) in zip(loss_names, meanlosses): - logger.record_tabular(lossname, lossval) - logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) - # ------------------ Update D ------------------ - logger.log("Optimizing 
Discriminator...") - logger.log(fmt_row(13, reward_giver.loss_name)) - ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob)) - batch_size = len(ob) // d_step - d_losses = [] # list of tuples, each of which gives the loss for a minibatch - for ob_batch, ac_batch in dataset.iterbatches((ob, ac), - include_final_partial_batch=False, - batch_size=batch_size): - ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) - # update running mean/std for reward_giver - if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0)) - *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) - d_adam.update(allmean(g), d_stepsize) - d_losses.append(newlosses) - logger.log(fmt_row(13, np.mean(d_losses, axis=0))) - - lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local values - listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples - lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) - true_rewbuffer.extend(true_rets) - lenbuffer.extend(lens) - rewbuffer.extend(rews) - - logger.record_tabular("EpLenMean", np.mean(lenbuffer)) - logger.record_tabular("EpRewMean", np.mean(rewbuffer)) - logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) - logger.record_tabular("EpThisIter", len(lens)) - episodes_so_far += len(lens) - timesteps_so_far += sum(lens) - iters_so_far += 1 - - logger.record_tabular("EpisodesSoFar", episodes_so_far) - logger.record_tabular("TimestepsSoFar", timesteps_so_far) - logger.record_tabular("TimeElapsed", time.time() - tstart) - - if rank == 0: - logger.dump_tabular() - - -def flatten_lists(listoflists): - return [el for list_ in listoflists for el in list_] diff --git a/baselines/her/README.md b/baselines/her/README.md deleted file mode 100644 index 6bd02b4bbf..0000000000 --- a/baselines/her/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Hindsight Experience Replay -For details on Hindsight Experience Replay (HER), please 
read the [paper](https://arxiv.org/abs/1707.01495). - -## How to use Hindsight Experience Replay - -### Getting started -Training an agent is very simple: -```bash -python -m baselines.her.experiment.train -``` -This will train a DDPG+HER agent on the `FetchReach` environment. -You should see the success rate go up quickly to `1.0`, which means that the agent achieves the -desired goal in 100% of the cases. -The training script logs other diagnostics as well and pickles the best policy so far (w.r.t. to its test success rate), -the latest policy, and, if enabled, a history of policies every K epochs. - -To inspect what the agent has learned, use the play script: -```bash -python -m baselines.her.experiment.play /path/to/an/experiment/policy_best.pkl -``` -You can try it right now with the results of the training step (the script prints out the path for you). -This should visualize the current policy for 10 episodes and will also print statistics. - - -### Reproducing results -In order to reproduce the results from [Plappert et al. (2018)](https://arxiv.org/abs/1802.09464), run the following command: -```bash -python -m baselines.her.experiment.train --num_cpu 19 -``` -This will require a machine with sufficient amount of physical CPU cores. In our experiments, -we used [Azure's D15v2 instances](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes), -which have 20 physical cores. We only scheduled the experiment on 19 of those to leave some head-room on the system. 
diff --git a/baselines/her/__init__.py b/baselines/her/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/baselines/her/actor_critic.py b/baselines/her/actor_critic.py deleted file mode 100644 index d5443fe0c3..0000000000 --- a/baselines/her/actor_critic.py +++ /dev/null @@ -1,44 +0,0 @@ -import tensorflow as tf -from baselines.her.util import store_args, nn - - -class ActorCritic: - @store_args - def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers, - **kwargs): - """The actor-critic network and related training code. - - Args: - inputs_tf (dict of tensors): all necessary inputs for the network: the - observation (o), the goal (g), and the action (u) - dimo (int): the dimension of the observations - dimg (int): the dimension of the goals - dimu (int): the dimension of the actions - max_u (float): the maximum magnitude of actions; action outputs will be scaled - accordingly - o_stats (baselines.her.Normalizer): normalizer for observations - g_stats (baselines.her.Normalizer): normalizer for goals - hidden (int): number of hidden units that should be used in hidden layers - layers (int): number of hidden layers - """ - self.o_tf = inputs_tf['o'] - self.g_tf = inputs_tf['g'] - self.u_tf = inputs_tf['u'] - - # Prepare inputs for actor and critic. - o = self.o_stats.normalize(self.o_tf) - g = self.g_stats.normalize(self.g_tf) - input_pi = tf.concat(axis=1, values=[o, g]) # for actor - - # Networks. 
- with tf.variable_scope('pi'): - self.pi_tf = self.max_u * tf.tanh(nn( - input_pi, [self.hidden] * self.layers + [self.dimu])) - with tf.variable_scope('Q'): - # for policy training - input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u]) - self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1]) - # for critic training - input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u]) - self._input_Q = input_Q # exposed for tests - self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True) diff --git a/baselines/her/ddpg.py b/baselines/her/ddpg.py deleted file mode 100644 index 92165de958..0000000000 --- a/baselines/her/ddpg.py +++ /dev/null @@ -1,340 +0,0 @@ -from collections import OrderedDict - -import numpy as np -import tensorflow as tf -from tensorflow.contrib.staging import StagingArea - -from baselines import logger -from baselines.her.util import ( - import_function, store_args, flatten_grads, transitions_in_episode_batch) -from baselines.her.normalizer import Normalizer -from baselines.her.replay_buffer import ReplayBuffer -from baselines.common.mpi_adam import MpiAdam - - -def dims_to_shapes(input_dims): - return {key: tuple([val]) if val > 0 else tuple() for key, val in input_dims.items()} - - -class DDPG(object): - @store_args - def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, - Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, - rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, - sample_transitions, gamma, reuse=False, **kwargs): - """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). 
- - Args: - input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the - actions (u) - buffer_size (int): number of transitions that are stored in the replay buffer - hidden (int): number of units in the hidden layers - layers (int): number of hidden layers - network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') - polyak (float): coefficient for Polyak-averaging of the target network - batch_size (int): batch size for training - Q_lr (float): learning rate for the Q (critic) network - pi_lr (float): learning rate for the pi (actor) network - norm_eps (float): a small value used in the normalizer to avoid numerical instabilities - norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] - max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] - action_l2 (float): coefficient for L2 penalty on the actions - clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] - scope (str): the scope used for the TensorFlow graph - T (int): the time horizon for rollouts - rollout_batch_size (int): number of parallel rollouts per DDPG agent - subtract_goals (function): function that subtracts goals from each other - relative_goals (boolean): whether or not relative goals should be fed into the network - clip_pos_returns (boolean): whether or not positive returns should be clipped - clip_return (float): clip returns to be in [-clip_return, clip_return] - sample_transitions (function) function that samples from the replay buffer - gamma (float): gamma used for Q learning updates - reuse (boolean): whether or not the networks should be reused - """ - if self.clip_return is None: - self.clip_return = np.inf - - self.create_actor_critic = import_function(self.network_class) - - input_shapes = dims_to_shapes(self.input_dims) - self.dimo = self.input_dims['o'] - self.dimg = self.input_dims['g'] - self.dimu = self.input_dims['u'] - - # Prepare 
staging area for feeding data to the model. - stage_shapes = OrderedDict() - for key in sorted(self.input_dims.keys()): - if key.startswith('info_'): - continue - stage_shapes[key] = (None, *input_shapes[key]) - for key in ['o', 'g']: - stage_shapes[key + '_2'] = stage_shapes[key] - stage_shapes['r'] = (None,) - self.stage_shapes = stage_shapes - - # Create network. - with tf.variable_scope(self.scope): - self.staging_tf = StagingArea( - dtypes=[tf.float32 for _ in self.stage_shapes.keys()], - shapes=list(self.stage_shapes.values())) - self.buffer_ph_tf = [ - tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()] - self.stage_op = self.staging_tf.put(self.buffer_ph_tf) - - self._create_network(reuse=reuse) - - # Configure the replay buffer. - buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key]) - for key, val in input_shapes.items()} - buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) - buffer_shapes['ag'] = (self.T+1, self.dimg) - - buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size - self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) - - def _random_action(self, n): - return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) - - def _preprocess_og(self, o, ag, g): - if self.relative_goals: - g_shape = g.shape - g = g.reshape(-1, self.dimg) - ag = ag.reshape(-1, self.dimg) - g = self.subtract_goals(g, ag) - g = g.reshape(*g_shape) - o = np.clip(o, -self.clip_obs, self.clip_obs) - g = np.clip(g, -self.clip_obs, self.clip_obs) - return o, g - - def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, - compute_Q=False): - o, g = self._preprocess_og(o, ag, g) - policy = self.target if use_target_net else self.main - # values to compute - vals = [policy.pi_tf] - if compute_Q: - vals += [policy.Q_pi_tf] - # feed - feed = { - policy.o_tf: o.reshape(-1, self.dimo), - policy.g_tf: g.reshape(-1, 
self.dimg), - policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) - } - - ret = self.sess.run(vals, feed_dict=feed) - # action postprocessing - u = ret[0] - noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise - u += noise - u = np.clip(u, -self.max_u, self.max_u) - u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy - if u.shape[0] == 1: - u = u[0] - u = u.copy() - ret[0] = u - - if len(ret) == 1: - return ret[0] - else: - return ret - - def store_episode(self, episode_batch, update_stats=True): - """ - episode_batch: array of batch_size x (T or T+1) x dim_key - 'o' is of size T+1, others are of size T - """ - - self.buffer.store_episode(episode_batch) - - if update_stats: - # add transitions to normalizer - episode_batch['o_2'] = episode_batch['o'][:, 1:, :] - episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] - num_normalizing_transitions = transitions_in_episode_batch(episode_batch) - transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) - - o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] - transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) - # No need to preprocess the o_2 and g_2 since this is only used for stats - - self.o_stats.update(transitions['o']) - self.g_stats.update(transitions['g']) - - self.o_stats.recompute_stats() - self.g_stats.recompute_stats() - - def get_current_buffer_size(self): - return self.buffer.get_current_size() - - def _sync_optimizers(self): - self.Q_adam.sync() - self.pi_adam.sync() - - def _grads(self): - # Avoid feed_dict here for performance! 
- critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ - self.Q_loss_tf, - self.main.Q_pi_tf, - self.Q_grad_tf, - self.pi_grad_tf - ]) - return critic_loss, actor_loss, Q_grad, pi_grad - - def _update(self, Q_grad, pi_grad): - self.Q_adam.update(Q_grad, self.Q_lr) - self.pi_adam.update(pi_grad, self.pi_lr) - - def sample_batch(self): - transitions = self.buffer.sample(self.batch_size) - o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] - ag, ag_2 = transitions['ag'], transitions['ag_2'] - transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) - transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g) - - transitions_batch = [transitions[key] for key in self.stage_shapes.keys()] - return transitions_batch - - def stage_batch(self, batch=None): - if batch is None: - batch = self.sample_batch() - assert len(self.buffer_ph_tf) == len(batch) - self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) - - def train(self, stage=True): - if stage: - self.stage_batch() - critic_loss, actor_loss, Q_grad, pi_grad = self._grads() - self._update(Q_grad, pi_grad) - return critic_loss, actor_loss - - def _init_target_net(self): - self.sess.run(self.init_target_net_op) - - def update_target_net(self): - self.sess.run(self.update_target_net_op) - - def clear_buffer(self): - self.buffer.clear_buffer() - - def _vars(self, scope): - res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) - assert len(res) > 0 - return res - - def _global_vars(self, scope): - res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) - return res - - def _create_network(self, reuse=False): - logger.info("Creating a DDPG agent with action space %d x %s..." 
% (self.dimu, self.max_u)) - - self.sess = tf.get_default_session() - if self.sess is None: - self.sess = tf.InteractiveSession() - - # running averages - with tf.variable_scope('o_stats') as vs: - if reuse: - vs.reuse_variables() - self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) - with tf.variable_scope('g_stats') as vs: - if reuse: - vs.reuse_variables() - self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) - - # mini-batch sampling. - batch = self.staging_tf.get() - batch_tf = OrderedDict([(key, batch[i]) - for i, key in enumerate(self.stage_shapes.keys())]) - batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) - - # networks - with tf.variable_scope('main') as vs: - if reuse: - vs.reuse_variables() - self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) - vs.reuse_variables() - with tf.variable_scope('target') as vs: - if reuse: - vs.reuse_variables() - target_batch_tf = batch_tf.copy() - target_batch_tf['o'] = batch_tf['o_2'] - target_batch_tf['g'] = batch_tf['g_2'] - self.target = self.create_actor_critic( - target_batch_tf, net_type='target', **self.__dict__) - vs.reuse_variables() - assert len(self._vars("main")) == len(self._vars("target")) - - # loss functions - target_Q_pi_tf = self.target.Q_pi_tf - clip_range = (-self.clip_return, 0. 
if self.clip_pos_returns else np.inf) - target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) - self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) - self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) - self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) - Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) - pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) - assert len(self._vars('main/Q')) == len(Q_grads_tf) - assert len(self._vars('main/pi')) == len(pi_grads_tf) - self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) - self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) - self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) - self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) - - # optimizers - self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) - self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) - - # polyak averaging - self.main_vars = self._vars('main/Q') + self._vars('main/pi') - self.target_vars = self._vars('target/Q') + self._vars('target/pi') - self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') - self.init_target_net_op = list( - map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) - self.update_target_net_op = list( - map(lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) - - # initialize all variables - tf.variables_initializer(self._global_vars('')).run() - self._sync_optimizers() - self._init_target_net() - - def logs(self, prefix=''): - logs = [] - logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] - logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] - logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] - logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] - - if prefix is not '' and not prefix.endswith('/'): - return [(prefix + '/' + key, val) for key, val in logs] - else: - return logs - - def __getstate__(self): - """Our policies can be loaded from pkl, but after unpickling you cannot continue training. - """ - excluded_subnames = ['_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', - 'main', 'target', 'lock', 'env', 'sample_transitions', - 'stage_shapes', 'create_actor_critic'] - - state = {k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames])} - state['buffer_size'] = self.buffer_size - state['tf'] = self.sess.run([x for x in self._global_vars('') if 'buffer' not in x.name]) - return state - - def __setstate__(self, state): - if 'sample_transitions' not in state: - # We don't need this for playing the policy. 
- state['sample_transitions'] = None - - self.__init__(**state) - # set up stats (they are overwritten in __init__) - for k, v in state.items(): - if k[-6:] == '_stats': - self.__dict__[k] = v - # load TF variables - vars = [x for x in self._global_vars('') if 'buffer' not in x.name] - assert(len(vars) == len(state["tf"])) - node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] - self.sess.run(node) diff --git a/baselines/her/experiment/__init__.py b/baselines/her/experiment/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/baselines/her/rollout.py b/baselines/her/rollout.py deleted file mode 100644 index 5beba69dd7..0000000000 --- a/baselines/her/rollout.py +++ /dev/null @@ -1,188 +0,0 @@ -from collections import deque - -import numpy as np -import pickle -from mujoco_py import MujocoException - -from baselines.her.util import convert_episode_to_batch_major, store_args - - -class RolloutWorker: - - @store_args - def __init__(self, make_env, policy, dims, logger, T, rollout_batch_size=1, - exploit=False, use_target_net=False, compute_Q=False, noise_eps=0, - random_eps=0, history_len=100, render=False, **kwargs): - """Rollout worker generates experience by interacting with one or many environments. - - Args: - make_env (function): a factory function that creates a new instance of the environment - when called - policy (object): the policy that is used to act - dims (dict of ints): the dimensions for observations (o), goals (g), and actions (u) - logger (object): the logger that is used by the rollout worker - rollout_batch_size (int): the number of parallel rollouts that should be used - exploit (boolean): whether or not to exploit, i.e. 
to act optimally according to the - current policy without any exploration - use_target_net (boolean): whether or not to use the target net for rollouts - compute_Q (boolean): whether or not to compute the Q values alongside the actions - noise_eps (float): scale of the additive Gaussian noise - random_eps (float): probability of selecting a completely random action - history_len (int): length of history for statistics smoothing - render (boolean): whether or not to render the rollouts - """ - self.envs = [make_env() for _ in range(rollout_batch_size)] - assert self.T > 0 - - self.info_keys = [key.replace('info_', '') for key in dims.keys() if key.startswith('info_')] - - self.success_history = deque(maxlen=history_len) - self.Q_history = deque(maxlen=history_len) - - self.n_episodes = 0 - self.g = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # goals - self.initial_o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations - self.initial_ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals - self.reset_all_rollouts() - self.clear_history() - - def reset_rollout(self, i): - """Resets the `i`-th rollout environment, re-samples a new goal, and updates the `initial_o` - and `g` arrays accordingly. - """ - obs = self.envs[i].reset() - self.initial_o[i] = obs['observation'] - self.initial_ag[i] = obs['achieved_goal'] - self.g[i] = obs['desired_goal'] - - def reset_all_rollouts(self): - """Resets all `rollout_batch_size` rollout workers. - """ - for i in range(self.rollout_batch_size): - self.reset_rollout(i) - - def generate_rollouts(self): - """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current - policy acting on it accordingly. 
- """ - self.reset_all_rollouts() - - # compute observations - o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations - ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals - o[:] = self.initial_o - ag[:] = self.initial_ag - - # generate episodes - obs, achieved_goals, acts, goals, successes = [], [], [], [], [] - info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys] - Qs = [] - for t in range(self.T): - policy_output = self.policy.get_actions( - o, ag, self.g, - compute_Q=self.compute_Q, - noise_eps=self.noise_eps if not self.exploit else 0., - random_eps=self.random_eps if not self.exploit else 0., - use_target_net=self.use_target_net) - - if self.compute_Q: - u, Q = policy_output - Qs.append(Q) - else: - u = policy_output - - if u.ndim == 1: - # The non-batched case should still have a reasonable shape. - u = u.reshape(1, -1) - - o_new = np.empty((self.rollout_batch_size, self.dims['o'])) - ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) - success = np.zeros(self.rollout_batch_size) - # compute new states and observations - for i in range(self.rollout_batch_size): - try: - # We fully ignore the reward here because it will have to be re-computed - # for HER. - curr_o_new, _, _, info = self.envs[i].step(u[i]) - if 'is_success' in info: - success[i] = info['is_success'] - o_new[i] = curr_o_new['observation'] - ag_new[i] = curr_o_new['achieved_goal'] - for idx, key in enumerate(self.info_keys): - info_values[idx][t, i] = info[key] - if self.render: - self.envs[i].render() - except MujocoException as e: - return self.generate_rollouts() - - if np.isnan(o_new).any(): - self.logger.warning('NaN caught during rollout generation. 
Trying again...') - self.reset_all_rollouts() - return self.generate_rollouts() - - obs.append(o.copy()) - achieved_goals.append(ag.copy()) - successes.append(success.copy()) - acts.append(u.copy()) - goals.append(self.g.copy()) - o[...] = o_new - ag[...] = ag_new - obs.append(o.copy()) - achieved_goals.append(ag.copy()) - self.initial_o[:] = o - - episode = dict(o=obs, - u=acts, - g=goals, - ag=achieved_goals) - for key, value in zip(self.info_keys, info_values): - episode['info_{}'.format(key)] = value - - # stats - successful = np.array(successes)[-1, :] - assert successful.shape == (self.rollout_batch_size,) - success_rate = np.mean(successful) - self.success_history.append(success_rate) - if self.compute_Q: - self.Q_history.append(np.mean(Qs)) - self.n_episodes += self.rollout_batch_size - - return convert_episode_to_batch_major(episode) - - def clear_history(self): - """Clears all histories that are used for statistics - """ - self.success_history.clear() - self.Q_history.clear() - - def current_success_rate(self): - return np.mean(self.success_history) - - def current_mean_Q(self): - return np.mean(self.Q_history) - - def save_policy(self, path): - """Pickles the current policy for later inspection. - """ - with open(path, 'wb') as f: - pickle.dump(self.policy, f) - - def logs(self, prefix='worker'): - """Generates a dictionary that contains all collected statistics. - """ - logs = [] - logs += [('success_rate', np.mean(self.success_history))] - if self.compute_Q: - logs += [('mean_Q', np.mean(self.Q_history))] - logs += [('episode', self.n_episodes)] - - if prefix is not '' and not prefix.endswith('/'): - return [(prefix + '/' + key, val) for key, val in logs] - else: - return logs - - def seed(self, seed): - """Seeds each environment with a distinct seed derived from the passed in global seed. 
- """ - for idx, env in enumerate(self.envs): - env.seed(seed + 1000 * idx) diff --git a/baselines/her/util.py b/baselines/her/util.py deleted file mode 100644 index d637aa69f1..0000000000 --- a/baselines/her/util.py +++ /dev/null @@ -1,140 +0,0 @@ -import os -import subprocess -import sys -import importlib -import inspect -import functools - -import tensorflow as tf -import numpy as np - -from baselines.common import tf_util as U - - -def store_args(method): - """Stores provided method args as instance attributes. - """ - argspec = inspect.getfullargspec(method) - defaults = {} - if argspec.defaults is not None: - defaults = dict( - zip(argspec.args[-len(argspec.defaults):], argspec.defaults)) - if argspec.kwonlydefaults is not None: - defaults.update(argspec.kwonlydefaults) - arg_names = argspec.args[1:] - - @functools.wraps(method) - def wrapper(*positional_args, **keyword_args): - self = positional_args[0] - # Get default arg values - args = defaults.copy() - # Add provided arg values - for name, value in zip(arg_names, positional_args[1:]): - args[name] = value - args.update(keyword_args) - self.__dict__.update(args) - return method(*positional_args, **keyword_args) - - return wrapper - - -def import_function(spec): - """Import a function identified by a string like "pkg.module:fn_name". - """ - mod_name, fn_name = spec.split(':') - module = importlib.import_module(mod_name) - fn = getattr(module, fn_name) - return fn - - -def flatten_grads(var_list, grads): - """Flattens a variables and their gradients. 
- """ - return tf.concat([tf.reshape(grad, [U.numel(v)]) - for (v, grad) in zip(var_list, grads)], 0) - - -def nn(input, layers_sizes, reuse=None, flatten=False, name=""): - """Creates a simple neural network - """ - for i, size in enumerate(layers_sizes): - activation = tf.nn.relu if i < len(layers_sizes) - 1 else None - input = tf.layers.dense(inputs=input, - units=size, - kernel_initializer=tf.contrib.layers.xavier_initializer(), - reuse=reuse, - name=name + '_' + str(i)) - if activation: - input = activation(input) - if flatten: - assert layers_sizes[-1] == 1 - input = tf.reshape(input, [-1]) - return input - - -def install_mpi_excepthook(): - import sys - from mpi4py import MPI - old_hook = sys.excepthook - - def new_hook(a, b, c): - old_hook(a, b, c) - sys.stdout.flush() - sys.stderr.flush() - MPI.COMM_WORLD.Abort() - sys.excepthook = new_hook - - -def mpi_fork(n, extra_mpi_args=[]): - """Re-launches the current script with workers - Returns "parent" for original parent, "child" for MPI children - """ - if n <= 1: - return "child" - if os.getenv("IN_MPI") is None: - env = os.environ.copy() - env.update( - MKL_NUM_THREADS="1", - OMP_NUM_THREADS="1", - IN_MPI="1" - ) - # "-bind-to core" is crucial for good performance - args = ["mpirun", "-np", str(n)] + \ - extra_mpi_args + \ - [sys.executable] - - args += sys.argv - subprocess.check_call(args, env=env) - return "parent" - else: - install_mpi_excepthook() - return "child" - - -def convert_episode_to_batch_major(episode): - """Converts an episode to have the batch dimension in the major (first) - dimension. - """ - episode_batch = {} - for key in episode.keys(): - val = np.array(episode[key]).copy() - # make inputs batch-major instead of time-major - episode_batch[key] = val.swapaxes(0, 1) - - return episode_batch - - -def transitions_in_episode_batch(episode_batch): - """Number of transitions in a given episode batch. 
- """ - shape = episode_batch['u'].shape - return shape[0] * shape[1] - - -def reshape_for_broadcasting(source, target): - """Reshapes a tensor (source) to have the correct shape and dtype of the target - before broadcasting it with MPI. - """ - dim = len(target.get_shape()) - shape = ([1] * (dim - 1)) + [-1] - return tf.reshape(tf.cast(source, target.dtype), shape) diff --git a/baselines/logger.py b/baselines/logger.py deleted file mode 100644 index 0abad0e8c5..0000000000 --- a/baselines/logger.py +++ /dev/null @@ -1,475 +0,0 @@ -import os -import sys -import shutil -import os.path as osp -import json -import time -import datetime -import tempfile -from collections import defaultdict - -DEBUG = 10 -INFO = 20 -WARN = 30 -ERROR = 40 - -DISABLED = 50 - -class KVWriter(object): - def writekvs(self, kvs): - raise NotImplementedError - -class SeqWriter(object): - def writeseq(self, seq): - raise NotImplementedError - -class HumanOutputFormat(KVWriter, SeqWriter): - def __init__(self, filename_or_file): - if isinstance(filename_or_file, str): - self.file = open(filename_or_file, 'wt') - self.own_file = True - else: - assert hasattr(filename_or_file, 'read'), 'expected file or str, got %s'%filename_or_file - self.file = filename_or_file - self.own_file = False - - def writekvs(self, kvs): - # Create strings for printing - key2str = {} - for (key, val) in sorted(kvs.items()): - if isinstance(val, float): - valstr = '%-8.3g' % (val,) - else: - valstr = str(val) - key2str[self._truncate(key)] = self._truncate(valstr) - - # Find max widths - if len(key2str) == 0: - print('WARNING: tried to write empty key-value dict') - return - else: - keywidth = max(map(len, key2str.keys())) - valwidth = max(map(len, key2str.values())) - - # Write out the data - dashes = '-' * (keywidth + valwidth + 7) - lines = [dashes] - for (key, val) in sorted(key2str.items()): - lines.append('| %s%s | %s%s |' % ( - key, - ' ' * (keywidth - len(key)), - val, - ' ' * (valwidth - len(val)), - )) - 
lines.append(dashes) - self.file.write('\n'.join(lines) + '\n') - - # Flush the output to the file - self.file.flush() - - def _truncate(self, s): - return s[:20] + '...' if len(s) > 23 else s - - def writeseq(self, seq): - seq = list(seq) - for (i, elem) in enumerate(seq): - self.file.write(elem) - if i < len(seq) - 1: # add space unless this is the last one - self.file.write(' ') - self.file.write('\n') - self.file.flush() - - def close(self): - if self.own_file: - self.file.close() - -class JSONOutputFormat(KVWriter): - def __init__(self, filename): - self.file = open(filename, 'wt') - - def writekvs(self, kvs): - for k, v in sorted(kvs.items()): - if hasattr(v, 'dtype'): - v = v.tolist() - kvs[k] = float(v) - self.file.write(json.dumps(kvs) + '\n') - self.file.flush() - - def close(self): - self.file.close() - -class CSVOutputFormat(KVWriter): - def __init__(self, filename): - self.file = open(filename, 'w+t') - self.keys = [] - self.sep = ',' - - def writekvs(self, kvs): - # Add our current row to the history - extra_keys = kvs.keys() - self.keys - if extra_keys: - self.keys.extend(extra_keys) - self.file.seek(0) - lines = self.file.readlines() - self.file.seek(0) - for (i, k) in enumerate(self.keys): - if i > 0: - self.file.write(',') - self.file.write(k) - self.file.write('\n') - for line in lines[1:]: - self.file.write(line[:-1]) - self.file.write(self.sep * len(extra_keys)) - self.file.write('\n') - for (i, k) in enumerate(self.keys): - if i > 0: - self.file.write(',') - v = kvs.get(k) - if v is not None: - self.file.write(str(v)) - self.file.write('\n') - self.file.flush() - - def close(self): - self.file.close() - - -class TensorBoardOutputFormat(KVWriter): - """ - Dumps key/value pairs into TensorBoard's numeric format. 
- """ - def __init__(self, dir): - os.makedirs(dir, exist_ok=True) - self.dir = dir - self.step = 1 - prefix = 'events' - path = osp.join(osp.abspath(dir), prefix) - import tensorflow as tf - from tensorflow.python import pywrap_tensorflow - from tensorflow.core.util import event_pb2 - from tensorflow.python.util import compat - self.tf = tf - self.event_pb2 = event_pb2 - self.pywrap_tensorflow = pywrap_tensorflow - self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) - - def writekvs(self, kvs): - def summary_val(k, v): - kwargs = {'tag': k, 'simple_value': float(v)} - return self.tf.Summary.Value(**kwargs) - summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) - event = self.event_pb2.Event(wall_time=time.time(), summary=summary) - event.step = self.step # is there any reason why you'd want to specify the step? - self.writer.WriteEvent(event) - self.writer.Flush() - self.step += 1 - - def close(self): - if self.writer: - self.writer.Close() - self.writer = None - -def make_output_format(format, ev_dir, log_suffix=''): - os.makedirs(ev_dir, exist_ok=True) - if format == 'stdout': - return HumanOutputFormat(sys.stdout) - elif format == 'log': - return HumanOutputFormat(osp.join(ev_dir, 'log%s.txt' % log_suffix)) - elif format == 'json': - return JSONOutputFormat(osp.join(ev_dir, 'progress%s.json' % log_suffix)) - elif format == 'csv': - return CSVOutputFormat(osp.join(ev_dir, 'progress%s.csv' % log_suffix)) - elif format == 'tensorboard': - return TensorBoardOutputFormat(osp.join(ev_dir, 'tb%s' % log_suffix)) - else: - raise ValueError('Unknown format specified: %s' % (format,)) - -# ================================================================ -# API -# ================================================================ - -def logkv(key, val): - """ - Log a value of some diagnostic - Call this once for each diagnostic quantity, each iteration - If called many times, last value will be used. 
- """ - Logger.CURRENT.logkv(key, val) - -def logkv_mean(key, val): - """ - The same as logkv(), but if called many times, values averaged. - """ - Logger.CURRENT.logkv_mean(key, val) - -def logkvs(d): - """ - Log a dictionary of key-value pairs - """ - for (k, v) in d.items(): - logkv(k, v) - -def dumpkvs(): - """ - Write all of the diagnostics from the current iteration - - level: int. (see logger.py docs) If the global logger level is higher than - the level argument here, don't print to stdout. - """ - Logger.CURRENT.dumpkvs() - -def getkvs(): - return Logger.CURRENT.name2val - - -def log(*args, level=INFO): - """ - Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). - """ - Logger.CURRENT.log(*args, level=level) - -def debug(*args): - log(*args, level=DEBUG) - -def info(*args): - log(*args, level=INFO) - -def warn(*args): - log(*args, level=WARN) - -def error(*args): - log(*args, level=ERROR) - - -def set_level(level): - """ - Set logging threshold on current logger. - """ - Logger.CURRENT.set_level(level) - -def get_dir(): - """ - Get directory that log files are being written to. 
- will be None if there is no output directory (i.e., if you didn't call start) - """ - return Logger.CURRENT.get_dir() - -record_tabular = logkv -dump_tabular = dumpkvs - -class ProfileKV: - """ - Usage: - with logger.ProfileKV("interesting_scope"): - code - """ - def __init__(self, n): - self.n = "wait_" + n - def __enter__(self): - self.t1 = time.time() - def __exit__(self ,type, value, traceback): - Logger.CURRENT.name2val[self.n] += time.time() - self.t1 - -def profile(n): - """ - Usage: - @profile("my_func") - def my_func(): code - """ - def decorator_with_name(func): - def func_wrapper(*args, **kwargs): - with ProfileKV(n): - return func(*args, **kwargs) - return func_wrapper - return decorator_with_name - - -# ================================================================ -# Backend -# ================================================================ - -class Logger(object): - DEFAULT = None # A logger with no output files. (See right below class definition) - # So that you can still log to the terminal without setting up any output files - CURRENT = None # Current logger being used by the free functions above - - def __init__(self, dir, output_formats): - self.name2val = defaultdict(float) # values this iteration - self.name2cnt = defaultdict(int) - self.level = INFO - self.dir = dir - self.output_formats = output_formats - - # Logging API, forwarded - # ---------------------------------------- - def logkv(self, key, val): - self.name2val[key] = val - - def logkv_mean(self, key, val): - if val is None: - self.name2val[key] = None - return - oldval, cnt = self.name2val[key], self.name2cnt[key] - self.name2val[key] = oldval*cnt/(cnt+1) + val/(cnt+1) - self.name2cnt[key] = cnt + 1 - - def dumpkvs(self): - if self.level == DISABLED: return - for fmt in self.output_formats: - if isinstance(fmt, KVWriter): - fmt.writekvs(self.name2val) - self.name2val.clear() - self.name2cnt.clear() - - def log(self, *args, level=INFO): - if self.level <= level: - 
self._do_log(args) - - # Configuration - # ---------------------------------------- - def set_level(self, level): - self.level = level - - def get_dir(self): - return self.dir - - def close(self): - for fmt in self.output_formats: - fmt.close() - - # Misc - # ---------------------------------------- - def _do_log(self, args): - for fmt in self.output_formats: - if isinstance(fmt, SeqWriter): - fmt.writeseq(map(str, args)) - -Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) - -def configure(dir=None, format_strs=None): - if dir is None: - dir = os.getenv('OPENAI_LOGDIR') - if dir is None: - dir = osp.join(tempfile.gettempdir(), - datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) - assert isinstance(dir, str) - os.makedirs(dir, exist_ok=True) - - log_suffix = '' - from mpi4py import MPI - rank = MPI.COMM_WORLD.Get_rank() - if rank > 0: - log_suffix = "-rank%03i" % rank - - if format_strs is None: - if rank == 0: - format_strs = os.getenv('OPENAI_LOG_FORMAT', 'stdout,log,csv').split(',') - else: - format_strs = os.getenv('OPENAI_LOG_FORMAT_MPI', 'log').split(',') - format_strs = filter(None, format_strs) - output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs] - - Logger.CURRENT = Logger(dir=dir, output_formats=output_formats) - log('Logging to %s'%dir) - -def reset(): - if Logger.CURRENT is not Logger.DEFAULT: - Logger.CURRENT.close() - Logger.CURRENT = Logger.DEFAULT - log('Reset logger') - -class scoped_configure(object): - def __init__(self, dir=None, format_strs=None): - self.dir = dir - self.format_strs = format_strs - self.prevlogger = None - def __enter__(self): - self.prevlogger = Logger.CURRENT - configure(dir=self.dir, format_strs=self.format_strs) - def __exit__(self, *args): - Logger.CURRENT.close() - Logger.CURRENT = self.prevlogger - -# ================================================================ - -def _demo(): - info("hi") - debug("shouldn't appear") - 
set_level(DEBUG) - debug("should appear") - dir = "/tmp/testlogging" - if os.path.exists(dir): - shutil.rmtree(dir) - configure(dir=dir) - logkv("a", 3) - logkv("b", 2.5) - dumpkvs() - logkv("b", -2.5) - logkv("a", 5.5) - dumpkvs() - info("^^^ should see a = 5.5") - logkv_mean("b", -22.5) - logkv_mean("b", -44.4) - logkv("a", 5.5) - dumpkvs() - info("^^^ should see b = 33.3") - - logkv("b", -2.5) - dumpkvs() - - logkv("a", "longasslongasslongasslongasslongasslongassvalue") - dumpkvs() - - -# ================================================================ -# Readers -# ================================================================ - -def read_json(fname): - import pandas - ds = [] - with open(fname, 'rt') as fh: - for line in fh: - ds.append(json.loads(line)) - return pandas.DataFrame(ds) - -def read_csv(fname): - import pandas - return pandas.read_csv(fname, index_col=None, comment='#') - -def read_tb(path): - """ - path : a tensorboard file OR a directory, where we will find all TB files - of the form events.* - """ - import pandas - import numpy as np - from glob import glob - from collections import defaultdict - import tensorflow as tf - if osp.isdir(path): - fnames = glob(osp.join(path, "events.*")) - elif osp.basename(path).startswith("events."): - fnames = [path] - else: - raise NotImplementedError("Expected tensorboard file or directory containing them. 
Got %s"%path) - tag2pairs = defaultdict(list) - maxstep = 0 - for fname in fnames: - for summary in tf.train.summary_iterator(fname): - if summary.step > 0: - for v in summary.summary.value: - pair = (summary.step, v.simple_value) - tag2pairs[v.tag].append(pair) - maxstep = max(summary.step, maxstep) - data = np.empty((maxstep, len(tag2pairs))) - data[:] = np.nan - tags = sorted(tag2pairs.keys()) - for (colidx,tag) in enumerate(tags): - pairs = tag2pairs[tag] - for (step, value) in pairs: - data[step-1, colidx] = value - return pandas.DataFrame(data, columns=tags) - -if __name__ == "__main__": - _demo() diff --git a/baselines/ppo1/README.md b/baselines/ppo1/README.md deleted file mode 100644 index 1faf5adf1a..0000000000 --- a/baselines/ppo1/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# PPOSGD - -- Original paper: https://arxiv.org/abs/1707.06347 -- Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ -- `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. -- `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment. 
- -- Train mujoco 3d humanoid (with optimal-ish hyperparameters): `mpirun -np 16 python -m baselines.ppo1.run_humanoid --model-path=/path/to/model` -- Render the 3d humanoid: `python -m baselines.ppo1.run_humanoid --play --model-path=/path/to/model` diff --git a/baselines/ppo1/__init__.py b/baselines/ppo1/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/baselines/ppo1/cnn_policy.py b/baselines/ppo1/cnn_policy.py deleted file mode 100644 index 6aec8c0e97..0000000000 --- a/baselines/ppo1/cnn_policy.py +++ /dev/null @@ -1,56 +0,0 @@ -import baselines.common.tf_util as U -import tensorflow as tf -import gym -from baselines.common.distributions import make_pdtype - -class CnnPolicy(object): - recurrent = False - def __init__(self, name, ob_space, ac_space, kind='large'): - with tf.variable_scope(name): - self._init(ob_space, ac_space, kind) - self.scope = tf.get_variable_scope().name - - def _init(self, ob_space, ac_space, kind): - assert isinstance(ob_space, gym.spaces.Box) - - self.pdtype = pdtype = make_pdtype(ac_space) - sequence_length = None - - ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) - - x = ob / 255.0 - if kind == 'small': # from A3C paper - x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0))) - elif kind == 'large': # Nature DQN - x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0))) - else: - raise NotImplementedError - - logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', 
kernel_initializer=U.normc_initializer(0.01)) - self.pd = pdtype.pdfromflat(logits) - self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0] - - self.state_in = [] - self.state_out = [] - - stochastic = tf.placeholder(dtype=tf.bool, shape=()) - ac = self.pd.sample() # XXX - self._act = U.function([stochastic, ob], [ac, self.vpred]) - - def act(self, stochastic, ob): - ac1, vpred1 = self._act(stochastic, ob[None]) - return ac1[0], vpred1[0] - def get_variables(self): - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - def get_trainable_variables(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - def get_initial_state(self): - return [] - diff --git a/baselines/ppo1/mlp_policy.py b/baselines/ppo1/mlp_policy.py deleted file mode 100644 index 7f979b3495..0000000000 --- a/baselines/ppo1/mlp_policy.py +++ /dev/null @@ -1,61 +0,0 @@ -from baselines.common.mpi_running_mean_std import RunningMeanStd -import baselines.common.tf_util as U -import tensorflow as tf -import gym -from baselines.common.distributions import make_pdtype - -class MlpPolicy(object): - recurrent = False - def __init__(self, name, *args, **kwargs): - with tf.variable_scope(name): - self._init(*args, **kwargs) - self.scope = tf.get_variable_scope().name - - def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): - assert isinstance(ob_space, gym.spaces.Box) - - self.pdtype = pdtype = make_pdtype(ac_space) - sequence_length = None - - ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) - - with tf.variable_scope("obfilter"): - self.ob_rms = RunningMeanStd(shape=ob_space.shape) - - with tf.variable_scope('vf'): - obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) - last_out = obz - for i in range(num_hid_layers): - last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), 
kernel_initializer=U.normc_initializer(1.0))) - self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0] - - with tf.variable_scope('pol'): - last_out = obz - for i in range(num_hid_layers): - last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0))) - if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): - mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01)) - logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) - pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) - else: - pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) - - self.pd = pdtype.pdfromflat(pdparam) - - self.state_in = [] - self.state_out = [] - - stochastic = tf.placeholder(dtype=tf.bool, shape=()) - ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) - self._act = U.function([stochastic, ob], [ac, self.vpred]) - - def act(self, stochastic, ob): - ac1, vpred1 = self._act(stochastic, ob[None]) - return ac1[0], vpred1[0] - def get_variables(self): - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - def get_trainable_variables(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - def get_initial_state(self): - return [] - diff --git a/baselines/ppo1/pposgd_simple.py b/baselines/ppo1/pposgd_simple.py deleted file mode 100644 index f2f13a6172..0000000000 --- a/baselines/ppo1/pposgd_simple.py +++ /dev/null @@ -1,218 +0,0 @@ -from baselines.common import Dataset, explained_variance, fmt_row, zipsame -from baselines import logger -import baselines.common.tf_util as U -import tensorflow as tf, numpy as np -import time -from baselines.common.mpi_adam import MpiAdam -from baselines.common.mpi_moments import mpi_moments -from 
mpi4py import MPI -from collections import deque - -def traj_segment_generator(pi, env, horizon, stochastic): - t = 0 - ac = env.action_space.sample() # not used, just so we have the datatype - new = True # marks if we're on first timestep of an episode - ob = env.reset() - - cur_ep_ret = 0 # return in current episode - cur_ep_len = 0 # len of current episode - ep_rets = [] # returns of completed episodes in this segment - ep_lens = [] # lengths of ... - - # Initialize history arrays - obs = np.array([ob for _ in range(horizon)]) - rews = np.zeros(horizon, 'float32') - vpreds = np.zeros(horizon, 'float32') - news = np.zeros(horizon, 'int32') - acs = np.array([ac for _ in range(horizon)]) - prevacs = acs.copy() - - while True: - prevac = ac - ac, vpred = pi.act(stochastic, ob) - # Slight weirdness here because we need value function at time T - # before returning segment [0, T-1] so we get the correct - # terminal value - if t > 0 and t % horizon == 0: - yield {"ob" : obs, "rew" : rews, "vpred" : vpreds, "new" : news, - "ac" : acs, "prevac" : prevacs, "nextvpred": vpred * (1 - new), - "ep_rets" : ep_rets, "ep_lens" : ep_lens} - # Be careful!!! 
if you change the downstream algorithm to aggregate - # several of these batches, then be sure to do a deepcopy - ep_rets = [] - ep_lens = [] - i = t % horizon - obs[i] = ob - vpreds[i] = vpred - news[i] = new - acs[i] = ac - prevacs[i] = prevac - - ob, rew, new, _ = env.step(ac) - rews[i] = rew - - cur_ep_ret += rew - cur_ep_len += 1 - if new: - ep_rets.append(cur_ep_ret) - ep_lens.append(cur_ep_len) - cur_ep_ret = 0 - cur_ep_len = 0 - ob = env.reset() - t += 1 - -def add_vtarg_and_adv(seg, gamma, lam): - """ - Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) - """ - new = np.append(seg["new"], 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1 - vpred = np.append(seg["vpred"], seg["nextvpred"]) - T = len(seg["rew"]) - seg["adv"] = gaelam = np.empty(T, 'float32') - rew = seg["rew"] - lastgaelam = 0 - for t in reversed(range(T)): - nonterminal = 1-new[t+1] - delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t] - gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam - seg["tdlamret"] = seg["adv"] + seg["vpred"] - -def learn(env, policy_fn, *, - timesteps_per_actorbatch, # timesteps per actor per update - clip_param, entcoeff, # clipping parameter epsilon, entropy coeff - optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers - gamma, lam, # advantage estimation - max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint - callback=None, # you can do anything in the callback, since it takes locals(), globals() - adam_epsilon=1e-5, - schedule='constant' # annealing for stepsize parameters (epsilon and adam) - ): - # Setup losses and stuff - # ---------------------------------------- - ob_space = env.observation_space - ac_space = env.action_space - pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy - oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy - atarg = tf.placeholder(dtype=tf.float32, 
shape=[None]) # Target advantage function (if applicable) - ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return - - lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule - clip_param = clip_param * lrmult # Annealed cliping parameter epislon - - ob = U.get_placeholder_cached(name="ob") - ac = pi.pdtype.sample_placeholder([None]) - - kloldnew = oldpi.pd.kl(pi.pd) - ent = pi.pd.entropy() - meankl = tf.reduce_mean(kloldnew) - meanent = tf.reduce_mean(ent) - pol_entpen = (-entcoeff) * meanent - - ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold - surr1 = ratio * atarg # surrogate from conservative policy iteration - surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # - pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) - vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) - total_loss = pol_surr + pol_entpen + vf_loss - losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] - loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] - - var_list = pi.get_trainable_variables() - lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) - adam = MpiAdam(var_list, epsilon=adam_epsilon) - - assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) - for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) - compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) - - U.initialize() - adam.sync() - - # Prepare for rollouts - # ---------------------------------------- - seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) - - episodes_so_far = 0 - timesteps_so_far = 0 - iters_so_far = 0 - tstart = time.time() - lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths - rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards - - assert sum([max_iters>0, max_timesteps>0, 
max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" - - while True: - if callback: callback(locals(), globals()) - if max_timesteps and timesteps_so_far >= max_timesteps: - break - elif max_episodes and episodes_so_far >= max_episodes: - break - elif max_iters and iters_so_far >= max_iters: - break - elif max_seconds and time.time() - tstart >= max_seconds: - break - - if schedule == 'constant': - cur_lrmult = 1.0 - elif schedule == 'linear': - cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) - else: - raise NotImplementedError - - logger.log("********** Iteration %i ************"%iters_so_far) - - seg = seg_gen.__next__() - add_vtarg_and_adv(seg, gamma, lam) - - # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) - ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] - vpredbefore = seg["vpred"] # predicted value function before udpate - atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate - d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) - optim_batchsize = optim_batchsize or ob.shape[0] - - if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy - - assign_old_eq_new() # set old parameter values to new parameter values - logger.log("Optimizing...") - logger.log(fmt_row(13, loss_names)) - # Here we do a bunch of optimization epochs over the data - for _ in range(optim_epochs): - losses = [] # list of tuples, each of which gives the loss for a minibatch - for batch in d.iterate_once(optim_batchsize): - *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) - adam.update(g, optim_stepsize * cur_lrmult) - losses.append(newlosses) - logger.log(fmt_row(13, np.mean(losses, axis=0))) - - logger.log("Evaluating losses...") - losses = [] - for batch in d.iterate_once(optim_batchsize): - newlosses = compute_losses(batch["ob"], 
batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) - losses.append(newlosses) - meanlosses,_,_ = mpi_moments(losses, axis=0) - logger.log(fmt_row(13, meanlosses)) - for (lossval, name) in zipsame(meanlosses, loss_names): - logger.record_tabular("loss_"+name, lossval) - logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) - lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values - listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples - lens, rews = map(flatten_lists, zip(*listoflrpairs)) - lenbuffer.extend(lens) - rewbuffer.extend(rews) - logger.record_tabular("EpLenMean", np.mean(lenbuffer)) - logger.record_tabular("EpRewMean", np.mean(rewbuffer)) - logger.record_tabular("EpThisIter", len(lens)) - episodes_so_far += len(lens) - timesteps_so_far += sum(lens) - iters_so_far += 1 - logger.record_tabular("EpisodesSoFar", episodes_so_far) - logger.record_tabular("TimestepsSoFar", timesteps_so_far) - logger.record_tabular("TimeElapsed", time.time() - tstart) - if MPI.COMM_WORLD.Get_rank()==0: - logger.dump_tabular() - - return pi - -def flatten_lists(listoflists): - return [el for list_ in listoflists for el in list_] diff --git a/baselines/ppo1/run_atari.py b/baselines/ppo1/run_atari.py deleted file mode 100644 index 17941c6d39..0000000000 --- a/baselines/ppo1/run_atari.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 - -from mpi4py import MPI -from baselines.common import set_global_seeds -from baselines import bench -import os.path as osp -from baselines import logger -from baselines.common.atari_wrappers import make_atari, wrap_deepmind -from baselines.common.cmd_util import atari_arg_parser - -def train(env_id, num_timesteps, seed): - from baselines.ppo1 import pposgd_simple, cnn_policy - import baselines.common.tf_util as U - rank = MPI.COMM_WORLD.Get_rank() - sess = U.single_threaded_session() - sess.__enter__() - if rank == 0: - logger.configure() - else: - logger.configure(format_strs=[]) - workerseed 
= seed + 10000 * MPI.COMM_WORLD.Get_rank() - set_global_seeds(workerseed) - env = make_atari(env_id) - def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 - return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space) - env = bench.Monitor(env, logger.get_dir() and - osp.join(logger.get_dir(), str(rank))) - env.seed(workerseed) - - env = wrap_deepmind(env) - env.seed(workerseed) - - pposgd_simple.learn(env, policy_fn, - max_timesteps=int(num_timesteps * 1.1), - timesteps_per_actorbatch=256, - clip_param=0.2, entcoeff=0.01, - optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, - gamma=0.99, lam=0.95, - schedule='linear' - ) - env.close() - -def main(): - args = atari_arg_parser().parse_args() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - -if __name__ == '__main__': - main() diff --git a/baselines/ppo1/run_humanoid.py b/baselines/ppo1/run_humanoid.py deleted file mode 100644 index d7d8f5a49b..0000000000 --- a/baselines/ppo1/run_humanoid.py +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env python3 -import os -from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser -from baselines.common import tf_util as U -from baselines import logger - -import gym - -def train(num_timesteps, seed, model_path=None): - env_id = 'Humanoid-v2' - from baselines.ppo1 import mlp_policy, pposgd_simple - U.make_session(num_cpu=1).__enter__() - def policy_fn(name, ob_space, ac_space): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=64, num_hid_layers=2) - env = make_mujoco_env(env_id, seed) - - # parameters below were the best found in a simple random search - # these are good enough to make humanoid walk, but whether those are - # an absolute best or not is not certain - env = RewScale(env, 0.1) - pi = pposgd_simple.learn(env, policy_fn, - max_timesteps=num_timesteps, - timesteps_per_actorbatch=2048, - clip_param=0.2, entcoeff=0.0, - optim_epochs=10, - optim_stepsize=3e-4, - 
optim_batchsize=64, - gamma=0.99, - lam=0.95, - schedule='linear', - ) - env.close() - if model_path: - U.save_state(model_path) - - return pi - -class RewScale(gym.RewardWrapper): - def __init__(self, env, scale): - gym.RewardWrapper.__init__(self, env) - self.scale = scale - def reward(self, r): - return r * self.scale - -def main(): - logger.configure() - parser = mujoco_arg_parser() - parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy')) - parser.set_defaults(num_timesteps=int(2e7)) - - args = parser.parse_args() - - if not args.play: - # train the model - train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path) - else: - # construct the model object, load pre-trained model and render - pi = train(num_timesteps=1, seed=args.seed) - U.load_state(args.model_path) - env = make_mujoco_env('Humanoid-v2', seed=0) - - ob = env.reset() - while True: - action = pi.act(stochastic=False, ob=ob)[0] - ob, _, done, _ = env.step(action) - env.render() - if done: - ob = env.reset() - - - - -if __name__ == '__main__': - main() diff --git a/baselines/ppo1/run_mujoco.py b/baselines/ppo1/run_mujoco.py deleted file mode 100644 index 638998316b..0000000000 --- a/baselines/ppo1/run_mujoco.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 - -from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser -from baselines.common import tf_util as U -from baselines import logger - -def train(env_id, num_timesteps, seed): - from baselines.ppo1 import mlp_policy, pposgd_simple - U.make_session(num_cpu=1).__enter__() - def policy_fn(name, ob_space, ac_space): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=64, num_hid_layers=2) - env = make_mujoco_env(env_id, seed) - pposgd_simple.learn(env, policy_fn, - max_timesteps=num_timesteps, - timesteps_per_actorbatch=2048, - clip_param=0.2, entcoeff=0.0, - optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, - 
gamma=0.99, lam=0.95, schedule='linear', - ) - env.close() - -def main(): - args = mujoco_arg_parser().parse_args() - logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - -if __name__ == '__main__': - main() diff --git a/baselines/ppo1/run_robotics.py b/baselines/ppo1/run_robotics.py deleted file mode 100644 index 7d84185a1b..0000000000 --- a/baselines/ppo1/run_robotics.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 - -from mpi4py import MPI -from baselines.common import set_global_seeds -from baselines import logger -from baselines.common.cmd_util import make_robotics_env, robotics_arg_parser -import mujoco_py - - -def train(env_id, num_timesteps, seed): - from baselines.ppo1 import mlp_policy, pposgd_simple - import baselines.common.tf_util as U - rank = MPI.COMM_WORLD.Get_rank() - sess = U.single_threaded_session() - sess.__enter__() - mujoco_py.ignore_mujoco_warnings().__enter__() - workerseed = seed + 10000 * rank - set_global_seeds(workerseed) - env = make_robotics_env(env_id, workerseed, rank=rank) - def policy_fn(name, ob_space, ac_space): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=256, num_hid_layers=3) - - pposgd_simple.learn(env, policy_fn, - max_timesteps=num_timesteps, - timesteps_per_actorbatch=2048, - clip_param=0.2, entcoeff=0.0, - optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256, - gamma=0.99, lam=0.95, schedule='linear', - ) - env.close() - - -def main(): - args = robotics_arg_parser().parse_args() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - - -if __name__ == '__main__': - main() diff --git a/baselines/ppo2/README.md b/baselines/ppo2/README.md deleted file mode 100644 index 4c262adeb7..0000000000 --- a/baselines/ppo2/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# PPO2 - -- Original paper: https://arxiv.org/abs/1707.06347 -- Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ -- `python -m 
baselines.ppo2.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. -- `python -m baselines.ppo2.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment. diff --git a/baselines/ppo2/__init__.py b/baselines/ppo2/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/baselines/ppo2/policies.py b/baselines/ppo2/policies.py deleted file mode 100644 index 6fbbb14ac8..0000000000 --- a/baselines/ppo2/policies.py +++ /dev/null @@ -1,146 +0,0 @@ -import numpy as np -import tensorflow as tf -from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm -from baselines.common.distributions import make_pdtype -from baselines.common.input import observation_input - -def nature_cnn(unscaled_images, **conv_kwargs): - """ - CNN from Nature paper. - """ - scaled_images = tf.cast(unscaled_images, tf.float32) / 255. - activ = tf.nn.relu - h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), - **conv_kwargs)) - h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = conv_to_fc(h3) - return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) - -class LnLstmPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - X, processed_x = observation_input(ob_space, nbatch) - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - 
a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class LstmPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class CnnPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x, **conv_kwargs) - vf = fc(h, 'v', 1)[:,0] - self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, 
**_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value - -class MlpPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - X, processed_x = observation_input(ob_space, nbatch) - activ = tf.tanh - processed_x = tf.layers.flatten(processed_x) - pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2))) - pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2))) - vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2))) - vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2))) - vf = fc(vf_h2, 'vf', 1)[:,0] - - self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01) - - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py deleted file mode 100644 index fd34f52f36..0000000000 --- a/baselines/ppo2/ppo2.py +++ /dev/null @@ -1,242 +0,0 @@ -import os -import time -import joblib -import numpy as np -import os.path as osp -import tensorflow as tf -from baselines import logger -from collections import deque -from baselines.common import explained_variance -from baselines.common.runners import AbstractEnvRunner - -class Model(object): - def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, - nsteps, ent_coef, vf_coef, max_grad_norm): - sess = tf.get_default_session() - - act_model = policy(sess, ob_space, 
ac_space, nbatch_act, 1, reuse=False) - train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True) - - A = train_model.pdtype.sample_placeholder([None]) - ADV = tf.placeholder(tf.float32, [None]) - R = tf.placeholder(tf.float32, [None]) - OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) - OLDVPRED = tf.placeholder(tf.float32, [None]) - LR = tf.placeholder(tf.float32, []) - CLIPRANGE = tf.placeholder(tf.float32, []) - - neglogpac = train_model.pd.neglogp(A) - entropy = tf.reduce_mean(train_model.pd.entropy()) - - vpred = train_model.vf - vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) - vf_losses1 = tf.square(vpred - R) - vf_losses2 = tf.square(vpredclipped - R) - vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) - ratio = tf.exp(OLDNEGLOGPAC - neglogpac) - pg_losses = -ADV * ratio - pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) - pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) - approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) - clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) - loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef - with tf.variable_scope('model'): - params = tf.trainable_variables() - grads = tf.gradients(loss, params) - if max_grad_norm is not None: - grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) - grads = list(zip(grads, params)) - trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) - _train = trainer.apply_gradients(grads) - - def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): - advs = returns - values - advs = (advs - advs.mean()) / (advs.std() + 1e-8) - td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr, - CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values} - if states is not None: - td_map[train_model.S] = states - td_map[train_model.M] = masks - return sess.run( - 
[pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], - td_map - )[:-1] - self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] - - def save(save_path): - ps = sess.run(params) - joblib.dump(ps, save_path) - - def load(load_path): - loaded_params = joblib.load(load_path) - restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) - sess.run(restores) - # If you want to load weights, also save/load observation scaling inside VecNormalize - - self.train = train - self.train_model = train_model - self.act_model = act_model - self.step = act_model.step - self.value = act_model.value - self.initial_state = act_model.initial_state - self.save = save - self.load = load - tf.global_variables_initializer().run(session=sess) #pylint: disable=E1101 - -class Runner(AbstractEnvRunner): - - def __init__(self, *, env, model, nsteps, gamma, lam): - super().__init__(env=env, model=model, nsteps=nsteps) - self.lam = lam - self.gamma = gamma - - def run(self): - mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[] - mb_states = self.states - epinfos = [] - for _ in range(self.nsteps): - actions, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones) - mb_obs.append(self.obs.copy()) - mb_actions.append(actions) - mb_values.append(values) - mb_neglogpacs.append(neglogpacs) - mb_dones.append(self.dones) - self.obs[:], rewards, self.dones, infos = self.env.step(actions) - for info in infos: - maybeepinfo = info.get('episode') - if maybeepinfo: epinfos.append(maybeepinfo) - mb_rewards.append(rewards) - #batch of steps to batch of rollouts - mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype) - mb_rewards = np.asarray(mb_rewards, dtype=np.float32) - mb_actions = np.asarray(mb_actions) - mb_values = np.asarray(mb_values, dtype=np.float32) - mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32) - mb_dones = np.asarray(mb_dones, dtype=np.bool) - 
last_values = self.model.value(self.obs, self.states, self.dones) - #discount/bootstrap off value fn - mb_returns = np.zeros_like(mb_rewards) - mb_advs = np.zeros_like(mb_rewards) - lastgaelam = 0 - for t in reversed(range(self.nsteps)): - if t == self.nsteps - 1: - nextnonterminal = 1.0 - self.dones - nextvalues = last_values - else: - nextnonterminal = 1.0 - mb_dones[t+1] - nextvalues = mb_values[t+1] - delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t] - mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam - mb_returns = mb_advs + mb_values - return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)), - mb_states, epinfos) -# obs, returns, masks, actions, values, neglogpacs, states = runner.run() -def sf01(arr): - """ - swap and then flatten axes 0 and 1 - """ - s = arr.shape - return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:]) - -def constfn(val): - def f(_): - return val - return f - -def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, - vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, - log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, - save_interval=0, load_path=None): - - if isinstance(lr, float): lr = constfn(lr) - else: assert callable(lr) - if isinstance(cliprange, float): cliprange = constfn(cliprange) - else: assert callable(cliprange) - total_timesteps = int(total_timesteps) - - nenvs = env.num_envs - ob_space = env.observation_space - ac_space = env.action_space - nbatch = nenvs * nsteps - nbatch_train = nbatch // nminibatches - - make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, - nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, - max_grad_norm=max_grad_norm) - if save_interval and logger.get_dir(): - import cloudpickle - with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: - fh.write(cloudpickle.dumps(make_model)) - model = 
make_model() - if load_path is not None: - model.load(load_path) - runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) - - epinfobuf = deque(maxlen=100) - tfirststart = time.time() - - nupdates = total_timesteps//nbatch - for update in range(1, nupdates+1): - assert nbatch % nminibatches == 0 - nbatch_train = nbatch // nminibatches - tstart = time.time() - frac = 1.0 - (update - 1.0) / nupdates - lrnow = lr(frac) - cliprangenow = cliprange(frac) - obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 - epinfobuf.extend(epinfos) - mblossvals = [] - if states is None: # nonrecurrent version - inds = np.arange(nbatch) - for _ in range(noptepochs): - np.random.shuffle(inds) - for start in range(0, nbatch, nbatch_train): - end = start + nbatch_train - mbinds = inds[start:end] - slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) - mblossvals.append(model.train(lrnow, cliprangenow, *slices)) - else: # recurrent version - assert nenvs % nminibatches == 0 - envsperbatch = nenvs // nminibatches - envinds = np.arange(nenvs) - flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) - envsperbatch = nbatch_train // nsteps - for _ in range(noptepochs): - np.random.shuffle(envinds) - for start in range(0, nenvs, envsperbatch): - end = start + envsperbatch - mbenvinds = envinds[start:end] - mbflatinds = flatinds[mbenvinds].ravel() - slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) - mbstates = states[mbenvinds] - mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) - - lossvals = np.mean(mblossvals, axis=0) - tnow = time.time() - fps = int(nbatch / (tnow - tstart)) - if update % log_interval == 0 or update == 1: - ev = explained_variance(values, returns) - logger.logkv("serial_timesteps", update*nsteps) - logger.logkv("nupdates", update) - logger.logkv("total_timesteps", update*nbatch) - logger.logkv("fps", fps) - 
logger.logkv("explained_variance", float(ev)) - logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) - logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) - logger.logkv('time_elapsed', tnow - tfirststart) - for (lossval, lossname) in zip(lossvals, model.loss_names): - logger.logkv(lossname, lossval) - logger.dumpkvs() - if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): - checkdir = osp.join(logger.get_dir(), 'checkpoints') - os.makedirs(checkdir, exist_ok=True) - savepath = osp.join(checkdir, '%.5i'%update) - print('Saving to', savepath) - model.save(savepath) - env.close() - return model - -def safemean(xs): - return np.nan if len(xs) == 0 else np.mean(xs) diff --git a/baselines/ppo2/run_atari.py b/baselines/ppo2/run_atari.py deleted file mode 100644 index 322837ac86..0000000000 --- a/baselines/ppo2/run_atari.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -import sys -from baselines import logger -from baselines.common.cmd_util import make_atari_env, atari_arg_parser -from baselines.common.vec_env.vec_frame_stack import VecFrameStack -from baselines.ppo2 import ppo2 -from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy, MlpPolicy -import multiprocessing -import tensorflow as tf - - -def train(env_id, num_timesteps, seed, policy): - - ncpu = multiprocessing.cpu_count() - if sys.platform == 'darwin': ncpu //= 2 - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=ncpu, - inter_op_parallelism_threads=ncpu) - config.gpu_options.allow_growth = True #pylint: disable=E1101 - tf.Session(config=config).__enter__() - - env = VecFrameStack(make_atari_env(env_id, 8, seed), 4) - policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy, 'mlp': MlpPolicy}[policy] - ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4, - lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, - ent_coef=.01, - lr=lambda f : f * 
2.5e-4, - cliprange=lambda f : f * 0.1, - total_timesteps=int(num_timesteps * 1.1)) - -def main(): - parser = atari_arg_parser() - parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm', 'mlp'], default='cnn') - args = parser.parse_args() - logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy) - -if __name__ == '__main__': - main() diff --git a/baselines/ppo2/run_mujoco.py b/baselines/ppo2/run_mujoco.py deleted file mode 100644 index 282aa3f134..0000000000 --- a/baselines/ppo2/run_mujoco.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 -import numpy as np -from baselines.common.cmd_util import mujoco_arg_parser -from baselines import bench, logger - - -def train(env_id, num_timesteps, seed): - from baselines.common import set_global_seeds - from baselines.common.vec_env.vec_normalize import VecNormalize - from baselines.ppo2 import ppo2 - from baselines.ppo2.policies import MlpPolicy - import gym - import tensorflow as tf - from baselines.common.vec_env.dummy_vec_env import DummyVecEnv - ncpu = 1 - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=ncpu, - inter_op_parallelism_threads=ncpu) - tf.Session(config=config).__enter__() - - def make_env(): - env = gym.make(env_id) - env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True) - return env - - env = DummyVecEnv([make_env]) - env = VecNormalize(env) - - set_global_seeds(seed) - policy = MlpPolicy - model = ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32, - lam=0.95, gamma=0.99, noptepochs=10, log_interval=1, - ent_coef=0.0, - lr=3e-4, - cliprange=0.2, - total_timesteps=num_timesteps) - - return model, env - - -def main(): - args = mujoco_arg_parser().parse_args() - logger.configure() - model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - - if args.play: - logger.log("Running trained model") - obs = np.zeros((env.num_envs,) + 
env.observation_space.shape) - obs[:] = env.reset() - while True: - actions = model.step(obs)[0] - obs[:] = env.step(actions)[0] - env.render() - - -if __name__ == '__main__': - main() diff --git a/baselines/results_plotter.py b/baselines/results_plotter.py deleted file mode 100644 index 051420474a..0000000000 --- a/baselines/results_plotter.py +++ /dev/null @@ -1,87 +0,0 @@ -import numpy as np -import matplotlib -matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode - -import matplotlib.pyplot as plt -plt.rcParams['svg.fonttype'] = 'none' - -from baselines.bench.monitor import load_results - -X_TIMESTEPS = 'timesteps' -X_EPISODES = 'episodes' -X_WALLTIME = 'walltime_hrs' -POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] -EPISODES_WINDOW = 100 -COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', - 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', - 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] - -def rolling_window(a, window): - shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) - strides = a.strides + (a.strides[-1],) - return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) - -def window_func(x, y, window, func): - yw = rolling_window(y, window) - yw_func = func(yw, axis=-1) - return x[window-1:], yw_func - -def ts2xy(ts, xaxis): - if xaxis == X_TIMESTEPS: - x = np.cumsum(ts.l.values) - y = ts.r.values - elif xaxis == X_EPISODES: - x = np.arange(len(ts)) - y = ts.r.values - elif xaxis == X_WALLTIME: - x = ts.t.values / 3600. 
- y = ts.r.values - else: - raise NotImplementedError - return x, y - -def plot_curves(xy_list, xaxis, title): - plt.figure(figsize=(8,2)) - maxx = max(xy[0][-1] for xy in xy_list) - minx = 0 - for (i, (x, y)) in enumerate(xy_list): - color = COLORS[i] - plt.scatter(x, y, s=2) - x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes - plt.plot(x, y_mean, color=color) - plt.xlim(minx, maxx) - plt.title(title) - plt.xlabel(xaxis) - plt.ylabel("Episode Rewards") - plt.tight_layout() - -def plot_results(dirs, num_timesteps, xaxis, task_name): - tslist = [] - for dir in dirs: - ts = load_results(dir) - ts = ts[ts.l.cumsum() <= num_timesteps] - tslist.append(ts) - xy_list = [ts2xy(ts, xaxis) for ts in tslist] - plot_curves(xy_list, xaxis, task_name) - -# Example usage in jupyter-notebook -# from baselines import log_viewer -# %matplotlib inline -# log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout") -# Here ./log is a directory containing the monitor.csv files - -def main(): - import argparse - import os - parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log']) - parser.add_argument('--num_timesteps', type=int, default=int(10e6)) - parser.add_argument('--xaxis', help = 'Varible on X-axis', default = X_TIMESTEPS) - parser.add_argument('--task_name', help = 'Title of plot', default = 'Breakout') - args = parser.parse_args() - args.dirs = [os.path.abspath(dir) for dir in args.dirs] - plot_results(args.dirs, args.num_timesteps, args.xaxis, args.task_name) - plt.show() - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/baselines/trpo_mpi/README.md b/baselines/trpo_mpi/README.md deleted file mode 100644 index b3d9b9db3c..0000000000 --- a/baselines/trpo_mpi/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# trpo_mpi - -- Original paper: 
https://arxiv.org/abs/1502.05477 -- Baselines blog post https://blog.openai.com/openai-baselines-ppo/ -- `mpirun -np 16 python -m baselines.trpo_mpi.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. -- `python -m baselines.trpo_mpi.run_mujoco` runs the algorithm for 1M timesteps on a Mujoco environment. diff --git a/baselines/trpo_mpi/__init__.py b/baselines/trpo_mpi/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/baselines/trpo_mpi/nosharing_cnn_policy.py b/baselines/trpo_mpi/nosharing_cnn_policy.py deleted file mode 100644 index 97b2dcd0b5..0000000000 --- a/baselines/trpo_mpi/nosharing_cnn_policy.py +++ /dev/null @@ -1,56 +0,0 @@ -import baselines.common.tf_util as U -import tensorflow as tf -import gym -from baselines.common.distributions import make_pdtype - -class CnnPolicy(object): - recurrent = False - def __init__(self, name, ob_space, ac_space): - with tf.variable_scope(name): - self._init(ob_space, ac_space) - self.scope = tf.get_variable_scope().name - - def _init(self, ob_space, ac_space): - assert isinstance(ob_space, gym.spaces.Box) - - self.pdtype = pdtype = make_pdtype(ac_space) - sequence_length = None - - ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) - - obscaled = ob / 255.0 - - with tf.variable_scope("pol"): - x = obscaled - x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0))) - logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01)) - self.pd = pdtype.pdfromflat(logits) - with tf.variable_scope("vf"): - x = obscaled - x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], 
pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0))) - self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0)) - self.vpredz = self.vpred - - self.state_in = [] - self.state_out = [] - - stochastic = tf.placeholder(dtype=tf.bool, shape=()) - ac = self.pd.sample() - self._act = U.function([stochastic, ob], [ac, self.vpred]) - - def act(self, stochastic, ob): - ac1, vpred1 = self._act(stochastic, ob[None]) - return ac1[0], vpred1[0] - def get_variables(self): - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - def get_trainable_variables(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - def get_initial_state(self): - return [] - diff --git a/baselines/trpo_mpi/run_atari.py b/baselines/trpo_mpi/run_atari.py deleted file mode 100644 index f31ebfd7c5..0000000000 --- a/baselines/trpo_mpi/run_atari.py +++ /dev/null @@ -1,43 +0,0 @@ - #!/usr/bin/env python3 -from mpi4py import MPI -from baselines.common import set_global_seeds -import os.path as osp -import gym, logging -from baselines import logger -from baselines import bench -from baselines.common.atari_wrappers import make_atari, wrap_deepmind -from baselines.common.cmd_util import atari_arg_parser - -def train(env_id, num_timesteps, seed): - from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy - from baselines.trpo_mpi import trpo_mpi - import baselines.common.tf_util as U - rank = MPI.COMM_WORLD.Get_rank() - sess = U.single_threaded_session() - sess.__enter__() - if rank == 0: - logger.configure() - else: - logger.configure(format_strs=[]) - - workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() - set_global_seeds(workerseed) - env = make_atari(env_id) - def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 - return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space) - env = bench.Monitor(env, 
logger.get_dir() and osp.join(logger.get_dir(), str(rank))) - env.seed(workerseed) - - env = wrap_deepmind(env) - env.seed(workerseed) - - trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, - max_timesteps=int(num_timesteps * 1.1), gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00) - env.close() - -def main(): - args = atari_arg_parser().parse_args() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - -if __name__ == "__main__": - main() diff --git a/baselines/trpo_mpi/run_mujoco.py b/baselines/trpo_mpi/run_mujoco.py deleted file mode 100644 index 220bb91aba..0000000000 --- a/baselines/trpo_mpi/run_mujoco.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python3 -# noinspection PyUnresolvedReferences -from mpi4py import MPI -from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser -from baselines import logger -from baselines.ppo1.mlp_policy import MlpPolicy -from baselines.trpo_mpi import trpo_mpi - -def train(env_id, num_timesteps, seed): - import baselines.common.tf_util as U - sess = U.single_threaded_session() - sess.__enter__() - - rank = MPI.COMM_WORLD.Get_rank() - if rank == 0: - logger.configure() - else: - logger.configure(format_strs=[]) - logger.set_level(logger.DISABLED) - workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() - def policy_fn(name, ob_space, ac_space): - return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=32, num_hid_layers=2) - env = make_mujoco_env(env_id, workerseed) - trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, - max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3) - env.close() - -def main(): - args = mujoco_arg_parser().parse_args() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - - -if __name__ == '__main__': - main() - diff --git a/baselines/trpo_mpi/trpo_mpi.py b/baselines/trpo_mpi/trpo_mpi.py deleted 
file mode 100644 index e23d9ac793..0000000000 --- a/baselines/trpo_mpi/trpo_mpi.py +++ /dev/null @@ -1,291 +0,0 @@ -from baselines.common import explained_variance, zipsame, dataset -from baselines import logger -import baselines.common.tf_util as U -import tensorflow as tf, numpy as np -import time -from baselines.common import colorize -from mpi4py import MPI -from collections import deque -from baselines.common.mpi_adam import MpiAdam -from baselines.common.cg import cg -from contextlib import contextmanager - -def traj_segment_generator(pi, env, horizon, stochastic): - # Initialize state variables - t = 0 - ac = env.action_space.sample() - new = True - rew = 0.0 - ob = env.reset() - - cur_ep_ret = 0 - cur_ep_len = 0 - ep_rets = [] - ep_lens = [] - - # Initialize history arrays - obs = np.array([ob for _ in range(horizon)]) - rews = np.zeros(horizon, 'float32') - vpreds = np.zeros(horizon, 'float32') - news = np.zeros(horizon, 'int32') - acs = np.array([ac for _ in range(horizon)]) - prevacs = acs.copy() - - while True: - prevac = ac - ac, vpred = pi.act(stochastic, ob) - # Slight weirdness here because we need value function at time T - # before returning segment [0, T-1] so we get the correct - # terminal value - if t > 0 and t % horizon == 0: - yield {"ob" : obs, "rew" : rews, "vpred" : vpreds, "new" : news, - "ac" : acs, "prevac" : prevacs, "nextvpred": vpred * (1 - new), - "ep_rets" : ep_rets, "ep_lens" : ep_lens} - _, vpred = pi.act(stochastic, ob) - # Be careful!!! 
if you change the downstream algorithm to aggregate - # several of these batches, then be sure to do a deepcopy - ep_rets = [] - ep_lens = [] - i = t % horizon - obs[i] = ob - vpreds[i] = vpred - news[i] = new - acs[i] = ac - prevacs[i] = prevac - - ob, rew, new, _ = env.step(ac) - rews[i] = rew - - cur_ep_ret += rew - cur_ep_len += 1 - if new: - ep_rets.append(cur_ep_ret) - ep_lens.append(cur_ep_len) - cur_ep_ret = 0 - cur_ep_len = 0 - ob = env.reset() - t += 1 - -def add_vtarg_and_adv(seg, gamma, lam): - new = np.append(seg["new"], 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1 - vpred = np.append(seg["vpred"], seg["nextvpred"]) - T = len(seg["rew"]) - seg["adv"] = gaelam = np.empty(T, 'float32') - rew = seg["rew"] - lastgaelam = 0 - for t in reversed(range(T)): - nonterminal = 1-new[t+1] - delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t] - gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam - seg["tdlamret"] = seg["adv"] + seg["vpred"] - -def learn(env, policy_fn, *, - timesteps_per_batch, # what to train on - max_kl, cg_iters, - gamma, lam, # advantage estimation - entcoeff=0.0, - cg_damping=1e-2, - vf_stepsize=3e-4, - vf_iters =3, - max_timesteps=0, max_episodes=0, max_iters=0, # time constraint - callback=None - ): - nworkers = MPI.COMM_WORLD.Get_size() - rank = MPI.COMM_WORLD.Get_rank() - np.set_printoptions(precision=3) - # Setup losses and stuff - # ---------------------------------------- - ob_space = env.observation_space - ac_space = env.action_space - pi = policy_fn("pi", ob_space, ac_space) - oldpi = policy_fn("oldpi", ob_space, ac_space) - atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) - ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return - - ob = U.get_placeholder_cached(name="ob") - ac = pi.pdtype.sample_placeholder([None]) - - kloldnew = oldpi.pd.kl(pi.pd) - ent = pi.pd.entropy() - meankl = 
tf.reduce_mean(kloldnew) - meanent = tf.reduce_mean(ent) - entbonus = entcoeff * meanent - - vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) - - ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold - surrgain = tf.reduce_mean(ratio * atarg) - - optimgain = surrgain + entbonus - losses = [optimgain, meankl, entbonus, surrgain, meanent] - loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] - - dist = meankl - - all_var_list = pi.get_trainable_variables() - var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] - vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] - vfadam = MpiAdam(vf_var_list) - - get_flat = U.GetFlat(var_list) - set_from_flat = U.SetFromFlat(var_list) - klgrads = tf.gradients(dist, var_list) - flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") - shapes = [var.get_shape().as_list() for var in var_list] - start = 0 - tangents = [] - for shape in shapes: - sz = U.intprod(shape) - tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) - start += sz - gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 - fvp = U.flatgrad(gvp, var_list) - - assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) - for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) - compute_losses = U.function([ob, ac, atarg], losses) - compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) - compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) - compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) - - @contextmanager - def timed(msg): - if rank == 0: - print(colorize(msg, color='magenta')) - tstart = time.time() - yield - print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) - else: - yield - - def allmean(x): - assert isinstance(x, np.ndarray) - out = 
np.empty_like(x) - MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) - out /= nworkers - return out - - U.initialize() - th_init = get_flat() - MPI.COMM_WORLD.Bcast(th_init, root=0) - set_from_flat(th_init) - vfadam.sync() - print("Init param sum", th_init.sum(), flush=True) - - # Prepare for rollouts - # ---------------------------------------- - seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) - - episodes_so_far = 0 - timesteps_so_far = 0 - iters_so_far = 0 - tstart = time.time() - lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths - rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards - - assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1 - - while True: - if callback: callback(locals(), globals()) - if max_timesteps and timesteps_so_far >= max_timesteps: - break - elif max_episodes and episodes_so_far >= max_episodes: - break - elif max_iters and iters_so_far >= max_iters: - break - logger.log("********** Iteration %i ************"%iters_so_far) - - with timed("sampling"): - seg = seg_gen.__next__() - add_vtarg_and_adv(seg, gamma, lam) - - # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) - ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] - vpredbefore = seg["vpred"] # predicted value function before udpate - atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate - - if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) - if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy - - args = seg["ob"], seg["ac"], atarg - fvpargs = [arr[::5] for arr in args] - def fisher_vector_product(p): - return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p - - assign_old_eq_new() # set old parameter values to new parameter values - with timed("computegrad"): - *lossbefore, g = compute_lossandgrad(*args) - lossbefore = allmean(np.array(lossbefore)) - g = allmean(g) - if np.allclose(g, 
0): - logger.log("Got zero gradient. not updating") - else: - with timed("cg"): - stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) - assert np.isfinite(stepdir).all() - shs = .5*stepdir.dot(fisher_vector_product(stepdir)) - lm = np.sqrt(shs / max_kl) - # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) - fullstep = stepdir / lm - expectedimprove = g.dot(fullstep) - surrbefore = lossbefore[0] - stepsize = 1.0 - thbefore = get_flat() - for _ in range(10): - thnew = thbefore + fullstep * stepsize - set_from_flat(thnew) - meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) - improve = surr - surrbefore - logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) - if not np.isfinite(meanlosses).all(): - logger.log("Got non-finite value of losses -- bad!") - elif kl > max_kl * 1.5: - logger.log("violated KL constraint. shrinking step.") - elif improve < 0: - logger.log("surrogate didn't improve. shrinking step.") - else: - logger.log("Stepsize OK!") - break - stepsize *= .5 - else: - logger.log("couldn't compute a good step") - set_from_flat(thbefore) - if nworkers > 1 and iters_so_far % 20 == 0: - paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples - assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) - - for (lossname, lossval) in zip(loss_names, meanlosses): - logger.record_tabular(lossname, lossval) - - with timed("vf"): - - for _ in range(vf_iters): - for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), - include_final_partial_batch=False, batch_size=64): - g = allmean(compute_vflossandgrad(mbob, mbret)) - vfadam.update(g, vf_stepsize) - - logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) - - lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values - listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples - lens, rews = map(flatten_lists, zip(*listoflrpairs)) - 
lenbuffer.extend(lens) - rewbuffer.extend(rews) - - logger.record_tabular("EpLenMean", np.mean(lenbuffer)) - logger.record_tabular("EpRewMean", np.mean(rewbuffer)) - logger.record_tabular("EpThisIter", len(lens)) - episodes_so_far += len(lens) - timesteps_so_far += sum(lens) - iters_so_far += 1 - - logger.record_tabular("EpisodesSoFar", episodes_so_far) - logger.record_tabular("TimestepsSoFar", timesteps_so_far) - logger.record_tabular("TimeElapsed", time.time() - tstart) - - if rank==0: - logger.dump_tabular() - -def flatten_lists(listoflists): - return [el for list_ in listoflists for el in list_] \ No newline at end of file diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000000..21ec52f09b --- /dev/null +++ b/conftest.py @@ -0,0 +1,14 @@ +import pytest + + +def pytest_addoption(parser): + parser.addoption("--rungpu", action="store_true", default=False, help="run gpu tests") + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--rungpu"): + return + skip_gpu = pytest.mark.skip(reason="need --rungpu option to run") + for item in items: + if "gpu" in item.keywords: + item.add_marker(skip_gpu) diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000000..0e3ca53829 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = StableBaselines +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/_static/css/baselines_theme.css b/docs/_static/css/baselines_theme.css new file mode 100644 index 0000000000..5701d00c36 --- /dev/null +++ b/docs/_static/css/baselines_theme.css @@ -0,0 +1,52 @@ +/* Main colors from https://color.adobe.com/fr/Copy-of-NOUEBO-Original-color-theme-11116609 */ +:root{ + --main-bg-color: #324D5C; + --link-color: #14B278; +} + +/* Header fonts y */ +h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption { + font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif; +} + + +/* Docs background */ +.wy-side-nav-search{ + background-color: var(--main-bg-color); +} + +/* Mobile version */ +.wy-nav-top{ + background-color: var(--main-bg-color); +} + +/* Change link colors (except for the menu) */ +a { + color: var(--link-color); +} + +a:hover { + color: #4F778F; +} + +.wy-menu a { + color: #b3b3b3; +} + +.wy-menu a:hover { + color: #b3b3b3; +} + +a.icon.icon-home { + color: #b3b3b3; +} + +.version{ + color: var(--link-color) !important; +} + + +/* Make code blocks have a background */ +.codeblock,pre.literal-block,.rst-content .literal-block,.rst-content pre.literal-block,div[class^='highlight'] { + background: #f8f8f8;; +} diff --git a/docs/_static/img/breakout.gif b/docs/_static/img/breakout.gif new file mode 100644 index 0000000000..da5beb4f66 Binary files /dev/null and b/docs/_static/img/breakout.gif differ diff --git a/docs/_static/img/colab.svg b/docs/_static/img/colab.svg new file mode 100644 index 0000000000..c2d30e973a --- /dev/null +++ b/docs/_static/img/colab.svg @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/docs/_static/img/learning_curve.png b/docs/_static/img/learning_curve.png new file mode 100644 index 0000000000..5dd8edf5a8 Binary files /dev/null and b/docs/_static/img/learning_curve.png differ diff --git a/docs/_static/img/try_it.png 
b/docs/_static/img/try_it.png new file mode 100644 index 0000000000..961ca222e2 Binary files /dev/null and b/docs/_static/img/try_it.png differ diff --git a/docs/changelog.rst b/docs/changelog.rst new file mode 100644 index 0000000000..4efdc0699d --- /dev/null +++ b/docs/changelog.rst @@ -0,0 +1,96 @@ +.. _changelog: + +========== +Changelog +========== + +For download links, please look at `Github release page `_. + +Master version 1.0.8.a0 (WIP) +============================= + +Nothing new for now... + +Tensorboard support in progress (see ``tensorboard`` branch) + + +Release 1.0.7 (2018-08-29) +=========================== + +**Bug fixes and documentation** + +- added html documentation using sphinx + integration with read the docs +- cleaned up README + typos +- fixed normalization for DQN with images +- fixed DQN identity test + + +Release 1.0.1 (2018-08-20) +========================== + +**Refactored Stable Baselines** + +- refactored A2C, ACER, ACKTR, DDPG, DeepQ, GAIL, TRPO, PPO1 and PPO2 under a single constant class +- added callback to refactored algorithm training +- added saving and loading to refactored algorithms +- refactored ACER, DDPG, GAIL, PPO1 and TRPO to fit with A2C, PPO2 and ACKTR policies +- added new policies for most algorithms (Mlp, MlpLstm, MlpLnLstm, Cnn, CnnLstm and CnnLnLstm) +- added dynamic environment switching (so continual RL learning is now feasible) +- added prediction from observation and action probability from observation for all the algorithms +- fixed graphs issues, so models won't collide in names +- fixed behavior_clone weight loading for GAIL +- fixed Tensorflow using all the GPU VRAM +- fixed models so that they are all compatible with vectorized environments +- fixed ```set_global_seed``` to update ```gym.spaces```'s random seed +- fixed PPO1 and TRPO performance issues when learning identity function +- added new tests for loading, saving, continuous actions and learning the identity function +- fixed DQN wrapping 
for atari +- added saving and loading for Vecnormalize wrapper +- added automatic detection of action space (for the policy network) +- fixed ACER buffer with constant values assuming n_stack=4 +- fixed some RL algorithms not clipping the action to be in the action_space, when using ```gym.spaces.Box``` +- refactored algorithms can take either a ```gym.Environment``` or a ```str``` ([if the environment name is registered](https://github.com/openai/gym/wiki/Environments)) +- Hotfix in ACER (compared to v1.0.0) + +Future Work : + +- Finish refactoring HER +- Refactor ACKTR and ACER for continuous implementation + + + +Release 0.1.6 (2018-07-27) +========================== + +**Deobfuscation of the code base + pep8 and fixes** + +- Fixed ``tf.session().__enter__()`` being used, rather than + ``sess = tf.session()`` and passing the session to the objects +- Fixed uneven scoping of TensorFlow Sessions throughout the code +- Fixed rolling vecwrapper to handle observations that are not only + grayscale images +- Fixed deepq saving the environment when trying to save itself +- Fixed + ``ValueError: Cannot take the length of Shape with unknown rank.`` in + ``acktr``, when running ``run_atari.py`` script. 
+- Fixed calling baselines sequentially no longer creates graph + conflicts +- Fixed mean on empty array warning with deepq +- Fixed kfac eigen decomposition not cast to float64, when the + parameter use_float64 is set to True +- Fixed Dataset data loader, not correctly resetting id position if + shuffling is disabled +- Fixed ``EOFError`` when reading from connection in the ``worker`` in + ``subproc_vec_env.py`` +- Fixed ``behavior_clone`` weight loading and saving for GAIL +- Avoid taking square root of negative number in ``trpo_mpi.py`` +- Removed some duplicated code (a2cpolicy, trpo_mpi) +- Removed unused, undocumented and crashing function ``reset_task`` in + ``subproc_vec_env.py`` +- Reformatted code to PEP8 style +- Documented all the codebase +- Added atari tests +- Added logger tests + +Missing: tests for acktr continuous (+ HER, gail but they rely on +mujoco...) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000000..922c1c2767 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,192 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +import os +import sys +from unittest.mock import MagicMock + +# source code directory, relative to this file, for sphinx-autobuild +sys.path.insert(0, os.path.abspath('..')) + + +class Mock(MagicMock): + @classmethod + def __getattr__(cls, name): + return MagicMock() + +# Mock modules that requires C modules +# Note: because of that we cannot test examples using CI +MOCK_MODULES = ['joblib', 'scipy', 'scipy.signal', + 'numpy', 'pandas', 'mpi4py', 'mujoco-py', 'cv2', 'tensorflow', + 'tensorflow.contrib', 'tensorflow.contrib.layers', + 'tensorflow.python', 'tensorflow.python.client', 'tensorflow.python.ops', + 'tqdm', 'cloudpickle', 'matplotlib', + 'seaborn', 'gym', 'gym.spaces', 'zmq'] +sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) + +import stable_baselines + + +# -- Project information ----------------------------------------------------- + +project = 'Stable Baselines' +copyright = '2018, Stable Baselines' +author = 'Stable Baselines Contributors' + +# The short X.Y version +version = 'master (' + stable_baselines.__version__ + ' )' +# The full version, including alpha/beta/rc tags +release = stable_baselines.__version__ + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.mathjax', + 'sphinx.ext.ifconfig', + 'sphinx.ext.viewcode', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. 
+master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. + +# Fix for read the docs +on_rtd = os.environ.get('READTHEDOCS') == 'True' +if on_rtd: + html_theme = 'default' +else: + html_theme = 'sphinx_rtd_theme' + +def setup(app): + app.add_stylesheet("css/baselines_theme.css") + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. 
+# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'StableBaselinesdoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'StableBaselines.tex', 'Stable Baselines Documentation', + 'Stable Baselines Contributors', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'stablebaselines', 'Stable Baselines Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'StableBaselines', 'Stable Baselines Documentation', + author, 'StableBaselines', 'One line description of project.', + 'Miscellaneous'), +] + + +# -- Extension configuration ------------------------------------------------- diff --git a/docs/guide/algos.rst b/docs/guide/algos.rst new file mode 100644 index 0000000000..1e60523d80 --- /dev/null +++ b/docs/guide/algos.rst @@ -0,0 +1,47 @@ +RL Algorithms +============= + + +.. Table too large +.. 
===== ======================== ========= ======= ============ ================= =============== ================ +.. Name Refactored \ :sup:`(1)`\ Recurrent ``Box`` ``Discrete`` ``MultiDiscrete`` ``MultiBinary`` Multi Processing +.. ===== ======================== ========= ======= ============ ================= =============== ================ +.. A2C ✔️ +.. ===== ======================== ========= ======= ============ ================= =============== ================ + +.. There is an issue with Read The Docs for building the table when the "HER" row is present: +.. Apparently a problem of spacing +.. HER [#f3]_ ❌ [#f5]_ ❌ ✔️ ❌ ❌ + + +============ ======================== ========= =========== ============ ================ +Name Refactored [#f1]_ Recurrent ``Box`` ``Discrete`` Multi Processing +============ ======================== ========= =========== ============ ================ +A2C ✔️ ✔️ ✔️ ✔️ ✔️ +ACER ✔️ ✔️ ❌ [#f5]_ ✔️ ✔️ +ACKTR ✔️ ✔️ ❌ [#f5]_ ✔️ ✔️ +DDPG ✔️ ✔️ ✔️ ❌ ❌ +DQN ✔️ ❌ ❌ ✔️ ❌ +GAIL [#f2]_ ✔️ ✔️ ✔️ ✔️ ✔️ [#f4]_ +PPO1 ✔️ ✔️ ✔️ ✔️ ✔️ [#f4]_ +PPO2 ✔️ ✔️ ✔️ ✔️ ✔️ +TRPO ✔️ ✔️ ✔️ ✔️ ✔️ [#f4]_ +============ ======================== ========= =========== ============ ================ + +.. [#f1] Whether or not the algorithm has been refactored to fit the ``BaseRLModel`` class. +.. [#f2] Only implemented for TRPO. +.. [#f3] Only implemented for DDPG. +.. [#f4] Multi Processing with `MPI`_. +.. [#f5] TODO, in project scope. + + +Actions ``gym.spaces``: + +- ``Box``: An N-dimensional box that contains every point in the action + space. +- ``Discrete``: A list of possible actions, where each timestep only + one of the actions can be used. +- ``MultiDiscrete``: A list of possible actions, where each timestep only one action of each discrete set can be used. +- ``MultiBinary``: A list of possible actions, where each timestep any of the actions can be used in any combination. + +.. 
_MPI: https://mpi4py.readthedocs.io/en/stable/ diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst new file mode 100644 index 0000000000..eb79dd1bda --- /dev/null +++ b/docs/guide/examples.rst @@ -0,0 +1,379 @@ +Examples +======== + +Try it online with Colab Notebooks! +----------------------------------- + +All the following examples can be executed online using Google Colab |colab| +notebooks: + +- `Getting Started`_ +- `Training, Saving, Loading`_ +- `Multiprocessing`_ +- `Monitor Training and Plotting`_ +- `Atari Games`_ +- `Breakout`_ (trained agent included) + +.. _Getting Started: https://colab.research.google.com/drive/1_1H5bjWKYBVKbbs-Kj83dsfuZieDNcFU +.. _Training, Saving, Loading: https://colab.research.google.com/drive/1KoAQ1C_BNtGV3sVvZCnNZaER9rstmy0s +.. _Multiprocessing: https://colab.research.google.com/drive/1ZzNFMUUi923foaVsYb4YjPy4mjKtnOxb +.. _Monitor Training and Plotting: https://colab.research.google.com/drive/1L_IMo6v0a0ALK8nefZm6PqPSy0vZIWBT +.. _Atari Games: https://colab.research.google.com/drive/1iYK11yDzOOqnrXi1Sfjm1iekZr4cxLaN +.. _Breakout: https://colab.research.google.com/drive/14NwwEHwN4hdNgGzzySjxQhEVDff-zr7O + +.. |colab| image:: ../_static/img/colab.svg + +Basic Usage: Training, Saving, Loading +-------------------------------------- + +In the following example, we will train, save and load an A2C model on the Lunar Lander environment. + +.. image:: ../_static/img/try_it.png + :scale: 30 % + :target: https://colab.research.google.com/drive/1KoAQ1C_BNtGV3sVvZCnNZaER9rstmy0s + + +.. figure:: https://cdn-images-1.medium.com/max/960/1*W7X69nxINgZEcJEAyoHCVw.gif + + Lunar Lander Environment + + +.. note:: + LunarLander requires the python package `box2d`. + You can install it using ``apt install swig`` and then ``pip install box2d box2d-kengz`` + +.. 
code-block:: python + + import gym + + from stable_baselines.common.policies import MlpPolicy + from stable_baselines.common.vec_env import DummyVecEnv + from stable_baselines import A2C + + # Create and wrap the environment + env = gym.make('LunarLander-v2') + env = DummyVecEnv([lambda: env]) + + model = A2C(MlpPolicy, env, ent_coef=0.1, verbose=1) + # Train the agent + model.learn(total_timesteps=100000) + # Save the agent + model.save("a2c_lunar") + del model # delete trained model to demonstrate loading + + # Load the trained agent + model = A2C.load("a2c_lunar") + + # Enjoy trained agent + obs = env.reset() + for i in range(1000): + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + + +Multiprocessing: Unleashing the Power of Vectorized Environments +---------------------------------------------------------------- + +.. image:: ../_static/img/try_it.png + :scale: 30 % + :target: https://colab.research.google.com/drive/1ZzNFMUUi923foaVsYb4YjPy4mjKtnOxb + +.. figure:: https://cdn-images-1.medium.com/max/960/1*h4WTQNVIsvMXJTCpXm_TAw.gif + + CartPole Environment + + +.. code-block:: python + + import gym + import numpy as np + + from stable_baselines.common.policies import MlpPolicy + from stable_baselines.common.vec_env import SubprocVecEnv + from stable_baselines.common import set_global_seeds + from stable_baselines import ACKTR + + def make_env(env_id, rank, seed=0): + """ + Utility function for multiprocessed env. 
+ + :param env_id: (str) the environment ID + :param num_env: (int) the number of environments you wish to have in subprocesses + :param seed: (int) the initial seed for RNG + :param rank: (int) index of the subprocess + """ + def _init(): + env = gym.make(env_id) + env.seed(seed + rank) + return env + set_global_seeds(seed) + return _init + + env_id = "CartPole-v1" + num_cpu = 4 # Number of processes to use + # Create the vectorized environment + env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)]) + + model = ACKTR(MlpPolicy, env, verbose=1) + model.learn(total_timesteps=25000) + + obs = env.reset() + for _ in range(1000): + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + + + +Using Callback: Monitoring Training +----------------------------------- + +You can define a custom callback function that will be called inside the agent. +This could be useful when you want to monitor training, for instance display live +learning curves in Tensorboard (or in Visdom) or save the best agent. + +.. image:: ../_static/img/try_it.png + :scale: 30 % + :target: https://colab.research.google.com/drive/1L_IMo6v0a0ALK8nefZm6PqPSy0vZIWBT + +.. figure:: ../_static/img/learning_curve.png + + Learning curve of DDPG on LunarLanderContinuous environment + +.. 
code-block:: python + + import os + + import gym + import numpy as np + import matplotlib.pyplot as plt + + from stable_baselines.common.policies import MlpPolicy + from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv + from stable_baselines.bench import Monitor + from stable_baselines.results_plotter import load_results, ts2xy + from stable_baselines import DDPG + from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec + + + best_mean_reward, n_steps = -np.inf, 0 + + def callback(_locals, _globals): + """ + Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2) + :param _locals: (dict) + :param _globals: (dict) + """ + global n_steps, best_mean_reward + # Print stats every 1000 calls + if (n_steps + 1) % 1000 == 0: + # Evaluate policy performance + x, y = ts2xy(load_results(log_dir), 'timesteps') + if len(x) > 0: + mean_reward = np.mean(y[-100:]) + print(x[-1], 'timesteps') + print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward)) + + # New best model, you could save the agent here + if mean_reward > best_mean_reward: + best_mean_reward = mean_reward + # Example for saving best model + print("Saving new best model") + _locals['self'].save(log_dir + 'best_model.pkl') + n_steps += 1 + return False + + + # Create log dir + log_dir = "/tmp/gym/" + os.makedirs(log_dir, exist_ok=True) + + # Create and wrap the environment + env = gym.make('LunarLanderContinuous-v2') + env = Monitor(env, log_dir, allow_early_resets=True) + env = DummyVecEnv([lambda: env]) + + # Add some param noise for exploration + param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.2) + model = DDPG(MlpPolicy, env, param_noise=param_noise, memory_limit=int(1e6), verbose=0) + # Train the agent + model.learn(total_timesteps=200000, callback=callback) + +Atari Games +----------- + +.. figure:: ../_static/img/breakout.gif + + Trained A2C agent on Breakout + +.. 
figure:: https://cdn-images-1.medium.com/max/960/1*UHYJE7lF8IDZS_U5SsAFUQ.gif + + Pong Environment + + +Training a RL agent on Atari games is straightforward thanks to ``make_atari_env`` helper function. +It will do `all the preprocessing `_ +and multiprocessing for you. + +.. image:: ../_static/img/try_it.png + :scale: 30 % + :target: https://colab.research.google.com/drive/1iYK11yDzOOqnrXi1Sfjm1iekZr4cxLaN + + +.. code-block:: python + + from stable_baselines.common.cmd_util import make_atari_env + from stable_baselines.common.policies import CnnPolicy + from stable_baselines.common.vec_env import VecFrameStack + from stable_baselines import ACER + + # There already exists an environment generator + # that will make and wrap atari environments correctly. + # Here we are also multiprocessing training (num_env=4 => 4 processes) + env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0) + # Frame-stacking with 4 frames + env = VecFrameStack(env, n_stack=4) + + model = ACER(CnnPolicy, env, verbose=1) + model.learn(total_timesteps=25000) + + obs = env.reset() + while True: + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + + +Mujoco: Normalizing input features +---------------------------------- + +Normalizing input features may be essential to successful training of an RL agent +(by default, images are scaled but not other types of input), +for instance when training on `Mujoco `_. For that, a wrapper exists and +will compute a running average and standard deviation of input features (it can do the same for rewards). + +.. note:: + We cannot provide a notebook for this example + because Mujoco is a proprietary engine and requires a license. + + +.. 
code-block:: python + + import gym + + from stable_baselines.common.policies import MlpPolicy + from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize + from stable_baselines import PPO2 + + env = DummyVecEnv([lambda: gym.make("Reacher-v2")]) + # Automatically normalize the input features + env = VecNormalize(env, norm_obs=True, norm_reward=False, + clip_obs=10.) + + model = PPO2(MlpPolicy, env) + model.learn(total_timesteps=2000) + + # Don't forget to save the running average when saving the agent + log_dir = "/tmp/" + model.save(log_dir + "ppo_reacher") + env.save_running_average(log_dir) + + +Custom Policy Network +--------------------- + +Stable baselines provides default policy networks for images (CNNPolicies) +and other type of inputs (MlpPolicies). +However, you can also easily define a custom architecture for the policy network: + +.. code-block:: python + + import gym + + from stable_baselines.common.policies import FeedForwardPolicy + from stable_baselines.common.vec_env import DummyVecEnv + from stable_baselines import A2C + + # Custom MLP policy of three layers of size 128 each + class CustomPolicy(FeedForwardPolicy): + def __init__(self, *args, **kwargs): + super(CustomPolicy, self).__init__(*args, **kwargs, + layers=[128, 128, 128], + feature_extraction="mlp") + + # Create and wrap the environment + env = gym.make('LunarLander-v2') + env = DummyVecEnv([lambda: env]) + + model = A2C(CustomPolicy, env, verbose=1) + # Train the agent + model.learn(total_timesteps=100000) + + +Continual Learning +------------------ + +You can also move from learning on one environment to another for `continual learning `_ +(PPO2 on ``DemonAttack-v0``, then transferred on ``SpaceInvaders-v0``): + +.. 
code-block:: python + + from stable_baselines.common.cmd_util import make_atari_env + from stable_baselines.common.policies import CnnPolicy + from stable_baselines import PPO2 + + # There already exists an environment generator + # that will make and wrap atari environments correctly + env = make_atari_env('DemonAttackNoFrameskip-v4', num_env=8, seed=0) + + model = PPO2(CnnPolicy, env, verbose=1) + model.learn(total_timesteps=10000) + + obs = env.reset() + for i in range(1000): + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + + # The number of environments must be identical when changing environments + env = make_atari_env('SpaceInvadersNoFrameskip-v4', num_env=8, seed=0) + + # change env + model.set_env(env) + model.learn(total_timesteps=10000) + + obs = env.reset() + while True: + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + + +Bonus: Make a GIF of a Trained Agent +------------------------------------ + +.. note:: + For Atari games, you need to use a screen recorder such as `Kazam `_. + And then convert the video using `ffmpeg `_ + +.. code-block:: python + + import imageio + import numpy as np + + from stable_baselines.common.policies import MlpPolicy + from stable_baselines import A2C + + model = A2C(MlpPolicy, "LunarLander-v2").learn(100000) + + images = [] + obs = model.env.reset() + img = model.env.render(mode='rgb_array') + for i in range(350): + images.append(img) + action, _ = model.predict(obs) + obs, _, _ ,_ = model.env.step(action) + img = model.env.render(mode='rgb_array') + + imageio.mimsave('lander_a2c.gif', [np.array(img[0]) for i, img in enumerate(images) if i%2 == 0], fps=29) diff --git a/docs/guide/install.rst b/docs/guide/install.rst new file mode 100644 index 0000000000..fd35bb0d25 --- /dev/null +++ b/docs/guide/install.rst @@ -0,0 +1,48 @@ +.. 
_install: + +============ +Installation +============ + +Prerequisites +------------- + +Baselines requires python3 (>=3.5) with the development headers. You'll +also need system packages CMake, OpenMPI and zlib. Those can be +installed as follows: + +Ubuntu +~~~~~~ + +.. code-block:: bash + + sudo apt-get update && sudo apt-get install cmake libopenmpi-dev python3-dev zlib1g-dev + +Mac OS X +~~~~~~~~ + +Installation of system packages on Mac requires `Homebrew`_. With +Homebrew installed, run the following: + +.. code-block:: bash + + brew install cmake openmpi + +.. _Homebrew: https://brew.sh + + +Stable Release +-------------- + +.. code-block:: bash + + pip install stable-baselines + + +Bleeding-edge version +--------------------- + +.. code-block:: bash + + git clone https://github.com/hill-a/stable-baselines && cd stable-baselines + pip install -e . diff --git a/docs/guide/quickstart.rst b/docs/guide/quickstart.rst new file mode 100644 index 0000000000..c7d478d7a0 --- /dev/null +++ b/docs/guide/quickstart.rst @@ -0,0 +1,45 @@ +.. _quickstart: + +=============== +Getting Started +=============== + +Most of the library tries to follow a sklearn-like syntax for the Reinforcement Learning algorithms. + +Here is a quick example of how to train and run PPO2 on a cartpole environment: + +.. code-block:: python + + import gym + + from stable_baselines.common.policies import MlpPolicy + from stable_baselines.common.vec_env import DummyVecEnv + from stable_baselines import PPO2 + + env = gym.make('CartPole-v1') + env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized environment to run + + model = PPO2(MlpPolicy, env, verbose=1) + model.learn(total_timesteps=10000) + + obs = env.reset() + for i in range(1000): + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + + +Or just train a model with a one liner if +`the environment is registered in Gym `_: + +.. 
code-block:: python + + from stable_baselines.common.policies import MlpPolicy + from stable_baselines import PPO2 + + model = PPO2(MlpPolicy, 'CartPole-v1').learn(10000) + + +.. figure:: https://cdn-images-1.medium.com/max/960/1*R_VMmdgKAY0EDhEjHVelzw.gif + + Define and train an RL agent in one line of code! diff --git a/docs/guide/vec_envs.rst b/docs/guide/vec_envs.rst new file mode 100644 index 0000000000..01c5fe2985 --- /dev/null +++ b/docs/guide/vec_envs.rst @@ -0,0 +1,48 @@ +.. _vec_env: + +.. automodule:: stable_baselines.common.vec_env + +Vectorized Environments +======================= + +Vectorized Environments are a way to multiprocess training. Instead of training an RL agent +on 1 environment, they allow you to train it on `n` environments using `n` processes. +Because of that, `actions` passed to the environment are now a vector (of dimension `n`). It is the same for `observations`, +`rewards` and end of episode signals (`dones`). + + +.. note:: + + Vectorized environments are required when using wrappers for frame-stacking or normalization. + +.. note:: + + When using vectorized environments, the environments are automatically reset at the end of each episode. + +DummyVecEnv +----------- + +.. autoclass:: DummyVecEnv + :members: + +SubprocVecEnv +------------- + +.. autoclass:: SubprocVecEnv + :members: + +Wrappers +-------- + +VecFrameStack +~~~~~~~~~~~~~ + +.. autoclass:: VecFrameStack + :members: + + +VecNormalize +~~~~~~~~~~~~ + +.. autoclass:: VecNormalize + :members: diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000000..90d4573480 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,92 @@ +.. Stable Baselines documentation master file, created by + sphinx-quickstart on Sat Aug 25 10:33:54 2018. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to Stable Baselines docs! 
+============================================ + +`Stable Baselines `_ is a set of improved implementations +of reinforcement learning algorithms based on OpenAI `Baselines `_. + +Github repository: https://github.com/hill-a/stable-baselines + +You can read a detailed presentation of Stable Baselines in the +Medium article: `link `_ + + +.. Example of internal link: :ref:`ppo2` + +Main differences with OpenAI Baselines +-------------------------------------- + +This toolset is a fork of OpenAI Baselines, with a major structural refactoring, and code cleanups: + +- Unified structure for all algorithms +- PEP8 compliant (unified code style) +- Documented functions and classes +- More tests & more code coverage + + +.. toctree:: + :maxdepth: 2 + :caption: User Guide + + guide/install + guide/quickstart + guide/algos + guide/examples + guide/vec_envs + + +.. toctree:: + :maxdepth: 1 + :caption: RL Algorithms + + modules/base + modules/policies + modules/a2c + modules/acer + modules/acktr + modules/ddpg + modules/dqn + modules/gail + modules/her + modules/ppo1 + modules/ppo2 + modules/trpo + +.. toctree:: + :maxdepth: 1 + :caption: Misc + + changelog + +Citing Stable Baselines +----------------------- +To cite this project in publications: + +.. code-block:: bibtex + + @misc{stable-baselines, + author = {Hill, Ashley and Raffin, Antonin and Traore, Rene and Dhariwal, Prafulla and Hesse, Christopher and Klimov, Oleg and Nichol, Alex and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai}, + title = {Stable Baselines}, + year = {2018}, + publisher = {GitHub}, + journal = {GitHub repository}, + howpublished = {\url{https://github.com/hill-a/stable-baselines}}, + } + +Contributing +------------ + +To any interested in making the baselines better, there is still some documentation/improvements that needs to be done. 
+If you want to contribute, please open an issue first and then propose your pull request on Github at +https://github.com/hill-a/stable-baselines. + +Indices and tables +------------------- + +* :ref:`genindex` +* :ref:`search` +* :ref:`modindex` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000000..22b5fff4ee --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build +set SPHINXPROJ=StableBaselines + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/modules/a2c.rst b/docs/modules/a2c.rst new file mode 100644 index 0000000000..6b6a17fa34 --- /dev/null +++ b/docs/modules/a2c.rst @@ -0,0 +1,77 @@ +.. _a2c: + +.. automodule:: stable_baselines.a2c + + +A2C +==== + +A synchronous, deterministic variant of `Asynchronous Advantage Actor Critic (A3C) `_. +It uses multiple workers to avoid the use of a replay buffer. + + +Notes +----- + +- Original paper: https://arxiv.org/abs/1602.01783 +- OpenAI blog post: https://blog.openai.com/openai-baselines-ppo/ +- ``python -m stable_baselines.ppo2.run_atari`` runs the algorithm for 40M + frames = 10M timesteps on an Atari game. See help (``-h``) for more + options. +- ``python -m stable_baselines.ppo2.run_mujoco`` runs the algorithm for 1M + frames on a Mujoco environment. 
+ +Can I use? +---------- + +- Recurrent policies: ✔️ +- Multi processing: ✔️ +- Gym spaces: + + +============= ====== =========== +Space Action Observation +============= ====== =========== +Discrete ✔️ ✔️ +Box ✔️ ✔️ +MultiDiscrete ✔️ ✔️ +MultiBinary ✔️ ✔️ +============= ====== =========== + + +Example +------- + +Train an A2C agent on `CartPole-v1` using 4 processes. + +.. code-block:: python + + import gym + + from stable_baselines.common.policies import MlpPolicy + from stable_baselines.common.vec_env import SubprocVecEnv + from stable_baselines import A2C + + # multiprocess environment + n_cpu = 4 + env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)]) + + model = A2C(MlpPolicy, env, verbose=1) + model.learn(total_timesteps=25000) + model.save("a2c_cartpole") + + del model # remove to demonstrate saving and loading + + model = A2C.load("a2c_cartpole") + + obs = env.reset() + while True: + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + +Parameters +---------- + +.. autoclass:: A2C + :members: diff --git a/docs/modules/acer.rst b/docs/modules/acer.rst new file mode 100644 index 0000000000..3c5aceefb3 --- /dev/null +++ b/docs/modules/acer.rst @@ -0,0 +1,74 @@ +.. _acer: + +.. automodule:: stable_baselines.acer + + +ACER +==== + + `Sample Efficient Actor-Critic with Experience Replay (ACER) <https://arxiv.org/abs/1611.01224>`_ combines + several ideas of previous algorithms: it uses multiple workers (as A2C), implements a replay buffer (as in DQN), + uses Retrace for Q-value estimation, importance sampling and a trust region. + + +Notes +----- + +- Original paper: https://arxiv.org/abs/1611.01224 +- ``python -m stable_baselines.acer.run_atari`` runs the algorithm for 40M frames = 10M timesteps on an Atari game. + See help (``-h``) for more options. + +Can I use? 
+---------- + +- Recurrent policies: ✔️ +- Multi processing: ✔️ +- Gym spaces: + + +============= ====== =========== +Space Action Observation +============= ====== =========== +Discrete ✔️ ✔️ +Box ❌ ✔️ +MultiDiscrete ❌ ✔️ +MultiBinary ❌ ✔️ +============= ====== =========== + + +Example +------- + +.. code-block:: python + + import gym + + from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, \ + CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy + from stable_baselines.common.vec_env import SubprocVecEnv + from stable_baselines import ACER + + # multiprocess environment + n_cpu = 4 + env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)]) + + model = ACER(MlpPolicy, env, verbose=1) + model.learn(total_timesteps=25000) + model.save("acer_cartpole") + + del model # remove to demonstrate saving and loading + + model = ACER.load("acer_cartpole") + + obs = env.reset() + while True: + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + + +Parameters +---------- + +.. autoclass:: ACER + :members: diff --git a/docs/modules/acktr.rst b/docs/modules/acktr.rst new file mode 100644 index 0000000000..fd1954f095 --- /dev/null +++ b/docs/modules/acktr.rst @@ -0,0 +1,74 @@ +.. _acktr: + +.. automodule:: stable_baselines.acktr + + +ACKTR +===== + +`Actor Critic using Kronecker-Factored Trust Region (ACKTR) <https://arxiv.org/abs/1708.05144>`_ uses +Kronecker-factored approximate curvature (K-FAC) for trust region optimization. + + +Notes +----- + +- Original paper: https://arxiv.org/abs/1708.05144 +- Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ +- ``python -m stable_baselines.acktr.run_atari`` runs the algorithm for 40M frames = 10M timesteps on an Atari game. + See help (``-h``) for more options. + +Can I use? 
+---------- + +- Recurrent policies: ✔️ +- Multi processing: ✔️ +- Gym spaces: + + +============= ====== =========== +Space Action Observation +============= ====== =========== +Discrete ✔️ ✔️ +Box ❌ ✔️ +MultiDiscrete ❌ ✔️ +MultiBinary ❌ ✔️ +============= ====== =========== + + +Example +------- + +.. code-block:: python + + import gym + + from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, \ + CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy + from stable_baselines.common.vec_env import SubprocVecEnv + from stable_baselines import ACKTR + + # multiprocess environment + n_cpu = 4 + env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)]) + + model = ACKTR(MlpPolicy, env, verbose=1) + model.learn(total_timesteps=25000) + model.save("acktr_cartpole") + + del model # remove to demonstrate saving and loading + + model = ACKTR.load("acktr_cartpole") + + obs = env.reset() + while True: + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + + +Parameters +---------- + +.. autoclass:: ACKTR + :members: diff --git a/docs/modules/base.rst b/docs/modules/base.rst new file mode 100644 index 0000000000..6e35e6ac33 --- /dev/null +++ b/docs/modules/base.rst @@ -0,0 +1,12 @@ +.. _base_algo: + +.. automodule:: stable_baselines.common.base_class + + +Base RL Class +============= + +Common interface for all the RL algorithms. + +.. autoclass:: BaseRLModel + :members: diff --git a/docs/modules/ddpg.rst b/docs/modules/ddpg.rst new file mode 100644 index 0000000000..2a5ac94772 --- /dev/null +++ b/docs/modules/ddpg.rst @@ -0,0 +1,74 @@ +.. _ddpg: + +.. 
automodule:: stable_baselines.ddpg + + +DDPG +==== +`Deep Deterministic Policy Gradient (DDPG) <https://arxiv.org/abs/1509.02971>`_ + + +Notes +----- + +- Original paper: https://arxiv.org/abs/1509.02971 +- Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/ +- ``python -m stable_baselines.ddpg.main`` runs the algorithm for 1M frames = 10M timesteps + on a Mujoco environment. See help (``-h``) for more options. + +Can I use? +---------- + +- Recurrent policies: ❌ +- Multi processing: ❌ +- Gym spaces: + + +============= ====== =========== +Space Action Observation +============= ====== =========== +Discrete ❌ ✔️ +Box ✔️ ✔️ +MultiDiscrete ❌ ✔️ +MultiBinary ❌ ✔️ +============= ====== =========== + + +Example +------- + +.. code-block:: python + + import gym + + from stable_baselines.common.policies import MlpPolicy, CnnPolicy + from stable_baselines.common.vec_env import DummyVecEnv + from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec + from stable_baselines import DDPG + + env = gym.make('MountainCarContinuous-v0') + env = DummyVecEnv([lambda: env]) + + # the noise objects for DDPG + param_noise = None + action_noise = NormalActionNoise(mean=1, sigma=0) + + model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise) + model.learn(total_timesteps=25000) + model.save("ddpg_mountain") + + del model # remove to demonstrate saving and loading + + model = DDPG.load("ddpg_mountain") + + obs = env.reset() + while True: + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + +Parameters +---------- + +.. autoclass:: DDPG + :members: diff --git a/docs/modules/dqn.rst b/docs/modules/dqn.rst new file mode 100644 index 0000000000..9ec7569a9f --- /dev/null +++ b/docs/modules/dqn.rst @@ -0,0 +1,99 @@ +.. _dqn: + +.. automodule:: stable_baselines.deepq + + +DQN +=== + +`Deep Q Network (DQN) <https://arxiv.org/abs/1312.5602>`_ +and its extensions (Double-DQN, Dueling-DQN, Prioritized Experience Replay). 
+ +Notes +----- + +- Original paper: https://arxiv.org/abs/1312.5602 + + +Can I use? +---------- + +- Recurrent policies: ❌ +- Multi processing: ❌ +- Gym spaces: + + +============= ====== =========== +Space Action Observation +============= ====== =========== +Discrete ✔️ ✔️ +Box ❌ ✔️ +MultiDiscrete ❌ ✔️ +MultiBinary ❌ ✔️ +============= ====== =========== + + +Example +------- + +.. code-block:: python + + import gym + + from stable_baselines.common.vec_env import DummyVecEnv + from stable_baselines.deepq.models import mlp, cnn_to_mlp + from stable_baselines import DeepQ + + env = gym.make('CartPole-v1') + env = DummyVecEnv([lambda: env]) + + model = DeepQ(mlp(hiddens=[32]), env, verbose=1) + model.learn(total_timesteps=25000) + model.save("deepq_cartpole") + + del model # remove to demonstrate saving and loading + + model = DeepQ.load("deepq_cartpole") + + obs = env.reset() + while True: + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + + +With Atari: + +.. code-block:: python + + from stable_baselines.common.atari_wrappers import make_atari + from stable_baselines.deepq.models import mlp, cnn_to_mlp + from stable_baselines import DeepQ + + env = make_atari('BreakoutNoFrameskip-v4') + + # nature CNN for DeepQ + cnn_policy = cnn_to_mlp( + convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], + hiddens=[256], + dueling=True) + + model = DeepQ(cnn_policy, env, verbose=1) + model.learn(total_timesteps=25000) + model.save("deepq_breakout") + + del model # remove to demonstrate saving and loading + + model = DeepQ.load("deepq_breakout") + + obs = env.reset() + while True: + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + +Parameters +---------- + +.. autoclass:: DeepQ + :members: diff --git a/docs/modules/gail.rst b/docs/modules/gail.rst new file mode 100644 index 0000000000..66e8d43cad --- /dev/null +++ b/docs/modules/gail.rst @@ -0,0 +1,95 @@ +.. _gail: + +.. 
automodule:: stable_baselines.gail + + +GAIL +==== + +`Generative Adversarial Imitation Learning (GAIL) <https://arxiv.org/abs/1606.03476>`_ + + +Notes +----- + +- Original paper: https://arxiv.org/abs/1606.03476 + +If you want to train an imitation learning agent +------------------------------------------------ + +.. _step-1:-download-expert-data: + +Step 1: Download expert data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Download the expert data into ``./data``, `download link`_ + +.. _step-2:-run-gail: + +Step 2: Run GAIL +~~~~~~~~~~~~~~~~ + +Run with single thread: + +.. code:: bash + + python -m stable_baselines.gail.run_mujoco + +Run with multiple threads: + +.. code:: bash + + mpirun -np 16 python -m stable_baselines.gail.run_mujoco + +See help (``-h``) for more options. + +.. _in-case-you-want-to-run-behavior-cloning-(bc): + +**In case you want to run Behavior Cloning (BC)** + +.. code:: bash + + python -m stable_baselines.gail.behavior_clone + +See help (``-h``) for more options. + + +OpenAI Maintainers: + +- Yuan-Hong Liao, andrewliao11_at_gmail_dot_com +- Ryan Julian, ryanjulian_at_gmail_dot_com + +**Others** + +Thanks to the open-source projects: + +- @openai/imitation +- @carpedm20/deep-rl-tensorflow + +.. _download link: https://drive.google.com/drive/folders/1h3H4AY_ZBx08hz-Ct0Nxxus-V1melu1U?usp=sharing + + + +Can I use? +---------- + +- Recurrent policies: ✔️ +- Multi processing: ✔️ (using MPI) +- Gym spaces: + + +============= ====== =========== +Space Action Observation +============= ====== =========== +Discrete ❌ ✔️ +Box ✔️ ✔️ +MultiDiscrete ❌ ✔️ +MultiBinary ❌ ✔️ +============= ====== =========== + + +Parameters +---------- + +.. autoclass:: GAIL + :members: diff --git a/docs/modules/her.rst b/docs/modules/her.rst new file mode 100644 index 0000000000..bbbdcc9552 --- /dev/null +++ b/docs/modules/her.rst @@ -0,0 +1,67 @@ +.. _her: + +.. automodule:: stable_baselines.her + + +HER +==== + +`Hindsight Experience Replay (HER) `_ + +.. warning:: + + HER is not refactored yet. 
We are looking for contributors to help us. + +How to use Hindsight Experience Replay +-------------------------------------- + +Getting started +~~~~~~~~~~~~~~~ + +Training an agent is very simple: + +.. code:: bash + + python -m stable_baselines.her.experiment.train + +This will train a DDPG+HER agent on the ``FetchReach`` environment. You +should see the success rate go up quickly to ``1.0``, which means that +the agent achieves the desired goal in 100% of the cases. The training +script logs other diagnostics as well and pickles the best policy so far +(w.r.t. to its test success rate), the latest policy, and, if enabled, a +history of policies every K epochs. + +To inspect what the agent has learned, use the play script: + +.. code:: bash + + python -m stable_baselines.her.experiment.play /path/to/an/experiment/policy_best.pkl + +You can try it right now with the results of the training step (the +script prints out the path for you). This should visualize the current +policy for 10 episodes and will also print statistics. + +Reproducing results +~~~~~~~~~~~~~~~~~~~ + +In order to reproduce the results from `Plappert et al. (2018)`_, run +the following command: + +.. code:: bash + + python -m stable_baselines.her.experiment.train --num_cpu 19 + +This will require a machine with sufficient amount of physical CPU +cores. In our experiments, we used `Azure's D15v2 instances`_, which +have 20 physical cores. We only scheduled the experiment on 19 of those +to leave some head-room on the system. + +.. _Plappert et al. (2018): https://arxiv.org/abs/1802.09464 +.. _Azure's D15v2 instances: https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes + + +Parameters +---------- + +.. autoclass:: HER + :members: diff --git a/docs/modules/policies.rst b/docs/modules/policies.rst new file mode 100644 index 0000000000..7f202ab02a --- /dev/null +++ b/docs/modules/policies.rst @@ -0,0 +1,44 @@ +.. _policies: + +.. 
automodule:: stable_baselines.common.policies + +Policy Networks +=============== + + +Base Classes +------------ + +.. autoclass:: ActorCriticPolicy + :members: + +.. autoclass:: FeedForwardPolicy + :members: + +.. autoclass:: LstmPolicy + :members: + +MLP Policies +------------ + +.. autoclass:: MlpPolicy + :members: + +.. autoclass:: MlpLstmPolicy + :members: + +.. autoclass:: MlpLnLstmPolicy + :members: + + +CNN Policies +------------ + +.. autoclass:: CnnPolicy + :members: + +.. autoclass:: CnnLstmPolicy + :members: + +.. autoclass:: CnnLnLstmPolicy + :members: diff --git a/docs/modules/ppo1.rst b/docs/modules/ppo1.rst new file mode 100644 index 0000000000..ad2dfe7731 --- /dev/null +++ b/docs/modules/ppo1.rst @@ -0,0 +1,82 @@ +.. _ppo1: + +.. automodule:: stable_baselines.ppo1 + + +PPO1 +==== + +The `Proximal Policy Optimization <https://arxiv.org/abs/1707.06347>`_ algorithm combines ideas from A2C (having multiple workers) +and TRPO (it uses a trust region to improve the actor). + +The main idea is that after an update, the new policy should not be too far from the `old` policy. +For that, PPO uses clipping to avoid too large an update. + +.. note:: + + PPO2 is the implementation of OpenAI made for GPU. For multiprocessing, it uses vectorized environments + compared to PPO1 which uses MPI. + +Notes +----- + +- Original paper: https://arxiv.org/abs/1707.06347 +- OpenAI blog post: https://blog.openai.com/openai-baselines-ppo/ +- ``mpirun -np 8 python -m stable_baselines.ppo1.run_atari`` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (``-h``) for more options. +- ``python -m stable_baselines.ppo1.run_mujoco`` runs the algorithm for 1M frames on a Mujoco environment. +- Train mujoco 3d humanoid (with optimal-ish hyperparameters): ``mpirun -np 16 python -m stable_baselines.ppo1.run_humanoid --model-path=/path/to/model`` +- Render the 3d humanoid: ``python -m stable_baselines.ppo1.run_humanoid --play --model-path=/path/to/model`` + +Can I use? 
+---------- + +- Recurrent policies: ✔️ +- Multi processing: ✔️ (using MPI) +- Gym spaces: + + +============= ====== =========== +Space Action Observation +============= ====== =========== +Discrete ✔️ ✔️ +Box ✔️ ✔️ +MultiDiscrete ✔️ ✔️ +MultiBinary ✔️ ✔️ +============= ====== =========== + + +Example +------- + +.. code-block:: python + + import gym + + from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, \ + CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy + from stable_baselines.common.vec_env import DummyVecEnv + from stable_baselines import PPO1 + + env = gym.make('CartPole-v1') + env = DummyVecEnv([lambda: env]) + + model = PPO1(MlpPolicy, env, verbose=1) + model.learn(total_timesteps=25000) + model.save("ppo1_cartpole") + + del model # remove to demonstrate saving and loading + + model = PPO1.load("ppo1_cartpole") + + obs = env.reset() + while True: + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + + +Parameters +---------- + +.. autoclass:: PPO1 + :members: diff --git a/docs/modules/ppo2.rst b/docs/modules/ppo2.rst new file mode 100644 index 0000000000..f8e36aeac0 --- /dev/null +++ b/docs/modules/ppo2.rst @@ -0,0 +1,89 @@ +.. _ppo2: + +.. automodule:: stable_baselines.ppo2 + +PPO2 +==== + +The `Proximal Policy Optimization <https://arxiv.org/abs/1707.06347>`_ algorithm combines ideas from A2C (having multiple workers) +and TRPO (it uses a trust region to improve the actor). + +The main idea is that after an update, the new policy should not be too far from the `old` policy. +For that, PPO uses clipping to avoid too large an update. + +.. note:: + + PPO2 is the implementation of OpenAI made for GPU. For multiprocessing, it uses vectorized environments + compared to PPO1 which uses MPI. + +.. note:: + + PPO2 contains several modifications from the original algorithm not documented + by OpenAI: value function is also clipped and advantages are normalized. 
+ + +Notes +----- + +- Original paper: https://arxiv.org/abs/1707.06347 +- OpenAI blog post: https://blog.openai.com/openai-baselines-ppo/ +- ``python -m stable_baselines.ppo2.run_atari`` runs the algorithm for 40M + frames = 10M timesteps on an Atari game. See help (``-h``) for more + options. +- ``python -m stable_baselines.ppo2.run_mujoco`` runs the algorithm for 1M + frames on a Mujoco environment. + +Can I use? +---------- + +- Recurrent policies: ✔️ +- Multi processing: ✔️ +- Gym spaces: + + +============= ====== =========== +Space Action Observation +============= ====== =========== +Discrete ✔️ ✔️ +Box ✔️ ✔️ +MultiDiscrete ✔️ ✔️ +MultiBinary ✔️ ✔️ +============= ====== =========== + +Example +------- + +Train a PPO agent on `CartPole-v1` using 4 processes. + +.. code-block:: python + + import gym + + from stable_baselines.common.policies import MlpPolicy + from stable_baselines.common.vec_env import SubprocVecEnv + from stable_baselines import PPO2 + + # multiprocess environment + n_cpu = 4 + env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)]) + + model = PPO2(MlpPolicy, env, verbose=1) + model.learn(total_timesteps=25000) + model.save("ppo2_cartpole") + + del model # remove to demonstrate saving and loading + + model = PPO2.load("ppo2_cartpole") + + # Enjoy trained agent + obs = env.reset() + while True: + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + +Parameters +---------- + +.. autoclass:: PPO2 + :members: diff --git a/docs/modules/trpo.rst b/docs/modules/trpo.rst new file mode 100644 index 0000000000..bc3bc2c25a --- /dev/null +++ b/docs/modules/trpo.rst @@ -0,0 +1,73 @@ +.. _trpo: + +.. automodule:: stable_baselines.trpo_mpi + + +TRPO +==== + +`Trust Region Policy Optimization (TRPO) <https://arxiv.org/abs/1502.05477>`_ +is an iterative approach for optimizing policies with guaranteed monotonic improvement. 
+ +Notes +----- + +- Original paper: https://arxiv.org/abs/1502.05477 +- OpenAI blog post: https://blog.openai.com/openai-baselines-ppo/ +- ``mpirun -np 16 python -m stable_baselines.trpo_mpi.run_atari`` runs the algorithm + for 40M frames = 10M timesteps on an Atari game. See help (``-h``) for more options. +- ``python -m stable_baselines.trpo_mpi.run_mujoco`` runs the algorithm for 1M timesteps on a Mujoco environment. + +Can I use? +---------- + +- Recurrent policies: ✔️ +- Multi processing: ✔️ (using MPI) +- Gym spaces: + + +============= ====== =========== +Space Action Observation +============= ====== =========== +Discrete ✔️ ✔️ +Box ✔️ ✔️ +MultiDiscrete ✔️ ✔️ +MultiBinary ✔️ ✔️ +============= ====== =========== + + +Example +------- + +.. code-block:: python + + import gym + + from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, \ + CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy + from stable_baselines.common.vec_env import DummyVecEnv + from stable_baselines import TRPO + + env = gym.make('CartPole-v1') + env = DummyVecEnv([lambda: env]) + + model = TRPO(MlpPolicy, env, verbose=1) + model.learn(total_timesteps=25000) + model.save("trpo_cartpole") + + del model # remove to demonstrate saving and loading + + model = TRPO.load("trpo_cartpole") + + obs = env.reset() + while True: + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() + + +Parameters +---------- + +.. autoclass:: TRPO + :members: diff --git a/run_tests.sh b/run_tests.sh new file mode 100755 index 0000000000..bbb432ba68 --- /dev/null +++ b/run_tests.sh @@ -0,0 +1,2 @@ +#!/bin/bash +python -m pytest --cov-config .coveragerc --cov-report html --cov-report term --cov=. --rungpu diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000..eedd265bc1 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,3 @@ +[metadata] +# This includes the license file in the wheel. 
+license_file = LICENSE \ No newline at end of file diff --git a/setup.py b/setup.py index bf8badcf60..b074355ca6 100644 --- a/setup.py +++ b/setup.py @@ -5,10 +5,73 @@ print('This Python is only compatible with Python 3, but you are running ' 'Python {}. The installation will likely fail.'.format(sys.version_info.major)) +version = "1.0.7" -setup(name='baselines', +long_description = """ +[![Build Status](https://travis-ci.com/hill-a/stable-baselines.svg?branch=master)](https://travis-ci.com/hill-a/stable-baselines) [![Documentation Status](https://readthedocs.org/projects/stable-baselines/badge/?version=latest)](https://stable-baselines.readthedocs.io/en/docs/?badge=latest) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/3bcb4cd6d76a4270acb16b5fe6dd9efa)](https://www.codacy.com/app/baselines_janitors/stable-baselines?utm_source=github.com&utm_medium=referral&utm_content=hill-a/stable-baselines&utm_campaign=Badge_Grade) [![Codacy Badge](https://api.codacy.com/project/badge/Coverage/3bcb4cd6d76a4270acb16b5fe6dd9efa)](https://www.codacy.com/app/baselines_janitors/stable-baselines?utm_source=github.com&utm_medium=referral&utm_content=hill-a/stable-baselines&utm_campaign=Badge_Coverage) + +# Stable Baselines + +Stable Baselines is a set of improved implementations of reinforcement learning algorithms based on OpenAI [Baselines](https://github.com/openai/baselines/). + +These algorithms will make it easier for the research community and industry to replicate, refine, and identify new ideas, and will create good baselines to build projects on top of. We expect these tools will be used as a base around which new ideas can be added, and as a tool for comparing a new approach against existing ones. We also hope that the simplicity of these tools will allow beginners to experiment with a more advanced toolset, without being buried in implementation details. 
+ +## Main differences with OpenAI Baselines +This toolset is a fork of OpenAI Baselines, with a major structural refactoring, and code cleanups: + +- Unified structure for all algorithms +- PEP8 compliant (unified code style) +- Documented functions and classes +- More tests & more code coverage + +## Links + +Repository: +https://github.com/hill-a/stable-baselines + +Medium article: +https://medium.com/@araffin/df87c4b2fc82 + +## Quick example + +Most of the library tries to follow a sklearn-like syntax for the Reinforcement Learning algorithms using Gym. + +Here is a quick example of how to train and run PPO2 on a cartpole environment: + +```python +import gym + +from stable_baselines.common.policies import MlpPolicy +from stable_baselines.common.vec_env import DummyVecEnv +from stable_baselines import PPO2 + +env = gym.make('CartPole-v1') +env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized environment to run + +model = PPO2(MlpPolicy, env, verbose=1) +model.learn(total_timesteps=10000) + +obs = env.reset() +for i in range(1000): + action, _states = model.predict(obs) + obs, rewards, dones, info = env.step(action) + env.render() +``` + +Or just train a model with a one liner if [the environment is registed in Gym](https://github.com/openai/gym/wiki/Environments): + +```python +from stable_baselines.common.policies import MlpPolicy +from stable_baselines import PPO2 + +model = PPO2(MlpPolicy, 'CartPole-v1').learn(10000) +``` + +""" + +setup(name='stable_baselines', packages=[package for package in find_packages() - if package.startswith('baselines')], + if package.startswith('stable_baselines')], install_requires=[ 'gym[mujoco,atari,classic_control,robotics]', 'scipy', @@ -19,12 +82,28 @@ 'progressbar2', 'mpi4py', 'cloudpickle', - 'tensorflow>=1.4.0', + 'tensorflow>=1.5.0', 'click', - 'opencv-python' + 'opencv-python', + 'numpy', + 'pandas', + 'pytest', + 'matplotlib', + 'seaborn', + 'glob2' ], - description='OpenAI baselines: high quality 
implementations of reinforcement learning algorithms', - author='OpenAI', - url='https://github.com/openai/baselines', - author_email='gym@openai.com', - version='0.1.5') + description='A fork of OpenAI Baselines, implementations of reinforcement learning algorithms.', + author='Ashley Hill', + url='https://github.com/hill-a/stable-baselines', + author_email='ashley.hill@u-psud.fr', + keywords="reinforcement-learning-algorithms reinforcement-learning machine-learning " + "gym openai baselines toolbox python data-science", + license="MIT", + long_description=long_description, + version=version, + ) + +# python setup.py sdist +# python setup.py bdist_wheel +# twine upload --repository-url https://test.pypi.org/legacy/ dist/* +# twine upload dist/* diff --git a/stable_baselines/__init__.py b/stable_baselines/__init__.py new file mode 100644 index 0000000000..903184bab4 --- /dev/null +++ b/stable_baselines/__init__.py @@ -0,0 +1,11 @@ +from stable_baselines.a2c import A2C +from stable_baselines.acer import ACER +from stable_baselines.acktr import ACKTR +from stable_baselines.ddpg import DDPG +from stable_baselines.deepq import DeepQ +from stable_baselines.gail import GAIL +from stable_baselines.ppo1 import PPO1 +from stable_baselines.ppo2 import PPO2 +from stable_baselines.trpo_mpi import TRPO + +__version__ = "1.0.6.a0" diff --git a/stable_baselines/a2c/__init__.py b/stable_baselines/a2c/__init__.py new file mode 100644 index 0000000000..d68abe7eb5 --- /dev/null +++ b/stable_baselines/a2c/__init__.py @@ -0,0 +1 @@ +from stable_baselines.a2c.a2c import A2C diff --git a/stable_baselines/a2c/a2c.py b/stable_baselines/a2c/a2c.py new file mode 100644 index 0000000000..966e23a5ee --- /dev/null +++ b/stable_baselines/a2c/a2c.py @@ -0,0 +1,300 @@ +import time + +import numpy as np +import tensorflow as tf + +from stable_baselines import logger +from stable_baselines.common import explained_variance, tf_util, BaseRLModel, SetVerbosity +from stable_baselines.common.policies 
import LstmPolicy +from stable_baselines.common.runners import AbstractEnvRunner +from stable_baselines.a2c.utils import discount_with_dones, Scheduler, find_trainable_variables, mse + + +class A2C(BaseRLModel): + """ + The A2C (Advantage Actor Critic) model class, https://arxiv.org/abs/1602.01783 + + :param policy: (ActorCriticPolicy) The policy model to use (MLP, CNN, LSTM, ...) + :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) + :param gamma: (float) Discount factor + :param n_steps: (int) The number of steps to run for each environment + :param vf_coef: (float) Value function coefficient for the loss calculation + :param ent_coef: (float) Entropy coefficient for the loss caculation + :param max_grad_norm: (float) The maximum value for the gradient clipping + :param learning_rate: (float) The learning rate + :param alpha: (float) RMS prop optimizer decay + :param epsilon: (float) RMS prop optimizer epsilon + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance + (used only for loading) + """ + + def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.01, max_grad_norm=0.5, + learning_rate=7e-4, alpha=0.99, epsilon=1e-5, lr_schedule='linear', verbose=0, _init_setup_model=True): + super(A2C, self).__init__(policy=policy, env=env, requires_vec_env=True, verbose=verbose) + + self.n_steps = n_steps + self.gamma = gamma + self.vf_coef = vf_coef + self.ent_coef = ent_coef + self.max_grad_norm = max_grad_norm + self.alpha = alpha + self.epsilon = epsilon + self.lr_schedule = lr_schedule + self.learning_rate = learning_rate + + self.graph = None + self.sess = None + self.learning_rate_ph = 
None + self.n_batch = None + self.actions_ph = None + self.advs_ph = None + self.rewards_ph = None + self.pg_loss = None + self.vf_loss = None + self.entropy = None + self.params = None + self.apply_backprop = None + self.train_model = None + self.step_model = None + self.step = None + self.proba_step = None + self.value = None + self.initial_state = None + self.learning_rate_schedule = None + + # if we are loading, it is possible the environment is not known, however the obs and action space are known + if _init_setup_model: + self.setup_model() + + def setup_model(self): + with SetVerbosity(self.verbose): + + self.graph = tf.Graph() + with self.graph.as_default(): + self.sess = tf_util.make_session(graph=self.graph) + + self.n_batch = self.n_envs * self.n_steps + + n_batch_step = None + n_batch_train = None + if issubclass(self.policy, LstmPolicy): + n_batch_step = self.n_envs + n_batch_train = self.n_envs * self.n_steps + + step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, + n_batch_step, reuse=False) + train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, + self.n_steps, n_batch_train, reuse=True) + + self.actions_ph = train_model.pdtype.sample_placeholder([None]) + self.advs_ph = tf.placeholder(tf.float32, [None]) + self.rewards_ph = tf.placeholder(tf.float32, [None]) + self.learning_rate_ph = tf.placeholder(tf.float32, []) + + neglogpac = train_model.proba_distribution.neglogp(self.actions_ph) + self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy()) + self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac) + self.vf_loss = mse(tf.squeeze(train_model.value_fn), self.rewards_ph) + loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef + + self.params = find_trainable_variables("model") + grads = tf.gradients(loss, self.params) + if self.max_grad_norm is not None: + grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm) + grads = 
list(zip(grads, self.params)) + trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha, + epsilon=self.epsilon) + self.apply_backprop = trainer.apply_gradients(grads) + + self.train_model = train_model + self.step_model = step_model + self.step = step_model.step + self.proba_step = step_model.proba_step + self.value = step_model.value + self.initial_state = step_model.initial_state + tf.global_variables_initializer().run(session=self.sess) + + def _train_step(self, obs, states, rewards, masks, actions, values): + """ + applies a training step to the model + + :param obs: ([float]) The input observations + :param states: ([float]) The states (used for reccurent policies) + :param rewards: ([float]) The rewards from the environment + :param masks: ([bool]) Whether or not the episode is over (used for reccurent policies) + :param actions: ([float]) The actions taken + :param values: ([float]) The logits values + :return: (float, float, float) policy loss, value loss, policy entropy + """ + advs = rewards - values + cur_lr = None + for _ in range(len(obs)): + cur_lr = self.learning_rate_schedule.value() + assert cur_lr is not None, "Error: the observation input array cannon be empty" + + td_map = {self.train_model.obs_ph: obs, self.actions_ph: actions, self.advs_ph: advs, + self.rewards_ph: rewards, self.learning_rate_ph: cur_lr} + if states is not None: + td_map[self.train_model.states_ph] = states + td_map[self.train_model.masks_ph] = masks + + policy_loss, value_loss, policy_entropy, _ = self.sess.run( + [self.pg_loss, self.vf_loss, self.entropy, self.apply_backprop], td_map) + return policy_loss, value_loss, policy_entropy + + def learn(self, total_timesteps, callback=None, seed=None, log_interval=100): + with SetVerbosity(self.verbose): + self._setup_learn(seed) + + self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, + schedule=self.lr_schedule) + + runner = A2CRunner(self.env, 
self, n_steps=self.n_steps, gamma=self.gamma) + + t_start = time.time() + for update in range(1, total_timesteps // self.n_batch + 1): + obs, states, rewards, masks, actions, values = runner.run() + _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values) + n_seconds = time.time() - t_start + fps = int((update * self.n_batch) / n_seconds) + + if callback is not None: + callback(locals(), globals()) + + if self.verbose >= 1 and (update % log_interval == 0 or update == 1): + explained_var = explained_variance(values, rewards) + logger.record_tabular("nupdates", update) + logger.record_tabular("total_timesteps", update * self.n_batch) + logger.record_tabular("fps", fps) + logger.record_tabular("policy_entropy", float(policy_entropy)) + logger.record_tabular("value_loss", float(value_loss)) + logger.record_tabular("explained_variance", float(explained_var)) + logger.dump_tabular() + + return self + + def predict(self, observation, state=None, mask=None): + if state is None: + state = self.initial_state + if mask is None: + mask = [False for _ in range(self.n_envs)] + observation = np.array(observation).reshape((-1,) + self.observation_space.shape) + + actions, _, states, _ = self.step(observation, state, mask) + return actions, states + + def action_probability(self, observation, state=None, mask=None): + if state is None: + state = self.initial_state + if mask is None: + mask = [False for _ in range(self.n_envs)] + observation = np.array(observation).reshape((-1,) + self.observation_space.shape) + + return self.proba_step(observation, state, mask) + + def save(self, save_path): + data = { + "gamma": self.gamma, + "n_steps": self.n_steps, + "vf_coef": self.vf_coef, + "ent_coef": self.ent_coef, + "max_grad_norm": self.max_grad_norm, + "learning_rate": self.learning_rate, + "alpha": self.alpha, + "epsilon": self.epsilon, + "lr_schedule": self.lr_schedule, + "verbose": self.verbose, + "policy": self.policy, + "observation_space": 
self.observation_space, + "action_space": self.action_space, + "n_envs": self.n_envs, + "_vectorize_action": self._vectorize_action + } + + params = self.sess.run(self.params) + + self._save_to_file(save_path, data=data, params=params) + + @classmethod + def load(cls, load_path, env=None, **kwargs): + data, params = cls._load_from_file(load_path) + + model = cls(policy=data["policy"], env=None, _init_setup_model=False) + model.__dict__.update(data) + model.__dict__.update(kwargs) + model.set_env(env) + model.setup_model() + + restores = [] + for param, loaded_p in zip(model.params, params): + restores.append(param.assign(loaded_p)) + model.sess.run(restores) + + return model + + +class A2CRunner(AbstractEnvRunner): + def __init__(self, env, model, n_steps=5, gamma=0.99): + """ + A runner to learn the policy of an environment for an a2c model + + :param env: (Gym environment) The environment to learn from + :param model: (Model) The model to learn + :param n_steps: (int) The number of steps to run for each environment + :param gamma: (float) Discount factor + """ + super(A2CRunner, self).__init__(env=env, model=model, n_steps=n_steps) + self.gamma = gamma + + def run(self): + """ + Run a learning step of the model + + :return: ([float], [float], [float], [bool], [float], [float]) + observations, states, rewards, masks, actions, values + """ + mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], [] + mb_states = self.states + for _ in range(self.n_steps): + actions, values, states, _ = self.model.step(self.obs, self.states, self.dones) + mb_obs.append(np.copy(self.obs)) + mb_actions.append(actions) + mb_values.append(values) + mb_dones.append(self.dones) + obs, rewards, dones, _ = self.env.step(actions) + self.states = states + self.dones = dones + for n, done in enumerate(dones): + if done: + self.obs[n] = self.obs[n] * 0 + self.obs = obs + mb_rewards.append(rewards) + mb_dones.append(self.dones) + # batch of steps to batch of rollouts + mb_obs = 
np.asarray(mb_obs, dtype=self.obs.dtype).swapaxes(1, 0).reshape(self.batch_ob_shape) + mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(0, 1) + mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(0, 1) + mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(0, 1) + mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(0, 1) + mb_masks = mb_dones[:, :-1] + mb_dones = mb_dones[:, 1:] + last_values = self.model.value(self.obs, self.states, self.dones).tolist() + # discount/bootstrap off value fn + for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): + rewards = rewards.tolist() + dones = dones.tolist() + if dones[-1] == 0: + rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] + else: + rewards = discount_with_dones(rewards, dones, self.gamma) + mb_rewards[n] = rewards + + # convert from [n_env, n_steps, ...] to [n_steps * n_env, ...] + mb_rewards = mb_rewards.reshape(-1, *mb_rewards.shape[2:]) + mb_actions = mb_actions.reshape(-1, *mb_actions.shape[2:]) + mb_values = mb_values.reshape(-1, *mb_values.shape[2:]) + mb_masks = mb_masks.reshape(-1, *mb_masks.shape[2:]) + return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values diff --git a/stable_baselines/a2c/run_atari.py b/stable_baselines/a2c/run_atari.py new file mode 100644 index 0000000000..29815812ab --- /dev/null +++ b/stable_baselines/a2c/run_atari.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 + +from stable_baselines import logger +from stable_baselines.common.cmd_util import make_atari_env, atari_arg_parser +from stable_baselines.common.vec_env.vec_frame_stack import VecFrameStack +from stable_baselines.a2c import A2C +from stable_baselines.common.policies import CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy + + +def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env): + """ + Train A2C model for atari environment, for testing purposes + + :param env_id: (str) Environment ID + :param num_timesteps: 
(int) The total number of samples + :param seed: (int) The initial seed for training + :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + :param num_env: (int) The number of environments + """ + policy_fn = None + if policy == 'cnn': + policy_fn = CnnPolicy + elif policy == 'lstm': + policy_fn = CnnLstmPolicy + elif policy == 'lnlstm': + policy_fn = CnnLnLstmPolicy + if policy_fn is None: + raise ValueError("Error: policy {} not implemented".format(policy)) + + env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4) + + model = A2C(policy_fn, env, lr_schedule=lr_schedule) + model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed) + env.close() + + +def main(): + """ + Runs the test + """ + parser = atari_arg_parser() + parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm'], default='cnn', help='Policy architecture') + parser.add_argument('--lr_schedule', choices=['constant', 'linear'], default='constant', + help='Learning rate schedule') + args = parser.parse_args() + logger.configure() + train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, policy=args.policy, lr_schedule=args.lr_schedule, + num_env=16) + + +if __name__ == '__main__': + main() diff --git a/stable_baselines/a2c/utils.py b/stable_baselines/a2c/utils.py new file mode 100644 index 0000000000..3371c48d5e --- /dev/null +++ b/stable_baselines/a2c/utils.py @@ -0,0 +1,559 @@ +import os +from collections import deque + +import numpy as np +import tensorflow as tf + + +def sample(logits): + """ + Creates a sampling Tensor for non deterministic policies + + :param logits: (TensorFlow Tensor) The input probability for each action + :return: (TensorFlow Tensor) The sampled action + """ + noise = tf.random_uniform(tf.shape(logits)) + return tf.argmax(logits - tf.log(-tf.log(noise)), 1) + + 
def calc_entropy(logits):
    """
    Calculates the entropy of the categorical distribution defined by the given logits

    The softmax is computed inline, after subtracting the per-row maximum for numerical stability.

    :param logits: (TensorFlow Tensor) The unnormalized log-probabilities (logits) for each action,
        reduced over axis 1
    :return: (TensorFlow Tensor) The entropy of the softmax distribution, one value per row
    """
    # Shift by the row maximum so that tf.exp cannot overflow; the softmax is unchanged by the shift
    a_0 = logits - tf.reduce_max(logits, 1, keepdims=True)
    exp_a_0 = tf.exp(a_0)
    z_0 = tf.reduce_sum(exp_a_0, 1, keepdims=True)
    p_0 = exp_a_0 / z_0
    # -sum(p * log(p)) with log(p) = a_0 - log(z_0)
    return tf.reduce_sum(p_0 * (tf.log(z_0) - a_0), 1)


def calc_entropy_softmax(action_proba):
    """
    Calculates the entropy of an already normalized action probability distribution

    :param action_proba: (TensorFlow Tensor) The probability of each action (reduced over axis 1)
    :return: (TensorFlow Tensor) The entropy of the distribution, one value per row
    """
    # The 1e-6 offset keeps the logarithm finite when a probability is exactly zero
    return - tf.reduce_sum(action_proba * tf.log(action_proba + 1e-6), axis=1)


def mse(pred, target):
    """
    Returns the Mean squared error between prediction and target

    :param pred: (TensorFlow Tensor) The predicted value
    :param target: (TensorFlow Tensor) The target value
    :return: (TensorFlow Tensor) The Mean squared error between prediction and target
    """
    return tf.reduce_mean(tf.square(pred - target))


def ortho_init(scale=1.0):
    """
    Orthogonal initialization for the policy weights

    :param scale: (float) Scaling factor for the weights.
    :return: (function) an initialization function for the weights
    """

    # _ortho_init(shape, dtype, partition_info=None)
    def _ortho_init(shape, *_, **_kwargs):
        """Initialize weights as an orthogonal matrix.

        Orthogonal matrix initialization [1]_. Only 2-dimensional and 4-dimensional shapes are
        supported; for 4-dimensional shapes every axis but the last one is flattened, so the
        initialization is usable for both dense and convolutional layers.

        References
        ----------
        .. [1] Saxe, Andrew M., James L. McClelland, and Surya Ganguli.
               "Exact solutions to the nonlinear dynamics of learning in deep
               linear neural networks." arXiv preprint arXiv:1312.6120 (2013).

        :param shape: (tuple or list) The shape of the weights to create
        :return: (numpy float32 array) The scaled orthogonal weights, of the requested shape
        :raises NotImplementedError: if the shape is neither 2-dimensional nor 4-dimensional
        """
        # lasagne ortho init for tf
        shape = tuple(shape)
        if len(shape) == 2:
            flat_shape = shape
        elif len(shape) == 4:  # assumes NHWC
            # flatten everything but the last axis
            flat_shape = (np.prod(shape[:-1]), shape[-1])
        else:
            raise NotImplementedError
        gaussian_noise = np.random.normal(0.0, 1.0, flat_shape)
        u, _, v = np.linalg.svd(gaussian_noise, full_matrices=False)
        weights = u if u.shape == flat_shape else v  # pick the one with the correct shape
        weights = weights.reshape(shape)
        return (scale * weights[:shape[0], :shape[1]]).astype(np.float32)

    return _ortho_init


def conv(input_tensor, scope, *, n_filters, filter_size, stride,
         pad='VALID', init_scale=1.0, data_format='NHWC', one_dim_bias=False):
    """
    Creates a 2d convolutional layer for TensorFlow

    :param input_tensor: (TensorFlow Tensor) The input tensor for the convolution
    :param scope: (str) The TensorFlow variable scope
    :param n_filters: (int) The number of filters
    :param filter_size: (int) The filter size
    :param stride: (int) The stride of the convolution
    :param pad: (str) The padding type ('VALID' or 'SAME')
    :param init_scale: (float) The initialization scale
    :param data_format: (str) The data format of the input tensor ('NHWC' or 'NCHW')
    :param one_dim_bias: (bool) If the bias variable should be one dimensional or not
    :return: (TensorFlow Tensor) 2d convolutional layer
    :raises NotImplementedError: if data_format is neither 'NHWC' nor 'NCHW'
    """
    if data_format == 'NHWC':
        channel_ax = 3
        strides = [1, stride, stride, 1]
        bshape = [1, 1, 1, n_filters]
    elif data_format == 'NCHW':
        channel_ax = 1
        strides = [1, 1, stride, stride]
        bshape = [1, n_filters, 1, 1]
    else:
        raise NotImplementedError
    bias_var_shape = [n_filters] if one_dim_bias else [1, n_filters, 1, 1]
    n_input = input_tensor.get_shape()[channel_ax].value
    wshape = [filter_size, filter_size, n_input, n_filters]
    with tf.variable_scope(scope):
        weight = tf.get_variable("w", wshape, initializer=ortho_init(init_scale))
        bias = tf.get_variable("b", bias_var_shape, initializer=tf.constant_initializer(0.0))
        # The bias must broadcast along the channel axis of the convolution output:
        #  - the 4d variable is laid out as [1, n_filters, 1, 1], which only matches NCHW
        #  - the 1d variable broadcasts along the last axis, which only matches NHWC
        # In the two remaining combinations it has to be reshaped to bshape.
        if (not one_dim_bias and data_format == 'NHWC') or (one_dim_bias and data_format == 'NCHW'):
            bias = tf.reshape(bias, bshape)
        return bias + tf.nn.conv2d(input_tensor, weight, strides=strides, padding=pad, data_format=data_format)


def linear(input_tensor, scope, n_hidden, *, init_scale=1.0, init_bias=0.0):
    """
    Creates a fully connected layer for TensorFlow

    :param input_tensor: (TensorFlow Tensor) The input tensor for the fully connected layer
        (the size of its axis 1 is used as the number of inputs)
    :param scope: (str) The TensorFlow variable scope
    :param n_hidden: (int) The number of hidden neurons
    :param init_scale: (float) The initialization scale
    :param init_bias: (float) The initialization offset bias
    :return: (TensorFlow Tensor) fully connected layer
    """
    with tf.variable_scope(scope):
        n_input = input_tensor.get_shape()[1].value
        weight = tf.get_variable("w", [n_input, n_hidden], initializer=ortho_init(init_scale))
        bias = tf.get_variable("b", [n_hidden], initializer=tf.constant_initializer(init_bias))
        return tf.matmul(input_tensor, weight) + bias


def batch_to_seq(tensor_batch, n_batch, n_steps, flat=False):
    """
    Transform a batch of Tensors, into a sequence of Tensors for recurrent policies

    The input is reshaped to [n_batch, n_steps] (or [n_batch, n_steps, -1] when not flat) and then
    split along the step axis.

    :param tensor_batch: (TensorFlow Tensor) The input tensor to unroll
    :param n_batch: (int) The size of the leading axis after the reshape, i.e. the number of
        sequences (the input holds n_batch * n_steps entries along its first axis)
    :param n_steps: (int) The number of steps to run for each environment
    :param flat: (bool) If the input Tensor is flat
    :return: ([TensorFlow Tensor]) list of n_steps Tensors for recurrent policies
    """
    if flat:
        tensor_batch = tf.reshape(tensor_batch, [n_batch, n_steps])
    else:
        tensor_batch = tf.reshape(tensor_batch, [n_batch, n_steps, -1])
    return [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=n_steps, value=tensor_batch)]


def seq_to_batch(tensor_sequence, flat=False):
    """
    Transform a sequence of Tensors, into a batch of Tensors for recurrent policies

    :param tensor_sequence: ([TensorFlow Tensor]) The sequence of tensors to batch
    :param flat: (bool) If the input Tensors are flat
    :return: (TensorFlow Tensor) batch of Tensors for recurrent policies
    """
    shape = tensor_sequence[0].get_shape().as_list()
    if not flat:
        assert len(shape) > 1
        n_hidden = tensor_sequence[0].get_shape()[-1].value
        return tf.reshape(tf.concat(axis=1, values=tensor_sequence), [-1, n_hidden])
    else:
        return tf.reshape(tf.stack(values=tensor_sequence, axis=1), [-1])


def lstm(input_tensor, mask_tensor, cell_state_hidden, scope, n_hidden, init_scale=1.0, layer_norm=False):
    """
    Creates an Long Short Term Memory (LSTM) cell for TensorFlow

    Note: the output of every step is written back into input_tensor, which is therefore
    modified in place and is the same list object as the first returned value.

    :param input_tensor: ([TensorFlow Tensor]) The sequence of input tensors for the LSTM cell, one per step
    :param mask_tensor: ([TensorFlow Tensor]) The sequence of mask tensors for the LSTM cell, one per step
        (the cell state and the hidden state are multiplied by (1 - mask) before each step)
    :param cell_state_hidden: (TensorFlow Tensor) The cell state and the hidden state, concatenated on axis 1
    :param scope: (str) The TensorFlow variable scope
    :param n_hidden: (int) The number of hidden neurons
    :param init_scale: (float) The initialization scale
    :param layer_norm: (bool) Whether to apply Layer Normalization or not
    :return: ([TensorFlow Tensor], TensorFlow Tensor) the hidden output of each step, and the final
        cell state and hidden state concatenated on axis 1
    """
    _, n_input = [v.value for v in input_tensor[0].get_shape()]
    with tf.variable_scope(scope):
        weight_x = tf.get_variable("wx", [n_input, n_hidden * 4], initializer=ortho_init(init_scale))
        weight_h = tf.get_variable("wh", [n_hidden, n_hidden * 4], initializer=ortho_init(init_scale))
        bias = tf.get_variable("b", [n_hidden * 4], initializer=tf.constant_initializer(0.0))

        if layer_norm:
            # Gain and bias of layer norm
            gain_x = tf.get_variable("gx", [n_hidden * 4], initializer=tf.constant_initializer(1.0))
            bias_x = tf.get_variable("bx", [n_hidden * 4], initializer=tf.constant_initializer(0.0))

            gain_h = tf.get_variable("gh", [n_hidden * 4], initializer=tf.constant_initializer(1.0))
            bias_h = tf.get_variable("bh", [n_hidden * 4], initializer=tf.constant_initializer(0.0))

            gain_c = tf.get_variable("gc", [n_hidden], initializer=tf.constant_initializer(1.0))
            bias_c = tf.get_variable("bc", [n_hidden], initializer=tf.constant_initializer(0.0))

    cell_state, hidden = tf.split(axis=1, num_or_size_splits=2, value=cell_state_hidden)
    for idx, (_input, mask) in enumerate(zip(input_tensor, mask_tensor)):
        # Zero the recurrent state wherever the mask is set
        cell_state = cell_state * (1 - mask)
        hidden = hidden * (1 - mask)
        if layer_norm:
            gates = _ln(tf.matmul(_input, weight_x), gain_x, bias_x) \
                    + _ln(tf.matmul(hidden, weight_h), gain_h, bias_h) + bias
        else:
            gates = tf.matmul(_input, weight_x) + tf.matmul(hidden, weight_h) + bias
        in_gate, forget_gate, out_gate, cell_candidate = tf.split(axis=1, num_or_size_splits=4, value=gates)
        in_gate = tf.nn.sigmoid(in_gate)
        forget_gate = tf.nn.sigmoid(forget_gate)
        out_gate = tf.nn.sigmoid(out_gate)
        cell_candidate = tf.tanh(cell_candidate)
        cell_state = forget_gate * cell_state + in_gate * cell_candidate
        if layer_norm:
            hidden = out_gate * tf.tanh(_ln(cell_state, gain_c, bias_c))
        else:
            hidden = out_gate * tf.tanh(cell_state)
        # In-place: the input of this step is replaced by its output
        input_tensor[idx] = hidden
    cell_state_hidden = tf.concat(axis=1, values=[cell_state, hidden])
    return input_tensor, cell_state_hidden


def _ln(input_tensor, gain, bias, epsilon=1e-5, axes=None):
    """
    Apply layer normalisation.

    :param input_tensor: (TensorFlow Tensor) The input tensor for the Layer normalization
    :param gain: (TensorFlow Tensor) The scale tensor for the Layer normalization
    :param bias: (TensorFlow Tensor) The bias tensor for the Layer normalization
    :param epsilon: (float) The epsilon value added to the variance, for floating point calculations
    :param axes: (tuple, list or int) The axes to apply the mean and variance calculation (default: [1])
    :return: (TensorFlow Tensor) a normalizing layer
    """
    if axes is None:
        axes = [1]
    # NOTE(review): `keep_dims` is the TF 1.x spelling of this argument; kept as is for compatibility
    mean, variance = tf.nn.moments(input_tensor, axes=axes, keep_dims=True)
    input_tensor = (input_tensor - mean) / tf.sqrt(variance + epsilon)
    input_tensor = input_tensor * gain + bias
    return input_tensor


def lnlstm(input_tensor, mask_tensor, cell_state, scope, n_hidden, init_scale=1.0):
    """
    Creates a LSTM with Layer Normalization (lnlstm) cell for TensorFlow

    This is lstm() called with layer_norm=True.

    :param input_tensor: ([TensorFlow Tensor]) The sequence of input tensors for the LSTM cell
    :param mask_tensor: ([TensorFlow Tensor]) The sequence of mask tensors for the LSTM cell
    :param cell_state: (TensorFlow Tensor) The state tensor for the LSTM cell
    :param scope: (str) The TensorFlow variable scope
    :param n_hidden: (int) The number of hidden neurons
    :param init_scale: (float) The initialization scale
    :return: ([TensorFlow Tensor], TensorFlow Tensor) the outputs and the final state, as returned by lstm()
    """
    return lstm(input_tensor, mask_tensor, cell_state, scope, n_hidden, init_scale, layer_norm=True)


def conv_to_fc(input_tensor):
    """
    Reshapes a Tensor from a convolutional network to a Tensor for a fully connected network

    :param input_tensor: (TensorFlow Tensor) The convolutional input tensor
    :return: (TensorFlow Tensor) The fully connected output tensor, with every axis but the first flattened
    """
    n_hidden = np.prod([v.value for v in input_tensor.get_shape()[1:]])
    input_tensor = tf.reshape(input_tensor, [-1, n_hidden])
    return input_tensor


def discount_with_dones(rewards, dones, gamma):
    """
    Apply the discount value to the reward, where the environment is not done

    :param rewards: ([float]) The rewards
    :param dones: ([bool]) Whether an environment is done or not
    :param gamma: (float) The discount value
    :return: ([float]) The discounted rewards
    """
    discounted = []
    ret = 0  # Return: discounted reward
    # Walk backwards in time; a done flag cuts the return coming from the later steps
    for reward, done in zip(rewards[::-1], dones[::-1]):
        ret = reward + gamma * ret * (1. - done)  # fixed off by one bug
        discounted.append(ret)
    return discounted[::-1]


def find_trainable_variables(key):
    """
    Returns the trainable variables, queried from within the given variable scope

    NOTE(review): tf.trainable_variables() is called without any scope filter, so this presumably
    returns every trainable variable of the current graph and not only those under `key` -- to confirm.

    :param key: (str) The variable scope
    :return: ([TensorFlow Tensor]) the trainable variables
    """
    with tf.variable_scope(key):
        return tf.trainable_variables()


def make_path(path):
    """
    For a given path, create the folders if they do not exist

    :param path: (str) The path
    :return: (None) the result of os.makedirs, which does not return a value
    """
    return os.makedirs(path, exist_ok=True)


def constant(_):
    """
    Returns a constant value for the Scheduler

    :param _: ignored
    :return: (float) 1
    """
    return 1.
+ + +def linear_schedule(progress): + """ + Returns a linear value for the Scheduler + + :param progress: (float) Current progress status (in [0, 1]) + :return: (float) 1 - progress + """ + return 1 - progress + + +def middle_drop(progress): + """ + Returns a linear value with a drop near the middle to a constant value for the Scheduler + + :param progress: (float) Current progress status (in [0, 1]) + :return: (float) 1 - progress if (1 - progress) >= 0.75 else 0.075 + """ + eps = 0.75 + if 1 - progress < eps: + return eps * 0.1 + return 1 - progress + + +def double_linear_con(progress): + """ + Returns a linear value (x2) with a flattened tail for the Scheduler + + :param progress: (float) Current progress status (in [0, 1]) + :return: (float) 1 - progress*2 if (1 - progress*2) >= 0.125 else 0.125 + """ + progress *= 2 + eps = 0.125 + if 1 - progress < eps: + return eps + return 1 - progress + + +def double_middle_drop(progress): + """ + Returns a linear value with two drops near the middle to a constant value for the Scheduler + + :param progress: (float) Current progress status (in [0, 1]) + :return: (float) if 0.75 <= 1 - p: 1 - p, if 0.25 <= 1 - p < 0.75: 0.75, if 1 - p < 0.25: 0.125 + """ + eps1 = 0.75 + eps2 = 0.25 + if 1 - progress < eps1: + if 1 - progress < eps2: + return eps2 * 0.5 + return eps1 * 0.1 + return 1 - progress + + +SCHEDULES = { + 'linear': linear_schedule, + 'constant': constant, + 'double_linear_con': double_linear_con, + 'middle_drop': middle_drop, + 'double_middle_drop': double_middle_drop +} + + +class Scheduler(object): + def __init__(self, initial_value, n_values, schedule): + """ + Update a value every iteration, with a specific curve + + :param initial_value: (float) initial value + :param n_values: (int) the total number of iterations + :param schedule: (function) the curve you wish to follow for your value + """ + self.step = 0. 
+ self.initial_value = initial_value + self.nvalues = n_values + self.schedule = SCHEDULES[schedule] + + def value(self): + """ + Update the Scheduler, and return the current value + + :return: (float) the current value + """ + current_value = self.initial_value * self.schedule(self.step / self.nvalues) + self.step += 1. + return current_value + + def value_steps(self, steps): + """ + Get a value for a given step + + :param steps: (int) The current number of iterations + :return: (float) the value for the current number of iterations + """ + return self.initial_value * self.schedule(steps / self.nvalues) + + +class EpisodeStats: + def __init__(self, n_steps, n_envs): + """ + Calculates the episode statistics + + :param n_steps: (int) The number of steps to run for each environment + :param n_envs: (int) The number of environments + """ + self.episode_rewards = [] + for _ in range(n_envs): + self.episode_rewards.append([]) + self.len_buffer = deque(maxlen=40) # rolling buffer for episode lengths + self.rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards + self.n_steps = n_steps + self.n_envs = n_envs + + def feed(self, rewards, masks): + """ + Update the latest reward and mask + + :param rewards: ([float]) The new rewards for the new step + :param masks: ([float]) The new masks for the new step + """ + rewards = np.reshape(rewards, [self.n_envs, self.n_steps]) + masks = np.reshape(masks, [self.n_envs, self.n_steps]) + for i in range(0, self.n_envs): + for j in range(0, self.n_steps): + self.episode_rewards[i].append(rewards[i][j]) + if masks[i][j]: + reward_length = len(self.episode_rewards[i]) + reward_sum = sum(self.episode_rewards[i]) + self.len_buffer.append(reward_length) + self.rewbuffer.append(reward_sum) + self.episode_rewards[i] = [] + + def mean_length(self): + """ + Returns the average length of each episode + + :return: (float) + """ + if self.len_buffer: + return np.mean(self.len_buffer) + else: + return 0 # on the first params dump, no 
episodes are finished + + def mean_reward(self): + """ + Returns the average reward of each episode + + :return: (float) + """ + if self.rewbuffer: + return np.mean(self.rewbuffer) + else: + return 0 + + +# For ACER +def get_by_index(input_tensor, idx): + """ + Return the input tensor, offset by a certain value + + :param input_tensor: (TensorFlow Tensor) The input tensor + :param idx: (int) The index offset + :return: (TensorFlow Tensor) the offset tensor + """ + assert len(input_tensor.get_shape()) == 2 + assert len(idx.get_shape()) == 1 + idx_flattened = tf.range(0, input_tensor.shape[0]) * input_tensor.shape[1] + idx + offset_tensor = tf.gather(tf.reshape(input_tensor, [-1]), # flatten input + idx_flattened) # use flattened indices + return offset_tensor + + +def check_shape(tensors, shapes): + """ + Verifies the tensors match the given shape, will raise an error if the shapes do not match + + :param tensors: ([TensorFlow Tensor]) The tensors that should be checked + :param shapes: ([list]) The list of shapes for each tensor + """ + i = 0 + for (tensor, shape) in zip(tensors, shapes): + assert tensor.get_shape().as_list() == shape, "id " + str(i) + " shape " + str(tensor.get_shape()) + str(shape) + i += 1 + + +def avg_norm(tensor): + """ + Return an average of the L2 normalization of the batch + + :param tensor: (TensorFlow Tensor) The input tensor + :return: (TensorFlow Tensor) Average L2 normalization of the batch + """ + return tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(tensor), axis=-1))) + + +def gradient_add(grad_1, grad_2, param, verbose=0): + """ + Sum two gradients + + :param grad_1: (TensorFlow Tensor) The first gradient + :param grad_2: (TensorFlow Tensor) The second gradient + :param param: (TensorFlow parameters) The trainable parameters + :param verbose: (int) verbosity level + :return: (TensorFlow Tensor) the sum of the gradients + """ + if verbose > 1: + print([grad_1, grad_2, param.name]) + if grad_1 is None and grad_2 is None: + return 
None + elif grad_1 is None: + return grad_2 + elif grad_2 is None: + return grad_1 + else: + return grad_1 + grad_2 + + +def q_explained_variance(q_pred, q_true): + """ + Calculates the explained variance of the Q value + + :param q_pred: (TensorFlow Tensor) The predicted Q value + :param q_true: (TensorFlow Tensor) The expected Q value + :return: (TensorFlow Tensor) the explained variance of the Q value + """ + _, var_y = tf.nn.moments(q_true, axes=[0, 1]) + _, var_pred = tf.nn.moments(q_true - q_pred, axes=[0, 1]) + check_shape([var_y, var_pred], [[]] * 2) + return 1.0 - (var_pred / var_y) diff --git a/stable_baselines/acer/__init__.py b/stable_baselines/acer/__init__.py new file mode 100644 index 0000000000..a81d161a3a --- /dev/null +++ b/stable_baselines/acer/__init__.py @@ -0,0 +1 @@ +from stable_baselines.acer.acer_simple import ACER diff --git a/stable_baselines/acer/acer_simple.py b/stable_baselines/acer/acer_simple.py new file mode 100644 index 0000000000..0a5ebd7de4 --- /dev/null +++ b/stable_baselines/acer/acer_simple.py @@ -0,0 +1,620 @@ +import time + +import numpy as np +import tensorflow as tf +from gym.spaces import Discrete, Box + +from stable_baselines import logger +from stable_baselines.a2c.utils import batch_to_seq, seq_to_batch, Scheduler, find_trainable_variables, EpisodeStats, \ + get_by_index, check_shape, avg_norm, gradient_add, q_explained_variance +from stable_baselines.acer.buffer import Buffer +from stable_baselines.common import BaseRLModel, tf_util, SetVerbosity +from stable_baselines.common.runners import AbstractEnvRunner +from stable_baselines.common.policies import LstmPolicy + + +def strip(var, n_envs, n_steps, flat=False): + """ + Removes the last step in the batch + + :param var: (TensorFlow Tensor) The input Tensor + :param n_envs: (int) The number of environments + :param n_steps: (int) The number of steps to run for each environment + :param flat: (bool) If the input Tensor is flat + :return: (TensorFlow Tensor) the input 
tensor, without the last step in the batch + """ + out_vars = batch_to_seq(var, n_envs, n_steps + 1, flat) + return seq_to_batch(out_vars[:-1], flat) + + +def q_retrace(rewards, dones, q_i, values, rho_i, n_envs, n_steps, gamma): + """ + Calculates the target Q-retrace + + :param rewards: ([TensorFlow Tensor]) The rewards + :param dones: ([TensorFlow Tensor]) + :param q_i: ([TensorFlow Tensor]) The Q values for actions taken + :param values: ([TensorFlow Tensor]) The output of the value functions + :param rho_i: ([TensorFlow Tensor]) The importance weight for each action + :param n_envs: (int) The number of environments + :param n_steps: (int) The number of steps to run for each environment + :param gamma: (float) The discount value + :return: ([TensorFlow Tensor]) the target Q-retrace + """ + rho_bar = batch_to_seq(tf.minimum(1.0, rho_i), n_envs, n_steps, True) # list of len steps, shape [n_envs] + reward_seq = batch_to_seq(rewards, n_envs, n_steps, True) # list of len steps, shape [n_envs] + done_seq = batch_to_seq(dones, n_envs, n_steps, True) # list of len steps, shape [n_envs] + q_is = batch_to_seq(q_i, n_envs, n_steps, True) + value_sequence = batch_to_seq(values, n_envs, n_steps + 1, True) + final_value = value_sequence[-1] + qret = final_value + qrets = [] + for i in range(n_steps - 1, -1, -1): + check_shape([qret, done_seq[i], reward_seq[i], rho_bar[i], q_is[i], value_sequence[i]], [[n_envs]] * 6) + qret = reward_seq[i] + gamma * qret * (1.0 - done_seq[i]) + qrets.append(qret) + qret = (rho_bar[i] * (qret - q_is[i])) + value_sequence[i] + qrets = qrets[::-1] + qret = seq_to_batch(qrets, flat=True) + return qret + + +class ACER(BaseRLModel): + """ + The ACER (Actor-Critic with Experience Replay) model class, https://arxiv.org/abs/1611.01224 + + :param policy: (ActorCriticPolicy) The policy model to use (MLP, CNN, LSTM, ...) 
+ :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) + :param gamma: (float) The discount value + :param n_steps: (int) The number of steps to run for each environment + :param num_procs: (int) The number of threads for TensorFlow operations + :param q_coef: (float) The weight for the loss on the Q value + :param ent_coef: (float) The weight for the entropic loss + :param max_grad_norm: (float) The clipping value for the maximum gradient + :param learning_rate: (float) The initial learning rate for the RMS prop optimizer + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + :param rprop_epsilon: (float) RMS prop optimizer epsilon + :param rprop_alpha: (float) RMS prop optimizer decay + :param buffer_size: (int) The buffer size in number of steps + :param replay_ratio: (float) The number of replay learning per on policy learning on average, + using a poisson distribution + :param replay_start: (int) The minimum number of steps in the buffer, before learning replay + :param correction_term: (float) The correction term for the weights + :param trust_region: (bool) Enable Trust region policy optimization loss + :param alpha: (float) The decay rate for the Exponential moving average of the parameters + :param delta: (float) trust region delta value + :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance + """ + + def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5, ent_coef=0.01, max_grad_norm=10, + learning_rate=7e-4, lr_schedule='linear', rprop_alpha=0.99, rprop_epsilon=1e-5, buffer_size=5000, + replay_ratio=4, replay_start=1000, correction_term=10.0, trust_region=True, alpha=0.99, delta=1, + verbose=0, _init_setup_model=True): + 
super(ACER, self).__init__(policy=policy, env=env, requires_vec_env=True, verbose=verbose) + + self.n_steps = n_steps + self.replay_ratio = replay_ratio + self.buffer_size = buffer_size + self.replay_start = replay_start + self.gamma = gamma + self.alpha = alpha + self.correction_term = correction_term + self.q_coef = q_coef + self.ent_coef = ent_coef + self.trust_region = trust_region + self.delta = delta + self.max_grad_norm = max_grad_norm + self.rprop_alpha = rprop_alpha + self.rprop_epsilon = rprop_epsilon + self.learning_rate = learning_rate + self.lr_schedule = lr_schedule + self.num_procs = num_procs + + self.graph = None + self.sess = None + self.action_ph = None + self.done_ph = None + self.reward_ph = None + self.mu_ph = None + self.learning_rate_ph = None + self.params = None + self.polyak_model = None + self.learning_rate_schedule = None + self.run_ops = None + self.names_ops = None + self.train_model = None + self.step_model = None + self.step = None + self.proba_step = None + self.initial_state = None + self.n_act = None + self.n_batch = None + + if _init_setup_model: + self.setup_model() + + def set_env(self, env): + if env is not None: + assert self.n_envs == env.num_envs, \ + "Error: the environment passed must have the same number of environments as the model was trained on." \ + "This is due to ACER not being capable of changing the number of environments." 
+ + super().set_env(env) + + def setup_model(self): + with SetVerbosity(self.verbose): + + if isinstance(self.action_space, Discrete): + self.n_act = self.action_space.n + continuous = False + elif isinstance(self.action_space, Box): + # self.n_act = self.action_space.shape[-1] + # continuous = True + raise NotImplementedError("WIP: Acer does not support Continuous actions yet.") + else: + raise ValueError("Error: ACER does not work with {} actions space.".format(self.action_space)) + + self.n_batch = self.n_envs * self.n_steps + + self.graph = tf.Graph() + with self.graph.as_default(): + self.sess = tf_util.make_session(num_cpu=self.num_procs, graph=self.graph) + + self.done_ph = tf.placeholder(tf.float32, [self.n_batch]) # dones + self.reward_ph = tf.placeholder(tf.float32, [self.n_batch]) # rewards, not returns + self.mu_ph = tf.placeholder(tf.float32, [self.n_batch, self.n_act]) # mu's + self.learning_rate_ph = tf.placeholder(tf.float32, []) + eps = 1e-6 + + n_batch_step = None + if issubclass(self.policy, LstmPolicy): + n_batch_step = self.n_envs + n_batch_train = self.n_envs * (self.n_steps + 1) + + step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, + n_batch_step, reuse=False) + train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, + self.n_steps + 1, n_batch_train, reuse=True) + + self.action_ph = train_model.pdtype.sample_placeholder([self.n_batch]) + + self.params = find_trainable_variables("model") + + # create averaged model + ema = tf.train.ExponentialMovingAverage(self.alpha) + ema_apply_op = ema.apply(self.params) + + def custom_getter(getter, *args, **kwargs): + val = ema.average(getter(*args, **kwargs)) + return val + + with tf.variable_scope("", custom_getter=custom_getter, reuse=True): + self.polyak_model = polyak_model = self.policy(self.sess, self.observation_space, self.action_space, + self.n_envs, self.n_steps + 1, + self.n_envs * (self.n_steps + 1), 
reuse=True) + + # Notation: (var) = batch variable, (var)s = sequence variable, + # (var)_i = variable index by action at step i + # shape is [n_envs * (n_steps + 1)] + if continuous: + value = train_model.value_fn[:, 0] + else: + value = tf.reduce_sum(train_model.policy_proba * train_model.q_value, axis=-1) + + rho, rho_i_ = None, None + if continuous: + action_ = strip(train_model.proba_distribution.sample(), self.n_envs, self.n_steps) + distribution_f = tf.contrib.distributions.MultivariateNormalDiag( + loc=strip(train_model.proba_distribution.mean, self.n_envs, self.n_steps), + scale_diag=strip(train_model.proba_distribution.logstd, self.n_envs, self.n_steps)) + f_polyak = tf.contrib.distributions.MultivariateNormalDiag( + loc=strip(polyak_model.proba_distribution.mean, self.n_envs, self.n_steps), + scale_diag=strip(polyak_model.proba_distribution.logstd, self.n_envs, self.n_steps)) + + f_i = distribution_f.prob(self.action_ph) + f_i_ = distribution_f.prob(action_) + f_polyak_i = f_polyak.prob(self.action_ph) + phi_i = strip(train_model.proba_distribution.mean, self.n_envs, self.n_steps) + + q_value = strip(train_model.value_fn, self.n_envs, self.n_steps) + q_i = q_value[:, 0] + + rho_i = tf.reshape(f_i, [-1, 1]) / (self.mu_ph + eps) + rho_i_ = tf.reshape(f_i_, [-1, 1]) / (self.mu_ph + eps) + + qret = q_retrace(self.reward_ph, self.done_ph, q_i, value, tf.pow(rho_i, 1/self.n_act), self.n_envs, + self.n_steps, self.gamma) + else: + # strip off last step + # f is a distribution, chosen to be Gaussian distributions + # with fixed diagonal covariance and mean \phi(x) + # in the paper + distribution_f, f_polyak, q_value = \ + map(lambda variables: strip(variables, self.n_envs, self.n_steps), + [train_model.policy_proba, polyak_model.policy_proba, train_model.q_value]) + + # Get pi and q values for actions taken + f_i = get_by_index(distribution_f, self.action_ph) + f_i_ = distribution_f + phi_i = distribution_f + f_polyak_i = f_polyak + + q_i = get_by_index(q_value, 
self.action_ph) + + # Compute ratios for importance truncation + rho = distribution_f / (self.mu_ph + eps) + rho_i = get_by_index(rho, self.action_ph) + + # Calculate Q_retrace targets + qret = q_retrace(self.reward_ph, self.done_ph, q_i, value, rho_i, self.n_envs, self.n_steps, + self.gamma) + + # Calculate losses + # Entropy + entropy = tf.reduce_sum(train_model.proba_distribution.entropy()) + + # Policy Gradient loss, with truncated importance sampling & bias correction + value = strip(value, self.n_envs, self.n_steps, True) + # check_shape([qret, value, rho_i, f_i], [[self.n_envs * self.n_steps]] * 4) + # check_shape([rho, distribution_f, q_value], [[self.n_envs * self.n_steps, self.n_act]] * 2) + + # Truncated importance sampling + adv = qret - value + log_f = tf.log(f_i + eps) + gain_f = log_f * tf.stop_gradient(adv * tf.minimum(self.correction_term, rho_i)) # [n_envs * n_steps] + loss_f = -tf.reduce_mean(gain_f) + + # Bias correction for the truncation + adv_bc = (q_value - tf.reshape(value, [self.n_envs * self.n_steps, 1])) # [n_envs * n_steps, n_act] + + # check_shape([adv_bc, log_f_bc], [[self.n_envs * self.n_steps, self.n_act]] * 2) + if continuous: + gain_bc = tf.stop_gradient(adv_bc * + tf.nn.relu(1.0 - (self.correction_term / (rho_i_ + eps))) * + f_i_) + else: + log_f_bc = tf.log(f_i_ + eps) # / (f_old + eps) + gain_bc = tf.reduce_sum(log_f_bc * + tf.stop_gradient( + adv_bc * + tf.nn.relu(1.0 - (self.correction_term / (rho + eps))) * + f_i_), + axis=1) + # IMP: This is sum, as expectation wrt f + loss_bc = -tf.reduce_mean(gain_bc) + + loss_policy = loss_f + loss_bc + + # Value/Q function loss, and explained variance + check_shape([qret, q_i], [[self.n_envs * self.n_steps]] * 2) + explained_variance = q_explained_variance(tf.reshape(q_i, [self.n_envs, self.n_steps]), + tf.reshape(qret, [self.n_envs, self.n_steps])) + loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5) + + # Net loss + check_shape([loss_policy, loss_q, entropy], [[]] 
* 3) + loss = loss_policy + self.q_coef * loss_q - self.ent_coef * entropy + + norm_grads_q, norm_grads_policy, avg_norm_grads_f = None, None, None + avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj = None, None, None, None + if self.trust_region: + # [n_envs * n_steps, n_act] + grad = tf.gradients(- (loss_policy - self.ent_coef * entropy) * self.n_steps * self.n_envs, + phi_i) + # [n_envs * n_steps, n_act] # Directly computed gradient of KL divergence wrt f + kl_grad = - f_polyak_i / (f_i_ + eps) + k_dot_g = tf.reduce_sum(kl_grad * grad, axis=-1) + adj = tf.maximum(0.0, (tf.reduce_sum(kl_grad * grad, axis=-1) - self.delta) / ( + tf.reduce_sum(tf.square(kl_grad), axis=-1) + eps)) # [n_envs * n_steps] + + # Calculate stats (before doing adjustment) for logging. + avg_norm_k = avg_norm(kl_grad) + avg_norm_g = avg_norm(grad) + avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) + avg_norm_adj = tf.reduce_mean(tf.abs(adj)) + + grad = grad - tf.reshape(adj, [self.n_envs * self.n_steps, 1]) * kl_grad + # These are turst region adjusted gradients wrt f ie statistics of policy pi + grads_f = -grad / (self.n_envs * self.n_steps) + grads_policy = tf.gradients(f_i_, self.params, grads_f) + grads_q = tf.gradients(loss_q * self.q_coef, self.params) + grads = [gradient_add(g1, g2, param, verbose=self.verbose) + for (g1, g2, param) in zip(grads_policy, grads_q, self.params)] + + avg_norm_grads_f = avg_norm(grads_f) * (self.n_steps * self.n_envs) + norm_grads_q = tf.global_norm(grads_q) + norm_grads_policy = tf.global_norm(grads_policy) + else: + grads = tf.gradients(loss, self.params) + + norm_grads = None + if self.max_grad_norm is not None: + grads, norm_grads = tf.clip_by_global_norm(grads, self.max_grad_norm) + grads = list(zip(grads, self.params)) + trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.rprop_alpha, + epsilon=self.rprop_epsilon) + _opt_op = trainer.apply_gradients(grads) + + # so when you call _train, you first do the 
gradient step, then you apply ema + with tf.control_dependencies([_opt_op]): + _train = tf.group(ema_apply_op) + + # Ops/Summaries to run, and their names for logging + assert norm_grads is not None + run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, explained_variance, norm_grads] + names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', + 'norm_grads'] + if self.trust_region: + self.run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, + avg_norm_k_dot_g, avg_norm_adj] + self.names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', + 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj'] + + self.train_model = train_model + self.step_model = step_model + self.step = step_model.step + self.proba_step = step_model.proba_step + self.initial_state = step_model.initial_state + + tf.global_variables_initializer().run(session=self.sess) + + def _train_step(self, obs, actions, rewards, dones, mus, states, masks, steps): + """ + applies a training step to the model + + :param obs: ([float]) The input observations + :param actions: ([float]) The actions taken + :param rewards: ([float]) The rewards from the environment + :param dones: ([bool]) Whether or not the episode is over (aligned with reward, used for reward calculation) + :param mus: ([float]) The logits values + :param states: ([float]) The states (used for reccurent policies) + :param masks: ([bool]) Whether or not the episode is over (used for reccurent policies) + :param steps: (int) the number of steps done so far + :return: ([str], [float]) the list of update operation name, and the list of the results of the operations + """ + cur_lr = self.learning_rate_schedule.value_steps(steps) + td_map = {self.train_model.obs_ph: obs, self.polyak_model.obs_ph: obs, self.action_ph: actions, + self.reward_ph: rewards, self.done_ph: dones, self.mu_ph: mus, self.learning_rate_ph: 
cur_lr} + + if states is not None: + td_map[self.train_model.states_ph] = states + td_map[self.train_model.masks_ph] = masks + td_map[self.polyak_model.states_ph] = states + td_map[self.polyak_model.masks_ph] = masks + + return self.names_ops, self.sess.run(self.run_ops, td_map)[1:] # strip off _train + + def learn(self, total_timesteps, callback=None, seed=None, log_interval=100): + with SetVerbosity(self.verbose): + self._setup_learn(seed) + + self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, + schedule=self.lr_schedule) + + episode_stats = EpisodeStats(self.n_steps, self.n_envs) + + runner = _Runner(env=self.env, model=self, n_steps=self.n_steps) + if self.replay_ratio > 0: + buffer = Buffer(env=self.env, n_steps=self.n_steps, size=self.buffer_size) + else: + buffer = None + + t_start = time.time() + + # n_batch samples, 1 on_policy call and multiple off-policy calls + for steps in range(0, total_timesteps, self.n_batch): + enc_obs, obs, actions, rewards, mus, dones, masks = runner.run() + episode_stats.feed(rewards, dones) + + if buffer is not None: + buffer.put(enc_obs, actions, rewards, mus, dones, masks) + + # reshape stuff correctly + obs = obs.reshape(runner.batch_ob_shape) + actions = actions.reshape([runner.n_batch]) + rewards = rewards.reshape([runner.n_batch]) + mus = mus.reshape([runner.n_batch, runner.n_act]) + dones = dones.reshape([runner.n_batch]) + masks = masks.reshape([runner.batch_ob_shape[0]]) + + names_ops, values_ops = self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks, + steps) + + if callback is not None: + callback(locals(), globals()) + + if self.verbose >= 1 and (int(steps / runner.n_batch) % log_interval == 0): + logger.record_tabular("total_timesteps", steps) + logger.record_tabular("fps", int(steps / (time.time() - t_start))) + # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, + # not just at the terminal state. 
def predict(self, observation, state=None, mask=None):
    """
    Compute the action of the model for the given observation

    :param observation: (numpy Number) the input observation, reshaped to (-1,) + observation_space.shape
    :param state: (numpy Number) The last states (None falls back to the model's initial state,
        used in recurrent policies)
    :param mask: (numpy Number) The last masks (None means "no episode just ended" for every environment,
        used in recurrent policies)
    :return: (numpy Number, numpy Number) the model's action and the next state (used in recurrent policies)
    """
    state = self.initial_state if state is None else state
    mask = [False] * self.n_envs if mask is None else mask

    # add the leading batch dimension expected by the step function
    batch_shape = (-1,) + self.observation_space.shape
    batched_obs = np.array(observation).reshape(batch_shape)

    actions, _values, next_states, _neglogp = self.step(batched_obs, state, mask)
    return actions, next_states
def save(self, save_path):
    """
    Save the hyper-parameters and the current network parameters of the model

    The hyper-parameters are collected in a dictionary and handed, together with the evaluated
    ``self.params``, to ``self._save_to_file``.

    :param save_path: forwarded unchanged to ``_save_to_file`` (presumably the destination path)
    """
    data = {
        "gamma": self.gamma,
        "n_steps": self.n_steps,
        "q_coef": self.q_coef,
        "ent_coef": self.ent_coef,
        "max_grad_norm": self.max_grad_norm,
        "learning_rate": self.learning_rate,
        "lr_schedule": self.lr_schedule,
        "rprop_alpha": self.rprop_alpha,
        "rprop_epsilon": self.rprop_epsilon,
        "replay_ratio": self.replay_ratio,
        "replay_start": self.replay_start,
        "verbose": self.verbose,
        "policy": self.policy,
        "observation_space": self.observation_space,
        "action_space": self.action_space,
        "n_envs": self.n_envs,
        "_vectorize_action": self._vectorize_action,
        # load() rebuilds the model with constructor defaults and then overwrites attributes from this
        # dictionary, so anything missing here is silently reset. These three are read by learn() and
        # setup_model() and therefore have to travel with the saved model.
        "buffer_size": self.buffer_size,
        "trust_region": self.trust_region,
        "delta": self.delta,
    }

    # evaluate the variables so that plain values (not graph tensors) are stored
    params = self.sess.run(self.params)

    self._save_to_file(save_path, data=data, params=params)
The number of steps to run for each environment + """ + + super(_Runner, self).__init__(env=env, model=model, n_steps=n_steps) + self.env = env + self.model = model + self.n_env = n_env = env.num_envs + if isinstance(env.action_space, Discrete): + self.n_act = env.action_space.n + else: + self.n_act = env.action_space.shape[-1] + self.n_batch = n_env * n_steps + + if len(env.observation_space.shape) > 1: + self.raw_pixels = True + obs_height, obs_width, obs_num_channels = env.observation_space.shape + self.batch_ob_shape = (n_env * (n_steps + 1), obs_height, obs_width, obs_num_channels) + self.obs_dtype = np.uint8 + self.obs = np.zeros((n_env, obs_height, obs_width, obs_num_channels), dtype=self.obs_dtype) + self.num_channels = obs_num_channels + else: + if len(env.observation_space.shape) == 1: + self.obs_dim = env.observation_space.shape[0] + else: + self.obs_dim = 1 + self.raw_pixels = False + if isinstance(self.env.observation_space, Discrete): + self.batch_ob_shape = (n_env * (n_steps + 1),) + else: + self.batch_ob_shape = (n_env * (n_steps + 1), self.obs_dim) + self.obs_dtype = np.float32 + + self.n_steps = n_steps + self.states = model.initial_state + self.dones = [False for _ in range(n_env)] + + def run(self): + """ + Run a step leaning of the model + + :return: ([float], [float], [float], [float], [float], [bool], [float]) + encoded observation, observations, actions, rewards, mus, dones, masks + """ + enc_obs = [self.obs] + mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], [] + for _ in range(self.n_steps): + actions, _, states, _ = self.model.step(self.obs, self.states, self.dones) + mus = self.model.proba_step(self.obs, self.states, self.dones) + mb_obs.append(np.copy(self.obs)) + mb_actions.append(actions) + mb_mus.append(mus) + mb_dones.append(self.dones) + obs, rewards, dones, _ = self.env.step(actions) + # states information for statefull models like LSTM + self.states = states + self.dones = dones + self.obs = obs + 
class Buffer(object):
    def __init__(self, env, n_steps, size=50000):
        """
        A ring buffer for observations, actions, rewards, mu's, masks and dones values

        One buffer location stores one complete rollout (all environments, n_steps steps each);
        once the buffer is full the oldest location is overwritten.

        :param env: (Gym environment) The environment to learn from
        :param n_steps: (int) The number of steps to run for each environment
        :param size: (int) The buffer size in number of steps
        """
        self.n_env = env.num_envs
        self.n_steps = n_steps
        self.n_batch = self.n_env * self.n_steps
        # Each loc contains n_env * n_steps frames, thus total buffer is n_env * size frames
        self.size = size // self.n_steps

        if len(env.observation_space.shape) > 1:
            self.raw_pixels = True
            self.height, self.width, self.n_channels = env.observation_space.shape
            self.obs_dtype = np.uint8
        else:
            self.raw_pixels = False
            if len(env.observation_space.shape) == 1:
                self.obs_dim = env.observation_space.shape[-1]
            else:
                # observation space with an empty shape: one value per observation
                self.obs_dim = 1
            self.obs_dtype = np.float32

        # Memory (allocated lazily on the first put, once the array shapes are known)
        self.enc_obs = None
        self.actions = None
        self.rewards = None
        self.mus = None
        self.dones = None
        self.masks = None

        # Size indexes
        self.next_idx = 0
        self.num_in_buffer = 0

    def has_atleast(self, frames):
        """
        Check to see if the buffer has at least the asked number of frames

        :param frames: (int) The number of frames checked
        :return: (bool) number of frames in buffer >= number asked
        """
        # Frames per env, so total (n_env * frames) Frames needed
        # Each buffer loc has n_env * n_steps frames
        return self.num_in_buffer >= (frames // self.n_steps)

    def can_sample(self):
        """
        Check if the buffer has at least one frame

        :return: (bool) if the buffer has at least one frame
        """
        return self.num_in_buffer > 0

    def decode(self, enc_obs):
        """
        Get the stacked frames of an observation

        :param enc_obs: ([float]) the encoded observation
        :return: ([float]) the decoded observation
        """
        # enc_obs has shape [n_env, n_steps + 1] + obs_dim
        # returns obs of shape [n_env, n_steps + 1] + obs_dim, with dtype self.obs_dtype
        n_env, n_steps = self.n_env, self.n_steps
        if self.raw_pixels:
            obs_dim = [self.height, self.width, self.n_channels]
        else:
            obs_dim = [self.obs_dim]

        obs = np.zeros([1, n_steps + 1, n_env] + obs_dim, dtype=self.obs_dtype)
        # [n_steps + 1, n_env] + obs_dim
        x_var = np.reshape(enc_obs, [n_env, n_steps + 1] + obs_dim).swapaxes(1, 0)
        obs[-1, :] = x_var

        # move the env axis first and the leading singleton axis next to the last one,
        # so that the final reshape can merge it away
        if self.raw_pixels:
            obs = obs.transpose((2, 1, 3, 4, 0, 5))
        else:
            obs = obs.transpose((2, 1, 3, 0))
        return np.reshape(obs, [n_env, (n_steps + 1)] + obs_dim[:-1] + [obs_dim[-1]])

    def put(self, enc_obs, actions, rewards, mus, dones, masks):
        """
        Adds a rollout to the buffer, overwriting the oldest one when the buffer is full

        :param enc_obs: ([float]) the encoded observation
        :param actions: ([float]) the actions
        :param rewards: ([float]) the rewards
        :param mus: ([float]) the policy probability for the actions
        :param dones: ([bool]) the episode-over flags
        :param masks: ([bool]) the masks
        """
        # enc_obs [n_env, (n_steps + 1), nh, nw, nc]
        # actions, rewards, dones [n_env, n_steps]
        # mus [n_env, n_steps, n_act]

        if self.enc_obs is None:
            self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=self.obs_dtype)
            self.actions = np.empty([self.size] + list(actions.shape), dtype=np.int32)
            self.rewards = np.empty([self.size] + list(rewards.shape), dtype=np.float32)
            self.mus = np.empty([self.size] + list(mus.shape), dtype=np.float32)
            # builtin bool: the np.bool alias was removed in NumPy 1.24
            self.dones = np.empty([self.size] + list(dones.shape), dtype=bool)
            self.masks = np.empty([self.size] + list(masks.shape), dtype=bool)

        self.enc_obs[self.next_idx] = enc_obs
        self.actions[self.next_idx] = actions
        self.rewards[self.next_idx] = rewards
        self.mus[self.next_idx] = mus
        self.dones[self.next_idx] = dones
        self.masks[self.next_idx] = masks

        self.next_idx = (self.next_idx + 1) % self.size
        self.num_in_buffer = min(self.size, self.num_in_buffer + 1)

    def take(self, arr, idx, envx):
        """
        Reads, for every environment, one entry of a stored array

        :param arr: (numpy array) the array that is read, indexed as [buffer location, environment, ...]
        :param idx: ([int]) the buffer location to read for each environment
        :param envx: ([int]) the idx for the environments
        :return: (numpy array) the asked entries, one per environment (a copy, same dtype as arr)
        """
        n_env = self.n_env
        # pair-wise integer indexing: entry i of the result is arr[idx[i], envx[i]]
        locations = np.asarray(idx, dtype=np.intp)[:n_env]
        env_ids = np.asarray(envx, dtype=np.intp)[:n_env]
        return arr[locations, env_ids]

    def get(self):
        """
        randomly read a rollout from the buffer

        :return: ([float], [float], [float], [float], [bool], [bool])
            observations, actions, rewards, mus, dones, masks
        """
        # returns
        # obs [n_env, (n_steps + 1), nh, nw, nc]
        # actions, rewards, dones [n_env, n_steps]
        # mus [n_env, n_steps, n_act]
        n_env = self.n_env
        assert self.can_sample()

        # Sample exactly one id per env. If you sample across envs, then higher correlation in samples from same env.
        idx = np.random.randint(0, self.num_in_buffer, n_env)
        envx = np.arange(n_env)

        dones = self.take(self.dones, idx, envx)
        enc_obs = self.take(self.enc_obs, idx, envx)
        obs = self.decode(enc_obs)
        actions = self.take(self.actions, idx, envx)
        rewards = self.take(self.rewards, idx, envx)
        mus = self.take(self.mus, idx, envx)
        masks = self.take(self.masks, idx, envx)
        return obs, actions, rewards, mus, dones, masks
def rollout(env, policy, max_pathlength, animate=False, obfilter=None):
    """
    Play a single episode with the policy, for at most max_pathlength steps

    The policy is fed the current observation concatenated with the previous one, and its action is
    rescaled from [-1, 1] to the bounds of the action space (then clipped) before being sent to the env.

    :param env: (Gym environment) The environment to learn from
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param max_pathlength: (int) The maximum length for an episode
    :param animate: (bool) if render env
    :param obfilter: (Filter) the observation filter
    :return: (dict) observation, terminated, reward, action, action_dist, logp
    """
    current_ob = env.reset()
    # the "previous observation" of the first step is all zeros, shaped like the raw reset observation
    previous_ob = np.float32(np.zeros(current_ob.shape))
    if obfilter:
        current_ob = obfilter(current_ob)

    history = {key: [] for key in ("observation", "action", "action_dist", "logp", "reward")}
    episode_ended = False

    for _ in range(max_pathlength):
        if animate:
            env.render()

        policy_input = np.concatenate([current_ob, previous_ob], -1)
        action, action_dist, logp = policy.act(policy_input)
        history["observation"].append(policy_input)
        history["action"].append(action)
        history["action_dist"].append(action_dist)
        history["logp"].append(logp)
        previous_ob = np.copy(current_ob)

        # map the policy output from [-1, 1] onto [low, high], and keep it inside the bounds
        low, high = env.action_space.low, env.action_space.high
        env_action = np.clip(low + (action + 1.) * 0.5 * (high - low), low, high)

        current_ob, reward, done, _ = env.step(env_action)
        if obfilter:
            current_ob = obfilter(current_ob)
        history["reward"].append(reward)

        if done:
            episode_ended = True
            break

    return {"observation": np.array(history["observation"]), "terminated": episode_ended,
            "reward": np.array(history["reward"]), "action": np.array(history["action"]),
            "action_dist": np.array(history["action_dist"]), "logp": np.array(history["logp"])}
+ :param gamma: (float) The discount value + :param lam: (float) the tradeoff between exploration and exploitation + :param timesteps_per_batch: (int) the number of timesteps for each batch + :param num_timesteps: (int) the total number of timesteps to run + :param animate: (bool) if render env + :param callback: (function) called every step, used for logging and saving + :param desired_kl: (float) the Kullback leibler weight for the loss + """ + obfilter = ZFilter(env.observation_space.shape) + + max_pathlength = env.spec.timestep_limit + stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') + inputs, loss, loss_sampled = policy.update_info + optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize * (1 - 0.9), momentum=0.9, kfac_update=2, + epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, + weight_decay_dict=policy.wd_dict, max_grad_norm=None) + pi_var_list = [] + for var in tf.trainable_variables(): + if "pi" in var.name: + pi_var_list.append(var) + + update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) + do_update = tf_util.function(inputs, update_op) + tf_util.initialize() + + # start queue runners + enqueue_threads = [] + coord = tf.train.Coordinator() + for queue_runner in [q_runner, value_fn.q_runner]: + assert queue_runner is not None + enqueue_threads.extend(queue_runner.create_threads(tf.get_default_session(), coord=coord, start=True)) + + i = 0 + timesteps_so_far = 0 + while True: + if timesteps_so_far > num_timesteps: + break + logger.log("********** Iteration %i ************" % i) + + # Collect paths until we have enough timesteps + timesteps_this_batch = 0 + paths = [] + while True: + path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), + obfilter=obfilter) + paths.append(path) + timesteps_this_batch += path["reward"].shape[0] + timesteps_so_far += path["reward"].shape[0] + if timesteps_this_batch > timesteps_per_batch: + break + + # 
Estimate advantage function + vtargs = [] + advs = [] + for path in paths: + rew_t = path["reward"] + return_t = common.discount(rew_t, gamma) + vtargs.append(return_t) + vpred_t = value_fn.predict(path) + vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) + delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] + adv_t = common.discount(delta_t, gamma * lam) + advs.append(adv_t) + # Update value function + value_fn.fit(paths, vtargs) + + # Build arrays for policy update + ob_no = np.concatenate([path["observation"] for path in paths]) + action_na = np.concatenate([path["action"] for path in paths]) + oldac_dist = np.concatenate([path["action_dist"] for path in paths]) + adv_n = np.concatenate(advs) + standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) + + # Policy update + do_update(ob_no, action_na, standardized_adv_n) + + min_stepsize = np.float32(1e-8) + max_stepsize = np.float32(1e0) + # Adjust stepsize + kl_loss = policy.compute_kl(ob_no, oldac_dist) + if kl_loss > desired_kl * 2: + logger.log("kl too high") + tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() + elif kl_loss < desired_kl / 2: + logger.log("kl too low") + tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() + else: + logger.log("kl just right!") + + logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) + logger.record_tabular("EpRewSEM", np.std([path["reward"].sum() / np.sqrt(len(paths)) for path in paths])) + logger.record_tabular("EpLenMean", np.mean([path["reward"].shape[0] for path in paths])) + logger.record_tabular("KL", kl_loss) + if callback: + callback() + logger.dump_tabular() + i += 1 + + coord.request_stop() + coord.join(enqueue_threads) diff --git a/stable_baselines/acktr/acktr_disc.py b/stable_baselines/acktr/acktr_disc.py new file mode 100644 index 0000000000..0101f0051d --- /dev/null +++ b/stable_baselines/acktr/acktr_disc.py @@ -0,0 +1,294 @@ +""" +Discrete acktr +""" + +import 
class ACKTR(BaseRLModel):
    """
    The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class, https://arxiv.org/abs/1708.05144

    :param policy: (ActorCriticPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) Discount factor
    :param nprocs: (int) The number of threads for TensorFlow operations
    :param n_steps: (int) The number of steps to run for each environment
    :param ent_coef: (float) The weight for the entropic loss
    :param vf_coef: (float) The weight for the loss on the value function
    :param vf_fisher_coef: (float) The weight for the fisher loss on the value function
    :param learning_rate: (float) The initial learning rate, annealed following lr_schedule and fed to the
        KFAC optimizer
    :param max_grad_norm: (float) The clipping value for the maximum gradient
    :param kfac_clip: (float) gradient clipping for Kullback-Leibler
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    """

    def __init__(self, policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0,
                 learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
                 _init_setup_model=True):
        super(ACKTR, self).__init__(policy=policy, env=env, requires_vec_env=True, verbose=verbose)

        # hyper-parameters
        self.n_steps = n_steps
        self.gamma = gamma
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.vf_fisher_coef = vf_fisher_coef
        self.kfac_clip = kfac_clip
        self.max_grad_norm = max_grad_norm
        self.learning_rate = learning_rate
        self.lr_schedule = lr_schedule
        self.nprocs = nprocs

        # graph elements, filled in by setup_model() and learn()
        self.graph = None
        self.sess = None
        self.action_ph = None
        self.advs_ph = None
        self.rewards_ph = None
        self.pg_lr_ph = None
        self.model = None
        self.model2 = None
        self.logits = None
        self.entropy = None
        self.pg_loss = None
        self.vf_loss = None
        self.pg_fisher = None
        self.vf_fisher = None
        self.joint_fisher = None
        self.params = None
        self.grads_check = None
        self.optim = None
        self.train_op = None
        self.q_runner = None
        self.learning_rate_schedule = None
        self.train_model = None
        self.step_model = None
        self.step = None
        self.proba_step = None
        self.value = None
        self.initial_state = None
        self.n_batch = None

        if _init_setup_model:
            self.setup_model()

    def setup_model(self):
        """
        Build the TensorFlow graph: the step and train networks, the losses and the KFAC optimizer

        The training op itself is not created here but at every call of learn().
        """
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=self.nprocs, graph=self.graph)

                self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
                self.rewards_ph = rewards_ph = tf.placeholder(tf.float32, [None])
                self.pg_lr_ph = pg_lr_ph = tf.placeholder(tf.float32, [])

                # only the LSTM policies are given an explicit batch size
                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, LstmPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                # the two networks share their variables (reuse=True on the second one)
                self.model = step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs,
                                                      1, n_batch_step, reuse=False)
                self.model2 = train_model = self.policy(self.sess, self.observation_space, self.action_space,
                                                        self.n_envs, self.n_steps, n_batch_train,
                                                        reuse=True)

                self.action_ph = action_ph = train_model.pdtype.sample_placeholder([None])

                logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy, labels=action_ph)
                self.logits = train_model.policy

                # training loss
                pg_loss = tf.reduce_mean(advs_ph * logpac)
                self.entropy = entropy = tf.reduce_mean(calc_entropy(train_model.policy))
                self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
                self.vf_loss = vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
                train_loss = pg_loss + self.vf_coef * vf_loss

                # Fisher loss construction
                self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
                sample_net = train_model.value_fn + tf.random_normal(tf.shape(train_model.value_fn))
                self.vf_fisher = vf_fisher_loss = - self.vf_fisher_coef * tf.reduce_mean(
                    tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
                self.joint_fisher = pg_fisher_loss + vf_fisher_loss

                self.params = params = find_trainable_variables("model")

                self.grads_check = tf.gradients(train_loss, params)

                with tf.device('/gpu:0'):
                    # `async` is a reserved keyword since Python 3.7: spelling the argument `async=1`
                    # makes this whole module a SyntaxError there. Passing it through ** binds the
                    # very same KfacOptimizer parameter on every Python version.
                    self.optim = optim = kfac.KfacOptimizer(learning_rate=pg_lr_ph, clip_kl=self.kfac_clip,
                                                            momentum=0.9, kfac_update=1, epsilon=0.01,
                                                            stats_decay=0.99, cold_iter=10,
                                                            max_grad_norm=self.max_grad_norm, verbose=self.verbose,
                                                            **{'async': 1})

                    optim.compute_and_apply_stats(self.joint_fisher, var_list=params)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)

    def _train_step(self, obs, states, rewards, masks, actions, values):
        """
        applies a training step to the model

        :param obs: ([float]) The input observations
        :param states: ([float]) The states (used for recurrent policies)
        :param rewards: ([float]) The rewards from the environment
        :param masks: ([bool]) Whether or not the episode is over (used for recurrent policies)
        :param actions: ([float]) The actions taken
        :param values: ([float]) The values subtracted from the rewards to form the advantages
        :return: (float, float, float) policy loss, value loss, policy entropy
        """
        advs = rewards - values
        cur_lr = None
        # the scheduler is queried once per observation; only the last returned value is used
        for _ in range(len(obs)):
            cur_lr = self.learning_rate_schedule.value()
        assert cur_lr is not None, "Error: the observation input array cannon be empty"

        td_map = {self.train_model.obs_ph: obs, self.action_ph: actions, self.advs_ph: advs, self.rewards_ph: rewards,
                  self.pg_lr_ph: cur_lr}
        if states is not None:
            td_map[self.train_model.states_ph] = states
            td_map[self.train_model.masks_ph] = masks

        policy_loss, value_loss, policy_entropy, _ = self.sess.run(
            [self.pg_loss, self.vf_loss, self.entropy, self.train_op],
            td_map
        )
        return policy_loss, value_loss, policy_entropy

    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100):
        """
        Train the model for total_timesteps // (n_envs * n_steps) updates

        :param total_timesteps: (int) The total number of samples to train on
        :param callback: (function (dict, dict)) called after every update with locals() and globals()
        :param seed: (int) The initial seed for training, forwarded to _setup_learn
        :param log_interval: (int) The number of updates between two logs (the first update is always logged)
        :return: (ACKTR) the trained model (self)
        """
        with SetVerbosity(self.verbose):
            self._setup_learn(seed)
            self.n_batch = self.n_envs * self.n_steps

            self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
                                                    schedule=self.lr_schedule)

            # FIFO queue of the q_runner thread is closed at the end of the learn function.
            # As a result, it needs to be redefined at every call
            with self.graph.as_default():
                # Some of the variables are not in a scope when they are created
                # so we make a note of any previously uninitialized variables
                # (note: the fetched flags are True for the variables that ARE initialized)
                tf_vars = tf.global_variables()
                is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars])
                old_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f]

                self.train_op, self.q_runner = self.optim.apply_gradients(list(zip(self.grads_check, self.params)))

                # then we check for new uninitialized variables and initialize them
                tf_vars = tf.global_variables()
                is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars])
                new_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized)
                                          if not f and v not in old_uninitialized_vars]

                if len(new_uninitialized_vars) != 0:
                    self.sess.run(tf.variables_initializer(new_uninitialized_vars))

            runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)

            t_start = time.time()
            coord = tf.train.Coordinator()
            enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True)
            for update in range(1, total_timesteps // self.n_batch + 1):
                obs, states, rewards, masks, actions, values = runner.run()
                policy_loss, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                # the callback receives the local variables of this function: do not rename them
                if callback is not None:
                    callback(locals(), globals())

                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps", update * self.n_batch)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy", float(policy_entropy))
                    logger.record_tabular("policy_loss", float(policy_loss))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance", float(explained_var))
                    logger.dump_tabular()

            coord.request_stop()
            coord.join(enqueue_threads)
        return self

    def predict(self, observation, state=None, mask=None):
        """
        Get the model's action from an observation

        :param observation: (numpy Number) the input observation
        :param state: (numpy Number) The last states (can be None, used in recurrent policies)
        :param mask: (numpy Number) The last masks (can be None, used in recurrent policies)
        :return: (numpy Number, numpy Number) the model's action and the next state (used in recurrent policies)
        """
        if state is None:
            state = self.initial_state
        if mask is None:
            mask = [False for _ in range(self.n_envs)]
        observation = np.array(observation).reshape((-1,) + self.observation_space.shape)

        actions, _, states, _ = self.step(observation, state, mask)
        return actions, states

    def action_probability(self, observation, state=None, mask=None):
        """
        Get the model's action probability distribution from an observation

        :param observation: (numpy Number) the input observation
        :param state: (numpy Number) The last states (can be None, used in recurrent policies)
        :param mask: (numpy Number) The last masks (can be None, used in recurrent policies)
        :return: (numpy Number) the model's action probability distribution
        """
        if state is None:
            state = self.initial_state
        if mask is None:
            mask = [False for _ in range(self.n_envs)]
        observation = np.array(observation).reshape((-1,) + self.observation_space.shape)

        return self.proba_step(observation, state, mask)

    def save(self, save_path):
        """
        Save the hyper-parameters and the current network parameters of the model

        :param save_path: forwarded unchanged to ``_save_to_file`` (presumably the destination path)
        """
        data = {
            "gamma": self.gamma,
            "nprocs": self.nprocs,
            "n_steps": self.n_steps,
            "vf_coef": self.vf_coef,
            "ent_coef": self.ent_coef,
            "vf_fisher_coef": self.vf_fisher_coef,
            "max_grad_norm": self.max_grad_norm,
            "learning_rate": self.learning_rate,
            "kfac_clip": self.kfac_clip,
            "lr_schedule": self.lr_schedule,
            "verbose": self.verbose,
            "policy": self.policy,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "n_envs": self.n_envs,
            "_vectorize_action": self._vectorize_action
        }

        params = self.sess.run(self.params)

        self._save_to_file(save_path, data=data, params=params)

    @classmethod
    def load(cls, load_path, env=None, **kwargs):
        """
        Rebuild a model from what ``_load_from_file`` returns for load_path

        :param load_path: forwarded unchanged to ``_load_from_file``
        :param env: (Gym environment) the environment to attach to the loaded model
        :param kwargs: extra attributes, they take precedence over the saved ones
        :return: (ACKTR) the loaded model
        """
        data, params = cls._load_from_file(load_path)

        # the graph is only built once the saved attributes have been restored
        model = cls(policy=data["policy"], env=env, _init_setup_model=False)
        model.__dict__.update(data)
        model.__dict__.update(kwargs)
        model.set_env(env)
        model.setup_model()

        restores = []
        for param, loaded_p in zip(model.params, params):
            restores.append(param.assign(loaded_p))
        model.sess.run(restores)

        return model
a/stable_baselines/acktr/kfac.py b/stable_baselines/acktr/kfac.py new file mode 100644 index 0000000000..d00cacee0d --- /dev/null +++ b/stable_baselines/acktr/kfac.py @@ -0,0 +1,1005 @@ +import re +from functools import reduce + +import tensorflow as tf +import numpy as np + +from stable_baselines.acktr.kfac_utils import detect_min_val, factor_reshape, gmatmul + +KFAC_OPS = ['MatMul', 'Conv2D', 'BiasAdd'] +KFAC_DEBUG = False + + +class KfacOptimizer: + def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, + full_stats_init=False, cold_iter=100, cold_lr=None, async=False, async_stats=False, epsilon=1e-2, + stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approx_t2=False, + use_float64=False, weight_decay_dict=None, max_grad_norm=0.5, verbose=1): + """ + Kfac Optimizer for ACKTR models + link: https://arxiv.org/pdf/1708.05144.pdf + + :param learning_rate: (float) The learning rate + :param momentum: (float) The momentum value for the TensorFlow momentum optimizer + :param clip_kl: (float) gradient clipping for Kullback leiber + :param kfac_update: (int) update kfac after kfac_update steps + :param stats_accum_iter: (int) how may steps to accumulate stats + :param full_stats_init: (bool) whether or not to fully initalize stats + :param cold_iter: (int) Cold start learning rate for how many steps + :param cold_lr: (float) Cold start learning rate + :param async: (bool) Use async eigen decomposition + :param async_stats: (bool) Asynchronous stats update + :param epsilon: (float) epsilon value for small numbers + :param stats_decay: (float) the stats decay rate + :param blockdiag_bias: (bool) + :param channel_fac: (bool) factorization along the channels + :param factored_damping: (bool) use factored damping + :param approx_t2: (bool) approximate T2 act and grad fisher + :param use_float64: (bool) use 64-bit float + :param weight_decay_dict: (dict) custom weight decay coeff for a given 
gradient + :param max_grad_norm: (float) The maximum value for the gradient clipping + :param verbose: (int) verbosity level + """ + self.max_grad_norm = max_grad_norm + self._lr = learning_rate + self._momentum = momentum + self._clip_kl = clip_kl + self._channel_fac = channel_fac + self._kfac_update = kfac_update + self._async = async + self._async_stats = async_stats + self._epsilon = epsilon + self._stats_decay = stats_decay + self._blockdiag_bias = blockdiag_bias + self._approx_t2 = approx_t2 + self._use_float64 = use_float64 + self._factored_damping = factored_damping + self._cold_iter = cold_iter + self.verbose = verbose + if cold_lr is None: + # good heuristics + self._cold_lr = self._lr # * 3. + else: + self._cold_lr = cold_lr + self._stats_accum_iter = stats_accum_iter + if weight_decay_dict is None: + weight_decay_dict = {} + self._weight_decay_dict = weight_decay_dict + self._diag_init_coeff = 0. + self._full_stats_init = full_stats_init + if not self._full_stats_init: + self._stats_accum_iter = self._cold_iter + + self.sgd_step = tf.Variable(0, name='KFAC/sgd_step', trainable=False) + self.global_step = tf.Variable( + 0, name='KFAC/global_step', trainable=False) + self.cold_step = tf.Variable(0, name='KFAC/cold_step', trainable=False) + self.factor_step = tf.Variable( + 0, name='KFAC/factor_step', trainable=False) + self.stats_step = tf.Variable( + 0, name='KFAC/stats_step', trainable=False) + self.v_f_v = tf.Variable(0., name='KFAC/vFv', trainable=False) + + self.factors = {} + self.param_vars = [] + self.stats = {} + self.stats_eigen = {} + + def get_factors(self, gradients, varlist): + """ + get factors to update + + :param gradients: ([TensorFlow Tensor]) The gradients + :param varlist: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) The factors to update + """ + default_graph = tf.get_default_graph() + factor_tensors = {} + fprop_tensors = [] + bprop_tensors = [] + op_types = [] + + def _search_factors(gradient, graph): + # 
hard coded search stratergy + bprop_op = gradient.op + bprop_op_name = bprop_op.name + + b_tensors = [] + f_tensors = [] + + # combining additive gradient, assume they are the same op type and + # indepedent + if 'AddN' in bprop_op_name: + factors = [] + for grad in gradient.op.inputs: + factors.append(_search_factors(grad, graph)) + op_names = [_item['opName'] for _item in factors] + if self.verbose >= 1: + # TODO: need to check all the attribute of the ops as well + print(gradient.name) + print(op_names) + print(len(np.unique(op_names))) + assert len(np.unique(op_names)) == 1, \ + 'Error: {} is shared among different computation OPs'.format(gradient.name) + + b_tensors = reduce(lambda x, y: x + y, + [_item['bpropFactors'] for _item in factors]) + if len(factors[0]['fpropFactors']) > 0: + f_tensors = reduce( + lambda x, y: x + y, [_item['fpropFactors'] for _item in factors]) + fprop_op_name = op_names[0] + fprop_op = factors[0]['op'] + else: + fprop_op_name = re.search('gradientsSampled(_[0-9]+|)/(.+?)_grad', bprop_op_name).group(2) + fprop_op = graph.get_operation_by_name(fprop_op_name) + if fprop_op.op_def.name in KFAC_OPS: + # Known OPs + b_tensor = [_i for _i in bprop_op.inputs if 'gradientsSampled' in _i.name][-1] + b_tensor_shape = fprop_op.outputs[0].get_shape() + if b_tensor.get_shape()[0].value is None: + b_tensor.set_shape(b_tensor_shape) + b_tensors.append(b_tensor) + + if fprop_op.op_def.name == 'BiasAdd': + f_tensors = [] + else: + f_tensors.append([_i for _i in fprop_op.inputs if param.op.name not in _i.name][0]) + fprop_op_name = fprop_op.op_def.name + else: + # unknown OPs, block approximation used + b_inputs_list = [_i for _i in bprop_op.inputs[0].op.inputs + if 'gradientsSampled' in _i.name if 'Shape' not in _i.name] + if len(b_inputs_list) > 0: + b_tensor = b_inputs_list[0] + # only if tensor shape is defined, usually this will prevent tensor like Sum:0 to be used. 
+ if b_tensor.get_shape(): + b_tensor_shape = fprop_op.outputs[0].get_shape() + if len(b_tensor.get_shape()) > 0 and b_tensor.get_shape()[0].value is None: + b_tensor.set_shape(b_tensor_shape) + b_tensors.append(b_tensor) + fprop_op_name = op_types.append('UNK-' + fprop_op.op_def.name) + + return {'opName': fprop_op_name, 'op': fprop_op, 'fpropFactors': f_tensors, 'bpropFactors': b_tensors} + + for _grad, param in zip(gradients, varlist): + if KFAC_DEBUG: + print(('get factor for ' + param.name)) + found_factors = _search_factors(_grad, default_graph) + factor_tensors[param] = found_factors + + # check associated weights and bias for homogeneous coordinate representation + # and check redundent factors + # TODO: there may be a bug to detect associate bias and weights for forking layer, e.g. in inception models. + for param in varlist: + factor_tensors[param]['assnWeights'] = None + factor_tensors[param]['assnBias'] = None + for param in varlist: + if factor_tensors[param]['opName'] == 'BiasAdd': + factor_tensors[param]['assnWeights'] = None + for item in varlist: + if len(factor_tensors[item]['bpropFactors']) > 0: + if (set(factor_tensors[item]['bpropFactors']) == set(factor_tensors[param]['bpropFactors'])) \ + and (len(factor_tensors[item]['fpropFactors']) > 0): + factor_tensors[param]['assnWeights'] = item + factor_tensors[item]['assnBias'] = param + factor_tensors[param]['bpropFactors'] = factor_tensors[ + item]['bpropFactors'] + + # concatenate the additive gradients along the batch dimension, i.e. 
assuming independence structure + for key in ['fpropFactors', 'bpropFactors']: + for i, param in enumerate(varlist): + if len(factor_tensors[param][key]) > 0: + if (key + '_concat') not in factor_tensors[param]: + name_scope = factor_tensors[param][key][0].name.split(':')[ + 0] + with tf.name_scope(name_scope): + factor_tensors[param][ + key + '_concat'] = tf.concat(factor_tensors[param][key], 0) + else: + factor_tensors[param][key + '_concat'] = None + for _, param2 in enumerate(varlist[(i + 1):]): + if (len(factor_tensors[param][key]) > 0) and ( + set(factor_tensors[param2][key]) == set(factor_tensors[param][key])): + factor_tensors[param2][key] = factor_tensors[param][key] + factor_tensors[param2][ + key + '_concat'] = factor_tensors[param][key + '_concat'] + + if KFAC_DEBUG: + for items in zip(varlist, fprop_tensors, bprop_tensors, op_types): + print((items[0].name, factor_tensors[item])) + self.factors = factor_tensors + return factor_tensors + + def get_stats(self, factors, varlist): + """ + return the stats values from the factors to update and the parameters + + :param factors: ([TensorFlow Tensor]) The factors to update + :param varlist: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) The stats values + """ + if len(self.stats) == 0: + # initialize stats variables on CPU because eigen decomp is + # computed on CPU + with tf.device('/cpu'): + tmp_stats_cache = {} + + # search for tensor factors and + # use block diag approx for the bias units + for var in varlist: + bprop_factor = factors[var]['bpropFactors_concat'] + op_type = factors[var]['opName'] + if op_type == 'Conv2D': + operator_height = bprop_factor.get_shape()[1] + operator_width = bprop_factor.get_shape()[2] + if operator_height == 1 and operator_width == 1 and self._channel_fac: + # factorization along the channels do not support + # homogeneous coordinate + var_assn_bias = factors[var]['assnBias'] + if var_assn_bias: + factors[var]['assnBias'] = None + 
factors[var_assn_bias]['assnWeights'] = None + + for var in varlist: + fprop_factor = factors[var]['fpropFactors_concat'] + bprop_factor = factors[var]['bpropFactors_concat'] + op_type = factors[var]['opName'] + self.stats[var] = {'opName': op_type, + 'fprop_concat_stats': [], + 'bprop_concat_stats': [], + 'assnWeights': factors[var]['assnWeights'], + 'assnBias': factors[var]['assnBias'], + } + if fprop_factor is not None: + if fprop_factor not in tmp_stats_cache: + if op_type == 'Conv2D': + kernel_height = var.get_shape()[0] + kernel_width = var.get_shape()[1] + n_channels = fprop_factor.get_shape()[-1] + + operator_height = bprop_factor.get_shape()[1] + operator_width = bprop_factor.get_shape()[2] + if operator_height == 1 and operator_width == 1 and self._channel_fac: + # factorization along the channels + # assume independence between input channels and spatial + # 2K-1 x 2K-1 covariance matrix and C x C covariance matrix + # factorization along the channels do not + # support homogeneous coordinate, assnBias + # is always None + fprop_factor2_size = kernel_height * kernel_width + slot_fprop_factor_stats2 = tf.Variable(tf.diag(tf.ones( + [fprop_factor2_size])) * self._diag_init_coeff, + name='KFAC_STATS/' + fprop_factor.op.name, + trainable=False) + self.stats[var]['fprop_concat_stats'].append( + slot_fprop_factor_stats2) + + fprop_factor_size = n_channels + else: + # 2K-1 x 2K-1 x C x C covariance matrix + # assume BHWC + fprop_factor_size = kernel_height * kernel_width * n_channels + else: + # D x D covariance matrix + fprop_factor_size = fprop_factor.get_shape()[-1] + + # use homogeneous coordinate + if not self._blockdiag_bias and self.stats[var]['assnBias']: + fprop_factor_size += 1 + + slot_fprop_factor_stats = tf.Variable( + tf.diag(tf.ones([fprop_factor_size])) * self._diag_init_coeff, + name='KFAC_STATS/' + fprop_factor.op.name, trainable=False) + self.stats[var]['fprop_concat_stats'].append( + slot_fprop_factor_stats) + if op_type != 'Conv2D': + 
tmp_stats_cache[fprop_factor] = self.stats[ + var]['fprop_concat_stats'] + else: + self.stats[var][ + 'fprop_concat_stats'] = tmp_stats_cache[fprop_factor] + + if bprop_factor is not None: + # no need to collect backward stats for bias vectors if + # using homogeneous coordinates + if not ((not self._blockdiag_bias) and self.stats[var]['assnWeights']): + if bprop_factor not in tmp_stats_cache: + slot_bprop_factor_stats = tf.Variable(tf.diag(tf.ones([bprop_factor.get_shape( + )[-1]])) * self._diag_init_coeff, name='KFAC_STATS/' + bprop_factor.op.name, + trainable=False) + self.stats[var]['bprop_concat_stats'].append( + slot_bprop_factor_stats) + tmp_stats_cache[bprop_factor] = self.stats[ + var]['bprop_concat_stats'] + else: + self.stats[var][ + 'bprop_concat_stats'] = tmp_stats_cache[bprop_factor] + + return self.stats + + def compute_and_apply_stats(self, loss_sampled, var_list=None): + """ + compute and apply stats + + :param loss_sampled: ([TensorFlow Tensor]) the loss function output + :param var_list: ([TensorFlow Tensor]) The parameters + :return: (function) apply stats + """ + varlist = var_list + if varlist is None: + varlist = tf.trainable_variables() + + stats = self.compute_stats(loss_sampled, var_list=varlist) + return self.apply_stats(stats) + + def compute_stats(self, loss_sampled, var_list=None): + """ + compute the stats values + + :param loss_sampled: ([TensorFlow Tensor]) the loss function output + :param var_list: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) stats updates + """ + varlist = var_list + if varlist is None: + varlist = tf.trainable_variables() + + gradient_sampled = tf.gradients(loss_sampled, varlist, name='gradientsSampled') + self.gradient_sampled = gradient_sampled + + # remove unused variables + gradient_sampled, varlist = zip(*[(grad, var) for (grad, var) in zip(gradient_sampled, varlist) + if grad is not None]) + + factors = self.get_factors(gradient_sampled, varlist) + stats = self.get_stats(factors, 
varlist) + + update_ops = [] + stats_updates = {} + stats_updates_cache = {} + for var in varlist: + op_type = factors[var]['opName'] + fops = factors[var]['op'] + fprop_factor = factors[var]['fpropFactors_concat'] + fprop_stats_vars = stats[var]['fprop_concat_stats'] + bprop_factor = factors[var]['bpropFactors_concat'] + bprop_stats_vars = stats[var]['bprop_concat_stats'] + svd_factors = {} + for stats_var in fprop_stats_vars: + stats_var_dim = int(stats_var.get_shape()[0]) + if stats_var not in stats_updates_cache: + batch_size = (tf.shape(fprop_factor)[0]) # batch size + if op_type == 'Conv2D': + strides = fops.get_attr("strides") + padding = fops.get_attr("padding") + convkernel_size = var.get_shape()[0:3] + + kernel_height = int(convkernel_size[0]) + kernel_width = int(convkernel_size[1]) + chan = int(convkernel_size[2]) + flatten_size = int(kernel_height * kernel_width * chan) + + operator_height = int(bprop_factor.get_shape()[1]) + operator_width = int(bprop_factor.get_shape()[2]) + + if operator_height == 1 and operator_width == 1 and self._channel_fac: + # factorization along the channels + # assume independence among input channels + # factor = B x 1 x 1 x (KH xKW x C) + # patches = B x Oh x Ow x (KH xKW x C) + if len(svd_factors) == 0: + if KFAC_DEBUG: + print(('approx %s act factor with rank-1 SVD factors' % var.name)) + # find closest rank-1 approx to the feature map + S, U, V = tf.batch_svd(tf.reshape( + fprop_factor, [-1, kernel_height * kernel_width, chan])) + # get rank-1 approx slides + sqrt_s1 = tf.expand_dims(tf.sqrt(S[:, 0, 0]), 1) + patches_k = U[:, :, 0] * sqrt_s1 # B x KH*KW + full_factor_shape = fprop_factor.get_shape() + patches_k.set_shape( + [full_factor_shape[0], kernel_height * kernel_width]) + patches_c = V[:, :, 0] * sqrt_s1 # B x C + patches_c.set_shape([full_factor_shape[0], chan]) + svd_factors[chan] = patches_c + svd_factors[kernel_height * kernel_width] = patches_k + fprop_factor = svd_factors[stats_var_dim] + + else: + # poor 
mem usage implementation + patches = tf.extract_image_patches(fprop_factor, ksizes=[1, convkernel_size[ + 0], convkernel_size[1], 1], strides=strides, rates=[1, 1, 1, 1], padding=padding) + + if self._approx_t2: + if KFAC_DEBUG: + print(('approxT2 act fisher for %s' % var.name)) + # T^2 terms * 1/T^2, size: B x C + fprop_factor = tf.reduce_mean(patches, [1, 2]) + else: + # size: (B x Oh x Ow) x C + fprop_factor = tf.reshape( + patches, [-1, flatten_size]) / operator_height / operator_width + fprop_factor_size = int(fprop_factor.get_shape()[-1]) + if stats_var_dim == (fprop_factor_size + 1) and not self._blockdiag_bias: + if op_type == 'Conv2D' and not self._approx_t2: + # correct padding for numerical stability (we + # divided out OhxOw from activations for T1 approx) + fprop_factor = tf.concat([fprop_factor, tf.ones( + [tf.shape(fprop_factor)[0], 1]) / operator_height / operator_width], 1) + else: + # use homogeneous coordinates + fprop_factor = tf.concat( + [fprop_factor, tf.ones([tf.shape(fprop_factor)[0], 1])], 1) + + # average over the number of data points in a batch + # divided by B + cov = tf.matmul(fprop_factor, fprop_factor, + transpose_a=True) / tf.cast(batch_size, tf.float32) + update_ops.append(cov) + stats_updates[stats_var] = cov + if op_type != 'Conv2D': + # HACK: for convolution we recompute fprop stats for + # every layer including forking layers + stats_updates_cache[stats_var] = cov + + for stats_var in bprop_stats_vars: + if stats_var not in stats_updates_cache: + bprop_factor_shape = bprop_factor.get_shape() + batch_size = tf.shape(bprop_factor)[0] # batch size + chan = int(bprop_factor_shape[-1]) # num channels + if op_type == 'Conv2D' or len(bprop_factor_shape) == 4: + if fprop_factor is not None: + if self._approx_t2: + if KFAC_DEBUG: + print(('approxT2 grad fisher for %s' % var.name)) + bprop_factor = tf.reduce_sum( + bprop_factor, [1, 2]) # T^2 terms * 1/T^2 + else: + bprop_factor = tf.reshape( + bprop_factor, [-1, chan]) * 
operator_height * operator_width # T * 1/T terms + else: + # just doing block diag approx. spatial independent + # structure does not apply here. summing over + # spatial locations + if KFAC_DEBUG: + print(('block diag approx fisher for %s' % var.name)) + bprop_factor = tf.reduce_sum(bprop_factor, [1, 2]) + + # assume sampled loss is averaged. TODO:figure out better + # way to handle this + bprop_factor *= tf.to_float(batch_size) + ## + + cov_b = tf.matmul( + bprop_factor, bprop_factor, transpose_a=True) / tf.to_float(tf.shape(bprop_factor)[0]) + + update_ops.append(cov_b) + stats_updates[stats_var] = cov_b + stats_updates_cache[stats_var] = cov_b + + if KFAC_DEBUG: + a_key = list(stats_updates.keys())[0] + stats_updates[a_key] = tf.Print(stats_updates[a_key], [tf.convert_to_tensor('step:'), self.global_step, + tf.convert_to_tensor('computing stats')]) + self.stats_updates = stats_updates + return stats_updates + + def apply_stats(self, stats_updates): + """ + compute stats and update/apply the new stats to the running average + + :param stats_updates: ([TensorFlow Tensor]) The stats updates + :return: (function) update stats operation + """ + + def _update_accum_stats(): + if self._full_stats_init: + return tf.cond(tf.greater(self.sgd_step, self._cold_iter), lambda: tf.group( + *self._apply_stats(stats_updates, accumulate=True, accumulate_coeff=1. / self._stats_accum_iter)), + tf.no_op) + else: + return tf.group( + *self._apply_stats(stats_updates, accumulate=True, accumulate_coeff=1. 
/ self._stats_accum_iter)) + + def _update_running_avg_stats(stats_updates): + return tf.group(*self._apply_stats(stats_updates)) + + if self._async_stats: + # asynchronous stats update + update_stats = self._apply_stats(stats_updates) + + queue = tf.FIFOQueue(1, [item.dtype for item in update_stats], shapes=[ + item.get_shape() for item in update_stats]) + enqueue_op = queue.enqueue(update_stats) + + def dequeue_stats_op(): + return queue.dequeue() + + self.qr_stats = tf.train.QueueRunner(queue, [enqueue_op]) + update_stats_op = tf.cond(tf.equal(queue.size(), tf.convert_to_tensor( + 0)), tf.no_op, lambda: tf.group(*[dequeue_stats_op(), ])) + else: + # synchronous stats update + update_stats_op = tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), + lambda: _update_running_avg_stats(stats_updates), _update_accum_stats) + self._update_stats_op = update_stats_op + return update_stats_op + + def _apply_stats(self, stats_updates, accumulate=False, accumulate_coeff=0.): + update_ops = [] + # obtain the stats var list + for stats_var in stats_updates: + stats_new = stats_updates[stats_var] + if accumulate: + # simple superbatch averaging + update_op = tf.assign_add( + stats_var, accumulate_coeff * stats_new, use_locking=True) + else: + # exponential running averaging + update_op = tf.assign( + stats_var, stats_var * self._stats_decay, use_locking=True) + update_op = tf.assign_add( + update_op, (1. 
- self._stats_decay) * stats_new, use_locking=True) + update_ops.append(update_op) + + with tf.control_dependencies(update_ops): + stats_step_op = tf.assign_add(self.stats_step, 1) + + if KFAC_DEBUG: + stats_step_op = (tf.Print(stats_step_op, + [tf.convert_to_tensor('step:'), + self.global_step, + tf.convert_to_tensor('fac step:'), + self.factor_step, + tf.convert_to_tensor('sgd step:'), + self.sgd_step, + tf.convert_to_tensor('Accum:'), + tf.convert_to_tensor(accumulate), + tf.convert_to_tensor('Accum coeff:'), + tf.convert_to_tensor(accumulate_coeff), + tf.convert_to_tensor('stat step:'), + self.stats_step, update_ops[0], update_ops[1]])) + return [stats_step_op, ] + + def get_stats_eigen(self, stats=None): + """ + Return the eigen values from the stats + + :param stats: ([TensorFlow Tensor]) The stats + :return: ([TensorFlow Tensor]) The stats eigen values + """ + if len(self.stats_eigen) == 0: + stats_eigen = {} + if stats is None: + stats = self.stats + + tmp_eigen_cache = {} + with tf.device('/cpu:0'): + for var in stats: + for key in ['fprop_concat_stats', 'bprop_concat_stats']: + for stats_var in stats[var][key]: + if stats_var not in tmp_eigen_cache: + stats_dim = stats_var.get_shape()[1].value + eigen_values = tf.Variable(tf.ones( + [stats_dim]), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/e', + trainable=False) + eigen_vectors = tf.Variable(tf.diag(tf.ones( + [stats_dim])), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/Q', + trainable=False) + stats_eigen[stats_var] = {'e': eigen_values, 'Q': eigen_vectors} + tmp_eigen_cache[ + stats_var] = stats_eigen[stats_var] + else: + stats_eigen[stats_var] = tmp_eigen_cache[ + stats_var] + self.stats_eigen = stats_eigen + return self.stats_eigen + + def compute_stats_eigen(self): + """ + compute the eigen decomp using copied var stats to avoid concurrent read/write from other queue + + :return: ([TensorFlow Tensor]) update operations + """ + # TODO: figure out why this op has delays (possibly moving 
eigenvectors around?) + with tf.device('/cpu:0'): + stats_eigen = self.stats_eigen + computed_eigen = {} + eigen_reverse_lookup = {} + update_ops = [] + # sync copied stats + with tf.control_dependencies([]): + for stats_var in stats_eigen: + if stats_var not in computed_eigen: + eigen_decomposition = tf.self_adjoint_eig(stats_var) + eigen_values = eigen_decomposition[0] + eigen_vectors = eigen_decomposition[1] + if self._use_float64: + eigen_values = tf.cast(eigen_values, tf.float64) + eigen_vectors = tf.cast(eigen_vectors, tf.float64) + update_ops.append(eigen_values) + update_ops.append(eigen_vectors) + computed_eigen[stats_var] = {'e': eigen_values, 'Q': eigen_vectors} + eigen_reverse_lookup[eigen_values] = stats_eigen[stats_var]['e'] + eigen_reverse_lookup[eigen_vectors] = stats_eigen[stats_var]['Q'] + + self.eigen_reverse_lookup = eigen_reverse_lookup + self.eigen_update_list = update_ops + + if KFAC_DEBUG: + self.eigen_update_list = [item for item in update_ops] + with tf.control_dependencies(update_ops): + update_ops.append(tf.Print(tf.constant( + 0.), [tf.convert_to_tensor('computed factor eigen')])) + + return update_ops + + def apply_stats_eigen(self, eigen_list): + """ + apply the update using the eigen values of the stats + + :param eigen_list: ([TensorFlow Tensor]) The list of eigen values of the stats + :return: ([TensorFlow Tensor]) update operations + """ + update_ops = [] + if self.verbose >= 1: + print(('updating %d eigenvalue/vectors' % len(eigen_list))) + for _, (tensor, mark) in enumerate(zip(eigen_list, self.eigen_update_list)): + stats_eigen_var = self.eigen_reverse_lookup[mark] + update_ops.append( + tf.assign(stats_eigen_var, tensor, use_locking=True)) + + with tf.control_dependencies(update_ops): + factor_step_op = tf.assign_add(self.factor_step, 1) + update_ops.append(factor_step_op) + if KFAC_DEBUG: + update_ops.append(tf.Print(tf.constant( + 0.), [tf.convert_to_tensor('updated kfac factors')])) + return update_ops + + def 
get_kfac_precond_updates(self, gradlist, varlist): + """ + return the KFAC updates + + :param gradlist: ([TensorFlow Tensor]) The gradients + :param varlist: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) the update list + """ + v_g = 0. + + assert len(self.stats) > 0 + assert len(self.stats_eigen) > 0 + assert len(self.factors) > 0 + counter = 0 + + grad_dict = {var: grad for grad, var in zip(gradlist, varlist)} + + for grad, var in zip(gradlist, varlist): + grad_reshape = False + + fprop_factored_fishers = self.stats[var]['fprop_concat_stats'] + bprop_factored_fishers = self.stats[var]['bprop_concat_stats'] + + if (len(fprop_factored_fishers) + len(bprop_factored_fishers)) > 0: + counter += 1 + grad_shape = grad.get_shape() + if len(grad.get_shape()) > 2: + # reshape conv kernel parameters + kernel_width = int(grad.get_shape()[0]) + kernel_height = int(grad.get_shape()[1]) + n_channels = int(grad.get_shape()[2]) + depth = int(grad.get_shape()[3]) + + if len(fprop_factored_fishers) > 1 and self._channel_fac: + # reshape conv kernel parameters into tensor + grad = tf.reshape(grad, [kernel_width * kernel_height, n_channels, depth]) + else: + # reshape conv kernel parameters into 2D grad + grad = tf.reshape(grad, [-1, depth]) + grad_reshape = True + elif len(grad.get_shape()) == 1: + # reshape bias or 1D parameters + + grad = tf.expand_dims(grad, 0) + grad_reshape = True + + if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias: + # use homogeneous coordinates only works for 2D grad. 
+ # TODO: figure out how to factorize bias grad + # stack bias grad + var_assn_bias = self.stats[var]['assnBias'] + grad = tf.concat( + [grad, tf.expand_dims(grad_dict[var_assn_bias], 0)], 0) + + # project gradient to eigen space and reshape the eigenvalues + # for broadcasting + eig_vals = [] + + for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']): + eigen_vectors = self.stats_eigen[stats]['Q'] + eigen_values = detect_min_val(self.stats_eigen[stats][ + 'e'], var, name='act', debug=KFAC_DEBUG) + + eigen_vectors, eigen_values = factor_reshape(eigen_vectors, eigen_values, + grad, fac_idx=idx, f_type='act') + eig_vals.append(eigen_values) + grad = gmatmul(eigen_vectors, grad, transpose_a=True, reduce_dim=idx) + + for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']): + eigen_vectors = self.stats_eigen[stats]['Q'] + eigen_values = detect_min_val(self.stats_eigen[stats][ + 'e'], var, name='grad', debug=KFAC_DEBUG) + + eigen_vectors, eigen_values = factor_reshape(eigen_vectors, eigen_values, + grad, fac_idx=idx, f_type='grad') + eig_vals.append(eigen_values) + grad = gmatmul(grad, eigen_vectors, transpose_b=False, reduce_dim=idx) + + # whiten using eigenvalues + weight_decay_coeff = 0. + if var in self._weight_decay_dict: + weight_decay_coeff = self._weight_decay_dict[var] + if KFAC_DEBUG: + print(('weight decay coeff for %s is %f' % (var.name, weight_decay_coeff))) + + if self._factored_damping: + if KFAC_DEBUG: + print(('use factored damping for %s' % var.name)) + coeffs = 1. + num_factors = len(eig_vals) + # compute the ratio of two trace norm of the left and right + # KFac matrices, and their generalization + if len(eig_vals) == 1: + damping = self._epsilon + weight_decay_coeff + else: + damping = tf.pow( + self._epsilon + weight_decay_coeff, 1. 
/ num_factors) + eig_vals_tnorm_avg = [tf.reduce_mean( + tf.abs(e)) for e in eig_vals] + for eigen_val, e_tnorm in zip(eig_vals, eig_vals_tnorm_avg): + eig_tnorm_neg_list = [ + item for item in eig_vals_tnorm_avg if item != e_tnorm] + if len(eig_vals) == 1: + adjustment = 1. + elif len(eig_vals) == 2: + adjustment = tf.sqrt( + e_tnorm / eig_tnorm_neg_list[0]) + else: + eig_tnorm_neg_list_prod = reduce( + lambda x, y: x * y, eig_tnorm_neg_list) + adjustment = tf.pow( + tf.pow(e_tnorm, num_factors - 1.) / eig_tnorm_neg_list_prod, 1. / num_factors) + coeffs *= (eigen_val + adjustment * damping) + else: + coeffs = 1. + damping = (self._epsilon + weight_decay_coeff) + for eigen_val in eig_vals: + coeffs *= eigen_val + coeffs += damping + + grad /= coeffs + + # project gradient back to euclidean space + for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']): + eigen_vectors = self.stats_eigen[stats]['Q'] + grad = gmatmul(eigen_vectors, grad, transpose_a=False, reduce_dim=idx) + + for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']): + eigen_vectors = self.stats_eigen[stats]['Q'] + grad = gmatmul(grad, eigen_vectors, transpose_b=True, reduce_dim=idx) + + if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias: + # use homogeneous coordinates only works for 2D grad. 
+ # TODO: figure out how to factorize bias grad + # un-stack bias grad + var_assn_bias = self.stats[var]['assnBias'] + c_plus_one = int(grad.get_shape()[0]) + grad_assn_bias = tf.reshape(tf.slice(grad, + begin=[ + c_plus_one - 1, 0], + size=[1, -1]), var_assn_bias.get_shape()) + grad_assn_weights = tf.slice(grad, + begin=[0, 0], + size=[c_plus_one - 1, -1]) + grad_dict[var_assn_bias] = grad_assn_bias + grad = grad_assn_weights + + if grad_reshape: + grad = tf.reshape(grad, grad_shape) + + grad_dict[var] = grad + + if self.verbose >= 1: + print(('projecting %d gradient matrices' % counter)) + + for grad_1, var in zip(gradlist, varlist): + grad = grad_dict[var] + # clipping + if KFAC_DEBUG: + print(('apply clipping to %s' % var.name)) + tf.Print(grad, [tf.sqrt(tf.reduce_sum(tf.pow(grad, 2)))], "Euclidean norm of new grad") + local_vg = tf.reduce_sum(grad * grad_1 * (self._lr * self._lr)) + v_g += local_vg + + # rescale everything + if KFAC_DEBUG: + print('apply vFv clipping') + + scaling = tf.minimum(1., tf.sqrt(self._clip_kl / v_g)) + if KFAC_DEBUG: + scaling = tf.Print(scaling, [tf.convert_to_tensor( + 'clip: '), scaling, tf.convert_to_tensor(' vFv: '), v_g]) + with tf.control_dependencies([tf.assign(self.v_f_v, v_g)]): + updatelist = [grad_dict[var] for var in varlist] + for i, item in enumerate(updatelist): + updatelist[i] = scaling * item + + return updatelist + + @classmethod + def compute_gradients(cls, loss, var_list=None): + """ + compute the gradients from the loss and the parameters + + :param loss: ([TensorFlow Tensor]) The loss + :param var_list: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) the gradient + """ + varlist = var_list + if varlist is None: + varlist = tf.trainable_variables() + gradients = tf.gradients(loss, varlist) + + return [(a, b) for a, b in zip(gradients, varlist)] + + def apply_gradients_kfac(self, grads): + """ + apply the kfac gradient + + :param grads: ([TensorFlow Tensor]) the gradient + :return: 
([function], QueueRunner) Update functions, queue operation runner + """ + grad, varlist = list(zip(*grads)) + + if len(self.stats_eigen) == 0: + self.get_stats_eigen() + + queue_runner = None + # launch eigen-decomp on a queue thread + if self._async: + if self.verbose >= 1: + print('Use async eigen decomp') + # get a list of factor loading tensors + factor_ops_dummy = self.compute_stats_eigen() + + # define a queue for the list of factor loading tensors + queue = tf.FIFOQueue(1, [item.dtype for item in factor_ops_dummy], + shapes=[item.get_shape() for item in factor_ops_dummy]) + enqueue_op = tf.cond( + tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), tf.convert_to_tensor( + 0)), tf.greater_equal(self.stats_step, self._stats_accum_iter)), + lambda: queue.enqueue(self.compute_stats_eigen()), tf.no_op) + + def dequeue_op(): + return queue.dequeue() + + queue_runner = tf.train.QueueRunner(queue, [enqueue_op]) + + update_ops = [] + global_step_op = tf.assign_add(self.global_step, 1) + update_ops.append(global_step_op) + + with tf.control_dependencies([global_step_op]): + + # compute updates + assert self._update_stats_op is not None + update_ops.append(self._update_stats_op) + dependency_list = [] + if not self._async: + dependency_list.append(self._update_stats_op) + + with tf.control_dependencies(dependency_list): + def no_op_wrapper(): + return tf.group(*[tf.assign_add(self.cold_step, 1)]) + + if not self._async: + # synchronous eigen-decomp updates + update_factor_ops = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), + tf.convert_to_tensor(0)), + tf.greater_equal(self.stats_step, + self._stats_accum_iter)), + lambda: tf.group(*self.apply_stats_eigen(self.compute_stats_eigen())), + no_op_wrapper) + else: + # asynchronous eigen-decomp updates using queue + update_factor_ops = tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), + lambda: tf.cond(tf.equal(queue.size(), tf.convert_to_tensor(0)), + tf.no_op, 
+ + lambda: tf.group( + *self.apply_stats_eigen(dequeue_op())), + ), + no_op_wrapper) + + update_ops.append(update_factor_ops) + + with tf.control_dependencies([update_factor_ops]): + def grad_op(): + return list(grad) + + def get_kfac_grad_op(): + return self.get_kfac_precond_updates(grad, varlist) + + u = tf.cond(tf.greater(self.factor_step, + tf.convert_to_tensor(0)), get_kfac_grad_op, grad_op) + + optim = tf.train.MomentumOptimizer( + self._lr * (1. - self._momentum), self._momentum) + + # optim = tf.train.AdamOptimizer(self._lr, epsilon=0.01) + + def optim_op(): + def update_optim_op(): + if self._full_stats_init: + return tf.cond(tf.greater(self.factor_step, tf.convert_to_tensor(0)), + lambda: optim.apply_gradients(list(zip(u, varlist))), tf.no_op) + else: + return optim.apply_gradients(list(zip(u, varlist))) + + if self._full_stats_init: + return tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), update_optim_op, + tf.no_op) + else: + return tf.cond(tf.greater_equal(self.sgd_step, self._cold_iter), update_optim_op, tf.no_op) + + update_ops.append(optim_op()) + + return tf.group(*update_ops), queue_runner + + def apply_gradients(self, grads): + """ + apply the gradient + + :param grads: ([TensorFlow Tensor]) the gradient + :return: (function, QueueRunner) train operation, queue operation runner + """ + cold_optim = tf.train.MomentumOptimizer(self._cold_lr, self._momentum) + + def _cold_sgd_start(): + sgd_grads, sgd_var = zip(*grads) + + if self.max_grad_norm is not None: + sgd_grads, _ = tf.clip_by_global_norm(sgd_grads, self.max_grad_norm) + + sgd_grads = list(zip(sgd_grads, sgd_var)) + + sgd_step_op = tf.assign_add(self.sgd_step, 1) + cold_optim_op = cold_optim.apply_gradients(sgd_grads) + if KFAC_DEBUG: + with tf.control_dependencies([sgd_step_op, cold_optim_op]): + sgd_step_op = tf.Print( + sgd_step_op, [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')]) + return tf.group(*[sgd_step_op, cold_optim_op]) + + # remove unused 
def minimize(self, loss, loss_sampled, var_list=None):
    """
    Build the training operation for the given loss

    :param loss: ([TensorFlow Tensor]) The loss
    :param loss_sampled: ([TensorFlow Tensor]) the loss function output
    :param var_list: ([TensorFlow Tensor]) The parameters
    :return: (function, q_runner) train operation, queue operation runner
    """
    grads_and_vars = self.compute_gradients(loss, var_list=var_list)
    # the statistics ops are created after the gradients and before the update ops
    self.compute_and_apply_stats(loss_sampled, var_list=var_list)
    train_op_and_runner = self.apply_gradients(grads_and_vars)
    return train_op_and_runner
def clipout_neg(vec, threshold=1e-6):
    """
    Zero out every entry of the input that is not strictly greater than the threshold

    :param vec: (TensorFlow Tensor)
    :param threshold: (float) the cutoff threshold
    :return: (TensorFlow Tensor) clipped input
    """
    # 1.0 where the entry is kept, 0.0 where it is clipped
    keep = tf.cast(tf.greater(vec, threshold), tf.float32)
    return tf.multiply(keep, vec)
def factor_reshape(eigen_vectors, eigen_values, grad, fac_idx=0, f_type='act'):
    """
    Reshape the eigen values so that they broadcast against the gradient along the factored axis

    :param eigen_vectors: ([TensorFlow Tensor]) eigen vectors
    :param eigen_values: ([TensorFlow Tensor]) eigen values
    :param grad: ([TensorFlow Tensor]) gradient
    :param fac_idx: (int) index that should be factored
    :param f_type: (str) function type to factor and reshape, 'act' counts the axis from the left,
        'grad' counts it from the right, any other value leaves the eigen values untouched
    :return: ([TensorFlow Tensor], [TensorFlow Tensor]) factored and reshaped eigen vectors
        and eigen values
    """
    grad_shape = grad.get_shape()
    if f_type == 'act':
        axis = fac_idx
    elif f_type == 'grad':
        axis = len(grad_shape) - fac_idx - 1
    else:
        return eigen_vectors, eigen_values

    assert eigen_values.get_shape()[0] == grad_shape[axis]
    # every axis has size 1 except the factored one, which receives all the eigen values
    target_shape = [1, ] * len(grad_shape)
    target_shape[axis] = -1
    return eigen_vectors, tf.reshape(eigen_values, target_shape)
class GaussianMlpPolicy(object):
    """
    Gaussian policy whose mean comes from an MLP with two hidden tanh layers of 64 units,
    and whose log standard deviation is a trainable variable that does not depend on the input.
    """

    def __init__(self, ob_dim, ac_dim):
        """
        Create a gaussian MLP policy

        :param ob_dim: (int) Observation dimension
        :param ac_dim: (int) action dimension
        """
        # Here we'll construct a bunch of expressions, which will be used in two places:
        # (1) When sampling actions
        # (2) When computing loss functions, for the policy update
        # Variables specific to (1) have the word "sampled" in them,
        # whereas variables specific to (2) have the word "old" in them
        # NOTE(review): the input width is 2 * ob_dim, presumably the caller stacks two observations - confirm
        ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob")  # batch of observations
        oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac")  # batch of previously taken actions
        # batch of previous action distributions, read by kl_div as the mean followed by the std
        oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim * 2], name="oldac_dist")
        adv_n = tf.placeholder(tf.float32, shape=[None], name="adv")  # advantage function estimate
        # filled by dense() with the weight decay coefficient of every created variable
        wd_dict = {}
        layer_1 = tf.nn.tanh(dense(ob_no, 64, "h1",
                                   weight_init=tf_util.normc_initializer(1.0),
                                   bias_init=0.0, weight_loss_dict=wd_dict))
        layer_2 = tf.nn.tanh(dense(layer_1, 64, "h2",
                                   weight_init=tf_util.normc_initializer(1.0),
                                   bias_init=0.0, weight_loss_dict=wd_dict))
        mean_na = dense(layer_2, ac_dim, "mean", weight_init=tf_util.normc_initializer(0.1),
                        bias_init=0.0, weight_loss_dict=wd_dict)  # Mean control output
        self.wd_dict = wd_dict
        # Log standard deviation of the outputs, one value per action dimension, initialised at zero
        self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer())
        logstd_1a = tf.expand_dims(logstd_1a, 0)
        std_1a = tf.exp(logstd_1a)
        # repeat the std for every row of the batch
        std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
        # action distribution: the first ac_dim columns are the mean, the last ac_dim columns are the std
        ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
        # This is the sampled action we'll perform: mean + std * standard normal noise
        sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:, ac_dim:])) * ac_dist[:, ac_dim:] + ac_dist[:, :ac_dim]
        logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log(
            2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum(
                tf.square(ac_dist[:, :ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:, ac_dim:])),
                axis=1)  # Logprob of sampled action
        logprob_n = - tf.reduce_sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log(
            2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum(
                tf.square(ac_dist[:, :ac_dim] - oldac_na) / (tf.square(ac_dist[:, ac_dim:])),
                axis=1)  # Logprob of the previous actions (oldac_na) under the CURRENT policy
        # Mean over the batch of the KL divergence between the distribution fed through oldac_dist
        # and the current distribution
        kl_loss = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
        surr = - tf.reduce_mean(adv_n * logprob_n)  # Loss function that we'll differentiate to get the policy gradient
        # NOTE(review): this uses logprob_n (previous actions) and not logprobsampled_n, confirm this is intended
        surr_sampled = - tf.reduce_mean(logprob_n)  # Sampled loss of the policy
        # Generate a new action, its distribution and its logprob
        self._act = tf_util.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n])
        # Compute the KL divergence between old policy and new policy
        self.compute_kl = tf_util.function([ob_no, oldac_dist], kl_loss)
        # Input and output variables needed for computing loss
        self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled)
        tf_util.initialize()  # Initialize uninitialized TF variables

    def act(self, obs):
        """
        get the action from an observation

        :param obs: ([float]) a single observation, a batch axis is added before it is fed to the network
        :return: ([float], [float], [float]) action, action distribution (the mean followed by the std), logp
        """
        action, ac_dist, logp = self._act(obs[None])
        # drop the batch axis that was added above
        return action[0], ac_dist[0], logp[0]
def train(env_id, num_timesteps, seed):
    """
    train an ACKTR model on a MuJoCo environment

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    env = make_mujoco_env(env_id, seed)

    try:
        with tf.Session(config=tf.ConfigProto()):
            ob_dim = env.observation_space.shape[0]
            ac_dim = env.action_space.shape[0]
            # the value function and the policy live in separate variable scopes
            with tf.variable_scope("vf"):
                value_fn = NeuralNetValueFunction(ob_dim, ac_dim)
            with tf.variable_scope("pi"):
                policy = GaussianMlpPolicy(ob_dim, ac_dim)

            learn(env, policy=policy, value_fn=value_fn, gamma=0.99, lam=0.97, timesteps_per_batch=2500,
                  desired_kl=0.002, num_timesteps=num_timesteps, animate=False)
    finally:
        # close the environment even when the training is interrupted or fails
        env.close()
def kl_div(action_dist1, action_dist2, action_size):
    """
    Kullback-Leibler divergence KL(dist1 || dist2) between two Gaussians with independent components

    Each distribution is a 2D tensor whose first ``action_size`` columns are the means
    and whose remaining columns are the standard deviations.

    :param action_dist1: ([TensorFlow Tensor]) action distribution 1
    :param action_dist2: ([TensorFlow Tensor]) action distribution 2
    :param action_size: (int) the shape of an action
    :return: ([TensorFlow Tensor]) Kullback-Leibler divergence, summed over the last axis
    """
    mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:]
    mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:]

    # (mean1 - mean2)^2 + std1^2 - std2^2 over 2 * std2^2, the 1e-8 avoids a division by zero
    numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2)
    denominator = 2 * tf.square(std2) + 1e-8
    # ``axis`` replaces the deprecated ``reduction_indices`` keyword
    return tf.reduce_sum(
        numerator / denominator + tf.log(std2) - tf.log(std1), axis=-1)
@classmethod
def _preproc(cls, path):
    """
    Build the value function input from a path

    The columns are, in order: the observations, the action distributions cast to float32,
    the step index divided by 10, and a column of ones.

    :param path: ({TensorFlow Tensor}) the history of the network
    :return: ([TensorFlow Tensor]) processed input
    """
    n_steps = path["reward"].shape[0]
    # column vector 0.0, 0.1, 0.2, ... with one entry per step of the path
    step_feature = np.arange(n_steps).reshape(-1, 1) / 10.0
    constant_feature = np.ones((n_steps, 1))
    action_dist = path["action_dist"].astype('float32')
    columns = [path['observation'], action_dist, step_feature, constant_feature]
    return np.concatenate(columns, axis=1)
def list_benchmarks():
    """
    Returns the names of all the benchmarks registered by this module, in registration order

    :return: ([str]) the benchmark names
    """
    return [bench['name'] for bench in _BENCHMARKS]
def find_task_in_benchmarks(env_id):
    """
    Look through the registered benchmarks, in order, for the first task using the given environment id

    :param env_id: (str) the environment id you want to find
    :return: (dict, dict) the benchmark and task dictionaries, (None, None) when nothing matches
    """
    matches = ((bench, task)
               for bench in _BENCHMARKS
               for task in bench["tasks"]
               if task["env_id"] == env_id)
    return next(matches, (None, None))
(2013), with pixel observations, 1 hour of walltime', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} + for _game in _ATARI7] }) register_benchmark({ 'name': 'AtariExploration10M', 'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} + for _game in _ATARIEXPL7] }) # MuJoCo -_mujocosmall = [ +_MUJOCO_SMALL = [ 'InvertedDoublePendulum-v2', 'InvertedPendulum-v2', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2', 'Reacher-v2', 'Swimmer-v2'] register_benchmark({ 'name': 'Mujoco1M', 'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps', - 'tasks': [{'env_id': _envid, 'trials': 3, 'num_timesteps': int(1e6)} for _envid in _mujocosmall] + 'tasks': [{'env_id': _envid, 'trials': 3, 'num_timesteps': int(1e6)} for _envid in _MUJOCO_SMALL] }) register_benchmark({ 'name': 'MujocoWalkers', @@ -121,7 +153,7 @@ def find_task_for_env_id_in_any_benchmark(env_id): # Other -_atari50 = [ # actually 47 +_ATARI50 = [ # actually 47 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling', 'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber', @@ -137,7 +169,8 @@ def find_task_for_env_id_in_any_benchmark(env_id): register_benchmark({ 'name': 'Atari50_10M', 'description': '47 Atari games from Mnih et al. 
(2013), with pixel observations, 10M timesteps', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} + for _game in _ATARI50] }) # HER DDPG @@ -147,4 +180,3 @@ def find_task_for_env_id_in_any_benchmark(env_id): 'description': 'Smoke-test only benchmark of HER', 'tasks': [{'trials': 1, 'env_id': 'FetchReach-v1'}] }) - diff --git a/stable_baselines/bench/monitor.py b/stable_baselines/bench/monitor.py new file mode 100644 index 0000000000..d50c1ddd37 --- /dev/null +++ b/stable_baselines/bench/monitor.py @@ -0,0 +1,230 @@ +__all__ = ['Monitor', 'get_monitor_files', 'load_results'] + +import os +import time +import csv +import json +import uuid +from glob import glob + +import gym +from gym.core import Wrapper +import pandas + + +class Monitor(Wrapper): + EXT = "monitor.csv" + file_handler = None + + def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()): + """ + A monitor wrapper for Gym environments, it is used to know the episode reward, length, time and other data. + + :param env: (Gym environment) The environment + :param filename: (str) the location to save a log file, can be None for no log + :param allow_early_resets: (bool) allows the reset of the environment before it is done + :param reset_keywords: (tuple) extra keywords for the reset call, if extra parameters are needed at reset + :param info_keywords: (tuple) extra information to log, from the information return of environment.step + """ + Wrapper.__init__(self, env=env) + self.t_start = time.time() + if filename is None: + self.file_handler = None + self.logger = None + else: + if not filename.endswith(Monitor.EXT): + if os.path.isdir(filename): + filename = os.path.join(filename, Monitor.EXT) + else: + filename = filename + "." 
+ Monitor.EXT + self.file_handler = open(filename, "wt") + self.file_handler.write('#%s\n' % json.dumps({"t_start": self.t_start, 'env_id': env.spec and env.spec.id})) + self.logger = csv.DictWriter(self.file_handler, + fieldnames=('r', 'l', 't') + reset_keywords + info_keywords) + self.logger.writeheader() + self.file_handler.flush() + + self.reset_keywords = reset_keywords + self.info_keywords = info_keywords + self.allow_early_resets = allow_early_resets + self.rewards = None + self.needs_reset = True + self.episode_rewards = [] + self.episode_lengths = [] + self.episode_times = [] + self.total_steps = 0 + self.current_reset_info = {} # extra info about the current episode, that was passed in during reset() + + def reset(self, **kwargs): + """ + Calls the Gym environment reset. Can only be called if the environment is over, or if allow_early_resets is True + + :param kwargs: Extra keywords saved for the next episode. only if defined by reset_keywords + :return: ([int] or [float]) the first observation of the environment + """ + if not self.allow_early_resets and not self.needs_reset: + raise RuntimeError("Tried to reset an environment before done. 
def close(self):
    """
    Closes the monitor log file (if there is one) and the wrapped environment
    """
    try:
        if self.file_handler is not None:
            self.file_handler.close()
    finally:
        # the wrapped environment has to be closed too, overriding close() must not hide it
        self.env.close()
def test_monitor():
    """
    test the monitor wrapper

    Runs CartPole-v1 wrapped in a Monitor for 1000 steps (resetting whenever an episode ends),
    then checks the '#'-prefixed json header line and the csv columns of the monitor file.
    The monitor file is always removed afterwards, even when an assertion fails.
    """
    import tempfile  # local import: only this self-test needs it

    env = gym.make("CartPole-v1")
    env.seed(0)
    # use the platform temporary directory instead of a hard-coded /tmp
    mon_file = os.path.join(tempfile.gettempdir(), "stable_baselines-test-%s.monitor.csv" % uuid.uuid4())
    menv = Monitor(env, mon_file)
    try:
        menv.reset()
        for _ in range(1000):
            _, _, done, _ = menv.step(0)
            if done:
                menv.reset()

        with open(mon_file, 'rt') as file_handler:
            firstline = file_handler.readline()
            assert firstline.startswith('#')
            metadata = json.loads(firstline[1:])
            assert metadata['env_id'] == "CartPole-v1"
            # the Monitor header only contains 't_start' and 'env_id' (no 'gym_version')
            assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata"

            last_logline = pandas.read_csv(file_handler, index_col=None)
            assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
    finally:
        # close the monitor's own file handle before deleting the log file
        menv.close()
        if os.path.exists(mon_file):
            os.remove(mon_file)
NoopResetEnv(gym.Wrapper): def __init__(self, env, noop_max=30): - """Sample initial states by taking random number of no-ops on reset. + """ + Sample initial states by taking random number of no-ops on reset. No-op is assumed to be action 0. + + :param env: (Gym Environment) the environment to wrap + :param noop_max: (int) the maximum value of no-ops to run """ gym.Wrapper.__init__(self, env) self.noop_max = noop_max @@ -17,12 +23,11 @@ def __init__(self, env, noop_max=30): assert env.unwrapped.get_action_meanings()[0] == 'NOOP' def reset(self, **kwargs): - """ Do no-op action for a number of steps in [1, noop_max].""" self.env.reset(**kwargs) if self.override_num_noops is not None: noops = self.override_num_noops else: - noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 + noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) assert noops > 0 obs = None for _ in range(noops): @@ -31,12 +36,17 @@ def reset(self, **kwargs): obs = self.env.reset(**kwargs) return obs - def step(self, ac): - return self.env.step(ac) + def step(self, action): + return self.env.step(action) + class FireResetEnv(gym.Wrapper): def __init__(self, env): - """Take action on reset for environments that are fixed until firing.""" + """ + Take action on reset for environments that are fixed until firing. + + :param env: (Gym Environment) the environment to wrap + """ gym.Wrapper.__init__(self, env) assert env.unwrapped.get_action_meanings()[1] == 'FIRE' assert len(env.unwrapped.get_action_meanings()) >= 3 @@ -51,17 +61,21 @@ def reset(self, **kwargs): self.env.reset(**kwargs) return obs - def step(self, ac): - return self.env.step(ac) + def step(self, action): + return self.env.step(action) + class EpisodicLifeEnv(gym.Wrapper): def __init__(self, env): - """Make end-of-life == end-of-episode, but only reset on true game over. + """ + Make end-of-life == end-of-episode, but only reset on true game over. Done by DeepMind for the DQN and co. 
since it helps value estimation. + + :param env: (Gym Environment) the environment to wrap """ gym.Wrapper.__init__(self, env) self.lives = 0 - self.was_real_done = True + self.was_real_done = True def step(self, action): obs, reward, done, info = self.env.step(action) @@ -69,7 +83,7 @@ def step(self, action): # check current lives, make loss of life terminal, # then update lives to handle bonus lives lives = self.env.unwrapped.ale.lives() - if lives < self.lives and lives > 0: + if 0 < lives < self.lives: # for Qbert sometimes we stay in lives == 0 condtion for a few frames # so its important to keep lives > 0, so that we only reset once # the environment advertises done. @@ -78,9 +92,13 @@ def step(self, action): return obs, reward, done, info def reset(self, **kwargs): - """Reset only when lives are exhausted. + """ + Calls the Gym environment reset, only when lives are exhausted. This way all states are still reachable even though lives are episodic, and the learner need not know about any of this behind-the-scenes. 
+ + :param kwargs: Extra keywords passed to env.reset() call + :return: ([int] or [float]) the first observation of the environment """ if self.was_real_done: obs = self.env.reset(**kwargs) @@ -90,22 +108,36 @@ def reset(self, **kwargs): self.lives = self.env.unwrapped.ale.lives() return obs + class MaxAndSkipEnv(gym.Wrapper): def __init__(self, env, skip=4): - """Return only every `skip`-th frame""" + """ + Return only every `skip`-th frame (frameskipping) + + :param env: (Gym Environment) the environment + :param skip: (int) number of `skip`-th frame + """ gym.Wrapper.__init__(self, env) # most recent raw observations (for max pooling across time steps) - self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) - self._skip = skip + self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=env.observation_space.dtype) + self._skip = skip def step(self, action): - """Repeat action, sum reward, and max over last observations.""" + """ + Step the environment with the given action + Repeat action, sum reward, and max over last observations. + + :param action: ([int] or [float]) the action + :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information + """ total_reward = 0.0 done = None for i in range(self._skip): obs, reward, done, info = self.env.step(action) - if i == self._skip - 2: self._obs_buffer[0] = obs - if i == self._skip - 1: self._obs_buffer[1] = obs + if i == self._skip - 2: + self._obs_buffer[0] = obs + if i == self._skip - 1: + self._obs_buffer[1] = obs total_reward += reward if done: break @@ -118,77 +150,108 @@ def step(self, action): def reset(self, **kwargs): return self.env.reset(**kwargs) + class ClipRewardEnv(gym.RewardWrapper): def __init__(self, env): + """ + clips the reward to {+1, 0, -1} by its sign. 
+ + :param env: (Gym Environment) the environment + """ gym.RewardWrapper.__init__(self, env) def reward(self, reward): - """Bin reward to {+1, 0, -1} by its sign.""" + """ + Bin reward to {+1, 0, -1} by its sign. + + :param reward: (float) + """ return np.sign(reward) + class WarpFrame(gym.ObservationWrapper): def __init__(self, env): - """Warp frames to 84x84 as done in the Nature paper and later work.""" + """ + Warp frames to 84x84 as done in the Nature paper and later work. + + :param env: (Gym Environment) the environment + """ gym.ObservationWrapper.__init__(self, env) self.width = 84 self.height = 84 - self.observation_space = spaces.Box(low=0, high=255, - shape=(self.height, self.width, 1), dtype=np.uint8) + self.observation_space = spaces.Box(low=0, high=255, shape=(self.height, self.width, 1), + dtype=env.observation_space.dtype) def observation(self, frame): + """ + returns the current observation from a frame + + :param frame: ([int] or [float]) environment frame + :return: ([int] or [float]) the observation + """ frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) return frame[:, :, None] + class FrameStack(gym.Wrapper): - def __init__(self, env, k): - """Stack k last frames. + def __init__(self, env, n_frames): + """Stack n_frames last frames. Returns lazy array, which is much more memory efficient. 
See Also -------- - baselines.common.atari_wrappers.LazyFrames + stable_baselines.common.atari_wrappers.LazyFrames + + :param env: (Gym Environment) the environment + :param n_frames: (int) the number of frames to stack """ gym.Wrapper.__init__(self, env) - self.k = k - self.frames = deque([], maxlen=k) + self.n_frames = n_frames + self.frames = deque([], maxlen=n_frames) shp = env.observation_space.shape - self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) + self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * n_frames), + dtype=env.observation_space.dtype) def reset(self): - ob = self.env.reset() - for _ in range(self.k): - self.frames.append(ob) + obs = self.env.reset() + for _ in range(self.n_frames): + self.frames.append(obs) return self._get_ob() def step(self, action): - ob, reward, done, info = self.env.step(action) - self.frames.append(ob) + obs, reward, done, info = self.env.step(action) + self.frames.append(obs) return self._get_ob(), reward, done, info def _get_ob(self): - assert len(self.frames) == self.k + assert len(self.frames) == self.n_frames return LazyFrames(list(self.frames)) + class ScaledFloatFrame(gym.ObservationWrapper): def __init__(self, env): gym.ObservationWrapper.__init__(self, env) + self.observation_space = spaces.Box(low=0, high=1.0, shape=env.observation_space.shape, dtype=np.float32) def observation(self, observation): # careful! This undoes the memory optimization, use # with smaller replay buffers only. return np.array(observation).astype(np.float32) / 255.0 + class LazyFrames(object): def __init__(self, frames): - """This object ensures that common frames between the observations are only stored once. + """ + This object ensures that common frames between the observations are only stored once. It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay buffers. 
This object should only be converted to numpy array before being passed to the model. - You'd not believe how complex the previous solution was.""" + :param frames: ([int] or [float]) environment frames + """ self._frames = frames self._out = None @@ -210,15 +273,31 @@ def __len__(self): def __getitem__(self, i): return self._force()[i] + def make_atari(env_id): + """ + Create a wrapped atari envrionment + + :param env_id: (str) the environment ID + :return: (Gym Environment) the wrapped atari environment + """ env = gym.make(env_id) assert 'NoFrameskip' in env.spec.id env = NoopResetEnv(env, noop_max=30) env = MaxAndSkipEnv(env, skip=4) return env + def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): - """Configure environment for DeepMind-style Atari. + """ + Configure environment for DeepMind-style Atari. + + :param env: (Gym Environment) the atari environment + :param episode_life: (bool) wrap the episode life wrapper + :param clip_rewards: (bool) wrap the reward clipping wrapper + :param frame_stack: (bool) wrap the frame stacking wrapper + :param scale: (bool) wrap the scaling observation wrapper + :return: (Gym Environment) the wrapped atari environment """ if episode_life: env = EpisodicLifeEnv(env) @@ -232,4 +311,3 @@ def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, if frame_stack: env = FrameStack(env, 4) return env - diff --git a/stable_baselines/common/base_class.py b/stable_baselines/common/base_class.py new file mode 100644 index 0000000000..74572161ed --- /dev/null +++ b/stable_baselines/common/base_class.py @@ -0,0 +1,277 @@ +from abc import ABC, abstractmethod +import os + +import cloudpickle +import numpy as np +import gym + +from stable_baselines.common import set_global_seeds +from stable_baselines.common.policies import LstmPolicy +from stable_baselines.common.vec_env import VecEnvWrapper, VecEnv, DummyVecEnv +from stable_baselines import logger + + +class 
BaseRLModel(ABC): + """ + The base RL model + + :param policy: (Object) Policy object + :param env: (Gym environment) The environment to learn from + (if registered in Gym, can be str. Can be None for loading trained models) + :param requires_vec_env: (bool) + :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + """ + + def __init__(self, policy, env, requires_vec_env, verbose=0): + super(BaseRLModel, self).__init__() + + self.policy = policy + self.env = env + self.verbose = verbose + self._requires_vec_env = requires_vec_env + self.observation_space = None + self.action_space = None + self.n_envs = None + self._vectorize_action = False + + if env is not None: + if isinstance(env, str): + if self.verbose >= 1: + print("Creating environment from the given name, wrapped in a DummyVecEnv.") + self.env = env = DummyVecEnv([lambda: gym.make(env)]) + + self.observation_space = env.observation_space + self.action_space = env.action_space + if requires_vec_env: + if isinstance(env, VecEnv): + self.n_envs = env.num_envs + else: + raise ValueError("Error: the model requires a vectorized environment, please use a VecEnv wrapper.") + else: + if isinstance(env, VecEnv): + if env.num_envs == 1: + self.env = _UnvecWrapper(env) + self.n_envs = 1 + self._vectorize_action = True + else: + raise ValueError("Error: the model requires a non vectorized environment or a single vectorized" + " environment.") + + def get_env(self): + """ + returns the current environment (can be None if not defined) + + :return: (Gym Environment) The current environment + """ + return self.env + + def set_env(self, env): + """ + Checks the validity of the environment, and if it is coherent, set it as the current environment. 
+ + :param env: (Gym Environment) The environment for learning a policy + """ + if env is None and self.env is None: + if self.verbose >= 1: + print("Loading a model without an environment, " + "this model cannot be trained until it has a valid environment.") + return + elif env is None: + raise ValueError("Error: trying to replace the current environment with None") + + # sanity checking the environment + assert self.observation_space == env.observation_space, \ + "Error: the environment passed must have at least the same observation space as the model was trained on." + assert self.action_space == env.action_space, \ + "Error: the environment passed must have at least the same action space as the model was trained on." + if self._requires_vec_env: + assert isinstance(env, VecEnv), \ + "Error: the environment passed is not a vectorized environment, however {} requires it".format( + self.__class__.__name__) + assert not issubclass(self.policy, LstmPolicy) or self.n_envs == env.num_envs, \ + "Error: the environment passed must have the same number of environments as the model was trained on." \ + "This is due to the Lstm policy not being capable of changing the number of environments." + self.n_envs = env.num_envs + + # for models that dont want vectorized environment, check if they make sense and adapt them. 
+ # Otherwise tell the user about this issue- + if not self._requires_vec_env and isinstance(env, VecEnv): + if env.num_envs == 1: + env = _UnvecWrapper(env) + self.n_envs = 1 + self._vectorize_action = True + else: + raise ValueError("Error: the model requires a non vectorized environment or a single vectorized " + "environment.") + else: + self._vectorize_action = False + + self.env = env + + @abstractmethod + def setup_model(self): + """ + Create all the functions and tensorflow graphs necessary to train the model + """ + pass + + def _setup_learn(self, seed): + """ + check the environment, set the seed, and set the logger + + :param seed: (int) the seed value + """ + if self.env is None: + raise ValueError("Error: cannot train the model without a valid environment, please set an environment with" + "set_env(self, env) method.") + if seed is not None: + set_global_seeds(seed) + + @abstractmethod + def learn(self, total_timesteps, callback=None, seed=None, log_interval=100): + """ + Return a trained model. + + :param total_timesteps: (int) The total number of samples to train on + :param seed: (int) The initial seed for training, if None: keep current seed + :param callback: (function (dict, dict)) function called at every steps with state of the algorithm. + It takes the local and global variables. + :param log_interval: (int) The number of timesteps before logging. 
+ :return: (BaseRLModel) the trained model + """ + pass + + @abstractmethod + def predict(self, observation, state=None, mask=None): + """ + Get the model's action from an observation + + :param observation: (numpy Number) the input observation + :param state: (numpy Number) The last states (can be None, used in reccurent policies) + :param mask: (numpy Number) The last masks (can be None, used in reccurent policies) + :return: (numpy Number, numpy Number) the model's action and the next state (used in reccurent policies) + """ + pass + + @abstractmethod + def action_probability(self, observation, state=None, mask=None): + """ + Get the model's action probability distribution from an observation + + :param observation: (numpy Number) the input observation + :param state: (numpy Number) The last states (can be None, used in reccurent policies) + :param mask: (numpy Number) The last masks (can be None, used in reccurent policies) + :return: (numpy Number) the model's action probability distribution + """ + pass + + @abstractmethod + def save(self, save_path): + """ + Save the current parameters to file + + :param save_path: (str) the save location + """ + # self._save_to_file(save_path, data={}, params=None) + raise NotImplementedError() + + @classmethod + @abstractmethod + def load(cls, load_path, env=None, **kwargs): + """ + Load the model from file + + :param load_path: (str) the saved parameter location + :param env: (Gym Envrionment) the new environment to run the loaded model on + (can be None if you only need prediction from a trained model) + :param kwargs: extra arguments to change the model when loading + """ + # data, param = cls._load_from_file(load_path) + raise NotImplementedError() + + @staticmethod + def _save_to_file(save_path, data=None, params=None): + _, ext = os.path.splitext(save_path) + if ext == "": + save_path += ".pkl" + + with open(save_path, "wb") as file: + cloudpickle.dump((data, params), file) + + @staticmethod + def 
_load_from_file(load_path): + if not os.path.exists(load_path): + if os.path.exists(load_path + ".pkl"): + load_path += ".pkl" + else: + raise ValueError("Error: the file {} could not be found".format(load_path)) + + with open(load_path, "rb") as file: + data, params = cloudpickle.load(file) + + return data, params + + @staticmethod + def _softmax(x_input): + """ + An implementation of softmax. + + :param x_input: (numpy float) input vector + :return: (numpy float) output vector + """ + x_exp = np.exp(x_input.T - np.max(x_input.T, axis=0)) + return (x_exp / x_exp.sum(axis=0)).T + + +class _UnvecWrapper(VecEnvWrapper): + def __init__(self, venv): + """ + Unvectorize a vectorized environment, for vectorized environment that only have one environment + + :param venv: (VecEnv) the vectorized environment to wrap + """ + super().__init__(venv) + assert venv.num_envs == 1, "Error: cannot unwrap a environment wrapper that has more than one environment." + + def reset(self): + return self.venv.reset()[0] + + def step_async(self, actions): + self.venv.step_async([actions]) + + def step_wait(self): + actions, values, states, information = self.venv.step_wait() + return actions[0], values[0], states[0], information[0] + + def render(self, mode='human'): + return self.venv.render(mode)[0] + + +class SetVerbosity: + def __init__(self, verbose=0): + """ + define a region of code for certain level of verbosity + + :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + """ + self.verbose = verbose + + def __enter__(self): + self.tf_level = os.environ.get('TF_CPP_MIN_LOG_LEVEL', '0') + self.log_level = logger.get_level() + self.gym_level = gym.logger.MIN_LEVEL + + if self.verbose <= 1: + os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + + if self.verbose <= 0: + logger.set_level(logger.DISABLED) + gym.logger.set_level(gym.logger.DISABLED) + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.verbose <= 1: + os.environ['TF_CPP_MIN_LOG_LEVEL'] 
def conjugate_gradient(f_ax, b_vec, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
    """
    conjugate gradient calculation (Ax = b), bases on
    https://epubs.siam.org/doi/book/10.1137/1.9781611971446 Demmel p 312

    :param f_ax: (function) The function describing the Matrix A dot the vector x
                 (x being the input parameter of the function)
    :param b_vec: (numpy float) vector b, where Ax = b (integer input is converted to float64)
    :param cg_iters: (int) the maximum number of iterations for converging
    :param callback: (function) callback the values of x while converging
                     (called at the start of every iteration and once more before returning)
    :param verbose: (bool) print extra information
    :param residual_tol: (float) the break point if the squared norm of the residual is below this value
    :return: (numpy float) vector x, where Ax = b
    """
    b_vec = np.asanyarray(b_vec)
    if not np.issubdtype(b_vec.dtype, np.inexact):
        # the in-place updates below add float values, which numpy refuses on integer arrays
        b_vec = b_vec.astype(np.float64)

    first_basis_vect = b_vec.copy()  # the first basis vector
    residual = b_vec.copy()  # the residual
    x_var = np.zeros_like(b_vec)  # vector x, where Ax = b
    residual_dot_residual = residual.dot(residual)  # squared L2 norm of the residual

    fmt_str = "%10i %10.3g %10.3g"
    title_str = "%10s %10s %10s"
    if verbose:
        # the "residual norm" column shows the squared norm (residual . residual)
        print(title_str % ("iter", "residual norm", "soln norm"))

    # b == 0 means x == 0 is already the exact solution; iterating would compute 0 / 0
    max_iters = cg_iters if residual_dot_residual != 0 else 0
    n_iters_done = 0  # kept separately so the final report also works when no iteration ran
    for i in range(max_iters):
        if callback is not None:
            callback(x_var)
        if verbose:
            print(fmt_str % (i, residual_dot_residual, np.linalg.norm(x_var)))
        z_var = f_ax(first_basis_vect)
        v_var = residual_dot_residual / first_basis_vect.dot(z_var)  # step length
        x_var += v_var * first_basis_vect
        residual -= v_var * z_var
        new_residual_dot_residual = residual.dot(residual)
        mu_val = new_residual_dot_residual / residual_dot_residual
        first_basis_vect = residual + mu_val * first_basis_vect  # next search direction

        residual_dot_residual = new_residual_dot_residual
        n_iters_done = i + 1
        if residual_dot_residual < residual_tol:
            break

    if callback is not None:
        callback(x_var)
    if verbose:
        print(fmt_str % (n_iters_done, residual_dot_residual, np.linalg.norm(x_var)))
    return x_var
+ + :param env_id: (str) the environment ID + :param num_env: (int) the number of environment you wish to have in subprocesses + :param seed: (int) the inital seed for RNG + :param wrapper_kwargs: (dict) the parameters for wrap_deepmind function + :param start_index: (int) start rank index + :param allow_early_resets: (bool) allows early reset of the environment + :return: (Gym Environment) The atari environment """ - if wrapper_kwargs is None: wrapper_kwargs = {} - def make_env(rank): # pylint: disable=C0111 + if wrapper_kwargs is None: + wrapper_kwargs = {} + + def make_env(rank): def _thunk(): env = make_atari(env_id) env.seed(seed + rank) - env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) + env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), + allow_early_resets=allow_early_resets) return wrap_deepmind(env, **wrapper_kwargs) return _thunk set_global_seeds(seed) return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)]) -def make_mujoco_env(env_id, seed): + +def make_mujoco_env(env_id, seed, allow_early_resets=True): """ Create a wrapped, monitored gym.Env for MuJoCo. + + :param env_id: (str) the environment ID + :param seed: (int) the inital seed for RNG + :param allow_early_resets: (bool) allows early reset of the environment + :return: (Gym Environment) The mujoco environment """ rank = MPI.COMM_WORLD.Get_rank() set_global_seeds(seed + 10000 * rank) env = gym.make(env_id) - env = Monitor(env, os.path.join(logger.get_dir(), str(rank))) + env = Monitor(env, os.path.join(logger.get_dir(), str(rank)), allow_early_resets=allow_early_resets) env.seed(seed) return env -def make_robotics_env(env_id, seed, rank=0): + +def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True): """ Create a wrapped, monitored gym.Env for MuJoCo. 
+ + :param env_id: (str) the environment ID + :param seed: (int) the inital seed for RNG + :param rank: (int) the rank of the environment (for logging) + :param allow_early_resets: (bool) allows early reset of the environment + :return: (Gym Environment) The robotic environment """ set_global_seeds(seed) env = gym.make(env_id) env = FlattenDictWrapper(env, ['observation', 'desired_goal']) env = Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), - info_keywords=('is_success',)) + info_keywords=('is_success',), allow_early_resets=allow_early_resets) env.seed(seed) return env + def arg_parser(): """ Create an empty argparse.ArgumentParser. + + :return: (ArgumentParser) """ import argparse return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + def atari_arg_parser(): """ Create an argparse.ArgumentParser for run_atari.py. + + :return: (ArgumentParser) parser {'--env': 'BreakoutNoFrameskip-v4', '--seed': 0, '--num-timesteps': int(1e7)} """ parser = arg_parser() parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) - parser.add_argument('--num-timesteps', type=int, default=int(10e6)) + parser.add_argument('--num-timesteps', type=int, default=int(1e7)) return parser + def mujoco_arg_parser(): """ Create an argparse.ArgumentParser for run_mujoco.py. + + :return: (ArgumentParser) parser {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False} """ parser = arg_parser() parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') @@ -79,9 +115,12 @@ def mujoco_arg_parser(): parser.add_argument('--play', default=False, action='store_true') return parser + def robotics_arg_parser(): """ Create an argparse.ArgumentParser for run_mujoco.py. 
def fmt_item(item, min_width):
    """
    fits items to a given string length

    Floats (python floats and any numpy floating scalar) are printed with 5 decimals, or in
    scientific notation when their non-zero magnitude is below 1e-4 or above 1e+4.
    Everything else is converted with str(). The result is left-padded with spaces.

    :param item: (Any) the item you wish to get the string representation
                 (a numpy array is only accepted if it is 0-dimensional)
    :param min_width: (int) the minimum width of the string
    :return: (str) the string representation of 'item' of length >= 'min_width'
    """
    if isinstance(item, np.ndarray):
        assert item.ndim == 0
        item = item.item()
    # np.floating covers float16/float32/float64/longdouble, not only float32 and float64
    if isinstance(item, (float, np.floating)):
        value = abs(item)
        if (value < 1e-4 or value > 1e+4) and value > 0:
            rep = "%7.2e" % item
        else:
            rep = "%7.5f" % item
    else:
        rep = str(item)
    # never truncates: a representation longer than min_width is returned unchanged
    return rep.rjust(min_width)
Colorize, bold and/or highlight a string for terminal print + + :param string: (str) input string + :param color: (str) the color, the lookup table is the dict at console_util.color2num + :param bold: (bool) if the string should be bold or not + :param highlight: (bool) if the string should be highlighted or not + :return: (str) the stylized output string + """ + attr = [] + num = COLOR_TO_NUM[color] + if highlight: + num += 10 + attr.append(str(num)) + if bold: + attr.append('1') + return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) diff --git a/stable_baselines/common/dataset.py b/stable_baselines/common/dataset.py new file mode 100644 index 0000000000..1f951b3b02 --- /dev/null +++ b/stable_baselines/common/dataset.py @@ -0,0 +1,103 @@ +import numpy as np + + +class Dataset(object): + def __init__(self, data_map, deterministic=False, shuffle=True): + """ + Data loader that handles batches and shuffling. + WARNING: this will alter the given data_map ordering, as dicts are mutable + + :param data_map: (dict) the input data, where every column is a key + :param deterministic: (bool) disables the shuffle function + :param shuffle: (bool) enable auto shuffle + """ + self.data_map = data_map + self.deterministic = deterministic + self.enable_shuffle = shuffle + self.n_samples = next(iter(data_map.values())).shape[0] + self._next_id = 0 + self.shuffle() + + def shuffle(self): + """ + shuffles the data_map + """ + if self.deterministic: + return + perm = np.arange(self.n_samples) + np.random.shuffle(perm) + + for key in self.data_map: + self.data_map[key] = self.data_map[key][perm] + + def next_batch(self, batch_size): + """ + returns a batch of data of a given size + + :param batch_size: (int) the size of the batch + :return: (dict) a batch of the input data of size 'batch_size' + """ + if self._next_id >= self.n_samples: + self._next_id = 0 + if self.enable_shuffle: + self.shuffle() + + cur_id = self._next_id + cur_batch_size = min(batch_size, self.n_samples - 
self._next_id) + self._next_id += cur_batch_size + + data_map = dict() + for key in self.data_map: + data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] + return data_map + + def iterate_once(self, batch_size): + """ + generator that iterates over the dataset + + :param batch_size: (int) the size of the batch + :return: (dict) a batch of the input data of size 'batch_size' + """ + if self.enable_shuffle: + self.shuffle() + + while self._next_id <= self.n_samples - batch_size: + yield self.next_batch(batch_size) + self._next_id = 0 + + def subset(self, num_elements, deterministic=True): + """ + Return a subset of the current dataset + + :param num_elements: (int) the number of element you wish to have in the subset + :param deterministic: (bool) disables the shuffle function + :return: (Dataset) a new subset of the current Dataset object + """ + data_map = dict() + for key in self.data_map: + data_map[key] = self.data_map[key][:num_elements] + return Dataset(data_map, deterministic) + + +def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): + """ + Iterates over arrays in batches, must provide either num_batches or batch_size, the other must be None. 
+ + :param arrays: (tuple) a tuple of arrays + :param num_batches: (int) the number of batches, must be None is batch_size is defined + :param batch_size: (int) the size of the batch, must be None is num_batches is defined + :param shuffle: (bool) enable auto shuffle + :param include_final_partial_batch: (bool) add the last batch if not the same size as the batch_size + :return: (tuples) a tuple of a batch of the arrays + """ + assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' + arrays = tuple(map(np.asarray, arrays)) + n_samples = arrays[0].shape[0] + assert all(a.shape[0] == n_samples for a in arrays[1:]) + inds = np.arange(n_samples) + if shuffle: + np.random.shuffle(inds) + sections = np.arange(0, n_samples, batch_size)[1:] if num_batches is None else num_batches + for batch_inds in np.array_split(inds, sections): + if include_final_partial_batch or len(batch_inds) == batch_size: + yield tuple(a[batch_inds] for a in arrays) diff --git a/stable_baselines/common/distributions.py b/stable_baselines/common/distributions.py new file mode 100644 index 0000000000..61b124f8e6 --- /dev/null +++ b/stable_baselines/common/distributions.py @@ -0,0 +1,507 @@ +import tensorflow as tf +from tensorflow.python.ops import math_ops +import numpy as np +from gym import spaces + +from stable_baselines.a2c.utils import linear + + +class ProbabilityDistribution(object): + """ + A particular probability distribution + """ + + def flatparam(self): + """ + Return the direct probabilities + + :return: ([float]) the probabilites + """ + raise NotImplementedError + + def mode(self): + """ + Returns the index of the highest probability + + :return: (int) the max index of the probabilites + """ + raise NotImplementedError + + def neglogp(self, x): + """ + returns the of the negative log likelihood + + :param x: (str) the labels of each index + :return: ([float]) The negative log likelihood of the distribution + """ + # Usually it's easier 
to define the negative logprob + raise NotImplementedError + + def kl(self, other): + """ + Calculates the Kullback-Leiber divergence from the given probabilty distribution + + :param other: ([float]) the distibution to compare with + :return: (float) the KL divergence of the two distributions + """ + raise NotImplementedError + + def entropy(self): + """ + Returns shannon's entropy of the probability + + :return: (float) the entropy + """ + raise NotImplementedError + + def sample(self): + """ + Sample an index from the probabilty distribution + + :return: (int) the sampled index + """ + raise NotImplementedError + + def logp(self, x): + """ + returns the of the log likelihood + + :param x: (str) the labels of each index + :return: ([float]) The log likelihood of the distribution + """ + return - self.neglogp(x) + + +class ProbabilityDistributionType(object): + """ + Parametrized family of probability distributions + """ + + def probability_distribution_class(self): + """ + returns the ProbabilityDistribution class of this type + + :return: (Type ProbabilityDistribution) the probability distribution class associated + """ + raise NotImplementedError + + def proba_distribution_from_flat(self, flat): + """ + returns the probability distribution from flat probabilities + + :param flat: ([float]) the flat probabilities + :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated + """ + return self.probability_distribution_class()(flat) + + def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0): + """ + returns the probability distribution from latent values + + :param pi_latent_vector: ([float]) the latent pi values + :param vf_latent_vector: ([float]) the latent vf values + :param init_scale: (float) the inital scale of the distribution + :param init_bias: (float) the inital bias of the distribution + :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution 
associated + """ + raise NotImplementedError + + def param_shape(self): + """ + returns the shape of the input parameters + + :return: ([int]) the shape + """ + raise NotImplementedError + + def sample_shape(self): + """ + returns the shape of the sampling + + :return: ([int]) the shape + """ + raise NotImplementedError + + def sample_dtype(self): + """ + returns the type of the sampling + + :return: (type) the type + """ + raise NotImplementedError + + def param_placeholder(self, prepend_shape, name=None): + """ + returns the TensorFlow placeholder for the input parameters + + :param prepend_shape: ([int]) the prepend shape + :param name: (str) the placeholder name + :return: (TensorFlow Tensor) the placeholder + """ + return tf.placeholder(dtype=tf.float32, shape=prepend_shape + self.param_shape(), name=name) + + def sample_placeholder(self, prepend_shape, name=None): + """ + returns the TensorFlow placeholder for the sampling + + :param prepend_shape: ([int]) the prepend shape + :param name: (str) the placeholder name + :return: (TensorFlow Tensor) the placeholder + """ + return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape + self.sample_shape(), name=name) + + +class CategoricalProbabilityDistributionType(ProbabilityDistributionType): + def __init__(self, n_cat): + """ + The probability distribution type for categorical input + + :param n_cat: (int) the number of categories + """ + self.n_cat = n_cat + + def probability_distribution_class(self): + return CategoricalProbabilityDistribution + + def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0): + pdparam = linear(pi_latent_vector, 'pi', self.n_cat, init_scale=init_scale, init_bias=init_bias) + q_values = linear(vf_latent_vector, 'q', self.n_cat, init_scale=init_scale, init_bias=init_bias) + return self.proba_distribution_from_flat(pdparam), pdparam, q_values + + def param_shape(self): + return [self.n_cat] + + def sample_shape(self): + 
return [] + + def sample_dtype(self): + return tf.int32 + + +class MultiCategoricalProbabilityDistributionType(ProbabilityDistributionType): + def __init__(self, n_vec): + """ + The probability distribution type for multiple categorical input + + :param n_vec: ([int]) the vectors + """ + self.n_vec = n_vec + + def probability_distribution_class(self): + return MultiCategoricalProbabilityDistribution + + def proba_distribution_from_flat(self, flat): + return MultiCategoricalProbabilityDistribution(self.n_vec, flat) + + def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0): + pdparam = linear(pi_latent_vector, 'pi', sum(self.n_vec), init_scale=init_scale, init_bias=init_bias) + q_values = linear(vf_latent_vector, 'q', sum(self.n_vec), init_scale=init_scale, init_bias=init_bias) + return self.proba_distribution_from_flat(pdparam), pdparam, q_values + + def param_shape(self): + return [sum(self.n_vec)] + + def sample_shape(self): + return [len(self.n_vec)] + + def sample_dtype(self): + return tf.int32 + + +class DiagGaussianProbabilityDistributionType(ProbabilityDistributionType): + def __init__(self, size, bounds=(-np.inf, np.inf)): + """ + The probability distribution type for multivariate gaussian input + + :param size: (int) the number of dimensions of the multivariate gaussian + :param bounds: (float, float) the lower and upper bounds limit for the action space + """ + self.size = size + self.bounds = bounds + + def probability_distribution_class(self): + return DiagGaussianProbabilityDistribution + + def proba_distribution_from_flat(self, flat, bounds=(-np.inf, np.inf)): + """ + returns the probability distribution from flat probabilities + + :param flat: ([float]) the flat probabilities + :param bounds: (float, float) the lower and upper bounds limit for the action space + :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated + """ + return 
self.probability_distribution_class()(flat, bounds) + + def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0): + mean = linear(pi_latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) + logstd = tf.get_variable(name='logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) + pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) + q_values = linear(vf_latent_vector, 'q', self.size, init_scale=init_scale, init_bias=init_bias) + return self.proba_distribution_from_flat(pdparam, self.bounds), mean, q_values + + def param_shape(self): + return [2 * self.size] + + def sample_shape(self): + return [self.size] + + def sample_dtype(self): + return tf.float32 + + +class BernoulliProbabilityDistributionType(ProbabilityDistributionType): + def __init__(self, size): + """ + The probability distribution type for bernoulli input + + :param size: (int) the number of dimensions of the bernoulli distribution + """ + self.size = size + + def probability_distribution_class(self): + return BernoulliProbabilityDistribution + + def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0): + pdparam = linear(pi_latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) + q_values = linear(vf_latent_vector, 'q', self.size, init_scale=init_scale, init_bias=init_bias) + return self.proba_distribution_from_flat(pdparam), pdparam, q_values + + def param_shape(self): + return [self.size] + + def sample_shape(self): + return [self.size] + + def sample_dtype(self): + return tf.int32 + + +class CategoricalProbabilityDistribution(ProbabilityDistribution): + def __init__(self, logits): + """ + Probability distributions from categorical input + + :param logits: ([float]) the categorical logits input + """ + self.logits = logits + + def flatparam(self): + return self.logits + + def mode(self): + return tf.argmax(self.logits, axis=-1) + + def 
neglogp(self, x): + # return tf.nn. (logits=self.logits, labels=x) + # Note: we can't use sparse_softmax_cross_entropy_with_logits because + # the implementation does not allow second-order derivatives... + one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) + return tf.nn.softmax_cross_entropy_with_logits_v2( + logits=self.logits, + labels=tf.stop_gradient(one_hot_actions)) + + def kl(self, other): + a_0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) + a_1 = other.logits - tf.reduce_max(other.logits, axis=-1, keepdims=True) + exp_a_0 = tf.exp(a_0) + exp_a_1 = tf.exp(a_1) + z_0 = tf.reduce_sum(exp_a_0, axis=-1, keepdims=True) + z_1 = tf.reduce_sum(exp_a_1, axis=-1, keepdims=True) + p_0 = exp_a_0 / z_0 + return tf.reduce_sum(p_0 * (a_0 - tf.log(z_0) - a_1 + tf.log(z_1)), axis=-1) + + def entropy(self): + a_0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) + exp_a_0 = tf.exp(a_0) + z_0 = tf.reduce_sum(exp_a_0, axis=-1, keepdims=True) + p_0 = exp_a_0 / z_0 + return tf.reduce_sum(p_0 * (tf.log(z_0) - a_0), axis=-1) + + def sample(self): + uniform = tf.random_uniform(tf.shape(self.logits)) + return tf.argmax(self.logits - tf.log(-tf.log(uniform)), axis=-1) + + @classmethod + def fromflat(cls, flat): + """ + Create an instance of this from new logits values + + :param flat: ([float]) the categorical logits input + :return: (ProbabilityDistribution) the instance from the given categorical input + """ + return cls(flat) + + +class MultiCategoricalProbabilityDistribution(ProbabilityDistribution): + def __init__(self, nvec, flat): + """ + Probability distributions from multicategorical input + + :param nvec: ([int]) the sizes of the different categorical inputs + :param flat: ([float]) the categorical logits input + """ + self.flat = flat + self.categoricals = list(map(CategoricalProbabilityDistribution, tf.split(flat, nvec, axis=-1))) + + def flatparam(self): + return self.flat + + def mode(self): + return 
tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) + + def neglogp(self, x): + return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))]) + + def kl(self, other): + return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)]) + + def entropy(self): + return tf.add_n([p.entropy() for p in self.categoricals]) + + def sample(self): + return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) + + @classmethod + def fromflat(cls, flat): + """ + Create an instance of this from new logits values + + :param flat: ([float]) the multi categorical logits input + :return: (ProbabilityDistribution) the instance from the given multi categorical input + """ + raise NotImplementedError + + +class DiagGaussianProbabilityDistribution(ProbabilityDistribution): + def __init__(self, flat, bounds=(-np.inf, np.inf)): + """ + Probability distributions from multivariate gaussian input + + :param flat: ([float]) the multivariate gaussian input data + :param bounds: (float, float) the lower and upper bounds limit for the action space + """ + self.flat = flat + mean, logstd = tf.split(axis=len(flat.shape) - 1, num_or_size_splits=2, value=flat) + self.mean = mean + self.logstd = logstd + self.std = tf.exp(logstd) + self.bounds = bounds + + def flatparam(self): + return self.flat + + def mode(self): + return self.mean + + def neglogp(self, x): + return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ + + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ + + tf.reduce_sum(self.logstd, axis=-1) + + def kl(self, other): + assert isinstance(other, DiagGaussianProbabilityDistribution) + return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / + (2.0 * tf.square(other.std)) - 0.5, axis=-1) + + def entropy(self): + return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) + + def sample(self): + 
low = self.bounds[0] + high = self.bounds[1] + + # clip the output (clip_by_value does not broadcast correctly) + return tf.minimum(tf.maximum(self.mean + self.std * tf.random_normal(tf.shape(self.mean)), low), high) + + @classmethod + def fromflat(cls, flat, bounds=(-np.inf, np.inf)): + """ + Create an instance of this from new multivariate gaussian input + + :param flat: ([float]) the multivariate gaussian input data + :param bounds: (float, float) the lower and upper bounds limit for the action space + :return: (ProbabilityDistribution) the instance from the given multivariate gaussian input data + """ + return cls(flat, bounds) + + +class BernoulliProbabilityDistribution(ProbabilityDistribution): + def __init__(self, logits): + """ + Probability distributions from bernoulli input + + :param logits: ([float]) the bernoulli input data + """ + self.logits = logits + self.probabilities = tf.sigmoid(logits) + + def flatparam(self): + return self.logits + + def mode(self): + return tf.round(self.probabilities) + + def neglogp(self, x): + return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), + axis=-1) + + def kl(self, other): + return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, + labels=self.probabilities), axis=-1) - \ + tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, + labels=self.probabilities), axis=-1) + + def entropy(self): + return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, + labels=self.probabilities), axis=-1) + + def sample(self): + samples_from_uniform = tf.random_uniform(tf.shape(self.probabilities)) + return tf.to_float(math_ops.less(samples_from_uniform, self.probabilities)) + + @classmethod + def fromflat(cls, flat): + """ + Create an instance of this from new bernoulli input + + :param flat: ([float]) the bernoulli input data + :return: (ProbabilityDistribution) the instance from the given bernoulli input data + 
""" + return cls(flat) + + +def make_proba_dist_type(ac_space): + """ + return an instance of ProbabilityDistributionType for the correct type of action space + + :param ac_space: (Gym Space) the input action space + :return: (ProbabilityDistributionType) the approriate instance of a ProbabilityDistributionType + """ + if isinstance(ac_space, spaces.Box): + assert len(ac_space.shape) == 1, "Error: the action space must be a vector" + return DiagGaussianProbabilityDistributionType(ac_space.shape[0], (ac_space.low, ac_space.high)) + elif isinstance(ac_space, spaces.Discrete): + return CategoricalProbabilityDistributionType(ac_space.n) + elif isinstance(ac_space, spaces.MultiDiscrete): + return MultiCategoricalProbabilityDistributionType(ac_space.nvec) + elif isinstance(ac_space, spaces.MultiBinary): + return BernoulliProbabilityDistributionType(ac_space.n) + else: + raise NotImplementedError("Error: probability distribution, not implemented for action space of type {}." + .format(type(ac_space)) + + " Must be of type Gym Spaces: Box, Discrete, MultiDiscrete or MultiBinary.") + + +def shape_el(tensor, index): + """ + get the shape of a TensorFlow Tensor element + + :param tensor: (TensorFlow Tensor) the input tensor + :param index: (int) the element + :return: ([int]) the shape + """ + maybe = tensor.get_shape()[index] + if maybe is not None: + return maybe + else: + return tf.shape(tensor)[index] diff --git a/stable_baselines/common/filters.py b/stable_baselines/common/filters.py new file mode 100644 index 0000000000..38d602004e --- /dev/null +++ b/stable_baselines/common/filters.py @@ -0,0 +1,211 @@ +from collections import deque + +import numpy as np + +from .running_stat import RunningStat + + +class Filter(object): + """ + takes a value 'x' (numpy Number), applies the filter, and returns the new value. 
+ + Can pass kwarg: 'update' (bool) if the filter can update from the value + """ + def __call__(self, arr, update=True): + raise NotImplementedError + + def reset(self): + """ + resets the filter + """ + pass + + def output_shape(self, input_space): + """ + returns the output shape + + :param input_space: (numpy int) + :return: (numpy int) output shape + """ + raise NotImplementedError + + +class IdentityFilter(Filter): + """ + A filter that implements an identity function + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + """ + def __call__(self, arr, update=True): + return arr + + def output_shape(self, input_space): + return input_space.shape + + +class CompositionFilter(Filter): + def __init__(self, functions): + """ + A filter that implements a composition with other functions + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param functions: ([function]) composition of these functions and the input + """ + self.functions = functions + + def __call__(self, arr, update=True): + for func in self.functions: + arr = func(arr) + return arr + + def output_shape(self, input_space): + out = input_space.shape + for func in self.functions: + out = func.output_shape(out) + return out + + +class ZFilter(Filter): + def __init__(self, shape, demean=True, destd=True, clip=10.0): + """ + A filter that implements a z-filter + y = (x-mean)/std + using running estimates of mean,std + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. 
+ + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param shape: ([int]) the shape of the input + :param demean: (bool) filter mean + :param destd: (bool) filter standard deviation + :param clip: (float) clip filter absolute value to this value + """ + self.demean = demean + self.destd = destd + self.clip = clip + + self.running_stat = RunningStat(shape) + + def __call__(self, arr, update=True): + if update: + self.running_stat.push(arr) + if self.demean: + arr = arr - self.running_stat.mean + if self.destd: + arr = arr / (self.running_stat.std + 1e-8) + if self.clip: + arr = np.clip(arr, -self.clip, self.clip) + return arr + + def output_shape(self, input_space): + return input_space.shape + + +class AddClock(Filter): + def __init__(self): + """ + A filter that appends a counter to the input + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + """ + self.count = 0 + + def reset(self): + self.count = 0 + + def __call__(self, arr, update=True): + return np.append(arr, self.count / 100.0) + + def output_shape(self, input_space): + return input_space.shape[0] + 1, + + +class FlattenFilter(Filter): + """ + A filter that flattens the input + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + """ + def __call__(self, arr, update=True): + return arr.ravel() + + def output_shape(self, input_space): + return int(np.prod(input_space.shape)), + + +class Ind2OneHotFilter(Filter): + def __init__(self, n_cat): + """ + A filter that turns indices to onehot encoding + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. 
+ + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param n_cat: (int) the number of categories + """ + self.n_cat = n_cat + + def __call__(self, arr, update=True): + out = np.zeros(self.n_cat) + out[arr] = 1 + return out + + def output_shape(self, input_space): + return input_space.n, + + +class DivFilter(Filter): + def __init__(self, divisor): + """ + A filter that divides the input from a value + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param divisor: (float) the number you want to divide by + """ + self.divisor = divisor + + def __call__(self, arr, update=True): + return arr / self.divisor + + def output_shape(self, input_space): + return input_space.shape + + +class StackFilter(Filter): + def __init__(self, length): + """ + A filter that runs a stacking of a 'length' inputs + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. 
+ + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param length: (int) the number of inputs to stack + """ + self.stack = deque(maxlen=length) + + def reset(self): + self.stack.clear() + + def __call__(self, arr, update=True): + self.stack.append(arr) + while len(self.stack) < self.stack.maxlen: + self.stack.append(arr) + return np.concatenate(self.stack, axis=-1) + + def output_shape(self, input_space): + return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,) diff --git a/stable_baselines/common/identity_env.py b/stable_baselines/common/identity_env.py new file mode 100644 index 0000000000..1401b2921d --- /dev/null +++ b/stable_baselines/common/identity_env.py @@ -0,0 +1,69 @@ +import numpy as np + +from gym import Env +from gym.spaces import Discrete, MultiDiscrete, MultiBinary + + +class IdentityEnv(Env): + def __init__(self, dim, ep_length=100): + """ + Identity environment for testing purposes + + :param dim: (int) the size of the dimensions you want to learn + :param ep_length: (int) the length of each episodes in timesteps + """ + self.action_space = Discrete(dim) + self.observation_space = self.action_space + self.ep_length = ep_length + self.current_step = 0 + self.dim = dim + self.reset() + + def reset(self): + self.current_step = 0 + self._choose_next_state() + return self.state + + def step(self, action): + reward = self._get_reward(action) + self._choose_next_state() + self.current_step += 1 + done = self.current_step >= self.ep_length + return self.state, reward, done, {} + + def _choose_next_state(self): + self.state = self.action_space.sample() + + def _get_reward(self, action): + return 1 if np.all(self.state == action) else 0 + + def render(self, mode='human'): + pass + + +class IdentityEnvMultiDiscrete(IdentityEnv): + def __init__(self, dim, ep_length=100): + """ + Identity environment for testing purposes + + :param dim: (int) the size of the dimensions you want to learn + :param ep_length: 
(int) the length of each episodes in timesteps + """ + super(IdentityEnvMultiDiscrete, self).__init__(dim, ep_length) + self.action_space = MultiDiscrete([dim, dim]) + self.observation_space = self.action_space + self.reset() + + +class IdentityEnvMultiBinary(IdentityEnv): + def __init__(self, dim, ep_length=100): + """ + Identity environment for testing purposes + + :param dim: (int) the size of the dimensions you want to learn + :param ep_length: (int) the length of each episodes in timesteps + """ + super(IdentityEnvMultiBinary, self).__init__(dim, ep_length) + self.action_space = MultiBinary(dim) + self.observation_space = self.action_space + self.reset() diff --git a/stable_baselines/common/input.py b/stable_baselines/common/input.py new file mode 100644 index 0000000000..fca9df6214 --- /dev/null +++ b/stable_baselines/common/input.py @@ -0,0 +1,50 @@ +import numpy as np +import tensorflow as tf +from gym.spaces import Discrete, Box, MultiBinary, MultiDiscrete + + +def observation_input(ob_space, batch_size=None, name='Ob', scale=False): + """ + Build observation input with encoding depending on the observation space type + + When using Box ob_space, the input will be normalized between [1, 0] on the bounds ob_space.low and ob_space.high. 
+ + :param ob_space: (Gym Space) The observation space + :param batch_size: (int) batch size for input + (default is None, so that resulting input placeholder can take tensors with any batch size) + :param name: (str) tensorflow variable name for input placeholder + :param scale: (bool) whether or not to scale the input + :return: (TensorFlow Tensor, TensorFlow Tensor) input_placeholder, processed_input_tensor + """ + if isinstance(ob_space, Discrete): + input_x = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name) + processed_x = tf.to_float(tf.one_hot(input_x, ob_space.n)) + return input_x, processed_x + + elif isinstance(ob_space, Box): + input_x = tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=ob_space.dtype, name=name) + processed_x = tf.to_float(input_x) + # rescale to [1, 0] if the bounds are defined + if (scale and + not np.any(np.isinf(ob_space.low)) and not np.any(np.isinf(ob_space.high)) and + np.any(ob_space.high - ob_space.low) != 0): + + # equivalent to processed_x / 255.0 when bounds are set to [255, 0] + processed_x = ((processed_x - ob_space.low) / (ob_space.high - ob_space.low)) + return input_x, processed_x + + elif isinstance(ob_space, MultiBinary): + input_x = tf.placeholder(shape=(batch_size, ob_space.n), dtype=tf.int32, name=name) + processed_x = tf.to_float(input_x) + return input_x, processed_x + + elif isinstance(ob_space, MultiDiscrete): + input_x = tf.placeholder(shape=(batch_size, len(ob_space.nvec)), dtype=tf.int32, name=name) + processed_x = tf.concat([tf.to_float(tf.one_hot(input_split, ob_space.nvec[i])) + for i, input_split in enumerate(tf.split(input_x, len(ob_space.nvec), axis=-1))], + axis=-1) + return input_x, processed_x + + else: + raise NotImplementedError("Error: the model does not support input space of type {}".format( + type(ob_space).__name__)) diff --git a/stable_baselines/common/math_util.py b/stable_baselines/common/math_util.py new file mode 100644 index 0000000000..327e69fbe0 --- /dev/null 
+++ b/stable_baselines/common/math_util.py @@ -0,0 +1,103 @@ +import numpy as np +import scipy.signal + + +def discount(vector, gamma): + """ + computes discounted sums along 0th dimension of vector x. + y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], + where k = len(x) - t - 1 + + :param vector: (numpy array) the input vector + :param gamma: (float) the discount value + :return: (numpy Number) the output vector + """ + assert vector.ndim >= 1 + return scipy.signal.lfilter([1], [1, -gamma], vector[::-1], axis=0)[::-1] + + +def explained_variance(y_pred, y_true): + """ + Computes fraction of variance that ypred explains about y. + Returns 1 - Var[y-ypred] / Var[y] + + interpretation: + ev=0 => might as well have predicted zero + ev=1 => perfect prediction + ev<0 => worse than just predicting zero + + :param y_pred: (numpy Number) the prediction + :param y_true: (numpy Number) the expected value + :return: (float) explained variance of ypred and y + """ + assert y_true.ndim == 1 and y_pred.ndim == 1 + var_y = np.var(y_true) + return np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y + + +def explained_variance_2d(y_pred, y_true): + """ + Computes fraction of variance that ypred explains about y, for 2D arrays. 
+ Returns 1 - Var[y-ypred] / Var[y] + + interpretation: + ev=0 => might as well have predicted zero + ev=1 => perfect prediction + ev<0 => worse than just predicting zero + + :param y_pred: (numpy Number) the prediction + :param y_true: (numpy Number) the expected value + :return: (float) explained variance of ypred and y + """ + assert y_true.ndim == 2 and y_pred.ndim == 2 + var_y = np.var(y_true, axis=0) + explained_var = 1 - np.var(y_true - y_pred) / var_y + explained_var[var_y < 1e-10] = 0 + return explained_var + + +def flatten_arrays(arrs): + """ + flattens a list of arrays down to 1D + + :param arrs: ([numpy Number]) arrays + :return: (numpy Number) 1D flattend array + """ + return np.concatenate([arr.flat for arr in arrs]) + + +def unflatten_vector(vec, shapes): + """ + reshape a flattened array + + :param vec: (numpy Number) 1D arrays + :param shapes: (tuple) + :return: ([numpy Number]) reshaped array + """ + i = 0 + arrs = [] + for shape in shapes: + size = np.prod(shape) + arr = vec[i:i + size].reshape(shape) + arrs.append(arr) + i += size + return arrs + + +def discount_with_boundaries(rewards, episode_starts, gamma): + """ + computes discounted sums along 0th dimension of x (reward), while taking into account the start of each episode. + y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... 
+ gamma^k x[t+k], + where k = len(x) - t - 1 + + :param rewards: (numpy Number) the input vector (rewards) + :param episode_starts: (numpy Number) 2d array of bools, indicating when a new episode has started + :param gamma: (float) the discount factor + :return: (numpy Number) the output vector (discounted rewards) + """ + discounted_rewards = np.zeros_like(rewards) + n_samples = rewards.shape[0] + discounted_rewards[n_samples - 1] = rewards[n_samples - 1] + for step in range(n_samples - 2, -1, -1): + discounted_rewards[step] = rewards[step] + gamma * discounted_rewards[step + 1] * (1 - episode_starts[step + 1]) + return discounted_rewards diff --git a/baselines/common/misc_util.py b/stable_baselines/common/misc_util.py similarity index 51% rename from baselines/common/misc_util.py rename to stable_baselines/common/misc_util.py index 9985dea205..5532805168 100644 --- a/baselines/common/misc_util.py +++ b/stable_baselines/common/misc_util.py @@ -1,15 +1,23 @@ -import gym -import numpy as np import os import pickle import random import tempfile import zipfile +import gym +import numpy as np +import tensorflow as tf + def zipsame(*seqs): - L = len(seqs[0]) - assert all(len(seq) == L for seq in seqs[1:]) + """ + Performes a zip function, but asserts that all zipped elements are of the same size + + :param seqs: a list of arrays that are zipped together + :return: the zipped arguments + """ + length = len(seqs[0]) + assert all(len(seq) == length for seq in seqs[1:]) return zip(*seqs) @@ -20,79 +28,81 @@ def unpack(seq, sizes): Example: unpack([1,2,3,4,5,6], [3,None,2]) -> ([1,2,3], 4, [5,6]) + + :param seq: (Iterable) the sequence to unpack + :param sizes: ([int]) the shape to unpack + :return: ([Any] or Any) the unpacked sequence """ seq = list(seq) - it = iter(seq) + iterator = iter(seq) assert sum(1 if s is None else s for s in sizes) == len(seq), "Trying to unpack %s into %s" % (seq, sizes) for size in sizes: if size is None: - yield it.__next__() + yield 
iterator.__next__() else: - li = [] + _list = [] for _ in range(size): - li.append(it.__next__()) - yield li + _list.append(iterator.__next__()) + yield _list class EzPickle(object): - """Objects that are pickled and unpickled via their constructor - arguments. + def __init__(self, *args, **kwargs): + """ + Objects that are pickled and unpickled via their constructor arguments. - Example usage: + Example usage: - class Dog(Animal, EzPickle): - def __init__(self, furcolor, tailkind="bushy"): - Animal.__init__() - EzPickle.__init__(furcolor, tailkind) - ... + class Dog(Animal, EzPickle): + def __init__(self, furcolor, tailkind="bushy"): + Animal.__init__() + EzPickle.__init__(furcolor, tailkind) + ... - When this object is unpickled, a new Dog will be constructed by passing the provided - furcolor and tailkind into the constructor. However, philosophers are still not sure - whether it is still the same dog. + When this object is unpickled, a new Dog will be constructed by passing the provided + furcolor and tailkind into the constructor. However, philosophers are still not sure + whether it is still the same dog. - This is generally needed only for environments which wrap C/C++ code, such as MuJoCo - and Atari. - """ + This is generally needed only for environments which wrap C/C++ code, such as MuJoCo + and Atari. 
- def __init__(self, *args, **kwargs): + :param args: ezpickle args + :param kwargs: ezpickle kwargs + """ self._ezpickle_args = args self._ezpickle_kwargs = kwargs def __getstate__(self): return {"_ezpickle_args": self._ezpickle_args, "_ezpickle_kwargs": self._ezpickle_kwargs} - def __setstate__(self, d): - out = type(self)(*d["_ezpickle_args"], **d["_ezpickle_kwargs"]) + def __setstate__(self, _dict): + out = type(self)(*_dict["_ezpickle_args"], **_dict["_ezpickle_kwargs"]) self.__dict__.update(out.__dict__) -def set_global_seeds(i): - try: - import tensorflow as tf - except ImportError: - pass - else: - tf.set_random_seed(i) - np.random.seed(i) - random.seed(i) +def set_global_seeds(seed): + """ + set the seed for python random, tensorflow, numpy and gym spaces + + :param seed: (int) the seed + """ + tf.set_random_seed(seed) + np.random.seed(seed) + random.seed(seed) + gym.spaces.prng.seed(seed) def pretty_eta(seconds_left): - """Print the number of seconds in human readable format. + """ + Print the number of seconds in human readable format. Examples: 2 days 2 hours and 37 minutes less than a minute - Paramters - --------- - seconds_left: int - Number of seconds to be converted to the ETA - Returns - ------- - eta: str - String representing the pretty ETA. + :param seconds_left: (int) Number of seconds to be converted to the ETA + :return: (str) String representing the pretty ETA. """ minutes_left = seconds_left // 60 seconds_left %= 60 @@ -121,27 +131,21 @@ def helper(cnt, name): class RunningAvg(object): def __init__(self, gamma, init_value=None): - """Keep a running estimate of a quantity. This is a bit like mean + """ + Keep a running estimate of a quantity. This is a bit like mean but more sensitive to recent changes. - Parameters - ---------- - gamma: float - Must be between 0 and 1, where 0 is the most sensitive to recent - changes. - init_value: float or None - Initial value of the estimate. If None, it will be set on the first update. 
+ :param gamma: (float) Must be between 0 and 1, where 0 is the most sensitive to recent changes. + :param init_value: (float) Initial value of the estimate. If None, it will be set on the first update. """ self._value = init_value self._gamma = gamma def update(self, new_val): - """Update the estimate. + """ + Update the estimate. - Parameters - ---------- - new_val: float - new observated value of estimated quantity. + :param new_val: (float) new observated value of estimated quantity. """ if self._value is None: self._value = new_val @@ -149,43 +153,36 @@ def update(self, new_val): self._value = self._gamma * self._value + (1.0 - self._gamma) * new_val def __float__(self): - """Get the current estimate""" + """ + Get the current estimate + + :return: (float) current value + """ return self._value -def boolean_flag(parser, name, default=False, help=None): - """Add a boolean flag to argparse parser. - - Parameters - ---------- - parser: argparse.Parser - parser to add the flag to - name: str - -- will enable the flag, while --no- will disable it - default: bool or None - default value of the flag - help: str - help string for the flag + +def boolean_flag(parser, name, default=False, help_msg=None): + """ + Add a boolean flag to argparse parser. 
+ + :param parser: (argparse.Parser) parser to add the flag to + :param name: (str) -- will enable the flag, while --no- will disable it + :param default: (bool) default value of the flag + :param help_msg: (str) help string for the flag """ dest = name.replace('-', '_') - parser.add_argument("--" + name, action="store_true", default=default, dest=dest, help=help) + parser.add_argument("--" + name, action="store_true", default=default, dest=dest, help=help_msg) parser.add_argument("--no-" + name, action="store_false", dest=dest) def get_wrapper_by_name(env, classname): - """Given an a gym environment possibly wrapped multiple times, returns a wrapper + """ + Given an a gym environment possibly wrapped multiple times, returns a wrapper of class named classname or raises ValueError if no such wrapper was applied - Parameters - ---------- - env: gym.Env of gym.Wrapper - gym environment - classname: str - name of the wrapper - - Returns - ------- - wrapper: gym.Wrapper - wrapper named classname + :param env: (Gym Environment) the environment + :param classname: (str) name of the wrapper + :return: (Gym Environment) the wrapped environment """ currentenv = env while True: @@ -198,7 +195,8 @@ def get_wrapper_by_name(env, classname): def relatively_safe_pickle_dump(obj, path, compression=False): - """This is just like regular pickle dump, except from the fact that failure cases are + """ + This is just like regular pickle dump, except from the fact that failure cases are different: - It's never possible that we end up with a pickle in corrupted state. @@ -210,14 +208,9 @@ def relatively_safe_pickle_dump(obj, path, compression=False): The indended use case is periodic checkpoints of experiment state, such that we never corrupt previous checkpoints if the current one fails. 
- Parameters - ---------- - obj: object - object to pickle - path: str - path to the output file - compression: bool - if true pickle will be compressed + :param obj: (Object) object to pickle + :param path: (str) path to the output file + :param compression: (bool) if true pickle will be compressed """ temp_storage = path + ".relatively_safe" if compression: @@ -228,31 +221,24 @@ def relatively_safe_pickle_dump(obj, path, compression=False): with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip: myzip.write(uncompressed_file.name, "data") else: - with open(temp_storage, "wb") as f: - pickle.dump(obj, f) + with open(temp_storage, "wb") as file_handler: + pickle.dump(obj, file_handler) os.rename(temp_storage, path) def pickle_load(path, compression=False): - """Unpickle a possible compressed pickle. - - Parameters - ---------- - path: str - path to the output file - compression: bool - if true assumes that pickle was compressed when created and attempts decompression. - - Returns - ------- - obj: object - the unpickled object + """ + Unpickle a possible compressed pickle. + + :param path: (str) path to the output file + :param compression: (bool) if true assumes that pickle was compressed when created and attempts decompression. 
+ :return: (Object) the unpickled object """ if compression: with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as myzip: - with myzip.open("data") as f: - return pickle.load(f) + with myzip.open("data") as file_handler: + return pickle.load(file_handler) else: - with open(path, "rb") as f: - return pickle.load(f) + with open(path, "rb") as file_handler: + return pickle.load(file_handler) diff --git a/stable_baselines/common/mpi_adam.py b/stable_baselines/common/mpi_adam.py new file mode 100644 index 0000000000..924cabcd63 --- /dev/null +++ b/stable_baselines/common/mpi_adam.py @@ -0,0 +1,121 @@ +import tensorflow as tf +import numpy as np +from mpi4py import MPI + +import stable_baselines.common.tf_util as tf_utils + + +class MpiAdam(object): + def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None, + sess=None): + """ + A parallel MPI implementation of the Adam optimizer for TensorFlow + https://arxiv.org/abs/1412.6980 + + :param var_list: ([TensorFlow Tensor]) the variables + :param beta1: (float) Adam beta1 parameter + :param beta2: (float) Adam beta2 parameter + :param epsilon: (float) to help with preventing arithmetic issues + :param scale_grad_by_procs: (bool) if the scaling should be done by processes + :param comm: (MPI Communicators) if None, MPI.COMM_WORLD + :param sess: (TensorFlow Session) if None, tf.get_default_session() + """ + self.var_list = var_list + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.scale_grad_by_procs = scale_grad_by_procs + size = sum(tf_utils.numel(v) for v in var_list) + # Exponential moving average of gradient values + # "first moment estimate" m in the paper + self.exp_avg = np.zeros(size, 'float32') + # Exponential moving average of squared gradient values + # "second raw moment estimate" v in the paper + self.exp_avg_sq = np.zeros(size, 'float32') + self.step = 0 + self.setfromflat = tf_utils.SetFromFlat(var_list, sess=sess) + 
self.getflat = tf_utils.GetFlat(var_list, sess=sess) + self.comm = MPI.COMM_WORLD if comm is None else comm + + def update(self, local_grad, learning_rate): + """ + update the values of the graph + + :param local_grad: (numpy float) the gradient + :param learning_rate: (float) the learning_rate for the update + """ + if self.step % 100 == 0: + self.check_synced() + local_grad = local_grad.astype('float32') + global_grad = np.zeros_like(local_grad) + self.comm.Allreduce(local_grad, global_grad, op=MPI.SUM) + if self.scale_grad_by_procs: + global_grad /= self.comm.Get_size() + + self.step += 1 + # Learning rate with bias correction + step_size = learning_rate * np.sqrt(1 - self.beta2 ** self.step) / (1 - self.beta1 ** self.step) + # Decay the first and second moment running average coefficient + self.exp_avg = self.beta1 * self.exp_avg + (1 - self.beta1) * global_grad + self.exp_avg_sq = self.beta2 * self.exp_avg_sq + (1 - self.beta2) * (global_grad * global_grad) + step = (- step_size) * self.exp_avg / (np.sqrt(self.exp_avg_sq) + self.epsilon) + self.setfromflat(self.getflat() + step) + + def sync(self): + """ + synchronize the MPI threads + """ + theta = self.getflat() + self.comm.Bcast(theta, root=0) + self.setfromflat(theta) + + def check_synced(self): + """ + confirm the MPI threads are synced + """ + if self.comm.Get_rank() == 0: # this is root + theta = self.getflat() + self.comm.Bcast(theta, root=0) + else: + thetalocal = self.getflat() + thetaroot = np.empty_like(thetalocal) + self.comm.Bcast(thetaroot, root=0) + assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) + + +@tf_utils.in_session +def test_mpi_adam(): + """ + tests the MpiAdam object's functionality + """ + np.random.seed(0) + tf.set_random_seed(0) + + a_var = tf.Variable(np.random.randn(3).astype('float32')) + b_var = tf.Variable(np.random.randn(2, 5).astype('float32')) + loss = tf.reduce_sum(tf.square(a_var)) + tf.reduce_sum(tf.sin(b_var)) + + learning_rate = 1e-2 + update_op = 
tf.train.AdamOptimizer(learning_rate).minimize(loss) + do_update = tf_utils.function([], loss, updates=[update_op]) + + tf.get_default_session().run(tf.global_variables_initializer()) + for step in range(10): + print(step, do_update()) + + tf.set_random_seed(0) + tf.get_default_session().run(tf.global_variables_initializer()) + + var_list = [a_var, b_var] + lossandgrad = tf_utils.function([], [loss, tf_utils.flatgrad(loss, var_list)], updates=[update_op]) + adam = MpiAdam(var_list) + + for step in range(10): + loss, grad = lossandgrad() + adam.update(grad, learning_rate) + print(step, loss) + + +if __name__ == "__main__": + # Run with mpirun -np 2 python + test_mpi_adam() diff --git a/baselines/common/mpi_fork.py b/stable_baselines/common/mpi_fork.py similarity index 59% rename from baselines/common/mpi_fork.py rename to stable_baselines/common/mpi_fork.py index c5e609e66c..2012f5cad3 100644 --- a/baselines/common/mpi_fork.py +++ b/stable_baselines/common/mpi_fork.py @@ -1,10 +1,18 @@ -import os, subprocess, sys +import os +import subprocess +import sys -def mpi_fork(n, bind_to_core=False): - """Re-launches the current script with workers + +def mpi_fork(rank, bind_to_core=False): + """ + Re-launches the current script with workers Returns "parent" for original parent, "child" for MPI children + + :param rank: (int) the rank + :param bind_to_core: (bool) enables binding to core + :return: (str) the correct type of thread name """ - if n<=1: + if rank <= 1: return "child" if os.getenv("IN_MPI") is None: env = os.environ.copy() @@ -13,7 +21,7 @@ def mpi_fork(n, bind_to_core=False): OMP_NUM_THREADS="1", IN_MPI="1" ) - args = ["mpirun", "-np", str(n)] + args = ["mpirun", "-np", str(rank)] if bind_to_core: args += ["-bind-to", "core"] args += [sys.executable] + sys.argv diff --git a/stable_baselines/common/mpi_moments.py b/stable_baselines/common/mpi_moments.py new file mode 100644 index 0000000000..45e143e1d6 --- /dev/null +++ b/stable_baselines/common/mpi_moments.py 
@@ -0,0 +1,71 @@ +from mpi4py import MPI +import numpy as np + +from stable_baselines.common import zipsame + + +def mpi_mean(arr, axis=0, comm=None, keepdims=False): + """ + calculates the mean of an array, using MPI + + :param arr: (numpy Number) + :param axis: (int or tuple or list) the axis to run the means over + :param comm: (MPI Communicators) if None, MPI.COMM_WORLD + :param keepdims: (bool) keep the other dimensions intact + :return: (numpy Number, Number) the mean and the total count along the axis, reduced over the communicator + """ + arr = np.asarray(arr) + assert arr.ndim > 0 + if comm is None: + comm = MPI.COMM_WORLD + xsum = arr.sum(axis=axis, keepdims=keepdims) + size = xsum.size + localsum = np.zeros(size + 1, arr.dtype) + localsum[:size] = xsum.ravel() + localsum[size] = arr.shape[axis] + globalsum = np.zeros_like(localsum) + comm.Allreduce(localsum, globalsum, op=MPI.SUM) + return globalsum[:size].reshape(xsum.shape) / globalsum[size], globalsum[size] + + +def mpi_moments(arr, axis=0, comm=None, keepdims=False): + """ + calculates the mean and std of an array, using MPI + + :param arr: (numpy Number) + :param axis: (int or tuple or list) the axis to run the moments over + :param comm: (MPI Communicators) if None, MPI.COMM_WORLD + :param keepdims: (bool) keep the other dimensions intact + :return: (numpy Number, numpy Number, Number) the mean, the std and the total count along the axis + """ + arr = np.asarray(arr) + assert arr.ndim > 0 + mean, count = mpi_mean(arr, axis=axis, comm=comm, keepdims=True) + sqdiffs = np.square(arr - mean) + meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) + assert count1 == count + std = np.sqrt(meansqdiff) + if not keepdims: + newshape = mean.shape[:axis] + mean.shape[axis+1:] + mean = mean.reshape(newshape) + std = std.reshape(newshape) + return mean, std, count + + +def _helper_runningmeanstd(): + comm = MPI.COMM_WORLD + np.random.seed(0) + for (triple, axis) in [ + ((np.random.randn(3), np.random.randn(4), np.random.randn(5)), 0), + ((np.random.randn(3, 2), 
np.random.randn(4, 2), np.random.randn(5, 2)), 0), + ((np.random.randn(2, 3), np.random.randn(2, 4), np.random.randn(2, 4)), 1)]: + + arr = np.concatenate(triple, axis=axis) + ms1 = [arr.mean(axis=axis), arr.std(axis=axis), arr.shape[axis]] + + ms2 = mpi_moments(triple[comm.Get_rank()], axis=axis) + + for (res_1, res_2) in zipsame(ms1, ms2): + print(res_1, res_2) + assert np.allclose(res_1, res_2) + print("ok!") diff --git a/stable_baselines/common/mpi_running_mean_std.py b/stable_baselines/common/mpi_running_mean_std.py new file mode 100644 index 0000000000..b520fd4cc4 --- /dev/null +++ b/stable_baselines/common/mpi_running_mean_std.py @@ -0,0 +1,104 @@ +from mpi4py import MPI +import tensorflow as tf +import numpy as np + +import stable_baselines.common.tf_util as tf_util + + +class RunningMeanStd(object): + def __init__(self, epsilon=1e-2, shape=()): + """ + calculates the running mean and std of a data stream + https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + + :param epsilon: (float) helps with arithmetic issues + :param shape: (tuple) the shape of the data stream's output + """ + self._sum = tf.get_variable( + dtype=tf.float64, + shape=shape, + initializer=tf.constant_initializer(0.0), + name="runningsum", trainable=False) + self._sumsq = tf.get_variable( + dtype=tf.float64, + shape=shape, + initializer=tf.constant_initializer(epsilon), + name="runningsumsq", trainable=False) + self._count = tf.get_variable( + dtype=tf.float64, + shape=(), + initializer=tf.constant_initializer(epsilon), + name="count", trainable=False) + self.shape = shape + + self.mean = tf.to_float(self._sum / self._count) + self.std = tf.sqrt(tf.maximum(tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2)) + + newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') + newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') + newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') + 
self.incfiltparams = tf_util.function([newsum, newsumsq, newcount], [], + updates=[tf.assign_add(self._sum, newsum), + tf.assign_add(self._sumsq, newsumsq), + tf.assign_add(self._count, newcount)]) + + def update(self, data): + """ + update the running mean and std + + :param data: (numpy Number) the data + """ + data = data.astype('float64') + data_size = int(np.prod(self.shape)) + totalvec = np.zeros(data_size * 2 + 1, 'float64') + addvec = np.concatenate([data.sum(axis=0).ravel(), np.square(data).sum(axis=0).ravel(), + np.array([len(data)], dtype='float64')]) + MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) + self.incfiltparams(totalvec[0: data_size].reshape(self.shape), + totalvec[data_size: 2 * data_size].reshape(self.shape), totalvec[2 * data_size]) + + +@tf_util.in_session +def test_dist(): + """ + test the running mean std + """ + np.random.seed(0) + p_1, p_2, p_3 = (np.random.randn(3, 1), np.random.randn(4, 1), np.random.randn(5, 1)) + q_1, q_2, q_3 = (np.random.randn(6, 1), np.random.randn(7, 1), np.random.randn(8, 1)) + + comm = MPI.COMM_WORLD + assert comm.Get_size() == 2 + if comm.Get_rank() == 0: + x_1, x_2, x_3 = p_1, p_2, p_3 + elif comm.Get_rank() == 1: + x_1, x_2, x_3 = q_1, q_2, q_3 + else: + assert False + + rms = RunningMeanStd(epsilon=0.0, shape=(1,)) + tf_util.initialize() + + rms.update(x_1) + rms.update(x_2) + rms.update(x_3) + + bigvec = np.concatenate([p_1, p_2, p_3, q_1, q_2, q_3]) + + def checkallclose(var_1, var_2): + print(var_1, var_2) + return np.allclose(var_1, var_2) + + assert checkallclose( + bigvec.mean(axis=0), + rms.mean.eval(), + ) + assert checkallclose( + bigvec.std(axis=0), + rms.std.eval(), + ) + + +if __name__ == "__main__": + # Run with mpirun -np 2 python + test_dist() diff --git a/stable_baselines/common/policies.py b/stable_baselines/common/policies.py new file mode 100644 index 0000000000..f3fb17b0bb --- /dev/null +++ b/stable_baselines/common/policies.py @@ -0,0 +1,338 @@ +import numpy as np +import 
tensorflow as tf +from gym.spaces import Discrete + +from stable_baselines.a2c.utils import conv, linear, conv_to_fc, batch_to_seq, seq_to_batch, lstm +from stable_baselines.common.distributions import make_proba_dist_type +from stable_baselines.common.input import observation_input + + +def nature_cnn(scaled_images, **kwargs): + """ + CNN from Nature paper. + + :param scaled_images: (TensorFlow Tensor) Image input placeholder + :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN + :return: (TensorFlow Tensor) The CNN output layer + """ + activ = tf.nn.relu + layer_1 = activ(conv(scaled_images, 'c1', n_filters=32, filter_size=8, stride=4, init_scale=np.sqrt(2), **kwargs)) + layer_2 = activ(conv(layer_1, 'c2', n_filters=64, filter_size=4, stride=2, init_scale=np.sqrt(2), **kwargs)) + layer_3 = activ(conv(layer_2, 'c3', n_filters=64, filter_size=3, stride=1, init_scale=np.sqrt(2), **kwargs)) + layer_3 = conv_to_fc(layer_3) + return activ(linear(layer_3, 'fc1', n_hidden=512, init_scale=np.sqrt(2))) + + +class ActorCriticPolicy(object): + """ + Policy object that implements actor critic + + :param sess: (TensorFlow session) The current TensorFlow session + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param n_env: (int) The number of environments to run + :param n_steps: (int) The number of steps to run for each environment + :param n_batch: (int) The number of batch to run (n_envs * n_steps) + :param n_lstm: (int) The number of LSTM cells (for reccurent policies) + :param reuse: (bool) If the policy is reusable or not + :param scale: (bool) whether or not to scale the input + """ + + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, scale=False): + self.n_env = n_env + self.n_steps = n_steps + self.obs_ph, self.processed_x = observation_input(ob_space, n_batch, scale=scale) + self.masks_ph = 
tf.placeholder(tf.float32, [n_batch]) # mask (done t-1) + self.states_ph = tf.placeholder(tf.float32, [self.n_env, n_lstm * 2]) # states + self.pdtype = make_proba_dist_type(ac_space) + self.sess = sess + self.reuse = reuse + self.is_discrete = isinstance(ac_space, Discrete) + self.policy = None + self.proba_distribution = None + self.value_fn = None + self.ob_space = ob_space + + def _setup_init(self): + """ + sets up the distibutions, actions, and value + """ + assert self.policy is not None and self.proba_distribution is not None and self.value_fn is not None + self.action = self.proba_distribution.sample() + self.neglogp = self.proba_distribution.neglogp(self.action) + self.policy_proba = self.policy + if self.is_discrete: + self.policy_proba = tf.nn.softmax(self.policy_proba) + self._value = self.value_fn[:, 0] + + def step(self, obs, state=None, mask=None): + """ + Returns the policy for a single step + + :param obs: ([float] or [int]) The current observation of the environment + :param state: ([float]) The last states (used in reccurent policies) + :param mask: ([float]) The last masks (used in reccurent policies) + :return: ([float], [float], [float], [float]) actions, values, states, neglogp + """ + raise NotImplementedError + + def proba_step(self, obs, state=None, mask=None): + """ + Returns the action probability for a single step + + :param obs: ([float] or [int]) The current observation of the environment + :param state: ([float]) The last states (used in reccurent policies) + :param mask: ([float]) The last masks (used in reccurent policies) + :return: ([float]) the action probability + """ + raise NotImplementedError + + def value(self, obs, state=None, mask=None): + """ + Returns the value for a single step + + :param obs: ([float] or [int]) The current observation of the environment + :param state: ([float]) The last states (used in reccurent policies) + :param mask: ([float]) The last masks (used in reccurent policies) + :return: ([float]) The 
associated value of the action + """ + raise NotImplementedError + + +class LstmPolicy(ActorCriticPolicy): + """ + Policy object that implements actor critic, using LSTMs. + + :param sess: (TensorFlow session) The current TensorFlow session + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param n_env: (int) The number of environments to run + :param n_steps: (int) The number of steps to run for each environment + :param n_batch: (int) The number of batch to run (n_envs * n_steps) + :param n_lstm: (int) The number of LSTM cells (for reccurent policies) + :param reuse: (bool) If the policy is reusable or not + :param layers: ([int]) The size of the Neural network before the LSTM layer (if None, default to [64, 64]) + :param cnn_extractor: (function (TensorFlow Tensor, **kwargs): (TensorFlow Tensor)) the CNN feature extraction + :param layer_norm: (bool) Whether or not to use layer normalizing LSTMs + :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp") + :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction + """ + + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, layers=None, + cnn_extractor=nature_cnn, layer_norm=False, feature_extraction="cnn", **kwargs): + super(LstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, + scale=(feature_extraction == "cnn")) + + if layers is None: + layers = [64, 64] + + with tf.variable_scope("model", reuse=reuse): + if feature_extraction == "cnn": + extracted_features = cnn_extractor(self.processed_x, **kwargs) + else: + activ = tf.tanh + extracted_features = tf.layers.flatten(self.processed_x) + for i, layer_size in enumerate(layers): + extracted_features = activ(linear(extracted_features, 'pi_fc' + str(i), n_hidden=layer_size, + init_scale=np.sqrt(2))) + input_sequence = 
batch_to_seq(extracted_features, self.n_env, n_steps) + masks = batch_to_seq(self.masks_ph, self.n_env, n_steps) + rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm, + layer_norm=layer_norm) + rnn_output = seq_to_batch(rnn_output) + value_fn = linear(rnn_output, 'vf', 1) + + self.proba_distribution, self.policy, self.q_value = \ + self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output) + + self.value_fn = value_fn + self.initial_state = np.zeros((self.n_env, n_lstm * 2), dtype=np.float32) + self._setup_init() + + def step(self, obs, state=None, mask=None): + return self.sess.run([self.action, self._value, self.snew, self.neglogp], + {self.obs_ph: obs, self.states_ph: state, self.masks_ph: mask}) + + def proba_step(self, obs, state=None, mask=None): + return self.sess.run(self.policy_proba, {self.obs_ph: obs, self.states_ph: state, self.masks_ph: mask}) + + def value(self, obs, state=None, mask=None): + return self.sess.run(self._value, {self.obs_ph: obs, self.states_ph: state, self.masks_ph: mask}) + + +class FeedForwardPolicy(ActorCriticPolicy): + """ + Policy object that implements actor critic, using a feed forward neural network. 
+ + :param sess: (TensorFlow session) The current TensorFlow session + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param n_env: (int) The number of environments to run + :param n_steps: (int) The number of steps to run for each environment + :param n_batch: (int) The number of batch to run (n_envs * n_steps) + :param reuse: (bool) If the policy is reusable or not + :param layers: ([int]) The size of the Neural network for the policy (if None, default to [64, 64]) + :param cnn_extractor: (function (TensorFlow Tensor, **kwargs): (TensorFlow Tensor)) the CNN feature extraction + :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp") + :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction + """ + + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None, + cnn_extractor=nature_cnn, feature_extraction="cnn", **kwargs): + super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, + reuse=reuse, scale=(feature_extraction == "cnn")) + if layers is None: + layers = [64, 64] + + with tf.variable_scope("model", reuse=reuse): + if feature_extraction == "cnn": + extracted_features = cnn_extractor(self.processed_x, **kwargs) + value_fn = linear(extracted_features, 'vf', 1) + pi_latent = extracted_features + vf_latent = extracted_features + else: + activ = tf.tanh + processed_x = tf.layers.flatten(self.processed_x) + pi_h = processed_x + vf_h = processed_x + for i, layer_size in enumerate(layers): + pi_h = activ(linear(pi_h, 'pi_fc' + str(i), n_hidden=layer_size, init_scale=np.sqrt(2))) + vf_h = activ(linear(vf_h, 'vf_fc' + str(i), n_hidden=layer_size, init_scale=np.sqrt(2))) + value_fn = linear(vf_h, 'vf', 1) + pi_latent = pi_h + vf_latent = vf_h + + self.proba_distribution, self.policy, self.q_value = \ + 
self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01) + + self.value_fn = value_fn + self.initial_state = None + self._setup_init() + + def step(self, obs, state=None, mask=None): + action, value, neglogp = self.sess.run([self.action, self._value, self.neglogp], {self.obs_ph: obs}) + return action, value, self.initial_state, neglogp + + def proba_step(self, obs, state=None, mask=None): + return self.sess.run(self.policy_proba, {self.obs_ph: obs}) + + def value(self, obs, state=None, mask=None): + return self.sess.run(self._value, {self.obs_ph: obs}) + + +class CnnPolicy(FeedForwardPolicy): + """ + Policy object that implements actor critic, using a CNN (the nature CNN) + + :param sess: (TensorFlow session) The current TensorFlow session + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param n_env: (int) The number of environments to run + :param n_steps: (int) The number of steps to run for each environment + :param n_batch: (int) The number of batch to run (n_envs * n_steps) + :param reuse: (bool) If the policy is reusable or not + :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction + """ + + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs): + super(CnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, + feature_extraction="cnn", **_kwargs) + + +class CnnLstmPolicy(LstmPolicy): + """ + Policy object that implements actor critic, using LSTMs with a CNN feature extraction + + :param sess: (TensorFlow session) The current TensorFlow session + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param n_env: (int) The number of environments to run + :param n_steps: (int) The number of steps to run for each environment + :param n_batch: (int) The number of 
batch to run (n_envs * n_steps) + :param n_lstm: (int) The number of LSTM cells (for reccurent policies) + :param reuse: (bool) If the policy is reusable or not + :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction + """ + + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, **_kwargs): + super(CnnLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, + layer_norm=False, feature_extraction="cnn", **_kwargs) + + +class CnnLnLstmPolicy(LstmPolicy): + """ + Policy object that implements actor critic, using a layer normalized LSTMs with a CNN feature extraction + + :param sess: (TensorFlow session) The current TensorFlow session + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param n_env: (int) The number of environments to run + :param n_steps: (int) The number of steps to run for each environment + :param n_batch: (int) The number of batch to run (n_envs * n_steps) + :param n_lstm: (int) The number of LSTM cells (for reccurent policies) + :param reuse: (bool) If the policy is reusable or not + :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction + """ + + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, **_kwargs): + super(CnnLnLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, + layer_norm=True, feature_extraction="cnn", **_kwargs) + + +class MlpPolicy(FeedForwardPolicy): + """ + Policy object that implements actor critic, using a MLP (2 layers of 64) + + :param sess: (TensorFlow session) The current TensorFlow session + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param n_env: (int) The number of environments to run + :param n_steps: (int) The number 
of steps to run for each environment + :param n_batch: (int) The number of batch to run (n_envs * n_steps) + :param reuse: (bool) If the policy is reusable or not + :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction + """ + + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs): + super(MlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, + feature_extraction="mlp", **_kwargs) + + +class MlpLstmPolicy(LstmPolicy): + """ + Policy object that implements actor critic, using LSTMs with a MLP feature extraction + + :param sess: (TensorFlow session) The current TensorFlow session + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param n_env: (int) The number of environments to run + :param n_steps: (int) The number of steps to run for each environment + :param n_batch: (int) The number of batch to run (n_envs * n_steps) + :param n_lstm: (int) The number of LSTM cells (for reccurent policies) + :param reuse: (bool) If the policy is reusable or not + :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction + """ + + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, **_kwargs): + super(MlpLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, + layer_norm=False, feature_extraction="mlp", **_kwargs) + + +class MlpLnLstmPolicy(LstmPolicy): + """ + Policy object that implements actor critic, using a layer normalized LSTMs with a MLP feature extraction + + :param sess: (TensorFlow session) The current TensorFlow session + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param n_env: (int) The number of environments to run + :param n_steps: (int) The number of steps to run for each 
environment + :param n_batch: (int) The number of batch to run (n_envs * n_steps) + :param n_lstm: (int) The number of LSTM cells (for reccurent policies) + :param reuse: (bool) If the policy is reusable or not + :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction + """ + + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, **_kwargs): + super(MlpLnLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, + layer_norm=True, feature_extraction="mlp", **_kwargs) diff --git a/stable_baselines/common/runners.py b/stable_baselines/common/runners.py new file mode 100644 index 0000000000..7c9df3ce7d --- /dev/null +++ b/stable_baselines/common/runners.py @@ -0,0 +1,29 @@ +import numpy as np +from abc import ABC, abstractmethod + + +class AbstractEnvRunner(ABC): + def __init__(self, *, env, model, n_steps): + """ + A runner to learn the policy of an environment for a model + + :param env: (Gym environment) The environment to learn from + :param model: (Model) The model to learn + :param n_steps: (int) The number of steps to run for each environment + """ + self.env = env + self.model = model + n_env = env.num_envs + self.batch_ob_shape = (n_env*n_steps,) + env.observation_space.shape + self.obs = np.zeros((n_env,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) + self.obs[:] = env.reset() + self.n_steps = n_steps + self.states = model.initial_state + self.dones = [False for _ in range(n_env)] + + @abstractmethod + def run(self): + """ + Run a learning step of the model + """ + raise NotImplementedError diff --git a/stable_baselines/common/running_mean_std.py b/stable_baselines/common/running_mean_std.py new file mode 100644 index 0000000000..d6a03d6ebf --- /dev/null +++ b/stable_baselines/common/running_mean_std.py @@ -0,0 +1,37 @@ +import numpy as np + + +class RunningMeanStd(object): + def __init__(self, epsilon=1e-4, shape=()): + """ + 
calulates the running mean and std of a data stream + https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + + :param epsilon: (float) helps with arithmetic issues + :param shape: (tuple) the shape of the data stream's output + """ + self.mean = np.zeros(shape, 'float64') + self.var = np.ones(shape, 'float64') + self.count = epsilon + + def update(self, arr): + batch_mean = np.mean(arr, axis=0) + batch_var = np.var(arr, axis=0) + batch_count = arr.shape[0] + self.update_from_moments(batch_mean, batch_var, batch_count) + + def update_from_moments(self, batch_mean, batch_var, batch_count): + delta = batch_mean - self.mean + tot_count = self.count + batch_count + + new_mean = self.mean + delta * batch_count / tot_count + m_a = self.var * self.count + m_b = batch_var * batch_count + m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) + new_var = m_2 / (self.count + batch_count) + + new_count = batch_count + self.count + + self.mean = new_mean + self.var = new_var + self.count = new_count diff --git a/stable_baselines/common/running_stat.py b/stable_baselines/common/running_stat.py new file mode 100644 index 0000000000..4c074590a3 --- /dev/null +++ b/stable_baselines/common/running_stat.py @@ -0,0 +1,75 @@ +import numpy as np + + +class RunningStat(object): + def __init__(self, shape): + """ + calulates the running mean and std of a data stream + http://www.johndcook.com/blog/standard_deviation/ + + :param shape: (tuple) the shape of the data stream's output + """ + self._step = 0 + self._mean = np.zeros(shape) + self._std = np.zeros(shape) + + def push(self, value): + """ + update the running mean and std + + :param value: (numpy Number) the data + """ + value = np.asarray(value) + assert value.shape == self._mean.shape + self._step += 1 + if self._step == 1: + self._mean[...] = value + else: + old_m = self._mean.copy() + self._mean[...] = old_m + (value - old_m) / self._step + self._std[...] 
= self._std + (value - old_m) * (value - self._mean) + + @property + def n(self): + """ + the number of data points + + :return: (int) + """ + return self._step + + @property + def mean(self): + """ + the average value + + :return: (float) + """ + return self._mean + + @property + def var(self): + """ + the variation of the data points + + :return: (float) + """ + return self._std / (self._step - 1) if self._step > 1 else np.square(self._mean) + + @property + def std(self): + """ + the standard deviation of the data points + + :return: (float) + """ + return np.sqrt(self.var) + + @property + def shape(self): + """ + the shape of the data points + + :return: (tuple) + """ + return self._mean.shape diff --git a/baselines/common/schedules.py b/stable_baselines/common/schedules.py similarity index 57% rename from baselines/common/schedules.py rename to stable_baselines/common/schedules.py index 9dfff50f95..9fc3d6f11b 100644 --- a/baselines/common/schedules.py +++ b/stable_baselines/common/schedules.py @@ -10,47 +10,57 @@ class Schedule(object): - def value(self, t): - """Value of the schedule at time t""" - raise NotImplementedError() + def value(self, step): + """ + Value of the schedule for a given timestep + + :param step: (int) the timestep + :return: (float) the output value for the given timestep + """ + raise NotImplementedError -class ConstantSchedule(object): +class ConstantSchedule(Schedule): def __init__(self, value): - """Value remains constant over time. + """ + Value remains constant over time. 
- Parameters - ---------- - value: float - Constant value of the schedule + :param value: (float) Constant value of the schedule """ - self._v = value + self._value = value - def value(self, t): - """See Schedule.value""" - return self._v + def value(self, step): + return self._value -def linear_interpolation(l, r, alpha): - return l + alpha * (r - l) +def linear_interpolation(left, right, alpha): + """ + Linear interpolation between `left` and `right` + :param left: (float) left boundary + :param right: (float) right boundary + :param alpha: (float) coeff in [0, 1] + :return: (float) + """ + return left + alpha * (right - left) -class PiecewiseSchedule(object): +class PiecewiseSchedule(Schedule): def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): - """Piecewise schedule. + """ + Piecewise schedule. - endpoints: [(int, int)] + :param endpoints: ([(int, int)]) list of pairs `(time, value)` meanining that schedule should output `value` when `t==time`. All the values for time must be sorted in an increasing order. When t is between two times, e.g. `(time_a, value_a)` and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs `interpolation(value_a, value_b, alpha)` where alpha is a fraction of time passed between `time_a` and `time_b` for time `t`. - interpolation: lambda float, float, float: float + :param interpolation: (lambda (float, float, float): float) a function that takes value to the left and to the right of t according to the `endpoints`. Alpha is the fraction of distance from left endpoint to right endpoint that t has covered. See linear_interpolation for example. - outside_value: float + :param outside_value: (float) if the value is requested outside of all the intervals sepecified in `endpoints` this value is returned. If None then AssertionError is raised when outside value is requested. 
@@ -61,39 +71,32 @@ def __init__(self, endpoints, interpolation=linear_interpolation, outside_value= self._outside_value = outside_value self._endpoints = endpoints - def value(self, t): - """See Schedule.value""" - for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): - if l_t <= t and t < r_t: - alpha = float(t - l_t) / (r_t - l_t) - return self._interpolation(l, r, alpha) + def value(self, step): + for (left_t, left), (right_t, right) in zip(self._endpoints[:-1], self._endpoints[1:]): + if left_t <= step < right_t: + alpha = float(step - left_t) / (right_t - left_t) + return self._interpolation(left, right, alpha) # t does not belong to any of the pieces, so doom. assert self._outside_value is not None return self._outside_value -class LinearSchedule(object): +class LinearSchedule(Schedule): def __init__(self, schedule_timesteps, final_p, initial_p=1.0): - """Linear interpolation between initial_p and final_p over + """ + Linear interpolation between initial_p and final_p over schedule_timesteps. After this many timesteps pass final_p is returned. 
- Parameters - ---------- - schedule_timesteps: int - Number of timesteps for which to linearly anneal initial_p - to final_p - initial_p: float - initial output value - final_p: float - final output value + :param schedule_timesteps: (int) Number of timesteps for which to linearly anneal initial_p to final_p + :param initial_p: (float) initial output value + :param final_p: (float) final output value """ self.schedule_timesteps = schedule_timesteps self.final_p = final_p self.initial_p = initial_p - def value(self, t): - """See Schedule.value""" - fraction = min(float(t) / self.schedule_timesteps, 1.0) + def value(self, step): + fraction = min(float(step) / self.schedule_timesteps, 1.0) return self.initial_p + fraction * (self.final_p - self.initial_p) diff --git a/baselines/common/segment_tree.py b/stable_baselines/common/segment_tree.py similarity index 69% rename from baselines/common/segment_tree.py rename to stable_baselines/common/segment_tree.py index cb386ecdb5..1a22d8eed0 100644 --- a/baselines/common/segment_tree.py +++ b/stable_baselines/common/segment_tree.py @@ -3,7 +3,8 @@ class SegmentTree(object): def __init__(self, capacity, operation, neutral_element): - """Build a Segment Tree data structure. + """ + Build a Segment Tree data structure. https://en.wikipedia.org/wiki/Segment_tree @@ -16,17 +17,10 @@ def __init__(self, capacity, operation, neutral_element): `reduce` operation which reduces `operation` over a contiguous subsequence of items in the array. - Paramters - --------- - capacity: int - Total size of the array - must be a power of two. - operation: lambda obj, obj -> obj - and operation for combining elements (eg. sum, max) - must form a mathematical group together with the set of - possible values for array elements (i.e. be associative) - neutral_element: obj - neutral element for the operation above. eg. float('-inf') - for max and 0 for sum. + :param capacity: (int) Total size of the array - must be a power of two. 
+ :param operation: (lambda (Any, Any): Any) operation for combining elements (eg. sum, max) must form a + mathematical group together with the set of possible values for array elements (i.e. be associative) + :param neutral_element: (Any) neutral element for the operation above. eg. float('-inf') for max and 0 for sum. """ assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." self._capacity = capacity @@ -49,22 +43,15 @@ def _reduce_helper(self, start, end, node, node_start, node_end): ) def reduce(self, start=0, end=None): - """Returns result of applying `self.operation` + """ + Returns result of applying `self.operation` to a contiguous subsequence of the array. self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) - Parameters - ---------- - start: int - beginning of the subsequence - end: int - end of the subsequences - - Returns - ------- - reduced: obj - result of reducing self.operation over the specified range of array elements. + :param start: (int) beginning of the subsequence + :param end: (int) end of the subsequences + :return: (Any) result of reducing self.operation over the specified range of array elements. """ if end is None: end = self._capacity @@ -99,26 +86,26 @@ def __init__(self, capacity): ) def sum(self, start=0, end=None): - """Returns arr[start] + ... + arr[end]""" + """ + Returns arr[start] + ... + arr[end] + + :param start: (int) start position of the reduction (must be >= 0) + :param end: (int) end position of the reduction (must be < len(arr), can be None for len(arr) - 1) + :return: (Any) reduction of SumSegmentTree + """ return super(SumSegmentTree, self).reduce(start, end) def find_prefixsum_idx(self, prefixsum): - """Find the highest index `i` in the array such that + """ + Find the highest index `i` in the array such that sum(arr[0] + arr[1] + ... 
+ arr[i - i]) <= prefixsum if array values are probabilities, this function allows to sample indexes according to the discrete probability efficiently. - Parameters - ---------- - perfixsum: float - upperbound on the sum of array prefix - - Returns - ------- - idx: int - highest index satisfying the prefixsum constraint + :param prefixsum: (float) upperbound on the sum of array prefix + :return: (int) highest index satisfying the prefixsum constraint """ assert 0 <= prefixsum <= self.sum() + 1e-5 idx = 1 @@ -140,6 +127,11 @@ def __init__(self, capacity): ) def min(self, start=0, end=None): - """Returns min(arr[start], ..., arr[end])""" + """ + Returns min(arr[start], ..., arr[end]) + :param start: (int) start position of the reduction (must be >= 0) + :param end: (int) end position of the reduction (must be < len(arr), can be None for len(arr) - 1) + :return: (Any) reduction of MinSegmentTree + """ return super(MinSegmentTree, self).reduce(start, end) diff --git a/stable_baselines/common/tf_util.py b/stable_baselines/common/tf_util.py new file mode 100644 index 0000000000..6c7c2c170f --- /dev/null +++ b/stable_baselines/common/tf_util.py @@ -0,0 +1,506 @@ +import copy +import os +import functools +import collections +import multiprocessing + +import numpy as np +import tensorflow as tf +from tensorflow.python.client import device_lib + +from stable_baselines import logger + + +def switch(condition, then_expression, else_expression): + """ + Switches between two operations depending on a scalar value (int or bool). + Note that both `then_expression` and `else_expression` + should be symbolic tensors of the *same shape*. + + :param condition: (TensorFlow Tensor) scalar tensor. 
+ :param then_expression: (TensorFlow Operation) + :param else_expression: (TensorFlow Operation) + :return: (TensorFlow Operation) the switch output + """ + x_shape = copy.copy(then_expression.get_shape()) + out_tensor = tf.cond(tf.cast(condition, 'bool'), + lambda: then_expression, + lambda: else_expression) + out_tensor.set_shape(x_shape) + return out_tensor + + +# ================================================================ +# Extras +# ================================================================ + +def leaky_relu(tensor, leak=0.2): + """ + Leaky ReLU + http://web.stanford.edu/~awni/papers/relu_hybrid_icml2013_final.pdf + + :param tensor: (float) the input value + :param leak: (float) the leaking coeficient when the function is saturated + :return: (float) Leaky ReLU output + """ + f_1 = 0.5 * (1 + leak) + f_2 = 0.5 * (1 - leak) + return f_1 * tensor + f_2 * abs(tensor) + + +# ================================================================ +# Mathematical utils +# ================================================================ + +def huber_loss(tensor, delta=1.0): + """ + Reference: https://en.wikipedia.org/wiki/Huber_loss + + :param tensor: (TensorFlow Tensor) the input value + :param delta: (float) huber loss delta value + :return: (TensorFlow Tensor) huber loss output + """ + return tf.where( + tf.abs(tensor) < delta, + tf.square(tensor) * 0.5, + delta * (tf.abs(tensor) - 0.5 * delta) + ) + + +# ================================================================ +# Global session +# ================================================================ + +def make_session(num_cpu=None, make_default=False, graph=None): + """ + Returns a session that will use CPU's only + + :param num_cpu: (int) number of CPUs to use for TensorFlow + :param make_default: (bool) if this should return an InteractiveSession or a normal Session + :param graph: (TensorFlow Graph) the graph of the session + :return: (TensorFlow session) + """ + if num_cpu is None: + num_cpu = 
int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count())) + tf_config = tf.ConfigProto( + allow_soft_placement=True, + inter_op_parallelism_threads=num_cpu, + intra_op_parallelism_threads=num_cpu) + # Prevent tensorflow from taking all the gpu memory + tf_config.gpu_options.allow_growth = True + if make_default: + return tf.InteractiveSession(config=tf_config, graph=graph) + else: + return tf.Session(config=tf_config, graph=graph) + + +def single_threaded_session(make_default=False, graph=None): + """ + Returns a session which will only use a single CPU + + :param make_default: (bool) if this should return an InteractiveSession or a normal Session + :param graph: (TensorFlow Graph) the graph of the session + :return: (TensorFlow session) + """ + return make_session(num_cpu=1, make_default=make_default, graph=graph) + + +def in_session(func): + """ + wrappes a function so that it is in a TensorFlow Session + + :param func: (function) the function to wrap + :return: (function) + """ + + @functools.wraps(func) + def newfunc(*args, **kwargs): + with tf.Session(): + func(*args, **kwargs) + + return newfunc + + +ALREADY_INITIALIZED = set() + + +def initialize(sess=None): + """ + Initialize all the uninitialized variables in the global scope. 
+ + :param sess: (TensorFlow Session) + """ + if sess is None: + sess = tf.get_default_session() + new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED + sess.run(tf.variables_initializer(new_variables)) + ALREADY_INITIALIZED.update(new_variables) + + +# ================================================================ +# Model components +# ================================================================ + +def normc_initializer(std=1.0, axis=0): + """ + Return a parameter initializer for TensorFlow + + :param std: (float) standard deviation + :param axis: (int) the axis to normalize on + :return: (function) + """ + + def _initializer(shape, dtype=None, partition_info=None): + out = np.random.randn(*shape).astype(np.float32) + out *= std / np.sqrt(np.square(out).sum(axis=axis, keepdims=True)) + return tf.constant(out) + + return _initializer + + +def conv2d(input_tensor, num_filters, name, filter_size=(3, 3), stride=(1, 1), + pad="SAME", dtype=tf.float32, collections=None, summary_tag=None): + """ + Creates a 2d convolutional layer for TensorFlow + + :param input_tensor: (TensorFlow Tensor) The input tensor for the convolution + :param num_filters: (int) The number of filters + :param name: (str) The TensorFlow variable scope + :param filter_size: (tuple) The filter size + :param stride: (tuple) The stride of the convolution + :param pad: (str) The padding type ('VALID' or 'SAME') + :param dtype: (type) The data type for the Tensors + :param collections: (list) List of graph collections keys to add the Variable to + :param summary_tag: (str) image summary name, can be None for no image summary + :return: (TensorFlow Tensor) 2d convolutional layer + """ + with tf.variable_scope(name): + stride_shape = [1, stride[0], stride[1], 1] + filter_shape = [filter_size[0], filter_size[1], int(input_tensor.get_shape()[3]), num_filters] + + # there are "num input feature maps * filter height * filter width" + # inputs to each hidden unit + fan_in = 
intprod(filter_shape[:3]) + # each unit in the lower layer receives a gradient from: + # "num output feature maps * filter height * filter width" / + # pooling size + fan_out = intprod(filter_shape[:2]) * num_filters + # initialize weights with random weights + w_bound = np.sqrt(6. / (fan_in + fan_out)) + + weight = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), + collections=collections) + bias = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer(), + collections=collections) + + if summary_tag is not None: + tf.summary.image(summary_tag, + tf.transpose(tf.reshape(weight, [filter_size[0], filter_size[1], -1, 1]), [2, 0, 1, 3]), + max_outputs=10) + + return tf.nn.conv2d(input_tensor, weight, stride_shape, pad) + bias + + +# ================================================================ +# Theano-like Function +# ================================================================ + +def function(inputs, outputs, updates=None, givens=None): + """ + Just like Theano function. Take a bunch of tensorflow placeholders and expressions + computed based on those placeholders and produces f(inputs) -> outputs. Function f takes + values to be fed to the input's placeholders and produces the values of the expressions + in outputs. + + Input values can be passed in the same order as inputs or can be provided as kwargs based + on placeholder name (passed to constructor or accessible via placeholder.op.name). 
+ + Example: + x = tf.placeholder(tf.int32, (), name="x") + y = tf.placeholder(tf.int32, (), name="y") + z = 3 * x + 2 * y + lin = function([x, y], z, givens={y: 0}) + + with single_threaded_session(): + initialize() + + assert lin(2) == 6 + assert lin(x=3) == 9 + assert lin(2, 2) == 10 + assert lin(x=2, y=3) == 12 + + :param inputs: (TensorFlow Tensor or Object with make_feed_dict) list of input arguments + :param outputs: (TensorFlow Tensor) list of outputs or a single output to be returned from function. Returned + value will also have the same shape. + :param updates: (list) update functions + :param givens: (dict) the values known for the output + """ + if isinstance(outputs, list): + return _Function(inputs, outputs, updates, givens=givens) + elif isinstance(outputs, (dict, collections.OrderedDict)): + func = _Function(inputs, outputs.values(), updates, givens=givens) + return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), func(*args, **kwargs))) + else: + func = _Function(inputs, [outputs], updates, givens=givens) + return lambda *args, **kwargs: func(*args, **kwargs)[0] + + +class _Function(object): + def __init__(self, inputs, outputs, updates, givens): + """ + Theano like function + + :param inputs: (TensorFlow Tensor or Object with make_feed_dict) list of input arguments + :param outputs: (TensorFlow Tensor) list of outputs or a single output to be returned from function. Returned + value will also have the same shape. 
+ :param updates: (list) update functions + :param givens: (dict) the values known for the output + """ + for inpt in inputs: + if not hasattr(inpt, 'make_feed_dict') and not (isinstance(inpt, tf.Tensor)and len(inpt.op.inputs) == 0): + assert False, "inputs should all be placeholders, constants, or have a make_feed_dict method" + self.inputs = inputs + updates = updates or [] + self.update_group = tf.group(*updates) + self.outputs_update = list(outputs) + [self.update_group] + self.givens = {} if givens is None else givens + + @classmethod + def _feed_input(cls, feed_dict, inpt, value): + if hasattr(inpt, 'make_feed_dict'): + feed_dict.update(inpt.make_feed_dict(value)) + else: + feed_dict[inpt] = value + + def __call__(self, *args, sess=None): + assert len(args) <= len(self.inputs), "Too many arguments provided" + if sess is None: + sess = tf.get_default_session() + feed_dict = {} + # Update the args + for inpt, value in zip(self.inputs, args): + self._feed_input(feed_dict, inpt, value) + # Update feed dict with givens. 
+ for inpt in self.givens: + feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) + results = sess.run(self.outputs_update, feed_dict=feed_dict)[:-1] + return results + + +# ================================================================ +# Flat vectors +# ================================================================ + +def var_shape(tensor): + """ + get TensorFlow Tensor shape + + :param tensor: (TensorFlow Tensor) the input tensor + :return: ([int]) the shape + """ + out = tensor.get_shape().as_list() + assert all(isinstance(a, int) for a in out), \ + "shape function assumes that shape is fully known" + return out + + +def numel(tensor): + """ + get TensorFlow Tensor's number of elements + + :param tensor: (TensorFlow Tensor) the input tensor + :return: (int) the number of elements + """ + return intprod(var_shape(tensor)) + + +def intprod(tensor): + """ + calculates the product of all the elements in a list + + :param tensor: ([Number]) the list of elements + :return: (int) the product truncated + """ + return int(np.prod(tensor)) + + +def flatgrad(loss, var_list, clip_norm=None): + """ + calculates the gradient and flattens it + + :param loss: (float) the loss value + :param var_list: ([TensorFlow Tensor]) the variables + :param clip_norm: (float) clip the gradients (disabled if None) + :return: ([TensorFlow Tensor]) flattend gradient + """ + grads = tf.gradients(loss, var_list) + if clip_norm is not None: + grads = [tf.clip_by_norm(grad, clip_norm=clip_norm) for grad in grads] + return tf.concat(axis=0, values=[ + tf.reshape(grad if grad is not None else tf.zeros_like(v), [numel(v)]) + for (v, grad) in zip(var_list, grads) + ]) + + +class SetFromFlat(object): + def __init__(self, var_list, dtype=tf.float32, sess=None): + """ + Set the parameters from a flat vector + + :param var_list: ([TensorFlow Tensor]) the variables + :param dtype: (type) the type for the placeholder + :param sess: (TensorFlow Session) + """ + shapes = list(map(var_shape, 
var_list)) + total_size = np.sum([intprod(shape) for shape in shapes]) + + self.theta = theta = tf.placeholder(dtype, [total_size]) + start = 0 + assigns = [] + for (shape, _var) in zip(shapes, var_list): + size = intprod(shape) + assigns.append(tf.assign(_var, tf.reshape(theta[start:start + size], shape))) + start += size + self.operation = tf.group(*assigns) + self.sess = sess + + def __call__(self, theta): + if self.sess is None: + return tf.get_default_session().run(self.operation, feed_dict={self.theta: theta}) + else: + return self.sess.run(self.operation, feed_dict={self.theta: theta}) + + +class GetFlat(object): + def __init__(self, var_list, sess=None): + """ + Get the parameters as a flat vector + + :param var_list: ([TensorFlow Tensor]) the variables + :param sess: (TensorFlow Session) + """ + self.operation = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list]) + self.sess = sess + + def __call__(self): + if self.sess is None: + return tf.get_default_session().run(self.operation) + else: + return self.sess.run(self.operation) + + +def flattenallbut0(tensor): + """ + flatten all the dimension, except from the first one + + :param tensor: (TensorFlow Tensor) the input tensor + :return: (TensorFlow Tensor) the flattened tensor + """ + return tf.reshape(tensor, [-1, intprod(tensor.get_shape().as_list()[1:])]) + + +# ================================================================ +# Diagnostics +# ================================================================ + +def display_var_info(_vars): + """ + log variable information, for debug purposes + + :param _vars: ([TensorFlow Tensor]) the variables + """ + count_params = 0 + for _var in _vars: + name = _var.name + if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: + continue + v_params = np.prod(_var.shape.as_list()) + count_params += v_params + if "/b:" in name or "/biases" in name: + continue # Wx+b, bias is not interesting to look at => count params, but not print + 
logger.info(" %s%s %i params %s" % (name, " " * (55 - len(name)), v_params, str(_var.shape))) + + logger.info("Total model parameters: %0.2f million" % (count_params * 1e-6)) + + +def get_available_gpus(): + """ + Return a list of all the available GPUs + + :return: ([str]) the GPUs available + """ + # recipe from here: + # https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa + local_device_protos = device_lib.list_local_devices() + return [x.name for x in local_device_protos if x.device_type == 'GPU'] + + +# ================================================================ +# Saving variables +# ================================================================ + +def load_state(fname, sess=None, var_list=None): + """ + Load a TensorFlow saved model + + :param fname: (str) the graph name + :param sess: (TensorFlow Session) the session, if None: get_default_session() + :param var_list: ([TensorFlow Tensor] or {str: TensorFlow Tensor}) A list of Variable/SaveableObject, + or a dictionary mapping names to SaveableObject`s. If `None, defaults to the list of all saveable objects. + """ + if sess is None: + sess = tf.get_default_session() + + # avoir crashing when loading the direct name without explicitly adding the root folder + if os.path.dirname(fname) == '': + fname = os.path.join('./', fname) + + saver = tf.train.Saver(var_list=var_list) + saver.restore(sess, fname) + + +def save_state(fname, sess=None, var_list=None): + """ + Save a TensorFlow model + + :param fname: (str) the graph name + :param sess: (TensorFlow Session) the session, if None: get_default_session() + :param var_list: ([TensorFlow Tensor] or {str: TensorFlow Tensor}) A list of Variable/SaveableObject, + or a dictionary mapping names to SaveableObject`s. If `None, defaults to the list of all saveable objects. 
def tile_images(img_nhwc):
    """
    Tile N images into one big PxQ image
    (P,Q) are chosen to be as close as possible, and if N
    is square, then P=Q.

    Cells of the grid that have no image are filled with zeros of the input dtype.

    :param img_nhwc: (list) list or array of images, ndim=4 once turned into array. img nhwc
        n = batch index, h = height, w = width, c = channel
    :return: (numpy array) img_HWc, ndim=3, a new array with the same dtype as the input
    """
    img_nhwc = np.asarray(img_nhwc)
    n_images, height, width, n_channels = img_nhwc.shape
    # number of rows of the grid (was named H before)
    new_height = int(np.ceil(np.sqrt(n_images)))
    # number of columns of the grid (was named W before)
    new_width = int(np.ceil(float(n_images) / new_height))
    # pad with real zeros: multiplying an existing image by 0 would keep its NaN/inf values
    n_padding = new_height * new_width - n_images
    padding = np.zeros((n_padding, height, width, n_channels), dtype=img_nhwc.dtype)
    # concatenate always allocates, so the caller's array is never aliased by the result
    img_nhwc = np.concatenate([img_nhwc, padding], axis=0)
    # img_HWhwc
    out_image = img_nhwc.reshape(new_height, new_width, height, width, n_channels)
    # img_HhWwc
    out_image = out_image.transpose(0, 2, 1, 3, 4)
    # img_Hh_Ww_c
    out_image = out_image.reshape(new_height * height, new_width * width, n_channels)
    return out_image
class NotSteppingError(Exception):
    """
    Error signalling that step_wait() was called although
    no asynchronous step is currently running.
    """

    def __init__(self):
        # the message is fixed: callers raise this exception without arguments
        super().__init__('not running an async step')
class CloudpickleWrapper:
    """
    Container whose pickled state is produced by cloudpickle instead of pickle
    """

    def __init__(self, var):
        """
        Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)

        :param var: (Any) the variable you wish to wrap for pickling with cloudpickle
        """
        self.var = var

    def __getstate__(self):
        # the state handed to pickle is the cloudpickle byte string of the wrapped variable
        serialized = cloudpickle.dumps(self.var)
        return serialized

    def __setstate__(self, obs):
        # obs is the byte string produced by __getstate__
        restored = pickle.loads(obs)
        self.var = restored
def _worker(remote, parent_remote, env_fn_wrapper):
    """
    Command loop of a subprocess that owns a single environment

    Commands arrive as (name, data) tuples on ``remote``; the loop stops on the 'close'
    command or when the other end of the pipe is closed (EOFError).

    :param remote: (Connection) end of the pipe used to receive the commands and send the results
    :param parent_remote: (Connection) the other end of the pipe, closed right away in this process
    :param env_fn_wrapper: (CloudpickleWrapper) wrapper whose ``var`` attribute is called to create the environment
    """
    parent_remote.close()
    env = env_fn_wrapper.var()
    while True:
        try:
            command, payload = remote.recv()
            if command == 'close':
                remote.close()
                break
            if command == 'step':
                step_result = env.step(payload)
                observation, reward, done, info = step_result
                if done:
                    # a finished episode is restarted at once, the first observation of the new one is sent
                    observation = env.reset()
                remote.send((observation, reward, done, info))
            elif command == 'reset':
                remote.send(env.reset())
            elif command == 'render':
                frame = env.render(mode='rgb_array')
                remote.send(frame)
            elif command == 'get_spaces':
                spaces_pair = (env.observation_space, env.action_space)
                remote.send(spaces_pair)
            else:
                raise NotImplementedError
        except EOFError:
            break
def step_wait(self):
    """
    Wait for the wrapped environments and push the new frames onto the stacks

    The stack of every environment whose episode is done is cleared before its new frame is written.

    :return: (numpy array, [float], [bool], dict) stacked observation, reward, done, information
    """
    new_frames, rewards, dones, infos = self.venv.step_wait()
    frame_depth = new_frames.shape[-1]
    # shift the old frames towards the front of the last axis
    stacked = np.roll(self.stackedobs, shift=-frame_depth, axis=-1)
    self.stackedobs = stacked
    finished_envs = [env_idx for env_idx, done in enumerate(dones) if done]
    for env_idx in finished_envs:
        stacked[env_idx] = 0
    # the newest frame always occupies the end of the last axis
    stacked[..., -frame_depth:] = new_frames
    return self.stackedobs, rewards, dones, infos
class VecNormalize(VecEnvWrapper):
    """
    A moving average, normalizing wrapper for vectorized environment.
    It has support for saving/loading the moving averages.

    :param venv: (VecEnv) the vectorized environment to wrap
    :param training: (bool) Whether to update or not the moving average
    :param norm_obs: (bool) Whether to normalize observation or not (default: True)
    :param norm_reward: (bool) Whether to normalize rewards or not (default: True)
    :param clip_obs: (float) Max absolute value for observation
    :param clip_reward: (float) Max absolute value for discounted reward
    :param gamma: (float) discount factor
    :param epsilon: (float) To avoid division by zero
    """

    def __init__(self, venv, training=True, norm_obs=True, norm_reward=True,
                 clip_obs=10., clip_reward=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward
        # Returns: discounted rewards, one running value per environment
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.training = training
        self.norm_obs = norm_obs
        self.norm_reward = norm_reward
        self.old_obs = np.array([])

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.

        :return: (numpy array, numpy array, [bool], dict) normalized observation, normalized reward,
            done, information
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        self.old_obs = obs
        obs = self._normalize_observation(obs)
        if self.norm_reward:
            if self.training:
                self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.clip_reward, self.clip_reward)
        # the return of a finished episode must not be carried over into the next episode
        self.ret[np.asarray(news, dtype=bool)] = 0
        return obs, rews, news, infos

    def _normalize_observation(self, obs):
        """
        Normalize (and clip) the observation with the running statistics, if norm_obs is enabled

        :param obs: (numpy tensor) the raw observation
        :return: (numpy tensor) the normalized observation, or obs unchanged when norm_obs is False
        """
        if not self.norm_obs:
            return obs
        if self.training:
            self.obs_rms.update(obs)
        return np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon), -self.clip_obs,
                       self.clip_obs)

    def get_original_obs(self):
        """
        returns the unnormalized observation

        :return: (numpy float)
        """
        return self.old_obs

    def reset(self):
        """
        Reset all environments

        :return: (numpy tensor) the normalized first observation
        """
        obs = self.venv.reset()
        # every environment starts a new episode: restart the discounted returns as well
        self.ret = np.zeros(self.num_envs)
        if len(np.array(obs).shape) == 1:  # for when num_cpu is 1
            self.old_obs = [obs]
        else:
            self.old_obs = obs
        return self._normalize_observation(obs)

    def save_running_average(self, path):
        """
        Pickle the observation and return statistics to <path>/obs_rms.pkl and <path>/ret_rms.pkl

        :param path: (str) path to log dir
        """
        for rms, name in zip([self.obs_rms, self.ret_rms], ['obs_rms', 'ret_rms']):
            with open("{}/{}.pkl".format(path, name), 'wb') as file_handler:
                pickle.dump(rms, file_handler)

    def load_running_average(self, path):
        """
        Load the statistics written by save_running_average()

        :param path: (str) path to log dir
        """
        # NOTE: pickle.load can execute arbitrary code, only load files from a trusted log dir
        for name in ['obs_rms', 'ret_rms']:
            with open("{}/{}.pkl".format(path, name), 'rb') as file_handler:
                setattr(self, name, pickle.load(file_handler))
def normalize(tensor, stats):
    """
    center and scale a tensor with a running mean and std

    :param tensor: (TensorFlow Tensor) the input tensor
    :param stats: (RunningMeanStd) the running mean and std of the input to normalize
        (if None, the tensor is returned untouched)
    :return: (TensorFlow Tensor) the normalized tensor
    """
    if stats is not None:
        centered = tensor - stats.mean
        return centered / stats.std
    return tensor
def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev):
    """
    get the actor update, with noise.

    Every global variable of the actor scope is copied to the variable at the same position in the
    perturbed scope; gaussian noise is added to the trainable variables that are not part of a LayerNorm.

    :param actor: (str) the actor
    :param perturbed_actor: (str) the perturbed actor
    :param param_noise_stddev: (float) the std of the parameter noise
    :return: (TensorFlow Operation) the update function
    """
    actor_vars = tf_util.get_globals_vars(actor)
    perturbed_vars = tf_util.get_globals_vars(perturbed_actor)
    assert len(actor_vars) == len(perturbed_vars)

    # computed once: the collections do not change while the update operations are created
    noisy_vars = [candidate for candidate in tf_util.get_trainable_vars(actor)
                  if 'LayerNorm' not in candidate.name]
    noisy_perturbed_vars = [candidate for candidate in tf_util.get_trainable_vars(perturbed_actor)
                            if 'LayerNorm' not in candidate.name]
    assert len(noisy_vars) == len(noisy_perturbed_vars)

    updates = []
    for var, perturbed_var in zip(actor_vars, perturbed_vars):
        if var in noisy_vars:
            logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var,
                                     var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev)))
        else:
            logger.info(' {} <- {}'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var, var))
    assert len(updates) == len(actor_vars)
    return tf.group(*updates)
def __init__(self, policy, env, gamma=0.99, memory_policy=None, eval_env=None,
             nb_train_steps=50, nb_rollout_steps=100, nb_eval_steps=100, param_noise=None, action_noise=None,
             action_range=(-1., 1.), normalize_observations=False, tau=0.001, batch_size=128,
             param_noise_adaption_interval=50, normalize_returns=False, enable_popart=False,
             observation_range=(-5., 5.), critic_l2_reg=0., return_range=(-np.inf, np.inf), actor_lr=1e-4,
             critic_lr=1e-3, clip_norm=None, reward_scale=1., render=False, render_eval=False, layer_norm=True,
             memory_limit=100, verbose=0, _init_setup_model=True):
    """
    Store the hyper-parameters and declare, as None, the attributes that are built later on

    The meaning of each parameter is given in the class docstring.
    """
    super(DDPG, self).__init__(policy=policy, env=env, requires_vec_env=False, verbose=verbose)

    # Learning hyper-parameters.
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.critic_l2_reg = critic_l2_reg
    self.clip_norm = clip_norm
    self.reward_scale = reward_scale
    self.layer_norm = layer_norm

    # Replay buffer: the default Memory class is used when no class is given.
    self.memory_policy = memory_policy or Memory
    self.memory_limit = memory_limit

    # Normalization and value ranges.
    self.normalize_observations = normalize_observations
    self.normalize_returns = normalize_returns
    self.enable_popart = enable_popart
    self.observation_range = observation_range
    self.action_range = action_range
    self.return_range = return_range

    # Exploration noise.
    self.action_noise = action_noise
    self.param_noise = param_noise
    self.param_noise_adaption_interval = param_noise_adaption_interval

    # Rollout, training and evaluation schedule.
    self.nb_train_steps = nb_train_steps
    self.nb_rollout_steps = nb_rollout_steps
    self.nb_eval_steps = nb_eval_steps
    self.eval_env = eval_env
    self.render = render
    self.render_eval = render_eval

    # Graph, session and replay buffer instance.
    self.graph = None
    self.sess = None
    self.memory = None
    self.params = None

    # Placeholders.
    self.obs_train = None
    self.obs_target = None
    self.obs_noise = None
    self.obs_adapt_noise = None
    self.terminals1 = None
    self.rewards = None
    self.actions = None
    self.critic_target = None
    self.param_noise_stddev = None

    # Networks and their outputs.
    self.policy_tf = None
    self.target_policy = None
    self.actor_tf = None
    self.normalized_critic_tf = None
    self.critic_tf = None
    self.normalized_critic_with_actor_tf = None
    self.critic_with_actor_tf = None
    self.target_q = None
    self.perturbed_actor_tf = None

    # Running statistics.
    self.obs_rms = None
    self.ret_rms = None
    self.old_std = None
    self.old_mean = None
    self.stats_sample = None
    self.stats_ops = None
    self.stats_names = None

    # Losses, gradients and optimizers.
    self.actor_loss = None
    self.actor_grads = None
    self.actor_optimizer = None
    self.critic_loss = None
    self.critic_grads = None
    self.critic_optimizer = None

    # Update operations.
    self.target_init_updates = None
    self.target_soft_updates = None
    self.perturb_policy_ops = None
    self.perturb_adaptive_policy_ops = None
    self.adaptive_policy_distance = None
    self.renormalize_q_outputs_op = None

    if _init_setup_model:
        self.setup_model()
def _setup_target_network_updates(self):
    """
    create the operations that copy the 'train' variables to the 'target' variables:
    the initial (hard) copy and the soft update using tau
    """
    train_vars = tf_util.get_trainable_vars('train')
    target_vars = tf_util.get_trainable_vars('target')
    update_ops = get_target_updates(train_vars, target_vars, self.tau)
    self.target_init_updates, self.target_soft_updates = update_ops
def _setup_actor_optimizer(self):
    """
    setup the optimizer for the actor

    Sets actor_loss (minus the mean of the critic output), actor_grads (the flattened, optionally
    clipped gradients of that loss) and actor_optimizer, all over the trainable variables of the
    'train' scope.
    """
    logger.info('setting up actor optimizer')
    self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
    # fetched once and reused for the logging, the gradients and the optimizer
    actor_vars = tf_util.get_trainable_vars('train')
    actor_shapes = [var.get_shape().as_list() for var in actor_vars]
    # the initial value 1 lets a scalar variable (empty shape) count as one parameter
    # instead of making reduce() raise a TypeError
    actor_nb_params = sum(reduce(lambda x, y: x * y, shape, 1) for shape in actor_shapes)
    logger.info(' actor shapes: {}'.format(actor_shapes))
    logger.info(' actor params: {}'.format(actor_nb_params))
    self.actor_grads = tf_util.flatgrad(self.actor_loss, actor_vars, clip_norm=self.clip_norm)
    self.actor_optimizer = MpiAdam(var_list=actor_vars, beta1=0.9, beta2=0.999, epsilon=1e-08)
def _setup_popart(self):
    """
    setup pop-art normalization of the critic output

    See https://arxiv.org/pdf/1602.07714.pdf for details.
    Preserving Outputs Precisely, while Adaptively Rescaling Targets”.

    Creates the ``old_std`` / ``old_mean`` placeholders and, for the 'train' and the 'target' scope,
    the assign operations that rescale the weight and bias of the output layer. The operations are
    collected in ``self.renormalize_q_outputs_op``.
    """
    self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
    new_std = self.ret_rms.std
    self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
    new_mean = self.ret_rms.mean

    self.renormalize_q_outputs_op = []
    output_vars_per_scope = [
        [var for var in tf_util.get_trainable_vars(scope) if 'output' in var.name]
        for scope in ('train', 'target')
    ]
    for output_vars in output_vars_per_scope:
        assert len(output_vars) == 2
        # weight and bias of the last layer
        weight, bias = output_vars
        assert 'kernel' in weight.name
        assert 'bias' in bias.name
        assert weight.get_shape()[-1] == 1
        assert bias.get_shape()[-1] == 1
        rescaled_weight = weight * self.old_std / new_std
        rescaled_bias = (bias * self.old_std + self.old_mean - new_mean) / new_std
        self.renormalize_q_outputs_op += [weight.assign(rescaled_weight)]
        self.renormalize_q_outputs_op += [bias.assign(rescaled_bias)]
def _policy(self, obs, apply_noise=True, compute_q=True):
    """
    Compute the action, and optionally the critic output, for a given observation

    :param obs: ([float] or [int]) the observation
    :param apply_noise: (bool) enable the noise (parameter noise and action noise, when configured)
    :param compute_q: (bool) compute the critic output
    :return: ([float], float) the flattened and clipped action, and the critic value
        (None when compute_q is False)
    """
    feed_dict = {self.obs_train: [obs]}
    if self.param_noise is not None and apply_noise:
        # the perturbed actor has its own observation placeholder
        actor_tf = self.perturbed_actor_tf
        feed_dict[self.obs_noise] = [obs]
    else:
        actor_tf = self.actor_tf

    q_value = None
    if compute_q:
        action, q_value = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
    else:
        action = self.sess.run(actor_tf, feed_dict=feed_dict)

    action = action.flatten()
    if self.action_noise is not None and apply_noise:
        noise = self.action_noise()
        assert noise.shape == action.shape
        action += noise

    lower, upper = self.action_range[0], self.action_range[1]
    return np.clip(action, lower, upper), q_value
def _initialize(self, sess):
    """
    initialize the model parameters and optimizers

    Stores the session, runs the global variable initializer, syncs the actor and critic optimizers
    and finally runs ``self.target_init_updates``.

    :param sess: (TensorFlow Session) the current TensorFlow session
    """
    self.sess = sess
    sess.run(tf.global_variables_initializer())
    for optimizer in (self.actor_optimizer, self.critic_optimizer):
        optimizer.sync()
    sess.run(self.target_init_updates)
def _adapt_param_noise(self):
    """
    calculate the adaptation for the parameter noise

    :return: (float) the mean distance for the parameter noise (0. when no parameter noise is used)
    """
    if self.param_noise is None:
        return 0.

    # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
    batch = self.memory.sample(batch_size=self.batch_size)
    perturb_feed = {self.param_noise_stddev: self.param_noise.current_stddev}
    self.sess.run(self.perturb_adaptive_policy_ops, feed_dict=perturb_feed)

    distance_feed = {
        self.obs_adapt_noise: batch['obs0'],
        self.obs_train: batch['obs0'],
        self.param_noise_stddev: self.param_noise.current_stddev,
    }
    distance = self.sess.run(self.adaptive_policy_distance, feed_dict=distance_feed)

    # average the distance over all the MPI workers
    summed_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM)
    mean_distance = summed_distance / MPI.COMM_WORLD.Get_size()
    self.param_noise.adapt(mean_distance)
    return mean_distance
+ """ + if self.action_noise is not None: + self.action_noise.reset() + if self.param_noise is not None: + self.sess.run(self.perturb_policy_ops, feed_dict={ + self.param_noise_stddev: self.param_noise.current_stddev, + }) + + def learn(self, total_timesteps, callback=None, seed=None, log_interval=100): + with SetVerbosity(self.verbose): + self._setup_learn(seed) + + rank = MPI.COMM_WORLD.Get_rank() + # we assume symmetric actions. + assert np.all(np.abs(self.env.action_space.low) == self.env.action_space.high) + max_action = self.env.action_space.high + logger.log('scaling actions by {} before executing in env'.format(max_action)) + logger.log('Using agent with the following configuration:') + logger.log(str(self.__dict__.items())) + + eval_episode_rewards_history = deque(maxlen=100) + episode_rewards_history = deque(maxlen=100) + with self.sess.as_default(), self.graph.as_default(): + # Prepare everything. + self._reset() + obs = self.env.reset() + eval_obs = None + if self.eval_env is not None: + eval_obs = self.eval_env.reset() + episode_reward = 0. + episode_step = 0 + episodes = 0 + step = 0 + total_steps = 0 + + start_time = time.time() + + epoch_episode_rewards = [] + epoch_episode_steps = [] + epoch_actor_losses = [] + epoch_critic_losses = [] + epoch_adaptive_distances = [] + eval_episode_rewards = [] + eval_qs = [] + epoch_actions = [] + epoch_qs = [] + epoch_episodes = 0 + epoch = 0 + while True: + for _ in range(log_interval): + # Perform rollouts. + for _ in range(self.nb_rollout_steps): + if total_steps >= total_timesteps: + return self + + # Predict next action. + action, q_value = self._policy(obs, apply_noise=True, compute_q=True) + assert action.shape == self.env.action_space.shape + + # Execute next action. 
+ if rank == 0 and self.render: + self.env.render() + assert max_action.shape == action.shape + # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) + new_obs, reward, done, _ = self.env.step(max_action * action) + step += 1 + total_steps += 1 + if rank == 0 and self.render: + self.env.render() + episode_reward += reward + episode_step += 1 + + # Book-keeping. + epoch_actions.append(action) + epoch_qs.append(q_value) + self._store_transition(obs, action, reward, new_obs, done) + obs = new_obs + if callback is not None: + callback(locals(), globals()) + + if done: + # Episode done. + epoch_episode_rewards.append(episode_reward) + episode_rewards_history.append(episode_reward) + epoch_episode_steps.append(episode_step) + episode_reward = 0. + episode_step = 0 + epoch_episodes += 1 + episodes += 1 + + self._reset() + if not isinstance(self.env, VecEnv): + obs = self.env.reset() + + # Train. + epoch_actor_losses = [] + epoch_critic_losses = [] + epoch_adaptive_distances = [] + for t_train in range(self.nb_train_steps): + # Adapt param noise, if necessary. + if self.memory.nb_entries >= self.batch_size and \ + t_train % self.param_noise_adaption_interval == 0: + distance = self._adapt_param_noise() + epoch_adaptive_distances.append(distance) + + critic_loss, actor_loss = self._train_step() + epoch_critic_losses.append(critic_loss) + epoch_actor_losses.append(actor_loss) + self._update_target_net() + + # Evaluate. + eval_episode_rewards = [] + eval_qs = [] + if self.eval_env is not None: + eval_episode_reward = 0. 
+ for _ in range(self.nb_eval_steps): + if total_steps >= total_timesteps: + return self + + eval_action, eval_q = self._policy(eval_obs, apply_noise=False, compute_q=True) + # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) + eval_obs, eval_r, eval_done, _ = self.eval_env.step(max_action * eval_action) + if self.render_eval: + self.eval_env.render() + eval_episode_reward += eval_r + + eval_qs.append(eval_q) + if eval_done: + if not isinstance(self.env, VecEnv): + eval_obs = self.eval_env.reset() + eval_episode_rewards.append(eval_episode_reward) + eval_episode_rewards_history.append(eval_episode_reward) + eval_episode_reward = 0. + + mpi_size = MPI.COMM_WORLD.Get_size() + # Log stats. + # XXX shouldn't call np.mean on variable length lists + duration = time.time() - start_time + stats = self._get_stats() + combined_stats = stats.copy() + combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) + combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) + combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) + combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) + combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) + combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) + combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) + if len(epoch_adaptive_distances) != 0: + combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) + combined_stats['total/duration'] = duration + combined_stats['total/steps_per_second'] = float(step) / float(duration) + combined_stats['total/episodes'] = episodes + combined_stats['rollout/episodes'] = epoch_episodes + combined_stats['rollout/actions_std'] = np.std(epoch_actions) + # Evaluation statistics. 
def predict(self, observation, state=None, mask=None):
    """
    Get the noise-free action for a single observation

    :param observation: ([float] or [int]) the observation, reshaped to ``self.observation_space.shape``
    :param state: (Any) not used by this method
    :param mask: (Any) not used by this method
    :return: (action, None), or ([action], [None]) when ``self._vectorize_action`` is set
    """
    observation = np.array(observation).reshape(self.observation_space.shape)

    # The critic value is discarded here, so do not ask _policy to evaluate it.
    action, _ = self._policy(observation, apply_noise=False, compute_q=False)
    if self._vectorize_action:
        return [action], [None]
    return action, None
@classmethod
def load(cls, load_path, env=None, **kwargs):
    """
    Build a model from a saved file

    The stored attributes, then the keyword arguments, are written into the instance ``__dict__``
    before the environment is set and the model is set up; the stored parameters are then assigned
    to ``model.params`` in order.

    :param load_path: (str) passed to ``cls._load_from_file``
    :param env: (Gym Environment) the environment handed to the constructor and to ``set_env``
    :param kwargs: (dict) extra attributes, applied after (and so overriding) the stored ones
    :return: the loaded model
    """
    data, params = cls._load_from_file(load_path)

    model = cls(None, env, _init_setup_model=False)
    attributes = model.__dict__
    attributes.update(data)
    attributes.update(kwargs)
    model.set_env(env)
    model.setup_model()

    restores = [param.assign(loaded_p) for param, loaded_p in zip(model.params, params)]
    model.sess.run(restores)

    return model
stable_baselines/ddpg/main.py index e877507b8e..46fd7a39e0 100644 --- a/baselines/ddpg/main.py +++ b/stable_baselines/ddpg/main.py @@ -1,22 +1,33 @@ import argparse import time import os -import logging -from baselines import logger, bench -from baselines.common.misc_util import ( - set_global_seeds, - boolean_flag, -) -import baselines.ddpg.training as training -from baselines.ddpg.models import Actor, Critic -from baselines.ddpg.memory import Memory -from baselines.ddpg.noise import * import gym import tensorflow as tf +import numpy as np from mpi4py import MPI +from stable_baselines import logger, bench +from stable_baselines.common.misc_util import set_global_seeds, boolean_flag +from stable_baselines.common.policies import MlpPolicy +from stable_baselines.ddpg import DDPG +from stable_baselines.ddpg.memory import Memory +from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec, OrnsteinUhlenbeckActionNoise, NormalActionNoise + + def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): + """ + run the training of DDPG + + :param env_id: (str) the environment ID + :param seed: (int) the initial random seed + :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'), can use multiple noise type by + seperating them with commas + :param layer_norm: (bool) use layer normalization + :param evaluation: (bool) enable evaluation of DDPG training + :param kwargs: (dict) extra keywords for the training.train function + """ + # Configure things. 
rank = MPI.COMM_WORLD.Get_rank() if rank != 0: @@ -26,7 +37,7 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): env = gym.make(env_id) env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) - if evaluation and rank==0: + if evaluation and rank == 0: eval_env = gym.make(env_id) eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval')) env = bench.Monitor(env, None) @@ -46,18 +57,14 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') - action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) + action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') - action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) + action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions), + sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) - # Configure components. - memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) - critic = Critic(layer_norm=layer_norm) - actor = Actor(nb_actions, layer_norm=layer_norm) - # Seed everything to make things reproducible. seed = seed + 1000000 * rank logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) @@ -68,10 +75,12 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): eval_env.seed(seed) # Disable logging for rank != 0 to avoid noise. 
+ start_time = 0 if rank == 0: start_time = time.time() - training.train(env=env, eval_env=eval_env, param_noise=param_noise, - action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs) + model = DDPG(policy=MlpPolicy, env=env, memory_policy=Memory, eval_env=eval_env, param_noise=param_noise, + action_noise=action_noise, memory_limit=int(1e6), layer_norm=layer_norm, verbose=2, **kwargs) + model.learn(total_timesteps=10000) env.close() if eval_env is not None: eval_env.close() @@ -80,6 +89,11 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): def parse_args(): + """ + parse the arguments for DDPG training + + :return: (dict) the arguments + """ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env-id', type=str, default='HalfCheetah-v1') @@ -93,25 +107,18 @@ def parse_args(): parser.add_argument('--batch-size', type=int, default=64) # per MPI worker parser.add_argument('--actor-lr', type=float, default=1e-4) parser.add_argument('--critic-lr', type=float, default=1e-3) - boolean_flag(parser, 'popart', default=False) + boolean_flag(parser, 'enable-popart', default=False) parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--reward-scale', type=float, default=1.) 
parser.add_argument('--clip-norm', type=float, default=None) - parser.add_argument('--nb-epochs', type=int, default=500) # with default settings, perform 1M steps total - parser.add_argument('--nb-epoch-cycles', type=int, default=20) parser.add_argument('--nb-train-steps', type=int, default=50) # per epoch cycle and MPI worker parser.add_argument('--nb-eval-steps', type=int, default=100) # per epoch cycle and MPI worker parser.add_argument('--nb-rollout-steps', type=int, default=100) # per epoch cycle and MPI worker - parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') # choices are adaptive-param_xx, ou_xx, normal_xx, none - parser.add_argument('--num-timesteps', type=int, default=None) + # choices are adaptive-param_xx, ou_xx, normal_xx, none + parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') boolean_flag(parser, 'evaluation', default=False) args = parser.parse_args() - # we don't directly specify timesteps for this script, so make sure that if we do specify them - # they agree with the other parameters - if args.num_timesteps is not None: - assert(args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps) dict_args = vars(args) - del dict_args['num_timesteps'] return dict_args diff --git a/baselines/ddpg/memory.py b/stable_baselines/ddpg/memory.py similarity index 59% rename from baselines/ddpg/memory.py rename to stable_baselines/ddpg/memory.py index 90f0f9a18a..474c42a82b 100644 --- a/baselines/ddpg/memory.py +++ b/stable_baselines/ddpg/memory.py @@ -3,6 +3,13 @@ class RingBuffer(object): def __init__(self, maxlen, shape, dtype='float32'): + """ + A buffer object, when full restarts at the initial position + + :param maxlen: (int) the max number of numpy objects to store + :param shape: (tuple) the shape of the numpy objects you want to store + :param dtype: (str) the name of the type of the numpy object you want to store + """ self.maxlen = maxlen self.start = 0 self.length = 0 @@ 
def array_min2d(arr):
    """
    Convert the input to a numpy array with at least 2 dimensions

    Arrays that already have 2 or more dimensions are returned as converted; anything else is
    reshaped into a single column of shape (-1, 1).

    :param arr: ([Any]) the array to clean
    :return: (numpy Any) the cleaned array
    """
    as_array = np.array(arr)
    return as_array if as_array.ndim >= 2 else as_array.reshape(-1, 1)
class AdaptiveParamNoiseSpec(object):
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01):
        """
        Implements adaptive parameter noise

        :param initial_stddev: (float) the initial value for the standard deviation of the noise
        :param desired_action_stddev: (float) the desired value for the standard deviation of the noise
        :param adoption_coefficient: (float) the update coefficient for the standard deviation of the noise
        """
        self.initial_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adoption_coefficient = adoption_coefficient

        self.current_stddev = initial_stddev

    def adapt(self, distance):
        """
        update the standard deviation for the parameter noise

        The current standard deviation is divided by the adoption coefficient when the distance
        exceeds the desired action standard deviation, and multiplied by it otherwise.

        :param distance: (float) the noise distance applied to the parameters
        """
        if distance > self.desired_action_stddev:
            # Decrease stddev.
            self.current_stddev /= self.adoption_coefficient
        else:
            # Increase stddev.
            self.current_stddev *= self.adoption_coefficient

    def get_stats(self):
        """
        return the standard deviation for the parameter noise

        :return: (dict) the stats of the noise
        """
        return {'param_noise_stddev': self.current_stddev}

    def __repr__(self):
        fmt = 'AdaptiveParamNoiseSpec(initial_stddev={}, desired_action_stddev={}, adoption_coefficient={})'
        return fmt.format(self.initial_stddev, self.desired_action_stddev, self.adoption_coefficient)


class ActionNoise(object):
    """
    The action noise base class
    """
    def reset(self):
        """
        call end of episode reset for the noise
        """
        pass


class NormalActionNoise(ActionNoise):
    def __init__(self, mean, sigma):
        """
        A gaussian action noise

        :param mean: (float or numpy array) the mean value of the noise
        :param sigma: (float or numpy array) the scale of the noise (std here)
        """
        self._mu = mean
        self._sigma = sigma

    def __call__(self):
        return np.random.normal(self._mu, self._sigma)

    def __repr__(self):
        return 'NormalActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma)


class OrnsteinUhlenbeckActionNoise(ActionNoise):
    def __init__(self, mean, sigma, theta=.15, dt=1e-2, initial_noise=None):
        """
        An Ornstein Uhlenbeck action noise, this is designed to approximate brownian motion with friction.

        Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab

        :param mean: (float or numpy array) the mean of the noise
        :param sigma: (float or numpy array) the scale of the noise
        :param theta: (float) the rate of mean reversion
        :param dt: (float) the timestep for the noise
        :param initial_noise: ([float]) the initial value for the noise output, (if None: 0)
        """
        self._theta = theta
        self._mu = mean
        self._sigma = sigma
        self._dt = dt
        self.initial_noise = initial_noise
        self.noise_prev = None
        self.reset()

    def __call__(self):
        # np.shape (instead of the .shape attribute) also accepts a plain float or a list as mean
        gaussian = np.random.normal(size=np.shape(self._mu))
        noise = self.noise_prev + self._theta * (self._mu - self.noise_prev) * self._dt + \
            self._sigma * np.sqrt(self._dt) * gaussian
        self.noise_prev = noise
        return noise

    def reset(self):
        """
        reset the Ornstein Uhlenbeck noise, to the initial position
        """
        self.noise_prev = self.initial_noise if self.initial_noise is not None else np.zeros_like(self._mu)

    def __repr__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma)
0000000000..05563456ca --- /dev/null +++ b/stable_baselines/deepq/build_graph.py @@ -0,0 +1,456 @@ +"""Deep Q learning graph + +The functions in this file are used to create the following functions: + +======= act ======== + + Function to choose an action given an observation + + :param observation: (Any) Observation that can be fed into the output of make_obs_ph + :param stochastic: (bool) if set to False all the actions are always deterministic (default False) + :param update_eps_ph: (float) update epsilon to a new value, if negative no update happens (default: no update) + :return: (TensorFlow Tensor) tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for + every element of the batch. + + +======= act (in case of parameter noise) ======== + + Function to choose an action given an observation + + :param observation: (Any) Observation that can be fed into the output of make_obs_ph + :param stochastic: (bool) if set to False all the actions are always deterministic (default False) + :param update_eps_ph: (float) update epsilon to a new value, if negative no update happens + (default: no update) + :param reset_ph: (bool) reset the perturbed policy by sampling a new perturbation + :param update_param_noise_threshold_ph: (float) the desired threshold for the difference between + non-perturbed and perturbed policy + :param update_param_noise_scale_ph: (bool) whether or not to update the scale of the noise for the next time it is + re-perturbed + :return: (TensorFlow Tensor) tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for + every element of the batch. + + +======= train ======= + + Function that takes a transition (s,a,r,s') and optimizes Bellman equation's error: + + td_error = Q(s,a) - (r + gamma * max_a' Q(s', a')) + loss = huber_loss[td_error] + + :param obs_t: (Any) a batch of observations + :param action: (numpy int) actions that were selected upon seeing obs_t.
dtype must be int32 and shape must be + (batch_size,) + :param reward: (numpy float) immediate reward attained after executing those actions. dtype must be float32 and + shape must be (batch_size,) + :param obs_tp1: (Any) observations that followed obs_t + :param done: (numpy bool) 1 if obs_t was the last observation in the episode and 0 otherwise; when 1, obs_tp1 gets ignored, + but must be of the valid shape. dtype must be float32 and shape must be (batch_size,) + :param weight: (numpy float) importance weights for every element of the batch (gradient is multiplied by the + importance weight). dtype must be float32 and shape must be (batch_size,) + :return: (numpy float) td_error: a list of differences between Q(s,a) and the target in Bellman's equation. + dtype is float32 and shape is (batch_size,) + +======= update_target ======== + + copy the parameters from optimized Q function to the target Q function. + In Q learning we actually optimize the following error: + + Q(s,a) - (r + gamma * max_a' Q'(s', a')) + + Where Q' is lagging behind Q to stabilize the learning. For example for Atari + + Q' is set to Q once every 10000 training steps. + +""" +import tensorflow as tf +import stable_baselines.common.tf_util as tf_utils + + +def scope_vars(scope, trainable_only=False): + """ + Get variables inside a scope + The scope can be specified as a string + + :param scope: (str or VariableScope) scope in which the variables reside. + :param trainable_only: (bool) whether or not to return only the variables that were marked as trainable. + :return: ([TensorFlow Tensor]) vars: list of variables in `scope`. + """ + return tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES, + scope=scope if isinstance(scope, str) else scope.name + ) + + +def scope_name(): + """ + Returns the name of current scope as a string, e.g.
deepq/q_func + + :return: (str) the name of current scope + """ + return tf.get_variable_scope().name + + +def absolute_scope_name(relative_scope_name): + """ + Appends parent scope name to `relative_scope_name` + + :return: (str) the absolute name of the scope + """ + return scope_name() + "/" + relative_scope_name + + +def default_param_noise_filter(var): + """ + check whether or not a variable is perturbable or not + + :param var: (TensorFlow Tensor) the variable + :return: (bool) can be perturb + """ + if var not in tf.trainable_variables(): + # We never perturb non-trainable vars. + return False + if "fully_connected" in var.name: + # We perturb fully-connected layers. + return True + + # The remaining layers are likely conv or layer norm layers, which we do not wish to + # perturb (in the former case because they only extract features, in the latter case because + # we use them for normalization purposes). If you change your network, you will likely want + # to re-consider which layers to perturb and which to keep untouched. + return False + + +def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None): + """Creates the act function: + + :param make_obs_ph: (function (str): TensorFlow Tensor) a function that take a name and creates a placeholder of + input with that name + :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) + the model that takes the following inputs: + observation_in: object + the output of observation placeholder + num_actions: int + number of actions + scope: str + reuse: bool + should be passed to outer variable scope + and returns a tensor of shape (batch_size, num_actions) with values of every action. + :param num_actions: (int) number of actions. + :param scope: (str or VariableScope) optional scope for variable_scope. + :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. 
+ :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) act function to select and action given + observation. See the top of the file for details. + """ + with tf.variable_scope(scope, reuse=reuse): + observations_ph = make_obs_ph("observation") + stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") + update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") + + eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) + + q_values = q_func(observations_ph.get(), num_actions, scope="q_func") + deterministic_actions = tf.argmax(q_values, axis=1) + + batch_size = tf.shape(observations_ph.get())[0] + random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64) + chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps + stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) + + output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) + update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) + _act = tf_utils.function(inputs=[observations_ph, stochastic_ph, update_eps_ph], + outputs=output_actions, + givens={update_eps_ph: -1.0, stochastic_ph: True}, + updates=[update_eps_expr]) + + def act(obs, stochastic=True, update_eps=-1): + return _act(obs, stochastic, update_eps) + + return act + + +def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, + param_noise_filter_func=None): + """Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905): + + Parameters + ---------- + :param make_obs_ph: (function (str): TensorFlow Tensor) a function that take a name and creates a placeholder of + input with that name + :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) + the model that takes the following inputs: + 
observation_in: object + the output of observation placeholder + num_actions: int + number of actions + scope: str + reuse: bool + should be passed to outer variable scope + and returns a tensor of shape (batch_size, num_actions) with values of every action. + :param num_actions: (int) number of actions. + :param scope: (str or VariableScope) optional scope for variable_scope. + :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. + :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a + variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter + is used by default. + :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) act function to select and action given + observation. See the top of the file for details. + """ + if param_noise_filter_func is None: + param_noise_filter_func = default_param_noise_filter + + with tf.variable_scope(scope, reuse=reuse): + observations_ph = make_obs_ph("observation") + stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") + update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") + update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold") + update_param_noise_scale_ph = tf.placeholder(tf.bool, (), name="update_param_noise_scale") + reset_ph = tf.placeholder(tf.bool, (), name="reset") + + eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) + param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01), + trainable=False) + param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05), + trainable=False) + + # Unmodified Q. + q_values = q_func(observations_ph.get(), num_actions, scope="q_func") + + # Perturbable Q used for the actual rollout. 
+ q_values_perturbed = q_func(observations_ph.get(), num_actions, scope="perturbed_q_func") + + def perturb_vars(original_scope, perturbed_scope): + """ + We have to wrap this code into a function due to the way tf.cond() works. + + See https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for a more detailed + discussion. + + :param original_scope: (str or VariableScope) the original scope. + :param perturbed_scope: (str or VariableScope) the perturbed scope. + :return: (TensorFlow Operation) + """ + all_vars = scope_vars(absolute_scope_name(original_scope)) + all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope)) + assert len(all_vars) == len(all_perturbed_vars) + perturb_ops = [] + for var, perturbed_var in zip(all_vars, all_perturbed_vars): + if param_noise_filter_func(perturbed_var): + # Perturb this variable. + operation = tf.assign(perturbed_var, + var + tf.random_normal(shape=tf.shape(var), mean=0., + stddev=param_noise_scale)) + else: + # Do not perturb, just assign. + operation = tf.assign(perturbed_var, var) + perturb_ops.append(operation) + assert len(perturb_ops) == len(all_vars) + return tf.group(*perturb_ops) + + # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy + # of the network and measures the effect of that perturbation in action space. If the perturbation + # is too big, reduce scale of perturbation, otherwise increase. 
+ q_values_adaptive = q_func(observations_ph.get(), num_actions, scope="adaptive_q_func") + perturb_for_adaption = perturb_vars(original_scope="q_func", perturbed_scope="adaptive_q_func") + kl_loss = tf.reduce_sum( + tf.nn.softmax(q_values) * (tf.log(tf.nn.softmax(q_values)) - tf.log(tf.nn.softmax(q_values_adaptive))), + axis=-1) + mean_kl = tf.reduce_mean(kl_loss) + + def update_scale(): + """ + update the scale expression + + :return: (TensorFlow Tensor) the updated scale expression + """ + with tf.control_dependencies([perturb_for_adaption]): + update_scale_expr = tf.cond(mean_kl < param_noise_threshold, + lambda: param_noise_scale.assign(param_noise_scale * 1.01), + lambda: param_noise_scale.assign(param_noise_scale / 1.01), + ) + return update_scale_expr + + # Functionality to update the threshold for parameter space noise. + update_param_noise_thres_expr = param_noise_threshold.assign( + tf.cond(update_param_noise_threshold_ph >= 0, lambda: update_param_noise_threshold_ph, + lambda: param_noise_threshold)) + + # Put everything together. 
+ perturbed_deterministic_actions = tf.argmax(q_values_perturbed, axis=1) + deterministic_actions = tf.argmax(q_values, axis=1) + batch_size = tf.shape(observations_ph.get())[0] + random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64) + chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps + perturbed_stochastic_actions = tf.where(chose_random, random_actions, perturbed_deterministic_actions) + stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) + + perturbed_output_actions = tf.cond(stochastic_ph, lambda: perturbed_stochastic_actions, + lambda: deterministic_actions) + output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) + update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) + updates = [ + update_eps_expr, + tf.cond(reset_ph, lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), + lambda: tf.group(*[])), + tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)), + update_param_noise_thres_expr, + ] + + _act = tf_utils.function(inputs=[observations_ph, stochastic_ph, update_eps_ph], + outputs=output_actions, + givens={update_eps_ph: -1.0, stochastic_ph: True}, + updates=[update_eps_expr]) + + _perturbed_act = tf_utils.function( + inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, + update_param_noise_scale_ph], + outputs=perturbed_output_actions, + givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, + update_param_noise_scale_ph: False}, + updates=updates) + + def act(obs, reset=None, update_param_noise_threshold=None, update_param_noise_scale=None, stochastic=True, + update_eps=-1): + """ + get the action from the current observation + + :param obs: (Any) Observation that can 
be feed into the output of make_obs_ph + :param reset: (bool) reset the perturbed policy by sampling a new perturbation + :param update_param_noise_threshold: (float) the desired threshold for the difference between + non-perturbed and perturbed policy + :param update_param_noise_scale: (bool) whether or not to update the scale of the noise for the next time + it is re-perturbed + :param stochastic: (bool) if set to False all the actions are always deterministic (default False) + :param update_eps: (float) update epsilon a new value, if negative not update happens + (default: no update) + :return: (TensorFlow Tensor) tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be + performed for every element of the batch. + """ + if reset is None or update_param_noise_threshold is None or update_param_noise_scale is None: + return _act(obs, stochastic, update_eps) + else: + return _perturbed_act(obs, stochastic, update_eps, reset, update_param_noise_threshold, + update_param_noise_scale) + + return act + + +def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, + double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): + """ + Creates the train function: + + :param make_obs_ph: (function (str): TensorFlow Tensor) a function that takes a name and creates a placeholder of + input with that name + :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) + the model that takes the following inputs: + - observation_in: (Any) the output of observation placeholder + - num_actions: int number of actions + - scope: (str) + - reuse: (bool) + + should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) + with values of every action. + :param num_actions: (int) number of actions + :param reuse: (bool) whether or not to reuse the graph variables + :param optimizer: (tf.train.Optimizer) optimizer to use for the Q-learning objective. 
+ :param grad_norm_clipping: (float) clip gradient norms to this value. If None no clipping is performed. + :param gamma: (float) discount rate. + :param double_q: (bool) if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a + good idea to keep it enabled. + :param scope: (str or VariableScope) optional scope for variable_scope. + :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. + :param param_noise: (bool) whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) + :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a + variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter + is used by default. + + :return: (tuple) + + act: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) function to select and action given + observation. See the top of the file for details. + train: (function (Any, numpy float, numpy float, Any, numpy bool, numpy float): numpy float) + optimize the error in Bellman's equation. See the top of the file for details. + update_target: (function) copy the parameters from optimized Q function to the target Q function. + See the top of the file for details. + debug: ({str: function}) a bunch of functions to print debug data like q_values. 
+ """ + if param_noise: + act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, + param_noise_filter_func=param_noise_filter_func) + else: + act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) + + with tf.variable_scope(scope, reuse=reuse): + # set up placeholders + obs_t_input = make_obs_ph("obs_t") + act_t_ph = tf.placeholder(tf.int32, [None], name="action") + rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") + obs_tp1_input = make_obs_ph("obs_tp1") + done_mask_ph = tf.placeholder(tf.float32, [None], name="done") + importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") + + # q network evaluation + q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act + q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") + + # target q network evalution + q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") + target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, + scope=tf.get_variable_scope().name + "/target_q_func") + + # q scores for actions which we know were selected in the given state. 
+ q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) + + # compute estimate of best possible value starting from state at t + 1 + if double_q: + q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) + q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) + q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) + else: + q_tp1_best = tf.reduce_max(q_tp1, 1) + q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best + + # compute RHS of bellman equation + q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked + + # compute the error (potentially clipped) + td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) + errors = tf_utils.huber_loss(td_error) + weighted_error = tf.reduce_mean(importance_weights_ph * errors) + + # compute optimization op (potentially with gradient clipping) + if grad_norm_clipping is not None: + gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) + for i, (grad, var) in enumerate(gradients): + if grad is not None: + gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) + optimize_expr = optimizer.apply_gradients(gradients) + else: + optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) + + # update_target_fn will be called periodically to copy Q network to target Q network + update_target_expr = [] + for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), + sorted(target_q_func_vars, key=lambda v: v.name)): + update_target_expr.append(var_target.assign(var)) + update_target_expr = tf.group(*update_target_expr) + + # Create callable functions + train = tf_utils.function( + inputs=[ + obs_t_input, + act_t_ph, + rew_t_ph, + obs_tp1_input, + done_mask_ph, + importance_weights_ph + ], + outputs=td_error, + updates=[optimize_expr] + ) + update_target = tf_utils.function([], [], updates=[update_target_expr]) + + q_values = tf_utils.function([obs_t_input], q_t) + 
+ return act_f, train, update_target, {'q_values': q_values} diff --git a/baselines/__init__.py b/stable_baselines/deepq/experiments/__init__.py similarity index 100% rename from baselines/__init__.py rename to stable_baselines/deepq/experiments/__init__.py diff --git a/baselines/deepq/experiments/custom_cartpole.py b/stable_baselines/deepq/experiments/custom_cartpole.py similarity index 56% rename from baselines/deepq/experiments/custom_cartpole.py rename to stable_baselines/deepq/experiments/custom_cartpole.py index b5a381a37e..55edd14ba4 100644 --- a/baselines/deepq/experiments/custom_cartpole.py +++ b/stable_baselines/deepq/experiments/custom_cartpole.py @@ -1,20 +1,28 @@ -import gym import itertools +import argparse + +import gym import numpy as np import tensorflow as tf import tensorflow.contrib.layers as layers -import baselines.common.tf_util as U - -from baselines import logger -from baselines import deepq -from baselines.deepq.replay_buffer import ReplayBuffer -from baselines.deepq.utils import ObservationInput -from baselines.common.schedules import LinearSchedule +import stable_baselines.common.tf_util as tf_utils +from stable_baselines import logger, deepq +from stable_baselines.deepq.replay_buffer import ReplayBuffer +from stable_baselines.deepq.utils import ObservationInput +from stable_baselines.common.schedules import LinearSchedule def model(inpt, num_actions, scope, reuse=False): - """This model takes as input an observation and returns values of all actions.""" + """ + This model takes as input an observation and returns values of all actions. 
+ + :param inpt: (TensorFlow Tensor) the input placeholder + :param num_actions: (int) size of the action space + :param scope: (str) the variable scope + :param reuse: (bool) is a reusable model + :return: (TensorFlow Tensor) + """ with tf.variable_scope(scope, reuse=reuse): out = inpt out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh) @@ -22,12 +30,16 @@ def model(inpt, num_actions, scope, reuse=False): return out -if __name__ == '__main__': - with U.make_session(8): +def main(args): + """ + Train a DeepQ agent on cartpole env + :param args: (Parsed Arguments) the input arguments + """ + with tf_utils.make_session(8): # Create the environment env = gym.make("CartPole-v0") # Create all the functions necessary to train the model - act, train, update_target, debug = deepq.build_train( + act, train, update_target, _ = deepq.build_train( make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name), q_func=model, num_actions=env.action_space.n, @@ -40,14 +52,14 @@ def model(inpt, num_actions, scope, reuse=False): exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. - U.initialize() + tf_utils.initialize() update_target() episode_rewards = [0.0] obs = env.reset() - for t in itertools.count(): + for step in itertools.count(): # Take action and update exploration to the newest value - action = act(obs[None], update_eps=exploration.value(t))[0] + action = act(obs[None], update_eps=exploration.value(step))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. 
replay_buffer.add(obs, action, rew, new_obs, float(done)) @@ -58,22 +70,44 @@ def model(inpt, num_actions, scope, reuse=False): obs = env.reset() episode_rewards.append(0) - is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200 + if len(episode_rewards[-101:-1]) == 0: + mean_100ep_reward = -np.inf + else: + mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) + + is_solved = step > 100 and mean_100ep_reward >= 200 + + if args.no_render and step > args.max_timesteps: + break + if is_solved: + if args.no_render: + break # Show off the result env.render() else: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. - if t > 1000: + if step > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. - if t % 1000 == 0: + if step % 1000 == 0: update_target() if done and len(episode_rewards) % 10 == 0: - logger.record_tabular("steps", t) + logger.record_tabular("steps", step) logger.record_tabular("episodes", len(episode_rewards)) - logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1)) - logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) + logger.record_tabular("mean episode reward", mean_100ep_reward) + logger.record_tabular("% time spent exploring", int(100 * exploration.value(step))) logger.dump_tabular() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Train DQN on cartpole using a custom mlp") + parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") + parser.add_argument('--max-timesteps', default=50000, type=int, + help="Maximum number of timesteps when not rendering") + args = parser.parse_args() + main(args) + + diff --git a/stable_baselines/deepq/experiments/enjoy_cartpole.py b/stable_baselines/deepq/experiments/enjoy_cartpole.py new file 
mode 100644 index 0000000000..07ad0514e3 --- /dev/null +++ b/stable_baselines/deepq/experiments/enjoy_cartpole.py @@ -0,0 +1,35 @@ +import argparse + +import gym + +from stable_baselines.deepq import DeepQ + +def main(args): + """ + Run a trained model for the cartpole problem + + :param args: (ArgumentParser) the input arguments + """ + env = gym.make("CartPole-v0") + model = DeepQ.load("cartpole_model.pkl", env) + + while True: + obs, done = env.reset(), False + episode_rew = 0 + while not done: + if not args.no_render: + env.render() + action, _ = model.predict(obs) + obs, rew, done, _ = env.step(action) + episode_rew += rew + print("Episode reward", episode_rew) + # No render is only used for automatic testing + if args.no_render: + break + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Enjoy trained DQN on cartpole") + parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") + args = parser.parse_args() + main(args) diff --git a/stable_baselines/deepq/experiments/enjoy_mountaincar.py b/stable_baselines/deepq/experiments/enjoy_mountaincar.py new file mode 100644 index 0000000000..82a698bf7f --- /dev/null +++ b/stable_baselines/deepq/experiments/enjoy_mountaincar.py @@ -0,0 +1,36 @@ +import argparse + +import gym + +from stable_baselines.deepq import DeepQ + + +def main(args): + """ + run a trained model for the mountain car problem + + :param args: (ArgumentParser) the input arguments + """ + env = gym.make("MountainCar-v0") + model = DeepQ.load("mountaincar_model.pkl", env) + + while True: + obs, done = env.reset(), False + episode_rew = 0 + while not done: + if not args.no_render: + env.render() + action, _ = model.predict(obs) + obs, rew, done, _ = env.step(action) + episode_rew += rew + print("Episode reward", episode_rew) + # No render is only used for automatic testing + if args.no_render: + break + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Enjoy 
trained DQN on MountainCar") + parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") + args = parser.parse_args() + main(args) diff --git a/baselines/deepq/experiments/enjoy_pong.py b/stable_baselines/deepq/experiments/enjoy_pong.py similarity index 55% rename from baselines/deepq/experiments/enjoy_pong.py rename to stable_baselines/deepq/experiments/enjoy_pong.py index 5b16fec6b6..a582bdd2dd 100644 --- a/baselines/deepq/experiments/enjoy_pong.py +++ b/stable_baselines/deepq/experiments/enjoy_pong.py @@ -1,18 +1,24 @@ import gym -from baselines import deepq + +from stable_baselines import deepq +from stable_baselines.deepq import DeepQ def main(): + """ + run a trained model for the pong problem + """ env = gym.make("PongNoFrameskip-v4") env = deepq.wrap_atari_dqn(env) - act = deepq.load("pong_model.pkl") + model = DeepQ.load("pong_model.pkl", env) while True: obs, done = env.reset(), False episode_rew = 0 while not done: env.render() - obs, rew, done, _ = env.step(act(obs[None])[0]) + action, _ = model.predict(obs) + obs, rew, done, _ = env.step(action) episode_rew += rew print("Episode reward", episode_rew) diff --git a/baselines/deepq/experiments/run_atari.py b/stable_baselines/deepq/experiments/run_atari.py similarity index 75% rename from baselines/deepq/experiments/run_atari.py rename to stable_baselines/deepq/experiments/run_atari.py index b6b427ba7a..6bf4cfb8c3 100644 --- a/baselines/deepq/experiments/run_atari.py +++ b/stable_baselines/deepq/experiments/run_atari.py @@ -1,12 +1,15 @@ -from baselines import deepq -from baselines.common import set_global_seeds -from baselines import bench import argparse -from baselines import logger -from baselines.common.atari_wrappers import make_atari + +from stable_baselines import bench, logger +from stable_baselines.common import set_global_seeds +from stable_baselines.common.atari_wrappers import make_atari +from stable_baselines.deepq import DeepQ, wrap_atari_dqn, models as 
deepq_models def main(): + """ + run the atari test + """ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) @@ -22,18 +25,17 @@ def main(): set_global_seeds(args.seed) env = make_atari(args.env) env = bench.Monitor(env, logger.get_dir()) - env = deepq.wrap_atari_dqn(env) - model = deepq.models.cnn_to_mlp( + env = wrap_atari_dqn(env) + q_func = deepq_models.cnn_to_mlp( convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=bool(args.dueling), ) - deepq.learn( - env, - q_func=model, - lr=1e-4, - max_timesteps=args.num_timesteps, + model = DeepQ( + env=env, + policy=q_func, + learning_rate=1e-4, buffer_size=10000, exploration_fraction=0.1, exploration_final_eps=0.01, @@ -46,6 +48,7 @@ def main(): checkpoint_freq=args.checkpoint_freq, checkpoint_path=args.checkpoint_path, ) + model.learn(total_timesteps=args.num_timesteps) env.close() diff --git a/stable_baselines/deepq/experiments/train_cartpole.py b/stable_baselines/deepq/experiments/train_cartpole.py new file mode 100644 index 0000000000..73c5853a6e --- /dev/null +++ b/stable_baselines/deepq/experiments/train_cartpole.py @@ -0,0 +1,52 @@ +import argparse + +import gym +import numpy as np + +from stable_baselines.deepq import DeepQ, models as deepq_models + + +def callback(lcl, _glb): + """ + the callback function for logging and saving + + :param lcl: (dict) the local variables + :param _glb: (dict) the global variables + :return: (bool) is solved + """ + # stop training if reward exceeds 199 + if len(lcl['episode_rewards'][-101:-1]) == 0: + mean_100ep_reward = -np.inf + else: + mean_100ep_reward = round(float(np.mean(lcl['episode_rewards'][-101:-1])), 1) + is_solved = lcl['step'] > 100 and mean_100ep_reward >= 199 + return is_solved + + +def main(args): + """ + train and save the DeepQ model, for the cartpole 
problem + + :param args: (ArgumentParser) the input arguments + """ + env = gym.make("CartPole-v0") + q_func = deepq_models.mlp([64]) + model = DeepQ( + env=env, + policy=q_func, + learning_rate=1e-3, + buffer_size=50000, + exploration_fraction=0.1, + exploration_final_eps=0.02, + ) + model.learn(total_timesteps=args.max_timesteps, callback=callback) + + print("Saving model to cartpole_model.pkl") + model.save("cartpole_model.pkl") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Train DQN on cartpole") + parser.add_argument('--max-timesteps', default=100000, type=int, help="Maximum number of timesteps") + args = parser.parse_args() + main(args) diff --git a/stable_baselines/deepq/experiments/train_mountaincar.py b/stable_baselines/deepq/experiments/train_mountaincar.py new file mode 100644 index 0000000000..154cc11e76 --- /dev/null +++ b/stable_baselines/deepq/experiments/train_mountaincar.py @@ -0,0 +1,37 @@ +import argparse + +import gym + +from stable_baselines.deepq import DeepQ, models as deepq_models + + +def main(args): + """ + train and save the DeepQ model, for the mountain car problem + + :param args: (ArgumentParser) the input arguments + """ + env = gym.make("MountainCar-v0") + # Enabling layer_norm here is important for parameter space noise! 
+ q_func = deepq_models.mlp([64], layer_norm=True) + + model = DeepQ( + policy=q_func, + env=env, + learning_rate=1e-3, + buffer_size=50000, + exploration_fraction=0.1, + exploration_final_eps=0.1, + param_noise=True + ) + model.learn(total_timesteps=args.max_timesteps) + + print("Saving model to mountaincar_model.pkl") + model.save("mountaincar_model.pkl") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Train DQN on cartpole") + parser.add_argument('--max-timesteps', default=100000, type=int, help="Maximum number of timesteps") + args = parser.parse_args() + main(args) diff --git a/baselines/deepq/models.py b/stable_baselines/deepq/models.py similarity index 79% rename from baselines/deepq/models.py rename to stable_baselines/deepq/models.py index 198d795a06..686d989260 100644 --- a/baselines/deepq/models.py +++ b/stable_baselines/deepq/models.py @@ -14,19 +14,17 @@ def _mlp(hiddens, inpt, num_actions, scope, reuse=False, layer_norm=False): return q_out -def mlp(hiddens=[], layer_norm=False): - """This model takes as input an observation and returns values of all actions. +def mlp(hiddens=None, layer_norm=False): + """ + This model takes as input an observation and returns values of all actions. - Parameters - ---------- - hiddens: [int] - list of sizes of hidden layers + :param hiddens: ([int]) list of sizes of hidden layers + :param layer_norm: (bool) if true, use layer normalization - Returns - ------- - q_func: function - q_function for DQN algorithm. + :return: (function) q_function for DQN algorithm. """ + if hiddens is None: + hiddens = [] return lambda *args, **kwargs: _mlp(hiddens, layer_norm=layer_norm, *args, **kwargs) @@ -70,21 +68,11 @@ def _cnn_to_mlp(convs, hiddens, dueling, inpt, num_actions, scope, reuse=False, def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False): """This model takes as input an observation and returns values of all actions. 
- Parameters - ---------- - convs: [(int, int int)] - list of convolutional layers in form of - (num_outputs, kernel_size, stride) - hiddens: [int] - list of sizes of hidden layers - dueling: bool - if true double the output MLP to compute a baseline - for action scores - - Returns - ------- - q_func: function - q_function for DQN algorithm. + :param convs: ([(int, int, int)]) list of convolutional layers in form of (num_outputs, kernel_size, stride) + :param hiddens: ([int]) list of sizes of hidden layers + :param dueling: (bool) if true double the output MLP to compute a baseline for action scores + :param layer_norm: (bool) if true, use layer normalization + :return: (function) q_function for DQN algorithm. """ return lambda *args, **kwargs: _cnn_to_mlp(convs, hiddens, dueling, layer_norm=layer_norm, *args, **kwargs) diff --git a/baselines/deepq/replay_buffer.py b/stable_baselines/deepq/replay_buffer.py similarity index 50% rename from baselines/deepq/replay_buffer.py rename to stable_baselines/deepq/replay_buffer.py index 7988113b0e..0e45d20de4 100644 --- a/baselines/deepq/replay_buffer.py +++ b/stable_baselines/deepq/replay_buffer.py @@ -1,18 +1,17 @@ -import numpy as np import random -from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree +import numpy as np + +from stable_baselines.common.segment_tree import SumSegmentTree, MinSegmentTree class ReplayBuffer(object): def __init__(self, size): - """Create Replay buffer. + """ + Create Replay buffer. - Parameters - ---------- - size: int - Max number of transitions to store in the buffer. When the buffer - overflows the old memories are dropped. + :param size: (int) Max number of transitions to store in the buffer. When the buffer overflows the old + memories are dropped. 
""" self._storage = [] self._maxsize = size @@ -22,6 +21,15 @@ def __len__(self): return len(self._storage) def add(self, obs_t, action, reward, obs_tp1, done): + """ + add a new transition to the buffer + + :param obs_t: (Any) the last observation + :param action: ([float]) the action + :param reward: (float) the reward of the transition + :param obs_tp1: (Any) the current observation + :param done: (bool) is the episode done + """ data = (obs_t, action, reward, obs_tp1, done) if self._next_idx >= len(self._storage): @@ -42,27 +50,18 @@ def _encode_sample(self, idxes): dones.append(done) return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) - def sample(self, batch_size): - """Sample a batch of experiences. - - Parameters - ---------- - batch_size: int - How many transitions to sample. - - Returns - ------- - obs_batch: np.array - batch of observations - act_batch: np.array - batch of actions executed given obs_batch - rew_batch: np.array - rewards received as results of executing act_batch - next_obs_batch: np.array - next set of observations seen after executing act_batch - done_mask: np.array - done_mask[i] = 1 if executing act_batch[i] resulted in - the end of an episode and 0 otherwise. + def sample(self, batch_size, **_kwargs): + """ + Sample a batch of experiences. + + :param batch_size: (int) How many transitions to sample. + :return: + - obs_batch: (numpy Any) batch of observations + - act_batch: (numpy float) batch of actions executed given obs_batch + - rew_batch: (numpy float) rewards received as results of executing act_batch + - next_obs_batch: (numpy Any) next set of observations seen after executing act_batch + - done_mask: (numpy bool) done_mask[i] = 1 if executing act_batch[i] resulted in the end of an episode + and 0 otherwise. 
""" idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] return self._encode_sample(idxes) @@ -70,20 +69,14 @@ def sample(self, batch_size): class PrioritizedReplayBuffer(ReplayBuffer): def __init__(self, size, alpha): - """Create Prioritized Replay buffer. - - Parameters - ---------- - size: int - Max number of transitions to store in the buffer. When the buffer - overflows the old memories are dropped. - alpha: float - how much prioritization is used - (0 - no prioritization, 1 - full prioritization) - - See Also - -------- - ReplayBuffer.__init__ + """ + Create Prioritized Replay buffer. + + See Also ReplayBuffer.__init__ + + :param size: (int) Max number of transitions to store in the buffer. When the buffer overflows the old memories + are dropped. + :param alpha: (float) how much prioritization is used (0 - no prioritization, 1 - full prioritization) """ super(PrioritizedReplayBuffer, self).__init__(size) assert alpha >= 0 @@ -97,10 +90,18 @@ def __init__(self, size, alpha): self._it_min = MinSegmentTree(it_capacity) self._max_priority = 1.0 - def add(self, *args, **kwargs): - """See ReplayBuffer.store_effect""" + def add(self, obs_t, action, reward, obs_tp1, done): + """ + add a new transition to the buffer + + :param obs_t: (Any) the last observation + :param action: ([float]) the action + :param reward: (float) the reward of the transition + :param obs_tp1: (Any) the current observation + :param done: (bool) is the episode done + """ idx = self._next_idx - super().add(*args, **kwargs) + super().add(obs_t, action, reward, obs_tp1, done) self._it_sum[idx] = self._max_priority ** self._alpha self._it_min[idx] = self._max_priority ** self._alpha @@ -113,41 +114,26 @@ def _sample_proportional(self, batch_size): res.append(idx) return res - def sample(self, batch_size, beta): - """Sample a batch of experiences. + def sample(self, batch_size, beta=0): + """ + Sample a batch of experiences. 
compared to ReplayBuffer.sample it also returns importance weights and idxes of sampled experiences. - - Parameters - ---------- - batch_size: int - How many transitions to sample. - beta: float - To what degree to use importance weights - (0 - no corrections, 1 - full correction) - - Returns - ------- - obs_batch: np.array - batch of observations - act_batch: np.array - batch of actions executed given obs_batch - rew_batch: np.array - rewards received as results of executing act_batch - next_obs_batch: np.array - next set of observations seen after executing act_batch - done_mask: np.array - done_mask[i] = 1 if executing act_batch[i] resulted in - the end of an episode and 0 otherwise. - weights: np.array - Array of shape (batch_size,) and dtype np.float32 - denoting importance weight of each sampled transition - idxes: np.array - Array of shape (batch_size,) and dtype np.int32 - idexes in buffer of sampled experiences + :param batch_size: (int) How many transitions to sample. + :param beta: (float) To what degree to use importance weights (0 - no corrections, 1 - full correction); must be > 0 (asserted below), so the default of 0 has to be overridden + :return: + - obs_batch: (numpy Any) batch of observations + - act_batch: (numpy float) batch of actions executed given obs_batch + - rew_batch: (numpy float) rewards received as results of executing act_batch + - next_obs_batch: (numpy Any) next set of observations seen after executing act_batch + - done_mask: (numpy bool) done_mask[i] = 1 if executing act_batch[i] resulted in the end of an episode + and 0 otherwise. + - weights: (numpy float) Array of shape (batch_size,) and dtype np.float32 denoting importance weight of + each sampled transition + - idxes: (numpy int) Array of shape (batch_size,) and dtype np.int32 indexes in buffer of sampled experiences """ assert beta > 0 @@ -166,19 +152,15 @@ def sample(self, batch_size, beta): return tuple(list(encoded_sample) + [weights, idxes]) def update_priorities(self, idxes, priorities): - """Update priorities of sampled transitions.
+ """ + Update priorities of sampled transitions. sets priority of transition at index idxes[i] in buffer to priorities[i]. - Parameters - ---------- - idxes: [int] - List of idxes of sampled transitions - priorities: [float] - List of updated priorities corresponding to - transitions at the sampled idxes denoted by - variable `idxes`. + :param idxes: ([int]) List of idxes of sampled transitions + :param priorities: ([float]) List of updated priorities corresponding to transitions at the sampled idxes + denoted by variable `idxes`. """ assert len(idxes) == len(priorities) for idx, priority in zip(idxes, priorities): diff --git a/stable_baselines/deepq/simple.py b/stable_baselines/deepq/simple.py new file mode 100644 index 0000000000..f37c4fc02d --- /dev/null +++ b/stable_baselines/deepq/simple.py @@ -0,0 +1,297 @@ +import tensorflow as tf +import numpy as np +import gym + +from stable_baselines import logger, deepq +from stable_baselines.common import tf_util, BaseRLModel, SetVerbosity +from stable_baselines.common.vec_env import VecEnv +from stable_baselines.common.schedules import LinearSchedule +from stable_baselines.common.policies import ActorCriticPolicy +from stable_baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer +from stable_baselines.deepq.utils import ObservationInput +from stable_baselines.a2c.utils import find_trainable_variables + + +class DeepQ(BaseRLModel): + """ + The DQN model class. DQN paper: https://arxiv.org/pdf/1312.5602.pdf + + :param policy: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) + the policy that takes the following inputs: + - observation_in: (object) the output of observation placeholder + - num_actions: (int) number of actions + - scope: (str) + - reuse: (bool) should be passed to outer variable scope + and returns a tensor of shape (batch_size, num_actions) with values of every action. 
+ :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) + :param gamma: (float) discount factor + :param learning_rate: (float) learning rate for adam optimizer + :param buffer_size: (int) size of the replay buffer + :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is + annealed + :param exploration_final_eps: (float) final value of random action probability + :param train_freq: (int) update the model every `train_freq` steps. + :param batch_size: (int) size of a batch sampled from the replay buffer for training + :param checkpoint_freq: (int) how often to save the model. This is so that the best version is restored at the + end of the training. If you do not wish to restore the best version + at the end of the training set this variable to None. + :param checkpoint_path: (str) replacement path used if you need to log to somewhere else than a temporary + directory. + :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts + :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps. + :param prioritized_replay: (bool) if True prioritized replay buffer will be used. + :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer + :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer + :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial + value to 1.0. If set to None, equals the `total_timesteps` passed to `learn`. + :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities. + :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy.
+ :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance + """ + + def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1, + exploration_final_eps=0.02, train_freq=1, batch_size=32, checkpoint_freq=10000, checkpoint_path=None, + learning_starts=1000, target_network_update_freq=500, prioritized_replay=False, + prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, + prioritized_replay_eps=1e-6, param_noise=False, verbose=0, _init_setup_model=True): + super(DeepQ, self).__init__(policy=policy, env=env, requires_vec_env=False, verbose=verbose) + + assert not isinstance(policy, ActorCriticPolicy), \ + "Error: DeepQ does not support the actor critic policies, please use the " \ + "'stable_baselines.deepq.models.mlp' and 'stable_baselines.deepq.models.cnn_to_mlp' " \ + "functions to create your policies." 
+ + self.checkpoint_path = checkpoint_path + self.param_noise = param_noise + self.learning_starts = learning_starts + self.train_freq = train_freq + self.prioritized_replay = prioritized_replay + self.prioritized_replay_eps = prioritized_replay_eps + self.batch_size = batch_size + self.target_network_update_freq = target_network_update_freq + self.checkpoint_freq = checkpoint_freq + self.prioritized_replay_alpha = prioritized_replay_alpha + self.prioritized_replay_beta0 = prioritized_replay_beta0 + self.prioritized_replay_beta_iters = prioritized_replay_beta_iters + self.exploration_final_eps = exploration_final_eps + self.exploration_fraction = exploration_fraction + self.buffer_size = buffer_size + self.learning_rate = learning_rate + self.gamma = gamma + + self.graph = None + self.sess = None + self._train_step = None + self.update_target = None + self.act = None + self.replay_buffer = None + self.beta_schedule = None + self.exploration = None + self.params = None + + if _init_setup_model: + self.setup_model() + + def setup_model(self): + with SetVerbosity(self.verbose): + + assert isinstance(self.action_space, gym.spaces.Discrete), \ + "Error: DeepQ cannot output a {} action space, only spaces.Discrete is supported."\ + .format(self.action_space) + + self.graph = tf.Graph() + with self.graph.as_default(): + self.sess = tf_util.make_session(graph=self.graph) + + # capture the shape outside the closure so that the env object is not serialized + # by cloudpickle when serializing make_obs_ph + observation_space = self.observation_space + + def make_obs_ph(name): + """ + makes the observation placeholder + + :param name: (str) the placeholder name + :return: (TensorFlow Tensor) the placeholder + """ + return ObservationInput(observation_space, name=name) + + self.act, self._train_step, self.update_target, _ = deepq.build_train( + make_obs_ph=make_obs_ph, + q_func=self.policy, + num_actions=self.action_space.n, + 
optimizer=tf.train.AdamOptimizer(learning_rate=self.learning_rate), + gamma=self.gamma, + grad_norm_clipping=10, + param_noise=self.param_noise + ) + + self.params = find_trainable_variables("deepq") + + # Initialize the parameters and copy them to the target network. + tf_util.initialize(self.sess) + self.update_target(sess=self.sess) + + def learn(self, total_timesteps, callback=None, seed=None, log_interval=100): + with SetVerbosity(self.verbose): + self._setup_learn(seed) + + # Create the replay buffer + if self.prioritized_replay: + self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha) + if self.prioritized_replay_beta_iters is None: + prioritized_replay_beta_iters = total_timesteps + self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, + initial_p=self.prioritized_replay_beta0, + final_p=1.0) + else: + self.replay_buffer = ReplayBuffer(self.buffer_size) + self.beta_schedule = None + # Create the schedule for exploration starting from 1. + self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps), + initial_p=1.0, + final_p=self.exploration_final_eps) + + episode_rewards = [0.0] + obs = self.env.reset() + reset = True + + for step in range(total_timesteps): + if callback is not None: + callback(locals(), globals()) + # Take action and update exploration to the newest value + kwargs = {} + if not self.param_noise: + update_eps = self.exploration.value(step) + update_param_noise_threshold = 0. + else: + update_eps = 0. + # Compute the threshold such that the KL divergence between perturbed and non-perturbed + # policy is comparable to eps-greedy exploration with eps = exploration.value(t). + # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 + # for detailed explanation. + update_param_noise_threshold = \ + -np.log(1. 
- self.exploration.value(step) + + self.exploration.value(step) / float(self.env.action_space.n)) + kwargs['reset'] = reset + kwargs['update_param_noise_threshold'] = update_param_noise_threshold + kwargs['update_param_noise_scale'] = True + with self.sess.as_default(): + action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] + env_action = action + reset = False + new_obs, rew, done, _ = self.env.step(env_action) + # Store transition in the replay buffer. + self.replay_buffer.add(obs, action, rew, new_obs, float(done)) + obs = new_obs + + episode_rewards[-1] += rew + if done: + if not isinstance(self.env, VecEnv): + obs = self.env.reset() + episode_rewards.append(0.0) + reset = True + + if step > self.learning_starts and step % self.train_freq == 0: + # Minimize the error in Bellman's equation on a batch sampled from replay buffer. + if self.prioritized_replay: + experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(step)) + (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience + else: + obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size) + weights, batch_idxes = np.ones_like(rewards), None + td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, dones, weights, + sess=self.sess) + if self.prioritized_replay: + new_priorities = np.abs(td_errors) + self.prioritized_replay_eps + self.replay_buffer.update_priorities(batch_idxes, new_priorities) + + if step > self.learning_starts and step % self.target_network_update_freq == 0: + # Update target network periodically. 
+ self.update_target(sess=self.sess) + + if len(episode_rewards[-101:-1]) == 0: + mean_100ep_reward = -np.inf + else: + mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) + + num_episodes = len(episode_rewards) + if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: + logger.record_tabular("steps", step) + logger.record_tabular("episodes", num_episodes) + logger.record_tabular("mean 100 episode reward", mean_100ep_reward) + logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(step))) + logger.dump_tabular() + + return self + + def predict(self, observation, state=None, mask=None): + observation = np.array(observation).reshape(self.observation_space.shape) + + with self.sess.as_default(): + action = self.act(observation[None])[0] + + if self._vectorize_action: + return [action], [None] + else: + return action, None + + def action_probability(self, observation, state=None, mask=None): + observation = np.array(observation).reshape((-1,) + self.observation_space.shape) + + # Get the tensor just before the softmax function in the TensorFlow graph, + # then execute the graph from the input observation to this tensor. 
+ tensor = self.graph.get_tensor_by_name('deepq/q_func/fully_connected_2/BiasAdd:0') + if self._vectorize_action: + return self._softmax(self.sess.run(tensor, feed_dict={'deepq/observation:0': observation})) + else: + return self._softmax(self.sess.run(tensor, feed_dict={'deepq/observation:0': observation}))[0] + + def save(self, save_path): + # params + data = { + "checkpoint_path": self.checkpoint_path, + "param_noise": self.param_noise, + "learning_starts": self.learning_starts, + "train_freq": self.train_freq, + "prioritized_replay": self.prioritized_replay, + "prioritized_replay_eps": self.prioritized_replay_eps, + "batch_size": self.batch_size, + "target_network_update_freq": self.target_network_update_freq, + "checkpoint_freq": self.checkpoint_freq, + "prioritized_replay_alpha": self.prioritized_replay_alpha, + "prioritized_replay_beta0": self.prioritized_replay_beta0, + "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters, + "exploration_final_eps": self.exploration_final_eps, + "exploration_fraction": self.exploration_fraction, + "learning_rate": self.learning_rate, + "gamma": self.gamma, + "verbose": self.verbose, + "observation_space": self.observation_space, + "action_space": self.action_space, + "policy": self.policy, + "n_envs": self.n_envs, + "_vectorize_action": self._vectorize_action + } + + params = self.sess.run(self.params) + + self._save_to_file(save_path, data=data, params=params) + + @classmethod + def load(cls, load_path, env=None, **kwargs): + data, params = cls._load_from_file(load_path) + + model = cls(policy=data["policy"], env=env, _init_setup_model=False) + model.__dict__.update(data) + model.__dict__.update(kwargs) + model.set_env(env) + model.setup_model() + + restores = [] + for param, loaded_p in zip(model.params, params): + restores.append(param.assign(loaded_p)) + model.sess.run(restores) + + return model diff --git a/baselines/deepq/utils.py b/stable_baselines/deepq/utils.py similarity index 53% rename from 
baselines/deepq/utils.py rename to stable_baselines/deepq/utils.py index 90b932e74a..ac8d916646 100644 --- a/baselines/deepq/utils.py +++ b/stable_baselines/deepq/utils.py @@ -1,7 +1,7 @@ -from baselines.common.input import observation_input - import tensorflow as tf +from stable_baselines.common.input import observation_input + # ================================================================ # Placeholders # ================================================================ @@ -9,26 +9,40 @@ class TfInput(object): def __init__(self, name="(unnamed)"): - """Generalized Tensorflow placeholder. The main differences are: + """ + Generalized Tensorflow placeholder. The main differences are: - possibly uses multiple placeholders internally and returns multiple values - can apply light postprocessing to the value feed to placeholder. + + :param name: (str) the input name """ self.name = name def get(self): - """Return the tf variable(s) representing the possibly postprocessed value + """ + Return the tf variable(s) representing the possibly postprocessed value of placeholder(s). + + :return: (TensorFlow Tensor) the placeholder + """ + raise NotImplementedError + + def make_feed_dict(self, data): """ - raise NotImplemented() + Given data input it to the placeholder(s). - def make_feed_dict(data): - """Given data input it to the placeholder(s).""" - raise NotImplemented() + :return: (dict) the given data input + """ + raise NotImplementedError class PlaceholderTfInput(TfInput): def __init__(self, placeholder): - """Wrapper for regular tensorflow placeholder.""" + """ + Wrapper for regular tensorflow placeholder. 
+ + :param placeholder: (TensorFlow Tensor) + """ super().__init__(placeholder.name) self._placeholder = placeholder @@ -41,17 +55,14 @@ def make_feed_dict(self, data): class Uint8Input(PlaceholderTfInput): def __init__(self, shape, name=None): - """Takes input in uint8 format which is cast to float32 and divided by 255 + """ + Takes input in uint8 format which is cast to float32 and divided by 255 before passing it to the model. On GPU this ensures lower data transfer times. - Parameters - ---------- - shape: [int] - shape of the tensor. - name: str - name of the underlying placeholder + :param shape: ([int]) shape of the tensor. + :param name: (str) name of the underlying placeholder """ super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) @@ -64,20 +75,16 @@ def get(self): class ObservationInput(PlaceholderTfInput): def __init__(self, observation_space, name=None): - """Creates an input placeholder tailored to a specific observation space - - Parameters - ---------- - - observation_space: - observation space of the environment. Should be one of the gym.spaces types - name: str - tensorflow name of the underlying placeholder """ - inpt, self.processed_inpt = observation_input(observation_space, name=name) + Creates an input placeholder tailored to a specific observation space + + :param observation_space: (Gym Space) observation space of the environment. 
Should be one of the gym.spaces + types + :param name: (str) tensorflow name of the underlying placeholder + """ + is_image = len(observation_space.shape) == 3 + inpt, self.processed_inpt = observation_input(observation_space, name=name, scale=is_image) super().__init__(inpt) def get(self): return self.processed_inpt - - diff --git a/stable_baselines/gail/__init__.py b/stable_baselines/gail/__init__.py new file mode 100644 index 0000000000..f6356a3b03 --- /dev/null +++ b/stable_baselines/gail/__init__.py @@ -0,0 +1 @@ +from stable_baselines.gail.model import GAIL diff --git a/baselines/gail/adversary.py b/stable_baselines/gail/adversary.py similarity index 51% rename from baselines/gail/adversary.py rename to stable_baselines/gail/adversary.py index 18df69ccca..9e253fcfc7 100644 --- a/baselines/gail/adversary.py +++ b/stable_baselines/gail/adversary.py @@ -1,28 +1,50 @@ -''' +""" Reference: https://github.com/openai/imitation I follow the architecture from the official repository -''' +""" import tensorflow as tf import numpy as np -from baselines.common.mpi_running_mean_std import RunningMeanStd -from baselines.common import tf_util as U +from stable_baselines.common.mpi_running_mean_std import RunningMeanStd +from stable_baselines.common import tf_util as tf_util + + +def logsigmoid(input_tensor): + """ + Equivalent to tf.log(tf.sigmoid(a)) + + :param input_tensor: (TensorFlow Tensor) + :return: (TensorFlow Tensor) + """ + return -tf.nn.softplus(-input_tensor) -def logsigmoid(a): - '''Equivalent to tf.log(tf.sigmoid(a))''' - return -tf.nn.softplus(-a) -""" Reference: https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51""" def logit_bernoulli_entropy(logits): - ent = (1.-tf.nn.sigmoid(logits))*logits - logsigmoid(logits) + """ + Reference: + https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51 + + :param logits: (TensorFlow Tensor) the logits + 
:return: (TensorFlow Tensor) the bernoulli entropy + """ + ent = (1. - tf.nn.sigmoid(logits)) * logits - logsigmoid(logits) return ent + class TransitionClassifier(object): - def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"): + def __init__(self, env, hidden_size, entcoeff=0.001, scope="adversary"): + """ + reward regression from observations and transitions + + :param env: (Gym Environment) + :param hidden_size: ([int]) the hidden dimension for the MLP + :param entcoeff: (float) the entropy loss weight + :param scope: (str) tensorflow variable scope + """ self.scope = scope self.observation_shape = env.observation_space.shape self.actions_shape = env.action_space.shape - self.input_shape = tuple([o+a for o, a in zip(self.observation_shape, self.actions_shape)]) + self.input_shape = tuple([o + a for o, a in zip(self.observation_shape, self.actions_shape)]) self.num_actions = env.action_space.shape[0] self.hidden_size = hidden_size self.build_ph() @@ -35,31 +57,48 @@ def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="advers # Build regression loss # let x = logits, z = targets. 
# z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) - generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=generator_logits, labels=tf.zeros_like(generator_logits)) + generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=generator_logits, + labels=tf.zeros_like(generator_logits)) generator_loss = tf.reduce_mean(generator_loss) expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=expert_logits, labels=tf.ones_like(expert_logits)) expert_loss = tf.reduce_mean(expert_loss) # Build entropy loss logits = tf.concat([generator_logits, expert_logits], 0) entropy = tf.reduce_mean(logit_bernoulli_entropy(logits)) - entropy_loss = -entcoeff*entropy + entropy_loss = -entcoeff * entropy # Loss + Accuracy terms self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc] self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc"] self.total_loss = generator_loss + expert_loss + entropy_loss # Build Reward for policy - self.reward_op = -tf.log(1-tf.nn.sigmoid(generator_logits)+1e-8) + self.reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8) var_list = self.get_trainable_variables() - self.lossandgrad = U.function([self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph], - self.losses + [U.flatgrad(self.total_loss, var_list)]) + self.lossandgrad = tf_util.function( + [self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph], + self.losses + [tf_util.flatgrad(self.total_loss, var_list)]) def build_ph(self): - self.generator_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="observations_ph") - self.generator_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="actions_ph") - self.expert_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="expert_observations_ph") - self.expert_acs_ph = tf.placeholder(tf.float32, (None, ) + 
self.actions_shape, name="expert_actions_ph") + """ + build placeholder + """ + self.generator_obs_ph = tf.placeholder(tf.float32, (None,) + self.observation_shape, + name="observations_ph") + self.generator_acs_ph = tf.placeholder(tf.float32, (None,) + self.actions_shape, + name="actions_ph") + self.expert_obs_ph = tf.placeholder(tf.float32, (None,) + self.observation_shape, + name="expert_observations_ph") + self.expert_acs_ph = tf.placeholder(tf.float32, (None,) + self.actions_shape, + name="expert_actions_ph") def build_graph(self, obs_ph, acs_ph, reuse=False): + """ + build the graph + + :param obs_ph: (TensorFlow Tensor) the observation placeholder + :param acs_ph: (TensorFlow Tensor) the action placeholder + :param reuse: (bool) + :return: (TensorFlow Tensor) the graph output + """ with tf.variable_scope(self.scope): if reuse: tf.get_variable_scope().reuse_variables() @@ -74,14 +113,26 @@ def build_graph(self, obs_ph, acs_ph, reuse=False): return logits def get_trainable_variables(self): + """ + get all the trainable variables from the graph + + :return: ([TensorFlow Tensor]) the variables + """ return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - def get_reward(self, obs, acs): + def get_reward(self, obs, actions): + """ + get the reward using the observation and action + + :param obs: (TensorFlow Tensor or numpy Number) the observation + :param actions: (TensorFlow Tensor or numpy Number) the action + :return: (numpy Number) the reward + """ sess = tf.get_default_session() if len(obs.shape) == 1: obs = np.expand_dims(obs, 0) - if len(acs.shape) == 1: - acs = np.expand_dims(acs, 0) - feed_dict = {self.generator_obs_ph: obs, self.generator_acs_ph: acs} + if len(actions.shape) == 1: + actions = np.expand_dims(actions, 0) + feed_dict = {self.generator_obs_ph: obs, self.generator_acs_ph: actions} reward = sess.run(self.reward_op, feed_dict) return reward diff --git a/stable_baselines/gail/behavior_clone.py 
b/stable_baselines/gail/behavior_clone.py new file mode 100644 index 0000000000..85cce513b1 --- /dev/null +++ b/stable_baselines/gail/behavior_clone.py @@ -0,0 +1,145 @@ +""" +The code is used to train BC imitator, or pretrained GAIL imitator +""" +import os +import argparse +import tempfile +import logging + +from tqdm import tqdm +import gym +import tensorflow as tf + +from stable_baselines.gail import mlp_policy +from stable_baselines import logger, bench +from stable_baselines.common import set_global_seeds, tf_util +from stable_baselines.common.misc_util import boolean_flag +from stable_baselines.common.mpi_adam import MpiAdam +from stable_baselines.gail.run_mujoco import runner +from stable_baselines.gail.dataset.mujocodset import MujocoDset + + +def argsparser(): + """ + build the behavior cloning argument parser and parse the command line + + :return: (argparse.Namespace) the parsed arguments + """ + parser = argparse.ArgumentParser("Tensorflow Implementation of Behavior Cloning") + parser.add_argument('--env_id', help='environment ID', default='Hopper-v1') + parser.add_argument('--seed', help='RNG seed', type=int, default=0) + parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz') + parser.add_argument('--checkpoint_dir', help='the directory to save model', default='checkpoint') + parser.add_argument('--log_dir', help='the directory to save log file', default='log') + # Mujoco Dataset Configuration + parser.add_argument('--traj_limitation', type=int, default=-1) + # Network Configuration (Using MLP Policy) + parser.add_argument('--policy_hidden_size', type=int, default=100) + # for evaluation + boolean_flag(parser, 'stochastic_policy', default=False, help_msg='use stochastic/deterministic policy to evaluate') + boolean_flag(parser, 'save_sample', default=False, help_msg='save the trajectories or not') + parser.add_argument('--BC_max_iter', help='Max iteration for training BC', type=int, default=1e5) + return parser.parse_args() + + +def learn(env, policy_func,
dataset, optim_batch_size=128, max_iters=1e4, adam_epsilon=1e-5, optim_stepsize=3e-4, + ckpt_dir=None, task_name=None, verbose=False): + """ + Learn a behavior clone policy, and return the save location + + :param env: (Gym Environment) the environment + :param policy_func: (function (str, Gym Space, Gym Space): TensorFlow Tensor) creates the policy + :param dataset: (Dset or MujocoDset) the dataset manager + :param optim_batch_size: (int) the batch size + :param max_iters: (int) the maximum number of iterations + :param adam_epsilon: (float) the epsilon value for the adam optimizer + :param optim_stepsize: (float) the optimizer stepsize + :param ckpt_dir: (str) the save directory, can be None for temporary directory + :param task_name: (str) the save name, can be None for saving directly to the directory name + :param verbose: (bool) + :return: (str) the save location for the TensorFlow model + """ + + val_per_iter = int(max_iters/10) + ob_space = env.observation_space + ac_space = env.action_space + policy = policy_func("pi", ob_space, ac_space) # Construct network for new policy + # placeholder + obs_ph = policy.obs_ph + action_ph = policy.pdtype.sample_placeholder([None]) + stochastic_ph = policy.stochastic_ph + loss = tf.reduce_mean(tf.square(action_ph - policy.ac)) + var_list = policy.get_trainable_variables() + adam = MpiAdam(var_list, epsilon=adam_epsilon) + lossandgrad = tf_util.function([obs_ph, action_ph, stochastic_ph], [loss] + [tf_util.flatgrad(loss, var_list)]) + + tf_util.initialize() + adam.sync() + logger.log("Pretraining with Behavior Cloning...") + for iter_so_far in tqdm(range(int(max_iters))): + ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train') + train_loss, grad = lossandgrad(ob_expert, ac_expert, True) + adam.update(grad, optim_stepsize) + if verbose and iter_so_far % val_per_iter == 0: + ob_expert, ac_expert = dataset.get_next_batch(-1, 'val') + val_loss, _ = lossandgrad(ob_expert, ac_expert, True) + 
logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss)) + + if ckpt_dir is None: + savedir_fname = tempfile.TemporaryDirectory().name + else: + savedir_fname = os.path.join(ckpt_dir, task_name) + tf_util.save_state(savedir_fname, var_list=policy.get_variables()) + return savedir_fname + + +def get_task_name(args): + """ + Get the task name + + :param args: (ArgumentParser) the training argument + :return: (str) the task name + """ + task_name = 'BC' + task_name += '.{}'.format(args.env_id.split("-")[0]) + task_name += '.traj_limitation_{}'.format(args.traj_limitation) + task_name += ".seed_{}".format(args.seed) + return task_name + + +def main(args): + """ + start training the model + + :param args: (ArgumentParser) the training argument + """ + with tf_util.make_session(num_cpu=1): + set_global_seeds(args.seed) + env = gym.make(args.env_id) + + def policy_fn(name, ob_space, ac_space, reuse=False, sess=None): + return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, + reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) + env = bench.Monitor(env, logger.get_dir() and + os.path.join(logger.get_dir(), "monitor.json")) + env.seed(args.seed) + gym.logger.setLevel(logging.WARN) + task_name = get_task_name(args) + args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name) + args.log_dir = os.path.join(args.log_dir, task_name) + dataset = MujocoDset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) + savedir_fname = learn(env, policy_fn, dataset, max_iters=args.BC_max_iter, ckpt_dir=args.checkpoint_dir, + task_name=task_name, verbose=True) + runner(env, + policy_fn, + savedir_fname, + timesteps_per_batch=1024, + number_trajs=10, + stochastic_policy=args.stochastic_policy, + save=args.save_sample, + reuse=True) + + +if __name__ == '__main__': + args = argsparser() + main(args) diff --git a/baselines/a2c/__init__.py b/stable_baselines/gail/dataset/__init__.py similarity index 
100% rename from baselines/a2c/__init__.py rename to stable_baselines/gail/dataset/__init__.py diff --git a/baselines/gail/dataset/mujoco_dset.py b/stable_baselines/gail/dataset/mujocodset.py similarity index 70% rename from baselines/gail/dataset/mujoco_dset.py rename to stable_baselines/gail/dataset/mujocodset.py index 0693262270..0195e09eff 100644 --- a/baselines/gail/dataset/mujoco_dset.py +++ b/stable_baselines/gail/dataset/mujocodset.py @@ -1,16 +1,25 @@ -''' +""" Data structure of the input .npz: the data is save in python dictionary format with keys: 'acs', 'ep_rets', 'rews', 'obs' the values of each item is a list storing the expert trajectory sequentially a transition can be: (data['obs'][t], data['acs'][t], data['obs'][t+1]) and get reward data['rews'][t] -''' +""" -from baselines import logger import numpy as np +import matplotlib.pyplot as plt + +from stable_baselines import logger class Dset(object): def __init__(self, inputs, labels, randomize): + """ + Dataset object + + :param inputs: (numpy Number) the input values + :param labels: (numpy Number) the target values + :param randomize: (bool) if the dataset should be shuffled + """ self.inputs = inputs self.labels = labels assert len(self.inputs) == len(self.labels) @@ -19,6 +28,9 @@ def __init__(self, inputs, labels, randomize): self.init_pointer() def init_pointer(self): + """ + initialize the pointer and shuffle the dataset, if randomize the dataset + """ self.pointer = 0 if self.randomize: idx = np.arange(self.num_pairs) @@ -27,6 +39,12 @@ def init_pointer(self): self.labels = self.labels[idx, :] def get_next_batch(self, batch_size): + """ + get the batch from the dataset + + :param batch_size: (int) the size of the batch from the dataset + :return: (numpy Number, numpy Number) inputs and labels + """ # if batch_size is negative -> return all if batch_size < 0: return self.inputs, self.labels @@ -39,8 +57,16 @@ def get_next_batch(self, batch_size): return inputs, labels -class 
Mujoco_Dset(object): +class MujocoDset(object): def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomize=True): + """ + Dataset for mujoco + + :param expert_path: (str) the path to trajectory data + :param train_fraction: (float) the train val split (0 to 1) + :param traj_limitation: (int) the dims to load (if -1, load all) + :param randomize: (bool) if the dataset should be shuffled + """ traj_data = np.load(expert_path) if traj_limitation < 0: traj_limitation = len(traj_data['obs']) @@ -73,12 +99,22 @@ def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomiz self.log_info() def log_info(self): + """ + log the information of the dataset + """ logger.log("Total trajectorues: %d" % self.num_traj) logger.log("Total transitions: %d" % self.num_transition) logger.log("Average returns: %f" % self.avg_ret) logger.log("Std for returns: %f" % self.std_ret) def get_next_batch(self, batch_size, split=None): + """ + get the batch from the dataset + + :param batch_size: (int) the size of the batch from the dataset + :param split: (str) the type of data split (can be None, 'train', 'val') + :return: (numpy Number, numpy Number) inputs and labels + """ if split is None: return self.dset.get_next_batch(batch_size) elif split == 'train': @@ -89,17 +125,27 @@ def get_next_batch(self, batch_size, split=None): raise NotImplementedError def plot(self): - import matplotlib.pyplot as plt + """ + show and save (to 'histogram_rets.png') a histogram plotting of the episode returns + """ plt.hist(self.rets) plt.savefig("histogram_rets.png") plt.close() def test(expert_path, traj_limitation, plot): - dset = Mujoco_Dset(expert_path, traj_limitation=traj_limitation) + """ + test mujoco dataset object + + :param expert_path: (str) the path to trajectory data + :param traj_limitation: (int) the dims to load (if -1, load all) + :param plot: (bool) enable plotting + """ + dset = MujocoDset(expert_path, traj_limitation=traj_limitation) if plot: 
dset.plot() + if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() diff --git a/baselines/gail/gail-eval.py b/stable_baselines/gail/gail_eval.py similarity index 65% rename from baselines/gail/gail-eval.py rename to stable_baselines/gail/gail_eval.py index 1148cb309c..d13d13ed64 100644 --- a/baselines/gail/gail-eval.py +++ b/stable_baselines/gail/gail_eval.py @@ -1,22 +1,21 @@ -''' +""" This code is used to evalaute the imitators trained with different number of trajectories and plot the results in the same figure for easy comparison. -''' +""" import argparse import os import glob -import gym +import gym import matplotlib.pyplot as plt import numpy as np import tensorflow as tf -from baselines.gail import run_mujoco -from baselines.gail import mlp_policy -from baselines.common import set_global_seeds, tf_util as U -from baselines.common.misc_util import boolean_flag -from baselines.gail.dataset.mujoco_dset import Mujoco_Dset +from stable_baselines.gail import run_mujoco, mlp_policy +from stable_baselines.common import set_global_seeds, tf_util +from stable_baselines.common.misc_util import boolean_flag +from stable_baselines.gail.dataset.mujocodset import MujocoDset plt.style.use('ggplot') @@ -26,30 +25,52 @@ def load_dataset(expert_path): - dataset = Mujoco_Dset(expert_path=expert_path) + """ + load mujoco dataset + + :param expert_path: (str) the path to trajectory data + :return: (MujocoDset) the dataset manager object + """ + dataset = MujocoDset(expert_path=expert_path) return dataset def argsparser(): + """ + make a argument parser for evaluation of gail + + :return: (ArgumentParser) + """ parser = argparse.ArgumentParser('Do evaluation') parser.add_argument('--seed', type=int, default=0) parser.add_argument('--policy_hidden_size', type=int, default=100) parser.add_argument('--env', type=str, choices=['Hopper', 'Walker2d', 'HalfCheetah', 'Humanoid', 'HumanoidStandup']) - boolean_flag(parser, 'stochastic_policy', default=False, 
help='use stochastic/deterministic policy to evaluate') + boolean_flag(parser, 'stochastic_policy', default=False, help_msg='use stochastic/deterministic policy to evaluate') return parser.parse_args() def evaluate_env(env_name, seed, policy_hidden_size, stochastic, reuse, prefix): - - def get_checkpoint_dir(checkpoint_list, limit, prefix): + """ + Evaluate an environment + + :param env_name: (str) the environment name + :param seed: (int) the initial random seed + :param policy_hidden_size: (int) the number of hidden neurons in the 4 layer MLP + :param stochastic: (bool) use a stochastic policy + :param reuse: (bool) allow reuse of the graph + :param prefix: (str) the checkpoint prefix for the type ('BC' or 'gail') + :return: (dict) the logging information of the evaluation + """ + + def _get_checkpoint_dir(checkpoint_list, limit, prefix): for checkpoint in checkpoint_list: if ('limitation_'+str(limit) in checkpoint) and (prefix in checkpoint): return checkpoint return None - def policy_fn(name, ob_space, ac_space, reuse=False): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, + def _policy_fn(name, ob_space, ac_space, reuse=False, sess=None): + return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, reuse=reuse, hid_size=policy_hidden_size, num_hid_layers=2) data_path = os.path.join('data', 'deterministic.trpo.' 
+ env_name + '.0.00.npz') @@ -65,13 +86,13 @@ def policy_fn(name, ob_space, ac_space, reuse=False): for i, limit in enumerate(CONFIG['traj_limitation']): # Do one evaluation upper_bound = sum(dataset.rets[:limit])/limit - checkpoint_dir = get_checkpoint_dir(checkpoint_list, limit, prefix=prefix) + checkpoint_dir = _get_checkpoint_dir(checkpoint_list, limit, prefix=prefix) checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir) env = gym.make(env_name + '-v1') env.seed(seed) print('Trajectory limitation: {}, Load checkpoint: {}, '.format(limit, checkpoint_path)) avg_len, avg_ret = run_mujoco.runner(env, - policy_fn, + _policy_fn, checkpoint_path, timesteps_per_batch=1024, number_trajs=10, @@ -90,6 +111,14 @@ def policy_fn(name, ob_space, ac_space, reuse=False): def plot(env_name, bc_log, gail_log, stochastic): + """ + plot and display all the evalutation results + + :param env_name: (str) the environment name + :param bc_log: (dict) the behavior_clone log + :param gail_log: (dict) the gail log + :param stochastic: (bool) use a stochastic policy + """ upper_bound = bc_log['upper_bound'] bc_avg_ret = bc_log['avg_ret'] gail_avg_ret = gail_log['avg_ret'] @@ -128,18 +157,23 @@ def plot(env_name, bc_log, gail_log, stochastic): def main(args): - U.make_session(num_cpu=1).__enter__() - set_global_seeds(args.seed) - print('Evaluating {}'.format(args.env)) - bc_log = evaluate_env(args.env, args.seed, args.policy_hidden_size, - args.stochastic_policy, False, 'BC') - print('Evaluation for {}'.format(args.env)) - print(bc_log) - gail_log = evaluate_env(args.env, args.seed, args.policy_hidden_size, - args.stochastic_policy, True, 'gail') - print('Evaluation for {}'.format(args.env)) - print(gail_log) - plot(args.env, bc_log, gail_log, args.stochastic_policy) + """ + evaluate and plot Behavior clone and gail + + :param args: (ArgumentParser) the arguments for training and evaluating + """ + with tf_util.make_session(num_cpu=1): + set_global_seeds(args.seed) + 
class MlpPolicy(BasePolicy):
    # class-level flag; this policy keeps no recurrent state (state_in / state_out are empty lists)
    recurrent = False

    def __init__(self, name, *args, sess=None, reuse=False, placeholders=None, **kwargs):
        """
        MLP policy for Gail

        The network is built inside the variable scope `name`; the remaining positional and
        keyword arguments are forwarded unchanged to `_init`.

        :param name: (str) the variable scope name
        :param ob_space: (Gym Space) The observation space of the environment
        :param ac_space: (Gym Space) The action space of the environment
        :param hid_size: (int) the size of the hidden layers
        :param num_hid_layers: (int) the number of hidden layers
        :param sess: (TensorFlow session) The current TensorFlow session containing the variables.
        :param reuse: (bool) allow reuse of the variables of the scope
        :param placeholders: (dict) To feed existing placeholders if needed
        :param gaussian_fixed_var: (bool) fix the gaussian variance
        """
        super(MlpPolicy, self).__init__(placeholders=placeholders)
        self.sess = sess
        with tf.variable_scope(name):
            if reuse:
                tf.get_variable_scope().reuse_variables()
            self._init(*args, **kwargs)
            # remember the fully qualified scope name the variables were created under
            self.scope = tf.get_variable_scope().name

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """
        Build the value and policy networks and the action sampling function

        :param ob_space: (Gym Space) The observation space of the environment
        :param ac_space: (Gym Space) The action space of the environment
        :param hid_size: (int) the size of the hidden layers
        :param num_hid_layers: (int) the number of hidden layers (used for both networks)
        :param gaussian_fixed_var: (bool) for Box action spaces, use a single learned `logstd`
            variable instead of predicting the full distribution parameters from the network
        """

        obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space)

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # normalize the observation with the running statistics, then clip to [-5, 5]
        obz = tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        # value network: tanh hidden layers "vffc1", "vffc2", ... then a single output "vffinal"
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1),
                                        weight_init=tf_util.normc_initializer(1.0)))
        self.vpred = dense(last_out, 1, "vffinal", weight_init=tf_util.normc_initializer(1.0))[:, 0]

        # policy network: separate tanh hidden layers "polfc1", "polfc2", ... on the same input
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1),
                                        weight_init=tf_util.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            # the network outputs only the mean (half of the flat parameters); logstd is a variable
            mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", tf_util.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2],
                                     initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` presumably broadcasts logstd to the shape of mean before concatenation
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", tf_util.normc_initializer(0.01))

        self.proba_distribution = pdtype.proba_distribution_from_flat(pdparam)
        self.state_in = []
        self.state_out = []

        # change for BC
        # boolean scalar placeholder selecting a sampled action (True) or the distribution mode (False)
        self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=(), name="stochastic")
        action = tf_util.switch(self.stochastic_ph, self.proba_distribution.sample(), self.proba_distribution.mode())
        self.action = action
        # inputs: (stochastic flag, observation); outputs: (action, value prediction)
        self._act = tf_util.function([self.stochastic_ph, obs], [action, self.vpred])
class GAIL(BaseRLModel):
    """
    Generative Adversarial Imitation Learning (GAIL)

    Thin wrapper around a TRPO model: the GAIL specific options are stored as attributes on the
    wrapped `self.trpo` instance, and learning, prediction and saving are delegated to it.

    :param policy: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param pretrained_weight: (str) the save location for the pretrained weights (defaults to False)
    :param hidden_size_adversary: (int) the hidden dimension for the MLP
        (presumably of the adversary network -- TODO confirm in TRPO)
    :param adversary_entcoeff: (float) presumably the weight of the entropy loss of the adversary
        -- TODO confirm in TRPO
    :param expert_dataset: (Dset) the dataset manager
    :param save_per_iter: (int) the number of iterations before saving
    :param checkpoint_dir: (str) the location for saving checkpoints
    :param g_step: (int) number of steps to train policy in each epoch
    :param d_step: (int) number of steps to train discriminator in each epoch
    :param task_name: (str) the name of the task (can be None)
    :param d_stepsize: (float) the reward giver stepsize
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param kwargs: extra keyword arguments forwarded unchanged to the TRPO constructor; the options
        previously documented for this class fall in this group:

        - gamma: (float) the discount value
        - timesteps_per_batch: (int) the number of timesteps to run per batch (horizon)
        - max_kl: (float) the Kullback-Leibler loss threshold
        - cg_iters: (int) the number of iterations for the conjugate gradient calculation
        - lam: (float) GAE factor
        - entcoeff: (float) the weight for the entropy loss
        - cg_damping: (float) the compute gradient dampening factor
        - vf_stepsize: (float) the value function stepsize
        - vf_iters: (int) the value function's number iterations for learning
    """

    def __init__(self, policy, env, pretrained_weight=False, hidden_size_adversary=100, adversary_entcoeff=1e-3,
                 expert_dataset=None, save_per_iter=1, checkpoint_dir="/tmp/gail/ckpt/", g_step=1, d_step=1,
                 task_name="task_name", d_stepsize=3e-4, verbose=0, _init_setup_model=True, **kwargs):
        super().__init__(policy=policy, env=env, requires_vec_env=False, verbose=verbose)

        # the wrapped TRPO model is never built here; setup_model() below triggers it if requested
        self.trpo = TRPO(policy, env, verbose=verbose, _init_setup_model=False, **kwargs)
        # GAIL specific configuration lives on the TRPO instance
        self.trpo.using_gail = True
        self.trpo.pretrained_weight = pretrained_weight
        self.trpo.expert_dataset = expert_dataset
        self.trpo.save_per_iter = save_per_iter
        self.trpo.checkpoint_dir = checkpoint_dir
        self.trpo.g_step = g_step
        self.trpo.d_step = d_step
        self.trpo.task_name = task_name
        self.trpo.d_stepsize = d_stepsize
        self.trpo.hidden_size_adversary = hidden_size_adversary
        self.trpo.adversary_entcoeff = adversary_entcoeff

        if _init_setup_model:
            self.setup_model()

    def set_env(self, env):
        """
        Set the environment on both this wrapper and the wrapped TRPO model

        :param env: (Gym Environment) the new environment
        """
        super().set_env(env)
        self.trpo.set_env(env)

    def setup_model(self):
        """
        Build the wrapped TRPO model, after asserting that the action space is a gym.spaces.Box
        """
        assert isinstance(self.action_space, gym.spaces.Box), "Error: GAIL requires a continuous action space."

        self.trpo.setup_model()

    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100):
        """
        Train the wrapped TRPO model; the arguments are passed on positionally

        :param total_timesteps: forwarded to TRPO.learn
        :param callback: forwarded to TRPO.learn
        :param seed: forwarded to TRPO.learn
        :param log_interval: forwarded to TRPO.learn
        :return: (GAIL) this instance
        """
        self.trpo.learn(total_timesteps, callback, seed, log_interval)
        return self

    def predict(self, observation, state=None, mask=None):
        """
        Return the result of TRPO.predict for the given observation, state and mask
        """
        return self.trpo.predict(observation, state, mask)

    def action_probability(self, observation, state=None, mask=None):
        """
        Return the result of TRPO.action_probability for the given observation, state and mask
        """
        return self.trpo.action_probability(observation, state, mask)

    def save(self, save_path):
        """
        Save the wrapped TRPO model

        :param save_path: forwarded to TRPO.save
        """
        self.trpo.save(save_path)

    @classmethod
    def load(cls, load_path, env=None, **kwargs):
        """
        Load a saved model

        :param load_path: passed to `_load_from_file`, which returns the saved attributes and parameters
        :param env: the environment set on the loaded model through `set_env`
        :param kwargs: attribute overrides, applied to the wrapped TRPO model after the saved attributes
        :return: (GAIL) the loaded model
        """
        data, params = cls._load_from_file(load_path)

        # build an empty wrapper first; the model is only set up once the saved attributes are in place
        model = cls(policy=data["policy"], env=None, _init_setup_model=False)
        model.trpo.__dict__.update(data)
        # kwargs are applied second so that they take precedence over the saved values
        model.trpo.__dict__.update(kwargs)
        model.set_env(env)
        model.setup_model()

        # assign each loaded value to the matching model parameter (paired by position)
        restores = []
        for param, loaded_p in zip(model.trpo.params, params):
            restores.append(param.assign(loaded_p))
        model.trpo.sess.run(restores)

        return model
stable_baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png diff --git a/baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png b/stable_baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png similarity index 100% rename from baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png rename to stable_baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png diff --git a/baselines/gail/result/Hopper-normalized-deterministic-scores.png b/stable_baselines/gail/result/Hopper-normalized-deterministic-scores.png similarity index 100% rename from baselines/gail/result/Hopper-normalized-deterministic-scores.png rename to stable_baselines/gail/result/Hopper-normalized-deterministic-scores.png diff --git a/baselines/gail/result/Hopper-normalized-stochastic-scores.png b/stable_baselines/gail/result/Hopper-normalized-stochastic-scores.png similarity index 100% rename from baselines/gail/result/Hopper-normalized-stochastic-scores.png rename to stable_baselines/gail/result/Hopper-normalized-stochastic-scores.png diff --git a/baselines/gail/result/Hopper-unnormalized-deterministic-scores.png b/stable_baselines/gail/result/Hopper-unnormalized-deterministic-scores.png similarity index 100% rename from baselines/gail/result/Hopper-unnormalized-deterministic-scores.png rename to stable_baselines/gail/result/Hopper-unnormalized-deterministic-scores.png diff --git a/baselines/gail/result/Hopper-unnormalized-stochastic-scores.png b/stable_baselines/gail/result/Hopper-unnormalized-stochastic-scores.png similarity index 100% rename from baselines/gail/result/Hopper-unnormalized-stochastic-scores.png rename to stable_baselines/gail/result/Hopper-unnormalized-stochastic-scores.png diff --git a/baselines/gail/result/Humanoid-normalized-deterministic-scores.png b/stable_baselines/gail/result/Humanoid-normalized-deterministic-scores.png similarity index 100% rename from 
baselines/gail/result/Humanoid-normalized-deterministic-scores.png rename to stable_baselines/gail/result/Humanoid-normalized-deterministic-scores.png diff --git a/baselines/gail/result/Humanoid-normalized-stochastic-scores.png b/stable_baselines/gail/result/Humanoid-normalized-stochastic-scores.png similarity index 100% rename from baselines/gail/result/Humanoid-normalized-stochastic-scores.png rename to stable_baselines/gail/result/Humanoid-normalized-stochastic-scores.png diff --git a/baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png b/stable_baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png similarity index 100% rename from baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png rename to stable_baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png diff --git a/baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png b/stable_baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png similarity index 100% rename from baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png rename to stable_baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png diff --git a/baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png b/stable_baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png similarity index 100% rename from baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png rename to stable_baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png diff --git a/baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png b/stable_baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png similarity index 100% rename from baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png rename to stable_baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png diff --git a/baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png 
b/stable_baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png similarity index 100% rename from baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png rename to stable_baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png diff --git a/baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png b/stable_baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png similarity index 100% rename from baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png rename to stable_baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png diff --git a/baselines/gail/result/Walker2d-normalized-deterministic-scores.png b/stable_baselines/gail/result/Walker2d-normalized-deterministic-scores.png similarity index 100% rename from baselines/gail/result/Walker2d-normalized-deterministic-scores.png rename to stable_baselines/gail/result/Walker2d-normalized-deterministic-scores.png diff --git a/baselines/gail/result/Walker2d-normalized-stochastic-scores.png b/stable_baselines/gail/result/Walker2d-normalized-stochastic-scores.png similarity index 100% rename from baselines/gail/result/Walker2d-normalized-stochastic-scores.png rename to stable_baselines/gail/result/Walker2d-normalized-stochastic-scores.png diff --git a/baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png b/stable_baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png similarity index 100% rename from baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png rename to stable_baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png diff --git a/baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png b/stable_baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png similarity index 100% rename from baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png rename to 
stable_baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png diff --git a/baselines/gail/result/gail-result.md b/stable_baselines/gail/result/gail-result.md similarity index 100% rename from baselines/gail/result/gail-result.md rename to stable_baselines/gail/result/gail-result.md diff --git a/baselines/gail/result/halfcheetah-training.png b/stable_baselines/gail/result/halfcheetah-training.png similarity index 100% rename from baselines/gail/result/halfcheetah-training.png rename to stable_baselines/gail/result/halfcheetah-training.png diff --git a/baselines/gail/result/hopper-training.png b/stable_baselines/gail/result/hopper-training.png similarity index 100% rename from baselines/gail/result/hopper-training.png rename to stable_baselines/gail/result/hopper-training.png diff --git a/baselines/gail/result/humanoid-training.png b/stable_baselines/gail/result/humanoid-training.png similarity index 100% rename from baselines/gail/result/humanoid-training.png rename to stable_baselines/gail/result/humanoid-training.png diff --git a/baselines/gail/result/humanoidstandup-training.png b/stable_baselines/gail/result/humanoidstandup-training.png similarity index 100% rename from baselines/gail/result/humanoidstandup-training.png rename to stable_baselines/gail/result/humanoidstandup-training.png diff --git a/baselines/gail/result/walker2d-training.png b/stable_baselines/gail/result/walker2d-training.png similarity index 100% rename from baselines/gail/result/walker2d-training.png rename to stable_baselines/gail/result/walker2d-training.png diff --git a/stable_baselines/gail/run_mujoco.py b/stable_baselines/gail/run_mujoco.py new file mode 100644 index 0000000000..4d460f77b6 --- /dev/null +++ b/stable_baselines/gail/run_mujoco.py @@ -0,0 +1,281 @@ +""" +Disclaimer: this code is highly based on trpo_mpi at @openai/stable_baselines and @openai/imitation +""" + +import argparse +import os +import logging + +from mpi4py import MPI +from tqdm import tqdm +import 
numpy as np +import gym + +from stable_baselines.gail import mlp_policy, behavior_clone +from stable_baselines.trpo_mpi.trpo_mpi import TRPO +from stable_baselines.common import set_global_seeds, tf_util +from stable_baselines.common.misc_util import boolean_flag +from stable_baselines import bench, logger +from stable_baselines.gail.dataset.mujocodset import MujocoDset +from stable_baselines.gail.adversary import TransitionClassifier + + +def argsparser(): + """ + get an argument parser for training mujoco on gail + + :return: (ArgumentParser) + """ + parser = argparse.ArgumentParser("Tensorflow Implementation of GAIL") + parser.add_argument('--env_id', help='environment ID', default='Hopper-v2') + parser.add_argument('--seed', help='RNG seed', type=int, default=0) + parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz') + parser.add_argument('--checkpoint_dir', help='the directory to save model', default='checkpoint') + parser.add_argument('--log_dir', help='the directory to save log file', default='log') + parser.add_argument('--load_model_path', help='if provided, load the model', type=str, default=None) + # Task + parser.add_argument('--task', type=str, choices=['train', 'evaluate', 'sample'], default='train') + # for evaluatation + boolean_flag(parser, 'stochastic_policy', default=False, help_msg='use stochastic/deterministic policy to evaluate') + boolean_flag(parser, 'save_sample', default=False, help_msg='save the trajectories or not') + # Mujoco Dataset Configuration + parser.add_argument('--traj_limitation', type=int, default=-1) + # Optimization Configuration + parser.add_argument('--g_step', help='number of steps to train policy in each epoch', type=int, default=3) + parser.add_argument('--d_step', help='number of steps to train discriminator in each epoch', type=int, default=1) + # Network Configuration (Using MLP Policy) + parser.add_argument('--policy_hidden_size', type=int, default=100) + 
parser.add_argument('--adversary_hidden_size', type=int, default=100) + # Algorithms Configuration + parser.add_argument('--algo', type=str, choices=['trpo', 'ppo'], default='trpo') + parser.add_argument('--max_kl', type=float, default=0.01) + parser.add_argument('--policy_entcoeff', help='entropy coefficiency of policy', type=float, default=0) + parser.add_argument('--adversary_entcoeff', help='entropy coefficiency of discriminator', type=float, default=1e-3) + # Traing Configuration + parser.add_argument('--save_per_iter', help='save model every xx iterations', type=int, default=100) + parser.add_argument('--num_timesteps', help='number of timesteps per episode', type=int, default=5e6) + # Behavior Cloning + boolean_flag(parser, 'pretrained', default=False, help_msg='Use BC to pretrain') + parser.add_argument('--bc_max_iter', help='Max iteration for training BC', type=int, default=1e4) + return parser.parse_args() + + +def get_task_name(args): + """ + get the task name + + :param args: (ArgumentParser) the training argument + :return: (str) the task name + """ + task_name = args.algo + "_gail." + if args.pretrained: + task_name += "with_pretrained." + if args.traj_limitation != np.inf: + task_name += "transition_limitation_%d." 
def main(args):
    """
    Create the environment and either train GAIL or evaluate a saved model, depending on ``args.task``.

    :param args: (argparse.Namespace) the training arguments
    :raises NotImplementedError: if ``args.task`` is neither 'train' nor 'evaluate'
    """
    with tf_util.make_session(num_cpu=1):
        set_global_seeds(args.seed)
        env = gym.make(args.env_id)

        # The environment is closed in the `finally` clause so that a failure during
        # training/evaluation (or an unsupported task) does not leak it.
        try:
            def policy_fn(name, ob_space, ac_space, reuse=False, placeholders=None, sess=None):
                return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse, sess=sess,
                                            hid_size=args.policy_hidden_size, num_hid_layers=2,
                                            placeholders=placeholders)

            # Monitor only writes to disk when the logger has a directory configured.
            env = bench.Monitor(env, logger.get_dir() and
                                os.path.join(logger.get_dir(), "monitor.json"))
            env.seed(args.seed)
            gym.logger.setLevel(logging.WARN)
            task_name = get_task_name(args)
            args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name)
            args.log_dir = os.path.join(args.log_dir, task_name)

            if args.task == 'train':
                dataset = MujocoDset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
                reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
                train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step,
                      args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
                      args.pretrained, args.bc_max_iter, task_name)
            elif args.task == 'evaluate':
                runner(env,
                       policy_fn,
                       args.load_model_path,
                       timesteps_per_batch=1024,
                       number_trajs=10,
                       stochastic_policy=args.stochastic_policy,
                       save=args.save_sample
                       )
            else:
                raise NotImplementedError
        finally:
            env.close()
def train(env, seed, policy_fn, reward_giver, dataset, algo, g_step, d_step, policy_entcoeff, num_timesteps,
          save_per_iter, checkpoint_dir, pretrained, bc_max_iter, task_name=None):
    """
    Train GAIL on a MuJoCo environment.

    :param env: (Gym Environment) the environment
    :param seed: (int) the initial random seed
    :param policy_fn: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator
    :param reward_giver: (TransitionClassifier) the reward predictor from observation and action
    :param dataset: (MujocoDset) the dataset manager
    :param algo: (str) the algorithm type (only 'trpo' is supported)
    :param g_step: (int) number of steps to train policy in each epoch
    :param d_step: (int) number of steps to train discriminator in each epoch
    :param policy_entcoeff: (float) the weight of the entropy loss for the policy
    :param num_timesteps: (int) the number of timesteps to run
    :param save_per_iter: (int) the number of iterations before saving
    :param checkpoint_dir: (str) the location for saving checkpoints
    :param pretrained: (bool) use a pretrained behavior clone
    :param bc_max_iter: (int) the maximum number of training iterations for the behavior clone
    :param task_name: (str) the name of the task (can be None)
    """
    # Optional warm start with behavior cloning; this happens before the algorithm is checked.
    if pretrained and (bc_max_iter > 0):
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset, max_iters=bc_max_iter)
    else:
        pretrained_weight = None

    if algo != 'trpo':
        raise NotImplementedError

    # Per-process seeding: only rank 0 keeps logging enabled.
    comm = MPI.COMM_WORLD
    if comm.Get_rank() != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * comm.Get_rank()
    set_global_seeds(workerseed)
    env.seed(workerseed)

    model = TRPO(policy_fn, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, gamma=0.995, lam=0.97,
                 entcoeff=policy_entcoeff, cg_damping=0.1, vf_stepsize=1e-3, vf_iters=5, _init_setup_model=False)

    # GAIL-specific attributes must be in place before the model graph is built.
    gail_attributes = (
        ('pretrained_weight', pretrained_weight),
        ('reward_giver', reward_giver),
        ('expert_dataset', dataset),
        ('save_per_iter', save_per_iter),
        ('checkpoint_dir', checkpoint_dir),
        ('g_step', g_step),
        ('d_step', d_step),
        ('task_name', task_name),
        ('using_gail', True),
    )
    for attribute, value in gail_attributes:
        setattr(model, attribute, value)
    model.setup_model()

    model.learn(total_timesteps=num_timesteps)
def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs,
           stochastic_policy, save=False, reuse=False):
    """
    Load a saved policy, roll out a number of trajectories and report their average length and return.

    :param env: (Gym Environment) the environment
    :param policy_func: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator
    :param load_model_path: (str) the path to the model
    :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon)
    :param number_trajs: (int) the number of trajectories to run
    :param stochastic_policy: (bool) use a stochastic policy
    :param save: (bool) save the sampled trajectories to a .npz file
    :param reuse: (bool) allow reuse of the graph
    :return: (float, float) average trajectory length, average trajectory return
    """
    # Build the network, then restore its weights from disk.
    policy = policy_func("pi", env.observation_space, env.action_space, reuse=reuse)
    tf_util.initialize()
    tf_util.load_state(load_model_path)

    # Roll out the requested number of trajectories.
    trajectories = [traj_1_generator(policy, env, timesteps_per_batch, stochastic=stochastic_policy)
                    for _ in tqdm(range(number_trajs))]
    obs_list = [traj['ob'] for traj in trajectories]
    acs_list = [traj['ac'] for traj in trajectories]
    len_list = [traj['ep_len'] for traj in trajectories]
    ret_list = [traj['ep_ret'] for traj in trajectories]

    print('stochastic policy:' if stochastic_policy else 'deterministic policy:')
    if save:
        filename = load_model_path.split('/')[-1] + '.' + env.spec.id
        np.savez(filename, obs=np.array(obs_list), acs=np.array(acs_list),
                 lens=np.array(len_list), rets=np.array(ret_list))

    avg_len = sum(len_list) / len(len_list)
    avg_ret = sum(ret_list) / len(ret_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    return avg_len, avg_ret
def traj_1_generator(policy, env, horizon, stochastic):
    """
    Sample one trajectory, stopping at the end of the episode or after ``horizon`` steps.

    :param policy: (MLPPolicy) the policy; ``policy.act(stochastic, observation)`` must return (action, _)
    :param env: (Gym Environment) the environment
    :param horizon: (int) the maximum number of environment steps to take
    :param stochastic: (bool) use a stochastic policy
    :return: (dict) the trajectory, with keys "ob", "rew", "new", "ac" (numpy arrays),
        "ep_ret" (summed reward) and "ep_len" (number of steps taken)
    """
    env.action_space.sample()  # not used, just so we have the datatype
    new = True  # marks if we're on first timestep of an episode

    observation = env.reset()
    cur_ep_ret = 0  # return in current episode
    cur_ep_len = 0  # len of current episode

    # Initialize history arrays
    observations = []
    rewards = []
    news = []
    actions = []

    while True:
        action, _ = policy.act(stochastic, observation)
        observations.append(observation)
        news.append(new)
        actions.append(action)

        observation, reward, new, _ = env.step(action)
        rewards.append(reward)

        cur_ep_ret += reward
        cur_ep_len += 1
        # Compare the number of steps already taken against the horizon; a counter that is
        # incremented only after this check lets the loop run for horizon + 1 steps.
        if new or cur_ep_len >= horizon:
            break

    return {"ob": np.array(observations), "rew": np.array(rewards), "new": np.array(news),
            "ac": np.array(actions), "ep_ret": cur_ep_ret, "ep_len": cur_ep_len}
class Stats:
    """
    Placeholders and a merged summary op for logging named scalar and histogram values.
    """

    def __init__(self, scalar_keys=None, histogram_keys=None):
        """
        initialize the placeholders from the input keys, for summary logging

        :param scalar_keys: ([str]) the name of all the scalar inputs
        :param histogram_keys: ([str]) the name of all the histogram inputs
        """
        if scalar_keys is None:
            scalar_keys = []
        if histogram_keys is None:
            histogram_keys = []
        self.scalar_keys = scalar_keys
        self.histogram_keys = histogram_keys
        self.scalar_summaries = []
        self.scalar_summaries_ph = []
        self.histogram_summaries_ph = []
        self.histogram_summaries = []
        with tf.variable_scope('summary'):
            for key in scalar_keys:
                place_holder = tf.placeholder('float32', None, name=key + '.scalar.summary')
                string_summary = tf.summary.scalar(key + '.scalar.summary', place_holder)
                self.scalar_summaries_ph.append(place_holder)
                self.scalar_summaries.append(string_summary)
            for key in histogram_keys:
                place_holder = tf.placeholder('float32', None, name=key + '.histogram.summary')
                # Histogram inputs need a histogram summary; a scalar summary would reject
                # any non-scalar value fed for these keys.
                string_summary = tf.summary.histogram(key + '.histogram.summary', place_holder)
                self.histogram_summaries_ph.append(place_holder)
                self.histogram_summaries.append(string_summary)

        self.summaries = tf.summary.merge(self.scalar_summaries + self.histogram_summaries)

    def add_all_summary(self, writer, values, _iter):
        """
        Evaluate the merged summary with the given values and write it out.

        Note that the order of the incoming ```values``` should be the same as the that of the
        ```scalar_keys``` given in ```__init__``` (followed by the ```histogram_keys```).
        Nothing is written if any value contains a NaN.

        :param writer: (TensorFlow FileWriter) the writer
        :param values: ([numpy Number or numpy array]) the inputs for the summary run, one per key
        :param _iter: (Number) the global step value
        """
        # Checked per value so that scalars and histogram arrays of different shapes can be mixed.
        if any(np.isnan(value).any() for value in values):
            return
        sess = tf_util.get_session()
        keys = self.scalar_summaries_ph + self.histogram_summaries_ph
        feed_dict = dict(zip(keys, values))
        summaries_str = sess.run(self.summaries, feed_dict)
        writer.add_summary(summaries_str, _iter)
class ActorCritic:
    def __init__(self, inputs_tf, dim_obs, dim_goal, dim_action,
                 max_u, o_stats, g_stats, hidden, layers, **kwargs):
        """The actor-critic network and related training code.

        :param inputs_tf: ({str: TensorFlow Tensor}) all necessary inputs for the network: the
            observation (o), the goal (g), and the action (u)
        :param dim_obs: (int) the dimension of the observations
        :param dim_goal: (int) the dimension of the goals
        :param dim_action: (int) the dimension of the actions
        :param max_u: (float) the maximum magnitude of actions; action outputs will be scaled accordingly
        :param o_stats: (stable_baselines.her.Normalizer) normalizer for observations
        :param g_stats: (stable_baselines.her.Normalizer) normalizer for goals
        :param hidden: (int) number of hidden units that should be used in hidden layers
        :param layers: (int) number of hidden layers
        :param kwargs: (dict) extra keyword arguments, accepted and ignored
        """
        self.inputs_tf = inputs_tf
        self.dim_obs = dim_obs
        self.dim_goal = dim_goal
        self.dim_action = dim_action
        self.max_u = max_u
        self.o_stats = o_stats
        self.g_stats = g_stats
        self.hidden = hidden
        self.layers = layers

        self.o_tf = inputs_tf['o']
        self.g_tf = inputs_tf['g']
        self.u_tf = inputs_tf['u']

        # Prepare inputs for actor and critic.
        obs = self.o_stats.normalize(self.o_tf)
        goals = self.g_stats.normalize(self.g_tf)
        input_pi = tf.concat(axis=1, values=[obs, goals])  # for actor

        # Networks.
        with tf.variable_scope('pi'):
            # The output layer has one unit per action dimension (`self.dim_action`; the
            # attribute `dimu` is never set on this class) and is squashed to [-max_u, max_u].
            self.pi_tf = self.max_u * tf.tanh(mlp(
                input_pi, [self.hidden] * self.layers + [self.dim_action]))
        with tf.variable_scope('Q'):
            # for policy training
            input_q = tf.concat(axis=1, values=[obs, goals, self.pi_tf / self.max_u])
            self.q_pi_tf = mlp(input_q, [self.hidden] * self.layers + [1])
            # for critic training
            input_q = tf.concat(axis=1, values=[obs, goals, self.u_tf / self.max_u])
            self._input_q = input_q  # exposed for tests
            self.q_tf = mlp(input_q, [self.hidden] * self.layers + [1], reuse=True)
def dims_to_shapes(input_dims):
    """
    Convert a mapping of dimensions into a mapping of shapes.

    :param input_dims: ({str: int}) the dimension of each input
    :return: ({str: tuple}) a 1-tuple holding the dimension when it is positive, an empty tuple otherwise
    """
    shapes = {}
    for name, dim in input_dims.items():
        shapes[name] = (dim,) if dim > 0 else ()
    return shapes
+ + :param input_dims: ({str: int}) dimensions for the observation (o), the goal (g), and the actions (u) + :param buffer_size: (int) number of transitions that are stored in the replay buffer + :param hidden: (int) number of units in the hidden layers + :param layers: (int) number of hidden layers + :param network_class: (str) the network class that should be used (e.g. 'stable_baselines.her.ActorCritic') + :param polyak: (float) coefficient for Polyak-averaging of the target network + :param batch_size: (int) batch size for training + :param q_lr: (float) learning rate for the Q (critic) network + :param pi_lr: (float) learning rate for the pi (actor) network + :param norm_eps: (float) a small value used in the normalizer to avoid numerical instabilities + :param norm_clip: (float) normalized inputs are clipped to be in [-norm_clip, norm_clip] + :param max_u: (float) maximum action magnitude, i.e. actions are in [-max_u, max_u] + :param action_l2: (float) coefficient for L2 penalty on the actions + :param clip_obs: (float) clip observations before normalization to be in [-clip_obs, clip_obs] + :param scope: (str) the scope used for the TensorFlow graph + :param time_horizon: (int) the time horizon for rollouts + :param rollout_batch_size: (int) number of parallel rollouts per DDPG agent + :param subtract_goals: (function (numpy Number, numpy Number): numpy Number) function that subtracts goals + from each other + :param relative_goals: (boolean) whether or not relative goals should be fed into the network + :param clip_pos_returns: (boolean) whether or not positive returns should be clipped + :param clip_return: (float) clip returns to be in [-clip_return, clip_return] + :param sample_transitions: (function (dict, int): dict) function that samples from the replay buffer + :param gamma: (float) gamma used for Q learning updates + :param reuse: (boolean) whether or not the networks should be reused + """ + # Updated in experiments/config.py + self.input_dims = 
input_dims + self.buffer_size = buffer_size + self.hidden = hidden + self.layers = layers + self.network_class = network_class + self.polyak = polyak + self.batch_size = batch_size + self.q_lr = q_lr + self.pi_lr = pi_lr + self.norm_eps = norm_eps + self.norm_clip = norm_clip + self.max_u = max_u + self.action_l2 = action_l2 + self.clip_obs = clip_obs + self.scope = scope + self.time_horizon = time_horizon + self.rollout_batch_size = rollout_batch_size + self.subtract_goals = subtract_goals + self.relative_goals = relative_goals + self.clip_pos_returns = clip_pos_returns + self.clip_return = clip_return + self.sample_transitions = sample_transitions + self.gamma = gamma + self.reuse = reuse + + if self.clip_return is None: + self.clip_return = np.inf + + self.create_actor_critic = import_function(self.network_class) + + input_shapes = dims_to_shapes(self.input_dims) + self.dim_obs = self.input_dims['o'] + self.dim_goal = self.input_dims['g'] + self.dim_action = self.input_dims['u'] + + # Prepare staging area for feeding data to the model. + stage_shapes = OrderedDict() + for key in sorted(self.input_dims.keys()): + if key.startswith('info_'): + continue + stage_shapes[key] = (None, *input_shapes[key]) + for key in ['o', 'g']: + stage_shapes[key + '_2'] = stage_shapes[key] + stage_shapes['r'] = (None,) + self.stage_shapes = stage_shapes + + # Create network. + with tf.variable_scope(self.scope): + self.staging_tf = StagingArea( + dtypes=[tf.float32 for _ in self.stage_shapes.keys()], + shapes=list(self.stage_shapes.values())) + self.buffer_ph_tf = [ + tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()] + self.stage_op = self.staging_tf.put(self.buffer_ph_tf) + + self._create_network(reuse=reuse) + + # Configure the replay buffer. 
+ buffer_shapes = {key: (self.time_horizon if key != 'o' else self.time_horizon + 1, *input_shapes[key]) + for key, val in input_shapes.items()} + buffer_shapes['g'] = (buffer_shapes['g'][0], self.dim_goal) + buffer_shapes['ag'] = (self.time_horizon + 1, self.dim_goal) + + buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size + self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.time_horizon, self.sample_transitions) + + def _random_action(self, num): + return np.random.uniform(low=-self.max_u, high=self.max_u, size=(num, self.dim_action)) + + def _preprocess_obs_goal(self, obs, achieved_goal, goal): + if self.relative_goals: + g_shape = goal.shape + goal = goal.reshape(-1, self.dim_goal) + achieved_goal = achieved_goal.reshape(-1, self.dim_goal) + goal = self.subtract_goals(goal, achieved_goal) + goal = goal.reshape(*g_shape) + obs = np.clip(obs, -self.clip_obs, self.clip_obs) + goal = np.clip(goal, -self.clip_obs, self.clip_obs) + return obs, goal + + def get_actions(self, obs, achieved_goal, goal, noise_eps=0., random_eps=0., use_target_net=False, compute_q=False): + """ + return the action from an observation and goal + + :param obs: (numpy Number) the observation + :param achieved_goal: (numpy Number) the achieved goal + :param goal: (numpy Number) the goal + :param noise_eps: (float) the noise epsilon + :param random_eps: (float) the random epsilon + :param use_target_net: (bool) whether or not to use the target network + :param compute_q: (bool) whether or not to compute Q value + :return: (numpy float or float) the actions + """ + obs, goal = self._preprocess_obs_goal(obs, achieved_goal, goal) + policy = self.target if use_target_net else self.main + # values to compute + vals = [policy.pi_tf] + if compute_q: + vals += [policy.q_pi_tf] + # feed + feed = { + policy.o_tf: obs.reshape(-1, self.dim_obs), + policy.g_tf: goal.reshape(-1, self.dim_goal), + policy.u_tf: np.zeros((obs.size // self.dim_obs, self.dim_action), 
dtype=np.float32) + } + + ret = self.sess.run(vals, feed_dict=feed) + # action postprocessing + action = ret[0] + noise = noise_eps * self.max_u * np.random.randn(*action.shape) # gaussian noise + action += noise + action = np.clip(action, -self.max_u, self.max_u) + # eps-greedy + n_ac = action.shape[0] + action += np.random.binomial(1, random_eps, n_ac).reshape(-1, 1) * (self._random_action(n_ac) - action) + if action.shape[0] == 1: + action = action[0] + action = action.copy() + ret[0] = action + + if len(ret) == 1: + return ret[0] + else: + return ret + + def store_episode(self, episode_batch, update_stats=True): + """ + Story the episode transitions + + :param episode_batch: (numpy Number) array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, + others are of size T + :param update_stats: (bool) whether to update stats or not + """ + + self.buffer.store_episode(episode_batch) + + if update_stats: + # add transitions to normalizer + episode_batch['o_2'] = episode_batch['o'][:, 1:, :] + episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] + num_normalizing_transitions = transitions_in_episode_batch(episode_batch) + transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) + + obs, _, goal, achieved_goal = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] + transitions['o'], transitions['g'] = self._preprocess_obs_goal(obs, achieved_goal, goal) + # No need to preprocess the o_2 and g_2 since this is only used for stats + + self.o_stats.update(transitions['o']) + self.g_stats.update(transitions['g']) + + self.o_stats.recompute_stats() + self.g_stats.recompute_stats() + + def get_current_buffer_size(self): + """ + returns the current buffer size + + :return: (int) buffer size + """ + return self.buffer.get_current_size() + + def _sync_optimizers(self): + self.q_adam.sync() + self.pi_adam.sync() + + def _grads(self): + # Avoid feed_dict here for performance! 
def sample_batch(self):
    """
    Sample a batch of transitions from the replay buffer and preprocess it.

    :return: (list) the sampled values, ordered like the keys of ``self.stage_shapes``
    """
    batch = self.buffer.sample(self.batch_size)
    # Both the current and the next step are preprocessed against the goal as sampled.
    raw_goal = batch['g']
    batch['o'], batch['g'] = self._preprocess_obs_goal(batch['o'], batch['ag'], raw_goal)
    batch['o_2'], batch['g_2'] = self._preprocess_obs_goal(batch['o_2'], batch['ag_2'], raw_goal)
    return [batch[key] for key in self.stage_shapes]
_global_vars(self, scope): + res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) + return res + + def _create_network(self, reuse=False): + logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dim_action, self.max_u)) + + self.sess = tf.get_default_session() + if self.sess is None: + self.sess = tf.InteractiveSession() + + # running averages + with tf.variable_scope('o_stats') as scope: + if reuse: + scope.reuse_variables() + self.o_stats = Normalizer(self.dim_obs, self.norm_eps, self.norm_clip, sess=self.sess) + with tf.variable_scope('g_stats') as scope: + if reuse: + scope.reuse_variables() + self.g_stats = Normalizer(self.dim_goal, self.norm_eps, self.norm_clip, sess=self.sess) + + # mini-batch sampling. + batch = self.staging_tf.get() + batch_tf = OrderedDict([(key, batch[i]) + for i, key in enumerate(self.stage_shapes.keys())]) + batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) + + # networks + with tf.variable_scope('main') as scope: + if reuse: + scope.reuse_variables() + self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) + scope.reuse_variables() + with tf.variable_scope('target') as scope: + if reuse: + scope.reuse_variables() + target_batch_tf = batch_tf.copy() + target_batch_tf['o'] = batch_tf['o_2'] + target_batch_tf['g'] = batch_tf['g_2'] + self.target = self.create_actor_critic( + target_batch_tf, net_type='target', **self.__dict__) + scope.reuse_variables() + assert len(self._vars("main")) == len(self._vars("target")) + + # loss functions + target_q_pi_tf = self.target.q_pi_tf + clip_range = (-self.clip_return, 0. 
if self.clip_pos_returns else np.inf) + target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_q_pi_tf, *clip_range) + + self.q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.q_tf)) + self.pi_loss_tf = -tf.reduce_mean(self.main.q_pi_tf) + self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) + + q_grads_tf = tf.gradients(self.q_loss_tf, self._vars('main/Q')) + pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) + + assert len(self._vars('main/Q')) == len(q_grads_tf) + assert len(self._vars('main/pi')) == len(pi_grads_tf) + + self.q_grads_vars_tf = zip(q_grads_tf, self._vars('main/Q')) + self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) + self.q_grad_tf = flatten_grads(grads=q_grads_tf, var_list=self._vars('main/Q')) + self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) + + # optimizers + self.q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) + self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) + + # polyak averaging + self.main_vars = self._vars('main/Q') + self._vars('main/pi') + self.target_vars = self._vars('target/Q') + self._vars('target/pi') + self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') + self.init_target_net_op = list( + map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) + self.update_target_net_op = list( + map(lambda v: v[0].assign(self.polyak * v[0] + (1. 
def logs(self, prefix=''):
    """
    Collect the mean of the observation and goal normalizer statistics for logging.

    :param prefix: (str) the prefix for every key; applied as ``prefix + '/' + key``
    :return: ([(str, float)]) the log entries as (key, value) pairs
    """
    tracked = (
        ('stats_o/mean', self.o_stats.mean),
        ('stats_o/std', self.o_stats.std),
        ('stats_g/mean', self.g_stats.mean),
        ('stats_g/std', self.g_stats.std),
    )
    logs = [(key, np.mean(self.sess.run([tensor]))) for key, tensor in tracked]

    # Compare by value: `is not ''` tests object identity and only works by accident of
    # string interning.
    if prefix != '' and not prefix.endswith('/'):
        return [(prefix + '/' + key, val) for key, val in logs]
    # NOTE(review): a prefix that already ends with '/' is not applied at all; kept as-is
    # because callers may rely on it -- confirm whether that is intended.
    return logs
+ state['sample_transitions'] = None + + self.__init__(**state) + # set up stats (they are overwritten in __init__) + for key, value in state.items(): + if key[-6:] == '_stats': + self.__dict__[key] = value + # load TF variables + _vars = [x for x in self._global_vars('') if 'buffer' not in x.name] + assert len(_vars) == len(state["tf"]) + node = [tf.assign(var, val) for var, val in zip(_vars, state["tf"])] + self.sess.run(node) diff --git a/baselines/acer/__init__.py b/stable_baselines/her/experiment/__init__.py similarity index 100% rename from baselines/acer/__init__.py rename to stable_baselines/her/experiment/__init__.py diff --git a/baselines/her/experiment/config.py b/stable_baselines/her/experiment/config.py similarity index 69% rename from baselines/her/experiment/config.py rename to stable_baselines/her/experiment/config.py index cf29ca52b8..529cf3e393 100644 --- a/baselines/her/experiment/config.py +++ b/stable_baselines/her/experiment/config.py @@ -1,9 +1,9 @@ import numpy as np import gym -from baselines import logger -from baselines.her.ddpg import DDPG -from baselines.her.her import make_sample_her_transitions +from stable_baselines import logger +from stable_baselines.her.ddpg import DDPG +from stable_baselines.her.her import make_sample_her_transitions DEFAULT_ENV_PARAMS = { @@ -19,8 +19,8 @@ # ddpg 'layers': 3, # number of layers in the critic/actor networks 'hidden': 256, # number of neurons in each hidden layers - 'network_class': 'baselines.her.actor_critic:ActorCritic', - 'Q_lr': 0.001, # critic learning rate + 'network_class': 'stable_baselines.her.actor_critic:ActorCritic', + 'q_lr': 0.001, # critic learning rate 'pi_lr': 0.001, # actor learning rate 'buffer_size': int(1E6), # for experience replay 'polyak': 0.95, # polyak averaging coefficient @@ -55,6 +55,9 @@ def cached_make_env(make_env): Only creates a new environment from the provided function if one has not yet already been created. 
This is useful here because we need to infer certain properties of the env, e.g. its observation and action spaces, without any intend of actually using it. + + :param make_env: (function (): Gym Environment) creates the environment + :return: (Gym Environment) the created environment """ if make_env not in CACHED_ENVS: env = make_env() @@ -63,6 +66,12 @@ def cached_make_env(make_env): def prepare_params(kwargs): + """ + prepares DDPG params from kwargs + + :param kwargs: (dict) the input kwargs + :return: (dict) DDPG parameters + """ # DDPG params ddpg_params = dict() @@ -73,18 +82,18 @@ def make_env(): kwargs['make_env'] = make_env tmp_env = cached_make_env(kwargs['make_env']) assert hasattr(tmp_env, '_max_episode_steps') - kwargs['T'] = tmp_env._max_episode_steps + kwargs['time_horizon'] = tmp_env.spec.max_episode_steps # wrapped envs preserve their spec tmp_env.reset() kwargs['max_u'] = np.array(kwargs['max_u']) if isinstance(kwargs['max_u'], list) else kwargs['max_u'] - kwargs['gamma'] = 1. - 1. / kwargs['T'] + kwargs['gamma'] = 1. - 1. 
/ kwargs['time_horizon'] if 'lr' in kwargs: kwargs['pi_lr'] = kwargs['lr'] - kwargs['Q_lr'] = kwargs['lr'] + kwargs['q_lr'] = kwargs['lr'] del kwargs['lr'] for name in ['buffer_size', 'hidden', 'layers', 'network_class', 'polyak', - 'batch_size', 'Q_lr', 'pi_lr', + 'batch_size', 'q_lr', 'pi_lr', 'norm_eps', 'norm_clip', 'max_u', 'action_l2', 'clip_obs', 'scope', 'relative_goals']: ddpg_params[name] = kwargs[name] @@ -95,17 +104,29 @@ def make_env(): return kwargs -def log_params(params, logger=logger): +def log_params(params, logger_input=logger): + """ + log the parameters + + :param params: (dict) parameters to log + :param logger_input: (logger) the logger + """ for key in sorted(params.keys()): - logger.info('{}: {}'.format(key, params[key])) + logger_input.info('{}: {}'.format(key, params[key])) def configure_her(params): + """ + configure hindsight experience replay + + :param params: (dict) input parameters + :return: (function (dict, int): dict) returns a HER update function for replay buffer batch + """ env = cached_make_env(params['make_env']) env.reset() - def reward_fun(ag_2, g, info): # vectorized - return env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info) + def reward_fun(achieved_goal, goal, info): # vectorized + return env.compute_reward(achieved_goal=achieved_goal, desired_goal=goal, info=info) # Prepare configuration for HER. 
her_params = { @@ -120,12 +141,29 @@ def reward_fun(ag_2, g, info): # vectorized return sample_her_transitions -def simple_goal_subtract(a, b): - assert a.shape == b.shape - return a - b +def simple_goal_subtract(vec_a, vec_b): + """ + checks if a and b have the same shape, and does a - b + + :param vec_a: (numpy array) + :param vec_b: (numpy array) + :return: (numpy array) a - b + """ + assert vec_a.shape == vec_b.shape + return vec_a - vec_b def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True): + """ + configure a DDPG model from parameters + + :param dims: ({str: int}) the dimensions + :param params: (dict) the DDPG parameters + :param reuse: (bool) whether or not the networks should be reused + :param use_mpi: (bool) whether or not to use MPI + :param clip_return: (float) clip returns to be in [-clip_return, clip_return] + :return: (her.DDPG) the ddpg model + """ sample_her_transitions = configure_her(params) # Extract relevant parameters. gamma = params['gamma'] @@ -138,7 +176,7 @@ def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True): env = cached_make_env(params['make_env']) env.reset() ddpg_params.update({'input_dims': input_dims, # agent takes an input observations - 'T': params['T'], + 'time_horizon': params['time_horizon'], 'clip_pos_returns': True, # clip positive returns 'clip_return': (1. / (1. 
- gamma)) if clip_return else np.inf, # max abs of return 'rollout_batch_size': rollout_batch_size, @@ -154,6 +192,12 @@ def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True): def configure_dims(params): + """ + configure input and output dimensions + + :param params: (dict) the parameters + :return: ({str: int}) the dimensions + """ env = cached_make_env(params['make_env']) env.reset() obs, _, _, info = env.step(env.action_space.sample()) diff --git a/baselines/her/experiment/play.py b/stable_baselines/her/experiment/play.py similarity index 65% rename from baselines/her/experiment/play.py rename to stable_baselines/her/experiment/play.py index 5b2f85d2ff..6d01e03ea1 100644 --- a/baselines/her/experiment/play.py +++ b/stable_baselines/her/experiment/play.py @@ -1,11 +1,12 @@ import click -import numpy as np import pickle -from baselines import logger -from baselines.common import set_global_seeds -import baselines.her.experiment.config as config -from baselines.her.rollout import RolloutWorker +import numpy as np + +from stable_baselines import logger +from stable_baselines.common import set_global_seeds +import stable_baselines.her.experiment.config as config +from stable_baselines.her.rollout import RolloutWorker @click.command() @@ -14,11 +15,19 @@ @click.option('--n_test_rollouts', type=int, default=10) @click.option('--render', type=int, default=1) def main(policy_file, seed, n_test_rollouts, render): + """ + run HER from a saved policy + + :param policy_file: (str) pickle path to a saved policy + :param seed: (int) initial seed + :param n_test_rollouts: (int) the number of test rollouts + :param render: (bool) if rendering should be done + """ set_global_seeds(seed) # Load policy. - with open(policy_file, 'rb') as f: - policy = pickle.load(f) + with open(policy_file, 'rb') as file_handler: + policy = pickle.load(file_handler) env_name = policy.info['env_name'] # Prepare params. 
@@ -27,21 +36,21 @@ def main(policy_file, seed, n_test_rollouts, render): params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in params['env_name'] = env_name params = config.prepare_params(params) - config.log_params(params, logger=logger) + config.log_params(params, logger_input=logger) dims = config.configure_dims(params) eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], - 'compute_Q': True, + 'compute_q': True, 'rollout_batch_size': 1, 'render': bool(render), } - for name in ['T', 'gamma', 'noise_eps', 'random_eps']: + for name in ['time_horizon', 'gamma', 'noise_eps', 'random_eps']: eval_params[name] = params[name] - + evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(seed) diff --git a/baselines/her/experiment/plot.py b/stable_baselines/her/experiment/plot.py similarity index 84% rename from baselines/her/experiment/plot.py rename to stable_baselines/her/experiment/plot.py index 560903f82d..e9ee808a2e 100644 --- a/baselines/her/experiment/plot.py +++ b/stable_baselines/her/experiment/plot.py @@ -1,26 +1,42 @@ import os +import json +import argparse + import matplotlib.pyplot as plt import numpy as np -import json -import seaborn as sns; sns.set() +import seaborn as sns import glob2 -import argparse +# Initialize seaborn +sns.set() def smooth_reward_curve(x, y): + """ + smooth the reward curve + + :param x: (numpy float) the x coord of the reward + :param y: (numpy float) the y coord of the reward + :return: (numpy float, numpy float) smoothed x, smoothed y + """ halfwidth = int(np.ceil(len(x) / 60)) # Halfwidth of our smoothing convolution k = halfwidth xsmoo = x ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='same') / np.convolve(np.ones_like(y), np.ones(2 * k + 1), - mode='same') + mode='same') return xsmoo, ysmoo def load_results(file): + """ + load the results from a file + + :param file: (str) the saved results + :return: (dict) the result + """ 
if not os.path.exists(file): return None - with open(file, 'r') as f: - lines = [line for line in f] + with open(file, 'r') as file_handler: + lines = [line for line in file_handler] if len(lines) < 2: return None keys = [name.strip() for name in lines[0].split(',')] @@ -36,13 +52,20 @@ def load_results(file): def pad(xs, value=np.nan): + """ + + + :param xs: + :param value: + :return: + """ maxlen = np.max([len(x) for x in xs]) - + padded_xs = [] for x in xs: if x.shape[0] >= maxlen: padded_xs.append(x) - + padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value x_padded = np.concatenate([x, padding], axis=0) assert x_padded.shape[1:] == x.shape[1:] diff --git a/baselines/her/experiment/train.py b/stable_baselines/her/experiment/train.py similarity index 58% rename from baselines/her/experiment/train.py rename to stable_baselines/her/experiment/train.py index aeaf1c5418..03d0a585bc 100644 --- a/baselines/her/experiment/train.py +++ b/stable_baselines/her/experiment/train.py @@ -1,32 +1,50 @@ import os import sys +from subprocess import CalledProcessError import click import numpy as np import json from mpi4py import MPI -from baselines import logger -from baselines.common import set_global_seeds -from baselines.common.mpi_moments import mpi_moments -import baselines.her.experiment.config as config -from baselines.her.rollout import RolloutWorker -from baselines.her.util import mpi_fork - -from subprocess import CalledProcessError +from stable_baselines import logger +from stable_baselines.common import set_global_seeds, tf_util +from stable_baselines.common.mpi_moments import mpi_moments +import stable_baselines.her.experiment.config as config +from stable_baselines.her.rollout import RolloutWorker +from stable_baselines.her.util import mpi_fork def mpi_average(value): - if value == []: + """ + calculate the average from the array, using MPI + + :param value: (numpy Number) the array + :return: (float) the average + """ + if len(value) == 0: value = [0.] 
if not isinstance(value, list): value = [value] return mpi_moments(np.array(value))[0] -def train(policy, rollout_worker, evaluator, - n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval, - save_policies, **kwargs): +def train(policy, rollout_worker, evaluator, n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval, + save_policies): + """ + train the given policy + + :param policy: (her.DDPG) the policy to train + :param rollout_worker: (RolloutWorker) Rollout worker generates experience for training. + :param evaluator: (RolloutWorker) Rollout worker for evalutation + :param n_epochs: (int) the number of epochs + :param n_test_rollouts: (int) the number of for the evalutation RolloutWorker + :param n_cycles: (int) the number of cycles for training per epoch + :param n_batches: (int) the batch size + :param policy_save_interval: (int) the interval with which policy pickles are saved. + If set to 0, only the best and latest policy will be pickled. + :param save_policies: (bool) whether or not to save the policies + """ rank = MPI.COMM_WORLD.Get_rank() latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl') @@ -42,7 +60,7 @@ def train(policy, rollout_worker, evaluator, episode = rollout_worker.generate_rollouts() policy.store_episode(episode) for _ in range(n_batches): - policy.train() + policy.train_step() policy.update_target_net() # test @@ -66,7 +84,8 @@ def train(policy, rollout_worker, evaluator, success_rate = mpi_average(evaluator.current_success_rate()) if rank == 0 and success_rate >= best_success_rate and save_policies: best_success_rate = success_rate - logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path)) + logger.info('New best success rate: {}. Saving policy to {} ...' 
+ .format(best_success_rate, best_policy_path)) evaluator.save_policy(best_policy_path) evaluator.save_policy(latest_policy_path) if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_policies: @@ -82,10 +101,26 @@ def train(policy, rollout_worker, evaluator, assert local_uniform[0] != root_uniform[0] -def launch( - env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, - override_params={}, save_policies=True -): +def launch(env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, + override_params=None, save_policies=True): + """ + launch training with mpi + + :param env: (str) environment ID + :param logdir: (str) the log directory + :param n_epochs: (int) the number of training epochs + :param num_cpu: (int) the number of CPUs to run on + :param seed: (int) the initial random seed + :param replay_strategy: (str) the type of replay strategy ('future' or 'none') + :param policy_save_interval: (int) the interval with which policy pickles are saved. + If set to 0, only the best and latest policy will be pickled. + :param clip_return: (float): clip returns to be in [-clip_return, clip_return] + :param override_params: (dict) override any parameter for training + :param save_policies: (bool) whether or not to save the policies + """ + + if override_params is None: + override_params = {} # Fork for multi-CPU MPI implementation. 
if num_cpu > 1: try: @@ -96,14 +131,13 @@ def launch( if whoami == 'parent': sys.exit(0) - import baselines.common.tf_util as U - U.single_threaded_session().__enter__() + tf_util.single_threaded_session().__enter__() rank = MPI.COMM_WORLD.Get_rank() # Configure logging if rank == 0: if logdir or logger.get_dir() is None: - logger.configure(dir=logdir) + logger.configure(folder=logdir) else: logger.configure() logdir = logger.get_dir() @@ -121,10 +155,10 @@ def launch( if env in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[env]) # merge env-specific parameters in params.update(**override_params) # makes it possible to override any parameter - with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: - json.dump(params, f) + with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as file_handler: + json.dump(params, file_handler) params = config.prepare_params(params) - config.log_params(params, logger=logger) + config.log_params(params, logger_input=logger) if num_cpu == 1: logger.warn() @@ -134,7 +168,7 @@ def launch( 'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' + 'were obtained with --num_cpu 19. This makes a significant difference and if you ' + 'are looking to reproduce those results, be aware of this. 
Please also refer to ' + - 'https://github.com/openai/baselines/issues/314 for further details.') + 'https://github.com/openai/stable_baselines/issues/314 for further details.') logger.warn('****************') logger.warn() @@ -144,20 +178,20 @@ def launch( rollout_params = { 'exploit': False, 'use_target_net': False, - 'use_demo_states': True, - 'compute_Q': False, - 'T': params['T'], + # 'use_demo_states': True, + 'compute_q': False, + 'time_horizon': params['time_horizon'], } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], - 'use_demo_states': False, - 'compute_Q': True, - 'T': params['T'], + # 'use_demo_states': False, + 'compute_q': True, + 'time_horizon': params['time_horizon'], } - for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']: + for name in ['time_horizon', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']: rollout_params[name] = params[name] eval_params[name] = params[name] @@ -168,22 +202,33 @@ def launch( evaluator.seed(rank_seed) train( - logdir=logdir, policy=policy, rollout_worker=rollout_worker, + policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], policy_save_interval=policy_save_interval, save_policies=save_policies) @click.command() -@click.option('--env', type=str, default='FetchReach-v1', help='the name of the OpenAI Gym environment that you want to train on') -@click.option('--logdir', type=str, default=None, help='the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/') +@click.option('--env', type=str, default='FetchReach-v1', + help='the name of the OpenAI Gym environment that you want to train on') +@click.option('--logdir', type=str, default=None, + help='the path to where logs and policy pickles should go. 
If not specified, creates a folder in /tmp/') @click.option('--n_epochs', type=int, default=50, help='the number of training epochs to run') @click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)') -@click.option('--seed', type=int, default=0, help='the random seed used to seed both the environment and the training code') -@click.option('--policy_save_interval', type=int, default=5, help='the interval with which policy pickles are saved. If set to 0, only the best and latest policy will be pickled.') -@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future', help='the HER replay strategy to be used. "future" uses HER, "none" disables HER.') +@click.option('--seed', type=int, default=0, + help='the random seed used to seed both the environment and the training code') +@click.option('--policy_save_interval', type=int, default=5, + help='the interval with which policy pickles are saved. ' + 'If set to 0, only the best and latest policy will be pickled.') +@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future', + help='the HER replay strategy to be used. 
"future" uses HER, "none" disables HER.') @click.option('--clip_return', type=int, default=1, help='whether or not returns should be clipped') def main(**kwargs): + """ + run launch for MPI HER DDPG training + + :param kwargs: (dict) the launch kwargs + """ launch(**kwargs) diff --git a/baselines/her/her.py b/stable_baselines/her/her.py similarity index 50% rename from baselines/her/her.py rename to stable_baselines/her/her.py index 76f3c346ae..15db296724 100644 --- a/baselines/her/her.py +++ b/stable_baselines/her/her.py @@ -1,15 +1,20 @@ +import tensorflow as tf import numpy as np +import gym + +from stable_baselines.common import BaseRLModel, SetVerbosity +from stable_baselines.common.policies import LstmPolicy def make_sample_her_transitions(replay_strategy, replay_k, reward_fun): - """Creates a sample function that can be used for HER experience replay. + """ + Creates a sample function that can be used for HER experience replay. - Args: - replay_strategy (in ['future', 'none']): the HER replay strategy; if set to 'none', - regular DDPG experience replay is used - replay_k (int): the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times + :param replay_strategy: (str) the HER replay strategy; if set to 'none', regular DDPG experience replay is used + (can be 'future' or 'none'). + :param replay_k: (int) the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times as many HER replays as regular replays are used) - reward_fun (function): function to re-compute the reward with substituted goals + :param reward_fun: (function (dict, dict): float) function to re-compute the reward with substituted goals """ if replay_strategy == 'future': future_p = 1 - (1. 
/ (1 + replay_k)) @@ -19,20 +24,20 @@ def make_sample_her_transitions(replay_strategy, replay_k, reward_fun): def _sample_her_transitions(episode_batch, batch_size_in_transitions): """episode_batch is {key: array(buffer_size x T x dim_key)} """ - T = episode_batch['u'].shape[1] + time_horizon = episode_batch['u'].shape[1] rollout_batch_size = episode_batch['u'].shape[0] batch_size = batch_size_in_transitions # Select which episodes and time steps to use. episode_idxs = np.random.randint(0, rollout_batch_size, batch_size) - t_samples = np.random.randint(T, size=batch_size) + t_samples = np.random.randint(time_horizon, size=batch_size) transitions = {key: episode_batch[key][episode_idxs, t_samples].copy() for key in episode_batch.keys()} # Select future time indexes proportional with probability future_p. These # will be used for HER replay by substituting in future goals. her_indexes = np.where(np.random.uniform(size=batch_size) < future_p) - future_offset = np.random.uniform(size=batch_size) * (T - t_samples) + future_offset = np.random.uniform(size=batch_size) * (time_horizon - t_samples) future_offset = future_offset.astype(int) future_t = (t_samples + 1 + future_offset)[her_indexes] @@ -56,8 +61,51 @@ def _sample_her_transitions(episode_batch, batch_size_in_transitions): transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) for k in transitions.keys()} - assert(transitions['u'].shape[0] == batch_size_in_transitions) + assert transitions['u'].shape[0] == batch_size_in_transitions return transitions return _sample_her_transitions + + +class HER(BaseRLModel): + def __init__(self, policy, env, verbose=0, _init_setup_model=True): + super().__init__(policy=policy, env=env, requires_vec_env=False, verbose=verbose) + + self.policy = policy + + self.sess = None + self.graph = None + + if _init_setup_model: + self.setup_model() + + def setup_model(self): + with SetVerbosity(self.verbose): + + assert isinstance(self.action_space, gym.spaces.Box), 
\ + "Error: HER cannot output a {} action space, only spaces.Box is supported.".format(self.action_space) + assert not issubclass(self.policy, LstmPolicy), "Error: cannot use a reccurent policy for the HER model." + + self.graph = tf.Graph() + with self.graph.as_default(): + pass + + def learn(self, total_timesteps, callback=None, seed=None, log_interval=100): + with SetVerbosity(self.verbose): + self._setup_learn(seed) + + return self + + def predict(self, observation, state=None, mask=None): + pass + + def action_probability(self, observation, state=None, mask=None): + pass + + def save(self, save_path): + pass + + @classmethod + def load(cls, load_path, env=None, **kwargs): + pass diff --git a/baselines/her/normalizer.py b/stable_baselines/her/normalizer.py similarity index 54% rename from baselines/her/normalizer.py rename to stable_baselines/her/normalizer.py index d2b0588e8b..e59c8fd913 100644 --- a/baselines/her/normalizer.py +++ b/stable_baselines/her/normalizer.py @@ -4,20 +4,20 @@ from mpi4py import MPI import tensorflow as tf -from baselines.her.util import reshape_for_broadcasting +from stable_baselines.her.util import reshape_for_broadcasting class Normalizer: def __init__(self, size, eps=1e-2, default_clip_range=np.inf, sess=None): - """A normalizer that ensures that observations are approximately distributed according to + """ + A normalizer that ensures that observations are approximately distributed according to a standard Normal distribution (i.e. have mean zero and variance one). 
- Args: - size (int): the size of the observation to be normalized - eps (float): a small constant that avoids underflows - default_clip_range (float): normalized observations are clipped to be in - [-default_clip_range, default_clip_range] - sess (object): the TensorFlow session to be used + :param size: (int) the size of the observation to be normalized + :param eps: (float) a small constant that avoids underflows + :param default_clip_range: (float) normalized observations are clipped to be in + [-default_clip_range, default_clip_range] + :param sess: (TensorFlow Session) the TensorFlow session to be used """ self.size = size self.eps = eps @@ -61,39 +61,69 @@ def __init__(self, size, eps=1e-2, default_clip_range=np.inf, sess=None): ) self.lock = threading.Lock() - def update(self, v): - v = v.reshape(-1, self.size) + def update(self, arr): + """ + update the parameters from the input + + :param arr: (numpy Number) the input + """ + arr = arr.reshape(-1, self.size) with self.lock: - self.local_sum += v.sum(axis=0) - self.local_sumsq += (np.square(v)).sum(axis=0) - self.local_count[0] += v.shape[0] + self.local_sum += arr.sum(axis=0) + self.local_sumsq += (np.square(arr)).sum(axis=0) + self.local_count[0] += arr.shape[0] - def normalize(self, v, clip_range=None): + def normalize(self, arr, clip_range=None): + """ + normalize the input + + :param arr: (numpy Number) the input + :param clip_range: (float) the range to clip to [-clip_range, clip_range] + :return: (numpy Number) normalized input + """ if clip_range is None: clip_range = self.default_clip_range - mean = reshape_for_broadcasting(self.mean, v) - std = reshape_for_broadcasting(self.std, v) - return tf.clip_by_value((v - mean) / std, -clip_range, clip_range) - - def denormalize(self, v): - mean = reshape_for_broadcasting(self.mean, v) - std = reshape_for_broadcasting(self.std, v) - return mean + v * std - - def _mpi_average(self, x): - buf = np.zeros_like(x) - MPI.COMM_WORLD.Allreduce(x, buf, op=MPI.SUM) 
+ mean = reshape_for_broadcasting(self.mean, arr) + std = reshape_for_broadcasting(self.std, arr) + return tf.clip_by_value((arr - mean) / std, -clip_range, clip_range) + + def denormalize(self, arr): + """ + denormalize the input + + :param arr: (numpy Number) the normalized input + :return: (numpy Number) original input + """ + mean = reshape_for_broadcasting(self.mean, arr) + std = reshape_for_broadcasting(self.std, arr) + return mean + arr * std + + @classmethod + def _mpi_average(cls, arr): + buf = np.zeros_like(arr) + MPI.COMM_WORLD.Allreduce(arr, buf, op=MPI.SUM) buf /= MPI.COMM_WORLD.Get_size() return buf - def synchronize(self, local_sum, local_sumsq, local_count, root=None): + def synchronize(self, local_sum, local_sumsq, local_count): + """ + syncronize over mpi threads + + :param local_sum: (numpy Number) the sum + :param local_sumsq: (numpy Number) the square root sum + :param local_count: (numpy Number) the number of values updated + :return: (numpy Number, numpy Number, numpy Number) the updated local_sum, local_sumsq, and local_count + """ local_sum[...] = self._mpi_average(local_sum) local_sumsq[...] = self._mpi_average(local_sumsq) local_count[...] = self._mpi_average(local_count) return local_sum, local_sumsq, local_count def recompute_stats(self): + """ + recompute the stats + """ with self.lock: # Copy over results. 
local_count = self.local_count.copy() @@ -120,21 +150,50 @@ def recompute_stats(self): class IdentityNormalizer: def __init__(self, size, std=1.): + """ + Normalizer that returns the input unchanged + + :param size: (int or [int]) the shape of the input to normalize + :param std: (float) the initial standard deviation or the normalization + """ self.size = size self.mean = tf.zeros(self.size, tf.float32) self.std = std * tf.ones(self.size, tf.float32) - def update(self, x): + def update(self, arr): + """ + update the parameters from the input + + :param arr: (numpy Number) the input + """ pass - def normalize(self, x, clip_range=None): - return x / self.std + def normalize(self, arr, **_kwargs): + """ + normalize the input + + :param arr: (numpy Number) the input + :return: (numpy Number) normalized input + """ + return arr / self.std + + def denormalize(self, arr): + """ + denormalize the input - def denormalize(self, x): - return self.std * x + :param arr: (numpy Number) the normalized input + :return: (numpy Number) original input + """ + return self.std * arr def synchronize(self): + """ + syncronize over mpi threads + """ pass def recompute_stats(self): + """ + recompute the stats + """ pass diff --git a/baselines/her/replay_buffer.py b/stable_baselines/her/replay_buffer.py similarity index 62% rename from baselines/her/replay_buffer.py rename to stable_baselines/her/replay_buffer.py index b0005523fd..c46de90b0c 100644 --- a/baselines/her/replay_buffer.py +++ b/stable_baselines/her/replay_buffer.py @@ -4,19 +4,18 @@ class ReplayBuffer: - def __init__(self, buffer_shapes, size_in_transitions, T, sample_transitions): - """Creates a replay buffer. 
- - Args: - buffer_shapes (dict of ints): the shape for all buffers that are used in the replay - buffer - size_in_transitions (int): the size of the buffer, measured in transitions - T (int): the time horizon for episodes - sample_transitions (function): a function that samples from the replay buffer + def __init__(self, buffer_shapes, size_in_transitions, time_horizon, sample_transitions): + """ + Creates a replay buffer. + + :param buffer_shapes: ({str: int}) the shape for all buffers that are used in the replay buffer + :param size_in_transitions: (int) the size of the buffer, measured in transitions + :param time_horizon: (int) the time horizon for episodes + :param sample_transitions: (function) a function that samples from the replay buffer """ self.buffer_shapes = buffer_shapes - self.size = size_in_transitions // T - self.T = T + self.size = size_in_transitions // time_horizon + self.time_horizon = time_horizon self.sample_transitions = sample_transitions # self.buffers is {key: array(size_in_episodes x T or T+1 x dim_key)} @@ -35,7 +34,11 @@ def full(self): return self.current_size == self.size def sample(self, batch_size): - """Returns a dict {key: array(batch_size x shapes[key])} + """ + sample random transitions + + :param batch_size: (int) How many transitions to sample. 
+ :return: (dict) {key: array(batch_size x shapes[key])} """ buffers = {} @@ -55,7 +58,10 @@ def sample(self, batch_size): return transitions def store_episode(self, episode_batch): - """episode_batch: array(batch_size x (T or T+1) x dim_key) + """ + Store an episode in the replay buffer + + :param episode_batch: (numpy Number) batch_size x (T or T+1) x dim_key """ batch_sizes = [len(episode_batch[key]) for key in episode_batch.keys()] assert np.all(np.array(batch_sizes) == batch_sizes[0]) @@ -68,30 +74,48 @@ def store_episode(self, episode_batch): for key in self.buffers.keys(): self.buffers[key][idxs] = episode_batch[key] - self.n_transitions_stored += batch_size * self.T + self.n_transitions_stored += batch_size * self.time_horizon def get_current_episode_size(self): + """ + get current episode size + + :return: (int) the current size of the episode + """ with self.lock: return self.current_size def get_current_size(self): + """ + get current size of the buffer + + :return: (int) the current size of the buffer + """ with self.lock: - return self.current_size * self.T + return self.current_size * self.time_horizon def get_transitions_stored(self): + """ + get the number of stored transitions + + :return: (int) the number of transitions stored + """ with self.lock: return self.n_transitions_stored def clear_buffer(self): + """ + clear the buffer of all entries + """ with self.lock: self.current_size = 0 def _get_storage_idx(self, inc=None): - inc = inc or 1 # size increment + inc = inc or 1 # size increment assert inc <= self.size, "Batch committed to replay is too large!" # go consecutively until you hit the end, and then go randomly. 
- if self.current_size+inc <= self.size: - idx = np.arange(self.current_size, self.current_size+inc) + if self.current_size + inc <= self.size: + idx = np.arange(self.current_size, self.current_size + inc) elif self.current_size < self.size: overflow = inc - (self.size - self.current_size) idx_a = np.arange(self.current_size, self.size) @@ -101,7 +125,7 @@ def _get_storage_idx(self, inc=None): idx = np.random.randint(0, self.size, inc) # update replay size - self.current_size = min(self.size, self.current_size+inc) + self.current_size = min(self.size, self.current_size + inc) if inc == 1: idx = idx[0] diff --git a/stable_baselines/her/rollout.py b/stable_baselines/her/rollout.py new file mode 100644 index 0000000000..aa85e719f2 --- /dev/null +++ b/stable_baselines/her/rollout.py @@ -0,0 +1,228 @@ +from collections import deque +import pickle + +import numpy as np +from mujoco_py import MujocoException + +from stable_baselines.her.util import convert_episode_to_batch_major + + +class RolloutWorker: + def __init__(self, make_env, policy, dims, logger, time_horizon, rollout_batch_size=1, + exploit=False, use_target_net=False, compute_q=False, noise_eps=0, + random_eps=0, history_len=100, render=False): + """ + Rollout worker generates experience by interacting with one or many environments. + + :param make_env: (function (): Gym Environment) a factory function that creates a new instance of the + environment when called + :param policy: (Object) the policy that is used to act + :param dims: ({str: int}) the dimensions for observations (o), goals (g), and actions (u) + :param logger: (Object) the logger that is used by the rollout worker + :param rollout_batch_size: (int) the number of parallel rollouts that should be used + :param exploit: (bool) whether or not to exploit, i.e. 
to act optimally according to the current policy without + any exploration + :param use_target_net: (bool) whether or not to use the target net for rollouts + :param compute_q: (bool) whether or not to compute the Q values alongside the actions + :param noise_eps: (float) scale of the additive Gaussian noise + :param random_eps: (float) probability of selecting a completely random action + :param history_len: (int) length of history for statistics smoothing + :param render: (boolean) whether or not to render the rollouts + """ + self.make_env = make_env + self.policy = policy + self.dims = dims + self.logger = logger + self.time_horizon = time_horizon + self.rollout_batch_size = rollout_batch_size + self.exploit = exploit + self.use_target_net = use_target_net + self.compute_q = compute_q + self.noise_eps = noise_eps + self.random_eps = random_eps + self.history_len = history_len + self.render = render + + self.envs = [make_env() for _ in range(rollout_batch_size)] + assert self.time_horizon > 0 + + self.info_keys = [key.replace('info_', '') for key in dims.keys() if key.startswith('info_')] + + self.success_history = deque(maxlen=history_len) + self.q_history = deque(maxlen=history_len) + + self.n_episodes = 0 + self.goals = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # goals + self.initial_obs = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations + self.initial_ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals + self.reset_all_rollouts() + self.clear_history() + + def reset_rollout(self, index): + """ + Resets the `i`-th rollout environment, re-samples a new goal, and updates the `initial_o` and `g` arrays + accordingly. 
+ + :param index: (int) the index to reset + """ + obs = self.envs[index].reset() + self.initial_obs[index] = obs['observation'] + self.initial_ag[index] = obs['achieved_goal'] + self.goals[index] = obs['desired_goal'] + + def reset_all_rollouts(self): + """ + Resets all `rollout_batch_size` rollout workers. + """ + for step in range(self.rollout_batch_size): + self.reset_rollout(step) + + def generate_rollouts(self): + """ + Performs `rollout_batch_size` rollouts in parallel for time horizon with the current + policy acting on it accordingly. + + :return: (dict) batch + """ + self.reset_all_rollouts() + + # compute observations + observations = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations + achieved_goals = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals + observations[:] = self.initial_obs + achieved_goals[:] = self.initial_ag + + # generate episodes + obs, achieved_goals, acts, goals, successes = [], [], [], [], [] + info_values = [np.empty((self.time_horizon, self.rollout_batch_size, self.dims['info_' + key]), np.float32) + for key in self.info_keys] + q_values = [] + for step in range(self.time_horizon): + policy_output = self.policy.get_actions( + observations, achieved_goals, self.goals, + compute_q=self.compute_q, + noise_eps=self.noise_eps if not self.exploit else 0., + random_eps=self.random_eps if not self.exploit else 0., + use_target_net=self.use_target_net) + + if self.compute_q: + action, q_value = policy_output + q_values.append(q_value) + else: + action = policy_output + + if action.ndim == 1: + # The non-batched case should still have a reasonable shape. 
+ action = action.reshape(1, -1) + + o_new = np.empty((self.rollout_batch_size, self.dims['o'])) + ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) + success = np.zeros(self.rollout_batch_size) + # compute new states and observations + for batch_idx in range(self.rollout_batch_size): + try: + # We fully ignore the reward here because it will have to be re-computed + # for HER. + curr_o_new, _, _, info = self.envs[batch_idx].step(action[batch_idx]) + if 'is_success' in info: + success[batch_idx] = info['is_success'] + o_new[batch_idx] = curr_o_new['observation'] + ag_new[batch_idx] = curr_o_new['achieved_goal'] + for idx, key in enumerate(self.info_keys): + info_values[idx][step, batch_idx] = info[key] + if self.render: + self.envs[batch_idx].render() + except MujocoException: + return self.generate_rollouts() + + if np.isnan(o_new).any(): + self.logger.warning('NaN caught during rollout generation. Trying again...') + self.reset_all_rollouts() + return self.generate_rollouts() + + obs.append(observations.copy()) + achieved_goals.append(achieved_goals.copy()) + successes.append(success.copy()) + acts.append(action.copy()) + goals.append(self.goals.copy()) + observations[...] = o_new + achieved_goals[...] 
= ag_new + obs.append(observations.copy()) + achieved_goals.append(achieved_goals.copy()) + self.initial_obs[:] = observations + + episode = dict(o=obs, + u=acts, + g=goals, + ag=achieved_goals) + for key, value in zip(self.info_keys, info_values): + episode['info_{}'.format(key)] = value + + # stats + successful = np.array(successes)[-1, :] + assert successful.shape == (self.rollout_batch_size,) + success_rate = np.mean(successful) + self.success_history.append(success_rate) + + if self.compute_q: + self.q_history.append(np.mean(q_values)) + self.n_episodes += self.rollout_batch_size + + return convert_episode_to_batch_major(episode) + + def clear_history(self): + """ + Clears all histories that are used for statistics + """ + self.success_history.clear() + self.q_history.clear() + + def current_success_rate(self): + """ + returns the current success rate + :return: (float) the success rate + """ + return np.mean(self.success_history) + + def current_mean_q(self): + """ + returns the current mean Q value + :return: (float) the mean Q value + """ + return np.mean(self.q_history) + + def save_policy(self, path): + """ + Pickles the current policy for later inspection. + + :param path: (str) the save location + """ + with open(path, 'wb') as file_handler: + pickle.dump(self.policy, file_handler) + + def logs(self, prefix='worker'): + """ + Generates a dictionary that contains all collected statistics. + + :param prefix: (str) the prefix for the name in logging + :return: ([(str, float)]) the logging information + """ + logs = [] + logs += [('success_rate', np.mean(self.success_history))] + if self.compute_q: + logs += [('mean_q', np.mean(self.q_history))] + logs += [('episode', self.n_episodes)] + + if prefix is not '' and not prefix.endswith('/'): + return [(prefix + '/' + key, val) for key, val in logs] + else: + return logs + + def seed(self, seed): + """ + Seeds each environment with a distinct seed derived from the passed in global seed. 
def import_function(spec):
    """
    Resolve a string of the form "pkg.module:fn_name" to the object it names.

    :param spec: (str) the module path and the attribute name, separated by exactly one colon
    :return: (function) the attribute looked up on the imported module
    """
    module_path, attribute = spec.split(':')
    return getattr(importlib.import_module(module_path), attribute)
def convert_episode_to_batch_major(episode):
    """
    Exchange the first two axes of every entry of an episode (time-major to batch-major).

    :param episode: (dict) the episode batch; every value is converted to a numpy array
    :return: (dict) the same keys, each mapped to a new array with axes 0 and 1 swapped
    """
    # np.array(...).copy() detaches the result from the caller's data before the axes are swapped
    return {key: np.array(values).copy().swapaxes(0, 1) for key, values in episode.items()}
def reshape_for_broadcasting(source, target):
    """
    Cast source to the dtype of target and reshape it to target's rank before broadcasting it with MPI.

    Every axis of the result has size 1 except the last one, which holds all the elements.

    :param source: (TensorFlow Tensor) the input tensor
    :param target: (TensorFlow Tensor) the tensor providing the dtype and the number of dimensions
    :return: (TensorFlow Tensor) the cast and reshaped tensor
    """
    rank = len(target.get_shape())
    broadcast_shape = [1] * (rank - 1) + [-1]
    converted = tf.cast(source, target.dtype)
    return tf.reshape(converted, broadcast_shape)
class JSONOutputFormat(KVWriter):
    def __init__(self, filename):
        """
        log to a file, in the JSON format

        :param filename: (str) the file to write the log to
        """
        self.file = open(filename, 'wt')

    def writekvs(self, kvs):
        """
        Append the key/value pairs to the file as one JSON object on its own line.

        Values exposing a ``dtype`` attribute are converted with ``tolist()``: scalars are
        written as floats, arrays as (nested) lists. The dictionary passed in is not modified.

        :param kvs: (dict) the values to write
        """
        # Build a separate dict: the caller's dict may be handed to other writers afterwards,
        # so converting the values in place would change what those writers see.
        serializable = {}
        for key, value in kvs.items():
            if hasattr(value, 'dtype'):
                value = value.tolist()
                if not isinstance(value, list):
                    # scalars keep being logged as floats; a list cannot go through float()
                    value = float(value)
            serializable[key] = value
        self.file.write(json.dumps(serializable) + '\n')
        self.file.flush()

    def close(self):
        """
        closes the file
        """
        self.file.close()
extra_keys: + self.keys.extend(extra_keys) + self.file.seek(0) + lines = self.file.readlines() + self.file.seek(0) + for (i, key) in enumerate(self.keys): + if i > 0: + self.file.write(',') + self.file.write(key) + self.file.write('\n') + for line in lines[1:]: + self.file.write(line[:-1]) + self.file.write(self.sep * len(extra_keys)) + self.file.write('\n') + for i, key in enumerate(self.keys): + if i > 0: + self.file.write(',') + value = kvs.get(key) + if value is not None: + self.file.write(str(value)) + self.file.write('\n') + self.file.flush() + + def close(self): + """ + closes the file + """ + self.file.close() + + +class TensorBoardOutputFormat(KVWriter): + def __init__(self, folder): + """ + Dumps key/value pairs into TensorBoard's numeric format. + + :param folder: (str) the folder to write the log to + """ + os.makedirs(folder, exist_ok=True) + self.dir = folder + self.step = 1 + prefix = 'events' + path = os.path.join(os.path.abspath(folder), prefix) + import tensorflow as tf + from tensorflow.python import pywrap_tensorflow + from tensorflow.core.util import event_pb2 + from tensorflow.python.util import compat + self._tf = tf + self.event_pb2 = event_pb2 + self.pywrap_tensorflow = pywrap_tensorflow + self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) + + def writekvs(self, kvs): + def summary_val(key, value): + kwargs = {'tag': key, 'simple_value': float(value)} + return self._tf.Summary.Value(**kwargs) + + summary = self._tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) + event = self.event_pb2.Event(wall_time=time.time(), summary=summary) + event.step = self.step # is there any reason why you'd want to specify the step? 
def make_output_format(_format, ev_dir, log_suffix=''):
    """
    Build the writer matching the requested format, creating the logging directory if needed.

    :param _format: (str) the requested format to log to ('stdout', 'log', 'json', 'csv' or 'tensorboard')
    :param ev_dir: (str) the logging directory
    :param log_suffix: (str) the suffix inserted in the name of the log file
    :return: (KVWriter) the writer for the requested format
    """
    os.makedirs(ev_dir, exist_ok=True)

    def in_log_dir(template):
        # file (or folder) name inside the logging directory, with the suffix filled in
        return os.path.join(ev_dir, template % log_suffix)

    if _format == 'stdout':
        return HumanOutputFormat(sys.stdout)
    if _format == 'log':
        return HumanOutputFormat(in_log_dir('log%s.txt'))
    if _format == 'json':
        return JSONOutputFormat(in_log_dir('progress%s.json'))
    if _format == 'csv':
        return CSVOutputFormat(in_log_dir('progress%s.csv'))
    if _format == 'tensorboard':
        return TensorBoardOutputFormat(in_log_dir('tb%s'))
    raise ValueError('Unknown format specified: %s' % (_format,))
def logkvs(key_values):
    """
    Log every entry of a dictionary, one logkv() call per key

    :param key_values: (dict) the keys and values to save to log
    """
    for name in key_values:
        logkv(name, key_values[name])
def profile(name):
    """
    Decorator factory running the decorated function inside a ProfileKV scope.

    Usage:
    @profile("my_func")
    def my_func(): code

    :param name: (str) the profiling name
    :return: (function) a decorator returning the wrapped function
    """
    # local import, in line with the other function-level imports of this module
    import functools

    def decorator_with_name(func):
        # wraps() keeps __name__, __doc__ and friends of the decorated function,
        # so profiled functions stay identifiable in tracebacks and documentation
        @functools.wraps(func)
        def func_wrapper(*args, **kwargs):
            with ProfileKV(name):
                return func(*args, **kwargs)

        return func_wrapper

    return decorator_with_name
(See right below class definition) + # So that you can still log to the terminal without setting up any output files + DEFAULT = None + CURRENT = None # Current logger being used by the free functions above + + def __init__(self, folder, output_formats): + """ + the logger class + + :param folder: (str) the logging location + :param output_formats: ([str]) the list of output format + """ + self.name2val = defaultdict(float) # values this iteration + self.name2cnt = defaultdict(int) + self.level = INFO + self.dir = folder + self.output_formats = output_formats + + # Logging API, forwarded + # ---------------------------------------- + def logkv(self, key, val): + """ + Log a value of some diagnostic + Call this once for each diagnostic quantity, each iteration + If called many times, last value will be used. + + :param key: (Any) save to log this key + :param val: (Any) save to log this value + """ + self.name2val[key] = val + + def logkv_mean(self, key, val): + """ + The same as logkv(), but if called many times, values averaged. + + :param key: (Any) save to log this key + :param val: (Number) save to log this value + """ + if val is None: + self.name2val[key] = None + return + oldval, cnt = self.name2val[key], self.name2cnt[key] + self.name2val[key] = oldval * cnt / (cnt + 1) + val / (cnt + 1) + self.name2cnt[key] = cnt + 1 + + def dumpkvs(self): + """ + Write all of the diagnostics from the current iteration + """ + if self.level == DISABLED: + return + for fmt in self.output_formats: + if isinstance(fmt, KVWriter): + fmt.writekvs(self.name2val) + self.name2val.clear() + self.name2cnt.clear() + + def log(self, *args, level=INFO): + """ + Write the sequence of args, with no separators, + to the console and output files (if you've configured an output file). + + level: int. (see logger.py docs) If the global logger level is higher than + the level argument here, don't print to stdout. 
+ + :param args: (list) log the arguments + :param level: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) + """ + if self.level <= level: + self._do_log(args) + + # Configuration + # ---------------------------------------- + def set_level(self, level): + """ + Set logging threshold on current logger. + + :param level: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) + """ + self.level = level + + def get_dir(self): + """ + Get directory that log files are being written to. + will be None if there is no output directory (i.e., if you didn't call start) + + :return: (str) the logging directory + """ + return self.dir + + def close(self): + """ + closes the file + """ + for fmt in self.output_formats: + fmt.close() + + # Misc + # ---------------------------------------- + def _do_log(self, args): + """ + log to the requested format outputs + + :param args: (list) the arguments to log + """ + for fmt in self.output_formats: + if isinstance(fmt, SeqWriter): + fmt.writeseq(map(str, args)) + + +Logger.DEFAULT = Logger.CURRENT = Logger(folder=None, output_formats=[HumanOutputFormat(sys.stdout)]) + + +def configure(folder=None, format_strs=None): + """ + configure the current logger + + :param folder: (str) the save location (if None, $OPENAI_LOGDIR, if still None, tempdir/openai-[date & time]) + :param format_strs: (list) the output logging format + (if None, $OPENAI_LOG_FORMAT, if still None, ['stdout', 'log', 'csv']) + """ + if folder is None: + folder = os.getenv('OPENAI_LOGDIR') + if folder is None: + folder = os.path.join(tempfile.gettempdir(), datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) + assert isinstance(folder, str) + os.makedirs(folder, exist_ok=True) + + log_suffix = '' + from mpi4py import MPI + rank = MPI.COMM_WORLD.Get_rank() + if rank > 0: + log_suffix = "-rank%03i" % rank + + if format_strs is None: + if rank == 0: + format_strs = os.getenv('OPENAI_LOG_FORMAT', 
class ScopedConfigure(object):
    def __init__(self, folder=None, format_strs=None):
        """
        Context manager that configures a logger on entry and puts the previous one back on exit

        usage:
        with ScopedConfigure(folder=None, format_strs=None):
            {code}

        :param folder: (str) the logging folder
        :param format_strs: ([str]) the list of output logging format
        """
        self.prevlogger = None
        self.format_strs = format_strs
        self.dir = folder

    def __enter__(self):
        # remember the logger in use before configure() replaces it
        self.prevlogger = Logger.CURRENT
        configure(folder=self.dir, format_strs=self.format_strs)

    def __exit__(self, *args):
        scoped_logger = Logger.CURRENT
        scoped_logger.close()
        Logger.CURRENT = self.prevlogger
def read_json(fname):
    """
    Load a file holding one JSON document per line into a pandas DataFrame

    :param fname: (str) the file path to read
    :return: (pandas DataFrame) one row per line of the file
    """
    import pandas
    with open(fname, 'rt') as file_handler:
        records = [json.loads(line) for line in file_handler]
    return pandas.DataFrame(records)
class BasePolicy(object):
    def __init__(self, placeholders=None):
        """
        A base policy object for PPO1

        :param placeholders: (dict) existing placeholders to reuse, read from the keys
            "obs", "processed_obs" and "stochastic" (missing keys are left to None)
        """
        super(BasePolicy, self).__init__()
        self.sess = None
        self.pdtype = None
        self._act = None
        self.scope = None

        if placeholders is None:
            self.obs_ph = None
            self.stochastic_ph = None
            self.processed_x = None
        else:
            self.obs_ph = placeholders.get("obs", None)
            self.stochastic_ph = placeholders.get("stochastic", None)
            self.processed_x = placeholders.get("processed_obs", None)

    def get_obs_and_pdtype(self, ob_space, ac_space):
        """
        Create the probability distribution type and make sure an observation placeholder exists.

        :param ob_space: (Gym Spaces) the observation space
        :param ac_space: (Gym Spaces) the action space
        :return: (TensorFlow Tensor, object) the observation placeholder and the probability distribution type
        """
        self.pdtype = make_proba_dist_type(ac_space)

        if self.obs_ph is not None:
            # a placeholder supplied at construction must come with its processed counterpart
            assert self.processed_x is not None
        else:
            self.obs_ph, self.processed_x = observation_input(ob_space)

        return self.obs_ph, self.pdtype

    def act(self, stochastic, obs):
        """
        Query the policy for a single observation

        :param stochastic: (bool) whether or not to use a stochastic or deterministic policy
        :param obs: (TensorFlow Tensor or numpy Number) the observation
        :return: (numpy Number, numpy Number) the action and value function
        """
        # obs[None] adds a leading axis of size one; the first entry of each result is returned
        batched_action, batched_value = self._act(stochastic, obs[None], sess=self.sess)
        return batched_action[0], batched_value[0]

    def get_variables(self):
        """
        Get all the policy's variables

        :return: ([TensorFlow Tensor]) the variables of the network
        """
        collection = tf.GraphKeys.GLOBAL_VARIABLES
        return tf.get_collection(collection, self.scope)

    def get_trainable_variables(self):
        """
        Get the policy's trainable variables

        :return: ([TensorFlow Tensor]) the trainable variables of the network
        """
        collection = tf.GraphKeys.TRAINABLE_VARIABLES
        return tf.get_collection(collection, self.scope)

    @classmethod
    def get_initial_state(cls):
        """
        Get the initial state

        :return: ([numpy Number]) the initial state (always an empty list here)
        """
        return []
stable_baselines.common.mpi_moments import mpi_moments +from stable_baselines.trpo_mpi.utils import traj_segment_generator, add_vtarg_and_adv, flatten_lists + + +class PPO1(BaseRLModel): + """ + Proximal Policy Optimization algorithm (MPI version). + Paper: https://arxiv.org/abs/1707.06347 + + :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) + :param policy: (function (str, Gym Spaces, Gym Spaces): TensorFlow Tensor) creates the policy + :param timesteps_per_actorbatch: (int) timesteps per actor per update + :param clip_param: (float) clipping parameter epsilon + :param entcoeff: (float) the entropy loss weight + :param optim_epochs: (float) the optimizer's number of epochs + :param optim_stepsize: (float) the optimizer's stepsize + :param optim_batchsize: (int) the optimizer's the batch size + :param gamma: (float) discount factor + :param lam: (float) advantage estimation + :param adam_epsilon: (float) the epsilon value for the adam optimizer + :param schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance + """ + + def __init__(self, policy, env, gamma=0.99, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, + optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, lam=0.95, adam_epsilon=1e-5, + schedule='linear', verbose=0, _init_setup_model=True): + super().__init__(policy=policy, env=env, requires_vec_env=False, verbose=verbose) + + self.gamma = gamma + self.timesteps_per_actorbatch = timesteps_per_actorbatch + self.clip_param = clip_param + self.entcoeff = entcoeff + self.optim_epochs = optim_epochs + self.optim_stepsize = optim_stepsize + self.optim_batchsize = optim_batchsize + self.lam = lam + 
self.adam_epsilon = adam_epsilon + self.schedule = schedule + + self.graph = None + self.sess = None + self.policy_pi = None + self.loss_names = None + self.lossandgrad = None + self.adam = None + self.assign_old_eq_new = None + self.compute_losses = None + self.params = None + self.step = None + self.proba_step = None + self.initial_state = None + + if _init_setup_model: + self.setup_model() + + def setup_model(self): + with SetVerbosity(self.verbose): + + self.graph = tf.Graph() + with self.graph.as_default(): + self.sess = tf_util.single_threaded_session(graph=self.graph) + + # Construct network for new policy + with tf.variable_scope("pi", reuse=False): + self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, + None, reuse=False) + + # Network for old policy + with tf.variable_scope("oldpi", reuse=False): + old_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, + None, reuse=False) + + # Target advantage function (if applicable) + atarg = tf.placeholder(dtype=tf.float32, shape=[None]) + + # Empirical return + ret = tf.placeholder(dtype=tf.float32, shape=[None]) + + # learning rate multiplier, updated with schedule + lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) + + # Annealed cliping parameter epislon + clip_param = self.clip_param * lrmult + + obs_ph = self.policy_pi.obs_ph + action_ph = self.policy_pi.pdtype.sample_placeholder([None]) + + kloldnew = old_pi.proba_distribution.kl(self.policy_pi.proba_distribution) + ent = self.policy_pi.proba_distribution.entropy() + meankl = tf.reduce_mean(kloldnew) + meanent = tf.reduce_mean(ent) + pol_entpen = (-self.entcoeff) * meanent + + # pnew / pold + ratio = tf.exp(self.policy_pi.proba_distribution.logp(action_ph) - + old_pi.proba_distribution.logp(action_ph)) + + # surrogate from conservative policy iteration + surr1 = ratio * atarg + surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg + + # 
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100):
        """
        Train the model by alternating rollout collection and minibatch optimisation.

        Every iteration pulls one segment from the rollout generator, lets ``add_vtarg_and_adv`` add the
        targets to it, runs ``self.optim_epochs`` passes of minibatch updates through ``self.adam``,
        re-evaluates the losses and records episode statistics gathered from all MPI workers.

        :param total_timesteps: (int) the loop stops once at least this many timesteps were collected;
            a falsy value disables the stopping test
        :param callback: (callable) if truthy, called as ``callback(locals(), globals())`` at the start
            of every iteration
        :param seed: (int) forwarded to ``self._setup_learn``
        :param log_interval: (int) not referenced by this implementation
        :return: the model itself (``self``)
        :raises NotImplementedError: if ``self.schedule`` is neither 'constant' nor 'linear'
        """
        with SetVerbosity(self.verbose):
            self._setup_learn(seed)

            with self.sess.as_default():
                self.adam.sync()

                # Prepare for rollouts
                seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_actorbatch)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()

                # rolling buffer for episode lengths
                lenbuffer = deque(maxlen=100)
                # rolling buffer for episode rewards
                rewbuffer = deque(maxlen=100)

                while True:
                    # The callback sees every local of this frame, so local names are part of its contract.
                    if callback:
                        callback(locals(), globals())
                    if total_timesteps and timesteps_so_far >= total_timesteps:
                        break

                    # Learning-rate multiplier: fixed, or decaying linearly to 0 over total_timesteps.
                    if self.schedule == 'constant':
                        cur_lrmult = 1.0
                    elif self.schedule == 'linear':
                        # NOTE(review): divides by total_timesteps, which the stopping test above allows
                        # to be falsy -- confirm 'linear' is never combined with total_timesteps=0/None.
                        cur_lrmult = max(1.0 - float(timesteps_so_far) / total_timesteps, 0)
                    else:
                        raise NotImplementedError

                    logger.log("********** Iteration %i ************" % iters_so_far)

                    seg = seg_gen.__next__()
                    add_vtarg_and_adv(seg, self.gamma, self.lam)

                    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                    obs_ph, action_ph, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]

                    # predicted value function before update
                    vpredbefore = seg["vpred"]

                    # standardized advantage function estimate
                    # NOTE(review): no epsilon in the denominator; a segment whose advantages are all
                    # equal would divide by zero -- confirm this cannot happen in practice.
                    atarg = (atarg - atarg.mean()) / atarg.std()
                    # Minibatches are shuffled unless the policy is recurrent.
                    dataset = Dataset(dict(ob=obs_ph, ac=action_ph, atarg=atarg, vtarg=tdlamret),
                                      shuffle=not issubclass(self.policy, LstmPolicy))
                    # A falsy optim_batchsize means "use the whole segment as one batch".
                    optim_batchsize = self.optim_batchsize or obs_ph.shape[0]

                    # set old parameter values to new parameter values
                    self.assign_old_eq_new(sess=self.sess)
                    logger.log("Optimizing...")
                    logger.log(fmt_row(13, self.loss_names))

                    # Here we do a bunch of optimization epochs over the data
                    for _ in range(self.optim_epochs):
                        # list of tuples, each of which gives the loss for a minibatch
                        losses = []
                        for batch in dataset.iterate_once(optim_batchsize):
                            # batch["ob"] is passed twice: the compiled function takes the observation
                            # input of both the current and the old policy.
                            *newlosses, grad = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"], batch["atarg"],
                                                                batch["vtarg"], cur_lrmult, sess=self.sess)
                            self.adam.update(grad, self.optim_stepsize * cur_lrmult)
                            losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(losses, axis=0)))

                    logger.log("Evaluating losses...")
                    losses = []
                    for batch in dataset.iterate_once(optim_batchsize):
                        newlosses = self.compute_losses(batch["ob"], batch["ob"], batch["ac"], batch["atarg"],
                                                        batch["vtarg"], cur_lrmult, sess=self.sess)
                        losses.append(newlosses)
                    mean_losses, _, _ = mpi_moments(losses, axis=0)
                    logger.log(fmt_row(13, mean_losses))
                    for (loss_val, name) in zipsame(mean_losses, self.loss_names):
                        logger.record_tabular("loss_" + name, loss_val)
                    logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

                    # local values
                    lrlocal = (seg["ep_lens"], seg["ep_rets"])

                    # list of tuples
                    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
                    lens, rews = map(flatten_lists, zip(*listoflrpairs))
                    lenbuffer.extend(lens)
                    rewbuffer.extend(rews)
                    logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                    logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    timesteps_so_far += seg["total_timestep"]
                    iters_so_far += 1
                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)
                    # Only the rank-0 worker writes the table.
                    if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0:
                        logger.dump_tabular()

        return self
self._save_to_file(save_path, data=data, params=params) + + @classmethod + def load(cls, load_path, env=None, **kwargs): + data, params = cls._load_from_file(load_path) + + model = cls(None, env=None, _init_setup_model=False) + model.__dict__.update(data) + model.__dict__.update(kwargs) + model.set_env(env) + model.setup_model() + + restores = [] + for param, loaded_p in zip(model.params, params): + restores.append(param.assign(loaded_p)) + model.sess.run(restores) + + return model diff --git a/stable_baselines/ppo1/run_atari.py b/stable_baselines/ppo1/run_atari.py new file mode 100644 index 0000000000..e32c2ee0c3 --- /dev/null +++ b/stable_baselines/ppo1/run_atari.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +import os + +from mpi4py import MPI + +from stable_baselines.common import set_global_seeds +from stable_baselines import bench, logger +from stable_baselines.common.atari_wrappers import make_atari, wrap_deepmind +from stable_baselines.common.cmd_util import atari_arg_parser +from stable_baselines.common.policies import CnnPolicy +from stable_baselines.ppo1 import PPO1 + + +def train(env_id, num_timesteps, seed): + """ + Train PPO1 model for Atari environments, for testing purposes + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + """ + rank = MPI.COMM_WORLD.Get_rank() + + if rank == 0: + logger.configure() + else: + logger.configure(format_strs=[]) + workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() + set_global_seeds(workerseed) + env = make_atari(env_id) + + env = bench.Monitor(env, logger.get_dir() and + os.path.join(logger.get_dir(), str(rank))) + env.seed(workerseed) + + env = wrap_deepmind(env) + env.seed(workerseed) + + model = PPO1(CnnPolicy, env, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4, + optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2) + 
def main():
    """
    Train PPO1 on Humanoid-v2, or replay a stored policy when ``--play`` is given.

    Adds a ``--model-path`` option (default: ``humanoid_policy`` inside the logger directory) and raises
    the default number of timesteps to 2e7 before parsing the command line.
    """
    # The logger must be configured first: its directory is used for the --model-path default below.
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))

    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        # (a one-timestep training run is used only to obtain a constructed model)
        model = train(num_timesteps=1, seed=args.seed)
        # NOTE(review): train() stores weights with tf_util.save_state and this restores them with
        # tf_util.load_state; confirm both act on the model's own session rather than a default one.
        tf_util.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)

        obs = env.reset()
        # Runs until the process is interrupted; the environment is reset whenever an episode ends.
        while True:
            # NOTE(review): model.policy is the policy class given to PPO1 in train() (MlpPolicy), so
            # `.act(stochastic=..., obs=...)` looks like a leftover from an older API -- presumably
            # model.predict(obs) is what is meant here; verify before relying on --play.
            action = model.policy.act(stochastic=False, obs=obs)[0]
            obs, _, done, _ = env.step(action)
            env.render()
            if done:
                obs = env.reset()
def main():
    """
    Parse the robotics command line and start a PPO1 training run with it
    """
    cli = robotics_arg_parser().parse_args()
    train(env_id=cli.env, num_timesteps=cli.num_timesteps, seed=cli.seed)
class PPO2(BaseRLModel):
    """
    Proximal Policy Optimization algorithm (GPU version).
    Paper: https://arxiv.org/abs/1707.06347

    :param policy: (ActorCriticPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) Discount factor
    :param n_steps: (int) The number of steps to run for each environment
    :param ent_coef: (float) Entropy coefficient for the loss calculation
    :param learning_rate: (float, int or callable) The learning rate; a callable receives the remaining
        training fraction (1 at the first update, decreasing towards 0) and returns the rate
    :param vf_coef: (float) Value function coefficient for the loss calculation
    :param max_grad_norm: (float) The maximum value for the gradient clipping (None disables clipping)
    :param lam: (float) Factor for trade-off of bias vs variance for Generalized Advantage Estimator
    :param nminibatches: (int) Number of minibatches for the policies
    :param noptepochs: (int) Number of epoch when optimizing the surrogate
    :param cliprange: (float, int or callable) Clipping parameter; a callable is used like ``learning_rate``
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    """

    def __init__(self, policy, env, gamma=0.99, n_steps=128, ent_coef=0.01, learning_rate=2.5e-4, vf_coef=0.5,
                 max_grad_norm=0.5, lam=0.95, nminibatches=4, noptepochs=4, cliprange=0.2, verbose=0,
                 _init_setup_model=True):
        super(PPO2, self).__init__(policy=policy, env=env, requires_vec_env=True, verbose=verbose)

        # Constants are wrapped so that learn() can always call schedule(frac).
        # Integers are accepted as well as floats (e.g. learning_rate=1).
        if isinstance(learning_rate, (float, int)):
            learning_rate = constfn(learning_rate)
        else:
            assert callable(learning_rate)
        if isinstance(cliprange, (float, int)):
            cliprange = constfn(cliprange)
        else:
            assert callable(cliprange)

        self.learning_rate = learning_rate
        self.cliprange = cliprange
        self.n_steps = n_steps
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.gamma = gamma
        self.lam = lam
        self.nminibatches = nminibatches
        self.noptepochs = noptepochs

        # Everything below is created by setup_model().
        self.graph = None
        self.sess = None
        self.action_ph = None
        self.advs_ph = None
        self.rewards_ph = None
        self.old_neglog_pac_ph = None
        self.old_vpred_ph = None
        self.learning_rate_ph = None
        self.clip_range_ph = None
        self.entropy = None
        self.vf_loss = None
        self.pg_loss = None
        self.approxkl = None
        self.clipfrac = None
        self.params = None
        self._train = None
        self.loss_names = None
        self.train_model = None
        self.act_model = None
        self.step = None
        self.proba_step = None
        self.value = None
        self.initial_state = None
        self.n_batch = None

        if _init_setup_model:
            self.setup_model()

    def setup_model(self):
        """
        Build the TensorFlow graph and session: acting model, training model, losses and train op.
        """
        with SetVerbosity(self.verbose):

            self.n_batch = self.n_envs * self.n_steps

            n_cpu = multiprocessing.cpu_count()
            if sys.platform == 'darwin':
                n_cpu //= 2

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=n_cpu, graph=self.graph)

                # Explicit batch sizes are only handed to recurrent policies.
                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, LstmPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_batch // self.nminibatches

                # The training model is built with reuse=True, the acting model with reuse=False.
                act_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                        n_batch_step, reuse=False)
                train_model = self.policy(self.sess, self.observation_space, self.action_space,
                                          self.n_envs // self.nminibatches, self.n_steps, n_batch_train,
                                          reuse=True)

                self.action_ph = train_model.pdtype.sample_placeholder([None])
                self.advs_ph = tf.placeholder(tf.float32, [None])
                self.rewards_ph = tf.placeholder(tf.float32, [None])
                self.old_neglog_pac_ph = tf.placeholder(tf.float32, [None])
                self.old_vpred_ph = tf.placeholder(tf.float32, [None])
                self.learning_rate_ph = tf.placeholder(tf.float32, [])
                self.clip_range_ph = tf.placeholder(tf.float32, [])

                neglogpac = train_model.proba_distribution.neglogp(self.action_ph)
                self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy())

                # NOTE(review): the placeholders above are rank-1; confirm train_model.value_fn is rank-1
                # too. If it were (batch, 1), the subtractions below would broadcast to (batch, batch).
                vpred = train_model.value_fn
                # Clipped value loss: keep the new prediction within clip_range of the old one and
                # take the larger of the clipped / unclipped squared errors.
                vpredclipped = self.old_vpred_ph + tf.clip_by_value(
                    train_model.value_fn - self.old_vpred_ph, - self.clip_range_ph, self.clip_range_ph)
                vf_losses1 = tf.square(vpred - self.rewards_ph)
                vf_losses2 = tf.square(vpredclipped - self.rewards_ph)
                self.vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
                # Probability ratio new/old, computed from negative log-probabilities.
                ratio = tf.exp(self.old_neglog_pac_ph - neglogpac)
                pg_losses = -self.advs_ph * ratio
                pg_losses2 = -self.advs_ph * tf.clip_by_value(ratio, 1.0 - self.clip_range_ph, 1.0 + self.clip_range_ph)
                self.pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
                self.approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - self.old_neglog_pac_ph))
                self.clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), self.clip_range_ph)))
                loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef
                with tf.variable_scope('model'):
                    self.params = tf.trainable_variables()
                grads = tf.gradients(loss, self.params)
                if self.max_grad_norm is not None:
                    grads, _grad_norm = tf.clip_by_global_norm(grads, self.max_grad_norm)
                grads = list(zip(grads, self.params))
                trainer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph, epsilon=1e-5)
                self._train = trainer.apply_gradients(grads)

                self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']

                self.train_model = train_model
                self.act_model = act_model
                self.step = act_model.step
                self.proba_step = act_model.proba_step
                self.value = act_model.value
                self.initial_state = act_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)  # pylint: disable=E1101

    def _train_step(self, learning_rate, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
        """
        Run one optimisation step on a minibatch.

        :param learning_rate: (float) learning rate
        :param cliprange: (float) Clipping factor
        :param obs: (numpy array) The current observation of the environment
        :param returns: (numpy array) the returns used as value targets
        :param masks: (numpy array) The last masks for done episodes (used in recurrent policies)
        :param actions: (numpy array) the actions
        :param values: (numpy array) the values predicted when the data was collected
        :param neglogpacs: (numpy array) Negative Log-likelihood probability of Actions
        :param states: (numpy array) For recurrent policies, the internal state of the recurrent model
        :return: (list) policy gradient loss, value function loss, policy entropy,
            approximation of kl divergence, clipping fraction
        """
        # Advantages are normalised per minibatch; the epsilon guards against a zero standard deviation.
        advs = returns - values
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)
        td_map = {self.train_model.obs_ph: obs, self.action_ph: actions, self.advs_ph: advs, self.rewards_ph: returns,
                  self.learning_rate_ph: learning_rate, self.clip_range_ph: cliprange,
                  self.old_neglog_pac_ph: neglogpacs, self.old_vpred_ph: values}
        if states is not None:
            td_map[self.train_model.states_ph] = states
            td_map[self.train_model.masks_ph] = masks
        # The train op is run for its side effect only; its (last) result is dropped.
        return self.sess.run([self.pg_loss, self.vf_loss, self.entropy, self.approxkl, self.clipfrac, self._train],
                             td_map)[:-1]

    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100):
        """
        Train the model for ``total_timesteps // n_batch`` updates.

        :param total_timesteps: (int) total number of environment steps to train on
        :param callback: (callable) if not None, called as ``callback(locals(), globals())`` after every update
        :param seed: (int) forwarded to ``self._setup_learn``
        :param log_interval: (int) logging period, scaled so that 100 (and anything below) logs every
            update, 200 every second update, and so on; the first update is always logged
        :return: (PPO2) self
        """
        with SetVerbosity(self.verbose):
            self._setup_learn(seed)

            runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            # The previous condition `update % log_interval//100 == 0` parsed as
            # `(update % log_interval) // 100 == 0`, which logs in bursts for log_interval > 100.
            # It is rewritten as the presumably intended `update % (log_interval // 100) == 0`,
            # clamped to 1 so that the default still logs every update and small values cannot
            # cause a modulo by zero.
            log_every = max(log_interval // 100, 1)

            nupdates = total_timesteps // self.n_batch
            for update in range(1, nupdates + 1):
                assert self.n_batch % self.nminibatches == 0
                n_batch_train = self.n_batch // self.nminibatches
                t_start = time.time()
                # Remaining fraction of training: 1.0 at the first update, decreasing afterwards.
                frac = 1.0 - (update - 1.0) / nupdates
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos = runner.run()  # pylint: disable=E0632
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    inds = np.arange(self.n_batch)
                    for _ in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, n_batch_train):
                            end = start + n_batch_train
                            mbinds = inds[start:end]
                            slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices))
                else:  # recurrent version
                    # Whole environments are shuffled, never single steps, so sequences stay contiguous.
                    assert self.n_envs % self.nminibatches == 0
                    envinds = np.arange(self.n_envs)
                    flatinds = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                    envsperbatch = n_batch_train // self.n_steps
                    for _ in range(self.noptepochs):
                        np.random.shuffle(envinds)
                        for start in range(0, self.n_envs, envsperbatch):
                            end = start + envsperbatch
                            mb_env_inds = envinds[start:end]
                            mb_flat_inds = flatinds[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, mb_states))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if callback is not None:
                    callback(locals(), globals())

                if self.verbose >= 1 and (update % log_every == 0 or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("nupdates", update)
                    logger.logkv("total_timesteps", update * self.n_batch)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    # Measured up to the end of this update (t_start would leave the current update out).
                    logger.logkv('time_elapsed', t_now - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

            return self

    def predict(self, observation, state=None, mask=None):
        """
        Compute actions for the given observation(s) with the acting model.

        :param observation: (array-like) reshaped to ``(-1,) + observation_space.shape``
        :param state: the recurrent state, defaults to ``self.initial_state``
        :param mask: (list) the episode-start mask, defaults to all False
        :return: (actions, states) as produced by the acting model's ``step``
        """
        if state is None:
            state = self.initial_state
        if mask is None:
            mask = [False for _ in range(self.n_envs)]
        observation = np.array(observation).reshape((-1,) + self.observation_space.shape)

        actions, _, states, _ = self.step(observation, state, mask)
        return actions, states

    def action_probability(self, observation, state=None, mask=None):
        """
        Return the output of the acting model's ``proba_step`` for the given observation(s).

        :param observation: (array-like) reshaped to ``(-1,) + observation_space.shape``
        :param state: the recurrent state, defaults to ``self.initial_state``
        :param mask: (list) the episode-start mask, defaults to all False
        :return: whatever ``proba_step`` returns for these inputs
        """
        if state is None:
            state = self.initial_state
        if mask is None:
            mask = [False for _ in range(self.n_envs)]
        observation = np.array(observation).reshape((-1,) + self.observation_space.shape)

        return self.proba_step(observation, state, mask)

    def save(self, save_path):
        """
        Write the hyper-parameters and the current parameter values to ``save_path``.

        :param save_path: (str) destination handed to ``self._save_to_file``
        """
        data = {
            "gamma": self.gamma,
            "n_steps": self.n_steps,
            "vf_coef": self.vf_coef,
            "ent_coef": self.ent_coef,
            "max_grad_norm": self.max_grad_norm,
            "learning_rate": self.learning_rate,
            "lam": self.lam,
            "nminibatches": self.nminibatches,
            "noptepochs": self.noptepochs,
            "cliprange": self.cliprange,
            "verbose": self.verbose,
            "policy": self.policy,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "n_envs": self.n_envs,
            "_vectorize_action": self._vectorize_action
        }

        params = self.sess.run(self.params)

        self._save_to_file(save_path, data=data, params=params)

    @classmethod
    def load(cls, load_path, env=None, **kwargs):
        """
        Rebuild a model from a file written by ``save``.

        :param load_path: (str) location handed to ``cls._load_from_file``
        :param env: (Gym environment) environment attached to the restored model through ``set_env``
        :param kwargs: extra attributes set on the model after the saved ones, so they take precedence
        :return: (PPO2) the restored model
        """
        data, params = cls._load_from_file(load_path)

        model = cls(policy=data["policy"], env=None, _init_setup_model=False)
        model.__dict__.update(data)
        model.__dict__.update(kwargs)
        model.set_env(env)
        model.setup_model()

        # Assign the stored values to the freshly created variables, pairing them by order.
        restores = []
        for param, loaded_p in zip(model.params, params):
            restores.append(param.assign(loaded_p))
        model.sess.run(restores)

        return model
class Runner(AbstractEnvRunner):
    def __init__(self, *, env, model, n_steps, gamma, lam):
        """
        A runner to learn the policy of an environment for a model

        :param env: (Gym environment) The environment to learn from
        :param model: (Model) The model to learn
        :param n_steps: (int) The number of steps to run for each environment
        :param gamma: (float) Discount factor
        :param lam: (float) Factor for trade-off of bias vs variance for Generalized Advantage Estimator
        """
        super().__init__(env=env, model=model, n_steps=n_steps)
        self.lam = lam
        self.gamma = gamma

    def run(self):
        """
        Collect ``n_steps`` transitions per environment and compute GAE returns

        :return:
            - observations: (numpy Number) the observations
            - returns: (numpy Number) the advantage estimates plus the predicted values
            - masks: (numpy bool) whether an episode is over or not
            - actions: (numpy Number) the actions
            - values: (numpy Number) the value function output
            - negative log probabilities: (numpy Number)
            - states: (numpy Number) the internal states of the recurrent policies at the start of the rollout
            - infos: (list) the 'episode' entries found in the environment infos
        """
        # mb stands for minibatch
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
        mb_states = self.states
        ep_infos = []
        for _ in range(self.n_steps):
            actions, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones)
            # self.obs is overwritten in place below, hence the copy.
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            for info in infos:
                maybeep_info = info.get('episode')
                if maybeep_info:
                    ep_infos.append(maybeep_info)
            mb_rewards.append(rewards)
        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        # `np.bool` was only an alias of the builtin and no longer exists in NumPy >= 1.24.
        mb_dones = np.asarray(mb_dones, dtype=bool)
        last_values = self.model.value(self.obs, self.states, self.dones)
        # discount/bootstrap off value fn
        mb_advs = np.zeros_like(mb_rewards)
        last_gae_lam = 0
        for step in reversed(range(self.n_steps)):
            if step == self.n_steps - 1:
                # The last step bootstraps from the value of the observation after the rollout.
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[step + 1]
                nextvalues = mb_values[step + 1]
            delta = mb_rewards[step] + self.gamma * nextvalues * nextnonterminal - mb_values[step]
            mb_advs[step] = last_gae_lam = delta + self.gamma * self.lam * nextnonterminal * last_gae_lam
        mb_returns = mb_advs + mb_values
        return (*map(swap_and_flatten, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)), mb_states,
                ep_infos)
def safe_mean(arr):
    """
    Mean of a sequence that tolerates the empty case.
    An empty input yields ``np.nan`` instead of going through ``np.mean``. It is used for logging only.

    :param arr: (numpy array)
    :return: (float)
    """
    if len(arr) == 0:
        return np.nan
    return np.mean(arr)
def main():
    """
    Parse the Atari command line (extended with ``--policy``) and start a PPO2 training run
    """
    parser = atari_arg_parser()
    parser.add_argument('--policy', default='cnn', choices=['cnn', 'lstm', 'lnlstm', 'mlp'],
                        help='Policy architecture')
    cli = parser.parse_args()
    logger.configure()
    train(env_id=cli.env, num_timesteps=cli.num_timesteps, seed=cli.seed, policy=cli.policy)
def rolling_window(array, window):
    """
    View a numpy array through a sliding window along its last axis

    :param array: (numpy Any) the input Array
    :param window: (int) length of the rolling window
    :return: (numpy Any) strided view with one extra trailing axis of length ``window``
    """
    n_windows = array.shape[-1] - window + 1
    last_stride = array.strides[-1]
    # Consecutive windows start one element apart, so the new axis reuses the last stride.
    return np.lib.stride_tricks.as_strided(
        array,
        shape=(*array.shape[:-1], n_windows, window),
        strides=(*array.strides, last_stride),
    )


def window_func(var_1, var_2, window, func):
    """
    Reduce every rolling window of the second array and align the first array with the result

    :param var_1: (numpy Any) variable 1
    :param var_2: (numpy Any) variable 2
    :param window: (int) length of the rolling window
    :param func: (numpy function) reduction applied to each window of variable 2 (such as np.mean)
    :return: (numpy Any, numpy Any) variable 1 without its first ``window - 1`` entries, and the reduced windows
    """
    reduced = func(rolling_window(var_2, window), axis=-1)
    aligned = var_1[window - 1:]
    return aligned, reduced
def plot_curves(xy_list, xaxis, title):
    """
    plot the curves

    Each run is drawn as a scatter of its raw points plus, when it holds at least EPISODES_WINDOW
    points, the rolling mean over the last EPISODES_WINDOW episodes.

    :param xy_list: ([(numpy Number, numpy Number)]) the x and y coordinates to plot
    :param xaxis: (str) the axis for the x and y output
        (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs')
    :param title: (str) the title of the plot
    """

    plt.figure(figsize=(8, 2))
    # Right-hand limit: the largest final x among the runs that contain data at all.
    x_ends = [x[-1] for (x, _) in xy_list if len(x) > 0]
    minx = 0
    for (i, (x, y)) in enumerate(xy_list):
        # Wrap around instead of raising IndexError once there are more runs than colours.
        color = COLORS[i % len(COLORS)]
        plt.scatter(x, y, s=2)
        # A run shorter than the window has no rolling mean (the window view would need a negative
        # dimension), so only its raw points are shown.
        if len(x) >= EPISODES_WINDOW:
            x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean)  # So returns average of last EPISODE_WINDOW episodes
            plt.plot(x, y_mean, color=color)
    if x_ends:
        plt.xlim(minx, max(x_ends))
    plt.title(title)
    plt.xlabel(xaxis)
    plt.ylabel("Episode Rewards")
    plt.tight_layout()
def train(env_id, num_timesteps, seed):
    """
    Train a TRPO model on an atari environment, for testing purposes

    Every MPI worker gets its own seed, and only the worker of rank 0 configures the logger with its default outputs.

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    # the other workers get a logger without any output format
    if rank != 0:
        logger.configure(format_strs=[])
    else:
        logger.configure()

    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)

    # one monitor file per worker, only when the logger has a directory
    log_dir = logger.get_dir()
    monitor_path = os.path.join(log_dir, str(rank)) if log_dir else log_dir
    env = bench.Monitor(make_atari(env_id), monitor_path)
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    hyperparams = dict(timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, entcoeff=0.0,
                       gamma=0.98, lam=1, vf_iters=3, vf_stepsize=1e-4)
    model = TRPO(CnnPolicy, env, **hyperparams)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
def __init__(self, policy, env, gamma=0.99, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, lam=0.98,
             entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, verbose=0, _init_setup_model=True):
    """
    Store the TRPO hyperparameters and, unless disabled, build the network

    :param policy: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) the discount value
    :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon)
    :param max_kl: (float) the Kullback-Leibler loss threshold
    :param cg_iters: (int) the number of iterations for the conjugate gradient calculation
    :param lam: (float) GAE factor
    :param entcoeff: (float) the weight for the entropy loss
    :param cg_damping: (float) the damping factor applied to the Fisher vector product
    :param vf_stepsize: (float) the value function stepsize
    :param vf_iters: (int) the value function's number iterations for learning
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    """
    super(TRPO, self).__init__(policy=policy, env=env, requires_vec_env=False, verbose=verbose)

    # plain TRPO by default, the GAIL parameters below are only read when this flag is set
    self.using_gail = False

    # policy optimization hyperparameters
    self.timesteps_per_batch = timesteps_per_batch
    self.cg_iters, self.cg_damping = cg_iters, cg_damping
    self.gamma, self.lam = gamma, lam
    self.max_kl = max_kl
    self.entcoeff = entcoeff

    # value function hyperparameters
    self.vf_iters, self.vf_stepsize = vf_iters, vf_stepsize

    # GAIL Params
    self.pretrained_weight = None
    self.hidden_size_adversary = 100
    self.adversary_entcoeff = 1e-3
    self.expert_dataset = None
    self.save_per_iter = 1
    self.checkpoint_dir = "/tmp/gail/ckpt/"
    self.g_step = 1
    self.d_step = 1
    self.task_name = "task_name"
    self.d_stepsize = 3e-4

    # tensorflow handles, filled in by setup_model
    self.graph = self.sess = None
    self.policy_pi = None
    self.loss_names = None
    self.assign_old_eq_new = None
    self.compute_losses = self.compute_lossandgrad = None
    self.compute_fvp = self.compute_vflossandgrad = None
    self.d_adam = self.vfadam = None
    self.get_flat = self.set_from_flat = None

    # helpers and MPI information, filled in by setup_model
    self.timed = self.allmean = None
    self.nworkers = self.rank = None
    self.reward_giver = None
    self.step = self.proba_step = None
    self.initial_state = None
    self.params = None

    if _init_setup_model:
        self.setup_model()
MPI.COMM_WORLD.Get_size() + self.rank = MPI.COMM_WORLD.Get_rank() + np.set_printoptions(precision=3) + + self.graph = tf.Graph() + with self.graph.as_default(): + self.sess = tf_util.single_threaded_session(graph=self.graph) + + if self.using_gail: + self.reward_giver = TransitionClassifier(self.env, self.hidden_size_adversary, + entcoeff=self.adversary_entcoeff) + + # Construct network for new policy + with tf.variable_scope("pi", reuse=False): + self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, + None, reuse=False) + + # Network for old policy + with tf.variable_scope("oldpi", reuse=False): + old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, + None, reuse=False) + + atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) + ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return + + observation = self.policy_pi.obs_ph + action = self.policy_pi.pdtype.sample_placeholder([None]) + + kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution) + ent = self.policy_pi.proba_distribution.entropy() + meankl = tf.reduce_mean(kloldnew) + meanent = tf.reduce_mean(ent) + entbonus = self.entcoeff * meanent + + vferr = tf.reduce_mean(tf.square(self.policy_pi.value_fn[:, 0] - ret)) + + # advantage * pnew / pold + ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) - + old_policy.proba_distribution.logp(action)) + surrgain = tf.reduce_mean(ratio * atarg) + + optimgain = surrgain + entbonus + losses = [optimgain, meankl, entbonus, surrgain, meanent] + self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] + + dist = meankl + + all_var_list = tf_util.get_trainable_vars("pi") + var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name] + vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name] + self.vfadam = 
MpiAdam(vf_var_list, sess=self.sess) + self.get_flat = tf_util.GetFlat(var_list, sess=self.sess) + self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) + + if self.using_gail: + self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables()) + + klgrads = tf.gradients(dist, var_list) + flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") + shapes = [var.get_shape().as_list() for var in var_list] + start = 0 + tangents = [] + for shape in shapes: + var_size = tf_util.intprod(shape) + tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape)) + start += var_size + gvp = tf.add_n( + [tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 + fvp = tf_util.flatgrad(gvp, var_list) + + self.assign_old_eq_new = tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in + zipsame(tf_util.get_globals_vars("oldpi"), + tf_util.get_globals_vars("pi"))]) + self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg], losses) + self.compute_lossandgrad = tf_util.function([observation, old_policy.obs_ph, action, atarg], + losses + [tf_util.flatgrad(optimgain, var_list)]) + self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg], fvp) + self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret], + tf_util.flatgrad(vferr, vf_var_list)) + + @contextmanager + def timed(msg): + if self.rank == 0 and self.verbose >= 1: + print(colorize(msg, color='magenta')) + start_time = time.time() + yield + print(colorize("done in %.3f seconds" % (time.time() - start_time), color='magenta')) + else: + yield + + def allmean(arr): + assert isinstance(arr, np.ndarray) + out = np.empty_like(arr) + MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) + out /= self.nworkers + return out + + tf_util.initialize(sess=self.sess) + + th_init = self.get_flat() + MPI.COMM_WORLD.Bcast(th_init, root=0) + 
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100):
    """
    Train the policy with TRPO (and the discriminator when GAIL is enabled)

    Each iteration samples ``g_step`` trajectory segments. For each one it computes the natural gradient step
    with the conjugate gradient, runs a backtracking line search under the KL constraint, then fits the value
    function.

    :param total_timesteps: (int) training stops once this many timesteps were collected
        (a falsy value disables the check)
    :param callback: (function (dict, dict)) called at the start of every iteration with locals() and globals()
    :param seed: (int) passed to ``_setup_learn``
    :param log_interval: (int) not read by this implementation, kept for interface compatibility
    :return: (TRPO) the trained model (self)
    """
    with SetVerbosity(self.verbose):
        self._setup_learn(seed)

        with self.sess.as_default():
            seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_batch,
                                             reward_giver=self.reward_giver, gail=self.using_gail)

            episodes_so_far = 0
            timesteps_so_far = 0
            iters_so_far = 0
            t_start = time.time()
            lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
            rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

            true_rewbuffer = None
            if self.using_gail:
                true_rewbuffer = deque(maxlen=40)

                # if provide pretrained weight
                if self.pretrained_weight is not None:
                    tf_util.load_state(self.pretrained_weight, var_list=tf_util.get_globals_vars("pi"),
                                       sess=self.sess)

            while True:
                if callback:
                    callback(locals(), globals())
                if total_timesteps and timesteps_so_far >= total_timesteps:
                    break

                logger.log("********** Iteration %i ************" % iters_so_far)

                def fisher_vector_product(vec):
                    # fvpargs is bound below, before the first call, for every sampled segment
                    return self.allmean(self.compute_fvp(vec, *fvpargs, sess=self.sess)) + self.cg_damping * vec

                # ------------------ Update G ------------------
                logger.log("Optimizing Policy...")
                # g_step = 1 when not using GAIL
                mean_losses = None
                vpredbefore = None
                tdlamret = None
                observation = None
                action = None
                seg = None
                for _ in range(self.g_step):
                    with self.timed("sampling"):
                        seg = next(seg_gen)
                    add_vtarg_and_adv(seg, self.gamma, self.lam)
                    observation, action, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
                    vpredbefore = seg["vpred"]  # predicted value function before update
                    # standardized advantage function estimate, the epsilon avoids a division by zero
                    # when every advantage of the batch is identical
                    atarg = (atarg - atarg.mean()) / (atarg.std() + 1e-8)

                    args = seg["ob"], seg["ob"], seg["ac"], atarg
                    # the Fisher vector product only uses every fifth sample
                    fvpargs = [arr[::5] for arr in args]

                    self.assign_old_eq_new(sess=self.sess)

                    with self.timed("computegrad"):
                        *lossbefore, grad = self.compute_lossandgrad(*args, sess=self.sess)
                    lossbefore = self.allmean(np.array(lossbefore))
                    grad = self.allmean(grad)
                    if np.allclose(grad, 0):
                        logger.log("Got zero gradient. not updating")
                        # the parameters are left untouched: report the losses measured before the
                        # (skipped) update, instead of leaving mean_losses to None and crashing below
                        mean_losses = lossbefore
                    else:
                        with self.timed("cg"):
                            stepdir = conjugate_gradient(fisher_vector_product, grad, cg_iters=self.cg_iters,
                                                         verbose=self.rank == 0 and self.verbose >= 1)
                        assert np.isfinite(stepdir).all()
                        shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                        # abs(shs) to avoid taking square root of negative values
                        lagrange_multiplier = np.sqrt(abs(shs) / self.max_kl)
                        fullstep = stepdir / lagrange_multiplier
                        expectedimprove = grad.dot(fullstep)
                        surrbefore = lossbefore[0]
                        stepsize = 1.0
                        thbefore = self.get_flat()
                        thnew = None
                        # backtracking line search: halve the step until the surrogate improves
                        # without violating the KL constraint
                        for _ in range(10):
                            thnew = thbefore + fullstep * stepsize
                            self.set_from_flat(thnew)
                            mean_losses = surr, kl_loss, *_ = self.allmean(
                                np.array(self.compute_losses(*args, sess=self.sess)))
                            improve = surr - surrbefore
                            logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                            if not np.isfinite(mean_losses).all():
                                logger.log("Got non-finite value of losses -- bad!")
                            elif kl_loss > self.max_kl * 1.5:
                                logger.log("violated KL constraint. shrinking step.")
                            elif improve < 0:
                                logger.log("surrogate didn't improve. shrinking step.")
                            else:
                                logger.log("Stepsize OK!")
                                break
                            stepsize *= .5
                        else:
                            logger.log("couldn't compute a good step")
                            self.set_from_flat(thbefore)
                        if self.nworkers > 1 and iters_so_far % 20 == 0:
                            # list of tuples, used to check that every worker holds the same parameters
                            paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), self.vfadam.getflat().sum()))
                            assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

                    with self.timed("vf"):
                        for _ in range(self.vf_iters):
                            for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                                     include_final_partial_batch=False,
                                                                     batch_size=128):
                                grad = self.allmean(self.compute_vflossandgrad(mbob, mbob, mbret, sess=self.sess))
                                self.vfadam.update(grad, self.vf_stepsize)

                # mean_losses is still None if g_step is 0: nothing to report in that case
                if mean_losses is not None:
                    for (loss_name, loss_val) in zip(self.loss_names, mean_losses):
                        logger.record_tabular(loss_name, loss_val)

                logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

                if self.using_gail:
                    # ------------------ Update D ------------------
                    logger.log("Optimizing Discriminator...")
                    logger.log(fmt_row(13, self.reward_giver.loss_name))
                    # NOTE(review): this batch is overwritten inside the loop before being read,
                    # it only advances the expert dataset -- confirm whether that is intended
                    ob_expert, ac_expert = self.expert_dataset.get_next_batch(len(observation))
                    batch_size = len(observation) // self.d_step
                    d_losses = []  # list of tuples, each of which gives the loss for a minibatch
                    for ob_batch, ac_batch in dataset.iterbatches((observation, action),
                                                                  include_final_partial_batch=False,
                                                                  batch_size=batch_size):
                        ob_expert, ac_expert = self.expert_dataset.get_next_batch(len(ob_batch))
                        # update running mean/std for reward_giver
                        if hasattr(self.reward_giver, "obs_rms"):
                            self.reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0))
                        *newlosses, grad = self.reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert)
                        self.d_adam.update(self.allmean(grad), self.d_stepsize)
                        d_losses.append(newlosses)
                    logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

                    lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"])  # local values
                    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
                    lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
                    true_rewbuffer.extend(true_rets)
                else:
                    lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
                    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
                    lens, rews = map(flatten_lists, zip(*listoflrpairs))
                lenbuffer.extend(lens)
                rewbuffer.extend(rews)

                logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                if self.using_gail:
                    logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
                logger.record_tabular("EpThisIter", len(lens))
                episodes_so_far += len(lens)
                timesteps_so_far += seg["total_timestep"]
                iters_so_far += 1

                logger.record_tabular("EpisodesSoFar", episodes_so_far)
                logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                logger.record_tabular("TimeElapsed", time.time() - t_start)

                if self.verbose >= 1 and self.rank == 0:
                    logger.dump_tabular()

    return self
"vf_stepsize": self.vf_stepsize, + "vf_iters": self.vf_iters, + "pretrained_weight": self.pretrained_weight, + "reward_giver": self.reward_giver, + "expert_dataset": self.expert_dataset, + "save_per_iter": self.save_per_iter, + "checkpoint_dir": self.checkpoint_dir, + "g_step": self.g_step, + "d_step": self.d_step, + "task_name": self.task_name, + "d_stepsize": self.d_stepsize, + "using_gail": self.using_gail, + "verbose": self.verbose, + "policy": self.policy, + "observation_space": self.observation_space, + "action_space": self.action_space, + "n_envs": self.n_envs, + "_vectorize_action": self._vectorize_action + } + + params = self.sess.run(self.params) + + self._save_to_file(save_path, data=data, params=params) + + @classmethod + def load(cls, load_path, env=None, **kwargs): + data, params = cls._load_from_file(load_path) + + model = cls(policy=data["policy"], env=None, _init_setup_model=False) + model.__dict__.update(data) + model.__dict__.update(kwargs) + model.set_env(env) + model.setup_model() + + restores = [] + for param, loaded_p in zip(model.params, params): + restores.append(param.assign(loaded_p)) + model.sess.run(restores) + + return model diff --git a/stable_baselines/trpo_mpi/utils.py b/stable_baselines/trpo_mpi/utils.py new file mode 100644 index 0000000000..469a7004e8 --- /dev/null +++ b/stable_baselines/trpo_mpi/utils.py @@ -0,0 +1,141 @@ +import numpy as np + +from stable_baselines.common.vec_env import VecEnv + + +def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False): + """ + Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) + + :param policy: (MLPPolicy) the policy + :param env: (Gym Environment) the environment + :param horizon: (int) the number of timesteps to run per batch + :param reward_giver: (TransitionClassifier) the reward predicter from obsevation and action + :param gail: (bool) Whether we are using this generator for standard trpo or with gail + :return: (dict) generator that 
returns a dict with the following keys: + + - ob: (numpy Number) observations + - rew: (numpy float) rewards (if gail is used it is the predicted reward) + - vpred: (numpy float) action logits + - new: (numpy bool) dones (is end of episode) + - ac: (numpy Number) actions + - prevac: (numpy Number) previous actions + - nextvpred: (numpy float) next action logits + - ep_rets: (float) cumulated current episode reward + - ep_lens: (int) the length of the current episode + - ep_true_rets: (float) the real environment reward + """ + # Check when using GAIL + assert not (gail and reward_giver is None), "You must pass a reward giver when using GAIL" + + # Initialize state variables + step = 0 + action = env.action_space.sample() # not used, just so we have the datatype + new = True + observation = env.reset() + + cur_ep_ret = 0 # return in current episode + cur_ep_len = 0 # len of current episode + cur_ep_true_ret = 0 + ep_true_rets = [] + ep_rets = [] # returns of completed episodes in this segment + ep_lens = [] # Episode lengths + + # Initialize history arrays + observations = np.array([observation for _ in range(horizon)]) + true_rews = np.zeros(horizon, 'float32') + rews = np.zeros(horizon, 'float32') + vpreds = np.zeros(horizon, 'float32') + news = np.zeros(horizon, 'int32') + actions = np.array([action for _ in range(horizon)]) + prev_actions = actions.copy() + states = policy.initial_state + done = None + + while True: + prevac = action + action, vpred, states, _ = policy.step(observation.reshape(-1, *observation.shape), states, done) + # Slight weirdness here because we need value function at time T + # before returning segment [0, T-1] so we get the correct + # terminal value + if step > 0 and step % horizon == 0: + # Fix to avoid "mean of empty slice" warning when there is only one episode + if len(ep_rets) == 0: + ep_rets = [cur_ep_ret] + ep_lens = [cur_ep_len] + ep_true_rets = [cur_ep_true_ret] + total_timesteps = cur_ep_len + else: + total_timesteps = 
def add_vtarg_and_adv(seg, gamma, lam):
    """
    Add the GAE(lambda) advantage ("adv") and the TD(lambda) target value ("tdlamret") to the segment, in place

    :param seg: (dict) the current segment of the trajectory (see traj_segment_generator return for more information)
    :param gamma: (float) Discount factor
    :param lam: (float) GAE factor
    """
    # one extra trailing entry is needed for the last step; its flag is 0 because the
    # bootstrap value "nextvpred" is expected to be zeroed already when the last step was terminal
    episode_starts = np.append(seg["new"], 0)
    values = np.append(seg["vpred"], seg["nextvpred"])
    rewards = seg["rew"]
    horizon = len(rewards)
    advantages = np.empty(horizon, 'float32')
    seg["adv"] = advantages

    running_gae = 0
    # walk backward through time so that every step can reuse the advantage of the next one
    for step in range(horizon - 1, -1, -1):
        keep_next = 1 - episode_starts[step + 1]
        td_error = rewards[step] + gamma * values[step + 1] * keep_next - values[step]
        running_gae = td_error + gamma * lam * keep_next * running_gae
        advantages[step] = running_gae
    seg["tdlamret"] = seg["adv"] + seg["vpred"]
@pytest.mark.slow
@pytest.mark.parametrize("model_func", MODEL_FUNC_LIST)
def test_identity_multibinary(model_func):
    """
    Smoke test: train the algorithm (with a given policy) on the identity task
    (i.e. return observation as an action) with a multibinary action space, then run predictions

    :param model_func: (lambda (Gym Environment): BaseRLModel) the model generator
    """
    vec_env = DummyVecEnv([lambda: IdentityEnvMultiBinary(10)])

    agent = model_func(vec_env)
    agent.learn(total_timesteps=1000, seed=0)

    # the reward is only cumulated, no performance threshold is checked
    cumulated_reward = 0
    observation = vec_env.reset()
    for _ in range(1000):
        chosen_action, _ = agent.predict(observation)
        observation, step_reward, _, _ = vec_env.step(chosen_action)
        cumulated_reward += step_reward
@pytest.mark.slow
def test_deepq():
    """
    Smoke test: train DeepQ (dueling, prioritized replay) on atari for NUM_TIMESTEPS steps
    """
    logger.configure()
    set_global_seeds(SEED)

    # raw atari environment -> episode monitor -> DQN preprocessing
    env = wrap_atari_dqn(bench.Monitor(make_atari(ENV_ID), logger.get_dir()))

    network_kwargs = dict(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True)
    q_func = deepq_models.cnn_to_mlp(**network_kwargs)

    model_kwargs = dict(learning_rate=1e-4, buffer_size=10000, exploration_fraction=0.1,
                        exploration_final_eps=0.01, train_freq=4, learning_starts=10000,
                        target_network_update_freq=1000, gamma=0.99, prioritized_replay=True,
                        prioritized_replay_alpha=0.6, checkpoint_freq=10000)
    model = DeepQ(env=env, policy=q_func, **model_kwargs)
    model.learn(total_timesteps=NUM_TIMESTEPS)

    env.close()
    del model, env
def _relative_gap(reference, value):
    """Return |reference - value| scaled by the larger magnitude of the two (0 when both are 0)."""
    # the magnitudes are used because the rewards can be negative: dividing by a negative
    # maximum yields a negative ratio, which passes any "< threshold" check
    scale = max(abs(reference), abs(value))
    if scale == 0:
        return 0.0
    return abs(reference - value) / scale


def _mean_step_reward(model, env, n_trials):
    """Run the model for n_trials steps in env, with fixed global seeds, and return the mean reward per step."""
    acc_reward = 0
    obs = env.reset()
    set_global_seeds(0)
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        acc_reward += reward
    # the cumulated reward is iterable (the environment is wrapped in a DummyVecEnv): sum its entries
    return sum(acc_reward) / n_trials


@pytest.mark.slow
@pytest.mark.parametrize("model_class", MODEL_LIST)
def test_model_manipulation(model_class):
    """
    Test if the algorithm can be loaded and saved without any issues, the environment switching
    works and that the action prediction works

    :param model_class: (BaseRLModel) A model
    """
    try:
        env = gym.make(ENV_ID)
        env = DummyVecEnv([lambda: env])

        # create and train
        model = model_class(policy=MlpPolicy, env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS)

        # predict and measure the acc reward
        acc_reward = _mean_step_reward(model, env, N_TRIALS)

        # saving
        model.save("./test_model")

        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = gym.make(ENV_ID)
        env = DummyVecEnv([lambda: env])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = _mean_step_reward(model, env, N_TRIALS)
        # assert <5% diff
        assert _relative_gap(acc_reward, loaded_acc_reward) < 0.05, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=int(NUM_TIMESTEPS / 2))

        # validate no reset post learning
        loaded_acc_reward = _mean_step_reward(model, env, N_TRIALS)
        # assert <5% diff
        assert _relative_gap(acc_reward, loaded_acc_reward) < 0.05, \
            "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env

    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
new file mode 100644 index 0000000000..c15eba6867 --- /dev/null +++ b/tests/test_deepq.py @@ -0,0 +1,31 @@ +from stable_baselines.deepq.experiments.custom_cartpole import main as main_custom +from stable_baselines.deepq.experiments.train_cartpole import main as train_cartpole +from stable_baselines.deepq.experiments.enjoy_cartpole import main as enjoy_cartpole +from stable_baselines.deepq.experiments.train_mountaincar import main as train_mountaincar +from stable_baselines.deepq.experiments.enjoy_mountaincar import main as enjoy_mountaincar + + +class DummyObject(object): + """ + Dummy object to create fake Parsed Arguments object + """ + pass + + +args = DummyObject() +args.no_render = True +args.max_timesteps = 200 + + +def test_custom_cartpole(): + main_custom(args) + + +def test_cartpole(): + train_cartpole(args) + enjoy_cartpole(args) + + +def test_mountaincar(): + train_mountaincar(args) + enjoy_mountaincar(args) diff --git a/tests/test_distri.py b/tests/test_distri.py new file mode 100644 index 0000000000..d33b14ecf1 --- /dev/null +++ b/tests/test_distri.py @@ -0,0 +1,68 @@ +import numpy as np +import tensorflow as tf + +import stable_baselines.common.tf_util as tf_util +from stable_baselines.common.distributions import DiagGaussianProbabilityDistributionType,\ + CategoricalProbabilityDistributionType, \ + MultiCategoricalProbabilityDistributionType, BernoulliProbabilityDistributionType + + +@tf_util.in_session +def test_probtypes(): + """ + test probability distribution types + """ + np.random.seed(0) + + pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) + diag_gauss = DiagGaussianProbabilityDistributionType(pdparam_diag_gauss.size // 2) + validate_probtype(diag_gauss, pdparam_diag_gauss) + + pdparam_categorical = np.array([-.2, .3, .5]) + categorical = CategoricalProbabilityDistributionType(pdparam_categorical.size) + validate_probtype(categorical, pdparam_categorical) + + nvec = [1, 2, 3] + pdparam_multicategorical = np.array([-.2, .3, 
.5, .1, 1, -.1]) + multicategorical = MultiCategoricalProbabilityDistributionType(nvec) + validate_probtype(multicategorical, pdparam_multicategorical) + + pdparam_bernoulli = np.array([-.2, .3, .5]) + bernoulli = BernoulliProbabilityDistributionType(pdparam_bernoulli.size) + validate_probtype(bernoulli, pdparam_bernoulli) + + +def validate_probtype(probtype, pdparam): + """ + validate probability distribution types + + :param probtype: (ProbabilityDistributionType) the type to validate + :param pdparam: ([float]) the flat probabilities to test + """ + number_samples = 100000 + # Check to see if mean negative log likelihood == differential entropy + mval = np.repeat(pdparam[None, :], number_samples, axis=0) + mval_ph = probtype.param_placeholder([number_samples]) + xval_ph = probtype.sample_placeholder([number_samples]) + proba_distribution = probtype.proba_distribution_from_flat(mval_ph) + calcloglik = tf_util.function([xval_ph, mval_ph], proba_distribution.logp(xval_ph)) + calcent = tf_util.function([mval_ph], proba_distribution.entropy()) + xval = tf.get_default_session().run(proba_distribution.sample(), feed_dict={mval_ph: mval}) + logliks = calcloglik(xval, mval) + entval_ll = - logliks.mean() + entval_ll_stderr = logliks.std() / np.sqrt(number_samples) + entval = calcent(mval).mean() + assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas + + # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] + mval2_ph = probtype.param_placeholder([number_samples]) + pd2 = probtype.proba_distribution_from_flat(mval2_ph) + tmp = pdparam + np.random.randn(pdparam.size) * 0.1 + mval2 = np.repeat(tmp[None, :], number_samples, axis=0) + calckl = tf_util.function([mval_ph, mval2_ph], proba_distribution.kl(pd2)) + klval = calckl(mval, mval2).mean() + logliks = calcloglik(xval, mval2) + klval_ll = - entval - logliks.mean() + klval_ll_stderr = logliks.std() / np.sqrt(number_samples) + assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas + 
print('ok on', probtype, pdparam) diff --git a/tests/test_identity.py b/tests/test_identity.py new file mode 100644 index 0000000000..a5ee0e016a --- /dev/null +++ b/tests/test_identity.py @@ -0,0 +1,53 @@ +import pytest + +from stable_baselines.a2c import A2C +from stable_baselines.acer import ACER +from stable_baselines.acktr import ACKTR +from stable_baselines.deepq import DeepQ +from stable_baselines.ppo1 import PPO1 +from stable_baselines.ppo2 import PPO2 +from stable_baselines.trpo_mpi import TRPO +from stable_baselines.common.identity_env import IdentityEnv +from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv +from stable_baselines.common.policies import MlpPolicy +from stable_baselines.deepq import models as deepq_models + +learn_func_list = [ + lambda e: A2C(policy=MlpPolicy, learning_rate=1e-3, n_steps=1, + gamma=0.7, env=e).learn(total_timesteps=10000, seed=0), + lambda e: ACER(policy=MlpPolicy, env=e, + n_steps=1, replay_ratio=1).learn(total_timesteps=10000, seed=0), + lambda e: ACKTR(policy=MlpPolicy, env=e, learning_rate=5e-4, n_steps=1).learn(total_timesteps=20000, seed=0), + lambda e: DeepQ(policy=deepq_models.mlp([32]), batch_size=16, gamma=0.1, + exploration_fraction=0.001, env=e).learn(total_timesteps=30000, seed=0), + lambda e: PPO1(policy=MlpPolicy, env=e, lam=0.7, + optim_batchsize=16, optim_stepsize=1e-3).learn(total_timesteps=10000, seed=0), + lambda e: PPO2(policy=MlpPolicy, env=e, learning_rate=1.5e-3, + lam=0.8).learn(total_timesteps=20000, seed=0), + lambda e: TRPO(policy=MlpPolicy, env=e, max_kl=0.05, lam=0.7).learn(total_timesteps=10000, seed=0), +] + + +@pytest.mark.slow +@pytest.mark.parametrize("learn_func", learn_func_list) +def test_identity(learn_func): + """ + Test if the algorithm (with a given policy) + can learn an identity transformation (i.e. 
return observation as an action) + + :param learn_func: (lambda (Gym Environment): A2CPolicy) the policy generator + """ + env = DummyVecEnv([lambda: IdentityEnv(10)]) + + model = learn_func(env) + + n_trials = 1000 + reward_sum = 0 + obs = env.reset() + for _ in range(n_trials): + action, _ = model.predict(obs) + obs, reward, _, _ = env.step(action) + reward_sum += reward + assert reward_sum > 0.9 * n_trials + # Free memory + del model, env diff --git a/tests/test_logger.py b/tests/test_logger.py new file mode 100644 index 0000000000..aebdb6673c --- /dev/null +++ b/tests/test_logger.py @@ -0,0 +1,40 @@ +import pytest + +from stable_baselines.logger import make_output_format, read_tb, read_csv, read_json, _demo + + +KEY_VALUES = {'test': 1, 'b': -3.14, '8': 9.9} +LOG_DIR = '/tmp/openai_baselines/' + + +def test_main(): + """ + Dry-run python -m stable_baselines.logger + """ + _demo() + + +@pytest.mark.parametrize('_format', ['tensorboard', 'stdout', 'log', 'json', 'csv']) +def test_make_output(_format): + """ + test make output + + :param _format: (str) output format + """ + writer = make_output_format(_format, LOG_DIR) + writer.writekvs(KEY_VALUES) + if _format == 'tensorboard': + read_tb(LOG_DIR) + elif _format == "csv": + read_csv(LOG_DIR + 'progress.csv') + elif _format == 'json': + read_json(LOG_DIR + 'progress.json') + writer.close() + + +def test_make_output_fail(): + """ + test value error on logger + """ + with pytest.raises(ValueError): + make_output_format('dummy_format', LOG_DIR) diff --git a/tests/test_math_util.py b/tests/test_math_util.py new file mode 100644 index 0000000000..aac4107a93 --- /dev/null +++ b/tests/test_math_util.py @@ -0,0 +1,15 @@ +import numpy as np + +from stable_baselines.common.math_util import discount_with_boundaries + + +def test_discount_with_boundaries(): + """ + test the discount_with_boundaries function + """ + gamma = 0.9 + rewards = np.array([1.0, 2.0, 3.0, 4.0], 'float32') + episode_starts = [1.0, 0.0, 0.0, 1.0] + 
discounted_rewards = discount_with_boundaries(rewards, episode_starts, gamma) + assert np.allclose(discounted_rewards, [1 + gamma * 2 + gamma ** 2 * 3, 2 + gamma * 3, 3, 4]) + return diff --git a/tests/test_mpi_adam.py b/tests/test_mpi_adam.py new file mode 100644 index 0000000000..c6129fd96e --- /dev/null +++ b/tests/test_mpi_adam.py @@ -0,0 +1,10 @@ +import subprocess + +from .test_common import _assert_eq + + +def test_mpi_adam(): + """Test RunningMeanStd object for MPI""" + return_code = subprocess.call(['mpirun', '--allow-run-as-root', '-np', '2', + 'python', '-m', 'stable_baselines.common.mpi_adam']) + _assert_eq(return_code, 0) diff --git a/tests/test_running_stat.py b/tests/test_running_stat.py new file mode 100644 index 0000000000..fa6a7fc2d5 --- /dev/null +++ b/tests/test_running_stat.py @@ -0,0 +1,20 @@ +import numpy as np + +from stable_baselines.common.running_stat import RunningStat + + +def test_running_stat(): + """ + test RunningStat object + """ + for shape in ((), (3,), (3, 4)): + hist = [] + running_stat = RunningStat(shape) + for _ in range(5): + val = np.random.randn(*shape) + running_stat.push(val) + hist.append(val) + _mean = np.mean(hist, axis=0) + assert np.allclose(running_stat.mean, _mean) + _var = np.square(_mean) if (len(hist) == 1) else np.var(hist, ddof=1, axis=0) + assert np.allclose(running_stat.var, _var) diff --git a/tests/test_save.py b/tests/test_save.py new file mode 100644 index 0000000000..f9a90804f0 --- /dev/null +++ b/tests/test_save.py @@ -0,0 +1,116 @@ +import os + +import pytest + +from stable_baselines.a2c import A2C +from stable_baselines.acer import ACER +from stable_baselines.acktr import ACKTR +from stable_baselines.deepq import DeepQ +from stable_baselines.ppo1 import PPO1 +from stable_baselines.ppo2 import PPO2 +from stable_baselines.trpo_mpi import TRPO +from stable_baselines.common import set_global_seeds +from stable_baselines.common.identity_env import IdentityEnv +from 
stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv +from stable_baselines.common.policies import MlpPolicy +from stable_baselines.deepq import models as deepq_models + +N_TRIALS = 2000 + +MODEL_POLICY_LIST = [ + (A2C, MlpPolicy), + (ACER, MlpPolicy), + (ACKTR, MlpPolicy), + (DeepQ, deepq_models.mlp([32])), + (PPO1, MlpPolicy), + (PPO2, MlpPolicy), + (TRPO, MlpPolicy) +] + + +@pytest.mark.slow +@pytest.mark.parametrize("model_policy", MODEL_POLICY_LIST) +def test_model_manipulation(model_policy): + """ + Test if the algorithm (with a given policy) can be loaded and saved without any issues, the environment switching + works and that the action prediction works + + :param model_policy: (BaseRLModel, Object) A model, policy pair + """ + model_class, policy = model_policy + + try: + env = DummyVecEnv([lambda: IdentityEnv(10)]) + + # check the env is deterministic + action = [env.action_space.sample()] + set_global_seeds(0) + obs = env.step(action)[0] + for _ in range(N_TRIALS): + set_global_seeds(0) + assert obs == env.step(action)[0], "Error: environment tested not deterministic with the same seed" + + # create and train + model = model_class(policy=policy, env=env) + model.learn(total_timesteps=50000) + + # predict and measure the acc reward + acc_reward = 0 + obs = env.reset() + set_global_seeds(0) + for _ in range(N_TRIALS): + action, _ = model.predict(obs) + obs, reward, _, _ = env.step(action) + acc_reward += reward + acc_reward = sum(acc_reward) / N_TRIALS + + # saving + model.save("./test_model") + + del model, env + + # loading + model = model_class.load("./test_model") + + # changing environment (note: this can be done at loading) + env = DummyVecEnv([lambda: IdentityEnv(10)]) + model.set_env(env) + + # predict the same output before saving + loaded_acc_reward = 0 + obs = env.reset() + set_global_seeds(0) + for _ in range(N_TRIALS): + action, _ = model.predict(obs) + obs, reward, _, _ = env.step(action) + loaded_acc_reward += reward + 
loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS + assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \ + "loading and saving" + + # learn post loading + model.learn(total_timesteps=1000) + + # validate no reset post learning + loaded_acc_reward = 0 + obs = env.reset() + set_global_seeds(0) + for _ in range(N_TRIALS): + action, _ = model.predict(obs) + obs, reward, _, _ = env.step(action) + loaded_acc_reward += reward + loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS + assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \ + "pre learning and post learning" + + # predict new values + obs = env.reset() + for _ in range(N_TRIALS): + action, _ = model.predict(obs) + obs, _, _, _ = env.step(action) + + del model, env + + finally: + if os.path.exists("./test_model"): + os.remove("./test_model") diff --git a/tests/test_schedules.py b/tests/test_schedules.py new file mode 100644 index 0000000000..d8deb254fe --- /dev/null +++ b/tests/test_schedules.py @@ -0,0 +1,33 @@ +import numpy as np + +from stable_baselines.common.schedules import ConstantSchedule, PiecewiseSchedule + + +def test_piecewise_schedule(): + """ + test PiecewiseSchedule + """ + piecewise_sched = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], + outside_value=500) + + assert np.isclose(piecewise_sched.value(-10), 500) + assert np.isclose(piecewise_sched.value(0), 150) + assert np.isclose(piecewise_sched.value(5), 200) + assert np.isclose(piecewise_sched.value(9), 80) + assert np.isclose(piecewise_sched.value(50), 50) + assert np.isclose(piecewise_sched.value(80), 50) + assert np.isclose(piecewise_sched.value(150), 0) + assert np.isclose(piecewise_sched.value(175), -25) + assert np.isclose(piecewise_sched.value(201), 500) + assert np.isclose(piecewise_sched.value(500), 500) + + assert np.isclose(piecewise_sched.value(200 - 1e-10), -50) + + +def 
test_constant_schedule(): + """ + test ConstantSchedule + """ + constant_sched = ConstantSchedule(5) + for i in range(-100, 100): + assert np.isclose(constant_sched.value(i), 5) diff --git a/baselines/common/tests/test_segment_tree.py b/tests/test_segment_tree.py similarity index 88% rename from baselines/common/tests/test_segment_tree.py rename to tests/test_segment_tree.py index 700e0bb456..4719d67d37 100644 --- a/baselines/common/tests/test_segment_tree.py +++ b/tests/test_segment_tree.py @@ -1,9 +1,12 @@ import numpy as np -from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree +from stable_baselines.common.segment_tree import SumSegmentTree, MinSegmentTree def test_tree_set(): + """ + test Segment Tree data structure + """ tree = SumSegmentTree(4) tree[2] = 1.0 @@ -18,6 +21,9 @@ def test_tree_set(): def test_tree_set_overlap(): + """ + test Segment Tree data structure + """ tree = SumSegmentTree(4) tree[2] = 1.0 @@ -31,6 +37,9 @@ def test_tree_set_overlap(): def test_prefixsum_idx(): + """ + test Segment Tree data structure + """ tree = SumSegmentTree(4) tree[2] = 1.0 @@ -45,6 +54,9 @@ def test_prefixsum_idx(): def test_prefixsum_idx2(): + """ + test Segment Tree data structure + """ tree = SumSegmentTree(4) tree[0] = 0.5 @@ -61,6 +73,9 @@ def test_prefixsum_idx2(): def test_max_interval_tree(): + """ + test Segment Tree data structure + """ tree = MinSegmentTree(4) tree[0] = 1.0 diff --git a/tests/test_tf_util.py b/tests/test_tf_util.py new file mode 100644 index 0000000000..ef7f872139 --- /dev/null +++ b/tests/test_tf_util.py @@ -0,0 +1,43 @@ +# tests for tf_util +import tensorflow as tf + +from stable_baselines.common.tf_util import function, initialize, single_threaded_session + + +def test_function(): + """ + test the function function in tf_util + """ + with tf.Graph().as_default(): + x_ph = tf.placeholder(tf.int32, (), name="x") + y_ph = tf.placeholder(tf.int32, (), name="y") + z_ph = 3 * x_ph + 2 * y_ph + linear_fn = function([x_ph, 
y_ph], z_ph, givens={y_ph: 0}) + + with single_threaded_session(): + initialize() + + assert linear_fn(2) == 6 + assert linear_fn(2, 2) == 10 + + +def test_multikwargs(): + """ + test the function function in tf_util + """ + with tf.Graph().as_default(): + x_ph = tf.placeholder(tf.int32, (), name="x") + with tf.variable_scope("other"): + x2_ph = tf.placeholder(tf.int32, (), name="x") + z_ph = 3 * x_ph + 2 * x2_ph + + linear_fn = function([x_ph, x2_ph], z_ph, givens={x2_ph: 0}) + with single_threaded_session(): + initialize() + assert linear_fn(2) == 6 + assert linear_fn(2, 2) == 10 + + +if __name__ == '__main__': + test_function() + test_multikwargs() diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py new file mode 100644 index 0000000000..609a483d8f --- /dev/null +++ b/tests/test_vec_normalize.py @@ -0,0 +1,59 @@ +import subprocess + +import gym +import numpy as np + +from stable_baselines.common.running_mean_std import RunningMeanStd +from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv +from stable_baselines.common.vec_env.vec_normalize import VecNormalize +from .test_common import _assert_eq + +ENV_ID = 'BreakoutNoFrameskip-v4' + + +def test_runningmeanstd(): + """Test RunningMeanStd object""" + for (x_1, x_2, x_3) in [ + (np.random.randn(3), np.random.randn(4), np.random.randn(5)), + (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2))]: + rms = RunningMeanStd(epsilon=0.0, shape=x_1.shape[1:]) + + x_cat = np.concatenate([x_1, x_2, x_3], axis=0) + moments_1 = [x_cat.mean(axis=0), x_cat.var(axis=0)] + rms.update(x_1) + rms.update(x_2) + rms.update(x_3) + moments_2 = [rms.mean, rms.var] + + assert np.allclose(moments_1, moments_2) + + +def test_vec_env(): + """Test VecNormalize Object""" + + def make_env(): + return gym.make(ENV_ID) + + env = DummyVecEnv([make_env]) + env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.) 
+ _, done = env.reset(), [False] + obs = None + while not done[0]: + actions = [env.action_space.sample()] + obs, _, done, _ = env.step(actions) + assert np.max(obs) <= 10 + + +def test_mpi_runningmeanstd(): + """Test RunningMeanStd object for MPI""" + return_code = subprocess.call(['mpirun', '--allow-run-as-root', '-np', '2', + 'python', '-m', 'stable_baselines.common.mpi_running_mean_std']) + _assert_eq(return_code, 0) + + +def test_mpi_moments(): + """ + test running mean std function + """ + subprocess.check_call(['mpirun', '--allow-run-as-root', '-np', '3', 'python', '-c', + 'from stable_baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])