From 489387475366699aa734b6d4e21fbd387ac91dee Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN
Date: Sun, 15 Mar 2020 22:35:11 +0100
Subject: [PATCH 1/7] Fix benchmark script

---
 utils/benchmark.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/benchmark.py b/utils/benchmark.py
index e5b2268..5e0a2f8 100644
--- a/utils/benchmark.py
+++ b/utils/benchmark.py
@@ -56,7 +56,7 @@
             continue

         # Skip old BipedalWalker version
-        if 'Walker-v2' in trained_model or 'WalkerHardcore-v2' in trained_model:
+        if 'Walker-v2' in env_id or 'WalkerHardcore-v2' in env_id:
            continue

        reward_log = '{}/{}/'.format(args.benchmark_dir, trained_model)

From 2777c0376b2e96f146993b5b85a94ebaa0ea0b37 Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN
Date: Tue, 17 Mar 2020 21:59:51 +0100
Subject: [PATCH 2/7] Fix for new gym version

---
 hyperparams/a2c.yml   | 2 +-
 hyperparams/acktr.yml | 2 +-
 hyperparams/ddpg.yml  | 2 +-
 hyperparams/ppo2.yml  | 2 +-
 hyperparams/sac.yml   | 2 +-
 hyperparams/td3.yml   | 2 +-
 hyperparams/trpo.yml  | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/hyperparams/a2c.yml b/hyperparams/a2c.yml
index b89d301..3e46101 100644
--- a/hyperparams/a2c.yml
+++ b/hyperparams/a2c.yml
@@ -57,7 +57,7 @@ MountainCarContinuous-v0:
   policy: 'MlpPolicy'
   ent_coef: 0.0

-BipedalWalker-v2:
+BipedalWalker-v3:
   normalize: true
   n_envs: 16
   n_timesteps: !!float 5e6
diff --git a/hyperparams/acktr.yml b/hyperparams/acktr.yml
index 742bc6a..ae61155 100644
--- a/hyperparams/acktr.yml
+++ b/hyperparams/acktr.yml
@@ -123,7 +123,7 @@ BipedalWalkerHardcore-v3:
   vf_coef: 0.51

 # Tuned
-BipedalWalker-v2:
+BipedalWalker-v3:
   normalize: true
   n_envs: 8
   n_timesteps: !!float 5e6
diff --git a/hyperparams/ddpg.yml b/hyperparams/ddpg.yml
index eb092ac..8f9f6fe 100644
--- a/hyperparams/ddpg.yml
+++ b/hyperparams/ddpg.yml
@@ -20,7 +20,7 @@ Pendulum-v0:
   memory_limit: 50000

 # Tuned
-BipedalWalker-v2:
+BipedalWalker-v3:
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   noise_type: 'adaptive-param'
diff --git a/hyperparams/ppo2.yml b/hyperparams/ppo2.yml
index 6c14b27..b66e303 100644
--- a/hyperparams/ppo2.yml
+++ b/hyperparams/ppo2.yml
@@ -101,7 +101,7 @@ Acrobot-v1:
   noptepochs: 4
   ent_coef: 0.0

-BipedalWalker-v2:
+BipedalWalker-v3:
   normalize: true
   n_envs: 16
   n_timesteps: !!float 5e6
diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml
index 01ee733..4fb50c5 100644
--- a/hyperparams/sac.yml
+++ b/hyperparams/sac.yml
@@ -23,7 +23,7 @@ LunarLanderContinuous-v2:
   batch_size: 256
   learning_starts: 1000

-BipedalWalker-v2:
+BipedalWalker-v3:
   n_timesteps: !!float 1e6
   policy: 'CustomSACPolicy'
   learning_rate: lin_3e-4
diff --git a/hyperparams/td3.yml b/hyperparams/td3.yml
index 13e4932..a5551de 100644
--- a/hyperparams/td3.yml
+++ b/hyperparams/td3.yml
@@ -49,7 +49,7 @@ HalfCheetahBulletEnv-v0:
   gradient_steps: 1000
   policy_kwargs: "dict(layers=[400, 300])"

-BipedalWalker-v2:
+BipedalWalker-v3:
   n_timesteps: !!float 2e6
   policy: 'MlpPolicy'
   gamma: 0.99
diff --git a/hyperparams/trpo.yml b/hyperparams/trpo.yml
index 9db166e..d66c55f 100644
--- a/hyperparams/trpo.yml
+++ b/hyperparams/trpo.yml
@@ -145,7 +145,7 @@ HopperBulletEnv-v0:
   vf_stepsize: !!float 1e-3

 # Tuned
-BipedalWalker-v2:
+BipedalWalker-v3:
   env_wrapper: utils.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 5e6
   policy: 'MlpPolicy'

From 8af9557dc47d6297128a4f6bff0589d7db1d36aa Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN
Date: Tue, 17 Mar 2020 22:01:00 +0100
Subject: [PATCH 3/7] Add callback support

---
 enjoy.py                 |  25 ++++-----
 train.py                 |  82 ++++++++++++++++++++++-------
 utils/callbacks.py       |  67 +++++++++++++++++++++++
 utils/hyperparams_opt.py | 111 ++++++++++-----------------------------
 utils/import_envs.py     |  14 +++++
 utils/utils.py           |   7 ++-
 6 files changed, 190 insertions(+), 116 deletions(-)
 create mode 100644 utils/callbacks.py
 create mode 100644 utils/import_envs.py

diff --git a/enjoy.py b/enjoy.py
index 7f04a4f..8af4f19 100644
--- a/enjoy.py
+++ b/enjoy.py
@@ -10,15 +10,8 @@ warnings.filterwarnings("ignore", category=UserWarning, module='gym')

 import gym
-try:
-    import pybullet_envs
-except ImportError:
-    pybullet_envs = None
+import utils.import_envs  # pytype: disable=import-error
 import numpy as np
-try:
-    import highway_env
-except ImportError:
-    highway_env = None
 import stable_baselines
 from stable_baselines.common import set_global_seeds
 from stable_baselines.common.vec_env import VecNormalize, VecFrameStack, VecEnv
@@ -50,6 +43,8 @@ def main():
                         help='Use deterministic actions')
     parser.add_argument('--stochastic', action='store_true', default=False,
                         help='Use stochastic actions (for DDPG/DQN/SAC)')
+    parser.add_argument('--load-best', action='store_true', default=False,
+                        help='Load best model instead of last model if available')
     parser.add_argument('--norm-reward', action='store_true', default=False,
                         help='Normalize reward if applicable (trained with VecNormalize)')
     parser.add_argument('--seed', help='Random generator seed', type=int, default=0)
@@ -78,7 +73,7 @@

     assert os.path.isdir(log_path), "The {} folder was not found".format(log_path)

-    model_path = find_saved_model(algo, log_path, env_id)
+    model_path = find_saved_model(algo, log_path, env_id, load_best=args.load_best)

     if algo in ['dqn', 'ddpg', 'sac', 'td3']:
         args.n_envs = 1
@@ -108,12 +103,13 @@
     deterministic = args.deterministic or algo in ['dqn', 'ddpg', 'sac', 'her', 'td3'] and not args.stochastic

     episode_reward = 0.0
-    episode_rewards = []
+    episode_rewards, episode_lengths = [], []
     ep_len = 0
     # For HER, monitor success rate
     successes = []
+    state = None
     for _ in range(args.n_timesteps):
-        action, _ = model.predict(obs, deterministic=deterministic)
+        action, state = model.predict(obs, state=state, deterministic=deterministic)
         # Random Agent
         # action = [env.action_space.sample()]
         # Clip Action to avoid out of bound errors
@@ -140,7 +136,9 @@
                 # is a normalized reward when `--norm_reward` flag is passed
                 print("Episode Reward: {:.2f}".format(episode_reward))
                 print("Episode Length", ep_len)
+                state = None
                 episode_rewards.append(episode_reward)
+                episode_lengths.append(ep_len)
                 episode_reward = 0.0
                 ep_len = 0
@@ -159,7 +157,10 @@
         print("Success rate: {:.2f}%".format(100 * np.mean(successes)))

     if args.verbose > 0 and len(episode_rewards) > 0:
-        print("Mean reward: {:.2f}".format(np.mean(episode_rewards)))
+        print("Mean reward: {:.2f} +/- {:.2f}".format(np.mean(episode_rewards), np.std(episode_rewards)))
+
+    if args.verbose > 0 and len(episode_lengths) > 0:
+        print("Mean episode length: {:.2f} +/- {:.2f}".format(np.mean(episode_lengths), np.std(episode_lengths)))

     # Workaround for https://github.com/openai/gym/issues/893
     if not args.no_render:
diff --git a/train.py b/train.py
index 2b89411..1fe6407 100644
--- a/train.py
+++ b/train.py
@@ -1,5 +1,6 @@
 import os
 import time
+import uuid
 import difflib
 import argparse
 import importlib
@@ -15,15 +16,7 @@
 import numpy as np
 import yaml

 # Optional dependencies
-try:
-    import pybullet_envs
-except ImportError:
-    pybullet_envs = None
-try:
-    import highway_env
-except ImportError:
-    highway_env = None
-
+import utils.import_envs  # pytype: disable=import-error
 try:
     import mpi4py
     from mpi4py import MPI
@@ -35,9 +28,11 @@
 from stable_baselines.common.vec_env import VecFrameStack, SubprocVecEnv, VecNormalize, DummyVecEnv
 from stable_baselines.common.noise import AdaptiveParamNoiseSpec, NormalActionNoise, OrnsteinUhlenbeckActionNoise
 from stable_baselines.common.schedules import constfn
+from stable_baselines.common.callbacks import CheckpointCallback, EvalCallback

 from utils import make_env, ALGOS, linear_schedule, get_latest_run_id, get_wrapper_class, find_saved_model
 from utils.hyperparams_opt import hyperparam_optimization
+from utils.callbacks import SaveVecNormalizeCallback
 from utils.noise import LinearNormalActionNoise
 from utils.utils import StoreDict
@@ -54,6 +49,12 @@
                         type=int)
     parser.add_argument('--log-interval', help='Override log interval (default: -1, no change)', default=-1,
                         type=int)
+    parser.add_argument('--eval-freq', help='Evaluate the agent every n steps (if negative, no evaluation)',
+                        default=10000, type=int)
+    parser.add_argument('--eval-episodes', help='Number of episodes to use for evaluation',
+                        default=5, type=int)
+    parser.add_argument('--save-freq', help='Save the model every n steps (if negative, no checkpoint)',
+                        default=-1, type=int)
     parser.add_argument('-f', '--log-folder', help='Log folder', type=str, default='logs')
     parser.add_argument('--seed', help='Random generator seed', type=int, default=0)
     parser.add_argument('--n-trials', help='Number of trials for optimizing hyperparameters', type=int, default=10)
@@ -70,6 +71,8 @@
                         help='Additional external Gym environemnt package modules to import (e.g. gym_minigrid)')
     parser.add_argument('-params', '--hyperparams', type=str, nargs='+', action=StoreDict,
                         help='Overwrite hyperparameter (e.g. learning_rate:0.01 train_freq:10)')
+    parser.add_argument('-uuid', '--uuid', action='store_true', default=False,
+                        help='Ensure that the run has a unique ID')
     args = parser.parse_args()

     # Going through custom gym packages to let them register in the global registory
@@ -87,6 +90,12 @@
         closest_match = "'no close match found...'"
         raise ValueError('{} not found in gym registry, you maybe meant {}?'.format(env_id, closest_match))

+    # Unique id to ensure there is no race condition for the folder creation
+    uuid_str = f'_{uuid.uuid4()}' if args.uuid else ''
+    if args.seed < 0:
+        # Seed but with a random one
+        args.seed = np.random.randint(2**32 - 1)
+
     set_global_seeds(args.seed)

     if args.trained_agent != "":
@@ -199,10 +208,17 @@
            del hyperparams['env_wrapper']

     log_path = "{}/{}/".format(args.log_folder, args.algo)
-    save_path = os.path.join(log_path, "{}_{}".format(env_id, get_latest_run_id(log_path, env_id) + 1))
+    save_path = os.path.join(log_path, "{}_{}{}".format(env_id, get_latest_run_id(log_path, env_id) + 1, uuid_str))
     params_path = "{}/{}".format(save_path, env_id)
     os.makedirs(params_path, exist_ok=True)

+    callbacks = []
+    if args.save_freq > 0:
+        # Account for the number of parallel environments
+        args.save_freq = max(args.save_freq // n_envs, 1)
+        callbacks.append(CheckpointCallback(save_freq=args.save_freq,
+                                            save_path=save_path, name_prefix='rl_model', verbose=1))
+
     def create_env(n_envs, eval_env=False):
         """
         Create the environment and wrap it if necessary
@@ -254,6 +270,35 @@ def create_env(n_envs, eval_env=False):

     env = create_env(n_envs)

+    # Create test env if needed, do not normalize reward
+    eval_env = None
+    if args.eval_freq > 0:
+        # Account for the number of parallel environments
+        args.eval_freq = max(args.eval_freq // n_envs, 1)
+
+        # Do not normalize the rewards of the eval env
+        old_kwargs = None
+        if normalize:
+            if len(normalize_kwargs) > 0:
+                old_kwargs = normalize_kwargs.copy()
+                normalize_kwargs['norm_reward'] = False
+            else:
+                normalize_kwargs = {'norm_reward': False}
+
+        if args.verbose > 0:
+            print("Creating test environment")
+
+        save_vec_normalize = SaveVecNormalizeCallback(save_freq=1, save_path=params_path)
+        eval_callback = EvalCallback(create_env(1, eval_env=True), callback_on_new_best=save_vec_normalize,
+                                     best_model_save_path=save_path, n_eval_episodes=args.eval_episodes,
+                                     log_path=save_path, eval_freq=args.eval_freq)
+        callbacks.append(eval_callback)
+
+        # Restore original kwargs
+        if old_kwargs is not None:
+            normalize_kwargs = old_kwargs.copy()
+
     # Stop env processes to free memory
     if args.optimize_hyperparameters and n_envs > 1:
         env.close()
@@ -348,6 +393,9 @@ def create_model(*_args, **kwargs):
     if args.log_interval > -1:
         kwargs = {'log_interval': args.log_interval}

+    if len(callbacks) > 0:
+        kwargs['callback'] = callbacks
+
     # Save hyperparams
     with open(os.path.join(params_path, 'config.yml'), 'w') as f:
         yaml.dump(saved_hyperparams, f)
@@ -366,12 +414,8 @@ def create_model(*_args, **kwargs):

     model.save("{}/{}".format(save_path, env_id))

-    if normalize:
-        # TODO: use unwrap_vec_normalize()
-        # Unwrap
-        if isinstance(env, VecFrameStack):
-            env = env.venv
-        # Important: save the running average, for testing the agent we need that normalization
-        env.save(os.path.join(params_path, 'vecnormalize.pkl'))
-        # Deprecated saving:
-        # env.save_running_average(params_path)
+    if normalize:
+        # Important: save the running average, for testing the agent we need that normalization
+        model.get_vec_normalize_env().save(os.path.join(params_path, 'vecnormalize.pkl'))
+        # Deprecated saving:
+        # env.save_running_average(params_path)
diff --git a/utils/callbacks.py b/utils/callbacks.py
new file mode 100644
index 0000000..c31f6dc
--- /dev/null
+++ b/utils/callbacks.py
@@ -0,0 +1,67 @@
+import os
+
+import numpy as np
+
+from stable_baselines.common.callbacks import BaseCallback, EvalCallback
+
+
+class TrialEvalCallback(EvalCallback):
+    """
+    Callback used for evaluating and reporting a trial.
+    """
+    def __init__(self, eval_env, trial, n_eval_episodes=5,
+                 eval_freq=10000, deterministic=True, verbose=0):
+
+        super(TrialEvalCallback, self).__init__(eval_env=eval_env, n_eval_episodes=n_eval_episodes,
+                                                eval_freq=eval_freq,
+                                                deterministic=deterministic,
+                                                verbose=verbose)
+        self.trial = trial
+        self.eval_idx = 0
+        self.is_pruned = False
+
+    def _on_step(self):
+        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
+            super(TrialEvalCallback, self)._on_step()
+            self.eval_idx += 1
+            # report best or report current ?
+            # report num_timesteps or elapsed time ?
+            self.trial.report(-1 * self.last_mean_reward, self.eval_idx)
+            # Prune trial if needed
+            if self.trial.should_prune(self.eval_idx):
+                self.is_pruned = True
+                return False
+        return True
+
+
+class SaveVecNormalizeCallback(BaseCallback):
+    """
+    Callback for saving a VecNormalize wrapper every ``save_freq`` steps
+
+    :param save_freq: (int)
+    :param save_path: (str) Path to the folder where ``VecNormalize`` will be saved, as ``vecnormalize.pkl``
+    :param name_prefix: (str) Common prefix to the saved ``VecNormalize``, if None (default)
+        only one file will be kept.
+    """
+    def __init__(self, save_freq: int, save_path: str, name_prefix=None, verbose=0):
+        super(SaveVecNormalizeCallback, self).__init__(verbose)
+        self.save_freq = save_freq
+        self.save_path = save_path
+        self.name_prefix = name_prefix
+
+    def _init_callback(self) -> None:
+        # Create folder if needed
+        if self.save_path is not None:
+            os.makedirs(self.save_path, exist_ok=True)
+
+    def _on_step(self) -> bool:
+        if self.n_calls % self.save_freq == 0:
+            if self.name_prefix is not None:
+                path = os.path.join(self.save_path, f'{self.name_prefix}_{self.num_timesteps}_steps.pkl')
+            else:
+                path = os.path.join(self.save_path, 'vecnormalize.pkl')
+            if self.model.get_vec_normalize_env() is not None:
+                self.model.get_vec_normalize_env().save(path)
+                if self.verbose > 1:
+                    print(f"Saving VecNormalize to {path}")
+        return True
diff --git a/utils/hyperparams_opt.py b/utils/hyperparams_opt.py
index 8d528a7..f3443f0 100644
--- a/utils/hyperparams_opt.py
+++ b/utils/hyperparams_opt.py
@@ -11,6 +11,8 @@
 from stable_baselines.her import HERGoalEnvWrapper
 from stable_baselines.common.base_class import _UnvecWrapper

+from .callbacks import TrialEvalCallback
+

 def hyperparam_optimization(algo, model_fn, env_fn, n_trials=10, n_timesteps=5000, hyperparams=None,
                             n_jobs=1, sampler_method='random', pruner_method='halving',
@@ -34,6 +36,7 @@ def hyperparam_optimization(algo, model_fn, env_fn, n_trials=10, n_timesteps=500
     if hyperparams is None:
         hyperparams = {}

+    n_startup_trials = 10
     # test during 5 episodes
     n_test_episodes = 5
     # evaluate every 20th of the maximum budget per iteration
@@ -44,7 +47,7 @@
     if sampler_method == 'random':
         sampler = RandomSampler(seed=seed)
     elif sampler_method == 'tpe':
-        sampler = TPESampler(n_startup_trials=5, seed=seed)
+        sampler = TPESampler(n_startup_trials=n_startup_trials, seed=seed)
     elif sampler_method == 'skopt':
         # cf https://scikit-optimize.github.io/#skopt.Optimizer
         # GP: gaussian process
@@ -56,7 +59,7 @@
     if pruner_method == 'halving':
         pruner = SuccessiveHalvingPruner(min_resource=1, reduction_factor=4, min_early_stopping_rate=0)
     elif pruner_method == 'median':
-        pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=n_evaluations // 3)
+        pruner = MedianPruner(n_startup_trials=n_startup_trials, n_warmup_steps=n_evaluations // 3)
     elif pruner_method == 'none':
         # Do not prune
         pruner = MedianPruner(n_startup_trials=n_trials, n_warmup_steps=n_evaluations)
@@ -82,102 +85,42 @@ def objective(trial):
         trial.n_actions = env_fn(n_envs=1).action_space.shape[0]
         kwargs.update(algo_sampler(trial))

-        def callback(_locals, _globals):
-            """
-            Callback for monitoring learning progress.
-
-            :param _locals: (dict)
-            :param _globals: (dict)
-            :return: (bool) If False: stop training
-            """
-            self_ = _locals['self']
-            trial = self_.trial
-
-            # Initialize variables
-            if not hasattr(self_, 'is_pruned'):
-                self_.is_pruned = False
-                self_.last_mean_test_reward = -np.inf
-                self_.last_time_evaluated = 0
-                self_.eval_idx = 0
-
-            if (self_.num_timesteps - self_.last_time_evaluated) < evaluate_interval:
-                return True
-
-            self_.last_time_evaluated = self_.num_timesteps
-
-            # Evaluate the trained agent on the test env
-            rewards = []
-            n_episodes, reward_sum = 0, 0.0
-
-            # Sync the obs rms if using vecnormalize
-            # NOTE: this does not cover all the possible cases
-            if isinstance(self_.test_env, VecNormalize):
-                self_.test_env.obs_rms = deepcopy(self_.env.obs_rms)
-                # Do not normalize reward
-                self_.test_env.norm_reward = False
-
-            obs = self_.test_env.reset()
-            while n_episodes < n_test_episodes:
-                # Use default value for deterministic
-                action, _ = self_.predict(obs)
-                obs, reward, done, _ = self_.test_env.step(action)
-                reward_sum += reward
-
-                if done:
-                    rewards.append(reward_sum)
-                    reward_sum = 0.0
-                    n_episodes += 1
-                    obs = self_.test_env.reset()
-
-            mean_reward = np.mean(rewards)
-            self_.last_mean_test_reward = mean_reward
-            self_.eval_idx += 1
-
-            # report best or report current ?
-            # report num_timesteps or elasped time ?
-            trial.report(-1 * mean_reward, self_.eval_idx)
-            # Prune trial if need
-            if trial.should_prune(self_.eval_idx):
-                self_.is_pruned = True
-                return False
-
-            return True
-
         model = model_fn(**kwargs)
-        model.test_env = env_fn(n_envs=1)
-        model.trial = trial
+
+        eval_env = env_fn(n_envs=1, eval_env=True)
+        # Account for parallel envs
+        eval_freq_ = eval_freq
+        if isinstance(model.get_env(), VecEnv):
+            eval_freq_ = max(eval_freq // model.get_env().num_envs, 1)
+        # TODO: use non-deterministic eval for Atari?
+        eval_callback = TrialEvalCallback(eval_env, trial, n_eval_episodes=n_eval_episodes,
+                                          eval_freq=eval_freq_, deterministic=True)
+
         if algo == 'her':
-            model.model.trial = trial
             # Wrap the env if need to flatten the dict obs
-            if isinstance(model.test_env, VecEnv):
-                model.test_env = _UnvecWrapper(model.test_env)
-            model.model.test_env = HERGoalEnvWrapper(model.test_env)
+            if isinstance(eval_env, VecEnv):
+                eval_env = _UnvecWrapper(eval_env)
+            eval_env = HERGoalEnvWrapper(eval_env)

         try:
-            model.learn(n_timesteps, callback=callback)
+            model.learn(n_timesteps, callback=eval_callback)
             # Free memory
             model.env.close()
-            model.test_env.close()
+            eval_env.close()
         except AssertionError:
             # Sometimes, random hyperparams can generate NaN
             # Free memory
             model.env.close()
-            model.test_env.close()
-            raise
-        is_pruned = False
-        cost = np.inf
-        if hasattr(model, 'is_pruned'):
-            is_pruned = model.is_pruned
-            cost = -1 * model.last_mean_test_reward
-        del model.env, model.test_env
+            eval_env.close()
+            raise optuna.exceptions.TrialPruned()
+        is_pruned = eval_callback.is_pruned
+        cost = -1 * eval_callback.last_mean_reward
+
+        del model.env, eval_env
         del model

         if is_pruned:
-            try:
-                # Optuna >= 0.19.0
-                raise optuna.exceptions.TrialPruned()
-            except AttributeError:
-                raise optuna.structs.TrialPruned()
+            raise optuna.exceptions.TrialPruned()

         return cost
diff --git a/utils/import_envs.py b/utils/import_envs.py
new file mode 100644
index 0000000..83c123f
--- /dev/null
+++ b/utils/import_envs.py
@@ -0,0 +1,14 @@
+try:
+    import pybullet_envs  # pytype: disable=import-error
+except ImportError:
+    pybullet_envs = None
+
+try:
+    import highway_env  # pytype: disable=import-error
+except ImportError:
+    highway_env = None
+
+try:
+    import mocca_envs  # pytype: disable=import-error
+except ImportError:
+    mocca_envs = None
diff --git a/utils/utils.py b/utils/utils.py
index 470de4f..c012005 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -372,11 +372,12 @@ def get_saved_hyperparams(stats_path, norm_reward=False, test_mode=False):
     return hyperparams, stats_path


-def find_saved_model(algo, log_path, env_id):
+def find_saved_model(algo, log_path, env_id, load_best=False):
     """
     :param algo: (str)
     :param log_path: (str) Path to the directory with the saved model
     :param env_id: (str)
+    :param load_best: (bool)
     :return: (str) Path to the saved model
     """
     model_path, found = None, False
@@ -386,6 +387,10 @@
         if found:
             break

+    if load_best:
+        model_path = os.path.join(log_path, "best_model.zip")
+        found = os.path.isfile(model_path)
+
     if not found:
         raise ValueError("No model found for {} on {}, path: {}".format(algo, env_id, model_path))
     return model_path

From a108bfd2eb1a684a1a8e0e9a2d2783887dd714a2 Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN
Date: Tue, 17 Mar 2020 22:40:29 +0100
Subject: [PATCH 4/7] Fixes

---
 .gitignore               |  1 +
 Makefile                 |  2 +-
 train.py                 |  2 +-
 utils/hyperparams_opt.py |  4 ++--
 utils/utils.py           | 36 +++---------------------------------
 5 files changed, 8 insertions(+), 37 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4ae990e..b760f4c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ cluster_sbatch.sh
 cluster_sbatch_mpi.sh
 trained_agents/
 .git/
+.pytype/
diff --git a/Makefile b/Makefile
index df76e64..8e72fae 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ pytest:

 # Type check
 type:
-	pytype
+	pytype .
 docker: docker-cpu docker-gpu

diff --git a/train.py b/train.py
index 1fe6407..bc994b7 100644
--- a/train.py
+++ b/train.py
@@ -91,7 +91,7 @@
         raise ValueError('{} not found in gym registry, you maybe meant {}?'.format(env_id, closest_match))

     # Unique id to ensure there is no race condition for the folder creation
-    uuid_str = f'_{uuid.uuid4()}' if args.uuid else ''
+    uuid_str = '_{}'.format(uuid.uuid4()) if args.uuid else ''
     if args.seed < 0:
         # Seed but with a random one
         args.seed = np.random.randint(2**32 - 1)
diff --git a/utils/hyperparams_opt.py b/utils/hyperparams_opt.py
index f3443f0..c181811 100644
--- a/utils/hyperparams_opt.py
+++ b/utils/hyperparams_opt.py
@@ -38,10 +38,10 @@ def hyperparam_optimization(algo, model_fn, env_fn, n_trials=10, n_timesteps=500
     n_startup_trials = 10
     # test during 5 episodes
-    n_test_episodes = 5
+    n_eval_episodes = 5
     # evaluate every 20th of the maximum budget per iteration
     n_evaluations = 20
-    evaluate_interval = int(n_timesteps / n_evaluations)
+    eval_freq = int(n_timesteps / n_evaluations)
     # n_warmup_steps: Disable pruner until the trial reaches the given number of step.
     if sampler_method == 'random':
diff --git a/utils/utils.py b/utils/utils.py
index c012005..9981218 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -1,13 +1,11 @@
 import time
 import os
 import argparse
-import inspect
 import glob

 import yaml
 import importlib
 import gym
-from gym.envs.registration import load
 try:
     import pybullet_envs
 except ImportError:
@@ -227,36 +225,8 @@ def create_test_env(env_id, n_envs=1, is_atari=False,
         env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper) for i in range(n_envs)])
     # Pybullet envs does not follow gym.render() interface
     elif "Bullet" in env_id:
-        spec = gym.envs.registry.env_specs[env_id]
-        try:
-            class_ = load(spec.entry_point)
-        except AttributeError:
-            # Backward compatibility with gym
-            class_ = load(spec._entry_point)
-        # HACK: force SubprocVecEnv for Bullet env that does not
-        # have a render argument
-        render_name = None
-        use_subproc = 'renders' not in inspect.getfullargspec(class_.__init__).args
-        if not use_subproc:
-            render_name = 'renders'
-        # Dev branch of pybullet
-        # use_subproc = use_subproc and 'render' not in inspect.getfullargspec(class_.__init__).args
-        # if not use_subproc and render_name is None:
-        #     render_name = 'render'
-
-        # Create the env, with the original kwargs, and the new ones overriding them if needed
-        def _init():
-            # TODO: fix for pybullet locomotion envs
-            env = class_(**{**spec._kwargs}, **{render_name: should_render})
-            env.seed(0)
-            if log_dir is not None:
-                env = Monitor(env, os.path.join(log_dir, "0"), allow_early_resets=True)
-            return env
-
-        if use_subproc:
-            env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])
-        else:
-            env = DummyVecEnv([_init])
+        # HACK: force SubprocVecEnv for Bullet env
+        env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])
     else:
         env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

@@ -354,7 +324,7 @@ def get_saved_hyperparams(stats_path, norm_reward=False, test_mode=False):
     if os.path.isfile(config_file):
         # Load saved hyperparameters
         with open(os.path.join(stats_path, 'config.yml'), 'r') as f:
-            hyperparams = yaml.load(f, Loader=yaml.UnsafeLoader)
+            hyperparams = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
         hyperparams['normalize'] = hyperparams.get('normalize', False)
     else:
         obs_rms_path = os.path.join(stats_path, 'obs_rms.pkl')

From 6a90a77676ea2e1317f1debf4161b9fcf8bd8c5d Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN
Date: Wed, 18 Mar 2020 09:05:31 +0100
Subject: [PATCH 5/7] Remove f-string

---
 utils/callbacks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/callbacks.py b/utils/callbacks.py
index c31f6dc..422c7b5 100644
--- a/utils/callbacks.py
+++ b/utils/callbacks.py
@@ -57,7 +57,7 @@ def _init_callback(self) -> None:
     def _on_step(self) -> bool:
         if self.n_calls % self.save_freq == 0:
             if self.name_prefix is not None:
-                path = os.path.join(self.save_path, f'{self.name_prefix}_{self.num_timesteps}_steps.pkl')
+                path = os.path.join(self.save_path, '{}_{}_steps.pkl'.format(self.name_prefix, self.num_timesteps))
             else:
                 path = os.path.join(self.save_path, 'vecnormalize.pkl')
             if self.model.get_vec_normalize_env() is not None:

From 679cf7dcf712b55b15dc34fac0bbbeae4da4b9cc Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN
Date: Wed, 18 Mar 2020 10:39:50 +0100
Subject: [PATCH 6/7] Fix print

---
 utils/callbacks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/callbacks.py b/utils/callbacks.py
index 422c7b5..03b5a98 100644
--- a/utils/callbacks.py
+++ b/utils/callbacks.py
@@ -63,5 +63,5 @@ def _on_step(self) -> bool:
             if self.model.get_vec_normalize_env() is not None:
                 self.model.get_vec_normalize_env().save(path)
                 if self.verbose > 1:
-                    print(f"Saving VecNormalize to {path}")
+                    print("Saving VecNormalize to {}".format(path))
         return True

From 89c54a2f8df7eb387fabb375a33c2e34d5146ce6 Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN
Date: Sun, 22 Mar 2020 13:30:50 +0100
Subject: [PATCH 7/7] [ci skip] Update README

---
 README.md | 72 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 44 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index f76d97f..a0fe461 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,11 @@ If you have trained an agent yourself, you need to do:
 ```
 python enjoy.py --algo algo_name --env env_id -f logs/ --exp-id 0
 ```

+To load the best model (when using evaluation environment):
+```
+python enjoy.py --algo algo_name --env env_id -f logs/ --exp-id 1 --load-best
+```
+
 ## Train an Agent

 The hyperparameters for each environment are defined in `hyperparams/algo_name.yml`.
@@ -48,6 +53,17 @@ For example (with tensorboard support):
 ```
 python train.py --algo ppo2 --env CartPole-v1 --tensorboard-log /tmp/stable-baselines/
 ```

+Evaluate the agent every 10000 steps using 10 episodes for evaluation:
+```
+python train.py --algo sac --env HalfCheetahBulletEnv-v0 --eval-freq 10000 --eval-episodes 10
+```
+
+Save a checkpoint of the agent every 100000 steps:
+```
+python train.py --algo td3 --env HalfCheetahBulletEnv-v0 --save-freq 100000
+```
+
+
 Continue training (here, load pretrained agent for Breakout and continue training for 5000 steps):
 ```
 python train.py --algo a2c --env BreakoutNoFrameskip-v4 -i trained_agents/a2c/BreakoutNoFrameskip-v4.pkl -n 5000
@@ -74,6 +90,34 @@
 python train.py --algo ppo2 --env MountainCar-v0 -n 50000 -optimize --n-trials 1000
 ```

+## Env Wrappers
+
+You can specify in the hyperparameter config one or more wrappers to use around the environment:
+
+for one wrapper:
+```
+env_wrapper: gym_minigrid.wrappers.FlatObsWrapper
+```
+
+for multiple, specify a list:
+
+```
+env_wrapper:
+    - utils.wrappers.DoneOnSuccessWrapper:
+        reward_offset: 1.0
+    - utils.wrappers.TimeFeatureWrapper
+```
+
+Note that you can easily specify parameters too.
+
+## Overwrite hyperparameters
+
+You can easily overwrite hyperparameters in the command line, using ``--hyperparams``:
+
+```
+python train.py --algo a2c --env MountainCarContinuous-v0 --hyperparams learning_rate:0.001 policy_kwargs:"dict(net_arch=[64, 64])"
+```
+
 ## Record a Video of a Trained Agent

 Record 1000 steps:
@@ -204,34 +248,6 @@ MiniGrid-DoorKey-5x5-v0:
   env_wrapper: gym_minigrid.wrappers.FlatObsWrapper
 ```

-## Env Wrappers
-
-You can specify in the hyperparameter config one or more wrapper to use around the environment:
-
-for one wrapper:
-```
-env_wrapper: gym_minigrid.wrappers.FlatObsWrapper
-```
-
-for multiple, specify a list:
-
-```
-env_wrapper:
-    - utils.wrappers.DoneOnSuccessWrapper:
-        reward_offset: 1.0
-    - utils.wrappers.TimeFeatureWrapper
-```
-
-Note that you can easily specify parameters too.
-
-## Overwrite hyperparameters
-
-You can easily overwrite hyperparameters in the command line, using ``--hyperparams``:
-
-```
-python train.py --algo a2c --env MountainCarContinuous-v0 --hyperparams learning_rate:0.001 policy_kwargs:"dict(net_arch=[64, 64])"
-```
-
 ## Colab Notebook: Try it Online!
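Editor's note (not part of the patch series): the sketch below illustrates how the pieces introduced in PATCH 3 -- `CheckpointCallback`, `EvalCallback` with `callback_on_new_best`, and the new `SaveVecNormalizeCallback` from `utils/callbacks.py` -- fit together outside of `train.py`. It is a minimal sketch, assuming stable-baselines >= 2.10 (which provides `stable_baselines.common.callbacks`) and that `utils/callbacks.py` from this series is importable from the repo root; the environment id, paths, and frequencies are placeholders only.

```python
import gym

from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines.common.callbacks import CheckpointCallback, EvalCallback

from utils.callbacks import SaveVecNormalizeCallback

log_dir = 'logs/ppo2/BipedalWalker-v3_1/'  # placeholder save location

# Training env with normalized observations and rewards
env = VecNormalize(DummyVecEnv([lambda: gym.make('BipedalWalker-v3')]))
# Separate evaluation env: keep the reward unnormalized, as train.py does
eval_env = VecNormalize(DummyVecEnv([lambda: gym.make('BipedalWalker-v3')]),
                        training=False, norm_reward=False)

# Save vecnormalize.pkl whenever a new best model is found, so the saved
# statistics always match best_model.zip (the same pattern train.py uses)
save_vec_normalize = SaveVecNormalizeCallback(save_freq=1, save_path=log_dir)
eval_callback = EvalCallback(eval_env, callback_on_new_best=save_vec_normalize,
                             best_model_save_path=log_dir, log_path=log_dir,
                             n_eval_episodes=5, eval_freq=10000)
# Periodic checkpoints, the standalone equivalent of train.py --save-freq
checkpoint_callback = CheckpointCallback(save_freq=100000, save_path=log_dir,
                                         name_prefix='rl_model')

model = PPO2('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=int(1e6), callback=[eval_callback, checkpoint_callback])
```

Unlike `train.py`, this sketch does not share the running `VecNormalize` statistics between the training and evaluation environments, so the evaluation scores are only indicative; `enjoy.py --load-best` will still pick up the `best_model.zip` written by `EvalCallback`.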