openai · lebrice · Nov 20, 2020 · Nov 20, 2020 · Aug 3, 2021 · Sep 24, 2021
diff --git a/gym/vector/async_vector_env.py b/gym/vector/async_vector_env.py
@@ -635,6 +635,16 @@ def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
     assert shared_memory is None
     env = env_fn()
     parent_pipe.close()
+
+    def step_fn(actions):
+        """Step `env`; a finished non-vector env is reset after its step."""
+        observation, reward, done, info = env.step(actions)
+        # A VectorEnv resets its own finished sub-envs inside `step`.
+        if not isinstance(env.unwrapped, VectorEnv) and done:
+            info["terminal_observation"] = observation
+            observation = env.reset()
+        return observation, reward, done, info
+
     try:
         while True:
             command, data = pipe.recv()
@@ -699,6 +709,16 @@ def _worker_shared_memory(index, env_fn, pipe, parent_pipe, shared_memory, error
     env = env_fn()
     observation_space = env.observation_space
     parent_pipe.close()
+
+    def step_fn(actions):
+        """Step `env`; a finished non-vector env is reset after its step."""
+        observation, reward, done, info = env.step(actions)
+        # A VectorEnv resets its own finished sub-envs inside `step`.
+        if not isinstance(env.unwrapped, VectorEnv) and done:
+            info["terminal_observation"] = observation
+            observation = env.reset()
+        return observation, reward, done, info
+
     try:
         while True:
             command, data = pipe.recv()
@@ -717,7 +737,10 @@ def _worker_shared_memory(index, env_fn, pipe, parent_pipe, shared_memory, error
                     pipe.send((None, True))
             elif command == "step":
                 observation, reward, done, info = env.step(data)
-                if done:
+                if isinstance(env, VectorEnv):
+                    # VectorEnvs take care of resetting the envs that are done.
+                    pass
+                elif done:
                     info["terminal_observation"] = observation
                     observation = env.reset()
                 write_to_shared_memory(

diff --git a/gym/vector/sync_vector_env.py b/gym/vector/sync_vector_env.py
@@ -71,8 +71,11 @@ def __init__(self, env_fns, observation_space=None, action_space=None, copy=True
         self.observations = create_empty_array(
             self.single_observation_space, n=self.num_envs, fn=np.zeros
         )
-        self._rewards = np.zeros((self.num_envs,), dtype=np.float64)
-        self._dones = np.zeros((self.num_envs,), dtype=np.bool_)
+        shape = (self.num_envs,)
+        if isinstance(self.envs[0].unwrapped, VectorEnv):
+            shape += (self.envs[0].num_envs,)
+        self._rewards = np.zeros(shape, dtype=np.float64)
+        self._dones = np.zeros(shape, dtype=np.bool_)
         self._actions = None
 
     def seed(self, seed=None):
@@ -133,10 +136,15 @@ def step_async(self, actions):
         self._actions = iterate(self.action_space, actions)
 
     def step_wait(self):
-        observations, infos = [], []
+        observations, rewards, dones, infos = [], [], [], []
         for i, (env, action) in enumerate(zip(self.envs, self._actions)):
             observation, self._rewards[i], self._dones[i], info = env.step(action)
-            if self._dones[i]:
+            if isinstance(env, VectorEnv):
+                # Do nothing if the env is a VectorEnv, since it will automatically
+                # reset the envs that are done if needed in the 'step' method and
+                # return the initial observation instead of the final observation.
+                pass
+            elif self._dones[i]:
                 info["terminal_observation"] = observation
                 observation = env.reset()
             observations.append(observation)

diff --git a/tests/vector/test_vector_env.py b/tests/vector/test_vector_env.py
@@ -1,7 +1,11 @@
+from functools import partial
+from typing import Callable, Type
+
 import numpy as np
 import pytest
 
-from gym.spaces import Tuple
+from gym import Space, spaces
+from gym.spaces import Box, Tuple
 from gym.vector.async_vector_env import AsyncVectorEnv
 from gym.vector.sync_vector_env import SyncVectorEnv
 from gym.vector.vector_env import VectorEnv
@@ -58,3 +62,114 @@ def test_custom_space_vector_env():
 
     assert isinstance(env.single_action_space, CustomSpace)
     assert isinstance(env.action_space, Tuple)
+
+
+@pytest.mark.parametrize("base_env", ["Pendulum-v1", "CartPole-v1"])
+@pytest.mark.parametrize("async_inner", [False, True])
+@pytest.mark.parametrize("async_outer", [False, True])
+@pytest.mark.parametrize("n_inner_envs", [1, 4, 7])
+@pytest.mark.parametrize("n_outer_envs", [1, 4, 7])
+def test_nesting_vector_envs(
+    base_env: str,
+    async_inner: bool,
+    async_outer: bool,
+    n_inner_envs: int,
+    n_outer_envs: int,
+):
+    """Tests nesting of vector envs: Using a VectorEnv of VectorEnvs.
+
+    This can be useful, for example, when running a large number of environments
+    on a machine with few cores, as each worker process of an AsyncVectorEnv can
+    itself run multiple environments sequentially using a SyncVectorEnv (a.k.a. chunking).
+
+    This test creates `n_outer_envs` vectorized environments, each of which has
+    `n_inner_envs` inner environments. If `async_outer` is True, the outermost
+    wrapper is an `AsyncVectorEnv`; when it is False, it is a `SyncVectorEnv`.
+    The same goes for the "inner" environments and `async_inner`.
+
+    Parameters
+    ----------
+    - base_env : str
+        The base environment id.
+    - async_inner : bool
+        Whether the inner VectorEnv will be async or not.
+    - async_outer : bool
+        Whether the outer VectorEnv will be async or not.
+    - n_inner_envs : int
+        Number of inner environments.
+    - n_outer_envs : int
+        Number of outer environments.
+    """
+
+    inner_vectorenv_type: Type[VectorEnv] = (
+        AsyncVectorEnv if async_inner else SyncVectorEnv
+    )
+    outer_vectorenv_type: Type[VectorEnv] = (
+        partial(AsyncVectorEnv, daemon=False) if async_outer else SyncVectorEnv
+    )
+    # NOTE: When nesting AsyncVectorEnvs, only the "innermost" envs can have
+    # `daemon=True`, otherwise the "daemonic processes are not allowed to have
+    # children" AssertionError is raised in `multiprocessing.process`.
+
+    # Create the VectorEnv of VectorEnvs
+    env = outer_vectorenv_type(
+        [
+            partial(
+                inner_vectorenv_type,
+                env_fns=[
+                    make_env(base_env, seed=n_inner_envs * i + j)
+                    for j in range(n_inner_envs)
+                ],
+            )
+            for i in range(n_outer_envs)
+        ]
+    )
+
+    # Create a single test environment.
+    with make_env(base_env, 0)() as temp_single_env:
+        single_observation_space = temp_single_env.observation_space
+        single_action_space = temp_single_env.action_space
+
+    assert isinstance(single_observation_space, Box)
+    assert isinstance(env.observation_space, Box)
+    assert env.observation_space.shape == (
+        n_outer_envs,
+        n_inner_envs,
+        *single_observation_space.shape,
+    )
+    assert env.observation_space.dtype == single_observation_space.dtype
+
+    from gym.vector.utils.spaces import iterate
+
+    def batch_size(space: Space) -> int:
+        return len(list(iterate(space, space.sample())))
+
+    assert batch_size(env.action_space) == n_outer_envs
+
+    with env:
+        observations = env.reset()
+        assert observations in env.observation_space
+
+        actions = env.action_space.sample()
+        assert actions in env.action_space
+
+        observations, rewards, dones, _ = env.step(actions)
+        assert observations in env.observation_space
+
+    assert isinstance(env.observation_space, Box)
+    assert isinstance(observations, np.ndarray)
+    assert observations.dtype == env.observation_space.dtype
+    assert (
+        observations.shape
+        == (n_outer_envs, n_inner_envs) + single_observation_space.shape
+    )
+
+    assert isinstance(rewards, np.ndarray)
+    assert isinstance(rewards[0], np.ndarray)
+    assert rewards.ndim == 2
+    assert rewards.shape == (n_outer_envs, n_inner_envs)
+
+    assert isinstance(dones, np.ndarray)
+    assert dones.dtype == np.bool_
+    assert dones.ndim == 2
+    assert dones.shape == (n_outer_envs, n_inner_envs)