From 90806971f2fc1c8b5442cbb76b3498d812d85bc0 Mon Sep 17 00:00:00 2001 From: Gary Lvov Date: Thu, 23 Jan 2025 18:16:18 -0500 Subject: [PATCH 01/19] clean up and bug fixes --- docs/source/features/ray.rst | 206 +++++++++++------- .../ray/grok_cluster_with_kubectl.py | 2 +- source/standalone/workflows/ray/launch.py | 4 +- source/standalone/workflows/ray/tuner.py | 21 +- .../workflows/ray/wrap_resources.py | 4 +- 5 files changed, 151 insertions(+), 86 deletions(-) diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst index 86fdf48e5d..789b562354 100644 --- a/docs/source/features/ray.rst +++ b/docs/source/features/ray.rst @@ -34,19 +34,20 @@ The core functionality of the Ray workflow consists of two main scripts that ena of resource-wrapped and tuning aggregate jobs. These scripts facilitate the decomposition of aggregate jobs (overarching experiments) into individual jobs, which are discrete commands executed on the cluster. An aggregate job can include multiple individual jobs. -For clarity, this guide refers to the jobs one layer below the topmost aggregate level as sub-jobs. - Both resource-wrapped and tuning aggregate jobs dispatch individual jobs to a designated Ray cluster, which leverages the cluster's resources (e.g., a single workstation node or multiple nodes) -to execute these jobs with workers in parallel and/or sequentially. By default, aggregate jobs use all \ +to execute these jobs with workers in parallel and/or sequentially. + +By default, jobs use all \ available resources on each available GPU-enabled node for each sub-job worker. This can be changed through -specifying the ``--num_workers`` argument, especially critical for parallel aggregate -job processing on local or virtual multi-GPU machines +specifying the ``--num_workers`` argument for resource-wrapped jobs, or ``--num_workers_per_node`` +for tuning jobs, which is especially critical for parallel aggregate +job processing on local/virtual multi-GPU machines. In resource-wrapped aggregate jobs, each sub-job and its resource requirements are defined manually, enabling resource isolation. For tuning aggregate jobs, individual jobs are generated automatically based on a hyperparameter -sweep configuration. This assumes homogeneous node resource composition for nodes with GPUs. +sweep configuration. Tuning jobs assume homogeneous node resource composition for nodes with GPUs. .. dropdown:: source/standalone/workflows/ray/wrap_resources.py :icon: code @@ -66,7 +67,7 @@ sweep configuration. This assumes homogeneous node resource composition for node The following script can be used to submit aggregate jobs to one or more Ray cluster(s), which can be used for running jobs on a remote cluster or simultaneous jobs with heterogeneous -resource requirements: +resource requirements. .. dropdown:: source/standalone/workflows/ray/submit_job.py :icon: code @@ -75,7 +76,7 @@ resource requirements: :language: python :emphasize-lines: 12-53 -The following script can be used to extract KubeRay Cluster information for aggregate job submission. +The following script can be used to extract KubeRay cluster information for aggregate job submission. .. dropdown:: source/standalone/workflows/ray/grok_cluster_with_kubectl.py :icon: code @@ -93,6 +94,54 @@ The following script can be used to easily create clusters on Google GKE. 
:language: python :emphasize-lines: 16-37 +** Docker-based Local Quickstart ** +----------------------------------- + +First, follow the `Docker Guide ` +to set up the NVIDIA Container Toolkit and Docker Compose. + +Then, try the following steps to start your first tuning run. + +.. code-block:: bash + + # Build the base image + ./isaaclab.sh docker/container.py start + # Build the tuning image with extra deps + docker build -t isaacray -f source/standalone/workflows/ray/cluster_configs/Dockerfile . + # Start the tuning image + docker run -it --gpus all --net=host --entrypoint /bin/bash isaacray + # Start the Ray server within the tuning image + echo "import ray; ray.init(); import time; [time.sleep(10) for _ in iter(int, 1)]" | ./isaaclab.sh -p + + # In a new terminal (don't close the above) , enter the image with a new shell. + docker container ps + docker exec -it /bin/bash + # Start a tuning run, with one parallel worker per GPU + /isaaclab.sh -p source/standalone/workflows/ray/tuner.py \ + --cfg_file source/standalone/workflows/ray/hyperparameter_tuning/vision_cartpole_cfg.py \ + --cfg_class CartpoleTheiaJobCfg \ + --num_workers_per_node + + +For tuning jobs, specify the tuning job / hyperparameter sweep as child class of + :class:`JobCfg` . + +.. dropdown:: source/standalone/workflows/ray/tuner.py JobCfg definition + :icon: code + + .. literalinclude:: ../../../source/standalone/workflows/ray/tuner.py + :language: python + :start-at: class JobCfg + :end-at: self.cfg = cfg + +For example, see the following Cartpole Example configurations. + +.. dropdown:: source/standalone/workflows/ray/hyperparameter_tuning/vision_cartpole_cfg.py + :icon: code + + .. literalinclude:: ../../../source/standalone/workflows/ray/hyperparameter_tuning/vision_cartpole_cfg.py + :language: python + **Installation** ---------------- @@ -103,7 +152,7 @@ To use Ray without Kubernetes, like on a local computer or VM, such as Google Kubernetes Engine or Amazon Elastic Kubernetes Service, ``kubectl`` is required, and can be installed via the `Kubernetes website `_ -The pythonic dependencies can be installed with: +The pythonic dependencies can be installed with the following. .. code-block:: bash @@ -122,77 +171,28 @@ the following dependencies are also needed. ./isaaclab.sh -p -m pip install kubernetes Jinja2 -**Setup Overview: Cluster Configuration** ------------------------------------------ - -Select one of the following methods to create a Ray Cluster to accept and execute dispatched jobs. - -Single-Node Ray Cluster (Recommended for Beginners) -''''''''''''''''''''''''''''''''''''''''''''''''''' -For use on a single machine (node) such as a local computer or VM, the -following command can be used start a ray server. This is compatible with -multiple-GPU machines. This Ray server will run indefinitely until it is stopped with ``CTRL + C`` - -.. code-block:: bash - - echo "import ray; ray.init(); import time; [time.sleep(10) for _ in iter(int, 1)]" | ./isaaclab.sh -p - -KubeRay Clusters -'''''''''''''''' -.. attention:: - The ``ray`` command should be modified to use Isaac python, which could be achieved in a fashion similar to - ``sed -i "1i $(echo "#!/workspace/isaaclab/_isaac_sim/python.sh")" \ - /isaac-sim/kit/python/bin/ray && ln -s /isaac-sim/kit/python/bin/ray /usr/local/bin/ray``. 
- -Google Cloud is currently the only platform tested, although -any cloud provider should work if one configures the following: - -- An container registry (NGC, GCS artifact registry, AWS ECR, etc) with - an Isaac Lab image configured to support Ray. See ``cluster_configs/Dockerfile`` to see how to modify the ``isaac-lab-base`` - container for Ray compatibility. Ray should use the isaac sim python shebang, and ``nvidia-smi`` - should work within the container. Be careful with the setup here as - paths need to be configured correctly for everything to work. It's likely that - the example dockerfile will work out of the box and can be pushed to the registry, as - long as the base image has already been built as in the container guide -- A Kubernetes setup with available NVIDIA RTX (likely ``l4`` or ``l40`` or ``tesla-t4`` or ``a10``) GPU-passthrough node-pool resources, - that has access to your container registry/storage bucket and has the Ray operator enabled with correct IAM - permissions. This can be easily achieved with services such as Google GKE or AWS EKS, - provided that your account or organization has been granted a GPU-budget. It is recommended - to use manual kubernetes services as opposed to "autopilot" services for cost-effective - experimentation as this way clusters can be completely shut down when not in use, although - this may require installing the `Nvidia GPU Operator `_ -- An MLFlow server that your cluster has access to. -- A ``kuberay.yaml.ninja`` file that describes how to allocate resources (already included for - Google Cloud, which can be referenced for the format and MLFlow integration) - -Ray Clusters (Without Kubernetes) -''''''''''''''''''''''''''''''''' -.. attention:: - Modify the Ray command to use Isaac Python like in KubeRay Clusters, and follow the same - steps for creating an image/cluster permissions/bucket access. - -See the `Ray Clusters Overview `_ or -`Anyscale `_ for more information - **Dispatching Jobs and Tuning** ------------------------------- -Select one of the following guides that matches your desired Cluster configuration. +Select one of the following guides that matches your desired cluster configuration. Simple Ray Cluster (Local/VM) ''''''''''''''''''''''''''''' -This guide assumes that there is a Ray cluster already running, and that this script is run locally on the cluster, or -that the cluster job submission address is known. +1.) Start a Ray cluster. -1.) Testing that the cluster works can be done as follows. +.. code-block:: bash + + echo "import ray; ray.init(); import time; [time.sleep(10) for _ in iter(int, 1)]" | ./isaaclab.sh -p + +2.) Testing that the cluster works can be done as follows. .. code-block:: bash ./isaaclab.sh -p source/standalone/workflows/ray/wrap_resources.py --test -2.) Submitting resource-wrapped sub-jobs can be done as described in the following file: +3.) Submitting resource-wrapped individual can be done as described in the following file. .. dropdown:: source/standalone/workflows/ray/wrap_resources.py :icon: code @@ -201,13 +201,18 @@ that the cluster job submission address is known. :language: python :emphasize-lines: 14-66 -3.) For tuning jobs, specify the hyperparameter sweep similar to the following two files. +4.) For tuning jobs, specify the tuning job / hyperparameter sweep as child class of + :class:`JobCfg` . -.. dropdown:: source/standalone/workflows/ray/hyperparameter_tuning/vision_cfg.py +.. dropdown:: source/standalone/workflows/ray/tuner.py :icon: code - .. 
literalinclude:: ../../../source/standalone/workflows/ray/hyperparameter_tuning/vision_cfg.py + .. literalinclude:: ../../../source/standalone/workflows/ray/tuner.py :language: python + :start-at: class JobCfg + :end-at: self.cfg = cfg + +For example, see the following Cartpole Example configurations. .. dropdown:: source/standalone/workflows/ray/hyperparameter_tuning/vision_cartpole_cfg.py :icon: code @@ -215,7 +220,8 @@ that the cluster job submission address is known. .. literalinclude:: ../../../source/standalone/workflows/ray/hyperparameter_tuning/vision_cartpole_cfg.py :language: python -Then, see the local examples in the following file to see how to start a tuning run. + +5.) Then, see the local examples in the following file to see how to start a tuning run. .. dropdown:: source/standalone/workflows/ray/tuner.py :icon: code @@ -224,12 +230,54 @@ Then, see the local examples in the following file to see how to start a tuning :language: python :emphasize-lines: 18-53 +To view the logs, simply run ``tensorboard --logdir=`` . + +Remote Ray Cluster +'''''''''''''''''' + +**Setup Overview: Cluster Configuration** +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Select one of the following methods to create a Ray cluster to accept and execute dispatched jobs. + +KubeRay Clusters +~~~~~~~~~~~~~~~~ +.. attention:: + The ``ray`` command should be modified to use Isaac python, which could be achieved in a fashion similar to + ``sed -i "1i $(echo "#!/workspace/isaaclab/_isaac_sim/python.sh")" \ + /isaac-sim/kit/python/bin/ray && ln -s /isaac-sim/kit/python/bin/ray /usr/local/bin/ray``. + +Google Cloud is currently the only platform tested, although +any cloud provider should work if one configures the following. +- An container registry (NGC, GCS artifact registry, AWS ECR, etc) with + an Isaac Lab image configured to support Ray. See ``cluster_configs/Dockerfile`` to see how to modify the ``isaac-lab-base`` + container for Ray compatibility. Ray should use the isaac sim python shebang, and ``nvidia-smi`` + should work within the container. Be careful with the setup here as + paths need to be configured correctly for everything to work. It's likely that + the example dockerfile will work out of the box and can be pushed to the registry, as + long as the base image has already been built as in the container guide. +- A Kubernetes setup with available NVIDIA RTX (likely ``l4`` or ``l40`` or ``tesla-t4`` or ``a10``) GPU-passthrough node-pool resources, + that has access to your container registry/storage bucket and has the Ray operator enabled with correct IAM + permissions. This can be easily achieved with services such as Google GKE or AWS EKS, + provided that your account or organization has been granted a GPU-budget. It is recommended + to use manual kubernetes services as opposed to "autopilot" services for cost-effective + experimentation as this way clusters can be completely shut down when not in use, although + this may require installing the `Nvidia GPU Operator `_ +- An MLFlow server that your cluster has access to. +- A ``kuberay.yaml.ninja`` file that describes how to allocate resources (already included for + Google Cloud, which can be referenced for the format and MLFlow integration). + +Ray Clusters (Without Kubernetes) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. attention:: + Modify the Ray command to use Isaac Python like in KubeRay clusters, and follow the same + steps for creating an image/cluster permissions/bucket access. 
+ +See the `Ray Clusters Overview `_ or +`Anyscale `_ for more information -To view the logs, simply run ``tensorboard --logdir=`` -Remote Ray Cluster Setup and Use -''''''''''''''''''''''''''''''''' This guide assumes that one desires to create a cluster on a remote host or server. This guide includes shared steps, and KubeRay or Ray specific steps. Follow all shared steps (part I and II), and then only the KubeRay or Ray steps depending on your desired configuration, in order of shared steps part I, then @@ -313,6 +361,8 @@ and determine the server URI. Shared Steps Between KubeRay and Pure Ray Part II ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + 1.) Test that your cluster is operational with the following. .. code-block:: bash @@ -330,16 +380,17 @@ Shared Steps Between KubeRay and Pure Ray Part II :language: python :emphasize-lines: 12-53 -3.) For tuning jobs, specify the hyperparameter sweep similar to :class:`RLGamesCameraJobCfg` in the following file: +3.) For tuning jobs, specify the tuning job / hyperparameter sweep as a :class:`JobCfg` . .. dropdown:: source/standalone/workflows/ray/tuner.py :icon: code .. literalinclude:: ../../../source/standalone/workflows/ray/tuner.py :language: python - :emphasize-lines: 18-53 + :start-at: class JobCfg + :end-at: self.cfg = cfg -For example, see the Cartpole Example configurations. +For example, see the following Cartpole Example configurations. .. dropdown:: source/standalone/workflows/ray/hyperparameter_tuning/vision_cartpole_cfg.py :icon: code @@ -347,8 +398,7 @@ For example, see the Cartpole Example configurations. .. literalinclude:: ../../../source/standalone/workflows/ray/hyperparameter_tuning/vision_cartpole_cfg.py :language: python - -Tuning jobs can also be submitted via ``submit_job.py`` +Tuning jobs can also be submitted via ``submit_job.py`` . To view the tuning results, view the MLFlow dashboard of the server that you created. For KubeRay, this can be done through port forwarding the MLFlow dashboard, with diff --git a/source/standalone/workflows/ray/grok_cluster_with_kubectl.py b/source/standalone/workflows/ray/grok_cluster_with_kubectl.py index bb83a211af..c00a28d066 100644 --- a/source/standalone/workflows/ray/grok_cluster_with_kubectl.py +++ b/source/standalone/workflows/ray/grok_cluster_with_kubectl.py @@ -103,7 +103,7 @@ def get_mlflow_info(namespace: str = None, cluster_prefix: str = "isaacray") -> cluster_ip = fields[2] port = "5000" # Default MLflow port - return f"http://{cluster_ip}:{port}" + return f"https://{cluster_ip}:{port}" except subprocess.CalledProcessError as e: raise ValueError(f"Could not grok MLflow: {e}") # Fixed f-string diff --git a/source/standalone/workflows/ray/launch.py b/source/standalone/workflows/ray/launch.py index 0a6ff89f6e..6e31c21ba9 100644 --- a/source/standalone/workflows/ray/launch.py +++ b/source/standalone/workflows/ray/launch.py @@ -11,7 +11,7 @@ from jinja2 import Environment, FileSystemLoader from kubernetes import config -import source.standalone.workflows.ray.util as util +import util """This script helps create one or more KubeRay clusters. 
@@ -53,7 +53,7 @@ def apply_manifest(args: argparse.Namespace) -> None: # Set up Jinja2 environment for loading templates templates_dir = RAY_DIR / "cluster_configs" / args.cluster_host file_loader = FileSystemLoader(str(templates_dir)) - jinja_env = Environment(loader=file_loader, keep_trailing_newline=True) + jinja_env = Environment(loader=file_loader, keep_trailing_newline=True, autoescape=True) # Define template filename template_file = "kuberay.yaml.jinja" diff --git a/source/standalone/workflows/ray/tuner.py b/source/standalone/workflows/ray/tuner.py index e7b41dd22b..ddc9a97e5a 100644 --- a/source/standalone/workflows/ray/tuner.py +++ b/source/standalone/workflows/ray/tuner.py @@ -17,8 +17,9 @@ """ This script breaks down an aggregate tuning job, as defined by a hyperparameter sweep configuration, into individual jobs (shell commands) to run on the GPU-enabled nodes of the cluster. -By default, (unless combined as a sub-job in a resource-wrapped aggregate job), one worker is created -for each GPU-enabled node in the cluster for each individual job. +By default, one worker is created for each GPU-enabled node in the cluster for each individual job. +To use more than one worker per node (likely the case for multi-GPU machines), supply the +num_workers_per_node argument. Each hyperparameter sweep configuration should include the workflow, runner arguments, and hydra arguments to vary. @@ -221,9 +222,23 @@ class JobCfg: at a minimum, the tune job should inherit from this class.""" def __init__(self, cfg): + ''' + Runner args include command line arguments passed to the task. + For example: + cfg["runner_args"]["headless_singleton"] = "--headless" + cfg["runner_args"]["enable_cameras_singleton"] = "--enable_cameras" + ''' assert "runner_args" in cfg, "No runner arguments specified." + ''' + Task is the desired task to train on. For example: + cfg["runner_args"]["--task"] = tune.choice(["Isaac-Cartpole-RGB-TheiaTiny-v0"]) + ''' assert "--task" in cfg["runner_args"], "No task specified." - assert "hydra_args" in cfg, "No hypeparameters specified." + ''' + Hydra args define the hyperparameters varied within the sweep. For example: + cfg["hydra_args"]["agent.params.network.cnn.activation"] = tune.choice(["relu", "elu"]) + ''' + assert "hydra_args" in cfg, "No hyperparameters specified." self.cfg = cfg diff --git a/source/standalone/workflows/ray/wrap_resources.py b/source/standalone/workflows/ray/wrap_resources.py index 2fcd4cafa1..0cd18762b0 100644 --- a/source/standalone/workflows/ray/wrap_resources.py +++ b/source/standalone/workflows/ray/wrap_resources.py @@ -8,10 +8,10 @@ import ray from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy -import source.standalone.workflows.ray.util as util +import util """ -This script dispatches sub-job(s) (either individual jobs or tuning aggregate jobs) +This script dispatches sub-job(s) (individual jobs, use :file:`tuner.py` for tuning jobs) to worker(s) on GPU-enabled node(s) of a specific cluster as part of an resource-wrapped aggregate job. If no desired compute resources for each sub-job are specified, this script creates one worker per available node for each node with GPU(s) in the cluster. 
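For reference, a sweep configuration that satisfies the ``JobCfg`` contract documented in the docstrings above could look like the following minimal sketch. This is illustrative only: the class name and the direct import of ``JobCfg`` from ``tuner.py`` are assumptions made for the example, and the shipped ``CartpoleTheiaJobCfg`` in ``hyperparameter_tuning/vision_cartpole_cfg.py`` remains the authoritative reference.

.. code-block:: python

    from ray import tune

    # Assumed import path for this sketch; the shipped configurations may import JobCfg differently.
    from tuner import JobCfg


    class ExampleCartpoleSweepCfg(JobCfg):
        """Hypothetical minimal sweep: one task, one hyperparameter varied via Ray Tune."""

        def __init__(self):
            cfg = {}
            # Runner args: command line arguments passed to the training workflow.
            cfg["runner_args"] = {
                "headless_singleton": "--headless",
                "enable_cameras_singleton": "--enable_cameras",
                # The task to train on; tune.choice also allows sweeping over several tasks.
                "--task": tune.choice(["Isaac-Cartpole-RGB-TheiaTiny-v0"]),
            }
            # Hydra args: the hyperparameters varied within the sweep.
            cfg["hydra_args"] = {
                "agent.params.network.cnn.activation": tune.choice(["relu", "elu"]),
            }
            super().__init__(cfg)

A class of this shape would then be selected through the ``--cfg_file`` and ``--cfg_class`` arguments shown in the ``tuner.py`` examples above.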
From 11ce1e9e8fe88ffa02bf6db7b102f6cbf228900a Mon Sep 17 00:00:00 2001 From: Gary Lvov Date: Thu, 23 Jan 2025 18:41:08 -0500 Subject: [PATCH 02/19] update --- docs/source/features/ray.rst | 3 ++- .../ray/cluster_configs/google_cloud/kuberay.yaml.jinja | 4 ++-- source/standalone/workflows/ray/launch.py | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst index 789b562354..1ee4a82fd1 100644 --- a/docs/source/features/ray.rst +++ b/docs/source/features/ray.rst @@ -427,4 +427,5 @@ recreated! For KubeRay clusters, this can be done as follows. kubectl get raycluster | egrep 'isaacray' | awk '{print $1}' | xargs kubectl delete raycluster && kubectl get deployments | egrep 'mlflow' | awk '{print $1}' | xargs kubectl delete deployment && - kubectl get services | egrep 'mlflow' | awk '{print $1}' | xargs kubectl delete service + kubectl get services | egrep 'mlflow' | awk '{print $1}' | xargs kubectl delete service && + kubectl get services | egrep 'isaacray' | awk '{print $1}' | xargs kubectl delete service diff --git a/source/standalone/workflows/ray/cluster_configs/google_cloud/kuberay.yaml.jinja b/source/standalone/workflows/ray/cluster_configs/google_cloud/kuberay.yaml.jinja index 2d43075445..79a4ac1231 100644 --- a/source/standalone/workflows/ray/cluster_configs/google_cloud/kuberay.yaml.jinja +++ b/source/standalone/workflows/ray/cluster_configs/google_cloud/kuberay.yaml.jinja @@ -30,7 +30,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: head + name: {{ name }}-head spec: type: LoadBalancer template: @@ -130,7 +130,7 @@ spec: volumeMounts: - mountPath: /tmp/ray name: ray-logs - command: ["/bin/bash", "-c", "ray start --address=head.{{ namespace }}.svc.cluster.local:6379 && tail -f /dev/null"] + command: ["/bin/bash", "-c", "ray start --address={{name}}-head.{{ namespace }}.svc.cluster.local:6379 && tail -f /dev/null"] - image: fluent/fluent-bit:1.9.6 name: fluentbit resources: diff --git a/source/standalone/workflows/ray/launch.py b/source/standalone/workflows/ray/launch.py index 6e31c21ba9..5be009672d 100644 --- a/source/standalone/workflows/ray/launch.py +++ b/source/standalone/workflows/ray/launch.py @@ -79,6 +79,7 @@ def apply_manifest(args: argparse.Namespace) -> None: # Apply the Kubernetes manifest using kubectl try: + print(cleaned_yaml_string) subprocess.run(["kubectl", "apply", "-f", "-"], input=cleaned_yaml_string, text=True, check=True) except subprocess.CalledProcessError as e: exit(f"An error occurred while running `kubectl`: {e}") From cce47e764d77c626c9e6786c662036b16bff6707 Mon Sep 17 00:00:00 2001 From: Gary Lvov Date: Thu, 23 Jan 2025 19:24:28 -0500 Subject: [PATCH 03/19] fixes --- .../workflows/ray/cluster_configs/Dockerfile | 4 ++++ .../google_cloud/kuberay.yaml.jinja | 1 - .../workflows/ray/grok_cluster_with_kubectl.py | 16 ++++++++-------- source/standalone/workflows/ray/submit_job.py | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/source/standalone/workflows/ray/cluster_configs/Dockerfile b/source/standalone/workflows/ray/cluster_configs/Dockerfile index 48dfe56f7f..a09fc2e5f3 100644 --- a/source/standalone/workflows/ray/cluster_configs/Dockerfile +++ b/source/standalone/workflows/ray/cluster_configs/Dockerfile @@ -1,5 +1,9 @@ FROM isaac-lab-base:latest +# WGet is needed so that GCS or other cloud providers can mark the container as ready. +# Otherwise the liveliness checks fail. 
+RUN apt-get install wget + # Set NVIDIA paths ENV PATH="/usr/local/nvidia/bin:$PATH" ENV LD_LIBRARY_PATH="/usr/local/nvidia/lib64" diff --git a/source/standalone/workflows/ray/cluster_configs/google_cloud/kuberay.yaml.jinja b/source/standalone/workflows/ray/cluster_configs/google_cloud/kuberay.yaml.jinja index 79a4ac1231..40ccccf7c6 100644 --- a/source/standalone/workflows/ray/cluster_configs/google_cloud/kuberay.yaml.jinja +++ b/source/standalone/workflows/ray/cluster_configs/google_cloud/kuberay.yaml.jinja @@ -19,7 +19,6 @@ spec: block: "true" dashboard-host: 0.0.0.0 dashboard-port: "8265" - node-ip-address: "0.0.0.0" port: "6379" include-dashboard: "true" ray-debugger-external: "true" diff --git a/source/standalone/workflows/ray/grok_cluster_with_kubectl.py b/source/standalone/workflows/ray/grok_cluster_with_kubectl.py index c00a28d066..62e4c48c68 100644 --- a/source/standalone/workflows/ray/grok_cluster_with_kubectl.py +++ b/source/standalone/workflows/ray/grok_cluster_with_kubectl.py @@ -67,9 +67,10 @@ def get_clusters(pods: list, cluster_name_prefix: str) -> set: match = re.match(r"(" + re.escape(cluster_name_prefix) + r"[-\w]+)", pod_name) if match: - # Get base name without head/worker suffix - base_name = match.group(1).split("-head")[0].split("-worker")[0] - clusters.add(base_name) + # Get base name without head/worker suffix (skip workers) + if "head" in pod_name: + base_name = match.group(1).split("-head")[0] + clusters.add(base_name) return sorted(clusters) @@ -90,9 +91,7 @@ def get_mlflow_info(namespace: str = None, cluster_prefix: str = "isaacray") -> clusters = get_clusters(pods=pods, cluster_name_prefix=cluster_prefix) if len(clusters) > 1: raise ValueError("More than one cluster matches prefix, could not automatically determine mlflow info.") - - base_name = cluster_prefix.split("-head")[0].split("-worker")[0] - mlflow_name = f"{base_name}-mlflow" + mlflow_name = f"{cluster_prefix}-mlflow" cmd = ["kubectl", "get", "svc", mlflow_name, "-n", namespace, "--no-headers"] try: @@ -102,8 +101,9 @@ def get_mlflow_info(namespace: str = None, cluster_prefix: str = "isaacray") -> # Get cluster IP cluster_ip = fields[2] port = "5000" # Default MLflow port - - return f"https://{cluster_ip}:{port}" + # This needs to be http to be resolved. 
HTTPS can't be resolved + # This should be fine as it is on a subnet on the cluster regardless + return f"http://{cluster_ip}:{port}" except subprocess.CalledProcessError as e: raise ValueError(f"Could not grok MLflow: {e}") # Fixed f-string diff --git a/source/standalone/workflows/ray/submit_job.py b/source/standalone/workflows/ray/submit_job.py index 4f6d08326e..b7eb57343d 100644 --- a/source/standalone/workflows/ray/submit_job.py +++ b/source/standalone/workflows/ray/submit_job.py @@ -43,7 +43,7 @@ ./isaaclab.sh -p source/standalone/workflows/ray/submit_job.py \ --aggregate_jobs /workspace/isaaclab/source/standalone/workflows/ray/tuner.py \ --cfg_file hyperparameter_tuning/vision_cartpole_cfg.py \ - --cfg_class CartpoleRGBNoTuneJobCfg --mlflow_uri + --cfg_class CartpoleTheiaJobCfg --mlflow_uri # Example: Submitting resource wrapped job ./isaaclab.sh -p source/standalone/workflows/ray/submit_job.py --aggregate_jobs wrap_resources.py --sub_jobs ./isaaclab.sh -p source/standalone/workflows/rl_games/train.py --task Isaac-Cartpole-v0 --headless+./isaaclab.sh -p source/standalone/workflows/rl_games/train.py --task Isaac-Cartpole-RGB-Camera-Direct-v0 --headless --enable_cameras agent.params.config.max_epochs=150 From 5cb8ec5d51e2168018618bbe96e98f1f274acf92 Mon Sep 17 00:00:00 2001 From: garylvov Date: Thu, 23 Jan 2025 21:39:18 -0500 Subject: [PATCH 04/19] type hinting and doc formatting --- docs/source/features/ray.rst | 69 ++++++++++--------- .../workflows/ray/cluster_configs/Dockerfile | 4 +- source/standalone/workflows/ray/launch.py | 3 +- source/standalone/workflows/ray/tuner.py | 25 ++++--- .../workflows/ray/wrap_resources.py | 3 +- 5 files changed, 54 insertions(+), 50 deletions(-) diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst index 1ee4a82fd1..c231eca87d 100644 --- a/docs/source/features/ray.rst +++ b/docs/source/features/ray.rst @@ -20,34 +20,33 @@ the general workflow is the same. :depth: 3 :local: -Overview --------- +**Overview** +------------ -The Ray integration is useful for the following: +The Ray integration is useful for the following. -- Dispatching several training jobs in parallel or sequentially with minimal interaction -- Tuning hyperparameters; in parallel or sequentially with support for multiple GPUs and/or multiple GPU Nodes -- Using the same training setup everywhere (on cloud and local) with minimal overhead -- Resource Isolation for training jobs +- Dispatching several training jobs in parallel or sequentially with minimal interaction. +- Tuning hyperparameters; in parallel or sequentially with support for multiple GPUs and/or multiple GPU Nodes. +- Using the same training setup everywhere (on cloud and local) with minimal overhead. +- Resource Isolation for training jobs (resource-wrapped jobs). The core functionality of the Ray workflow consists of two main scripts that enable the orchestration -of resource-wrapped and tuning aggregate jobs. These scripts facilitate the decomposition of -aggregate jobs (overarching experiments) into individual jobs, which are discrete commands -executed on the cluster. An aggregate job can include multiple individual jobs. +of resource-wrapped and tuning aggregate jobs. In resource-wrapped aggregate jobs, each sub-job and its +resource requirements are defined manually, enabling resource isolation. +For tuning aggregate jobs, individual jobs are generated automatically based on a hyperparameter +sweep configuration. 
+ Both resource-wrapped and tuning aggregate jobs dispatch individual jobs to a designated Ray cluster, which leverages the cluster's resources (e.g., a single workstation node or multiple nodes) -to execute these jobs with workers in parallel and/or sequentially. +to execute these jobs with workers in parallel and/or sequentially. By default, jobs use all \ available resources on each available GPU-enabled node for each sub-job worker. This can be changed through specifying the ``--num_workers`` argument for resource-wrapped jobs, or ``--num_workers_per_node`` for tuning jobs, which is especially critical for parallel aggregate -job processing on local/virtual multi-GPU machines. +job processing on local/virtual multi-GPU machines. Tuning jobs assume homogeneous node resource composition for nodes with GPUs. -In resource-wrapped aggregate jobs, each sub-job and its -resource requirements are defined manually, enabling resource isolation. -For tuning aggregate jobs, individual jobs are generated automatically based on a hyperparameter -sweep configuration. Tuning jobs assume homogeneous node resource composition for nodes with GPUs. +The two following files contain the core functionality of the Ray integration. .. dropdown:: source/standalone/workflows/ray/wrap_resources.py :icon: code @@ -94,39 +93,47 @@ The following script can be used to easily create clusters on Google GKE. :language: python :emphasize-lines: 16-37 -** Docker-based Local Quickstart ** +**Docker-based Local Quickstart** ----------------------------------- -First, follow the `Docker Guide ` +First, follow the `Docker Guide `_ to set up the NVIDIA Container Toolkit and Docker Compose. -Then, try the following steps to start your first tuning run. +Then, run the following steps to start a tuning run. .. code-block:: bash - # Build the base image - ./isaaclab.sh docker/container.py start + # Build the base image, but we don't need to run it + python3 docker/container.py start && python3 docker/container.py stop # Build the tuning image with extra deps docker build -t isaacray -f source/standalone/workflows/ray/cluster_configs/Dockerfile . # Start the tuning image docker run -it --gpus all --net=host --entrypoint /bin/bash isaacray - # Start the Ray server within the tuning image + # Start the Ray server within the tuning image echo "import ray; ray.init(); import time; [time.sleep(10) for _ in iter(int, 1)]" | ./isaaclab.sh -p - + + + +In a different terminal, run the following. + + +.. code-block:: bash + # In a new terminal (don't close the above) , enter the image with a new shell. docker container ps docker exec -it /bin/bash - # Start a tuning run, with one parallel worker per GPU - /isaaclab.sh -p source/standalone/workflows/ray/tuner.py \ + # Start a tuning run, with one parallel worker per GPU + ./isaaclab.sh -p source/standalone/workflows/ray/tuner.py \ --cfg_file source/standalone/workflows/ray/hyperparameter_tuning/vision_cartpole_cfg.py \ --cfg_class CartpoleTheiaJobCfg \ + --run_mode local \ --num_workers_per_node -For tuning jobs, specify the tuning job / hyperparameter sweep as child class of - :class:`JobCfg` . -.. dropdown:: source/standalone/workflows/ray/tuner.py JobCfg definition +For tuning jobs, specify the tuning job / hyperparameter sweep as child class of :class:`JobCfg` . + +.. dropdown:: source/standalone/workflows/ray/tuner.py (JobCfg definition) :icon: code .. literalinclude:: ../../../source/standalone/workflows/ray/tuner.py @@ -204,7 +211,7 @@ Simple Ray Cluster (Local/VM) 4.) 
For tuning jobs, specify the tuning job / hyperparameter sweep as child class of :class:`JobCfg` . -.. dropdown:: source/standalone/workflows/ray/tuner.py +.. dropdown:: source/standalone/workflows/ray/tuner.py (JobCfg definition) :icon: code .. literalinclude:: ../../../source/standalone/workflows/ray/tuner.py @@ -275,7 +282,7 @@ Ray Clusters (Without Kubernetes) steps for creating an image/cluster permissions/bucket access. See the `Ray Clusters Overview `_ or -`Anyscale `_ for more information +`Anyscale `_ for more information. This guide assumes that one desires to create a cluster on a remote host or server. This @@ -382,7 +389,7 @@ Shared Steps Between KubeRay and Pure Ray Part II 3.) For tuning jobs, specify the tuning job / hyperparameter sweep as a :class:`JobCfg` . -.. dropdown:: source/standalone/workflows/ray/tuner.py +.. dropdown:: source/standalone/workflows/ray/tuner.py (JobCfg definition) :icon: code .. literalinclude:: ../../../source/standalone/workflows/ray/tuner.py diff --git a/source/standalone/workflows/ray/cluster_configs/Dockerfile b/source/standalone/workflows/ray/cluster_configs/Dockerfile index a09fc2e5f3..e75b756057 100644 --- a/source/standalone/workflows/ray/cluster_configs/Dockerfile +++ b/source/standalone/workflows/ray/cluster_configs/Dockerfile @@ -1,8 +1,8 @@ FROM isaac-lab-base:latest # WGet is needed so that GCS or other cloud providers can mark the container as ready. -# Otherwise the liveliness checks fail. -RUN apt-get install wget +# Otherwise the Ray liveliness checks fail. +RUN apt-get update && apt-get install wget # Set NVIDIA paths ENV PATH="/usr/local/nvidia/bin:$PATH" diff --git a/source/standalone/workflows/ray/launch.py b/source/standalone/workflows/ray/launch.py index 5be009672d..acd590a065 100644 --- a/source/standalone/workflows/ray/launch.py +++ b/source/standalone/workflows/ray/launch.py @@ -8,11 +8,10 @@ import subprocess import yaml +import util from jinja2 import Environment, FileSystemLoader from kubernetes import config -import util - """This script helps create one or more KubeRay clusters. Usage: diff --git a/source/standalone/workflows/ray/tuner.py b/source/standalone/workflows/ray/tuner.py index ddc9a97e5a..b15205c4d5 100644 --- a/source/standalone/workflows/ray/tuner.py +++ b/source/standalone/workflows/ray/tuner.py @@ -18,7 +18,7 @@ This script breaks down an aggregate tuning job, as defined by a hyperparameter sweep configuration, into individual jobs (shell commands) to run on the GPU-enabled nodes of the cluster. By default, one worker is created for each GPU-enabled node in the cluster for each individual job. -To use more than one worker per node (likely the case for multi-GPU machines), supply the +To use more than one worker per node (likely the case for multi-GPU machines), supply the num_workers_per_node argument. 
Each hyperparameter sweep configuration should include the workflow, @@ -40,16 +40,15 @@ ./isaaclab.sh -p source/standalone/workflows/ray/tuner.py -h # Examples - # Local (not within a docker container, when within a local docker container, do not supply run_mode argument) + # Local ./isaaclab.sh -p source/standalone/workflows/ray/tuner.py --run_mode local \ --cfg_file source/standalone/workflows/ray/hyperparameter_tuning/vision_cartpole_cfg.py \ - --cfg_class CartpoleRGBNoTuneJobCfg - # Local docker: start the ray server and run above command in the same running container without run_mode arg + --cfg_class CartpoleTheiaJobCfg # Remote (run grok cluster or create config file mentioned in :file:`submit_job.py`) ./isaaclab.sh -p source/standalone/workflows/ray/submit_job.py \ --aggregate_jobs tuner.py \ --cfg_file hyperparameter_tuning/vision_cartpole_cfg.py \ - --cfg_class CartpoleRGBNoTuneJobCfg --mlflow_uri + --cfg_class CartpoleTheiaJobCfg --mlflow_uri """ @@ -75,7 +74,7 @@ def setup(self, config: dict) -> None: print(f"[INFO]: Recovered invocation with {self.invoke_cmd}") self.experiment = None - def reset_config(self, new_config): + def reset_config(self, new_config: dict): """Allow environments to be re-used by fetching a new invocation command""" self.setup(new_config) return True @@ -221,23 +220,23 @@ class JobCfg: """To be compatible with :meth: invoke_tuning_run and :class:IsaacLabTuneTrainable, at a minimum, the tune job should inherit from this class.""" - def __init__(self, cfg): - ''' + def __init__(self, cfg: dict): + """ Runner args include command line arguments passed to the task. For example: cfg["runner_args"]["headless_singleton"] = "--headless" cfg["runner_args"]["enable_cameras_singleton"] = "--enable_cameras" - ''' + """ assert "runner_args" in cfg, "No runner arguments specified." - ''' + """ Task is the desired task to train on. For example: cfg["runner_args"]["--task"] = tune.choice(["Isaac-Cartpole-RGB-TheiaTiny-v0"]) - ''' + """ assert "--task" in cfg["runner_args"], "No task specified." - ''' + """ Hydra args define the hyperparameters varied within the sweep. For example: cfg["hydra_args"]["agent.params.network.cnn.activation"] = tune.choice(["relu", "elu"]) - ''' + """ assert "hydra_args" in cfg, "No hyperparameters specified." self.cfg = cfg diff --git a/source/standalone/workflows/ray/wrap_resources.py b/source/standalone/workflows/ray/wrap_resources.py index 0cd18762b0..cd29b39d8d 100644 --- a/source/standalone/workflows/ray/wrap_resources.py +++ b/source/standalone/workflows/ray/wrap_resources.py @@ -6,9 +6,8 @@ import argparse import ray -from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy - import util +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy """ This script dispatches sub-job(s) (individual jobs, use :file:`tuner.py` for tuning jobs) From 00b0ef07d35f82bcbd45daea5cbf1363f1169f50 Mon Sep 17 00:00:00 2001 From: garylvov Date: Thu, 23 Jan 2025 21:42:33 -0500 Subject: [PATCH 05/19] formatting --- docs/source/features/ray.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst index c231eca87d..265646e944 100644 --- a/docs/source/features/ray.rst +++ b/docs/source/features/ray.rst @@ -208,8 +208,9 @@ Simple Ray Cluster (Local/VM) :language: python :emphasize-lines: 14-66 -4.) For tuning jobs, specify the tuning job / hyperparameter sweep as child class of - :class:`JobCfg` . + +4.) 
For tuning jobs, specify the tuning job / hyperparameter sweep as child class of :class:`JobCfg` . + .. dropdown:: source/standalone/workflows/ray/tuner.py (JobCfg definition) :icon: code From b28dbe0e7332aaf439634ef681608169f37591d0 Mon Sep 17 00:00:00 2001 From: garylvov Date: Thu, 23 Jan 2025 21:48:12 -0500 Subject: [PATCH 06/19] finish --- docs/source/features/ray.rst | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst index 265646e944..9445c07d7d 100644 --- a/docs/source/features/ray.rst +++ b/docs/source/features/ray.rst @@ -240,11 +240,8 @@ For example, see the following Cartpole Example configurations. To view the logs, simply run ``tensorboard --logdir=`` . -Remote Ray Cluster -'''''''''''''''''' - -**Setup Overview: Cluster Configuration** -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Remote Ray Clusters +''''''''''''''''''' Select one of the following methods to create a Ray cluster to accept and execute dispatched jobs. @@ -271,7 +268,7 @@ any cloud provider should work if one configures the following. provided that your account or organization has been granted a GPU-budget. It is recommended to use manual kubernetes services as opposed to "autopilot" services for cost-effective experimentation as this way clusters can be completely shut down when not in use, although - this may require installing the `Nvidia GPU Operator `_ + this may require installing the `Nvidia GPU Operator `_ . - An MLFlow server that your cluster has access to. - A ``kuberay.yaml.ninja`` file that describes how to allocate resources (already included for Google Cloud, which can be referenced for the format and MLFlow integration). @@ -286,10 +283,6 @@ See the `Ray Clusters Overview `_ for more information. -This guide assumes that one desires to create a cluster on a remote host or server. This -guide includes shared steps, and KubeRay or Ray specific steps. Follow all shared steps (part I and II), and then -only the KubeRay or Ray steps depending on your desired configuration, in order of shared steps part I, then -the configuration specific steps, then shared steps part II. Shared Steps Between KubeRay and Pure Ray Part I ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -305,8 +298,8 @@ Shared Steps Between KubeRay and Pure Ray Part I # Push the image to your registry of choice. docker push -KubeRay Specific -~~~~~~~~~~~~~~~~ +KubeRay Clusters Only +~~~~~~~~~~~~~~~~~~~~~ `k9s `_ is a great tool for monitoring your clusters that can easily be installed with ``snap install k9s --devmode``. @@ -355,8 +348,8 @@ printed. :language: python :emphasize-lines: 14-26 -Ray Specific -~~~~~~~~~~~~ +Ray Clusters Only (Without Kubernetes) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1.) Verify cluster access. @@ -409,11 +402,11 @@ For example, see the following Cartpole Example configurations. Tuning jobs can also be submitted via ``submit_job.py`` . To view the tuning results, view the MLFlow dashboard of the server that you created. -For KubeRay, this can be done through port forwarding the MLFlow dashboard, with +For KubeRay, this can be done through port forwarding the MLFlow dashboard with the following. ``kubectl port-forward service/isaacray-mlflow 5000:5000`` -and visiting the following address in a browser. +Then visit the following address in a browser. 
``localhost:5000`` From e725f1bc93f66b0299a504eeb204c956ff2c4b44 Mon Sep 17 00:00:00 2001 From: garylvov Date: Thu, 23 Jan 2025 21:49:45 -0500 Subject: [PATCH 07/19] polish --- docs/source/features/ray.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst index 9445c07d7d..e9ab31609f 100644 --- a/docs/source/features/ray.rst +++ b/docs/source/features/ray.rst @@ -157,7 +157,7 @@ The Ray functionality requires additional dependencies be installed. To use Ray without Kubernetes, like on a local computer or VM, ``kubectl`` is not required. For use on Kubernetes clusters with KubeRay, such as Google Kubernetes Engine or Amazon Elastic Kubernetes Service, ``kubectl`` is required, and can -be installed via the `Kubernetes website `_ +be installed via the `Kubernetes website `_ . The pythonic dependencies can be installed with the following. From 74a9deb0aa7bd6a805c274ab4f9983dbd25ed747 Mon Sep 17 00:00:00 2001 From: garylvov Date: Thu, 23 Jan 2025 21:51:08 -0500 Subject: [PATCH 08/19] dont need bucket --- docs/source/features/ray.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst index e9ab31609f..98dfa57c14 100644 --- a/docs/source/features/ray.rst +++ b/docs/source/features/ray.rst @@ -277,7 +277,7 @@ Ray Clusters (Without Kubernetes) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. attention:: Modify the Ray command to use Isaac Python like in KubeRay clusters, and follow the same - steps for creating an image/cluster permissions/bucket access. + steps for creating an image/cluster permissions See the `Ray Clusters Overview `_ or `Anyscale `_ for more information. From 8766dfc8337c5f59074fe82fa96eb954542a9352 Mon Sep 17 00:00:00 2001 From: garylvov Date: Thu, 23 Jan 2025 21:51:34 -0500 Subject: [PATCH 09/19] more . --- docs/source/features/ray.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst index 98dfa57c14..604ac26238 100644 --- a/docs/source/features/ray.rst +++ b/docs/source/features/ray.rst @@ -277,7 +277,7 @@ Ray Clusters (Without Kubernetes) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. attention:: Modify the Ray command to use Isaac Python like in KubeRay clusters, and follow the same - steps for creating an image/cluster permissions + steps for creating an image/cluster permissions. See the `Ray Clusters Overview `_ or `Anyscale `_ for more information. From e9477eee9d6bbd287b6b0c9c477ecfffc4bb7d8e Mon Sep 17 00:00:00 2001 From: garylvov Date: Thu, 23 Jan 2025 22:30:31 -0500 Subject: [PATCH 10/19] clean up --- docs/source/features/ray.rst | 160 ++++++++---------- .../ray/grok_cluster_with_kubectl.py | 2 +- source/standalone/workflows/ray/launch.py | 6 +- source/standalone/workflows/ray/submit_job.py | 6 +- 4 files changed, 73 insertions(+), 101 deletions(-) diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst index 604ac26238..c7ac3495c1 100644 --- a/docs/source/features/ray.rst +++ b/docs/source/features/ray.rst @@ -130,89 +130,47 @@ In a different terminal, run the following. --num_workers_per_node +To view the training logs, in a different terminal, run the following and visit ``localhost:6006`` in a browser afterwards. -For tuning jobs, specify the tuning job / hyperparameter sweep as child class of :class:`JobCfg` . +.. code-block:: bash -.. 
dropdown:: source/standalone/workflows/ray/tuner.py (JobCfg definition) - :icon: code + # In a new terminal (don't close the above) , enter the image with a new shell. + docker container ps + docker exec -it /bin/bash + # Start a tuning run, with one parallel worker per GPU + tensorboard --logdir=. - .. literalinclude:: ../../../source/standalone/workflows/ray/tuner.py - :language: python - :start-at: class JobCfg - :end-at: self.cfg = cfg -For example, see the following Cartpole Example configurations. +Submitting resource-wrapped individual jobs instead of automatic tuning runs is described in the following file. -.. dropdown:: source/standalone/workflows/ray/hyperparameter_tuning/vision_cartpole_cfg.py +.. dropdown:: source/standalone/workflows/ray/wrap_resources.py :icon: code - .. literalinclude:: ../../../source/standalone/workflows/ray/hyperparameter_tuning/vision_cartpole_cfg.py + .. literalinclude:: ../../../source/standalone/workflows/ray/wrap_resources.py :language: python + :emphasize-lines: 14-66 -**Installation** ----------------- - -The Ray functionality requires additional dependencies be installed. - -To use Ray without Kubernetes, like on a local computer or VM, -``kubectl`` is not required. For use on Kubernetes clusters with KubeRay, -such as Google Kubernetes Engine or Amazon Elastic Kubernetes Service, ``kubectl`` is required, and can -be installed via the `Kubernetes website `_ . - -The pythonic dependencies can be installed with the following. .. code-block:: bash - # For multi-run support and resource isolation - ./isaaclab.sh -p -m pip install ray[default]==2.31.0 - # For hyperparameter tuning - ./isaaclab.sh -p -m pip install ray[tune]==2.31.0 - ./isaaclab.sh -p -m pip install optuna bayesian-optimization - # MLFlow is needed only for fetching logs on clusters, not needed for local - ./isaaclab.sh -p -m pip install mlflow - -If using KubeRay clusters on Google GKE with the batteries-included cluster launch file, -the following dependencies are also needed. - -.. code-block:: bash - - ./isaaclab.sh -p -m pip install kubernetes Jinja2 - - -**Dispatching Jobs and Tuning** -------------------------------- - -Select one of the following guides that matches your desired cluster configuration. - -Simple Ray Cluster (Local/VM) -''''''''''''''''''''''''''''' - -1.) Start a Ray cluster. - -.. code-block:: bash + # In a new terminal (don't close the above) , enter the image with a new shell. + docker container ps + docker exec -it /bin/bash + # Start a tuning run, with one parallel worker per GPU + tensorboard --logdir=. - echo "import ray; ray.init(); import time; [time.sleep(10) for _ in iter(int, 1)]" | ./isaaclab.sh -p -2.) Testing that the cluster works can be done as follows. +Transferring files from the running container can be done as follows. .. code-block:: bash - ./isaaclab.sh -p source/standalone/workflows/ray/wrap_resources.py --test - -3.) Submitting resource-wrapped individual can be done as described in the following file. - -.. dropdown:: source/standalone/workflows/ray/wrap_resources.py - :icon: code - - .. literalinclude:: ../../../source/standalone/workflows/ray/wrap_resources.py - :language: python - :emphasize-lines: 14-66 - + docker container ps + docker cp : -4.) For tuning jobs, specify the tuning job / hyperparameter sweep as child class of :class:`JobCfg` . +For tuning jobs, specify the tuning job / hyperparameter sweep as child class of :class:`JobCfg` . -.. dropdown:: source/standalone/workflows/ray/tuner.py (JobCfg definition) +.. 
dropdown:: source/standalone/workflows/ray/tuner.py (JobCfg definition) :icon: code .. literalinclude:: ../../../source/standalone/workflows/ray/tuner.py @@ -229,24 +187,13 @@ For example, see the following Cartpole Example configurations. :language: python -5.) Then, see the local examples in the following file to see how to start a tuning run. - -.. dropdown:: source/standalone/workflows/ray/tuner.py - :icon: code - - .. literalinclude:: ../../../source/standalone/workflows/ray/tuner.py - :language: python - :emphasize-lines: 18-53 - -To view the logs, simply run ``tensorboard --logdir=`` . - -Remote Ray Clusters -''''''''''''''''''' +**Remote Clusters** +------------------------- Select one of the following methods to create a Ray cluster to accept and execute dispatched jobs. -KubeRay Clusters -~~~~~~~~~~~~~~~~ +KubeRay Setup +~~~~~~~~~~~~~ .. attention:: The ``ray`` command should be modified to use Isaac python, which could be achieved in a fashion similar to ``sed -i "1i $(echo "#!/workspace/isaaclab/_isaac_sim/python.sh")" \ @@ -273,8 +220,20 @@ any cloud provider should work if one configures the following. - A ``kuberay.yaml.ninja`` file that describes how to allocate resources (already included for Google Cloud, which can be referenced for the format and MLFlow integration). -Ray Clusters (Without Kubernetes) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For use on Kubernetes clusters with KubeRay, +such as Google Kubernetes Engine or Amazon Elastic Kubernetes Service, ``kubectl`` is required, and can +be installed via the `Kubernetes website `_ . + +If using KubeRay clusters on Google GKE with the batteries-included cluster launch file, +the following dependencies are also needed. + +.. code-block:: bash + + python3 -p -m pip install kubernetes Jinja2 + +Ray Clusters (Without Kubernetes) Setup +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. attention:: Modify the Ray command to use Isaac Python like in KubeRay clusters, and follow the same steps for creating an image/cluster permissions. @@ -282,12 +241,19 @@ Ray Clusters (Without Kubernetes) See the `Ray Clusters Overview `_ or `Anyscale `_ for more information. +This likely requires a `local installation of Ray `_ to interface with the cluster. Shared Steps Between KubeRay and Pure Ray Part I ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -1.) Build the Isaac Ray image, and upload it to your container registry of choice. +1.) Install Ray on your local machine. + +.. code-block:: bash + + python3 -p -m pip install ray[default]==2.31.0 + +2.) Build the Isaac Ray image, and upload it to your container registry of choice. .. code-block:: bash @@ -322,11 +288,7 @@ easily be installed with ``snap install k9s --devmode``. 2.) Create the KubeRay cluster and an MLFlow server for receiving logs that your cluster has access to. This can be done automatically for Google GKE, -where instructions are included in the following creation file. More than once cluster -can be created at once. Each cluster can have heterogeneous resources if so desired, -although only -For other cloud services, the ``kuberay.yaml.ninja`` will be similar to that of -Google's. +where instructions are included in the following creation file. .. dropdown:: source/standalone/workflows/ray/launch.py :icon: code @@ -335,6 +297,18 @@ Google's. :language: python :emphasize-lines: 15-37 +For other cloud services, the ``kuberay.yaml.ninja`` will be similar to that of +Google's. + + +.. 
dropdown:: source/standalone/workflows/ray/cluster_configs/google_cloud/kuberay.yaml.ninja
+   :icon: code
+
+   .. literalinclude:: ../../../source/standalone/workflows/ray/cluster_configs/google_cloud/kuberay.yaml.jinja
+   :language: python
+
+
+3.) Fetch the KubeRay cluster IP addresses, and the MLFLow Server IP. This can be done automatically
+for KubeRay clusters, where instructions are included in the following fetching file.
@@ -360,8 +334,8 @@ a new line for each unique cluster. For one cluster, there should only be one li
 3.) Start an MLFLow Server to receive the logs that the ray cluster has access to, and determine the server URI.
 
-Shared Steps Between KubeRay and Pure Ray Part II
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Dispatching Steps Shared Between KubeRay and Pure Ray Part II
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 1.) Test that your cluster is operational with the following.
 
@@ -369,10 +343,9 @@ Shared Steps Between KubeRay and Pure Ray Part II
 
 .. code-block:: bash
 
     # Test that NVIDIA GPUs are visible and that Ray is operation with the following command:
-    ./isaaclab.sh -p source/standalone/workflows/ray/wrap_resources.py
-    --jobs wrap_resources.py --test
+    python3 source/standalone/workflows/ray/submit_job.py --aggregate_jobs wrap_resources.py --test
 
-2.) Submitting Jobs can be done in the following manner, with the following script.
+2.) Submitting tuning and/or resource-wrapped jobs is described in the :file:`submit_job.py` file.
 
 .. dropdown:: source/standalone/workflows/ray/submit_job.py
    :icon: code
@@ -399,7 +372,6 @@ For example, see the following Cartpole Example configurations.
    .. literalinclude:: ../../../source/standalone/workflows/ray/hyperparameter_tuning/vision_cartpole_cfg.py
    :language: python
 
-Tuning jobs can also be submitted via ``submit_job.py`` .
 To view the tuning results, view the MLFlow dashboard of the server that you created.
 For KubeRay, this can be done through port forwarding the MLFlow dashboard with the following.
@@ -417,8 +389,8 @@ this following command.
    --uri http://localhost:5000 --experiment-name IsaacRay--tune --download-dir test``
 
-**Cluster Cleanup**
-'''''''''''''''''''
+**Kubernetes Cluster Cleanup**
+''''''''''''''''''''''''''''''
 
 For the sake of conserving resources, and potentially freeing precious GPU resources for
 other people to use on shared compute platforms, please destroy the Ray cluster after use. They can be easily
diff --git a/source/standalone/workflows/ray/grok_cluster_with_kubectl.py b/source/standalone/workflows/ray/grok_cluster_with_kubectl.py
index 62e4c48c68..a8f8bdd7c8 100644
--- a/source/standalone/workflows/ray/grok_cluster_with_kubectl.py
+++ b/source/standalone/workflows/ray/grok_cluster_with_kubectl.py
@@ -21,7 +21,7 @@
 
 .. code-block:: bash
 
-    ./isaaclab.sh -p source/standalone/workflows/ray/grok_cluster_with_kubectl.py
+    python3 source/standalone/workflows/ray/grok_cluster_with_kubectl.py
     # For options, supply -h arg
 """
 
diff --git a/source/standalone/workflows/ray/launch.py b/source/standalone/workflows/ray/launch.py
index acd590a065..e64779d407 100644
--- a/source/standalone/workflows/ray/launch.py
+++ b/source/standalone/workflows/ray/launch.py
@@ -18,18 +18,18 @@
 .. code-block:: bash
 
     # If the head node is stuck on container creating, make sure to create a secret
-    ./isaaclab.sh -p source/standalone/workflows/ray/launch.py -h
+    python3 source/standalone/workflows/ray/launch.py -h
 
     # Examples
 
     # The following creates 8 GPUx1 nvidia l4 workers
-    ./isaaclab.sh -p source/standalone/workflows/ray/launch.py --cluster_host google_cloud \
+    python3 source/standalone/workflows/ray/launch.py --cluster_host google_cloud \
        --namespace --image \
        --num_workers 8 --num_clusters 1 --worker_accelerator nvidia-l4 --gpu_per_worker 1
 
     # The following creates 1 GPUx1 nvidia l4 worker, 2 GPUx2 nvidia-tesla-t4 workers,
     # and 2 GPUx4 nvidia-tesla-t4 GPU workers
-    ./isaaclab.sh -p source/standalone/workflows/ray/launch.py --cluster_host google_cloud \
+    python3 source/standalone/workflows/ray/launch.py --cluster_host google_cloud \
        --namespace --image \
        --num_workers 1 2 --num_clusters 1 \
        --worker_accelerator nvidia-l4 nvidia-tesla-t4 --gpu_per_worker 1 2 4
diff --git a/source/standalone/workflows/ray/submit_job.py b/source/standalone/workflows/ray/submit_job.py
index b7eb57343d..8fd1fb87f9 100644
--- a/source/standalone/workflows/ray/submit_job.py
+++ b/source/standalone/workflows/ray/submit_job.py
@@ -40,16 +40,16 @@
 .. code-block:: bash
 
     # Example; submitting a tuning job
-    ./isaaclab.sh -p source/standalone/workflows/ray/submit_job.py \
+    python3 source/standalone/workflows/ray/submit_job.py \
        --aggregate_jobs /workspace/isaaclab/source/standalone/workflows/ray/tuner.py \
        --cfg_file hyperparameter_tuning/vision_cartpole_cfg.py \
        --cfg_class CartpoleTheiaJobCfg --mlflow_uri
 
     # Example: Submitting resource wrapped job
-    ./isaaclab.sh -p source/standalone/workflows/ray/submit_job.py --aggregate_jobs wrap_resources.py --sub_jobs ./isaaclab.sh -p source/standalone/workflows/rl_games/train.py --task Isaac-Cartpole-v0 --headless+./isaaclab.sh -p source/standalone/workflows/rl_games/train.py --task Isaac-Cartpole-RGB-Camera-Direct-v0 --headless --enable_cameras agent.params.config.max_epochs=150
+    python3 source/standalone/workflows/ray/submit_job.py --aggregate_jobs wrap_resources.py --test
 
     # For all command line arguments
-    ./isaaclab.sh -p source/standalone/workflows/ray/submit_job.py -h
+    python3 source/standalone/workflows/ray/submit_job.py -h
 """
 
 script_directory = os.path.dirname(os.path.abspath(__file__))
 CONFIG = {"working_dir": script_directory, "executable": "/workspace/isaaclab/isaaclab.sh -p"}

From 9305c85ada323dd56c122c206bc2133d61556504 Mon Sep 17 00:00:00 2001
From: garylvov
Date: Thu, 23 Jan 2025 22:39:12 -0500
Subject: [PATCH 11/19] finish v2

---
 docs/source/features/ray.rst | 49 +++++++++++++++---------------------
 1 file changed, 20 insertions(+), 29 deletions(-)

diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst
index c7ac3495c1..56bfa6c1d2 100644
--- a/docs/source/features/ray.rst
+++ b/docs/source/features/ray.rst
@@ -150,16 +150,6 @@ Submitting resource-wrapped individual jobs instead of automatic tuning runs is
    :language: python
    :emphasize-lines: 14-66
-
-.. code-block:: bash
-
-    # In a new terminal (don't close the above) , enter the image with a new shell.
-    docker container ps
-    docker exec -it /bin/bash
-    # Start a tuning run, with one parallel worker per GPU
-    tensorboard --logdir=.
-
-
 Transferring files from the running container can be done as follows.
 
 .. code-block:: bash
@@ -194,14 +184,26 @@ Select one of the following methods to create a Ray cluster to accept and execut
 KubeRay Setup
 ~~~~~~~~~~~~~
+
+If using KubeRay clusters on Google GKE with the batteries-included cluster launch file,
+the following dependencies are also needed.
+
+.. code-block:: bash
+
+    python3 -m pip install kubernetes Jinja2
+
+For use on Kubernetes clusters with KubeRay,
+such as Google Kubernetes Engine or Amazon Elastic Kubernetes Service, ``kubectl`` is required, and can
+be installed via the `Kubernetes website `_ .
+
+Google Cloud is currently the only platform tested, although
+any cloud provider should work if one configures the following.
+
 .. attention::
     The ``ray`` command should be modified to use Isaac python, which could be achieved in a fashion similar to
     ``sed -i "1i $(echo "#!/workspace/isaaclab/_isaac_sim/python.sh")" \
     /isaac-sim/kit/python/bin/ray && ln -s /isaac-sim/kit/python/bin/ray /usr/local/bin/ray``.
-Google Cloud is currently the only platform tested, although
-any cloud provider should work if one configures the following.
-
 - An container registry (NGC, GCS artifact registry, AWS ECR, etc) with an Isaac Lab image configured to
   support Ray. See ``cluster_configs/Dockerfile`` to see how to modify the ``isaac-lab-base``
   container for Ray compatibility. Ray should use the isaac sim python shebang, and ``nvidia-smi``
@@ -216,22 +218,11 @@ any cloud provider should work if one configures the following.
   to use manual kubernetes services as opposed to "autopilot" services for cost-effective
   experimentation as this way clusters can be completely shut down when not in use, although
   this may require installing the `Nvidia GPU Operator `_ .
-- An MLFlow server that your cluster has access to.
+- An `MLFlow server `_ that your cluster has access to
+  (already included for Google Cloud, which can be referenced for the format and MLFlow integration).
 - A ``kuberay.yaml.ninja`` file that describes how to allocate resources (already included for
   Google Cloud, which can be referenced for the format and MLFlow integration).
-
-For use on Kubernetes clusters with KubeRay,
-such as Google Kubernetes Engine or Amazon Elastic Kubernetes Service, ``kubectl`` is required, and can
-be installed via the `Kubernetes website `_ .
-
-If using KubeRay clusters on Google GKE with the batteries-included cluster launch file,
-the following dependencies are also needed.
-
-.. code-block:: bash
-
-    python3 -p -m pip install kubernetes Jinja2
-
 Ray Clusters (Without Kubernetes) Setup
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. attention::
@@ -241,8 +232,8 @@ Ray Clusters (Without Kubernetes) Setup
 
 See the `Ray Clusters Overview `_ or `Anyscale `_ for more information.
-This likely requires a `local installation of Ray `_ to interface with the cluster.
-
+Also, create an `MLFlow server `_ that your local
+host and cluster have access to.
 
 Shared Steps Between KubeRay and Pure Ray Part I
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -258,7 +249,7 @@ Shared Steps Between KubeRay and Pure Ray Part I
 1.) Build the Isaac Ray image, and upload it to your container registry of choice.
 
 .. code-block:: bash
 
     # Login with NGC (nvcr.io) registry first, see docker steps in repo.
-    ./isaaclab.sh -p docker/container.py start
+    python3 docker/container.py start
     # Build the special Isaac Lab Ray Image
    docker build -t -f source/standalone/workflows/ray/cluster_configs/Dockerfile .
    # Push the image to your registry of choice.
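Both setup paths above end with an MLFlow tracking server that the cluster and the local host must be able to reach.
The snippet below is an editorial sketch rather than part of the patches: it is a minimal reachability check that
assumes ``mlflow`` is installed locally, and ``http://localhost:5000`` is only a placeholder for your own server URI.

.. code-block:: python

    import mlflow
    from mlflow.tracking import MlflowClient

    # Placeholder URI; replace with the MLFlow server that the Ray cluster can also reach.
    tracking_uri = "http://localhost:5000"
    mlflow.set_tracking_uri(tracking_uri)

    # Listing experiments is a cheap way to confirm the server is usable
    # before dispatching tuning or resource-wrapped jobs to the cluster.
    client = MlflowClient()
    try:
        experiments = client.search_experiments()
        print(f"MLFlow reachable at {tracking_uri}; found {len(experiments)} experiment(s).")
    except Exception as err:
        print(f"Could not reach MLFlow at {tracking_uri}: {err}")

Running this from both the dispatching machine and (for KubeRay) from a pod inside the cluster confirms that the
port-forwarding or service configuration is correct before any training time is spent.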
From d320287693c9f68afc62dec71bdd428c3d21d47c Mon Sep 17 00:00:00 2001
From: Gary Lvov
Date: Thu, 23 Jan 2025 23:40:26 -0500
Subject: [PATCH 12/19] add symlink

---
 docs/source/features/ray.rst | 53 +++++++++++++++---------------------
 1 file changed, 22 insertions(+), 31 deletions(-)

diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst
index c7ac3495c1..5ffa34872d 100644
--- a/docs/source/features/ray.rst
+++ b/docs/source/features/ray.rst
@@ -107,8 +107,8 @@ Then, run the following steps to start a tuning run.
 
     python3 docker/container.py start && python3 docker/container.py stop
     # Build the tuning image with extra deps
     docker build -t isaacray -f source/standalone/workflows/ray/cluster_configs/Dockerfile .
-    # Start the tuning image
-    docker run -it --gpus all --net=host --entrypoint /bin/bash isaacray
+    # Start the tuning image - symlink so that changes in the source folder show up in the container
+    docker run -v $(pwd)/source:/workspace/isaaclab/source -it --gpus all --net=host --entrypoint /bin/bash isaacray
     # Start the Ray server within the tuning image
     echo "import ray; ray.init(); import time; [time.sleep(10) for _ in iter(int, 1)]" | ./isaaclab.sh -p
 
@@ -150,16 +150,6 @@ Submitting resource-wrapped individual jobs instead of automatic tuning runs is
    :language: python
    :emphasize-lines: 14-66
-
-.. code-block:: bash
-
-    # In a new terminal (don't close the above) , enter the image with a new shell.
-    docker container ps
-    docker exec -it /bin/bash
-    # Start a tuning run, with one parallel worker per GPU
-    tensorboard --logdir=.
-
-
 Transferring files from the running container can be done as follows.
 
 .. code-block:: bash
@@ -194,14 +184,26 @@ Select one of the following methods to create a Ray cluster to accept and execut
 KubeRay Setup
 ~~~~~~~~~~~~~
+
+If using KubeRay clusters on Google GKE with the batteries-included cluster launch file,
+the following dependencies are also needed.
+
+.. code-block:: bash
+
+    python3 -m pip install kubernetes Jinja2
+
+For use on Kubernetes clusters with KubeRay,
+such as Google Kubernetes Engine or Amazon Elastic Kubernetes Service, ``kubectl`` is required, and can
+be installed via the `Kubernetes website `_ .
+
+Google Cloud is currently the only platform tested, although
+any cloud provider should work if one configures the following.
+
 .. attention::
     The ``ray`` command should be modified to use Isaac python, which could be achieved in a fashion similar to
     ``sed -i "1i $(echo "#!/workspace/isaaclab/_isaac_sim/python.sh")" \
     /isaac-sim/kit/python/bin/ray && ln -s /isaac-sim/kit/python/bin/ray /usr/local/bin/ray``.
-Google Cloud is currently the only platform tested, although
-any cloud provider should work if one configures the following.
-
 - An container registry (NGC, GCS artifact registry, AWS ECR, etc) with an Isaac Lab image configured to
   support Ray. See ``cluster_configs/Dockerfile`` to see how to modify the ``isaac-lab-base``
   container for Ray compatibility. Ray should use the isaac sim python shebang, and ``nvidia-smi``
@@ -216,22 +218,11 @@ any cloud provider should work if one configures the following.
  to use manual kubernetes services as opposed to "autopilot" services for cost-effective
  experimentation as this way clusters can be completely shut down when not in use, although
  this may require installing the `Nvidia GPU Operator `_ .
-- An MLFlow server that your cluster has access to.
+- An `MLFlow server `_ that your cluster has access to
+  (already included for Google Cloud, which can be referenced for the format and MLFlow integration).
 - A ``kuberay.yaml.ninja`` file that describes how to allocate resources (already included for
   Google Cloud, which can be referenced for the format and MLFlow integration).
-
-For use on Kubernetes clusters with KubeRay,
-such as Google Kubernetes Engine or Amazon Elastic Kubernetes Service, ``kubectl`` is required, and can
-be installed via the `Kubernetes website `_ .
-
-If using KubeRay clusters on Google GKE with the batteries-included cluster launch file,
-the following dependencies are also needed.
-
-.. code-block:: bash
-
-    python3 -p -m pip install kubernetes Jinja2
-
 Ray Clusters (Without Kubernetes) Setup
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. attention::
@@ -241,8 +232,8 @@ Ray Clusters (Without Kubernetes) Setup
 
 See the `Ray Clusters Overview `_ or `Anyscale `_ for more information.
-This likely requires a `local installation of Ray `_ to interface with the cluster.
-
+Also, create an `MLFlow server `_ that your local
+host and cluster have access to.
 
 Shared Steps Between KubeRay and Pure Ray Part I
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -258,7 +249,7 @@ Shared Steps Between KubeRay and Pure Ray Part I
 1.) Build the Isaac Ray image, and upload it to your container registry of choice.
 
 .. code-block:: bash
 
     # Login with NGC (nvcr.io) registry first, see docker steps in repo.
-    ./isaaclab.sh -p docker/container.py start
+    python3 docker/container.py start
     # Build the special Isaac Lab Ray Image
    docker build -t -f source/standalone/workflows/ray/cluster_configs/Dockerfile .
    # Push the image to your registry of choice.

From 8cab13553d30bd9de55f757127a9f52b1d5f8241 Mon Sep 17 00:00:00 2001
From: Gary Lvov
Date: Fri, 24 Jan 2025 00:05:36 -0500
Subject: [PATCH 13/19] remove bold from cleanup

---
 docs/source/features/ray.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst
index 5ffa34872d..d82b109959 100644
--- a/docs/source/features/ray.rst
+++ b/docs/source/features/ray.rst
@@ -380,8 +380,8 @@ this following command.
    --uri http://localhost:5000 --experiment-name IsaacRay--tune --download-dir test``
 
-**Kubernetes Cluster Cleanup**
-''''''''''''''''''''''''''''''
+Kubernetes Cluster Cleanup
+''''''''''''''''''''''''''
 
 For the sake of conserving resources, and potentially freeing precious GPU resources for
 other people to use on shared compute platforms, please destroy the Ray cluster after use. They can be easily

From 27930c4bca675d78b87d6b0cd1492dcf4bc02837 Mon Sep 17 00:00:00 2001
From: Gary Lvov
Date: Fri, 24 Jan 2025 11:32:04 -0500
Subject: [PATCH 14/19] add compatibility with other workflows

---
 docs/source/features/ray.rst                |  7 +++++
 source/standalone/workflows/ray/tuner.py    |  2 +-
 source/standalone/workflows/ray/util.py     | 31 ++++++++++++---------
 source/standalone/workflows/rsl_rl/train.py |  2 ++
 source/standalone/workflows/sb3/train.py    |  6 +++-
 source/standalone/workflows/skrl/train.py   |  1 +
 6 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst
index d82b109959..b99c894666 100644
--- a/docs/source/features/ray.rst
+++ b/docs/source/features/ray.rst
@@ -127,6 +127,7 @@ In a different terminal, run the following.
     --cfg_file source/standalone/workflows/ray/hyperparameter_tuning/vision_cartpole_cfg.py \
     --cfg_class CartpoleTheiaJobCfg \
     --run_mode local \
+    --workflow source/standalone/workflows/rl_games/train.py
     --num_workers_per_node
 
@@ -159,6 +160,9 @@ Transferring files from the running container can be done as follows.
 
 For tuning jobs, specify the tuning job / hyperparameter sweep as child class of
 :class:`JobCfg` .
+The included :class:`JobCfg` only supports the ``rl_games`` workflow due to differences in
+environment entrypoints and hydra arguments, although other workflows will work if provided a compatible
+:class:`JobCfg`.
 
 .. dropdown:: source/standalone/workflows/ray/tuner.py (JobCfg definition)
    :icon: code
@@ -346,6 +350,9 @@ Dispatching Steps Shared Between KubeRay and Pure Ray Part II
    :emphasize-lines: 12-53
 
 3.) For tuning jobs, specify the tuning job / hyperparameter sweep as a :class:`JobCfg` .
+   The included :class:`JobCfg` only supports the ``rl_games`` workflow due to differences in
+   environment entrypoints and hydra arguments, although other workflows will work if provided a compatible
+   :class:`JobCfg`.
 
 .. dropdown:: source/standalone/workflows/ray/tuner.py (JobCfg definition)
    :icon: code
diff --git a/source/standalone/workflows/ray/tuner.py b/source/standalone/workflows/ray/tuner.py
index b15205c4d5..966f7ccb63 100644
--- a/source/standalone/workflows/ray/tuner.py
+++ b/source/standalone/workflows/ray/tuner.py
@@ -95,7 +95,7 @@ def step(self) -> dict:
             self.proc = experiment["proc"]
             self.experiment_name = experiment["experiment_name"]
             self.isaac_logdir = experiment["logdir"]
-            self.tensorboard_logdir = self.isaac_logdir + f"/{self.experiment_name}/summaries"
+            self.tensorboard_logdir = self.isaac_logdir + {self.experiment_name}
             self.done = False
 
         if self.proc is None:
diff --git a/source/standalone/workflows/ray/util.py b/source/standalone/workflows/ray/util.py
index a7c4bda5c2..1e9289be60 100644
--- a/source/standalone/workflows/ray/util.py
+++ b/source/standalone/workflows/ray/util.py
@@ -14,7 +14,8 @@
 
 
 def load_tensorboard_logs(directory: str) -> dict:
-    """From a tensorboard directory, get the latest scalar values.
+    """From a tensorboard directory, get the latest scalar values. If the logs can't be
+    found, check the summaries sublevel.
 
     Args:
         directory: The directory of the tensorboard logging.
 
     Returns:
         The latest available scalar values.
""" + # Initialize the event accumulator with a size guidance for only the latest entry - size_guidance = {"scalars": 1} # Load only the latest entry for scalars - event_acc = EventAccumulator(directory, size_guidance=size_guidance) - event_acc.Reload() # Load all data from the directory - - # Extract the latest scalars logged - latest_scalars = {} - for tag in event_acc.Tags()["scalars"]: - events = event_acc.Scalars(tag) - if events: # Check if there is at least one entry - latest_event = events[-1] # Get the latest entry - latest_scalars[tag] = latest_event.value - return latest_scalars + def get_latest_scalars(path: str) -> dict: + event_acc = EventAccumulator(path, size_guidance={"scalars": 1}) + try: + event_acc.Reload() + if event_acc.Tags()["scalars"]: + return { + tag: event_acc.Scalars(tag)[-1].value + for tag in event_acc.Tags()["scalars"] + if event_acc.Scalars(tag) + } + except (KeyError, OSError, RuntimeError): + return {} + + scalars = get_latest_scalars(directory) + return scalars or get_latest_scalars(os.path.join(directory, "summaries")) def get_invocation_command_from_cfg( diff --git a/source/standalone/workflows/rsl_rl/train.py b/source/standalone/workflows/rsl_rl/train.py index 293fc6867b..ca5e4567e5 100644 --- a/source/standalone/workflows/rsl_rl/train.py +++ b/source/standalone/workflows/rsl_rl/train.py @@ -93,6 +93,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen print(f"[INFO] Logging experiment in directory: {log_root_path}") # specify directory for logging runs: {time-stamp}_{run_name} log_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + # This way, the Ray Tune workflow can extract experiment name. + print(f"Exact experiment name requested from command line: {log_dir}") if agent_cfg.run_name: log_dir += f"_{agent_cfg.run_name}" log_dir = os.path.join(log_root_path, log_dir) diff --git a/source/standalone/workflows/sb3/train.py b/source/standalone/workflows/sb3/train.py index b5e01855b9..ea2bb61123 100644 --- a/source/standalone/workflows/sb3/train.py +++ b/source/standalone/workflows/sb3/train.py @@ -89,7 +89,11 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen env_cfg.sim.device = args_cli.device if args_cli.device is not None else env_cfg.sim.device # directory for logging into - log_dir = os.path.join("logs", "sb3", args_cli.task, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) + run_info = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + log_root_path = os.path.abspath(os.path.join("logs", "sb3", args_cli.task)) + print(f"[INFO] Logging experiment in directory: {log_root_path}") + print(f"Exact experiment name requested from command line: {run_info}") + log_dir = os.path.join(log_root_path, run_info) # dump the configuration into log-directory dump_yaml(os.path.join(log_dir, "params", "env.yaml"), env_cfg) dump_yaml(os.path.join(log_dir, "params", "agent.yaml"), agent_cfg) diff --git a/source/standalone/workflows/skrl/train.py b/source/standalone/workflows/skrl/train.py index 90b7a0ed52..045ed540c3 100644 --- a/source/standalone/workflows/skrl/train.py +++ b/source/standalone/workflows/skrl/train.py @@ -135,6 +135,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen print(f"[INFO] Logging experiment in directory: {log_root_path}") # specify directory for logging runs: {time-stamp}_{run_name} log_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + f"_{algorithm}_{args_cli.ml_framework}" + print(f"Exact experiment name requested from command line 
{log_dir}") if agent_cfg["agent"]["experiment"]["experiment_name"]: log_dir += f'_{agent_cfg["agent"]["experiment"]["experiment_name"]}' # set directory into agent config From 6112f6b77ba1dd15cb6092b494ca8de06d36df84 Mon Sep 17 00:00:00 2001 From: Gary Lvov Date: Fri, 24 Jan 2025 11:35:46 -0500 Subject: [PATCH 15/19] remove bolding --- docs/source/features/ray.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst index b99c894666..f94d5cacd1 100644 --- a/docs/source/features/ray.rst +++ b/docs/source/features/ray.rst @@ -350,9 +350,9 @@ Dispatching Steps Shared Between KubeRay and Pure Ray Part II :emphasize-lines: 12-53 3.) For tuning jobs, specify the tuning job / hyperparameter sweep as a :class:`JobCfg` . - The included :class:`JobCfg` only supports the ``rl_games`` workflow due to differences in - environment entrypoints and hydra arguments, although other workflows will work if provided a compatible - :class:`JobCfg`. +The included :class:`JobCfg` only supports the ``rl_games`` workflow due to differences in +environment entrypoints and hydra arguments, although other workflows will work if provided a compatible +:class:`JobCfg`. .. dropdown:: source/standalone/workflows/ray/tuner.py (JobCfg definition) :icon: code From 3eb9ac6bf0a3a2194ae4a18720a5ad5631f5600f Mon Sep 17 00:00:00 2001 From: Gary Lvov Date: Fri, 24 Jan 2025 11:41:57 -0500 Subject: [PATCH 16/19] fix bug --- source/standalone/workflows/ray/tuner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/standalone/workflows/ray/tuner.py b/source/standalone/workflows/ray/tuner.py index 966f7ccb63..1e4ca7ddc3 100644 --- a/source/standalone/workflows/ray/tuner.py +++ b/source/standalone/workflows/ray/tuner.py @@ -95,7 +95,7 @@ def step(self) -> dict: self.proc = experiment["proc"] self.experiment_name = experiment["experiment_name"] self.isaac_logdir = experiment["logdir"] - self.tensorboard_logdir = self.isaac_logdir + {self.experiment_name} + self.tensorboard_logdir = self.isaac_logdir + self.experiment_name self.done = False if self.proc is None: From 78d7bb7286875d2c6990282e1d0f2158aef233fa Mon Sep 17 00:00:00 2001 From: Gary Lvov Date: Fri, 24 Jan 2025 12:11:45 -0500 Subject: [PATCH 17/19] need slash --- source/standalone/workflows/ray/tuner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/standalone/workflows/ray/tuner.py b/source/standalone/workflows/ray/tuner.py index 1e4ca7ddc3..458394d6cd 100644 --- a/source/standalone/workflows/ray/tuner.py +++ b/source/standalone/workflows/ray/tuner.py @@ -95,7 +95,7 @@ def step(self) -> dict: self.proc = experiment["proc"] self.experiment_name = experiment["experiment_name"] self.isaac_logdir = experiment["logdir"] - self.tensorboard_logdir = self.isaac_logdir + self.experiment_name + self.tensorboard_logdir = self.isaac_logdir + "/" + self.experiment_name self.done = False if self.proc is None: From 9aba927e3876d2f5778c33c3972f708ed7352592 Mon Sep 17 00:00:00 2001 From: Gary Lvov Date: Fri, 24 Jan 2025 13:30:06 -0500 Subject: [PATCH 18/19] Trigger Build From 828b979ed18f9419ddfe67a52d92a80f98d2660e Mon Sep 17 00:00:00 2001 From: Gary Lvov Date: Fri, 24 Jan 2025 14:34:58 -0500 Subject: [PATCH 19/19] catch correct exception --- source/standalone/workflows/ray/util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/standalone/workflows/ray/util.py b/source/standalone/workflows/ray/util.py index 1e9289be60..844bb1c09a 100644 --- 
a/source/standalone/workflows/ray/util.py +++ b/source/standalone/workflows/ray/util.py @@ -10,6 +10,7 @@ from math import isclose import ray +from tensorboard.backend.event_processing.directory_watcher import DirectoryDeletedError from tensorboard.backend.event_processing.event_accumulator import EventAccumulator @@ -35,7 +36,7 @@ def get_latest_scalars(path: str) -> dict: for tag in event_acc.Tags()["scalars"] if event_acc.Scalars(tag) } - except (KeyError, OSError, RuntimeError): + except (KeyError, OSError, RuntimeError, DirectoryDeletedError): return {} scalars = get_latest_scalars(directory)
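The last few patches all revolve around how the tuner reads back the latest TensorBoard scalars for a run. The
sketch below is an editorial illustration of that access pattern as a standalone check, not part of the patch series:
the log directory is hypothetical, and the ``summaries`` fallback simply mirrors the behaviour described in the
patched docstring (rl_games appears to write its event files under a ``summaries`` subfolder).

.. code-block:: python

    import os

    from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


    def latest_scalars(logdir: str) -> dict:
        # Keep only the most recent scalar event per tag, as in util.py's helper.
        acc = EventAccumulator(logdir, size_guidance={"scalars": 1})
        acc.Reload()
        return {tag: acc.Scalars(tag)[-1].value for tag in acc.Tags()["scalars"]}


    # Hypothetical run directory; try the run folder first, then its "summaries" sublevel.
    logdir = "logs/rl_games/cartpole_direct/example_run"
    for path in (logdir, os.path.join(logdir, "summaries")):
        if os.path.isdir(path):
            print(latest_scalars(path))
            break

Checking a single run directory this way makes it easy to confirm, before a long sweep, that the metrics the tuner
reports on are actually being written where the extraction code expects them.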