Merge pull request #4 from r-pad/beneisner/add_ray
Beneisner/add ray
beneisner authored Oct 18, 2024
2 parents 50c21d8 + 201865b commit e6ed268
Showing 9 changed files with 331 additions and 6 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/run-tests.yaml
@@ -21,7 +21,7 @@ jobs:

defaults:
run:
- working-directory: /opt/baeisner/code
+ working-directory: /opt/rpad/code

steps:
- uses: actions/checkout@v4
@@ -32,7 +32,7 @@ jobs:
# Use the github workspace variable to get the correct directory.
# Can't use the checkout action to checkout to a different directory, so we have to symlink.
- name: Move code to correct directory
- run: rm -rf /opt/baeisner/code && ln -s $GITHUB_WORKSPACE /opt/baeisner/code
+ run: rm -rf /opt/rpad/code && ln -s $GITHUB_WORKSPACE /opt/rpad/code

- name: Code Quality
run: python -m black src/ tests/ --check
2 changes: 1 addition & 1 deletion Dockerfile
@@ -14,7 +14,7 @@ RUN apt-get update && \
rm -rf /var/lib/apt/lists/*

# Install pyenv
- ENV CODING_ROOT="/opt/baeisner"
+ ENV CODING_ROOT="/opt/rpad"

WORKDIR $CODING_ROOT
RUN git clone --depth=1 https://github.com/pyenv/pyenv.git .pyenv
3 changes: 2 additions & 1 deletion cluster/launch_seuss.slurm
@@ -46,7 +46,8 @@ SINGULARITYENV_WANDB_DOCKER_IMAGE=python-ml-project-template \
SINGULARITYENV_WANDB_API_KEY=$WANDB_API_KEY \
singularity exec \
--nv \
- --pwd /opt/$(whoami)/code \
+ -B $root_dir:/opt/rpad/code \
+ --pwd /opt/rpad/code \
-B /scratch/$(whoami)/data:/opt/data \
-B /scratch/$(whoami)/logs:/opt/logs \
${sif_name} \
39 changes: 39 additions & 0 deletions cluster/seuss.md
@@ -0,0 +1,39 @@
# Running different types of jobs on SEUSS.

## Prerequisites.

1. You need to build and push a SIF (Singularity image) to SEUSS. On a LOCAL machine (NOT SEUSS!), you can do this by running:

```bash
./cluster/build_push_sif_seuss.bash
```
We have to do this because SIFs can't be built directly on SEUSS. The script also builds one SIF per branch and rsyncs it over.
2. You need to generate `.egg-info` files for the package so that we can mount the codebase dynamically (i.e., without rebuilding the SIF):
```bash
module load anaconda2
conda create -n python_ml_project_template python=3.9
source activate python_ml_project_template
pip install -e ".[develop]"
```
Subsequent `singularity` commands should bind wherever this checkout is installed to `/opt/rpad/code` inside the container; a rough sketch is shown below.
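A minimal sanity-check sketch, assuming the checkout lives at `$HOME/code/python_ml_project_template` and the SIF was built from the `main` branch (the checkout path, SIF filename, and the `python -c` probe are assumptions; adjust them to match what `build_push_sif_seuss.bash` produced):
```bash
module load singularity

# Bind the editable checkout (with its .egg-info) to /opt/rpad/code inside the container.
singularity exec --nv \
    -B $HOME/code/python_ml_project_template:/opt/rpad/code \
    --pwd /opt/rpad/code \
    $HOME/singularity_images/python_ml_project_template_main-scratch.sif \
    python -c "import python_ml_project_template; print('ok')"
```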
## Training.
Run the following command to train a model on SEUSS:
```bash
./cluster/launch_seuss.bash
```
## Running a Ray distributed job.
Run the following command to run a Ray distributed job on SEUSS:
```bash
python cluster/slurm-launch.py --exp-name test --command "python scripts/simple_ray_job.py" --num-nodes 1 --partition GPU --num-gpus 1
```
This launches a single-node job on the GPU partition with 1 GPU and spins up a Ray cluster on that node (more nodes if you increase `--num-nodes`). It then runs `scripts/simple_ray_job.py` (or whatever command you pass via `--command`), parallelizing its tasks across the Ray cluster. A sketch of a multi-node launch and how to monitor it follows.
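A rough sketch; the experiment name and node/GPU counts are illustrative, and the log filename format (`<exp-name>_<MMDD-HHMM>.log` under `.slurm_logs/`) is taken from `cluster/slurm-launch.py` and `cluster/slurm-template.sh`:
```bash
# Launch across two nodes with one GPU each.
python cluster/slurm-launch.py --exp-name my_exp \
    --command "python scripts/simple_ray_job.py" \
    --num-nodes 2 --partition GPU --num-gpus 1

# Check the queue and tail the log written by the generated sbatch script.
squeue -u $(whoami)
tail -f .slurm_logs/my_exp_<MMDD-HHMM>.log
```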
110 changes: 110 additions & 0 deletions cluster/slurm-launch.py
@@ -0,0 +1,110 @@
# slurm-launch.py
# Usage:
# python slurm-launch.py --exp-name test \
# --command "rllib train --run PPO --env CartPole-v0"

import argparse
import subprocess
import sys
import time
from pathlib import Path

template_file = Path(__file__).parent / "slurm-template.sh"
JOB_NAME = "${JOB_NAME}"
NUM_NODES = "${NUM_NODES}"
NUM_GPUS_PER_NODE = "${NUM_GPUS_PER_NODE}"
PARTITION_OPTION = "${PARTITION_OPTION}"
COMMAND_PLACEHOLDER = "${COMMAND_PLACEHOLDER}"
GIVEN_NODE = "${GIVEN_NODE}"
LOAD_ENV = "${LOAD_ENV}"

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--exp-name",
        type=str,
        required=True,
        help="The job name and path to logging file (exp_name.log).",
    )
    parser.add_argument(
        "--num-nodes", "-n", type=int, default=1, help="Number of nodes to use."
    )
    parser.add_argument(
        "--node",
        "-w",
        type=str,
        help="The specified nodes to use. Same format as the "
        "return of 'sinfo'. Default: ''.",
    )
    parser.add_argument(
        "--num-gpus",
        type=int,
        default=0,
        help="Number of GPUs to use in each node. (Default: 0)",
    )
    parser.add_argument(
        "--partition",
        "-p",
        type=str,
    )
    parser.add_argument(
        "--load-env",
        type=str,
        help="The script to load your environment ('module load cuda/10.1')",
        default="",
    )
    parser.add_argument(
        "--command",
        type=str,
        required=True,
        help="The command you wish to execute. For example: "
        " --command 'python test.py'. "
        "Note that the command must be a string.",
    )
    args = parser.parse_args()

    if args.node:
        # assert args.num_nodes == 1
        node_info = "#SBATCH -w {}".format(args.node)
    else:
        node_info = ""

    job_name = "{}_{}".format(
        args.exp_name, time.strftime("%m%d-%H%M", time.localtime())
    )

    partition_option = (
        "#SBATCH --partition={}".format(args.partition) if args.partition else ""
    )

    # ===== Modify the template script =====
    with open(template_file, "r") as f:
        text = f.read()
    text = text.replace(JOB_NAME, job_name)
    text = text.replace(NUM_NODES, str(args.num_nodes))
    text = text.replace(NUM_GPUS_PER_NODE, str(args.num_gpus))
    text = text.replace(PARTITION_OPTION, partition_option)
    text = text.replace(COMMAND_PLACEHOLDER, str(args.command))
    text = text.replace(LOAD_ENV, str(args.load_env))
    text = text.replace(GIVEN_NODE, node_info)
    text = text.replace(
        "# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO " "PRODUCTION!",
        "# THIS FILE IS MODIFIED AUTOMATICALLY FROM TEMPLATE AND SHOULD BE "
        "RUNNABLE!",
    )

    # ===== Save the script =====
    # Assumes the ./.slurm_logs/ directory already exists in the working directory.
    script_file = "./.slurm_logs/{}.sh".format(job_name)
    with open(script_file, "w") as f:
        f.write(text)

    # ===== Submit the job =====
    print("Starting to submit job!")
    subprocess.Popen(["sbatch", script_file])
    print(
        "Job submitted! Script file is at: <{}>. Log file is at: <{}>".format(
            script_file, "./.slurm_logs/{}.log".format(job_name)
        )
    )
    sys.exit(0)
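The launcher also accepts `--node` to pin the job to specific hosts and `--load-env` to run a setup command near the top of the generated sbatch script, before Ray starts. A hypothetical invocation (the node name and module are illustrative placeholders, not values required by this repo):

```bash
# Hypothetical example: pin to one node and load a CUDA module first.
python cluster/slurm-launch.py --exp-name pinned_test \
    --command "python scripts/simple_ray_job.py" \
    --num-nodes 1 --num-gpus 1 --partition GPU \
    --node compute-0-19 --load-env "module load cuda/10.1"
```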

142 changes: 142 additions & 0 deletions cluster/slurm-template.sh
@@ -0,0 +1,142 @@
#!/bin/bash
# shellcheck disable=SC2206
# THIS FILE IS GENERATED BY AUTOMATION SCRIPT! PLEASE REFER TO ORIGINAL SCRIPT!
# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO PRODUCTION!
${PARTITION_OPTION}
#SBATCH --job-name=${JOB_NAME}
#SBATCH --output=./.slurm_logs/${JOB_NAME}.log
${GIVEN_NODE}
### This script works for any number of nodes; Ray will find and manage all resources.
#SBATCH --nodes=${NUM_NODES}
#SBATCH --exclusive
### Give all resources to a single Ray task; Ray can manage the resources internally.
#SBATCH --ntasks-per-node=1
### Currently we're overriding the number of GPUs per node in the exec below. This is a hack.
### Normally, we would set this to the number of GPUs per node, esp. if we have a cluster
### with homogeneous nodes.
# #SBATCH --gres=gpu:${NUM_GPUS_PER_NODE}
#SBATCH --gres=gpu:1
### Exclude the following nodes
# # SBATCH --exclude=compute-0-19,compute-0-21,compute-0-23,compute-0-25,compute-0-27

# Load modules or your own conda environment here
# module load pytorch/v1.4.0-gpu
# conda activate ${CONDA_ENV}
${LOAD_ENV}

module load singularity

#############################################################
# Modifications to the original script start here.
#############################################################

dockerhub_username=beisner
project_name=python_ml_project_template
scs_username=baeisner

# Root directory of the code checkout on SEUSS.
root_dir=$HOME/code/${project_name}

# Compute a good tag for the image, which will be <dockerhub_username>/<project_name>:<branch-name>-scratch.
sanitized_branch_name=$(${root_dir}/cluster/sanitize_branch_name.bash)

# Get the SIF name
sif_name=$HOME/singularity_images/${project_name}_${sanitized_branch_name}-scratch.sif

if [ ! -f ${sif_name} ]; then
echo "SIF file not found: ${sif_name}"
echo "You need to run the ./cluster/build_push_sif_seuss.bash script first."
fi

echo "Using SIF name: ${sif_name}"

# ===== DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING =====
# This script is a modification of the implementation suggested by gregSchwartz18 here:
# https://github.com/ray-project/ray/issues/826#issuecomment-522116599
redis_password=$(uuidgen)
export redis_password

nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") # Getting the node names
nodes_array=($nodes)

node_1=${nodes_array[0]}
ip=$(srun --nodes=1 --ntasks=1 -w "$node_1" hostname --ip-address) # making redis-address

# if we detect a space character in the head node IP, we'll
# convert it to an ipv4 address. This step is optional.
if [[ "$ip" == *" "* ]]; then
IFS=' ' read -ra ADDR <<<"$ip"
if [[ ${#ADDR[0]} -gt 16 ]]; then
ip=${ADDR[1]}
else
ip=${ADDR[0]}
fi
echo "IPV6 address detected. We split the IPV4 address as $ip"
fi

port=6379
ip_head=$ip:$port
export ip_head
echo "IP Head: $ip_head"

export RAY_num_server_call_thread=2

echo "STARTING HEAD at $node_1"

num_gpus=$(scontrol show node $node_1 | grep CfgTRES | awk -F'gpu=' '{print $2}' | awk '{print $1}')
cuda_devices=$(seq -s, 0 $(($num_gpus - 1)))
echo "CUDA_VISIBLE_DEVICES: $cuda_devices"

# Original.
# srun --nodes=1 --ntasks=1 -w "$node_1" \
# bash -c "CUDA_VISIBLE_DEVICES=$cuda_devices ray start --head --node-ip-address="$ip" --port=$port --redis-password="$redis_password" --block --num-gpus=$num_gpus" &

# With singularity. Quite hacky that we have to use bash here.
srun --nodes=1 --ntasks=1 -w "$node_1" \
bash -c "CUDA_VISIBLE_DEVICES=$cuda_devices singularity exec \
-B $root_dir:/opt/rpad/code \
--pwd /opt/rpad/code \
--nv ${sif_name} \
ray start --head --node-ip-address="$ip" --port=$port --redis-password="$redis_password" --block --num-gpus=$num_gpus" &

sleep 10

worker_num=$((SLURM_JOB_NUM_NODES - 1)) #number of nodes other than the head node
for ((i = 1; i <= worker_num; i++)); do
node_i=${nodes_array[$i]}
echo "STARTING WORKER $i at $node_i"

# Get the number of GPUs on the node.
num_gpus=$(scontrol show node $node_i | grep CfgTRES | awk -F'gpu=' '{print $2}' | awk '{print $1}')
echo "NUM_GPUS: $num_gpus"

# Set CUDA_VISIBLE_DEVICES for each worker
cuda_devices=$(seq -s, 0 $(($num_gpus - 1)))
echo "CUDA_VISIBLE_DEVICES: $cuda_devices"
# srun --nodes=1 --ntasks=1 -w "$node_i" ray start --address "$ip_head" --redis-password="$redis_password" --block &

srun --nodes=1 --ntasks=1 -w "$node_i" \
bash -c "CUDA_VISIBLE_DEVICES=$cuda_devices singularity exec \
-B $root_dir:/opt/rpad/code \
--pwd /opt/rpad/code \
--nv ${sif_name} \
ray start --address "$ip_head" --redis-password="$redis_password" --block --num-gpus=$num_gpus" &
sleep 5
done

echo "RUNNING YOUR CODE NOW: ${COMMAND_PLACEHOLDER}"

# Print a command to run an SSH tunnel to access the Ray Dashboard.
# It should look like this, with your username and the head-node IP filled in:
#   ssh -J <username>@seuss.ri.cmu.edu -L 8265:localhost:8265 <username>@<head-node-ip>
echo "To access the Ray Dashboard, run the following command on your local machine:"
echo "ssh -J $(whoami)@seuss.ri.cmu.edu -L 8265:localhost:8265 $(whoami)@${ip}"
echo "Then open a browser and go to http://localhost:8265"

# ===== Call your code below =====
singularity exec \
-B $root_dir:/opt/rpad/code \
--pwd /opt/rpad/code \
--nv \
${sif_name} \
${COMMAND_PLACEHOLDER}
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -11,9 +11,10 @@ dependencies = [
"lightning == 2.0.3",
"omegaconf == 2.3.0",
"pandas",
- "torch == 2.0.1", # CUDA 11.8
+ "ray[default] == 2.37.0",
+ "torch == 2.0.1", # CUDA 11.8
"torchmetrics",
- "torchvision == 0.15.2", # CUDA 11.8
+ "torchvision == 0.15.2", # CUDA 11.8
"wandb == 0.15.4",

32 changes: 32 additions & 0 deletions scripts/simple_ray_job.py
@@ -0,0 +1,32 @@
import time

import ray
import torch

from python_ml_project_template.models.classifier import ClassifierInferenceModule

# Simple Ray job that parallelizes 30 tasks, each consuming 1 GPU and placing a tensor on it.


@ray.remote(num_gpus=1)
def f():
    x = torch.rand(1024, 1024).cuda()

    print("STARTING THE JOB!")
    model = ClassifierInferenceModule(None)

    # Sleep for a bit to simulate a job that takes a while to finish.
    time.sleep(5)

    return x.sum()


if __name__ == "__main__":
    ray.init()
    # Start 30 tasks in parallel.
    print("Starting 30 tasks on the head unit...")
    result_ids = [f.remote() for _ in range(30)]
    results = ray.get(result_ids)
    print("Results are", results)
    print("The sum of results is", sum(results))
    ray.shutdown()
