Merge pull request #4 from r-pad/beneisner/add_ray
Beneisner/add ray
beneisner authored Oct 18, 2024
2 parents 50c21d8 + 201865b commit e6ed268
Showing 9 changed files with 331 additions and 6 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/run-tests.yaml
@@ -21,7 +21,7 @@ jobs:

defaults:
run:
- working-directory: /opt/baeisner/code
+ working-directory: /opt/rpad/code

steps:
- uses: actions/checkout@v4
@@ -32,7 +32,7 @@ jobs:
# Use the github workspace variable to get the correct directory.
# Can't use the checkout action to checkout to a different directory, so we have to symlink.
- name: Move code to correct directory
- run: rm -rf /opt/baeisner/code && ln -s $GITHUB_WORKSPACE /opt/baeisner/code
+ run: rm -rf /opt/rpad/code && ln -s $GITHUB_WORKSPACE /opt/rpad/code

- name: Code Quality
run: python -m black src/ tests/ --check
2 changes: 1 addition & 1 deletion Dockerfile
@@ -14,7 +14,7 @@ RUN apt-get update && \
rm -rf /var/lib/apt/lists/*

# Install pyenv
- ENV CODING_ROOT="/opt/baeisner"
+ ENV CODING_ROOT="/opt/rpad"

WORKDIR $CODING_ROOT
RUN git clone --depth=1 https://github.com/pyenv/pyenv.git .pyenv
3 changes: 2 additions & 1 deletion cluster/launch_seuss.slurm
@@ -46,7 +46,8 @@ SINGULARITYENV_WANDB_DOCKER_IMAGE=python-ml-project-template \
SINGULARITYENV_WANDB_API_KEY=$WANDB_API_KEY \
singularity exec \
--nv \
- --pwd /opt/$(whoami)/code \
+ -B $root_dir:/opt/rpad/code \
+ --pwd /opt/rpad/code \
-B /scratch/$(whoami)/data:/opt/data \
-B /scratch/$(whoami)/logs:/opt/logs \
${sif_name} \
39 changes: 39 additions & 0 deletions cluster/seuss.md
@@ -0,0 +1,39 @@
# Running different types of jobs on SEUSS.

## Prerequisites.

1. You need to build and push a SIF (Singularity image) to SEUSS. On a LOCAL machine (NOT SEUSS!), you can do this by running:

```bash
./cluster/build_push_sif_seuss.bash
```
We have to do this because SIFs can't be built directly on SEUSS. The script also builds one SIF per branch and rsyncs it over.
2. You need to generate `.egg-info` files for the package so that we can mount the codebase dynamically (i.e., without rebuilding the SIF):
```bash
module load anaconda2
conda create -n python_ml_project_template python=3.9
source activate python_ml_project_template
pip install -e ".[develop]"
```
Subsequent `singularity` commands should bind wherever this checkout is installed to `/opt/rpad/code` inside the container; a rough sketch is shown below.
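A minimal sanity-check sketch, assuming the checkout lives at `$HOME/code/python_ml_project_template` and the SIF was built from the `main` branch (the checkout path, SIF filename, and the `python -c` probe are assumptions; adjust them to match what `build_push_sif_seuss.bash` produced):
```bash
module load singularity

# Bind the editable checkout (with its .egg-info) to /opt/rpad/code inside the container.
singularity exec --nv \
    -B $HOME/code/python_ml_project_template:/opt/rpad/code \
    --pwd /opt/rpad/code \
    $HOME/singularity_images/python_ml_project_template_main-scratch.sif \
    python -c "import python_ml_project_template; print('ok')"
```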
## Training.
Run the following command to train a model on SEUSS:
```bash
./cluster/launch_seuss.bash
```
## Running a Ray distributed job.
Run the following command to run a Ray distributed job on SEUSS:
```bash
python cluster/slurm-launch.py --exp-name test --command "python scripts/simple_ray_job.py" --num-nodes 1 --partition GPU --num-gpus 1
```
This launches a single-node job on the GPU partition with 1 GPU and spins up a Ray cluster on that node (more nodes if you increase `--num-nodes`). It then runs `scripts/simple_ray_job.py` (or whatever command you pass via `--command`), parallelizing its tasks across the Ray cluster. A sketch of a multi-node launch and how to monitor it follows.
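A rough sketch; the experiment name and node/GPU counts are illustrative, and the log filename format (`<exp-name>_<MMDD-HHMM>.log` under `.slurm_logs/`) is taken from `cluster/slurm-launch.py` and `cluster/slurm-template.sh`:
```bash
# Launch across two nodes with one GPU each.
python cluster/slurm-launch.py --exp-name my_exp \
    --command "python scripts/simple_ray_job.py" \
    --num-nodes 2 --partition GPU --num-gpus 1

# Check the queue and tail the log written by the generated sbatch script.
squeue -u $(whoami)
tail -f .slurm_logs/my_exp_<MMDD-HHMM>.log
```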
110 changes: 110 additions & 0 deletions cluster/slurm-launch.py
@@ -0,0 +1,110 @@
# slurm-launch.py
# Usage:
# python slurm-launch.py --exp-name test \
# --command "rllib train --run PPO --env CartPole-v0"

import argparse
import subprocess
import sys
import time
from pathlib import Path

template_file = Path(__file__).parent / "slurm-template.sh"
JOB_NAME = "${JOB_NAME}"
NUM_NODES = "${NUM_NODES}"
NUM_GPUS_PER_NODE = "${NUM_GPUS_PER_NODE}"
PARTITION_OPTION = "${PARTITION_OPTION}"
COMMAND_PLACEHOLDER = "${COMMAND_PLACEHOLDER}"
GIVEN_NODE = "${GIVEN_NODE}"
LOAD_ENV = "${LOAD_ENV}"

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--exp-name",
        type=str,
        required=True,
        help="The job name and path to logging file (exp_name.log).",
    )
    parser.add_argument(
        "--num-nodes", "-n", type=int, default=1, help="Number of nodes to use."
    )
    parser.add_argument(
        "--node",
        "-w",
        type=str,
        help="The specified nodes to use. Same format as the "
        "return of 'sinfo'. Default: ''.",
    )
    parser.add_argument(
        "--num-gpus",
        type=int,
        default=0,
        help="Number of GPUs to use in each node. (Default: 0)",
    )
    parser.add_argument(
        "--partition",
        "-p",
        type=str,
    )
    parser.add_argument(
        "--load-env",
        type=str,
        help="The script to load your environment ('module load cuda/10.1')",
        default="",
    )
    parser.add_argument(
        "--command",
        type=str,
        required=True,
        help="The command you wish to execute. For example: "
        " --command 'python test.py'. "
        "Note that the command must be a string.",
    )
    args = parser.parse_args()

    if args.node:
        # assert args.num_nodes == 1
        node_info = "#SBATCH -w {}".format(args.node)
    else:
        node_info = ""

    job_name = "{}_{}".format(
        args.exp_name, time.strftime("%m%d-%H%M", time.localtime())
    )

    partition_option = (
        "#SBATCH --partition={}".format(args.partition) if args.partition else ""
    )

    # ===== Modify the template script =====
    with open(template_file, "r") as f:
        text = f.read()
    text = text.replace(JOB_NAME, job_name)
    text = text.replace(NUM_NODES, str(args.num_nodes))
    text = text.replace(NUM_GPUS_PER_NODE, str(args.num_gpus))
    text = text.replace(PARTITION_OPTION, partition_option)
    text = text.replace(COMMAND_PLACEHOLDER, str(args.command))
    text = text.replace(LOAD_ENV, str(args.load_env))
    text = text.replace(GIVEN_NODE, node_info)
    text = text.replace(
        "# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO " "PRODUCTION!",
        "# THIS FILE IS MODIFIED AUTOMATICALLY FROM TEMPLATE AND SHOULD BE "
        "RUNNABLE!",
    )

    # ===== Save the script =====
    # Assumes the ./.slurm_logs/ directory already exists in the working directory.
    script_file = "./.slurm_logs/{}.sh".format(job_name)
    with open(script_file, "w") as f:
        f.write(text)

    # ===== Submit the job =====
    print("Starting to submit job!")
    subprocess.Popen(["sbatch", script_file])
    print(
        "Job submitted! Script file is at: <{}>. Log file is at: <{}>".format(
            script_file, "./.slurm_logs/{}.log".format(job_name)
        )
    )
    sys.exit(0)
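The launcher also accepts `--node` to pin the job to specific hosts and `--load-env` to run a setup command near the top of the generated sbatch script, before Ray starts. A hypothetical invocation (the node name and module are illustrative placeholders, not values required by this repo):

```bash
# Hypothetical example: pin to one node and load a CUDA module first.
python cluster/slurm-launch.py --exp-name pinned_test \
    --command "python scripts/simple_ray_job.py" \
    --num-nodes 1 --num-gpus 1 --partition GPU \
    --node compute-0-19 --load-env "module load cuda/10.1"
```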

142 changes: 142 additions & 0 deletions cluster/slurm-template.sh
@@ -0,0 +1,142 @@
#!/bin/bash
# shellcheck disable=SC2206
# THIS FILE IS GENERATED BY AUTOMATION SCRIPT! PLEASE REFER TO ORIGINAL SCRIPT!
# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO PRODUCTION!
${PARTITION_OPTION}
#SBATCH --job-name=${JOB_NAME}
#SBATCH --output=./.slurm_logs/${JOB_NAME}.log
${GIVEN_NODE}
### This script works for any number of nodes; Ray will find and manage all resources.
#SBATCH --nodes=${NUM_NODES}
#SBATCH --exclusive
### Give all resources to a single Ray task; Ray can manage the resources internally.
#SBATCH --ntasks-per-node=1
### Currently we're overriding the number of GPUs per node in the exec below. This is a hack.
### Normally, we would set this to the number of GPUs per node, esp. if we have a cluster
### with homogeneous nodes.
# #SBATCH --gres=gpu:${NUM_GPUS_PER_NODE}
#SBATCH --gres=gpu:1
### Exclude the following nodes
# # SBATCH --exclude=compute-0-19,compute-0-21,compute-0-23,compute-0-25,compute-0-27

# Load modules or your own conda environment here
# module load pytorch/v1.4.0-gpu
# conda activate ${CONDA_ENV}
${LOAD_ENV}

module load singularity

#############################################################
# Modifications to the original script start here.
#############################################################

dockerhub_username=beisner
project_name=python_ml_project_template
scs_username=baeisner

# Root directory of the code checkout on SEUSS.
root_dir=$HOME/code/${project_name}

# Compute a good tag for the image, which will be <dockerhub_username>/<project_name>:<branch-name>-scratch.
sanitized_branch_name=$(${root_dir}/cluster/sanitize_branch_name.bash)

# Get the SIF name
sif_name=$HOME/singularity_images/${project_name}_${sanitized_branch_name}-scratch.sif

if [ ! -f ${sif_name} ]; then
echo "SIF file not found: ${sif_name}"
echo "You need to run the ./cluster/build_push_sif_seuss.bash script first."
fi

echo "Using SIF name: ${sif_name}"

# ===== DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING =====
# This script is a modification of the implementation suggested by gregSchwartz18 here:
# https://github.com/ray-project/ray/issues/826#issuecomment-522116599
redis_password=$(uuidgen)
export redis_password

nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") # Getting the node names
nodes_array=($nodes)

node_1=${nodes_array[0]}
ip=$(srun --nodes=1 --ntasks=1 -w "$node_1" hostname --ip-address) # making redis-address

# if we detect a space character in the head node IP, we'll
# convert it to an ipv4 address. This step is optional.
if [[ "$ip" == *" "* ]]; then
IFS=' ' read -ra ADDR <<<"$ip"
if [[ ${#ADDR[0]} -gt 16 ]]; then
ip=${ADDR[1]}
else
ip=${ADDR[0]}
fi
echo "IPV6 address detected. We split the IPV4 address as $ip"
fi

port=6379
ip_head=$ip:$port
export ip_head
echo "IP Head: $ip_head"

export RAY_num_server_call_thread=2

echo "STARTING HEAD at $node_1"

num_gpus=$(scontrol show node $node_1 | grep CfgTRES | awk -F'gpu=' '{print $2}' | awk '{print $1}')
cuda_devices=$(seq -s, 0 $(($num_gpus - 1)))
echo "CUDA_VISIBLE_DEVICES: $cuda_devices"

# Original.
# srun --nodes=1 --ntasks=1 -w "$node_1" \
# bash -c "CUDA_VISIBLE_DEVICES=$cuda_devices ray start --head --node-ip-address="$ip" --port=$port --redis-password="$redis_password" --block --num-gpus=$num_gpus" &

# With singularity. Quite hacky that we have to use bash here.
srun --nodes=1 --ntasks=1 -w "$node_1" \
bash -c "CUDA_VISIBLE_DEVICES=$cuda_devices singularity exec \
-B $root_dir:/opt/rpad/code \
--pwd /opt/rpad/code \
--nv ${sif_name} \
ray start --head --node-ip-address="$ip" --port=$port --redis-password="$redis_password" --block --num-gpus=$num_gpus" &

sleep 10

worker_num=$((SLURM_JOB_NUM_NODES - 1)) #number of nodes other than the head node
for ((i = 1; i <= worker_num; i++)); do
node_i=${nodes_array[$i]}
echo "STARTING WORKER $i at $node_i"

# Get the number of GPUs on the node.
num_gpus=$(scontrol show node $node_i | grep CfgTRES | awk -F'gpu=' '{print $2}' | awk '{print $1}')
echo "NUM_GPUS: $num_gpus"

# Set CUDA_VISIBLE_DEVICES for each worker
cuda_devices=$(seq -s, 0 $(($num_gpus - 1)))
echo "CUDA_VISIBLE_DEVICES: $cuda_devices"
# srun --nodes=1 --ntasks=1 -w "$node_i" ray start --address "$ip_head" --redis-password="$redis_password" --block &

srun --nodes=1 --ntasks=1 -w "$node_i" \
bash -c "CUDA_VISIBLE_DEVICES=$cuda_devices singularity exec \
-B $root_dir:/opt/rpad/code \
--pwd /opt/rpad/code \
--nv ${sif_name} \
ray start --address "$ip_head" --redis-password="$redis_password" --block --num-gpus=$num_gpus" &
sleep 5
done

echo "RUNNING YOUR CODE NOW: ${COMMAND_PLACEHOLDER}"

# Print a command to run an SSH tunnel to access the Ray Dashboard.
# It should look like this, with your username and the head-node IP filled in:
#   ssh -J <username>@seuss.ri.cmu.edu -L 8265:localhost:8265 <username>@<head-node-ip>
echo "To access the Ray Dashboard, run the following command on your local machine:"
echo "ssh -J $(whoami)@seuss.ri.cmu.edu -L 8265:localhost:8265 $(whoami)@${ip}"
echo "Then open a browser and go to http://localhost:8265"

# ===== Call your code below =====
singularity exec \
-B $root_dir:/opt/rpad/code \
--pwd /opt/rpad/code \
--nv \
${sif_name} \
${COMMAND_PLACEHOLDER}
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -11,9 +11,10 @@ dependencies = [
"lightning == 2.0.3",
"omegaconf == 2.3.0",
"pandas",
- "torch == 2.0.1", # CUDA 11.8
+ "ray[default] == 2.37.0",
+ "torch == 2.0.1", # CUDA 11.8
"torchmetrics",
- "torchvision == 0.15.2", # CUDA 11.8
+ "torchvision == 0.15.2", # CUDA 11.8
"wandb == 0.15.4",

32 changes: 32 additions & 0 deletions scripts/simple_ray_job.py
@@ -0,0 +1,32 @@
import time

import ray
import torch

from python_ml_project_template.models.classifier import ClassifierInferenceModule

# Simple Ray job that parallelizes 30 tasks, each consuming 1 GPU and placing a tensor on it.


@ray.remote(num_gpus=1)
def f():
    x = torch.rand(1024, 1024).cuda()

    print("STARTING THE JOB!")
    model = ClassifierInferenceModule(None)

    # Sleep for a bit to simulate a job that takes a while to finish.
    time.sleep(5)

    return x.sum()


if __name__ == "__main__":
    ray.init()
    # Start 30 tasks in parallel.
    print("Starting 30 tasks on the head unit...")
    result_ids = [f.remote() for _ in range(30)]
    results = ray.get(result_ids)
    print("Results are", results)
    print("The sum of results is", sum(results))
    ray.shutdown()
