Merge pull request #4 from r-pad/beneisner/add_ray
Showing 9 changed files with 331 additions and 6 deletions.
@@ -0,0 +1,39 @@
# Running different types of jobs on SEUSS.

## Prerequisites.

1. You need to build/push a SIF to SEUSS. On a LOCAL machine (NOT SEUSS!), you can do this by running:

   ```bash
   ./cluster/build_push_sif_seuss.bash
   ```

   We have to do this because SIFs can't be built on SEUSS directly. The script builds one SIF per branch and rsyncs it to SEUSS.
2. You need to generate .egg-info files for the package, so that we can mount the codebase dynamically (i.e. without rebuilding the SIF):

   ```bash
   module load anaconda2
   conda create -n python_ml_project_template python=3.9
   source activate python_ml_project_template
   pip install -e ".[develop]"
   ```

   Subsequent singularity commands should bind wherever this is installed to `/opt/rpad/code` inside the container, as in the sketch below.
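A minimal sketch of that bind convention (the paths and the import check are illustrative; substitute your clone location and the per-branch SIF that `build_push_sif_seuss.bash` produced):

```bash
singularity exec \
    -B $HOME/code/python_ml_project_template:/opt/rpad/code \
    --pwd /opt/rpad/code \
    --nv $HOME/singularity_images/python_ml_project_template_<branch>-scratch.sif \
    python -c "import python_ml_project_template"
```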
## Training.

Run the following command to train a model on SEUSS:

```bash
./cluster/launch_seuss.bash
```
## Running a ray distributed job.

Run the following command to run a ray distributed job on SEUSS:

```bash
python cluster/slurm-launch.py --exp-name test --command "python scripts/simple_ray_job.py" --num-nodes 1 --partition GPU --num-gpus 1
```

This will launch a single-node job on the GPU partition with 1 GPU (more nodes if you specify `--num-nodes`), spin up a ray cluster on the allocated nodes, and then run `scripts/simple_ray_job.py` (or whatever command you pass), which parallelizes its work across the ray cluster.
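Once submitted, the job can be watched with standard Slurm tooling; a quick sketch (the log name is `<exp-name>_<MMDD-HHMM>.log` and the exact path is printed by `slurm-launch.py` at submission time; the timestamp below is illustrative):

```bash
squeue -u $(whoami)                       # confirm the job is queued/running
tail -f ./.slurm_logs/test_0101-1200.log  # follow the job's output
```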
@@ -0,0 +1,110 @@
# slurm-launch.py
# Usage:
# python slurm-launch.py --exp-name test \
#     --command "rllib train --run PPO --env CartPole-v0"

import argparse
import subprocess
import sys
import time
from pathlib import Path

template_file = Path(__file__).parent / "slurm-template.sh"
JOB_NAME = "${JOB_NAME}"
NUM_NODES = "${NUM_NODES}"
NUM_GPUS_PER_NODE = "${NUM_GPUS_PER_NODE}"
PARTITION_OPTION = "${PARTITION_OPTION}"
COMMAND_PLACEHOLDER = "${COMMAND_PLACEHOLDER}"
GIVEN_NODE = "${GIVEN_NODE}"
LOAD_ENV = "${LOAD_ENV}"
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--exp-name",
        type=str,
        required=True,
        help="The job name and path to logging file (exp_name.log).",
    )
    parser.add_argument(
        "--num-nodes", "-n", type=int, default=1, help="Number of nodes to use."
    )
    parser.add_argument(
        "--node",
        "-w",
        type=str,
        help="The specified nodes to use. Same format as the "
        "return of 'sinfo'. Default: ''.",
    )
    parser.add_argument(
        "--num-gpus",
        type=int,
        default=0,
        help="Number of GPUs to use in each node. (Default: 0)",
    )
    parser.add_argument(
        "--partition",
        "-p",
        type=str,
    )
    parser.add_argument(
        "--load-env",
        type=str,
        help="The script to load your environment ('module load cuda/10.1')",
        default="",
    )
    parser.add_argument(
        "--command",
        type=str,
        required=True,
        help="The command you wish to execute. For example: "
        " --command 'python test.py'. "
        "Note that the command must be a string.",
    )
    args = parser.parse_args()
    if args.node:
        # assert args.num_nodes == 1
        node_info = "#SBATCH -w {}".format(args.node)
    else:
        node_info = ""

    job_name = "{}_{}".format(
        args.exp_name, time.strftime("%m%d-%H%M", time.localtime())
    )

    partition_option = (
        "#SBATCH --partition={}".format(args.partition) if args.partition else ""
    )

    # ===== Modify the template script =====
    with open(template_file, "r") as f:
        text = f.read()
    text = text.replace(JOB_NAME, job_name)
    text = text.replace(NUM_NODES, str(args.num_nodes))
    text = text.replace(NUM_GPUS_PER_NODE, str(args.num_gpus))
    text = text.replace(PARTITION_OPTION, partition_option)
    text = text.replace(COMMAND_PLACEHOLDER, str(args.command))
    text = text.replace(LOAD_ENV, str(args.load_env))
    text = text.replace(GIVEN_NODE, node_info)
    text = text.replace(
        "# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO " "PRODUCTION!",
        "# THIS FILE IS MODIFIED AUTOMATICALLY FROM TEMPLATE AND SHOULD BE "
        "RUNNABLE!",
    )
    # ===== Save the script =====
    # Make sure the log directory exists before writing the generated script.
    Path("./.slurm_logs").mkdir(parents=True, exist_ok=True)
    script_file = "./.slurm_logs/{}.sh".format(job_name)
    with open(script_file, "w") as f:
        f.write(text)

    # ===== Submit the job =====
    print("Starting to submit job!")
    subprocess.Popen(["sbatch", script_file])
    print(
        "Job submitted! Script file is at: <{}>. Log file is at: <{}>".format(
            script_file, "./.slurm_logs/{}.log".format(job_name)
        )
    )
    sys.exit(0)
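For reference, the invocation from the docs above renders the template, writes the result under `./.slurm_logs/`, and hands it to `sbatch`:

```bash
python cluster/slurm-launch.py --exp-name test \
    --command "python scripts/simple_ray_job.py" \
    --num-nodes 1 --partition GPU --num-gpus 1
# -> writes ./.slurm_logs/test_<MMDD-HHMM>.sh and submits it
```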
@@ -0,0 +1,142 @@
#!/bin/bash
# shellcheck disable=SC2206
# THIS FILE IS GENERATED BY AUTOMATION SCRIPT! PLEASE REFER TO ORIGINAL SCRIPT!
# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO PRODUCTION!
${PARTITION_OPTION}
#SBATCH --job-name=${JOB_NAME}
#SBATCH --output=./.slurm_logs/${JOB_NAME}.log
${GIVEN_NODE}
### This script works for any number of nodes; Ray will find and manage all resources.
#SBATCH --nodes=${NUM_NODES}
#SBATCH --exclusive
### Give all resources to a single Ray task; Ray can manage the resources internally.
#SBATCH --ntasks-per-node=1
### Currently we're overriding the number of GPUs per node in the exec below. This is a hack.
### Normally, we would set this to the number of GPUs per node, esp. if we have a cluster
### with homogeneous nodes.
# #SBATCH --gres=gpu:${NUM_GPUS_PER_NODE}
#SBATCH --gres=gpu:1
### Exclude the following nodes:
# #SBATCH --exclude=compute-0-19,compute-0-21,compute-0-23,compute-0-25,compute-0-27

# Load modules or your own conda environment here
# module load pytorch/v1.4.0-gpu
# conda activate ${CONDA_ENV}
${LOAD_ENV}

module load singularity

#############################################################
# Modifications to the original script start here.
#############################################################

dockerhub_username=beisner
project_name=python_ml_project_template
scs_username=baeisner

# Get the branch name.
root_dir=$HOME/code/${project_name}

# Compute a good tag for the image, which will be <dockerhub_username>/<project_name>:<branch-name>-scratch.
sanitized_branch_name=$(${root_dir}/cluster/sanitize_branch_name.bash)

# Get the SIF name.
sif_name=$HOME/singularity_images/${project_name}_${sanitized_branch_name}-scratch.sif

if [ ! -f ${sif_name} ]; then
    echo "SIF file not found: ${sif_name}"
    echo "You need to run the ./cluster/build_push_sif_seuss.bash script first."
    # Fail fast so the job doesn't continue without an image.
    exit 1
fi

echo "Using SIF name: ${sif_name}"

# ===== DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING =====
# This script is a modification of the implementation suggested by gregSchwartz18 here:
# https://github.com/ray-project/ray/issues/826#issuecomment-522116599
redis_password=$(uuidgen)
export redis_password

nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") # Getting the node names
nodes_array=($nodes)

node_1=${nodes_array[0]}
ip=$(srun --nodes=1 --ntasks=1 -w "$node_1" hostname --ip-address) # making redis-address

# If we detect a space character in the head node IP, we'll
# convert it to an ipv4 address. This step is optional.
if [[ "$ip" == *" "* ]]; then
    IFS=' ' read -ra ADDR <<<"$ip"
    if [[ ${#ADDR[0]} -gt 16 ]]; then
        ip=${ADDR[1]}
    else
        ip=${ADDR[0]}
    fi
    echo "IPV6 address detected. Using the IPV4 address instead: $ip"
fi

port=6379
ip_head=$ip:$port
export ip_head
echo "IP Head: $ip_head"

export RAY_num_server_call_thread=2

echo "STARTING HEAD at $node_1"

num_gpus=$(scontrol show node $node_1 | grep CfgTRES | awk -F'gpu=' '{print $2}' | awk '{print $1}')
cuda_devices=$(seq -s, 0 $(($num_gpus - 1)))
echo "CUDA_VISIBLE_DEVICES: $cuda_devices"

# Original.
# srun --nodes=1 --ntasks=1 -w "$node_1" \
#     bash -c "CUDA_VISIBLE_DEVICES=$cuda_devices ray start --head --node-ip-address="$ip" --port=$port --redis-password="$redis_password" --block --num-gpus=$num_gpus" &

# With singularity. Quite hacky that we have to use bash here.
srun --nodes=1 --ntasks=1 -w "$node_1" \
    bash -c "CUDA_VISIBLE_DEVICES=$cuda_devices singularity exec \
    -B $root_dir:/opt/rpad/code \
    --pwd /opt/rpad/code \
    --nv ${sif_name} \
    ray start --head --node-ip-address="$ip" --port=$port --redis-password="$redis_password" --block --num-gpus=$num_gpus" &

sleep 10

worker_num=$((SLURM_JOB_NUM_NODES - 1)) # number of nodes other than the head node
for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "STARTING WORKER $i at $node_i"

    # Get the number of GPUs on the node.
    num_gpus=$(scontrol show node $node_i | grep CfgTRES | awk -F'gpu=' '{print $2}' | awk '{print $1}')
    echo "NUM_GPUS: $num_gpus"

    # Set CUDA_VISIBLE_DEVICES for each worker.
    cuda_devices=$(seq -s, 0 $(($num_gpus - 1)))
    echo "CUDA_VISIBLE_DEVICES: $cuda_devices"
    # srun --nodes=1 --ntasks=1 -w "$node_i" ray start --address "$ip_head" --redis-password="$redis_password" --block &

    srun --nodes=1 --ntasks=1 -w "$node_i" \
        bash -c "CUDA_VISIBLE_DEVICES=$cuda_devices singularity exec \
        -B $root_dir:/opt/rpad/code \
        --pwd /opt/rpad/code \
        --nv ${sif_name} \
        ray start --address "$ip_head" --redis-password="$redis_password" --block --num-gpus=$num_gpus" &
    sleep 5
done

echo "RUNNING YOUR CODE NOW: ${COMMAND_PLACEHOLDER}"

# Print a command to run an SSH tunnel to access the Ray Dashboard.
# It should look like this, with the user and head-node IP filled in:
# ssh -J <user>@seuss.ri.cmu.edu -L 8265:localhost:8265 <user>@<head-node-ip>
echo "To access the Ray Dashboard, run the following command on your local machine:"
echo "ssh -J $(whoami)@seuss.ri.cmu.edu -L 8265:localhost:8265 $(whoami)@${ip}"
echo "Then open a browser and go to http://localhost:8265"

# ===== Call your code below =====
singularity exec \
    -B $root_dir:/opt/rpad/code \
    --pwd /opt/rpad/code \
    --nv \
    ${sif_name} \
    ${COMMAND_PLACEHOLDER}
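For reference, a sketch of the rendered preamble after `slurm-launch.py` substitutes the placeholders, assuming the example invocation from the docs (`--exp-name test --num-nodes 1 --partition GPU --num-gpus 1`; the timestamp is illustrative and commented-out directives are omitted):

```bash
#!/bin/bash
#SBATCH --partition=GPU
#SBATCH --job-name=test_0101-1200
#SBATCH --output=./.slurm_logs/test_0101-1200.log
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --ntasks-per-node=1
#SBATCH --gres=gpu:1
```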
@@ -0,0 +1,32 @@
import time

import ray
import torch

from python_ml_project_template.models.classifier import ClassifierInferenceModule


# Simple ray job that parallelizes 30 tasks that consume 1 GPU each, putting a
# 1024x1024 tensor on each GPU.


@ray.remote(num_gpus=1)
def f():
    x = torch.rand(1024, 1024).cuda()

    print("STARTING THE JOB!")
    model = ClassifierInferenceModule(None)

    # Wait for 5s. This is to simulate a job that takes a while to finish.
    time.sleep(5)

    # Return a Python float so the result deserializes cleanly on the driver.
    return x.sum().item()


if __name__ == "__main__":
    ray.init()
    # Start 30 tasks in parallel.
    print("Starting 30 tasks on the head node...")
    result_ids = [f.remote() for _ in range(30)]
    results = ray.get(result_ids)
    print("Results are", results)
    print("The sum of results is", sum(results))
    ray.shutdown()
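While the tasks run, one way to see how Ray is scheduling them is `ray status`, run on the head node inside the container (a sketch; `${sif_name}` stands for the SIF path used by the launch script):

```bash
# Reports cluster resources and pending demands; since each task requests
# num_gpus=1, at most one task runs per GPU and the rest queue.
singularity exec --nv ${sif_name} ray status
```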
Empty file.