From e2b05f5cedadc5653da22cb62c6a0549b2324aca Mon Sep 17 00:00:00 2001
From: David Chin
Date: Thu, 23 Feb 2023 15:23:50 -0500
Subject: [PATCH 01/11] Update instructions

---
 README.md | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index e695db5..19ed7b8 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ A prebuilt image is hosted on cloud.sylabs.io: [https://cloud.sylabs.io/library/
 N.B. The AlphaFold version and the alphafold_singularity versions must match.
 
 ```
-$ export ALPHAFOLD_VERSION=2.2.4
+$ export ALPHAFOLD_VERSION=2.3.1
 $ wget https://github.com/deepmind/alphafold/archive/refs/tags/v${ALPHAFOLD_VERSION}.tar.gz -O alphafold-${ALPHAFOLD_VERSION}.tar.gz
 ...
 2023-02-08 17:28:50 (1.24 MB/s) - ‘alphafold-x.x.x.tar.gz’ saved [5855095]
@@ -55,7 +55,12 @@ If your `/tmp` directory is small, you may need to set the
 [`SINGULARITY_TMPDIR` environment variable](https://sylabs.io/guides/3.3/user-guide/build_env.html#temporary-folders)
 to a directory on a filesystem with more free space. My builds have consumed
 up to 15 GiB of space. The resulting image file may be up to 10 GiB.
 
-### Install and run
+### Download genetic databases
+See [AlphaFold 2.3.1 README](https://github.com/deepmind/alphafold/tree/v2.3.1)
+for instructions on downloading genetic databases. These are necessary
+to run AlphaFold.
+
+### Modify run script, install, and run
 To run, modify the `$ALPHAFOLD_SRC/singularity/run_singularity.py` and change the
 section marked `USER CONFIGURATION`. At the least, you will need to modify the values
 of:
@@ -69,4 +74,6 @@ singularity_image = Client.load(os.path.join(os.environ['ALPHAFOLD_DIR'], 'alpha
 ```
 
 ### Run as a Slurm job on a cluster
-See the example job script [`example_slurm_job.sh`](https://github.com/prehensilecode/alphafold_singularity/blob/main/example_slurm_job.sh)
+See the example job script [`example_slurm_job.sh`](https://github.com/prehensilecode/alphafold_singularity/blob/main/example_slurm_job.sh).
+N.B. this example must be modified to suit your specific HPC environment.
+

From fbf00a8e40ea3a6a766ba273c7dffccd70de59bb Mon Sep 17 00:00:00 2001
From: David Chin
Date: Thu, 23 Feb 2023 15:24:58 -0500
Subject: [PATCH 02/11] Add note about aria2c

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 19ed7b8..38336ce 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,8 @@ See [AlphaFold 2.3.1 README](https://github.com/deepmind/alphafold/tree/v2.3.1)
 for instructions on downloading genetic databases. These are necessary
 to run AlphaFold.
 
+This step requires [aria2c](https://aria2.github.io/).
+
 ### Modify run script, install, and run
 To run, modify the `$ALPHAFOLD_SRC/singularity/run_singularity.py` and change the
 section marked `USER CONFIGURATION`. At the least, you will need to modify the values
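Reviewer note (not part of the patch series): after the database download step introduced in PATCH 01/02, a quick sanity check of the download directory can save a failed cluster job later on. The following is a minimal, illustrative sketch only; it assumes the directory layout that `run_singularity.py` expects in the patches below (`mgnify` and `pdb70` always, plus either `small_bfd` for the reduced databases or `bfd` + `uniref30` for the full databases).

```python
import os
import sys

# Subdirectories referenced by run_singularity.py in the patches below.
# 'small_bfd' suffices for --db_preset=reduced_dbs; full_dbs needs bfd + uniref30.
REQUIRED = ['mgnify', 'pdb70']
EITHER = [['small_bfd'], ['bfd', 'uniref30']]

def check_data_dir(data_dir: str) -> bool:
    """Return True if the genetic-database directory looks complete."""
    missing = [d for d in REQUIRED
               if not os.path.isdir(os.path.join(data_dir, d))]
    has_preset = any(all(os.path.isdir(os.path.join(data_dir, d)) for d in group)
                     for group in EITHER)
    if missing or not has_preset:
        print(f'Incomplete database directory {data_dir}; missing: {missing}',
              file=sys.stderr)
        return False
    return True

if __name__ == '__main__':
    sys.exit(0 if check_data_dir(sys.argv[1]) else 1)
```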
From 97061d4dd684ce1a5d683683e40e5f52328eea67 Mon Sep 17 00:00:00 2001
From: David Chin
Date: Fri, 24 Feb 2023 11:01:39 -0500
Subject: [PATCH 03/11] Update requirements to match AlphaFold 2.3.1; latest
 spython.

---
 requirements.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 1a482f1..bcb7859 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 # Dependencies necessary to execute run_singularity.py
-absl-py==0.13.0
-spython==0.1.16
+# absl-py version to match deepmind/alphafold
+absl-py==1.0.0
+spython==0.3.0
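Reviewer note (not part of the patch series): since the README stresses that the AlphaFold and alphafold_singularity versions must match, the host environment can be checked against the PATCH 03 pins before submitting a job. A small sketch, assuming Python 3.8+ (`importlib.metadata` is in the standard library from 3.8):

```python
from importlib.metadata import version, PackageNotFoundError

# The pins from PATCH 03.
PINS = {'absl-py': '1.0.0', 'spython': '0.3.0'}

for pkg, wanted in PINS.items():
    try:
        found = version(pkg)
    except PackageNotFoundError:
        print(f'{pkg}: NOT INSTALLED (want {wanted})')
        continue
    status = 'ok' if found == wanted else f'MISMATCH (want {wanted})'
    print(f'{pkg} {found}: {status}')
```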
From 1c7efe8487dcb8cda9f793cb3bdcec4ce5ab21e0 Mon Sep 17 00:00:00 2001
From: David Chin
Date: Fri, 24 Feb 2023 14:08:44 -0500
Subject: [PATCH 04/11] Update for 2.3.1; update to bind /tmp

---
 run_singularity.py | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/run_singularity.py b/run_singularity.py
index bf1f1b3..46f4bd1 100644
--- a/run_singularity.py
+++ b/run_singularity.py
@@ -34,11 +34,16 @@
 singularity_image = Client.load(os.path.join(os.environ['ALPHAFOLD_DIR'], 'alphafold.sif'))
 
 # Path to a directory that will store the results.
-if 'TMPDIR' in os.environ:
+if 'TMP' in os.environ:
+  output_dir = os.environ['TMP']
+elif 'TMPDIR' in os.environ:
   output_dir = os.environ['TMPDIR']
 else:
   output_dir = tempfile.mkdtemp(dir='/tmp', prefix='alphafold-')
 
+# set tmp dir the same as output dir
+tmp_dir = output_dir
+
 #### END USER CONFIGURATION ####
@@ -62,7 +67,7 @@
   'separated by commas. All FASTA paths must have a unique basename as the '
   'basename is used to name the output directories for each prediction.')
 flags.DEFINE_string(
-    'output_dir', '/tmp/alphafold',
+    'output_dir', output_dir,
     'Path to a directory that will store the results.')
 flags.DEFINE_string(
     'data_dir', None,
@@ -113,6 +118,7 @@
 
 
 def _create_bind(bind_name: str, path: str) -> Tuple[str, str]:
+  """Create a bind point for each file and directory used by the model."""
   path = os.path.abspath(path)
   source_path = os.path.dirname(path) if bind_name != 'data_dir' else path
   target_path = os.path.join(_ROOT_MOUNT_DIRECTORY, bind_name)
@@ -145,7 +151,7 @@ def main(argv):
 
   # Path to the MGnify database for use by JackHMMER.
   mgnify_database_path = os.path.join(
-      FLAGS.data_dir, 'mgnify', 'mgy_clusters_2018_12.fa')
+      FLAGS.data_dir, 'mgnify', 'mgy_clusters_2022_05.fa')
 
   # Path to the BFD database for use by HHblits.
   bfd_database_path = os.path.join(
@@ -156,9 +162,9 @@ def main(argv):
   small_bfd_database_path = os.path.join(
       FLAGS.data_dir, 'small_bfd', 'bfd-first_non_consensus_sequences.fasta')
 
-  # Path to the Uniclust30 database for use by HHblits.
-  uniclust30_database_path = os.path.join(
-      FLAGS.data_dir, 'uniclust30', 'uniclust30_2018_08', 'uniclust30_2018_08')
+  # Path to the Uniref30 database for use by HHblits.
+  uniref30_database_path = os.path.join(
+      FLAGS.data_dir, 'uniref30', 'UniRef30_2021_03')
 
   # Path to the PDB70 database for use by HHsearch.
   pdb70_database_path = os.path.join(FLAGS.data_dir, 'pdb70', 'pdb70')
@@ -178,7 +184,7 @@ def main(argv):
   if alphafold_path == data_dir_path or alphafold_path in data_dir_path.parents:
     raise app.UsageError(
         f'The download directory {FLAGS.data_dir} should not be a subdirectory '
-        f'in the AlphaFold repository directory. If it is, the Docker build is '
+        f'in the AlphaFold repository directory. If it is, the Singularity build is '
        f'slow since the large databases are copied during the image creation.')
 
   binds = []
@@ -211,7 +217,7 @@ def main(argv):
     database_paths.append(('small_bfd_database_path', small_bfd_database_path))
   else:
     database_paths.extend([
-        ('uniclust30_database_path', uniclust30_database_path),
+        ('uniref30_database_path', uniref30_database_path),
         ('bfd_database_path', bfd_database_path),
     ])
   for name, path in database_paths:
@@ -222,6 +228,11 @@ def main(argv):
 
   output_target_path = os.path.join(_ROOT_MOUNT_DIRECTORY, 'output')
   binds.append(f'{output_dir}:{output_target_path}')
+  logging.info('Binding %s -> %s', output_dir, output_target_path)
+
+  tmp_target_path = '/tmp'
+  binds.append(f'{tmp_dir}:{tmp_target_path}')
+  logging.info('Binding %s -> %s', tmp_dir, tmp_target_path)
 
   use_gpu_relax = FLAGS.enable_gpu_relax and FLAGS.use_gpu
@@ -240,9 +251,11 @@ def main(argv):
 
   options = [
       '--bind', f'{",".join(binds)}',
+      '--env', 'OPENMM_CPU_THREADS=12',
+      # The following flags allow us to make predictions on proteins that
+      # would typically be too long to fit into GPU memory.
       '--env', 'TF_FORCE_UNIFIED_MEMORY=1',
       '--env', 'XLA_PYTHON_CLIENT_MEM_FRACTION=4.0',
-      '--env', 'OPENMM_CPU_THREADS=12'
   ]
 
   # Run the container.
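Reviewer note (not part of the patch series): to make the bind logic touched by PATCH 04 concrete, here is a self-contained sketch of how a host path becomes a `source:target` bind for the container. One assumption: `_ROOT_MOUNT_DIRECTORY = '/mnt'` is not shown in the hunks above, so the value here is illustrative. For an ordinary file (e.g. a FASTA input) the parent directory is bound, while `data_dir` is bound whole.

```python
import os
from typing import Tuple

_ROOT_MOUNT_DIRECTORY = '/mnt'  # assumed value; not shown in the diff hunks

def create_bind(bind_name: str, path: str) -> Tuple[str, str]:
    """Mirror of PATCH 04's _create_bind: map a host path into the container."""
    path = os.path.abspath(path)
    # Bind the containing directory for ordinary paths; data_dir is bound whole.
    source_path = os.path.dirname(path) if bind_name != 'data_dir' else path
    target_path = os.path.join(_ROOT_MOUNT_DIRECTORY, bind_name)
    return f'{source_path}:{target_path}', target_path

# Example: a FASTA input under /home/user is visible as /mnt/fasta_path inside.
print(create_bind('fasta_path', '/home/user/T1050.fasta'))
# -> ('/home/user:/mnt/fasta_path', '/mnt/fasta_path')
```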
From ff3a2f466ede4eea74154bcde14e3c815069b520 Mon Sep 17 00:00:00 2001
From: David Chin
Date: Tue, 28 Feb 2023 09:39:18 -0500
Subject: [PATCH 05/11] Terminate if Slurm job has >1 GPU

---
 run_singularity.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/run_singularity.py b/run_singularity.py
index 46f4bd1..b0aa378 100644
--- a/run_singularity.py
+++ b/run_singularity.py
@@ -15,6 +15,7 @@
 """Singularity launch script for Alphafold Singularity image."""
 import os
+import sys
 import pathlib
 import signal
 from typing import Tuple
@@ -26,6 +27,15 @@
 import tempfile
 from spython.main import Client
 
+# Check Slurm environment if available
+if os.environ['SLURM_GPUS_ON_NODE']:
+    ngpus_requested = int(os.environ['SLURM_GPUS_ON_NODE'])
+    if ngpus_requested > 1:
+        logging.fatal(f'No. of GPUs requested is > 1: {ngpus_requested}')
+        # absl.logging.fatal() does not terminate this process
+        # so, manually call sys.exit()
+        sys.exit(1)
+
 
 #### USER CONFIGURATION ####

From d64e4697c855f0e2be1f146a3dcf5cca23b4ca9f Mon Sep 17 00:00:00 2001
From: David Chin
Date: Tue, 28 Feb 2023 10:36:23 -0500
Subject: [PATCH 06/11] Address #25

---
 run_singularity.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/run_singularity.py b/run_singularity.py
index b0aa378..835fe36 100644
--- a/run_singularity.py
+++ b/run_singularity.py
@@ -23,18 +23,11 @@
 from absl import app
 from absl import flags
 from absl import logging
+from spython.main import Client
 
 import tempfile
-from spython.main import Client
+import subprocess
 
-# Check Slurm environment if available
-if os.environ['SLURM_GPUS_ON_NODE']:
-    ngpus_requested = int(os.environ['SLURM_GPUS_ON_NODE'])
-    if ngpus_requested > 1:
-        logging.fatal(f'No. of GPUs requested is > 1: {ngpus_requested}')
-        # absl.logging.fatal() does not terminate this process
-        # so, manually call sys.exit()
-        sys.exit(1)
-
 
 #### USER CONFIGURATION ####
@@ -148,6 +141,21 @@ def main(argv):
   if len(argv) > 1:
     raise app.UsageError('Too many command-line arguments.')
 
+  # Using more than one GPU causes AlphaFold's amber_minimize.py to fail
+  # See: https://github.com/prehensilecode/alphafold_singularity/issues/25
+  ngpus_requested = 0
+
+  # Check Slurm environment if available
+  if os.environ['SLURM_GPUS_ON_NODE']:
+    ngpus_requested = int(os.environ['SLURM_GPUS_ON_NODE'])
+  else:
+    # use nvidia-smi to count GPUs
+    # this works if using cgroups but may not work otherwise
+    ngpus_requested = len(subprocess.run(['nvidia-smi', '-L'], check=True, capture_output=True, text=True).stdout.strip().split('\n'))
+
+  if ngpus_requested > 1:
+    logging.fatal(f'No. of GPUs requested is > 1: {ngpus_requested}')
+
   # You can individually override the following paths if you have placed the
   # data in locations other than the FLAGS.data_dir.
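Reviewer note (not part of the patch series): the guard added in PATCH 05 and moved into main() in PATCH 06 can be exercised standalone with the sketch below. One hedge: the patches index `os.environ['SLURM_GPUS_ON_NODE']` directly, which raises `KeyError` outside a Slurm job; this sketch uses `os.environ.get()` so the check degrades gracefully, and it assumes `nvidia-smi` is on `PATH` for the fallback.

```python
import os
import shutil
import subprocess
import sys

def count_visible_gpus() -> int:
    """Prefer Slurm's GPU count; fall back to counting nvidia-smi -L lines."""
    slurm_gpus = os.environ.get('SLURM_GPUS_ON_NODE')  # KeyError-safe lookup
    if slurm_gpus:
        return int(slurm_gpus)
    if shutil.which('nvidia-smi') is None:
        return 0  # no NVIDIA tooling visible; CPU-only environment
    out = subprocess.run(['nvidia-smi', '-L'], check=True,
                         capture_output=True, text=True).stdout.strip()
    return len(out.split('\n')) if out else 0

if __name__ == '__main__':
    ngpus = count_visible_gpus()
    if ngpus > 1:
        # amber_minimize.py fails with more than one GPU; see issue #25.
        sys.exit(f'No. of GPUs available is > 1: {ngpus}')
    print(f'OK: {ngpus} GPU(s) visible')
```

Note that PATCH 08 below supersedes this guard entirely by pinning the container to the requested devices with `NVIDIA_VISIBLE_DEVICES` instead.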
From f2faddf90bb61d5adfdbf820b2ca900ad7e99cbb Mon Sep 17 00:00:00 2001
From: David Chin
Date: Wed, 1 Mar 2023 10:43:43 -0500
Subject: [PATCH 07/11] Updated for AlphaFold 2.3.1

---
 Singularity.def | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/Singularity.def b/Singularity.def
index 504e7a8..893f381 100644
--- a/Singularity.def
+++ b/Singularity.def
@@ -22,7 +22,8 @@ Stage: spython-base
 
 # FROM directive resets ARGS, so we specify again (the value is retained if
 # previously set).
-apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
+apt-get update \
+&& DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
 build-essential \
 cmake \
 cuda-command-line-tools-11-1 \
@@ -44,9 +45,9 @@
 git \
 hmmer \
 kalign \
 tzdata \
 wget \
 
 # Install Miniconda package manager.
 wget -q -P /tmp \
-https://repo.anaconda.com/miniconda/Miniconda3-py37_4.12.0-Linux-x86_64.sh \
-&& bash /tmp/Miniconda3-py37_4.12.0-Linux-x86_64.sh -b -p /opt/conda \
-&& rm /tmp/Miniconda3-py37_4.12.0-Linux-x86_64.sh
+https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
+&& bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda \
+&& rm /tmp/Miniconda3-latest-Linux-x86_64.sh
 
 # Install conda packages.
 PATH="/opt/conda/bin:/usr/local/cuda-11.1/bin:$PATH"
@@ -60,7 +61,7 @@
 openmm=7.5.1 \
 cudatoolkit==11.1.1 \
 pdbfixer \
 pip \
-python=3.7 \
+python=3.8 \
 && conda clean --all --force-pkgs-dirs --yes
 
 ### /bin/cp -r . /app/alphafold
@@ -73,12 +74,12 @@
 wget -q -P /app/alphafold/alphafold/common/ \
 https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt
 
 pip3 install --upgrade pip --no-cache-dir \
 && pip3 install -r /app/alphafold/requirements.txt --no-cache-dir \
 && pip3 install --upgrade --no-cache-dir \
-jax==0.3.17 \
-jaxlib==0.3.15+cuda11.cudnn805 \
+jax==0.3.25 \
+jaxlib==0.3.25+cuda11.cudnn805 \
 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
 
 # Apply OpenMM patch.
-cd /opt/conda/lib/python3.7/site-packages
+cd /opt/conda/lib/python3.8/site-packages
 patch -p0 < /app/alphafold/docker/openmm.patch
 
 # Add SETUID bit to the ldconfig binary so that non-root users can run it.

From 28933b0226d7f709060feca7b7e8a0d78ea1d8c9 Mon Sep 17 00:00:00 2001
From: David Chin
Date: Wed, 1 Mar 2023 11:01:25 -0500
Subject: [PATCH 08/11] Figured out multiple GPUs

---
 run_singularity.py | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/run_singularity.py b/run_singularity.py
index 835fe36..9d62e18 100644
--- a/run_singularity.py
+++ b/run_singularity.py
@@ -141,21 +141,6 @@ def main(argv):
   if len(argv) > 1:
     raise app.UsageError('Too many command-line arguments.')
 
-  # Using more than one GPU causes AlphaFold's amber_minimize.py to fail
-  # See: https://github.com/prehensilecode/alphafold_singularity/issues/25
-  ngpus_requested = 0
-
-  # Check Slurm environment if available
-  if os.environ['SLURM_GPUS_ON_NODE']:
-    ngpus_requested = int(os.environ['SLURM_GPUS_ON_NODE'])
-  else:
-    # use nvidia-smi to count GPUs
-    # this works if using cgroups but may not work otherwise
-    ngpus_requested = len(subprocess.run(['nvidia-smi', '-L'], check=True, capture_output=True, text=True).stdout.strip().split('\n'))
-
-  if ngpus_requested > 1:
-    logging.fatal(f'No. of GPUs requested is > 1: {ngpus_requested}')
-
   # You can individually override the following paths if you have placed the
   # data in locations other than the FLAGS.data_dir.
@@ -269,7 +254,7 @@ def main(argv):
   options = [
       '--bind', f'{",".join(binds)}',
-      '--env', 'OPENMM_CPU_THREADS=12',
+      '--env', f'NVIDIA_VISIBLE_DEVICES={FLAGS.gpu_devices}',
       # The following flags allow us to make predictions on proteins that
       # would typically be too long to fit into GPU memory.
       '--env', 'TF_FORCE_UNIFIED_MEMORY=1',
       '--env', 'XLA_PYTHON_CLIENT_MEM_FRACTION=4.0',
   ]

From f7fe7d4991f61ebe00429dbe23efec4ce7669a64 Mon Sep 17 00:00:00 2001
From: David Chin
Date: Wed, 1 Mar 2023 11:04:25 -0500
Subject: [PATCH 09/11] Updated for AlphaFold 2.3.1

---
 example_slurm_job.sh | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/example_slurm_job.sh b/example_slurm_job.sh
index 4f3c091..42c6acd 100644
--- a/example_slurm_job.sh
+++ b/example_slurm_job.sh
@@ -1,44 +1,42 @@
 #!/bin/bash
-#SBATCH -p gpu
+#SBATCH --partition=gpu
 #SBATCH --time=18:00:00
 #SBATCH --gpus=4
 #SBATCH --cpus-per-gpu=12
-#SBATCH --mem=140G
+#SBATCH --mem=45G
 
 ### NOTE
 ### This job script cannot be used without modification for your specific environment.
 
-module load alphafold/2.2.4
-module load python/gcc/3.10
+module load python/gcc/3.11
+module load alphafold/2.3.1
 
 ### Check values of some environment variables
-echo SLURM_JOB_GPUS=$SLURM_JOB_GPUS
+echo SLURM_GPUS_ON_NODE=$SLURM_GPUS_ON_NODE
 echo ALPHAFOLD_DIR=$ALPHAFOLD_DIR
 echo ALPHAFOLD_DATADIR=$ALPHAFOLD_DATADIR
 
 ###
-### README This runs AlphaFold 2.2.2 on the T1050.fasta file
+### README This runs AlphaFold 2.3.1 on the T1050.fasta file
 ###
 
 # AlphaFold should use all GPU devices available to the job by default.
-# To explicitly specify use of GPUs, and the GPU devices to use, add
-# --use_gpu --gpu_devices=${SLURM_JOB_GPUS}
 #
 # To run the CASP14 evaluation, use:
 # --model_preset=monomer_casp14
+# --db_preset=full_dbs (or delete the line; default is "full_dbs")
 #
 # To benchmark, running multiple JAX model evaluations (NB this
 # significantly increases run time):
 # --benchmark
 
-# Run AlphaFold; default is to use GPUs, i.e. "--use_gpu" can be omitted.
+# Run AlphaFold; default is to use GPUs
 python3 ${ALPHAFOLD_DIR}/singularity/run_singularity.py \
-    --use_gpu --gpu_devices=${SLURM_JOB_GPUS} \
     --data_dir=${ALPHAFOLD_DATADIR} \
     --fasta_paths=T1050.fasta \
     --max_template_date=2020-05-14 \
-    --model_preset=monomer_casp14 \
-    --benchmark
+    --db_preset=reduced_dbs \
+    --model_preset=monomer
 
 echo INFO: AlphaFold returned $?

From fdb4a26cccba3a182448c8d201969fe05d3f372e Mon Sep 17 00:00:00 2001
From: David Chin
Date: Wed, 1 Mar 2023 11:05:02 -0500
Subject: [PATCH 10/11] Add detail about multi-GPU jobs

---
 README.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/README.md b/README.md
index 38336ce..29bcab4 100644
--- a/README.md
+++ b/README.md
@@ -62,6 +62,10 @@ to run AlphaFold.
 
 This step requires [aria2c](https://aria2.github.io/).
 
+N.B. The difference between downloading the "reduced databases" as opposed
+to the "full databases" is that the reduced databases download "small BFD"
+instead of "BFD".
+
 ### Modify run script, install, and run
 To run, modify the `$ALPHAFOLD_SRC/singularity/run_singularity.py` and change the
 section marked `USER CONFIGURATION`. At the least, you will need to modify the values
@@ -75,7 +79,22 @@ E.g.
 ```
 singularity_image = Client.load(os.path.join(os.environ['ALPHAFOLD_DIR'], 'alphafold.sif'))
 ```
 
+## Running on an HPC cluster
+Currently, this project only supports Slurm. Please open an issue to request
+support for other job schedulers/resource managers.
+
+
 ### Run as a Slurm job on a cluster
 See the example job script [`example_slurm_job.sh`](https://github.com/prehensilecode/alphafold_singularity/blob/main/example_slurm_job.sh).
 N.B. this example must be modified to suit your specific HPC environment.
+
+The `run_singularity.py` script will use all GPUs available to the job. If
+Slurm has been set up with [`cgroups`](https://en.wikipedia.org/wiki/Cgroups),
+the job may request fewer than the total number of GPUs installed on a node.
+E.g. if the GPU nodes in the cluster have 4 GPU devices each, the job can
+do
+```bash
+#SBATCH --gpus=2
+```
+and AlphaFold Singularity will use only two of the four GPUs. This is
+because the `cgroup` for the job only shows 2 GPUs to the job.

From 8f52ba60b78931e99a4f3d0711319108a03cc947 Mon Sep 17 00:00:00 2001
From: David Chin
Date: Wed, 1 Mar 2023 11:55:54 -0500
Subject: [PATCH 11/11] Remove unused env var

---
 example_slurm_job.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/example_slurm_job.sh b/example_slurm_job.sh
index 42c6acd..845cc66 100644
--- a/example_slurm_job.sh
+++ b/example_slurm_job.sh
@@ -12,7 +12,6 @@ module load python/gcc/3.11
 module load alphafold/2.3.1
 
 ### Check values of some environment variables
-echo SLURM_GPUS_ON_NODE=$SLURM_GPUS_ON_NODE
 echo ALPHAFOLD_DIR=$ALPHAFOLD_DIR
 echo ALPHAFOLD_DATADIR=$ALPHAFOLD_DATADIR
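Reviewer note (not part of the patch series): to see how the pieces above fit together, here is a minimal spython sketch of the container invocation that PATCH 04 and PATCH 08 build up (the binds list, GPU pinning, and the unified-memory flags). The paths and device ID are hypothetical; `Client.load` and `Client.run` with `nv=True`, `options=`, and `return_result=True` are the spython calls `run_singularity.py` already uses.

```python
from spython.main import Client

# Hypothetical paths, for illustration only.
image = Client.load('/opt/alphafold/alphafold.sif')

options = [
    '--bind', '/data/alphafold_databases:/mnt/data_dir,/scratch/out:/mnt/output',
    '--env', 'NVIDIA_VISIBLE_DEVICES=0',            # PATCH 08: pin the container to one GPU
    '--env', 'TF_FORCE_UNIFIED_MEMORY=1',           # PATCH 04: allow spilling to host RAM
    '--env', 'XLA_PYTHON_CLIENT_MEM_FRACTION=4.0',  # for proteins too long for GPU memory
]

# nv=True adds Singularity's --nv flag for NVIDIA GPU support; the
# positional args go to the image's runscript (AlphaFold's run_alphafold.py).
result = Client.run(
    image,
    ['--fasta_paths=/mnt/fasta_path/T1050.fasta',
     '--output_dir=/mnt/output'],
    nv=True,
    options=options,
    return_result=True)
print(result)
```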