From 759964e67fd82ffb1a098c8fa0dc953a01e9c7d8 Mon Sep 17 00:00:00 2001
From: Dima Molodenskiy
Date: Wed, 15 Nov 2023 10:48:52 +0100
Subject: [PATCH] Add example slurm script for features generation

---
 example_3.md | 66 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 54 insertions(+), 12 deletions(-)

diff --git a/example_3.md b/example_3.md
index 142d1b6d..3373ff8f 100644
--- a/example_3.md
+++ b/example_3.md
@@ -8,7 +8,8 @@ Firstly, download sequences of NS1(Uniprot: [P03496](https://www.uniprot.org/uni
 Then download the multimeric template in either pdb or mmCIF format(PDB: [3L4Q](https://www.rcsb.org/structure/3L4Q)).
 Create directories named "fastas" and "templates" and put the sequences and pdb/cif files in the corresponding directories.
 Finally, create a text file with description for generating features (description.csv).
-**Please note**, the first column must be an exact copy of the protein description from your fasta files. Please consider shortening them in your favorite text editor for convenience. These names will be used to generate pickle files with monomeric features!
+
+**Please note**, the first column must be an exact copy of the protein description from your fasta files. Please consider shortening them in the fasta files using your favorite text editor for convenience. These names will be used to generate pickle files with monomeric features!
 The description.csv for the NS1-P85B complex should look like:
 ```
 >sp|P03496|NS1_I34A1, 3L4Q.cif, A
@@ -24,12 +25,12 @@ Now run:
   --description_file=description.csv \
   --fasta_paths=fastas/P03496.fasta,fastas/P23726.fasta \
   --path_to_mmt=templates/ \
-  --data_dir= \
-  --save_msa_files=False \
-  --output_dir= \
-  --use_precomputed_msas=False \
+  --data_dir=/scratch/AlphaFold_DBs/2.3.2/ \
+  --save_msa_files=True \
+  --output_dir=features \
+  --use_precomputed_msas=True \
   --max_template_date=2050-01-01 \
-  --skip_existing=False --seq_index=
+  --skip_existing=True
 ```
 It is also possible to combine all your fasta files into a single fasta file.
 ```create_individual_features_with_templates.py``` will compute the features similarly to the create_individual_features.py, but will utilize the provided templates instead of the PDB database.
@@ -67,7 +68,48 @@ run_multimer_jobs.py \
 
 ### Running on a computer cluster in parallel
 
-On a compute cluster, you may want to run all jobs in parallel as a [job array](https://slurm.schedmd.com/job_array.html). For example, on SLURM queuing system at EMBL we could use the following ```run_multimer_jobs_SLURM.sh``` sbatch script:
+On a compute cluster, you may want to run all jobs in parallel as a [job array](https://slurm.schedmd.com/job_array.html).
+For example, on the SLURM queuing system at EMBL we could use the following ```create_feature_jobs_SLURM.sh``` sbatch script:
+```bash
+#!/bin/bash
+
+#A typical run takes a couple of hours but may be much longer
+#SBATCH --job-name=array
+#SBATCH --time=5:00:00
+
+#log files:
+#SBATCH -e logs/create_individual_features_%A_%a_err.txt
+#SBATCH -o logs/create_individual_features_%A_%a_out.txt
+
+#qos sets priority
+#SBATCH --qos=normal
+
+#SBATCH -p htc-el8
+#Limit the run to a single node
+#SBATCH -N 1
+
+#Adjust this depending on the node
+#SBATCH --ntasks=8
+#SBATCH --mem=32000
+
+module load HMMER/3.3.2-gompic-2020b
+module load HH-suite/3.3.0-gompic-2020b
+module load Anaconda3
+source activate AlphaPulldown
+
+create_individual_features_with_templates.py \
+    --description_file=description.csv \
+    --fasta_paths=fastas/P03496.fasta,fastas/P23726.fasta \
+    --path_to_mmt=templates/ \
+    --data_dir=/scratch/AlphaFold_DBs/2.3.2/ \
+    --save_msa_files=True \
+    --output_dir=features \
+    --use_precomputed_msas=True \
+    --max_template_date=2050-01-01 \
+    --skip_existing=True \
+    --job_index=$SLURM_ARRAY_TASK_ID
+```
+
+and the following ```run_multimer_jobs_SLURM.sh``` sbatch script:
 ```bash
 #!/bin/bash
 
@@ -81,11 +123,9 @@ On a compute cluster, you may want to run all jobs in parallel as a [job array](
 #SBATCH -o logs/run_multimer_jobs_%A_%a_out.txt
 
 #qos sets priority
-#SBATCH --qos=low
+#SBATCH --qos=normal
 
 #SBATCH -p gpu-el8
-#You might want to use a higher-end card in case higher oligomeric state get big:
-#SBATCH -C "gpu=A40|gpu=A100"
 
 #Reserve the entire GPU so no-one else slows you down
 #SBATCH --gres=gpu:1
@@ -95,7 +135,7 @@ On a compute cluster, you may want to run all jobs in parallel as a [job array](
 
 #Adjust this depending on the node
 #SBATCH --ntasks=8
-#SBATCH --mem=128000
+#SBATCH --mem=64000
 
 module load Anaconda3
 module load CUDA/11.3.1
@@ -125,7 +165,9 @@ and then run using:
 ```
 mkdir -p logs
-count=`grep -c "" custom_mode.txt` #count lines even if the last one has no end of line
+count=`grep -c "" description.csv` #count lines even if the last one has no end of line
+sbatch --array=1-$count create_feature_jobs_SLURM.sh
+count=`grep -c "" custom_mode.txt`
 sbatch --array=1-$count run_multimer_jobs_SLURM.sh
 ```
 After the successful run one can evaluate and visualise the results in a usual manner (see e.g. [Example 2](https://github.com/KosinskiLab/AlphaPulldown/blob/main/example_2.md#3rd-step-evalutaion-and-visualisation))
\ No newline at end of file
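
Since the multimer jobs read the feature pickles produced by the first array, the two sbatch submissions shown in the patch above can also be chained so that ```run_multimer_jobs_SLURM.sh``` only starts once every feature task has finished. A minimal sketch of that submission step, assuming the script and file names used above; ```--parsable``` and ```--dependency=afterok``` are standard sbatch options:
```bash
mkdir -p logs

#Submit the feature-generation array; --parsable makes sbatch print only the job ID
count=`grep -c "" description.csv` #count lines even if the last one has no end of line
feature_job=$(sbatch --parsable --array=1-$count create_feature_jobs_SLURM.sh)

#Submit the modelling array, held until all feature tasks have completed successfully
count=`grep -c "" custom_mode.txt`
sbatch --dependency=afterok:$feature_job --array=1-$count run_multimer_jobs_SLURM.sh
```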