Commit: Merge branch 'main' into TrueMultimer

DimaMolod authored Nov 15, 2023
2 parents 768fc68 + 4e0ec78 · commit 0ed62cd
Showing 20 changed files with 367 additions and 49 deletions.
49 changes: 35 additions & 14 deletions .github/workflows/github_actions.yml
@@ -35,30 +35,51 @@ jobs:
          conda develop alphapulldown
          conda develop alphapulldown/ColabFold
          conda develop alphafold
          #conda info
          #conda list
          #python -c "import sys; print(sys.path)"
          #echo "Python path: $PYTHONPATH"
          python -c "import alphafold; import os; print('Alphafold module is located at:', alphafold.__file__); alphafold_dir = os.path.dirname(alphafold.__file__); print('Contents of the Alphafold directory:', os.listdir(alphafold_dir))"
      - name: Install dependencies in unifold setup.py
        run: |
          eval "$(conda shell.bash hook)"
          export WORKDIRPATH=$PWD
          conda activate AlphaPulldown
          cd $WORKDIRPATH/unifold && python3 setup.py install
      - name: Install dependencies in AlphaFold setup.py
        run: |
          eval "$(conda shell.bash hook)"
          export WORKDIRPATH=$PWD
          conda activate AlphaPulldown
          pip install protobuf==4.23.0
          cd $WORKDIRPATH/alphafold && python3 setup.py install
          python -c "import tree"
      - name: Install Dependencies
        run: |
          eval "$(conda shell.bash hook)"
          conda activate AlphaPulldown
          conda install -c omnia -c bioconda -c conda-forge openmm==8.0 pdbfixer==1.9 kalign2 cctbx-base importlib_metadata
          #conda install -c bioconda hmmer hhsuite #this causes conda conflicts
          # install PyTorch
          pip install torch --pre -f https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html
          pip install jax==0.3.25 jaxlib==0.3.25+cuda11.cudnn805 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
          pip install "absl-py>=0.13.0" dm-haiku "dm-tree>=0.1.6" "h5py>=3.1.0" "ml-collections>=0.1.0" "pandas>=1.3.4" tensorflow "importlib-resources==5.8.0" "nbformat==5.4.0" "py3Dmol==2.0.1" ipython appdirs jupyterlab ipywidgets pytest
      # - name: Check imports of submodules
      #   run : |
      #     eval "$(conda shell.bash hook)"
      #     conda activate AlphaPulldown
      #     # install PyTorch
      #     pip install torch --pre -f https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html
      #     # setup unicore
      #     git clone https://github.com/dptech-corp/Uni-Core.git
      #     cd Uni-Core
      #     python3 setup.py install --disable-cuda-ext
      #     python -c "from unifold.alphalink_inference import alphalink_prediction"
      #     python -c "from unifold.dataset import process_ap"
      #     python -c "from unifold.config import model_config"
      #     python -c "from colabfold.batch import get_queries, unserialize_msa, get_msa_and_templates, msa_to_str, build_monomer_feature, parse_fasta"
      - name: Run Tests
        run: |
          eval "$(conda shell.bash hook)"
          conda activate AlphaPulldown
          pytest test/
      - name: Build and Push to Docker Hub
        uses: mr-smithers-excellent/docker-build-push@v5
        with:
          image: dmolodenskiy/AlphaPulldown
          registry: docker.io
          dockerfile: docker/Dockerfile
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
          tags: latest
          pytest -s test/test_custom_db.py
          pytest -s test/test_remove_clashes_low_plddt.py
          # python3 -m unittest test/test_crosslink_input.py
4 changes: 4 additions & 0 deletions .gitmodules
@@ -10,3 +10,7 @@
	path = alphapulldown/analysis_pipeline/af2plots
	url = https://gitlab.com/gchojnowski/af2plots.git
	branch = main
[submodule "unifold"]
	path = unifold
	url = https://github.com/dingquanyu/Uni-Fold.git
	branch = main
3 changes: 2 additions & 1 deletion README.md
@@ -55,7 +55,8 @@ conda create -n AlphaPulldown -c omnia -c bioconda -c conda-forge python==3.10 o
**Secondly**, activate the AlphaPulldown environment and install AlphaPulldown
```bash
source activate AlphaPulldown
python3 -m pip install alphapulldown==1.00.0

python3 -m pip install alphapulldown==1.0.0
pip install jax==0.3.25 jaxlib==0.3.25+cuda11.cudnn805 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
```
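As a quick check that the intended release was installed (this commit bumps the package version to 1.0.0, as the ```alphapulldown/__init__.py``` change below shows), you can query the version from Python:

```python
import alphapulldown

# Expect '1.0.0' after installing alphapulldown==1.0.0
print(alphapulldown.__version__)
```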

2 changes: 1 addition & 1 deletion alphafold
2 changes: 1 addition & 1 deletion alphapulldown/__init__.py
@@ -1 +1 @@
__version__ = "0.40.0"
__version__ = "1.0.0"
29 changes: 29 additions & 0 deletions alphapulldown/analysis_pipeline/calculate_mpdockq.py
@@ -134,6 +134,35 @@ def calculate_mpDockQ(complex_score):
    b = 0.221
    return L/(1+math.exp(-1*k*(complex_score-x_0))) + b

def read_pdb_pdockq(pdbfile):
    '''Read a PDB file predicted with AF and rewritten to contain all chains.
    Adapted from the FoldDock repo:
    https://gitlab.com/ElofssonLab/FoldDock/-/blob/main/src/pdockq.py#L34-59
    '''
    chain_coords, chain_plddt = {}, {}
    with open(pdbfile, 'r') as file:
        for line in file:
            if not line.startswith('ATOM'):
                continue
            record = parse_atm_record(line)
            # Use CB coordinates (CA for GLY, which has no CB)
            if record['atm_name']=='CB' or (record['atm_name']=='CA' and record['res_name']=='GLY'):
                if record['chain'] in [*chain_coords.keys()]:
                    chain_coords[record['chain']].append([record['x'],record['y'],record['z']])
                    chain_plddt[record['chain']].append(record['B'])
                else:
                    chain_coords[record['chain']] = [[record['x'],record['y'],record['z']]]
                    chain_plddt[record['chain']] = [record['B']]

    # Convert to arrays
    for chain in chain_coords:
        chain_coords[chain] = np.array(chain_coords[chain])
        chain_plddt[chain] = np.array(chain_plddt[chain])

    return chain_coords, chain_plddt


def calc_pdockq(chain_coords, chain_plddt, t):
    '''Calculate the pDockQ scores
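For context, a minimal usage sketch of these two helpers on a two-chain model (the PDB filename is illustrative; the threshold ```t=8``` mirrors the call added in ```get_good_inter_pae.py``` below):

```python
# Hypothetical example: score a dimer model predicted by AlphaFold.
chain_coords, chain_plddt = read_pdb_pdockq("ranked_0.pdb")
pdockq = calc_pdockq(chain_coords, chain_plddt, t=8)
```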
6 changes: 4 additions & 2 deletions alphapulldown/analysis_pipeline/get_good_inter_pae.py
@@ -40,6 +40,7 @@ def obtain_mpdockq(work_dir):
    if complex_score is not None and num_chains>2:
        mpDockq_or_pdockq = calculate_mpDockQ(complex_score)
    elif complex_score is not None and num_chains==2:
        chain_coords,plddt_per_chain = read_pdb_pdockq(pdb_path)
        mpDockq_or_pdockq = calc_pdockq(chain_coords,plddt_per_chain,t=8)
    else:
        mpDockq_or_pdockq = "None"
@@ -88,9 +89,10 @@ def run_and_summarise_pi_score(workd_dir,jobs,surface_thres):
            pi_score = pd.DataFrame.from_dict({"pi_score":['SC: mds: too many atoms']})
            f.close()
        filtered_df['jobs'] = str(job)
        filtered_df=pd.merge(filtered_df,pi_score,on='jobs')
        pi_score['interface'] = pi_score['chains']
        filtered_df=pd.merge(filtered_df,pi_score,on=['jobs','interface'])
        try:
            filtered_df.drop(columns=["#PDB","pdb"," pvalue","chains","predicted_class"])
            filtered_df=filtered_df.drop(columns=["#PDB","pdb"," pvalue","chains","predicted_class"])
        except:
            pass

4 changes: 1 addition & 3 deletions alphapulldown/objects.py
@@ -215,16 +215,14 @@ def make_mmseq_features(
            ) = unserialize_msa(a3m_lines, self.sequence)

        else:
            print("###### lin 228 will run mmseqs2 with the latest modifications")
            (
                unpaired_msa,
                paired_msa,
                query_seqs_unique,
                query_seqs_cardinality,
                template_features,
            ) = get_msa_and_templates(
                jobname=self.description,
                query_sequences=self.sequence,
                a3m_lines=None,
                result_dir=plPath(result_dir),
                msa_mode=msa_mode,
                use_templates=use_templates,
31 changes: 31 additions & 0 deletions alphapulldown/plot_pae.py
@@ -56,3 +56,34 @@ def plot_pae(seqs: list, order, feature_dir, job_name):
            ax1.axvline(t, color="black", linewidth=3.5)
        plt.title("ranked_{}".format(i))
        plt.savefig(f"{feature_dir}/{job_name}_PAE_plot_ranked_{i}.png")

def plot_pae_from_matrix(seqs, pae_matrix, figure_name=''):
    xticks = []
    initial_tick = 0
    for s in seqs:
        initial_tick = initial_tick + len(s)
        xticks.append(initial_tick)

    xticks_labels = []
    for i, t in enumerate(xticks):
        xticks_labels.append(str(i + 1))

    yticks_labels = []
    for s in seqs:
        yticks_labels.append(str(len(s)))

    fig, ax1 = plt.subplots(1, 1)
    # plt.figure(figsize=(3,18))
    pos = ax1.imshow(pae_matrix, cmap="bwr", vmin=0, vmax=30)
    ax1.set_xticks(xticks)
    ax1.set_yticks(xticks)
    ax1.set_xticklabels(xticks_labels, size="large")
    ax1.set_yticklabels(yticks_labels, size="large")
    fig.colorbar(pos).ax.set_title("unit: Angstrom")
    for t in xticks:
        ax1.axhline(t, color="black", linewidth=3.5)
        ax1.axvline(t, color="black", linewidth=3.5)
    plt.title(figure_name)
    plt.savefig(figure_name)
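A hypothetical call, assuming the module-level ```matplotlib.pyplot``` import the surrounding file already uses, with dummy inputs standing in for real chain sequences and a real PAE matrix:

```python
import numpy as np

# Illustrative inputs: two chains of lengths 120 and 85,
# and a random 205x205 matrix standing in for the PAE values.
seqs = ["A" * 120, "G" * 85]
pae = np.random.rand(205, 205) * 30

plot_pae_from_matrix(seqs, pae, figure_name="job1_PAE.png")
```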
77 changes: 63 additions & 14 deletions alphapulldown/run_multimer_jobs.py
@@ -65,7 +65,24 @@
flags.DEFINE_integer(
    "msa_depth", None, "Number of sequences to use from the MSA (by default is taken from AF model config)"
)
flags.DEFINE_boolean(
    "use_unifold", False, "Whether UniFold models are going to be used. Default is False"
)

flags.DEFINE_boolean(
    "use_alphalink", False, "Whether AlphaLink models are going to be used. Default is False"
)
flags.DEFINE_string(
    "crosslinks", None, "Path to crosslink information pickle"
)
flags.DEFINE_string(
    "alphalink_weight", None, "Path to AlphaLink neural network weights"
)
flags.DEFINE_string(
    "unifold_param", None, "Path to UniFold neural network weights"
)
flags.DEFINE_enum(
    "unifold_model_name", "multimer_af2",
    ["multimer_af2", "multimer_ft", "multimer", "multimer_af2_v3", "multimer_af2_model45_v3"],
    "choose UniFold model structure"
)
flags.mark_flag_as_required("output_path")

delattr(flags.FLAGS, "models_to_relax")
@@ -304,20 +321,52 @@ def predict_individual_jobs(multimer_object, output_path, model_runners, random_
    if not isinstance(multimer_object, MultimericObject):
        multimer_object.input_seqs = [multimer_object.sequence]

    predict(
        model_runners,
        output_path,
        multimer_object.feature_dict,
        random_seed,
        FLAGS.benchmark,
        fasta_name=multimer_object.description,
        models_to_relax=FLAGS.models_to_relax,
        seqs=multimer_object.input_seqs,
    )
    create_and_save_pae_plots(multimer_object, output_path)

    if FLAGS.use_unifold:
        from unifold.inference import config_args, unifold_config_model, unifold_predict
        from unifold.dataset import process_ap
        from unifold.config import model_config
        configs = model_config(FLAGS.unifold_model_name)
        general_args = config_args(FLAGS.unifold_param,
                                   target_name=multimer_object.description,
                                   output_dir=output_path)
        model_runner = unifold_config_model(general_args)
        # First need to add num_recycling_iters to the feature dictionary
        # multimer_object.feature_dict.update({"num_recycling_iters":general_args.max_recycling_iters})
        processed_features, _ = process_ap(config=configs.data,
                                           features=multimer_object.feature_dict,
                                           mode="predict", labels=None,
                                           seed=42, batch_idx=None,
                                           data_idx=None, is_distillation=False
                                           )
        logging.info("Finished configuring the UniFold AlphaFold model and processing numpy features")
        unifold_predict(model_runner, general_args, processed_features)

    elif FLAGS.use_alphalink:
        assert FLAGS.alphalink_weight is not None
        from unifold.alphalink_inference import alphalink_prediction
        from unifold.config import model_config
        logging.info("Start using AlphaLink weights and cross-link information")
        MODEL_NAME = 'model_5_ptm_af2'
        configs = model_config(MODEL_NAME)
        alphalink_prediction(multimer_object.feature_dict,
                             os.path.join(FLAGS.output_path, multimer_object.description),
                             input_seqs=multimer_object.input_seqs,
                             param_path=FLAGS.alphalink_weight,
                             configs=configs, crosslinks=FLAGS.crosslinks,
                             chain_id_map=multimer_object.chain_id_map)
    else:
        predict(
            model_runners,
            output_path,
            multimer_object.feature_dict,
            random_seed,
            FLAGS.benchmark,
            fasta_name=multimer_object.description,
            models_to_relax=FLAGS.models_to_relax,
            seqs=multimer_object.input_seqs,
        )
    create_and_save_pae_plots(multimer_object, output_path)


def predict_multimers(multimers):
2 changes: 2 additions & 0 deletions example_1.md
@@ -171,6 +171,8 @@ run_multimer_jobs.py --mode=pulldown \

:memo: To reproduce the results of Lassa virus Z protein vs L protein fragments described in our paper, simply use [baits_Z_protein.txt](./example_data/baits_Z_protein.txt) and [L_protein_fragments.txt](./example_data/L_protein_fragments.txt) as the ```--protein_lists``` inputs. This example also shows how to run the interaction screen for fragments of proteins, keeping the original full-length residue numbering in the output!

**New Features** AlphaPulldown now supports integrative structural modelling if the user has experimental cross-link data. Please refer to [this manual](run_with_AlphaLink2.md) if you'd like to model your protein complexes with cross-link MS data as extra input.

## Explanation about the parameters

#### **```monomer_objects_dir```**
3 changes: 3 additions & 0 deletions example_2.md
@@ -149,6 +149,9 @@ different number if you wish to run an array of jobs in parallel then the progra

:exclamation: ```job_index``` starts from 1


**New Features** AlphaPulldown now supports integrative structural modelling if the user has experimental cross-link data. Please refer to [this manual](run_with_AlphaLink2.md) if you'd like to model your protein complexes with cross-link MS data as extra input.

### Running on a computer cluster in parallel

On a compute cluster, you may want to run all jobs in parallel as a [job array](https://slurm.schedmd.com/job_array.html). For example, on the SLURM queuing system at EMBL we could use the following ```run_multimer_jobs_SLURM.sh``` sbatch script:
65 changes: 65 additions & 0 deletions run_with_AlphaLink2.md
@@ -0,0 +1,65 @@
# Instructions for running structural predictions with cross-link data via [AlphaLink2](https://github.com/Rappsilber-Laboratory/AlphaLink2/tree/main)
## Introduction
As [Stahl et al., 2023](https://www.nature.com/articles/s41587-023-01704-z) showed, integrating cross-link data with AlphaFold can improve the modelling quality of some challenging cases. Thus, AlphaPulldown has integrated the [AlphaLink2](https://github.com/Rappsilber-Laboratory/AlphaLink2/tree/main) pipeline, allowing the user to combine cross-link data with AlphaFold-Multimer inference without recalculating MSAs from scratch.

In addition, this integration retains all the other benefits of AlphaPulldown, such as the interface for fragmenting proteins into regions and the automatic generation of PAE plots after prediction.

## 1st step: configure the Conda environment
After activating the same conda environment in which you normally run AlphaPulldown, you first need to compile [Uni-Core](https://github.com/dptech-corp/Uni-Core).

```bash
git clone https://github.com/dptech-corp/Uni-Core.git
cd Uni-Core
python setup.py install --disable-cuda-ext

# test whether unicore is successfully installed
python -c "import unicore"
```
You may see the following warnings, but they are harmless:
```
fused_multi_tensor is not installed corrected
fused_rounding is not installed corrected
fused_layer_norm is not installed corrected
fused_softmax is not installed corrected
```
Next, make sure the installed PyTorch build matches your CUDA version, e.g. [PyTorch 1.13.0+cu117](https://pytorch.org/get-started/previous-versions/) with CUDA 11.7.
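As a quick sanity check (a minimal sketch; the expected version string assumes the cu117 build mentioned above):

```python
import torch

# Expect a CUDA-enabled build, e.g. '1.13.0+cu117'
print(torch.__version__)
# Should print True on a GPU node with a compatible driver
print(torch.cuda.is_available())
```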
## 2nd step: download AlphaLink2 checkpoint
Now download the PyTorch checkpoint from [Zenodo](https://zenodo.org/records/8007238) and unzip it; you should obtain a file named ```AlphaLink-Multimer_SDA_v3.pt```.

## 3rd step: prepare cross-link input data
As instructed by [AlphaLink2](https://github.com/Rappsilber-Laboratory/AlphaLink2/tree/main), information about cross-linked residues between two proteins, e.g. inter-protein crosslinks A->B at residue pairs 1,50 and 30,80 with an FDR of 20%, should look like:
```
{'protein_A': {'protein_B': [(1, 50, 0.2), (30, 80, 0.2)]}}
```
and intra-protein crosslinks follow the same format:
```
{'protein_A': {'protein_A': [(5, 20, 0.2)]}}
```
The keys in these dictionaries must match the names of the pickle files created by [the first stage of AlphaPulldown](https://github.com/KosinskiLab/AlphaPulldown/blob/main/example_1.md), i.e. you should already have ```protein_A.pkl``` and ```protein_B.pkl``` calculated.

Dictionaries like these should be stored as **```.pkl.gz```** files and provided to AlphaPulldown in the next step. You can use the script from [AlphaLink2](https://github.com/Rappsilber-Laboratory/AlphaLink2/tree/main) to prepare these pickle files.
### **NB** The dictionaries are 0-indexed, i.e., residues start from 0.
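Alternatively, since the input is a plain Python dictionary, a minimal sketch like the one below can write the file directly (the protein names and output filename are illustrative; the keys must match your own feature pickles):

```python
import gzip
import pickle

# Illustrative names; keys must match the feature pickles from
# AlphaPulldown's first stage, e.g. protein_A.pkl and protein_B.pkl.
crosslinks = {
    'protein_A': {
        # Inter-protein links A->B: (residue_in_A, residue_in_B, FDR), 0-indexed
        'protein_B': [(1, 50, 0.2), (30, 80, 0.2)],
        # Intra-protein links use the same format
        'protein_A': [(5, 20, 0.2)],
    }
}

with gzip.open('crosslinks.pkl.gz', 'wb') as f:
    pickle.dump(crosslinks, f)
```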

## 4th step: run AlphaLink2 prediction via AlphaPulldown
Within the same conda environment, run the prediction in e.g. ```custom``` mode:
```bash
run_multimer_jobs.py --mode=custom \
--num_predictions_per_model=1 \
--output_path=/scratch/scratch/user/output/models \
--data_dir=/g/alphafold/AlphaFold_DBs/2.3.0/ \
--protein_lists=custom.txt \
--monomer_objects_dir=/scratch/user/output/features \
--job_index=$SLURM_ARRAY_TASK_ID --alphalink_weight=/scratch/user/alphalink_weights/AlphaLink-Multimer_SDA_v3.pt \
--use_alphalink=True --crosslinks=/path/to/crosslinks.pkl.gz
```
The other modes provided by AlphaPulldown also work in the same way.


