Merge pull request #190 from KosinskiLab/TrueMultimer

True multimer
KosinskiLab · Nov 15, 2023 · fbc42bd · fbc42bd
2 parents 4e0ec78 + 0ed62cd
commit fbc42bd
Show file tree

Hide file tree

Showing 54 changed files with 712,455 additions and 672 deletions.
diff --git a/Developing.md b/Developing.md
@@ -18,17 +18,15 @@
 1. Test your package during development using tests in ```test/```, e.g.:
    ```
    pip install pytest
-   pytest
-   pytest test
-   python test/test_predict_structure.py
-   sbatch test/test_predict_structure.sh
-   python -m unittest test/test_predict_structure.<name of the test>
+   pytest -s test/
+   pytest -s test/test_predictions_slurm.py
+   pytest -s test/test_features_with_templates.py::TestCreateIndividualFeaturesWithTemplates::test_1a_run_features_generation
    ```
 1. Before pushing to the remote or submitting pull request
     ```
     pip install .
-    pytest test
+    pytest -s test/
     ```
-    to install the package and test
+    to install the package and run the tests. The prediction tests only work if Slurm is available. Check the log files created in your current directory.
     
     
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,2 +1 @@
-include ./alphafold/run_alphafold.py
-include stereo_chemical_props.txt
+include stereo_chemical_props.txt
diff --git a/README.md b/README.md
@@ -55,7 +55,8 @@ conda create -n AlphaPulldown -c omnia -c bioconda -c conda-forge python==3.10 o
 **Secondly**, activate the AlphaPulldown environment and install AlphaPulldown
 ```bash
 source activate AlphaPulldown
-python3 -m pip install alphapulldown==0.40.4
+
+python3 -m pip install alphapulldown==1.0.0
 pip install jax==0.3.25 jaxlib==0.3.25+cuda11.cudnn805 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
 ```
 

diff --git a/alphapulldown/__init__.py b/alphapulldown/__init__.py
@@ -1 +1 @@
-__version__ = "1.00.0"
+__version__ = "1.0.0"
diff --git a/alphapulldown/create_custom_template_db.py b/alphapulldown/create_custom_template_db.py
@@ -10,13 +10,13 @@
 
 import os
 import shutil
-import sys
+import random
+import string
 from pathlib import Path
 from absl import logging, flags, app
 from alphapulldown.remove_clashes_low_plddt import MmcifChainFiltered
-from colabfold.batch import validate_and_fix_mmcif, convert_pdb_to_mmcif
+from colabfold.batch import validate_and_fix_mmcif
 from alphafold.common.protein import _from_bio_structure, to_mmcif
-from Bio import SeqIO, PDB
 
 FLAGS = flags.FLAGS
 
@@ -47,10 +47,11 @@ def parse_code(template):
         for line in f:
             if line.startswith("_entry.id"):
                 code = line.split()[1]
-                if len(code) != 4:
-                    logging.error(f'Error for template {template}!\n'
-                                  f'Code must have 4 characters but is {code}\n')
-                    sys.exit(1)
+
+    # Generate a random 4-character code if needed
+    if len(code) != 4:
+        code = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(4))
+
     return code.lower()
 
 
@@ -90,40 +91,6 @@ def create_tree(pdb_mmcif_dir, mmcif_dir, seqres_dir, templates_dir):
     create_dir_and_remove_files(seqres_dir, ['pdb_seqres.txt'])
 
 
-def extract_seqs(template, chain_id):
-    """
-    Extract sequences from PDB/CIF file using Bio.SeqIO.
-    o input_file_path - path to the input file
-    o chain_id - chain ID
-    Returns:
-        o sequence_atom - sequence from ATOM records
-        o sequence_seqres - sequence from SEQRES records
-    """
-    file_type = template.suffix.lower()
-
-    if template.suffix.lower() != '.pdb' and template.suffix.lower() != '.cif':
-        raise ValueError(f"Unknown file type for {template}!")
-
-    format_types = [f"{file_type[1:]}-atom", f"{file_type[1:]}-seqres"]
-    # initialize the sequences
-    sequence_atom = None
-    sequence_seqres = None
-    # parse
-    for format_type in format_types:
-        for record in SeqIO.parse(template, format_type):
-            chain = record.annotations['chain']
-            if chain == chain_id:
-                if format_type.endswith('atom'):
-                    sequence_atom = str(record.seq)
-                elif format_type.endswith('seqres'):
-                    sequence_seqres = str(record.seq)
-    if sequence_atom is None:
-        logging.error(f"No atom sequence found for chain {chain_id}")
-    if sequence_seqres is None:
-        logging.warning(f"No SEQRES sequence found for chain {chain_id}")
-    return sequence_atom, sequence_seqres
-
-
 def create_db(out_path, templates, chains, threshold_clashes, hb_allowance, plddt_threshold):
     """
     Main function that creates a custom template database for AlphaFold2
@@ -146,30 +113,20 @@ def create_db(out_path, templates, chains, threshold_clashes, hb_allowance, pldd
     # Process each template/chain pair
     for template, chain_id in zip(templates, chains):
         code = parse_code(template)
+        logging.info(f"Template code: {code}")
+        assert len(code) == 4
         # Copy the template to out_path to avoid conflicts with the same file names
-        shutil.copyfile(template, templates_dir / Path(template).name)
-        template = templates_dir / Path(template).name
-        logging.info(f"Processing template: {template}  Chain {chain_id} Code: {code}")
-        logging.info("Parsing SEQRES...")
-        atom_seq, seqres_seq = None, None
-        if template.suffix == '.pdb':
-            atom_seq, seqres_seq = extract_seqs(template, chain_id)
-            logging.info(f"Converting to mmCIF: {template}")
-            template = Path(template)
-            convert_pdb_to_mmcif(template)
-            template = template.parent.joinpath(f"{template.stem}.cif")
+        new_template = templates_dir / Path(code + Path(template).suffix)
+        shutil.copyfile(template, new_template)
+        template = new_template
+        logging.info(f"Processing template: {template}  Chain {chain_id}")
         # Convert to (our) mmcif object
         mmcif_obj = MmcifChainFiltered(template, code, chain_id)
-        # Parse SEQRES
+        # full sequence is either SEQRES or parsed from (original) ATOMs
         if mmcif_obj.sequence_seqres:
             seqres = mmcif_obj.sequence_seqres
         else:
             seqres = mmcif_obj.sequence_atom
-        # if we converted from pdb, seqres is parsed from Bio.SeqIO
-        if seqres_seq or atom_seq:
-            seqres = seqres_seq
-            if seqres is None:
-                seqres = atom_seq
         sqrres_path = save_seqres(code, chain_id, seqres, seqres_dir)
         logging.info(f"SEQRES saved to {sqrres_path}!")
         # Remove clashes and low pLDDT regions for each template

diff --git a/alphapulldown/create_individual_features.py b/alphapulldown/create_individual_features.py
@@ -4,59 +4,27 @@
 # This script is just to create msa and structural features for each sequences and store them in pickle
 # #
 
-import os
-import pickle
-import sys
 from alphapulldown.objects import MonomericObject
-import importlib
-from absl import app
-from absl import flags
-from absl import logging
-
 from alphafold.data.pipeline import DataPipeline
 from alphafold.data.tools import hmmsearch
 from alphafold.data import templates
-import numpy as np
-import os
 from absl import logging, app
-import numpy as np
-from alphapulldown.utils import *
+from alphapulldown.utils import save_meta_data, create_uniprot_runner, parse_fasta, get_flags_from_af
 import contextlib
 from datetime import datetime
-import alphafold
 from pathlib import Path
 from colabfold.utils import DEFAULT_API_SERVER
+import os
+import sys
+import pickle
 
 @contextlib.contextmanager
 def output_meta_file(file_path):
     """function that create temp file"""
     with open(file_path, "w") as outfile:
         yield outfile.name
 
-
-def load_module(file_name, module_name):
-    spec = importlib.util.spec_from_file_location(module_name, file_name)
-    module = importlib.util.module_from_spec(spec)
-    sys.modules[module_name] = module
-    spec.loader.exec_module(module)
-    return module
-
-
-PATH_TO_RUN_ALPHAFOLD = os.path.join(
-    os.path.dirname(alphafold.__file__), "run_alphafold.py"
-)
-
-try:
-    run_af = load_module(PATH_TO_RUN_ALPHAFOLD, "run_alphafold")
-except FileNotFoundError:
-    PATH_TO_RUN_ALPHAFOLD = os.path.join(
-        os.path.dirname(os.path.dirname(alphafold.__file__)), "run_alphafold.py"
-    )
-
-    run_af = load_module(PATH_TO_RUN_ALPHAFOLD, "run_alphafold")
-
-
-flags = run_af.flags
+flags = get_flags_from_af()
 flags.DEFINE_bool("save_msa_files", False, "save msa output or not")
 flags.DEFINE_bool(
     "skip_existing", False, "skip existing monomer feature pickles or not"
@@ -221,11 +189,7 @@ def create_and_save_monomer_objects(m, pipeline, flags_dict,use_mmseqs2=False):
         else:
             logging.info("running mmseq now")
             m.make_mmseq_features(DEFAULT_API_SERVER=DEFAULT_API_SERVER,
-            pdb70_database_path=pdb70_database_path,
-            template_mmcif_dir=template_mmcif_dir,
-            max_template_date=FLAGS.max_template_date,
-            output_dir=FLAGS.output_dir,
-            obsolete_pdbs_path=FLAGS.obsolete_pdbs_path
+                                  pipeline=pipeline,output_dir=FLAGS.output_dir
             )
         pickle.dump(m, open(f"{FLAGS.output_dir}/{m.description}.pkl", "wb"))
         del m
@@ -264,8 +228,7 @@ def main(argv):
                 )
                 sys.exit()
     else:
-
-        pipeline=None
+        pipeline = create_pipeline()
         uniprot_runner=None
         flags_dict=FLAGS.flag_values_dict()