Remove unused code, clean code/tests #404

Merged: 6 commits, Sep 5, 2024
5 changes: 2 additions & 3 deletions alphapulldown/analysis_pipeline/af2_3dmol.py
@@ -2,10 +2,9 @@
# Author: Grzegorz Chojnowski @ EMBL-Hamburg
#

import os, sys, re
import os
import glob

from Bio.PDB import *
from Bio.PDB import PDBIO, PDBParser, Superimposer
from io import StringIO
import re
import py3Dmol
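Note: the wildcard import from Bio.PDB is narrowed to the three classes that appear to be used (PDBIO, PDBParser, Superimposer). A rough, hypothetical sketch of how these Biopython classes work together -- not code from this PR; file names, structure ids and the CA-only atom selection are assumptions:

# Illustrative only -- paths and the CA-only selection are assumptions, not code from af2_3dmol.py.
from Bio.PDB import PDBIO, PDBParser, Superimposer

parser = PDBParser(QUIET=True)
fixed = parser.get_structure("ref", "ranked_0.pdb")
moving = parser.get_structure("model", "ranked_1.pdb")

# Superimpose the second model onto the first using CA atoms.
fixed_ca = [a for a in fixed.get_atoms() if a.get_id() == "CA"]
moving_ca = [a for a in moving.get_atoms() if a.get_id() == "CA"]
sup = Superimposer()
sup.set_atoms(fixed_ca, moving_ca)
sup.apply(list(moving.get_atoms()))

# Write the superimposed model back out.
io = PDBIO()
io.set_structure(moving)
io.save("ranked_1_aligned.pdb")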
5 changes: 0 additions & 5 deletions alphapulldown/analysis_pipeline/create_notebook.py
@@ -4,16 +4,11 @@
# with good
# #

from math import pi
from operator import index
import os
import pickle
from absl import flags, app, logging
import json
import numpy as np
import pandas as pd
import subprocess
import gzip
from analysis_pipeline.utils import obtain_seq_lengths,obtain_pae_and_iptm

flags.DEFINE_string("output_dir", '.', "directory where predicted models are stored")
1 change: 1 addition & 0 deletions alphapulldown/analysis_pipeline/utils.py
@@ -1,3 +1,4 @@
import gzip
import matplotlib.pyplot as plt
from Bio.PDB import PDBParser
from Bio.PDB.Polypeptide import PPBuilder
5 changes: 1 addition & 4 deletions alphapulldown/objects.py
@@ -6,7 +6,6 @@
"""
from absl import logging
logging.set_verbosity(logging.INFO)
import tempfile
import os
import subprocess
import numpy as np
@@ -18,7 +17,7 @@
from alphafold.data import feature_processing
from pathlib import Path as plPath
from typing import List, Dict
from colabfold.batch import unserialize_msa, get_msa_and_templates, msa_to_str, build_monomer_feature
from colabfold.batch import get_msa_and_templates, msa_to_str, build_monomer_feature
from alphapulldown.utils.multimeric_template_utils import (extract_multimeric_template_features_for_single_chain,
prepare_multimeric_template_meta_info)
from alphapulldown.utils.file_handling import temp_fasta_file
@@ -378,10 +377,8 @@ def prepare_individual_sliced_feature_dict(
def concatenate_sliced_feature_dict(self, feature_dicts: list):
"""concatenate regions such as 1-200 + 500-600"""
output_dict = feature_dicts[0]
new_sequence_length = feature_dicts[0]["seq_length"][0]
num_alignment = feature_dicts[0]["num_alignments"][0]
for sub_dict in feature_dicts[1:]:
new_sequence_length += sub_dict["seq_length"][0]
for k in feature_dicts[0].keys():
if sub_dict[k].ndim > 1:
if k == "aatype":
2 changes: 1 addition & 1 deletion alphapulldown/scripts/truncate_pickles.py
@@ -48,7 +48,7 @@ def copy_contents(src_dir, dst_dir, keys_to_exclude, number_of_threads=1):
tasks = []

with ThreadPoolExecutor(max_workers=number_of_threads) as executor:
for root, dirs, files in os.walk(src_dir):
for root, _, files in os.walk(src_dir):
rel_path = os.path.relpath(root, src_dir)
dst_path = os.path.join(dst_dir, rel_path)
os.makedirs(dst_path, exist_ok=True)
11 changes: 0 additions & 11 deletions alphapulldown/utils/file_handling.py
@@ -13,16 +13,6 @@ def temp_fasta_file(sequence_str):
fasta_file.seek(0)
yield fasta_file.name

def ensure_directory_exists(directory):
"""
Ensures that a directory exists. If the directory does not exist, it is created.

Args:
directory (str): The path of the directory to check or create.
"""
if not os.path.exists(directory):
logging.info(f"Creating directory: {directory}")
os.makedirs(directory, exist_ok=True)

def parse_csv_file(csv_path, fasta_paths, mmt_dir, cluster=False):
"""
@@ -36,7 +26,6 @@ def parse_csv_file(csv_path, fasta_paths, mmt_dir, cluster=False):
list: A list of dictionaries, each containing protein data.
"""
protein_names = {}
protein_counters = {}
for fasta_path in fasta_paths:
if not os.path.isfile(fasta_path):
logging.error(f"Fasta file {fasta_path} does not exist.")
173 changes: 2 additions & 171 deletions alphapulldown/utils/modelling_setup.py
@@ -5,19 +5,13 @@

import os
import sys
import random
import pickle
import lzma
import importlib.util
from pathlib import Path
from typing import List,Dict,Union
import numpy as np
import alphafold
from alphafold.data import parsers
from alphafold.data.tools import jackhmmer
from alphafold.model import config
from alphafold.model import model
from alphafold.model import data
from alphafold.data import templates
from alphapulldown.objects import MonomericObject
from os.path import exists,join
@@ -109,8 +103,8 @@ def pad_individual_matrix(v, axis_indexes, shape, nums_to_add):

assembly_num_chains = feature_dict.pop('assembly_num_chains')
num_templates = feature_dict.pop('num_templates')
seq_length = feature_dict.pop('seq_length')
num_alignments = feature_dict.pop('num_alignments')
feature_dict.pop('seq_length')
feature_dict.pop('num_alignments')
original_num_msa , original_num_res = feature_dict['msa'].shape
num_res_to_pad = desired_num_res - original_num_res
num_msa_to_pad = desired_num_msa - original_num_msa
@@ -282,84 +276,6 @@ def load_monomer_objects(monomer_dir_dict, protein_name):
return monomer


def read_all_proteins(fasta_path) -> list:
"""
A function to read all proteins in the file

Args:
fasta_path: path to the fasta file where all proteins are in one file
"""
all_proteins = []
with open(fasta_path, "r") as f:
lines = list(f.readlines())
if any(l.startswith(">") for l in lines):
# this mean the file is a fasta file
with open(fasta_path, "r") as input_file:
sequences, descriptions = parsers.parse_fasta(input_file.read())
for desc in descriptions:
all_proteins.append({desc: "all"})
else:
for l in lines:
if len(l.strip()) > 0:
curr_list = l.rstrip().split(",")
if len(curr_list) == 1:
all_proteins.append({l.rstrip().split(",")[0]: "all"})

elif len(curr_list) > 1:
protein_name = curr_list[0]
regions = curr_list[1:]
output_region = []
for r in regions:
output_region.append(
(int(r.split("-")[0]), int(r.split("-")[1]))
)
all_proteins.append({protein_name: output_region})
return all_proteins


def obtain_region(input_string):
"""
A function that extract regions from the input string

Args
input_string: format is 'protein_n,1-100,2-200'
or 'protein_n'
"""
curr_list = input_string.split(",")
if len(curr_list) == 1:
return {input_string.rstrip().split(",")[0]: "all"}

elif len(curr_list) > 1:
protein_name = curr_list[0]
regions = curr_list[1:]
output_region = []
for r in regions:
output_region.append((int(r.split("-")[0]), int(r.split("-")[1])))
return {protein_name: output_region}


def read_custom(line) -> list:
"""
A function to input file under the mode: custom

Args:
line: each individual line in the custom input file
"""
all_proteins = []
curr_list = line.rstrip().split(";")
for substring in curr_list:
curr_protein = obtain_region(substring)
all_proteins.append(curr_protein)

return all_proteins


def check_existing_objects(output_dir, pickle_name):
"""check whether the wanted monomer object already exists in the output_dir"""
logging.info(f"checking if {os.path.join(output_dir, pickle_name)} already exists")
return os.path.isfile(os.path.join(output_dir, pickle_name))


def create_interactors(data : List[Dict[str, List[str]]],
monomer_objects_dir : List[str], i : int = 0) -> List[List[Union[MonomericObject, ChoppedObject]]]:
"""
@@ -401,88 +317,3 @@ def process_each_dict(data,monomer_objects_dir):
interactors.append(process_each_dict(d, monomer_objects_dir))
return interactors


def check_output_dir(path):
"""
A function to automatically the output directory provided by the user
if the user hasn't already created the directory
"""
logging.info(f"checking if output_dir exists {path}")
if not os.path.isdir(path):
Path(path).mkdir(parents=True, exist_ok=True)


def compute_msa_ranges(num_msa, num_extra_msa, num_multimer_predictions):
"""
Denser for smaller num_msa, sparser for larger num_msa
"""
msa_ranges = np.rint(np.logspace(np.log10(16), np.log10(num_msa),
num_multimer_predictions)).astype(int).tolist()
extra_msa_ranges = np.rint(np.logspace(np.log10(32), np.log10(num_extra_msa),
num_multimer_predictions)).astype(int).tolist()
return msa_ranges, extra_msa_ranges


def update_model_config(model_config, num_msa, num_extra_msa):
embeddings_and_evo = model_config["model"]["embeddings_and_evoformer"]
embeddings_and_evo.update({"num_msa": num_msa, "num_extra_msa": num_extra_msa})


def create_model_runners_and_random_seed(
model_preset, num_cycle, random_seed, data_dir,
num_multimer_predictions_per_model,
gradient_msa_depth=False, model_names_custom=None,
msa_depth=None):
num_ensemble = 1
model_runners = {}
model_names = config.MODEL_PRESETS[model_preset]

if model_names_custom:
model_names_custom = tuple(model_names_custom.split(","))
if all(x in model_names for x in model_names_custom):
model_names = model_names_custom
else:
raise Exception(f"Provided model names {model_names_custom} not part of available {model_names}")

for model_name in model_names:
model_config = config.model_config(model_name)
model_config.model.num_ensemble_eval = num_ensemble
model_config["model"].update({"num_recycle": num_cycle})

model_params = data.get_model_haiku_params(model_name=model_name, data_dir=data_dir)
model_runner = model.RunModel(model_config, model_params)

if gradient_msa_depth or msa_depth:
num_msa, num_extra_msa = get_default_msa(model_config)
msa_ranges, extra_msa_ranges = compute_msa_ranges(num_msa, num_extra_msa,
num_multimer_predictions_per_model)

for i in range(num_multimer_predictions_per_model):
if msa_depth or gradient_msa_depth:
if msa_depth:
num_msa = int(msa_depth)
num_extra_msa = num_msa * 4 # approx. 4x the number of msa, as in the AF2 config file
elif gradient_msa_depth:
num_msa = msa_ranges[i]
num_extra_msa = extra_msa_ranges[i]
update_model_config(model_config, num_msa, num_extra_msa)
logging.info(
f"Model {model_name} is running {i} prediction with num_msa={num_msa} "
f"and num_extra_msa={num_extra_msa}")
model_runners[f"{model_name}_pred_{i}_msa_{num_msa}"] = model_runner
#model_runners[f"{model_name}_pred_{i}"] = model_runner
else:
logging.info(
f"Model {model_name} is running {i} prediction with default MSA depth")
model_runners[f"{model_name}_pred_{i}"] = model_runner

if random_seed is None:
random_seed = random.randrange(sys.maxsize // len(model_runners))
logging.info("Using random seed %d for the data pipeline", random_seed)

return model_runners, random_seed


def get_default_msa(model_config):
embeddings_and_evo = model_config["model"]["embeddings_and_evoformer"]
return embeddings_and_evo["num_msa"], embeddings_and_evo["num_extra_msa"]
18 changes: 0 additions & 18 deletions alphapulldown/utils/save_meta_data.py
@@ -4,16 +4,13 @@
#
from alphapulldown import __version__ as AP_VERSION
from alphafold.version import __version__ as AF_VERSION
import json
import os
from absl import logging
from alphapulldown.utils.file_handling import ensure_directory_exists
import subprocess
import datetime
import re
import hashlib
import glob
import contextlib


COMMON_PATTERNS = [
@@ -188,18 +185,3 @@ def get_hash(filename):
for byte_block in iter(lambda: f.read(4096), b""):
md5_hash.update(byte_block)
return (md5_hash.hexdigest())

@contextlib.contextmanager
def output_meta_file(file_path):
"""
A context manager that ensures the directory for a file exists and then opens the file for writing.

Args:
file_path (str): The path of the file to be opened.

Yields:
Generator[str]: The name of the file opened.
"""
ensure_directory_exists(os.path.dirname(file_path))
with open(file_path, "w") as outfile:
yield outfile.name
4 changes: 4 additions & 0 deletions pytest.ini
@@ -0,0 +1,4 @@
[pytest]
log_cli = true
log_level = INFO
norecursedirs = test/alphalink
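
The new pytest.ini enables live log output at INFO level (log_cli, log_level) and excludes test/alphalink from test collection (norecursedirs). A minimal, hypothetical test file, not part of this PR, showing what the live-logging settings do in practice:

# test_logging_demo.py -- hypothetical example, not part of this PR.
# With log_cli = true and log_level = INFO, the INFO record below is streamed
# to the terminal while the test runs instead of only appearing in a failure report.
import logging

logger = logging.getLogger(__name__)

def test_info_logging_is_visible():
    logger.info("visible in the console because live logging is enabled")
    assert True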
8 changes: 4 additions & 4 deletions test/check_predict_structure.py
@@ -64,11 +64,11 @@ def _runCommonTests(self, result, multimer_mode, dirname=None):
example_pickle = pickles[0]
example_pickle = pickle.load(open(os.path.join(self.output_dir, dirname, example_pickle), 'rb'))

required_keys_multimer = ['distogram', 'experimentally_resolved', 'masked_msa', 'predicted_aligned_error',
'predicted_lddt', 'structure_module', 'plddt', 'aligned_confidence_probs',
required_keys_multimer = ['experimentally_resolved', 'predicted_aligned_error',
'predicted_lddt', 'structure_module', 'plddt',
'max_predicted_aligned_error', 'seqs', 'iptm', 'ptm', 'ranking_confidence']
required_keys_monomer = ['distogram', 'experimentally_resolved', 'masked_msa', 'predicted_aligned_error',
'predicted_lddt', 'structure_module', 'plddt', 'aligned_confidence_probs',
required_keys_monomer = ['experimentally_resolved', 'predicted_aligned_error',
'predicted_lddt', 'structure_module', 'plddt',
'max_predicted_aligned_error', 'seqs', 'ptm', 'ranking_confidence']

required_keys = required_keys_multimer if multimer_mode else required_keys_monomer
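
The required-key lists for result pickles no longer include distogram, masked_msa or aligned_confidence_probs. A small, hypothetical helper, not part of this PR, for checking which of the expected keys a given result pickle actually contains; the path in the usage comment is illustrative:

# Hypothetical snippet, not part of this PR: report required keys missing from
# a prediction result pickle. The key list mirrors required_keys_monomer above.
import pickle

REQUIRED_KEYS_MONOMER = ['experimentally_resolved', 'predicted_aligned_error',
                         'predicted_lddt', 'structure_module', 'plddt',
                         'max_predicted_aligned_error', 'seqs', 'ptm', 'ranking_confidence']

def missing_keys(pickle_path, required_keys=REQUIRED_KEYS_MONOMER):
    with open(pickle_path, "rb") as fh:
        result = pickle.load(fh)
    return [key for key in required_keys if key not in result]

# Example (illustrative path): print(missing_keys("output_dir/protein_name/result_model_1_pred_0.pkl"))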