Remove unused code, clean code/tests #404

Merged: 6 commits, Sep 5, 2024
5 changes: 2 additions & 3 deletions alphapulldown/analysis_pipeline/af2_3dmol.py
@@ -2,10 +2,9 @@
# Author: Grzegorz Chojnowski @ EMBL-Hamburg
#

import os, sys, re
import os
import glob

from Bio.PDB import *
from Bio.PDB import PDBIO, PDBParser, Superimposer
from io import StringIO
import re
import py3Dmol
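Note: the wildcard import from Bio.PDB is narrowed to the three classes that appear to be used (PDBIO, PDBParser, Superimposer). A rough, hypothetical sketch of how these Biopython classes work together -- not code from this PR; file names, structure ids and the CA-only atom selection are assumptions:

# Illustrative only -- paths and the CA-only selection are assumptions, not code from af2_3dmol.py.
from Bio.PDB import PDBIO, PDBParser, Superimposer

parser = PDBParser(QUIET=True)
fixed = parser.get_structure("ref", "ranked_0.pdb")
moving = parser.get_structure("model", "ranked_1.pdb")

# Superimpose the second model onto the first using CA atoms.
fixed_ca = [a for a in fixed.get_atoms() if a.get_id() == "CA"]
moving_ca = [a for a in moving.get_atoms() if a.get_id() == "CA"]
sup = Superimposer()
sup.set_atoms(fixed_ca, moving_ca)
sup.apply(list(moving.get_atoms()))

# Write the superimposed model back out.
io = PDBIO()
io.set_structure(moving)
io.save("ranked_1_aligned.pdb")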
5 changes: 0 additions & 5 deletions alphapulldown/analysis_pipeline/create_notebook.py
@@ -4,16 +4,11 @@
# with good
# #

from math import pi
from operator import index
import os
import pickle
from absl import flags, app, logging
import json
import numpy as np
import pandas as pd
import subprocess
import gzip
from analysis_pipeline.utils import obtain_seq_lengths,obtain_pae_and_iptm

flags.DEFINE_string("output_dir", '.', "directory where predicted models are stored")
1 change: 1 addition & 0 deletions alphapulldown/analysis_pipeline/utils.py
@@ -1,3 +1,4 @@
import gzip
import matplotlib.pyplot as plt
from Bio.PDB import PDBParser
from Bio.PDB.Polypeptide import PPBuilder
5 changes: 1 addition & 4 deletions alphapulldown/objects.py
@@ -6,7 +6,6 @@
"""
from absl import logging
logging.set_verbosity(logging.INFO)
import tempfile
import os
import subprocess
import numpy as np
@@ -18,7 +17,7 @@
from alphafold.data import feature_processing
from pathlib import Path as plPath
from typing import List, Dict
from colabfold.batch import unserialize_msa, get_msa_and_templates, msa_to_str, build_monomer_feature
from colabfold.batch import get_msa_and_templates, msa_to_str, build_monomer_feature
from alphapulldown.utils.multimeric_template_utils import (extract_multimeric_template_features_for_single_chain,
prepare_multimeric_template_meta_info)
from alphapulldown.utils.file_handling import temp_fasta_file
@@ -378,10 +377,8 @@ def prepare_individual_sliced_feature_dict(
def concatenate_sliced_feature_dict(self, feature_dicts: list):
"""concatenate regions such as 1-200 + 500-600"""
output_dict = feature_dicts[0]
new_sequence_length = feature_dicts[0]["seq_length"][0]
num_alignment = feature_dicts[0]["num_alignments"][0]
for sub_dict in feature_dicts[1:]:
new_sequence_length += sub_dict["seq_length"][0]
for k in feature_dicts[0].keys():
if sub_dict[k].ndim > 1:
if k == "aatype":
2 changes: 1 addition & 1 deletion alphapulldown/scripts/truncate_pickles.py
@@ -48,7 +48,7 @@ def copy_contents(src_dir, dst_dir, keys_to_exclude, number_of_threads=1):
tasks = []

with ThreadPoolExecutor(max_workers=number_of_threads) as executor:
for root, dirs, files in os.walk(src_dir):
for root, _, files in os.walk(src_dir):
rel_path = os.path.relpath(root, src_dir)
dst_path = os.path.join(dst_dir, rel_path)
os.makedirs(dst_path, exist_ok=True)
11 changes: 0 additions & 11 deletions alphapulldown/utils/file_handling.py
@@ -13,16 +13,6 @@ def temp_fasta_file(sequence_str):
fasta_file.seek(0)
yield fasta_file.name

def ensure_directory_exists(directory):
"""
Ensures that a directory exists. If the directory does not exist, it is created.

Args:
directory (str): The path of the directory to check or create.
"""
if not os.path.exists(directory):
logging.info(f"Creating directory: {directory}")
os.makedirs(directory, exist_ok=True)

def parse_csv_file(csv_path, fasta_paths, mmt_dir, cluster=False):
"""
@@ -36,7 +26,6 @@ def parse_csv_file(csv_path, fasta_paths, mmt_dir, cluster=False):
list: A list of dictionaries, each containing protein data.
"""
protein_names = {}
protein_counters = {}
for fasta_path in fasta_paths:
if not os.path.isfile(fasta_path):
logging.error(f"Fasta file {fasta_path} does not exist.")
173 changes: 2 additions & 171 deletions alphapulldown/utils/modelling_setup.py
@@ -5,19 +5,13 @@

import os
import sys
import random
import pickle
import lzma
import importlib.util
from pathlib import Path
from typing import List,Dict,Union
import numpy as np
import alphafold
from alphafold.data import parsers
from alphafold.data.tools import jackhmmer
from alphafold.model import config
from alphafold.model import model
from alphafold.model import data
from alphafold.data import templates
from alphapulldown.objects import MonomericObject
from os.path import exists,join
@@ -109,8 +103,8 @@ def pad_individual_matrix(v, axis_indexes, shape, nums_to_add):

assembly_num_chains = feature_dict.pop('assembly_num_chains')
num_templates = feature_dict.pop('num_templates')
seq_length = feature_dict.pop('seq_length')
num_alignments = feature_dict.pop('num_alignments')
feature_dict.pop('seq_length')
feature_dict.pop('num_alignments')
original_num_msa , original_num_res = feature_dict['msa'].shape
num_res_to_pad = desired_num_res - original_num_res
num_msa_to_pad = desired_num_msa - original_num_msa
@@ -282,84 +276,6 @@ def load_monomer_objects(monomer_dir_dict, protein_name):
return monomer


def read_all_proteins(fasta_path) -> list:
"""
A function to read all proteins in the file

Args:
fasta_path: path to the fasta file where all proteins are in one file
"""
all_proteins = []
with open(fasta_path, "r") as f:
lines = list(f.readlines())
if any(l.startswith(">") for l in lines):
# this mean the file is a fasta file
with open(fasta_path, "r") as input_file:
sequences, descriptions = parsers.parse_fasta(input_file.read())
for desc in descriptions:
all_proteins.append({desc: "all"})
else:
for l in lines:
if len(l.strip()) > 0:
curr_list = l.rstrip().split(",")
if len(curr_list) == 1:
all_proteins.append({l.rstrip().split(",")[0]: "all"})

elif len(curr_list) > 1:
protein_name = curr_list[0]
regions = curr_list[1:]
output_region = []
for r in regions:
output_region.append(
(int(r.split("-")[0]), int(r.split("-")[1]))
)
all_proteins.append({protein_name: output_region})
return all_proteins


def obtain_region(input_string):
"""
A function that extract regions from the input string

Args
input_string: format is 'protein_n,1-100,2-200'
or 'protein_n'
"""
curr_list = input_string.split(",")
if len(curr_list) == 1:
return {input_string.rstrip().split(",")[0]: "all"}

elif len(curr_list) > 1:
protein_name = curr_list[0]
regions = curr_list[1:]
output_region = []
for r in regions:
output_region.append((int(r.split("-")[0]), int(r.split("-")[1])))
return {protein_name: output_region}


def read_custom(line) -> list:
"""
A function to input file under the mode: custom

Args:
line: each individual line in the custom input file
"""
all_proteins = []
curr_list = line.rstrip().split(";")
for substring in curr_list:
curr_protein = obtain_region(substring)
all_proteins.append(curr_protein)

return all_proteins


def check_existing_objects(output_dir, pickle_name):
"""check whether the wanted monomer object already exists in the output_dir"""
logging.info(f"checking if {os.path.join(output_dir, pickle_name)} already exists")
return os.path.isfile(os.path.join(output_dir, pickle_name))


def create_interactors(data : List[Dict[str, List[str]]],
monomer_objects_dir : List[str], i : int = 0) -> List[List[Union[MonomericObject, ChoppedObject]]]:
"""
@@ -401,88 +317,3 @@ def process_each_dict(data,monomer_objects_dir):
interactors.append(process_each_dict(d, monomer_objects_dir))
return interactors


def check_output_dir(path):
"""
A function to automatically the output directory provided by the user
if the user hasn't already created the directory
"""
logging.info(f"checking if output_dir exists {path}")
if not os.path.isdir(path):
Path(path).mkdir(parents=True, exist_ok=True)


def compute_msa_ranges(num_msa, num_extra_msa, num_multimer_predictions):
"""
Denser for smaller num_msa, sparser for larger num_msa
"""
msa_ranges = np.rint(np.logspace(np.log10(16), np.log10(num_msa),
num_multimer_predictions)).astype(int).tolist()
extra_msa_ranges = np.rint(np.logspace(np.log10(32), np.log10(num_extra_msa),
num_multimer_predictions)).astype(int).tolist()
return msa_ranges, extra_msa_ranges


def update_model_config(model_config, num_msa, num_extra_msa):
embeddings_and_evo = model_config["model"]["embeddings_and_evoformer"]
embeddings_and_evo.update({"num_msa": num_msa, "num_extra_msa": num_extra_msa})


def create_model_runners_and_random_seed(
model_preset, num_cycle, random_seed, data_dir,
num_multimer_predictions_per_model,
gradient_msa_depth=False, model_names_custom=None,
msa_depth=None):
num_ensemble = 1
model_runners = {}
model_names = config.MODEL_PRESETS[model_preset]

if model_names_custom:
model_names_custom = tuple(model_names_custom.split(","))
if all(x in model_names for x in model_names_custom):
model_names = model_names_custom
else:
raise Exception(f"Provided model names {model_names_custom} not part of available {model_names}")

for model_name in model_names:
model_config = config.model_config(model_name)
model_config.model.num_ensemble_eval = num_ensemble
model_config["model"].update({"num_recycle": num_cycle})

model_params = data.get_model_haiku_params(model_name=model_name, data_dir=data_dir)
model_runner = model.RunModel(model_config, model_params)

if gradient_msa_depth or msa_depth:
num_msa, num_extra_msa = get_default_msa(model_config)
msa_ranges, extra_msa_ranges = compute_msa_ranges(num_msa, num_extra_msa,
num_multimer_predictions_per_model)

for i in range(num_multimer_predictions_per_model):
if msa_depth or gradient_msa_depth:
if msa_depth:
num_msa = int(msa_depth)
num_extra_msa = num_msa * 4 # approx. 4x the number of msa, as in the AF2 config file
elif gradient_msa_depth:
num_msa = msa_ranges[i]
num_extra_msa = extra_msa_ranges[i]
update_model_config(model_config, num_msa, num_extra_msa)
logging.info(
f"Model {model_name} is running {i} prediction with num_msa={num_msa} "
f"and num_extra_msa={num_extra_msa}")
model_runners[f"{model_name}_pred_{i}_msa_{num_msa}"] = model_runner
#model_runners[f"{model_name}_pred_{i}"] = model_runner
else:
logging.info(
f"Model {model_name} is running {i} prediction with default MSA depth")
model_runners[f"{model_name}_pred_{i}"] = model_runner

if random_seed is None:
random_seed = random.randrange(sys.maxsize // len(model_runners))
logging.info("Using random seed %d for the data pipeline", random_seed)

return model_runners, random_seed


def get_default_msa(model_config):
embeddings_and_evo = model_config["model"]["embeddings_and_evoformer"]
return embeddings_and_evo["num_msa"], embeddings_and_evo["num_extra_msa"]
18 changes: 0 additions & 18 deletions alphapulldown/utils/save_meta_data.py
@@ -4,16 +4,13 @@
#
from alphapulldown import __version__ as AP_VERSION
from alphafold.version import __version__ as AF_VERSION
import json
import os
from absl import logging
from alphapulldown.utils.file_handling import ensure_directory_exists
import subprocess
import datetime
import re
import hashlib
import glob
import contextlib


COMMON_PATTERNS = [
@@ -188,18 +185,3 @@ def get_hash(filename):
for byte_block in iter(lambda: f.read(4096), b""):
md5_hash.update(byte_block)
return (md5_hash.hexdigest())

@contextlib.contextmanager
def output_meta_file(file_path):
"""
A context manager that ensures the directory for a file exists and then opens the file for writing.

Args:
file_path (str): The path of the file to be opened.

Yields:
Generator[str]: The name of the file opened.
"""
ensure_directory_exists(os.path.dirname(file_path))
with open(file_path, "w") as outfile:
yield outfile.name
4 changes: 4 additions & 0 deletions pytest.ini
@@ -0,0 +1,4 @@
[pytest]
log_cli = true
log_level = INFO
norecursedirs = test/alphalink
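
The new pytest.ini enables live log output at INFO level (log_cli, log_level) and excludes test/alphalink from test collection (norecursedirs). A minimal, hypothetical test file, not part of this PR, showing what the live-logging settings do in practice:

# test_logging_demo.py -- hypothetical example, not part of this PR.
# With log_cli = true and log_level = INFO, the INFO record below is streamed
# to the terminal while the test runs instead of only appearing in a failure report.
import logging

logger = logging.getLogger(__name__)

def test_info_logging_is_visible():
    logger.info("visible in the console because live logging is enabled")
    assert True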
8 changes: 4 additions & 4 deletions test/check_predict_structure.py
@@ -64,11 +64,11 @@ def _runCommonTests(self, result, multimer_mode, dirname=None):
example_pickle = pickles[0]
example_pickle = pickle.load(open(os.path.join(self.output_dir, dirname, example_pickle), 'rb'))

required_keys_multimer = ['distogram', 'experimentally_resolved', 'masked_msa', 'predicted_aligned_error',
'predicted_lddt', 'structure_module', 'plddt', 'aligned_confidence_probs',
required_keys_multimer = ['experimentally_resolved', 'predicted_aligned_error',
'predicted_lddt', 'structure_module', 'plddt',
'max_predicted_aligned_error', 'seqs', 'iptm', 'ptm', 'ranking_confidence']
required_keys_monomer = ['distogram', 'experimentally_resolved', 'masked_msa', 'predicted_aligned_error',
'predicted_lddt', 'structure_module', 'plddt', 'aligned_confidence_probs',
required_keys_monomer = ['experimentally_resolved', 'predicted_aligned_error',
'predicted_lddt', 'structure_module', 'plddt',
'max_predicted_aligned_error', 'seqs', 'ptm', 'ranking_confidence']

required_keys = required_keys_multimer if multimer_mode else required_keys_monomer
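
The required-key lists for result pickles no longer include distogram, masked_msa or aligned_confidence_probs. A small, hypothetical helper, not part of this PR, for checking which of the expected keys a given result pickle actually contains; the path in the usage comment is illustrative:

# Hypothetical snippet, not part of this PR: report required keys missing from
# a prediction result pickle. The key list mirrors required_keys_monomer above.
import pickle

REQUIRED_KEYS_MONOMER = ['experimentally_resolved', 'predicted_aligned_error',
                         'predicted_lddt', 'structure_module', 'plddt',
                         'max_predicted_aligned_error', 'seqs', 'ptm', 'ranking_confidence']

def missing_keys(pickle_path, required_keys=REQUIRED_KEYS_MONOMER):
    with open(pickle_path, "rb") as fh:
        result = pickle.load(fh)
    return [key for key in required_keys if key not in result]

# Example (illustrative path): print(missing_keys("output_dir/protein_name/result_model_1_pred_0.pkl"))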