-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathreduce_actives_analogue_bias.py
78 lines (70 loc) · 3.57 KB
/
reduce_actives_analogue_bias.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pandas as pd
import os
from utils import log, load_smiles
from paths_and_settings import *
from typing import Union
def load_activities():
    """Load and merge the activity tables found in ``STANDARDS_FOLDER``.

    Every regular file in ``STANDARDS_FOLDER`` whose name contains
    ``'compound'`` is read as CSV. Its ``'Value'`` column is converted with
    ``pd.to_numeric`` and only rows with ``'Relation' == '='`` and
    ``'Value'`` strictly below ``ACTIVITY_VALUE_THRESHOLD`` are kept.

    Returns:
        A single DataFrame with the kept rows of all files, re-indexed from 0.

    Raises:
        ValueError: from ``pd.concat`` when no matching file exists, or from
            ``pd.to_numeric`` when a ``'Value'`` entry cannot be parsed.
    """
    # Collect the candidate paths first so the directory is listed once.
    compound_files = []
    for entry in os.listdir(STANDARDS_FOLDER):
        candidate = join(STANDARDS_FOLDER, entry)
        if 'compound' in entry and os.path.isfile(candidate):
            compound_files.append(candidate)

    frames = []
    for csv_path in compound_files:
        table = pd.read_csv(csv_path)
        table['Value'] = pd.to_numeric(table['Value'])
        # Keep exact measurements only, below the activity threshold.
        is_exact = table['Relation'] == '='
        is_below_threshold = table['Value'] < ACTIVITY_VALUE_THRESHOLD
        frames.append(table[is_exact & is_below_threshold])

    return pd.concat(frames, ignore_index=True)
def get_ligands_best_activity(ligand_id: str, acts_dataframe: pd.DataFrame):
    """Return the smallest ``'Value'`` recorded for one ligand.

    Args:
        ligand_id: Identifier matched against the ``'ID_compound'`` column.
        acts_dataframe: Activity table with ``'ID_compound'`` and ``'Value'``
            columns.

    Returns:
        The minimum of the ``'Value'`` entries of the matching rows.

    Raises:
        ValueError: if no row matches ``ligand_id`` (``min`` of an empty list).
    """
    belongs_to_ligand = acts_dataframe['ID_compound'] == ligand_id
    ligand_values = acts_dataframe.loc[belongs_to_ligand, 'Value']
    return min(ligand_values.to_list())
def choose_best_actives(target_id: str, path_to_analog_matrix: str, acts_dataframe: pd.DataFrame, tc_threshold=ACTIVES_TC_SIMILARITY_THRESHOLD):
    """Select actives of a target such that no two kept ligands are analogues.

    The similarity matrix is read from ``path_to_analog_matrix`` (first CSV
    column is the row index). Two ligands count as analogues when the larger
    of their two matrix entries (row/column and column/row) is at least
    ``tc_threshold``. For every analogue pair encountered, one ligand is
    discarded based on ``get_ligands_best_activity`` for this target.

    Pairs are visited in row-major order of the matrix, skipping ligands that
    were already discarded. This yields the same survivors as restarting the
    scan from the first pair after every removal, because removing a ligand
    never changes the similarity of the pairs that were already checked.

    Args:
        target_id: Target identifier; matched against the
            ``'ID_target_CHEMBL'`` column and used in the output file name.
        path_to_analog_matrix: Path to the CSV similarity matrix. Row and
            column labels are expected to be the same set of ligand ids.
        acts_dataframe: Activity table with ``'ID_target_CHEMBL'``,
            ``'ID_compound'`` and ``'Value'`` columns.
        tc_threshold: Similarity at or above which two ligands are analogues.

    Returns:
        ``None`` if ``<target_id>_filtered_active.smi`` already exists in
        ``CHEMBL_SMILES_FOLDER``; otherwise the list of surviving row ids in
        their original matrix order.

    Raises:
        KeyError: if a column label of the matrix is not also a row label
            (or vice versa) for a pair that gets compared.
        ValueError: if a ligand of an analogue pair has no activity row for
            this target.
    """
    filtered_path = os.path.join(CHEMBL_SMILES_FOLDER, f'{target_id}_filtered_active.smi')
    if os.path.exists(filtered_path):
        log(f'Filtered actives file for target {target_id} already exists!')
        return None

    similarity = pd.read_csv(path_to_analog_matrix, index_col=0).apply(pd.to_numeric).to_dict(orient='index')
    all_actives_len = len(similarity)
    log(f'Before filtering best actives for {target_id}: {all_actives_len}')
    target_act_dataframe = acts_dataframe[acts_dataframe['ID_target_CHEMBL'] == target_id]

    # Ligands removed so far; tracked in a set instead of physically deleting
    # rows and columns, so the matrix can be traversed in a single pass.
    dropped = set()
    if all_actives_len:
        log(f'Currently finished filtering actives for 0 out of {all_actives_len} in {target_id}')

    for query_id, row in similarity.items():
        if query_id in dropped:
            continue
        for subject_id in row:
            if subject_id == query_id or subject_id in dropped:
                continue
            # The matrix may be asymmetric; use the larger of both directions.
            tc = max(row[subject_id], similarity[subject_id][query_id])
            if tc >= tc_threshold:
                best_query = get_ligands_best_activity(query_id, target_act_dataframe)
                best_subject = get_ligands_best_activity(subject_id, target_act_dataframe)
                # NOTE(review): get_ligands_best_activity returns the *minimum*
                # 'Value'. If lower values mean higher potency, this rule keeps
                # the ligand with the weaker best activity - confirm that the
                # comparison direction is intended. Behaviour left unchanged.
                query_survives = best_query >= best_subject
                dropped.add(subject_id if query_survives else query_id)
                # Progress message after every 200 removals.
                if len(dropped) % 200 == 0:
                    log(f'Currently finished filtering actives for {len(dropped)} out of {all_actives_len} in {target_id}')
                if not query_survives:
                    # The query itself is gone; move on to the next row.
                    break

    survivors = [ligand_id for ligand_id in similarity if ligand_id not in dropped]
    log(f'After filtering best actives for {target_id}: {len(survivors)}')
    return survivors
def filter_actives_smiles_file(target_id: str, best_actives: Union[list, None]):
    """Write the SMILES entries of the selected actives to a filtered file.

    Loads ``<target_id>_active.smi`` from ``CHEMBL_SMILES_FOLDER`` via
    ``load_smiles`` and writes the entries whose key is in ``best_actives``
    to ``<target_id>_filtered_active.smi`` in the same folder, one
    ``value<TAB>key`` line per entry, in the order ``load_smiles`` returned
    them.

    Args:
        target_id: Target identifier used to build both file names.
        best_actives: Keys to keep. When ``None`` the function does nothing
            and no file is written.
    """
    if best_actives is None:
        return

    smiles = load_smiles(os.path.join(CHEMBL_SMILES_FOLDER, f'{target_id}_active.smi'))
    # Set membership is O(1); testing against the list was linear per entry.
    wanted = set(best_actives)

    output_path = os.path.join(CHEMBL_SMILES_FOLDER, f'{target_id}_filtered_active.smi')
    with open(output_path, 'w') as handle:
        # presumably keys are ligand ids and values SMILES strings - the
        # mapping comes from load_smiles, which is defined elsewhere.
        handle.writelines(
            f'{entry_value}\t{entry_key}\n'
            for entry_key, entry_value in smiles.items()
            if entry_key in wanted
        )