import csv
import logging
import gzip
import os
import json
import pickle
import sqlite3
import re
import itertools
from ast import literal_eval

## @package Cache
#
# Documentation for the cache generation of rpFBA


## \brief Class to generate the cache
#
# Contains all the functions that parse the different files used to calculate the thermodynamics
# and the FBA of the other steps. These should be called only when the files have changed.
class rpCache:
    ## Cache constructor
    #
    # @param self The object pointer
    def __init__(self):
        #hardcoded conversions, given by Thomas
        self.convertMNXM = {'MNXM162231': 'MNXM6',
                            'MNXM84': 'MNXM15',
                            'MNXM96410': 'MNXM14',
                            'MNXM114062': 'MNXM3',
                            'MNXM145523': 'MNXM57',
                            'MNXM57425': 'MNXM9',
                            'MNXM137': 'MNXM588022'}
        #manually curated KEGG to MNXM conversion for the thermodynamics
        self.deprecatedMNXM_mnxm = None

    ########################################################
    ############################# FUNCTIONS ################
    ########################################################

    #[TODO] merge the two functions
    ## Function to parse the chem_xref.tsv file of MetaNetX
    #
    # Generate a dictionary of old to new MetaNetX identifiers
    #
    # @param self Object pointer
    # @param chem_xref_path Input file path
    # @return a The dictionary of identifiers
    #TODO: save the self.deprecatedMNXM_mnxm to be used in case rp_paths uses an old version of MNX
    def deprecatedMNXM(self, chem_xref_path):
        a = {}
        with open(chem_xref_path) as f:
            c = csv.reader(f, delimiter='\t')
            for row in c:
                mnx = row[0].split(':')
                if not row[0][0]=='#':
                    if mnx[0]=='deprecated':
                        #a[row[1]] = mnx[1]
                        a[mnx[1]] = row[1]
            a.update(self.convertMNXM)
            a['MNXM01'] = 'MNXM1'
        return a
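
    # A minimal usage sketch, assuming the input_cache/ layout used in the __main__ block below:
    #
    #   cache = rpCache()
    #   cache.deprecatedMNXM_mnxm = cache.deprecatedMNXM('input_cache/chem_xref.tsv')
    #   cache.deprecatedMNXM_mnxm['MNXM01']  # -> 'MNXM1'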
    ## Function to parse the chem_xref.tsv file of MetaNetX
    #
    # Generate a dictionary of cross-references between MetaNetX chemical identifiers and external databases
    #
    # @param self Object pointer
    # @param chem_xref_path Input file path
    # @return chemXref The dictionary of cross-references
    def chem_xref(self, chem_xref_path):
        chemXref = {}
        with open(chem_xref_path) as f:
            c = csv.reader(f, delimiter='\t')
            for row in c:
                if not row[0][0]=='#':
                    mnx = row[1]
                    if len(row[0].split(':'))==1:
                        dbName = 'mnx'
                        dbId = row[0]
                    else:
                        dbName = row[0].split(':')[0]
                        dbId = ''.join(row[0].split(':')[1:])
                        if dbName=='deprecated':
                            dbName = 'mnx'
                    ### MNXM ###
                    if not mnx in chemXref:
                        chemXref[mnx] = {}
                    if not dbName in chemXref[mnx]:
                        chemXref[mnx][dbName] = []
                    if not dbId in chemXref[mnx][dbName]:
                        chemXref[mnx][dbName].append(dbId)
                    ### DB ###
                    if not dbName in chemXref:
                        chemXref[dbName] = {}
                    if not dbId in chemXref[dbName]:
                        chemXref[dbName][dbId] = mnx
        return chemXref
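
    # A sketch of the returned structure; the identifiers shown are illustrative, not taken from the file:
    #
    #   chemXref = cache.chem_xref('input_cache/chem_xref.tsv')
    #   chemXref['MNXM1']           # e.g. {'kegg': ['C00080'], 'seed': [...], ...}
    #   chemXref['kegg']['C00080']  # the reverse lookup, e.g. -> 'MNXM1'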
    ## Function to parse the reac_xref.tsv file of MetaNetX
    #
    # Generate a dictionary of cross-references between MetaNetX reaction identifiers and external databases
    #
    # @param self Object pointer
    # @param reac_xref_path Input file path
    # @return reacXref The dictionary of cross-references
    def reac_xref(self, reac_xref_path):
        reacXref = {}
        with open(reac_xref_path) as f:
            c = csv.reader(f, delimiter='\t')
            for row in c:
                if not row[0][0]=='#' and len(row[0].split(':'))==2:
                    mnx = row[1]
                    if len(row[0].split(':'))==1:
                        #NOTE: unreachable given the ==2 filter above; kept for symmetry with chem_xref
                        dbName = 'mnx'
                        dbId = row[0]
                    else:
                        dbName = row[0].split(':')[0]
                        dbId = ''.join(row[0].split(':')[1:])
                        if dbName=='deprecated':
                            dbName = 'mnx'
                    if not mnx in reacXref:
                        reacXref[mnx] = {}
                    if not dbName in reacXref[mnx]:
                        reacXref[mnx][dbName] = []
                    if not dbId in reacXref[mnx][dbName]:
                        reacXref[mnx][dbName].append(dbId)
        return reacXref
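
    # Usage sketch; the reaction identifier is illustrative:
    #
    #   reacXref = cache.reac_xref('input_cache/reac_xref.tsv')
    #   reacXref['MNXR100024']  # e.g. {'bigg': [...], 'kegg': [...], ...}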
    ## Function to parse the comp_xref.tsv and comp_prop.tsv files of MetaNetX
    #
    # Generate a dictionary of compartment names to their cross-references in external databases
    #
    # @param self Object pointer
    # @param comp_xref_path comp_xref.tsv input file path
    # @param comp_prop_path comp_prop.tsv input file path
    # @return name_pubDB_xref The dictionary of cross-references
    def comp_xref(self, comp_xref_path, comp_prop_path):
        mnxc_name = {}
        with open(comp_prop_path) as f:
            c = csv.reader(f, delimiter='\t')
            for row in c:
                if not row[0][0]=='#':
                    if not row[0] in mnxc_name:
                        mnxc_name[row[0]] = row[1]
        #Mel --> not a fan of the hardcoding: if the file changes then one would need to add new entries
        #possCID = ['mnxc', 'bigg', 'cco', 'go', 'seed', 'name']
        #pubDB_name_xref = {}
        name_pubDB_xref = {}
        try:
            with open(comp_xref_path) as f:
                c = csv.reader(f, delimiter='\t')
                #not_recognised = []
                for row in c:
                    #cid = row[0].split(':')
                    if not row[0][0]=='#':
                        #collect the info
                        mnxc = row[1]
                        name = mnxc_name[mnxc]
                        if len(row[0].split(':'))==1:
                            dbName = 'mnx'
                            dbCompId = row[0]
                        else:
                            dbName = row[0].split(':')[0]
                            dbCompId = ''.join(row[0].split(':')[1:])
                        #create the dicts
                        if not name in name_pubDB_xref:
                            name_pubDB_xref[name] = {}
                        if not dbName in name_pubDB_xref[name]:
                            name_pubDB_xref[name][dbName] = []
                        if not dbCompId in name_pubDB_xref[name][dbName]:
                            name_pubDB_xref[name][dbName].append(dbCompId)
        except FileNotFoundError:
            logging.error('comp_xref file not found')
            return {}
        return name_pubDB_xref
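
    # Usage sketch; the compartment name and databases are illustrative:
    #
    #   compXref = cache.comp_xref('input_cache/comp_xref.tsv', 'input_cache/comp_prop.tsv')
    #   compXref['cytosol']  # e.g. {'mnx': ['MNXC3'], 'go': [...], ...}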
    ## Function to parse the chem_prop.tsv file from MetaNetX
    #
    # Generate a dictionary giving the formula, SMILES, InChI and InChIKey of the compounds
    #
    # @param self Object pointer
    # @param chem_prop_path Input file path
    # @return smiles_inchi Dictionary of formula, SMILES, InChI and InChIKey
    def smiles_inchi(self, chem_prop_path):
        #TODO: need to reduce the size of this file. As it stands it is 250MB
        smiles_inchi = {}
        with open(chem_prop_path) as f:
            c = csv.reader(f, delimiter='\t')
            for row in c:
                if not row[0][0]=='#':
                    smiles_inchi[row[0]] = {'formula': row[2], 'smiles': row[6], 'inchi': row[5], 'inchikey': row[8]}
        return smiles_inchi
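
    # Usage sketch; the keys follow the dictionary built above:
    #
    #   structures = cache.smiles_inchi('input_cache/chem_prop.tsv')
    #   structures['MNXM1']['inchi']   # InChI string of the compound
    #   structures['MNXM1']['smiles']  # SMILES string of the compound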
    ## Function to extract the dG of compounds
    #
    # Parse the compound files from component contribution and Alberty to recover their
    # thermodynamic descriptions, keyed by KEGG compound identifier
    #
    # @param self Object pointer
    # @param cc_compounds_path cc_compounds.json.gz file path
    # @param alberty_path alberty.json file path
    # @param compounds_path compounds.csv file path
    # @return cc_alberty Dictionary of the thermodynamic descriptions
    def kegg_dG(self,
                cc_compounds_path,
                alberty_path,
                compounds_path):
        cc_alberty = {}
        ########################## compounds ##################
        #contains the p_kas and molecule decomposition
        cid_comp = {}
        with open(compounds_path) as f:
            c = csv.reader(f, delimiter=',', quotechar='"')
            next(c)
            for row in c:
                cid = row[-1].split(':')[1]
                cid_comp[cid] = {}
                cid_comp[cid]['atom_bag'] = literal_eval(row[3])
                cid_comp[cid]['p_kas'] = literal_eval(row[4])
                cid_comp[cid]['major_ms'] = int(literal_eval(row[6]))
                cid_comp[cid]['number_of_protons'] = literal_eval(row[7])
                cid_comp[cid]['charges'] = literal_eval(row[8])
        '''
        ###################### mnxm_kegg ################
        kegg_mnxm = {}
        with open(self._checkFilePath(chem_xref_path, 'chem_xref.tsv')) as f:
            c = csv.reader(f, delimiter='\t')
            for row in c:
                mnx = row[0].split(':')
                if mnx[0]=='kegg' and mnx[1][0]=='C':
                    if mnx[1] in kegg_mnxm:
                        logging.warning(
                            'Replaced '+str(mnx[1])+': '+str(kegg_mnxm[mnx[1]])+' from '+str(row[1])
                        )
                    else:
                        try:
                            mnxm = self.deprecatedMNXM_mnxm[mnx[1]]
                        except KeyError:
                            mnxm = mnx[1]
                        kegg_mnxm[mnxm] = row[1]
        '''
        ####################### cc_compounds ############
        #TODO: the new version of equilibrator seems to have dropped this file... need to update the
        #function to take the new file as input --> i.e. the JSON input
        #notFound_cc = []
        gz_file = gzip.open(cc_compounds_path, 'rb')
        f_c = gz_file.read()
        c = json.loads(f_c)
        for cd in c:
            '''
            #find MNXM from CID
            try:
                mnxm = kegg_mnxm[cd['CID']]
            except KeyError:
                try:
                    mnxm = self.curated_kegg_mnxm[cd['CID']]
                    try:
                        mnxm = self.deprecatedMNXM_mnxm[mnxm]
                    except KeyError:
                        pass
                except KeyError:
                    logging.warning('Cannot find: '+str(cd))
                    notFound_cc.append(cd['CID'])
                    continue
            '''
            #find the compound descriptions
            try:
                cd.update(cid_comp[cd['CID']])
            except KeyError:
                pass
            #add the CID
            #if not mnxm in cc_alberty:
            if not cd['CID'] in cc_alberty:
                cc_alberty[cd['CID']] = {}
            if not 'component_contribution' in cc_alberty[cd['CID']]:
                cc_alberty[cd['CID']]['component_contribution'] = [cd]
            else:
                cc_alberty[cd['CID']]['component_contribution'].append(cd)
        ######################## alberty ################
        with open(alberty_path) as json_data:
            d = json.loads(json_data.read())
            for cd in d:
                '''
                #find the MNXM from CID
                try:
                    mnxm = kegg_mnxm[cd['cid']]
                except KeyError:
                    try:
                        mnxm = self.curated_kegg_mnxm[cd['cid']]
                        try:
                            mnxm = self.deprecatedMNXM_mnxm[mnxm]
                        except KeyError:
                            pass
                    except KeyError:
                        logging.warning('Cannot find: '+str(cd))
                        notFound_alberty.append(cd['cid'])
                        continue
                '''
                #find the compound description
                #NOTE: the commented-out block above keys alberty entries with lowercase 'cid';
                #if the entries do not also carry an uppercase 'CID' the lookups below will fail
                try:
                    cd.update(cid_comp[cd['CID']])
                except KeyError:
                    pass
                #add the CID
                #if not mnxm in cc_alberty:
                if not cd['CID'] in cc_alberty:
                    cc_alberty[cd['CID']] = {}
                if not 'alberty' in cc_alberty[cd['CID']]:
                    cc_alberty[cd['CID']]['alberty'] = [cd]
                else:
                    cc_alberty[cd['CID']]['alberty'].append(cd)
        return cc_alberty
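
    # Usage sketch; the KEGG compound identifier is illustrative:
    #
    #   kegg_dG = cache.kegg_dG('input_cache/cc_compounds.json.gz',
    #                           'input_cache/alberty.json',
    #                           'input_cache/compounds.csv')
    #   kegg_dG['C00002']['component_contribution']  # list of component contribution entries
    #   kegg_dG['C00002']['alberty']                 # list of Alberty entries, when available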
    ## Function to parse the rules_rall.tsv file
    #
    # Extract from the reaction rules the ruleID, the reactionID and the direction of the rule
    # relative to the origin reaction
    #
    # @param self The object pointer.
    # @param path The input file path.
    # @return rule Dictionary describing each reaction rule
    def retro_reactions(self, path):
        try:
            with open(path, 'r') as f:
                reader = csv.reader(f, delimiter='\t')
                next(reader)
                rule = {}
                for row in reader:
                    rule[row[0]] = {'rule_id': row[0], 'rule_score': row[11], 'reaction': row[1], 'rel_direction': row[13], 'left': row[5], 'right': row[7]}
        except (TypeError, FileNotFoundError) as e:
            logging.error('Could not read the rules_rall file ('+str(path)+')')
            return {}
        return rule
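
    # Usage sketch; 'some_rule_id' stands in for a real rule identifier from the file:
    #
    #   rr_reactions = cache.retro_reactions('input_cache/rules_rall.tsv')
    #   rr_reactions['some_rule_id']['rule_score']     # score of the rule
    #   rr_reactions['some_rule_id']['rel_direction']  # direction relative to the origin reaction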
    ## Function to parse the rxn_recipes.tsv file
    #
    # Extract the substrates and products of the origin reaction with their stoichiometry
    #
    # @param self The object pointer.
    # @param self.deprecatedMNXM_mnxm Dictionary of old to new MetaNetX identifiers
    # @param path The input file path.
    # @return reaction Dictionary containing the description of the origin reaction for each reactionID
    def full_reac(self, path):
        try:
            with open(path) as f:
                #swap each (stoichiometry, compound) pair so the compound comes first,
                #then zip the flat list into a {compound: stoichiometry} dictionary
                def dico(liste, i=0, x=1):
                    while i <= len(liste)-1:
                        tmp = liste[i]
                        liste[i] = liste[x]
                        liste[x] = tmp
                        i = i+2
                        x = x+2
                    return dict(itertools.zip_longest(*[iter(liste)] * 2, fillvalue=""))
                reader = csv.reader(f, delimiter='\t')
                next(reader)
                reaction = {}
                for row in reader:
                    reaction[row[0]] = {'main_left': row[9], 'main_right': row[10], 'left': {}, 'right': {}, 'direction': row[3]}
                    #tmp_l = (((((((((row[1].split('=')[0]).replace('+ ',"")).replace('@MNXD1','')).replace('@MNXD2','')).replace('@MNXDX','')).replace('@BOUNDARY','')).replace('@\S',''))).split(' '))[:-1]
                    tmp_l = ((row[1].split('=')[0]).replace('+ ', '')).split(' ')[:-1]
                    #tmp_r = ((((((((row[1].split('=')[1]).replace('+ ',"")).replace('@MNXD1','')).replace('@MNXD2','')).replace('@MNXDX','')).replace('@BOUNDARY',''))).split(' '))[1:]
                    tmp_r = ((row[1].split('=')[1]).replace('+ ', '')).split(' ')[1:]
                    #strip the compartment tags (e.g. @MNXD1, @BOUNDARY)
                    p = re.compile('@\\w+')
                    for x, i in enumerate(tmp_l):
                        tmp_l[x] = re.sub(p, '', i)
                    for x, i in enumerate(tmp_r):
                        tmp_r[x] = re.sub(p, '', i)
                    dico_l = dico(tmp_l)
                    dico_r = dico(tmp_r)
                    #remove the main_right and the main_left because they are already in out_paths.csv
                    #WARNING: there is a small chance of removing elements with MNX codes that
                    #are in fact not described as the CMP
                    '''for i in reaction[row[0]]['main_right'].split(','):
                        del dico_r[i]
                    for i in reaction[row[0]]['main_left'].split(','):
                        del dico_l[i]'''
                    #resolve deprecated MNXM identifiers to their current version
                    for i in dico_l:
                        tmp_i = i
                        while tmp_i in self.deprecatedMNXM_mnxm:
                            tmp_i = self.deprecatedMNXM_mnxm[tmp_i]
                        reaction[row[0]]['left'][tmp_i] = dico_l[i]
                    for i in dico_r:
                        tmp_i = i
                        while tmp_i in self.deprecatedMNXM_mnxm:
                            tmp_i = self.deprecatedMNXM_mnxm[tmp_i]
                        reaction[row[0]]['right'][tmp_i] = dico_r[i]
        except (TypeError, FileNotFoundError) as e:
            logging.error('Could not read the rxn_recipes file ('+str(path)+')')
            return {}
        return reaction
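
    # Usage sketch; requires self.deprecatedMNXM_mnxm to be set first (see the __main__ block
    # below), and the reaction identifier is illustrative:
    #
    #   cache.deprecatedMNXM_mnxm = cache.deprecatedMNXM('input_cache/chem_xref.tsv')
    #   full_reactions = cache.full_reac('input_cache/rxn_recipes.tsv')
    #   full_reactions['MNXR100024']['left']  # {MNXM compound id: stoichiometry, ...}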

## Run all the functions
#
# Parse all the files required to generate the cache. Requires: chem_xref.tsv, reac_xref.tsv,
# comp_xref.tsv, comp_prop.tsv, chem_prop.tsv, rules_rall.tsv, rxn_recipes.tsv,
# cc_compounds.json.gz, alberty.json and compounds.csv
#
#TODO: change the input_cache to a compressed file with a given checksum to ensure that it is valid
#TODO: consider checksumming the individual files that are generated from here
if __name__ == "__main__":
    if not os.path.isdir(os.getcwd()+'/cache'):
        os.mkdir('cache')
    #dirname = os.path.dirname(os.path.abspath(__file__))
    cache = rpCache()
    #deprecatedMNXM_mnxm
    logging.info('Generating deprecatedMNXM_mnxm')
    cache.deprecatedMNXM_mnxm = cache.deprecatedMNXM('input_cache/chem_xref.tsv')
    pickle.dump(cache.deprecatedMNXM_mnxm, open('cache/deprecatedMNXM_mnxm.pickle', 'wb'))
    #kegg_dG
    logging.info('Generating kegg_dG')
    pickle.dump(cache.kegg_dG('input_cache/cc_compounds.json.gz',
                              'input_cache/alberty.json',
                              'input_cache/compounds.csv'),
                open('cache/kegg_dG.pickle', 'wb'))
    #rr_reactions
    logging.info('Generating rr_reactions')
    rr_reactions = cache.retro_reactions('input_cache/rules_rall.tsv')
    pickle.dump(rr_reactions, open('cache/rr_reactions.pickle', 'wb'))
    #full_reactions
    logging.info('Generating full_reactions')
    pickle.dump(cache.full_reac('input_cache/rxn_recipes.tsv'),
                open('cache/full_reactions.pickle', 'wb'))
    #smiles_inchi --> use gzip since it is a large file
    logging.info('Parsing the SMILES and InChI')
    #pickle.dump(cache.smiles_inchi(), open('cache/smiles_inchi.pickle', 'wb'))
    pickle.dump(cache.smiles_inchi('input_cache/chem_prop.tsv'),
                gzip.open('cache/smiles_inchi.pickle.gz', 'wb'))
    #xref --> use gzip since these are large files
    logging.info('Parsing the cross-references')
    pickle.dump(cache.chem_xref('input_cache/chem_xref.tsv'), gzip.open('cache/chemXref.pickle.gz', 'wb'))
    pickle.dump(cache.reac_xref('input_cache/reac_xref.tsv'), gzip.open('cache/reacXref.pickle.gz', 'wb'))
    pickle.dump(cache.comp_xref('input_cache/comp_xref.tsv', 'input_cache/comp_prop.tsv'), gzip.open('cache/compXref.pickle.gz', 'wb'))
    '''
    pub_mnx_chem_xref, mnx_pub_chem_xref = cache.chem_xref()
    pickle.dump(pub_mnx_chem_xref, gzip.open('cache/pub_mnx_chem_xref.gz','wb'))
    pickle.dump(mnx_pub_chem_xref, gzip.open('cache/mnx_pub_chem_xref.gz','wb'))
    pub_mnx_reax_xref, mnx_pub_reac_xref = cache.reac_xref()
    pickle.dump(pub_mnx_reax_xref, gzip.open('cache/pub_mnx_reax_xref.gz','wb'))
    pickle.dump(mnx_pub_reac_xref, gzip.open('cache/mnx_pub_reac_xref.gz','wb'))
    pub_mnx_comp_xref, mnx_pub_comp_xref = cache.comp_xref()
    pickle.dump(pub_mnx_comp_xref, gzip.open('cache/pub_mnx_comp_xref.gz','wb'))
    pickle.dump(mnx_pub_comp_xref, gzip.open('cache/mnx_pub_comp_xref.gz','wb'))
    '''
    #pickle.dump(cache.xref(), gzip.open('cache/Id_xref.pickle.gz','wb'))