-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse.py
88 lines (76 loc) · 3.01 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import re
def parse_flat_file(pathway):
    """Parse selected fields out of the text of a pathway flat-file record.

    The record is read line by line. A line that does not start with a space
    opens a new field, identified by the keyword at its start; a line that
    starts with a space continues the list field opened most recently (and is
    ignored if no list field is open). Only the keywords ``NAME``,
    ``DESCRIPTION``, ``CLASS``, ``DISEASE``, ``DRUG``, ``GENE`` and
    ``COMPOUND`` are interpreted; every other line is skipped.

    NOTE(review): the keyword set looks like the KEGG pathway flat-file
    layout -- confirm against the caller that produces ``pathway``.

    Args:
        pathway: The full text of one record, lines separated by ``"\\n"``.

    Returns:
        A 7-tuple ``(name, description, classes, diseases, drugs, genes,
        compounds)``. Each element is ``None`` when its keyword never occurs.

        * ``name``: text of the ``NAME`` line up to (not including) the last
          ``" - "``.
        * ``description``: text of the ``DESCRIPTION`` line (first line only).
        * ``classes``: list of the ``;``-separated, stripped ``CLASS`` parts.
        * ``diseases`` / ``drugs``: lists of ``(id, name)`` tuples.
        * ``genes``: list of ``(id, short_name, long_name)`` tuples;
          ``short_name`` is ``""`` when the entry has no ``;``, and any
          trailing ``[...]`` annotations are removed from ``long_name``.
        * ``compounds``: list of ``(id, name)`` tuples; ``name`` is ``""``
          when the entry holds only an ID.

    Raises:
        ValueError: If the ``NAME`` line has no ``" - "`` separator, if the
            resulting name is blank, or if a disease, drug or gene entry
            does not consist of an ID followed by text.
    """
    name_separator = " - "

    def split_id_and_text(entry):
        """Split ``'<id>  <text>'`` on the first run of whitespace."""
        # maxsplit=1 keeps multi-word text intact regardless of how many
        # spaces separate it from the ID.
        fields = entry.strip().split(None, 1)
        if len(fields) != 2:
            raise ValueError(
                "Expected an ID followed by text, got %r" % (entry,)
            )
        return fields[0], fields[1]

    def parse_gene_entry(entry):
        """Return ``(gene_id, short_name, long_name)`` for one gene entry."""
        gene_id, gene_names = split_id_and_text(entry)
        if ";" in gene_names:  # "<short>; <long>[; <more>...]"
            parts = [part.strip() for part in gene_names.split(";")]
            short_name, long_name = parts[0], "; ".join(parts[1:])
        else:  # Only one name is given
            short_name, long_name = "", gene_names
        if "[" in long_name:
            # Drop the trailing "[...]" annotations; rstrip (rather than
            # cutting one fixed character) is safe even when the bracket is
            # the first character or is not preceded by a space.
            long_name = long_name.partition("[")[0].rstrip()
        return gene_id, short_name, long_name

    def parse_compound_entry(entry):
        """Return ``(compound_id, compound_name)``; the name may be empty."""
        fields = entry.strip().split(None, 1)
        if not fields:
            return "", ""
        if len(fields) == 1:
            return fields[0], ""
        return fields[0], fields[1]

    # Keywords that open a multi-line list, with the parser for each entry.
    list_sections = (
        ("DISEASE", split_id_and_text),
        ("DRUG", split_id_and_text),
        ("GENE", parse_gene_entry),
        ("COMPOUND", parse_compound_entry),
    )
    entry_parsers = dict(list_sections)
    parsed_lists = dict.fromkeys(entry_parsers)  # stays None until seen

    name = None
    description = None
    classes = None
    state = None  # keyword of the list currently being continued, if any

    for line in re.split(r"\n+", pathway):
        if line.startswith(" "):
            # Indented line: continuation of the open list, otherwise noise.
            if state is not None:
                parsed_lists[state].append(entry_parsers[state](line))
            continue

        # Any non-indented line closes the list that was open.
        state = None

        if line.startswith("NAME"):
            # Slice the keyword off by length: str.lstrip("NAME") would
            # strip a *set of characters*, not the prefix.
            name = line[len("NAME"):].lstrip()
            cut = name.rfind(name_separator)
            if cut == -1:
                raise ValueError(
                    'NAME entry has no " - " separator: %r' % (name,)
                )
            name = name[:cut]
            if name.strip() == "":
                raise ValueError("Name cannot be an empty string")
        elif line.startswith("DESCRIPTION"):
            description = line[len("DESCRIPTION"):].lstrip()
        elif line.startswith("CLASS"):
            classes = [
                part.strip() for part in line[len("CLASS"):].split(";")
            ]
        else:
            for keyword, parse_entry in list_sections:
                if line.startswith(keyword):
                    state = keyword
                    # A repeated keyword restarts its list.
                    parsed_lists[keyword] = [parse_entry(line[len(keyword):])]
                    break

    return (
        name,
        description,
        classes,
        parsed_lists["DISEASE"],
        parsed_lists["DRUG"],
        parsed_lists["GENE"],
        parsed_lists["COMPOUND"],
    )