submit_LSF_job.py
# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Submit an LSF job to train or test a neural network protein coding potential classifier.
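A usage sketch (file names are illustrative, not prescribed by this script):

    python submit_LSF_job.py --pipeline pipeline.py --configuration configuration.yaml --gpu
    python submit_LSF_job.py --pipeline pipeline.py --checkpoint experiments/<job_name>.ckpt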
"""
# standard library imports
import argparse
import datetime as dt
import importlib
import pathlib
import shutil
import subprocess
import sys

# third party imports
import yaml

# project imports
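# AttributeDict (defined in utils.py) exposes the keys of the loaded
# configuration dictionary as attributes, e.g. configuration.dataset_id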
from utils import AttributeDict


def main():
"""
main function
"""
argument_parser = argparse.ArgumentParser()
argument_parser.add_argument("--pipeline", help="pipeline script path")
argument_parser.add_argument(
"--configuration",
help="experiment configuration file path",
)
argument_parser.add_argument(
"--mem_limit", default=16384, type=int, help="LSF job memory limit"
)
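    # NOTE: the memory limit appears to be in MiB (compare the gpu_memory values
    # below); the effective unit depends on the cluster's LSF_UNIT_FOR_LIMITS setting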
argument_parser.add_argument(
"--gpu",
action="store_true",
help="submit training job to the gpu queue",
)
argument_parser.add_argument(
"--checkpoint",
help="path to the saved experiment checkpoint",
)
    args = argument_parser.parse_args()

    # submit new classifier training
if args.pipeline and args.configuration:
datetime = dt.datetime.now().isoformat(sep="_", timespec="seconds")
pipeline_path = pathlib.Path(args.pipeline)
with open(args.configuration) as file:
configuration = yaml.safe_load(file)
configuration = AttributeDict(configuration)
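        # a minimal configuration sketch with the keys this script reads (values
        # are hypothetical; the pipeline itself may require more keys):
        #
        #   experiment_prefix: coding_classifier
        #   dataset_id: dataset_v1
        #   save_directory: experiments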
job_name = (
f"{configuration.experiment_prefix}_{configuration.dataset_id}_{datetime}"
)
root_directory = configuration.save_directory
experiment_directory = pathlib.Path(f"{root_directory}/{job_name}")
experiment_directory.mkdir(exist_ok=True)
# copy pipeline script, configuration file, and dependencies
pipeline_copy = shutil.copy(pipeline_path, experiment_directory)
configuration_copy = shutil.copy(args.configuration, experiment_directory)
shutil.copy(pipeline_path.parent / "utils.py", experiment_directory)
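        # assemble the command the LSF job will run; both --train and --test are
        # passed, so the submitted job trains and then evaluates the classifier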
pipeline_command_elements = [
f"python {pipeline_copy}",
f"--datetime {datetime}",
f"--configuration {configuration_copy}",
"--test",
"--train",
]
# test a trained classifier
elif args.pipeline and args.checkpoint:
pipeline_path = pathlib.Path(args.pipeline)
checkpoint_path = pathlib.Path(args.checkpoint)
# load classifier class from pipeline script
assert (
pipeline_path.suffix == ".py"
), f"pipeline should be a Python script, got {args.pipeline}"
pipeline_name = pipeline_path.with_suffix("").name
ProteinCodingClassifier = getattr(
importlib.import_module(pipeline_name), "ProteinCodingClassifier"
)
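        # loading the network validates the checkpoint before the job is submitted;
        # the loaded object is not otherwise used by this script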
network = ProteinCodingClassifier.load_from_checkpoint(checkpoint_path)
job_name = checkpoint_path.stem
root_directory = checkpoint_path.parent
experiment_directory = pathlib.Path(f"{root_directory}/{job_name}")
experiment_directory.mkdir(exist_ok=True)
# copy pipeline script and dependencies
pipeline_copy = shutil.copy(pipeline_path, experiment_directory)
shutil.copy(pipeline_path.parent / "utils.py", experiment_directory)
pipeline_command_elements = [
f"python {pipeline_copy}",
f"--checkpoint {args.checkpoint}",
"--test",
]
# no task specified
else:
print(__doc__)
argument_parser.print_help()
        sys.exit()

    pipeline_command = " ".join(pipeline_command_elements)

    # common job arguments
bsub_command_elements = [
"bsub",
f"-M {args.mem_limit}",
f"-o {experiment_directory}/stdout.log",
f"-e {experiment_directory}/stderr.log",
]
if args.gpu:
num_gpus = 1
        # gpu_memory = 16384  # 16 GiB
        gpu_memory = 32510  # ~32 GiB, total Tesla V100 memory
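        # j_exclusive=yes requests exclusive use of the allocated GPU;
        # span[hosts=1] below keeps the job on a single host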
        bsub_command_elements.extend(
            [
                "-q gpu",
                f'-gpu "num={num_gpus}:gmem={gpu_memory}:j_exclusive=yes"',
                f'-R"select[mem>{args.mem_limit}] rusage[mem={args.mem_limit}] span[hosts=1]"',
            ]
        )
else:
bsub_command_elements.extend(
[
"-q production",
f'-R"select[mem>{args.mem_limit}] rusage[mem={args.mem_limit}]"',
]
)
bsub_command_elements.append(pipeline_command)
bsub_command = " ".join(bsub_command_elements)
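    # the assembled command resembles the following (CPU case, illustrative paths):
    #   bsub -M 16384 -o <experiment_directory>/stdout.log -e <experiment_directory>/stderr.log \
    #       -q production -R"select[mem>16384] rusage[mem=16384]" python <pipeline_copy> ...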
print(f"running command:\n{bsub_command}")
    subprocess.run(bsub_command, shell=True)


if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("Interrupted with CTRL-C, exiting...")