forked from team-ocean/veros
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_benchmarks.py
259 lines (211 loc) · 9.54 KB
/
run_benchmarks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#! /usr/bin/env python
import sys
import os
import subprocess
import multiprocessing
import importlib.util
import re
import time
import math
import itertools
import json
import click
import numpy as np
"""
Runs selected Veros benchmarks back to back and writes timing results to a JSON file.
"""
TESTDIR = os.path.join(os.path.dirname(__file__), os.path.relpath("benchmarks"))
COMPONENTS = ["numpy", "numpy-mpi", "jax", "jax-gpu", "jax-mpi", "jax-gpu-mpi", "fortran", "fortran-mpi"]
STATIC_SETTINGS = " --size {nx} {ny} {nz} --timesteps {timesteps} --float-type {float_type}"
BENCHMARK_COMMANDS = {
"numpy": "{python} {filename}" + STATIC_SETTINGS,
"numpy-mpi": "OMP_NUM_THREADS=1 {mpiexec} -n {nproc} {python} {filename} --nproc {decomp}" + STATIC_SETTINGS,
"jax": "{python} {filename} -b jax" + STATIC_SETTINGS,
"jax-gpu": "{python} {filename} -b jax --device gpu" + STATIC_SETTINGS,
"jax-mpi": "OMP_NUM_THREADS=1 {mpiexec} -n {nproc} {python} {filename} -b jax --nproc {decomp}" + STATIC_SETTINGS,
"jax-gpu-mpi": "OMP_NUM_THREADS=1 {mpiexec} -n {nproc} {python} {filename} -b jax --device gpu --nproc {decomp}"
+ STATIC_SETTINGS,
"fortran": "{python} {filename} --pyom2-lib {pyom2_lib}" + STATIC_SETTINGS,
"fortran-mpi": "{mpiexec} -n {nproc} {python} {filename} --pyom2-lib {pyom2_lib} --nproc {decomp}"
+ STATIC_SETTINGS,
}
SLURM_COMMANDS = {
"numpy": "{mpiexec} --ntasks 1 --cpus-per-task {nproc} -- {python} {filename} -b numpy" + STATIC_SETTINGS,
"numpy-mpi": "{mpiexec} --ntasks {nproc} --cpus-per-task 1 -- {python} {filename} -b numpy --nproc {decomp}"
+ STATIC_SETTINGS,
"jax": "{mpiexec} --ntasks 1 --cpus-per-task {nproc} -- {python} {filename} -b jax" + STATIC_SETTINGS,
"jax-gpu": "{mpiexec} --ntasks 1 --cpus-per-task {nproc} -- {python} {filename} -b jax --device gpu"
+ STATIC_SETTINGS,
"jax-mpi": "{mpiexec} --ntasks {nproc} --cpus-per-task 1 -- {python} {filename} -b jax --nproc {decomp}"
+ STATIC_SETTINGS,
"jax-gpu-mpi": "{mpiexec} --ntasks {nproc} --cpus-per-task 1 -- {python} {filename} -b jax --device gpu --nproc {decomp}"
+ STATIC_SETTINGS,
"fortran": "{mpiexec} --ntasks 1 -- {python} {filename} --pyom2-lib {pyom2_lib}" + STATIC_SETTINGS,
"fortran-mpi": "{mpiexec} --ntasks {nproc} --cpus-per-task 1 -- {python} {filename} --pyom2-lib {pyom2_lib} --nproc {decomp}"
+ STATIC_SETTINGS,
}
AVAILABLE_BENCHMARKS = [f for f in os.listdir(TESTDIR) if f.endswith("_benchmark.py")]
TIME_PATTERN = r"Time step took ([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)s"
def check_arguments(pyom2_lib, components, float_type, burnin, timesteps, **kwargs):
fortran_version = check_pyom2_lib(pyom2_lib)
if "fortran" in components or "fortran-mpi" in components:
if not pyom2_lib:
raise click.UsageError("Path to fortran library must be given when running fortran components")
if not fortran_version:
raise click.UsageError("Fortran library failed to import")
if fortran_version != "parallel" and "fortran-mpi" in components:
raise click.UsageError("Fortran library must be compiled with MPI support for fortran-mpi component")
if float_type != "float64" and ("fortran" in components or "fortran-mpi" in components):
raise click.UsageError('Can run Fortran components only with "float64" float type')
if not burnin < timesteps:
raise click.UsageError("burnin must be smaller than number of timesteps")
def check_pyom2_lib(path):
if not path:
return None
def _check_library(module):
spec = importlib.util.spec_from_file_location(module, path)
try:
importlib.util.module_from_spec(spec)
except ImportError:
return False
else:
return True
if _check_library("pyOM_code"):
return "sequential"
if _check_library("pyOM_code_MPI"):
return "parallel"
return None
def _factorize(num):
j = 2
while num > 1:
for i in range(j, int(math.sqrt(num + 0.05)) + 1):
if num % i == 0:
num /= i
j = i
yield i
break
else:
if num > 1:
yield num
break
def _decompose_num(num, into=2):
out = [1] * into
for fac, i in zip(_factorize(num), itertools.cycle(range(into))):
out[i] *= fac
return tuple(map(int, out))
def _round_to_multiple(num, divisor):
return int(round(num / divisor) * divisor)
@click.command("veros-benchmarks", help="Run Veros benchmarks")
@click.option("-f", "--pyom2-lib", type=str, help="Path to PyOM2 fortran library")
@click.option(
"-s", "--sizes", multiple=True, type=float, required=True, help="Problem sizes to test (total number of elements)"
)
@click.option(
"-c",
"--components",
multiple=True,
type=click.Choice(COMPONENTS),
default=["numpy"],
metavar="COMPONENT",
help="Numerical backend components to benchmark (possible values: {})".format(", ".join(COMPONENTS)),
)
@click.option(
"-n",
"--nproc",
type=int,
default=multiprocessing.cpu_count(),
help="Number of processes / threads for parallel execution",
)
@click.option(
"-o",
"--outfile",
type=click.Path(exists=False),
default="benchmark_{}.json".format(time.time()),
help="JSON file to write timings to",
)
@click.option("-t", "--timesteps", default=100, type=int, help="Number of time steps that each benchmark is run for")
@click.option(
"--only",
multiple=True,
default=AVAILABLE_BENCHMARKS,
help="Run only these benchmarks (possible values: {})".format(", ".join(AVAILABLE_BENCHMARKS)),
type=click.Choice(AVAILABLE_BENCHMARKS),
required=False,
metavar="BENCHMARK",
)
@click.option("--mpiexec", default=None, help="Executable used for calling MPI (e.g. mpirun, mpiexec)")
@click.option("--slurm", is_flag=True, help="Run benchmarks using SLURM scheduling command (srun)")
@click.option("--debug", is_flag=True, help="Additionally print each command that is executed")
@click.option("--float-type", default="float64", help="Data type for floating point arrays in Veros components")
@click.option("--burnin", default=3, type=int, help="Number of iterations to exclude in timings")
def run(**kwargs):
check_arguments(**kwargs)
proc_decom = _decompose_num(kwargs["nproc"], 2)
settings = kwargs.copy()
settings["decomp"] = f"{proc_decom[0]} {proc_decom[1]}"
out_data = {}
all_passed = True
try:
for f in kwargs["only"]:
out_data[f] = []
click.echo(f"running benchmark {f}")
for size in kwargs["sizes"]:
nz = min(max(math.ceil(0.5 * size ** (1 / 3)), 2), 120)
n = math.ceil((size / nz) ** (1 / 2))
nx = _round_to_multiple(n, proc_decom[0])
ny = _round_to_multiple(n, proc_decom[1])
real_size = nx * ny * nz
click.echo(f" current size: {real_size}")
cmd_args = settings.copy()
cmd_args.update(
{
"python": sys.executable,
"filename": os.path.realpath(os.path.join(TESTDIR, f)),
"nx": nx,
"ny": ny,
"nz": nz,
}
)
if cmd_args["mpiexec"] is None:
if kwargs["slurm"]:
cmd_args["mpiexec"] = "srun"
else:
cmd_args["mpiexec"] = "mpirun"
for comp in kwargs["components"]:
cmd = (SLURM_COMMANDS[comp] if kwargs["slurm"] else BENCHMARK_COMMANDS[comp]).format(**cmd_args)
if kwargs["debug"]:
click.echo(f" $ {cmd}")
sys.stdout.write(f" {comp:<15} ... ")
sys.stdout.flush()
try: # must run each benchmark in its own Python subprocess to reload the Fortran library
output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
click.echo("failed")
click.echo(e.output.decode("utf-8"))
all_passed = False
continue
output = output.decode("utf-8")
iteration_times = list(map(float, re.findall(TIME_PATTERN, output)))[kwargs["burnin"] :]
if not iteration_times:
raise RuntimeError("could not extract iteration times from output")
total_elapsed = sum(iteration_times)
click.echo(f"{total_elapsed:>6.2f}s")
out_data[f].append(
{
"component": comp,
"size": real_size,
"wall_time": total_elapsed,
"per_iteration": {
"best": float(np.min(iteration_times)),
"worst": float(np.max(iteration_times)),
"mean": float(np.mean(iteration_times)),
"stdev": float(np.std(iteration_times)),
},
}
)
finally:
with open(kwargs["outfile"], "w") as f:
json.dump({"benchmarks": out_data, "settings": settings}, f, indent=4, sort_keys=True)
raise SystemExit(int(not all_passed))
if __name__ == "__main__":
run()