From 432a7274e63dcaf6fa12a684d68d314519ad7732 Mon Sep 17 00:00:00 2001
From: Edmund Higham
Date: Thu, 5 Dec 2024 14:50:21 -0500
Subject: [PATCH] [query] Benchmark configurations

Give each benchmark an explicit configuration on its `benchmark` marker
(mds, instances, iterations and burn_in_iterations) in place of the bare
`@pytest.mark.benchmark()`. Benchmarks that differed only in their inputs
(the range joins, foreign-key joins, and the partition-count read, write,
union and join benchmarks) are collapsed into single parametrized
functions, with each `pytest.param` case carrying its own configuration
via `marks=pytest.mark.benchmark(...)`. benchmark_import_bgen_force_count_all
and benchmark_table_big_aggregate_compilation are additionally marked
`xfail(raises=TimeoutError)`.
---
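Note (below the fold, not part of the commit message): a minimal sketch of
how a conftest plugin could consume the marker arguments this patch adds.
The BenchmarkConfig dataclass, its defaults, and the stash key are
illustrative assumptions, not the benchmark harness's actual code; only
the pytest APIs used (get_closest_marker, Mark.kwargs, item.stash) are real.

    import dataclasses

    import pytest


    @dataclasses.dataclass(frozen=True)
    class BenchmarkConfig:
        # Field names mirror the marker kwargs used throughout this patch;
        # the defaults here are assumptions.
        mds: float = 1.0
        instances: int = 1
        iterations: int = 1
        burn_in_iterations: int = 0


    # Hypothetical stash key under which each test's config is stored.
    BENCHMARK_CONFIG = pytest.StashKey[BenchmarkConfig]()


    def pytest_collection_modifyitems(config, items):
        # Build a config from each test's `benchmark` marker, e.g.
        # @pytest.mark.benchmark(mds=1.1, instances=20, iterations=10,
        # burn_in_iterations=5); tests without kwargs get the defaults.
        for item in items:
            marker = item.get_closest_marker('benchmark')
            if marker is not None:
                item.stash[BENCHMARK_CONFIG] = BenchmarkConfig(**marker.kwargs)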
 .../hail/benchmark_benchmark_analysis.py      |   2 +-
 .../benchmark/hail/benchmark_combiner.py      |  10 +-
 .../python/benchmark/hail/benchmark_linalg.py |  12 +-
 .../benchmark/hail/benchmark_matrix_table.py  |  73 +++++----
 .../benchmark/hail/benchmark_methods.py       |  32 ++--
 .../benchmark/hail/benchmark_sentinel.py      |   4 +-
 .../benchmark/hail/benchmark_shuffle.py       |  12 +-
 hail/python/benchmark/hail/benchmark_table.py | 154 ++++++++++++------
 8 files changed, 176 insertions(+), 123 deletions(-)

diff --git a/hail/python/benchmark/hail/benchmark_benchmark_analysis.py b/hail/python/benchmark/hail/benchmark_benchmark_analysis.py
index 2b1c147f9c2f..c3b0b7f04f28 100644
--- a/hail/python/benchmark/hail/benchmark_benchmark_analysis.py
+++ b/hail/python/benchmark/hail/benchmark_benchmark_analysis.py
@@ -7,7 +7,7 @@
 from benchmark.tools.statistics import analyze_benchmarks
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=5, burn_in_iterations=10)
 def benchmark_analyze_benchmarks(local_tmpdir, onethreetwo, onethreethree):
     inputs = (onethreetwo, onethreethree)
     inputs = ((v, Path(tempfile.mktemp(dir=local_tmpdir))) for v in inputs)
diff --git a/hail/python/benchmark/hail/benchmark_combiner.py b/hail/python/benchmark/hail/benchmark_combiner.py
index 608a39b3db38..84d7e76030f4 100644
--- a/hail/python/benchmark/hail/benchmark_combiner.py
+++ b/hail/python/benchmark/hail/benchmark_combiner.py
@@ -24,14 +24,14 @@ def benchmark_compile_2k_merge(empty_gvcf, tmp_path):
     hl.vds.write_variant_datasets(combined, str(tmp_path / 'combiner-multi-write'), overwrite=True)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=10, burn_in_iterations=10)
 @pytest.mark.xtimeout(270)
 def benchmark_python_only_10k_transform(empty_gvcf):
     for vcf in [import_vcf(empty_gvcf)] * 10_000:
         transform_gvcf(vcf, [])
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=10, burn_in_iterations=20)
 def benchmark_python_only_10k_combine(empty_gvcf):
     vcf = import_vcf(empty_gvcf)
     mt = transform_gvcf(vcf, [])
@@ -39,7 +39,7 @@ def benchmark_python_only_10k_combine(empty_gvcf):
     combine_variant_datasets(mts)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=10)
 def benchmark_import_and_transform_gvcf(single_gvcf):
     mt = import_vcf(single_gvcf)
     vds = transform_gvcf(mt, [])
@@ -47,7 +47,7 @@ def benchmark_import_and_transform_gvcf(single_gvcf):
     vds.variant_data._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=15, burn_in_iterations=8)
 def benchmark_import_gvcf_force_count(single_gvcf):
     mt = import_vcf(single_gvcf)
     mt._force_count_rows()
@@ -62,7 +62,7 @@ def tmp_and_output_paths(tmp_path):
     return (tmp, output)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=10)
 @pytest.mark.xtimeout(180)
 def benchmark_vds_combiner_chr22(chr22_gvcfs, tmp_and_output_paths):
     parts = hl.eval([hl.parse_locus_interval('chr22:start-end', reference_genome='GRCh38')])
diff --git a/hail/python/benchmark/hail/benchmark_linalg.py b/hail/python/benchmark/hail/benchmark_linalg.py
index df615ef9a3a0..6c9d2e7aea6c 100644
--- a/hail/python/benchmark/hail/benchmark_linalg.py
+++ b/hail/python/benchmark/hail/benchmark_linalg.py
@@ -12,32 +12,32 @@ def benchmark_block_matrix_nested_multiply(tmp_path):
     bm.write(str(tmp_path / 'result.mt'), overwrite=True)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=5, burn_in_iterations=5)
 def benchmark_make_ndarray():
     ht = hl.utils.range_table(200_000)
     ht = ht.annotate(x=hl.nd.array(hl.range(ht.idx)))
     ht._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=20, burn_in_iterations=10)
 def benchmark_ndarray_addition():
     arr = hl.nd.ones((1024, 1024))
     hl.eval(arr + arr)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=20, iterations=5, burn_in_iterations=10)
 def benchmark_ndarray_matmul_int64():
     arr = hl.nd.arange(1024 * 1024).map(hl.int64).reshape((1024, 1024))
     hl.eval(arr @ arr)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=6)
 def benchmark_ndarray_matmul_float64():
     arr = hl.nd.arange(1024 * 1024).map(hl.float64).reshape((1024, 1024))
     hl.eval(arr @ arr)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=5, burn_in_iterations=10)
 @pytest.mark.xtimeout(200)
 def benchmark_blockmatrix_write_from_entry_expr_range_mt(tmp_path):
     mt = hl.utils.range_matrix_table(40_000, 40_000, n_partitions=4)
@@ -55,7 +55,7 @@ def benchmark_blockmatrix_write_from_entry_expr_range_mt_standardize(tmp_path):
     )
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=5, iterations=8, burn_in_iterations=10)
 def benchmark_sum_table_of_ndarrays():
     ht = hl.utils.range_table(400).annotate(nd=hl.nd.ones((4096, 4096)))
     ht.aggregate(hl.agg.ndarray_sum(ht.nd))
diff --git a/hail/python/benchmark/hail/benchmark_matrix_table.py b/hail/python/benchmark/hail/benchmark_matrix_table.py
index 81f7785fba52..17a7f61eac09 100644
--- a/hail/python/benchmark/hail/benchmark_matrix_table.py
+++ b/hail/python/benchmark/hail/benchmark_matrix_table.py
@@ -3,110 +3,110 @@
 import hail as hl
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=25, iterations=15, burn_in_iterations=8)
 def benchmark_matrix_table_decode_and_count(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=20, burn_in_iterations=5)
 def benchmark_matrix_table_decode_and_count_just_gt(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt)).select_entries('GT')
     mt._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=25, iterations=10, burn_in_iterations=20)
 def benchmark_matrix_table_array_arithmetic(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt = mt.filter_rows(mt.alleles.length() == 2)
     mt.select_entries(dosage=hl.pl_dosage(mt.PL)).select_rows()._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=25, iterations=5, burn_in_iterations=10)
 def benchmark_matrix_table_entries_table(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt.entries()._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=10)
 def benchmark_matrix_table_entries_table_no_key(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt)).key_rows_by().key_cols_by()
     mt.entries()._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=30)
 def benchmark_matrix_table_rows_force_count(profile25_mt):
     ht = hl.read_matrix_table(str(profile25_mt)).rows().key_by()
     ht._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=10, burn_in_iterations=15)
 def benchmark_matrix_table_show(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt.show(100)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=15)
 def benchmark_matrix_table_rows_show(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt.rows().show(100)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=15, burn_in_iterations=16)
 def benchmark_matrix_table_cols_show(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt.cols().show(100)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=25, burn_in_iterations=10)
 def benchmark_matrix_table_take_entry(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt.GT.take(100)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=15)
 def benchmark_matrix_table_entries_show(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt.entries().show()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=20, burn_in_iterations=10)
 def benchmark_matrix_table_take_row(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt.info.AF.take(100)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=20, burn_in_iterations=10)
 def benchmark_matrix_table_take_col(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt.s.take(100)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=8)
 def benchmark_write_range_matrix_table_p100(tmp_path):
     mt = hl.utils.range_matrix_table(n_rows=1_000_000, n_cols=10, n_partitions=100)
     mt = mt.annotate_entries(x=mt.col_idx + mt.row_idx)
     mt.write(str(tmp_path / 'tmp.mt'))
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=15)
 def benchmark_write_profile_mt(profile25_mt, tmp_path):
     hl.read_matrix_table(str(profile25_mt)).write(str(tmp_path / 'tmp.mt'))
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=20, burn_in_iterations=10)
 def benchmark_matrix_table_rows_is_transition(profile25_mt):
     ht = hl.read_matrix_table(str(profile25_mt)).rows().key_by()
     ht.select(is_snp=hl.is_snp(ht.alleles[0], ht.alleles[1]))._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=20, burn_in_iterations=6)
 def benchmark_matrix_table_filter_entries(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt.filter_entries((mt.GQ > 8) & (mt.DP > 2))._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=3)
 def benchmark_matrix_table_filter_entries_unfilter(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt.filter_entries((mt.GQ > 8) & (mt.DP > 2)).unfilter_entries()._force_count_rows()
@@ -163,27 +163,27 @@ def many_aggs(mt):
     return {f'x{i}': expr for i, expr in enumerate(aggs)}
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=5, burn_in_iterations=4)
 def benchmark_matrix_table_many_aggs_row_wise(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt = mt.annotate_rows(**many_aggs(mt))
     mt.rows()._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=5, burn_in_iterations=10)
 def benchmark_matrix_table_many_aggs_col_wise(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt = mt.annotate_cols(**many_aggs(mt))
     mt.cols()._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=8)
 def benchmark_matrix_table_aggregate_entries(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt.aggregate_entries(hl.agg.stats(mt.GQ))
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=8)
 def benchmark_matrix_table_call_stats_star_star(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt.annotate_rows(**hl.agg.call_stats(mt.GT, mt.alleles))._force_count_rows()
@@ -241,24 +241,25 @@ def benchmark_gnomad_coverage_stats_optimized(gnomad_dp_sim):
     mt.rows()._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=20, burn_in_iterations=10)
 def benchmark_per_row_stats_star_star(gnomad_dp_sim):
     mt = hl.read_matrix_table(str(gnomad_dp_sim))
     mt.annotate_rows(**hl.agg.stats(mt.x))._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=10)
 def benchmark_read_decode_gnomad_coverage(gnomad_dp_sim):
     hl.read_matrix_table(str(gnomad_dp_sim))._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=20)
 def benchmark_import_bgen_force_count_just_gp(sim_ukb_bgen, sim_ukb_sample):
     mt = hl.import_bgen(str(sim_ukb_bgen), sample_file=str(sim_ukb_sample), entry_fields=['GP'], n_partitions=8)
     mt._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=20)
+@pytest.mark.xfail(raises=TimeoutError, reason=XFail.Timeout)
 def benchmark_import_bgen_force_count_all(sim_ukb_bgen, sim_ukb_sample):
     mt = hl.import_bgen(
         str(sim_ukb_bgen), sample_file=str(sim_ukb_sample), entry_fields=['GT', 'GP', 'dosage'], n_partitions=8
@@ -266,7 +267,7 @@ def benchmark_import_bgen_force_count_all(sim_ukb_bgen, sim_ukb_sample):
     )
     mt._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=12)
 @pytest.mark.xtimeout(180)
 def benchmark_import_bgen_info_score(sim_ukb_bgen, sim_ukb_sample):
     mt = hl.import_bgen(str(sim_ukb_bgen), sample_file=str(sim_ukb_sample), entry_fields=['GP'], n_partitions=8)
@@ -274,27 +275,27 @@ def benchmark_import_bgen_info_score(sim_ukb_bgen, sim_ukb_sample):
     mt.rows().select('info_score')._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=18)
 def benchmark_import_bgen_filter_count(sim_ukb_bgen, sim_ukb_sample):
     mt = hl.import_bgen(str(sim_ukb_bgen), sample_file=str(sim_ukb_sample), entry_fields=['GT', 'GP'], n_partitions=8)
     mt = mt.filter_rows(mt.alleles == ['A', 'T'])
     mt._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=20, burn_in_iterations=3)
 def benchmark_export_range_matrix_table_entry_field_p100(tmp_path):
     mt = hl.utils.range_matrix_table(n_rows=1_000_000, n_cols=10, n_partitions=100)
     mt = mt.annotate_entries(x=mt.col_idx + mt.row_idx)
     mt.x.export(str(tmp_path / 'result.txt'))
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=10, burn_in_iterations=8)
 def benchmark_export_range_matrix_table_row_p100(tmp_path):
     mt = hl.utils.range_matrix_table(n_rows=1_000_000, n_cols=10, n_partitions=100)
     mt.row.export(str(tmp_path / 'result.txt'))
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=15, iterations=25, burn_in_iterations=15)
 def benchmark_export_range_matrix_table_col_p100(tmp_path):
     mt = hl.utils.range_matrix_table(n_rows=1_000_000, n_cols=10, n_partitions=100)
     mt.col.export(str(tmp_path / 'result.txt'))
@@ -308,7 +309,7 @@ def benchmark_large_range_matrix_table_sum():
     mt.annotate_cols(foo=hl.agg.sum(mt.x))._force_count_cols()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=20, burn_in_iterations=7)
 def benchmark_kyle_sex_specific_qc(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt = mt.annotate_cols(sex=hl.if_else(hl.rand_bool(0.5), 'Male', 'Female'))
@@ -349,14 +350,14 @@ def benchmark_kyle_sex_specific_qc(profile25_mt):
     mt.rows()._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=25, iterations=10, burn_in_iterations=5)
 def benchmark_matrix_table_scan_count_rows_2():
     mt = hl.utils.range_matrix_table(n_rows=200_000_000, n_cols=10, n_partitions=16)
     mt = mt.annotate_rows(x=hl.scan.count())
     mt._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.3, instances=20, iterations=10, burn_in_iterations=20)
 def benchmark_matrix_table_scan_count_cols_2():
     mt = hl.utils.range_matrix_table(n_cols=10_000_000, n_rows=10)
     mt = mt.annotate_cols(x=hl.scan.count())
@@ -371,14 +372,14 @@ def benchmark_matrix_multi_write_nothing(tmp_path):
     hl.experimental.write_matrix_tables(mts, str(tmp_path / 'multi-write'))
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=25, iterations=10, burn_in_iterations=5)
 def benchmark_mt_localize_and_collect(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     ht = mt.localize_entries("ent")
     ht.head(150).collect()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=5)
 def benchmark_mt_group_by_memory_usage(random_doubles_mt):
     mt = hl.read_matrix_table(str(random_doubles_mt))
     mt = mt.group_rows_by(new_idx=mt.row_idx % 3).aggregate(x=hl.agg.mean(mt.x))
diff --git a/hail/python/benchmark/hail/benchmark_methods.py b/hail/python/benchmark/hail/benchmark_methods.py
index c1c8401e169c..1dd3c8718f09 100644
--- a/hail/python/benchmark/hail/benchmark_methods.py
+++ b/hail/python/benchmark/hail/benchmark_methods.py
@@ -3,43 +3,43 @@
 import hail as hl
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=5)
 def benchmark_import_vcf_write(profile25_vcf, tmp_path):
     mt = hl.import_vcf(str(profile25_vcf))
     out = str(tmp_path / 'out.mt')
     mt.write(out)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=5, burn_in_iterations=1)
 def benchmark_import_vcf_count_rows(profile25_vcf):
     mt = hl.import_vcf(str(profile25_vcf))
     mt.count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=5, burn_in_iterations=15)
 def benchmark_export_vcf(profile25_mt, tmp_path):
     mt = hl.read_matrix_table(str(profile25_mt))
     out = str(tmp_path / 'out.vcf.bgz')
     hl.export_vcf(mt, out)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=5, burn_in_iterations=10)
 def benchmark_sample_qc(profile25_mt):
     hl.sample_qc(hl.read_matrix_table(str(profile25_mt))).cols()._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=20, burn_in_iterations=8)
 def benchmark_variant_qc(profile25_mt):
     hl.variant_qc(hl.read_matrix_table(str(profile25_mt))).rows()._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=18)
 def benchmark_variant_and_sample_qc(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     hl.sample_qc(hl.variant_qc(mt))._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=5, burn_in_iterations=13)
 def benchmark_variant_and_sample_qc_nested_with_filters_2(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt = hl.variant_qc(mt)
@@ -53,7 +53,7 @@ def benchmark_variant_and_sample_qc_nested_with_filters_2(profile25_mt):
     mt.count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=20)
 def benchmark_variant_and_sample_qc_nested_with_filters_4(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt = hl.variant_qc(mt)
@@ -75,7 +75,7 @@ def benchmark_variant_and_sample_qc_nested_with_filters_4(profile25_mt):
     mt.count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=8, burn_in_iterations=20)
 def benchmark_variant_and_sample_qc_nested_with_filters_4_counts(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt = hl.variant_qc(mt)
@@ -107,7 +107,7 @@ def benchmark_hwe_normalized_pca(profile25_mt):
     hl.hwe_normalized_pca(mt.GT)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=10, burn_in_iterations=5)
 def benchmark_hwe_normalized_pca_blanczos_small_data_0_iterations(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt = mt.filter_rows(mt.info.AF[0] > 0.01)
@@ -121,19 +121,19 @@ def benchmark_hwe_normalized_pca_blanczos_small_data_10_iterations(profile25_mt)
     hl._hwe_normalized_blanczos(mt.GT, q_iterations=10)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=10)
 def benchmark_split_multi_hts(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     hl.split_multi_hts(mt)._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=5, burn_in_iterations=4)
 def benchmark_split_multi(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     hl.split_multi(mt)._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=10, burn_in_iterations=3)
 def benchmark_concordance(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt = mt.filter_rows(mt.alleles.length() == 2)
@@ -142,7 +142,7 @@ def benchmark_concordance(profile25_mt):
     c._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=10, burn_in_iterations=10)
 def benchmark_genetics_pipeline(profile25_mt, tmp_path):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt = hl.split_multi_hts(mt)
@@ -179,7 +179,7 @@ def benchmark_pc_relate_5k_5k(balding_nichols_5k_5k):
     rel._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=5, burn_in_iterations=10)
 def benchmark_linear_regression_rows(random_doubles_mt):
     mt = hl.read_matrix_table(str(random_doubles_mt))
     num_phenos = 100
@@ -209,7 +209,7 @@ def benchmark_linear_regression_rows_nd(random_doubles_mt):
     res._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=5)
 def benchmark_logistic_regression_rows_wald(random_doubles_mt):
     mt = hl.read_matrix_table(str(random_doubles_mt))
     mt = mt.head(2000)
diff --git a/hail/python/benchmark/hail/benchmark_sentinel.py b/hail/python/benchmark/hail/benchmark_sentinel.py
index 8f82fc648406..26cd072376e0 100644
--- a/hail/python/benchmark/hail/benchmark_sentinel.py
+++ b/hail/python/benchmark/hail/benchmark_sentinel.py
@@ -3,14 +3,14 @@
 import pytest
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=5, burn_in_iterations=10)
 def benchmark_sentinel_read_gunzip(many_ints_tsv):
     with gzip.open(many_ints_tsv) as f:
         for _ in f:
             pass
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=5)
 def benchmark_sentinel_cpu_hash_1():
     x = 0
     for _ in range(10_000):
diff --git a/hail/python/benchmark/hail/benchmark_shuffle.py b/hail/python/benchmark/hail/benchmark_shuffle.py
index 474a911bc6af..cbcde9ca834b 100644
--- a/hail/python/benchmark/hail/benchmark_shuffle.py
+++ b/hail/python/benchmark/hail/benchmark_shuffle.py
@@ -17,7 +17,7 @@ def new_query_tmpdir(tmp_path):
     backend.local_tmpdir = old
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=10, burn_in_iterations=10)
 def benchmark_shuffle_key_rows_by_mt(profile25_mt):
     mt = hl.read_matrix_table(str(profile25_mt))
     mt = mt.annotate_rows(reversed_position_locus=hl.struct(contig=mt.locus.contig, position=-mt.locus.position))
@@ -25,14 +25,14 @@ def benchmark_shuffle_key_rows_by_mt(profile25_mt):
     mt._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=10, burn_in_iterations=22)
 def benchmark_shuffle_order_by_10m_int():
     t = hl.utils.range_table(10_000_000, n_partitions=100)
     t = t.order_by(-t.idx)
     t._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=4)
 def benchmark_shuffle_key_rows_by_4096_byte_rows():
     mt = hl.utils.range_matrix_table(100_000, (1 << 12) // 4)
     mt = mt.annotate_entries(entry=mt.row_idx * mt.col_idx)
@@ -40,7 +40,7 @@ def benchmark_shuffle_key_rows_by_4096_byte_rows():
     mt._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=10, burn_in_iterations=4)
 def benchmark_shuffle_key_rows_by_65k_byte_rows():
     mt = hl.utils.range_matrix_table(10_000, (1 << 16) // 4)
     mt = mt.annotate_entries(entry=mt.row_idx * mt.col_idx)
@@ -48,13 +48,13 @@ def benchmark_shuffle_key_rows_by_65k_byte_rows():
     mt._force_count_rows()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=5, burn_in_iterations=8)
 def benchmark_shuffle_key_by_aggregate_bad_locality(many_ints_ht):
     ht = hl.read_table(str(many_ints_ht))
     ht.group_by(x=ht.i0 % 1000).aggregate(c=hl.agg.count(), m=hl.agg.mean(ht.i2))._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=10, burn_in_iterations=5)
 def benchmark_shuffle_key_by_aggregate_good_locality(many_ints_ht):
     ht = hl.read_table(str(many_ints_ht))
     divisor = 7_500_000 / 51  # should ensure each partition never overflows default buffer size
diff --git a/hail/python/benchmark/hail/benchmark_table.py b/hail/python/benchmark/hail/benchmark_table.py
index e841cc4e6d23..044c02aea256 100644
--- a/hail/python/benchmark/hail/benchmark_table.py
+++ b/hail/python/benchmark/hail/benchmark_table.py
@@ -4,7 +4,7 @@
 from benchmark.hail.fixtures import many_partitions_ht
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=25, iterations=10, burn_in_iterations=6)
 def benchmark_table_key_by_shuffle():
     n = 1_000_000
     ht = hl.utils.range_table(n)
@@ -12,7 +12,7 @@ def benchmark_table_key_by_shuffle():
     ht._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=25, burn_in_iterations=10)
 def benchmark_table_group_by_aggregate_sorted():
     n = 10_000_000
     ht = hl.utils.range_table(n)
@@ -20,7 +20,7 @@ def benchmark_table_group_by_aggregate_sorted():
     ht._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=20, burn_in_iterations=6)
 def benchmark_table_group_by_aggregate_unsorted():
     n = 10_000_000
     ht = hl.utils.range_table(n)
@@ -28,26 +28,25 @@ def benchmark_table_group_by_aggregate_unsorted():
     ht._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=25, burn_in_iterations=10)
 def benchmark_table_range_force_count():
     hl.utils.range_table(100_000_000)._force_count()
 
 
-@pytest.mark.benchmark()
-def benchmark_table_range_join_1b_1k():
-    ht1 = hl.utils.range_table(1_000_000_000)
-    ht2 = hl.utils.range_table(1_000)
+@pytest.mark.parametrize(
+    'a, b',
+    [
+        pytest.param(1_000_000_000, 1_000, marks=pytest.mark.benchmark(mds=1.2, instances=15, iterations=25, burn_in_iterations=20)),
+        pytest.param(1_000_000_000, 1_000_000_000, marks=pytest.mark.benchmark(mds=1.1, instances=5, iterations=20, burn_in_iterations=10)),
+    ],
+)
+def benchmark_table_range_join(a, b):
+    ht1 = hl.utils.range_table(a)
+    ht2 = hl.utils.range_table(b)
     ht1.join(ht2, 'inner').count()
 
 
-@pytest.mark.benchmark()
-def benchmark_table_range_join_1b_1b():
-    ht1 = hl.utils.range_table(1_000_000_000)
-    ht2 = hl.utils.range_table(1_000_000_000)
-    ht1.join(ht2, 'inner').count()
-
-
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=25, burn_in_iterations=10)
 def benchmark_table_python_construction():
     n = 100
     ht = hl.utils.range_table(100)
@@ -56,6 +55,7 @@ def benchmark_table_python_construction():
 
 
 @pytest.mark.benchmark()
+@pytest.mark.xfail(raises=TimeoutError, reason=XFail.Timeout)
 def benchmark_table_big_aggregate_compilation():
     n = 1_000
     ht = hl.utils.range_table(1)
@@ -72,15 +72,20 @@ def benchmark_table_big_aggregate_compile_and_execute():
     ht.aggregate(expr)
 
 
-@pytest.mark.benchmark()
-@pytest.mark.parametrize('m, n', [(1_000_000, 1_000_000), (1_000_000, 1_000)])
+@pytest.mark.parametrize(
+    'm, n',
+    [
+        pytest.param(1_000_000, 1_000, marks=pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=8)),
+        pytest.param(1_000_000, 1_000_000, marks=pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=20)),
+    ],
+)
 def benchmark_table_foreign_key_join(m, n):
     ht = hl.utils.range_table(m)
     ht2 = hl.utils.range_table(n)
     ht.annotate(x=ht2[(m - 1 - ht.idx) % n])._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=20, burn_in_iterations=5)
 def benchmark_table_aggregate_array_sum():
     n = 10_000_000
     m = 100
@@ -88,7 +93,7 @@ def benchmark_table_aggregate_array_sum():
     ht.aggregate(hl.agg.array_sum(hl.range(0, m)))
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=20, burn_in_iterations=18)
 def benchmark_table_annotate_many_flat():
     n = 1_000_000
     m = 100
@@ -131,19 +136,19 @@ def benchmark_table_annotate_many_nested_dependence():
     ht._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=5)
 def benchmark_table_read_force_count_ints(many_ints_ht):
     ht = hl.read_table(str(many_ints_ht))
     ht._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=10)
 def benchmark_table_read_force_count_strings(many_strings_ht):
     ht = hl.read_table(str(many_strings_ht))
     ht._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=9)
 def benchmark_table_import_ints(many_ints_tsv):
     hl.import_table(
         str(many_ints_tsv),
@@ -151,17 +156,17 @@ def benchmark_table_import_ints(many_ints_tsv):
     )._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=10)
 def benchmark_table_import_ints_impute(many_ints_tsv):
     hl.import_table(str(many_ints_tsv), impute=True)._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=25, burn_in_iterations=3)
 def benchmark_table_import_strings(many_strings_tsv):
     hl.import_table(str(many_strings_tsv))._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=18)
 def benchmark_table_aggregate_int_stats(many_ints_ht):
     ht = hl.read_table(str(many_ints_ht))
     ht.aggregate(
@@ -173,20 +178,20 @@ def benchmark_table_aggregate_int_stats(many_ints_ht):
     )
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=10, burn_in_iterations=10)
 def benchmark_table_range_means():
     ht = hl.utils.range_table(10_000_000, 16)
     ht = ht.annotate(m=hl.mean(hl.range(0, ht.idx % 1111)))
     ht._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=20, burn_in_iterations=6)
 def benchmark_table_range_array_range_force_count():
     ht = hl.utils.range_table(30).annotate(big_range=hl.range(100_000_000))
     ht._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=15)
 def benchmark_table_aggregate_approx_cdf(random_doubles_mt):
     mt = hl.read_matrix_table(str(random_doubles_mt))
     mt.aggregate_entries((
@@ -196,13 +201,13 @@ def benchmark_table_aggregate_approx_cdf(random_doubles_mt):
     ))
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=5, burn_in_iterations=20)
 def benchmark_table_aggregate_counter(many_strings_ht):
     ht = hl.read_table(str(many_strings_ht))
     ht.aggregate(hl.tuple([hl.agg.counter(ht[f'f{i}']) for i in range(8)]))
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=5, burn_in_iterations=10)
 def benchmark_table_aggregate_take_by_strings(many_strings_ht):
     ht = hl.read_table(str(many_strings_ht))
     ht.aggregate(hl.tuple([hl.agg.take(ht['f18'], 25, ordering=ht[f'f{i}']) for i in range(18)]))
@@ -214,7 +219,7 @@ def benchmark_table_aggregate_downsample_dense(many_ints_ht):
     ht.aggregate(tuple([hl.agg.downsample(ht[f'i{i}'], ht['i3'], label=hl.str(ht['i4'])) for i in range(3)]))
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=10, burn_in_iterations=10)
 def benchmark_table_aggregate_downsample_worst_case():
     ht = hl.utils.range_table(250_000_000, 8)
     ht.aggregate(hl.agg.downsample(ht.idx, -ht.idx))
@@ -227,44 +232,63 @@ def benchmark_table_aggregate_downsample_sparse():
     ht.aggregate(hl.agg.downsample(hl.rand_norm() ** 5, hl.rand_norm() ** 5))
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=10, burn_in_iterations=8)
 def benchmark_table_aggregate_linreg(many_ints_ht):
     ht = hl.read_table(str(many_ints_ht))
     ht.aggregate(hl.agg.array_agg(lambda i: hl.agg.linreg(ht.i0 + i, [ht.i1, ht.i2, ht.i3, ht.i4]), hl.range(75)))
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=25)
 def benchmark_table_take(many_strings_ht):
     ht = hl.read_table(str(many_strings_ht))
     ht.take(100)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=30, burn_in_iterations=20)
 def benchmark_table_show(many_strings_ht):
     ht = hl.read_table(str(many_strings_ht))
     ht.show(100)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=15, burn_in_iterations=20)
 def benchmark_table_expr_take(many_strings_ht):
     ht = hl.read_table(str(many_strings_ht))
     hl.tuple([ht.f1, ht.f2]).take(100)
 
 
-@pytest.mark.benchmark()
+@pytest.mark.parametrize(
+    'many_partitions_ht',
+    [
+        pytest.param(10, marks=pytest.mark.benchmark(mds=1.1, instances=25, iterations=15, burn_in_iterations=10)),
+        pytest.param(100, marks=pytest.mark.benchmark(mds=1.1, instances=15, iterations=20, burn_in_iterations=8)),
+        pytest.param(1000, marks=pytest.mark.benchmark(mds=1.1, instances=15, iterations=10, burn_in_iterations=10)),
+    ],
+    indirect=True,
+)
 def benchmark_read_force_count_partitions(many_partitions_ht):
     hl.read_table(str(many_partitions_ht))._force_count()
 
 
-@pytest.mark.benchmark()
-@pytest.mark.parametrize('n,n_partitions', [(10_000_000, 1000), (10_000_000, 100), (10_000_000, 10)])
+@pytest.mark.parametrize(
+    'n,n_partitions',
+    [
+        pytest.param(10_000_000, 10, marks=pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=5)),
+        pytest.param(10_000_000, 100, marks=pytest.mark.benchmark(mds=1.1, instances=20, iterations=12, burn_in_iterations=3)),
+        pytest.param(10_000_000, 1000, marks=pytest.mark.benchmark(mds=1.2, instances=10, iterations=10, burn_in_iterations=7)),
+    ],
+)
 def benchmark_write_range_table(tmp_path, n, n_partitions):
     ht = hl.utils.range_table(n, n_partitions)
     ht.write(str(tmp_path / 'tmp.ht'))
 
 
-@pytest.mark.benchmark()
-@pytest.mark.parametrize('many_partitions_ht', [1_000], indirect=True)
+@pytest.mark.parametrize(
+    'many_partitions_ht',
+    [
+        pytest.param(1000, marks=pytest.mark.benchmark(mds=1.2, instances=10, iterations=5, burn_in_iterations=8)),
+    ],
+    indirect=True,
+)
 def benchmark_read_with_index(many_partitions_ht):
     rows = 10_000_000
     bins = 1_000
@@ -277,40 +301,68 @@ def benchmark_read_with_index(many_partitions_ht):
 many_partitions_ht1, many_partitions_ht2 = [many_partitions_ht] * 2
 
 
-@pytest.mark.benchmark()
+@pytest.mark.parametrize(
+    'many_partitions_ht1, many_partitions_ht2',
+    [
+        pytest.param(10, 10, marks=pytest.mark.benchmark(mds=1.2, instances=10, iterations=5, burn_in_iterations=5)),
+        pytest.param(10, 100, marks=pytest.mark.benchmark(mds=1.2, instances=10, iterations=15, burn_in_iterations=5)),
+        pytest.param(10, 1000, marks=pytest.mark.benchmark(mds=1.2, instances=10, iterations=10, burn_in_iterations=8)),
+        pytest.param(100, 10, marks=pytest.mark.benchmark(mds=1.1, instances=10, iterations=10, burn_in_iterations=20)),
+        pytest.param(100, 100, marks=pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=5)),
+        pytest.param(100, 1000, marks=pytest.mark.benchmark(mds=1.2, instances=15, iterations=10, burn_in_iterations=15)),
+        pytest.param(1000, 10, marks=pytest.mark.benchmark(mds=1.2, instances=15, iterations=5, burn_in_iterations=12)),
+        pytest.param(1000, 100, marks=pytest.mark.benchmark(mds=1.2, instances=15, iterations=10, burn_in_iterations=6)),
+        pytest.param(1000, 1000, marks=pytest.mark.benchmark(mds=1.2, instances=10, iterations=10, burn_in_iterations=10)),
+    ],
+    indirect=True,
+)
 def benchmark_union_partitions_table(many_partitions_ht1, many_partitions_ht2):
     ht1 = hl.read_table(str(many_partitions_ht1))
     ht2 = hl.read_table(str(many_partitions_ht2))
     ht1.union(ht2)._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.parametrize(
+    'many_partitions_ht1, many_partitions_ht2',
+    [
+        pytest.param(10, 10, marks=pytest.mark.benchmark(mds=1.1, instances=15, iterations=15, burn_in_iterations=6)),
+        pytest.param(10, 100, marks=pytest.mark.benchmark(mds=1.1, instances=25, iterations=15, burn_in_iterations=4)),
+        pytest.param(10, 1000, marks=pytest.mark.benchmark(mds=1.2, instances=10, iterations=15, burn_in_iterations=14)),
+        pytest.param(100, 10, marks=pytest.mark.benchmark(mds=1.2, instances=10, iterations=10, burn_in_iterations=10)),
+        pytest.param(100, 100, marks=pytest.mark.benchmark(mds=1.1, instances=15, iterations=20, burn_in_iterations=10)),
+        pytest.param(100, 1000, marks=pytest.mark.benchmark(mds=1.2, instances=15, iterations=5, burn_in_iterations=10)),
+        pytest.param(1000, 10, marks=pytest.mark.benchmark(mds=1.2, instances=10, iterations=20, burn_in_iterations=12)),
+        pytest.param(1000, 100, marks=pytest.mark.benchmark(mds=1.2, instances=10, iterations=20, burn_in_iterations=10)),
+        pytest.param(1000, 1000, marks=pytest.mark.benchmark(mds=1.2, instances=10, iterations=10, burn_in_iterations=10)),
+    ],
+    indirect=True,
+)
 def benchmark_join_partitions_table(many_partitions_ht1, many_partitions_ht2):
     ht1 = hl.read_table(str(many_partitions_ht1))
     ht2 = hl.read_table(str(many_partitions_ht2))
     ht1.join(ht2)._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=8)
 def benchmark_group_by_collect_per_row(gnomad_dp_sim):
     ht = hl.read_matrix_table(str(gnomad_dp_sim)).localize_entries('e', 'c')
     ht.group_by(*ht.key).aggregate(value=hl.agg.collect(ht.row_value))._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=5, burn_in_iterations=10)
 def benchmark_group_by_take_rekey(gnomad_dp_sim):
     ht = hl.read_matrix_table(str(gnomad_dp_sim)).localize_entries('e', 'c')
     ht.group_by(k=hl.int(ht.row_idx / 50)).aggregate(value=hl.agg.take(ht.row_value, 1))._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=25, iterations=5, burn_in_iterations=6)
 def benchmark_table_scan_sum_1k_partitions():
     ht = hl.utils.range_table(1000000, n_partitions=1000)
     ht = ht.annotate(x=hl.scan.sum(ht.idx))
     ht._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.2, instances=10, iterations=5, burn_in_iterations=20)
 def benchmark_table_scan_prev_non_null():
     ht = hl.utils.range_table(100000000, n_partitions=10)
     ht = ht.annotate(x=hl.range(0, ht.idx % 25))
@@ -318,21 +370,21 @@ def benchmark_table_scan_prev_non_null():
     ht._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=10, burn_in_iterations=15)
 def benchmark_test_map_filter_region_memory():
     high_mem_table = hl.utils.range_table(30).naive_coalesce(1).annotate(big_array=hl.zeros(100_000_000))
     high_mem_table = high_mem_table.filter(high_mem_table.idx % 2 == 0)
     assert high_mem_table._force_count() == 15
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=25, iterations=5, burn_in_iterations=10)
 def benchmark_test_head_and_tail_region_memory():
     high_mem_table = hl.utils.range_table(100).annotate(big_array=hl.zeros(100_000_000))
     high_mem_table = high_mem_table.head(30)
     high_mem_table._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=15, iterations=10, burn_in_iterations=10)
 def benchmark_test_inner_join_region_memory():
     high_mem_table = hl.utils.range_table(30).naive_coalesce(1).annotate(big_array=hl.zeros(50_000_000))
     high_mem_table2 = hl.utils.range_table(30).naive_coalesce(1).annotate(big_array=hl.zeros(50_000_000))
@@ -340,7 +392,7 @@ def benchmark_test_inner_join_region_memory():
     joined._force_count()
 
 
-@pytest.mark.benchmark()
+@pytest.mark.benchmark(mds=1.1, instances=10, iterations=10, burn_in_iterations=9)
 def benchmark_test_left_join_region_memory():
     high_mem_table = hl.utils.range_table(30).naive_coalesce(1).annotate(big_array=hl.zeros(50_000_000))
     high_mem_table2 = hl.utils.range_table(30).naive_coalesce(1).annotate(big_array=hl.zeros(50_000_000))