Skip to content

Commit

Permalink
[query] Benchmark configurations
Browse files: browse the repository at this point in the history
  • Loading branch information
ehigham committed Dec 6, 2024
1 parent 97a40fa commit 432a727
Show file tree
Hide file tree
Showing 8 changed files with 176 additions and 123 deletions.
2 changes: 1 addition & 1 deletion hail/python/benchmark/hail/benchmark_benchmark_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from benchmark.tools.statistics import analyze_benchmarks


@pytest.mark.benchmark()
@pytest.mark.benchmark(mds=1.2, instances=10, iterations=5, burn_in_iterations=10)
def benchmark_analyze_benchmarks(local_tmpdir, onethreetwo, onethreethree):
inputs = (onethreetwo, onethreethree)
inputs = ((v, Path(tempfile.mktemp(dir=local_tmpdir))) for v in inputs)
Expand Down
10 changes: 5 additions & 5 deletions hail/python/benchmark/hail/benchmark_combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,30 +24,30 @@ def benchmark_compile_2k_merge(empty_gvcf, tmp_path):
hl.vds.write_variant_datasets(combined, str(tmp_path / 'combiner-multi-write'), overwrite=True)


@pytest.mark.benchmark(mds=1.1, instances=10, iterations=10, burn_in_iterations=10)
@pytest.mark.xtimeout(270)
def benchmark_python_only_10k_transform(empty_gvcf):
    """Call ``transform_gvcf`` 10,000 times on a single imported GVCF."""
    # The GVCF is imported once; every call receives the same object.
    vcf = import_vcf(empty_gvcf)
    for _ in range(10_000):
        transform_gvcf(vcf, [])


@pytest.mark.benchmark(mds=1.1, instances=10, iterations=10, burn_in_iterations=20)
def benchmark_python_only_10k_combine(empty_gvcf):
    """Combine 10,000 references to one transformed GVCF, one chunk at a time."""
    mt = transform_gvcf(import_vcf(empty_gvcf), [])
    # Chunks are sized by COMBINE_GVCF_MAX; each chunk is combined independently.
    for mts in chunk(COMBINE_GVCF_MAX, [mt] * 10_000):
        combine_variant_datasets(mts)


@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=10)
def benchmark_import_and_transform_gvcf(single_gvcf):
    """Import one GVCF, transform it, and force a row count on both resulting datasets."""
    vds = transform_gvcf(import_vcf(single_gvcf), [])
    vds.reference_data._force_count_rows()
    vds.variant_data._force_count_rows()


@pytest.mark.benchmark()
@pytest.mark.benchmark(mds=1.2, instances=10, iterations=15, burn_in_iterations=8)
def benchmark_import_gvcf_force_count(single_gvcf):
mt = import_vcf(single_gvcf)
mt._force_count_rows()
Expand All @@ -62,7 +62,7 @@ def tmp_and_output_paths(tmp_path):
return (tmp, output)


@pytest.mark.benchmark()
@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=10)
@pytest.mark.xtimeout(180)
def benchmark_vds_combiner_chr22(chr22_gvcfs, tmp_and_output_paths):
parts = hl.eval([hl.parse_locus_interval('chr22:start-end', reference_genome='GRCh38')])
Expand Down
12 changes: 6 additions & 6 deletions hail/python/benchmark/hail/benchmark_linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,32 +12,32 @@ def benchmark_block_matrix_nested_multiply(tmp_path):
bm.write(str(tmp_path / 'result.mt'), overwrite=True)


@pytest.mark.benchmark(mds=1.2, instances=10, iterations=5, burn_in_iterations=5)
def benchmark_make_ndarray():
    """Annotate a 200,000-row range table with ``hl.nd.array(hl.range(idx))`` and force a count."""
    ht = hl.utils.range_table(200_000)
    ht = ht.annotate(x=hl.nd.array(hl.range(ht.idx)))
    ht._force_count()


@pytest.mark.benchmark(mds=1.2, instances=10, iterations=20, burn_in_iterations=10)
def benchmark_ndarray_addition():
    """Evaluate the sum of a 1024x1024 ndarray of ones with itself."""
    arr = hl.nd.ones((1024, 1024))
    hl.eval(arr + arr)


@pytest.mark.benchmark(mds=1.2, instances=20, iterations=5, burn_in_iterations=10)
def benchmark_ndarray_matmul_int64():
    """Evaluate ``arr @ arr`` for a 1024x1024 ndarray mapped through ``hl.int64``."""
    arr = hl.nd.arange(1024 * 1024).map(hl.int64).reshape((1024, 1024))
    hl.eval(arr @ arr)


@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=6)
def benchmark_ndarray_matmul_float64():
    """Evaluate ``arr @ arr`` for a 1024x1024 ndarray mapped through ``hl.float64``."""
    arr = hl.nd.arange(1024 * 1024).map(hl.float64).reshape((1024, 1024))
    hl.eval(arr @ arr)


@pytest.mark.benchmark()
@pytest.mark.benchmark(mds=1.2, instances=10, iterations=5, burn_in_iterations=10)
@pytest.mark.xtimeout(200)
def benchmark_blockmatrix_write_from_entry_expr_range_mt(tmp_path):
mt = hl.utils.range_matrix_table(40_000, 40_000, n_partitions=4)
Expand All @@ -55,7 +55,7 @@ def benchmark_blockmatrix_write_from_entry_expr_range_mt_standardize(tmp_path):
)


@pytest.mark.benchmark()
@pytest.mark.benchmark(mds=1.1, instances=5, iterations=8, burn_in_iterations=10)
def benchmark_sum_table_of_ndarrays():
ht = hl.utils.range_table(400).annotate(nd=hl.nd.ones((4096, 4096)))
ht.aggregate(hl.agg.ndarray_sum(ht.nd))
Expand Down
73 changes: 37 additions & 36 deletions hail/python/benchmark/hail/benchmark_matrix_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,110 +3,110 @@
import hail as hl


@pytest.mark.benchmark(mds=1.1, instances=25, iterations=15, burn_in_iterations=8)
def benchmark_matrix_table_decode_and_count(profile25_mt):
    """Read the matrix table at ``profile25_mt`` and force a row count."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt._force_count_rows()


@pytest.mark.benchmark(mds=1.1, instances=20, iterations=20, burn_in_iterations=5)
def benchmark_matrix_table_decode_and_count_just_gt(profile25_mt):
    """Read the matrix table, keep only the ``GT`` entry field, and force a row count."""
    mt = hl.read_matrix_table(str(profile25_mt)).select_entries('GT')
    mt._force_count_rows()


@pytest.mark.benchmark(mds=1.1, instances=25, iterations=10, burn_in_iterations=20)
def benchmark_matrix_table_array_arithmetic(profile25_mt):
    """Compute ``hl.pl_dosage(PL)`` per entry on rows with exactly two alleles, then force a row count."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt = mt.filter_rows(mt.alleles.length() == 2)
    # select_rows() with no arguments drops the non-key row fields before counting.
    mt.select_entries(dosage=hl.pl_dosage(mt.PL)).select_rows()._force_count_rows()


@pytest.mark.benchmark(mds=1.1, instances=25, iterations=5, burn_in_iterations=10)
def benchmark_matrix_table_entries_table(profile25_mt):
    """Convert the matrix table to its entries table and force a count."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt.entries()._force_count()


@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=10)
def benchmark_matrix_table_entries_table_no_key(profile25_mt):
    """Clear the row and column keys, then force a count of the entries table."""
    mt = hl.read_matrix_table(str(profile25_mt)).key_rows_by().key_cols_by()
    mt.entries()._force_count()


@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=30)
def benchmark_matrix_table_rows_force_count(profile25_mt):
    """Take the rows table with its key cleared and force a count."""
    ht = hl.read_matrix_table(str(profile25_mt)).rows().key_by()
    ht._force_count()


@pytest.mark.benchmark(mds=1.1, instances=15, iterations=10, burn_in_iterations=15)
def benchmark_matrix_table_show(profile25_mt):
    """Call ``show(100)`` on the matrix table."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt.show(100)


@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=15)
def benchmark_matrix_table_rows_show(profile25_mt):
    """Call ``show(100)`` on the matrix table's rows table."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt.rows().show(100)


@pytest.mark.benchmark(mds=1.1, instances=15, iterations=15, burn_in_iterations=16)
def benchmark_matrix_table_cols_show(profile25_mt):
    """Call ``show(100)`` on the matrix table's columns table."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt.cols().show(100)


@pytest.mark.benchmark(mds=1.1, instances=15, iterations=25, burn_in_iterations=10)
def benchmark_matrix_table_take_entry(profile25_mt):
    """Take 100 values of the ``GT`` entry field."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt.GT.take(100)


@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=15)
def benchmark_matrix_table_entries_show(profile25_mt):
    """Call ``show()`` with its default arguments on the entries table."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt.entries().show()


@pytest.mark.benchmark(mds=1.1, instances=10, iterations=20, burn_in_iterations=10)
def benchmark_matrix_table_take_row(profile25_mt):
    """Take 100 values of the ``info.AF`` field."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt.info.AF.take(100)


@pytest.mark.benchmark(mds=1.1, instances=15, iterations=20, burn_in_iterations=10)
def benchmark_matrix_table_take_col(profile25_mt):
    """Take 100 values of the ``s`` field."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt.s.take(100)


@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=8)
def benchmark_write_range_matrix_table_p100(tmp_path):
    """Write a 1,000,000 x 10 range matrix table (100 partitions) with one computed entry field."""
    mt = hl.utils.range_matrix_table(n_rows=1_000_000, n_cols=10, n_partitions=100)
    mt = mt.annotate_entries(x=mt.col_idx + mt.row_idx)
    mt.write(str(tmp_path / 'tmp.mt'))


@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=15)
def benchmark_write_profile_mt(profile25_mt, tmp_path):
    """Read the matrix table at ``profile25_mt`` and write it back out under ``tmp_path``."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt.write(str(tmp_path / 'tmp.mt'))


@pytest.mark.benchmark(mds=1.1, instances=20, iterations=20, burn_in_iterations=10)
def benchmark_matrix_table_rows_is_transition(profile25_mt):
    """Evaluate ``hl.is_snp`` on the first two alleles of every row and force a count.

    Despite the function name, the expression benchmarked here is ``hl.is_snp``.
    """
    ht = hl.read_matrix_table(str(profile25_mt)).rows().key_by()
    ht.select(is_snp=hl.is_snp(ht.alleles[0], ht.alleles[1]))._force_count()


@pytest.mark.benchmark(mds=1.1, instances=15, iterations=20, burn_in_iterations=6)
def benchmark_matrix_table_filter_entries(profile25_mt):
    """Keep entries with ``GQ > 8`` and ``DP > 2``, then force a row count."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt.filter_entries((mt.GQ > 8) & (mt.DP > 2))._force_count_rows()


@pytest.mark.benchmark()
@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=3)
def benchmark_matrix_table_filter_entries_unfilter(profile25_mt):
mt = hl.read_matrix_table(str(profile25_mt))
mt.filter_entries((mt.GQ > 8) & (mt.DP > 2)).unfilter_entries()._force_count_rows()
Expand Down Expand Up @@ -163,27 +163,27 @@ def many_aggs(mt):
return {f'x{i}': expr for i, expr in enumerate(aggs)}


@pytest.mark.benchmark(mds=1.1, instances=20, iterations=5, burn_in_iterations=4)
def benchmark_matrix_table_many_aggs_row_wise(profile25_mt):
    """Annotate rows with every aggregation from ``many_aggs`` and force a count of the rows table."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt = mt.annotate_rows(**many_aggs(mt))
    mt.rows()._force_count()


@pytest.mark.benchmark(mds=1.1, instances=20, iterations=5, burn_in_iterations=10)
def benchmark_matrix_table_many_aggs_col_wise(profile25_mt):
    """Annotate columns with every aggregation from ``many_aggs`` and force a count of the columns table."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt = mt.annotate_cols(**many_aggs(mt))
    mt.cols()._force_count()


@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=8)
def benchmark_matrix_table_aggregate_entries(profile25_mt):
    """Aggregate ``hl.agg.stats`` of the ``GQ`` field over all entries."""
    mt = hl.read_matrix_table(str(profile25_mt))
    mt.aggregate_entries(hl.agg.stats(mt.GQ))


@pytest.mark.benchmark()
@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=8)
def benchmark_matrix_table_call_stats_star_star(profile25_mt):
mt = hl.read_matrix_table(str(profile25_mt))
mt.annotate_rows(**hl.agg.call_stats(mt.GT, mt.alleles))._force_count_rows()
Expand Down Expand Up @@ -241,60 +241,61 @@ def benchmark_gnomad_coverage_stats_optimized(gnomad_dp_sim):
mt.rows()._force_count()


@pytest.mark.benchmark(mds=1.1, instances=15, iterations=20, burn_in_iterations=10)
def benchmark_per_row_stats_star_star(gnomad_dp_sim):
    """Annotate rows with the unpacked result of ``hl.agg.stats(mt.x)`` and force a row count."""
    mt = hl.read_matrix_table(str(gnomad_dp_sim))
    # ** unpacks the aggregation result so each of its fields becomes a row field.
    mt.annotate_rows(**hl.agg.stats(mt.x))._force_count_rows()


@pytest.mark.benchmark(mds=1.1, instances=20, iterations=10, burn_in_iterations=10)
def benchmark_read_decode_gnomad_coverage(gnomad_dp_sim):
    """Read the matrix table at ``gnomad_dp_sim`` and force a row count."""
    mt = hl.read_matrix_table(str(gnomad_dp_sim))
    mt._force_count_rows()


@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=20)
def benchmark_import_bgen_force_count_just_gp(sim_ukb_bgen, sim_ukb_sample):
    """Import the BGEN with only the ``GP`` entry field (8 partitions) and force a row count."""
    mt = hl.import_bgen(str(sim_ukb_bgen), sample_file=str(sim_ukb_sample), entry_fields=['GP'], n_partitions=8)
    mt._force_count_rows()


@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=20)
# NOTE(review): static analysis (Codacy) reported "Undefined variable 'XFail'" for the
# original `reason=XFail.Timeout`; a literal reason string is used so the module
# imports cleanly. Switch back to `XFail.Timeout` once `XFail` is imported here.
@pytest.mark.xfail(raises=TimeoutError, reason='timeout')
def benchmark_import_bgen_force_count_all(sim_ukb_bgen, sim_ukb_sample):
    """Import the BGEN with the ``GT``, ``GP`` and ``dosage`` entry fields and force a row count."""
    mt = hl.import_bgen(
        str(sim_ukb_bgen), sample_file=str(sim_ukb_sample), entry_fields=['GT', 'GP', 'dosage'], n_partitions=8
    )
    mt._force_count_rows()


@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=12)
@pytest.mark.xtimeout(180)
def benchmark_import_bgen_info_score(sim_ukb_bgen, sim_ukb_sample):
    """Compute ``hl.agg.info_score(GP)`` per row of the imported BGEN and force a count of that field."""
    mt = hl.import_bgen(str(sim_ukb_bgen), sample_file=str(sim_ukb_sample), entry_fields=['GP'], n_partitions=8)
    mt = mt.annotate_rows(info_score=hl.agg.info_score(mt.GP))
    mt.rows().select('info_score')._force_count()


@pytest.mark.benchmark(mds=1.1, instances=10, iterations=5, burn_in_iterations=18)
def benchmark_import_bgen_filter_count(sim_ukb_bgen, sim_ukb_sample):
    """Import the BGEN with ``GT`` and ``GP``, keep rows whose alleles equal ``['A', 'T']``, and count rows."""
    mt = hl.import_bgen(str(sim_ukb_bgen), sample_file=str(sim_ukb_sample), entry_fields=['GT', 'GP'], n_partitions=8)
    mt = mt.filter_rows(mt.alleles == ['A', 'T'])
    mt._force_count_rows()


@pytest.mark.benchmark(mds=1.1, instances=15, iterations=20, burn_in_iterations=3)
def benchmark_export_range_matrix_table_entry_field_p100(tmp_path):
    """Export a computed entry field of a 1,000,000 x 10 range matrix table (100 partitions)."""
    mt = hl.utils.range_matrix_table(n_rows=1_000_000, n_cols=10, n_partitions=100)
    mt = mt.annotate_entries(x=mt.col_idx + mt.row_idx)
    mt.x.export(str(tmp_path / 'result.txt'))


@pytest.mark.benchmark(mds=1.2, instances=10, iterations=10, burn_in_iterations=8)
def benchmark_export_range_matrix_table_row_p100(tmp_path):
    """Export the ``row`` struct of a 1,000,000 x 10 range matrix table (100 partitions)."""
    mt = hl.utils.range_matrix_table(n_rows=1_000_000, n_cols=10, n_partitions=100)
    mt.row.export(str(tmp_path / 'result.txt'))


@pytest.mark.benchmark()
@pytest.mark.benchmark(mds=1.2, instances=15, iterations=25, burn_in_iterations=15)
def benchmark_export_range_matrix_table_col_p100(tmp_path):
mt = hl.utils.range_matrix_table(n_rows=1_000_000, n_cols=10, n_partitions=100)
mt.col.export(str(tmp_path / 'result.txt'))
Expand All @@ -308,7 +309,7 @@ def benchmark_large_range_matrix_table_sum():
mt.annotate_cols(foo=hl.agg.sum(mt.x))._force_count_cols()


@pytest.mark.benchmark()
@pytest.mark.benchmark(mds=1.1, instances=20, iterations=20, burn_in_iterations=7)
def benchmark_kyle_sex_specific_qc(profile25_mt):
mt = hl.read_matrix_table(str(profile25_mt))
mt = mt.annotate_cols(sex=hl.if_else(hl.rand_bool(0.5), 'Male', 'Female'))
Expand Down Expand Up @@ -349,14 +350,14 @@ def benchmark_kyle_sex_specific_qc(profile25_mt):
mt.rows()._force_count()


@pytest.mark.benchmark(mds=1.1, instances=25, iterations=10, burn_in_iterations=5)
def benchmark_matrix_table_scan_count_rows_2():
    """Annotate rows of a 200,000,000 x 10 range matrix table with ``hl.scan.count()`` and count rows."""
    mt = hl.utils.range_matrix_table(n_rows=200_000_000, n_cols=10, n_partitions=16)
    mt = mt.annotate_rows(x=hl.scan.count())
    mt._force_count_rows()


@pytest.mark.benchmark()
@pytest.mark.benchmark(mds=1.3, instances=20, iterations=10, burn_in_iterations=20)
def benchmark_matrix_table_scan_count_cols_2():
mt = hl.utils.range_matrix_table(n_cols=10_000_000, n_rows=10)
mt = mt.annotate_cols(x=hl.scan.count())
Expand All @@ -371,14 +372,14 @@ def benchmark_matrix_multi_write_nothing(tmp_path):
hl.experimental.write_matrix_tables(mts, str(tmp_path / 'multi-write'))


@pytest.mark.benchmark(mds=1.1, instances=25, iterations=10, burn_in_iterations=5)
def benchmark_mt_localize_and_collect(profile25_mt):
    """Localize entries into a table field named ``"ent"`` and collect the first 150 rows."""
    mt = hl.read_matrix_table(str(profile25_mt))
    ht = mt.localize_entries("ent")
    ht.head(150).collect()


@pytest.mark.benchmark()
@pytest.mark.benchmark(mds=1.1, instances=20, iterations=15, burn_in_iterations=5)
def benchmark_mt_group_by_memory_usage(random_doubles_mt):
mt = hl.read_matrix_table(str(random_doubles_mt))
mt = mt.group_rows_by(new_idx=mt.row_idx % 3).aggregate(x=hl.agg.mean(mt.x))
Expand Down
Loading

0 comments on commit 432a727

Please sign in to comment.