From f3ac6447b532af6e593db1be555efde28b6451de Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 19 Nov 2023 16:49:20 +0100 Subject: [PATCH 1/7] add benchmark comparison script --- benchmark/tools/compare.py | 135 +++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100755 benchmark/tools/compare.py diff --git a/benchmark/tools/compare.py b/benchmark/tools/compare.py new file mode 100755 index 00000000000..7619a4ce59f --- /dev/null +++ b/benchmark/tools/compare.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +import sys +import json +import argparse + +parser = argparse.ArgumentParser(description="Compare to Ginkgo benchmark outputs") +parser.add_argument("--outlier-threshold") +parser.add_argument("--output") +parser.add_argument("baseline") +parser.add_argument("comparison") +args = parser.parse_args() +keys = {"stencil", "size", "filename", "n", "r", "k", "m"} + + +def key_to_str(key: tuple) -> str: + """Restore a JSON output from a key tuple""" + result = {} + for key_name, key_val in zip(keys, key): + if key_val is not None: + result[key_name] = key_val + return json.dumps(result) + + +def parse_json_matrix(filename: str) -> dict: + """Parse a JSON file into a key -> test_case dict""" + parsed = json.load(open(filename)) + result = {} + assert isinstance(parsed, list) + for case in parsed: + assert isinstance(case, dict) + assert not keys.isdisjoint(case.keys()) + dict_key = tuple(case.get(key, None) for key in keys) + if dict_key in result.keys(): + print( + "WARNING: Duplicate key {}".format(key_to_str(dict_key)), + file=sys.stderr, + ) + result[dict_key] = case + return result + + +def warn_on_inconsistent_keys(baseline: dict, comparison: dict, context: str): + """Print a warning message for non-matching keys between baseline/comparison using the given context string""" + baseline_only = set(baseline.keys()).difference(comparison.keys()) + comparison_only = set(comparison.keys()).difference(baseline.keys()) + for dict_key 
in baseline_only: + print( + "WARNING: Key {} found in baseline only in context {}".format( + key_to_str(dict_key), context + ), + file=sys.stderr, + ) + for dict_key in comparison_only: + print( + "WARNING: Key {} found in comparison only in context {}".format( + key_to_str(dict_key), context + ), + file=sys.stderr, + ) + + +def ratio(baseline: int | float, comparison: int | float) -> float: + """Compares the ratio between baseline and comparison. For runtimes, this is the speedup.""" + return baseline / comparison + + +def compare_benchmark(baseline: dict, comparison: dict, context: str): + """Compares a handful of keys and component breakdowns recursively, writing them with a suffix to the output""" + comparison_keys = {"time", "storage", "iterations"} + suffix = ".ratio" + warn_on_inconsistent_keys(baseline, comparison, context) + result = {} + for key in baseline.keys(): + sub_context = "{}.{}".format(context, key) + if key == "components": + assert isinstance(baseline[key], dict) + assert isinstance(comparison[key], dict) + warn_on_inconsistent_keys(baseline[key], comparison[key], sub_context) + result[key + suffix] = { + sub_key: ratio(baseline[key][sub_key], comparison[key][sub_key]) + for sub_key in baseline[key] + } + elif isinstance(baseline[key], dict): + result[key] = compare_benchmark(baseline[key], comparison[key], sub_context) + elif key in comparison_keys: + result[key + suffix] = ratio(baseline[key], comparison[key]) + return result + + +def compare(baseline: dict, comparison: dict, context: str) -> dict: + """Compares a test case, keeping root-level values and recursing into benchmarks""" + warn_on_inconsistent_keys(baseline, comparison, context) + result = {} + for key in baseline.keys(): + # we don't have lists on the test case root level + assert not isinstance(baseline[key], list) + if isinstance(baseline[key], dict): + benchmark_result = {} + warn_on_inconsistent_keys( + baseline[key], comparison[key], "{}.{}".format(context, key) + ) + for 
benchmark_name in baseline[key].keys(): + if isinstance(baseline[key][benchmark_name], dict): + comparison_result = compare_benchmark( + baseline[key][benchmark_name], + comparison[key][benchmark_name], + "{}.{}.{}".format(context, key, benchmark_name), + ) + if len(comparison_result) > 0: + benchmark_result[benchmark_name] = comparison_result + if len(benchmark_result) > 0: + result[key] = benchmark_result + else: + # everything that's not a dict should only depend on the key in the root level + if baseline[key] != comparison[key]: + print( + "WARNING: Inconsistent value for {}: {} != {} in context {}".format( + key, baseline[key], comparison[key], context + ), + file=sys.stderr, + ) + result[key] = baseline[key] + return result + + +baseline_json = parse_json_matrix(args.baseline) +comparison_json = parse_json_matrix(args.comparison) +warn_on_inconsistent_keys(baseline_json, comparison_json, "root") + +results = [] + +for key in baseline_json.keys(): + results.append(compare(baseline_json[key], comparison_json[key], key_to_str(key))) + +json.dump(results, sys.stdout, indent=4) From 8178aa1ee1747223bc0d244177818945e1454add Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 19 Nov 2023 16:53:20 +0100 Subject: [PATCH 2/7] pretty-print final JSON output --- benchmark/utils/general.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 241d2225938..e0045d8f417 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -289,7 +289,7 @@ void backup_results(json& results) return; } std::ofstream ofs(filenames[next]); - ofs << results; + ofs << std::setw(4) << results; next = 1 - next; } From 7bf1de8b1371cd63e89463b217cd859f67b639d5 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 20 Nov 2023 06:03:29 +0100 Subject: [PATCH 3/7] enable outlier detection and table output --- benchmark/tools/compare.py | 144 ++++++++++++++++++++++++++++++++++--- 1 file changed, 135 
insertions(+), 9 deletions(-) diff --git a/benchmark/tools/compare.py b/benchmark/tools/compare.py index 7619a4ce59f..938d7aa65a6 100755 --- a/benchmark/tools/compare.py +++ b/benchmark/tools/compare.py @@ -2,23 +2,51 @@ import sys import json import argparse +import math +import pandas as pd parser = argparse.ArgumentParser(description="Compare to Ginkgo benchmark outputs") -parser.add_argument("--outlier-threshold") -parser.add_argument("--output") +parser.add_argument( + "--outliers", action="store_true", help="List outliers from the results" +) +parser.add_argument( + "--outlier-threshold", + type=float, + default=10, + help="At what percentage of deviation (above or below) should outliers be reported", +) +parser.add_argument( + "--outlier-count", + type=int, + default=1000, + help="How many outliers should be reported per benchmark", +) +parser.add_argument("--output", choices=["json", "csv", "markdown"], default="json") parser.add_argument("baseline") parser.add_argument("comparison") args = parser.parse_args() keys = {"stencil", "size", "filename", "n", "r", "k", "m"} +comparison_keys = {"time", "storage", "iterations"} +suffix = ".ratio" -def key_to_str(key: tuple) -> str: +def key_to_json(key: tuple) -> dict: """Restore a JSON output from a key tuple""" result = {} for key_name, key_val in zip(keys, key): if key_val is not None: result[key_name] = key_val - return json.dumps(result) + return result + + +def key_to_str(key: tuple) -> str: + """Restore a JSON output string from a key tuple""" + mapped = key_to_json(key) + if "filename" in mapped.keys(): + return mapped["filename"] + if "stencil" in mapped.keys(): + return "stencil({}, {})".format(mapped["stencil"], mapped["size"]) + return json.dumps(mapped).replace('"', "") def parse_json_matrix(filename: str) -> dict: @@ -66,8 +94,6 @@ def ratio(baseline: int | float, comparison: int | float) -> float: def compare_benchmark(baseline: dict, comparison: dict, context: str): """Compares a handful of keys and 
component breakdowns recursively, writing them with a suffix to the output""" - comparison_keys = {"time", "storage", "iterations"} - suffix = ".ratio" warn_on_inconsistent_keys(baseline, comparison, context) result = {} for key in baseline.keys(): @@ -123,13 +149,113 @@ def compare(baseline: dict, comparison: dict, context: str) -> dict: return result +def extract_benchmark_results( + input: dict, benchmarks: dict, case_key: tuple, context: str | None +): + for key, value in input.items(): + benchmark_name = key if context is None else "{}.{}".format(context, key) + if key in map(lambda x: x + suffix, comparison_keys): + benchmark_name = benchmark_name[: -len(suffix)] + if benchmark_name not in benchmarks.keys(): + benchmarks[benchmark_name] = [] + benchmarks[benchmark_name].append((case_key, value)) + elif isinstance(value, dict): + extract_benchmark_results(value, benchmarks, case_key, benchmark_name) + + +def is_outlier(value: float): + return math.fabs(math.log(value)) > math.log(1.0 + args.outlier_threshold / 100) + + baseline_json = parse_json_matrix(args.baseline) comparison_json = parse_json_matrix(args.comparison) warn_on_inconsistent_keys(baseline_json, comparison_json, "root") -results = [] +results = {} for key in baseline_json.keys(): - results.append(compare(baseline_json[key], comparison_json[key], key_to_str(key))) + results[key] = compare(baseline_json[key], comparison_json[key], key_to_str(key)) + +outliers = {} +if args.outliers: + benchmarks = {} + for key, value in results.items(): + extract_benchmark_results(value, benchmarks, key, None) + for benchmark_name, benchmark_results in benchmarks.items(): + outlier = sorted( + [ + (case_key, value) + for case_key, value in benchmark_results + if is_outlier(value) + ], + key=lambda x: math.fabs(math.log(x[1])), + reverse=True, + ) + outliers[benchmark_name] = outlier[: min(len(outlier), args.outlier_count)] -json.dump(results, sys.stdout, indent=4) +if args.output == "json": + print( + json.dumps( + 
{ + "results": [value for _, value in results.items()], + "outliers": { + key: [ + {"value": ratio_value} + key_to_json(case_key) + for (case_key, ratio_value) in value + ] + for key, value in outliers.items() + if len(value) > 0 + }, + }, + indent=4, + ) + ) +else: + columns = ["benchmark", "testcase", "ratio"] + only_first = args.output == "markdown" + table = pd.DataFrame( + sum( + [ + [ + ( + key if i == 0 or not only_first else "", + key_to_str(value[0]), + value[1], + ) + for i, value in enumerate(values) + ] + for key, values in benchmarks.items() + ], + [], + ), + columns=columns, + ) + if args.output == "csv": + table.to_csv(sys.stdout, index=False) + else: + table.to_markdown(sys.stdout, index=False) + if args.outliers: + outlier_table = pd.DataFrame( + sum( + [ + [ + ( + key if i == 0 or not only_first else "", + key_to_str(value[0]), + value[1], + ) + for i, value in enumerate(values) + ] + for key, values in outliers.items() + ], + [], + ), + columns=columns, + ) + if len(outlier_table) > 0: + print("\n\nOutliers") + if args.output == "csv": + outlier_table.to_csv(sys.stdout, index=False) + else: + outlier_table.to_markdown(sys.stdout, index=False) + print() From e15ab8610b691e0cef74bbbe37d7137f1c9a1a24 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 20 Nov 2023 23:24:47 +0100 Subject: [PATCH 4/7] review updates - simplify things - add type annotations - fix outlier issues - test everything Co-authored-by: Gregor Olenik Co-authored-by: Marcel Koch --- benchmark/tools/compare.py | 271 +++++++++++------------ benchmark/tools/compare_test.py | 226 +++++++++++++++++++ benchmark/tools/compare_test_input1.json | 48 ++++ benchmark/tools/compare_test_input2.json | 48 ++++ benchmark/tools/compare_test_input3.json | 39 ++++ 5 files changed, 492 insertions(+), 140 deletions(-) create mode 100644 benchmark/tools/compare_test.py create mode 100644 benchmark/tools/compare_test_input1.json create mode 100644 benchmark/tools/compare_test_input2.json create 
mode 100644 benchmark/tools/compare_test_input3.json diff --git a/benchmark/tools/compare.py b/benchmark/tools/compare.py index 938d7aa65a6..9dd72c12538 100755 --- a/benchmark/tools/compare.py +++ b/benchmark/tools/compare.py @@ -1,90 +1,65 @@ #!/usr/bin/env python3 +# SPDX-FileCopyrightText: 2017-2023 The Ginkgo authors +# SPDX-License-Identifier: BSD-3-Clause import sys import json import argparse import math import pandas as pd +import tabulate # for pandas markdown output +from frozendict import frozendict + -parser = argparse.ArgumentParser(description="Compare to Ginkgo benchmark outputs") -parser.add_argument( - "--outliers", action="store_true", help="List outliers from the results" -) -parser.add_argument( - "--outlier-threshold", - type=float, - default=10, - help="At what percentage of deviation (above or below) should outliers be reported", -) -parser.add_argument( - "--outlier-count", - type=int, - default=1000, - help="How many outliers should be reported per benchmark", -) -parser.add_argument("--output", choices=["json", "csv", "markdown"], default="json") -parser.add_argument("baseline") -parser.add_argument("comparison") -args = parser.parse_args() keys = {"stencil", "size", "filename", "n", "r", "k", "m"} comparison_keys = {"time", "storage", "iterations"} suffix = ".ratio" -def key_to_json(key: tuple) -> dict: - """Restore a JSON output from a key tuple""" - result = {} - for key_name, key_val in zip(keys, key): - if key_val is not None: - result[key_name] = key_val - return result - - -def key_to_str(key: tuple) -> str: - """Restore a JSON output string from a key tuple""" - mapped = key_to_json(key) - if "filename" in mapped.keys(): - return mapped["filename"] - if "stencil" in mapped.keys(): - return "stencil({}, {})".format(mapped["stencil"], mapped["size"]) - return json.dumps(mapped).replace('"', "") +def sorted_key_intersection(a: dict, b: dict) -> list: + return sorted(set(a.keys()).intersection(b.keys()), key=str) def 
parse_json_matrix(filename: str) -> dict: """Parse a JSON file into a key -> test_case dict""" - parsed = json.load(open(filename)) + with open(filename) as file: + parsed = json.load(file) result = {} assert isinstance(parsed, list) for case in parsed: assert isinstance(case, dict) assert not keys.isdisjoint(case.keys()) - dict_key = tuple(case.get(key, None) for key in keys) + dict_key = frozendict( + {key: case[key] for key in keys.intersection(case.keys())} + ) if dict_key in result.keys(): print( - "WARNING: Duplicate key {}".format(key_to_str(dict_key)), + f"WARNING: Duplicate key {json.dumps(dict_key)}", file=sys.stderr, ) - result[dict_key] = case + result[frozendict(dict_key)] = case return result def warn_on_inconsistent_keys(baseline: dict, comparison: dict, context: str): """Print a warning message for non-matching keys between baseline/comparison using the given context string""" - baseline_only = set(baseline.keys()).difference(comparison.keys()) - comparison_only = set(comparison.keys()).difference(baseline.keys()) - for dict_key in baseline_only: + baseline_only = sorted(set(baseline.keys()).difference(comparison.keys())) + comparison_only = sorted(set(comparison.keys()).difference(baseline.keys())) + for key in baseline_only: print( - "WARNING: Key {} found in baseline only in context {}".format( - key_to_str(dict_key), context - ), + f"WARNING: Key {json.dumps(key) if isinstance(key, dict) else key} found in baseline only in context {context}", file=sys.stderr, ) - for dict_key in comparison_only: + for key in comparison_only: print( - "WARNING: Key {} found in comparison only in context {}".format( - key_to_str(dict_key), context - ), + f"WARNING: Key {json.dumps(key) if isinstance(key, dict) else key} found in comparison only in context {context}", file=sys.stderr, ) + for key in sorted_key_intersection(baseline, comparison): + if isinstance(baseline[key], dict): + assert isinstance(comparison[key], dict) + warn_on_inconsistent_keys( + 
baseline[key], comparison[key], f"{context}/{key}" + ) def ratio(baseline: int | float, comparison: int | float) -> float: @@ -92,45 +67,36 @@ def ratio(baseline: int | float, comparison: int | float) -> float: return baseline / comparison -def compare_benchmark(baseline: dict, comparison: dict, context: str): +def compare_benchmark(baseline: dict, comparison: dict) -> dict: """Compares a handful of keys and component breakdowns recursively, writing them with a suffix to the output""" - warn_on_inconsistent_keys(baseline, comparison, context) result = {} - for key in baseline.keys(): - sub_context = "{}.{}".format(context, key) + for key in sorted_key_intersection(baseline, comparison): if key == "components": assert isinstance(baseline[key], dict) assert isinstance(comparison[key], dict) - warn_on_inconsistent_keys(baseline[key], comparison[key], sub_context) result[key + suffix] = { sub_key: ratio(baseline[key][sub_key], comparison[key][sub_key]) for sub_key in baseline[key] } elif isinstance(baseline[key], dict): - result[key] = compare_benchmark(baseline[key], comparison[key], sub_context) + result[key] = compare_benchmark(baseline[key], comparison[key]) elif key in comparison_keys: result[key + suffix] = ratio(baseline[key], comparison[key]) return result -def compare(baseline: dict, comparison: dict, context: str) -> dict: +def compare(baseline: dict, comparison: dict) -> dict: """Compares a test case, keeping root-level values and recursing into benchmarks""" - warn_on_inconsistent_keys(baseline, comparison, context) result = {} - for key in baseline.keys(): + for key in sorted_key_intersection(baseline, comparison): # we don't have lists on the test case root level assert not isinstance(baseline[key], list) if isinstance(baseline[key], dict): benchmark_result = {} - warn_on_inconsistent_keys( - baseline[key], comparison[key], "{}.{}".format(context, key) - ) for benchmark_name in baseline[key].keys(): if isinstance(baseline[key][benchmark_name], dict): 
comparison_result = compare_benchmark( - baseline[key][benchmark_name], - comparison[key][benchmark_name], - "{}.{}.{}".format(context, key, benchmark_name), + baseline[key][benchmark_name], comparison[key][benchmark_name] ) if len(comparison_result) > 0: benchmark_result[benchmark_name] = comparison_result @@ -140,9 +106,7 @@ def compare(baseline: dict, comparison: dict, context: str) -> dict: # everything that's not a dict should only depend on the key in the root level if baseline[key] != comparison[key]: print( - "WARNING: Inconsistent value for {}: {} != {} in context {}".format( - key, baseline[key], comparison[key], context - ), + f"WARNING: Inconsistent value for {key}: {baseline[key]} != {comparison[key]}", file=sys.stderr, ) result[key] = baseline[key] @@ -151,9 +115,9 @@ def compare(baseline: dict, comparison: dict, context: str) -> dict: def extract_benchmark_results( input: dict, benchmarks: dict, case_key: tuple, context: str | None -): +) -> None: for key, value in input.items(): - benchmark_name = key if context is None else "{}.{}".format(context, key) + benchmark_name = key if context is None else f"{context}/{key}" if key in map(lambda x: x + suffix, comparison_keys): benchmark_name = benchmark_name[: -len(suffix)] if benchmark_name not in benchmarks.keys(): @@ -163,99 +127,126 @@ def extract_benchmark_results( extract_benchmark_results(value, benchmarks, case_key, benchmark_name) -def is_outlier(value: float): +def is_outlier(value: float, args) -> bool: + """returns true iff the value exceeds 1.0 above the outlier threshold""" return math.fabs(math.log(value)) > math.log(1.0 + args.outlier_threshold / 100) -baseline_json = parse_json_matrix(args.baseline) -comparison_json = parse_json_matrix(args.comparison) -warn_on_inconsistent_keys(baseline_json, comparison_json, "root") +def compare_main(args: list): + """Runs the comparison script""" + parser = argparse.ArgumentParser(description="Compare to Ginkgo benchmark outputs") + 
parser.add_argument( + "--outliers", action="store_true", help="List outliers from the results" + ) + parser.add_argument( + "--outlier-threshold", + type=float, + default=10, + help="At what percentage of deviation (above or below) should outliers be reported", + ) + parser.add_argument( + "--outlier-count", + type=int, + default=1000, + help="How many outliers should be reported per benchmark", + ) + parser.add_argument("--output", choices=["json", "csv", "markdown"], default="json") + parser.add_argument("baseline") + parser.add_argument("comparison") + args = parser.parse_args(args) + baseline_json = parse_json_matrix(args.baseline) + comparison_json = parse_json_matrix(args.comparison) + warn_on_inconsistent_keys(baseline_json, comparison_json, "root") -results = {} + results = {} -for key in baseline_json.keys(): - results[key] = compare(baseline_json[key], comparison_json[key], key_to_str(key)) + for key in set(baseline_json.keys()).intersection(comparison_json.keys()): + results[key] = compare(baseline_json[key], comparison_json[key]) -outliers = {} -if args.outliers: + outliers = {} benchmarks = {} for key, value in results.items(): extract_benchmark_results(value, benchmarks, key, None) - for benchmark_name, benchmark_results in benchmarks.items(): - outlier = sorted( - [ - (case_key, value) - for case_key, value in benchmark_results - if is_outlier(value) - ], - key=lambda x: math.fabs(math.log(x[1])), - reverse=True, - ) - outliers[benchmark_name] = outlier[: min(len(outlier), args.outlier_count)] + if args.outliers: + for benchmark_name, benchmark_results in benchmarks.items(): + outlier = sorted( + [ + (case_key, value) + for case_key, value in benchmark_results + if is_outlier(value, args) + ], + key=lambda x: math.fabs(math.log(x[1])), + reverse=True, + ) + outliers[benchmark_name] = outlier[: min(len(outlier), args.outlier_count)] -if args.output == "json": - print( - json.dumps( - { - "results": [value for _, value in results.items()], - 
"outliers": { - key: [ - {"value": ratio_value} + key_to_json(case_key) - for (case_key, ratio_value) in value - ] - for key, value in outliers.items() - if len(value) > 0 + if args.output == "json": + print( + json.dumps( + { + "results": [value for _, value in results.items()], + "outliers": { + key: [ + {"value": ratio_value, **case_key} + for (case_key, ratio_value) in value + ] + for key, value in outliers.items() + if len(value) > 0 + }, }, - }, - indent=4, + indent=4, + ) ) - ) -else: - columns = ["benchmark", "testcase", "ratio"] - only_first = args.output == "markdown" - table = pd.DataFrame( - sum( - [ - [ - ( - key if i == 0 or not only_first else "", - key_to_str(value[0]), - value[1], - ) - for i, value in enumerate(values) - ] - for key, values in benchmarks.items() - ], - [], - ), - columns=columns, - ) - if args.output == "csv": - table.to_csv(sys.stdout, index=False) else: - table.to_markdown(sys.stdout, index=False) - if args.outliers: - outlier_table = pd.DataFrame( + columns = ["benchmark", "testcase", "ratio"] + only_first = args.output == "markdown" + table = pd.DataFrame( sum( [ [ ( key if i == 0 or not only_first else "", - key_to_str(value[0]), + json.dumps(value[0]), value[1], ) for i, value in enumerate(values) ] - for key, values in outliers.items() + for key, values in benchmarks.items() ], [], ), columns=columns, ) - if len(outlier_table) > 0: - print("\n\nOutliers") - if args.output == "csv": - outlier_table.to_csv(sys.stdout, index=False) - else: - outlier_table.to_markdown(sys.stdout, index=False) - print() + if args.output == "csv": + table.to_csv(sys.stdout, index=False) + else: + table.to_markdown(sys.stdout, index=False) + if args.outliers: + outlier_table = pd.DataFrame( + sum( + [ + [ + ( + key if i == 0 or not only_first else "", + json.dumps(value[0]), + value[1], + ) + for i, value in enumerate(values) + ] + for key, values in outliers.items() + ], + [], + ), + columns=columns, + ) + if len(outlier_table) > 0: + 
print("\n\nOutliers") + if args.output == "csv": + outlier_table.to_csv(sys.stdout, index=False) + else: + outlier_table.to_markdown(sys.stdout, index=False) + print() + + +if __name__ == "__main__": + compare(sys.argv) diff --git a/benchmark/tools/compare_test.py b/benchmark/tools/compare_test.py new file mode 100644 index 00000000000..1b906c63b45 --- /dev/null +++ b/benchmark/tools/compare_test.py @@ -0,0 +1,226 @@ +import json +import compare +import os + +dir_path = os.path.dirname(os.path.realpath(__file__)) + + +def test_mismatch(capsys): + compare.compare_main( + [ + dir_path + "/../test/reference/blas.simple.stdout", + dir_path + "/../test/reference/spmv.matrix.stdout", + ] + ) + captured = capsys.readouterr() + ref_out = {"results": [], "outliers": {}} + + ref_err = """WARNING: Key {"n": 100} found in baseline only in context root +WARNING: Key {"filename": ""} found in comparison only in context root +""" + assert json.loads(captured.out) == ref_out + assert captured.err == ref_err + + +def test_simple(capsys): + compare.compare_main( + [ + dir_path + "/../test/reference/spmv.matrix.stdout", + dir_path + "/../test/reference/spmv.matrix.stdout", + ] + ) + captured = capsys.readouterr() + ref_out = { + "results": [ + { + "cols": 36, + "filename": "", + "nonzeros": 208, + "rows": 36, + "spmv": {"coo": {"storage.ratio": 1.0, "time.ratio": 1.0}}, + } + ], + "outliers": {}, + } + + assert json.loads(captured.out) == ref_out + assert captured.err == "" + + +def test_outliers(capsys): + compare.compare_main( + [ + "--outliers", + dir_path + "/compare_test_input1.json", + dir_path + "/compare_test_input2.json", + ] + ) + captured = capsys.readouterr() + ref_out = { + "results": [ + { + "cols": 36, + "filename": "mtx", + "nonzeros": 208, + "rows": 36, + "spmv": { + "coo": {"storage.ratio": 1.0, "time.ratio": 1.2}, + "csr": {"storage.ratio": 2.0, "time.ratio": 0.8}, + "ell": {"storage.ratio": 0.5, "time.ratio": 1.0}, + "sellp": {"storage.ratio": 1.0, "time.ratio": 
1.11}, + "hybrid": {"storage.ratio": 1.0, "time.ratio": 1.01}, + }, + } + ], + "outliers": { + "spmv/coo/time": [{"value": 1.2, "filename": "mtx"}], + "spmv/csr/storage": [{"value": 2.0, "filename": "mtx"}], + "spmv/csr/time": [{"value": 0.8, "filename": "mtx"}], + "spmv/ell/storage": [{"value": 0.5, "filename": "mtx"}], + "spmv/sellp/time": [{"value": 1.11, "filename": "mtx"}], + }, + } + + assert json.loads(captured.out) == ref_out + assert captured.err == "" + + +def test_outliers_limited(capsys): + compare.compare_main( + [ + "--outliers", + "--outlier-count", + "0", + dir_path + "/compare_test_input1.json", + dir_path + "/compare_test_input2.json", + ] + ) + captured = capsys.readouterr() + ref_out = { + "results": [ + { + "cols": 36, + "filename": "mtx", + "nonzeros": 208, + "rows": 36, + "spmv": { + "coo": {"storage.ratio": 1.0, "time.ratio": 1.2}, + "csr": {"storage.ratio": 2.0, "time.ratio": 0.8}, + "ell": {"storage.ratio": 0.5, "time.ratio": 1.0}, + "sellp": {"storage.ratio": 1.0, "time.ratio": 1.11}, + "hybrid": {"storage.ratio": 1.0, "time.ratio": 1.01}, + }, + } + ], + "outliers": {}, + } + + assert json.loads(captured.out) == ref_out + assert captured.err == "" + + +def test_csv(capsys): + compare.compare_main( + [ + "--outliers", + "--output", + "csv", + dir_path + "/compare_test_input1.json", + dir_path + "/compare_test_input2.json", + ] + ) + captured = capsys.readouterr() + ref_out = """benchmark,testcase,ratio +spmv/coo/storage,"{""filename"": ""mtx""}",1.0 +spmv/coo/time,"{""filename"": ""mtx""}",1.2 +spmv/csr/storage,"{""filename"": ""mtx""}",2.0 +spmv/csr/time,"{""filename"": ""mtx""}",0.8 +spmv/ell/storage,"{""filename"": ""mtx""}",0.5 +spmv/ell/time,"{""filename"": ""mtx""}",1.0 +spmv/sellp/storage,"{""filename"": ""mtx""}",1.0 +spmv/sellp/time,"{""filename"": ""mtx""}",1.11 +spmv/hybrid/storage,"{""filename"": ""mtx""}",1.0 +spmv/hybrid/time,"{""filename"": ""mtx""}",1.01 + + +Outliers +benchmark,testcase,ratio +spmv/coo/time,"{""filename"": 
""mtx""}",1.2 +spmv/csr/storage,"{""filename"": ""mtx""}",2.0 +spmv/csr/time,"{""filename"": ""mtx""}",0.8 +spmv/ell/storage,"{""filename"": ""mtx""}",0.5 +spmv/sellp/time,"{""filename"": ""mtx""}",1.11 + +""" + assert captured.out == ref_out + assert captured.err == "" + + +def test_md(capsys): + compare.compare_main( + [ + "--outliers", + "--output", + "markdown", + dir_path + "/compare_test_input1.json", + dir_path + "/compare_test_input2.json", + ] + ) + captured = capsys.readouterr() + ref_out = """| benchmark | testcase | ratio | +|:--------------------|:--------------------|--------:| +| spmv/coo/storage | {"filename": "mtx"} | 1 | +| spmv/coo/time | {"filename": "mtx"} | 1.2 | +| spmv/csr/storage | {"filename": "mtx"} | 2 | +| spmv/csr/time | {"filename": "mtx"} | 0.8 | +| spmv/ell/storage | {"filename": "mtx"} | 0.5 | +| spmv/ell/time | {"filename": "mtx"} | 1 | +| spmv/sellp/storage | {"filename": "mtx"} | 1 | +| spmv/sellp/time | {"filename": "mtx"} | 1.11 | +| spmv/hybrid/storage | {"filename": "mtx"} | 1 | +| spmv/hybrid/time | {"filename": "mtx"} | 1.01 | + +Outliers +| benchmark | testcase | ratio | +|:-----------------|:--------------------|--------:| +| spmv/coo/time | {"filename": "mtx"} | 1.2 | +| spmv/csr/storage | {"filename": "mtx"} | 2 | +| spmv/csr/time | {"filename": "mtx"} | 0.8 | +| spmv/ell/storage | {"filename": "mtx"} | 0.5 | +| spmv/sellp/time | {"filename": "mtx"} | 1.11 | +""" + assert captured.out == ref_out + assert captured.err == "" + + +def test_complex(capsys): + compare.compare_main( + [ + dir_path + "/compare_test_input3.json", + dir_path + "/compare_test_input3.json", + ] + ) + captured = capsys.readouterr() + ref_out = { + "results": [ + {"blas": {"axpy": {"time.ratio": 1.0}}, "k": 2, "m": 3, "n": 1, "r": 4}, + {"size": 100, "spmv": {"csr": {"time.ratio": 1.0}}, "stencil": "7pt"}, + { + "filename": "mtx", + "solver": { + "gmres": { + "apply": { + "components.ratio": {"foo": 1.0}, + "iterations.ratio": 1.0, + "time.ratio": 
1.0, + }, + "generate": {"time.ratio": 1.0}, + } + }, + }, + ], + "outliers": {}, + } + + assert json.loads(captured.out) == ref_out + assert captured.err == "" diff --git a/benchmark/tools/compare_test_input1.json b/benchmark/tools/compare_test_input1.json new file mode 100644 index 00000000000..da7b190c270 --- /dev/null +++ b/benchmark/tools/compare_test_input1.json @@ -0,0 +1,48 @@ +[ + { + "filename": "mtx", + "spmv": { + "coo": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.2, + "repetitions": 10, + "completed": true + }, + "csr": { + "storage": 2000, + "max_relative_norm2": 1.0, + "time": 0.8, + "repetitions": 10, + "completed": true + }, + "ell": { + "storage": 500, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.11, + "repetitions": 10, + "completed": true + }, + "hybrid": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.01, + "repetitions": 10, + "completed": true + } + }, + "rows": 36, + "cols": 36, + "nonzeros": 208, + "optimal": { + "spmv": "coo" + } + } +] \ No newline at end of file diff --git a/benchmark/tools/compare_test_input2.json b/benchmark/tools/compare_test_input2.json new file mode 100644 index 00000000000..29a8d348618 --- /dev/null +++ b/benchmark/tools/compare_test_input2.json @@ -0,0 +1,48 @@ +[ + { + "filename": "mtx", + "spmv": { + "coo": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "hybrid": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 
10, + "completed": true + } + }, + "rows": 36, + "cols": 36, + "nonzeros": 208, + "optimal": { + "spmv": "coo" + } + } +] \ No newline at end of file diff --git a/benchmark/tools/compare_test_input3.json b/benchmark/tools/compare_test_input3.json new file mode 100644 index 00000000000..f317073d12d --- /dev/null +++ b/benchmark/tools/compare_test_input3.json @@ -0,0 +1,39 @@ +[ + { + "stencil": "7pt", + "size": 100, + "spmv": { + "csr": { + "time": 0.5 + } + } + }, + { + "n": 1, + "k": 2, + "m": 3, + "r": 4, + "blas": { + "axpy": { + "time": 100 + } + } + }, + { + "filename": "mtx", + "solver": { + "gmres": { + "apply": { + "time": 1.0, + "components": { + "foo": 2.0 + }, + "iterations": 10 + }, + "generate": { + "time": 2.0 + } + } + } + } +] \ No newline at end of file From b60a34da75715482b28960b8ba9f8dcaa4007230 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 21 Nov 2023 04:39:53 +0100 Subject: [PATCH 5/7] fix script mode --- benchmark/tools/compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/tools/compare.py b/benchmark/tools/compare.py index 9dd72c12538..8de109fe34a 100755 --- a/benchmark/tools/compare.py +++ b/benchmark/tools/compare.py @@ -249,4 +249,4 @@ def compare_main(args: list): if __name__ == "__main__": - compare(sys.argv) + compare_main(sys.argv) From 8fecafbfa8a222bc04f2cf730a05baa5fe705f55 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 22 Nov 2023 10:37:32 +0100 Subject: [PATCH 6/7] more stable output --- benchmark/tools/compare.py | 2 +- benchmark/tools/compare_test.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/tools/compare.py b/benchmark/tools/compare.py index 8de109fe34a..e0f79e90988 100755 --- a/benchmark/tools/compare.py +++ b/benchmark/tools/compare.py @@ -160,7 +160,7 @@ def compare_main(args: list): results = {} - for key in set(baseline_json.keys()).intersection(comparison_json.keys()): + for key in sorted_key_intersection(baseline_json, 
comparison_json): results[key] = compare(baseline_json[key], comparison_json[key]) outliers = {} diff --git a/benchmark/tools/compare_test.py b/benchmark/tools/compare_test.py index 1b906c63b45..83e2ee5dbda 100644 --- a/benchmark/tools/compare_test.py +++ b/benchmark/tools/compare_test.py @@ -203,8 +203,6 @@ def test_complex(capsys): captured = capsys.readouterr() ref_out = { "results": [ - {"blas": {"axpy": {"time.ratio": 1.0}}, "k": 2, "m": 3, "n": 1, "r": 4}, - {"size": 100, "spmv": {"csr": {"time.ratio": 1.0}}, "stencil": "7pt"}, { "filename": "mtx", "solver": { @@ -218,6 +216,8 @@ def test_complex(capsys): } }, }, + {"blas": {"axpy": {"time.ratio": 1.0}}, "k": 2, "m": 3, "n": 1, "r": 4}, + {"size": 100, "spmv": {"csr": {"time.ratio": 1.0}}, "stencil": "7pt"}, ], "outliers": {}, } From c26f69fe65a9eea60566c50f89846ccb661d03de Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 27 Nov 2023 14:50:58 +0100 Subject: [PATCH 7/7] review update --- benchmark/tools/compare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/tools/compare.py b/benchmark/tools/compare.py index e0f79e90988..f6ac5ae321a 100755 --- a/benchmark/tools/compare.py +++ b/benchmark/tools/compare.py @@ -36,7 +36,7 @@ def parse_json_matrix(filename: str) -> dict: f"WARNING: Duplicate key {json.dumps(dict_key)}", file=sys.stderr, ) - result[frozendict(dict_key)] = case + result[dict_key] = case return result @@ -128,7 +128,7 @@ def extract_benchmark_results( def is_outlier(value: float, args) -> bool: - """returns true iff the value exceeds 1.0 above the outlier threshold""" + """returns true iff the value is more than the outlier threshold away from 1.0""" return math.fabs(math.log(value)) > math.log(1.0 + args.outlier_threshold / 100)