Skip to content

Commit

Permalink
dynamic analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
EtomicBomb committed Jan 2, 2025
1 parent 9f2e4f7 commit d8ef9d4
Show file tree
Hide file tree
Showing 37 changed files with 447 additions and 10 deletions.
6 changes: 6 additions & 0 deletions covid-mts/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,15 @@ output_scoped="$outputs_dir/outputs$suffix"
mkdir -p "$output_scoped"

# Fall back to plain bash when no instrumented shell is injected by the
# dynamic-analysis harness (run_dynamic.py sets BENCHMARK_SHELL).
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
# Labels read out of the process environment by the logging wrapper.
export BENCHMARK_CATEGORY="covid-mts"
export BENCHMARK_INPUT_FILE="$(realpath "$input_file")"

# BENCHMARK_SCRIPT is re-exported before every stage so each log record
# is attributed to the script that produced it.
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/1.sh")"
$BENCHMARK_SHELL "$scripts_dir/1.sh" "$input_file" > "$output_scoped/1.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/2.sh")"
$BENCHMARK_SHELL "$scripts_dir/2.sh" "$input_file" > "$output_scoped/2.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/3.sh")"
$BENCHMARK_SHELL "$scripts_dir/3.sh" "$input_file" > "$output_scoped/3.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/4.sh")"
$BENCHMARK_SHELL "$scripts_dir/4.sh" "$input_file" > "$output_scoped/4.out"

2 changes: 2 additions & 0 deletions file-enc/deps.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

# Refresh the package index before installing dependencies.
sudo apt-get update

# Package list consumed by the install step further down this script
# (not shown in this fragment).
pkgs='ffmpeg unrtf imagemagick libarchive-tools libncurses5-dev libncursesw5-dev zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump'
Expand Down
9 changes: 7 additions & 2 deletions file-enc/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ if [[ "$1" == "--small" ]]; then
suffix=".small"
fi

# Labels read out of the process environment by the dynamic-analysis
# logging wrapper.
export BENCHMARK_CATEGORY="file-enc"
export BENCHMARK_INPUT_FILE="$(realpath "$input_pcaps")"
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}

# Each stage re-exports BENCHMARK_SCRIPT so log records are attributed to
# the script being run. Paths are quoted to survive whitespace.
# NOTE(review): the flattened diff had left the older unquoted invocations
# in place as well, which would have run each stage twice.
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/compress_files.sh")"
$BENCHMARK_SHELL "$scripts_dir/compress_files.sh" "$input_pcaps" "$results_dir/compress_files$suffix"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/encrypt_files.sh")"
$BENCHMARK_SHELL "$scripts_dir/encrypt_files.sh" "$input_pcaps" "$results_dir/encrypt_files$suffix"
10 changes: 7 additions & 3 deletions infrastructure/Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# Every artifact produced by the analysis pipeline; dynamic_analysis.csv
# additionally requires the collected process logs.
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/scripts_to_benchmark.csv target/dynamic_analysis.csv

static: $(STATIC_OUTPUTS)

# Summarize the dynamic process logs; depends on the witness file created
# once collect_dynamic_logs has run.
target/dynamic_analysis.csv: dynamic_analysis.py target/collect_dynamic_logs.touch
	python3 $< | sort > $@

# Run scripts_to_benchmark.py and sort its output for a stable file.
target/scripts_to_benchmark.csv: scripts_to_benchmark.py
	python3 $< | sort > $@

Expand All @@ -17,6 +20,7 @@ static-test: tests/test_syntax_analysis.py
# Remove every generated analysis artifact.
clean-static:
	rm -f $(STATIC_OUTPUTS)

# Witness file marking that the dynamic process logs have been collected.
# NOTE(review): the stale recipe-less `dynamic:` target from the old diff
# side is dropped here.
target/collect_dynamic_logs.touch:
	python3 collect_dynamic_logs.py

# Only the current phony targets; the removed `dynamic` target is gone.
.PHONY: static clean-static static-test
4 changes: 4 additions & 0 deletions infrastructure/all_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,7 @@ def get_all_scripts(
]
for benchmark_name, benchmark_data in benchmark_data.items()
}

if __name__ == "__main__":
    # Print every benchmark name, one per line (dict iteration yields keys).
    for benchmark_name in get_all_scripts():
        print(benchmark_name)
19 changes: 19 additions & 0 deletions infrastructure/collect_dynamic_logs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash

# Collect dynamic-analysis process logs for every benchmark: install
# dependencies, fetch inputs, then run each benchmark via run_dynamic.py.
# Finally create the witness file the Makefile checks for.
# Fix: quote every path expansion so repo paths with whitespace work.

REPO_TOP="$(git rev-parse --show-toplevel)"

benches=$(python3 "$REPO_TOP/infrastructure/all_scripts.py" | sort)

# Phase 1: dependencies first, so later phases run uninterrupted.
for bench in $benches; do
    bash "$REPO_TOP/$bench/deps.sh"
done

# Phase 2: fetch/generate each benchmark's inputs.
for bench in $benches; do
    bash "$REPO_TOP/$bench/input.sh"
done

# Phase 3: run each benchmark under the logging wrapper.
for bench in $benches; do
    python3 "$REPO_TOP/infrastructure/run_dynamic.py" "$bench"
done

# Ensure the target directory exists before dropping the witness file.
mkdir -p "$REPO_TOP/infrastructure/target"
touch "$REPO_TOP/infrastructure/target/collect_dynamic_logs.touch"
110 changes: 110 additions & 0 deletions infrastructure/dynamic_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env python3

import lzma
from collections import defaultdict
from pathlib import Path
import json
import math

from project_root import get_project_root

def correct_base(path):
    """Return True when *path* lies inside the /benchmarks tree."""
    candidate = Path(path)
    return candidate.is_relative_to('/benchmarks')

def rebase(path):
    """Strip the leading /benchmarks component from *path*."""
    base = Path('/benchmarks')
    return Path(path).relative_to(base)

def readings_dict(readings):
    """Index a list of log records by their capture timestamp."""
    indexed = {}
    for reading in readings:
        indexed[reading['log_current_time']] = reading
    return indexed

def is_shell(cmd):
    """Return True when *cmd* (an argv list, possibly None) is a bash process.

    Used to separate shell-interpreter time from time spent in the
    commands the shell spawns. A process that has exited may report no
    cmdline at all, hence the truthiness guard.
    """
    # Removed a stale commented-out allowlist implementation; the simple
    # substring test on argv[0] is what is actually in force.
    return bool(cmd) and 'bash' in cmd[0]

def sum_counters(pid, at_time, processes, children, should_include, which_counter):
    """Sum a per-process counter dict over *pid* and all its descendants.

    Only processes with a reading at *at_time* that satisfy
    *should_include* contribute; *which_counter* picks the dict of numeric
    fields to accumulate. Returns a defaultdict(int) of field totals.
    """
    totals = defaultdict(int)
    pending = [pid]
    while pending:
        current = pending.pop()
        readings = processes[current]
        if at_time in readings:
            record = readings[at_time]
            if should_include(record):
                for field, value in which_counter(record).items():
                    totals[field] += value
        pending.extend(children[current])
    return totals

def input_files(pid, at_time, processes, children):
    """Collect the benchmark input paths touched by *pid* and descendants.

    The recorded BENCHMARK_INPUT_FILE of every sampled process is always
    included; an open file counts when it was opened for reading (not
    'w'), is not itself a .sh script, and lives under /benchmarks.
    """
    found = set()
    pending = [pid]
    while pending:
        current = pending.pop()
        readings = processes[current]
        if at_time in readings:
            record = readings[at_time]
            found.add(record['benchmark_input_file'])
            for file_path, _, _, mode, _ in record['full']['open_files']:
                assert mode in ('r', 'r+', 'w'), f"unknown mode {mode}"
                opened_for_read = mode != 'w'
                looks_like_script = '.sh' in Path(file_path).suffixes
                under_benchmarks = Path(file_path).is_relative_to('/benchmarks')
                if opened_for_read and not looks_like_script and under_benchmarks:
                    found.add(file_path)
        pending.extend(children[current])
    # Processes without a BENCHMARK_INPUT_FILE contribute None; drop it.
    found.discard(None)
    return found

def read_log_file(path):
    """Parse one xz-compressed JSONL process log.

    Returns (processes, parents, children): processes maps
    pid -> {log timestamp -> reading} (None for unknown pids), parents
    maps pid -> parent pid (None when unknown), children maps
    pid -> set of child pids.
    """
    parents = defaultdict(lambda: None)
    children = defaultdict(set)
    raw_readings = defaultdict(list)
    with lzma.open(path, 'r') as log:
        for line in log:
            reading = json.loads(line)
            pid = reading['pid']
            parent = reading['parent']
            raw_readings[pid].append(reading)
            children[parent].add(pid)
            parents[pid] = parent
    processes = defaultdict(
        lambda: None,
        {pid: readings_dict(readings) for pid, readings in raw_readings.items()},
    )
    return processes, parents, children

def print_statistics(pid, processes, parents, children):
    """Print one CSV row summarizing the process tree rooted at *pid*.

    Columns: benchmark script (rebased under /benchmarks), children user
    time, children system time, peak tree-wide USS, chars read, chars
    written, shell-only user time, shell-only system time, and the
    ';'-joined set of input files observed over the whole run.
    """
    rs = processes[pid]

    # Peak memory: at each sampling instant sum USS over the whole tree,
    # then take the maximum instant.
    max_uss = max(
        sum_counters(pid, log_time, processes, children,
            lambda record: True,
            lambda record: record['pfullmem'],
        )
        ['uss']
        for log_time in rs
    )

    # Union of input files across every sampling instant (files may only
    # be open during part of the run).
    all_input_files = set()
    for log_time in rs:
        all_input_files |= input_files(pid, log_time, processes, children)
    all_input_files = ";".join(str(rebase(p)) for p in all_input_files)

    # Timestamps are ISO-8601 strings (see run_dynamic_shell), so max()
    # picks the last reading, which carries the cumulative counters.
    max_reading = max(rs)
    # CPU time attributable to shell processes only (interpreter overhead).
    tis = sum_counters(pid, max_reading, processes, children,
        lambda record: is_shell(record['cmdline']),
        lambda record: record['cpu_times'],
    )
    user = rs[max_reading]['cpu_times']['children_user']
    system = rs[max_reading]['cpu_times']['children_system']
    read_chars = rs[max_reading]['io_counters']['read_chars']
    write_chars = rs[max_reading]['io_counters']['write_chars']
    benchmark_script = rs[max_reading]['benchmark_script']
    benchmark_script = None if benchmark_script is None else rebase(benchmark_script)
    print(benchmark_script, user, system, max_uss, read_chars, write_chars, tis['user'], tis['system'], all_input_files, sep=',')

if __name__ == '__main__':
    # Emit one CSV row per top-level benchmark invocation found in every
    # compressed process log under infrastructure/target/process-logs.
    log_dir = get_project_root() / 'infrastructure' / 'target' / 'process-logs'
    for log_path in log_dir.glob('*.jsonl.xz'):
        processes, parents, children = read_log_file(log_path)
        # A root is a pid whose grandparent was never observed in the log
        # (parents is a defaultdict returning None for unknown pids).
        roots = [pid for pid in processes if parents[parents[pid]] is None]
        for root in roots:
            print_statistics(root, processes, parents, children)
51 changes: 51 additions & 0 deletions infrastructure/run_dynamic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python3

import lzma
import tempfile
import argparse
from pathlib import Path
from typing import Optional
import json
from subprocess import check_output, run
from collections import Counter
import os
from datetime import datetime, timezone


from all_scripts import get_all_scripts
from syntax_analysis import parse_shell_script, count_nodes
from project_root import get_project_root

def get_parser():
    """Build the CLI parser: a single positional benchmark name."""
    cli = argparse.ArgumentParser(prog='run_dynamic', description='runs the dynamic analysis')
    cli.add_argument('bench', type=str)
    return cli

def get_environment(root: Path, start_time: str, bench: str, data_log: str):
    """Return a copy of the current environment extended with the
    variables the instrumented benchmark run expects."""
    infra = root / 'infrastructure'
    env = dict(os.environ)
    # The benchmark run.sh invokes $BENCHMARK_SHELL, i.e. the wrapper.
    env['BENCHMARK_SHELL'] = str(infra / 'run_dynamic_shell.py')
    env['BENCHMARK_EXPERIMENT_START'] = start_time
    env['BENCHMARK_PROCESS_LOG'] = data_log
    mortem = infra / 'target' / 'process-logs' / f'{start_time}-{bench}.mortem'
    env['BENCHMARK_MORTEM_LOG'] = str(mortem)
    return env

if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()
    root = get_project_root()
    bench = args.bench
    start_time = datetime.now(timezone.utc).isoformat()
    with tempfile.NamedTemporaryFile() as data_log_file:
        data_log = data_log_file.name
        env = get_environment(root=root, start_time=start_time, bench=bench, data_log=data_log)
        # Write to an uncompressed temp file while the benchmark runs
        # because it is faster; compress afterwards.
        # NOTE(review): run.sh exit status is deliberately left unchecked
        # (a failing benchmark still produces a log) — confirm intent.
        run([root / bench / 'run.sh'], env=env)
        compressed_data_log = root / 'infrastructure' / 'target' / 'process-logs' / f'{start_time}-{bench}.jsonl.xz'
        compressed_data_log.parent.mkdir(parents=True, exist_ok=True)
        # xz emits binary, so the destination must be opened in binary
        # mode; check=True so a failed compression is not silently lost.
        with compressed_data_log.open('wb') as stdout:
            run(['xz', '-6e', '-T0', '-c', data_log], stdout=stdout, check=True)
101 changes: 101 additions & 0 deletions infrastructure/run_dynamic_shell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python3

import os
from datetime import datetime, timezone
from itertools import chain
import time
import psutil
import signal
from pathlib import Path
from typing import Optional
import json
from subprocess import run
import sys
import asyncio

from all_scripts import get_all_scripts
from syntax_analysis import parse_shell_script, count_nodes
from project_root import get_project_root

def data_json(p: psutil.Process, log_current_time: str, benchmark_experiment_start: str) -> str:
    """Serialize one sampling of process *p* as a JSON object (one JSONL line).

    The record carries pid/parent linkage (used later to rebuild the
    process tree), the BENCHMARK_* labels read from the process
    environment, CPU/memory/IO counters, and the raw psutil as_dict()
    snapshot under 'full' for everything not named explicitly.
    """
    parent_pid = None
    try:
        parent_pid = p.parent().pid
    except AttributeError:
        # p.parent() can return None (no known parent); .pid then raises
        # AttributeError and the parent is recorded as None.
        pass
    p = p.as_dict()
    times = p['cpu_times']
    mem = p['memory_full_info']
    io_counters = p['io_counters']
    return json.dumps({
        'pid': p['pid'],
        'parent': parent_pid,
        # Labels exported by each benchmark's run.sh.
        'benchmark_category': p['environ'].get('BENCHMARK_CATEGORY'),
        'benchmark_script': p['environ'].get('BENCHMARK_SCRIPT'),
        'benchmark_input_file': p['environ'].get('BENCHMARK_INPUT_FILE'),
        'benchmark_experiment_start': benchmark_experiment_start,
        'log_current_time': log_current_time,
        'cwd': p['cwd'],
        'cmdline': p['cmdline'],
        'create_time': p['create_time'],
        'cpu_times': {
            'user': times.user,
            'system': times.system,
            'children_user': times.children_user,
            'children_system': times.children_system,
            'iowait': times.iowait,
        },
        'pfullmem': {
            'rss': mem.rss, 'vms': mem.vms, 'shared': mem.shared, 'text': mem.text, 'lib': mem.lib, 'data': mem.data, 'dirty': mem.dirty, 'uss': mem.uss, 'pss': mem.pss, 'swap': mem.swap,
        },
        'io_counters': {
            'read_count': io_counters.read_count, 'write_count': io_counters.write_count, 'read_bytes': io_counters.read_bytes, 'write_bytes': io_counters.write_bytes, 'read_chars': io_counters.read_chars, 'write_chars': io_counters.write_chars,
        },
        'num_fds': p['num_fds'],
        'full': p, # this does not provide field names for cpu_times and io_counters, etc.
    })

def write_process_data(parent: int, data_log, benchmark_experiment_start):
    """Append one JSONL snapshot line for *parent* and every descendant."""
    sample_time = datetime.now(timezone.utc).isoformat()
    root = psutil.Process(parent)
    members = list(root.children(recursive=True))
    members.append(root)
    for proc in members:
        try:
            print(data_json(proc, sample_time, benchmark_experiment_start), file=data_log)
        except psutil.NoSuchProcess:
            # The process exited between enumeration and sampling.
            pass

async def collect_process_data(parent: int, data_log, benchmark_experiment_start):
    """Sample the whole process tree every 50 ms until cancelled.

    Runs as a background task next to the benchmark. Cancellation
    (asyncio.CancelledError is a BaseException) passes through the
    ``except Exception`` below and ends the loop.
    """
    try:
        write_process_data(parent, data_log, benchmark_experiment_start)
        while True:
            await asyncio.sleep(0.05)
            write_process_data(parent, data_log, benchmark_experiment_start)
    except Exception as e:
        # Best-effort: a sampling failure must not kill the benchmark.
        # Report on stderr so benchmark stdout is not polluted.
        print(e, type(e), file=sys.stderr)

async def run_and_collect(program, data_log: Path, mortem_log: Path, benchmark_experiment_start: Path):
    """Run *program*, sampling its process tree into *data_log*, then
    append a '<start>,<pid>,<wall seconds>' line to *mortem_log*.
    """
    start_time = time.perf_counter()
    process = await asyncio.create_subprocess_exec(*program)
    pid = process.pid
    with data_log.open('a') as log_file:
        sampler = asyncio.create_task(collect_process_data(pid, log_file, benchmark_experiment_start))
        await process.wait()
        end_time = time.perf_counter()
        # Cancel (and wait out) the sampler while the log file is still
        # open; previously cancellation happened after the `with` closed
        # the file, so a final sample could race against the closed file.
        sampler.cancel()
        try:
            await sampler
        except asyncio.CancelledError:
            pass
    with mortem_log.open('a') as mortem:
        print(benchmark_experiment_start, pid, end_time - start_time, sep=',', file=mortem)

async def main():
    """Entry point: wrap the given command line in process-tree sampling.

    Invoked by the benchmark run.sh scripts as $BENCHMARK_SHELL, with the
    real command as argv.
    """
    program = sys.argv[1:]
    # Required configuration set by run_dynamic.py; use indexing rather
    # than .get() so a missing variable fails fast with a clear KeyError
    # instead of a confusing Path(None) TypeError. The unused
    # BENCHMARK_CATEGORY lookup was dropped (it is sampled from the child
    # processes' environments, not here).
    data_log = Path(os.environ['BENCHMARK_PROCESS_LOG'])
    mortem_log = Path(os.environ['BENCHMARK_MORTEM_LOG'])
    benchmark_experiment_start = os.environ['BENCHMARK_EXPERIMENT_START']
    await run_and_collect(
        program=program,
        data_log=data_log,
        mortem_log=mortem_log,
        benchmark_experiment_start=benchmark_experiment_start,
    )

asyncio.run(main())
Empty file.
Loading

0 comments on commit d8ef9d4

Please sign in to comment.