Skip to content

Commit

Permalink
dynamic analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
EtomicBomb committed Jan 2, 2025
1 parent 9f2e4f7 commit d8ef9d4
Show file tree
Hide file tree
Showing 37 changed files with 447 additions and 10 deletions.
6 changes: 6 additions & 0 deletions covid-mts/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,15 @@ output_scoped="$outputs_dir/outputs$suffix"
mkdir -p "$output_scoped"

# Fall back to plain bash when no instrumented shell is injected by the
# dynamic-analysis harness (run_dynamic.py sets BENCHMARK_SHELL).
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
# Labels read out of the process environment by the logging wrapper.
export BENCHMARK_CATEGORY="covid-mts"
export BENCHMARK_INPUT_FILE="$(realpath "$input_file")"

# BENCHMARK_SCRIPT is re-exported before every stage so each log record
# is attributed to the script that produced it.
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/1.sh")"
$BENCHMARK_SHELL "$scripts_dir/1.sh" "$input_file" > "$output_scoped/1.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/2.sh")"
$BENCHMARK_SHELL "$scripts_dir/2.sh" "$input_file" > "$output_scoped/2.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/3.sh")"
$BENCHMARK_SHELL "$scripts_dir/3.sh" "$input_file" > "$output_scoped/3.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/4.sh")"
$BENCHMARK_SHELL "$scripts_dir/4.sh" "$input_file" > "$output_scoped/4.out"

2 changes: 2 additions & 0 deletions file-enc/deps.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

# Refresh the package index before installing dependencies.
sudo apt-get update

# Package list consumed by the install step further down this script
# (not shown in this fragment).
pkgs='ffmpeg unrtf imagemagick libarchive-tools libncurses5-dev libncursesw5-dev zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump'
Expand Down
9 changes: 7 additions & 2 deletions file-enc/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ if [[ "$1" == "--small" ]]; then
suffix=".small"
fi

# Labels read out of the process environment by the dynamic-analysis
# logging wrapper.
export BENCHMARK_CATEGORY="file-enc"
export BENCHMARK_INPUT_FILE="$(realpath "$input_pcaps")"
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}

# Each stage re-exports BENCHMARK_SCRIPT so log records are attributed to
# the script being run. Paths are quoted to survive whitespace.
# NOTE(review): the flattened diff had left the older unquoted invocations
# in place as well, which would have run each stage twice.
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/compress_files.sh")"
$BENCHMARK_SHELL "$scripts_dir/compress_files.sh" "$input_pcaps" "$results_dir/compress_files$suffix"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/encrypt_files.sh")"
$BENCHMARK_SHELL "$scripts_dir/encrypt_files.sh" "$input_pcaps" "$results_dir/encrypt_files$suffix"
10 changes: 7 additions & 3 deletions infrastructure/Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# Every artifact produced by the analysis pipeline; dynamic_analysis.csv
# additionally requires the collected process logs.
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/scripts_to_benchmark.csv target/dynamic_analysis.csv

static: $(STATIC_OUTPUTS)

# Summarize the dynamic process logs; depends on the witness file created
# once collect_dynamic_logs has run.
target/dynamic_analysis.csv: dynamic_analysis.py target/collect_dynamic_logs.touch
	python3 $< | sort > $@

# Run scripts_to_benchmark.py and sort its output for a stable file.
target/scripts_to_benchmark.csv: scripts_to_benchmark.py
	python3 $< | sort > $@

Expand All @@ -17,6 +20,7 @@ static-test: tests/test_syntax_analysis.py
# Remove every generated analysis artifact.
clean-static:
	rm -f $(STATIC_OUTPUTS)

# Witness file marking that the dynamic process logs have been collected.
# NOTE(review): the stale recipe-less `dynamic:` target from the old diff
# side is dropped here.
target/collect_dynamic_logs.touch:
	python3 collect_dynamic_logs.py

# Only the current phony targets; the removed `dynamic` target is gone.
.PHONY: static clean-static static-test
4 changes: 4 additions & 0 deletions infrastructure/all_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,7 @@ def get_all_scripts(
]
for benchmark_name, benchmark_data in benchmark_data.items()
}

if __name__ == "__main__":
    # Print every benchmark name, one per line (dict iteration yields keys).
    for benchmark_name in get_all_scripts():
        print(benchmark_name)
19 changes: 19 additions & 0 deletions infrastructure/collect_dynamic_logs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash

# Collect dynamic-analysis process logs for every benchmark: install
# dependencies, fetch inputs, then run each benchmark via run_dynamic.py.
# Finally create the witness file the Makefile checks for.
# Fix: quote every path expansion so repo paths with whitespace work.

REPO_TOP="$(git rev-parse --show-toplevel)"

benches=$(python3 "$REPO_TOP/infrastructure/all_scripts.py" | sort)

# Phase 1: dependencies first, so later phases run uninterrupted.
for bench in $benches; do
    bash "$REPO_TOP/$bench/deps.sh"
done

# Phase 2: fetch/generate each benchmark's inputs.
for bench in $benches; do
    bash "$REPO_TOP/$bench/input.sh"
done

# Phase 3: run each benchmark under the logging wrapper.
for bench in $benches; do
    python3 "$REPO_TOP/infrastructure/run_dynamic.py" "$bench"
done

# Ensure the target directory exists before dropping the witness file.
mkdir -p "$REPO_TOP/infrastructure/target"
touch "$REPO_TOP/infrastructure/target/collect_dynamic_logs.touch"
110 changes: 110 additions & 0 deletions infrastructure/dynamic_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env python3

import lzma
from collections import defaultdict
from pathlib import Path
import json
import math

from project_root import get_project_root

def correct_base(path):
    """Return True when *path* lies inside the /benchmarks tree."""
    candidate = Path(path)
    return candidate.is_relative_to('/benchmarks')

def rebase(path):
    """Strip the leading /benchmarks component from *path*."""
    base = Path('/benchmarks')
    return Path(path).relative_to(base)

def readings_dict(readings):
    """Index a list of log records by their capture timestamp."""
    indexed = {}
    for reading in readings:
        indexed[reading['log_current_time']] = reading
    return indexed

def is_shell(cmd):
    """Return True when *cmd* (an argv list, possibly None) is a bash process.

    Used to separate shell-interpreter time from time spent in the
    commands the shell spawns. A process that has exited may report no
    cmdline at all, hence the truthiness guard.
    """
    # Removed a stale commented-out allowlist implementation; the simple
    # substring test on argv[0] is what is actually in force.
    return bool(cmd) and 'bash' in cmd[0]

def sum_counters(pid, at_time, processes, children, should_include, which_counter):
    """Sum a per-process counter dict over *pid* and all its descendants.

    Only processes with a reading at *at_time* that satisfy
    *should_include* contribute; *which_counter* picks the dict of numeric
    fields to accumulate. Returns a defaultdict(int) of field totals.
    """
    totals = defaultdict(int)
    pending = [pid]
    while pending:
        current = pending.pop()
        readings = processes[current]
        if at_time in readings:
            record = readings[at_time]
            if should_include(record):
                for field, value in which_counter(record).items():
                    totals[field] += value
        pending.extend(children[current])
    return totals

def input_files(pid, at_time, processes, children):
    """Collect the benchmark input paths touched by *pid* and descendants.

    The recorded BENCHMARK_INPUT_FILE of every sampled process is always
    included; an open file counts when it was opened for reading (not
    'w'), is not itself a .sh script, and lives under /benchmarks.
    """
    found = set()
    pending = [pid]
    while pending:
        current = pending.pop()
        readings = processes[current]
        if at_time in readings:
            record = readings[at_time]
            found.add(record['benchmark_input_file'])
            for file_path, _, _, mode, _ in record['full']['open_files']:
                assert mode in ('r', 'r+', 'w'), f"unknown mode {mode}"
                opened_for_read = mode != 'w'
                looks_like_script = '.sh' in Path(file_path).suffixes
                under_benchmarks = Path(file_path).is_relative_to('/benchmarks')
                if opened_for_read and not looks_like_script and under_benchmarks:
                    found.add(file_path)
        pending.extend(children[current])
    # Processes without a BENCHMARK_INPUT_FILE contribute None; drop it.
    found.discard(None)
    return found

def read_log_file(path):
    """Parse one xz-compressed JSONL process log.

    Returns (processes, parents, children): processes maps
    pid -> {log timestamp -> reading} (None for unknown pids), parents
    maps pid -> parent pid (None when unknown), children maps
    pid -> set of child pids.
    """
    parents = defaultdict(lambda: None)
    children = defaultdict(set)
    raw_readings = defaultdict(list)
    with lzma.open(path, 'r') as log:
        for line in log:
            reading = json.loads(line)
            pid = reading['pid']
            parent = reading['parent']
            raw_readings[pid].append(reading)
            children[parent].add(pid)
            parents[pid] = parent
    processes = defaultdict(
        lambda: None,
        {pid: readings_dict(readings) for pid, readings in raw_readings.items()},
    )
    return processes, parents, children

def print_statistics(pid, processes, parents, children):
    """Print one CSV row summarizing the process tree rooted at *pid*.

    Columns: benchmark script (rebased under /benchmarks), children user
    time, children system time, peak tree-wide USS, chars read, chars
    written, shell-only user time, shell-only system time, and the
    ';'-joined set of input files observed over the whole run.
    """
    rs = processes[pid]

    # Peak memory: at each sampling instant sum USS over the whole tree,
    # then take the maximum instant.
    max_uss = max(
        sum_counters(pid, log_time, processes, children,
            lambda record: True,
            lambda record: record['pfullmem'],
        )
        ['uss']
        for log_time in rs
    )

    # Union of input files across every sampling instant (files may only
    # be open during part of the run).
    all_input_files = set()
    for log_time in rs:
        all_input_files |= input_files(pid, log_time, processes, children)
    all_input_files = ";".join(str(rebase(p)) for p in all_input_files)

    # Timestamps are ISO-8601 strings (see run_dynamic_shell), so max()
    # picks the last reading, which carries the cumulative counters.
    max_reading = max(rs)
    # CPU time attributable to shell processes only (interpreter overhead).
    tis = sum_counters(pid, max_reading, processes, children,
        lambda record: is_shell(record['cmdline']),
        lambda record: record['cpu_times'],
    )
    user = rs[max_reading]['cpu_times']['children_user']
    system = rs[max_reading]['cpu_times']['children_system']
    read_chars = rs[max_reading]['io_counters']['read_chars']
    write_chars = rs[max_reading]['io_counters']['write_chars']
    benchmark_script = rs[max_reading]['benchmark_script']
    benchmark_script = None if benchmark_script is None else rebase(benchmark_script)
    print(benchmark_script, user, system, max_uss, read_chars, write_chars, tis['user'], tis['system'], all_input_files, sep=',')

if __name__ == '__main__':
    # Emit one CSV row per top-level benchmark invocation found in every
    # compressed process log under infrastructure/target/process-logs.
    log_dir = get_project_root() / 'infrastructure' / 'target' / 'process-logs'
    for log_path in log_dir.glob('*.jsonl.xz'):
        processes, parents, children = read_log_file(log_path)
        # A root is a pid whose grandparent was never observed in the log
        # (parents is a defaultdict returning None for unknown pids).
        roots = [pid for pid in processes if parents[parents[pid]] is None]
        for root in roots:
            print_statistics(root, processes, parents, children)
51 changes: 51 additions & 0 deletions infrastructure/run_dynamic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python3

import lzma
import tempfile
import argparse
from pathlib import Path
from typing import Optional
import json
from subprocess import check_output, run
from collections import Counter
import os
from datetime import datetime, timezone


from all_scripts import get_all_scripts
from syntax_analysis import parse_shell_script, count_nodes
from project_root import get_project_root

def get_parser():
    """Build the CLI parser: a single positional benchmark name."""
    cli = argparse.ArgumentParser(prog='run_dynamic', description='runs the dynamic analysis')
    cli.add_argument('bench', type=str)
    return cli

def get_environment(root: Path, start_time: str, bench: str, data_log: str):
    """Return a copy of the current environment extended with the
    variables the instrumented benchmark run expects."""
    infra = root / 'infrastructure'
    env = dict(os.environ)
    # The benchmark run.sh invokes $BENCHMARK_SHELL, i.e. the wrapper.
    env['BENCHMARK_SHELL'] = str(infra / 'run_dynamic_shell.py')
    env['BENCHMARK_EXPERIMENT_START'] = start_time
    env['BENCHMARK_PROCESS_LOG'] = data_log
    mortem = infra / 'target' / 'process-logs' / f'{start_time}-{bench}.mortem'
    env['BENCHMARK_MORTEM_LOG'] = str(mortem)
    return env

if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()
    root = get_project_root()
    bench = args.bench
    start_time = datetime.now(timezone.utc).isoformat()
    with tempfile.NamedTemporaryFile() as data_log_file:
        data_log = data_log_file.name
        env = get_environment(root=root, start_time=start_time, bench=bench, data_log=data_log)
        # Write to an uncompressed temp file while the benchmark runs
        # because it is faster; compress afterwards.
        # NOTE(review): run.sh exit status is deliberately left unchecked
        # (a failing benchmark still produces a log) — confirm intent.
        run([root / bench / 'run.sh'], env=env)
        compressed_data_log = root / 'infrastructure' / 'target' / 'process-logs' / f'{start_time}-{bench}.jsonl.xz'
        compressed_data_log.parent.mkdir(parents=True, exist_ok=True)
        # xz emits binary, so the destination must be opened in binary
        # mode; check=True so a failed compression is not silently lost.
        with compressed_data_log.open('wb') as stdout:
            run(['xz', '-6e', '-T0', '-c', data_log], stdout=stdout, check=True)
101 changes: 101 additions & 0 deletions infrastructure/run_dynamic_shell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python3

import os
from datetime import datetime, timezone
from itertools import chain
import time
import psutil
import signal
from pathlib import Path
from typing import Optional
import json
from subprocess import run
import sys
import asyncio

from all_scripts import get_all_scripts
from syntax_analysis import parse_shell_script, count_nodes
from project_root import get_project_root

def data_json(p: psutil.Process, log_current_time: str, benchmark_experiment_start: str) -> str:
    """Serialize one sampling of process *p* as a JSON object (one JSONL line).

    The record carries pid/parent linkage (used later to rebuild the
    process tree), the BENCHMARK_* labels read from the process
    environment, CPU/memory/IO counters, and the raw psutil as_dict()
    snapshot under 'full' for everything not named explicitly.
    """
    parent_pid = None
    try:
        parent_pid = p.parent().pid
    except AttributeError:
        # p.parent() can return None (no known parent); .pid then raises
        # AttributeError and the parent is recorded as None.
        pass
    p = p.as_dict()
    times = p['cpu_times']
    mem = p['memory_full_info']
    io_counters = p['io_counters']
    return json.dumps({
        'pid': p['pid'],
        'parent': parent_pid,
        # Labels exported by each benchmark's run.sh.
        'benchmark_category': p['environ'].get('BENCHMARK_CATEGORY'),
        'benchmark_script': p['environ'].get('BENCHMARK_SCRIPT'),
        'benchmark_input_file': p['environ'].get('BENCHMARK_INPUT_FILE'),
        'benchmark_experiment_start': benchmark_experiment_start,
        'log_current_time': log_current_time,
        'cwd': p['cwd'],
        'cmdline': p['cmdline'],
        'create_time': p['create_time'],
        'cpu_times': {
            'user': times.user,
            'system': times.system,
            'children_user': times.children_user,
            'children_system': times.children_system,
            'iowait': times.iowait,
        },
        'pfullmem': {
            'rss': mem.rss, 'vms': mem.vms, 'shared': mem.shared, 'text': mem.text, 'lib': mem.lib, 'data': mem.data, 'dirty': mem.dirty, 'uss': mem.uss, 'pss': mem.pss, 'swap': mem.swap,
        },
        'io_counters': {
            'read_count': io_counters.read_count, 'write_count': io_counters.write_count, 'read_bytes': io_counters.read_bytes, 'write_bytes': io_counters.write_bytes, 'read_chars': io_counters.read_chars, 'write_chars': io_counters.write_chars,
        },
        'num_fds': p['num_fds'],
        'full': p, # this does not provide field names for cpu_times and io_counters, etc.
    })

def write_process_data(parent: int, data_log, benchmark_experiment_start):
    """Append one JSONL snapshot line for *parent* and every descendant."""
    sample_time = datetime.now(timezone.utc).isoformat()
    root = psutil.Process(parent)
    members = list(root.children(recursive=True))
    members.append(root)
    for proc in members:
        try:
            print(data_json(proc, sample_time, benchmark_experiment_start), file=data_log)
        except psutil.NoSuchProcess:
            # The process exited between enumeration and sampling.
            pass

async def collect_process_data(parent: int, data_log, benchmark_experiment_start):
    """Sample the whole process tree every 50 ms until cancelled.

    Runs as a background task next to the benchmark. Cancellation
    (asyncio.CancelledError is a BaseException) passes through the
    ``except Exception`` below and ends the loop.
    """
    try:
        write_process_data(parent, data_log, benchmark_experiment_start)
        while True:
            await asyncio.sleep(0.05)
            write_process_data(parent, data_log, benchmark_experiment_start)
    except Exception as e:
        # Best-effort: a sampling failure must not kill the benchmark.
        # Report on stderr so benchmark stdout is not polluted.
        print(e, type(e), file=sys.stderr)

async def run_and_collect(program, data_log: Path, mortem_log: Path, benchmark_experiment_start: Path):
    """Run *program*, sampling its process tree into *data_log*, then
    append a '<start>,<pid>,<wall seconds>' line to *mortem_log*.
    """
    start_time = time.perf_counter()
    process = await asyncio.create_subprocess_exec(*program)
    pid = process.pid
    with data_log.open('a') as log_file:
        sampler = asyncio.create_task(collect_process_data(pid, log_file, benchmark_experiment_start))
        await process.wait()
        end_time = time.perf_counter()
        # Cancel (and wait out) the sampler while the log file is still
        # open; previously cancellation happened after the `with` closed
        # the file, so a final sample could race against the closed file.
        sampler.cancel()
        try:
            await sampler
        except asyncio.CancelledError:
            pass
    with mortem_log.open('a') as mortem:
        print(benchmark_experiment_start, pid, end_time - start_time, sep=',', file=mortem)

async def main():
    """Entry point: wrap the given command line in process-tree sampling.

    Invoked by the benchmark run.sh scripts as $BENCHMARK_SHELL, with the
    real command as argv.
    """
    program = sys.argv[1:]
    # Required configuration set by run_dynamic.py; use indexing rather
    # than .get() so a missing variable fails fast with a clear KeyError
    # instead of a confusing Path(None) TypeError. The unused
    # BENCHMARK_CATEGORY lookup was dropped (it is sampled from the child
    # processes' environments, not here).
    data_log = Path(os.environ['BENCHMARK_PROCESS_LOG'])
    mortem_log = Path(os.environ['BENCHMARK_MORTEM_LOG'])
    benchmark_experiment_start = os.environ['BENCHMARK_EXPERIMENT_START']
    await run_and_collect(
        program=program,
        data_log=data_log,
        mortem_log=mortem_log,
        benchmark_experiment_start=benchmark_experiment_start,
    )

asyncio.run(main())
Empty file.
Loading

0 comments on commit d8ef9d4

Please sign in to comment.