Skip to content

Commit

Permalink
dynamic analysis 2
Browse files Browse the repository at this point in the history
  • Loading branch information
EtomicBomb committed Jan 11, 2025
1 parent 4cf0a29 commit 5b1a88e
Show file tree
Hide file tree
Showing 268 changed files with 8,447 additions and 303 deletions.
5 changes: 5 additions & 0 deletions aurpkg/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ mkdir -p ${OUT}

script="./scripts/pacaur.sh"

BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
export BENCHMARK_CATEGORY="aurpkg"
export BENCHMARK_SCRIPT="$(realpath "$script")"
export BENCHMARK_INPUT_FILE="$(realpath "$IN")"

# Switch to user "user" to avoid permission issues

echo "$script"
Expand Down
4 changes: 3 additions & 1 deletion bio/deps.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/bin/bash
# install dependencies
required_version="1.7"

Expand Down Expand Up @@ -43,4 +44,5 @@ else
echo "Failed to install the correct version of Samtools."
exit 1
fi
fi
fi

2 changes: 2 additions & 0 deletions bio/input.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

IN=inputs
IN_NAME=input.txt

Expand Down
9 changes: 8 additions & 1 deletion bio/run.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

# create bam files with regions
################### 1KG SAMPLES
IN=inputs
Expand All @@ -8,6 +10,11 @@ if [[ "$@" == *"--small"* ]]; then
IN_NAME=input_small.txt
fi

export BENCHMARK_CATEGORY="bio"
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}

"$BENCHMARK_SHELL" ./scripts/bio.sh "$IN" "$IN_NAME" "$OUT"
script_file=./scripts/bio.sh
export BENCHMARK_SCRIPT="$(realpath "$script_file")"
export BENCHMARK_INPUT_FILE="$(realpath "$IN_NAME")"

$BENCHMARK_SHELL "$script_file" "$IN" "$IN_NAME" "$OUT"
6 changes: 6 additions & 0 deletions covid-mts/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,15 @@ output_scoped="$outputs_dir/outputs$suffix"
mkdir -p "$output_scoped"

BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
export BENCHMARK_CATEGORY="covid-mts"
export BENCHMARK_INPUT_FILE="$(realpath "$input_file")"

export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/1.sh")"
$BENCHMARK_SHELL "$scripts_dir/1.sh" "$input_file" > "$output_scoped/1.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/2.sh")"
$BENCHMARK_SHELL "$scripts_dir/2.sh" "$input_file" > "$output_scoped/2.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/3.sh")"
$BENCHMARK_SHELL "$scripts_dir/3.sh" "$input_file" > "$output_scoped/3.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/4.sh")"
$BENCHMARK_SHELL "$scripts_dir/4.sh" "$input_file" > "$output_scoped/4.out"

2 changes: 2 additions & 0 deletions file-enc/deps.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

sudo apt-get update

pkgs='ffmpeg unrtf imagemagick libarchive-tools libncurses5-dev libncursesw5-dev zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump'
Expand Down
9 changes: 7 additions & 2 deletions file-enc/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ if [[ "$1" == "--small" ]]; then
suffix=".small"
fi

export BENCHMARK_CATEGORY="file-enc"
export BENCHMARK_INPUT_FILE="$(realpath "$input_pcaps")"
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
$BENCHMARK_SHELL $scripts_dir/compress_files.sh $input_pcaps $results_dir/compress_files$suffix
$BENCHMARK_SHELL $scripts_dir/encrypt_files.sh $input_pcaps $results_dir/encrypt_files$suffix

export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/compress_files.sh")"
$BENCHMARK_SHELL "$scripts_dir/compress_files.sh" "$input_pcaps" "$results_dir/compress_files$suffix"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/encrypt_files.sh")"
$BENCHMARK_SHELL "$scripts_dir/encrypt_files.sh" "$input_pcaps" "$results_dir/encrypt_files$suffix"
10 changes: 4 additions & 6 deletions infrastructure/Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/scripts_to_benchmark.csv target/cyclomatic.csv target/shellmetrics.sh
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/cyclomatic.csv target/shellmetrics.sh target/dynamic_analysis.jsonl

static: $(STATIC_OUTPUTS)

target/scripts_to_benchmark.csv: scripts_to_benchmark.py
target/dynamic_analysis.jsonl: dynamic_analysis.py
python3 $< | sort > $@

target/lines_of_code.csv: count_lines_of_code.py
Expand All @@ -22,8 +22,6 @@ target/shellmetrics.sh:
chmod +x $@

target/cyclomatic.csv: get_cyclomatic.py target/shellmetrics.sh
python3 get_cyclomatic.py > $@
python3 get_cyclomatic.py | sort > $@

dynamic:

.PHONY: static dynamic clean-static static-test
.PHONY: static clean-static static-test
4 changes: 4 additions & 0 deletions infrastructure/all_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,7 @@ def get_all_scripts(
]
for benchmark_name, benchmark_data in benchmark_data.items()
}

if __name__ == "__main__":
for bench in get_all_scripts().keys():
print(bench)
198 changes: 198 additions & 0 deletions infrastructure/colossal_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
#!/usr/bin/env python3

import pandas as pd
import fnmatch
import viz_syntax as stx
import viz_dynamic as dyn

from all_scripts import get_all_scripts
from project_root import get_project_root

root = get_project_root()
data_path = root / 'infrastructure/target/dynamic_analysis.jsonl'
input_size_path = root / 'infrastructure/data/size_inputs.jsonl'
loc_data_path = root / 'infrastructure/target/lines_of_code.csv'

# Per-benchmark classification, keyed by benchmark name. Each value is a
# 3-tuple unpacked by short_category() as (domain, style, <third element>);
# the third element is ignored there and holds a LaTeX \cite{...} string (or
# '') for the populated rows -- presumably the benchmark's source citation.
# 'XXX' marks entries that have not been filled in yet.
benchmark_category_style = {
'bio': ('XXX', 'XXX', 'XXX'),
'vps-audit': ('XXX', 'XXX', 'XXX'),
'vps-audit-negate': ('XXX', 'XXX', 'XXX'),
'aurpkg': ('XXX', 'XXX', 'XXX'),
'makeself': ('XXX', 'XXX', 'XXX'),
'infrastructure/standards/100-files': ('XXX', 'XXX', 'XXX'),
'infrastructure/standards/read-write': ('XXX', 'XXX', 'XXX'),
'infrastructure/standards/shell-memory': ('XXX', 'XXX', 'XXX'),
'infrastructure/standards/sleep': ('XXX', 'XXX', 'XXX'),
'infrastructure/standards/time-in-shell-subprocess': ('XXX', 'XXX', 'XXX'),
'infrastructure/standards/user-time': ('XXX', 'XXX', 'XXX'),
'infrastructure/standards/user-time-in-shell': ('XXX', 'XXX', 'XXX'),
'infrastructure/standards/write-only': ('XXX', 'XXX', 'XXX'),
'covid-mts': ('Data analysis', 'Data extraction', '\\cite{covid-mts-source}'),
'file-enc': ('Cryptography', 'Automation', '\\cite{cito2020empirical}'),
'log-analysis': ('System admin.', 'Data extraction', '\\cite{spinellis2017extending, raghavan2020posh}'),
'max-temp': ('Data analysis', 'Data extraction', '\\cite{hadoop-guide-2009}'),
'media-conv': ('Misc.', 'Automation', '\\cite{spinellis2017extending, raghavan2020posh}'),
'nlp': ('Machine learning', 'Text processing', '\\cite{unix-for-poets-church}'),
'oneliners': ('Misc.', 'Text processing', '\\cite{bentley-pearl-cacm-1985, bentley-pearl-cacm-1986, unix-cacm-1974, wicked-cool-shell-scripts}'),
'riker': ('Development', 'Build scripts', ''),
'sklearn': ('Machine learning', 'Automation', ''),
'uniq-ips': ('System admin.', 'Automation', ''),
'unix50': ('Misc.', 'Text processing', '\\cite{bhandari2020solutions}'),
'web-index': ('Development', 'Text processing', '\\cite{pash2021}')
}

def short_category(benchmark):
    """Return a compact ``DOMAIN/STYLE`` tag made of word initials.

    Looks ``benchmark`` up in ``benchmark_category_style`` and abbreviates the
    domain and style labels to the upper-cased first character of each word,
    e.g. ``('System admin.', 'Data extraction', ...)`` becomes ``'SA/DE'``.

    Args:
        benchmark: key of ``benchmark_category_style``.

    Returns:
        The abbreviated domain and style joined by ``'/'``.

    Raises:
        KeyError: if ``benchmark`` has no entry in ``benchmark_category_style``.
    """
    dom, style, _ = benchmark_category_style[benchmark]

    def shorten(label):
        # split() without an argument skips runs of whitespace, so doubled,
        # leading or trailing spaces (or an empty label) never yield an empty
        # word whose [0] would raise IndexError.
        return ''.join(word[0].upper() for word in label.split())

    return shorten(dom) + '/' + shorten(style)

# Human-readable description of each benchmark's input, keyed by benchmark
# name; main() appends it to the formatted input size ("<size> of <text>").
# 'XXX' marks entries that have not been filled in yet.
#
# Every key must appear exactly once: in a dict literal a repeated key keeps
# only the last value, which previously replaced the real 'bio' and 'aurpkg'
# descriptions with 'XXX' placeholders.
benchmark_input_description = {
    'aurpkg': 'package files',
    'bio': 'biological data files',
    'covid-mts': 'transit data',
    'file-enc': 'pcap files',
    'log-analysis': 'log files',
    'max-temp': 'temperature data',
    'media-conv': 'media files',
    'nlp': 'text files',
    'oneliners': 'text files',
    'riker': 'source code files',
    'sklearn': 'CSV files',
    'uniq-ips': 'text files',
    'unix50': 'text files',
    'web-index': 'HTML files',
    'vps-audit': 'system status',
    'vps-audit-negate': 'system status',
    'makeself': 'XXX',
    'infrastructure/standards/100-files': 'XXX',
    'infrastructure/standards/read-write': 'XXX',
    'infrastructure/standards/shell-memory': 'XXX',
    'infrastructure/standards/sleep': 'XXX',
    'infrastructure/standards/time-in-shell-subprocess': 'XXX',
    'infrastructure/standards/user-time': 'XXX',
    'infrastructure/standards/user-time-in-shell': 'XXX',
    'infrastructure/standards/write-only': 'XXX',
}

# fnmatch patterns matched against each script's path in main(): only scripts
# matching at least one pattern get their own detail row under the benchmark.
# Benchmarks noted below as "just 1" have no pattern here.
scripts_to_include = [
'covid-mts/scripts/1.sh',
'file-enc/scripts/encrypt_files.sh',
'log-analysis/scripts/nginx.sh',
'media-conv/scripts/img_convert.sh',
'nlp/scripts/bigrams.sh',
'oneliners/*',
'unix50/scripts/1.sh',
'riker/scripts/redis/build.sh',
# max-temp is just 1
# sklearn is just 1
# aurpkg is just 1
# bio is just 1
# makeself??
'web-index/scripts/ngrams.sh',
# vps-audit is just 1
]


def count_unique_cmds(series):
    """Count the distinct entries of ``series`` that contain ``'command('``."""
    distinct_commands = set()
    for entry in series:
        if 'command(' in entry:
            distinct_commands.add(entry)
    return len(distinct_commands)

def count_constructs(series):
    """Count the distinct values found in ``series``."""
    unique_nodes = {*series}
    return len(unique_nodes)

def read_loc_data():
    """Load per-script line counts and roll them up per benchmark.

    Reads the header-less CSV at ``loc_data_path`` (columns are named
    ``script`` and ``loc``) and tags every row with a ``benchmark`` column
    holding the first ``/``-separated component of the script path.

    Returns:
        A ``(per_script, per_benchmark)`` pair of DataFrames. ``per_benchmark``
        has the columns ``benchmark``, ``loc`` (sum over the benchmark's
        scripts) and ``number_of_scripts`` (row count).

    NOTE(review): with only the first path component used, scripts under
    ``infrastructure/standards/<name>/`` all get the benchmark name
    ``infrastructure``, which does not match the
    ``infrastructure/standards/<name>`` keys used by the tables in this file
    -- confirm whether that is intended.
    """
    per_script = pd.read_csv(loc_data_path, header=None)
    per_script.columns = ['script', 'loc']
    per_script['benchmark'] = per_script['script'].apply(
        lambda path: path.split('/')[0]
    )

    per_benchmark = (
        per_script
        .groupby('benchmark')
        .agg(loc=('loc', 'sum'), number_of_scripts=('script', 'count'))
        .reset_index()
    )
    return per_script, per_benchmark

def prettify_bytes_number(n):
    """Format a byte count as a LaTeX snippet with a binary unit.

    The unit is the first of B, KB, MB whose 1024-based upper bound exceeds
    ``n``, falling back to GB. The scaled value is printed with 2 decimals
    below 10, 1 decimal below 100 and none otherwise. The unit is wrapped in
    ``\\textcolor``: black for GB, gray for every other unit.

    Args:
        n: size in bytes.

    Returns:
        A string such as ``'1.50 \\textcolor{gray}{KB}'``.
    """
    base = 1024

    # Default to the largest unit; smaller units override it below.
    value, unit = n / (base * base * base), "GB"
    for exponent, label in enumerate(("B", "KB", "MB")):
        if n < base ** (exponent + 1):
            # Plain bytes are left unscaled, exactly as passed in.
            value = n if exponent == 0 else n / (base ** exponent)
            unit = label
            break

    decimals = 2 if value < 10 else (1 if value < 100 else 0)
    color = 'gray' if unit != 'GB' else 'black'
    return f"{value:.{decimals}f} \\textcolor{{{color}}}{{{unit}}}"

def main():
    """Print the combined benchmark/script summary as a LaTeX tabular on stdout.

    Joins the syntax data (``stx.read_data``), the dynamic data
    (``dyn.read_data``) and the line counts (``read_loc_data``) per benchmark
    and per script. It then prints one row per benchmark, followed by a detail
    row for each of its scripts that matches ``scripts_to_include``, and an
    ellipsis row when not every script of a multi-script benchmark was shown.

    The tabular spec declares 13 columns while each printed row fills 11
    cells, so the two "System" columns are left empty.
    """
    # stx.read_data is called once per flag value: the True result feeds the
    # construct counts, the False result feeds the unique-command counts.
    constructs_script, constructs_bench = stx.read_data(True)
    cmds_script, cmds_bench = stx.read_data(False)
    dyn_script, dyn_bench = dyn.read_data()
    loc_script, loc_bench = read_loc_data()

    for frame in (cmds_script, cmds_bench):
        frame['unique_cmds'] = frame['nodes'].apply(count_unique_cmds)
    for frame in (constructs_script, constructs_bench):
        frame['constructs'] = frame['nodes'].apply(count_constructs)

    # all_scripts = set(syntax_script['script'].unique())

    # missing_in_dyn = all_scripts - set(dyn_script['script'].unique())
    # missing_in_loc_data = all_scripts - set(loc_data_script['script'].unique())
    # missing_in_cmds = all_scripts - set(syntax_script_all_cmds['script'].unique())

    # print("Missing in dyn_script:", missing_in_dyn)
    # print("Missing in loc_data_script:", missing_in_loc_data)
    # print("Missing in syntax_script_all_cmds:", missing_in_cmds)

    # Raises KeyError for a benchmark without a description entry.
    dyn_bench['input_description'] = dyn_bench['benchmark'].apply(
        lambda name: benchmark_input_description[name]
    )

    # Default (inner) merges: only benchmarks/scripts present in every source
    # make it into the table.
    big_bench = constructs_bench.merge(dyn_bench, on='benchmark')
    big_bench = big_bench.merge(loc_bench, on='benchmark')
    big_bench = big_bench.merge(cmds_bench[['benchmark', 'unique_cmds']], on='benchmark')

    big_script = constructs_script.merge(dyn_script, on='script')
    big_script = big_script.merge(loc_script, on='script')
    big_script = big_script.merge(cmds_script[['script', 'unique_cmds']], on='script')

    print("""
\\def\\idw{5em}
\\begin{tabular}{l|lrr|rr|l|rrrr|lr}
\\toprule
\\multirow{2}{*}{Benchmark/Script} & \\multicolumn{3}{c|}{Surface} & \\multicolumn{2}{c|}{Syntax} & \\multicolumn{1}{c|}{Inputs} & \\multicolumn{4}{c|}{Dynamic} & \\multicolumn{2}{c}{System} \\\\
& Dom & \\#.sh & LOC & \\# Cons & \\# Cmd & & T.sh & T.cmd & Mem & I/O & \\# s/c & \\# fd \\\\
\\midrule
""")
    # Row layout: benchmark, short category, number of scripts, LOC, number of
    # constructs, number of unique commands, input description, time in shell,
    # time in commands, max memory, IO.
    for _, bench_row in big_bench.iterrows():
        bench_name = bench_row['benchmark']
        total_scripts = bench_row['number_of_scripts']
        shown_scripts = 0

        print("\\rule{0pt}{5ex}")
        inputs_text = (
            prettify_bytes_number(bench_row['input_size'])
            + ' of '
            + bench_row['input_description']
        )
        bench_cells = [
            f"\\textbf{{\\tt {bench_name}}}",
            f"{short_category(bench_name)}",
            f"{total_scripts}",
            f"{bench_row['loc']}",
            f"{bench_row['constructs']}",
            f"{bench_row['unique_cmds']}",
            f"\\multirow{{2}}{{*}}{{\\parbox{{\\idw}}{{{inputs_text}}}}}",
            f"{bench_row['time_in_shell']:.2f}",
            f"{bench_row['time_in_commands']:.2f}",
            f"{prettify_bytes_number(bench_row['max_unique_set_size'])}",
            f"{prettify_bytes_number(bench_row['io_chars'])}",
        ]
        print(" & ".join(bench_cells) + " \\\\")

        # Detail rows: benchmark, category, script count and input description
        # cells stay blank.
        for _, script_row in big_script.iterrows():
            if script_row['benchmark'] != bench_name:
                continue
            script_path = script_row['script']
            if not any(fnmatch.fnmatch(script_path, pattern) for pattern in scripts_to_include):
                continue
            script_name = script_path.split('/')[-1]
            memory_cell = prettify_bytes_number(script_row['max_unique_set_size'])
            io_cell = prettify_bytes_number(script_row['io_chars'])
            print(f"\\hspace{{0.5em}} {script_name} & & & {script_row['loc']} & {script_row['constructs']} & {script_row['unique_cmds']} & & {script_row['time_in_shell']:.2f} & {script_row['time_in_commands']:.2f} & {memory_cell} & {io_cell} \\\\")
            shown_scripts += 1

        # Signal that some scripts of a multi-script benchmark were omitted.
        if shown_scripts < total_scripts and total_scripts > 1:
            print(f"\\hspace{{0.5em}} \\ldots & & & & & & & & & & \\\\")
    print("""
\\bottomrule
\\end{tabular}
""")


if __name__ == '__main__':
main()
35 changes: 31 additions & 4 deletions infrastructure/data/script-globs.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
{
"aurpkg": {
"scripts": ["aurpkg/scripts/*.sh"]
},
"bio": {
"scripts": ["bio/scripts/*.sh"]
},
"covid-mts": {
"scripts": ["covid-mts/scripts/*.sh"]
},
Expand Down Expand Up @@ -26,22 +32,43 @@
"riker": {
"scripts": ["riker/scripts/*/build.sh"]
},
"uniq-ips": {
"scripts": ["uniq-ips/scripts/run.sh"]
},
"unix50": {
"scripts": ["unix50/scripts/*.sh"]
},
"web-index": {
"scripts": ["web-index/scripts/*.sh"]
},
"makeself": {
"scripts": ["makeself/makeself/*.sh"]
"scripts": ["makeself/makeself/test/*/*.sh"]
},
"vps-audit": {
"scripts": ["vps-audit/scripts/*.sh"]
},
"vps-audit-negate": {
"scripts": ["vps-audit-negate/scripts/*.sh"]
},
"infrastructure/standards/100-files": {
"scripts": ["infrastructure/standards/100-files/scripts/*.sh"]
},
"infrastructure/standards/read-write": {
"scripts": ["infrastructure/standards/read-write/scripts/*.sh"]
},
"infrastructure/standards/shell-memory": {
"scripts": ["infrastructure/standards/shell-memory/scripts/*.sh"]
},
"infrastructure/standards/sleep": {
"scripts": ["infrastructure/standards/sleep/scripts/*.sh"]
},
"infrastructure/standards/time-in-shell-subprocess": {
"scripts": ["infrastructure/standards/time-in-shell-subprocess/scripts/*.sh"]
},
"infrastructure/standards/user-time": {
"scripts": ["infrastructure/standards/user-time/scripts/*.sh"]
},
"infrastructure/standards/user-time-in-shell": {
"scripts": ["infrastructure/standards/user-time-in-shell/scripts/*.sh"]
},
"infrastructure/standards/write-only": {
"scripts": ["infrastructure/standards/write-only/scripts/*.sh"]
}
}
Loading

0 comments on commit 5b1a88e

Please sign in to comment.