From be4562e74eb8e3bc5e1d0f604d360233204a00e6 Mon Sep 17 00:00:00 2001 From: EtomicBomb Date: Sun, 20 Oct 2024 19:46:10 -0400 Subject: [PATCH] just max-temp --- max-temp/README.md | 11 ----- max-temp/analytics-preprocess.sh | 15 ------ max-temp/analytics-process.sh | 19 -------- max-temp/cleanup.sh | 8 ++++ max-temp/input.sh | 25 ++++++++++ max-temp/input/.gitignore | 1 + max-temp/inputs/README.md | 0 max-temp/inputs/cleanup.sh | 4 -- max-temp/inputs/dependencies.sh | 3 -- max-temp/inputs/inputs.sh | 11 ----- max-temp/inputs/run.sh | 40 ---------------- max-temp/inputs/tes.sh | 45 ------------------ max-temp/inputs/verify.sh | 74 ------------------------------ max-temp/max-temp.sh | 24 ---------- max-temp/new-temp-analytics.sh | 44 ------------------ max-temp/run.sh | 19 ++++++++ max-temp/scripts/temp-analytics.sh | 21 +++++++++ max-temp/temp-analytics.sh | 43 ----------------- max-temp/verify.sh | 18 ++++++++ max-temp/whole_shebang.sh | 15 ------ 20 files changed, 92 insertions(+), 348 deletions(-) delete mode 100644 max-temp/README.md delete mode 100755 max-temp/analytics-preprocess.sh delete mode 100755 max-temp/analytics-process.sh create mode 100755 max-temp/cleanup.sh create mode 100755 max-temp/input.sh create mode 100644 max-temp/input/.gitignore delete mode 100644 max-temp/inputs/README.md delete mode 100644 max-temp/inputs/cleanup.sh delete mode 100644 max-temp/inputs/dependencies.sh delete mode 100644 max-temp/inputs/inputs.sh delete mode 100644 max-temp/inputs/run.sh delete mode 100644 max-temp/inputs/tes.sh delete mode 100644 max-temp/inputs/verify.sh delete mode 100755 max-temp/max-temp.sh delete mode 100755 max-temp/new-temp-analytics.sh create mode 100755 max-temp/run.sh create mode 100755 max-temp/scripts/temp-analytics.sh delete mode 100755 max-temp/temp-analytics.sh create mode 100755 max-temp/verify.sh delete mode 100644 max-temp/whole_shebang.sh diff --git a/max-temp/README.md b/max-temp/README.md deleted file mode 100644 index 
e8e619c36..000000000 --- a/max-temp/README.md +++ /dev/null @@ -1,11 +0,0 @@ -## Max-Temp -> Downloads and processes large temperature data set - -## Running Max-Temp - -* By default we're downloading data from the year 2015. -* --flags can be: --pre (for preprocessing), --pro (for processing), --full (for full execution) -* TODO: --small, --large input specification - - - diff --git a/max-temp/analytics-preprocess.sh b/max-temp/analytics-preprocess.sh deleted file mode 100755 index c49b0fc1a..000000000 --- a/max-temp/analytics-preprocess.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -#https://www1.ncdc.noaa.gov/pub/data/noaa/ works! -## KK: not exactly. It returns a different format! -IN=${IN:-'https://atlas-group.cs.brown.edu/data/noaa/ '} - -sed "s;^;$IN;" | - sed 's;$;/;' | - xargs -r -n 1 curl -s | - grep gz | - tr -s ' \n' | - cut -d ' ' -f9 | - sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' | - sed "s;^;$IN;" | - xargs -n1 curl -s | - gunzip \ No newline at end of file diff --git a/max-temp/analytics-process.sh b/max-temp/analytics-process.sh deleted file mode 100755 index 9ad160d87..000000000 --- a/max-temp/analytics-process.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -## Processing -cat "${data_file}" | - cut -c 88-92 | - grep -v 999 | - sort -rn | - head -n1 > ${outputs_dir}/max.txt - -cat "${data_file}" | - cut -c 88-92 | - grep -v 999 | - sort -n | - head -n1 > ${outputs_dir}/min.txt - -cat "${data_file}" | - cut -c 88-92 | - grep -v 999 | - awk "{ total += \$1; count++ } END { print total/count }" > ${outputs_dir}/average.txt \ No newline at end of file diff --git a/max-temp/cleanup.sh b/max-temp/cleanup.sh new file mode 100755 index 000000000..c57ffff1e --- /dev/null +++ b/max-temp/cleanup.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +REPO_TOP=$(git rev-parse --show-toplevel) +results_dir="${REPO_TOP}/max-temp/results" + +echo "Cleaning up outputs..." 
+rm -rf -- "$results_dir" + diff --git a/max-temp/input.sh b/max-temp/input.sh new file mode 100755 index 000000000..7053ba6cb --- /dev/null +++ b/max-temp/input.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +REPO_TOP=$(git rev-parse --show-toplevel) + +eval_dir="${REPO_TOP}/max-temp" +results_dir="${eval_dir}/results" +scripts_dir="${eval_dir}/scripts" +input_dir="${eval_dir}/input" + +FROM=${FROM:-2015} +TO=${TO:-2015} +URL='https://www1.ncdc.noaa.gov/pub/data/noaa/' + +## Downloading and extracting +seq "$FROM" "$TO" | + sed "s;^;$URL;" | + sed 's;$;/;' | + xargs -r -n1 curl --insecure | # download each per-year URL, one curl per line + grep gz | + tr -s ' \n' | + cut -d ' ' -f9 | + sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' | # prepend the 20xx year found in the name as a directory + sed "s;^;$URL;" | + xargs -n1 curl --insecure | + gunzip > "$input_dir/temperatures2015.txt" diff --git a/max-temp/input/.gitignore b/max-temp/input/.gitignore new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/max-temp/input/.gitignore @@ -0,0 +1 @@ + diff --git a/max-temp/inputs/README.md b/max-temp/inputs/README.md deleted file mode 100644 index e69de29bb..000000000 diff --git a/max-temp/inputs/cleanup.sh b/max-temp/inputs/cleanup.sh deleted file mode 100644 index 203aecabf..000000000 --- a/max-temp/inputs/cleanup.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -echo "Cleaning up" -rm -r temp diff --git a/max-temp/inputs/dependencies.sh b/max-temp/inputs/dependencies.sh deleted file mode 100644 index b5be726f6..000000000 --- a/max-temp/inputs/dependencies.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -echo "Clearing depedencies" \ No newline at end of file diff --git a/max-temp/inputs/inputs.sh b/max-temp/inputs/inputs.sh deleted file mode 100644 index 8d8e0dbdb..000000000 --- a/max-temp/inputs/inputs.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -#setup_dataset_max_temp() { -# echo 'This experiment is expected to fetch data from a remote server' -# echo 'To fetch the original dataset, use an FTP client' -# echo 'e.g., "lftp ftp://ftp.ncdc.noaa.gov/pub/data/noaa"' -#} - - -echo 
"Setting up inputs" -#setup_dataset_max_temp \ No newline at end of file diff --git a/max-temp/inputs/run.sh b/max-temp/inputs/run.sh deleted file mode 100644 index c17c24871..000000000 --- a/max-temp/inputs/run.sh +++ /dev/null @@ -1,40 +0,0 @@ -if [[ "$1" == "-c" ]]; then - rm -r ../outputs/* - return 0 -fi - -max-temp(){ - outputs_dir="../outputs" - times_file="${outputs_dir}/time.res" - hashed_file="${outputs_dir}/hashed.res" - outputs_suffix=".out" - local script=$1 - - if [ -e "${times_file}" ]; then - echo "skipping max-temp" - return 0 - fi - - if [ ! -d ../data ]; then - mkdir ../data - fi - - mkdir -p "$outputs_dir" - touch "$times_file" - outputs_file="${outputs_dir}/${script}.${outputs_suffix}" - echo "${script}.sh:" $({ time ../${script}.sh > "${outputs_file}"; } 2>&1) | tee -a "$times_file" - -} - -if [[ $1 == "--pre" ]]; then - script="analytics-preprocess" -elif [[ $1 == "--pro" ]]; then - script="analytics-process" -elif [[ $1 == "--full" ]] || [[ $1 == "" ]] ; then - script="temp-analytics" -else - echo "Invalid choice! Choices are --pre (for preprocessing), --pro (for processing), --full (for everything)" - return 0 -fi - -max-temp $script \ No newline at end of file diff --git a/max-temp/inputs/tes.sh b/max-temp/inputs/tes.sh deleted file mode 100644 index 9608fb571..000000000 --- a/max-temp/inputs/tes.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -#This is just for testing. DON'T LOOK - -size='.' 
-file1="../for_validation/${size}/for_validation.time" -file2="../outputs/${size}/asimpletime.res" - -convert_to_seconds() { - delimiter_min="m" - delimiter_sec="s" - delimiter_dec="," - - local num1=$1 - - local minutes="${num1%%$delimiter_min*}" - local seconds="${num1#*$delimiter_min}" - local seconds="${seconds%%$delimiter_sec*}" - local seconds_before="${seconds%%$delimiter_dec*}" - local seconds_dec="${seconds#*$delimiter_dec}" - - local total_sec=$((minutes * 60000 + seconds_before * 1000 + seconds_dec)) - echo "$total_sec" -} - -cut1=$(awk '{print $3}' "$file1") -cut2=$(awk '{print $3}' "$file2") -cutname=$(awk '{print $1}' "$file1") - -paste <(echo "$cutname") <(echo "$cut1") <(echo "$cut2") > "./temp/joint_cut.txt" - -while read -r num1 num2 num3; do - - first=$(convert_to_seconds "$num2") - second=$(convert_to_seconds "$num3") - - diff=$((first - second)) - if [ "$diff" -lt 0 ];then - diff=$((-diff)) - fi - - if [[ $diff -gt $((first * 2 / 100)) ]];then #does not work for very small times :( - echo "$num1" "Deviation in excecution times! Make sure everything runs smoothly." 
- fi - -done < "./temp/joint_cut.txt" \ No newline at end of file diff --git a/max-temp/inputs/verify.sh b/max-temp/inputs/verify.sh deleted file mode 100644 index ec0d038cf..000000000 --- a/max-temp/inputs/verify.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash -convert_to_seconds() { - delimiter_min="m" - delimiter_sec="s" - delimiter_dec="," - - local num1=$1 - - local minutes="${num1%%$delimiter_min*}" - local seconds="${num1#*$delimiter_min}" - local seconds="${seconds%%$delimiter_sec*}" - local seconds_before="${seconds%%$delimiter_dec*}" - local seconds_dec="${seconds#*$delimiter_dec}" - - local total_sec=$((10#$minutes * 60000 + 10#$seconds_before * 1000 + 10#$seconds_dec)) - echo "$total_sec" -} - -time_check(){ - local file1=$1 - local file2=$2 - - local cut1=$(awk '{print $3}' "$file1") - local cut2=$(awk '{print $3}' "$file2") - local cutname=$(awk '{print $1}' "$file1") - - paste <(echo "$cutname") <(echo "$cut1") <(echo "$cut2") > "./temp/joint_cut.txt" - - while read -r num1 num2 num3; do - - local first=$(convert_to_seconds "$num2") - local second=$(convert_to_seconds "$num3") - - local diff=$((first - second)) - - if [ "$diff" -lt 0 ];then - diff=$((-diff)) - fi - - if [[ $diff -gt $((first * 20 / 100)) ]];then #does not work for very small times :( - echo "$num1" "Deviation in excecution times! Make sure everything runs smoothly." - fi - - done < "./temp/joint_cut.txt" -} -echo "Verifying correct execution" -echo "" - -if [ ! -d ./temp ]; then - mkdir ./temp -fi - -size="." 
#changes for small/large inputs -og_output_time="../for_validation/${size}/for_validation.time" -my_output_time="../outputs/${size}/time.res" - -echo "Time comparison" -time_check "$og_output_time" "$my_output_time" - -echo "" - -echo "Values comparison" -og_output_avg="../for_validation/${size}/for_validation.avg" -og_output_min="../for_validation/${size}/for_validation.min" -og_output_max="../for_validation/${size}/for_validation.max" - -my_output_avg="../outputs/${size}/average.txt" -my_output_min="../outputs/${size}/min.txt" -my_output_max="../outputs/${size}/max.txt" - - -diff -q "$og_output_avg" "$my_output_avg" -diff -q "$og_output_min" "$my_output_min" -diff -q "$og_output_max" "$my_output_max" \ No newline at end of file diff --git a/max-temp/max-temp.sh b/max-temp/max-temp.sh deleted file mode 100755 index f12fdd54d..000000000 --- a/max-temp/max-temp.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -FROM=${FROM:-2015} -TO=${TO:-2015} -IN=${IN:-'https://atlas-group.cs.brown.edu/data/noaa/ '} -fetch=${fetch:-"curl -s"} - -seq $FROM $TO | - ## URL manipulation and data download - sed "s;^;$IN;" | - sed 's;$;/;' | - xargs -r -n 1 $fetch | - grep gz | - tr -s ' \n' | - cut -d ' ' -f9 | - sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' | - sed "s;^;$IN;" | - xargs -n1 curl -s | - gunzip | - ## Processing - cut -c 88-92 | - grep -v 999 | - sort -rn | - head -n1 \ No newline at end of file diff --git a/max-temp/new-temp-analytics.sh b/max-temp/new-temp-analytics.sh deleted file mode 100755 index a9c0f4a07..000000000 --- a/max-temp/new-temp-analytics.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -FROM=${FROM:-2015} -TO=${TO:-2015} -# IN=${IN:-'https://www1.ncdc.noaa.gov/pub/data/noaa/'} -IN=${IN:-'https://atlas-group.cs.brown.edu/data/noaa/'} -fetch=${fetch:-"curl -s"} - -data_dir=../data -outputs_dir=../outputs - - -## Downloading and extracting -for year in $(seq $FROM $TO) -do - url_year="$IN$year/" - data_file="${data_dir}/temperatures.$year.txt" - ## Note: 
I am concerned about the use of -s because it might hide some error which might make it hard to find - curl -s "$url_year" | - grep gz | - ## --------- NECESSARY WHEN DOWNLOADING FROM NOAA ----------- - ## note: regexp generated with ChatGPT - ## sed -n 's/.*href="\([^"]*\.gz\)".*/\1/p' | - ## sed "s;^;$url_year;" | - ## ---------------------------------------------------------- - tr -s ' \n' | - cut -d ' ' -f9 | - sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' | - sed "s;^;$IN;" | - ## Note: I am concerned about the use of -s because it might hide some error which might make it hard to find - xargs -n1 curl -s | - gunzip > "${data_file}" -done - -for year in $(seq $FROM $TO) -do - data_file="${data_dir}/temperatures.$year.txt" - cat "${data_file}" | - cut -c 88-92 | - grep -v 999 | - sort -rg | - head -n1 > ${outputs_dir}/max.$year.txt - echo "Max for $year is $(cat ${outputs_dir}/max.$year.txt)" -done \ No newline at end of file diff --git a/max-temp/run.sh b/max-temp/run.sh new file mode 100755 index 000000000..d9f917484 --- /dev/null +++ b/max-temp/run.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +REPO_TOP=$(git rev-parse --show-toplevel) + +eval_dir="${REPO_TOP}/max-temp" +results_dir="${eval_dir}/results" +scripts_dir="${eval_dir}/scripts" +input_dir="${eval_dir}/input" + +shell="/bin/bash" + +mkdir -p "$results_dir" + +export input_file="${input_dir}/temperatures2015.txt" +export results_dir +script="${scripts_dir}/temp-analytics.sh" + +echo "Executing $(basename "$script")" +"$shell" "$script" > "$results_dir/$(basename "$script").out" diff --git a/max-temp/scripts/temp-analytics.sh b/max-temp/scripts/temp-analytics.sh new file mode 100755 index 000000000..0c01fbc54 --- /dev/null +++ b/max-temp/scripts/temp-analytics.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +[[ -n "$input_file" ]] || { echo "script was not provided with \$input_file" >&2; exit 1; } # abort rather than run the pipelines on an empty path +[[ -n "$results_dir" ]] || { echo "script was not provided with \$results_dir" >&2; exit 1; } + +cat "${input_file}" | + cut -c 88-92 | + grep -v 999 | + sort -rn | 
+ head -n1 > "${results_dir}/max.txt" # quoted: an unquoted target breaks when the path contains whitespace + +cat "${input_file}" | + cut -c 88-92 | + grep -v 999 | + sort -n | + head -n1 > "${results_dir}/min.txt" + +cat "${input_file}" | + cut -c 88-92 | + grep -v 999 | + awk "{ total += \$1; count++ } END { print total/count }" > "${results_dir}/average.txt" diff --git a/max-temp/temp-analytics.sh b/max-temp/temp-analytics.sh deleted file mode 100755 index 0102eee7f..000000000 --- a/max-temp/temp-analytics.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -#https://www1.ncdc.noaa.gov/pub/data/noaa/ seems to be working yay! -#!/bin/bash - -FROM=${FROM:-2015} -TO=${TO:-2015} -IN=${IN:-'https://www1.ncdc.noaa.gov/pub/data/noaa/'} -fetch=${fetch:-"curl -s"} - -data_dir=../data -outputs_dir=../outputs -data_file=${data_dir}/temperatures.txt - -## Downloading and extracting -seq $FROM $TO | - sed "s;^;$IN;" | - sed 's;$;/;' | - xargs -r -n 1 $fetch | - grep gz | - tr -s ' \n' | - cut -d ' ' -f9 | - sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' | - sed "s;^;$IN;" | - xargs -n1 curl -s | - gunzip > "${data_file}" - -## Processing -cat "${data_file}" | - cut -c 88-92 | - grep -v 999 | - sort -rn | - head -n1 > ${outputs_dir}/max.txt - -cat "${data_file}" | - cut -c 88-92 | - grep -v 999 | - sort -n | - head -n1 > ${outputs_dir}/min.txt - -cat "${data_file}" | - cut -c 88-92 | - grep -v 999 | - awk "{ total += \$1; count++ } END { print total/count }" > ${outputs_dir}/average.txt \ No newline at end of file diff --git a/max-temp/verify.sh b/max-temp/verify.sh new file mode 100755 index 000000000..7b488d906 --- /dev/null +++ b/max-temp/verify.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +REPO_TOP=$(git rev-parse --show-toplevel) + +eval_dir="${REPO_TOP}/max-temp" +results_dir="${eval_dir}/results" +correct_dir="${eval_dir}/correct-results" + +diff "$results_dir/average.txt" "$correct_dir/average.txt" \ + && diff "$results_dir/min.txt" "$correct_dir/min.txt" \ + && diff "$results_dir/max.txt" "$correct_dir/max.txt" + +if [ $? 
-eq 0 ]; then + echo "Valid" +else + echo "Invalid" + exit 1 +fi diff --git a/max-temp/whole_shebang.sh b/max-temp/whole_shebang.sh deleted file mode 100644 index e246015a3..000000000 --- a/max-temp/whole_shebang.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -echo "Start" -echo " || " -cd inputs -source ./inputs.sh -echo " || " -source ./run.sh $1 -echo " || " -source ./verify.sh -echo " || " -source ./cleanup.sh -cd .. -echo " || " -echo "We did it!"