diff --git a/.github/lm-eval-configs/full-large-models.txt b/.github/lm-eval-configs/full-large-models.txt
new file mode 100644
index 0000000000000..55c913a282bc8
--- /dev/null
+++ b/.github/lm-eval-configs/full-large-models.txt
@@ -0,0 +1,9 @@
+Meta-Llama-3-70B-Instruct-FP8.yaml
+Meta-Llama-3-70B-Instruct.yaml
+Mixtral-8x22B-Instruct-v0.1-FP8.yaml
+Mixtral-8x22B-Instruct-v0.1.yaml
+Mixtral-8x7B-Instruct-v0.1-FP8.yaml
+Mixtral-8x7B-Instruct-v0.1.yaml
+Qwen2-57B-A14B-Instruct.yaml
+Qwen2-72B-Instruct.yaml
+Phi-3-medium-4k-instruct.yaml
diff --git a/.github/lm-eval-configs/full-small-models.txt b/.github/lm-eval-configs/full-small-models.txt
new file mode 100644
index 0000000000000..caca502f76d04
--- /dev/null
+++ b/.github/lm-eval-configs/full-small-models.txt
@@ -0,0 +1,7 @@
+gemma-7b-it.yaml
+Meta-Llama-3-8B-Instruct-FP8-KV.yaml
+Meta-Llama-3-8B-Instruct-FP8.yaml
+Meta-Llama-3-8B-Instruct-W4A16.yaml
+Meta-Llama-3-8B-Instruct.yaml
+Mistral-7B-Instruct-v0.3.yaml
+Qwen2-7B-Instruct.yaml
diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct-FP8.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct-FP8.yaml
new file mode 100644
index 0000000000000..2ef7b975b8bc9
--- /dev/null
+++ b/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct-FP8.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
+model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.900
+  - name: "exact_match,flexible-extract"
+    value: 0.900
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct.yaml
new file mode 100644
index 0000000000000..70f1030fa0007
--- /dev/null
+++ b/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
+model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.888
+  - name: "exact_match,flexible-extract"
+    value: 0.888
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8-KV.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8-KV.yaml
new file mode 100644
index 0000000000000..1c46cda9da11a
--- /dev/null
+++ b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8-KV.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV -b 32 -l 250 -f 5 -t 1
+model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.764
+  - name: "exact_match,flexible-extract"
+    value: 0.764
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8.yaml
new file mode 100644
index 0000000000000..0d077dc19d95a
--- /dev/null
+++ b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
+model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.744
+  - name: "exact_match,flexible-extract"
+    value: 0.740
+limit: 250
+num_fewshot: 5
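Note: the first-line comment in each per-model config records the exact command used to produce its reference scores, and the *.txt list files above are consumed one entry at a time by the nm-run-lm-eval-vllm.sh runner added later in this diff. A rough sketch of that lookup, assuming the nm-vllm checkout as the working directory (illustration only; the real loop lives in .github/scripts/nm-run-lm-eval-vllm.sh below):

    while read -r MODEL_CONFIG; do
        echo "would evaluate: $PWD/.github/lm-eval-configs/models/${MODEL_CONFIG}"
    done < .github/lm-eval-configs/full-small-models.txt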
diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-W4A16.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-W4A16.yaml
new file mode 100644
index 0000000000000..92d07ad0c734a
--- /dev/null
+++ b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-W4A16.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ -b 32 -l 250 -f 5
+model_name: "TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.684
+  - name: "exact_match,flexible-extract"
+    value: 0.688
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml
new file mode 100644
index 0000000000000..d7abd6b36bfc6
--- /dev/null
+++ b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
+model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.74
+  - name: "exact_match,flexible-extract"
+    value: 0.74
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Mistral-7B-Instruct-v0.3.yaml b/.github/lm-eval-configs/models/Mistral-7B-Instruct-v0.3.yaml
new file mode 100644
index 0000000000000..592652eed999e
--- /dev/null
+++ b/.github/lm-eval-configs/models/Mistral-7B-Instruct-v0.3.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mistral-7B-Instruct-v0.3 -b 32 -l 250 -f 5
+model_name: "mistralai/Mistral-7B-Instruct-v0.3"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.524
+  - name: "exact_match,flexible-extract"
+    value: 0.524
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1-FP8.yaml b/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1-FP8.yaml
new file mode 100644
index 0000000000000..8d1eaecf5bec6
--- /dev/null
+++ b/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1-FP8.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x22B-Instruct-v0.1 -b 32 -l 250 -f 5
+model_name: "mistralai/Mixtral-8x22B-Instruct-v0.1"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.840
+  - name: "exact_match,flexible-extract"
+    value: 0.844
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1.yaml b/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1.yaml
new file mode 100644
index 0000000000000..73f00b16c51aa
--- /dev/null
+++ b/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x22B-Instruct-v0.1 -b 32 -l 250 -f 5
+model_name: "mistralai/Mixtral-8x22B-Instruct-v0.1"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.876
+  - name: "exact_match,flexible-extract"
+    value: 0.880
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1-FP8.yaml b/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
new file mode 100644
index 0000000000000..e3f30baf316be
--- /dev/null
+++ b/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
+model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.616
+  - name: "exact_match,flexible-extract"
+    value: 0.620
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1.yaml b/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1.yaml
new file mode 100644
index 0000000000000..629e3721fdf44
--- /dev/null
+++ b/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1.yaml
@@ -0,0 +1,11 @@
+# bash ./nm-run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b 32 -l 250 -f 5 -t 4
+model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.616
+  - name: "exact_match,flexible-extract"
+    value: 0.628
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Phi-3-medium-4k-instruct.yaml b/.github/lm-eval-configs/models/Phi-3-medium-4k-instruct.yaml
new file mode 100644
index 0000000000000..443db66c9adc6
--- /dev/null
+++ b/.github/lm-eval-configs/models/Phi-3-medium-4k-instruct.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m microsoft/Phi-3-medium-4k-instruct -b 16 -l 250 -f 5
+model_name: "microsoft/Phi-3-medium-4k-instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.840
+  - name: "exact_match,flexible-extract"
+    value: 0.852
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Qwen2-57B-A14B-Instruct.yaml b/.github/lm-eval-configs/models/Qwen2-57B-A14B-Instruct.yaml
new file mode 100644
index 0000000000000..a46aa16f0bcd4
--- /dev/null
+++ b/.github/lm-eval-configs/models/Qwen2-57B-A14B-Instruct.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b 32 -l 250 -f 5
+model_name: "Qwen/Qwen2-57B-A14B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.736
+  - name: "exact_match,flexible-extract"
+    value: 0.800
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Qwen2-72B-Instruct.yaml b/.github/lm-eval-configs/models/Qwen2-72B-Instruct.yaml
new file mode 100644
index 0000000000000..fe5a2c0af1e4a
--- /dev/null
+++ b/.github/lm-eval-configs/models/Qwen2-72B-Instruct.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m Qwen/Qwen2-72B-Instruct -b 16 -l 250 -f 5
+model_name: "Qwen/Qwen2-72B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.828
+  - name: "exact_match,flexible-extract"
+    value: 0.856
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Qwen2-7B-Instruct.yaml b/.github/lm-eval-configs/models/Qwen2-7B-Instruct.yaml
new file mode 100644
index 0000000000000..5bf60816dac8f
--- /dev/null
+++ b/.github/lm-eval-configs/models/Qwen2-7B-Instruct.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m Qwen/Qwen2-7B-Instruct -b 32 -l 250 -f 5
+model_name: "Qwen/Qwen2-7B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.680
+  - name: "exact_match,flexible-extract"
+    value: 0.756
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/gemma-7b-it.yaml b/.github/lm-eval-configs/models/gemma-7b-it.yaml
new file mode 100644
index 0000000000000..0b3813d240add
--- /dev/null
+++ b/.github/lm-eval-configs/models/gemma-7b-it.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m google/gemma-7b-it -b 16 -l 250 -f 5
+model_name: "google/gemma-7b-it"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.284
+  - name: "exact_match,flexible-extract"
+    value: 0.324
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/smoke-large-models.txt b/.github/lm-eval-configs/smoke-large-models.txt
new file mode 100644
index 0000000000000..127ec5d97bcff
--- /dev/null
+++ b/.github/lm-eval-configs/smoke-large-models.txt
@@ -0,0 +1,2 @@
+Meta-Llama-3-70B-Instruct.yaml
+Mixtral-8x7B-Instruct-v0.1.yaml
diff --git a/.github/lm-eval-configs/smoke-small-models.txt b/.github/lm-eval-configs/smoke-small-models.txt
new file mode 100644
index 0000000000000..d884f36672a74
--- /dev/null
+++ b/.github/lm-eval-configs/smoke-small-models.txt
@@ -0,0 +1 @@
+Meta-Llama-3-8B-Instruct.yaml
diff --git a/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh b/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh
index 76f7100af5949..fdb8ec5393b36 100755
--- a/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh
+++ b/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh
@@ -14,13 +14,12 @@ usage() {
   echo
   echo "  -m    - huggingface stub or local directory of the model"
   echo "  -b    - batch size to run the evaluation at"
-  echo "  -d    - device to use (e.g. cuda, cuda:0, auto, cpu)"
   echo "  -l    - limit number of samples to run"
   echo "  -f    - number of fewshot samples to use"
   echo
 }
 
-while getopts "m:b:d:l:f:" OPT; do
+while getopts "m:b:l:f:" OPT; do
   case ${OPT} in
     m )
       MODEL="$OPTARG"
@@ -28,9 +27,6 @@ while getopts "m:b:d:l:f:" OPT; do
     b )
       BATCH_SIZE="$OPTARG"
       ;;
-    d )
-      DEVICE="$OPTARG"
-      ;;
     l )
       LIMIT="$OPTARG"
       ;;
@@ -45,6 +41,6 @@ while getopts "m:b:d:l:f:" OPT; do
 done
 
 lm_eval --model hf \
-  --model_args pretrained=$MODEL \
+  --model_args pretrained=$MODEL,parallelize=True \
   --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
-  --batch_size $BATCH_SIZE --device $DEVICE
+  --batch_size $BATCH_SIZE
diff --git a/.github/scripts/nm-run-lm-eval-gsm-vllm-baseline.sh b/.github/scripts/nm-run-lm-eval-gsm-vllm-baseline.sh
new file mode 100644
index 0000000000000..d6b38752945ce
--- /dev/null
+++ b/.github/scripts/nm-run-lm-eval-gsm-vllm-baseline.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on GSM for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+
+usage() {
+  echo
+  echo "Runs lm eval harness on GSM8k using vllm."
+  echo "This pathway is intended to be used to create baselines for "
+  echo "our automated nm-test-accuracy workflow"
+  echo
+  echo "usage: ${0} "
+  echo
+  echo "  -m    - huggingface stub or local directory of the model"
+  echo "  -b    - batch size to run the evaluation at"
+  echo "  -l    - limit number of samples to run"
+  echo "  -f    - number of fewshot samples to use"
+  echo "  -t    - tensor parallel size to run at"
+  echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+  case ${OPT} in
+    m )
+      MODEL="$OPTARG"
+      ;;
+    b )
+      BATCH_SIZE="$OPTARG"
+      ;;
+    l )
+      LIMIT="$OPTARG"
+      ;;
+    f )
+      FEWSHOT="$OPTARG"
+      ;;
+    t )
+      TP_SIZE="$OPTARG"
+      ;;
+    \? )
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+lm_eval --model vllm \
+  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE \
+  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
+  --batch_size $BATCH_SIZE
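Note: the new -t flag is passed through to vllm as tensor_parallel_size, so baselines for larger FP8 models can be measured on multi-GPU runners. For example, the command recorded in Mixtral-8x7B-Instruct-v0.1.yaml above, run from .github/scripts/ (shown here only as a usage sketch; any other model stub and sizes would follow the same pattern):

    bash ./nm-run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b 32 -l 250 -f 5 -t 4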
diff --git a/.github/scripts/nm-run-lm-eval-vllm.sh b/.github/scripts/nm-run-lm-eval-vllm.sh
index c68d6d1d7697f..d0702a086d911 100755
--- a/.github/scripts/nm-run-lm-eval-vllm.sh
+++ b/.github/scripts/nm-run-lm-eval-vllm.sh
@@ -7,15 +7,19 @@ usage() {
   echo``
   echo "Runs lm eval harness on GSM8k using vllm server and compares to "
-  echo "precomputed baseline (measured by HF transformers."
+  echo "precomputed baseline (measured by HF transformers.)"
+  echo
+  echo "This script should be run from the /nm-vllm directory"
   echo
   echo "usage: ${0} "
   echo
-  echo "  -c    - path to the test data config (e.g. neuralmagic/lm-eval/YOUR_CONFIG.yaml)"
+  echo "  -c    - path to the test data config (e.g. .github/lm-eval-configs/smoke-small-models.txt)"
   echo
 }
 
-while getopts "c:" OPT; do
+SUCCESS=0
+
+while getopts "c:t:" OPT; do
   case ${OPT} in
     c )
       CONFIG="$OPTARG"
@@ -27,4 +31,30 @@
   esac
 done
 
-LM_EVAL_TEST_DATA_FILE=$CONFIG pytest -v tests/accuracy/test_lm_eval_correctness.py
+# Parse list of configs.
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
+
+for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
+do
+    LOCAL_SUCCESS=0
+
+    echo "=== RUNNING MODEL: $MODEL_CONFIG ==="
+
+    MODEL_CONFIG_PATH=$PWD/.github/lm-eval-configs/models/${MODEL_CONFIG}
+    LM_EVAL_TEST_DATA_FILE=$MODEL_CONFIG_PATH pytest -s tests/accuracy/test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
+
+    if [[ $LOCAL_SUCCESS == 0 ]]; then
+        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
+    else
+        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
+    fi
+
+    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+
+done
+
+if [ "${SUCCESS}" -eq "0" ]; then
+    exit 0
+else
+    exit 1
+fi
diff --git a/.github/workflows/nm-build-test.yml b/.github/workflows/nm-build-test.yml
index febf6b444582a..9a5043308a067 100644
--- a/.github/workflows/nm-build-test.yml
+++ b/.github/workflows/nm-build-test.yml
@@ -85,7 +85,7 @@ on:
         type: string
         default: "60"
       lm_eval_configuration:
-        description: "configuration for lm-eval test (see neuralmagic/lm-eval)"
+        description: "configuration for lm-eval test (see .github/lm-eval-configs)"
        type: string
        default: ""
diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml
index 86201939d359a..8a9ee1a6b9cb7 100644
--- a/.github/workflows/nm-nightly.yml
+++ b/.github/workflows/nm-nightly.yml
@@ -45,6 +45,6 @@ jobs:
       push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"
 
       lm_eval_label: gcp-k8s-l4-solo
-      lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml
+      lm_eval_configuration: ./.github/lm-eval-configs/smoke-small-models.txt
       lm_eval_timeout: 60
     secrets: inherit
diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml
index 5bbd760b63e0b..41398f0625d16 100644
--- a/.github/workflows/nm-remote-push.yml
+++ b/.github/workflows/nm-remote-push.yml
@@ -30,6 +30,6 @@ jobs:
       benchmark_timeout: 480
 
       lm_eval_label: gcp-k8s-l4-solo
-      lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml
+      lm_eval_configuration: ./.github/lm-eval-configs/smoke-small-models.txt
       lm_eval_timeout: 60
     secrets: inherit
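Note: with the list files, per-model configs, and runner in place, the whole accuracy flow can be exercised locally from the root of the nm-vllm checkout (a sketch; assumes a GPU host with nm-vllm and the pinned lm-eval-harness installed):

    # Runs every model in the smoke list through the pytest harness below.
    ./.github/scripts/nm-run-lm-eval-vllm.sh -c .github/lm-eval-configs/smoke-small-models.txt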
diff --git a/neuralmagic/lm-eval/full-small-models.yaml b/neuralmagic/lm-eval/full-small-models.yaml
deleted file mode 100644
index adb00ba65c1f1..0000000000000
--- a/neuralmagic/lm-eval/full-small-models.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-# ./nm-run-lm-eval-gsm-hf-baseline -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -d cuda -l 250 -f 5
-- model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
-  tasks:
-  - name: "gsm8k"
-    metrics:
-    - name: "exact_match,strict-match"
-      value: 0.74
-    - name: "exact_match,flexible-extract"
-      value: 0.74
-  limit: 250
-  num_fewshot: 5
diff --git a/neuralmagic/lm-eval/smoke-small-models.yaml b/neuralmagic/lm-eval/smoke-small-models.yaml
deleted file mode 100644
index 546a221872af8..0000000000000
--- a/neuralmagic/lm-eval/smoke-small-models.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -d cuda -l 250 -f 5
-- model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
-  tasks:
-  - name: "gsm8k"
-    metrics:
-    - name: "exact_match,strict-match"
-      value: 0.74
-    - name: "exact_match,flexible-extract"
-      value: 0.74
-  limit: 250
-  num_fewshot: 5
diff --git a/tests/accuracy/test_lm_eval_correctness.py b/tests/accuracy/test_lm_eval_correctness.py
index 581e56352064b..de975b2785ffd 100644
--- a/tests/accuracy/test_lm_eval_correctness.py
+++ b/tests/accuracy/test_lm_eval_correctness.py
@@ -1,15 +1,14 @@
-# mypy: ignore-errors
-# TODO (robertgshaw2-neuralmagic): clean this up
 import os
+import subprocess
+import time
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, TypedDict
+from typing import TYPE_CHECKING
 
 import numpy
 import pytest
-import torch
+import requests
 import yaml
 
-from tests.nm_utils.server import ServerContext
 from tests.nm_utils.utils_skip import should_skip_test_group
 
 if should_skip_test_group(group_name="TEST_ACCURACY"):
@@ -24,97 +23,93 @@
 lm_eval: "lm_eval_t" = pytest.importorskip("lm_eval", reason="lm_eval required")
 
+RTOL = 0.02
+TEST_DATA_FILE = os.environ.get(
+    "LM_EVAL_TEST_DATA_FILE",
+    ".github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml")
+
 
-class Metric(TypedDict):
-    name: str
-    value: float
 
+def wait_for_server(timeout=900) -> bool:
 
-class Task(TypedDict):
-    name: str
-    metrics: List[Metric]
+    def try_connection() -> bool:
+        try:
+            r = requests.get("http://localhost:8000/health")
+            return r.status_code == 200
+        except Exception as _:
+            return False
+
+    timeout_part = 15  # retry every 15 seconds
+    time_waited = 0
+    while time_waited <= timeout:
+        time.sleep(timeout_part)
+        if try_connection():
+            return True
+        time_waited = time_waited + timeout_part
 
-# to support python3.8 typing prior to adding `Required`/`NotRequired`, this
-# class stores the optional keys and the `EvalTaskDefinition` subclass inherits
-# those alongside the required keys it defines.
-class EvalTaskDefinitionOpts(TypedDict, total=False):
-    enable_tensor_parallel: bool
-    extra_args: Dict[str, Any]
-    rtol: float
+    return False
 
-class EvalTaskDefinition(EvalTaskDefinitionOpts):
-    model_name: str
-    tasks: List[Task]
-
-
-TEST_DATA_FILE = os.environ.get("LM_EVAL_TEST_DATA_FILE", None)
-if TEST_DATA_FILE is None:
-    raise ValueError("LM_EVAL_TEST_DATA_FILE env variable is not set.")
-TEST_DATA_FILE = Path(TEST_DATA_FILE)
-
-TEST_DATA: List[EvalTaskDefinition] = [
-    pytest.param(eval_def, id=eval_def["model_name"])
-    for eval_def in yaml.safe_load(TEST_DATA_FILE.read_text(encoding="utf-8"))
-]
-DEFAULT_RTOL = 0.05
-
-
-@pytest.mark.parametrize("eval_data", TEST_DATA)
-def test_lm_eval_correctness(
-    eval_data: EvalTaskDefinition,
-    monkeypatch: pytest.MonkeyPatch,
-):
-    monkeypatch.setenv("TOKENIZERS_PARALLELISM", "false")
-    monkeypatch.setenv("OPENAI_API_KEY", "dummy")
-
-    model_name = eval_data["model_name"]
-    vllm_args = {
-        "--model": model_name,
-        "--disable-log-requests": None,
-        "--max-model-len": 4096,
-    }
-
-    if eval_data.get("enable_tensor_parallel") is True:
-        tp = torch.cuda.device_count()
-        vllm_args["--tensor-parallel-size"] = tp
-
-    if extra_args := eval_data.get("extra_args"):
-        vllm_args.update(extra_args)
-
+def launch_lm_eval(eval_config):
+    os.environ["OPENAI_API_KEY"] = "dummy"
     openai_args = ",".join([
-        f"model={model_name}",
+        f"model={eval_config['model_name']}",
         "tokenizer_backend=huggingface",
         "base_url=http://localhost:8000/v1",
     ])
 
-    with ServerContext(vllm_args) as _:
-        task_names = [task["name"] for task in eval_data["tasks"]]
-        limit = eval_data["limit"]
-        new_fewshot = eval_data["num_fewshot"]
-        results = lm_eval.simple_evaluate(
-            model="local-completions",
-            model_args=openai_args,
-            tasks=task_names,
-            batch_size=32,
-            num_fewshot=new_fewshot,
-            limit=limit,
-        )
-
-    lm_eval.models.utils.clear_torch_cache()
-
-    rtol = eval_data.get("rtol", DEFAULT_RTOL)
-    for task in eval_data["tasks"]:
-        for metric in task["metrics"]:
-            ground_truth = metric["value"]
-            measured_value = results["results"][task["name"]][metric["name"]]
-            print(
-                "%s %s:\nground_truth=%s measured_value=%s",
-                task["name"],
-                metric["name"],
-                ground_truth,
-                measured_value,
-            )
-
-            assert numpy.isclose(ground_truth, measured_value, rtol=rtol)
+    results = lm_eval.simple_evaluate(
+        model="local-completions",
+        model_args=openai_args,
+        tasks=[task["name"] for task in eval_config["tasks"]],
+        batch_size=32,
+        num_fewshot=eval_config["num_fewshot"],
+        limit=eval_config["limit"],
+    )
+
+    return results
+
+
+def test_lm_eval_correctness(num_gpus_available):
+    eval_config = yaml.safe_load(
+        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
+
+    # Setup server launch.
+    server_args = {
+        "model": eval_config["model_name"],
+        "max-model-len": 4096,
+        "tensor-parallel-size": num_gpus_available,
+        # TODO (@robertgshaw2): understand why default (mp) does not
+        # shut down cleanly (it works, but not clean).
+        "distributed-executor-backend": "ray",
+        "disable-log-requests": "",
+    }
+
+    server_cmd = "python3 -m vllm.entrypoints.openai.api_server " + \
+        " ".join([f"--{k} {v}"
+                  for k, v in server_args.items()])
+
+    try:
+        # Launch server.
+        server_process = subprocess.Popen("exec " + server_cmd, shell=True)
+        assert wait_for_server(), "Server did not start up in time."
+
+        # Launch eval requests.
+        results = launch_lm_eval(eval_config)
+
+        # Confirm scores match ground truth.
+        for task in eval_config["tasks"]:
+            for metric in task["metrics"]:
+                ground_truth = metric["value"]
+                measured_value = results["results"][task["name"]][
+                    metric["name"]]
+                print(
+                    f'{task["name"]} | {metric["name"]}: '
+                    f'ground_truth={ground_truth} | measured={measured_value}')
+                assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
+
+    finally:
+        assert server_process is not None
+        server_process.terminate()
+
+        # Make sure the server finishes tearing down.
+        time.sleep(10.)
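Note: the runner script drives this test one config at a time through the LM_EVAL_TEST_DATA_FILE environment variable; the same can be done by hand for a single model (a sketch; run from the repo root on a GPU host with the pinned lm-eval-harness installed):

    LM_EVAL_TEST_DATA_FILE=.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml \
        pytest -s tests/accuracy/test_lm_eval_correctness.py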