diff --git a/.github/lm-eval-configs/full-large-models.txt b/.github/lm-eval-configs/full-large-models.txt
new file mode 100644
index 0000000000000..55c913a282bc8
--- /dev/null
+++ b/.github/lm-eval-configs/full-large-models.txt
@@ -0,0 +1,9 @@
+Meta-Llama-3-70B-Instruct-FP8.yaml
+Meta-Llama-3-70B-Instruct.yaml
+Mixtral-8x22B-Instruct-v0.1-FP8.yaml
+Mixtral-8x22B-Instruct-v0.1.yaml
+Mixtral-8x7B-Instruct-v0.1-FP8.yaml
+Mixtral-8x7B-Instruct-v0.1.yaml
+Qwen2-57B-A14B-Instruct.yaml
+Qwen2-72B-Instruct.yaml
+Phi-3-medium-4k-instruct.yaml
diff --git a/.github/lm-eval-configs/full-small-models.txt b/.github/lm-eval-configs/full-small-models.txt
new file mode 100644
index 0000000000000..caca502f76d04
--- /dev/null
+++ b/.github/lm-eval-configs/full-small-models.txt
@@ -0,0 +1,7 @@
+gemma-7b-it.yaml
+Meta-Llama-3-8B-Instruct-FP8-KV.yaml
+Meta-Llama-3-8B-Instruct-FP8.yaml
+Meta-Llama-3-8B-Instruct-W4A16.yaml
+Meta-Llama-3-8B-Instruct.yaml
+Mistral-7B-Instruct-v0.3.yaml
+Qwen2-7B-Instruct.yaml
diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct-FP8.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct-FP8.yaml
new file mode 100644
index 0000000000000..2ef7b975b8bc9
--- /dev/null
+++ b/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct-FP8.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
+model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.900
+  - name: "exact_match,flexible-extract"
+    value: 0.900
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct.yaml
new file mode 100644
index 0000000000000..70f1030fa0007
--- /dev/null
+++ b/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
+model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.888
+  - name: "exact_match,flexible-extract"
+    value: 0.888
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8-KV.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8-KV.yaml
new file mode 100644
index 0000000000000..1c46cda9da11a
--- /dev/null
+++ b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8-KV.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV -b 32 -l 250 -f 5 -t 1
+model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.764
+  - name: "exact_match,flexible-extract"
+    value: 0.764
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8.yaml
new file mode 100644
index 0000000000000..0d077dc19d95a
--- /dev/null
+++ b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
+model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.744
+  - name: "exact_match,flexible-extract"
+    value: 0.740
+limit: 250
+num_fewshot: 5
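Note: the first-line comment in each per-model config records the exact command used to produce its reference scores, and the *.txt list files above are consumed one entry at a time by the nm-run-lm-eval-vllm.sh runner added later in this diff. A rough sketch of that lookup, assuming the nm-vllm checkout as the working directory (illustration only; the real loop lives in .github/scripts/nm-run-lm-eval-vllm.sh below):

    while read -r MODEL_CONFIG; do
        echo "would evaluate: $PWD/.github/lm-eval-configs/models/${MODEL_CONFIG}"
    done < .github/lm-eval-configs/full-small-models.txt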
diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-W4A16.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-W4A16.yaml
new file mode 100644
index 0000000000000..92d07ad0c734a
--- /dev/null
+++ b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-W4A16.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ -b 32 -l 250 -f 5
+model_name: "TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.684
+  - name: "exact_match,flexible-extract"
+    value: 0.688
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml
new file mode 100644
index 0000000000000..d7abd6b36bfc6
--- /dev/null
+++ b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
+model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.74
+  - name: "exact_match,flexible-extract"
+    value: 0.74
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Mistral-7B-Instruct-v0.3.yaml b/.github/lm-eval-configs/models/Mistral-7B-Instruct-v0.3.yaml
new file mode 100644
index 0000000000000..592652eed999e
--- /dev/null
+++ b/.github/lm-eval-configs/models/Mistral-7B-Instruct-v0.3.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mistral-7B-Instruct-v0.3 -b 32 -l 250 -f 5
+model_name: "mistralai/Mistral-7B-Instruct-v0.3"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.524
+  - name: "exact_match,flexible-extract"
+    value: 0.524
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1-FP8.yaml b/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1-FP8.yaml
new file mode 100644
index 0000000000000..8d1eaecf5bec6
--- /dev/null
+++ b/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1-FP8.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x22B-Instruct-v0.1 -b 32 -l 250 -f 5
+model_name: "mistralai/Mixtral-8x22B-Instruct-v0.1"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.840
+  - name: "exact_match,flexible-extract"
+    value: 0.844
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1.yaml b/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1.yaml
new file mode 100644
index 0000000000000..73f00b16c51aa
--- /dev/null
+++ b/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x22B-Instruct-v0.1 -b 32 -l 250 -f 5
+model_name: "mistralai/Mixtral-8x22B-Instruct-v0.1"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.876
+  - name: "exact_match,flexible-extract"
+    value: 0.880
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1-FP8.yaml b/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
new file mode 100644
index 0000000000000..e3f30baf316be
--- /dev/null
+++ b/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
+model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.616
+  - name: "exact_match,flexible-extract"
+    value: 0.620
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1.yaml b/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1.yaml
new file mode 100644
index 0000000000000..629e3721fdf44
--- /dev/null
+++ b/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1.yaml
@@ -0,0 +1,11 @@
+# bash ./nm-run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b 32 -l 250 -f 5 -t 4
+model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.616
+  - name: "exact_match,flexible-extract"
+    value: 0.628
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Phi-3-medium-4k-instruct.yaml b/.github/lm-eval-configs/models/Phi-3-medium-4k-instruct.yaml
new file mode 100644
index 0000000000000..443db66c9adc6
--- /dev/null
+++ b/.github/lm-eval-configs/models/Phi-3-medium-4k-instruct.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m microsoft/Phi-3-medium-4k-instruct -b 16 -l 250 -f 5
+model_name: "microsoft/Phi-3-medium-4k-instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.840
+  - name: "exact_match,flexible-extract"
+    value: 0.852
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Qwen2-57B-A14B-Instruct.yaml b/.github/lm-eval-configs/models/Qwen2-57B-A14B-Instruct.yaml
new file mode 100644
index 0000000000000..a46aa16f0bcd4
--- /dev/null
+++ b/.github/lm-eval-configs/models/Qwen2-57B-A14B-Instruct.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b 32 -l 250 -f 5
+model_name: "Qwen/Qwen2-57B-A14B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.736
+  - name: "exact_match,flexible-extract"
+    value: 0.800
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Qwen2-72B-Instruct.yaml b/.github/lm-eval-configs/models/Qwen2-72B-Instruct.yaml
new file mode 100644
index 0000000000000..fe5a2c0af1e4a
--- /dev/null
+++ b/.github/lm-eval-configs/models/Qwen2-72B-Instruct.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m Qwen/Qwen2-72B-Instruct -b 16 -l 250 -f 5
+model_name: "Qwen/Qwen2-72B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.828
+  - name: "exact_match,flexible-extract"
+    value: 0.856
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/Qwen2-7B-Instruct.yaml b/.github/lm-eval-configs/models/Qwen2-7B-Instruct.yaml
new file mode 100644
index 0000000000000..5bf60816dac8f
--- /dev/null
+++ b/.github/lm-eval-configs/models/Qwen2-7B-Instruct.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m Qwen/Qwen2-7B-Instruct -b 32 -l 250 -f 5
+model_name: "Qwen/Qwen2-7B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.680
+  - name: "exact_match,flexible-extract"
+    value: 0.756
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/models/gemma-7b-it.yaml b/.github/lm-eval-configs/models/gemma-7b-it.yaml
new file mode 100644
index 0000000000000..0b3813d240add
--- /dev/null
+++ b/.github/lm-eval-configs/models/gemma-7b-it.yaml
@@ -0,0 +1,11 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m google/gemma-7b-it -b 16 -l 250 -f 5
+model_name: "google/gemma-7b-it"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.284
+  - name: "exact_match,flexible-extract"
+    value: 0.324
+limit: 250
+num_fewshot: 5
diff --git a/.github/lm-eval-configs/smoke-large-models.txt b/.github/lm-eval-configs/smoke-large-models.txt
new file mode 100644
index 0000000000000..127ec5d97bcff
--- /dev/null
+++ b/.github/lm-eval-configs/smoke-large-models.txt
@@ -0,0 +1,2 @@
+Meta-Llama-3-70B-Instruct.yaml
+Mixtral-8x7B-Instruct-v0.1.yaml
diff --git a/.github/lm-eval-configs/smoke-small-models.txt b/.github/lm-eval-configs/smoke-small-models.txt
new file mode 100644
index 0000000000000..d884f36672a74
--- /dev/null
+++ b/.github/lm-eval-configs/smoke-small-models.txt
@@ -0,0 +1 @@
+Meta-Llama-3-8B-Instruct.yaml
diff --git a/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh b/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh
index 76f7100af5949..fdb8ec5393b36 100755
--- a/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh
+++ b/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh
@@ -14,13 +14,12 @@ usage() {
   echo
   echo "  -m    - huggingface stub or local directory of the model"
   echo "  -b    - batch size to run the evaluation at"
-  echo "  -d    - device to use (e.g. cuda, cuda:0, auto, cpu)"
   echo "  -l    - limit number of samples to run"
   echo "  -f    - number of fewshot samples to use"
   echo
 }
 
-while getopts "m:b:d:l:f:" OPT; do
+while getopts "m:b:l:f:" OPT; do
   case ${OPT} in
     m )
       MODEL="$OPTARG"
@@ -28,9 +27,6 @@ while getopts "m:b:d:l:f:" OPT; do
     b )
       BATCH_SIZE="$OPTARG"
       ;;
-    d )
-      DEVICE="$OPTARG"
-      ;;
     l )
       LIMIT="$OPTARG"
       ;;
@@ -45,6 +41,6 @@ while getopts "m:b:d:l:f:" OPT; do
 done
 
 lm_eval --model hf \
-  --model_args pretrained=$MODEL \
+  --model_args pretrained=$MODEL,parallelize=True \
   --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
-  --batch_size $BATCH_SIZE --device $DEVICE
+  --batch_size $BATCH_SIZE
diff --git a/.github/scripts/nm-run-lm-eval-gsm-vllm-baseline.sh b/.github/scripts/nm-run-lm-eval-gsm-vllm-baseline.sh
new file mode 100644
index 0000000000000..d6b38752945ce
--- /dev/null
+++ b/.github/scripts/nm-run-lm-eval-gsm-vllm-baseline.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on GSM for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+
+usage() {
+  echo
+  echo "Runs lm eval harness on GSM8k using vllm."
+  echo "This pathway is intended to be used to create baselines for "
+  echo "our automated nm-test-accuracy workflow"
+  echo
+  echo "usage: ${0} "
+  echo
+  echo "  -m    - huggingface stub or local directory of the model"
+  echo "  -b    - batch size to run the evaluation at"
+  echo "  -l    - limit number of samples to run"
+  echo "  -f    - number of fewshot samples to use"
+  echo "  -t    - tensor parallel size to run at"
+  echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+  case ${OPT} in
+    m )
+      MODEL="$OPTARG"
+      ;;
+    b )
+      BATCH_SIZE="$OPTARG"
+      ;;
+    l )
+      LIMIT="$OPTARG"
+      ;;
+    f )
+      FEWSHOT="$OPTARG"
+      ;;
+    t )
+      TP_SIZE="$OPTARG"
+      ;;
+    \? )
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+lm_eval --model vllm \
+  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE \
+  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
+  --batch_size $BATCH_SIZE
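Note: the new -t flag is passed through to vllm as tensor_parallel_size, so baselines for larger FP8 models can be measured on multi-GPU runners. For example, the command recorded in Mixtral-8x7B-Instruct-v0.1.yaml above, run from .github/scripts/ (shown here only as a usage sketch; any other model stub and sizes would follow the same pattern):

    bash ./nm-run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b 32 -l 250 -f 5 -t 4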
diff --git a/.github/scripts/nm-run-lm-eval-vllm.sh b/.github/scripts/nm-run-lm-eval-vllm.sh
index c68d6d1d7697f..d0702a086d911 100755
--- a/.github/scripts/nm-run-lm-eval-vllm.sh
+++ b/.github/scripts/nm-run-lm-eval-vllm.sh
@@ -7,15 +7,19 @@ usage() {
   echo``
   echo "Runs lm eval harness on GSM8k using vllm server and compares to "
-  echo "precomputed baseline (measured by HF transformers."
+  echo "precomputed baseline (measured by HF transformers.)"
+  echo
+  echo "This script should be run from the /nm-vllm directory"
   echo
   echo "usage: ${0} "
   echo
-  echo "  -c    - path to the test data config (e.g. neuralmagic/lm-eval/YOUR_CONFIG.yaml)"
+  echo "  -c    - path to the test data config (e.g. .github/lm-eval-configs/smoke-small-models.txt)"
   echo
 }
 
-while getopts "c:" OPT; do
+SUCCESS=0
+
+while getopts "c:t:" OPT; do
   case ${OPT} in
     c )
       CONFIG="$OPTARG"
@@ -27,4 +31,30 @@
   esac
 done
 
-LM_EVAL_TEST_DATA_FILE=$CONFIG pytest -v tests/accuracy/test_lm_eval_correctness.py
+# Parse list of configs.
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
+
+for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
+do
+    LOCAL_SUCCESS=0
+
+    echo "=== RUNNING MODEL: $MODEL_CONFIG ==="
+
+    MODEL_CONFIG_PATH=$PWD/.github/lm-eval-configs/models/${MODEL_CONFIG}
+    LM_EVAL_TEST_DATA_FILE=$MODEL_CONFIG_PATH pytest -s tests/accuracy/test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
+
+    if [[ $LOCAL_SUCCESS == 0 ]]; then
+        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
+    else
+        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
+    fi
+
+    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+
+done
+
+if [ "${SUCCESS}" -eq "0" ]; then
+    exit 0
+else
+    exit 1
+fi
diff --git a/.github/workflows/nm-build-test.yml b/.github/workflows/nm-build-test.yml
index febf6b444582a..9a5043308a067 100644
--- a/.github/workflows/nm-build-test.yml
+++ b/.github/workflows/nm-build-test.yml
@@ -85,7 +85,7 @@ on:
         type: string
         default: "60"
       lm_eval_configuration:
-        description: "configuration for lm-eval test (see neuralmagic/lm-eval)"
+        description: "configuration for lm-eval test (see .github/lm-eval-configs)"
        type: string
        default: ""
diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml
index 86201939d359a..8a9ee1a6b9cb7 100644
--- a/.github/workflows/nm-nightly.yml
+++ b/.github/workflows/nm-nightly.yml
@@ -45,6 +45,6 @@ jobs:
       push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"
 
       lm_eval_label: gcp-k8s-l4-solo
-      lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml
+      lm_eval_configuration: ./.github/lm-eval-configs/smoke-small-models.txt
       lm_eval_timeout: 60
     secrets: inherit
diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml
index 5bbd760b63e0b..41398f0625d16 100644
--- a/.github/workflows/nm-remote-push.yml
+++ b/.github/workflows/nm-remote-push.yml
@@ -30,6 +30,6 @@ jobs:
       benchmark_timeout: 480
 
       lm_eval_label: gcp-k8s-l4-solo
-      lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml
+      lm_eval_configuration: ./.github/lm-eval-configs/smoke-small-models.txt
       lm_eval_timeout: 60
     secrets: inherit
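Note: with the list files, per-model configs, and runner in place, the whole accuracy flow can be exercised locally from the root of the nm-vllm checkout (a sketch; assumes a GPU host with nm-vllm and the pinned lm-eval-harness installed):

    # Runs every model in the smoke list through the pytest harness below.
    ./.github/scripts/nm-run-lm-eval-vllm.sh -c .github/lm-eval-configs/smoke-small-models.txt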
diff --git a/neuralmagic/lm-eval/full-small-models.yaml b/neuralmagic/lm-eval/full-small-models.yaml
deleted file mode 100644
index adb00ba65c1f1..0000000000000
--- a/neuralmagic/lm-eval/full-small-models.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-# ./nm-run-lm-eval-gsm-hf-baseline -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -d cuda -l 250 -f 5
-- model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
-  tasks:
-  - name: "gsm8k"
-    metrics:
-    - name: "exact_match,strict-match"
-      value: 0.74
-    - name: "exact_match,flexible-extract"
-      value: 0.74
-  limit: 250
-  num_fewshot: 5
diff --git a/neuralmagic/lm-eval/smoke-small-models.yaml b/neuralmagic/lm-eval/smoke-small-models.yaml
deleted file mode 100644
index 546a221872af8..0000000000000
--- a/neuralmagic/lm-eval/smoke-small-models.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -d cuda -l 250 -f 5
-- model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
-  tasks:
-  - name: "gsm8k"
-    metrics:
-    - name: "exact_match,strict-match"
-      value: 0.74
-    - name: "exact_match,flexible-extract"
-      value: 0.74
-  limit: 250
-  num_fewshot: 5
diff --git a/tests/accuracy/test_lm_eval_correctness.py b/tests/accuracy/test_lm_eval_correctness.py
index 581e56352064b..de975b2785ffd 100644
--- a/tests/accuracy/test_lm_eval_correctness.py
+++ b/tests/accuracy/test_lm_eval_correctness.py
@@ -1,15 +1,14 @@
-# mypy: ignore-errors
-# TODO (robertgshaw2-neuralmagic): clean this up
 import os
+import subprocess
+import time
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, TypedDict
+from typing import TYPE_CHECKING
 
 import numpy
 import pytest
-import torch
+import requests
 import yaml
 
-from tests.nm_utils.server import ServerContext
 from tests.nm_utils.utils_skip import should_skip_test_group
 
 if should_skip_test_group(group_name="TEST_ACCURACY"):
@@ -24,97 +23,93 @@
 lm_eval: "lm_eval_t" = pytest.importorskip("lm_eval", reason="lm_eval required")
 
+RTOL = 0.02
+TEST_DATA_FILE = os.environ.get(
+    "LM_EVAL_TEST_DATA_FILE",
+    ".github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml")
+
 
-class Metric(TypedDict):
-    name: str
-    value: float
 
+def wait_for_server(timeout=900) -> bool:
 
-class Task(TypedDict):
-    name: str
-    metrics: List[Metric]
+    def try_connection() -> bool:
+        try:
+            r = requests.get("http://localhost:8000/health")
+            return r.status_code == 200
+        except Exception as _:
+            return False
+
+    timeout_part = 15  # retry every 15 seconds
+    time_waited = 0
+    while time_waited <= timeout:
+        time.sleep(timeout_part)
+        if try_connection():
+            return True
+        time_waited = time_waited + timeout_part
 
-# to support python3.8 typing prior to adding `Required`/`NotRequired`, this
-# class stores the optional keys and the `EvalTaskDefinition` subclass inherits
-# those alongside the required keys it defines.
-class EvalTaskDefinitionOpts(TypedDict, total=False):
-    enable_tensor_parallel: bool
-    extra_args: Dict[str, Any]
-    rtol: float
+    return False
 
-class EvalTaskDefinition(EvalTaskDefinitionOpts):
-    model_name: str
-    tasks: List[Task]
-
-
-TEST_DATA_FILE = os.environ.get("LM_EVAL_TEST_DATA_FILE", None)
-if TEST_DATA_FILE is None:
-    raise ValueError("LM_EVAL_TEST_DATA_FILE env variable is not set.")
-TEST_DATA_FILE = Path(TEST_DATA_FILE)
-
-TEST_DATA: List[EvalTaskDefinition] = [
-    pytest.param(eval_def, id=eval_def["model_name"])
-    for eval_def in yaml.safe_load(TEST_DATA_FILE.read_text(encoding="utf-8"))
-]
-DEFAULT_RTOL = 0.05
-
-
-@pytest.mark.parametrize("eval_data", TEST_DATA)
-def test_lm_eval_correctness(
-    eval_data: EvalTaskDefinition,
-    monkeypatch: pytest.MonkeyPatch,
-):
-    monkeypatch.setenv("TOKENIZERS_PARALLELISM", "false")
-    monkeypatch.setenv("OPENAI_API_KEY", "dummy")
-
-    model_name = eval_data["model_name"]
-    vllm_args = {
-        "--model": model_name,
-        "--disable-log-requests": None,
-        "--max-model-len": 4096,
-    }
-
-    if eval_data.get("enable_tensor_parallel") is True:
-        tp = torch.cuda.device_count()
-        vllm_args["--tensor-parallel-size"] = tp
-
-    if extra_args := eval_data.get("extra_args"):
-        vllm_args.update(extra_args)
-
+def launch_lm_eval(eval_config):
+    os.environ["OPENAI_API_KEY"] = "dummy"
     openai_args = ",".join([
-        f"model={model_name}",
+        f"model={eval_config['model_name']}",
         "tokenizer_backend=huggingface",
         "base_url=http://localhost:8000/v1",
     ])
 
-    with ServerContext(vllm_args) as _:
-        task_names = [task["name"] for task in eval_data["tasks"]]
-        limit = eval_data["limit"]
-        new_fewshot = eval_data["num_fewshot"]
-        results = lm_eval.simple_evaluate(
-            model="local-completions",
-            model_args=openai_args,
-            tasks=task_names,
-            batch_size=32,
-            num_fewshot=new_fewshot,
-            limit=limit,
-        )
-
-    lm_eval.models.utils.clear_torch_cache()
-
-    rtol = eval_data.get("rtol", DEFAULT_RTOL)
-    for task in eval_data["tasks"]:
-        for metric in task["metrics"]:
-            ground_truth = metric["value"]
-            measured_value = results["results"][task["name"]][metric["name"]]
-            print(
-                "%s %s:\nground_truth=%s measured_value=%s",
-                task["name"],
-                metric["name"],
-                ground_truth,
-                measured_value,
-            )
-
-            assert numpy.isclose(ground_truth, measured_value, rtol=rtol)
+    results = lm_eval.simple_evaluate(
+        model="local-completions",
+        model_args=openai_args,
+        tasks=[task["name"] for task in eval_config["tasks"]],
+        batch_size=32,
+        num_fewshot=eval_config["num_fewshot"],
+        limit=eval_config["limit"],
+    )
+
+    return results
+
+
+def test_lm_eval_correctness(num_gpus_available):
+    eval_config = yaml.safe_load(
+        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
+
+    # Setup server launch.
+    server_args = {
+        "model": eval_config["model_name"],
+        "max-model-len": 4096,
+        "tensor-parallel-size": num_gpus_available,
+        # TODO (@robertgshaw2): understand why default (mp) does not
+        # shut down cleanly (it works, but not clean).
+        "distributed-executor-backend": "ray",
+        "disable-log-requests": "",
+    }
+
+    server_cmd = "python3 -m vllm.entrypoints.openai.api_server " + \
+        " ".join([f"--{k} {v}"
+                  for k, v in server_args.items()])
+
+    try:
+        # Launch server.
+        server_process = subprocess.Popen("exec " + server_cmd, shell=True)
+        assert wait_for_server(), "Server did not start up in time."
+
+        # Launch eval requests.
+        results = launch_lm_eval(eval_config)
+
+        # Confirm scores match ground truth.
+        for task in eval_config["tasks"]:
+            for metric in task["metrics"]:
+                ground_truth = metric["value"]
+                measured_value = results["results"][task["name"]][
+                    metric["name"]]
+                print(
+                    f'{task["name"]} | {metric["name"]}: '
+                    f'ground_truth={ground_truth} | measured={measured_value}')
+                assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
+
+    finally:
+        assert server_process is not None
+        server_process.terminate()
+
+        # Make sure the server finishes tearing down.
+        time.sleep(10.)
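Note: the runner script drives this test one config at a time through the LM_EVAL_TEST_DATA_FILE environment variable; the same can be done by hand for a single model (a sketch; run from the repo root on a GPU host with the pinned lm-eval-harness installed):

    LM_EVAL_TEST_DATA_FILE=.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml \
        pytest -s tests/accuracy/test_lm_eval_correctness.py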