Update tests/smoketest.sh to support FSDP + LoRA as a testing path.

Additionally introduce a max_seq_len parameter to support testing on lower-end hardware. Signed-off-by: Oleg S <[email protected]>
instructlab · Nov 7, 2024 · 54a77a4 · 54a77a4
1 parent 32b75f1
commit 54a77a4
Showing 1 changed file with 5 additions and 6 deletions.
diff --git a/tests/smoketest.sh b/tests/smoketest.sh
@@ -2,7 +2,7 @@
 set -eux -o pipefail
 
 # ############### Read-only parameters ############### 
-MODEL_NAME="instructlab/granite-7b-lab"
+MODEL_NAME="/home/ec2-user/.cache/huggingface/hub/models--instructlab--granite-7b-lab/snapshots/4fb6a018d68ab813b95c7f470e424a70f2f7e561"
 # gets directory of current file.
 SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
 CORRECT_WORKING_DIR="${SCRIPT_DIR}/../src/instructlab/training/"
@@ -13,12 +13,13 @@ DATA_DIR="${TMP_DIR}/data"
 COMPUTED_DATA_PATH="${DATA_DIR}/data.jsonl"
 DEFAULT_DISTRIB_FRAMEWORK='fsdp'
 DISTRIB_FRAMEWORK="${1:-$DEFAULT_DISTRIB_FRAMEWORK}" # defaults to FSDP
-DEFAULT_GPUS=8
+DEFAULT_GPUS=4
 NUM_GPUS="${2:-$DEFAULT_GPUS}"
 
 # ############### User-modifiable parameters ############### 
 # Change these as needed
 MAX_BATCH_LEN=60000
+MAX_SEQ_LEN=4096
 NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size.
 
 # ############### Test Functions ############### 
@@ -63,7 +64,7 @@ function prepare_data () {
     python3 data_process.py \
     --data_path="$SAMPLE_DATA_PATH" \
     --data_output_path="$DATA_DIR" \
-    --max_seq_len=4096 \
+    --max_seq_len="${MAX_SEQ_LEN}" \
     --model_name_or_path="$MODEL_NAME"
 
     # trim data so we only keep the first 'n' samples.
@@ -203,17 +204,14 @@ function test_standard_loop_fsdp_lora() {
     --nproc_per_node="${NUM_GPUS}" \
     main_ds.py \
     --model_name_or_path="${MODEL_NAME}" \
-    --is_granite \
     --data_path="${COMPUTED_DATA_PATH}" \
     --output_dir="${CHECKPOINTS_DIR}" \
     --num_epochs=1 \
     --effective_batch_size=128 \
     --save_samples=0 \
     --checkpoint_at_epoch \
-    --accelerate_full_state_at_epoch \
     --distributed_training_framework="${DISTRIB_FRAMEWORK}" \
     --max_batch_len="${MAX_BATCH_LEN}" \
-    --is_granite \
     --lora_r=4 \
     --lora_alpha=32 \
     --lora_dropout=0.1
@@ -235,6 +233,7 @@ function main () {
     test_standard_loop_nongranite
     _cleanup_saved_checkpoints
     test_standard_loop
+    test_standard_loop_fsdp_lora
 }
 
 main