Skip to content

Commit

Permalink
Update tests/smoketest.sh to support FSDP + LoRA as a testing path.
Browse files Browse the repository at this point in the history
Additionally introduce a max_seq_len parameter to support testing
on lower-end hardware.

Signed-off-by: Oleg S <[email protected]>
  • Loading branch information
RobotSail committed Nov 7, 2024
1 parent 32b75f1 commit 54a77a4
Showing 1 changed file with 5 additions and 6 deletions.
11 changes: 5 additions & 6 deletions tests/smoketest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
set -eux -o pipefail

# ############### Read-only parameters ###############
MODEL_NAME="instructlab/granite-7b-lab"
MODEL_NAME="/home/ec2-user/.cache/huggingface/hub/models--instructlab--granite-7b-lab/snapshots/4fb6a018d68ab813b95c7f470e424a70f2f7e561"
# gets directory of current file.
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
CORRECT_WORKING_DIR="${SCRIPT_DIR}/../src/instructlab/training/"
Expand All @@ -13,12 +13,13 @@ DATA_DIR="${TMP_DIR}/data"
COMPUTED_DATA_PATH="${DATA_DIR}/data.jsonl"
DEFAULT_DISTRIB_FRAMEWORK='fsdp'
DISTRIB_FRAMEWORK="${1:-$DEFAULT_DISTRIB_FRAMEWORK}" # defaults to FSDP
DEFAULT_GPUS=8
DEFAULT_GPUS=4
NUM_GPUS="${2:-$DEFAULT_GPUS}"

# ############### User-modifiable parameters ###############
# Change these as needed
MAX_BATCH_LEN=60000
MAX_SEQ_LEN=4096
NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size.

# ############### Test Functions ###############
Expand Down Expand Up @@ -63,7 +64,7 @@ function prepare_data () {
python3 data_process.py \
--data_path="$SAMPLE_DATA_PATH" \
--data_output_path="$DATA_DIR" \
--max_seq_len=4096 \
--max_seq_len="${MAX_SEQ_LEN}" \
--model_name_or_path="$MODEL_NAME"

# trim data so we only keep the first 'n' samples.
Expand Down Expand Up @@ -203,17 +204,14 @@ function test_standard_loop_fsdp_lora() {
--nproc_per_node="${NUM_GPUS}" \
main_ds.py \
--model_name_or_path="${MODEL_NAME}" \
--is_granite \
--data_path="${COMPUTED_DATA_PATH}" \
--output_dir="${CHECKPOINTS_DIR}" \
--num_epochs=1 \
--effective_batch_size=128 \
--save_samples=0 \
--checkpoint_at_epoch \
--accelerate_full_state_at_epoch \
--distributed_training_framework="${DISTRIB_FRAMEWORK}" \
--max_batch_len="${MAX_BATCH_LEN}" \
--is_granite \
--lora_r=4 \
--lora_alpha=32 \
--lora_dropout=0.1
Expand All @@ -235,6 +233,7 @@ function main () {
test_standard_loop_nongranite
_cleanup_saved_checkpoints
test_standard_loop
test_standard_loop_fsdp_lora
}

main

0 comments on commit 54a77a4

Please sign in to comment.