From f5e3344634f19466dba02101da456953cc4a8602 Mon Sep 17 00:00:00 2001 From: Lo-Mein Date: Thu, 31 Oct 2024 12:18:53 -0400 Subject: [PATCH 1/6] Added CHANGELOG.md Signed-off-by: Lo-Mein --- CHANGELOG.md | 213 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..bc0027f1 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,213 @@ +# Changelog + +## v0.5.5 + +### Features +* e2e: replace old small job with new medium job + +### Fixes +* fix: incorrect label for AWS medium runner +* chore: add exit code & tox fix + +### Infrastructure +* ci: grant HF_TOKEN access to the medium-size E2E CI job + +## v0.5.4 + +### Features +* Add rocm extra to pyproject.toml + +## v0.5.3 + +### Fixes +* fix: Add explicit flash_attn requirement for ROCm + +## v0.5.2 - Fix Pretraining Masking + +### Fixes +* fix: improve linting and automation +* Fix pretrain token list->int for masking + +## v0.5.1 + +### Fixes +* fix: updates sorting logic to correctly compare numbers + +## v0.5.0 - FSDP and Full-State Checkpoint Resuming + +### Features +* feat: add e2e test for instructlab CI +* feat: add mergify +* Adding FSDP Support to Training Library by @aldopareja @Maxusmusti @RobotSail +* adds Accelerate full-state (opt, lr_sched, params) +* changes StreamablePopen to return a process and implement listening + +### Fixes +* Fix lint error to make CI happy +* Fix typos +* Ap/fix multipack for non granite models +* Fix generic chat template saved to tokenizer for generation +* Fix linting error and missing quote + +### Infrastructure +* Add license identifiers +* ci: update runner labels to uniquely identify instance sizes +* ci: minor cleanup of E2E job +* Fixing e2e to use relative path for working-directory +* switch -T to -a +* github: add stale bot to training repo +* fix: markdown lint error and mergify bug +* Bump actions/checkout from 4.1.7 to 4.2.0 +* Bump step-security/harden-runner from 2.8.1 to 2.9.1 +* Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 +* Bump actions/setup-python from 5.1.0 to 5.2.0 +* Bump rhysd/actionlint from 1.7.1 to 1.7.2 +* Bump hynek/build-and-inspect-python-package from 2.6.0 to 2.9.0 +* Bump DavidAnson/markdownlint-cli2-action from 16.0.0 to 17.0.0 +* ci: fix lint action +* ci: add AWS tags to show github ref and PR num for all jobs + +## v0.5.0 Alpha 0 - The FSDP Release Pre-release + +### Description +The FSDP Release introduces FSDP support in addition to the existing DeepSpeed support through the accelerate library. 
+ +### Features +* feat: add e2e test for instructlab CI +* feat: add mergify +* Adding FSDP Support to Training Library by @aldopareja @Maxusmusti @RobotSail + +### Fixes +* Fix lint error to make CI happy +* Fix typos +* Ap/fix multipack for non granite models +* Fix linting error and missing quote + +### Infrastructure +* Add license identifiers +* ci: update runner labels to uniquely identify instance sizes +* ci: minor cleanup of E2E job +* Fixing e2e to use relative path for working-directory +* Bump step-security/harden-runner from 2.8.1 to 2.9.1 +* Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 +* Bump actions/setup-python from 5.1.0 to 5.2.0 +* Bump rhysd/actionlint from 1.7.1 to 1.7.2 +* Bump hynek/build-and-inspect-python-package from 2.6.0 to 2.9.0 +* Bump DavidAnson/markdownlint-cli2-action from 16.0.0 to 17.0.0 +* ci: fix lint action +* ci: add AWS tags to show github ref and PR num for all jobs + +## v0.4.2 + +### Features +* Provide safeguards during training + +## v0.4.1 + +### Changes +* makes saving every save_samples an optional feature + +## v0.4.0 + +### Features +* Adds a flag to save checkpoints at the end of an epoch + +### Changes +* Change success message at end of training + +## v0.3.2 + +### Features +* Accept tuples for lora.target_modules + +### Documentation +* patch some hyper parameter arg descriptions in README + +## v0.3.1 + +### Dependencies +* Update requirements to have bitsandbytes min and dolomite min + +## v0.3.0 + +### Features +* Updating token masking to support pretraining w/ masked special tokens +* Adding weight merging for LoRA/QLoRA ckpts + +### Fixes +* remove dead code +* fix: changes the check to check against both the enum option and enum value + +## v0.2.0 + +### Features +* Fix ckpt save to include architecture for inference runtime consumption +* Logging updates + +### Performance +* Reducing deepspeed timeout to 10mins + +## v0.1.0 + +### Features +* Flash Attention Disable Toggle (Take 2) + +### Performance +* Reduce Unnecessary Multiprocessing + +### Fixes +* 🐛: fix optimizer selection logic so that FusedAdam is never loaded when CPU offloading is enabled +* Add wheel to requirements + +## v0.0.5.1 + +### Fixes +This release includes PR [#121](https://github.com/instructlab/training/pull/121) to overcome an issue where our way of lazily importing the run_training function is being picked up as an error by pylint. + +## v0.0.5 +Minor bugfixes and updates. + +## v0.0.4 +Minor bugfixes and updates. + +## v0.0.3 +Minor bugfixes and updates. + +## v0.0.2 + +### Features +This introduces the instructlab library as a package in the instructlab package namespace. + +To install it: +``` +pip install instructlab-training +``` + +And to install it with flash-attn and other CUDA-dependent packages, you can use +``` +pip install instructlab-training[cuda] +``` + +Here's how to use it: +```python +from instructlab.training.config import TorchrunArgs, TrainingArgs, run_training + +torchrun_args = TorchrunArgs( + nproc_per_node = 1, # 1 GPU + nnodes = 1, # only 1 overall machine in the system + node_rank = 0, # rank of the current machine + rdzv_id = 123, # what ID other nodes will join on + rdzv_endpoint = '0.0.0.0:12345' # address where other nodes will join +) + +training_args = TrainingArgs( + # specify training args here +) + +run_training(torch_args = torchrun_args, train_args = training_args) +``` + +## v0.0.1 + +### Features +Initial release with same features as v0.0.2. 
\ No newline at end of file From d18e57234e274eec5d57bf334338f347cb4d95a8 Mon Sep 17 00:00:00 2001 From: Lo-Mein Date: Tue, 5 Nov 2024 12:16:34 -0500 Subject: [PATCH 2/6] Fixed linting errors Signed-off-by: Lo-Mein --- CHANGELOG.md | 101 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 69 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc0027f1..ca1cb720 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,54 +2,64 @@ ## v0.5.5 -### Features +### v0.5.5 Features + * e2e: replace old small job with new medium job -### Fixes +### v0.5.5 Fixes + * fix: incorrect label for AWS medium runner * chore: add exit code & tox fix -### Infrastructure +### v0.5.5 Infrastructure + * ci: grant HF_TOKEN access to the medium-size E2E CI job ## v0.5.4 -### Features +### v0.5.4 Features + * Add rocm extra to pyproject.toml ## v0.5.3 -### Fixes +### v0.5.3 Fixes + * fix: Add explicit flash_attn requirement for ROCm ## v0.5.2 - Fix Pretraining Masking -### Fixes +### v0.5.2 Fixes + * fix: improve linting and automation * Fix pretrain token list->int for masking ## v0.5.1 -### Fixes +### v0.5.1 Fixes + * fix: updates sorting logic to correctly compare numbers ## v0.5.0 - FSDP and Full-State Checkpoint Resuming -### Features +### v0.5.0 Features + * feat: add e2e test for instructlab CI * feat: add mergify * Adding FSDP Support to Training Library by @aldopareja @Maxusmusti @RobotSail * adds Accelerate full-state (opt, lr_sched, params) * changes StreamablePopen to return a process and implement listening -### Fixes +### v0.5.0 Fixes + * Fix lint error to make CI happy * Fix typos * Ap/fix multipack for non granite models * Fix generic chat template saved to tokenizer for generation * Fix linting error and missing quote -### Infrastructure +### v0.5.0 Infrastructure + * Add license identifiers * ci: update runner labels to uniquely identify instance sizes * ci: minor cleanup of E2E job @@ -69,21 +79,25 @@ ## v0.5.0 Alpha 0 - The FSDP Release Pre-release -### Description +### v0.5.0 Alpha Description + The FSDP Release introduces FSDP support in addition to the existing DeepSpeed support through the accelerate library. 
-### Features +### v0.5.0 Alpha Features + * feat: add e2e test for instructlab CI * feat: add mergify * Adding FSDP Support to Training Library by @aldopareja @Maxusmusti @RobotSail -### Fixes +### v0.5.0 Alpha Fixes + * Fix lint error to make CI happy * Fix typos * Ap/fix multipack for non granite models * Fix linting error and missing quote -### Infrastructure +### v0.5.0 Alpha Infrastructure + * Add license identifiers * ci: update runner labels to uniquely identify instance sizes * ci: minor cleanup of E2E job @@ -99,96 +113,118 @@ The FSDP Release introduces FSDP support in addition to the existing DeepSpeed s ## v0.4.2 -### Features +### v0.4.2 Features + * Provide safeguards during training ## v0.4.1 -### Changes +### v0.4.1 Changes + * makes saving every save_samples an optional feature ## v0.4.0 -### Features +### v0.4.0 Features + * Adds a flag to save checkpoints at the end of an epoch -### Changes +### v0.4.0 Changes + * Change success message at end of training ## v0.3.2 -### Features +### v0.3.2 Features + * Accept tuples for lora.target_modules -### Documentation +### v0.3.2 Documentation + * patch some hyper parameter arg descriptions in README ## v0.3.1 -### Dependencies +### v0.3.1 Dependencies + * Update requirements to have bitsandbytes min and dolomite min ## v0.3.0 -### Features +### v0.3.0 Features + * Updating token masking to support pretraining w/ masked special tokens * Adding weight merging for LoRA/QLoRA ckpts -### Fixes +### v0.3.0 Fixes + * remove dead code * fix: changes the check to check against both the enum option and enum value ## v0.2.0 -### Features +### v0.2.0 Features + * Fix ckpt save to include architecture for inference runtime consumption * Logging updates -### Performance +### v0.2.0 Performance + * Reducing deepspeed timeout to 10mins ## v0.1.0 -### Features +### v0.1.0 Features + * Flash Attention Disable Toggle (Take 2) -### Performance +### v0.1.0 Performance + * Reduce Unnecessary Multiprocessing -### Fixes +### v0.1.0 Fixes + * 🐛: fix optimizer selection logic so that FusedAdam is never loaded when CPU offloading is enabled * Add wheel to requirements ## v0.0.5.1 -### Fixes +### v0.0.5.1 Fixes + This release includes PR [#121](https://github.com/instructlab/training/pull/121) to overcome an issue where our way of lazily importing the run_training function is being picked up as an error by pylint. ## v0.0.5 + Minor bugfixes and updates. ## v0.0.4 + Minor bugfixes and updates. ## v0.0.3 + Minor bugfixes and updates. ## v0.0.2 ### Features + This introduces the instructlab library as a package in the instructlab package namespace. To install it: -``` + +```bash pip install instructlab-training ``` And to install it with flash-attn and other CUDA-dependent packages, you can use -``` + +```bash pip install instructlab-training[cuda] ``` Here's how to use it: + ```python from instructlab.training.config import TorchrunArgs, TrainingArgs, run_training @@ -209,5 +245,6 @@ run_training(torch_args = torchrun_args, train_args = training_args) ## v0.0.1 -### Features +### v0.0.1 Features + Initial release with same features as v0.0.2. \ No newline at end of file From 1d00406098d2a817cc49fb684fce322426016286 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 7 Nov 2024 12:47:07 +0000 Subject: [PATCH 3/6] build(deps): Bump pypa/gh-action-pypi-publish from 1.12.0 to 1.12.2 Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.12.0 to 1.12.2. 
- [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/61da13deb5f5124fb1536194f82ed3d9bbc7e8f3...15c56dba361d8335944d31a2ecd17d700fc7bcbc) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/pypi.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index dedc40b8..a8be1c1c 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -77,7 +77,7 @@ jobs: path: dist - name: "Upload to Test PyPI" - uses: pypa/gh-action-pypi-publish@61da13deb5f5124fb1536194f82ed3d9bbc7e8f3 # v1.12.0 + uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc # v1.12.2 with: repository-url: https://test.pypi.org/legacy/ @@ -129,4 +129,4 @@ jobs: rm ./dist/*.sigstore.json - name: "Upload to PyPI" - uses: pypa/gh-action-pypi-publish@61da13deb5f5124fb1536194f82ed3d9bbc7e8f3 # v1.12.0 + uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc # v1.12.2 From ebd3f73014efbab12cfb1ce3ab2e827bc4a944be Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Thu, 7 Nov 2024 10:14:31 -0500 Subject: [PATCH 4/6] enhancement: enhances bash script with proper syntax This commit enhances the existing bash script to encapsulate variables as "${VAR}" so that the bounds are clearly known. For reference, see: https://google.github.io/styleguide/shellguide.html\#variable-expansion Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- tests/smoketest.sh | 64 +++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/tests/smoketest.sh b/tests/smoketest.sh index e104b233..9bdb0df4 100755 --- a/tests/smoketest.sh +++ b/tests/smoketest.sh @@ -12,9 +12,9 @@ CHECKPOINTS_DIR="${TMP_DIR}/checkpoints" DATA_DIR="${TMP_DIR}/data" COMPUTED_DATA_PATH="${DATA_DIR}/data.jsonl" DEFAULT_DISTRIB_FRAMEWORK='fsdp' -DISTRIB_FRAMEWORK="${1:-$DEFAULT_DISTRIB_FRAMEWORK}" # defaults to FSDP +DISTRIB_FRAMEWORK="${1:-${DEFAULT_DISTRIB_FRAMEWORK}}" # defaults to FSDP DEFAULT_GPUS=8 -NUM_GPUS="${2:-$DEFAULT_GPUS}" +NUM_GPUS="${2:-${DEFAULT_GPUS}}" # ############### User-modifiable parameters ############### # Change these as needed @@ -36,8 +36,8 @@ NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size. # None ####################################### function setup_tmpdir () { - mkdir "$CHECKPOINTS_DIR" - mkdir "$DATA_DIR" + mkdir "${CHECKPOINTS_DIR}" + mkdir "${DATA_DIR}" } ####################################### @@ -61,17 +61,17 @@ function prepare_data () { # go faster. python3 data_process.py \ - --data_path="$SAMPLE_DATA_PATH" \ - --data_output_path="$DATA_DIR" \ + --data_path="${SAMPLE_DATA_PATH}" \ + --data_output_path="${DATA_DIR}" \ --max_seq_len=4096 \ - --model_name_or_path="$MODEL_NAME" + --model_name_or_path="${MODEL_NAME}" # trim data so we only keep the first 'n' samples. # should be enough data for training to be meaningful but not enough # that training takes a large amount of time. 
- echo "$(head -"$NUM_SAMPLES_TRAINED_ON" "$COMPUTED_DATA_PATH")" > "$COMPUTED_DATA_PATH" + echo "$(head -"${NUM_SAMPLES_TRAINED_ON}" "${COMPUTED_DATA_PATH}")" > "${COMPUTED_DATA_PATH}" - echo "TRAINING ON $(wc -l "$COMPUTED_DATA_PATH") SAMPLES" + echo "TRAINING ON $(wc -l "${COMPUTED_DATA_PATH}") SAMPLES" } ####################################### @@ -86,9 +86,9 @@ function prepare_data () { # writes location of checkpoints dir to standard out. ####################################### function _cleanup_saved_checkpoints() { - echo "CLEARING CHECKPOINTS: $CHECKPOINTS_DIR" - rm -rf "$CHECKPOINTS_DIR" - mkdir "$CHECKPOINTS_DIR" + echo "CLEARING CHECKPOINTS: ${CHECKPOINTS_DIR}" + rm -rf "${CHECKPOINTS_DIR}" + mkdir "${CHECKPOINTS_DIR}" } ####################################### @@ -109,18 +109,18 @@ function _cleanup_saved_checkpoints() { function test_standard_loop () { torchrun \ --standalone \ - --nproc_per_node="$NUM_GPUS" \ + --nproc_per_node="${NUM_GPUS}" \ main_ds.py \ - --model_name_or_path="$MODEL_NAME" \ - --data_path="$COMPUTED_DATA_PATH" \ - --output_dir="$CHECKPOINTS_DIR" \ + --model_name_or_path="${MODEL_NAME}" \ + --data_path="${COMPUTED_DATA_PATH}" \ + --output_dir="${CHECKPOINTS_DIR}" \ --num_epochs=1 \ --effective_batch_size=128 \ --save_samples=0 \ --checkpoint_at_epoch \ --accelerate_full_state_at_epoch \ - --distributed_training_framework="$DISTRIB_FRAMEWORK" \ - --max_batch_len="$MAX_BATCH_LEN" \ + --distributed_training_framework="${DISTRIB_FRAMEWORK}" \ + --max_batch_len="${MAX_BATCH_LEN}" \ --is_granite } @@ -142,18 +142,18 @@ function test_standard_loop () { function test_standard_loop_nongranite () { torchrun \ --standalone \ - --nproc_per_node="$NUM_GPUS" \ + --nproc_per_node="${NUM_GPUS}" \ main_ds.py \ - --model_name_or_path="$MODEL_NAME" \ - --data_path="$COMPUTED_DATA_PATH" \ - --output_dir="$CHECKPOINTS_DIR" \ + --model_name_or_path="${MODEL_NAME}" \ + --data_path="${COMPUTED_DATA_PATH}" \ + --output_dir="${CHECKPOINTS_DIR}" \ --num_epochs=1 \ --effective_batch_size=128 \ --save_samples=0 \ --checkpoint_at_epoch \ --accelerate_full_state_at_epoch \ - --distributed_training_framework="$DISTRIB_FRAMEWORK" \ - --max_batch_len="$MAX_BATCH_LEN" + --distributed_training_framework="${DISTRIB_FRAMEWORK}" \ + --max_batch_len="${MAX_BATCH_LEN}" # --is_granite \ } @@ -175,18 +175,18 @@ function test_standard_loop_nongranite () { function test_standard_loop_noflashattention_nogranite () { torchrun \ --standalone \ - --nproc_per_node="$NUM_GPUS" \ + --nproc_per_node="${NUM_GPUS}" \ main_ds.py \ - --model_name_or_path="$MODEL_NAME" \ - --data_path="$COMPUTED_DATA_PATH" \ - --output_dir="$CHECKPOINTS_DIR" \ + --model_name_or_path="${MODEL_NAME}" \ + --data_path="${COMPUTED_DATA_PATH}" \ + --output_dir="${CHECKPOINTS_DIR}" \ --num_epochs=1 \ --effective_batch_size=128 \ --save_samples=0 \ --checkpoint_at_epoch \ --accelerate_full_state_at_epoch \ - --distributed_training_framework="$DISTRIB_FRAMEWORK" \ - --max_batch_len="$MAX_BATCH_LEN" \ + --distributed_training_framework="${DISTRIB_FRAMEWORK}" \ + --max_batch_len="${MAX_BATCH_LEN}" \ --disable_flash_attn # --is_granite } @@ -194,11 +194,11 @@ function test_standard_loop_noflashattention_nogranite () { function main () { setup_tmpdir - trap "rm -rf $TMP_DIR" EXIT + trap 'rm -rf ${TMP_DIR}' EXIT #NOTE (jkunstle): script is run as though it's # in the same source dir as main_ds and data_process. 
- cd "$CORRECT_WORKING_DIR" + cd "${CORRECT_WORKING_DIR}" echo "CURRENT WORKING DIRECTORY: $(pwd)" prepare_data From 6ca54fb2f01c49830760db42436a4d95922e6fcd Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 7 Nov 2024 17:33:23 -0500 Subject: [PATCH 5/6] Update default chat template to Granite 3.0 template and update token processing (#319) Handles the new roles and chat template included in Granite 3.0 models * Add new template: In progress Signed-off-by: Mustafa Eyceoz * First pass role token handling Signed-off-by: Mustafa Eyceoz * Quick list concat Signed-off-by: Mustafa Eyceoz * Add pretraining role Signed-off-by: Mustafa Eyceoz * remove TODOs Signed-off-by: Mustafa Eyceoz * More forgiving newline buffer Signed-off-by: Mustafa Eyceoz * Make sure dolomite conversion isn't always attempted Signed-off-by: Mustafa Eyceoz * Add tool response role handling Signed-off-by: Mustafa Eyceoz * Added tool sp token Signed-off-by: Mustafa Eyceoz * Fix sp token retrieval Signed-off-by: Mustafa Eyceoz --------- Signed-off-by: Mustafa Eyceoz --- .../chat_templates/ibm_generic_tmpl.py | 40 ++++++++++----- .../chat_templates/ibm_legacy_tmpl.py | 30 +++++++++++ src/instructlab/training/data_process.py | 51 +++++++++++++++++-- src/instructlab/training/tokenizer_utils.py | 3 ++ src/instructlab/training/utils.py | 6 ++- 5 files changed, 111 insertions(+), 19 deletions(-) create mode 100644 src/instructlab/training/chat_templates/ibm_legacy_tmpl.py diff --git a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py index 5eaf795e..73a21652 100644 --- a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py +++ b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py @@ -4,27 +4,41 @@ from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo SPECIAL_TOKENS = SpecialTokens( - system=TokenInfo("<|system|>", add_to_tokenizer=True), - user=TokenInfo("<|user|>", add_to_tokenizer=True), - assistant=TokenInfo("<|assistant|>", add_to_tokenizer=True), - eos=TokenInfo("<|endoftext|>", add_to_tokenizer=True), - pad=TokenInfo("<|pad|>", add_to_tokenizer=True), - bos=TokenInfo("<|begginingoftext|>", add_to_tokenizer=True), + start_role=TokenInfo("<|start_of_role|>", add_to_tokenizer=True), + end_role=TokenInfo("<|end_of_role|>", add_to_tokenizer=True), + tool=TokenInfo("<|tool_call|>", add_to_tokenizer=True), + eos=TokenInfo("<|end_of_text|>", add_to_tokenizer=True), + bos=TokenInfo("<|end_of_text|>", add_to_tokenizer=True), + pad=TokenInfo("<|end_of_text|>", add_to_tokenizer=True), ) CHAT_TEMPLATE = ( + "{%- if tools %}" + "{{ '<|start_of_role|>available_tools<|end_of_role|>\n' }}" + "{% for tool in tools %}" + "{{ tool | tojson(indent=4) }}" + "{% if not loop.last %}" + "{{- '\n\n' }}" + "{% endif %}" + "{% endfor %}" + "{{ '<|end_of_text|>\n' }}" + "{% endif %}" "{% for message in messages %}" - "{% if message['role'] == 'pretraining' %}" - "{{'<|pretrain|>' + message['content'] + '<|endoftext|>' + '<|/pretrain|>' }}" - "{% elif message['role'] == 'system' %}" - "{{'<|system|>'+ '\n' + message['content'] + '\n'}}" + "{% if message['role'] == 'system' %}" + "{{ '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}" + "{% elif message['role'] == 'pretraining' %}" + "{{ '<|pretrain|>' + message['content'] + '<|end_of_text|>' + '<|/pretrain|>'}}" "{% elif message['role'] == 'user' %}" - "{{'<|user|>' + '\n' + message['content'] + '\n'}}" + "{{ '<|start_of_role|>user<|end_of_role|>' + 
message['content'] + '<|end_of_text|>\n' }}" "{% elif message['role'] == 'assistant' %}" - "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}" + "{{ '<|start_of_role|>assistant<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}" + "{% elif message['role'] == 'assistant_tool_call' %}" + "{{ '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\n' }}" + "{% elif message['role'] == 'tool_response' %}" + "{{ '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}" "{% endif %}" "{% if loop.last and add_generation_prompt %}" - "{{ '<|assistant|>' + '\n' }}" + "{{ '<|start_of_role|>assistant<|end_of_role|>' }}" "{% endif %}" "{% endfor %}" ) diff --git a/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py b/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py new file mode 100644 index 00000000..5eaf795e --- /dev/null +++ b/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: Apache-2.0 + +# First Party +from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo + +SPECIAL_TOKENS = SpecialTokens( + system=TokenInfo("<|system|>", add_to_tokenizer=True), + user=TokenInfo("<|user|>", add_to_tokenizer=True), + assistant=TokenInfo("<|assistant|>", add_to_tokenizer=True), + eos=TokenInfo("<|endoftext|>", add_to_tokenizer=True), + pad=TokenInfo("<|pad|>", add_to_tokenizer=True), + bos=TokenInfo("<|begginingoftext|>", add_to_tokenizer=True), +) + +CHAT_TEMPLATE = ( + "{% for message in messages %}" + "{% if message['role'] == 'pretraining' %}" + "{{'<|pretrain|>' + message['content'] + '<|endoftext|>' + '<|/pretrain|>' }}" + "{% elif message['role'] == 'system' %}" + "{{'<|system|>'+ '\n' + message['content'] + '\n'}}" + "{% elif message['role'] == 'user' %}" + "{{'<|user|>' + '\n' + message['content'] + '\n'}}" + "{% elif message['role'] == 'assistant' %}" + "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}" + "{% endif %}" + "{% if loop.last and add_generation_prompt %}" + "{{ '<|assistant|>' + '\n' }}" + "{% endif %}" + "{% endfor %}" +) diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py index 10214e9d..389a8cb0 100644 --- a/src/instructlab/training/data_process.py +++ b/src/instructlab/training/data_process.py @@ -28,7 +28,11 @@ def check_valid_sample( if len(whole_sentence_tk) >= max_len or len(whole_sentence_tk) < 20: return False # last token should be eos_token - if not eos_tk[0] in (whole_sentence_tk[-1], whole_sentence_tk[-2]): + if not eos_tk[0] in ( + whole_sentence_tk[-1], + whole_sentence_tk[-2], + whole_sentence_tk[-3], + ): return False # NOTE - below checks are no longer strictly required, but we may want to revisit to make sure there's nothing we need to bring back in validity checking @@ -61,6 +65,7 @@ def unmask_message_content( system_tokens, pretrain_token, pretrain_end_token, + tool_resp_tokens=None, ): """ Create labels for tokens in a sequence with special handling for pretraining tokens and role-specific sequences. 
@@ -130,6 +135,10 @@ def find_longest_match(start_idx, sequences): default=None, ) + special_sequences = [user_tokens, assist_tokens, system_tokens] + if tool_resp_tokens: + special_sequences.append(tool_resp_tokens) + in_pretraining = False unmasking = False i = 0 @@ -143,7 +152,7 @@ def find_longest_match(start_idx, sequences): i += 1 continue - match = find_longest_match(i, [user_tokens, assist_tokens, system_tokens]) + match = find_longest_match(i, special_sequences) if match: unmasking = match == assist_tokens i += len(match) @@ -167,8 +176,6 @@ def find_longest_match(start_idx, sequences): ] # Assertions - special_sequences = [user_tokens, assist_tokens, system_tokens] - # 1. No special sequence of tokens should be unmasked for i in range(len(final_sentence_tk)): for seq in special_sequences: @@ -229,10 +236,43 @@ def main(args: DataProcessArgs): CHAT_TEMPLATE, SPECIAL_TOKENS = retrieve_chat_template(args.chat_tmpl_path) tokenizer = setup_tokenizer(args.model_path, SPECIAL_TOKENS, CHAT_TEMPLATE) - system_tk, user_tk, assistant_tk, eos_tk, pad_tk, bos_tk = [ + ( + system_tk, + user_tk, + assistant_tk, + eos_tk, + pad_tk, + bos_tk, + start_role_tk, + end_role_tk, + _, + ) = [ get_sp_token(tokenizer, getattr(SPECIAL_TOKENS, sp).token) for sp in SPECIAL_TOKENS.__annotations__.keys() ] + if start_role_tk and end_role_tk: + system_tk = ( + start_role_tk + + tokenizer.encode("system", add_special_tokens=False) + + end_role_tk + ) + user_tk = ( + start_role_tk + + tokenizer.encode("user", add_special_tokens=False) + + end_role_tk + ) + assistant_tk = ( + start_role_tk + + tokenizer.encode("assistant", add_special_tokens=False) + + end_role_tk + ) + tool_resp_tk = ( + start_role_tk + + tokenizer.encode("tool_response", add_special_tokens=False) + + end_role_tk + ) + else: + tool_resp_tk = None log_rank_0( f"Special tokens: eos: {eos_tk}, pad: {pad_tk}, bos: {bos_tk}, system: {system_tk}, user: {user_tk}, assistant: {assistant_tk}" ) @@ -324,6 +364,7 @@ def main(args: DataProcessArgs): system_tokens=system_tk, pretrain_token=get_sp_token(tokenizer, "<|pretrain|>")[0], pretrain_end_token=get_sp_token(tokenizer, "<|/pretrain|>")[0], + tool_resp_tokens=tool_resp_tk, ) print("\033[92munmasking the appropriate message content...\033[0m") data_with_labels = data_with_input_ids.map( diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py index 45ad4699..f142dec6 100644 --- a/src/instructlab/training/tokenizer_utils.py +++ b/src/instructlab/training/tokenizer_utils.py @@ -25,6 +25,9 @@ class SpecialTokens: eos: TokenInfo = field(default_factory=lambda: TokenInfo("")) pad: TokenInfo = field(default_factory=lambda: TokenInfo("")) bos: TokenInfo = field(default_factory=lambda: TokenInfo("")) + start_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) + end_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) + tool: TokenInfo = field(default_factory=lambda: TokenInfo("")) def get_tokens_to_add(self) -> List[str]: return [ diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py index 41b410c7..b6f655bf 100644 --- a/src/instructlab/training/utils.py +++ b/src/instructlab/training/utils.py @@ -761,7 +761,6 @@ def save_hf_format_accelerate( tokenizer, accelerator: Accelerator, samples_seen, - convert_dolomite=True, is_lora=False, ): log_rank_0( @@ -770,6 +769,11 @@ def save_hf_format_accelerate( ) start = time.time() + if args.model_type in ("gpt_megatron", "gpt_dolomite"): + convert_dolomite = False + else: + 
convert_dolomite = True + final_output_dir = Path(args.output_dir) / "hf_format" / f"samples_{samples_seen}" if args.use_dolomite and convert_dolomite: tmpdir = TemporaryDirectory("w") # pylint: disable=consider-using-with From a50929ffd69a865859d2f339a0f51be270ae2f5c Mon Sep 17 00:00:00 2001 From: Jaideep Rao Date: Thu, 7 Nov 2024 10:55:28 -0500 Subject: [PATCH 6/6] chore: move token classes into chat templates Signed-off-by: Jaideep Rao --- .../chat_templates/ibm_generic_tmpl.py | 2 +- .../chat_templates/ibm_legacy_tmpl.py | 2 +- .../training/chat_templates/mistral_tmpl.py | 2 +- .../training/chat_templates/utils.py | 29 ++++++++++++++++++ src/instructlab/training/tokenizer_utils.py | 30 ------------------- 5 files changed, 32 insertions(+), 33 deletions(-) create mode 100644 src/instructlab/training/chat_templates/utils.py diff --git a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py index 73a21652..1403276b 100644 --- a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py +++ b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # First Party -from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo +from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo SPECIAL_TOKENS = SpecialTokens( start_role=TokenInfo("<|start_of_role|>", add_to_tokenizer=True), diff --git a/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py b/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py index 5eaf795e..0f09468f 100644 --- a/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py +++ b/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # First Party -from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo +from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo SPECIAL_TOKENS = SpecialTokens( system=TokenInfo("<|system|>", add_to_tokenizer=True), diff --git a/src/instructlab/training/chat_templates/mistral_tmpl.py b/src/instructlab/training/chat_templates/mistral_tmpl.py index dda051b3..6c1e8757 100644 --- a/src/instructlab/training/chat_templates/mistral_tmpl.py +++ b/src/instructlab/training/chat_templates/mistral_tmpl.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # First Party -from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo +from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo SPECIAL_TOKENS = SpecialTokens( bos=TokenInfo("", add_to_tokenizer=True), diff --git a/src/instructlab/training/chat_templates/utils.py b/src/instructlab/training/chat_templates/utils.py new file mode 100644 index 00000000..c0e62796 --- /dev/null +++ b/src/instructlab/training/chat_templates/utils.py @@ -0,0 +1,29 @@ +# Standard +from dataclasses import dataclass, field +from typing import List + + +@dataclass +class TokenInfo: + token: str + add_to_tokenizer: bool = False + + +@dataclass +class SpecialTokens: + system: TokenInfo = field(default_factory=lambda: TokenInfo("")) + user: TokenInfo = field(default_factory=lambda: TokenInfo("")) + assistant: TokenInfo = field(default_factory=lambda: TokenInfo("")) + eos: TokenInfo = field(default_factory=lambda: TokenInfo("")) + pad: TokenInfo = field(default_factory=lambda: TokenInfo("")) + bos: TokenInfo = field(default_factory=lambda: TokenInfo("")) + start_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) + 
end_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) + tool: TokenInfo = field(default_factory=lambda: TokenInfo("")) + + def get_tokens_to_add(self) -> List[str]: + return [ + token_info.token + for token_info in self.__dict__.values() + if token_info.add_to_tokenizer and token_info.token + ] diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py index f142dec6..d6c55e7e 100644 --- a/src/instructlab/training/tokenizer_utils.py +++ b/src/instructlab/training/tokenizer_utils.py @@ -1,9 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -# Standard -from dataclasses import dataclass, field -from typing import List - # Third Party from transformers import AutoTokenizer, PreTrainedTokenizer @@ -11,32 +7,6 @@ from instructlab.training.utils import log_rank_0 -@dataclass -class TokenInfo: - token: str - add_to_tokenizer: bool = False - - -@dataclass -class SpecialTokens: - system: TokenInfo = field(default_factory=lambda: TokenInfo("")) - user: TokenInfo = field(default_factory=lambda: TokenInfo("")) - assistant: TokenInfo = field(default_factory=lambda: TokenInfo("")) - eos: TokenInfo = field(default_factory=lambda: TokenInfo("")) - pad: TokenInfo = field(default_factory=lambda: TokenInfo("")) - bos: TokenInfo = field(default_factory=lambda: TokenInfo("")) - start_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) - end_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) - tool: TokenInfo = field(default_factory=lambda: TokenInfo("")) - - def get_tokens_to_add(self) -> List[str]: - return [ - token_info.token - for token_info in self.__dict__.values() - if token_info.add_to_tokenizer and token_info.token - ] - - def setup_tokenizer( model_name_or_path, SPECIAL_TOKENS, CHAT_TEMPLATE ) -> PreTrainedTokenizer:
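
For reference, a minimal usage sketch of the token classes relocated by PATCH 6/6, combined with the Granite 3.0 special tokens introduced in PATCH 5/6. It assumes the full series above is applied and the `instructlab-training` package is importable; the snippet is illustrative only and is not part of any patched file.

```python
# Sketch: declaring Granite 3.0-style special tokens with the dataclasses that
# PATCH 6/6 moves into instructlab.training.chat_templates.utils.
from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo

SPECIAL_TOKENS = SpecialTokens(
    start_role=TokenInfo("<|start_of_role|>", add_to_tokenizer=True),
    end_role=TokenInfo("<|end_of_role|>", add_to_tokenizer=True),
    tool=TokenInfo("<|tool_call|>", add_to_tokenizer=True),
    eos=TokenInfo("<|end_of_text|>", add_to_tokenizer=True),
    bos=TokenInfo("<|end_of_text|>", add_to_tokenizer=True),
    pad=TokenInfo("<|end_of_text|>", add_to_tokenizer=True),
)

# get_tokens_to_add() returns only the tokens flagged with add_to_tokenizer=True,
# in dataclass field order (system, user, assistant, eos, pad, bos, start_role,
# end_role, tool); repeated values such as the shared "<|end_of_text|>" token are
# not deduplicated by this helper.
print(SPECIAL_TOKENS.get_tokens_to_add())
```

After this series, chat template modules such as `ibm_generic_tmpl.py` import these classes from `chat_templates.utils` rather than from `tokenizer_utils`, so a template file only needs to define its `SPECIAL_TOKENS` and `CHAT_TEMPLATE` constants without pulling in the tokenizer setup code.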