From f5e3344634f19466dba02101da456953cc4a8602 Mon Sep 17 00:00:00 2001 From: Lo-Mein Date: Thu, 31 Oct 2024 12:18:53 -0400 Subject: [PATCH 1/6] Added CHANGELOG.md Signed-off-by: Lo-Mein --- CHANGELOG.md | 213 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..bc0027f1 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,213 @@ +# Changelog + +## v0.5.5 + +### Features +* e2e: replace old small job with new medium job + +### Fixes +* fix: incorrect label for AWS medium runner +* chore: add exit code & tox fix + +### Infrastructure +* ci: grant HF_TOKEN access to the medium-size E2E CI job + +## v0.5.4 + +### Features +* Add rocm extra to pyproject.toml + +## v0.5.3 + +### Fixes +* fix: Add explicit flash_attn requirement for ROCm + +## v0.5.2 - Fix Pretraining Masking + +### Fixes +* fix: improve linting and automation +* Fix pretrain token list->int for masking + +## v0.5.1 + +### Fixes +* fix: updates sorting logic to correctly compare numbers + +## v0.5.0 - FSDP and Full-State Checkpoint Resuming + +### Features +* feat: add e2e test for instructlab CI +* feat: add mergify +* Adding FSDP Support to Training Library by @aldopareja @Maxusmusti @RobotSail +* adds Accelerate full-state (opt, lr_sched, params) +* changes StreamablePopen to return a process and implement listening + +### Fixes +* Fix lint error to make CI happy +* Fix typos +* Ap/fix multipack for non granite models +* Fix generic chat template saved to tokenizer for generation +* Fix linting error and missing quote + +### Infrastructure +* Add license identifiers +* ci: update runner labels to uniquely identify instance sizes +* ci: minor cleanup of E2E job +* Fixing e2e to use relative path for working-directory +* switch -T to -a +* github: add stale bot to training repo +* fix: markdown lint error and mergify bug +* Bump actions/checkout from 4.1.7 to 4.2.0 +* Bump step-security/harden-runner from 2.8.1 to 2.9.1 +* Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 +* Bump actions/setup-python from 5.1.0 to 5.2.0 +* Bump rhysd/actionlint from 1.7.1 to 1.7.2 +* Bump hynek/build-and-inspect-python-package from 2.6.0 to 2.9.0 +* Bump DavidAnson/markdownlint-cli2-action from 16.0.0 to 17.0.0 +* ci: fix lint action +* ci: add AWS tags to show github ref and PR num for all jobs + +## v0.5.0 Alpha 0 - The FSDP Release Pre-release + +### Description +The FSDP Release introduces FSDP support in addition to the existing DeepSpeed support through the accelerate library. 
+ +### Features +* feat: add e2e test for instructlab CI +* feat: add mergify +* Adding FSDP Support to Training Library by @aldopareja @Maxusmusti @RobotSail + +### Fixes +* Fix lint error to make CI happy +* Fix typos +* Ap/fix multipack for non granite models +* Fix linting error and missing quote + +### Infrastructure +* Add license identifiers +* ci: update runner labels to uniquely identify instance sizes +* ci: minor cleanup of E2E job +* Fixing e2e to use relative path for working-directory +* Bump step-security/harden-runner from 2.8.1 to 2.9.1 +* Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 +* Bump actions/setup-python from 5.1.0 to 5.2.0 +* Bump rhysd/actionlint from 1.7.1 to 1.7.2 +* Bump hynek/build-and-inspect-python-package from 2.6.0 to 2.9.0 +* Bump DavidAnson/markdownlint-cli2-action from 16.0.0 to 17.0.0 +* ci: fix lint action +* ci: add AWS tags to show github ref and PR num for all jobs + +## v0.4.2 + +### Features +* Provide safeguards during training + +## v0.4.1 + +### Changes +* makes saving every save_samples an optional feature + +## v0.4.0 + +### Features +* Adds a flag to save checkpoints at the end of an epoch + +### Changes +* Change success message at end of training + +## v0.3.2 + +### Features +* Accept tuples for lora.target_modules + +### Documentation +* patch some hyper parameter arg descriptions in README + +## v0.3.1 + +### Dependencies +* Update requirements to have bitsandbytes min and dolomite min + +## v0.3.0 + +### Features +* Updating token masking to support pretraining w/ masked special tokens +* Adding weight merging for LoRA/QLoRA ckpts + +### Fixes +* remove dead code +* fix: changes the check to check against both the enum option and enum value + +## v0.2.0 + +### Features +* Fix ckpt save to include architecture for inference runtime consumption +* Logging updates + +### Performance +* Reducing deepspeed timeout to 10mins + +## v0.1.0 + +### Features +* Flash Attention Disable Toggle (Take 2) + +### Performance +* Reduce Unnecessary Multiprocessing + +### Fixes +* 🐛: fix optimizer selection logic so that FusedAdam is never loaded when CPU offloading is enabled +* Add wheel to requirements + +## v0.0.5.1 + +### Fixes +This release includes PR [#121](https://github.com/instructlab/training/pull/121) to overcome an issue where our way of lazily importing the run_training function is being picked up as an error by pylint. + +## v0.0.5 +Minor bugfixes and updates. + +## v0.0.4 +Minor bugfixes and updates. + +## v0.0.3 +Minor bugfixes and updates. + +## v0.0.2 + +### Features +This introduces the instructlab library as a package in the instructlab package namespace. + +To install it: +``` +pip install instructlab-training +``` + +And to install it with flash-attn and other CUDA-dependent packages, you can use +``` +pip install instructlab-training[cuda] +``` + +Here's how to use it: +```python +from instructlab.training.config import TorchrunArgs, TrainingArgs, run_training + +torchrun_args = TorchrunArgs( + nproc_per_node = 1, # 1 GPU + nnodes = 1, # only 1 overall machine in the system + node_rank = 0, # rank of the current machine + rdzv_id = 123, # what ID other nodes will join on + rdzv_endpoint = '0.0.0.0:12345' # address where other nodes will join +) + +training_args = TrainingArgs( + # specify training args here +) + +run_training(torch_args = torchrun_args, train_args = training_args) +``` + +## v0.0.1 + +### Features +Initial release with same features as v0.0.2. 
\ No newline at end of file From d18e57234e274eec5d57bf334338f347cb4d95a8 Mon Sep 17 00:00:00 2001 From: Lo-Mein Date: Tue, 5 Nov 2024 12:16:34 -0500 Subject: [PATCH 2/6] Fixed linting errors Signed-off-by: Lo-Mein --- CHANGELOG.md | 101 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 69 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc0027f1..ca1cb720 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,54 +2,64 @@ ## v0.5.5 -### Features +### v0.5.5 Features + * e2e: replace old small job with new medium job -### Fixes +### v0.5.5 Fixes + * fix: incorrect label for AWS medium runner * chore: add exit code & tox fix -### Infrastructure +### v0.5.5 Infrastructure + * ci: grant HF_TOKEN access to the medium-size E2E CI job ## v0.5.4 -### Features +### v0.5.4 Features + * Add rocm extra to pyproject.toml ## v0.5.3 -### Fixes +### v0.5.3 Fixes + * fix: Add explicit flash_attn requirement for ROCm ## v0.5.2 - Fix Pretraining Masking -### Fixes +### v0.5.2 Fixes + * fix: improve linting and automation * Fix pretrain token list->int for masking ## v0.5.1 -### Fixes +### v0.5.1 Fixes + * fix: updates sorting logic to correctly compare numbers ## v0.5.0 - FSDP and Full-State Checkpoint Resuming -### Features +### v0.5.0 Features + * feat: add e2e test for instructlab CI * feat: add mergify * Adding FSDP Support to Training Library by @aldopareja @Maxusmusti @RobotSail * adds Accelerate full-state (opt, lr_sched, params) * changes StreamablePopen to return a process and implement listening -### Fixes +### v0.5.0 Fixes + * Fix lint error to make CI happy * Fix typos * Ap/fix multipack for non granite models * Fix generic chat template saved to tokenizer for generation * Fix linting error and missing quote -### Infrastructure +### v0.5.0 Infrastructure + * Add license identifiers * ci: update runner labels to uniquely identify instance sizes * ci: minor cleanup of E2E job @@ -69,21 +79,25 @@ ## v0.5.0 Alpha 0 - The FSDP Release Pre-release -### Description +### v0.5.0 Alpha Description + The FSDP Release introduces FSDP support in addition to the existing DeepSpeed support through the accelerate library. 
-### Features +### v0.5.0 Alpha Features + * feat: add e2e test for instructlab CI * feat: add mergify * Adding FSDP Support to Training Library by @aldopareja @Maxusmusti @RobotSail -### Fixes +### v0.5.0 Alpha Fixes + * Fix lint error to make CI happy * Fix typos * Ap/fix multipack for non granite models * Fix linting error and missing quote -### Infrastructure +### v0.5.0 Alpha Infrastructure + * Add license identifiers * ci: update runner labels to uniquely identify instance sizes * ci: minor cleanup of E2E job @@ -99,96 +113,118 @@ The FSDP Release introduces FSDP support in addition to the existing DeepSpeed s ## v0.4.2 -### Features +### v0.4.2 Features + * Provide safeguards during training ## v0.4.1 -### Changes +### v0.4.1 Changes + * makes saving every save_samples an optional feature ## v0.4.0 -### Features +### v0.4.0 Features + * Adds a flag to save checkpoints at the end of an epoch -### Changes +### v0.4.0 Changes + * Change success message at end of training ## v0.3.2 -### Features +### v0.3.2 Features + * Accept tuples for lora.target_modules -### Documentation +### v0.3.2 Documentation + * patch some hyper parameter arg descriptions in README ## v0.3.1 -### Dependencies +### v0.3.1 Dependencies + * Update requirements to have bitsandbytes min and dolomite min ## v0.3.0 -### Features +### v0.3.0 Features + * Updating token masking to support pretraining w/ masked special tokens * Adding weight merging for LoRA/QLoRA ckpts -### Fixes +### v0.3.0 Fixes + * remove dead code * fix: changes the check to check against both the enum option and enum value ## v0.2.0 -### Features +### v0.2.0 Features + * Fix ckpt save to include architecture for inference runtime consumption * Logging updates -### Performance +### v0.2.0 Performance + * Reducing deepspeed timeout to 10mins ## v0.1.0 -### Features +### v0.1.0 Features + * Flash Attention Disable Toggle (Take 2) -### Performance +### v0.1.0 Performance + * Reduce Unnecessary Multiprocessing -### Fixes +### v0.1.0 Fixes + * 🐛: fix optimizer selection logic so that FusedAdam is never loaded when CPU offloading is enabled * Add wheel to requirements ## v0.0.5.1 -### Fixes +### v0.0.5.1 Fixes + This release includes PR [#121](https://github.com/instructlab/training/pull/121) to overcome an issue where our way of lazily importing the run_training function is being picked up as an error by pylint. ## v0.0.5 + Minor bugfixes and updates. ## v0.0.4 + Minor bugfixes and updates. ## v0.0.3 + Minor bugfixes and updates. ## v0.0.2 ### Features + This introduces the instructlab library as a package in the instructlab package namespace. To install it: -``` + +```bash pip install instructlab-training ``` And to install it with flash-attn and other CUDA-dependent packages, you can use -``` + +```bash pip install instructlab-training[cuda] ``` Here's how to use it: + ```python from instructlab.training.config import TorchrunArgs, TrainingArgs, run_training @@ -209,5 +245,6 @@ run_training(torch_args = torchrun_args, train_args = training_args) ## v0.0.1 -### Features +### v0.0.1 Features + Initial release with same features as v0.0.2. \ No newline at end of file From 1d00406098d2a817cc49fb684fce322426016286 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 7 Nov 2024 12:47:07 +0000 Subject: [PATCH 3/6] build(deps): Bump pypa/gh-action-pypi-publish from 1.12.0 to 1.12.2 Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.12.0 to 1.12.2. 
- [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/61da13deb5f5124fb1536194f82ed3d9bbc7e8f3...15c56dba361d8335944d31a2ecd17d700fc7bcbc) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/pypi.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index dedc40b8..a8be1c1c 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -77,7 +77,7 @@ jobs: path: dist - name: "Upload to Test PyPI" - uses: pypa/gh-action-pypi-publish@61da13deb5f5124fb1536194f82ed3d9bbc7e8f3 # v1.12.0 + uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc # v1.12.2 with: repository-url: https://test.pypi.org/legacy/ @@ -129,4 +129,4 @@ jobs: rm ./dist/*.sigstore.json - name: "Upload to PyPI" - uses: pypa/gh-action-pypi-publish@61da13deb5f5124fb1536194f82ed3d9bbc7e8f3 # v1.12.0 + uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc # v1.12.2 From ebd3f73014efbab12cfb1ce3ab2e827bc4a944be Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Thu, 7 Nov 2024 10:14:31 -0500 Subject: [PATCH 4/6] enhancement: enhances bash script with proper syntax This commit enhances the existing bash script to encapsulate variables as "${VAR}" so that the bounds are clearly known. For reference, see: https://google.github.io/styleguide/shellguide.html\#variable-expansion Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- tests/smoketest.sh | 64 +++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/tests/smoketest.sh b/tests/smoketest.sh index e104b233..9bdb0df4 100755 --- a/tests/smoketest.sh +++ b/tests/smoketest.sh @@ -12,9 +12,9 @@ CHECKPOINTS_DIR="${TMP_DIR}/checkpoints" DATA_DIR="${TMP_DIR}/data" COMPUTED_DATA_PATH="${DATA_DIR}/data.jsonl" DEFAULT_DISTRIB_FRAMEWORK='fsdp' -DISTRIB_FRAMEWORK="${1:-$DEFAULT_DISTRIB_FRAMEWORK}" # defaults to FSDP +DISTRIB_FRAMEWORK="${1:-${DEFAULT_DISTRIB_FRAMEWORK}}" # defaults to FSDP DEFAULT_GPUS=8 -NUM_GPUS="${2:-$DEFAULT_GPUS}" +NUM_GPUS="${2:-${DEFAULT_GPUS}}" # ############### User-modifiable parameters ############### # Change these as needed @@ -36,8 +36,8 @@ NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size. # None ####################################### function setup_tmpdir () { - mkdir "$CHECKPOINTS_DIR" - mkdir "$DATA_DIR" + mkdir "${CHECKPOINTS_DIR}" + mkdir "${DATA_DIR}" } ####################################### @@ -61,17 +61,17 @@ function prepare_data () { # go faster. python3 data_process.py \ - --data_path="$SAMPLE_DATA_PATH" \ - --data_output_path="$DATA_DIR" \ + --data_path="${SAMPLE_DATA_PATH}" \ + --data_output_path="${DATA_DIR}" \ --max_seq_len=4096 \ - --model_name_or_path="$MODEL_NAME" + --model_name_or_path="${MODEL_NAME}" # trim data so we only keep the first 'n' samples. # should be enough data for training to be meaningful but not enough # that training takes a large amount of time. 
- echo "$(head -"$NUM_SAMPLES_TRAINED_ON" "$COMPUTED_DATA_PATH")" > "$COMPUTED_DATA_PATH" + echo "$(head -"${NUM_SAMPLES_TRAINED_ON}" "${COMPUTED_DATA_PATH}")" > "${COMPUTED_DATA_PATH}" - echo "TRAINING ON $(wc -l "$COMPUTED_DATA_PATH") SAMPLES" + echo "TRAINING ON $(wc -l "${COMPUTED_DATA_PATH}") SAMPLES" } ####################################### @@ -86,9 +86,9 @@ function prepare_data () { # writes location of checkpoints dir to standard out. ####################################### function _cleanup_saved_checkpoints() { - echo "CLEARING CHECKPOINTS: $CHECKPOINTS_DIR" - rm -rf "$CHECKPOINTS_DIR" - mkdir "$CHECKPOINTS_DIR" + echo "CLEARING CHECKPOINTS: ${CHECKPOINTS_DIR}" + rm -rf "${CHECKPOINTS_DIR}" + mkdir "${CHECKPOINTS_DIR}" } ####################################### @@ -109,18 +109,18 @@ function _cleanup_saved_checkpoints() { function test_standard_loop () { torchrun \ --standalone \ - --nproc_per_node="$NUM_GPUS" \ + --nproc_per_node="${NUM_GPUS}" \ main_ds.py \ - --model_name_or_path="$MODEL_NAME" \ - --data_path="$COMPUTED_DATA_PATH" \ - --output_dir="$CHECKPOINTS_DIR" \ + --model_name_or_path="${MODEL_NAME}" \ + --data_path="${COMPUTED_DATA_PATH}" \ + --output_dir="${CHECKPOINTS_DIR}" \ --num_epochs=1 \ --effective_batch_size=128 \ --save_samples=0 \ --checkpoint_at_epoch \ --accelerate_full_state_at_epoch \ - --distributed_training_framework="$DISTRIB_FRAMEWORK" \ - --max_batch_len="$MAX_BATCH_LEN" \ + --distributed_training_framework="${DISTRIB_FRAMEWORK}" \ + --max_batch_len="${MAX_BATCH_LEN}" \ --is_granite } @@ -142,18 +142,18 @@ function test_standard_loop () { function test_standard_loop_nongranite () { torchrun \ --standalone \ - --nproc_per_node="$NUM_GPUS" \ + --nproc_per_node="${NUM_GPUS}" \ main_ds.py \ - --model_name_or_path="$MODEL_NAME" \ - --data_path="$COMPUTED_DATA_PATH" \ - --output_dir="$CHECKPOINTS_DIR" \ + --model_name_or_path="${MODEL_NAME}" \ + --data_path="${COMPUTED_DATA_PATH}" \ + --output_dir="${CHECKPOINTS_DIR}" \ --num_epochs=1 \ --effective_batch_size=128 \ --save_samples=0 \ --checkpoint_at_epoch \ --accelerate_full_state_at_epoch \ - --distributed_training_framework="$DISTRIB_FRAMEWORK" \ - --max_batch_len="$MAX_BATCH_LEN" + --distributed_training_framework="${DISTRIB_FRAMEWORK}" \ + --max_batch_len="${MAX_BATCH_LEN}" # --is_granite \ } @@ -175,18 +175,18 @@ function test_standard_loop_nongranite () { function test_standard_loop_noflashattention_nogranite () { torchrun \ --standalone \ - --nproc_per_node="$NUM_GPUS" \ + --nproc_per_node="${NUM_GPUS}" \ main_ds.py \ - --model_name_or_path="$MODEL_NAME" \ - --data_path="$COMPUTED_DATA_PATH" \ - --output_dir="$CHECKPOINTS_DIR" \ + --model_name_or_path="${MODEL_NAME}" \ + --data_path="${COMPUTED_DATA_PATH}" \ + --output_dir="${CHECKPOINTS_DIR}" \ --num_epochs=1 \ --effective_batch_size=128 \ --save_samples=0 \ --checkpoint_at_epoch \ --accelerate_full_state_at_epoch \ - --distributed_training_framework="$DISTRIB_FRAMEWORK" \ - --max_batch_len="$MAX_BATCH_LEN" \ + --distributed_training_framework="${DISTRIB_FRAMEWORK}" \ + --max_batch_len="${MAX_BATCH_LEN}" \ --disable_flash_attn # --is_granite } @@ -194,11 +194,11 @@ function test_standard_loop_noflashattention_nogranite () { function main () { setup_tmpdir - trap "rm -rf $TMP_DIR" EXIT + trap 'rm -rf ${TMP_DIR}' EXIT #NOTE (jkunstle): script is run as though it's # in the same source dir as main_ds and data_process. 
- cd "$CORRECT_WORKING_DIR" + cd "${CORRECT_WORKING_DIR}" echo "CURRENT WORKING DIRECTORY: $(pwd)" prepare_data From 6ca54fb2f01c49830760db42436a4d95922e6fcd Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 7 Nov 2024 17:33:23 -0500 Subject: [PATCH 5/6] Update default chat template to Granite 3.0 template and update token processing (#319) Handles the new roles and chat template included in Granite 3.0 models * Add new template: In progress Signed-off-by: Mustafa Eyceoz * First pass role token handling Signed-off-by: Mustafa Eyceoz * Quick list concat Signed-off-by: Mustafa Eyceoz * Add pretraining role Signed-off-by: Mustafa Eyceoz * remove TODOs Signed-off-by: Mustafa Eyceoz * More forgiving newline buffer Signed-off-by: Mustafa Eyceoz * Make sure dolomite conversion isn't always attempted Signed-off-by: Mustafa Eyceoz * Add tool response role handling Signed-off-by: Mustafa Eyceoz * Added tool sp token Signed-off-by: Mustafa Eyceoz * Fix sp token retrieval Signed-off-by: Mustafa Eyceoz --------- Signed-off-by: Mustafa Eyceoz --- .../chat_templates/ibm_generic_tmpl.py | 40 ++++++++++----- .../chat_templates/ibm_legacy_tmpl.py | 30 +++++++++++ src/instructlab/training/data_process.py | 51 +++++++++++++++++-- src/instructlab/training/tokenizer_utils.py | 3 ++ src/instructlab/training/utils.py | 6 ++- 5 files changed, 111 insertions(+), 19 deletions(-) create mode 100644 src/instructlab/training/chat_templates/ibm_legacy_tmpl.py diff --git a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py index 5eaf795e..73a21652 100644 --- a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py +++ b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py @@ -4,27 +4,41 @@ from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo SPECIAL_TOKENS = SpecialTokens( - system=TokenInfo("<|system|>", add_to_tokenizer=True), - user=TokenInfo("<|user|>", add_to_tokenizer=True), - assistant=TokenInfo("<|assistant|>", add_to_tokenizer=True), - eos=TokenInfo("<|endoftext|>", add_to_tokenizer=True), - pad=TokenInfo("<|pad|>", add_to_tokenizer=True), - bos=TokenInfo("<|begginingoftext|>", add_to_tokenizer=True), + start_role=TokenInfo("<|start_of_role|>", add_to_tokenizer=True), + end_role=TokenInfo("<|end_of_role|>", add_to_tokenizer=True), + tool=TokenInfo("<|tool_call|>", add_to_tokenizer=True), + eos=TokenInfo("<|end_of_text|>", add_to_tokenizer=True), + bos=TokenInfo("<|end_of_text|>", add_to_tokenizer=True), + pad=TokenInfo("<|end_of_text|>", add_to_tokenizer=True), ) CHAT_TEMPLATE = ( + "{%- if tools %}" + "{{ '<|start_of_role|>available_tools<|end_of_role|>\n' }}" + "{% for tool in tools %}" + "{{ tool | tojson(indent=4) }}" + "{% if not loop.last %}" + "{{- '\n\n' }}" + "{% endif %}" + "{% endfor %}" + "{{ '<|end_of_text|>\n' }}" + "{% endif %}" "{% for message in messages %}" - "{% if message['role'] == 'pretraining' %}" - "{{'<|pretrain|>' + message['content'] + '<|endoftext|>' + '<|/pretrain|>' }}" - "{% elif message['role'] == 'system' %}" - "{{'<|system|>'+ '\n' + message['content'] + '\n'}}" + "{% if message['role'] == 'system' %}" + "{{ '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}" + "{% elif message['role'] == 'pretraining' %}" + "{{ '<|pretrain|>' + message['content'] + '<|end_of_text|>' + '<|/pretrain|>'}}" "{% elif message['role'] == 'user' %}" - "{{'<|user|>' + '\n' + message['content'] + '\n'}}" + "{{ '<|start_of_role|>user<|end_of_role|>' + 
message['content'] + '<|end_of_text|>\n' }}" "{% elif message['role'] == 'assistant' %}" - "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}" + "{{ '<|start_of_role|>assistant<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}" + "{% elif message['role'] == 'assistant_tool_call' %}" + "{{ '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\n' }}" + "{% elif message['role'] == 'tool_response' %}" + "{{ '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}" "{% endif %}" "{% if loop.last and add_generation_prompt %}" - "{{ '<|assistant|>' + '\n' }}" + "{{ '<|start_of_role|>assistant<|end_of_role|>' }}" "{% endif %}" "{% endfor %}" ) diff --git a/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py b/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py new file mode 100644 index 00000000..5eaf795e --- /dev/null +++ b/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: Apache-2.0 + +# First Party +from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo + +SPECIAL_TOKENS = SpecialTokens( + system=TokenInfo("<|system|>", add_to_tokenizer=True), + user=TokenInfo("<|user|>", add_to_tokenizer=True), + assistant=TokenInfo("<|assistant|>", add_to_tokenizer=True), + eos=TokenInfo("<|endoftext|>", add_to_tokenizer=True), + pad=TokenInfo("<|pad|>", add_to_tokenizer=True), + bos=TokenInfo("<|begginingoftext|>", add_to_tokenizer=True), +) + +CHAT_TEMPLATE = ( + "{% for message in messages %}" + "{% if message['role'] == 'pretraining' %}" + "{{'<|pretrain|>' + message['content'] + '<|endoftext|>' + '<|/pretrain|>' }}" + "{% elif message['role'] == 'system' %}" + "{{'<|system|>'+ '\n' + message['content'] + '\n'}}" + "{% elif message['role'] == 'user' %}" + "{{'<|user|>' + '\n' + message['content'] + '\n'}}" + "{% elif message['role'] == 'assistant' %}" + "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}" + "{% endif %}" + "{% if loop.last and add_generation_prompt %}" + "{{ '<|assistant|>' + '\n' }}" + "{% endif %}" + "{% endfor %}" +) diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py index 10214e9d..389a8cb0 100644 --- a/src/instructlab/training/data_process.py +++ b/src/instructlab/training/data_process.py @@ -28,7 +28,11 @@ def check_valid_sample( if len(whole_sentence_tk) >= max_len or len(whole_sentence_tk) < 20: return False # last token should be eos_token - if not eos_tk[0] in (whole_sentence_tk[-1], whole_sentence_tk[-2]): + if not eos_tk[0] in ( + whole_sentence_tk[-1], + whole_sentence_tk[-2], + whole_sentence_tk[-3], + ): return False # NOTE - below checks are no longer strictly required, but we may want to revisit to make sure there's nothing we need to bring back in validity checking @@ -61,6 +65,7 @@ def unmask_message_content( system_tokens, pretrain_token, pretrain_end_token, + tool_resp_tokens=None, ): """ Create labels for tokens in a sequence with special handling for pretraining tokens and role-specific sequences. 
@@ -130,6 +135,10 @@ def find_longest_match(start_idx, sequences): default=None, ) + special_sequences = [user_tokens, assist_tokens, system_tokens] + if tool_resp_tokens: + special_sequences.append(tool_resp_tokens) + in_pretraining = False unmasking = False i = 0 @@ -143,7 +152,7 @@ def find_longest_match(start_idx, sequences): i += 1 continue - match = find_longest_match(i, [user_tokens, assist_tokens, system_tokens]) + match = find_longest_match(i, special_sequences) if match: unmasking = match == assist_tokens i += len(match) @@ -167,8 +176,6 @@ def find_longest_match(start_idx, sequences): ] # Assertions - special_sequences = [user_tokens, assist_tokens, system_tokens] - # 1. No special sequence of tokens should be unmasked for i in range(len(final_sentence_tk)): for seq in special_sequences: @@ -229,10 +236,43 @@ def main(args: DataProcessArgs): CHAT_TEMPLATE, SPECIAL_TOKENS = retrieve_chat_template(args.chat_tmpl_path) tokenizer = setup_tokenizer(args.model_path, SPECIAL_TOKENS, CHAT_TEMPLATE) - system_tk, user_tk, assistant_tk, eos_tk, pad_tk, bos_tk = [ + ( + system_tk, + user_tk, + assistant_tk, + eos_tk, + pad_tk, + bos_tk, + start_role_tk, + end_role_tk, + _, + ) = [ get_sp_token(tokenizer, getattr(SPECIAL_TOKENS, sp).token) for sp in SPECIAL_TOKENS.__annotations__.keys() ] + if start_role_tk and end_role_tk: + system_tk = ( + start_role_tk + + tokenizer.encode("system", add_special_tokens=False) + + end_role_tk + ) + user_tk = ( + start_role_tk + + tokenizer.encode("user", add_special_tokens=False) + + end_role_tk + ) + assistant_tk = ( + start_role_tk + + tokenizer.encode("assistant", add_special_tokens=False) + + end_role_tk + ) + tool_resp_tk = ( + start_role_tk + + tokenizer.encode("tool_response", add_special_tokens=False) + + end_role_tk + ) + else: + tool_resp_tk = None log_rank_0( f"Special tokens: eos: {eos_tk}, pad: {pad_tk}, bos: {bos_tk}, system: {system_tk}, user: {user_tk}, assistant: {assistant_tk}" ) @@ -324,6 +364,7 @@ def main(args: DataProcessArgs): system_tokens=system_tk, pretrain_token=get_sp_token(tokenizer, "<|pretrain|>")[0], pretrain_end_token=get_sp_token(tokenizer, "<|/pretrain|>")[0], + tool_resp_tokens=tool_resp_tk, ) print("\033[92munmasking the appropriate message content...\033[0m") data_with_labels = data_with_input_ids.map( diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py index 45ad4699..f142dec6 100644 --- a/src/instructlab/training/tokenizer_utils.py +++ b/src/instructlab/training/tokenizer_utils.py @@ -25,6 +25,9 @@ class SpecialTokens: eos: TokenInfo = field(default_factory=lambda: TokenInfo("")) pad: TokenInfo = field(default_factory=lambda: TokenInfo("")) bos: TokenInfo = field(default_factory=lambda: TokenInfo("")) + start_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) + end_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) + tool: TokenInfo = field(default_factory=lambda: TokenInfo("")) def get_tokens_to_add(self) -> List[str]: return [ diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py index 41b410c7..b6f655bf 100644 --- a/src/instructlab/training/utils.py +++ b/src/instructlab/training/utils.py @@ -761,7 +761,6 @@ def save_hf_format_accelerate( tokenizer, accelerator: Accelerator, samples_seen, - convert_dolomite=True, is_lora=False, ): log_rank_0( @@ -770,6 +769,11 @@ def save_hf_format_accelerate( ) start = time.time() + if args.model_type in ("gpt_megatron", "gpt_dolomite"): + convert_dolomite = False + else: + 
convert_dolomite = True + final_output_dir = Path(args.output_dir) / "hf_format" / f"samples_{samples_seen}" if args.use_dolomite and convert_dolomite: tmpdir = TemporaryDirectory("w") # pylint: disable=consider-using-with From a50929ffd69a865859d2f339a0f51be270ae2f5c Mon Sep 17 00:00:00 2001 From: Jaideep Rao Date: Thu, 7 Nov 2024 10:55:28 -0500 Subject: [PATCH 6/6] chore: move token classes into chat templates Signed-off-by: Jaideep Rao --- .../chat_templates/ibm_generic_tmpl.py | 2 +- .../chat_templates/ibm_legacy_tmpl.py | 2 +- .../training/chat_templates/mistral_tmpl.py | 2 +- .../training/chat_templates/utils.py | 29 ++++++++++++++++++ src/instructlab/training/tokenizer_utils.py | 30 ------------------- 5 files changed, 32 insertions(+), 33 deletions(-) create mode 100644 src/instructlab/training/chat_templates/utils.py diff --git a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py index 73a21652..1403276b 100644 --- a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py +++ b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # First Party -from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo +from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo SPECIAL_TOKENS = SpecialTokens( start_role=TokenInfo("<|start_of_role|>", add_to_tokenizer=True), diff --git a/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py b/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py index 5eaf795e..0f09468f 100644 --- a/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py +++ b/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # First Party -from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo +from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo SPECIAL_TOKENS = SpecialTokens( system=TokenInfo("<|system|>", add_to_tokenizer=True), diff --git a/src/instructlab/training/chat_templates/mistral_tmpl.py b/src/instructlab/training/chat_templates/mistral_tmpl.py index dda051b3..6c1e8757 100644 --- a/src/instructlab/training/chat_templates/mistral_tmpl.py +++ b/src/instructlab/training/chat_templates/mistral_tmpl.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # First Party -from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo +from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo SPECIAL_TOKENS = SpecialTokens( bos=TokenInfo("", add_to_tokenizer=True), diff --git a/src/instructlab/training/chat_templates/utils.py b/src/instructlab/training/chat_templates/utils.py new file mode 100644 index 00000000..c0e62796 --- /dev/null +++ b/src/instructlab/training/chat_templates/utils.py @@ -0,0 +1,29 @@ +# Standard +from dataclasses import dataclass, field +from typing import List + + +@dataclass +class TokenInfo: + token: str + add_to_tokenizer: bool = False + + +@dataclass +class SpecialTokens: + system: TokenInfo = field(default_factory=lambda: TokenInfo("")) + user: TokenInfo = field(default_factory=lambda: TokenInfo("")) + assistant: TokenInfo = field(default_factory=lambda: TokenInfo("")) + eos: TokenInfo = field(default_factory=lambda: TokenInfo("")) + pad: TokenInfo = field(default_factory=lambda: TokenInfo("")) + bos: TokenInfo = field(default_factory=lambda: TokenInfo("")) + start_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) + 
end_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) + tool: TokenInfo = field(default_factory=lambda: TokenInfo("")) + + def get_tokens_to_add(self) -> List[str]: + return [ + token_info.token + for token_info in self.__dict__.values() + if token_info.add_to_tokenizer and token_info.token + ] diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py index f142dec6..d6c55e7e 100644 --- a/src/instructlab/training/tokenizer_utils.py +++ b/src/instructlab/training/tokenizer_utils.py @@ -1,9 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -# Standard -from dataclasses import dataclass, field -from typing import List - # Third Party from transformers import AutoTokenizer, PreTrainedTokenizer @@ -11,32 +7,6 @@ from instructlab.training.utils import log_rank_0 -@dataclass -class TokenInfo: - token: str - add_to_tokenizer: bool = False - - -@dataclass -class SpecialTokens: - system: TokenInfo = field(default_factory=lambda: TokenInfo("")) - user: TokenInfo = field(default_factory=lambda: TokenInfo("")) - assistant: TokenInfo = field(default_factory=lambda: TokenInfo("")) - eos: TokenInfo = field(default_factory=lambda: TokenInfo("")) - pad: TokenInfo = field(default_factory=lambda: TokenInfo("")) - bos: TokenInfo = field(default_factory=lambda: TokenInfo("")) - start_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) - end_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) - tool: TokenInfo = field(default_factory=lambda: TokenInfo("")) - - def get_tokens_to_add(self) -> List[str]: - return [ - token_info.token - for token_info in self.__dict__.values() - if token_info.add_to_tokenizer and token_info.token - ] - - def setup_tokenizer( model_name_or_path, SPECIAL_TOKENS, CHAT_TEMPLATE ) -> PreTrainedTokenizer:
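
For reference, a minimal usage sketch of the token classes relocated by PATCH 6/6, combined with the Granite 3.0 special tokens introduced in PATCH 5/6. It assumes the full series above is applied and the `instructlab-training` package is importable; the snippet is illustrative only and is not part of any patched file.

```python
# Sketch: declaring Granite 3.0-style special tokens with the dataclasses that
# PATCH 6/6 moves into instructlab.training.chat_templates.utils.
from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo

SPECIAL_TOKENS = SpecialTokens(
    start_role=TokenInfo("<|start_of_role|>", add_to_tokenizer=True),
    end_role=TokenInfo("<|end_of_role|>", add_to_tokenizer=True),
    tool=TokenInfo("<|tool_call|>", add_to_tokenizer=True),
    eos=TokenInfo("<|end_of_text|>", add_to_tokenizer=True),
    bos=TokenInfo("<|end_of_text|>", add_to_tokenizer=True),
    pad=TokenInfo("<|end_of_text|>", add_to_tokenizer=True),
)

# get_tokens_to_add() returns only the tokens flagged with add_to_tokenizer=True,
# in dataclass field order (system, user, assistant, eos, pad, bos, start_role,
# end_role, tool); repeated values such as the shared "<|end_of_text|>" token are
# not deduplicated by this helper.
print(SPECIAL_TOKENS.get_tokens_to_add())
```

After this series, chat template modules such as `ibm_generic_tmpl.py` import these classes from `chat_templates.utils` rather than from `tokenizer_utils`, so a template file only needs to define its `SPECIAL_TOKENS` and `CHAT_TEMPLATE` constants without pulling in the tokenizer setup code.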