Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support for mcore optimizer (to enable MoE) #380

Open
wants to merge 4 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ jobs:
- sft-llama3-cp
- rm-llama3
- e2e-nemo2
- dpo-mixtral-ep
- dpo-mixtral-peft-tp-sp
with:
RUNNER: self-hosted-azure
# Fairly aggresive timeout that all functional tests should try to adhere to
Expand Down
8 changes: 6 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ ARG MAX_JOBS=8
# Git refs for dependencies
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG PYTRITON_VERSION=0.5.10
ARG NEMO_TAG=06eae2895c0fea09f8dd7c34feff0163e55c419a # On: main
ARG MLM_TAG=844119f5c856a3037ec7c7f6d6ef7b3518ceee6b # On: main
ARG NEMO_TAG=633cb602777bffefbe12066b0c915c87e7b469e9 # On: v2.1.0
ARG MLM_TAG=d15cec53beb283e7127b7d594e1c46b8a0719b6d # On: core_r0.10.0
ARG ALIGNER_COMMIT=main
ARG TRTLLM_VERSION=v0.13.0
ARG PROTOBUF_VERSION=4.24.4
Expand Down Expand Up @@ -70,6 +70,10 @@ RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
pip install -e .
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/

# TODO: This pinning of pynvml is only needed while on TRTLLM v13 since pynvml>=11.5.0 but pynvml==12.0.0 contains a
# breaking change. The last known working verison is 11.5.3
RUN pip install pynvml==11.5.3

# install TransformerEngine
ARG MAX_JOBS
ARG TE_TAG
Expand Down
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_dpo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# dpo specific args
dpo:
Expand All @@ -17,6 +18,7 @@ trainer:

# how many GBS we loop over
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
Expand Down
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_kto.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# kto specific args
kto:
Expand All @@ -17,6 +18,7 @@ trainer:

# how many GBS we loop over
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
Expand Down
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_ppo_actor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

ppo:
# How many steps we train warmup the critic for (without training the policy)
Expand All @@ -21,6 +22,7 @@ trainer:
max_steps: -1 # max PPO steps (-1 to go through the whole train set)
val_check_interval: 10
save_interval: ${.val_check_interval}
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# PPO args to generate the data for training
Expand Down
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_ppo_critic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

ppo:
port: 5556
Expand All @@ -15,6 +16,7 @@ trainer:

# used to set the learning rate scheduler
max_steps: 10000
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# a PyTriton parameter to specify
Expand Down
4 changes: 3 additions & 1 deletion examples/nlp/gpt/conf/gpt_rs_actor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

rs:
max_epochs: 1
max_steps: -1 # max rs steps (-1 to go through the whole train set)
val_check_interval: 10
save_interval: ${.val_check_interval}
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# pick up from the model
Expand Down Expand Up @@ -178,4 +180,4 @@ model:
# define fields from the base model's config that should be ignored when merging with this config.
overwrite_base_config:
data:
data_prefix: True
data_prefix: True
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_sft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ trainer:
devices: 1
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

sft:
max_epochs: 1
Expand All @@ -15,6 +16,7 @@ trainer:
limit_train_batches: 1.0

limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# can be used to register any custom metrics that require token-by-token generation
Expand Down
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_spin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16-mixed
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# spin specific args
spin:
Expand All @@ -18,6 +19,7 @@ trainer:

# how many GBS we loop over
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
Expand Down
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/training_rm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# rm specific args
rm:
Expand All @@ -20,6 +21,7 @@ trainer:
# set to float for a percentage
# of the validation dataset
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
Expand Down
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/critic_server_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ def run_training(self, tokens=None, returns=None, prev_values=None, mask=None):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

if grad_norm is not None:
Expand Down
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/dpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def train_single_step(self, global_batch):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

trainer_metrics = {}
Expand Down
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/ppo.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@ def run_training(self, dataloader_iter):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

if grad_norm is not None:
Expand Down
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/rs.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ def run_training(self, dataloader_iter):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

if grad_norm is not None:
Expand Down
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/spin.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def train_single_step(self, global_batch):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

trainer_metrics = {}
Expand Down
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/supervised.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def train_single_step(self, batch):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we have to specify the closure now? i thought this was optional?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for mcore dist opt it's required, so i just set it everywhere

self.scheduler.step()

trainer_metrics = {}
Expand Down
56 changes: 44 additions & 12 deletions nemo_aligner/utils/train_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,31 +101,52 @@ def prepare_for_training_step(ptl_model, zero_grad=True):
param.data_ptr()


# TODO: Delete this once API introduced in NeMo (https://github.com/NVIDIA/NeMo/pull/10803)
# TODO: Update PR to move this logic into staticmethod in nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
def grad_reductions(ptl_model):
# when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced
if ptl_model.cfg.get("tensor_model_parallel_size", 1) > 1 and ptl_model.cfg.get("sequence_parallel", False):
ptl_model.allreduce_sequence_parallel_gradients()

if ptl_model.with_distributed_adam:
# synchronize asynchronous grad reductions
# note: not necessary, but reduces performance degradation
# from multiple simultaneous NCCL calls
ptl_model._optimizer._finish_bucket_grad_sync()
# Mcore DistOpt handles this, so we don't have to
if not ptl_model.use_mcore_dist_optim:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how do you feel about dropping support for non mcore dist optim? are they equivalent to apex now?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yea, i want to do that in a follow up PR (would help our build times immensely). This just adds the feature without breaking apex

ptl_model.megatron_timer_start("allreduce_sequence_parallel_gradients", log_level=1)
ptl_model.allreduce_sequence_parallel_gradients()
ptl_model.megatron_timer_stop("allreduce_sequence_parallel_gradients")

ptl_model.megatron_timer_start("gradient_allreduce", log_level=1)
if ptl_model.use_fsdp:
# Reduce the gradients omitted from FSDP-sharding
ptl_model.allreduce_fsdp_sharding_omitted_gradients()
elif ptl_model.with_distributed_adam:
if not ptl_model.use_mcore_dist_optim:
# synchronize asynchronous grad reductions
# note: not necessary, but reduces performance degradation
# from multiple simultaneous NCCL calls
ptl_model._optimizer._finish_bucket_grad_sync()
# else: Mcore distributed optim calls finalize_model_grads to finish grad sync
elif ptl_model.megatron_amp_O2:
# when using pipeline parallelism grads must be all-reduced after the pipeline (not asynchronously)
if ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1 or ptl_model.cfg.get("sequence_parallel", False):
if (
ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1
or ptl_model.cfg.get("sequence_parallel", False)
or not ptl_model.cfg.get("async_grad_allreduce", True)
):
# main grads are stored in the MainParamsOptimizer wrapper
ptl_model._optimizer.allreduce_main_grads()
else:
# async grad allreduce is not currently implemented for O1/autocasting mixed precision training
# so we all-reduce gradients after the pipeline
ptl_model.allreduce_gradients() # @sangkug we think this is causing memory to blow up (hurts perf)
ptl_model.megatron_timer_stop("gradient_allreduce")
gshennvm marked this conversation as resolved.
Show resolved Hide resolved

if ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1 and ptl_model.cfg.get(
"share_embeddings_and_output_weights", True
if (
not ptl_model.use_mcore_dist_optim
and ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1
and ptl_model.cfg.get("share_embeddings_and_output_weights", True)
):
ptl_model.megatron_timer_start("allreduce_first_last_embeddings", log_level=1)
# when using pipeline parallelism the first and last stage must keep embeddings in sync
ptl_model.allreduce_first_last_embeddings()
ptl_model.megatron_timer_stop("allreduce_first_last_embeddings")


def prepare_for_validation_step(ptl_model):
Expand Down Expand Up @@ -155,14 +176,26 @@ def set_eval(ptl_model):
ptl_model.eval()


# TODO: adapt the version in /opt/NeMo/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
def clip_gradients(ptl_model, clip_val):
"""PTL hook to configure gradients.
We use gradient clipping implementation from megatron-lm.
"""
if clip_val is None:
return

clip_val = float(clip_val)
if clip_val <= 0:
return

if ptl_model.with_megatron_fused_adam or ptl_model.use_mcore_dist_optim:
# Gradient clipping is done in optimizer step
return

if ptl_model.grad_clip_pl_default:
# use the default behavior
return super().configure_gradient_clipping(*args, **kwargs)

if ptl_model.with_distributed_adam:
grad_norm = clip_grad_norm_distributed_optimizer(ptl_model._optimizer, clip_val)
else:
Expand All @@ -171,6 +204,5 @@ def clip_gradients(ptl_model, clip_val):
parameters = ptl_model._optimizer.get_parameters_with_grad()
else:
parameters = ptl_model.get_parameters_with_grad()
grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val)

grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val, use_fsdp=ptl_model.use_fsdp,)
return grad_norm
28 changes: 20 additions & 8 deletions tests/functional/dpo.sh
Original file line number Diff line number Diff line change
@@ -1,14 +1,26 @@
#!/bin/bash

# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR
set -eoux pipefail

export NCCL_ALGO=Tree
export NVTE_APPLY_QK_LAYER_SCALING=1
export NVTE_APPLY_QK_LAYER_SCALING=${NVTE_APPLY_QK_LAYER_SCALING:-0}

KL=${KL:-0.1}
GBS=${GBS:-4}
PRETRAINED_CHECKPOINT_NEMO_FILE=${PRETRAINED_CHECKPOINT_NEMO_FILE}


Expand All @@ -23,7 +35,6 @@ mkdir -p $RESULTS_DIR

GPFS=$(git rev-parse --show-toplevel)

# START HETEROGENEUS JOB 3
CONF_DIR="${GPFS}/examples/nlp/gpt/conf/"
CONF_NAME="gpt_dpo"

Expand All @@ -33,17 +44,16 @@ dpo() {
export CUDA_VISIBLE_DEVICES=0,1
export PYTHONPATH="${GPFS}:${PYTHONPATH:-}"
export HYDRA_FULL_ERROR=1
torchrun --nproc-per-node 2 ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \
torchrun --nproc_per_node=2 ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \
--config-path=${CONF_DIR} \
--config-name=${CONF_NAME} \
trainer.num_nodes=1 \
trainer.devices=2 \
++model.data.seq_length=128 \
++model.global_batch_size=${GBS} \
++model.micro_batch_size=1 \
++model.mcore_gpt=true \
++model.megatron_amp_O2=true \
++model.dpo.ref_policy_kl_penalty=${KL} \
++model.dpo.ref_policy_kl_penalty=0.1 \
++model.dpo.log_prob_forward_micro_batch_size=1 \
++model.dpo.average_log_probs=false \
++model.dpo.sft_loss_weight=0.1 \
Expand All @@ -52,6 +62,7 @@ torchrun --nproc-per-node 2 ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \
"model.data.data_prefix={train: [${TRAIN_DATA_PATH}], validation: [${VALID_DATA_PATH}], test: [${VALID_DATA_PATH}]}" \
exp_manager.create_checkpoint_callback=${CREATE_CHECKPOINT_CALLBACK:-False} \
model.data.num_workers=2 \
++model.data.seq_length=128 \
++model.tensor_model_parallel_size=1 \
++model.pipeline_model_parallel_size=1 \
trainer.dpo.max_steps=${MAX_STEPS:-3} \
Expand All @@ -67,4 +78,5 @@ torchrun --nproc-per-node 2 ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \
}

log_file=$(mktemp /tmp/dpo-log-XXXXXX)
dpo "$@" | tee $log_file
dpo "$@" | tee $log_file
echo "[Finished] $0"
Loading
Loading