From b9cf1847154b0d483cb0e53ddc3ca71f69841a0b Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Thu, 7 Nov 2024 18:51:13 +0000 Subject: [PATCH] fix: adds missing support for mcore dist opt and adds test for moe Signed-off-by: Terry Kong --- .github/workflows/cicd-main.yml | 2 + Dockerfile | 6 +- examples/nlp/gpt/conf/gpt_dpo.yaml | 2 + examples/nlp/gpt/conf/gpt_kto.yaml | 2 + examples/nlp/gpt/conf/gpt_ppo_actor.yaml | 2 + examples/nlp/gpt/conf/gpt_ppo_critic.yaml | 2 + examples/nlp/gpt/conf/gpt_rs_actor.yaml | 4 +- examples/nlp/gpt/conf/gpt_sft.yaml | 2 + examples/nlp/gpt/conf/gpt_spin.yaml | 2 + examples/nlp/gpt/conf/training_rm.yaml | 2 + .../algorithms/critic_server_trainer.py | 2 +- nemo_aligner/algorithms/dpo.py | 2 +- nemo_aligner/algorithms/ppo.py | 2 +- nemo_aligner/algorithms/rs.py | 2 +- nemo_aligner/algorithms/spin.py | 2 +- nemo_aligner/algorithms/supervised.py | 2 +- nemo_aligner/utils/train_utils.py | 56 ++++++++++++++---- tests/functional/dpo.sh | 57 +++++++++++-------- tests/functional/test_cases/dpo-llama3 | 7 ++- tests/functional/test_cases/dpo-mixtral-ep | 27 +++++++++ .../test_cases/dpo-mixtral-peft-tp-sp | 31 ++++++++++ 21 files changed, 171 insertions(+), 45 deletions(-) create mode 100755 tests/functional/test_cases/dpo-mixtral-ep create mode 100755 tests/functional/test_cases/dpo-mixtral-peft-tp-sp diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 1ab97629c..f68b50dde 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -93,6 +93,8 @@ jobs: - dpo-llama3 - sft-llama3 - rm-llama3 + - dpo-mixtral-ep + - dpo-mixtral-peft-tp-sp with: RUNNER: self-hosted-azure # Fairly aggresive timeout that all functional tests should try to adhere to diff --git a/Dockerfile b/Dockerfile index 44a9f8651..3e752c72f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -130,16 +130,20 @@ git fetch -a # 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652 # 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863 # (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654 +# ba8edbd2063f3349c40c9c73e5bae46abbe65f94: fix: regular torch optims (e.g., sgd) no longer error with closure spec NeMo#11189 +# 35a7f718237cf011215db9e92273ed7236d0e8b1: Fix for crash with LoRA + tp_overlap_comm=false + sequence_parallel=true NeMo#10920 for pr_and_commit in \ "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \ "10652 60e677423667c029dd05875da72bf0719774f844" \ "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \ + "11189 ba8edbd2063f3349c40c9c73e5bae46abbe65f94" \ + "10920 53cf6527571b29379188c8bb0dba8e507db3cca1" \ ; do pr=$(cut -f1 -d' ' <<<"$pr_and_commit") head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit") git fetch origin $head_pr_commit:PR-${pr} # cherry-picks all commits between main and the top of the PR - git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr} + git cherry-pick -m 1 --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr} # Tag cherry-picks to help git tag cherry-pick-PR-${pr} done diff --git a/examples/nlp/gpt/conf/gpt_dpo.yaml b/examples/nlp/gpt/conf/gpt_dpo.yaml index 1179dc2b1..fd6a2e28b 100644 --- a/examples/nlp/gpt/conf/gpt_dpo.yaml +++ b/examples/nlp/gpt/conf/gpt_dpo.yaml @@ -6,6 +6,7 @@ trainer: devices: 8 accelerator: gpu precision: bf16 + 
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value # dpo specific args dpo: @@ -17,6 +18,7 @@ trainer: # how many GBS we loop over limit_val_batches: 1.0 + # TODO: delete once Megatron Core optimizer becomes default gradient_clip_val: 1.0 # do not change these diff --git a/examples/nlp/gpt/conf/gpt_kto.yaml b/examples/nlp/gpt/conf/gpt_kto.yaml index de264056a..f6cd60059 100644 --- a/examples/nlp/gpt/conf/gpt_kto.yaml +++ b/examples/nlp/gpt/conf/gpt_kto.yaml @@ -6,6 +6,7 @@ trainer: devices: 8 accelerator: gpu precision: bf16 + gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value # kto specific args kto: @@ -17,6 +18,7 @@ trainer: # how many GBS we loop over limit_val_batches: 1.0 + # TODO: delete once Megatron Core optimizer becomes default gradient_clip_val: 1.0 # do not change these diff --git a/examples/nlp/gpt/conf/gpt_ppo_actor.yaml b/examples/nlp/gpt/conf/gpt_ppo_actor.yaml index e0a5a1045..22b899e50 100644 --- a/examples/nlp/gpt/conf/gpt_ppo_actor.yaml +++ b/examples/nlp/gpt/conf/gpt_ppo_actor.yaml @@ -7,6 +7,7 @@ trainer: devices: 8 accelerator: gpu precision: bf16 + gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value ppo: # How many steps we train warmup the critic for (without training the policy) @@ -21,6 +22,7 @@ trainer: max_steps: -1 # max PPO steps (-1 to go through the whole train set) val_check_interval: 10 save_interval: ${.val_check_interval} + # TODO: delete once Megatron Core optimizer becomes default gradient_clip_val: 1.0 # PPO args to generate the data for training diff --git a/examples/nlp/gpt/conf/gpt_ppo_critic.yaml b/examples/nlp/gpt/conf/gpt_ppo_critic.yaml index 75974767f..8e146eb8c 100644 --- a/examples/nlp/gpt/conf/gpt_ppo_critic.yaml +++ b/examples/nlp/gpt/conf/gpt_ppo_critic.yaml @@ -6,6 +6,7 @@ trainer: devices: 8 accelerator: gpu precision: bf16 + gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value ppo: port: 5556 @@ -15,6 +16,7 @@ trainer: # used to set the learning rate scheduler max_steps: 10000 + # TODO: delete once Megatron Core optimizer becomes default gradient_clip_val: 1.0 # a PyTriton parameter to specify diff --git a/examples/nlp/gpt/conf/gpt_rs_actor.yaml b/examples/nlp/gpt/conf/gpt_rs_actor.yaml index b819ca287..6ff1a228a 100644 --- a/examples/nlp/gpt/conf/gpt_rs_actor.yaml +++ b/examples/nlp/gpt/conf/gpt_rs_actor.yaml @@ -7,12 +7,14 @@ trainer: devices: 8 accelerator: gpu precision: bf16 + gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value rs: max_epochs: 1 max_steps: -1 # max rs steps (-1 to go through the whole train set) val_check_interval: 10 save_interval: ${.val_check_interval} + # TODO: delete once Megatron Core optimizer becomes default gradient_clip_val: 1.0 # pick up from the model @@ -177,4 +179,4 @@ model: # define fields from the base model's config that should be ignored when merging with this config. overwrite_base_config: data: - data_prefix: True \ No newline at end of file + data_prefix: True diff --git a/examples/nlp/gpt/conf/gpt_sft.yaml b/examples/nlp/gpt/conf/gpt_sft.yaml index bdd757f31..d6c9561dd 100644 --- a/examples/nlp/gpt/conf/gpt_sft.yaml +++ b/examples/nlp/gpt/conf/gpt_sft.yaml @@ -5,6 +5,7 @@ trainer: devices: 1 accelerator: gpu precision: bf16 + gradient_clip_val: 0.0 # No need to change. 
Megatron Core optimizer uses this value sft: max_epochs: 1 @@ -15,6 +16,7 @@ trainer: limit_train_batches: 1.0 limit_val_batches: 1.0 + # TODO: delete once Megatron Core optimizer becomes default gradient_clip_val: 1.0 # can be used to register any custom metrics that require token-by-token generation diff --git a/examples/nlp/gpt/conf/gpt_spin.yaml b/examples/nlp/gpt/conf/gpt_spin.yaml index 4027dbf8e..96772d975 100644 --- a/examples/nlp/gpt/conf/gpt_spin.yaml +++ b/examples/nlp/gpt/conf/gpt_spin.yaml @@ -6,6 +6,7 @@ trainer: devices: 8 accelerator: gpu precision: bf16-mixed + gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value # spin specific args spin: @@ -18,6 +19,7 @@ trainer: # how many GBS we loop over limit_val_batches: 1.0 + # TODO: delete once Megatron Core optimizer becomes default gradient_clip_val: 1.0 # do not change these diff --git a/examples/nlp/gpt/conf/training_rm.yaml b/examples/nlp/gpt/conf/training_rm.yaml index afe927423..77a2ba09c 100644 --- a/examples/nlp/gpt/conf/training_rm.yaml +++ b/examples/nlp/gpt/conf/training_rm.yaml @@ -6,6 +6,7 @@ trainer: devices: 8 accelerator: gpu precision: bf16 + gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value # rm specific args rm: @@ -20,6 +21,7 @@ trainer: # set to float for a percentage # of the validation dataset limit_val_batches: 1.0 + # TODO: delete once Megatron Core optimizer becomes default gradient_clip_val: 1.0 # do not change these diff --git a/nemo_aligner/algorithms/critic_server_trainer.py b/nemo_aligner/algorithms/critic_server_trainer.py index d3a7e0d8c..ff91214ac 100644 --- a/nemo_aligner/algorithms/critic_server_trainer.py +++ b/nemo_aligner/algorithms/critic_server_trainer.py @@ -322,7 +322,7 @@ def run_training(self, tokens=None, returns=None, prev_values=None, mask=None): grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm lr = self.optimizer.param_groups[0]["lr"] - self.optimizer.step() + self.optimizer.step(closure=None) self.scheduler.step() if grad_norm is not None: diff --git a/nemo_aligner/algorithms/dpo.py b/nemo_aligner/algorithms/dpo.py index 198350ea6..428ae7bee 100644 --- a/nemo_aligner/algorithms/dpo.py +++ b/nemo_aligner/algorithms/dpo.py @@ -219,7 +219,7 @@ def train_single_step(self, global_batch): grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm lr = self.optimizer.param_groups[0]["lr"] - self.optimizer.step() + self.optimizer.step(closure=None) self.scheduler.step() trainer_metrics = {} diff --git a/nemo_aligner/algorithms/ppo.py b/nemo_aligner/algorithms/ppo.py index 323c18224..3851cc4b1 100644 --- a/nemo_aligner/algorithms/ppo.py +++ b/nemo_aligner/algorithms/ppo.py @@ -440,7 +440,7 @@ def run_training(self, dataloader_iter): grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm lr = self.optimizer.param_groups[0]["lr"] - self.optimizer.step() + self.optimizer.step(closure=None) self.scheduler.step() if grad_norm is not None: diff --git a/nemo_aligner/algorithms/rs.py b/nemo_aligner/algorithms/rs.py index 493b743d4..11bb7b141 100644 --- a/nemo_aligner/algorithms/rs.py +++ b/nemo_aligner/algorithms/rs.py @@ -294,7 +294,7 @@ def run_training(self, dataloader_iter): grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm lr = self.optimizer.param_groups[0]["lr"] - self.optimizer.step() + self.optimizer.step(closure=None) self.scheduler.step() if grad_norm is not None: diff --git a/nemo_aligner/algorithms/spin.py 
b/nemo_aligner/algorithms/spin.py index 717daaa53..f40611957 100644 --- a/nemo_aligner/algorithms/spin.py +++ b/nemo_aligner/algorithms/spin.py @@ -195,7 +195,7 @@ def train_single_step(self, global_batch): grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm lr = self.optimizer.param_groups[0]["lr"] - self.optimizer.step() + self.optimizer.step(closure=None) self.scheduler.step() trainer_metrics = {} diff --git a/nemo_aligner/algorithms/supervised.py b/nemo_aligner/algorithms/supervised.py index 3f2f67c61..ed3ce707d 100644 --- a/nemo_aligner/algorithms/supervised.py +++ b/nemo_aligner/algorithms/supervised.py @@ -150,7 +150,7 @@ def train_single_step(self, batch): grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm lr = self.optimizer.param_groups[0]["lr"] - self.optimizer.step() + self.optimizer.step(closure=None) self.scheduler.step() trainer_metrics = {} diff --git a/nemo_aligner/utils/train_utils.py b/nemo_aligner/utils/train_utils.py index da176b785..1883632cf 100644 --- a/nemo_aligner/utils/train_utils.py +++ b/nemo_aligner/utils/train_utils.py @@ -101,31 +101,52 @@ def prepare_for_training_step(ptl_model, zero_grad=True): param.data_ptr() +# TODO: Delete this once API introduced in NeMo (https://github.com/NVIDIA/NeMo/pull/10803) +# TODO: Update PR to move this logic into staticmethod in nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py def grad_reductions(ptl_model): # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced if ptl_model.cfg.get("tensor_model_parallel_size", 1) > 1 and ptl_model.cfg.get("sequence_parallel", False): - ptl_model.allreduce_sequence_parallel_gradients() - - if ptl_model.with_distributed_adam: - # synchronize asynchronous grad reductions - # note: not necessary, but reduces performance degradation - # from multiple simultaneous NCCL calls - ptl_model._optimizer._finish_bucket_grad_sync() + # Mcore DistOpt handles this, so we don't have to + if not ptl_model.use_mcore_dist_optim: + ptl_model.megatron_timer_start("allreduce_sequence_parallel_gradients", log_level=1) + ptl_model.allreduce_sequence_parallel_gradients() + ptl_model.megatron_timer_stop("allreduce_sequence_parallel_gradients") + + ptl_model.megatron_timer_start("gradient_allreduce", log_level=1) + if ptl_model.use_fsdp: + # Reduce the gradients omitted from FSDP-sharding + ptl_model.allreduce_fsdp_sharding_omitted_gradients() + elif ptl_model.with_distributed_adam: + if not ptl_model.use_mcore_dist_optim: + # synchronize asynchronous grad reductions + # note: not necessary, but reduces performance degradation + # from multiple simultaneous NCCL calls + ptl_model._optimizer._finish_bucket_grad_sync() + # else: Mcore distributed optim calls finalize_model_grads to finish grad sync elif ptl_model.megatron_amp_O2: # when using pipeline parallelism grads must be all-reduced after the pipeline (not asynchronously) - if ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1 or ptl_model.cfg.get("sequence_parallel", False): + if ( + ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1 + or ptl_model.cfg.get("sequence_parallel", False) + or not ptl_model.cfg.get("async_grad_allreduce", True) + ): # main grads are stored in the MainParamsOptimizer wrapper ptl_model._optimizer.allreduce_main_grads() else: # async grad allreduce is not currently implemented for O1/autocasting mixed precision training # so we all-reduce gradients after the pipeline ptl_model.allreduce_gradients() # @sangkug we think this 
is causing memory to blow up (hurts perf)
+    ptl_model.megatron_timer_stop("gradient_allreduce")
 
-    if ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1 and ptl_model.cfg.get(
-        "share_embeddings_and_output_weights", True
+    if (
+        not ptl_model.use_mcore_dist_optim
+        and ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1
+        and ptl_model.cfg.get("share_embeddings_and_output_weights", True)
     ):
+        ptl_model.megatron_timer_start("allreduce_first_last_embeddings", log_level=1)
         # when using pipeline parallelism the first and last stage must keep embeddings in sync
         ptl_model.allreduce_first_last_embeddings()
+        ptl_model.megatron_timer_stop("allreduce_first_last_embeddings")
 
 
 def prepare_for_validation_step(ptl_model):
@@ -155,7 +176,11 @@ def set_eval(ptl_model):
     ptl_model.eval()
 
 
+# TODO: adapt the version in /opt/NeMo/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
 def clip_gradients(ptl_model, clip_val):
+    """PTL hook to configure gradients.
+    We use the gradient clipping implementation from Megatron-LM.
+    """
     if clip_val is None:
         return
 
@@ -163,6 +188,14 @@ def clip_gradients(ptl_model, clip_val):
     if clip_val <= 0:
         return
 
+    if ptl_model.with_megatron_fused_adam or ptl_model.use_mcore_dist_optim:
+        # Gradient clipping is done in the optimizer step
+        return
+
+    if ptl_model.grad_clip_pl_default:
+        # fall back to the trainer's default clipping behavior
+        return
+
     if ptl_model.with_distributed_adam:
         grad_norm = clip_grad_norm_distributed_optimizer(ptl_model._optimizer, clip_val)
     else:
@@ -171,6 +204,5 @@ def clip_gradients(ptl_model, clip_val):
             parameters = ptl_model._optimizer.get_parameters_with_grad()
         else:
             parameters = ptl_model.get_parameters_with_grad()
-        grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val)
-
+        grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val, use_fsdp=ptl_model.use_fsdp)
     return grad_norm
diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh
index 3bfe3b8db..886e3517d 100755
--- a/tests/functional/dpo.sh
+++ b/tests/functional/dpo.sh
@@ -1,20 +1,29 @@
 #!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
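Aside on the clip_gradients() early-return above: with ++model.optim.name=mcore_distributed_optim (used by the test cases below), gradient clipping moves out of the PTL hook entirely. Megatron Core's distributed optimizer takes clip_grad from its OptimizerConfig and clips inside optimizer.step(), which is why the YAML configs in this patch pin trainer.gradient_clip_val to 0.0. A minimal sketch of that configuration surface, assuming megatron-core is installed; the values below are illustrative, not what NeMo actually passes:

    # Sketch: where gradient clipping lives under the Megatron Core
    # distributed optimizer. OptimizerConfig and its fields are from
    # megatron.core; the values are made up for illustration.
    from megatron.core.optimizer import OptimizerConfig

    opt_cfg = OptimizerConfig(
        optimizer="adam",
        lr=9e-6,                         # illustrative learning rate
        use_distributed_optimizer=True,  # shard optimizer state across DP ranks
        clip_grad=1.0,                   # clipping happens inside optimizer.step()
    )
    # With this path active, trainer.gradient_clip_val stays 0.0 so the
    # clip_gradients() hook above does not clip a second time.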
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) cd $SCRIPT_DIR set -eoux pipefail export NCCL_ALGO=Tree -export NVTE_APPLY_QK_LAYER_SCALING=1 +export NVTE_APPLY_QK_LAYER_SCALING=${NVTE_APPLY_QK_LAYER_SCALING:-0} -KL=${KL:-0.1} -#LR=${LR:-9e-7} -GBS=${GBS:-4} PRETRAINED_CHECKPOINT_NEMO_FILE=${PRETRAINED_CHECKPOINT_NEMO_FILE} -#MIN_LR=$(awk -v var="$LR" 'BEGIN {print var - 1e-11}') - TRAIN_DATA_PATH=$SCRIPT_DIR/test_data/dummy-dpo.jsonl VALID_DATA_PATH=$SCRIPT_DIR/test_data/dummy-dpo.jsonl @@ -27,7 +36,7 @@ mkdir -p $RESULTS_DIR GPFS=$(git rev-parse --show-toplevel) # W&B Logging -PROJECT=llama3_dpo_test +PROJECT=dpo_test # START HETEROGENEUS JOB 3 CONF_DIR="${GPFS}/examples/nlp/gpt/conf/" @@ -44,38 +53,40 @@ dpo() { export CUDA_VISIBLE_DEVICES=0,1 export PYTHONPATH="${GPFS}:${PYTHONPATH:-}" export HYDRA_FULL_ERROR=1 -mpirun -np 2 --allow-run-as-root python -u ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \ +torchrun --nproc_per_node=2 ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \ --config-path=${CONF_DIR} \ --config-name=${CONF_NAME} \ trainer.num_nodes=1 \ trainer.devices=2 \ - ++model.data.data_impl=jsonl \ - ++model.data.seq_length=128 \ - ++model.global_batch_size=${GBS} \ + pretrained_checkpoint.restore_from_path=${PRETRAINED_CHECKPOINT_NEMO_FILE} \ + exp_manager.create_checkpoint_callback=False \ + exp_manager.explicit_log_dir=${RESULTS_DIR} \ + ++model.tensor_model_parallel_size=1 \ + ++model.pipeline_model_parallel_size=1 \ + ++model.global_batch_size=4 \ ++model.micro_batch_size=1 \ ++model.mcore_gpt=true \ ++model.megatron_amp_O2=true \ - ++model.dpo.ref_policy_kl_penalty=${KL} \ + ++model.dpo.ref_policy_kl_penalty=0.1 \ ++model.dpo.log_prob_forward_micro_batch_size=1 \ ++model.dpo.average_log_probs=false \ ++model.dpo.sft_loss_weight=0.1 \ ++model.dpo.preference_loss_weight=1.0 \ - pretrained_checkpoint.restore_from_path=${PRETRAINED_CHECKPOINT_NEMO_FILE} \ - "model.data.data_prefix={train: [${TRAIN_DATA_PATH}], validation: [${VALID_DATA_PATH}], test: [${VALID_DATA_PATH}]}" \ - exp_manager.create_checkpoint_callback=False \ + ++model.activations_checkpoint_granularity=full \ + ++model.activations_checkpoint_method=uniform \ + ++model.activations_checkpoint_num_layers=1 \ + ++model.dist_ckpt_load_strictness=log_all \ + ++model.data.data_impl=jsonl \ + ++model.data.seq_length=128 \ model.data.num_workers=2 \ - ++model.tensor_model_parallel_size=1 \ - ++model.pipeline_model_parallel_size=1 \ + "model.data.data_prefix={train: [${TRAIN_DATA_PATH}], validation: [${VALID_DATA_PATH}], test: [${VALID_DATA_PATH}]}" \ trainer.dpo.max_steps=3 \ trainer.dpo.val_check_interval=3 \ trainer.dpo.limit_val_batches=8 \ trainer.dpo.save_interval=0 \ - exp_manager.explicit_log_dir=${RESULTS_DIR} \ - ++model.activations_checkpoint_granularity=full \ - ++model.activations_checkpoint_method=uniform \ - ++model.activations_checkpoint_num_layers=1 \ - ++model.dist_ckpt_load_strictness=log_all + "$@" } log_file=$(mktemp /tmp/dpo-log-XXXXXX) -dpo | tee $log_file \ No newline at end of file +dpo "$@" | tee $log_file +echo "[Finished] $0" diff --git a/tests/functional/test_cases/dpo-llama3 b/tests/functional/test_cases/dpo-llama3 index 8e40e94c8..f841ab8b0 100755 --- a/tests/functional/test_cases/dpo-llama3 +++ b/tests/functional/test_cases/dpo-llama3 @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,4 +20,6 @@ cd $SCRIPT_DIR set -eoux pipefail PRETRAINED_CHECKPOINT_NEMO_FILE=${ALIGNER_CI_DIR}/checkpoints/tiny-llama3-results-nlayers2-hidden128-ffn448-nhead4-qgroup2-megatron_gpt.nemo \ -bash ../dpo.sh +bash ../dpo.sh \ + ++model.optim.name=mcore_distributed_optim \ + 2>&1 | tee $(basename $0).log diff --git a/tests/functional/test_cases/dpo-mixtral-ep b/tests/functional/test_cases/dpo-mixtral-ep new file mode 100755 index 000000000..79f6ffd1d --- /dev/null +++ b/tests/functional/test_cases/dpo-mixtral-ep @@ -0,0 +1,27 @@ +#!/bin/bash + +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd $SCRIPT_DIR + +set -eoux pipefail + +PRETRAINED_CHECKPOINT_NEMO_FILE=$ALIGNER_CI_DIR/checkpoints/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \ +bash ../dpo.sh \ + ++model.optim.name=mcore_distributed_optim \ + ++model.expert_model_parallel_size=2 \ + 2>&1 | tee $(basename $0).log + diff --git a/tests/functional/test_cases/dpo-mixtral-peft-tp-sp b/tests/functional/test_cases/dpo-mixtral-peft-tp-sp new file mode 100755 index 000000000..5350d42a3 --- /dev/null +++ b/tests/functional/test_cases/dpo-mixtral-peft-tp-sp @@ -0,0 +1,31 @@ +#!/bin/bash + +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd $SCRIPT_DIR + +set -eoux pipefail + +PRETRAINED_CHECKPOINT_NEMO_FILE=$ALIGNER_CI_DIR/checkpoints/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \ +bash ../dpo.sh \ + ++model.optim.name=mcore_distributed_optim \ + ++model.tensor_model_parallel_size=2 \ + ++model.expert_model_parallel_size=1 \ + ++model.sequence_parallel=True \ + ++model.tp_comm_overlap_disable_qkv=True \ + model.data.pad_length_to_multiple_of=2 \ + model.peft.peft_scheme=lora \ + 2>&1 | tee $(basename $0).log
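Why the trainers now call self.optimizer.step(closure=None): torch.optim.Optimizer.step() declares closure as an optional argument, but some of the optimizer wrappers NeMo returns (the case NeMo#11189 fixes) declare step(self, closure) with no default, so a bare step() raises TypeError. Passing closure=None explicitly works with both signatures. A minimal self-contained sketch of the failure mode; StrictStepSGD is a hypothetical stand-in for such a wrapper, not a real NeMo class:

    import torch

    class StrictStepSGD(torch.optim.SGD):
        # Hypothetical wrapper that, like the optimizers NeMo#11189 fixes,
        # declares `closure` without a default value.
        def step(self, closure):
            if closure is not None:
                closure()
            return super().step()

    opt = StrictStepSGD([torch.nn.Parameter(torch.zeros(2))], lr=0.1)

    # opt.step()            # TypeError: step() missing required argument: 'closure'
    opt.step(closure=None)  # accepted by both strict and default signatures

The same reasoning applies to every optimizer.step call touched in this patch (dpo.py, ppo.py, rs.py, spin.py, supervised.py, and critic_server_trainer.py).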