From e00cf7ecdfb7ee3a65eac27f99a00a0695c1f510 Mon Sep 17 00:00:00 2001 From: abdullah-ibm Date: Tue, 22 Oct 2024 12:05:43 +0300 Subject: [PATCH 1/4] added some safeguards to prevent the code from proceeding when the necessary imports are not available Signed-off-by: Harthi7 Signed-off-by: abdullah-ibm --- src/instructlab/training/main_ds.py | 34 +++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index d1ae0e01..1b017c82 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -12,10 +12,27 @@ # Third Party from accelerate import Accelerator -from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam -from deepspeed.runtime.zero.utils import ZeRORuntimeException + +try: + from deepspeed.ops.adam import DeepSpeedCPUAdam +except ImportError: + DeepSpeedCPUAdam = None + local_rank = int(os.getenv('LOCAL_RANK', None)) + if __name__ == '__main__' and (not local_rank or local_rank == 0): + print("DeepSpeed CPU Optimizer is not available. Some features may be unavailable.") + +try: + from deepspeed.ops.adam import FusedAdam + from deepspeed.runtime.zero.utils import ZeRORuntimeException +except ImportError: + FusedAdam = None + ZeRORuntimeException = None + local_rank = int(os.getenv('LOCAL_RANK', None)) + if __name__ == '__main__' and (not local_rank or local_rank == 0): + print("DeepSpeed is not available. Some features may be unavailable.") # pylint: disable=no-name-in-module +from instructlab.training.confg import DistributedBackend from instructlab.dolomite.hf_models import GPTDolomiteForCausalLM from tqdm import tqdm from transformers import AutoModelForCausalLM, get_scheduler @@ -513,6 +530,12 @@ def main(args): # Third Party import yaml + if args.distributed_training_framework == 'deepspeed' and not FusedAdam: + raise ImportError("DeepSpeed was selected but we cannot import the `FusedAdam` optimizer") + + if args.distributed_training_framework == 'deeppeed' and args.cpu_offload_optimizer and not DeepSpeedCPUAdam: + raise ImportError("DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported. This likely means you need to build DeepSpeed with the CPU adam flags.") + metric_logger = AsyncStructuredLogger( args.output_dir + f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" @@ -733,6 +756,13 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: ) # deepspeed options + if train_args.distributed_backend == DistributedBackend.DeepSpeed: + if not FusedAdam: + raise ImportError("DeepSpeed was selected as the distributed backend, but FusedAdam could not be imported. Please double-check that DeepSpeed is installed correctly") + + if train_args.deepspeed_options.cpu_offload_optimizer and not DeepSpeedCPUAdam: + raise ImportError("DeepSpeed CPU offloading was enabled, but DeepSpeedCPUAdam could not be imported. This is most likely because DeepSpeed was not built with CPU Adam. Please rebuild DeepSpeed to have CPU Adam, or disable CPU offloading.") + if train_args.deepspeed_options.save_samples: command.append(f"--save_samples_ds={train_args.deepspeed_options.save_samples}") if train_args.deepspeed_options.cpu_offload_optimizer: From d0e944ffbeb506565cbaba1ec62717afc2675390 Mon Sep 17 00:00:00 2001 From: abdullah-ibm Date: Wed, 30 Oct 2024 11:54:15 +0300 Subject: [PATCH 2/4] typo fix Signed-off-by: abdullah-ibm --- src/instructlab/training/main_ds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index 1b017c82..3a7e1225 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -533,7 +533,7 @@ def main(args): if args.distributed_training_framework == 'deepspeed' and not FusedAdam: raise ImportError("DeepSpeed was selected but we cannot import the `FusedAdam` optimizer") - if args.distributed_training_framework == 'deeppeed' and args.cpu_offload_optimizer and not DeepSpeedCPUAdam: + if args.distributed_training_framework == 'deepspeed' and args.cpu_offload_optimizer and not DeepSpeedCPUAdam: raise ImportError("DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported. This likely means you need to build DeepSpeed with the CPU adam flags.") metric_logger = AsyncStructuredLogger( From 12c67d199ac3cbb147f01845763fc0442fc3ffad Mon Sep 17 00:00:00 2001 From: abdullah-ibm Date: Fri, 1 Nov 2024 17:28:36 +0300 Subject: [PATCH 3/4] fixed pylint error Signed-off-by: Harthi7 Signed-off-by: abdullah-ibm --- src/instructlab/training/main_ds.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index fb7427fa..973a7e9c 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -536,8 +536,8 @@ def main(args): raise ImportError("DeepSpeed was selected but we cannot import the `FusedAdam` optimizer") if args.distributed_training_framework == 'deepspeed' and args.cpu_offload_optimizer and not DeepSpeedCPUAdam: - raise ImportError("DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported. This likely means you need to build DeepSpeed with the CPU adam flags.") - + raise ImportError("DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported. This likely means you need to build DeepSpeed with the CPU adam flags.") + metric_logger = AsyncStructuredLogger( args.output_dir + f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" @@ -760,7 +760,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: if train_args.deepspeed_options.cpu_offload_optimizer and not DeepSpeedCPUAdam: raise ImportError("DeepSpeed CPU offloading was enabled, but DeepSpeedCPUAdam could not be imported. This is most likely because DeepSpeed was not built with CPU Adam. Please rebuild DeepSpeed to have CPU Adam, or disable CPU offloading.") - + if train_args.deepspeed_options.save_samples: command.append(f"--save_samples_ds={train_args.deepspeed_options.save_samples}") if train_args.deepspeed_options.cpu_offload_optimizer: From 5417952a86db74fef902158721d5caa47d18df95 Mon Sep 17 00:00:00 2001 From: abdullah-ibm Date: Thu, 7 Nov 2024 13:38:39 +0300 Subject: [PATCH 4/4] fix: pylint fixes Signed-off-by: abdullah-ibm --- src/instructlab/training/main_ds.py | 44 ++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index 54d80e5e..72d8357b 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -15,25 +15,28 @@ from accelerate import Accelerator try: + # Third Party from deepspeed.ops.adam import DeepSpeedCPUAdam except ImportError: DeepSpeedCPUAdam = None - local_rank = int(os.getenv('LOCAL_RANK', None)) - if __name__ == '__main__' and (not local_rank or local_rank == 0): - print("DeepSpeed CPU Optimizer is not available. Some features may be unavailable.") + local_rank = int(os.getenv("LOCAL_RANK", "0")) + if __name__ == "__main__" and (not local_rank or local_rank == 0): + print( + "DeepSpeed CPU Optimizer is not available. Some features may be unavailable." + ) try: + # Third Party from deepspeed.ops.adam import FusedAdam from deepspeed.runtime.zero.utils import ZeRORuntimeException except ImportError: FusedAdam = None ZeRORuntimeException = None - local_rank = int(os.getenv('LOCAL_RANK', None)) - if __name__ == '__main__' and (not local_rank or local_rank == 0): + local_rank = int(os.getenv("LOCAL_RANK", "0")) + if __name__ == "__main__" and (not local_rank or local_rank == 0): print("DeepSpeed is not available. Some features may be unavailable.") -# pylint: disable=no-name-in-module -from instructlab.training.confg import DistributedBackend +# Third Party from instructlab.dolomite.hf_models import GPTDolomiteForCausalLM from tqdm import tqdm from transformers import AutoModelForCausalLM, get_scheduler @@ -43,6 +46,8 @@ # First Party from instructlab.training import config from instructlab.training.async_logger import AsyncStructuredLogger + +# pylint: disable=no-name-in-module from instructlab.training.config import ( DataProcessArgs, DistributedBackend, @@ -533,11 +538,19 @@ def main(args): # Third Party import yaml - if args.distributed_training_framework == 'deepspeed' and not FusedAdam: - raise ImportError("DeepSpeed was selected but we cannot import the `FusedAdam` optimizer") + if args.distributed_training_framework == "deepspeed" and not FusedAdam: + raise ImportError( + "DeepSpeed was selected but we cannot import the `FusedAdam` optimizer" + ) - if args.distributed_training_framework == 'deepspeed' and args.cpu_offload_optimizer and not DeepSpeedCPUAdam: - raise ImportError("DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported. This likely means you need to build DeepSpeed with the CPU adam flags.") + if ( + args.distributed_training_framework == "deepspeed" + and args.cpu_offload_optimizer + and not DeepSpeedCPUAdam + ): + raise ImportError( + "DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported. This likely means you need to build DeepSpeed with the CPU adam flags." + ) metric_logger = AsyncStructuredLogger( args.output_dir @@ -761,11 +774,14 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: # deepspeed options if train_args.distributed_backend == DistributedBackend.DeepSpeed: if not FusedAdam: - raise ImportError("DeepSpeed was selected as the distributed backend, but FusedAdam could not be imported. Please double-check that DeepSpeed is installed correctly") + raise ImportError( + "DeepSpeed was selected as the distributed backend, but FusedAdam could not be imported. Please double-check that DeepSpeed is installed correctly" + ) if train_args.deepspeed_options.cpu_offload_optimizer and not DeepSpeedCPUAdam: - raise ImportError("DeepSpeed CPU offloading was enabled, but DeepSpeedCPUAdam could not be imported. This is most likely because DeepSpeed was not built with CPU Adam. Please rebuild DeepSpeed to have CPU Adam, or disable CPU offloading.") - + raise ImportError( + "DeepSpeed CPU offloading was enabled, but DeepSpeedCPUAdam could not be imported. This is most likely because DeepSpeed was not built with CPU Adam. Please rebuild DeepSpeed to have CPU Adam, or disable CPU offloading." + ) if train_args.deepspeed_options.save_samples: command.append(f"--save_samples_ds={train_args.deepspeed_options.save_samples}") if train_args.deepspeed_options.cpu_offload_optimizer: