Skip to content

Commit

Permalink
added some safeguards to prevent the code from proceeding when the ne…
Browse files Browse the repository at this point in the history
…cessary imports are not available

Signed-off-by: Harthi7 <[email protected]>
Signed-off-by: abdullah-ibm <[email protected]>
  • Loading branch information
abdullah-ibm committed Oct 22, 2024
1 parent 7b7894b commit 2615349
Showing 1 changed file with 32 additions and 2 deletions.
34 changes: 32 additions & 2 deletions src/instructlab/training/main_ds.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,27 @@

# Third Party
from accelerate import Accelerator
from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam
from deepspeed.runtime.zero.utils import ZeRORuntimeException

# DeepSpeed is treated as optional here: each symbol that fails to import is
# bound to None so later code can test for availability before using it.
try:
    from deepspeed.ops.adam import DeepSpeedCPUAdam
except ImportError:
    DeepSpeedCPUAdam = None
    # LOCAL_RANK may be unset (or empty) when this module is imported outside
    # a distributed launcher; fall back to 0 instead of crashing in int(None).
    local_rank = int(os.getenv("LOCAL_RANK") or 0)
    # Only announce once (rank 0), and only when run as a script.
    if __name__ == "__main__" and local_rank == 0:
        print(
            "DeepSpeed CPU Optimizer is not available. Some features may be unavailable."
        )

try:
    from deepspeed.ops.adam import FusedAdam
    from deepspeed.runtime.zero.utils import ZeRORuntimeException
except ImportError:
    # If either import fails, both names are reset so they stay consistent.
    FusedAdam = None
    ZeRORuntimeException = None
    local_rank = int(os.getenv("LOCAL_RANK") or 0)
    if __name__ == "__main__" and local_rank == 0:
        print("DeepSpeed is not available. Some features may be unavailable.")

# pylint: disable=no-name-in-module
from instructlab.training.config import DistributedBackend

Check failure on line 35 in src/instructlab/training/main_ds.py

View workflow job for this annotation

GitHub Actions / pylint

E0401: Unable to import 'instructlab.training.confg' (import-error)
from instructlab.dolomite.hf_models import GPTDolomiteForCausalLM
from tqdm import tqdm
from transformers import AutoModelForCausalLM, get_scheduler
Expand Down Expand Up @@ -513,6 +530,12 @@ def main(args):
# Third Party
import yaml

# Fail fast when DeepSpeed was requested but its optimizers are unavailable
# (the names are falsy when they could not be imported).
if args.distributed_training_framework == "deepspeed" and not FusedAdam:
    raise ImportError(
        "DeepSpeed was selected but we cannot import the `FusedAdam` optimizer"
    )

# CPU offloading additionally needs DeepSpeedCPUAdam.
if (
    args.distributed_training_framework == "deepspeed"
    and args.cpu_offload_optimizer
    and not DeepSpeedCPUAdam
):
    raise ImportError(
        "DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported. This likely means you need to build DeepSpeed with the CPU adam flags."
    )

Check warning on line 537 in src/instructlab/training/main_ds.py

View workflow job for this annotation

GitHub Actions / pylint

C0303: Trailing whitespace (trailing-whitespace)

Check warning on line 538 in src/instructlab/training/main_ds.py

View workflow job for this annotation

GitHub Actions / pylint

C0303: Trailing whitespace (trailing-whitespace)
metric_logger = AsyncStructuredLogger(
args.output_dir
+ f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl"
Expand Down Expand Up @@ -733,6 +756,13 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
)

# deepspeed options
if train_args.distributed_backend == DistributedBackend.DeepSpeed:
if not FusedAdam:
raise ImportError("DeepSpeed was selected as the distributed backend, but FusedAdam could not be imported. Please double-check that DeepSpeed is installed correctly")

if train_args.deepspeed_options.cpu_offload_optimizer and not DeepSpeedCPUAdam:
raise ImportError("DeepSpeed CPU offloading was enabled, but DeepSpeedCPUAdam could not be imported. This is most likely because DeepSpeed was not built with CPU Adam. Please rebuild DeepSpeed to have CPU Adam, or disable CPU offloading.")

Check warning on line 765 in src/instructlab/training/main_ds.py

View workflow job for this annotation

GitHub Actions / pylint

C0303: Trailing whitespace (trailing-whitespace)
if train_args.deepspeed_options.save_samples:
command.append(f"--save_samples_ds={train_args.deepspeed_options.save_samples}")
if train_args.deepspeed_options.cpu_offload_optimizer:
Expand Down

0 comments on commit 2615349

Please sign in to comment.