Added some safeguards when the necessary imports are not available #291

Merged
52 changes: 49 additions & 3 deletions src/instructlab/training/main_ds.py
@@ -13,10 +13,30 @@

# Third Party
from accelerate import Accelerator
from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam
from deepspeed.runtime.zero.utils import ZeRORuntimeException

# pylint: disable=no-name-in-module
try:
Contributor:
You separated these try-except blocks because DeepSpeedCPUAdam is only imported for CPU-offload training, correct? It would be appreciated if this were documented, since you're repeating code.

Contributor Author:
Yes, I believe that is correct, as @RobotSail stated in a previous comment (#267 (comment)). As for the documentation, how should we move forward?

Member:
Correct

    # Third Party
    # DeepSpeedCPUAdam is only needed when the optimizer is offloaded to CPU,
    # so it is guarded separately from the core DeepSpeed imports.
    from deepspeed.ops.adam import DeepSpeedCPUAdam
except ImportError:
    DeepSpeedCPUAdam = None
    local_rank = int(os.getenv("LOCAL_RANK", "0"))
    # Warn only from rank 0 to avoid duplicate messages across processes.
    if __name__ == "__main__" and (not local_rank or local_rank == 0):
        print(
            "DeepSpeed CPU Optimizer is not available. Some features may be unavailable."
        )

try:
    # Third Party
    from deepspeed.ops.adam import FusedAdam
    from deepspeed.runtime.zero.utils import ZeRORuntimeException
except ImportError:
    FusedAdam = None
    ZeRORuntimeException = None
    local_rank = int(os.getenv("LOCAL_RANK", "0"))
    if __name__ == "__main__" and (not local_rank or local_rank == 0):
        print("DeepSpeed is not available. Some features may be unavailable.")

# Third Party
from instructlab.dolomite.hf_models import GPTDolomiteForCausalLM
from tqdm import tqdm
from transformers import AutoModelForCausalLM, get_scheduler
@@ -26,6 +46,8 @@
# First Party
from instructlab.training import config
from instructlab.training.async_logger import AsyncStructuredLogger

# pylint: disable=no-name-in-module
from instructlab.training.config import (
DataProcessArgs,
DistributedBackend,
@@ -516,6 +538,20 @@ def main(args):
    # Third Party
    import yaml

    if args.distributed_training_framework == "deepspeed" and not FusedAdam:
        raise ImportError(
            "DeepSpeed was selected but we cannot import the `FusedAdam` optimizer"
        )

    if (
        args.distributed_training_framework == "deepspeed"
        and args.cpu_offload_optimizer
        and not DeepSpeedCPUAdam
    ):
        raise ImportError(
            "DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported. This likely means you need to build DeepSpeed with the CPU adam flags."
        )
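(A note on the error message above: DeepSpeed's CPU Adam op can typically be prebuilt by installing with the DS_BUILD_CPU_ADAM=1 environment variable set, e.g. DS_BUILD_CPU_ADAM=1 pip install deepspeed, per DeepSpeed's build documentation; this pointer is not part of the PR itself.)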

    metric_logger = AsyncStructuredLogger(
        args.output_dir
        + f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl"
@@ -736,6 +772,16 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
    )

    # deepspeed options
    if train_args.distributed_backend == DistributedBackend.DeepSpeed:
        if not FusedAdam:
            raise ImportError(
                "DeepSpeed was selected as the distributed backend, but FusedAdam could not be imported. Please double-check that DeepSpeed is installed correctly"
            )

        if train_args.deepspeed_options.cpu_offload_optimizer and not DeepSpeedCPUAdam:
            raise ImportError(
                "DeepSpeed CPU offloading was enabled, but DeepSpeedCPUAdam could not be imported. This is most likely because DeepSpeed was not built with CPU Adam. Please rebuild DeepSpeed to have CPU Adam, or disable CPU offloading."
            )
    if train_args.deepspeed_options.save_samples:
        command.append(f"--save_samples_ds={train_args.deepspeed_options.save_samples}")
    if train_args.deepspeed_options.cpu_offload_optimizer: