Merge pull request #163 from ServiceNow/deepspeed_vllm
various fixes
rizar authored Jan 8, 2025
2 parents 1c7b16a + 695f006 commit 10ec04b
Showing 4 changed files with 9 additions and 31 deletions.
examples/rl_gsm8k/orchestrate_rl.py (2 changes: 1 addition & 1 deletion)
@@ -472,7 +472,7 @@ def main(cfg: DictConfig):
     finetune_cfg = cfg.copy()

     checkpoint_steps = finetune_cfg.finetune.save_checkpoint_steps
-    interrupt_train_steps = int((state["iteration"] + 1) * checkpoint_steps - 1)
+    interrupt_train_steps = int((state["iteration"] + 1) * checkpoint_steps)

     finetune_cfg.finetune.interrupt_train_steps = interrupt_train_steps
     finetune_cfg.output_dir = str(finetune_path)
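A small worked example of the arithmetic in the one-line change above. The values are illustrative, and the reading that training should now be interrupted at the checkpoint step itself rather than one step earlier is inferred from the diff, not stated in the commit message:

# Illustrative values: in orchestrate_rl.py these come from the finetune
# config (save_checkpoint_steps) and the run state (state["iteration"]).
checkpoint_steps = 10
state = {"iteration": 0}

old_interrupt = int((state["iteration"] + 1) * checkpoint_steps - 1)  # 9
new_interrupt = int((state["iteration"] + 1) * checkpoint_steps)      # 10
print(old_interrupt, new_interrupt)  # 9 10
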
examples/rl_gsm8k/requirements.txt (2 changes: 1 addition & 1 deletion)
@@ -1 +1 @@
-vllm==0.6.3
+vllm==0.6.6.post1
requirements.finetune.txt (5 changes: 2 additions & 3 deletions)
@@ -1,9 +1,8 @@
-accelerate==1.0.1
+accelerate==1.2.0
 datasets==2.21.0
-deepspeed==0.15.1
+deepspeed==0.15.4
 numpy==1.26.4
 peft==0.12.0
 tokenizers==0.20.1
 transformers==4.45.2
 wandb==0.19.1
-vllm==0.6.1
tapeagents/finetune/checkpoints.py (31 changes: 5 additions & 26 deletions)
@@ -326,39 +326,14 @@ def save_model_only(

logger.info(f"Save model to {output_dir}")

if is_deepspeed_model(model):
logger.info(f"Saving through deepspeed engine path {output_dir}")
# saving using DeepSpeed's checkpoint mechanism
model.save_checkpoint(save_dir=output_dir)

# convert to HF format on main process
if accelerator.is_main_process:
from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict
logger.info("Converting DeepSpeed checkpoint to HF format")

convert_zero_checkpoint_to_fp32_state_dict(
checkpoint_dir=output_dir,
output_dir=output_dir,
tag=None, # will use 'global_step{step}' from DeepSpeed
safe_serialization=safe_serialization
)

# save model config
logger.info("Save model config (config.json)")
unwrapped_model = model.module
config = unwrapped_model.config
config.save_pretrained(output_dir)

logger.info(f"Saved converted checkpoint to {output_dir}")
return

unwrapped_model = accelerator.unwrap_model(model) if unwrap else model
if lora:
lora_save(output_dir, unwrapped_model)
return

# for non-deepspeed models
elif isinstance(unwrapped_model, transformers.PreTrainedModel):
logger.info("Saving model using transformers save_pretrained")
unwrapped_model.save_pretrained( # type: ignore
output_dir,
is_main_process=accelerator.is_main_process,
@@ -369,6 +344,10 @@ def save_model_only(
logger.info(f"Saved model to {output_dir}")
else:
raise ValueError(f"model is neither a deepspeed model nor a transformers.PreTrainedModel: {type(model)}")

if os.path.exists(output_dir / "model.safetensors") and os.path.exists(output_dir / "model.safetensors.index.json"):
logger.info("Hide model.safetensors because it utterly confuses the HF model loading code")
os.rename(output_dir / "model.safetensors", output_dir / "model.safetensors.bak")


def save_tokenizer_only(
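For context, here is a minimal standalone sketch of the workaround that the second hunk adds at the end of save_model_only. The helper name hide_monolithic_safetensors, the pathlib-based signature, and the example path are assumptions made for this illustration; only the rename itself comes from the commit, and the rationale is the commit's own log message about model.safetensors confusing the HF model loading code:

import logging
import os
from pathlib import Path

logger = logging.getLogger(__name__)

def hide_monolithic_safetensors(output_dir: Path) -> None:
    # Hypothetical helper mirroring the added block: when both a monolithic
    # model.safetensors and a sharded-checkpoint index file are present in the
    # same directory, rename the monolithic file out of the way so that HF
    # loading code only sees the sharded checkpoint.
    monolithic = output_dir / "model.safetensors"
    index = output_dir / "model.safetensors.index.json"
    if os.path.exists(monolithic) and os.path.exists(index):
        logger.info("Hide model.safetensors because it confuses the HF model loading code")
        os.rename(monolithic, output_dir / "model.safetensors.bak")

# Example usage with a hypothetical checkpoint directory:
# hide_monolithic_safetensors(Path("outputs/finetune/checkpoint"))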
