From 9d7735a669dfd31e5634f760253db7b042dc615d Mon Sep 17 00:00:00 2001
From: ExponentialML
Date: Fri, 4 Aug 2023 16:34:50 -0700
Subject: [PATCH 1/2] Simplify training loop

---
 train.py | 36 +++++++++++-------------------------
 1 file changed, 11 insertions(+), 25 deletions(-)

diff --git a/train.py b/train.py
index 592a761..ce98a05 100644
--- a/train.py
+++ b/train.py
@@ -784,40 +784,26 @@ def finetune_unet(batch, train_encoder=False):
 
         else:
             raise ValueError(f"Unknown prediction type {noise_scheduler.prediction_type}")
-
-        # Here we do two passes for video and text training.
-        # If we are on the second iteration of the loop, get one frame.
         # This allows us to train text information only on the spatial layers.
-        losses = []
         should_truncate_video = (video_length > 1 and text_trainable)
+        should_detach = video_length > 1
 
         # We detach the encoder hidden states for the first pass (video frames > 1)
         # Then we make a clone of the initial state to ensure we can train it in the loop.
         detached_encoder_state = encoder_hidden_states.clone().detach()
         trainable_encoder_state = encoder_hidden_states.clone()
 
-        for i in range(2):
-
-            should_detach = noisy_latents.shape[2] > 1 and i == 0
-
-            if should_truncate_video and i == 1:
-                noisy_latents = noisy_latents[:,:,1,:,:].unsqueeze(2)
-                target = target[:,:,1,:,:].unsqueeze(2)
-
-            encoder_hidden_states = (
-                detached_encoder_state if should_detach else trainable_encoder_state
-            )
-
-            model_pred = unet(noisy_latents, timesteps, encoder_hidden_states=encoder_hidden_states).sample
-            loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
-
-            losses.append(loss)
-
-            # This was most likely single frame training or a single image.
-            if video_length == 1 and i == 0: break
-
-        loss = losses[0] if len(losses) == 1 else losses[0] + losses[1]
+        if should_truncate_video:
+            noisy_latents = noisy_latents[:,:,:1, ...]
+            target = target[:,:,:1, ...]
+
+        encoder_hidden_states = (
+            detached_encoder_state if should_detach else trainable_encoder_state
+        )
+
+        model_pred = unet(noisy_latents, timesteps, encoder_hidden_states=encoder_hidden_states).sample
+        loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
         return loss, latents
 
     for epoch in range(first_epoch, num_train_epochs):
 
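Taken on its own, the collapsed loop in PATCH 1/2 amounts to a single forward pass whose inputs are adjusted up front. Below is a minimal standalone sketch of that single-pass loss, not the patch itself: the function name and arguments are hypothetical stand-ins, `unet` is assumed to be any callable returning an object with a `.sample` tensor, and latents are assumed to have shape (batch, channels, frames, height, width).

    import torch.nn.functional as F

    def single_pass_loss(unet, noisy_latents, target, timesteps,
                         encoder_hidden_states, text_trainable):
        # Frame count is read before any truncation, so it drives
        # both the truncation and the detach decision below.
        video_length = noisy_latents.shape[2]

        # Train text information only on the spatial layers:
        # keep a single frame when the text encoder is trainable.
        if video_length > 1 and text_trainable:
            noisy_latents = noisy_latents[:, :, :1, ...]
            target = target[:, :, :1, ...]

        # Detach the encoder state for multi-frame clips; gradients
        # reach the text encoder only for single-frame inputs.
        if video_length > 1:
            encoder_hidden_states = encoder_hidden_states.detach()

        model_pred = unet(noisy_latents, timesteps,
                          encoder_hidden_states=encoder_hidden_states).sample
        return F.mse_loss(model_pred.float(), target.float(), reduction="mean")

One consequence visible in the sketch: because `video_length` is read before truncation, a truncated text pass still uses the detached state, so the text encoder never sees gradients from multi-frame batches. PATCH 2/2 below removes the truncation branch entirely.
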
From 8af9284e60311e11f5cc30caf4c595ea9a6b8345 Mon Sep 17 00:00:00 2001
From: ExponentialML
Date: Fri, 4 Aug 2023 17:45:57 -0700
Subject: [PATCH 2/2] Remove video truncation

---
 train.py | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/train.py b/train.py
index ce98a05..69dfae0 100644
--- a/train.py
+++ b/train.py
@@ -706,16 +706,21 @@ def main(
     # Only show the progress bar once on each machine.
     progress_bar = tqdm(range(global_step, max_train_steps), disable=not accelerator.is_local_main_process)
     progress_bar.set_description("Steps")
+
+    unet_train_enabled = False
+    text_train_enabled = False
 
     def finetune_unet(batch, train_encoder=False):
         nonlocal use_offset_noise
         nonlocal rescale_schedule
-
+        nonlocal unet_train_enabled
+        nonlocal text_train_enabled
+
         # Check if we are training the text encoder
-        text_trainable = (train_text_encoder or lora_manager.use_text_lora)
+        text_trainable = (train_text_encoder or use_text_lora)
 
         # Unfreeze UNET Layers
-        if global_step == 0:
+        if global_step == 0 and not unet_train_enabled:
            already_printed_trainables = False
            unet.train()
            handle_trainable_modules(
@@ -724,6 +729,7 @@
                 is_enabled=True,
                 negation=unet_negation
             )
+            unet_train_enabled = True
 
         # Convert videos to latent space
         pixel_values = batch["pixel_values"]
@@ -736,9 +742,6 @@
         # Get video length
         video_length = latents.shape[2]
 
-        # Sample noise that we'll add to the latents
-        use_offset_noise = use_offset_noise and not rescale_schedule
-        noise = sample_noise(latents, offset_noise_strength, use_offset_noise)
         bsz = latents.shape[0]
 
         # Sample a random timestep for each video
@@ -747,10 +750,16 @@
 
         # Add noise to the latents according to the noise magnitude at each timestep
         # (this is the forward diffusion process)
+        #latents = rearrange(latents, 'b c f h w -> (b f) c h w')
+
+        # Sample noise that we'll add to the latents
+        use_offset_noise = use_offset_noise and not rescale_schedule
+        noise = sample_noise(latents, offset_noise_strength, use_offset_noise)
+
         noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
-
+
         # Enable text encoder training
-        if text_trainable:
+        if text_trainable and not text_train_enabled:
             text_encoder.train()
 
             if lora_manager.use_text_lora:
@@ -763,6 +772,7 @@
                 negation=text_encoder_negation
             )
             cast_to_gpu_and_type([text_encoder], accelerator, torch.float32)
+            text_train_enabled = True
 
         # *Potentially* Fixes gradient checkpointing training.
         # See: https://github.com/prigoyal/pytorch_memonger/blob/master/tutorial/Checkpointing_for_PyTorch_models.ipynb
@@ -783,9 +793,8 @@
 
         else:
             raise ValueError(f"Unknown prediction type {noise_scheduler.prediction_type}")
-
+
         # This allows us to train text information only on the spatial layers.
-        should_truncate_video = (video_length > 1 and text_trainable)
         should_detach = video_length > 1
 
         # We detach the encoder hidden states for the first pass (video frames > 1)
@@ -793,10 +802,6 @@
         detached_encoder_state = encoder_hidden_states.clone().detach()
         trainable_encoder_state = encoder_hidden_states.clone()
 
-        if should_truncate_video:
-            noisy_latents = noisy_latents[:,:,:1, ...]
-            target = target[:,:,:1, ...]
-
         encoder_hidden_states = (
             detached_encoder_state if should_detach else trainable_encoder_state
         )
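Besides removing the truncation branch, PATCH 2/2 introduces `unet_train_enabled` / `text_train_enabled` guards so the train-mode setup runs once rather than on every call where the condition happens to hold. A minimal sketch of that run-once pattern in isolation, assuming hypothetical stand-in `unet` and `text_encoder` modules (the real function also handles LoRA injection, trainable-module selection, and dtype casting, omitted here):

    import torch.nn as nn

    def make_finetune_step(unet: nn.Module, text_encoder: nn.Module):
        unet_train_enabled = False
        text_train_enabled = False

        def finetune_step(global_step: int, text_trainable: bool):
            nonlocal unet_train_enabled, text_train_enabled

            # Unfreeze the UNet exactly once, instead of re-running
            # the setup every time the step counter reads 0.
            if global_step == 0 and not unet_train_enabled:
                unet.train()
                unet_train_enabled = True

            # Likewise, switch the text encoder to train mode only once.
            if text_trainable and not text_train_enabled:
                text_encoder.train()
                text_train_enabled = True

        return finetune_step

The closure mirrors the patch's own style: `finetune_unet` is nested inside `main`, so the flags live in the enclosing scope and are updated through `nonlocal`, exactly as the added lines above do.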