diff --git a/src/models/mutual_self_attention.py b/src/models/mutual_self_attention.py index b5d4a7f..e3e57e9 100644 --- a/src/models/mutual_self_attention.py +++ b/src/models/mutual_self_attention.py @@ -80,13 +80,11 @@ def register_reference_hooks( [1] * batch_size * num_images_per_prompt * 16 + [0] * batch_size * num_images_per_prompt * 16 ) - .to(device) .bool() ) else: uc_mask = ( torch.Tensor([0] * batch_size * num_images_per_prompt * 2) - .to(device) .bool() ) @@ -170,7 +168,6 @@ def hacked_basic_transformer_inner_forward( [1] * (hidden_states.shape[0] // 2) + [0] * (hidden_states.shape[0] // 2) ) - .to(device) .bool() ) hidden_states_c[_uc_mask] = ( diff --git a/webgui.py b/webgui.py index 5e46781..b12eeb4 100644 --- a/webgui.py +++ b/webgui.py @@ -67,7 +67,7 @@ ############# model_init started ############# ## vae init -vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to("cuda", dtype=weight_dtype) +vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to(device, dtype=weight_dtype) ## reference net init reference_unet = UNet2DConditionModel.from_pretrained( @@ -101,7 +101,7 @@ denoising_unet.load_state_dict(torch.load(config.denoising_unet_path, map_location="cpu"), strict=False) ## face locator init -face_locator = FaceLocator(320, conditioning_channels=1, block_out_channels=(16, 32, 96, 256)).to(dtype=weight_dtype, device="cuda") +face_locator = FaceLocator(320, conditioning_channels=1, block_out_channels=(16, 32, 96, 256)).to(dtype=weight_dtype, device=device) face_locator.load_state_dict(torch.load(config.face_locator_path)) ## load audio processor params @@ -122,7 +122,7 @@ audio_guider=audio_processor, face_locator=face_locator, scheduler=scheduler, -).to("cuda", dtype=weight_dtype) +).to(device, dtype=weight_dtype) def select_face(det_bboxes, probs): ## max face from faces that the prob is above 0.8 @@ -170,7 +170,7 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, seed, fac face_mask = cv2.resize(face_mask, (width, height)) ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]]) - face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0 + face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device=device).unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0 video = pipe( ref_image_pil,