diff --git a/tripy/examples/diffusion/example.py b/tripy/examples/diffusion/example.py index 0bba0bff0..179186c67 100644 --- a/tripy/examples/diffusion/example.py +++ b/tripy/examples/diffusion/example.py @@ -1,6 +1,22 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import argparse, os from tqdm import tqdm -from pathlib import Path from PIL import Image import time @@ -15,7 +31,8 @@ def compile_model(model, inputs, verbose=False): if verbose: - print(f"Compiling {model.__class__.__name__}...", end=' ') + name = model.__class__.__name__ if isinstance(model, tp.Module) else model.__name__ + print(f"[I] Compiling {name}...", end=' ', flush=True) compile_start_time = time.perf_counter() compiler = tp.Compiler(model) @@ -54,57 +71,14 @@ def compile_vae(model, verbose=False): return compile_model(model, inputs, verbose=verbose) -# def compile_CLIP(model, verbose=False): -# if verbose: -# print("Compiling CLIP model...") -# clip_compile_start_time = time.perf_counter() - -# clip_compiler = tp.Compiler(model) -# compiled_clip = clip_compiler.compile(tp.InputInfo((1, 77), dtype=tp.int32)) - -# if verbose: -# clip_compile_end_time = time.perf_counter() -# print(f"Compilation of CLIP took {clip_compile_end_time - clip_compile_start_time} seconds.") - -# return compiled_clip - - -# def compile_unet(model, verbose=False): -# if verbose: -# print("Compiling UNet...") -# unet_compile_start_time = time.perf_counter() - -# compiler = tp.Compiler(model) -# unconditional_context_shape = (1, 77, 768) -# conditional_context_shape = (1, 77, 768) -# latent_shape = (1, 4, 64, 64) -# compiled_model = compiler.compile( -# tp.InputInfo(unconditional_context_shape, dtype=tp.float32), -# tp.InputInfo(conditional_context_shape, dtype=tp.float32), -# tp.InputInfo(latent_shape, dtype=tp.float32), -# tp.InputInfo((1,), dtype=tp.float32), -# tp.InputInfo((1,), dtype=tp.float32), -# tp.InputInfo((1,), dtype=tp.float32), -# tp.InputInfo((1,), dtype=tp.float32), -# ) - -# if verbose: -# unet_compile_end_time = time.perf_counter() -# print(f"Compilation of UNet took {unet_compile_end_time - unet_compile_start_time} seconds.") - -# return compiled_model - - def run_diffusion_loop(model, unconditional_context, context, latent, steps, guidance): - timesteps = list(range(1, 1000, 1000 // steps))[::-1] - # print(f"t: {timesteps}") + timesteps = list(range(1, 1000, 1000 // steps)) + print(f"[I] Running diffusion for {timesteps} timesteps...") alphas = get_alphas_cumprod()[tp.Tensor(timesteps)] alphas_prev = tp.concatenate([tp.Tensor([1.0]), alphas[:-1]], dim=0) - # print(f"a: {alphas}") - # print(f"aP: {alphas_prev}") - # unet_run_start = time.perf_counter() - for index, timestep in enumerate(timesteps): + for index, timestep in (t := tqdm(list(enumerate(timesteps))[::-1])): + t.set_description("idx: %1d, timestep: %3d" % (index, timestep)) tid = tp.Tensor([index]) latent = model( unconditional_context, @@ -115,141 +89,82 @@ def run_diffusion_loop(model, unconditional_context, context, latent, steps, gui alphas_prev[tid], tp.Tensor([guidance]), ) - # unet_run_end = time.perf_counter() - # print(f"Finished running diffusion. Inference took {unet_run_end - unet_run_start} seconds.") return latent def tripy_diffusion(args): - model = StableDiffusion() - load_from_diffusers(model, tp.float32, debug=True) - run_start_time = time.perf_counter() # if os.path.isdir("engines"): - # compiled_clip = tp.Executable.load(os.path.join("engines", "clip_executable.json")) - # compiled_unet = tp.Executable.load(os.path.join("engines", "unet_executable.json")) - # compiled_vae = tp.Executable.load(os.path.join("engines", "vae_executable.json")) + # print("[I] Loading cached engines from disk...") + # clip_compiled = tp.Executable.load(os.path.join("engines", "clip_executable.json")) + # unet_compiled = tp.Executable.load(os.path.join("engines", "unet_executable.json")) + # vae_compiled = tp.Executable.load(os.path.join("engines", "vae_executable.json")) # else: - compiled_clip = compile_clip(model.cond_stage_model.transformer.text_model, verbose=True) - compiled_unet = compile_unet(model, verbose=True) - compiled_vae = compile_vae(model.decode, verbose=True) - + model = StableDiffusion() + load_from_diffusers(model, tp.float32, debug=True) + clip_compiled = compile_clip(model.cond_stage_model.transformer.text_model, verbose=True) + unet_compiled = compile_unet(model, verbose=True) + vae_compiled = compile_vae(model.decode, verbose=True) + # os.mkdir("engines") - # compiled_clip.save(os.path.join("engines", "clip_executable.json")) - # compiled_unet.save(os.path.join("engines", "unet_executable.json")) - # compiled_vae.save(os.path.join("engines", "vae_executable.json")) + # print("[I] Saving engines to disk...") + # clip_compiled.save(os.path.join("engines", "clip_executable.json")) + # unet_compiled.save(os.path.join("engines", "unet_executable.json")) + # vae_compiled.save(os.path.join("engines", "vae_executable.json")) - # Run through CLIP to get context + # Run through CLIP to get context from prompt tokenizer = ClipTokenizer() prompt = tp.Tensor([tokenizer.encode(args.prompt)]) - print(f"Got tokenized prompt.") + print(f"[I] Got tokenized prompt.") unconditional_prompt = tp.Tensor([tokenizer.encode("")]) - print(f"Got unconditional tokenized prompt.") + print(f"[I] Got unconditional tokenized prompt.") - print("Getting CLIP conditional and unconditional context...", end=' ') + print("[I] Getting CLIP conditional and unconditional context...", end=" ") clip_run_start = time.perf_counter() - context = compiled_clip(prompt) - unconditional_context = compiled_clip(unconditional_prompt) + context = clip_compiled(prompt) + unconditional_context = clip_compiled(unconditional_prompt) clip_run_end = time.perf_counter() print(f"took {clip_run_end - clip_run_start} seconds.") # Backbone of diffusion - the UNet - - # start with random noise if args.seed is not None: torch.manual_seed(args.seed) torch_latent = torch.randn((1, 4, 64, 64)).to("cuda") latent = tp.Tensor(torch_latent) - print(f"Running diffusion loop for {args.steps} steps...", end=' ') - - # compiler = tp.Compiler(run_diffusion_loop) - # unconditional_context_shape = (1, 77, 768) - # conditional_context_shape = (1, 77, 768) - # latent_shape = (1, 4, 64, 64) - # compiled_diffusion_loop = compiler.compile( - # model, - # tp.InputInfo(unconditional_context_shape, dtype=tp.float32), - # tp.InputInfo(conditional_context_shape, dtype=tp.float32), - # tp.InputInfo(latent_shape, dtype=tp.float32), - # args.steps, - # args.guidance, - # ) - - timesteps = list(range(1, 1000, 1000 // args.steps))[::-1] - alphas = get_alphas_cumprod()[tp.Tensor(timesteps)] - alphas_prev = tp.concatenate([tp.Tensor([1.0]), alphas[:-1]], dim=0) - tid = tp.Tensor([0]) diffusion_run_start = time.perf_counter() - # latent = run_diffusion_loop(compiled_unet, unconditional_context, context, latent, args.steps, args.guidance) - latent = compiled_unet( - unconditional_context, - context, - latent, - tp.cast(tp.Tensor([timesteps[0]]), tp.float32), - alphas[tid], - alphas_prev[tid], - tp.Tensor([args.guidance]), - ) + latent = run_diffusion_loop(unet_compiled, unconditional_context, context, latent, args.steps, args.guidance) diffusion_run_end = time.perf_counter() - print(f"took {diffusion_run_end - diffusion_run_start} seconds.") - - #latent = run_diffusion_loop(compiled_unet, unconditional_context, context, latent, args.steps, args.guidance) - - # timesteps = list(range(1, 1000, 1000 // args.steps)) - # print(f"Running for {timesteps} timesteps.") - # alphas = model.alphas_cumprod[tp.Tensor(timesteps)] - # alphas_prev = tp.concatenate([tp.Tensor([1.0]), alphas[:-1]], dim=0) - - # def run(model, unconditional_context, context, latent, timestep, alphas, alphas_prev, guidance): - # return model(unconditional_context, context, latent, timestep, alphas, alphas_prev, guidance) - - # # This is diffusion - # print("Running diffusion...") - # unet_run_start = time.perf_counter() - # for index, timestep in (t := tqdm(list(enumerate(timesteps))[::-1])): - # t.set_description("idx: %1d, timestep: %3d" % (index, timestep)) - # tid = tp.Tensor([index]) - # latent = run( - # compiled_unet, - # unconditional_context, - # context, - # latent, - # tp.cast(tp.Tensor([timestep]), tp.float32), - # alphas[tid], - # alphas_prev[tid], - # tp.Tensor([args.guidance]), - # ) - # unet_run_end = time.perf_counter() - # print(f"Finished running diffusion. Inference took {unet_run_end - unet_run_start} seconds.") + print(f"[I] Finished diffusion denoising. Inference took {diffusion_run_end - diffusion_run_start} seconds.") # Upsample latent space to image with autoencoder - - print(f"Decoding latent...", end=' ') + print(f"[I] Decoding latent...", end=" ") vae_run_start = time.perf_counter() - x = compiled_vae(latent) - # x = model.decode(latent) + x = vae_compiled(latent) vae_run_end = time.perf_counter() print(f"took {vae_run_end - vae_run_start} seconds.") - run_end_time = time.perf_counter() + # Evaluate output x.eval() - print(f"Full pipeline took {run_end_time - run_start_time} seconds.") + run_end_time = time.perf_counter() + print(f"[I] Full script took {run_end_time - run_start_time} seconds.") # save image im = Image.fromarray(cp.from_dlpack(x).get().astype(np.uint8, copy=False)) - print(f"saving {args.out}") + print(f"[I] Saving {args.out}") if not os.path.isdir("output"): + print("[I] Creating 'output' directory.") os.mkdir("output") im.save(args.out) return im, [clip_run_start, clip_run_end, diffusion_run_start, diffusion_run_end, vae_run_start, vae_run_end] + def hf_diffusion(args): from diffusers import StableDiffusionPipeline, UNet2DConditionModel, LMSDiscreteScheduler, AutoencoderKL - model_id = "runwayml/stable-diffusion-v1-5" # "CompVis/stable-diffusion-v1-4" + model_id = "runwayml/stable-diffusion-v1-5" pipe = StableDiffusionPipeline.from_pretrained(model_id, dtype=torch.float32) pipe = pipe.to("cuda") hf_tokenizer = pipe.tokenizer @@ -260,7 +175,7 @@ def hf_diffusion(args): run_start_time = time.perf_counter() - print("Starting tokenization and running clip...", end=" ") + print("[I] Starting tokenization and running clip...", end=" ") clip_run_start = time.perf_counter() text_input = hf_tokenizer(args.prompt, padding="max_length", max_length=hf_tokenizer.model_max_length, truncation=True, return_tensors="pt").to("cuda") max_length = text_input.input_ids.shape[-1] # 77 @@ -312,6 +227,8 @@ def print_summary(denoising_steps, times): print('Throughput: {:.2f} image/s'.format(1000. / total_ms)) +# TODO: Add torch compilation modes +# TODO: Add fp16 def main(): default_prompt = "a horse sized cat eating a bagel" parser = argparse.ArgumentParser( @@ -329,7 +246,8 @@ def main(): args = parser.parse_args() if args.torch_inference: - hf_diffusion(args) + _, times = hf_diffusion(args) + print_summary(args.steps, times) else: _, times = tripy_diffusion(args) print_summary(args.steps, times) diff --git a/tripy/examples/diffusion/model.py b/tripy/examples/diffusion/model.py index 7d8203281..67f76fb17 100644 --- a/tripy/examples/diffusion/model.py +++ b/tripy/examples/diffusion/model.py @@ -1,3 +1,20 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # https://arxiv.org/pdf/2112.10752.pdf # https://github.com/ekagra-ranjan/huggingface-blog/blob/main/stable_diffusion.md # Adapted from https://github.com/tinygrad/tinygrad/blob/master/examples/stable_diffusion.py @@ -13,17 +30,33 @@ import tripy as tp from dataclasses import dataclass -# @dataclass -# class StableDiffusion15Config: -# block_size: int = 1024 -# vocab_size: int = 50257 -# num_layers: int = 12 -# num_heads: int = 12 -# embedding_size: int = 768 -# bias: bool = True -# seq_len: int = 1 -# batch_size: int = 1 -# dtype: "tripy.datatype" = tp.float32 +@dataclass +class CLIPConfig: + vocab_size: int = 49408 + embedding_size: int = 768 + num_heads: int = 12 + max_seq_len: int = 77 + num_hidden_layers: int = 12 + dtype: "tripy.datatype" = tp.float32 + +@dataclass +class StableDiffusion15UNetConfig: + io_channels: int = 4 + model_channels: int = 320 + channel_mult: List[int] = [1, 2, 4, 4] + attention_resolutions: List[int] = [4, 2, 1] + num_heads: int = 8 + context_dim: int = 768 + dtype: "tripy.datatype" = tp.float32 + +@dataclass +class StableDiffusionVAEConfig: + io_channels: int = 3 + latent_channels: int = 4 + model_channel: int = 128 + resolution: int = 256 + channel_mult: List[int] = [1, 2, 4, 4] + dtype: "tripy.datatype" = tp.float32 # convenience methods adapted from tinygrad/tensor.py (https://docs.tinygrad.org/tensor/ops/) def scaled_dot_product_attention( @@ -219,7 +252,9 @@ def __init__(self, channels, emb_channels, out_channels): def __call__(self, x, emb): h = self.conv1(self.nonlinearity(self.norm1(x))) emb_out = self.time_emb_proj(self.nonlinearity(emb)) - target_shape = emb_out.shape + (1, 1) + one_shape = tp.Shape(tp.ones((1,), dtype=tp.int32)) + target_shape = tp.concatenate([emb_out.shape, one_shape, one_shape], dim=0) + # target_shape = emb_out.shape + (1, 1) # TODO: #228: WAR to prevent computing output rank in infer_rank for reshape target_shape.trace_tensor.shape = (emb_out.rank + 2,) h = h + tp.reshape(emb_out, target_shape) @@ -383,12 +418,9 @@ def __init__(self, channels, out_channels, emb_channels=1280): self.upsamplers = [Upsample(out_channels)] def __call__(self, x, emb, saved_inputs): - x = tp.concatenate([x, saved_inputs.pop()], dim=1) - x = self.resnets[0](x, emb) - x = tp.concatenate([x, saved_inputs.pop()], dim=1) - x = self.resnets[1](x, emb) - x = tp.concatenate([x, saved_inputs.pop()], dim=1) - x = self.resnets[2](x, emb) + for resblock in self.resnets: + x = tp.concatenate([x, saved_inputs.pop()], dim=1) + x = resblock(x, emb) return self.upsamplers[0](x) @@ -418,15 +450,10 @@ def __init__( self.upsamplers = [Upsample(channels)] def __call__(self, x, emb, context, saved_inputs): - x = tp.concatenate([x, saved_inputs.pop()], dim=1) - x = self.resnets[0](x, emb) - x = self.attentions[0](x, context) - x = tp.concatenate([x, saved_inputs.pop()], dim=1) - x = self.resnets[1](x, emb) - x = self.attentions[1](x, context) - x = tp.concatenate([x, saved_inputs.pop()], dim=1) - x = self.resnets[2](x, emb) - x = self.attentions[2](x, context) + for i in range(len(self.attentions)): + x = tp.concatenate([x, saved_inputs.pop()], dim=1) + x = self.resnets[i](x, emb) + x = self.attentions[i](x, context) if hasattr(self, "upsamplers"): x = self.upsamplers[0](x) return x @@ -737,7 +764,7 @@ def clamp(tensor: tp.Tensor, min: int, max: int): class StableDiffusion(tp.Module): def __init__(self): - self.alphas_cumprod = get_alphas_cumprod().eval() + self.alphas_cumprod = get_alphas_cumprod() self.model = namedtuple("DiffusionModel", ["diffusion_model"])(diffusion_model=UNetModel()) self.first_stage_model = AutoencoderKL() self.cond_stage_model = namedtuple("CondStageModel", ["transformer"])(