add profiling and small conv3d reproducer

2025-02-12 16:29:43 +01:00 · 2025-02-12 16:29:43 +01:00 · 54a585f2b9
commit 54a585f2b9
parent b279e2dfd7
4 changed files with 67 additions and 7 deletions
--- a/conv3d.py
+++ b/conv3d.py
@ -0,0 +1,46 @@
+import torch
+import time
+
+configs = [  # one Conv3d per row: [in_channels, out_channels, kernel_size, stride], in the order conv3dbenchmark indexes them
+	[128, 128, 3, 1],
+	[256, 256, 3, 1],
+	[512, 512, 3, 1],
+	[128, 256, 1, 1],
+	[512, 512, 3, (2, 2, 2)],  # stride given as a per-dimension tuple instead of a single int
+	[256, 256, 3, (2, 2, 2)],
+	[128, 3, 3, 1]
+]
+
+inputs = [  # torch.randn shapes, paired by index with `configs`; dim 1 equals that row's in_channels
+	[1, 128, 67, 258, 258],
+	[1, 256, 35, 130, 130],
+	[1, 512, 35, 130, 130],
+	[1, 128, 67, 258, 258],
+	[1, 512, 35, 130, 130],
+	[1, 256, 27, 258, 258],
+	[1, 128, 67, 258, 258],
+]
+
+
+def conv3dbenchmark(configs: list[list[int]], inputs: list[list[int]], repeat: int, dtype: torch.dtype, device: torch.device):  # builds one Conv3d per config row, then prints the mean wall-clock time of `repeat` forward passes on the matching input; NOTE(review): rows may carry a tuple stride, which list[list[int]] does not describe
+	modules = list()
+	assert len(inputs) == len(configs)  # configs and inputs are paired by index in the timing loop below
+
+	for config in configs:
+		modules.append(torch.nn.Conv3d(config[0], config[1], config[2], stride=config[3]).to(device, dtype))  # config = [in_channels, out_channels, kernel_size, stride]
+
+	for i in range(len(modules)):
+		x = torch.randn(inputs[i]).to(device, dtype)  # random tensor of the requested shape, then moved/cast to the target device and dtype
+		print(f"Running Conv3d config: {configs[i]} input: {inputs[i]} type: {dtype}")
+		start = time.perf_counter()  # NOTE(review): no warm-up pass or synchronize precedes this, so first-call overhead may be included in the mean - confirm intended
+		for n in range(repeat):
+			modules[i].forward(x)  # result is discarded; NOTE(review): no torch.no_grad() is visible here, so autograd is presumably still recording
+		torch.cuda.synchronize(device)  # block until the device finishes queued work so the elapsed time covers execution, not just launch
+		print(f"Time {(time.perf_counter() - start) / repeat} seconds\n")  # elapsed time divided by `repeat`: mean seconds per forward pass
+
+
+if __name__ == "__main__":
+	device = torch.device(0)  # device ordinal 0; presumably a CUDA GPU, since conv3dbenchmark calls torch.cuda.synchronize on it
+
+	conv3dbenchmark(configs, inputs, 5, torch.bfloat16, device)  # 5 timed repeats per config in bfloat16
+	conv3dbenchmark(configs, inputs, 5, torch.float16, device)  # same sweep in float16
--- a/run.py
+++ b/run.py
@ -6,6 +6,8 @@ import json
 from vae.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
 import time

+from torch.profiler import profile, record_function, ProfilerActivity
+

 def load_vae(path: str, compile_vae: bool):
 	with open("hy_vae_config.json") as f:
@ -25,6 +27,10 @@ def load_vae(path: str, compile_vae: bool):


 if __name__ == "__main__":
+	props = torch.cuda.get_device_properties(0)
+
+	print(f"Device: {props.name}")
+
 	latents = torch.randn((1, 16, 19, 120, 68)).to(torch.device(0), torch.bfloat16)
 	print(f"Latent dims: {latents.size()}")

@ -40,11 +46,15 @@ if __name__ == "__main__":
 	vae.enable_tiling()

 	print("decodeing")
-	start = time.perf_counter()
-	generator = torch.Generator(device=torch.device("cpu"))
-	decoded = vae.decode(
-		latents, return_dict=False, generator=generator
-	)[0]
-	print(f"decoded in {time.perf_counter() - start} seconds")
-	print(f"decoded dims: {decoded.size()}")

+	with profile(activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU], record_shapes=True, with_flops=True) as prof:
+		start = time.perf_counter()
+		generator = torch.Generator(device=torch.device("cpu"))
+		decoded = vae.decode(
+			latents, return_dict=False, generator=generator
+		)[0]
+		print(f"decoded in {time.perf_counter() - start} seconds")
+		print(f"decoded dims: {decoded.size()}")
+
+	print(prof.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=100))
+	prof.export_chrome_trace("trace.json")
--- a/vae/autoencoder_kl_causal_3d.py
+++ b/vae/autoencoder_kl_causal_3d.py
@ -115,6 +115,9 @@ class AutoencoderKLCausal3D(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
        self.post_quant_conv = nn.Conv3d(
            latent_channels, latent_channels, kernel_size=1)

+        print(f"Conv3d: {2 * latent_channels}, {2 * latent_channels}, 1")
+        print(f"Conv3d: {latent_channels}, {latent_channels}, 1")
+
        self.use_slicing = False
        self.use_spatial_tiling = False
        self.use_temporal_tiling = False
--- a/vae/unet_causal_3d_blocks.py
+++ b/vae/unet_causal_3d_blocks.py
@ -72,6 +72,7 @@ class CausalConv3d(nn.Module):

        self.conv = nn.Conv3d(chan_in, chan_out, kernel_size,
                              stride=stride, dilation=dilation, **kwargs)
+        print(f"Conv3d: {chan_in}, {chan_out}, {kernel_size}, stride={stride}, dilation={dilation}")

    def forward(self, x):
        x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)