add profiling and small conv3d reproducer

2025-02-12 16:29:43 +01:00 · 2025-02-12 16:29:43 +01:00 · 54a585f2b9
commit 54a585f2b9
parent b279e2dfd7
4 changed files with 67 additions and 7 deletions
--- a/run.py
+++ b/run.py
@ -6,6 +6,8 @@ import json
 from vae.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
 import time

+from torch.profiler import profile, record_function, ProfilerActivity
+

 def load_vae(path: str, compile_vae: bool):
 	with open("hy_vae_config.json") as f:
@ -25,6 +27,10 @@ def load_vae(path: str, compile_vae: bool):


 if __name__ == "__main__":
+	props = torch.cuda.get_device_properties(0)
+
+	print(f"Device: {props.name}")
+
 	latents = torch.randn((1, 16, 19, 120, 68)).to(torch.device(0), torch.bfloat16)
 	print(f"Latent dims: {latents.size()}")

@ -40,11 +46,15 @@ if __name__ == "__main__":
 	vae.enable_tiling()

 	print("decodeing")
-	start = time.perf_counter()
-	generator = torch.Generator(device=torch.device("cpu"))
-	decoded = vae.decode(
-		latents, return_dict=False, generator=generator
-	)[0]
-	print(f"decoded in {time.perf_counter() - start} seconds")
-	print(f"decoded dims: {decoded.size()}")

+	with profile(activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU], record_shapes=True, with_flops=True) as prof:
+		start = time.perf_counter()
+		generator = torch.Generator(device=torch.device("cpu"))
+		decoded = vae.decode(
+			latents, return_dict=False, generator=generator
+		)[0]
+		print(f"decoded in {time.perf_counter() - start} seconds")
+		print(f"decoded dims: {decoded.size()}")
+
+	print(prof.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=100))
+	prof.export_chrome_trace("trace.json")