Add profiling and a small conv3d reproducer
This commit is contained in:
parent
b279e2dfd7
commit
54a585f2b9
46
conv3d.py
Normal file
46
conv3d.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
import torch
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Conv3d layer configurations to benchmark.
# Each entry is [in_channels, out_channels, kernel_size, stride]; the stride
# may be a single int or a 3-tuple.
configs = [
    [128, 128, 3, 1],
    [256, 256, 3, 1],
    [512, 512, 3, 1],
    [128, 256, 1, 1],
    [512, 512, 3, (2, 2, 2)],
    [256, 256, 3, (2, 2, 2)],
    [128, 3, 3, 1],
]
|
||||||
|
|
||||||
|
# Input tensor shapes, one per entry of `configs` (matched by position).
# Each shape is handed to torch.randn; the second dimension equals the
# in_channels value of the corresponding config.
inputs = [
    [1, 128, 67, 258, 258],
    [1, 256, 35, 130, 130],
    [1, 512, 35, 130, 130],
    [1, 128, 67, 258, 258],
    [1, 512, 35, 130, 130],
    [1, 256, 27, 258, 258],
    [1, 128, 67, 258, 258],
]
|
||||||
|
|
||||||
|
|
||||||
|
def conv3dbenchmark(configs: list[list[int]], inputs: list[list[int]], repeat: int, dtype: torch.dtype, device: torch.device, warmup: int = 0) -> list[float]:
    """Time the forward pass of a series of ``torch.nn.Conv3d`` layers.

    One layer is built per entry of ``configs`` and run on a random tensor of
    the shape given at the same position in ``inputs``. For every layer the
    configuration and the average wall-clock time per forward call are
    printed.

    Args:
        configs: One ``[in_channels, out_channels, kernel_size, stride]``
            entry per layer. ``stride`` may be an int or a tuple.
        inputs: One input shape per layer, passed to ``torch.randn``.
        repeat: Number of timed forward calls per layer; must be >= 1.
        dtype: Data type the layer and its input are converted to.
        device: Device the layer and its input are moved to.
        warmup: Number of untimed forward calls made before timing starts.
            Defaults to 0, i.e. the first timed call includes any one-off
            start-up cost.

    Returns:
        The average time in seconds per forward call, one value per layer,
        in the order of ``configs``.

    Raises:
        ValueError: If ``configs`` and ``inputs`` differ in length, or if
            ``repeat`` is smaller than 1.
    """
    # Explicit checks rather than `assert`, which is stripped under `python -O`.
    if len(inputs) != len(configs):
        raise ValueError(
            f"configs and inputs must have the same length, "
            f"got {len(configs)} and {len(inputs)}"
        )
    if repeat < 1:
        raise ValueError(f"repeat must be at least 1, got {repeat}")

    device = torch.device(device)

    def _sync() -> None:
        # Only CUDA devices need (and accept) an explicit synchronize; calling
        # torch.cuda.synchronize for any other device type raises.
        if device.type == "cuda":
            torch.cuda.synchronize(device)

    # All layers are created up front, before any input tensor is drawn.
    modules = [
        torch.nn.Conv3d(config[0], config[1], config[2], stride=config[3]).to(device, dtype)
        for config in configs
    ]

    timings = []
    for module, config, shape in zip(modules, configs, inputs):
        x = torch.randn(shape).to(device, dtype)
        print(f"Running Conv3d config: {config} input: {shape} type: {dtype}")

        for _ in range(warmup):
            module(x)

        # Drain pending work (input conversion, warm-up) so that it is not
        # attributed to the timed region.
        _sync()
        start = time.perf_counter()
        for _ in range(repeat):
            module(x)
        # Wait for the queued kernels to finish before reading the clock.
        _sync()
        elapsed = (time.perf_counter() - start) / repeat

        print(f"Time {elapsed} seconds\n")
        timings.append(elapsed)

    return timings
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the full benchmark on device index 0, once per 16-bit float type.
    device = torch.device(0)

    for dtype in (torch.bfloat16, torch.float16):
        conv3dbenchmark(configs, inputs, 5, dtype, device)
|
24
run.py
24
run.py
@ -6,6 +6,8 @@ import json
|
|||||||
from vae.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
|
from vae.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
from torch.profiler import profile, record_function, ProfilerActivity
|
||||||
|
|
||||||
|
|
||||||
def load_vae(path: str, compile_vae: bool):
|
def load_vae(path: str, compile_vae: bool):
|
||||||
with open("hy_vae_config.json") as f:
|
with open("hy_vae_config.json") as f:
|
||||||
@ -25,6 +27,10 @@ def load_vae(path: str, compile_vae: bool):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
props = torch.cuda.get_device_properties(0)
|
||||||
|
|
||||||
|
print(f"Device: {props.name}")
|
||||||
|
|
||||||
latents = torch.randn((1, 16, 19, 120, 68)).to(torch.device(0), torch.bfloat16)
|
latents = torch.randn((1, 16, 19, 120, 68)).to(torch.device(0), torch.bfloat16)
|
||||||
print(f"Latent dims: {latents.size()}")
|
print(f"Latent dims: {latents.size()}")
|
||||||
|
|
||||||
@ -40,11 +46,15 @@ if __name__ == "__main__":
|
|||||||
vae.enable_tiling()
|
vae.enable_tiling()
|
||||||
|
|
||||||
print("decodeing")
|
print("decodeing")
|
||||||
start = time.perf_counter()
|
|
||||||
generator = torch.Generator(device=torch.device("cpu"))
|
|
||||||
decoded = vae.decode(
|
|
||||||
latents, return_dict=False, generator=generator
|
|
||||||
)[0]
|
|
||||||
print(f"decoded in {time.perf_counter() - start} seconds")
|
|
||||||
print(f"decoded dims: {decoded.size()}")
|
|
||||||
|
|
||||||
|
with profile(activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU], record_shapes=True, with_flops=True) as prof:
|
||||||
|
start = time.perf_counter()
|
||||||
|
generator = torch.Generator(device=torch.device("cpu"))
|
||||||
|
decoded = vae.decode(
|
||||||
|
latents, return_dict=False, generator=generator
|
||||||
|
)[0]
|
||||||
|
print(f"decoded in {time.perf_counter() - start} seconds")
|
||||||
|
print(f"decoded dims: {decoded.size()}")
|
||||||
|
|
||||||
|
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=100))
|
||||||
|
prof.export_chrome_trace("trace.json")
|
||||||
|
@ -115,6 +115,9 @@ class AutoencoderKLCausal3D(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
|
|||||||
self.post_quant_conv = nn.Conv3d(
|
self.post_quant_conv = nn.Conv3d(
|
||||||
latent_channels, latent_channels, kernel_size=1)
|
latent_channels, latent_channels, kernel_size=1)
|
||||||
|
|
||||||
|
print(f"Conv3d: {2 * latent_channels}, {2 * latent_channels}, 1")
|
||||||
|
print(f"Conv3d: {latent_channels}, {latent_channels}, 1")
|
||||||
|
|
||||||
self.use_slicing = False
|
self.use_slicing = False
|
||||||
self.use_spatial_tiling = False
|
self.use_spatial_tiling = False
|
||||||
self.use_temporal_tiling = False
|
self.use_temporal_tiling = False
|
||||||
|
@ -72,6 +72,7 @@ class CausalConv3d(nn.Module):
|
|||||||
|
|
||||||
self.conv = nn.Conv3d(chan_in, chan_out, kernel_size,
|
self.conv = nn.Conv3d(chan_in, chan_out, kernel_size,
|
||||||
stride=stride, dilation=dilation, **kwargs)
|
stride=stride, dilation=dilation, **kwargs)
|
||||||
|
print(f"Conv3d: {chan_in}, {chan_out}, {kernel_size}, stride={stride}, dilation={dilation}")
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
|
x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user