Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make FusionProfile object not a singleton and allow copying #3771

Open
kshitij12345 opened this issue Jan 28, 2025 · 0 comments
Open

Make FusionProfile object not a singleton and allow copying #3771

kshitij12345 opened this issue Jan 28, 2025 · 0 comments

Comments

@kshitij12345
Copy link

When using Thunder on a model, several fusions are generated with Fuser. Profiling the generated regions is easy with fd.profile(). However, the problem is that fd.profile() returns a singleton object, which is overwritten when another fusion definition is profiled in the same script (see the example script below). It would be great if fd.profile() returned a separate object. It would also be great if that object could be serialized.

Example Script

import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id12(fd : FusionDefinition) -> None :
    """Record the first example fusion into ``fd``.

    The fusion takes three bfloat16 tensor inputs and produces two outputs:

    * ``T11``: ``T0`` reshaped to ``[1, 4096, 3584]`` and added to ``T1``
      after both are cast to ``DataType.Float``; the sum is cast back to
      ``DataType.BFloat16``.
    * ``T15``: ``T2`` reshaped from ``[1, 4096, 18944]`` to ``[4096, 18944]``.
    """
    # Fusion inputs, declared in the order the caller passes them to execute().
    T0 = fd.define_tensor(shape=[4096, 3584], contiguity=[True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[1, 0])
    T1 = fd.define_tensor(shape=[1, 4096, 3584], contiguity=[None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 0])
    T2 = fd.define_tensor(shape=[1, 4096, 18944], contiguity=[None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 0])
    # Give T0 the same shape as T1 so the two can be added.
    T7 = fd.ops.reshape(T0, new_shape=[1, 4096, 3584])
    # Do the addition in Float, then cast the result back to BFloat16.
    T8 = fd.ops.cast(T1, dtype=DataType.Float)
    T9 = fd.ops.cast(T7, dtype=DataType.Float)
    T10 = fd.ops.add(T8, T9)
    T11 = fd.ops.cast(T10, dtype=DataType.BFloat16)
    # Independent second output: drop the leading size-1 dimension of T2.
    T15 = fd.ops.reshape(T2, new_shape=[4096, 18944])
    fd.add_output(T11)
    fd.add_output(T15)

# Build the first fusion definition.
with FusionDefinition() as fd:
    nvfuser_fusion_id12(fd)

# One CUDA tensor per define_tensor() call in nvfuser_fusion_id12, same order and shapes.
inputs = [
    torch.testing.make_tensor((4096, 3584), dtype=torch.bfloat16, device='cuda:0'),
    torch.testing.make_tensor((1, 4096, 3584), dtype=torch.bfloat16, device='cuda:0'),
    torch.testing.make_tensor((1, 4096, 18944), dtype=torch.bfloat16, device='cuda:0'),
]
# Execute with profiling enabled so that fd.profile() has data to report.
fd.execute(inputs, profile=True)

# Keep a reference to the first fusion's profile; it is inspected again after fd2 is profiled.
prof_data = fd.profile()

print("BEFORE PROFILING FD2")
print(prof_data.kernel_time_ms)

def nvfuser_fusion_id0(fd : FusionDefinition) -> None :
    """Record the second example fusion into ``fd``.

    The fusion takes one bfloat16 tensor input of shape ``[64]`` and produces
    two ``DataType.Float`` outputs:

    * ``T19``: ``T0`` broadcast to ``[1, 64, 1]`` and cast to Float.
    * ``T25``: the result of ``fd.ops.iota`` over the scalars ``4096, 0, 1``,
      broadcast to ``[1, 1, 4096]`` and cast to Float.
    """
    T0 = fd.define_tensor(shape=[64], contiguity=[True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[0])
    # Scalar arguments for iota.
    S1 = fd.define_scalar(4096, dtype=DataType.Int)
    S2 = fd.define_scalar(0, dtype=DataType.Int)
    S3 = fd.define_scalar(1, dtype=DataType.Int)
    T4 = fd.ops.iota(S1, S2, S3, dtype=DataType.Int)
    T8 = fd.ops.broadcast_in_dim(T4, shape=[1, 4096], broadcast_dims=[1])
    # Place the 64 elements of T0 along the middle dimension of a [1, 64, 1] tensor.
    T13 = fd.ops.broadcast_in_dim(T0, shape=[1, 64, 1], broadcast_dims=[1])
    T14 = fd.ops.cast(T13, dtype=DataType.Float)
    # Source and target shapes are identical here ([1, 64, 1]).
    T19 = fd.ops.broadcast_in_dim(T14, shape=[1, 64, 1], broadcast_dims=[0, 1, 2])
    # Insert a size-1 middle dimension into the iota tensor: [1, 4096] -> [1, 1, 4096].
    T24 = fd.ops.broadcast_in_dim(T8, shape=[1, 1, 4096], broadcast_dims=[0, 2])
    T25 = fd.ops.cast(T24, dtype=DataType.Float)
    fd.add_output(T19)
    fd.add_output(T25)

# Build a second, unrelated fusion definition.
with FusionDefinition() as fd2:
    nvfuser_fusion_id0(fd2)

# Single input matching the one define_tensor() call in nvfuser_fusion_id0.
inputs = [
    torch.testing.make_tensor((64,), dtype=torch.bfloat16, device='cuda:0'),
]
fd2.execute(inputs, profile=True)

# Profile the second fusion definition.
prof_data2 = fd2.profile()  # The data previously read through prof_data is overwritten.

print("AFTER PROFILING FD2")
# Both names now report the same kernel time (that of fd2), demonstrating the problem.
print(prof_data.kernel_time_ms)
print(prof_data2.kernel_time_ms)
print(prof_data is prof_data2)  # True: both names refer to the same singleton object.
print(prof_data)
print(prof_data2)


import copy
# Copying the profile object is not possible either; the call below raises:
# Traceback (most recent call last):
#   File "/home/kkalambarkar/lightning-thunder/scratchpad/test_nvfuser_profile.py", line 61, in <module>
#     copy.deepcopy(prof_data2)
#   File "/home/kkalambarkar/miniconda3/envs/pytorch-dev/lib/python3.10/copy.py", line 161, in deepcopy
#     rv = reductor(4)
# TypeError: cannot pickle 'nvfuser._C.FusionProfile' object
copy.deepcopy(prof_data2)

Output

BEFORE PROFILING FD2
0.08934399999999999
AFTER PROFILING FD2
0.002048
0.002048
True
<nvfuser._C.FusionProfile object at 0x7f4a5243d470>
<nvfuser._C.FusionProfile object at 0x7f4a5243d470>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant