prompt = "a photo of an astronaut riding a horse on mars"

+
def get_inference_pipeline(precision, backend):
    """
    returns HuggingFace diffuser pipeline
@@ -31,12 +32,14 @@ def get_inference_pipeline(precision, backend):
            torch_dtype=torch.float32 if precision == "single" else torch.float16,
        )
        pipe = pipe.to(device)
-    else:
+    else:
        pipe = StableDiffusionOnnxPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            use_auth_token=os.environ["ACCESS_TOKEN"],
            revision="onnx",
-            provider="CPUExecutionProvider" if device.type == "cpu" else "CUDAExecutionProvider",
+            provider="CPUExecutionProvider"
+            if device.type == "cpu"
+            else "CUDAExecutionProvider",
            torch_dtype=torch.float32 if precision == "single" else torch.float16,
        )
@@ -51,43 +54,59 @@ def null_safety(images, **kwargs):
    return pipe


-def do_inference(pipe, n_samples, precision, num_inference_steps):
+def do_inference(pipe, n_samples, use_autocast, num_inference_steps):
    torch.cuda.empty_cache()
-    context = autocast if (device.type == "cuda" and precision == 'half') else nullcontext
+    context = (
+        autocast if (device.type == "cuda" and use_autocast) else nullcontext
+    )
    with context("cuda"):
-        images = pipe(prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps).images
+        images = pipe(
+            prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps
+        ).images

    return images


-def get_inference_time(pipe, n_samples, n_repeats, precision, num_inference_steps):
+def get_inference_time(
+    pipe, n_samples, n_repeats, use_autocast, num_inference_steps
+):
    from torch.utils.benchmark import Timer
+
    timer = Timer(
-        stmt="do_inference(pipe, n_samples, precision, num_inference_steps)",
+        stmt="do_inference(pipe, n_samples, use_autocast, num_inference_steps)",
        setup="from __main__ import do_inference",
-        globals={"pipe": pipe, "n_samples": n_samples, "precision": precision, "num_inference_steps": num_inference_steps},
-        num_threads=multiprocessing.cpu_count()
+        globals={
+            "pipe": pipe,
+            "n_samples": n_samples,
+            "use_autocast": use_autocast,
+            "num_inference_steps": num_inference_steps,
+        },
+        num_threads=multiprocessing.cpu_count(),
    )
    profile_result = timer.timeit(
        n_repeats
    )  # benchmark.Timer performs 2 iterations for warmup
    return round(profile_result.mean, 2)


-def get_inference_memory(pipe, n_samples, precision, num_inference_steps):
+def get_inference_memory(pipe, n_samples, use_autocast, num_inference_steps):
    if not torch.cuda.is_available():
        return 0
-
+
    torch.cuda.empty_cache()
-    context = autocast if (device.type == "cuda" and precision == 'half') else nullcontext
+    context = autocast if (device.type == "cuda" and use_autocast) else nullcontext
    with context("cuda"):
-        images = pipe(prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps).images
+        images = pipe(
+            prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps
+        ).images

    mem = torch.cuda.memory_reserved()
    return round(mem / 1e9, 2)


-def run_benchmark(n_repeats, n_samples, precision, backend, num_inference_steps):
+def run_benchmark(
+    n_repeats, n_samples, precision, use_autocast, backend, num_inference_steps
+):
    """
    * n_repeats: nb datapoints for inference latency benchmark
    * n_samples: number of samples to generate (~ batch size)
@@ -100,10 +119,16 @@ def run_benchmark(n_repeats, n_samples, precision, backend, num_inference_steps)
    pipe = get_inference_pipeline(precision, backend)

    logs = {
-        "memory": 0.00 if device.type == "cpu" else get_inference_memory(pipe, n_samples, precision, num_inference_steps),
-        "latency": get_inference_time(pipe, n_samples, n_repeats, precision, num_inference_steps),
+        "memory": 0.00
+        if device.type == "cpu"
+        else get_inference_memory(
+            pipe, n_samples, use_autocast, num_inference_steps
+        ),
+        "latency": get_inference_time(
+            pipe, n_samples, n_repeats, use_autocast, num_inference_steps
+        ),
    }
-    print(f"n_samples: {n_samples}\tprecision: {precision}\tbackend: {backend}")
+    print(f"n_samples: {n_samples}\tprecision: {precision}\tautocast: {use_autocast}\tbackend: {backend}")
    print(logs, "\n")
    return logs
@@ -115,9 +140,8 @@ def get_device_description():
    """
    if device.type == "cpu":
        name = subprocess.check_output(
-            "grep -m 1 'model name' /proc/cpuinfo",
-            shell=True
-        ).decode("utf-8")
+            "grep -m 1 'model name' /proc/cpuinfo", shell=True
+        ).decode("utf-8")
        name = " ".join(name.split(" ")[2:]).strip()
        return name
    else:
@@ -130,14 +154,23 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
        {
            "n_samples": (1, 2),
            "precision": ("single", "half"),
+            "autocast": ("yes", "no")
        }
    * n_repeats: nb datapoints for inference latency benchmark
    """

    csv_fpath = pathlib.Path(__file__).parent.parent / "benchmark_tmp.csv"
    # create benchmark.csv if not exists
    if not os.path.isfile(csv_fpath):
-        header = ["device", "precision", "runtime", "n_samples", "latency", "memory"]
+        header = [
+            "device",
+            "precision",
+            "autocast",
+            "runtime",
+            "n_samples",
+            "latency",
+            "memory",
+        ]
        with open(csv_fpath, "w") as f:
            writer = csv.writer(f)
            writer.writerow(header)
@@ -148,48 +181,58 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
        device_desc = get_device_description()
        for n_samples in grid["n_samples"]:
            for precision in grid["precision"]:
-                for backend in grid["backend"]:
-                    try:
-                        new_log = run_benchmark(
-                            n_repeats=n_repeats,
-                            n_samples=n_samples,
-                            precision=precision,
-                            backend=backend,
-                            num_inference_steps=num_inference_steps
-                        )
-                    except Exception as e:
-                        if "CUDA out of memory" in str(e) or "Failed to allocate memory" in str(e):
-                            print(str(e))
-                            torch.cuda.empty_cache()
-                            new_log = {
-                                "latency": -1.00,
-                                "memory": -1.00
-                            }
-                        else:
-                            raise e
-
-                    latency = new_log["latency"]
-                    memory = new_log["memory"]
-                    new_row = [device_desc, precision, backend, n_samples, latency, memory]
-                    writer.writerow(new_row)
+                use_autocast = False
+                if precision == "half":
+                    for autocast in grid["autocast"]:
+                        if autocast == "yes":
+                            use_autocast = True
+                for backend in grid["backend"]:
+                    try:
+                        new_log = run_benchmark(
+                            n_repeats=n_repeats,
+                            n_samples=n_samples,
+                            precision=precision,
+                            use_autocast=use_autocast,
+                            backend=backend,
+                            num_inference_steps=num_inference_steps,
+                        )
+                    except Exception as e:
+                        if "CUDA out of memory" in str(
+                            e
+                        ) or "Failed to allocate memory" in str(e):
+                            print(str(e))
+                            torch.cuda.empty_cache()
+                            new_log = {"latency": -1.00, "memory": -1.00}
+                        else:
+                            raise e
+
+                    latency = new_log["latency"]
+                    memory = new_log["memory"]
+                    new_row = [
+                        device_desc,
+                        precision,
+                        autocast,
+                        backend,
+                        n_samples,
+                        latency,
+                        memory,
+                    ]
+                    writer.writerow(new_row)


if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument(
-        "--samples",
+        "--samples",
        default="1",
-        type=str,
-        help="Comma sepearated list of batch sizes (number of samples)"
+        type=str,
+        help="Comma sepearated list of batch sizes (number of samples)",
    )

    parser.add_argument(
-        "--steps",
-        default=50,
-        type=int,
-        help="Number of diffusion steps."
+        "--steps", default=50, type=int, help="Number of diffusion steps."
    )

    parser.add_argument(
@@ -199,17 +242,25 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
        help="Number of repeats.",
    )

+    parser.add_argument(
+        "--autocast",
+        default="no",
+        type=str,
+        help="If 'yes', will perform additional runs with autocast activated for half precision inferences",
+    )
+
    args = parser.parse_args()

    grid = {
-        "n_samples": tuple(map(int, args.samples.split(","))),
-        # Only use single-precision for cpu because "LayerNormKernelImpl" not implemented for 'Half' on cpu,
+        "n_samples": tuple(map(int, args.samples.split(","))),
+        # Only use single-precision for cpu because "LayerNormKernelImpl" not implemented for 'Half' on cpu,
        # Remove autocast won't help. Ref:
        # https://github.com/CompVis/stable-diffusion/issues/307
        "precision": ("single",) if device.type == "cpu" else ("single", "half"),
+        "autocast": ("no",) if args.autocast == "no" else ("yes", "no"),
        # Only use onnx for cpu, until issues are fixed by upstreams. Ref:
        # https://github.com/huggingface/diffusers/issues/489#issuecomment-1261577250
        # https://github.com/huggingface/diffusers/pull/440
-        "backend": ("pytorch", "onnx") if device.type == "cpu" else ("pytorch",)
+        "backend": ("pytorch", "onnx") if device.type == "cpu" else ("pytorch",),
    }
    run_benchmark_grid(grid, n_repeats=args.repeats, num_inference_steps=args.steps)
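
For context, a rough end-to-end sketch of what the new option does, assuming a CUDA machine and the flag/key names shown in this diff (the script's file name is not visible on this page, so "benchmark.py" below is only a placeholder):

# Hypothetical invocation (placeholder file name):
#   python benchmark.py --samples 1,2 --steps 50 --repeats 3 --autocast yes
# On a GPU with --autocast yes, the __main__ block above builds this grid:
grid = {
    "n_samples": (1, 2),
    "precision": ("single", "half"),
    "autocast": ("yes", "no"),  # new axis introduced by this change
    "backend": ("pytorch",),
}
# run_benchmark_grid(grid, ...) then forwards use_autocast into run_benchmark and
# writes the new "autocast" column into benchmark_tmp.csv alongside latency and memory.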