 import pathlib
 import csv
 from contextlib import nullcontext
+import itertools
 import torch
 from torch import autocast
 from diffusers import StableDiffusionPipeline, StableDiffusionOnnxPipeline

 prompt = "a photo of an astronaut riding a horse on mars"

+def make_bool(yes_or_no):
+    if yes_or_no.lower() == "yes":
+        return True
+    elif yes_or_no.lower() == "no":
+        return False
+    else:
+        raise ValueError(f"unrecognised input {yes_or_no}")

 def get_inference_pipeline(precision, backend):
     """
     returns HuggingFace diffuser pipeline
     cf https://github.com/huggingface/diffusers#text-to-image-generation-with-stable-diffusion
-    note: could not download from CompVis/stable-diffusion-v1-4 (access restricted)
     """

     assert precision in ("half", "single"), "precision in ['half', 'single']"
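A quick note on the new make_bool helper above: it maps the "yes"/"no" strings coming off the command line to booleans and rejects anything else, so a typo in a flag fails loudly instead of silently running the wrong configuration. Roughly how it behaves (illustrative REPL session):

    >>> make_bool("yes")
    True
    >>> make_bool("No")
    False
    >>> make_bool("maybe")
    ValueError: unrecognised input maybe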
@@ -28,7 +35,6 @@ def get_inference_pipeline(precision, backend):
         pipe = StableDiffusionPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4",
             revision="main" if precision == "single" else "fp16",
-            use_auth_token=os.environ["ACCESS_TOKEN"],
             torch_dtype=torch.float32 if precision == "single" else torch.float16,
         )
         pipe = pipe.to(device)
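Note on dropping use_auth_token=os.environ["ACCESS_TOKEN"]: the script presumably now relies on credentials already cached on the machine, e.g. via huggingface-cli login or the standard Hub token environment variable, rather than a custom ACCESS_TOKEN variable. A minimal sketch of authenticating up front, assuming huggingface_hub is installed:

    from huggingface_hub import login
    login()  # prompts for a Hub token once and caches it for later from_pretrained() calls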
@@ -103,9 +109,9 @@ def get_inference_memory(pipe, n_samples, use_autocast, num_inference_steps):
     mem = torch.cuda.memory_reserved()
     return round(mem / 1e9, 2)

-
+@torch.inference_mode()
 def run_benchmark(
-    n_repeats, n_samples, precision, use_autocast, backend, num_inference_steps
+    n_repeats, n_samples, precision, use_autocast, xformers, backend, num_inference_steps
 ):
     """
     * n_repeats: nb datapoints for inference latency benchmark
@@ -116,7 +122,14 @@ def run_benchmark(
     dict like {'memory usage': 17.70, 'latency': 86.71}
     """

+    print(f"n_samples: {n_samples}\t precision: {precision}\t autocast: {use_autocast}\t xformers: {xformers}\t backend: {backend}")
+
     pipe = get_inference_pipeline(precision, backend)
+    if xformers:
+        pipe.enable_xformers_memory_efficient_attention()
+
+    if n_samples > 16:
+        pipe.enable_vae_slicing()

     logs = {
         "memory": 0.00
@@ -128,8 +141,8 @@ def run_benchmark(
             pipe, n_samples, n_repeats, use_autocast, num_inference_steps
         ),
     }
-    print(f"n_samples: {n_samples}\t precision: {precision}\t autocast: {use_autocast}\t backend: {backend}")
     print(logs, "\n")
+    print("============================")
     return logs

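Two remarks on the changes in this function: @torch.inference_mode() turns off autograd bookkeeping for the whole benchmark call, and pipe.enable_vae_slicing() decodes large batches one image at a time to keep VRAM in check. enable_xformers_memory_efficient_attention() only works when the xformers package is actually installed, so a slightly more defensive variant of the new branch could look like this (a sketch, not part of the patch):

    if xformers:
        try:
            pipe.enable_xformers_memory_efficient_attention()
        except Exception as err:  # exact exception type depends on the diffusers version
            print(f"could not enable xformers attention: {err}")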
@@ -148,7 +161,7 @@ def get_device_description():
     return torch.cuda.get_device_name()


-def run_benchmark_grid(grid, n_repeats, num_inference_steps):
+def run_benchmark_grid(grid, n_repeats, num_inference_steps, csv_fpath):
     """
     * grid : dict like
       {
@@ -159,13 +172,13 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
     * n_repeats: nb datapoints for inference latency benchmark
     """

-    csv_fpath = pathlib.Path(__file__).parent.parent / "benchmark_tmp.csv"
     # create benchmark.csv if not exists
     if not os.path.isfile(csv_fpath):
         header = [
             "device",
             "precision",
             "autocast",
+            "xformers",
             "runtime",
             "n_samples",
             "latency",
@@ -179,45 +192,45 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
     with open(csv_fpath, "a") as f:
         writer = csv.writer(f)
         device_desc = get_device_description()
-        for n_samples in grid["n_samples"]:
-            for precision in grid["precision"]:
-                use_autocast = False
-                if precision == "half":
-                    for autocast in grid["autocast"]:
-                        if autocast == "yes":
-                            use_autocast = True
-                for backend in grid["backend"]:
-                    try:
-                        new_log = run_benchmark(
-                            n_repeats=n_repeats,
-                            n_samples=n_samples,
-                            precision=precision,
-                            use_autocast=use_autocast,
-                            backend=backend,
-                            num_inference_steps=num_inference_steps,
-                        )
-                    except Exception as e:
-                        if "CUDA out of memory" in str(
-                            e
-                        ) or "Failed to allocate memory" in str(e):
-                            print(str(e))
-                            torch.cuda.empty_cache()
-                            new_log = {"latency": -1.00, "memory": -1.00}
-                        else:
-                            raise e
-
-                    latency = new_log["latency"]
-                    memory = new_log["memory"]
-                    new_row = [
-                        device_desc,
-                        precision,
-                        autocast,
-                        backend,
-                        n_samples,
-                        latency,
-                        memory,
-                    ]
-                    writer.writerow(new_row)
+        for trial in itertools.product(*grid.values()):
+
+            n_samples, precision, use_autocast, xformers, backend = trial
+            use_autocast = make_bool(use_autocast)
+            xformers = make_bool(xformers)
+
+            try:
+                new_log = run_benchmark(
+                    n_repeats=n_repeats,
+                    n_samples=n_samples,
+                    precision=precision,
+                    use_autocast=use_autocast,
+                    xformers=xformers,
+                    backend=backend,
+                    num_inference_steps=num_inference_steps,
+                )
+            except Exception as e:
+                if "CUDA out of memory" in str(
+                    e
+                ) or "Failed to allocate memory" in str(e):
+                    print(str(e))
+                    torch.cuda.empty_cache()
+                    new_log = {"latency": -1.00, "memory": -1.00}
+                else:
+                    raise e
+
+            latency = new_log["latency"]
+            memory = new_log["memory"]
+            new_row = [
+                device_desc,
+                precision,
+                use_autocast,
+                xformers,
+                backend,
+                n_samples,
+                latency,
+                memory,
+            ]
+            writer.writerow(new_row)


 if __name__ == "__main__":
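The rewritten loop assumes the insertion order of grid matches the unpacking order n_samples, precision, use_autocast, xformers, backend, since itertools.product(*grid.values()) simply takes the Cartesian product of the value tuples in that order. A small self-contained sketch with made-up grid values:

    import itertools

    grid = {
        "n_samples": (1, 4),
        "precision": ("half",),
        "autocast": ("yes", "no"),
        "xformers": ("yes",),
        "backend": ("pytorch",),
    }
    for n_samples, precision, use_autocast, xformers, backend in itertools.product(*grid.values()):
        print(n_samples, precision, use_autocast, xformers, backend)
    # 1 half yes yes pytorch
    # 1 half no yes pytorch
    # 4 half yes yes pytorch
    # 4 half no yes pytorch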
@@ -249,6 +261,20 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
         help="If 'yes', will perform additional runs with autocast activated for half precision inferences",
     )

+    parser.add_argument(
+        "--xformers",
+        default="yes",
+        type=str,
+        help="If 'yes', will use xformers flash attention",
+    )
+
+    parser.add_argument(
+        "--output_file",
+        default="results.csv",
+        type=str,
+        help="Path to output csv file to write",
+    )
+
     args = parser.parse_args()

     grid = {
@@ -257,10 +283,11 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
         # Remove autocast won't help. Ref:
         # https://github.com/CompVis/stable-diffusion/issues/307
         "precision": ("single",) if device.type == "cpu" else ("single", "half"),
-        "autocast": ("no",) if args.autocast == "no" else ("yes", "no"),
+        "autocast": args.autocast.split(","),
+        "xformers": args.xformers.split(","),
         # Only use onnx for cpu, until issues are fixed by upstreams. Ref:
         # https://github.com/huggingface/diffusers/issues/489#issuecomment-1261577250
         # https://github.com/huggingface/diffusers/pull/440
         "backend": ("pytorch", "onnx") if device.type == "cpu" else ("pytorch",),
     }
-    run_benchmark_grid(grid, n_repeats=args.repeats, num_inference_steps=args.steps)
+    run_benchmark_grid(grid, n_repeats=args.repeats, num_inference_steps=args.steps, csv_fpath=args.output_file)
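Finally, a usage sketch for the new options (the script name here is assumed; the pre-existing flags keep their previous meaning):

    python benchmark.py --autocast yes,no --xformers yes,no --output_file results.csv

Because --autocast and --xformers are now parsed with .split(","), they accept comma-separated lists of "yes"/"no" values that feed directly into the benchmark grid.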