Skip to content

Commit

Permalink
benchmark: Add readme to benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
chuanli11 committed Oct 3, 2022
1 parent 2469d61 commit cc81843
Show file tree
Hide file tree
Showing 14 changed files with 232 additions and 55 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ for idx, im in enumerate(images):

## Benchmarking inference

Detailed benchmark documentation can be found [here](./docs/benchmark.md).

### Setup

Before running the benchmark, make sure you have completed the repository [installation steps](#installation).
Expand Down
72 changes: 18 additions & 54 deletions benchmark.csv
Original file line number Diff line number Diff line change
@@ -1,94 +1,58 @@
Quadro RTX 8000,single,pytorch,1,12.50,7.69
Quadro RTX 8000,single,onnx,1,13.54,0.0
Quadro RTX 8000,half,pytorch,1,6.52,4.5
Quadro RTX 8000,half,onnx,1,13.59,0.0
Quadro RTX 8000,single,pytorch,2,23.92,10.9
Quadro RTX 8000,single,onnx,2,25.56,0.0
Quadro RTX 8000,half,pytorch,2,12.61,8.8
Quadro RTX 8000,half,onnx,2,25.83,0.0
Intel(R) Core(TM) i7-6850K CPU @ 3.60GHz,single,pytorch,1,458.97,0.0
Intel(R) Core(TM) i7-6850K CPU @ 3.60GHz,single,onnx,1,286.13,0.0
NVIDIA GeForce RTX 3090,single,pytorch,1,7.96,7.72
NVIDIA GeForce RTX 3090,single,onnx,1,7.93,0.0
NVIDIA GeForce RTX 3090,half,pytorch,1,4.83,5.47
NVIDIA GeForce RTX 3090,half,onnx,1,7.96,0.0
NVIDIA GeForce RTX 3090,half,pytorch,1,4.83,4.54
NVIDIA GeForce RTX 3090,single,pytorch,2,14.49,11
NVIDIA GeForce RTX 3090,single,onnx,2,14.85,0.0
NVIDIA GeForce RTX 3090,half,pytorch,2,8.42,8.75
NVIDIA GeForce RTX 3090,half,onnx,2,14.84,0.0
NVIDIA GeForce RTX 3090,single,pytorch,4,27.94,17.69
NVIDIA GeForce RTX 3090,single,onnx,4,-1.0,-1.0
NVIDIA GeForce RTX 3090,half,pytorch,4,15.87,15.36
NVIDIA GeForce RTX 3090,half,onnx,4,29.05,0.0
NVIDIA GeForce RTX 3090,single,pytorch,8,-1.0,-1.0
NVIDIA GeForce RTX 3090,single,onnx,8,-1.0,-1.0
NVIDIA GeForce RTX 3090,half,pytorch,8,-1.0,-1.0
NVIDIA GeForce RTX 3090,half,onnx,8,-1.0,-1.0
NVIDIA RTX A5500,single,pytorch,1,8.55,7.69
NVIDIA RTX A5500,single,onnx,1,8.75,0.0
NVIDIA RTX A5500,half,pytorch,1,5.05,5.51
NVIDIA RTX A5500,half,onnx,1,8.83,0.0
NVIDIA RTX A5500,half,pytorch,1,5.05,4.58
NVIDIA RTX A5500,single,pytorch,2,15.71,11
NVIDIA RTX A5500,single,onnx,2,16.83,0.0
NVIDIA RTX A5500,half,pytorch,2,9.37,8.8
NVIDIA RTX A5500,half,onnx,2,16.82,0.0
NVIDIA RTX A5500,single,pytorch,4,30.51,17.69
NVIDIA RTX A5500,single,onnx,4,30.51,17.69
NVIDIA RTX A5500,half,pytorch,4,16.97,15.33
NVIDIA RTX A5500,half,onnx,4,33.87,0.0
NVIDIA RTX A5500,single,pytorch,8,-1.0,-1.0
NVIDIA RTX A5500,single,onnx,8,-1.0,-1.0
NVIDIA RTX A5500,half,pytorch,8,-1.0,-1.0
NVIDIA RTX A5500,half,onnx,8,-1.0,-1.0
AMD EPYC 7352 24-Core Processor,single,pytorch,1,529.93,0.0
AMD EPYC 7352 24-Core Processor,single,onnx,1,223.19,0.0
NVIDIA GeForce RTX 3080,single,pytorch,4,-1.0,-1.0
NVIDIA GeForce RTX 3080,single,onnx,4,-1.0,-1.0
NVIDIA GeForce RTX 3080,half,pytorch,4,-1.0,-1.0
NVIDIA GeForce RTX 3080,half,onnx,4,-1.0,-1.0
NVIDIA GeForce RTX 3080,single,pytorch,1,-1.0,-1.0
NVIDIA GeForce RTX 3080,single,onnx,1,-1.0,-1.0
NVIDIA GeForce RTX 3080,half,pytorch,1,5.59,4.52
NVIDIA GeForce RTX 3080,half,onnx,1,5.59,4.52
NVIDIA GeForce RTX 3080,single,pytorch,2,-1.0,-1.0
NVIDIA GeForce RTX 3080,single,onnx,2,-1.0,-1.0
NVIDIA GeForce RTX 3080,half,pytorch,2,-1.0,-1.0
NVIDIA GeForce RTX 3080,half,onnx,2,-1.0,-1.0
NVIDIA A100 80GB PCIe,single,pytorch,1,6.49,7.69
NVIDIA A100 80GB PCIe,single,pytorch,1,6.39,7.75
NVIDIA A100 80GB PCIe,single,onnx,1,7.34,0.0
NVIDIA A100 80GB PCIe,half,pytorch,1,3.74,5.48
NVIDIA A100 80GB PCIe,half,onnx,1,7.38,0.0
NVIDIA A100 80GB PCIe,half,pytorch,1,3.74,4.55
NVIDIA A100 80GB PCIe,single,pytorch,2,11.12,11.05
NVIDIA A100 80GB PCIe,single,onnx,2,12.93,0.0
NVIDIA A100 80GB PCIe,half,pytorch,2,5.72,8.77
NVIDIA A100 80GB PCIe,half,onnx,2,12.94,0.0
NVIDIA A100 80GB PCIe,single,pytorch,4,20.18,17.63
NVIDIA A100 80GB PCIe,single,onnx,4,25.59,0.0
NVIDIA A100 80GB PCIe,half,pytorch,4,10.04,15.34
NVIDIA A100 80GB PCIe,half,onnx,4,25.56,0.0
NVIDIA A100 80GB PCIe,single,pytorch,8,38.88,30.88
NVIDIA A100 80GB PCIe,single,onnx,8,48.62,0.0
NVIDIA A100 80GB PCIe,half,pytorch,8,18.68,28.47
NVIDIA A100 80GB PCIe,half,onnx,8,48.57,0.0
NVIDIA A100 80GB PCIe,single,pytorch,16,76.92,57.46
NVIDIA A100 80GB PCIe,single,onnx,16,-1.0,-1.0
NVIDIA A100 80GB PCIe,half,pytorch,16,36.67,54.73
NVIDIA A100 80GB PCIe,half,onnx,16,-1.0,-1.0
NVIDIA A100 80GB PCIe,half,pytorch,28,63.88,78.78
NVIDIA RTX A6000,single,pytorch,1,8.09,7.75
NVIDIA RTX A6000,single,pytorch,1,8.15,7.68
NVIDIA RTX A6000,single,onnx,1,7.96,0.0
NVIDIA RTX A6000,half,pytorch,1,5.03,5.5
NVIDIA RTX A6000,half,onnx,1,7.99,0.0
NVIDIA RTX A6000,half,pytorch,1,5.03,4.53
NVIDIA RTX A6000,single,pytorch,2,14.86,10.98
NVIDIA RTX A6000,single,onnx,2,15.12,0.0
NVIDIA RTX A6000,half,pytorch,2,9.03,8.79
NVIDIA RTX A6000,half,onnx,2,15.16,0.0
NVIDIA RTX A6000,single,pytorch,4,27.92,17.62
NVIDIA RTX A6000,single,onnx,4,30.64,0.0
NVIDIA RTX A6000,half,pytorch,4,17.0,15.34
NVIDIA RTX A6000,half,onnx,4,31.34,0.0
NVIDIA RTX A6000,single,pytorch,8,53.95,30.88
NVIDIA RTX A6000,single,onnx,8,-1.0,-1.0
NVIDIA RTX A6000,half,pytorch,8,32.57,28.51
NVIDIA RTX A6000,half,onnx,8,-1.0,-1.0
NVIDIA RTX A6000,half,pytorch,16,63.16,46.11
Quadro RTX 8000,single,pytorch,1,12.3,7.71
Quadro RTX 8000,half,pytorch,1,5.93,4.52
Quadro RTX 8000,single,pytorch,2,24.42,9.16
Quadro RTX 8000,half,pytorch,2,10.92,7.02
Quadro RTX 8000,single,pytorch,4,42.56,15.58
Quadro RTX 8000,half,pytorch,4,21.24,12.39
Quadro RTX 8000,single,pytorch,8,76.96,23.11
Quadro RTX 8000,half,pytorch,8,40.52,20.98
Quadro RTX 8000,single,pytorch,16,152.55,42.47
Quadro RTX 8000,half,pytorch,16,80.31,38.18
Quadro RTX 8000,single,pytorch,32,-1.0,-1.0
Quadro RTX 8000,half,pytorch,32,-1.0,-1.0
86 changes: 86 additions & 0 deletions docs/benchmark.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Benchmarking Diffuser Models

We benchmark [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion) model inference using different GPUs and CPUs. When possible, we use half-precision and onnxruntime to speed up the inference. The input is a text prompt, and the output is an image of resolution `512x512`.

## Latency

The figure below shows the latency of running inference using a single text prompt "a photo of an astronaut riding a horse on mars". The key insights are:
* The inference latencies range from `3.74` to `5.56` seconds across our tested Ampere GPUs, from the consumer 3080 card to the flagship A100 80GB card.
* Half-precision reduces the latency by about `40%` for Ampere GPUs, and by `52%` for the previous generation `RTX8000` GPU. We believe Ampere GPUs enjoy a relatively "smaller" speedup from half-precision due to their use of `TF32`. For readers who are not familiar with `TF32`, it is a [`19-bit` format](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) that has been used as the default single-precision data type on Ampere GPUs for major deep learning frameworks such as PyTorch and TensorFlow. One can expect half-precision's speedup over `FP32` to be bigger, since `FP32` is a true `32-bit` format.

![Stable Diffusion Text2Image Latency (seconds)](./pictures/benchmark_sd_txt2img_latency.svg)


Putting such performance in the context of the performance of running the same inference job on CPUs (the figure below), we notice that:
* GPUs are significantly faster -- by one or two orders of magnitude, depending on the precision.
* `onnxruntime` can reduce the latency for CPU by about `40%` to `50%`, depending on the type of CPU.
* ONNX currently does not have [stable support](https://github.com/huggingface/diffusers/issues/489) for Huggingface diffusers. We will investigate `onnxruntime-gpu` in future benchmarks.

![GPU v.s. CPU](./pictures/benchmark_sd_txt2img_gpu_vs_cpu.svg)


## Memory

We also measure the memory consumption of running stable diffusion inference, and here are the results:
* It takes about `7.7 GB` GPU memory to run single-precision inference with batch size one. This is consistent across all tested GPUs.
* It takes about `4.5 GB` GPU memory to run half-precision inference with batch size one. This is consistent across all tested GPUs.

![Stable Diffusion Text2Image Memory (GB)](./pictures/benchmark_sd_txt2img_mem.svg)


## Throughput

Latency measures how quickly a _single_ input can be processed, which is critical to online applications that don't tolerate even the slightest delay. However, some (offline) applications may focus on "throughput", which measures the total volume of data processed in a fixed amount of time.

Our throughput benchmark pushes the batch size to the maximum for each GPU, and measures the number of images they can process per minute. The reason for maximizing the batch size is to keep tensor cores busy so that computation can dominate the workload, avoiding any non-computational bottlenecks.

* Once again, A100 80GB has the highest throughput. In addition, the gap between A100 80GB and the other cards is enlarged because a larger batch size can be used on this card.

![Stable Diffusion Text2Image Throughput (images/minute)](./pictures/benchmark_sd_txt2img_throughput.svg)


As a concrete example, the chart below shows how A100 80GB's throughput increases by `64%` when we change the batch size from 1 to 28 (the largest that does not cause an out-of-memory error). It is also interesting to see that the increase is not linear and flattens out when the batch size reaches a certain value, at which point the tensor cores on the GPU are saturated and any new data in the GPU memory will have to be queued up before getting its own computing resources.

![Stable Diffusion Text2Image Throughput (images/minute)](./pictures/benchmark_sd_txt2img_batchsize_vs_throughput.svg)



## Precision

We are curious about whether half-precision introduces degradations to the quality of the output images. To test this out, we fixed the text prompt as well as the "latent" input vector and fed them to the single-precision model and the half-precision model. We ran the inference for 100 steps and saved both models' outputs at each step, as well as the difference map:

![Stable Diffusion Text2Image single-precision vs. half-precision outputs and difference map at each step](./pictures/benchmark_sd_precision_history.gif)

Our observation is that there are indeed visible differences between the single-precision output and the half-precision output, especially in the early steps. The differences often decrease with the number of steps, but might not always vanish.

Interestingly, such a difference may not imply artifacts in half-precision's outputs. For example, in step 70, the picture below shows half-precision didn't produce the artifact in the single-precision output (an extra front leg):

![Stable Diffusion Text2Image single-precision vs. half-precision outputs at step 70](./pictures/benchmark_sd_precision_step_70.png)


# Reproduce

You can use this [Lambda Diffusers](https://github.com/LambdaLabsML/lambda-diffusers) repo to reproduce the results.

## Setup

Before running the benchmark, make sure you have completed the repository [installation steps](../README.md#installation).

You will then need to set the huggingface access token:
1. Create a user account on HuggingFace and generate an access token.
2. Set your huggingface access token as the `ACCESS_TOKEN` environment variable:
```
export ACCESS_TOKEN=<hf_...>
```

## Usage

Launch the `benchmark.py` script to append benchmark results to the existing [benchmark.csv](../benchmark.csv) results file:
```
python ./scripts/benchmark.py
```

Launch the `benchmark_quality.py` script to compare the output of single-precision and half-precision models:
```
python ./scripts/benchmark_quality.py
```
Binary file added docs/pictures/FreeMono.ttf
Binary file not shown.
Binary file added docs/pictures/benchmark_sd_precision_history.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/pictures/benchmark_sd_precision_step_70.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/pictures/benchmark_sd_txt2img_gpu_vs_cpu.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/pictures/benchmark_sd_txt2img_latency.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/pictures/benchmark_sd_txt2img_mem.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/pictures/benchmark_sd_txt2img_throughput.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ ftfy==6.1.1
Pillow==9.2.0
diffusers==0.3.0
onnxruntime==1.12.1
scikit-image==0.19.3
-e .
2 changes: 1 addition & 1 deletion scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):

grid = {
"n_samples": tuple(map(int, args.samples.split(","))),
# Only use single precision for cpu because "LayerNormKernelImpl" not implemented for 'Half' on cpu,
# Only use single-precision for cpu because "LayerNormKernelImpl" not implemented for 'Half' on cpu,
# Remove autocast won't help. Ref:
# https://github.com/CompVis/stable-diffusion/issues/307
"precision": ("single",) if device.type == "cpu" else ("single", "half"),
Expand Down
119 changes: 119 additions & 0 deletions scripts/benchmark_quality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import os
from platform import mac_ver
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from skimage.metrics import structural_similarity as ssim
from skimage.metrics import normalized_root_mse as nmse
from skimage.metrics import peak_signal_noise_ratio as psnr

import torch
from torch import autocast
from diffusers import StableDiffusionPipeline

# Compare Stable Diffusion outputs produced by the single-precision (fp32) and
# half-precision (fp16) pipelines. For a fixed prompt and fixed latent input,
# the script sweeps the number of inference steps, saves a side-by-side image
# (single | half | absolute difference) for every step count, and prints the
# per-step-count SSIM / NRMSE / PSNR averaged over the generated images.

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
model_id = "CompVis/stable-diffusion-v1-4"
device = "cuda"
prompt = "a photo of an astronaut riding a horse on mars"

# Results go to a folder named after the prompt (spaces -> underscores),
# created relative to the current working directory.
output_folder = "_".join(prompt.split(" "))
os.makedirs(output_folder, exist_ok=True)

num_images = 1
width = 512
height = 512
min_inference_steps = 10
max_inference_steps = 100
inference_step_stride = 5

# Resolve the label font relative to this script rather than the current
# working directory, so the script works when launched from the repository
# root (`python ./scripts/benchmark_quality.py`) as well as from `scripts/`.
# Loading it once up front also fails fast before any model is downloaded.
font_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    os.pardir,
    "docs",
    "pictures",
    "FreeMono.ttf",
)
font = ImageFont.truetype(font_path, 16)

# Honor the ACCESS_TOKEN environment variable when it is set; otherwise fall
# back to the locally cached Hugging Face login (use_auth_token=True).
auth_token = os.environ.get("ACCESS_TOKEN") or True

list_ssim = []
list_nmse = []
list_psnr = []

# ---------------------------------------------------------------------------
# Create pipelines for single and half-precision
# ---------------------------------------------------------------------------
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    use_auth_token=auth_token,
    torch_dtype=torch.float32,
)
pipe = pipe.to(device)

pipe_half = StableDiffusionPipeline.from_pretrained(
    model_id,
    revision="fp16",
    torch_dtype=torch.float16,
    use_auth_token=auth_token,
)
pipe_half = pipe_half.to(device)

# ---------------------------------------------------------------------------
# Generate latent vectors (shared by both pipelines so that the only
# difference between the two runs is the numerical precision)
# ---------------------------------------------------------------------------
generator = torch.Generator(device=device)
seeds = []
latent_batches = []
for _ in range(num_images):
    # Get a new random seed, store it and use it as the generator state
    seed = generator.seed()
    seeds.append(seed)
    generator = generator.manual_seed(seed)

    latent_batches.append(
        torch.randn(
            (1, pipe.unet.in_channels, height // 8, width // 8),
            generator=generator,
            device=device,
        )
    )
latents = torch.cat(latent_batches)

# ---------------------------------------------------------------------------
# Sweep the number of inference steps. The upper bound is inclusive so that
# `max_inference_steps` itself is evaluated.
# ---------------------------------------------------------------------------
for num_inference_steps in range(
    min_inference_steps, max_inference_steps + 1, inference_step_stride
):
    # Inference with single precision
    torch.cuda.empty_cache()
    images = pipe(
        [prompt] * num_images,
        guidance_scale=7.5,
        latents=latents,
        num_inference_steps=num_inference_steps,
    )["sample"]

    # Inference with half precision
    torch.cuda.empty_cache()
    with torch.autocast(device):
        images_half = pipe_half(
            [prompt] * num_images,
            guidance_scale=7.5,
            latents=latents,
            num_inference_steps=num_inference_steps,
        )["sample"]

    m_ssim = []
    m_nmse = []
    m_psnr = []

    for idx, (image, image_half) in enumerate(zip(images, images_half)):
        # Need to convert to float because uint8 can't store negative values
        raw_image = np.float32(np.asarray(image))
        raw_image_half = np.float32(np.asarray(image_half))
        np_image = raw_image / 255.0
        np_image_half = raw_image_half / 255.0
        # Difference map stays in the 0-255 range so it can be saved as uint8
        np_image_diff = np.absolute(raw_image - raw_image_half)

        # Compute quantitative metrics. The images are floats scaled to
        # [0, 1], so pass data_range explicitly: scikit-image's dtype-based
        # default for floating-point input does not match that range.
        m_ssim.append(
            ssim(np_image, np_image_half, channel_axis=2, data_range=1.0)
        )
        m_nmse.append(nmse(np_image, np_image_half))
        m_psnr.append(psnr(np_image, np_image_half, data_range=1.0))
        im_diff = Image.fromarray(np.uint8(np_image_diff))

        # Compose results in a single output image:
        # [ single precision | half precision | absolute difference ]
        dst = Image.new(
            "RGB",
            (image.width + image_half.width + im_diff.width, image.height),
        )
        dst.paste(image, (0, 0))
        dst.paste(image_half, (image.width, 0))
        dst.paste(im_diff, (image.width + image_half.width, 0))

        # Label each pane in its top-left corner
        canvas = ImageDraw.Draw(dst)
        white = (255, 255, 255)
        canvas.text((32, 32), "Single Precision", font=font, fill=white)
        canvas.text(
            (image.width + 32, 32), "Half Precision", font=font, fill=white
        )
        canvas.text(
            (image.width + image_half.width + 32, 32),
            "Step " + str(num_inference_steps),
            font=font,
            fill=white,
        )
        dst.save(
            os.path.join(
                output_folder,
                str(idx) + "_" + str(num_inference_steps) + ".png",
            )
        )

    # Average each metric over the images generated for this step count
    list_ssim.append(sum(m_ssim) / len(m_ssim))
    list_nmse.append(sum(m_nmse) / len(m_nmse))
    list_psnr.append(sum(m_psnr) / len(m_psnr))

print("ssim: ")
print(list_ssim)

print("nmse: ")
print(list_nmse)

print("psnr: ")
print(list_psnr)

0 comments on commit cc81843

Please sign in to comment.