From 4ef62dc1e9f109ca17736c46be166c2c3aa86390 Mon Sep 17 00:00:00 2001
From: Damian Kalinowski
Date: Thu, 26 Jun 2025 10:54:14 +0200
Subject: [PATCH 1/4] poc

---
 demos/image_generation/README.md       | 93 ++++++++++++++++++++++++++
 demos/image_generation/input_data.json | 13 ++++
 2 files changed, 106 insertions(+)
 create mode 100644 demos/image_generation/input_data.json

diff --git a/demos/image_generation/README.md b/demos/image_generation/README.md
index 046c65e2ba..5d5d4423d5 100644
--- a/demos/image_generation/README.md
+++ b/demos/image_generation/README.md
@@ -361,7 +361,100 @@ Output file (`output2.png`):
 ![output2](./output2.png)
 
+## Measuring performance
+Prepare example request `input_data.json`:
+```
+{
+    "data": [
+        {
+            "payload": [
+                {
+                    "model": "OpenVINO/stable-diffusion-v1-5-int8-ov",
+                    "prompt": "dog",
+                    "num_inference_steps": 2
+                }
+            ]
+        }
+    ]
+}
+```
+
+Run the benchmark:
+```bash
+docker run --rm -it --net=host -v $(pwd):/work:rw nvcr.io/nvidia/tritonserver:24.12-py3-sdk \
+    perf_analyzer \
+    -m OpenVINO/stable-diffusion-v1-5-int8-ov \
+    --input-data=/work/input_data.json \
+    --service-kind=openai \
+    --endpoint=v3/images/generations \
+    --async \
+    -u localhost:8000 \
+    --request-count 8 \
+    --concurrency-range 8
+```
+
+Results on MCLX23:
+```
+*** Measurement Settings ***
+  Service Kind: OPENAI
+  Sending 8 benchmark requests
+  Using asynchronous calls for inference
+
+Request concurrency: 8
+  Client:
+    Request count: 8
+    Throughput: 0.210501 infer/sec
+    Avg latency: 29514881 usec (standard deviation 1509943 usec)
+    p50 latency: 31140977 usec
+    p90 latency: 36002018 usec
+    p95 latency: 37274567 usec
+    p99 latency: 37274567 usec
+    Avg HTTP time: 29514870 usec (send/recv 3558 usec + response wait 29511312 usec)
+Inferences/Second vs. Client Average Batch Latency
+Concurrency: 8, throughput: 0.210501 infer/sec, latency 29514881 usec
+```
+
+Results on SPR36:
+```
+*** Measurement Settings ***
+  Service Kind: OPENAI
+  Sending 8 benchmark requests
+  Using asynchronous calls for inference
+
+Request concurrency: 8
+  Client:
+    Request count: 8
+    Throughput: 1.14268 infer/sec
+    Avg latency: 5124694 usec (standard deviation 695195 usec)
+    p50 latency: 5252478 usec
+    p90 latency: 5922719 usec
+    p95 latency: 6080321 usec
+    p99 latency: 6080321 usec
+    Avg HTTP time: 5124684 usec (send/recv 15272 usec + response wait 5109412 usec)
+Inferences/Second vs. Client Average Batch Latency
+Concurrency: 8, throughput: 1.14268 infer/sec, latency 5124694 usec
+```
+
+```
+*** Measurement Settings ***
+  Service Kind: OPENAI
+  Sending 16 benchmark requests
+  Using asynchronous calls for inference
+
+Request concurrency: 16
+  Client:
+    Request count: 16
+    Throughput: 1.33317 infer/sec
+    Avg latency: 8945421 usec (standard deviation 929729 usec)
+    p50 latency: 9395319 usec
+    p90 latency: 11657659 usec
+    p95 latency: 11657659 usec
+    p99 latency: 11659369 usec
+    Avg HTTP time: 8945411 usec (send/recv 491743 usec + response wait 8453668 usec)
+Inferences/Second vs. Client Average Batch Latency
+Concurrency: 16, throughput: 1.33317 infer/sec, latency 8945421 usec
+```
 
 ## References
 - [Image Generation API](../../docs/model_server_rest_api_image_generation.md)
diff --git a/demos/image_generation/input_data.json b/demos/image_generation/input_data.json
new file mode 100644
index 0000000000..5594922266
--- /dev/null
+++ b/demos/image_generation/input_data.json
@@ -0,0 +1,13 @@
+{
+    "data": [
+        {
+            "payload": [
+                {
+                    "model": "OpenVINO/stable-diffusion-v1-5-int8-ov",
+                    "prompt": "dog",
+                    "num_inference_steps": 2
+                }
+            ]
+        }
+    ]
+}

From 53426601a6a7f6ccc6fa22d954c959ec82d3d487 Mon Sep 17 00:00:00 2001
From: Damian Kalinowski
Date: Fri, 25 Jul 2025 16:49:00 +0200
Subject: [PATCH 2/4] static shape doc

---
 docs/image_generation/reference.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/image_generation/reference.md b/docs/image_generation/reference.md
index f8357384a9..c4c8e4a53d 100644
--- a/docs/image_generation/reference.md
+++ b/docs/image_generation/reference.md
@@ -60,6 +60,11 @@ The calculator supports the following `node_options` for tuning the pipeline con
 - `optional uint64 default_num_inference_steps` - default number of inference steps used for generation, if not specified by the request [default = 50];
 - `optional uint64 max_num_inference_steps` - maximum number of inference steps allowed for generation. Requests exceeding this value will be rejected. [default = 100];
 
+Static model resolution settings (see the example fragment below):
+- `optional string resolution` - enforces a static resolution for all requests. When specified, the underlying models are reshaped to this resolution.
+- `optional uint64 num_images_per_prompt` - used together with `resolution` to define the batch size of the statically reshaped models.
+- `optional float guidance_scale` - used together with `resolution` when reshaping the models to a static shape.
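+
+A minimal `node_options` fragment combining these settings might look as follows (the values below are illustrative only):
+```
+node_options: {
+  [type.googleapis.com / mediapipe.ImageGenCalculatorOptions]: {
+    models_path: "./"
+    device: "CPU"
+    resolution: "512x512"       # all requests are served at this static resolution
+    num_images_per_prompt: 2    # batch size used when reshaping the models
+    guidance_scale: 7.5         # guidance scale assumed when reshaping the models
+  }
+}
+```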
+
 
 ## Models Directory

From f679d15b7cad18e63e40b05b5fda7edd368765df Mon Sep 17 00:00:00 2001
From: Damian Kalinowski
Date: Fri, 25 Jul 2025 16:50:13 +0200
Subject: [PATCH 3/4] doc

---
 demos/image_generation/input_data.json | 13 -------------
 1 file changed, 13 deletions(-)
 delete mode 100644 demos/image_generation/input_data.json

diff --git a/demos/image_generation/input_data.json b/demos/image_generation/input_data.json
deleted file mode 100644
index 5594922266..0000000000
--- a/demos/image_generation/input_data.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-    "data": [
-        {
-            "payload": [
-                {
-                    "model": "OpenVINO/stable-diffusion-v1-5-int8-ov",
-                    "prompt": "dog",
-                    "num_inference_steps": 2
-                }
-            ]
-        }
-    ]
-}

From e919f398d5b4b2629e381f41f6634768199a6aed Mon Sep 17 00:00:00 2001
From: Damian Kalinowski
Date: Fri, 25 Jul 2025 16:55:54 +0200
Subject: [PATCH 4/4] save

---
 demos/image_generation/README.md | 96 ++++++++++++------------
 1 file changed, 41 insertions(+), 55 deletions(-)

diff --git a/demos/image_generation/README.md b/demos/image_generation/README.md
index 3bbd48be89..e271f47374 100644
--- a/demos/image_generation/README.md
+++ b/demos/image_generation/README.md
@@ -474,7 +474,32 @@ Output file (`output2.png`):
 ![output2](./output2.png)
 
-## Measuring performance
+## Measuring throughput
+To increase throughput in image generation scenarios, it is worth adjusting the plugin config to increase NUM_STREAMS. Additionally, set a static shape for the models to avoid dynamic shape overhead. This can be done by setting the `resolution` parameter in the graph options, as shown below.
+
+Edit `graph.pbtxt` and restart the server:
+```
+input_stream: "HTTP_REQUEST_PAYLOAD:input"
+output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+
+node: {
+  name: "ImageGenExecutor"
+  calculator: "ImageGenCalculator"
+  input_stream: "HTTP_REQUEST_PAYLOAD:input"
+  input_side_packet: "IMAGE_GEN_NODE_RESOURCES:pipes"
+  output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+  node_options: {
+    [type.googleapis.com / mediapipe.ImageGenCalculatorOptions]: {
+      models_path: "./"
+      device: "CPU"
+      num_images_per_prompt: 4  # 4 images per inference request
+      resolution: "512x512"  # reshape the models to a static resolution
+      plugin_config: '{"PERFORMANCE_HINT":"THROUGHPUT","NUM_STREAMS":8}'
+    }
+  }
+}
+```
+
 Prepare example request `input_data.json`:
 ```
 {
     "data": [
@@ -484,7 +509,7 @@ Prepare example request `input_data.json`:
             "payload": [
                 {
                     "model": "OpenVINO/stable-diffusion-v1-5-int8-ov",
                     "prompt": "dog",
-                    "num_inference_steps": 2
+                    "num_inference_steps": 50
                 }
             ]
@@ -503,50 +528,8 @@ docker run --rm -it --net=host -v $(pwd):/work:rw nvcr.io/nvidia/tritonserver:24
     --endpoint=v3/images/generations \
     --async \
     -u localhost:8000 \
-    --request-count 8 \
-    --concurrency-range 8
-```
-
-Results on MCLX23:
-```
-*** Measurement Settings ***
-  Service Kind: OPENAI
-  Sending 8 benchmark requests
-  Using asynchronous calls for inference
-
-Request concurrency: 8
-  Client:
-    Request count: 8
-    Throughput: 0.210501 infer/sec
-    Avg latency: 29514881 usec (standard deviation 1509943 usec)
-    p50 latency: 31140977 usec
-    p90 latency: 36002018 usec
-    p95 latency: 37274567 usec
-    p99 latency: 37274567 usec
-    Avg HTTP time: 29514870 usec (send/recv 3558 usec + response wait 29511312 usec)
-Inferences/Second vs. Client Average Batch Latency
-Concurrency: 8, throughput: 0.210501 infer/sec, latency 29514881 usec
-```
-
-Results on SPR36:
-```
-*** Measurement Settings ***
-  Service Kind: OPENAI
-  Sending 8 benchmark requests
-  Using asynchronous calls for inference
-
-Request concurrency: 8
-  Client:
-    Request count: 8
-    Throughput: 1.14268 infer/sec
-    Avg latency: 5124694 usec (standard deviation 695195 usec)
-    p50 latency: 5252478 usec
-    p90 latency: 5922719 usec
-    p95 latency: 6080321 usec
-    p99 latency: 6080321 usec
-    Avg HTTP time: 5124684 usec (send/recv 15272 usec + response wait 5109412 usec)
-Inferences/Second vs. Client Average Batch Latency
-Concurrency: 8, throughput: 1.14268 infer/sec, latency 5124694 usec
+    --request-count 16 \
+    --concurrency-range 16
 ```
 
 ```
@@ -556,19 +539,22 @@
 *** Measurement Settings ***
   Service Kind: OPENAI
   Sending 16 benchmark requests
   Using asynchronous calls for inference
 
 Request concurrency: 16
-  Client:
+  Client: 
     Request count: 16
-    Throughput: 1.33317 infer/sec
-    Avg latency: 8945421 usec (standard deviation 929729 usec)
-    p50 latency: 9395319 usec
-    p90 latency: 11657659 usec
-    p95 latency: 11657659 usec
-    p99 latency: 11659369 usec
-    Avg HTTP time: 8945411 usec (send/recv 491743 usec + response wait 8453668 usec)
+    Throughput: 0.0999919 infer/sec
+    Avg latency: 156783666 usec (standard deviation 1087845 usec)
+    p50 latency: 157110315 usec
+    p90 latency: 158720060 usec
+    p95 latency: 158720060 usec
+    p99 latency: 159494095 usec
+    Avg HTTP time: 156783654 usec (send/recv 8717 usec + response wait 156774937 usec)
 Inferences/Second vs. Client Average Batch Latency
-Concurrency: 16, throughput: 1.33317 infer/sec, latency 8945421 usec
+Concurrency: 16, throughput: 0.0999919 infer/sec, latency 156783666 usec
 ```
+A throughput of 0.0999919 requests per second corresponds to roughly 0.4 images per second, since each request generates 4 images (`num_images_per_prompt: 4`).
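+
+For a quick functional check outside of `perf_analyzer`, a single request can be sent directly to the same endpoint. Below is a minimal Python sketch; it assumes the server from this demo is running locally on port 8000 and that the response follows the OpenAI-style images format with base64-encoded images in `data[].b64_json`:
+```python
+import base64
+import requests
+
+payload = {
+    "model": "OpenVINO/stable-diffusion-v1-5-int8-ov",
+    "prompt": "dog",
+    "num_inference_steps": 50
+}
+
+# Send one generation request to the image generation REST endpoint
+response = requests.post("http://localhost:8000/v3/images/generations", json=payload)
+response.raise_for_status()
+
+# With num_images_per_prompt: 4 in graph.pbtxt, a single request is expected to return 4 images
+for i, image in enumerate(response.json()["data"]):
+    with open(f"generated_{i}.png", "wb") as f:
+        f.write(base64.b64decode(image["b64_json"]))
+```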
+
+
 ## References
 - [Image Generation API](../../docs/model_server_rest_api_image_generation.md)
 - [Writing client code](../../docs/clients_genai.md)