A few fixes to the MFU/MBU code #1108
Merged (+601 −87)
Commits (all 45 authored by finbarrtimbers):
- bad8e77 Now, we get num_attention_heads from the hf config.
- 76600a8 Update code
- 088d486 Added test that we match manual values
- d37f591 Updated calculations
- 4c185b4 Updated code with check_calculation
- a68ba0d Updated code
- 1c1de09 Now, tests pass.
- b4fb73d Updated code to normalize properly
- fc6c709 Added some fixes
- d9191c0 Merge branch 'main' into fix-modeldims
- f0972e4 Updated code
- 82ee5a9 Updated code
- a67d501 Another fix
- c7afce7 Updated code to fix errors from cursor review
- 72ca29b Merge branch 'main' into fix-modeldims
- 839162b Cleaned up tests.
- e7d697e cleaned up code
- 427cd48 Cleaned up PR
- 2fc955f Restore docstrings and inline comments to ModelDims methods
- de242de Refactor attn_flops to use sliding_window parameter directly
- b94921c updated code
- b944834 Fixed bug in tests
- cb0f732 Updates code
- df2a9df Merge branch 'main' into fix-modeldims
- e533b18 Now, linter passes.
- 6cc511d Update MFU/MBU code.
- e695691 Now, mbu tests pass.
- daa12d4 Moved to json file
- 2d25297 Added test data
- e1b975b undid changes and simplified test function.
- b48b76d Merge branch 'main' into fix-modeldims
- bca0c4e Updated code.
- 11b4c9e Updated code
- bf1e73c test passes
- d9ce0cb An attempt at a fix
- f1a3d6c Update code with patches
- 16b5e9d now, tests pass
- b839f17 Merge branch 'main' into fix-modeldims
- 2aa0ede Merge branch 'main' into fix-modeldims
- f6ec329 Merge branch 'main' into fix-modeldims
- 51171bb Cleaned up code.
- 37bbbc1 Ran linter
- f7599e8 Ran linter
- 50078a3 Merge branch 'main' into fix-modeldims
- 6843770 linter passes
New file added in this PR (68 lines): JSON test fixtures for the MBU calculation.
| { | ||
| "mbu_157_percent": { | ||
| "model_name": "Qwen/Qwen2.5-7B", | ||
| "total_generation_time": 13.85860692244023, | ||
| "samples_per_prompt": 16, | ||
| "num_engines": 8, | ||
| "num_gpus_per_engine": 1, | ||
| "training_time": 4.0, | ||
| "num_training_gpus": 16, | ||
| "prompt_lengths": [183, 147, 64, 312, 193, 206, 171, 436, 80, 176, 210, 165, 268, 195, 230, 93, 162, 56, 362, 135, 257, 57, 304, 163, 326, 324, 155, 119, 108, 234, 82, 205], | ||
| "response_lengths": [108, 238, 308, 506, 182, 255, 248, 265, 221, 230, 347, 247, 497, 410, 223, 244, 540, 194, 246, 348, 383, 271, 246, 112, 171, 134, 88, 133, 1, 358, 279, 203, 107, 93, 119, 478, 202, 57, 116, 126, 560, 230, 92, 69, 88, 353, 74, 62, 3976, 407, 3104, 473, 237, 495, 299, 487, 1181, 1273, 475, 466, 326, 279, 870, 1053, 289, 585, 432, 476, 66, 340, 307, 512, 632, 526, 552, 117, 163, 541, 143, 226, 187, 196, 4096, 161, 186, 341, 205, 182, 435, 535, 493, 382, 248, 408, 156, 171, 345, 148, 451, 274, 222, 142, 144, 377, 215, 211, 224, 207, 805, 568, 142, 208, 3739, 1886, 1541, 671, 100, 2063, 645, 230, 533, 465, 961, 374, 1, 1076, 715, 4096, 262, 185, 171, 103, 224, 83, 118, 114, 112, 864, 267, 96, 1, 254, 130, 224, 309, 204, 823, 178, 391, 541, 346, 493, 756, 324, 402, 248, 1, 801, 364, 357, 124, 369, 57, 414, 452, 971, 271, 514, 391, 221, 262, 332, 1, 891, 385, 541, 539, 299, 325, 388, 1045, 237, 347, 322, 162, 456, 598, 170, 1, 259, 354, 401, 286, 500, 190, 545, 298, 421, 599, 374, 300, 154, 357, 366, 240, 302, 1077, 179, 572, 538, 580, 1210, 339, 500, 597, 681, 149, 499, 622, 423, 75, 391, 508, 175, 958, 548, 359, 302, 461, 608, 547, 360, 295, 1039, 776, 681, 465, 556, 566, 573, 1046, 209, 156, 467, 872, 481, 88, 265, 215, 62, 343, 190, 1, 240, 264, 404, 255, 239, 135, 344, 440, 200, 388, 355, 185, 300, 192, 1194, 1039, 661, 380, 184, 455, 461, 306, 212, 1489, 309, 195, 370, 381, 268, 350, 282, 368, 282, 366, 517, 395, 240, 1154, 402, 601, 678, 502, 445, 555, 102, 689, 362, 1, 337, 1472, 526, 573, 461, 226, 362, 419, 239, 178, 1542, 889, 528, 295, 168, 587, 308, 323, 827, 714, 733, 429, 271, 509, 630, 746, 1682, 631, 1459, 631, 439, 1, 786, 992, 717, 1665, 225, 308, 281, 503, 541, 515, 346, 157, 597, 143, 339, 1, 944, 709, 293, 368, 516, 447, 802, 443, 674, 360, 1894, 422, 760, 631, 1066, 245, 627, 722, 534, 310, 392, 2009, 119, 537, 311, 465, 164, 318, 417, 551, 269, 1, 597, 114, 523, 660, 499, 584, 1685, 362, 234, 528, 249, 900, 2014, 92, 383, 1, 991, 741, 278, 587, 579, 250, 2777, 621, 653, 745, 1355, 579, 1459, 730, 671, 523, 1497, 652, 832, 362, 139, 189, 109, 361, 205, 65, 101, 314, 125, 73, 363, 1, 283, 166, 146, 99, 123, 135, 54, 236, 118, 329, 119, 111, 249, 196, 75, 197, 308, 237, 232, 234, 106, 385, 213, 154, 191, 248, 199, 235, 184, 242, 167, 182, 184, 146, 223, 220, 224, 287, 287, 174, 392, 219, 342, 194, 172, 179, 192, 303, 164, 307, 159, 113, 302, 149, 345, 279, 71, 102, 576, 254, 395, 143, 155, 176, 279, 190, 270, 317, 68, 173, 173, 242, 446, 209, 199, 118, 167, 93, 117, 174, 128, 234, 132] | ||
| }, | ||
| "mbu_161_percent": { | ||
| "model_name": "Qwen/Qwen2.5-7B", | ||
| "total_generation_time": 15.400770215317607, | ||
| "samples_per_prompt": 16, | ||
| "num_engines": 8, | ||
| "num_gpus_per_engine": 1, | ||
| "training_time": 4.0, | ||
| "num_training_gpus": 16, | ||
| "prompt_lengths": [139, 83, 409, 247, 132, 271, 347, 305, 139, 127, 75, 358, 284, 245, 284, 389, 117, 233, 186, 179, 244, 318, 295, 630, 296, 206, 146, 138, 167, 415, 157, 120], | ||
| "response_lengths": [1052, 252, 536, 218, 268, 627, 246, 225, 252, 181, 161, 201, 1, 156, 223, 323, 312, 598, 342, 147, 219, 416, 216, 94, 486, 302, 297, 524, 1, 1106, 254, 192, 1352, 528, 658, 679, 475, 737, 273, 356, 105, 845, 810, 913, 1, 667, 1057, 1029, 313, 823, 145, 739, 444, 1380, 34, 1423, 284, 319, 202, 222, 1, 349, 302, 453, 1248, 284, 618, 204, 170, 440, 316, 512, 174, 615, 257, 234, 223, 233, 578, 181, 86, 262, 148, 1246, 338, 848, 216, 671, 470, 538, 562, 670, 546, 591, 344, 122, 573, 869, 1095, 178, 196, 838, 161, 599, 1018, 1058, 924, 379, 689, 465, 490, 414, 449, 791, 328, 667, 583, 228, 1233, 869, 816, 923, 973, 1211, 1, 736, 947, 918, 354, 491, 187, 170, 471, 383, 199, 178, 596, 287, 143, 124, 145, 195, 173, 1360, 215, 199, 166, 260, 335, 236, 207, 116, 108, 346, 1632, 357, 1, 236, 387, 120, 512, 294, 120, 1389, 120, 188, 60, 152, 139, 173, 58, 73, 91, 195, 124, 266, 46, 183, 354, 476, 99, 141, 1191, 1698, 576, 677, 1212, 94, 1, 1106, 503, 27, 647, 508, 511, 666, 98, 738, 429, 431, 566, 611, 393, 1275, 1, 457, 417, 513, 168, 327, 229, 404, 120, 1643, 1107, 93, 297, 388, 643, 364, 1, 560, 408, 689, 757, 1601, 78, 679, 552, 1264, 1109, 454, 849, 836, 1125, 1066, 1, 618, 459, 539, 425, 327, 1488, 873, 815, 543, 800, 406, 1962, 464, 1813, 360, 1, 729, 788, 1365, 527, 187, 508, 139, 429, 1519, 470, 284, 178, 1235, 360, 200, 1, 179, 224, 250, 602, 555, 1778, 565, 1180, 427, 1679, 732, 167, 681, 509, 508, 339, 1326, 718, 775, 281, 1729, 352, 362, 1044, 855, 663, 451, 543, 326, 772, 330, 1, 590, 1151, 359, 1884, 571, 452, 574, 450, 220, 210, 226, 1294, 588, 287, 989, 1, 199, 1467, 360, 357, 387, 240, 63, 2146, 295, 234, 417, 475, 271, 170, 703, 294, 465, 404, 359, 639, 728, 343, 659, 285, 873, 270, 830, 383, 706, 35, 2391, 386, 599, 711, 594, 715, 541, 435, 771, 602, 2520, 335, 1047, 708, 926, 542, 419, 1703, 310, 490, 773, 515, 300, 661, 736, 594, 521, 60, 702, 2636, 629, 24, 492, 1, 429, 429, 487, 188, 520, 690, 931, 2613, 627, 341, 82, 443, 356, 738, 1005, 1, 561, 771, 1178, 495, 491, 564, 881, 489, 148, 340, 511, 718, 563, 301, 309, 1207, 386, 3066, 256, 137, 208, 192, 150, 199, 128, 161, 107, 145, 126, 180, 194, 1, 256, 139, 207, 183, 54, 116, 270, 194, 225, 125, 393, 121, 89, 124, 273, 168, 185, 162, 189, 140, 65, 289, 217, 315, 76, 119, 130, 143, 229, 115, 56, 258, 195, 414, 284, 389, 1160, 270, 360, 415, 939, 2735, 273, 371, 886, 748, 1912, 508, 198, 323, 796, 221, 134, 359, 158, 185, 253, 328, 516, 337, 106, 249, 414, 1, 386, 334, 564, 276, 47, 148, 131, 175, 177, 441, 474, 109, 101, 24, 240, 1, 542, 583, 595] | ||
| }, | ||
| "mbu_258_percent": { | ||
| "model_name": "Qwen/Qwen2.5-7B", | ||
| "total_generation_time": 11.019336524419487, | ||
| "samples_per_prompt": 16, | ||
| "num_engines": 8, | ||
| "num_gpus_per_engine": 1, | ||
| "training_time": 4.0, | ||
| "num_training_gpus": 16, | ||
| "prompt_lengths": [88, 72, 450, 163, 172, 69, 240, 197, 531, 189, 115, 293, 326, 320, 115, 234, 326, 108, 275, 229, 217, 360, 181, 232, 195, 286, 449, 135, 184, 65, 114, 138], | ||
| "response_lengths": [567, 609, 229, 839, 86, 138, 107, 180, 143, 187, 180, 125, 1, 203, 108, 218, 100, 134, 59, 144, 211, 101, 184, 228, 189, 146, 328, 87, 1, 873, 283, 345, 261, 606, 730, 237, 781, 76, 238, 527, 474, 501, 584, 291, 480, 507, 497, 722, 857, 399, 246, 352, 469, 777, 333, 354, 572, 592, 287, 236, 1, 214, 683, 493, 100, 236, 180, 138, 403, 67, 193, 237, 190, 871, 127, 64, 166, 211, 124, 123, 654, 126, 97, 53, 897, 91, 81, 395, 524, 108, 399, 55, 1, 390, 296, 120, 136, 253, 109, 540, 371, 985, 354, 348, 171, 502, 197, 222, 1, 545, 402, 353, 408, 181, 206, 230, 186, 272, 195, 147, 231, 753, 436, 186, 241, 225, 3753, 226, 585, 425, 678, 926, 752, 914, 826, 591, 965, 350, 24, 608, 1, 551, 251, 256, 363, 507, 1116, 195, 321, 653, 173, 194, 657, 229, 608, 305, 183, 317, 333, 323, 679, 275, 99, 144, 848, 560, 210, 342, 486, 3937, 261, 573, 1, 171, 236, 178, 521, 1224, 57, 596, 291, 584, 471, 1291, 303, 499, 719, 546, 415, 535, 365, 533, 573, 174, 2085, 333, 372, 1831, 4096, 377, 627, 1202, 280, 4096, 215, 465, 612, 293, 393, 187, 780, 778, 235, 541, 877, 295, 80, 643, 275, 12, 1, 1512, 240, 451, 149, 288, 185, 206, 186, 57, 288, 95, 244, 68, 131, 159, 92, 442, 1408, 465, 275, 1190, 822, 3377, 339, 4096, 2546, 1604, 1068, 1328, 4096, 633, 1, 260, 4096, 516, 110, 414, 208, 368, 336, 1343, 305, 451, 226, 490, 297, 334, 1, 597, 590, 385, 312, 315, 330, 628, 239, 664, 597, 461, 816, 1512, 305, 421, 1, 552, 270, 674, 1461, 108, 960, 171, 212, 734, 561, 555, 382, 917, 473, 273, 1, 525, 583, 614, 379, 505, 753, 1523, 329, 778, 332, 783, 390, 55, 728, 259, 1, 125, 524, 234, 349, 201, 437, 150, 1352, 264, 178, 209, 248, 185, 387, 117, 143, 1559, 277, 811, 357, 572, 514, 288, 523, 1897, 425, 467, 195, 1686, 4096, 626, 1, 797, 482, 774, 161, 95, 1150, 1575, 291, 1414, 502, 1413, 387, 538, 1096, 1072, 1, 431, 628, 658, 169, 617, 697, 276, 917, 316, 610, 423, 1057, 1243, 245, 724, 272, 402, 1093, 1778, 1220, 555, 240, 1261, 1040, 356, 151, 275, 557, 1540, 293, 1884, 1, 670, 1016, 232, 279, 1183, 578, 871, 752, 2367, 585, 315, 802, 326, 548, 1194, 820, 580, 943, 583, 1310, 244, 318, 1996, 753, 2520, 25, 1719, 1769, 554, 554, 932, 1, 992, 893, 244, 2113, 1348, 327, 785, 2424, 525, 350, 887, 408, 534, 961, 186, 1, 383, 533, 244, 2575, 260, 438, 667, 403, 1519, 948, 1511, 480, 627, 307, 443, 1, 195, 645, 120, 151, 293, 282, 223, 154, 126, 139, 146, 410, 130, 429, 72, 292, 209, 240, 204, 288, 368, 145, 680, 545, 372, 234, 360, 143, 419, 340, 160, 271, 556, 260, 350, 455, 122, 146, 123, 178, 260, 169, 95, 200, 268, 773, 297, 1, 126, 149, 160] | ||
| }, | ||
| "beaker_212_percent_bug": { | ||
| "model_name": "Qwen/Qwen3-1.7B", | ||
| "total_generation_time": 2.048383, | ||
| "samples_per_prompt": 4, | ||
| "num_engines": 1, | ||
| "num_gpus_per_engine": 1, | ||
| "training_time": 5.0, | ||
| "num_training_gpus": 1, | ||
| "prompt_lengths": [145, 145, 145, 145, 145, 145, 145, 145], | ||
| "response_lengths": [275, 275, 275, 275, 275, 275, 275, 275, 275, 275, 275, 275, 275, 275, 275, 275, 275, 275, 275, 275, 275, 275, 274, 274, 274, 274, 274, 274, 274, 274, 274, 274] | ||
| }, | ||
| "small_batch": { | ||
| "model_name": "Qwen/Qwen2.5-7B", | ||
| "total_generation_time": 5.0, | ||
| "samples_per_prompt": 2, | ||
| "num_engines": 2, | ||
| "num_gpus_per_engine": 1, | ||
| "training_time": 3.0, | ||
| "num_training_gpus": 2, | ||
| "prompt_lengths": [512, 512], | ||
| "response_lengths": [512, 512, 512, 512] | ||
| }, | ||
| "large_batch": { | ||
| "model_name": "Qwen/Qwen2.5-7B", | ||
| "total_generation_time": 8.55, | ||
| "samples_per_prompt": 2, | ||
| "num_engines": 1, | ||
| "num_gpus_per_engine": 2, | ||
| "training_time": 4.0, | ||
| "num_training_gpus": 4, | ||
| "prompt_lengths": [256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256], | ||
| "response_lengths": [256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256] | ||
| } | ||
| } |
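Each entry above bundles what is needed to replay one recorded generation run: the model name, wall-clock generation and training times, the vLLM engine/GPU topology, and per-request prompt and response token counts. Below is a minimal sketch of how such a fixture could be consumed, assuming a hypothetical file name (`mbu_test_cases.json`); the repository's actual loading code and file location may differ.

```python
import json

# Hypothetical path; the actual file name/location in the repo may differ.
with open("mbu_test_cases.json") as f:
    cases = json.load(f)

case = cases["beaker_212_percent_bug"]

# Throughput implied by the recorded run: total generated tokens over wall-clock time.
generated_tokens = sum(case["response_lengths"])
tokens_per_second = generated_tokens / case["total_generation_time"]
generation_gpus = case["num_engines"] * case["num_gpus_per_engine"]

print(
    f'{case["model_name"]}: {generated_tokens} tokens in '
    f'{case["total_generation_time"]:.2f}s '
    f'({tokens_per_second:.0f} tok/s on {generation_gpus} GPU(s))'
)
```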
Bug: MBU Calculation Inconsistency
Inconsistent calculation of MBU. The `memory_bytes()` call passes `args.vllm_num_engines` as a parameter, which causes the returned memory bytes to be scaled by the number of engines (line 1916 in utils.py multiplies by `num_engines`). However, the MBU calculation on line 518 divides by `model_dims.device_memory_bandwidth`, which is the bandwidth of a single GPU, not of all GPUs combined. This creates an inconsistency: the numerator (memory bytes) is scaled for multiple engines but the denominator (bandwidth) is not, leading to potentially inflated MBU values. Either the denominator should be multiplied by `args.vllm_num_engines` to match the numerator's scaling, or the `num_engines` parameter should not be passed to `memory_bytes()`.
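For reference, MBU here is achieved memory traffic divided by peak memory bandwidth, so a value above 100% (as in the fixture names above) signals an accounting bug rather than impossible hardware. The following is a minimal sketch of the consistency this comment asks for, with hypothetical names standing in for the quantities it describes (`memory_bytes_all_engines`, `device_memory_bandwidth`, and so on); it is not the repository's actual API.

```python
def mbu(memory_bytes_all_engines: float,
        generation_time_s: float,
        device_memory_bandwidth: float,
        num_engines: int) -> float:
    """Model Bandwidth Utilization: achieved bytes/s over peak bytes/s.

    If the numerator sums bytes moved across every engine, the peak
    bandwidth in the denominator must cover the same set of GPUs;
    otherwise MBU can exceed 1.0, as in the 157%/161%/258% cases above.
    """
    achieved_bytes_per_s = memory_bytes_all_engines / generation_time_s
    peak_bytes_per_s = device_memory_bandwidth * num_engines  # scale denominator to match numerator
    return achieved_bytes_per_s / peak_bytes_per_s
```

The comment's alternative fix is equivalent: keep the single-GPU bandwidth in the denominator and drop the `num_engines` scaling from `memory_bytes()`, so that both sides of the ratio count the same set of devices.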