@@ -1025,11 +1025,12 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
1025
1025
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1026
1026
; GFX12-NEXT: v_mov_b32_e32 v2, v11
1027
1027
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
1028
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1028
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
1029
1029
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
1030
1030
; GFX12-NEXT: s_wait_alu 0xf1ff
1031
1031
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0
1032
1032
; GFX12-NEXT: s_wait_alu 0xfffd
1033
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1033
1034
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo
1034
1035
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7]
1035
1036
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2387,33 +2388,39 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
2387
2388
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
2388
2389
; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14
2389
2390
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
2390
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2391
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
2391
2392
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
2392
2393
; GFX12-NEXT: s_wait_alu 0xf1ff
2393
2394
; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
2394
2395
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
2396
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
2395
2397
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
2396
2398
; GFX12-NEXT: s_wait_alu 0xfffd
2397
2399
; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
2398
2400
; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
2401
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2399
2402
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
2400
2403
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
2401
2404
; GFX12-NEXT: s_wait_alu 0xfffd
2402
2405
; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
2406
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2403
2407
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
2404
2408
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
2405
2409
; GFX12-NEXT: s_wait_alu 0xfffd
2410
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2406
2411
; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
2407
2412
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
2408
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2413
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
2409
2414
; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
2410
2415
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
2411
2416
; GFX12-NEXT: s_wait_alu 0xf1ff
2412
2417
; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
2413
2418
; GFX12-NEXT: v_mov_b32_e32 v20, v22
2419
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
2414
2420
; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
2415
2421
; GFX12-NEXT: s_wait_alu 0xfffd
2416
2422
; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo
2423
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2417
2424
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
2418
2425
; GFX12-NEXT: v_mov_b32_e32 v19, v22
2419
2426
; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15
@@ -2434,6 +2441,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
2434
2441
; GFX12-NEXT: s_wait_alu 0xf1ff
2435
2442
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
2436
2443
; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21
2444
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
2437
2445
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
2438
2446
; GFX12-NEXT: s_wait_alu 0xf1ff
2439
2447
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
@@ -2447,6 +2455,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
2447
2455
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
2448
2456
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
2449
2457
; GFX12-NEXT: s_wait_alu 0xf1ff
2458
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
2450
2459
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
2451
2460
; GFX12-NEXT: s_wait_alu 0xf1ff
2452
2461
; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
@@ -2463,9 +2472,10 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
2463
2472
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3
2464
2473
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1
2465
2474
; GFX12-NEXT: s_wait_alu 0xfffd
2466
- ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo
2467
2475
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2476
+ ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo
2468
2477
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0
2478
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2469
2479
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
2470
2480
; GFX12-NEXT: s_setpc_b64 s[30:31]
2471
2481
%result = mul i256 %num , %den
0 commit comments