-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AArch64] Change IssueWidth to 5 in AArch64SchedNeoverseN2.td #145717
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
It has been observed that the issue width for neoverse-n2 CPUs is set too high, and does not properly reflect the dispatch constraints. I tested various values of IssueWidth (10, 8, 6, 5, 4) with runs of various workloads on a neoverse-n2 machine and I got the highest overall geomean score with an issue width of 5. If this patch were to cause any major regression post-commit, it could be easily reverted, but it is likely to show an overall improvement. Related Neoverse-V2 PR: llvm#142565 Change-Id: Icdbc0aef5ea004439da6abffaaae3151f91c1b5c
@llvm/pr-subscribers-backend-aarch64 Author: Simon Wallis (simonwallis2) ChangesIt has been observed that the issue width for neoverse-n2 CPUs is set too high, and does not properly reflect the dispatch constraints. I tested various values of IssueWidth (10, 8, 6, 5, 4) with runs of various workloads on a neoverse-n2 machine and I got the highest overall geomean score with an issue width of 5. If this patch were to cause any major regression post-commit, it could be easily reverted, but it is likely to show an overall improvement. Related Neoverse-V2 PR: #142565 Change-Id: Icdbc0aef5ea004439da6abffaaae3151f91c1b5c Patch is 324.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145717.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
index e23daec97bd2d..91a707910a7f3 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
@@ -11,7 +11,7 @@
//===----------------------------------------------------------------------===//
def NeoverseN2Model : SchedMachineModel {
- let IssueWidth = 10; // Micro-ops dispatched at a time.
+ let IssueWidth = 5; // Micro-ops dispatched at a time.
let MicroOpBufferSize = 160; // Entries in micro-op re-order buffer.
let LoadLatency = 4; // Optimistic load latency.
let MispredictPenalty = 10; // Extra cycles for mispredicted branch.
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner.ll b/llvm/test/CodeGen/AArch64/machine-combiner.ll
index c8df283aace0b..70a638857ce4a 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner.ll
@@ -262,8 +262,8 @@ define half @reassociate_adds_half(half %x0, half %x1, half %x2, half %x3) {
; CHECK-UNSAFE-LABEL: reassociate_adds_half:
; CHECK-UNSAFE: // %bb.0:
; CHECK-UNSAFE-NEXT: fdiv h0, h0, h1
-; CHECK-UNSAFE-NEXT: fadd h2, h3, h2
-; CHECK-UNSAFE-NEXT: fadd h0, h2, h0
+; CHECK-UNSAFE-NEXT: fadd h1, h3, h2
+; CHECK-UNSAFE-NEXT: fadd h0, h1, h0
; CHECK-UNSAFE-NEXT: ret
%t0 = fdiv half %x0, %x1
%t1 = fadd half %x2, %t0
@@ -284,8 +284,8 @@ define half @reassociate_muls_half(half %x0, half %x1, half %x2, half %x3) {
; CHECK-UNSAFE-LABEL: reassociate_muls_half:
; CHECK-UNSAFE: // %bb.0:
; CHECK-UNSAFE-NEXT: fdiv h0, h0, h1
-; CHECK-UNSAFE-NEXT: fmul h2, h3, h2
-; CHECK-UNSAFE-NEXT: fmul h0, h2, h0
+; CHECK-UNSAFE-NEXT: fmul h1, h3, h2
+; CHECK-UNSAFE-NEXT: fmul h0, h1, h0
; CHECK-UNSAFE-NEXT: ret
%t0 = fdiv half %x0, %x1
%t1 = fmul half %x2, %t0
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
index 99e39567b1ad6..ef9d4463ebe52 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
@@ -5066,19 +5066,19 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 2 2 1.00 movs p0.b, p0/z, p0.b
# CHECK-NEXT: 2 2 1.00 movs p15.b, p15.b
# CHECK-NEXT: 2 2 1.00 movs p15.b, p15/z, p15.b
-# CHECK-NEXT: 1 1 0.10 U mrs x3, ID_AA64ZFR0_EL1
-# CHECK-NEXT: 1 1 0.10 U mrs x3, ZCR_EL1
-# CHECK-NEXT: 1 1 0.10 U mrs x3, ZCR_EL12
-# CHECK-NEXT: 1 1 0.10 U mrs x3, ZCR_EL2
-# CHECK-NEXT: 1 1 0.10 U mrs x3, ZCR_EL3
+# CHECK-NEXT: 1 1 0.20 U mrs x3, ID_AA64ZFR0_EL1
+# CHECK-NEXT: 1 1 0.20 U mrs x3, ZCR_EL1
+# CHECK-NEXT: 1 1 0.20 U mrs x3, ZCR_EL12
+# CHECK-NEXT: 1 1 0.20 U mrs x3, ZCR_EL2
+# CHECK-NEXT: 1 1 0.20 U mrs x3, ZCR_EL3
# CHECK-NEXT: 1 4 1.00 msb z0.b, p7/m, z1.b, z31.b
# CHECK-NEXT: 2 5 2.00 msb z0.d, p7/m, z1.d, z31.d
# CHECK-NEXT: 1 4 1.00 msb z0.h, p7/m, z1.h, z31.h
# CHECK-NEXT: 1 4 1.00 msb z0.s, p7/m, z1.s, z31.s
-# CHECK-NEXT: 1 1 0.10 U msr ZCR_EL1, x3
-# CHECK-NEXT: 1 1 0.10 U msr ZCR_EL12, x3
-# CHECK-NEXT: 1 1 0.10 U msr ZCR_EL2, x3
-# CHECK-NEXT: 1 1 0.10 U msr ZCR_EL3, x3
+# CHECK-NEXT: 1 1 0.20 U msr ZCR_EL1, x3
+# CHECK-NEXT: 1 1 0.20 U msr ZCR_EL12, x3
+# CHECK-NEXT: 1 1 0.20 U msr ZCR_EL2, x3
+# CHECK-NEXT: 1 1 0.20 U msr ZCR_EL3, x3
# CHECK-NEXT: 1 4 1.00 mul z0.b, p7/m, z0.b, z31.b
# CHECK-NEXT: 1 4 1.00 mul z0.b, z1.b, z2.b
# CHECK-NEXT: 2 5 2.00 mul z0.d, p7/m, z0.d, z31.d
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
index 5ffaf9138d482..dee46a304582b 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
@@ -1185,10 +1185,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1197,13 +1197,13 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d }, [x27], #8
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.2d }, [x27], #16
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.2s }, [x27], #8
-# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.4h }, [x27], #8
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.4s }, [x27], #16
-# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1 { v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1215,14 +1215,14 @@ add x0, x27, 1
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d }, [x27], #8
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2d }, [x27], #16
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2s }, [x27], #8
-# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4h }, [x27], #8
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4s }, [x27], #16
-# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.1 2.0 <total>
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.2s }, [x27], #8
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], #8
+# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], #16
+# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.1 2.0 <total>
# CHECK: [1] Code Region - G02
@@ -1231,10 +1231,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1243,13 +1243,13 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b }, [x27], #8
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8h }, [x27], #16
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.16b }, [x27], #16
-# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.1d }, [x27], x28
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.2d }, [x27], x28
-# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1 { v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1261,14 +1261,14 @@ add x0, x27, 1
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b }, [x27], #8
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8h }, [x27], #16
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.16b }, [x27], #16
-# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.1d }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.2d }, [x27], x28
-# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.1 2.0 <total>
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.16b }, [x27], #16
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.1d }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.2d }, [x27], x28
+# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.1 2.0 <total>
# CHECK: [2] Code Region - G03
@@ -1277,10 +1277,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1289,13 +1289,13 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.2s }, [x27], x28
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.4h }, [x27], x28
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.4s }, [x27], x28
-# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.8b }, [x27], x28
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.8h }, [x27], x28
-# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1 { v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1307,14 +1307,14 @@ add x0, x27, 1
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], x28
-# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.8b }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.8h }, [x27], x28
-# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.1 2.0 <total>
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.4s }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.8b }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.8h }, [x27], x28
+# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.1 2.0 <total>
# CHECK: [3] Code Region - G04
@@ -1323,10 +1323,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1900
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.74
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 3.8
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1334,14 +1334,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.16b }, [x27], x28
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1352,15 +1352,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.0 0.1 2.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 2.0 <total>
# CHECK: [4] Code Region - G05
@@ -1369,10 +1369,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.94
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 3.3
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1380,14 +1380,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.4s, v2.4s }, [x27], #32
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1398,15 +1398,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s }, [x27], #32
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 2.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 9. 1 2.0 0.0 4.0 ...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
5 sounds good. LGTM
(But you can remove the "Change-Id" from the patch description). |
It has been observed that the issue width for neoverse-n2 CPUs is set too high, and does not properly reflect the dispatch constraints.
I tested various values of IssueWidth (10, 8, 6, 5, 4) with runs of various workloads on a neoverse-n2 machine and I got the highest overall geomean score with an issue width of 5.
If this patch were to cause any major regression post-commit, it could be easily reverted, but it is likely to show an overall improvement.
Related Neoverse-V2 PR: #142565
Change-Id: Icdbc0aef5ea004439da6abffaaae3151f91c1b5c