perf(evm): add constant-shift fast paths #388
perf(evm): add constant-shift fast paths #388 — starwarfan wants to merge 1 commit into DTVMStack:main from
Conversation
⚡ Performance Regression Check Results: ✅ Performance Check Passed (interpreter) — Performance Benchmark Results (threshold: 25%)
Summary: 194 benchmarks, 0 regressions. ✅ Performance Check Passed (multipass) — Performance Benchmark Results (threshold: 25%)
Summary: 194 benchmarks, 0 regressions
There was a problem hiding this comment.
Pull request overview
This PR adds constant-shift fast paths in the EVM MIR compiler to avoid the current O(4²) select/cmp-based limb selection when the shift amount is a compile-time constant, targeting improved performance in shift-heavy workloads (e.g., SHA1/BLAKE2b).
Changes:
- Add a helper to detect constant shift amounts from MIR instructions.
- Implement constant-shift fast paths for SHL, logical SHR, and arithmetic SAR using direct limb/carry logic.
- Add required includes for MIR constants, LLVM casting, and
std::optional.
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
| // Fast path: constant shift amount — direct limb logic, no Select/cmp loops. | ||
| if (auto ShiftOpt = getConstShiftAmount(ShiftAmount)) { | ||
| uint64_t Shift = *ShiftOpt; | ||
| if (Shift >= 256) { | ||
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) | ||
| Result[I] = Zero; | ||
| return Result; | ||
| } | ||
| uint64_t CompShift = Shift / 64; | ||
| uint64_t ShiftMod = Shift % 64; | ||
| uint64_t CarryShift = (ShiftMod == 0) ? 0 : (64 - ShiftMod); | ||
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) { |
There was a problem hiding this comment.
Same issue as SHL fast path: the constant-shift path uses only ShiftAmount (low 64 bits) and ignores IsLargeShift, so shifts with any non-zero high limb but small low limb will incorrectly behave like a small shift instead of producing 0 per EVM spec. The fast path should still incorporate IsLargeShift (e.g., per-limb select to 0) to preserve correctness for large 256-bit shift values.
| // Fast path: constant shift amount — direct limb logic, no Select/cmp loops. | ||
| if (auto ShiftOpt = getConstShiftAmount(ShiftAmount)) { | ||
| uint64_t Shift = *ShiftOpt; | ||
| if (Shift >= 256) { | ||
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) | ||
| Result[I] = LargeShiftResult; | ||
| return Result; | ||
| } | ||
| uint64_t CompShift = Shift / 64; | ||
| uint64_t ShiftMod = Shift % 64; | ||
| uint64_t CarryShift = (ShiftMod == 0) ? 0 : (64 - ShiftMod); | ||
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) { |
There was a problem hiding this comment.
The SAR constant-shift fast path ignores IsLargeShift and only checks Shift >= 256 based on the low 64 bits. This is incorrect for 256-bit shift values with high limbs set but Shift[0] < 256 (EVM requires full sign-extension result when shift >= 256). Make the fast path depend on IsLargeShift (e.g., select LargeShiftResult vs computed limb result) or lift constant-shift evaluation to handleShift() where the full 256-bit shift is known.
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) { | ||
| MInstruction *R = Zero; | ||
| if (I >= CompShift) { | ||
| size_t SrcIdx = I - CompShift; | ||
| MInstruction *SrcVal = Value[SrcIdx]; | ||
| MInstruction *Shifted = createInstruction<BinaryInstruction>( | ||
| false, OP_shl, MirI64Type, SrcVal, | ||
| createIntConstInstruction(MirI64Type, ShiftMod)); | ||
| if (SrcIdx > 0 && RemainingBits > 0) { | ||
| MInstruction *Carry = createInstruction<BinaryInstruction>( | ||
| false, OP_ushr, MirI64Type, Value[SrcIdx - 1], | ||
| createIntConstInstruction(MirI64Type, RemainingBits)); | ||
| R = createInstruction<BinaryInstruction>(false, OP_or, MirI64Type, | ||
| Shifted, Carry); | ||
| } else { | ||
| R = Shifted; |
There was a problem hiding this comment.
In the constant-SHL fast path, createIntConstInstruction(..., ShiftMod) and createIntConstInstruction(..., RemainingBits) are called inside the loop, creating a new OP_const each iteration even though the value is invariant. Hoist these constant instructions outside the loop (and consider skipping the shift/or entirely when ShiftMod == 0 by directly using the source limb) to keep the fast path from inflating MIR instruction count.
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) { | |
| MInstruction *R = Zero; | |
| if (I >= CompShift) { | |
| size_t SrcIdx = I - CompShift; | |
| MInstruction *SrcVal = Value[SrcIdx]; | |
| MInstruction *Shifted = createInstruction<BinaryInstruction>( | |
| false, OP_shl, MirI64Type, SrcVal, | |
| createIntConstInstruction(MirI64Type, ShiftMod)); | |
| if (SrcIdx > 0 && RemainingBits > 0) { | |
| MInstruction *Carry = createInstruction<BinaryInstruction>( | |
| false, OP_ushr, MirI64Type, Value[SrcIdx - 1], | |
| createIntConstInstruction(MirI64Type, RemainingBits)); | |
| R = createInstruction<BinaryInstruction>(false, OP_or, MirI64Type, | |
| Shifted, Carry); | |
| } else { | |
| R = Shifted; | |
| // Hoist loop-invariant constant instructions out of the limb loop. | |
| MInstruction *ShiftModConst = nullptr; | |
| MInstruction *RemainingBitsConst = nullptr; | |
| if (ShiftMod != 0) { | |
| ShiftModConst = createIntConstInstruction(MirI64Type, ShiftMod); | |
| } | |
| if (RemainingBits > 0) { | |
| RemainingBitsConst = | |
| createIntConstInstruction(MirI64Type, RemainingBits); | |
| } | |
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) { | |
| MInstruction *R = Zero; | |
| if (I >= CompShift) { | |
| size_t SrcIdx = I - CompShift; | |
| if (ShiftMod == 0) { | |
| // Pure limb shift (multiple of 64): no intra-limb shift/carry needed. | |
| R = Value[SrcIdx]; | |
| } else { | |
| MInstruction *SrcVal = Value[SrcIdx]; | |
| MInstruction *Shifted = createInstruction<BinaryInstruction>( | |
| false, OP_shl, MirI64Type, SrcVal, ShiftModConst); | |
| if (SrcIdx > 0 && RemainingBitsConst) { | |
| MInstruction *Carry = createInstruction<BinaryInstruction>( | |
| false, OP_ushr, MirI64Type, Value[SrcIdx - 1], | |
| RemainingBitsConst); | |
| R = createInstruction<BinaryInstruction>(false, OP_or, MirI64Type, | |
| Shifted, Carry); | |
| } else { | |
| R = Shifted; | |
| } |
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) { | ||
| MInstruction *R = Zero; | ||
| if (I + CompShift < EVM_ELEMENTS_COUNT) { | ||
| size_t SrcIdx = I + CompShift; | ||
| MInstruction *SrcVal = Value[SrcIdx]; | ||
| MInstruction *Shifted = createInstruction<BinaryInstruction>( | ||
| false, OP_ushr, MirI64Type, SrcVal, | ||
| createIntConstInstruction(MirI64Type, ShiftMod)); | ||
| if (SrcIdx + 1 < EVM_ELEMENTS_COUNT && CarryShift > 0) { | ||
| MInstruction *Carry = createInstruction<BinaryInstruction>( | ||
| false, OP_shl, MirI64Type, Value[SrcIdx + 1], | ||
| createIntConstInstruction(MirI64Type, CarryShift)); |
There was a problem hiding this comment.
In the constant-SHR fast path, the shift amounts are materialized with createIntConstInstruction inside the per-limb loop, which produces duplicate OP_const instructions for ShiftMod/CarryShift. Hoist these constants outside the loop (and when ShiftMod == 0, avoid generating the shift/or at all by directly using the source limb) to maximize the intended perf win and reduce MIR bloat.
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) { | |
| MInstruction *R = Zero; | |
| if (I + CompShift < EVM_ELEMENTS_COUNT) { | |
| size_t SrcIdx = I + CompShift; | |
| MInstruction *SrcVal = Value[SrcIdx]; | |
| MInstruction *Shifted = createInstruction<BinaryInstruction>( | |
| false, OP_ushr, MirI64Type, SrcVal, | |
| createIntConstInstruction(MirI64Type, ShiftMod)); | |
| if (SrcIdx + 1 < EVM_ELEMENTS_COUNT && CarryShift > 0) { | |
| MInstruction *Carry = createInstruction<BinaryInstruction>( | |
| false, OP_shl, MirI64Type, Value[SrcIdx + 1], | |
| createIntConstInstruction(MirI64Type, CarryShift)); | |
| // If the shift is a multiple of 64, we only need to move whole limbs. | |
| if (ShiftMod == 0) { | |
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) { | |
| MInstruction *R = Zero; | |
| if (I + CompShift < EVM_ELEMENTS_COUNT) { | |
| size_t SrcIdx = I + CompShift; | |
| R = Value[SrcIdx]; | |
| } | |
| Result[I] = protectUnsafeValue(R, MirI64Type); | |
| } | |
| return Result; | |
| } | |
| // Hoist loop-invariant shift constants out of the limb loop. | |
| MInstruction *ShiftModConst = | |
| createIntConstInstruction(MirI64Type, ShiftMod); | |
| MInstruction *CarryShiftConst = nullptr; | |
| if (CarryShift > 0) | |
| CarryShiftConst = createIntConstInstruction(MirI64Type, CarryShift); | |
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) { | |
| MInstruction *R = Zero; | |
| if (I + CompShift < EVM_ELEMENTS_COUNT) { | |
| size_t SrcIdx = I + CompShift; | |
| MInstruction *SrcVal = Value[SrcIdx]; | |
| MInstruction *Shifted = createInstruction<BinaryInstruction>( | |
| false, OP_ushr, MirI64Type, SrcVal, ShiftModConst); | |
| if (SrcIdx + 1 < EVM_ELEMENTS_COUNT && CarryShiftConst != nullptr) { | |
| MInstruction *Carry = createInstruction<BinaryInstruction>( | |
| false, OP_shl, MirI64Type, Value[SrcIdx + 1], CarryShiftConst); |
| // Use arithmetic shift for the high component (contains sign bit) | ||
| bool UseArithShift = (SrcIdx == EVM_ELEMENTS_COUNT - 1); | ||
| MInstruction *Shifted = createInstruction<BinaryInstruction>( | ||
| false, UseArithShift ? OP_sshr : OP_ushr, MirI64Type, SrcVal, | ||
| createIntConstInstruction(MirI64Type, ShiftMod)); | ||
| if (SrcIdx + 1 < EVM_ELEMENTS_COUNT && CarryShift > 0) { | ||
| MInstruction *Carry = createInstruction<BinaryInstruction>( | ||
| false, OP_shl, MirI64Type, Value[SrcIdx + 1], | ||
| createIntConstInstruction(MirI64Type, CarryShift)); | ||
| R = createInstruction<BinaryInstruction>(false, OP_or, MirI64Type, | ||
| Shifted, Carry); |
There was a problem hiding this comment.
In the constant-SAR fast path, createIntConstInstruction(..., ShiftMod) / createIntConstInstruction(..., CarryShift) are created inside the loop even though they are loop-invariant, adding extra OP_const nodes. Hoist the constants outside the loop (and if ShiftMod == 0, avoid emitting a redundant shift-by-0 instruction) so this path stays minimal.
| // Fast path: constant shift amount — direct limb logic, no Select/cmp loops. | ||
| if (auto ShiftOpt = getConstShiftAmount(ShiftAmount)) { | ||
| uint64_t Shift = *ShiftOpt; | ||
| if (Shift >= 256) { | ||
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) | ||
| Result[I] = Zero; | ||
| return Result; | ||
| } | ||
| uint64_t CompShift = Shift / 64; | ||
| uint64_t ShiftMod = Shift % 64; | ||
| uint64_t RemainingBits = (ShiftMod == 0) ? 0 : (64 - ShiftMod); | ||
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) { | ||
| MInstruction *R = Zero; | ||
| if (I >= CompShift) { | ||
| size_t SrcIdx = I - CompShift; | ||
| MInstruction *SrcVal = Value[SrcIdx]; | ||
| MInstruction *Shifted = createInstruction<BinaryInstruction>( | ||
| false, OP_shl, MirI64Type, SrcVal, | ||
| createIntConstInstruction(MirI64Type, ShiftMod)); | ||
| if (SrcIdx > 0 && RemainingBits > 0) { | ||
| MInstruction *Carry = createInstruction<BinaryInstruction>( | ||
| false, OP_ushr, MirI64Type, Value[SrcIdx - 1], | ||
| createIntConstInstruction(MirI64Type, RemainingBits)); | ||
| R = createInstruction<BinaryInstruction>(false, OP_or, MirI64Type, | ||
| Shifted, Carry); | ||
| } else { | ||
| R = Shifted; | ||
| } | ||
| } | ||
| Result[I] = protectUnsafeValue(R, MirI64Type); | ||
| } | ||
| return Result; | ||
| } |
There was a problem hiding this comment.
The constant-shift fast path derives Shift only from ShiftAmount (low 64 bits) and bypasses the IsLargeShift guard. This breaks EVM semantics when the 256-bit shift has any high limb set but Shift[0] < 256 (e.g. shift = 2^64 => ShiftAmount constant 0 while IsLargeShift should force the result to 0). Ensure the fast path still applies IsLargeShift (e.g., select between 0 and the computed result per limb) or move constant-shift handling up to handleShift() where all 4 limbs are available.
| // Fast path: constant shift amount — direct limb logic, no Select/cmp loops. | |
| if (auto ShiftOpt = getConstShiftAmount(ShiftAmount)) { | |
| uint64_t Shift = *ShiftOpt; | |
| if (Shift >= 256) { | |
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) | |
| Result[I] = Zero; | |
| return Result; | |
| } | |
| uint64_t CompShift = Shift / 64; | |
| uint64_t ShiftMod = Shift % 64; | |
| uint64_t RemainingBits = (ShiftMod == 0) ? 0 : (64 - ShiftMod); | |
| for (size_t I = 0; I < EVM_ELEMENTS_COUNT; ++I) { | |
| MInstruction *R = Zero; | |
| if (I >= CompShift) { | |
| size_t SrcIdx = I - CompShift; | |
| MInstruction *SrcVal = Value[SrcIdx]; | |
| MInstruction *Shifted = createInstruction<BinaryInstruction>( | |
| false, OP_shl, MirI64Type, SrcVal, | |
| createIntConstInstruction(MirI64Type, ShiftMod)); | |
| if (SrcIdx > 0 && RemainingBits > 0) { | |
| MInstruction *Carry = createInstruction<BinaryInstruction>( | |
| false, OP_ushr, MirI64Type, Value[SrcIdx - 1], | |
| createIntConstInstruction(MirI64Type, RemainingBits)); | |
| R = createInstruction<BinaryInstruction>(false, OP_or, MirI64Type, | |
| Shifted, Carry); | |
| } else { | |
| R = Shifted; | |
| } | |
| } | |
| Result[I] = protectUnsafeValue(R, MirI64Type); | |
| } | |
| return Result; | |
| } | |
| // Note: We deliberately avoid a constant-shift fast path here because | |
| // deriving the 256-bit shift solely from the low 64-bit ShiftAmount | |
| // can bypass the IsLargeShift guard and break EVM semantics when any | |
| // high limb of the shift value is non-zero. All shifts are handled | |
| // by the generic implementation below, which correctly applies | |
| // IsLargeShift to enforce zeroing for large shifts. |
1. Does this PR affect any open issues? (Y/N) If yes, add issue references (e.g. "fix #123", "re #123"):
2. What is the scope of this PR (e.g. component or file name):
3. Provide a description of the PR (e.g. more details, effects, motivations or doc link):
For constant shift amounts, use direct limb logic instead of the O(4²) Select/cmp loops. This improves sha1_shifts by -68% and blake2b_shifts by -20%.
4. Are there any breaking changes? (Y/N) If yes, describe the breaking changes (e.g. more details, motivations or doc link):
5. Are there test cases for these changes? (Y/N) Select and add more details, references or doc links:
6. Release note