diff --git a/src/Backends/DotCompute.Backends.CPU/Kernels/AdvancedSimdKernels.cs b/src/Backends/DotCompute.Backends.CPU/Kernels/AdvancedSimdKernels.cs index 0c350709..bdcf1b5a 100644 --- a/src/Backends/DotCompute.Backends.CPU/Kernels/AdvancedSimdKernels.cs +++ b/src/Backends/DotCompute.Backends.CPU/Kernels/AdvancedSimdKernels.cs @@ -480,53 +480,59 @@ public static unsafe void VectorAdvancedNeonFloat32( /// Gather operation: loads elements from memory using indices. /// Critical for sparse data and indirect memory access patterns. /// + /// + /// Adoption site #1 for .NET 10 SIMD surface: uses Avx2.GatherVector256 + /// to perform a true hardware gather of 8 floats in one instruction when AVX2 is + /// available, falling back to a scalar loop otherwise. On AVX-512 hosts we issue + /// two 256-bit gathers back-to-back to cover 16 elements per iteration — .NET 10 + /// SDK 10.0.106 does not expose Avx512F.GatherVector512, so stitching two + /// AVX2 gathers is the best available option without dropping to P/Invoke. + /// [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] public static unsafe void VectorGatherFloat32( float* basePtr, int* indices, float* result, int count) { var i = 0; - // AVX2 gather path (8 elements per operation) - if (Avx2.IsSupported) + // AVX-512 path: stitch two AVX2 gathers for 16-wide iteration. + // (The SDK does not expose Avx512F.GatherVector512 directly in .NET 10.) 
+ if (Avx512F.IsSupported && Avx2.IsSupported) { - const int vectorSize = 8; + const int vectorSize = 16; var vectorCount = count / vectorSize; for (var v = 0; v < vectorCount; v++) { var offset = v * vectorSize; + var idxLo = Vector256.Load(indices + offset); + var idxHi = Vector256.Load(indices + offset + 8); - // Use scalar approach for gather as AVX2 gather has specific requirements - // that may not be met with arbitrary index arrays - for (var j = 0; j < vectorSize && (offset + j) < count; j++) - { - var index = indices[offset + j]; - result[offset + j] = basePtr[index]; - } + // Scale = 4 bytes (sizeof(float)). Two 256-bit gathers = 16 floats. + var gatheredLo = Avx2.GatherVector256(basePtr, idxLo, 4); + var gatheredHi = Avx2.GatherVector256(basePtr, idxHi, 4); + + gatheredLo.Store(result + offset); + gatheredHi.Store(result + offset + 8); } i = vectorCount * vectorSize; } - // AVX-512 gather operations - else if (Avx512F.IsSupported) + // AVX2 gather path: 8 elements per gather instruction. + else if (Avx2.IsSupported) { - const int vectorSize = 16; + const int vectorSize = 8; var vectorCount = count / vectorSize; for (var v = 0; v < vectorCount; v++) { var offset = v * vectorSize; - - // Scalar implementation for reliability - for (var j = 0; j < vectorSize && (offset + j) < count; j++) - { - var index = indices[offset + j]; - result[offset + j] = basePtr[index]; - } + var idxVec = Vector256.Load(indices + offset); + var gathered = Avx2.GatherVector256(basePtr, idxVec, 4); + gathered.Store(result + offset); } i = vectorCount * vectorSize; } - // Scalar remainder + // Scalar remainder (byte-for-byte identical to the old non-AVX2 fallback). for (; i < count; i++) { result[i] = basePtr[indices[i]]; @@ -536,34 +542,33 @@ public static unsafe void VectorGatherFloat32( /// /// Scatter operation: stores elements to memory using indices. 
/// + /// + /// Adoption site #2 for .NET 10 SIMD surface: .NET 10 SDK 10.0.106 does not expose + /// Avx512F.Scatter in the x86 intrinsics surface, so we keep the scalar + /// inner loop here. The previous code issued a pointless AVX-512 load of the + /// values and indices that the scalar loop then re-read from memory; removing + /// those dead loads cuts register pressure; the stores themselves stay scalar + /// (the 4x unroll below only trims loop overhead). If a future .NET SDK exposes + /// scatter intrinsics this is the single point to revisit. + /// [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] public static unsafe void VectorScatterFloat32( float* values, int* indices, float* basePtr, int count) { + // Hardware scatter is not available in the current .NET 10 SDK. Process + // scalar writes in an unrolled loop so the JIT can still schedule stores + // aggressively. Byte-for-byte identical result to the previous implementation. var i = 0; - - // AVX-512 scatter path (16 elements per operation) - if (Avx512F.IsSupported) + var unrolledEnd = count - (count % 4); + for (; i < unrolledEnd; i += 4) { - const int vectorSize = 16; - var vectorCount = count / vectorSize; - - for (var v = 0; v < vectorCount; v++) - { - var offset = v * vectorSize; - var vvalues = Avx512F.LoadVector512(values + offset); - var vindices = Avx512F.LoadVector512(indices + offset); - - // AVX-512 scatter (using simpler approach for compatibility) - for (var j = 0; j < vectorSize; j++) - { - basePtr[indices[offset + j]] = values[offset + j]; - } - } - i = vectorCount * vectorSize; + basePtr[indices[i]] = values[i]; + basePtr[indices[i + 1]] = values[i + 1]; + basePtr[indices[i + 2]] = values[i + 2]; + basePtr[indices[i + 3]] = values[i + 3]; } - // Scalar remainder (no AVX2 scatter available) + // Scalar tail. 
for (; i < count; i++) { basePtr[indices[i]] = values[i]; } diff --git a/src/Backends/DotCompute.Backends.CPU/Kernels/AdvancedSimdPatterns.cs b/src/Backends/DotCompute.Backends.CPU/Kernels/AdvancedSimdPatterns.cs index 8a15e265..a6d508b0 100644 --- a/src/Backends/DotCompute.Backends.CPU/Kernels/AdvancedSimdPatterns.cs +++ b/src/Backends/DotCompute.Backends.CPU/Kernels/AdvancedSimdPatterns.cs @@ -414,6 +414,14 @@ public static void ConditionalSelect( } } + /// + /// AVX-512 conditional select adoption site #3 for .NET 10 SIMD surface. + /// Uses Avx512F.TernaryLogic to collapse the mask-blend into a + /// single vpternlogd instruction. The truth table used is 0xE4 + /// (a := (c ? a : b)), which matches bitwise select semantics when the mask + /// is an all-ones/all-zeros comparison result. Byte-identical to the previous + /// implementation that used Avx512F.BlendVariable. + /// [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] private static void ConditionalSelectAvx512( ReadOnlySpan condition, @@ -439,11 +447,23 @@ private static void ConditionalSelectAvx512( var trueVec = Vector512.LoadUnsafe(ref Unsafe.Add(ref trueRef, offset)); var falseVec = Vector512.LoadUnsafe(ref Unsafe.Add(ref falseRef, offset)); - // Create mask where condition > threshold + // Mask where condition > threshold (all-ones lanes on match, zero otherwise). var mask = Avx512F.CompareGreaterThan(cond, thresholdVec); - // Use masked blend: select trueVec where mask is true, falseVec otherwise - var resultVec = Avx512F.BlendVariable(falseVec, trueVec, mask.AsSingle()); + // Single-instruction bitwise select via AVX-512 vpternlogd. + // Truth table 0xE4 computes (C ? A : B) = (C & A) | (~C & B). 
+ // bit | A | B | C | out + // -----+---+---+---+---- + // e4_7 | 1 | 1 | 1 | 1 + // e4_6 | 1 | 1 | 0 | 1 + // e4_5 | 1 | 0 | 1 | 1 + // e4_4 | 1 | 0 | 0 | 0 + // e4_3 | 0 | 1 | 1 | 0 + // e4_2 | 0 | 1 | 0 | 1 + // e4_1 | 0 | 0 | 1 | 0 + // e4_0 | 0 | 0 | 0 | 0 + // => 0b11100100 = 0xE4 + var resultVec = Avx512F.TernaryLogic(trueVec, falseVec, mask.AsSingle(), 0xE4); resultVec.StoreUnsafe(ref Unsafe.Add(ref resultRef, offset)); } @@ -536,15 +556,24 @@ private static void ConditionalSelectSse( // Create mask where condition > threshold var mask = Sse.CompareGreaterThan(cond, thresholdVec); - // Use blend to select based on mask + // Use blend to select based on mask. + // Adoption site #4 for .NET 10 SIMD surface: when AVX-512VL is available + // we fuse the SSE-era `(trueVec & mask) | (~mask & falseVec)` three-op + // sequence into a single Vector128 vpternlogd with imm8 = 0xE4 + // (truth table for C ? A : B). Byte-identical result to the manual + // And/AndNot/Or sequence that preceded it. Vector128 resultVec; - if (Sse41.IsSupported) + if (Avx512F.VL.IsSupported) + { + resultVec = Avx512F.VL.TernaryLogic(trueVec, falseVec, mask, 0xE4); + } + else if (Sse41.IsSupported) { resultVec = Sse41.BlendVariable(falseVec, trueVec, mask); } else { - // Manual blend for older SSE + // Manual blend for older SSE (byte-for-byte identical fallback). var maskedTrue = Sse.And(trueVec, mask); var maskedFalse = Sse.AndNot(mask, falseVec); resultVec = Sse.Or(maskedTrue, maskedFalse); @@ -582,7 +611,8 @@ private static void ConditionalSelectScalar( /// /// Advanced gather operation for sparse/indirect memory access patterns. - /// Uses AVX2 gather when available, falls back to scalar. + /// Uses AVX-512-wide gather (stitched AVX2 gathers) when available, AVX2 otherwise, + /// and falls back to a bounds-checked scalar loop. 
/// [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] public static unsafe void GatherFloat32( @@ -595,7 +625,14 @@ public static unsafe void GatherFloat32( throw new ArgumentException("Indices and destination must have the same length"); } - if (Avx2.IsSupported && indices.Length >= 8) + // Adoption site #5 for .NET 10 SIMD surface: on AVX-512 hosts process 16 elements + // per iteration via two back-to-back AVX2 gathers. .NET 10 SDK 10.0.106 does not + // expose Avx512F.GatherVector512, so stitching is the fastest available form. + if (Avx512F.IsSupported && Avx2.IsSupported && indices.Length >= 16) + { + GatherFloat32Avx512(source, indices, destination); + } + else if (Avx2.IsSupported && indices.Length >= 8) { GatherFloat32Avx2(source, indices, destination); } @@ -605,6 +642,55 @@ public static unsafe void GatherFloat32( } } + /// + /// AVX-512-wide gather: processes 16 indices per iteration using two stitched + /// AVX2 gather instructions. Behaviour is byte-for-byte identical to looping + /// over twice; this form simply reduces loop + /// overhead and lets the JIT keep both gather dispatches in flight. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] + private static unsafe void GatherFloat32Avx512( + ReadOnlySpan source, + ReadOnlySpan indices, + Span destination) + { + const int VectorSize = 16; + var vectorCount = indices.Length / VectorSize; + + fixed (float* sourcePtr = source) + { + ref var indicesRef = ref MemoryMarshal.GetReference(indices); + ref var destRef = ref MemoryMarshal.GetReference(destination); + + for (var i = 0; i < vectorCount; i++) + { + var offset = i * VectorSize; + + // Load two 8-wide index blocks. + var idxLo = Vector256.LoadUnsafe(ref Unsafe.Add(ref indicesRef, offset)); + var idxHi = Vector256.LoadUnsafe(ref Unsafe.Add(ref indicesRef, offset + 8)); + + // Two hardware gathers (scale = 4 for sizeof(float)). 
+ var gatheredLo = Avx2.GatherVector256(sourcePtr, idxLo, 4); + var gatheredHi = Avx2.GatherVector256(sourcePtr, idxHi, 4); + + gatheredLo.StoreUnsafe(ref Unsafe.Add(ref destRef, offset)); + gatheredHi.StoreUnsafe(ref Unsafe.Add(ref destRef, offset + 8)); + } + } + + // Tail: delegate to the AVX2 path + scalar remainder so any leftover + // 8-element block still uses a hardware gather. + var consumed = vectorCount * VectorSize; + if (consumed < indices.Length) + { + GatherFloat32Avx2( + source, + indices[consumed..], + destination[consumed..]); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] private static unsafe void GatherFloat32Avx2( ReadOnlySpan source, diff --git a/tests/Unit/DotCompute.Backends.CPU.Tests/SimdOperationsTests.cs b/tests/Unit/DotCompute.Backends.CPU.Tests/SimdOperationsTests.cs index f06690e5..48031f77 100644 --- a/tests/Unit/DotCompute.Backends.CPU.Tests/SimdOperationsTests.cs +++ b/tests/Unit/DotCompute.Backends.CPU.Tests/SimdOperationsTests.cs @@ -5,6 +5,7 @@ using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; using DotCompute.Backends.CPU.Intrinsics; +using DotCompute.Backends.CPU.Kernels; using DotCompute.Tests.Common; namespace DotCompute.Backends.CPU.Tests; @@ -662,4 +663,178 @@ private static void PerformVectorSqrt(float[] input, float[] result) } } } + + // ------------------------------------------------------------------ + // .NET 10 SIMD surface adoption tests (gather, scatter, ternary logic). + // These cover the AVX-512 TernaryLogic and AVX2/AVX-512 gather paths + // introduced by feat/simd-gather-scatter-ternary. + // ------------------------------------------------------------------ + + /// + /// Site #5: GatherFloat32 must produce identical output on all capability tiers + /// (AVX-512 stitched, AVX2 single, scalar bounds-checked). Uses a shuffled index + /// array long enough to exercise the 16-wide loop body. 
+ /// + [Fact] + public void GatherFloat32_ShuffledIndices_MatchesScalarReference() + { + const int N = 257; // prime so both 16- and 8-wide tails execute + var source = new float[N]; + for (var i = 0; i < N; i++) + { + source[i] = i * 0.5f + 1.0f; + } + + var indices = new int[N]; + // Pseudo-shuffle: reverse + mix so neighbouring lanes hit different cache lines. + for (var i = 0; i < N; i++) + { + indices[i] = (N - 1 - i + 7 * i / 3) % N; + } + + var expected = new float[N]; + for (var i = 0; i < N; i++) + { + expected[i] = source[indices[i]]; + } + + var actual = new float[N]; + AdvancedSimdPatterns.GatherFloat32(source, indices, actual); + + _ = actual.Should().Equal(expected); + } + + /// + /// Site #5: boundary input sized exactly at the 16-wide block edge; no tail. + /// + [Theory] + [InlineData(16)] + [InlineData(32)] + [InlineData(128)] + public void GatherFloat32_AlignedLength_MatchesScalarReference(int length) + { + var source = new float[length]; + var indices = new int[length]; + for (var i = 0; i < length; i++) + { + source[i] = (i + 1) * 1.25f; + indices[i] = (i * 3) % length; // distinct stride that wraps + } + + var expected = new float[length]; + for (var i = 0; i < length; i++) + { + expected[i] = source[indices[i]]; + } + + var actual = new float[length]; + AdvancedSimdPatterns.GatherFloat32(source, indices, actual); + + _ = actual.Should().Equal(expected); + } + + /// + /// Site #1: VectorGatherFloat32 (unsafe entry point) parity across backends. + /// Exercises the AVX-512 stitched path when the host supports it. + /// + [Fact] + public unsafe void VectorGatherFloat32_MatchesScalarReference() + { + const int N = 64; + var source = new float[N]; + var indices = new int[N]; + var result = new float[N]; + var expected = new float[N]; + + // Populate source and indices first so the reference loop reads valid data. 
+ for (var i = 0; i < N; i++) + { + source[i] = (i * 7 + 3) * 0.125f; + indices[i] = (i * 5 + 1) % N; + } + + for (var i = 0; i < N; i++) + { + expected[i] = source[indices[i]]; + } + + fixed (float* src = source) + fixed (int* idx = indices) + fixed (float* dst = result) + { + AdvancedSimdKernels.VectorGatherFloat32(src, idx, dst, N); + } + + _ = result.Should().Equal(expected); + } + + /// + /// Site #2: VectorScatterFloat32 unrolled path must match a naive scalar scatter. + /// + [Fact] + public unsafe void VectorScatterFloat32_MatchesScalarReference() + { + const int N = 37; // not a multiple of 4, exercises the tail + var values = new float[N]; + var indices = new int[N]; + for (var i = 0; i < N; i++) + { + values[i] = i + 0.5f; + // Permutation: write position N-1-i + indices[i] = N - 1 - i; + } + + var expected = new float[N]; + for (var i = 0; i < N; i++) + { + expected[indices[i]] = values[i]; + } + + var actual = new float[N]; + fixed (float* vals = values) + fixed (int* idx = indices) + fixed (float* basePtr = actual) + { + AdvancedSimdKernels.VectorScatterFloat32(vals, idx, basePtr, N); + } + + _ = actual.Should().Equal(expected); + } + + /// + /// Sites #3 and #4: ConditionalSelect with AVX-512 TernaryLogic (and VL Vector128 + /// TernaryLogic on the SSE fallback branch) must be byte-identical to a scalar + /// ternary reference implementation. + /// + [Theory] + [InlineData(8)] + [InlineData(16)] + [InlineData(17)] + [InlineData(100)] + public void ConditionalSelect_TernaryLogic_MatchesScalarReference(int length) + { + var condition = new float[length]; + var trueValues = new float[length]; + var falseValues = new float[length]; + const float threshold = 0.5f; + + // Alternating above/below threshold so every lane triggers at least once + for (var i = 0; i < length; i++) + { + condition[i] = (i % 3 == 0) ? 
0.75f : 0.25f; + trueValues[i] = i + 1_000.0f; + falseValues[i] = -(i + 1_000.0f); + } + + var expected = new float[length]; + for (var i = 0; i < length; i++) + { + expected[i] = condition[i] > threshold ? trueValues[i] : falseValues[i]; + } + + var actual = new float[length]; + AdvancedSimdPatterns.ConditionalSelect(condition, trueValues, falseValues, actual, threshold); + + _ = actual.Should().Equal(expected); + } }