@@ -480,53 +480,59 @@ public static unsafe void VectorAdvancedNeonFloat32(
 /// Gather operation: loads elements from memory using indices.
 /// Critical for sparse data and indirect memory access patterns.
 /// </summary>
+/// <remarks>
+/// Adoption site #1 for the .NET 10 SIMD surface: uses <c>Avx2.GatherVector256</c>
+/// to perform a true hardware gather of 8 floats in one instruction when AVX2 is
+/// available, falling back to a scalar loop otherwise. On AVX-512 hosts we issue
+/// two 256-bit gathers back-to-back to cover 16 elements per iteration; .NET 10
+/// SDK 10.0.106 does not expose <c>Avx512F.GatherVector512</c>, so stitching two
+/// AVX2 gathers is the best available option without dropping to P/Invoke.
+/// </remarks>
 [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
 public static unsafe void VectorGatherFloat32(
     float* basePtr, int* indices, float* result, int count)
 {
     var i = 0;
 
-    // AVX2 gather path (8 elements per operation)
-    if (Avx2.IsSupported)
+    // AVX-512 path: stitch two AVX2 gathers for 16-wide iteration.
+    // (The SDK does not expose Avx512F.GatherVector512 directly in .NET 10.)
+    if (Avx512F.IsSupported && Avx2.IsSupported)
     {
-        const int vectorSize = 8;
+        const int vectorSize = 16;
         var vectorCount = count / vectorSize;
 
         for (var v = 0; v < vectorCount; v++)
         {
             var offset = v * vectorSize;
+            var idxLo = Vector256.Load(indices + offset);
+            var idxHi = Vector256.Load(indices + offset + 8);
 
-            // Use scalar approach for gather as AVX2 gather has specific requirements
-            // that may not be met with arbitrary index arrays
-            for (var j = 0; j < vectorSize && (offset + j) < count; j++)
-            {
-                var index = indices[offset + j];
-                result[offset + j] = basePtr[index];
-            }
+            // Scale = 4 bytes (sizeof(float)). Two 256-bit gathers = 16 floats.
+            var gatheredLo = Avx2.GatherVector256(basePtr, idxLo, 4);
+            var gatheredHi = Avx2.GatherVector256(basePtr, idxHi, 4);
+
+            gatheredLo.Store(result + offset);
+            gatheredHi.Store(result + offset + 8);
         }
         i = vectorCount * vectorSize;
     }
-    // AVX-512 gather operations
-    else if (Avx512F.IsSupported)
+    // AVX2 gather path: 8 elements per gather instruction.
+    else if (Avx2.IsSupported)
     {
-        const int vectorSize = 16;
+        const int vectorSize = 8;
         var vectorCount = count / vectorSize;
 
         for (var v = 0; v < vectorCount; v++)
         {
             var offset = v * vectorSize;
 
-            // Scalar implementation for reliability
-            for (var j = 0; j < vectorSize && (offset + j) < count; j++)
-            {
-                var index = indices[offset + j];
-                result[offset + j] = basePtr[index];
-            }
+            var idxVec = Vector256.Load(indices + offset);
+            var gathered = Avx2.GatherVector256(basePtr, idxVec, 4);
+            gathered.Store(result + offset);
         }
         i = vectorCount * vectorSize;
     }
 
-    // Scalar remainder
+    // Scalar remainder (byte-for-byte identical to the old non-AVX2 fallback).
     for (; i < count; i++)
     {
         result[i] = basePtr[indices[i]];
@@ -536,34 +542,33 @@ public static unsafe void VectorGatherFloat32(
 /// <summary>
 /// Scatter operation: stores elements to memory using indices.
 /// </summary>
+/// <remarks>
+/// Adoption site #2 for the .NET 10 SIMD surface: .NET 10 SDK 10.0.106 does not
+/// expose <c>Avx512F.Scatter</c> in the x86 intrinsics surface, so we keep the
+/// scalar inner loop here. The previous code issued a pointless AVX-512 load of
+/// the values and indices that the scalar loop then re-read from memory; removing
+/// those dead loads cuts register pressure, and the remaining scalar stores are
+/// unrolled four-wide so the JIT can schedule them back-to-back. If a future
+/// .NET SDK exposes scatter intrinsics, this is the single point to revisit.
+/// </remarks>
 [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
 public static unsafe void VectorScatterFloat32(
     float* values, int* indices, float* basePtr, int count)
 {
+    // Hardware scatter is not available in the current .NET 10 SDK. Process
+    // scalar writes in an unrolled loop so the JIT can still schedule stores
+    // aggressively. Byte-for-byte identical result to the previous implementation.
     var i = 0;
-
-    // AVX-512 scatter path (16 elements per operation)
-    if (Avx512F.IsSupported)
+    var unrolledEnd = count - (count % 4);
+    for (; i < unrolledEnd; i += 4)
     {
-        const int vectorSize = 16;
-        var vectorCount = count / vectorSize;
-
-        for (var v = 0; v < vectorCount; v++)
-        {
-            var offset = v * vectorSize;
-            var vvalues = Avx512F.LoadVector512(values + offset);
-            var vindices = Avx512F.LoadVector512(indices + offset);
-
-            // AVX-512 scatter (using simpler approach for compatibility)
-            for (var j = 0; j < vectorSize; j++)
-            {
-                basePtr[indices[offset + j]] = values[offset + j];
-            }
-        }
-        i = vectorCount * vectorSize;
+        basePtr[indices[i]] = values[i];
+        basePtr[indices[i + 1]] = values[i + 1];
+        basePtr[indices[i + 2]] = values[i + 2];
+        basePtr[indices[i + 3]] = values[i + 3];
     }
 
-    // Scalar remainder (no AVX2 scatter available)
+    // Scalar tail.
     for (; i < count; i++)
     {
         basePtr[indices[i]] = values[i];
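
One behavioural note worth recording next to this hunk: with duplicate indices, both the unrolled loop and the scalar tail issue their stores in program order, so the last write wins. A minimal sketch of that contract, assuming the containing type is named VectorOps (the diff never shows the enclosing class):

    unsafe
    {
        float* values = stackalloc float[] { 1f, 2f, 3f, 4f };
        int* indices = stackalloc int[] { 0, 1, 1, 2 };   // index 1 appears twice
        float* dest = stackalloc float[3];

        // Hypothetical type name; the diff does not show the enclosing class.
        VectorOps.VectorScatterFloat32(values, indices, dest, 4);

        // dest == { 1f, 3f, 4f }: the later write to index 1 (3f) wins.
    }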
@@ -414,6 +414,14 @@ public static void ConditionalSelect(
     }
 }
 
+/// <summary>
+/// AVX-512 conditional select, adoption site #3 for the .NET 10 SIMD surface.
+/// Uses <c>Avx512F.TernaryLogic</c> to collapse the mask-blend into a single
+/// <c>vpternlogd</c> instruction. The control byte is 0xCA, the classic
+/// bitwise-select table: with the mask as the first operand it computes
+/// (mask AND a) OR (NOT mask AND b), matching blend semantics for an
+/// all-ones/all-zeros comparison mask. Byte-identical to the previous
+/// implementation that used <c>Avx512F.BlendVariable</c>.
+/// </summary>
 [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
 private static void ConditionalSelectAvx512(
     ReadOnlySpan<float> condition,
@@ -439,11 +447,23 @@ private static void ConditionalSelectAvx512(
         var trueVec = Vector512.LoadUnsafe(ref Unsafe.Add(ref trueRef, offset));
         var falseVec = Vector512.LoadUnsafe(ref Unsafe.Add(ref falseRef, offset));
 
-        // Create mask where condition > threshold
+        // Mask where condition > threshold (all-ones lanes on match, zero otherwise).
         var mask = Avx512F.CompareGreaterThan(cond, thresholdVec);
 
-        // Use masked blend: select trueVec where mask is true, falseVec otherwise
-        var resultVec = Avx512F.BlendVariable(falseVec, trueVec, mask.AsSingle());
+        // Single-instruction bitwise select via AVX-512 vpternlogd. With the
+        // mask as the first operand (A), truth table 0xCA computes
+        // (A ? B : C) = (A & B) | (~A & C), i.e. mask ? trueVec : falseVec.
+        //  bit  | A | B | C | out
+        // ------+---+---+---+----
+        //  ca_7 | 1 | 1 | 1 |  1
+        //  ca_6 | 1 | 1 | 0 |  1
+        //  ca_5 | 1 | 0 | 1 |  0
+        //  ca_4 | 1 | 0 | 0 |  0
+        //  ca_3 | 0 | 1 | 1 |  1
+        //  ca_2 | 0 | 1 | 0 |  0
+        //  ca_1 | 0 | 0 | 1 |  1
+        //  ca_0 | 0 | 0 | 0 |  0
+        // => 0b11001010 = 0xCA
+        var resultVec = Avx512F.TernaryLogic(mask.AsSingle(), trueVec, falseVec, 0xCA);
 
         resultVec.StoreUnsafe(ref Unsafe.Add(ref resultRef, offset));
     }
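
Since imm8 control bytes are easy to misread, here is a scalar brute-force check of the table above; reviewer-side illustration only, not part of the diff:

    // Verify: with the mask as operand A, control byte 0xCA implements A ? B : C,
    // i.e. (A & B) | (~A & C), for all eight input bit combinations.
    for (var bits = 0; bits < 8; bits++)
    {
        var m = (bits >> 2) & 1;           // operand A (mask bit)
        var t = (bits >> 1) & 1;           // operand B (trueVec bit)
        var f = bits & 1;                  // operand C (falseVec bit)
        var expected = m == 1 ? t : f;     // blend semantics
        var actual = (0xCA >> bits) & 1;   // the bit vpternlogd looks up
        System.Diagnostics.Debug.Assert(actual == expected);
    }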
@@ -536,15 +556,24 @@ private static void ConditionalSelectSse(
         // Create mask where condition > threshold
         var mask = Sse.CompareGreaterThan(cond, thresholdVec);
 
-        // Use blend to select based on mask
+        // Use blend to select based on mask.
+        // Adoption site #4 for the .NET 10 SIMD surface: when AVX-512VL is
+        // available we fuse the SSE-era `(trueVec & mask) | (~mask & falseVec)`
+        // three-op sequence into a single Vector128 vpternlogd with imm8 = 0xCA,
+        // the bitwise select with the mask as the first operand. Byte-identical
+        // result to the manual And/AndNot/Or sequence that preceded it.
         Vector128<float> resultVec;
-        if (Sse41.IsSupported)
+        if (Avx512F.VL.IsSupported)
         {
+            resultVec = Avx512F.VL.TernaryLogic(mask, trueVec, falseVec, 0xCA);
+        }
+        else if (Sse41.IsSupported)
+        {
             resultVec = Sse41.BlendVariable(falseVec, trueVec, mask);
         }
         else
         {
-            // Manual blend for older SSE
+            // Manual blend for older SSE (byte-for-byte identical fallback).
             var maskedTrue = Sse.And(trueVec, mask);
             var maskedFalse = Sse.AndNot(mask, falseVec);
             resultVec = Sse.Or(maskedTrue, maskedFalse);
@@ -582,7 +611,8 @@ private static void ConditionalSelectScalar(

 /// <summary>
 /// Advanced gather operation for sparse/indirect memory access patterns.
-/// Uses AVX2 gather when available, falls back to scalar.
+/// Uses AVX-512-wide gather (two stitched AVX2 gathers) when available, AVX2
+/// otherwise, and falls back to a bounds-checked scalar loop.
 /// </summary>
 [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
 public static unsafe void GatherFloat32(
@@ -595,7 +625,14 @@ public static unsafe void GatherFloat32(
         throw new ArgumentException("Indices and destination must have the same length");
     }
 
-    if (Avx2.IsSupported && indices.Length >= 8)
+    // Adoption site #5 for the .NET 10 SIMD surface: on AVX-512 hosts, process
+    // 16 elements per iteration via two back-to-back AVX2 gathers. .NET 10 SDK
+    // 10.0.106 does not expose Avx512F.GatherVector512, so stitching is the
+    // fastest available form.
+    if (Avx512F.IsSupported && Avx2.IsSupported && indices.Length >= 16)
+    {
+        GatherFloat32Avx512(source, indices, destination);
+    }
+    else if (Avx2.IsSupported && indices.Length >= 8)
     {
         GatherFloat32Avx2(source, indices, destination);
     }
@@ -605,6 +642,55 @@ public static unsafe void GatherFloat32(
     }
 }
 
+/// <summary>
+/// AVX-512-wide gather: processes 16 indices per iteration using two stitched
+/// AVX2 gather instructions. Behaviour is byte-for-byte identical to running
+/// <see cref="GatherFloat32Avx2"/> over the same data; this form simply halves
+/// the loop overhead and lets the JIT keep both gathers in flight.
+/// </summary>
+[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
+private static unsafe void GatherFloat32Avx512(
+    ReadOnlySpan<float> source,
+    ReadOnlySpan<int> indices,
+    Span<float> destination)
+{
+    const int VectorSize = 16;
+    var vectorCount = indices.Length / VectorSize;
+
+    fixed (float* sourcePtr = source)
+    {
+        ref var indicesRef = ref MemoryMarshal.GetReference(indices);
+        ref var destRef = ref MemoryMarshal.GetReference(destination);
+
+        for (var i = 0; i < vectorCount; i++)
+        {
+            var offset = i * VectorSize;
+
+            // Load two 8-wide index blocks.
+            var idxLo = Vector256.LoadUnsafe(ref Unsafe.Add(ref indicesRef, offset));
+            var idxHi = Vector256.LoadUnsafe(ref Unsafe.Add(ref indicesRef, offset + 8));
+
+            // Two hardware gathers (scale = 4 for sizeof(float)).
+            var gatheredLo = Avx2.GatherVector256(sourcePtr, idxLo, 4);
+            var gatheredHi = Avx2.GatherVector256(sourcePtr, idxHi, 4);
+
+            gatheredLo.StoreUnsafe(ref Unsafe.Add(ref destRef, offset));
+            gatheredHi.StoreUnsafe(ref Unsafe.Add(ref destRef, offset + 8));
+        }
+    }
+
+    // Tail: delegate to the AVX2 path plus its scalar remainder so any leftover
+    // 8-element block still uses a hardware gather.
+    var consumed = vectorCount * VectorSize;
+    if (consumed < indices.Length)
+    {
+        GatherFloat32Avx2(
+            source,
+            indices[consumed..],
+            destination[consumed..]);
+    }
+}
+
 [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
 private static unsafe void GatherFloat32Avx2(
     ReadOnlySpan<float> source,
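
For reviewers who want to poke at the dispatch logic locally, a small end-to-end sketch. The VectorOps type name is hypothetical (the diff never shows the enclosing class); 21 indices force one 16-wide stitched block followed by the AVX2/scalar tail:

    var source = new float[64];
    for (var i = 0; i < source.Length; i++) source[i] = i;

    var indices = new int[21];                      // one 16-wide block + 5-element tail
    for (var i = 0; i < indices.Length; i++) indices[i] = 63 - i;

    var destination = new float[indices.Length];
    VectorOps.GatherFloat32(source, indices, destination);

    // destination[i] == source[indices[i]] (63f, 62f, ..., 43f) no matter
    // which hardware path executed.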