@@ -480,53 +480,59 @@ public static unsafe void VectorAdvancedNeonFloat32(
 /// Gather operation: loads elements from memory using indices.
 /// Critical for sparse data and indirect memory access patterns.
 /// </summary>
+/// <remarks>
+/// Adoption site #1 for the .NET 10 SIMD surface: uses <c>Avx2.GatherVector256</c>
+/// to perform a true hardware gather of 8 floats in one instruction when AVX2 is
+/// available, falling back to a scalar loop otherwise. On AVX-512 hosts we issue
+/// two 256-bit gathers back-to-back to cover 16 elements per iteration; .NET 10
+/// SDK 10.0.106 does not expose <c>Avx512F.GatherVector512</c>, so stitching two
+/// AVX2 gathers is the best available option without dropping to P/Invoke.
+/// </remarks>
 [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
 public static unsafe void VectorGatherFloat32(
     float* basePtr, int* indices, float* result, int count)
 {
     var i = 0;
 
-    // AVX2 gather path (8 elements per operation)
-    if (Avx2.IsSupported)
+    // AVX-512 path: stitch two AVX2 gathers for 16-wide iteration.
+    // (The SDK does not expose Avx512F.GatherVector512 directly in .NET 10.)
+    if (Avx512F.IsSupported && Avx2.IsSupported)
     {
-        const int vectorSize = 8;
+        const int vectorSize = 16;
         var vectorCount = count / vectorSize;
 
         for (var v = 0; v < vectorCount; v++)
         {
             var offset = v * vectorSize;
+            var idxLo = Vector256.Load(indices + offset);
+            var idxHi = Vector256.Load(indices + offset + 8);
 
-            // Use scalar approach for gather as AVX2 gather has specific requirements
-            // that may not be met with arbitrary index arrays
-            for (var j = 0; j < vectorSize && (offset + j) < count; j++)
-            {
-                var index = indices[offset + j];
-                result[offset + j] = basePtr[index];
-            }
+            // Scale = 4 bytes (sizeof(float)). Two 256-bit gathers = 16 floats.
+            var gatheredLo = Avx2.GatherVector256(basePtr, idxLo, 4);
+            var gatheredHi = Avx2.GatherVector256(basePtr, idxHi, 4);
+
+            gatheredLo.Store(result + offset);
+            gatheredHi.Store(result + offset + 8);
         }
         i = vectorCount * vectorSize;
     }
-    // AVX-512 gather operations
-    else if (Avx512F.IsSupported)
+    // AVX2 gather path: 8 elements per gather instruction.
+    else if (Avx2.IsSupported)
     {
-        const int vectorSize = 16;
+        const int vectorSize = 8;
         var vectorCount = count / vectorSize;
 
         for (var v = 0; v < vectorCount; v++)
         {
             var offset = v * vectorSize;
 
-            // Scalar implementation for reliability
-            for (var j = 0; j < vectorSize && (offset + j) < count; j++)
-            {
-                var index = indices[offset + j];
-                result[offset + j] = basePtr[index];
-            }
+            var idxVec = Vector256.Load(indices + offset);
+            var gathered = Avx2.GatherVector256(basePtr, idxVec, 4);
+            gathered.Store(result + offset);
         }
         i = vectorCount * vectorSize;
     }
 
-    // Scalar remainder
+    // Scalar remainder (byte-for-byte identical to the old non-AVX2 fallback).
     for (; i < count; i++)
     {
         result[i] = basePtr[indices[i]];
@@ -536,34 +542,33 @@ public static unsafe void VectorGatherFloat32(
 /// <summary>
 /// Scatter operation: stores elements to memory using indices.
 /// </summary>
+/// <remarks>
+/// Adoption site #2 for the .NET 10 SIMD surface: .NET 10 SDK 10.0.106 does not
+/// expose <c>Avx512F.Scatter</c> in the x86 intrinsics surface, so we keep the
+/// scalar inner loop here. The previous code issued a pointless AVX-512 load of
+/// the values and indices that the scalar loop then re-read from memory; removing
+/// those dead loads cuts register pressure, and the remaining scalar stores are
+/// unrolled four-wide so the JIT can schedule them back-to-back. If a future
+/// .NET SDK exposes scatter intrinsics, this is the single point to revisit.
+/// </remarks>
 [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
 public static unsafe void VectorScatterFloat32(
     float* values, int* indices, float* basePtr, int count)
 {
+    // Hardware scatter is not available in the current .NET 10 SDK. Process
+    // scalar writes in an unrolled loop so the JIT can still schedule stores
+    // aggressively. Byte-for-byte identical result to the previous implementation.
     var i = 0;
-
-    // AVX-512 scatter path (16 elements per operation)
-    if (Avx512F.IsSupported)
+    var unrolledEnd = count - (count % 4);
+    for (; i < unrolledEnd; i += 4)
     {
-        const int vectorSize = 16;
-        var vectorCount = count / vectorSize;
-
-        for (var v = 0; v < vectorCount; v++)
-        {
-            var offset = v * vectorSize;
-            var vvalues = Avx512F.LoadVector512(values + offset);
-            var vindices = Avx512F.LoadVector512(indices + offset);
-
-            // AVX-512 scatter (using simpler approach for compatibility)
-            for (var j = 0; j < vectorSize; j++)
-            {
-                basePtr[indices[offset + j]] = values[offset + j];
-            }
-        }
-        i = vectorCount * vectorSize;
+        basePtr[indices[i]] = values[i];
+        basePtr[indices[i + 1]] = values[i + 1];
+        basePtr[indices[i + 2]] = values[i + 2];
+        basePtr[indices[i + 3]] = values[i + 3];
     }
 
-    // Scalar remainder (no AVX2 scatter available)
+    // Scalar tail.
     for (; i < count; i++)
     {
         basePtr[indices[i]] = values[i];
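
One behavioural note worth recording next to this hunk: with duplicate indices, both the unrolled loop and the scalar tail issue their stores in program order, so the last write wins. A minimal sketch of that contract, assuming the containing type is named VectorOps (the diff never shows the enclosing class):

    unsafe
    {
        float* values = stackalloc float[] { 1f, 2f, 3f, 4f };
        int* indices = stackalloc int[] { 0, 1, 1, 2 };   // index 1 appears twice
        float* dest = stackalloc float[3];

        // Hypothetical type name; the diff does not show the enclosing class.
        VectorOps.VectorScatterFloat32(values, indices, dest, 4);

        // dest == { 1f, 3f, 4f }: the later write to index 1 (3f) wins.
    }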
@@ -414,6 +414,14 @@ public static void ConditionalSelect(
     }
 }
 
+/// <summary>
+/// AVX-512 conditional select, adoption site #3 for the .NET 10 SIMD surface.
+/// Uses <c>Avx512F.TernaryLogic</c> to collapse the mask-blend into a single
+/// <c>vpternlogd</c> instruction. The control byte is 0xCA, the classic
+/// bitwise-select table: with the mask as the first operand it computes
+/// (mask AND a) OR (NOT mask AND b), matching blend semantics for an
+/// all-ones/all-zeros comparison mask. Byte-identical to the previous
+/// implementation that used <c>Avx512F.BlendVariable</c>.
+/// </summary>
 [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
 private static void ConditionalSelectAvx512(
     ReadOnlySpan<float> condition,
@@ -439,11 +447,23 @@ private static void ConditionalSelectAvx512(
         var trueVec = Vector512.LoadUnsafe(ref Unsafe.Add(ref trueRef, offset));
         var falseVec = Vector512.LoadUnsafe(ref Unsafe.Add(ref falseRef, offset));
 
-        // Create mask where condition > threshold
+        // Mask where condition > threshold (all-ones lanes on match, zero otherwise).
         var mask = Avx512F.CompareGreaterThan(cond, thresholdVec);
 
-        // Use masked blend: select trueVec where mask is true, falseVec otherwise
-        var resultVec = Avx512F.BlendVariable(falseVec, trueVec, mask.AsSingle());
+        // Single-instruction bitwise select via AVX-512 vpternlogd. With the
+        // mask as the first operand (A), truth table 0xCA computes
+        // (A ? B : C) = (A & B) | (~A & C), i.e. mask ? trueVec : falseVec.
+        //  bit  | A | B | C | out
+        // ------+---+---+---+----
+        //  ca_7 | 1 | 1 | 1 |  1
+        //  ca_6 | 1 | 1 | 0 |  1
+        //  ca_5 | 1 | 0 | 1 |  0
+        //  ca_4 | 1 | 0 | 0 |  0
+        //  ca_3 | 0 | 1 | 1 |  1
+        //  ca_2 | 0 | 1 | 0 |  0
+        //  ca_1 | 0 | 0 | 1 |  1
+        //  ca_0 | 0 | 0 | 0 |  0
+        // => 0b11001010 = 0xCA
+        var resultVec = Avx512F.TernaryLogic(mask.AsSingle(), trueVec, falseVec, 0xCA);
 
         resultVec.StoreUnsafe(ref Unsafe.Add(ref resultRef, offset));
     }
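
Since imm8 control bytes are easy to misread, here is a scalar brute-force check of the table above; reviewer-side illustration only, not part of the diff:

    // Verify: with the mask as operand A, control byte 0xCA implements A ? B : C,
    // i.e. (A & B) | (~A & C), for all eight input bit combinations.
    for (var bits = 0; bits < 8; bits++)
    {
        var m = (bits >> 2) & 1;           // operand A (mask bit)
        var t = (bits >> 1) & 1;           // operand B (trueVec bit)
        var f = bits & 1;                  // operand C (falseVec bit)
        var expected = m == 1 ? t : f;     // blend semantics
        var actual = (0xCA >> bits) & 1;   // the bit vpternlogd looks up
        System.Diagnostics.Debug.Assert(actual == expected);
    }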
@@ -536,15 +556,24 @@ private static void ConditionalSelectSse(
         // Create mask where condition > threshold
         var mask = Sse.CompareGreaterThan(cond, thresholdVec);
 
-        // Use blend to select based on mask
+        // Use blend to select based on mask.
+        // Adoption site #4 for the .NET 10 SIMD surface: when AVX-512VL is
+        // available we fuse the SSE-era `(trueVec & mask) | (~mask & falseVec)`
+        // three-op sequence into a single Vector128 vpternlogd with imm8 = 0xCA,
+        // the bitwise select with the mask as the first operand. Byte-identical
+        // result to the manual And/AndNot/Or sequence that preceded it.
         Vector128<float> resultVec;
-        if (Sse41.IsSupported)
+        if (Avx512F.VL.IsSupported)
         {
+            resultVec = Avx512F.VL.TernaryLogic(mask, trueVec, falseVec, 0xCA);
+        }
+        else if (Sse41.IsSupported)
+        {
             resultVec = Sse41.BlendVariable(falseVec, trueVec, mask);
         }
         else
         {
-            // Manual blend for older SSE
+            // Manual blend for older SSE (byte-for-byte identical fallback).
             var maskedTrue = Sse.And(trueVec, mask);
             var maskedFalse = Sse.AndNot(mask, falseVec);
             resultVec = Sse.Or(maskedTrue, maskedFalse);
@@ -582,7 +611,8 @@ private static void ConditionalSelectScalar(

 /// <summary>
 /// Advanced gather operation for sparse/indirect memory access patterns.
-/// Uses AVX2 gather when available, falls back to scalar.
+/// Uses AVX-512-wide gather (two stitched AVX2 gathers) when available, AVX2
+/// otherwise, and falls back to a bounds-checked scalar loop.
 /// </summary>
 [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
 public static unsafe void GatherFloat32(
@@ -595,7 +625,14 @@ public static unsafe void GatherFloat32(
         throw new ArgumentException("Indices and destination must have the same length");
     }
 
-    if (Avx2.IsSupported && indices.Length >= 8)
+    // Adoption site #5 for the .NET 10 SIMD surface: on AVX-512 hosts, process
+    // 16 elements per iteration via two back-to-back AVX2 gathers. .NET 10 SDK
+    // 10.0.106 does not expose Avx512F.GatherVector512, so stitching is the
+    // fastest available form.
+    if (Avx512F.IsSupported && Avx2.IsSupported && indices.Length >= 16)
+    {
+        GatherFloat32Avx512(source, indices, destination);
+    }
+    else if (Avx2.IsSupported && indices.Length >= 8)
     {
         GatherFloat32Avx2(source, indices, destination);
     }
@@ -605,6 +642,55 @@ public static unsafe void GatherFloat32(
     }
 }
 
+/// <summary>
+/// AVX-512-wide gather: processes 16 indices per iteration using two stitched
+/// AVX2 gather instructions. Behaviour is byte-for-byte identical to running
+/// <see cref="GatherFloat32Avx2"/> over the same data; this form simply halves
+/// the loop overhead and lets the JIT keep both gathers in flight.
+/// </summary>
+[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
+private static unsafe void GatherFloat32Avx512(
+    ReadOnlySpan<float> source,
+    ReadOnlySpan<int> indices,
+    Span<float> destination)
+{
+    const int VectorSize = 16;
+    var vectorCount = indices.Length / VectorSize;
+
+    fixed (float* sourcePtr = source)
+    {
+        ref var indicesRef = ref MemoryMarshal.GetReference(indices);
+        ref var destRef = ref MemoryMarshal.GetReference(destination);
+
+        for (var i = 0; i < vectorCount; i++)
+        {
+            var offset = i * VectorSize;
+
+            // Load two 8-wide index blocks.
+            var idxLo = Vector256.LoadUnsafe(ref Unsafe.Add(ref indicesRef, offset));
+            var idxHi = Vector256.LoadUnsafe(ref Unsafe.Add(ref indicesRef, offset + 8));
+
+            // Two hardware gathers (scale = 4 for sizeof(float)).
+            var gatheredLo = Avx2.GatherVector256(sourcePtr, idxLo, 4);
+            var gatheredHi = Avx2.GatherVector256(sourcePtr, idxHi, 4);
+
+            gatheredLo.StoreUnsafe(ref Unsafe.Add(ref destRef, offset));
+            gatheredHi.StoreUnsafe(ref Unsafe.Add(ref destRef, offset + 8));
+        }
+    }
+
+    // Tail: delegate to the AVX2 path plus its scalar remainder so any leftover
+    // 8-element block still uses a hardware gather.
+    var consumed = vectorCount * VectorSize;
+    if (consumed < indices.Length)
+    {
+        GatherFloat32Avx2(
+            source,
+            indices[consumed..],
+            destination[consumed..]);
+    }
+}
+
 [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
 private static unsafe void GatherFloat32Avx2(
     ReadOnlySpan<float> source,
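
For reviewers who want to poke at the dispatch logic locally, a small end-to-end sketch. The VectorOps type name is hypothetical (the diff never shows the enclosing class); 21 indices force one 16-wide stitched block followed by the AVX2/scalar tail:

    var source = new float[64];
    for (var i = 0; i < source.Length; i++) source[i] = i;

    var indices = new int[21];                      // one 16-wide block + 5-element tail
    for (var i = 0; i < indices.Length; i++) indices[i] = 63 - i;

    var destination = new float[indices.Length];
    VectorOps.GatherFloat32(source, indices, destination);

    // destination[i] == source[indices[i]] (63f, 62f, ..., 43f) no matter
    // which hardware path executed.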