sundb · sundb · Sep 20, 2025 · Sep 20, 2025
diff --git a/modules/vector-sets/Makefile b/modules/vector-sets/Makefile
@@ -17,7 +17,23 @@ endif
 endif
 endif
 
-CFLAGS = -O2 -Wall -Wextra -g $(SAN) -std=c11
+# # CPU feature detection
+# CPU_FEATURES :=
+
+# # Check for AVX2 support
+# AVX2_SUPPORTED := $(shell echo | $(CC) -mavx2 -dM -E - 2>/dev/null | grep -q __AVX2__ && echo yes)
+# ifeq ($(AVX2_SUPPORTED),yes)
+#     CPU_FEATURES += -mavx2
+# endif
+
+# # Check for both AVX512F and FMA together (AVX512 code needs both)
+# AVX512F_SUPPORTED := $(shell echo | $(CC) -mavx512f -mfma -dM -E - 2>/dev/null | grep -q __AVX512F__ && echo yes)
+# FMA_SUPPORTED := $(shell echo | $(CC) -mavx512f -mfma -dM -E - 2>/dev/null | grep -q __FMA__ && echo yes)
+# ifeq ($(AVX512F_SUPPORTED)$(FMA_SUPPORTED),yesyes)
+#     CPU_FEATURES += -mavx512f -mfma
+# endif
+
+CFLAGS = -O2 -Wall -Wextra -g $(SAN) -std=c11 $(CPU_FEATURES)
 LDFLAGS = -lm $(SAN)
 
 # Detect OS

diff --git a/modules/vector-sets/hnsw.c b/modules/vector-sets/hnsw.c
@@ -47,6 +47,10 @@
 #include "hnsw.h"
 #include "mixer.h"
 
+#if defined(__AVX2__) || defined(__AVX512F__)
+    #include <immintrin.h>
+#endif
+
 #if 0
 #define debugmsg printf
 #else
@@ -192,16 +196,115 @@ float pq_max_distance(pqueue *pq) {
 
 /* ============================ HNSW algorithm ============================== */
 
-/* Dot product: our vectors are already normalized.
+// #ifdef __AVX512F__
+
+// /* AVX512 optimized dot product for float vectors */
+// float vectors_distance_float_avx512(const float *x, const float *y, uint32_t dim) {
+//     __m512 sum = _mm512_setzero_ps();
+//     uint32_t i;
+
+//     /* Process 16 floats at a time with AVX512 */
+//     for (i = 0; i + 15 < dim; i += 16) {
+//         __m512 vx = _mm512_loadu_ps(&x[i]);
+//         __m512 vy = _mm512_loadu_ps(&y[i]);
+//         sum = _mm512_fmadd_ps(vx, vy, sum);
+//     }
+
+//     /* Horizontal sum of the 16 elements in sum */
+//     float dot = _mm512_reduce_add_ps(sum);
+
+//     /* Handle remaining elements */
+//     for (; i < dim; i++) {
+//         dot += x[i] * y[i];
+//     }
+
+//     return 1.0f - dot;
+// }
+// #endif
+
+// #ifdef __AVX2__
+
+// /* AVX2 optimized dot product for float vectors */
+// float vectors_distance_float_avx2(const float *x, const float *y, uint32_t dim) {
+//     __m256 sum1 = _mm256_setzero_ps();
+//     __m256 sum2 = _mm256_setzero_ps();
+//     uint32_t i;
+
+//     /* Process 16 floats at a time with two AVX2 registers */
+//     for (i = 0; i + 15 < dim; i += 16) {
+//         __m256 vx1 = _mm256_loadu_ps(&x[i]);
+//         __m256 vy1 = _mm256_loadu_ps(&y[i]);
+//         __m256 vx2 = _mm256_loadu_ps(&x[i + 8]);
+//         __m256 vy2 = _mm256_loadu_ps(&y[i + 8]);
+
+//         sum1 = _mm256_fmadd_ps(vx1, vy1, sum1);
+//         sum2 = _mm256_fmadd_ps(vx2, vy2, sum2);
+//     }
+
+//     /* Combine the two sums */
+//     __m256 combined = _mm256_add_ps(sum1, sum2);
+
+//     /* Horizontal sum of the 8 elements */
+//     __m128 sum_high = _mm256_extractf128_ps(combined, 1);
+//     __m128 sum_low = _mm256_castps256_ps128(combined);
+//     __m128 sum_128 = _mm_add_ps(sum_high, sum_low);
+
+//     sum_128 = _mm_hadd_ps(sum_128, sum_128);
+//     sum_128 = _mm_hadd_ps(sum_128, sum_128);
+
+//     float dot = _mm_cvtss_f32(sum_128);
+
+//     /* Handle remaining elements */
+//     for (; i < dim; i++) {
+//         dot += x[i] * y[i];
+//     }
+
+//     return 1.0f - dot;
+// }
+// #endif
+
+/* Optimized dot product: automatically selects best available implementation 
+ * Dot product: our vectors are already normalized.
  * Version for not quantized vectors of floats. */
 float vectors_distance_float(const float *x, const float *y, uint32_t dim) {
-    /* Use two accumulators to reduce dependencies among multiplications.
-     * This provides a clear speed boost in Apple silicon, but should be
-     * help in general. */
+// #ifdef __AVX512F__
+//     /* Check if runtime supports AVX512F */
+//     static int avx512_checked = 0;
+//     static int has_avx512 = 0;
+
+//     if (!avx512_checked) {
+//         /* Simple runtime check - in production you might want to use CPUID */
+//         has_avx512 = __builtin_cpu_supports("avx512f");
+//         avx512_checked = 1;
+//     }
+
+//     if (has_avx512 && dim >= 16) {
+//         return vectors_distance_float_avx512(x, y, dim);
+//     }
+// #endif
+
+// #ifdef __AVX2__
+//     /* Check if runtime supports AVX2 */
+//     static int avx2_checked = 0;
+//     static int has_avx2 = 0;
+
+//     if (!avx2_checked) {
+//         has_avx2 = __builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma");
+//         avx2_checked = 1;
+//     }
+
+//     if (has_avx2 && dim >= 16) {
+//         return vectors_distance_float_avx2(x, y, dim);
+//     }
+// #endif
+
+    /* Fallback to original scalar implementation */
     float dot0 = 0.0f, dot1 = 0.0f;
     uint32_t i;
 
-    // Process 8 elements per iteration, 50/50 with the two accumulators.
+    /* Use two accumulators to reduce dependencies among multiplications.
+     * This provides a clear speed boost in Apple silicon, but should be
+     * help in general. */
     for (i = 0; i + 7 < dim; i += 8) {
         dot0 += x[i] * y[i] +
                 x[i+1] * y[i+1] +
@@ -214,8 +317,7 @@ float vectors_distance_float(const float *x, const float *y, uint32_t dim) {
                 x[i+7] * y[i+7];
     }
 
-    /* Handle the remaining elements. These are a minority in the case
-     * of a small vector, don't optimize this part. */
+    /* Handle the remaining elements */
     for (; i < dim; i++) dot0 += x[i] * y[i];
 
     /* The following line may be counter intuitive. The dot product of

diff --git a/src/Makefile b/src/Makefile
@@ -334,6 +334,25 @@ ifneq ($(SKIP_VEC_SETS),yes)
 	vpath %.c ../modules/vector-sets
 	REDIS_VEC_SETS_OBJ=hnsw.o vset.o vset_config.o
 	FINAL_CFLAGS+=-DINCLUDE_VEC_SETS=1
+
+	# CPU feature detection for vector-sets
+	VEC_SETS_CPU_FEATURES :=
+
+	# Check for AVX2 support
+	VEC_SETS_AVX2_SUPPORTED := $(shell echo | $(CC) -mavx2 -dM -E - 2>/dev/null | grep -q __AVX2__ && echo yes)
+	ifeq ($(VEC_SETS_AVX2_SUPPORTED),yes)
+		VEC_SETS_CPU_FEATURES += -mavx2
+	endif
+
+	# Check for both AVX512F and FMA together (AVX512 code needs both)
+	VEC_SETS_AVX512F_SUPPORTED := $(shell echo | $(CC) -mavx512f -mfma -dM -E - 2>/dev/null | grep -q __AVX512F__ && echo yes)
+	VEC_SETS_FMA_SUPPORTED := $(shell echo | $(CC) -mavx512f -mfma -dM -E - 2>/dev/null | grep -q __FMA__ && echo yes)
+	ifeq ($(VEC_SETS_AVX512F_SUPPORTED)$(VEC_SETS_FMA_SUPPORTED),yesyes)
+		VEC_SETS_CPU_FEATURES += -mavx512f -mfma
+	endif
+
+	# Special compilation flags for vector-sets with detected CPU features
+	VEC_SETS_CFLAGS=$(FINAL_CFLAGS) $(VEC_SETS_CPU_FEATURES)
 endif
 
 ifndef V
@@ -462,6 +481,19 @@ DEP = $(REDIS_SERVER_OBJ:%.o=%.d) $(REDIS_VEC_SETS_OBJ:%.o=%.d) $(REDIS_CLI_OBJ:
 # Because the jemalloc.h header is generated as a part of the jemalloc build,
 # building it should complete before building any other object. Instead of
 # depending on a single artifact, build all dependencies first.
+
+# Special compilation rules for vector-sets files with CPU feature detection
+# ifneq ($(SKIP_VEC_SETS),yes)
+# hnsw.o: ../modules/vector-sets/hnsw.c .make-prerequisites
+# 	$(QUIET_CC)$(CC) $(VEC_SETS_CFLAGS) -MMD -o $@ -c $<
+
+# vset.o: ../modules/vector-sets/vset.c .make-prerequisites
+# 	$(QUIET_CC)$(CC) $(VEC_SETS_CFLAGS) -MMD -o $@ -c $<
+
+# vset_config.o: ../modules/vector-sets/vset_config.c .make-prerequisites
+# 	$(QUIET_CC)$(CC) $(VEC_SETS_CFLAGS) -MMD -o $@ -c $<
+# endif
+
 %.o: %.c .make-prerequisites
 	$(REDIS_CC) -MMD -o $@ -c $<