diff --git a/modules/vector-sets/Makefile b/modules/vector-sets/Makefile
index f8c05c9bc87..9b44f3dc883 100644
--- a/modules/vector-sets/Makefile
+++ b/modules/vector-sets/Makefile
@@ -17,7 +17,23 @@ endif
 endif
 endif
 
-CFLAGS = -O2 -Wall -Wextra -g $(SAN) -std=c11
+# # CPU feature detection
+# CPU_FEATURES :=
+
+# # Check for AVX2 support
+# AVX2_SUPPORTED := $(shell echo | $(CC) -mavx2 -dM -E - 2>/dev/null | grep -q __AVX2__ && echo yes)
+# ifeq ($(AVX2_SUPPORTED),yes)
+# CPU_FEATURES += -mavx2
+# endif
+
+# # Check for both AVX512F and FMA together (AVX512 code needs both)
+# AVX512F_SUPPORTED := $(shell echo | $(CC) -mavx512f -mfma -dM -E - 2>/dev/null | grep -q __AVX512F__ && echo yes)
+# FMA_SUPPORTED := $(shell echo | $(CC) -mavx512f -mfma -dM -E - 2>/dev/null | grep -q __FMA__ && echo yes)
+# ifeq ($(AVX512F_SUPPORTED)$(FMA_SUPPORTED),yesyes)
+# CPU_FEATURES += -mavx512f -mfma
+# endif
+
+CFLAGS = -O2 -Wall -Wextra -g $(SAN) -std=c11 $(CPU_FEATURES)
 LDFLAGS = -lm $(SAN)
 
 # Detect OS
diff --git a/modules/vector-sets/hnsw.c b/modules/vector-sets/hnsw.c
index cae43e305b5..5d3a0c94fd2 100644
--- a/modules/vector-sets/hnsw.c
+++ b/modules/vector-sets/hnsw.c
@@ -47,6 +47,10 @@
 #include "hnsw.h"
 #include "mixer.h"
 
+#if defined(__AVX2__) || defined(__AVX512F__)
+    #include <immintrin.h>
+#endif
+
 #if 0
 #define debugmsg printf
 #else
@@ -192,16 +196,115 @@ float pq_max_distance(pqueue *pq) {
 
 /* ============================ HNSW algorithm ============================== */
 
-/* Dot product: our vectors are already normalized.
+// #ifdef __AVX512F__
+
+// /* AVX512 optimized dot product for float vectors */
+// float vectors_distance_float_avx512(const float *x, const float *y, uint32_t dim) {
+//     __m512 sum = _mm512_setzero_ps();
+//     uint32_t i;
+
+//     /* Process 16 floats at a time with AVX512 */
+//     for (i = 0; i + 15 < dim; i += 16) {
+//         __m512 vx = _mm512_loadu_ps(&x[i]);
+//         __m512 vy = _mm512_loadu_ps(&y[i]);
+//         sum = _mm512_fmadd_ps(vx, vy, sum);
+//     }
+
+//     /* Horizontal sum of the 16 elements in sum */
+//     float dot = _mm512_reduce_add_ps(sum);
+
+//     /* Handle remaining elements */
+//     for (; i < dim; i++) {
+//         dot += x[i] * y[i];
+//     }
+
+//     return 1.0f - dot;
+// }
+// #endif
+
+// #ifdef __AVX2__
+
+// /* AVX2 optimized dot product for float vectors */
+// float vectors_distance_float_avx2(const float *x, const float *y, uint32_t dim) {
+//     __m256 sum1 = _mm256_setzero_ps();
+//     __m256 sum2 = _mm256_setzero_ps();
+//     uint32_t i;
+
+//     /* Process 16 floats at a time with two AVX2 registers */
+//     for (i = 0; i + 15 < dim; i += 16) {
+//         __m256 vx1 = _mm256_loadu_ps(&x[i]);
+//         __m256 vy1 = _mm256_loadu_ps(&y[i]);
+//         __m256 vx2 = _mm256_loadu_ps(&x[i + 8]);
+//         __m256 vy2 = _mm256_loadu_ps(&y[i + 8]);
+
+//         sum1 = _mm256_fmadd_ps(vx1, vy1, sum1);
+//         sum2 = _mm256_fmadd_ps(vx2, vy2, sum2);
+//     }
+
+//     /* Combine the two sums */
+//     __m256 combined = _mm256_add_ps(sum1, sum2);
+
+//     /* Horizontal sum of the 8 elements */
+//     __m128 sum_high = _mm256_extractf128_ps(combined, 1);
+//     __m128 sum_low = _mm256_castps256_ps128(combined);
+//     __m128 sum_128 = _mm_add_ps(sum_high, sum_low);
+
+//     sum_128 = _mm_hadd_ps(sum_128, sum_128);
+//     sum_128 = _mm_hadd_ps(sum_128, sum_128);
+
+//     float dot = _mm_cvtss_f32(sum_128);
+
+//     /* Handle remaining elements */
+//     for (; i < dim; i++) {
+//         dot += x[i] * y[i];
+//     }
+
+//     return 1.0f - dot;
+// }
+// #endif
+
+/* Optimized dot product: automatically selects best available implementation
+ * Dot product: our vectors are already normalized.
  * Version for not quantized vectors of floats. */
 float vectors_distance_float(const float *x, const float *y, uint32_t dim) {
-    /* Use two accumulators to reduce dependencies among multiplications.
-     * This provides a clear speed boost in Apple silicon, but should be
-     * help in general. */
+// #ifdef __AVX512F__
+//     /* Check if runtime supports AVX512F */
+//     static int avx512_checked = 0;
+//     static int has_avx512 = 0;
+
+//     if (!avx512_checked) {
+//         /* Simple runtime check - in production you might want to use CPUID */
+//         has_avx512 = __builtin_cpu_supports("avx512f");
+//         avx512_checked = 1;
+//     }
+
+//     if (has_avx512 && dim >= 16) {
+//         return vectors_distance_float_avx512(x, y, dim);
+//     }
+// #endif
+
+// #ifdef __AVX2__
+//     /* Check if runtime supports AVX2 */
+//     static int avx2_checked = 0;
+//     static int has_avx2 = 0;
+
+//     if (!avx2_checked) {
+//         has_avx2 = __builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma");
+//         avx2_checked = 1;
+//     }
+
+//     if (has_avx2 && dim >= 16) {
+//         return vectors_distance_float_avx2(x, y, dim);
+//     }
+// #endif
+
+    /* Fallback to original scalar implementation */
     float dot0 = 0.0f, dot1 = 0.0f;
     uint32_t i;
 
-    // Process 8 elements per iteration, 50/50 with the two accumulators.
+    /* Use two accumulators to reduce dependencies among multiplications.
+     * This provides a clear speed boost in Apple silicon, but should
+     * help in general. */
     for (i = 0; i + 7 < dim; i += 8) {
         dot0 += x[i] * y[i] +
                 x[i+1] * y[i+1] +
@@ -214,8 +317,7 @@ float vectors_distance_float(const float *x, const float *y, uint32_t dim) {
                 x[i+7] * y[i+7];
     }
 
-    /* Handle the remaining elements. These are a minority in the case
-     * of a small vector, don't optimize this part. */
+    /* Handle the remaining elements */
     for (; i < dim; i++) dot0 += x[i] * y[i];
 
     /* The following line may be counter intuitive. The dot product of
diff --git a/src/Makefile b/src/Makefile
index 500f0c0173f..d1033b40957 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -334,6 +334,25 @@ ifneq ($(SKIP_VEC_SETS),yes)
     vpath %.c ../modules/vector-sets
     REDIS_VEC_SETS_OBJ=hnsw.o vset.o vset_config.o
     FINAL_CFLAGS+=-DINCLUDE_VEC_SETS=1
+
+    # CPU feature detection for vector-sets
+    VEC_SETS_CPU_FEATURES :=
+
+    # Check for AVX2 support
+    VEC_SETS_AVX2_SUPPORTED := $(shell echo | $(CC) -mavx2 -dM -E - 2>/dev/null | grep -q __AVX2__ && echo yes)
+    ifeq ($(VEC_SETS_AVX2_SUPPORTED),yes)
+    VEC_SETS_CPU_FEATURES += -mavx2
+    endif
+
+    # Check for both AVX512F and FMA together (AVX512 code needs both)
+    VEC_SETS_AVX512F_SUPPORTED := $(shell echo | $(CC) -mavx512f -mfma -dM -E - 2>/dev/null | grep -q __AVX512F__ && echo yes)
+    VEC_SETS_FMA_SUPPORTED := $(shell echo | $(CC) -mavx512f -mfma -dM -E - 2>/dev/null | grep -q __FMA__ && echo yes)
+    ifeq ($(VEC_SETS_AVX512F_SUPPORTED)$(VEC_SETS_FMA_SUPPORTED),yesyes)
+    VEC_SETS_CPU_FEATURES += -mavx512f -mfma
+    endif
+
+    # Special compilation flags for vector-sets with detected CPU features
+    VEC_SETS_CFLAGS=$(FINAL_CFLAGS) $(VEC_SETS_CPU_FEATURES)
 endif
 
 ifndef V
@@ -462,6 +481,19 @@ DEP = $(REDIS_SERVER_OBJ:%.o=%.d) $(REDIS_VEC_SETS_OBJ:%.o=%.d) $(REDIS_CLI_OBJ:
 # Because the jemalloc.h header is generated as a part of the jemalloc build,
 # building it should complete before building any other object. Instead of
 # depending on a single artifact, build all dependencies first.
+
+# Special compilation rules for vector-sets files with CPU feature detection
+# ifneq ($(SKIP_VEC_SETS),yes)
+# hnsw.o: ../modules/vector-sets/hnsw.c .make-prerequisites
+#	$(QUIET_CC)$(CC) $(VEC_SETS_CFLAGS) -MMD -o $@ -c $<
+
+# vset.o: ../modules/vector-sets/vset.c .make-prerequisites
+#	$(QUIET_CC)$(CC) $(VEC_SETS_CFLAGS) -MMD -o $@ -c $<
+
+# vset_config.o: ../modules/vector-sets/vset_config.c .make-prerequisites
+#	$(QUIET_CC)$(CC) $(VEC_SETS_CFLAGS) -MMD -o $@ -c $<
+# endif
+
 %.o: %.c .make-prerequisites
 	$(REDIS_CC) -MMD -o $@ -c $<
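
For context, the following is a standalone sketch rather than part of the patch: it takes the commented-out AVX2 dot product and the __builtin_cpu_supports() runtime checks from the hnsw.c hunks above and folds them into one compilable program, using the GCC/Clang target attribute so no Makefile-level flag probing is required to build it. The helper names dot_avx2, dot_scalar and distance_float are illustrative only, and the FMA check is there because the AVX2 path relies on _mm256_fmadd_ps, just as the commented-out runtime check in the patch does.

/* Standalone sketch (not part of the patch): the commented-out AVX2 path and
 * the __builtin_cpu_supports() dispatch from hnsw.c, folded into one small
 * program. The target attribute lets it build with a plain "cc -O2" on
 * GCC/Clang, without the Makefile-level -mavx2/-mfma probes. Helper names
 * (dot_avx2, dot_scalar, distance_float) are illustrative only. */
#include <stdint.h>
#include <stdio.h>

#if defined(__x86_64__) || defined(__i386__)
#include <immintrin.h>

/* AVX2+FMA dot product, mirroring vectors_distance_float_avx2() above. */
__attribute__((target("avx2,fma")))
static float dot_avx2(const float *x, const float *y, uint32_t dim) {
    __m256 sum1 = _mm256_setzero_ps();
    __m256 sum2 = _mm256_setzero_ps();
    uint32_t i;
    /* Two 8-wide accumulators, 16 floats per iteration. */
    for (i = 0; i + 15 < dim; i += 16) {
        sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(&x[i]),
                               _mm256_loadu_ps(&y[i]), sum1);
        sum2 = _mm256_fmadd_ps(_mm256_loadu_ps(&x[i+8]),
                               _mm256_loadu_ps(&y[i+8]), sum2);
    }
    /* Horizontal sum of the 8 lanes, then a scalar tail. */
    __m256 c = _mm256_add_ps(sum1, sum2);
    __m128 s = _mm_add_ps(_mm256_extractf128_ps(c, 1), _mm256_castps256_ps128(c));
    s = _mm_hadd_ps(s, s);
    s = _mm_hadd_ps(s, s);
    float dot = _mm_cvtss_f32(s);
    for (; i < dim; i++) dot += x[i] * y[i];
    return dot;
}
#endif

/* Portable fallback, equivalent to the scalar loop in vectors_distance_float(). */
static float dot_scalar(const float *x, const float *y, uint32_t dim) {
    float dot = 0.0f;
    for (uint32_t i = 0; i < dim; i++) dot += x[i] * y[i];
    return dot;
}

/* Distance for already-normalized vectors: 1 - dot product. */
float distance_float(const float *x, const float *y, uint32_t dim) {
#if defined(__x86_64__) || defined(__i386__)
    /* The AVX2 path uses _mm256_fmadd_ps, so FMA is checked as well. */
    if (dim >= 16 && __builtin_cpu_supports("avx2") &&
        __builtin_cpu_supports("fma"))
        return 1.0f - dot_avx2(x, y, dim);
#endif
    return 1.0f - dot_scalar(x, y, dim);
}

int main(void) {
    float a[32] = {0}, b[32] = {0};
    a[0] = 1.0f;    /* a and b are unit vectors along the same axis, */
    b[0] = 1.0f;    /* so the expected distance is 0.0.              */
    printf("distance = %f\n", distance_float(a, b, 32));
    return 0;
}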
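
A quick way to see what the compile-time probes in both Makefiles will decide on a given toolchain is to run one by hand: echo | cc -mavx2 -dM -E - | grep __AVX2__ prints a "#define __AVX2__ 1" line when the compiler accepts the flag, which is exactly the condition that AVX2_SUPPORTED and VEC_SETS_AVX2_SUPPORTED test (and likewise for the -mavx512f -mfma probe and the __AVX512F__/__FMA__ macros). Note that this only checks compiler support for the flag, not the build machine's CPU; actually executing AVX2/AVX-512 instructions is gated by the __builtin_cpu_supports() checks that the hnsw.c hunk currently keeps commented out.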