Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion modules/vector-sets/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,23 @@ endif
endif
endif

CFLAGS = -O2 -Wall -Wextra -g $(SAN) -std=c11
# # CPU feature detection
# CPU_FEATURES :=

# # Check for AVX2 support
# AVX2_SUPPORTED := $(shell echo | $(CC) -mavx2 -dM -E - 2>/dev/null | grep -q __AVX2__ && echo yes)
# ifeq ($(AVX2_SUPPORTED),yes)
# CPU_FEATURES += -mavx2
# endif

# # Check for both AVX512F and FMA together (AVX512 code needs both)
# AVX512F_SUPPORTED := $(shell echo | $(CC) -mavx512f -mfma -dM -E - 2>/dev/null | grep -q __AVX512F__ && echo yes)
# FMA_SUPPORTED := $(shell echo | $(CC) -mavx512f -mfma -dM -E - 2>/dev/null | grep -q __FMA__ && echo yes)
# ifeq ($(AVX512F_SUPPORTED)$(FMA_SUPPORTED),yesyes)
# CPU_FEATURES += -mavx512f -mfma
# endif

CFLAGS = -O2 -Wall -Wextra -g $(SAN) -std=c11 $(CPU_FEATURES)
LDFLAGS = -lm $(SAN)

# Detect OS
Expand Down
116 changes: 109 additions & 7 deletions modules/vector-sets/hnsw.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@
#include "hnsw.h"
#include "mixer.h"

#if defined(__AVX2__) || defined(__AVX512F__)
#include <immintrin.h>
#endif

#if 0
#define debugmsg printf
#else
Expand Down Expand Up @@ -192,16 +196,115 @@ float pq_max_distance(pqueue *pq) {

/* ============================ HNSW algorithm ============================== */

/* Dot product: our vectors are already normalized.
// #ifdef __AVX512F__

// /* AVX512 optimized dot product for float vectors */
// float vectors_distance_float_avx512(const float *x, const float *y, uint32_t dim) {
// __m512 sum = _mm512_setzero_ps();
// uint32_t i;

// /* Process 16 floats at a time with AVX512 */
// for (i = 0; i + 15 < dim; i += 16) {
// __m512 vx = _mm512_loadu_ps(&x[i]);
// __m512 vy = _mm512_loadu_ps(&y[i]);
// sum = _mm512_fmadd_ps(vx, vy, sum);
// }

// /* Horizontal sum of the 16 elements in sum */
// float dot = _mm512_reduce_add_ps(sum);

// /* Handle remaining elements */
// for (; i < dim; i++) {
// dot += x[i] * y[i];
// }

// return 1.0f - dot;
// }
// #endif

// #ifdef __AVX2__

// /* AVX2 optimized dot product for float vectors */
// float vectors_distance_float_avx2(const float *x, const float *y, uint32_t dim) {
// __m256 sum1 = _mm256_setzero_ps();
// __m256 sum2 = _mm256_setzero_ps();
// uint32_t i;

// /* Process 16 floats at a time with two AVX2 registers */
// for (i = 0; i + 15 < dim; i += 16) {
// __m256 vx1 = _mm256_loadu_ps(&x[i]);
// __m256 vy1 = _mm256_loadu_ps(&y[i]);
// __m256 vx2 = _mm256_loadu_ps(&x[i + 8]);
// __m256 vy2 = _mm256_loadu_ps(&y[i + 8]);

// sum1 = _mm256_fmadd_ps(vx1, vy1, sum1);
// sum2 = _mm256_fmadd_ps(vx2, vy2, sum2);
// }

// /* Combine the two sums */
// __m256 combined = _mm256_add_ps(sum1, sum2);

// /* Horizontal sum of the 8 elements */
// __m128 sum_high = _mm256_extractf128_ps(combined, 1);
// __m128 sum_low = _mm256_castps256_ps128(combined);
// __m128 sum_128 = _mm_add_ps(sum_high, sum_low);

// sum_128 = _mm_hadd_ps(sum_128, sum_128);
// sum_128 = _mm_hadd_ps(sum_128, sum_128);

// float dot = _mm_cvtss_f32(sum_128);

// /* Handle remaining elements */
// for (; i < dim; i++) {
// dot += x[i] * y[i];
// }

// return 1.0f - dot;
// }
// #endif

/* Optimized dot product: automatically selects best available implementation
* Dot product: our vectors are already normalized.
* Version for not quantized vectors of floats. */
float vectors_distance_float(const float *x, const float *y, uint32_t dim) {
/* Use two accumulators to reduce dependencies among multiplications.
* This provides a clear speed boost in Apple silicon, but should be
* help in general. */
// #ifdef __AVX512F__
// /* Check if runtime supports AVX512F */
// static int avx512_checked = 0;
// static int has_avx512 = 0;

// if (!avx512_checked) {
// /* Simple runtime check - in production you might want to use CPUID */
// has_avx512 = __builtin_cpu_supports("avx512f");
// avx512_checked = 1;
// }

// if (has_avx512 && dim >= 16) {
// return vectors_distance_float_avx512(x, y, dim);
// }
// #endif

// #ifdef __AVX2__
// /* Check if runtime supports AVX2 */
// static int avx2_checked = 0;
// static int has_avx2 = 0;

// if (!avx2_checked) {
// has_avx2 = __builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma");
// avx2_checked = 1;
// }

// if (has_avx2 && dim >= 16) {
// return vectors_distance_float_avx2(x, y, dim);
// }
// #endif

/* Fallback to original scalar implementation */
float dot0 = 0.0f, dot1 = 0.0f;
uint32_t i;

// Process 8 elements per iteration, 50/50 with the two accumulators.
/* Use two accumulators to reduce dependencies among multiplications.
* This provides a clear speed boost in Apple silicon, but should be
* help in general. */
for (i = 0; i + 7 < dim; i += 8) {
dot0 += x[i] * y[i] +
x[i+1] * y[i+1] +
Expand All @@ -214,8 +317,7 @@ float vectors_distance_float(const float *x, const float *y, uint32_t dim) {
x[i+7] * y[i+7];
}

/* Handle the remaining elements. These are a minority in the case
* of a small vector, don't optimize this part. */
/* Handle the remaining elements */
for (; i < dim; i++) dot0 += x[i] * y[i];

/* The following line may be counter intuitive. The dot product of
Expand Down
32 changes: 32 additions & 0 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,25 @@ ifneq ($(SKIP_VEC_SETS),yes)
vpath %.c ../modules/vector-sets
REDIS_VEC_SETS_OBJ=hnsw.o vset.o vset_config.o
FINAL_CFLAGS+=-DINCLUDE_VEC_SETS=1

# CPU feature detection for vector-sets
VEC_SETS_CPU_FEATURES :=

# Check for AVX2 support
VEC_SETS_AVX2_SUPPORTED := $(shell echo | $(CC) -mavx2 -dM -E - 2>/dev/null | grep -q __AVX2__ && echo yes)
ifeq ($(VEC_SETS_AVX2_SUPPORTED),yes)
VEC_SETS_CPU_FEATURES += -mavx2
endif

# Check for both AVX512F and FMA together (AVX512 code needs both)
VEC_SETS_AVX512F_SUPPORTED := $(shell echo | $(CC) -mavx512f -mfma -dM -E - 2>/dev/null | grep -q __AVX512F__ && echo yes)
VEC_SETS_FMA_SUPPORTED := $(shell echo | $(CC) -mavx512f -mfma -dM -E - 2>/dev/null | grep -q __FMA__ && echo yes)
ifeq ($(VEC_SETS_AVX512F_SUPPORTED)$(VEC_SETS_FMA_SUPPORTED),yesyes)
VEC_SETS_CPU_FEATURES += -mavx512f -mfma
endif

# Special compilation flags for vector-sets with detected CPU features
VEC_SETS_CFLAGS=$(FINAL_CFLAGS) $(VEC_SETS_CPU_FEATURES)
endif

ifndef V
Expand Down Expand Up @@ -462,6 +481,19 @@ DEP = $(REDIS_SERVER_OBJ:%.o=%.d) $(REDIS_VEC_SETS_OBJ:%.o=%.d) $(REDIS_CLI_OBJ:
# Because the jemalloc.h header is generated as a part of the jemalloc build,
# building it should complete before building any other object. Instead of
# depending on a single artifact, build all dependencies first.

# Special compilation rules for vector-sets files with CPU feature detection
# ifneq ($(SKIP_VEC_SETS),yes)
# hnsw.o: ../modules/vector-sets/hnsw.c .make-prerequisites
# $(QUIET_CC)$(CC) $(VEC_SETS_CFLAGS) -MMD -o $@ -c $<

# vset.o: ../modules/vector-sets/vset.c .make-prerequisites
# $(QUIET_CC)$(CC) $(VEC_SETS_CFLAGS) -MMD -o $@ -c $<

# vset_config.o: ../modules/vector-sets/vset_config.c .make-prerequisites
# $(QUIET_CC)$(CC) $(VEC_SETS_CFLAGS) -MMD -o $@ -c $<
# endif

%.o: %.c .make-prerequisites
$(REDIS_CC) -MMD -o $@ -c $<

Expand Down
Loading