RedisAI
diff --git a/src/VecSim/spaces/AVX_utils.h b/src/VecSim/spaces/AVX_utils.h
Lines changed: 2 additions & 2 deletions
diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt
Lines changed: 18 additions & 13 deletions
diff --git a/src/VecSim/spaces/IP/IP_AVX.h b/src/VecSim/spaces/IP/IP_AVX.h
Lines changed: 0 additions & 10 deletions
diff --git a/src/VecSim/spaces/IP/IP_AVX512.h b/src/VecSim/spaces/IP/IP_AVX512.h
Lines changed: 0 additions & 10 deletions
diff --git a/src/VecSim/spaces/IP/IP_SSE.h b/src/VecSim/spaces/IP/IP_SSE.h
Lines changed: 0 additions & 10 deletions
diff --git a/src/VecSim/spaces/IP/IP_SSE_FP32.h b/src/VecSim/spaces/IP/IP_SSE_FP32.h
Lines changed: 4 additions & 4 deletions
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
Lines changed: 15 additions & 19 deletions
diff --git a/src/VecSim/spaces/L2/L2_AVX.h b/src/VecSim/spaces/L2/L2_AVX.h
Lines changed: 0 additions & 10 deletions
diff --git a/src/VecSim/spaces/L2/L2_AVX512.h b/src/VecSim/spaces/L2/L2_AVX512.h
Lines changed: 0 additions & 10 deletions
diff --git a/src/VecSim/spaces/L2/L2_SSE.h b/src/VecSim/spaces/L2/L2_SSE.h
Lines changed: 0 additions & 10 deletions
@@ -7,7 +7,7 @@
 #pragma once
 #include "space_includes.h"
 
-template <__mmask8 mask> // (2^n)-1, where n is in 1..7 (1, 4, ..., 127)
+template <__mmask8 mask> // (2^n)-1, where n is in 1..7 (1, 3, ..., 127)
 static inline __m256 my_mm256_maskz_loadu_ps(const float *p) {
     // Load 8 floats (assuming this is safe to do)
     __m256 data = _mm256_loadu_ps(p);
@@ -17,7 +17,7 @@ static inline __m256 my_mm256_maskz_loadu_ps(const float *p) {
     return masked_data;
 }
 
-template <__mmask8 mask> // (2^n)-1, where n is in 1..3 (1, 4, 7)
+template <__mmask8 mask> // (2^n)-1, where n is in 1..3 (1, 3, 7)
 static inline __m256d my_mm256_maskz_loadu_pd(const double *p) {
     // Load 4 doubles (assuming this is safe to do)
     __m256d data = _mm256_loadu_pd(p);
 
@@ -10,52 +10,57 @@ include(CheckCXXCompilerFlag)
 project(VectorSimilarity_Spaces)
 
 # TODO: Remove this once cpu_features get support for M1
-if(NOT APPLE)
-	include(${root}/cmake/cpu_features.cmake)
-elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64")
+if((NOT APPLE) OR (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64"))
 	include(${root}/cmake/cpu_features.cmake)
 else()
 	add_definitions(-DM1)
 endif()
 
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall")
+
+set(OPTIMIZATIONS "")
+
 if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
-	# build SSE/AVX* code only on x64 processors.
 	# Check that the compiler supports instructions flag.
-	# This will add the relevant flag both the the space selector and the optimization.
 	CHECK_CXX_COMPILER_FLAG(-mavx512f CXX_AVX512F)
 	CHECK_CXX_COMPILER_FLAG(-mavx CXX_AVX)
 	CHECK_CXX_COMPILER_FLAG(-msse CXX_SSE)
 
+	# Build the SSE/AVX* sources only on x86/x64 processors.
+	# Each supported flag is applied only to its own source file; an OPT_* definition tells the space selectors it is available.
 	if(CXX_AVX512F)
-		add_compile_options(-mavx512f)
 		message("Building with AVX512")
+		set_source_files_properties(functions/AVX512.cpp PROPERTIES COMPILE_FLAGS -mavx512f)
+		list(APPEND OPTIMIZATIONS functions/AVX512.cpp)
+		add_compile_definitions(OPT_AVX512F)
 	endif()
 
 	if(CXX_AVX)
-		add_compile_options(-mavx)
 		message("Building with AVX")
+		set_source_files_properties(functions/AVX.cpp PROPERTIES COMPILE_FLAGS -mavx)
+		list(APPEND OPTIMIZATIONS functions/AVX.cpp)
+		add_compile_definitions(OPT_AVX)
 	endif()
 
 	if(CXX_SSE)
-		add_compile_options(-msse)
 		message("Building with SSE")
+		set_source_files_properties(functions/SSE.cpp PROPERTIES COMPILE_FLAGS -msse)
+		list(APPEND OPTIMIZATIONS functions/SSE.cpp)
+		add_compile_definitions(OPT_SSE)
 	endif()
 endif()
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall")
-
 # Here we are compiling the space selectors with the relevant optimization flag.
 add_library(VectorSimilaritySpaces
 	space_aux.cpp
 	L2_space.cpp
 	IP_space.cpp
 	spaces.cpp
+	${OPTIMIZATIONS}
 )
 
 target_link_libraries(VectorSimilaritySpaces VectorSimilaritySpaces_no_optimization)
 
-if(NOT APPLE)
-	target_link_libraries(VectorSimilaritySpaces cpu_features)
-elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64")
+if((NOT APPLE) OR (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64"))
 	target_link_libraries(VectorSimilaritySpaces cpu_features)
 endif()
@@ -29,10 +29,10 @@ float FP32_InnerProductSIMD16_SSE(const void *pVect1v, const void *pVect2v, size
         __m128 v1, v2;
         if (residual % 4 == 3) {
             // Load 3 floats and set the last one to 0
-            v1 = _mm_load_ps(pVect1); // load 4 floats
-            v2 = _mm_load_ps(pVect2);
-            v1 = _mm_blend_ps(_mm_setzero_ps(), v1, 7); // set the last one to 0
-            v2 = _mm_blend_ps(_mm_setzero_ps(), v2, 7);
+            v1 = _mm_load_ss(pVect1); // load 1 float, set the rest to 0
+            v2 = _mm_load_ss(pVect2);
+            v1 = _mm_loadh_pi(v1, (__m64 *)(pVect1 + 1));
+            v2 = _mm_loadh_pi(v2, (__m64 *)(pVect2 + 1));
         } else if (residual % 4 == 2) {
             // Load 2 floats and set the last two to 0
             v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1);
 
@@ -7,13 +7,11 @@
 #include "VecSim/spaces/IP_space.h"
 #include "VecSim/spaces/IP/IP.h"
 #if defined(__x86_64__)
-#include "VecSim/spaces/IP/IP_AVX512.h"
-#include "VecSim/spaces/IP/IP_AVX.h"
-#include "VecSim/spaces/IP/IP_SSE.h"
+#include "VecSim/spaces/functions/AVX512.h"
+#include "VecSim/spaces/functions/AVX.h"
+#include "VecSim/spaces/functions/SSE.h"
 #endif
 
-#include "VecSim/spaces/implementation_chooser.h"
-
 namespace spaces {
 dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, const Arch_Optimization arch_opt,
                                        unsigned char *alignment) {
@@ -32,22 +30,22 @@ dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, const Arch_Optimization arch_
 
     switch (arch_opt) {
     case ARCH_OPT_AVX512_F:
-#ifdef __AVX512F__
-        CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_AVX512);
+#ifdef OPT_AVX512F
+        ret_dist_func = Choose_FP32_IP_implementation_AVX512(dim);
         if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
             *alignment = 16 * sizeof(float); // handles 16 floats
         break;
 #endif
     case ARCH_OPT_AVX:
-#ifdef __AVX__
-        CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_AVX);
+#ifdef OPT_AVX
+        ret_dist_func = Choose_FP32_IP_implementation_AVX(dim);
         if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
             *alignment = 8 * sizeof(float); // handles 8 floats
         break;
 #endif
     case ARCH_OPT_SSE:
-#ifdef __SSE__
-        CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_SSE);
+#ifdef OPT_SSE
+        ret_dist_func = Choose_FP32_IP_implementation_SSE(dim);
         if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
             *alignment = 4 * sizeof(float); // handles 4 floats
         break;
@@ -77,22 +75,22 @@ dist_func_t<double> IP_FP64_GetDistFunc(size_t dim, const Arch_Optimization arch
 
     switch (arch_opt) {
     case ARCH_OPT_AVX512_F:
-#ifdef __AVX512F__
-        CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 8, FP64_InnerProductSIMD8_AVX512);
+#ifdef OPT_AVX512F
+        ret_dist_func = Choose_FP64_IP_implementation_AVX512(dim);
         if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
             *alignment = 8 * sizeof(double); // handles 8 doubles
         break;
 #endif
     case ARCH_OPT_AVX:
-#ifdef __AVX__
-        CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 8, FP64_InnerProductSIMD8_AVX);
+#ifdef OPT_AVX
+        ret_dist_func = Choose_FP64_IP_implementation_AVX(dim);
         if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
             *alignment = 4 * sizeof(double); // handles 4 doubles
         break;
 #endif
     case ARCH_OPT_SSE:
-#ifdef __SSE__
-        CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 8, FP64_InnerProductSIMD8_SSE);
+#ifdef OPT_SSE
+        ret_dist_func = Choose_FP64_IP_implementation_SSE(dim);
         if (dim % 2 == 0) // no point in aligning if we have an offsetting residual
             *alignment = 2 * sizeof(double); // handles 2 doubles
         break;
@@ -106,5 +104,3 @@ dist_func_t<double> IP_FP64_GetDistFunc(size_t dim, const Arch_Optimization arch
 }
 
 } // namespace spaces
-
-#include "VecSim/spaces/implementation_chooser_cleanup.h"