Skip to content

Commit 9212165

Browse files
authored
Fix hardware optimization compilation - [MOD-6567] (#438)
* move different optimizations into different files * use new choosers * fixing names and includes * fixing functions to not use SSE4.1 only intrinsics * fix cmake files * add optimization flags to L2_space.cpp and IP_space.cpp so we can choose optimized funcs * fix benchmarks cmake file * format * review fixes + more tidy up
1 parent 0fca479 commit 9212165

27 files changed

+281
-201
lines changed

src/VecSim/spaces/AVX_utils.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#pragma once
88
#include "space_includes.h"
99

10-
template <__mmask8 mask> // (2^n)-1, where n is in 1..7 (1, 4, ..., 127)
10+
template <__mmask8 mask> // (2^n)-1, where n is in 1..7 (1, 3, ..., 127)
1111
static inline __m256 my_mm256_maskz_loadu_ps(const float *p) {
1212
// Load 8 floats (assuming this is safe to do)
1313
__m256 data = _mm256_loadu_ps(p);
@@ -17,7 +17,7 @@ static inline __m256 my_mm256_maskz_loadu_ps(const float *p) {
1717
return masked_data;
1818
}
1919

20-
template <__mmask8 mask> // (2^n)-1, where n is in 1..3 (1, 4, 7)
20+
template <__mmask8 mask> // (2^n)-1, where n is in 1..3 (1, 3, 7)
2121
static inline __m256d my_mm256_maskz_loadu_pd(const double *p) {
2222
// Load 4 doubles (assuming this is safe to do)
2323
__m256d data = _mm256_loadu_pd(p);

src/VecSim/spaces/CMakeLists.txt

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,52 +10,57 @@ include(CheckCXXCompilerFlag)
1010
project(VectorSimilarity_Spaces)
1111

1212
# TODO: Remove this once cpu_features get support for M1
13-
if(NOT APPLE)
14-
include(${root}/cmake/cpu_features.cmake)
15-
elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64")
13+
if((NOT APPLE) OR (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64"))
1614
include(${root}/cmake/cpu_features.cmake)
1715
else()
1816
add_definitions(-DM1)
1917
endif()
2018

19+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall")
20+
21+
set(OPTIMIZATIONS "")
22+
2123
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
22-
# build SSE/AVX* code only on x64 processors.
2324
# Check that the compiler supports instructions flag.
24-
# This will add the relevant flag both the the space selector and the optimization.
2525
CHECK_CXX_COMPILER_FLAG(-mavx512f CXX_AVX512F)
2626
CHECK_CXX_COMPILER_FLAG(-mavx CXX_AVX)
2727
CHECK_CXX_COMPILER_FLAG(-msse CXX_SSE)
2828

29+
# build SSE/AVX* code only on x64 processors.
30+
# This will add the relevant flag both to the space selector and the optimization.
2931
if(CXX_AVX512F)
30-
add_compile_options(-mavx512f)
3132
message("Building with AVX512")
33+
set_source_files_properties(functions/AVX512.cpp PROPERTIES COMPILE_FLAGS -mavx512f)
34+
list(APPEND OPTIMIZATIONS functions/AVX512.cpp)
35+
add_compile_definitions(OPT_AVX512F)
3236
endif()
3337

3438
if(CXX_AVX)
35-
add_compile_options(-mavx)
3639
message("Building with AVX")
40+
set_source_files_properties(functions/AVX.cpp PROPERTIES COMPILE_FLAGS -mavx)
41+
list(APPEND OPTIMIZATIONS functions/AVX.cpp)
42+
add_compile_definitions(OPT_AVX)
3743
endif()
3844

3945
if(CXX_SSE)
40-
add_compile_options(-msse)
4146
message("Building with SSE")
47+
set_source_files_properties(functions/SSE.cpp PROPERTIES COMPILE_FLAGS -msse)
48+
list(APPEND OPTIMIZATIONS functions/SSE.cpp)
49+
add_compile_definitions(OPT_SSE)
4250
endif()
4351
endif()
4452

45-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall")
46-
4753
# Here we are compiling the space selectors with the relevant optimization flag.
4854
add_library(VectorSimilaritySpaces
4955
space_aux.cpp
5056
L2_space.cpp
5157
IP_space.cpp
5258
spaces.cpp
59+
${OPTIMIZATIONS}
5360
)
5461

5562
target_link_libraries(VectorSimilaritySpaces VectorSimilaritySpaces_no_optimization)
5663

57-
if(NOT APPLE)
58-
target_link_libraries(VectorSimilaritySpaces cpu_features)
59-
elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64")
64+
if((NOT APPLE) OR (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64"))
6065
target_link_libraries(VectorSimilaritySpaces cpu_features)
6166
endif()

src/VecSim/spaces/IP/IP_AVX.h

Lines changed: 0 additions & 10 deletions
This file was deleted.

src/VecSim/spaces/IP/IP_AVX512.h

Lines changed: 0 additions & 10 deletions
This file was deleted.

src/VecSim/spaces/IP/IP_SSE.h

Lines changed: 0 additions & 10 deletions
This file was deleted.

src/VecSim/spaces/IP/IP_SSE_FP32.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@ float FP32_InnerProductSIMD16_SSE(const void *pVect1v, const void *pVect2v, size
2929
__m128 v1, v2;
3030
if (residual % 4 == 3) {
3131
// Load 3 floats and set the last one to 0
32-
v1 = _mm_load_ps(pVect1); // load 4 floats
33-
v2 = _mm_load_ps(pVect2);
34-
v1 = _mm_blend_ps(_mm_setzero_ps(), v1, 7); // set the last one to 0
35-
v2 = _mm_blend_ps(_mm_setzero_ps(), v2, 7);
32+
v1 = _mm_load_ss(pVect1); // load 1 float, set the rest to 0
33+
v2 = _mm_load_ss(pVect2);
34+
v1 = _mm_loadh_pi(v1, (__m64 *)(pVect1 + 1));
35+
v2 = _mm_loadh_pi(v2, (__m64 *)(pVect2 + 1));
3636
} else if (residual % 4 == 2) {
3737
// Load 2 floats and set the last two to 0
3838
v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1);

src/VecSim/spaces/IP_space.cpp

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,11 @@
77
#include "VecSim/spaces/IP_space.h"
88
#include "VecSim/spaces/IP/IP.h"
99
#if defined(__x86_64__)
10-
#include "VecSim/spaces/IP/IP_AVX512.h"
11-
#include "VecSim/spaces/IP/IP_AVX.h"
12-
#include "VecSim/spaces/IP/IP_SSE.h"
10+
#include "VecSim/spaces/functions/AVX512.h"
11+
#include "VecSim/spaces/functions/AVX.h"
12+
#include "VecSim/spaces/functions/SSE.h"
1313
#endif
1414

15-
#include "VecSim/spaces/implementation_chooser.h"
16-
1715
namespace spaces {
1816
dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, const Arch_Optimization arch_opt,
1917
unsigned char *alignment) {
@@ -32,22 +30,22 @@ dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, const Arch_Optimization arch_
3230

3331
switch (arch_opt) {
3432
case ARCH_OPT_AVX512_F:
35-
#ifdef __AVX512F__
36-
CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_AVX512);
33+
#ifdef OPT_AVX512F
34+
ret_dist_func = Choose_FP32_IP_implementation_AVX512(dim);
3735
if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
3836
*alignment = 16 * sizeof(float); // handles 16 floats
3937
break;
4038
#endif
4139
case ARCH_OPT_AVX:
42-
#ifdef __AVX__
43-
CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_AVX);
40+
#ifdef OPT_AVX
41+
ret_dist_func = Choose_FP32_IP_implementation_AVX(dim);
4442
if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
4543
*alignment = 8 * sizeof(float); // handles 8 floats
4644
break;
4745
#endif
4846
case ARCH_OPT_SSE:
49-
#ifdef __SSE__
50-
CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_SSE);
47+
#ifdef OPT_SSE
48+
ret_dist_func = Choose_FP32_IP_implementation_SSE(dim);
5149
if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
5250
*alignment = 4 * sizeof(float); // handles 4 floats
5351
break;
@@ -77,22 +75,22 @@ dist_func_t<double> IP_FP64_GetDistFunc(size_t dim, const Arch_Optimization arch
7775

7876
switch (arch_opt) {
7977
case ARCH_OPT_AVX512_F:
80-
#ifdef __AVX512F__
81-
CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 8, FP64_InnerProductSIMD8_AVX512);
78+
#ifdef OPT_AVX512F
79+
ret_dist_func = Choose_FP64_IP_implementation_AVX512(dim);
8280
if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
8381
*alignment = 8 * sizeof(double); // handles 8 doubles
8482
break;
8583
#endif
8684
case ARCH_OPT_AVX:
87-
#ifdef __AVX__
88-
CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 8, FP64_InnerProductSIMD8_AVX);
85+
#ifdef OPT_AVX
86+
ret_dist_func = Choose_FP64_IP_implementation_AVX(dim);
8987
if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
9088
*alignment = 4 * sizeof(double); // handles 4 doubles
9189
break;
9290
#endif
9391
case ARCH_OPT_SSE:
94-
#ifdef __SSE__
95-
CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 8, FP64_InnerProductSIMD8_SSE);
92+
#ifdef OPT_SSE
93+
ret_dist_func = Choose_FP64_IP_implementation_SSE(dim);
9694
if (dim % 2 == 0) // no point in aligning if we have an offsetting residual
9795
*alignment = 2 * sizeof(double); // handles 2 doubles
9896
break;
@@ -106,5 +104,3 @@ dist_func_t<double> IP_FP64_GetDistFunc(size_t dim, const Arch_Optimization arch
106104
}
107105

108106
} // namespace spaces
109-
110-
#include "VecSim/spaces/implementation_chooser_cleanup.h"

src/VecSim/spaces/L2/L2_AVX.h

Lines changed: 0 additions & 10 deletions
This file was deleted.

src/VecSim/spaces/L2/L2_AVX512.h

Lines changed: 0 additions & 10 deletions
This file was deleted.

src/VecSim/spaces/L2/L2_SSE.h

Lines changed: 0 additions & 10 deletions
This file was deleted.

0 commit comments

Comments
 (0)