From a470c018daf921a15a02628c86b26ab385a8b5a6 Mon Sep 17 00:00:00 2001 From: cyy Date: Wed, 13 Aug 2025 09:59:10 +0800 Subject: [PATCH 1/2] Enable fp32 tests for Windows Signed-off-by: cyy --- CMakeLists.txt | 3 --- bench/CMakeLists.txt | 5 ----- cmake/modules/CxxCompilerSetup.cmake | 2 +- defs.bzl | 13 ++----------- include/fbgemm/FbgemmFP16.h | 12 +----------- include/fbgemm/FbgemmFP32.h | 11 ----------- src/fp32/FbgemmFP32.cc | 6 +++--- test/CMakeLists.txt | 5 ----- 8 files changed, 7 insertions(+), 50 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9c48da02a6..b7af93cb28 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -175,9 +175,6 @@ endif() ################################################################################ get_filelist("get_fbgemm_generic_srcs(with_base=True)" FBGEMM_GENERIC_SRCS) -if(MSVC) - list(FILTER FBGEMM_GENERIC_SRCS EXCLUDE REGEX "src/fp32/.*\\.cc$") -endif() set(fbgemm_generic_defs "${fbgemm_arm_defs}") if(FBGEMM_LIBRARY_TYPE STREQUAL STATIC) diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt index b1f2295b5e..325970cc98 100644 --- a/bench/CMakeLists.txt +++ b/bench/CMakeLists.txt @@ -103,11 +103,6 @@ if(FBGEMM_BUILD_BENCHMARKS) list(FILTER BENCH_LIST EXCLUDE REGEX "quantize") endif() - if(MSVC) - # NOTE: Skip FP32 benchmark for MSVC until intrinsic kernels are implemented - list(FILTER BENCH_LIST EXCLUDE REGEX "FP32Benchmark\\.cc$") - endif() - foreach(BENCH_FILE ${BENCH_LIST}) get_filename_component(BENCH_NAME ${BENCH_FILE} NAME_WE) get_filename_component(BENCH_FILENAME ${BENCH_FILE} NAME) diff --git a/cmake/modules/CxxCompilerSetup.cmake b/cmake/modules/CxxCompilerSetup.cmake index 4411ef3a38..bf73e4d758 100644 --- a/cmake/modules/CxxCompilerSetup.cmake +++ b/cmake/modules/CxxCompilerSetup.cmake @@ -40,7 +40,7 @@ BLOCK_PRINT( ) # Strip all symbols from the .SO file after building -if(NOT MSVC AND NOT APPLE) +if(NOT WIN32 AND NOT APPLE) add_link_options($<$:-s>) endif() diff --git a/defs.bzl b/defs.bzl index b8b59e6301..b6959ad90c 100644 --- a/defs.bzl +++ b/defs.bzl @@ -47,6 +47,7 @@ def get_fbgemm_generic_srcs(with_base = False, msvc = False, buck = False): "src/FbgemmSparseDense.cc", "src/FbgemmI8Spmdm.cc", "src/FbgemmPackMatrixB.cc", + "src/fp32/FbgemmFP32.cc", "src/GenerateKernelDirectConvU8S8S32ACC32.cc", "src/GenerateKernel.cc", "src/GenerateKernelU8S8S32ACC16.cc", @@ -73,17 +74,7 @@ def get_fbgemm_generic_srcs(with_base = False, msvc = False, buck = False): "src/TransposeUtils.cc", ] + (get_fbgemm_base_srcs() if with_base else []) - fp32sources = [ - "src/fp32/FbgemmFP32.cc", - ] - - if buck: - return select({ - "DEFAULT": sources + fp32sources, - "ovr_config//compiler:cl": sources, - }) - - return sources + fp32sources if not msvc else sources + return sources def get_fbgemm_public_headers(): return [ diff --git a/include/fbgemm/FbgemmFP16.h b/include/fbgemm/FbgemmFP16.h index c8a439d883..0a41a17e2b 100644 --- a/include/fbgemm/FbgemmFP16.h +++ b/include/fbgemm/FbgemmFP16.h @@ -13,6 +13,7 @@ #include +#include "fbgemm/FbgemmFPCommon.h" #include "./FbgemmPackMatrixB.h" // @manual #include "./FloatConversion.h" // @manual #include "./Types.h" // @manual @@ -31,17 +32,6 @@ struct TypeConverter { using PackedGemmMatrixFP16 = PackedGemmMatrixB; -template -FBGEMM_API void cblas_gemm_compute( - const matrix_op_t transa, - const int m, - const float* A, - const PackedGemmMatrixB& Bp, - const float beta, - float* C, - int thread_id = 0, - int num_threads = 1); - extern template void cblas_gemm_compute( const matrix_op_t transa, const int m, diff --git a/include/fbgemm/FbgemmFP32.h b/include/fbgemm/FbgemmFP32.h index 0aafcc53c1..d86c506979 100644 --- a/include/fbgemm/FbgemmFP32.h +++ b/include/fbgemm/FbgemmFP32.h @@ -22,17 +22,6 @@ struct TypeConverter { using GemmParamsFP32 = GemmParams; using PackedGemmMatrixFP32 = PackedGemmMatrixB; -template -void cblas_gemm_compute( - const matrix_op_t transa, - const int m, - const float* A, - const PackedGemmMatrixB& Bp, - const float beta, - float* C, - int thread_id = 0, - int num_threads = 1); - extern template void cblas_gemm_compute( const matrix_op_t transa, const int m, diff --git a/src/fp32/FbgemmFP32.cc b/src/fp32/FbgemmFP32.cc index 4e0f5ab33b..1a92e9b824 100644 --- a/src/fp32/FbgemmFP32.cc +++ b/src/fp32/FbgemmFP32.cc @@ -32,7 +32,7 @@ namespace { // Here with kernel_ncol_blocks = 2, we can provide up to 6x2 kernels, due to // the restrictions of ymm register numbers (16). constexpr kernel_array_t kernel_f32_avx2 = { -#ifndef __aarch64__ +#if !defined(__aarch64__) && !defined(_MSC_VER) nullptr, gemmkernel_1x2_Avx2_fp32_fA0fB0fC0, gemmkernel_2x2_Avx2_fp32_fA0fB0fC0, @@ -45,7 +45,7 @@ constexpr kernel_array_t kernel_f32_avx2 = { #endif constexpr kernel_array_t kernel_f32_avx512 = { -#ifndef __aarch64__ +#if !defined(__aarch64__) && !defined(_MSC_VER) nullptr, gemmkernel_1x2_Avx512_fp32_fA0fB0fC0, gemmkernel_2x2_Avx512_fp32_fA0fB0fC0, @@ -67,7 +67,7 @@ constexpr kernel_array_t kernel_f32_avx512 = { // clang-format on constexpr kernel_array_t kernel_f32_avx512_256 = { -#ifndef __aarch64__ +#if !defined(__aarch64__) && !defined(_MSC_VER) nullptr, gemmkernel_1x2_Avx2_fp32_fA0fB0fC0, gemmkernel_2x2_Avx2_fp32_fA0fB0fC0, diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d687e8dc5c..477fef975b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -111,11 +111,6 @@ foreach(TEST_FILE ${TEST_LIST}) endif() endif() - if(MSVC AND TEST_FILE MATCHES "FP32Test.cc$") - # NOTE: Skip FP32 test for MSVC until intrinsic kernels are implemented - continue() - endif() - message(STATUS "Processing: ${TEST_FILE}") get_filename_component(TEST_NAME "${TEST_FILE}" NAME_WE) From 51c87718d914d2f0952c51b9a4b5526faa7dbc28 Mon Sep 17 00:00:00 2001 From: cyy Date: Mon, 18 Aug 2025 06:27:44 +0800 Subject: [PATCH 2/2] Export symbols Signed-off-by: cyy --- src/fp32/FbgemmFP32.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/fp32/FbgemmFP32.cc b/src/fp32/FbgemmFP32.cc index 1a92e9b824..41b0621c7f 100644 --- a/src/fp32/FbgemmFP32.cc +++ b/src/fp32/FbgemmFP32.cc @@ -32,7 +32,7 @@ namespace { // Here with kernel_ncol_blocks = 2, we can provide up to 6x2 kernels, due to // the restrictions of ymm register numbers (16). constexpr kernel_array_t kernel_f32_avx2 = { -#if !defined(__aarch64__) && !defined(_MSC_VER) +#ifndef __aarch64__ nullptr, gemmkernel_1x2_Avx2_fp32_fA0fB0fC0, gemmkernel_2x2_Avx2_fp32_fA0fB0fC0, @@ -45,7 +45,7 @@ constexpr kernel_array_t kernel_f32_avx2 = { #endif constexpr kernel_array_t kernel_f32_avx512 = { -#if !defined(__aarch64__) && !defined(_MSC_VER) +#ifndef __aarch64__ nullptr, gemmkernel_1x2_Avx512_fp32_fA0fB0fC0, gemmkernel_2x2_Avx512_fp32_fA0fB0fC0, @@ -67,7 +67,7 @@ constexpr kernel_array_t kernel_f32_avx512 = { // clang-format on constexpr kernel_array_t kernel_f32_avx512_256 = { -#if !defined(__aarch64__) && !defined(_MSC_VER) +#ifndef __aarch64__ nullptr, gemmkernel_1x2_Avx2_fp32_fA0fB0fC0, gemmkernel_2x2_Avx2_fp32_fA0fB0fC0, @@ -180,7 +180,7 @@ FBGEMM_API void ref_kernel( } #endif // FBGEMM_FP32_FALLBACK_TO_REF_KERNEL -template void cblas_gemm_compute( +template FBGEMM_API void cblas_gemm_compute( const matrix_op_t transa, const int m, const float* A,