From 6d66322a8a14f1bf1c9357077952a7556c0928d9 Mon Sep 17 00:00:00 2001 From: Zeyu Song Date: Mon, 11 Aug 2025 09:59:20 -0700 Subject: [PATCH] Add CPP version of bitpacking. (#2725) Summary: Add cpp implementation of bitpacking functions (not rely on specific hardwares). Differential Revision: D79456037 --- .../kernels/cpu/aarch64/tests/CMakeLists.txt | 9 + .../cpu/aarch64/tests/build_and_run_tests.sh | 1 + .../test_bitpack_fallback_compatibility.cpp | 686 ++++++++++++++++++ .../kernels/cpu/fallback/CMakeLists.txt | 5 + .../kernels/cpu/fallback/bitpacking/bitpack.h | 179 +++++ .../kernels/cpu/fallback/bitpacking/uint1.h | 154 ++++ .../kernels/cpu/fallback/bitpacking/uint2.h | 119 +++ .../kernels/cpu/fallback/bitpacking/uint3.h | 195 +++++ .../kernels/cpu/fallback/bitpacking/uint4.h | 109 +++ .../kernels/cpu/fallback/bitpacking/uint5.h | 175 +++++ .../kernels/cpu/fallback/bitpacking/uint6.h | 142 ++++ .../kernels/cpu/fallback/bitpacking/uint7.h | 140 ++++ .../kernels/cpu/fallback/tests/CMakeLists.txt | 49 ++ .../cpu/fallback/tests/build_and_run_tests.sh | 35 + .../cpu/fallback/tests/test_bitpacking.cpp | 157 ++++ 15 files changed, 2155 insertions(+) create mode 100644 torchao/experimental/kernels/cpu/aarch64/tests/test_bitpack_fallback_compatibility.cpp create mode 100644 torchao/experimental/kernels/cpu/fallback/CMakeLists.txt create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/bitpack.h create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/uint1.h create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/uint2.h create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/uint3.h create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/uint4.h create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/uint5.h create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/uint6.h create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/uint7.h create mode 100644 torchao/experimental/kernels/cpu/fallback/tests/CMakeLists.txt create mode 100644 torchao/experimental/kernels/cpu/fallback/tests/build_and_run_tests.sh create mode 100644 torchao/experimental/kernels/cpu/fallback/tests/test_bitpacking.cpp diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt b/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt index 5f4bca286b..2b38856b9f 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt +++ b/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt @@ -128,6 +128,14 @@ target_link_libraries( dep ) +add_executable(test_bitpack_fallback_compatibility test_bitpack_fallback_compatibility.cpp) +target_link_libraries( + test_bitpack_fallback_compatibility + PRIVATE + GTest::gtest_main + dep +) + include(GoogleTest) gtest_discover_tests(test_quantization) gtest_discover_tests(test_reduction) @@ -137,3 +145,4 @@ gtest_discover_tests(test_embedding) gtest_discover_tests(test_weight_packing) gtest_discover_tests(test_qmatmul) gtest_discover_tests(test_lut) +gtest_discover_tests(test_bitpack_fallback_compatibility) diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh b/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh index 474a77eb8c..c4d807c702 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh +++ b/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh @@ -62,3 +62,4 @@ ${CMAKE_OUT}/test_embedding ${CMAKE_OUT}/test_weight_packing ${CMAKE_OUT}/test_qmatmul ${CMAKE_OUT}/test_lut +${CMAKE_OUT}/test_bitpack_fallback_compatibility diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpack_fallback_compatibility.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpack_fallback_compatibility.cpp new file mode 100644 index 0000000000..d0a8622b36 --- /dev/null +++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpack_fallback_compatibility.cpp @@ -0,0 +1,686 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. +#if defined(__aarch64__) || defined(__ARM_NEON) + +#include +#include + +#include +#include +#include + +// --- Compatibility Tests for uint1 --- + +TEST(test_bitpacking_64_uint1_values, CppToNeon) { + int unpacked_bytes = 64; + int nbit = 1; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint1_values( + packed.data(), input.data()); + + uint8x16_t u0, u1, u2, u3; + torchao::bitpacking::internal::vec_unpack_64_uint1_values( + u0, u1, u2, u3, packed.data()); + vst1q_u8(unpacked.data(), u0); + vst1q_u8(unpacked.data() + 16, u1); + vst1q_u8(unpacked.data() + 32, u2); + vst1q_u8(unpacked.data() + 48, u3); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_64_uint1_values, NeonToCpp) { + int unpacked_bytes = 64; + int nbit = 1; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + uint8x16_t i0, i1, i2, i3; + torchao::bitpacking::internal::vec_load_64_uint8_values( + i0, i1, i2, i3, input.data()); + torchao::bitpacking::internal::vec_pack_64_uint1_values( + packed.data(), i0, i1, i2, i3); + + torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint1_values( + unpacked.data(), packed.data()); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_128_uint1_values, CppToNeon) { + int unpacked_bytes = 128; + int nbit = 1; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint1_values( + packed.data(), input.data()); + + uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7; + torchao::bitpacking::internal::vec_unpack_128_uint1_values( + u0, u1, u2, u3, u4, u5, u6, u7, packed.data()); + vst1q_u8(unpacked.data(), u0); + vst1q_u8(unpacked.data() + 16, u1); + vst1q_u8(unpacked.data() + 32, u2); + vst1q_u8(unpacked.data() + 48, u3); + vst1q_u8(unpacked.data() + 64, u4); + vst1q_u8(unpacked.data() + 80, u5); + vst1q_u8(unpacked.data() + 96, u6); + vst1q_u8(unpacked.data() + 112, u7); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_128_uint1_values, NeonToCpp) { + int unpacked_bytes = 128; + int nbit = 1; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + uint8x16_t i0, i1, i2, i3, i4, i5, i6, i7; + torchao::bitpacking::internal::vec_load_64_uint8_values( + i0, i1, i2, i3, input.data()); + torchao::bitpacking::internal::vec_load_64_uint8_values( + i4, i5, i6, i7, input.data() + 64); + torchao::bitpacking::internal::vec_pack_128_uint1_values( + packed.data(), i0, i1, i2, i3, i4, i5, i6, i7); + + torchao::kernels::cpu::fallback::bitpacking::internal:: + unpack_128_uint1_values(unpacked.data(), packed.data()); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +// --- Compatibility Tests for uint2 --- + +TEST(test_bitpacking_32_uint2_values, CppToNeon) { + int unpacked_bytes = 32; + int nbit = 2; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint2_values( + packed.data(), input.data()); + + uint8x8_t u0, u1, u2, u3; + torchao::bitpacking::internal::vec_unpack_32_uint2_values( + u0, u1, u2, u3, packed.data()); + vst1_u8(unpacked.data(), u0); + vst1_u8(unpacked.data() + 8, u1); + vst1_u8(unpacked.data() + 16, u2); + vst1_u8(unpacked.data() + 24, u3); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_32_uint2_values, NeonToCpp) { + int unpacked_bytes = 32; + int nbit = 2; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + uint8x8_t i0, i1, i2, i3; + torchao::bitpacking::internal::vec_load_32_uint8_values( + i0, i1, i2, i3, input.data()); + torchao::bitpacking::internal::vec_pack_32_uint2_values( + packed.data(), i0, i1, i2, i3); + + torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint2_values( + unpacked.data(), packed.data()); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_64_uint2_values, CppToNeon) { + int unpacked_bytes = 64; + int nbit = 2; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint2_values( + packed.data(), input.data()); + + uint8x16_t u0, u1, u2, u3; + torchao::bitpacking::internal::vec_unpack_64_uint2_values( + u0, u1, u2, u3, packed.data()); + vst1q_u8(unpacked.data(), u0); + vst1q_u8(unpacked.data() + 16, u1); + vst1q_u8(unpacked.data() + 32, u2); + vst1q_u8(unpacked.data() + 48, u3); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_64_uint2_values, NeonToCpp) { + int unpacked_bytes = 64; + int nbit = 2; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + uint8x16_t i0, i1, i2, i3; + torchao::bitpacking::internal::vec_load_64_uint8_values( + i0, i1, i2, i3, input.data()); + torchao::bitpacking::internal::vec_pack_64_uint2_values( + packed.data(), i0, i1, i2, i3); + + torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint2_values( + unpacked.data(), packed.data()); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +// --- Compatibility Tests for uint3 --- + +TEST(test_bitpacking_64_uint3_values, CppToNeon) { + int unpacked_bytes = 64; + int nbit = 3; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint3_values( + packed.data(), input.data()); + + uint8x16_t u0, u1, u2, u3; + torchao::bitpacking::internal::vec_unpack_64_uint3_values( + u0, u1, u2, u3, packed.data()); + vst1q_u8(unpacked.data(), u0); + vst1q_u8(unpacked.data() + 16, u1); + vst1q_u8(unpacked.data() + 32, u2); + vst1q_u8(unpacked.data() + 48, u3); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_64_uint3_values, NeonToCpp) { + int unpacked_bytes = 64; + int nbit = 3; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + uint8x16_t i0, i1, i2, i3; + torchao::bitpacking::internal::vec_load_64_uint8_values( + i0, i1, i2, i3, input.data()); + torchao::bitpacking::internal::vec_pack_64_uint3_values( + packed.data(), i0, i1, i2, i3); + + torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint3_values( + unpacked.data(), packed.data()); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_128_uint3_values, CppToNeon) { + int unpacked_bytes = 128; + int nbit = 3; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint3_values( + packed.data(), input.data()); + + uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7; + torchao::bitpacking::internal::vec_unpack_128_uint3_values( + u0, u1, u2, u3, u4, u5, u6, u7, packed.data()); + vst1q_u8(unpacked.data(), u0); + vst1q_u8(unpacked.data() + 16, u1); + vst1q_u8(unpacked.data() + 32, u2); + vst1q_u8(unpacked.data() + 48, u3); + vst1q_u8(unpacked.data() + 64, u4); + vst1q_u8(unpacked.data() + 80, u5); + vst1q_u8(unpacked.data() + 96, u6); + vst1q_u8(unpacked.data() + 112, u7); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_128_uint3_values, NeonToCpp) { + int unpacked_bytes = 128; + int nbit = 3; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + uint8x16_t i0, i1, i2, i3, i4, i5, i6, i7; + torchao::bitpacking::internal::vec_load_64_uint8_values( + i0, i1, i2, i3, input.data()); + torchao::bitpacking::internal::vec_load_64_uint8_values( + i4, i5, i6, i7, input.data() + 64); + torchao::bitpacking::internal::vec_pack_128_uint3_values( + packed.data(), i0, i1, i2, i3, i4, i5, i6, i7); + + torchao::kernels::cpu::fallback::bitpacking::internal:: + unpack_128_uint3_values(unpacked.data(), packed.data()); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +// --- Compatibility Tests for uint4 --- + +TEST(test_bitpacking_16_uint4_values, CppToNeon) { + int unpacked_bytes = 16; + int nbit = 4; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_16_uint4_values( + packed.data(), input.data()); + + uint8x16_t unpacked0; + torchao::bitpacking::internal::vec_unpack_16_uint4_values( + unpacked0, packed.data()); + vst1q_u8(unpacked.data(), unpacked0); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_16_uint4_values, NeonToCpp) { + int unpacked_bytes = 16; + int nbit = 4; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + uint8x16_t input0 = vld1q_u8(input.data()); + torchao::bitpacking::internal::vec_pack_16_uint4_values( + packed.data(), input0); + + torchao::kernels::cpu::fallback::bitpacking::internal::unpack_16_uint4_values( + unpacked.data(), packed.data()); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_32_uint4_values, CppToNeon) { + int unpacked_bytes = 32; + int nbit = 4; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values( + packed.data(), input.data()); + + uint8x16_t unpacked0, unpacked1; + torchao::bitpacking::internal::vec_unpack_32_uint4_values( + unpacked0, unpacked1, packed.data()); + vst1q_u8(unpacked.data(), unpacked0); + vst1q_u8(unpacked.data() + 16, unpacked1); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_32_uint4_values, NeonToCpp) { + int unpacked_bytes = 32; + int nbit = 4; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + uint8x16_t input0 = vld1q_u8(input.data()); + uint8x16_t input1 = vld1q_u8(input.data() + 16); + torchao::bitpacking::internal::vec_pack_32_uint4_values( + packed.data(), input0, input1); + + torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint4_values( + unpacked.data(), packed.data()); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +// --- Compatibility Tests for uint5 --- + +TEST(test_bitpacking_64_uint5_values, CppToNeon) { + int unpacked_bytes = 64; + int nbit = 5; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint5_values( + packed.data(), input.data()); + + uint8x16_t unpacked0, unpacked1, unpacked2, unpacked3; + torchao::bitpacking::internal::vec_unpack_64_uint5_values( + unpacked0, unpacked1, unpacked2, unpacked3, packed.data()); + vst1q_u8(unpacked.data(), unpacked0); + vst1q_u8(unpacked.data() + 16, unpacked1); + vst1q_u8(unpacked.data() + 32, unpacked2); + vst1q_u8(unpacked.data() + 48, unpacked3); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_64_uint5_values, NeonToCpp) { + int unpacked_bytes = 64; + int nbit = 5; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + uint8x16_t input0, input1, input2, input3; + torchao::bitpacking::internal::vec_load_64_uint8_values( + input0, input1, input2, input3, input.data()); + torchao::bitpacking::internal::vec_pack_64_uint5_values( + packed.data(), input0, input1, input2, input3); + + torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint5_values( + unpacked.data(), packed.data()); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_128_uint5_values, CppToNeon) { + int unpacked_bytes = 128; + int nbit = 5; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint5_values( + packed.data(), input.data()); + + uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7; + torchao::bitpacking::internal::vec_unpack_128_uint5_values( + u0, u1, u2, u3, u4, u5, u6, u7, packed.data()); + vst1q_u8(unpacked.data(), u0); + vst1q_u8(unpacked.data() + 16, u1); + vst1q_u8(unpacked.data() + 32, u2); + vst1q_u8(unpacked.data() + 48, u3); + vst1q_u8(unpacked.data() + 64, u4); + vst1q_u8(unpacked.data() + 80, u5); + vst1q_u8(unpacked.data() + 96, u6); + vst1q_u8(unpacked.data() + 112, u7); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_128_uint5_values, NeonToCpp) { + int unpacked_bytes = 128; + int nbit = 5; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + uint8x16_t i0, i1, i2, i3, i4, i5, i6, i7; + torchao::bitpacking::internal::vec_load_64_uint8_values( + i0, i1, i2, i3, input.data()); + torchao::bitpacking::internal::vec_load_64_uint8_values( + i4, i5, i6, i7, input.data() + 64); + torchao::bitpacking::internal::vec_pack_128_uint5_values( + packed.data(), i0, i1, i2, i3, i4, i5, i6, i7); + + torchao::kernels::cpu::fallback::bitpacking::internal:: + unpack_128_uint5_values(unpacked.data(), packed.data()); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +// --- Compatibility Tests for uint6 --- + +TEST(test_bitpacking_32_uint6_values, CppToNeon) { + int unpacked_bytes = 32; + int nbit = 6; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint6_values( + packed.data(), input.data()); + + uint8x16_t u0, u1; + torchao::bitpacking::internal::vec_unpack_32_uint6_values( + u0, u1, packed.data()); + vst1q_u8(unpacked.data(), u0); + vst1q_u8(unpacked.data() + 16, u1); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_32_uint6_values, NeonToCpp) { + int unpacked_bytes = 32; + int nbit = 6; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + uint8x16_t i0 = vld1q_u8(input.data()); + uint8x16_t i1 = vld1q_u8(input.data() + 16); + torchao::bitpacking::internal::vec_pack_32_uint6_values( + packed.data(), i0, i1); + + torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint6_values( + unpacked.data(), packed.data()); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_64_uint6_values, CppToNeon) { + int unpacked_bytes = 64; + int nbit = 6; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint6_values( + packed.data(), input.data()); + + uint8x16_t u0, u1, u2, u3; + torchao::bitpacking::internal::vec_unpack_64_uint6_values( + u0, u1, u2, u3, packed.data()); + vst1q_u8(unpacked.data(), u0); + vst1q_u8(unpacked.data() + 16, u1); + vst1q_u8(unpacked.data() + 32, u2); + vst1q_u8(unpacked.data() + 48, u3); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_64_uint6_values, NeonToCpp) { + int unpacked_bytes = 64; + int nbit = 6; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + uint8x16_t i0, i1, i2, i3; + torchao::bitpacking::internal::vec_load_64_uint8_values( + i0, i1, i2, i3, input.data()); + torchao::bitpacking::internal::vec_pack_64_uint6_values( + packed.data(), i0, i1, i2, i3); + + torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint6_values( + unpacked.data(), packed.data()); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +// --- Compatibility Tests for uint7 --- + +TEST(test_bitpacking_64_uint7_values, CppToNeon) { + int unpacked_bytes = 64; + int nbit = 7; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint7_values( + packed.data(), input.data()); + + uint8x16_t unpacked0, unpacked1, unpacked2, unpacked3; + torchao::bitpacking::internal::vec_unpack_64_uint7_values( + unpacked0, unpacked1, unpacked2, unpacked3, packed.data()); + vst1q_u8(unpacked.data(), unpacked0); + vst1q_u8(unpacked.data() + 16, unpacked1); + vst1q_u8(unpacked.data() + 32, unpacked2); + vst1q_u8(unpacked.data() + 48, unpacked3); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_64_uint7_values, NeonToCpp) { + int unpacked_bytes = 64; + int nbit = 7; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + uint8x16_t input0, input1, input2, input3; + torchao::bitpacking::internal::vec_load_64_uint8_values( + input0, input1, input2, input3, input.data()); + torchao::bitpacking::internal::vec_pack_64_uint7_values( + packed.data(), input0, input1, input2, input3); + + torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint7_values( + unpacked.data(), packed.data()); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_128_uint7_values, CppToNeon) { + int unpacked_bytes = 128; + int nbit = 7; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint7_values( + packed.data(), input.data()); + + uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7; + torchao::bitpacking::internal::vec_unpack_128_uint7_values( + u0, u1, u2, u3, u4, u5, u6, u7, packed.data()); + vst1q_u8(unpacked.data(), u0); + vst1q_u8(unpacked.data() + 16, u1); + vst1q_u8(unpacked.data() + 32, u2); + vst1q_u8(unpacked.data() + 48, u3); + vst1q_u8(unpacked.data() + 64, u4); + vst1q_u8(unpacked.data() + 80, u5); + vst1q_u8(unpacked.data() + 96, u6); + vst1q_u8(unpacked.data() + 112, u7); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +TEST(test_bitpacking_128_uint7_values, NeonToCpp) { + int unpacked_bytes = 128; + int nbit = 7; + int packed_bytes = unpacked_bytes * nbit / 8; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes, 0); + std::vector unpacked(unpacked_bytes, 0); + + uint8x16_t i0, i1, i2, i3, i4, i5, i6, i7; + torchao::bitpacking::internal::vec_load_64_uint8_values( + i0, i1, i2, i3, input.data()); + torchao::bitpacking::internal::vec_load_64_uint8_values( + i4, i5, i6, i7, input.data() + 64); + torchao::bitpacking::internal::vec_pack_128_uint7_values( + packed.data(), i0, i1, i2, i3, i4, i5, i6, i7); + + torchao::kernels::cpu::fallback::bitpacking::internal:: + unpack_128_uint7_values(unpacked.data(), packed.data()); + + for (int i = 0; i < unpacked_bytes; ++i) { + EXPECT_EQ(input[i], unpacked[i]); + } +} + +#endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/experimental/kernels/cpu/fallback/CMakeLists.txt b/torchao/experimental/kernels/cpu/fallback/CMakeLists.txt new file mode 100644 index 0000000000..0952fcc3f5 --- /dev/null +++ b/torchao/experimental/kernels/cpu/fallback/CMakeLists.txt @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/bitpack.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/bitpack.h new file mode 100644 index 0000000000..1a558d27ac --- /dev/null +++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/bitpack.h @@ -0,0 +1,179 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torchao::kernels::cpu::fallback::bitpacking { +namespace internal { +/** + * @brief Packs 128 unsigned 8-bit integers into a packed format of 'nbit' bits. + * + * @tparam nbit The number of bits to pack each value into (1-8). + * @param packed Pointer to the destination memory for the packed data. + * @param unpacked_values Pointer to the source memory with 128 uint8_t values. + */ +template +inline void pack_128_uint_values( + uint8_t* packed, + const uint8_t* unpacked_values) { + static_assert(nbit >= 1 && nbit <= 8, "nbit must be between 1 and 8"); + + // Dispatch to the correct packing function + if constexpr (nbit == 1) { + pack_128_uint1_values(packed, unpacked_values); + } else if constexpr (nbit == 2) { + pack_64_uint2_values(packed, unpacked_values); + pack_64_uint2_values(packed + 16, unpacked_values + 64); + } else if constexpr (nbit == 3) { + pack_128_uint3_values(packed, unpacked_values); + } else if constexpr (nbit == 4) { + pack_32_uint4_values(packed, unpacked_values); + pack_32_uint4_values(packed + 16, unpacked_values + 32); + pack_32_uint4_values(packed + 32, unpacked_values + 64); + pack_32_uint4_values(packed + 48, unpacked_values + 96); + } else if constexpr (nbit == 5) { + pack_128_uint5_values(packed, unpacked_values); + } else if constexpr (nbit == 6) { + pack_64_uint6_values(packed, unpacked_values); + pack_64_uint6_values(packed + 48, unpacked_values + 64); + } else if constexpr (nbit == 7) { + pack_128_uint7_values(packed, unpacked_values); + } else if constexpr (nbit == 8) { + // For 8-bit, it's a direct memory copy + for (int i = 0; i < 128; ++i) { + packed[i] = unpacked_values[i]; + } + } +} +/** + * @brief Unpacks 'nbit' data into 128 unsigned 8-bit integers. + * + * @tparam nbit The number of bits per value in the packed format (1-8). + * @param unpacked_values Pointer to the destination memory (128 uint8_t + * values). + * @param packed Pointer to the source packed data. + */ +template +inline void unpack_128_uint_values( + uint8_t* unpacked_values, + const uint8_t* packed) { + static_assert(nbit >= 1 && nbit <= 8, "nbit must be between 1 and 8"); + + // Dispatch to the correct unpacking function, writing directly to the output. + if constexpr (nbit == 1) { + unpack_128_uint1_values(unpacked_values, packed); + } else if constexpr (nbit == 2) { + unpack_64_uint2_values(unpacked_values, packed); + unpack_64_uint2_values(unpacked_values + 64, packed + 16); + } else if constexpr (nbit == 3) { + unpack_128_uint3_values(unpacked_values, packed); + } else if constexpr (nbit == 4) { + unpack_32_uint4_values(unpacked_values, packed); + unpack_32_uint4_values(unpacked_values + 32, packed + 16); + unpack_32_uint4_values(unpacked_values + 64, packed + 32); + unpack_32_uint4_values(unpacked_values + 96, packed + 48); + } else if constexpr (nbit == 5) { + unpack_128_uint5_values(unpacked_values, packed); + } else if constexpr (nbit == 6) { + unpack_64_uint6_values(unpacked_values, packed); + unpack_64_uint6_values(unpacked_values + 64, packed + 48); + } else if constexpr (nbit == 7) { + unpack_128_uint7_values(unpacked_values, packed); + } else if constexpr (nbit == 8) { + // For 8-bit, it's a direct memory copy + for (int i = 0; i < 128; ++i) { + unpacked_values[i] = packed[i]; + } + } +} + +/** + * @brief Packs 128 signed 8-bit integers into a packed format of 'nbit' bits. + * + * @tparam nbit The number of bits to pack each value into (1-8). + * @param packed Pointer to the destination memory. + * @param unpacked Pointer to the source memory containing 128 int8_t values. + */ +template +inline void pack_128_lowbit_int_values( + uint8_t* packed, + const int8_t* unpacked) { + // 1. Convert signed input to a temporary buffer of unsigned values. + uint8_t temp_unpacked[128]; + if constexpr (nbit < 8) { + const int8_t shift = 1 << (nbit - 1); + for (int i = 0; i < 128; ++i) { + temp_unpacked[i] = static_cast(unpacked[i] + shift); + } + } else { // nbit == 8 + for (int i = 0; i < 128; ++i) { + temp_unpacked[i] = static_cast(unpacked[i]); + } + } + + // 2. Call the generalized uint packing function. + pack_128_uint_values(packed, temp_unpacked); +} + +template +inline void unpack_128_lowbit_int_values( + int8_t* unpacked, + const uint8_t* packed) { + // 1. Get the raw unsigned values by calling the base function. + uint8_t temp_unpacked[128]; + unpack_128_uint_values(temp_unpacked, packed); + + // 2. Perform the signed conversion. + if constexpr (nbit < 8) { + const int8_t unshift = -(1 << (nbit - 1)); + for (int i = 0; i < 128; ++i) { + unpacked[i] = static_cast(temp_unpacked[i]) + unshift; + } + } else { // nbit == 8 + for (int i = 0; i < 128; ++i) { + unpacked[i] = static_cast(temp_unpacked[i]); + } + } +} + +/** + * @brief Unpacks 'nbit' data and de-quantizes it using a lookup table (LUT). + * + * @tparam nbit The number of bits per value in the packed format (1-4). + * @param unpacked Pointer to the destination memory (128 int8_t values). + * @param packed Pointer to the source packed data. + * @param lut Pointer to the lookup table (must have 2^nbit entries). + */ +template +inline void unpack_128_lowbit_values_with_lut( + int8_t* unpacked, + const uint8_t* packed, + const int8_t* lut) { + static_assert(nbit >= 1 && nbit <= 4, "LUT version only supports nbit <= 4"); + + // Create a temporary buffer on the stack for the indices. + uint8_t indices[128]; + + // 1. Call the utility function to handle all the unpacking logic. + unpack_128_uint_values(indices, packed); + + // 2. Apply the lookup table. + for (int i = 0; i < 128; ++i) { + unpacked[i] = lut[indices[i]]; + } +} +} // namespace internal +} // namespace torchao::kernels::cpu::fallback::bitpacking diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/uint1.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint1.h new file mode 100644 index 0000000000..67d4512a2c --- /dev/null +++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint1.h @@ -0,0 +1,154 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include + +namespace torchao::kernels::cpu::fallback::bitpacking { +namespace internal { + +/** + * @brief Packs 8 bytes, each containing a 1-bit value (0 or 1), into a single + * byte. + * @param packed Pointer to the destination memory (1 byte). + * @param unpacked Pointer to the source memory (8 bytes). + */ +TORCHAO_ALWAYS_INLINE inline void pack_8_uint1_values( + uint8_t* packed, + const uint8_t* unpacked) { + packed[0] = (unpacked[0] << 7) | (unpacked[1] << 6) | (unpacked[2] << 5) | + (unpacked[3] << 4) | (unpacked[4] << 3) | (unpacked[5] << 2) | + (unpacked[6] << 1) | (unpacked[7] << 0); +} + +/** + * @brief Unpacks a single byte into 8 bytes, each containing a 1-bit value. + * @param unpacked Pointer to the destination memory (8 bytes). + * @param packed Pointer to the source memory (1 byte). + */ +TORCHAO_ALWAYS_INLINE inline void unpack_8_uint1_values( + uint8_t* unpacked, + const uint8_t* packed) { + const uint8_t packed_byte = packed[0]; + unpacked[0] = (packed_byte >> 7) & 1; + unpacked[1] = (packed_byte >> 6) & 1; + unpacked[2] = (packed_byte >> 5) & 1; + unpacked[3] = (packed_byte >> 4) & 1; + unpacked[4] = (packed_byte >> 3) & 1; + unpacked[5] = (packed_byte >> 2) & 1; + unpacked[6] = (packed_byte >> 1) & 1; + unpacked[7] = (packed_byte >> 0) & 1; +} + +/** + * @brief Packs 64 bytes (each a 1-bit value) into 8 bytes. + * @param packed Pointer to the destination memory (8 bytes). + * @param unpacked Pointer to the source memory (64 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_pack_64_uint1_values` function to ensure compatibility. The unpacked + * data is assumed to be organized as four 16-byte blocks. + */ +TORCHAO_ALWAYS_INLINE inline void pack_64_uint1_values( + uint8_t* packed, + const uint8_t* unpacked) { + const uint8_t* unpacked0 = unpacked; + const uint8_t* unpacked1 = unpacked + 16; + const uint8_t* unpacked2 = unpacked + 32; + const uint8_t* unpacked3 = unpacked + 48; + + for (int i = 0; i < 8; ++i) { + // Combine 4 bits for the low nibble of the output byte + uint8_t low_nibble = (unpacked0[i] << 3) | (unpacked1[i] << 2) | + (unpacked2[i] << 1) | (unpacked3[i] << 0); + + // Combine 4 bits for the high nibble of the output byte + uint8_t high_nibble_src = (unpacked0[i + 8] << 3) | + (unpacked1[i + 8] << 2) | (unpacked2[i + 8] << 1) | + (unpacked3[i + 8] << 0); + + // Assemble the final byte + packed[i] = low_nibble | (high_nibble_src << 4); + } +} + +/** + * @brief Unpacks 8 bytes into 64 bytes (each a 1-bit value). + * @param unpacked Pointer to the destination memory (64 bytes). + * @param packed Pointer to the source memory (8 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_unpack_64_uint1_values` function to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void unpack_64_uint1_values( + uint8_t* unpacked, + const uint8_t* packed) { + uint8_t* unpacked0 = unpacked; + uint8_t* unpacked1 = unpacked + 16; + uint8_t* unpacked2 = unpacked + 32; + uint8_t* unpacked3 = unpacked + 48; + + uint8_t combined[16]; + for (int i = 0; i < 8; ++i) { + combined[i] = packed[i] & 0x0F; // Low nibbles + combined[i + 8] = packed[i] >> 4; // High nibbles + } + + // Unpack from the combined buffer into the four destination blocks + for (int i = 0; i < 16; ++i) { + const uint8_t temp = combined[i]; + unpacked0[i] = (temp >> 3) & 1; + unpacked1[i] = (temp >> 2) & 1; + unpacked2[i] = (temp >> 1) & 1; + unpacked3[i] = (temp >> 0) & 1; + } +} + +/** + * @brief Packs 128 bytes (each a 1-bit value) into 16 bytes. + * @param packed Pointer to the destination memory (16 bytes). + * @param unpacked Pointer to the source memory (128 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_pack_128_uint1_values` function (a transpose-and-pack operation) to + * ensure compatibility. The unpacked data is assumed to be organized as eight + * 16-byte blocks. + */ +TORCHAO_ALWAYS_INLINE inline void pack_128_uint1_values( + uint8_t* packed, + const uint8_t* unpacked) { + for (int i = 0; i < 16; ++i) { + packed[i] = (unpacked[i + 16 * 0] << 7) | (unpacked[i + 16 * 1] << 6) | + (unpacked[i + 16 * 2] << 5) | (unpacked[i + 16 * 3] << 4) | + (unpacked[i + 16 * 4] << 3) | (unpacked[i + 16 * 5] << 2) | + (unpacked[i + 16 * 6] << 1) | (unpacked[i + 16 * 7] << 0); + } +} + +/** + * @brief Unpacks 16 bytes into 128 bytes (each a 1-bit value). + * @param unpacked Pointer to the destination memory (128 bytes). + * @param packed Pointer to the source memory (16 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_unpack_128_uint1_values` function (an unpack-and-transpose operation) + * to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void unpack_128_uint1_values( + uint8_t* unpacked, + const uint8_t* packed) { + for (int i = 0; i < 16; ++i) { + const uint8_t packed_byte = packed[i]; + unpacked[i + 16 * 0] = (packed_byte >> 7) & 1; + unpacked[i + 16 * 1] = (packed_byte >> 6) & 1; + unpacked[i + 16 * 2] = (packed_byte >> 5) & 1; + unpacked[i + 16 * 3] = (packed_byte >> 4) & 1; + unpacked[i + 16 * 4] = (packed_byte >> 3) & 1; + unpacked[i + 16 * 5] = (packed_byte >> 2) & 1; + unpacked[i + 16 * 6] = (packed_byte >> 1) & 1; + unpacked[i + 16 * 7] = (packed_byte >> 0) & 1; + } +} +} // namespace internal +} // namespace torchao::kernels::cpu::fallback::bitpacking diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/uint2.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint2.h new file mode 100644 index 0000000000..2681110348 --- /dev/null +++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint2.h @@ -0,0 +1,119 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include +namespace torchao::kernels::cpu::fallback::bitpacking { +namespace internal { + +/** + * @brief Packs 4 bytes, each containing a 2-bit value (0-3), into a single + * byte. + * @param packed Pointer to the destination memory (1 byte). + * @param unpacked Pointer to the source memory (4 bytes). + */ +TORCHAO_ALWAYS_INLINE inline void pack_4_uint2_values( + uint8_t* packed, + const uint8_t* unpacked) { + // unpacked = {v0, v1, v2, v3} -> packed[0] = | v0 | v1 | v2 | v3 | + packed[0] = (unpacked[0] << 6) | (unpacked[1] << 4) | (unpacked[2] << 2) | + (unpacked[3]); +} + +/** + * @brief Unpacks a single byte into 4 bytes, each containing a 2-bit value. + * @param unpacked Pointer to the destination memory (4 bytes). + * @param packed Pointer to the source memory (1 byte). + */ +TORCHAO_ALWAYS_INLINE inline void unpack_4_uint2_values( + uint8_t* unpacked, + const uint8_t* packed) { + unpacked[0] = (packed[0] >> 6) & 0x03; // Mask 0b11000000 + unpacked[1] = (packed[0] >> 4) & 0x03; // Mask 0b00110000 + unpacked[2] = (packed[0] >> 2) & 0x03; // Mask 0b00001100 + unpacked[3] = packed[0] & 0x03; // Mask 0b00000011 +} + +/** + * @brief Packs 32 bytes (each a 2-bit value) into 8 bytes. + * @param packed Pointer to the destination memory (8 bytes). + * @param unpacked Pointer to the source memory (32 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_pack_32_uint2_values` function (a transpose-and-pack operation) to + * ensure compatibility. The unpacked data is assumed to be organized as four + * 8-byte blocks. + */ +TORCHAO_ALWAYS_INLINE inline void pack_32_uint2_values( + uint8_t* packed, + const uint8_t* unpacked) { + for (int i = 0; i < 8; ++i) { + packed[i] = (unpacked[i + 8 * 0] << 6) | (unpacked[i + 8 * 1] << 4) | + (unpacked[i + 8 * 2] << 2) | (unpacked[i + 8 * 3] << 0); + } +} + +/** + * @brief Unpacks 8 bytes into 32 bytes (each a 2-bit value). + * @param unpacked Pointer to the destination memory (32 bytes). + * @param packed Pointer to the source memory (8 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_unpack_32_uint2_values` function (an unpack-and-transpose operation) + * to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void unpack_32_uint2_values( + uint8_t* unpacked, + const uint8_t* packed) { + for (int i = 0; i < 8; ++i) { + const uint8_t packed_byte = packed[i]; + unpacked[i + 8 * 0] = (packed_byte >> 6) & 0x03; + unpacked[i + 8 * 1] = (packed_byte >> 4) & 0x03; + unpacked[i + 8 * 2] = (packed_byte >> 2) & 0x03; + unpacked[i + 8 * 3] = (packed_byte >> 0) & 0x03; + } +} + +/** + * @brief Packs 64 bytes (each a 2-bit value) into 16 bytes. + * @param packed Pointer to the destination memory (16 bytes). + * @param unpacked Pointer to the source memory (64 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_pack_64_uint2_values` function (a transpose-and-pack operation) to + * ensure compatibility. The unpacked data is assumed to be organized as four + * 16-byte blocks. + */ +TORCHAO_ALWAYS_INLINE inline void pack_64_uint2_values( + uint8_t* packed, + const uint8_t* unpacked) { + for (int i = 0; i < 16; ++i) { + packed[i] = (unpacked[i + 16 * 0] << 6) | (unpacked[i + 16 * 1] << 4) | + (unpacked[i + 16 * 2] << 2) | (unpacked[i + 16 * 3] << 0); + } +} + +/** + * @brief Unpacks 16 bytes into 64 bytes (each a 2-bit value). + * @param unpacked Pointer to the destination memory (64 bytes). + * @param packed Pointer to the source memory (16 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_unpack_64_uint2_values` function (an unpack-and-transpose operation) + * to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void unpack_64_uint2_values( + uint8_t* unpacked, + const uint8_t* packed) { + for (int i = 0; i < 16; ++i) { + const uint8_t packed_byte = packed[i]; + unpacked[i + 16 * 0] = (packed_byte >> 6) & 0x03; + unpacked[i + 16 * 1] = (packed_byte >> 4) & 0x03; + unpacked[i + 16 * 2] = (packed_byte >> 2) & 0x03; + unpacked[i + 16 * 3] = (packed_byte >> 0) & 0x03; + } +} + +} // namespace internal +} // namespace torchao::kernels::cpu::fallback::bitpacking diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/uint3.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint3.h new file mode 100644 index 0000000000..635e1bca6c --- /dev/null +++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint3.h @@ -0,0 +1,195 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include + +namespace torchao::kernels::cpu::fallback::bitpacking { +namespace internal { + +/** + * @brief Packs 8 bytes, each holding a 3-bit value (0-7), into 3 bytes. + * + * The packing scheme is non-trivial. Given 8 input values v0..v7, they are + * arranged into 3 bytes (b0, b1, b2) as follows: + * - b0: [v6(low 2 bits), v0(all 3 bits), v1(all 3 bits)] + * - b1: [v7(low 2 bits), v2(all 3 bits), v3(all 3 bits)] + * - b2: [v6(high 1 bit), v7(high 1 bit), v4(all 3 bits), v5(all 3 bits)] + * + * @param packed Pointer to the destination memory (3 bytes). + * @param unpacked Pointer to the source memory (8 bytes). + */ +TORCHAO_ALWAYS_INLINE inline void pack_8_uint3_values( + uint8_t* packed, + const uint8_t* unpacked) { + // byte 0 + packed[0] = ((unpacked[6] & 0x03) << 6) | ((unpacked[0] & 0x07) << 3) | + (unpacked[1] & 0x07); + + // byte 1 + packed[1] = ((unpacked[7] & 0x03) << 6) | ((unpacked[2] & 0x07) << 3) | + (unpacked[3] & 0x07); + + // byte 2 + packed[2] = ((unpacked[6] & 0x04) << 5) | ((unpacked[7] & 0x04) << 4) | + ((unpacked[4] & 0x07) << 3) | (unpacked[5] & 0x07); +} + +/** + * @brief Unpacks 3 bytes into 8 bytes, each containing a 3-bit value. + * @param unpacked Pointer to the destination memory (8 bytes). + * @param packed Pointer to the source memory (3 bytes). + */ +TORCHAO_ALWAYS_INLINE inline void unpack_8_uint3_values( + uint8_t* unpacked, + const uint8_t* packed) { + const uint8_t b0 = packed[0]; + const uint8_t b1 = packed[1]; + const uint8_t b2 = packed[2]; + + unpacked[0] = (b0 >> 3) & 0x07; + unpacked[1] = b0 & 0x07; + + unpacked[2] = (b1 >> 3) & 0x07; + unpacked[3] = b1 & 0x07; + + unpacked[4] = (b2 >> 3) & 0x07; + unpacked[5] = b2 & 0x07; + + unpacked[6] = (b0 >> 6) | ((b2 >> 5) & 0x04); + unpacked[7] = (b1 >> 6) | ((b2 >> 4) & 0x04); +} + +/** + * @brief Packs 64 bytes (each a 3-bit value) into 24 bytes. + * @param packed Pointer to the destination memory (24 bytes). + * @param unpacked Pointer to the source memory (64 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_pack_64_uint3_values` function (a transpose-and-pack operation) to + * ensure compatibility. The unpacked data is assumed to be organized as eight + * 8-byte blocks. + */ +TORCHAO_ALWAYS_INLINE inline void pack_64_uint3_values( + uint8_t* packed, + const uint8_t* unpacked) { + for (int i = 0; i < 8; ++i) { + const uint8_t unpacked0 = unpacked[i + 8 * 0]; + const uint8_t unpacked1 = unpacked[i + 8 * 1]; + const uint8_t unpacked2 = unpacked[i + 8 * 2]; + const uint8_t unpacked3 = unpacked[i + 8 * 3]; + const uint8_t unpacked4 = unpacked[i + 8 * 4]; + const uint8_t unpacked5 = unpacked[i + 8 * 5]; + const uint8_t unpacked6 = unpacked[i + 8 * 6]; + const uint8_t unpacked7 = unpacked[i + 8 * 7]; + + // byte 0 + packed[i] = ((unpacked6 & 0x03) << 6) | ((unpacked0 & 0x07) << 3) | + (unpacked1 & 0x07); + + // byte 1 + packed[i + 8] = ((unpacked7 & 0x03) << 6) | ((unpacked2 & 0x07) << 3) | + (unpacked3 & 0x07); + + // byte 2 + packed[i + 16] = ((unpacked6 & 0x04) << 5) | ((unpacked7 & 0x04) << 4) | + ((unpacked4 & 0x07) << 3) | (unpacked5 & 0x07); + } +} + +/** + * @brief Unpacks 24 bytes into 64 bytes (each a 3-bit value). + * @param unpacked Pointer to the destination memory (64 bytes). + * @param packed Pointer to the source memory (24 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_unpack_64_uint3_values` function (an unpack-and-transpose operation) + * to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void unpack_64_uint3_values( + uint8_t* unpacked, + const uint8_t* packed) { + for (int i = 0; i < 8; ++i) { + const uint8_t b0 = packed[i]; + const uint8_t b1 = packed[i + 8]; + const uint8_t b2 = packed[i + 16]; + + unpacked[i + 8 * 0] = (b0 >> 3) & 0x07; + unpacked[i + 8 * 1] = b0 & 0x07; + unpacked[i + 8 * 2] = (b1 >> 3) & 0x07; + unpacked[i + 8 * 3] = b1 & 0x07; + unpacked[i + 8 * 4] = (b2 >> 3) & 0x07; + unpacked[i + 8 * 5] = b2 & 0x07; + unpacked[i + 8 * 6] = (b0 >> 6) | ((b2 >> 5) & 0x04); + unpacked[i + 8 * 7] = (b1 >> 6) | ((b2 >> 4) & 0x04); + } +} + +/** + * @brief Packs 128 bytes (each a 3-bit value) into 48 bytes. + * @param packed Pointer to the destination memory (48 bytes). + * @param unpacked Pointer to the source memory (128 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_pack_128_uint3_values` function (a transpose-and-pack operation) to + * ensure compatibility. The unpacked data is assumed to be organized as eight + * 16-byte blocks. + */ +TORCHAO_ALWAYS_INLINE inline void pack_128_uint3_values( + uint8_t* packed, + const uint8_t* unpacked) { + for (int i = 0; i < 16; ++i) { + const uint8_t unpacked0 = unpacked[i + 16 * 0]; + const uint8_t unpacked1 = unpacked[i + 16 * 1]; + const uint8_t unpacked2 = unpacked[i + 16 * 2]; + const uint8_t unpacked3 = unpacked[i + 16 * 3]; + const uint8_t unpacked4 = unpacked[i + 16 * 4]; + const uint8_t unpacked5 = unpacked[i + 16 * 5]; + const uint8_t unpacked6 = unpacked[i + 16 * 6]; + const uint8_t unpacked7 = unpacked[i + 16 * 7]; + + // byte 0 + packed[i] = ((unpacked6 & 0x03) << 6) | ((unpacked0 & 0x07) << 3) | + (unpacked1 & 0x07); + + // byte 1 + packed[i + 16] = ((unpacked7 & 0x03) << 6) | ((unpacked2 & 0x07) << 3) | + (unpacked3 & 0x07); + + // byte 2 + packed[i + 32] = ((unpacked6 & 0x04) << 5) | ((unpacked7 & 0x04) << 4) | + ((unpacked4 & 0x07) << 3) | (unpacked5 & 0x07); + } +} + +/** + * @brief Unpacks 48 bytes into 128 bytes (each a 3-bit value). + * @param unpacked Pointer to the destination memory (128 bytes). + * @param packed Pointer to the source memory (48 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_unpack_128_uint3_values` function (an unpack-and-transpose operation) + * to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void unpack_128_uint3_values( + uint8_t* unpacked, + const uint8_t* packed) { + for (int i = 0; i < 16; ++i) { + const uint8_t b0 = packed[i]; + const uint8_t b1 = packed[i + 16]; + const uint8_t b2 = packed[i + 32]; + + unpacked[i + 16 * 0] = (b0 >> 3) & 0x07; + unpacked[i + 16 * 1] = b0 & 0x07; + unpacked[i + 16 * 2] = (b1 >> 3) & 0x07; + unpacked[i + 16 * 3] = b1 & 0x07; + unpacked[i + 16 * 4] = (b2 >> 3) & 0x07; + unpacked[i + 16 * 5] = b2 & 0x07; + unpacked[i + 16 * 6] = (b0 >> 6) | ((b2 >> 5) & 0x04); + unpacked[i + 16 * 7] = (b1 >> 6) | ((b2 >> 4) & 0x04); + } +} + +} // namespace internal +} // namespace torchao::kernels::cpu::fallback::bitpacking diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/uint4.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint4.h new file mode 100644 index 0000000000..27be9488d7 --- /dev/null +++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint4.h @@ -0,0 +1,109 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include + +namespace torchao::kernels::cpu::fallback::bitpacking { +namespace internal { +/** + * @brief Packs 2 bytes, each holding a 4-bit value (0-15), into a single + * byte. The first value goes into the high nibble, the second into the low + * nibble. + * @param packed Pointer to the destination memory (1 byte). + * @param unpacked Pointer to the source memory (2 bytes). + */ +TORCHAO_ALWAYS_INLINE inline void pack_2_uint4_values( + uint8_t* packed, + const uint8_t* unpacked) { + // This is compatible with the scalar NEON version. + packed[0] = (unpacked[0] << 4) | (unpacked[1] & 0x0F); +} + +/** + * @brief Unpacks a single byte into 2 bytes, each containing a 4-bit value. + * @param unpacked Pointer to the destination memory (2 bytes). + * @param packed Pointer to the source memory (1 byte). + */ +TORCHAO_ALWAYS_INLINE inline void unpack_2_uint4_values( + uint8_t* unpacked, + const uint8_t* packed) { + // This is compatible with the scalar NEON version. + unpacked[0] = packed[0] >> 4; + unpacked[1] = packed[0] & 0x0F; +} + +/** + * @brief Packs 16 bytes (each a 4-bit value) into 8 bytes. + * @param packed Pointer to the destination memory (8 bytes). + * @param unpacked Pointer to the source memory (16 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_pack_16_uint4_values` function (a transpose-and-pack operation) to + * ensure compatibility. It packs unpacked[i] and unpacked[i+8] into + * packed[i]. + */ +TORCHAO_ALWAYS_INLINE inline void pack_16_uint4_values( + uint8_t* packed, + const uint8_t* unpacked) { + for (int i = 0; i < 8; ++i) { + packed[i] = ((unpacked[i + 8] & 0x0F) << 4) | (unpacked[i] & 0x0F); + } +} + +/** + * @brief Unpacks 8 bytes into 16 bytes (each a 4-bit value). + * @param unpacked Pointer to the destination memory (16 bytes). + * @param packed Pointer to the source memory (8 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_unpack_16_uint4_values` function (an unpack-and-transpose operation) + * to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void unpack_16_uint4_values( + uint8_t* unpacked, + const uint8_t* packed) { + for (int i = 0; i < 8; ++i) { + unpacked[i] = packed[i] & 0x0F; + unpacked[i + 8] = packed[i] >> 4; + } +} + +/** + * @brief Packs 32 bytes (each a 4-bit value) into 16 bytes. + * @param packed Pointer to the destination memory (16 bytes). + * @param unpacked Pointer to the source memory (32 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_pack_32_uint4_values` function (a transpose-and-pack operation) to + * ensure compatibility. It packs unpacked[i] and unpacked[i+16] into + * packed[i]. + */ +TORCHAO_ALWAYS_INLINE inline void pack_32_uint4_values( + uint8_t* packed, + const uint8_t* unpacked) { + for (int i = 0; i < 16; ++i) { + packed[i] = ((unpacked[i + 16] & 0x0F) << 4) | (unpacked[i] & 0x0F); + } +} + +/** + * @brief Unpacks 16 bytes into 32 bytes (each a 4-bit value). + * @param unpacked Pointer to the destination memory (32 bytes). + * @param packed Pointer to the source memory (16 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_unpack_32_uint4_values` function (an unpack-and-transpose operation) + * to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void unpack_32_uint4_values( + uint8_t* unpacked, + const uint8_t* packed) { + for (int i = 0; i < 16; ++i) { + unpacked[i] = packed[i] & 0x0F; + unpacked[i + 16] = packed[i] >> 4; + } +} +} // namespace internal +} // namespace torchao::kernels::cpu::fallback::bitpacking diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/uint5.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint5.h new file mode 100644 index 0000000000..2ad408a75a --- /dev/null +++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint5.h @@ -0,0 +1,175 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include + +namespace torchao::kernels::cpu::fallback::bitpacking { +namespace internal { + +/** + * @brief Packs 8 bytes, each holding a 5-bit value (0-31), into 5 bytes. + * + * @param packed Pointer to the destination memory (5 bytes). + * @param unpacked Pointer to the source memory (8 bytes). + */ +TORCHAO_ALWAYS_INLINE inline void pack_8_uint5_values( + uint8_t* packed, + const uint8_t* unpacked) { + // pack 8 uint5 values (u0..u7) into 5 bytes (p0..p4) + // p0 = u0_all | u1_low_3_bits + // p1 = u2_all | u3_low_3_bits + // p2 = u4_all | u5_low_3_bits + // p3 = u6_all | u7_low_3_bits + // p4 = u1_high_2_bits | u3_high_2_bits | u5_high_2_bits | u7_high_2_bits + packed[0] = (unpacked[0] & 0x1F) | ((unpacked[1] & 0x1F) << 5); + packed[1] = (unpacked[2] & 0x1F) | ((unpacked[3] & 0x1F) << 5); + packed[2] = (unpacked[4] & 0x1F) | ((unpacked[5] & 0x1F) << 5); + packed[3] = (unpacked[6] & 0x1F) | ((unpacked[7] & 0x1F) << 5); + packed[4] = ((unpacked[1] & 0x1F) >> 3) | (((unpacked[3] & 0x1F) >> 3) << 2) | + (((unpacked[5] & 0x1F) >> 3) << 4) | (((unpacked[7] & 0x1F) >> 3) << 6); +} + +/** + * @brief Unpacks 5 bytes into 8 bytes, each containing a 5-bit value. + * + * @param unpacked Pointer to the destination memory (8 bytes). + * @param packed Pointer to the source memory (5 bytes). + */ +TORCHAO_ALWAYS_INLINE inline void unpack_8_uint5_values( + uint8_t* unpacked, + const uint8_t* packed) { + const uint8_t p0 = packed[0]; + const uint8_t p1 = packed[1]; + const uint8_t p2 = packed[2]; + const uint8_t p3 = packed[3]; + const uint8_t p4 = packed[4]; + + // This is compatible with the scalar NEON version. + unpacked[0] = p0 & 0x1F; + unpacked[1] = (p0 >> 5) | ((p4 & 0x03) << 3); + unpacked[2] = p1 & 0x1F; + unpacked[3] = (p1 >> 5) | ((p4 & 0x0C) << 1); + unpacked[4] = p2 & 0x1F; + unpacked[5] = (p2 >> 5) | ((p4 & 0x30) >> 1); + unpacked[6] = p3 & 0x1F; + unpacked[7] = (p3 >> 5) | ((p4 & 0xC0) >> 3); +} + +/** + * @brief Packs 64 bytes (each a 5-bit value) into 40 bytes. + * @param packed Pointer to the destination memory (40 bytes). + * @param unpacked Pointer to the source memory (64 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_pack_64_uint5_values` function to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void pack_64_uint5_values( + uint8_t* packed, + const uint8_t* unpacked) { + // Pack the first 32 bytes (p0, p1) + for (int i = 0; i < 16; ++i) { + packed[i] = (unpacked[i] & 0x1F) | ((unpacked[i + 16] & 0x1F) << 5); + packed[i + 16] = (unpacked[i + 32] & 0x1F) | ((unpacked[i + 48] & 0x1F) << 5); + } + + // Pack the final 8 bytes (p2) + for (int i = 0; i < 8; ++i) { + uint8_t val1 = (unpacked[16 + i] >> 3) & 0x03; + uint8_t val2 = (unpacked[24 + i] >> 3) & 0x03; + uint8_t val3 = (unpacked[48 + i] >> 3) & 0x03; + uint8_t val4 = (unpacked[56 + i] >> 3) & 0x03; + packed[32 + i] = val1 | (val2 << 2) | (val3 << 4) | (val4 << 6); + } +} + +/** + * @brief Unpacks 40 bytes into 64 bytes (each a 5-bit value). + * @param unpacked Pointer to the destination memory (64 bytes). + * @param packed Pointer to the source memory (40 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_unpack_64_uint5_values` function to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void unpack_64_uint5_values( + uint8_t* unpacked, + const uint8_t* packed) { + for (int i = 0; i < 16; ++i) { + const uint8_t p0 = packed[i]; + const uint8_t p1 = packed[i + 16]; + // p2 is only 8 bytes wide, so we use modulo to access it correctly. + const uint8_t p2 = packed[32 + (i % 8)]; + + unpacked[i] = p0 & 0x1F; + unpacked[i + 32] = p1 & 0x1F; + + if (i < 8) { + unpacked[i + 16] = (p0 >> 5) | ((p2 & 0x03) << 3); + unpacked[i + 48] = (p1 >> 5) | ((p2 & 0x30) >> 1); + } else { + unpacked[i + 16] = (p0 >> 5) | ((p2 & 0x0C) << 1); + unpacked[i + 48] = (p1 >> 5) | ((p2 & 0xC0) >> 3); + } + } +} + +/** + * @brief Packs 128 bytes (each a 5-bit value) into 80 bytes. + * @param packed Pointer to the destination memory (80 bytes). + * @param unpacked Pointer to the source memory (128 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_pack_128_uint5_values` function to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void pack_128_uint5_values( + uint8_t* packed, + const uint8_t* unpacked) { + // Pack the first 64 bytes (p0, p1, p2, p3) + for (int i = 0; i < 16; ++i) { + packed[i] = (unpacked[i] & 0x1F) | ((unpacked[i + 16] & 0x1F) << 5); + packed[i + 16] = (unpacked[i + 32] & 0x1F) | ((unpacked[i + 48] & 0x1F) << 5); + packed[i + 32] = (unpacked[i + 64] & 0x1F) | ((unpacked[i + 80] & 0x1F) << 5); + packed[i + 48] = (unpacked[i + 96] & 0x1F) | ((unpacked[i + 112] & 0x1F) << 5); + } + + // Pack the final 16 bytes (p4) + for (int i = 0; i < 16; ++i) { + uint8_t val1 = (unpacked[16 + i] >> 3) & 0x03; + uint8_t val2 = (unpacked[48 + i] >> 3) & 0x03; + uint8_t val3 = (unpacked[80 + i] >> 3) & 0x03; + uint8_t val4 = (unpacked[112 + i] >> 3) & 0x03; + packed[64 + i] = val1 | (val2 << 2) | (val3 << 4) | (val4 << 6); + } +} + +/** + * @brief Unpacks 80 bytes into 128 bytes (each a 5-bit value). + * @param unpacked Pointer to the destination memory (128 bytes). + * @param packed Pointer to the source memory (80 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_unpack_128_uint5_values` function to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void unpack_128_uint5_values( + uint8_t* unpacked, + const uint8_t* packed) { + for (int i = 0; i < 16; ++i) { + const uint8_t p0 = packed[i]; + const uint8_t p1 = packed[i + 16]; + const uint8_t p2 = packed[i + 32]; + const uint8_t p3 = packed[i + 48]; + const uint8_t p4 = packed[i + 64]; + + unpacked[i + 16 * 0] = p0 & 0x1F; + unpacked[i + 16 * 1] = (p0 >> 5) | ((p4 & 0x03) << 3); + unpacked[i + 16 * 2] = p1 & 0x1F; + unpacked[i + 16 * 3] = (p1 >> 5) | ((p4 & 0x0C) << 1); + unpacked[i + 16 * 4] = p2 & 0x1F; + unpacked[i + 16 * 5] = (p2 >> 5) | ((p4 & 0x30) >> 1); + unpacked[i + 16 * 6] = p3 & 0x1F; + unpacked[i + 16 * 7] = (p3 >> 5) | ((p4 & 0xC0) >> 3); + } +} + +}} diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/uint6.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint6.h new file mode 100644 index 0000000000..65325b030d --- /dev/null +++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint6.h @@ -0,0 +1,142 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include + +namespace torchao::kernels::cpu::fallback::bitpacking { +namespace internal { + +/** + * @brief Packs 4 bytes, each holding a 6-bit value (0-63), into 3 bytes. + * + * @param packed Pointer to the destination memory (3 bytes). + * @param unpacked Pointer to the source memory (4 bytes). + */ +TORCHAO_ALWAYS_INLINE inline void pack_4_uint6_values( + uint8_t* packed, + const uint8_t* unpacked) { + // pack 4 uint6 values (u0..u3) into 3 bytes (p0..p2) + // p0's low 6 bits = u0; p0's high 2 bits = u3's low 2 bits + // p1's low 6 bits = u1; p1's high 2 bits = u3's mid 2 bits + // p2's low 6 bits = u2; p2's high 2 bits = u3's high 2 bits + const uint8_t u3 = unpacked[3] & 0x3F; + packed[0] = (unpacked[0] & 0x3F) | ((u3 & 0x03) << 6); + packed[1] = (unpacked[1] & 0x3F) | ((u3 & 0x0C) << 4); + packed[2] = (unpacked[2] & 0x3F) | ((u3 & 0x30) << 2); +} + +/** + * @brief Unpacks 3 bytes into 4 bytes, each containing a 6-bit value. + * + * @param unpacked Pointer to the destination memory (4 bytes). + * @param packed Pointer to the source memory (3 bytes). + */ +TORCHAO_ALWAYS_INLINE inline void unpack_4_uint6_values( + uint8_t* unpacked, + const uint8_t* packed) { + // This is compatible with the scalar NEON version. + unpacked[0] = packed[0] & 0x3F; + unpacked[1] = packed[1] & 0x3F; + unpacked[2] = packed[2] & 0x3F; + unpacked[3] = ((packed[0] & 0xC0) >> 6) | ((packed[1] & 0xC0) >> 4) | + ((packed[2] & 0xC0) >> 2); +} + +/** + * @brief Packs 32 bytes (each a 6-bit value) into 24 bytes. + * @param packed Pointer to the destination memory (24 bytes). + * @param unpacked Pointer to the source memory (32 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_pack_32_uint6_values` function to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void pack_32_uint6_values( + uint8_t* packed, + const uint8_t* unpacked) { + for (int i = 0; i < 8; ++i) { + const uint8_t u0 = unpacked[i]; + const uint8_t u1 = unpacked[i + 8]; + const uint8_t u2 = unpacked[i + 16]; + const uint8_t u3 = unpacked[i + 24]; + + packed[i] = (u0 & 0x3F) | ((u3 & 0x03) << 6); + packed[i + 8] = (u1 & 0x3F) | ((u3 & 0x0C) << 4); + packed[i + 16] = (u2 & 0x3F) | ((u3 & 0x30) << 2); + } +} + +/** + * @brief Unpacks 24 bytes into 32 bytes (each a 6-bit value). + * @param unpacked Pointer to the destination memory (32 bytes). + * @param packed Pointer to the source memory (24 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_unpack_32_uint6_values` function to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void unpack_32_uint6_values( + uint8_t* unpacked, + const uint8_t* packed) { + for (int i = 0; i < 8; ++i) { + const uint8_t p0 = packed[i]; + const uint8_t p1 = packed[i + 8]; + const uint8_t p2 = packed[i + 16]; + + unpacked[i] = p0 & 0x3F; + unpacked[i + 8] = p1 & 0x3F; + unpacked[i + 16] = p2 & 0x3F; + unpacked[i + 24] = + ((p0 & 0xC0) >> 6) | ((p1 & 0xC0) >> 4) | ((p2 & 0xC0) >> 2); + } +} + +/** + * @brief Packs 64 bytes (each a 6-bit value) into 48 bytes. + * @param packed Pointer to the destination memory (48 bytes). + * @param unpacked Pointer to the source memory (64 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_pack_64_uint6_values` function to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void pack_64_uint6_values( + uint8_t* packed, + const uint8_t* unpacked) { + for (int i = 0; i < 16; ++i) { + const uint8_t u0 = unpacked[i]; + const uint8_t u1 = unpacked[i + 16]; + const uint8_t u2 = unpacked[i + 32]; + const uint8_t u3 = unpacked[i + 48]; + + packed[i] = (u0 & 0x3F) | ((u3 & 0x03) << 6); + packed[i + 16] = (u1 & 0x3F) | ((u3 & 0x0C) << 4); + packed[i + 32] = (u2 & 0x3F) | ((u3 & 0x30) << 2); + } +} + +/** + * @brief Unpacks 48 bytes into 64 bytes (each a 6-bit value). + * @param unpacked Pointer to the destination memory (64 bytes). + * @param packed Pointer to the source memory (48 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_unpack_64_uint6_values` function to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void unpack_64_uint6_values( + uint8_t* unpacked, + const uint8_t* packed) { + for (int i = 0; i < 16; ++i) { + const uint8_t p0 = packed[i]; + const uint8_t p1 = packed[i + 16]; + const uint8_t p2 = packed[i + 32]; + + unpacked[i] = p0 & 0x3F; + unpacked[i + 16] = p1 & 0x3F; + unpacked[i + 32] = p2 & 0x3F; + unpacked[i + 48] = + ((p0 & 0xC0) >> 6) | ((p1 & 0xC0) >> 4) | ((p2 & 0xC0) >> 2); + } +} + +} // namespace internal +} // namespace torchao::kernels::cpu::fallback::bitpacking diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/uint7.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint7.h new file mode 100644 index 0000000000..ee4d501324 --- /dev/null +++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint7.h @@ -0,0 +1,140 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include + +namespace torchao::kernels::cpu::fallback::bitpacking { +namespace internal { +/** + * @brief Packs 8 bytes, each holding a 7-bit value (0-127), into 7 bytes. + * + * @param packed Pointer to the destination memory (7 bytes). + * @param unpacked Pointer to the source memory (8 bytes). + */ +TORCHAO_ALWAYS_INLINE inline void pack_8_uint7_values( + uint8_t* packed, + const uint8_t* unpacked) { + // pack 8 uint7 values (u0..u7) into 7 bytes (p0..p6) + // The 7 bits of u7 are distributed across the most significant bit (MSB) + // of each of the 7 packed bytes. + // p0 = u7_bit_0 | u0_all_7_bits + // p1 = u7_bit_1 | u1_all_7_bits + // ... + // p6 = u7_bit_6 | u6_all_7_bits + const uint8_t u7 = unpacked[7] & 0x7F; + + for (int i = 0; i < 7; ++i) { + uint8_t u7_bit = (u7 >> i) & 1; + packed[i] = (unpacked[i] & 0x7F) | (u7_bit << 7); + } +} + +/** + * @brief Unpacks 7 bytes into 8 bytes, each containing a 7-bit value. + * + * @param unpacked Pointer to the destination memory (8 bytes). + * @param packed Pointer to the source memory (7 bytes). + */ +TORCHAO_ALWAYS_INLINE inline void unpack_8_uint7_values( + uint8_t* unpacked, + const uint8_t* packed) { + unpacked[7] = 0; + for (int i = 0; i < 7; ++i) { + // The low 7 bits of the packed byte are the original value. + unpacked[i] = packed[i] & 0x7F; + // The high bit of the packed byte is the i-th bit of the 8th value. + uint8_t u7_bit = packed[i] >> 7; + unpacked[7] |= (u7_bit << i); + } +} + +/** + * @brief Packs 64 bytes (each a 7-bit value) into 56 bytes. + * @param packed Pointer to the destination memory (56 bytes). + * @param unpacked Pointer to the source memory (64 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_pack_64_uint7_values` function to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void pack_64_uint7_values( + uint8_t* packed, + const uint8_t* unpacked) { + // Transpose-and-pack operation + for (int j = 0; j < 8; ++j) { // Iterate through columns + const uint8_t u7 = unpacked[56 + j] & 0x7F; + for (int i = 0; i < 7; ++i) { // Iterate through rows + uint8_t u7_bit = (u7 >> i) & 1; + packed[i * 8 + j] = (unpacked[i * 8 + j] & 0x7F) | (u7_bit << 7); + } + } +} + +/** + * @brief Unpacks 56 bytes into 64 bytes (each a 7-bit value). + * @param unpacked Pointer to the destination memory (64 bytes). + * @param packed Pointer to the source memory (56 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_unpack_64_uint7_values` function to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void unpack_64_uint7_values( + uint8_t* unpacked, + const uint8_t* packed) { + // Unpack-and-transpose operation + for (int j = 0; j < 8; ++j) { // Iterate through columns + uint8_t u7 = 0; + for (int i = 0; i < 7; ++i) { // Iterate through rows + unpacked[i * 8 + j] = packed[i * 8 + j] & 0x7F; + u7 |= ((packed[i * 8 + j] >> 7) & 1) << i; + } + unpacked[56 + j] = u7; + } +} + +/** + * @brief Packs 128 bytes (each a 7-bit value) into 112 bytes. + * @param packed Pointer to the destination memory (112 bytes). + * @param unpacked Pointer to the source memory (128 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_pack_128_uint7_values` function to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void pack_128_uint7_values( + uint8_t* packed, + const uint8_t* unpacked) { + // Transpose-and-pack operation + for (int j = 0; j < 16; ++j) { // Iterate through columns + const uint8_t u7 = unpacked[112 + j] & 0x7F; + for (int i = 0; i < 7; ++i) { // Iterate through rows + uint8_t u7_bit = (u7 >> i) & 1; + packed[i * 16 + j] = (unpacked[i * 16 + j] & 0x7F) | (u7_bit << 7); + } + } +} + +/** + * @brief Unpacks 112 bytes into 128 bytes (each a 7-bit value). + * @param unpacked Pointer to the destination memory (128 bytes). + * @param packed Pointer to the source memory (112 bytes). + * @note This implementation mirrors the logic of the ARM NEON + * `vec_unpack_128_uint7_values` function to ensure compatibility. + */ +TORCHAO_ALWAYS_INLINE inline void unpack_128_uint7_values( + uint8_t* unpacked, + const uint8_t* packed) { + // Unpack-and-transpose operation + for (int j = 0; j < 16; ++j) { // Iterate through columns + uint8_t u7 = 0; + for (int i = 0; i < 7; ++i) { // Iterate through rows + unpacked[i * 16 + j] = packed[i * 16 + j] & 0x7F; + u7 |= ((packed[i * 16 + j] >> 7) & 1) << i; + } + unpacked[112 + j] = u7; + } +} + +} // namespace internal +} // namespace torchao::kernels::cpu::fallback::bitpacking diff --git a/torchao/experimental/kernels/cpu/fallback/tests/CMakeLists.txt b/torchao/experimental/kernels/cpu/fallback/tests/CMakeLists.txt new file mode 100644 index 0000000000..652475766b --- /dev/null +++ b/torchao/experimental/kernels/cpu/fallback/tests/CMakeLists.txt @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +cmake_minimum_required(VERSION 3.19) +project(tests) +set(CMAKE_CXX_STANDARD 17) + +include(FetchContent) +FetchContent_Declare( + googletest + URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip +) +FetchContent_MakeAvailable(googletest) + +add_compile_options("-Wall" "-Werror") + +include(CMakePrintHelpers) +message("TORCHAO_LIBRARIES: ${TORCHAO_LIBRARIES}") +include_directories(${TORCHAO_LIBRARIES}) +add_library( + dep + ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp + ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp + ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp +) +if(NOT TORCHAO_INCLUDE_DIRS) + set(TORCHAO_INCLUDE_DIRS ${TORCHAO_LIBRARIES}) +endif() + +add_subdirectory( +${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/fallback +${CMAKE_CURRENT_BINARY_DIR}/torchao_kernels_cpu_fallback +) + +enable_testing() + +add_executable(test_bitpacking test_bitpacking.cpp) +target_link_libraries( + test_bitpacking + PRIVATE + GTest::gtest_main + dep +) + +include(GoogleTest) +gtest_discover_tests(test_bitpacking) diff --git a/torchao/experimental/kernels/cpu/fallback/tests/build_and_run_tests.sh b/torchao/experimental/kernels/cpu/fallback/tests/build_and_run_tests.sh new file mode 100644 index 0000000000..69590512ec --- /dev/null +++ b/torchao/experimental/kernels/cpu/fallback/tests/build_and_run_tests.sh @@ -0,0 +1,35 @@ +#!/bin/bash -eu +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +set -eu +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +export TORCHAO_LIBRARIES=${SCRIPT_DIR}/../../../../../.. +export CMAKE_OUT=/tmp/cmake-out/torch_ao/kernel_fallback_tests + +target=${1:-"native"} + +EXTRA_ARGS="" + +cmake \ + ${EXTRA_ARGS} \ + -DCMAKE_BUILD_TYPE=Debug \ + -DTORCHAO_LIBRARIES=${TORCHAO_LIBRARIES} \ + -DTORCHAO_BUILD_CPU_AARCH64=ON \ + -S ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/fallback/tests \ + -B ${CMAKE_OUT} + +cmake --build ${CMAKE_OUT} + +echo "Successfully built tests." + +if [[ "${target}" != "native" ]]; then + echo "Skip running tests when cross compiling."; + exit 0; +fi + +# Run +${CMAKE_OUT}/test_bitpacking diff --git a/torchao/experimental/kernels/cpu/fallback/tests/test_bitpacking.cpp b/torchao/experimental/kernels/cpu/fallback/tests/test_bitpacking.cpp new file mode 100644 index 0000000000..baed31ac6f --- /dev/null +++ b/torchao/experimental/kernels/cpu/fallback/tests/test_bitpacking.cpp @@ -0,0 +1,157 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. +// test pack with cpp unpack with arm_neon +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +TEST(GenericBitpackingTest, PackUnpack8_uint1) { + int unpacked_bytes = 8; + int packed_bytes = 1; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, 1); + std::vector packed(packed_bytes); + std::vector unpacked(unpacked_bytes); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_8_uint1_values( + packed.data(), input.data()); + torchao::kernels::cpu::fallback::bitpacking::internal::unpack_8_uint1_values( + unpacked.data(), packed.data()); + + ASSERT_EQ(input, unpacked); +} + +TEST(GenericBitpackingTest, PackUnpack32_uint4) { + int unpacked_bytes = 32; + int packed_bytes = 16; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, 4); + std::vector packed(packed_bytes); + std::vector unpacked(unpacked_bytes); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values( + packed.data(), input.data()); + torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint4_values( + unpacked.data(), packed.data()); + + ASSERT_EQ(input, unpacked); +} + +TEST(GenericBitpackingTest, PackUnpack8_uint7) { + int unpacked_bytes = 8; + int packed_bytes = 7; + auto input = torchao::get_random_lowbit_vector(unpacked_bytes, 7); + std::vector packed(packed_bytes); + std::vector unpacked(unpacked_bytes); + + torchao::kernels::cpu::fallback::bitpacking::internal::pack_8_uint7_values( + packed.data(), input.data()); + torchao::kernels::cpu::fallback::bitpacking::internal::unpack_8_uint7_values( + unpacked.data(), packed.data()); + + ASSERT_EQ(input, unpacked); +} + +// --- Template test for the main dispatcher function --- +template +void test_bitpacking_128_lowbit_values() { + const int unpacked_bytes = 128; + const int packed_bytes = unpacked_bytes * nbit / 8; + + auto input = torchao::get_random_signed_lowbit_vector(unpacked_bytes, nbit); + std::vector packed(packed_bytes); + std::vector unpacked(unpacked_bytes); + + torchao::kernels::cpu::fallback::bitpacking::internal:: + pack_128_lowbit_int_values(packed.data(), input.data()); + torchao::kernels::cpu::fallback::bitpacking::internal:: + unpack_128_lowbit_int_values(unpacked.data(), packed.data()); + + ASSERT_EQ(input, unpacked); +} + +// --- Template test for the LUT dispatcher function --- +template +void test_bitpacking_128_lowbit_values_with_lut() { + const int unpacked_bytes = 128; + const int packed_bytes = unpacked_bytes * nbit / 8; + const int num_lut_entries = 1 << nbit; + + // 1. Create a LUT and random indices + auto lut = torchao::get_random_signed_lowbit_vector(num_lut_entries, 8); + auto indices = torchao::get_random_lowbit_vector(unpacked_bytes, nbit); + + // 2. Create the ground truth data by applying the LUT + std::vector ground_truth(unpacked_bytes); + for (int i = 0; i < unpacked_bytes; ++i) { + ground_truth[i] = lut[indices[i]]; + } + + // 3. Pack the indices + std::vector packed(packed_bytes); + if constexpr (nbit == 1) + torchao::kernels::cpu::fallback::bitpacking::internal:: + pack_128_uint1_values(packed.data(), indices.data()); + if constexpr (nbit == 2) { + torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint2_values( + packed.data(), indices.data()); + torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint2_values( + packed.data() + 16, indices.data() + 64); + } + if constexpr (nbit == 3) + torchao::kernels::cpu::fallback::bitpacking::internal:: + pack_128_uint3_values(packed.data(), indices.data()); + if constexpr (nbit == 4) { + torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values( + packed.data(), indices.data()); + torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values( + packed.data() + 16, indices.data() + 32); + torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values( + packed.data() + 32, indices.data() + 64); + torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values( + packed.data() + 48, indices.data() + 96); + } + + // 4. Unpack using the LUT function + std::vector unpacked(unpacked_bytes); + torchao::kernels::cpu::fallback::bitpacking::internal:: + unpack_128_lowbit_values_with_lut( + unpacked.data(), packed.data(), lut.data()); + + // 5. Verify the result matches the ground truth + ASSERT_EQ(ground_truth, unpacked); +} + +// --- Instantiate all test cases using macros --- +#define TEST_BITPACKING_128_LOWBIT_VALUES(nbit) \ + TEST(GenericBitpacking128, Lowbit_##nbit) { \ + test_bitpacking_128_lowbit_values(); \ + } + +#define TEST_BITPACKING_128_LOWBIT_VALUES_WITH_LUT(nbit) \ + TEST(GenericBitpacking128, Lowbit_with_lut_##nbit) { \ + test_bitpacking_128_lowbit_values_with_lut(); \ + } + +TEST_BITPACKING_128_LOWBIT_VALUES(1); +TEST_BITPACKING_128_LOWBIT_VALUES(2); +TEST_BITPACKING_128_LOWBIT_VALUES(3); +TEST_BITPACKING_128_LOWBIT_VALUES(4); +TEST_BITPACKING_128_LOWBIT_VALUES(5); +TEST_BITPACKING_128_LOWBIT_VALUES(6); +TEST_BITPACKING_128_LOWBIT_VALUES(7); +TEST_BITPACKING_128_LOWBIT_VALUES(8); + +TEST_BITPACKING_128_LOWBIT_VALUES_WITH_LUT(1); +TEST_BITPACKING_128_LOWBIT_VALUES_WITH_LUT(2); +TEST_BITPACKING_128_LOWBIT_VALUES_WITH_LUT(3); +TEST_BITPACKING_128_LOWBIT_VALUES_WITH_LUT(4);