From 6d66322a8a14f1bf1c9357077952a7556c0928d9 Mon Sep 17 00:00:00 2001
From: Zeyu Song <zys@meta.com>
Date: Mon, 11 Aug 2025 09:59:20 -0700
Subject: [PATCH] Add CPP version of bitpacking. (#2725)

Summary:

Add cpp implementation of bitpacking functions (not rely on specific hardwares).

Differential Revision: D79456037
---
 .../kernels/cpu/aarch64/tests/CMakeLists.txt  |   9 +
 .../cpu/aarch64/tests/build_and_run_tests.sh  |   1 +
 .../test_bitpack_fallback_compatibility.cpp   | 686 ++++++++++++++++++
 .../kernels/cpu/fallback/CMakeLists.txt       |   5 +
 .../kernels/cpu/fallback/bitpacking/bitpack.h | 179 +++++
 .../kernels/cpu/fallback/bitpacking/uint1.h   | 154 ++++
 .../kernels/cpu/fallback/bitpacking/uint2.h   | 119 +++
 .../kernels/cpu/fallback/bitpacking/uint3.h   | 195 +++++
 .../kernels/cpu/fallback/bitpacking/uint4.h   | 109 +++
 .../kernels/cpu/fallback/bitpacking/uint5.h   | 175 +++++
 .../kernels/cpu/fallback/bitpacking/uint6.h   | 142 ++++
 .../kernels/cpu/fallback/bitpacking/uint7.h   | 140 ++++
 .../kernels/cpu/fallback/tests/CMakeLists.txt |  49 ++
 .../cpu/fallback/tests/build_and_run_tests.sh |  35 +
 .../cpu/fallback/tests/test_bitpacking.cpp    | 157 ++++
 15 files changed, 2155 insertions(+)
 create mode 100644 torchao/experimental/kernels/cpu/aarch64/tests/test_bitpack_fallback_compatibility.cpp
 create mode 100644 torchao/experimental/kernels/cpu/fallback/CMakeLists.txt
 create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/bitpack.h
 create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/uint1.h
 create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/uint2.h
 create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/uint3.h
 create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/uint4.h
 create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/uint5.h
 create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/uint6.h
 create mode 100644 torchao/experimental/kernels/cpu/fallback/bitpacking/uint7.h
 create mode 100644 torchao/experimental/kernels/cpu/fallback/tests/CMakeLists.txt
 create mode 100644 torchao/experimental/kernels/cpu/fallback/tests/build_and_run_tests.sh
 create mode 100644 torchao/experimental/kernels/cpu/fallback/tests/test_bitpacking.cpp

diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt b/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt
index 5f4bca286b..2b38856b9f 100644
--- a/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt
+++ b/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt
@@ -128,6 +128,14 @@ target_link_libraries(
     dep
 )
 
+add_executable(test_bitpack_fallback_compatibility test_bitpack_fallback_compatibility.cpp)
+target_link_libraries(
+  test_bitpack_fallback_compatibility
+    PRIVATE
+    GTest::gtest_main
+    dep
+)
+
 include(GoogleTest)
 gtest_discover_tests(test_quantization)
 gtest_discover_tests(test_reduction)
@@ -137,3 +145,4 @@ gtest_discover_tests(test_embedding)
 gtest_discover_tests(test_weight_packing)
 gtest_discover_tests(test_qmatmul)
 gtest_discover_tests(test_lut)
+gtest_discover_tests(test_bitpack_fallback_compatibility)
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh b/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh
index 474a77eb8c..c4d807c702 100644
--- a/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh
+++ b/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh
@@ -62,3 +62,4 @@ ${CMAKE_OUT}/test_embedding
 ${CMAKE_OUT}/test_weight_packing
 ${CMAKE_OUT}/test_qmatmul
 ${CMAKE_OUT}/test_lut
+${CMAKE_OUT}/test_bitpack_fallback_compatibility
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpack_fallback_compatibility.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpack_fallback_compatibility.cpp
new file mode 100644
index 0000000000..d0a8622b36
--- /dev/null
+++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpack_fallback_compatibility.cpp
@@ -0,0 +1,686 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+#if defined(__aarch64__) || defined(__ARM_NEON)
+
+#include <gtest/gtest.h>
+#include <vector>
+
+#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
+#include <torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/bitpack.h>
+
+// --- Compatibility Tests for uint1 ---
+
+TEST(test_bitpacking_64_uint1_values, CppToNeon) {
+  int unpacked_bytes = 64;
+  int nbit = 1;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint1_values(
+      packed.data(), input.data());
+
+  uint8x16_t u0, u1, u2, u3;
+  torchao::bitpacking::internal::vec_unpack_64_uint1_values(
+      u0, u1, u2, u3, packed.data());
+  vst1q_u8(unpacked.data(), u0);
+  vst1q_u8(unpacked.data() + 16, u1);
+  vst1q_u8(unpacked.data() + 32, u2);
+  vst1q_u8(unpacked.data() + 48, u3);
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_64_uint1_values, NeonToCpp) {
+  int unpacked_bytes = 64;
+  int nbit = 1;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  uint8x16_t i0, i1, i2, i3;
+  torchao::bitpacking::internal::vec_load_64_uint8_values(
+      i0, i1, i2, i3, input.data());
+  torchao::bitpacking::internal::vec_pack_64_uint1_values(
+      packed.data(), i0, i1, i2, i3);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint1_values(
+      unpacked.data(), packed.data());
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_128_uint1_values, CppToNeon) {
+  int unpacked_bytes = 128;
+  int nbit = 1;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint1_values(
+      packed.data(), input.data());
+
+  uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7;
+  torchao::bitpacking::internal::vec_unpack_128_uint1_values(
+      u0, u1, u2, u3, u4, u5, u6, u7, packed.data());
+  vst1q_u8(unpacked.data(), u0);
+  vst1q_u8(unpacked.data() + 16, u1);
+  vst1q_u8(unpacked.data() + 32, u2);
+  vst1q_u8(unpacked.data() + 48, u3);
+  vst1q_u8(unpacked.data() + 64, u4);
+  vst1q_u8(unpacked.data() + 80, u5);
+  vst1q_u8(unpacked.data() + 96, u6);
+  vst1q_u8(unpacked.data() + 112, u7);
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_128_uint1_values, NeonToCpp) {
+  int unpacked_bytes = 128;
+  int nbit = 1;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  uint8x16_t i0, i1, i2, i3, i4, i5, i6, i7;
+  torchao::bitpacking::internal::vec_load_64_uint8_values(
+      i0, i1, i2, i3, input.data());
+  torchao::bitpacking::internal::vec_load_64_uint8_values(
+      i4, i5, i6, i7, input.data() + 64);
+  torchao::bitpacking::internal::vec_pack_128_uint1_values(
+      packed.data(), i0, i1, i2, i3, i4, i5, i6, i7);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::
+      unpack_128_uint1_values(unpacked.data(), packed.data());
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+// --- Compatibility Tests for uint2 ---
+
+TEST(test_bitpacking_32_uint2_values, CppToNeon) {
+  int unpacked_bytes = 32;
+  int nbit = 2;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint2_values(
+      packed.data(), input.data());
+
+  uint8x8_t u0, u1, u2, u3;
+  torchao::bitpacking::internal::vec_unpack_32_uint2_values(
+      u0, u1, u2, u3, packed.data());
+  vst1_u8(unpacked.data(), u0);
+  vst1_u8(unpacked.data() + 8, u1);
+  vst1_u8(unpacked.data() + 16, u2);
+  vst1_u8(unpacked.data() + 24, u3);
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_32_uint2_values, NeonToCpp) {
+  int unpacked_bytes = 32;
+  int nbit = 2;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  uint8x8_t i0, i1, i2, i3;
+  torchao::bitpacking::internal::vec_load_32_uint8_values(
+      i0, i1, i2, i3, input.data());
+  torchao::bitpacking::internal::vec_pack_32_uint2_values(
+      packed.data(), i0, i1, i2, i3);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint2_values(
+      unpacked.data(), packed.data());
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_64_uint2_values, CppToNeon) {
+  int unpacked_bytes = 64;
+  int nbit = 2;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint2_values(
+      packed.data(), input.data());
+
+  uint8x16_t u0, u1, u2, u3;
+  torchao::bitpacking::internal::vec_unpack_64_uint2_values(
+      u0, u1, u2, u3, packed.data());
+  vst1q_u8(unpacked.data(), u0);
+  vst1q_u8(unpacked.data() + 16, u1);
+  vst1q_u8(unpacked.data() + 32, u2);
+  vst1q_u8(unpacked.data() + 48, u3);
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_64_uint2_values, NeonToCpp) {
+  int unpacked_bytes = 64;
+  int nbit = 2;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  uint8x16_t i0, i1, i2, i3;
+  torchao::bitpacking::internal::vec_load_64_uint8_values(
+      i0, i1, i2, i3, input.data());
+  torchao::bitpacking::internal::vec_pack_64_uint2_values(
+      packed.data(), i0, i1, i2, i3);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint2_values(
+      unpacked.data(), packed.data());
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+// --- Compatibility Tests for uint3 ---
+
+TEST(test_bitpacking_64_uint3_values, CppToNeon) {
+  int unpacked_bytes = 64;
+  int nbit = 3;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint3_values(
+      packed.data(), input.data());
+
+  uint8x16_t u0, u1, u2, u3;
+  torchao::bitpacking::internal::vec_unpack_64_uint3_values(
+      u0, u1, u2, u3, packed.data());
+  vst1q_u8(unpacked.data(), u0);
+  vst1q_u8(unpacked.data() + 16, u1);
+  vst1q_u8(unpacked.data() + 32, u2);
+  vst1q_u8(unpacked.data() + 48, u3);
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_64_uint3_values, NeonToCpp) {
+  int unpacked_bytes = 64;
+  int nbit = 3;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  uint8x16_t i0, i1, i2, i3;
+  torchao::bitpacking::internal::vec_load_64_uint8_values(
+      i0, i1, i2, i3, input.data());
+  torchao::bitpacking::internal::vec_pack_64_uint3_values(
+      packed.data(), i0, i1, i2, i3);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint3_values(
+      unpacked.data(), packed.data());
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_128_uint3_values, CppToNeon) {
+  int unpacked_bytes = 128;
+  int nbit = 3;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint3_values(
+      packed.data(), input.data());
+
+  uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7;
+  torchao::bitpacking::internal::vec_unpack_128_uint3_values(
+      u0, u1, u2, u3, u4, u5, u6, u7, packed.data());
+  vst1q_u8(unpacked.data(), u0);
+  vst1q_u8(unpacked.data() + 16, u1);
+  vst1q_u8(unpacked.data() + 32, u2);
+  vst1q_u8(unpacked.data() + 48, u3);
+  vst1q_u8(unpacked.data() + 64, u4);
+  vst1q_u8(unpacked.data() + 80, u5);
+  vst1q_u8(unpacked.data() + 96, u6);
+  vst1q_u8(unpacked.data() + 112, u7);
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_128_uint3_values, NeonToCpp) {
+  int unpacked_bytes = 128;
+  int nbit = 3;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  uint8x16_t i0, i1, i2, i3, i4, i5, i6, i7;
+  torchao::bitpacking::internal::vec_load_64_uint8_values(
+      i0, i1, i2, i3, input.data());
+  torchao::bitpacking::internal::vec_load_64_uint8_values(
+      i4, i5, i6, i7, input.data() + 64);
+  torchao::bitpacking::internal::vec_pack_128_uint3_values(
+      packed.data(), i0, i1, i2, i3, i4, i5, i6, i7);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::
+      unpack_128_uint3_values(unpacked.data(), packed.data());
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+// --- Compatibility Tests for uint4 ---
+
+TEST(test_bitpacking_16_uint4_values, CppToNeon) {
+  int unpacked_bytes = 16;
+  int nbit = 4;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_16_uint4_values(
+      packed.data(), input.data());
+
+  uint8x16_t unpacked0;
+  torchao::bitpacking::internal::vec_unpack_16_uint4_values(
+      unpacked0, packed.data());
+  vst1q_u8(unpacked.data(), unpacked0);
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_16_uint4_values, NeonToCpp) {
+  int unpacked_bytes = 16;
+  int nbit = 4;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  uint8x16_t input0 = vld1q_u8(input.data());
+  torchao::bitpacking::internal::vec_pack_16_uint4_values(
+      packed.data(), input0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_16_uint4_values(
+      unpacked.data(), packed.data());
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_32_uint4_values, CppToNeon) {
+  int unpacked_bytes = 32;
+  int nbit = 4;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
+      packed.data(), input.data());
+
+  uint8x16_t unpacked0, unpacked1;
+  torchao::bitpacking::internal::vec_unpack_32_uint4_values(
+      unpacked0, unpacked1, packed.data());
+  vst1q_u8(unpacked.data(), unpacked0);
+  vst1q_u8(unpacked.data() + 16, unpacked1);
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_32_uint4_values, NeonToCpp) {
+  int unpacked_bytes = 32;
+  int nbit = 4;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  uint8x16_t input0 = vld1q_u8(input.data());
+  uint8x16_t input1 = vld1q_u8(input.data() + 16);
+  torchao::bitpacking::internal::vec_pack_32_uint4_values(
+      packed.data(), input0, input1);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint4_values(
+      unpacked.data(), packed.data());
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+// --- Compatibility Tests for uint5 ---
+
+TEST(test_bitpacking_64_uint5_values, CppToNeon) {
+  int unpacked_bytes = 64;
+  int nbit = 5;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint5_values(
+      packed.data(), input.data());
+
+  uint8x16_t unpacked0, unpacked1, unpacked2, unpacked3;
+  torchao::bitpacking::internal::vec_unpack_64_uint5_values(
+      unpacked0, unpacked1, unpacked2, unpacked3, packed.data());
+  vst1q_u8(unpacked.data(), unpacked0);
+  vst1q_u8(unpacked.data() + 16, unpacked1);
+  vst1q_u8(unpacked.data() + 32, unpacked2);
+  vst1q_u8(unpacked.data() + 48, unpacked3);
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_64_uint5_values, NeonToCpp) {
+  int unpacked_bytes = 64;
+  int nbit = 5;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  uint8x16_t input0, input1, input2, input3;
+  torchao::bitpacking::internal::vec_load_64_uint8_values(
+      input0, input1, input2, input3, input.data());
+  torchao::bitpacking::internal::vec_pack_64_uint5_values(
+      packed.data(), input0, input1, input2, input3);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint5_values(
+      unpacked.data(), packed.data());
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_128_uint5_values, CppToNeon) {
+  int unpacked_bytes = 128;
+  int nbit = 5;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint5_values(
+      packed.data(), input.data());
+
+  uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7;
+  torchao::bitpacking::internal::vec_unpack_128_uint5_values(
+      u0, u1, u2, u3, u4, u5, u6, u7, packed.data());
+  vst1q_u8(unpacked.data(), u0);
+  vst1q_u8(unpacked.data() + 16, u1);
+  vst1q_u8(unpacked.data() + 32, u2);
+  vst1q_u8(unpacked.data() + 48, u3);
+  vst1q_u8(unpacked.data() + 64, u4);
+  vst1q_u8(unpacked.data() + 80, u5);
+  vst1q_u8(unpacked.data() + 96, u6);
+  vst1q_u8(unpacked.data() + 112, u7);
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_128_uint5_values, NeonToCpp) {
+  int unpacked_bytes = 128;
+  int nbit = 5;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  uint8x16_t i0, i1, i2, i3, i4, i5, i6, i7;
+  torchao::bitpacking::internal::vec_load_64_uint8_values(
+      i0, i1, i2, i3, input.data());
+  torchao::bitpacking::internal::vec_load_64_uint8_values(
+      i4, i5, i6, i7, input.data() + 64);
+  torchao::bitpacking::internal::vec_pack_128_uint5_values(
+      packed.data(), i0, i1, i2, i3, i4, i5, i6, i7);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::
+      unpack_128_uint5_values(unpacked.data(), packed.data());
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+// --- Compatibility Tests for uint6 ---
+
+TEST(test_bitpacking_32_uint6_values, CppToNeon) {
+  int unpacked_bytes = 32;
+  int nbit = 6;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint6_values(
+      packed.data(), input.data());
+
+  uint8x16_t u0, u1;
+  torchao::bitpacking::internal::vec_unpack_32_uint6_values(
+      u0, u1, packed.data());
+  vst1q_u8(unpacked.data(), u0);
+  vst1q_u8(unpacked.data() + 16, u1);
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_32_uint6_values, NeonToCpp) {
+  int unpacked_bytes = 32;
+  int nbit = 6;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  uint8x16_t i0 = vld1q_u8(input.data());
+  uint8x16_t i1 = vld1q_u8(input.data() + 16);
+  torchao::bitpacking::internal::vec_pack_32_uint6_values(
+      packed.data(), i0, i1);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint6_values(
+      unpacked.data(), packed.data());
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_64_uint6_values, CppToNeon) {
+  int unpacked_bytes = 64;
+  int nbit = 6;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint6_values(
+      packed.data(), input.data());
+
+  uint8x16_t u0, u1, u2, u3;
+  torchao::bitpacking::internal::vec_unpack_64_uint6_values(
+      u0, u1, u2, u3, packed.data());
+  vst1q_u8(unpacked.data(), u0);
+  vst1q_u8(unpacked.data() + 16, u1);
+  vst1q_u8(unpacked.data() + 32, u2);
+  vst1q_u8(unpacked.data() + 48, u3);
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_64_uint6_values, NeonToCpp) {
+  int unpacked_bytes = 64;
+  int nbit = 6;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  uint8x16_t i0, i1, i2, i3;
+  torchao::bitpacking::internal::vec_load_64_uint8_values(
+      i0, i1, i2, i3, input.data());
+  torchao::bitpacking::internal::vec_pack_64_uint6_values(
+      packed.data(), i0, i1, i2, i3);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint6_values(
+      unpacked.data(), packed.data());
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+// --- Compatibility Tests for uint7 ---
+
+TEST(test_bitpacking_64_uint7_values, CppToNeon) {
+  int unpacked_bytes = 64;
+  int nbit = 7;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint7_values(
+      packed.data(), input.data());
+
+  uint8x16_t unpacked0, unpacked1, unpacked2, unpacked3;
+  torchao::bitpacking::internal::vec_unpack_64_uint7_values(
+      unpacked0, unpacked1, unpacked2, unpacked3, packed.data());
+  vst1q_u8(unpacked.data(), unpacked0);
+  vst1q_u8(unpacked.data() + 16, unpacked1);
+  vst1q_u8(unpacked.data() + 32, unpacked2);
+  vst1q_u8(unpacked.data() + 48, unpacked3);
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_64_uint7_values, NeonToCpp) {
+  int unpacked_bytes = 64;
+  int nbit = 7;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  uint8x16_t input0, input1, input2, input3;
+  torchao::bitpacking::internal::vec_load_64_uint8_values(
+      input0, input1, input2, input3, input.data());
+  torchao::bitpacking::internal::vec_pack_64_uint7_values(
+      packed.data(), input0, input1, input2, input3);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint7_values(
+      unpacked.data(), packed.data());
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_128_uint7_values, CppToNeon) {
+  int unpacked_bytes = 128;
+  int nbit = 7;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint7_values(
+      packed.data(), input.data());
+
+  uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7;
+  torchao::bitpacking::internal::vec_unpack_128_uint7_values(
+      u0, u1, u2, u3, u4, u5, u6, u7, packed.data());
+  vst1q_u8(unpacked.data(), u0);
+  vst1q_u8(unpacked.data() + 16, u1);
+  vst1q_u8(unpacked.data() + 32, u2);
+  vst1q_u8(unpacked.data() + 48, u3);
+  vst1q_u8(unpacked.data() + 64, u4);
+  vst1q_u8(unpacked.data() + 80, u5);
+  vst1q_u8(unpacked.data() + 96, u6);
+  vst1q_u8(unpacked.data() + 112, u7);
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+TEST(test_bitpacking_128_uint7_values, NeonToCpp) {
+  int unpacked_bytes = 128;
+  int nbit = 7;
+  int packed_bytes = unpacked_bytes * nbit / 8;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes, 0);
+  std::vector<uint8_t> unpacked(unpacked_bytes, 0);
+
+  uint8x16_t i0, i1, i2, i3, i4, i5, i6, i7;
+  torchao::bitpacking::internal::vec_load_64_uint8_values(
+      i0, i1, i2, i3, input.data());
+  torchao::bitpacking::internal::vec_load_64_uint8_values(
+      i4, i5, i6, i7, input.data() + 64);
+  torchao::bitpacking::internal::vec_pack_128_uint7_values(
+      packed.data(), i0, i1, i2, i3, i4, i5, i6, i7);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::
+      unpack_128_uint7_values(unpacked.data(), packed.data());
+
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    EXPECT_EQ(input[i], unpacked[i]);
+  }
+}
+
+#endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/experimental/kernels/cpu/fallback/CMakeLists.txt b/torchao/experimental/kernels/cpu/fallback/CMakeLists.txt
new file mode 100644
index 0000000000..0952fcc3f5
--- /dev/null
+++ b/torchao/experimental/kernels/cpu/fallback/CMakeLists.txt
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/bitpack.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/bitpack.h
new file mode 100644
index 0000000000..1a558d27ac
--- /dev/null
+++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/bitpack.h
@@ -0,0 +1,179 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torchao/experimental/kernels/cpu/aarch64/macro.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/uint1.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/uint2.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/uint3.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/uint4.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/uint5.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/uint6.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/uint7.h>
+#include <cassert>
+
+namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace internal {
+/**
+ * @brief Packs 128 unsigned 8-bit integers into a packed format of 'nbit' bits.
+ *
+ * @tparam nbit The number of bits to pack each value into (1-8).
+ * @param packed Pointer to the destination memory for the packed data.
+ * @param unpacked_values Pointer to the source memory with 128 uint8_t values.
+ */
+template <int nbit>
+inline void pack_128_uint_values(
+    uint8_t* packed,
+    const uint8_t* unpacked_values) {
+  static_assert(nbit >= 1 && nbit <= 8, "nbit must be between 1 and 8");
+
+  // Dispatch to the correct packing function
+  if constexpr (nbit == 1) {
+    pack_128_uint1_values(packed, unpacked_values);
+  } else if constexpr (nbit == 2) {
+    pack_64_uint2_values(packed, unpacked_values);
+    pack_64_uint2_values(packed + 16, unpacked_values + 64);
+  } else if constexpr (nbit == 3) {
+    pack_128_uint3_values(packed, unpacked_values);
+  } else if constexpr (nbit == 4) {
+    pack_32_uint4_values(packed, unpacked_values);
+    pack_32_uint4_values(packed + 16, unpacked_values + 32);
+    pack_32_uint4_values(packed + 32, unpacked_values + 64);
+    pack_32_uint4_values(packed + 48, unpacked_values + 96);
+  } else if constexpr (nbit == 5) {
+    pack_128_uint5_values(packed, unpacked_values);
+  } else if constexpr (nbit == 6) {
+    pack_64_uint6_values(packed, unpacked_values);
+    pack_64_uint6_values(packed + 48, unpacked_values + 64);
+  } else if constexpr (nbit == 7) {
+    pack_128_uint7_values(packed, unpacked_values);
+  } else if constexpr (nbit == 8) {
+    // For 8-bit, it's a direct memory copy
+    for (int i = 0; i < 128; ++i) {
+      packed[i] = unpacked_values[i];
+    }
+  }
+}
+/**
+ * @brief Unpacks 'nbit' data into 128 unsigned 8-bit integers.
+ *
+ * @tparam nbit The number of bits per value in the packed format (1-8).
+ * @param unpacked_values Pointer to the destination memory (128 uint8_t
+ * values).
+ * @param packed Pointer to the source packed data.
+ */
+template <int nbit>
+inline void unpack_128_uint_values(
+    uint8_t* unpacked_values,
+    const uint8_t* packed) {
+  static_assert(nbit >= 1 && nbit <= 8, "nbit must be between 1 and 8");
+
+  // Dispatch to the correct unpacking function, writing directly to the output.
+  if constexpr (nbit == 1) {
+    unpack_128_uint1_values(unpacked_values, packed);
+  } else if constexpr (nbit == 2) {
+    unpack_64_uint2_values(unpacked_values, packed);
+    unpack_64_uint2_values(unpacked_values + 64, packed + 16);
+  } else if constexpr (nbit == 3) {
+    unpack_128_uint3_values(unpacked_values, packed);
+  } else if constexpr (nbit == 4) {
+    unpack_32_uint4_values(unpacked_values, packed);
+    unpack_32_uint4_values(unpacked_values + 32, packed + 16);
+    unpack_32_uint4_values(unpacked_values + 64, packed + 32);
+    unpack_32_uint4_values(unpacked_values + 96, packed + 48);
+  } else if constexpr (nbit == 5) {
+    unpack_128_uint5_values(unpacked_values, packed);
+  } else if constexpr (nbit == 6) {
+    unpack_64_uint6_values(unpacked_values, packed);
+    unpack_64_uint6_values(unpacked_values + 64, packed + 48);
+  } else if constexpr (nbit == 7) {
+    unpack_128_uint7_values(unpacked_values, packed);
+  } else if constexpr (nbit == 8) {
+    // For 8-bit, it's a direct memory copy
+    for (int i = 0; i < 128; ++i) {
+      unpacked_values[i] = packed[i];
+    }
+  }
+}
+
+/**
+ * @brief Packs 128 signed 8-bit integers into a packed format of 'nbit' bits.
+ *
+ * @tparam nbit The number of bits to pack each value into (1-8).
+ * @param packed Pointer to the destination memory.
+ * @param unpacked Pointer to the source memory containing 128 int8_t values.
+ */
+template <int nbit>
+inline void pack_128_lowbit_int_values(
+    uint8_t* packed,
+    const int8_t* unpacked) {
+  // 1. Convert signed input to a temporary buffer of unsigned values.
+  uint8_t temp_unpacked[128];
+  if constexpr (nbit < 8) {
+    const int8_t shift = 1 << (nbit - 1);
+    for (int i = 0; i < 128; ++i) {
+      temp_unpacked[i] = static_cast<uint8_t>(unpacked[i] + shift);
+    }
+  } else { // nbit == 8
+    for (int i = 0; i < 128; ++i) {
+      temp_unpacked[i] = static_cast<uint8_t>(unpacked[i]);
+    }
+  }
+
+  // 2. Call the generalized uint packing function.
+  pack_128_uint_values<nbit>(packed, temp_unpacked);
+}
+
+template <int nbit>
+inline void unpack_128_lowbit_int_values(
+    int8_t* unpacked,
+    const uint8_t* packed) {
+  // 1. Get the raw unsigned values by calling the base function.
+  uint8_t temp_unpacked[128];
+  unpack_128_uint_values<nbit>(temp_unpacked, packed);
+
+  // 2. Perform the signed conversion.
+  if constexpr (nbit < 8) {
+    const int8_t unshift = -(1 << (nbit - 1));
+    for (int i = 0; i < 128; ++i) {
+      unpacked[i] = static_cast<int8_t>(temp_unpacked[i]) + unshift;
+    }
+  } else { // nbit == 8
+    for (int i = 0; i < 128; ++i) {
+      unpacked[i] = static_cast<int8_t>(temp_unpacked[i]);
+    }
+  }
+}
+
+/**
+ * @brief Unpacks 'nbit' data and de-quantizes it using a lookup table (LUT).
+ *
+ * @tparam nbit The number of bits per value in the packed format (1-4).
+ * @param unpacked Pointer to the destination memory (128 int8_t values).
+ * @param packed Pointer to the source packed data.
+ * @param lut Pointer to the lookup table (must have 2^nbit entries).
+ */
+template <int nbit>
+inline void unpack_128_lowbit_values_with_lut(
+    int8_t* unpacked,
+    const uint8_t* packed,
+    const int8_t* lut) {
+  static_assert(nbit >= 1 && nbit <= 4, "LUT version only supports nbit <= 4");
+
+  // Create a temporary buffer on the stack for the indices.
+  uint8_t indices[128];
+
+  // 1. Call the utility function to handle all the unpacking logic.
+  unpack_128_uint_values<nbit>(indices, packed);
+
+  // 2. Apply the lookup table.
+  for (int i = 0; i < 128; ++i) {
+    unpacked[i] = lut[indices[i]];
+  }
+}
+} // namespace internal
+} // namespace torchao::kernels::cpu::fallback::bitpacking
diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/uint1.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint1.h
new file mode 100644
index 0000000000..67d4512a2c
--- /dev/null
+++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint1.h
@@ -0,0 +1,154 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torchao/experimental/kernels/cpu/aarch64/macro.h>
+#include <cstdint>
+
+namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace internal {
+
+/**
+ * @brief Packs 8 bytes, each containing a 1-bit value (0 or 1), into a single
+ * byte.
+ * @param packed Pointer to the destination memory (1 byte).
+ * @param unpacked Pointer to the source memory (8 bytes).
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_8_uint1_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  packed[0] = (unpacked[0] << 7) | (unpacked[1] << 6) | (unpacked[2] << 5) |
+      (unpacked[3] << 4) | (unpacked[4] << 3) | (unpacked[5] << 2) |
+      (unpacked[6] << 1) | (unpacked[7] << 0);
+}
+
+/**
+ * @brief Unpacks a single byte into 8 bytes, each containing a 1-bit value.
+ * @param unpacked Pointer to the destination memory (8 bytes).
+ * @param packed Pointer to the source memory (1 byte).
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_8_uint1_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  const uint8_t packed_byte = packed[0];
+  unpacked[0] = (packed_byte >> 7) & 1;
+  unpacked[1] = (packed_byte >> 6) & 1;
+  unpacked[2] = (packed_byte >> 5) & 1;
+  unpacked[3] = (packed_byte >> 4) & 1;
+  unpacked[4] = (packed_byte >> 3) & 1;
+  unpacked[5] = (packed_byte >> 2) & 1;
+  unpacked[6] = (packed_byte >> 1) & 1;
+  unpacked[7] = (packed_byte >> 0) & 1;
+}
+
+/**
+ * @brief Packs 64 bytes (each a 1-bit value) into 8 bytes.
+ * @param packed Pointer to the destination memory (8 bytes).
+ * @param unpacked Pointer to the source memory (64 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_pack_64_uint1_values` function to ensure compatibility. The unpacked
+ * data is assumed to be organized as four 16-byte blocks.
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_64_uint1_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  const uint8_t* unpacked0 = unpacked;
+  const uint8_t* unpacked1 = unpacked + 16;
+  const uint8_t* unpacked2 = unpacked + 32;
+  const uint8_t* unpacked3 = unpacked + 48;
+
+  for (int i = 0; i < 8; ++i) {
+    // Combine 4 bits for the low nibble of the output byte
+    uint8_t low_nibble = (unpacked0[i] << 3) | (unpacked1[i] << 2) |
+        (unpacked2[i] << 1) | (unpacked3[i] << 0);
+
+    // Combine 4 bits for the high nibble of the output byte
+    uint8_t high_nibble_src = (unpacked0[i + 8] << 3) |
+        (unpacked1[i + 8] << 2) | (unpacked2[i + 8] << 1) |
+        (unpacked3[i + 8] << 0);
+
+    // Assemble the final byte
+    packed[i] = low_nibble | (high_nibble_src << 4);
+  }
+}
+
+/**
+ * @brief Unpacks 8 bytes into 64 bytes (each a 1-bit value).
+ * @param unpacked Pointer to the destination memory (64 bytes).
+ * @param packed Pointer to the source memory (8 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_unpack_64_uint1_values` function to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_64_uint1_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  uint8_t* unpacked0 = unpacked;
+  uint8_t* unpacked1 = unpacked + 16;
+  uint8_t* unpacked2 = unpacked + 32;
+  uint8_t* unpacked3 = unpacked + 48;
+
+  uint8_t combined[16];
+  for (int i = 0; i < 8; ++i) {
+    combined[i] = packed[i] & 0x0F; // Low nibbles
+    combined[i + 8] = packed[i] >> 4; // High nibbles
+  }
+
+  // Unpack from the combined buffer into the four destination blocks
+  for (int i = 0; i < 16; ++i) {
+    const uint8_t temp = combined[i];
+    unpacked0[i] = (temp >> 3) & 1;
+    unpacked1[i] = (temp >> 2) & 1;
+    unpacked2[i] = (temp >> 1) & 1;
+    unpacked3[i] = (temp >> 0) & 1;
+  }
+}
+
+/**
+ * @brief Packs 128 bytes (each a 1-bit value) into 16 bytes.
+ * @param packed Pointer to the destination memory (16 bytes).
+ * @param unpacked Pointer to the source memory (128 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_pack_128_uint1_values` function (a transpose-and-pack operation) to
+ * ensure compatibility. The unpacked data is assumed to be organized as eight
+ * 16-byte blocks.
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_128_uint1_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  for (int i = 0; i < 16; ++i) {
+    packed[i] = (unpacked[i + 16 * 0] << 7) | (unpacked[i + 16 * 1] << 6) |
+        (unpacked[i + 16 * 2] << 5) | (unpacked[i + 16 * 3] << 4) |
+        (unpacked[i + 16 * 4] << 3) | (unpacked[i + 16 * 5] << 2) |
+        (unpacked[i + 16 * 6] << 1) | (unpacked[i + 16 * 7] << 0);
+  }
+}
+
+/**
+ * @brief Unpacks 16 bytes into 128 bytes (each a 1-bit value).
+ * @param unpacked Pointer to the destination memory (128 bytes).
+ * @param packed Pointer to the source memory (16 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_unpack_128_uint1_values` function (an unpack-and-transpose operation)
+ * to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_128_uint1_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  for (int i = 0; i < 16; ++i) {
+    const uint8_t packed_byte = packed[i];
+    unpacked[i + 16 * 0] = (packed_byte >> 7) & 1;
+    unpacked[i + 16 * 1] = (packed_byte >> 6) & 1;
+    unpacked[i + 16 * 2] = (packed_byte >> 5) & 1;
+    unpacked[i + 16 * 3] = (packed_byte >> 4) & 1;
+    unpacked[i + 16 * 4] = (packed_byte >> 3) & 1;
+    unpacked[i + 16 * 5] = (packed_byte >> 2) & 1;
+    unpacked[i + 16 * 6] = (packed_byte >> 1) & 1;
+    unpacked[i + 16 * 7] = (packed_byte >> 0) & 1;
+  }
+}
+} // namespace internal
+} // namespace torchao::kernels::cpu::fallback::bitpacking
diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/uint2.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint2.h
new file mode 100644
index 0000000000..2681110348
--- /dev/null
+++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint2.h
@@ -0,0 +1,119 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torchao/experimental/kernels/cpu/aarch64/macro.h>
+#include <cstdint>
+namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace internal {
+
+/**
+ * @brief Packs 4 bytes, each containing a 2-bit value (0-3), into a single
+ * byte.
+ * @param packed Pointer to the destination memory (1 byte).
+ * @param unpacked Pointer to the source memory (4 bytes).
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_4_uint2_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  // unpacked = {v0, v1, v2, v3} -> packed[0] = | v0 | v1 | v2 | v3 |
+  packed[0] = (unpacked[0] << 6) | (unpacked[1] << 4) | (unpacked[2] << 2) |
+      (unpacked[3]);
+}
+
+/**
+ * @brief Unpacks a single byte into 4 bytes, each containing a 2-bit value.
+ * @param unpacked Pointer to the destination memory (4 bytes).
+ * @param packed Pointer to the source memory (1 byte).
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_4_uint2_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  unpacked[0] = (packed[0] >> 6) & 0x03; // Mask 0b11000000
+  unpacked[1] = (packed[0] >> 4) & 0x03; // Mask 0b00110000
+  unpacked[2] = (packed[0] >> 2) & 0x03; // Mask 0b00001100
+  unpacked[3] = packed[0] & 0x03; // Mask 0b00000011
+}
+
+/**
+ * @brief Packs 32 bytes (each a 2-bit value) into 8 bytes.
+ * @param packed Pointer to the destination memory (8 bytes).
+ * @param unpacked Pointer to the source memory (32 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_pack_32_uint2_values` function (a transpose-and-pack operation) to
+ * ensure compatibility. The unpacked data is assumed to be organized as four
+ * 8-byte blocks.
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_32_uint2_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  for (int i = 0; i < 8; ++i) {
+    packed[i] = (unpacked[i + 8 * 0] << 6) | (unpacked[i + 8 * 1] << 4) |
+        (unpacked[i + 8 * 2] << 2) | (unpacked[i + 8 * 3] << 0);
+  }
+}
+
+/**
+ * @brief Unpacks 8 bytes into 32 bytes (each a 2-bit value).
+ * @param unpacked Pointer to the destination memory (32 bytes).
+ * @param packed Pointer to the source memory (8 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_unpack_32_uint2_values` function (an unpack-and-transpose operation)
+ * to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_32_uint2_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  for (int i = 0; i < 8; ++i) {
+    const uint8_t packed_byte = packed[i];
+    unpacked[i + 8 * 0] = (packed_byte >> 6) & 0x03;
+    unpacked[i + 8 * 1] = (packed_byte >> 4) & 0x03;
+    unpacked[i + 8 * 2] = (packed_byte >> 2) & 0x03;
+    unpacked[i + 8 * 3] = (packed_byte >> 0) & 0x03;
+  }
+}
+
+/**
+ * @brief Packs 64 bytes (each a 2-bit value) into 16 bytes.
+ * @param packed Pointer to the destination memory (16 bytes).
+ * @param unpacked Pointer to the source memory (64 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_pack_64_uint2_values` function (a transpose-and-pack operation) to
+ * ensure compatibility. The unpacked data is assumed to be organized as four
+ * 16-byte blocks.
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_64_uint2_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  for (int i = 0; i < 16; ++i) {
+    packed[i] = (unpacked[i + 16 * 0] << 6) | (unpacked[i + 16 * 1] << 4) |
+        (unpacked[i + 16 * 2] << 2) | (unpacked[i + 16 * 3] << 0);
+  }
+}
+
+/**
+ * @brief Unpacks 16 bytes into 64 bytes (each a 2-bit value).
+ * @param unpacked Pointer to the destination memory (64 bytes).
+ * @param packed Pointer to the source memory (16 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_unpack_64_uint2_values` function (an unpack-and-transpose operation)
+ * to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_64_uint2_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  for (int i = 0; i < 16; ++i) {
+    const uint8_t packed_byte = packed[i];
+    unpacked[i + 16 * 0] = (packed_byte >> 6) & 0x03;
+    unpacked[i + 16 * 1] = (packed_byte >> 4) & 0x03;
+    unpacked[i + 16 * 2] = (packed_byte >> 2) & 0x03;
+    unpacked[i + 16 * 3] = (packed_byte >> 0) & 0x03;
+  }
+}
+
+} // namespace internal
+} // namespace torchao::kernels::cpu::fallback::bitpacking
diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/uint3.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint3.h
new file mode 100644
index 0000000000..635e1bca6c
--- /dev/null
+++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint3.h
@@ -0,0 +1,195 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torchao/experimental/kernels/cpu/aarch64/macro.h>
+#include <cstdint>
+
+namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace internal {
+
+/**
+ * @brief Packs 8 bytes, each holding a 3-bit value (0-7), into 3 bytes.
+ *
+ * The packing scheme is non-trivial. Given 8 input values v0..v7, they are
+ * arranged into 3 bytes (b0, b1, b2) as follows:
+ * - b0: [v6(low 2 bits), v0(all 3 bits), v1(all 3 bits)]
+ * - b1: [v7(low 2 bits), v2(all 3 bits), v3(all 3 bits)]
+ * - b2: [v6(high 1 bit), v7(high 1 bit), v4(all 3 bits), v5(all 3 bits)]
+ *
+ * @param packed Pointer to the destination memory (3 bytes).
+ * @param unpacked Pointer to the source memory (8 bytes).
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_8_uint3_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  // byte 0
+  packed[0] = ((unpacked[6] & 0x03) << 6) | ((unpacked[0] & 0x07) << 3) |
+      (unpacked[1] & 0x07);
+
+  // byte 1
+  packed[1] = ((unpacked[7] & 0x03) << 6) | ((unpacked[2] & 0x07) << 3) |
+      (unpacked[3] & 0x07);
+
+  // byte 2
+  packed[2] = ((unpacked[6] & 0x04) << 5) | ((unpacked[7] & 0x04) << 4) |
+      ((unpacked[4] & 0x07) << 3) | (unpacked[5] & 0x07);
+}
+
+/**
+ * @brief Unpacks 3 bytes into 8 bytes, each containing a 3-bit value.
+ * @param unpacked Pointer to the destination memory (8 bytes).
+ * @param packed Pointer to the source memory (3 bytes).
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_8_uint3_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  const uint8_t b0 = packed[0];
+  const uint8_t b1 = packed[1];
+  const uint8_t b2 = packed[2];
+
+  unpacked[0] = (b0 >> 3) & 0x07;
+  unpacked[1] = b0 & 0x07;
+
+  unpacked[2] = (b1 >> 3) & 0x07;
+  unpacked[3] = b1 & 0x07;
+
+  unpacked[4] = (b2 >> 3) & 0x07;
+  unpacked[5] = b2 & 0x07;
+
+  unpacked[6] = (b0 >> 6) | ((b2 >> 5) & 0x04);
+  unpacked[7] = (b1 >> 6) | ((b2 >> 4) & 0x04);
+}
+
+/**
+ * @brief Packs 64 bytes (each a 3-bit value) into 24 bytes.
+ * @param packed Pointer to the destination memory (24 bytes).
+ * @param unpacked Pointer to the source memory (64 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_pack_64_uint3_values` function (a transpose-and-pack operation) to
+ * ensure compatibility. The unpacked data is assumed to be organized as eight
+ * 8-byte blocks.
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_64_uint3_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  for (int i = 0; i < 8; ++i) {
+    const uint8_t unpacked0 = unpacked[i + 8 * 0];
+    const uint8_t unpacked1 = unpacked[i + 8 * 1];
+    const uint8_t unpacked2 = unpacked[i + 8 * 2];
+    const uint8_t unpacked3 = unpacked[i + 8 * 3];
+    const uint8_t unpacked4 = unpacked[i + 8 * 4];
+    const uint8_t unpacked5 = unpacked[i + 8 * 5];
+    const uint8_t unpacked6 = unpacked[i + 8 * 6];
+    const uint8_t unpacked7 = unpacked[i + 8 * 7];
+
+    // byte 0
+    packed[i] = ((unpacked6 & 0x03) << 6) | ((unpacked0 & 0x07) << 3) |
+        (unpacked1 & 0x07);
+
+    // byte 1
+    packed[i + 8] = ((unpacked7 & 0x03) << 6) | ((unpacked2 & 0x07) << 3) |
+        (unpacked3 & 0x07);
+
+    // byte 2
+    packed[i + 16] = ((unpacked6 & 0x04) << 5) | ((unpacked7 & 0x04) << 4) |
+        ((unpacked4 & 0x07) << 3) | (unpacked5 & 0x07);
+  }
+}
+
+/**
+ * @brief Unpacks 24 bytes into 64 bytes (each a 3-bit value).
+ * @param unpacked Pointer to the destination memory (64 bytes).
+ * @param packed Pointer to the source memory (24 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_unpack_64_uint3_values` function (an unpack-and-transpose operation)
+ * to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_64_uint3_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  for (int i = 0; i < 8; ++i) {
+    const uint8_t b0 = packed[i];
+    const uint8_t b1 = packed[i + 8];
+    const uint8_t b2 = packed[i + 16];
+
+    unpacked[i + 8 * 0] = (b0 >> 3) & 0x07;
+    unpacked[i + 8 * 1] = b0 & 0x07;
+    unpacked[i + 8 * 2] = (b1 >> 3) & 0x07;
+    unpacked[i + 8 * 3] = b1 & 0x07;
+    unpacked[i + 8 * 4] = (b2 >> 3) & 0x07;
+    unpacked[i + 8 * 5] = b2 & 0x07;
+    unpacked[i + 8 * 6] = (b0 >> 6) | ((b2 >> 5) & 0x04);
+    unpacked[i + 8 * 7] = (b1 >> 6) | ((b2 >> 4) & 0x04);
+  }
+}
+
+/**
+ * @brief Packs 128 bytes (each a 3-bit value) into 48 bytes.
+ * @param packed Pointer to the destination memory (48 bytes).
+ * @param unpacked Pointer to the source memory (128 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_pack_128_uint3_values` function (a transpose-and-pack operation) to
+ * ensure compatibility. The unpacked data is assumed to be organized as eight
+ * 16-byte blocks.
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_128_uint3_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  for (int i = 0; i < 16; ++i) {
+    const uint8_t unpacked0 = unpacked[i + 16 * 0];
+    const uint8_t unpacked1 = unpacked[i + 16 * 1];
+    const uint8_t unpacked2 = unpacked[i + 16 * 2];
+    const uint8_t unpacked3 = unpacked[i + 16 * 3];
+    const uint8_t unpacked4 = unpacked[i + 16 * 4];
+    const uint8_t unpacked5 = unpacked[i + 16 * 5];
+    const uint8_t unpacked6 = unpacked[i + 16 * 6];
+    const uint8_t unpacked7 = unpacked[i + 16 * 7];
+
+    // byte 0
+    packed[i] = ((unpacked6 & 0x03) << 6) | ((unpacked0 & 0x07) << 3) |
+        (unpacked1 & 0x07);
+
+    // byte 1
+    packed[i + 16] = ((unpacked7 & 0x03) << 6) | ((unpacked2 & 0x07) << 3) |
+        (unpacked3 & 0x07);
+
+    // byte 2
+    packed[i + 32] = ((unpacked6 & 0x04) << 5) | ((unpacked7 & 0x04) << 4) |
+        ((unpacked4 & 0x07) << 3) | (unpacked5 & 0x07);
+  }
+}
+
+/**
+ * @brief Unpacks 48 bytes into 128 bytes (each a 3-bit value).
+ * @param unpacked Pointer to the destination memory (128 bytes).
+ * @param packed Pointer to the source memory (48 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_unpack_128_uint3_values` function (an unpack-and-transpose operation)
+ * to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_128_uint3_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  for (int i = 0; i < 16; ++i) {
+    const uint8_t b0 = packed[i];
+    const uint8_t b1 = packed[i + 16];
+    const uint8_t b2 = packed[i + 32];
+
+    unpacked[i + 16 * 0] = (b0 >> 3) & 0x07;
+    unpacked[i + 16 * 1] = b0 & 0x07;
+    unpacked[i + 16 * 2] = (b1 >> 3) & 0x07;
+    unpacked[i + 16 * 3] = b1 & 0x07;
+    unpacked[i + 16 * 4] = (b2 >> 3) & 0x07;
+    unpacked[i + 16 * 5] = b2 & 0x07;
+    unpacked[i + 16 * 6] = (b0 >> 6) | ((b2 >> 5) & 0x04);
+    unpacked[i + 16 * 7] = (b1 >> 6) | ((b2 >> 4) & 0x04);
+  }
+}
+
+} // namespace internal
+} // namespace torchao::kernels::cpu::fallback::bitpacking
diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/uint4.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint4.h
new file mode 100644
index 0000000000..27be9488d7
--- /dev/null
+++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint4.h
@@ -0,0 +1,109 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torchao/experimental/kernels/cpu/aarch64/macro.h>
+#include <cstdint>
+
+namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace internal {
+/**
+ * @brief Packs 2 bytes, each holding a 4-bit value (0-15), into a single
+ * byte. The first value goes into the high nibble, the second into the low
+ * nibble.
+ * @param packed Pointer to the destination memory (1 byte).
+ * @param unpacked Pointer to the source memory (2 bytes).
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_2_uint4_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  // This is compatible with the scalar NEON version.
+  packed[0] = (unpacked[0] << 4) | (unpacked[1] & 0x0F);
+}
+
+/**
+ * @brief Unpacks a single byte into 2 bytes, each containing a 4-bit value.
+ * @param unpacked Pointer to the destination memory (2 bytes).
+ * @param packed Pointer to the source memory (1 byte).
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_2_uint4_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  // This is compatible with the scalar NEON version.
+  unpacked[0] = packed[0] >> 4;
+  unpacked[1] = packed[0] & 0x0F;
+}
+
+/**
+ * @brief Packs 16 bytes (each a 4-bit value) into 8 bytes.
+ * @param packed Pointer to the destination memory (8 bytes).
+ * @param unpacked Pointer to the source memory (16 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_pack_16_uint4_values` function (a transpose-and-pack operation) to
+ * ensure compatibility. It packs unpacked[i] and unpacked[i+8] into
+ * packed[i].
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_16_uint4_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  for (int i = 0; i < 8; ++i) {
+    packed[i] = ((unpacked[i + 8] & 0x0F) << 4) | (unpacked[i] & 0x0F);
+  }
+}
+
+/**
+ * @brief Unpacks 8 bytes into 16 bytes (each a 4-bit value).
+ * @param unpacked Pointer to the destination memory (16 bytes).
+ * @param packed Pointer to the source memory (8 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_unpack_16_uint4_values` function (an unpack-and-transpose operation)
+ * to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_16_uint4_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  for (int i = 0; i < 8; ++i) {
+    unpacked[i] = packed[i] & 0x0F;
+    unpacked[i + 8] = packed[i] >> 4;
+  }
+}
+
+/**
+ * @brief Packs 32 bytes (each a 4-bit value) into 16 bytes.
+ * @param packed Pointer to the destination memory (16 bytes).
+ * @param unpacked Pointer to the source memory (32 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_pack_32_uint4_values` function (a transpose-and-pack operation) to
+ * ensure compatibility. It packs unpacked[i] and unpacked[i+16] into
+ * packed[i].
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_32_uint4_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  for (int i = 0; i < 16; ++i) {
+    packed[i] = ((unpacked[i + 16] & 0x0F) << 4) | (unpacked[i] & 0x0F);
+  }
+}
+
+/**
+ * @brief Unpacks 16 bytes into 32 bytes (each a 4-bit value).
+ * @param unpacked Pointer to the destination memory (32 bytes).
+ * @param packed Pointer to the source memory (16 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_unpack_32_uint4_values` function (an unpack-and-transpose operation)
+ * to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_32_uint4_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  for (int i = 0; i < 16; ++i) {
+    unpacked[i] = packed[i] & 0x0F;
+    unpacked[i + 16] = packed[i] >> 4;
+  }
+}
+} // namespace internal
+} // namespace torchao::kernels::cpu::fallback::bitpacking
diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/uint5.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint5.h
new file mode 100644
index 0000000000..2ad408a75a
--- /dev/null
+++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint5.h
@@ -0,0 +1,175 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torchao/experimental/kernels/cpu/aarch64/macro.h>
+#include <cstdint>
+
+namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace internal {
+
+/**
+ * @brief Packs 8 bytes, each holding a 5-bit value (0-31), into 5 bytes.
+ *
+ * @param packed Pointer to the destination memory (5 bytes).
+ * @param unpacked Pointer to the source memory (8 bytes).
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_8_uint5_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  // pack 8 uint5 values (u0..u7) into 5 bytes (p0..p4)
+  // p0 = u0_all | u1_low_3_bits
+  // p1 = u2_all | u3_low_3_bits
+  // p2 = u4_all | u5_low_3_bits
+  // p3 = u6_all | u7_low_3_bits
+  // p4 = u1_high_2_bits | u3_high_2_bits | u5_high_2_bits | u7_high_2_bits
+  packed[0] = (unpacked[0] & 0x1F) | ((unpacked[1] & 0x1F) << 5);
+  packed[1] = (unpacked[2] & 0x1F) | ((unpacked[3] & 0x1F) << 5);
+  packed[2] = (unpacked[4] & 0x1F) | ((unpacked[5] & 0x1F) << 5);
+  packed[3] = (unpacked[6] & 0x1F) | ((unpacked[7] & 0x1F) << 5);
+  packed[4] = ((unpacked[1] & 0x1F) >> 3) | (((unpacked[3] & 0x1F) >> 3) << 2) |
+      (((unpacked[5] & 0x1F) >> 3) << 4) | (((unpacked[7] & 0x1F) >> 3) << 6);
+}
+
+/**
+ * @brief Unpacks 5 bytes into 8 bytes, each containing a 5-bit value.
+ *
+ * @param unpacked Pointer to the destination memory (8 bytes).
+ * @param packed Pointer to the source memory (5 bytes).
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_8_uint5_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  const uint8_t p0 = packed[0];
+  const uint8_t p1 = packed[1];
+  const uint8_t p2 = packed[2];
+  const uint8_t p3 = packed[3];
+  const uint8_t p4 = packed[4];
+
+  // This is compatible with the scalar NEON version.
+  unpacked[0] = p0 & 0x1F;
+  unpacked[1] = (p0 >> 5) | ((p4 & 0x03) << 3);
+  unpacked[2] = p1 & 0x1F;
+  unpacked[3] = (p1 >> 5) | ((p4 & 0x0C) << 1);
+  unpacked[4] = p2 & 0x1F;
+  unpacked[5] = (p2 >> 5) | ((p4 & 0x30) >> 1);
+  unpacked[6] = p3 & 0x1F;
+  unpacked[7] = (p3 >> 5) | ((p4 & 0xC0) >> 3);
+}
+
+/**
+ * @brief Packs 64 bytes (each a 5-bit value) into 40 bytes.
+ * @param packed Pointer to the destination memory (40 bytes).
+ * @param unpacked Pointer to the source memory (64 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_pack_64_uint5_values` function to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_64_uint5_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  // Pack the first 32 bytes (p0, p1)
+  for (int i = 0; i < 16; ++i) {
+    packed[i] = (unpacked[i] & 0x1F) | ((unpacked[i + 16] & 0x1F) << 5);
+    packed[i + 16] = (unpacked[i + 32] & 0x1F) | ((unpacked[i + 48] & 0x1F) << 5);
+  }
+
+  // Pack the final 8 bytes (p2)
+  for (int i = 0; i < 8; ++i) {
+    uint8_t val1 = (unpacked[16 + i] >> 3) & 0x03;
+    uint8_t val2 = (unpacked[24 + i] >> 3) & 0x03;
+    uint8_t val3 = (unpacked[48 + i] >> 3) & 0x03;
+    uint8_t val4 = (unpacked[56 + i] >> 3) & 0x03;
+    packed[32 + i] = val1 | (val2 << 2) | (val3 << 4) | (val4 << 6);
+  }
+}
+
+/**
+ * @brief Unpacks 40 bytes into 64 bytes (each a 5-bit value).
+ * @param unpacked Pointer to the destination memory (64 bytes).
+ * @param packed Pointer to the source memory (40 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_unpack_64_uint5_values` function to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_64_uint5_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  for (int i = 0; i < 16; ++i) {
+    const uint8_t p0 = packed[i];
+    const uint8_t p1 = packed[i + 16];
+    // p2 is only 8 bytes wide, so we use modulo to access it correctly.
+    const uint8_t p2 = packed[32 + (i % 8)];
+
+    unpacked[i] = p0 & 0x1F;
+    unpacked[i + 32] = p1 & 0x1F;
+
+    if (i < 8) {
+      unpacked[i + 16] = (p0 >> 5) | ((p2 & 0x03) << 3);
+      unpacked[i + 48] = (p1 >> 5) | ((p2 & 0x30) >> 1);
+    } else {
+      unpacked[i + 16] = (p0 >> 5) | ((p2 & 0x0C) << 1);
+      unpacked[i + 48] = (p1 >> 5) | ((p2 & 0xC0) >> 3);
+    }
+  }
+}
+
+/**
+ * @brief Packs 128 bytes (each a 5-bit value) into 80 bytes.
+ * @param packed Pointer to the destination memory (80 bytes).
+ * @param unpacked Pointer to the source memory (128 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_pack_128_uint5_values` function to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_128_uint5_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  // Pack the first 64 bytes (p0, p1, p2, p3)
+  for (int i = 0; i < 16; ++i) {
+    packed[i] = (unpacked[i] & 0x1F) | ((unpacked[i + 16] & 0x1F) << 5);
+    packed[i + 16] = (unpacked[i + 32] & 0x1F) | ((unpacked[i + 48] & 0x1F) << 5);
+    packed[i + 32] = (unpacked[i + 64] & 0x1F) | ((unpacked[i + 80] & 0x1F) << 5);
+    packed[i + 48] = (unpacked[i + 96] & 0x1F) | ((unpacked[i + 112] & 0x1F) << 5);
+  }
+
+  // Pack the final 16 bytes (p4)
+  for (int i = 0; i < 16; ++i) {
+    uint8_t val1 = (unpacked[16 + i] >> 3) & 0x03;
+    uint8_t val2 = (unpacked[48 + i] >> 3) & 0x03;
+    uint8_t val3 = (unpacked[80 + i] >> 3) & 0x03;
+    uint8_t val4 = (unpacked[112 + i] >> 3) & 0x03;
+    packed[64 + i] = val1 | (val2 << 2) | (val3 << 4) | (val4 << 6);
+  }
+}
+
+/**
+ * @brief Unpacks 80 bytes into 128 bytes (each a 5-bit value).
+ * @param unpacked Pointer to the destination memory (128 bytes).
+ * @param packed Pointer to the source memory (80 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_unpack_128_uint5_values` function to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_128_uint5_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  for (int i = 0; i < 16; ++i) {
+    const uint8_t p0 = packed[i];
+    const uint8_t p1 = packed[i + 16];
+    const uint8_t p2 = packed[i + 32];
+    const uint8_t p3 = packed[i + 48];
+    const uint8_t p4 = packed[i + 64];
+
+    unpacked[i + 16 * 0] = p0 & 0x1F;
+    unpacked[i + 16 * 1] = (p0 >> 5) | ((p4 & 0x03) << 3);
+    unpacked[i + 16 * 2] = p1 & 0x1F;
+    unpacked[i + 16 * 3] = (p1 >> 5) | ((p4 & 0x0C) << 1);
+    unpacked[i + 16 * 4] = p2 & 0x1F;
+    unpacked[i + 16 * 5] = (p2 >> 5) | ((p4 & 0x30) >> 1);
+    unpacked[i + 16 * 6] = p3 & 0x1F;
+    unpacked[i + 16 * 7] = (p3 >> 5) | ((p4 & 0xC0) >> 3);
+  }
+}
+
+}}
diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/uint6.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint6.h
new file mode 100644
index 0000000000..65325b030d
--- /dev/null
+++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint6.h
@@ -0,0 +1,142 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torchao/experimental/kernels/cpu/aarch64/macro.h>
+#include <cstdint>
+
+namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace internal {
+
+/**
+ * @brief Packs 4 bytes, each holding a 6-bit value (0-63), into 3 bytes.
+ *
+ * @param packed Pointer to the destination memory (3 bytes).
+ * @param unpacked Pointer to the source memory (4 bytes).
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_4_uint6_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  // pack 4 uint6 values (u0..u3) into 3 bytes (p0..p2)
+  // p0's low 6 bits = u0; p0's high 2 bits = u3's low 2 bits
+  // p1's low 6 bits = u1; p1's high 2 bits = u3's mid 2 bits
+  // p2's low 6 bits = u2; p2's high 2 bits = u3's high 2 bits
+  const uint8_t u3 = unpacked[3] & 0x3F;
+  packed[0] = (unpacked[0] & 0x3F) | ((u3 & 0x03) << 6);
+  packed[1] = (unpacked[1] & 0x3F) | ((u3 & 0x0C) << 4);
+  packed[2] = (unpacked[2] & 0x3F) | ((u3 & 0x30) << 2);
+}
+
+/**
+ * @brief Unpacks 3 bytes into 4 bytes, each containing a 6-bit value.
+ *
+ * @param unpacked Pointer to the destination memory (4 bytes).
+ * @param packed Pointer to the source memory (3 bytes).
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_4_uint6_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  // This is compatible with the scalar NEON version.
+  unpacked[0] = packed[0] & 0x3F;
+  unpacked[1] = packed[1] & 0x3F;
+  unpacked[2] = packed[2] & 0x3F;
+  unpacked[3] = ((packed[0] & 0xC0) >> 6) | ((packed[1] & 0xC0) >> 4) |
+      ((packed[2] & 0xC0) >> 2);
+}
+
+/**
+ * @brief Packs 32 bytes (each a 6-bit value) into 24 bytes.
+ * @param packed Pointer to the destination memory (24 bytes).
+ * @param unpacked Pointer to the source memory (32 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_pack_32_uint6_values` function to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_32_uint6_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  for (int i = 0; i < 8; ++i) {
+    const uint8_t u0 = unpacked[i];
+    const uint8_t u1 = unpacked[i + 8];
+    const uint8_t u2 = unpacked[i + 16];
+    const uint8_t u3 = unpacked[i + 24];
+
+    packed[i] = (u0 & 0x3F) | ((u3 & 0x03) << 6);
+    packed[i + 8] = (u1 & 0x3F) | ((u3 & 0x0C) << 4);
+    packed[i + 16] = (u2 & 0x3F) | ((u3 & 0x30) << 2);
+  }
+}
+
+/**
+ * @brief Unpacks 24 bytes into 32 bytes (each a 6-bit value).
+ * @param unpacked Pointer to the destination memory (32 bytes).
+ * @param packed Pointer to the source memory (24 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_unpack_32_uint6_values` function to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_32_uint6_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  for (int i = 0; i < 8; ++i) {
+    const uint8_t p0 = packed[i];
+    const uint8_t p1 = packed[i + 8];
+    const uint8_t p2 = packed[i + 16];
+
+    unpacked[i] = p0 & 0x3F;
+    unpacked[i + 8] = p1 & 0x3F;
+    unpacked[i + 16] = p2 & 0x3F;
+    unpacked[i + 24] =
+        ((p0 & 0xC0) >> 6) | ((p1 & 0xC0) >> 4) | ((p2 & 0xC0) >> 2);
+  }
+}
+
+/**
+ * @brief Packs 64 bytes (each a 6-bit value) into 48 bytes.
+ * @param packed Pointer to the destination memory (48 bytes).
+ * @param unpacked Pointer to the source memory (64 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_pack_64_uint6_values` function to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_64_uint6_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  for (int i = 0; i < 16; ++i) {
+    const uint8_t u0 = unpacked[i];
+    const uint8_t u1 = unpacked[i + 16];
+    const uint8_t u2 = unpacked[i + 32];
+    const uint8_t u3 = unpacked[i + 48];
+
+    packed[i] = (u0 & 0x3F) | ((u3 & 0x03) << 6);
+    packed[i + 16] = (u1 & 0x3F) | ((u3 & 0x0C) << 4);
+    packed[i + 32] = (u2 & 0x3F) | ((u3 & 0x30) << 2);
+  }
+}
+
+/**
+ * @brief Unpacks 48 bytes into 64 bytes (each a 6-bit value).
+ * @param unpacked Pointer to the destination memory (64 bytes).
+ * @param packed Pointer to the source memory (48 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_unpack_64_uint6_values` function to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_64_uint6_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  for (int i = 0; i < 16; ++i) {
+    const uint8_t p0 = packed[i];
+    const uint8_t p1 = packed[i + 16];
+    const uint8_t p2 = packed[i + 32];
+
+    unpacked[i] = p0 & 0x3F;
+    unpacked[i + 16] = p1 & 0x3F;
+    unpacked[i + 32] = p2 & 0x3F;
+    unpacked[i + 48] =
+        ((p0 & 0xC0) >> 6) | ((p1 & 0xC0) >> 4) | ((p2 & 0xC0) >> 2);
+  }
+}
+
+} // namespace internal
+} // namespace torchao::kernels::cpu::fallback::bitpacking
diff --git a/torchao/experimental/kernels/cpu/fallback/bitpacking/uint7.h b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint7.h
new file mode 100644
index 0000000000..ee4d501324
--- /dev/null
+++ b/torchao/experimental/kernels/cpu/fallback/bitpacking/uint7.h
@@ -0,0 +1,140 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torchao/experimental/kernels/cpu/aarch64/macro.h>
+#include <cstdint>
+
+namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace internal {
+/**
+ * @brief Packs 8 bytes, each holding a 7-bit value (0-127), into 7 bytes.
+ *
+ * @param packed Pointer to the destination memory (7 bytes).
+ * @param unpacked Pointer to the source memory (8 bytes).
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_8_uint7_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  // pack 8 uint7 values (u0..u7) into 7 bytes (p0..p6)
+  // The 7 bits of u7 are distributed across the most significant bit (MSB)
+  // of each of the 7 packed bytes.
+  // p0 = u7_bit_0 | u0_all_7_bits
+  // p1 = u7_bit_1 | u1_all_7_bits
+  // ...
+  // p6 = u7_bit_6 | u6_all_7_bits
+  const uint8_t u7 = unpacked[7] & 0x7F;
+
+  for (int i = 0; i < 7; ++i) {
+    uint8_t u7_bit = (u7 >> i) & 1;
+    packed[i] = (unpacked[i] & 0x7F) | (u7_bit << 7);
+  }
+}
+
+/**
+ * @brief Unpacks 7 bytes into 8 bytes, each containing a 7-bit value.
+ *
+ * @param unpacked Pointer to the destination memory (8 bytes).
+ * @param packed Pointer to the source memory (7 bytes).
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_8_uint7_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  unpacked[7] = 0;
+  for (int i = 0; i < 7; ++i) {
+    // The low 7 bits of the packed byte are the original value.
+    unpacked[i] = packed[i] & 0x7F;
+    // The high bit of the packed byte is the i-th bit of the 8th value.
+    uint8_t u7_bit = packed[i] >> 7;
+    unpacked[7] |= (u7_bit << i);
+  }
+}
+
+/**
+ * @brief Packs 64 bytes (each a 7-bit value) into 56 bytes.
+ * @param packed Pointer to the destination memory (56 bytes).
+ * @param unpacked Pointer to the source memory (64 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_pack_64_uint7_values` function to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_64_uint7_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  // Transpose-and-pack operation
+  for (int j = 0; j < 8; ++j) { // Iterate through columns
+    const uint8_t u7 = unpacked[56 + j] & 0x7F;
+    for (int i = 0; i < 7; ++i) { // Iterate through rows
+      uint8_t u7_bit = (u7 >> i) & 1;
+      packed[i * 8 + j] = (unpacked[i * 8 + j] & 0x7F) | (u7_bit << 7);
+    }
+  }
+}
+
+/**
+ * @brief Unpacks 56 bytes into 64 bytes (each a 7-bit value).
+ * @param unpacked Pointer to the destination memory (64 bytes).
+ * @param packed Pointer to the source memory (56 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_unpack_64_uint7_values` function to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_64_uint7_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  // Unpack-and-transpose operation
+  for (int j = 0; j < 8; ++j) { // Iterate through columns
+    uint8_t u7 = 0;
+    for (int i = 0; i < 7; ++i) { // Iterate through rows
+      unpacked[i * 8 + j] = packed[i * 8 + j] & 0x7F;
+      u7 |= ((packed[i * 8 + j] >> 7) & 1) << i;
+    }
+    unpacked[56 + j] = u7;
+  }
+}
+
+/**
+ * @brief Packs 128 bytes (each a 7-bit value) into 112 bytes.
+ * @param packed Pointer to the destination memory (112 bytes).
+ * @param unpacked Pointer to the source memory (128 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_pack_128_uint7_values` function to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void pack_128_uint7_values(
+    uint8_t* packed,
+    const uint8_t* unpacked) {
+  // Transpose-and-pack operation
+  for (int j = 0; j < 16; ++j) { // Iterate through columns
+    const uint8_t u7 = unpacked[112 + j] & 0x7F;
+    for (int i = 0; i < 7; ++i) { // Iterate through rows
+      uint8_t u7_bit = (u7 >> i) & 1;
+      packed[i * 16 + j] = (unpacked[i * 16 + j] & 0x7F) | (u7_bit << 7);
+    }
+  }
+}
+
+/**
+ * @brief Unpacks 112 bytes into 128 bytes (each a 7-bit value).
+ * @param unpacked Pointer to the destination memory (128 bytes).
+ * @param packed Pointer to the source memory (112 bytes).
+ * @note This implementation mirrors the logic of the ARM NEON
+ * `vec_unpack_128_uint7_values` function to ensure compatibility.
+ */
+TORCHAO_ALWAYS_INLINE inline void unpack_128_uint7_values(
+    uint8_t* unpacked,
+    const uint8_t* packed) {
+  // Unpack-and-transpose operation
+  for (int j = 0; j < 16; ++j) { // Iterate through columns
+    uint8_t u7 = 0;
+    for (int i = 0; i < 7; ++i) { // Iterate through rows
+      unpacked[i * 16 + j] = packed[i * 16 + j] & 0x7F;
+      u7 |= ((packed[i * 16 + j] >> 7) & 1) << i;
+    }
+    unpacked[112 + j] = u7;
+  }
+}
+
+} // namespace internal
+} // namespace torchao::kernels::cpu::fallback::bitpacking
diff --git a/torchao/experimental/kernels/cpu/fallback/tests/CMakeLists.txt b/torchao/experimental/kernels/cpu/fallback/tests/CMakeLists.txt
new file mode 100644
index 0000000000..652475766b
--- /dev/null
+++ b/torchao/experimental/kernels/cpu/fallback/tests/CMakeLists.txt
@@ -0,0 +1,49 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+cmake_minimum_required(VERSION 3.19)
+project(tests)
+set(CMAKE_CXX_STANDARD 17)
+
+include(FetchContent)
+FetchContent_Declare(
+  googletest
+  URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
+)
+FetchContent_MakeAvailable(googletest)
+
+add_compile_options("-Wall" "-Werror")
+
+include(CMakePrintHelpers)
+message("TORCHAO_LIBRARIES: ${TORCHAO_LIBRARIES}")
+include_directories(${TORCHAO_LIBRARIES})
+add_library(
+  dep
+  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp
+  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp
+  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp
+)
+if(NOT TORCHAO_INCLUDE_DIRS)
+  set(TORCHAO_INCLUDE_DIRS ${TORCHAO_LIBRARIES})
+endif()
+
+add_subdirectory(
+${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/fallback
+${CMAKE_CURRENT_BINARY_DIR}/torchao_kernels_cpu_fallback
+)
+
+enable_testing()
+
+add_executable(test_bitpacking test_bitpacking.cpp)
+target_link_libraries(
+    test_bitpacking
+    PRIVATE
+    GTest::gtest_main
+    dep
+)
+
+include(GoogleTest)
+gtest_discover_tests(test_bitpacking)
diff --git a/torchao/experimental/kernels/cpu/fallback/tests/build_and_run_tests.sh b/torchao/experimental/kernels/cpu/fallback/tests/build_and_run_tests.sh
new file mode 100644
index 0000000000..69590512ec
--- /dev/null
+++ b/torchao/experimental/kernels/cpu/fallback/tests/build_and_run_tests.sh
@@ -0,0 +1,35 @@
+#!/bin/bash -eu
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -eu
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
+export TORCHAO_LIBRARIES=${SCRIPT_DIR}/../../../../../..
+export CMAKE_OUT=/tmp/cmake-out/torch_ao/kernel_fallback_tests
+
+target=${1:-"native"}
+
+EXTRA_ARGS=""
+
+cmake \
+    ${EXTRA_ARGS} \
+    -DCMAKE_BUILD_TYPE=Debug \
+    -DTORCHAO_LIBRARIES=${TORCHAO_LIBRARIES} \
+    -DTORCHAO_BUILD_CPU_AARCH64=ON \
+    -S ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/fallback/tests \
+    -B ${CMAKE_OUT}
+
+cmake --build ${CMAKE_OUT}
+
+echo "Successfully built tests."
+
+if [[ "${target}" != "native" ]]; then
+    echo "Skip running tests when cross compiling.";
+    exit 0;
+fi
+
+# Run
+${CMAKE_OUT}/test_bitpacking
diff --git a/torchao/experimental/kernels/cpu/fallback/tests/test_bitpacking.cpp b/torchao/experimental/kernels/cpu/fallback/tests/test_bitpacking.cpp
new file mode 100644
index 0000000000..baed31ac6f
--- /dev/null
+++ b/torchao/experimental/kernels/cpu/fallback/tests/test_bitpacking.cpp
@@ -0,0 +1,157 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+// test pack with cpp unpack with arm_neon
+#include <gtest/gtest.h>
+#include <torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/bitpack.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/uint1.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/uint2.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/uint3.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/uint4.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/uint5.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/uint6.h>
+#include <torchao/experimental/kernels/cpu/fallback/bitpacking/uint7.h>
+#include <vector>
+
+TEST(GenericBitpackingTest, PackUnpack8_uint1) {
+  int unpacked_bytes = 8;
+  int packed_bytes = 1;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, 1);
+  std::vector<uint8_t> packed(packed_bytes);
+  std::vector<uint8_t> unpacked(unpacked_bytes);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_8_uint1_values(
+      packed.data(), input.data());
+  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_8_uint1_values(
+      unpacked.data(), packed.data());
+
+  ASSERT_EQ(input, unpacked);
+}
+
+TEST(GenericBitpackingTest, PackUnpack32_uint4) {
+  int unpacked_bytes = 32;
+  int packed_bytes = 16;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, 4);
+  std::vector<uint8_t> packed(packed_bytes);
+  std::vector<uint8_t> unpacked(unpacked_bytes);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
+      packed.data(), input.data());
+  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint4_values(
+      unpacked.data(), packed.data());
+
+  ASSERT_EQ(input, unpacked);
+}
+
+TEST(GenericBitpackingTest, PackUnpack8_uint7) {
+  int unpacked_bytes = 8;
+  int packed_bytes = 7;
+  auto input = torchao::get_random_lowbit_vector(unpacked_bytes, 7);
+  std::vector<uint8_t> packed(packed_bytes);
+  std::vector<uint8_t> unpacked(unpacked_bytes);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::pack_8_uint7_values(
+      packed.data(), input.data());
+  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_8_uint7_values(
+      unpacked.data(), packed.data());
+
+  ASSERT_EQ(input, unpacked);
+}
+
+// --- Template test for the main dispatcher function ---
+template <int nbit>
+void test_bitpacking_128_lowbit_values() {
+  const int unpacked_bytes = 128;
+  const int packed_bytes = unpacked_bytes * nbit / 8;
+
+  auto input = torchao::get_random_signed_lowbit_vector(unpacked_bytes, nbit);
+  std::vector<uint8_t> packed(packed_bytes);
+  std::vector<int8_t> unpacked(unpacked_bytes);
+
+  torchao::kernels::cpu::fallback::bitpacking::internal::
+      pack_128_lowbit_int_values<nbit>(packed.data(), input.data());
+  torchao::kernels::cpu::fallback::bitpacking::internal::
+      unpack_128_lowbit_int_values<nbit>(unpacked.data(), packed.data());
+
+  ASSERT_EQ(input, unpacked);
+}
+
+// --- Template test for the LUT dispatcher function ---
+template <int nbit>
+void test_bitpacking_128_lowbit_values_with_lut() {
+  const int unpacked_bytes = 128;
+  const int packed_bytes = unpacked_bytes * nbit / 8;
+  const int num_lut_entries = 1 << nbit;
+
+  // 1. Create a LUT and random indices
+  auto lut = torchao::get_random_signed_lowbit_vector(num_lut_entries, 8);
+  auto indices = torchao::get_random_lowbit_vector(unpacked_bytes, nbit);
+
+  // 2. Create the ground truth data by applying the LUT
+  std::vector<int8_t> ground_truth(unpacked_bytes);
+  for (int i = 0; i < unpacked_bytes; ++i) {
+    ground_truth[i] = lut[indices[i]];
+  }
+
+  // 3. Pack the indices
+  std::vector<uint8_t> packed(packed_bytes);
+  if constexpr (nbit == 1)
+    torchao::kernels::cpu::fallback::bitpacking::internal::
+        pack_128_uint1_values(packed.data(), indices.data());
+  if constexpr (nbit == 2) {
+    torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint2_values(
+        packed.data(), indices.data());
+    torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint2_values(
+        packed.data() + 16, indices.data() + 64);
+  }
+  if constexpr (nbit == 3)
+    torchao::kernels::cpu::fallback::bitpacking::internal::
+        pack_128_uint3_values(packed.data(), indices.data());
+  if constexpr (nbit == 4) {
+    torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
+        packed.data(), indices.data());
+    torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
+        packed.data() + 16, indices.data() + 32);
+    torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
+        packed.data() + 32, indices.data() + 64);
+    torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
+        packed.data() + 48, indices.data() + 96);
+  }
+
+  // 4. Unpack using the LUT function
+  std::vector<int8_t> unpacked(unpacked_bytes);
+  torchao::kernels::cpu::fallback::bitpacking::internal::
+      unpack_128_lowbit_values_with_lut<nbit>(
+          unpacked.data(), packed.data(), lut.data());
+
+  // 5. Verify the result matches the ground truth
+  ASSERT_EQ(ground_truth, unpacked);
+}
+
+// --- Instantiate all test cases using macros ---
+#define TEST_BITPACKING_128_LOWBIT_VALUES(nbit) \
+  TEST(GenericBitpacking128, Lowbit_##nbit) {   \
+    test_bitpacking_128_lowbit_values<nbit>();  \
+  }
+
+#define TEST_BITPACKING_128_LOWBIT_VALUES_WITH_LUT(nbit) \
+  TEST(GenericBitpacking128, Lowbit_with_lut_##nbit) {   \
+    test_bitpacking_128_lowbit_values_with_lut<nbit>();  \
+  }
+
+TEST_BITPACKING_128_LOWBIT_VALUES(1);
+TEST_BITPACKING_128_LOWBIT_VALUES(2);
+TEST_BITPACKING_128_LOWBIT_VALUES(3);
+TEST_BITPACKING_128_LOWBIT_VALUES(4);
+TEST_BITPACKING_128_LOWBIT_VALUES(5);
+TEST_BITPACKING_128_LOWBIT_VALUES(6);
+TEST_BITPACKING_128_LOWBIT_VALUES(7);
+TEST_BITPACKING_128_LOWBIT_VALUES(8);
+
+TEST_BITPACKING_128_LOWBIT_VALUES_WITH_LUT(1);
+TEST_BITPACKING_128_LOWBIT_VALUES_WITH_LUT(2);
+TEST_BITPACKING_128_LOWBIT_VALUES_WITH_LUT(3);
+TEST_BITPACKING_128_LOWBIT_VALUES_WITH_LUT(4);