1 file changed
+1
-1
lines changed- .github/workflows/1xH100_tests.yml+53
- .github/workflows/1xL4_tests.yml+3-8
- .github/workflows/4xH100_tests.yml+51
- .github/workflows/regression_test.yml+2-2
- .github/workflows/regression_test_rocm.yml+3-1
- .github/workflows/torchao_experimental_test.yml+53-53
- README.md+30-21
- benchmarks/dashboard/ci_microbenchmark_runner.py+50-4
- benchmarks/dashboard/microbenchmark_quantization_config.yml+1
- benchmarks/float8/bench_linear_float8.py+1-1
- benchmarks/float8/bench_padding.py+1-1
- benchmarks/float8/float8_roofline.py+3-15
- benchmarks/microbenchmarks/benchmark_inference.py+9-7
- benchmarks/microbenchmarks/profiler.py+10-1
- benchmarks/microbenchmarks/test/test_benchmark_profiler.py+2-2
- benchmarks/mx_formats/cast_bench.py+69-11
- docs/source/api_ref_qat.rst+58
- docs/source/api_ref_quantization.rst-18
- docs/source/finetuning.rst+115-2
- docs/source/index.rst+1
- setup.py+37
- test/float8/test_base.py+5-5
- test/float8/test_compile.py+11-7
- test/float8/test_dtensor.py+5-5
- test/float8/test_everything_multi_gpu.sh+21
- test/float8/test_everything_single_gpu.sh+16
- test/float8/test_fsdp2/test_fsdp2.py+1-1
- test/prototype/moe_training/test_scaled_grouped_mm.py+1-1
- test/prototype/mx_formats/test_kernels.py+106-1
- test/prototype/mx_formats/test_mx_dtensor.py+12-1
- test/prototype/mx_formats/test_mx_linear.py+24-11
- test/quantization/quantize_/workflows/int4/test_int4_preshuffled_tensor.py+73-26
- test/quantization/test_qat.py+4-4
- torchao/__init__.py+3-1
- torchao/csrc/cuda/mx_kernels/mxfp8_cuda.cu+112
- torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp+128
- torchao/csrc/cuda/mx_kernels/mxfp8_quantize.cuh+1.0k
- torchao/csrc/cuda/mx_kernels/ptx.cuh+290
- torchao/dtypes/nf4tensor.py+1-1
- torchao/experimental/kernels/cpu/aarch64/linear/groupwise_lowbit_weight/groupwise_lowbit_weight_lut.h+26-4
- torchao/experimental/kernels/cpu/aarch64/tests/test_lut.cpp+3-3
- torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h+1-9
- torchao/experimental/ops/groupwise_lowbit_weight_lut/groupwise_lowbit_weight_lut.cpp+235
- torchao/experimental/ops/groupwise_lowbit_weight_lut/groupwise_lowbit_weight_lut.h+126
- torchao/experimental/ops/groupwise_lowbit_weight_lut/kernel_config.h+229
- torchao/experimental/ops/groupwise_lowbit_weight_lut/kernel_selector.h+240
- torchao/experimental/ops/groupwise_lowbit_weight_lut/packed_weights_format.h+110
- torchao/float8/__init__.py+5-5
- torchao/float8/distributed_utils.py+2-2
- torchao/float8/float8_linear.py+1-1
- torchao/float8/float8_ops.py+54-47
- torchao/float8/float8_scaling_utils.py+4-4
- torchao/float8/float8_tensor_parallel.py+7-7
- torchao/float8/float8_training_tensor.py+12-12
- torchao/float8/fsdp_utils.py+13-9
- torchao/float8/inference.py+1-1
- torchao/prototype/float8nocompile/float8nocompile_linear.py+5-1
- torchao/prototype/float8nocompile/float8nocompile_scaling_utils.py+1-1
- torchao/prototype/float8nocompile/kernels/fp8_dynamic_tensorwise.py+30-26
- torchao/prototype/float8nocompile/kernels/fp8_dynamic_tensorwise_test.py+1-1
- torchao/prototype/mx_formats/config.py+16-4
- torchao/prototype/mx_formats/kernels.py+138-5
- torchao/prototype/mx_formats/mx_linear.py+42-15
- torchao/prototype/quantization/autoquant_v2.py+2-2
- torchao/prototype/spinquant/hadamard_utils.py+2-2
- torchao/prototype/spinquant/spinquant.py+58-35
- torchao/quantization/__init__.py+3-3
- torchao/quantization/prototype/qat/affine_fake_quantized_tensor.py+4-4
- torchao/quantization/pt2e/prepare.py+7-13
- torchao/quantization/pt2e/quantizer/composable_quantizer.py+5-4
- torchao/quantization/pt2e/quantizer/duplicate_dq_pass.py+4-7
- torchao/quantization/pt2e/quantizer/embedding_quantizer.py+2-1
- torchao/quantization/pt2e/quantizer/port_metadata_pass.py+8-13
- torchao/quantization/pt2e/quantizer/quantizer.py+3
- torchao/quantization/pt2e/quantizer/utils.py+6-8
- torchao/quantization/qat/__init__.py+6
- torchao/quantization/qat/affine_fake_quantized_tensor.py+18-22
- torchao/quantization/qat/api.py+2-2
- torchao/quantization/qat/linear.py+23-23
- torchao/quantization/qat/utils.py-26
- torchao/quantization/quant_api.py+24-7
- torchao/quantization/quantize_/__init__.py-9
- torchao/quantization/quantize_/workflows/__init__.py+7
- torchao/quantization/quantize_/workflows/int4/__init__.py
- torchao/quantization/quantize_/workflows/int4/int4_preshuffled_tensor.py+166-81
- version.txt+1-1
0 commit comments