diff --git "a/docs/source/Instruction/\345\257\274\345\207\272\344\270\216\346\216\250\351\200\201.md" "b/docs/source/Instruction/\345\257\274\345\207\272\344\270\216\346\216\250\351\200\201.md" index 3a9deb993e..6c60b425b6 100644 --- "a/docs/source/Instruction/\345\257\274\345\207\272\344\270\216\346\216\250\351\200\201.md" +++ "b/docs/source/Instruction/\345\257\274\345\207\272\344\270\216\346\216\250\351\200\201.md" @@ -27,12 +27,15 @@ pip install autoawq -U # auto_gptq和cuda版本有对应关系,请按照`https://github.com/PanQiWei/AutoGPTQ#quick-installation`选择版本 pip install auto_gptq optimum -U +# 使用gptq v2量化: +pip install gptqmodel optimum -U + # 使用bnb量化: pip install bitsandbytes -U ``` 我们提供了一系列脚本展现SWIFT的量化导出能力: -- 支持[AWQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/awq.sh)/[GPTQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/gptq.sh)/[BNB](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/bnb.sh)量化导出。 +- 支持[AWQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/awq.sh)/[GPTQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/gptq.sh)/[GPTQ v2](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/gptq_v2.sh)/[BNB](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/bnb.sh)量化导出。 - 多模态量化: 支持使用GPTQ和AWQ对多模态模型进行量化,其中AWQ支持的多模态模型有限。参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/mllm)。 - 更多系列模型的支持: 支持[Bert](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/bert),[Reward Model](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/reward_model)的量化导出。 - 使用SWIFT量化导出的模型支持使用vllm/sglang/lmdeploy进行推理加速;也支持使用QLoRA继续进行SFT/RLHF。 diff --git a/docs/source_en/Instruction/Export-and-push.md b/docs/source_en/Instruction/Export-and-push.md index 732c7a4a06..4819fdb6a4 100644 --- a/docs/source_en/Instruction/Export-and-push.md +++ b/docs/source_en/Instruction/Export-and-push.md @@ -26,13 +26,16 @@ pip install autoawq -U # The versions of auto_gptq and CUDA are correlated; please choose the version according to `https://github.com/PanQiWei/AutoGPTQ#quick-installation`. pip install auto_gptq optimum -U +# For GPTQ v2 quantization: +pip install gptqmodel optimum -U + # For BNB quantization: pip install bitsandbytes -U ``` We provide a series of scripts to demonstrate SWIFT's quantization export capabilities: -- Supports [AWQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/awq.sh)/[GPTQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/gptq.sh)/[BNB](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/bnb.sh) quantization exports. +- Supports [AWQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/awq.sh)/[GPTQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/gptq.sh)/[GPTQ v2](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/gptq_v2.sh)/[BNB](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/bnb.sh) quantization exports. - Multimodal quantization: Supports quantizing multimodal models using GPTQ and AWQ, with limited multimodal models supported by AWQ. Refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/mllm). 
 - Support for more model series: Supports quantization exports for [BERT](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/bert) and [Reward Model](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/reward_model).
 - Models exported with SWIFT's quantization support inference acceleration using vllm/sglang/lmdeploy; they also support further SFT/RLHF using QLoRA.
diff --git a/examples/export/quantize/gptq_v2.sh b/examples/export/quantize/gptq_v2.sh
new file mode 100644
index 0000000000..9f0cc81f8f
--- /dev/null
+++ b/examples/export/quantize/gptq_v2.sh
@@ -0,0 +1,13 @@
+# OMP_NUM_THREADS=14: see https://github.com/AutoGPTQ/AutoGPTQ/issues/439
+OMP_NUM_THREADS=14 \
+CUDA_VISIBLE_DEVICES=0 \
+swift export \
+    --model Qwen/Qwen2.5-1.5B-Instruct \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
+              'AI-ModelScope/alpaca-gpt4-data-en#500' \
+    --quant_n_samples 256 \
+    --quant_batch_size 1 \
+    --max_length 2048 \
+    --quant_method gptq_v2 \
+    --quant_bits 4 \
+    --output_dir Qwen2.5-1.5B-Instruct-GPTQ-Int4
diff --git a/swift/llm/argument/export_args.py b/swift/llm/argument/export_args.py
index 6c20a83b6a..f4cf8d5fc1 100644
--- a/swift/llm/argument/export_args.py
+++ b/swift/llm/argument/export_args.py
@@ -35,7 +35,7 @@ class ExportArguments(MergeArguments, BaseArguments):
     output_dir: Optional[str] = None
 
     # awq/gptq
-    quant_method: Literal['awq', 'gptq', 'bnb', 'fp8'] = None
+    quant_method: Literal['awq', 'gptq', 'bnb', 'fp8', 'gptq_v2'] = None
     quant_n_samples: int = 256
     max_length: int = 2048
     quant_batch_size: int = 1
diff --git a/swift/llm/export/quant.py b/swift/llm/export/quant.py
index 6869fe09e5..4640cbf453 100644
--- a/swift/llm/export/quant.py
+++ b/swift/llm/export/quant.py
@@ -38,9 +38,9 @@ def quantize(self):
             self.awq_model_quantize()
             self.model.save_quantized(
                 args.output_dir, safetensors=args.safe_serialization, shard_size=args.max_shard_size)
-        elif args.quant_method == 'gptq':
+        elif args.quant_method in {'gptq', 'gptq_v2'}:
             self.template.model = self.model
-            gptq_quantizer = self.gptq_model_quantize()
+            gptq_quantizer = self.gptq_model_quantize(v2=(args.quant_method == 'gptq_v2'))
             gptq_quantizer.save(
                 self.model,
                 args.output_dir,
@@ -226,7 +226,7 @@ def get_modules_in_block_to_quantize(model, block_name: str):
                 res[experts_idx:experts_idx] = experts.values()
         return res
 
-    def gptq_model_quantize(self):
+    def gptq_model_quantize(self, v2: bool = False):
         from optimum.gptq import GPTQQuantizer
         args = self.args
         logger.info(f'Quantization dataset: {args.dataset}')
@@ -241,7 +241,8 @@ def gptq_model_quantize(self):
             dataset=','.join(args.dataset),
             batch_size=args.quant_batch_size,
             block_name_to_quantize=block_name_to_quantize,
-            modules_in_block_to_quantize=modules_in_block_to_quantize)
+            modules_in_block_to_quantize=modules_in_block_to_quantize,
+            checkpoint_format='gptq_v2' if v2 else 'gptq')
         gptq_quantizer.serialization_keys.append('block_name_to_quantize')
         logger.info('Start quantizing the model...')
         logger.warning('The process of packing the model takes a long time and there is no progress bar. '
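For context on what the new `checkpoint_format` argument does, here is a minimal standalone sketch of the equivalent `optimum` call. It is not part of the patch; the model ID, calibration dataset, and loading code below are placeholder assumptions, and only the `checkpoint_format` switch mirrors the change made in `swift/llm/export/quant.py` above.

```python
# Sketch only: calling optimum's GPTQQuantizer directly with the v2 checkpoint
# format that `--quant_method gptq_v2` selects. Assumes `pip install gptqmodel optimum -U`.
from optimum.gptq import GPTQQuantizer
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = 'Qwen/Qwen2.5-1.5B-Instruct'  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto')

quantizer = GPTQQuantizer(
    bits=4,
    dataset='c4',                         # placeholder; swift forwards its own --dataset here
    batch_size=1,
    checkpoint_format='gptq_v2',          # 'gptq' keeps the existing v1 behaviour
)
quantized_model = quantizer.quantize_model(model, tokenizer)  # calibration + packing
quantizer.save(quantized_model, 'Qwen2.5-1.5B-Instruct-GPTQ-Int4')
```

As the install sections above note, the `gptq_v2` path expects `gptqmodel` alongside `optimum`, while the plain `gptq` format continues to go through auto_gptq.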