@@ -69,7 +69,6 @@ def run_fn(model, dataloader):
        else:
            model(data)

-@pytest.mark.skip(reason="SW-217321 pytorch inductor error")
@pytest.mark.skipif(is_habana_framework_installed(), reason="These tests are not supported on HPU for now.")
@pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed")
class TestAutoRoundCPU:
@@ -97,7 +96,7 @@ def setup_method(self, method):
    @pytest.mark.parametrize("quant_lm_head", [True, False])
    def test_autoround(self, quant_lm_head):
        fp32_model = copy.deepcopy(self.gptj)
-        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32")
+        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp32")
        if quant_lm_head is False:
            quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))
        logger.info(f"Test AutoRound with config {quant_config}")
@@ -110,15 +109,15 @@ def test_autoround(self, quant_lm_head):
        out = q_model(self.inp)[0]
        assert torch.allclose(out, self.label, atol=1e-1)
        assert "transformer.h.0.attn.k_proj" in q_model.autoround_config.keys()
-        assert "scale" in q_model.autoround_config["transformer.h.0.attn.k_proj"].keys()
+        assert "scale_dtype" in q_model.autoround_config["transformer.h.0.attn.k_proj"].keys()
        assert torch.float32 == q_model.autoround_config["transformer.h.0.attn.k_proj"]["scale_dtype"]
        assert isinstance(q_model.transformer.h[0].attn.k_proj, WeightOnlyLinear), "packing model failed."
        if quant_lm_head is True:
            assert isinstance(q_model.lm_head, WeightOnlyLinear), "quantization for lm_head failed."

    def test_int4_dtype(self):
        fp32_model = copy.deepcopy(self.gptj)
-        quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=10, scale_dtype="fp32")
+        quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp32")
        logger.info(f"Test AutoRound with config {quant_config}")

        # prepare + convert API
@@ -129,14 +128,14 @@ def test_int4_dtype(self):
        out = q_model(self.inp)[0]
        assert torch.allclose(out, self.label, atol=1e-1)
        assert "transformer.h.0.attn.k_proj" in q_model.autoround_config.keys()
-        assert "scale" in q_model.autoround_config["transformer.h.0.attn.k_proj"].keys()
+        assert "scale_dtype" in q_model.autoround_config["transformer.h.0.attn.k_proj"].keys()
        assert torch.float32 == q_model.autoround_config["transformer.h.0.attn.k_proj"]["scale_dtype"]
        assert isinstance(q_model.transformer.h[0].attn.k_proj, WeightOnlyLinear), "packing model failed."

    def test_autoround_with_quantize_API(self):
        gpt_j_model = copy.deepcopy(self.gptj)

-        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32")
+        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp32")
        quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))

        logger.info(f"Test AutoRound with config {quant_config}")
@@ -156,7 +155,7 @@ def test_save_and_load(self):
        fp32_model = copy.deepcopy(self.gptj)
        # known issue: scale_dtype="fp32" will cause accuracy gap between quantized model
        # (using auto-round WeightOnlyLinear) and reloaded model (using INCWeightOnlyLinear)
-        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp16")
+        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp16")
        # quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))
        logger.info(f"Test AutoRound with config {quant_config}")

@@ -185,11 +184,11 @@ def test_conv1d(self):
        from transformers import GPT2Model, GPT2Tokenizer

        tokenizer = GPT2Tokenizer.from_pretrained("sshleifer/tiny-gpt2")
-        model = GPT2Model.from_pretrained("sshleifer/tiny-gpt2")
+        model = GPT2Model.from_pretrained("sshleifer/tiny-gpt2", use_cache=False)
        text = "Replace me by any text you'd like."
        encoded_input = tokenizer(text, return_tensors="pt")
        out1 = model(**encoded_input)[0]
-        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32")
+        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp32")
        model = prepare(model=model, quant_config=quant_config)
        run_fn(model, self.dataloader)
        q_model = convert(model)
@@ -207,7 +206,7 @@ def test_utils(self):
        fp32_model = copy.deepcopy(self.gptj)
        to_quant_block_names = get_multimodal_block_names(fp32_model, quant_vision=True)
        quant_config = AutoRoundConfig(
-            nsamples=32, seqlen=10, iters=10, scale_dtype="fp16", to_quant_block_names=to_quant_block_names
+            nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp16", to_quant_block_names=to_quant_block_names
        )
        logger.info(f"Test AutoRound with config {quant_config}")
        device = detect_device("auto")
@@ -222,6 +221,7 @@ def test_utils(self):
        assert torch.allclose(out, self.label, atol=1e-1)
        assert isinstance(q_model.transformer.h[0].attn.k_proj, WeightOnlyLinear), "packing model failed."

+    @pytest.mark.skipif(Version(auto_round.__version__) <= Version("0.5.1"), reason="visual layer_name not processed.")
    def test_mllm(self):
        input = torch.randn(1, 32)
        from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration
@@ -237,7 +237,7 @@ def test_mllm(self):
            model=model,
            tokenizer=tokenizer,
            image_processor=None,
-            dataset="liuhaotian/llava_conv_58k",
+            dataset="NeelNanda/pile-10k",
            extra_data_dir=None,
            seqlen=32,
            batch_size=1,
@@ -266,13 +266,13 @@ def test_mllm(self):
        model = prepare(model=model, quant_config=quant_config)
        run_fn(model, dataloader)
        q_model = convert(model)
-        assert isinstance(q_model.model.layers[0].mlp.up_proj, WeightOnlyLinear), "model quantization failed."
+        assert isinstance(q_model.language_model.layers[0].mlp.up_proj, WeightOnlyLinear), "model quantization failed."

    # def test_autoround_format_export(self):
    #     from neural_compressor.torch.quantization import load
    #     from auto_gptq.nn_modules.qlinear.qlinear_triton import QuantLinear
    #     gpt_j_model = copy.deepcopy(self.gptj)
-    #     quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32", export_format="auto_round:gptq")
+    #     quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp32", export_format="auto_round:gptq")
    #     logger.info(f"Test AutoRound with config {quant_config}")
    #     model = prepare(model=gpt_j_model, quant_config=quant_config)
    #     run_fn(model, self.dataloader)
@@ -366,7 +366,7 @@ def test_autoround_w4a8(self):
    @pytest.mark.parametrize("quant_lm_head", [True, False])
    def test_autoround(self, quant_lm_head):
        fp32_model = copy.deepcopy(self.tiny_llama_model)
-        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, act_dtype="fp32", scale_dtype="fp32")
+        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, act_dtype="fp32", amp=False, scale_dtype="fp32")
        if quant_lm_head is False:
            quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))
        logger.info(f"Test AutoRound with config {quant_config}")
@@ -386,7 +386,7 @@ def test_autoround(self, quant_lm_head):
    def test_int4_dtype(self):
        fp32_model = copy.deepcopy(self.tiny_llama_model)
        quant_config = AutoRoundConfig(
-            dtype="int4", nsamples=32, seqlen=10, iters=10, act_dtype="fp32", scale_dtype="fp32"
+            dtype="int4", nsamples=32, seqlen=10, iters=10, act_dtype="fp32", amp=False, scale_dtype="fp32"
        )
        logger.info(f"Test AutoRound with config {quant_config}")

@@ -402,7 +402,7 @@ def test_int4_dtype(self):
    def test_autoround_with_quantize_API(self):
        model = copy.deepcopy(self.tiny_llama_model)

-        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, act_dtype="fp32", scale_dtype="fp32")
+        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, act_dtype="fp32", amp=False, scale_dtype="fp32")
        quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))

        logger.info(f"Test AutoRound with config {quant_config}")
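For reference, a minimal sketch of the prepare/calibrate/convert flow these tests exercise, reusing the "sshleifer/tiny-gpt2" checkpoint from test_conv1d and an AutoRoundConfig with the amp=False flag added throughout this diff. It is illustrative only: the neural_compressor.torch.quantization import path is assumed from the commented-out export test above, and a single forward pass stands in for run_fn with the test dataloader.

from transformers import GPT2Model, GPT2Tokenizer

from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

# Tiny GPT-2 checkpoint used by test_conv1d; use_cache=False mirrors the change above.
tokenizer = GPT2Tokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = GPT2Model.from_pretrained("sshleifer/tiny-gpt2", use_cache=False)

# amp=False keeps AutoRound tuning in fp32, matching the configs added in this diff.
quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp32")

model = prepare(model=model, quant_config=quant_config)
encoded_input = tokenizer("Replace me by any text you'd like.", return_tensors="pt")
model(**encoded_input)    # calibration pass, analogous to run_fn(model, dataloader)
q_model = convert(model)  # quantized layers are packed into WeightOnlyLinear modules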