From d4ac0efefcf167cea98dd29cade799526fcfd69f Mon Sep 17 00:00:00 2001 From: ceci3 Date: Tue, 19 Dec 2023 18:42:50 +0800 Subject: [PATCH 1/4] support nf4 channel wise quant & fix bug when blocksize>512 (#1817) (#1818) --- csrc/lc/dequantize_blockwise.cu | 84 ++++++++++++++++++++--- csrc/lc/quantize_blockwise.cu | 115 ++++++++++++++++++++++++-------- 2 files changed, 162 insertions(+), 37 deletions(-) diff --git a/csrc/lc/dequantize_blockwise.cu b/csrc/lc/dequantize_blockwise.cu index 8046c34ac..0bf76a163 100644 --- a/csrc/lc/dequantize_blockwise.cu +++ b/csrc/lc/dequantize_blockwise.cu @@ -201,7 +201,6 @@ template __global__ void kDequantizeBlockwise(const floa //template __global__ void kDequantizeBlockwise<__nv_bfloat16, 512, 64, 8, NF4>(const float *code, const unsigned char * A, const float * absmax, __nv_bfloat16 *out, int blocksize, int n); - template void dequantize_blockwise(const float *code, const unsigned char *A, const float *absmax, T *out, int blocksize, int n) { int num_blocks = n/blocksize; @@ -226,6 +225,50 @@ template void dequantize_blockwise(const float *code, const unsigned //template void dequantize_blockwise<__nv_bfloat16, FP4>(const float *code, const unsigned char *A, const float *absmax, __nv_bfloat16 *out, int blocksize, int n); //template void dequantize_blockwise<__nv_bfloat16, NF4>(const float *code, const unsigned char *A, const float *absmax, __nv_bfloat16 *out, int blocksize, int n); +template +__global__ void kDequantizeChannelwise(const unsigned char* A, + const float *absmax, + float *out, + int n, + int cout) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + int num = n / 2; + //int part_n = num / cout; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + float local_absmax = absmax[i%cout]; + int idx = 2*(i/cout)* cout + i%cout; + switch(DATA_TYPE) + { + case FP4: + out[i*2 + i%cout] = dDequantizeFP4Tree(A[i] >> 4, local_absmax); + out[i*2 + cout + i%cout] = dDequantizeFP4Tree(A[i] & 0x0F, local_absmax); + break; + case NF4: + out[idx] = dDequantizeNF4(A[i] >> 4)* local_absmax; + out[idx + cout] = dDequantizeNF4(A[i] & 0x0F)* local_absmax; + break; + } + __syncthreads(); + } +} + +template void dequantize_channelwise(const unsigned char *A, const float *absmax, T *out, int n, int cout) +{ + int max_threads = 1024; + int64_t block_size = + std::min(static_cast(n), + static_cast(max_threads/ 4)); + + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (n + block_size - 1) / block_size); + + kDequantizeChannelwise<<>>(A, absmax, out, n, cout); + CUDA_CHECK_RETURN(cudaPeekAtLastError()); +} + std::vector DequantizeBlockwise(const paddle::Tensor& input, const paddle::Tensor& code, const paddle::Tensor& absmax, int blocksize, std::string quant_type) { int64_t input_numel = input.numel(); int n = input_numel; @@ -234,23 +277,44 @@ std::vector DequantizeBlockwise(const paddle::Tensor& input, con out_shape = {input_numel * 2, 1}; n = n * 2; } + if (blocksize == -1) { + out_shape = {input.shape()[0] * 2, input.shape()[1]}; + } auto out = paddle::empty(out_shape, paddle::DataType::FLOAT32, input.place()); - if (quant_type == "8bit") - dequantize_blockwise(code.data(), input.data(), absmax.data(), out.data(), blocksize, n); - else if (quant_type == "nf4") - dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); - else if (quant_type == "fp4") - dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); - 
else - PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + if (blocksize == -1) { + if (quant_type == "8bit") + PD_THROW("blocksize is -1 only support NF4 and FP4."); + else + blocksize = n / absmax.numel() * 2; + + int cout = input.shape()[1]; + if (quant_type == "nf4") + dequantize_channelwise(input.data(), absmax.data(), out.data(), n, cout); + else if (quant_type == "fp4") + dequantize_channelwise(input.data(), absmax.data(), out.data(), n, cout); + else + PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + } else { + if (quant_type == "8bit") + dequantize_blockwise(code.data(), input.data(), absmax.data(), out.data(), blocksize, n); + else if (quant_type == "nf4") + dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); + else if (quant_type == "fp4") + dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); + else + PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + } return {out}; }; std::vector> GetDequantizeBlockwiseInferShape(const std::vector& input_shape, const std::vector& code_shape, const std::vector& abs_max_shape, int blocksize, std::string quant_type){ int64_t first_shape = input_shape[0] * input_shape[1] * 2; if (quant_type != "8bit") - return {{first_shape, 1}}; + if (blocksize != -1) + return {{first_shape, 1}}; + else + return {{input_shape[0] * 2, input_shape[1]}}; else return {input_shape}; } diff --git a/csrc/lc/quantize_blockwise.cu b/csrc/lc/quantize_blockwise.cu index d4f6ff2ca..e8e55b9d8 100644 --- a/csrc/lc/quantize_blockwise.cu +++ b/csrc/lc/quantize_blockwise.cu @@ -279,6 +279,7 @@ __global__ void kQuantizeBlockwise(const float * code, const T * __restrict__ A, #pragma unroll NUM_PER_TH for(int j = 0; j < NUM_PER_TH/2; j++) { + packed_4bit = 0; packed_4bit |= dQuantizeNF4(((float)vals[2*j])*local_abs_max) << 4; packed_4bit |= dQuantizeNF4(((float)vals[2*j+1])*local_abs_max); qvals[j] = packed_4bit; @@ -360,9 +361,39 @@ MAKE_kQuantizeBlockwise(__nv_bfloat16, 256, 2, NF4) MAKE_kQuantizeBlockwise(__nv_bfloat16, 128, 2, NF4) MAKE_kQuantizeBlockwise(__nv_bfloat16, 64, 2, NF4) +template +__global__ void kQuantizeChannelwise(const float *code, + const T* A, + unsigned char* out, + float *absmax, + int n, + int cout) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + int num = n / 2; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + int idx = 2*(i/cout)* cout + i%cout; + float local_absmax = absmax[i %cout]; + float inv_local_absmax = 1.0f/local_absmax; + unsigned char packed_4bit = 0; + switch(DATA_TYPE) + { + case FP4: + packed_4bit |= dQuantizeFP4(((float)A[idx])*inv_local_absmax) << 4; + packed_4bit |= dQuantizeFP4(((float)A[idx+cout])*inv_local_absmax); + out[i] = packed_4bit; + break; + case NF4: + packed_4bit |= dQuantizeNF4(((float)A[idx])*inv_local_absmax) << 4; + packed_4bit |= dQuantizeNF4(((float)A[idx+cout])*inv_local_absmax); + out[i] = packed_4bit; + break; + } + } +} -template void quantize_blockwise(const float *code, const paddle::Tensor& A, float *absmax, unsigned char *out, int blocksize, int n) +template void quantize_blockwise(const float *code, const paddle::Tensor& A, paddle::Tensor& absmax, unsigned char *out, int blocksize, int n, int channelwise) { typedef PDTraits traits_; typedef typename traits_::DataType DataType_; @@ -372,22 +403,43 @@ template void quantize_blockwise(const float num_blocks = n % blocksize == 0 ? 
num_blocks : num_blocks + 1; const DataType_* A_data = reinterpret_cast(A.data()); - if(blocksize == 4096) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 2048) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 1024) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 512) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 256) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 128) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 64) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else - PD_THROW("only support blocksize is [64, 128, 256, 512, 1024, 2048, 4096]."); + if (channelwise == 0) { + if(blocksize == 4096) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 2048) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 1024) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 512) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 256) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 128) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 64) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + } + else { + if (DATA_TYPE == General8bit) + PD_THROW("blocksize is -1 only support NF4 and FP4."); + + int cout = A.shape()[1]; + int max_threads = 1024; + + absmax = A.abs().max({0}); + + int64_t block_size = + std::min(static_cast(n), + static_cast(max_threads/ 4)); + + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (n + block_size - 1) / block_size); + + kQuantizeChannelwise<<>>( + code, A_data, out, absmax.data(), n, cout); + } CUDA_CHECK_RETURN(cudaPeekAtLastError()); @@ -395,38 +447,44 @@ template void quantize_blockwise(const float std::vector QuantizeBlockwise(const paddle::Tensor& input, const paddle::Tensor& code, int blocksize, std::string quant_type) { int n = input.numel(); + int channelwise = 0; std::vector out_shape = input.shape(); if (quant_type != "8bit") { // 4bit out_shape = {(n + 1) / 2, 1}; } + if (blocksize == -1){ + blocksize = input.shape()[0]; + out_shape = {input.shape()[0]/2, input.shape()[1]}; + channelwise = 1; + } auto out = paddle::empty(out_shape, paddle::DataType::UINT8, input.place()); int64_t absmax_shape = n / blocksize; auto absmax = paddle::empty({absmax_shape}, paddle::DataType::FLOAT32, input.place()); switch(input.type()) { case paddle::DataType::FLOAT32: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "nf4") { - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); } else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; case paddle::DataType::FLOAT16: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, 
channelwise); else if (quant_type == "nf4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; case paddle::DataType::BFLOAT16: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "nf4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; default: @@ -440,7 +498,10 @@ std::vector QuantizeBlockwise(const paddle::Tensor& input, const std::vector> GetQuantizeBlockwiseInferShape(const std::vector& input_shape, const std::vector& code_shape, int blocksize, std::string quant_type){ int64_t first_shape = (input_shape[0] * input_shape[1] + 1) / 2; if (quant_type != "8bit") - return {{first_shape, 1}}; + if (blocksize != -1) + return {{first_shape, 1}}; + else + return {{input_shape[0]/2, input_shape[1]}}; else return {input_shape}; } From dcf79e930694beded6965a21220f7001551f21eb Mon Sep 17 00:00:00 2001 From: Chang Xu Date: Tue, 26 Dec 2023 15:52:31 +0800 Subject: [PATCH 2/4] Add GroupWiseQuant & AWQ & AutoClip (#1821) --- paddleslim/quant/advanced/__init__.py | 8 +- paddleslim/quant/advanced/auto_clip.py | 155 ++++++++++++++++++ paddleslim/quant/advanced/awq_search.py | 78 +++++++++ paddleslim/quant/advanced/piecewise_search.py | 39 +++-- paddleslim/quant/advanced/smooth.py | 56 ++++--- paddleslim/quant/advanced/utils.py | 20 ++- paddleslim/quant/observers/__init__.py | 2 + paddleslim/quant/observers/groupwise.py | 112 +++++++++++++ 8 files changed, 428 insertions(+), 42 deletions(-) create mode 100644 paddleslim/quant/advanced/auto_clip.py create mode 100644 paddleslim/quant/advanced/awq_search.py create mode 100644 paddleslim/quant/observers/groupwise.py diff --git a/paddleslim/quant/advanced/__init__.py b/paddleslim/quant/advanced/__init__.py index 1f0744ecf..2e779a6e1 100644 --- a/paddleslim/quant/advanced/__init__.py +++ b/paddleslim/quant/advanced/__init__.py @@ -19,6 +19,8 @@ from . import sample from . import layerwise_quant_error from . import utils_layers +from . import awq_search +from . import auto_clip from .gptq import * from .smooth import * @@ -27,6 +29,8 @@ from .sample import * from .layerwise_quant_error import * from .utils_layers import * +from .awq_search import * +from .auto_clip import * __all__ = [] __all__ += gptq.__all__ @@ -35,4 +39,6 @@ __all__ += piecewise_search.__all__ __all__ += sample.__all__ __all__ += layerwise_quant_error.__all__ -__all__ += utils_layers.__all__ \ No newline at end of file +__all__ += utils_layers.__all__ +__all__ += awq_search.__all__ +__all__ += auto_clip.__all__ \ No newline at end of file diff --git a/paddleslim/quant/advanced/auto_clip.py b/paddleslim/quant/advanced/auto_clip.py new file mode 100644 index 000000000..696901110 --- /dev/null +++ b/paddleslim/quant/advanced/auto_clip.py @@ -0,0 +1,155 @@ +# Copyright (c) 2023 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import numpy as np +from .utils import fake_quant +from .metrics import mse_loss +from paddle.distributed.fleet.meta_parallel import ( + ColumnParallelLinear, + RowParallelLinear, +) +__all__ = ['AutoClip'] + +class AutoClip(nn.Layer): + """ + AutoClip from AWQ[https://arxiv.org/abs/2306.00978] + """ + def __init__( + self, + model, + weight_bits=8, + weight_quant_method='groupwise', + loss_function=mse_loss, + sample_function=None, + n_grid=20, + max_shrink=0.5, + n_sample_token=128, + group_size=-1, + ): + super(AutoClip, self).__init__() + self.model = model + self.weight_bits = weight_bits + self.weight_method = weight_quant_method + self.loss_function = loss_function + self.n_grid = n_grid + self.max_shrink = max_shrink + self.n_sample_token = n_sample_token + self.bnt = (1 << (self.weight_bits - 1)) - 1 + self.sampled_inputs = {} + self.sample_function = sample_function + self.group_size = group_size + + self._apply_hook() + + def _apply_hook(self): + self._forward_hook_list = [] + for _, sub_layer in self.model.named_sublayers(): + if type(sub_layer) in [ColumnParallelLinear, RowParallelLinear, paddle.nn.Linear]: + forward_pre_hook_handle = sub_layer.register_forward_pre_hook( + self._forward_pre_hook) + self._forward_hook_list.append(forward_pre_hook_handle) + + def _forward_pre_hook(self, layer, input): + self._sample_scale(input, layer.full_name()) + return input + + def _sample_scale(self, input, name): + input = input[0] if type(input) == tuple else input + input.stop_gradient = True + if name not in self.sampled_inputs: + self.sampled_inputs[name] = input + else: + if self.sample_function is not None: + self.sampled_inputs[name] = self.sample_function.sample( + input, self.sampled_inputs[name], name) + else: + self.sampled_inputs[name] = input + + + def auto_clip(self, group_size=128, oc_batch_size=1024): + """ + search clip scale for each layer and update the layer's weight + """ + for sub_name, sub_layer in self.model.named_sublayers(): + name = sub_layer.full_name() + if name not in self.sampled_inputs: + continue + print('AutoClipping', sub_name, name) + weight = sub_layer.weight.cast('float16') + weight_t = paddle.transpose(weight, perm=[1, 0]) + x = self.sampled_inputs[name].cast('float16') + x = x.reshape([-1, x.shape[-1]]) + x = x.reshape([1, x.shape[0], -1, group_size]) + x = x[:, 0::x.shape[1] // self.n_sample_token] + weight_t = weight_t.reshape([weight_t.shape[0], 1, -1, group_size]) + # fast test + # oc_batch_size = weight_t.shape[0] // 4 + oc_batch_size = oc_batch_size if weight_t.shape[0] % oc_batch_size == 0 else 128 # prevent OOM + assert weight_t.shape[0] % oc_batch_size == 0 + + w_all = weight_t + best_max_val_all = [] + + for i_b in range(weight_t.shape[0] // oc_batch_size): + w = w_all[i_b * oc_batch_size: (i_b + 1) * oc_batch_size] + + org_max_val = w.abs().max(axis=-1, keepdim=True) # co, 1, n_group, 1 + best_max_val = org_max_val.clone() + min_errs = 
paddle.ones_like(org_max_val, dtype='float16') * 1e9 + org_out = (x * w).sum(axis=-1) # co, n_token, n_group + for i_s in range(int(self.max_shrink * self.n_grid)): + max_val = org_max_val * (1 - i_s / self.n_grid) + max_val_tmp = max_val + cur_w = paddle.where(w > max_val_tmp, max_val_tmp, w) + cur_w = paddle.where(cur_w < - max_val_tmp, - max_val_tmp, cur_w) + quant_dequant_weight = fake_quant(cur_w, method='abs_max', weight_bits=4) + cur_out = (x * quant_dequant_weight).sum(axis=-1) + # co, 1, n_group, 1 + tmp = (cur_out - org_out).detach().clone() + err = paddle.pow(tmp, 2).mean(axis=1).reshape(min_errs.shape) + print('block {} search s {} err {}'.format(i_b, i_s, err.mean().item())) + del cur_w, cur_out, quant_dequant_weight, tmp + paddle.device.cuda.empty_cache() + + cur_best_idx = paddle.where(err < min_errs) + if cur_best_idx[0].shape[0] != 0: + min_errs[cur_best_idx] = err[cur_best_idx] + best_max_val[cur_best_idx] = max_val[cur_best_idx] + best_max_val_all.append(best_max_val) + + del org_out, org_max_val, min_errs, best_max_val, err, cur_best_idx, max_val_tmp, max_val, w + paddle.device.cuda.empty_cache() + + best_max_val = paddle.concat(best_max_val_all, axis=0) + best_max_val = paddle.squeeze(best_max_val, axis=1) + for param in sub_layer.parameters(include_sublayers=False): + if 'w_0' in param.name: + param_tmp = param.transpose(perm=[1, 0]).cast('float16') + tmp_shape = param_tmp.shape + param_tmp = param_tmp.reshape([best_max_val.shape[0], best_max_val.shape[1], -1]) + best_max_val = paddle.tile(best_max_val, repeat_times=(1, 1, param_tmp.shape[-1])) + param_tmp = paddle.where(param_tmp > best_max_val, best_max_val, param_tmp) + param_tmp = paddle.where(param_tmp < - best_max_val, - best_max_val, param_tmp) + param_tmp = param_tmp.reshape(tmp_shape).cast(param.dtype) + param_tmp = param_tmp.transpose(perm=[1, 0]) + paddle.assign(param_tmp, output=param) + del param_tmp + paddle.device.cuda.empty_cache() + break + + del best_max_val, weight_t, x, weight, self.sampled_inputs[name], w_all, best_max_val_all + paddle.device.cuda.empty_cache() + diff --git a/paddleslim/quant/advanced/awq_search.py b/paddleslim/quant/advanced/awq_search.py new file mode 100644 index 000000000..55151c4e8 --- /dev/null +++ b/paddleslim/quant/advanced/awq_search.py @@ -0,0 +1,78 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import numpy as np +from .utils import compute_scales +from .metrics import mse_loss +__all__ = ['AWQSearch'] + +class AWQSearch(): + def __init__(self, + n_grid=20, + bits_length=4, + weight_quant_method='groupwise', + group_size=128, + loss_function=mse_loss): + ''' + The implementation of AutoScale from AWQ(https://arxiv.org/pdf/2306.00978.pdf). 
+ ''' + self.n_grid = n_grid + self.bits_length = bits_length + self.weight_quant_method = weight_quant_method + self.bnt = (1 << (bits_length - 1)) - 1 + self.group_size = group_size + self.loss_function = loss_function + + def search(self, layer_name, sampled_input, act_abs_max, weight): + act = sampled_input + act.stop_gradient = True + print('[awq search] search input of %s' % layer_name) + dtype = weight.dtype + origin_out = paddle.matmul(act, weight) + best_error = float('inf') + best_ratio = -1 + best_scales = None + + for ratio in range(self.n_grid): + ratio = ratio * 1 / self.n_grid + act_abs_max_tmp = act_abs_max.detach().clone().cast('float32') + scales = paddle.clip(paddle.pow(act_abs_max_tmp, ratio), min=1e-4) + scales = scales / (scales.max() * scales.min()).sqrt() + scales = scales.cast(dtype) + new_weight = weight * scales.reshape([-1, 1]) + new_act = act / scales + quant_scale = compute_scales( + new_weight, method=self.weight_quant_method, group_size=self.group_size) + if self.weight_quant_method == 'groupwise': + quant_scale = paddle.repeat_interleave(quant_scale.cast('float32'), self.group_size, 0).cast(dtype) + quant_weight = paddle.clip( + paddle.round(new_weight / quant_scale * self.bnt), + -self.bnt - 1, self.bnt) + quant_dequant_weight = quant_weight / self.bnt * quant_scale + new_out = paddle.matmul(new_act, + quant_dequant_weight) + loss = self.loss_function(origin_out, new_out).numpy() + is_best = loss < best_error + if is_best: + print('find better ratio: {}, loss: {}'.format(ratio, loss)) + best_error = loss + best_ratio = ratio + best_scales = scales + + if best_scales is None: + best_scales = paddle.ones(scales.shape, dtype=dtype) + print('Cannot find better ratio.') + else: + print('Best ratio :{}, minimal loss : {}.'.format(best_ratio, best_error)) + return best_scales diff --git a/paddleslim/quant/advanced/piecewise_search.py b/paddleslim/quant/advanced/piecewise_search.py index 55678409b..e326f2e55 100644 --- a/paddleslim/quant/advanced/piecewise_search.py +++ b/paddleslim/quant/advanced/piecewise_search.py @@ -31,6 +31,8 @@ def __init__(self, search_scale_max=5., weight_quant_method='abs_max_channel_wise', act_quant_method='abs_max', + use_clip=False, + search_clip=False, loss_function=mse_loss): ''' PieceWiseSearch provides to search k_piece, alpha and scale. 
@@ -58,31 +60,36 @@ def __init__(self, self.act_quant_method = act_quant_method self.bnt = (1 << (bits_length - 1)) - 1 self.loss_function = loss_function + self.use_clip = use_clip + self.search_clip = search_clip def search(self, layer_name, sampled_input, act_abs_max, weight): act = sampled_input act.stop_gradient = True print('[smooth search] search input of %s' % layer_name) - + dtype = weight.dtype origin_out = paddle.matmul(act, weight) w_abs_max = weight.abs().max(axis=-1, keepdim=True) rw_abs_max = w_abs_max.reshape(act_abs_max.shape) - np_act_abs_max = np.array(act_abs_max) - np_rw_abs_max = np.array(rw_abs_max) - + smooth_scale_out = None global_loss = float('inf') best_scale = None - for k_piece in range(1, self.k_piece + 1): + if self.search_clip: + piece_range = [1] + list(range(1, self.k_piece + 1)) + else: + piece_range = list(range(1, self.k_piece + 1)) + + for k_idx, k_piece in enumerate(piece_range): if not self.search_piece: k_piece = self.k_piece print('Search {} Piece'.format(k_piece)) centroids, labels = k_means(act_abs_max, k_piece) piece = ['piece_{}'.format(a) for a in range(len(centroids))] for i in range(len(centroids)): - # print('search for piece {}; centroids value is {}'.format( - # piece[i], centroids[centroids.argsort()[i]].numpy())) + print('search for piece {}; centroids value is {}'.format( + piece[i], float(centroids[centroids.argsort()[i: i + 1]].cast('float32')))) alpha = self.search_alpha_min alpha_max = self.search_scale_max if self.search_scale_max is not None else self.search_alpha_max calibration_loss = float('inf') @@ -104,12 +111,16 @@ def search(self, layer_name, sampled_input, act_abs_max, weight): alpha = round(alpha, 2) if alpha < 1: - s = (np.power(np_act_abs_max, alpha) / np.power( - np_rw_abs_max, 1. 
- alpha)).clip(min=1e-5) - s = paddle.to_tensor(s, dtype='float32') + act_abs_max_tmp = act_abs_max.detach().clone() + s = paddle.clip(paddle.pow(act_abs_max_tmp, alpha) / paddle.pow( + rw_abs_max, 1 - alpha), min=1e-5) + + if self.use_clip or (k_piece == 1 and k_idx == 1 and self.search_clip): + s = paddle.clip(act_abs_max_tmp / paddle.max(act_abs_max / s), min=1) + del act_abs_max_tmp smooth_scale = s * mask_for_search else: - smooth_scale = alpha * mask_for_search + smooth_scale = paddle.to_tensor(alpha, dtype=dtype) * mask_for_search if smooth_scale_out is not None: mask_for_ones_new = paddle.where( @@ -145,9 +156,10 @@ def search(self, layer_name, sampled_input, act_abs_max, weight): calibration_loss = cur_loss final_smooth_scale = smooth_scale final_alpha = alpha + # print('Better alpha: {} loss: {}'.format(alpha, calibration_loss.cast('float32'))) - # print("Layer {} Piece {}, loss: {}, alpha : {}".format( - # layer_name, piece[i], float(calibration_loss), final_alpha)) + print("Layer {} Piece {}, loss: {}, alpha : {}".format( + layer_name, piece[i], float(calibration_loss.cast('float32')), final_alpha)) if smooth_scale_out is None: smooth_scale_out = final_smooth_scale else: @@ -160,4 +172,5 @@ def search(self, layer_name, sampled_input, act_abs_max, weight): print('Find Better K-Piece {}'.format(k_piece)) if not self.search_piece: break + return best_scale diff --git a/paddleslim/quant/advanced/smooth.py b/paddleslim/quant/advanced/smooth.py index e715788ed..5e32435f5 100644 --- a/paddleslim/quant/advanced/smooth.py +++ b/paddleslim/quant/advanced/smooth.py @@ -26,6 +26,8 @@ def __init__( model_config, alpha=0.5, smooth_all_linears=False, + start_sample_step=10000, + smooth_method='smoothquant', sample_function=None, search_function=None, ): ''' @@ -68,6 +70,8 @@ def __init__( self.smooth_all_linears = smooth_all_linears self.sample_function = sample_function self.search_function = search_function + self.start_sample_step = start_sample_step + self.smooth_method = smooth_method self.model.eval() self.step = 0 @@ -98,7 +102,6 @@ def _get_smooth_layers(self): self.ln_linear_dict, self.linear_ln_dict = get_ln_linear_info( self.layer_order, self.norm_flag, self.linear_flag, self.fused_qkv, self.parallel_ffn, self.skip_norm_list) - assert len(self.ln_linear_dict) > 0, 'No LN/Linear pair found' for key in self.ln_linear_dict: print('smooth pair LN {} : Linear {}'.format( @@ -147,29 +150,32 @@ def _forward_pre_hook(self, layer, input): def _sample_scale(self, input, ln_name): x = input[0] if type(input) == tuple else input x.stop_gradient = True - x_abs_max = x.abs().max(axis=1, keepdim=True) - x_abs_max = x_abs_max.max(axis=0) + + if self.smooth_method == 'smoothquant': + x_abs_max = x.abs().max(axis=1, keepdim=True) + x_abs_max = x_abs_max.max(axis=0) + elif self.smooth_method == 'awq': + x_abs_max = x.abs().reshape([-1, x.shape[-1]]) + x_abs_max = x_abs_max.mean(axis=0).reshape([1, -1]) + else: + raise NotImplementedError("To be implemented") if ln_name not in self.scale_dict: self.sampled_inputs[ln_name] = x self.scale_dict[ln_name] = x_abs_max else: - if self.sample_function is not None: + if self.sample_function is not None and self.step >= self.start_sample_step: self.sampled_inputs[ln_name] = self.sample_function.sample( x, self.sampled_inputs[ln_name], ln_name) else: self.sampled_inputs[ln_name] = x - tmp1 = paddle.concat([x_abs_max, self.scale_dict[ln_name]], axis=0) - self.scale_dict[ln_name] = tmp1.max(axis=0, keepdim=True) + if self.smooth_method == 'smoothquant': + tmp1 = 
paddle.concat([x_abs_max, self.scale_dict[ln_name]], axis=0) + self.scale_dict[ln_name] = tmp1.max(axis=0, keepdim=True) + elif self.smooth_method == 'awq': + tmp1 = paddle.concat([x_abs_max, self.scale_dict[ln_name]], axis=0) + self.scale_dict[ln_name] = tmp1.mean(axis=0, keepdim=True) - # per step print once - if self.print_step == self.step: - print('[Smooth] Step [{}]: {}. abs_min: {}, abs_max: {}'.format( - self.step, ln_name, - float(self.scale_dict[ln_name].cast("float32").min()), - float(self.scale_dict[ln_name].cast("float32").max()))) - if ln_name == list(self.linear_ln_dict.values())[-1]: - self.print_step += 1 def update_weight(self): @@ -181,24 +187,20 @@ def update_weight(self): if type(sub_layer) == ShiftSmoothHelpLayer: ln_name = layer_name if ln_name is not None: - act_abs_max = self.scale_dict[ln_name].cast("float32") - sampled_input = self.sampled_inputs[ln_name].cast("float32") + act_abs_max = self.scale_dict[ln_name].cast("float16") + sampled_input = self.sampled_inputs[ln_name].cast("float16") for param in sub_layer.parameters(include_sublayers=False): if 'w_0' in param.name: - weight = param.cast("float32") + # weight = param.cast("float32") if self.search_function is not None: s = self.search_function.search( - layer_name, sampled_input, act_abs_max, weight) + layer_name, sampled_input, act_abs_max, param.cast("float16")) else: - w_abs_max = weight.abs().max(axis=-1, keepdim=True) + w_abs_max = param.abs().max(axis=-1, keepdim=True) rw_abs_max = w_abs_max.reshape(act_abs_max.shape) - act_abs_max_np = act_abs_max.numpy() - weight_abs_max_np = rw_abs_max.numpy() - s = ( - np.power(act_abs_max_np, self.alpha) / np.power( - weight_abs_max_np, 1 - self.alpha)).clip( - min=1e-5) - s = paddle.to_tensor(s, dtype="float32") + act_abs_max_tmp = act_abs_max.detach().clone() + s = paddle.clip(paddle.pow(act_abs_max_tmp, self.alpha) / paddle.pow( + rw_abs_max, 1 - self.alpha), min=1e-5) self.smooth_scale_dict[ln_name] = s.cast(param.dtype) break @@ -273,4 +275,4 @@ def update_weight(self): def _remove_hook(self): for hook in self._forward_hook_list: hook.remove() - self._forward_hook_list = [] + self._forward_hook_list = [] \ No newline at end of file diff --git a/paddleslim/quant/advanced/utils.py b/paddleslim/quant/advanced/utils.py index 703fc5e1c..ff77462b2 100644 --- a/paddleslim/quant/advanced/utils.py +++ b/paddleslim/quant/advanced/utils.py @@ -38,7 +38,7 @@ def k_means(weight, n_clusters, init='k-means++', max_iter=300): return paddle.to_tensor(centroids.flatten()), paddle.to_tensor(labels) -def compute_scales(x, method='abs_max'): +def compute_scales(x, method='abs_max', group_size=-1): if method == 'abs_max': quant_scale = float(paddle.max(paddle.abs(x.flatten()))) quant_scale = 1e-8 if quant_scale == 0.0 else quant_scale @@ -52,8 +52,26 @@ def compute_scales(x, method='abs_max'): 0, dtype=x.dtype), paddle.to_tensor(1e-8, dtype=x.dtype), quant_scale) + elif method == 'groupwise': + input_shape = x.shape + input_processed = x.transpose([1, 0]).reshape( + [input_shape[1], input_shape[0] // group_size, group_size]) + quant_scale = paddle.max( + paddle.abs(input_processed), axis=2) + quant_scale = paddle.where(quant_scale == paddle.to_tensor(0, dtype=x.dtype), + paddle.to_tensor(1e-8, dtype=x.dtype), quant_scale) + quant_scale = quant_scale.transpose([1, 0]) + return quant_scale +def fake_quant(x, method='abs_max', weight_bits=8, group_size=-1): + bnt = (1 << (weight_bits - 1)) - 1 + quant_scale = compute_scales(x, method=method, group_size=group_size) + quant_value = 
paddle.clip( + paddle.round(x / quant_scale * bnt), -bnt - 1, bnt) + quant_dequant_value = quant_value / bnt * quant_scale + return quant_dequant_value + def find_parent_layer_and_sub_name(model, name): last_idx = 0 diff --git a/paddleslim/quant/observers/__init__.py b/paddleslim/quant/observers/__init__.py index 7ab3b723e..0b7970ba8 100644 --- a/paddleslim/quant/observers/__init__.py +++ b/paddleslim/quant/observers/__init__.py @@ -20,6 +20,7 @@ from .abs_max import AbsmaxObserver from .mse_weight import MSEChannelWiseWeightObserver from .abs_max_weight import AbsMaxChannelWiseWeightObserver +from .groupwise import GroupWiseWeightObserver __all__ = [ "HistObserver", @@ -31,4 +32,5 @@ "AbsmaxObserver", "MSEChannelWiseWeightObserver", "AbsMaxChannelWiseWeightObserver", + "GroupWiseWeightObserver" ] diff --git a/paddleslim/quant/observers/groupwise.py b/paddleslim/quant/observers/groupwise.py new file mode 100644 index 000000000..1db2067c6 --- /dev/null +++ b/paddleslim/quant/observers/groupwise.py @@ -0,0 +1,112 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +from .channel_wise import ChannelWiseObserver +from paddle.quantization.factory import ObserverFactory + + +class GroupWiseWeightObserver(ObserverFactory): + r""" + It collects channel-wise maximum absolute values of target weights. + Args: + bit_length(int, optional): Number of bits to represent an quantized integer in binary. + dtype(str, optional): The data type of input tensor. + name (str, optional): This parameter is used by developers to print debugging information. \ + For details, please refer to :ref:`api_guide_Name`. Default is None. + Examples: + .. 
code-block:: python + from paddle.quantization import QuantConfig + from paddle.quantization.quanters import AbsMaxChannelWiseWeightObserver + quanter = AbsMaxChannelWiseWeightObserver() + q_config = QuantConfig(activation=None, weight=quanter) + """ + + def __init__(self, quant_bits=8, group_size=128): + super(GroupWiseWeightObserver, self).__init__( + quant_bits=quant_bits, + group_size=group_size) + + def _get_class(self): + return GroupWiseWeightObserverLayer + + +class GroupWiseWeightObserverLayer(ChannelWiseObserver): + def __init__(self, layer, quant_bits=8, group_size=128): + super(GroupWiseWeightObserverLayer, self).__init__( + layer, + quant_bits=quant_bits, + sign=True, + symmetric=True, ) + self.quant_bits = quant_bits + self.group_size = group_size + self.qmin, self.qmax = self.qmin_qmax + self._layer = layer + self._max = None + self._scale = None + self._zero_point = None + + def forward(self, inputs): + self._max = self._cal_abs_max(inputs) + return inputs + + def _cal_abs_max(self, inputs): + """ Use group_size to group the input, then use the + absmax method to calculate the scale + """ + input_shape = inputs.shape + assert self.group_size == 64 or self.group_size == 128, \ + "group_size only support 64 or 128" + assert inputs.shape[0] % self.group_size == 0, \ + "group_size must be a factor of input channels" + assert len(inputs.shape) == 2, \ + "Currently only support 2D tensor" + input_processed = inputs.transpose([1, 0]).reshape( + [input_shape[1], input_shape[0] // self.group_size, self.group_size]) + + abs_max_values = paddle.max( + paddle.abs(input_processed), axis=2).cast("float32") + # "abs_max_values < 1e-8" in bfloat16 type? + abs_max_values = paddle.where(abs_max_values == np.float32(0), + np.float32(1e-8), abs_max_values) + abs_max_values = abs_max_values.transpose([1, 0]) + return abs_max_values + + def min_value(self) -> float: + return 0. + + def max_value(self) -> float: + return self._max + + def cal_thresholds(self): + """ Compute thresholds for MAX function. + """ + if self._scale is None: + self._scale = self._max + self._zero_point = paddle.zeros_like(self._scale) + + def scales(self): + """ Return output scales. + """ + if self._scale is None: + self.cal_thresholds() + return self._scale + + def zero_points(self): + """ Return output zero points. 
+ """ + if self._zero_point is None: + self.cal_thresholds() + return self._zero_point From 521157e390aa8bca62953e251495257e334a9477 Mon Sep 17 00:00:00 2001 From: Chang Xu Date: Thu, 28 Dec 2023 21:36:33 +0800 Subject: [PATCH 3/4] [Cherry-Pick]Cp fit paddle26 (#1823) --- paddleslim/quant/advanced/gptq.py | 19 +++++++++++++------ paddleslim/quant/advanced/piecewise_search.py | 3 +++ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/paddleslim/quant/advanced/gptq.py b/paddleslim/quant/advanced/gptq.py index 96566858f..5ae47205c 100644 --- a/paddleslim/quant/advanced/gptq.py +++ b/paddleslim/quant/advanced/gptq.py @@ -106,8 +106,9 @@ def fasterquant(self, H = self.hessian del self.hessian dead = paddle.where(paddle.diag(H) == 0) - H[dead, dead] = 1 - W[:, dead] = 0 + if dead[0].shape[0] != 0: + H[dead, dead] = 1 + W[:, dead] = 0 del dead if actorder: perm = paddle.argsort(paddle.diag(H), descending=True) @@ -122,9 +123,15 @@ def fasterquant(self, damp = percdamp * paddle.mean(paddle.diag(H)) diag = paddle.arange(self.columns) H[diag, diag] += damp - - H = paddle.inverse(H) - H = paddle.linalg.cholesky(H, upper=True) + try: + H = paddle.inverse(H) + H = paddle.linalg.cholesky(H, upper=True) + except: + print('We skip GPTQ this layer now.') + print( + 'If you want GPTQ this layer, please try setting damp_percent larger or increasing the number of samples.' + ) + return Hinv = H for i1 in range(0, self.columns, blocksize): @@ -182,4 +189,4 @@ def fasterquant(self, self.quantized = True del H, Q, Hinv, W, Losses - paddle.device.cuda.empty_cache() + paddle.device.cuda.empty_cache() \ No newline at end of file diff --git a/paddleslim/quant/advanced/piecewise_search.py b/paddleslim/quant/advanced/piecewise_search.py index e326f2e55..a95b2a1c7 100644 --- a/paddleslim/quant/advanced/piecewise_search.py +++ b/paddleslim/quant/advanced/piecewise_search.py @@ -97,6 +97,8 @@ def search(self, layer_name, sampled_input, act_abs_max, weight): mask_for_search = paddle.where(labels == centroids.argsort()[i], 1., 0.) mask_for_ones = paddle.where(mask_for_search == 0., 1., 0.) + mask_for_search = mask_for_search.cast(dtype) + mask_for_ones = mask_for_ones.cast(dtype) while alpha <= alpha_max: if alpha < 1: @@ -125,6 +127,7 @@ def search(self, layer_name, sampled_input, act_abs_max, weight): if smooth_scale_out is not None: mask_for_ones_new = paddle.where( smooth_scale_out == 0., 1., 0.) 
+ mask_for_ones_new = mask_for_ones_new.cast(dtype) mask_for_ones *= mask_for_ones_new smooth_scale_ = smooth_scale_out + smooth_scale smooth_scale_tmp = smooth_scale_ + mask_for_ones From 3c84fbf4a2fb24ea2913b4c1694b051293938a65 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Sun, 18 Feb 2024 10:51:04 +0000 Subject: [PATCH 4/4] =?UTF-8?q?=E7=9B=AE=E6=A0=87=E6=A3=80=E6=B5=8B?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E7=A6=BB=E7=BA=BF=E9=87=8F=E5=8C=96=E7=A4=BA?= =?UTF-8?q?=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../detection/configs/yolov3_r50vd_dcn.yml | 30 +++ .../detection/README.md | 174 ++++++++++++++++-- .../detection/configs/picodet_s_analysis.yaml | 12 +- .../detection/configs/ppyoloe_s_ptq.yaml | 4 +- .../detection/configs/yolov3_r50vd_dcn.yaml | 37 ++++ 5 files changed, 237 insertions(+), 20 deletions(-) create mode 100644 example/auto_compression/detection/configs/yolov3_r50vd_dcn.yml create mode 100644 example/post_training_quantization/detection/configs/yolov3_r50vd_dcn.yaml diff --git a/example/auto_compression/detection/configs/yolov3_r50vd_dcn.yml b/example/auto_compression/detection/configs/yolov3_r50vd_dcn.yml new file mode 100644 index 000000000..f7498dabb --- /dev/null +++ b/example/auto_compression/detection/configs/yolov3_r50vd_dcn.yml @@ -0,0 +1,30 @@ +metric: COCO +num_classes: 80 + +# Datset configuration +TrainDataset: + !COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ +EvalDataset: + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ + +eval_height: &eval_height 608 +eval_width: &eval_width 608 +eval_size: &eval_size [*eval_height, *eval_width] + +worker_num: 0 + +EvalReader: + inputs_def: + image_shape: [1, 3, *eval_height, *eval_width] + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: *eval_size, keep_ratio: False} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_size: 4 diff --git a/example/post_training_quantization/detection/README.md b/example/post_training_quantization/detection/README.md index f590606dd..62ce6402d 100644 --- a/example/post_training_quantization/detection/README.md +++ b/example/post_training_quantization/detection/README.md @@ -17,35 +17,37 @@ ## 1. 简介 本示例将以目标检测模型PP-YOLOE和PicoDet为例,介绍如何使用PaddleDetection中Inference部署模型,使用离线量化功能进行压缩,并使用敏感度分析功能提升离线量化精度。 +注意:[Paddle-Inference-demo/c++/gpu/yolov3](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/python/gpu/yolov3)使用量化校准表会有精度不对齐的情况,可对yolov3_r50vd_dcn_270e_coco模型进行离线量化。 ## 2.Benchmark | 模型 | 策略 | 输入尺寸 | mAPval
0.5:0.95 | 预测时延FP32
(ms) | 预测时延FP16
(ms) | 预测时延INT8
(ms) | 配置文件 | Inference模型 | | :-------- |:-------- |:--------: | :---------------------: | :----------------: | :----------------: | :---------------: | :-----------------------------: | :-----------------------------: | -| PP-YOLOE-s | Base模型 | 640*640 | 43.1 | 11.2ms | 7.7ms | - | - | [Model](https://bj.bcebos.com/v1/paddle-slim-models/act/ppyoloe_crn_s_300e_coco.tar) | -| PP-YOLOE-s | 离线量化 | 640*640 | 42.6 | - | - | 6.7ms | - | [Model](https://bj.bcebos.com/v1/paddle-slim-models/act/ppyoloe_s_ptq.tar) | +| yolov3_r50vd_dcn_270e_coco | Base模型 | 608*608 | 40.6 | 92.2ms | 41.3ms | - | - | [Model](https://paddle-inference-dist.bj.bcebos.com/Paddle-Inference-Demo/yolov3_r50vd_dcn_270e_coco.tgz) | +| yolov3_r50vd_dcn_270e_coco | 离线量化 | 608*608 | 40.3 | - | - | 27.9ms | - | | | | | | | | | | | | -| PicoDet-s | Base模型 | 416*416 | 32.5 | - | - | - | - | [Model](https://paddledet.bj.bcebos.com/deploy/Inference/picodet_s_416_coco_lcnet.tar) | -| PicoDet-s | 离线量化(量化分析前) | 416*416 | 0.0 | - | - | - | - | - | -| PicoDet-s | 离线量化(量化分析后) | 416*416 | 24.9 | - | - | - | - | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/picodet_s_ptq.tar) | +| PicoDet-s | Base模型 | 416*416 | 32.5 | 82.5ms | 59.7ms | - | - | [Model](https://paddledet.bj.bcebos.com/deploy/Inference/picodet_s_416_coco_lcnet.tar) | +| PicoDet-s | 离线量化(量化分析前) | 416*416 | 0.0 | - | - | 39.1ms | - | - | +| PicoDet-s | 离线量化(量化分析后) | 416*416 | 24.9 | - | - | 64.8ms | - | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/picodet_s_ptq.tar) | +mAP较低,导致目标框增多,NMS会增加耗时。 - mAP的指标均在COCO val2017数据集中评测得到,IoU=0.5:0.95。 - +测速环境:Tesla T4,TensorRT 8.6.1,CUDA 11.2,batch_size=1,cudnn 8.2.0 Intel(R)Xeon(R)Gold 6271C CPU ## 3. 离线量化流程 #### 3.1 准备环境 -- PaddlePaddle >= 2.3 (可从[Paddle官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)下载安装) -- PaddleSlim >= 2.3 +- PaddlePaddle == 2.6 (可从[Paddle官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)下载安装) +- PaddleSlim 2.6 - PaddleDet >= 2.4 - opencv-python 安装paddlepaddle: ```shell # CPU -pip install paddlepaddle -# GPU -pip install paddlepaddle-gpu +python -m pip install paddlepaddle==2.6.0 -i https://pypi.tuna.tsinghua.edu.cn/simple +# GPU 以cuda11.2为例子 +python -m pip install paddlepaddle-gpu==2.6.0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html ``` 安装paddleslim: @@ -116,6 +118,12 @@ python post_quant.py --config_path=./configs/ppyoloe_s_ptq.yaml --save_dir=./ppy export CUDA_VISIBLE_DEVICES=0 python post_quant.py --config_path=./configs/picodet_s_ptq.yaml --save_dir=./picodet_s_ptq ``` +- yolov3_r50vd_dcn_270e_coco: + +``` +export CUDA_VISIBLE_DEVICES=0 +python post_quant.py --config_path=./configs/yolov3_r50vd_dcn.yaml --save_dir=./yolov3_r50vd_dcn_270e_coco_ptq +``` #### 3.5 测试模型精度 @@ -125,12 +133,21 @@ python post_quant.py --config_path=./configs/picodet_s_ptq.yaml --save_dir=./pic export CUDA_VISIBLE_DEVICES=0 python eval.py --config_path=./configs/ppyoloe_s_ptq.yaml ``` +ppyoloe_s这个模型测试不出来精度,因为没有NMS +``` +export CUDA_VISIBLE_DEVICES=0 +python eval.py --config_path=./configs/picodet_s_ptq.yaml +``` +``` +export CUDA_VISIBLE_DEVICES=0 +python eval.py --config_path=./configs/yolov3_r50vd_dcn.yaml +``` **注意**: - 要测试的模型路径可以在配置文件中`model_dir`字段下进行修改。 #### 3.6 提高离线量化精度 
-本节介绍如何使用量化分析工具提升离线量化精度。离线量化功能仅需使用少量数据,且使用简单、能快速得到量化模型,但往往会造成较大的精度损失。PaddleSlim提供量化分析工具,会使用接口```paddleslim.quant.AnalysisPTQ```,可视化展示出不适合量化的层,通过跳过这些层,提高离线量化模型精度。```paddleslim.quant.AnalysisPTQ```详解见[AnalysisPTQ.md](../../../docs/zh_cn/tutorials/quant/AnalysisPTQ.md)。 +本节介绍如何使用量化分析工具提升离线量化精度。离线量化功能仅需使用少量数据,且使用简单、能快速得到量化模型,但往往会造成较大的精度损失。PaddleSlim提供量化分析工具,会使用接口```paddleslim.quant.AnalysisPTQ```,可视化展示出不适合量化的层,通过跳过这些层,提高离线量化模型精度。```paddleslim.quant.AnalysisPTQ```详解见[AnalysisPTQ.md](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/tutorials/quant/post_training_quantization.md)。 经过多个实验,包括尝试多种激活算法(avg,KL等)、weight的量化方式(abs_max,channel_wise_abs_max),对PicoDet-s进行离线量化后精度均为0,以PicoDet-s为例,量化分析工具具体使用方法如下: @@ -171,6 +188,139 @@ python post_quant.py --config_path=./configs/picodet_s_analyzed_ptq.yaml --save_ ## 4.预测部署 预测部署可参考[Detection模型自动压缩示例](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/auto_compression/detection) +量化模型可在GPU上可以使用TensorRT进行预测,在CPU上可以使用MKLDNN进行预测。 + +以下字段可用于配置预测参数: + +| 参数名 | 含义 | +|:------:|:------:| +| model_path | inference 模型文件所在目录,该目录下需要有文件 model.pdmodel 和 model.pdiparams 两个文件 | +| reader_config | eval时模型reader的配置文件路径 | +| image_file | 如果只测试单张图片效果,直接根据image_file指定图片路径 | +| device | 使用GPU或者CPU预测,可选CPU/GPU | +| use_trt | 是否使用 TesorRT 预测引擎 | +| use_mkldnn | 是否启用```MKL-DNN```加速库,注意```use_mkldnn```与```use_gpu```同时为```True```时,将忽略```enable_mkldnn```,而使用```GPU```预测 | +| cpu_threads | CPU预测时,使用CPU线程数量,默认10 | +| precision | 预测精度,包括`fp32/fp16/int8` | +| include_nms | 是否包含nms,如果不包含nms,则设置False,如果包含nms,则设置为True | +| use_dynamic_shape | 是否使用动态shape,如果使用动态shape,则设置为True,否则设置为False | +| img_shape | 输入图片的大小。这里默认为640,意味着图像将被调整到640*640 | +| trt_calib_mode | 如果模型是通过TensorRT离线量化校准生成的,那么需要将此参数设置为True。| + +-TesorRT预测示例: + +yolov3_r50vd_dcn_270e_coco模型 +```shell +python paddle_inference_eval.py \ + --model_path=yolov3_r50vd_dcn_270e_coco \ + --reader_config=configs/yolov3_r50vd_dcn.yml \ + --use_trt=True \ + --precision=fp32 \ + --include_nms=True \ + --benchmark=True +``` +```shell +python paddle_inference_eval.py \ + --model_path=yolov3_r50vd_dcn_270e_coco_ptq \ + --reader_config=configs/yolov3_r50vd_dcn.yml \ + --use_trt=True \ + --precision=int8 \ + --include_nms=True \ + --benchmark=True +``` +picodet_s模型 +```shell +python paddle_inference_eval.py \ + --model_path=picodet_s_416_coco_lcnet \ + --reader_config=configs/picodet_reader.yml \ + --use_trt=True \ + --precision=fp16 \ + --include_nms=True \ + --benchmark=True +``` +量化分析前 +```shell +python paddle_inference_eval.py \ + --model_path=picodet_s_ptq \ + --reader_config=configs/picodet_reader.yml \ + --use_trt=True \ + --precision= \ + --include_nms=True \ + --benchmark=True +``` +量化分析后 +```shell +python paddle_inference_eval.py \ + --model_path=picodet_s_analyzed_ptq_out \ + --reader_config=configs/picodet_reader.yml \ + --use_trt=True \ + --precision=int8 \ + --include_nms=True \ + --benchmark=True +``` +#### 4.1 C++部署 +请参考[YOLOv3推理](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/c%2B%2B/gpu/yolov3) + +编译样例 +- 文件yolov3_test.cc改成PicoDet-s.cc,为预测的样例程序(程序中的输入为固定值,如果您有opencv或其他方式进行数据读取的需求,需要对程序进行一定的修改)。 +- 脚本compile.sh包含了第三方库、预编译库的信息配置。 +- 脚本run.sh为一键运行脚本。 +编译前,需要根据自己的环境修改compile.sh中的相关代码配置依赖库: + +```shell +# 编译的 demo 名称 +DEMO_NAME=picoDet-s + +# 根据预编译库中的version.txt信息判断是否将以下三个标记打开 +WITH_MKL=ON +WITH_GPU=ON +USE_TENSORRT=ON + +# 配置预测库的根目录 +LIB_DIR=${work_path}/../lib/paddle_inference + +# 如果上述的WITH_GPU 或 USE_TENSORRT设为ON,请设置对应的CUDA, CUDNN, TENSORRT的路径。 +CUDNN_LIB=/usr/lib/x86_64-linux-gnu/ 
+CUDA_LIB=/usr/local/cuda/lib64 +TENSORRT_ROOT=/usr/local/TensorRT-7.1.3.4 +``` +运行bash compile.sh编译样例 + +- 运行样例 +使用原生GPU运行样例 +```shell +./build/picodet-s --model_file picodet_s_416_coco_lenet/model.pdmodel --params_file picodet_s_416_coco_lenet/model.pdiparams +``` +使用Trt FP32运行样例 +```shell +./build/picodet-s --model_file picodet_s_416_coco_lenet/model.pdmodel --params_file picodet_s_416_coco_lenet/model.pdiparams --run_mode=trt_fp32 +``` +使用Trt FP16运行样例 +```shell +./build/picodet-s --model_file picodet_s_416_coco_lenet/model.pdmodel --params_file picodet_s_416_coco_lenet/model.pdiparams --run_mode=trt_fp16 +``` +使用Trt Int8运行样例 +在使用Trt Int8运行样例时,相同的运行命令需要执行两次。 +生成量化校准表 +```shell +./build/picodet-s --model_file picodet_s_416_coco_lcnet/model.pdmodel --params_file picodet_s_416_coco_lcnet/model.pdiparams --run_mode=trt_int8 +``` +加载校准表预测的log: +```shell +I0623 08:40:49.386909 107053 tensorrt_engine_op.h:159] This process is generating calibration table for Paddle TRT int8... +I0623 08:40:49.387279 107057 tensorrt_engine_op.h:352] Prepare TRT engine (Optimize model structure, Select OP kernel etc). This process may cost a lot of time. +I0623 08:41:13.784473 107053 analysis_predictor.cc:791] Wait for calib threads done. +I0623 08:41:14.419198 107053 analysis_predictor.cc:793] Generating TRT Calibration table data, this may cost a lot of time... +``` +使用Trt dynamic shape运行样例(以Trt FP32为例) +```shell +./build/picodet-s --model_file picodet_s_416_coco_lcnet/model.pdmodel --params_file picodet_s_416_coco_lcnet/model.pdiparams --run_mode=trt_fp32 --use_dynamic_shape=1 +``` +| 模型 | trt-fp32 | trt-fp16 | trt-int8 | paddle_gpu fp32 | trt_fp32(dynamic_shape) | +|:------:|:------:|:------:|:------:| :------:| :------:| +| PicoDet-s | 3.05ms | 2.66ms | 2.40ms | 7.51ms | 2.82ms | +测速环境:Tesla T4,TensorRT 8.6.1,CUDA 11.6,batch_size=1,cudnn 8.4.0 Intel(R)Xeon(R)Gold 6271C CPU + ## 5.FAQ - 如果想对模型进行自动压缩,可进入[Detection模型自动压缩示例](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/auto_compression/detection)中进行实验。 diff --git a/example/post_training_quantization/detection/configs/picodet_s_analysis.yaml b/example/post_training_quantization/detection/configs/picodet_s_analysis.yaml index d3d6944c2..de8852c45 100644 --- a/example/post_training_quantization/detection/configs/picodet_s_analysis.yaml +++ b/example/post_training_quantization/detection/configs/picodet_s_analysis.yaml @@ -1,5 +1,5 @@ input_list: ['image', 'scale_factor'] -model_dir: ./picodet_s_416_coco_lcnet/ +model_dir: ./picodet_s_416_coco_lcnet model_filename: model.pdmodel params_filename: model.pdiparams save_dir: ./analysis_results @@ -26,11 +26,11 @@ EvalDataset: # Small Dataset to accelerate analysis # If not exist, delete the dict of FastEvalDataset -FastEvalDataset: - !COCODataSet - image_dir: val2017 - anno_path: annotations/small_instances_val2017.json - dataset_dir: /dataset/coco/ +# FastEvalDataset: +# !COCODataSet +# image_dir: val2017 +# anno_path: annotations/small_instances_val2017.json +# dataset_dir: /dataset/coco/ eval_height: &eval_height 416 diff --git a/example/post_training_quantization/detection/configs/ppyoloe_s_ptq.yaml b/example/post_training_quantization/detection/configs/ppyoloe_s_ptq.yaml index 3c8752652..fadf41a4d 100644 --- a/example/post_training_quantization/detection/configs/ppyoloe_s_ptq.yaml +++ b/example/post_training_quantization/detection/configs/ppyoloe_s_ptq.yaml @@ -1,4 +1,4 @@ -input_list: ['image'] +input_list: ['image','scale_factor'] arch: PPYOLOE # When export exclude_nms=True, need set arch: 
PPYOLOE model_dir: ./ppyoloe_crn_s_300e_coco model_filename: model.pdmodel @@ -29,4 +29,4 @@ EvalReader: - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} - Permute: {} - batch_size: 32 \ No newline at end of file + batch_size: 16 \ No newline at end of file diff --git a/example/post_training_quantization/detection/configs/yolov3_r50vd_dcn.yaml b/example/post_training_quantization/detection/configs/yolov3_r50vd_dcn.yaml new file mode 100644 index 000000000..7fdb52dbc --- /dev/null +++ b/example/post_training_quantization/detection/configs/yolov3_r50vd_dcn.yaml @@ -0,0 +1,37 @@ +input_list: ['image', 'scale_factor','im_shape'] +model_dir: ./yolov3_r50vd_dcn_270e_coco +model_filename: model.pdmodel +params_filename: model.pdiparams +metric: COCO +num_classes: 80 + +# Datset configuration +TrainDataset: + !COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ + +EvalDataset: + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ + +eval_height: &eval_height 608 +eval_width: &eval_width +eval_size: &eval_size [*eval_height, *eval_width] + +worker_num: 0 + +# preprocess reader in test +EvalReader: + inputs_def: + image_shape: [1, 3, *eval_height, *eval_width] + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: *eval_size, keep_ratio: False} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_size: 4 +
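
Usage sketch for the `GroupWiseWeightObserver` introduced in PATCH 2/4: a minimal example of plugging the observer into a `QuantConfig`, following the pattern shown in the observer's own docstring. The 4-bit setting and the weight-only scope are illustrative assumptions, not part of the patch itself.

```python
# Minimal sketch (assumes PaddleSlim is installed from this branch) of using the
# new GroupWiseWeightObserver for group-wise low-bit weight observation.
import paddle
from paddle.quantization import QuantConfig
from paddleslim.quant.observers import GroupWiseWeightObserver

# group_size must be 64 or 128 and must evenly divide the weight's input-channel
# dimension; both constraints are asserted inside GroupWiseWeightObserverLayer,
# which also only supports 2-D weight tensors.
weight_observer = GroupWiseWeightObserver(quant_bits=4, group_size=128)

# Observe weights only; activations stay unquantized in this sketch.
q_config = QuantConfig(activation=None, weight=weight_observer)
```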