[Inference] Update fakequant (#9140)

* add a8w8(fp8) a8w8c8(int8) quant_type support * add llama3.1 and qwen2 ptq config * reformat quantization.md and argument.py * update prepare data method for ceval ptq * fix wint4 config bug * use independent avg/abs_max observer * rename fp8 quant_type * update quantization.md * remove ceval in run_finetune.py
PaddlePaddle · Sep 14, 2024 · 0832b59 · 0832b59
1 parent e9338c2
commit 0832b59
Show file tree

Hide file tree

Showing 17 changed files with 359 additions and 103 deletions.
diff --git a/llm/config/llama/AdvertiseGen/w8a8_ptq_argument.json b/llm/config/llama/AdvertiseGen/w8a8_ptq_argument.json
@@ -21,6 +21,5 @@
   "smooth_piecewise_search": true,
   "smooth_k_piece": 3,
   "smooth_search_piece": true,
-  "act_quant_method": "avg",
-  "cachekv_quant_method": "avg_headwise"
+  "act_quant_method": "avg"
 }
diff --git a/llm/config/llama/AdvertiseGen/wfp8afp8_ptq_argument.json b/llm/config/llama/AdvertiseGen/wfp8afp8_ptq_argument.json
@@ -1,8 +1,6 @@
 {
   "model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-  "quant_type": "a8w8",
-  "use_fp8": "WA",
-  "fp8_type": ["e4m3", "e4m3"],
+  "quant_type": "a8w8_fp8",
   "per_device_train_batch_size": 8,
   "per_device_eval_batch_size": 8,
   "eval_accumulation_steps":16,
@@ -11,14 +9,13 @@
   "fp16": true,
   "fp16_opt_level": "O2",
   "dataset_name_or_path": "../dataset/AdvertiseGen",
-  "output_dir": "../output/llama3.1/w8a8_ptq_ckpts_AdvertiseGen",
+  "output_dir": "../output/llama3.1/wfp8afp8_ptq_ckpts_AdvertiseGen",
   "do_eval": true,
   "eval_with_do_generation": false,
   "do_ptq": true,
   "ptq_step": 16,
   "unified_checkpoint": false,
   "smooth": false,
   "weight_quant_method": "abs_max",
-  "act_quant_method": "abs_max",
-  "cachekv_quant_method": "abs_max"
+  "act_quant_method": "abs_max"
   }
diff --git a/llm/config/llama/ceval/w8a8_ptq_argument.json b/llm/config/llama/ceval/w8a8_ptq_argument.json
@@ -21,6 +21,5 @@
   "smooth_piecewise_search": true,
   "smooth_k_piece": 3,
   "smooth_search_piece": true,
-  "act_quant_method": "avg",
-  "cachekv_quant_method": "avg_headwise"
+  "act_quant_method": "avg"
 }
diff --git a/llm/config/llama/ceval/wfp8afp8_ptq_argument.json b/llm/config/llama/ceval/wfp8afp8_ptq_argument.json
@@ -1,7 +1,6 @@
 {
   "model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-  "quant_type": "a8w8",
-  "use_fp8": "WA",
+  "quant_type": "a8w8_fp8",
   "per_device_train_batch_size": 8,
   "per_device_eval_batch_size": 8,
   "eval_accumulation_steps":16,
@@ -18,6 +17,5 @@
   "unified_checkpoint": false,
   "smooth": false,
   "weight_quant_method": "abs_max",
-  "act_quant_method": "abs_max",
-  "cachekv_quant_method": "abs_max"
+  "act_quant_method": "abs_max"
   }
diff --git a/llm/config/llama/ceval_ptq_argument.json b/llm/config/llama/ceval_ptq_argument.json
diff --git a/llm/config/llama/fp8_ptq_argument.json b/llm/config/llama/fp8_ptq_argument.json
@@ -1,8 +1,6 @@
 {
   "model_name_or_path": "meta-llama/Meta-Llama-3-8B",
-  "quant_type": "W8A8",
-  "use_fp8": "WA",
-  "fp8_type": ["e4m3", "e4m3"],
+  "quant_type": "a8w8_fp8",
   "per_device_train_batch_size": 8,
   "per_device_eval_batch_size": 8,
   "eval_accumulation_steps":16,

diff --git a/llm/config/qwen/AdvertiseGen/w8a8_ptq_argument.json b/llm/config/qwen/AdvertiseGen/w8a8_ptq_argument.json
@@ -22,5 +22,5 @@
   "smooth_k_piece": 3,
   "smooth_search_piece": true,
   "act_quant_method": "abs_max",
-  "cachekv_quant_method": "abs_max_headwise"
+  "skip_list_names": ["down_proj"]
 }
diff --git a/llm/config/qwen/AdvertiseGen/w8a8c8_ptq_argument.json b/llm/config/qwen/AdvertiseGen/w8a8c8_ptq_argument.json
@@ -22,5 +22,6 @@
   "smooth_k_piece": 3,
   "smooth_search_piece": true,
   "act_quant_method": "abs_max",
-  "cachekv_quant_method": "abs_max_headwise"
+  "cachekv_quant_method": "abs_max_headwise",
+  "skip_list_names": ["down_proj"]
 }
diff --git a/llm/config/qwen/AdvertiseGen/wfp8afp8_ptq_argument.json b/llm/config/qwen/AdvertiseGen/wfp8afp8_ptq_argument.json
@@ -1,8 +1,6 @@
 {
   "model_name_or_path": "Qwen/Qwen2-7B-Instruct",
-  "quant_type": "a8w8",
-  "use_fp8": "WA",
-  "fp8_type": ["e4m3", "e4m3"],
+  "quant_type": "a8w8_fp8",
   "per_device_train_batch_size": 8,
   "per_device_eval_batch_size": 8,
   "eval_accumulation_steps":16,
@@ -20,5 +18,5 @@
   "smooth": false,
   "weight_quant_method": "abs_max",
   "act_quant_method": "abs_max",
-  "cachekv_quant_method": "abs_max"
+  "skip_list_names": ["down_proj"]
   }
diff --git a/llm/config/qwen/ceval/w8a8_ptq_argument.json b/llm/config/qwen/ceval/w8a8_ptq_argument.json
@@ -22,6 +22,5 @@
   "smooth_k_piece": 3,
   "smooth_search_piece": true,
   "act_quant_method": "abs_max",
-  "cachekv_quant_method": "abs_max_headwise",
   "skip_list_names": ["down_proj"]
 }
diff --git a/llm/config/qwen/ceval/wfp8afp8_ptq_argument.json b/llm/config/qwen/ceval/wfp8afp8_ptq_argument.json
@@ -1,7 +1,6 @@
 {
   "model_name_or_path": "Qwen/Qwen2-7B-Instruct",
-  "quant_type": "a8w8",
-  "use_fp8": "WA",
+  "quant_type": "a8w8_fp8",
   "per_device_train_batch_size": 8,
   "per_device_eval_batch_size": 8,
   "eval_accumulation_steps":16,
@@ -18,6 +17,5 @@
   "unified_checkpoint": false,
   "smooth": false,
   "weight_quant_method": "abs_max",
-  "act_quant_method": "abs_max",
-  "cachekv_quant_method": "abs_max"
+  "act_quant_method": "abs_max"
   }
diff --git a/llm/docs/quantization.md b/llm/docs/quantization.md
@@ -94,15 +94,19 @@ python  run_finetune.py ./config/llama/ptq_c8_argument.json
 python  run_finetune.py ./config/llama/fp8_ptq_argument.json
 ```
 
-### 2.9 量化参数介绍
+### 2.8 量化参数介绍
 
 <summary>&emsp; 量化参数（QuantArgument）</summary>
 
 <div>
 
-- `quant_type`: PTQ，QAT 量化类型，默认为 a8w8(不区分大小写)。支持 a8w8，a8w8c8，wint4/weight_only_int4，wint8/weight_only_int8：a8w8指对激活（输入）进行 8位量化，对模型权重进行 8位量化，具体量化类型通过`use_fp8`字段给出；a8w8c8指对激活、权重、kvcache 进行8位量化，具体量化类型通过`use_fp8`字段给出；wint4/weight_only_int4指仅对模型权重进行 INT4量化，后续使用 WeightOnly 进行推理；wint8/weight_only_int8指仅对模型权重进行 INT8量化，后续使用 WeightOnly 进行推理。
-- `use_fp8`: 是否使用 FP8 量化，默认为空字符串。输入`"WA"`(不区分大小写)则将权重和激活的8位量化转换为 FP8量化。
-- `fp8_type`: FP8量化类型，长度应与`use_fp8`相同。默认为`["e4m3","e4m3"]`。
+- `quant_type`: PTQ，QAT 量化类型，默认为 a8w8(不区分大小写)。支持 a8w8，a8w8c8，a8w8_fp8，wint4/weight_only_int4，wint8/weight_only_int8:
+    - a8w8指对激活（输入）进行 8位量化，对模型权重进行 INT8量化
+    - a8w8c8指对激活、权重、kvcache 进行 INT8量化
+    - a8w8_fp8指对激活、权重进行 FP8量化
+    - wint4/weight_only_int4指仅对模型权重进行 INT4量化，后续使用 WeightOnly 进行推理
+    - wint8/weight_only_int8指仅对模型权重进行 INT8量化，后续使用 WeightOnly 进行推理
+- `fp8_type`: FP8量化类型，指定 activation，weight 的 fp8类型，默认为`["e4m3","e4m3"]`。
 - `do_ptq`: 是否进行 PTQ 量化，默认为 False。
 - `weight_quant_method`: 权重量化方式，INT8量化可选 groupwise 或者 abs_max_channel_wise，FP8量化可选 abs_max 或 avg。
 - `act_quant_method`: 激活量化方式，INT8可选 avg 或者 abs_max，FP8量化可选 abs_max 或 avg。

diff --git a/llm/experimental/observer/abs_max.py b/llm/experimental/observer/abs_max.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle.quantization.factory import ObserverFactory
+
+from .uniform import UniformObserver
+
+
+class AbsmaxObserver(ObserverFactory):
+    r"""
+    Factory that pairs ``quant_bits`` with ``AbsmaxObserverLayer``.
+
+    The layer returned by ``_get_class`` tracks the running maximum absolute
+    value of the tensors it observes.
+
+    Args:
+        quant_bits(int, optional): Number of bits used to represent a quantized
+            value. Default is 8.
+
+    Examples:
+       .. code-block:: python
+            from paddle.quantization import QuantConfig
+            observer = AbsmaxObserver(quant_bits=8)
+            q_config = QuantConfig(activation=observer, weight=observer)
+    """
+
+    def __init__(self, quant_bits=8):
+        super().__init__(quant_bits=quant_bits)
+
+    def _get_class(self):
+        """Return the observer layer class this factory stands for."""
+        return AbsmaxObserverLayer
+
+
+class AbsmaxObserverLayer(UniformObserver):
+    """Observer layer that keeps a running maximum of ``abs(inputs)``."""
+
+    def __init__(self, layer, quant_bits=8):
+        super().__init__(quant_bits=quant_bits)
+        self._quant_bits = quant_bits
+        self._layer = layer
+        self._scale = None
+        self._zero_point = None
+        self._min = None
+        # The running maximum starts at 1e-7, so it is never below that value.
+        self._max = paddle.to_tensor(1e-7, dtype="float32")
+        self.step = 0
+
+    def forward(self, inputs):
+        """Fold ``inputs`` into the observed range and return them unchanged."""
+        observed_range = self.cal_min_max(inputs)
+        self._min, self._max = observed_range
+        return inputs
+
+    def cal_min_max(self, inputs):
+        """Return ``(0, running_abs_max)`` after taking ``inputs`` into account."""
+        magnitudes = paddle.abs(inputs.cast("float32"))
+        running_max = paddle.maximum(paddle.max(magnitudes), self._max)
+        return 0, running_max
+
+    def cal_thresholds(self):
+        """Compute scale and zero point unless a scale is already present."""
+        if self._scale is None:
+            self._scale, self._zero_point = self.cal_scales_zero_points()
+        else:
+            # A scale already exists: keep it and only pin the zero point to 0.
+            self._zero_point = 0
+
+    def min_value(self) -> float:
+        """Return the lower bound recorded by the last ``forward`` call."""
+        return self._min
+
+    def max_value(self) -> float:
+        """Return the running maximum absolute value."""
+        return self._max
+
+    def bit_length(self):
+        """Return the bit length of quantized data."""
+        return self._quant_bits
+
+    def quant_axis(self):
+        """Return the quantization axis, which is always -1 for this observer."""
+        return -1
+
+    def scales(self):
+        """Return the output scale, computing thresholds first if it is unset."""
+        if self._scale is None:
+            self.cal_thresholds()
+        return self._scale
+
+    def zero_points(self):
+        """Return the output zero point, computing thresholds first if it is unset."""
+        if self._zero_point is None:
+            self.cal_thresholds()
+        return self._zero_point
diff --git a/llm/experimental/observer/avg.py b/llm/experimental/observer/avg.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle.quantization.factory import ObserverFactory
+
+from .uniform import UniformObserver
+
+
+class AVGObserver(ObserverFactory):
+    r"""
+    Factory that pairs ``quant_bits`` with ``AVGObserverLayer``.
+
+    The layer returned by ``_get_class`` averages, over all observed batches,
+    the per-batch mean of each row's maximum absolute value.
+
+    Args:
+        quant_bits(int, optional): Number of bits used to represent a quantized
+            value. Default is 8.
+
+    Examples:
+       .. code-block:: python
+            from paddle.quantization import QuantConfig
+            observer = AVGObserver(quant_bits=8)
+            q_config = QuantConfig(activation=observer, weight=observer)
+    """
+
+    def __init__(self, quant_bits=8):
+        super().__init__(quant_bits=quant_bits)
+
+    def _get_class(self):
+        """Return the observer layer class this factory stands for."""
+        return AVGObserverLayer
+
+
+class AVGObserverLayer(UniformObserver):
+    """
+    Observer layer that averages per-batch absolute maxima.
+
+    Each ``forward`` call reduces its input to one number (the mean, over the
+    first axis, of every row's maximum absolute value) and appends it to a
+    history. Thresholds use the mean of that history as the maximum.
+    """
+
+    def __init__(
+        self,
+        layer,
+        quant_bits=8,
+    ):
+        super().__init__(quant_bits=quant_bits)
+        self._quant_bits = quant_bits
+        # ``layer`` is not used by this observer.
+        self._avg_list = []
+        # Initialise every attribute the accessors read, so that calling them
+        # before the first forward() does not fail on a missing attribute.
+        self._scale = None
+        self._zero_point = None
+        self._min = None
+        self._max = None
+        self._avg_min = 0
+        self._avg_max = None
+
+    def forward(self, inputs):
+        """Record the statistic for ``inputs`` and return them unchanged."""
+        # New data invalidates previously computed thresholds.
+        self._scale = None
+        self._zero_point = None
+        self._min = None
+        self._max = None
+        self._avg_min, self._avg_max = self.cal_min_max(inputs)
+        self._avg_list.append(self._avg_max)
+
+        return inputs
+
+    def cal_min_max(self, inputs):
+        """Return ``(0, stat)`` where ``stat`` is the mean of per-row abs maxima.
+
+        ``inputs`` is flattened to two dimensions, keeping the first axis.
+        """
+        abs_avg_value = paddle.abs(inputs.reshape((inputs.shape[0], -1)))
+        abs_avg_value = float(paddle.mean(paddle.max(abs_avg_value, axis=(1))))
+        return 0, abs_avg_value
+
+    def cal_thresholds(self):
+        """Compute scale and zero point from the recorded history.
+
+        Raises:
+            RuntimeError: If no input has been observed yet.
+        """
+        if self._scale is not None:
+            self._zero_point = 0
+            return
+        if not self._avg_list:
+            raise RuntimeError(
+                "AVGObserverLayer has not observed any input; "
+                "run forward() before requesting scales or zero points."
+            )
+        self._min, self._max = self._avg_min, paddle.mean(paddle.to_tensor(self._avg_list))
+        self._scale, self._zero_point = self.cal_scales_zero_points()
+
+    def min_value(self) -> float:
+        """Return the lower bound set by the last threshold computation."""
+        return self._min
+
+    def max_value(self) -> float:
+        """Return the upper bound set by the last threshold computation."""
+        return self._max
+
+    def bit_length(self):
+        """Return the bit length of quantized data."""
+        return self._quant_bits
+
+    def quant_axis(self):
+        """Return quantization axis."""
+        return -1
+
+    def scales(self):
+        """Return output scales, computing thresholds first if they are unset."""
+        if self._scale is None:
+            self.cal_thresholds()
+        return self._scale
+
+    def zero_points(self):
+        """Return output zero points, computing thresholds first if they are unset."""
+        if self._zero_point is None:
+            self.cal_thresholds()
+        return self._zero_point