Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[cherry-pick] fix QuantizeLinear pass and support reduce_max in quantization #44872

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion paddle/fluid/operators/fake_quantize_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,8 @@ struct FindMovingAverageAbsMaxFunctor {
void operator()(const DeviceContext &ctx,
const framework::Tensor &in_accum,
const framework::Tensor &in_state,
const framework::Tensor &cur_scale,
const T *cur_scale,
const float rate,
framework::Tensor *out_state,
framework::Tensor *out_accum,
framework::Tensor *out_scale);
Expand Down
29 changes: 26 additions & 3 deletions paddle/fluid/operators/quantize_linear_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ class QuantizeLinearOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]});
}
}
if (ctx->HasOutput("OutState")) {
ctx->SetOutputDim("OutState", {1});
}
if (ctx->HasOutput("OutAccum")) {
ctx->SetOutputDim("OutAccum", {1});
}
ctx->ShareLoD("X", /*->*/ "Y");
}

Expand All @@ -113,7 +119,25 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Y",
"(Tensor) Output of quantized low level tensor, "
"but also saved as float data type.");
AddOutput("OutScale", "(Tensor) Current scale").AsDispensable().AsExtra();
AddInput("InAccum", "Last accum.")
.AsDispensable()
.AsExtra(); // only qat use
AddInput("InState", "Last state.")
.AsDispensable()
.AsExtra(); // only qat use
AddOutput("OutState", "(Tensor) state buffer.")
.AsDispensable()
.AsExtra(); // only qat use
AddOutput("OutAccum", "(Tensor) accum buffer.")
.AsDispensable()
.AsExtra(); // only qat use
AddOutput("OutScale", "(Tensor) Current scale")
.AsDispensable()
.AsExtra(); // only qat use
AddAttr<float>("moving_rate",
"(float, default 0.9) moving rate.") // only qat use
.SetDefault(0.9)
.AsExtra();
AddAttr<int>("quant_axis",
"(int, default 0) The axis for quantization. "
"For conv2d, depthwise_conv2d, conv2d_transpose "
Expand Down Expand Up @@ -154,8 +178,7 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
"nearest ties to even and 1 is rounding to nearest "
"ties away from zero.but the received is %d",
round_type));
})
.AsExtra();
});
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
Expand Down
27 changes: 24 additions & 3 deletions paddle/fluid/operators/quantize_linear_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,31 @@ class QuantizeLinearKernel : public framework::OpKernel<T> {

if (quant_axis < 0) {
if (!is_test) {
auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_s = out_scale->mutable_data<T>(context.GetPlace());
// training
auto* in_accum = context.Input<framework::Tensor>("InAccum");
auto* in_state = context.Input<framework::Tensor>("InState");
auto cur_scale = memory::Alloc(dev_ctx, sizeof(T));
T* cur_scale_data = static_cast<T*>(cur_scale->ptr());

FindAbsMaxFunctor<DeviceContext, T>()(
dev_ctx, in->data<T>(), in->numel(), out_s);
dev_ctx, in->data<T>(), in->numel(), cur_scale_data);

auto* out_state = context.Output<framework::Tensor>("OutState");
auto* out_accum = context.Output<framework::Tensor>("OutAccum");
auto* out_scale = context.Output<framework::Tensor>("OutScale");
out_state->mutable_data<T>(context.GetPlace());
out_accum->mutable_data<T>(context.GetPlace());
out_scale->mutable_data<T>(context.GetPlace());
float moving_rate = context.Attr<float>("moving_rate");

FindMovingAverageAbsMaxFunctor<DeviceContext, T>()(dev_ctx,
*in_accum,
*in_state,
cur_scale_data,
moving_rate,
out_state,
out_accum,
out_scale);
ClipAndFakeQuantFunctor<DeviceContext, T>()(
dev_ctx, *in, *out_scale, bin_cnt, round_type, out);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -418,8 +418,7 @@ def quantize(self):
self._update_program()

# save out_threshold for quantized ops.
if not self._onnx_format:
self._save_output_threshold()
self._save_output_threshold()

if any(op_type in self._quantizable_op_type
for op_type in self._dynamic_quantize_op_type):
Expand Down Expand Up @@ -996,16 +995,23 @@ def _save_output_threshold(self):
'''
Save output threshold to the quantized op.
'''
self._calibration_scales = {}

def save_info(op_node, out_var_name, threshold_map, out_info_name,
quantized_type):
assert out_var_name in threshold_map, \
"The output ({}) of {} node does not have threshold.".format(
out_var_name, op_node.type)
op_node._set_attr(out_info_name, threshold_map[var_name])
op_node._set_attr("with_quant_attr", True)
if op_node.type in self._quantizable_op_type:
op._set_attr("quantization_type", quantized_type)
if self._onnx_format:
# For easy extension, each var_node gets its own dict that stores its quantization parameters.
self._calibration_scales[var_name] = {}
self._calibration_scales[var_name]['scale'] = threshold_map[
var_name]
else:
op_node._set_attr(out_info_name, threshold_map[var_name])
op_node._set_attr("with_quant_attr", True)
if op_node.type in self._quantizable_op_type:
op._set_attr("quantization_type", quantized_type)

def analysis_and_save_info(op_node, out_var_name):
argname_index = utils._get_output_name_index(op_node, out_var_name)
Expand Down
88 changes: 52 additions & 36 deletions python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
Original file line number Diff line number Diff line change
Expand Up @@ -1785,6 +1785,7 @@ class InsertQuantizeLinear(object):
equal to 0, it will quantize per channel; otherwise it will quantize per layer.
Default is -1.
channel_wise(bool, optional): Whether to quantize per channel or not. Default is False.
moving_rate(float, optional): The rate for the 'moving average' method. Default is 0.9.
is_test(bool, optional): Whether the quantization ops are inserted for testing (inference) rather than training. Default is True.
"""

Expand All @@ -1794,22 +1795,24 @@ def __init__(self,
quant_bits=8,
quant_axis=-1,
channel_wise=False,
moving_rate=0.9,
is_test=True):
self._place = place
self._scope = scope
self.quant_bits = quant_bits
self.quant_axis = quant_axis
self.channel_wise = channel_wise
self._is_test = is_test
self._moving_rate = moving_rate

def insert_quant_op(self, graph, var_node):
def insert_quant_op(self, graph, var_node, var_name=None):
assert var_node.is_var(), '{} is not a var'.format(var_node.name())

quant_var_node = graph.create_var_node(name=self._quantized_var_name(
var_node.name()),
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=var_node.dtype())
var_name = var_node.name() if not var_name else var_name
quant_var_node = graph.create_var_node(
name=self._quantized_var_name(var_name),
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=var_node.dtype())
data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32'
if self.channel_wise:
Expand All @@ -1821,7 +1824,7 @@ def insert_quant_op(self, graph, var_node):
scale_var_type = var_node.type()
init_scale_value = np.array([_SCALE_DEFAULT_VALUE], dtype=data_type)
scale_var_node = graph.create_persistable_node(
name=self._quantized_scale_name(var_node.name()),
name=self._quantized_scale_name(var_name),
var_type=scale_var_type,
shape=[scale_var_shape],
var_dtype=var_node.dtype())
Expand All @@ -1844,13 +1847,39 @@ def insert_quant_op(self, graph, var_node):
inputs["ZeroPoint"] = zero_point_node

attrs = {"quant_axis": self.quant_axis, "bit_length": self.quant_bits}
attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
outputs = {"Y": quant_var_node}
if not self._is_test:
attrs["is_test"] = self._is_test
attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
scale_out_node = graph.create_var_node_from_desc(
scale_var_node.var())
state_in_node = graph.create_persistable_node(
name=unique_name.generate('state'),
var_type=core.VarDesc.VarType.LOD_TENSOR,
var_dtype=var_node.dtype(),
shape=[1])
data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32'
_init_var_node(state_in_node, np.ones([1], dtype=data_type),
self._scope, self._place)
accum_in_node = graph.create_persistable_node(
name=unique_name.generate('accum'),
var_type=core.VarDesc.VarType.LOD_TENSOR,
var_dtype=var_node.dtype(),
shape=[1])
_init_var_node(accum_in_node, np.ones([1], dtype=data_type),
self._scope, self._place)
state_out_node = graph.create_var_node_from_desc(
state_in_node.var())
accum_out_node = graph.create_var_node_from_desc(
accum_in_node.var())

outputs["OutScale"] = scale_out_node
inputs['InState'] = state_in_node
inputs['InAccum'] = accum_in_node
outputs['OutState'] = state_out_node
outputs['OutAccum'] = accum_out_node
attrs["is_test"] = self._is_test
attrs['moving_rate'] = self._moving_rate

quant_op_node = graph.create_op_node(op_type="quantize_linear",
attrs=attrs,
Expand All @@ -1863,6 +1892,10 @@ def insert_quant_op(self, graph, var_node):
graph.link_to(zero_point_node, quant_op_node)
graph.link_to(quant_op_node, quant_var_node)
if not self._is_test:
graph.link_to(state_in_node, quant_op_node)
graph.link_to(accum_in_node, quant_op_node)
graph.link_to(quant_op_node, state_out_node)
graph.link_to(quant_op_node, accum_out_node)
graph.link_to(quant_op_node, scale_out_node)
return quant_var_node, scale_var_node

Expand Down Expand Up @@ -1891,8 +1924,7 @@ def insert_dequant_op(self, graph, var_node, scale_var_node):
inputs["ZeroPoint"] = zero_point_node

attrs = {"quant_axis": self.quant_axis, "bit_length": self.quant_bits}
if not self._is_test:
attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward

quant_op_node = graph.create_op_node(op_type="dequantize_linear",
attrs=attrs,
Expand Down Expand Up @@ -1931,10 +1963,10 @@ def _zero_point_name(self, var_name):
return "%s@zero_point" % (var_name)


class QuantizationTransformPassV2(object):
class QuantizationTransformPassV2(QuantizationTransformPass):
"""
Quantize the ops that have weights. Add quant and dequant ops for
the quantized ops's inputs.
the quantized ops' inputs. It is used in the new quantization format.
"""

def __init__(self,
Expand Down Expand Up @@ -2130,13 +2162,13 @@ def _transform_forward(self, graph, op):
if is_weight and self._weight_quantize_func is not None:
target_out_node = self._insert_func(
graph, self._weight_quantize_func, var_node, op)
processed_vars.append(name)
self.processed_vars.append(name)
continue
elif not is_weight and self._act_quantize_func is not None:
target_out_node = self._insert_func(graph,
self._act_quantize_func,
var_node, op)
processed_vars.append(name)
self.processed_vars.append(name)
continue

quant_bits = self._weight_bits if var_node.name() in self.persistable_vars \
Expand All @@ -2155,9 +2187,10 @@ def _transform_forward(self, graph, op):
quant_bits=quant_bits,
quant_axis=quant_axis,
channel_wise=channel_wise,
moving_rate=self._moving_rate,
is_test=self._is_test)
quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op(
graph, var_node)
graph, var_node, var_name=name)
dequant_var_node = insert_quant_pass.insert_dequant_op(
graph, quant_var_node, scale_var_node)

Expand All @@ -2182,24 +2215,6 @@ def _has_weight(self, op):
has_weight = True
return has_weight

def _is_skip_quant(self, graph, op_node):
"""
Analyse whether the op node skips quantization.
"""
is_skip = False
if op_node.op().has_attr("skip_quant") and \
op_node.op().attr("skip_quant"):
is_skip = True
# if the inputs of mul and matmul are not all persistable, use
# AddQuantDequantPassV2 to quantize them.
if op_node.name() in ["mul", "matmul", "matmul_v2"] and \
_is_input_all_not_persistable(graph, op_node):
is_skip = True
if op_node.op().has_attr("quantization_type") and \
op_node.op().attr("quantization_type") == "qat_without_weight":
is_skip = True
return is_skip

def apply(self, graph):
"""
Quantize the graph for training process. According to weight and
Expand Down Expand Up @@ -2250,7 +2265,7 @@ def apply(self, graph):
class AddQuantDequantPassV2(object):
"""
Quantize the ops that do not have weights, and add quant_linear and dequant_linear
op for the quantized ops's inputs.
op for the quantized ops' inputs. It is used in the new quantization format.
"""

# To be compatible with PaddleSlim, not remove _activation_type for now
Expand Down Expand Up @@ -2377,6 +2392,7 @@ def apply(self, graph):
quant_bits=self._quant_bits,
quant_axis=-1,
channel_wise=False,
moving_rate=self._moving_rate,
is_test=self._is_test)
quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op(
graph, in_node)
Expand Down
2 changes: 2 additions & 0 deletions python/paddle/fluid/contrib/slim/quantization/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@
"square",
"softplus",
"shuffle_channel",
"reduce_max",
]

_out_scale_op_list = list(
Expand Down Expand Up @@ -213,6 +214,7 @@
"square": [["X"], ["Out"]],
"softplus": [["X"], ["Out"]],
"shuffle_channel": [["X"], ["Out"]],
"reduce_max": [["X"], ["Out"]],
}


Expand Down
37 changes: 30 additions & 7 deletions python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,18 +550,41 @@ def set_args(self):
def setUp(self):
self.set_args()
self.op_type = "quantize_linear"
x = np.random.randn(31, 65).astype(self.data_type)
yq, scale = quantize_max_abs(x, self.max_range)
scale = np.array(scale).astype(self.data_type)
zero_point = np.zeros(scale.shape, dtype="int32")

self.inputs = {'X': x, 'Scale': scale, 'ZeroPoint': zero_point}
self.attrs = {
'bit_length': self.bit_length,
'quant_axis': self.quant_axis,
'moving_rate': 0.9,
'is_test': self.is_test
}
self.outputs = {'Y': yq, 'OutScale': scale}

x = np.random.randn(31, 65).astype(self.data_type)
scale = np.array([0.001]).astype(self.data_type)
zero_point = np.zeros(scale.shape, dtype="int32")
in_accum = np.ones(1).astype(self.data_type)
in_state = np.ones(1).astype(self.data_type)
out_accum = np.zeros(1).astype(self.data_type)
out_state = np.zeros(1).astype(self.data_type)
out_accum[0] = self.attrs['moving_rate'] * in_accum[0] + np.max(
np.abs(x))
out_state[0] = self.attrs['moving_rate'] * in_state[0] + 1.0
out_scale = out_accum / out_state

round_out = np.round(x / out_scale * self.max_range)
quant_data = np.clip(round_out, -self.max_range - 1, self.max_range)

self.inputs = {
'X': x,
'Scale': scale,
'ZeroPoint': zero_point,
'InAccum': in_accum,
'InState': in_state,
}
self.outputs = {
'Y': quant_data,
'OutScale': out_scale,
'OutAccum': out_accum,
'OutState': out_state,
}

def test_check_output(self):
self.check_output()
Expand Down