This is an automated email from the ASF dual-hosted git repository.

andrewzhaoluo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
     new 76155c2f3c  [QNN] Support different qnn params between in/out tensor in leaky_relu (#12116)
76155c2f3c is described below

commit 76155c2f3c327ad7ada9d8fcb1c7f6f447dcc0ec
Author: zhaoyang-star <zhaoyangs...@foxmail.com>
AuthorDate: Sat Jul 23 05:33:31 2022 +0800

    [QNN] Support different qnn params between in/out tensor in leaky_relu (#12116)

    * [QNN] Support different qnn params between in/out tensor in leaky_relu

    * format code

    * format code

    * fix bug

    * fix format

    * fix format

    * fix
---
 python/tvm/relay/frontend/qnn_torch.py             |  6 +-
 python/tvm/relay/qnn/op/qnn.py                     | 21 ++++--
 .../transform/fake_quantization_to_integer.py      |  9 ++-
 src/relay/qnn/op/leaky_relu.cc                     | 85 +++++++++++++++-------
 tests/python/relay/test_op_qnn_leaky_relu.py       | 30 +++++---
 5 files changed, 104 insertions(+), 47 deletions(-)

diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py
index 251f46630a..74d5e2e0f5 100644
--- a/python/tvm/relay/frontend/qnn_torch.py
+++ b/python/tvm/relay/frontend/qnn_torch.py
@@ -963,7 +963,11 @@ def _leaky_relu(fp32_piggy_back=False):
         alpha = inputs[1]
         output_scale = _expr.const(inputs[3])
         output_zero_point = _expr.const(inputs[4])
-        return relay.qnn.op.leaky_relu(inputs[0], alpha, output_scale, output_zero_point)
+        input_scale = _expr.const(inputs[5])
+        input_zero_point = _expr.const(inputs[6])
+        return relay.qnn.op.leaky_relu(
+            inputs[0], alpha, input_scale, input_zero_point, output_scale, output_zero_point
+        )

     def _impl(inputs, _):
         assert len(inputs) == 7, "Input quant params not found in op inputs"
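For anyone reading the frontend hunk in isolation: the PyTorch converter receives the op's arguments as a flat list, and the patch now consumes the input qnn params from slots 5 and 6 in addition to the output params in slots 3 and 4 (the assert in _impl pins the list length at 7). A minimal sketch of the resulting call, with made-up scales and zero points; what slot 2 holds is not shown in this diff:

    from tvm import relay
    from tvm.relay import expr as _expr

    # Hypothetical flat argument list in the layout the converter unpacks:
    # [data, alpha, <slot 2>, output_scale, output_zero_point, input_scale, input_zero_point]
    inputs = [relay.var("x", shape=(1, 4), dtype="uint8"), 0.1, None, 0.25, 10, 0.125, 60]

    call = relay.qnn.op.leaky_relu(
        inputs[0],                          # quantized data
        inputs[1],                          # alpha
        _expr.const(inputs[5], "float32"),  # input_scale
        _expr.const(inputs[6], "int32"),    # input_zero_point
        _expr.const(inputs[3], "float32"),  # output_scale
        _expr.const(inputs[4], "int32"),    # output_zero_point
    )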
diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py
index edb528708c..17dba15e09 100644
--- a/python/tvm/relay/qnn/op/qnn.py
+++ b/python/tvm/relay/qnn/op/qnn.py
@@ -1179,7 +1179,7 @@ reg.register_pattern("qnn.quantize", OpPattern.OPAQUE)
 reg.register_pattern("qnn.dequantize", OpPattern.OPAQUE)
 
 
-def leaky_relu(x, alpha, scale, zero_point):
+def leaky_relu(x, alpha, input_scale, input_zero_point, output_scale, output_zero_point):
     """Quantized leaky relu.
 
     Parameters
@@ -1188,11 +1188,14 @@ def leaky_relu(x, alpha, scale, zero_point):
         The quantized input tensor.
     alpha: double
         The alpha value.
-    scale: relay.Expr
-        The scale of the quantized expr.
-    zero_point: relay.Expr
-        The zero point of quantized expr.
-
+    input_scale: relay.Expr
+        The scale of the input quantized expr.
+    input_zero_point: relay.Expr
+        The zero point of input quantized expr.
+    output_scale: relay.Expr
+        The scale of the output quantized expr.
+    output_zero_point: relay.Expr
+        The zero point of output quantized expr.
 
     Returns
     -------
     result : relay.Expr
         The computed result.
     """
@@ -1201,6 +1204,8 @@ def leaky_relu(x, alpha, scale, zero_point):
     return _make.leaky_relu(
         x,
         alpha,
-        scale,
-        zero_point,
+        input_scale,
+        input_zero_point,
+        output_scale,
+        output_zero_point,
     )

diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py
index 4436960a20..8308298e70 100644
--- a/python/tvm/relay/transform/fake_quantization_to_integer.py
+++ b/python/tvm/relay/transform/fake_quantization_to_integer.py
@@ -364,10 +364,13 @@ def relu(expr, type_map):
 def leaky_relu(expr, type_map):
     """Rewrite a leaky relu op"""
     arg = expr.args[0]
-    t = type_map[arg]
+    x_t = type_map[arg]
+    out_t = type_map[expr]
     alpha = expr.attrs.alpha
-    output = relay.qnn.op.leaky_relu(expr, alpha, t.scale, t.zero_point)
-    return [output, t]
+    output = relay.qnn.op.leaky_relu(
+        expr, alpha, x_t.scale, x_t.zero_point, out_t.scale, out_t.zero_point
+    )
+    return [output, x_t]
 
 
 @register_fake_quantization_to_integer("nn.pad")
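The semantics the FakeQuantizationToInteger rewrite targets are the usual simulated-quantization ones: dequantize with the input params, apply float leaky relu, quantize with the output params. A minimal numpy sketch of that reference behavior, assuming a uint8 range and borrowing the sample values from the test further down:

    import numpy as np

    def dequantize(q, scale, zp):
        return scale * (np.asarray(q, dtype=np.float64) - zp)

    def quantize(r, scale, zp, q_min=0, q_max=255):
        return np.clip(np.round(r / scale + zp), q_min, q_max).astype(np.uint8)

    def ref_leaky_relu(q_x, alpha, in_scale, in_zp, out_scale, out_zp):
        x = dequantize(q_x, in_scale, in_zp)
        y = np.where(x < 0, alpha * x, x)  # float leaky relu
        return quantize(y, out_scale, out_zp)

    q_x = np.array([255, 133, 0, 9], dtype=np.uint8)
    print(ref_leaky_relu(q_x, alpha=0.9, in_scale=0.125, in_zp=60,
                         out_scale=0.6, out_zp=17))  # -> [58 32  6  7]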
diff --git a/src/relay/qnn/op/leaky_relu.cc b/src/relay/qnn/op/leaky_relu.cc
index a4881dfbbd..75bfabb7db 100644
--- a/src/relay/qnn/op/leaky_relu.cc
+++ b/src/relay/qnn/op/leaky_relu.cc
@@ -32,8 +32,8 @@ namespace qnn {
 bool QnnLeakyReluRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                      const TypeReporter& reporter) {
-  // Expected Types: data, scale, zero_point
-  ICHECK_EQ(types.size(), 4);
+  // Expected Types: data, input_scale, input_zero_point, output_scale, output_zero_point, out_type
+  ICHECK_EQ(types.size(), 6);
   const auto* x = types[0].as<TensorTypeNode>();
   if (x == nullptr) return false;
   ICHECK(x->dtype == DataType::Int(8) || x->dtype == DataType::UInt(8))
@@ -42,31 +42,37 @@ bool QnnLeakyReluRel(const Array<Type>& types, int num_inputs, const Attrs& attr
   ICHECK(param != nullptr) << "LeakyReluAttrs cannot be nullptr.";
 
   // Check the types of scale and zero points.
-  for (size_t i = 1; i < 3; ++i) {
+  for (size_t i = 1; i < 5; ++i) {
     if (types[i].as<IncompleteTypeNode>()) {
       return false;
     }
   }
-  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
-  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // input_scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // input_zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // output_scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // output_zero_point
 
   // Assign types for scale and zero points.
-  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
-  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // input_scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // input_zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // output_scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // output_zero_point
 
   // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
   // IdentityRel infer type function.
-  Array<Type> tensor_types = {types[0], types[3]};
+  Array<Type> tensor_types = {types[0], types[5]};
   return IdentityRel(tensor_types, 2, attrs, reporter);
 }
 
 // Positional relay function to create quantized leaky relu operator used by frontend FFI.
-Expr MakeQuantizedLeakyRelu(Expr x, double alpha, Expr scale, Expr zero_point) {
+Expr MakeQuantizedLeakyRelu(Expr x, double alpha, Expr input_scale, Expr input_zero_point,
+                            Expr output_scale, Expr output_zero_point) {
   auto attrs = make_object<LeakyReluAttrs>();
   attrs->alpha = alpha;
   static const Op& op = Op::Get("qnn.leaky_relu");
-  return Call(op, {x, scale, zero_point}, Attrs(attrs), {});
+  return Call(op, {x, input_scale, input_zero_point, output_scale, output_zero_point}, Attrs(attrs),
+              {});
 }
 
 /*
@@ -82,42 +88,69 @@ Expr QnnLeakyReluCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
   // by a small alpha value < 1.
   //
   // We assume the same scale and zero point for alpha and the input tensor.
-  // Let T = s(q_t - z) where q_t is the input arg[0]
-  // Then, the quantized value of alpha * T is:
-  // q(a * T, s, z) = [(a * T) / s] + z = a * s(q_t - z) / s + z = a * (q_t - z) + z
-  //                = a * q_t + (1 - a) * z
+  // LeakyReLU can be written in terms of the respective quantized tensors, scales and
+  // zero points as
   //
-  // We return the quantized value of alpha * T for all values q_t < input_zero_point.
-
-  ICHECK_EQ(new_args.size(), 3);
-  Expr quantized_data = Cast(new_args[0], DataType::Int(32));
+  //   scale_o * (Q_o - zp_o) = alpha * scale_i * (Q_i - zp_i)   when Q_i <  zp_i   (1)
+  //   scale_o * (Q_o - zp_o) = scale_i * (Q_i - zp_i)           when Q_i >= zp_i   (2)
+  //
+  // Since the input qnn params can differ from the output qnn params, we first requantize the
+  // input tensor to the output qnn params. After requantizing Q_i, equation (1) becomes
+  // equation (3), where Q_i' is the requantized data from Q_i.
+  //
+  //   scale_o * (Q_o - zp_o) = alpha * scale_o * (Q_i' - zp_o)  when Q_i <  zp_i   (3)
+  //   Q_o = alpha * Q_i' + (1 - alpha) * zp_o                   when Q_i <  zp_i   (4)
+  //
+  // Equation (2) is equivalent to requantizing Q_i to Q_o with scale_o and zp_o, so it becomes
+  //
+  //   Q_o = requantize(Q_i)                                     when Q_i >= zp_i   (5)
+  //
+  // Finally, Q_o is computed from equations (4) and (5).
+  ICHECK_EQ(new_args.size(), 5);
+  Expr data = Cast(new_args[0], DataType::Int(32));
+  Expr input_scale = new_args[1];
   Expr input_zero_point = Cast(new_args[2], DataType::Int(32));
+  Expr output_scale = new_args[3];
+  Expr output_zero_point = Cast(new_args[4], DataType::Int(32));
 
   const auto* q_attrs = attrs.as<LeakyReluAttrs>();
   auto alpha = q_attrs->alpha;
 
+  const auto input_shape = get_shape(arg_types[0]);
+  const auto input_dtype = arg_types[0].as<TensorTypeNode>()->dtype;
+
+  // requantize the input to Q_i'
+  auto requantized_expr = RequantizeOrUpcast(data, input_scale, input_zero_point, output_scale,
+                                             output_zero_point, input_shape);
+
+  // alpha * Q_i'
   int32_t fixed_point_multiplier, shift;
   std::tie(fixed_point_multiplier, shift) = GetFixedPointMultiplierShift(alpha);
-  auto prod = FixedPointMultiply(quantized_data, fixed_point_multiplier, shift);
+  auto prod = FixedPointMultiply(requantized_expr, fixed_point_multiplier, shift);
 
+  // (1 - alpha) * zp_o
   int32_t fixed_point_multiplier_z, shift_z;
   std::tie(fixed_point_multiplier_z, shift_z) = GetFixedPointMultiplierShift(1 - alpha);
-  auto scaled_z = FixedPointMultiply(input_zero_point, fixed_point_multiplier_z, shift_z);
+  auto scaled_z = FixedPointMultiply(output_zero_point, fixed_point_multiplier_z, shift_z);
 
+  // alpha * Q_i' + (1 - alpha) * zp_o
   auto add = Add(prod, scaled_z);
-  auto output = Where(Less(quantized_data, input_zero_point), add, quantized_data);
+  auto output = Where(Less(data, input_zero_point), add, requantized_expr);
 
-  const auto* input_type = arg_types[0].as<TensorTypeNode>();
-  return ConvertDtype(output, input_type->dtype);
+  return ConvertDtype(output, input_dtype);
 }
 
 RELAY_REGISTER_OP("qnn.leaky_relu")
     .describe("Leaky relu for quantized tensors.")
     .set_attrs_type<LeakyReluAttrs>()
-    .set_num_inputs(3)
+    .set_num_inputs(5)
     .add_argument("data", "Quantized Tensor", "The input data.")
-    .add_argument("scale", "Tensor", "The quantization scale of the input tensor.")
-    .add_argument("zero_point", "Tensor", "The quantization zero_point of the input tensor.")
+    .add_argument("input_scale", "Tensor", "The quantization scale of the input tensor.")
+    .add_argument("input_zero_point", "Tensor", "The quantization zero_point of the input tensor.")
+    .add_argument("output_scale", "Tensor", "The quantization scale of the output tensor.")
+    .add_argument("output_zero_point", "Tensor",
+                  "The quantization zero_point of the output tensor.")
     .set_support_level(11)
    .add_type_rel("QLeakyRelu", QnnLeakyReluRel)
    .set_attr<TNonComputational>("TNonComputational", true)
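Read alongside the comment block above, the canonicalized graph is just equations (4) and (5) glued together with a Where on the sign test. A small numpy model may help; it substitutes plain float rounding for RequantizeOrUpcast and the fixed-point multiplies, so it sketches the intent rather than reproducing the lowered graph bit for bit:

    import numpy as np

    def requantize(q, in_scale, in_zp, out_scale, out_zp, q_min=0, q_max=255):
        # Q_i' = round((scale_i / scale_o) * (Q_i - zp_i)) + zp_o, clipped to the output range
        scaled = np.round((in_scale / out_scale) * (q.astype(np.int64) - in_zp))
        return np.clip(scaled + out_zp, q_min, q_max)

    def qnn_leaky_relu(q_x, alpha, in_scale, in_zp, out_scale, out_zp):
        q_req = requantize(q_x, in_scale, in_zp, out_scale, out_zp)            # Q_i'
        neg = np.clip(np.round(alpha * q_req + (1 - alpha) * out_zp), 0, 255)  # equation (4)
        return np.where(q_x < in_zp, neg, q_req).astype(np.uint8)  # (4) if Q_i < zp_i, else (5)

    q_x = np.array([255, 133, 0, 9], dtype=np.uint8)
    print(qnn_leaky_relu(q_x, alpha=0.9, in_scale=0.125, in_zp=60,
                         out_scale=0.6, out_zp=17))  # -> [58 32  6  7]

Up to rounding mode, this agrees with the dequantize/requantize reference sketched earlier, which is exactly the claim behind equations (3) through (5).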
diff --git a/tests/python/relay/test_op_qnn_leaky_relu.py b/tests/python/relay/test_op_qnn_leaky_relu.py
index 76f581817c..ade897bf6e 100644
--- a/tests/python/relay/test_op_qnn_leaky_relu.py
+++ b/tests/python/relay/test_op_qnn_leaky_relu.py
@@ -24,26 +24,36 @@ def dequantize(data, scale, zp):
     return scale * (np.asarray(data) - zp)
 
 
-def generate_golden_output(x_data, dequantized_x, alpha, scale, zero_point):
+def generate_golden_output(x_data, dequantized_x, alpha, o_scale, o_zero_point, i_zero_point):
     prod = np.multiply(dequantized_x, alpha)
-    prod = np.around(prod / scale + zero_point)
+    prod = np.around(prod / o_scale + o_zero_point)
 
-    output = np.where(x_data < zero_point, prod, x_data)
+    q_min = np.iinfo(np.uint8).min
+    q_max = np.iinfo(np.uint8).max
+    prod = np.clip(prod, q_min, q_max)
+
+    requantized = np.clip(np.round(dequantized_x / o_scale + o_zero_point), q_min, q_max)
+
+    output = np.where(x_data < i_zero_point, prod, requantized)
     return output
 
 
 def test_qnn_leaky_relu():
     data_dtype = "uint8"
-    scale = 0.125
-    zero_point = 60
+    input_scale = 0.125
+    input_zero_point = 60
+    output_scale = 0.6
+    output_zero_point = 17
     alpha = 0.9
 
     x = relay.var("x", shape=(1, 4), dtype=data_dtype)
     y = relay.qnn.op.leaky_relu(
         x=x,
         alpha=alpha,
-        scale=relay.const(scale, "float32"),
-        zero_point=relay.const(zero_point, "int32"),
+        input_scale=relay.const(input_scale, "float32"),
+        input_zero_point=relay.const(input_zero_point, "int32"),
+        output_scale=relay.const(output_scale, "float32"),
+        output_zero_point=relay.const(output_zero_point, "int32"),
    )
 
     func = relay.Function([x], y)
@@ -53,8 +63,10 @@ def test_qnn_leaky_relu():
     func = mod["main"]
 
     x_data = np.array((255, 133, 0, 9)).reshape((1, 4))
-    x_dequantized = dequantize(x_data, scale, zero_point)
-    golden_output = generate_golden_output(x_data, x_dequantized, alpha, scale, zero_point)
+    x_dequantized = dequantize(x_data, input_scale, input_zero_point)
+    golden_output = generate_golden_output(
+        x_data, x_dequantized, alpha, output_scale, output_zero_point, input_zero_point
+    )
 
     op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(x_data)
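One piece the lowered graph relies on that the test does not exercise directly is the fixed-point decomposition of alpha. As a rough Python rendition of the frexp-style split that GetFixedPointMultiplierShift performs (the exact overflow handling lives in the C++ utility, so treat this as an approximation):

    import math

    def get_fixed_point_multiplier_shift(x):
        # Decompose x = significand * 2**shift with significand in [0.5, 1),
        # then express the significand as a signed Q31 integer.
        significand, shift = math.frexp(x)
        q31 = round(significand * (1 << 31))
        if q31 == (1 << 31):  # rounding overflowed the Q31 range; renormalize
            q31 //= 2
            shift += 1
        return q31, shift

    m, s = get_fixed_point_multiplier_shift(0.9)
    print(m, s)  # 1932735283 0, i.e. alpha * q ~= (q * m) >> (31 - s)

With alpha = 0.9 and 1 - alpha = 0.1 decomposed this way, the canonicalized graph evaluates equation (4) entirely in int32 arithmetic.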