Nikeran commented on code in PR #15407:
URL: https://github.com/apache/tvm/pull/15407#discussion_r1284539303


##########
src/relay/backend/contrib/cmsisnn/relay_to_tir.cc:
##########
@@ -517,36 +536,111 @@ class RelayToTIRVisitor : public MixedModeMutator {
 
     // calculate multiplier and shift for CMSIS-NN softmax API
     // Note: TensorFlow Lite Micro assumptions
-    // Output zero point and scale are fixed to -128 and 1 / 256
+    // Output zero point and scale are fixed to -128 and 1 / 256 in the case 
of an int8 operator
+    // or to 0 and 1 / 32768.
     // kScaledDiffIntegerBits, kInputBits, kBeta are described on the 
following github page
-    // 
https://github.com/tensorflow/tflite-micro/blob/d97cd0908d8cf5021e9d86f05a49888bee28c2a4/tensorflow/lite/micro/kernels/softmax_common.cc#L47
-    double beta_multiplier = (kBeta * quant_scale * (1 << (31 - kInputBits)));
-    beta_multiplier = std::min<double>(beta_multiplier, (1ll << 31) - 1.0);
-    auto mult_shift_pair = 
tvm::relay::qnn::GetFixedPointMultiplierShift(beta_multiplier);
-    int32_t mult = std::get<0>(mult_shift_pair);
-    int32_t shift = std::get<1>(mult_shift_pair);
-    int32_t diff_min = (1 << kScaledDiffIntegerBits) - 1;
-    diff_min <<= (31 - kScaledDiffIntegerBits);
-    diff_min >>= shift;
-    diff_min *= -1;
+    // 
https://github.com/tensorflow/tflite-micro/blob/d97cd0908d8cf5021e9d86f05a49888bee28c2a4/tensorflow/lite/micro/kernels/softmax_common.cc#L47
+
+    int32_t mult;
+    int32_t shift;
+    int32_t diff_min = 0;
+
+    std::vector<user_const> softmax_params(2);
+    Device dev{DLDeviceType::kDLCPU, 0};
+
+    if (bit_width == 8) {
+      double beta_multiplier = (kBeta * quant_scale * (1 << (31 - 
kInputBits)));
+      beta_multiplier = std::min<double>(beta_multiplier, (1ll << 31) - 1.0);
+      auto mult_shift_pair = 
tvm::relay::qnn::GetFixedPointMultiplierShift(beta_multiplier);
+      mult = std::get<0>(mult_shift_pair);
+      shift = std::get<1>(mult_shift_pair);
+      diff_min = (1 << kScaledDiffIntegerBits) - 1;
+      diff_min <<= (31 - kScaledDiffIntegerBits);
+      diff_min >>= shift;
+      diff_min *= -1;
+    } else {  // bit_width == 16
+      double scale_beta_rescale = quant_scale * kBeta / (10.0 / 65535.0);
+      auto mult_shift_pair = 
tvm::relay::qnn::GetFixedPointMultiplierShift(scale_beta_rescale);
+      mult = std::get<0>(mult_shift_pair);
+      shift = std::get<1>(mult_shift_pair);
+
+      const int kLUTEntries = 513;
+      int16_t softmax_s16_exp_lut[kLUTEntries];
+      int16_t softmax_s16_one_by_one_lut[kLUTEntries];
+
+      const int range_int16 =
+          std::numeric_limits<int16_t>::max() - 
std::numeric_limits<int16_t>::min();
+      int exp_zero_point = std::numeric_limits<int16_t>::max();
+      float exp_scale = 10.0f / range_int16;

Review Comment:
   Yes, I already tried it and unfortunately, as expected, it didn't improve 
performance. But I can still make it a double for the sake of accuracy.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to