quic-sanirudh commented on code in PR #12204:
URL: https://github.com/apache/tvm/pull/12204#discussion_r945445274


##########
src/runtime/hexagon/ops/conv2d_hvx.cc:
##########
@@ -0,0 +1,468 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <HAP_compute_res.h>
+#include <hexagon_types.h>
+#include <hvx_hexagon_protos.h>
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/device_api.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+
+#include "conv2d.h"
+
+// Current limitations:
+// - N in NHWC must be 1
+// - dilated convolutions are not supported
+// - Bias is not accepted
+// - Optional "relu" is not performed
+
+// Packed arguments:
+//   0: DLTensor activations (NHWC)
+//   1: DLTensor weights (HWIO)
+//   2: int offset_top
+//   3: int offset_left
+//   4: int stride_h
+//   5: int stride_w
+//   6: DLTensor output (NHWC)
+extern "C" int conv2d_packed(TVMValue* args, int* type_codes, int num_args, 
TVMValue* out_val,
+                             int out_code, void* res_handle);
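For readers unfamiliar with TVM's packed-call convention listed above, a minimal sketch of how such an entry point typically unpacks its arguments is shown below. This is an editorial illustration, not the implementation in this file; the function name is hypothetical, and it assumes DLTensor handles arrive via TVMValue::v_handle and the integer arguments via v_int64.

  // Editorial sketch (hypothetical helper, not part of this file): unpacking
  // the packed arguments in the order documented above.
  #include <tvm/runtime/c_runtime_api.h>

  extern "C" int conv2d_packed_sketch(TVMValue* args, int* type_codes, int num_args,
                                      TVMValue* out_val, int out_code, void* res_handle) {
    if (num_args != 7) return -1;
    auto* act = static_cast<DLTensor*>(args[0].v_handle);  // activations, NHWC
    auto* wgt = static_cast<DLTensor*>(args[1].v_handle);  // weights, HWIO
    int offset_top = static_cast<int>(args[2].v_int64);
    int offset_left = static_cast<int>(args[3].v_int64);
    int stride_h = static_cast<int>(args[4].v_int64);
    int stride_w = static_cast<int>(args[5].v_int64);
    auto* out = static_cast<DLTensor*>(args[6].v_handle);  // output, NHWC
    // ... hand off to the actual convolution implementation here ...
    (void)act; (void)wgt; (void)out;
    (void)offset_top; (void)offset_left; (void)stride_h; (void)stride_w;
    (void)type_codes; (void)out_val; (void)out_code; (void)res_handle;
    return 0;
  }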
+
+namespace detail {
+
+inline uint16_t* getElementPtr(int block_out_y, int block_out_x, int block_out_c, int yi, int xio,
+                               int ci, int xii, const DLTensor& block) {
+  auto block_ptr = nhwc_at(block, 0, block_out_y, block_out_x, block_out_c);
+  auto block_offset = yi * 128 + xio * 64 + ci * 2 + xii;
+  auto first_element_ptr = reinterpret_cast<uint16_t*>(block_ptr);
+  return first_element_ptr + block_offset;
+}
+
+/**
+ * @brief Compute 2 vectors with ones in the even and odd lanes
+ *
+ * Output vectors are:
+ * vector 1     = [0xFFFF,0x0000,0xFFFF,0x0000,...,0xFFFF,0x0000]
+ * vector lanes = [   0  ,   1  ,   2  ,   3  ,...,   62 ,   63 ]
+ *
+ * vector 2     = [0x0000,0xFFFF,0x0000,0xFFFF,...,0x0000,0xFFFF]
+ * vector lanes = [   0  ,   1  ,   2  ,   3  ,...,   62 ,   63 ]
+ *
+ * @return Return the 2 vectors
+ */
+inline std::pair<HVX_Vector, HVX_Vector> getOddEvenOnes() {
+  HVX_Vector v0 = Q6_V_vzero();
+  HVX_Vector v1 = Q6_Vh_vsplat_R(0xFFFF);
+
+  HVX_Vector v1e = Q6_Vh_vshuffe_VhVh(v0, v1);
+  HVX_Vector v1o = Q6_V_vnot_V(v1e);
+  return {v1e, v1o};
+}
+
+/**
+ * @brief Return an input vector filled with the 2 channel elements (the 1st and 3rd
+ * elements) from base_ptr, each repeated 32 times to fill the 64 lanes
+ *
+ * 1. It is generated by first creating 2 vectors "splatted" with the 2 required elements
+ * 2. Then we AND them with the vectors containing all ones (0xFFFF) in the even and odd lanes
+ * 3. Finally those 2 vectors are OR'ed together
+ *
+ * @param base_ptr pointer to the first of the 2 channel elements to be filled
+ *
+ * @return input vector
+ */
+inline HVX_Vector getInputVector(uint16_t* base_ptr) {
+  HVX_Vector v1 = Q6_Vh_vsplat_R(base_ptr[0]);
+  HVX_Vector v2 = Q6_Vh_vsplat_R(base_ptr[2]);
+
+  auto oddEvenOnes = getOddEvenOnes();
+  auto v1e = oddEvenOnes.first;
+  auto v1o = oddEvenOnes.second;
+
+  HVX_Vector v_even_vals = Q6_V_vand_VV(v1, v1e);
+  HVX_Vector v_odd_vals = Q6_V_vand_VV(v2, v1o);
+
+  return Q6_V_vor_VV(v_even_vals, v_odd_vals);
+}
+
+/**
+ * @brief Return the output vector which contains the 32 output channels in the even lanes
+ *
+ * The output vector is computed as:
+ * 1. Vector multiply (vmpy) of input and weights
+ * 2. Rotate the vector right by 1 element and add it to the first vector to add the
+ *    2 input channels
+ * 3. Then convert the results back from qfloat16 to IEEE half-precision float
+ * 4. The added values are in the even lanes, so zero out the odd lanes by ANDing with
+ *    ones in the even lanes and return
+ *
+ * @param act_vec Input activations vector
+ * @param wgt_vec Weights vector
+ *
+ * @return output vector with the 32 output channels in the even lanes
+ */
+inline HVX_Vector computeOuputVector(HVX_Vector act_vec, HVX_Vector wgt_vec) {
+  HVX_Vector v_res = Q6_Vqf16_vmpy_VhfVhf(act_vec, wgt_vec);  // result is in qf16
+  HVX_Vector v_rot = Q6_V_vror_VR(v_res, 2);
+  HVX_Vector v_reduced = Q6_Vqf16_vadd_Vqf16Vqf16(v_res, v_rot);
+  HVX_Vector v_hf = Q6_Vhf_equals_Vqf16(v_reduced);
+  HVX_Vector v1e = getOddEvenOnes().first;
+  HVX_Vector v_reduced_even_lanes = Q6_V_vand_VV(v_hf, v1e);
+  return v_reduced_even_lanes;
+}
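As a rough scalar picture of what getInputVector and computeOuputVector do together (an editorial sketch, not HVX code and not part of this file): after the rotate-and-add, each surviving even lane holds the two-input-channel dot product for one of the 32 output channels, assuming the weight vector keeps the two weights for output channel k in lanes 2k and 2k+1.

  #include <array>

  // act0/act1 are the two input-channel values that getInputVector splats into
  // the even/odd lanes; wgt models one 64-lane weight vector.
  std::array<float, 32> two_channel_dot(float act0, float act1,
                                        const std::array<float, 64>& wgt) {
    std::array<float, 32> out{};
    for (int k = 0; k < 32; ++k) {
      // vmpy puts act0*wgt[2k] in lane 2k and act1*wgt[2k+1] in lane 2k+1;
      // the rotate-by-one-element + add folds the odd lane into the even lane.
      out[k] = act0 * wgt[2 * k] + act1 * wgt[2 * k + 1];
    }
    return out;
  }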
+
+static int round_down(int v, int base) { return v - (v % base); }
+
+/**
+ * @brief Compute the convolution of inputs from cr_act, and weights from
+ * cr_filt to update the output to cr_out. The goal is to have an efficient
+ * HVX implementation
+ *
+ * Assumptions:
+ * -----------
+ * - This implementation currently assumes that the dilation is 1
+ * - There is zero padding, or the input was already pre-padded
+ * - Block-specific spatial padding is only expected at the end, hence
+ *   pad_top and pad_left are not yet used
+ * - Relu activation is not applied
+ * - Bias add is not performed
+ *
+ * @param cr_out blockized output tensor with zeros already filled in
+ * @param cr_act blockized activations
+ * @param cr_filt Chunkified weights as returned from the output of prepare_hwio
+ * @param out_shape Original output shape of the tensor before blockization
+ * @param act_shape Original input shape
+ * @param bias_flat Flat bias values; not used right now
+ * @param filt_shape Original filter shape
+ * @param pad_shape Pad top and pad left shape
+ * @param relu Whether to apply relu after convolution; not done right now
+ * @param zero_block A block filled with zeros
+ *
+ * @return
+ */
+void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act,  // NOLINT(*)
+                         const DLTensor& cr_filt, const DLTensor& out_shape,
+                         const DLTensor& act_shape, const DLTensor& bias_flat,
+                         const DLTensor& filt_shape, const DLTensor& pad_shape, bool relu,
+                         int stride_h, int stride_w, uintptr_t zero_block) {
+  int64_t filt_height = filt_shape.shape[0];
+  int64_t filt_width = filt_shape.shape[1];
+  int64_t filt_idepth = filt_shape.shape[2];
+  (void)filt_idepth;
+
+  DEBUG_BLOCK(int pad_top = pad_shape.shape[0]; int pad_left = pad_shape.shape[1];)

Review Comment:
   I added this initially to avoid an **unused variable** warning when I was using
the other `debug` macro for FARF logging. Now that I've switched to using
`LOG_INFO`, I don't think these are necessary, so I'll remove them.
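   Side note, in case some of these values end up being kept only for logging: two common ways to keep them without the macro and without the warning, sketched below with a hypothetical helper.

  #include <cstdint>

  void pad_example(const int64_t* shape) {
    [[maybe_unused]] int pad_top = static_cast<int>(shape[0]);  // C++17 attribute
    int pad_left = static_cast<int>(shape[1]);
    (void)pad_left;  // explicit discard also keeps -Wunused-variable quiet
  }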



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@tvm.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org
