giuseros commented on a change in pull request #6095:
URL: https://github.com/apache/incubator-tvm/pull/6095#discussion_r458814745



##########
File path: topi/python/topi/arm_cpu/depthwise_conv2d.py
##########
@@ -181,6 +181,154 @@ def depthwise_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dila
 
     return _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile=2)
 
+@autotvm.register_topi_compute("depthwise_conv2d_nhwc.arm_cpu")
+def compute_depthwise_conv2d_nhwc(_, data, kernel, strides, padding, dilation, out_dtype):
+    """TOPI compute callback for depthwise_conv2d nhwc
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    data : tvm.te.Tensor
+        4-D with shape [batch, in_height, in_width, in_channel]
+
+    kernel : tvm.te.Tensor
+        4-D with shape [filter_height, filter_width, in_channel, channel_multiplier]
+
+    strides : list of two ints
+        [stride_height, stride_width]
+
+    padding : list of two ints
+        [pad_height, pad_width]
+
+    dilation : list of two ints
+        [dilation_height, dilation_width]
+
+    out_dtype: str
+        The output type. This is used for mixed precision.
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        4-D with shape [batch, out_height, out_width, out_channel]
+    """
+
+    out_dtype = out_dtype or data.dtype
+
+    N, IH, IW, IC = get_const_tuple(data.shape)
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    KH, KW, IC, channel_multiplier = get_const_tuple(kernel.shape)
+
+    dilated_kernel_h = (KH - 1) * dilation_h + 1
+    dilated_kernel_w = (KW - 1) * dilation_w + 1
+
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (dilated_kernel_h, dilated_kernel_w))
+    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+
+    OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1
+    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
+
+    if pad_top or pad_left:
+        data_pad = nn.pad(data, [0, pad_top, pad_left, 0], [0, pad_down, pad_right, 0],
+                          name="data_pad")
+    else:
+        data_pad = data
+
+    output_shape = (N, OH, OW, IC*channel_multiplier)
+
+    idxdiv = tvm.tir.indexdiv
+    idxmod = tvm.tir.indexmod
+
+    reduce_h = te.reduce_axis((0, KH), name='reduce_h')
+    reduce_w = te.reduce_axis((0, KW), name='reduce_w')
+
+    out = te.compute(output_shape, lambda n, h, w, c:
+                     te.sum(data_pad[n,
+                                     HSTR*h+dilation_h*reduce_h,
+                                     w*WSTR+reduce_w*dilation_w,
+                                     idxdiv(c, channel_multiplier)].astype(out_dtype) *
+                            kernel[reduce_h,
+                                   reduce_w,
+                                   idxdiv(c, channel_multiplier),
+                                   idxmod(c, channel_multiplier)].astype(out_dtype),
+                            axis=[reduce_h, reduce_w]),
+                     name='depthwise_conv2d_nhwc_output')
+    return out
+
+@autotvm.register_topi_schedule("depthwise_conv2d_nhwc.arm_cpu")
+def schedule_depthwise_conv2d_nhwc(cfg, outs):
+    """Create the schedule for depthwise_conv2d_nchw_spatial_pack"""
+    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
+    s = te.create_schedule([x.op for x in outs])
+    out = outs[0]
+
+    ##### space definition begin #####
+    n, h, w, c = s[out].op.axis
+    cfg.define_split('tile_c', c, num_outputs=2)
+    _, hi = cfg.define_split('tile_h', h, num_outputs=2)
+    _, wi = cfg.define_split('tile_w', w, num_outputs=2)
+    cfg.define_annotate('locate_output', [hi, wi], 'locate_cache', num_anchor=1)
+
+    # fallback support
+    if cfg.is_fallback:
+        cfg['tile_c'] = SplitEntity([-1, 8])
+        cfg['tile_h'] = SplitEntity([-1, 2])
+        cfg['tile_w'] = SplitEntity([-1, 2])
+        cfg['locate_output'] = AnnotateEntity([1])
+    ##### space definition end #####
+
+    def schedule_conv(conv):
+        conv_data = conv.op.input_tensors[0]
+        if conv_data.name == "data_pad":
+            s[conv_data].compute_inline()

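For reference, a worked instance of the output-shape arithmetic in the compute above (values are hypothetical, chosen to match a typical mobilenet_v2 depthwise layer):

```python
# Hypothetical values: 56x56 input, 3x3 kernel, stride 2, pad 1, dilation 1.
IH, KH, dilation_h, HSTR = 56, 3, 1, 2
pad_top, pad_down = 1, 1

dilated_kernel_h = (KH - 1) * dilation_h + 1                   # 3
OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1  # (56+2-3)//2+1 = 28
```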
Review comment:
   Since these would be 4 more knobs to add, I extracted the depthwise operators from mobilenet_v2 and tuned them with the 4 different policies (inline, `compute_at(ho)`, `compute_at(wo)`, no-inline). The results below are TFLite/TVM time ratios (higher is better); H/W is the spatial size, C the channel count, S the stride:
   
   | H/W | C   | S | inline | compute_at(ho) | compute_at(wo) | no-inline |
   |-----|-----|---|--------|----------------|----------------|-----------|
   | 112 | 96  | 2 | 1.453  | 0.704          | 0.836          | 0.362     |
   | 56  | 144 | 1 | 1.725  | 0.850          | 0.971          | 1.380     |
   | 56  | 144 | 2 | 3.029  | 1.319          | 0.459          | 1.521     |
   | 28  | 192 | 1 | 1.711  | 0.760          | 0.517          | 1.409     |
   | 28  | 192 | 2 | 1.683  | 0.567          | 0.481          | 1.443     |
   | 14  | 384 | 1 | 3.150  | 1.260          | 0.630          | 0.573     |
   | 14  | 576 | 1 | 0.886  | 0.970          | 0.406          | 0.527     |
   | 14  | 576 | 2 | 2.400  | 0.686          | 0.700          | 0.671     |
   | 7   | 960 | 1 | 2.927  | 1.211          | 0.906          | 1.342     |
   
   * Since this is a memory-bound operator, not inlining the padding is always going to behave poorly, so I would remove that policy to reduce tuning time. One could argue for dropping the `compute_at` policies as well, but since I haven't tried other networks (and on small shapes they sometimes run better) I would keep those 3 knobs (instead of 4). See the sketch after this list for how each policy maps to a schedule primitive.
   * Except in one case, we are always faster (sometimes much faster) than TFLite. Once I am done with this and a few other improvements, I will compare against ACL as well.
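   For reference, a minimal sketch (my illustration, not code from the PR) of how each policy above maps onto TVM schedule primitives; `s`, `data_pad`, `conv`, `ho`, and `wo` are assumed to be the schedule, the padding stage, the conv stage, and the outer h/w axes produced by the `tile_h`/`tile_w` splits:

```python
def apply_padding_policy(s, data_pad, conv, policy, ho, wo):
    # Hypothetical helper: expresses each benchmarked policy as a
    # TVM schedule primitive applied to the padding stage.
    if policy == "inline":
        # Fuse padding into the conv loop nest; the padded tensor is
        # never written to memory (best for a memory-bound op).
        s[data_pad].compute_inline()
    elif policy == "compute_at(ho)":
        # Materialize a padded tile once per outer-h iteration.
        s[data_pad].compute_at(s[conv], ho)
    elif policy == "compute_at(wo)":
        # Materialize a padded tile once per outer-w iteration.
        s[data_pad].compute_at(s[conv], wo)
    # "no-inline": leave data_pad as a standalone stage (the default).
```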
   



