This is an automated email from the ASF dual-hosted git repository.

zhaowu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-tvm.git


The following commit(s) were added to refs/heads/master by this push:
     new e40b8bc  [TOPI][Winograd] Optimization of Conv2d Winograd algorithm on Tensor Core (#5485)
e40b8bc is described below

commit e40b8bc9e1229d8ee9d0228d5dbdc2338fd8677d
Author: wsl-inspur <61525780+wsl-ins...@users.noreply.github.com>
AuthorDate: Thu May 7 18:46:45 2020 +0800

    [TOPI][Winograd] Optimization of Conv2d Winograd algorithm on Tensor Core (#5485)
---
 python/tvm/relay/op/strategy/cuda.py               |  78 ++-
 topi/python/topi/cuda/__init__.py                  |   1 +
 topi/python/topi/cuda/conv2d_alter_op.py           |  36 ++
 topi/python/topi/cuda/conv2d_nhwc_winograd.py      | 639 +++++++++++++++++++++
 .../tests/python/test_topi_conv2d_nhwc_winograd.py | 152 +++++
 5 files changed, 904 insertions(+), 2 deletions(-)
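
For orientation: the strategy change in cuda.py (below) registers the new Winograd
implementations on top of the default conv2d_nhwc at plevel=5, so they win the
dispatch whenever the shape judgement passes. A toy rendering of that decision flow
(plain Python with a hypothetical helper name, not a TVM API):

def choose_nhwc_conv2d_impl(shape_ok, tensorcore_ok, has_tensorcore_gpu):
    # conv2d_nhwc.cuda is always registered as the baseline implementation.
    impls = ["conv2d_nhwc.cuda"]
    if shape_ok:
        if has_tensorcore_gpu and tensorcore_ok:
            impls.append("conv2d_nhwc_winograd_tensorcore.cuda")  # plevel=5
        else:
            impls.append("conv2d_nhwc_winograd_direct.cuda")      # plevel=5
    return impls

print(choose_nhwc_conv2d_impl(True, True, True))
# ['conv2d_nhwc.cuda', 'conv2d_nhwc_winograd_tensorcore.cuda']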

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 9189b5e..83e4e40 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -136,8 +136,32 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
                 wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                 wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                 name="conv2d_nhwc.cuda")
-            N, _, _, _ = get_const_tuple(data.shape)
-            _, _, CI, CO = get_const_tuple(kernel.shape)
+            N, H, W, _ = get_const_tuple(data.shape)
+            KH, KW, CI, CO = get_const_tuple(kernel.shape)
+            # Winograd shape related judgment
+            judge_winograd_tensorcore, judge_winograd_shape = winograd_judge(N, H, W, KH, KW,
+                                                                             CI, CO, padding,
+                                                                             stride_h, stride_w,
+                                                                             dilation_h, dilation_w,
+                                                                             pre_flag=False)
+            if judge_winograd_shape:
+                if target.target_name == "cuda" and \
+                    nvcc.have_tensorcore(tvm.gpu(0).compute_version) and \
+                    judge_winograd_tensorcore:
+                    strategy.add_implementation(
+                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_tensorcore),
+                        wrap_topi_schedule(
+                            topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore),
+                        name="conv2d_nhwc_winograd_tensorcore.cuda",
+                        plevel=5)
+                else:
+                    strategy.add_implementation(
+                        wrap_compute_conv2d(
+                            topi.cuda.conv2d_nhwc_winograd_direct),
+                        wrap_topi_schedule(
+                            topi.cuda.schedule_conv2d_nhwc_winograd_direct),
+                        name="conv2d_nhwc_winograd_direct.cuda",
+                        plevel=5)
             if target.target_name == "cuda":
                 if nvcc.have_tensorcore(tvm.gpu(0).compute_version):
                     if (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or \
@@ -220,6 +244,9 @@ def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty
     dilation = attrs.get_int_tuple("dilation")
     groups = attrs.get_int("groups")
     layout = attrs.data_layout
+    data, kernel = inputs
+    stride_h, stride_w = attrs.get_int_tuple("strides")
+    padding = attrs.get_int_tuple("padding")
     assert dilation == (1, 1), "Do not support dilate now"
     assert groups == 1, "Do not supoort arbitrary group number"
     strategy = _op.OpStrategy()
@@ -229,6 +256,30 @@ def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty
             wrap_topi_schedule(
                 topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform),
             name="conv2d_nchw_winograd_without_weight_transform.cuda")
+    elif layout == "NHWC":
+        N, H, W, _ = get_const_tuple(data.shape)
+        alpha, _, CI, CO = get_const_tuple(kernel.shape)
+        dilation_h, dilation_w = dilation
+        judge_winograd_tensorcore, _ = winograd_judge(N, H, W, alpha, alpha, CI, CO,
+                                                      padding, stride_h, stride_w,
+                                                      dilation_h, dilation_w,
+                                                      pre_flag=True)
+        if target.target_name == "cuda" and \
+            nvcc.have_tensorcore(tvm.gpu(0).compute_version) and \
+            judge_winograd_tensorcore:
+            strategy.add_implementation(
+                wrap_compute_conv2d(
+                    topi.cuda.conv2d_nhwc_winograd_tensorcore_without_weight_transform),
+                wrap_topi_schedule(
+                    topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform),
+                name="conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda")
+        else:
+            strategy.add_implementation(
+                wrap_compute_conv2d(
+                    topi.cuda.conv2d_nhwc_winograd_direct_without_weight_transform),
+                wrap_topi_schedule(
+                    topi.cuda.schedule_conv2d_nhwc_winograd_direct_without_weight_transform),
+                name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda")
     else:
        raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}".
                            format(layout))
@@ -516,3 +567,26 @@ def proposal_strategy_cuda(attrs, inputs, out_type, target):
                                 wrap_topi_schedule(topi.cuda.schedule_proposal),
                                 name="proposal.cuda")
     return strategy
+
+def winograd_judge(N, H, W, KH, KW, CI, CO, padding, stride_h,
+                   stride_w, dilation_h, dilation_w, pre_flag):
+    """Winograd judgement about tensorcore and shape"""
+    if H % 8 == 0:
+        tile_size = 4
+    else:
+        tile_size = 2
+    if pre_flag:
+        alpha = KH
+        KH = KW = alpha + 1 - tile_size
+    pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (KH, KW))
+    OH = (H + pt + pb - KH) // stride_h + 1
+    OW = (W + pl + pr - KW) // stride_w + 1
+    nH, nW = (OH + tile_size - 1) // tile_size, (OW + tile_size - 1) // tile_size
+    P = N * nH * nW
+    judge_winograd_tensorcore = (P % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or \
+                                   (P % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or \
+                                   (P % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)
+    judge_winograd_shape = 2 < KH < 8 and 2 < KW < 8 and KH == KW and \
+                              stride_h == 1 and stride_w == 1 and \
+                              dilation_h == 1 and dilation_w == 1
+    return judge_winograd_tensorcore, judge_winograd_shape
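
To make the judgement above concrete, here is a standalone sketch that traces the
same arithmetic for one of the Tensor Core test workloads added in this commit,
assuming stride 1 and symmetric padding in place of topi.nn.get_pad_tuple:

def winograd_judge_sketch(N, H, W, KH, KW, CI, CO, pad, stride, dilation, tile_size):
    # Output spatial size for a symmetrically padded convolution.
    OH = (H + 2 * pad - KH) // stride + 1
    OW = (W + 2 * pad - KW) // stride + 1
    # Each Winograd tile covers an m x m output patch, m = tile_size.
    nH, nW = (OH + tile_size - 1) // tile_size, (OW + tile_size - 1) // tile_size
    P = N * nH * nW  # batch dimension of the batched GEMM
    # WMMA fragments are 16x16x16, 8x32x16 or 32x8x16, hence the divisibility checks.
    tensorcore_ok = (P % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or \
                    (P % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or \
                    (P % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)
    shape_ok = 2 < KH < 8 and KH == KW and stride == 1 and dilation == 1
    return tensorcore_ok, shape_ok

# N=8, 56x56 input, 64 -> 64 channels, 3x3 kernel: H % 8 == 0 gives tile_size=4,
# OH = OW = 56, nH = nW = 14, P = 8 * 14 * 14 = 1568, and 1568 % 16 == 0.
print(winograd_judge_sketch(8, 56, 56, 3, 3, 64, 64, 1, 1, 1, 4))  # (True, True)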
diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py
index 2b7a845..8ccd80f 100644
--- a/topi/python/topi/cuda/__init__.py
+++ b/topi/python/topi/cuda/__init__.py
@@ -25,6 +25,7 @@ from .conv2d import *
 from .conv2d_hwcn import *
 from .conv2d_int8 import *
 from .conv2d_winograd import *
+from .conv2d_nhwc_winograd import *
 from .depthwise_conv2d import *
 from .group_conv2d_nchw import *
 from . import conv2d_alter_op
diff --git a/topi/python/topi/cuda/conv2d_alter_op.py b/topi/python/topi/cuda/conv2d_alter_op.py
index 8d9e86c..c1e207c 100644
--- a/topi/python/topi/cuda/conv2d_alter_op.py
+++ b/topi/python/topi/cuda/conv2d_alter_op.py
@@ -111,6 +111,42 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
         return relay.nn.contrib_conv2d_winograd_without_weight_transform(
             inputs[0], weight, **new_attrs)
 
+    if topi_tmpl in ('conv2d_nhwc_winograd_direct.cuda', 'conv2d_nhwc_winograd_tensorcore.cuda'):
+        if dilation != (1, 1):
+            logger.warning("Does not support weight pre-transform for dilated convolution.")
+            return None
+
+        assert data_layout == "NHWC" and kernel_layout == "HWIO"
+        N, H, W, CI = get_const_tuple(data.shape)
+        KH, KW, _, CO = get_const_tuple(kernel.shape)
+
+        # Pre-compute weight transformation in winograd
+        if H % 8 == 0:
+            tile_size = 4
+        else:
+            tile_size = 2
+        kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1])
+        weight = relay.nn.contrib_conv2d_winograd_weight_transform(kernel_transform,
+                                                                   tile_size=tile_size)
+        weight = relay.transpose(weight, axes=[0, 1, 3, 2])
+        new_attrs['tile_size'] = tile_size
+        new_attrs['channels'] = CO
+        # Store the same config for the altered operator (workload)
+        new_data = data
+        new_weight = te.placeholder((KH + tile_size - 1, KW + tile_size - 1, CI, CO),
+                                    dtype=kernel.dtype)
+        if topi_tmpl == "conv2d_nhwc_winograd_direct.cuda":
+            new_workload = autotvm.task.args_to_workload(
+                [new_data, new_weight, strides, padding, dilation, out_dtype],
+                "conv2d_nhwc_winograd_direct_without_weight_transform.cuda")
+        elif topi_tmpl == "conv2d_nhwc_winograd_tensorcore.cuda":
+            new_workload = autotvm.task.args_to_workload(
+                [new_data, new_weight, strides, padding, dilation, out_dtype],
+                "conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda")
+        dispatch_ctx.update(target, new_workload, cfg)
+        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
+            inputs[0], weight, **new_attrs)
+
     if topi_tmpl == "group_conv2d_NCHWc_int8.cuda":
         assert data_layout == "NCHW" and kernel_layout == "OIHW"
         N, CI, H, W = get_const_tuple(data.shape)
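
For reference, the numerical effect of the weight pre-transform inserted above can
be sketched in numpy. This is a sketch only, hard-coded to the standard F(2x2, 3x3)
transform matrix G (tile_size=2, alpha=4); the committed code delegates to the
relay/TOPI transforms and also handles tile_size=4:

import numpy as np

# Kernel-transform matrix G for F(2x2, 3x3), so alpha = KH + tile_size - 1 = 4.
G = np.array([[1.0, 0.0, 0.0],
              [0.5, 0.5, 0.5],
              [0.5, -0.5, 0.5],
              [0.0, 0.0, 1.0]])

def pre_transform_weight(kernel_hwio):
    # HWIO -> OIHW, mirroring relay.transpose(inputs[1], axes=[3, 2, 0, 1]).
    g = np.transpose(kernel_hwio, (3, 2, 0, 1))
    # U[a, d, o, i] = sum over b, c of G[a, b] * g[o, i, b, c] * G[d, c],
    # i.e. G @ g @ G.T applied per (output, input) channel pair.
    U = np.einsum('ab,oibc,dc->adoi', G, g, G)
    # (alpha, alpha, CO, CI) -> (alpha, alpha, CI, CO), as relay.transpose(weight, [0, 1, 3, 2]).
    return np.transpose(U, (0, 1, 3, 2))

w = np.random.randn(3, 3, 16, 32).astype('float32')  # (KH, KW, CI, CO)
print(pre_transform_weight(w).shape)                  # (4, 4, 16, 32)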
diff --git a/topi/python/topi/cuda/conv2d_nhwc_winograd.py b/topi/python/topi/cuda/conv2d_nhwc_winograd.py
new file mode 100644
index 0000000..2f5b85e
--- /dev/null
+++ b/topi/python/topi/cuda/conv2d_nhwc_winograd.py
@@ -0,0 +1,639 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,unused-variable,unused-argument
+# pylint: disable=too-many-arguments,too-many-locals
+# pylint: disable=too-many-statements
+"""Winograd template for cuda backend"""
+
+import tvm
+from tvm import te
+from tvm import autotvm
+from .. import nn
+from ..util import get_const_int, get_const_tuple, traverse_inline
+from ..nn.winograd_util import winograd_transform_matrices
+from .tensor_intrin import intrin_wmma_load_matrix_A
+from .tensor_intrin import intrin_wmma_load_matrix_W
+from .tensor_intrin import intrin_wmma_store_matrix
+from .tensor_intrin import intrin_wmma_gemm
+
+def _infer_tile_size(data, kernel):
+    """Compute the tile size"""
+    N, H, W, CI = get_const_tuple(data.shape)
+    if H % 8 == 0:
+        return 4
+    return 2
+
+
+def schedule_bgemm_tensorcore(cfg, s, bgemm, data_pack, kernel_pack):
+    """Schedule for bgemm tensorcore"""
+    A = data_pack
+    B = kernel_pack
+    C = bgemm
+    _, _, P, out_dim = get_const_tuple(C.shape)
+    out_dtype = C.dtype
+
+    # Explicit memory access
+    AS = s.cache_read(A, 'shared', [C])
+    BS = s.cache_read(B, 'shared', [C])
+    AF = s.cache_read(AS, 'wmma.matrix_a', [C])
+    BF = s.cache_read(BS, 'wmma.matrix_b', [C])
+    CF = s.cache_write(C, 'wmma.accumulator')
+    CS = s.cache_read(CF, 'shared', [C])
+
+    # Create tuning space
+    cfg.define_knob("block_row_warps", [1, 2, 4])
+    cfg.define_knob("block_col_warps", [1, 2, 4])
+    cfg.define_knob("warp_row_tiles", [1, 2, 4, 8])
+    cfg.define_knob("warp_col_tiles", [1, 2, 4, 8])
+    cfg.define_knob("chunk", [1, 2, 4, 8])
+    cfg.define_knob("offset", [0, 1, 2, 4, 8])
+    cfg.define_knob("offsetCS", [0, 1, 2, 4, 8])
+    cfg.define_knob("vec", [1, 2, 4, 8])
+
+    # Ensure that the default parameters are applicable when autotvm is not in use
+    if (P % 16 == 0 and out_dim % 16 == 0):
+        cfg.define_knob("wmma_m", [16, 8, 32])
+    elif (P % 32 == 0 and out_dim % 8 == 0):
+        cfg.define_knob("wmma_m", [32, 16, 8])
+    elif (P % 8 == 0 and out_dim % 32 == 0):
+        cfg.define_knob("wmma_m", [8, 16, 32])
+
+    warp_size = 32
+    wmma_k = 16
+    block_row_warps = cfg["block_row_warps"].val
+    block_col_warps = cfg["block_col_warps"].val
+    warp_row_tiles = cfg["warp_row_tiles"].val
+    warp_col_tiles = cfg["warp_col_tiles"].val
+    chunk = cfg["chunk"].val
+    offsetAB = cfg["offset"].val
+    offsetCS = cfg["offsetCS"].val
+    wmma_m = cfg["wmma_m"].val
+    vec = cfg["vec"].val
+
+    if wmma_m == 16:
+        wmma_n = 16
+    elif wmma_m == 8:
+        wmma_n = 32
+    elif wmma_m == 32:
+        wmma_n = 8
+
+    # Define the stride of intrin functions
+    AS_align = chunk * wmma_k + offsetAB
+    BS_align = warp_col_tiles * block_col_warps * wmma_n + offsetAB
+    CS_align = warp_col_tiles * block_col_warps * wmma_n + offsetCS
+    AS_stride = [AS_align, 1]
+    BS_stride = [BS_align, 1]
+    AF_stride = [wmma_k, 1]
+    BF_stride = [wmma_n * warp_col_tiles, 1]
+    CF_stride = [warp_col_tiles * wmma_n, 1]
+    CS_stride = [CS_align, 1]
+    block_x = te.thread_axis('blockIdx.x')
+    block_y = te.thread_axis('blockIdx.y')
+    block_z = te.thread_axis('blockIdx.z')
+    thread_x = te.thread_axis('threadIdx.x')
+    thread_y = te.thread_axis('threadIdx.y')
+    thread_z = te.thread_axis('threadIdx.z')
+
+    # Schedule for computation
+    block_factor_b = wmma_m * warp_row_tiles * block_row_warps
+    block_factor_o = wmma_n * warp_col_tiles * block_col_warps
+    alpha_1, alpha_2, b, o = C.op.axis
+    block_k = s[C].fuse(alpha_1, alpha_2)
+    block_i, bc = s[C].split(b, factor=block_factor_b)
+    block_j, oc = s[C].split(o, factor=block_factor_o)
+    s[C].reorder(block_k, block_i, block_j, bc, oc)
+    t = s[C].fuse(bc, oc)
+    t, vi = s[C].split(t, factor=vec)
+    t, tx = s[C].split(t, factor=warp_size)
+    t, ty = s[C].split(t, factor=block_row_warps)
+    t, tz = s[C].split(t, factor=block_col_warps)
+    s[C].bind(block_k, block_z)
+    s[C].bind(block_i, block_x)
+    s[C].bind(block_j, block_y)
+    s[C].bind(tz, thread_z)
+    s[C].bind(ty, thread_y)
+    s[C].bind(tx, thread_x)
+    s[C].vectorize(vi)
+
+    # Schedule for wmma store
+    s[CS].compute_at(s[C], block_j)
+    _, _, bb, oo = CS.op.axis
+    s[CS].storage_align(bb, CS_align - 1, CS_align)
+    bb, bbi = s[CS].split(bb, factor=wmma_m)
+    oo, ooi = s[CS].split(oo, factor=wmma_n)
+    bb, bbii = s[CS].split(bb, factor=warp_row_tiles)
+    oo, ooii = s[CS].split(oo, factor=warp_col_tiles)
+    s[CS].reorder(bb, oo, bbii, ooii, bbi, ooi)
+
+    # Schedule for wmma computation
+    s[CF].compute_at(s[CS], oo)
+    _, _, warp_i, warp_j = CF.op.axis
+    warp_i, _ii = s[CF].split(warp_i, factor=wmma_m)
+    warp_j, _jj = s[CF].split(warp_j, factor=wmma_n)
+    k, = CF.op.reduce_axis
+    k, _k = s[CF].split(k, factor=wmma_k)
+    ko, ki = s[CF].split(k, factor=chunk)
+    s[CF].reorder(ko, ki, warp_i, warp_j, _ii, _jj, _k)
+
+    # Schedule for wmma_matrix_a load
+    s[AF].compute_at(s[CF], ki)
+    _, _, b, i = AF.op.axis
+    b, b_ii = s[AF].split(b, factor=wmma_m)
+    i, i_jj = s[AF].split(i, factor=wmma_k)
+    s[AF].reorder(b, i, b_ii, i_jj)
+
+    # Schedule for wmma_matrix_b load
+    s[BF].compute_at(s[CF], ki)
+    _, _, i, o = BF.op.axis
+    o, o_ii = s[BF].split(o, factor=wmma_n)
+    i, i_ii = s[BF].split(i, factor=wmma_k)
+    s[BF].reorder(i, o, i_ii, o_ii)
+
+    # Schedule for A's (B's) shared memory load
+    def shared_shedule(stage, strides):
+        s[stage].compute_at(s[CF], ko)
+        _, _, xo, yo = stage.op.axis
+        s[stage].storage_align(xo, strides - 1, strides)
+        t = s[stage].fuse(xo, yo)
+        t, vi = s[stage].split(t, factor=vec)
+        t, tx = s[stage].split(t, factor=warp_size)
+        t, ty = s[stage].split(t, factor=block_row_warps)
+        _, tz = s[stage].split(t, factor=block_col_warps)
+        s[stage].bind(ty, thread_y)
+        s[stage].bind(tz, thread_z)
+        s[stage].bind(tx, thread_x)
+        s[stage].vectorize(vi)
+
+    shared_shedule(AS, AS_align)
+    shared_shedule(BS, BS_align)
+
+    shape = (wmma_m, wmma_n, wmma_k)
+    in_dtype = 'float16'
+    AL_gemm = te.placeholder((wmma_m, wmma_k), name='AL_gemm', dtype=in_dtype)
+    BL_gemm = te.placeholder((wmma_k, wmma_n), name='BL_gemm', dtype=in_dtype)
+    k_gemm = te.reduce_axis((0, wmma_k), name='k_gemm')
+    CL_compute = te.compute((wmma_m, wmma_n), lambda ii, jj:
+                            te.sum(AL_gemm[ii, k_gemm].astype(out_dtype) *
+                                   BL_gemm[k_gemm, jj].astype(out_dtype),
+                                   axis=k_gemm), name='CL_compute')
+
+    # Lower the computation loops down to TensorCore hardware intrinsics
+    # by mapping the tensorcore to tensor intrinsics
+    s[AF].tensorize(b_ii, intrin_wmma_load_matrix_A(AF_stride, AS_stride, shape, "row_major",
+                                                    (wmma_m, wmma_k), (wmma_m, wmma_k), 'float16'))
+    s[BF].tensorize(i_ii, intrin_wmma_load_matrix_W(BF_stride, BS_stride, shape, "row_major",
+                                                    (wmma_k, wmma_n), (wmma_k, wmma_n), 'float16'))
+    s[CF].tensorize(_ii, intrin_wmma_gemm(AL_gemm, BL_gemm, CL_compute, AF_stride,
+                                          BF_stride, CF_stride, shape))
+    s[CS].tensorize(bbi, intrin_wmma_store_matrix(CS_stride, CF_stride, shape, out_dtype,
+                                                  (wmma_m, wmma_n), (wmma_m, wmma_n)))
+
+
+def schedule_bgemm_direct(cfg, s, bgemm, data_pack, kernel_pack):
+    """Schedule for bgemm direct"""
+    b1, b2, y, x = s[bgemm].op.axis
+    rc = s[bgemm].op.reduce_axis[0]
+    alpha = get_const_int(b1.dom.extent)
+
+    # Create tuning space
+    cfg.define_split("tile_b", cfg.axis(alpha * alpha), num_outputs=4,
+                     filter=lambda x: x.size[-3:] == [1, 1, 1])
+    cfg.define_split("tile_y", y, num_outputs=4)
+    cfg.define_split("tile_x", x, num_outputs=4)
+    cfg.define_split("tile_rc", rc, num_outputs=2)
+    cfg.define_knob("offset_bgemm", [0, 1, 2, 4, 8])
+    cfg.define_knob("vector_bgemm", [1, 2, 4, 8])
+    offset_bgemm = cfg["offset_bgemm"].val
+    vector_bgemm = cfg["vector_bgemm"].val
+
+    C = bgemm
+    A0, B0 = kernel_pack, data_pack
+
+    # Designate the memory hierarchy
+    OL = s.cache_write(C, 'local')
+    AA = s.cache_read(A0, 'shared', [OL])
+    BB = s.cache_read(B0, 'shared', [OL])
+
+    # Tile and bind spatial axes
+    b = s[bgemm].fuse(b1, b2)
+    bgemm_scope, b = s[bgemm].split(b, nparts=1)
+    bz, vz, tz, zi = cfg["tile_b"].apply(s, C, b)
+    by, vy, ty, yi = cfg["tile_y"].apply(s, C, y)
+    bx, vx, tx, xi = cfg["tile_x"].apply(s, C, x)
+    s[C].bind(bz, te.thread_axis("blockIdx.z"))
+    s[C].bind(by, te.thread_axis("blockIdx.y"))
+    s[C].bind(bx, te.thread_axis("blockIdx.x"))
+    s[C].bind(vz, te.thread_axis("vthread"))
+    s[C].bind(vy, te.thread_axis("vthread"))
+    s[C].bind(vx, te.thread_axis("vthread"))
+    s[C].bind(tz, te.thread_axis("threadIdx.z"))
+    s[C].bind(ty, te.thread_axis("threadIdx.y"))
+    s[C].bind(tx, te.thread_axis("threadIdx.x"))
+    s[C].reorder(bgemm_scope, bz, by, bx, vz, vy, vx, tz, ty, tx, zi, yi, xi)
+
+    # Tile reduction axes
+    s[OL].compute_at(s[C], tx)
+    b1, b2, y, x = s[OL].op.axis
+    b = s[OL].fuse(b1, b2)
+    rc, = s[OL].op.reduce_axis
+    rco, rci = cfg['tile_rc'].apply(s, OL, rc)
+    s[OL].reorder(rco, b, y, x, rci)
+
+    s[AA].compute_at(s[OL], rco)
+    _, _, k, n = s[AA].op.axis
+    AA_align = offset_bgemm + cfg["tile_x"].size[1] * cfg["tile_x"].size[2] * cfg["tile_x"].size[3]
+    s[AA].storage_align(k, AA_align - 1, AA_align)
+
+    s[BB].compute_at(s[OL], rco)
+    _, _, m, k = s[BB].op.axis
+    BB_align = offset_bgemm + cfg["tile_rc"].size[1]
+    s[BB].storage_align(m, BB_align - 1, BB_align)
+
+    # Schedule for A and B shared memory load
+    for load in [AA, BB]:
+        fused = s[load].fuse(*list(s[load].op.axis))
+        fused, ti = s[load].split(fused, factor=vector_bgemm)
+        fused, tx = s[load].split(fused, cfg["tile_x"].size[2])
+        fused, ty = s[load].split(fused, cfg["tile_y"].size[2])
+        fused, tz = s[load].split(fused, cfg["tile_b"].size[2])
+        s[load].bind(tz, te.thread_axis("threadIdx.z"))
+        s[load].bind(ty, te.thread_axis("threadIdx.y"))
+        s[load].bind(tx, te.thread_axis("threadIdx.x"))
+        s[load].vectorize(ti)
+
+
+def nhwc_winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype,
+                       use_tensorcore, pre_computed):
+    """Compute declaration for winograd"""
+    tile_size = _infer_tile_size(data, kernel)
+    N, H, W, CI = get_const_tuple(data.shape)
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides
+
+    if not pre_computed:  # Kernel tensor is raw tensor, do strict check
+        if dilation_h != 1 or dilation_w != 1:
+            kernel = nn.dilate(kernel, (dilation_h, dilation_w, 1, 1))
+        KH, KW, CI, CO = get_const_tuple(kernel.shape)
+        alpha = KW + tile_size - 1
+        assert HSTR == 1 and WSTR == 1 and KH == KW
+    else:
+        # Kernel tensor is pre-transformed. This op is created by conv2d_alter_op.
+        # Dilation is not supported
+        alpha, _, CI, CO = get_const_tuple(kernel.shape)
+        KH = KW = alpha + 1 - tile_size
+        assert HSTR == 1 and WSTR == 1 and dilation_h == 1 and dilation_w == 1
+
+    pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW))
+    data_pad = nn.pad(data, (0, pt, pl, 0), (0, pb, pr, 0), name="data_pad")
+
+    r = KW
+    m = tile_size
+    H = (H + pt + pb - KH) // HSTR + 1
+    W = (W + pl + pr - KW) // WSTR + 1
+    nH, nW = (H + m - 1) // m, (W + m - 1) // m
+    P = N * nH * nW
+
+    # Determine whether the shape is eligible for Tensor Cores
+    shape_judge = (P % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or \
+                      (P % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or \
+                      (P % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)
+
+    if shape_judge and use_tensorcore:
+        trans_type = "float16"
+    else:
+        trans_type = data.dtype
+
+    # Compute transform matrix
+    A, _, _ = winograd_transform_matrices(m, r, out_dtype)
+    _, B, G = winograd_transform_matrices(m, r, data.dtype)
+
+    # Transform kernel
+    if not pre_computed:
+        # Check if we are currently tuning, if so we want to avoid counting
+        # prepacking in time costs. Just use a placeholder with the packed shape instead.
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            kernel_pack = te.placeholder((alpha, alpha, CI, CO),
+                                         dtype=kernel.dtype,
+                                         name='kernel_pack')
+        else:
+            r_kh = te.reduce_axis((0, KH), name='r_kh')
+            r_kw = te.reduce_axis((0, KW), name='r_kw')
+            kernel_pack = te.compute((alpha, alpha, CI, CO), lambda eps, nu, ci, co:
+                                     te.sum((kernel[r_kh][r_kw][ci][co]) *
+                                            G[eps][r_kh] * G[nu][r_kw],
+                                            axis=[r_kh, r_kw]), name='kernel_pack')
+    else:
+        kernel_pack = kernel
+
+    idxdiv = tvm.tir.indexdiv
+    idxmod = tvm.tir.indexmod
+
+    # Pack input tile
+    input_tile = te.compute((P, CI, alpha, alpha), lambda p, c, eps, nu:
+                            data_pad[idxdiv(p, (nH * nW)),
+                                     idxmod(idxdiv(p, nW), nH) * m + eps,
+                                     idxmod(p, nW) * m + nu,
+                                     c], name='d')
+
+    # Transform data
+    r_a = te.reduce_axis((0, alpha), 'r_a')
+    r_b = te.reduce_axis((0, alpha), 'r_b')
+    data_pack = te.compute((alpha, alpha, P, CI), lambda eps, nu, p, ci:
+                           te.sum(input_tile[p][ci][r_a][r_b] * B[r_a][eps] * B[r_b][nu],
+                                  axis=[r_a, r_b]), name='data_pack')
+
+    # Convert data type of input feature maps and weights for tensorcore
+    Transdata = te.compute(
+        data_pack.shape, lambda eps, nu, p, ci: data_pack[eps, nu, p, ci].astype(trans_type))
+    TransFilter = te.compute(
+        kernel_pack.shape, lambda eps, nu, ci, co: kernel_pack[eps, nu, ci, co].astype(trans_type))
+
+    # Do batch gemm
+    ci = te.reduce_axis((0, CI), name='ci')
+    bgemm = te.compute((alpha, alpha, P, CO), lambda eps, nu, p, co:
+                       te.sum((Transdata[eps][nu][p][ci]).astype(out_dtype) *
+                              (TransFilter[eps][nu][ci][co]).astype(out_dtype),
+                              axis=[ci]), name='bgemm')
+
+    # Inverse transform
+    r_a = te.reduce_axis((0, alpha), 'r_a')
+    r_b = te.reduce_axis((0, alpha), 'r_b')
+    inverse = te.compute((P, CO, m, m), lambda p, co, vh, vw:
+                         te.sum(bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw],
+                                axis=[r_a, r_b]), name='inverse')
+
+    # Output
+    output = te.compute((N, H, W, CO), lambda n, h, w, co:
+                        inverse[n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m),
+                                co,
+                                idxmod(h, m),
+                                idxmod(w, m)],
+                        name='output', tag='conv2d_nhwc_winograd')
+    cfg.add_flop(2 * N * CO * H * W * CI * KH * KW)
+    return output
+
+
+def data_weight_transform(s, data_trans, input_tile, thread_num_trans, offset_trans, trans_tag):
+    """Schedule for data or kernel transform"""
+    kernel_align = thread_num_trans + offset_trans
+    indata_s = s.cache_read(input_tile, 'shared', [data_trans])
+    data_l = s.cache_write(data_trans, 'local')
+    # Schedule for data or kernel transform
+    eps, nu, p, c = s[data_trans].op.axis
+
+    block_x, thread_x = s[data_trans].split(c, thread_num_trans)
+    block_x = s[data_trans].fuse(p, block_x)
+    s[data_trans].reorder(block_x, thread_x, eps, nu)
+    s[data_trans].bind(thread_x, te.thread_axis("threadIdx.x"))
+    s[data_trans].bind(block_x, te.thread_axis("blockIdx.x"))
+
+    s[data_l].compute_at(s[data_trans], thread_x)
+    eps_l, nu_l, p_l, c_l = s[data_l].op.axis
+    r_a, r_b = s[data_l].op.reduce_axis
+    block_x_l, thread_x_l = s[data_l].split(c_l, thread_num_trans)
+    block_x_l = s[data_l].fuse(p_l, block_x_l)
+
+    s[data_l].reorder(block_x_l, thread_x_l, eps_l, nu_l, r_a, r_b)
+
+    for axis in [eps_l, nu_l, r_a, r_b]:
+        s[data_l].unroll(axis)
+
+    # Schedule for shared memory load
+    s[indata_s].compute_at(s[data_l], block_x_l)
+    if trans_tag == "data":
+        p_is, c_is, eps_is, nu_is = s[indata_s].op.axis
+        data_align = get_const_int(eps_is.dom.extent) * \
+                         get_const_int(nu_is.dom.extent) + offset_trans
+        s[indata_s].storage_align(c_is, data_align - 1, data_align)
+        block_x_is, thread_x_is = s[indata_s].split(c_is, thread_num_trans)
+        s[indata_s].bind(thread_x_is, te.thread_axis("threadIdx.x"))
+    else:
+        eps_is, nu_is, ci_is, co_is = s[indata_s].op.axis
+        s[indata_s].storage_align(nu_is, kernel_align - 1, kernel_align)
+        block_x_is, thread_x_is = s[indata_s].split(co_is, thread_num_trans)
+        s[indata_s].reorder(ci_is, block_x_is, eps_is, nu_is, thread_x_is)
+        s[indata_s].bind(thread_x_is, te.thread_axis("threadIdx.x"))
+
+
+def schedule_nhwc_winograd_cuda(cfg, s, output, use_tensorcore, pre_computed):
+    """Schedule winograd template"""
+    # Get stages
+    inverse = s[output].op.input_tensors[0]
+    bgemm, A = s[inverse].op.input_tensors
+    Transdata, TransFilter = s[bgemm].op.input_tensors
+    data_pack = s[Transdata].op.input_tensors[0]
+    kernel_pack = s[TransFilter].op.input_tensors[0]
+    s[Transdata].compute_inline()
+    s[TransFilter].compute_inline()
+
+    input_tile, B = s[data_pack].op.input_tensors
+    pad_data = s[input_tile].op.input_tensors[0]
+
+    # Create tuning space for the data, kernel and inverse transform stages
+    cfg.define_knob("thread_num_inverse", [1, 32, 64, 128, 256])
+    cfg.define_knob("thread_num_data", [1, 32, 64, 128, 256])
+    cfg.define_knob("thread_num_kernel", [1, 32, 64, 128, 256])
+    cfg.define_knob("offset_inverse", [0, 2, 4])
+    cfg.define_knob("offset_data", [0, 1, 2, 4])
+    cfg.define_knob("offset_kernel", [0, 1, 2, 4])
+    cfg.define_knob("inverse_in_vector", [1, 2, 4])
+
+    thread_num_data = cfg["thread_num_data"].val
+    thread_num_kernel = cfg["thread_num_kernel"].val
+    thread_num_inverse = cfg["thread_num_inverse"].val
+    offset_data = cfg["offset_data"].val
+    offset_kernel = cfg["offset_kernel"].val
+    offset_inverse = cfg["offset_inverse"].val
+    inverse_in_vector = cfg["inverse_in_vector"].val
+
+    # Data transform
+    s[B].compute_inline()
+    data_weight_transform(s, data_pack, input_tile, thread_num_data, offset_data, trans_tag="data")
+    s[input_tile].compute_inline()
+    s[pad_data].compute_inline()
+
+    # Kernel transform
+    if not pre_computed and not autotvm.GLOBAL_SCOPE.in_tuning:
+        kernel, G = s[kernel_pack].op.input_tensors
+        s[G].compute_inline()
+        data_weight_transform(s, kernel_pack, kernel, thread_num_kernel,
+                              offset_kernel, trans_tag="kernel")
+    else:
+        kernel = kernel_pack
+
+    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
+        s[kernel].compute_inline()
+
+    b1, b2, y, x = s[bgemm].op.axis
+    alpha = get_const_int(b1.dom.extent)
+    _, _, P, CI = get_const_tuple(Transdata.shape)
+    _, _, _, CO = get_const_tuple(TransFilter.shape)
+
+    # Determine whether the shape is eligible for Tensor Cores
+    shape_judge = (P % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or \
+                      (P % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or \
+                      (P % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)
+
+    if shape_judge and use_tensorcore:
+        schedule_bgemm_tensorcore(cfg, s, bgemm, Transdata, TransFilter)
+    else:
+        schedule_bgemm_direct(cfg, s, bgemm, Transdata, TransFilter)
+
+    # Schedule inverse, output and fusion
+    if output.op in s.outputs:
+        OL = None
+    else:
+        OL = output
+        s[OL].set_scope('local')
+        output = s.outputs[0]
+
+    s[A].compute_inline()
+    inverse_s = s.cache_read(bgemm, 'shared', [inverse])
+
+    m = alpha - 3 + 1
+    offset_inverse_in = offset_inverse
+    vector_width_inverse_in = inverse_in_vector
+
+    # Schedule for output
+    n, h, w, co = s[output].op.axis
+    ho, wo, hi, wi = s[output].tile(h, w, m, m)
+    s[output].reorder(n, ho, wo, co, hi, wi)
+    fused = s[output].fuse(n, ho, wo)
+
+    block_x_s, thread_x_s = s[output].split(co, thread_num_inverse)
+    block_x_s = s[output].fuse(fused, block_x_s)
+    s[output].reorder(block_x_s, thread_x_s, hi, wi)
+
+    if OL is not None:
+        s[OL].compute_inline()
+
+    # Schedule for inverse
+    s[inverse].compute_at(s[output], thread_x_s)
+    p_inv, co_inv, eps_inv, nu_inv = s[inverse].op.axis
+    block_x_inv, thread_x_inv = s[inverse].split(co_inv, thread_num_inverse)
+    r_a, r_b = s[inverse].op.reduce_axis
+    for axis in [eps_inv, nu_inv, r_a, r_b]:
+        s[inverse].unroll(axis)
+
+    # Schedule for shared memory load
+    s[inverse_s].compute_at(s[output], block_x_s)
+    eps_inv_s, nu_inv_s, p_inv_s, co_inv_s = s[inverse_s].op.axis
+    inverse_in_align = offset_inverse_in + thread_num_inverse
+    s[inverse_s].storage_align(p_inv_s, inverse_in_align - 1, inverse_in_align)
+    block_x_inv_s, thread_x_inv_s = s[inverse_s].split(co_inv_s, thread_num_inverse)
+    block_x_inv_s = s[inverse_s].fuse(p_inv_s, block_x_inv_s)
+    s[inverse_s].reorder(block_x_inv_s, eps_inv_s, nu_inv_s, thread_x_inv_s)
+    t = s[inverse_s].fuse(eps_inv_s, nu_inv_s, thread_x_inv_s)
+    t, ti = s[inverse_s].split(t, factor=vector_width_inverse_in)
+    t, tx = s[inverse_s].split(t, factor=thread_num_inverse)
+    s[inverse_s].bind(tx, te.thread_axis("threadIdx.x"))
+    s[inverse_s].vectorize(ti)
+
+    s[output].bind(thread_x_s, te.thread_axis("threadIdx.x"))
+    s[output].bind(block_x_s, te.thread_axis("blockIdx.x"))
+    return s
+
+
+@autotvm.register_topi_compute("conv2d_nhwc_winograd_direct.cuda")
+def conv2d_nhwc_winograd_direct(cfg, data, kernel, strides, padding, dilation, out_dtype):
+    """Compute conv2d with winograd for NHWC layout"""
+    return nhwc_winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype,
+                              use_tensorcore=False, pre_computed=False)
+
+
+@autotvm.register_topi_schedule("conv2d_nhwc_winograd_direct.cuda")
+def schedule_conv2d_nhwc_winograd_direct(cfg, outs):
+    """TOPI schedule callback"""
+    s = te.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if 'conv2d_nhwc_winograd' in op.tag:
+            schedule_nhwc_winograd_cuda(cfg, s, op.output(0), use_tensorcore=False,
+                                        pre_computed=False)
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
+
+
+@autotvm.register_topi_compute("conv2d_nhwc_winograd_tensorcore.cuda")
+def conv2d_nhwc_winograd_tensorcore(cfg, data, kernel, strides, padding, dilation, out_dtype):
+    """Compute conv2d with winograd for NHWC layout"""
+    return nhwc_winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype,
+                              use_tensorcore=True, pre_computed=False)
+
+
+@autotvm.register_topi_schedule("conv2d_nhwc_winograd_tensorcore.cuda")
+def schedule_conv2d_nhwc_winograd_tensorcore(cfg, outs):
+    """TOPI schedule callback"""
+    s = te.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if 'conv2d_nhwc_winograd' in op.tag:
+            schedule_nhwc_winograd_cuda(cfg, s, op.output(0), use_tensorcore=True,
+                                        pre_computed=False)
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
+
+
+@autotvm.register_topi_compute("conv2d_nhwc_winograd_direct_without_weight_transform.cuda")
+def conv2d_nhwc_winograd_direct_without_weight_transform(cfg, data, kernel, strides,
+                                                         padding, dilation, out_dtype):
+    """Compute conv2d with winograd for NHWC layout"""
+    return nhwc_winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype,
+                              use_tensorcore=False, pre_computed=True)
+
+
+@autotvm.register_topi_schedule("conv2d_nhwc_winograd_direct_without_weight_transform.cuda")
+def schedule_conv2d_nhwc_winograd_direct_without_weight_transform(cfg, outs):
+    """TOPI schedule callback"""
+    s = te.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if 'conv2d_nhwc_winograd' in op.tag:
+            schedule_nhwc_winograd_cuda(cfg, s, op.output(0), use_tensorcore=False,
+                                        pre_computed=True)
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
+
+
+@autotvm.register_topi_compute("conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda")
+def conv2d_nhwc_winograd_tensorcore_without_weight_transform(cfg, data, kernel, strides,
+                                                             padding, dilation, out_dtype):
+    """Compute conv2d with winograd for NHWC layout"""
+    return nhwc_winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype,
+                              use_tensorcore=True, pre_computed=True)
+
+
+@autotvm.register_topi_schedule("conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda")
+def schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform(cfg, outs):
+    """TOPI schedule callback"""
+    s = te.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if 'conv2d_nhwc_winograd' in op.tag:
+            schedule_nhwc_winograd_cuda(cfg, s, op.output(0), use_tensorcore=True,
+                                        pre_computed=True)
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
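
The heart of the template above is the batch GEMM: once data and kernel are
transformed, the alpha x alpha tile positions index independent [P, CI] x [CI, CO]
matrix multiplies, which is exactly the shape WMMA wants. A minimal numpy rendering
of the bgemm stage (illustrative only; names mirror the te.compute declarations):

import numpy as np

alpha, P, CI, CO = 6, 1568, 64, 64   # e.g. tile_size=4 and a 3x3 kernel give alpha=6
Transdata = np.random.randn(alpha, alpha, P, CI).astype('float16')
TransFilter = np.random.randn(alpha, alpha, CI, CO).astype('float16')

# bgemm[eps, nu, p, co] = sum over ci of Transdata[eps, nu, p, ci] * TransFilter[eps, nu, ci, co],
# accumulated in float32, matching the astype(out_dtype) in the compute above.
bgemm = np.einsum('abpc,abco->abpo',
                  Transdata.astype('float32'), TransFilter.astype('float32'))
print(bgemm.shape)  # (6, 6, 1568, 64): 36 independent [1568, 64] x [64, 64] GEMMs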
diff --git a/topi/tests/python/test_topi_conv2d_nhwc_winograd.py b/topi/tests/python/test_topi_conv2d_nhwc_winograd.py
new file mode 100644
index 0000000..45f0599
--- /dev/null
+++ b/topi/tests/python/test_topi_conv2d_nhwc_winograd.py
@@ -0,0 +1,152 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, too-many-locals, too-many-arguments
+# pylint: disable=bad-whitespace
+"""Example code to do convolution."""
+
+import numpy as np
+import tvm
+import topi
+import topi.testing
+from tvm import te
+from tvm.contrib.pickle_memoize import memoize
+from tvm.contrib import nvcc
+from topi.nn.util import get_pad_tuple
+from topi.util import get_const_tuple
+
+
+_conv2d_nhwc_winograd_tensorcore = {
+    "cuda": (topi.cuda.conv2d_nhwc_winograd_tensorcore,
+             topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore)
+}
+
+_conv2d_nhwc_winograd_direct = {
+    "cuda": (topi.cuda.conv2d_nhwc_winograd_direct,
+             topi.cuda.schedule_conv2d_nhwc_winograd_direct)
+}
+
+
+def verify_conv2d_nhwc(batch, in_channel, in_size, num_filter, kernel, stride,
+                       padding, dilation=1, add_bias=False, add_relu=False,
+                       devices='cuda', bgemm="direct"):
+    """Test the conv2d with winograd for nhwc layout"""
+    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
+    padding_sum = pad_top + pad_left + pad_bottom + pad_right
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (
+        batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation))
+
+    in_height = in_width = in_size
+
+    A = te.placeholder((batch, in_height, in_width, in_channel), name='A')
+    W = te.placeholder((kernel, kernel, in_channel, num_filter), name='W')
+    bias = te.placeholder((1, 1, 1, num_filter), name='bias')
+
+    a_shape = get_const_tuple(A.shape)
+    w_shape = get_const_tuple(W.shape)
+    bias_shape = get_const_tuple(bias.shape)
+    dtype = A.dtype
+
+    @memoize("topi.tests.test_topi_conv2d_nhwc.verify_conv2d_nhwc")
+    def get_ref_data():
+        a_np = np.random.uniform(size=a_shape).astype(dtype)
+        w_np = np.random.uniform(size=w_shape).astype(dtype)
+        b_np = np.random.uniform(size=bias_shape).astype(dtype)
+        dw_np = topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1))
+        c_np = topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding)
+        if add_bias:
+            b_np = np.random.uniform(size=bias_shape).astype(dtype)
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
+        return a_np, w_np, b_np, c_np
+
+    a_np, w_np, b_np, c_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            if bgemm == "direct":
+                fcompute, fschedule = topi.testing.dispatch(device,
+                                                            _conv2d_nhwc_winograd_direct)
+            elif bgemm == "tensorcore":
+                fcompute, fschedule = topi.testing.dispatch(device,
+                                                            _conv2d_nhwc_winograd_tensorcore)
+            C = fcompute(A, W, stride, padding, dilation, 'float32')
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = fschedule([C])
+
+        a = tvm.nd.array(a_np, ctx)
+        w = tvm.nd.array(w_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+        if add_bias:
+            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (
+                batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation))
+            func(a, w, b, c)
+        else:
+            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (
+                batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation))
+            func(a, w, c)
+
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=2e-3)
+
+    check_device(devices)
+
+
+def test_conv2d_nhwc_winograd_direct():
+    """Test the conv2d with winograd for nhwc layout"""
+    # resnet 18 workloads
+    print("test_winograd_direct...")
+    verify_conv2d_nhwc(1,  64, 56,  64, 3, 1, 1, bgemm="direct")
+    verify_conv2d_nhwc(1, 128, 28, 128, 3, 1, 1)
+    verify_conv2d_nhwc(1, 256, 14, 256, 3, 1, 1)
+    verify_conv2d_nhwc(1, 512,  7, 512, 3, 1, 1)
+    verify_conv2d_nhwc(1,  48, 35,  64, 5, 1, 2)
+
+    # weird workloads
+    verify_conv2d_nhwc(1,  1,  1,  1, 3, 1, 1)
+    verify_conv2d_nhwc(3,  3,  3,  3, 3, 1, 1)
+    verify_conv2d_nhwc(2, 13, 71, 59, 3, 1, 1)
+
+    # Asymmetric padding
+    verify_conv2d_nhwc(1, 512,  7, 512, 3, 1, "SAME")
+    verify_conv2d_nhwc(2,  48, 56,  48, 3, 1, (1, 1), add_relu=True)
+    verify_conv2d_nhwc(2,  48, 56,  48, 3, 1, "SAME", add_relu=True, add_bias=True)
+    verify_conv2d_nhwc(1, 48, 35,  48, 5, 1, "VALID")
+
+def test_conv2d_nhwc_winograd_tensorcore():
+    """Test the conv2d with winograd for nhwc layout"""
+    print("test_winograd_tensorcore...")
+    verify_conv2d_nhwc(8,  64, 56,  64, 3, 1, 1, bgemm="tensorcore")
+    verify_conv2d_nhwc(8, 128, 28, 128, 3, 1, 1, bgemm="tensorcore")
+    verify_conv2d_nhwc(8, 256, 14, 256, 3, 1, 1, bgemm="tensorcore")
+
+    verify_conv2d_nhwc(2,  64, 56,  64, 3, 1, (1, 1), add_relu=True, bgemm="tensorcore")
+    verify_conv2d_nhwc(2,  64, 56,  64, 3, 1, "SAME", add_relu=True, bgemm="tensorcore")
+
+if __name__ == "__main__":
+    test_conv2d_nhwc_winograd_direct()
+
+    if nvcc.have_tensorcore(tvm.gpu(0).compute_version):
+        test_conv2d_nhwc_winograd_tensorcore()
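
Condensed from verify_conv2d_nhwc above, a minimal end-to-end invocation of the
direct variant looks like this (a sketch assuming a CUDA-enabled build; untuned, it
runs with the autotvm fallback configuration, and the tensorcore variant
additionally needs a Tensor Core capable GPU):

import numpy as np
import tvm
import topi
from tvm import te

A = te.placeholder((1, 56, 56, 64), name='A')   # NHWC input
W = te.placeholder((3, 3, 64, 64), name='W')    # HWIO kernel
with tvm.target.create('cuda'):
    C = topi.cuda.conv2d_nhwc_winograd_direct(A, W, 1, 1, 1, 'float32')
    s = topi.cuda.schedule_conv2d_nhwc_winograd_direct([C])
func = tvm.build(s, [A, W, C], 'cuda')

ctx = tvm.gpu(0)
a = tvm.nd.array(np.random.uniform(size=(1, 56, 56, 64)).astype('float32'), ctx)
w = tvm.nd.array(np.random.uniform(size=(3, 3, 64, 64)).astype('float32'), ctx)
c = tvm.nd.array(np.zeros((1, 56, 56, 64), dtype='float32'), ctx)
func(a, w, c)   # pad=1, stride=1: output spatial dims match the input's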
