This is an automated email from the ASF dual-hosted git repository.

echuraev pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 015da7c185 [TOPI][ADRENO] Add conv2d transpose nchw texture schedule 
(#15786)
015da7c185 is described below

commit 015da7c185199add4af943970b6ee3d0a0661ec4
Author: krishnaraj36 <[email protected]>
AuthorDate: Wed Nov 15 11:37:40 2023 +0530

    [TOPI][ADRENO] Add conv2d transpose nchw texture schedule (#15786)
    
    * [TOPI][ADRENO] Add conv2d transpose nchw texture schedule
    
    Added the conv2d transpose strategy for adreno target and enable the
    optimized schedule.
    
    * Fix the whitespace lint error
    
    * Fix lint errors
    
    * Fix whitespace lint error
    
    * Removed unused variables
    
    * Add more conv2dTranspose testcases
    
    * empty update
    
    empty update for retrigger ci
    
    * Update test_conv2d_transpose_nchw_texture.py
    
    * Added more test cases to check memory scopes
    
    * Device specific alter_op_layout for conv2d_transpose
    
    * Fix in virtual device setup and added test case with scope check
    
    * Add the comment conv2d algo
    
    * Add the comment conv2d algo
    
    * Removed fp16 test case from texture
    
    It is failing for few gpu devices.
    
    * Remove opencl config change to avoid mainline conflict
    
    * Add the test case for 3-channel input, which runs with the cuda schedule
    
    * Fix in op strategy for out channel 3
    
    * Comment in test case for memory scope
    
    ---------
    
    Co-authored-by: Siva <[email protected]>
---
 python/tvm/relay/op/nn/_nn.py                      |   6 +
 python/tvm/relay/op/strategy/adreno.py             |  52 +++
 python/tvm/topi/adreno/__init__.py                 |   2 +
 .../tvm/topi/adreno/conv2d_transpose_alter_op.py   | 121 ++++++
 python/tvm/topi/adreno/conv2d_transpose_nchw.py    | 412 +++++++++++++++++++++
 python/tvm/topi/adreno/utils.py                    |  23 ++
 python/tvm/topi/nn/conv2d.py                       |  23 ++
 src/relay/transforms/annotate_texture_storage.cc   |   4 +
 .../test_conv2d_transpose_nchw_texture.py          | 325 ++++++++++++++++
 .../relay/opencl_texture/utils/adreno_utils.py     |   5 +-
 10 files changed, 972 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index c68685f0ae..6acaf43fe7 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -335,6 +335,12 @@ def legalize_conv2d_transpose(attrs, inputs, types):
     return topi.nn.conv2d_transpose_legalize(attrs, inputs, types)
 
 
[email protected]_alter_op_layout("nn.conv2d_transpose")
+def alter_op_layout_conv2d_transpose(attrs, inputs, tinfos, out_type):
+    """Alternate the layout of conv2d_transpose"""
+    return topi.nn.conv2d_transpose_alter_layout(attrs, inputs, tinfos, 
out_type)
+
+
 @reg.register_convert_op_layout("nn.conv2d_transpose")
 def convert_conv2d_transpose(attrs, inputs, tinfos, desired_layouts):
     """Convert Layout pass registration for conv2d_transpose op.
diff --git a/python/tvm/relay/op/strategy/adreno.py 
b/python/tvm/relay/op/strategy/adreno.py
index c180eeec74..bacace9ad4 100644
--- a/python/tvm/relay/op/strategy/adreno.py
+++ b/python/tvm/relay/op/strategy/adreno.py
@@ -215,6 +215,58 @@ def 
conv2d_winograd_without_weight_transform_strategy_adreno(attrs, inputs, out_
     return strategy
 
 
+@conv2d_transpose_strategy.register("adreno")
+def conv2d_transpose_strategy_adreno(attrs, inputs, out_type, target):
+    """conv2d_transpose adreno strategy"""
+    strategy = _op.OpStrategy()
+    _, kernel = inputs
+    dilation = attrs.get_int_tuple("dilation")
+    groups = attrs.groups
+    data_layout = attrs.data_layout
+    kernel_layout = attrs.kernel_layout
+    assert dilation == (1, 1), "not support dilate now"
+
+    if (groups == 1) and (
+        (data_layout == "NCHW" and kernel_layout == "IOHW")
+        or (data_layout == "NCHW4c" and kernel_layout == "IOHW4o")
+        or (data_layout == "NCHW" and kernel_layout == "IOHW4o")
+    ):
+        if len(kernel.shape) == 4:
+            _, oc, _, _ = get_const_tuple(kernel.shape)
+        else:
+            _, oc, _, _, _ = get_const_tuple(kernel.shape)
+        # We cannot use textures for case than number of channels is less than 
4.
+        # So, we use compute functions from cuda.
+        if len(kernel.shape) == 4 and oc < 4:
+            strategy.add_implementation(
+                wrap_compute_conv2d_transpose(topi.cuda.conv2d_transpose_nchw),
+                wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw),
+                name="conv2d_transpose_nchw.cuda",
+            )
+            return strategy
+        strategy.add_implementation(
+            wrap_compute_conv2d_transpose(topi.adreno.conv2d_transpose_nchwc),
+            wrap_topi_schedule(topi.adreno.schedule_conv2d_transpose_nchwc),
+            name="conv2d_transpose_nchwc.image2d",
+            plevel=10,
+        )
+    elif data_layout == "NCHW":
+        strategy.add_implementation(
+            wrap_compute_conv2d_transpose(topi.cuda.conv2d_transpose_nchw, 
has_groups=True),
+            wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw),
+            name="conv2d_transpose_nchw.cuda",
+        )
+    else:
+        raise RuntimeError(
+            "Layout not supported: ("
+            + data_layout
+            + ", "
+            + kernel_layout
+            + ") - only support NCHW, NCHW4c / IOHW4o layouts for 
conv2d_transpose"
+        )
+    return strategy
+
+
 @schedule_pool.register("adreno")
 def schedule_pool_adreno(attrs, outs, target):
     """schedule pooling ops for adreno"""
diff --git a/python/tvm/topi/adreno/__init__.py 
b/python/tvm/topi/adreno/__init__.py
index 55bfbee2a8..cd42848b29 100644
--- a/python/tvm/topi/adreno/__init__.py
+++ b/python/tvm/topi/adreno/__init__.py
@@ -23,7 +23,9 @@ from .conv2d_nhwc import *
 from .depthwise_conv2d_nhwc import *
 from .pooling import *
 from .conv2d_alter_op import *
+from .conv2d_transpose_alter_op import *
 from .conv2d_nchw_winograd import *
 from .conv2d_nhwc_winograd import *
 from .injective import schedule_injective
 from .reduction import *
+from .conv2d_transpose_nchw import *
diff --git a/python/tvm/topi/adreno/conv2d_transpose_alter_op.py 
b/python/tvm/topi/adreno/conv2d_transpose_alter_op.py
new file mode 100644
index 0000000000..c68e5cb7a5
--- /dev/null
+++ b/python/tvm/topi/adreno/conv2d_transpose_alter_op.py
@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
+"""Conv2D Transpose alter op for Qualcomm Adreno GPU"""
+
+import logging
+
+import re
+import tvm
+from tvm import te
+from tvm import relay
+from tvm import autotvm
+from ..utils import get_const_tuple
+from ..nn import conv2d_transpose_alter_layout
+
+logger = logging.getLogger("topi")
+
+# Number of wildcards for matching of supported layouts to be transformed
+_NCHWc_matcher = re.compile("^NCHW[0-9]+c$")
+_IOHWo_matcher = re.compile("^IOHW[0-9]+o$")
+
+
+@conv2d_transpose_alter_layout.register("adreno")
+def _alter_conv2d_transpose_layout(attrs, inputs, tinfos, out_type):
+    """
+    Prepare of the new conv2d_transpose with proper target blocked layout 
attributes
+    OpenCL Textures supports 1d/2d/3d/4d tetures but read happens always only 
for 4 elements
+    in a line. Thus way we are supporting for now only 4d conversions on the 
end
+    NCHW -> NCHW4c & IOHW ->IOHW4o
+    """
+    target = tvm.target.Target.current(allow_none=False)
+    dispatch_ctx = autotvm.task.DispatchContext.current
+    new_attrs = {k: attrs[k] for k in attrs.keys()}
+
+    # Parse the attributes.
+    padding = attrs.get_int_tuple("padding")
+    strides = attrs.get_int_tuple("strides")
+    dilation = attrs.get_int_tuple("dilation")
+    data_layout = attrs["data_layout"]
+    kernel_layout = attrs["kernel_layout"]
+    data_tensor, kernel_tensor = tinfos
+    data_dtype = data_tensor.dtype
+    out_dtype = out_type.dtype
+
+    if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest):
+        cfg = dispatch_ctx.query(target, None)
+        workload = cfg.workload
+    else:
+        impl, outs = relay.backend.te_compiler.select_implementation(
+            relay.op.get("nn.conv2d_transpose"), attrs, tinfos, out_type, 
target
+        )
+        workload = autotvm.task.get_workload(outs)
+        cfg = dispatch_ctx.query(target, workload)
+
+    topi_tmpl = workload[0]
+
+    if "conv2d_transpose_nchwc" in topi_tmpl:  # covers conv2d_transpose_nchwc
+        if data_layout == "NCHW" and kernel_layout == "IOHW":
+            batch, in_channels, in_height, in_width = data_tensor.shape
+            _, out_channles, kernel_h, kernel_w = kernel_tensor.shape
+            in_channel_block = in_channels % 4
+            if in_channel_block == 0:
+                in_channel_block = 4
+            num_filter_block = out_channles % 4
+            if num_filter_block == 0:
+                num_filter_block = 4
+
+            # no support yet for tensors that cannot be divisible by factor 4
+            if num_filter_block != 4:
+                return None
+
+            batch_size, in_channel, height, width = 
get_const_tuple(data_tensor.shape)
+            in_filter_channel, out_channel, kh, kw = 
get_const_tuple(kernel_tensor.shape)
+
+            # update new attrs
+            new_attrs["channels"] = out_channel
+            if in_channel_block == 4:
+                new_attrs["data_layout"] = f"NCHW{in_channel_block}c"
+            else:
+                new_attrs["data_layout"] = "NCHW"
+            # (oc, ic, h, w) -> (ic, OC, h, w, oc)
+            new_attrs["kernel_layout"] = f"IOHW{num_filter_block}o"
+            new_attrs["out_layout"] = f"NCHW{num_filter_block}c"
+
+            # Store altered operator's config for applying of tuned AutoTVM 
statistics
+            if in_channel_block == 4:
+                new_data = te.placeholder(
+                    (batch_size, in_channel // in_channel_block, height, 
width, in_channel_block),
+                    dtype=data_dtype,
+                )
+            else:
+                new_data = data_tensor
+            new_kernel = te.placeholder(
+                (in_filter_channel, out_channel // num_filter_block, kh, kw, 
num_filter_block),
+                dtype=kernel_tensor.dtype,
+            )
+            new_workload = autotvm.task.args_to_workload(
+                [new_data, new_kernel, strides, padding, dilation, out_dtype],
+                topi_tmpl,  # "conv2d_transpose_nchwc.image2d",
+            )
+            dispatch_ctx.update(target, new_workload, cfg)
+        else:
+            assert _NCHWc_matcher.match(data_layout)
+            assert _IOHWo_matcher.match(kernel_layout)
+        return relay.nn.conv2d_transpose(*inputs, **new_attrs)
+
+    return None
diff --git a/python/tvm/topi/adreno/conv2d_transpose_nchw.py 
b/python/tvm/topi/adreno/conv2d_transpose_nchw.py
new file mode 100644
index 0000000000..ad8c7b88ef
--- /dev/null
+++ b/python/tvm/topi/adreno/conv2d_transpose_nchw.py
@@ -0,0 +1,412 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return
+"""conv2d_transpose nchw schedule on Qualcomm Adreno GPU"""
+import tvm
+from tvm import te
+from tvm import autotvm
+from .. import nn
+
+
+from ..utils import get_const_tuple, traverse_inline
+from .utils import (
+    split_to_chunks,
+    pack_input,
+    pack_filter,
+    bind_data_copy,
+    get_default_conv2d_config,
+    get_texture_storage,
+)
+
+
[email protected]_topi_compute("conv2d_transpose_nchwc.image2d")
+def conv2d_transpose_nchwc(
+    cfg, Input, Filter, stride, padding, out_dtype, output_padding, groups=1
+):
+    """
+    Transposed Convolution operator in NCHWc layout.
+    Algo:
+      1. Convert into blocked format if we have 4d original tensor.
+         In case of AutoTVM we override the convert by just tensors since such 
conversion
+         will be absent for real blocked convolution, no sense to include into 
tuning
+      2. Expand spatial dimensions to have width and height be dividable by 
factor 4
+         This leads to slightly bigger amount of compute but allow utilize GPU 
much better
+      3. Add paddings. This happens even if we do not need pad originaly. This 
is useful
+         due to work arounding of the gaps of texture annotation between 
Primary Functions
+         and limited support of textures in schedules. Later on this pad will 
be executed
+         separately and will produce texture
+      4. 5d Convolution compute with accumulating into out_dtype
+      5. Cast to the origin output data type
+      6. For case of 4d convolution: convert of output from 5d to 4d
+    """
+
+    if out_dtype is None:
+        out_dtype = Input.dtype
+    assert isinstance(stride, int) or len(stride) == 2
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
+
+    outpad_height, outpad_width = output_padding
+    assert outpad_height < stride_h and outpad_width < stride_w
+
+    convert_from4d = False
+    if len(Input.shape) == 4:
+        batch, in_channels, in_height, in_width = Input.shape
+        in_channel_chunks, in_channel_block, in_channel_tail = 
split_to_chunks(in_channels, 4)
+
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            dshape = (batch, in_channel_chunks, in_height, in_width, 
in_channel_block)
+            Input = tvm.te.placeholder(dshape, Input.dtype, 
name="data_placeholder")
+        else:
+            Input = pack_input(
+                Input,
+                "NCHW",
+                batch,
+                in_channel_chunks,
+                in_channel_block,
+                in_channel_tail,
+                in_height,
+                in_width,
+            )
+    else:
+        batch, in_channel_chunks, in_height, in_width, in_channel_block = 
Input.shape
+
+    if len(Filter.shape) == 4:
+        in_filter_channels, out_channels, kernel_h, kernel_w = Filter.shape
+        out_channel_chunks, out_channel_block, out_channel_tail = 
split_to_chunks(out_channels, 4)
+
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            kshape = (in_filter_channels, out_channel_chunks, kernel_h, 
kernel_w, out_channel_block)
+            Filter = tvm.te.placeholder(kshape, Filter.dtype, 
name="kernel_placeholder")
+        else:
+            convert_from4d = True
+            Filter = pack_filter(
+                Filter,
+                "IOHW",
+                out_channel_chunks,
+                out_channel_block,
+                out_channel_tail,
+                in_filter_channels,
+                in_channel_chunks,
+                in_channel_block,
+                in_channel_tail,
+                kernel_h,
+                kernel_w,
+            )
+    else:
+        in_filter_channels, out_channel_chunks, kernel_h, kernel_w, 
out_channel_block = Filter.shape
+
+    cfg.stride = stride
+
+    pad_top, pad_left, pad_bottom, pad_right = nn.get_pad_tuple(padding, 
(kernel_h, kernel_w))
+
+    out_width_orig = out_width = (
+        (in_width - 1) * stride_w + kernel_w - pad_left - pad_right + 
outpad_width
+    )
+    pad_left = kernel_w - 1 - pad_left
+    pad_right = kernel_w - 1 - pad_right + outpad_width
+    dilated_width = stride_w * (in_width - 1) + 1
+
+    out_height_orig = out_height = (
+        (in_height - 1) * stride_h + kernel_h - pad_top - pad_bottom + 
outpad_height
+    )
+    pad_top = kernel_h - 1 - pad_top
+    pad_bottom = kernel_h - 1 - pad_bottom + outpad_height
+    dilated_height = stride_h * (in_height - 1) + 1
+
+    if out_height % 2 != 0:
+        out_height += 1
+    if out_width % 2 != 0:
+        out_width += 1
+
+    if out_height % 4 != 0:
+        out_height += 2
+    if out_width % 4 != 0:
+        out_width += 2
+
+    # compute pad
+    temp = te.compute(
+        (
+            batch,
+            in_channel_chunks,
+            pad_top + dilated_height + pad_bottom,
+            pad_left + dilated_width + pad_right,
+            in_channel_block,
+        ),
+        lambda n, c, y, x, cb: tvm.tir.if_then_else(
+            tvm.tir.all(
+                x >= pad_left,
+                x < pad_left + dilated_width,
+                tvm.tir.indexmod(x - pad_left, stride_w).equal(0),
+                y >= pad_top,
+                y < pad_top + dilated_height,
+                tvm.tir.indexmod(y - pad_top, stride_h).equal(0),
+            ),
+            Input[
+                n,
+                c,
+                tvm.tir.indexdiv(y - pad_top, stride_h),
+                tvm.tir.indexdiv(x - pad_left, stride_w),
+                cb,
+            ],
+            tvm.tir.const(0.0, Input.dtype),
+        ),
+        name="pad_temp",
+    )
+
+    # compute transposed conv
+    dcc = te.reduce_axis((0, in_channel_chunks), name="dcc")
+    dcb = te.reduce_axis((0, in_channel_block), name="dcb")
+    dh = te.reduce_axis((0, kernel_h), name="dh")
+    dw = te.reduce_axis((0, kernel_w), name="dw")
+    conv = te.compute(
+        (batch, out_channel_chunks, out_height, out_width, out_channel_block),
+        lambda b, c, h, w, cb: te.sum(
+            temp[
+                b, c // out_channel_chunks * (in_channel_chunks) + dcc, h + 
dh, w + dw, dcb
+            ].astype(out_dtype)
+            * Filter[
+                dcc * in_channel_block + dcb,
+                c % out_channel_chunks,
+                kernel_h - 1 - dh,
+                kernel_w - 1 - dw,
+                cb,
+            ].astype(out_dtype),
+            axis=[dcc, dcb, dh, dw],
+        ),
+        tag="conv2d_transpose_nchwc",
+    )
+
+    if convert_from4d and not autotvm.GLOBAL_SCOPE.in_tuning:
+        dummy_cast = te.compute(
+            (batch, out_channel_chunks, out_height_orig, out_width_orig, 
out_channel_block),
+            lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype),
+            tag="dummy_cast",
+        )
+        return te.compute(
+            (batch, out_channels, out_height_orig, out_width_orig),
+            lambda n, c, y, x: dummy_cast[n, c // out_channel_block, y, x, c % 
out_channel_block],
+            tag="adreno_conv2d_transpose_latest_op",
+        )
+    else:
+        return te.compute(
+            (batch, out_channel_chunks, out_height_orig, out_width_orig, 
out_channel_block),
+            lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, 
ffb].astype(out_dtype),
+            tag="adreno_conv2d_transpose_latest_op",
+        )
+
+
[email protected]_topi_schedule("conv2d_transpose_nchwc.image2d")
+def schedule_conv2d_transpose_nchwc(cfg, outs):
+    """Create the schedule for conv2d_nchw"""
+    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
+    s = te.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if op.tag == "adreno_conv2d_transpose_latest_op":
+            schedule_conv2d_transpose_NCHWc(cfg, s, op.output(0))
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
+
+
+def schedule_conv2d_transpose_NCHWc(cfg, s, output):
+    """
+    schedule optimized for batch size = 1
+
+    Algo:
+    1. Split output axis to three parts: global work size, vthread, local 
worksize.
+       The limitations for tuning includes heuristics from some tuned networks 
to limit
+       search space and not pay much time for useles configurations.
+    2. In case of 4d convolution schedule copying of the input (and filter) 
into
+      5d tensors
+    4. pad should be scheduled separately to create independent opencl kernel. 
If pad is
+       inlined into convolution, this gives 1.5x performance drop
+    5. We are using cache_read for intermediate tensors to produce texture and 
guarantee
+       the best performance on the next stage.
+       The weights are managed through static texture planning mechanism and 
guarantied come
+       in texture memory scope.
+       Thus way we are calling cache_read only for data tensor
+    6. For 5d convolution we schedule the latest op with binding 5d axis and 
vectorize
+       for textures
+       For 4d tensor we are doing the same for the latest blocked stage, i.e. 
conversion
+       of data type
+    7. In case of 4d conv we need to schedule postops as well
+    """
+    latest = s.outputs[0].output(0)
+    if len(latest.op.axis) == 4:
+        latest_blocked = dummy = output.op.input_tensors[0]
+        conv = dummy.op.input_tensors[0]
+    else:
+        conv = output.op.input_tensors[0]
+        latest_blocked = latest
+
+    pad_data, kernel = s[conv].op.input_tensors
+    filter_pack_rt = bool(
+        isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in 
kernel.op.tag
+    )
+
+    if "pad_temp" in pad_data.op.name:
+        input_pad_temp = pad_data.op.input_tensors[0]
+    else:
+        input_pad_temp = pad_data
+
+    input_pack_rt = bool(
+        isinstance(input_pad_temp.op, tvm.te.ComputeOp) and "input_pack" in 
input_pad_temp.op.tag
+    )
+
+    ##### space definition begin #####
+    n, fc, y, x, fb = s[conv].op.axis
+    rcc, rcb, ry, rx = s[conv].op.reduce_axis
+
+    if conv.shape[1] % 2 == 0:
+        min_threads_div = 2
+    else:
+        min_threads_div = 1
+    cfg.define_split(
+        "tile_fc",
+        fc,
+        num_outputs=3,
+        filter=lambda entity: entity.size[1] <= 8
+        and entity.size[2] >= min_threads_div
+        and entity.size[2] < 256,
+    )
+    cfg.define_split(
+        "tile_y",
+        y,
+        num_outputs=3,
+        filter=lambda entity: entity.size[1] <= 8 and entity.size[2] <= 16,
+    )
+    cfg.define_split(
+        "tile_x",
+        x,
+        num_outputs=3,
+        filter=lambda entity: entity.size[1] <= 8 and entity.size[2] <= 16,
+    )
+
+    cfg.define_split("tile_rcc", rcc, num_outputs=2)
+    cfg.define_split("tile_ry", ry, num_outputs=2)
+    cfg.define_split("tile_rx", rx, num_outputs=2)
+    cfg.define_knob("auto_unroll_max_step", [0, 64])
+    cfg.define_knob("unroll_explicit", [0, 1])
+    cfg.multi_filter(
+        filter=lambda entity: (  # pylint: disable=chained-comparison
+            entity["tile_fc"].size[1] * entity["tile_y"].size[1] * 
entity["tile_x"].size[1]
+        )
+        <= 24
+        and 32
+        <= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * 
entity["tile_x"].size[2])
+        < 1024
+    )
+    if cfg.is_fallback:
+        get_default_conv2d_config(cfg, conv.shape[1], conv.shape[2], 
conv.shape[3])
+    ##### space definition end #####
+
+    pad_data, kernel = s[conv].op.input_tensors
+    # There are several conditions that have to be handled:
+    # 1. If we are in the tuning, we always add cache read for data to main 
conv kernel
+    #    to get texture in tuning opencl kernel
+    # 2. If we are repacking input in runtime, we should always explicit 
schedule this one more
+    #    stage of data copy from 4d to 5d (referred as pack_data).
+    # 3. If we have pad (independently if we have runtime repack or not) we 
should inline it in the
+    #    cache_read("texture")
+    if autotvm.GLOBAL_SCOPE.in_tuning or input_pack_rt:
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            if "pad_temp" in pad_data.op.name:
+                s[pad_data].compute_inline()
+        else:
+            if "pad_temp" in pad_data.op.name:
+                pack_data = pad_data.op.input_tensors[0]
+                bind_data_copy(s[pack_data])
+                s[pad_data].compute_inline()
+            else:
+                pack_data = pad_data
+                bind_data_copy(s[pack_data])
+
+        AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), 
[conv])
+        bind_data_copy(s[AT])
+    elif "pad_temp" in pad_data.op.name:
+        s[pad_data].compute_inline()
+        # create cache stage
+        AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), 
[conv])
+        bind_data_copy(s[AT])
+
+    if autotvm.GLOBAL_SCOPE.in_tuning or filter_pack_rt:
+        if not autotvm.GLOBAL_SCOPE.in_tuning:
+            bind_data_copy(s[kernel])
+        if kernel.shape[2] == 1 and kernel.shape[3] == 1:
+            WT = s.cache_read(kernel, get_texture_storage(kernel.shape), 
[conv])
+            bind_data_copy(s[WT])
+
+    s[conv].set_scope("local")
+    if latest_blocked == latest and output != latest:
+        s[output].compute_inline()
+
+    # tile and bind spatial axes
+    n, fc, y, x, fb = s[latest_blocked].op.axis
+
+    kernel_scope, n = s[latest_blocked].split(n, nparts=1)
+
+    bf, vf, tf = cfg["tile_fc"].apply(s, latest_blocked, fc)
+    by, vy, ty = cfg["tile_y"].apply(s, latest_blocked, y)
+    bx, vx, tx = cfg["tile_x"].apply(s, latest_blocked, x)
+
+    bf = s[latest_blocked].fuse(n, bf)
+    s[latest_blocked].bind(bf, te.thread_axis("blockIdx.z"))
+    s[latest_blocked].bind(by, te.thread_axis("blockIdx.y"))
+    s[latest_blocked].bind(bx, te.thread_axis("blockIdx.x"))
+    s[latest_blocked].bind(vf, te.thread_axis("vthread"))
+    s[latest_blocked].bind(vy, te.thread_axis("vthread"))
+    s[latest_blocked].bind(vx, te.thread_axis("vthread"))
+    s[latest_blocked].bind(tf, te.thread_axis("threadIdx.z"))
+    s[latest_blocked].bind(ty, te.thread_axis("threadIdx.y"))
+    s[latest_blocked].bind(tx, te.thread_axis("threadIdx.x"))
+    s[latest_blocked].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fb)
+    s[latest_blocked].vectorize(fb)
+
+    s[conv].compute_at(s[latest_blocked], tx)
+
+    # tile reduction axes
+    n, fc, y, x, fb = s[conv].op.axis
+
+    rcc, rcb, ry, rx = s[conv].op.reduce_axis
+    rco, rci = cfg["tile_rcc"].apply(s, conv, rcc)
+    ryo, ryi = cfg["tile_ry"].apply(s, conv, ry)
+    rxo, rxi = cfg["tile_rx"].apply(s, conv, rx)
+
+    s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb)
+    s[conv].vectorize(fb)
+    s[conv].unroll(rcb)
+
+    # unroll
+    s[latest_blocked].pragma(kernel_scope, "auto_unroll_max_step", 
cfg["auto_unroll_max_step"].val)
+    s[latest_blocked].pragma(kernel_scope, "unroll_explicit", 
cfg["unroll_explicit"].val)
+
+    if latest_blocked != latest:
+        s[latest].compute_root()
+        bind_data_copy(s[latest], 1)
+        if latest != output:
+            s[output].compute_inline()
+
+    N, OCC, OH, OW, OCB = get_const_tuple(latest_blocked.shape)
+    _, IC, KH, KW, _ = get_const_tuple(kernel.shape)
+    ICKHKW = IC * KH * KW
+
+    if isinstance(N, int):
+        cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW)
diff --git a/python/tvm/topi/adreno/utils.py b/python/tvm/topi/adreno/utils.py
index 698a306514..a42cbeeb77 100644
--- a/python/tvm/topi/adreno/utils.py
+++ b/python/tvm/topi/adreno/utils.py
@@ -281,6 +281,22 @@ def pack_filter(
             Filter[indices[0], indices[1], indices[2], indices[3] * out_block 
+ indices[4]],
         )
 
+    def _reorder_weights_iohw(*indices):
+        conditionA = []
+        conditionA.append(indices[1] == out_chunks - 1)
+        conditionA.append(indices[4] >= out_original_tail)
+        conditionAT = tvm.tir.all(*conditionA)
+
+        conditionO = []
+        conditionO.append(conditionAT)
+        conditionO.append(indices[0] >= in_chunks * in_block + 
in_original_tail)
+        conditionOT = tvm.tir.any(*conditionO)
+        return tvm.tir.if_then_else(
+            conditionOT,
+            pad_value,
+            Filter[indices[0], indices[1] * out_block + indices[4], 
indices[2], indices[3]],
+        )
+
     if in_filter_channels == 1:
         if layout == "OIHW":
             reordered_filter = te.compute(
@@ -313,6 +329,13 @@ def pack_filter(
                 name="filter_pack",
                 tag="filter_pack",
             )
+        elif layout == "IOHW":
+            reordered_filter = te.compute(
+                [in_filter_channels, out_chunks, kernel_h, kernel_w, 
out_block],
+                _reorder_weights_iohw,
+                name="filter_pack",
+                tag="filter_pack",
+            )
         elif layout == "HWIO":
             reordered_filter = te.compute(
                 [kernel_h, kernel_w, in_filter_channels, out_chunks, 
out_block],
diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index 8fdcb0dc1a..75f72ee93d 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -143,6 +143,29 @@ def conv2d_alter_layout(attrs, inputs, tinfos, out_type):
     return None
 
 
[email protected]_func
+def conv2d_transpose_alter_layout(attrs, inputs, tinfos, out_type):
+    """Change Conv2D_Transpose layout.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : tvm.relay.Expr
+        Grouped input symbols
+    tinfos : list
+        Input shape and dtype
+    out_type: type
+        The output type
+
+    Note
+    ----
+    Unlike other TOPI functions, this function operates on both graph level 
and operator level.
+    """
+    # not to change by default
+    return None
+
+
 @tvm.target.generic_func
 def conv2d_infer_layout(workload, cfg):
     """Infer input/output shapes and layouts from a workload and cfg.
diff --git a/src/relay/transforms/annotate_texture_storage.cc 
b/src/relay/transforms/annotate_texture_storage.cc
index 01d47b6953..9ccb2171d8 100644
--- a/src/relay/transforms/annotate_texture_storage.cc
+++ b/src/relay/transforms/annotate_texture_storage.cc
@@ -392,6 +392,10 @@ class StorageInfo : private 
transform::DeviceAwareExprVisitor {
           (attrs->kernel_layout == "OIHW4o" || attrs->kernel_layout == 
"HWIO4o")) {
         supports_texture_storage = true;
       }
+    } else if (auto attrs = call->attrs.as<Conv2DTransposeAttrs>()) {
+      if (attrs->data_layout == "NCHW4c" && attrs->kernel_layout == "IOHW4o") {
+        supports_texture_storage = true;
+      }
     } else if (auto attrs = call->attrs.as<GlobalPool2DAttrs>()) {
       if (attrs->layout == "NCHW4c") {
         supports_texture_storage = true;
diff --git 
a/tests/python/relay/opencl_texture/test_conv2d_transpose_nchw_texture.py 
b/tests/python/relay/opencl_texture/test_conv2d_transpose_nchw_texture.py
new file mode 100644
index 0000000000..d110c8329f
--- /dev/null
+++ b/tests/python/relay/opencl_texture/test_conv2d_transpose_nchw_texture.py
@@ -0,0 +1,325 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+import tvm
+import numpy as np
+from tvm import relay
+from tvm.relay import testing
+from tvm.contrib import utils
+from utils.adreno_utils import gpu_preprocess, build_run_compare, 
build_run_compare_vm
+import pytest
+
+
+executor_type = tvm.testing.parameter("ge", "vm")
+dtype = tvm.testing.parameter("float32")
+
+
[email protected]_opencl
[email protected]_targets("opencl -device=adreno")
+def test_conv2d_transpose_adreno(remote, target, executor_type, dtype):
+    # Conv2d transpose test case list
+    trials = [
+        [4, 4, (1, 1), (2, 2), (1, 1), 64, (256, 100, 100), (False, False), 
gpu_preprocess],
+        [4, 4, (0, 0), (2, 2), (1, 1), 256, (32, 64, 64), (False, False), 
None],
+        [3, 3, (0, 0), (2, 2), (1, 1), 64, (256, 100, 100), (True, True), 
None],
+        [4, 4, (1, 1), (1, 1), (1, 1), 512, (16, 100, 100), (False, False), 
gpu_preprocess],
+        [5, 5, (2, 2), (2, 2), (1, 1), 4, (16, 100, 100), (True, False), 
gpu_preprocess],
+        [7, 7, (3, 3), (2, 2), (1, 1), 8, (4, 100, 100), (False, True), None],
+        [7, 7, (3, 3), (2, 2), (1, 1), 64, (3, 100, 100), (True, True), None],
+        [3, 3, (1, 1), (1, 1), (1, 1), 3, (16, 8, 8), (True, True), None],
+    ]
+    # Tensor memory scopes with graph executor build
+    ge_texture_scopes = [
+        ["", "global.texture", "global.texture-weight", "", ""],
+        ["", "global.texture", "global.texture-weight", "", ""],
+        ["", "global.texture", "global.texture-weight", 
"global.texture-weight", "", ""],
+        ["", "global.texture", "global.texture-weight", "", ""],
+        ["", "global.texture", "global.texture-weight", 
"global.texture-weight", "", ""],
+        ["", "global.texture", "global.texture-nhwc", "", ""],
+        [],
+        [],
+    ]
+    # Tensor memory scopes with vm executor build
+    vm_texture_scopes = [
+        """
+        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
+        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
+        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
+        VM VirtualDevice[3]: device type 4, id 0 and mem_scope 
global.texture-weight
+        """,
+        """
+        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
+        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
+        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
+        VM VirtualDevice[3]: device type 4, id 0 and mem_scope 
global.texture-weight
+        """,
+        """
+        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
+        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
+        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
+        VM VirtualDevice[3]: device type 4, id 0 and mem_scope 
global.texture-weight
+        VM VirtualDevice[4]: device type 4, id 0 and mem_scope 
global.texture-weight
+        """,
+        """
+        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
+        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
+        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
+        VM VirtualDevice[3]: device type 4, id 0 and mem_scope 
global.texture-weight
+        """,
+        """
+        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
+        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
+        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
+        VM VirtualDevice[3]: device type 4, id 0 and mem_scope 
global.texture-weight
+        VM VirtualDevice[4]: device type 4, id 0 and mem_scope 
global.texture-weight
+        """,
+        """
+        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
+        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
+        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
+        VM VirtualDevice[3]: device type 4, id 0 and mem_scope 
global.texture-nhwc
+        """,
+        [],
+        [],
+    ]
+
+    for i, (
+        kernel_h,
+        kernel_w,
+        pad,
+        stride,
+        dilation,
+        out_channels,
+        shape,
+        composite,
+        _gpu_preprocess,
+    ) in enumerate(trials):
+        shape = (1, *shape)
+        has_bias = composite[0]
+        has_activation = composite[1]
+        input_shape = shape
+        filter_shape = (shape[1], out_channels, kernel_w, kernel_h)
+        x = relay.var("data", shape=input_shape, dtype=dtype)
+        w = relay.var("weight", shape=filter_shape, dtype=dtype)
+        inputs = [x, w]
+        y = relay.nn.conv2d_transpose(
+            x,
+            w,
+            channels=out_channels,
+            kernel_size=(kernel_w, kernel_h),
+            strides=stride,
+            padding=pad,
+            kernel_layout="IOHW",
+            data_layout="NCHW",
+            dilation=dilation,
+        )
+
+        np.random.seed(0)
+        initializer = relay.testing.init.Xavier()
+        filter_data = np.zeros(filter_shape).astype(dtype)
+        initializer("weight", filter_data)
+        params1 = {
+            "weight": tvm.nd.array(filter_data),
+        }
+
+        if has_bias:
+            b = relay.var("bias", shape=(out_channels,), dtype=dtype)
+            y = relay.nn.bias_add(y, b, axis=1)
+            inputs.append(b)
+            bias_data = np.zeros((out_channels,)).astype(dtype)
+            initializer("bias", bias_data)
+            params1["bias"] = tvm.nd.array(bias_data)
+        if has_activation:
+            y = relay.nn.relu(y)
+
+        mod = relay.Function(inputs, y)
+        if executor_type == "ge":
+            build_run_compare(
+                remote,
+                mod,
+                params1,
+                {"data": input_shape},
+                {"data": dtype},
+                target,
+                ge_texture_scopes[i],
+                _gpu_preprocess,
+            )
+        else:
+            build_run_compare_vm(
+                remote,
+                mod,
+                params1,
+                {"data": input_shape},
+                {"data": dtype},
+                target,
+                vm_texture_scopes[i],
+                _gpu_preprocess,
+            )
+
+
[email protected]_opencl
[email protected]_targets("opencl -device=adreno")
+def test_conv2d_transpose_three_layer_block(remote, target, executor_type, 
dtype):
+    # Conv2d transpose test case list
+    trials = [
+        [4, 4, (1, 1), (2, 2), (1, 1), 64, (256, 100, 100), (False, False), 
None],
+        [3, 3, (0, 0), (1, 1), (1, 1), 64, (256, 12, 12), (True, True), 
gpu_preprocess],
+    ]
+    ge_texture_scopes = [
+        [
+            "",
+            "global.texture",
+            "global.texture-weight",
+            "global.texture",
+            "global.texture-weight",
+            "global.texture",
+            "global.texture-weight",
+            "",
+            "",
+        ],
+        [
+            "",
+            "global.texture-nhwc",
+            "global.texture-weight",
+            "global.texture-nhwc",
+            "global.texture-weight",
+            "global.texture-weight",
+            "global.texture-nhwc",
+            "global.texture-weight",
+            "",
+            "",
+        ],
+    ]
+    vm_texture_scopes = [
+        """
+        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
+        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
+        VM VirtualDevice[2]: device type 4, id 0 and mem_scope global.texture
+        VM VirtualDevice[3]: device type 4, id 0 and mem_scope global.texture
+        VM VirtualDevice[4]: device type 4, id 0 and mem_scope 
global.texture-weight
+        VM VirtualDevice[5]: device type 4, id 0 and mem_scope global.texture
+        VM VirtualDevice[6]: device type 4, id 0 and mem_scope 
global.texture-weight
+        VM VirtualDevice[7]: device type 4, id 0 and mem_scope 
global.texture-weight
+        """,
+        """
+        VM VirtualDevice[0]: device type 1, id 0 and mem_scope
+        VM VirtualDevice[1]: device type 4, id 0 and mem_scope
+        VM VirtualDevice[2]: device type 4, id 0 and mem_scope 
global.texture-nhwc
+        VM VirtualDevice[3]: device type 4, id 0 and mem_scope 
global.texture-nhwc
+        VM VirtualDevice[4]: device type 4, id 0 and mem_scope 
global.texture-weight
+        VM VirtualDevice[5]: device type 4, id 0 and mem_scope 
global.texture-nhwc
+        VM VirtualDevice[6]: device type 4, id 0 and mem_scope 
global.texture-weight
+        VM VirtualDevice[7]: device type 4, id 0 and mem_scope 
global.texture-weight
+        VM VirtualDevice[8]: device type 4, id 0 and mem_scope 
global.texture-weight
+        """,
+    ]
+
+    for i, (
+        kernel_h,
+        kernel_w,
+        pad,
+        stride,
+        dilation,
+        out_channels,
+        shape,
+        composite,
+        _gpu_preprocess,
+    ) in enumerate(trials):
+        shape = (1, *shape)
+        has_bias = composite[0]
+        has_activation = composite[1]
+        input_shape = shape
+        filter_shape = (shape[1], out_channels, kernel_w, kernel_h)
+        x = relay.var("data", shape=input_shape, dtype=dtype)
+        w = relay.var("weight", shape=filter_shape, dtype=dtype)
+        inputs = [x, w]
+        W1 = relay.var("weight1", shape=(shape[1], shape[1], 1, 1), 
dtype=dtype)
+        conv = relay.nn.conv2d(x, W1, padding=[0, 0, 0, 0], channels=shape[1], 
kernel_size=(1, 1))
+        inputs.append(W1)
+        conv = relay.op.nn.relu(conv)
+        y = relay.nn.conv2d_transpose(
+            conv,
+            w,
+            channels=out_channels,
+            kernel_size=(kernel_w, kernel_h),
+            strides=stride,
+            padding=pad,
+            kernel_layout="IOHW",
+            data_layout="NCHW",
+            dilation=dilation,
+        )
+
+        if has_bias:
+            b = relay.var("bias", shape=(out_channels,), dtype=dtype)
+            y = relay.nn.bias_add(y, b, axis=1)
+            inputs.append(b)
+
+        if has_activation:
+            y = relay.nn.relu(y)
+        W2 = relay.var("weight2", shape=(out_channels, out_channels, 1, 1), 
dtype=dtype)
+        out = relay.nn.conv2d(
+            y, W2, padding=[0, 0, 0, 0], channels=out_channels, 
kernel_size=(1, 1)
+        )
+        out = relay.op.nn.relu(out)
+        np.random.seed(0)
+        inputs.append(W2)
+        initializer = relay.testing.init.Xavier()
+        filter_data = np.zeros(filter_shape).astype(dtype)
+        initializer("weight", filter_data)
+        filter_data1 = np.zeros((shape[1], shape[1], 1, 1)).astype(dtype)
+        initializer("weight", filter_data1)
+        filter_data2 = np.zeros((out_channels, out_channels, 1, 
1)).astype(dtype)
+        initializer("weight", filter_data2)
+        params1 = {
+            "weight": tvm.nd.array(filter_data),
+            "weight1": tvm.nd.array(filter_data1),
+            "weight2": tvm.nd.array(filter_data2),
+        }
+        if has_bias:
+            bias_data = np.zeros((out_channels,)).astype(dtype)
+            initializer("bias", bias_data)
+            params1["bias"] = tvm.nd.array(bias_data)
+
+        mod = relay.Function(inputs, out)
+
+        if executor_type == "ge":
+            build_run_compare(
+                remote,
+                mod,
+                params1,
+                {"data": input_shape},
+                {"data": dtype},
+                target,
+                ge_texture_scopes[i],
+                _gpu_preprocess,
+            )
+        else:
+            build_run_compare_vm(
+                remote,
+                mod,
+                params1,
+                {"data": input_shape},
+                {"data": dtype},
+                target,
+                vm_texture_scopes[i],
+                _gpu_preprocess,
+            )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/utils/adreno_utils.py 
b/tests/python/relay/opencl_texture/utils/adreno_utils.py
index d9e52f8847..21bdfbdee3 100644
--- a/tests/python/relay/opencl_texture/utils/adreno_utils.py
+++ b/tests/python/relay/opencl_texture/utils/adreno_utils.py
@@ -200,7 +200,10 @@ def build_run_compare_vm(
 
 def gpu_preprocess(tvm_mod):
     layout_config = relay.transform.LayoutConfig()
-    desired_layouts = {"nn.conv2d": ["NCHW4c", "OIHW4o"]}
+    desired_layouts = {
+        "nn.conv2d": ["NCHW4c", "OIHW4o"],
+        "nn.conv2d_transpose": ["NCHW4c", "IOHW4o"],
+    }
     with layout_config:
         seq = 
tvm.transform.Sequential([relay.transform.ConvertLayout(desired_layouts)])
         with tvm.transform.PassContext(opt_level=3):


Reply via email to