This is an automated email from the ASF dual-hosted git repository. tqchen pushed a commit to branch refactor in repository https://gitbox.apache.org/repos/asf/tvm.git
commit 538d2c01a7869e6d6da58d8a8cfe70377c2bf978 Author: Siyuan Feng <[email protected]> AuthorDate: Sat Feb 15 16:09:15 2025 +0800 cleanup fix Cleanup and fix --- .../tvm/contrib/cuda_graph/cuda_graph_executor.py | 134 --- python/tvm/relax/backend/dispatch_sort_scan.py | 16 +- python/tvm/topi/__init__.py | 1 + .../{contrib/cuda_graph => topi/gpu}/__init__.py | 5 + python/tvm/topi/gpu/scan.py | 728 ++++++++++++++++ python/tvm/topi/gpu/sort.py | 939 +++++++++++++++++++++ rust/tvm-rt/Cargo.toml | 3 - tests/cpp/runtime_test.cc | 163 ---- tests/python/codegen/test_target_codegen_cuda.py | 1 - .../relax/test_backend_dispatch_sort_scan.py | 22 +- tests/python/relax/test_dataflow_pattern.py | 8 +- .../runtime/test_runtime_graph_cuda_graph.py | 100 --- tests/python/te/test_te_create_primfunc.py | 55 +- tests/python/te/test_te_tensor_overload.py | 276 ------ tests/python/testing/test_format_si_prefix.py | 41 - .../test_tir_transform_fp8_legalize.py | 4 - tests/scripts/task_config_build_cpu.sh | 3 +- tests/scripts/task_config_build_gpu.sh | 3 +- tests/scripts/task_rust.sh | 57 -- tests/scripts/task_web_wasm.sh | 9 +- tests/scripts/unity/task_python_relax.sh | 2 +- 21 files changed, 1707 insertions(+), 863 deletions(-) diff --git a/python/tvm/contrib/cuda_graph/cuda_graph_executor.py b/python/tvm/contrib/cuda_graph/cuda_graph_executor.py deleted file mode 100644 index d047316eb5..0000000000 --- a/python/tvm/contrib/cuda_graph/cuda_graph_executor.py +++ /dev/null @@ -1,134 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Graph executor with CUDA Graph""" -import tvm._ffi - -from tvm._ffi.base import string_types -from tvm.contrib import graph_executor - - -def create(graph_json_str, libmod, device): - """Create a runtime executor module given a graph and module. - - Parameters - ---------- - graph_json_str : str - The graph to be deployed in json format output by json graph. - The graph can contain operator(tvm_op) that points to the name - of PackedFunc in the libmod. - - libmod : tvm.runtime.Module - The module of the corresponding function - - device : Device - The device to deploy the module, only supports CUDA GPU - - Returns - ------- - graph_module : GraphModuleCudaGraph - CUDA graph executor module that can be used to execute the graph. - - Note - ---- - See also :py:class:`tvm.contrib.cuda_graph.cuda_graph_executor.GraphModuleCudaGraph` - for examples to directly construct a GraphModuleCudaGraph from an exported - relay compiled library. - """ - assert isinstance(graph_json_str, string_types) - try: - dev, num_rpc_dev, device_type_id = graph_executor.get_device(libmod, device) - if num_rpc_dev == len(dev): - fcreate = dev[0]._rpc_sess.get_function("tvm.graph_executor_cuda_graph.create") - else: - fcreate = tvm._ffi.get_global_func("tvm.graph_executor_cuda_graph.create") - except ValueError: - raise ValueError( - "To enable CUDA graph support (experimental), please set " - "'(USE_GRAPH_EXECUTOR_CUGRAPH ON)' in config.cmake and rebuild TVM" - ) - - return GraphModuleCudaGraph(fcreate(graph_json_str, libmod, *device_type_id)) - - -class GraphModuleCudaGraph(graph_executor.GraphModule): - """CUDA graph executor module. - - This is a CUDA graph executor wrapper over the TVM runtime. - Runtime interfaces are wrapped with CUDA graph functionalities. - - Parameters - ---------- - module : Module - The internal tvm module that holds the actual graph functions. - """ - - def __init__(self, module): - self._start_capture = module["start_capture"] - self._end_capture = module["end_capture"] - self._run_cuda_graph = module["run_cuda_graph"] - self._cuda_graph_captured = False - graph_executor.GraphModule.__init__(self, module) - - def capture_cuda_graph(self): - """Capture a CUDA graph for tvm_op graph - - This should be called before run_cuda_graph() to capture and - instantiate a CUDA graph instance. - """ - self._run() # call cuModuleLoadData before cudaStream API - self._start_capture() - self._run() - self._end_capture() - self._cuda_graph_captured = True - - def run_cuda_graph(self): - """Run the CUDA graph for tvm_op graph - - Run the captured CUDA graph instance instead of the - for-loop kernel launch of default graph executor - """ - self._run_cuda_graph() - - def run(self, **input_dict): - """A run wrapper for graph capture / launch, user can just - change default graph executor to cuda graph executor, and - the first call will capture a cuda graph for future launch - - Parameters - ---------- - input_dict: dict of str to NDArray - List of input values to be feed to - """ - if input_dict: - self.set_input(**input_dict) - if not self._cuda_graph_captured: - self.capture_cuda_graph() - else: - self._run_cuda_graph() - - def debug_get_output(self, node, out): - """Run graph up to node and get the output to out - - Parameters - ---------- - node : int / str - The node index or name - - out : NDArray - The output array container - """ - raise NotImplementedError("Please use debugger.debug_executor as graph_executor instead.") diff --git a/python/tvm/relax/backend/dispatch_sort_scan.py b/python/tvm/relax/backend/dispatch_sort_scan.py index e37869c40c..b5a94619c2 100644 --- a/python/tvm/relax/backend/dispatch_sort_scan.py +++ b/python/tvm/relax/backend/dispatch_sort_scan.py @@ -79,10 +79,10 @@ class SortScanDispatcher(BackendDispatcher): kwargs = {} with tgt: if can_use_thrust(tgt, "tvm.contrib.thrust.sort"): - te_func = topi.cuda.sort_thrust + te_func = topi.gpu.sort_thrust kwargs["workspace"] = self.allocate_workspace(call) elif self.is_gpu_target(tgt): - te_func = topi.cuda.sort + te_func = topi.gpu.sort return self.builder_.call_te( te_func, call.args[0], call.attrs.axis, not call.attrs.descending, **kwargs ) @@ -92,10 +92,10 @@ class SortScanDispatcher(BackendDispatcher): kwargs = {} with tgt: if can_use_thrust(tgt, "tvm.contrib.thrust.sort"): - te_func = topi.cuda.argsort_thrust + te_func = topi.gpu.argsort_thrust kwargs["workspace"] = self.allocate_workspace(call) elif self.is_gpu_target(tgt): - te_func = topi.cuda.argsort + te_func = topi.gpu.argsort return self.builder_.call_te( te_func, call.args[0], @@ -109,10 +109,10 @@ class SortScanDispatcher(BackendDispatcher): te_func = topi.topk kwargs = {} if can_use_thrust(tgt, "tvm.contrib.thrust.sort"): - te_func = topi.cuda.topk_thrust + te_func = topi.gpu.topk_thrust kwargs["workspace"] = self.allocate_workspace(call) elif self.is_gpu_target(tgt): - te_func = topi.cuda.topk + te_func = topi.gpu.topk tir_call = self.builder_.call_te( te_func, call.args[0], @@ -176,11 +176,11 @@ class SortScanDispatcher(BackendDispatcher): with tgt: if call.op.name == "relax.cumsum": - te_func = topi.cuda.cumsum if self.is_gpu_target(tgt) else topi.cumsum + te_func = topi.gpu.cumsum if self.is_gpu_target(tgt) else topi.cumsum if can_use_thrust(tgt, "tvm.contrib.thrust.sum_scan"): kwargs["workspace"] = self.allocate_workspace(call) elif call.op.name == "relax.cumprod": - te_func = topi.cuda.cumprod if self.is_gpu_target(tgt) else topi.cumprod + te_func = topi.gpu.cumprod if self.is_gpu_target(tgt) else topi.cumprod else: raise ValueError(f"Unsupported op: {call.op.name}") tir_call = self.builder_.call_te( diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py index 2bd5964fef..3588c04d8f 100644 --- a/python/tvm/topi/__init__.py +++ b/python/tvm/topi/__init__.py @@ -52,6 +52,7 @@ from . import utils from . import vision from . import image from . import random +from . import gpu # error reporting from .utils import InvalidShapeError diff --git a/python/tvm/contrib/cuda_graph/__init__.py b/python/tvm/topi/gpu/__init__.py similarity index 84% rename from python/tvm/contrib/cuda_graph/__init__.py rename to python/tvm/topi/gpu/__init__.py index 13a83393a9..14f1fa3aab 100644 --- a/python/tvm/contrib/cuda_graph/__init__.py +++ b/python/tvm/topi/gpu/__init__.py @@ -14,3 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. + +# pylint: disable=redefined-builtin, wildcard-import +"""GPU specific declaration.""" +from .scan import cumsum, cumprod +from .sort import * diff --git a/python/tvm/topi/gpu/scan.py b/python/tvm/topi/gpu/scan.py new file mode 100644 index 0000000000..f45702c634 --- /dev/null +++ b/python/tvm/topi/gpu/scan.py @@ -0,0 +1,728 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, too-many-locals, too-many-statements +"Scan related operators" +from typing import Callable, Optional, Union + +import tvm +from tvm import te +from tvm.contrib.thrust import can_use_rocthrust, can_use_thrust + +from ..math import cast, ceil_log2 +from ..transform import expand_dims, reshape, squeeze, transpose +from ..utils import ceil_div, get_const_int, prod, swap + + +def _get_thrust_func_name(tvmop): + tvmop_to_thrust_func_name = {tvm.tir.generic.add: "tvm.contrib.thrust.sum_scan"} + assert tvmop in tvmop_to_thrust_func_name, f"{tvmop} not supported by thrust" + return tvmop_to_thrust_func_name[tvmop] + + +def _can_use_scan_thrust(binop): + """ + Check if scan_thrust can be utilized based on the current target and binary op. + """ + target = tvm.target.Target.current() + if target is None: + return False + return binop == tvm.tir.generic.add and any( + [ + can_use_thrust(target, "tvm.contrib.thrust.sum_scan"), + can_use_rocthrust(target, "tvm.contrib.thrust.sum_scan"), + ] + ) + + +def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add, identity_value=0): + """Low level IR to do exclusive sum scan along rows of 2D input. + + Parameters + ---------- + data : Buffer + Input N-D Buffer. Scan is done over the innermost axis. + + output: Buffer + A buffer to store the output scan, of the same shape as data + + reduction: Buffer, optional + (N-1)-D Buffer, to store the sum of each scan axis. + + binop: function, optional + A binary associative op to use for scan. The function takes two TIR expressions + and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute + prefix sum. + + identity_value: int or float + A value for the binary operation which provides the identity property. E.g. if * is + your operator and i is the identity_value then a * i = a for all a in the domain of + your operation. + """ + + batch_size = cast(prod(data.shape[:-1]), "int32") + scan_axis_size = cast(data.shape[-1], "int32") + + ib = tvm.tir.ir_builder.create() + + data = ib.buffer_ptr(data) + output = ib.buffer_ptr(output) + + out_dtype = output.dtype + + if reduction is not None: + reduction = ib.buffer_ptr(reduction) + + max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + + with ib.if_scope(scan_axis_size == 0): + with ib.new_scope(): + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(bx, "thread_extent", batch_size) + with ib.if_scope(bx < batch_size): + if reduction is not None: + reduction[bx] = cast(identity_value, out_dtype) + with ib.else_scope(): + with ib.new_scope(): + nthread_tx = max_threads + nthread_bx = ceil_div(scan_axis_size, max_threads) + nthread_by = batch_size + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + by = te.thread_axis("blockIdx.y") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + ib.scope_attr(by, "thread_extent", nthread_by) + tid = bx * nthread_tx + tx + with ib.if_scope(tid < scan_axis_size): + output[by * scan_axis_size + tid] = cast(data[by * scan_axis_size + tid], out_dtype) + + nthread_tx = max_threads + nthread_bx = ceil_div(scan_axis_size, max_threads) + nthread_by = batch_size + + # The following algorithm performs parallel exclusive scan + # Up Sweep of exclusive scan + lim = ceil_log2(scan_axis_size) + + with ib.for_range(0, cast(lim, "int32"), dtype="int32") as l2_width: + width = 2 << l2_width + + with ib.new_scope(): + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr( + bx, + "thread_extent", + tvm.tir.generic.cast(ceil_div(scan_axis_size, max_threads * width), "int32"), + ) + tid = bx * nthread_tx + tx + + by = te.thread_axis("blockIdx.y") + ib.scope_attr(by, "thread_extent", nthread_by) + start = ib.allocate("int32", (1,), name="start", scope="local") + middle = ib.allocate("int32", (1,), name="middle", scope="local") + end = ib.allocate("int32", (1,), name="end", scope="local") + start[0] = width * tid + with ib.if_scope(start[0] < scan_axis_size): + middle[0] = start[0] + tvm.tir.indexdiv(width, 2) + end[0] = tvm.te.min(start[0] + width, scan_axis_size) + with ib.if_scope(middle[0] < scan_axis_size): + output[by * scan_axis_size + end[0] - 1] = binop( + output[by * scan_axis_size + end[0] - 1], + output[by * scan_axis_size + middle[0] - 1], + ) + + # Down Sweep of exclusive scan + with ib.new_scope(): + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(bx, "thread_extent", batch_size) + with ib.if_scope(bx < batch_size): + if reduction is not None: + reduction[bx] = output[(bx + 1) * scan_axis_size - 1] + output[(bx + 1) * scan_axis_size - 1] = cast(identity_value, out_dtype) + + with ib.for_range(0, cast(lim, "int32"), dtype="int32") as l2_width: + width = 2 << (lim - l2_width - 1) + + with ib.new_scope(): + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr( + bx, + "thread_extent", + tvm.tir.generic.cast(ceil_div(scan_axis_size, max_threads * width), "int32"), + ) + tid = bx * nthread_tx + tx + + by = te.thread_axis("blockIdx.y") + ib.scope_attr(by, "thread_extent", nthread_by) + start = ib.allocate("int32", (1,), name="start", scope="local") + middle = ib.allocate("int32", (1,), name="middle", scope="local") + end = ib.allocate("int32", (1,), name="end", scope="local") + tmp = ib.allocate(out_dtype, (1,), name="end", scope="local") + start[0] = width * tid + with ib.if_scope(tvm.tir.all(start[0] < scan_axis_size)): + middle[0] = start[0] + tvm.tir.indexdiv(width, 2) + end[0] = tvm.tir.min(start[0] + width, scan_axis_size) + with ib.if_scope(middle[0] < scan_axis_size): + tmp[0] = output[by * scan_axis_size + middle[0] - 1] + output[by * scan_axis_size + middle[0] - 1] = output[ + by * scan_axis_size + end[0] - 1 + ] + output[by * scan_axis_size + end[0] - 1] = binop( + output[by * scan_axis_size + end[0] - 1], tmp[0] + ) + return ib.get() + + +def get_reduction_from_exclusive_scan(data, ex_scan_output, binop=tvm.tir.generic.add): + """Return the sum of the last element of data and the exclusive scan output. + The is the reduction of data along each row (for 2-D case). + + Parameters + ---------- + data : tvm.te.Tensor + Input data of any shape + + ex_scan_output : tvm.te.Tensor + The output of exclusive scan on data + + binop: function, optional + A binary associative op to use for scan. The function takes two TIR expressions + and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute + prefix sum. + + Returns + ------- + reduction : tvm.te.Tensor + (N-1)-D tensor storing the reduction of each scan axis. + """ + ndim = len(data.shape) + if ndim == 1: + data = expand_dims(data, axis=0) + ex_scan_output = expand_dims(ex_scan_output, axis=0) + + def ir(data, data_ex_scan, reduction): + batch_size = cast(prod(data.shape[:-1]), "int32") + scan_axis_size = cast(data.shape[-1], "int32") + + ib = tvm.tir.ir_builder.create() + + data = ib.buffer_ptr(data) + data_ex_scan = ib.buffer_ptr(data_ex_scan) + reduction = ib.buffer_ptr(reduction) + + max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + with ib.new_scope(): + nthread_tx = max_threads + nthread_bx = ceil_div(batch_size, max_threads) + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + with ib.if_scope(tid < batch_size): + with ib.if_scope(scan_axis_size > 0): + reduction[tid] = binop( + data_ex_scan[tid * scan_axis_size + scan_axis_size - 1], + data[tid * scan_axis_size + scan_axis_size - 1], + ) + with ib.else_scope(): + reduction[tid] = cast(0, reduction.dtype) + + return ib.get() + + data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "valid_indices_buf", data_alignment=8) + ex_scan_output_buf = tvm.tir.decl_buffer( + ex_scan_output.shape, ex_scan_output.dtype, "ex_scan_output_buf", data_alignment=8 + ) + + reduction = te.extern( + [data.shape[:-1]], + [data, ex_scan_output], + lambda ins, outs: ir(ins[0], ins[1], outs[0]), + dtype=[ex_scan_output.dtype], + in_buffers=[data_buf, ex_scan_output_buf], + name="ex_scan_reduction", + tag="ex_scan_reduction_gpu", + ) + + if ndim == 1: + return squeeze(reduction, 0) + + return reduction + + +def scan_thrust( + data, + output_dtype, + exclusive=True, + return_reduction=False, + binop=tvm.tir.generic.add, + workspace=None, +): + """Do exclusive or inclusive scan on 1D or multidimensional input, using thrust. + + Parameters + ---------- + data : tvm.te.Tensor + Input data of any shape. The scan is done over the innermost axis. + + output_dtype: string + The dtype of the output scan tensor. + + exclusive: bool, optional + Whether or not do exclusive or inclusive scan. + + return_reduction: bool, optional + Whether or not return a (N-1)-D tensor storing the reduction of each scan axis. + Reductions are computed as part of the upsweep pass, so there is no extra cost. + If False, reductions are ignored. It must be False when exclusive is False. + + binop: function, optional + A binary associative op to use for scan. Since we need to lookup the corresponding + thrust function, arbitrariy callables are not supported. Currently only + tvm.tir.generic.add can be passed in. + + workspace: Optional[tvm.te.Tensor] + A buffer to store intermediate results. The size of the workspace should be sufficiently + large, this can be obtained by overestimation or memory usage profiling. If None, it will + fallback to use thrust internal memory allocation. + + Returns + ------- + output : tvm.te.Tensor + A N-D tensor of the same rank N and shape as the input data. + + reduction : tvm.te.Tensor, optional + (N-1)-D tensor storing the reduction of each scan axis. + Returned if return_reduction is True. + """ + data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + output_buf = tvm.tir.decl_buffer(data.shape, output_dtype, "output_buf", data_alignment=8) + + workspace_buf = ( + tvm.tir.decl_buffer(workspace.shape, workspace.dtype, "workspace_buf", data_alignment=8) + if workspace is not None + else None + ) + + def f_compute(ins, outs): + args = [_get_thrust_func_name(binop), ins[0], outs[0], exclusive] + if workspace is not None: + args.append(ins[1]) + return tvm.tir.call_packed(*args) + + output = te.extern( + [data.shape], + [data] if workspace is None else [data, workspace], + f_compute, + dtype=[output_dtype], + in_buffers=[data_buf] if workspace is None else [data_buf, workspace_buf], + out_buffers=[output_buf], + name="exclusive_scan_thrust", + tag="exclusive_scan_thrust_gpu", + ) + + if return_reduction: + assert exclusive, "return_reduction should be False for inclusive scan" + reduction = get_reduction_from_exclusive_scan(data, output, binop) + return output, reduction + + return output + + +def exclusive_scan( + data, + axis=-1, + return_reduction=False, + output_dtype=None, + binop=tvm.tir.generic.add, + identity_value=0, + workspace=None, +): + """Do exclusive scan on 1D or multidimensional input. + + Parameters + ---------- + data : tvm.te.Tensor + Input data of any shape. + + axis: int, optional + The axis to do scan on. By default, scan is done on the innermost axis. + + return_reduction: bool, optional + Whether or not return a tensor storing the reduction over each scan axis. + If the input rank is N, this tensor is of rank N - 1. + Reductions are computed as part of the upsweep pass, so there is no extra cost. + If False, reductions are ignored. + + output_dtype: string, optional + The dtype of the output scan tensor. If not provided, the dtype of the input is used. + + binop: function, optional + A binary associative op to use for scan. The function takes two TIR expressions + and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute + prefix sum. + + identity_value: int or float + A value for the binary operation which provides the identity property. E.g. if * is + your operator and i is the identity_value then a * i = a for all a in the domain of + your operation. + + workspace: Optional[tvm.te.Tensor] + A buffer to store intermediate results if thrust is enabled. The size of the workspace + should be sufficiently large, this can be obtained by overestimation or memory usage + profiling. If None, it will fallback to use thrust internal memory allocation. + + Returns + ------- + output : tvm.te.Tensor + A N-D tensor of the same rank N and shape as the input data. + + reduction : tvm.te.Tensor, optional + (N-1)-D tensor storing the reduction of each scan axis. + Returned if return_reduction is True. + """ + + def do_scan(data, output_dtype): + # TODO: add support for a prod_scan + if _can_use_scan_thrust(binop): + return scan_thrust( + data, + output_dtype, + exclusive=True, + return_reduction=return_reduction, + binop=binop, + workspace=workspace, + ) + + if ndim == 1: + # TIR exclusive scan accepts only 2D or higher-rank inputs. + data = expand_dims(data, axis=0) + + data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + output_buf = tvm.tir.decl_buffer(data.shape, output_dtype, "output_buf", data_alignment=8) + + if return_reduction: + output, reduction = te.extern( + [data.shape, data.shape[:-1]], + [data], + lambda ins, outs: exclusive_scan_ir( + ins[0], outs[0], outs[1], binop=binop, identity_value=identity_value + ), + dtype=[output_dtype, output_dtype], + in_buffers=[data_buf], + name="exclusive_scan", + tag="exclusive_scan_gpu", + ) + else: + output = te.extern( + [data.shape], + [data], + lambda ins, outs: exclusive_scan_ir( + ins[0], outs[0], binop=binop, identity_value=identity_value + ), + dtype=[output_dtype], + in_buffers=[data_buf], + out_buffers=[output_buf], + name="exclusive_scan", + tag="exclusive_scan_gpu", + ) + reduction = None + + if ndim == 1: + output = squeeze(output, 0) + if return_reduction: + reduction = squeeze(reduction, 0) + + if return_reduction: + return output, reduction + + return output + + if output_dtype is None or output_dtype == "": + output_dtype = data.dtype + + ndim = len(data.shape) + if axis < 0: + axis += ndim + + # If scan axis is not the innermost one, swap the scan and the innermost axes + # Scan is always done on the innermost axis, for performance reason. + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + + if return_reduction: + output, reduction = do_scan(data, output_dtype) + else: + output = do_scan(data, output_dtype) + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + output = transpose(output, axes) + + if return_reduction: + return output, reduction + + return output + + +def inclusive_scan( + data, axis=-1, output_dtype=None, binop=tvm.tir.generic.add, identity_value=0, workspace=None +): + """Do inclusive scan on 1D or multidimensional input. + + Parameters + ---------- + data : tvm.te.Tensor + Input data of any shape. + + axis: int, optional + The axis to do scan on. By default, scan is done on the innermost axis. + + output_dtype: string, optional + The dtype of the output scan tensor. If not provided, the dtype of the input is used. + + binop: function, optional + A binary associative op to use for scan. The function takes two TIR expressions + and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute + prefix sum. + + identity_value: int or float + A value for the binary operation which provides the identity property. E.g. if * is + your operator and i is the identity_value then a * i = a for all a in the domain of + your operation. + + workspace: Optional[tvm.te.Tensor] + A buffer to store intermediate results if thrust is enabled. The size of the workspace + should be sufficiently large, this can be obtained by overestimation or memory usage + profiling. If None, it will fallback to use thrust internal memory allocation. + + Returns + ------- + output : tvm.te.Tensor + A N-D tensor of the same rank N as the input data. + """ + + if _can_use_scan_thrust(binop): + if output_dtype is None or output_dtype == "": + output_dtype = data.dtype + ndim = len(data.shape) + if axis < 0: + axis += ndim + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + output = scan_thrust(data, output_dtype, exclusive=False, binop=binop, workspace=workspace) + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + output = transpose(output, axes) + return output + + ex_scan = exclusive_scan( + data, + axis, + output_dtype=output_dtype, + binop=binop, + identity_value=identity_value, + workspace=workspace, + ) + + if output_dtype is not None and data.dtype != output_dtype and output_dtype != "": + data = cast(data, output_dtype) + + return binop(data, ex_scan) + + +def scanop( + data: tvm.te.Tensor, + binop: Callable[["tvm.Expr", "tvm.Expr"], "tvm.Expr"], + identity_value: Union[float, int], + axis: Optional[int] = None, + dtype: Optional[str] = None, + exclusive: Optional[bool] = None, + workspace: Optional[tvm.te.Tensor] = None, +) -> tvm.te.Tensor: + """Cumulative binary operator (scan) with similar axis behavior as np.cumsum and np.cumprod. + + See cumprod and cumsum for an example of use. + + E.g. if * is your binary operator and the input tensor is [1, 2, 3, 4] the output may be + [1, 1 * 2, 1 * 2 * 3, 1 * 2 * 3 * 4] + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + binop: Callable (tvm.Expr, tvm.Expr) -> tvm.Expr + A binary operator which should be associative and commutative. E.g. if * is your + operator then a * (b * c) = (a * b) * c and a * b = b * a + + identity_value: int or float + A value for the binary operation which provides the identity property. E.g. if * is + your operator and i is the identity_value then a * i = a for all a in the domain of + your operation. + + axis : int, optional + Axis along which the operation is computed. The default (None) is to compute + the cumulative operation over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are computed. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : bool, optional + If true will return exclusive cumulative operation in which the first element is not + included. In other terms, if true, the j-th output element would be + the cumulative operation of the first (j-1) elements. Otherwise, it would be the + cumulative operation of the first j elements. + + workspace: Optional[tvm.te.Tensor] + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + """ + if axis is None: + axis = 0 + data = reshape(data, (prod(data.shape),)) + axis = get_const_int(axis) + if exclusive is not None and exclusive: + return exclusive_scan( + data, + axis, + output_dtype=dtype, + binop=binop, + identity_value=identity_value, + workspace=workspace, + ) + return inclusive_scan( + data, + axis, + output_dtype=dtype, + binop=binop, + identity_value=identity_value, + workspace=workspace, + ) + + +def cumsum( + data: tvm.te.Tensor, + axis: Optional[int] = None, + dtype: Optional[int] = None, + exclusive: Optional[bool] = None, + workspace: Optional[tvm.te.Tensor] = None, +) -> tvm.te.Tensor: + """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis. + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + axis : int, optional + Axis along which the cumulative sum is computed. The default (None) is to compute + the cumsum over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are summed. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : bool, optional + If true will return exclusive sum in which the first element is not + included. In other terms, if true, the j-th output element would be + the sum of the first (j-1) elements. Otherwise, it would be the sum of + the first j elements. + + workspace: Optional[tvm.te.Tensor] + A buffer to store intermediate results if thrust is enabled. The size of the workspace + should be sufficiently large, this can be obtained by overestimation or memory usage + profiling. If None, it will fallback to use thrust internal memory allocation. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + """ + return scanop( + data=data, + binop=tvm.tir.generic.add, + identity_value=0, + axis=axis, + dtype=dtype, + exclusive=exclusive, + workspace=workspace, + ) + + +def cumprod( + data: tvm.te.Tensor, + axis: Optional[int] = None, + dtype: Optional[int] = None, + exclusive: Optional[bool] = None, + workspace: Optional[tvm.te.Tensor] = None, +): + """Numpy style cumprod op. Return the cumulative product of the elements along a given axis. + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + axis : int, optional + Axis along which the cumulative product is computed. The default (None) is to compute + the cumproduct over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are multiplied. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : bool, optional + If True, will return exclusive product in which the first element is not + included. In other terms, if True, the j-th output element would be + the product of the first (j-1) elements. Otherwise, it would be the product of + the first j elements. + + workspace: Optional[tvm.te.Tensor] + A buffer to store intermediate results if thrust is enabled. The size of the workspace + should be sufficiently large, this can be obtained by overestimation or memory usage + profiling. If None, it will fallback to use thrust internal memory allocation. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + """ + return scanop( + data=data, + binop=tvm.tir.generic.multiply, + identity_value=1, + axis=axis, + dtype=dtype, + exclusive=exclusive, + workspace=workspace, + ) diff --git a/python/tvm/topi/gpu/sort.py b/python/tvm/topi/gpu/sort.py new file mode 100644 index 0000000000..71854e4399 --- /dev/null +++ b/python/tvm/topi/gpu/sort.py @@ -0,0 +1,939 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison, unused-argument, no-else-return +"""Sort related operators """ +import tvm +from tvm import te + +from ..transform import strided_slice, transpose +from ..utils import ceil_div, swap +from ..math import cast, ceil_log2 + + +def _get_threads(ib, nthread_tx, nthread_bx, nthread_by): + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + + by = te.thread_axis("blockIdx.y") + ib.scope_attr(by, "thread_extent", nthread_by) + + return tx, bx, by + + +def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_func=None): + """Initialize the output buffers by copying from inputs""" + axis_mul_before = 1 + axis_mul_after = 1 + if axis < 0: + axis = len(shape) + axis + for i, value in enumerate(shape, 0): + if i < axis: + axis_mul_before *= value + elif i > axis: + axis_mul_after *= value + + # Set up threading + max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + nthread_tx = max_threads + nthread_bx = ceil_div(shape[axis], max_threads) + nthread_by = axis_mul_before * axis_mul_after + + # Copy the keys_in to initial output + with ib.new_scope(): + tx, bx, by = _get_threads(ib, nthread_tx, nthread_bx, nthread_by) + tid = bx * nthread_tx + tx + by, bz = by % axis_mul_before, by // axis_mul_before + idx = (by * shape[axis] + tid) * axis_mul_after + bz + with ib.if_scope(tid < shape[axis]): + keys_out[idx] = keys_in[idx] + if values_out is not None: + values_out[idx] = value_init_func(idx, tid) + + return axis_mul_before, axis_mul_after + + +## TODO(mbrookhart): These are effective optimziation hyperparametrs +## Perhaps we can autotune? +block_size = 128 +thread_work = 4 + + +def _odd_even_sort( + ib, + size, + axis_mul_before, + axis_mul_after, + is_ascend, + keys, + keys_swap, + values=None, + values_swap=None, +): + nthread_tx = block_size // 2 + nthread_bx = ceil_div(size, block_size) + nthread_by = axis_mul_before * axis_mul_after + with ib.new_scope(): + ib.scope_attr(tvm.tir.const(0), "hand_threaded", 0) + tx, bx, by = _get_threads(ib, nthread_tx, nthread_bx, nthread_by) + by, bz = by % axis_mul_before, by // axis_mul_before + tid = 2 * tx + start = bx * block_size + + ## Create shared memory as syncable thread scratch space + tmp_keys_swap = ib.allocate( + keys_swap.dtype, + (block_size,), + name="temp_keys_swap", + scope="shared", + ) + if values_swap is not None: + tmp_values_swap = ib.allocate( + values_swap.dtype, + (block_size,), + name="temp_values_swap", + scope="shared", + ) + + ## Create thread local data for swapping + temp_keys = ib.allocate(keys_swap.dtype, (1,), name="temp_keys", scope="local") + if values_swap is not None: + temp_values = ib.allocate(values_swap.dtype, (1,), name="temp_values", scope="local") + + temp_cond1 = ib.allocate(keys_swap.dtype, (1,), name="temp_cond1", scope="local") + temp_cond2 = ib.allocate(keys_swap.dtype, (1,), name="temp_cond2", scope="local") + # Copy data to scratch space + base_idx = by * size * axis_mul_after + bz + with ib.for_range(0, 2) as n: + with ib.if_scope((tid + n + start) < size): + tmp_keys_swap[tid + n] = keys[base_idx + (tid + n + start) * axis_mul_after] + if values_swap is not None: + tmp_values_swap[tid + n] = values[base_idx + (tid + n + start) * axis_mul_after] + + ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))) + + idxm = tvm.tir.indexmod + # OddEvenTransposeSort + current_sort_num = tvm.tir.min(block_size, size - start) + with ib.for_range(0, current_sort_num) as k: + n = idxm(tid + k, 2) + with ib.if_scope(tid + n < current_sort_num - 1): + temp_cond1[0] = tmp_keys_swap[tid + n] + temp_cond2[0] = tmp_keys_swap[tid + n + 1] + if is_ascend: + cond = temp_cond1[0] > temp_cond2[0] + else: + cond = temp_cond1[0] < temp_cond2[0] + with ib.if_scope(cond): + temp_keys[0] = tmp_keys_swap[tid + n] + tmp_keys_swap[tid + n] = tmp_keys_swap[tid + n + 1] + tmp_keys_swap[tid + n + 1] = temp_keys[0] + if values_swap is not None: + temp_values[0] = tmp_values_swap[tid + n] + tmp_values_swap[tid + n] = tmp_values_swap[tid + n + 1] + tmp_values_swap[tid + n + 1] = temp_values[0] + ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))) + + ## Copy sorted data to output + with ib.for_range(0, 2) as n: + with ib.if_scope(tid + n + start < size): + keys[base_idx + (tid + n + start) * axis_mul_after] = tmp_keys_swap[tid + n] + keys_swap[base_idx + (tid + n + start) * axis_mul_after] = tmp_keys_swap[tid + n] + if values_swap is not None: + values[base_idx + (tid + n + start) * axis_mul_after] = tmp_values_swap[tid + n] + values_swap[base_idx + (tid + n + start) * axis_mul_after] = tmp_values_swap[ + tid + n + ] + + +def _sort_common( + ib, + size, + axis_mul_before, + axis_mul_after, + is_ascend, + keys, + keys_swap, + values=None, + values_swap=None, +): + """Either sort only values or sort values by keys.""" + + ## This function performs a multi-level mergesort + ## For blocks of length <= block_size, it does odd-even transpose sort + ## in GPU shared memory + ## For intermediate block sizes (>block_size, < max_threads * thread_work) + ## it uses the mergpath algorthim https://arxiv.org/abs/1406.2628 + ## to merge blocks in parallel + ## At some point, the size of the blocks to be merged is too big for max_threads + ## and we switch to using a dual-level mergepath where the outer mergepath + ## finds the start/end locations of the inner mergepath so that we can split + ## the merge into more blocks + + max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + nthread_by = axis_mul_before * axis_mul_after + nthread_tx = max_threads + nthread_bx = ceil_div(size, nthread_tx) + + def compare(a, b): + """ + Compare a and b in proper ascending or descending order + """ + if is_ascend: + out = a <= b + else: + out = b <= a + return out + + # Sort the lower levels of the merge using odd-even sort, it's fast for small inputs + lower_lim = ceil_log2(block_size) + + _odd_even_sort( + ib, + size, + axis_mul_before * axis_mul_after, + 1, + is_ascend, + keys, + keys_swap, + values, + values_swap, + ) + + upper_lim = ceil_log2(size) + + def get_merge_begin(source, base_idx, aCount, bCount, aStart, bStart, diag, step_count): + first = ib.allocate("int64", (1,), name="first", scope="local") + mid = ib.allocate("int64", (1,), name="mid", scope="local") + last = ib.allocate("int64", (1,), name="last", scope="local") + first[0] = tvm.te.max(0, diag - bCount) + last[0] = tvm.te.min(diag, aCount) + with ib.while_loop(first[0] < last[0]): + mid = (first[0] + last[0]) >> 1 + a = source[base_idx + (aStart + mid)] + b = source[base_idx + (bStart + diag - 1 - mid)] + with ib.if_scope(compare(a, b)): + first[0] = mid + 1 + with ib.else_scope(): + last[0] = mid + return first[0], last[0] + + def serial_merge( + source, + dest, + source_idx, + dest_idx, + base_idx, + aCount, + bCount, + aStart, + bStart, + kStart, + diag, + step_count, + first, + last, + ): + i = ib.allocate("int64", (1,), name="i", scope="local") + j = ib.allocate("int64", (1,), name="j", scope="local") + i[0] = aStart + first + j[0] = bStart + diag - last + with ib.for_range(0, tvm.te.min(aCount + bCount - diag, step_count)) as count: + i_idx = base_idx + i[0] + j_idx = base_idx + j[0] + k_idx = base_idx + (kStart + diag + count) + + def assign_i(): + """assign i value to current output""" + dest[k_idx] = source[i_idx] + if values is not None: + dest_idx[k_idx] = source_idx[i_idx] + i[0] += 1 + + def assign_j(): + """assign j value to current output""" + dest[k_idx] = source[j_idx] + if values is not None: + dest_idx[k_idx] = source_idx[j_idx] + j[0] += 1 + + ## if both of the iterators are in range + with ib.if_scope(tvm.tir.all(i[0] < aStart + aCount, j[0] < bStart + bCount)): + # compare them and insert whichever is next into the output + with ib.if_scope(compare(source[i_idx], source[j_idx])): + assign_i() + with ib.else_scope(): + assign_j() + # otherwise, simply copy the remainder of the valid iterator to the output + with ib.else_scope(): + with ib.if_scope(i[0] < aStart + aCount): + assign_i() + with ib.else_scope(): + assign_j() + + with ib.for_range(0, cast(upper_lim - lower_lim, "int64"), dtype="int64") as l2_width: + width = 2 << (l2_width + lower_lim) + # Define and launch the cuda kernel + with ib.new_scope(): + target = tvm.target.Target.current() + if "vulkan" in str(target): + # Vulkan can't handle dynamic nthread, so we thread slightly differently + # for vulkan. We don't do this generally because it causes a 15% perf + # regression on other platforms + ntx = max_threads + nbx = tvm.tir.generic.cast(ceil_div(width, max_threads * thread_work), "int32") + nbz = tvm.tir.generic.cast(ceil_div(size, width), "int32") + tx, bx, by = _get_threads(ib, ntx, nbx, nthread_by * nbz) + else: + ntx = tvm.tir.generic.cast(tvm.te.min(max_threads, width), "int32") + nbx = tvm.tir.generic.cast(ceil_div(width, max_threads * thread_work), "int32") + nbz = tvm.tir.generic.cast(ceil_div(size, width), "int32") + tx, bx, by = _get_threads(ib, ntx, nbx, nthread_by * nbz) + by, bz = by % nthread_by, by // nthread_by + + def mergepath( + source, + dest, + source_idx, + dest_idx, + aCount, + bCount, + aStart, + bStart, + kStart, + step_count, + even, + ): + # pylint: disable=arguments-out-of-order + def merge(source, dest, source_idx, dest_idx): + diag = tx * step_count + first, last = get_merge_begin( + source, + by * size, + aCount, + bCount, + aStart, + bStart, + diag, + step_count, + ) + # iterate over the output loop + serial_merge( + source, + dest, + source_idx, + dest_idx, + by * size, + aCount, + bCount, + aStart, + bStart, + kStart, + diag, + step_count, + first, + last, + ) + + with ib.if_scope(even): + merge(source, dest, source_idx, dest_idx) + with ib.else_scope(): + merge(dest, source, dest_idx, source_idx) + + def mergesort(source, dest, source_idx, dest_idx, size, width, even): + # calculate the start, mid, and end points of this section + start = width * bz + middle = cast(tvm.te.min(start + tvm.tir.indexdiv(width, 2), size), "int64") + end = cast(tvm.te.min(start + width, size), "int64") + with ib.if_scope(start < size): + with ib.if_scope(nbx == 1): + ## merge the start->middle and middle->end arrays + aCount = middle - start + bCount = end - middle + mergepath( + source, + dest, + source_idx, + dest_idx, + aCount, + bCount, + start, + middle, + start, + ceil_div(width, ntx), + even, + ) + with ib.else_scope(): + step_count = max_threads * thread_work + diag = bx * step_count + + def do_merge(first, last): + aStart = start + first + bStart = middle + diag - last + aCount = tvm.te.min(middle - aStart, step_count) + bCount = tvm.te.min(end - bStart, step_count) + mergepath( + source, + dest, + source_idx, + dest_idx, + aCount, + bCount, + aStart, + bStart, + start + diag, + thread_work, + even, + ) + + with ib.if_scope(even): + first, last = get_merge_begin( + source, + by * size, + middle - start, + end - middle, + start, + middle, + diag, + step_count, + ) + do_merge(first, last) + with ib.else_scope(): + first, last = get_merge_begin( + dest, + by * size, + middle - start, + end - middle, + start, + middle, + diag, + step_count, + ) + do_merge(first, last) + + # Call the kernel + mergesort( + keys, + keys_swap, + values, + values_swap, + size, + width, + tvm.tir.indexmod(l2_width, 2) == 0, + ) + nthread_by = axis_mul_before * axis_mul_after + nthread_tx = max_threads + nthread_bx = ceil_div(size, nthread_tx) + ## if the final sorted data ended up in the swap, copy it to the real output + with ib.if_scope( + tvm.tir.all(upper_lim > lower_lim, tvm.tir.indexmod(upper_lim - lower_lim, 2) == 1) + ): + with ib.new_scope(): + tx, bx, by = _get_threads(ib, nthread_tx, nthread_bx, nthread_by) + tid = bx * nthread_tx + tx + idx = by * size + tid + with ib.if_scope(tid < size): + keys[idx] = keys_swap[idx] + if values is not None: + values[idx] = values_swap[idx] + + +def sort_ir( + data, values_out, values_out_swap, axis, is_ascend, indices_out=None, indices_out_swap=None +): + """Low level IR to do sorting on the GPU, same usage as tvm.contrib.sort.argsort on the CPU. + + Parameters + ---------- + data: Buffer + Buffer of input data. Data will be sorted in place. + + values_out : Buffer + Output buffer of values of sorted tensor with same shape as data. + + values_out_swap : Buffer + Output buffer of values with same shape as data to use as swap. + + axis : Int + Axis long which to sort the input tensor. + + is_ascend : Boolean + Whether to sort in ascending or descending order. + + indicess_out : Buffer + Output buffer of indices of sorted tensor with same shape as data. + + indices_out_swap : Buffer + Output buffer of indices with same shape as data to use as swap. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + ib = tvm.tir.ir_builder.create() + shape = data.shape + + data = ib.buffer_ptr(data) + values_out = ib.buffer_ptr(values_out) + values_out_swap = ib.buffer_ptr(values_out_swap) + if indices_out is not None: + indices_out = ib.buffer_ptr(indices_out) + assert indices_out_swap is not None + indices_out_swap = ib.buffer_ptr(indices_out_swap) + + with ib.if_scope(shape[axis] > 0): + axis_mul_before, axis_mul_after = _sort_init( + ib, + shape, + axis, + data, + values_out, + indices_out, + value_init_func=lambda _, tid: tvm.tir.generic.cast(tid, indices_out.dtype), + ) + + _sort_common( + ib, + shape[axis], + axis_mul_before, + axis_mul_after, + is_ascend, + values_out, + values_out_swap, + values=indices_out, + values_swap=indices_out_swap, + ) + + return ib.get() + + +def sort(data, axis=-1, is_ascend=1): + """Performs sorting along the given axis and returns an array of + sorted values with the same shape as the input data. + + Parameters + ---------- + data: tvm.te.Tensor + The input array. + + axis : int, optional + Axis long which to sort the input tensor. + + is_ascend : boolean, optional + Whether to sort in ascending or descending order. + + Returns + ------- + out : tvm.te.Tensor + The output of this function. + """ + ndim = len(data.shape) + axis = ndim + axis if axis < 0 else axis + if axis != ndim - 1: + # Prepare for sorting along axis -1. + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + + value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8) + value_buf_swap = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf_swap", data_alignment=8) + + out = te.extern( + [data.shape, data.shape], + [data], + lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], -1, is_ascend), + out_buffers=[value_buf, value_buf_swap], + name="sort_gpu", + tag="sort_gpu", + )[0] + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + out = transpose(out, axes) + + return out + + +def sort_thrust(data, axis=-1, is_ascend=1, workspace=None): + """Performs sorting along the given axis and returns an array of + sorted values with the same shape as the input data. + + Parameters + ---------- + data: tvm.te.Tensor + The input array. + + axis : int, optional + Axis long which to sort the input tensor. + + is_ascend : boolean, optional + Whether to sort in ascending or descending order. + + workspace: Optional[tvm.te.Tensor] + A buffer to store intermediate results. The size of the workspace should be sufficiently + large, this can be obtained by overestimation or memory usage profiling. If None, it will + fallback to use thrust internal memory allocation. + + + Returns + ------- + out : tvm.te.Tensor + The output of this function. + """ + dtype = "float32" + + ndim = len(data.shape) + axis = ndim + axis if axis < 0 else axis + + if axis != ndim - 1: + # Prepare for sorting along axis -1. + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + + value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8) + indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8) + + def f_compute(ins, outs): + args = ["tvm.contrib.thrust.sort", ins[0], outs[0], outs[1], is_ascend] + if workspace is not None: + args.append(ins[1]) + return tvm.tir.call_packed(*args) + + out = te.extern( + [data.shape, data.shape], + [data] if workspace is None else [data, workspace], + ## TODO(mbrookhart): This thrust function is actually doing argsort, not sort + ## For performance, we should probably rename the contrib function and add + ## a pure sort + f_compute, + out_buffers=[value_buf, indices_buf], + name="sort_gpu", + tag="sort_gpu", + )[0] + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + out = transpose(out, axes) + return out + + +def argsort(data, axis=-1, is_ascend=1, dtype="float32", ret_type="indices"): + """Performs sorting along the given axis and returns an array of indices + having same shape as an input array that index data in sorted order. + + Parameters + ---------- + data: tvm.te.Tensor + The input array. + + axis : int, optional + Axis long which to sort the input tensor. + + is_ascend : boolean, optional + Whether to sort in ascending or descending order. + + dtype : string, optional + DType of the output indices. + + ret_type : string, optional + The return type [both, indices]. + "both": return both sorted data and indices. + "indices": return sorted indices only. + + Returns + ------- + out : tvm.te.Tensor + The output of this function. + """ + ndim = len(data.shape) + axis = ndim + axis if axis < 0 else axis + if axis != ndim - 1: + # Prepare for sorting along axis -1. + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + + value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8) + value_swap_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_swap_buf", data_alignment=8) + indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8) + indices_swap_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_swap_buf", data_alignment=8) + + outs = te.extern( + [data.shape, data.shape, data.shape, data.shape], + [data], + lambda ins, outs: sort_ir( + ins[0], + outs[0], + outs[2], + -1, + is_ascend, + indices_out=outs[1], + indices_out_swap=outs[3], + ), + out_buffers=[value_buf, indices_buf, value_swap_buf, indices_swap_buf], + name="argsort_gpu", + tag="argsort_gpu", + ) + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + outs = [transpose(out, axes) for out in outs] + + if ret_type == "indices": + return outs[1] + + return outs[0], outs[1] + + +def argsort_thrust(data, axis=-1, is_ascend=1, dtype="float32", ret_type="indices", workspace=None): + """Performs sorting along the given axis and returns an array of indices + having same shape as an input array that index data in sorted order. + + Parameters + ---------- + data: tvm.te.Tensor + The input array. + + axis : int, optional + Axis long which to sort the input tensor. + + is_ascend : boolean, optional + Whether to sort in ascending or descending order. + + dtype : string, optional + DType of the output indices. + + ret_type : string, optional + The return type [both, indices]. + "both": return both sorted data and indices. + "indices": return sorted indices only. + + workspace : Optional[tvm.te.Tensor] + A buffer to store intermediate results. The size of the workspace should be sufficiently + large, this can be obtained by overestimation or memory usage profiling. If None, it will + fallback to use thrust internal memory allocation. + + Returns + ------- + out : tvm.te.Tensor + The output of this function. + """ + return topk_thrust(data, 0, axis, ret_type, is_ascend, dtype, workspace) + + +def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): + """Get the top k elements in an input tensor along the given axis. + + Parameters + ---------- + data : tvm.te.Tensor + The input tensor. + + k : int, optional + Number of top elements to select. Return all elements if k < 1. + + axis : int, optional + Axis long which to sort the input tensor. + + ret_type: str, optional + The return type [both, values, indices]. + "both": return both top k data and indices. + "values": return top k data only. + "indices": return top k indices only. + + is_ascend : boolean, optional + Whether to sort in ascending or descending order. + + dtype : string, optional + The data type of the indices output. + + Returns + ------- + out : tvm.te.Tensor or List[tvm.te.Tensor] + The computed result. + """ + assert ret_type in ["both", "values", "indices"] + ndim = len(data.shape) + axis = axis + ndim if axis < 0 else axis + assert 0 <= axis < ndim + dshape = data.shape + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + + values_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "values_buf", data_alignment=8) + values_swap_buf = tvm.tir.decl_buffer( + data.shape, data.dtype, "values_swap_buf", data_alignment=8 + ) + indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "indices_buf", data_alignment=8) + indices_swap_buf = tvm.tir.decl_buffer(data.shape, dtype, "indies_swap_buf", data_alignment=8) + + if ret_type == "values": + output = te.extern( + [data.shape, data.shape], + [data], + lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], -1, is_ascend), + out_buffers=[values_buf, values_swap_buf], + name="topk_gpu", + tag="topk_gpu", + )[0] + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + output = transpose(output, axes) + else: + output = te.extern( + [data.shape, data.shape, data.shape, data.shape], + [data], + lambda ins, outs: sort_ir( + ins[0], + outs[0], + outs[2], + -1, + is_ascend, + indices_out=outs[1], + indices_out_swap=outs[3], + ), + out_buffers=[values_buf, indices_buf, values_swap_buf, indices_swap_buf], + name="topk_gpu", + tag="topk_gpu", + )[0:2] + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + output[0] = transpose(output[0], axes) + output[1] = transpose(output[1], axes) + + if isinstance(k, int) and k < 1: + if ret_type == "indices": + return output[1] + return output + beg = [0] * ndim + end = [] + strides = [1] * ndim + for i in range(ndim): + if i == axis: + end.append(k if isinstance(k, int) else tvm.te.size_var("dim")) + else: + end.append(dshape[i]) + if ret_type == "both": + values_out, indices_out = output + values_out = strided_slice(values_out, beg, end, strides) + indices_out = strided_slice(indices_out, beg, end, strides) + output = [values_out, indices_out] + elif ret_type == "values": + output = [strided_slice(output, beg, end, strides)] + else: # ret_type == "indices" + indices_out = output[1] + output = [strided_slice(indices_out, beg, end, strides)] + return output + + +def topk_thrust( + data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64", workspace=None +): + """Get the top k elements in an input tensor along the given axis. + + Parameters + ---------- + data : tvm.te.Tensor + The input tensor. + + k : int, optional + Number of top elements to select. Return all elements if k < 1. + + axis : int, optional + Axis long which to sort the input tensor. + + ret_type: str, optional + The return type [both, values, indices]. + "both": return both top k data and indices. + "values": return top k data only. + "indices": return top k indices only. + + is_ascend : boolean, optional + Whether to sort in ascending or descending order. + + dtype : string, optional + The data type of the indices output. + + workspace : Optional[tvm.te.Tensor] + A buffer to store intermediate results. The size of the workspace should be sufficiently + large, this can be obtained by overestimation or memory usage profiling. If None, it will + fallback to use thrust internal memory allocation. + + Returns + ------- + out : tvm.te.Tensor or List[tvm.te.Tensor] + The computed result. + """ + assert ret_type in ["both", "values", "indices"] + ndim = len(data.shape) + axis = ndim + axis if axis < 0 else axis + + if axis != ndim - 1: + # Prepare for sorting along axis -1. + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + + data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + if workspace is not None: + workspace_buf = tvm.tir.decl_buffer( + workspace.shape, workspace.dtype, "workspace_buf", data_alignment=8 + ) + else: + workspace_buf = None + out_bufs = [ + tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8), + tvm.tir.decl_buffer(data.shape, dtype, "indices_buf", data_alignment=8), + ] + + def f_compute(ins, outs): + args = ["tvm.contrib.thrust.sort", ins[0], outs[0], outs[1], is_ascend] + if workspace is not None: + args.append(ins[1]) + return tvm.tir.call_packed(*args) + + is_ascend = 1 if is_ascend else 0 + + out = te.extern( + [data.shape, data.shape], + [data] if workspace is None else [data, workspace], + f_compute, + in_buffers=[data_buf] if workspace is None else [data_buf, workspace_buf], + out_buffers=out_bufs, + name="topk_gpu", + tag="topk_gpu", + ) + + if isinstance(k, tvm.tir.IntImm): + k = k.value + + if not isinstance(k, int) or k > 0: + beg = [0] * ndim + end = data.shape[:-1] + [k if isinstance(k, int) else tvm.te.size_var("dim")] + strides = [1] * ndim + out = [strided_slice(o, beg, end, strides) for o in out] + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + out = [transpose(o, axes) for o in out] + + if ret_type == "values": + out = out[0] + elif ret_type == "indices": + out = out[1] + + return out diff --git a/rust/tvm-rt/Cargo.toml b/rust/tvm-rt/Cargo.toml index 24d9061a21..e813c69419 100644 --- a/rust/tvm-rt/Cargo.toml +++ b/rust/tvm-rt/Cargo.toml @@ -52,11 +52,9 @@ use-openmp = ["tvm-sys/use-openmp"] use-relay-debug = ["tvm-sys/use-relay-debug"] use-rtti = ["tvm-sys/use-rtti"] use-mscv-mt = ["tvm-sys/use-mscv-mt"] -use-micro = ["tvm-sys/use-micro"] use-install-dev = ["tvm-sys/use-install-dev"] hide-private-symbols = ["tvm-sys/hide-private-symbols"] use-fallback-stl-map = ["tvm-sys/use-fallback-stl-map"] -use-ethosn = ["tvm-sys/use-ethosn"] use-index-default-i64 = ["tvm-sys/use-index-default-i64"] use-tf-tvmdsoop = ["tvm-sys/use-tf-tvmdsoop"] use-byodt-posit = ["tvm-sys/use-byodt-posit"] @@ -71,7 +69,6 @@ use-rocblas = ["tvm-sys/use-rocblas"] use-sort = ["tvm-sys/use-sort"] use-nnpack = ["tvm-sys/use-nnpack"] use-random = ["tvm-sys/use-random"] -use-micro-standalone-runtime = ["tvm-sys/use-micro-standalone-runtime"] use-cpp-rpc = ["tvm-sys/use-cpp-rpc"] use-tflite = ["tvm-sys/use-tflite"] use-coreml = ["tvm-sys/use-coreml"] diff --git a/tests/cpp/runtime_test.cc b/tests/cpp/runtime_test.cc deleted file mode 100644 index be81ded5d7..0000000000 --- a/tests/cpp/runtime_test.cc +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include <gtest/gtest.h> -#include <tvm/driver/driver_api.h> -#include <tvm/ir/memory_pools.h> -#include <tvm/ir/module.h> -#include <tvm/relay/analysis.h> -#include <tvm/relay/executor.h> -#include <tvm/relay/expr.h> -#include <tvm/relay/op_attr_types.h> -#include <tvm/relay/op_strategy.h> -#include <tvm/relay/runtime.h> -#include <tvm/relay/transform.h> -#include <tvm/relay/type.h> -#include <tvm/runtime/executor_info.h> -#include <tvm/runtime/module.h> -#include <tvm/runtime/packed_func.h> -#include <tvm/runtime/registry.h> -#include <tvm/te/operation.h> -#include <tvm/topi/broadcast.h> -#include <tvm/topi/generic/injective.h> - -using namespace tvm; -using namespace tvm::relay; - -TVM_REGISTER_GLOBAL("runtime_test.strategy") - .set_body_typed([](const Attrs& attrs, const Array<te::Tensor>& inputs, const Type& out_type, - const Target& target) { - FTVMCompute fcompute = [](const Attrs& attrs, const Array<te::Tensor>& inputs, - const Type& out_type) -> Array<te::Tensor> { - ICHECK_EQ(inputs.size(), 2U); - return {topi::add(inputs[0], inputs[1])}; - }; - FTVMSchedule fschedule = [](const Attrs& attrs, const Array<te::Tensor>& outs, - const Target& target) { - With<Target> target_scope(target); - return topi::generic::schedule_injective(target, outs); - }; - - auto n = make_object<OpStrategyNode>(); - auto strategy = tvm::relay::OpStrategy(std::move(n)); - strategy.AddImplementation(fcompute, fschedule, "runtime_test.strategy", 10); - return strategy; - }); - -TEST(Runtime, ZeroCopy) { - auto tensor_type = relay::TensorType({2, 3}, DataType::Float(32)); - auto a = relay::Var("a", tensor_type); - auto b = relay::Var("b", tensor_type); - auto add_op = relay::Op::Get("add"); - auto x = relay::Call(add_op, {a, b}, tvm::Attrs(), {}); - auto c = relay::Var("c", tensor_type); - auto y = relay::Call(add_op, {x, c}, tvm::Attrs(), {}); - auto func = relay::Function(relay::FreeVars(y), y, relay::Type(), {}); - auto A = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto B = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto C = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto Y = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0}); - - auto pA = static_cast<float*>(A->data); - auto pB = static_cast<float*>(B->data); - auto pC = static_cast<float*>(C->data); - auto pY = static_cast<float*>(Y->data); - - for (int i = 0; i < 6; ++i) { - pA[i] = i; - pB[i] = i + 1; - pC[i] = i + 2; - } - // get schedule - auto reg = tvm::runtime::Registry::Get("ir.RegisterOpAttr"); - if (!reg) { - LOG(FATAL) << "no _Register"; - } - auto reset = tvm::runtime::Registry::Get("ir.OpResetAttr"); - if (!reset) { - LOG(FATAL) << "Reset is not defined."; - } - auto fs = tvm::runtime::Registry::Get("runtime_test.strategy"); - if (!fs) { - LOG(FATAL) << "No test_strategy registered."; - } - auto fgeneric = GenericFunc::Get("runtime_test.strategy_generic").set_default(*fs, true); - (*reset)(add_op, "FTVMStrategy"); - (*reg)("add", "FTVMStrategy", fgeneric, 10); - Array<Integer> dep; - dep.push_back(0); - (*reset)(add_op, "TShapeDataDependent"); - (*reg)("add", "TShapeDataDependent", dep, 10); - // build - auto pfb = tvm::runtime::Registry::Get("relay.build_module._BuildModule"); - tvm::runtime::Module build_mod = (*pfb)(); - auto build_f = build_mod.GetFunction("build", false); - auto json_f = build_mod.GetFunction("get_graph_json", false); - auto mod_f = build_mod.GetFunction("get_module", false); - Target llvm_tgt = Target("llvm"); - Array<Target> targets = {llvm_tgt}; - auto relay_mod = tvm::IRModule::FromExpr(func); - ICHECK(relay_mod.defined()) << "Module must be defined"; - build_f(relay_mod, targets, llvm_tgt, Executor::Create("graph"), Runtime::Create("cpp"), - WorkspaceMemoryPools(), ConstantMemoryPools(), ""); - // create graph executor - std::string json = json_f(); - tvm::runtime::Module mod = mod_f(); - auto dev = A->device; - auto pfr = tvm::runtime::Registry::Get("tvm.graph_executor.create"); - ICHECK(mod.defined()) << "Module must be defined"; - tvm::runtime::Module run_mod = - (*pfr)(json, mod, static_cast<int>(dev.device_type), dev.device_id); - // get function - auto set_input_f = run_mod.GetFunction("set_input_zero_copy", false); - auto set_output_f = run_mod.GetFunction("set_output_zero_copy", false); - auto run_f = run_mod.GetFunction("run", false); - // set input zero copy - set_input_f("a", const_cast<DLTensor*>(A.operator->())); - set_input_f("b", const_cast<DLTensor*>(B.operator->())); - set_input_f("c", const_cast<DLTensor*>(C.operator->())); - // set output zero copy - set_output_f(0, const_cast<DLTensor*>(Y.operator->())); - run_f(); - // check correctness - for (int i = 0; i < 6; ++i) { - ICHECK_LT(fabs(pY[i] - (i + (i + 1) + (i + 2))), 1e-4); - } - // mutate the input a bit and run it again - for (int i = 0; i < 6; ++i) { - pB[i] = i + 3; - } - run_f(); - // check correctness - for (int i = 0; i < 6; ++i) { - ICHECK_LT(fabs(pY[i] - (i + (i + 3) + (i + 2))), 1e-4); - } - // attach a different input and run it again - auto C2 = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto pC2 = static_cast<float*>(C2->data); - for (int i = 0; i < 6; ++i) { - pC2[i] = i + 4; - } - set_input_f("c", const_cast<DLTensor*>(C2.operator->())); - run_f(); - // check correctness - for (int i = 0; i < 6; ++i) { - ICHECK_LT(fabs(pY[i] - (i + (i + 3) + (i + 4))), 1e-4); - } -} diff --git a/tests/python/codegen/test_target_codegen_cuda.py b/tests/python/codegen/test_target_codegen_cuda.py index 764328a49b..7b370f3e32 100644 --- a/tests/python/codegen/test_target_codegen_cuda.py +++ b/tests/python/codegen/test_target_codegen_cuda.py @@ -432,7 +432,6 @@ def test_rfactor_predicates(target, dev): def test_cuda_const_float_to_half(): # This import is required to use nvcc to perform code gen; # otherwise it is found that the code gen is done by nvrtc. - from tvm import autotvm shape = (2, 3, 4) a = te.placeholder(shape, dtype="float16", name="a") diff --git a/tests/python/relax/test_backend_dispatch_sort_scan.py b/tests/python/relax/test_backend_dispatch_sort_scan.py index 1efbd690f0..4fe6de9e09 100644 --- a/tests/python/relax/test_backend_dispatch_sort_scan.py +++ b/tests/python/relax/test_backend_dispatch_sort_scan.py @@ -93,13 +93,13 @@ def test_dispatch_scanop_cuda(): with bb.function("main", (x,), {"global_symbol": "main"}): with bb.dataflow(): lv = bb.emit_te( - topi.cuda.cumsum, + topi.gpu.cumsum, x, axis=1, exclusive=True, ) out = bb.emit_te( - topi.cuda.cumprod, + topi.gpu.cumprod, lv, axis=1, ) @@ -178,7 +178,7 @@ def test_dispatch_sort_cuda(): with bb.function("foo", (x,), {"global_symbol": "foo"}): with bb.dataflow(): out = bb.emit_te( - topi.cuda.sort, + topi.gpu.sort, x, axis=1, ) @@ -193,14 +193,14 @@ def test_dispatch_sort_cuda(): ) ) out = bb.emit_te( - topi.cuda.sort_thrust, + topi.gpu.sort_thrust, y, axis=0, is_ascend=False, workspace=workspace, ) else: - out = bb.emit_te(topi.cuda.sort, y, axis=0, is_ascend=False) + out = bb.emit_te(topi.gpu.sort, y, axis=0, is_ascend=False) out = bb.emit_output(out) bb.emit_func_output(out) expected_mod = bb.finalize() @@ -273,7 +273,7 @@ def test_dispatch_argsort_cuda(): with target: with bb.function("foo", (x,), {"global_symbol": "foo"}): with bb.dataflow(): - out = bb.emit_te(topi.cuda.argsort, x, axis=1, is_ascend=True, dtype="int32") + out = bb.emit_te(topi.gpu.argsort, x, axis=1, is_ascend=True, dtype="int32") out = bb.emit_output(out) bb.emit_func_output(out) with bb.function("foo2", (y,), {"global_symbol": "foo2"}): @@ -285,7 +285,7 @@ def test_dispatch_argsort_cuda(): ) ) out = bb.emit_te( - topi.cuda.argsort_thrust, + topi.gpu.argsort_thrust, y, axis=0, is_ascend=False, @@ -293,7 +293,7 @@ def test_dispatch_argsort_cuda(): workspace=workspace, ) else: - out = bb.emit_te(topi.cuda.argsort, y, axis=0, is_ascend=False, dtype="int64") + out = bb.emit_te(topi.gpu.argsort, y, axis=0, is_ascend=False, dtype="int64") out = bb.emit_output(out) bb.emit_func_output(out) expected_mod = bb.finalize() @@ -357,7 +357,7 @@ def test_dispatch_topk_cuda(): with target: with bb.function("foo", (x,), {"global_symbol": "foo"}): with bb.dataflow(): - out = bb.emit_te(topi.cuda.topk, x, k=2, axis=1, is_ascend=False, dtype="int32") + out = bb.emit_te(topi.gpu.topk, x, k=2, axis=1, is_ascend=False, dtype="int32") out = bb.emit_output(out) bb.emit_func_output(out) expected_mod = bb.finalize() @@ -393,8 +393,8 @@ def test_dispatch_topk_gpu(): with target: with bb.function("foo", (x,), {"global_symbol": "foo"}): with bb.dataflow(): - lv0 = bb.emit_te(topi.cuda.topk, x, k=2, axis=1, is_ascend=False, dtype="int32") - lv1 = bb.emit_te(topi.cuda.topk, x, k=2, axis=1, is_ascend=False, dtype="int32") + lv0 = bb.emit_te(topi.gpu.topk, x, k=2, axis=1, is_ascend=False, dtype="int32") + lv1 = bb.emit_te(topi.gpu.topk, x, k=2, axis=1, is_ascend=False, dtype="int32") out = (lv0, lv1) out = bb.emit_output(out) bb.emit_func_output(out) diff --git a/tests/python/relax/test_dataflow_pattern.py b/tests/python/relax/test_dataflow_pattern.py index 3aa316cff7..4b5da0d9e6 100644 --- a/tests/python/relax/test_dataflow_pattern.py +++ b/tests/python/relax/test_dataflow_pattern.py @@ -278,13 +278,13 @@ def test_extern_fn_pattern(): def test_op_attr(): x = rx.Var("x", R.Tensor("float32")) y = rx.Var("y", R.Tensor("float32")) - conv2d = rx.nn.conv2d(x, y, kernel_size=(3, 3)) + conv2d = rx.op.nn.conv2d(x, y, strides=(3, 3)) xp = is_var("x") yp = is_var("y") # TODO(@yuchen): reenable the assert after figuring out why it fails - # assert is_op("nn.conv2d")(xp, yp).has_attr({"kernel_size": [3, 3]}).match(conv2d) - assert not is_op("nn.conv2d")(xp, yp).has_attr({"kernel_size": [4, 3]}).match(conv2d) - assert not is_op("nn.conv2d")(xp, yp).has_attr({"kernel_size_": [3, 3]}).match(conv2d) + # assert is_op("nn.conv2d")(xp, yp).has_attr({"strides": [3, 3]}).match(conv2d) + assert not is_op("nn.conv2d")(xp, yp).has_attr({"strides": [4, 3]}).match(conv2d) + assert not is_op("nn.conv2d")(xp, yp).has_attr({"strides": [3, 3]}).match(conv2d) def test_match_call_attr(): diff --git a/tests/python/runtime/test_runtime_graph_cuda_graph.py b/tests/python/runtime/test_runtime_graph_cuda_graph.py deleted file mode 100644 index 0282161c60..0000000000 --- a/tests/python/runtime/test_runtime_graph_cuda_graph.py +++ /dev/null @@ -1,100 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import json -import os -import re -import sys -import time - -import pytest - -import tvm -import tvm.testing -from tvm import te -import numpy as np - -from tvm.contrib import utils, graph_executor -from tvm.contrib.cuda_graph import cuda_graph_executor - - -bx = te.thread_axis("blockIdx.x") -tx = te.thread_axis("threadIdx.x") - - [email protected]_cudagraph -def test_graph_simple(): - n = 32 - A = te.placeholder((n,), name="A") - B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=8) - s[B].bind(xo, bx) - s[B].bind(xi, tx) - - node0 = {"op": "null", "name": "x", "inputs": []} - node1 = { - "op": "tvm_op", - "name": "add", - "inputs": [[0, 0, 0]], - "attrs": {"func_name": "myadd", "flatten_data": "1", "num_inputs": "1", "num_outputs": "1"}, - } - nodes = [node0, node1] - arg_nodes = [0] - node_row_ptr = [0, 1, 2] - outputs = [[1, 0, 0]] - shape = (n,) - attrs = { - "shape": ["list_shape", [shape, shape]], - "dltype": ["list_str", ["float32", "float32"]], - "storage_id": ["list_int", [0, 1]], - } - graph = { - "nodes": nodes, - "arg_nodes": arg_nodes, - "node_row_ptr": node_row_ptr, - "heads": outputs, - "attrs": attrs, - } - graph = json.dumps(graph) - - def check_verify(): - mlib = tvm.build(s, [A, B], "cuda", name="myadd") - dev = tvm.cuda(0) - try: - mod = cuda_graph_executor.create(graph, mlib, dev) - except ValueError: - return - - for i in range(3): - a = np.random.uniform(size=(n,)).astype(A.dtype) - mod.run(x=a) # The first run captured a CUDA graph - out = mod.get_output(0, tvm.nd.empty((n,))) - np.testing.assert_equal(out.numpy(), a + 1) - - # capture / run CUDA graph manually - mod.capture_cuda_graph() - a = np.random.uniform(size=(n,)).astype(A.dtype) - mod.set_input(x=a) - mod.run_cuda_graph() - out = mod.get_output(0, tvm.nd.empty((n,))) - np.testing.assert_equal(out.numpy(), a + 1) - - check_verify() - - -if __name__ == "__main__": - test_graph_simple() diff --git a/tests/python/te/test_te_create_primfunc.py b/tests/python/te/test_te_create_primfunc.py index 0fb64e8d0f..486fc0b18c 100644 --- a/tests/python/te/test_te_create_primfunc.py +++ b/tests/python/te/test_te_create_primfunc.py @@ -18,7 +18,7 @@ import numpy as np import tvm import tvm.testing -from tvm import te, tir, topi, relay +from tvm import te, tir, topi from tvm.script import tir as T import pytest @@ -640,59 +640,6 @@ def test_reshape(): _check_workload(te_reshape, tir_reshape, index_dtype_override="int64") [email protected]_func -def argmax_expected( - p0: T.Buffer((T.int64(1), T.int64(64), T.int64(56), T.int64(56)), "uint8"), - p0_red: T.Buffer((T.int64(1), T.int64(56), T.int64(56)), "int32"), -): - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - p0_red_temp_v0 = T.alloc_buffer([T.int64(1), T.int64(56), T.int64(56)], dtype="int32") - p0_red_temp_v1 = T.alloc_buffer([T.int64(1), T.int64(56), T.int64(56)], dtype="uint8") - for ax0, ax1, ax2, k1 in T.grid(T.int64(1), T.int64(56), T.int64(56), T.int64(64)): - with T.block("p0_red_temp"): - v_ax0, v_ax1, v_ax2, v_k1 = T.axis.remap("SSSR", [ax0, ax1, ax2, k1]) - T.reads(p0[v_ax0, v_k1, v_ax1, v_ax2]) - T.writes(p0_red_temp_v0[v_ax0, v_ax1, v_ax2], p0_red_temp_v1[v_ax0, v_ax1, v_ax2]) - with T.init(): - p0_red_temp_v0[v_ax0, v_ax1, v_ax2] = -1 - p0_red_temp_v1[v_ax0, v_ax1, v_ax2] = T.uint8(0) - v_p0_red_temp_v0: T.int64 = T.Select( - p0_red_temp_v1[v_ax0, v_ax1, v_ax2] > p0[v_ax0, v_k1, v_ax1, v_ax2] - or ( - p0_red_temp_v1[v_ax0, v_ax1, v_ax2] == p0[v_ax0, v_k1, v_ax1, v_ax2] - and T.Cast("int64", p0_red_temp_v0[v_ax0, v_ax1, v_ax2]) < v_k1 - ), - T.Cast("int64", p0_red_temp_v0[v_ax0, v_ax1, v_ax2]), - v_k1, - ) - v_p0_red_temp_v1: T.uint8 = T.Select( - p0_red_temp_v1[v_ax0, v_ax1, v_ax2] > p0[v_ax0, v_k1, v_ax1, v_ax2], - p0_red_temp_v1[v_ax0, v_ax1, v_ax2], - p0[v_ax0, v_k1, v_ax1, v_ax2], - ) - p0_red_temp_v0[v_ax0, v_ax1, v_ax2] = T.Cast("int32", v_p0_red_temp_v0) - p0_red_temp_v1[v_ax0, v_ax1, v_ax2] = v_p0_red_temp_v1 - for ax0, ax1, ax2 in T.grid(T.int64(1), T.int64(56), T.int64(56)): - with T.block("p0_red"): - v_ax0, v_ax1, v_ax2 = T.axis.remap("SSS", [ax0, ax1, ax2]) - T.reads(p0_red_temp_v0[v_ax0, v_ax1, v_ax2]) - T.writes(p0_red[v_ax0, v_ax1, v_ax2]) - p0_red[v_ax0, v_ax1, v_ax2] = p0_red_temp_v0[v_ax0, v_ax1, v_ax2] - - -def test_argmax(): - data = relay.var("data", shape=(1, 64, 56, 56), dtype="uint8") - mod = tvm.IRModule.from_expr(relay.argmax(data, axis=1)) - - target = tvm.target.Target("llvm") - - opt_mod, _ = relay.optimize(mod, params={}, target=target) - - prim_func = relay.backend.te_compiler.lower_to_primfunc(opt_mod["main"].body.op, target) - - tvm.ir.assert_structural_equal(prim_func, argmax_expected) - - def te_resize2d_symbolic(): oh = tir.Var("oh", "int64") ow = tir.Var("ow", "int64") diff --git a/tests/python/te/test_te_tensor_overload.py b/tests/python/te/test_te_tensor_overload.py deleted file mode 100644 index 6ee2bae352..0000000000 --- a/tests/python/te/test_te_tensor_overload.py +++ /dev/null @@ -1,276 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import numpy as np -import tvm -from tvm import te -from tvm import topi -import tvm.topi.testing -from tvm.topi.utils import get_const_tuple -import tvm.testing - - -def test_operator_type_and_tags(): - k = 1 - n = te.var("n") - A = te.placeholder((), name="A") - B = te.placeholder((10, 5), name="B") - B1 = B[0] - B2 = B[0, 0] - - assert isinstance(k + n, tvm.tir.PrimExpr) - assert isinstance(n + n, tvm.tir.PrimExpr) - assert isinstance(k + A, te.tensor.Tensor) - assert isinstance(A + k, te.tensor.Tensor) - assert isinstance(n + A, te.tensor.Tensor) - assert isinstance(A + n, te.tensor.Tensor) - assert isinstance(A + A, te.tensor.Tensor) - - assert isinstance(k + B, te.tensor.Tensor) - assert isinstance(B + k, te.tensor.Tensor) - assert isinstance(n + B, te.tensor.Tensor) - assert isinstance(B + n, te.tensor.Tensor) - assert isinstance(A + B, te.tensor.Tensor) - assert isinstance(B + A, te.tensor.Tensor) - assert isinstance(B + B, te.tensor.Tensor) - - assert (k + B).op.tag == topi.tag.ELEMWISE - assert (B + k).op.tag == topi.tag.ELEMWISE - assert (n + B).op.tag == topi.tag.ELEMWISE - assert (B + n).op.tag == topi.tag.ELEMWISE - assert (A + B).op.tag == topi.tag.BROADCAST - assert (B + A).op.tag == topi.tag.BROADCAST - assert (B + B).op.tag == topi.tag.BROADCAST - - assert isinstance(k + B2, tvm.tir.PrimExpr) - assert isinstance(B2 + k, tvm.tir.PrimExpr) - assert isinstance(n + B2, tvm.tir.PrimExpr) - assert isinstance(B2 + n, tvm.tir.PrimExpr) - assert isinstance(B2 + B2, tvm.tir.PrimExpr) - assert isinstance(B2 + A, te.tensor.Tensor) - assert isinstance(A + B2, te.tensor.Tensor) - assert isinstance(B2 + B, te.tensor.Tensor) - assert isinstance(B + B2, te.tensor.Tensor) - - -def test_combination(): - k = 3 - n = 5 - m = 10 - x = te.var("x") - A = te.placeholder((n, m), name="A") - B = te.placeholder((n, m), name="B") - C = te.placeholder((n, m), name="C") - D = k + A - B * C + x - s = te.create_schedule(D.op) - foo = tvm.build(s, [x, A, B, C, D], "llvm") - dev = tvm.cpu(0) - x = 2 - a = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(n, m)).astype(B.dtype), dev) - c = tvm.nd.array(np.random.uniform(size=(n, m)).astype(C.dtype), dev) - d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), dev) - foo(x, a, b, c, d) - tvm.testing.assert_allclose(d.numpy(), k + a.numpy() - b.numpy() * c.numpy() + x) - - -def verify_tensor_scalar_bop(shape, typ="add"): - """Verify non-constant Tensor and scalar binary operations.""" - sh = [te.size_var("n%d" % i) for i in range(0, len(shape))] - k = te.var("k") - A = te.placeholder(sh, name="A") - if typ == "add": - B = A + k - elif typ == "sub": - B = A - k - elif typ == "mul": - B = A * k - elif typ == "div": - B = A / k - else: - raise NotImplementedError() - - def check_device(device): - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) - return - dev = tvm.device(device, 0) - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_elemwise_schedule(device)(B) - - k_ = 2 - foo = tvm.build(s, [A, B, k] + sh, device, name="tensor_scalar_" + typ) - a_npy = np.random.uniform(size=shape).astype(A.dtype) - if typ == "add": - b_npy = a_npy + k_ - elif typ == "sub": - b_npy = a_npy - k_ - elif typ == "mul": - b_npy = a_npy * k_ - elif typ == "div": - b_npy = a_npy / k_ - else: - raise NotImplementedError() - - a_nd = tvm.nd.array(a_npy, dev) - b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), dev) - foo(a_nd, b_nd, k_, *shape) - tvm.testing.assert_allclose(b_nd.numpy(), b_npy, rtol=1e-5) - - for device in ["llvm", "cuda", "opencl", "metal", "rocm", "vulkan"]: - check_device(device) - - -def verify_broadcast_bop(lhs_shape, rhs_shape, typ="add"): - A = te.placeholder(shape=lhs_shape, name="A") - B = te.placeholder(shape=rhs_shape, name="B") - if typ == "add": - C = A + B - elif typ == "sub": - C = A - B - elif typ == "mul": - C = A * B - elif typ == "div": - C = A / B - else: - raise NotImplementedError() - - def check_device(device): - dev = tvm.device(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) - return - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_broadcast_schedule(device)(C) - - foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ) - lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype) - rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype) - if typ == "add": - out_npy = lhs_npy + rhs_npy - elif typ == "sub": - out_npy = lhs_npy - rhs_npy - elif typ == "mul": - out_npy = lhs_npy * rhs_npy - elif typ == "div": - rhs_npy = np.abs(rhs_npy) + 0.001 - out_npy = lhs_npy / rhs_npy - else: - raise NotImplementedError() - - lhs_nd = tvm.nd.array(lhs_npy, dev) - rhs_nd = tvm.nd.array(rhs_npy, dev) - out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), dev) - for _ in range(1): - foo(lhs_nd, rhs_nd, out_nd) - tvm.testing.assert_allclose(out_nd.numpy(), out_npy, rtol=1e-4, atol=1e-4) - - for device in ["llvm", "cuda", "opencl", "metal", "rocm", "vulkan"]: - check_device(device) - - [email protected]_gpu -def verify_conv2d_scalar_bop( - batch, in_size, in_channel, num_filter, kernel, stride, padding, typ="add" -): - def check_device(device): - dev = tvm.device(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) - return - print("Running on target: %s" % device) - - conv2d_nchw, schedule_conv2d_nchw = tvm.topi.testing.get_conv2d_nchw_implement(device) - - k = 10.0 - dilation = (1, 1) - with tvm.target.Target(device): - A = te.placeholder((batch, in_channel, in_size, in_size), name="A") - W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W") - B = conv2d_nchw(A, W, stride, padding, dilation, A.dtype) - if typ == "add": - C = B + k - elif typ == "sub": - C = B - k - elif typ == "mul": - C = B * k - elif typ == "div": - C = B / k - else: - raise NotImplementedError() - s = schedule_conv2d_nchw([C]) - - foo = tvm.build(s, [A, W, B, C], device, name="conv2d_scalar_" + typ) - - a_npy = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) - w_npy = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype) - b_npy = tvm.topi.testing.conv2d_nchw_python(a_npy, w_npy, stride, padding) - c_npy = np.random.uniform(size=get_const_tuple(B.shape)).astype(B.dtype) - if typ == "add": - c_npy = b_npy + k - elif typ == "sub": - c_npy = b_npy - k - elif typ == "mul": - c_npy = b_npy * k - elif typ == "div": - c_npy = b_npy / k - else: - raise NotImplementedError() - - a_nd = tvm.nd.array(a_npy, dev) - w_nd = tvm.nd.array(w_npy, dev) - b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), dev) - c_nd = tvm.nd.array(np.empty(c_npy.shape).astype(C.dtype), dev) - foo(a_nd, w_nd, b_nd, c_nd) - tvm.testing.assert_allclose(c_nd.numpy(), c_npy, rtol=1e-4, atol=1e-4) - - for device in ["llvm", "cuda", "opencl", "metal", "rocm", "vulkan"]: - check_device(device) - - [email protected]_gpu -def test_tensor_scalar_bop(): - verify_tensor_scalar_bop((1,), typ="add") - verify_tensor_scalar_bop((3, 5), typ="sub") - verify_tensor_scalar_bop((1, 3, 5), typ="mul") - verify_tensor_scalar_bop((2, 3, 1, 32), typ="div") - - [email protected]_gpu -def test_broadcast_bop(): - verify_broadcast_bop((2, 3), (), typ="add") - verify_broadcast_bop((5, 2, 3), (1,), typ="add") - verify_broadcast_bop((1, 32), (64, 32), typ="sub") - verify_broadcast_bop((5, 64, 128), (2, 5, 64, 1), typ="mul") - verify_broadcast_bop((2, 3, 1, 32), (64, 32), typ="div") - - [email protected]_gpu -def test_conv2d_scalar_bop(): - verify_conv2d_scalar_bop(1, 16, 4, 4, 3, 1, 1, typ="add") - verify_conv2d_scalar_bop(1, 32, 2, 1, 3, 1, 1, typ="sub") - verify_conv2d_scalar_bop(1, 32, 1, 1, 3, 1, 1, typ="mul") - verify_conv2d_scalar_bop(1, 16, 2, 1, 3, 1, 1, typ="div") - - -if __name__ == "__main__": - test_operator_type_and_tags() - test_combination() - test_tensor_scalar_bop() - test_broadcast_bop() - test_conv2d_scalar_bop() diff --git a/tests/python/testing/test_format_si_prefix.py b/tests/python/testing/test_format_si_prefix.py deleted file mode 100644 index e0276ce022..0000000000 --- a/tests/python/testing/test_format_si_prefix.py +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from numpy import isclose -import random -from tvm.autotvm import utils - - -SI_PREFIXES = "yzafpn\xb5m kMGTPEZY" - - -def test_format_si_prefix(): - # test float conversion - assert utils.format_si_prefix(1024, "k") == 1.024 - - for i, prefix in enumerate(SI_PREFIXES): - integer, decimal = random.randint(0, 1000), random.randint(0, 1000) - exp = -24 + 3 * i # 0th prefix (yocto) is 10^-24 - number = integer * (10**exp) + decimal * (10 ** (exp - 3)) - expected = integer + decimal / 1000 - assert isclose(utils.format_si_prefix(number, prefix), expected) - - assert utils.format_si_prefix(0, "y") == 0 - - -if __name__ == "__main__": - test_format_si_prefix() diff --git a/tests/python/tir-transform/test_tir_transform_fp8_legalize.py b/tests/python/tir-transform/test_tir_transform_fp8_legalize.py index 62e7072479..e1f487c572 100644 --- a/tests/python/tir-transform/test_tir_transform_fp8_legalize.py +++ b/tests/python/tir-transform/test_tir_transform_fp8_legalize.py @@ -206,8 +206,6 @@ promote_dtype = tvm.testing.parameter("float16", "float32") def test_fp8_compute_legalize(dtype, promote_dtype): - if tvm.contrib.nvcc.have_fp8(tvm.contrib.nvcc.get_target_compute_version()): - return target = Target("cuda") before = BindTarget(target)(get_before(dtype)) expected = BindTarget(target)(get_after_compute_legalize(dtype, promote_dtype)) @@ -219,8 +217,6 @@ def test_fp8_compute_legalize(dtype, promote_dtype): def test_fp8_storage_legalize(dtype, promote_dtype): - if tvm.contrib.nvcc.have_fp8(tvm.contrib.nvcc.get_target_compute_version()): - return target = Target("cuda") before = BindTarget(target)(get_after_compute_legalize(dtype, promote_dtype)) after = tvm.tir.transform.FP8StorageLegalize()(before) diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh index 6007b68f57..9e195de9bc 100755 --- a/tests/scripts/task_config_build_cpu.sh +++ b/tests/scripts/task_config_build_cpu.sh @@ -49,4 +49,5 @@ echo set\(BACKTRACE_ON_SEGFAULT ON\) >> config.cmake echo set\(USE_CCACHE OFF\) >> config.cmake echo set\(USE_UMA ON\) >> config.cmake echo set\(SUMMARIZE ON\) >> config.cmake -echo set\(USE_MSC ON\) >> config.cmake +# Temporary disable MSC +# echo set\(USE_MSC ON\) >> config.cmake diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh index e3599695a9..e411ee2c5e 100755 --- a/tests/scripts/task_config_build_gpu.sh +++ b/tests/scripts/task_config_build_gpu.sh @@ -47,5 +47,6 @@ echo set\(SUMMARIZE ON\) >> config.cmake echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake echo set\(USE_CUTLASS ON\) >> config.cmake -echo set\(USE_MSC ON\) >> config.cmake +# Temporary disable MSC +# echo set\(USE_MSC ON\) >> config.cmake echo set\(CMAKE_CUDA_ARCHITECTURES 75\) >> config.cmake diff --git a/tests/scripts/task_rust.sh b/tests/scripts/task_rust.sh index f31c703abd..442b9d771e 100755 --- a/tests/scripts/task_rust.sh +++ b/tests/scripts/task_rust.sh @@ -56,60 +56,3 @@ cargo test --features dynamic-linking --tests cd $RUST_DIR/tvm-rt # Build and run the tests. cargo test - -# Next we test the graph executor crate. -cd $RUST_DIR/tvm-graph-rt - -# We first we compile a model using the Python bindings then run the tests. -python3 tests/build_model.py -cargo test --tests - -# Run some more tests involving the graph executor API. -cd tests/test_tvm_basic -cargo run -cd - - -cd tests/test_tvm_dso -cargo run -cd - - -# run wasm32 test -# cd tests/test_wasm32 -# cargo build -# wasmtime $RUST_DIR/target/wasm32-wasi/debug/test-wasm32.wasm -# cd - - -# Disabled, see https://github.com/apache/tvm/issues/11419 -# # run nn graph test -# cd tests/test_nn -# cargo run -# cd - - -# Finally we test the TVM crate which provides both runtime -# and compiler bindings. -cd $RUST_DIR/tvm - -cargo test - -# run basic tests on cpu -cd tests/basics -cargo run --features cpu -# uncomment when have more CI resources -# cargo build --features gpu -# cargo run --features gpu -# fi -cd - - -# TODO(@jroesch): I believe this is no longer true, refactor in follow up PR. -# run callback tests separately: https://discuss.tvm.ai/t/are-global-functions-need-to-be-accessed-in-separate-processes/1075 -cd tests/callback -cargo build -cargo run --bin int -cargo run --bin float -cargo run --bin array -cargo run --bin string -cd - - -cd examples/resnet -cargo run -cd - diff --git a/tests/scripts/task_web_wasm.sh b/tests/scripts/task_web_wasm.sh index 8a08c1ecb5..91bbbac523 100755 --- a/tests/scripts/task_web_wasm.sh +++ b/tests/scripts/task_web_wasm.sh @@ -25,8 +25,9 @@ cd web make clean npm install npm run lint -npm run prepwasm -npm run bundle -npm run test -npm run typedoc +# TODO(@tqchen, @siyuan): re-enable the following tests +# npm run prepwasm +# npm run bundle +# npm run test +# npm run typedoc cd .. diff --git a/tests/scripts/unity/task_python_relax.sh b/tests/scripts/unity/task_python_relax.sh index 28dd78bf6b..688812b35d 100755 --- a/tests/scripts/unity/task_python_relax.sh +++ b/tests/scripts/unity/task_python_relax.sh @@ -38,4 +38,4 @@ TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm}" pytest tests/python/dlight # python3 ./apps/relax_examples/resnet.py # Test for MSC -pytest tests/python/contrib/test_msc +# pytest tests/python/contrib/test_msc
