(tvm) branch main updated: [Runtime] Allow aborting fetchNDArray through AbortSignal (#17208)

2024-07-29 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 2c9af0f500 [Runtime] Allow aborting fetchNDArray through AbortSignal 
(#17208)
2c9af0f500 is described below

commit 2c9af0f500c04383aa7220ab2c9220a608f75cbf
Author: Nestor Qin 
AuthorDate: Mon Jul 29 08:17:55 2024 -0400

[Runtime] Allow aborting fetchNDArray through AbortSignal (#17208)

[Runtime] Allow aborting fetchNDArray
---
 web/src/artifact_cache.ts | 11 ++-
 web/src/runtime.ts| 13 +
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/web/src/artifact_cache.ts b/web/src/artifact_cache.ts
index f833df1be5..9690ed3320 100644
--- a/web/src/artifact_cache.ts
+++ b/web/src/artifact_cache.ts
@@ -58,13 +58,14 @@ export interface ArtifactCacheTemplate {
*
* @param url: The url to the data to be cached.
* @param storetype: Only applies to `ArtifactIndexedDBCache`. Since 
`indexedDB` stores the actual
+   * @param signal: An optional AbortSignal to abort data retrieval
* data rather than a request, we specify `storagetype`. There are two 
options:
* 1. "json": IndexedDB stores `fetch(url).json()`
* 2. "arraybuffer": IndexedDB stores `fetch(url).arrayBuffer()`
*
* @note This is an async function.
*/
-  addToCache(url: string, storetype?: string): Promise<void>;
+  addToCache(url: string, storetype?: string, signal?: AbortSignal): Promise<void>;
 
   /**
* check if cache has all keys in Cache
@@ -126,8 +127,8 @@ export class ArtifactCache implements ArtifactCacheTemplate 
{
   }
 
   // eslint-disable-next-line @typescript-eslint/no-unused-vars
-  async addToCache(url: string, storetype?: string) {
-const request = new Request(url);
+  async addToCache(url: string, storetype?: string, signal?: AbortSignal) {
+const request = new Request(url, signal ? { signal } : undefined);
 if (this.cache === undefined) {
   this.cache = await caches.open(this.scope);
 }
@@ -282,7 +283,7 @@ export class ArtifactIndexedDBCache implements 
ArtifactCacheTemplate {
 });
   }
 
-  async addToCache(url: string, storetype?: string): Promise<void> {
+  async addToCache(url: string, storetype?: string, signal?: AbortSignal): Promise<void> {
 await this.initDB(); // await the initDB process
 // If already cached, nothing to do
 const isInDB = await this.isUrlInDB(url);
@@ -290,7 +291,7 @@ export class ArtifactIndexedDBCache implements 
ArtifactCacheTemplate {
   return;
 }
 try {
-  const response = await fetch(url);
+  const response = await fetch(url, signal ? { signal } : undefined);
   if (!response.ok) {
 throw new Error('Network response was not ok');
   }
diff --git a/web/src/runtime.ts b/web/src/runtime.ts
index fd7bcc6ab2..d71c98e7d1 100644
--- a/web/src/runtime.ts
+++ b/web/src/runtime.ts
@@ -1444,13 +1444,15 @@ export class Instance implements Disposable {
* @param device The device to be fetched to.
* @param cacheScope The scope identifier of the cache
* @param cacheType The type of the cache: "cache" or "indexedDB"
+   * @param signal An optional AbortSignal to abort the fetch
* @returns The meta data
*/
   async fetchNDArrayCache(
 ndarrayCacheUrl: string,
 device: DLDevice,
 cacheScope = "tvmjs",
-cacheType = "cache"
+cacheType = "cache",
+signal?: AbortSignal,
   ): Promise {
 let artifactCache: ArtifactCacheTemplate;
 if (cacheType === undefined || cacheType.toLowerCase() === "cache") {
@@ -1465,7 +1467,8 @@ export class Instance implements Disposable {
 const list = await artifactCache.fetchWithCache(jsonUrl, "json");
 await this.fetchNDArrayCacheInternal(
   ndarrayCacheUrl,
-  list["records"] as Array, device, artifactCache);
+  list["records"] as Array, device, artifactCache,
+  signal);
 this.cacheMetadata = { ...this.cacheMetadata, ...(list["metadata"] as 
Record) };
   }
 
@@ -1477,12 +1480,14 @@ export class Instance implements Disposable {
* @param list The list of array data.
* @param device The device to store the data to.
* @param artifactCache The artifact cache
+   * @param signal An optional AbortSignal to abort the fetch
*/
   private async fetchNDArrayCacheInternal(
 ndarrayCacheUrl: string,
 list: Array,
 device: DLDevice,
-artifactCache: ArtifactCacheTemplate
+artifactCache: ArtifactCacheTemplate,
+signal?: AbortSignal,
   ) {
 const perf = compact.getPerformance();
 const tstart = perf.now();
@@ -1537,7 +1542,7 @@ export class Instance implements Disposable {
 const shard = list[i];
 const dataUrl = new URL(shard.dataPath, ndarrayCacheUrl).href;
 tr

(tvm) branch main updated: [LLVM] Fix for getHostCPUFeatures API change (#17199)

2024-07-26 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new df33d73cec [LLVM] Fix for getHostCPUFeatures API change (#17199)
df33d73cec is described below

commit df33d73ceca1d0c4ba280cfbcce504b232111d4c
Author: Anirudh Sundar Subramaniam 
AuthorDate: Fri Jul 26 19:08:27 2024 +0530

[LLVM] Fix for getHostCPUFeatures API change (#17199)

This patch fixes a minor API change in latest LLVM.
---
 src/target/llvm/codegen_llvm.cc | 11 +++
 1 file changed, 11 insertions(+)
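
For reference, the registered global can also be queried from Python to see what the LLVM backend detects on the host; a minimal sketch (assuming a TVM build with the LLVM backend enabled — the reported features depend on the machine):

```
import tvm

get_features = tvm.get_global_func("tvm.codegen.llvm.GetHostCPUFeatures")
features = get_features()  # Map of feature name -> IntImm(bool), e.g. "avx2"
for name, flag in features.items():
    print(name, bool(flag.value))
```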

diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 6098a3f32f..4c5bea8c9b 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -2315,6 +2315,16 @@ 
TVM_REGISTER_GLOBAL("tvm.codegen.llvm.GetHostCPUName").set_body_typed([]() -> st
 
 TVM_REGISTER_GLOBAL("tvm.codegen.llvm.GetHostCPUFeatures")
 .set_body_typed([]() -> Map<String, IntImm> {
+#if TVM_LLVM_VERSION >= 200
+  Map<String, IntImm> ret;
+  auto features = llvm::sys::getHostCPUFeatures();
+  for (auto it = features.begin(); it != features.end(); ++it) {
+std::string name = it->getKey().str();
+bool value = it->getValue();
+ret.Set(name, IntImm(DataType::Bool(), value));
+  }
+  return ret;
+#else
   llvm::StringMap<bool> features;
   if (llvm::sys::getHostCPUFeatures(features)) {
 Map<String, IntImm> ret;
@@ -2325,6 +2335,7 @@ TVM_REGISTER_GLOBAL("tvm.codegen.llvm.GetHostCPUFeatures")
 }
 return ret;
   }
+#endif
   LOG(WARNING) << "Current version of LLVM does not support feature 
detection on your CPU";
   return {};
 });



(tvm) branch main updated: [Cython][FFI] Fix crash when call del operator for handle (#17190)

2024-07-25 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 08d75197e1 [Cython][FFI] Fix crash when call del operator for handle 
(#17190)
08d75197e1 is described below

commit 08d75197e1033d64cba5da0407a7489759c5dba5
Author: Egor Churaev 
AuthorDate: Thu Jul 25 16:44:55 2024 +0300

[Cython][FFI] Fix crash when call del operator for handle (#17190)

* [Cython][FFI] Fix crash when call del operator for handle

In the case of Cython, when we create a set function for a property, the
following code is generated:
```
static int __pyx_setprop_4test_9TestClass_handle(PyObject *o, PyObject *v, 
CYTHON_UNUSED void *x) {
  if (v) {
return __pyx_pw_4test_9TestClass_6handle_3__set__(o, v);
  }
  else {
PyErr_SetString(PyExc_NotImplementedError, "__del__");
return -1;
  }
}
```

When the `del` operator is called on this handle, the memory is released
and `__set__` is invoked for a NULL object. In that case an exception is
raised saying that operator `__del__` is not implemented. To avoid this
problem we would need to declare a `__del__` function for each property
that defines operator `__set__`.

* Apply comments

* Set dref.handle to None instead of using __del__ functions
---
 python/tvm/runtime/disco/session.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
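
The fix side-steps the missing `__del__` by transferring ownership with an assignment instead of attribute deletion. A plain-Python sketch of the pattern (illustrative only; the real classes live in `python/tvm/runtime/disco/session.py`):

```
class DRef:
    def __init__(self, handle):
        self.handle = handle


class DModule:
    def __init__(self, dref, session):
        # Take over the underlying handle from `dref`.
        self.handle = dref.handle
        # Assign None rather than `del dref.handle`: when `handle` is a Cython
        # property that only implements __set__, `del` routes into the generated
        # setter with a NULL value and raises NotImplementedError("__del__").
        dref.handle = None
        self.session = session
```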

diff --git a/python/tvm/runtime/disco/session.py 
b/python/tvm/runtime/disco/session.py
index 38c4f2a235..89ef549df3 100644
--- a/python/tvm/runtime/disco/session.py
+++ b/python/tvm/runtime/disco/session.py
@@ -92,7 +92,7 @@ class DModule(DRef):
 
 def __init__(self, dref: DRef, session: "Session") -> None:
 self.handle = dref.handle
-del dref.handle
+dref.handle = None
 self.session = session
 
 def __getitem__(self, name: str) -> DPackedFunc:



(tvm) branch main updated: Add support for `torch.nn.functional.max_pool2d` (#17189)

2024-07-24 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new cc8afdb0e3 Add support for `torch.nn.functional.max_pool2d` (#17189)
cc8afdb0e3 is described below

commit cc8afdb0e3be52a3aa162ff14a81b11a793dca6b
Author: Masahiro Hiramori 
AuthorDate: Wed Jul 24 22:36:19 2024 +0900

Add support for `torch.nn.functional.max_pool2d` (#17189)

* add a testcase for call_function

* add maxpool2d to call_function
---
 python/tvm/relax/frontend/torch/fx_translator.py | 1 +
 tests/python/relax/test_frontend_from_fx.py  | 8 
 2 files changed, 9 insertions(+)
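
A small end-to-end sketch of the newly supported path, roughly following the added test (API names from `tvm.relax.frontend.torch`; the input shape is an arbitrary choice for illustration):

```
import torch
from torch import fx, nn
from tvm.relax.frontend.torch import from_fx

class MaxPool2dFunctional(nn.Module):
    def forward(self, x):
        # Uses the functional form, which the importer now maps to relax max_pool2d.
        return torch.nn.functional.max_pool2d(x, kernel_size=[1, 1])

graph_model = fx.symbolic_trace(MaxPool2dFunctional())
mod = from_fx(graph_model, [((1, 3, 10, 10), "float32")])
mod.show()
```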

diff --git a/python/tvm/relax/frontend/torch/fx_translator.py 
b/python/tvm/relax/frontend/torch/fx_translator.py
index e6b39c3eee..093f3ae4cf 100644
--- a/python/tvm/relax/frontend/torch/fx_translator.py
+++ b/python/tvm/relax/frontend/torch/fx_translator.py
@@ -1476,6 +1476,7 @@ class TorchFXImporter:
 "getitem": self._getitem,
 "contiguous": lambda node: self.env[node.args[0]],
 "to": self._to,
+"max_pool2d": self._max_pool2d,
 "avg_pool2d": self._avg_pool2d,
 "adaptive_avg_pool2d": self._adaptive_avg_pool2d(is_module=False),
 "layer_norm": self._layer_norm,
diff --git a/tests/python/relax/test_frontend_from_fx.py 
b/tests/python/relax/test_frontend_from_fx.py
index b4ac3fa60c..1a2cc5da62 100644
--- a/tests/python/relax/test_frontend_from_fx.py
+++ b/tests/python/relax/test_frontend_from_fx.py
@@ -796,6 +796,13 @@ def test_maxpool2d():
 def forward(self, input):
 return self.pool(input)
 
+class MaxPool2d_functional(Module):
+def __init__(self):
+super().__init__()
+
+def forward(self, input):
+return torch.nn.functional.max_pool2d(input, kernel_size=[1, 1])
+
 @tvm.script.ir_module
 class expected1:
 @R.function
@@ -876,6 +883,7 @@ def test_maxpool2d():
 return gv
 
 verify_model(MaxPool2d(), input_info, {}, expected1)
+verify_model(MaxPool2d_functional(), input_info, {}, expected1)
 verify_model(MaxPool2d2(), input_info, {}, expected2)
 verify_model(MaxPool2d3(), input_info, {}, expected3)
 



(tvm) branch main updated: [TIR][Analyzer] Simplify `x==x` expressions for all dtypes (#17158)

2024-07-24 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 9f0f301c6f [TIR][Analyzer] Simplify `x==x` expressions for all dtypes 
(#17158)
9f0f301c6f is described below

commit 9f0f301c6f6de7548c6b2026bcb51590e0881ac5
Author: Eric Lunderberg 
AuthorDate: Wed Jul 24 08:24:15 2024 -0500

[TIR][Analyzer] Simplify `x==x` expressions for all dtypes (#17158)

* [TIR][Analyzer] Simplify `x==x` expressions for all dtypes

Prior to this commit, there was no rule to simplify `x == x` into
`True`.  In some cases, despite not having an explicit rewrite rule in
`RewriteSimplifier`, the `RewriteSimplifier::CanProve` function would
check if `x-x` simplifies to zero, relying on the rewrite rules used
for `tir::Sub`.  However, the rule to rewrite `x-x` into zero was only
enabled for `int32`, `int64`, and floating-point types, so relying on
this behavior was inconsistent.

This commit updates the rewrite rules for both `tir::EQ` and
`tir::Sub` to check for simplification of `x-x` or `x==x`, regardless
of the datatype.  This change preserves the fast-path for index
data-types, in which `int32` and `int64` expressions may be simplified
without checking for side effects.  For all other dtypes, the
cancellation only applies when evaluating `x` has no side effects.

* Add comment about simplifications of NaN/Inf
---
 src/arith/rewrite_simplify.cc | 21 -
 tests/python/arith/test_arith_rewrite_simplify.py | 36 +++
 tests/python/arith/test_arith_simplify.py | 29 ++
 3 files changed, 85 insertions(+), 1 deletion(-)
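
After this change the cancellation applies to non-index dtypes as well, provided evaluating `x` has no side effects. A small sketch using the Python `arith.Analyzer` bindings (the exact printed form depends on the TVM version):

```
import tvm
from tvm import tir

analyzer = tvm.arith.Analyzer()
x = tir.Var("x", "float16")
expr = tir.EQ(x, x)

# A Var has no side effects, so x == x now folds to True even for float16.
print(analyzer.rewrite_simplify(expr))
print(analyzer.can_prove(expr))
```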

diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index f4d4a9048c..3682054e8e 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -543,6 +543,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const SubNode* 
op) {
   PVar<IntImm> c1, c2, c3;
   // Pattern var for lanes in broadcast and ramp
   PVar<PrimExpr> lanes;
+
   // Vector rules
   if (op->dtype.is_scalable_or_fixed_length_vector()) {
 TVM_TRY_REWRITE(ramp(b1, s1, lanes) - ramp(b2, s2, lanes), ramp(b1 - b2, 
s1 - s2, lanes));
@@ -697,9 +698,15 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const 
SubNode* op) {
 TVM_TRY_RECURSIVE_REWRITE(x - (y + c1), (x - y) + (0 - c1));
 TVM_TRY_RECURSIVE_REWRITE(x - (y - z), (x + z) - y);
 TVM_TRY_RECURSIVE_REWRITE(x - y * c1, x + y * (0 - c1));
-  } else if (op->dtype.is_float()) {
+  } else {
 // Cancellation rules.  Deliberately off of the integer path, to
 // avoid introducing checks on the side effects for the fast path.
+//
+// These simplifications do not preserve NaN/Inf that may occur in
+// the inputs.  For IEEE floats, `NaN - NaN` is `NaN`, and does
+// not cancel out.  However, since models should not encounter NaN
+// in the first place, this allows better simplification for the
+// supported path.
 TVM_TRY_REWRITE_IF(x - x, ZeroWithTypeLike(x),
SideEffect(x.Eval()) <= CallEffectKind::kReadState);
 TVM_TRY_REWRITE_IF((x + y) - y, x, SideEffect(y.Eval()) <= 
CallEffectKind::kReadState);
@@ -1678,6 +1685,7 @@ PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(EQ 
ret) {
   // Pattern var match IntImm
   PVar<IntImm> c1, c2;
   PVar<PrimExpr> lanes;
+  PConst<PrimExpr> ctrue(make_const(ret->dtype, true));
 
   // vector rule
   if (ret->dtype.is_scalable_or_fixed_length_vector()) {
@@ -1698,6 +1706,17 @@ PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(EQ 
ret) {
 TVM_TRY_REWRITE(c1 - x == c2, x == c1 - c2);
 TVM_TRY_REWRITE(x + c1 == c2, x == c2 - c1);
 TVM_TRY_RECURSIVE_REWRITE(x * y == 0, x == 0 || y == 0);
+TVM_TRY_REWRITE(x == x, ctrue);
+  } else {
+// Mimic the cancellation rules for SubNode.  For Index datatypes,
+// we skip the check for side effects.
+//
+// These simplifications do not preserve NaN/Inf that may occur in
+// the inputs.  For IEEE floats, `NaN - NaN` is `NaN`, and does
+// not cancel out.  However, since models should not encounter NaN
+// in the first place, this allows better simplification for the
+// supported path.
+TVM_TRY_REWRITE_IF(x == x, ctrue, SideEffect(x.Eval()) <= 
CallEffectKind::kReadState);
   }
   return std::move(ret);
 }
diff --git a/tests/python/arith/test_arith_rewrite_simplify.py 
b/tests/python/arith/test_arith_rewrite_simplify.py
index 1ebaab53af..90f0aeef47 100644
--- a/tests/python/arith/test_arith_rewrite_simplify.py
+++ b/tests/python/arith/test_arith_rewrite_simplify.py
@@ -321,6 +321,42 @@ class TestSelect(BaseCompare):
 )
 
 
+class TestCancellation(BaseCompare):
+var_int8 = tir.Var("var_int8", "int8")
+var_int32 = tir.Var("

(tvm) branch main updated: [Disco] Cross-group and p2p send/receive primitives (#17191)

2024-07-24 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new ae1be53d6d [Disco] Cross-group and p2p send/receive primitives (#17191)
ae1be53d6d is described below

commit ae1be53d6dc08ad8a95ddf6af022880e836e8704
Author: Ruihang Lai 
AuthorDate: Wed Jul 24 08:03:21 2024 -0400

[Disco] Cross-group and p2p send/receive primitives (#17191)

This PR introduces the disco CCL primitives for cross-group
and p2p communication.

Specifically, we introduce the send/receive primitives for one group
to send a buffer to its next group, where every worker in the first
group sends the buffer to the corresponding worker in the second
group. The p2p communication refers to the send/receive operations
to/from a target global worker.
---
 include/tvm/runtime/disco/builtin.h  | 24 ++
 python/tvm/relax/frontend/nn/core.py |  6 +--
 src/runtime/disco/builtin.cc | 16 +++
 src/runtime/disco/nccl/nccl.cc   | 86 
 tests/python/disco/test_ccl.py   | 40 -
 5 files changed, 168 insertions(+), 4 deletions(-)
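
To make the pairing concrete: with `num_workers = num_groups * group_size`, worker `i` of group `g` exchanges with worker `i` of group `g + 1`. A tiny illustration of that index arithmetic (not TVM API, just the mapping described above):

```
def next_group_peer(worker_id: int, group_size: int, num_groups: int) -> int:
    """Global id of the corresponding worker in the next group."""
    group = worker_id // group_size
    if group + 1 >= num_groups:
        raise ValueError("worker is already in the last group")
    return worker_id + group_size

# 2 groups of 4 workers: worker 1 (group 0) pairs with worker 5 (group 1).
print(next_group_peer(1, group_size=4, num_groups=2))  # 5
```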

diff --git a/include/tvm/runtime/disco/builtin.h 
b/include/tvm/runtime/disco/builtin.h
index 7d15e35fbd..4453d9737f 100644
--- a/include/tvm/runtime/disco/builtin.h
+++ b/include/tvm/runtime/disco/builtin.h
@@ -114,6 +114,30 @@ TVM_DLL void GatherToWorker0(NDArray send, bool in_group, 
Optional recv
  * \param buffer The buffer to be received
  */
 TVM_DLL void RecvFromWorker0(NDArray buffer);
+/*!
+ * \brief Send a buffer to the corresponding worker in the next group.
+ * An error is thrown if the worker is already in the last group.
+ * \param buffer The sending buffer.
+ */
+TVM_DLL void SendToNextGroup(NDArray buffer);
+/*!
+ * \brief Receive a buffer from the corresponding worker in the previous group.
+ * An error is thrown if the worker is already in the first group.
+ * \param buffer The receiving buffer.
+ */
+TVM_DLL void RecvFromPrevGroup(NDArray buffer);
+/*!
+ * \brief Send a buffer to the target receiver worker (globally across all 
groups).
+ * \param buffer The sending buffer.
+ * \param receiver_id The global receiver worker id.
+ */
+TVM_DLL void SendToWorker(NDArray buffer, int receiver_id);
+/*!
+ * \brief Receive a buffer from the target sender worker (globally across all 
groups).
+ * \param buffer The receiving buffer.
+ * \param sender_id The global sender worker id.
+ */
+TVM_DLL void RecvFromWorker(NDArray buffer, int sender_id);
 /*! \brief Get the local worker id */
 TVM_DLL int WorkerId();
 /*!
diff --git a/python/tvm/relax/frontend/nn/core.py 
b/python/tvm/relax/frontend/nn/core.py
index 46e016a242..3511c38a2b 100644
--- a/python/tvm/relax/frontend/nn/core.py
+++ b/python/tvm/relax/frontend/nn/core.py
@@ -549,16 +549,16 @@ class ModuleList(Module):
 def __iter__(self):
 return iter(self.modules)
 
-def __getitem__(self, idx):
+def __getitem__(self, idx: int) -> Module:
 return self.modules[idx]
 
-def __setitem__(self, idx, module):
+def __setitem__(self, idx: int, module: Module) -> None:
 self.modules[idx] = module
 
 def __len__(self):
 return len(self.modules)
 
-def append(self, module):
+def append(self, module: Module):
 """Add a module to the end of the ModuleList"""
 self.modules.append(module)
 
diff --git a/src/runtime/disco/builtin.cc b/src/runtime/disco/builtin.cc
index 0cb2ee6f5d..760a330a7a 100644
--- a/src/runtime/disco/builtin.cc
+++ b/src/runtime/disco/builtin.cc
@@ -101,6 +101,18 @@ void GatherToWorker0(NDArray send, bool in_group, 
Optional recv) {
 
 void RecvFromWorker0(NDArray buffer) { 
GetCCLFunc("recv_from_worker0")(buffer); }
 
+void SendToNextGroup(NDArray buffer) { 
GetCCLFunc("send_to_next_group")(buffer); }
+
+void RecvFromPrevGroup(NDArray buffer) { 
GetCCLFunc("recv_from_prev_group")(buffer); }
+
+void SendToWorker(NDArray buffer, int receiver_id) {
+  GetCCLFunc("send_to_worker")(buffer, receiver_id);
+}
+
+void RecvFromWorker(NDArray buffer, int sender_id) {
+  GetCCLFunc("recv_from_worker")(buffer, sender_id);
+}
+
 int WorkerId() { return DiscoWorker::ThreadLocal()->worker_id; }
 
 void SyncWorker() {
@@ -136,6 +148,10 @@ 
TVM_REGISTER_GLOBAL("runtime.disco.broadcast_from_worker0").set_body_typed(Broad
 
TVM_REGISTER_GLOBAL("runtime.disco.scatter_from_worker0").set_body_typed(ScatterFromWorker0);
 
TVM_REGISTER_GLOBAL("runtime.disco.gather_to_worker0").set_body_typed(GatherToWorker0);
 
TVM_REGISTER_GLOBAL("runtime.disco.recv_from_worker0").set_body_typed(RecvFromWorker0);
+TVM_REGISTER_GLOBAL("runtime.disco.send_to_next_group").set_body

(tvm) branch main updated: Remove and replace deprecated `distutils.util.strtobool()` (#17185)

2024-07-23 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 7c9969bbdf Remove and replace deprecated `distutils.util.strtobool()` 
(#17185)
7c9969bbdf is described below

commit 7c9969bbdfc7f032f270f9f75eeb53bf6e78ff7b
Author: Masahiro Hiramori 
AuthorDate: Wed Jul 24 00:33:06 2024 +0900

Remove and replace deprecated `distutils.util.strtobool()` (#17185)

remove and replace deprecated distutils.util.strtobool
---
 python/tvm/auto_scheduler/testing/tune_onnx.py|  2 +-
 python/tvm/auto_scheduler/testing/tune_relay.py   |  2 +-
 python/tvm/auto_scheduler/testing/tune_te.py  |  2 +-
 python/tvm/autotvm/testing/tune_relay.py  |  2 +-
 python/tvm/meta_schedule/testing/tune_onnx.py |  2 +-
 python/tvm/meta_schedule/testing/tune_relay.py|  2 +-
 python/tvm/meta_schedule/testing/tune_te.py   |  2 +-
 python/tvm/meta_schedule/testing/validate_database.py |  2 +-
 python/tvm/testing/utils.py   | 15 +++
 9 files changed, 23 insertions(+), 8 deletions(-)
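
The replacement helper lives in `python/tvm/testing/utils.py` (its body falls outside the excerpt below). A minimal sketch of a drop-in with the same truth table as the old `distutils.util.strtobool`, given as an approximation rather than the exact code added by the patch:

```
def strtobool(value: str) -> bool:
    """Accept the same spellings as distutils.util.strtobool, returning a real bool."""
    value = value.strip().lower()
    if value in ("y", "yes", "t", "true", "on", "1"):
        return True
    if value in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError(f"invalid truth value {value!r}")

# Typical argparse usage in the tuning scripts touched by this patch:
# parser.add_argument("--adaptive-training", type=strtobool, default="True")
```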

diff --git a/python/tvm/auto_scheduler/testing/tune_onnx.py 
b/python/tvm/auto_scheduler/testing/tune_onnx.py
index a3299c05bb..334b5d6726 100644
--- a/python/tvm/auto_scheduler/testing/tune_onnx.py
+++ b/python/tvm/auto_scheduler/testing/tune_onnx.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-docstring
-from distutils.util import strtobool
 import argparse
 import json
 import os
@@ -30,6 +29,7 @@ from tvm.meta_schedule.testing.tune_utils import 
generate_input_data, create_tim
 from tvm.meta_schedule.utils import cpu_count
 from tvm.relay.frontend import from_onnx
 from tvm.support import describe
+from tvm.testing.utils import strtobool
 
 
 def _parse_args():
diff --git a/python/tvm/auto_scheduler/testing/tune_relay.py 
b/python/tvm/auto_scheduler/testing/tune_relay.py
index 9773fbbc65..babec2cf50 100644
--- a/python/tvm/auto_scheduler/testing/tune_relay.py
+++ b/python/tvm/auto_scheduler/testing/tune_relay.py
@@ -18,7 +18,6 @@
 import argparse
 import json
 import os
-from distutils.util import strtobool
 
 import tvm
 from tvm import auto_scheduler
@@ -29,6 +28,7 @@ from tvm.meta_schedule.testing.relay_workload import 
get_network
 from tvm.meta_schedule.testing.tune_utils import create_timer, 
generate_input_data
 from tvm.meta_schedule.utils import cpu_count
 from tvm.support import describe
+from tvm.testing.utils import strtobool
 
 
 def _parse_args():
diff --git a/python/tvm/auto_scheduler/testing/tune_te.py 
b/python/tvm/auto_scheduler/testing/tune_te.py
index da3584512d..9452d88a4e 100644
--- a/python/tvm/auto_scheduler/testing/tune_te.py
+++ b/python/tvm/auto_scheduler/testing/tune_te.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-docstring
-from distutils.util import strtobool
 import argparse
 import os
 
@@ -25,6 +24,7 @@ from tvm import meta_schedule as ms
 from tvm.meta_schedule.testing.te_workload import CONFIGS
 from tvm.meta_schedule.utils import cpu_count
 from tvm.support import describe
+from tvm.testing.utils import strtobool
 
 
 def _parse_args():
diff --git a/python/tvm/autotvm/testing/tune_relay.py 
b/python/tvm/autotvm/testing/tune_relay.py
index 96e42fbea0..916b2a800b 100644
--- a/python/tvm/autotvm/testing/tune_relay.py
+++ b/python/tvm/autotvm/testing/tune_relay.py
@@ -19,7 +19,6 @@ import argparse
 import json
 import os
 import warnings
-from distutils.util import strtobool
 
 import tvm
 from tvm import autotvm
@@ -31,6 +30,7 @@ from tvm.meta_schedule.testing.custom_builder_runner import 
run_module_via_rpc
 from tvm.meta_schedule.testing.relay_workload import get_network
 from tvm.meta_schedule.testing.tune_utils import create_timer, 
generate_input_data
 from tvm.support import describe
+from tvm.testing.utils import strtobool
 
 
 def _parse_args():
diff --git a/python/tvm/meta_schedule/testing/tune_onnx.py 
b/python/tvm/meta_schedule/testing/tune_onnx.py
index a7c177afdc..2100f0e7c9 100644
--- a/python/tvm/meta_schedule/testing/tune_onnx.py
+++ b/python/tvm/meta_schedule/testing/tune_onnx.py
@@ -18,7 +18,6 @@
 import argparse
 import json
 import logging
-from distutils.util import strtobool
 
 import onnx  # type: ignore
 import tvm
@@ -26,6 +25,7 @@ from tvm import meta_schedule as ms
 from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
 from tvm.relay.frontend import from_onnx
 from tvm.support import describe
+from tvm.testing.utils import strtobool
 
 from .tune_utils import create_timer, generate_input_data
 
diff --git a/python/tvm/meta_schedule/testing/tune_relay.py 
b/python/tvm/meta_schedule/testing/tune_relay.py
index de1668c1dd..98eddf793f

(tvm) branch main updated: [DLIGHT][GPU] Add OpenCL dequant matmul schedule (#17187)

2024-07-23 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 50d1c97dc9 [DLIGHT][GPU] Add OpenCL dequant matmul schedule (#17187)
50d1c97dc9 is described below

commit 50d1c97dc982c6ddfe089852d1fbbac3ea629851
Author: krishnaraj36 
AuthorDate: Tue Jul 23 20:57:53 2024 +0530

[DLIGHT][GPU] Add OpenCL dequant matmul schedule (#17187)

* [DLIGHT][GPU] Add OpenCL dequant matmul schedule

1. Enhanced the GPU matmul schedule for the OpenCL Android and Windows backends.
2. It gives roughly a 2X performance gain for the Llama-2-7B prefill process:
Model               Device               Earlier prefill perf   Optimized prefill perf
Llama-2-7B-chat-hf  Snapdragon® 8 Gen 3  27 tok/sec             50 tok/sec

* Update matmul.py
---
 python/tvm/dlight/gpu/matmul.py| 144 +++--
 tests/python/dlight/test_gpu_matmul.py | 192 +++--
 2 files changed, 292 insertions(+), 44 deletions(-)
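
A sketch of how the rule is typically applied. The toy fp16 workload and the Android host triple below are assumptions for illustration; the OpenCL-specific config in the diff is picked when the target host string contains "android" or "windows":

```
import tvm
from tvm import te, dlight as dl

# A toy fp16 matmul PrimFunc to schedule.
n = 1024
A = te.placeholder((n, n), "float16", name="A")
B = te.placeholder((n, n), "float16", name="B")
k = te.reduce_axis((0, n), "k")
C = te.compute((n, n), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
mod = tvm.IRModule({"main": te.create_prim_func([A, B, C])})

target = tvm.target.Target("opencl", host="llvm -mtriple=aarch64-linux-android")
with target:
    mod = dl.ApplyDefaultSchedule(dl.gpu.Matmul())(mod)
```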

diff --git a/python/tvm/dlight/gpu/matmul.py b/python/tvm/dlight/gpu/matmul.py
index a5759941ca..25cc649b44 100644
--- a/python/tvm/dlight/gpu/matmul.py
+++ b/python/tvm/dlight/gpu/matmul.py
@@ -27,7 +27,7 @@ from tvm.tir import IterVar, PrimExpr, Var
 from tvm.tir.analysis import undefined_vars
 from tvm.tir.schedule.schedule import BlockRV
 
-from ..base import analysis
+from ..base import analysis, BlockInfo, IterInfo
 from .base import GPUScheduleRule
 
 
@@ -273,6 +273,32 @@ def get_index_map(block: tir.Block) -> 
Optional[Tuple[tir.IndexMap, ...]]:
 )
 
 
+def get_block_info(sch: tir.Schedule, block: tir.schedule.BlockRV) -> 
BlockInfo:
+def _iter_kind(loop: tir.IterVar) -> str:
+return {tir.IterVar.DataPar: "S", tir.IterVar.CommReduce: 
"R"}.get(loop.iter_type, "O")
+
+def _is_reduction_block(block: tir.schedule.BlockRV):
+for iter_var in sch.get(block).iter_vars:
+if _iter_kind(iter_var) == "R":
+return True
+return False
+
+return BlockInfo(
+name=sch.get(block).name_hint,
+iters=[
+IterInfo(
+kind=_iter_kind(iter_var),
+var=iter_var.var,
+dom=iter_var.dom.extent,
+loop_rv=loop_rv,
+)
+for loop_rv, iter_var in zip(sch.get_loops(block), 
sch.get(block).iter_vars)
+],
+block_rv=block,
+reduction_block=_is_reduction_block(block),
+)
+
+
 def get_reduction_blocks(sch, blocks) -> bool:
 # Get the main computation block
 def is_reduction(block: BlockRV) -> bool:
@@ -914,17 +940,19 @@ class Matmul(GPUScheduleRule):
 storage_align=True,
 inner_x=False,
 )
-elif target.kind.name == "opencl" and "android" in str(target.host):
+elif target.kind.name == "opencl" and (
+("android" in str(target.host)) or ("windows" in str(target.host))
+):
 return Matmul.Config(
-block_size_x=8,
-block_size_y=16,
+block_size_x=32,
+block_size_y=8,
 vthread_x=1,
 vthread_y=1,
 micro_size_x=8,
 micro_size_y=2,
 micro_size_k=16,
 vector_size=8,
-unroll=64,
+unroll=4,
 use_shared=False,
 storage_align=False,
 inner_x=True,
@@ -941,6 +969,7 @@ class Matmul(GPUScheduleRule):
 if not isinstance(func, tir.PrimFunc) or not 
self.is_target_available(target):
 return None
 sch = tir.Schedule(func)
+config = self.get_configs(target)
 root_block = analysis.get_root_block(sch)
 blocks = sch.get_child_blocks(root_block)
 
@@ -953,9 +982,22 @@ class Matmul(GPUScheduleRule):
 index_maps = get_index_map(block_stmt)
 if index_maps is None:
 return None
-matmul_index_map, a_index_map, b_index_map, c_index_map = index_maps
+
+main_block_info = get_block_info(sch, main_block)
+iter_infos = main_block_info.iters
+
+# Checks if it's an inner reduction by getting the last matrix's inner index
+def is_inner_reduction(block_stmt, iter_infos):
+end_it = block_stmt.reads[-1].region[-1].min
+return {it.var: it.kind for it in iter_infos}.get(end_it, "O") == 
"R"
+
+if target.kind.name == "opencl" and not is_inner_reduction(block_stmt, 
iter_infos):
+ret = self.sch_outer_reduction(sch, config, main_block, blocks)
+if ret is not None:
+ 

(tvm) branch main updated: [MetaSchedule] Replace `xgboost.rabit` with `xgboost.collective` because it's deprecated (#17166)

2024-07-23 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new e647684775 [MetaSchedule] Replace `xgboost.rabit` with 
`xgboost.collective` because it's deprecated (#17166)
e647684775 is described below

commit e6476847753c80e054719ac47bc2091c888418b6
Author: Masahiro Hiramori 
AuthorDate: Tue Jul 23 21:39:48 2024 +0900

[MetaSchedule] Replace `xgboost.rabit` with `xgboost.collective` because 
it's deprecated (#17166)

* use collective instead of rabit

* can work with xgb==1.4.2 in CI
---
 python/tvm/meta_schedule/cost_model/xgb_model.py | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/tvm/meta_schedule/cost_model/xgb_model.py 
b/python/tvm/meta_schedule/cost_model/xgb_model.py
index 6b6b7a2dc1..aaee58fc94 100644
--- a/python/tvm/meta_schedule/cost_model/xgb_model.py
+++ b/python/tvm/meta_schedule/cost_model/xgb_model.py
@@ -755,7 +755,12 @@ def _get_custom_call_back(
 raise ValueError("wrong metric value", value)
 
 import xgboost as xgb
-from xgboost import rabit  # type: ignore
+
+# make it compatible with xgboost<1.7
+try:
+from xgboost import rabit as collective  # type: ignore
+except ImportError:
+from xgboost import collective  # type: ignore
 
 try:
 from xgboost.training import aggcv  # type: ignore
@@ -841,7 +846,7 @@ def _get_custom_call_back(
 elif epoch - best_iteration >= self.early_stopping_rounds:
 best_msg = self.state["best_msg"]
 
-if self.verbose_eval and rabit.get_rank() == 0:
+if self.verbose_eval and collective.get_rank() == 0:
 logger.debug("XGB stopped. Best iteration: %s ", best_msg)
 # instead of raising EarlyStopException, returning True to end 
the training
 return True



(tvm) branch main updated: Add `packaging` to `python/gen_requirements.py` (#17188)

2024-07-22 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 432f305ce1 Add `packaging` to `python/gen_requirements.py` (#17188)
432f305ce1 is described below

commit 432f305ce188f9a679965fb32d1141f92d25b8d0
Author: Masahiro Hiramori 
AuthorDate: Tue Jul 23 08:13:57 2024 +0900

Add `packaging` to `python/gen_requirements.py` (#17188)

add packaging as a base dependency
---
 python/gen_requirements.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/gen_requirements.py b/python/gen_requirements.py
index 0c8200f60b..5919d2a9c7 100644
--- a/python/gen_requirements.py
+++ b/python/gen_requirements.py
@@ -68,6 +68,7 @@ REQUIREMENTS_BY_PIECE: RequirementsByPieceType = [
 "decorator",
 "ml_dtypes",
 "numpy",
+"packaging",
 "psutil",
 "scipy",
 "tornado",



(tvm) branch main updated: [Hexagon] [CMake] Fix v66 build issue (#17169)

2024-07-22 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 9b09984636 [Hexagon] [CMake] Fix v66 build issue (#17169)
9b09984636 is described below

commit 9b0998463698c34906bcbc431e43adc4eed70759
Author: Anirudh Sundar Subramaniam 
AuthorDate: Tue Jul 23 04:43:43 2024 +0530

[Hexagon] [CMake] Fix v66 build issue (#17169)

This patch fixes the issue mentioned in 
[#17163](https://github.com/apache/tvm/issues/17163)
---
 apps/hexagon_api/CMakeLists.txt |  7 ++-
 cmake/modules/Hexagon.cmake | 44 +++--
 2 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/apps/hexagon_api/CMakeLists.txt b/apps/hexagon_api/CMakeLists.txt
index 3b5300ac55..f7144835db 100644
--- a/apps/hexagon_api/CMakeLists.txt
+++ b/apps/hexagon_api/CMakeLists.txt
@@ -114,6 +114,11 @@ if(DEFINED USE_HEXAGON_GTEST)
   set(GTEST_FLAG "-DUSE_HEXAGON_GTEST=${USE_HEXAGON_GTEST}")
 endif()
 
+if(NOT DEFINED USE_HEXAGON_QHL)
+  # USE_HEXAGON_QHL defaults to ON for rpc runtime if not explicitly set to OFF
+  set(USE_HEXAGON_QHL ON)
+endif()
+
 ExternalProject_Add(hexagon_tvm_runtime_rpc
   SOURCE_DIR "${TVM_SOURCE_DIR}"
   BUILD_COMMAND $(MAKE) runtime hexagon_rpc_sim
@@ -135,7 +140,7 @@ ExternalProject_Add(hexagon_tvm_runtime_rpc
 "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
 "-DUSE_ALTERNATIVE_LINKER=OFF"
 "-DUSE_CUSTOM_LOGGING=ON"
-"-DUSE_HEXAGON_QHL=ON"
+"-DUSE_HEXAGON_QHL=${USE_HEXAGON_QHL}"
 "-DUSE_RANDOM=ON"
 "${GTEST_FLAG}"
   INSTALL_COMMAND ""
diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index 21a909e315..75b0094ed6 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -134,11 +134,22 @@ else()
   )
 endif()
 
+set(htp_supported_archs "v68" "v69" "v73" "v75")
+list(FIND htp_supported_archs "${USE_HEXAGON_ARCH}" supported_arch_index)
+if(${supported_arch_index} EQUAL -1)
+  # Exclude User DMA files when building for archs below v68
+  list(REMOVE_ITEM RUNTIME_HEXAGON_SRCS 
"${TVMRT_SOURCE_DIR}/hexagon/hexagon_user_dma.cc")
+endif()
+
 if(BUILD_FOR_HEXAGON)
   if(DEFINED USE_HEXAGON_GTEST AND EXISTS ${USE_HEXAGON_GTEST})
 file_glob_append(RUNTIME_HEXAGON_SRCS
   "${CMAKE_SOURCE_DIR}/tests/cpp-runtime/hexagon/*.cc"
 )
+if(${supported_arch_index} EQUAL -1)
+  # Exclude User DMA files when building for archs below v68
+  list(REMOVE_ITEM RUNTIME_HEXAGON_SRCS 
"${TVMRT_SOURCE_DIR}/hexagon/hexagon_user_dma_tests.cc")
+endif()
   endif()
   get_hexagon_sdk_property("${USE_HEXAGON_SDK}" "${USE_HEXAGON_ARCH}"
 SDK_INCLUDE   SDK_INCLUDE_DIRS
@@ -176,24 +187,27 @@ if(BUILD_FOR_HEXAGON)
 
   endif()
 
-  # Hand-written ops
-  file_glob_append(RUNTIME_HEXAGON_SRCS
-"${TVMRT_SOURCE_DIR}/hexagon/ops/*.cc"
-  )
+  # Exclude HVX implementation files when building for archs below v68
+  if(${supported_arch_index} GREATER -1)
+# Hand-written ops
+file_glob_append(RUNTIME_HEXAGON_SRCS
+  "${TVMRT_SOURCE_DIR}/hexagon/ops/*.cc"
+)
 
-  include_directories(
-"${TVMRT_SOURCE_DIR}/hexagon/ops"
-  )
+include_directories(
+  "${TVMRT_SOURCE_DIR}/hexagon/ops"
+)
 
-  set_source_files_properties(
-"${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_quant_hvx.cc"
-PROPERTIES COMPILE_FLAGS "-mhvx"
-  )
+set_source_files_properties(
+  "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_quant_hvx.cc"
+  PROPERTIES COMPILE_FLAGS "-mhvx"
+)
 
-  set_source_files_properties(
-"${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_fp16_hvx.cc"
-PROPERTIES COMPILE_FLAGS "-mhvx"
-  )
+set_source_files_properties(
+  "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_fp16_hvx.cc"
+  PROPERTIES COMPILE_FLAGS "-mhvx"
+)
+  endif()
 
   # Include hexagon external library runtime sources
   if(USE_HEXAGON_EXTERNAL_LIBS)



(tvm) branch main updated: [FFI] Add python signal handler for ctypes FFI (#17181)

2024-07-22 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 91e9c63b42 [FFI] Add python signal handler for ctypes FFI (#17181)
91e9c63b42 is described below

commit 91e9c63b42fcccec196a8ef9ed7a7bc7f82c2e52
Author: Wuwei Lin 
AuthorDate: Mon Jul 22 16:12:53 2024 -0700

[FFI] Add python signal handler for ctypes FFI (#17181)
---
 python/tvm/_ffi/_ctypes/packed_func.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/tvm/_ffi/_ctypes/packed_func.py 
b/python/tvm/_ffi/_ctypes/packed_func.py
index 6465e0335d..5f3aa04914 100644
--- a/python/tvm/_ffi/_ctypes/packed_func.py
+++ b/python/tvm/_ffi/_ctypes/packed_func.py
@@ -195,6 +195,7 @@ class PackedFuncBase(object):
 """Function base."""
 
 __slots__ = ["handle", "is_global"]
+
 # pylint: disable=no-member
 def __init__(self, handle, is_global):
 """Initialize the function with handle
@@ -342,6 +343,7 @@ def _init_pythonapi_inc_def_ref():
 register_func(c_str("Py_DecRef"), ctypes.pythonapi.Py_DecRef)
 register_func(c_str("PyGILState_Ensure"), 
ctypes.pythonapi.PyGILState_Ensure)
 register_func(c_str("PyGILState_Release"), 
ctypes.pythonapi.PyGILState_Release)
+register_func(c_str("PyErr_CheckSignals"), 
ctypes.pythonapi.PyErr_CheckSignals)
 
 
 _init_pythonapi_inc_def_ref()



(tvm) branch main updated: Use `packaging.version.parse` instead of `distutils.version.LooseVersion` (#17173)

2024-07-19 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 3c7adfb1f7 Use `packaging.version.parse` instead of 
`distutils.version.LooseVersion` (#17173)
3c7adfb1f7 is described below

commit 3c7adfb1f7015078903ba53cc5317ead1b4f5f32
Author: Masahiro Hiramori 
AuthorDate: Sat Jul 20 04:00:01 2024 +0900

Use `packaging.version.parse` instead of `distutils.version.LooseVersion` 
(#17173)

use `packaging.version.parse` instead of `distutils.version.LooseVersion`
---
 python/tvm/contrib/msc/core/utils/info.py |  6 +++---
 python/tvm/relay/frontend/pytorch_utils.py|  4 ++--
 python/tvm/relay/op/contrib/ethosn.py |  6 +++---
 python/tvm/relay/testing/tflite.py|  4 ++--
 .../contrib/test_arm_compute_lib/test_network.py  |  4 ++--
 tests/python/frontend/tensorflow/test_forward.py  |  9 -
 tests/python/frontend/tflite/test_forward.py  | 19 +--
 7 files changed, 25 insertions(+), 27 deletions(-)
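
For reference, a short sketch of the two patterns this patch converges on (pure `packaging` usage, no TVM dependency):

```
from packaging.version import parse

# Comparison, as in is_version_greater_than(): strip any "+cu..." local suffix first.
torch_ver = "2.1.0+cu121".split("+cu")[0]
print(parse(torch_ver) > parse("1.10"))    # True

# Component list, as in msc's get_version(): [major, minor, micro] replaces
# the old LooseVersion(raw_version).version.
v = parse("3.2.0")
print([v.major, v.minor, v.micro])         # [3, 2, 0]
```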

diff --git a/python/tvm/contrib/msc/core/utils/info.py 
b/python/tvm/contrib/msc/core/utils/info.py
index 4fea45f8fa..58b0811279 100644
--- a/python/tvm/contrib/msc/core/utils/info.py
+++ b/python/tvm/contrib/msc/core/utils/info.py
@@ -17,7 +17,7 @@
 """tvm.contrib.msc.core.utils.info"""
 
 from typing import List, Tuple, Dict, Any, Union
-from distutils.version import LooseVersion
+from packaging.version import parse
 import numpy as np
 
 import tvm
@@ -409,8 +409,8 @@ def get_version(framework: str) -> List[int]:
 raw_version = "1.0.0"
 except:  # pylint: disable=bare-except
 raw_version = "1.0.0"
-raw_version = raw_version or "1.0.0"
-return LooseVersion(raw_version).version
+version = parse(raw_version or "1.0.0")
+return [version.major, version.minor, version.micro]
 
 
 def compare_version(given_version: List[int], target_version: List[int]) -> 
int:
diff --git a/python/tvm/relay/frontend/pytorch_utils.py 
b/python/tvm/relay/frontend/pytorch_utils.py
index 7de1248bda..8686be4b1e 100644
--- a/python/tvm/relay/frontend/pytorch_utils.py
+++ b/python/tvm/relay/frontend/pytorch_utils.py
@@ -36,7 +36,7 @@ def is_version_greater_than(ver):
 than the one given as an argument.
 """
 import torch
-from distutils.version import LooseVersion
+from packaging.version import parse
 
 torch_ver = torch.__version__
 # PT version numbers can include +cu[cuda version code]
@@ -44,7 +44,7 @@ def is_version_greater_than(ver):
 if "+cu" in torch_ver:
 torch_ver = torch_ver.split("+cu")[0]
 
-return LooseVersion(torch_ver) > ver
+return parse(torch_ver) > parse(ver)
 
 
 def getattr_attr_name(node):
diff --git a/python/tvm/relay/op/contrib/ethosn.py 
b/python/tvm/relay/op/contrib/ethosn.py
index 81534d48a2..c1e87ad5d9 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -17,7 +17,7 @@
 # pylint: disable=invalid-name, unused-argument
 """Arm(R) Ethos(TM)-N NPU supported operators."""
 from enum import Enum
-from distutils.version import LooseVersion
+from packaging.version import parse
 
 import tvm.ir
 from tvm.relay import transform
@@ -118,7 +118,7 @@ def partition_for_ethosn(mod, params=None, **opts):
 """
 api_version = ethosn_api_version()
 supported_api_versions = ["3.2.0"]
-if all(api_version != LooseVersion(exp_ver) for exp_ver in 
supported_api_versions):
+if all(parse(api_version) != parse(exp_ver) for exp_ver in 
supported_api_versions):
 raise ValueError(
 f"Driver stack version {api_version} is unsupported. "
 f"Please use version in {supported_api_versions}."
@@ -433,7 +433,7 @@ def split(expr):
 """Check if a split is supported by Ethos-N."""
 if not ethosn_available():
 return False
-if ethosn_api_version() == LooseVersion("3.0.1"):
+if parse(ethosn_api_version()) == parse("3.0.1"):
 return False
 if not _ethosn.split(expr):
 return False
diff --git a/python/tvm/relay/testing/tflite.py 
b/python/tvm/relay/testing/tflite.py
index df9c0bcadf..29f6bc62ca 100644
--- a/python/tvm/relay/testing/tflite.py
+++ b/python/tvm/relay/testing/tflite.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """Common utilities for creating TFLite models"""
-from distutils.version import LooseVersion
+from packaging.version import parse
 import numpy as np
 import pytest
 import tflite.Model  # pylint: disa

(tvm) branch main updated: [Relax] [ONNX] Add support for Sign and Not (#17167)

2024-07-18 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new d006ecac35 [Relax] [ONNX] Add support for Sign and Not (#17167)
d006ecac35 is described below

commit d006ecac35fd3100ee547d2d0356e21245a93ed0
Author: tsu-bin <81693503+tsu-...@users.noreply.github.com>
AuthorDate: Thu Jul 18 21:50:14 2024 +0800

[Relax] [ONNX] Add support for Sign and Not (#17167)

Co-authored-by: tsu-bin 
---
 python/tvm/relax/frontend/onnx/onnx_frontend.py | 18 ++
 tests/python/relax/test_frontend_onnx.py|  8 
 2 files changed, 26 insertions(+)
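
A small sketch of importing one of the newly supported ops through the Relax ONNX frontend (graph built with `onnx.helper`; opset 14 is an arbitrary choice for illustration):

```
import onnx
from onnx import TensorProto, helper
from tvm.relax.frontend.onnx import from_onnx

sign = helper.make_node("Sign", inputs=["x"], outputs=["y"])
graph = helper.make_graph(
    [sign],
    "sign_example",
    inputs=[helper.make_tensor_value_info("x", TensorProto.FLOAT, [32, 32])],
    outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, [32, 32])],
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)])

mod = from_onnx(model)   # the Sign node maps to relax.op.sign
mod.show()
```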

diff --git a/python/tvm/relax/frontend/onnx/onnx_frontend.py 
b/python/tvm/relax/frontend/onnx/onnx_frontend.py
index 3a70cd090a..85d4402d66 100644
--- a/python/tvm/relax/frontend/onnx/onnx_frontend.py
+++ b/python/tvm/relax/frontend/onnx/onnx_frontend.py
@@ -1948,6 +1948,22 @@ class HardSwish(OnnxOpConverter):
 )
 
 
+class Sign(OnnxOpConverter):
+"""Converts an onnx Sign node into an equivalent Relax expression."""
+
+@classmethod
+def _impl_v9(cls, bb, inputs, attr, params):
+return relax.op.sign(inputs[0])
+
+
+class Not(OnnxOpConverter):
+"""Converts an onnx Not node into an equivalent Relax expression."""
+
+@classmethod
+def _impl_v1(cls, bb, inputs, attr, params):
+return relax.op.logical_not(inputs[0])
+
+
 def _get_convert_map():
 return {
 "MatMul": MatMul,
@@ -2030,6 +2046,8 @@ def _get_convert_map():
 "Elu": Elu,
 "HardSigmoid": HardSigmoid,
 "HardSwish": HardSwish,
+"Sign": Sign,
+"Not": Not,
 }
 
 
diff --git a/tests/python/relax/test_frontend_onnx.py 
b/tests/python/relax/test_frontend_onnx.py
index 0fc7ec0644..05316f2699 100644
--- a/tests/python/relax/test_frontend_onnx.py
+++ b/tests/python/relax/test_frontend_onnx.py
@@ -600,6 +600,14 @@ def test_hardswish():
 verify_unary("HardSwish", [32, 32])
 
 
+def test_sign():
+verify_unary("Sign", [32, 32])
+
+
+def test_not():
+verify_unary("Not", [32, 32], dtype=TensorProto.BOOL)
+
+
 def test_conv():
 def _verify_conv(input_shape, weight_shape, output_shape):
 bias_shape = [output_shape[1]]



(tvm) branch main updated: [Relax] Fix fuseOps via pattern (#17160)

2024-07-16 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 70c53082e6 [Relax] Fix fuseOps via pattern  (#17160)
70c53082e6 is described below

commit 70c53082e6715516aefefcdca6262e195f36a0de
Author: Siyuan Feng 
AuthorDate: Wed Jul 17 02:34:19 2024 +0800

[Relax] Fix fuseOps via pattern  (#17160)

fix fuseops via pattern
---
 src/relax/transform/fuse_ops.cc | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/relax/transform/fuse_ops.cc b/src/relax/transform/fuse_ops.cc
index 45d70fc3e2..2be7ad41f3 100644
--- a/src/relax/transform/fuse_ops.cc
+++ b/src/relax/transform/fuse_ops.cc
@@ -1222,7 +1222,12 @@ class CompositeFunctionAnnotator : public ExprMutator {
   IRModule Run() {
 auto mod = builder_->GetContextIRModule();
 for (const auto& gv : mod->GetGlobalVars()) {
-  const auto& base_func = mod->Lookup(gv);
+  auto it = mod->functions.find(gv);
+  // Note that the fusion pass may have already removed the function.
+  if (it == mod->functions.end()) {
+continue;
+  }
+  const auto& base_func = (*it).second;
   if (const auto* func = base_func.as<FunctionNode>()) {
 if (func->GetAttr(attr::kComposite).defined() ||
 func->GetAttr(attr::kCodegen).defined()) {
@@ -1399,7 +1404,7 @@ Pass FuseOps(int fuse_opt_level) {
   };
   return CreateModulePass(/*pass_function=*/pass_func,  //
   /*opt_level=*/0,  //
-  /*pass_name=*/"FuseOps",  //
+  /*name=*/"FuseOps",   //
   /*required=*/{});
 }
 
@@ -1412,9 +1417,9 @@ Pass FuseOpsByPattern(const tvm::Array& 
patterns, bool bind_const
 return relax::FuseOpsByPattern(patterns, m, bind_constants, 
annotate_codegen,
entry_function_names);
   };
-  return CreateModulePass(/*pass_function=*/pass_func,   //
-  /*opt_level=*/0,   //
-  /*pass_name=*/"FuseOpsByPattern",  //
+  return CreateModulePass(/*pass_function=*/pass_func,  //
+  /*opt_level=*/0,  //
+  /*name=*/"FuseOpsByPattern",  //
   /*required=*/{});
 }
 



(tvm) branch main updated: [Relax] Fix cublas dispatch for corner cases (#17139)

2024-07-10 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 37a6200185 [Relax] Fix cublas dispatch for corner cases (#17139)
37a6200185 is described below

commit 37a62001857c812afed1f6f7df3b49ff01bd2988
Author: Siyuan Feng 
AuthorDate: Thu Jul 11 00:24:06 2024 +0800

[Relax] Fix cublas dispatch for corner cases (#17139)

Fix case when `lhs_batches` and `rhs_batches` are symbolic expressions,
but not standalone variables.
---
 python/tvm/relax/backend/contrib/cublas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
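
The subtlety is that when the batch extents are symbolic PrimExprs, a comparison such as `lhs_batches >= 1` builds an expression rather than evaluating to a Python boolean, so it has to be decided by the analyzer. A small sketch (variable names are illustrative):

```
import tvm
from tvm import tir

analyzer = tvm.arith.Analyzer()
b = tir.Var("b", "int64")
lhs_batches = b * 8                  # a symbolic expression, not a standalone Var
rhs_batches = tir.IntImm("int64", 1)

# A PrimExpr comparison is itself an expression; can_prove() decides it.
print(analyzer.can_prove(rhs_batches >= 1))                 # True
print(analyzer.can_prove_equal(lhs_batches, lhs_batches))   # True
print(analyzer.can_prove(lhs_batches >= 1))                 # False: no bounds known for b
```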

diff --git a/python/tvm/relax/backend/contrib/cublas.py 
b/python/tvm/relax/backend/contrib/cublas.py
index febb401bc0..287b18b440 100644
--- a/python/tvm/relax/backend/contrib/cublas.py
+++ b/python/tvm/relax/backend/contrib/cublas.py
@@ -134,7 +134,7 @@ def _check_matmul(context: PatternCheckContext) -> bool:
 isinstance(lhs_batches, tvm.tir.Var)
 or isinstance(rhs_batches, tvm.tir.Var)
 or (analyzer.can_prove_equal(lhs_batches, rhs_batches))
-or (lhs_batches >= 1 and rhs_batches == 1)
+or (analyzer.can_prove(lhs_batches >= 1) and 
analyzer.can_prove(rhs_batches == 1))
 )
 
 



(tvm) branch main updated: [DOC] Fix typo for the "We utilize the intermediate representation of nn.Graph to convert the OneFlow model to Reley." (#17146)

2024-07-10 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new fc814e7041 [DOC] Fix typo for the "We utilize the intermediate 
representation of nn.Graph to convert the OneFlow model to Reley." (#17146)
fc814e7041 is described below

commit fc814e704138bbb0d24cee7c77919e9bf3e01d7d
Author: Redempt1onzzZZ <84373897+redmept...@users.noreply.github.com>
AuthorDate: Thu Jul 11 00:23:52 2024 +0800

[DOC] Fix typo for the "We utilize the intermediate representation of 
nn.Graph to convert the OneFlow model to Reley." (#17146)

Update oneflow.py
---
 python/tvm/relay/frontend/oneflow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/oneflow.py 
b/python/tvm/relay/frontend/oneflow.py
index 72f3b20ecb..369bec445f 100644
--- a/python/tvm/relay/frontend/oneflow.py
+++ b/python/tvm/relay/frontend/oneflow.py
@@ -1867,7 +1867,7 @@ def from_oneflow(graph, model_dir_path):
 OneFlow offers nn.Graph, so that users can use the eager-like programming 
style to build
 static graphs and train the models.
 
-We utilize the intermediate representation of nn.Graph to convert the 
OneFlow model to Reley.
+We utilize the intermediate representation of nn.Graph to convert the 
OneFlow model to Relay.
 
 Parameters
 --



(tvm) branch main updated: [Backend][ROCm] Fix error when building TVM with LLVM 19 (#17141)

2024-07-10 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new e41d554308 [Backend][ROCm] Fix error when building TVM with LLVM 19 
(#17141)
e41d554308 is described below

commit e41d554308f165bf4730d7c33e4dd8914b6d7e6b
Author: Masahiro Hiramori 
AuthorDate: Thu Jul 11 01:18:43 2024 +0900

[Backend][ROCm] Fix error when building TVM with LLVM 19 (#17141)

* fix error when building with llvm>=19

* always need to include llvm/IR/Module.h
---
 src/target/llvm/codegen_amdgpu.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/target/llvm/codegen_amdgpu.cc 
b/src/target/llvm/codegen_amdgpu.cc
index 80c2abb5f1..fafe718fee 100644
--- a/src/target/llvm/codegen_amdgpu.cc
+++ b/src/target/llvm/codegen_amdgpu.cc
@@ -45,6 +45,7 @@
 #if TVM_LLVM_VERSION < 170
 #include 
 #endif
+#include 
 #include 
 #include 
 #include 



(tvm) branch main updated: [WebGPU] Implement `tir.dp4a` with WGSL built-in function `dot4I8Packed` (#16976)

2024-07-04 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 3e08e702fa [WebGPU] Implement `tir.dp4a` with WGSL built-in function 
`dot4I8Packed` (#16976)
3e08e702fa is described below

commit 3e08e702fa27b51a948792d467a7734cd6995cf4
Author: Jiawei Shao 
AuthorDate: Fri Jul 5 02:03:56 2024 +0800

[WebGPU] Implement `tir.dp4a` with WGSL built-in function `dot4I8Packed` 
(#16976)

* [WebGPU] Support `__dp4a(int8x4, int8x4)` as a pure extern method

This patch adds the support of `__dp4a(int8x4, int8x4)` as a pure
extern method of WebGPU target. In the generated WGSL shader,
`int8x4` will be translated into `u32`, and `__dp4a(int8x4, int8x4)`
will be translated into the WGSL built-in function
`dot4I8Packed(u32, u32)`.

Here is an example to use `__dp4a` in WebGPU target:

```
n = te.var("n")
A = te.placeholder((n,), "int8x4", name="A")
B = te.placeholder((n,), "int8x4", name="B")
C = te.compute(A.shape, lambda i: tvm.tir.call_pure_extern("int32", 
"__dp4a", A[i], B[i]), name="C")
s = te.create_schedule(C.op)
bx, tx = s[C].split(C.op.axis[0], factor=64)
s[C].bind(bx, te.thread_axis("blockIdx.x"))
s[C].bind(tx, te.thread_axis("threadIdx.x"))
mod = tvm.build(s, [A, B, C], tgt, name="dp4aTest")
```

Issue: #16627

* Add validation

* Add `dot4I8Packed` to WebGPU lower intrinsic

* Implement builtin `dp4a` with `dot4I8Packed`

* Small fix

* Add missing comment
---
 src/target/source/codegen_webgpu.cc | 8 
 1 file changed, 8 insertions(+)

diff --git a/src/target/source/codegen_webgpu.cc 
b/src/target/source/codegen_webgpu.cc
index a95f6e0fa0..b76b05470d 100644
--- a/src/target/source/codegen_webgpu.cc
+++ b/src/target/source/codegen_webgpu.cc
@@ -410,6 +410,14 @@ void CodeGenWebGPU::VisitExpr_(const CallNode* op, 
std::ostream& os) {  // NOLIN
   this->EndScope(else_scope);
 }
 os << result;
+  } else if (op->op.same_as(builtin::dp4a())) {
+// generate `dot4I8Packed(vec1, vec2) + acc` for the builtin `dp4a`
+os << "dot4I8Packed(";
+this->PrintExpr(op->args[0], os);
+os << ", ";
+this->PrintExpr(op->args[1], os);
+os << ") + ";
+this->PrintExpr(op->args[2], os);
   } else {
 CodeGenC::VisitExpr_(op, os);
   }



(tvm) branch main updated: [Bugfix] Restrict CopyOnWrite to _type_final (#17132)

2024-07-02 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 0df4103675 [Bugfix] Restrict CopyOnWrite to _type_final (#17132)
0df4103675 is described below

commit 0df4103675a52cc5b9e6356cb003bb17c66bc1a4
Author: Eric Lunderberg 
AuthorDate: Tue Jul 2 10:18:08 2024 -0500

[Bugfix] Restrict CopyOnWrite to _type_final (#17132)

Prior to this commit, the `TVM_DEFINE_OBJECT_REF_COW_METHOD` could be
used in any `ObjectRef` subclass to provide a `CopyOnWrite` method.
However, the implementation of this method was invalid if the
object's `ContainerType` could itself be subclassed.  In that case,
using `obj.CopyOnWrite()` when the object contains a subclass, and
when a copy is required, would silently convert `obj` to instead
contain a base class.

This commit adds a `static_assert` to the `TVM_DEFINE_OBJECT_REF_COW_METHOD`
macro, preventing the macro from being used in classes where its generated
`CopyOnWrite` would be incorrect.

Compilation with this change found two classes, `relax::Var` and
`relax::BindingBlock`, that were susceptible to this error, and the macro
has been removed from these classes.  For backwards-compatibility, the
`CopyOnWrite` function for these two classes is provided explicitly.
---
 include/tvm/relax/expr.h |  7 ---
 include/tvm/runtime/object.h | 20 
 src/relax/ir/expr.cc | 38 ++
 3 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/include/tvm/relax/expr.h b/include/tvm/relax/expr.h
index 401aaa9248..60032c3462 100644
--- a/include/tvm/relax/expr.h
+++ b/include/tvm/relax/expr.h
@@ -427,7 +427,8 @@ class Var : public LeafExpr {
 
   TVM_DLL explicit Var(Id vid, Optional struct_info_annotation, 
Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Var, LeafExpr, VarNode);
-  TVM_DEFINE_OBJECT_REF_COW_METHOD(VarNode);
+
+  VarNode* CopyOnWrite();
 };
 
 /*! \brief A sub-type of the variable node used to mark dataflow variables from
@@ -784,10 +785,10 @@ class BindingBlock : public ObjectRef {
  public:
   TVM_DLL explicit BindingBlock(Array bindings, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(BindingBlock, ObjectRef, BindingBlockNode);
-  TVM_DEFINE_OBJECT_REF_COW_METHOD(BindingBlockNode);
+
+  BindingBlockNode* CopyOnWrite();
 };
 
-class DataflowBlock;
 class DataflowBlockNode : public BindingBlockNode {
  public:
   bool SEqualReduce(const DataflowBlockNode* other, SEqualReducer equal) const 
{
diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h
index 172316daae..4483867f3c 100644
--- a/include/tvm/runtime/object.h
+++ b/include/tvm/runtime/object.h
@@ -823,14 +823,18 @@ struct ObjectPtrEqual {
  *
  * \endcode
  */
-#define TVM_DEFINE_OBJECT_REF_COW_METHOD(ObjectName) \
-  ObjectName* CopyOnWrite() {\
-ICHECK(data_ != nullptr);\
-if (!data_.unique()) {   \
-  auto n = make_object(*(operator->())); \
-  ObjectPtr(std::move(n)).swap(data_);   \
-}\
-return static_cast(data_.get());\
+#define TVM_DEFINE_OBJECT_REF_COW_METHOD(ObjectName)   \
+  static_assert(ObjectName::_type_final,   \
+"TVM's CopyOnWrite may only be used for "  \
+"Object types that are declared as final, "\
+"using the TVM_DECLARE_FINAL_OBJECT_INFO macro."); \
+  ObjectName* CopyOnWrite() {  \
+ICHECK(data_ != nullptr);  \
+if (!data_.unique()) { \
+  auto n = make_object<ObjectName>(*(operator->()));   \
+  ObjectPtr<Object>(std::move(n)).swap(data_); \
+}  \
+return static_cast<ObjectName*>(data_.get());  \
   }
 
 // Implementations details below
diff --git a/src/relax/ir/expr.cc b/src/relax/ir/expr.cc
index 59b6a0aeb7..a14ba1d9aa 100644
--- a/src/relax/ir/expr.cc
+++ b/src/relax/ir/expr.cc
@@ -265,6 +265,25 @@ Var::Var(Id vid, Optional 
struct_info_annotation, Span span) {
   data_ = std::move(n);
 }
 
+VarNode* Var::CopyOnWrite() {
+  // The `TVM_DEFINE_OBJECT_REF_COW_METHOD` cannot be used for
+  // Var, because it is the base class for `DataflowVar`.
+  // If the `TVM_DEFINE_OBJECT_REF_COW_METHOD` were used, the
+  // automatic implementation would erroneously convert from a
+  // `DataflowVar` to a `Var`.
+  ICHECK(data_ != nullptr);
+  if (!data_.unique()) {
+ObjectPtr<VarNode> node;
+if (auto dataflow_var = as<DataflowVarNode>()) {
+  node = make_object(*data

(tvm) branch main updated: [Hexagon] Add support for v75 (#17123)

2024-07-01 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 8de396c6fb [Hexagon] Add support for v75 (#17123)
8de396c6fb is described below

commit 8de396c6fba06a2aa681a2aeb5dba12c133701fc
Author: Anirudh Sundar Subramaniam 
AuthorDate: Mon Jul 1 18:26:02 2024 +0530

[Hexagon] Add support for v75 (#17123)

Add support for executing on v75 (Snapdragon 8 Gen 3). This PR only adds
the support; to build and execute for v75, the Hexagon SDK used
should be 5.4 or newer.
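
For reference, a minimal Python sketch (not part of this commit) of picking up
the new architecture once the patch is in place; it only assumes a TVM build
with Hexagon support and uses the pre-existing `tvm.target.hexagon` helper.

```python
# Minimal sketch: construct a Hexagon target for the new v75 architecture.
# Assumes a TVM build with Hexagon support that includes this patch.
import tvm

target = tvm.target.hexagon("v75")  # "v75" is accepted after this change
print(target)                       # e.g. a target string containing -mcpu=hexagonv75
```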
---
 apps/hexagon_launcher/README.md  | 16 
 cmake/config.cmake   |  2 +-
 cmake/modules/HexagonSDK.cmake   |  6 +-
 python/tvm/contrib/hexagon/session.py| 15 ++-
 python/tvm/target/target.py  |  3 ++-
 src/runtime/hexagon/README.md|  4 ++--
 src/runtime/hexagon/rpc/simulator/session.cc |  7 +++
 tests/python/contrib/test_hexagon/README.md  |  2 +-
 8 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/apps/hexagon_launcher/README.md b/apps/hexagon_launcher/README.md
index 69d9fdc98a..be0015b17a 100644
--- a/apps/hexagon_launcher/README.md
+++ b/apps/hexagon_launcher/README.md
@@ -43,10 +43,10 @@ Create a subdirectory for the build files, and run `cmake` 
with the
 following variables set:
 
 ```
-cmake -DCMAKE_C_COMPILER=/path/to/hexagon-clang \
-  -DCMAKE_CXX_COMPILER=/path/to/hexagon-clang++ \
-  -DUSE_HEXAGON_ARCH=v65|v66|v68|v69|v73\
-  -DUSE_HEXAGON_SDK=/path/to/hexagon/SDK\
+cmake -DCMAKE_C_COMPILER=/path/to/hexagon-clang \
+  -DCMAKE_CXX_COMPILER=/path/to/hexagon-clang++ \
+  -DUSE_HEXAGON_ARCH=v65|v66|v68|v69|v73|v75\
+  -DUSE_HEXAGON_SDK=/path/to/hexagon/SDK\
   /path/to/apps/hexagon_launcher/cmake/hexagon
 ```
 
@@ -60,10 +60,10 @@ the TVM runtime for Hexagon will be built as a part of the 
process.
 
 ```
 cmake 
-DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk/build/cmake/android.toolchain.cmake 
\
-  -DANDROID_ABI=arm64-v8a   \
-  -DANDROID_PLATFORM=android-28 \
-  -DUSE_HEXAGON_SDK=/p/Hexagon_SDK/4.3.0.0  \
-  -DUSE_HEXAGON_ARCH=v65|v66|v68|v69|v73\
+  -DANDROID_ABI=arm64-v8a   \
+  -DANDROID_PLATFORM=android-28 \
+  -DUSE_HEXAGON_SDK=/p/Hexagon_SDK/4.3.0.0  \
+  -DUSE_HEXAGON_ARCH=v65|v66|v68|v69|v73|v75\
   /path/to/apps/hexagon_launcher/cmake/android
 ```
 
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 5847acc298..416eec0dcb 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -367,7 +367,7 @@ set(USE_HEXAGON_RPC OFF)
 # compiling _by_ TVM). This applies to components like the TVM runtime, but is
 # also used to select correct include/library paths from the Hexagon SDK when
 # building runtime for Android.
-# Valid values are v65, v66, v68, v69, v73.
+# Valid values are v65, v66, v68, v69, v73, v75.
 set(USE_HEXAGON_ARCH "v68")
 
 # Whether use MRVL codegen
diff --git a/cmake/modules/HexagonSDK.cmake b/cmake/modules/HexagonSDK.cmake
index 9196396646..5ca889afbf 100644
--- a/cmake/modules/HexagonSDK.cmake
+++ b/cmake/modules/HexagonSDK.cmake
@@ -109,11 +109,12 @@ function(_get_hexagon_sdk_property_impl
   set(_hexarch_dir_v68 "computev68")
   set(_hexarch_dir_v69 "computev69")
   set(_hexarch_dir_v73 "computev73")
+  set(_hexarch_dir_v75 "computev75")
   set(_hexarch_dir_str "_hexarch_dir_${_hexagon_arch}")
   set(_hexarch_dir "${${_hexarch_dir_str}}")
 
   if(NOT _hexarch_dir)
-message(SEND_ERROR "Please set Hexagon architecture to one of v65, v66, 
v68, v69, v73")
+message(SEND_ERROR "Please set Hexagon architecture to one of v65, v66, 
v68, v69, v73, v75")
   endif()
 
   if(_property STREQUAL "VERSION")
@@ -160,6 +161,9 @@ function(_get_hexagon_sdk_property_impl
 elseif(_property STREQUAL "QURT_INCLUDE")
   # Set the Hexagon arch directory for runtime linker.
   set(_rtld_dir "hexagon_toolv84_${_hexagon_arch}")
+  if(_hexagon_arch STREQUAL "v75")
+set(_rtld_dir "hexagon_toolv87_v75") # Use hexagon_toolv87_v75 for v75
+  endif()
   if(_hexagon_arch STREQUAL "v69")
 set(_rtld_dir "hexagon_toolv84_v68") # Use hexagon_toolv84_v68 for v69
   endif()
diff --git a/python/tvm/contrib/hexagon/session.py 
b/python/tvm/contrib/hexagon/session.py
index fc0c96fbe5..9f11668234 100644
--- a/python/tvm/contrib/hexagon/session.py
+++ b/python/tvm/contrib/hexagon/session.py
@@ -286,7 +286,9 @@ class Session:
 graph_json, graph_debug_mod, self.device, dump_root=s

(tvm) branch main updated: [WebGPU] Add `tir.dp4a` (#17124)

2024-07-01 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 4247433e33 [WebGPU] Add `tir.dp4a` (#17124)
4247433e33 is described below

commit 4247433e33dfeff9bc82521ed4c7e85605d94893
Author: Jiawei Shao 
AuthorDate: Mon Jul 1 20:36:14 2024 +0800

[WebGPU] Add `tir.dp4a` (#17124)

* [WebGPU] Add `tir.dp4a`

This patch adds `tir.dp4a` as a new TIR built-in operator, in
preparation for supporting int8 computation with `dot4I8Packed`
in the WebGPU backend.

* Fix format issues

* Fix format issue

* Replace `accumulation` with `accumulator`
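
As a rough illustration (not from this patch), the new intrinsic can be built
directly from Python once `tvm.tir.dp4a` is exported; the broadcast int8x4
operands below are placeholders.

```python
# Illustrative sketch: build a tir.dp4a call expression from two int8x4
# vectors and an int32 accumulator.  The operand values are placeholders.
import tvm
from tvm import tir

vec_a = tir.Broadcast(tir.IntImm("int8", 1), 4)  # int8x4 vector
vec_b = tir.Broadcast(tir.IntImm("int8", 2), 4)  # int8x4 vector
expr = tir.dp4a(vec_a, vec_b, 0)                 # int32 result: dot(vec_a, vec_b) + 0
print(expr)
```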
---
 include/tvm/tir/builtin.h  |  5 +
 python/tvm/script/ir_builder/tir/ir.py |  2 ++
 python/tvm/tir/__init__.py |  1 +
 python/tvm/tir/op.py   | 25 +
 src/tir/op/builtin.cc  |  5 +
 tests/python/tir-base/test_tir_op_types.py |  8 
 6 files changed, 46 insertions(+)

diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index 120c1b71be..ea2d07903e 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -816,6 +816,11 @@ TVM_DLL const Op& vectorlow();
  */
 TVM_DLL const Op& vectorcombine();
 
+/*!
+ * \brief Dot product of two int8x4 vectors and add an optional accumulator
+ */
+TVM_DLL const Op& dp4a();
+
 /*!
  * \brief atomic add instruction, corresponding e.g. to atomicAdd in CUDA
  */
diff --git a/python/tvm/script/ir_builder/tir/ir.py 
b/python/tvm/script/ir_builder/tir/ir.py
index caefc6a6bc..bdbd6e2cda 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -1932,6 +1932,7 @@ vectorlow = _dtype_forward(_tir_op.vectorlow)
 vectorhigh = _dtype_forward(_tir_op.vectorhigh)
 vectorcombine = _dtype_forward(_tir_op.vectorcombine)
 get_active_lane_mask = _dtype_forward(_tir_op.get_active_lane_mask)
+dp4a = _dtype_forward(_tir_op.dp4a)
 
 
 broadcast = Broadcast
@@ -2191,6 +2192,7 @@ __all__ = [
 "vectorlow",
 "vectorhigh",
 "vectorcombine",
+"dp4a",
 "assume",
 "undef",
 "tvm_call_packed",
diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py
index 5360ab2b96..bcfbe6575d 100644
--- a/python/tvm/tir/__init__.py
+++ b/python/tvm/tir/__init__.py
@@ -95,6 +95,7 @@ from .op import q_multiply_shift, q_multiply_shift_per_axis, 
shift_left, shift_r
 from .op import TVMBackendAllocWorkspace, TVMBackendFreeWorkspace
 from .op import start_profile_intrinsic, end_profile_intrinsic
 from .op import vscale, get_active_lane_mask, get_vscale_expr
+from .op import dp4a
 from .generic import add, subtract, multiply
 
 from .schedule import StmtSRef, BlockScope, ScheduleState, Schedule, 
ScheduleError
diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py
index 81d6604259..0bc299e403 100644
--- a/python/tvm/tir/op.py
+++ b/python/tvm/tir/op.py
@@ -1813,6 +1813,31 @@ def vectorcombine(dtype, vec1, vec2):
 return call_intrin(dtype, "tir.vectorcombine", vec1, vec2)
 
 
+def dp4a(vec1, vec2, acc=0):
+"""Dot product of two int8x4 vectors and add an optional accumulator
+
+Parameters
+--
+vec1 : int8x4
+   The input vector.
+
+vec2 : int8x4
+   The input vector.
+
+acc : int32
+   The accumulator.
+
+Returns
+---
+call : PrimExpr
+The call expression.
+"""
+vec1 = convert(vec1)
+vec2 = convert(vec2)
+acc = convert(acc)
+return call_intrin("int32", "tir.dp4a", vec1, vec2, acc)
+
+
 def ret(val):
 """Create a tir return expression
 
diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc
index 0404fd2823..0d4a213a23 100644
--- a/src/tir/op/builtin.cc
+++ b/src/tir/op/builtin.cc
@@ -355,6 +355,11 @@ TIR_DEFINE_BUILTIN_FUNC(vectorcombine)
 .set_attr("TScriptDtypePrintLocation",
  
Integer(ScriptDtypePrintLocation::kFirst));
 
+TIR_DEFINE_BUILTIN_FUNC(dp4a)
+.set_attr("TCallEffectKind", 
Integer(CallEffectKind::kPure))
+.set_attr("TScriptDtypePrintLocation",
+ 
Integer(ScriptDtypePrintLocation::kFirst));
+
 TIR_DEFINE_BUILTIN_FUNC(atomic_add)
 .set_attr("TCallEffectKind", 
Integer(CallEffectKind::kOpaque));
 
diff --git a/tests/python/tir-base/test_tir_op_types.py 
b/tests/python/tir-base/test_tir_op_types.py
index 7398ee781b..aefab62559 100644
--- a/tests/python/tir-base/test_tir_op_types.py
+++ b/tests/python/tir-base/test_tir_op_types.py
@@ -295,6 +295,14 @@ def test_tir_op_vectorhigh():
 assert expr.op.name == "tir.vectorhigh"

(tvm) branch main updated: [TIR][DLight] Enable SimdGroup op for Metal (#17112)

2024-06-24 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new c0abab769f [TIR][DLight] Enable SimdGroup op for Metal (#17112)
c0abab769f is described below

commit c0abab769ff152d87f84963f18a98d2f7c9bdf31
Author: Siyuan Feng 
AuthorDate: Mon Jun 24 21:24:32 2024 +0800

[TIR][DLight] Enable SimdGroup op for Metal (#17112)
---
 include/tvm/tir/builtin.h|  44 ++-
 python/tvm/dlight/gpu/matmul.py  | 145 ++
 python/tvm/script/ir_builder/tir/ir.py   |   8 +
 python/tvm/tir/__init__.py   |   6 +
 python/tvm/tir/op.py | 191 -
 python/tvm/tir/tensor_intrin/metal.py| 350 +++
 src/runtime/thread_storage_scope.h   |   7 +
 src/target/source/codegen_metal.cc   |  82 +-
 src/target/source/codegen_metal.h|   3 +
 src/tir/op/builtin.cc|  12 +
 tests/python/dlight/test_gpu_matmul_tensorize.py | 283 +-
 11 files changed, 1124 insertions(+), 7 deletions(-)

diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index 5836eb8ea9..120c1b71be 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -746,7 +746,7 @@ TVM_DLL const Op& create_barriers();
 TVM_DLL const Op& mma_store();
 
 /*!
- * \brief tvm intrinsic for zero-initalizing an MMA accumulation registor.
+ * \brief tvm intrinsic for zero-initializing an MMA accumulation register.
  *For example, if each thread in a warp of size 32 has 8 elements from 
the A matrix in
  *m16xn8xk16 MMA in its registers, this intrinsic can be used to 
zero-initialize its
  *4 accumulation registers.
@@ -758,6 +758,48 @@ TVM_DLL const Op& mma_store();
  */
 TVM_DLL const Op& mma_fill();
 
+// Metal SimdGroup matrix intrinsics
+
+/*!
+ * \brief tvm intrinsic for initializing a simdgroup with a given value.
+ * \note only 8x8 shape is supported by Metal Spec and TVM, but we still keep 
shape as params,
+ *   keeping the similar interface with Metal Spec.
+ *
+ * void make_filled_simdgroup_matrix(Var d, PrimExpr index, PrimExpr value,
+ *   int col = 8, int row = 8);
+ */
+TVM_DLL const Op& make_filled_simdgroup_matrix();
+
+/*!
+ * \brief tvm intrinsic for loading data from device memory or threadgroup 
memory to simdgroup.
+ * \note only 8x8 shape is supported by Metal Spec and TVM, but we still keep 
shape as params,
+ *   keeping the similar interface with Metal Spec.
+ *
+ * void simdgroup_load(Var d, PrimExpr index, PrimExpr ptr, PrimExpr stride,
+   int col = 8, int row = 8, bool transpose_matrix = 
false);
+ */
+TVM_DLL const Op& simdgroup_load();
+
+/*!
+ * \brief tvm intrinsic for storing data from simdgroup to device memory or 
threadgroup memory.
+ * \note only 8x8 shape is supported by Metal Spec and TVM, but we still keep 
shape as params,
+ *   keeping the similar interface with Metal Spec.
+ *
+ * void simdgroup_store(Var d, PrimExpr index, PrimExpr ptr, PrimExpr stride,
+ *  int col = 8, int row = 8, bool transpose_matrix = 
false);
+ */
+TVM_DLL const Op& simdgroup_store();
+
+/*!
+ * \brief tvm intrinsic for multiply and accumulate two matrices in simdgroup
+ * \note only 8x8 shape is supported by Metal Spec and TVM, but we still keep 
shape as params,
+ *   keeping the similar interface with Metal Spec.
+ *
+ * void simdgroup_mma(Var d, PrimExpr index_d, Var a, PrimExpr index_a,
+ *Var b, PrimExpr index_b, Var c, PrimExpr index_c);
+ */
+TVM_DLL const Op& simdgroup_multiply_accumulate();
+
 // TODO(tvm-team) replace the usage of the vector operations by Shuffle.
 /*!
  * \brief Get the high level half of the vector
diff --git a/python/tvm/dlight/gpu/matmul.py b/python/tvm/dlight/gpu/matmul.py
index f4ef1f5044..a5759941ca 100644
--- a/python/tvm/dlight/gpu/matmul.py
+++ b/python/tvm/dlight/gpu/matmul.py
@@ -313,6 +313,146 @@ def check_sm_version(arch: str) -> int:
 return int(sm_version) if sm_version.isdigit() else -1
 
 
+class MetalMatmul(GPUScheduleRule):
+"""
+The schedule rule for Metal matmul computation.
+"""
+
+def apply(  # pylint: disable=too-many-locals,missing-docstring
+self,
+func: tir.PrimFunc,
+target: Target,
+_: bool,
+) -> Optional[tir.Schedule]:
+from tvm.tir.tensor_intrin.metal import (  # pylint: 
disable=import-outside-toplevel
+get_simdgroup_intrin_group,
+)
+
+if not isinstance(func, tir.PrimFunc) or not 
self.is_target_available(target):
+return None
+sch = tir.Schedule(func

(tvm) branch main updated: [TVMScript] Better Type Annotation for TIR OP (#17107)

2024-06-20 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 36b9535ff3 [TVMScript] Better Type Annotation for TIR OP (#17107)
36b9535ff3 is described below

commit 36b9535ff364c484d04b384555106731049f44cd
Author: Siyuan Feng 
AuthorDate: Thu Jun 20 20:35:38 2024 +0800

[TVMScript] Better Type Annotation for TIR OP (#17107)

Enable ParamSpec for TIR ops, so that we get a better experience when
writing TVMScript in Python with tooling.

However, ParamSpec was introduced in Python 3.10, so we only enable it
when the Python version is 3.10 or above.
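
For context, a small self-contained sketch of the pattern this patch relies on
(the `add` function is purely illustrative): with `ParamSpec`, a wrapping
decorator preserves the wrapped function's parameter types for type checkers.

```python
# Minimal sketch of the ParamSpec pattern (requires Python 3.10+).
import functools
from typing import Callable, ParamSpec, TypeVar

P = ParamSpec("P")
T = TypeVar("T")

def keep_signature(func: Callable[P, T]) -> Callable[P, T]:
    @functools.wraps(func)
    def wrapped(*args: P.args, **kwargs: P.kwargs) -> T:
        return func(*args, **kwargs)
    return wrapped

@keep_signature
def add(x: int, y: int) -> int:  # illustrative function, not part of the patch
    return x + y

add(1, 2)  # type checkers still see the original (x: int, y: int) -> int signature
```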
---
 python/tvm/script/ir_builder/tir/ir.py | 32 +---
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/python/tvm/script/ir_builder/tir/ir.py 
b/python/tvm/script/ir_builder/tir/ir.py
index 8289ea96ae..18abc0ca5d 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -19,6 +19,7 @@
 import functools
 import inspect
 from numbers import Integral
+import sys
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 # isort: off
@@ -1764,14 +1765,31 @@ class meta_var:  # pylint: disable=invalid-name
 # pylint: disable=invalid-name
 
 
-def _op_wrapper(func):
-@functools.wraps(func)
-def wrapped(*args, **kwargs):
-if "dtype" in kwargs:
-kwargs.pop("dtype")
-return func(*args, **kwargs)
+if sys.version_info >= (3, 10):
+from typing import ParamSpec, TypeVar  # pylint: disable=import-error
 
-return wrapped
+T = TypeVar("T")
+P = ParamSpec("P")
+
+def _op_wrapper(func: Callable[P, T]) -> Callable[P, T]:
+@functools.wraps(func)
+def wrapped(*args, **kwargs) -> T:
+if "dtype" in kwargs:
+kwargs.pop("dtype")
+return func(*args, **kwargs)
+
+return wrapped
+
+else:
+
+def _op_wrapper(func):
+@functools.wraps(func)
+def wrapped(*args, **kwargs):
+if "dtype" in kwargs:
+kwargs.pop("dtype")
+return func(*args, **kwargs)
+
+return wrapped
 
 
 abs = _op_wrapper(_tir_op.abs)  # pylint: disable=redefined-builtin



(tvm) branch main updated (675a02336d -> f6fe2aa331)

2024-06-18 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 675a02336d  [KVCache] Unlimited depth blocks (#17100)
 add f6fe2aa331 [TIR][RPC] Allow RPC calls to compiled PrimFuncs with no 
arguments (#17098)

No new revisions were added by this update.

Summary of changes:
 src/tir/transforms/make_packed_api.cc  | 10 ++--
 tests/python/runtime/test_runtime_rpc.py   | 55 --
 .../test_tir_transform_make_packed_api.py  | 41 
 3 files changed, 99 insertions(+), 7 deletions(-)



(tvm) branch main updated (5bfca2e7a2 -> 675a02336d)

2024-06-18 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 5bfca2e7a2 [Transform] Modify FuseTIR pass to propagate buffer 
attributes (#17075)
 add 675a02336d  [KVCache] Unlimited depth blocks (#17100)

No new revisions were added by this update.

Summary of changes:
 src/runtime/relax_vm/paged_kv_cache.cc | 175 +
 ..._builtin_paged_attention_kv_cache_flashinfer.py |  22 +--
 ...runtime_builtin_paged_attention_kv_cache_tir.py |  74 +++--
 3 files changed, 185 insertions(+), 86 deletions(-)



(tvm-site) branch main updated: add download link for v0.14-v0.16 (#43)

2024-06-12 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm-site.git


The following commit(s) were added to refs/heads/main by this push:
 new 2624afe651 add download link for v0.14-v0.16 (#43)
2624afe651 is described below

commit 2624afe6517ac11c856e0ff32edcab232b389836
Author: Siyuan Feng 
AuthorDate: Wed Jun 12 21:30:52 2024 +0800

add download link for v0.14-v0.16 (#43)
---
 download.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/download.md b/download.md
index 384817893a..74572861de 100644
--- a/download.md
+++ b/download.md
@@ -17,6 +17,9 @@ Choose your flavor of download from the following links:
 
 | Version | Source | PGP | SHA |
 | --- | -- | --- | --- |
+| 0.16.0   | 
[apache-tvm-src-v0.16.0.tar.gz](https://www.apache.org/dyn/closer.lua/tvm/tvm-v0.16.0/apache-tvm-src-v0.16.0.tar.gz)
 | 
[.asc](https://downloads.apache.org/tvm/tvm-v0.16.0/apache-tvm-src-v0.16.0.tar.gz.asc)
 | 
[.sha512](https://downloads.apache.org/tvm/tvm-v0.16.0/apache-tvm-src-v0.16.0.tar.gz.sha512)
 |
+| 0.15.0   | 
[apache-tvm-src-v0.15.0.tar.gz](https://www.apache.org/dyn/closer.lua/tvm/tvm-v0.15.0/apache-tvm-src-v0.15.0.tar.gz)
 | 
[.asc](https://downloads.apache.org/tvm/tvm-v0.15.0/apache-tvm-src-v0.15.0.tar.gz.asc)
 | 
[.sha512](https://downloads.apache.org/tvm/tvm-v0.15.0/apache-tvm-src-v0.15.0.tar.gz.sha512)
 |
+| 0.14.0   | 
[apache-tvm-src-v0.14.0.tar.gz](https://www.apache.org/dyn/closer.lua/tvm/tvm-v0.14.0/apache-tvm-src-v0.14.0.tar.gz)
 | 
[.asc](https://downloads.apache.org/tvm/tvm-v0.14.0/apache-tvm-src-v0.14.0.tar.gz.asc)
 | 
[.sha512](https://downloads.apache.org/tvm/tvm-v0.14.0/apache-tvm-src-v0.14.0.tar.gz.sha512)
 |
 | 0.13.0   | 
[apache-tvm-src-v0.13.0.tar.gz](https://www.apache.org/dyn/closer.lua/tvm/tvm-v0.13.0/apache-tvm-src-v0.13.0.tar.gz)
 | 
[.asc](https://downloads.apache.org/tvm/tvm-v0.13.0/apache-tvm-src-v0.13.0.tar.gz.asc)
 | 
[.sha512](https://downloads.apache.org/tvm/tvm-v0.13.0/apache-tvm-src-v0.13.0.tar.gz.sha512)
 |
 | 0.12.0   | 
[apache-tvm-src-v0.12.0.tar.gz](https://www.apache.org/dyn/closer.lua/tvm/tvm-v0.12.0/apache-tvm-src-v0.12.0.tar.gz)
 | 
[.asc](https://downloads.apache.org/tvm/tvm-v0.12.0/apache-tvm-src-v0.12.0.tar.gz.asc)
 | 
[.sha512](https://downloads.apache.org/tvm/tvm-v0.12.0/apache-tvm-src-v0.12.0.tar.gz.sha512)
 |
 | 0.11.0   | 
[apache-tvm-src-v0.11.0.tar.gz](https://www.apache.org/dyn/closer.lua/tvm/tvm-v0.11.0/apache-tvm-src-v0.11.0.tar.gz)
 | 
[.asc](https://downloads.apache.org/tvm/tvm-v0.11.0/apache-tvm-src-v0.11.0.tar.gz.asc)
 | 
[.sha512](https://downloads.apache.org/tvm/tvm-v0.11.0/apache-tvm-src-v0.11.0.tar.gz.sha512)
 |



(tvm) branch main updated (2f800df89d -> 1d761dac45)

2024-06-07 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 2f800df89d [WebGPU] Translate `int8x4` into `u32` (#17071)
 add 1d761dac45 [Metal] Enable Debug Label (#17059)

No new revisions were added by this update.

Summary of changes:
 src/runtime/metal/metal_common.h  | 5 -
 src/runtime/metal/metal_device_api.mm | 6 --
 src/runtime/metal/metal_module.mm | 3 ++-
 3 files changed, 10 insertions(+), 4 deletions(-)



(tvm) branch main updated: [WebGPU] Translate `int8x4` into `u32` (#17071)

2024-06-07 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 2f800df89d [WebGPU] Translate `int8x4` into `u32` (#17071)
2f800df89d is described below

commit 2f800df89d9e4ba366a1285b4246f286680951a6
Author: Jiawei Shao 
AuthorDate: Fri Jun 7 21:03:29 2024 +0800

[WebGPU] Translate `int8x4` into `u32` (#17071)

This patch translates an `int8x4` into a `u32` in WGSL shaders, since
8-bit integers are not supported in WebGPU right now. The WGSL
built-in function `dot4I8Packed()` accepts `u32` values as its inputs,
and each `u32` logically represents a 4-element 8-bit integer vector.

issue: #16627
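
To make the packing convention concrete, a plain-Python reference sketch (an
assumption-labelled illustration, not code from this patch): four signed 8-bit
lanes are stored little-endian in one 32-bit word, which is the layout
`dot4I8Packed` consumes.

```python
# Reference sketch of the assumed packing: four signed int8 lanes in one u32,
# with lane 0 in the least-significant byte.
def pack_int8x4(lanes):
    word = 0
    for i, value in enumerate(lanes):
        word |= (value & 0xFF) << (8 * i)
    return word & 0xFFFFFFFF

def dot4_i8(a_lanes, b_lanes, acc=0):
    # What dot4I8Packed computes on the two packed words, written on the lanes.
    return acc + sum(a * b for a, b in zip(a_lanes, b_lanes))

assert pack_int8x4([1, 2, 3, 4]) == 0x04030201
assert dot4_i8([1, -2, 3, -4], [5, 6, 7, 8], acc=10) == -8
```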
---
 src/target/source/codegen_webgpu.cc | 5 +
 1 file changed, 5 insertions(+)

diff --git a/src/target/source/codegen_webgpu.cc 
b/src/target/source/codegen_webgpu.cc
index f62e0db7ff..a95f6e0fa0 100644
--- a/src/target/source/codegen_webgpu.cc
+++ b/src/target/source/codegen_webgpu.cc
@@ -298,6 +298,11 @@ void CodeGenWebGPU::PrintType(DataType t, std::ostream& 
os) {  // NOLINT(*)
 
   if (lanes != 1) {
 ICHECK(lanes >= 2 && lanes <= 4) << "CodeGenWebGPU: only allows vector 
with lanes in {2, 3, 4}";
+// Currently WebGPU doesn't support `i8` and an `int8x4` is represented as 
a `u32`.
+if (t.is_int() && t.bits() == 8 && lanes == 4) {
+  os << "u32";
+  return;
+}
 os << "vec" << lanes << "<";
   }
 



(tvm) branch main updated: [Relax][Frontend][Onnx] Cast Op special handling for ShapeExpr input (#17061)

2024-06-03 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new f5d3fc264d [Relax][Frontend][Onnx] Cast Op special handling for 
ShapeExpr input (#17061)
f5d3fc264d is described below

commit f5d3fc264d4a9c7c31fbaba8413cbd81eea963e8
Author: tsu-bin <81693503+tsu-...@users.noreply.github.com>
AuthorDate: Tue Jun 4 08:34:34 2024 +0800

[Relax][Frontend][Onnx] Cast Op special handling for ShapeExpr input 
(#17061)

Co-authored-by: tsu-bin 
---
 python/tvm/relax/frontend/onnx/onnx_frontend.py | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/python/tvm/relax/frontend/onnx/onnx_frontend.py 
b/python/tvm/relax/frontend/onnx/onnx_frontend.py
index 86c77538e8..ba121b7ec4 100644
--- a/python/tvm/relax/frontend/onnx/onnx_frontend.py
+++ b/python/tvm/relax/frontend/onnx/onnx_frontend.py
@@ -442,6 +442,11 @@ class Cast(OnnxOpConverter):
 @classmethod
 def _impl_v13(cls, bb, inputs, attr, params):
 to_type = get_type(attr["to"])
+if isinstance(inputs[0], relax.ShapeExpr):
+shape = inputs[0]
+if all([isinstance(x, tir.IntImm) for x in shape]):
+shape = [int(x) for x in shape]
+return relax.const(shape, to_type)
 if isinstance(inputs[0], relax.Constant):
 output = inputs[0].data.numpy().astype(to_type)
 return relax.const(output, to_type)
@@ -2210,6 +2215,7 @@ class ONNXGraphImporter:
 "Concat",
 "Equal",
 "Where",
+"Cast",
 ]
 for i, inp in enumerate(inputs):
 if (



(tvm) branch main updated: Introduce outer reduction for metal (#17058)

2024-06-03 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 1c05902017 Introduce outer reduction for metal (#17058)
1c05902017 is described below

commit 1c05902017e85d79388f0b919757c3d883799c06
Author: Siyuan Feng 
AuthorDate: Tue Jun 4 08:34:18 2024 +0800

Introduce outer reduction for metal (#17058)
---
 python/tvm/dlight/gpu/gemv.py  |  92 +++
 python/tvm/dlight/gpu/low_batch_gemv.py| 227 
 python/tvm/dlight/gpu/utils.py |  24 +-
 tests/python/dlight/test_gpu_gemv.py   | 359 +++--
 tests/python/dlight/test_gpu_low_batch_gemv.py | 146 ++
 5 files changed, 426 insertions(+), 422 deletions(-)

diff --git a/python/tvm/dlight/gpu/gemv.py b/python/tvm/dlight/gpu/gemv.py
index 9ad6f3f89a..ce1c5986e1 100644
--- a/python/tvm/dlight/gpu/gemv.py
+++ b/python/tvm/dlight/gpu/gemv.py
@@ -18,7 +18,7 @@
 from functools import reduce
 from typing import List, Optional, Union
 
-from tvm import DataType, arith, ir, tir
+from tvm import arith, ir, tir
 from tvm.target import Target
 
 from ..base import (
@@ -31,6 +31,7 @@ from ..base import (
 try_inline_contiguous_spatial,
 )
 from .base import GPUScheduleRule
+from .utils import auto_vectorize, get_bytes, get_extent
 
 
 def _get_reduction_expr(block: tir.Block) -> Optional[tir.PrimExpr]:
@@ -49,17 +50,6 @@ def _get_reduction_expr(block: tir.Block) -> 
Optional[tir.PrimExpr]:
 return buffer_store.value.b
 
 
-def get_extent(sch: tir.Schedule, loop_rv: tir.schedule.LoopRV):
-loop: tir.For = sch.get(loop_rv)
-return loop.extent.value if isinstance(loop.extent, tir.IntImm) else 
loop.extent
-
-
-def get_bytes(dtype: Union[DataType, str]) -> int:
-if isinstance(dtype, str):
-dtype = DataType(dtype)
-return dtype.itemsize()
-
-
 def is_gemv(sch: tir.Schedule, block_info: BlockInfo) -> 
Optional[List[tir.Buffer]]:
 """Check if the block is a GEMV.
 
@@ -207,17 +197,13 @@ class GEMV(GPUScheduleRule):
 return None
 elif is_inner_reduction:
 return self.sch_inner_reduction(sch, target, block, 
vector_input_buffers, epilogue)
-elif target.kind.name == "opencl" and "android" in str(target.host):
+else:
 ret = self.sch_outer_reduction(sch, target, block, 
vector_input_buffers, epilogue)
 if ret is None:
 return self.sch_outer_reduction_fallback(
 sch, target, block, vector_input_buffers, epilogue
 )
 return sch
-else:
-return self.sch_outer_reduction_fallback(
-sch, target, block, vector_input_buffers, epilogue
-)
 
 def sch_inner_reduction(  # pylint: disable=too-many-arguments, 
invalid-name, unused-argument
 self,
@@ -535,9 +521,11 @@ class GEMV(GPUScheduleRule):
 
 TILE_S, TILE_R = (
 1,
-len_c
-if len_c > 1
-else max(get_max_factor(len_r, [TR * 1, TR * 2, TR * 4, TR * 8]) 
// TR, 1),
+(
+len_c
+if len_c > 1
+else max(get_max_factor(len_r, [TR * 1, TR * 2, TR * 4, TR * 
8]) // TR, 1)
+),
 )
 VEC_C = min(get_max_factor(TILE_R, [1, 2, 4, 8]), VEC_C)
 
@@ -614,9 +602,9 @@ class GEMV(GPUScheduleRule):
 sch.reorder(bx, ts, tr, r, v_tile, tile_r, vec_c)
 # sch.bind(batch, "blockIdx.z")
 sch.bind(bx, "blockIdx.x")
-sch.bind(ts, "threadIdx.x")
-sch.bind(tr, "threadIdx.y")
-sch.vectorize(vec_c)
+sch.bind(ts, TAG_S)
+sch.bind(tr, TAG_R)
+auto_vectorize(sch, vec_c, VEC_C)
 
 # decompose independent scale read to outer loop
 block_rf_stmt = sch.get(rf)
@@ -635,26 +623,26 @@ class GEMV(GPUScheduleRule):
 V_shared = sch.cache_read(rf, read_buffer_index=0, 
storage_scope="shared")
 sch.compute_at(V_shared, r, preserve_unit_loops=True)
 l = sch.get_loops(block=V_shared)[-1]
-_, v_tile, tx, ty, vec = sch.split(
+_, v_tile, ts, tr, vec = sch.split(
 l, factors=[None, LOAD_V_TILE, TS, TR, LOAD_V_VEC], 
preserve_unit_iters=True
 )
-sch.bind(ty, "threadIdx.y")
-sch.bind(tx, "threadIdx.x")
-sch.vectorize(vec)
+sch.bind(tr, TAG_R)
+sch.bind(ts, TAG_S)
+auto_vectorize(sch, vec, LOAD_V_VEC)
 
 # reduce tile_s * tr * vec to tile_s * tr
 sch.reverse_compute_at(rf2, l

(tvm) branch main updated: [Runtime] Stateless interface of PagedKVCache leaf node commit (#17057)

2024-06-02 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new b87d1f9b01 [Runtime] Stateless interface of PagedKVCache leaf node 
commit (#17057)
b87d1f9b01 is described below

commit b87d1f9b0124877769d537d4748c63546d2b2d8b
Author: Ruihang Lai 
AuthorDate: Sun Jun 2 07:43:09 2024 -0400

[Runtime] Stateless interface of PagedKVCache leaf node commit (#17057)

This PR changes the interface of the function
`CommitAcceptedTokenTreeNodeToKVCache`, introduced recently for
PagedKVCache, to a stateless interface. Previously the interface
was stateful, which made strong assumptions on the caller
side. This commit removes those assumptions so that the interface
is less confusing.
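
A hedged sketch of what the stateless call looks like from Python; the global
name matches the registration in this patch, while `kv_cache`, the sequence
ids, and the leaf indices are illustrative assumptions.

```python
# Hedged sketch: commit accepted token-tree nodes, passing both the sequence
# ids and the per-sequence accepted leaf indices explicitly.
# `kv_cache` is assumed to be an existing PagedKVCache object.
import tvm
from tvm.runtime import ShapeTuple

fcommit = tvm.get_global_func(
    "vm.builtin.attention_kv_cache_commit_accepted_token_tree_nodes"
)
fcommit(kv_cache, ShapeTuple([0, 1]), ShapeTuple([3, 2]))  # illustrative values
```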
---
 src/runtime/relax_vm/kv_state.h|   4 +-
 src/runtime/relax_vm/paged_kv_cache.cc | 177 +
 ...runtime_builtin_paged_attention_kv_cache_tir.py |   9 +-
 3 files changed, 119 insertions(+), 71 deletions(-)

diff --git a/src/runtime/relax_vm/kv_state.h b/src/runtime/relax_vm/kv_state.h
index 8de560f122..f4d6036b96 100644
--- a/src/runtime/relax_vm/kv_state.h
+++ b/src/runtime/relax_vm/kv_state.h
@@ -151,9 +151,11 @@ class AttentionKVCacheObj : public KVStateObj {
* The commit will update the KV cache, by compacting the KV data and discard
* the KV data of rejected tokens.
* This is a mandatory step when the BeginForward is given with a token tree.
+   * \param seq_ids The ids of the sequences to commit.
* \param leaf_indices The leaf token tree node index of each sequence.
*/
-  virtual void CommitAcceptedTokenTreeNodes(const IntTuple& leaf_indices) = 0;
+  virtual void CommitAcceptedTokenTreeNodes(const IntTuple& seq_ids,
+const IntTuple& leaf_indices) = 0;
 
   /** Attention **/
 
diff --git a/src/runtime/relax_vm/paged_kv_cache.cc 
b/src/runtime/relax_vm/paged_kv_cache.cc
index a5b970e817..2fc5da78e9 100644
--- a/src/runtime/relax_vm/paged_kv_cache.cc
+++ b/src/runtime/relax_vm/paged_kv_cache.cc
@@ -151,6 +151,18 @@ struct Sequence {
*/
   int last_block_attn_sink_size = 0;
 
+  /*! \brief Whether the current appended tokens form a chain (not a tree). */
+  bool is_chain = true;
+  /*! \brief The token tree parent pointer array of the current appended 
tokens. */
+  std::vector token_tree_parent_ptr;
+  /*! \brief The depth of each node in the token tree. */
+  std::vector token_tree_node_depths;
+  /*!
+   * \brief A boolean denoting whether the accepted token tree indices of
+   * this sequence are committed
+   */
+  bool accepted_indices_committed = true;
+
   explicit Sequence(std::vector* global_block_pool, int32_t 
last_block_idx) {
 ++global_block_pool->at(last_block_idx).external_ref_cnt;
 this->last_block_idx = last_block_idx;
@@ -879,10 +891,6 @@ class PagedAttentionKVCacheObj : public 
AttentionKVCacheObj {
   IntTuple cur_seq_ids_;
   /*! \brief The append lengths of the sequences in the current round of 
forwarding. */
   IntTuple cur_append_lengths_;
-  /*! \brief The token tree parent array of the sequences in the current round 
of forwarding. */
-  IntTuple cur_token_tree_parent_ptr_{nullptr};
-  /*! \brief The depth of each node in the token tree, for the sequences in 
the current batch. */
-  std::vector> cur_token_tree_node_depths_;
   /*! \brief Whether the current batch of sequences are token chains (not 
token trees). */
   bool is_chain_;
   /*! \brief Number of fork depth in the current round of forward. */
@@ -1187,6 +1195,9 @@ class PagedAttentionKVCacheObj : public 
AttentionKVCacheObj {
 << "The forked position should be non-negative, or -1 for last 
position as default.";
 CHECK_LE(fork_pos, parent_it->second.seq_length)
 << "The forked position should not exceed the total length of parent 
sequence.";
+CHECK(parent_it->second.accepted_indices_committed)
+<< "The parent sequence's token tree computed in the last round of 
forward has not been "
+   "committed with accepted nodes.";
 
 int32_t child_block_idx = GetFreeBlock();
 if (fork_pos == -1 || fork_pos == parent_it->second.seq_length) {
@@ -1434,10 +1445,6 @@ class PagedAttentionKVCacheObj : public 
AttentionKVCacheObj {
 
   void BeginForward(const IntTuple& seq_ids, const IntTuple& append_lengths,
 const Optional& opt_token_tree_parent_ptr) final 
{
-CHECK(!cur_token_tree_parent_ptr_.defined())
-<< "The last round of forward which involves token tree has not been 
committed. Please "
-   "call \"CommitAcceptedTreeNodes\" to commit the accepted tokens.";

(tvm) branch main updated: Use adapter.info when available instead of requestAdapterInfo (#17051)

2024-06-01 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 4ab91d4c4f Use adapter.info when available instead of 
requestAdapterInfo (#17051)
4ab91d4c4f is described below

commit 4ab91d4c4fb20aee02717b08f0597e06fb2675bd
Author: François Beaufort 
AuthorDate: Sat Jun 1 13:02:09 2024 +0200

Use adapter.info when available instead of requestAdapterInfo (#17051)

* Use adapter.info when available instead of requestAdapterInfo

* Update package.json
---
 web/package.json  | 2 +-
 web/src/webgpu.ts | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/web/package.json b/web/package.json
index a8a552f3fc..63aa63cd5a 100644
--- a/web/package.json
+++ b/web/package.json
@@ -25,7 +25,7 @@
 "@types/node": "^20.4.5",
 "@typescript-eslint/eslint-plugin": "^5.59.6",
 "@typescript-eslint/parser": "^5.59.6",
-"@webgpu/types": "^0.1.40",
+"@webgpu/types": "^0.1.42",
 "eslint": "^8.41.0",
 "jest": "^26.0.1",
 "rollup": "^2.56.2",
diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts
index 10d4aab643..bd8d236974 100644
--- a/web/src/webgpu.ts
+++ b/web/src/webgpu.ts
@@ -105,7 +105,7 @@ export async function detectGPUDevice(): 
Promise

(tvm) branch main updated: [Runtime] Support PagedKVCache with tree attention (#17049)

2024-06-01 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 31f4721596 [Runtime] Support PagedKVCache with tree attention (#17049)
31f4721596 is described below

commit 31f47215965b3a4d58a0ee1f450965a43ce2fcd0
Author: Ruihang Lai 
AuthorDate: Sat Jun 1 07:01:56 2024 -0400

[Runtime] Support PagedKVCache with tree attention (#17049)

* [Runtime] Support PagedKVCache with tree attention

This PR introduces the tree attention to PagedKVCache. With this
feature, now the KV cache is ready for tree attention cases such as
speculative decoding trees.

This PR adds tree attention tests to test the correctness.

The changes in this PR to KVState interface are backward compatible.

* Update kv_state.cc

* Update kv_state.cc

-

Co-authored-by: Tianqi Chen 
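
To illustrate the token-tree layout this feature operates on, here is a
plain-Python sketch (not code from this PR): each appended token records the
index of its parent within the same batch, a negative value is assumed to mark
a root, and the cache derives per-node depths from this array.

```python
# Sketch of the parent-pointer encoding of a 4-token speculative tree:
# token 0 is the root, tokens 1 and 2 are its children, token 3 extends token 1.
token_tree_parent_ptr = [-1, 0, 0, 1]

def node_depths(parents):
    depths = []
    for parent in parents:
        depths.append(0 if parent < 0 else depths[parent] + 1)
    return depths

assert node_depths(token_tree_parent_ptr) == [0, 1, 1, 2]
```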
---
 src/runtime/relax_vm/kv_state.cc   |  15 +-
 src/runtime/relax_vm/kv_state.h|  15 +-
 src/runtime/relax_vm/paged_kv_cache.cc | 657 +
 src/runtime/relax_vm/rnn_state.cc  |  16 +-
 ...runtime_builtin_paged_attention_kv_cache_tir.py | 561 +-
 5 files changed, 1149 insertions(+), 115 deletions(-)

diff --git a/src/runtime/relax_vm/kv_state.cc b/src/runtime/relax_vm/kv_state.cc
index b1572bf409..b730a4eb07 100644
--- a/src/runtime/relax_vm/kv_state.cc
+++ b/src/runtime/relax_vm/kv_state.cc
@@ -40,13 +40,26 @@ TVM_REGISTER_GLOBAL("vm.builtin.kv_state_fork_sequence")
 .set_body_method(::ForkSequence);
 
TVM_REGISTER_GLOBAL("vm.builtin.kv_state_popn").set_body_method(::PopN);
 TVM_REGISTER_GLOBAL("vm.builtin.kv_state_begin_forward")
-.set_body_method(::BeginForward);
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+  CHECK(args.size() == 3 || args.size() == 4)
+  << "KVState BeginForward only accepts 3 or 4 arguments";
+  KVState kv_state = args[0];
+  IntTuple seq_ids = args[1];
+  IntTuple append_lengths = args[2];
+  Optional token_tree_parent_ptr{nullptr};
+  if (args.size() == 4) {
+token_tree_parent_ptr = args[3].operator Optional();
+  }
+  kv_state->BeginForward(seq_ids, append_lengths, token_tree_parent_ptr);
+});
 TVM_REGISTER_GLOBAL("vm.builtin.kv_state_end_forward")
 .set_body_method(::EndForward);
 
 // Attention KV Cache methods
 
TVM_REGISTER_GLOBAL("vm.builtin.attention_kv_cache_enable_sliding_window_for_seq")
 
.set_body_method(::EnableSlidingWindowForSeq);
+TVM_REGISTER_GLOBAL("vm.builtin.attention_kv_cache_commit_accepted_token_tree_nodes")
+
.set_body_method(::CommitAcceptedTokenTreeNodes);
 TVM_REGISTER_GLOBAL("vm.builtin.attention_kv_cache_empty")
 .set_body_method(::Empty);
 TVM_REGISTER_GLOBAL("vm.builtin.attention_kv_cache_get_num_available_pages")
diff --git a/src/runtime/relax_vm/kv_state.h b/src/runtime/relax_vm/kv_state.h
index 12a18ba895..8de560f122 100644
--- a/src/runtime/relax_vm/kv_state.h
+++ b/src/runtime/relax_vm/kv_state.h
@@ -89,8 +89,12 @@ class KVStateObj : public Object {
* in the model forward function.
* \param seq_ids The ids of the sequence to run in the incoming model 
forward.
   * \param append_lengths The sequence lengths to run forward for each sequence.
+   * \param token_tree_parent_ptr The parent idx array of the token trees. Its 
length
+   * is the sum of "append_lengths". Nullptr means the token tree of each 
sequence
+   * is a chain.
*/
-  virtual void BeginForward(const IntTuple& seq_ids, const IntTuple& 
append_lengths) = 0;
+  virtual void BeginForward(const IntTuple& seq_ids, const IntTuple& 
append_lengths,
+const Optional& token_tree_parent_ptr = 
NullOpt) = 0;
 
   /*!
* \brief Mark the start of the forward function.
@@ -142,6 +146,15 @@ class AttentionKVCacheObj : public KVStateObj {
   virtual void EnableSlidingWindowForSeq(int64_t seq_id, int32_t 
sliding_window_size,
  int32_t attn_sink_size) = 0;
 
+  /*!
+   * \brief Commit the accepted token tree nodes to the KV cache.
+   * The commit will update the KV cache, by compacting the KV data and discard
+   * the KV data of rejected tokens.
+   * This is a mandatory step when the BeginForward is given with a token tree.
+   * \param leaf_indices The leaf token tree node index of each sequence.
+   */
+  virtual void CommitAcceptedTokenTreeNodes(const IntTuple& leaf_indices) = 0;
+
   /** Attention **/
 
   /*!
diff --git a/src/runtime/relax_vm/paged_kv_cache.cc 
b/src/runtime/relax_vm/paged_kv_cache.cc
index 4ab0f3f0c6..a5b970e817 100644
--- a/src/runtime/relax_vm/pag

(tvm) branch main updated: [DLight] Skip GEMV rules when more than one vector (#17052)

2024-05-31 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 515c07937b [DLight] Skip GEMV rules when more than one vector (#17052)
515c07937b is described below

commit 515c07937bbf9c0bd7575928217c258caaa5867c
Author: Siyuan Feng 
AuthorDate: Fri May 31 22:26:50 2024 +0800

[DLight] Skip GEMV rules when more than one vector (#17052)

The current dlight GEMV rule requires exactly one vector buffer and otherwise
raises an error. This PR changes that behavior to skip the rule instead.
---
 python/tvm/dlight/gpu/gemv.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/tvm/dlight/gpu/gemv.py b/python/tvm/dlight/gpu/gemv.py
index b8a2c6a15f..9ad6f3f89a 100644
--- a/python/tvm/dlight/gpu/gemv.py
+++ b/python/tvm/dlight/gpu/gemv.py
@@ -206,8 +206,7 @@ class GEMV(GPUScheduleRule):
 if is_inner_reduction is None:
 return None
 elif is_inner_reduction:
-self.sch_inner_reduction(sch, target, block, vector_input_buffers, 
epilogue)
-return sch
+return self.sch_inner_reduction(sch, target, block, 
vector_input_buffers, epilogue)
 elif target.kind.name == "opencl" and "android" in str(target.host):
 ret = self.sch_outer_reduction(sch, target, block, 
vector_input_buffers, epilogue)
 if ret is None:
@@ -313,7 +312,8 @@ class GEMV(GPUScheduleRule):
 
 # load vector into shared memory, shape should be the whole vector
 if LOAD_V_SHARED:
-assert len(vector_input_buffers) == 1
+if len(vector_input_buffers) != 1:
+return None
 V_shared = sch.cache_read(rf, read_buffer_index=0, 
storage_scope="shared")
 sch.compute_at(V_shared, tr, preserve_unit_loops=True)
 l = sch.get_loops(block=V_shared)[-1]



(tvm-rfcs) branch main updated: [RFC] Add NNEF frontend (#108)

2024-05-30 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm-rfcs.git


The following commit(s) were added to refs/heads/main by this push:
 new f0f982f  [RFC] Add NNEF frontend (#108)
f0f982f is described below

commit f0f982f2bf8168b5953f0193610c0aea977c75a8
Author: Czobor Ágoston Mátyás <73029973+agoston...@users.noreply.github.com>
AuthorDate: Fri May 31 02:29:27 2024 +0200

[RFC] Add NNEF frontend (#108)

* [RFC] Add NNEF frontend (#108)

* update md

* Add Relax to RFC
---
 rfcs/0108-add-nnef-frontend.md | 132 +
 1 file changed, 132 insertions(+)

diff --git a/rfcs/0108-add-nnef-frontend.md b/rfcs/0108-add-nnef-frontend.md
new file mode 100644
index 000..db7aebc
--- /dev/null
+++ b/rfcs/0108-add-nnef-frontend.md
@@ -0,0 +1,132 @@
+- Feature Name: `NNEF frontend to Relay and Relax`
+- Start Date: 2024-04-11
+- RFC PR: [apache/tvm-rfcs#0108](https://github.com/apache/tvm-rfcs/pull/0108)
+- GitHub Issue: [apache/tvm#](https://github.com/apache/tvm/issues/)
+
+# Summary
+[summary]: #summary
+
+Add the Khronos Neural Network Exchange Format (NNEF) as a frontend to TVM 
Relay and Relax.
+
+# Motivation
+[motivation]: #motivation
+
+NNEF is an open, standardized format for neural network exchange developed by 
the Khronos Group since 2018 (https://www.khronos.org/nnef). It is aimed at 
deploying trained neural networks from deep learning frameworks to proprietary 
inference engines of neural network hardware vendors. Such inference engines 
often require an offline compilation step for running models more efficiently, 
hence hardware vendors are are looing into open source compiler stacks to be 
leveraged. On one hand, ha [...]
+
+The Khronos Group also maintains a set of tools for handling NNEF models. 
Since NNEF is mainly a textual format, these include a parser (with C++ and 
Python interfaces), and conversion tools from other formats. NNEF supports 
conversion from models of various deep learning frameworks, including Caffe, 
TensorFlow (also Lite) and all those that support ONNX, such as PyTorch. 
Creating NNEF models is also possible manually by directly writing the model 
text file(s) (since NNEF is similar to a [...]
+
+For example, loading an NNEF model in Python is as simple as follows:
+
+```python
+import nnef
+graph = nnef.load_graph('example.nnef')
+```
+
+The resulting graph object, containing tensors and operators can then be 
traversed and processed, for example converted into TVM representation, as done 
in this PR.
+
+The NNEF tools also provide a simple C++ based reference implementation for 
NNEF models, whose main purpose is testing/debugging conversions, and serving 
as a reference for other more efficient inference backends. Furthermore, a 
PyTorch based interpreter is also supported, which is able to execute NNEF 
models via on-the-fly conversion to PyTorch calls, and can also be used as a 
(more efficient) reference.
+
+
+# Guide-level explanation
+[guide-level-explanation]: #guide-level-explanation
+
+We are going to add support for models in NNEF format. The model may be 
provided either as an NNEF model folder, or an `nnef.Graph` object 
+already loaded into memory.
+The conversion is done via the new frontend function
+```python
+# for relay frontend:
+import tvm.relay as relay
+mod, params = relay.frontend.from_nnef(model, freeze_vars=False)
+```
+- model: either a string / PathLike to an NNEF model folder, or an 
`nnef.Graph` object.
+- freeze_vars: bool (optional), which sets whether the parameters should be 
considered variables or constants for optimization.
+
+```python
+# for relax frontend:
+import tvm.relax as relax
+import tvm.relax.frontend.nnef
+mod = relax.frontend.nnef.from_nnef(model, keep_params_in_input=False)
+```
+- model: either a string / PathLike to an NNEF model folder, or an 
`nnef.Graph` object.
+- keep_params_in_input: bool (optional), sets whether the nnef variables will 
be converted to constants and folded into the model, or need to be given as 
inputs.
+
+
+Example usages (assuming we have a valid NNEF model)
+```python
+import nnef
+from tvm import relay
+
+model_path = 'path/to/model.nnef'
+
+# If modification is warranted the graph can be imported with 
`nnef.load_graph` 
+graph = nnef.load_graph(model_path)
+
+mod, params = relay.frontend.from_nnef(graph)
+
+# Or the converter can read the graph from path as well
+
+mod, params = relay.frontend.from_nnef(model_path)
+
+```
+
+
+```python
+import tvm.relax as relax
+import tvm.relax.frontend.nnef
+
+model_path = 'path/to/model.nnef'
+
+# If modification is warranted the graph can be imported with 
`nnef.load_graph` 
+graph = nnef.load_graph(model_path)
+
+mod = relax.frontend.nnef.from_nnef(graph)
+
+# Or the converter can read the graph from path as well
+mod = relax.frontend.nnef.from_nnef(mode

(tvm) branch main updated: [Runtime] Fix PagedKVCache for PopN and enhance tests (#17045)

2024-05-30 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 1eac17857f [Runtime] Fix PagedKVCache for PopN and enhance tests 
(#17045)
1eac17857f is described below

commit 1eac17857fc95a28e1cbaf90a9c34575807622e1
Author: Ruihang Lai 
AuthorDate: Thu May 30 15:13:12 2024 -0400

[Runtime] Fix PagedKVCache for PopN and enhance tests (#17045)

This PR fixes a bug in the PagedKVCache which may be triggered when the
sequence removal order is not the reverse of the sequence
add/fork order. With this fix, the PagedKVCache now
supports removing sequences in any order without breaking.

This PR also adds an `empty` function to PagedKVCache to check if
the KV cache is empty. Right now this function is only used for test
purpose, where we check if everything in the KV cache is freed after
removing all sequences.
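
A hedged sketch of reaching the new query from Python through the packed
function registry; the global name matches the registration below, and
`kv_cache` is assumed to be an existing PagedKVCache instance.

```python
# Hedged sketch: after removing every sequence, the cache should report empty.
import tvm

fempty = tvm.get_global_func("vm.builtin.attention_kv_cache_empty")
assert fempty(kv_cache)  # True once all sequences have been removed
```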
---
 src/runtime/relax_vm/kv_state.cc   |  2 +
 src/runtime/relax_vm/kv_state.h|  2 +
 src/runtime/relax_vm/paged_kv_cache.cc | 49 ++
 ...runtime_builtin_paged_attention_kv_cache_tir.py | 30 +++--
 4 files changed, 62 insertions(+), 21 deletions(-)

diff --git a/src/runtime/relax_vm/kv_state.cc b/src/runtime/relax_vm/kv_state.cc
index 05ba7c9650..b1572bf409 100644
--- a/src/runtime/relax_vm/kv_state.cc
+++ b/src/runtime/relax_vm/kv_state.cc
@@ -47,6 +47,8 @@ TVM_REGISTER_GLOBAL("vm.builtin.kv_state_end_forward")
 // Attention KV Cache methods
 
TVM_REGISTER_GLOBAL("vm.builtin.attention_kv_cache_enable_sliding_window_for_seq")
 
.set_body_method(::EnableSlidingWindowForSeq);
+TVM_REGISTER_GLOBAL("vm.builtin.attention_kv_cache_empty")
+.set_body_method(::Empty);
 TVM_REGISTER_GLOBAL("vm.builtin.attention_kv_cache_get_num_available_pages")
 
.set_body_method(::GetNumAvailablePages);
 TVM_REGISTER_GLOBAL("vm.builtin.attention_kv_cache_get_total_sequence_length")
diff --git a/src/runtime/relax_vm/kv_state.h b/src/runtime/relax_vm/kv_state.h
index 7b90ffce50..12a18ba895 100644
--- a/src/runtime/relax_vm/kv_state.h
+++ b/src/runtime/relax_vm/kv_state.h
@@ -117,6 +117,8 @@ class AttentionKVCacheObj : public KVStateObj {
  public:
   /** Raw Info Query **/
 
+  /*! \brief Check if the KV cache is empty. */
+  virtual bool Empty() const = 0;
   /*!
* \brief Get the number of available pages in the KV cache.
* When the underlying KV cache implementation is not
diff --git a/src/runtime/relax_vm/paged_kv_cache.cc 
b/src/runtime/relax_vm/paged_kv_cache.cc
index 62750d6d7d..4ab0f3f0c6 100644
--- a/src/runtime/relax_vm/paged_kv_cache.cc
+++ b/src/runtime/relax_vm/paged_kv_cache.cc
@@ -147,13 +147,14 @@ struct Sequence {
*/
   int last_block_attn_sink_size = 0;
 
-  explicit Sequence(const std::vector& global_block_pool, int32_t 
last_block_idx) {
+  explicit Sequence(std::vector* global_block_pool, int32_t 
last_block_idx) {
+++global_block_pool->at(last_block_idx).external_ref_cnt;
 this->last_block_idx = last_block_idx;
 int32_t block_ptr = last_block_idx;
 // Go through each block in the sequence, sum up the length.
 int depth = 0;
 while (true) {
-  const Block& block = global_block_pool[block_ptr];
+  const Block& block = global_block_pool->at(block_ptr);
   this->seq_length += block.seq_length;
   ++depth;
   if (block.parent_idx == -1) {
@@ -965,7 +966,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj 
{
 CHECK(seq_map_.find(seq_id) == seq_map_.end())
 << "The sequence \"" << seq_id << "\" is already in the KV cache.";
 int32_t block_idx = GetFreeBlock();
-seq_map_.insert({seq_id, Sequence(global_block_pool_, block_idx)});
+seq_map_.insert({seq_id, Sequence(_block_pool_, block_idx)});
 dirty_aux_data_device_ = true;
   }
 
@@ -973,9 +974,9 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj 
{
 auto it = seq_map_.find(seq_id);
 CHECK(it != seq_map_.end()) << "The sequence \"" << seq_id << "\" cannot 
be found in KV cache.";
 int32_t block_idx = it->second.last_block_idx;
-CHECK_EQ(global_block_pool_[block_idx].external_ref_cnt, 0)
-<< "The sequence is currently referenced by other sequence and thus 
cannot be removed.";
-while (block_idx != -1 && global_block_pool_[block_idx].external_ref_cnt 
== 0) {
+// The block should have at least one reference, which comes from the 
sequence.
+ICHECK_GE(global_block_pool_[block_idx].external_ref_cnt, 1);
+while (block_idx != -1 && global_block_pool_[block_idx].exte

(tvm) branch main updated: [Runtime] Compatibility with dmlc::Stream API changes (#16998)

2024-05-30 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 820f1b617a [Runtime] Compatibility with dmlc::Stream API changes 
(#16998)
820f1b617a is described below

commit 820f1b617a4f8ccf196803c5e48a4f155c929c4a
Author: Eric Lunderberg 
AuthorDate: Thu May 30 11:41:03 2024 -0500

[Runtime] Compatibility with dmlc::Stream API changes (#16998)

* [Runtime] Compatibility with dmlc::Stream API changes

This commit updates TVM implementations of `dmlc::Stream`.  With
https://github.com/dmlc/dmlc-core/pull/686, this API now requires
the `Write` method to return the number of bytes written.  This change
allows partial writes to be correctly handled.

* Update dmlc-core version

* lint fix
---
 3rdparty/dmlc-core|  2 +-
 src/runtime/disco/process_session.cc  |  3 ++-
 src/runtime/disco/threaded_session.cc |  3 ++-
 src/runtime/file_utils.h  |  8 ++--
 src/runtime/rpc/rpc_endpoint.cc   |  8 ++--
 src/runtime/rpc/rpc_socket_impl.cc|  7 ++-
 src/support/base64.h  |  5 +++--
 src/support/pipe.h| 24 +++-
 8 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
index 09511cf9fe..3031e4a61a 16
--- a/3rdparty/dmlc-core
+++ b/3rdparty/dmlc-core
@@ -1 +1 @@
-Subproject commit 09511cf9fe5ff103900a5eafb50870dc84cc17c8
+Subproject commit 3031e4a61a98f49f07a42cfdec6242340fb2fd8c
diff --git a/src/runtime/disco/process_session.cc 
b/src/runtime/disco/process_session.cc
index b507758777..179010db8a 100644
--- a/src/runtime/disco/process_session.cc
+++ b/src/runtime/disco/process_session.cc
@@ -113,10 +113,11 @@ class DiscoPipeMessageQueue : private dmlc::Stream, 
private DiscoProtocolWrite(data, 
size); }
+  // write the data to the channel.
+  size_t Write(const void* data, size_t size) final {
+writer_->Write(data, size);
+return size;
+  }
+
   // Number of pending bytes requests
   size_t pending_request_bytes_{0};
   // The ring buffer to read data from.
diff --git a/src/runtime/rpc/rpc_socket_impl.cc 
b/src/runtime/rpc/rpc_socket_impl.cc
index 1d0b5d5470..6882ba4ded 100644
--- a/src/runtime/rpc/rpc_socket_impl.cc
+++ b/src/runtime/rpc/rpc_socket_impl.cc
@@ -159,11 +159,8 @@ class SimpleSockHandler : public dmlc::Stream {
   // Internal supporting.
   // Override methods that inherited from dmlc::Stream.
  private:
-  size_t Read(void* data, size_t size) final {
-ICHECK_EQ(sock_.RecvAll(data, size), size);
-return size;
-  }
-  void Write(const void* data, size_t size) final { 
ICHECK_EQ(sock_.SendAll(data, size), size); }
+  size_t Read(void* data, size_t size) final { return sock_.Recv(data, size); }
+  size_t Write(const void* data, size_t size) final { return sock_.Send(data, 
size); }
 
   // Things of current class.
  private:
diff --git a/src/support/base64.h b/src/support/base64.h
index aba4197bce..2bfc42c27f 100644
--- a/src/support/base64.h
+++ b/src/support/base64.h
@@ -206,7 +206,7 @@ class Base64InStream : public dmlc::Stream {
 }
 return size - tlen;
   }
-  virtual void Write(const void* ptr, size_t size) {
+  size_t Write(const void* ptr, size_t size) final {
 LOG(FATAL) << "Base64InStream do not support write";
   }
 
@@ -229,7 +229,7 @@ class Base64OutStream : public dmlc::Stream {
 
   using dmlc::Stream::Write;
 
-  void Write(const void* ptr, size_t size) final {
+  size_t Write(const void* ptr, size_t size) final {
 using base64::EncodeTable;
 size_t tlen = size;
 const unsigned char* cptr = static_cast(ptr);
@@ -247,6 +247,7 @@ class Base64OutStream : public dmlc::Stream {
 buf__top_ = 0;
   }
 }
+return size;
   }
   virtual size_t Read(void* ptr, size_t size) {
 LOG(FATAL) << "Base64OutStream do not support read";
diff --git a/src/support/pipe.h b/src/support/pipe.h
index 7251a6f14a..9d5aa1e486 100644
--- a/src/support/pipe.h
+++ b/src/support/pipe.h
@@ -112,8 +112,8 @@ class Pipe : public dmlc::Stream {
* \param size block size
* \return the size of data read
*/
-  void Write(const void* ptr, size_t size) final {
-if (size == 0) return;
+  size_t Write(const void* ptr, size_t size) final {
+if (size == 0) return 0;
 #ifdef _WIN32
 auto fwrite = [&]() -> ssize_t {
   DWORD nwrite;
@@ -124,18 +124,16 @@ class Pipe : public dmlc::Stream {
 DWORD nwrite = static_cast(RetryCallOnEINTR(fwrite, 
GetLastErrorCode));
 ICHECK_EQ(static_cast(nwrite), size) << "Write Error: " << 
GetLastError();
 #else
-while (size) {
-  ssize_t nwrite =
-  RetryCallOnEINTR([&]() { return write(handle_, ptr, size); }, 
GetLastErrorCode);
-  ICHECK_NE(n

(tvm) branch main updated: [Disco][QoL] Implement broadcast/scatter methods for Session (#17035)

2024-05-30 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 7c2c0d9337 [Disco][QoL] Implement broadcast/scatter methods for 
Session (#17035)
7c2c0d9337 is described below

commit 7c2c0d9337f3b353576bccc30f61c16abcc633a7
Author: Eric Lunderberg 
AuthorDate: Thu May 30 06:28:50 2024 -0500

[Disco][QoL] Implement broadcast/scatter methods for Session (#17035)

* [Disco][QoL] Implement broadcast/scatter methods for Session

Prior to this commit, use of the `disco.Session` API to broadcast or
scatter an array required several steps from the caller.

1. Allocate memory on worker0
2. Transfer data from the controller to worker0
3. Allocate memory on each worker
4. Broadcast/scatter data from worker0 to all workers

While exposing these steps is necessary for performance, especially
when used repeatedly, it can be tedious/error-prone to use for
initialization that is only performed once.

This commit adds utility methods `Session.broadcast` and
`Session.scatter`, which are implemented in terms of the existing
lower-level methods `Session.broadcast_from_worker0` and
`Session.scatter_from_worker0`.  These methods perform the transfer
from the controller to worker0, and from worker0 to all other
workers.

* lint fix
---
 python/tvm/runtime/disco/session.py | 102 +---
 tests/python/disco/test_ccl.py  |  70 ++---
 2 files changed, 158 insertions(+), 14 deletions(-)
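
The new convenience methods wrap the existing primitives. A minimal usage
sketch follows; the two-worker session setup, the "nccl" backend, the device
ids, and the array contents are assumptions made for illustration and are not
part of this commit.

    import numpy as np
    import tvm
    from tvm.runtime import disco

    # Assumed setup: a two-worker process session on a machine with NCCL.
    sess = disco.ProcessSession(num_workers=2)
    sess.init_ccl("nccl", 0, 1)

    data = np.arange(16, dtype="float32").reshape(4, 4)

    # Lower-level flow, still preferable when the transfer happens repeatedly:
    staged = sess.empty(data.shape, data.dtype, worker0_only=True)
    sess.copy_to_worker_0(tvm.nd.array(data), staged)
    dst = sess.empty(data.shape, data.dtype)
    sess.broadcast_from_worker0(staged, dst)

    # One-shot convenience added by this commit:
    dst = sess.broadcast(data)

Session.scatter follows the same pattern, splitting the controller-side array
across the workers instead of replicating it.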

diff --git a/python/tvm/runtime/disco/session.py 
b/python/tvm/runtime/disco/session.py
index 97edeff1d1..ddde1bc1f3 100644
--- a/python/tvm/runtime/disco/session.py
+++ b/python/tvm/runtime/disco/session.py
@@ -249,17 +249,34 @@ class Session(Object):
 """
 return _ffi_api.SessionCopyFromWorker0(self, host_array, remote_array) 
 # type: ignore # pylint: disable=no-member
 
-def copy_to_worker_0(self, host_array: NDArray, remote_array: DRef) -> 
None:
+def copy_to_worker_0(self, host_array: NDArray, remote_array: 
Optional[DRef] = None) -> DRef:
 """Copy the controller-side NDArray to worker-0.
 
 Parameters
 --
-host_array : numpy.ndarray
-The array to be copied from worker-0.
-remote_array : NDArray
-The NDArray on worker-0.
+host_array : NDArray
+
+The array to be copied to worker-0.
+
+remote_array : Optional[DRef]
+
+The destination NDArray on worker-0.
+
+Returns
+---
+output_array: DRef
+
+The DRef containing the copied data on worker0, and
+NullOpt on all other workers.  If `remote_array` was
+provided, this return value is the same as `remote_array`.
+Otherwise, it is the newly allocated space.
+
 """
-return _ffi_api.SessionCopyToWorker0(self, host_array, remote_array)  
# type: ignore # pylint: disable=no-member
+if remote_array is None:
+remote_array = self.empty(host_array.shape, host_array.dtype, 
worker0_only=True)
+
+_ffi_api.SessionCopyToWorker0(self, host_array, remote_array)  # type: 
ignore # pylint: disable=no-member
+return remote_array
 
 def load_vm_module(
 self,
@@ -302,6 +319,40 @@ class Session(Object):
 _ffi_api.SessionInitCCL(self, ccl, ShapeTuple(device_ids))  # type: 
ignore # pylint: disable=no-member
 self._clear_ipc_memory_pool()
 
+def broadcast(self, src: Union[np.ndarray, NDArray], dst: Optional[DRef] = 
None) -> DRef:
+"""Broadcast an array to all workers
+
+Parameters
+--
+src: Union[np.ndarray, NDArray]
+
+The array to be broadcasted.
+
+dst: Optional[DRef]
+
+The output array.  If None, an array matching the shape
+and dtype of `src` will be allocated on each worker.
+
+Returns
+---
+output_array: DRef
+
+The DRef containing the broadcasted data on all workers.
+If `dst` was provided, this return value is the same as
+`dst`.  Otherwise, it is the newly allocated space.
+
+"""
+if not isinstance(src, NDArray):
+src = _as_NDArray(src)
+
+if dst is None:
+dst = self.empty(src.shape, src.dtype)
+
+src_dref = self.copy_to_worker_0(src)
+self.broadcast_from_worker0(src_dref, dst)
+
+return dst
+
 def broadcast_from_worker0(self, src: DRef, dst: DRef) -> DRef:
 """Broadcast an array from worker-0 to all other workers.
 
@@ -313,6 +364,45 @@ class Session(Object):

(tvm) branch main updated: [Bugfix][Support] Fix copy constructor for support::OrderedSet (#17044)

2024-05-30 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new f6aab98ace [Bugfix][Support] Fix copy constructor for 
support::OrderedSet (#17044)
f6aab98ace is described below

commit f6aab98ace3c7c15df309b5a89f39ac3e92e5a6c
Author: Eric Lunderberg 
AuthorDate: Thu May 30 06:28:35 2024 -0500

[Bugfix][Support] Fix copy constructor for support::OrderedSet (#17044)

Prior to this commit, the `support::OrderedSet` utility used the
default copy constructor and copy assignment, which would copy both
the `OrderedSet::elements_` and `OrderedSet::elem_to_iter_` members.
While this is the correct behavior for `elements_`, the copy of
`elem_to_iter_` would contain references to the original's `elements_`,
rather than to its own.

While `elem_to_iter_` is used in both `OrderedSet::push_back` and
`OrderedSet::erase`, the implementation of `OrderedSet::push_back`
only depends on the keys used in `elem_to_iter_`, and does not depend
on the values stored.  As a result, this bug could go undetected for
append-only usage, which is the most frequent use of `OrderedSet`.

This commit updates `support::OrderedSet` to have an explicit copy
constructor and copy assignment.  Only the `std::list<T> elements_`
member may be copied, while the `elem_to_iter_` must instead be
rebuilt.
---
 src/support/ordered_set.h | 31 +++
 1 file changed, 27 insertions(+), 4 deletions(-)
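
To make the hazard concrete, here is a rough Python analogue (an illustrative
sketch only, not the TVM implementation): any container that stores both its
elements and a lookup table of references into its own storage must rebuild
that table on copy, which is exactly what `InitElementToIter` does in the
diff below.

    class OrderedSetSketch:
        """Toy analogue of support::OrderedSet, for illustration only."""

        def __init__(self, items=()):
            self._cells = []     # plays the role of std::list<T> elements_
            self._lookup = {}    # plays the role of elem_to_iter_
            for item in items:
                self.push_back(item)

        def push_back(self, item):
            if item not in self._lookup:
                cell = [item]            # stand-in for a list node/iterator
                self._cells.append(cell)
                self._lookup[item] = cell

        def erase(self, item):
            cell = self._lookup.pop(item)
            self._cells.remove(cell)     # must unlink *our own* cell

        def __copy__(self):
            # Copy the elements, then rebuild the lookup table so it points
            # at the copy's own cells rather than at the original's.
            return OrderedSetSketch(item for (item,) in self._cells)

A copy that shared `_lookup` (or the cells it references) would let `erase`
on the copy manipulate storage owned by the original, which is the analogue
of the bug fixed here.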

diff --git a/src/support/ordered_set.h b/src/support/ordered_set.h
index 741f0b18e6..11acb8c3fe 100644
--- a/src/support/ordered_set.h
+++ b/src/support/ordered_set.h
@@ -54,11 +54,28 @@ class OrderedSet {
  public:
   OrderedSet() = default;
 
+  /* \brief Explicit copy constructor
+   *
+   * The default copy constructor would copy both `elements_` and
+   * `elem_to_iter_`.  While this is the correct behavior for
+   * `elements_`, the copy of `elem_to_iter_` would contain references
+   * to the original's `elements_`, rather than to its own
+   */
+  OrderedSet(const OrderedSet& other) : elements_(other.elements_) { 
InitElementToIter(); }
+
+  /* \brief Explicit copy assignment
+   *
+   * Implemented in terms of the copy constructor, and the default
+   * move assignment.
+   */
+  OrderedSet& operator=(const OrderedSet& other) { return *this = 
OrderedSet(other); }
+
+  OrderedSet(OrderedSet&&) = default;
+  OrderedSet& operator=(OrderedSet&&) = default;
+
  template <typename Iter>
-  OrderedSet(Iter begin, Iter end) {
-for (auto it = begin; it != end; it++) {
-  push_back(*it);
-}
+  OrderedSet(Iter begin, Iter end) : elements_(begin, end) {
+InitElementToIter();
   }
 
   void push_back(const T& t) {
@@ -90,6 +107,12 @@ class OrderedSet {
   auto empty() const { return elements_.empty(); }
 
  private:
+  void InitElementToIter() {
+for (auto it = elements_.begin(); it != elements_.end(); it++) {
+  elem_to_iter_[*it] = it;
+}
+  }
+
  std::list<T> elements_;
  typename detail::OrderedSetLookupType<T>::MapType elem_to_iter_;
 };



(tvm) branch main updated: [Runtime][ROCm] Enable ROCm host memory support (#17037)

2024-05-30 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 08b32a7976 [Runtime][ROCm] Enable ROCm host memory support (#17037)
08b32a7976 is described below

commit 08b32a797642515b0b263ead292af6962fea0cf4
Author: Ruihang Lai 
AuthorDate: Thu May 30 07:28:26 2024 -0400

[Runtime][ROCm] Enable ROCm host memory support (#17037)

This PR enables ROCMHost memory support in the ROCm device API.
---
 src/runtime/ndarray.cc  |  3 ++-
 src/runtime/rocm/rocm_device_api.cc | 40 -
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index c2efa79c0c..c2cf5f388a 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -316,7 +316,8 @@ void NDArray::CopyFromTo(const DLTensor* from, DLTensor* 
to, TVMStreamHandle str
 
   ICHECK(from->device.device_type == to->device.device_type || 
from->device.device_type == kDLCPU ||
  to->device.device_type == kDLCPU || from->device.device_type == 
kDLCUDAHost ||
- to->device.device_type == kDLCUDAHost)
+ to->device.device_type == kDLCUDAHost || from->device.device_type == 
kDLROCMHost ||
+ to->device.device_type == kDLROCMHost)
   << "Can not copy across different device types directly. From device 
type: "
   << from->device.device_type << " to device type: " << 
to->device.device_type;
 
diff --git a/src/runtime/rocm/rocm_device_api.cc 
b/src/runtime/rocm/rocm_device_api.cc
index f3cc46f927..e2a5048ca0 100644
--- a/src/runtime/rocm/rocm_device_api.cc
+++ b/src/runtime/rocm/rocm_device_api.cc
@@ -144,16 +144,26 @@ class ROCMDeviceAPI final : public DeviceAPI {
 *rv = value;
   }
   void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType 
type_hint) final {
-ROCM_CALL(hipSetDevice(dev.device_id));
 ICHECK_EQ(256 % alignment, 0U) << "ROCM space is aligned at 256 bytes";
 void* ret;
-ROCM_CALL(hipMalloc(&ret, nbytes));
+if (dev.device_type == kDLROCMHost) {
+  VLOG(1) << "allocating " << nbytes << "bytes on host";
+  ROCM_CALL(hipHostMalloc(&ret, nbytes));
+} else {
+  ROCM_CALL(hipSetDevice(dev.device_id));
+  VLOG(1) << "allocating " << nbytes << " bytes on device";
+  ROCM_CALL(hipMalloc(&ret, nbytes));
+}
 return ret;
   }
 
   void FreeDataSpace(Device dev, void* ptr) final {
-ROCM_CALL(hipSetDevice(dev.device_id));
-ROCM_CALL(hipFree(ptr));
+if (dev.device_type == kDLROCMHost) {
+  ROCM_CALL(hipHostFree(ptr));
+} else {
+  ROCM_CALL(hipSetDevice(dev.device_id));
+  ROCM_CALL(hipFree(ptr));
+}
   }
 
   void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t 
to_offset, size_t size,
@@ -162,6 +172,21 @@ class ROCMDeviceAPI final : public DeviceAPI {
 hipStream_t hip_stream = static_cast<hipStream_t>(stream);
 from = static_cast(from) + from_offset;
 to = static_cast(to) + to_offset;
+
+if (dev_from.device_type == kDLROCMHost) {
+  dev_from.device_type = kDLCPU;
+}
+
+if (dev_to.device_type == kDLROCMHost) {
+  dev_to.device_type = kDLCPU;
+}
+
+// In case there is a copy from host mem to host mem */
+if (dev_to.device_type == kDLCPU && dev_from.device_type == kDLCPU) {
+  memcpy(to, from, size);
+  return;
+}
+
 if (dev_from.device_type == kDLROCM && dev_to.device_type == kDLROCM) {
   ROCM_CALL(hipSetDevice(dev_from.device_id));
   if (dev_from.device_id == dev_to.device_id) {
@@ -210,7 +235,7 @@ class ROCMDeviceAPI final : public DeviceAPI {
  private:
   static void GPUCopy(const void* from, void* to, size_t size, hipMemcpyKind 
kind,
   hipStream_t stream) {
-if (stream != 0) {
+if (stream != nullptr) {
   ROCM_CALL(hipMemcpyAsync(to, from, size, kind, stream));
 } else {
   ROCM_CALL(hipMemcpy(to, from, size, kind));
@@ -229,6 +254,11 @@ TVM_REGISTER_GLOBAL("device_api.rocm").set_body([](TVMArgs 
args, TVMRetValue* rv
  *rv = static_cast<void*>(ptr);
 });
 
+TVM_REGISTER_GLOBAL("device_api.rocm_host").set_body([](TVMArgs args, 
TVMRetValue* rv) {
+  DeviceAPI* ptr = ROCMDeviceAPI::Global();
+  *rv = static_cast<void*>(ptr);
+});
+
 class ROCMTimerNode : public TimerNode {
  public:
   virtual void Start() {



(tvm) branch main updated: [Runtime] Use preferred host memory (pinned memory) in KV cache (#17036)

2024-05-29 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 71f7af7985 [Runtime] Use preferred host memory (pinned memory) in KV 
cache (#17036)
71f7af7985 is described below

commit 71f7af7985e2c883494a9aa80e0f5d12c154a990
Author: Ruihang Lai 
AuthorDate: Wed May 29 17:14:17 2024 -0400

[Runtime] Use preferred host memory (pinned memory) in KV cache (#17036)

This PR updates the PagedKVCache with the pinned memory support,
which can reduce the copy overhead between CPU and GPU.

This PR also bumps FlashInfer version, which now supports
* specifying kernels to build via cmake,
* pinned memory as host memory.

We also update CMakeLists.txt and config.cmake to include the
FlashInfer compile options. Prior to this PR, the set of kernels being
built was hardcoded in FlashInfer header files.
---
 3rdparty/flashinfer|   2 +-
 CMakeLists.txt |   6 +-
 cmake/config.cmake |  13 ++
 include/tvm/runtime/ndarray.h  |  17 +++
 src/runtime/relax_vm/paged_kv_cache.cc | 265 +
 5 files changed, 205 insertions(+), 98 deletions(-)
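
The staging rule added by `GetPreferredHostDevice` (visible further down in
the diff) is small enough to restate. The sketch below is a Python rendering
of that decision table, purely for illustration; the function name is made up
and the real helper lives in include/tvm/runtime/ndarray.h.

    def preferred_host_device(device_type: str) -> str:
        """Pick the host side used for staging buffers (illustrative)."""
        if device_type == "cuda":
            return "cuda_host"   # pinned memory: cheaper host<->device copies
        if device_type == "rocm":
            return "rocm_host"   # pinned memory: cheaper host<->device copies
        return "cpu"             # fallback: ordinary pageable host memory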

diff --git a/3rdparty/flashinfer b/3rdparty/flashinfer
index f978e02565..7e9cc7ff42 16
--- a/3rdparty/flashinfer
+++ b/3rdparty/flashinfer
@@ -1 +1 @@
-Subproject commit f978e02565d7157d57803eb4153369e046fc4106
+Subproject commit 7e9cc7ff42ca283c317061a877305d09a395fad2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 683ce819db..7575d6c2b4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -960,13 +960,13 @@ option(USE_FLASHINFER "Build TVM with FlashInfer" OFF)
 if (USE_FLASHINFER STREQUAL "ON")
   message(STATUS "Build with FlashInfer")
   set(FLASHINFER_TVM_BINDING ON)
-  set(FLASHINFER_TVM_HOME ${PROJECT_SOURCE_DIR})
-  set(FLASHINFER_ENABLE_FP8 OFF)
-  set(FLASHINFER_ENABLE_BF16 OFF)
+  set(FLASHINFER_TVM_SOURCE_DIR ${PROJECT_SOURCE_DIR})
   set(FLASHINFER_PREFILL OFF)
   set(FLASHINFER_DECODE OFF)
   set(FLASHINFER_PAGE OFF)
   set(FLASHINFER_CASCADE OFF)
+  set(FLASHINFER_SAMPLING OFF)
+  set(FLASHINFER_NORM OFF)
   add_subdirectory(3rdparty/flashinfer)
 else ()
   message(STATUS "Build without FlashInfer")
diff --git a/cmake/config.cmake b/cmake/config.cmake
index ccb449fe2b..5847acc298 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -444,6 +444,19 @@ set(USE_GTEST AUTO)
 # Need to have USE_CUDA=ON
 set(USE_CUTLASS OFF)
 
+# Whether to enable FlashInfer or not.
+set(USE_FLASHINFER OFF)
+# Options for FlashInfer kernel compilation.
+set(FLASHINFER_ENABLE_FP8 OFF)
+set(FLASHINFER_ENABLE_BF16 OFF)
+set(FLASHINFER_GEN_GROUP_SIZES 1 4 6 8)
+set(FLASHINFER_GEN_PAGE_SIZES 16)
+set(FLASHINFER_GEN_HEAD_DIMS 128)
+set(FLASHINFER_GEN_KV_LAYOUTS 0 1)
+set(FLASHINFER_GEN_POS_ENCODING_MODES 0 1)
+set(FLASHINFER_GEN_ALLOW_FP16_QK_REDUCTIONS "false")
+set(FLASHINFER_GEN_CASUALS "false" "true")
+
 # Enable to show a summary of TVM options
 set(SUMMARIZE OFF)
 
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index 5bdc883649..3eb225fccf 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -534,6 +534,23 @@ inline bool NDArray::Load(dmlc::Stream* strm) {
   return true;
 }
 
+/*!
+ * \brief Get the preferred host device from the input device.
+ * - For CUDA and ROCm, CUDAHost and ROCMHost will be returned for pinned 
memory,
+ * since pinned memory reduces copy overhead.
+ * - For other devices, CPU is returned as a fallback.
+ */
+inline Device GetPreferredHostDevice(Device device) {
+  if (device.device_type == DLDeviceType::kDLCUDA) {
+return Device{DLDeviceType::kDLCUDAHost, 0};
+  } else if (device.device_type == DLDeviceType::kDLROCM) {
+return Device{DLDeviceType::kDLROCMHost, 0};
+  } else {
+// Fallback to CPU.
+return Device{DLDeviceType::kDLCPU, 0};
+  }
+}
+
 }  // namespace runtime
 }  // namespace tvm
 
diff --git a/src/runtime/relax_vm/paged_kv_cache.cc 
b/src/runtime/relax_vm/paged_kv_cache.cc
index a5d2d9f415..62750d6d7d 100644
--- a/src/runtime/relax_vm/paged_kv_cache.cc
+++ b/src/runtime/relax_vm/paged_kv_cache.cc
@@ -194,6 +194,56 @@ enum class RoPEMode : int {
   kInline = 2,
 };
 
+/*!
+ * \brief The class of host memory int32 vector in "std::vector" interface.
+ * This vector allocates static memory on the specified host memory
+ * at the time of construction.
+ */
+class HostMemoryVector {
+ public:
+  HostMemoryVector() = default;
+  HostMemoryVector(const HostMemoryVector&) = delete;
+  HostMemoryVector(HostMemoryVector&& other) = default;
+  HostMemoryVector& operator=(const HostMemoryVector&) = delete;
+  HostMemoryVector& operator=(HostMemoryVector&am

(tvm) branch main updated: [TOPI] Fix SME conv2d schedule import and intrin argument (#17040)

2024-05-29 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 8bdd54b2fd [TOPI] Fix SME conv2d schedule import and intrin argument 
(#17040)
8bdd54b2fd is described below

commit 8bdd54b2fd652f064dc7b0f56a89688fb555bf1e
Author: Luke Hutton 
AuthorDate: Wed May 29 16:44:46 2024 +0100

[TOPI] Fix SME conv2d schedule import and intrin argument (#17040)

Fixes a merge conflict between #16981 and #17003.

Change-Id: Ifcc983ef0b8c00250568a048fd682933adfdcde4
---
 python/tvm/topi/arm_cpu/conv2d.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/conv2d.py 
b/python/tvm/topi/arm_cpu/conv2d.py
index 58c909301e..d0fe251e7e 100644
--- a/python/tvm/topi/arm_cpu/conv2d.py
+++ b/python/tvm/topi/arm_cpu/conv2d.py
@@ -729,7 +729,7 @@ def schedule_conv2d_NHWC_hybrid_TIR(sch: tvm.tir.Schedule):
 # pylint: disable=import-outside-toplevel
 from tvm.topi.arm_cpu.pstate_attributes import SMEAttributes
 from tvm.tir.tensor_intrin.arm_cpu import (
-ARM_SME_2SVLx2SVL_TRANSPOSE_INTERLEAVE,
+ARM_SME_2SVLx2SVL_FP32_TRANSPOSE_INTERLEAVE,
 ARM_SME_2SVLx2SVL_GEMM_INTERLEAVED_MOPA,
 ARM_SME_INIT,
 get_sme_gemm_interleaved_mopa_2svlx2svl_intrin,
@@ -743,7 +743,7 @@ def schedule_conv2d_NHWC_hybrid_TIR(sch: tvm.tir.Schedule):
 ko, ki = sch.split(k, factors=(None, tile_K), disable_predication=True)
 sch.parallel(b)
 sch.reorder(b, ko, mo, ki, mi)
-sch.tensorize(ki, ARM_SME_2SVLx2SVL_TRANSPOSE_INTERLEAVE)
+sch.tensorize(ki, ARM_SME_2SVLx2SVL_FP32_TRANSPOSE_INTERLEAVE)
 
 # Split and reorder the loops of the GeMM for tensorization
 b, m, n, k = sch.get_loops(gemm_block)
@@ -760,7 +760,7 @@ def schedule_conv2d_NHWC_hybrid_TIR(sch: tvm.tir.Schedule):
 sme_gemm_interleaved_intrin_name = 
ARM_SME_2SVLx2SVL_GEMM_INTERLEAVED_MOPA + f"_{K_padded}"
 tvm.tir.TensorIntrin.register(
 sme_gemm_interleaved_intrin_name,
-*get_sme_gemm_interleaved_mopa_2svlx2svl_intrin(K_padded),
+*get_sme_gemm_interleaved_mopa_2svlx2svl_intrin(K_padded, dtype),
 override=True,
 )
 sch.tensorize(mi, sme_gemm_interleaved_intrin_name)



(tvm) branch main updated: [Relax][Bugfix] Apply FuseOps to nested DataflowBlock (#17033)

2024-05-29 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new d9240e4814 [Relax][Bugfix] Apply FuseOps to nested DataflowBlock 
(#17033)
d9240e4814 is described below

commit d9240e4814b33993d8720a488abfd2571131908f
Author: Eric Lunderberg 
AuthorDate: Wed May 29 06:43:41 2024 -0500

[Relax][Bugfix] Apply FuseOps to nested DataflowBlock (#17033)

While it is ill-formed for control-flow to occur within a
`DataflowBlock`, it is legal for a `DataflowBlock` to be contained
within a control-flow.  Prior to this commit, the `FuseOps` and
`FuseOpsByPattern` transforms erroneously skipped `DataflowBlock`
instances that were contained within a `relax::If` node.

This commit updates `FuseOps` to apply operator fusion to any dataflow
block, regardless of whether it is found at the top level of a a Relax
function.

Co-authored-by: Chris Sullivan 
---
 src/relax/transform/fuse_ops.cc|  39 +++-
 .../relax/test_transform_fuse_ops_by_pattern.py| 101 +
 2 files changed, 115 insertions(+), 25 deletions(-)
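
A minimal sketch of exercising the fix is below; `Module` stands for any
IRModule whose `relax::If` branches contain dataflow blocks (an assumption
for illustration), and the "cutlass" pattern prefix is only an example.

    from tvm import relax
    from tvm.relax.backend import get_patterns_with_prefix

    # Operator fusion now also reaches dataflow blocks nested inside If nodes.
    fused = relax.transform.FuseOps()(Module)

    # The same fix applies to pattern-based fusion.
    patterns = get_patterns_with_prefix("cutlass")
    fused_by_pattern = relax.transform.FuseOpsByPattern(patterns)(Module)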

diff --git a/src/relax/transform/fuse_ops.cc b/src/relax/transform/fuse_ops.cc
index e89c5e4445..c4bd52eff1 100644
--- a/src/relax/transform/fuse_ops.cc
+++ b/src/relax/transform/fuse_ops.cc
@@ -108,9 +108,16 @@ class GraphCreator : public ExprVisitor {
   static IndexedForwardGraph Create(IRModule mod, support::Arena* arena) {
 GraphCreator creator(mod, arena);
 for (const auto& it : mod->functions) {
-  // Only visit Relax function without attr kPrimitive.
+  // Only visit Relax functions with neither attr::kPrimitive nor
+  // attr::kCodegen.  Relax functions with `attr::kPrimitive` are
+  // previously fused functions, potentially from a previous use
+  // of `FuseOps` or `FuseOpsByPattern`.  Relax functions with
+  // `attr::kCodegen` are previously fused functions from
+  // `FuseOpsByPattern`, when the `annotate_codegen` option is
+  // true.
   const auto* func = it.second.as();
-  if (func == nullptr || func->HasNonzeroAttr(attr::kPrimitive)) {
+  if (func == nullptr || func->HasNonzeroAttr(attr::kPrimitive) ||
+  func->GetAttr(attr::kCodegen).defined()) {
 continue;
   }
   creator(GetRef(func));
@@ -142,13 +149,6 @@ class GraphCreator : public ExprVisitor {
 ExprVisitor::VisitExpr_(func);
   }
 
-  void VisitBindingBlock(const BindingBlock& block) final {
-if (const auto* df_block = block.as<DataflowBlockNode>()) {
-  VisitBindingBlock_(df_block);
-}
-// We skip ordinary binding blocks since they might be impure (with side 
effect or control flow)
-  }
-
   void VisitBinding_(const MatchCastNode* binding) final {
 IndexedForwardGraph::Node* node = CreateNode(binding->var.get());
 SetNodePattern(node, OpPatternKind::kOpaque);
@@ -262,16 +262,11 @@ class GraphCreator : public ExprVisitor {
 IndexedForwardGraph::Node* leaf_node = nullptr;
 if (it != graph_.node_map.end()) {
   leaf_node = it->second;
-} else if (leaf_expr->IsInstance() || 
leaf_expr->IsInstance() ||
-   leaf_expr->IsInstance() || 
leaf_expr->IsInstance() ||
-   leaf_expr->IsInstance()) {
+} else {
   leaf_node = CreateNode(leaf_expr.get());
   // Since we never fuse constants, the pattern of the constant is set to 
`kOpaque`.
   SetNodePattern(leaf_node, OpPatternKind::kOpaque);
   AddToPostDFSOrder(leaf_node, leaf_expr.get());
-} else {
-  LOG(FATAL) << "The leaf Expr is supposed to be defined before, but got: 
" << leaf_expr
- << " used before definition.";
 }
 AddEdge(leaf_node, binding_var_node, pattern);
   }
@@ -701,8 +696,10 @@ class OperatorFusor : public ExprMutator {
 }
 for (const auto& gv : entry_functions) {
   const auto& func = mod_->Lookup(gv);
-  // Only visit Relax function without attr kPrimitive.
-  if (func->IsInstance() && 
!func->HasNonzeroAttr(attr::kPrimitive)) {
+  // Only visit Relax functions with neither attr::kPrimitive nor
+  // attr::kCodegen.
+  if (func->IsInstance() && 
!func->HasNonzeroAttr(attr::kPrimitive) &&
+  !func->GetAttr(attr::kCodegen).defined()) {
 auto updated_func = Downcast(VisitExpr(func));
 builder_->UpdateFunction(gv, updated_func);
   }
@@ -739,14 +736,6 @@ class OperatorFusor : public ExprMutator {
 return false;
   }
 
-  BindingBlock VisitBindingBlock(const BindingBlock& block) final {
-if (const auto* df_block = block.as<DataflowBlockNode>()) {
-  return VisitBindingBlock_(df_block);
-}
-// We skip ordinary binding blocks since they might be impur

(tvm) branch main updated: [Relax][Bugfix] Annotate ComputePrimValue output as host function (#17032)

2024-05-28 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new c9d87ef54f [Relax][Bugfix] Annotate ComputePrimValue output as host 
function (#17032)
c9d87ef54f is described below

commit c9d87ef54fbba29b16a0a8420fb61c669808a256
Author: Eric Lunderberg 
AuthorDate: Tue May 28 19:49:20 2024 -0500

[Relax][Bugfix] Annotate ComputePrimValue output as host function (#17032)

The `ComputePrimValue` transform is used to compute the value of
symbolic expressions that may appear within a Relax function.  For
example, to compute a boolean condition used for a `relax::If` node.
These functions are used for small host-side computations, prior to
launching a device kernel.

This commit updates `ComputePrimValue` to annotate the generated
`PrimFunc` with `tir::attr::kIsHostFunc`.  This annotation is required
for correct behavior in `tvm.dlight.ApplyDefaultSchedule`, to avoid
erroneous scheduling of this function for the GPU, and for
`tir::transform::BindTarget`, to ensure that the function is compiled
for execution on the host.

Co-authored-by: Chris Sullivan 
---
 src/relax/transform/compute_prim_value.cc   | 3 ++-
 tests/python/relax/test_transform_compute_prim_value.py | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)
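
The practical effect of the annotation is that GPU scheduling passes leave
the generated helper alone. A small sketch of the build-side flow is below;
the pass names appear elsewhere in this archive, while the `Module`
placeholder and the CUDA target are assumptions for illustration.

    import tvm
    from tvm import dlight as dl, relax

    with tvm.target.Target("cuda"):
        mod = relax.transform.LegalizeOps()(Module)
        # Functions tagged "tir.is_host_func" are skipped by the GPU rules.
        mod = dl.ApplyDefaultSchedule(dl.gpu.Fallback())(mod)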

diff --git a/src/relax/transform/compute_prim_value.cc 
b/src/relax/transform/compute_prim_value.cc
index 9fe2a3a06f..716550ba04 100644
--- a/src/relax/transform/compute_prim_value.cc
+++ b/src/relax/transform/compute_prim_value.cc
@@ -45,7 +45,8 @@ class PrimValueComputeInjector : public ExprMutator {
 auto param_vars = tir::UndefinedVars(node->value);
 tir::Stmt body = tir::Evaluate(tir::Call(ret_dtype, tir::builtin::ret(), 
{node->value}));
 
-tir::PrimFunc func(param_vars, body, PrimType(ret_dtype));
+tir::PrimFunc func(param_vars, body, PrimType(ret_dtype), {},
+   DictAttrs({{tir::attr::kIsHostFunc, Bool(true)}}));
 func = tir::RenewDefs(func);
 
 auto callee = builder_->AddFunction(func, "compute_symbolic_expr");
diff --git a/tests/python/relax/test_transform_compute_prim_value.py 
b/tests/python/relax/test_transform_compute_prim_value.py
index 9fee35414d..5d9caf2d36 100644
--- a/tests/python/relax/test_transform_compute_prim_value.py
+++ b/tests/python/relax/test_transform_compute_prim_value.py
@@ -44,6 +44,7 @@ class TestPrimValueInAssertCondition(BaseCompare):
 
 @T.prim_func(private=True)
 def compute_symbolic_expr(N: T.int64) -> T.bool:
+T.func_attr({"tir.is_host_func": True})
 T.ret(N % 16 == 0)
 
 
@@ -73,6 +74,7 @@ class TestPrimValueInBranchCondition(BaseCompare):
 
 @T.prim_func(private=True)
 def compute_symbolic_expr(N: T.int64) -> T.bool:
+T.func_attr({"tir.is_host_func": True})
 T.ret(N % 16 == 0)
 
 
@@ -97,6 +99,7 @@ class TestPrimValueInPureFunction(BaseCompare):
 
 @T.prim_func(private=True)
 def compute_symbolic_expr(N: T.int64, M: T.int64) -> T.int64:
+T.func_attr({"tir.is_host_func": True})
 T.ret(N * M)
 
 



(tvm) 01/01: Revert "[SME][TOPI] Add conv2d NHWC SME fp32 schedule (#17003)"

2024-05-28 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch revert-17003-sme-conv2d-fp32
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit b71a9a3827d81ac17da5f5bc608583f1a02bd0d8
Author: Tianqi Chen 
AuthorDate: Tue May 28 19:56:33 2024 -0400

Revert "[SME][TOPI] Add conv2d NHWC SME fp32 schedule (#17003)"

This reverts commit cab54e0dee82f84d94cd65f8fe0432ee1c2f2e22.
---
 python/tvm/relay/op/strategy/arm_cpu.py|  15 --
 python/tvm/testing/utils.py|   7 -
 python/tvm/topi/arm_cpu/arm_utils.py   |  18 +-
 python/tvm/topi/arm_cpu/conv2d.py  | 238 +
 python/tvm/topi/arm_cpu/conv2d_gemm.py |  12 +-
 python/tvm/topi/nn/conv2d.py   |   6 +-
 src/arith/scalable_expression.cc   |   7 +
 tests/python/arith/test_arith_simplify.py  |  10 +
 .../python/codegen/test_target_codegen_aarch64.py  |  69 +-
 tests/python/relay/strategy/arm_cpu/test_conv2d.py | 138 +---
 .../relay/strategy/test_select_implementation.py   |   8 -
 tests/python/topi/test_topi_conv2d_nhwc.py |  52 +
 12 files changed, 45 insertions(+), 535 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py 
b/python/tvm/relay/op/strategy/arm_cpu.py
index 12f19462f7..5e94b38772 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -253,18 +253,6 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, 
target):
 )
 # Non-quantized cases
 if is_aarch64 and data.dtype in ["float32", "float16"]:
-if (
-target.features.has_sme
-and data.dtype in ["float32"]
-and kernel.dtype in ["float32"]
-and out_type.dtype in ["float32"]
-):
-strategy.add_implementation(
-
wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_hybrid_SME),
-lambda: None,
-name="conv2d_NHWC_hybrid_SME.arm_cpu",
-plevel=12,
-)
 if target.features.has_sve:
 # This strategy is currently suboptimal because of 
LLVM's limited support
 # for scalable vector alias analysis, which causes 
redundant loads / stores
@@ -818,9 +806,6 @@ def arm_cpu_tir_strategy(sch: tir.Schedule) -> bool:
 if matmul_block and sch.get(matmul_block).annotations.get("schedule_type", 
"") == "sme":
 topi.arm_cpu.matmul.tir_schedule_matmul_sme(sch)
 return True
-elif has_block(sch, "conv2d_gemm_output"):
-topi.arm_cpu.schedule_conv2d_NHWC_hybrid_TIR(sch)
-return True
 
 # Fallback to TE schedule for operators we have not written a special TIR 
schedule for
 return False
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index a208459dd8..84b631cf38 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -1071,13 +1071,6 @@ requires_aarch64_sve = Feature(
 )
 
 
-requires_aarch64_sme = Feature(
-"arm_sme",
-"AArch64 SME",
-run_time_check=lambda: _has_cpu_feat("sme"),
-)
-
-
 requires_x86_vnni = Feature(
 "x86_vnni",
 "x86 VNNI Extensions",
diff --git a/python/tvm/topi/arm_cpu/arm_utils.py 
b/python/tvm/topi/arm_cpu/arm_utils.py
index 5c4b3c0456..f2e01c5aef 100644
--- a/python/tvm/topi/arm_cpu/arm_utils.py
+++ b/python/tvm/topi/arm_cpu/arm_utils.py
@@ -22,7 +22,7 @@ from tvm.target import Target
 from tvm.tir.expr import PrimExpr
 
 
-def get_tiling_A(interleave_A, in_dtype, use_sme=False):
+def get_tiling_A(interleave_A, in_dtype):
 """Compute the tiling information for matrix A in C=A*B,
 which corresponds to the im2col-transformed input matrix.
 
@@ -42,8 +42,6 @@ def get_tiling_A(interleave_A, in_dtype, use_sme=False):
 determines if A is expected to be interleaved
 in_dtype : str
 input datatype
-use_sme : bool
-determines if SME operations on scalable vectors are expected
 
 Returns
 --
@@ -67,11 +65,8 @@ def get_tiling_A(interleave_A, in_dtype, use_sme=False):
 # tile size should be 4x16
 tile_M = 4
 tile_K = 16
-elif use_sme:
-tile_M = 2 * 4 * tvm.tir.vscale()
-tile_K = 2 * 4 * tvm.tir.vscale()
 else:
-# In non-SME, non-quantized cases, A is not interleaved.
+# In non-quantized cases, A is not interleaved.
 # We are loading 4 rows from A.
 # Each row will contain 4 elements, along the 

(tvm) branch revert-17003-sme-conv2d-fp32 created (now b71a9a3827)

2024-05-28 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch revert-17003-sme-conv2d-fp32
in repository https://gitbox.apache.org/repos/asf/tvm.git


  at b71a9a3827 Revert "[SME][TOPI] Add conv2d NHWC SME fp32 schedule 
(#17003)"

This branch includes the following new commits:

 new b71a9a3827 Revert "[SME][TOPI] Add conv2d NHWC SME fp32 schedule 
(#17003)"

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.




(tvm) branch main updated: [Relax][Bugfix] Bind symbolic variables in R.match_cast (#17034)

2024-05-28 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new b2c61162f0 [Relax][Bugfix] Bind symbolic variables in R.match_cast 
(#17034)
b2c61162f0 is described below

commit b2c61162f006504b192493e9ceeac9b89a87da65
Author: Eric Lunderberg 
AuthorDate: Tue May 28 18:52:01 2024 -0500

[Relax][Bugfix] Bind symbolic variables in R.match_cast (#17034)

Prior to this commit, variable replacement by `BindSymbolicVars` would
fail to replace variables that occur within a `relax::MatchCast` node.
This pattern is rare, because the `bind_symbolic_vars` method can only
replace variables that are exposed as part of the function signature,
and most uses of `relax::MatchCast` act as a definition for symbolic
variables that are not exposed through the function signature.  This
pattern is well-formed, though, since the `relax::MatchCast` node can
also act as a user of previously-defined symbolic variables.

The root cause for this bug was in the `ExprMutator` visitor for
`relax::MatchCast`, which did not visit the struct info field.  As a
result, the virtual `ExprMutator::VisitPrimExpr` function was not
called for expressions that occur within the `StructInfo` of a
`relax::MatchCast`.  This commit updates `ExprMutator` to resolve this
bug, and applies an analogous fix for `ExprVisitor`.

Co-authored-by: Chris Sullivan 
---
 src/relax/ir/expr_functor.cc  | 22 --
 tests/python/relax/test_bind_symbolic_vars.py | 22 ++
 2 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/src/relax/ir/expr_functor.cc b/src/relax/ir/expr_functor.cc
index dbfaf60fec..63c74db7e3 100644
--- a/src/relax/ir/expr_functor.cc
+++ b/src/relax/ir/expr_functor.cc
@@ -257,6 +257,7 @@ RELAX_EXPR_VISITOR_VISIT_BINDING_IMPL(DataTypeImmNode);
 
 void ExprVisitor::VisitBinding_(const MatchCastNode* binding) {
   this->VisitExpr(binding->value);
+  this->VisitExprDepStructInfoField(binding->struct_info);
   this->VisitVarDef(binding->var);
 }
 
@@ -690,16 +691,25 @@ void ExprMutator::ReEmitBinding(const VarBindingNode* 
binding, Expr new_value) {
 }
 
 void ExprMutator::VisitBinding_(const MatchCastNode* binding) {
-  Var new_var = this->VisitVarDef(binding->var);
   Expr new_value = this->VisitExpr(binding->value);
+  StructInfo new_struct_info = 
this->VisitExprDepStructInfoField(binding->struct_info);
 
-  // re-emit old binding if nothing changes
-  if (new_var.same_as(binding->var) && new_value.same_as(binding->value)) {
+  Var new_var = this->VisitVarDef(binding->var);
+
+  if (new_var.same_as(binding->var) && new_value.same_as(binding->value) &&
+  new_struct_info.same_as(binding->struct_info)) {
+// re-emit old binding if nothing changes
 builder_->EmitNormalized(GetRef(binding));
-  } else {
-new_value = builder_->NormalizeArgument(new_value);
-builder_->EmitNormalized(MatchCast(new_var, new_value, 
binding->struct_info, binding->span));
+return;
   }
+
+  new_value = builder_->NormalizeArgument(new_value);
+  new_var = WithStructInfo(new_var, new_struct_info);
+
+  var_remap_[binding->var->vid] = new_var;
+  var_remap_[new_var->vid] = new_var;
+
+  builder_->EmitNormalized(MatchCast(new_var, new_value, new_struct_info, 
binding->span));
 }
 
 BindingBlock ExprMutator::VisitBindingBlock_(const BindingBlockNode* block) {
diff --git a/tests/python/relax/test_bind_symbolic_vars.py 
b/tests/python/relax/test_bind_symbolic_vars.py
index 82798c56df..18246d224b 100644
--- a/tests/python/relax/test_bind_symbolic_vars.py
+++ b/tests/python/relax/test_bind_symbolic_vars.py
@@ -286,5 +286,27 @@ def test_bind_strided_slice():
 tvm.ir.assert_structural_equal(expected, after)
 
 
+def test_bind_inside_match_cast():
+"""Symbolic variables may occur within R.match_cast"""
+
+@R.function(private=True)
+def before(A: R.Tensor(["M", "N"]), B: R.Tensor(ndim=2)):
+M = T.int64()
+N = T.int64()
+C = R.match_cast(B, R.Tensor([M, N]))
+D = R.add(A, C)
+return D
+
+@R.function(private=True)
+def expected(A: R.Tensor(["M", 32]), B: R.Tensor(ndim=2)):
+M = T.int64()
+C = R.match_cast(B, R.Tensor([M, 32]))
+D = R.add(A, C)
+return D
+
+after = before.bind_symbolic_vars({"N": 32})
+tvm.ir.assert_structural_equal(expected, after)
+
+
 if __name__ == "__main__":
 tvm.testing.main()



(tvm) branch main updated: [Web] Fix string to uint8 array for special characters (#17031)

2024-05-28 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new d4b096f905 [Web] Fix string to uint8 array for special characters 
(#17031)
d4b096f905 is described below

commit d4b096f905ad32be448c3a188ecf93a14c5734d5
Author: Charlie Ruan <53290280+charliefr...@users.noreply.github.com>
AuthorDate: Tue May 28 10:35:06 2024 -0700

[Web] Fix string to uint8 array for special characters (#17031)
---
 web/src/memory.ts  |  5 +++--
 web/src/support.ts | 11 ++-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/web/src/memory.ts b/web/src/memory.ts
index dbbb449a0b..b0d4ff3bf1 100644
--- a/web/src/memory.ts
+++ b/web/src/memory.ts
@@ -375,8 +375,9 @@ export class CachedCallStack implements Disposable {
* @param data The string content.
*/
   allocThenSetArgString(offset: PtrOffset, data: string): void {
-const strOffset = this.allocRawBytes(data.length + 1);
-this.storeRawBytes(strOffset, StringToUint8Array(data));
+const dataUint8: Uint8Array = StringToUint8Array(data);
+const strOffset = this.allocRawBytes(dataUint8.length);
+this.storeRawBytes(strOffset, dataUint8);
 this.addressToSetTargetValue.push([offset, strOffset]);
   }
   /**
diff --git a/web/src/support.ts b/web/src/support.ts
index 2fa87ed291..be85e85b7b 100644
--- a/web/src/support.ts
+++ b/web/src/support.ts
@@ -35,12 +35,13 @@ export function isPromise(value: any): boolean {
  * @returns The corresponding Uint8Array.
  */
 export function StringToUint8Array(str: string): Uint8Array {
-  const arr = new Uint8Array(str.length + 1);
-  for (let i = 0; i < str.length; ++i) {
-arr[i] = str.charCodeAt(i);
+  const arr: Uint8Array = new TextEncoder().encode(str);
+  const resArr = new Uint8Array(arr.length + 1);
+  for (let i = 0; i < arr.length; ++i) {
+resArr[i] = arr[i];
   }
-  arr[str.length] = 0;
-  return arr;
+  resArr[arr.length] = 0;
+  return resArr;
 }
 
 /**



(tvm) branch main updated: [Contrib] Implement NDArray cache update (#17029)

2024-05-27 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new b598f28a1c [Contrib] Implement NDArray cache update (#17029)
b598f28a1c is described below

commit b598f28a1cecabf95a1986dcc55a864c8c9ab743
Author: Wuwei Lin 
AuthorDate: Mon May 27 06:25:15 2024 -0700

[Contrib] Implement NDArray cache update (#17029)
---
 python/tvm/contrib/tvmjs.py| 76 +++---
 tests/python/relax/test_runtime_builtin.py | 25 ++
 2 files changed, 94 insertions(+), 7 deletions(-)

diff --git a/python/tvm/contrib/tvmjs.py b/python/tvm/contrib/tvmjs.py
index 923301a1f5..2a7604c0ad 100644
--- a/python/tvm/contrib/tvmjs.py
+++ b/python/tvm/contrib/tvmjs.py
@@ -24,7 +24,7 @@ import shutil
 # pylint: disable=unused-import
 import sys
 from types import GeneratorType
-from typing import Iterator, Mapping, Tuple, Union
+from typing import Any, Iterator, Mapping, Optional, Set, Tuple, Union
 
 import numpy as np
 
@@ -73,7 +73,13 @@ def _calculate_md5(filename):
 class NDArrayCacheShardingManager:
 """Internal helper to shard ndarrays."""
 
-def __init__(self, cache_dir: str, prefix: str, shard_cap_nbytes: int):
+def __init__(
+self,
+cache_dir: str,
+prefix: str,
+shard_cap_nbytes: int,
+initial_shard_records: Optional[Mapping[str, Any]] = None,
+):
 self.cache_dir = cache_dir
 self.prefix = prefix
 self.curr_records = []
@@ -81,8 +87,17 @@ class NDArrayCacheShardingManager:
 self.shard_records = []
 self.shard_cap_nbytes = shard_cap_nbytes
 self.counter = 0
+self.name_to_record: Mapping[str, Tuple[int, Mapping[str, Any]]] = {}
+self.updated_shards: Set[int] = set()
 
-def append(self, data, name, shape, dtype, encode_format):
+if initial_shard_records is not None:
+self.shard_records = initial_shard_records
+self.counter = len(initial_shard_records)
+for idx, shard in enumerate(initial_shard_records):
+for rec in shard["records"]:
+self.name_to_record[rec["name"]] = (idx, rec)
+
+def append_or_update(self, data, name, shape, dtype, encode_format, 
allow_update: bool = False):
 """Commit a record to the manager.
 
 Parameters
@@ -101,6 +116,9 @@ class NDArrayCacheShardingManager:
 
 encode_format:
 The encode format of the entry
+
+allow_update: bool
+If the record already exists, update the record. Otherwise, raise 
an error.
 """
 rec = {
 "name": name,
@@ -109,6 +127,13 @@ class NDArrayCacheShardingManager:
 "format": encode_format,
 "nbytes": len(data),
 }
+if name in self.name_to_record:
+if not allow_update:
+raise ValueError(f"Duplicate name {name} found in the cache.")
+self.update_single_record(rec, data)
+return
+
+self.name_to_record[name] = (self.counter, rec)
 
 if self.pending_nbytes + len(data) >= self.shard_cap_nbytes:
 if len(data) * 2 >= self.shard_cap_nbytes:
@@ -121,6 +146,20 @@ class NDArrayCacheShardingManager:
 self.curr_records.append(rec)
 self.curr_data += data
 
+def update_single_record(self, rec, data):
+"""Update a single record in a shard file."""
+name = rec["name"]
+idx, old_rec = self.name_to_record[name]
+if old_rec["nbytes"] != rec["nbytes"]:
+raise ValueError(f"Cannot update record {name}, size mismatch.")
+data_path = self.shard_records[idx]["dataPath"]
+full_path = os.path.join(self.cache_dir, data_path)
+with open(full_path, "r+b") as outfile:
+outfile.seek(old_rec["byteOffset"])
+outfile.write(data)
+self.name_to_record[name] = (idx, rec)
+self.updated_shards.add(idx)
+
 def commit(self):
 """Commit a record"""
 if self.pending_nbytes != 0:
@@ -131,6 +170,9 @@ class NDArrayCacheShardingManager:
 def finish(self):
 """Finish building and return shard records."""
 self.commit()
+for idx in self.updated_shards:
+full_path = os.path.join(self.cache_dir, 
self.shard_records[idx]["dataPath"])
+self.shard_records[idx]["md5sum"] = _calculate_md5(full_path)
 return self.shard_records
 
 def _commit_internal(self, data, records):
@@ -165,6 +207,7 @@ def dump_

(tvm) branch main updated: [TIR] Fix Shuffle rewrite (#17030)

2024-05-27 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 7359313b40 [TIR] Fix Shuffle rewrite (#17030)
7359313b40 is described below

commit 7359313b40dd1927cd27e2c60539575ae08a4dc5
Author: Siyuan Feng 
AuthorDate: Mon May 27 21:25:06 2024 +0800

[TIR] Fix Shuffle rewrite (#17030)

This PR fixes the shuffle rewrite pass to handle the case where the
number of vector lanes is larger than that of the input vector's data type.
---
 src/target/source/codegen_c.cc |  4 +-
 src/tir/transforms/storage_rewrite.cc  |  2 +-
 ...est_tir_transform_pointer_value_type_rewrite.py | 46 --
 3 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc
index 009fc1672a..344d0392d4 100644
--- a/src/target/source/codegen_c.cc
+++ b/src/target/source/codegen_c.cc
@@ -932,7 +932,9 @@ void CodeGenC::VisitExpr_(const ShuffleNode* op, 
std::ostream& os) {  // NOLINT(
   }
   if (op->indices.size() == 1) {
 // This is an extract element
-os << concat_vec[Downcast<IntImm>(op->indices[0])->value];
+int64_t idx = Downcast<IntImm>(op->indices[0])->value;
+ICHECK_LT(idx, concat_vec.size());
+os << concat_vec[idx];
   } else {
 // Print the shuffle as vector constructor
 // vec(e0, e1, e2, .. en)
diff --git a/src/tir/transforms/storage_rewrite.cc 
b/src/tir/transforms/storage_rewrite.cc
index 2ebb767149..1c3f916a44 100644
--- a/src/tir/transforms/storage_rewrite.cc
+++ b/src/tir/transforms/storage_rewrite.cc
@@ -1493,7 +1493,7 @@ class VectorTypeRewriter : public StmtExprMutator {
   arith::ModularSet me = analyzer_.modular_set(last_dim_index);
   ICHECK(me->coeff == 0 || info.factor() % me->coeff == 0);
   PrimExpr new_index = last_dim_index / make_const(last_dim_index.dtype(), 
info.factor());
-  shuffle_index = me->base;
+  shuffle_index = me->base % info.factor();
   indices.Set(indices.size() - 1, new_index);
 }
 
diff --git 
a/tests/python/tir-transform/test_tir_transform_pointer_value_type_rewrite.py 
b/tests/python/tir-transform/test_tir_transform_pointer_value_type_rewrite.py
index 7baa96c1a1..186f6bd02a 100644
--- 
a/tests/python/tir-transform/test_tir_transform_pointer_value_type_rewrite.py
+++ 
b/tests/python/tir-transform/test_tir_transform_pointer_value_type_rewrite.py
@@ -14,10 +14,10 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+# pylint: disable=invalid-name, missing-docstring
+
 import tvm
 import tvm.testing
-from tvm import te
-from tvm.driver.build_module import schedule_to_module
 from tvm.script import tir as T
 
 
@@ -25,7 +25,7 @@ class BaseCompare(tvm.testing.CompareBeforeAfter):
 transform = tvm.tir.transform.PointerValueTypeRewrite()
 
 
-class TestRewriteToShuffle(BaseCompare):
+class TestRewriteToShuffle0(BaseCompare):
 @T.prim_func
 def before(A: T.Buffer((16,), "float32"), B: T.Buffer((4,), "float32")):
 A_local_data = T.allocate([16], "float32", scope="local")
@@ -50,6 +50,42 @@ class TestRewriteToShuffle(BaseCompare):
 )
 
 
+class TestRewriteToShuffle1(BaseCompare):
+@T.prim_func
+def before(A: T.Buffer((8,), "float32"), B: T.Buffer((1,), "float32")):
+A_local_data = T.allocate([8], "float32", scope="local")
+A_local = T.Buffer((8,), "float32", data=A_local_data, scope="local")
+A_local[0:4] = A[0:4]
+A_local[4:8] = A[4:8]
+B[0] = (
+A_local[0]
++ A_local[1]
++ A_local[2]
++ A_local[3]
++ A_local[4]
++ A_local[5]
++ A_local[6]
++ A_local[7]
+)
+
+@T.prim_func
+def expected(A: T.Buffer((2,), "float32x4"), B: T.Buffer((1,), "float32")):
+A_local_data = T.allocate([2], "float32x4", "local")
+A_local = T.Buffer((2,), "float32x4", data=A_local_data, scope="local")
+A_local[0] = A[0]
+A_local[1] = A[1]
+B[0] = (
+T.Shuffle([A_local[0]], [0])
++ T.Shuffle([A_local[0]], [1])
++ T.Shuffle([A_local[0]], [2])
++ T.Shuffle([A_local[0]], [3])
++ T.Shuffle([A_local[1]], [0])
++ T.Shuffle([A_local[1]], [1])
++ T.Shuffle([A_local[1]], [2])
++ T.Shuffle([A_local[1]], [3])
+)
+
+
 class TestAddressOf(BaseCompare):
 @T.prim_func
 def before(A: T.Buffer((16,), "float32"), B: T.Buffer((16,), "float32")):
@@ -71,3 +107,7 @@ class TestScalarReadWithoutWrite(BaseCompare):
 T.evaluate(A[i * 4])
 
 expected = before
+
+
+if __name__ == "__main__":
+tvm.testing.main()



(tvm) branch main updated (4f1e2df409 -> 27a3b90105)

2024-05-26 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 4f1e2df409 [picojson] Let objects be ordered when serializing (#17027)
 add 27a3b90105 [Web] Add dtype and offset for CreateView in runtime 
(#17028)

No new revisions were added by this update.

Summary of changes:
 web/src/runtime.ts | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)



(tvm) branch main updated (f498cef930 -> 4f1e2df409)

2024-05-25 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from f498cef930 [WebGPU] Update error messages to be more user-friendly 
(#17021)
 add 4f1e2df409 [picojson] Let objects be ordered when serializing (#17027)

No new revisions were added by this update.

Summary of changes:
 3rdparty/picojson/picojson.h| 19 +++
 3rdparty/picojson/test_picojson.cpp | 13 +
 2 files changed, 32 insertions(+)



(tvm) branch main updated: [WebGPU] Update error messages to be more user-friendly (#17021)

2024-05-25 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new f498cef930 [WebGPU] Update error messages to be more user-friendly 
(#17021)
f498cef930 is described below

commit f498cef9306d38c3e6ee0ad3de8ea30cf01d1936
Author: Nestor Qin 
AuthorDate: Sat May 25 08:05:30 2024 -0400

[WebGPU] Update error messages to be more user-friendly (#17021)
---
 web/src/webgpu.ts | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts
index 8d699c4c48..10d4aab643 100644
--- a/web/src/webgpu.ts
+++ b/web/src/webgpu.ts
@@ -37,7 +37,12 @@ export async function detectGPUDevice(): 
Promisehttps://webgpureport.org/;
+  );
 }
 const computeMB = (value: number) => {
   return Math.ceil(value / (1 << 20)) + "MB";



(tvm) branch main updated: [DLight] Perf improvement for low_batch_gemv on Metal (#17026)

2024-05-24 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 7f7762d53a [DLight] Perf improvement for low_batch_gemv on Metal 
(#17026)
7f7762d53a is described below

commit 7f7762d53a2cf073e55e88e3cb7550a6a60cba3d
Author: Siyuan Feng 
AuthorDate: Fri May 24 22:37:41 2024 +0800

[DLight] Perf improvement for low_batch_gemv on Metal (#17026)

This PR improves the performance of low_batch_gemv on Metal by changing the
schedule config. The performance improvement is around 2x when the bucket
size is larger than 2.
---
 python/tvm/dlight/gpu/low_batch_gemv.py|  13 ++-
 tests/python/dlight/test_gpu_low_batch_gemv.py | 138 -
 2 files changed, 75 insertions(+), 76 deletions(-)

diff --git a/python/tvm/dlight/gpu/low_batch_gemv.py 
b/python/tvm/dlight/gpu/low_batch_gemv.py
index 696722c3f0..20911f0e7d 100644
--- a/python/tvm/dlight/gpu/low_batch_gemv.py
+++ b/python/tvm/dlight/gpu/low_batch_gemv.py
@@ -500,7 +500,7 @@ class LowBatchGEMV(GPUScheduleRule):
 sch.set_scope(block, 0, "shared")
 _, _, _, *s = sch.get_loops(epilogue)  # pylint: 
disable=invalid-name
 _, tx = sch.split(sch.fuse(*s), factors=[None, TX])
-sch.bind(tx, "threadIdx.x")
+sch.bind(tx, TAG_S)
 else:
 sch.reverse_compute_at(epilogue, bx, 
preserve_unit_loops=True)
 ts_tile_s = sch.fuse(*sch.get_loops(epilogue)[3:])
@@ -538,17 +538,16 @@ class LowBatchGEMV(GPUScheduleRule):
 else:
 TS, TR = 16, 32
 elif target.kind.name == "metal":
-# Note that the following tile size is tuned on M2 Ultra for 7B
-TAG_S, TAG_R = "threadIdx.x", "threadIdx.y"
-VEC_C = 1
+VEC_C = 4
 LOAD_V_SHARED = False
 LOAD_V_VEC = -1
-UNROLL = 256
+UNROLL = 8
 if isinstance(len_S, int):
 if len_S > len_R:
-TS, TR = 2, 32
+TS, TR = 8, 32
 else:
-TS, TR = 2, 64
+TAG_S, TAG_R = "threadIdx.x", "threadIdx.y"
+TS, TR = 8, 32
 elif target.kind.name == "rocm":
 VEC_C = 4
 LOAD_V_SHARED = True
diff --git a/tests/python/dlight/test_gpu_low_batch_gemv.py 
b/tests/python/dlight/test_gpu_low_batch_gemv.py
index 4b63cfddba..6072664b3a 100644
--- a/tests/python/dlight/test_gpu_low_batch_gemv.py
+++ b/tests/python/dlight/test_gpu_low_batch_gemv.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-docstring
-import pytest
 
 import tvm.testing
 from tvm import dlight as dl
@@ -65,82 +64,83 @@ def test_batch_decode_gemv():
 # with T.block("root"):
 dequantize_intermediate_intermediate_local = 
T.alloc_buffer((T.int64(4096), T.int64(28672)), "float16", scope="local")
 NT_matmul_intermediate_pad_local = T.alloc_buffer(((batch_size + 
T.int64(3)) // T.int64(4) * T.int64(4), T.int64(1), T.int64(4096)), "float16", 
scope="local")
-NT_matmul_intermediate_pad_rf_local = T.alloc_buffer((T.int64(64), 
(batch_size + T.int64(3)) // T.int64(4) * T.int64(4), T.int64(1), 
T.int64(4096)), "float16", scope="local")
-NT_matmul_intermediate_pad_rf_local_1 = T.alloc_buffer((T.int64(64), 
(batch_size + T.int64(3)) // T.int64(4) * T.int64(4), T.int64(1), 
T.int64(4096)), "float16", scope="local")
+NT_matmul_intermediate_pad_rf_local = T.alloc_buffer((T.int64(128), 
(batch_size + T.int64(3)) // T.int64(4) * T.int64(4), T.int64(1), 
T.int64(4096)), "float16", scope="local")
+NT_matmul_intermediate_pad_rf_local_1 = T.alloc_buffer((T.int64(32), 
(batch_size + T.int64(3)) // T.int64(4) * T.int64(4), T.int64(1), 
T.int64(4096)), "float16", scope="local")
 for ax0_0 in T.thread_binding((batch_size + T.int64(3)) // T.int64(4), 
thread="blockIdx.y"):
-for u_fused_ax1_fused_fused_0 in T.thread_binding(T.int64(1024), 
thread="blockIdx.x"):
-for u_fused_ax1_fused_fused_1 in T.thread_binding(T.int64(2), 
thread="threadIdx.x"):
-for ax2_fused_u_fused_1_ax2_fused_u_fused_3_fused_0 in 
T.thread_binding(T.int64(64), thread="threadIdx.y"):
+for u_fused_ax1_fused_fused_0 in T.thread_binding(T.int64(256), 
thread="blockIdx.x"):
+for u_fused_ax1_fused_fused_1 in T.thread_binding(T.int64

(tvm) branch main updated: [Relax][UnitTest] Validate IRModule with multiple targets (#16960)

2024-05-24 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new cf2753eafd  [Relax][UnitTest] Validate IRModule with multiple targets 
(#16960)
cf2753eafd is described below

commit cf2753eafd03cecbb6de2b500d5e049c62c54958
Author: Eric Lunderberg 
AuthorDate: Fri May 24 05:55:14 2024 -0500

 [Relax][UnitTest] Validate IRModule with multiple targets (#16960)

[Relax][UnitTest] Validate IRModule with multiple targets

This commit adds a unit test to verify that a single `IRModule` can
contain functions that will be used on multiple distinct targets.
Previously, this test case caused errors when running the
`LegalizeOps` and `ApplyDefaultSchedule` transforms.
---
 tests/python/relax/test_vm_build.py | 59 +
 1 file changed, 59 insertions(+)

diff --git a/tests/python/relax/test_vm_build.py 
b/tests/python/relax/test_vm_build.py
index 180535231d..ab40e181a3 100644
--- a/tests/python/relax/test_vm_build.py
+++ b/tests/python/relax/test_vm_build.py
@@ -1246,5 +1246,64 @@ def test_set_input_get_failure_rpc(exec_mode):
 run_on_rpc(TestVMSetInput, set_input_attempt_get, exec_mode)
 
 
+@tvm.testing.requires_gpu
+def test_relax_module_with_multiple_targets(exec_mode):
+"""Relax functions may contain kernels for multiple targets
+
+In this example, the module contains one function to execute on
+LLVM, and one function to execute on CUDA.
+
+"""
+
+@I.ir_module
+class Module:
+I.module_global_infos({"vdevice": [I.vdevice("llvm")]})
+
+@R.function
+def func_cuda(A: R.Tensor([32, 32], "float32"), B: R.Tensor([32, 32], 
"float32")):
+C = R.add(A, B)
+return C
+
+@R.function
+def func_llvm(
+A: R.Tensor([32, 32], "float32", "llvm"), B: R.Tensor([32, 32], 
"float32", "llvm")
+):
+C = R.add(A, B)
+return C
+
+seq = tvm.ir.transform.Sequential(
+[
+tvm.relax.transform.LegalizeOps(),
+tvm.dlight.ApplyDefaultSchedule(tvm.dlight.gpu.Fallback()),
+],
+name="LegalizeAndSchedule",
+)
+with tvm.target.Target("cuda"):
+built = tvm.relax.build(seq(Module))
+
+np_A = np.random.random([32, 32]).astype("float32")
+np_B = np.random.random([32, 32]).astype("float32")
+
+dev_llvm = tvm.device("llvm")
+vm_llvm = tvm.relax.VirtualMachine(built, device=dev_llvm)
+llvm_output = vm_llvm["func_llvm"](
+tvm.nd.array(np_A, dev_llvm),
+tvm.nd.array(np_B, dev_llvm),
+)
+
+dev_cuda = tvm.device("cuda")
+vm_cuda = tvm.relax.VirtualMachine(built, device=dev_cuda)
+
+cuda_output = vm_cuda["func_cuda"](
+tvm.nd.array(np_A, dev_cuda),
+tvm.nd.array(np_B, dev_cuda),
+)
+
+np_C = np_A + np_B
+
+tvm.testing.assert_allclose(llvm_output.numpy(), np_C)
+tvm.testing.assert_allclose(cuda_output.numpy(), np_C)
+
+
 if __name__ == "__main__":
 tvm.testing.main()



(tvm) branch main updated: Support multinomial_from_uniform dispatch (#17010)

2024-05-24 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 604fbbdf0e Support multinomial_from_uniform dispatch (#17010)
604fbbdf0e is described below

commit 604fbbdf0e6f5c101c692fbcb5b69b610e6d624c
Author: Siyuan Feng 
AuthorDate: Fri May 24 18:52:03 2024 +0800

Support multinomial_from_uniform dispatch (#17010)
---
 include/tvm/relax/attrs/sampling.h |  46 +++
 python/tvm/relax/backend/__init__.py   |   3 +-
 python/tvm/relax/backend/dispatch_sampling.py  |  94 ++
 python/tvm/relax/backend/dispatch_sort_scan.py |  46 +--
 python/tvm/relax/backend/utils.py  |  55 +++-
 python/tvm/relax/backend_tir/__init__.py   |   3 +-
 python/tvm/relax/backend_tir/cumsum.py |   8 +-
 python/tvm/relax/backend_tir/sampling.py   | 339 +
 python/tvm/relax/frontend/nn/op.py |  46 +--
 python/tvm/relax/op/__init__.py|   7 +-
 python/tvm/relax/op/sampling.py|  87 ++
 python/tvm/relax/pipeline.py   |   1 +
 python/tvm/script/ir_builder/relax/ir.py   |  83 ++---
 python/tvm/script/parser/tir/parser.py |  25 +-
 python/tvm/target/detect_target.py |   4 +
 src/relax/op/tensor/index.cc   |   2 +-
 src/relax/op/tensor/sampling.cc| 143 +
 src/relax/op/tensor/sampling.h |  57 
 .../python/relax/test_backend_dispatch_sampling.py | 201 
 tests/python/relax/test_frontend_nn_op.py  |  40 +--
 tests/python/relax/test_op_sampling.py |  69 +
 .../python/tvmscript/test_tvmscript_parser_tir.py  |  24 ++
 22 files changed, 1222 insertions(+), 161 deletions(-)

diff --git a/include/tvm/relax/attrs/sampling.h 
b/include/tvm/relax/attrs/sampling.h
new file mode 100644
index 00..a878dd9766
--- /dev/null
+++ b/include/tvm/relax/attrs/sampling.h
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/relax/attrs/sampling.h
+ * \brief Attributes for sampling operators.
+ */
+#ifndef TVM_RELAX_ATTRS_SAMPLING_H_
+#define TVM_RELAX_ATTRS_SAMPLING_H_
+
+#include 
+
+namespace tvm {
+namespace relax {
+
+/*! \brief Attributes used in multinomial_from_uniform operator */
+struct MultinomialFromUniformAttrs : public tvm::AttrsNode<MultinomialFromUniformAttrs> {
+  DataType dtype;
+
+  TVM_DECLARE_ATTRS(MultinomialFromUniformAttrs, "relax.attrs.MultinomialFromUniformAttrs") {
+    TVM_ATTR_FIELD(dtype)
+        .set_default(DataType::Int(64))
+        .describe("Data type of the output indices.");
+  }
+};  // struct MultinomialFromUniformAttrs
+
+}  // namespace relax
+}  // namespace tvm
+
+#endif  // TVM_RELAX_ATTRS_SAMPLING_H_
diff --git a/python/tvm/relax/backend/__init__.py 
b/python/tvm/relax/backend/__init__.py
index e4a89bdb95..6d0ca30201 100644
--- a/python/tvm/relax/backend/__init__.py
+++ b/python/tvm/relax/backend/__init__.py
@@ -17,5 +17,6 @@
 """Relax backends"""
 
 from . import contrib
-from .pattern_registry import get_pattern, get_patterns_with_prefix
+from .dispatch_sampling import DispatchSampling
 from .dispatch_sort_scan import DispatchSortScan
+from .pattern_registry import get_pattern, get_patterns_with_prefix
diff --git a/python/tvm/relax/backend/dispatch_sampling.py 
b/python/tvm/relax/backend/dispatch_sampling.py
new file mode 100644
index 00..68d162fdf1
--- /dev/null
+++ b/python/tvm/relax/backend/dispatch_sampling.py
@@ -0,0 +1,94 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   ht
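The diff above is truncated by the archive, but the user-facing entry point added by this commit is `relax.op.multinomial_from_uniform` (see the file listing). A minimal sketch of how it might be called; the argument names and shapes are assumptions inferred from the attrs struct and file names above, not a verbatim quote of the committed API:

# Hedged sketch: argument names and shapes are assumptions, not the committed API.
from tvm import relax

prob = relax.Var("prob", relax.TensorStructInfo((4, 32000), "float32"))        # per-row probabilities
u = relax.Var("uniform_sample", relax.TensorStructInfo((4, 1), "float32"))     # uniform samples in [0, 1)
idx = relax.Var("sample_indices", relax.TensorStructInfo((4, 1), "int64"))     # which prob row each sample uses
# dtype defaults to int64, matching MultinomialFromUniformAttrs above
sampled = relax.op.multinomial_from_uniform(prob, u, idx, dtype="int64")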

(tvm) branch main updated: [Metal] Support metal device profiling (#17025)

2024-05-24 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 7463b37b88 [Metal] Support metal device profiling (#17025)
7463b37b88 is described below

commit 7463b37b88b488bf1cf8696632765c51760fe3be
Author: Siyuan Feng 
AuthorDate: Fri May 24 18:51:27 2024 +0800

[Metal] Support metal device profiling (#17025)

Enable native metal device profiling through API `sampleTimestamps`
---
 src/runtime/metal/metal_device_api.mm | 37 +++
 1 file changed, 37 insertions(+)

diff --git a/src/runtime/metal/metal_device_api.mm 
b/src/runtime/metal/metal_device_api.mm
index 37fb9dc347..42dd249630 100644
--- a/src/runtime/metal/metal_device_api.mm
+++ b/src/runtime/metal/metal_device_api.mm
@@ -21,6 +21,7 @@
  * \file metal_device_api.mm
  */
 #include 
+#include 
 #include 
 #include "metal_common.h"
 
@@ -366,6 +367,42 @@ 
TVM_REGISTER_GLOBAL("metal.ResetGlobalState").set_body_typed([]() {
   MetalWorkspace::Global()->ReinitializeDefaultStreams();
 });
 
+class MetalTimerNode : public TimerNode {
+ public:
+  MetalTimerNode() {}
+  explicit MetalTimerNode(Device dev) : dev_(dev) {
+    mtl_dev_ = MetalWorkspace::Global()->GetDevice(dev_);
+  }
+
+  virtual void Start() {
+    [mtl_dev_ sampleTimestamps:&start_cpu_time_ gpuTimestamp:&start_gpu_time_];
+  }
+  virtual void Stop() {
+    auto ws = MetalWorkspace::Global();
+    ws->StreamSync(dev_, ws->GetCurrentStream(dev_));
+    [mtl_dev_ sampleTimestamps:&stop_cpu_time_ gpuTimestamp:&stop_gpu_time_];
+  }
+  virtual int64_t SyncAndGetElapsedNanos() { return stop_gpu_time_ - start_gpu_time_; }
+
+  static constexpr const char* _type_key = "MetalTimerNode";
+  TVM_DECLARE_FINAL_OBJECT_INFO(MetalTimerNode, TimerNode);
+
+ private:
+  Device dev_;
+  id<MTLDevice> mtl_dev_;
+
+  MTLTimestamp start_cpu_time_;
+  MTLTimestamp start_gpu_time_;
+  MTLTimestamp stop_cpu_time_;
+  MTLTimestamp stop_gpu_time_;
+};
+
+TVM_REGISTER_OBJECT_TYPE(MetalTimerNode);
+
+TVM_REGISTER_GLOBAL("profiling.timer.metal").set_body_typed([](Device dev) {
+  return Timer(make_object<MetalTimerNode>(dev));
+});
+
 }  // namespace metal
 }  // namespace runtime
 }  // namespace tvm
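With `profiling.timer.metal` registered, Metal devices can participate in TVM's timer-based profiling. A rough sketch of the usual Python-side flow; the `profile=True` / `vm.profile` entry points are assumptions about the standard Relax VM profiling path and are not part of this commit:

# Hedged sketch (assumed profiling flow, not shown in this commit):
import tvm

dev = tvm.metal(0)
# ex = tvm.relax.build(mod, target="metal")
# vm = tvm.relax.VirtualMachine(ex, dev, profile=True)   # enables per-op timers
# report = vm.profile("main", *inputs)                   # uses the registered metal timer
# print(report)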



(tvm) branch cmake-debug created (now 41a13e7f5f)

2024-05-23 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch cmake-debug
in repository https://gitbox.apache.org/repos/asf/tvm.git


  at 41a13e7f5f Update

This branch includes the following new commits:

 new 41a13e7f5f Update

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.




(tvm) 01/01: Update

2024-05-23 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch cmake-debug
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit 41a13e7f5f78becd855b6aefd4739160cb166099
Author: tqchen 
AuthorDate: Thu May 23 19:24:20 2024 -0400

Update
---
 3rdparty/flashinfer | 2 +-
 docker/bash.sh  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/3rdparty/flashinfer b/3rdparty/flashinfer
index f978e02565..920672776a 16
--- a/3rdparty/flashinfer
+++ b/3rdparty/flashinfer
@@ -1 +1 @@
-Subproject commit f978e02565d7157d57803eb4153369e046fc4106
+Subproject commit 920672776a2bf2244acf7a2e0516f46be9e93b15
diff --git a/docker/bash.sh b/docker/bash.sh
index a3d57bfd42..1d3c2e1a74 100755
--- a/docker/bash.sh
+++ b/docker/bash.sh
@@ -517,3 +517,5 @@ if ${DRY_RUN}; then
 else
 ${DOCKER_CMD[@]+"${DOCKER_CMD[@]}"}
 fi
+
+echo Finish running exit status $?



(tvm) branch main updated: [DLight] Update Adreno GEMV Rules (#17016)

2024-05-21 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 2e56421dda [DLight] Update Adreno GEMV Rules (#17016)
2e56421dda is described below

commit 2e56421dda32755a0b9c41cd1515ec4f8e4d598e
Author: Siyuan Feng 
AuthorDate: Tue May 21 22:59:36 2024 +0800

[DLight] Update Adreno GEMV Rules (#17016)

When the reduction axis is small, it is not necessary to use rfactor. This
PR updates the gemv rule to use rfactor only when the reduction axis is
large enough.
---
 python/tvm/dlight/gpu/gemv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/dlight/gpu/gemv.py b/python/tvm/dlight/gpu/gemv.py
index da6a4ef834..b8a2c6a15f 100644
--- a/python/tvm/dlight/gpu/gemv.py
+++ b/python/tvm/dlight/gpu/gemv.py
@@ -711,7 +711,7 @@ class GEMV(GPUScheduleRule):
 if LOAD_V_SHARED is False:
 LOAD_V_TILE = 1
 
-if not isinstance(len_r, int):
+if not isinstance(len_r, int) or len_r < LOAD_V_TILE * TR * SCALE_PACK * DEC_PACK:
 return None
 
 if not isinstance(len_s, int):



(tvm) branch main updated: [KVCache] Support KVCache decode from forked sequence and pop more tokens (#16995)

2024-05-20 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 18a2a250f8 [KVCache] Support KVCache decode from forked sequence and 
pop more tokens (#16995)
18a2a250f8 is described below

commit 18a2a250f8c7f16f5f5be6753861ba5db8fb89fa
Author: Yaxing Cai 
AuthorDate: Mon May 20 08:13:50 2024 -0700

[KVCache] Support KVCache decode from forked sequence and pop more tokens 
(#16995)
---
 src/runtime/relax_vm/paged_kv_cache.cc | 65 +++---
 1 file changed, 53 insertions(+), 12 deletions(-)

diff --git a/src/runtime/relax_vm/paged_kv_cache.cc 
b/src/runtime/relax_vm/paged_kv_cache.cc
index b07ae3d76d..a5d2d9f415 100644
--- a/src/runtime/relax_vm/paged_kv_cache.cc
+++ b/src/runtime/relax_vm/paged_kv_cache.cc
@@ -925,10 +925,21 @@ class PagedAttentionKVCacheObj : public 
AttentionKVCacheObj {
 if (fork_pos == -1 || fork_pos == parent_it->second.seq_length) {
   // Fork at last by appending a new block directly
   int32_t parent_block_idx = parent_it->second.last_block_idx;
+  if (!global_block_pool_[parent_block_idx].seq_length) {
+// If parent ends with empty block, fork from parent's parent block
+parent_block_idx = global_block_pool_[parent_block_idx].parent_idx;
+  }
   ++global_block_pool_[parent_block_idx].external_ref_cnt;
   // Update child block start position and parent index
   global_block_pool_[child_block_idx].start_pos = 
parent_it->second.seq_length;
   global_block_pool_[child_block_idx].parent_idx = parent_block_idx;
+  if (global_block_pool_[parent_block_idx].seq_length) {
+// If parent is not empty, append a new block
+int32_t new_parent_block_idx = GetFreeBlock();
+global_block_pool_[new_parent_block_idx].start_pos = 
parent_it->second.seq_length;
+global_block_pool_[new_parent_block_idx].parent_idx = parent_block_idx;
+parent_it->second.last_block_idx = new_parent_block_idx;
+  }
 } else {
   // Locate the block to fork from and calculate in-block offset
   std::vector trace = 
parent_it->second.GetBlockTrace(global_block_pool_);
@@ -1038,21 +1049,51 @@ class PagedAttentionKVCacheObj : public 
AttentionKVCacheObj {
 auto it = seq_map_.find(seq_id);
 CHECK(it != seq_map_.end()) << "The sequence \"" << seq_id << "\" cannot 
be found in KV cache.";
 
-    Block& block = global_block_pool_[it->second.last_block_idx];
     CHECK_GE(n, 0) << "The length of popping " << n << " cannot be negative.";
-    CHECK_LE(n, block.seq_length) << "The sequence only has length " << block.seq_length
-                                  << " in the last block, while the length of pop is " << n
-                                  << " which exceeds the last-block sequence length.";
+    CHECK_LE(n, it->second.seq_length)
+        << "The sequence only has length " << it->second.seq_length
+        << ", while the length of pop is " << n << " which exceeds the whole sequence length.";
+    int32_t block_idx = it->second.last_block_idx;
+    while (block_idx != -1 && global_block_pool_[block_idx].external_ref_cnt == 0) {
+      if (n > global_block_pool_[block_idx].seq_length) {
+        n -= global_block_pool_[block_idx].seq_length;
+        it->second.seq_length -= global_block_pool_[block_idx].seq_length;
+        for (int32_t page_id : global_block_pool_[block_idx].page_ids) {
+          free_page_ids_.push_back(page_id);
+        }
+        free_block_idx_.push_back(block_idx);
+        block_idx = global_block_pool_[block_idx].parent_idx;
+        it->second.last_block_idx = block_idx;
+        continue;
+      }
+      if (n <= global_block_pool_[block_idx].seq_length) {
+        int64_t cur_npage = global_block_pool_[block_idx].page_ids.size();
+        int64_t tgt_npage =
+            (global_block_pool_[block_idx].seq_length - n + page_size_ - 1) / page_size_;
+        while (cur_npage > tgt_npage) {
+          free_page_ids_.push_back(global_block_pool_[block_idx].page_ids.back());
+          global_block_pool_[block_idx].page_ids.pop_back();
+          --cur_npage;
+        }
+        it->second.seq_length -= n;
+        global_block_pool_[block_idx].seq_length -= n;
+        n = 0;
+        break;
+      }
+    }
 
-    int64_t cur_npage = block.page_ids.size();
-    int64_t tgt_npage = (block.seq_length - n + page_size_ - 1) / page_size_;
-    while (cur_npage > tgt_npage) {
-      free_page_ids_.push_back(block.page_ids.back());
-      block.page_ids.pop_back();
-      --cur_npage;
+    if (n) {
+
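A small pure-Python sketch of the multi-block pop logic above may help; names are simplified and the page size is an assumed constant, so treat it as illustrative only:

# Illustrative sketch of the multi-block PopN algorithm above.
# `blocks` maps a block index to a dict with "len", "parent", "ref_cnt", "pages".
def pop_n(seq, n, blocks, free_pages, free_blocks):
    assert 0 <= n <= seq["len"]
    idx = seq["last_block"]
    while idx != -1 and blocks[idx]["ref_cnt"] == 0:
        if n > blocks[idx]["len"]:
            # Drop the whole block and keep popping from its parent.
            n -= blocks[idx]["len"]
            seq["len"] -= blocks[idx]["len"]
            free_pages.extend(blocks[idx]["pages"])
            free_blocks.append(idx)
            idx = blocks[idx]["parent"]
            seq["last_block"] = idx
            continue
        # Pop within the current block and release now-unused pages.
        page_size = 16  # assumed page size for the sketch
        tgt_npage = -(-(blocks[idx]["len"] - n) // page_size)  # ceiling division
        while len(blocks[idx]["pages"]) > tgt_npage:
            free_pages.append(blocks[idx]["pages"].pop())
        seq["len"] -= n
        blocks[idx]["len"] -= n
        n = 0
        break
    return n  # non-zero means the remainder falls in a shared (referenced) block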

(tvm) branch main updated: [WebGPU] Handle device OOM in createBuffer (#17005)

2024-05-17 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new afb6416234 [WebGPU] Handle device OOM in createBuffer (#17005)
afb6416234 is described below

commit afb64162342bc911cb101a5038139441cbbd8bbc
Author: Charlie Ruan <53290280+charliefr...@users.noreply.github.com>
AuthorDate: Fri May 17 09:41:57 2024 -0700

[WebGPU] Handle device OOM in createBuffer (#17005)
---
 web/src/runtime.ts | 15 +++
 web/src/webgpu.ts  | 29 ++---
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/web/src/runtime.ts b/web/src/runtime.ts
index ff4dce497d..080003b4f0 100644
--- a/web/src/runtime.ts
+++ b/web/src/runtime.ts
@@ -1014,6 +1014,7 @@ export class Instance implements Disposable {
   private asyncifyHandler: AsyncifyHandler;
   private initProgressCallback: Array = [];
   private rng: LinearCongruentialGenerator;
+  private deviceLostIsError = true;  // whether device.lost is due to actual 
error or dispose()
 
   /**
* Internal function(registered by the runtime)
@@ -1107,11 +1108,14 @@ export class Instance implements Disposable {
   }
 
   dispose(): void {
+this.deviceLostIsError = false;  // prevent dispose to trigger device.lost 
error
 // order matters
 // ctx release goes back into lib.
 this.ctx.dispose();
 this.lib.dispose();
+this.deviceLostIsError = true;
   }
+
   /**
* Obtain the runtime information in readable format.
*/
@@ -2094,6 +2098,17 @@ export class Instance implements Disposable {
* @param device The given GPU device.
*/
   initWebGPU(device: GPUDevice): void {
+device.addEventListener("uncapturederror", (event) => {
+  console.error("A WebGPU error was not captured: ", event);
+});
+
+device.lost.then((info: any) => {
+  if (this.deviceLostIsError) {
+console.error("Device lost, calling Instance.dispose(). Please 
initialize again. ", info);
+this.dispose();
+  }
+});
+
 const webGPUContext = new WebGPUContext(
   this.memory, device
 );
diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts
index 55c53bb8d5..8d699c4c48 100644
--- a/web/src/webgpu.ts
+++ b/web/src/webgpu.ts
@@ -120,6 +120,29 @@ export async function detectGPUDevice(): 
Promise {if (error) {device.destroy(); 
console.error(error);}});
+  device.popErrorScope().then((error) => {if (error) {device.destroy(); 
console.error(error);}});
+  device.popErrorScope().then((error) => {if (error) {device.destroy(); 
console.error(error);}});
+
+  return buffer;
+}
+
 const canvasRenderWGSL = `
 @group(0) @binding(0) var my_sampler : sampler;
 @group(0) @binding(1) var my_texture : texture_2d;
@@ -504,7 +527,7 @@ export class WebGPUContext {
 
 if (buffer == undefined) {
   // create uniform buffer
-  buffer = this.device.createBuffer({
+  buffer = tryCreateBuffer(this.device, {
 size: allocSize,
 usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
   });
@@ -779,7 +802,7 @@ export class WebGPUContext {
 if (nbytes == 0) {
   nbytes = 1;
 }
-const buffer = this.device.createBuffer({
+const buffer = tryCreateBuffer(this.device, {
   size: nbytes,
   usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | 
GPUBufferUsage.COPY_DST,
 });
@@ -833,7 +856,7 @@ export class WebGPUContext {
 nbytes: number
   ): void {
 // Perhaps it would be more useful to resuse a staging buffer?
-const gpuTemp = this.device.createBuffer({
+const gpuTemp = tryCreateBuffer(this.device, {
   size: nbytes,
   usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
 });



(tvm) branch main updated: [KVCache] Fix the aux data syncing order of paged KV cache (#16988)

2024-05-12 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new d1ac1c0202 [KVCache] Fix the aux data syncing order of paged KV cache 
(#16988)
d1ac1c0202 is described below

commit d1ac1c0202b3d8cb2af268ce79c2ac710554152b
Author: Rick Zhou 
AuthorDate: Sun May 12 18:22:18 2024 -0700

[KVCache] Fix the aux data syncing order of paged KV cache (#16988)

Fix the aux data syncing order of paged KV cache
---
 src/runtime/relax_vm/paged_kv_cache.cc | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/runtime/relax_vm/paged_kv_cache.cc 
b/src/runtime/relax_vm/paged_kv_cache.cc
index efedac235b..9a17354fe5 100644
--- a/src/runtime/relax_vm/paged_kv_cache.cc
+++ b/src/runtime/relax_vm/paged_kv_cache.cc
@@ -1709,24 +1709,28 @@ class PagedAttentionKVCacheObj : public 
AttentionKVCacheObj {
 // - Reset the copy.
 aux_data_manager_->ResetCopy();
 
-// 1. qo_indptr_on_depths
+// 1. q_rope_position_map
+// q_rope_position_map has to be synced first so that it has a 0 byte 
offset
+ICHECK_EQ(q_rope_position_map_host_.size(), total_append_length);
+q_rope_position_map_view_ = 
aux_data_manager_->CopyQRoPEPosMapAsync(_rope_position_map_host_);
+// 2. qo_indptr_on_depths
 for (int d = 0; d < num_depths_; ++d) {
   qo_indptr_on_depths_view_[d] =
   
aux_data_manager_->CopyQOIndptrOnDepthAsync(_indptr_on_depths_host_[d], d);
 }
-// 2. page_indptr_on_depths
+// 3. page_indptr_on_depths
 for (int d = 0; d < num_depths_; ++d) {
   ICHECK_EQ(page_indptr_on_depths_host_[d].size(), 
qo_indptr_on_depths_host_[d].size());
   page_indptr_on_depths_view_[d] =
   
aux_data_manager_->CopyPageIndptrOnDepthAsync(_indptr_on_depths_host_[d], 
d);
 }
-// 3. page_indices_on_depths
+// 4. page_indices_on_depths
 for (int d = 0; d < num_depths_; ++d) {
   ICHECK_EQ(page_indices_on_depths_host_[d].size(), 
page_indptr_on_depths_host_[d].back());
   page_indices_on_depths_view_[d] =
   
aux_data_manager_->CopyPageIndicesOnDepthAsync(_indices_on_depths_host_[d],
 d);
 }
-// 4. length_info_on_depths
+// 5. length_info_on_depths
 // last_page_len_on_depths_host_;
 // sliding_window_offset_on_depths_host_;
 // sink_size_on_depths_host_;
@@ -1746,23 +1750,20 @@ class PagedAttentionKVCacheObj : public 
AttentionKVCacheObj {
 _size_on_depths_host_[d], d);
   }
 }
-// 5. k_rope_pos_offset_on_depths
+// 6. k_rope_pos_offset_on_depths
 for (int d = 0; d < num_depths_; ++d) {
   ICHECK_EQ(k_rope_pos_offset_on_depths_host_[d].size() + 1,
 qo_indptr_on_depths_host_[d].size());
   k_rope_pos_offset_view_[d] = 
aux_data_manager_->CopyKRoPEPosOffsetOnDepthAsync(
   _rope_pos_offset_on_depths_host_[d], d);
 }
-// 6. cur_append_lengths_indptr
+// 7. cur_append_lengths_indptr
 cur_append_length_indptr_view_ =
 
aux_data_manager_->CopyCurAppendLengthIndptrAsync(_append_lengths_indptr_host_);
-// 7. k_ragged_rope_pos_offset
+// 8. k_ragged_rope_pos_offset
 ICHECK_EQ(k_ragged_rope_pos_offset_host_.size(), num_sequences);
 k_ragged_rope_pos_offset_view_ =
 
aux_data_manager_->CopyKRaggedRoPEPosOffsetAsync(_ragged_rope_pos_offset_host_);
-// 8. q_rope_position_map
-ICHECK_EQ(q_rope_position_map_host_.size(), total_append_length);
-q_rope_position_map_view_ = 
aux_data_manager_->CopyQRoPEPosMapAsync(_rope_position_map_host_);
 // 9. append_position_map
 append_position_map_view_ =
 
aux_data_manager_->CopyAppendPositionMapAsync(_position_map_host_);



(tvm) branch main updated: [TOPI] Remove `blockIdx.z` in topi sort (#16977)

2024-05-10 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 825dc1ffb5 [TOPI] Remove `blockIdx.z` in topi sort (#16977)
825dc1ffb5 is described below

commit 825dc1ffb51c25506600136d2ec8fb336f476c84
Author: Siyuan Feng 
AuthorDate: Fri May 10 21:08:17 2024 +0800

[TOPI] Remove `blockIdx.z` in topi sort (#16977)

As `blockIdx.z` is not allowed in WebGPU, this PR splits `blockIdx.z`
into `blockIdx.y` to support WebGPU.
---
 python/tvm/topi/cuda/sort.py | 31 ++-
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index dc72aa8cc1..9151744b69 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -57,18 +57,16 @@ def _schedule_sort(outs):
 return s
 
 
-def _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz):
+def _get_threads(ib, nthread_tx, nthread_bx, nthread_by):
 tx = te.thread_axis("threadIdx.x")
 bx = te.thread_axis("blockIdx.x")
 ib.scope_attr(tx, "thread_extent", nthread_tx)
 ib.scope_attr(bx, "thread_extent", nthread_bx)
 
 by = te.thread_axis("blockIdx.y")
-bz = te.thread_axis("blockIdx.z")
 ib.scope_attr(by, "thread_extent", nthread_by)
-ib.scope_attr(bz, "thread_extent", nthread_bz)
 
-return tx, bx, by, bz
+return tx, bx, by
 
 
 def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, 
value_init_func=None):
@@ -87,13 +85,13 @@ def _sort_init(ib, shape, axis, keys_in, keys_out, 
values_out=None, value_init_f
 max_threads = 
int(tvm.target.Target.current(allow_none=False).max_num_threads)
 nthread_tx = max_threads
 nthread_bx = ceil_div(shape[axis], max_threads)
-nthread_by = axis_mul_before
-nthread_bz = axis_mul_after
+nthread_by = axis_mul_before * axis_mul_after
 
 # Copy the keys_in to initial output
 with ib.new_scope():
-tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, 
nthread_bz)
+tx, bx, by = _get_threads(ib, nthread_tx, nthread_bx, nthread_by)
 tid = bx * nthread_tx + tx
+by, bz = by % axis_mul_before, by // axis_mul_before
 idx = (by * shape[axis] + tid) * axis_mul_after + bz
 with ib.if_scope(tid < shape[axis]):
 keys_out[idx] = keys_in[idx]
@@ -122,11 +120,11 @@ def _odd_even_sort(
 ):
 nthread_tx = block_size // 2
 nthread_bx = ceil_div(size, block_size)
-nthread_by = axis_mul_before
-nthread_bz = axis_mul_after
+nthread_by = axis_mul_before * axis_mul_after
 with ib.new_scope():
 ib.scope_attr(tvm.tir.const(0), "hand_threaded", 0)
-tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, 
nthread_bz)
+tx, bx, by = _get_threads(ib, nthread_tx, nthread_bx, nthread_by)
+by, bz = by % axis_mul_before, by // axis_mul_before
 tid = 2 * tx
 start = bx * block_size
 
@@ -222,7 +220,6 @@ def _sort_common(
 
 max_threads = 
int(tvm.target.Target.current(allow_none=False).max_num_threads)
 nthread_by = axis_mul_before * axis_mul_after
-nthread_bz = 1
 nthread_tx = max_threads
 nthread_bx = ceil_div(size, nthread_tx)
 
@@ -334,12 +331,13 @@ def _sort_common(
 ntx = max_threads
 nbx = tvm.tir.generic.cast(ceil_div(width, max_threads * 
thread_work), "int32")
 nbz = tvm.tir.generic.cast(ceil_div(size, width), "int32")
-tx, bx, by, bz = _get_threads(ib, ntx, nbx, nthread_by, nbz)
+tx, bx, by = _get_threads(ib, ntx, nbx, nthread_by * nbz)
 else:
 ntx = tvm.tir.generic.cast(tvm.te.min(max_threads, width), 
"int32")
 nbx = tvm.tir.generic.cast(ceil_div(width, max_threads * 
thread_work), "int32")
 nbz = tvm.tir.generic.cast(ceil_div(size, width), "int32")
-tx, bx, by, bz = _get_threads(ib, ntx, nbx, nthread_by, nbz)
+tx, bx, by = _get_threads(ib, ntx, nbx, nthread_by * nbz)
+by, bz = by % nthread_by, by // nthread_by
 
 def mergepath(
 source,
@@ -471,8 +469,7 @@ def _sort_common(
 width,
 tvm.tir.indexmod(l2_width, 2) == 0,
 )
-nthread_by = axis_mul_before
-nthread_bz = axis_mul_after
+nthread_by = axis_mul_before * axis_mul_after
 nthread_tx = max_threads
 nthread_bx = ceil_div(size, nthread_tx)
 ## if the final sorted data ended up in the swap, copy it to the real 
output
@@ -480,9 +477,9 @@ def _sort_common(
 tvm.tir.all(upper_lim > lower_lim, tvm.tir.indexmod(upper_l
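The change boils down to fusing two launch dimensions into one and recovering them with a modulo/divide pair, as in this plain-Python sketch of the index math (extents illustrative):

# Plain-Python sketch of the fused-axis index math used above.
axis_mul_before, axis_mul_after = 3, 4   # illustrative extents
fused_extent = axis_mul_before * axis_mul_after

for fused in range(fused_extent):        # what used to be a (blockIdx.y, blockIdx.z) grid
    by = fused % axis_mul_before         # recovered blockIdx.y
    bz = fused // axis_mul_before        # recovered blockIdx.z
    # indexing is unchanged, e.g. idx = (by * n + tid) * axis_mul_after + bz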

(tvm) branch main updated (4c1ebcf81a -> fffd168d00)

2024-05-09 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 4c1ebcf81a [Relax] Implement relax.op.view (#16955)
 add fffd168d00 [Unity][BYOC] Use arith.Analyzer to check batch equality of 
matmul in cublas (#16982)

No new revisions were added by this update.

Summary of changes:
 python/tvm/relax/backend/contrib/cublas.py | 5 -
 tests/python/relax/test_codegen_cublas.py  | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)



(tvm) branch main updated: [Relax] Support nested ModuleList in nn.Module (#16971)

2024-05-07 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 819b0023e4 [Relax] Support nested ModuleList in nn.Module (#16971)
819b0023e4 is described below

commit 819b0023e46dd85a5ae8ce6294e5456abaf78f3c
Author: Wuwei Lin 
AuthorDate: Tue May 7 06:09:32 2024 -0700

[Relax] Support nested ModuleList in nn.Module (#16971)
---
 python/tvm/relax/frontend/nn/core.py   | 15 +--
 tests/python/relax/test_frontend_nn_modules.py | 15 +++
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/python/tvm/relax/frontend/nn/core.py 
b/python/tvm/relax/frontend/nn/core.py
index 4953c1c817..46e016a242 100644
--- a/python/tvm/relax/frontend/nn/core.py
+++ b/python/tvm/relax/frontend/nn/core.py
@@ -607,16 +607,19 @@ def wrap_nested(expr: rx.Expr, name: str) -> 
Union[Tensor, Sequence[Tensor]]:
 
 def _attribute_finder(root: Module, prefix: str, condition_yield: 
Callable[[Any], bool]):
 """Find attributes that satisfy the condition recursively"""
+if isinstance(root, ModuleList):
+for i, subitem in enumerate(root):
+yield from _attribute_finder(subitem, prefix + f"{i}.", 
condition_yield)
+return
 for name, item in root.__dict__.items():
 if condition_yield(item):
 yield prefix + name, item
 elif isinstance(item, ModuleList):
-for i, subitem in enumerate(item):
-yield from _attribute_finder(
-subitem,
-prefix + name + f".{i}.",
-condition_yield,
-)
+yield from _attribute_finder(
+item,
+prefix + name + ".",
+condition_yield,
+)
 elif isinstance(item, Module):
 yield from _attribute_finder(
 item,
diff --git a/tests/python/relax/test_frontend_nn_modules.py 
b/tests/python/relax/test_frontend_nn_modules.py
index 5ddc105055..23250f28aa 100644
--- a/tests/python/relax/test_frontend_nn_modules.py
+++ b/tests/python/relax/test_frontend_nn_modules.py
@@ -700,5 +700,20 @@ def test_nn_module_list_input():
 assert_structural_equal(tvm_mod["forward"], forward)
 
 
+def test_module_list():
+class Module(nn.Module):
+def __init__(self):
+self.layers = nn.ModuleList(
+[nn.ModuleList([nn.Linear(4, 4, bias=False) for _ in 
range(2)]) for _ in range(1)]
+)
+
+def forward(self, x: nn.Tensor):
+return self.layers(x)
+
+mod = Module()
+named_params = dict(mod.named_parameters())
+assert ["layers.0.0.weight", "layers.0.1.weight"] == 
sorted(list(named_params.keys()))
+
+
 if __name__ == "__main__":
 tvm.testing.main()



(tvm) branch main updated: [TIR] Support narrow dtype for let binding (#16947)

2024-05-06 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 28d32b52cb [TIR] Support narrow dtype for let binding (#16947)
28d32b52cb is described below

commit 28d32b52cbde45600dc14a41af7f5ef9b6b778c5
Author: Siyuan Feng 
AuthorDate: Mon May 6 20:07:42 2024 +0800

[TIR] Support narrow dtype for let binding (#16947)

The current pass `ForceNarrowIndexToI32` fails to narrow dtype for let
binding. This PR fixes the issue.

BTW, this PR addresses the comments in #16934
---
 include/tvm/tir/data_type_rewriter.h   |  1 +
 python/tvm/relax/backend/dispatch_sort_scan.py |  6 +-
 src/tir/ir/data_type_rewriter.cc   | 19 
 .../relax/test_backend_dispatch_sort_scan.py   | 22 +--
 ...test_tir_transform_force_narrow_index_to_i32.py | 25 ++
 5 files changed, 60 insertions(+), 13 deletions(-)

diff --git a/include/tvm/tir/data_type_rewriter.h 
b/include/tvm/tir/data_type_rewriter.h
index 846cda74c6..913e2ab189 100644
--- a/include/tvm/tir/data_type_rewriter.h
+++ b/include/tvm/tir/data_type_rewriter.h
@@ -110,6 +110,7 @@ class IndexDataTypeRewriter : public DataTypeLegalizer {
   Stmt VisitStmt_(const IfThenElseNode* op) override;
   Stmt VisitStmt_(const DeclBufferNode* op) override;
   Stmt VisitStmt_(const AllocateNode* op) override;
+  Stmt VisitStmt_(const LetStmtNode* op) override;
   PrimExpr VisitExpr_(const EQNode* op) override;
   PrimExpr VisitExpr_(const NENode* op) override;
   PrimExpr VisitExpr_(const LTNode* op) override;
diff --git a/python/tvm/relax/backend/dispatch_sort_scan.py 
b/python/tvm/relax/backend/dispatch_sort_scan.py
index e25c28e571..53948b8449 100644
--- a/python/tvm/relax/backend/dispatch_sort_scan.py
+++ b/python/tvm/relax/backend/dispatch_sort_scan.py
@@ -155,9 +155,13 @@ class SortScanDispatcher(PyExprMutator):
 tgt = self._get_target(call.struct_info)
 axis = int(call.attrs.axis) if call.attrs.axis is not None else 
call.attrs.axis
 shape = call.struct_info.shape
+# TODO(tvm-team): Support fully dynamic case with `shape=None`
+if shape is None:
+raise ValueError("non-symbolic shape is not supported for now")
 kwargs = {}
 if (
-(axis == -1 or axis == len(shape) - 1)
+shape is not None
+and (axis == -1 or axis == len(shape) - 1)
 and is_gpu_target(tgt)
 and not can_use_thrust(tgt, "tvm.contrib.thrust.sum_scan")
 and call.op.name == "relax.cumsum"
diff --git a/src/tir/ir/data_type_rewriter.cc b/src/tir/ir/data_type_rewriter.cc
index c03e19137e..2bc1cd5797 100644
--- a/src/tir/ir/data_type_rewriter.cc
+++ b/src/tir/ir/data_type_rewriter.cc
@@ -27,6 +27,10 @@
 #include 
 
 #include "./functor_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/tir/expr.h"
+#include "tvm/tir/stmt.h"
+#include "tvm/tir/var.h"
 
 namespace tvm {
 namespace tir {
@@ -558,6 +562,21 @@ Stmt IndexDataTypeRewriter::VisitStmt_(const ForNode* op) {
   }
 }
 
+Stmt IndexDataTypeRewriter::VisitStmt_(const LetStmtNode* op) {
+  LetStmt let_stmt = Downcast(DataTypeLegalizer::VisitStmt_(op));
+  if (var_remap_.find(let_stmt->var.get()) == var_remap_.end()) {
+return let_stmt;
+  }
+  bool is_enabled = is_enabled_;
+  is_enabled_ = true;
+  PrimExpr value = VisitExpr(op->value);
+  Var var = var_remap_[let_stmt->var.get()];
+  is_enabled_ = is_enabled;
+  ICHECK(value.dtype() == var.dtype());
+  // No need to re-visit body
+  return LetStmt(var, value, let_stmt->body, let_stmt->span);
+}
+
 #define TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC) 
\
   PrimExpr IndexDataTypeRewriter::VisitExpr_(const OP* op) {   
\
 bool is_enabled = is_enabled_; 
\
diff --git a/tests/python/relax/test_backend_dispatch_sort_scan.py 
b/tests/python/relax/test_backend_dispatch_sort_scan.py
index a539621060..2ab5afaabf 100644
--- a/tests/python/relax/test_backend_dispatch_sort_scan.py
+++ b/tests/python/relax/test_backend_dispatch_sort_scan.py
@@ -273,7 +273,7 @@ def test_dispatch_argsort_cuda():
 if can_use_thrust(target, "tvm.contrib.thrust.sort"):
 workspace = bb.emit(
 relax.op.builtin.alloc_tensor(
-R.shape([4194568]), R.dtype("uint8"), 
R.prim_value(0), R.str("global")
+R.shape([8388872]), R.dtype("uint8"), 
R.prim_value(0), R.str("global")
 )
 

(tvm) branch main updated: [LLVM] Stringref API deprecation fixes (#16968)

2024-05-06 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 876f52805d [LLVM] Stringref API deprecation fixes (#16968)
876f52805d is described below

commit 876f52805d3184d6d8b05439e9c9578687b6ae77
Author: Anirudh Sundar Subramaniam 
AuthorDate: Mon May 6 17:36:15 2024 +0530

[LLVM] Stringref API deprecation fixes (#16968)

The `startswith`/`endswith` functions in the `StringRef` API were 
[changed](https://reviews.llvm.org/D136030) to
`starts_with` and `ends_with` to be compatible with `std::string`, and
the older APIs were deprecated and removed.
---
 src/target/llvm/codegen_hexagon.cc | 11 +++
 src/target/llvm/codegen_llvm.cc|  4 
 src/target/llvm/llvm_instance.cc   |  4 
 3 files changed, 19 insertions(+)

diff --git a/src/target/llvm/codegen_hexagon.cc 
b/src/target/llvm/codegen_hexagon.cc
index 6ef5e064c0..5113957aa1 100644
--- a/src/target/llvm/codegen_hexagon.cc
+++ b/src/target/llvm/codegen_hexagon.cc
@@ -126,9 +126,16 @@ void CodeGenHexagon::InitTarget() {
   const auto hvx_length_feature = "+hvx-length";  // +hvx-length{64|128}b
   for (const std::string& f : llvm_target_->GetTargetFeatures()) {
 llvm::StringRef fs(f);
+#if TVM_LLVM_VERSION >= 180
+if (!fs.starts_with(hvx_length_feature)) continue;
+
+ICHECK(fs.ends_with("b")) << "malformed target feature: " << f;
+#else
 if (!fs.startswith(hvx_length_feature)) continue;
 
 ICHECK(fs.endswith("b")) << "malformed target feature: " << f;
+#endif
+
 int hvx_bytes = 0;
 size_t len_begin = std::strlen(hvx_length_feature);
 ICHECK(!fs.substr(len_begin, fs.size() - len_begin - 1).getAsInteger(10, 
hvx_bytes))
@@ -639,7 +646,11 @@ runtime::Module BuildHexagon(IRModule mod, Target target) {
   Map extra_args;
   if (target->attrs.count("mcpu")) {
 std::string mcpu = Downcast(target->attrs.at("mcpu"));
+#if TVM_LLVM_VERSION >= 180
+ICHECK(llvm::StringRef(mcpu).starts_with("hexagon"))
+#else
 ICHECK(llvm::StringRef(mcpu).startswith("hexagon"))
+#endif
 << "unexpected -mcpu value in target:" << mcpu;
 extra_args.Set("hex_arch", 
llvm::StringRef(mcpu).drop_front(strlen("hexagon")).str());
   }
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 6566bb4291..6fc083d17c 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -372,7 +372,11 @@ std::unique_ptr CodeGenLLVM::Finish() {
 void CodeGenLLVM::HandleImport(const std::string& code) {
   llvm::StringRef code_str(code);
   std::unique_ptr mlib;
+#if TVM_LLVM_VERSION >= 180
+  if (code_str.ends_with(".ll") || code_str.ends_with(".bc")) {
+#else
   if (code_str.endswith(".ll") || code_str.endswith(".bc")) {
+#endif
 mlib = llvm_target_->GetInstance().LoadIR(code);
   } else {
 mlib = llvm_target_->GetInstance().ParseIR(code);
diff --git a/src/target/llvm/llvm_instance.cc b/src/target/llvm/llvm_instance.cc
index bd2eee85b0..dd5a3fb681 100644
--- a/src/target/llvm/llvm_instance.cc
+++ b/src/target/llvm/llvm_instance.cc
@@ -916,7 +916,11 @@ std::string LLVMTarget::GetTargetMetadata(const 
llvm::Module& module) {
   if (llvm::Metadata* tvm_target = module.getModuleFlag("tvm_target")) {
 auto* mdstr = llvm::cast(tvm_target);
 llvm::StringRef meta = mdstr->getString();
+#if TVM_LLVM_VERSION >= 180
+if (meta.starts_with("llvm")) {
+#else
 if (meta.startswith("llvm")) {
+#endif
   return meta.str();
 }
   }



(tvm) branch main updated: [TVMScript] Fix error reporting inside Macro func (#16967)

2024-05-05 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 9cfebca136 [TVMScript] Fix error reporting inside Macro func (#16967)
9cfebca136 is described below

commit 9cfebca136a6dd58e59deeb19690d37cc6e9426a
Author: Siyuan Feng 
AuthorDate: Sun May 5 21:51:53 2024 +0800

[TVMScript] Fix error reporting inside Macro func (#16967)
---
 python/tvm/script/parser/core/parser.py | 53 +++--
 1 file changed, 38 insertions(+), 15 deletions(-)

diff --git a/python/tvm/script/parser/core/parser.py 
b/python/tvm/script/parser/core/parser.py
index b41a05689d..0ecf669566 100644
--- a/python/tvm/script/parser/core/parser.py
+++ b/python/tvm/script/parser/core/parser.py
@@ -145,26 +145,27 @@ class ScriptMacro(abc.ABC):
 local_vars = param_binding.arguments
 parser = self._find_parser_def()
 
-if self.hygienic:
-saved_var_table = parser.var_table
-parser.var_table = VarTable()
+with parser.with_diag_source(self.source):
+if self.hygienic:
+saved_var_table = parser.var_table
+parser.var_table = VarTable()
 
-with parser.var_table.with_frame():
-for k, v in self.closure_vars.items():
-parser.var_table.add(k, v)
-for k, v in local_vars.items():
-parser.var_table.add(k, v)
+with parser.var_table.with_frame():
+for k, v in self.closure_vars.items():
+parser.var_table.add(k, v)
+for k, v in local_vars.items():
+parser.var_table.add(k, v)
 
-parse_result = self.parse_macro(parser)
+parse_result = self.parse_macro(parser)
 
-parser.var_table = saved_var_table
+parser.var_table = saved_var_table
 
-else:
-with parser.var_table.with_frame():
-for k, v in local_vars.items():
-parser.var_table.add(k, v)
+else:
+with parser.var_table.with_frame():
+for k, v in local_vars.items():
+parser.var_table.add(k, v)
 
-parse_result = self.parse_macro(parser)
+parse_result = self.parse_macro(parser)
 
 return parse_result
 
@@ -415,6 +416,28 @@ class Parser(doc.NodeVisitor):
 
 return _deferred(pop_token)
 
+def with_diag_source(self, source: Source):
+"""Add a new source as with statement.
+
+Parameters
+--
+source : Source
+The source for diagnostics.
+
+Returns
+---
+res : Any
+The context with new source.
+"""
+
+last_diag = self.diag
+self.diag = Diagnostics(source)
+
+def pop_source():
+self.diag = last_diag
+
+return _deferred(pop_source)
+
 def eval_expr(
 self,
 node: Union[doc.Expression, doc.expr],



(tvm) branch main updated: [SVE] Add get_active_lane_mask builtin (#16965)

2024-05-04 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 944d180fba [SVE] Add get_active_lane_mask builtin (#16965)
944d180fba is described below

commit 944d180fba18660f7846eccf4ef4931284a7d38b
Author: Luke Hutton 
AuthorDate: Sat May 4 14:23:52 2024 +0100

[SVE] Add get_active_lane_mask builtin (#16965)

Adds a `get_active_lane_mask` builtin and lowering to
`llvm.get.active.lane.mask` intrinsic. This will be used in subsequent
patches for expressing predicated buffer loads/stores in TIR. Further
information can be found in the 
[RFC](https://github.com/apache/tvm-rfcs/blob/main/rfcs/0104-scalable-vectors-in-tir.md#predication).

Co-authored-by: Elen Kalda 
Co-authored-by: Neil Hickey 

Change-Id: Id9d65f9f11503ad35dd0b3db4bfc81249a76f701
---
 include/tvm/tir/builtin.h   |  8 
 python/tvm/script/ir_builder/tir/ir.py  |  2 ++
 python/tvm/tir/__init__.py  |  2 +-
 python/tvm/tir/op.py| 21 +
 src/target/llvm/codegen_llvm.cc |  5 +
 src/tir/op/builtin.cc   |  7 +++
 tests/python/codegen/test_target_codegen_aarch64.py | 20 
 7 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index 10e5b462d1..5836eb8ea9 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -915,6 +915,14 @@ TVM_DLL const Op& anylist_setitem_call_cpacked();
  */
 TVM_DLL const Op& vscale();
 
+/*!
+ * \brief Calculate a predicate mask given an upper bound (limit) and a 
current value (base).
+ *
+ * It will be lowered to the llvm.get.active.lane.mask intrinsic.
+ * (https://llvm.org/docs/LangRef.html#llvm-get-active-lane-mask-intrinsics)
+ */
+TVM_DLL const Op& get_active_lane_mask();
+
 /*! \brief The kind of structure field info used in intrinsic */
 enum TVMStructFieldKind : int {
   // array head address
diff --git a/python/tvm/script/ir_builder/tir/ir.py 
b/python/tvm/script/ir_builder/tir/ir.py
index c04ac780c9..5a0a564a2a 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -1903,6 +1903,7 @@ mma_fill = _dtype_forward(_tir_op.mma_fill)
 vectorlow = _dtype_forward(_tir_op.vectorlow)
 vectorhigh = _dtype_forward(_tir_op.vectorhigh)
 vectorcombine = _dtype_forward(_tir_op.vectorcombine)
+get_active_lane_mask = _dtype_forward(_tir_op.get_active_lane_mask)
 
 
 broadcast = Broadcast
@@ -2219,4 +2220,5 @@ __all__ = [
 "CommReducer",
 "Range",
 "vscale",
+"get_active_lane_mask",
 ]
diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py
index 1723804388..24ba4ccd2e 100644
--- a/python/tvm/tir/__init__.py
+++ b/python/tvm/tir/__init__.py
@@ -88,7 +88,7 @@ from .op import comm_reducer, min, max, sum
 from .op import q_multiply_shift, q_multiply_shift_per_axis, shift_left, 
shift_right
 from .op import TVMBackendAllocWorkspace, TVMBackendFreeWorkspace
 from .op import start_profile_intrinsic, end_profile_intrinsic
-from .op import vscale
+from .op import vscale, get_active_lane_mask
 from .generic import add, subtract, multiply
 
 from .schedule import StmtSRef, BlockScope, ScheduleState, Schedule, 
ScheduleError
diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py
index 6b72e63f29..db52bec598 100644
--- a/python/tvm/tir/op.py
+++ b/python/tvm/tir/op.py
@@ -3349,6 +3349,27 @@ def vscale():
 return call_intrin("int32", "tir.vscale")
 
 
+def get_active_lane_mask(dtype, base, limit):
+"""
+Calculate a predicate mask given an upper bound (limit) and a current 
value (base).
+
+It will be lowered to the llvm.get.active.lane.mask intrinsic.
+(https://llvm.org/docs/LangRef.html#llvm-get-active-lane-mask-intrinsics)
+
+Parameters
+--
+dtype : str
+The data type of the result.
+
+base : PrimExpr
+An expression representing the base.
+
+limit : PrimExpr
+An expression representing the limit.
+"""
+return call_intrin(dtype, "tir.get_active_lane_mask", base, limit)
+
+
 # pylint: disable=unnecessary-lambda
 sum = comm_reducer(lambda x, y: x + y, lambda t: const(0, dtype=t), name="sum")
 min = comm_reducer(lambda x, y: _ffi_api._OpMin(x, y, None), max_value, 
name="min")  # type: ignore
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 95512a00a7..6566bb4291 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -1478,6 +1478,11 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const 
CallNode* op) {
 llvm
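A quick sketch of invoking the new builtin from Python; the mask dtype string is an assumption (lane masks are boolean vectors), and the actual lowering only happens on SVE-capable targets:

# Hedged sketch: the dtype string below is an assumption for illustration.
import tvm
from tvm import tir

base = tir.const(0, "int32")
limit = tir.const(13, "int32")
mask = tir.get_active_lane_mask("uint1x4", base, limit)  # builds a tir.Call to the new builtin
print(mask)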

(tvm) branch main updated (c0385c7523 -> b4a69de47b)

2024-04-29 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from c0385c7523 [Runtime] Allow offset to be specified in 
NDArray::CreateView (#16938)
 add b4a69de47b Enable gemv schedule for adreno (#16932)

No new revisions were added by this update.

Summary of changes:
 python/tvm/dlight/gpu/gemv.py  | 198 ++-
 python/tvm/dlight/gpu/matmul.py|   2 +-
 tests/python/dlight/test_gpu_gemv.py   | 450 +++--
 tests/python/dlight/test_gpu_matmul.py |  12 +-
 4 files changed, 577 insertions(+), 85 deletions(-)



(tvm) branch main updated: [CI] Update image tag to 20240428-060115-0b09ed018 (#16948)

2024-04-29 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new dd09c85f87 [CI] Update image tag to 20240428-060115-0b09ed018 (#16948)
dd09c85f87 is described below

commit dd09c85f8787662c00afb952cbcf8725edbdbfc0
Author: Yong Wu 
AuthorDate: Mon Apr 29 04:56:38 2024 -0700

[CI] Update image tag to 20240428-060115-0b09ed018 (#16948)

* [CI] Update image tag to 20240428-060115-0b09ed018

* Skip a flaky test

* Remove msg in pytest.skip

* format
---
 ci/jenkins/docker-images.ini | 20 ++--
 tests/micro/zephyr/test_zephyr.py|  2 +-
 .../metaschedule_e2e/test_resnet50_fp16.py   |  2 +-
 .../metaschedule_e2e/test_resnet50_int8.py   |  4 ++--
 .../contrib/test_hexagon/test_meta_schedule.py   | 10 +-
 .../test_hexagon/topi/slice_op/test_cast_slice.py|  4 ++--
 tests/python/relax/test_codegen_cudnn.py |  1 +
 7 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/ci/jenkins/docker-images.ini b/ci/jenkins/docker-images.ini
index 211ea02970..6e55160521 100644
--- a/ci/jenkins/docker-images.ini
+++ b/ci/jenkins/docker-images.ini
@@ -17,13 +17,13 @@
 
 # This data file is read during when Jenkins runs job to determine docker 
images.
 [jenkins]
-ci_arm: tlcpack/ci-arm:20240126-070121-8ade9c30e
-ci_cortexm: tlcpack/ci-cortexm:20240126-070121-8ade9c30e
-ci_cpu: tlcpack/ci_cpu:20240322-060059-89cd74c07
-ci_gpu: tlcpack/ci-gpu:20240126-070121-8ade9c30e
-ci_hexagon: tlcpack/ci-hexagon:20240126-070121-8ade9c30e
-ci_i386: tlcpack/ci-i386:20240126-070121-8ade9c30e
-ci_lint: tlcpack/ci-lint:20240126-070121-8ade9c30e
-ci_minimal: tlcpack/ci-minimal:20240126-070121-8ade9c30e
-ci_riscv: tlcpack/ci-riscv:20240126-070121-8ade9c30e
-ci_wasm: tlcpack/ci-wasm:20240126-070121-8ade9c30e
+ci_arm: tlcpack/ci-arm:20240428-060115-0b09ed018
+ci_cortexm: tlcpack/ci-cortexm:20240428-060115-0b09ed018
+ci_cpu: tlcpack/ci_cpu:20240428-060115-0b09ed018
+ci_gpu: tlcpack/ci-gpu:20240428-060115-0b09ed018
+ci_hexagon: tlcpack/ci-hexagon:20240428-060115-0b09ed018
+ci_i386: tlcpack/ci-i386:20240428-060115-0b09ed018
+ci_lint: tlcpack/ci-lint:20240428-060115-0b09ed018
+ci_minimal: tlcpack/ci-minimal:20240428-060115-0b09ed018
+ci_riscv: tlcpack/ci-riscv:20240428-060115-0b09ed018
+ci_wasm: tlcpack/ci-wasm:20240428-060115-0b09ed018
diff --git a/tests/micro/zephyr/test_zephyr.py 
b/tests/micro/zephyr/test_zephyr.py
index 72a0a85cf9..d247e2187b 100644
--- a/tests/micro/zephyr/test_zephyr.py
+++ b/tests/micro/zephyr/test_zephyr.py
@@ -650,7 +650,7 @@ def test_debugging_enabled(workspace_dir):
 def test_qemu_make_fail(workspace_dir, board, microtvm_debug, serial_number):
 """Testing QEMU make fail."""
 if not utils.ZEPHYR_BOARDS[board]["is_qemu"]:
-pytest.skip(msg="Only for QEMU targets.")
+pytest.skip("Only for QEMU targets.")
 
 build_config = {"debug": microtvm_debug}
 shape = (10,)
diff --git 
a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py 
b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py
index 117e9d4b6f..52892c60ad 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py
@@ -47,7 +47,7 @@ def test_resnet50(hexagon_launcher):
 model_params = "resnet50_fp16.params"
 
 if not os.path.exists(model_json):
-pytest.skip(msg="Run python export_models.py first.")
+pytest.skip("Run python export_models.py first.")
 
 with open(model_json, "r") as file:
 mod = tvm.ir.load_json(file.read())
diff --git 
a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py 
b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
index 111448ea57..84c796bee5 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
@@ -54,7 +54,7 @@ TARGET_HEXAGON = get_hexagon_target("v68")
 def load_model():
 """Load renset50 model."""
 if not os.path.exists(MODEL_JSON):
-pytest.skip(msg="Run python export_models.py first.")
+pytest.skip("Run python export_models.py first.")
 
 with open(MODEL_JSON, "r") as file:
 mod = tvm.ir.load_json(file.read())
@@ -172,7 +172,7 @@ def test_resnet50(hexagon_launcher):
 pytest.skip("Skipping test since it takes too long in CI.")
 
 if not os.path.exists(MODEL_JSON):
-pytest.skip(msg="Run python export_models.py f

(tvm) branch main updated: [3rdparty] Bump FlashInfer for sampling functions (#16935)

2024-04-27 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 0b09ed0185 [3rdparty] Bump FlashInfer for sampling functions (#16935)
0b09ed0185 is described below

commit 0b09ed0185eaa095664ef0ae095744d3aa9276c1
Author: Ruihang Lai 
AuthorDate: Sat Apr 27 15:15:36 2024 -0400

[3rdparty] Bump FlashInfer for sampling functions (#16935)

This PR bumps the 3rdparty FlashInfer revision to include the
efficient sampling function implementation on CUDA.
---
 3rdparty/flashinfer | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/flashinfer b/3rdparty/flashinfer
index 920672776a..f978e02565 16
--- a/3rdparty/flashinfer
+++ b/3rdparty/flashinfer
@@ -1 +1 @@
-Subproject commit 920672776a2bf2244acf7a2e0516f46be9e93b15
+Subproject commit f978e02565d7157d57803eb4153369e046fc4106



(tvm) branch main updated (3ff3daa26d -> 63e0a0ff82)

2024-04-27 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 3ff3daa26d [CI] Upgrade CUDA to 12.4 (#16939)
 add 63e0a0ff82 [Thrust] Increase static workspace size (#16937)

No new revisions were added by this update.

Summary of changes:
 python/tvm/relax/backend/dispatch_sort_scan.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)



(tvm) branch main updated: [CI] Upgrade CUDA to 12.4 (#16939)

2024-04-27 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 3ff3daa26d [CI] Upgrade CUDA to 12.4 (#16939)
3ff3daa26d is described below

commit 3ff3daa26dd8eb377cc146b28b6b639c31282bc8
Author: Yong Wu 
AuthorDate: Sat Apr 27 12:11:46 2024 -0700

[CI] Upgrade CUDA to 12.4 (#16939)
---
 docker/Dockerfile.ci_gpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index 03f34ebc70..acb0310a41 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -17,7 +17,7 @@
 
 # CI docker GPU env
 # tag: v0.60
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
+FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
 
 COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear
 



(tvm) branch main updated: [Relax][TIR] Introduce new `cumsum` op for gpu (#16934)

2024-04-26 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 278a6af085 [Relax][TIR] Introduce new `cumsum` op for gpu (#16934)
278a6af085 is described below

commit 278a6af085d1a149bc9ae4ff4a7ac4b33fc6b6bb
Author: Siyuan Feng 
AuthorDate: Fri Apr 26 23:15:38 2024 +0800

[Relax][TIR] Introduce new `cumsum` op for gpu (#16934)
---
 python/tvm/relax/backend/dispatch_sort_scan.py |  41 +
 python/tvm/relax/backend_tir/__init__.py   |   1 +
 python/tvm/relax/backend_tir/cumsum.py | 193 +
 .../relax/test_backend_dispatch_sort_scan.py   |  38 +++-
 4 files changed, 268 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relax/backend/dispatch_sort_scan.py 
b/python/tvm/relax/backend/dispatch_sort_scan.py
index eb82e49d9a..870e6138d7 100644
--- a/python/tvm/relax/backend/dispatch_sort_scan.py
+++ b/python/tvm/relax/backend/dispatch_sort_scan.py
@@ -154,7 +154,48 @@ class SortScanDispatcher(PyExprMutator):
 if call.op.name in ("relax.cumprod", "relax.cumsum"):
 tgt = self._get_target(call.struct_info)
 axis = int(call.attrs.axis) if call.attrs.axis is not None else 
call.attrs.axis
+shape = call.struct_info.shape
 kwargs = {}
+if (
+(axis == -1 or axis == len(shape) - 1)
+and is_gpu_target(tgt)
+and not can_use_thrust(tgt, "tvm.contrib.thrust.sum_scan")
+and call.op.name == "relax.cumsum"
+and call.attrs.exclusive == 0
+):
+from tvm.relax.backend_tir import (  # pylint: 
disable=import-outside-toplevel
+gpu_2d_continuous_cumsum,
+)
+
+dim = 1
+for i in range(len(shape) - 1):
+dim *= shape[i]
+in_dtype = call.args[0].struct_info.dtype
+out_dtype = call.attrs.dtype
+out_dtype = out_dtype or in_dtype
+cumsum_2d_shape = relax.ShapeExpr([dim, shape[-1]])
+reshape = relax.call_pure_packed(
+"vm.builtin.reshape",
+call.args[0],
+cumsum_2d_shape,
+sinfo_args=relax.TensorStructInfo(cumsum_2d_shape, 
out_dtype),
+)
+gv = self.builder_.add_func(
+gpu_2d_continuous_cumsum(in_dtype=in_dtype, 
out_dtype=out_dtype),
+"gpu_2d_continuous_cumsum",
+)
+cumsum = relax.call_tir(
+gv,
+reshape,
+out_sinfo=relax.TensorStructInfo(cumsum_2d_shape, 
out_dtype),
+)
+return relax.call_pure_packed(
+"vm.builtin.reshape",
+cumsum,
+shape,
+sinfo_args=call.struct_info,
+)
+
 with tgt:
 if call.op.name == "relax.cumsum":
 te_func = topi.cuda.cumsum if is_gpu_target(tgt) else 
topi.cumsum
diff --git a/python/tvm/relax/backend_tir/__init__.py 
b/python/tvm/relax/backend_tir/__init__.py
index eeb8fe438f..10def47b8d 100644
--- a/python/tvm/relax/backend_tir/__init__.py
+++ b/python/tvm/relax/backend_tir/__init__.py
@@ -18,3 +18,4 @@
 
 from . import contrib
 from .pattern import get_tir_pattern
+from .cumsum import gpu_2d_continuous_cumsum
diff --git a/python/tvm/relax/backend_tir/cumsum.py 
b/python/tvm/relax/backend_tir/cumsum.py
new file mode 100644
index 00..ade961ecf1
--- /dev/null
+++ b/python/tvm/relax/backend_tir/cumsum.py
@@ -0,0 +1,193 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, too-many-nested-blocks
+"""Backend kernels for cumsum operator."""
+
+import math
+from typing import Optional
+
+from tvm.script import tir as T
+from tvm.tir 
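The dispatch path above relies on a simple equivalence: a cumsum over the last axis is unchanged by collapsing the leading dimensions into one. A NumPy sketch of that equivalence:

# NumPy sketch of the reshape-based cumsum the dispatcher emits.
import numpy as np

x = np.random.rand(2, 3, 8).astype("float32")
flat = x.reshape(-1, x.shape[-1])                 # collapse leading dims -> (6, 8)
out = np.cumsum(flat, axis=-1).reshape(x.shape)   # 2D kernel, then reshape back
np.testing.assert_allclose(out, np.cumsum(x, axis=-1), rtol=1e-6)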

(tvm) branch main updated: [Fix][Dlight] Fix GeneralReduction for log-sum-exp (#16923)

2024-04-25 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 51cfb70f86 [Fix][Dlight] Fix GeneralReduction for log-sum-exp (#16923)
51cfb70f86 is described below

commit 51cfb70f868c057d0d73aa60bc96b99ce722ecd2
Author: Ruihang Lai 
AuthorDate: Thu Apr 25 20:31:46 2024 -0400

[Fix][Dlight] Fix GeneralReduction for log-sum-exp (#16923)

This PR fixes the GeneralReduction dlight rule so that it can support
scheduling the log-sum-exp function.

Prior to this change, the rule made a strong assumption about the pattern
of the given function, which allowed scheduling softmax but failed to
schedule log-sum-exp due to a pattern mismatch. This PR enhances the rule
so that it matches the log-sum-exp pattern and applies the subsequent
scheduling.

A regression test is added.
---
 python/tvm/dlight/gpu/general_reduction.py|  35 +++--
 tests/python/dlight/test_gpu_general_reduction.py | 149 ++
 2 files changed, 176 insertions(+), 8 deletions(-)

diff --git a/python/tvm/dlight/gpu/general_reduction.py 
b/python/tvm/dlight/gpu/general_reduction.py
index 28b68a8b62..ef6bb1db91 100644
--- a/python/tvm/dlight/gpu/general_reduction.py
+++ b/python/tvm/dlight/gpu/general_reduction.py
@@ -18,7 +18,7 @@
 """Reduction rule for operators including softmax, layer norm, RMS norm, etc"""
 from typing import List, Union
 
-from tvm import tir
+from tvm import arith, tir
 from tvm.target import Target
 
 from ..base import normalize_prim_func, try_inline_contiguous_spatial
@@ -57,13 +57,32 @@ class GeneralReduction(GPUScheduleRule):
 # Align the number of block iters of the last block.
 num_last_block_iter = len(block_infos[-1].dom_kind())
 if num_last_block_iter < len(dom_kind):
-index_map = tir.IndexMap.from_func(
-lambda *iters: (
-[tir.const(0, iters[0].dtype)] * (len(dom_kind) - 
num_last_block_iter)
-+ list(iters)
-),
-ndim=num_last_block_iter,
-)
+
+def f_layout_mapping(*iters):
+analyzer = arith.Analyzer()
+# Try to match the iters of last block to the iters of the 
first block.
+# For matched positions, use the iter from the input `iters`.
+# For unmatched positions, use a new iter which is constant 0.
+num_matched = 0
+target_layout_iters = []
+for block_iter in block_infos[0].iters:
+if num_matched < len(iters) and analyzer.can_prove_equal(
+block_iter.dom, block_infos[-1].iters[num_matched].dom
+):
+target_layout_iters.append(iters[num_matched])
+num_matched += 1
+else:
+target_layout_iters.append(tir.const(0, 
iters[0].dtype))
+
+# If all the iters of the last block can match, return the new 
layout.
+if num_matched == len(iters):
+return target_layout_iters
+# Otherwise, fallback to appending zeros in the beginning.
+return [tir.const(0, iters[0].dtype)] * (
+len(dom_kind) - num_last_block_iter
+) + list(iters)
+
+index_map = tir.IndexMap.from_func(f_layout_mapping, 
ndim=num_last_block_iter)
 sch.transform_block_layout(block_infos[-1].block_rv, index_map)
 
 try:
diff --git a/tests/python/dlight/test_gpu_general_reduction.py 
b/tests/python/dlight/test_gpu_general_reduction.py
index 44c9a4a126..e1a9a8e018 100644
--- a/tests/python/dlight/test_gpu_general_reduction.py
+++ b/tests/python/dlight/test_gpu_general_reduction.py
@@ -453,5 +453,154 @@ def test_group_norm():
 _check(Before, After)
 
 
+def test_logsumexp():
+@I.ir_module
+class Before:
+@T.prim_func
+def compute_lse(var_A: T.handle, var_blocked_lse: T.handle):
+T.func_attr({"tir.noalias": T.bool(True)})
+batch_size = T.int64(is_size_var=True)
+vocab_size = T.int64(is_size_var=True)
+num_chunks = T.int64(is_size_var=True)
+A = T.match_buffer(var_A, (batch_size, vocab_size), 
dtype="float32")
+blocked_lse = T.match_buffer(var_blocked_lse, (batch_size, 
num_chunks), dtype="float32")
+A_pad = T.alloc_buffer((batch_size, num_chunks, T.int64(4096)), 
dtype="float32")
+temp_max = T.alloc_buffer((batch_size, num_chunks), 
dtype="float32")
+temp_sum = T.alloc_buffer((batch_size, num_chunks), 
dtype="float32")
+
+for

(tvm) branch main updated: [Fix] Fix SSA conversion for SizeVar retention (#16924)

2024-04-25 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 39f2482580 [Fix] Fix SSA conversion for SizeVar retention (#16924)
39f2482580 is described below

commit 39f2482580b57fa5b1f6c1a1dc0e6f5e823ee4c0
Author: Ruihang Lai 
AuthorDate: Thu Apr 25 08:11:46 2024 -0400

[Fix] Fix SSA conversion for SizeVar retention (#16924)

This PR fixes the var construction in IRConvertSSA, which previously always
demoted SizeVar to Var. That behavior prevents expressions from being
simplified in the later LowerIntrin pass: without SizeVar, LowerIntrin loses
the non-negativity information of the variable and cannot simplify a number
of FloorDiv/FloorMod expressions.

One regression test for SplitHostDevice is added to ensure the retention
of SizeVar. The test lives under SplitHostDevice because that is where
the SSA conversion is used.
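
As a rough illustration of why the distinction matters (a sketch, not code from this commit): the arithmetic analyzer may assume a SizeVar is non-negative, while a plain Var carries no such guarantee.

    import tvm
    from tvm import tir

    n = tir.SizeVar("n", "int64")  # documented as non-negative
    m = tir.Var("m", "int64")      # sign unknown

    ana = tvm.arith.Analyzer()
    print(ana.can_prove(n >= 0))   # expected True: SizeVar is assumed >= 0
    print(ana.can_prove(m >= 0))   # expected False: nothing is known about m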
---
 src/tir/transforms/ir_utils.cc | 13 +--
 .../test_tir_transform_split_host_device.py| 25 --
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/src/tir/transforms/ir_utils.cc b/src/tir/transforms/ir_utils.cc
index 584b3cbf58..c52027acba 100644
--- a/src/tir/transforms/ir_utils.cc
+++ b/src/tir/transforms/ir_utils.cc
@@ -435,10 +435,19 @@ class IRConvertSSA final : public StmtExprMutator {
  private:
   struct ScopedRedefine {
 ScopedRedefine(IRConvertSSA* parent, Var old_var) : parent(parent), 
old_var(old_var) {
+  bool is_size_var = old_var->IsInstance<SizeVarNode>();
   if (old_var->type_annotation.defined()) {
-new_var = Var(old_var->name_hint, old_var->type_annotation);
+if (is_size_var) {
+  new_var = SizeVar(old_var->name_hint, old_var->type_annotation);
+} else {
+  new_var = Var(old_var->name_hint, old_var->type_annotation);
+}
   } else {
-new_var = Var(old_var->name_hint, old_var->dtype);
+if (is_size_var) {
+  new_var = SizeVar(old_var->name_hint, old_var->dtype);
+} else {
+  new_var = Var(old_var->name_hint, old_var->dtype);
+}
   }
   parent->scope_[old_var.get()].push_back(new_var);
 }
diff --git a/tests/python/tir-transform/test_tir_transform_split_host_device.py 
b/tests/python/tir-transform/test_tir_transform_split_host_device.py
index 6adfbeb81d..2d0d8a68d8 100644
--- a/tests/python/tir-transform/test_tir_transform_split_host_device.py
+++ b/tests/python/tir-transform/test_tir_transform_split_host_device.py
@@ -15,9 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
-from tvm import te
 import tvm.testing
-from tvm.script import tir as T, ir as I
+from tvm import te
+from tvm.script import ir as I
+from tvm.script import tir as T
 
 
 @tvm.testing.requires_cuda
@@ -345,5 +346,25 @@ def test_dynamic_launch_thread():
 tvm.ir.assert_structural_equal(expected, after)
 
 
+def test_size_var():
+@I.ir_module
+class Module:
+@T.prim_func
+def main(var_A: T.handle, var_B: T.handle):
+T.func_attr({"target": T.target("cuda")})
+m = T.int64(is_size_var=True)
+A = T.match_buffer(var_A, (m,))
+B = T.match_buffer(var_B, (m,))
+T.attr(T.target("cuda"), "target", 0)
+blockIdx_x = T.launch_thread("blockIdx.x", m)
+B_1 = T.Buffer((m,), data=B.data)
+A_1 = T.Buffer((m,), data=A.data)
+B_1[blockIdx_x] = A_1[blockIdx_x]
+
+after = tvm.tir.transform.SplitHostDevice()(Module)
+assert len(after["main_kernel"].params) == 3
+assert isinstance(after["main_kernel"].params[2], tvm.tir.SizeVar)
+
+
 if __name__ == "__main__":
 tvm.testing.main()



(tvm) branch main updated: [TVMScript] Support `T.launch_thread` with i64 dtype (#16916)

2024-04-24 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 4f8c03fad3 [TVMScript] Support `T.launch_thread` with i64 dtype 
(#16916)
4f8c03fad3 is described below

commit 4f8c03fad393c360008f1fb208f117c66c04090c
Author: Siyuan Feng 
AuthorDate: Wed Apr 24 20:44:46 2024 +0800

[TVMScript] Support `T.launch_thread` with i64 dtype (#16916)

This PR fixes a dtype-mismatch bug in `T.launch_thread` when the extent
dtype is `i64`.
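
A small TVMScript sketch of the now-supported case (illustrative only; the buffer and variable names are made up):

    from tvm.script import tir as T

    @T.prim_func
    def copy(var_A: T.handle, var_B: T.handle):
        n = T.int64()                          # i64 shape variable used as the extent
        A = T.match_buffer(var_A, (n,), "float32")
        B = T.match_buffer(var_B, (n,), "float32")
        bx = T.launch_thread("blockIdx.x", n)  # thread var dtype now follows the extent
        B[bx] = A[bx]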
---
 include/tvm/script/ir_builder/tir/ir.h|  3 ++-
 python/tvm/script/ir_builder/tir/ir.py|  7 +--
 src/script/ir_builder/tir/ir.cc   | 10 +-
 .../test_tir_transform_inject_ptx_async_copy.py   |  4 ++--
 tests/python/tvmscript/test_tvmscript_parser_tir.py   | 15 +++
 5 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/include/tvm/script/ir_builder/tir/ir.h 
b/include/tvm/script/ir_builder/tir/ir.h
index c4ba44f673..5b44f79ad7 100644
--- a/include/tvm/script/ir_builder/tir/ir.h
+++ b/include/tvm/script/ir_builder/tir/ir.h
@@ -401,9 +401,10 @@ LaunchThreadFrame LaunchThread(String thread_tag, PrimExpr 
extent);
 /*!
  * \brief Bind a var to thread env.
  * \param thread_tag The thread type tag.
+ * \param dtype The data type of the variable.
  * \return The result variable which gets bound to the thread env.
  */
-Var EnvThread(String thread_tag);
+Var EnvThread(String thread_tag, DataType dtype = DataType::Int(32));
 
 /*!
  * \brief Store data in a buffer.
diff --git a/python/tvm/script/ir_builder/tir/ir.py 
b/python/tvm/script/ir_builder/tir/ir.py
index 127d2a4356..c04ac780c9 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -1241,7 +1241,7 @@ def launch_thread(
 return _ffi_api.LaunchThread(thread, extent)  # type: ignore[attr-defined] 
# pylint: disable=no-member
 
 
-def env_thread(thread_tag: str) -> IterVar:
+def env_thread(thread_tag: str, dtype: str = "int32") -> IterVar:
 """Bind a var to thread env
 
 Parameters
@@ -1249,13 +1249,16 @@ def env_thread(thread_tag: str) -> IterVar:
 thread_tag : str
 The thread type tag.
 
+dtype : str
+The data type of the thread env.
+
 Returns
 ---
 res : IterVar
 The result iteration variable gets bound to the thread env.
 
 """
-return _ffi_api.EnvThread(thread_tag)  # type: ignore[attr-defined] # 
pylint: disable=no-member
+return _ffi_api.EnvThread(thread_tag, dtype)  # type: ignore[attr-defined] 
# pylint: disable=no-member
 
 
 def buffer_store(
diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc
index ccb5a8b57b..3ce5c15e6c 100644
--- a/src/script/ir_builder/tir/ir.cc
+++ b/src/script/ir_builder/tir/ir.cc
@@ -432,7 +432,8 @@ LaunchThreadFrame LaunchThread(Var var, PrimExpr extent) {
   }
   ObjectPtr n = make_object();
   if (!iter_var->dom.defined()) {
-const_cast(iter_var.get())->dom = Range(0, extent);
+const_cast(iter_var.get())->dom =
+Range(tvm::tir::make_zero(extent.dtype()), extent);
   } else if (!arith::Analyzer().CanProveEqual(iter_var->dom->extent, extent)) {
 LOG(FATAL) << "ValueError: Inconsistent extents of environment thread. "
<< iter_var->dom->extent << " vs " << extent;
@@ -444,7 +445,7 @@ LaunchThreadFrame LaunchThread(Var var, PrimExpr extent) {
 }
 
 LaunchThreadFrame LaunchThread(String thread_tag, PrimExpr extent) {
-  return LaunchThread(EnvThread(thread_tag), extent);
+  return LaunchThread(EnvThread(thread_tag, extent.dtype()), extent);
 }
 
 RealizeFrame Realize(tvm::tir::BufferRegion buffer_slice, String storage_scope,
@@ -512,9 +513,8 @@ ElseFrame Else() {
   return ElseFrame(n);
 }
 
-Var EnvThread(String thread_tag) {
-  IterVar iter_var(Range{nullptr}, Var("", DataType::Int(32)), 
tvm::tir::IterVarType::kThreadIndex,
-   thread_tag);
+Var EnvThread(String thread_tag, DataType dtype) {
+  IterVar iter_var(Range{nullptr}, Var("", dtype), 
tvm::tir::IterVarType::kThreadIndex, thread_tag);
   Var var = iter_var->var;
   if (Optional opt_frame = 
IRBuilder::Current()->FindFrame()) {
 opt_frame.value()->env_threads.Set(var, iter_var);
diff --git 
a/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py 
b/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py
index 4c94dc04cc..c160e4a31d 100644
--- a/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py
+++ b/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py
@@ -969,9 +969,9 @@ class 
TestMultiplicationNodesAreInligned(tvm.testing.Co

(tvm) branch main updated: [Misc] Enhance Release Note Script and Remove Useless File (#16913)

2024-04-21 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 6b77cbabe8 [Misc] Enhance Release Note Script and Remove Useless File 
(#16913)
6b77cbabe8 is described below

commit 6b77cbabe847c4653f9354e587127519cb43e3b1
Author: ysh329 
AuthorDate: Mon Apr 22 04:46:48 2024 +0800

[Misc] Enhance Release Note Script and Remove Useless File (#16913)
---
 tests/scripts/release/PRERELEASE_NOTES.md | 24 
 tests/scripts/release/make_notes.py   |  4 
 2 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/tests/scripts/release/PRERELEASE_NOTES.md 
b/tests/scripts/release/PRERELEASE_NOTES.md
deleted file mode 100644
index 933d8d2720..00
--- a/tests/scripts/release/PRERELEASE_NOTES.md
+++ /dev/null
@@ -1,24 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Notable changes since last release
---
-
-* PR12509:
-   - Changed `TargetKind::device_type` to `TargetKind::default_device_type`.
-   - Introduced "target_default_device" attribute that overrides the default 
device.
-   - Added `Target::GetTargetDeviceType` to return the effective device type 
for the target.
diff --git a/tests/scripts/release/make_notes.py 
b/tests/scripts/release/make_notes.py
index 09994f8652..2835a7241f 100644
--- a/tests/scripts/release/make_notes.py
+++ b/tests/scripts/release/make_notes.py
@@ -93,6 +93,10 @@ TAG_DICT = {
 "quantization": "Relay",
 "relax": "Relax",
 "unity": "Relax",
+"transform": "Relax",
+"kvcache": "Relax",
+"dlight": "Dlight",
+"disco": "Disco",
 "tvmscript": "TVMScript",
 "tvmscripts": "TVMScript",
 "tvmc": "TVMC",



(tvm) branch main updated (6afbc12e27 -> 2978427c2a)

2024-04-19 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 6afbc12e27 [Bugfix][Relax] Raise exception for OOM allocation (#16905)
 add 2978427c2a [Relax] Prevent to generate duplicate func in 
dispatch_sort_scan (#16904)

No new revisions were added by this update.

Summary of changes:
 python/tvm/relax/backend/dispatch_sort_scan.py | 57 +-
 .../relax/test_backend_dispatch_sort_scan.py   | 38 +++
 2 files changed, 71 insertions(+), 24 deletions(-)



(tvm) branch main updated: [Upd] Fixed lld search in rocm (#16907)

2024-04-19 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 36efa36f53 [Upd] Fixed lld search in rocm (#16907)
36efa36f53 is described below

commit 36efa36f53f4ad9f302ece4208e5b8296c86c8bb
Author: Shrey Gupta <51860471+shreygupta2...@users.noreply.github.com>
AuthorDate: Fri Apr 19 04:41:05 2024 -0400

[Upd] Fixed lld search in rocm (#16907)

fixed lld search
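
A hedged usage sketch (assuming the existing tvm.contrib.rocm helper; the change only widens the search paths):

    from tvm.contrib import rocm

    # Returns the list of resolved ld.lld paths; /opt/rocm/llvm/bin is now probed
    # for every candidate name rather than being treated as a candidate itself.
    candidates = rocm.find_lld(required=False)
    print(candidates)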
---
 python/tvm/contrib/rocm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py
index 0ef2e7d06a..119a2c588c 100644
--- a/python/tvm/contrib/rocm.py
+++ b/python/tvm/contrib/rocm.py
@@ -52,7 +52,8 @@ def find_lld(required=True):
 if major is not None:
 lld_list += [f"ld.lld-{major}.0"]
 lld_list += [f"ld.lld-{major}"]
-lld_list += ["ld.lld", "/opt/rocm/llvm/bin"]
+lld_list += ["ld.lld"]
+lld_list += [f"/opt/rocm/llvm/bin/{x}" for x in lld_list]
 valid_list = [utils.which(x) for x in lld_list]
 valid_list = [x for x in valid_list if x]
 if not valid_list and required:



(tvm) branch main updated (7dc0472aef -> 59376eeca3)

2024-04-18 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 7dc0472aef [Bugfix] CudaDeviceAPI::GetAttr may check kExist when GPUs 
absent (#16903)
 add 59376eeca3 [Relax] Allow specifying entry_funcs for BYOC (#16902)

No new revisions were added by this update.

Summary of changes:
 include/tvm/relax/transform.h   |  5 ++-
 python/tvm/relax/transform/transform.py |  5 +++
 src/relax/transform/fuse_ops.cc | 69 ++---
 src/relax/transform/utils.h |  3 +-
 4 files changed, 57 insertions(+), 25 deletions(-)



(tvm) branch main updated (de91c5ca94 -> 7dc0472aef)

2024-04-18 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from de91c5ca94 [Bugfix] rocm shared memory issue on MI250 (#16901)
 add 7dc0472aef [Bugfix] CudaDeviceAPI::GetAttr may check kExist when GPUs 
absent (#16903)

No new revisions were added by this update.

Summary of changes:
 src/runtime/cuda/cuda_device_api.cc|  7 +--
 .../test_runtime_device_api.py}| 54 +++---
 2 files changed, 32 insertions(+), 29 deletions(-)
 copy tests/python/{relax/test_json_compact.py => 
runtime/test_runtime_device_api.py} (56%)



(tvm) branch main updated (da56c89f32 -> de91c5ca94)

2024-04-17 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from da56c89f32 [Dlight] Enhance vectorization for gpu matmul (#16894)
 add de91c5ca94 [Bugfix] rocm shared memory issue on MI250 (#16901)

No new revisions were added by this update.

Summary of changes:
 python/tvm/dlight/gpu/gemv.py | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)



(tvm) branch main updated (b3ffd97569 -> da56c89f32)

2024-04-17 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from b3ffd97569 [BYOC] Add layout check and update shape check for cublas 
FP8 BYOC (#16895)
 add da56c89f32 [Dlight] Enhance vectorization for gpu matmul (#16894)

No new revisions were added by this update.

Summary of changes:
 python/tvm/dlight/gpu/matmul.py  |  7 +-
 tests/python/dlight/test_gpu_matmul.py   | 81 
 tests/python/dlight/test_gpu_matmul_tensorize.py | 18 +++---
 3 files changed, 54 insertions(+), 52 deletions(-)



(tvm) branch main updated (857fe614ab -> b3ffd97569)

2024-04-17 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 857fe614ab [Target] Don't register AArch64 target tags without LLVM 
compiler support (#16897)
 add b3ffd97569 [BYOC] Add layout check and update shape check for cublas 
FP8 BYOC (#16895)

No new revisions were added by this update.

Summary of changes:
 python/tvm/relax/backend/contrib/cublas.py | 28 
 tests/python/relax/test_codegen_cublas.py  | 20 
 2 files changed, 36 insertions(+), 12 deletions(-)



(tvm) branch main updated (d030ce27a1 -> 857fe614ab)

2024-04-17 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from d030ce27a1 [TVMScript] Optionally use `ruff format` instead of `black` 
(#16876)
 add 857fe614ab [Target] Don't register AArch64 target tags without LLVM 
compiler support (#16897)

No new revisions were added by this update.

Summary of changes:
 cmake/modules/LLVM.cmake   |  1 +
 cmake/utils/FindLLVM.cmake | 18 ++
 src/target/parsers/aprofile.cc |  7 ---
 src/target/tag.cc  |  6 +-
 4 files changed, 28 insertions(+), 4 deletions(-)



(tvm) branch main updated: [Contrib] Enable fp16 for thrust sort (#16887)

2024-04-16 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new cdfdd0e4ec [Contrib] Enable fp16 for thrust sort (#16887)
cdfdd0e4ec is described below

commit cdfdd0e4ec7452bedf4e79ba0ff474d2de70bbbf
Author: Siyuan Feng 
AuthorDate: Tue Apr 16 20:13:21 2024 +0800

[Contrib] Enable fp16 for thrust sort (#16887)

[Contrib] Enable fp16 for thrust

Enable fp16 for thrust to support LLM cases
---
 src/runtime/contrib/thrust/thrust.cu | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/runtime/contrib/thrust/thrust.cu 
b/src/runtime/contrib/thrust/thrust.cu
index 28edba64aa..048df518e3 100644
--- a/src/runtime/contrib/thrust/thrust.cu
+++ b/src/runtime/contrib/thrust/thrust.cu
@@ -167,7 +167,19 @@ void thrust_sort(DLTensor* input, DLTensor* out_values, 
DLTensor* out_indices, b
 void thrust_sort_common(DLTensor* input, DLTensor* values_out, DLTensor* 
indices_out,
 bool is_ascend, int sort_len, std::string data_dtype, 
std::string out_dtype,
 DLTensor* workspace) {
-  if (data_dtype == "float32") {
+  if (data_dtype == "float16") {
+    if (out_dtype == "int32") {
+      thrust_sort<half, int32_t>(input, values_out, indices_out, is_ascend, sort_len, workspace);
+    } else if (out_dtype == "int64") {
+      thrust_sort<half, int64_t>(input, values_out, indices_out, is_ascend, sort_len, workspace);
+    } else if (out_dtype == "float32") {
+      thrust_sort<half, float>(input, values_out, indices_out, is_ascend, sort_len, workspace);
+    } else if (out_dtype == "float64") {
+      thrust_sort<half, double>(input, values_out, indices_out, is_ascend, sort_len, workspace);
+    } else {
+      LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
+    }
+  } else if (data_dtype == "float32") {
 if (out_dtype == "int32") {
   thrust_sort(input, values_out, indices_out, is_ascend, 
sort_len, workspace);
 } else if (out_dtype == "int64") {



(tvm) branch main updated (e738f1d4f1 -> 95d6778908)

2024-04-16 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from e738f1d4f1 [Relax][Frontend] Fix sort, argsort and topk in nn module 
(#16886)
 add 95d6778908 [dlight] Add check for matmul dtype and fix reduction rule 
(#16884)

No new revisions were added by this update.

Summary of changes:
 python/tvm/dlight/gpu/matmul.py|  3 ++-
 python/tvm/dlight/gpu/reduction.py | 16 
 2 files changed, 10 insertions(+), 9 deletions(-)



(tvm) branch main updated (cdfdd0e4ec -> e738f1d4f1)

2024-04-16 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from cdfdd0e4ec [Contrib] Enable fp16 for thrust sort (#16887)
 add e738f1d4f1 [Relax][Frontend] Fix sort, argsort and topk in nn module 
(#16886)

No new revisions were added by this update.

Summary of changes:
 python/tvm/relax/frontend/nn/op.py|  6 +++---
 tests/python/relax/test_frontend_nn_op.py | 29 +
 2 files changed, 32 insertions(+), 3 deletions(-)



(tvm) branch main updated (a64d1f1cc3 -> f267691fa4)

2024-04-15 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from a64d1f1cc3 [TIR] Make T.reinterpret nop when dtype is the same (#16879)
 add f267691fa4 [Relax] Stabilize relax pass mutation order (#16883)

No new revisions were added by this update.

Summary of changes:
 include/tvm/ir/module.h  |  3 ++-
 python/tvm/relax/frontend/nn/core.py |  6 +++---
 src/ir/module.cc |  4 
 src/relax/transform/alter_op_impl.cc |  3 ++-
 src/relax/transform/dead_code_elimination.cc |  3 ++-
 src/relax/transform/fuse_ops.cc  | 22 --
 src/relax/transform/fuse_tir.cc  |  3 ++-
 src/relax/transform/legalize_ops.cc  |  8 +---
 8 files changed, 32 insertions(+), 20 deletions(-)



(tvm) branch main updated: [TIR] Make T.reinterpret nop when dtype is the same (#16879)

2024-04-14 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new a64d1f1cc3 [TIR] Make T.reinterpret nop when dtype is the same (#16879)
a64d1f1cc3 is described below

commit a64d1f1cc37da7f202d943c2bea7eb747e624599
Author: Wuwei Lin 
AuthorDate: Sun Apr 14 08:21:30 2024 -0700

[TIR] Make T.reinterpret nop when dtype is the same (#16879)

* [TIR] Make T.reinterpret nop when dtype is the same

* fix scalable vec handling
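
A quick sketch of the resulting behavior (variable names are illustrative):

    from tvm import tir

    x = tir.Var("x", "float32")
    assert tir.reinterpret("float32", x).same_as(x)           # same dtype: returns x unchanged
    assert isinstance(tir.reinterpret("int32", x), tir.Call)  # different dtype: still a bitcast call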
---
 python/tvm/tir/op.py   |  4 ++--
 src/tir/op/op.cc   |  8 ++--
 tests/python/codegen/test_target_codegen_cuda.py   |  2 +-
 .../python/tvmscript/test_tvmscript_parser_tir.py  | 22 ++
 4 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py
index 8816880e7b..6b72e63f29 100644
--- a/python/tvm/tir/op.py
+++ b/python/tvm/tir/op.py
@@ -1789,7 +1789,7 @@ def infinity(dtype: str, span: Optional[Span] = None) -> 
Any:
 return _ffi_api.infinity(dtype, span)  # type: ignore
 
 
-def reinterpret(dtype, value) -> Any:
+def reinterpret(dtype, value, span: Optional[Span] = None) -> Any:
 """infinity value of dtype
 
 Parameters
@@ -1808,7 +1808,7 @@ def reinterpret(dtype, value) -> Any:
 value : tvm.Expr
 The reinterpret cast value of dtype.
 """
-return call_intrin(dtype, "tir.reinterpret", value)
+return _ffi_api.reinterpret(dtype, value, span)  # type: ignore
 
 
 def exp(x):
diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc
index 7f47e66062..b613639786 100644
--- a/src/tir/op/op.cc
+++ b/src/tir/op/op.cc
@@ -409,8 +409,10 @@ PrimExpr cast(const DataType& t, PrimExpr value, Span 
span) {
 // reinterpret
 PrimExpr reinterpret(const DataType& t, PrimExpr value, Span span) {
   if (value.dtype() == t) return value;
-  ICHECK(value.dtype().bits() * value.dtype().lanes() == t.bits() * t.lanes())
-  << "Bitcast requires size match " << t << " vs " << value.dtype();
+  if (!t.is_scalable_vector() && !value.dtype().is_scalable_vector()) {
+ICHECK(value.dtype().bits() * value.dtype().lanes() == t.bits() * 
t.lanes())
+<< "Bitcast requires size match " << t << " vs " << value.dtype();
+  }
   return tir::Call(t, tir::builtin::reinterpret(), {value}, span);
 }
 
@@ -1083,6 +1085,8 @@ 
TVM_REGISTER_GLOBAL("tir.trunc").set_body_typed(tvm::trunc);
 
 TVM_REGISTER_GLOBAL("tir._cast").set_body_typed(tvm::cast);
 
+TVM_REGISTER_GLOBAL("tir.reinterpret").set_body_typed(tvm::reinterpret);
+
 // operator overloading, smarter than make
 #define REGISTER_MAKE_BINARY_OP(Node, Func)
\
   TVM_REGISTER_GLOBAL("tir." #Node).set_body_typed([](PrimExpr a, PrimExpr b, 
Span span) { \
diff --git a/tests/python/codegen/test_target_codegen_cuda.py 
b/tests/python/codegen/test_target_codegen_cuda.py
index 23ba0fc3ce..112c521d06 100644
--- a/tests/python/codegen/test_target_codegen_cuda.py
+++ b/tests/python/codegen/test_target_codegen_cuda.py
@@ -1120,7 +1120,7 @@ def test_invalid_reinterpret():
 @T.prim_func
 def func(A: T.Buffer((4,), "uint32"), B: T.Buffer((4,), "uint8")) -> None:
 for tx in T.thread_binding(4, "threadIdx.x"):
-B[tx] = T.reinterpret("uint8", A[tx])
+B[tx] = T.call_intrin("uint8", "tir.reinterpret", A[tx])
 
 with pytest.raises(tvm.error.TVMError):
 tvm.build(func, target="cuda")
diff --git a/tests/python/tvmscript/test_tvmscript_parser_tir.py 
b/tests/python/tvmscript/test_tvmscript_parser_tir.py
index 465ffa5cb6..530746a6fc 100644
--- a/tests/python/tvmscript/test_tvmscript_parser_tir.py
+++ b/tests/python/tvmscript/test_tvmscript_parser_tir.py
@@ -449,5 +449,27 @@ def test_inferred_sinfo_with_dynamic_buffer():
 tvm.ir.assert_structural_equal(func.struct_info, expected)
 
 
+def test_reinterpret_nop():
+"""Test builtin reinterpret op"""
+
+@T.prim_func
+def func(A: T.Buffer((32,), "float32"), B: T.Buffer((32,), "float32")) -> 
None:
+T.func_attr({"global_symbol": "main"})
+for i in T.serial(0, 32):
+with T.block():
+vi = T.axis.remap("S", [i])
+B[vi] = T.reinterpret("float32", A[vi])
+
+@T.prim_func
+def expected(A: T.Buffer((32,), "float32"), B: T.Buffer((32,), "float32")) 
-> None:
+T.func_attr({"global_symbol": "main"})
+for i in T.serial(0, 32):
+with T.block():
+vi = T.axis.remap("S", [i])
+B[vi] = A[vi]
+
+tvm.ir.assert_structural_equal(func, expected)
+
+
 if __name__ == "__main__":
 tvm.testing.main()



(tvm) branch main updated: [Runtime] Implemented Datatype.itemsize() (#16880)

2024-04-13 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 64911ab5da [Runtime] Implemented Datatype.itemsize() (#16880)
64911ab5da is described below

commit 64911ab5da3640be4d9fb675513e57b742e188b1
Author: Wuwei Lin 
AuthorDate: Sat Apr 13 18:33:12 2024 -0700

[Runtime] Implemented Datatype.itemsize() (#16880)

* [Runtime] Implemented Datatype.itemsize()
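
A quick illustration of the new helper, mirroring the added unit test:

    from tvm import DataType

    assert DataType("float32").itemsize() == 4
    assert DataType("float32x4").itemsize() == 16  # size of the whole 4-lane vector
    assert DataType("uint8").itemsize() == 1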
---
 python/tvm/_ffi/runtime_ctypes.py   | 14 
 python/tvm/dlight/gpu/gemv.py   |  2 +-
 python/tvm/dlight/gpu/low_batch_gemv.py |  8 +++
 tests/python/ir/test_dtype.py   | 40 +
 4 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/python/tvm/_ffi/runtime_ctypes.py 
b/python/tvm/_ffi/runtime_ctypes.py
index dc5582d045..099cbe972a 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -212,6 +212,20 @@ class DataType(ctypes.Structure):
 def __ne__(self, other):
 return not self.__eq__(other)
 
+def itemsize(self):
+"""Get the number of bytes of a single element of this data type. When 
the number of lanes
+is greater than 1, the itemsize is the size of the vector type.
+
+Returns
+---
+itemsize : int
+The number of bytes of a single element of this data type
+"""
+lanes_as_int = ctypes.c_int16(self.lanes).value
+if lanes_as_int < 0:
+raise ValueError("Cannot determine itemsize for scalable vector 
types")
+return (self.bits * self.lanes + 7) // 8
+
 
 if ml_dtypes is not None:
 DataType.NUMPY2STR[np.dtype(ml_dtypes.bfloat16)] = "bfloat16"
diff --git a/python/tvm/dlight/gpu/gemv.py b/python/tvm/dlight/gpu/gemv.py
index c1ce876620..644f4e6dfa 100644
--- a/python/tvm/dlight/gpu/gemv.py
+++ b/python/tvm/dlight/gpu/gemv.py
@@ -57,7 +57,7 @@ def get_extent(sch: tir.Schedule, loop_rv: 
tir.schedule.LoopRV):
 def get_bytes(dtype: Union[DataType, str]) -> int:
 if isinstance(dtype, str):
 dtype = DataType(dtype)
-return dtype.bits * dtype.lanes // 8
+return dtype.itemsize()
 
 
 def is_gemv(sch: tir.Schedule, block_info: BlockInfo) -> 
Optional[List[tir.Buffer]]:
diff --git a/python/tvm/dlight/gpu/low_batch_gemv.py 
b/python/tvm/dlight/gpu/low_batch_gemv.py
index 9a92c9e0e9..696722c3f0 100644
--- a/python/tvm/dlight/gpu/low_batch_gemv.py
+++ b/python/tvm/dlight/gpu/low_batch_gemv.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 """A rule for low-batch GEMM / decode-GEMM using GEMV schedule."""
-import re
 from functools import reduce
 from typing import List, Optional, Set, Union
 
@@ -55,10 +54,9 @@ def get_extent(sch: tir.Schedule, loop_rv: 
tir.schedule.LoopRV):
 
 
 def get_bytes(dtype: Union[DataType, str]) -> int:
-num = re.findall(r"\d+", dtype)
-if len(num) != 1:
-raise ValueError(f"Cannot get bytes from {dtype}")
-return int(num[0]) // 8
+if isinstance(dtype, str):
+dtype = DataType(dtype)
+return dtype.itemsize()
 
 
 def is_gemv(sch: tir.Schedule, block_info: BlockInfo) -> 
Optional[List[tir.Buffer]]:
diff --git a/tests/python/ir/test_dtype.py b/tests/python/ir/test_dtype.py
new file mode 100644
index 00..77cd1d7e4b
--- /dev/null
+++ b/tests/python/ir/test_dtype.py
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test data type related API"""
+import tvm
+from tvm import DataType
+import tvm.testing
+import pytest
+
+
+@pytest.mark.parametrize(
+"dtype_str, expected_size",
+[("float32", 4), ("float32x4", 16), ("e5m2_float8x4", 4), ("uint8", 1)],
+)
+def test_dtype_itemsize(dtype_str, expected_size):
+dtype = DataType(dtype_str)
+assert dtype.itemsize() == expected_size
+
+
+@pytest.mark.parametrize("dtype_str", [("int32xvscalex4")])
+def test_dtype_itemmize_error(dtype_str):
+with pytest.raises(ValueError):
+size = DataType(dtype_str).itemsize()
+
+
+if __name__ == "__main__":
+tvm.testing.main()



(tvm) branch main updated: [Dlight] Enhance vectorization loading weight for gemv (#16878)

2024-04-13 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 5c80691c81 [Dlight] Enhance vectorization loading weight for gemv 
(#16878)
5c80691c81 is described below

commit 5c80691c81070df0d79fa22f64579945f4807c5e
Author: Wuwei Lin 
AuthorDate: Sat Apr 13 11:48:00 2024 -0700

[Dlight] Enhance vectorization loading weight for gemv (#16878)

* [Dlight] Enhance vectorization loading weight for gemv


* Update gemv.py
---
 python/tvm/dlight/gpu/gemv.py| 18 ++--
 tests/python/dlight/test_gpu_gemv.py | 57 ++--
 2 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/python/tvm/dlight/gpu/gemv.py b/python/tvm/dlight/gpu/gemv.py
index 55b38fc66b..c1ce876620 100644
--- a/python/tvm/dlight/gpu/gemv.py
+++ b/python/tvm/dlight/gpu/gemv.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 """A rule for GEMV and DecodeGEMV."""
-import re
 from functools import reduce
 from typing import List, Optional, Union
 
@@ -56,10 +55,9 @@ def get_extent(sch: tir.Schedule, loop_rv: 
tir.schedule.LoopRV):
 
 
 def get_bytes(dtype: Union[DataType, str]) -> int:
-num = re.findall(r"\d+", dtype)
-if len(num) != 1:
-raise ValueError(f"Cannot get bytes from {dtype}")
-return int(num[0]) // 8
+if isinstance(dtype, str):
+dtype = DataType(dtype)
+return dtype.bits * dtype.lanes // 8
 
 
 def is_gemv(sch: tir.Schedule, block_info: BlockInfo) -> 
Optional[List[tir.Buffer]]:
@@ -297,10 +295,11 @@ class GEMV(GPUScheduleRule):
 Aq_local = sch.cache_read(rf, read_buffer_index=1, 
storage_scope="local")
 sch.compute_at(Aq_local, r, preserve_unit_loops=True)
 s_local, r_local = sch.get_loops(block=Aq_local)[-2:]
-s_local, vec_load = sch.split(
-s_local, factors=[None, VEC_LOAD], preserve_unit_iters=True
+fused_load = sch.fuse(s_local, r_local)
+aq_vec_len = max(1, VEC_LOAD // 
get_bytes(sch.get(Aq_local).reads[0].buffer.dtype))
+fused_load, vec_load = sch.split(
+fused_load, factors=[None, aq_vec_len], 
preserve_unit_iters=True
 )
-sch.reorder(s_local, r_local, vec_load)  # either s_local or 
r_local should be 1
 sch.vectorize(vec_load)
 
 # load vector into shared memory, shape should be the whole vector
@@ -442,10 +441,12 @@ class GEMV(GPUScheduleRule):
 
 TAG_S, TAG_R = "threadIdx.y", "threadIdx.x"
 SUPPORT_WARP_SHUFFLE = False
+VEC_LOAD = 1
 if target.kind.name == "cuda":
 VEC_C = 4
 LOAD_V_SHARED = True
 LOAD_V_VEC = 8
+VEC_LOAD = 4
 UNROLL = 256
 SUPPORT_WARP_SHUFFLE = True
 if isinstance(len_S, int):
@@ -522,7 +523,6 @@ class GEMV(GPUScheduleRule):
 else max(get_max_factor(len_r, [TR * 1, TR * 2, TR * 4, TR * 8]) 
// TR, 1),
 )
 VEC_C = min(get_max_factor(TILE_R, [1, 2, 4, 8]), VEC_C)
-VEC_LOAD = 1
 
 return apply(
 sch,
diff --git a/tests/python/dlight/test_gpu_gemv.py 
b/tests/python/dlight/test_gpu_gemv.py
index 8903babbc0..0fd7f79159 100644
--- a/tests/python/dlight/test_gpu_gemv.py
+++ b/tests/python/dlight/test_gpu_gemv.py
@@ -120,13 +120,13 @@ class TestGEMV(BaseBeforeAfter):
 
T.writes(var_NT_matmul_intermediate_rf_local[vax2_fused_u_fused_1_ax2_fused_u_fused_3_fused,
 0, v0, 0, v1])
 
var_NT_matmul_intermediate_rf_local[vax2_fused_u_fused_1_ax2_fused_u_fused_3_fused,
 0, v0, 0, v1] = T.float16(0)
 for ax2_fused_u_fused_0 in T.serial(1, 
annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}):
-for ax0, ax1, ax2_0, ax3 in T.grid(1, 1, 1, 2):
-for ax2_1 in T.vectorized(1):
+for ax0, ax1, ax2_ax3_fused_0 in T.grid(1, 1, 1):
+for ax2_ax3_fused_1 in T.vectorized(2):
 with T.block("lv1638_local"):
 v0 = T.axis.spatial(1, ax0)
 v1 = T.axis.spatial(32, 
ax0_fused_ax1_fused_fused_0 // n + ax1)
-v2 = T.axis.spatial(n, 
ax0_fused_ax1_fused_fused_0 % n + ax2_0 + ax2_1)
-v3 = T.axis.spatial(128, 
ax2_fused_u_fused_1_ax2_fused_u_fused_3_fused_0 * 2 + ax3)
+v2 = T.axis.spatial(n, 
ax0_fused_ax1_fused_fused_0 % n)
+ 

(tvm) branch main updated: [Relax] Enhance symbolic expr estimation in memory planning (#16872)

2024-04-12 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 0a3fe22208 [Relax] Enhance symbolic expr estimation in memory planning 
(#16872)
0a3fe22208 is described below

commit 0a3fe22208329edc596db0116752b3259f5d90a2
Author: Ruihang Lai 
AuthorDate: Fri Apr 12 09:50:35 2024 -0400

[Relax] Enhance symbolic expr estimation in memory planning (#16872)

This PR enhances the symbolic expression upper bound estimation in
static memory planning.

Prior to this PR, we were not able to estimate the upper bound of
`a * b` when `a` has an upper bound while `b` does not. This PR
enhances the estimation with arith::IntSet.

We introduce another TIR attribute `tir_non_negative_var` to indicate
the non-negative TIR variables for memory planning use.

A new unit test is introduced for this enhancement.
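
A rough sketch of how the two attributes are attached to a Relax function (the body and names are placeholders, not from this commit):

    from tvm.script import ir as I, relax as R

    @I.ir_module
    class Module:
        @R.function
        def main(x: R.Tensor(("n", "m"), "float32")) -> R.Tensor(("n", "m"), "float32"):
            # "n" gets an explicit upper bound, while "m" is only marked non-negative,
            # which lets the planner treat expressions such as 1024 * m as upper bounds.
            R.func_attr({"tir_var_upper_bound": {"n": 1024}, "tir_non_negative_var": ["m"]})
            y = R.add(x, x)
            return y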
---
 src/relax/transform/static_plan_block_memory.cc|  45 +++--
 .../test_transform_static_plan_block_memory.py | 102 +
 2 files changed, 137 insertions(+), 10 deletions(-)

diff --git a/src/relax/transform/static_plan_block_memory.cc 
b/src/relax/transform/static_plan_block_memory.cc
index 453c996916..2b16d86509 100644
--- a/src/relax/transform/static_plan_block_memory.cc
+++ b/src/relax/transform/static_plan_block_memory.cc
@@ -353,8 +353,10 @@ class StorageAllocatorBaseVisitor : public ExprVisitor {
  * the input function signature in the analyzer.
  * \param func The function to be analyzed.
  * \param ana The analyzer which contains the TIR var upper bounds.
+ * \param dom_map The domain map of the TIR variables.
  */
-void SetTIRVarUpperBound(Function func, arith::Analyzer* ana) {
+void SetTIRVarUpperBound(Function func, arith::Analyzer* ana,
+ Map* dom_map) {
   // Use the attribute-annotated TIR var upper bounds as the TIR var values for
   // memory planning.
   // NOTE: we only apply the annotated upper bounds to the TIR variables that
@@ -362,7 +364,10 @@ void SetTIRVarUpperBound(Function func, arith::Analyzer* 
ana) {
   Map var_upper_bound_attr_raw =
   func->GetAttr>("tir_var_upper_bound")
   .value_or(Map());
+  Array non_negative_var_attr_raw =
+  
func->GetAttr>("tir_non_negative_var").value_or(Array());
   std::unordered_map var_upper_bound_attr;
+  std::unordered_set non_negative_var_attr;
   // We manually check the value type to ensure the values are all positive 
IntImm.
   for (auto it : var_upper_bound_attr_raw) {
 const auto* key = it.first.as();
@@ -378,13 +383,23 @@ void SetTIRVarUpperBound(Function func, arith::Analyzer* 
ana) {
 << value->value << " is got.";
 var_upper_bound_attr[GetRef(key)] = GetRef(value);
   }
+  for (ObjectRef var_name : non_negative_var_attr_raw) {
+const auto* key = var_name.as();
+CHECK(key != nullptr) << "The element of attr `tir_non_negative_var` 
should be string. However "
+  << key->GetTypeKey() << " is got.";
+non_negative_var_attr.insert(GetRef(key));
+  }
   Array var_in_signature = TIRVarsInStructInfo(GetStructInfo(func));
   for (const tir::Var& tir_var : var_in_signature) {
 auto it = var_upper_bound_attr.find(tir_var->name_hint);
 if (it != var_upper_bound_attr.end()) {
-  ana->Bind(tir_var,
-tvm::Range::FromMinExtent(tvm::IntImm(DataType::Int(64), 0),
-  tvm::IntImm(DataType::Int(64), 
(*it).second->value + 1)));
+  tvm::Range range =
+  tvm::Range::FromMinExtent(tvm::IntImm(DataType::Int(64), 0),
+tvm::IntImm(DataType::Int(64), 
(*it).second->value + 1));
+  ana->Bind(tir_var, range);
+  dom_map->Set(tir_var, arith::IntSet::FromRange(range));
+} else if (non_negative_var_attr.count(tir_var->name_hint)) {
+  ana->MarkGlobalNonNegValue(tir_var);
 }
   }
 }
@@ -398,14 +413,20 @@ void SetTIRVarUpperBound(Function func, arith::Analyzer* 
ana) {
  * \return The upper-bounded shape. When a dimension's upper bound
  * cannot be determined, we keep the dimension unchanged.
  */
-Array GetUpperBoundShape(Array shape, arith::Analyzer* 
ana) {
+Array GetUpperBoundShape(Array shape, arith::Analyzer* ana,
+   const Map& 
dom_map) {
   // Use the upper bounds of TIR vars as their values.
   Array upper_bounded_shape;
   upper_bounded_shape.reserve(shape.size());
   for (const PrimExpr& dim_len : shape) {
 int64_t max_bound = ana->const_int_bound(dim_len)->max_value;
 if (max_bound == std::numeric_limits::max()) {
-  upper_bounded_shape.push_back(dim_len);
+  arith::IntSet int_set 

(tvm) branch main updated: [Thrust] Fix thrust workspace allocation (#16873)

2024-04-12 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 3f09e7f5ce [Thrust] Fix thrust workspace allocation (#16873)
3f09e7f5ce is described below

commit 3f09e7f5cea7aaa113286e4652f0e430d52fc110
Author: Wuwei Lin 
AuthorDate: Fri Apr 12 04:57:39 2024 -0700

[Thrust] Fix thrust workspace allocation (#16873)

* [Thrust] Fix thrust workspace allocation

* Fix typo and use workspace for `device_vector` in sort

-

Co-authored-by: Ruihang Lai 
---
 python/tvm/relax/backend/dispatch_sort_scan.py  | 16 +---
 src/runtime/contrib/thrust/thrust.cu| 21 -
 .../python/relax/test_backend_dispatch_sort_scan.py |  4 ++--
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/python/tvm/relax/backend/dispatch_sort_scan.py 
b/python/tvm/relax/backend/dispatch_sort_scan.py
index 064d3abf25..f0e42f401b 100644
--- a/python/tvm/relax/backend/dispatch_sort_scan.py
+++ b/python/tvm/relax/backend/dispatch_sort_scan.py
@@ -174,9 +174,19 @@ class SortScanDispatcher(PyExprMutator):
 """
 input_shape = call.args[0].struct_info.shape
 input_byte_per_elem = DataType(call.args[0].struct_info.dtype).bits // 
8
-input_size = reduce(mul, input_shape, 1) * input_byte_per_elem
-# Most GPU algorithms take O(n) space or less, we choose 2N + 4MB as a 
safe estimation
-return 2 * input_size + 4 * 1024 * 1024
+int64_byte_per_elem = DataType("int64").bits // 8
+int32_byte_per_elem = DataType("int32").bits // 8
+num_elem = reduce(mul, input_shape, 1)
+input_size = num_elem * input_byte_per_elem
+# Most GPU algorithms take O(n) space or less, we choose 8N + 4MB as a 
safe estimation
+# for algorithm workspace.
+# The current thrust sort implementation may need extra int64 and 
int32 arrays
+# for temporary data, so we further add this part to the workspace.
+return (
+8 * input_size
++ 4 * 1024 * 1024
++ num_elem * (int64_byte_per_elem + int32_byte_per_elem)
+)
 
 def allocate_workspace(self, call: relax.Call) -> relax.Var:
 """
diff --git a/src/runtime/contrib/thrust/thrust.cu 
b/src/runtime/contrib/thrust/thrust.cu
index 9e35290fab..28edba64aa 100644
--- a/src/runtime/contrib/thrust/thrust.cu
+++ b/src/runtime/contrib/thrust/thrust.cu
@@ -65,6 +65,8 @@ class WorkspaceMemoryResource : public 
thrust::mr::memory_resource {
   void* result = std::align(alignment, bytes, workspace, workspace_size);
   CHECK(result) << "Failed to allocate " << bytes << " bytes with 
alignment " << alignment
 << " bytes.";
+  workspace = static_cast(workspace) + bytes;
+  workspace_size -= bytes;
   return result;
 }
 return thrust_pool_->do_allocate(bytes, alignment).get();
@@ -120,14 +122,15 @@ void thrust_sort(DLTensor* input, DLTensor* out_values, 
DLTensor* out_indices, b
 // segmented sort by key
 // Follow the back-to-back stable_sort_by_key strategy explained below
 // https://groups.google.com/g/thrust-users/c/BoLsxO6b4FY
-thrust::device_vector argsort_order(size);
-thrust::sequence(argsort_order.begin(), argsort_order.end());
+thrust::device_ptr argsort_order(
+static_cast(mr.do_allocate(sizeof(int64_t) * size, 
sizeof(int64_t;
+thrust::sequence(argsort_order, argsort_order + size);
 
 // First, sort values and store the sorted order in argsort_order.
 if (is_ascend) {
-  thrust::stable_sort_by_key(policy, values_ptr, values_ptr + size, 
argsort_order.begin());
+  thrust::stable_sort_by_key(policy, values_ptr, values_ptr + size, 
argsort_order);
 } else {
-  thrust::stable_sort_by_key(policy, values_ptr, values_ptr + size, 
argsort_order.begin(),
+  thrust::stable_sort_by_key(policy, values_ptr, values_ptr + size, 
argsort_order,
  thrust::greater());
 }
 
@@ -141,15 +144,15 @@ void thrust_sort(DLTensor* input, DLTensor* out_values, 
DLTensor* out_indices, b
 thrust::make_transform_iterator(counting_iter, 
linear_index_to_sort_axis_index);
 
 // This will reorder indices 0, 1, 2 ... in the sorted order of values_ptr
-thrust::gather(policy, argsort_order.begin(), argsort_order.end(), 
init_indices_iter,
-   indices_ptr);
+thrust::gather(policy, argsort_order, argsort_order + size, 
init_indices_iter, indices_ptr);
 
-thrust::device_vector segment_ids(size);
+thrust::device_ptr segment_ids(
+static_cast(mr.do_allocate(sizeof(int) * size, sizeof(int;
 auto linear_index_to_se

(tvm) branch main updated: [3rdparty] Bump flashinfer (#16868)

2024-04-11 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 88a1c6560c [3rdparty] Bump flashinfer (#16868)
88a1c6560c is described below

commit 88a1c6560cb5fe3a757b9b9053bb71421728aedd
Author: Wuwei Lin 
AuthorDate: Thu Apr 11 15:32:46 2024 -0700

[3rdparty] Bump flashinfer (#16868)

* [3rdparty] Bump flashinfer
---
 3rdparty/flashinfer| 2 +-
 tests/python/micro/test_micro_ms_tuning.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/3rdparty/flashinfer b/3rdparty/flashinfer
index a22aeb6000..920672776a 16
--- a/3rdparty/flashinfer
+++ b/3rdparty/flashinfer
@@ -1 +1 @@
-Subproject commit a22aeb60009f4f224fd94f9cc7d9d133a8398545
+Subproject commit 920672776a2bf2244acf7a2e0516f46be9e93b15
diff --git a/tests/python/micro/test_micro_ms_tuning.py 
b/tests/python/micro/test_micro_ms_tuning.py
index f55f3219cc..1a06c100b4 100644
--- a/tests/python/micro/test_micro_ms_tuning.py
+++ b/tests/python/micro/test_micro_ms_tuning.py
@@ -27,6 +27,7 @@ from tvm.contrib import graph_executor, utils
 from tvm import meta_schedule as ms
 
 
+@pytest.mark.skip(reason="flaky test")
 @tvm.testing.requires_micro
 def test_micro_tuning_with_meta_schedule():
 from tests.micro.zephyr.test_ms_tuning import create_relay_module



(tvm) branch main updated: [PageKV] allow PopN to pop all the tokens in last block (#16871)

2024-04-11 Thread tqchen
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 0aae97d8e4 [PageKV] allow PopN to pop all the tokens in last block 
(#16871)
0aae97d8e4 is described below

commit 0aae97d8e421fb60260b3d1ee0351393a6ae420c
Author: ZCHNO 
AuthorDate: Fri Apr 12 05:56:49 2024 +0800

[PageKV] allow PopN to pop all the tokens in last block (#16871)
---
 src/runtime/relax_vm/paged_kv_cache.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/relax_vm/paged_kv_cache.cc 
b/src/runtime/relax_vm/paged_kv_cache.cc
index 0c635967f2..64759d465b 100644
--- a/src/runtime/relax_vm/paged_kv_cache.cc
+++ b/src/runtime/relax_vm/paged_kv_cache.cc
@@ -1021,7 +1021,7 @@ class PagedAttentionKVCacheObj : public 
AttentionKVCacheObj {
 
 Block& block = global_block_pool_[it->second.last_block_idx];
 CHECK_GE(n, 0) << "The length of popping " << n << " cannot be negative.";
-CHECK_LT(n, block.seq_length) << "The sequence only has length " << 
block.seq_length
+CHECK_LE(n, block.seq_length) << "The sequence only has length " << 
block.seq_length
   << " in the last block, while the length of 
pop is " << n
   << " which exceeds the last-block sequence 
length.";
 


