This is an automated email from the ASF dual-hosted git repository.
MasterJH5574 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 9dc87f1931 [Docs] Refactor BYOC example NPU tutorial (#19439)
9dc87f1931 is described below
commit 9dc87f1931849541877c412af3362dcde17e1a99
Author: Shushi Hong <[email protected]>
AuthorDate: Sat Apr 25 16:01:58 2026 -0400
[Docs] Refactor BYOC example NPU tutorial (#19439)
This PR refactors the BYOC tutorial for the example NPU backend so the
full pipeline (register → partition → codegen → VM execute) actually
runs and visibly demonstrates fusion.
It also fixes several latent bugs in the example backend that the
original tutorial had been masking.
---
...oc_npu_example.py => bring_your_own_codegen.py} | 61 ++++++++++++++++------
docs/index.rst | 1 +
.../relax/backend/contrib/example_npu/patterns.py | 55 ++++++++++++++++---
.../contrib/example_npu/example_npu_runtime.cc | 21 +++++---
4 files changed, 108 insertions(+), 30 deletions(-)
diff --git a/docs/how_to/tutorials/byoc_npu_example.py b/docs/how_to/tutorials/bring_your_own_codegen.py
similarity index 72%
rename from docs/how_to/tutorials/byoc_npu_example.py
rename to docs/how_to/tutorials/bring_your_own_codegen.py
index 143d097dc4..b6039e4930 100644
--- a/docs/how_to/tutorials/byoc_npu_example.py
+++ b/docs/how_to/tutorials/bring_your_own_codegen.py
@@ -16,21 +16,25 @@
# under the License.
"""
-.. _tutorial-byoc-npu-example:
+.. _tutorial-bring-your-own-codegen:
Bring Your Own Codegen: NPU Backend Example
===========================================
-**Author**: `Sheldon Aristide <https://github.com/Aristide021/>`_
-This tutorial walks through the example NPU BYOC backend included in TVM.
-It demonstrates the key concepts needed to offload operations to a custom
-accelerator: pattern registration, graph partitioning, codegen, and runtime
-dispatch.
+This tutorial shows how to integrate a custom hardware backend with TVM's
+BYOC framework, using the bundled example NPU backend (CPU emulation, no
+real hardware required) as the worked example. You will see the key
+concepts needed to offload operations to a custom accelerator: pattern
+registration, graph partitioning, codegen, and runtime dispatch.
NPUs are purpose-built accelerators designed around a fixed set of operations
common in neural network inference, such as matrix multiplication, convolution,
and activation functions.
-The example backend uses CPU emulation so no real NPU hardware is required.
+The example backend's runtime is a *stub*: it logs the dispatch decisions an
+NPU would make (memory tier, execution engine, fusion) but performs no real
+computation, so output buffers are uninitialized. Assertions in this tutorial
+therefore check shapes, not values. When you replace the runtime with your
+hardware SDK calls, the same flow produces real results.
**Prerequisites**: Build TVM with ``USE_EXAMPLE_NPU_CODEGEN=ON`` and
``USE_EXAMPLE_NPU_RUNTIME=ON``.
@@ -58,6 +62,8 @@ The example backend uses CPU emulation so no real NPU hardware is required.
# Importing the module is enough to register all supported patterns with
# TVM's pattern registry.
+import numpy as np
+
import tvm
import tvm.relax.backend.contrib.example_npu # registers patterns
from tvm import relax
@@ -69,6 +75,8 @@ has_example_npu_codegen = tvm.get_global_func("relax.ext.example_npu", True)
has_example_npu_runtime = tvm.get_global_func("runtime.ExampleNPUJSONRuntimeCreate", True)
has_example_npu = has_example_npu_codegen and has_example_npu_runtime
+target = tvm.target.Target("llvm")
+
patterns = get_patterns_with_prefix("example_npu")
print("Registered patterns:", [p.name for p in patterns])
@@ -98,8 +106,22 @@ class MatmulReLU:
# ---------------------------
#
# ``FuseOpsByPattern`` groups ops that match a registered pattern into
-# composite functions. ``MergeCompositeFunctions`` consolidates them
-# so each group becomes a single external call.
+# composite functions, controlled by two flags:
+#
+# - ``bind_constants=False`` keeps weights as function arguments instead
+# of baking them in, so the host stays in charge of parameter
+# ownership.
+# - ``annotate_codegen=True`` tags each composite with its backend name
+# (``example_npu``); without this tag, ``RunCodegen`` has no way to
+# route the composite to a backend.
+#
+# ``MergeCompositeFunctions`` then consolidates adjacent composites
+# that target the same backend so each group becomes a single external
+# call. Note that consolidation depends on the patterns themselves: an
+# ``op_a + op_b`` chain only collapses into one composite if a fused
+# pattern (e.g. ``matmul_relu_fused``) was registered for it; otherwise
+# each op stays as its own composite even when both target the same
+# backend.
mod = MatmulReLU
mod = FuseOpsByPattern(patterns, bind_constants=False, annotate_codegen=True)(mod)
@@ -130,28 +152,27 @@ if has_example_npu:
# Build the module for the host target, create a virtual machine, and
# execute the compiled function.
- import numpy as np
-
np.random.seed(0)
x_np = np.random.randn(2, 4).astype("float32")
w_np = np.random.randn(4, 8).astype("float32")
- target = tvm.target.Target("llvm")
with tvm.transform.PassContext(opt_level=3):
built = relax.build(mod, target)
vm = relax.VirtualMachine(built, tvm.cpu())
result = vm["main"](tvm.runtime.tensor(x_np, tvm.cpu()),
tvm.runtime.tensor(w_np, tvm.cpu()))
- expected_shape = (2, 8)
- assert result.numpy().shape == expected_shape
+ assert result.numpy().shape == (2, 8)
print("Execution completed. Output shape:", result.numpy().shape)
######################################################################
# Step 6: Conv2D + ReLU
# ---------------------
#
-# The same flow applies to convolution workloads.
+# The same flow applies to convolution workloads. Because the fused
+# ``conv2d + relu`` pattern is registered after the standalone
+# ``conv2d`` pattern in ``patterns.py`` (later entries have higher
+# priority), both ops are offloaded as a single composite function.
@tvm.script.ir_module
@@ -177,7 +198,15 @@ if has_example_npu:
with tvm.transform.PassContext(opt_level=3):
built2 = relax.build(mod2, target)
- print("Conv2dReLU compiled successfully.")
+ x2_np = np.random.randn(1, 3, 32, 32).astype("float32")
+ w2_np = np.random.randn(16, 3, 3, 3).astype("float32")
+
+ vm2 = relax.VirtualMachine(built2, tvm.cpu())
+ result2 = vm2["main"](
+ tvm.runtime.tensor(x2_np, tvm.cpu()), tvm.runtime.tensor(w2_np, tvm.cpu())
+ )
+ assert result2.numpy().shape == (1, 16, 30, 30)
+ print("Conv2dReLU output shape:", result2.numpy().shape)
######################################################################
# Next steps
diff --git a/docs/index.rst b/docs/index.rst
index 01a4a64f08..2c66c4295d 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -48,6 +48,7 @@ driving its costs down.
how_to/tutorials/cross_compilation_and_rpc
how_to/tutorials/export_and_load_executable
how_to/tutorials/mix_python_and_tvm_with_pymodule
+ how_to/tutorials/bring_your_own_codegen
how_to/dev/index
.. The Deep Dive content is comprehensive
diff --git a/python/tvm/relax/backend/contrib/example_npu/patterns.py b/python/tvm/relax/backend/contrib/example_npu/patterns.py
index f55ce47dfb..d224388fa3 100644
--- a/python/tvm/relax/backend/contrib/example_npu/patterns.py
+++ b/python/tvm/relax/backend/contrib/example_npu/patterns.py
@@ -117,6 +117,40 @@ def conv2d_relu_fused_pattern():
return ("example_npu.conv2d_relu_fused", *_make_conv2d_relu_pattern(), _check_conv2d_relu)
+def matmul_relu_fused_pattern():
+ """
+ NPU-optimized MatMul+ReLU fusion pattern.
+
+ Fusing the matrix engine output with the activation unit avoids a
+ write/read round-trip through L1 SRAM, mirroring the fusion done by
+ ``conv2d_relu_fused_pattern``.
+ """
+
+ def _make_matmul_relu_pattern():
+ input_tensor = wildcard()
+ weight = wildcard()
+ matmul = is_op("relax.matmul")(input_tensor, weight)
+ relu = is_op("relax.nn.relu")(matmul)
+
+ annotations = {
+ "input": input_tensor,
+ "weight": weight,
+ "matmul": matmul,
+ "root": relu,
+ }
+ return relu, annotations
+
+ def _check_matmul_relu(context: PatternCheckContext) -> bool:
+ """Check if MatMul+ReLU fusion is beneficial for NPU"""
+ if not _check_npu_memory_constraints(context):
+ return False
+ if not _check_npu_quantization(context):
+ return False
+ return True
+
+ return ("example_npu.matmul_relu_fused", *_make_matmul_relu_pattern(), _check_matmul_relu)
+
+
def matmul_patterns():
"""
NPU-optimized matrix multiplication patterns.
@@ -486,18 +520,25 @@ def quantization_patterns():
# Register all NPU patterns with architectural awareness
+# register_patterns priority: patterns that appear LATER in the list win.
+# So we place general / standalone patterns first, and fused (more
+# specific) patterns last so they take precedence over their constituents.
register_patterns(
[
- conv2d_relu_fused_pattern(), # Fused patterns first (higher priority)
+ *quantization_patterns(),
+ *elementwise_patterns(),
+ *activation_patterns(),
+ *softmax_patterns(),
+ *batch_norm_patterns(),
+ *pooling_patterns(),
*matmul_patterns(),
*conv1d_patterns(),
+ # Plain conv2d is more general than depthwise (groups>1); list
+ # plain first so depthwise wins on grouped convs.
*conv2d_patterns(),
*depthwise_conv2d_patterns(),
- *pooling_patterns(),
- *batch_norm_patterns(),
- *softmax_patterns(),
- *activation_patterns(),
- *elementwise_patterns(),
- *quantization_patterns(),
+ # Fused patterns last (highest priority).
+ matmul_relu_fused_pattern(),
+ conv2d_relu_fused_pattern(),
]
)
diff --git a/src/runtime/contrib/example_npu/example_npu_runtime.cc b/src/runtime/contrib/example_npu/example_npu_runtime.cc
index 4f4e70d4e5..440a5d9715 100644
--- a/src/runtime/contrib/example_npu/example_npu_runtime.cc
+++ b/src/runtime/contrib/example_npu/example_npu_runtime.cc
@@ -319,16 +319,19 @@ class ExampleNPURuntime : public JSONRuntimeBase {
LOG(INFO) << " Executing fused operation - reducing memory traffic";
}
- // Dispatch to appropriate implementation
+ // Dispatch to appropriate implementation.
+ // More specific names must be checked before more general ones, since
+ // op_name.find() is a substring match (e.g. "depthwise_conv2d" also
+ // contains "conv2d", and "dequantize" also contains "quantize").
if (op_name.find("matmul") != std::string::npos ||
op_name.find("dense") != std::string::npos) {
- ExecuteMatMul(node, engine);
+ ExecuteMatMul(node, engine, is_fused);
+ } else if (op_name.find("depthwise") != std::string::npos) {
+ ExecuteDepthwiseConv2D(node, engine);
} else if (op_name.find("conv2d") != std::string::npos) {
ExecuteConv2D(node, engine, is_fused);
} else if (op_name.find("conv1d") != std::string::npos) {
ExecuteConv1D(node, engine);
- } else if (op_name.find("depthwise") != std::string::npos) {
- ExecuteDepthwiseConv2D(node, engine);
} else if (op_name.find("pool") != std::string::npos) {
ExecutePooling(node, engine);
} else if (op_name.find("relu") != std::string::npos ||
@@ -340,10 +343,10 @@ class ExampleNPURuntime : public JSONRuntimeBase {
} else if (op_name.find("add") != std::string::npos ||
op_name.find("multiply") != std::string::npos) {
ExecuteElementwise(node, engine);
- } else if (op_name.find("quantize") != std::string::npos) {
- ExecuteQuantization(node);
} else if (op_name.find("dequantize") != std::string::npos) {
ExecuteDequantization(node);
+ } else if (op_name.find("quantize") != std::string::npos) {
+ ExecuteQuantization(node);
} else {
LOG(WARNING) << "Unsupported operation: " << op_name;
}
@@ -431,7 +434,7 @@ class ExampleNPURuntime : public JSONRuntimeBase {
/*!
* \brief Execute matrix multiplication on NPU matrix engine
*/
- void ExecuteMatMul(const JSONGraphNode& node, ExecutionEngine engine) {
+ void ExecuteMatMul(const JSONGraphNode& node, ExecutionEngine engine, bool is_fused) {
LOG(INFO) << " Executing MatMul on " << GetEngineString(engine);
// Get input shapes
@@ -448,6 +451,10 @@ class ExampleNPURuntime : public JSONRuntimeBase {
LOG(INFO) << " Using 16x16 systolic array for acceleration";
}
+ if (is_fused) {
+ LOG(INFO) << " Fused with activation - saving memory bandwidth";
+ }
+
// In a real implementation: dispatch to NPU matrix multiplication unit
}