This is an automated email from the ASF dual-hosted git repository. tqchen pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push: new cd3448603d [Doc] Customize Optimization (#17320) cd3448603d is described below commit cd3448603dffea2340e406dd7751a37b0440d81f Author: Siyuan Feng <hzfen...@sjtu.edu.cn> AuthorDate: Mon Sep 2 14:06:37 2024 +0800 [Doc] Customize Optimization (#17320) [Doc] Customization Optimization --- docs/how_to/index.rst | 1 + docs/how_to/tutorials/customize_opt.py | 225 +++++++++++++++++++++++++++++++++ 2 files changed, 226 insertions(+) diff --git a/docs/how_to/index.rst b/docs/how_to/index.rst index 976b2f1bd4..c5b9d703f0 100644 --- a/docs/how_to/index.rst +++ b/docs/how_to/index.rst @@ -19,5 +19,6 @@ :maxdepth: 1 tutorials/e2e_opt_model + tutorials/customize_opt tutorials/cross_compilation_and_rpc dev/index diff --git a/docs/how_to/tutorials/customize_opt.py b/docs/how_to/tutorials/customize_opt.py new file mode 100644 index 0000000000..5806d6ce5d --- /dev/null +++ b/docs/how_to/tutorials/customize_opt.py @@ -0,0 +1,225 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +.. 
_customize_opt: + +Customize Optimization +====================== +One main design goal of Apache TVM is to enable easy customization of the optimization pipeline +for both research and development purposes, and to iterate on engineering optimizations. In this +tutorial, we will cover the following topics: + +.. contents:: Table of Contents + :local: + :depth: 1 +""" + +###################################################################### +# Review Overall Flow +# ------------------- +# .. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_overall_flow.svg +# :align: center +# :width: 80% +# +# The overall flow consists of the following steps: +# +# - **Construct or Import a Model**: Construct a neural network model or import a pre-trained +# model from other frameworks (e.g. PyTorch, ONNX), and create the TVM IRModule, which contains +# all the information needed for compilation, including high-level Relax functions for the +# computational graph and low-level TensorIR functions for tensor programs. +# - **Perform Composable Optimizations**: Perform a series of optimization transformations, +# such as graph optimizations, tensor program optimizations, and library dispatching. +# - **Build and Universal Deployment**: Build the optimized model into a deployable module for the +# universal runtime, and execute it on different devices, such as CPU, GPU, or other accelerators. +# + +import os +import tempfile +import numpy as np +import tvm +from tvm import IRModule, relax +from tvm.relax.frontend import nn + +###################################################################### +# Composable IRModule Optimization +# -------------------------------- +# Apache TVM Unity provides a flexible way to optimize the IRModule. Everything centered +# around IRModule optimization can be composed with existing pipelines. Note that each optimization +# can focus on **part of the computation graph**, enabling partial lowering or partial optimization. 
#
# In this tutorial, we will demonstrate how to optimize a model with Apache TVM Unity.

######################################################################
# Prepare a Relax Module
# ~~~~~~~~~~~~~~~~~~~~~~
# We first prepare a Relax module. The module can be imported from other frameworks, constructed
# with the NN module frontend, or written in TVMScript. Here we use a simple neural network model
# as an example.


class RelaxModel(nn.Module):
    """Small MLP: ``Linear(784, 256)`` -> ``ReLU`` -> ``Linear(256, 10)`` without bias."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 10, bias=False)

    def forward(self, x):
        """Apply ``fc1``, ``relu1`` and ``fc2`` to ``x``, in that order, and return the result."""
        hidden = self.relu1(self.fc1(x))
        return self.fc2(hidden)


input_shape = (1, 784)
# Export the ``forward`` method with a single float32 input of shape ``input_shape``.
mod, params = RelaxModel().export_tvm(
    {"forward": {"x": nn.spec.Tensor(input_shape, "float32")}}
)
mod.show()

######################################################################
# Library Dispatch
# ~~~~~~~~~~~~~~~~
# We would like to quickly try out a variant of library optimization for certain platforms
# (e.g., GPU). We can write a dedicated dispatching pass for a specific platform and
# operator. Here we demonstrate how to dispatch certain patterns to the CUBLAS library.
#
# .. note::
#   This tutorial only demonstrates dispatching a single operator pattern to CUBLAS, highlighting
#   the flexibility of the optimization pipeline. In real-world cases, we can import multiple
#   patterns and dispatch them to different kernels.


# Import the CUBLAS pattern module. The alias itself is never referenced; the module is imported
# for its side effect (presumably registering the ``cublas.*`` patterns that are looked up below).
import tvm.relax.backend.contrib.cublas as _cublas


# Define a new pass for CUBLAS dispatch
@tvm.transform.module_pass(opt_level=0, name="CublasDispatch")
class CublasDispatch:
    """Module pass that offloads the ``cublas.matmul_transposed_bias_relu`` pattern to CUBLAS."""

    def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IRModule:
        """Fuse the selected CUBLAS pattern in ``mod`` and run codegen on the fused functions.

        Parameters
        ----------
        mod : IRModule
            The module to transform.
        _ctx : tvm.transform.PassContext
            The pass context (unused).

        Returns
        -------
        IRModule
            The module after ``FuseOpsByPattern`` and ``RunCodegen`` have been applied.

        Raises
        ------
        RuntimeError
            If the ``relax.ext.cublas`` global function is not registered.
        """
        # Check if CUBLAS is enabled. With ``allow_missing=True`` the lookup returns ``None``
        # instead of raising when the function is not registered.
        if tvm.get_global_func("relax.ext.cublas", allow_missing=True) is None:
            raise RuntimeError("CUBLAS is not enabled.")

        # Get the patterns we are interested in
        patterns = [relax.backend.get_pattern("cublas.matmul_transposed_bias_relu")]
        # Note: in real-world cases, we usually get all patterns
        # patterns = relax.backend.get_patterns_with_prefix("cublas")

        # Fuse ops by patterns and then run codegen
        mod = relax.transform.FuseOpsByPattern(patterns, annotate_codegen=True)(mod)
        mod = relax.transform.RunCodegen()(mod)
        return mod


mod = CublasDispatch()(mod)
mod.show()

######################################################################
# After the dispatching pass, we can see that the first ``nn.Linear`` and ``nn.ReLU`` are fused
# and rewritten to a ``call_dps_packed`` function which calls the CUBLAS library. Notably, the
# other part is not changed, which means we can selectively dispatch the optimization for
# certain computation.

######################################################################
# Auto Tuning
# ~~~~~~~~~~~
# Continuing from the previous example, we can further optimize the model with auto-tuning for
# the **remaining part of the computation**. Here we demonstrate how to use meta-schedule to
# auto-tune the model.
#
# We can use the ``MetaScheduleTuneTIR`` pass to tune the model, and the
# ``MetaScheduleApplyDatabase`` pass to apply the best configuration to the model.
# The tuning process will generate the search space and tune the model, and the following
# step will apply the best configuration to the model.
# Before running the passes, we need to lower Relax operators into TensorIR functions via
# ``LegalizeOps``.
#
# .. note::
#
#   To save CI time and avoid flakiness, we skip the tuning process in the CI environment.
#

device = tvm.cuda(0)
target = tvm.target.Target.from_device(device)
if os.getenv("CI", "") != "true":
    trials = 2000
    with target, tempfile.TemporaryDirectory() as tmp_dir:
        # Lower with the "zero" pipeline, tune the TensorIR functions, then apply the best
        # records found in the tuning database stored under ``tmp_dir``.
        tuning_passes = [
            relax.get_pipeline("zero"),
            relax.transform.MetaScheduleTuneTIR(work_dir=tmp_dir, max_trials_global=trials),
            relax.transform.MetaScheduleApplyDatabase(work_dir=tmp_dir),
        ]
        mod = tvm.ir.transform.Sequential(tuning_passes)(mod)

    mod.show()

######################################################################
# DLight Rules
# ~~~~~~~~~~~~
# DLight rules are a set of default rules for scheduling and optimizing kernels.
# DLight rules are designed for fast compilation and **fair** performance. In some cases,
# e.g. language models, DLight provides excellent performance, while for generic models,
# it achieves a balance between performance and compilation time.

from tvm import dlight as dl

# Apply DLight rules
with target:
    # The rules are passed in this order: Matmul, GEMV, Reduction, GeneralReduction, Fallback.
    default_schedule = dl.ApplyDefaultSchedule(  # pylint: disable=not-callable
        dl.gpu.Matmul(),
        dl.gpu.GEMV(),
        dl.gpu.Reduction(),
        dl.gpu.GeneralReduction(),
        dl.gpu.Fallback(),
    )
    dlight_passes = [relax.get_pipeline("zero"), default_schedule]
    mod = tvm.ir.transform.Sequential(dlight_passes)(mod)

mod.show()

######################################################################
# .. note::
#
#   This tutorial focuses on demonstrating the optimization pipeline rather than on
#   pushing the performance to the limit. The current optimization may not be the best.


######################################################################
# Deploy the Optimized Model
# --------------------------
# We can build and deploy the optimized model to the TVM runtime.

ex = relax.build(mod, target="cuda")
dev = tvm.device("cuda", 0)
vm = relax.VirtualMachine(ex, dev)

# The VM runs on the GPU, so both the input and the parameters are allocated on ``dev``.
# Random values are used: first for the input, then for each parameter in ``params`` order.
host_data = np.random.rand(*input_shape).astype("float32")
data = tvm.nd.array(host_data, dev)
gpu_params = [
    tvm.nd.array(np.random.rand(*p.shape).astype(p.dtype), dev)
    for _, p in params
]
gpu_out = vm["forward"](data, *gpu_params).numpy()
print(gpu_out)


######################################################################
# Summary
# -------
# This tutorial demonstrates how to customize the optimization pipeline for ML models in Apache TVM.
# We can easily compose optimization passes and customize the optimization for different parts
# of the computation graph. The flexibility of the optimization pipeline enables us to quickly
# iterate on the optimization and improve the performance of the model.
#