This is an automated email from the ASF dual-hosted git repository. masahi pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push: new ecd3c884de [Runtime][PipelineExecutor] Tutorial of using pipeline executor. (#11557) ecd3c884de is described below commit ecd3c884de6b37d10b766bc9300bc71ee3776402 Author: Hua Jiang <h...@xilinx.com> AuthorDate: Fri Jul 22 12:54:24 2022 -0700 [Runtime][PipelineExecutor] Tutorial of using pipeline executor. (#11557) * [Runtime][PipelineExecutor] Tutorial of using pipeline executor. Tutorial of using pipeline executor including the byoc use case. * fix ci issue * document change. * triger build * fix doc issue * fix ci issue * doc issue * fix ci issue * fix ci issue. * fix __file__ not found problem. this is a known issue of sphinx-gallery https://github.com/sphinx-gallery/sphinx-gallery/issues/211 * fix byoc with dnnl issue * enable dnnl and pipeline executor * trigger build * trigger build * fix build issue * trigger build * oneflow cause crash, do test with change * add sphinx skip * plint * remove from_oneflow change test. * remove pipeline executor change for test * plint * enable DNNL and pipeline * disable DNNL * enable DNNL without pipeline * remove dnnl and add cutlass * use cutlass with byoc * change into cutlass * fix doc convention issue * remove duplicate variable * fix plint issue. * address review comments. * address review comments * fix bug. * polish the document * fix plint issue * address review comments. * address review comments * address review comments --- .../work_with_relay/using_pipeline_executor.py | 248 +++++++++++++++++++++ python/tvm/contrib/pipeline_executor.py | 26 ++- python/tvm/contrib/pipeline_executor_build.py | 14 +- tests/scripts/task_config_build_gpu.sh | 2 + 4 files changed, 281 insertions(+), 9 deletions(-) diff --git a/gallery/how_to/work_with_relay/using_pipeline_executor.py b/gallery/how_to/work_with_relay/using_pipeline_executor.py new file mode 100755 index 0000000000..5496058265 --- /dev/null +++ b/gallery/how_to/work_with_relay/using_pipeline_executor.py @@ -0,0 +1,248 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Using Pipeline Executor in Relay +================================= +**Author**: `Hua Jiang <https://https://github.com/huajsj>`_ + +This is a short tutorial on how to use "Pipeline Executor" with Relay. +""" +import tvm +from tvm import te +import numpy as np +from tvm.contrib import graph_executor as runtime +from tvm.relay.op.contrib.cutlass import partition_for_cutlass +from tvm import relay +from tvm.relay import testing +import tvm.testing +from tvm.contrib.cutlass import ( + has_cutlass, + num_cutlass_partitions, + finalize_modules, + finalize_modules_vm, +) + +img_size = 8 +####################################################################### +# Create a simple network, this network can be a pre-trained model too. +# --------------------------------------------------------------------- +# Let's create a very simple network for demonstration. +# It consists of convolution, batch normalization, dense, and ReLU activation. +def get_network(): + out_channels = 16 + batch_size = 1 + data = relay.var("data", relay.TensorType((batch_size, 3, img_size, img_size), "float16")) + dense_weight = relay.var( + "dweight", relay.TensorType((batch_size, 16 * img_size * img_size), "float16") + ) + weight = relay.var("weight") + second_weight = relay.var("second_weight") + bn_gamma = relay.var("bn_gamma") + bn_beta = relay.var("bn_beta") + bn_mmean = relay.var("bn_mean") + bn_mvar = relay.var("bn_var") + simple_net = relay.nn.conv2d( + data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, padding=(1, 1) + ) + simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0] + simple_net = relay.nn.relu(simple_net) + simple_net = relay.nn.batch_flatten(simple_net) + simple_net = relay.nn.dense(simple_net, dense_weight) + simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net) + data_shape = (batch_size, 3, img_size, img_size) + net, params = testing.create_workload(simple_net) + return net, params, data_shape + + +net, params, data_shape = get_network() +########################################### +# Splitting the network into two subgraphs. +# ----------------------------------------- +# This function called 'graph_split' from a unit test is just an example. User can create a customized logic +# to split the graph. +import inspect +import os + +tutorial_dir = os.path.dirname(inspect.getfile(lambda: None)) +os.sys.path.append(os.path.join(tutorial_dir, "../../../tests/python/relay")) +from test_pipeline_executor import graph_split + +########################################### +# Splitting the network into two subgraphs. +split_config = [{"op_name": "nn.relu", "op_index": 0}] +subgraphs = graph_split(net["main"], split_config, params) +########################################################### +# The generated subgraphs should look something like below. + +""" +#subgraphs[0]) + + def @main(%data: Tensor[(1, 3, img_size, img_size), float16]) { + %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float16] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float16] */; + %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float16] */, meta[relay.Constant][2] /* ty=Tensor[(16), float16]*/, meta[relay.Constant][3] /* ty=Tensor[(16), float16] */, meta[relay.Constant][4] /* ty=Tensor[(16), float16] */) /* ty=(Tensor[(1,16, img_size, img_size), float16], Tensor[(16), float16], Tensor[(16), float16]) */; + %2 = %1.0; + nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float16] */ + } + +#subgraphs[1] + + def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 8), float16] */) { + %0 = nn.batch_flatten(%data_n_0) /* ty=Tensor[(1, 1024), float16] */; + nn.dense(%0, meta[relay.Constant][0] /* ty=Tensor[(1, 1024), float16] */, units=None) /* ty=Tensor[(1, 1), float16] */ + } + +""" + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + +######################################### +# Build the subgraph with cutlass target. +# --------------------------------------- + +cutlass = tvm.target.Target( + { + "kind": "cutlass", + "sm": int(tvm.target.Target("cuda").arch.split("_")[1]), + "use_3xtf32": True, + "split_k_slices": [1], + "profile_all_alignments": False, + "find_first_valid": True, + "use_multiprocessing": True, + "use_fast_math": False, + "tmp_dir": "./tmp", + }, + host=tvm.target.Target("llvm"), +) + + +def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"): + target = [target, cutlass] + lib = relay.build_module.build( + mod, target=target, params=params, target_host=target_host, mod_name=mod_name + ) + return lib + + +########################################################### +# Run the two subgraphs in pipeline with pipeline executor. +# --------------------------------------------------------- +# Set 'USE_PIPELINE_EXECUTOR' as ON, and set USE_CUTLASS' as ON in cmake. +from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build + +######################################### +# Create subgraph pipeline configuration. +# Associate a subgraph module with a target. +# Use CUTLASS BYOC to build the second subgraph module. +mod0, mod1 = subgraphs[0], subgraphs[1] +# Use cutlass as the codegen. +mod1 = partition_for_cutlass(mod1) +################################################# +# Get the pipeline executor configuration object. +pipe_config = pipeline_executor_build.PipelineConfig() +########################################################################### +# Set the compile target of the subgraph module. +pipe_config[mod0].target = "llvm" +pipe_config[mod0].dev = tvm.cpu(0) +############################################################## +# Set the compile target of the second subgraph module as cuda. +pipe_config[mod1].target = "cuda" +pipe_config[mod1].dev = tvm.device("cuda", 0) +pipe_config[mod1].build_func = cutlass_build +pipe_config[mod1].export_cc = "nvcc" +# Create the pipeline by connecting the subgraph modules. +# The global input will be forwarded to the input interface of the first module named mod0 +pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"]) +# The first output of mod0 will be forwarded to the input interface of mod1 +pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"]) +# The first output of mod1 will be the first global output. +pipe_config[mod1]["output"][0].connect(pipe_config["output"][0]) +###################################### +# The pipeline configuration as below. +""" +print(pipe_config) + Inputs + |data: mod0:data + + output + |output(0) : mod1.output(0) + + connections + |mod0.output(0)-> mod1.data_n_0 +""" + +# sphinx_gallery_start_ignore +from tvm import testing + +# testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore +############################## +# Build the pipeline executor. +# ---------------------------- +with tvm.transform.PassContext(opt_level=3): + pipeline_mod_factory = pipeline_executor_build.build(pipe_config) +############################################### +# Export the parameter configuration to a file. +directory_path = tvm.contrib.utils.tempdir().temp_dir +os.makedirs(directory_path, exist_ok=True) +config_file_name = pipeline_mod_factory.export_library(directory_path) +################################################################ +# Use the load function to create and initialize PipelineModule. +# -------------------------------------------------------------- +pipeline_module = pipeline_executor.PipelineModule.load_library(config_file_name) + +############################ +# Run the pipeline executor. +# -------------------------- +# Allocate input data. +data = np.random.uniform(-1, 1, size=data_shape).astype("float16") +pipeline_module.set_input("data", tvm.nd.array(data)) +########################################################################## +# Run the two subgraph in the pipeline mode to get the output asynchronously +# or synchronously. In the following example, it is synchronous. +pipeline_module.run() +outputs = pipeline_module.get_output() +###################################### +# Use graph_executor for verification. +# ------------------------------------ +# Run these two subgraphs in sequence with graph_executor to get the output. +target = "llvm" +dev0 = tvm.device(target, 0) +lib0 = relay.build_module.build(mod0, target, params=params) +module0 = runtime.GraphModule(lib0["default"](dev0)) +cuda = tvm.target.Target("cuda", host=tvm.target.Target("llvm")) +lib1 = relay.build_module.build(mod1, [cuda, cutlass], params=params) +lib1 = finalize_modules(lib1, "compile.so", "./tmp") + +dev1 = tvm.device("cuda", 0) + +module1 = runtime.GraphModule(lib1["default"](dev1)) + +module0.set_input("data", data) +module0.run() +out_shape = (1, 16, img_size, img_size) +out = module0.get_output(0, tvm.nd.empty(out_shape, "float16")) +module1.set_input("data_n_0", out) +module1.run() +out_shape = (1, 1) +out = module1.get_output(0, tvm.nd.empty(out_shape, "float16")) +#################### +# Verify the result. +tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy()) diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py index 5ef309bb28..b614630737 100644 --- a/python/tvm/contrib/pipeline_executor.py +++ b/python/tvm/contrib/pipeline_executor.py @@ -17,6 +17,7 @@ """Pipeline executor that executes a series of modules in a pipeline fashion.""" import json import os +import time from tvm import runtime from tvm._ffi import get_global_func from tvm.contrib import graph_executor @@ -131,14 +132,26 @@ class PipelineModule(object): """ return self._get_input(key) - def get_output(self): + def get_output(self, synchronize=True, sleep_interval=0.001): """Get the output. Returns ------- data : Array[NDArray] A list of output data. + synchronize : BOOL + Whether to do a synchronize poll. + sleep_interval : Float32 + When doing the synchronize loop poll, how many seconds the loop should sleep for yield. """ - return self._get_output() + outputs = [] + if not synchronize: + outputs = self._get_output() + else: + while not outputs: + outputs = self._get_output() + time.sleep(sleep_interval) + + return outputs @property def num_executing_pipeline(self): @@ -302,11 +315,16 @@ class PipelineExecutorFactoryModule(object): self.pipeline_mods[lib_index]["dev"].device_type, self.pipeline_mods[lib_index]["dev"].device_id, ) - # Get the graph, lib, and parameters from GraphExecutorFactoryModule. lib = self.pipeline_mods[lib_index]["lib"] # Export the lib, graph, and parameters to disk. - lib.export_library(mconfig["lib_name"]) + if self.pipeline_mods[lib_index]["export_cc"]: + lib.export_library( + mconfig["lib_name"], cc=self.pipeline_mods[lib_index]["export_cc"] + ) + else: + lib.export_library(mconfig["lib_name"]) + with open(mconfig["json_name"], "w") as file_handle: file_handle.write(lib.graph_json) with open(mconfig["params_name"], "wb") as file_handle: diff --git a/python/tvm/contrib/pipeline_executor_build.py b/python/tvm/contrib/pipeline_executor_build.py index 520156b474..324383ab7c 100644 --- a/python/tvm/contrib/pipeline_executor_build.py +++ b/python/tvm/contrib/pipeline_executor_build.py @@ -86,7 +86,12 @@ def build(pipe_configs): # Use "mod_idx" as the key to create a "module_connection" map which is not only # for the module index but also for the module connection used to build the pipeline. module_string_config[mod_idx] = pipe_config - libs[mod_idx] = {"lib": lib, "dev": dev, "fcompile": mod_config["fcompile"]} + libs[mod_idx] = { + "lib": lib, + "dev": dev, + "fcompile": mod_config["fcompile"], + "export_cc": mod_config["export_cc"], + } # Creating a text form configuration to record the "input_connection" and the # "module_connection" information. The "input_connection" is used to record the @@ -132,10 +137,7 @@ def export_library(factory, directory_path): mconfig["json_name"] = "{}/json{}".format(directory_path, lib_index) mconfig["params_name"] = "{}/params{}".format(directory_path, lib_index) lib_config = factory.pipeline_mods[lib_index] - mconfig["dev"] = "{},{}".format( - lib_config["dev"].device_type, - lib_config["dev"].device_id, - ) + mconfig["dev"] = "{},{}".format(lib_config["dev"].device_type, lib_config["dev"].device_id) fcompile = lib_config["fcompile"] if not fcompile: fcompile = False @@ -413,6 +415,7 @@ class PipelineConfig(object): self.fcompile = None self.name = None self.dev = None + self.export_cc = None self.cpu_affinity = "" self.idx = None self.mod = mod @@ -601,6 +604,7 @@ class PipelineConfig(object): "target": module.target, "fcompile": module.fcompile, "dev": module.dev, + "export_cc": module.export_cc, } # Creating a map including pipeline inputs and subgraph inputs. diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh index 9a71983886..f79076e213 100755 --- a/tests/scripts/task_config_build_gpu.sh +++ b/tests/scripts/task_config_build_gpu.sh @@ -47,3 +47,5 @@ echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake echo set\(USE_CCACHE OFF\) >> config.cmake echo set\(SUMMARIZE ON\) >> config.cmake echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake +echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake +echo set\(USE_CUTLASS ON\) >> config.cmake