[GitHub] [tvm] csullivan commented on a diff in pull request #12587: [Hexagon] Initial support for meta schedule tuning

2022-08-26 · GitBox


csullivan commented on code in PR #12587:
URL: https://github.com/apache/tvm/pull/12587#discussion_r956192435


##
tests/python/contrib/test_hexagon/test_meta_schedule.py:
##
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test rpc based launcher for hexagon """
+import pytest
+import numpy as np
+import tempfile
+
+import tvm.testing
+from tvm import te
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.arg_info import TensorInfo
+from tvm.meta_schedule.builder import BuilderInput
+from tvm.script import tir as T
+from tvm.tir import FloatImm
+from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
+from tvm.meta_schedule.runner import RunnerInput
+from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
+
+MATMUL_N = 16
+MATMUL_M = 32
+
+
+@tvm.script.ir_module
+class MatmulModule:
+    @T.prim_func
+    def main(a: T.handle, b: T.handle, c: T.handle) -> None:  # pylint: disable=no-self-argument
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, (16, 16), "float32")
+        B = T.match_buffer(b, (16, 16), "float32")
+        C = T.match_buffer(c, (16, 16), "float32")
+        for i, j, k in T.grid(16, 16, 16):
+            with T.block("matmul"):
+                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+                with T.init():
+                    C[vi, vj] = 0.0
+                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
+@tvm.testing.requires_hexagon
+def test_builder_runner(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":
+        pytest.skip(msg="Tuning on simulator not supported.")
+
+    target_hexagon = tvm.target.hexagon("v68", link_params=True)
+    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+    mod = MatmulModule
+
+    builder = get_hexagon_local_builder()
+    runner = get_hexagon_rpc_runner(hexagon_launcher, number=1, repeat=1, min_repeat_ms=0)
+
+    (builder_result,) = builder.build([BuilderInput(mod, target)])
+    assert builder_result.artifact_path is not None
+    assert builder_result.error_msg is None
+
+    runner_input = RunnerInput(
+        builder_result.artifact_path,
+        "llvm",
+        [
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+        ],
+    )
+
+    # Run the module
+    (runner_future,) = runner.run([runner_input])
+    runner_result = runner_future.result()
+
+    assert runner_result.error_msg is None
+    for result in runner_result.run_secs:
+        if isinstance(result, FloatImm):
+            result = result.value
+        assert isinstance(result, float)
+        assert result >= 0.0
+
+
+def dense(m, n, k):
+    X = te.placeholder((m, k), name="X", dtype="uint8")
+    # Weights are pre-packed into a (n // 32, k // 4, 32, 4) layout so the inner
+    # (32, 4) tile lines up with the Hexagon VRMPY intrinsic used in the schedule.
+    packedW = te.placeholder((n // 32, k // 4, 32, 4), name="packedW", dtype="uint8")
+
+    ak = te.reduce_axis((0, k), name="k")
+    out = te.compute(
+        (m, n),
+        lambda i, j: te.sum(
+            X[i, ak].astype("int32")
+            * packedW[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4].astype(
+                "int32"
+            ),
+            axis=ak,
+        ),
+        name="compute",
+    )
+    return [X, packedW, out]
+
+
+def schedule_dense(sch, block, M, do_tune):
+    a_y, a_x, _ = sch.get_loops(block)[-3:]
+
+    if do_tune:
+        # Let the tuner sample the y tiling; otherwise fall back to a fixed split.
+        y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128)
+        a_yo, a_yi = sch.split(a_y, factors=y_factors)
+    else:
+        a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 32)])
+
+    a_xo, a_xi = sch.split(a_x, factors=[None, 32])
+    sch.reorder(a_yo, a_xo, a_yi, a_xi)
+
+    a_xi, a_k = sch.get_loops(block)[-2:]
+    a_ko, a_ki = sch.split(a_k, factors=[None, 4])
+    sch.reorder(a_ko, a_xi, a_ki)
+
+    fused = sch.fuse(a_yo, a_xo)
+    sch.parallel(fused)
+
+    # Pull the init statement out of the reduction and vectorize it.
+    dec = sch.decompose_reduction(block, a_ko)
+    init_loop = sch.get_loops(dec)[-1]
+    sch.vectorize(init_loop)
+
+    # Map the innermost (32, 4) tile onto the Hexagon VRMPY u8u8i32 instruction.
+    sch.tensorize(a_xi, VRMPY_u8u8i32_INTRIN)
+
+
+def verify_dense(sch, target, M, N, K, hexagon_session):
+    f = tvm.build(sch.mod["main"], target=target, name="dense")
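The archived message truncates the diff here, partway into verify_dense. As a reading aid only, here is a minimal sketch of how a verification helper of this shape plausibly continues — build the scheduled module, load it through the Hexagon RPC session, and check the result against a NumPy reference. The packing loop and variable names below are assumptions, not the PR's exact code:

    def verify_dense(sch, target, M, N, K, hexagon_session):
        # Build for Hexagon and load the module onto the device over RPC.
        f = tvm.build(sch.mod["main"], target=target, name="dense")
        mod = hexagon_session.load_module(f)
        dev = hexagon_session.device

        # Random uint8 operands; the reference result is an int32 matmul (X @ W^T).
        a_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
        b_np = np.random.uniform(1, 10, size=(N, K)).astype("uint8")
        c_np = np.dot(a_np.astype("int32"), b_np.transpose().astype("int32"))

        # Pack W into the (N // 32, K // 4, 32, 4) layout expected by dense():
        # element W[j, k] lands at packedW[j // 32, k // 4, j % 32, k % 4].
        packw_np = np.zeros((N // 32, K // 4, 32, 4), dtype="uint8")
        for j in range(N):
            for k in range(K):
                packw_np[j // 32, k // 4, j % 32, k % 4] = b_np[j, k]

        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(packw_np, dev)
        c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev)

        mod["dense"](a, b, c)
        np.testing.assert_equal(c.numpy(), c_np)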

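The point of get_hexagon_local_builder and get_hexagon_rpc_runner is to let meta schedule compile candidates on the host while benchmarking them on device over RPC. For context, a rough sketch of the tuning entry point they plug into, assuming the ms.tune_tir / ms.TuneConfig API as it stood around this PR; the trial counts and the 768x768 workload shape are illustrative assumptions, not taken from the diff:

    def tune_dense(hexagon_launcher):
        # Hypothetical driver: tune the dense workload above using the Hexagon
        # builder/runner helpers. Names assume the v0.10-era meta_schedule API.
        target_hexagon = tvm.target.hexagon("v68", link_params=True)
        target = tvm.target.Target(target_hexagon, host=target_hexagon)
        workload = te.create_prim_func(dense(MATMUL_M, 768, 768))

        with tempfile.TemporaryDirectory() as work_dir:
            sch = ms.tune_tir(
                mod=workload,
                target=target,
                config=ms.TuneConfig(
                    strategy="replay_trace",
                    num_trials_per_iter=8,
                    max_trials_per_task=8,
                    max_trials_global=8,
                ),
                work_dir=work_dir,
                builder=get_hexagon_local_builder(),
                runner=get_hexagon_rpc_runner(hexagon_launcher, number=10),
            )
        return sch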