insop commented on pull request #7111: URL: https://github.com/apache/tvm/pull/7111#issuecomment-745979049
@tsupei , @zhiics How about this? by this, we can test. Patch is [here](https://github.com/insop/incubator-tvm/commit/9b7b1589f4a05a739e19652b7a2705e4bfc7bfcb.patch). ``` From 9b7b1589f4a05a739e19652b7a2705e4bfc7bfcb Mon Sep 17 00:00:00 2001 From: Insop Song <insop.s...@gmail.com> Date: Wed, 16 Dec 2020 01:19:04 -0800 Subject: [PATCH] Add test to dynamic batch matmul --- .../topi/python/test_topi_batch_matmul.py | 38 ++++++++++++++++--- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/tests/python/topi/python/test_topi_batch_matmul.py b/tests/python/topi/python/test_topi_batch_matmul.py index e939f6c21..be6552f03 100644 --- a/tests/python/topi/python/test_topi_batch_matmul.py +++ b/tests/python/topi/python/test_topi_batch_matmul.py @@ -32,10 +32,24 @@ _batch_matmul_implement = { } -def verify_batch_matmul(x_batch, y_batch, M, N, K): - x = te.placeholder((x_batch, M, K), name="x") - y = te.placeholder((y_batch, N, K), name="y") - dtype = x.dtype +def verify_batch_matmul(x_batch, y_batch, M, N, K, dynamic=False, debug=False): + + if not dynamic: + x = te.placeholder((x_batch, M, K), name="x") + y = te.placeholder((y_batch, N, K), name="y") + dtype = x.dtype + else: + assert x_batch == y_batch or x_batch == 1 or y_batch == 1 + batch_size = max(x_batch, y_batch) + dynamic_batch_size = te.var("dynamic_batch_size") + dynamic_M = te.var("dynamic_M") + dynamic_N = te.var("dynamic_N") + dynamic_K = te.var("dynamic_K") + + x = te.placeholder((dynamic_batch_size, dynamic_M, dynamic_K), name="x") + y = te.placeholder((dynamic_batch_size, dynamic_N, dynamic_K), name="y") + dtype = x.dtype + # use memoize to pickle the test data for next time use @memoize("topi.tests.test_topi_batch_matmul") @@ -53,10 +67,19 @@ def verify_batch_matmul(x_batch, y_batch, M, N, K): with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _batch_matmul_implement) out = fcompute(x, y) - s = fschedule([out]) + if not dynamic: + s = fschedule([out]) + 
out_shape = out.shape + else: + s = te.create_schedule(out.op) + out_shape = (batch_size, M, N) + + if debug: + print(tvm.lower(s, [x, y, out], simple_mode=True)) + a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(out_shape), dtype=dtype), ctx) f = tvm.build(s, [x, y, out], device, name="dense") f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) @@ -75,6 +98,9 @@ def test_batch_matmul(): verify_batch_matmul(1, 5, 16, 16, 32) verify_batch_matmul(5, 1, 16, 16, 32) + # Test dynamic batch + verify_batch_matmul(1, 1, 16, 16, 32, dynamic=True, debug=True) + verify_batch_matmul(5, 5, 16, 16, 32, dynamic=True) if __name__ == "__main__": test_batch_matmul() -- 2.28.0 ``` Test results ``` $ python ./test_topi_batch_matmul.py Running on target: llvm -device=arm_cpu Cannot find config for target=llvm -keys=arm_cpu,cpu -device=arm_cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (1, 16, 32), 'float32'), ('TENSOR', (1, 16, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (1, 16, 32), 'float32'), ('TENSOR', (1, 16, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm -device=arm_cpu Cannot find config for target=llvm -keys=arm_cpu,cpu -device=arm_cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (5, 16, 32), 'float32'), ('TENSOR', (5, 16, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (5, 16, 32), 'float32'), ('TENSOR', (5, 16, 32), 'float32')).
A fallback configuration is used, which may bring great performance regression. Running on target: llvm -device=arm_cpu Cannot find config for target=llvm -keys=arm_cpu,cpu -device=arm_cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (5, 16, 32), 'float32'), ('TENSOR', (5, 20, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (5, 16, 32), 'float32'), ('TENSOR', (5, 20, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm -device=arm_cpu Cannot find config for target=llvm -keys=arm_cpu,cpu -device=arm_cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (30, 16, 32), 'float32'), ('TENSOR', (30, 20, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (30, 16, 32), 'float32'), ('TENSOR', (30, 20, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm -device=arm_cpu Cannot find config for target=llvm -keys=arm_cpu,cpu -device=arm_cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (1, 16, 32), 'float32'), ('TENSOR', (5, 16, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (1, 16, 32), 'float32'), ('TENSOR', (5, 16, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. 
Running on target: llvm -device=arm_cpu Cannot find config for target=llvm -keys=arm_cpu,cpu -device=arm_cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (5, 16, 32), 'float32'), ('TENSOR', (1, 16, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (5, 16, 32), 'float32'), ('TENSOR', (1, 16, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm -device=arm_cpu Cannot find config for target=llvm -keys=arm_cpu,cpu -device=arm_cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (dynamic_batch_size, dynamic_M, dynamic_K), 'float32'), ('TENSOR', (dynamic_batch_size, dynamic_N, dynamic_K), 'float32')). A fallback configuration is used, which may bring great performance regression. primfn(x_1: handle, y_1: handle, compute_1: handle) -> () attr = {"global_symbol": "main", "tir.noalias": True} buffers = {compute: Buffer(compute_2: Pointer(float32), float32, [dynamic_batch_size: int32, dynamic_M: int32, dynamic_N: int32], [stride: int32, stride_1: int32, stride_2: int32], type="auto"), y: Buffer(y_2: Pointer(float32), float32, [dynamic_batch_size, dynamic_N, dynamic_K: int32], [stride_3: int32, stride_4: int32, stride_5: int32], type="auto"), x: Buffer(x_2: Pointer(float32), float32, [dynamic_batch_size, dynamic_M, dynamic_K], [stride_6: int32, stride_7: int32, stride_8: int32], type="auto")} buffer_map = {x_1: x, y_1: y, compute_1: compute} { for (b: int32, 0, dynamic_batch_size) { for (i: int32, 0, dynamic_M) { for (j: int32, 0, dynamic_N) { compute_2[(((b*stride) + (i*stride_1)) + (j*stride_2))] = 0f32 for (k: int32, 0, dynamic_K) { compute_2[(((b*stride) + (i*stride_1)) + (j*stride_2))] = ((float32*)compute_2[(((b*stride) + (i*stride_1)) + (j*stride_2))] + ((float32*)x_2[(((b*stride_6) + (i*stride_7)) + 
(k*stride_8))]*(float32*)y_2[(((b*stride_3) + (j*stride_4)) + (k*stride_5))])) } } } } } Running on target: llvm Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (dynamic_batch_size, dynamic_M, dynamic_K), 'float32'), ('TENSOR', (dynamic_batch_size, dynamic_N, dynamic_K), 'float32')). A fallback configuration is used, which may bring great performance regression. primfn(x_1: handle, y_1: handle, compute_1: handle) -> () attr = {"global_symbol": "main", "tir.noalias": True} buffers = {y: Buffer(y_2: Pointer(float32), float32, [dynamic_batch_size: int32, dynamic_N: int32, dynamic_K: int32], [stride: int32, stride_1: int32, stride_2: int32], type="auto"), compute: Buffer(compute_2: Pointer(float32), float32, [dynamic_batch_size, dynamic_M: int32, dynamic_N], [stride_3: int32, stride_4: int32, stride_5: int32], type="auto"), x: Buffer(x_2: Pointer(float32), float32, [dynamic_batch_size, dynamic_M, dynamic_K], [stride_6: int32, stride_7: int32, stride_8: int32], type="auto")} buffer_map = {x_1: x, y_1: y, compute_1: compute} { for (b: int32, 0, dynamic_batch_size) { for (i: int32, 0, dynamic_M) { for (j: int32, 0, dynamic_N) { compute_2[(((b*stride_3) + (i*stride_4)) + (j*stride_5))] = 0f32 for (k: int32, 0, dynamic_K) { compute_2[(((b*stride_3) + (i*stride_4)) + (j*stride_5))] = ((float32*)compute_2[(((b*stride_3) + (i*stride_4)) + (j*stride_5))] + ((float32*)x_2[(((b*stride_6) + (i*stride_7)) + (k*stride_8))]*(float32*)y_2[(((b*stride) + (j*stride_1)) + (k*stride_2))])) } } } } } ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org