insop commented on pull request #7111: URL: https://github.com/apache/tvm/pull/7111#issuecomment-745979049
@tsupei , @zhiics How about this? by this, we can test. Patch is [here](https://github.com/insop/incubator-tvm/commit/9b7b1589f4a05a739e19652b7a2705e4bfc7bfcb.patch). ``` From 9b7b1589f4a05a739e19652b7a2705e4bfc7bfcb Mon Sep 17 00:00:00 2001 From: Insop Song <insop.s...@gmail.com> Date: Wed, 16 Dec 2020 01:19:04 -0800 Subject: [PATCH] Add test to dynamic batch matmul --- .../topi/python/test_topi_batch_matmul.py | 38 ++++++++++++++++--- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/tests/python/topi/python/test_topi_batch_matmul.py b/tests/python/topi/python/test_topi_batch_matmul.py index e939f6c21..be6552f03 100644 --- a/tests/python/topi/python/test_topi_batch_matmul.py +++ b/tests/python/topi/python/test_topi_batch_matmul.py @@ -32,10 +32,24 @@ _batch_matmul_implement = { } -def verify_batch_matmul(x_batch, y_batch, M, N, K): - x = te.placeholder((x_batch, M, K), name="x") - y = te.placeholder((y_batch, N, K), name="y") - dtype = x.dtype +def verify_batch_matmul(x_batch, y_batch, M, N, K, dynamic=False, debug=False): + + if not dynamic: + x = te.placeholder((x_batch, M, K), name="x") + y = te.placeholder((y_batch, N, K), name="y") + dtype = x.dtype + else: + assert x_batch == y_batch or x_batch == 1 or y_batch == 1 + batch_size = max(x_batch, y_batch) + dynamic_batch_size = te.var("dynamic_batch_size") + dynamic_M = te.var("dynamic_M") + dynamic_N = te.var("dynamic_N") + dynamic_K = te.var("dynamic_K") + + x = te.placeholder((dynamic_batch_size, dynamic_M, dynamic_K), name="x") + y = te.placeholder((dynamic_batch_size, dynamic_N, dynamic_K), name="y") + dtype = x.dtype + # use memoize to pickle the test data for next time use @memoize("topi.tests.test_topi_batch_matmul") @@ -53,10 +67,19 @@ def verify_batch_matmul(x_batch, y_batch, M, N, K): with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _batch_matmul_implement) out = fcompute(x, y) - s = fschedule([out]) + if not dynamic: + s = fschedule([out]) + 
out_shape = out.shape + else: + s = te.create_schedule(out.op) + out_shape = (batch_size, M, N) + + if debug: + print(tvm.lower(s, [x, y, out], simple_mode=True)) + a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(out_shape), dtype=dtype), ctx) f = tvm.build(s, [x, y, out], device, name="dense") f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) @@ -75,6 +98,9 @@ def test_batch_matmul(): verify_batch_matmul(1, 5, 16, 16, 32) verify_batch_matmul(5, 1, 16, 16, 32) + # Test dynamic batch + verify_batch_matmul(1, 1, 16, 16, 32, dynamic=True, debug=True) + verify_batch_matmul(5, 5, 16, 16, 32, dynamic=True) if __name__ == "__main__": test_batch_matmul() -- 2.28.0 ``` Test results ``` $ python ./test_topi_batch_matmul.py Running on target: llvm -device=arm_cpu Cannot find config for target=llvm -keys=arm_cpu,cpu -device=arm_cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (1, 16, 32), 'float32'), ('TENSOR', (1, 16, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (1, 16, 32), 'float32'), ('TENSOR', (1, 16, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm -device=arm_cpu Cannot find config for target=llvm -keys=arm_cpu,cpu -device=arm_cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (5, 16, 32), 'float32'), ('TENSOR', (5, 16, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (5, 16, 32), 'float32'), ('TENSOR', (5, 16, 32), 'float32')).
A fallback configuration is used, which may bring great performance regression. Running on target: llvm -device=arm_cpu Cannot find config for target=llvm -keys=arm_cpu,cpu -device=arm_cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (5, 16, 32), 'float32'), ('TENSOR', (5, 20, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (5, 16, 32), 'float32'), ('TENSOR', (5, 20, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm -device=arm_cpu Cannot find config for target=llvm -keys=arm_cpu,cpu -device=arm_cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (30, 16, 32), 'float32'), ('TENSOR', (30, 20, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (30, 16, 32), 'float32'), ('TENSOR', (30, 20, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm -device=arm_cpu Cannot find config for target=llvm -keys=arm_cpu,cpu -device=arm_cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (1, 16, 32), 'float32'), ('TENSOR', (5, 16, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (1, 16, 32), 'float32'), ('TENSOR', (5, 16, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. 
Running on target: llvm -device=arm_cpu Cannot find config for target=llvm -keys=arm_cpu,cpu -device=arm_cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (5, 16, 32), 'float32'), ('TENSOR', (1, 16, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (5, 16, 32), 'float32'), ('TENSOR', (1, 16, 32), 'float32')). A fallback configuration is used, which may bring great performance regression. Running on target: llvm -device=arm_cpu Cannot find config for target=llvm -keys=arm_cpu,cpu -device=arm_cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (dynamic_batch_size, dynamic_M, dynamic_K), 'float32'), ('TENSOR', (dynamic_batch_size, dynamic_N, dynamic_K), 'float32')). A fallback configuration is used, which may bring great performance regression. primfn(x_1: handle, y_1: handle, compute_1: handle) -> () attr = {"global_symbol": "main", "tir.noalias": True} buffers = {compute: Buffer(compute_2: Pointer(float32), float32, [dynamic_batch_size: int32, dynamic_M: int32, dynamic_N: int32], [stride: int32, stride_1: int32, stride_2: int32], type="auto"), y: Buffer(y_2: Pointer(float32), float32, [dynamic_batch_size, dynamic_N, dynamic_K: int32], [stride_3: int32, stride_4: int32, stride_5: int32], type="auto"), x: Buffer(x_2: Pointer(float32), float32, [dynamic_batch_size, dynamic_M, dynamic_K], [stride_6: int32, stride_7: int32, stride_8: int32], type="auto")} buffer_map = {x_1: x, y_1: y, compute_1: compute} { for (b: int32, 0, dynamic_batch_size) { for (i: int32, 0, dynamic_M) { for (j: int32, 0, dynamic_N) { compute_2[(((b*stride) + (i*stride_1)) + (j*stride_2))] = 0f32 for (k: int32, 0, dynamic_K) { compute_2[(((b*stride) + (i*stride_1)) + (j*stride_2))] = ((float32*)compute_2[(((b*stride) + (i*stride_1)) + (j*stride_2))] + ((float32*)x_2[(((b*stride_6) + (i*stride_7)) + 
(k*stride_8))]*(float32*)y_2[(((b*stride_3) + (j*stride_4)) + (k*stride_5))])) } } } } } Running on target: llvm Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('batch_matmul.x86', ('TENSOR', (dynamic_batch_size, dynamic_M, dynamic_K), 'float32'), ('TENSOR', (dynamic_batch_size, dynamic_N, dynamic_K), 'float32')). A fallback configuration is used, which may bring great performance regression. primfn(x_1: handle, y_1: handle, compute_1: handle) -> () attr = {"global_symbol": "main", "tir.noalias": True} buffers = {y: Buffer(y_2: Pointer(float32), float32, [dynamic_batch_size: int32, dynamic_N: int32, dynamic_K: int32], [stride: int32, stride_1: int32, stride_2: int32], type="auto"), compute: Buffer(compute_2: Pointer(float32), float32, [dynamic_batch_size, dynamic_M: int32, dynamic_N], [stride_3: int32, stride_4: int32, stride_5: int32], type="auto"), x: Buffer(x_2: Pointer(float32), float32, [dynamic_batch_size, dynamic_M, dynamic_K], [stride_6: int32, stride_7: int32, stride_8: int32], type="auto")} buffer_map = {x_1: x, y_1: y, compute_1: compute} { for (b: int32, 0, dynamic_batch_size) { for (i: int32, 0, dynamic_M) { for (j: int32, 0, dynamic_N) { compute_2[(((b*stride_3) + (i*stride_4)) + (j*stride_5))] = 0f32 for (k: int32, 0, dynamic_K) { compute_2[(((b*stride_3) + (i*stride_4)) + (j*stride_5))] = ((float32*)compute_2[(((b*stride_3) + (i*stride_4)) + (j*stride_5))] + ((float32*)x_2[(((b*stride_6) + (i*stride_7)) + (k*stride_8))]*(float32*)y_2[(((b*stride) + (j*stride_1)) + (k*stride_2))])) } } } } } ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org