In my understanding, autotvm implements operators based on an operation
(compute) definition and a schedule definition.
For example, there are pack and no pack implementations in x86/dense.py.
In the process of autotuning, the implementation-defined configs are fed into
the actual measurement, but I could not find any correctness verification
similar to tvm.testing.assert_allclose() anywhere in the tuning process.
So I completely extracted the pack and nopack in x86/dense.py and checked the
result of tvm.testing.assert_allclose(), and found:
1. If the axis split is implemented in the operation itself, the tile size must
evenly divide the axis length; otherwise the result is not guaranteed to be correct;
2. For other axes, which are not split in the operation in advance, the tile
size does not necessarily have to divide the axis length evenly;
3. Next, I compared the lowered code for these cases, and found that the code
generated for a tile size that does not evenly divide the axis in case 1 is wrong;
4. So I think this should be a code generation bug.
For example, I tested x86/dense.py (nopack implementation) with
M N K = 1 1000 512 and Tm Tn Tk = 1 100 10:
```
produce compute {
parallel (y.outer.x.outer.fused, 0, 10) {
produce compute {
for (z.y.fused.init, 0, 100) {
compute[ramp(((y.outer.x.outer.fused*1000) + (z.y.fused.init*10)), 1,
10)] = x10(0f)
}
for (k, 0, 51) {
for (z.y.fused, 0, 100) {
compute[ramp(((y.outer.x.outer.fused*1000) + (z.y.fused*10)), 1, 10)]
= (compute[ramp(((y.outer.x.outer.fused*1000) + (z.y.fused*10)), 1, 10)] +
(data[ramp((k*10), 1, 10)]*weight[ramp((((y.outer.x.outer.fused*51200) +
(z.y.fused*512)) + (k*10)), 1, 10)]))
}
}
}
for (x.inner, 0, 100) {
compute[((y.outer.x.outer.fused*100) + x.inner)] = 0f
for (kk, 0, 10) {
compute[((y.outer.x.outer.fused*100) + x.inner)] =
(compute[((y.outer.x.outer.fused*100) + x.inner)] +
compute[(((y.outer.x.outer.fused*1000) + (x.inner*10)) + kk)])
}
}
}
}
Traceback (most recent call last):
File "OpsGemm/gemm_v3_scheduling.py", line 388, in <module>
buildandevaluation(s, data, weight, out, a, bt, ct, ctx, ct_np)
File "OpsGemm/gemm_v3_scheduling.py", line 44, in buildandevaluation
tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
File "/root/tvm/python/tvm/testing.py", line 29, in assert_allclose
np.testing.assert_allclose(actual, desired, rtol=rtol, atol=atol,
verbose=True)
File
"/anaconda3/lib/python3.7/site-packages/numpy/testing/_private/utils.py", line
1452, in assert_allclose
verbose=verbose, header=header, equal_nan=equal_nan)
File
"/anaconda3/lib/python3.7/site-packages/numpy/testing/_private/utils.py", line
789, in assert_array_compare
raise AssertionError(msg)
AssertionError:
Not equal to tolerance rtol=1e-05, atol=1e-07
(mismatch 100.0%)
x: array([128.66817 , 119.130806, 129.70555 , 126.0419 , 126.232285,
129.39488 , 128.2362 , 124.842926, 128.9357 , 126.89033 ,
132.58101 , 128.5313 , 129.82468 , 129.89973 , 125.16623 ,...
y: array([128.91081 , 119.22253 , 130.08371 , 126.27693 , 126.305466,
129.55656 , 128.48164 , 124.97862 , 129.08452 , 127.26959 ,
132.87903 , 128.80084 , 129.95262 , 130.14275 , 125.569626,...
```
If M N K = 1 1000 512 and Tm Tn Tk = 1 100 16:
```
produce compute {
parallel (y.outer.x.outer.fused, 0, 10) {
produce compute {
for (z.y.fused.init, 0, 100) {
compute[ramp(((y.outer.x.outer.fused*1600) + (z.y.fused.init*16)), 1,
16)] = x16(0f)
}
for (k, 0, 32) {
for (z.y.fused, 0, 100) {
compute[ramp(((y.outer.x.outer.fused*1600) + (z.y.fused*16)), 1, 16)]
= (compute[ramp(((y.outer.x.outer.fused*1600) + (z.y.fused*16)), 1, 16)] +
(data[ramp((k*16), 1, 16)]*weight[ramp((((y.outer.x.outer.fused*51200) +
(z.y.fused*512)) + (k*16)), 1, 16)]))
}
}
}
for (x.inner, 0, 100) {
compute[((y.outer.x.outer.fused*100) + x.inner)] = 0f
for (kk, 0, 16) {
compute[((y.outer.x.outer.fused*100) + x.inner)] =
(compute[((y.outer.x.outer.fused*100) + x.inner)] +
compute[(((y.outer.x.outer.fused*1600) + (x.inner*16)) + kk)])
}
}
}
}
time: 0.000062
```
If M N K = 1 1000 512 and Tm Tn Tk = 1 23 16:
```
produce compute {
parallel (y.outer.x.outer.fused, 0, 44) {
produce compute {
for (z.y.fused.init, 0, 23) {
compute[ramp(((y.outer.x.outer.fused*368) + (z.y.fused.init*16)), 1,
16)] = x16(0f)
}
for (k, 0, 32) {
for (z.y.fused, 0, 23) {
if (likely((((y.outer.x.outer.fused*23) + z.y.fused) < 1000))) {
compute[ramp(((y.outer.x.outer.fused*368) + (z.y.fused*16)), 1,
16)] = (compute[ramp(((y.outer.x.outer.fused*368) + (z.y.fused*16)), 1, 16)] +
(data[ramp((k*16), 1, 16)]*weight[ramp((((y.outer.x.outer.fused*11776) +
(z.y.fused*512)) + (k*16)), 1, 16)]))
}
}
}
}
for (x.inner, 0, 23) {
if (likely((((y.outer.x.outer.fused*23) + x.inner) < 1000))) {
compute[((y.outer.x.outer.fused*23) + x.inner)] = 0f
}
for (kk, 0, 16) {
if (likely((((y.outer.x.outer.fused*23) + x.inner) < 1000))) {
if (likely((((y.outer.x.outer.fused*23) + x.inner) < 1000))) {
compute[((y.outer.x.outer.fused*23) + x.inner)] =
(compute[((y.outer.x.outer.fused*23) + x.inner)] +
compute[(((y.outer.x.outer.fused*368) + (x.inner*16)) + kk)])
}
}
}
}
}
}
time: 0.000958
```
(P.S. In order to inspect the lowered code, I disabled all unroll operations.)
Is there something I have missed? And how does autotvm ensure the correctness
of the results during the autotuning process?
---
[Visit
Topic](https://discuss.tvm.apache.org/t/how-to-verify-the-correctness-of-different-schedule-and-tile-size-in-autotvm/8143/1)
to respond.
You are receiving this because you enabled mailing list mode.
To unsubscribe from these emails, [click
here](https://discuss.tvm.apache.org/email/unsubscribe/4d37445b7b155ceff55f827f2db7b69d67271750c46d9ec080008a827d1edac7).