Here is an example of intrinsic selection.
```
for (i, 0, 65535) {
C[i] = (A[i] + B[i])
}
```
```
Call Engine: veadd_mm
// normal ===stmt cost : 2061.94 (smallest cost) shape : 1x65535
[ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0,
(int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0,
(int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0,
(int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65535, "CSR_SHAPE_S1_ROW",
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
]
// normal and align === stmt cost : 2071.91 shape : 1x65472
[ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0,
(int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0,
(int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0,
(int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65472, "CSR_SHAPE_S1_ROW",
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)65472,
(int64)63, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)65472,
(int64)63, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)65472,
(int64)63, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)63, "CSR_SHAPE_S1_ROW",
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
]
// reshape === stmt cost : 131080
[ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0,
(int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0,
(int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0,
(int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW",
(int64)65535, "CSR_STRIDE_D", 0, "CSR_STRIDE_S", 0))
]
// === stmt cost : 786420
[ for (i, 0, (int64)65535) {
tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i),
((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), A,
int64(i), ((int64)65535 - int64(i)), 1),
tir.tvm_access_ptr(tir.type_annotation(), B, int64(i), ((int64)65535 -
int64(i)), 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW",
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
}
]
Call Engine: veadd_mv_dimh
// normal === stmt cost : 3085.91
[ tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0,
(int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0,
(int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0,
(int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65535, "CSR_SHAPE_S1_ROW",
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
]
// normal and align === stmt cost : 2069.94
[ tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0,
(int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0,
(int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0,
(int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65472, "CSR_SHAPE_S1_ROW",
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)65472,
(int64)63, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)65472,
(int64)63, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)65472,
(int64)63, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)63, "CSR_SHAPE_S1_ROW",
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
]
// === stmt cost : 720885
[ for (i, 0, (int64)65535) {
tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i),
((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), B,
int64(i), ((int64)65535 - int64(i)), 1),
tir.tvm_access_ptr(tir.type_annotation(), A, int64(i), ((int64)65535 -
int64(i)), 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW",
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
}
]
Call Engine: veadd_mf
// === stmt cost : 720885
[ for (i, 0, (int64)65535) {
tx.veadd_mf(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i),
((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), B,
int64(i), ((int64)65535 - int64(i)), 1), A[i], tx.csrw("CSR_SHAPE_S1_COL",
(int64)1, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0,
"CSR_STRIDE_S", (int64)0))
}
]
```
So we need a big module (lots of design and code) to emit intrinsics;
tensorization doesn't fit NPUs well in the first place.
---
[Visit
Topic](https://discuss.tvm.apache.org/t/do-we-have-any-way-to-process-codegen-with-more-fine-grade-control/9908/9)
to respond.
You are receiving this because you enabled mailing list mode.
To unsubscribe from these emails, [click
here](https://discuss.tvm.apache.org/email/unsubscribe/90426346a6e940da5c2d7f8a7e430372529e40098a9ed0f5c0c18b2365137cb1).