================ @@ -662,4 +662,152 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset", }]; } +def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]> { + let summary = "It performs mma computation"; + + let description = [{DPAS performs matrix multiplication on matrix A of `mxk` + size and matrix B of `kxn` size, and accumulates the product onto matrix C of `mxn` size, + producing a result matrix of the same size, where `m=8`, `n=16` and `k=8 * 32/bit_width_of_elem_type`. So for the fp16 + data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`, + and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS + also requires A and B to be loaded with the required data layout. Specifically, + VNNI layout is required for the B operand. This is achieved by setting `vnni_axis = 0` + on the corresponding `load_nd` operator. To keep both operands as 3D vectors, + operand A is loaded by setting `vnni_axis = 1`, which does not change the + physical layout in registers. Due to the VNNI transformation, the A and B operands + are represented as 3D vectors, with the last dimension representing the VNNI factor, + which is computed as `32/bit_width_of_elem_type`. Therefore, `A: vector<8x16xf16>` + is represented as `A: vector<8x8x2xf16>`, and `B: vector<16x16xf16>` is + represented as `B: vector<8x16x2xf16>`. + + Note: on PVC, the hardware can perform the load with VNNI transformation when the data + element type is 16-bit or lower precision, taking 2 or 4 elements from + the first dimension and inserting them into the newly added innermost dimension. 
+ }]; + + let arguments = (ins + XeGPU_DpasOpType : $lhs, + XeGPU_DpasOpType : $rhs, + Optional<XeGPU_Vector2DType>: $acc); + let results = (outs XeGPU_Vector2DType: $result); + + let extraClassDeclaration = [{ + VectorType getLhsType() { + return getLhs().getType(); + } + + VectorType getRhsType() { + return getRhs().getType(); + } + + VectorType getAccType() { + if (getAcc()) + return getAcc().getType(); + return {}; + } + + VectorType getResultType() { + return getResult().getType(); + } + }]; + + let assemblyFormat = [{ + $lhs `,` $rhs (`,` $acc^)? attr-dict `:` type($lhs)`,` type($rhs) (`,` type($acc)^)? `->` type($result) + }]; + + let hasVerifier = 1; +} + +def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, + AllElementTypesMatch<["tensorDesc", "value", "result"]>, + AllShapesMatch<["tensorDesc", "mask", "value", "result"]>]> { + let summary = "A read-modify-write operation. "; + + let description = [{ + `AtomicRMWOp` has the same semantics as `memref.atomic_rmw`, except that + it works on a `TensorDescType` object while `memref.atomic_rmw` works + on a `MemRefType` object. It also has a `mask` variable, which has the + same shape as the `TensorDesc`, to enable or disable some data points of + the `TensorDesc`. + }]; + + let arguments = (ins + AtomicRMWKindAttr:$kind, + XeGPU_TensorDesc:$tensorDesc, + XeGPU_MaskType:$mask, + XeGPU_ValueType:$value); + + let results = (outs XeGPU_ValueType:$result); + + let assemblyFormat = [{ + $kind $tensorDesc `,` $mask `,` $value attr-dict `:` + type($tensorDesc) `,` type($mask) `,` type($value) `->` type($result) + }]; +} + +def XeGPU_AllocNbarrierOp: XeGPU_Op<"alloc_nbarrier", []> { + let summary = "It allocates a set of named barriers."; + let description = [{AllocNbarrier is to create a set of named barriers as + specified by `nbarrier_num`. Named barriers are workgroup level resources, + and are shared by all threads in the workgroup. For example, there are + up to 32 barriers (range 0-31) for each Xecore on PVC. 
A typical use case ---------------- adam-smnk wrote:
```suggestion up to 32 barriers (range 0-31) for each XeCore on PVC. A typical use case ``` https://github.com/llvm/llvm-project/pull/88439 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits