yaoyaoding commented on PR #283:
URL: https://github.com/apache/tvm-ffi/pull/283#issuecomment-3565962032

   The Triton kernel:
   ```python
       # Define the kernel dynamically
       @triton.jit
       def square_kernel(X_ptr, Y_ptr, n, BLOCK: tl.constexpr = 1024):  # noqa
           pid = tl.program_id(0)
           start = pid * BLOCK
           offsets = start + tl.arange(0, BLOCK)
           mask = offsets < n
           x = tl.load(X_ptr + offsets, mask=mask, other=0.0)
           y = x * x
           tl.store(Y_ptr + offsets, y, mask=mask)
   
       # Trigger kernel compilation by doing a dummy call
       x_dummy = torch.ones(1024, dtype=torch.float32, device="cuda")
       y_dummy = torch.empty(1024, dtype=torch.float32, device="cuda")
       square_kernel[1, 1](x_dummy, y_dummy, 1024)
   ```
   
   The PTX:
   ```
   .visible .entry square_kernel(
           .param .u64 .ptr .global .align 1 square_kernel_param_0,
           .param .u64 .ptr .global .align 1 square_kernel_param_1,
           .param .u32 square_kernel_param_2,
           .param .u64 .ptr .global .align 1 square_kernel_param_3,
           .param .u64 .ptr .global .align 1 square_kernel_param_4
   )
   .reqntid 128
   {
           .reg .pred      %p<5>;
           .reg .b32       %r<33>;
           .reg .b64       %rd<8>;
           .loc    1 53 0                          // 
example_triton_cubin.py:53:0
   $L__func_begin0:
           .loc    1 53 0                          // 
example_triton_cubin.py:53:0
   
   // %bb.0:
           ld.param.b64    %rd5, [square_kernel_param_0];
           ld.param.b64    %rd6, [square_kernel_param_1];
   $L__tmp0:
           .loc    1 54 24                         // 
example_triton_cubin.py:54:24
           mov.u32         %r25, %ctaid.x;
           .loc    1 55 18                         // 
example_triton_cubin.py:55:18
           shl.b32         %r26, %r25, 10;
           ld.param.b32    %r27, [square_kernel_param_2];
           .loc    1 56 35                         // 
example_triton_cubin.py:56:35
           mov.u32         %r28, %tid.x;
           shl.b32         %r29, %r28, 2;
           and.b32         %r30, %r29, 508;
           .loc    1 56 22                         // 
example_triton_cubin.py:56:22
           or.b32  %r31, %r30, %r26;
           or.b32  %r32, %r31, 512;
           .loc    1 57 21                         // 
example_triton_cubin.py:57:21
           setp.lt.s32     %p1, %r31, %r27;
           setp.lt.s32     %p2, %r32, %r27;
           .loc    1 58 24                         // 
example_triton_cubin.py:58:24
           mul.wide.s32    %rd7, %r31, 4;
           add.s64         %rd1, %rd5, %rd7;
           add.s64         %rd2, %rd1, 2048;
           mov.b32         %r5, 0;
           .loc    1 58 16                         // 
example_triton_cubin.py:58:16
           // begin inline asm
           mov.u32 %r1, %r5;
           mov.u32 %r2, %r5;
           mov.u32 %r3, %r5;
           mov.u32 %r4, %r5;
           @%p1 ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ];
           // end inline asm
           // begin inline asm
           mov.u32 %r9, %r5;
           mov.u32 %r10, %r5;
           mov.u32 %r11, %r5;
           mov.u32 %r12, %r5;
           @%p2 ld.global.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd2 + 0 ];
           // end inline asm
           .loc    1 59 12                         // 
example_triton_cubin.py:59:12
           mul.f32         %r17, %r1, %r1;
           mul.f32         %r18, %r2, %r2;
           mul.f32         %r19, %r3, %r3;
           mul.f32         %r20, %r4, %r4;
           mul.f32         %r21, %r9, %r9;
           mul.f32         %r22, %r10, %r10;
           mul.f32         %r23, %r11, %r11;
           mul.f32         %r24, %r12, %r12;
           .loc    1 60 21                         // 
example_triton_cubin.py:60:21
           add.s64         %rd3, %rd6, %rd7;
           add.s64         %rd4, %rd3, 2048;
           .loc    1 60 30                         // 
example_triton_cubin.py:60:30
           // begin inline asm
           @%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r17, %r18, %r19, %r20 };
           // end inline asm
           // begin inline asm
           @%p2 st.global.v4.b32 [ %rd4 + 0 ], { %r21, %r22, %r23, %r24 };
           // end inline asm
           .loc    1 60 4                          // 
example_triton_cubin.py:60:4
           ret;
   $L__tmp1:
   $L__func_end0:
                                           // -- End function
   }
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to