jaykang10 added a comment.

In https://reviews.llvm.org/D30810#702614, @Anastasia wrote:

> In https://reviews.llvm.org/D30810#702443, @bruno wrote:
>
> > > As a result, I think it would be good for clang to have both of features 
> > > and I would like to stick to the option "-fpresereve-vec3' to change the 
> > > behavior easily.
> >
> > The motivation doesn't seem solid to me, who else is going to benefit from 
> > this flag?
>
>
> There are some off the main tree implementation that would benefit. But in 
> the case of AMD GPU 3 loads/stores will be produced instead of 4. Sounds like 
> a good optimization to me. As I said in my previous comment I think it should 
> have been the default behavior from the beginning, but since different 
> implementation landed first we can integrate this one now with an additional 
> option.


Additionally, Here is assembly output from vec3 with amdgcn target. :)

LLVM IR

  define void @foo(<3 x float>* nocapture %a, <3 x float>* nocapture readonly 
%b) {
  entry:
    %0 = load <3 x float>, <3 x float>* %b, align 16
    store <3 x float> %0, <3 x float>* %a, align 16
    ret void
  }

Assembly Output

        .text
        .section        .AMDGPU.config
        .long   47176
        .long   11272256
        .long   47180
        .long   132
        .long   47200
        .long   0
        .long   4
        .long   0
        .long   8
        .long   0
        .text
        .globl  foo
        .p2align        8
        .type   foo,@function
  foo:                                    ; @foo
  ; BB#0:                                 ; %entry
        s_load_dword s2, s[0:1], 0x9
        s_load_dword s0, s[0:1], 0xa
        s_mov_b32 s4, SCRATCH_RSRC_DWORD0
        s_mov_b32 s5, SCRATCH_RSRC_DWORD1
        s_mov_b32 s6, -1
        s_mov_b32 s8, s3
        s_mov_b32 s7, 0xe8f000
        s_waitcnt lgkmcnt(0)
        v_mov_b32_e32 v0, s0
        buffer_load_dword v2, v0, s[4:7], s8 offen
        buffer_load_dword v3, v0, s[4:7], s8 offen offset:8
        buffer_load_dword v0, v0, s[4:7], s8 offen offset:4
        v_mov_b32_e32 v1, s2
        s_waitcnt vmcnt(0)
        buffer_store_dword v0, v1, s[4:7], s8 offen offset:4
        buffer_store_dword v2, v1, s[4:7], s8 offen
        buffer_store_dword v3, v1, s[4:7], s8 offen offset:8
        s_endpgm
  .Lfunc_end0:
        .size   foo, .Lfunc_end0-foo
  
        .section        .AMDGPU.csdata
  ; Kernel info:
  ; codeLenInByte = 112
  ; NumSgprs: 9
  ; NumVgprs: 4
  ; FloatMode: 192
  ; IeeeMode: 1
  ; ScratchSize: 0
  ; LDSByteSize: 0 bytes/workgroup (compile time only)
  ; SGPRBlocks: 1
  ; VGPRBlocks: 0
  ; NumSGPRsForWavesPerEU: 9
  ; NumVGPRsForWavesPerEU: 4
  ; ReservedVGPRFirst: 0
  ; ReservedVGPRCount: 0
  ; COMPUTE_PGM_RSRC2:USER_SGPR: 2
  ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
  ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
  ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
  ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
  ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
  
        .section        ".note.GNU-stack"


https://reviews.llvm.org/D30810



_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to