Issue 176974
Summary [SSE4.1 + BMI2] `vpextrb` feeding the index operand of `bzhi` (only its low byte is read) could be `vmovd` instead
Labels new issue
Assignees
Reporter Validark
    [Zig Godbolt](https://zig.godbo.lt/#g:!((g:!((g:!((h:codeEditor,i:(filename:'1',fontScale:21,fontUsePx:'0',j:1,lang:zig,selection:(endColumn:38,endLineNumber:9,positionColumn:1,positionLineNumber:1,selectionStartColumn:38,selectionStartLineNumber:9,startColumn:1,startLineNumber:1),source:'export+fn+vpextrb_fed_into_bzhi(%0A++++a:+u64,%0A++++table:+@Vector(64,+u8),%0A)+u64+%7B%0A++++return+bzhi(a,+table%5B0%5D)%3B%0A%7D%0A%0Aextern+fn+@%22llvm.x86.bmi.bzhi.64%22(u64,+u64)+u64%3B%0Aconst+bzhi+%3D+@%22llvm.x86.bmi.bzhi.64%22%3B%0A%0Aexport+fn+optimized_version(%0A++++a:+u64,%0A++++table:+@Vector(64,+u8),%0A)+u64+%7B%0A++++return+bzhi(a,+@as(@Vector(16,+u32),+@bitCast(table))%5B0%5D)%3B%0A%7D%0A%0A'),l:'5',n:'0',o:'Zig+source+%231',t:'0')),k:50.70616224653202,l:'4',n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:ztrunk,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'0',intel:'0',libraryCode:'0',trim:'1',verboseDemangling:'0'),flagsViewOpen:'1',fontScale:21,fontUsePx:'0',j:1,lang:zig,libs:!(),options:'-O+ReleaseFast+-mcpu%3Dicelake_server+-target+x86_64-linux+-fomit-frame-pointer',overrides:!(),selection:(endColumn:30,endLineNumber:9,positionColumn:1,positionLineNumber:1,selectionStartColumn:30,selectionStartLineNumber:9,startColumn:1,startLineNumber:1),source:1),l:'5',n:'0',o:'+zig+trunk+(Editor+%231)',t:'0')),header:(),k:49.29383775346799,l:'4',m:100,n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4)
[LLVM Godbolt](https://llvm.godbo.lt/#g:!((g:!((g:!((h:codeEditor,i:(filename:'1',fontScale:14,fontUsePx:'0',j:1,lang:llvm,selection:(endColumn:1,endLineNumber:19,positionColumn:1,positionLineNumber:19,selectionStartColumn:1,selectionStartLineNumber:19,startColumn:1,startLineNumber:19),source:'define+dso_local+i64+@vpextrb_fed_into_bzhi(i64+%250,+%3C64+x+i8%3E+%251)+local_unnamed_addr+%7B%0AEntry:%0A++%25.0.vec.extract+%3D+extractelement+%3C64+x+i8%3E+%251,+i64+0%0a++%252+%3d+zext+i8+%25.0.vec.extract+to+i64%0a++%253+%3D+tail+call+i64+@llvm.x86.bmi.bzhi.64(i64+%250,+i64+%252)%0a++ret+i64+%253%0a%7d%0a%0Adeclare+i64+@llvm.x86.bmi.bzhi.64(i64,+i64)+%231%0A%0Adefine+dso_local+i64+@optimized_version(i64+%250,+%3C64+x+i8%3E+%251)+local_unnamed_addr+%7B%0AEntry:%0A++%252+%3D+bitcast+%3C64+x+i8%3E+%251+to+%3C16+x+i32%3E%0A++%25.0.vec.extract+%3D+extractelement+%3C16+x+i32%3E+%252,+i64+0%0a++%253+%3d+zext+i32+%25.0.vec.extract+to+i64%0a++%254+%3D+tail+call+i64+@llvm.x86.bmi.bzhi.64(i64+%250,+i64+%253)%0A++ret+i64+%254%0A%7D%0A'),l:'5',n:'0',o:'LLVM+IR+source+%231',t:'0')),k:50.76820307281229,l:'4',n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:llctrunk,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'1',intel:'0',libraryCode:'0',trim:'1',verboseDemangling:'0'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:1,lang:llvm,libs:!(),options:'-O3+-mcpu%3Dznver5',overrides:!(),selection:(endColumn:1,endLineNumber:1,positionColumn:1,positionLineNumber:1,selectionStartColumn:1,selectionStartLineNumber:1,startColumn:1,startLineNumber:1),source:1),l:'5',n:'0',o:'+llc+(trunk)+(Editor+%231)',t:'0')),k:49.23179692718771,l:'4',m:100,n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4)

```zig
export fn vpextrb_fed_into_bzhi(
    a: u64,
    table: @Vector(64, u8),
) u64 {
    return bzhi(a, table[0]);
}

extern fn @"llvm.x86.bmi.bzhi.64"(u64, u64) u64;
const bzhi = @"llvm.x86.bmi.bzhi.64";
```

This gets us:

```asm
vpextrb_fed_into_bzhi:
        vpextrb eax, xmm0, 0
        bzhi    rax, rdi, rax
        vzeroupper
        ret
```

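Could be (this is what is emitted when the low 32-bit lane is extracted instead, i.e. `bzhi(a, @as(@Vector(16, u32), @bitCast(table))[0])`, named `optimized_version` in the Godbolt links above):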

```asm
optimized_version:
        vmovd   eax, xmm0
        bzhi    rax, rdi, rax
        vzeroupper
        ret
```

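This works because `bzhi` only reads the low byte (bits 7:0) of its last operand, the bit index; whatever is in the other bytes of that register is ignored. Moving the whole low 32-bit lane with `vmovd` therefore gives the same result as zero-extending just byte 0 with `vpextrb`.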

Optimized LLVM dump:

```llvm
define dso_local i64 @vpextrb_fed_into_bzhi(i64 %0, <64 x i8> %1) local_unnamed_addr {
Entry:
  %.0.vec.extract = extractelement <64 x i8> %1, i64 0
  %2 = zext i8 %.0.vec.extract to i64
  %3 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %0, i64 %2)
  ret i64 %3
}

declare i64 @llvm.x86.bmi.bzhi.64(i64, i64) #1

define dso_local i64 @optimized_version(i64 %0, <64 x i8> %1) local_unnamed_addr {
Entry:
  %2 = bitcast <64 x i8> %1 to <16 x i32>
  %.0.vec.extract = extractelement <16 x i32> %2, i64 0
  %3 = zext i32 %.0.vec.extract to i64
  %4 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %0, i64 %3)
  ret i64 %4
}
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to