Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-09-01 Thread H. Peter Anvin
On 09/01/2015 08:03 AM, Michael S. Tsirkin wrote:
>>>
>>> Hmm - so do you take back the ack?
>>
>> I have no strong feelings either way, it simply strikes me as misguided to 
>> explicitly optimize for something that is listed as a high overhead 
>> instruction.
>>
> 
>  [mst@robin test]$ diff a.c b.c
>  31c31
>  <   if (__variable_test_bit(1, &addr))
>  ---
>  >   if (__constant_test_bit(1, &addr))
> 
> [mst@robin test]$ gcc -Wall -O2 a.c; time ./a.out
> 
> real    0m0.532s
> user    0m0.531s
> sys     0m0.000s
> [mst@robin test]$ gcc -Wall -O2 b.c; time ./a.out
> 
> real    0m0.517s
> user    0m0.517s
> sys     0m0.000s
> 
> 
> So __constant_test_bit is faster even though it's using more
> instructions:
> $ gcc -Wall -O2 a.c; objdump -ld ./a.out
> 
> 

I think this is well understood.  The use of bts/btc in locked
operations is sometimes justified since it reports the bit status back
out, whereas in unlocked operations bts/btc has no benefit except for
code size.  bt is a read operation, and is therefore "never/always"
atomic; it cannot be locked because there is no read/write pair to lock.

So it is strictly an issue of code size versus performance.
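
A minimal kernel-style sketch of that distinction, purely for illustration (the claim_slot()/slot_is_claimed() wrappers and the -EBUSY policy are invented here; test_and_set_bit() and test_bit() are the existing kernel API):

#include <linux/bitops.h>
#include <linux/errno.h>

/* Locked RMW: the LOCK BTS pays for itself because the old bit comes back. */
static int claim_slot(volatile unsigned long *bitmap, int nr)
{
	if (test_and_set_bit(nr, bitmap))
		return -EBUSY;		/* someone else already owns it */
	return 0;
}

/* Plain read: a load plus a mask; there is no read/write pair to lock. */
static int slot_is_claimed(const unsigned long *bitmap, int nr)
{
	return test_bit(nr, bitmap);
}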

However, your test is simply faulty:

 804843f:   50  push   %eax
 8048440:   6a 01   push   $0x1
 8048442:   e8 b4 ff ff ff  call   80483fb <__variable_test_bit>

You're encapsulating the __variable_test_bit() version into an expensive
function call, whereas the __constant_test_bit() seems to emit code that
is quite frankly completely bonkers insane:

 8048444:   8b 45 ec                mov    -0x14(%ebp),%eax
 8048447:   83 e0 1f                and    $0x1f,%eax
 804844a:   89 c1                   mov    %eax,%ecx
 804844c:   d3 ea                   shr    %cl,%edx
 804844e:   89 d0                   mov    %edx,%eax
 8048450:   83 e0 01                and    $0x1,%eax
 8048453:   85 c0                   test   %eax,%eax
 8048455:   0f 95 c0                setne  %al
 8048458:   0f b6 c0                movzbl %al,%eax
 804845b:   85 c0                   test   %eax,%eax
 804845d:   74 00                   je     804845f

Observe the sequence and/test/setne/movzbl/test!

-hpa
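
For what it's worth, a less lopsided userspace comparison might keep both helpers as static inlines in the same file, so neither variant pays for an out-of-line call, and consume the result so the loop cannot be discarded. The BT/SBB body below is the one quoted in this thread; the harness around it (file name bench.c, loop bound, barrier, sink) is only an illustrative assumption. Build each side with "gcc -Wall -O2 bench.c" and "gcc -Wall -O2 -DUSE_BT bench.c":

#include <stdio.h>

#define BITS_PER_LONG	(8 * sizeof(unsigned long))

/* Pure-C variant (what __constant_test_bit/__test_bit boils down to). */
static inline int test_bit_c(long nr, const unsigned long *addr)
{
	return ((1UL << (nr % BITS_PER_LONG)) &
		addr[nr / BITS_PER_LONG]) != 0;
}

/* BT/SBB variant, same asm as __variable_test_bit in the patch. */
static inline int test_bit_bt(long nr, const unsigned long *addr)
{
	int oldbit;

	asm volatile("bt %2,%1\n\t"
		     "sbb %0,%0"
		     : "=r" (oldbit)
		     : "m" (*addr), "Ir" (nr));
	return oldbit;
}

int main(void)
{
	unsigned long word = 0x2;	/* bit 1 set */
	unsigned long long i, hits = 0;

	for (i = 0; i < 1000000000ull; i++) {
		/* pretend 'word' may have changed, so the load stays in the loop */
		asm volatile("" : "+m" (word));
#ifdef USE_BT
		hits += !!test_bit_bt(1, &word);
#else
		hits += test_bit_c(1, &word);
#endif
	}
	/* consume the result so nothing can be optimized out as unused */
	printf("%llu\n", hits);
	return 0;
}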




Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-09-01 Thread Michael S. Tsirkin
On Tue, Sep 01, 2015 at 01:39:42PM +0200, Ingo Molnar wrote:
> 
> * Michael S. Tsirkin  wrote:
> 
> > On Tue, Sep 01, 2015 at 11:24:22AM +0200, Ingo Molnar wrote:
> > > 
> > > * Michael S. Tsirkin  wrote:
> > > 
> > > > I applied this patch on top of mine:
> > > 
> > > Yeah, looks similar to the one I sent.
> > > 
> > > > -static inline int __variable_test_bit(long nr, const unsigned long 
> > > > *addr)
> > > > -{
> > > > -   int oldbit;
> > > > -
> > > > -   asm volatile("bt %2,%1\n\t"
> > > > -"sbb %0,%0"
> > > > -: "=r" (oldbit)
> > > > -: "m" (*addr), "Ir" (nr));
> > > > -
> > > > -   return oldbit;
> > > > -}
> > > 
> > > > And the code size went up:
> > > > 
> > > >   134836    2997   8372   146205   23b1d  arch/x86/kvm/kvm-intel.ko  ->
> > > >   134846    2997   8372   146215   23b27  arch/x86/kvm/kvm-intel.ko
> > > > 
> > > >   342690   47640    441   390771   5f673  arch/x86/kvm/kvm.ko ->
> > > >   342738   47640    441   390819   5f6a3  arch/x86/kvm/kvm.ko
> > > > 
> > > > I tried removing  __always_inline, this had no effect.
> > > 
> > > But code size isn't the only factor.
> > > 
> > > Uros Bizjak pointed out that the reason GCC does not use the "BT reg,mem" 
> > > instruction is that it's highly suboptimal even on recent 
> > > microarchitectures, 
> > > Sandy Bridge is listed as having a 10 cycles latency (!) for this 
> > > instruction:
> > > 
> > >http://www.agner.org/optimize/instruction_tables.pdf
> > > 
> > > this instruction had bad latency going back to Pentium 4 CPUs.
> > > 
> > > ... so unless something changed in this area with Skylake I think using 
> > > the 
> > > __variable_test_bit() code of the kernel is a bad choice and looking at 
> > > kernel 
> > > size only is misleading.
> > > 
> > > It makes sense for atomics, but not for unlocked access.
> > > 
> > > Thanks,
> > > 
> > >   Ingo
> > 
> > Hmm - so do you take back the ack?
> 
> I have no strong feelings either way, it simply strikes me as misguided to 
> explicitly optimize for something that is listed as a high overhead 
> instruction.
> 
> Assuming it really is high overhead:
> 
> > I compared this:
> > int main(int argc, char **argv)
> > {
> > 
> > 	long long int i;
> > 	const unsigned long addr = 0;
> > 	for (i = 0; i < 1000000000ull; ++i) {
> > 		asm volatile("");
> > 		if (__variable_test_bit(1, &addr))
> > 			asm volatile("");
> > 	}
> > 	return 0;
> > }
> > 
> > with the __constant_test_bit variant.
> > 
> > __constant_test_bit code does appear to be slower on an i7 processor.


Ouch. I meant the reverse:

 [mst@robin test]$ diff a.c b.c
 31c31
 <   if (__variable_test_bit(1, &addr))
 ---
 >   if (__constant_test_bit(1, &addr))

[mst@robin test]$ gcc -Wall -O2 a.c; time ./a.out

real    0m0.532s
user    0m0.531s
sys     0m0.000s
[mst@robin test]$ gcc -Wall -O2 b.c; time ./a.out

real    0m0.517s
user    0m0.517s
sys     0m0.000s


So __constant_test_bit is faster even though it's using more
instructions:
$ gcc -Wall -O2 a.c; objdump -ld ./a.out


08048414 <main>:
main():
 8048414:   8d 4c 24 04             lea    0x4(%esp),%ecx
 8048418:   83 e4 f8                and    $0xfffffff8,%esp
 804841b:   ff 71 fc                pushl  -0x4(%ecx)
 804841e:   55                      push   %ebp
 804841f:   89 e5                   mov    %esp,%ebp
 8048421:   51                      push   %ecx
 8048422:   83 ec 14                sub    $0x14,%esp
 8048425:   c7 45 ec 00 00 00 00    movl   $0x0,-0x14(%ebp)
 804842c:   c7 45 f0 00 00 00 00    movl   $0x0,-0x10(%ebp)
 8048433:   c7 45 f4 00 00 00 00    movl   $0x0,-0xc(%ebp)
 804843a:   eb 1a                   jmp    8048456
 804843c:   8d 45 ec                lea    -0x14(%ebp),%eax
 804843f:   50                      push   %eax
 8048440:   6a 01                   push   $0x1
 8048442:   e8 b4 ff ff ff          call   80483fb <__variable_test_bit>
 8048447:   83 c4 08                add    $0x8,%esp
 804844a:   85 c0                   test   %eax,%eax
 804844c:   74 00                   je     804844e
 804844e:   83 45 f0 01             addl   $0x1,-0x10(%ebp)
 8048452:   83 55 f4 00             adcl   $0x0,-0xc(%ebp)
 8048456:   8b 45 f0                mov    -0x10(%ebp),%eax
 8048459:   8b 55 f4                mov    -0xc(%ebp),%edx
 804845c:   83 fa 00                cmp    $0x0,%edx
 804845f:   72 db                   jb     804843c
 8048461:   83 fa 00                cmp    $0x0,%edx
 8048464:   77 07                   ja     804846d
 8048466:   3d ff c9 9a 3b          cmp    $0x3b9ac9ff,%eax
 804846b:   76 cf                   jbe    804843c
 804846d:   b8 00 00 00 00          mov    $0x0,%eax
 8048472:   8b 4d fc                mov    -0x4(%ebp),%ecx
 8048475:   c9                      leave

Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-09-01 Thread Ingo Molnar

* Michael S. Tsirkin  wrote:

> On Tue, Sep 01, 2015 at 11:24:22AM +0200, Ingo Molnar wrote:
> > 
> > * Michael S. Tsirkin  wrote:
> > 
> > > I applied this patch on top of mine:
> > 
> > Yeah, looks similar to the one I sent.
> > 
> > > -static inline int __variable_test_bit(long nr, const unsigned long *addr)
> > > -{
> > > - int oldbit;
> > > -
> > > - asm volatile("bt %2,%1\n\t"
> > > -  "sbb %0,%0"
> > > -  : "=r" (oldbit)
> > > -  : "m" (*addr), "Ir" (nr));
> > > -
> > > - return oldbit;
> > > -}
> > 
> > > And the code size went up:
> > > 
> > >   134836    2997   8372   146205   23b1d  arch/x86/kvm/kvm-intel.ko  ->
> > >   134846    2997   8372   146215   23b27  arch/x86/kvm/kvm-intel.ko
> > > 
> > >   342690   47640    441   390771   5f673  arch/x86/kvm/kvm.ko ->
> > >   342738   47640    441   390819   5f6a3  arch/x86/kvm/kvm.ko
> > > 
> > > I tried removing  __always_inline, this had no effect.
> > 
> > But code size isn't the only factor.
> > 
> > Uros Bizjak pointed out that the reason GCC does not use the "BT reg,mem" 
> > instruction is that it's highly suboptimal even on recent 
> > microarchitectures, 
> > Sandy Bridge is listed as having a 10 cycles latency (!) for this 
> > instruction:
> > 
> >http://www.agner.org/optimize/instruction_tables.pdf
> > 
> > this instruction had bad latency going back to Pentium 4 CPUs.
> > 
> > ... so unless something changed in this area with Skylake I think using the 
> > __variable_test_bit() code of the kernel is a bad choice and looking at 
> > kernel 
> > size only is misleading.
> > 
> > It makes sense for atomics, but not for unlocked access.
> > 
> > Thanks,
> > 
> > Ingo
> 
> Hmm - so do you take back the ack?

I have no strong feelings either way, it simply strikes me as misguided to 
explicitly optimize for something that is listed as a high overhead instruction.

Assuming it really is high overhead:

> I compared this:
> int main(int argc, char **argv)
> {
> 
> 	long long int i;
> 	const unsigned long addr = 0;
> 	for (i = 0; i < 1000000000ull; ++i) {
> 		asm volatile("");
> 		if (__variable_test_bit(1, &addr))
> 			asm volatile("");
> 	}
> 	return 0;
> }
> 
> with the __constant_test_bit variant.
> 
> __constant_test_bit code does appear to be slower on an i7 processor.

Hm, so this seems to be contradictory: if I'm right with the argument above then 
we'd expect the opposite result: variable_test_bit (the BT asm variant) should 
be slower than constant_test_bit (the GCC version), correct?

Btw., to be sure it's a representative performance test, instead of a barrier() in 
your testcase I'd actually do something with the result, in a way neither the 
compiler nor the CPU can optimize it out as unused.

> test_bit isn't atomic either. Maybe drop variable_test_bit there too?

Yes, but only if I'm right about BT being suboptimal in this case on modern x86 
CPUs!

Thanks,

Ingo


Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-09-01 Thread Michael S. Tsirkin
On Tue, Sep 01, 2015 at 11:24:22AM +0200, Ingo Molnar wrote:
> 
> * Michael S. Tsirkin  wrote:
> 
> > I applied this patch on top of mine:
> 
> Yeah, looks similar to the one I sent.
> 
> > -static inline int __variable_test_bit(long nr, const unsigned long *addr)
> > -{
> > -   int oldbit;
> > -
> > -   asm volatile("bt %2,%1\n\t"
> > -"sbb %0,%0"
> > -: "=r" (oldbit)
> > -: "m" (*addr), "Ir" (nr));
> > -
> > -   return oldbit;
> > -}
> 
> > And the code size went up:
> > 
> >   134836    2997   8372   146205   23b1d  arch/x86/kvm/kvm-intel.ko  ->
> >   134846    2997   8372   146215   23b27  arch/x86/kvm/kvm-intel.ko
> > 
> >   342690   47640    441   390771   5f673  arch/x86/kvm/kvm.ko ->
> >   342738   47640    441   390819   5f6a3  arch/x86/kvm/kvm.ko
> > 
> > I tried removing  __always_inline, this had no effect.
> 
> But code size isn't the only factor.
> 
> Uros Bizjak pointed out that the reason GCC does not use the "BT reg,mem" 
> instruction is that it's highly suboptimal even on recent microarchitectures, 
> Sandy Bridge is listed as having a 10 cycles latency (!) for this instruction:
> 
>http://www.agner.org/optimize/instruction_tables.pdf
> 
> this instruction had bad latency going back to Pentium 4 CPUs.
> 
> ... so unless something changed in this area with Skylake I think using the 
> __variable_test_bit() code of the kernel is a bad choice and looking at 
> kernel 
> size only is misleading.
> 
> It makes sense for atomics, but not for unlocked access.
> 
> Thanks,
> 
>   Ingo

Hmm - so do you take back the ack?

I compared this:
int main(int argc, char **argv)
{

	long long int i;
	const unsigned long addr = 0;
	for (i = 0; i < 1000000000ull; ++i) {
		asm volatile("");
		if (__variable_test_bit(1, &addr))
			asm volatile("");
	}
	return 0;
}

with the __constant_test_bit variant.

__constant_test_bit code does appear to be slower on an i7 processor.

test_bit isn't atomic either. Maybe drop variable_test_bit there too?

-- 
MST


Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-09-01 Thread Ingo Molnar

* Michael S. Tsirkin  wrote:

> I applied this patch on top of mine:

Yeah, looks similar to the one I sent.

> -static inline int __variable_test_bit(long nr, const unsigned long *addr)
> -{
> - int oldbit;
> -
> - asm volatile("bt %2,%1\n\t"
> -  "sbb %0,%0"
> -  : "=r" (oldbit)
> -  : "m" (*addr), "Ir" (nr));
> -
> - return oldbit;
> -}

> And the code size went up:
> 
>   134836    2997   8372   146205   23b1d  arch/x86/kvm/kvm-intel.ko  ->
>   134846    2997   8372   146215   23b27  arch/x86/kvm/kvm-intel.ko
> 
>   342690   47640    441   390771   5f673  arch/x86/kvm/kvm.ko ->
>   342738   47640    441   390819   5f6a3  arch/x86/kvm/kvm.ko
> 
> I tried removing  __always_inline, this had no effect.

But code size isn't the only factor.

Uros Bizjak pointed out that the reason GCC does not use the "BT reg,mem" 
instruction is that it's highly suboptimal even on recent microarchitectures; 
Sandy Bridge is listed as having a 10-cycle latency (!) for this instruction:

   http://www.agner.org/optimize/instruction_tables.pdf

this instruction had bad latency going back to Pentium 4 CPUs.

... so unless something changed in this area with Skylake I think using the 
__variable_test_bit() code of the kernel is a bad choice and looking at kernel 
size only is misleading.

It makes sense for atomics, but not for unlocked access.

Thanks,

Ingo

Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread Michael S. Tsirkin
On Mon, Aug 31, 2015 at 09:59:47AM +0200, Ingo Molnar wrote:
> 
> * Michael S. Tsirkin  wrote:
> 
> > On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> > > Presumably because gcc can't generate bt... whether or not it is worth it 
> > > is another matter.
> > > 
> > > On August 30, 2015 11:05:49 PM PDT, Ingo Molnar  wrote:
> > > >
> > > >* Michael S. Tsirkin  wrote:
> > > >
> > > >> +static __always_inline int __constant_test_bit(long nr, const
> > > >unsigned long *addr)
> > > >> +{
> > > >> +  return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > > >> +  (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > > >> +}
> > > >> +
> > > >> +static inline int __variable_test_bit(long nr, const unsigned long
> > > >*addr)
> > > >> +{
> > > >> +  int oldbit;
> > > >> +
> > > >> +  asm volatile("bt %2,%1\n\t"
> > > >> +   "sbb %0,%0"
> > > >> +   : "=r" (oldbit)
> > > >> +   : "m" (*addr), "Ir" (nr));
> > > >> +
> > > >> +  return oldbit;
> > > >> +}
> > > >
> > > >Color me confused, why use assembly for this at all?
> > > >
> > > >Why not just use C for testing the bit (i.e. turn __constant_test_bit()
> > > >into 
> > > >__test_bit()) - that would also allow the compiler to propagate the
> > > >result, 
> > > >potentially more optimally than we can do it via SBB...
> > > >
> > > >Thanks,
> > > >
> > > > Ingo
> > 
> > Exactly:
> > 
> > 
> > Disassembly of section .text:
> > 
> >  <__variable_test_bit>:
> > __variable_test_bit():
> >0:   8b 54 24 08 mov0x8(%esp),%edx
> >4:   8b 44 24 04 mov0x4(%esp),%eax
> >8:   0f a3 02bt %eax,(%edx)
> >b:   19 c0   sbb%eax,%eax
> >d:   c3  ret
> >e:   66 90   xchg   %ax,%ax
> > 
> > 0010 <__constant_test_bit>:
> > __constant_test_bit():
> >   10:   8b 4c 24 04 mov0x4(%esp),%ecx
> >   14:   8b 44 24 08 mov0x8(%esp),%eax
> >   18:   89 ca   mov%ecx,%edx
> >   1a:   c1 fa 04sar$0x4,%edx
> >   1d:   8b 04 90mov(%eax,%edx,4),%eax
> >   20:   d3 e8   shr%cl,%eax
> >   22:   83 e0 01and$0x1,%eax
> >   25:   c3  ret
> 
> But that's due to the forced interface of generating a return code. Please 
> compare 
> it at an inlined usage site, where GCC is free to do the comparison directly 
> and 
> use the result in flags.
> 
> Thanks,
> 
>   Ingo

I applied this patch on top of mine:


diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 9229334..2aed985 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -323,24 +323,17 @@ static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
return oldbit;
 }
 
-static __always_inline int __constant_test_bit(long nr, const unsigned long *addr)
+/**
+ * __test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline int __test_bit(long nr, const unsigned long *addr)
 {
return ((1UL << (nr & (BITS_PER_LONG-1))) &
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
 }
 
-static inline int __variable_test_bit(long nr, const unsigned long *addr)
-{
-   int oldbit;
-
-   asm volatile("bt %2,%1\n\t"
-"sbb %0,%0"
-: "=r" (oldbit)
-: "m" (*addr), "Ir" (nr));
-
-   return oldbit;
-}
-
 #if 0 /* Fool kernel-doc since it doesn't do macros yet */
 /**
  * test_bit - Determine whether a bit is set
@@ -348,13 +341,6 @@ static inline int __variable_test_bit(long nr, const unsigned long *addr)
  * @addr: Address to start counting from
  */
 static int test_bit(int nr, const volatile unsigned long *addr);
-
-/**
- * __test_bit - Determine whether a bit is set
- * @nr: bit number to test
- * @addr: Address to start counting from
- */
-static int __test_bit(int nr, const volatile unsigned long *addr);
 #endif
 
 #define test_bit(nr, addr) \
@@ -362,10 +348,6 @@ static int __test_bit(int nr, const volatile unsigned long *addr);
 ? constant_test_bit((nr), (addr))  \
 : variable_test_bit((nr), (addr)))
 
-#define __test_bit(nr, addr)   \
-   (__builtin_constant_p((nr)) \
-? __constant_test_bit((nr), (addr))\
-: __variable_test_bit((nr), (addr)))
 
 /**
  * __ffs - find first set bit in word


And the code size went up:

   134836    2997   8372   146205   23b1d  arch/x86/kvm/kvm-intel.ko  ->
   134846    2997   8372   146215   23b27  arch/x86/kvm/kvm-intel.ko

   342690   47640    441   390771   5f673  arch/x86/kvm/kvm.ko ->
   342738   47640    441   390819   5f6a3  arch/x86/kvm/kvm.ko

I tried removing __always_inline; this had no effect.

-- 
MST

Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread Ingo Molnar

* yalin wang  wrote:

> 
> > On Aug 31, 2015, at 15:59, Ingo Molnar  wrote:
> > 
> > 
> > * Michael S. Tsirkin  wrote:
> > 
> >> On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> >>> Presumably because gcc can't generate bt... whether or not it is worth it 
> >>> is another matter.
> >>> 
> >>> On August 30, 2015 11:05:49 PM PDT, Ingo Molnar  wrote:
>  
>  * Michael S. Tsirkin  wrote:
>  
> > +static __always_inline int __constant_test_bit(long nr, const
>  unsigned long *addr)
> > +{
> > +   return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > +   (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > +}
> > +
> > +static inline int __variable_test_bit(long nr, const unsigned long
>  *addr)
> > +{
> > +   int oldbit;
> > +
> > +   asm volatile("bt %2,%1\n\t"
> > +"sbb %0,%0"
> > +: "=r" (oldbit)
> > +: "m" (*addr), "Ir" (nr));
> > +
> > +   return oldbit;
> > +}
>  
>  Color me confused, why use assembly for this at all?
>  
>  Why not just use C for testing the bit (i.e. turn __constant_test_bit()
>  into 
>  __test_bit()) - that would also allow the compiler to propagate the
>  result, 
>  potentially more optimally than we can do it via SBB...
>  
>  Thanks,
>  
>   Ingo
> >> 
> >> Exactly:
> >> 
> >> 
> >> Disassembly of section .text:
> >> 
> >>  <__variable_test_bit>:
> >> __variable_test_bit():
> >>   0:   8b 54 24 08 mov0x8(%esp),%edx
> >>   4:   8b 44 24 04 mov0x4(%esp),%eax
> >>   8:   0f a3 02bt %eax,(%edx)
> >>   b:   19 c0   sbb%eax,%eax
> >>   d:   c3  ret
> >>   e:   66 90   xchg   %ax,%ax
> >> 
> >> 0010 <__constant_test_bit>:
> >> __constant_test_bit():
> >>  10:   8b 4c 24 04 mov0x4(%esp),%ecx
> >>  14:   8b 44 24 08 mov0x8(%esp),%eax
> >>  18:   89 ca   mov%ecx,%edx
> >>  1a:   c1 fa 04sar$0x4,%edx
> >>  1d:   8b 04 90mov(%eax,%edx,4),%eax
> >>  20:   d3 e8   shr%cl,%eax
> >>  22:   83 e0 01and$0x1,%eax
> >>  25:   c3  ret
> > 
> > But that's due to the forced interface of generating a return code. Please 
> > compare 
> > it at an inlined usage site, where GCC is free to do the comparison 
> > directly and 
> > use the result in flags.
> just curious :
> it seems __variable_test_bit()  use less instructions,
> why not always use __variable_test_bit() , remove __constant_test_bit() 
> version ?

It's an artifact of testing it in isolation.

For constant bit positions GCC is able to do a fairly good job:

8103d2a0 :
8103d2a0:   f6 87 4a 02 00 00 08    testb  $0x8,0x24a(%rdi)
...
8103d2ab:   75 39                   jne    8103d2e6



with just 2 instructions: a TESTB plus using the flag result in a JNE.

Using variable_test_bit() forces the result into a register, which is 
suboptimal.
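
For comparison, the shape of source code that lets GCC produce that two-instruction TESTB/JNE sequence is roughly the sketch below; the struct, field and function names (dev_state, flags, poll_device) are made up for illustration, and __test_bit here is a userspace copy of the pure-C test proposed in this thread. Only the pattern matters: a compile-time-constant bit number whose result feeds straight into a branch.

#include <stdio.h>

#define BITS_PER_LONG	(8 * sizeof(unsigned long))

/* Userspace copy of the pure-C test proposed as __test_bit() in this thread. */
static inline int __test_bit(long nr, const unsigned long *addr)
{
	return ((1UL << (nr % BITS_PER_LONG)) &
		addr[nr / BITS_PER_LONG]) != 0;
}

struct dev_state {
	unsigned long flags[4];		/* hypothetical flag words */
};

/*
 * Constant nr, result used only by the branch: at -O2 GCC can reduce this
 * to a single testb of the right byte plus a conditional jump, instead of
 * materializing a 0/-1 value with BT/SBB and then testing that separately.
 */
static void poll_device(const struct dev_state *s)
{
	if (__test_bit(3, s->flags))
		printf("bit 3 is set\n");
}

int main(void)
{
	struct dev_state s = { .flags = { 1UL << 3 } };

	poll_device(&s);
	return 0;
}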

Thanks,

Ingo


Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread yalin wang

> On Aug 31, 2015, at 15:59, Ingo Molnar  wrote:
> 
> 
> * Michael S. Tsirkin  wrote:
> 
>> On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
>>> Presumably because gcc can't generate bt... whether or not it is worth it 
>>> is another matter.
>>> 
>>> On August 30, 2015 11:05:49 PM PDT, Ingo Molnar  wrote:
 
 * Michael S. Tsirkin  wrote:
 
> +static __always_inline int __constant_test_bit(long nr, const
 unsigned long *addr)
> +{
> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> +}
> +
> +static inline int __variable_test_bit(long nr, const unsigned long
 *addr)
> +{
> + int oldbit;
> +
> + asm volatile("bt %2,%1\n\t"
> +  "sbb %0,%0"
> +  : "=r" (oldbit)
> +  : "m" (*addr), "Ir" (nr));
> +
> + return oldbit;
> +}
 
 Color me confused, why use assembly for this at all?
 
 Why not just use C for testing the bit (i.e. turn __constant_test_bit()
 into 
 __test_bit()) - that would also allow the compiler to propagate the
 result, 
 potentially more optimally than we can do it via SBB...
 
 Thanks,
 
Ingo
>> 
>> Exactly:
>> 
>> 
>> Disassembly of section .text:
>> 
>>  <__variable_test_bit>:
>> __variable_test_bit():
>>   0:   8b 54 24 08 mov0x8(%esp),%edx
>>   4:   8b 44 24 04 mov0x4(%esp),%eax
>>   8:   0f a3 02bt %eax,(%edx)
>>   b:   19 c0   sbb%eax,%eax
>>   d:   c3  ret
>>   e:   66 90   xchg   %ax,%ax
>> 
>> 0010 <__constant_test_bit>:
>> __constant_test_bit():
>>  10:   8b 4c 24 04 mov0x4(%esp),%ecx
>>  14:   8b 44 24 08 mov0x8(%esp),%eax
>>  18:   89 ca   mov%ecx,%edx
>>  1a:   c1 fa 04sar$0x4,%edx
>>  1d:   8b 04 90mov(%eax,%edx,4),%eax
>>  20:   d3 e8   shr%cl,%eax
>>  22:   83 e0 01and$0x1,%eax
>>  25:   c3  ret
> 
> But that's due to the forced interface of generating a return code. Please 
> compare 
> it at an inlined usage site, where GCC is free to do the comparison directly 
> and 
> use the result in flags.
Just curious: it seems __variable_test_bit() uses fewer instructions,
so why not always use __variable_test_bit() and remove the __constant_test_bit() version?

Thanks






Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread Ingo Molnar

* Ingo Molnar  wrote:

> > Disassembly of section .text:
> > 
> >  <__variable_test_bit>:
> > __variable_test_bit():
> >0:   8b 54 24 08 mov0x8(%esp),%edx
> >4:   8b 44 24 04 mov0x4(%esp),%eax
> >8:   0f a3 02bt %eax,(%edx)
> >b:   19 c0   sbb%eax,%eax
> >d:   c3  ret
> >e:   66 90   xchg   %ax,%ax
> > 
> > 0010 <__constant_test_bit>:
> > __constant_test_bit():
> >   10:   8b 4c 24 04 mov0x4(%esp),%ecx
> >   14:   8b 44 24 08 mov0x8(%esp),%eax
> >   18:   89 ca   mov%ecx,%edx
> >   1a:   c1 fa 04sar$0x4,%edx
> >   1d:   8b 04 90mov(%eax,%edx,4),%eax
> >   20:   d3 e8   shr%cl,%eax
> >   22:   83 e0 01and$0x1,%eax
> >   25:   c3  ret
> 
> But that's due to the forced interface of generating a return code. Please 
> compare it at an inlined usage site, where GCC is free to do the comparison 
> directly and use the result in flags.

So I was thinking about the patch below on top of yours.

But turns out GCC indeed generates worse code even under the best of 
circumstances. For example the nested_vmx_disable_intercept_for_msr() change:

@@ -4275,24 +4275,24 @@ static void nested_vmx_disable_intercept
 */
if (msr <= 0x1fff) {
if (type & MSR_TYPE_R &&
-  !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+  !__test_bit(msr, msr_bitmap_l1 + 0x000 / f))
/* read-low */
__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

before (i.e. your series):

818b1082:   89 d0                   mov    %edx,%eax
818b1084:   48 0f a3 07             bt     %rax,(%rdi)
818b1088:   45 19 c0                sbb    %r8d,%r8d
818b108b:   45 85 c0                test   %r8d,%r8d
818b108e:   75 04                   jne    818b1094


after (with my 'optimization' patch applied):

818b1091:   89 d0                   mov    %edx,%eax
818b1093:   49 89 c0                mov    %rax,%r8
818b1096:   49 c1 f8 06             sar    $0x6,%r8
818b109a:   4e 8b 04 c7             mov    (%rdi,%r8,8),%r8
818b109e:   49 0f a3 d0             bt     %rdx,%r8
818b10a2:   72 04                   jb     818b10a8


So GCC, when left to its own devices, generates one more instruction and 4 more 
bytes. Why does GCC do that? Why doesn't it use BT directly and use the flag, i.e. 
something like [pseudocode]:

818b1082:   89 d0                   mov    %edx,%eax
818b1084:   48 0f a3 07             bt     %rax,(%rdi)
818b108e:   75 04                   jne    818b1094


?
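
As an aside, and only as a sketch: newer GCC releases (6 and later, which postdate this thread) grew asm flag-output constraints, which let BT hand its carry flag straight back to the compiler, so a branch on the result really does become BT plus JC/JNC, with no SBB and no extra test of a materialized value:

#include <stdio.h>

static inline int test_bit_bt(long nr, const unsigned long *addr)
{
	int set;

	/* "=@ccc": the output operand is the carry flag BT just produced. */
	asm("bt %2,%1" : "=@ccc" (set) : "m" (*addr), "Ir" (nr));
	return set;
}

int main(void)
{
	unsigned long word = 1UL << 5;

	if (test_bit_bt(5, &word))
		puts("bit 5 is set");
	return 0;
}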

In any case I take back my objection:

  Acked-by: Ingo Molnar 

Thanks,

Ingo


---
 arch/x86/include/asm/bitops.h |   19 +--
 1 file changed, 1 insertion(+), 18 deletions(-)

Index: tip/arch/x86/include/asm/bitops.h
===
--- tip.orig/arch/x86/include/asm/bitops.h
+++ tip/arch/x86/include/asm/bitops.h
@@ -323,24 +323,12 @@ static inline int variable_test_bit(long
return oldbit;
 }
 
-static __always_inline int __constant_test_bit(long nr, const unsigned long *addr)
+static __always_inline int __test_bit(long nr, const unsigned long *addr)
 {
return ((1UL << (nr & (BITS_PER_LONG-1))) &
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
 }
 
-static inline int __variable_test_bit(long nr, const unsigned long *addr)
-{
-   int oldbit;
-
-   asm volatile("bt %2,%1\n\t"
-"sbb %0,%0"
-: "=r" (oldbit)
-: "m" (*addr), "Ir" (nr));
-
-   return oldbit;
-}
-
 #if 0 /* Fool kernel-doc since it doesn't do macros yet */
 /**
  * test_bit - Determine whether a bit is set
@@ -362,11 +350,6 @@ static int __test_bit(int nr, const vola
 ? constant_test_bit((nr), (addr))  \
 : variable_test_bit((nr), (addr)))
 
-#define __test_bit(nr, addr)   \
-   (__builtin_constant_p((nr)) \
-? __constant_test_bit((nr), (addr))\
-: __variable_test_bit((nr), (addr)))
-
 /**
  * __ffs - find first set bit in word
  * @word: The word to search



Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread Ingo Molnar

* Michael S. Tsirkin  wrote:

> On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> > Presumably because gcc can't generate bt... whether or not it is worth it 
> > is another matter.
> > 
> > On August 30, 2015 11:05:49 PM PDT, Ingo Molnar  wrote:
> > >
> > >* Michael S. Tsirkin  wrote:
> > >
> > >> +static __always_inline int __constant_test_bit(long nr, const
> > >unsigned long *addr)
> > >> +{
> > >> +return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > >> +(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > >> +}
> > >> +
> > >> +static inline int __variable_test_bit(long nr, const unsigned long
> > >*addr)
> > >> +{
> > >> +int oldbit;
> > >> +
> > >> +asm volatile("bt %2,%1\n\t"
> > >> + "sbb %0,%0"
> > >> + : "=r" (oldbit)
> > >> + : "m" (*addr), "Ir" (nr));
> > >> +
> > >> +return oldbit;
> > >> +}
> > >
> > >Color me confused, why use assembly for this at all?
> > >
> > >Why not just use C for testing the bit (i.e. turn __constant_test_bit()
> > >into 
> > >__test_bit()) - that would also allow the compiler to propagate the
> > >result, 
> > >potentially more optimally than we can do it via SBB...
> > >
> > >Thanks,
> > >
> > >   Ingo
> 
> Exactly:
> 
> 
> Disassembly of section .text:
> 
>  <__variable_test_bit>:
> __variable_test_bit():
>0:   8b 54 24 08 mov0x8(%esp),%edx
>4:   8b 44 24 04 mov0x4(%esp),%eax
>8:   0f a3 02bt %eax,(%edx)
>b:   19 c0   sbb%eax,%eax
>d:   c3  ret
>e:   66 90   xchg   %ax,%ax
> 
> 0010 <__constant_test_bit>:
> __constant_test_bit():
>   10:   8b 4c 24 04 mov0x4(%esp),%ecx
>   14:   8b 44 24 08 mov0x8(%esp),%eax
>   18:   89 ca   mov%ecx,%edx
>   1a:   c1 fa 04sar$0x4,%edx
>   1d:   8b 04 90mov(%eax,%edx,4),%eax
>   20:   d3 e8   shr%cl,%eax
>   22:   83 e0 01and$0x1,%eax
>   25:   c3  ret

But that's due to the forced interface of generating a return code. Please compare 
it at an inlined usage site, where GCC is free to do the comparison directly and 
use the result in flags.

Thanks,

Ingo


Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread Michael S. Tsirkin
On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> Presumably because gcc can't generate bt... whether or not it is worth it is 
> another matter.
> 
> On August 30, 2015 11:05:49 PM PDT, Ingo Molnar  wrote:
> >
> >* Michael S. Tsirkin  wrote:
> >
> >> +static __always_inline int __constant_test_bit(long nr, const
> >unsigned long *addr)
> >> +{
> >> +  return ((1UL << (nr & (BITS_PER_LONG-1))) &
> >> +  (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> >> +}
> >> +
> >> +static inline int __variable_test_bit(long nr, const unsigned long
> >*addr)
> >> +{
> >> +  int oldbit;
> >> +
> >> +  asm volatile("bt %2,%1\n\t"
> >> +   "sbb %0,%0"
> >> +   : "=r" (oldbit)
> >> +   : "m" (*addr), "Ir" (nr));
> >> +
> >> +  return oldbit;
> >> +}
> >
> >Color me confused, why use assembly for this at all?
> >
> >Why not just use C for testing the bit (i.e. turn __constant_test_bit()
> >into 
> >__test_bit()) - that would also allow the compiler to propagate the
> >result, 
> >potentially more optimally than we can do it via SBB...
> >
> >Thanks,
> >
> > Ingo

Exactly:


Disassembly of section .text:

 <__variable_test_bit>:
__variable_test_bit():
   0:   8b 54 24 08             mov    0x8(%esp),%edx
   4:   8b 44 24 04             mov    0x4(%esp),%eax
   8:   0f a3 02                bt     %eax,(%edx)
   b:   19 c0                   sbb    %eax,%eax
   d:   c3                      ret
   e:   66 90                   xchg   %ax,%ax

0010 <__constant_test_bit>:
__constant_test_bit():
  10:   8b 4c 24 04             mov    0x4(%esp),%ecx
  14:   8b 44 24 08             mov    0x8(%esp),%eax
  18:   89 ca                   mov    %ecx,%edx
  1a:   c1 fa 04                sar    $0x4,%edx
  1d:   8b 04 90                mov    (%eax,%edx,4),%eax
  20:   d3 e8                   shr    %cl,%eax
  22:   83 e0 01                and    $0x1,%eax
  25:   c3                      ret


That's also probably why we still have variable_test_bit
for test_bit too. It's best to be consistent with that - do you agree?
Or would you rather drop that too?

-- 
MST


Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread H. Peter Anvin
Presumably because gcc can't generate bt... whether or not it is worth it is 
another matter.

On August 30, 2015 11:05:49 PM PDT, Ingo Molnar  wrote:
>
>* Michael S. Tsirkin  wrote:
>
>> +static __always_inline int __constant_test_bit(long nr, const
>unsigned long *addr)
>> +{
>> +return ((1UL << (nr & (BITS_PER_LONG-1))) &
>> +(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
>> +}
>> +
>> +static inline int __variable_test_bit(long nr, const unsigned long
>*addr)
>> +{
>> +int oldbit;
>> +
>> +asm volatile("bt %2,%1\n\t"
>> + "sbb %0,%0"
>> + : "=r" (oldbit)
>> + : "m" (*addr), "Ir" (nr));
>> +
>> +return oldbit;
>> +}
>
>Color me confused, why use assembly for this at all?
>
>Why not just use C for testing the bit (i.e. turn __constant_test_bit()
>into 
>__test_bit()) - that would also allow the compiler to propagate the
>result, 
>potentially more optimally than we can do it via SBB...
>
>Thanks,
>
>   Ingo

-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.


Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread Ingo Molnar

* Michael S. Tsirkin  wrote:

> +static __always_inline int __constant_test_bit(long nr, const unsigned long 
> *addr)
> +{
> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> +}
> +
> +static inline int __variable_test_bit(long nr, const unsigned long *addr)
> +{
> + int oldbit;
> +
> + asm volatile("bt %2,%1\n\t"
> +  "sbb %0,%0"
> +  : "=r" (oldbit)
> +  : "m" (*addr), "Ir" (nr));
> +
> + return oldbit;
> +}

Color me confused, why use assembly for this at all?

Why not just use C for testing the bit (i.e. turn __constant_test_bit() into 
__test_bit()) - that would also allow the compiler to propagate the result, 
potentially more optimally than we can do it via SBB...

Thanks,

Ingo


Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread Ingo Molnar

* Michael S. Tsirkin  wrote:

> +static __always_inline int __constant_test_bit(long nr, const unsigned long 
> *addr)
> +{
> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> +}
> +
> +static inline int __variable_test_bit(long nr, const unsigned long *addr)
> +{
> + int oldbit;
> +
> + asm volatile("bt %2,%1\n\t"
> +  "sbb %0,%0"
> +  : "=r" (oldbit)
> +  : "m" (*addr), "Ir" (nr));
> +
> + return oldbit;
> +}

Color me confused, why use assembly for this at all?

Why not just use C for testing the bit (i.e. turn __constant_test_bit() into 
__test_bit()) - that would also allow the compiler to propagate the result, 
potentially more optimally than we can do it via SBB...

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread H. Peter Anvin
Presumably because gcc can't generate bt... whether or not it is worth it is 
another matter.

On August 30, 2015 11:05:49 PM PDT, Ingo Molnar  wrote:
>
>* Michael S. Tsirkin  wrote:
>
>> +static __always_inline int __constant_test_bit(long nr, const
>unsigned long *addr)
>> +{
>> +return ((1UL << (nr & (BITS_PER_LONG-1))) &
>> +(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
>> +}
>> +
>> +static inline int __variable_test_bit(long nr, const unsigned long
>*addr)
>> +{
>> +int oldbit;
>> +
>> +asm volatile("bt %2,%1\n\t"
>> + "sbb %0,%0"
>> + : "=r" (oldbit)
>> + : "m" (*addr), "Ir" (nr));
>> +
>> +return oldbit;
>> +}
>
>Color me confused, why use assembly for this at all?
>
>Why not just use C for testing the bit (i.e. turn __constant_test_bit()
>into 
>__test_bit()) - that would also allow the compiler to propagate the
>result, 
>potentially more optimally than we can do it via SBB...
>
>Thanks,
>
>   Ingo

-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread Michael S. Tsirkin
On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> Presumably because gcc can't generate bt... whether or not it is worth it is 
> another matter.
> 
> On August 30, 2015 11:05:49 PM PDT, Ingo Molnar  wrote:
> >
> >* Michael S. Tsirkin  wrote:
> >
> >> +static __always_inline int __constant_test_bit(long nr, const
> >unsigned long *addr)
> >> +{
> >> +  return ((1UL << (nr & (BITS_PER_LONG-1))) &
> >> +  (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> >> +}
> >> +
> >> +static inline int __variable_test_bit(long nr, const unsigned long
> >*addr)
> >> +{
> >> +  int oldbit;
> >> +
> >> +  asm volatile("bt %2,%1\n\t"
> >> +   "sbb %0,%0"
> >> +   : "=r" (oldbit)
> >> +   : "m" (*addr), "Ir" (nr));
> >> +
> >> +  return oldbit;
> >> +}
> >
> >Color me confused, why use assembly for this at all?
> >
> >Why not just use C for testing the bit (i.e. turn __constant_test_bit()
> >into 
> >__test_bit()) - that would also allow the compiler to propagate the
> >result, 
> >potentially more optimally than we can do it via SBB...
> >
> >Thanks,
> >
> > Ingo

Exactly:


Disassembly of section .text:

 <__variable_test_bit>:
__variable_test_bit():
   0:   8b 54 24 08 mov0x8(%esp),%edx
   4:   8b 44 24 04 mov0x4(%esp),%eax
   8:   0f a3 02bt %eax,(%edx)
   b:   19 c0   sbb%eax,%eax
   d:   c3  ret
   e:   66 90   xchg   %ax,%ax

0010 <__constant_test_bit>:
__constant_test_bit():
  10:   8b 4c 24 04 mov0x4(%esp),%ecx
  14:   8b 44 24 08 mov0x8(%esp),%eax
  18:   89 ca   mov%ecx,%edx
  1a:   c1 fa 04sar$0x4,%edx
  1d:   8b 04 90mov(%eax,%edx,4),%eax
  20:   d3 e8   shr%cl,%eax
  22:   83 e0 01and$0x1,%eax
  25:   c3  ret


That's also probably why we still have variable_test_bit
for test_bit too. It's best to be consistent with that - do you agree?
Or would you rather drop that too?

-- 
MST
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread yalin wang

> On Aug 31, 2015, at 15:59, Ingo Molnar  wrote:
> 
> 
> * Michael S. Tsirkin  wrote:
> 
>> On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
>>> Presumably because gcc can't generate bt... whether or not it is worth it 
>>> is another matter.
>>> 
>>> On August 30, 2015 11:05:49 PM PDT, Ingo Molnar  wrote:
 
 * Michael S. Tsirkin  wrote:
 
> +static __always_inline int __constant_test_bit(long nr, const
 unsigned long *addr)
> +{
> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> +}
> +
> +static inline int __variable_test_bit(long nr, const unsigned long
 *addr)
> +{
> + int oldbit;
> +
> + asm volatile("bt %2,%1\n\t"
> +  "sbb %0,%0"
> +  : "=r" (oldbit)
> +  : "m" (*addr), "Ir" (nr));
> +
> + return oldbit;
> +}
 
 Color me confused, why use assembly for this at all?
 
 Why not just use C for testing the bit (i.e. turn __constant_test_bit()
 into 
 __test_bit()) - that would also allow the compiler to propagate the
 result, 
 potentially more optimally than we can do it via SBB...
 
 Thanks,
 
Ingo
>> 
>> Exactly:
>> 
>> 
>> Disassembly of section .text:
>> 
>>  <__variable_test_bit>:
>> __variable_test_bit():
>>   0:   8b 54 24 08 mov0x8(%esp),%edx
>>   4:   8b 44 24 04 mov0x4(%esp),%eax
>>   8:   0f a3 02bt %eax,(%edx)
>>   b:   19 c0   sbb%eax,%eax
>>   d:   c3  ret
>>   e:   66 90   xchg   %ax,%ax
>> 
>> 0010 <__constant_test_bit>:
>> __constant_test_bit():
>>  10:   8b 4c 24 04 mov0x4(%esp),%ecx
>>  14:   8b 44 24 08 mov0x8(%esp),%eax
>>  18:   89 ca   mov%ecx,%edx
>>  1a:   c1 fa 04sar$0x4,%edx
>>  1d:   8b 04 90mov(%eax,%edx,4),%eax
>>  20:   d3 e8   shr%cl,%eax
>>  22:   83 e0 01and$0x1,%eax
>>  25:   c3  ret
> 
> But that's due to the forced interface of generating a return code. Please 
> compare 
> it at an inlined usage site, where GCC is free to do the comparison directly 
> and 
> use the result in flags.
just curious :
it seems __variable_test_bit()  use less instructions,
why not always use __variable_test_bit() , remove __constant_test_bit() version 
?

Thanks




--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread Ingo Molnar

* Ingo Molnar  wrote:

> > Disassembly of section .text:
> > 
> >  <__variable_test_bit>:
> > __variable_test_bit():
> >0:   8b 54 24 08 mov0x8(%esp),%edx
> >4:   8b 44 24 04 mov0x4(%esp),%eax
> >8:   0f a3 02bt %eax,(%edx)
> >b:   19 c0   sbb%eax,%eax
> >d:   c3  ret
> >e:   66 90   xchg   %ax,%ax
> > 
> > 0010 <__constant_test_bit>:
> > __constant_test_bit():
> >   10:   8b 4c 24 04 mov0x4(%esp),%ecx
> >   14:   8b 44 24 08 mov0x8(%esp),%eax
> >   18:   89 ca   mov%ecx,%edx
> >   1a:   c1 fa 04sar$0x4,%edx
> >   1d:   8b 04 90mov(%eax,%edx,4),%eax
> >   20:   d3 e8   shr%cl,%eax
> >   22:   83 e0 01and$0x1,%eax
> >   25:   c3  ret
> 
> But that's due to the forced interface of generating a return code. Please 
> compare it at an inlined usage site, where GCC is free to do the comparison 
> directly and use the result in flags.

So I was thinking about the patch below on top of yours.

But turns out GCC indeed generates worse code even under the best of 
circumstances. For example the nested_vmx_disable_intercept_for_msr() change:

@@ -4275,24 +4275,24 @@ static void nested_vmx_disable_intercept
 */
if (msr <= 0x1fff) {
if (type & MSR_TYPE_R &&
-  !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+  !__test_bit(msr, msr_bitmap_l1 + 0x000 / f))
/* read-low */
__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

before (i.e. your series):

818b1082:   89 d0   mov%edx,%eax
818b1084:   48 0f a3 07 bt %rax,(%rdi)
818b1088:   45 19 c0sbb%r8d,%r8d
818b108b:   45 85 c0test   %r8d,%r8d
818b108e:   75 04   jne818b1094 


after (with my 'optimization' patch applied):

818b1091:   89 d0   mov%edx,%eax
818b1093:   49 89 c0mov%rax,%r8
818b1096:   49 c1 f8 06 sar$0x6,%r8
818b109a:   4e 8b 04 c7 mov(%rdi,%r8,8),%r8
818b109e:   49 0f a3 d0 bt %rdx,%r8
818b10a2:   72 04   jb 818b10a8 


So GCC when left to its own devices, generates one more instruction and 4 more 
bytes. Why does GCC do that? Why doesn't it use BT directly and use the flag, 
i.e. 
something like [pseudocode]:

818b1082:   89 d0   mov%edx,%eax
818b1084:   48 0f a3 07 bt %rax,(%rdi)
818b108e:   75 04   jne818b1094 


?

In any case I take back my objection:

  Acked-by: Ingo Molnar 

Thanks,

Ingo


---
 arch/x86/include/asm/bitops.h |   19 +--
 1 file changed, 1 insertion(+), 18 deletions(-)

Index: tip/arch/x86/include/asm/bitops.h
===
--- tip.orig/arch/x86/include/asm/bitops.h
+++ tip/arch/x86/include/asm/bitops.h
@@ -323,24 +323,12 @@ static inline int variable_test_bit(long
return oldbit;
 }
 
-static __always_inline int __constant_test_bit(long nr, const unsigned long *addr)
+static __always_inline int __test_bit(long nr, const unsigned long *addr)
 {
return ((1UL << (nr & (BITS_PER_LONG-1))) &
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
 }
 
-static inline int __variable_test_bit(long nr, const unsigned long *addr)
-{
-   int oldbit;
-
-   asm volatile("bt %2,%1\n\t"
-"sbb %0,%0"
-: "=r" (oldbit)
-: "m" (*addr), "Ir" (nr));
-
-   return oldbit;
-}
-
 #if 0 /* Fool kernel-doc since it doesn't do macros yet */
 /**
  * test_bit - Determine whether a bit is set
@@ -362,11 +350,6 @@ static int __test_bit(int nr, const volatile unsigned long *addr);
 ? constant_test_bit((nr), (addr))  \
 : variable_test_bit((nr), (addr)))
 
-#define __test_bit(nr, addr)   \
-   (__builtin_constant_p((nr)) \
-? __constant_test_bit((nr), (addr))\
-: __variable_test_bit((nr), (addr)))
-
 /**
  * __ffs - find first set bit in word
  * @word: The word to search



Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread Ingo Molnar

* yalin wang  wrote:

> 
> > On Aug 31, 2015, at 15:59, Ingo Molnar  wrote:
> > 
> > 
> > * Michael S. Tsirkin  wrote:
> > 
> >> On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> >>> Presumably because gcc can't generate bt... whether or not it is worth it 
> >>> is another matter.
> >>> 
> >>> On August 30, 2015 11:05:49 PM PDT, Ingo Molnar  wrote:
>  
>  * Michael S. Tsirkin  wrote:
>  
> > +static __always_inline int __constant_test_bit(long nr, const
>  unsigned long *addr)
> > +{
> > +   return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > +   (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > +}
> > +
> > +static inline int __variable_test_bit(long nr, const unsigned long
>  *addr)
> > +{
> > +   int oldbit;
> > +
> > +   asm volatile("bt %2,%1\n\t"
> > +"sbb %0,%0"
> > +: "=r" (oldbit)
> > +: "m" (*addr), "Ir" (nr));
> > +
> > +   return oldbit;
> > +}
>  
>  Color me confused, why use assembly for this at all?
>  
>  Why not just use C for testing the bit (i.e. turn __constant_test_bit()
>  into 
>  __test_bit()) - that would also allow the compiler to propagate the
>  result, 
>  potentially more optimally than we can do it via SBB...
>  
>  Thanks,
>  
>   Ingo
> >> 
> >> Exactly:
> >> 
> >> 
> >> Disassembly of section .text:
> >> 
> >>  <__variable_test_bit>:
> >> __variable_test_bit():
> >>   0:   8b 54 24 08 mov0x8(%esp),%edx
> >>   4:   8b 44 24 04 mov0x4(%esp),%eax
> >>   8:   0f a3 02bt %eax,(%edx)
> >>   b:   19 c0   sbb%eax,%eax
> >>   d:   c3  ret
> >>   e:   66 90   xchg   %ax,%ax
> >> 
> >> 0010 <__constant_test_bit>:
> >> __constant_test_bit():
> >>  10:   8b 4c 24 04 mov0x4(%esp),%ecx
> >>  14:   8b 44 24 08 mov0x8(%esp),%eax
> >>  18:   89 ca   mov%ecx,%edx
> >>  1a:   c1 fa 04sar$0x4,%edx
> >>  1d:   8b 04 90mov(%eax,%edx,4),%eax
> >>  20:   d3 e8   shr%cl,%eax
> >>  22:   83 e0 01and$0x1,%eax
> >>  25:   c3  ret
> > 
> > But that's due to the forced interface of generating a return code. Please
> > compare it at an inlined usage site, where GCC is free to do the comparison
> > directly and use the result in flags.
> Just curious: it seems __variable_test_bit() uses fewer instructions,
> so why not always use __variable_test_bit() and remove the
> __constant_test_bit() version?

It's an artifact of testing it in isolation.

For constant bit positions GCC is able to do a fairly good job:

8103d2a0 :
8103d2a0:   f6 87 4a 02 00 00 08testb  $0x8,0x24a(%rdi)
...
8103d2ab:   75 39   jne8103d2e6 



with just 2 instructions: a TESTB plus using the flag result in a JNE.

Using variable_test_bit() forces the result into a register, which is 
suboptimal.
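
A user-space reduction (hypothetical, not from the thread) makes this easy to
reproduce: with the pure-C helper and a constant bit number, gcc -O2 typically
folds the whole test into a TEST plus a conditional jump at the call site,
which can be checked with objdump -d.

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(long))

/* Pure-C analogue of __constant_test_bit()/__test_bit(). */
static inline int test_bit_c(long nr, const unsigned long *addr)
{
	return ((1UL << (nr & (BITS_PER_LONG - 1))) &
		addr[nr / BITS_PER_LONG]) != 0;
}

unsigned long bitmap[4];

void report(void)
{
	/* Constant nr: typically a single testb $0x8 against bitmap, then jne. */
	if (test_bit_c(3, bitmap))
		puts("bit 3 is set");
}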

Thanks,

Ingo


Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread Michael S. Tsirkin
On Mon, Aug 31, 2015 at 09:59:47AM +0200, Ingo Molnar wrote:
> 
> * Michael S. Tsirkin  wrote:
> 
> > On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> > > Presumably because gcc can't generate bt... whether or not it is worth it 
> > > is another matter.
> > > 
> > > On August 30, 2015 11:05:49 PM PDT, Ingo Molnar  wrote:
> > > >
> > > >* Michael S. Tsirkin  wrote:
> > > >
> > > >> +static __always_inline int __constant_test_bit(long nr, const
> > > >unsigned long *addr)
> > > >> +{
> > > >> +  return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > > >> +  (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > > >> +}
> > > >> +
> > > >> +static inline int __variable_test_bit(long nr, const unsigned long
> > > >*addr)
> > > >> +{
> > > >> +  int oldbit;
> > > >> +
> > > >> +  asm volatile("bt %2,%1\n\t"
> > > >> +   "sbb %0,%0"
> > > >> +   : "=r" (oldbit)
> > > >> +   : "m" (*addr), "Ir" (nr));
> > > >> +
> > > >> +  return oldbit;
> > > >> +}
> > > >
> > > >Color me confused, why use assembly for this at all?
> > > >
> > > >Why not just use C for testing the bit (i.e. turn __constant_test_bit()
> > > >into 
> > > >__test_bit()) - that would also allow the compiler to propagate the
> > > >result, 
> > > >potentially more optimally than we can do it via SBB...
> > > >
> > > >Thanks,
> > > >
> > > > Ingo
> > 
> > Exactly:
> > 
> > 
> > Disassembly of section .text:
> > 
> >  <__variable_test_bit>:
> > __variable_test_bit():
> >0:   8b 54 24 08 mov0x8(%esp),%edx
> >4:   8b 44 24 04 mov0x4(%esp),%eax
> >8:   0f a3 02bt %eax,(%edx)
> >b:   19 c0   sbb%eax,%eax
> >d:   c3  ret
> >e:   66 90   xchg   %ax,%ax
> > 
> > 0010 <__constant_test_bit>:
> > __constant_test_bit():
> >   10:   8b 4c 24 04 mov0x4(%esp),%ecx
> >   14:   8b 44 24 08 mov0x8(%esp),%eax
> >   18:   89 ca   mov%ecx,%edx
> >   1a:   c1 fa 04sar$0x4,%edx
> >   1d:   8b 04 90mov(%eax,%edx,4),%eax
> >   20:   d3 e8   shr%cl,%eax
> >   22:   83 e0 01and$0x1,%eax
> >   25:   c3  ret
> 
> But that's due to the forced interface of generating a return code. Please
> compare it at an inlined usage site, where GCC is free to do the comparison
> directly and use the result in flags.
> 
> Thanks,
> 
>   Ingo

I applied this patch on top of mine:


diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 9229334..2aed985 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -323,24 +323,17 @@ static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
return oldbit;
 }
 
-static __always_inline int __constant_test_bit(long nr, const unsigned long *addr)
+/**
+ * __test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline int __test_bit(long nr, const unsigned long *addr)
 {
return ((1UL << (nr & (BITS_PER_LONG-1))) &
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
 }
 
-static inline int __variable_test_bit(long nr, const unsigned long *addr)
-{
-   int oldbit;
-
-   asm volatile("bt %2,%1\n\t"
-"sbb %0,%0"
-: "=r" (oldbit)
-: "m" (*addr), "Ir" (nr));
-
-   return oldbit;
-}
-
 #if 0 /* Fool kernel-doc since it doesn't do macros yet */
 /**
  * test_bit - Determine whether a bit is set
@@ -348,13 +341,6 @@ static inline int __variable_test_bit(long nr, const unsigned long *addr)
  * @addr: Address to start counting from
  */
 static int test_bit(int nr, const volatile unsigned long *addr);
-
-/**
- * __test_bit - Determine whether a bit is set
- * @nr: bit number to test
- * @addr: Address to start counting from
- */
-static int __test_bit(int nr, const volatile unsigned long *addr);
 #endif
 
 #define test_bit(nr, addr) \
@@ -362,10 +348,6 @@ static int __test_bit(int nr, const volatile unsigned long *addr);
 ? constant_test_bit((nr), (addr))  \
 : variable_test_bit((nr), (addr)))
 
-#define __test_bit(nr, addr)   \
-   (__builtin_constant_p((nr)) \
-? __constant_test_bit((nr), (addr))\
-: __variable_test_bit((nr), (addr)))
 
 /**
  * __ffs - find first set bit in word


And the code size went up:

   134836    2997    8372  146205   23b1d arch/x86/kvm/kvm-intel.ko  ->
   134846    2997    8372  146215   23b27 arch/x86/kvm/kvm-intel.ko 

   342690   47640 441  390771   5f673 arch/x86/kvm/kvm.ko ->
   342738   47640 441  390819   5f6a3 arch/x86/kvm/kvm.ko   

I tried removing __always_inline, this had no effect.

Re: [PATCH 1/2] x86/bitops: implement __test_bit

2015-08-31 Thread Ingo Molnar

* Michael S. Tsirkin  wrote:

> On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> > Presumably because gcc can't generate bt... whether or not it is worth it 
> > is another matter.
> > 
> > On August 30, 2015 11:05:49 PM PDT, Ingo Molnar  wrote:
> > >
> > >* Michael S. Tsirkin  wrote:
> > >
> > >> +static __always_inline int __constant_test_bit(long nr, const
> > >unsigned long *addr)
> > >> +{
> > >> +return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > >> +(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > >> +}
> > >> +
> > >> +static inline int __variable_test_bit(long nr, const unsigned long
> > >*addr)
> > >> +{
> > >> +int oldbit;
> > >> +
> > >> +asm volatile("bt %2,%1\n\t"
> > >> + "sbb %0,%0"
> > >> + : "=r" (oldbit)
> > >> + : "m" (*addr), "Ir" (nr));
> > >> +
> > >> +return oldbit;
> > >> +}
> > >
> > >Color me confused, why use assembly for this at all?
> > >
> > >Why not just use C for testing the bit (i.e. turn __constant_test_bit()
> > >into 
> > >__test_bit()) - that would also allow the compiler to propagate the
> > >result, 
> > >potentially more optimally than we can do it via SBB...
> > >
> > >Thanks,
> > >
> > >   Ingo
> 
> Exactly:
> 
> 
> Disassembly of section .text:
> 
>  <__variable_test_bit>:
> __variable_test_bit():
>0:   8b 54 24 08 mov0x8(%esp),%edx
>4:   8b 44 24 04 mov0x4(%esp),%eax
>8:   0f a3 02bt %eax,(%edx)
>b:   19 c0   sbb%eax,%eax
>d:   c3  ret
>e:   66 90   xchg   %ax,%ax
> 
> 0010 <__constant_test_bit>:
> __constant_test_bit():
>   10:   8b 4c 24 04 mov0x4(%esp),%ecx
>   14:   8b 44 24 08 mov0x8(%esp),%eax
>   18:   89 ca   mov%ecx,%edx
>   1a:   c1 fa 04sar$0x4,%edx
>   1d:   8b 04 90mov(%eax,%edx,4),%eax
>   20:   d3 e8   shr%cl,%eax
>   22:   83 e0 01and$0x1,%eax
>   25:   c3  ret

But that's due to the forced interface of generating a return code. Please
compare it at an inlined usage site, where GCC is free to do the comparison
directly and use the result in flags.

Thanks,

Ingo


[PATCH 1/2] x86/bitops: implement __test_bit

2015-08-30 Thread Michael S. Tsirkin
One little-known side effect of test_bit is that it adds
a kind of compiler barrier, since the pointer parameter
is volatile.

It seems risky to change the semantics of test_bit so let's just
add __test_bit (matching __set_bit and __clear_bit) that does
not add such a barrier.

It will be used by kvm on x86, where it shaves several bytes off
the binary size. A small win, but it comes at no cost, so why not.
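
To illustrate the barrier effect (a sketch, not part of the patch): with the
volatile-qualified pointer the compiler must issue a separate load for every
test, while a plain pointer lets it reuse a single load across tests.

#define BITS_PER_LONG (8 * sizeof(long))

static inline int test_bit_volatile(long nr, const volatile unsigned long *addr)
{
	return (addr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

static inline int test_bit_plain(long nr, const unsigned long *addr)
{
	return (addr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

int both_low_bits_volatile(const volatile unsigned long *map)
{
	/* Two separate loads of map[0]: the volatile accesses may not be merged. */
	return test_bit_volatile(0, map) && test_bit_volatile(1, map);
}

int both_low_bits_plain(const unsigned long *map)
{
	/* The compiler is free to load map[0] once and test both bits. */
	return test_bit_plain(0, map) && test_bit_plain(1, map);
}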

Signed-off-by: Michael S. Tsirkin 
---

x86 maintainers - please specify whether you are ok with
adding this to arch/x86/include/asm/bitops.h
An alternative is to add this to kvm/x86 only.
It might be worth it to add this to all architectures,
though I haven't explored too much.

 arch/x86/include/asm/bitops.h | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index cfe3b95..9229334 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -323,6 +323,24 @@ static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
return oldbit;
 }
 
+static __always_inline int __constant_test_bit(long nr, const unsigned long *addr)
+{
+   return ((1UL << (nr & (BITS_PER_LONG-1))) &
+   (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
+}
+
+static inline int __variable_test_bit(long nr, const unsigned long *addr)
+{
+   int oldbit;
+
+   asm volatile("bt %2,%1\n\t"
+"sbb %0,%0"
+: "=r" (oldbit)
+: "m" (*addr), "Ir" (nr));
+
+   return oldbit;
+}
+
 #if 0 /* Fool kernel-doc since it doesn't do macros yet */
 /**
  * test_bit - Determine whether a bit is set
@@ -330,6 +348,13 @@ static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
  * @addr: Address to start counting from
  */
 static int test_bit(int nr, const volatile unsigned long *addr);
+
+/**
+ * __test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static int __test_bit(int nr, const volatile unsigned long *addr);
 #endif
 
 #define test_bit(nr, addr) \
@@ -337,6 +362,11 @@ static int test_bit(int nr, const volatile unsigned long *addr);
 ? constant_test_bit((nr), (addr))  \
 : variable_test_bit((nr), (addr)))
 
+#define __test_bit(nr, addr)   \
+   (__builtin_constant_p((nr)) \
+? __constant_test_bit((nr), (addr))\
+: __variable_test_bit((nr), (addr)))
+
 /**
  * __ffs - find first set bit in word
  * @word: The word to search
-- 
MST
