On 27.8.19. 20:52, Richard Henderson wrote:
On 8/27/19 2:37 AM, Stefan Brankovic wrote:
+ for (i = 0; i < 4; i++) {
+ switch (i) {
+ case 0:
+ /*
+ * Get high doubleword of vA to perform 6-5-5 pack of pixels
+ * 1 and 2.
+ */
+ get_avr64(avr, VA, true);
+ tcg_gen_movi_i64(result, 0x0ULL);
+ break;
+ case 1:
+ /*
+ * Get low doubleword of vA to perform 6-5-5 pack of pixels
+ * 3 and 4.
+ */
+ get_avr64(avr, VA, false);
+ break;
+ case 2:
+ /*
+ * Get high doubleword of vB to perform 6-5-5 pack of pixels
+ * 5 and 6.
+ */
+ get_avr64(avr, VB, true);
+ tcg_gen_movi_i64(result, 0x0ULL);
+ break;
+ case 3:
+ /*
+ * Get low doubleword of vB to perform 6-5-5 pack of pixels
+ * 7 and 8.
+ */
+ get_avr64(avr, VB, false);
+ break;
+ }
+ /* Perform the packing for 2 pixels (each iteration packs 1 pixel). */
+ tcg_gen_movi_i64(tmp, 0x0ULL);
+ for (j = 0; j < 2; j++) {
+ tcg_gen_shri_i64(shifted, avr, (j * 16 + 3));
+ tcg_gen_andi_i64(shifted, shifted, mask1 << (j * 16));
+ tcg_gen_or_i64(tmp, tmp, shifted);
+
+ tcg_gen_shri_i64(shifted, avr, (j * 16 + 6));
+ tcg_gen_andi_i64(shifted, shifted, mask2 << (j * 16));
+ tcg_gen_or_i64(tmp, tmp, shifted);
+
+ tcg_gen_shri_i64(shifted, avr, (j * 16 + 9));
+ tcg_gen_andi_i64(shifted, shifted, mask3 << (j * 16));
+ tcg_gen_or_i64(tmp, tmp, shifted);
+ }
+ if ((i == 0) || (i == 2)) {
+ tcg_gen_shli_i64(tmp, tmp, 32);
+ }
+ tcg_gen_or_i64(result, result, tmp);
+ if (i == 1) {
+ /* Place packed pixels 1:4 to high doubleword of vD. */
+ tcg_gen_mov_i64(result1, result);
+ }
+ if (i == 3) {
+ /* Place packed pixels 5:8 to low doubleword of vD. */
+ tcg_gen_mov_i64(result2, result);
+ }
+ }
+ set_avr64(VT, result1, true);
+ set_avr64(VT, result2, false);
I really have a hard time believing that it is worthwhile to inline all of this
code. By my count this is 82 non-move opcodes. That is a *lot* of inline
expansion.
However, I can well imagine that the existing out-of-line helper is less than
optimal.
-void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
- int i, j;
- ppc_avr_t result;
-#if defined(HOST_WORDS_BIGENDIAN)
- const ppc_avr_t *x[2] = { a, b };
-#else
- const ppc_avr_t *x[2] = { b, a };
-#endif
-
- VECTOR_FOR_INORDER_I(i, u64) {
- VECTOR_FOR_INORDER_I(j, u32) {
- uint32_t e = x[i]->u32[j];
Double indirect loads?
-
- result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
- ((e >> 6) & 0x3e0) |
- ((e >> 3) & 0x1f));
Store to temporary ...
- }
- }
- *r = result;
... and then copy?
Try replacing the existing helper with something like the following.
r~
/*
 * Pack one 32-bit pixel, taken from bits [shr+31 : shr] of 'a', into
 * 6-5-5 format and return the 16-bit result positioned at bit 'shl'.
 *
 * After shifting right by (shr + 9) / (shr + 6) / (shr + 3), the three
 * color fields land at bit positions 10, 5 and 0 of the result, so the
 * masks must be 0xfc00, 0x3e0 and 0x1f respectively (the original
 * 0x3f/0x1f/0x1f masks selected unshifted field widths and dropped the
 * positioning of the upper two fields).
 */
static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
{
    uint64_t r;
    r = ((a >> (shr + 9)) & 0xfc00) << shl;
    r |= ((a >> (shr + 6)) & 0x3e0) << shl;
    r |= ((a >> (shr + 3)) & 0x1f) << shl;
    return r;
}
/*
 * Pack the four 32-bit pixels held in the doubleword pair (ah, al)
 * into one doubleword of four 16-bit 6-5-5 pixels, most significant
 * pixel first.
 */
static inline uint64_t pkpx_2(uint64_t ah, uint64_t al)
{
    uint64_t packed = pkpx_1(ah, 32, 48) | pkpx_1(ah, 0, 32);
    packed |= pkpx_1(al, 32, 16) | pkpx_1(al, 0, 0);
    return packed;
}
void helper_vpkpx(uint64_t *r, uint64_t *a, uint64_t *b)
{
uint64_t rh = pkpx_2(a->VsrD(0), a->VsrD(1));
uint64_t rl = pkpx_2(b->VsrD(0), b->VsrD(1));
r->VsrD(0) = rh;
r->VsrD(1) = rl;
}
I implemented vpkpx as you suggested above, with small modifications (so
that it builds and gives the correct result). It looks like this:
/*
 * Extract the 32-bit pixel starting at bit 'shr' of 'a', pack it into
 * 6-5-5 format (three fields at bits 10, 5 and 0), and return the
 * 16-bit result shifted left by 'shl'.
 */
static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
{
    uint64_t fields = (a >> (shr + 9)) & 0xfc00;
    fields |= (a >> (shr + 6)) & 0x3e0;
    fields |= (a >> (shr + 3)) & 0x1f;
    return fields << shl;
}
/*
 * Pack four pixels — two from each input doubleword — into a single
 * doubleword of 16-bit 6-5-5 pixels, keeping their original order.
 */
static inline uint64_t pkpx_2(uint64_t ah, uint64_t al)
{
    uint64_t hi = pkpx_1(ah, 32, 48) | pkpx_1(ah, 0, 32);
    uint64_t lo = pkpx_1(al, 32, 16) | pkpx_1(al, 0, 0);
    return hi | lo;
}
/*
 * vpkpx: 6-5-5 pack of eight pixels from vA and vB into vD.
 *
 * Use the VsrD() accessor (as in the version quoted earlier in this
 * thread) instead of raw u64[] indexing: raw u64[1]/u64[0] indexing
 * picks the doublewords in host order and is therefore correct only on
 * little-endian hosts — the portability problem noted below.  VsrD()
 * indexes in architectural order on both endiannesses.  Both results
 * are computed before either store so that r may alias a or b.
 */
void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
    uint64_t rh = pkpx_2(a->VsrD(0), a->VsrD(1));
    uint64_t rl = pkpx_2(b->VsrD(0), b->VsrD(1));
    r->VsrD(0) = rh;
    r->VsrD(1) = rl;
}
I also noticed that this would work only on little-endian hosts, so we
would need to modify it in order to support big-endian hosts (this
shouldn't affect the performance results).
Then I ran my performance tests and got the following results (the test
calls vpkpx 100,000 times):
1) Current helper implementation: ~ 157 ms
2) helper implementation you suggested: ~94 ms
3) tcg implementation: ~75 ms
Attached file contains assembly code for both current implementation and
implementation you suggested, so please take a look at that as well.
Kind Regards,
Stefan
Current vpkpx implementation:
1)Both c and assembly code:
Dump of assembler code for function helper_vpkpx:
1267 {
0x0000000000195fe0 <+0>: 48 83 ec 38 sub $0x38,%rsp
1268 int i, j;
1269 ppc_avr_t result;
1270 #if defined(HOST_WORDS_BIGENDIAN)
1271 const ppc_avr_t *x[2] = { a, b };
1272 #else
1273 const ppc_avr_t *x[2] = { b, a };
0x0000000000195fe4 <+4>: b9 07 00 00 00 mov $0x7,%ecx
1267 {
0x0000000000195fe9 <+9>: 64 48 8b 04 25 28 00 00 00 mov
%fs:0x28,%rax
0x0000000000195ff2 <+18>: 48 89 44 24 28 mov %rax,0x28(%rsp)
0x0000000000195ff7 <+23>: 31 c0 xor %eax,%eax
0x0000000000195ff9 <+25>: 4c 8d 4c 24 10 lea 0x10(%rsp),%r9
1268 int i, j;
1269 ppc_avr_t result;
1270 #if defined(HOST_WORDS_BIGENDIAN)
1271 const ppc_avr_t *x[2] = { a, b };
1272 #else
1273 const ppc_avr_t *x[2] = { b, a };
0x0000000000195ffe <+30>: 48 89 54 24 10 mov %rdx,0x10(%rsp)
0x0000000000196003 <+35>: 48 89 74 24 18 mov %rsi,0x18(%rsp)
0x0000000000196008 <+40>: 44 8d 51 fc lea -0x4(%rcx),%r10d
0x000000000019600c <+44>: 48 83 c6 0c add $0xc,%rsi
1278 uint32_t e = x[i]->u32[j];
0x0000000000196010 <+48>: 8b 06 mov (%rsi),%eax
1279
1280 result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
0x0000000000196012 <+50>: 4c 63 d9 movslq %ecx,%r11
0x0000000000196015 <+53>: 83 e9 01 sub $0x1,%ecx
0x0000000000196018 <+56>: 48 83 ee 04 sub $0x4,%rsi
0x000000000019601c <+60>: 89 c2 mov %eax,%edx
0x000000000019601e <+62>: c1 ea 09 shr $0x9,%edx
0x0000000000196021 <+65>: 41 89 d0 mov %edx,%r8d
0x0000000000196024 <+68>: 89 c2 mov %eax,%edx
0x0000000000196026 <+70>: c1 e8 03 shr $0x3,%eax
0x0000000000196029 <+73>: c1 ea 06 shr $0x6,%edx
0x000000000019602c <+76>: 66 41 81 e0 00 fc and $0xfc00,%r8w
0x0000000000196032 <+82>: 83 e0 1f and $0x1f,%eax
0x0000000000196035 <+85>: 66 81 e2 e0 03 and $0x3e0,%dx
0x000000000019603a <+90>: 44 09 c2 or %r8d,%edx
0x000000000019603d <+93>: 09 d0 or %edx,%eax
1277 VECTOR_FOR_INORDER_I(j, u32) {
0x000000000019603f <+95>: 41 39 ca cmp %ecx,%r10d
1279
1280 result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
0x0000000000196042 <+98>: 66 42 89 04 5c mov %ax,(%rsp,%r11,2)
1277 VECTOR_FOR_INORDER_I(j, u32) {
0x0000000000196047 <+103>: 75 c7 jne 0x196010 <helper_vpkpx+48>
1276 VECTOR_FOR_INORDER_I(i, u64) {
0x0000000000196049 <+105>: 41 83 fa ff cmp $0xffffffff,%r10d
0x000000000019604d <+109>: 44 89 d1 mov %r10d,%ecx
0x0000000000196050 <+112>: 74 0e je 0x196060 <helper_vpkpx+128>
0x0000000000196052 <+114>: 49 8b 31 mov (%r9),%rsi
0x0000000000196055 <+117>: 49 83 e9 08 sub $0x8,%r9
0x0000000000196059 <+121>: eb ad jmp 0x196008 <helper_vpkpx+40>
0x000000000019605b <+123>: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
1281 ((e >> 6) & 0x3e0) |
1282 ((e >> 3) & 0x1f));
1283 // printf("%x\n",result.u16[4 * i + j]);
1284 }
1285 }
1286 // printf("%lx\n",result.u64[0]);
1287 // printf("%lx\n",result.u64[1]);
1288 *r = result;
0x0000000000196060 <+128>: 48 8b 04 24 mov (%rsp),%rax
0x0000000000196064 <+132>: 48 8b 54 24 08 mov 0x8(%rsp),%rdx
0x0000000000196069 <+137>: 48 89 07 mov %rax,(%rdi)
0x000000000019606c <+140>: 48 89 57 08 mov %rdx,0x8(%rdi)
1289 }
0x0000000000196070 <+144>: 48 8b 44 24 28 mov 0x28(%rsp),%rax
0x0000000000196075 <+149>: 64 48 33 04 25 28 00 00 00 xor
%fs:0x28,%rax
0x000000000019607e <+158>: 75 05 jne 0x196085 <helper_vpkpx+165>
0x0000000000196080 <+160>: 48 83 c4 38 add $0x38,%rsp
0x0000000000196084 <+164>: c3 retq
0x0000000000196085 <+165>: e8 2e 66 f0 ff callq 0x9c6b8
End of assembler dump.
2) Only assembly code:
Dump of assembler code for function helper_vpkpx:
0x0000000000195fe0 <+0>: 48 83 ec 38 sub $0x38,%rsp
0x0000000000195fe4 <+4>: b9 07 00 00 00 mov $0x7,%ecx
0x0000000000195fe9 <+9>: 64 48 8b 04 25 28 00 00 00 mov
%fs:0x28,%rax
0x0000000000195ff2 <+18>: 48 89 44 24 28 mov %rax,0x28(%rsp)
0x0000000000195ff7 <+23>: 31 c0 xor %eax,%eax
0x0000000000195ff9 <+25>: 4c 8d 4c 24 10 lea 0x10(%rsp),%r9
0x0000000000195ffe <+30>: 48 89 54 24 10 mov %rdx,0x10(%rsp)
0x0000000000196003 <+35>: 48 89 74 24 18 mov %rsi,0x18(%rsp)
0x0000000000196008 <+40>: 44 8d 51 fc lea -0x4(%rcx),%r10d
0x000000000019600c <+44>: 48 83 c6 0c add $0xc,%rsi
0x0000000000196010 <+48>: 8b 06 mov (%rsi),%eax
0x0000000000196012 <+50>: 4c 63 d9 movslq %ecx,%r11
0x0000000000196015 <+53>: 83 e9 01 sub $0x1,%ecx
0x0000000000196018 <+56>: 48 83 ee 04 sub $0x4,%rsi
0x000000000019601c <+60>: 89 c2 mov %eax,%edx
0x000000000019601e <+62>: c1 ea 09 shr $0x9,%edx
0x0000000000196021 <+65>: 41 89 d0 mov %edx,%r8d
0x0000000000196024 <+68>: 89 c2 mov %eax,%edx
0x0000000000196026 <+70>: c1 e8 03 shr $0x3,%eax
0x0000000000196029 <+73>: c1 ea 06 shr $0x6,%edx
0x000000000019602c <+76>: 66 41 81 e0 00 fc and $0xfc00,%r8w
0x0000000000196032 <+82>: 83 e0 1f and $0x1f,%eax
0x0000000000196035 <+85>: 66 81 e2 e0 03 and $0x3e0,%dx
0x000000000019603a <+90>: 44 09 c2 or %r8d,%edx
0x000000000019603d <+93>: 09 d0 or %edx,%eax
0x000000000019603f <+95>: 41 39 ca cmp %ecx,%r10d
0x0000000000196042 <+98>: 66 42 89 04 5c mov %ax,(%rsp,%r11,2)
0x0000000000196047 <+103>: 75 c7 jne 0x196010 <helper_vpkpx+48>
0x0000000000196049 <+105>: 41 83 fa ff cmp $0xffffffff,%r10d
0x000000000019604d <+109>: 44 89 d1 mov %r10d,%ecx
0x0000000000196050 <+112>: 74 0e je 0x196060 <helper_vpkpx+128>
0x0000000000196052 <+114>: 49 8b 31 mov (%r9),%rsi
0x0000000000196055 <+117>: 49 83 e9 08 sub $0x8,%r9
0x0000000000196059 <+121>: eb ad jmp 0x196008 <helper_vpkpx+40>
0x000000000019605b <+123>: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
0x0000000000196060 <+128>: 48 8b 04 24 mov (%rsp),%rax
0x0000000000196064 <+132>: 48 8b 54 24 08 mov 0x8(%rsp),%rdx
0x0000000000196069 <+137>: 48 89 07 mov %rax,(%rdi)
0x000000000019606c <+140>: 48 89 57 08 mov %rdx,0x8(%rdi)
0x0000000000196070 <+144>: 48 8b 44 24 28 mov 0x28(%rsp),%rax
0x0000000000196075 <+149>: 64 48 33 04 25 28 00 00 00 xor
%fs:0x28,%rax
0x000000000019607e <+158>: 75 05 jne 0x196085 <helper_vpkpx+165>
0x0000000000196080 <+160>: 48 83 c4 38 add $0x38,%rsp
0x0000000000196084 <+164>: c3 retq
0x0000000000196085 <+165>: e8 2e 66 f0 ff callq 0x9c6b8
End of assembler dump.
Implementation you suggested:
1)Both c and assembly code:
Dump of assembler code for function helper_vpkpx:
1313 {
0x0000000000195fe0 <+0>: 55 push %rbp
0x0000000000195fe1 <+1>: 53 push %rbx
1314 uint64_t rh = pkpx_2(a->u64[1], a->u64[0]);
0x0000000000195fe2 <+2>: 48 8b 46 08 mov 0x8(%rsi),%rax
0x0000000000195fe6 <+6>: 48 8b 0e mov (%rsi),%rcx
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x0000000000195fe9 <+9>: 49 89 c1 mov %rax,%r9
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x0000000000195fec <+12>: 48 89 c6 mov %rax,%rsi
0x0000000000195fef <+15>: 49 89 c3 mov %rax,%r11
0x0000000000195ff2 <+18>: 48 c1 ee 29 shr $0x29,%rsi
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x0000000000195ff6 <+22>: 49 c1 e9 26 shr $0x26,%r9
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x0000000000195ffa <+26>: 49 c1 eb 09 shr $0x9,%r11
0x0000000000195ffe <+30>: 81 e6 00 fc 00 00 and $0xfc00,%esi
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x0000000000196004 <+36>: 41 81 e1 e0 03 00 00 and $0x3e0,%r9d
0x000000000019600b <+43>: 49 89 ca mov %rcx,%r10
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x000000000019600e <+46>: 49 89 f0 mov %rsi,%r8
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x0000000000196011 <+49>: 4c 89 ce mov %r9,%rsi
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x0000000000196014 <+52>: 49 89 c1 mov %rax,%r9
0x0000000000196017 <+55>: 49 c1 e9 23 shr $0x23,%r9
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x000000000019601b <+59>: 4c 09 c6 or %r8,%rsi
0x000000000019601e <+62>: 49 c1 ea 26 shr $0x26,%r10
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x0000000000196022 <+66>: 41 83 e1 1f and $0x1f,%r9d
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x0000000000196026 <+70>: 41 81 e2 e0 03 00 00 and $0x3e0,%r10d
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x000000000019602d <+77>: 49 09 f1 or %rsi,%r9
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x0000000000196030 <+80>: 4c 89 de mov %r11,%rsi
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x0000000000196033 <+83>: 49 89 c3 mov %rax,%r11
0x0000000000196036 <+86>: 49 c1 eb 06 shr $0x6,%r11
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x000000000019603a <+90>: 81 e6 00 fc 00 00 and $0xfc00,%esi
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x0000000000196040 <+96>: 48 c1 e8 03 shr $0x3,%rax
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x0000000000196044 <+100>: 41 81 e3 e0 03 00 00 and $0x3e0,%r11d
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x000000000019604b <+107>: 83 e0 1f and $0x1f,%eax
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x000000000019604e <+110>: 49 09 f3 or %rsi,%r11
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x0000000000196051 <+113>: 49 09 c3 or %rax,%r11
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x0000000000196054 <+116>: 48 89 c8 mov %rcx,%rax
0x0000000000196057 <+119>: 48 c1 e8 29 shr $0x29,%rax
0x000000000019605b <+123>: 25 00 fc 00 00 and $0xfc00,%eax
0x0000000000196060 <+128>: 48 89 c6 mov %rax,%rsi
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x0000000000196063 <+131>: 4c 89 d0 mov %r10,%rax
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x0000000000196066 <+134>: 49 89 ca mov %rcx,%r10
0x0000000000196069 <+137>: 49 c1 ea 23 shr $0x23,%r10
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x000000000019606d <+141>: 48 09 f0 or %rsi,%rax
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x0000000000196070 <+144>: 41 83 e2 1f and $0x1f,%r10d
0x0000000000196074 <+148>: 49 09 c2 or %rax,%r10
1315 uint64_t rl = pkpx_2(b->u64[1], b->u64[0]);
0x0000000000196077 <+151>: 48 8b 02 mov (%rdx),%rax
0x000000000019607a <+154>: 48 8b 52 08 mov 0x8(%rdx),%rdx
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x000000000019607e <+158>: 49 89 d0 mov %rdx,%r8
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x0000000000196081 <+161>: 48 89 d6 mov %rdx,%rsi
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x0000000000196084 <+164>: 49 c1 e8 26 shr $0x26,%r8
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x0000000000196088 <+168>: 48 c1 ee 29 shr $0x29,%rsi
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x000000000019608c <+172>: 41 81 e0 e0 03 00 00 and $0x3e0,%r8d
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x0000000000196093 <+179>: 48 89 f3 mov %rsi,%rbx
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x0000000000196096 <+182>: 4c 89 c6 mov %r8,%rsi
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x0000000000196099 <+185>: 49 89 d0 mov %rdx,%r8
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x000000000019609c <+188>: 81 e3 00 fc 00 00 and $0xfc00,%ebx
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x00000000001960a2 <+194>: 49 c1 e8 23 shr $0x23,%r8
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x00000000001960a6 <+198>: 48 09 de or %rbx,%rsi
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x00000000001960a9 <+201>: 41 83 e0 1f and $0x1f,%r8d
0x00000000001960ad <+205>: 49 09 f0 or %rsi,%r8
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x00000000001960b0 <+208>: 48 89 d6 mov %rdx,%rsi
0x00000000001960b3 <+211>: 48 c1 ee 09 shr $0x9,%rsi
1316 r->u64[1] = rh;
0x00000000001960b7 <+215>: 49 c1 e1 30 shl $0x30,%r9
0x00000000001960bb <+219>: 49 c1 e3 20 shl $0x20,%r11
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x00000000001960bf <+223>: 48 89 f3 mov %rsi,%rbx
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x00000000001960c2 <+226>: 48 89 d6 mov %rdx,%rsi
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x00000000001960c5 <+229>: 48 c1 ea 03 shr $0x3,%rdx
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x00000000001960c9 <+233>: 48 c1 ee 06 shr $0x6,%rsi
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x00000000001960cd <+237>: 81 e3 00 fc 00 00 and $0xfc00,%ebx
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x00000000001960d3 <+243>: 83 e2 1f and $0x1f,%edx
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x00000000001960d6 <+246>: 81 e6 e0 03 00 00 and $0x3e0,%esi
1316 r->u64[1] = rh;
0x00000000001960dc <+252>: 49 c1 e2 10 shl $0x10,%r10
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x00000000001960e0 <+256>: 48 09 de or %rbx,%rsi
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x00000000001960e3 <+259>: 48 89 c3 mov %rax,%rbx
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x00000000001960e6 <+262>: 48 09 f2 or %rsi,%rdx
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x00000000001960e9 <+265>: 48 89 c6 mov %rax,%rsi
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x00000000001960ec <+268>: 48 c1 eb 29 shr $0x29,%rbx
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x00000000001960f0 <+272>: 48 c1 ee 26 shr $0x26,%rsi
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x00000000001960f4 <+276>: 81 e3 00 fc 00 00 and $0xfc00,%ebx
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x00000000001960fa <+282>: 81 e6 e0 03 00 00 and $0x3e0,%esi
1296 r = ((a >> (shr + 9)) & 0xfc00);
0x0000000000196100 <+288>: 48 89 dd mov %rbx,%rbp
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x0000000000196103 <+291>: 48 89 f3 mov %rsi,%rbx
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x0000000000196106 <+294>: 48 89 c6 mov %rax,%rsi
0x0000000000196109 <+297>: 48 c1 ee 23 shr $0x23,%rsi
1297 r |= ((a >> (shr + 6)) & 0x3e0);
0x000000000019610d <+301>: 48 09 eb or %rbp,%rbx
1298 r |= ((a >> (shr + 3)) & 0x1f);
0x0000000000196110 <+304>: 83 e6 1f and $0x1f,%esi
0x0000000000196113 <+307>: 48 09 de or %rbx,%rsi
1316 r->u64[1] = rh;
0x0000000000196116 <+310>: 48 89 cb mov %rcx,%rbx
0x0000000000196119 <+313>: 48 c1 eb 09 shr $0x9,%rbx
0x000000000019611d <+317>: 81 e3 00 fc 00 00 and $0xfc00,%ebx
0x0000000000196123 <+323>: 48 89 dd mov %rbx,%rbp
0x0000000000196126 <+326>: 48 89 cb mov %rcx,%rbx
0x0000000000196129 <+329>: 48 c1 e9 03 shr $0x3,%rcx
0x000000000019612d <+333>: 48 c1 eb 06 shr $0x6,%rbx
0x0000000000196131 <+337>: 83 e1 1f and $0x1f,%ecx
0x0000000000196134 <+340>: 81 e3 e0 03 00 00 and $0x3e0,%ebx
0x000000000019613a <+346>: 48 09 eb or %rbp,%rbx
0x000000000019613d <+349>: 48 09 d9 or %rbx,%rcx
0x0000000000196140 <+352>: 4c 09 c9 or %r9,%rcx
0x0000000000196143 <+355>: 4c 09 d9 or %r11,%rcx
0x0000000000196146 <+358>: 4c 09 d1 or %r10,%rcx
0x0000000000196149 <+361>: 48 89 4f 08 mov %rcx,0x8(%rdi)
1317 r->u64[0] = rl;
0x000000000019614d <+365>: 48 89 c1 mov %rax,%rcx
0x0000000000196150 <+368>: 48 c1 e9 09 shr $0x9,%rcx
0x0000000000196154 <+372>: 81 e1 00 fc 00 00 and $0xfc00,%ecx
0x000000000019615a <+378>: 49 89 c9 mov %rcx,%r9
0x000000000019615d <+381>: 48 89 c1 mov %rax,%rcx
0x0000000000196160 <+384>: 48 c1 e9 06 shr $0x6,%rcx
0x0000000000196164 <+388>: 48 c1 e8 03 shr $0x3,%rax
0x0000000000196168 <+392>: 49 c1 e0 30 shl $0x30,%r8
0x000000000019616c <+396>: 81 e1 e0 03 00 00 and $0x3e0,%ecx
0x0000000000196172 <+402>: 83 e0 1f and $0x1f,%eax
0x0000000000196175 <+405>: 48 c1 e2 20 shl $0x20,%rdx
0x0000000000196179 <+409>: 4c 09 c9 or %r9,%rcx
0x000000000019617c <+412>: 48 09 c8 or %rcx,%rax
0x000000000019617f <+415>: 48 89 f1 mov %rsi,%rcx
0x0000000000196182 <+418>: 4c 09 c0 or %r8,%rax
0x0000000000196185 <+421>: 48 c1 e1 10 shl $0x10,%rcx
0x0000000000196189 <+425>: 48 09 d0 or %rdx,%rax
0x000000000019618c <+428>: 48 09 c8 or %rcx,%rax
1318 }
0x000000000019618f <+431>: 5b pop %rbx
1317 r->u64[0] = rl;
0x0000000000196190 <+432>: 48 89 07 mov %rax,(%rdi)
1318 }
0x0000000000196193 <+435>: 5d pop %rbp
0x0000000000196194 <+436>: c3 retq
End of assembler dump.
2) Only assembly code:
Dump of assembler code for function helper_vpkpx:
0x0000000000195fe0 <+0>: 55 push %rbp
0x0000000000195fe1 <+1>: 53 push %rbx
0x0000000000195fe2 <+2>: 48 8b 46 08 mov 0x8(%rsi),%rax
0x0000000000195fe6 <+6>: 48 8b 0e mov (%rsi),%rcx
0x0000000000195fe9 <+9>: 49 89 c1 mov %rax,%r9
0x0000000000195fec <+12>: 48 89 c6 mov %rax,%rsi
0x0000000000195fef <+15>: 49 89 c3 mov %rax,%r11
0x0000000000195ff2 <+18>: 48 c1 ee 29 shr $0x29,%rsi
0x0000000000195ff6 <+22>: 49 c1 e9 26 shr $0x26,%r9
0x0000000000195ffa <+26>: 49 c1 eb 09 shr $0x9,%r11
0x0000000000195ffe <+30>: 81 e6 00 fc 00 00 and $0xfc00,%esi
0x0000000000196004 <+36>: 41 81 e1 e0 03 00 00 and $0x3e0,%r9d
0x000000000019600b <+43>: 49 89 ca mov %rcx,%r10
0x000000000019600e <+46>: 49 89 f0 mov %rsi,%r8
0x0000000000196011 <+49>: 4c 89 ce mov %r9,%rsi
0x0000000000196014 <+52>: 49 89 c1 mov %rax,%r9
0x0000000000196017 <+55>: 49 c1 e9 23 shr $0x23,%r9
0x000000000019601b <+59>: 4c 09 c6 or %r8,%rsi
0x000000000019601e <+62>: 49 c1 ea 26 shr $0x26,%r10
0x0000000000196022 <+66>: 41 83 e1 1f and $0x1f,%r9d
0x0000000000196026 <+70>: 41 81 e2 e0 03 00 00 and $0x3e0,%r10d
0x000000000019602d <+77>: 49 09 f1 or %rsi,%r9
0x0000000000196030 <+80>: 4c 89 de mov %r11,%rsi
0x0000000000196033 <+83>: 49 89 c3 mov %rax,%r11
0x0000000000196036 <+86>: 49 c1 eb 06 shr $0x6,%r11
0x000000000019603a <+90>: 81 e6 00 fc 00 00 and $0xfc00,%esi
0x0000000000196040 <+96>: 48 c1 e8 03 shr $0x3,%rax
0x0000000000196044 <+100>: 41 81 e3 e0 03 00 00 and $0x3e0,%r11d
0x000000000019604b <+107>: 83 e0 1f and $0x1f,%eax
0x000000000019604e <+110>: 49 09 f3 or %rsi,%r11
0x0000000000196051 <+113>: 49 09 c3 or %rax,%r11
0x0000000000196054 <+116>: 48 89 c8 mov %rcx,%rax
0x0000000000196057 <+119>: 48 c1 e8 29 shr $0x29,%rax
0x000000000019605b <+123>: 25 00 fc 00 00 and $0xfc00,%eax
0x0000000000196060 <+128>: 48 89 c6 mov %rax,%rsi
0x0000000000196063 <+131>: 4c 89 d0 mov %r10,%rax
0x0000000000196066 <+134>: 49 89 ca mov %rcx,%r10
0x0000000000196069 <+137>: 49 c1 ea 23 shr $0x23,%r10
0x000000000019606d <+141>: 48 09 f0 or %rsi,%rax
0x0000000000196070 <+144>: 41 83 e2 1f and $0x1f,%r10d
0x0000000000196074 <+148>: 49 09 c2 or %rax,%r10
0x0000000000196077 <+151>: 48 8b 02 mov (%rdx),%rax
0x000000000019607a <+154>: 48 8b 52 08 mov 0x8(%rdx),%rdx
0x000000000019607e <+158>: 49 89 d0 mov %rdx,%r8
0x0000000000196081 <+161>: 48 89 d6 mov %rdx,%rsi
0x0000000000196084 <+164>: 49 c1 e8 26 shr $0x26,%r8
0x0000000000196088 <+168>: 48 c1 ee 29 shr $0x29,%rsi
0x000000000019608c <+172>: 41 81 e0 e0 03 00 00 and $0x3e0,%r8d
0x0000000000196093 <+179>: 48 89 f3 mov %rsi,%rbx
0x0000000000196096 <+182>: 4c 89 c6 mov %r8,%rsi
0x0000000000196099 <+185>: 49 89 d0 mov %rdx,%r8
0x000000000019609c <+188>: 81 e3 00 fc 00 00 and $0xfc00,%ebx
0x00000000001960a2 <+194>: 49 c1 e8 23 shr $0x23,%r8
0x00000000001960a6 <+198>: 48 09 de or %rbx,%rsi
0x00000000001960a9 <+201>: 41 83 e0 1f and $0x1f,%r8d
0x00000000001960ad <+205>: 49 09 f0 or %rsi,%r8
0x00000000001960b0 <+208>: 48 89 d6 mov %rdx,%rsi
0x00000000001960b3 <+211>: 48 c1 ee 09 shr $0x9,%rsi
0x00000000001960b7 <+215>: 49 c1 e1 30 shl $0x30,%r9
0x00000000001960bb <+219>: 49 c1 e3 20 shl $0x20,%r11
0x00000000001960bf <+223>: 48 89 f3 mov %rsi,%rbx
0x00000000001960c2 <+226>: 48 89 d6 mov %rdx,%rsi
0x00000000001960c5 <+229>: 48 c1 ea 03 shr $0x3,%rdx
0x00000000001960c9 <+233>: 48 c1 ee 06 shr $0x6,%rsi
0x00000000001960cd <+237>: 81 e3 00 fc 00 00 and $0xfc00,%ebx
0x00000000001960d3 <+243>: 83 e2 1f and $0x1f,%edx
0x00000000001960d6 <+246>: 81 e6 e0 03 00 00 and $0x3e0,%esi
0x00000000001960dc <+252>: 49 c1 e2 10 shl $0x10,%r10
0x00000000001960e0 <+256>: 48 09 de or %rbx,%rsi
0x00000000001960e3 <+259>: 48 89 c3 mov %rax,%rbx
0x00000000001960e6 <+262>: 48 09 f2 or %rsi,%rdx
0x00000000001960e9 <+265>: 48 89 c6 mov %rax,%rsi
0x00000000001960ec <+268>: 48 c1 eb 29 shr $0x29,%rbx
0x00000000001960f0 <+272>: 48 c1 ee 26 shr $0x26,%rsi
0x00000000001960f4 <+276>: 81 e3 00 fc 00 00 and $0xfc00,%ebx
0x00000000001960fa <+282>: 81 e6 e0 03 00 00 and $0x3e0,%esi
0x0000000000196100 <+288>: 48 89 dd mov %rbx,%rbp
0x0000000000196103 <+291>: 48 89 f3 mov %rsi,%rbx
0x0000000000196106 <+294>: 48 89 c6 mov %rax,%rsi
0x0000000000196109 <+297>: 48 c1 ee 23 shr $0x23,%rsi
0x000000000019610d <+301>: 48 09 eb or %rbp,%rbx
0x0000000000196110 <+304>: 83 e6 1f and $0x1f,%esi
0x0000000000196113 <+307>: 48 09 de or %rbx,%rsi
0x0000000000196116 <+310>: 48 89 cb mov %rcx,%rbx
0x0000000000196119 <+313>: 48 c1 eb 09 shr $0x9,%rbx
0x000000000019611d <+317>: 81 e3 00 fc 00 00 and $0xfc00,%ebx
0x0000000000196123 <+323>: 48 89 dd mov %rbx,%rbp
0x0000000000196126 <+326>: 48 89 cb mov %rcx,%rbx
0x0000000000196129 <+329>: 48 c1 e9 03 shr $0x3,%rcx
0x000000000019612d <+333>: 48 c1 eb 06 shr $0x6,%rbx
0x0000000000196131 <+337>: 83 e1 1f and $0x1f,%ecx
0x0000000000196134 <+340>: 81 e3 e0 03 00 00 and $0x3e0,%ebx
0x000000000019613a <+346>: 48 09 eb or %rbp,%rbx
0x000000000019613d <+349>: 48 09 d9 or %rbx,%rcx
0x0000000000196140 <+352>: 4c 09 c9 or %r9,%rcx
0x0000000000196143 <+355>: 4c 09 d9 or %r11,%rcx
0x0000000000196146 <+358>: 4c 09 d1 or %r10,%rcx
0x0000000000196149 <+361>: 48 89 4f 08 mov %rcx,0x8(%rdi)
0x000000000019614d <+365>: 48 89 c1 mov %rax,%rcx
0x0000000000196150 <+368>: 48 c1 e9 09 shr $0x9,%rcx
0x0000000000196154 <+372>: 81 e1 00 fc 00 00 and $0xfc00,%ecx
0x000000000019615a <+378>: 49 89 c9 mov %rcx,%r9
0x000000000019615d <+381>: 48 89 c1 mov %rax,%rcx
0x0000000000196160 <+384>: 48 c1 e9 06 shr $0x6,%rcx
0x0000000000196164 <+388>: 48 c1 e8 03 shr $0x3,%rax
0x0000000000196168 <+392>: 49 c1 e0 30 shl $0x30,%r8
0x000000000019616c <+396>: 81 e1 e0 03 00 00 and $0x3e0,%ecx
0x0000000000196172 <+402>: 83 e0 1f and $0x1f,%eax
0x0000000000196175 <+405>: 48 c1 e2 20 shl $0x20,%rdx
0x0000000000196179 <+409>: 4c 09 c9 or %r9,%rcx
0x000000000019617c <+412>: 48 09 c8 or %rcx,%rax
0x000000000019617f <+415>: 48 89 f1 mov %rsi,%rcx
0x0000000000196182 <+418>: 4c 09 c0 or %r8,%rax
0x0000000000196185 <+421>: 48 c1 e1 10 shl $0x10,%rcx
0x0000000000196189 <+425>: 48 09 d0 or %rdx,%rax
0x000000000019618c <+428>: 48 09 c8 or %rcx,%rax
0x000000000019618f <+431>: 5b pop %rbx
0x0000000000196190 <+432>: 48 89 07 mov %rax,(%rdi)
0x0000000000196193 <+435>: 5d pop %rbp
0x0000000000196194 <+436>: c3 retq
End of assembler dump.