On 27.8.19. 20:52, Richard Henderson wrote:
On 8/27/19 2:37 AM, Stefan Brankovic wrote:
+    for (i = 0; i < 4; i++) {
+        switch (i) {
+        case 0:
+            /*
+             * Get high doubleword of vA to perform 6-5-5 pack of pixels
+             * 1 and 2.
+             */
+            get_avr64(avr, VA, true);
+            tcg_gen_movi_i64(result, 0x0ULL);
+            break;
+        case 1:
+            /*
+             * Get low doubleword of vA to perform 6-5-5 pack of pixels
+             * 3 and 4.
+             */
+            get_avr64(avr, VA, false);
+            break;
+        case 2:
+            /*
+             * Get high doubleword of vB to perform 6-5-5 pack of pixels
+             * 5 and 6.
+             */
+            get_avr64(avr, VB, true);
+            tcg_gen_movi_i64(result, 0x0ULL);
+            break;
+        case 3:
+            /*
+             * Get low doubleword of vB to perform 6-5-5 pack of pixels
+             * 7 and 8.
+             */
+            get_avr64(avr, VB, false);
+            break;
+        }
+        /* Perform the packing for 2 pixels (one per inner-loop iteration). */
+        tcg_gen_movi_i64(tmp, 0x0ULL);
+        for (j = 0; j < 2; j++) {
+            tcg_gen_shri_i64(shifted, avr, (j * 16 + 3));
+            tcg_gen_andi_i64(shifted, shifted, mask1 << (j * 16));
+            tcg_gen_or_i64(tmp, tmp, shifted);
+            tcg_gen_shri_i64(shifted, avr, (j * 16 + 6));
+            tcg_gen_andi_i64(shifted, shifted, mask2 << (j * 16));
+            tcg_gen_or_i64(tmp, tmp, shifted);
+            tcg_gen_shri_i64(shifted, avr, (j * 16 + 9));
+            tcg_gen_andi_i64(shifted, shifted, mask3 << (j * 16));
+            tcg_gen_or_i64(tmp, tmp, shifted);
+        }
+        if ((i == 0) || (i == 2)) {
+            tcg_gen_shli_i64(tmp, tmp, 32);
+        }
+        tcg_gen_or_i64(result, result, tmp);
+        if (i == 1) {
+            /* Place packed pixels 1:4 to high doubleword of vD. */
+            tcg_gen_mov_i64(result1, result);
+        }
+        if (i == 3) {
+            /* Place packed pixels 5:8 to low doubleword of vD. */
+            tcg_gen_mov_i64(result2, result);
+        }
+    }
+    set_avr64(VT, result1, true);
+    set_avr64(VT, result2, false);
I really have a hard time believing that it is worthwhile to inline all of this
code.  By my count this is 82 non-move opcodes.  That is a *lot* of inline
expansion.

However, I can well imagine that the existing out-of-line helper is less than
optimal.

-void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-    int i, j;
-    ppc_avr_t result;
-    const ppc_avr_t *x[2] = { a, b };
-    const ppc_avr_t *x[2] = { b, a };
-    VECTOR_FOR_INORDER_I(i, u64) {
-        VECTOR_FOR_INORDER_I(j, u32) {
-            uint32_t e = x[i]->u32[j];
Double indirect loads?

-            result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
-                                     ((e >> 6) & 0x3e0) |
-                                     ((e >> 3) & 0x1f));
Store to temporary ...

-        }
-    }
-    *r = result;
... and then copy?

Try replacing the existing helper with something like the following.


static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
     uint64_t r;

     r  = ((a >> (shr + 9)) & 0x3f) << shl;
     r |= ((a >> (shr + 6)) & 0x1f) << shl;
     r |= ((a >> (shr + 3)) & 0x1f) << shl;

     return r;

static inline uint64_t pkpx_2(uint64_t ah, uint64_t al)
     return pkpx_1(ah, 32, 48)
          | pkpx_1(ah,  0, 32)
          | pkpx_1(al, 32, 16)
          | pkpx_1(al,  0,  0);

void helper_vpkpx(uint64_t *r, uint64_t *a, uint64_t *b)
     uint64_t rh = pkpx_2(a->VsrD(0), a->VsrD(1));
     uint64_t rl = pkpx_2(b->VsrD(0), b->VsrD(1));
     r->VsrD(0) = rh;
     r->VsrD(1) = rl;

I implemented vpkpx as you suggested above, with small modifications (so that it builds and gives the correct result). It looks like this:

static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
    uint64_t r;

    r  = ((a >> (shr + 9)) & 0xfc00) << shl;
    r |= ((a >> (shr + 6)) & 0x3e0) << shl;
    r |= ((a >> (shr + 3)) & 0x1f) << shl;

    return r;

static inline uint64_t pkpx_2(uint64_t ah, uint64_t al)
    return pkpx_1(ah, 32, 48)
         | pkpx_1(ah,  0, 32)
         | pkpx_1(al, 32, 16)
         | pkpx_1(al,  0,  0);

void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
    uint64_t rh = pkpx_2(a->u64[1], a->u64[0]);
    uint64_t rl = pkpx_2(b->u64[1], b->u64[0]);
    r->u64[1] = rh;
    r->u64[0] = rl;

I also noticed that this would work only on little-endian hosts, so we would need to modify it in order to support big-endian hosts (this shouldn't affect the performance results).

Then I ran my performance tests and got the following results (the test calls vpkpx 100000 times):

1) Current helper implementation: ~ 157 ms

2) helper implementation you suggested: ~94 ms

3) tcg implementation: ~75 ms

The attached file contains the assembly code for both the current implementation and the implementation you suggested, so please take a look at that as well.

Kind Regards,


Current vpkpx implementation:

1) Both C and assembly code:

Dump of assembler code for function helper_vpkpx:
1267    {
   0x0000000000195fe0 <+0>:     48 83 ec 38     sub    $0x38,%rsp

1268        int i, j;
1269        ppc_avr_t result;
1270    #if defined(HOST_WORDS_BIGENDIAN)
1271        const ppc_avr_t *x[2] = { a, b };
1272    #else
1273        const ppc_avr_t *x[2] = { b, a };
   0x0000000000195fe4 <+4>:     b9 07 00 00 00  mov    $0x7,%ecx

1267    {
   0x0000000000195fe9 <+9>:     64 48 8b 04 25 28 00 00 00      mov    %fs:0x28,%rax
   0x0000000000195ff2 <+18>:    48 89 44 24 28  mov    %rax,0x28(%rsp)
   0x0000000000195ff7 <+23>:    31 c0   xor    %eax,%eax
   0x0000000000195ff9 <+25>:    4c 8d 4c 24 10  lea    0x10(%rsp),%r9

1268        int i, j;
1269        ppc_avr_t result;
1270    #if defined(HOST_WORDS_BIGENDIAN)
1271        const ppc_avr_t *x[2] = { a, b };
1272    #else
1273        const ppc_avr_t *x[2] = { b, a };
   0x0000000000195ffe <+30>:    48 89 54 24 10  mov    %rdx,0x10(%rsp)
   0x0000000000196003 <+35>:    48 89 74 24 18  mov    %rsi,0x18(%rsp)
   0x0000000000196008 <+40>:    44 8d 51 fc     lea    -0x4(%rcx),%r10d
   0x000000000019600c <+44>:    48 83 c6 0c     add    $0xc,%rsi

1278                uint32_t e = x[i]->u32[j];
   0x0000000000196010 <+48>:    8b 06   mov    (%rsi),%eax

1280                result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
   0x0000000000196012 <+50>:    4c 63 d9        movslq %ecx,%r11
   0x0000000000196015 <+53>:    83 e9 01        sub    $0x1,%ecx
   0x0000000000196018 <+56>:    48 83 ee 04     sub    $0x4,%rsi
   0x000000000019601c <+60>:    89 c2   mov    %eax,%edx
   0x000000000019601e <+62>:    c1 ea 09        shr    $0x9,%edx
   0x0000000000196021 <+65>:    41 89 d0        mov    %edx,%r8d
   0x0000000000196024 <+68>:    89 c2   mov    %eax,%edx
   0x0000000000196026 <+70>:    c1 e8 03        shr    $0x3,%eax
   0x0000000000196029 <+73>:    c1 ea 06        shr    $0x6,%edx
   0x000000000019602c <+76>:    66 41 81 e0 00 fc       and    $0xfc00,%r8w
   0x0000000000196032 <+82>:    83 e0 1f        and    $0x1f,%eax
   0x0000000000196035 <+85>:    66 81 e2 e0 03  and    $0x3e0,%dx
   0x000000000019603a <+90>:    44 09 c2        or     %r8d,%edx
   0x000000000019603d <+93>:    09 d0   or     %edx,%eax

1277            VECTOR_FOR_INORDER_I(j, u32) {
   0x000000000019603f <+95>:    41 39 ca        cmp    %ecx,%r10d

1280                result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
   0x0000000000196042 <+98>:    66 42 89 04 5c  mov    %ax,(%rsp,%r11,2)

1277            VECTOR_FOR_INORDER_I(j, u32) {
   0x0000000000196047 <+103>:   75 c7   jne    0x196010 <helper_vpkpx+48>

1276        VECTOR_FOR_INORDER_I(i, u64) {
   0x0000000000196049 <+105>:   41 83 fa ff     cmp    $0xffffffff,%r10d
   0x000000000019604d <+109>:   44 89 d1        mov    %r10d,%ecx
   0x0000000000196050 <+112>:   74 0e   je     0x196060 <helper_vpkpx+128>
   0x0000000000196052 <+114>:   49 8b 31        mov    (%r9),%rsi
   0x0000000000196055 <+117>:   49 83 e9 08     sub    $0x8,%r9
   0x0000000000196059 <+121>:   eb ad   jmp    0x196008 <helper_vpkpx+40>
   0x000000000019605b <+123>:   0f 1f 44 00 00  nopl   0x0(%rax,%rax,1)

1281                                         ((e >> 6) & 0x3e0) |
1282                                         ((e >> 3) & 0x1f));
1283    //            printf("%x\n",result.u16[4 * i + j]);
1284            }
1285        }
1286    //    printf("%lx\n",result.u64[0]);
1287    //    printf("%lx\n",result.u64[1]);
1288        *r = result;
   0x0000000000196060 <+128>:   48 8b 04 24     mov    (%rsp),%rax
   0x0000000000196064 <+132>:   48 8b 54 24 08  mov    0x8(%rsp),%rdx
   0x0000000000196069 <+137>:   48 89 07        mov    %rax,(%rdi)
   0x000000000019606c <+140>:   48 89 57 08     mov    %rdx,0x8(%rdi)

1289    }
   0x0000000000196070 <+144>:   48 8b 44 24 28  mov    0x28(%rsp),%rax
   0x0000000000196075 <+149>:   64 48 33 04 25 28 00 00 00      xor    %fs:0x28,%rax
   0x000000000019607e <+158>:   75 05   jne    0x196085 <helper_vpkpx+165>
   0x0000000000196080 <+160>:   48 83 c4 38     add    $0x38,%rsp
   0x0000000000196084 <+164>:   c3      retq   
   0x0000000000196085 <+165>:   e8 2e 66 f0 ff  callq  0x9c6b8
End of assembler dump.

2) Only assembly code:

Dump of assembler code for function helper_vpkpx:
   0x0000000000195fe0 <+0>:     48 83 ec 38     sub    $0x38,%rsp
   0x0000000000195fe4 <+4>:     b9 07 00 00 00  mov    $0x7,%ecx
   0x0000000000195fe9 <+9>:     64 48 8b 04 25 28 00 00 00      mov    %fs:0x28,%rax
   0x0000000000195ff2 <+18>:    48 89 44 24 28  mov    %rax,0x28(%rsp)
   0x0000000000195ff7 <+23>:    31 c0   xor    %eax,%eax
   0x0000000000195ff9 <+25>:    4c 8d 4c 24 10  lea    0x10(%rsp),%r9
   0x0000000000195ffe <+30>:    48 89 54 24 10  mov    %rdx,0x10(%rsp)
   0x0000000000196003 <+35>:    48 89 74 24 18  mov    %rsi,0x18(%rsp)
   0x0000000000196008 <+40>:    44 8d 51 fc     lea    -0x4(%rcx),%r10d
   0x000000000019600c <+44>:    48 83 c6 0c     add    $0xc,%rsi
   0x0000000000196010 <+48>:    8b 06   mov    (%rsi),%eax
   0x0000000000196012 <+50>:    4c 63 d9        movslq %ecx,%r11
   0x0000000000196015 <+53>:    83 e9 01        sub    $0x1,%ecx
   0x0000000000196018 <+56>:    48 83 ee 04     sub    $0x4,%rsi
   0x000000000019601c <+60>:    89 c2   mov    %eax,%edx
   0x000000000019601e <+62>:    c1 ea 09        shr    $0x9,%edx
   0x0000000000196021 <+65>:    41 89 d0        mov    %edx,%r8d
   0x0000000000196024 <+68>:    89 c2   mov    %eax,%edx
   0x0000000000196026 <+70>:    c1 e8 03        shr    $0x3,%eax
   0x0000000000196029 <+73>:    c1 ea 06        shr    $0x6,%edx
   0x000000000019602c <+76>:    66 41 81 e0 00 fc       and    $0xfc00,%r8w
   0x0000000000196032 <+82>:    83 e0 1f        and    $0x1f,%eax
   0x0000000000196035 <+85>:    66 81 e2 e0 03  and    $0x3e0,%dx
   0x000000000019603a <+90>:    44 09 c2        or     %r8d,%edx
   0x000000000019603d <+93>:    09 d0   or     %edx,%eax
   0x000000000019603f <+95>:    41 39 ca        cmp    %ecx,%r10d
   0x0000000000196042 <+98>:    66 42 89 04 5c  mov    %ax,(%rsp,%r11,2)
   0x0000000000196047 <+103>:   75 c7   jne    0x196010 <helper_vpkpx+48>
   0x0000000000196049 <+105>:   41 83 fa ff     cmp    $0xffffffff,%r10d
   0x000000000019604d <+109>:   44 89 d1        mov    %r10d,%ecx
   0x0000000000196050 <+112>:   74 0e   je     0x196060 <helper_vpkpx+128>
   0x0000000000196052 <+114>:   49 8b 31        mov    (%r9),%rsi
   0x0000000000196055 <+117>:   49 83 e9 08     sub    $0x8,%r9
   0x0000000000196059 <+121>:   eb ad   jmp    0x196008 <helper_vpkpx+40>
   0x000000000019605b <+123>:   0f 1f 44 00 00  nopl   0x0(%rax,%rax,1)
   0x0000000000196060 <+128>:   48 8b 04 24     mov    (%rsp),%rax
   0x0000000000196064 <+132>:   48 8b 54 24 08  mov    0x8(%rsp),%rdx
   0x0000000000196069 <+137>:   48 89 07        mov    %rax,(%rdi)
   0x000000000019606c <+140>:   48 89 57 08     mov    %rdx,0x8(%rdi)
   0x0000000000196070 <+144>:   48 8b 44 24 28  mov    0x28(%rsp),%rax
   0x0000000000196075 <+149>:   64 48 33 04 25 28 00 00 00      xor    %fs:0x28,%rax
   0x000000000019607e <+158>:   75 05   jne    0x196085 <helper_vpkpx+165>
   0x0000000000196080 <+160>:   48 83 c4 38     add    $0x38,%rsp
   0x0000000000196084 <+164>:   c3      retq   
   0x0000000000196085 <+165>:   e8 2e 66 f0 ff  callq  0x9c6b8
End of assembler dump.

Implementation you suggested:

1) Both C and assembly code:

Dump of assembler code for function helper_vpkpx:
1313    {
   0x0000000000195fe0 <+0>:     55      push   %rbp
   0x0000000000195fe1 <+1>:     53      push   %rbx

1314        uint64_t rh = pkpx_2(a->u64[1], a->u64[0]);
   0x0000000000195fe2 <+2>:     48 8b 46 08     mov    0x8(%rsi),%rax
   0x0000000000195fe6 <+6>:     48 8b 0e        mov    (%rsi),%rcx

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000195fe9 <+9>:     49 89 c1        mov    %rax,%r9

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000195fec <+12>:    48 89 c6        mov    %rax,%rsi
   0x0000000000195fef <+15>:    49 89 c3        mov    %rax,%r11
   0x0000000000195ff2 <+18>:    48 c1 ee 29     shr    $0x29,%rsi

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000195ff6 <+22>:    49 c1 e9 26     shr    $0x26,%r9

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000195ffa <+26>:    49 c1 eb 09     shr    $0x9,%r11
   0x0000000000195ffe <+30>:    81 e6 00 fc 00 00       and    $0xfc00,%esi

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196004 <+36>:    41 81 e1 e0 03 00 00    and    $0x3e0,%r9d
   0x000000000019600b <+43>:    49 89 ca        mov    %rcx,%r10

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x000000000019600e <+46>:    49 89 f0        mov    %rsi,%r8

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196011 <+49>:    4c 89 ce        mov    %r9,%rsi

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196014 <+52>:    49 89 c1        mov    %rax,%r9
   0x0000000000196017 <+55>:    49 c1 e9 23     shr    $0x23,%r9

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x000000000019601b <+59>:    4c 09 c6        or     %r8,%rsi
   0x000000000019601e <+62>:    49 c1 ea 26     shr    $0x26,%r10

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196022 <+66>:    41 83 e1 1f     and    $0x1f,%r9d

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196026 <+70>:    41 81 e2 e0 03 00 00    and    $0x3e0,%r10d

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x000000000019602d <+77>:    49 09 f1        or     %rsi,%r9

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000196030 <+80>:    4c 89 de        mov    %r11,%rsi

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196033 <+83>:    49 89 c3        mov    %rax,%r11
   0x0000000000196036 <+86>:    49 c1 eb 06     shr    $0x6,%r11

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x000000000019603a <+90>:    81 e6 00 fc 00 00       and    $0xfc00,%esi

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196040 <+96>:    48 c1 e8 03     shr    $0x3,%rax

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196044 <+100>:   41 81 e3 e0 03 00 00    and    $0x3e0,%r11d

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x000000000019604b <+107>:   83 e0 1f        and    $0x1f,%eax

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x000000000019604e <+110>:   49 09 f3        or     %rsi,%r11

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196051 <+113>:   49 09 c3        or     %rax,%r11

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000196054 <+116>:   48 89 c8        mov    %rcx,%rax
   0x0000000000196057 <+119>:   48 c1 e8 29     shr    $0x29,%rax
   0x000000000019605b <+123>:   25 00 fc 00 00  and    $0xfc00,%eax
   0x0000000000196060 <+128>:   48 89 c6        mov    %rax,%rsi

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196063 <+131>:   4c 89 d0        mov    %r10,%rax

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196066 <+134>:   49 89 ca        mov    %rcx,%r10
   0x0000000000196069 <+137>:   49 c1 ea 23     shr    $0x23,%r10

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x000000000019606d <+141>:   48 09 f0        or     %rsi,%rax

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196070 <+144>:   41 83 e2 1f     and    $0x1f,%r10d
   0x0000000000196074 <+148>:   49 09 c2        or     %rax,%r10

1315        uint64_t rl = pkpx_2(b->u64[1], b->u64[0]);
   0x0000000000196077 <+151>:   48 8b 02        mov    (%rdx),%rax
   0x000000000019607a <+154>:   48 8b 52 08     mov    0x8(%rdx),%rdx

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x000000000019607e <+158>:   49 89 d0        mov    %rdx,%r8

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000196081 <+161>:   48 89 d6        mov    %rdx,%rsi

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196084 <+164>:   49 c1 e8 26     shr    $0x26,%r8

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000196088 <+168>:   48 c1 ee 29     shr    $0x29,%rsi

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x000000000019608c <+172>:   41 81 e0 e0 03 00 00    and    $0x3e0,%r8d

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000196093 <+179>:   48 89 f3        mov    %rsi,%rbx

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196096 <+182>:   4c 89 c6        mov    %r8,%rsi

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196099 <+185>:   49 89 d0        mov    %rdx,%r8

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x000000000019609c <+188>:   81 e3 00 fc 00 00       and    $0xfc00,%ebx

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x00000000001960a2 <+194>:   49 c1 e8 23     shr    $0x23,%r8

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960a6 <+198>:   48 09 de        or     %rbx,%rsi

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x00000000001960a9 <+201>:   41 83 e0 1f     and    $0x1f,%r8d
   0x00000000001960ad <+205>:   49 09 f0        or     %rsi,%r8

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x00000000001960b0 <+208>:   48 89 d6        mov    %rdx,%rsi
   0x00000000001960b3 <+211>:   48 c1 ee 09     shr    $0x9,%rsi

1316        r->u64[1] = rh;
   0x00000000001960b7 <+215>:   49 c1 e1 30     shl    $0x30,%r9
   0x00000000001960bb <+219>:   49 c1 e3 20     shl    $0x20,%r11

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x00000000001960bf <+223>:   48 89 f3        mov    %rsi,%rbx

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960c2 <+226>:   48 89 d6        mov    %rdx,%rsi

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x00000000001960c5 <+229>:   48 c1 ea 03     shr    $0x3,%rdx

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960c9 <+233>:   48 c1 ee 06     shr    $0x6,%rsi

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x00000000001960cd <+237>:   81 e3 00 fc 00 00       and    $0xfc00,%ebx

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x00000000001960d3 <+243>:   83 e2 1f        and    $0x1f,%edx

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960d6 <+246>:   81 e6 e0 03 00 00       and    $0x3e0,%esi

1316        r->u64[1] = rh;
   0x00000000001960dc <+252>:   49 c1 e2 10     shl    $0x10,%r10

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960e0 <+256>:   48 09 de        or     %rbx,%rsi

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x00000000001960e3 <+259>:   48 89 c3        mov    %rax,%rbx

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x00000000001960e6 <+262>:   48 09 f2        or     %rsi,%rdx

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960e9 <+265>:   48 89 c6        mov    %rax,%rsi

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x00000000001960ec <+268>:   48 c1 eb 29     shr    $0x29,%rbx

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960f0 <+272>:   48 c1 ee 26     shr    $0x26,%rsi

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x00000000001960f4 <+276>:   81 e3 00 fc 00 00       and    $0xfc00,%ebx

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960fa <+282>:   81 e6 e0 03 00 00       and    $0x3e0,%esi

1296        r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000196100 <+288>:   48 89 dd        mov    %rbx,%rbp

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196103 <+291>:   48 89 f3        mov    %rsi,%rbx

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196106 <+294>:   48 89 c6        mov    %rax,%rsi
   0x0000000000196109 <+297>:   48 c1 ee 23     shr    $0x23,%rsi

1297        r |= ((a >> (shr + 6)) & 0x3e0);
   0x000000000019610d <+301>:   48 09 eb        or     %rbp,%rbx

1298        r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196110 <+304>:   83 e6 1f        and    $0x1f,%esi
   0x0000000000196113 <+307>:   48 09 de        or     %rbx,%rsi

1316        r->u64[1] = rh;
   0x0000000000196116 <+310>:   48 89 cb        mov    %rcx,%rbx
   0x0000000000196119 <+313>:   48 c1 eb 09     shr    $0x9,%rbx
   0x000000000019611d <+317>:   81 e3 00 fc 00 00       and    $0xfc00,%ebx
   0x0000000000196123 <+323>:   48 89 dd        mov    %rbx,%rbp
   0x0000000000196126 <+326>:   48 89 cb        mov    %rcx,%rbx
   0x0000000000196129 <+329>:   48 c1 e9 03     shr    $0x3,%rcx
   0x000000000019612d <+333>:   48 c1 eb 06     shr    $0x6,%rbx
   0x0000000000196131 <+337>:   83 e1 1f        and    $0x1f,%ecx
   0x0000000000196134 <+340>:   81 e3 e0 03 00 00       and    $0x3e0,%ebx
   0x000000000019613a <+346>:   48 09 eb        or     %rbp,%rbx
   0x000000000019613d <+349>:   48 09 d9        or     %rbx,%rcx
   0x0000000000196140 <+352>:   4c 09 c9        or     %r9,%rcx
   0x0000000000196143 <+355>:   4c 09 d9        or     %r11,%rcx
   0x0000000000196146 <+358>:   4c 09 d1        or     %r10,%rcx
   0x0000000000196149 <+361>:   48 89 4f 08     mov    %rcx,0x8(%rdi)

1317        r->u64[0] = rl;
   0x000000000019614d <+365>:   48 89 c1        mov    %rax,%rcx
   0x0000000000196150 <+368>:   48 c1 e9 09     shr    $0x9,%rcx
   0x0000000000196154 <+372>:   81 e1 00 fc 00 00       and    $0xfc00,%ecx
   0x000000000019615a <+378>:   49 89 c9        mov    %rcx,%r9
   0x000000000019615d <+381>:   48 89 c1        mov    %rax,%rcx
   0x0000000000196160 <+384>:   48 c1 e9 06     shr    $0x6,%rcx
   0x0000000000196164 <+388>:   48 c1 e8 03     shr    $0x3,%rax
   0x0000000000196168 <+392>:   49 c1 e0 30     shl    $0x30,%r8
   0x000000000019616c <+396>:   81 e1 e0 03 00 00       and    $0x3e0,%ecx
   0x0000000000196172 <+402>:   83 e0 1f        and    $0x1f,%eax
   0x0000000000196175 <+405>:   48 c1 e2 20     shl    $0x20,%rdx
   0x0000000000196179 <+409>:   4c 09 c9        or     %r9,%rcx
   0x000000000019617c <+412>:   48 09 c8        or     %rcx,%rax
   0x000000000019617f <+415>:   48 89 f1        mov    %rsi,%rcx
   0x0000000000196182 <+418>:   4c 09 c0        or     %r8,%rax
   0x0000000000196185 <+421>:   48 c1 e1 10     shl    $0x10,%rcx
   0x0000000000196189 <+425>:   48 09 d0        or     %rdx,%rax
   0x000000000019618c <+428>:   48 09 c8        or     %rcx,%rax

1318    }
   0x000000000019618f <+431>:   5b      pop    %rbx

1317        r->u64[0] = rl;
   0x0000000000196190 <+432>:   48 89 07        mov    %rax,(%rdi)

1318    }
   0x0000000000196193 <+435>:   5d      pop    %rbp
   0x0000000000196194 <+436>:   c3      retq   
End of assembler dump.

2) Only assembly code:

Dump of assembler code for function helper_vpkpx:
   0x0000000000195fe0 <+0>:     55      push   %rbp
   0x0000000000195fe1 <+1>:     53      push   %rbx
   0x0000000000195fe2 <+2>:     48 8b 46 08     mov    0x8(%rsi),%rax
   0x0000000000195fe6 <+6>:     48 8b 0e        mov    (%rsi),%rcx
   0x0000000000195fe9 <+9>:     49 89 c1        mov    %rax,%r9
   0x0000000000195fec <+12>:    48 89 c6        mov    %rax,%rsi
   0x0000000000195fef <+15>:    49 89 c3        mov    %rax,%r11
   0x0000000000195ff2 <+18>:    48 c1 ee 29     shr    $0x29,%rsi
   0x0000000000195ff6 <+22>:    49 c1 e9 26     shr    $0x26,%r9
   0x0000000000195ffa <+26>:    49 c1 eb 09     shr    $0x9,%r11
   0x0000000000195ffe <+30>:    81 e6 00 fc 00 00       and    $0xfc00,%esi
   0x0000000000196004 <+36>:    41 81 e1 e0 03 00 00    and    $0x3e0,%r9d
   0x000000000019600b <+43>:    49 89 ca        mov    %rcx,%r10
   0x000000000019600e <+46>:    49 89 f0        mov    %rsi,%r8
   0x0000000000196011 <+49>:    4c 89 ce        mov    %r9,%rsi
   0x0000000000196014 <+52>:    49 89 c1        mov    %rax,%r9
   0x0000000000196017 <+55>:    49 c1 e9 23     shr    $0x23,%r9
   0x000000000019601b <+59>:    4c 09 c6        or     %r8,%rsi
   0x000000000019601e <+62>:    49 c1 ea 26     shr    $0x26,%r10
   0x0000000000196022 <+66>:    41 83 e1 1f     and    $0x1f,%r9d
   0x0000000000196026 <+70>:    41 81 e2 e0 03 00 00    and    $0x3e0,%r10d
   0x000000000019602d <+77>:    49 09 f1        or     %rsi,%r9
   0x0000000000196030 <+80>:    4c 89 de        mov    %r11,%rsi
   0x0000000000196033 <+83>:    49 89 c3        mov    %rax,%r11
   0x0000000000196036 <+86>:    49 c1 eb 06     shr    $0x6,%r11
   0x000000000019603a <+90>:    81 e6 00 fc 00 00       and    $0xfc00,%esi
   0x0000000000196040 <+96>:    48 c1 e8 03     shr    $0x3,%rax
   0x0000000000196044 <+100>:   41 81 e3 e0 03 00 00    and    $0x3e0,%r11d
   0x000000000019604b <+107>:   83 e0 1f        and    $0x1f,%eax
   0x000000000019604e <+110>:   49 09 f3        or     %rsi,%r11
   0x0000000000196051 <+113>:   49 09 c3        or     %rax,%r11
   0x0000000000196054 <+116>:   48 89 c8        mov    %rcx,%rax
   0x0000000000196057 <+119>:   48 c1 e8 29     shr    $0x29,%rax
   0x000000000019605b <+123>:   25 00 fc 00 00  and    $0xfc00,%eax
   0x0000000000196060 <+128>:   48 89 c6        mov    %rax,%rsi
   0x0000000000196063 <+131>:   4c 89 d0        mov    %r10,%rax
   0x0000000000196066 <+134>:   49 89 ca        mov    %rcx,%r10
   0x0000000000196069 <+137>:   49 c1 ea 23     shr    $0x23,%r10
   0x000000000019606d <+141>:   48 09 f0        or     %rsi,%rax
   0x0000000000196070 <+144>:   41 83 e2 1f     and    $0x1f,%r10d
   0x0000000000196074 <+148>:   49 09 c2        or     %rax,%r10
   0x0000000000196077 <+151>:   48 8b 02        mov    (%rdx),%rax
   0x000000000019607a <+154>:   48 8b 52 08     mov    0x8(%rdx),%rdx
   0x000000000019607e <+158>:   49 89 d0        mov    %rdx,%r8
   0x0000000000196081 <+161>:   48 89 d6        mov    %rdx,%rsi
   0x0000000000196084 <+164>:   49 c1 e8 26     shr    $0x26,%r8
   0x0000000000196088 <+168>:   48 c1 ee 29     shr    $0x29,%rsi
   0x000000000019608c <+172>:   41 81 e0 e0 03 00 00    and    $0x3e0,%r8d
   0x0000000000196093 <+179>:   48 89 f3        mov    %rsi,%rbx
   0x0000000000196096 <+182>:   4c 89 c6        mov    %r8,%rsi
   0x0000000000196099 <+185>:   49 89 d0        mov    %rdx,%r8
   0x000000000019609c <+188>:   81 e3 00 fc 00 00       and    $0xfc00,%ebx
   0x00000000001960a2 <+194>:   49 c1 e8 23     shr    $0x23,%r8
   0x00000000001960a6 <+198>:   48 09 de        or     %rbx,%rsi
   0x00000000001960a9 <+201>:   41 83 e0 1f     and    $0x1f,%r8d
   0x00000000001960ad <+205>:   49 09 f0        or     %rsi,%r8
   0x00000000001960b0 <+208>:   48 89 d6        mov    %rdx,%rsi
   0x00000000001960b3 <+211>:   48 c1 ee 09     shr    $0x9,%rsi
   0x00000000001960b7 <+215>:   49 c1 e1 30     shl    $0x30,%r9
   0x00000000001960bb <+219>:   49 c1 e3 20     shl    $0x20,%r11
   0x00000000001960bf <+223>:   48 89 f3        mov    %rsi,%rbx
   0x00000000001960c2 <+226>:   48 89 d6        mov    %rdx,%rsi
   0x00000000001960c5 <+229>:   48 c1 ea 03     shr    $0x3,%rdx
   0x00000000001960c9 <+233>:   48 c1 ee 06     shr    $0x6,%rsi
   0x00000000001960cd <+237>:   81 e3 00 fc 00 00       and    $0xfc00,%ebx
   0x00000000001960d3 <+243>:   83 e2 1f        and    $0x1f,%edx
   0x00000000001960d6 <+246>:   81 e6 e0 03 00 00       and    $0x3e0,%esi
   0x00000000001960dc <+252>:   49 c1 e2 10     shl    $0x10,%r10
   0x00000000001960e0 <+256>:   48 09 de        or     %rbx,%rsi
   0x00000000001960e3 <+259>:   48 89 c3        mov    %rax,%rbx
   0x00000000001960e6 <+262>:   48 09 f2        or     %rsi,%rdx
   0x00000000001960e9 <+265>:   48 89 c6        mov    %rax,%rsi
   0x00000000001960ec <+268>:   48 c1 eb 29     shr    $0x29,%rbx
   0x00000000001960f0 <+272>:   48 c1 ee 26     shr    $0x26,%rsi
   0x00000000001960f4 <+276>:   81 e3 00 fc 00 00       and    $0xfc00,%ebx
   0x00000000001960fa <+282>:   81 e6 e0 03 00 00       and    $0x3e0,%esi
   0x0000000000196100 <+288>:   48 89 dd        mov    %rbx,%rbp
   0x0000000000196103 <+291>:   48 89 f3        mov    %rsi,%rbx
   0x0000000000196106 <+294>:   48 89 c6        mov    %rax,%rsi
   0x0000000000196109 <+297>:   48 c1 ee 23     shr    $0x23,%rsi
   0x000000000019610d <+301>:   48 09 eb        or     %rbp,%rbx
   0x0000000000196110 <+304>:   83 e6 1f        and    $0x1f,%esi
   0x0000000000196113 <+307>:   48 09 de        or     %rbx,%rsi
   0x0000000000196116 <+310>:   48 89 cb        mov    %rcx,%rbx
   0x0000000000196119 <+313>:   48 c1 eb 09     shr    $0x9,%rbx
   0x000000000019611d <+317>:   81 e3 00 fc 00 00       and    $0xfc00,%ebx
   0x0000000000196123 <+323>:   48 89 dd        mov    %rbx,%rbp
   0x0000000000196126 <+326>:   48 89 cb        mov    %rcx,%rbx
   0x0000000000196129 <+329>:   48 c1 e9 03     shr    $0x3,%rcx
   0x000000000019612d <+333>:   48 c1 eb 06     shr    $0x6,%rbx
   0x0000000000196131 <+337>:   83 e1 1f        and    $0x1f,%ecx
   0x0000000000196134 <+340>:   81 e3 e0 03 00 00       and    $0x3e0,%ebx
   0x000000000019613a <+346>:   48 09 eb        or     %rbp,%rbx
   0x000000000019613d <+349>:   48 09 d9        or     %rbx,%rcx
   0x0000000000196140 <+352>:   4c 09 c9        or     %r9,%rcx
   0x0000000000196143 <+355>:   4c 09 d9        or     %r11,%rcx
   0x0000000000196146 <+358>:   4c 09 d1        or     %r10,%rcx
   0x0000000000196149 <+361>:   48 89 4f 08     mov    %rcx,0x8(%rdi)
   0x000000000019614d <+365>:   48 89 c1        mov    %rax,%rcx
   0x0000000000196150 <+368>:   48 c1 e9 09     shr    $0x9,%rcx
   0x0000000000196154 <+372>:   81 e1 00 fc 00 00       and    $0xfc00,%ecx
   0x000000000019615a <+378>:   49 89 c9        mov    %rcx,%r9
   0x000000000019615d <+381>:   48 89 c1        mov    %rax,%rcx
   0x0000000000196160 <+384>:   48 c1 e9 06     shr    $0x6,%rcx
   0x0000000000196164 <+388>:   48 c1 e8 03     shr    $0x3,%rax
   0x0000000000196168 <+392>:   49 c1 e0 30     shl    $0x30,%r8
   0x000000000019616c <+396>:   81 e1 e0 03 00 00       and    $0x3e0,%ecx
   0x0000000000196172 <+402>:   83 e0 1f        and    $0x1f,%eax
   0x0000000000196175 <+405>:   48 c1 e2 20     shl    $0x20,%rdx
   0x0000000000196179 <+409>:   4c 09 c9        or     %r9,%rcx
   0x000000000019617c <+412>:   48 09 c8        or     %rcx,%rax
   0x000000000019617f <+415>:   48 89 f1        mov    %rsi,%rcx
   0x0000000000196182 <+418>:   4c 09 c0        or     %r8,%rax
   0x0000000000196185 <+421>:   48 c1 e1 10     shl    $0x10,%rcx
   0x0000000000196189 <+425>:   48 09 d0        or     %rdx,%rax
   0x000000000019618c <+428>:   48 09 c8        or     %rcx,%rax
   0x000000000019618f <+431>:   5b      pop    %rbx
   0x0000000000196190 <+432>:   48 89 07        mov    %rax,(%rdi)
   0x0000000000196193 <+435>:   5d      pop    %rbp
   0x0000000000196194 <+436>:   c3      retq   
End of assembler dump.

