On Wednesday 13 July 2011 14:26:02 Jason wrote:
> On Wednesday 13 July 2011 14:01:39 Cactus wrote:
> > Darn, I already did the conversion :-(
> > 
> > I don't have enough registers to use only the 32-bit registers so I have
> > to put stuff in r8 and r9 instead.   Given this involved prefix opcodes,
> > I am wondering what I should o with your coded nops since any alignment
> > you are seeking won't be the same with r8 and r9?    ANy ideas on how to
> > optimise this when r8 and r9 are used instead of rsi and rdx?
> 
> The nop's in the loop are for the schedulers/pick I believe , not for code
> alignment. So it shouldn't matter. If it does then you could try moving the
> nop's around , or as only two registers are used in the loop just swap them
> around in the feedin code. See next post on alternative.
> 
> >    Brian

Compare the results of running our benchmpn on the current svn on a K8, with
the assembler being gas as opposed to yasm:

8c8
<                 addmul_2      4783
---
>                 addmul_2      4782
11c11
<                 addsub_n      2194
---
>                 addsub_n      2193
14,16c14,16
<                   rshift      2195
<                  lshift2      1524
<                  rshift2      1857
---
>                   rshift      2197
>                  lshift2      1525
>                  rshift2      1862
23,26c23,26
<                 addlsh_n      3032
<                 sublsh_n      3282
<                 inclsh_n      3031
<                 declsh_n      3283
---
>                 addlsh_n      3035
>                 sublsh_n      3284
>                 inclsh_n      3034
>                 declsh_n      3284
36c36
<                 popcount      4701
---
>                 popcount      4704
40,42c40,42
<                    and_n      1522
<                    xor_n      1522
<                    ior_n      1522
---
>                    and_n      1525
>                    xor_n      1525
>                    ior_n      1525
48c48
<                  lshiftc      2654
---
>                  lshiftc      2657
54,55c54,55
<               add_err1_n      2798
<               sub_err1_n      2797
---
>               add_err1_n      2796
>               sub_err1_n      2796
58,60c58,60
<     divrem_euclidean_r_1      3212
<                 divrem_1      10821
<                 divrem_2      21069
---
>     divrem_euclidean_r_1      3215
>                 divrem_1      10824
>                 divrem_2      21070
67c67
< rsh_divrem_hensel_qr_1_1      10065
---
> rsh_divrem_hensel_qr_1_1      10066
70,71c70,71
<                  mod_1_2      3527
<                  mod_1_3      3039
---
>                  mod_1_2      4022
>                  mod_1_3      3042

The only significant difference is mod_1_2, where gas runs at 3.5 and yasm at
4.0 cycles per word. The listings are attached, and you can see that the first
difference is at "jc skiplp", a forward jump to skip the loop, where the
displacements are different (0x48 vs 0x46), and the next difference is at
"jnc lp", where again the displacements are different (0xC4 vs 0xC2). But note
that the addresses are all the same, so something is wrong here. I'll post to
yasm's list.

Jason



Jason

-- 
You received this message because you are subscribed to the Google Groups 
"mpir-devel" group.
To post to this group, send email to [email protected].
To unsubscribe from this group, send email to 
[email protected].
For more options, visit this group at 
http://groups.google.com/group/mpir-devel?hl=en.

   1 0000 4154     	push %r12
   2 0002 4155     	push %r13
   3 0004 4156     	push %r14
   4 0006 4C8B74D6 	mov -8(%rsi,%rdx,8),%r14
   4      F8
   5 000b 4C8B6CD6 	mov -16(%rsi,%rdx,8),%r13
   5      F0
   6 0010 4C8B01   	mov (%rcx),%r8
   7 0013 4C8B4908 	mov 8(%rcx),%r9
   8 0017 4C8B5110 	mov 16(%rcx),%r10
   9 001b 4889D1   	mov %rdx,%rcx
  10 001e 488B44D6 	mov -24(%rsi,%rdx,8),%rax
  10      E8
  11 0023 49F7E0   	mul %r8
  12 0026 4C8B5CCE 	mov -32(%rsi,%rcx,8),%r11
  12      E0
  13 002b 4D31E4   	xor %r12,%r12
  14 002e 4883E906 	sub $6,%rcx
  15 0032 7248     	jc skiplp
  16 0034 6666662E 	.align 16
  16      0F1F8400 
  16      00000000 
  17 0040 4901C3   	lp:add %rax,%r11
  18 0043 4911D4   	 adc %rdx,%r12
  19 0046 4C89C8   	 mov %r9,%rax
  20 0049 49F7E5   	 mul %r13
  21 004c 4901C3   	 add %rax,%r11
  22 004f 4911D4   	 adc %rdx,%r12
  23 0052 4D89DD   	 mov %r11,%r13
  24 0055 4C89D0   	 mov %r10,%rax
  25 0058 49F7E6   	 mul %r14
  26 005b 4901C5   	 add %rax,%r13
  27 005e 488B44CE 	 mov 8(%rsi,%rcx,8),%rax
  27      08
  28 0063 4D89E6   	 mov %r12,%r14
  29 0066 4911D6   	 adc %rdx,%r14
  30 0069 49F7E0   	 mul %r8
  31 006c 41BC0000 	 mov $0,%r12d
  31      0000
  32 0072 4C8B1CCE 	 mov 0(%rsi,%rcx,8),%r11
  33 0076 4883E902 	 sub $2,%rcx
  34 007a 73C4     	 jnc lp
  35              	skiplp:
  36 007c 4901C3   	 add %rax,%r11
  37 007f 4911D4   	 adc %rdx,%r12
  38 0082 4C89C8   	 mov %r9,%rax
  39 0085 49F7E5   	 mul %r13
  40 0088 4901C3   	 add %rax,%r11
  41 008b 4911D4   	 adc %rdx,%r12
  42 008e 4D89DD   	 mov %r11,%r13
  43 0091 4C89D0   	 mov %r10,%rax
  44 0094 49F7E6   	 mul %r14
  45 0097 4901C5   	 add %rax,%r13
  46 009a 4D89E6   	 mov %r12,%r14
  47 009d 4911D6   	 adc %rdx,%r14
  48 00a0 4883F9FE 	cmp $-2,%rcx
  49 00a4 7426     	je case0
  50              	case1:
  51 00a6 4C8B5CCE 	 mov 8(%rsi,%rcx,8),%r11
  51      08
  52 00ab 4D31E4   	 xor %r12,%r12
  53 00ae 4C89C0   	 mov %r8,%rax
  54 00b1 49F7E5   	 mul %r13
  55 00b4 4901C3   	 add %rax,%r11
  56 00b7 4911D4   	 adc %rdx,%r12
  57 00ba 4D89DD   	 mov %r11,%r13
  58 00bd 4C89C8   	 mov %r9,%rax
  59 00c0 49F7E6   	 mul %r14
  60 00c3 4901C5   	 add %rax,%r13
  61 00c6 4D89E6   	 mov %r12,%r14
  62 00c9 4911D6   	 adc %rdx,%r14
  63              	case0:
  64 00cc 4C89C0   	mov %r8,%rax
  65 00cf 49F7E6   	mul %r14
  66 00d2 4901C5   	add %rax,%r13
  67 00d5 4883D200 	adc $0,%rdx
  68 00d9 4C892F   	mov %r13,(%rdi)
  69 00dc 48895708 	mov %rdx,8(%rdi)
  70 00e0 415E     	pop %r14
  71 00e2 415D     	pop %r13
  72 00e4 415C     	pop %r12
  73 00e6 C3       	ret
     1 00000000 4154                   push %r12
     2 00000002 4155                   push %r13
     3 00000004 4156                   push %r14
     4 00000006 4C8B74D6F8             mov -8(%rsi,%rdx,8),%r14
     5 0000000B 4C8B6CD6F0             mov -16(%rsi,%rdx,8),%r13
     6 00000010 4C8B01                 mov (%rcx),%r8
     7 00000013 4C8B4908               mov 8(%rcx),%r9
     8 00000017 4C8B5110               mov 16(%rcx),%r10
     9 0000001B 4889D1                 mov %rdx,%rcx
    10 0000001E 488B44D6E8             mov -24(%rsi,%rdx,8),%rax
    11 00000023 49F7E0                 mul %r8
    12 00000026 4C8B5CCEE0             mov -32(%rsi,%rcx,8),%r11
    13 0000002B 4D31E4                 xor %r12,%r12
    14 0000002E 4883E906               sub $6,%rcx
    15 00000032 7246                   jc skiplp
    16 00000034 6666662E0F1F840000-    .align 16
    17 00000034 000000             
    18 00000040 4901C3                 lp:	add %rax,%r11
    19 00000043 4911D4                 	adc %rdx,%r12
    20 00000046 4C89C8                 	mov %r9,%rax
    21 00000049 49F7E5                 	mul %r13
    22 0000004C 4901C3                 	add %rax,%r11
    23 0000004F 4911D4                 	adc %rdx,%r12
    24 00000052 4D89DD                 	mov %r11,%r13
    25 00000055 4C89D0                 	mov %r10,%rax
    26 00000058 49F7E6                 	mul %r14
    27 0000005B 4901C5                 	add %rax,%r13
    28 0000005E 488B44CE08             	mov 8(%rsi,%rcx,8),%rax
    29 00000063 4D89E6                 	mov %r12,%r14
    30 00000066 4911D6                 	adc %rdx,%r14
    31 00000069 49F7E0                 	mul %r8
    32 0000006C 41BC00000000           	mov $0,%r12d
    33 00000072 4C8B1CCE               	mov 0(%rsi,%rcx,8),%r11
    34 00000076 4883E902               	sub $2,%rcx
    35 0000007A 73C2                   	jnc lp
    36                                 skiplp:
    37 0000007C 4901C3                 	add %rax,%r11
    38 0000007F 4911D4                 	adc %rdx,%r12
    39 00000082 4C89C8                 	mov %r9,%rax
    40 00000085 49F7E5                 	mul %r13
    41 00000088 4901C3                 	add %rax,%r11
    42 0000008B 4911D4                 	adc %rdx,%r12
    43 0000008E 4D89DD                 	mov %r11,%r13
    44 00000091 4C89D0                 	mov %r10,%rax
    45 00000094 49F7E6                 	mul %r14
    46 00000097 4901C5                 	add %rax,%r13
    47 0000009A 4D89E6                 	mov %r12,%r14
    48 0000009D 4911D6                 	adc %rdx,%r14
    49 000000A0 4883F9FE               cmp $-2,%rcx
    50 000000A4 7424                   je case0
    51                                 case1:
    52 000000A6 4C8B5CCE08             	mov 8(%rsi,%rcx,8),%r11
    53 000000AB 4D31E4                 	xor %r12,%r12
    54 000000AE 4C89C0                 	mov %r8,%rax
    55 000000B1 49F7E5                 	mul %r13
    56 000000B4 4901C3                 	add %rax,%r11
    57 000000B7 4911D4                 	adc %rdx,%r12
    58 000000BA 4D89DD                 	mov %r11,%r13
    59 000000BD 4C89C8                 	mov %r9,%rax
    60 000000C0 49F7E6                 	mul %r14
    61 000000C3 4901C5                 	add %rax,%r13
    62 000000C6 4D89E6                 	mov %r12,%r14
    63 000000C9 4911D6                 	adc %rdx,%r14
    64                                 case0:	
    65 000000CC 4C89C0                 mov %r8,%rax
    66 000000CF 49F7E6                 mul %r14
    67 000000D2 4901C5                 add %rax,%r13
    68 000000D5 4883D200               adc $0,%rdx
    69 000000D9 4C892F                 mov %r13,(%rdi)
    70 000000DC 48895708               mov %rdx,8(%rdi)
    71 000000E0 415E                   pop %r14
    72 000000E2 415D                   pop %r13
    73 000000E4 415C                   pop %r12
    74 000000E6 C3                     ret
    75                                 

Reply via email to