On Wednesday 13 July 2011 14:26:02 Jason wrote:
> On Wednesday 13 July 2011 14:01:39 Cactus wrote:
> > Darn, I already did the conversion :-(
> >
> > I don't have enough registers to use only the 32-bit registers so I have
> > to put stuff in r8 and r9 instead. Given this involves prefix opcodes,
> > I am wondering what I should do with your coded nops, since any alignment
> > you are seeking won't be the same with r8 and r9? Any ideas on how to
> > optimise this when r8 and r9 are used instead of rsi and rdx?
>
> The nops in the loop are for the schedulers/pick, I believe, not for code
> alignment, so it shouldn't matter. If it does, then you could try moving the
> nops around or, as only two registers are used in the loop, just swap them
> around in the feed-in code. See the next post for an alternative.
>
> > Brian
Compare the results of running our benchmpn on the current svn on a K8, with
the assembler being gas as opposed to yasm:
8c8
< addmul_2 4783
---
> addmul_2 4782
11c11
< addsub_n 2194
---
> addsub_n 2193
14,16c14,16
< rshift 2195
< lshift2 1524
< rshift2 1857
---
> rshift 2197
> lshift2 1525
> rshift2 1862
23,26c23,26
< addlsh_n 3032
< sublsh_n 3282
< inclsh_n 3031
< declsh_n 3283
---
> addlsh_n 3035
> sublsh_n 3284
> inclsh_n 3034
> declsh_n 3284
36c36
< popcount 4701
---
> popcount 4704
40,42c40,42
< and_n 1522
< xor_n 1522
< ior_n 1522
---
> and_n 1525
> xor_n 1525
> ior_n 1525
48c48
< lshiftc 2654
---
> lshiftc 2657
54,55c54,55
< add_err1_n 2798
< sub_err1_n 2797
---
> add_err1_n 2796
> sub_err1_n 2796
58,60c58,60
< divrem_euclidean_r_1 3212
< divrem_1 10821
< divrem_2 21069
---
> divrem_euclidean_r_1 3215
> divrem_1 10824
> divrem_2 21070
67c67
< rsh_divrem_hensel_qr_1_1 10065
---
> rsh_divrem_hensel_qr_1_1 10066
70,71c70,71
< mod_1_2 3527
< mod_1_3 3039
---
> mod_1_2 4022
> mod_1_3 3042
The only significant difference is mod_1_2, where gas runs at 3.5 and yasm at
4.0 cycles per word. The listings are attached, and you can see that the first
difference is at "jc skiplp", a forward jump to skip the loop, where the
displacements are different, and the next difference is at "jnc lp",
where again the displacements are different. But note that the addresses are
all the same, so something is wrong here. I'll post to yasm's list.
Jason
Jason
--
You received this message because you are subscribed to the Google Groups
"mpir-devel" group.
To post to this group, send email to [email protected].
To unsubscribe from this group, send email to
[email protected].
For more options, visit this group at
http://groups.google.com/group/mpir-devel?hl=en.
1 0000 4154 push %r12
2 0002 4155 push %r13
3 0004 4156 push %r14
4 0006 4C8B74D6 mov -8(%rsi,%rdx,8),%r14
4 F8
5 000b 4C8B6CD6 mov -16(%rsi,%rdx,8),%r13
5 F0
6 0010 4C8B01 mov (%rcx),%r8
7 0013 4C8B4908 mov 8(%rcx),%r9
8 0017 4C8B5110 mov 16(%rcx),%r10
9 001b 4889D1 mov %rdx,%rcx
10 001e 488B44D6 mov -24(%rsi,%rdx,8),%rax
10 E8
11 0023 49F7E0 mul %r8
12 0026 4C8B5CCE mov -32(%rsi,%rcx,8),%r11
12 E0
13 002b 4D31E4 xor %r12,%r12
14 002e 4883E906 sub $6,%rcx
15 0032 7248 jc skiplp
16 0034 6666662E .align 16
16 0F1F8400
16 00000000
17 0040 4901C3 lp:add %rax,%r11
18 0043 4911D4 adc %rdx,%r12
19 0046 4C89C8 mov %r9,%rax
20 0049 49F7E5 mul %r13
21 004c 4901C3 add %rax,%r11
22 004f 4911D4 adc %rdx,%r12
23 0052 4D89DD mov %r11,%r13
24 0055 4C89D0 mov %r10,%rax
25 0058 49F7E6 mul %r14
26 005b 4901C5 add %rax,%r13
27 005e 488B44CE mov 8(%rsi,%rcx,8),%rax
27 08
28 0063 4D89E6 mov %r12,%r14
29 0066 4911D6 adc %rdx,%r14
30 0069 49F7E0 mul %r8
31 006c 41BC0000 mov $0,%r12d
31 0000
32 0072 4C8B1CCE mov 0(%rsi,%rcx,8),%r11
33 0076 4883E902 sub $2,%rcx
34 007a 73C4 jnc lp
35 skiplp:
36 007c 4901C3 add %rax,%r11
37 007f 4911D4 adc %rdx,%r12
38 0082 4C89C8 mov %r9,%rax
39 0085 49F7E5 mul %r13
40 0088 4901C3 add %rax,%r11
41 008b 4911D4 adc %rdx,%r12
42 008e 4D89DD mov %r11,%r13
43 0091 4C89D0 mov %r10,%rax
44 0094 49F7E6 mul %r14
45 0097 4901C5 add %rax,%r13
46 009a 4D89E6 mov %r12,%r14
47 009d 4911D6 adc %rdx,%r14
48 00a0 4883F9FE cmp $-2,%rcx
49 00a4 7426 je case0
50 case1:
51 00a6 4C8B5CCE mov 8(%rsi,%rcx,8),%r11
51 08
52 00ab 4D31E4 xor %r12,%r12
53 00ae 4C89C0 mov %r8,%rax
54 00b1 49F7E5 mul %r13
55 00b4 4901C3 add %rax,%r11
56 00b7 4911D4 adc %rdx,%r12
57 00ba 4D89DD mov %r11,%r13
58 00bd 4C89C8 mov %r9,%rax
59 00c0 49F7E6 mul %r14
60 00c3 4901C5 add %rax,%r13
61 00c6 4D89E6 mov %r12,%r14
62 00c9 4911D6 adc %rdx,%r14
63 case0:
64 00cc 4C89C0 mov %r8,%rax
65 00cf 49F7E6 mul %r14
66 00d2 4901C5 add %rax,%r13
67 00d5 4883D200 adc $0,%rdx
68 00d9 4C892F mov %r13,(%rdi)
69 00dc 48895708 mov %rdx,8(%rdi)
70 00e0 415E pop %r14
71 00e2 415D pop %r13
72 00e4 415C pop %r12
73 00e6 C3 ret
1 00000000 4154 push %r12
2 00000002 4155 push %r13
3 00000004 4156 push %r14
4 00000006 4C8B74D6F8 mov -8(%rsi,%rdx,8),%r14
5 0000000B 4C8B6CD6F0 mov -16(%rsi,%rdx,8),%r13
6 00000010 4C8B01 mov (%rcx),%r8
7 00000013 4C8B4908 mov 8(%rcx),%r9
8 00000017 4C8B5110 mov 16(%rcx),%r10
9 0000001B 4889D1 mov %rdx,%rcx
10 0000001E 488B44D6E8 mov -24(%rsi,%rdx,8),%rax
11 00000023 49F7E0 mul %r8
12 00000026 4C8B5CCEE0 mov -32(%rsi,%rcx,8),%r11
13 0000002B 4D31E4 xor %r12,%r12
14 0000002E 4883E906 sub $6,%rcx
15 00000032 7246 jc skiplp
16 00000034 6666662E0F1F840000- .align 16
17 00000034 000000
18 00000040 4901C3 lp: add %rax,%r11
19 00000043 4911D4 adc %rdx,%r12
20 00000046 4C89C8 mov %r9,%rax
21 00000049 49F7E5 mul %r13
22 0000004C 4901C3 add %rax,%r11
23 0000004F 4911D4 adc %rdx,%r12
24 00000052 4D89DD mov %r11,%r13
25 00000055 4C89D0 mov %r10,%rax
26 00000058 49F7E6 mul %r14
27 0000005B 4901C5 add %rax,%r13
28 0000005E 488B44CE08 mov 8(%rsi,%rcx,8),%rax
29 00000063 4D89E6 mov %r12,%r14
30 00000066 4911D6 adc %rdx,%r14
31 00000069 49F7E0 mul %r8
32 0000006C 41BC00000000 mov $0,%r12d
33 00000072 4C8B1CCE mov 0(%rsi,%rcx,8),%r11
34 00000076 4883E902 sub $2,%rcx
35 0000007A 73C2 jnc lp
36 skiplp:
37 0000007C 4901C3 add %rax,%r11
38 0000007F 4911D4 adc %rdx,%r12
39 00000082 4C89C8 mov %r9,%rax
40 00000085 49F7E5 mul %r13
41 00000088 4901C3 add %rax,%r11
42 0000008B 4911D4 adc %rdx,%r12
43 0000008E 4D89DD mov %r11,%r13
44 00000091 4C89D0 mov %r10,%rax
45 00000094 49F7E6 mul %r14
46 00000097 4901C5 add %rax,%r13
47 0000009A 4D89E6 mov %r12,%r14
48 0000009D 4911D6 adc %rdx,%r14
49 000000A0 4883F9FE cmp $-2,%rcx
50 000000A4 7424 je case0
51 case1:
52 000000A6 4C8B5CCE08 mov 8(%rsi,%rcx,8),%r11
53 000000AB 4D31E4 xor %r12,%r12
54 000000AE 4C89C0 mov %r8,%rax
55 000000B1 49F7E5 mul %r13
56 000000B4 4901C3 add %rax,%r11
57 000000B7 4911D4 adc %rdx,%r12
58 000000BA 4D89DD mov %r11,%r13
59 000000BD 4C89C8 mov %r9,%rax
60 000000C0 49F7E6 mul %r14
61 000000C3 4901C5 add %rax,%r13
62 000000C6 4D89E6 mov %r12,%r14
63 000000C9 4911D6 adc %rdx,%r14
64 case0:
65 000000CC 4C89C0 mov %r8,%rax
66 000000CF 49F7E6 mul %r14
67 000000D2 4901C5 add %rax,%r13
68 000000D5 4883D200 adc $0,%rdx
69 000000D9 4C892F mov %r13,(%rdi)
70 000000DC 48895708 mov %rdx,8(%rdi)
71 000000E0 415E pop %r14
72 000000E2 415D pop %r13
73 000000E4 415C pop %r12
74 000000E6 C3 ret
75