On 26 Jul 01 at 18:26, Christopher C. Chimelis wrote: > > ncpfs code is affected by this (fortunately it still links as I > > gave up teaching gcc to inline code long ago). And as there is no > > documentation since when this option is supported, it is just guessing > > (or someone has to add -flimit-inline check to autoconf or kernel makefile). > > Ok, to clarify what I think you're trying to say: > gcc 2.95.x used -finline-limit-N > gcc 3.0.x uses -finline-limit=N > (no, that's not a typo on my part...the hyphen was replaced by an equal > sign according to the docs) > > Since neither 2.95.x or 3.0.x understand the other's options, it's > not inlining the code that needs to be? I understand what the various gcc > passes do WRT inlining, so no need to explain that portion...I'm just > trying to understand why changing the kernel Makefile is such a big deal, > especially since the kernel folks haven't began recommending that the
Fortunately 3.0.x understands -finline-limit-N , so it can be simple check for -fno-strict-aliasing is. Unless 3.0.2 will stop supporting this older format. > kernel be compiled with gcc 3.0.x yet anyway. There are MANY more > pitfalls involved in compiling kernels with gcc 3.0.x on some > platforms. I've had to patch Alpha's kernel sources just to compile the > kernel at all with gcc 3.0. > For the record, according to 2.4.6's Documentation/Changes file: I was always under impression that in future you want to get old gcc2.91.xx replaced with newer one. If newer one refuses to compile kernel... > > > extern inline void copy_siginfo(siginfo_t* to, siginfo_t* from) { > > if (from->si_code < 0) > > memcpy(to, from, sizeof(siginfo_t)); > > else > > memcpy(to, from, 3*sizeof(int) + sizeof(from->_sifields._sigchld)); > > } > > > > then I think that there is something wrong... > > Hmm...I'm definitely going to have to play with this. Can you come up > with a small testcase so that I can test other architectures as well? After digging through - problem is not with copy_siginfo, but with kernel's memcpy. I understand that internal code for __constant_memcpy can be very large due to switch(), but as only one branch is then really used (and correctly retrieved in copy_siginfo), compiler should not give up with function too large. It does what documentation says it does (modulo default limit), but - it could even optimize whole function to nothing, but still complain about non-inlinable code. And btw, for this particular example __constant_memcpy inlines with '-finline-limit=143' or larger. So I think that 100 is just unreasonable low inline limit, as real examples show that code generated from such function contains only 5 asm instructions... Maybe if optimizer could check longest code path in __constant_memcpy instead of size of whole function, it could work. Or even better - I believe that there is bug in how size of copy_siginfo was computed. As compiler knew size parameter passed to __constant_memcpy even before whole copy_siginfo was compiled, it should threw unused portions of inlined __constant_memcpy out, should not? (and I do not believe that after this throwing out copy_siginfo size is 143...) Thanks, Petr Vandrovec [EMAIL PROTECTED] Generated code (+ 2 comments): .file "xx.c" .section .rodata .LC0: .string "%u\n" .text .align 16 .globl main .type main,@function main: pushl %edi xorl %eax, %eax subl $288, %esp cld movl %esp, %edi movl $33, %ecx rep stosl movl %esp, %eax ## Why not push %esp? pushl %eax ## It does same thing... leal 148(%esp), %eax pushl %eax call copy_siginfo movl 152(%esp), %eax ## pushl 152(%esp) ? pushl %eax ## instead of using %eax pushl $.LC0 call printf addl $304, %esp popl %edi ret .Lfe1: .size main,.Lfe1-main .align 16 .type copy_siginfo,@function copy_siginfo: pushl %edi pushl %esi movl 16(%esp), %esi movl 12(%esp), %edi movl (%esi), %edx testl %edx, %edx js .L64 movl $6, %ecx .L65: #APP rep ; movsl #NO_APP popl %esi popl %edi ret .p2align 4,,7 .L64: movl $33, %ecx jmp .L65 .Lfe2: .size copy_siginfo,.Lfe2-copy_siginfo .ident "GCC: (GNU) 3.0.1 20010721 (Debian prerelease)" Source: #include <linux/types.h> static inline void * __memcpy(void * to, const void * from, size_t n) { int d0, d1, d2; __asm__ __volatile__( "rep ; movsl\n\t" "testb $2,%b4\n\t" "je 1f\n\t" "movsw\n" "1:\ttestb $1,%b4\n\t" "je 2f\n\t" "movsb\n" "2:" : "=&c" (d0), "=&D" (d1), "=&S" (d2) :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from) : "memory"); return (to); } /* * This looks horribly ugly, but the compiler can optimize it totally, * as the count is constant. */ static inline void * __constant_memcpy(void * to, const void * from, size_t n) { switch (n) { case 0: return to; case 1: *(unsigned char *)to = *(const unsigned char *)from; return to; case 2: *(unsigned short *)to = *(const unsigned short *)from; return to; case 3: *(unsigned short *)to = *(const unsigned short *)from; *(2+(unsigned char *)to) = *(2+(const unsigned char *)from); return to; case 4: *(unsigned long *)to = *(const unsigned long *)from; return to; case 6: /* for Ethernet addresses */ *(unsigned long *)to = *(const unsigned long *)from; *(2+(unsigned short *)to) = *(2+(const unsigned short *)from); return to; case 8: *(unsigned long *)to = *(const unsigned long *)from; *(1+(unsigned long *)to) = *(1+(const unsigned long *)from); return to; case 12: *(unsigned long *)to = *(const unsigned long *)from; *(1+(unsigned long *)to) = *(1+(const unsigned long *)from); *(2+(unsigned long *)to) = *(2+(const unsigned long *)from); return to; case 16: *(unsigned long *)to = *(const unsigned long *)from; *(1+(unsigned long *)to) = *(1+(const unsigned long *)from); *(2+(unsigned long *)to) = *(2+(const unsigned long *)from); *(3+(unsigned long *)to) = *(3+(const unsigned long *)from); return to; case 20: *(unsigned long *)to = *(const unsigned long *)from; *(1+(unsigned long *)to) = *(1+(const unsigned long *)from); *(2+(unsigned long *)to) = *(2+(const unsigned long *)from); *(3+(unsigned long *)to) = *(3+(const unsigned long *)from); *(4+(unsigned long *)to) = *(4+(const unsigned long *)from); return to; } #define COMMON(x) \ __asm__ __volatile__( \ "rep ; movsl" \ x \ : "=&c" (d0), "=&D" (d1), "=&S" (d2) \ : "0" (n/4),"1" ((long) to),"2" ((long) from) \ : "memory"); { int d0, d1, d2; switch (n % 4) { case 0: COMMON(""); return to; case 1: COMMON("\n\tmovsb"); return to; case 2: COMMON("\n\tmovsw"); return to; default: COMMON("\n\tmovsw\n\tmovsb"); return to; } } #undef COMMON } #if 0 #define memcpy(t, f, n) \ (__builtin_constant_p(n) ? \ __constant_memcpy((t),(f),(n)) : \ __memcpy((t),(f),(n))) #else #define memcpy(t, f, n) \ __constant_memcpy((t),(f),(n)) /* #define memcpy(t, f, n) \ __memcpy((t),(f),(n)) works fine, so switch is a culprit */ #endif /* XXX: This structure was copied from the Alpha; is there an iBCS version? */ typedef struct siginfo { int si_code; int _pad[32]; } siginfo_t; /* change this to extern... */ static inline void copy_siginfo(siginfo_t *to, siginfo_t *from) { if (from->si_code < 0) memcpy(to, from, sizeof(siginfo_t)); else memcpy(to, from, sizeof(int) * 6); } void main(void) { siginfo_t a; siginfo_t b; memset(&b, 0, sizeof(b)); copy_siginfo(&a, &b); printf("%u\n", a.si_code); }