On 26 Jul 01 at 18:26, Christopher C. Chimelis wrote:
> > ncpfs code is affected by this (fortunately it still links as I
> > gave up teaching gcc to inline code long ago). And as there is no
> > documentation since when this option is supported, it is just guessing
> > (or someone has to add -flimit-inline check to autoconf or kernel makefile).
> 
> Ok, to clarify what I think you're trying to say:
>     gcc 2.95.x used -finline-limit-N
>     gcc 3.0.x uses  -finline-limit=N
> (no, that's not a typo on my part...the hyphen was replaced by an equal
> sign according to the docs)
> 
> Since neither 2.95.x or 3.0.x understand the other's options, it's
> not inlining the code that needs to be?  I understand what the various gcc
> passes do WRT inlining, so no need to explain that portion...I'm just
> trying to understand why changing the kernel Makefile is such a big deal,
> especially since the kernel folks haven't began recommending that the

Fortunately 3.0.x understands -finline-limit-N , so it can be simple
check for -fno-strict-aliasing is. Unless 3.0.2 will stop supporting
this older format.

> kernel be compiled with gcc 3.0.x yet anyway.  There are MANY more
> pitfalls involved in compiling kernels with gcc 3.0.x on some
> platforms.  I've had to patch Alpha's kernel sources just to compile the
> kernel at all with gcc 3.0.
 
> For the record, according to 2.4.6's Documentation/Changes file:

I was always under impression that in future you want to get old gcc2.91.xx 
replaced with newer one. If newer one refuses to compile kernel...
> 
> > extern inline void copy_siginfo(siginfo_t* to, siginfo_t* from) {
> >   if (from->si_code < 0) 
> >      memcpy(to, from, sizeof(siginfo_t));
> >   else
> >      memcpy(to, from, 3*sizeof(int) + sizeof(from->_sifields._sigchld));
> > }
> > 
> > then I think that there is something wrong...
> 
> Hmm...I'm definitely going to have to play with this.  Can you come up
> with a small testcase so that I can test other architectures as well?

After digging through - problem is not with copy_siginfo, but with
kernel's memcpy. 

I understand that internal code for __constant_memcpy can be very large
due to switch(), but as only one branch is then really used (and
correctly retrieved in copy_siginfo), compiler should not give up
with function too large. It does what documentation says it does (modulo
default limit), but - it could even optimize whole function to nothing,
but still complain about non-inlinable code.

And btw, for this particular example __constant_memcpy inlines with 
'-finline-limit=143' or larger. So I think that 100 is just unreasonable 
low inline limit, as real examples show that code generated from such 
function contains only 5 asm instructions... Maybe if optimizer could 
check longest code path in __constant_memcpy instead of size of whole
function, it could work.

Or even better - I believe that there is bug in how size of
copy_siginfo was computed. As compiler knew size parameter passed to
__constant_memcpy even before whole copy_siginfo was compiled,
it should threw unused portions of inlined __constant_memcpy out,
should not? (and I do not believe that after this throwing out
copy_siginfo size is 143...)
                                            Thanks,
                                                Petr Vandrovec
                                                [EMAIL PROTECTED]
                                                
Generated code (+ 2 comments):
                                                
    .file   "xx.c"
    .section    .rodata
.LC0:
    .string "%u\n"
    .text
    .align 16
.globl main
    .type   main,@function
main:
    pushl   %edi
    xorl    %eax, %eax
    subl    $288, %esp
    cld
    movl    %esp, %edi
    movl    $33, %ecx
    rep
    stosl
    movl    %esp, %eax             ## Why not push %esp?
    pushl   %eax                   ## It does same thing...
    leal    148(%esp), %eax
    pushl   %eax
    call    copy_siginfo
    movl    152(%esp), %eax        ## pushl 152(%esp) ?
    pushl   %eax                   ## instead of using %eax
    pushl   $.LC0
    call    printf
    addl    $304, %esp
    popl    %edi
    ret
.Lfe1:
    .size   main,.Lfe1-main
    .align 16
    .type   copy_siginfo,@function
copy_siginfo:
    pushl   %edi
    pushl   %esi
    movl    16(%esp), %esi
    movl    12(%esp), %edi
    movl    (%esi), %edx
    testl   %edx, %edx
    js  .L64
    movl    $6, %ecx
.L65:
#APP
    rep ; movsl
#NO_APP
    popl    %esi
    popl    %edi
    ret
    .p2align 4,,7
.L64:
    movl    $33, %ecx
    jmp .L65
.Lfe2:
    .size   copy_siginfo,.Lfe2-copy_siginfo
    .ident  "GCC: (GNU) 3.0.1 20010721 (Debian prerelease)"


Source:

#include <linux/types.h>

static inline void * __memcpy(void * to, const void * from, size_t n)
{
int d0, d1, d2;
__asm__ __volatile__(
    "rep ; movsl\n\t"
    "testb $2,%b4\n\t"
    "je 1f\n\t"
    "movsw\n"
    "1:\ttestb $1,%b4\n\t"
    "je 2f\n\t"
    "movsb\n"
    "2:"
    : "=&c" (d0), "=&D" (d1), "=&S" (d2)
    :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
    : "memory");
return (to);
}

/*
 * This looks horribly ugly, but the compiler can optimize it totally,
 * as the count is constant.
 */
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
    switch (n) {
        case 0:
            return to;
        case 1:
            *(unsigned char *)to = *(const unsigned char *)from;
            return to;
        case 2:
            *(unsigned short *)to = *(const unsigned short *)from;
            return to;
        case 3:
            *(unsigned short *)to = *(const unsigned short *)from;
            *(2+(unsigned char *)to) = *(2+(const unsigned char *)from);
            return to;
        case 4:
            *(unsigned long *)to = *(const unsigned long *)from;
            return to;
        case 6: /* for Ethernet addresses */
            *(unsigned long *)to = *(const unsigned long *)from;
            *(2+(unsigned short *)to) = *(2+(const unsigned short *)from);
            return to;
        case 8:
            *(unsigned long *)to = *(const unsigned long *)from;
            *(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
            return to;
        case 12:
            *(unsigned long *)to = *(const unsigned long *)from;
            *(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
            *(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
            return to;
        case 16:
            *(unsigned long *)to = *(const unsigned long *)from;
            *(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
            *(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
            *(3+(unsigned long *)to) = *(3+(const unsigned long *)from);
            return to;
        case 20:
            *(unsigned long *)to = *(const unsigned long *)from;
            *(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
            *(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
            *(3+(unsigned long *)to) = *(3+(const unsigned long *)from);
            *(4+(unsigned long *)to) = *(4+(const unsigned long *)from);
            return to;
    }
#define COMMON(x) \
__asm__ __volatile__( \
    "rep ; movsl" \
    x \
    : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
    : "0" (n/4),"1" ((long) to),"2" ((long) from) \
    : "memory");
{
    int d0, d1, d2;
    switch (n % 4) {
        case 0: COMMON(""); return to;
        case 1: COMMON("\n\tmovsb"); return to;
        case 2: COMMON("\n\tmovsw"); return to;
        default: COMMON("\n\tmovsw\n\tmovsb"); return to;
    }
}
  
#undef COMMON
}

#if 0
#define memcpy(t, f, n) \
(__builtin_constant_p(n) ? \
 __constant_memcpy((t),(f),(n)) : \
 __memcpy((t),(f),(n)))
#else
#define memcpy(t, f, n) \
 __constant_memcpy((t),(f),(n))

/*
#define memcpy(t, f, n) \
 __memcpy((t),(f),(n))
 
works fine, so switch is a culprit */
#endif

/* XXX: This structure was copied from the Alpha; is there an iBCS version?  */

typedef struct siginfo {
    int si_code;
    int _pad[32];
} siginfo_t;

/* change this to extern... */
static inline void copy_siginfo(siginfo_t *to, siginfo_t *from)
{
    if (from->si_code < 0)
        memcpy(to, from, sizeof(siginfo_t));
    else
        memcpy(to, from, sizeof(int) * 6);
}

void main(void) {
    siginfo_t a;
    siginfo_t b;

    memset(&b, 0, sizeof(b));
    copy_siginfo(&a, &b);
    printf("%u\n", a.si_code);
}


Reply via email to