https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79693

            Bug ID: 79693
           Summary: Memory buffer handling - additional optimization
                    proposal
           Product: gcc
           Version: 5.4.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: bugzi...@poradnik-webmastera.com
  Target Milestone: ---

I have checked how gcc treats temporary buffers allocated in different ways
(local buffer on stack, malloced locally in function, malloced outside and
passed via argument) and found that gcc could do better work there. I have few
proposals how to make things better:

1. Introduce __builtin_assume_temporary(ptr) and/or __attribute__((temporary))
which will tell gcc that given function argument or variable is a temporary
buffer and its contents after function execution will not be used. By doing so
gcc could treat it as if it was allocated locally in function and optimize out
unnecessary memory writes.

2. One of function versions used malloc() to allocate buffer of constant size
at beginning of function, and free() to delete it at the end. When this
function was inlined and called twice, its body was inserted twice in my code.
Other function versions with buffer on stack or malloced outside and passed via
argument were inserted once. I am not sure if this is some bug or feature,
someone familiar with this should check it.

3. If there is valid reason to actually duplicate this code (e.g. loop
unrolling), gcc could detect that code calls malloc and free in loop and
repeatedly allocate/free buffer of the same size. In such case gcc could
optimize this to call malloc and free once. If someone will use calloc, gcc
could simply clear buffer before each loop iteration.

I tested this with gcc 5.4.0 shipped by default in Cygwin 64 bit. Test code
used by me is below.

Code:

#include <stdio.h>
#include <stdlib.h>

#define CNT 4

#define ATTR_INLINE __attribute__((hot, always_inline)) static inline
//#define ATTR_INLINE

ATTR_INLINE int func_1(int* data)
{
    int buff[CNT * CNT];

    for (int i = 0; i < CNT; ++i)
    {
        for (int j = 0; j < CNT; ++j)
        {
            buff[i * CNT + j] = data[i];
        }
    }

    for (int n = 0; n < CNT; ++n)
    {
        for (int i = 0; i < n; ++i)
        {
            for (int j = 0; j < n; ++j)
            {
                buff[i * CNT + j] = buff[(i+1) * CNT + j] + buff[i * CNT +
j+1];
            }
        }
    }

    return buff[0];
}

ATTR_INLINE int func_2(int* data)
{
    int* buff = (int*)malloc(CNT * CNT * sizeof(int));

    for (int i = 0; i < CNT; ++i)
    {
        for (int j = 0; j < CNT; ++j)
        {
            buff[i * CNT + j] = data[i];
        }
    }

    for (int n = 0; n < CNT; ++n)
    {
        for (int i = 0; i < n; ++i)
        {
            for (int j = 0; j < n; ++j)
            {
                buff[i * CNT + j] = buff[(i+1) * CNT + j] + buff[i * CNT +
j+1];
            }
        }
    }

    int result = buff[0];
    free(buff);
    return result;
}

ATTR_INLINE int func_3(int* __restrict__ data, int* __restrict__ buff)
{
    for (int i = 0; i < CNT; ++i)
    {
        for (int j = 0; j < CNT; ++j)
        {
            buff[i * CNT + j] = data[i];
        }
    }

    for (int n = 0; n < CNT; ++n)
    {
        for (int i = 0; i < n; ++i)
        {
            for (int j = 0; j < n; ++j)
            {
                buff[i * CNT + j] = buff[(i+1) * CNT + j] + buff[i * CNT +
j+1];
            }
        }
    }

    return buff[0];
}

ATTR_INLINE int func_4(int* data, int* buff)
{
    for (int i = 0; i < CNT; ++i)
    {
        for (int j = 0; j < CNT; ++j)
        {
            buff[i * CNT + j] = data[i];
        }
    }

    for (int n = 0; n < CNT; ++n)
    {
        for (int i = 0; i < n; ++i)
        {
            for (int j = 0; j < n; ++j)
            {
                buff[i * CNT + j] = buff[(i+1) * CNT + j] + buff[i * CNT +
j+1];
            }
        }
    }

    return buff[0];
}

int main()
{
    int* data = (int*)malloc(CNT * sizeof(int));
    int* buff = (int*)malloc(CNT * sizeof(int));
    for (int n = 0; n < CNT; ++n)
        data[n] = rand();

    int sum = 0;
    printf("1");
    sum += func_1(data);
    sum += func_1(data);
    printf("2");
    sum += func_2(data);
    sum += func_2(data);
    printf("3");
    sum += func_3(data, buff);
    sum += func_3(data, buff);
    printf("4");
    sum += func_4(data, buff);
    sum += func_4(data, buff);
    printf("4");

    return sum;
}

Reply via email to