I read in the Alpha reference manual (page (I) 4-145) that there is an
instruction, wh64, that avoids cache lines being reloaded from memory
during large writes. In theory this should be a good fit for copy_page or
clear_page. I gave it a try with clear_page, but wh64 instead only decreases
performance a lot on a 21264.
-------------------------------------------------------------------
/*
* Copyright (C) 2000 Andrea Arcangeli <[EMAIL PROTECTED]> SuSE
*
* Assembly clear_page for alpha (testing for wh64).
*/
#define __KERNEL__
#include <asm/page.h>
#include <asm/timex.h>
#include <asm/system.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define NR_PAGES 3000
/*
 * Zero one page using an unrolled store loop preceded by a wh64 hint.
 *
 * @page: address of the page to clear, passed as an integer.
 *
 * Each iteration issues wh64 on the current 64-byte block (the hint that
 * the block is about to be fully overwritten, so it need not be reloaded
 * from memory) and then stores $31 (the Alpha zero register) to the eight
 * quadwords of that block.  PAGE_SIZE/64 iterations cover the page.
 *
 * NOTE(review): presumably the caller passes a page-aligned address; the
 * loop itself only relies on PAGE_SIZE being a multiple of 64.
 */
static inline void _clear_page(unsigned long page)
{
	unsigned long count = PAGE_SIZE/64;
	unsigned long *ptr = (unsigned long *)page;

	asm volatile("1:\n\t"
		     "wh64 (%1)\n\t"
		     "stq $31,0(%1)\n\t"
		     "stq $31,8(%1)\n\t"
		     "stq $31,16(%1)\n\t"
		     "stq $31,24(%1)\n\t"
		     /* counter update is interleaved with the stores */
		     "subq %0,1,%0\n\t"
		     "stq $31,32(%1)\n\t"
		     "stq $31,40(%1)\n\t"
		     "stq $31,48(%1)\n\t"
		     "stq $31,56(%1)\n\t"
		     "addq %1,64,%1\n\t"
		     "bne %0,1b"
		     : "=&r" (count), "=&r" (ptr)
		     : "0" (count), "1" (ptr)
		     /*
		      * The stores modify memory the compiler cannot see from
		      * the operand list; without this clobber it is free to
		      * cache or reorder accesses to the page around the asm.
		      */
		     : "memory");
}
/*
 * Zero one page using the same unrolled store loop as _clear_page(), but
 * without the wh64 hint.  It serves as the plain-assembly baseline in the
 * benchmark.
 *
 * @page: address of the page to clear, passed as an integer.
 *
 * Each iteration stores $31 (the Alpha zero register) to eight consecutive
 * quadwords (64 bytes); PAGE_SIZE/64 iterations cover the page.
 */
static inline void _clear_page2(unsigned long page)
{
	unsigned long count = PAGE_SIZE/64;
	unsigned long *ptr = (unsigned long *)page;

	asm volatile("1:\n\t"
		     "stq $31,0(%1)\n\t"
		     "stq $31,8(%1)\n\t"
		     "stq $31,16(%1)\n\t"
		     "stq $31,24(%1)\n\t"
		     /* counter update is interleaved with the stores */
		     "subq %0,1,%0\n\t"
		     "stq $31,32(%1)\n\t"
		     "stq $31,40(%1)\n\t"
		     "stq $31,48(%1)\n\t"
		     "stq $31,56(%1)\n\t"
		     "addq %1,64,%1\n\t"
		     "bne %0,1b"
		     : "=&r" (count), "=&r" (ptr)
		     : "0" (count), "1" (ptr)
		     /*
		      * The stores modify memory the compiler cannot see from
		      * the operand list; without this clobber it is free to
		      * cache or reorder accesses to the page around the asm.
		      */
		     : "memory");
}
/*
 * Benchmark driver.
 *
 * Allocates NR_PAGES page-aligned pages, fills them with a non-zero
 * pattern, then clears three disjoint thirds of the buffer with
 * clear_page(), _clear_page() (wh64 variant) and _clear_page2() (plain
 * stores), timing each pass with get_cycles().  Prints the cycle counts
 * and their differences relative to clear_page(), then checks that the
 * two assembly-cleared regions are byte-identical to the clear_page()
 * region.
 *
 * Returns 0 on success, 1 on allocation failure or verification mismatch.
 */
int main(void)
{
	/*
	 * Rounding the malloc() result up to a page boundary can skip up to
	 * PAGE_SIZE-1 bytes, so allocate one spare page to keep the last
	 * page inside the allocation.
	 */
	char *raw = malloc(PAGE_SIZE * (NR_PAGES + 1));
	char *first;
	char *page[NR_PAGES];
	cycles_t start, orig, wh64, other;
	/* Bytes cleared by each of the three passes. */
	unsigned long region = PAGE_SIZE * (NR_PAGES/3);
	int status = 0;
	int i;

	if (!raw) {
		perror("malloc");
		return 1;
	}
	first = (char *) (((unsigned long) raw + ~PAGE_MASK) & PAGE_MASK);

	/* Touch every page with a non-zero value before timing. */
	memset(first, 1, PAGE_SIZE*NR_PAGES);
	for (i = 0; i < NR_PAGES; i++)
		page[i] = first + PAGE_SIZE * i;

	/* First third: reference clear_page(). */
	start = get_cycles();
	for (i = 0; i < NR_PAGES/3; i++)
		clear_page((unsigned long)page[i]);
	orig = get_cycles();
	orig -= start;

	/* Second third: assembly loop with wh64. */
	start = get_cycles();
	for (i = NR_PAGES/3; i < NR_PAGES/3*2; i++)
		_clear_page((unsigned long)page[i]);
	wh64 = get_cycles();
	wh64 -= start;

	/* Last third: assembly loop without wh64. */
	start = get_cycles();
	for (i = NR_PAGES/3*2; i < NR_PAGES; i++)
		_clear_page2((unsigned long)page[i]);
	other = get_cycles();
	other -= start;

	/*
	 * Cast explicitly so the conversion specifiers match the arguments
	 * whatever the width of cycles_t is, and compute the differences in
	 * a signed type so a slower variant prints as a negative number.
	 */
	printf("original %lu, wh64 %lu diff %ld, other %lu diff %ld\n",
	       (unsigned long)orig, (unsigned long)wh64,
	       (long)orig - (long)wh64,
	       (unsigned long)other, (long)orig - (long)other);

	/*
	 * Compare whole regions: PAGE_SIZE/3*NR_PAGES would truncate and
	 * leave the tail of each region unchecked.
	 */
	if (memcmp(page[0], page[NR_PAGES/3], region)) {
		printf("error wh64\n");
		status = 1;
	}
	if (memcmp(page[0], page[NR_PAGES/3*2], region)) {
		printf("error other\n");
		status = 1;
	}

	free(raw);
	return status;
}
-------------------------------------------------------------------
The output I get from my bench is:
original 7631627, wh64 9230262 diff -1598635, other 7579267 diff 52360
^^^^^^^ it's much slower than the
original clear_page in C
Comments/hints?
Andrea
PS. The above program compiles only with a GAS that has the binutils fix
I posted a few hours ago applied.