Using dcbz avoids first reading a cache line from memory before writing to the
line.
Timing results (starting with a clean cache, i.e., no write-backs of dirty lines):

JS20:
elapsed time: 0x0000000000009f5e
elapsed time using dcbz: 0x000000000000569e

elapsed time: 0x0000000000009fe9
elapsed time using dcbz: 0x0000000000005765


JS21:
elapsed time: 0x000000000000089e
elapsed time using dcbz: 0x0000000000000439

elapsed time: 0x0000000000000886
elapsed time using dcbz: 0x0000000000000438

.........................................

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

typedef unsigned char uchar;
typedef unsigned long ulong;

#define LINE_SIZE 128
#define PAGE_SIZE 0x1000

#define BUF1_SIZE (PAGE_SIZE * 64)
#define BUF2_SIZE (PAGE_SIZE)
#define BUF3_SIZE (0x800000)

static __inline__ ulong time_base(void);
static __inline__ void copy_page(void *dp, void *sp);
static __inline__ void cacheable_copy_page(void *dp, void *sp);
static __inline__ void cacheable_clear_page(void *addr);

static uchar clean_cache(uchar *buf3);


/*
 * Benchmark driver: copy 64 pages (4 outer iterations x 16 manually
 * unrolled calls) from buf2 into buf1, first with a plain doubleword
 * copy loop and then with a dcbz pre-zero of each destination page.
 * The PowerPC time-base delta for each run is printed.  clean_cache()
 * is called before each timed section so both runs start cold.
 */
int main(int argc, char **argv){

  int i;
  ulong tb1, tb2;
  uchar *buf1, *buf2, *buf3, *bufp;

  /* Over-allocate by one page so each buffer can be rounded up to a
   * page boundary below.  The raw malloc pointers are dropped without
   * being freed; the process exits right after the benchmark, so the
   * leak is intentional for this one-shot test. */
  buf1 = malloc(BUF1_SIZE + PAGE_SIZE);
  buf2 = malloc(BUF2_SIZE + PAGE_SIZE);
  buf3 = malloc(BUF3_SIZE + PAGE_SIZE);
  if (buf1 == NULL || buf2 == NULL || buf3 == NULL){
      fprintf(stderr, "malloc failed: %s\n", strerror(errno));
      return(1);
  }

  /* Round each pointer up to the next PAGE_SIZE boundary; dcbz and the
   * page-copy loops assume page-aligned addresses. */
  buf1 = (uchar *)((ulong)(buf1 + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1));
  buf2 = (uchar *)((ulong)(buf2 + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1));
  buf3 = (uchar *)((ulong)(buf3 + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1));

  /* Touch every page so the kernel has backed them before timing. */
  memset(buf1, 1, BUF1_SIZE);
  memset(buf2, 2, BUF2_SIZE);
  memset(buf3, 3, BUF3_SIZE);

  /* --- Run 1: plain copy --- */
  clean_cache(buf3);
  tb1 = time_base();

  /* 16 calls per iteration, manually unrolled so the measured section
   * is dominated by the copy itself rather than loop overhead. */
  for (bufp = buf1, i = 0; i < 4; i++, bufp += PAGE_SIZE*16){
      copy_page(bufp, buf2);
      copy_page(bufp+(PAGE_SIZE*1), buf2);
      copy_page(bufp+(PAGE_SIZE*2), buf2);
      copy_page(bufp+(PAGE_SIZE*3), buf2);
      copy_page(bufp+(PAGE_SIZE*4), buf2);
      copy_page(bufp+(PAGE_SIZE*5), buf2);
      copy_page(bufp+(PAGE_SIZE*6), buf2);
      copy_page(bufp+(PAGE_SIZE*7), buf2);

      copy_page(bufp+(PAGE_SIZE*8), buf2);
      copy_page(bufp+(PAGE_SIZE*9), buf2);
      copy_page(bufp+(PAGE_SIZE*10), buf2);
      copy_page(bufp+(PAGE_SIZE*11), buf2);
      copy_page(bufp+(PAGE_SIZE*12), buf2);
      copy_page(bufp+(PAGE_SIZE*13), buf2);
      copy_page(bufp+(PAGE_SIZE*14), buf2);
      copy_page(bufp+(PAGE_SIZE*15), buf2);
  }

  tb2 = time_base();
  printf("elapsed time: 0x%016lx\n", tb2 - tb1);


  /* --- Run 2: dcbz pre-zero, then copy --- */
  clean_cache(buf3);
  tb1 = time_base();

  for (bufp = buf1, i = 0; i < 4; i++, bufp += PAGE_SIZE*16){
      cacheable_copy_page(bufp, buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*1), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*2), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*3), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*4), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*5), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*6), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*7), buf2);

      cacheable_copy_page(bufp+(PAGE_SIZE*8), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*9), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*10), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*11), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*12), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*13), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*14), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*15), buf2);
  }

  tb2 = time_base();
  printf("elapsed time using dcbz: 0x%016lx\n", tb2 - tb1);

  return(0);
}


/*
 * Read the PowerPC time base register via mftb and return it as an
 * integer tick count.  The two prints in main() are deltas of this
 * value, so only relative readings matter, not the tick frequency.
 * NOTE(review): a single mftb yields the full counter only on 64-bit
 * implementations (ulong is 64-bit there) — this code assumes ppc64.
 */
static __inline__ ulong time_base(void)
{
        ulong tb;

        __asm__ __volatile__(
        "mftb   %0      # read time base"
        : "=r" (tb));

        return tb;
}


/*
 * Zero one PAGE_SIZE page, one cache line at a time, using dcbz.
 * dcbz establishes each destination line in the cache as all-zero
 * without fetching it from memory first — which is exactly the
 * read-for-ownership traffic this benchmark is measuring.
 * Clobbers CTR; the "memory" clobber tells the compiler the page's
 * contents changed behind its back.
 */
static __inline__ void cacheable_clear_page(void *addr)
{
        ulong lines, line_size;

        line_size = LINE_SIZE;           /* assumed D-cache line size (128) */
        lines = PAGE_SIZE / line_size;   /* dcbz iterations per page */

        /* %0 is both input ("0" ties it) and output ("=r"): the loop
         * advances it by line_size each iteration. */
        __asm__ __volatile__(
        "mtctr  %1      # clear_page\n\
1:      dcbz    0,%0\n\
        add     %0,%0,%3\n\
        bdnz    1b"
        : "=r" (addr)
        : "r" (lines), "0" (addr), "r" (line_size)
        : "%ctr", "memory");
}


/*
 * Copy one PAGE_SIZE page from sp to dp, one doubleword (8 bytes) at
 * a time, using load/store-with-update addressing and CTR as the loop
 * counter.  The first doubleword is copied before the loop so the
 * ldu/stdu update forms can use a constant +8 displacement.
 *
 * Fix over the original: the asm template modifies all three operands
 * (%0 and %1 are advanced by stdu/ldu, %2 is overwritten as the data
 * register by ld/ldu), but they were declared as input-only operands.
 * Under GCC's extended-asm contract the compiler may then assume those
 * registers still hold their original values afterwards — undefined
 * behavior.  Declare them read-write ("+") instead; "b" keeps the two
 * address operands out of r0, where a D-form base register reads as
 * the literal value 0.  Operand numbering (%0/%1/%2) is unchanged.
 */
static __inline__ void copy_page(void *dp, void *sp)
{
        ulong dwords, dword_size;

        dword_size = 8;                         /* bytes per ld/std */
        dwords = (PAGE_SIZE / dword_size) - 1;  /* loop count; first
                                                   dword copied above
                                                   the loop body */

        __asm__ __volatile__(
        "mtctr  %2      # copy_page\n\
        ld      %2,0(%1)\n\
        std     %2,0(%0)\n\
1:      ldu     %2,8(%1)\n\
        stdu    %2,8(%0)\n\
        bdnz    1b"
        : "+b" (dp), "+b" (sp), "+r" (dwords)
        : /* no read-only inputs */
        : "%ctr", "memory");
}


/*
 * Copy a page after first establishing every destination line in the
 * cache with dcbz (cacheable_clear_page).  Because the destination
 * lines are already cached as zeros, the stores in copy_page() hit in
 * the cache instead of forcing read-for-ownership fetches of the
 * destination from memory — the savings shown in the timings above.
 * The zeroed data is immediately overwritten, so the order (clear,
 * then copy) is essential.
 */
static __inline__ void cacheable_copy_page(void *dp, void *sp)
{

        cacheable_clear_page(dp);
        copy_page(dp, sp);
}


/*
 * Evict the benchmark buffers from the cache by reading one byte from
 * every cache line of the large buf3 buffer (BUF3_SIZE bytes, stride
 * LINE_SIZE), so each timed run starts cold.  The running byte sum is
 * returned only so the compiler cannot optimize the loads away.
 *
 * Fix over the original: 'uc' was read (uc += *ucp) before ever being
 * initialized — undefined behavior.  Start the accumulator at zero;
 * the (ignored) return value is now well-defined too.
 */
static uchar clean_cache(uchar *buf3)
{
      int i;
      uchar uc = 0, *ucp = buf3;

      for (i = 0; i < BUF3_SIZE / LINE_SIZE; i++){
          uc += *ucp;
          ucp += LINE_SIZE;
      }

      return(uc);
}

_______________________________________________
Xen-ppc-devel mailing list
Xen-ppc-devel@lists.xensource.com
http://lists.xensource.com/xen-ppc-devel

Reply via email to