Hi Eric,

Attached is the benchmark test-cwf.c (build with: cc -o test-cwf test-cwf.c).
The result shows that when the last-level cache (LLC) misses and the CPU
fetches data from memory, accessing the critical word as the first 64-bit
member of the cache line performs better (158290336 cycles) than accessing
another position in the cache line (offset 0x10, 164100732 cycles); the
performance is improved by 3.6% in this case.
The cpu-info is attached as well.

Thanks
Ling
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
#include<unistd.h>
/* Number of nodes in each pointer-chasing buffer built by init_buf(). */
#define MAX_BUF_NUM (1 << 20)
/* Size in bytes of one node; consecutive nodes are this far apart. */
#define MAX_BUF_SIZE (1 << 8)
/* Byte offset inside a node at which the second pointer chain is stored. */
#define ACCESS_OFFSET (0x10)

/*
 * HP_TIMING_NOW(Var): read the x86 time-stamp counter with `rdtsc` and
 * store the 64-bit value in the lvalue Var.
 *
 * rdtsc returns the low half in EAX and the high half in EDX; the two
 * halves are combined here.  `__asm__ __volatile__` is used instead of the
 * bare `asm` keyword so the macro also compiles in strict ISO modes
 * (-std=c99 / -std=c11), where `asm` is not a keyword.
 */
# define HP_TIMING_NOW(Var) \
 ({ unsigned long long _hi, _lo; \
  __asm__ __volatile__ ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
  (Var) = (_hi << 32) | _lo; })

/* Number of timed runs per measurement; test_lookup_time() keeps the fastest. */
#define repeat_times  (64)

/*
 * Allocate a page-aligned buffer of MAX_BUF_NUM nodes, each MAX_BUF_SIZE
 * bytes, and thread two parallel pointer chains through it: one stored at
 * offset 0 of every node and one stored at ACCESS_OFFSET.
 *
 * The chain alternates between the low and the high end of the buffer
 * (first node -> last node -> second node -> second-to-last node -> ...),
 * and ends at the middle node, which is terminated with NULL in both chains
 * so that the final load of a full traversal reads initialised memory.
 *
 * On return *buf points at the first (page-aligned) node.  The pointer
 * originally returned by malloc() is not kept, so the buffer cannot be
 * passed to free().  The process exits with status 1 if allocation fails.
 */
static void init_buf(char **buf)
{
	const size_t total = (size_t)MAX_BUF_SIZE * MAX_BUF_NUM;
	char *start;
	char *end;
	int pagesize = getpagesize();

	/* One extra page leaves room to round the start up to a page boundary. */
	*buf = malloc(total + pagesize);
	if (*buf == NULL) {
		printf("\nfait to malloc space!\n");
		exit(1);
	}

	/* Step one page forward, then clear the low bits to align downwards. */
	*buf = *buf + pagesize;
	*buf = (char *)(((unsigned long)*buf) & (-pagesize));

	start = *buf;
	end = *buf + total - MAX_BUF_SIZE;

	/*
	 * Link nodes from both ends towards the middle.  The start == end exit
	 * test is only ever met when the node count is even, which holds for
	 * MAX_BUF_NUM.
	 */
	while (1) {
		*(char **)start = end;
		*(char **)(start + ACCESS_OFFSET) = end + ACCESS_OFFSET;
		start += MAX_BUF_SIZE;
		if (start == end)
			break;
		*(char **)end = start;
		*(char **)(end + ACCESS_OFFSET) = start + ACCESS_OFFSET;
		end -= MAX_BUF_SIZE;
	}

	/* The loop stops on the middle node without writing it: terminate it. */
	*(char **)start = NULL;
	*(char **)(start + ACCESS_OFFSET) = NULL;
}

/*
 * Follow a chain of pointers for `num` dependent loads and return the sum
 * of the pointer values that were loaded.
 *
 * access: address of the first link; each link holds the address of the
 *         next one.
 * num:    number of loads to perform.  Values <= 0 perform no load and
 *         return 0.
 *
 * The loop is a single extended-asm statement (x86-64) so that the operand
 * registers and clobbers are declared to the compiler, instead of assuming
 * that the arguments still sit in %rdi/%rsi and that %rax survives until
 * the function returns.
 */
unsigned long lookingup_memmory(char *access, int num)
{
	unsigned long sum = 0;
	unsigned long next;
	long remaining;

	if (num <= 0)
		return 0;

	/* Widen before use: only the low 32 bits of an int argument are defined. */
	remaining = (long)num - 1;

	__asm__ __volatile__(
		"1:\n\t"
		"mov (%[ptr]), %[next]\n\t"	/* load the next link            */
		"add %[next], %[sum]\n\t"	/* accumulate the loaded value   */
		"mov %[next], %[ptr]\n\t"	/* advance to the next link      */
		"sub $1, %[cnt]\n\t"
		"jae 1b"			/* loop until the count borrows  */
		: [ptr] "+r" (access), [sum] "+r" (sum),
		  [cnt] "+r" (remaining), [next] "=&r" (next)
		:
		: "cc", "memory");

	return sum;
}

/*
 * Time a full traversal (MAX_BUF_NUM loads) of the chain starting at buf,
 * repeat_times times, and return the smallest elapsed time-stamp-counter
 * difference observed.
 */
static unsigned long test_lookup_time(char *buf)
{
	unsigned long fastest = ~0UL;
	unsigned long run;

	for (run = 0; run < repeat_times; ++run) {
		unsigned long t_begin, t_end, elapsed;

		HP_TIMING_NOW(t_begin);
		lookingup_memmory(buf, MAX_BUF_NUM);
		HP_TIMING_NOW(t_end);

		/* Keep the minimum over all runs. */
		elapsed = t_end - t_begin;
		if (elapsed < fastest)
			fastest = elapsed;
	}

	return fastest;
}
/*
 * Benchmark entry point: build two independent buffers, time a traversal
 * of the chain stored at offset 0 of the first and of the chain stored at
 * ACCESS_OFFSET of the second, and print both best times.
 */
int main (void)
{
	char *buf1 = NULL;
	char *buf2 = NULL;
	unsigned long aligned_time, unaligned_time;

	init_buf(&buf1);
	init_buf(&buf2);

	aligned_time = test_lookup_time(buf1);
	unaligned_time = test_lookup_time(buf2 + ACCESS_OFFSET);

	/* %lu matches the unsigned long arguments. */
	printf("looking-up aligned time %lu, looking-up unaligned time %lu\n", aligned_time, unaligned_time);
	return 0;
}




Attachment: cpu-info
Description: Binary data

Reply via email to