#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>


/* Dimensions used to size the benchmark buffers (presumably a 240x320
   display -- confirm against the target hardware). */
#define DISPLAY_W 240
#define DISPLAY_H 320

/* Number of times memcpy_test repeats a full pass over the buffers. */
#define BANDWIDTH_TESTS_COUNT 3000
/* Destination and source buffer sizes in bytes.  Both are derived from the
   same expression on purpose: memcpy_test sizes its copies from
   RBUFFER_SIZE but writes into wbuffer, so the two must never diverge.
   The factor 2 is presumably bytes per pixel -- TODO confirm. */
#define WBUFFER_SIZE (DISPLAY_W * DISPLAY_H * 2)
#define RBUFFER_SIZE (DISPLAY_W * DISPLAY_H * 2)
/* Size of cache_buffer and the step used when memcpy_test walks it before
   each measurement (presumably the data-cache size and line length of the
   target CPU -- confirm). */
#define CACHE_SIZE (32 * 1024)
#define CACHE_LINE 32



/* Source buffer for the copies; main fills its first RBUFFER_SIZE bytes
   with rand() output.  32-byte aligned. */
char __attribute__ ((aligned(32))) rbuffer[RBUFFER_SIZE + 3];
/* Destination buffer for the copies.  The 3 extra bytes leave room for the
   destination offsets 0..3 that memcpy_test_suite passes as `align`. */
char __attribute__ ((aligned(32))) wbuffer[WBUFFER_SIZE + 3];
/* Scratch buffer that memcpy_test walks with pld, one CACHE_LINE at a time,
   before each measurement. */
char __attribute__ ((aligned(32))) cache_buffer[CACHE_SIZE];



/* One benchmark candidate: a label printed in the results and the
   memcpy-compatible function to time.  memcpy_test_suite terminates its
   table of these with an entry whose name is NULL. */
typedef struct
{
  /* Label printed at the start of each result line. */
  const char *name;
  /* Copy routine under test; same signature as the standard memcpy. */
  void *(*copy) (void *dst, const void *src, size_t size);
} tests;


/* Return the current time of day, as reported by gettimeofday(), expressed
   in seconds with the microsecond part as the fraction.  The return value
   of gettimeofday() is not checked. */
double get_time ()
{
  struct timeval now;
  double whole_seconds;
  double micro_seconds;

  gettimeofday(&now, NULL);

  whole_seconds = (double)now.tv_sec;
  micro_seconds = (double)now.tv_usec;

  return whole_seconds + (micro_seconds / 1000000);
}

/* Alternative memcpy implementations benchmarked by memcpy_test_suite.
   They are only declared here; the definitions are not in this file and
   must be supplied at link time.  They share the standard memcpy signature.
   NOTE(review): judging by the names these are the glibc and uClibc copy
   routines -- confirm against the files that define them. */
void *memcpy_libc(void *dst, const void *src, size_t size);
void *memcpy_uclibc(void *dst, const void *src, size_t size);


/*
 * Time one memcpy-compatible routine and print its throughput.
 *
 * name:  label printed at the start of the result line.
 * copy:  routine under test; called as copy(dst, src, size).
 * size:  bytes copied per call.  Values above RBUFFER_SIZE are clamped to
 *        RBUFFER_SIZE.  A size of 0 cannot be benchmarked (the chunk count
 *        is RBUFFER_SIZE / size), so it is reported on stderr and skipped.
 * align: byte offset added to every destination address inside wbuffer;
 *        the source addresses in rbuffer are not offset.
 *
 * The buffers are split into RBUFFER_SIZE / size chunks; every chunk is
 * copied once per pass and BANDWIDTH_TESTS_COUNT passes are timed with
 * get_time().  The result is printed as bytes / 10^6 per second.
 */
void
memcpy_test (const char *name,
	     void *(*copy) (void *dst, const void *src, size_t size),
	     size_t size,
	     int align)
{
  double t1;
  double t2;
  double elapsed;
  double megabytes;
  size_t count;
  size_t i;
  int offset;
  int j;

  if (size == 0)
    {
      /* Guard the division below; there is nothing to measure anyway. */
      fprintf(stderr, "%s (align=%d): size 0 cannot be benchmarked, skipped\n",
	      name, align);
      return;
    }

  if (size > RBUFFER_SIZE) size = RBUFFER_SIZE;
  count = RBUFFER_SIZE / size;

  /* Issue an ARM pld (preload data) for every CACHE_LINE-sized step of
     cache_buffer.  The intent is to push the benchmark buffers out of the
     data cache before timing starts.
     NOTE(review): pld is only a hint to the memory system, so eviction is
     not guaranteed -- confirm on the target CPU. */
  for (offset = 0; offset < CACHE_SIZE; offset += CACHE_LINE)
    __asm__ __volatile__ ("pld [%[cache_buffer], %[offset]]"::
			  [cache_buffer] "r" (cache_buffer),
			  [offset] "r" (offset)
			  );

  t1 = get_time ();

  for (j = 0; j < BANDWIDTH_TESTS_COUNT; ++j)
    for (i = 0; i < count; ++i)
      copy(wbuffer + i * size + align, rbuffer + i * size, size);

  t2 = get_time ();

  /* Keep the arithmetic in double: the interval can be short, and narrowing
     it to float would discard precision for no benefit. */
  elapsed = t2 - t1;
  megabytes = ((double)count * BANDWIDTH_TESTS_COUNT / 1000.0) *
	      ((double)size / 1000.0);

  /* size is a size_t, so it must be printed with %zu, not %d. */
  printf("%s (align=%d, size=%zu): %.2fMB/s\n",
	 name, align, size,
	 megabytes / elapsed);
}

/*
 * Benchmark every memcpy implementation at each destination offset.
 *
 * size: bytes per copy call, forwarded unchanged to memcpy_test.
 *
 * For each offset 0, 1, 2 and 3 (outer loop) the candidates are run in
 * table order: the standard memcpy, memcpy_libc, then memcpy_uclibc.
 */
void
memcpy_test_suite (size_t size)
{
  /* Candidates to time; the all-NULL entry marks the end of the table. */
  tests candidates[] = {
    { "std  ", memcpy },
    { "libc ", memcpy_libc },
    { "uclibc ", memcpy_uclibc },
    { NULL, NULL }
  };
  int align = 0;

  while (align < 4)
    {
      const tests *entry;

      for (entry = candidates; entry->name != NULL; entry++)
	memcpy_test (entry->name, entry->copy, size, align);

      align++;
    }
}

/*
 * Entry point: fill the first RBUFFER_SIZE bytes of rbuffer with rand()
 * output, then run the whole benchmark suite once per entry in the sizes
 * table.  rand() is not seeded here.  Always returns 0.
 */
int main()
{
  size_t sizes[] = { 320 * 240 * 2 };
  const size_t n_sizes = sizeof (sizes) / sizeof (size_t);
  size_t which;
  int pos;

  /* Give the source buffer non-trivial contents. */
  for (pos = 0; pos < RBUFFER_SIZE; pos++)
    {
      /* Reduce rand() to a single byte before storing it. */
      unsigned char byte = rand();

      rbuffer[pos] = byte;
    }

  for (which = 0; which < n_sizes; which++)
    memcpy_test_suite (sizes[which]);

  return 0;
}
