#define _GNU_SOURCE
#include <sched.h>
#include <pthread.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include <sys/mman.h>
#include <stdarg.h>

#include "atomic.h"

#if QUEUE
#include "qspinlock.h"
#else
#include "ticket.h"
#endif

#include "perf.h"
#include "stat.h"

/* Benchmark parameters; nr_threads can be overridden from the command line. */
int nr_threads = 2;		/* number of worker threads spawned by main() */
int nr_loops = 100000;		/* measured iterations per thread */
int nr_waits = 10000;		/* barrier() spins between acquire and release;
				 * half as many follow each release */

/* Lock under test; thread_func() also takes it to update the result total. */
arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;

/*
 * Incremented right after the acquire operation and decremented before the
 * release; asserted to read 1 after the increment in the non-ATOMIC builds.
 */
atomic_t cnt = ATOMIC_INIT(0);

/* Target of the atomic operations measured when built with ATOMIC. */
atomic_t var = ATOMIC_INIT(1);

/* CPU number this thread was asked to run on; set at the top of thread_func(). */
__thread int __curr_cpu;

/*
 * Counter configuration passed to sys_perf_event_open() by every thread:
 * a hardware CPU-cycle counter with the exclude_kernel and pinned bits set.
 */
static struct perf_event_attr perf_attr = {
	.type = PERF_TYPE_HARDWARE,
	.config = PERF_COUNT_HW_CPU_CYCLES,
	.exclude_kernel = 1,	/* per perf_event_open(2): do not count kernel-mode cycles */
	.pinned = 1,		/* per perf_event_open(2): keep the counter on the PMU */
};

/*
 * die - report a fatal error and terminate the process.
 * @err: printf-style format string; the remaining arguments supply its values.
 *
 * The formatted message goes to stderr exactly as given (no newline is
 * appended), after which exit(-1) is called.  Does not return.
 */
void die(const char *err, ...)
{
	va_list ap;

	va_start(ap, err);
	vfprintf(stderr, err, ap);
	va_end(ap);

	exit(-1);
}

/* System page size, filled in by main(); used as the length of each counter mapping. */
static unsigned long page_size;

/* Incremented by each thread once its setup is done; main() waits for it to reach nr_threads. */
static atomic_t running = ATOMIC_INIT(0);
/* Set to 1 by main() to release the threads spinning at the top of thread_func(). */
static int start = 0;
/* Sum of the per-thread lock+unlock averages; each thread adds to it while holding `lock`. */
static double total = 0;

/*
 * thread_func - benchmark body run by every worker thread.
 * @arg: CPU number to run on, passed as an integer cast to void *.
 *
 * Restricts the calling thread to the requested CPU, opens a counter
 * described by perf_attr and maps it, then announces readiness through
 * `running` and spins until main() sets `start`.
 *
 * Each of the nr_loops iterations takes three samples:
 *   s1 - two back-to-back counter reads (the cost of measuring itself),
 *   s2 - the acquire operation: arch_spin_lock(), or an atomic operation on
 *        `var` when built with ATOMIC (further selected by COND / REF),
 *   s3 - the acquire sample plus the matching release operation.
 *
 * Before returning, the thread prints its averages with the no-op cost
 * subtracted and adds its lock+unlock figure to `total`.
 *
 * Returns NULL.  Any setup failure ends the whole process through die().
 */
void *thread_func(void *arg)
{
	cpu_set_t cpus;
	int c, i, l;
	int fd, ret;
	void *event;
	struct stats s1, s2, s3;
	u64 cyc, delta;
	double t;

	init_stats(&s1);
	init_stats(&s2);
	init_stats(&s3);

	__curr_cpu = (unsigned long)arg;

	/* Restrict this thread (pid 0 == caller) to the requested CPU only. */
	CPU_ZERO(&cpus);
	CPU_SET(__curr_cpu, &cpus);

	ret = sched_setaffinity(0, sizeof(cpus), &cpus);
	if (ret)
		die("failed to set affinity");

	/*
	 * NOTE(review): the argument order presumably mirrors
	 * perf_event_open(2) (attr, pid, cpu, group_fd, flags) -- confirm
	 * against the wrapper in perf.h.
	 */
	fd = sys_perf_event_open(&perf_attr, 0, -1, -1, 0);
	if (fd < 0)
		die("failed to create perf_event");

	event = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
	if (event == MAP_FAILED)
		die("failed to mmap perf_event");

	/* Only the mapping is used from here on. */
	close(fd);

	/* Initial read; the value is overwritten before it is ever used. */
	cyc = mmap_read_pinned(event);

	atomic_inc(&running);

	/* Wait for main() to release all threads at once. */
	while (!ACCESS_ONCE(start))
		barrier();

	for (l = 0; l < nr_loops; l++) {
		/* Operand for atomic_fetch_or() in the ATOMIC && COND build. */
		unsigned int or_val = l * __curr_cpu;
		/*
		 * measure no-op
		 */
		cyc = mmap_read_pinned(event);
		barrier();
		cyc = mmap_read_pinned(event) - cyc;
		update_stats(&s1, cyc);

		/*
		 * measure spin_lock
		 */
		cyc = mmap_read_pinned(event);
#if ATOMIC
#if COND
		atomic_fetch_or(or_val, &var);
#elif REF
		refcount_inc(&var);
#else
		atomic_inc(&var);
#endif
#else
		arch_spin_lock(&lock);
#endif
		delta = mmap_read_pinned(event) - cyc;
		update_stats(&s2, delta);

		/* In the lock build no other thread may be in here with us. */
		c = atomic_inc_return(&cnt);
#if !ATOMIC
		assert(c == 1);
#endif

		/* Spin for a while between acquire and release. */
		for (i = 0; i < nr_waits; i++)
			barrier();

		atomic_dec(&cnt);

		/*
		 * measure spin_unlock; recorded together with the spin_lock
		 * sample taken above as spin_lock + spin_unlock
		 */
		cyc = mmap_read_pinned(event);
#if ATOMIC
		atomic_dec(&var);
#else
		arch_spin_unlock(&lock);
#endif
		cyc = mmap_read_pinned(event) - cyc;
		update_stats(&s3, cyc + delta);

		/* Spin again, half as long, before the next iteration. */
		for (i = 0; i < nr_waits/2; i++)
			barrier();
	}

	munmap(event, page_size);


	/*
	 * print: spin_lock - no_op & spin_lock + spin_unlock - 2*no_op
	 */
	printf("%d; avg: %f +- %f\n", __curr_cpu,
		avg_stats(&s2) - avg_stats(&s1),
		stddev_stats(&s2));

	t = avg_stats(&s3) - 2*avg_stats(&s1);
	printf("%d; avg: %f +- %f\n", __curr_cpu,
		t, stddev_stats(&s3));

	/* `total` is a plain double shared by all threads; serialise the update. */
	arch_spin_lock(&lock);
	total += t;
	arch_spin_unlock(&lock);

	return NULL;
}

/*
 * Thread index -> CPU number.  main() fills it with the identity mapping and
 * then overwrites the leading entries with CPU numbers from the command line.
 */
int cpu_map[256];

int main(int argc, char **argv)
{
	pthread_t thread[256];
	int i, j;

	printf("%s\n", VERSION);

	page_size = sysconf(_SC_PAGESIZE);

	for (i = 0; i< 256; i++)
		cpu_map[i] = i;

	if (argc > 1) {
		nr_threads = 0;
		for (i = 1; i < argc; i++) {
			j = atoi(argv[i]);
			if (j < 0) {
				nr_threads = -j;
				break;
			}
			cpu_map[nr_threads++] = j;
		}
	}

	for (i = 0; i < nr_threads; i++) {
		printf("thread %d on cpu %d\n", i, cpu_map[i]);
		pthread_create(&thread[i], NULL, thread_func, 
				(void *)(unsigned long)cpu_map[i]);
	}

	while (atomic_read(&running) != nr_threads)
		barrier();

	ACCESS_ONCE(start) = 1;

	for (i = 0; i < nr_threads; i++)
		pthread_join(thread[i], NULL);

	/*
	 * crappy number..
	 */
	printf("\ntotal: %f\n", total/nr_threads);

	return 0;
}

