This is the microbenchmark I used.

For the record, Intel's intention going forward is that 0F 1F will
always be as fast as or faster than any other alternative.

        -hpa

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <stdbool.h>
#include <sys/time.h>

/*
 * Execute 1000 back-to-back copies of the 5-byte sequence 0F 1F 44 00 00
 * (the multi-byte NOPL form; listed as "P6 NOPs (NOPL)" in the test table).
 *
 * Written with __asm__ __volatile__ instead of the bare `asm` keyword so
 * the function also compiles in strict ISO modes (-std=c99, -std=c11),
 * where `asm` is not recognized.  The emitted bytes are unchanged.
 */
static void nop_p6(void)
{
  __asm__ __volatile__(".rept 1000\n"
                       ".byte 0x0f,0x1f,0x44,0x00,0x00\n"
                       ".endr");
}

/*
 * Execute 1000 back-to-back copies of the 5-byte sequence 66 66 66 66 90:
 * the one-byte NOP (0x90) padded with four 0x66 operand-size prefixes
 * (listed as "K8 NOPs (66 90)" in the test table).
 *
 * Written with __asm__ __volatile__ instead of the bare `asm` keyword so
 * the function also compiles in strict ISO modes (-std=c99, -std=c11),
 * where `asm` is not recognized.  The emitted bytes are unchanged.
 */
static void nop_k8(void)
{
  __asm__ __volatile__(".rept 1000\n"
                       ".byte 0x66,0x66,0x66,0x66,0x90\n"
                       ".endr");
}

/*
 * Execute 1000 back-to-back copies of a 5-byte LEA used as padding.
 *
 * Both variants use opcode 8D with ModRM/SIB/disp8 bytes 74 26 00, i.e. an
 * LEA of 0(%esi)/0(%rsi) back into the same register, so the register value
 * is left as it was.  Only the leading byte differs: 0x48 (REX.W) in 64-bit
 * builds, 0x3E (a segment-override prefix) otherwise.
 *
 * Written with __asm__ __volatile__ instead of the bare `asm` keyword so
 * the function also compiles in strict ISO modes (-std=c99, -std=c11),
 * where `asm` is not recognized.  The emitted bytes are unchanged.
 */
static void nop_lea(void)
{
#ifdef __x86_64__
  __asm__ __volatile__(".rept 1000\n"
                       ".byte 0x48,0x8d,0x74,0x26,0x00\n"
                       ".endr");
#else
  __asm__ __volatile__(".rept 1000\n"
                       ".byte 0x3e,0x8d,0x74,0x26,0x00\n"
                       ".endr");
#endif
}

/*
 * Execute 1000 back-to-back copies of the 5-byte sequence E9 00 00 00 00:
 * a near JMP with a 32-bit displacement of zero, so each jump lands on the
 * instruction immediately following it ("E9 JMP" in the test table).
 *
 * Written with __asm__ __volatile__ instead of the bare `asm` keyword so
 * the function also compiles in strict ISO modes (-std=c99, -std=c11),
 * where `asm` is not recognized.  The emitted bytes are unchanged.
 */
static void nop_jmp5(void)
{
  __asm__ __volatile__(".rept 1000\n"
                       ".byte 0xe9,0,0,0,0\n"
                       ".endr");
}

/*
 * Execute 1000 back-to-back copies of the 5-byte sequence EB 03 90 90 90:
 * a short JMP with displacement 3 that skips over the three one-byte
 * filler bytes (0x90) that follow it ("EB JMP" in the test table).
 *
 * Written with __asm__ __volatile__ instead of the bare `asm` keyword so
 * the function also compiles in strict ISO modes (-std=c99, -std=c11),
 * where `asm` is not recognized.  The emitted bytes are unchanged.
 */
static void nop_jmp2(void)
{
  __asm__ __volatile__(".rept 1000\n"
                       ".byte 0xeb,3,0x90,0x90,0x90\n"
                       ".endr");
}

/*
 * Execute 1000 back-to-back copies of the 5-byte sequence 66 66 66 87 C0:
 * XCHG (opcode 87) with ModRM C0, i.e. register 0 exchanged with itself,
 * preceded by three 0x66 operand-size prefixes ("XCHG" in the test table).
 *
 * Written with __asm__ __volatile__ instead of the bare `asm` keyword so
 * the function also compiles in strict ISO modes (-std=c99, -std=c11),
 * where `asm` is not recognized.  The emitted bytes are unchanged.
 */
static void nop_xchg(void)
{
  __asm__ __volatile__(".rept 1000\n"
                       ".byte 0x66,0x66,0x66,0x87,0xc0\n"
                       ".endr");
}

/*
 * Execute 1000 back-to-back copies of the 5-byte sequence 66 66 66 89 C0:
 * MOV (opcode 89) with ModRM C0, i.e. register 0 moved onto itself,
 * preceded by three 0x66 operand-size prefixes ("MOV" in the test table).
 *
 * Written with __asm__ __volatile__ instead of the bare `asm` keyword so
 * the function also compiles in strict ISO modes (-std=c99, -std=c11),
 * where `asm` is not recognized.  The emitted bytes are unchanged.
 */
static void nop_mov(void)
{
  __asm__ __volatile__(".rept 1000\n"
                       ".byte 0x66,0x66,0x66,0x89,0xc0\n"
                       ".endr");
}

/*
 * Execute 1000 back-to-back copies of the 5-byte sequence 66 66 66 DB E1:
 * the x87 opcode DB E1 (labelled "FDISI" in the test table) preceded by
 * three 0x66 prefixes.
 * NOTE(review): this opcode is presumably ignored by post-8087 FPUs, which
 * is what makes it usable as padding -- confirm against the vendor manuals.
 *
 * Written with __asm__ __volatile__ instead of the bare `asm` keyword so
 * the function also compiles in strict ISO modes (-std=c99, -std=c11),
 * where `asm` is not recognized.  The emitted bytes are unchanged.
 */
static void nop_fdisi(void)
{
  __asm__ __volatile__(".rept 1000\n"
                       ".byte 0x66,0x66,0x66,0xdb,0xe1\n"
                       ".endr");
}
  
/*
 * Execute 1000 back-to-back copies of the 5-byte sequence 66 66 66 DB E0:
 * the x87 opcode DB E0 (labelled "FENI" in the test table) preceded by
 * three 0x66 prefixes.
 * NOTE(review): this opcode is presumably ignored by post-8087 FPUs, which
 * is what makes it usable as padding -- confirm against the vendor manuals.
 *
 * Written with __asm__ __volatile__ instead of the bare `asm` keyword so
 * the function also compiles in strict ISO modes (-std=c99, -std=c11),
 * where `asm` is not recognized.  The emitted bytes are unchanged.
 */
static void nop_feni(void)
{
  __asm__ __volatile__(".rept 1000\n"
                       ".byte 0x66,0x66,0x66,0xdb,0xe0\n"
                       ".endr");
}

/*
 * One entry of the benchmark table: a human-readable label paired with
 * the routine to be timed.  Field order matters because the table below
 * is filled in with positional initializers.
 */
struct test_list {
  const char *name;      /* label printed in the result line */
  void (*func)(void);    /* routine called repeatedly by benchmark(); NULL marks the end of the table */
};

/*
 * Benchmarks in the order they are run and reported.  The final entry,
 * with a NULL func, is the sentinel that main() stops on.
 */
static const struct test_list tests[] = {
  { .name = "P6 NOPs (NOPL)",  .func = nop_p6    },
  { .name = "K8 NOPs (66 90)", .func = nop_k8    },
  { .name = "LEA",             .func = nop_lea   },
  { .name = "XCHG",            .func = nop_xchg  },
  { .name = "MOV",             .func = nop_mov   },
  { .name = "FDISI",           .func = nop_fdisi },
  { .name = "FENI",            .func = nop_feni  },
  { .name = "E9 JMP",          .func = nop_jmp5  },
  { .name = "EB JMP",          .func = nop_jmp2  },
  { .name = NULL,              .func = NULL      }   /* end-of-table sentinel */
};

/*
 * Time 100000 consecutive calls of test->func using gettimeofday().
 *
 * test:   table entry supplying the routine to call and the label to print.
 * warmup: when true the pass is run and timed but nothing is printed;
 *         when false the repetition count and elapsed microseconds are
 *         written to stdout.
 */
static void benchmark(const struct test_list *test, bool warmup)
{
  const int reps = 100000;
  struct timeval start, stop;
  unsigned long long elapsed_us;
  int n = 0;

  gettimeofday(&start, NULL);
  while (n < reps) {
    test->func();
    n++;
  }
  gettimeofday(&stop, NULL);

  /* Whole seconds scaled to microseconds first ... */
  elapsed_us = (stop.tv_sec - start.tv_sec) * 1000000ULL;
  /*
   * ... then the sub-second delta.  It may be negative; converting it to
   * unsigned wraps, and the wrapped addition still yields the right total.
   */
  elapsed_us += (int)stop.tv_usec - (int)start.tv_usec;

  if (warmup)
    return;

  printf("%s: %d repetitions at %llu us\n", test->name, reps, elapsed_us);
}

int main(void)
{
  const struct test_list *test;
  
  for (test = tests; test->func; test++) {
    benchmark(test, true);
    benchmark(test, false);
  }

  return 0;
}

Reply via email to